// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * NET4:	Implementation of BSD Unix domain sockets.
 *
 * Authors:	Alan Cox, <alan@lxorguk.ukuu.org.uk>
 *
 * Fixes:
 *		Linus Torvalds	:	Assorted bug cures.
 *		Niibe Yutaka	:	async I/O support.
 *		Carsten Paeth	:	PF_UNIX check, address fixes.
 *		Alan Cox	:	Limit size of allocated blocks.
 *		Alan Cox	:	Fixed the stupid socketpair bug.
 *		Alan Cox	:	BSD compatibility fine tuning.
 *		Alan Cox	:	Fixed a bug in connect when interrupted.
 *		Alan Cox	:	Sorted out a proper draft version of
 *					file descriptor passing hacked up from
 *					Mike Shaver's work.
 *		Marty Leisner	:	Fixes to fd passing
 *		Nick Nevin	:	recvmsg bugfix.
 *		Alan Cox	:	Started proper garbage collector
 *		Heiko EiBfeldt	:	Missing verify_area check
 *		Alan Cox	:	Started POSIXisms
 *		Andreas Schwab	:	Replace inode by dentry for proper
 *					reference counting
 *		Kirk Petersen	:	Made this a module
 *	    Christoph Rohland	:	Elegant non-blocking accept/connect algorithm.
 *					Lots of bug fixes.
 *	     Alexey Kuznetsov	:	Repaired (I hope) bugs introduced
 *					by the above two patches.
 *	     Andrea Arcangeli	:	If possible we block in connect(2)
 *					if the max backlog of the listen socket
 *					has been reached. This won't break
 *					old apps and it will avoid huge amounts
 *					of socks hashed (this for unix_gc()
 *					performance reasons).
 *					Security fix that limits the max
 *					number of socks to 2*max_files and
 *					the number of skbs queueable in the
 *					dgram receiver.
 *		Artur Skawina	:	Hash function optimizations
 *	     Alexey Kuznetsov	:	Full scale SMP. Lot of bugs are introduced 8)
 *	      Malcolm Beattie	:	Set peercred for socketpair
 *	     Michal Ostrowski	:	Module initialization cleanup.
 *	     Arnaldo C. Melo	:	Remove MOD_{INC,DEC}_USE_COUNT,
 *					the core infrastructure is doing that
 *					for all net proto families now (2.5.69+)
 *
 * Known differences from reference BSD that was tested:
 *
 *	[TO FIX]
 *	ECONNREFUSED is not returned from one end of a connected() socket to the
 *		other the moment one end closes.
 *	fstat() doesn't return st_dev=0, and give the blksize as high water mark
 *		and a fake inode identifier (nor the BSD first socket fstat twice bug).
 *	[NOT TO FIX]
 *	accept() returns a path name even if the connecting socket has closed
 *		in the meantime (BSD loses the path and gives up).
 *	accept() returns 0 length path for an unbound connector. BSD returns 16
 *		and a null first byte in the path (but not for gethost/peername - BSD bug ??)
 *	socketpair(...SOCK_RAW..) doesn't panic the kernel.
 *	BSD af_unix apparently has connect forgetting to block properly.
 *		(need to check this with the POSIX spec in detail)
 *
 * Differences from 2.0.0-11-... (ANK)
 *	Bug fixes and improvements.
 *		- client shutdown killed server socket.
 *		- removed all useless cli/sti pairs.
 *
 *	Semantic changes/extensions.
 *		- generic control message passing.
 *		- SCM_CREDENTIALS control message.
 *		- "Abstract" (not FS based) socket bindings.
 *		  Abstract names are sequences of bytes (not zero terminated)
 *		  started by 0, so that this name space does not intersect
 *		  with BSD names.
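 *
 *		  For illustration (editor's sketch, not part of the original
 *		  changelog; the name "\0example" is made up): a user-space
 *		  bind to an abstract name could look like
 *
 *			struct sockaddr_un a = { .sun_family = AF_UNIX };
 *			memcpy(a.sun_path, "\0example", 8);
 *			bind(fd, (struct sockaddr *)&a,
 *			     offsetof(struct sockaddr_un, sun_path) + 8);
 *
 *		  with addr_len counting exactly the bytes of the name and
 *		  no trailing NUL.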
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/bpf-cgroup.h>
#include <linux/btf_ids.h>
#include <linux/dcache.h>
#include <linux/errno.h>
#include <linux/fcntl.h>
#include <linux/file.h>
#include <linux/filter.h>
#include <linux/fs.h>
#include <linux/fs_struct.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/mount.h>
#include <linux/namei.h>
#include <linux/net.h>
#include <linux/pidfs.h>
#include <linux/poll.h>
#include <linux/proc_fs.h>
#include <linux/sched/signal.h>
#include <linux/security.h>
#include <linux/seq_file.h>
#include <linux/skbuff.h>
#include <linux/slab.h>
#include <linux/socket.h>
#include <linux/splice.h>
#include <linux/string.h>
#include <linux/uaccess.h>
#include <net/af_unix.h>
#include <net/net_namespace.h>
#include <net/scm.h>
#include <net/tcp_states.h>
#include <uapi/linux/sockios.h>
#include <uapi/linux/termios.h>

#include "af_unix.h"

static atomic_long_t unix_nr_socks;
static struct hlist_head bsd_socket_buckets[UNIX_HASH_SIZE / 2];
static spinlock_t bsd_socket_locks[UNIX_HASH_SIZE / 2];

/* SMP locking strategy:
 *    hash table is protected with spinlock.
 *    each socket state is protected by separate spinlock.
 */
#ifdef CONFIG_PROVE_LOCKING
#define cmp_ptr(l, r)	(((l) > (r)) - ((l) < (r)))

static int unix_table_lock_cmp_fn(const struct lockdep_map *a,
				  const struct lockdep_map *b)
{
	return cmp_ptr(a, b);
}

static int unix_state_lock_cmp_fn(const struct lockdep_map *_a,
				  const struct lockdep_map *_b)
{
	const struct unix_sock *a, *b;

	a = container_of(_a, struct unix_sock, lock.dep_map);
	b = container_of(_b, struct unix_sock, lock.dep_map);

	if (a->sk.sk_state == TCP_LISTEN) {
		/* unix_stream_connect(): Before the 2nd unix_state_lock(),
		 *
		 *   1. a is TCP_LISTEN.
		 *   2. b is not a.
		 *   3. concurrent connect(b -> a) must fail.
		 *
		 * Except for 2. & 3., the b's state can be any possible
		 * value due to concurrent connect() or listen().
		 *
		 * 2. is detected in debug_spin_lock_before(), and 3. cannot
		 * be expressed as lock_cmp_fn.
		 */
		switch (b->sk.sk_state) {
		case TCP_CLOSE:
		case TCP_ESTABLISHED:
		case TCP_LISTEN:
			return -1;
		default:
			/* Invalid case. */
			return 0;
		}
	}

	/* Should never happen.  Just to be symmetric. */
	if (b->sk.sk_state == TCP_LISTEN) {
		switch (a->sk.sk_state) {
		case TCP_CLOSE:
		case TCP_ESTABLISHED:
			return 1;
		default:
			return 0;
		}
	}

	/* unix_state_double_lock(): ascending address order. */
	return cmp_ptr(a, b);
}

static int unix_recvq_lock_cmp_fn(const struct lockdep_map *_a,
				  const struct lockdep_map *_b)
{
	const struct sock *a, *b;

	a = container_of(_a, struct sock, sk_receive_queue.lock.dep_map);
	b = container_of(_b, struct sock, sk_receive_queue.lock.dep_map);

	/* unix_collect_skb(): listener -> embryo order. */
	if (a->sk_state == TCP_LISTEN && unix_sk(b)->listener == a)
		return -1;

	/* Should never happen.  Just to be symmetric.
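	 *
	 * (Editor's note, hedged) For lock_set_cmp_fn() comparators such as
	 * the ones in this file, a negative return value is understood as
	 * "the first lock may legitimately be held while the second is
	 * acquired", a positive value as the opposite nesting, and 0 as an
	 * ordering lockdep should flag.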
	 */
	if (b->sk_state == TCP_LISTEN && unix_sk(a)->listener == b)
		return 1;

	return 0;
}
#endif

static unsigned int unix_unbound_hash(struct sock *sk)
{
	unsigned long hash = (unsigned long)sk;

	hash ^= hash >> 16;
	hash ^= hash >> 8;
	hash ^= sk->sk_type;

	return hash & UNIX_HASH_MOD;
}

static unsigned int unix_bsd_hash(struct inode *i)
{
	return i->i_ino & UNIX_HASH_MOD;
}

static unsigned int unix_abstract_hash(struct sockaddr_un *sunaddr,
				       int addr_len, int type)
{
	__wsum csum = csum_partial(sunaddr, addr_len, 0);
	unsigned int hash;

	hash = (__force unsigned int)csum_fold(csum);
	hash ^= hash >> 8;
	hash ^= type;

	return UNIX_HASH_MOD + 1 + (hash & UNIX_HASH_MOD);
}

static void unix_table_double_lock(struct net *net,
				   unsigned int hash1, unsigned int hash2)
{
	if (hash1 == hash2) {
		spin_lock(&net->unx.table.locks[hash1]);
		return;
	}

	if (hash1 > hash2)
		swap(hash1, hash2);

	spin_lock(&net->unx.table.locks[hash1]);
	spin_lock(&net->unx.table.locks[hash2]);
}

static void unix_table_double_unlock(struct net *net,
				     unsigned int hash1, unsigned int hash2)
{
	if (hash1 == hash2) {
		spin_unlock(&net->unx.table.locks[hash1]);
		return;
	}

	spin_unlock(&net->unx.table.locks[hash1]);
	spin_unlock(&net->unx.table.locks[hash2]);
}

#ifdef CONFIG_SECURITY_NETWORK
static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{
	UNIXCB(skb).secid = scm->secid;
}

static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{
	scm->secid = UNIXCB(skb).secid;
}

static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
{
	return (scm->secid == UNIXCB(skb).secid);
}
#else
static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{ }

static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{ }

static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
{
	return true;
}
#endif /* CONFIG_SECURITY_NETWORK */

static inline int unix_may_send(struct sock *sk, struct sock *osk)
{
	return !unix_peer(osk) || unix_peer(osk) == sk;
}

static inline int unix_recvq_full_lockless(const struct sock *sk)
{
	return skb_queue_len_lockless(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
}

struct sock *unix_peer_get(struct sock *s)
{
	struct sock *peer;

	unix_state_lock(s);
	peer = unix_peer(s);
	if (peer)
		sock_hold(peer);
	unix_state_unlock(s);
	return peer;
}
EXPORT_SYMBOL_GPL(unix_peer_get);

static struct unix_address *unix_create_addr(struct sockaddr_un *sunaddr,
					     int addr_len)
{
	struct unix_address *addr;

	addr = kmalloc(sizeof(*addr) + addr_len, GFP_KERNEL);
	if (!addr)
		return NULL;

	refcount_set(&addr->refcnt, 1);
	addr->len = addr_len;
	memcpy(addr->name, sunaddr, addr_len);

	return addr;
}

static inline void unix_release_addr(struct unix_address *addr)
{
	if (refcount_dec_and_test(&addr->refcnt))
		kfree(addr);
}

/*
 *	Check unix socket name:
 *		- should not be zero length.
 *		- if it does not start with a NUL byte, it must be a
 *		  NUL-terminated filesystem path (FS object)
 *		- if it starts with a NUL byte, it is an abstract name.
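 *
 *	For illustration (editor's note; the names are made up), with
 *	addr_len counted from the start of struct sockaddr_un:
 *		"/tmp/example.sock" + NUL	-> filesystem (BSD) socket
 *		"\0example"			-> abstract name, every byte
 *						   up to addr_len is significant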
 */

static int unix_validate_addr(struct sockaddr_un *sunaddr, int addr_len)
{
	if (addr_len <= offsetof(struct sockaddr_un, sun_path) ||
	    addr_len > sizeof(*sunaddr))
		return -EINVAL;

	if (sunaddr->sun_family != AF_UNIX)
		return -EINVAL;

	return 0;
}

static int unix_mkname_bsd(struct sockaddr_un *sunaddr, int addr_len)
{
	struct sockaddr_storage *addr = (struct sockaddr_storage *)sunaddr;
	short offset = offsetof(struct sockaddr_storage, __data);

	BUILD_BUG_ON(offset != offsetof(struct sockaddr_un, sun_path));

	/* This may look like an off by one error but it is a bit more
	 * subtle.  108 is the longest valid AF_UNIX path for a binding.
	 * sun_path[108] doesn't as such exist.  However in kernel space
	 * we are guaranteed that it is a valid memory location in our
	 * kernel address buffer because syscall functions always pass
	 * a pointer of struct sockaddr_storage which has a bigger buffer
	 * than 108.  Also, we must terminate sun_path for strlen() in
	 * getname_kernel().
	 */
	addr->__data[addr_len - offset] = 0;

	/* Don't pass sunaddr->sun_path to strlen().  Otherwise, 108 will
	 * cause panic if CONFIG_FORTIFY_SOURCE=y.  Let __fortify_strlen()
	 * know the actual buffer.
	 */
	return strlen(addr->__data) + offset + 1;
}

static void __unix_remove_socket(struct sock *sk)
{
	sk_del_node_init(sk);
}

static void __unix_insert_socket(struct net *net, struct sock *sk)
{
	DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
	sk_add_node(sk, &net->unx.table.buckets[sk->sk_hash]);
}

static void __unix_set_addr_hash(struct net *net, struct sock *sk,
				 struct unix_address *addr, unsigned int hash)
{
	__unix_remove_socket(sk);
	smp_store_release(&unix_sk(sk)->addr, addr);

	sk->sk_hash = hash;
	__unix_insert_socket(net, sk);
}

static void unix_remove_socket(struct net *net, struct sock *sk)
{
	spin_lock(&net->unx.table.locks[sk->sk_hash]);
	__unix_remove_socket(sk);
	spin_unlock(&net->unx.table.locks[sk->sk_hash]);
}

static void unix_insert_unbound_socket(struct net *net, struct sock *sk)
{
	spin_lock(&net->unx.table.locks[sk->sk_hash]);
	__unix_insert_socket(net, sk);
	spin_unlock(&net->unx.table.locks[sk->sk_hash]);
}

static void unix_insert_bsd_socket(struct sock *sk)
{
	spin_lock(&bsd_socket_locks[sk->sk_hash]);
	sk_add_bind_node(sk, &bsd_socket_buckets[sk->sk_hash]);
	spin_unlock(&bsd_socket_locks[sk->sk_hash]);
}

static void unix_remove_bsd_socket(struct sock *sk)
{
	if (!hlist_unhashed(&sk->sk_bind_node)) {
		spin_lock(&bsd_socket_locks[sk->sk_hash]);
		__sk_del_bind_node(sk);
		spin_unlock(&bsd_socket_locks[sk->sk_hash]);

		sk_node_init(&sk->sk_bind_node);
	}
}

static struct sock *__unix_find_socket_byname(struct net *net,
					      struct sockaddr_un *sunname,
					      int len, unsigned int hash)
{
	struct sock *s;

	sk_for_each(s, &net->unx.table.buckets[hash]) {
		struct unix_sock *u = unix_sk(s);

		if (u->addr->len == len &&
		    !memcmp(u->addr->name, sunname, len))
			return s;
	}
	return NULL;
}

static inline struct sock *unix_find_socket_byname(struct net *net,
						   struct sockaddr_un *sunname,
						   int len, unsigned int hash)
{
	struct sock *s;

	spin_lock(&net->unx.table.locks[hash]);
	s = __unix_find_socket_byname(net, sunname, len, hash);
	if (s)
		sock_hold(s);
	spin_unlock(&net->unx.table.locks[hash]);
	return s;
}

static struct sock *unix_find_socket_byinode(struct inode *i)
{
	unsigned int hash = unix_bsd_hash(i);
	struct sock *s;

	spin_lock(&bsd_socket_locks[hash]);
	sk_for_each_bound(s, &bsd_socket_buckets[hash]) {
		struct dentry *dentry = unix_sk(s)->path.dentry;

		if (dentry && d_backing_inode(dentry) == i) {
			sock_hold(s);
			spin_unlock(&bsd_socket_locks[hash]);
			return s;
		}
	}
	spin_unlock(&bsd_socket_locks[hash]);
	return NULL;
}

/* Support code for asymmetrically connected dgram sockets
 *
 * If a datagram socket is connected to a socket not itself connected
 * to the first socket (eg, /dev/log), clients may only enqueue more
 * messages if the present receive queue of the server socket is not
 * "too large". This means there's a second writeability condition
 * poll and sendmsg need to test. The dgram recv code will do a wake
 * up on the peer_wait wait queue of a socket upon reception of a
 * datagram which needs to be propagated to sleeping would-be writers
 * since these might not have sent anything so far. This can't be
 * accomplished via poll_wait because the lifetime of the server
 * socket might be less than that of its clients if these break their
 * association with it or if the server socket is closed while clients
 * are still connected to it and there's no way to inform "a polling
 * implementation" that it should let go of a certain wait queue
 *
 * In order to propagate a wake up, a wait_queue_entry_t of the client
 * socket is enqueued on the peer_wait queue of the server socket
 * whose wake function does a wake_up on the ordinary client socket
 * wait queue. This connection is established whenever a write (or
 * poll for write) hits the flow control condition and broken when the
 * association to the server socket is dissolved or after a wake up
 * was relayed.
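 *
 * Rough picture (editor's sketch of the mechanism described above,
 * with hypothetical sockets C = client, S = server):
 *
 *	sendmsg(C)/poll(C) hits the flow-control condition
 *	  -> unix_dgram_peer_wake_connect() queues C's peer_wake entry
 *	     on S's peer_wait queue
 *	recvmsg(S) dequeues a datagram and wakes S's peer_wait
 *	  -> unix_dgram_peer_wake_relay() removes the entry and wakes
 *	     C's own socket wait queue, so a sleeping poller or writer
 *	     on C re-evaluates writeability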
 */

static int unix_dgram_peer_wake_relay(wait_queue_entry_t *q, unsigned mode, int flags,
				      void *key)
{
	struct unix_sock *u;
	wait_queue_head_t *u_sleep;

	u = container_of(q, struct unix_sock, peer_wake);

	__remove_wait_queue(&unix_sk(u->peer_wake.private)->peer_wait,
			    q);
	u->peer_wake.private = NULL;

	/* relaying can only happen while the wq still exists */
	u_sleep = sk_sleep(&u->sk);
	if (u_sleep)
		wake_up_interruptible_poll(u_sleep, key_to_poll(key));

	return 0;
}

static int unix_dgram_peer_wake_connect(struct sock *sk, struct sock *other)
{
	struct unix_sock *u, *u_other;
	int rc;

	u = unix_sk(sk);
	u_other = unix_sk(other);
	rc = 0;
	spin_lock(&u_other->peer_wait.lock);

	if (!u->peer_wake.private) {
		u->peer_wake.private = other;
		__add_wait_queue(&u_other->peer_wait, &u->peer_wake);

		rc = 1;
	}

	spin_unlock(&u_other->peer_wait.lock);
	return rc;
}

static void unix_dgram_peer_wake_disconnect(struct sock *sk,
					    struct sock *other)
{
	struct unix_sock *u, *u_other;

	u = unix_sk(sk);
	u_other = unix_sk(other);
	spin_lock(&u_other->peer_wait.lock);

	if (u->peer_wake.private == other) {
		__remove_wait_queue(&u_other->peer_wait, &u->peer_wake);
		u->peer_wake.private = NULL;
	}

	spin_unlock(&u_other->peer_wait.lock);
}

static void unix_dgram_peer_wake_disconnect_wakeup(struct sock *sk,
						   struct sock *other)
{
	unix_dgram_peer_wake_disconnect(sk, other);
	wake_up_interruptible_poll(sk_sleep(sk),
				   EPOLLOUT |
				   EPOLLWRNORM |
				   EPOLLWRBAND);
}

/* preconditions:
 *	- unix_peer(sk) == other
 *	- association is stable
 */
static int unix_dgram_peer_wake_me(struct sock *sk, struct sock *other)
{
	int connected;

	connected = unix_dgram_peer_wake_connect(sk, other);

	/* If other is SOCK_DEAD, we want to make sure we signal
	 * POLLOUT, such that a subsequent write() can get a
	 * -ECONNREFUSED. Otherwise, if we haven't queued any skbs
	 * to other and it's full, we will hang waiting for POLLOUT.
	 */
	if (unix_recvq_full_lockless(other) && !sock_flag(other, SOCK_DEAD))
		return 1;

	if (connected)
		unix_dgram_peer_wake_disconnect(sk, other);

	return 0;
}

static int unix_writable(const struct sock *sk, unsigned char state)
{
	return state != TCP_LISTEN &&
	       (refcount_read(&sk->sk_wmem_alloc) << 2) <= READ_ONCE(sk->sk_sndbuf);
}

static void unix_write_space(struct sock *sk)
{
	struct socket_wq *wq;

	rcu_read_lock();
	if (unix_writable(sk, READ_ONCE(sk->sk_state))) {
		wq = rcu_dereference(sk->sk_wq);
		if (skwq_has_sleeper(wq))
			wake_up_interruptible_sync_poll(&wq->wait,
				EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND);
		sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT);
	}
	rcu_read_unlock();
}

/* When dgram socket disconnects (or changes its peer), we clear its receive
 * queue of packets arrived from previous peer. First, it allows flow control
 * to be based only on wmem_alloc; second, sk connected to peer
 * may receive messages only from that peer.
 */
static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
{
	if (!skb_queue_empty(&sk->sk_receive_queue)) {
		skb_queue_purge_reason(&sk->sk_receive_queue,
				       SKB_DROP_REASON_UNIX_DISCONNECT);

		wake_up_interruptible_all(&unix_sk(sk)->peer_wait);

		/* If one link of bidirectional dgram pipe is disconnected,
		 * we signal error. Messages are lost. Do not do this when
		 * the peer was not connected to us.
		 */
		if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
			WRITE_ONCE(other->sk_err, ECONNRESET);
			sk_error_report(other);
		}
	}
}

static void unix_sock_destructor(struct sock *sk)
{
	struct unix_sock *u = unix_sk(sk);

	skb_queue_purge_reason(&sk->sk_receive_queue, SKB_DROP_REASON_SOCKET_CLOSE);

	DEBUG_NET_WARN_ON_ONCE(refcount_read(&sk->sk_wmem_alloc));
	DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
	DEBUG_NET_WARN_ON_ONCE(sk->sk_socket);
	if (!sock_flag(sk, SOCK_DEAD)) {
		pr_info("Attempt to release alive unix socket: %p\n", sk);
		return;
	}

	if (u->addr)
		unix_release_addr(u->addr);

	atomic_long_dec(&unix_nr_socks);
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
#ifdef UNIX_REFCNT_DEBUG
	pr_debug("UNIX %p is destroyed, %ld are still alive.\n", sk,
		 atomic_long_read(&unix_nr_socks));
#endif
}

static unsigned int unix_skb_len(const struct sk_buff *skb)
{
	return skb->len - UNIXCB(skb).consumed;
}

static void unix_release_sock(struct sock *sk, int embrion)
{
	struct unix_sock *u = unix_sk(sk);
	struct sock *skpair;
	struct sk_buff *skb;
	struct path path;
	int state;

	unix_remove_socket(sock_net(sk), sk);
	unix_remove_bsd_socket(sk);

	/* Clear state */
	unix_state_lock(sk);
	sock_orphan(sk);
	WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK);
	path = u->path;
	u->path.dentry = NULL;
	u->path.mnt = NULL;
	state = sk->sk_state;
	WRITE_ONCE(sk->sk_state, TCP_CLOSE);

	skpair = unix_peer(sk);
	unix_peer(sk) = NULL;

	unix_state_unlock(sk);

#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
	u->oob_skb = NULL;
#endif

	wake_up_interruptible_all(&u->peer_wait);

	if (skpair != NULL) {
		if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
			struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);

#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
			if (skb && !unix_skb_len(skb))
				skb = skb_peek_next(skb, &sk->sk_receive_queue);
#endif
			unix_state_lock(skpair);
			/* No more writes */
			WRITE_ONCE(skpair->sk_shutdown, SHUTDOWN_MASK);
			if (skb || embrion)
				WRITE_ONCE(skpair->sk_err, ECONNRESET);
			unix_state_unlock(skpair);
			skpair->sk_state_change(skpair);
			sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
		}

		unix_dgram_peer_wake_disconnect(sk, skpair);
		sock_put(skpair); /* It may now die */
	}

	/* Try to flush out this socket. Throw out buffers at least */

	while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
		if (state == TCP_LISTEN)
			unix_release_sock(skb->sk, 1);

		/* passed fds are erased in the kfree_skb hook */
		kfree_skb_reason(skb, SKB_DROP_REASON_SOCKET_CLOSE);
	}

	if (path.dentry)
		path_put(&path);

	sock_put(sk);

	/* ---- Socket is dead now and most probably destroyed ---- */

	if (READ_ONCE(unix_tot_inflight))
		unix_gc();		/* Garbage collect fds */
}

struct unix_peercred {
	struct pid *peer_pid;
	const struct cred *peer_cred;
};

static inline int prepare_peercred(struct unix_peercred *peercred)
{
	struct pid *pid;
	int err;

	pid = task_tgid(current);
	err = pidfs_register_pid(pid);
	if (likely(!err)) {
		peercred->peer_pid = get_pid(pid);
		peercred->peer_cred = get_current_cred();
	}
	return err;
}

static void drop_peercred(struct unix_peercred *peercred)
{
	const struct cred *cred = NULL;
	struct pid *pid = NULL;

	might_sleep();

	swap(peercred->peer_pid, pid);
	swap(peercred->peer_cred, cred);

	put_pid(pid);
	put_cred(cred);
}

static inline void init_peercred(struct sock *sk,
				 const struct unix_peercred *peercred)
{
	sk->sk_peer_pid = peercred->peer_pid;
	sk->sk_peer_cred = peercred->peer_cred;
}

static void update_peercred(struct sock *sk, struct unix_peercred *peercred)
{
	const struct cred *old_cred;
	struct pid *old_pid;

	spin_lock(&sk->sk_peer_lock);
	old_pid = sk->sk_peer_pid;
	old_cred = sk->sk_peer_cred;
	init_peercred(sk, peercred);
	spin_unlock(&sk->sk_peer_lock);

	peercred->peer_pid = old_pid;
	peercred->peer_cred = old_cred;
}

static void copy_peercred(struct sock *sk, struct sock *peersk)
{
	lockdep_assert_held(&unix_sk(peersk)->lock);

	spin_lock(&sk->sk_peer_lock);
	sk->sk_peer_pid = get_pid(peersk->sk_peer_pid);
	sk->sk_peer_cred = get_cred(peersk->sk_peer_cred);
	spin_unlock(&sk->sk_peer_lock);
}

static bool unix_may_passcred(const struct sock *sk)
{
	return sk->sk_scm_credentials || sk->sk_scm_pidfd;
}

static int unix_listen(struct socket *sock, int backlog)
{
	int err;
	struct sock *sk = sock->sk;
	struct unix_sock *u = unix_sk(sk);
	struct unix_peercred peercred = {};

	err = -EOPNOTSUPP;
	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
		goto out;	/* Only stream/seqpacket sockets accept */
	err = -EINVAL;
	if (!READ_ONCE(u->addr))
		goto out;	/* No listens on an unbound socket */
	err = prepare_peercred(&peercred);
	if (err)
		goto out;
	unix_state_lock(sk);
	if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
		goto out_unlock;
	if (backlog > sk->sk_max_ack_backlog)
		wake_up_interruptible_all(&u->peer_wait);
	sk->sk_max_ack_backlog = backlog;
	WRITE_ONCE(sk->sk_state, TCP_LISTEN);

	/* set credentials so connect can copy them */
	update_peercred(sk, &peercred);
	err = 0;

out_unlock:
	unix_state_unlock(sk);
	drop_peercred(&peercred);
out:
	return err;
}

static int unix_release(struct socket *);
static int unix_bind(struct socket *, struct sockaddr_unsized *, int);
static int unix_stream_connect(struct socket *, struct sockaddr *,
			       int addr_len, int flags);
static int unix_socketpair(struct socket *, struct socket *);
static int unix_accept(struct socket *, struct socket *, struct proto_accept_arg *arg);
static int unix_getname(struct socket *, struct sockaddr *, int);
static __poll_t unix_poll(struct file *, struct socket *, poll_table *);
static __poll_t unix_dgram_poll(struct file *, struct socket *,
				poll_table *);
static int unix_ioctl(struct socket *, unsigned int, unsigned long);
#ifdef CONFIG_COMPAT
static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
#endif
static int unix_shutdown(struct socket *, int);
static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t);
static int unix_stream_recvmsg(struct socket *, struct msghdr *, size_t, int);
static ssize_t unix_stream_splice_read(struct socket *,  loff_t *ppos,
				       struct pipe_inode_info *, size_t size,
				       unsigned int flags);
static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t);
static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int);
static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
static int unix_dgram_connect(struct socket *, struct sockaddr *,
			      int, int);
static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t);
static int unix_seqpacket_recvmsg(struct socket *, struct msghdr *, size_t,
				  int);

#ifdef CONFIG_PROC_FS
static int unix_count_nr_fds(struct sock *sk)
{
	struct sk_buff *skb;
	struct unix_sock *u;
	int nr_fds = 0;

	spin_lock(&sk->sk_receive_queue.lock);
	skb = skb_peek(&sk->sk_receive_queue);
	while (skb) {
		u = unix_sk(skb->sk);
		nr_fds += atomic_read(&u->scm_stat.nr_fds);
		skb = skb_peek_next(skb, &sk->sk_receive_queue);
	}
	spin_unlock(&sk->sk_receive_queue.lock);

	return nr_fds;
}

static void unix_show_fdinfo(struct seq_file *m, struct socket *sock)
{
	struct sock *sk = sock->sk;
	unsigned char s_state;
	struct unix_sock *u;
	int nr_fds = 0;

	if (sk) {
		s_state = READ_ONCE(sk->sk_state);
		u = unix_sk(sk);

		/* SOCK_STREAM and SOCK_SEQPACKET sockets never change their
		 * sk_state after switching to TCP_ESTABLISHED or TCP_LISTEN.
		 * SOCK_DGRAM is ordinary. So, no lock is needed.
		 */
		if (sock->type == SOCK_DGRAM || s_state == TCP_ESTABLISHED)
			nr_fds = atomic_read(&u->scm_stat.nr_fds);
		else if (s_state == TCP_LISTEN)
			nr_fds = unix_count_nr_fds(sk);

		seq_printf(m, "scm_fds: %u\n", nr_fds);
	}
}
#else
#define unix_show_fdinfo NULL
#endif

static bool unix_custom_sockopt(int optname)
{
	switch (optname) {
	case SO_INQ:
		return true;
	default:
		return false;
	}
}

static int unix_setsockopt(struct socket *sock, int level, int optname,
			   sockptr_t optval, unsigned int optlen)
{
	struct unix_sock *u = unix_sk(sock->sk);
	struct sock *sk = sock->sk;
	int val;

	if (level != SOL_SOCKET)
		return -EOPNOTSUPP;

	if (!unix_custom_sockopt(optname))
		return sock_setsockopt(sock, level, optname, optval, optlen);

	if (optlen != sizeof(int))
		return -EINVAL;

	if (copy_from_sockptr(&val, optval, sizeof(val)))
		return -EFAULT;

	switch (optname) {
	case SO_INQ:
		if (sk->sk_type != SOCK_STREAM)
			return -EINVAL;

		if (val > 1 || val < 0)
			return -EINVAL;

		WRITE_ONCE(u->recvmsg_inq, val);
		break;
	default:
		return -ENOPROTOOPT;
	}

	return 0;
}

static const struct proto_ops unix_stream_ops = {
	.family =	PF_UNIX,
	.owner =	THIS_MODULE,
	.release =	unix_release,
	.bind =		unix_bind,
	.connect =	unix_stream_connect,
	.socketpair =	unix_socketpair,
	.accept =	unix_accept,
	.getname =	unix_getname,
	.poll =		unix_poll,
	.ioctl =	unix_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl =	unix_compat_ioctl,
#endif
	.listen =	unix_listen,
	.shutdown =	unix_shutdown,
	.setsockopt =	unix_setsockopt,
	.sendmsg =	unix_stream_sendmsg,
	.recvmsg =	unix_stream_recvmsg,
	.read_skb =	unix_stream_read_skb,
	.mmap =		sock_no_mmap,
	.splice_read =	unix_stream_splice_read,
	.set_peek_off =	sk_set_peek_off,
	.show_fdinfo =	unix_show_fdinfo,
};

static const struct proto_ops unix_dgram_ops = {
	.family =	PF_UNIX,
	.owner =	THIS_MODULE,
	.release =	unix_release,
	.bind =		unix_bind,
	.connect =	unix_dgram_connect,
	.socketpair =	unix_socketpair,
	.accept =	sock_no_accept,
	.getname =	unix_getname,
	.poll =		unix_dgram_poll,
	.ioctl =	unix_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl =	unix_compat_ioctl,
#endif
	.listen =	sock_no_listen,
	.shutdown =	unix_shutdown,
	.sendmsg =	unix_dgram_sendmsg,
	.read_skb =	unix_read_skb,
	.recvmsg =	unix_dgram_recvmsg,
	.mmap =		sock_no_mmap,
	.set_peek_off =	sk_set_peek_off,
	.show_fdinfo =	unix_show_fdinfo,
};

static const struct proto_ops unix_seqpacket_ops = {
	.family =	PF_UNIX,
	.owner =	THIS_MODULE,
	.release =	unix_release,
	.bind =		unix_bind,
	.connect =	unix_stream_connect,
	.socketpair =	unix_socketpair,
	.accept =	unix_accept,
	.getname =	unix_getname,
	.poll =		unix_dgram_poll,
	.ioctl =	unix_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl =	unix_compat_ioctl,
#endif
	.listen =	unix_listen,
	.shutdown =	unix_shutdown,
	.sendmsg =	unix_seqpacket_sendmsg,
	.recvmsg =	unix_seqpacket_recvmsg,
	.mmap =		sock_no_mmap,
	.set_peek_off =	sk_set_peek_off,
	.show_fdinfo =	unix_show_fdinfo,
};

static void unix_close(struct sock *sk, long timeout)
{
	/* Nothing to do here, unix socket does not need a ->close().
	 * This is merely for sockmap.
	 */
}

static bool unix_bpf_bypass_getsockopt(int level, int optname)
{
	if (level == SOL_SOCKET) {
		switch (optname) {
		case SO_PEERPIDFD:
			return true;
		default:
			return false;
		}
	}

	return false;
}

struct proto unix_dgram_proto = {
	.name			= "UNIX",
	.owner			= THIS_MODULE,
	.obj_size		= sizeof(struct unix_sock),
	.close			= unix_close,
	.bpf_bypass_getsockopt	= unix_bpf_bypass_getsockopt,
#ifdef CONFIG_BPF_SYSCALL
	.psock_update_sk_prot	= unix_dgram_bpf_update_proto,
#endif
};

struct proto unix_stream_proto = {
	.name			= "UNIX-STREAM",
	.owner			= THIS_MODULE,
	.obj_size		= sizeof(struct unix_sock),
	.close			= unix_close,
	.bpf_bypass_getsockopt	= unix_bpf_bypass_getsockopt,
#ifdef CONFIG_BPF_SYSCALL
	.psock_update_sk_prot	= unix_stream_bpf_update_proto,
#endif
};

static struct sock *unix_create1(struct net *net, struct socket *sock, int kern, int type)
{
	struct unix_sock *u;
	struct sock *sk;
	int err;

	atomic_long_inc(&unix_nr_socks);
	if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files()) {
		err = -ENFILE;
		goto err;
	}

	if (type == SOCK_STREAM)
		sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_stream_proto, kern);
	else /*dgram and seqpacket */
		sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_dgram_proto, kern);

	if (!sk) {
		err = -ENOMEM;
		goto err;
	}

	sock_init_data(sock, sk);

	sk->sk_scm_rights	= 1;
	sk->sk_hash		= unix_unbound_hash(sk);
	sk->sk_allocation	= GFP_KERNEL_ACCOUNT;
	sk->sk_write_space	= unix_write_space;
	sk->sk_max_ack_backlog	= READ_ONCE(net->unx.sysctl_max_dgram_qlen);
	sk->sk_destruct		= unix_sock_destructor;
	lock_set_cmp_fn(&sk->sk_receive_queue.lock, unix_recvq_lock_cmp_fn, NULL);

	u = unix_sk(sk);
	u->listener = NULL;
	u->vertex = NULL;
	u->path.dentry = NULL;
	u->path.mnt = NULL;
	spin_lock_init(&u->lock);
	lock_set_cmp_fn(&u->lock, unix_state_lock_cmp_fn, NULL);
	mutex_init(&u->iolock); /* single task reading lock */
	mutex_init(&u->bindlock); /* single task binding lock */
	init_waitqueue_head(&u->peer_wait);
	init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay);
	memset(&u->scm_stat, 0, sizeof(struct scm_stat));
	unix_insert_unbound_socket(net, sk);

	sock_prot_inuse_add(net, sk->sk_prot, 1);

	return sk;

err:
	atomic_long_dec(&unix_nr_socks);
	return ERR_PTR(err);
}

static int unix_create(struct net *net, struct socket *sock, int protocol,
		       int kern)
{
	struct sock *sk;

	if (protocol && protocol != PF_UNIX)
		return -EPROTONOSUPPORT;

	sock->state = SS_UNCONNECTED;

	switch (sock->type) {
	case SOCK_STREAM:
		set_bit(SOCK_CUSTOM_SOCKOPT, &sock->flags);
		sock->ops = &unix_stream_ops;
		break;
	/*
	 *	Believe it or not BSD has AF_UNIX, SOCK_RAW though
	 *	nothing uses it.
	 */
	case SOCK_RAW:
		sock->type = SOCK_DGRAM;
		fallthrough;
	case SOCK_DGRAM:
		sock->ops = &unix_dgram_ops;
		break;
	case SOCK_SEQPACKET:
		sock->ops = &unix_seqpacket_ops;
		break;
	default:
		return -ESOCKTNOSUPPORT;
	}

	sk = unix_create1(net, sock, kern, sock->type);
	if (IS_ERR(sk))
		return PTR_ERR(sk);

	return 0;
}

static int unix_release(struct socket *sock)
{
	struct sock *sk = sock->sk;

	if (!sk)
		return 0;

	sk->sk_prot->close(sk, 0);
	unix_release_sock(sk, 0);
	sock->sk = NULL;

	return 0;
}

static struct sock *unix_find_bsd(struct sockaddr_un *sunaddr, int addr_len,
				  int type, int flags)
{
	struct inode *inode;
	struct path path;
	struct sock *sk;
	int err;

	unix_mkname_bsd(sunaddr, addr_len);

	if (flags & SOCK_COREDUMP) {
		const struct cred *cred;
		struct cred *kcred;
		struct path root;

		kcred = prepare_kernel_cred(&init_task);
		if (!kcred) {
			err = -ENOMEM;
			goto fail;
		}

		task_lock(&init_task);
		get_fs_root(init_task.fs, &root);
		task_unlock(&init_task);

		cred = override_creds(kcred);
		err = vfs_path_lookup(root.dentry, root.mnt, sunaddr->sun_path,
				      LOOKUP_BENEATH | LOOKUP_NO_SYMLINKS |
				      LOOKUP_NO_MAGICLINKS, &path);
		put_cred(revert_creds(cred));
		path_put(&root);
		if (err)
			goto fail;
	} else {
		err = kern_path(sunaddr->sun_path, LOOKUP_FOLLOW, &path);
		if (err)
			goto fail;

		err = path_permission(&path, MAY_WRITE);
		if (err)
			goto path_put;
	}

	err = -ECONNREFUSED;
	inode = d_backing_inode(path.dentry);
	if (!S_ISSOCK(inode->i_mode))
		goto path_put;

	sk = unix_find_socket_byinode(inode);
	if (!sk)
		goto path_put;

	err = -EPROTOTYPE;
	if (sk->sk_type == type)
		touch_atime(&path);
	else
		goto sock_put;

	path_put(&path);

	return sk;

sock_put:
	sock_put(sk);
path_put:
	path_put(&path);
fail:
	return ERR_PTR(err);
}

static struct sock *unix_find_abstract(struct net *net,
				       struct sockaddr_un *sunaddr,
				       int addr_len, int type)
{
	unsigned int hash = unix_abstract_hash(sunaddr, addr_len, type);
	struct dentry *dentry;
	struct sock *sk;

	sk = unix_find_socket_byname(net, sunaddr, addr_len, hash);
	if (!sk)
		return ERR_PTR(-ECONNREFUSED);

	dentry = unix_sk(sk)->path.dentry;
	if (dentry)
		touch_atime(&unix_sk(sk)->path);

	return sk;
}

static struct sock *unix_find_other(struct net *net,
				    struct sockaddr_un *sunaddr,
				    int addr_len, int type, int flags)
{
	struct sock *sk;

	if (sunaddr->sun_path[0])
		sk = unix_find_bsd(sunaddr, addr_len, type, flags);
	else
		sk = unix_find_abstract(net, sunaddr, addr_len, type);

	return sk;
}

static int unix_autobind(struct sock *sk)
{
	struct unix_sock *u = unix_sk(sk);
	unsigned int new_hash, old_hash;
	struct net *net = sock_net(sk);
	struct unix_address *addr;
	u32 lastnum, ordernum;
	int err;

	err = mutex_lock_interruptible(&u->bindlock);
	if (err)
		return err;

	if (u->addr)
		goto out;

	err = -ENOMEM;
	addr = kzalloc(sizeof(*addr) +
		       offsetof(struct sockaddr_un, sun_path) + 16, GFP_KERNEL);
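	/* Editor's note: the autobound name built below is a leading NUL
	 * followed by five hex digits ("%05x"), so only
	 * offsetof(struct sockaddr_un, sun_path) + 6 bytes of this
	 * allocation are actually used; the 16 bytes presumably just round
	 * the name buffer up.
	 */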
	if (!addr)
		goto out;

	addr->len = offsetof(struct sockaddr_un, sun_path) + 6;
	addr->name->sun_family = AF_UNIX;
	refcount_set(&addr->refcnt, 1);

	old_hash = sk->sk_hash;
	ordernum = get_random_u32();
	lastnum = ordernum & 0xFFFFF;
retry:
	ordernum = (ordernum + 1) & 0xFFFFF;
	sprintf(addr->name->sun_path + 1, "%05x", ordernum);

	new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
	unix_table_double_lock(net, old_hash, new_hash);

	if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash)) {
		unix_table_double_unlock(net, old_hash, new_hash);

		/* __unix_find_socket_byname() may take long time if many names
		 * are already in use.
		 */
		cond_resched();

		if (ordernum == lastnum) {
			/* Give up if all names seem to be in use. */
			err = -ENOSPC;
			unix_release_addr(addr);
			goto out;
		}

		goto retry;
	}

	__unix_set_addr_hash(net, sk, addr, new_hash);
	unix_table_double_unlock(net, old_hash, new_hash);
	err = 0;

out:	mutex_unlock(&u->bindlock);
	return err;
}

static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr,
			 int addr_len)
{
	umode_t mode = S_IFSOCK |
	       (SOCK_INODE(sk->sk_socket)->i_mode & ~current_umask());
	struct unix_sock *u = unix_sk(sk);
	unsigned int new_hash, old_hash;
	struct net *net = sock_net(sk);
	struct mnt_idmap *idmap;
	struct unix_address *addr;
	struct dentry *dentry;
	struct path parent;
	int err;

	addr_len = unix_mkname_bsd(sunaddr, addr_len);
	addr = unix_create_addr(sunaddr, addr_len);
	if (!addr)
		return -ENOMEM;

	/*
	 * Get the parent directory, calculate the hash for last
	 * component.
	 */
	dentry = start_creating_path(AT_FDCWD, addr->name->sun_path, &parent, 0);
	if (IS_ERR(dentry)) {
		err = PTR_ERR(dentry);
		goto out;
	}

	/*
	 * All right, let's create it.
	 */
	idmap = mnt_idmap(parent.mnt);
	err = security_path_mknod(&parent, dentry, mode, 0);
	if (!err)
		err = vfs_mknod(idmap, d_inode(parent.dentry), dentry, mode, 0);
	if (err)
		goto out_path;
	err = mutex_lock_interruptible(&u->bindlock);
	if (err)
		goto out_unlink;
	if (u->addr)
		goto out_unlock;

	old_hash = sk->sk_hash;
	new_hash = unix_bsd_hash(d_backing_inode(dentry));
	unix_table_double_lock(net, old_hash, new_hash);
	u->path.mnt = mntget(parent.mnt);
	u->path.dentry = dget(dentry);
	__unix_set_addr_hash(net, sk, addr, new_hash);
	unix_table_double_unlock(net, old_hash, new_hash);
	unix_insert_bsd_socket(sk);
	mutex_unlock(&u->bindlock);
	end_creating_path(&parent, dentry);
	return 0;

out_unlock:
	mutex_unlock(&u->bindlock);
	err = -EINVAL;
out_unlink:
	/* failed after successful mknod?  unlink what we'd created... */
	vfs_unlink(idmap, d_inode(parent.dentry), dentry, NULL);
out_path:
	end_creating_path(&parent, dentry);
out:
	unix_release_addr(addr);
	return err == -EEXIST ? -EADDRINUSE : err;
}

static int unix_bind_abstract(struct sock *sk, struct sockaddr_un *sunaddr,
			      int addr_len)
{
	struct unix_sock *u = unix_sk(sk);
	unsigned int new_hash, old_hash;
	struct net *net = sock_net(sk);
	struct unix_address *addr;
	int err;

	addr = unix_create_addr(sunaddr, addr_len);
	if (!addr)
		return -ENOMEM;

	err = mutex_lock_interruptible(&u->bindlock);
	if (err)
		goto out;

	if (u->addr) {
		err = -EINVAL;
		goto out_mutex;
	}

	old_hash = sk->sk_hash;
	new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
	unix_table_double_lock(net, old_hash, new_hash);

	if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash))
		goto out_spin;

	__unix_set_addr_hash(net, sk, addr, new_hash);
	unix_table_double_unlock(net, old_hash, new_hash);
	mutex_unlock(&u->bindlock);
	return 0;

out_spin:
	unix_table_double_unlock(net, old_hash, new_hash);
	err = -EADDRINUSE;
out_mutex:
	mutex_unlock(&u->bindlock);
out:
	unix_release_addr(addr);
	return err;
}

static int unix_bind(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len)
{
	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
	struct sock *sk = sock->sk;
	int err;

	if (addr_len == offsetof(struct sockaddr_un, sun_path) &&
	    sunaddr->sun_family == AF_UNIX)
		return unix_autobind(sk);

	err = unix_validate_addr(sunaddr, addr_len);
	if (err)
		return err;

	if (sunaddr->sun_path[0])
		err = unix_bind_bsd(sk, sunaddr, addr_len);
	else
		err = unix_bind_abstract(sk, sunaddr, addr_len);

	return err;
}

static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)
{
	if (unlikely(sk1 == sk2) || !sk2) {
		unix_state_lock(sk1);
		return;
	}

	if (sk1 > sk2)
		swap(sk1, sk2);

	unix_state_lock(sk1);
	unix_state_lock(sk2);
}

static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2)
{
	if (unlikely(sk1 == sk2) || !sk2) {
		unix_state_unlock(sk1);
		return;
	}
	unix_state_unlock(sk1);
	unix_state_unlock(sk2);
}

static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
			      int alen, int flags)
{
	struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr;
	struct sock *sk = sock->sk;
	struct sock *other;
	int err;

	err = -EINVAL;
	if (alen < offsetofend(struct sockaddr, sa_family))
		goto out;

	if (addr->sa_family != AF_UNSPEC) {
		err = unix_validate_addr(sunaddr, alen);
		if (err)
			goto out;

		err = BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk, addr, &alen);
		if (err)
			goto out;

		if (unix_may_passcred(sk) && !READ_ONCE(unix_sk(sk)->addr)) {
			err = unix_autobind(sk);
			if (err)
				goto out;
		}

restart:
		other = unix_find_other(sock_net(sk), sunaddr, alen, sock->type, 0);
		if (IS_ERR(other)) {
			err = PTR_ERR(other);
			goto out;
		}

		unix_state_double_lock(sk, other);

		/* Apparently VFS overslept socket death. Retry. */
		if (sock_flag(other, SOCK_DEAD)) {
			unix_state_double_unlock(sk, other);
			sock_put(other);
			goto restart;
		}

		err = -EPERM;
		if (!unix_may_send(sk, other))
			goto out_unlock;

		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
		if (err)
			goto out_unlock;

		WRITE_ONCE(sk->sk_state, TCP_ESTABLISHED);
		WRITE_ONCE(other->sk_state, TCP_ESTABLISHED);
	} else {
		/*
		 *	1003.1g breaking connected state with AF_UNSPEC
		 */
		other = NULL;
		unix_state_double_lock(sk, other);
	}

	/*
	 * If it was connected, reconnect.
	 */
	if (unix_peer(sk)) {
		struct sock *old_peer = unix_peer(sk);

		unix_peer(sk) = other;
		if (!other)
			WRITE_ONCE(sk->sk_state, TCP_CLOSE);
		unix_dgram_peer_wake_disconnect_wakeup(sk, old_peer);

		unix_state_double_unlock(sk, other);

		if (other != old_peer) {
			unix_dgram_disconnected(sk, old_peer);

			unix_state_lock(old_peer);
			if (!unix_peer(old_peer))
				WRITE_ONCE(old_peer->sk_state, TCP_CLOSE);
			unix_state_unlock(old_peer);
		}

		sock_put(old_peer);
	} else {
		unix_peer(sk) = other;
		unix_state_double_unlock(sk, other);
	}

	return 0;

out_unlock:
	unix_state_double_unlock(sk, other);
	sock_put(other);
out:
	return err;
}

static long unix_wait_for_peer(struct sock *other, long timeo)
{
	struct unix_sock *u = unix_sk(other);
	int sched;
	DEFINE_WAIT(wait);

	prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);

	sched = !sock_flag(other, SOCK_DEAD) &&
		!(other->sk_shutdown & RCV_SHUTDOWN) &&
		unix_recvq_full_lockless(other);

	unix_state_unlock(other);

	if (sched)
		timeo = schedule_timeout(timeo);

	finish_wait(&u->peer_wait, &wait);
	return timeo;
}

static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
			       int addr_len, int flags)
{
	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
	struct sock *sk = sock->sk, *newsk = NULL, *other = NULL;
	struct unix_sock *u = unix_sk(sk), *newu, *otheru;
	struct unix_peercred peercred = {};
	struct net *net = sock_net(sk);
	struct sk_buff *skb = NULL;
	unsigned char state;
	long timeo;
	int err;

	err = unix_validate_addr(sunaddr, addr_len);
	if (err)
		goto out;

	err = BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk, uaddr, &addr_len);
	if (err)
		goto out;

	if (unix_may_passcred(sk) && !READ_ONCE(u->addr)) {
		err = unix_autobind(sk);
		if (err)
			goto out;
	}

	timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);

	/* First of all allocate resources.
	 * If we will make it after state is locked,
	 * we will have to recheck all again in any case.
	 */

	/* create new sock for complete connection */
	newsk = unix_create1(net, NULL, 0, sock->type);
	if (IS_ERR(newsk)) {
		err = PTR_ERR(newsk);
		goto out;
	}

	err = prepare_peercred(&peercred);
	if (err)
		goto out;

	/* Allocate skb for sending to listening sock */
	skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
	if (!skb) {
		err = -ENOMEM;
		goto out_free_sk;
	}

restart:
	/* Find listening sock. */
	other = unix_find_other(net, sunaddr, addr_len, sk->sk_type, flags);
	if (IS_ERR(other)) {
		err = PTR_ERR(other);
		goto out_free_skb;
	}

	unix_state_lock(other);

	/* Apparently VFS overslept socket death. Retry. */
	if (sock_flag(other, SOCK_DEAD)) {
		unix_state_unlock(other);
		sock_put(other);
		goto restart;
	}

	if (other->sk_state != TCP_LISTEN ||
	    other->sk_shutdown & RCV_SHUTDOWN) {
		err = -ECONNREFUSED;
		goto out_unlock;
	}

	if (unix_recvq_full_lockless(other)) {
		if (!timeo) {
			err = -EAGAIN;
			goto out_unlock;
		}

		timeo = unix_wait_for_peer(other, timeo);
		sock_put(other);

		err = sock_intr_errno(timeo);
		if (signal_pending(current))
			goto out_free_skb;

		goto restart;
	}

	/* self connect and simultaneous connect are eliminated
	 * by rejecting TCP_LISTEN socket to avoid deadlock.
	 */
	state = READ_ONCE(sk->sk_state);
	if (unlikely(state != TCP_CLOSE)) {
		err = state == TCP_ESTABLISHED ? -EISCONN : -EINVAL;
		goto out_unlock;
	}

	unix_state_lock(sk);

	if (unlikely(sk->sk_state != TCP_CLOSE)) {
		err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EINVAL;
		unix_state_unlock(sk);
		goto out_unlock;
	}

	err = security_unix_stream_connect(sk, other, newsk);
	if (err) {
		unix_state_unlock(sk);
		goto out_unlock;
	}

	/* The way is open! Fastly set all the necessary fields... */

	sock_hold(sk);
	unix_peer(newsk)	= sk;
	newsk->sk_state		= TCP_ESTABLISHED;
	newsk->sk_type		= sk->sk_type;
	newsk->sk_scm_recv_flags = other->sk_scm_recv_flags;
	init_peercred(newsk, &peercred);

	newu = unix_sk(newsk);
	newu->listener = other;
	RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
	otheru = unix_sk(other);

	/* copy address information from listening to new sock
	 *
	 * The contents of *(otheru->addr) and otheru->path
	 * are seen fully set up here, since we have found
	 * otheru in hash under its lock.  Insertion into the
	 * hash chain we'd found it in had been done in an
	 * earlier critical area protected by the chain's lock,
	 * the same one where we'd set *(otheru->addr) contents,
	 * as well as otheru->path and otheru->addr itself.
	 *
	 * Using smp_store_release() here to set newu->addr
	 * is enough to make those stores, as well as stores
	 * to newu->path visible to anyone who gets newu->addr
	 * by smp_load_acquire().  IOW, the same warranties
	 * as for unix_sock instances bound in unix_bind() or
	 * in unix_autobind().
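	 *
	 * (Editor's note) The matching acquire side is e.g. the
	 * smp_load_acquire(&unix_sk(sk)->addr) in unix_getname(): a reader
	 * that observes newu->addr set below also observes the fully
	 * initialised name and path.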
	 */
	if (otheru->path.dentry) {
		path_get(&otheru->path);
		newu->path = otheru->path;
	}
	refcount_inc(&otheru->addr->refcnt);
	smp_store_release(&newu->addr, otheru->addr);

	/* Set credentials */
	copy_peercred(sk, other);

	sock->state	= SS_CONNECTED;
	WRITE_ONCE(sk->sk_state, TCP_ESTABLISHED);
	sock_hold(newsk);

	smp_mb__after_atomic();	/* sock_hold() does an atomic_inc() */
	unix_peer(sk)	= newsk;

	unix_state_unlock(sk);

	/* take ten and send info to listening sock */
	spin_lock(&other->sk_receive_queue.lock);
	__skb_queue_tail(&other->sk_receive_queue, skb);
	spin_unlock(&other->sk_receive_queue.lock);
	unix_state_unlock(other);
	other->sk_data_ready(other);
	sock_put(other);
	return 0;

out_unlock:
	unix_state_unlock(other);
	sock_put(other);
out_free_skb:
	consume_skb(skb);
out_free_sk:
	unix_release_sock(newsk, 0);
out:
	drop_peercred(&peercred);
	return err;
}

static int unix_socketpair(struct socket *socka, struct socket *sockb)
{
	struct unix_peercred ska_peercred = {}, skb_peercred = {};
	struct sock *ska = socka->sk, *skb = sockb->sk;
	int err;

	err = prepare_peercred(&ska_peercred);
	if (err)
		return err;

	err = prepare_peercred(&skb_peercred);
	if (err) {
		drop_peercred(&ska_peercred);
		return err;
	}

	/* Join our sockets back to back */
	sock_hold(ska);
	sock_hold(skb);
	unix_peer(ska) = skb;
	unix_peer(skb) = ska;
	init_peercred(ska, &ska_peercred);
	init_peercred(skb, &skb_peercred);

	ska->sk_state = TCP_ESTABLISHED;
	skb->sk_state = TCP_ESTABLISHED;
	socka->state  = SS_CONNECTED;
	sockb->state  = SS_CONNECTED;
	return 0;
}

static int unix_accept(struct socket *sock, struct socket *newsock,
		       struct proto_accept_arg *arg)
{
	struct sock *sk = sock->sk;
	struct sk_buff *skb;
	struct sock *tsk;

	arg->err = -EOPNOTSUPP;
	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
		goto out;

	arg->err = -EINVAL;
	if (READ_ONCE(sk->sk_state) != TCP_LISTEN)
		goto out;

	/* If socket state is TCP_LISTEN it cannot change (for now...),
	 * so that no locks are necessary.
	 */

	skb = skb_recv_datagram(sk, (arg->flags & O_NONBLOCK) ? MSG_DONTWAIT : 0,
				&arg->err);
	if (!skb) {
		/* This means receive shutdown. */
		if (arg->err == 0)
			arg->err = -EINVAL;
		goto out;
	}

	tsk = skb->sk;
	skb_free_datagram(sk, skb);
	wake_up_interruptible(&unix_sk(sk)->peer_wait);

	if (tsk->sk_type == SOCK_STREAM)
		set_bit(SOCK_CUSTOM_SOCKOPT, &newsock->flags);

	/* attach accepted sock to socket */
	unix_state_lock(tsk);
	unix_update_edges(unix_sk(tsk));
	newsock->state = SS_CONNECTED;
	sock_graft(tsk, newsock);
	unix_state_unlock(tsk);
	return 0;

out:
	return arg->err;
}


static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int peer)
{
	struct sock *sk = sock->sk;
	struct unix_address *addr;
	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr);
	int err = 0;

	if (peer) {
		sk = unix_peer_get(sk);

		err = -ENOTCONN;
		if (!sk)
			goto out;
		err = 0;
	} else {
		sock_hold(sk);
	}

	addr = smp_load_acquire(&unix_sk(sk)->addr);
	if (!addr) {
		sunaddr->sun_family = AF_UNIX;
		sunaddr->sun_path[0] = 0;
		err = offsetof(struct sockaddr_un, sun_path);
	} else {
		err = addr->len;
		memcpy(sunaddr, addr->name, addr->len);

		if (peer)
			BPF_CGROUP_RUN_SA_PROG(sk, uaddr, &err,
					       CGROUP_UNIX_GETPEERNAME);
		else
			BPF_CGROUP_RUN_SA_PROG(sk, uaddr, &err,
					       CGROUP_UNIX_GETSOCKNAME);
	}
	sock_put(sk);
out:
	return err;
}

/* The "user->unix_inflight" variable is protected by the garbage
 * collection lock, and we just read it locklessly here. If you go
 * over the limit, there might be a tiny race in actually noticing
 * it across threads. Tough.
 */
static inline bool too_many_unix_fds(struct task_struct *p)
{
	struct user_struct *user = current_user();

	if (unlikely(READ_ONCE(user->unix_inflight) > task_rlimit(p, RLIMIT_NOFILE)))
		return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN);
	return false;
}

static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
{
	if (too_many_unix_fds(current))
		return -ETOOMANYREFS;

	UNIXCB(skb).fp = scm->fp;
	scm->fp = NULL;

	if (unix_prepare_fpl(UNIXCB(skb).fp))
		return -ENOMEM;

	return 0;
}

static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
{
	scm->fp = UNIXCB(skb).fp;
	UNIXCB(skb).fp = NULL;

	unix_destroy_fpl(scm->fp);
}

static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb)
{
	scm->fp = scm_fp_dup(UNIXCB(skb).fp);
}

static void unix_destruct_scm(struct sk_buff *skb)
{
	struct scm_cookie scm;

	memset(&scm, 0, sizeof(scm));
	scm.pid = UNIXCB(skb).pid;
	if (UNIXCB(skb).fp)
		unix_detach_fds(&scm, skb);

	/* Alas, it calls VFS */
	/* So fscking what? fput() had been SMP-safe since the last Summer */
	scm_destroy(&scm);
	sock_wfree(skb);
}

static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
{
	int err = 0;

	UNIXCB(skb).pid = get_pid(scm->pid);
	UNIXCB(skb).uid = scm->creds.uid;
	UNIXCB(skb).gid = scm->creds.gid;
	UNIXCB(skb).fp = NULL;
	unix_get_secdata(scm, skb);
	if (scm->fp && send_fds)
		err = unix_attach_fds(scm, skb);

	skb->destructor = unix_destruct_scm;
	return err;
}

static void unix_skb_to_scm(struct sk_buff *skb, struct scm_cookie *scm)
{
	scm_set_cred(scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
	unix_set_secdata(scm, skb);
}

/**
 * unix_maybe_add_creds() - Adds current task uid/gid and struct pid to skb if needed.
 * @skb: skb to attach creds to.
 * @sk: Sender sock.
 * @other: Receiver sock.
 *
 * Some apps rely on write() giving SCM_CREDENTIALS.
 * We include credentials if the source or destination socket
 * asserted SOCK_PASSCRED.
 *
 * Context: May sleep.
 * Return: On success zero, on error a negative error code is returned.
 */
static int unix_maybe_add_creds(struct sk_buff *skb, const struct sock *sk,
				const struct sock *other)
{
	if (UNIXCB(skb).pid)
		return 0;

	if (unix_may_passcred(sk) || unix_may_passcred(other) ||
	    !other->sk_socket) {
		struct pid *pid;
		int err;

		pid = task_tgid(current);
		err = pidfs_register_pid(pid);
		if (unlikely(err))
			return err;

		UNIXCB(skb).pid = get_pid(pid);
		current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid);
	}

	return 0;
}

static bool unix_skb_scm_eq(struct sk_buff *skb,
			    struct scm_cookie *scm)
{
	return UNIXCB(skb).pid == scm->pid &&
	       uid_eq(UNIXCB(skb).uid, scm->creds.uid) &&
	       gid_eq(UNIXCB(skb).gid, scm->creds.gid) &&
	       unix_secdata_eq(scm, skb);
}

static void scm_stat_add(struct sock *sk, struct sk_buff *skb)
{
	struct scm_fp_list *fp = UNIXCB(skb).fp;
	struct unix_sock *u = unix_sk(sk);

	if (unlikely(fp && fp->count)) {
		atomic_add(fp->count, &u->scm_stat.nr_fds);
		unix_add_edges(fp, u);
	}
}

static void scm_stat_del(struct sock *sk, struct sk_buff *skb)
{
	struct scm_fp_list *fp = UNIXCB(skb).fp;
	struct unix_sock *u = unix_sk(sk);

	if (unlikely(fp && fp->count)) {
		atomic_sub(fp->count, &u->scm_stat.nr_fds);
		unix_del_edges(fp);
	}
}

/*
 *	Send AF_UNIX data.
 */
2084 */ 2085 2086 static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg, 2087 size_t len) 2088 { 2089 struct sock *sk = sock->sk, *other = NULL; 2090 struct unix_sock *u = unix_sk(sk); 2091 struct scm_cookie scm; 2092 struct sk_buff *skb; 2093 int data_len = 0; 2094 int sk_locked; 2095 long timeo; 2096 int err; 2097 2098 err = scm_send(sock, msg, &scm, false); 2099 if (err < 0) 2100 return err; 2101 2102 wait_for_unix_gc(scm.fp); 2103 2104 if (msg->msg_flags & MSG_OOB) { 2105 err = -EOPNOTSUPP; 2106 goto out; 2107 } 2108 2109 if (msg->msg_namelen) { 2110 err = unix_validate_addr(msg->msg_name, msg->msg_namelen); 2111 if (err) 2112 goto out; 2113 2114 err = BPF_CGROUP_RUN_PROG_UNIX_SENDMSG_LOCK(sk, 2115 msg->msg_name, 2116 &msg->msg_namelen, 2117 NULL); 2118 if (err) 2119 goto out; 2120 } 2121 2122 if (unix_may_passcred(sk) && !READ_ONCE(u->addr)) { 2123 err = unix_autobind(sk); 2124 if (err) 2125 goto out; 2126 } 2127 2128 if (len > READ_ONCE(sk->sk_sndbuf) - 32) { 2129 err = -EMSGSIZE; 2130 goto out; 2131 } 2132 2133 if (len > SKB_MAX_ALLOC) { 2134 data_len = min_t(size_t, 2135 len - SKB_MAX_ALLOC, 2136 MAX_SKB_FRAGS * PAGE_SIZE); 2137 data_len = PAGE_ALIGN(data_len); 2138 2139 BUILD_BUG_ON(SKB_MAX_ALLOC < PAGE_SIZE); 2140 } 2141 2142 skb = sock_alloc_send_pskb(sk, len - data_len, data_len, 2143 msg->msg_flags & MSG_DONTWAIT, &err, 2144 PAGE_ALLOC_COSTLY_ORDER); 2145 if (!skb) 2146 goto out; 2147 2148 err = unix_scm_to_skb(&scm, skb, true); 2149 if (err < 0) 2150 goto out_free; 2151 2152 skb_put(skb, len - data_len); 2153 skb->data_len = data_len; 2154 skb->len = len; 2155 err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, len); 2156 if (err) 2157 goto out_free; 2158 2159 timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); 2160 2161 if (msg->msg_namelen) { 2162 lookup: 2163 other = unix_find_other(sock_net(sk), msg->msg_name, 2164 msg->msg_namelen, sk->sk_type, 0); 2165 if (IS_ERR(other)) { 2166 err = PTR_ERR(other); 2167 goto out_free; 2168 } 2169 } else { 2170 other = unix_peer_get(sk); 2171 if (!other) { 2172 err = -ENOTCONN; 2173 goto out_free; 2174 } 2175 } 2176 2177 if (sk_filter(other, skb) < 0) { 2178 /* Toss the packet but do not return any error to the sender */ 2179 err = len; 2180 goto out_sock_put; 2181 } 2182 2183 err = unix_maybe_add_creds(skb, sk, other); 2184 if (err) 2185 goto out_sock_put; 2186 2187 restart: 2188 sk_locked = 0; 2189 unix_state_lock(other); 2190 restart_locked: 2191 2192 if (!unix_may_send(sk, other)) { 2193 err = -EPERM; 2194 goto out_unlock; 2195 } 2196 2197 if (unlikely(sock_flag(other, SOCK_DEAD))) { 2198 /* Check with 1003.1g - what should datagram error */ 2199 2200 unix_state_unlock(other); 2201 2202 if (sk->sk_type == SOCK_SEQPACKET) { 2203 /* We are here only when racing with unix_release_sock() 2204 * is clearing @other. Never change state to TCP_CLOSE 2205 * unlike SOCK_DGRAM wants. 
2206 */ 2207 err = -EPIPE; 2208 goto out_sock_put; 2209 } 2210 2211 if (!sk_locked) 2212 unix_state_lock(sk); 2213 2214 if (unix_peer(sk) == other) { 2215 unix_peer(sk) = NULL; 2216 unix_dgram_peer_wake_disconnect_wakeup(sk, other); 2217 2218 WRITE_ONCE(sk->sk_state, TCP_CLOSE); 2219 unix_state_unlock(sk); 2220 2221 unix_dgram_disconnected(sk, other); 2222 sock_put(other); 2223 err = -ECONNREFUSED; 2224 goto out_sock_put; 2225 } 2226 2227 unix_state_unlock(sk); 2228 2229 if (!msg->msg_namelen) { 2230 err = -ECONNRESET; 2231 goto out_sock_put; 2232 } 2233 2234 sock_put(other); 2235 goto lookup; 2236 } 2237 2238 if (other->sk_shutdown & RCV_SHUTDOWN) { 2239 err = -EPIPE; 2240 goto out_unlock; 2241 } 2242 2243 if (UNIXCB(skb).fp && !other->sk_scm_rights) { 2244 err = -EPERM; 2245 goto out_unlock; 2246 } 2247 2248 if (sk->sk_type != SOCK_SEQPACKET) { 2249 err = security_unix_may_send(sk->sk_socket, other->sk_socket); 2250 if (err) 2251 goto out_unlock; 2252 } 2253 2254 /* other == sk && unix_peer(other) != sk if 2255 * - unix_peer(sk) == NULL, destination address bound to sk 2256 * - unix_peer(sk) == sk by time of get but disconnected before lock 2257 */ 2258 if (other != sk && 2259 unlikely(unix_peer(other) != sk && 2260 unix_recvq_full_lockless(other))) { 2261 if (timeo) { 2262 timeo = unix_wait_for_peer(other, timeo); 2263 2264 err = sock_intr_errno(timeo); 2265 if (signal_pending(current)) 2266 goto out_sock_put; 2267 2268 goto restart; 2269 } 2270 2271 if (!sk_locked) { 2272 unix_state_unlock(other); 2273 unix_state_double_lock(sk, other); 2274 } 2275 2276 if (unix_peer(sk) != other || 2277 unix_dgram_peer_wake_me(sk, other)) { 2278 err = -EAGAIN; 2279 sk_locked = 1; 2280 goto out_unlock; 2281 } 2282 2283 if (!sk_locked) { 2284 sk_locked = 1; 2285 goto restart_locked; 2286 } 2287 } 2288 2289 if (unlikely(sk_locked)) 2290 unix_state_unlock(sk); 2291 2292 if (sock_flag(other, SOCK_RCVTSTAMP)) 2293 __net_timestamp(skb); 2294 2295 scm_stat_add(other, skb); 2296 skb_queue_tail(&other->sk_receive_queue, skb); 2297 unix_state_unlock(other); 2298 other->sk_data_ready(other); 2299 sock_put(other); 2300 scm_destroy(&scm); 2301 return len; 2302 2303 out_unlock: 2304 if (sk_locked) 2305 unix_state_unlock(sk); 2306 unix_state_unlock(other); 2307 out_sock_put: 2308 sock_put(other); 2309 out_free: 2310 consume_skb(skb); 2311 out: 2312 scm_destroy(&scm); 2313 return err; 2314 } 2315 2316 /* We use paged skbs for stream sockets, and limit occupancy to 32768 2317 * bytes, and a minimum of a full page. 
2318 */ 2319 #define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768)) 2320 2321 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 2322 static int queue_oob(struct sock *sk, struct msghdr *msg, struct sock *other, 2323 struct scm_cookie *scm, bool fds_sent) 2324 { 2325 struct unix_sock *ousk = unix_sk(other); 2326 struct sk_buff *skb; 2327 int err; 2328 2329 skb = sock_alloc_send_skb(sk, 1, msg->msg_flags & MSG_DONTWAIT, &err); 2330 2331 if (!skb) 2332 return err; 2333 2334 err = unix_scm_to_skb(scm, skb, !fds_sent); 2335 if (err < 0) 2336 goto out; 2337 2338 err = unix_maybe_add_creds(skb, sk, other); 2339 if (err) 2340 goto out; 2341 2342 skb_put(skb, 1); 2343 err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, 1); 2344 2345 if (err) 2346 goto out; 2347 2348 unix_state_lock(other); 2349 2350 if (sock_flag(other, SOCK_DEAD) || 2351 (other->sk_shutdown & RCV_SHUTDOWN)) { 2352 err = -EPIPE; 2353 goto out_unlock; 2354 } 2355 2356 if (UNIXCB(skb).fp && !other->sk_scm_rights) { 2357 err = -EPERM; 2358 goto out_unlock; 2359 } 2360 2361 scm_stat_add(other, skb); 2362 2363 spin_lock(&other->sk_receive_queue.lock); 2364 WRITE_ONCE(ousk->oob_skb, skb); 2365 WRITE_ONCE(ousk->inq_len, ousk->inq_len + 1); 2366 __skb_queue_tail(&other->sk_receive_queue, skb); 2367 spin_unlock(&other->sk_receive_queue.lock); 2368 2369 sk_send_sigurg(other); 2370 unix_state_unlock(other); 2371 other->sk_data_ready(other); 2372 2373 return 0; 2374 out_unlock: 2375 unix_state_unlock(other); 2376 out: 2377 consume_skb(skb); 2378 return err; 2379 } 2380 #endif 2381 2382 static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, 2383 size_t len) 2384 { 2385 struct sock *sk = sock->sk; 2386 struct sk_buff *skb = NULL; 2387 struct sock *other = NULL; 2388 struct unix_sock *otheru; 2389 struct scm_cookie scm; 2390 bool fds_sent = false; 2391 int err, sent = 0; 2392 2393 err = scm_send(sock, msg, &scm, false); 2394 if (err < 0) 2395 return err; 2396 2397 wait_for_unix_gc(scm.fp); 2398 2399 if (msg->msg_flags & MSG_OOB) { 2400 err = -EOPNOTSUPP; 2401 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 2402 if (len) 2403 len--; 2404 else 2405 #endif 2406 goto out_err; 2407 } 2408 2409 if (msg->msg_namelen) { 2410 err = READ_ONCE(sk->sk_state) == TCP_ESTABLISHED ? 
-EISCONN : -EOPNOTSUPP; 2411 goto out_err; 2412 } 2413 2414 other = unix_peer(sk); 2415 if (!other) { 2416 err = -ENOTCONN; 2417 goto out_err; 2418 } 2419 2420 otheru = unix_sk(other); 2421 2422 if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN) 2423 goto out_pipe; 2424 2425 while (sent < len) { 2426 int size = len - sent; 2427 int data_len; 2428 2429 if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) { 2430 skb = sock_alloc_send_pskb(sk, 0, 0, 2431 msg->msg_flags & MSG_DONTWAIT, 2432 &err, 0); 2433 } else { 2434 /* Keep two messages in the pipe so it schedules better */ 2435 size = min_t(int, size, (READ_ONCE(sk->sk_sndbuf) >> 1) - 64); 2436 2437 /* allow fallback to order-0 allocations */ 2438 size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ); 2439 2440 data_len = max_t(int, 0, size - SKB_MAX_HEAD(0)); 2441 2442 data_len = min_t(size_t, size, PAGE_ALIGN(data_len)); 2443 2444 skb = sock_alloc_send_pskb(sk, size - data_len, data_len, 2445 msg->msg_flags & MSG_DONTWAIT, &err, 2446 get_order(UNIX_SKB_FRAGS_SZ)); 2447 } 2448 if (!skb) 2449 goto out_err; 2450 2451 /* Only send the fds in the first buffer */ 2452 err = unix_scm_to_skb(&scm, skb, !fds_sent); 2453 if (err < 0) 2454 goto out_free; 2455 2456 fds_sent = true; 2457 2458 err = unix_maybe_add_creds(skb, sk, other); 2459 if (err) 2460 goto out_free; 2461 2462 if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) { 2463 skb->ip_summed = CHECKSUM_UNNECESSARY; 2464 err = skb_splice_from_iter(skb, &msg->msg_iter, size); 2465 if (err < 0) 2466 goto out_free; 2467 2468 size = err; 2469 refcount_add(size, &sk->sk_wmem_alloc); 2470 } else { 2471 skb_put(skb, size - data_len); 2472 skb->data_len = data_len; 2473 skb->len = size; 2474 err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size); 2475 if (err) 2476 goto out_free; 2477 } 2478 2479 unix_state_lock(other); 2480 2481 if (sock_flag(other, SOCK_DEAD) || 2482 (other->sk_shutdown & RCV_SHUTDOWN)) 2483 goto out_pipe_unlock; 2484 2485 if (UNIXCB(skb).fp && !other->sk_scm_rights) { 2486 unix_state_unlock(other); 2487 err = -EPERM; 2488 goto out_free; 2489 } 2490 2491 scm_stat_add(other, skb); 2492 2493 spin_lock(&other->sk_receive_queue.lock); 2494 WRITE_ONCE(otheru->inq_len, otheru->inq_len + skb->len); 2495 __skb_queue_tail(&other->sk_receive_queue, skb); 2496 spin_unlock(&other->sk_receive_queue.lock); 2497 2498 unix_state_unlock(other); 2499 other->sk_data_ready(other); 2500 sent += size; 2501 } 2502 2503 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 2504 if (msg->msg_flags & MSG_OOB) { 2505 err = queue_oob(sk, msg, other, &scm, fds_sent); 2506 if (err) 2507 goto out_err; 2508 sent++; 2509 } 2510 #endif 2511 2512 scm_destroy(&scm); 2513 2514 return sent; 2515 2516 out_pipe_unlock: 2517 unix_state_unlock(other); 2518 out_pipe: 2519 if (!sent && !(msg->msg_flags & MSG_NOSIGNAL)) 2520 send_sig(SIGPIPE, current, 0); 2521 err = -EPIPE; 2522 out_free: 2523 consume_skb(skb); 2524 out_err: 2525 scm_destroy(&scm); 2526 return sent ? 
: err; 2527 } 2528 2529 static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg, 2530 size_t len) 2531 { 2532 int err; 2533 struct sock *sk = sock->sk; 2534 2535 err = sock_error(sk); 2536 if (err) 2537 return err; 2538 2539 if (READ_ONCE(sk->sk_state) != TCP_ESTABLISHED) 2540 return -ENOTCONN; 2541 2542 if (msg->msg_namelen) 2543 msg->msg_namelen = 0; 2544 2545 return unix_dgram_sendmsg(sock, msg, len); 2546 } 2547 2548 static int unix_seqpacket_recvmsg(struct socket *sock, struct msghdr *msg, 2549 size_t size, int flags) 2550 { 2551 struct sock *sk = sock->sk; 2552 2553 if (READ_ONCE(sk->sk_state) != TCP_ESTABLISHED) 2554 return -ENOTCONN; 2555 2556 return unix_dgram_recvmsg(sock, msg, size, flags); 2557 } 2558 2559 static void unix_copy_addr(struct msghdr *msg, struct sock *sk) 2560 { 2561 struct unix_address *addr = smp_load_acquire(&unix_sk(sk)->addr); 2562 2563 if (addr) { 2564 msg->msg_namelen = addr->len; 2565 memcpy(msg->msg_name, addr->name, addr->len); 2566 } 2567 } 2568 2569 int __unix_dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t size, 2570 int flags) 2571 { 2572 struct scm_cookie scm; 2573 struct socket *sock = sk->sk_socket; 2574 struct unix_sock *u = unix_sk(sk); 2575 struct sk_buff *skb, *last; 2576 long timeo; 2577 int skip; 2578 int err; 2579 2580 err = -EOPNOTSUPP; 2581 if (flags&MSG_OOB) 2582 goto out; 2583 2584 timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); 2585 2586 do { 2587 mutex_lock(&u->iolock); 2588 2589 skip = sk_peek_offset(sk, flags); 2590 skb = __skb_try_recv_datagram(sk, &sk->sk_receive_queue, flags, 2591 &skip, &err, &last); 2592 if (skb) { 2593 if (!(flags & MSG_PEEK)) 2594 scm_stat_del(sk, skb); 2595 break; 2596 } 2597 2598 mutex_unlock(&u->iolock); 2599 2600 if (err != -EAGAIN) 2601 break; 2602 } while (timeo && 2603 !__skb_wait_for_more_packets(sk, &sk->sk_receive_queue, 2604 &err, &timeo, last)); 2605 2606 if (!skb) { /* implies iolock unlocked */ 2607 /* Signal EOF on disconnected non-blocking SEQPACKET socket. */ 2608 if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN && 2609 (READ_ONCE(sk->sk_shutdown) & RCV_SHUTDOWN)) 2610 err = 0; 2611 goto out; 2612 } 2613 2614 if (wq_has_sleeper(&u->peer_wait)) 2615 wake_up_interruptible_sync_poll(&u->peer_wait, 2616 EPOLLOUT | EPOLLWRNORM | 2617 EPOLLWRBAND); 2618 2619 if (msg->msg_name) { 2620 unix_copy_addr(msg, skb->sk); 2621 2622 BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk, 2623 msg->msg_name, 2624 &msg->msg_namelen); 2625 } 2626 2627 if (size > skb->len - skip) 2628 size = skb->len - skip; 2629 else if (size < skb->len - skip) 2630 msg->msg_flags |= MSG_TRUNC; 2631 2632 err = skb_copy_datagram_msg(skb, skip, msg, size); 2633 if (err) 2634 goto out_free; 2635 2636 if (sock_flag(sk, SOCK_RCVTSTAMP)) 2637 __sock_recv_timestamp(msg, sk, skb); 2638 2639 memset(&scm, 0, sizeof(scm)); 2640 2641 unix_skb_to_scm(skb, &scm); 2642 2643 if (!(flags & MSG_PEEK)) { 2644 if (UNIXCB(skb).fp) 2645 unix_detach_fds(&scm, skb); 2646 2647 sk_peek_offset_bwd(sk, skb->len); 2648 } else { 2649 /* It is questionable: on PEEK we could: 2650 - do not return fds - good, but too simple 8) 2651 - return fds, and do not return them on read (old strategy, 2652 apparently wrong) 2653 - clone fds (I chose it for now, it is the most universal 2654 solution) 2655 2656 POSIX 1003.1g does not actually define this clearly 2657 at all. POSIX 1003.1g doesn't define a lot of things 2658 clearly however! 
2659 2660 */ 2661 2662 sk_peek_offset_fwd(sk, size); 2663 2664 if (UNIXCB(skb).fp) 2665 unix_peek_fds(&scm, skb); 2666 } 2667 err = (flags & MSG_TRUNC) ? skb->len - skip : size; 2668 2669 scm_recv_unix(sock, msg, &scm, flags); 2670 2671 out_free: 2672 skb_free_datagram(sk, skb); 2673 mutex_unlock(&u->iolock); 2674 out: 2675 return err; 2676 } 2677 2678 static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, 2679 int flags) 2680 { 2681 struct sock *sk = sock->sk; 2682 2683 #ifdef CONFIG_BPF_SYSCALL 2684 const struct proto *prot = READ_ONCE(sk->sk_prot); 2685 2686 if (prot != &unix_dgram_proto) 2687 return prot->recvmsg(sk, msg, size, flags, NULL); 2688 #endif 2689 return __unix_dgram_recvmsg(sk, msg, size, flags); 2690 } 2691 2692 static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor) 2693 { 2694 struct unix_sock *u = unix_sk(sk); 2695 struct sk_buff *skb; 2696 int err; 2697 2698 mutex_lock(&u->iolock); 2699 skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err); 2700 mutex_unlock(&u->iolock); 2701 if (!skb) 2702 return err; 2703 2704 return recv_actor(sk, skb); 2705 } 2706 2707 /* 2708 * Sleep until more data has arrived. But check for races.. 2709 */ 2710 static long unix_stream_data_wait(struct sock *sk, long timeo, 2711 struct sk_buff *last, unsigned int last_len, 2712 bool freezable) 2713 { 2714 unsigned int state = TASK_INTERRUPTIBLE | freezable * TASK_FREEZABLE; 2715 struct sk_buff *tail; 2716 DEFINE_WAIT(wait); 2717 2718 unix_state_lock(sk); 2719 2720 for (;;) { 2721 prepare_to_wait(sk_sleep(sk), &wait, state); 2722 2723 tail = skb_peek_tail(&sk->sk_receive_queue); 2724 if (tail != last || 2725 (tail && tail->len != last_len) || 2726 sk->sk_err || 2727 (sk->sk_shutdown & RCV_SHUTDOWN) || 2728 signal_pending(current) || 2729 !timeo) 2730 break; 2731 2732 sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); 2733 unix_state_unlock(sk); 2734 timeo = schedule_timeout(timeo); 2735 unix_state_lock(sk); 2736 2737 if (sock_flag(sk, SOCK_DEAD)) 2738 break; 2739 2740 sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); 2741 } 2742 2743 finish_wait(sk_sleep(sk), &wait); 2744 unix_state_unlock(sk); 2745 return timeo; 2746 } 2747 2748 struct unix_stream_read_state { 2749 int (*recv_actor)(struct sk_buff *, int, int, 2750 struct unix_stream_read_state *); 2751 struct socket *socket; 2752 struct msghdr *msg; 2753 struct pipe_inode_info *pipe; 2754 size_t size; 2755 int flags; 2756 unsigned int splice_flags; 2757 }; 2758 2759 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 2760 static int unix_stream_recv_urg(struct unix_stream_read_state *state) 2761 { 2762 struct sk_buff *oob_skb, *read_skb = NULL; 2763 struct socket *sock = state->socket; 2764 struct sock *sk = sock->sk; 2765 struct unix_sock *u = unix_sk(sk); 2766 int chunk = 1; 2767 2768 mutex_lock(&u->iolock); 2769 unix_state_lock(sk); 2770 spin_lock(&sk->sk_receive_queue.lock); 2771 2772 if (sock_flag(sk, SOCK_URGINLINE) || !u->oob_skb) { 2773 spin_unlock(&sk->sk_receive_queue.lock); 2774 unix_state_unlock(sk); 2775 mutex_unlock(&u->iolock); 2776 return -EINVAL; 2777 } 2778 2779 oob_skb = u->oob_skb; 2780 2781 if (!(state->flags & MSG_PEEK)) { 2782 WRITE_ONCE(u->oob_skb, NULL); 2783 WRITE_ONCE(u->inq_len, u->inq_len - 1); 2784 2785 if (oob_skb->prev != (struct sk_buff *)&sk->sk_receive_queue && 2786 !unix_skb_len(oob_skb->prev)) { 2787 read_skb = oob_skb->prev; 2788 __skb_unlink(read_skb, &sk->sk_receive_queue); 2789 } 2790 } 2791 2792 spin_unlock(&sk->sk_receive_queue.lock); 2793 unix_state_unlock(sk); 2794 2795 chunk = 
state->recv_actor(oob_skb, 0, chunk, state); 2796 2797 if (!(state->flags & MSG_PEEK)) 2798 UNIXCB(oob_skb).consumed += 1; 2799 2800 mutex_unlock(&u->iolock); 2801 2802 consume_skb(read_skb); 2803 2804 if (chunk < 0) 2805 return -EFAULT; 2806 2807 state->msg->msg_flags |= MSG_OOB; 2808 return 1; 2809 } 2810 2811 static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk, 2812 int flags, int copied) 2813 { 2814 struct sk_buff *read_skb = NULL, *unread_skb = NULL; 2815 struct unix_sock *u = unix_sk(sk); 2816 2817 if (likely(unix_skb_len(skb) && skb != READ_ONCE(u->oob_skb))) 2818 return skb; 2819 2820 spin_lock(&sk->sk_receive_queue.lock); 2821 2822 if (!unix_skb_len(skb)) { 2823 if (copied && (!u->oob_skb || skb == u->oob_skb)) { 2824 skb = NULL; 2825 } else if (flags & MSG_PEEK) { 2826 skb = skb_peek_next(skb, &sk->sk_receive_queue); 2827 } else { 2828 read_skb = skb; 2829 skb = skb_peek_next(skb, &sk->sk_receive_queue); 2830 __skb_unlink(read_skb, &sk->sk_receive_queue); 2831 } 2832 2833 if (!skb) 2834 goto unlock; 2835 } 2836 2837 if (skb != u->oob_skb) 2838 goto unlock; 2839 2840 if (copied) { 2841 skb = NULL; 2842 } else if (!(flags & MSG_PEEK)) { 2843 WRITE_ONCE(u->oob_skb, NULL); 2844 2845 if (!sock_flag(sk, SOCK_URGINLINE)) { 2846 __skb_unlink(skb, &sk->sk_receive_queue); 2847 unread_skb = skb; 2848 skb = skb_peek(&sk->sk_receive_queue); 2849 } 2850 } else if (!sock_flag(sk, SOCK_URGINLINE)) { 2851 skb = skb_peek_next(skb, &sk->sk_receive_queue); 2852 } 2853 2854 unlock: 2855 spin_unlock(&sk->sk_receive_queue.lock); 2856 2857 consume_skb(read_skb); 2858 kfree_skb_reason(unread_skb, SKB_DROP_REASON_UNIX_SKIP_OOB); 2859 2860 return skb; 2861 } 2862 #endif 2863 2864 static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor) 2865 { 2866 struct sk_buff_head *queue = &sk->sk_receive_queue; 2867 struct unix_sock *u = unix_sk(sk); 2868 struct sk_buff *skb; 2869 int err; 2870 2871 if (unlikely(READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)) 2872 return -ENOTCONN; 2873 2874 err = sock_error(sk); 2875 if (err) 2876 return err; 2877 2878 mutex_lock(&u->iolock); 2879 spin_lock(&queue->lock); 2880 2881 skb = __skb_dequeue(queue); 2882 if (!skb) { 2883 spin_unlock(&queue->lock); 2884 mutex_unlock(&u->iolock); 2885 return -EAGAIN; 2886 } 2887 2888 WRITE_ONCE(u->inq_len, u->inq_len - skb->len); 2889 2890 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 2891 if (skb == u->oob_skb) { 2892 WRITE_ONCE(u->oob_skb, NULL); 2893 spin_unlock(&queue->lock); 2894 mutex_unlock(&u->iolock); 2895 2896 kfree_skb_reason(skb, SKB_DROP_REASON_UNIX_SKIP_OOB); 2897 return -EAGAIN; 2898 } 2899 #endif 2900 2901 spin_unlock(&queue->lock); 2902 mutex_unlock(&u->iolock); 2903 2904 return recv_actor(sk, skb); 2905 } 2906 2907 static int unix_stream_read_generic(struct unix_stream_read_state *state, 2908 bool freezable) 2909 { 2910 int noblock = state->flags & MSG_DONTWAIT; 2911 struct socket *sock = state->socket; 2912 struct msghdr *msg = state->msg; 2913 struct sock *sk = sock->sk; 2914 size_t size = state->size; 2915 int flags = state->flags; 2916 bool check_creds = false; 2917 struct scm_cookie scm; 2918 unsigned int last_len; 2919 struct unix_sock *u; 2920 int copied = 0; 2921 int err = 0; 2922 long timeo; 2923 int target; 2924 int skip; 2925 2926 if (unlikely(READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)) { 2927 err = -EINVAL; 2928 goto out; 2929 } 2930 2931 if (unlikely(flags & MSG_OOB)) { 2932 err = -EOPNOTSUPP; 2933 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 2934 err = unix_stream_recv_urg(state); 2935 #endif 
2936 goto out; 2937 } 2938 2939 target = sock_rcvlowat(sk, flags & MSG_WAITALL, size); 2940 timeo = sock_rcvtimeo(sk, noblock); 2941 2942 memset(&scm, 0, sizeof(scm)); 2943 2944 u = unix_sk(sk); 2945 2946 /* Lock the socket to prevent queue disordering 2947 * while sleeps in memcpy_tomsg 2948 */ 2949 mutex_lock(&u->iolock); 2950 2951 skip = max(sk_peek_offset(sk, flags), 0); 2952 2953 do { 2954 struct sk_buff *skb, *last; 2955 int chunk; 2956 2957 redo: 2958 unix_state_lock(sk); 2959 if (sock_flag(sk, SOCK_DEAD)) { 2960 err = -ECONNRESET; 2961 goto unlock; 2962 } 2963 last = skb = skb_peek(&sk->sk_receive_queue); 2964 last_len = last ? last->len : 0; 2965 2966 again: 2967 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 2968 if (skb) { 2969 skb = manage_oob(skb, sk, flags, copied); 2970 if (!skb && copied) { 2971 unix_state_unlock(sk); 2972 break; 2973 } 2974 } 2975 #endif 2976 if (skb == NULL) { 2977 if (copied >= target) 2978 goto unlock; 2979 2980 /* 2981 * POSIX 1003.1g mandates this order. 2982 */ 2983 2984 err = sock_error(sk); 2985 if (err) 2986 goto unlock; 2987 if (sk->sk_shutdown & RCV_SHUTDOWN) 2988 goto unlock; 2989 2990 unix_state_unlock(sk); 2991 if (!timeo) { 2992 err = -EAGAIN; 2993 break; 2994 } 2995 2996 mutex_unlock(&u->iolock); 2997 2998 timeo = unix_stream_data_wait(sk, timeo, last, 2999 last_len, freezable); 3000 3001 if (signal_pending(current)) { 3002 err = sock_intr_errno(timeo); 3003 scm_destroy(&scm); 3004 goto out; 3005 } 3006 3007 mutex_lock(&u->iolock); 3008 goto redo; 3009 unlock: 3010 unix_state_unlock(sk); 3011 break; 3012 } 3013 3014 while (skip >= unix_skb_len(skb)) { 3015 skip -= unix_skb_len(skb); 3016 last = skb; 3017 last_len = skb->len; 3018 skb = skb_peek_next(skb, &sk->sk_receive_queue); 3019 if (!skb) 3020 goto again; 3021 } 3022 3023 unix_state_unlock(sk); 3024 3025 if (check_creds) { 3026 /* Never glue messages from different writers */ 3027 if (!unix_skb_scm_eq(skb, &scm)) 3028 break; 3029 } else if (unix_may_passcred(sk)) { 3030 /* Copy credentials */ 3031 unix_skb_to_scm(skb, &scm); 3032 check_creds = true; 3033 } 3034 3035 /* Copy address just once */ 3036 if (msg && msg->msg_name) { 3037 DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name); 3038 3039 unix_copy_addr(msg, skb->sk); 3040 BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk, msg->msg_name, 3041 &msg->msg_namelen); 3042 3043 sunaddr = NULL; 3044 } 3045 3046 chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size); 3047 chunk = state->recv_actor(skb, skip, chunk, state); 3048 if (chunk < 0) { 3049 if (copied == 0) 3050 copied = -EFAULT; 3051 break; 3052 } 3053 copied += chunk; 3054 size -= chunk; 3055 3056 /* Mark read part of skb as used */ 3057 if (!(flags & MSG_PEEK)) { 3058 UNIXCB(skb).consumed += chunk; 3059 3060 sk_peek_offset_bwd(sk, chunk); 3061 3062 if (UNIXCB(skb).fp) { 3063 scm_stat_del(sk, skb); 3064 unix_detach_fds(&scm, skb); 3065 } 3066 3067 if (unix_skb_len(skb)) 3068 break; 3069 3070 spin_lock(&sk->sk_receive_queue.lock); 3071 WRITE_ONCE(u->inq_len, u->inq_len - skb->len); 3072 __skb_unlink(skb, &sk->sk_receive_queue); 3073 spin_unlock(&sk->sk_receive_queue.lock); 3074 3075 consume_skb(skb); 3076 3077 if (scm.fp) 3078 break; 3079 } else { 3080 /* It is questionable, see note in unix_dgram_recvmsg. 
3081 */ 3082 if (UNIXCB(skb).fp) 3083 unix_peek_fds(&scm, skb); 3084 3085 sk_peek_offset_fwd(sk, chunk); 3086 3087 if (UNIXCB(skb).fp) 3088 break; 3089 3090 skip = 0; 3091 last = skb; 3092 last_len = skb->len; 3093 unix_state_lock(sk); 3094 skb = skb_peek_next(skb, &sk->sk_receive_queue); 3095 if (skb) 3096 goto again; 3097 unix_state_unlock(sk); 3098 break; 3099 } 3100 } while (size); 3101 3102 mutex_unlock(&u->iolock); 3103 if (msg) { 3104 scm_recv_unix(sock, msg, &scm, flags); 3105 3106 if (READ_ONCE(u->recvmsg_inq) || msg->msg_get_inq) { 3107 msg->msg_inq = READ_ONCE(u->inq_len); 3108 put_cmsg(msg, SOL_SOCKET, SCM_INQ, 3109 sizeof(msg->msg_inq), &msg->msg_inq); 3110 } 3111 } else { 3112 scm_destroy(&scm); 3113 } 3114 out: 3115 return copied ? : err; 3116 } 3117 3118 static int unix_stream_read_actor(struct sk_buff *skb, 3119 int skip, int chunk, 3120 struct unix_stream_read_state *state) 3121 { 3122 int ret; 3123 3124 ret = skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip, 3125 state->msg, chunk); 3126 return ret ?: chunk; 3127 } 3128 3129 int __unix_stream_recvmsg(struct sock *sk, struct msghdr *msg, 3130 size_t size, int flags) 3131 { 3132 struct unix_stream_read_state state = { 3133 .recv_actor = unix_stream_read_actor, 3134 .socket = sk->sk_socket, 3135 .msg = msg, 3136 .size = size, 3137 .flags = flags 3138 }; 3139 3140 return unix_stream_read_generic(&state, true); 3141 } 3142 3143 static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg, 3144 size_t size, int flags) 3145 { 3146 struct unix_stream_read_state state = { 3147 .recv_actor = unix_stream_read_actor, 3148 .socket = sock, 3149 .msg = msg, 3150 .size = size, 3151 .flags = flags 3152 }; 3153 3154 #ifdef CONFIG_BPF_SYSCALL 3155 struct sock *sk = sock->sk; 3156 const struct proto *prot = READ_ONCE(sk->sk_prot); 3157 3158 if (prot != &unix_stream_proto) 3159 return prot->recvmsg(sk, msg, size, flags, NULL); 3160 #endif 3161 return unix_stream_read_generic(&state, true); 3162 } 3163 3164 static int unix_stream_splice_actor(struct sk_buff *skb, 3165 int skip, int chunk, 3166 struct unix_stream_read_state *state) 3167 { 3168 return skb_splice_bits(skb, state->socket->sk, 3169 UNIXCB(skb).consumed + skip, 3170 state->pipe, chunk, state->splice_flags); 3171 } 3172 3173 static ssize_t unix_stream_splice_read(struct socket *sock, loff_t *ppos, 3174 struct pipe_inode_info *pipe, 3175 size_t size, unsigned int flags) 3176 { 3177 struct unix_stream_read_state state = { 3178 .recv_actor = unix_stream_splice_actor, 3179 .socket = sock, 3180 .pipe = pipe, 3181 .size = size, 3182 .splice_flags = flags, 3183 }; 3184 3185 if (unlikely(*ppos)) 3186 return -ESPIPE; 3187 3188 if (sock->file->f_flags & O_NONBLOCK || 3189 flags & SPLICE_F_NONBLOCK) 3190 state.flags = MSG_DONTWAIT; 3191 3192 return unix_stream_read_generic(&state, false); 3193 } 3194 3195 static int unix_shutdown(struct socket *sock, int mode) 3196 { 3197 struct sock *sk = sock->sk; 3198 struct sock *other; 3199 3200 if (mode < SHUT_RD || mode > SHUT_RDWR) 3201 return -EINVAL; 3202 /* This maps: 3203 * SHUT_RD (0) -> RCV_SHUTDOWN (1) 3204 * SHUT_WR (1) -> SEND_SHUTDOWN (2) 3205 * SHUT_RDWR (2) -> SHUTDOWN_MASK (3) 3206 */ 3207 ++mode; 3208 3209 unix_state_lock(sk); 3210 WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | mode); 3211 other = unix_peer(sk); 3212 if (other) 3213 sock_hold(other); 3214 unix_state_unlock(sk); 3215 sk->sk_state_change(sk); 3216 3217 if (other && 3218 (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) { 3219 3220 int peer_mode = 
0; 3221 const struct proto *prot = READ_ONCE(other->sk_prot); 3222 3223 if (prot->unhash) 3224 prot->unhash(other); 3225 if (mode&RCV_SHUTDOWN) 3226 peer_mode |= SEND_SHUTDOWN; 3227 if (mode&SEND_SHUTDOWN) 3228 peer_mode |= RCV_SHUTDOWN; 3229 unix_state_lock(other); 3230 WRITE_ONCE(other->sk_shutdown, other->sk_shutdown | peer_mode); 3231 unix_state_unlock(other); 3232 other->sk_state_change(other); 3233 if (peer_mode == SHUTDOWN_MASK) 3234 sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP); 3235 else if (peer_mode & RCV_SHUTDOWN) 3236 sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN); 3237 } 3238 if (other) 3239 sock_put(other); 3240 3241 return 0; 3242 } 3243 3244 long unix_inq_len(struct sock *sk) 3245 { 3246 struct sk_buff *skb; 3247 long amount = 0; 3248 3249 if (READ_ONCE(sk->sk_state) == TCP_LISTEN) 3250 return -EINVAL; 3251 3252 if (sk->sk_type == SOCK_STREAM) 3253 return READ_ONCE(unix_sk(sk)->inq_len); 3254 3255 spin_lock(&sk->sk_receive_queue.lock); 3256 if (sk->sk_type == SOCK_SEQPACKET) { 3257 skb_queue_walk(&sk->sk_receive_queue, skb) 3258 amount += unix_skb_len(skb); 3259 } else { 3260 skb = skb_peek(&sk->sk_receive_queue); 3261 if (skb) 3262 amount = skb->len; 3263 } 3264 spin_unlock(&sk->sk_receive_queue.lock); 3265 3266 return amount; 3267 } 3268 EXPORT_SYMBOL_GPL(unix_inq_len); 3269 3270 long unix_outq_len(struct sock *sk) 3271 { 3272 return sk_wmem_alloc_get(sk); 3273 } 3274 EXPORT_SYMBOL_GPL(unix_outq_len); 3275 3276 static int unix_open_file(struct sock *sk) 3277 { 3278 struct file *f; 3279 int fd; 3280 3281 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) 3282 return -EPERM; 3283 3284 if (!smp_load_acquire(&unix_sk(sk)->addr)) 3285 return -ENOENT; 3286 3287 if (!unix_sk(sk)->path.dentry) 3288 return -ENOENT; 3289 3290 fd = get_unused_fd_flags(O_CLOEXEC); 3291 if (fd < 0) 3292 return fd; 3293 3294 f = dentry_open(&unix_sk(sk)->path, O_PATH, current_cred()); 3295 if (IS_ERR(f)) { 3296 put_unused_fd(fd); 3297 return PTR_ERR(f); 3298 } 3299 3300 fd_install(fd, f); 3301 return fd; 3302 } 3303 3304 static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) 3305 { 3306 struct sock *sk = sock->sk; 3307 long amount = 0; 3308 int err; 3309 3310 switch (cmd) { 3311 case SIOCOUTQ: 3312 amount = unix_outq_len(sk); 3313 err = put_user(amount, (int __user *)arg); 3314 break; 3315 case SIOCINQ: 3316 amount = unix_inq_len(sk); 3317 if (amount < 0) 3318 err = amount; 3319 else 3320 err = put_user(amount, (int __user *)arg); 3321 break; 3322 case SIOCUNIXFILE: 3323 err = unix_open_file(sk); 3324 break; 3325 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 3326 case SIOCATMARK: 3327 { 3328 struct unix_sock *u = unix_sk(sk); 3329 struct sk_buff *skb; 3330 int answ = 0; 3331 3332 mutex_lock(&u->iolock); 3333 3334 skb = skb_peek(&sk->sk_receive_queue); 3335 if (skb) { 3336 struct sk_buff *oob_skb = READ_ONCE(u->oob_skb); 3337 struct sk_buff *next_skb; 3338 3339 next_skb = skb_peek_next(skb, &sk->sk_receive_queue); 3340 3341 if (skb == oob_skb || 3342 (!unix_skb_len(skb) && 3343 (!oob_skb || next_skb == oob_skb))) 3344 answ = 1; 3345 } 3346 3347 mutex_unlock(&u->iolock); 3348 3349 err = put_user(answ, (int __user *)arg); 3350 } 3351 break; 3352 #endif 3353 default: 3354 err = -ENOIOCTLCMD; 3355 break; 3356 } 3357 return err; 3358 } 3359 3360 #ifdef CONFIG_COMPAT 3361 static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) 3362 { 3363 return unix_ioctl(sock, cmd, (unsigned long)compat_ptr(arg)); 3364 } 3365 #endif 3366 3367 static __poll_t 
unix_poll(struct file *file, struct socket *sock, poll_table *wait)
{
	struct sock *sk = sock->sk;
	unsigned char state;
	__poll_t mask;
	u8 shutdown;

	sock_poll_wait(file, sock, wait);
	mask = 0;
	shutdown = READ_ONCE(sk->sk_shutdown);
	state = READ_ONCE(sk->sk_state);

	/* exceptional events? */
	if (READ_ONCE(sk->sk_err))
		mask |= EPOLLERR;
	if (shutdown == SHUTDOWN_MASK)
		mask |= EPOLLHUP;
	if (shutdown & RCV_SHUTDOWN)
		mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;

	/* readable? */
	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
		mask |= EPOLLIN | EPOLLRDNORM;
	if (sk_is_readable(sk))
		mask |= EPOLLIN | EPOLLRDNORM;
#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
	if (READ_ONCE(unix_sk(sk)->oob_skb))
		mask |= EPOLLPRI;
#endif

	/* Connection-based sockets need to check for termination and startup */
	if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) &&
	    state == TCP_CLOSE)
		mask |= EPOLLHUP;

	/*
	 * We also report the socket as writable when the other side has
	 * shut down the connection. This prevents stuck sockets.
	 */
	if (unix_writable(sk, state))
		mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;

	return mask;
}

static __poll_t unix_dgram_poll(struct file *file, struct socket *sock,
				poll_table *wait)
{
	struct sock *sk = sock->sk, *other;
	unsigned int writable;
	unsigned char state;
	__poll_t mask;
	u8 shutdown;

	sock_poll_wait(file, sock, wait);
	mask = 0;
	shutdown = READ_ONCE(sk->sk_shutdown);
	state = READ_ONCE(sk->sk_state);

	/* exceptional events? */
	if (READ_ONCE(sk->sk_err) ||
	    !skb_queue_empty_lockless(&sk->sk_error_queue))
		mask |= EPOLLERR |
			(sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0);

	if (shutdown & RCV_SHUTDOWN)
		mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
	if (shutdown == SHUTDOWN_MASK)
		mask |= EPOLLHUP;

	/* readable? */
	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
		mask |= EPOLLIN | EPOLLRDNORM;
	if (sk_is_readable(sk))
		mask |= EPOLLIN | EPOLLRDNORM;

	/* Connection-based sockets need to check for termination and startup */
	if (sk->sk_type == SOCK_SEQPACKET && state == TCP_CLOSE)
		mask |= EPOLLHUP;

	/* No write status requested, avoid expensive OUT tests.
*/ 3448 if (!(poll_requested_events(wait) & (EPOLLWRBAND|EPOLLWRNORM|EPOLLOUT))) 3449 return mask; 3450 3451 writable = unix_writable(sk, state); 3452 if (writable) { 3453 unix_state_lock(sk); 3454 3455 other = unix_peer(sk); 3456 if (other && unix_peer(other) != sk && 3457 unix_recvq_full_lockless(other) && 3458 unix_dgram_peer_wake_me(sk, other)) 3459 writable = 0; 3460 3461 unix_state_unlock(sk); 3462 } 3463 3464 if (writable) 3465 mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND; 3466 else 3467 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); 3468 3469 return mask; 3470 } 3471 3472 #ifdef CONFIG_PROC_FS 3473 3474 #define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1) 3475 3476 #define get_bucket(x) ((x) >> BUCKET_SPACE) 3477 #define get_offset(x) ((x) & ((1UL << BUCKET_SPACE) - 1)) 3478 #define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o)) 3479 3480 static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos) 3481 { 3482 unsigned long offset = get_offset(*pos); 3483 unsigned long bucket = get_bucket(*pos); 3484 unsigned long count = 0; 3485 struct sock *sk; 3486 3487 for (sk = sk_head(&seq_file_net(seq)->unx.table.buckets[bucket]); 3488 sk; sk = sk_next(sk)) { 3489 if (++count == offset) 3490 break; 3491 } 3492 3493 return sk; 3494 } 3495 3496 static struct sock *unix_get_first(struct seq_file *seq, loff_t *pos) 3497 { 3498 unsigned long bucket = get_bucket(*pos); 3499 struct net *net = seq_file_net(seq); 3500 struct sock *sk; 3501 3502 while (bucket < UNIX_HASH_SIZE) { 3503 spin_lock(&net->unx.table.locks[bucket]); 3504 3505 sk = unix_from_bucket(seq, pos); 3506 if (sk) 3507 return sk; 3508 3509 spin_unlock(&net->unx.table.locks[bucket]); 3510 3511 *pos = set_bucket_offset(++bucket, 1); 3512 } 3513 3514 return NULL; 3515 } 3516 3517 static struct sock *unix_get_next(struct seq_file *seq, struct sock *sk, 3518 loff_t *pos) 3519 { 3520 unsigned long bucket = get_bucket(*pos); 3521 3522 sk = sk_next(sk); 3523 if (sk) 3524 return sk; 3525 3526 3527 spin_unlock(&seq_file_net(seq)->unx.table.locks[bucket]); 3528 3529 *pos = set_bucket_offset(++bucket, 1); 3530 3531 return unix_get_first(seq, pos); 3532 } 3533 3534 static void *unix_seq_start(struct seq_file *seq, loff_t *pos) 3535 { 3536 if (!*pos) 3537 return SEQ_START_TOKEN; 3538 3539 return unix_get_first(seq, pos); 3540 } 3541 3542 static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos) 3543 { 3544 ++*pos; 3545 3546 if (v == SEQ_START_TOKEN) 3547 return unix_get_first(seq, pos); 3548 3549 return unix_get_next(seq, v, pos); 3550 } 3551 3552 static void unix_seq_stop(struct seq_file *seq, void *v) 3553 { 3554 struct sock *sk = v; 3555 3556 if (sk) 3557 spin_unlock(&seq_file_net(seq)->unx.table.locks[sk->sk_hash]); 3558 } 3559 3560 static int unix_seq_show(struct seq_file *seq, void *v) 3561 { 3562 3563 if (v == SEQ_START_TOKEN) 3564 seq_puts(seq, "Num RefCount Protocol Flags Type St " 3565 "Inode Path\n"); 3566 else { 3567 struct sock *s = v; 3568 struct unix_sock *u = unix_sk(s); 3569 unix_state_lock(s); 3570 3571 seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu", 3572 s, 3573 refcount_read(&s->sk_refcnt), 3574 0, 3575 s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0, 3576 s->sk_type, 3577 s->sk_socket ? 3578 (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) : 3579 (s->sk_state == TCP_ESTABLISHED ? 
SS_CONNECTING : SS_DISCONNECTING), 3580 sock_i_ino(s)); 3581 3582 if (u->addr) { // under a hash table lock here 3583 int i, len; 3584 seq_putc(seq, ' '); 3585 3586 i = 0; 3587 len = u->addr->len - 3588 offsetof(struct sockaddr_un, sun_path); 3589 if (u->addr->name->sun_path[0]) { 3590 len--; 3591 } else { 3592 seq_putc(seq, '@'); 3593 i++; 3594 } 3595 for ( ; i < len; i++) 3596 seq_putc(seq, u->addr->name->sun_path[i] ?: 3597 '@'); 3598 } 3599 unix_state_unlock(s); 3600 seq_putc(seq, '\n'); 3601 } 3602 3603 return 0; 3604 } 3605 3606 static const struct seq_operations unix_seq_ops = { 3607 .start = unix_seq_start, 3608 .next = unix_seq_next, 3609 .stop = unix_seq_stop, 3610 .show = unix_seq_show, 3611 }; 3612 3613 #ifdef CONFIG_BPF_SYSCALL 3614 struct bpf_unix_iter_state { 3615 struct seq_net_private p; 3616 unsigned int cur_sk; 3617 unsigned int end_sk; 3618 unsigned int max_sk; 3619 struct sock **batch; 3620 bool st_bucket_done; 3621 }; 3622 3623 struct bpf_iter__unix { 3624 __bpf_md_ptr(struct bpf_iter_meta *, meta); 3625 __bpf_md_ptr(struct unix_sock *, unix_sk); 3626 uid_t uid __aligned(8); 3627 }; 3628 3629 static int unix_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta, 3630 struct unix_sock *unix_sk, uid_t uid) 3631 { 3632 struct bpf_iter__unix ctx; 3633 3634 meta->seq_num--; /* skip SEQ_START_TOKEN */ 3635 ctx.meta = meta; 3636 ctx.unix_sk = unix_sk; 3637 ctx.uid = uid; 3638 return bpf_iter_run_prog(prog, &ctx); 3639 } 3640 3641 static int bpf_iter_unix_hold_batch(struct seq_file *seq, struct sock *start_sk) 3642 3643 { 3644 struct bpf_unix_iter_state *iter = seq->private; 3645 unsigned int expected = 1; 3646 struct sock *sk; 3647 3648 sock_hold(start_sk); 3649 iter->batch[iter->end_sk++] = start_sk; 3650 3651 for (sk = sk_next(start_sk); sk; sk = sk_next(sk)) { 3652 if (iter->end_sk < iter->max_sk) { 3653 sock_hold(sk); 3654 iter->batch[iter->end_sk++] = sk; 3655 } 3656 3657 expected++; 3658 } 3659 3660 spin_unlock(&seq_file_net(seq)->unx.table.locks[start_sk->sk_hash]); 3661 3662 return expected; 3663 } 3664 3665 static void bpf_iter_unix_put_batch(struct bpf_unix_iter_state *iter) 3666 { 3667 while (iter->cur_sk < iter->end_sk) 3668 sock_put(iter->batch[iter->cur_sk++]); 3669 } 3670 3671 static int bpf_iter_unix_realloc_batch(struct bpf_unix_iter_state *iter, 3672 unsigned int new_batch_sz) 3673 { 3674 struct sock **new_batch; 3675 3676 new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz, 3677 GFP_USER | __GFP_NOWARN); 3678 if (!new_batch) 3679 return -ENOMEM; 3680 3681 bpf_iter_unix_put_batch(iter); 3682 kvfree(iter->batch); 3683 iter->batch = new_batch; 3684 iter->max_sk = new_batch_sz; 3685 3686 return 0; 3687 } 3688 3689 static struct sock *bpf_iter_unix_batch(struct seq_file *seq, 3690 loff_t *pos) 3691 { 3692 struct bpf_unix_iter_state *iter = seq->private; 3693 unsigned int expected; 3694 bool resized = false; 3695 struct sock *sk; 3696 3697 if (iter->st_bucket_done) 3698 *pos = set_bucket_offset(get_bucket(*pos) + 1, 1); 3699 3700 again: 3701 /* Get a new batch */ 3702 iter->cur_sk = 0; 3703 iter->end_sk = 0; 3704 3705 sk = unix_get_first(seq, pos); 3706 if (!sk) 3707 return NULL; /* Done */ 3708 3709 expected = bpf_iter_unix_hold_batch(seq, sk); 3710 3711 if (iter->end_sk == expected) { 3712 iter->st_bucket_done = true; 3713 return sk; 3714 } 3715 3716 if (!resized && !bpf_iter_unix_realloc_batch(iter, expected * 3 / 2)) { 3717 resized = true; 3718 goto again; 3719 } 3720 3721 return sk; 3722 } 3723 3724 static void *bpf_iter_unix_seq_start(struct 
seq_file *seq, loff_t *pos) 3725 { 3726 if (!*pos) 3727 return SEQ_START_TOKEN; 3728 3729 /* bpf iter does not support lseek, so it always 3730 * continue from where it was stop()-ped. 3731 */ 3732 return bpf_iter_unix_batch(seq, pos); 3733 } 3734 3735 static void *bpf_iter_unix_seq_next(struct seq_file *seq, void *v, loff_t *pos) 3736 { 3737 struct bpf_unix_iter_state *iter = seq->private; 3738 struct sock *sk; 3739 3740 /* Whenever seq_next() is called, the iter->cur_sk is 3741 * done with seq_show(), so advance to the next sk in 3742 * the batch. 3743 */ 3744 if (iter->cur_sk < iter->end_sk) 3745 sock_put(iter->batch[iter->cur_sk++]); 3746 3747 ++*pos; 3748 3749 if (iter->cur_sk < iter->end_sk) 3750 sk = iter->batch[iter->cur_sk]; 3751 else 3752 sk = bpf_iter_unix_batch(seq, pos); 3753 3754 return sk; 3755 } 3756 3757 static int bpf_iter_unix_seq_show(struct seq_file *seq, void *v) 3758 { 3759 struct bpf_iter_meta meta; 3760 struct bpf_prog *prog; 3761 struct sock *sk = v; 3762 uid_t uid; 3763 bool slow; 3764 int ret; 3765 3766 if (v == SEQ_START_TOKEN) 3767 return 0; 3768 3769 slow = lock_sock_fast(sk); 3770 3771 if (unlikely(sk_unhashed(sk))) { 3772 ret = SEQ_SKIP; 3773 goto unlock; 3774 } 3775 3776 uid = from_kuid_munged(seq_user_ns(seq), sk_uid(sk)); 3777 meta.seq = seq; 3778 prog = bpf_iter_get_info(&meta, false); 3779 ret = unix_prog_seq_show(prog, &meta, v, uid); 3780 unlock: 3781 unlock_sock_fast(sk, slow); 3782 return ret; 3783 } 3784 3785 static void bpf_iter_unix_seq_stop(struct seq_file *seq, void *v) 3786 { 3787 struct bpf_unix_iter_state *iter = seq->private; 3788 struct bpf_iter_meta meta; 3789 struct bpf_prog *prog; 3790 3791 if (!v) { 3792 meta.seq = seq; 3793 prog = bpf_iter_get_info(&meta, true); 3794 if (prog) 3795 (void)unix_prog_seq_show(prog, &meta, v, 0); 3796 } 3797 3798 if (iter->cur_sk < iter->end_sk) 3799 bpf_iter_unix_put_batch(iter); 3800 } 3801 3802 static const struct seq_operations bpf_iter_unix_seq_ops = { 3803 .start = bpf_iter_unix_seq_start, 3804 .next = bpf_iter_unix_seq_next, 3805 .stop = bpf_iter_unix_seq_stop, 3806 .show = bpf_iter_unix_seq_show, 3807 }; 3808 #endif 3809 #endif 3810 3811 static const struct net_proto_family unix_family_ops = { 3812 .family = PF_UNIX, 3813 .create = unix_create, 3814 .owner = THIS_MODULE, 3815 }; 3816 3817 3818 static int __net_init unix_net_init(struct net *net) 3819 { 3820 int i; 3821 3822 net->unx.sysctl_max_dgram_qlen = 10; 3823 if (unix_sysctl_register(net)) 3824 goto out; 3825 3826 #ifdef CONFIG_PROC_FS 3827 if (!proc_create_net("unix", 0, net->proc_net, &unix_seq_ops, 3828 sizeof(struct seq_net_private))) 3829 goto err_sysctl; 3830 #endif 3831 3832 net->unx.table.locks = kvmalloc_array(UNIX_HASH_SIZE, 3833 sizeof(spinlock_t), GFP_KERNEL); 3834 if (!net->unx.table.locks) 3835 goto err_proc; 3836 3837 net->unx.table.buckets = kvmalloc_array(UNIX_HASH_SIZE, 3838 sizeof(struct hlist_head), 3839 GFP_KERNEL); 3840 if (!net->unx.table.buckets) 3841 goto free_locks; 3842 3843 for (i = 0; i < UNIX_HASH_SIZE; i++) { 3844 spin_lock_init(&net->unx.table.locks[i]); 3845 lock_set_cmp_fn(&net->unx.table.locks[i], unix_table_lock_cmp_fn, NULL); 3846 INIT_HLIST_HEAD(&net->unx.table.buckets[i]); 3847 } 3848 3849 return 0; 3850 3851 free_locks: 3852 kvfree(net->unx.table.locks); 3853 err_proc: 3854 #ifdef CONFIG_PROC_FS 3855 remove_proc_entry("unix", net->proc_net); 3856 err_sysctl: 3857 #endif 3858 unix_sysctl_unregister(net); 3859 out: 3860 return -ENOMEM; 3861 } 3862 3863 static void __net_exit unix_net_exit(struct net 
*net) 3864 { 3865 kvfree(net->unx.table.buckets); 3866 kvfree(net->unx.table.locks); 3867 unix_sysctl_unregister(net); 3868 remove_proc_entry("unix", net->proc_net); 3869 } 3870 3871 static struct pernet_operations unix_net_ops = { 3872 .init = unix_net_init, 3873 .exit = unix_net_exit, 3874 }; 3875 3876 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) 3877 DEFINE_BPF_ITER_FUNC(unix, struct bpf_iter_meta *meta, 3878 struct unix_sock *unix_sk, uid_t uid) 3879 3880 #define INIT_BATCH_SZ 16 3881 3882 static int bpf_iter_init_unix(void *priv_data, struct bpf_iter_aux_info *aux) 3883 { 3884 struct bpf_unix_iter_state *iter = priv_data; 3885 int err; 3886 3887 err = bpf_iter_init_seq_net(priv_data, aux); 3888 if (err) 3889 return err; 3890 3891 err = bpf_iter_unix_realloc_batch(iter, INIT_BATCH_SZ); 3892 if (err) { 3893 bpf_iter_fini_seq_net(priv_data); 3894 return err; 3895 } 3896 3897 return 0; 3898 } 3899 3900 static void bpf_iter_fini_unix(void *priv_data) 3901 { 3902 struct bpf_unix_iter_state *iter = priv_data; 3903 3904 bpf_iter_fini_seq_net(priv_data); 3905 kvfree(iter->batch); 3906 } 3907 3908 static const struct bpf_iter_seq_info unix_seq_info = { 3909 .seq_ops = &bpf_iter_unix_seq_ops, 3910 .init_seq_private = bpf_iter_init_unix, 3911 .fini_seq_private = bpf_iter_fini_unix, 3912 .seq_priv_size = sizeof(struct bpf_unix_iter_state), 3913 }; 3914 3915 static const struct bpf_func_proto * 3916 bpf_iter_unix_get_func_proto(enum bpf_func_id func_id, 3917 const struct bpf_prog *prog) 3918 { 3919 switch (func_id) { 3920 case BPF_FUNC_setsockopt: 3921 return &bpf_sk_setsockopt_proto; 3922 case BPF_FUNC_getsockopt: 3923 return &bpf_sk_getsockopt_proto; 3924 default: 3925 return NULL; 3926 } 3927 } 3928 3929 static struct bpf_iter_reg unix_reg_info = { 3930 .target = "unix", 3931 .ctx_arg_info_size = 1, 3932 .ctx_arg_info = { 3933 { offsetof(struct bpf_iter__unix, unix_sk), 3934 PTR_TO_BTF_ID_OR_NULL }, 3935 }, 3936 .get_func_proto = bpf_iter_unix_get_func_proto, 3937 .seq_info = &unix_seq_info, 3938 }; 3939 3940 static void __init bpf_iter_register(void) 3941 { 3942 unix_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_UNIX]; 3943 if (bpf_iter_reg_target(&unix_reg_info)) 3944 pr_warn("Warning: could not register bpf iterator unix\n"); 3945 } 3946 #endif 3947 3948 static int __init af_unix_init(void) 3949 { 3950 int i, rc = -1; 3951 3952 BUILD_BUG_ON(sizeof(struct unix_skb_parms) > sizeof_field(struct sk_buff, cb)); 3953 3954 for (i = 0; i < UNIX_HASH_SIZE / 2; i++) { 3955 spin_lock_init(&bsd_socket_locks[i]); 3956 INIT_HLIST_HEAD(&bsd_socket_buckets[i]); 3957 } 3958 3959 rc = proto_register(&unix_dgram_proto, 1); 3960 if (rc != 0) { 3961 pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__); 3962 goto out; 3963 } 3964 3965 rc = proto_register(&unix_stream_proto, 1); 3966 if (rc != 0) { 3967 pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__); 3968 proto_unregister(&unix_dgram_proto); 3969 goto out; 3970 } 3971 3972 sock_register(&unix_family_ops); 3973 register_pernet_subsys(&unix_net_ops); 3974 unix_bpf_build_proto(); 3975 3976 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) 3977 bpf_iter_register(); 3978 #endif 3979 3980 out: 3981 return rc; 3982 } 3983 3984 /* Later than subsys_initcall() because we depend on stuff initialised there */ 3985 fs_initcall(af_unix_init); 3986
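
/*
 * The unix_attach_fds()/unix_detach_fds()/unix_peek_fds() helpers above are
 * the kernel half of SCM_RIGHTS descriptor passing; too_many_unix_fds() is
 * why a sender with more than RLIMIT_NOFILE descriptors in flight sees
 * -ETOOMANYREFS.  A minimal userspace sketch of the matching
 * sendmsg()/recvmsg() calls follows; the helper names are made up for the
 * example and the block is not built as part of this file.
 */
#if 0	/* userspace example, not part of the kernel build */
#include <string.h>
#include <sys/socket.h>

/* Pass one descriptor over a connected AF_UNIX socket.  At least one byte
 * of ordinary data must accompany the control message. */
static int send_fd(int sock, int fd_to_pass)
{
	char dummy = 'x';
	struct iovec iov = { .iov_base = &dummy, .iov_len = 1 };
	union {
		struct cmsghdr align;
		char buf[CMSG_SPACE(sizeof(int))];
	} u;
	struct msghdr msg = {
		.msg_iov = &iov,
		.msg_iovlen = 1,
		.msg_control = u.buf,
		.msg_controllen = sizeof(u.buf),
	};
	struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);

	cmsg->cmsg_level = SOL_SOCKET;
	cmsg->cmsg_type = SCM_RIGHTS;
	cmsg->cmsg_len = CMSG_LEN(sizeof(int));
	memcpy(CMSG_DATA(cmsg), &fd_to_pass, sizeof(int));

	return sendmsg(sock, &msg, 0) == 1 ? 0 : -1;
}

/* Receive one descriptor; returns the new fd or -1. */
static int recv_fd(int sock)
{
	char dummy;
	struct iovec iov = { .iov_base = &dummy, .iov_len = 1 };
	union {
		struct cmsghdr align;
		char buf[CMSG_SPACE(sizeof(int))];
	} u;
	struct msghdr msg = {
		.msg_iov = &iov,
		.msg_iovlen = 1,
		.msg_control = u.buf,
		.msg_controllen = sizeof(u.buf),
	};
	struct cmsghdr *cmsg;
	int fd = -1;

	if (recvmsg(sock, &msg, 0) <= 0)
		return -1;

	for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg))
		if (cmsg->cmsg_level == SOL_SOCKET &&
		    cmsg->cmsg_type == SCM_RIGHTS)
			memcpy(&fd, CMSG_DATA(cmsg), sizeof(int));

	return fd;
}
#endif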
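
/*
 * unix_maybe_add_creds() attaches the sender's pid/uid/gid when either end
 * has asserted SO_PASSCRED, and scm_recv_unix() delivers them to a receiver
 * that asked for them.  A minimal sketch of the receiving side on a
 * connected AF_UNIX socket; print_peer_creds() is an illustrative name, and
 * struct ucred needs _GNU_SOURCE.
 */
#if 0	/* userspace example, not part of the kernel build */
#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>

static int print_peer_creds(int sock)
{
	int on = 1;
	char data[128];
	struct iovec iov = { .iov_base = data, .iov_len = sizeof(data) };
	union {
		struct cmsghdr align;
		char buf[CMSG_SPACE(sizeof(struct ucred))];
	} u;
	struct msghdr msg = {
		.msg_iov = &iov,
		.msg_iovlen = 1,
		.msg_control = u.buf,
		.msg_controllen = sizeof(u.buf),
	};
	struct cmsghdr *cmsg;

	/* Ask the kernel to attach and deliver SCM_CREDENTIALS. */
	if (setsockopt(sock, SOL_SOCKET, SO_PASSCRED, &on, sizeof(on)) < 0)
		return -1;

	if (recvmsg(sock, &msg, 0) < 0)
		return -1;

	for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg)) {
		if (cmsg->cmsg_level == SOL_SOCKET &&
		    cmsg->cmsg_type == SCM_CREDENTIALS) {
			struct ucred cred;

			memcpy(&cred, CMSG_DATA(cmsg), sizeof(cred));
			printf("pid=%ld uid=%u gid=%u\n",
			       (long)cred.pid, (unsigned)cred.uid,
			       (unsigned)cred.gid);
		}
	}

	return 0;
}
#endif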
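
/*
 * unix_seqpacket_sendmsg()/unix_seqpacket_recvmsg() reuse the datagram paths
 * but insist on an established connection, so SOCK_SEQPACKET behaves as
 * connected, reliable datagrams with preserved record boundaries.  A small
 * sketch using socketpair(); the strings and sizes are arbitrary.
 */
#if 0	/* userspace example, not part of the kernel build */
#include <stdio.h>
#include <sys/socket.h>

static int seqpacket_boundaries(void)
{
	char buf[64];
	ssize_t n;
	int sv[2];

	if (socketpair(AF_UNIX, SOCK_SEQPACKET, 0, sv) < 0)
		return -1;

	send(sv[0], "first record", 12, 0);
	send(sv[0], "second", 6, 0);

	/* Returns 12 bytes, not 18: each read stops at a record boundary. */
	n = recv(sv[1], buf, sizeof(buf), 0);
	printf("first read: %zd bytes\n", n);

	return 0;
}
#endif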
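
/*
 * With CONFIG_AF_UNIX_OOB, queue_oob() tracks a single out-of-band byte per
 * stream via u->oob_skb, unix_stream_recv_urg() hands it back for
 * recv(..., MSG_OOB), SIOCATMARK reports whether the normal read position
 * has reached it, and unix_poll() raises EPOLLPRI while it is pending.
 * A sketch of both sides of a connected SOCK_STREAM socket; the helper
 * names are illustrative.
 */
#if 0	/* userspace example, not part of the kernel build */
#include <linux/sockios.h>
#include <sys/ioctl.h>
#include <sys/socket.h>

/* Sender: ordinary data followed by one out-of-band byte. */
static void send_with_oob(int sock)
{
	send(sock, "inline data", 11, 0);
	send(sock, "!", 1, MSG_OOB);	/* becomes oob_skb on the peer */
}

/* Receiver: drain inline data up to the mark, then fetch the OOB byte. */
static int recv_until_mark(int sock)
{
	char buf[64];
	int at_mark = 0;

	while (!at_mark) {
		if (ioctl(sock, SIOCATMARK, &at_mark) < 0)
			return -1;
		if (at_mark)
			break;
		if (recv(sock, buf, sizeof(buf), 0) <= 0)
			return -1;
	}

	/* Fails with EINVAL if SO_OOBINLINE is set or no OOB byte is queued. */
	return recv(sock, buf, 1, MSG_OOB) == 1 ? buf[0] : -1;
}
#endif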
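
/*
 * unix_stream_splice_read() backs splice() from a unix stream socket: the
 * socket-side offset must be NULL (a nonzero *ppos gets -ESPIPE) and
 * SPLICE_F_NONBLOCK behaves like MSG_DONTWAIT.  A one-call sketch moving
 * queued socket data into a pipe without a userspace copy; the length is
 * arbitrary.
 */
#if 0	/* userspace example, not part of the kernel build */
#define _GNU_SOURCE
#include <fcntl.h>
#include <sys/types.h>

static ssize_t socket_to_pipe(int sock, int pipe_wr)
{
	/* NULL offsets are mandatory for the socket side. */
	return splice(sock, NULL, pipe_wr, NULL, 4096, SPLICE_F_NONBLOCK);
}
#endif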
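
/*
 * unix_ioctl() wires up SIOCINQ/SIOCOUTQ (unix_inq_len()/unix_outq_len())
 * and SIOCUNIXFILE (unix_open_file()).  On SOCK_STREAM, SIOCINQ reports the
 * total bytes queued for reading; on SOCK_DGRAM, the size of the next
 * datagram.  SIOCUNIXFILE returns an O_PATH descriptor for the bound
 * filesystem node and needs CAP_NET_ADMIN.  Sketch below; the helper names
 * are illustrative.
 */
#if 0	/* userspace example, not part of the kernel build */
#include <linux/sockios.h>	/* SIOCINQ, SIOCOUTQ, SIOCUNIXFILE */
#include <sys/ioctl.h>

/* Unread bytes on @sock and bytes we sent that the peer has not consumed. */
static int unix_queue_sizes(int sock, int *unread, int *unsent)
{
	if (ioctl(sock, SIOCINQ, unread) < 0)
		return -1;
	return ioctl(sock, SIOCOUTQ, unsent);
}

/* O_PATH fd for the socket's bound path; fails with ENOENT for unbound or
 * abstract sockets and with EPERM without CAP_NET_ADMIN. */
static int unix_bound_path_fd(int sock)
{
	return ioctl(sock, SIOCUNIXFILE, 0);
}
#endif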
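
/*
 * unix_getname() reports only the address family, i.e. a length of
 * offsetof(struct sockaddr_un, sun_path), for an unbound socket, and
 * abstract names are length-delimited byte strings whose first byte is NUL.
 * A sketch of binding an abstract name and checking the reported length;
 * the helper names are illustrative.
 */
#if 0	/* userspace example, not part of the kernel build */
#include <stddef.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <sys/un.h>

/* Bind @sock to an abstract address (leading NUL, nothing on disk). */
static int bind_abstract(int sock, const char *name)
{
	struct sockaddr_un addr;
	socklen_t len;

	memset(&addr, 0, sizeof(addr));
	addr.sun_family = AF_UNIX;
	addr.sun_path[0] = '\0';		/* abstract namespace marker */
	strncpy(addr.sun_path + 1, name, sizeof(addr.sun_path) - 2);
	len = offsetof(struct sockaddr_un, sun_path) + 1 + strlen(name);

	return bind(sock, (struct sockaddr *)&addr, len);
}

/* An unbound socket reports a name consisting of just sun_family. */
static void show_name_len(int sock)
{
	struct sockaddr_un addr;
	socklen_t len = sizeof(addr);

	if (getsockname(sock, (struct sockaddr *)&addr, &len) == 0)
		printf("name length %u (unbound would be %zu)\n",
		       (unsigned)len, offsetof(struct sockaddr_un, sun_path));
}
#endif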