// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * NET4:	Implementation of BSD Unix domain sockets.
 *
 * Authors:	Alan Cox, <alan@lxorguk.ukuu.org.uk>
 *
 * Fixes:
 *		Linus Torvalds	:	Assorted bug cures.
 *		Niibe Yutaka	:	async I/O support.
 *		Carsten Paeth	:	PF_UNIX check, address fixes.
 *		Alan Cox	:	Limit size of allocated blocks.
 *		Alan Cox	:	Fixed the stupid socketpair bug.
 *		Alan Cox	:	BSD compatibility fine tuning.
 *		Alan Cox	:	Fixed a bug in connect when interrupted.
 *		Alan Cox	:	Sorted out a proper draft version of
 *					file descriptor passing hacked up from
 *					Mike Shaver's work.
 *		Marty Leisner	:	Fixes to fd passing
 *		Nick Nevin	:	recvmsg bugfix.
 *		Alan Cox	:	Started proper garbage collector
 *		Heiko Eißfeldt	:	Missing verify_area check
 *		Alan Cox	:	Started POSIXisms
 *		Andreas Schwab	:	Replace inode by dentry for proper
 *					reference counting
 *		Kirk Petersen	:	Made this a module
 *	    Christoph Rohland	:	Elegant non-blocking accept/connect algorithm.
 *					Lots of bug fixes.
 *	     Alexey Kuznetsov	:	Repaired (I hope) bugs introduced
 *					by the above two patches.
 *	     Andrea Arcangeli	:	If possible we block in connect(2)
 *					if the max backlog of the listen socket
 *					has been reached. This won't break
 *					old apps and it will avoid huge amounts
 *					of socks hashed (this for unix_gc()
 *					performance reasons).
 *					Security fix that limits the max
 *					number of socks to 2*max_files and
 *					the number of skbs queueable in the
 *					dgram receiver.
 *		Artur Skawina	:	Hash function optimizations
 *	     Alexey Kuznetsov	:	Full scale SMP. Lot of bugs are introduced 8)
 *	      Malcolm Beattie	:	Set peercred for socketpair
 *	     Michal Ostrowski	:	Module initialization cleanup.
 *	     Arnaldo C. Melo	:	Remove MOD_{INC,DEC}_USE_COUNT,
 *					the core infrastructure is doing that
 *					for all net proto families now (2.5.69+)
 *
 * Known differences from reference BSD that was tested:
 *
 *	[TO FIX]
 *	ECONNREFUSED is not returned from one end of a connected() socket to the
 *		other the moment one end closes.
 *	fstat() doesn't return st_dev=0, and gives the blksize as high water mark
 *		and a fake inode identifier (nor the BSD first socket fstat twice bug).
 *	[NOT TO FIX]
 *	accept() returns a path name even if the connecting socket has closed
 *		in the meantime (BSD loses the path and gives up).
 *	accept() returns 0 length path for an unbound connector. BSD returns 16
 *		and a null first byte in the path (but not for gethost/peername - BSD bug ??)
 *	socketpair(...SOCK_RAW..) doesn't panic the kernel.
 *	BSD af_unix apparently has connect forgetting to block properly.
 *		(need to check this with the POSIX spec in detail)
 *
 * Differences from 2.0.0-11-... (ANK)
 *	Bug fixes and improvements.
 *		- client shutdown killed server socket.
 *		- removed all useless cli/sti pairs.
 *
 *	Semantic changes/extensions.
 *		- generic control message passing.
 *		- SCM_CREDENTIALS control message.
 *		- "Abstract" (not FS based) socket bindings.
 *		  Abstract names are sequences of bytes (not zero terminated)
 *		  started by 0, so that this name space does not intersect
 *		  with BSD names.
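 *
 *		  (Illustrative sketch, not part of this file: a userspace
 *		  abstract bind would look roughly like
 *
 *			struct sockaddr_un a = { .sun_family = AF_UNIX };
 *			memcpy(a.sun_path, "\0example", 8);
 *			bind(fd, (struct sockaddr *)&a,
 *			     offsetof(struct sockaddr_un, sun_path) + 8);
 *
 *		  so the kernel sees an 8-byte name beginning with a NUL byte.)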
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/bpf-cgroup.h>
#include <linux/btf_ids.h>
#include <linux/dcache.h>
#include <linux/errno.h>
#include <linux/fcntl.h>
#include <linux/file.h>
#include <linux/filter.h>
#include <linux/fs.h>
#include <linux/fs_struct.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/mount.h>
#include <linux/namei.h>
#include <linux/net.h>
#include <linux/pidfs.h>
#include <linux/poll.h>
#include <linux/proc_fs.h>
#include <linux/sched/signal.h>
#include <linux/security.h>
#include <linux/seq_file.h>
#include <linux/skbuff.h>
#include <linux/slab.h>
#include <linux/socket.h>
#include <linux/splice.h>
#include <linux/string.h>
#include <linux/uaccess.h>
#include <net/af_unix.h>
#include <net/net_namespace.h>
#include <net/scm.h>
#include <net/tcp_states.h>
#include <uapi/linux/sockios.h>
#include <uapi/linux/termios.h>

#include "af_unix.h"

static atomic_long_t unix_nr_socks;
static struct hlist_head bsd_socket_buckets[UNIX_HASH_SIZE / 2];
static spinlock_t bsd_socket_locks[UNIX_HASH_SIZE / 2];

/* SMP locking strategy:
 *    hash table is protected with spinlock.
 *    each socket state is protected by separate spinlock.
 */
#ifdef CONFIG_PROVE_LOCKING
#define cmp_ptr(l, r)	(((l) > (r)) - ((l) < (r)))

static int unix_table_lock_cmp_fn(const struct lockdep_map *a,
				  const struct lockdep_map *b)
{
	return cmp_ptr(a, b);
}

static int unix_state_lock_cmp_fn(const struct lockdep_map *_a,
				  const struct lockdep_map *_b)
{
	const struct unix_sock *a, *b;

	a = container_of(_a, struct unix_sock, lock.dep_map);
	b = container_of(_b, struct unix_sock, lock.dep_map);

	if (a->sk.sk_state == TCP_LISTEN) {
		/* unix_stream_connect(): Before the 2nd unix_state_lock(),
		 *
		 *   1. a is TCP_LISTEN.
		 *   2. b is not a.
		 *   3. concurrent connect(b -> a) must fail.
		 *
		 * Except for 2. & 3., the b's state can be any possible
		 * value due to concurrent connect() or listen().
		 *
		 * 2. is detected in debug_spin_lock_before(), and 3. cannot
		 * be expressed as lock_cmp_fn.
		 */
		switch (b->sk.sk_state) {
		case TCP_CLOSE:
		case TCP_ESTABLISHED:
		case TCP_LISTEN:
			return -1;
		default:
			/* Invalid case. */
			return 0;
		}
	}

	/* Should never happen.  Just to be symmetric. */
	if (b->sk.sk_state == TCP_LISTEN) {
		switch (b->sk.sk_state) {
		case TCP_CLOSE:
		case TCP_ESTABLISHED:
			return 1;
		default:
			return 0;
		}
	}

	/* unix_state_double_lock(): ascending address order. */
	return cmp_ptr(a, b);
}

static int unix_recvq_lock_cmp_fn(const struct lockdep_map *_a,
				  const struct lockdep_map *_b)
{
	const struct sock *a, *b;

	a = container_of(_a, struct sock, sk_receive_queue.lock.dep_map);
	b = container_of(_b, struct sock, sk_receive_queue.lock.dep_map);

	/* unix_collect_skb(): listener -> embryo order. */
	if (a->sk_state == TCP_LISTEN && unix_sk(b)->listener == a)
		return -1;

	/* Should never happen.  Just to be symmetric. */
	if (b->sk_state == TCP_LISTEN && unix_sk(a)->listener == b)
		return 1;

	return 0;
}
#endif

static unsigned int unix_unbound_hash(struct sock *sk)
{
	unsigned long hash = (unsigned long)sk;

	hash ^= hash >> 16;
	hash ^= hash >> 8;
	hash ^= sk->sk_type;

	return hash & UNIX_HASH_MOD;
}

static unsigned int unix_bsd_hash(struct inode *i)
{
	return i->i_ino & UNIX_HASH_MOD;
}

static unsigned int unix_abstract_hash(struct sockaddr_un *sunaddr,
				       int addr_len, int type)
{
	__wsum csum = csum_partial(sunaddr, addr_len, 0);
	unsigned int hash;

	hash = (__force unsigned int)csum_fold(csum);
	hash ^= hash >> 8;
	hash ^= type;

	return UNIX_HASH_MOD + 1 + (hash & UNIX_HASH_MOD);
}

static void unix_table_double_lock(struct net *net,
				   unsigned int hash1, unsigned int hash2)
{
	if (hash1 == hash2) {
		spin_lock(&net->unx.table.locks[hash1]);
		return;
	}

	if (hash1 > hash2)
		swap(hash1, hash2);

	spin_lock(&net->unx.table.locks[hash1]);
	spin_lock(&net->unx.table.locks[hash2]);
}

static void unix_table_double_unlock(struct net *net,
				     unsigned int hash1, unsigned int hash2)
{
	if (hash1 == hash2) {
		spin_unlock(&net->unx.table.locks[hash1]);
		return;
	}

	spin_unlock(&net->unx.table.locks[hash1]);
	spin_unlock(&net->unx.table.locks[hash2]);
}

#ifdef CONFIG_SECURITY_NETWORK
static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{
	UNIXCB(skb).secid = scm->secid;
}

static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{
	scm->secid = UNIXCB(skb).secid;
}

static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
{
	return (scm->secid == UNIXCB(skb).secid);
}
#else
static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{ }

static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{ }

static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
{
	return true;
}
#endif /* CONFIG_SECURITY_NETWORK */

static inline int unix_may_send(struct sock *sk, struct sock *osk)
{
	return !unix_peer(osk) || unix_peer(osk) == sk;
}

static inline int unix_recvq_full_lockless(const struct sock *sk)
{
	return skb_queue_len_lockless(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
}

struct sock *unix_peer_get(struct sock *s)
{
	struct sock *peer;

	unix_state_lock(s);
	peer = unix_peer(s);
	if (peer)
		sock_hold(peer);
	unix_state_unlock(s);
	return peer;
}
EXPORT_SYMBOL_GPL(unix_peer_get);

static struct unix_address *unix_create_addr(struct sockaddr_un *sunaddr,
					     int addr_len)
{
	struct unix_address *addr;

	addr = kmalloc(sizeof(*addr) + addr_len, GFP_KERNEL);
	if (!addr)
		return NULL;

	refcount_set(&addr->refcnt, 1);
	addr->len = addr_len;
	memcpy(addr->name, sunaddr, addr_len);

	return addr;
}

static inline void unix_release_addr(struct unix_address *addr)
{
	if (refcount_dec_and_test(&addr->refcnt))
		kfree(addr);
}

/*
 *	Check unix socket name:
 *		- should not be zero length.
 *		- if it starts with a non-zero byte, it should be NULL terminated (FS object)
 *		- if it starts with zero, it is an abstract name.
 */

static int unix_validate_addr(struct sockaddr_un *sunaddr, int addr_len)
{
	if (addr_len <= offsetof(struct sockaddr_un, sun_path) ||
	    addr_len > sizeof(*sunaddr))
		return -EINVAL;

	if (sunaddr->sun_family != AF_UNIX)
		return -EINVAL;

	return 0;
}

static int unix_mkname_bsd(struct sockaddr_un *sunaddr, int addr_len)
{
	struct sockaddr_storage *addr = (struct sockaddr_storage *)sunaddr;
	short offset = offsetof(struct sockaddr_storage, __data);

	BUILD_BUG_ON(offset != offsetof(struct sockaddr_un, sun_path));

	/* This may look like an off by one error but it is a bit more
	 * subtle.  108 is the longest valid AF_UNIX path for a binding.
	 * sun_path[108] doesn't as such exist.  However in kernel space
	 * we are guaranteed that it is a valid memory location in our
	 * kernel address buffer because syscall functions always pass
	 * a pointer of struct sockaddr_storage which has a bigger buffer
	 * than 108.  Also, we must terminate sun_path for strlen() in
	 * getname_kernel().
	 */
	addr->__data[addr_len - offset] = 0;

	/* Don't pass sunaddr->sun_path to strlen().  Otherwise, 108 will
	 * cause panic if CONFIG_FORTIFY_SOURCE=y.  Let __fortify_strlen()
	 * know the actual buffer.
	 */
	return strlen(addr->__data) + offset + 1;
}

static void __unix_remove_socket(struct sock *sk)
{
	sk_del_node_init(sk);
}

static void __unix_insert_socket(struct net *net, struct sock *sk)
{
	DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
	sk_add_node(sk, &net->unx.table.buckets[sk->sk_hash]);
}

static void __unix_set_addr_hash(struct net *net, struct sock *sk,
				 struct unix_address *addr, unsigned int hash)
{
	__unix_remove_socket(sk);
	smp_store_release(&unix_sk(sk)->addr, addr);

	sk->sk_hash = hash;
	__unix_insert_socket(net, sk);
}

static void unix_remove_socket(struct net *net, struct sock *sk)
{
	spin_lock(&net->unx.table.locks[sk->sk_hash]);
	__unix_remove_socket(sk);
	spin_unlock(&net->unx.table.locks[sk->sk_hash]);
}

static void unix_insert_unbound_socket(struct net *net, struct sock *sk)
{
	spin_lock(&net->unx.table.locks[sk->sk_hash]);
	__unix_insert_socket(net, sk);
	spin_unlock(&net->unx.table.locks[sk->sk_hash]);
}

static void unix_insert_bsd_socket(struct sock *sk)
{
	spin_lock(&bsd_socket_locks[sk->sk_hash]);
	sk_add_bind_node(sk, &bsd_socket_buckets[sk->sk_hash]);
	spin_unlock(&bsd_socket_locks[sk->sk_hash]);
}

static void unix_remove_bsd_socket(struct sock *sk)
{
	if (!hlist_unhashed(&sk->sk_bind_node)) {
		spin_lock(&bsd_socket_locks[sk->sk_hash]);
		__sk_del_bind_node(sk);
		spin_unlock(&bsd_socket_locks[sk->sk_hash]);

		sk_node_init(&sk->sk_bind_node);
	}
}

static struct sock *__unix_find_socket_byname(struct net *net,
					      struct sockaddr_un *sunname,
					      int len, unsigned int hash)
{
	struct sock *s;

	sk_for_each(s, &net->unx.table.buckets[hash]) {
		struct unix_sock *u = unix_sk(s);

		if (u->addr->len == len &&
		    !memcmp(u->addr->name, sunname, len))
			return s;
	}
	return NULL;
}

static inline struct sock *unix_find_socket_byname(struct net *net,
						   struct sockaddr_un *sunname,
						   int len, unsigned int hash)
{
	struct sock *s;

	spin_lock(&net->unx.table.locks[hash]);
	s = __unix_find_socket_byname(net, sunname, len, hash);
	if (s)
		sock_hold(s);
	spin_unlock(&net->unx.table.locks[hash]);
	return s;
}

static struct sock *unix_find_socket_byinode(struct inode *i)
{
	unsigned int hash = unix_bsd_hash(i);
	struct sock *s;

	spin_lock(&bsd_socket_locks[hash]);
	sk_for_each_bound(s, &bsd_socket_buckets[hash]) {
		struct dentry *dentry = unix_sk(s)->path.dentry;

		if (dentry && d_backing_inode(dentry) == i) {
			sock_hold(s);
			spin_unlock(&bsd_socket_locks[hash]);
			return s;
		}
	}
	spin_unlock(&bsd_socket_locks[hash]);
	return NULL;
}

/* Support code for asymmetrically connected dgram sockets
 *
 * If a datagram socket is connected to a socket not itself connected
 * to the first socket (eg, /dev/log), clients may only enqueue more
 * messages if the present receive queue of the server socket is not
 * "too large".  This means there's a second writeability condition
 * poll and sendmsg need to test.  The dgram recv code will do a wake
 * up on the peer_wait wait queue of a socket upon reception of a
 * datagram which needs to be propagated to sleeping would-be writers
 * since these might not have sent anything so far.  This can't be
 * accomplished via poll_wait because the lifetime of the server
 * socket might be less than that of its clients if these break their
 * association with it or if the server socket is closed while clients
 * are still connected to it and there's no way to inform "a polling
 * implementation" that it should let go of a certain wait queue.
 *
 * In order to propagate a wake up, a wait_queue_entry_t of the client
 * socket is enqueued on the peer_wait queue of the server socket
 * whose wake function does a wake_up on the ordinary client socket
 * wait queue.  This connection is established whenever a write (or
 * poll for write) hits the flow control condition and broken when the
 * association to the server socket is dissolved or after a wake up
 * was relayed.
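 *
 * (For orientation: the enqueue/dequeue of that wait_queue_entry_t is
 * done by unix_dgram_peer_wake_connect()/unix_dgram_peer_wake_disconnect()
 * below, the relaying wake function is unix_dgram_peer_wake_relay(), and
 * unix_dgram_peer_wake_me() is the helper used by sendmsg/poll to decide
 * whether the writer still needs to wait.)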
497 */ 498 499 static int unix_dgram_peer_wake_relay(wait_queue_entry_t *q, unsigned mode, int flags, 500 void *key) 501 { 502 struct unix_sock *u; 503 wait_queue_head_t *u_sleep; 504 505 u = container_of(q, struct unix_sock, peer_wake); 506 507 __remove_wait_queue(&unix_sk(u->peer_wake.private)->peer_wait, 508 q); 509 u->peer_wake.private = NULL; 510 511 /* relaying can only happen while the wq still exists */ 512 u_sleep = sk_sleep(&u->sk); 513 if (u_sleep) 514 wake_up_interruptible_poll(u_sleep, key_to_poll(key)); 515 516 return 0; 517 } 518 519 static int unix_dgram_peer_wake_connect(struct sock *sk, struct sock *other) 520 { 521 struct unix_sock *u, *u_other; 522 int rc; 523 524 u = unix_sk(sk); 525 u_other = unix_sk(other); 526 rc = 0; 527 spin_lock(&u_other->peer_wait.lock); 528 529 if (!u->peer_wake.private) { 530 u->peer_wake.private = other; 531 __add_wait_queue(&u_other->peer_wait, &u->peer_wake); 532 533 rc = 1; 534 } 535 536 spin_unlock(&u_other->peer_wait.lock); 537 return rc; 538 } 539 540 static void unix_dgram_peer_wake_disconnect(struct sock *sk, 541 struct sock *other) 542 { 543 struct unix_sock *u, *u_other; 544 545 u = unix_sk(sk); 546 u_other = unix_sk(other); 547 spin_lock(&u_other->peer_wait.lock); 548 549 if (u->peer_wake.private == other) { 550 __remove_wait_queue(&u_other->peer_wait, &u->peer_wake); 551 u->peer_wake.private = NULL; 552 } 553 554 spin_unlock(&u_other->peer_wait.lock); 555 } 556 557 static void unix_dgram_peer_wake_disconnect_wakeup(struct sock *sk, 558 struct sock *other) 559 { 560 unix_dgram_peer_wake_disconnect(sk, other); 561 wake_up_interruptible_poll(sk_sleep(sk), 562 EPOLLOUT | 563 EPOLLWRNORM | 564 EPOLLWRBAND); 565 } 566 567 /* preconditions: 568 * - unix_peer(sk) == other 569 * - association is stable 570 */ 571 static int unix_dgram_peer_wake_me(struct sock *sk, struct sock *other) 572 { 573 int connected; 574 575 connected = unix_dgram_peer_wake_connect(sk, other); 576 577 /* If other is SOCK_DEAD, we want to make sure we signal 578 * POLLOUT, such that a subsequent write() can get a 579 * -ECONNREFUSED. Otherwise, if we haven't queued any skbs 580 * to other and its full, we will hang waiting for POLLOUT. 581 */ 582 if (unix_recvq_full_lockless(other) && !sock_flag(other, SOCK_DEAD)) 583 return 1; 584 585 if (connected) 586 unix_dgram_peer_wake_disconnect(sk, other); 587 588 return 0; 589 } 590 591 static int unix_writable(const struct sock *sk, unsigned char state) 592 { 593 return state != TCP_LISTEN && 594 (refcount_read(&sk->sk_wmem_alloc) << 2) <= READ_ONCE(sk->sk_sndbuf); 595 } 596 597 static void unix_write_space(struct sock *sk) 598 { 599 struct socket_wq *wq; 600 601 rcu_read_lock(); 602 if (unix_writable(sk, READ_ONCE(sk->sk_state))) { 603 wq = rcu_dereference(sk->sk_wq); 604 if (skwq_has_sleeper(wq)) 605 wake_up_interruptible_sync_poll(&wq->wait, 606 EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND); 607 sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT); 608 } 609 rcu_read_unlock(); 610 } 611 612 /* When dgram socket disconnects (or changes its peer), we clear its receive 613 * queue of packets arrived from previous peer. First, it allows to do 614 * flow control based only on wmem_alloc; second, sk connected to peer 615 * may receive messages only from that peer. 
static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
{
	if (!skb_queue_empty(&sk->sk_receive_queue)) {
		skb_queue_purge_reason(&sk->sk_receive_queue,
				       SKB_DROP_REASON_UNIX_DISCONNECT);

		wake_up_interruptible_all(&unix_sk(sk)->peer_wait);

		/* If one link of a bidirectional dgram pipe is disconnected,
		 * we signal error.  Messages are lost.  Do not do this when
		 * the peer was not connected to us.
		 */
		if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
			WRITE_ONCE(other->sk_err, ECONNRESET);
			sk_error_report(other);
		}
	}
}

static void unix_sock_destructor(struct sock *sk)
{
	struct unix_sock *u = unix_sk(sk);

	skb_queue_purge_reason(&sk->sk_receive_queue, SKB_DROP_REASON_SOCKET_CLOSE);

	DEBUG_NET_WARN_ON_ONCE(refcount_read(&sk->sk_wmem_alloc));
	DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
	DEBUG_NET_WARN_ON_ONCE(sk->sk_socket);
	if (!sock_flag(sk, SOCK_DEAD)) {
		pr_info("Attempt to release alive unix socket: %p\n", sk);
		return;
	}

	if (u->addr)
		unix_release_addr(u->addr);

	atomic_long_dec(&unix_nr_socks);
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
#ifdef UNIX_REFCNT_DEBUG
	pr_debug("UNIX %p is destroyed, %ld are still alive.\n", sk,
		 atomic_long_read(&unix_nr_socks));
#endif
}

static unsigned int unix_skb_len(const struct sk_buff *skb)
{
	return skb->len - UNIXCB(skb).consumed;
}

static void unix_release_sock(struct sock *sk, int embrion)
{
	struct unix_sock *u = unix_sk(sk);
	struct sock *skpair;
	struct sk_buff *skb;
	struct path path;
	int state;

	unix_remove_socket(sock_net(sk), sk);
	unix_remove_bsd_socket(sk);

	/* Clear state */
	unix_state_lock(sk);
	sock_orphan(sk);
	WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK);
	path = u->path;
	u->path.dentry = NULL;
	u->path.mnt = NULL;
	state = sk->sk_state;
	WRITE_ONCE(sk->sk_state, TCP_CLOSE);

	skpair = unix_peer(sk);
	unix_peer(sk) = NULL;

	unix_state_unlock(sk);

#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
	u->oob_skb = NULL;
#endif

	wake_up_interruptible_all(&u->peer_wait);

	if (skpair != NULL) {
		if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
			struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);

#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
			if (skb && !unix_skb_len(skb))
				skb = skb_peek_next(skb, &sk->sk_receive_queue);
#endif
			unix_state_lock(skpair);
			/* No more writes */
			WRITE_ONCE(skpair->sk_shutdown, SHUTDOWN_MASK);
			if (skb || embrion)
				WRITE_ONCE(skpair->sk_err, ECONNRESET);
			unix_state_unlock(skpair);
			skpair->sk_state_change(skpair);
			sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
		}

		unix_dgram_peer_wake_disconnect(sk, skpair);
		sock_put(skpair); /* It may now die */
	}

	/* Try to flush out this socket.  Throw out buffers at least */
	while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
		if (state == TCP_LISTEN)
			unix_release_sock(skb->sk, 1);

		/* passed fds are erased in the kfree_skb hook */
		kfree_skb_reason(skb, SKB_DROP_REASON_SOCKET_CLOSE);
	}

	if (path.dentry)
		path_put(&path);

	sock_put(sk);

	/* ---- Socket is dead now and most probably destroyed ---- */

	unix_schedule_gc(NULL);
}

struct unix_peercred {
	struct pid *peer_pid;
	const struct cred *peer_cred;
};

static inline int prepare_peercred(struct unix_peercred *peercred)
{
	struct pid *pid;
	int err;

	pid = task_tgid(current);
	err = pidfs_register_pid(pid);
	if (likely(!err)) {
		peercred->peer_pid = get_pid(pid);
		peercred->peer_cred = get_current_cred();
	}
	return err;
}

static void drop_peercred(struct unix_peercred *peercred)
{
	const struct cred *cred = NULL;
	struct pid *pid = NULL;

	might_sleep();

	swap(peercred->peer_pid, pid);
	swap(peercred->peer_cred, cred);

	put_pid(pid);
	put_cred(cred);
}

static inline void init_peercred(struct sock *sk,
				 const struct unix_peercred *peercred)
{
	sk->sk_peer_pid = peercred->peer_pid;
	sk->sk_peer_cred = peercred->peer_cred;
}

static void update_peercred(struct sock *sk, struct unix_peercred *peercred)
{
	const struct cred *old_cred;
	struct pid *old_pid;

	spin_lock(&sk->sk_peer_lock);
	old_pid = sk->sk_peer_pid;
	old_cred = sk->sk_peer_cred;
	init_peercred(sk, peercred);
	spin_unlock(&sk->sk_peer_lock);

	peercred->peer_pid = old_pid;
	peercred->peer_cred = old_cred;
}

static void copy_peercred(struct sock *sk, struct sock *peersk)
{
	lockdep_assert_held(&unix_sk(peersk)->lock);

	spin_lock(&sk->sk_peer_lock);
	sk->sk_peer_pid = get_pid(peersk->sk_peer_pid);
	sk->sk_peer_cred = get_cred(peersk->sk_peer_cred);
	spin_unlock(&sk->sk_peer_lock);
}

static bool unix_may_passcred(const struct sock *sk)
{
	return sk->sk_scm_credentials || sk->sk_scm_pidfd;
}

static int unix_listen(struct socket *sock, int backlog)
{
	int err;
	struct sock *sk = sock->sk;
	struct unix_sock *u = unix_sk(sk);
	struct unix_peercred peercred = {};

	err = -EOPNOTSUPP;
	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
		goto out;	/* Only stream/seqpacket sockets accept */
	err = -EINVAL;
	if (!READ_ONCE(u->addr))
		goto out;	/* No listens on an unbound socket */
	err = prepare_peercred(&peercred);
	if (err)
		goto out;
	unix_state_lock(sk);
	if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
		goto out_unlock;
	if (backlog > sk->sk_max_ack_backlog)
		wake_up_interruptible_all(&u->peer_wait);
	sk->sk_max_ack_backlog = backlog;
	WRITE_ONCE(sk->sk_state, TCP_LISTEN);

	/* set credentials so connect can copy them */
	update_peercred(sk, &peercred);
	err = 0;

out_unlock:
	unix_state_unlock(sk);
	drop_peercred(&peercred);
out:
	return err;
}

static int unix_release(struct socket *);
static int unix_bind(struct socket *, struct sockaddr_unsized *, int);
static int unix_stream_connect(struct socket *, struct sockaddr_unsized *,
			       int addr_len, int flags);
static int unix_socketpair(struct socket *, struct socket *);
static int unix_accept(struct socket *,
		       struct socket *, struct proto_accept_arg *arg);
static int unix_getname(struct socket *, struct sockaddr *, int);
static __poll_t unix_poll(struct file *, struct socket *, poll_table *);
static __poll_t unix_dgram_poll(struct file *, struct socket *,
				poll_table *);
static int unix_ioctl(struct socket *, unsigned int, unsigned long);
#ifdef CONFIG_COMPAT
static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
#endif
static int unix_shutdown(struct socket *, int);
static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t);
static int unix_stream_recvmsg(struct socket *, struct msghdr *, size_t, int);
static ssize_t unix_stream_splice_read(struct socket *, loff_t *ppos,
				       struct pipe_inode_info *, size_t size,
				       unsigned int flags);
static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t);
static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int);
static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
static int unix_dgram_connect(struct socket *, struct sockaddr_unsized *,
			      int, int);
static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t);
static int unix_seqpacket_recvmsg(struct socket *, struct msghdr *, size_t,
				  int);

#ifdef CONFIG_PROC_FS
static int unix_count_nr_fds(struct sock *sk)
{
	struct sk_buff *skb;
	struct unix_sock *u;
	int nr_fds = 0;

	spin_lock(&sk->sk_receive_queue.lock);
	skb = skb_peek(&sk->sk_receive_queue);
	while (skb) {
		u = unix_sk(skb->sk);
		nr_fds += atomic_read(&u->scm_stat.nr_fds);
		skb = skb_peek_next(skb, &sk->sk_receive_queue);
	}
	spin_unlock(&sk->sk_receive_queue.lock);

	return nr_fds;
}

static void unix_show_fdinfo(struct seq_file *m, struct socket *sock)
{
	struct sock *sk = sock->sk;
	unsigned char s_state;
	struct unix_sock *u;
	int nr_fds = 0;

	if (sk) {
		s_state = READ_ONCE(sk->sk_state);
		u = unix_sk(sk);

		/* SOCK_STREAM and SOCK_SEQPACKET sockets never change their
		 * sk_state after switching to TCP_ESTABLISHED or TCP_LISTEN.
		 * SOCK_DGRAM is ordinary.  So, no lock is needed.
		 */
		if (sock->type == SOCK_DGRAM || s_state == TCP_ESTABLISHED)
			nr_fds = atomic_read(&u->scm_stat.nr_fds);
		else if (s_state == TCP_LISTEN)
			nr_fds = unix_count_nr_fds(sk);

		seq_printf(m, "scm_fds: %u\n", nr_fds);
	}
}
#else
#define unix_show_fdinfo NULL
#endif

static bool unix_custom_sockopt(int optname)
{
	switch (optname) {
	case SO_INQ:
		return true;
	default:
		return false;
	}
}

static int unix_setsockopt(struct socket *sock, int level, int optname,
			   sockptr_t optval, unsigned int optlen)
{
	struct unix_sock *u = unix_sk(sock->sk);
	struct sock *sk = sock->sk;
	int val;

	if (level != SOL_SOCKET)
		return -EOPNOTSUPP;

	if (!unix_custom_sockopt(optname))
		return sock_setsockopt(sock, level, optname, optval, optlen);

	if (optlen != sizeof(int))
		return -EINVAL;

	if (copy_from_sockptr(&val, optval, sizeof(val)))
		return -EFAULT;

	switch (optname) {
	case SO_INQ:
		if (sk->sk_type != SOCK_STREAM)
			return -EINVAL;

		if (val > 1 || val < 0)
			return -EINVAL;

		WRITE_ONCE(u->recvmsg_inq, val);
		break;
	default:
		return -ENOPROTOOPT;
	}

	return 0;
}

static const struct proto_ops unix_stream_ops = {
	.family =	PF_UNIX,
	.owner =	THIS_MODULE,
	.release =	unix_release,
	.bind =		unix_bind,
	.connect =	unix_stream_connect,
	.socketpair =	unix_socketpair,
	.accept =	unix_accept,
	.getname =	unix_getname,
	.poll =		unix_poll,
	.ioctl =	unix_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl =	unix_compat_ioctl,
#endif
	.listen =	unix_listen,
	.shutdown =	unix_shutdown,
	.setsockopt =	unix_setsockopt,
	.sendmsg =	unix_stream_sendmsg,
	.recvmsg =	unix_stream_recvmsg,
	.read_skb =	unix_stream_read_skb,
	.mmap =		sock_no_mmap,
	.splice_read =	unix_stream_splice_read,
	.set_peek_off =	sk_set_peek_off,
	.show_fdinfo =	unix_show_fdinfo,
};

static const struct proto_ops unix_dgram_ops = {
	.family =	PF_UNIX,
	.owner =	THIS_MODULE,
	.release =	unix_release,
	.bind =		unix_bind,
	.connect =	unix_dgram_connect,
	.socketpair =	unix_socketpair,
	.accept =	sock_no_accept,
	.getname =	unix_getname,
	.poll =		unix_dgram_poll,
	.ioctl =	unix_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl =	unix_compat_ioctl,
#endif
	.listen =	sock_no_listen,
	.shutdown =	unix_shutdown,
	.sendmsg =	unix_dgram_sendmsg,
	.read_skb =	unix_read_skb,
	.recvmsg =	unix_dgram_recvmsg,
	.mmap =		sock_no_mmap,
	.set_peek_off =	sk_set_peek_off,
	.show_fdinfo =	unix_show_fdinfo,
};

static const struct proto_ops unix_seqpacket_ops = {
	.family =	PF_UNIX,
	.owner =	THIS_MODULE,
	.release =	unix_release,
	.bind =		unix_bind,
	.connect =	unix_stream_connect,
	.socketpair =	unix_socketpair,
	.accept =	unix_accept,
	.getname =	unix_getname,
	.poll =		unix_dgram_poll,
	.ioctl =	unix_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl =	unix_compat_ioctl,
#endif
	.listen =	unix_listen,
	.shutdown =	unix_shutdown,
	.sendmsg =	unix_seqpacket_sendmsg,
	.recvmsg =	unix_seqpacket_recvmsg,
	.mmap =		sock_no_mmap,
	.set_peek_off =	sk_set_peek_off,
	.show_fdinfo =	unix_show_fdinfo,
};

static void unix_close(struct sock *sk, long timeout)
{
	/* Nothing to do here, unix socket does not need a ->close().
	 * This is merely for sockmap.
	 */
}

static bool unix_bpf_bypass_getsockopt(int level, int optname)
{
	if (level == SOL_SOCKET) {
		switch (optname) {
		case SO_PEERPIDFD:
			return true;
		default:
			return false;
		}
	}

	return false;
}

struct proto unix_dgram_proto = {
	.name			= "UNIX",
	.owner			= THIS_MODULE,
	.obj_size		= sizeof(struct unix_sock),
	.close			= unix_close,
	.bpf_bypass_getsockopt	= unix_bpf_bypass_getsockopt,
#ifdef CONFIG_BPF_SYSCALL
	.psock_update_sk_prot	= unix_dgram_bpf_update_proto,
#endif
};

struct proto unix_stream_proto = {
	.name			= "UNIX-STREAM",
	.owner			= THIS_MODULE,
	.obj_size		= sizeof(struct unix_sock),
	.close			= unix_close,
	.bpf_bypass_getsockopt	= unix_bpf_bypass_getsockopt,
#ifdef CONFIG_BPF_SYSCALL
	.psock_update_sk_prot	= unix_stream_bpf_update_proto,
#endif
};

static struct sock *unix_create1(struct net *net, struct socket *sock, int kern, int type)
{
	struct unix_sock *u;
	struct sock *sk;
	int err;

	atomic_long_inc(&unix_nr_socks);
	if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files()) {
		err = -ENFILE;
		goto err;
	}

	if (type == SOCK_STREAM)
		sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_stream_proto, kern);
	else /* dgram and seqpacket */
		sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_dgram_proto, kern);

	if (!sk) {
		err = -ENOMEM;
		goto err;
	}

	sock_init_data(sock, sk);

	sk->sk_scm_rights = 1;
	sk->sk_hash = unix_unbound_hash(sk);
	sk->sk_allocation = GFP_KERNEL_ACCOUNT;
	sk->sk_write_space = unix_write_space;
	sk->sk_max_ack_backlog = READ_ONCE(net->unx.sysctl_max_dgram_qlen);
	sk->sk_destruct = unix_sock_destructor;
	lock_set_cmp_fn(&sk->sk_receive_queue.lock, unix_recvq_lock_cmp_fn, NULL);

	u = unix_sk(sk);
	u->listener = NULL;
	u->vertex = NULL;
	u->path.dentry = NULL;
	u->path.mnt = NULL;
	spin_lock_init(&u->lock);
	lock_set_cmp_fn(&u->lock, unix_state_lock_cmp_fn, NULL);
	mutex_init(&u->iolock); /* single task reading lock */
	mutex_init(&u->bindlock); /* single task binding lock */
	init_waitqueue_head(&u->peer_wait);
	init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay);
	memset(&u->scm_stat, 0, sizeof(struct scm_stat));
	unix_insert_unbound_socket(net, sk);

	sock_prot_inuse_add(net, sk->sk_prot, 1);

	return sk;

err:
	atomic_long_dec(&unix_nr_socks);
	return ERR_PTR(err);
}

static int unix_create(struct net *net, struct socket *sock, int protocol,
		       int kern)
{
	struct sock *sk;

	if (protocol && protocol != PF_UNIX)
		return -EPROTONOSUPPORT;

	sock->state = SS_UNCONNECTED;

	switch (sock->type) {
	case SOCK_STREAM:
		set_bit(SOCK_CUSTOM_SOCKOPT, &sock->flags);
		sock->ops = &unix_stream_ops;
		break;
		/*
		 *	Believe it or not BSD has AF_UNIX, SOCK_RAW though
		 *	nothing uses it.
		 */
	case SOCK_RAW:
		sock->type = SOCK_DGRAM;
		fallthrough;
	case SOCK_DGRAM:
		sock->ops = &unix_dgram_ops;
		break;
	case SOCK_SEQPACKET:
		sock->ops = &unix_seqpacket_ops;
		break;
	default:
		return -ESOCKTNOSUPPORT;
	}

	sk = unix_create1(net, sock, kern, sock->type);
	if (IS_ERR(sk))
		return PTR_ERR(sk);

	return 0;
}

static int unix_release(struct socket *sock)
{
	struct sock *sk = sock->sk;

	if (!sk)
		return 0;

	sk->sk_prot->close(sk, 0);
	unix_release_sock(sk, 0);
	sock->sk = NULL;

	return 0;
}

static struct sock *unix_find_bsd(struct sockaddr_un *sunaddr, int addr_len,
				  int type, int flags)
{
	struct inode *inode;
	struct path path;
	struct sock *sk;
	int err;

	unix_mkname_bsd(sunaddr, addr_len);

	if (flags & SOCK_COREDUMP) {
		const struct cred *cred;
		struct cred *kcred;
		struct path root;

		kcred = prepare_kernel_cred(&init_task);
		if (!kcred) {
			err = -ENOMEM;
			goto fail;
		}

		task_lock(&init_task);
		get_fs_root(init_task.fs, &root);
		task_unlock(&init_task);

		cred = override_creds(kcred);
		err = vfs_path_lookup(root.dentry, root.mnt, sunaddr->sun_path,
				      LOOKUP_BENEATH | LOOKUP_NO_SYMLINKS |
				      LOOKUP_NO_MAGICLINKS, &path);
		put_cred(revert_creds(cred));
		path_put(&root);
		if (err)
			goto fail;
	} else {
		err = kern_path(sunaddr->sun_path, LOOKUP_FOLLOW, &path);
		if (err)
			goto fail;

		err = path_permission(&path, MAY_WRITE);
		if (err)
			goto path_put;
	}

	err = -ECONNREFUSED;
	inode = d_backing_inode(path.dentry);
	if (!S_ISSOCK(inode->i_mode))
		goto path_put;

	sk = unix_find_socket_byinode(inode);
	if (!sk)
		goto path_put;

	err = -EPROTOTYPE;
	if (sk->sk_type == type)
		touch_atime(&path);
	else
		goto sock_put;

	path_put(&path);

	return sk;

sock_put:
	sock_put(sk);
path_put:
	path_put(&path);
fail:
	return ERR_PTR(err);
}

static struct sock *unix_find_abstract(struct net *net,
				       struct sockaddr_un *sunaddr,
				       int addr_len, int type)
{
	unsigned int hash = unix_abstract_hash(sunaddr, addr_len, type);
	struct dentry *dentry;
	struct sock *sk;

	sk = unix_find_socket_byname(net, sunaddr, addr_len, hash);
	if (!sk)
		return ERR_PTR(-ECONNREFUSED);

	dentry = unix_sk(sk)->path.dentry;
	if (dentry)
		touch_atime(&unix_sk(sk)->path);

	return sk;
}

static struct sock *unix_find_other(struct net *net,
				    struct sockaddr_un *sunaddr,
				    int addr_len, int type, int flags)
{
	struct sock *sk;

	if (sunaddr->sun_path[0])
		sk = unix_find_bsd(sunaddr, addr_len, type, flags);
	else
		sk = unix_find_abstract(net, sunaddr, addr_len, type);

	return sk;
}

static int unix_autobind(struct sock *sk)
{
	struct unix_sock *u = unix_sk(sk);
	unsigned int new_hash, old_hash;
	struct net *net = sock_net(sk);
	struct unix_address *addr;
	u32 lastnum, ordernum;
	int err;

	err = mutex_lock_interruptible(&u->bindlock);
	if (err)
		return err;

	if (u->addr)
		goto out;

	err = -ENOMEM;
	addr = kzalloc(sizeof(*addr) +
		       offsetof(struct sockaddr_un, sun_path) + 16, GFP_KERNEL);
	if (!addr)
		goto out;

	addr->len = offsetof(struct sockaddr_un, sun_path) + 6;
	addr->name->sun_family = AF_UNIX;
	refcount_set(&addr->refcnt, 1);

	old_hash = sk->sk_hash;
	ordernum = get_random_u32();
	lastnum = ordernum & 0xFFFFF;
retry:
	ordernum = (ordernum + 1) & 0xFFFFF;
	sprintf(addr->name->sun_path + 1, "%05x", ordernum);

	new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
	unix_table_double_lock(net, old_hash, new_hash);

	if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash)) {
		unix_table_double_unlock(net, old_hash, new_hash);

		/* __unix_find_socket_byname() may take a long time if many
		 * names are already in use.
		 */
		cond_resched();

		if (ordernum == lastnum) {
			/* Give up if all names seem to be in use. */
			err = -ENOSPC;
			unix_release_addr(addr);
			goto out;
		}

		goto retry;
	}

	__unix_set_addr_hash(net, sk, addr, new_hash);
	unix_table_double_unlock(net, old_hash, new_hash);
	err = 0;

out:	mutex_unlock(&u->bindlock);
	return err;
}

static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr,
			 int addr_len)
{
	umode_t mode = S_IFSOCK |
	       (SOCK_INODE(sk->sk_socket)->i_mode & ~current_umask());
	struct unix_sock *u = unix_sk(sk);
	unsigned int new_hash, old_hash;
	struct net *net = sock_net(sk);
	struct mnt_idmap *idmap;
	struct unix_address *addr;
	struct dentry *dentry;
	struct path parent;
	int err;

	addr_len = unix_mkname_bsd(sunaddr, addr_len);
	addr = unix_create_addr(sunaddr, addr_len);
	if (!addr)
		return -ENOMEM;

	/*
	 * Get the parent directory, calculate the hash for last
	 * component.
	 */
	dentry = start_creating_path(AT_FDCWD, addr->name->sun_path, &parent, 0);
	if (IS_ERR(dentry)) {
		err = PTR_ERR(dentry);
		goto out;
	}

	/*
	 * All right, let's create it.
	 */
	idmap = mnt_idmap(parent.mnt);
	err = security_path_mknod(&parent, dentry, mode, 0);
	if (!err)
		err = vfs_mknod(idmap, d_inode(parent.dentry), dentry, mode, 0);
	if (err)
		goto out_path;
	err = mutex_lock_interruptible(&u->bindlock);
	if (err)
		goto out_unlink;
	if (u->addr)
		goto out_unlock;

	old_hash = sk->sk_hash;
	new_hash = unix_bsd_hash(d_backing_inode(dentry));
	unix_table_double_lock(net, old_hash, new_hash);
	u->path.mnt = mntget(parent.mnt);
	u->path.dentry = dget(dentry);
	__unix_set_addr_hash(net, sk, addr, new_hash);
	unix_table_double_unlock(net, old_hash, new_hash);
	unix_insert_bsd_socket(sk);
	mutex_unlock(&u->bindlock);
	end_creating_path(&parent, dentry);
	return 0;

out_unlock:
	mutex_unlock(&u->bindlock);
	err = -EINVAL;
out_unlink:
	/* failed after successful mknod?  unlink what we'd created... */
	vfs_unlink(idmap, d_inode(parent.dentry), dentry, NULL);
out_path:
	end_creating_path(&parent, dentry);
out:
	unix_release_addr(addr);
	return err == -EEXIST ? -EADDRINUSE : err;
}

static int unix_bind_abstract(struct sock *sk, struct sockaddr_un *sunaddr,
			      int addr_len)
{
	struct unix_sock *u = unix_sk(sk);
	unsigned int new_hash, old_hash;
	struct net *net = sock_net(sk);
	struct unix_address *addr;
	int err;

	addr = unix_create_addr(sunaddr, addr_len);
	if (!addr)
		return -ENOMEM;

	err = mutex_lock_interruptible(&u->bindlock);
	if (err)
		goto out;

	if (u->addr) {
		err = -EINVAL;
		goto out_mutex;
	}

	old_hash = sk->sk_hash;
	new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
	unix_table_double_lock(net, old_hash, new_hash);

	if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash))
		goto out_spin;

	__unix_set_addr_hash(net, sk, addr, new_hash);
	unix_table_double_unlock(net, old_hash, new_hash);
	mutex_unlock(&u->bindlock);
	return 0;

out_spin:
	unix_table_double_unlock(net, old_hash, new_hash);
	err = -EADDRINUSE;
out_mutex:
	mutex_unlock(&u->bindlock);
out:
	unix_release_addr(addr);
	return err;
}

static int unix_bind(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len)
{
	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
	struct sock *sk = sock->sk;
	int err;

	if (addr_len == offsetof(struct sockaddr_un, sun_path) &&
	    sunaddr->sun_family == AF_UNIX)
		return unix_autobind(sk);

	err = unix_validate_addr(sunaddr, addr_len);
	if (err)
		return err;

	if (sunaddr->sun_path[0])
		err = unix_bind_bsd(sk, sunaddr, addr_len);
	else
		err = unix_bind_abstract(sk, sunaddr, addr_len);

	return err;
}

static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)
{
	if (unlikely(sk1 == sk2) || !sk2) {
		unix_state_lock(sk1);
		return;
	}

	if (sk1 > sk2)
		swap(sk1, sk2);

	unix_state_lock(sk1);
	unix_state_lock(sk2);
}

static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2)
{
	if (unlikely(sk1 == sk2) || !sk2) {
		unix_state_unlock(sk1);
		return;
	}
	unix_state_unlock(sk1);
	unix_state_unlock(sk2);
}

static int unix_dgram_connect(struct socket *sock, struct sockaddr_unsized *addr,
			      int alen, int flags)
{
	struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr;
	struct sock *sk = sock->sk;
	struct sock *other;
	int err;

	err = -EINVAL;
	if (alen < offsetofend(struct sockaddr, sa_family))
		goto out;

	if (addr->sa_family != AF_UNSPEC) {
		err = unix_validate_addr(sunaddr, alen);
		if (err)
			goto out;

		err = BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk, addr, &alen);
		if (err)
			goto out;

		if (unix_may_passcred(sk) && !READ_ONCE(unix_sk(sk)->addr)) {
			err = unix_autobind(sk);
			if (err)
				goto out;
		}

restart:
		other = unix_find_other(sock_net(sk), sunaddr, alen, sock->type, 0);
		if (IS_ERR(other)) {
			err = PTR_ERR(other);
			goto out;
		}

		unix_state_double_lock(sk, other);

		/* Apparently VFS overslept socket death. Retry.
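		 * (The lookup in unix_find_other() and the locking here are
		 * not atomic: the socket we found can be released and marked
		 * SOCK_DEAD in between, so drop it and look it up again.)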
		 */
		if (sock_flag(other, SOCK_DEAD)) {
			unix_state_double_unlock(sk, other);
			sock_put(other);
			goto restart;
		}

		err = -EPERM;
		if (!unix_may_send(sk, other))
			goto out_unlock;

		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
		if (err)
			goto out_unlock;

		WRITE_ONCE(sk->sk_state, TCP_ESTABLISHED);
		WRITE_ONCE(other->sk_state, TCP_ESTABLISHED);
	} else {
		/*
		 *	1003.1g breaking connected state with AF_UNSPEC
		 */
		other = NULL;
		unix_state_double_lock(sk, other);
	}

	/*
	 * If it was connected, reconnect.
	 */
	if (unix_peer(sk)) {
		struct sock *old_peer = unix_peer(sk);

		unix_peer(sk) = other;
		if (!other)
			WRITE_ONCE(sk->sk_state, TCP_CLOSE);
		unix_dgram_peer_wake_disconnect_wakeup(sk, old_peer);

		unix_state_double_unlock(sk, other);

		if (other != old_peer) {
			unix_dgram_disconnected(sk, old_peer);

			unix_state_lock(old_peer);
			if (!unix_peer(old_peer))
				WRITE_ONCE(old_peer->sk_state, TCP_CLOSE);
			unix_state_unlock(old_peer);
		}

		sock_put(old_peer);
	} else {
		unix_peer(sk) = other;
		unix_state_double_unlock(sk, other);
	}

	return 0;

out_unlock:
	unix_state_double_unlock(sk, other);
	sock_put(other);
out:
	return err;
}

static long unix_wait_for_peer(struct sock *other, long timeo)
{
	struct unix_sock *u = unix_sk(other);
	int sched;
	DEFINE_WAIT(wait);

	prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);

	sched = !sock_flag(other, SOCK_DEAD) &&
		!(other->sk_shutdown & RCV_SHUTDOWN) &&
		unix_recvq_full_lockless(other);

	unix_state_unlock(other);

	if (sched)
		timeo = schedule_timeout(timeo);

	finish_wait(&u->peer_wait, &wait);
	return timeo;
}

static int unix_stream_connect(struct socket *sock, struct sockaddr_unsized *uaddr,
			       int addr_len, int flags)
{
	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
	struct sock *sk = sock->sk, *newsk = NULL, *other = NULL;
	struct unix_sock *u = unix_sk(sk), *newu, *otheru;
	struct unix_peercred peercred = {};
	struct net *net = sock_net(sk);
	struct sk_buff *skb = NULL;
	unsigned char state;
	long timeo;
	int err;

	err = unix_validate_addr(sunaddr, addr_len);
	if (err)
		goto out;

	err = BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk, uaddr, &addr_len);
	if (err)
		goto out;

	if (unix_may_passcred(sk) && !READ_ONCE(u->addr)) {
		err = unix_autobind(sk);
		if (err)
			goto out;
	}

	timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);

	/* First of all allocate resources.
	 * If we will make it after state is locked,
	 * we will have to recheck all again in any case.
	 */

	/* create new sock for complete connection */
	newsk = unix_create1(net, NULL, 0, sock->type);
	if (IS_ERR(newsk)) {
		err = PTR_ERR(newsk);
		goto out;
	}

	err = prepare_peercred(&peercred);
	if (err)
		goto out;

	/* Allocate skb for sending to listening sock */
	skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
	if (!skb) {
		err = -ENOMEM;
		goto out_free_sk;
	}

restart:
	/* Find listening sock. */
	other = unix_find_other(net, sunaddr, addr_len, sk->sk_type, flags);
	if (IS_ERR(other)) {
		err = PTR_ERR(other);
		goto out_free_skb;
	}

	unix_state_lock(other);

	/* Apparently VFS overslept socket death. Retry. */
	if (sock_flag(other, SOCK_DEAD)) {
		unix_state_unlock(other);
		sock_put(other);
		goto restart;
	}

	if (other->sk_state != TCP_LISTEN ||
	    other->sk_shutdown & RCV_SHUTDOWN) {
		err = -ECONNREFUSED;
		goto out_unlock;
	}

	if (unix_recvq_full_lockless(other)) {
		if (!timeo) {
			err = -EAGAIN;
			goto out_unlock;
		}

		timeo = unix_wait_for_peer(other, timeo);
		sock_put(other);

		err = sock_intr_errno(timeo);
		if (signal_pending(current))
			goto out_free_skb;

		goto restart;
	}

	/* self connect and simultaneous connect are eliminated
	 * by rejecting TCP_LISTEN socket to avoid deadlock.
	 */
	state = READ_ONCE(sk->sk_state);
	if (unlikely(state != TCP_CLOSE)) {
		err = state == TCP_ESTABLISHED ? -EISCONN : -EINVAL;
		goto out_unlock;
	}

	unix_state_lock(sk);

	if (unlikely(sk->sk_state != TCP_CLOSE)) {
		err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EINVAL;
		unix_state_unlock(sk);
		goto out_unlock;
	}

	err = security_unix_stream_connect(sk, other, newsk);
	if (err) {
		unix_state_unlock(sk);
		goto out_unlock;
	}

	/* The way is open!  Fastly set all the necessary fields... */

	sock_hold(sk);
	unix_peer(newsk) = sk;
	newsk->sk_state = TCP_ESTABLISHED;
	newsk->sk_type = sk->sk_type;
	newsk->sk_scm_recv_flags = other->sk_scm_recv_flags;
	init_peercred(newsk, &peercred);

	newu = unix_sk(newsk);
	newu->listener = other;
	RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
	otheru = unix_sk(other);

	/* copy address information from listening to new sock
	 *
	 * The contents of *(otheru->addr) and otheru->path
	 * are seen fully set up here, since we have found
	 * otheru in hash under its lock.  Insertion into the
	 * hash chain we'd found it in had been done in an
	 * earlier critical area protected by the chain's lock,
	 * the same one where we'd set *(otheru->addr) contents,
	 * as well as otheru->path and otheru->addr itself.
	 *
	 * Using smp_store_release() here to set newu->addr
	 * is enough to make those stores, as well as stores
	 * to newu->path visible to anyone who gets newu->addr
	 * by smp_load_acquire().  IOW, the same guarantees
	 * as for unix_sock instances bound in unix_bind() or
	 * in unix_autobind().
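	 *
	 * (The matching smp_load_acquire() on this field is, for example,
	 * the one in unix_getname() below.)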
	 */
	if (otheru->path.dentry) {
		path_get(&otheru->path);
		newu->path = otheru->path;
	}
	refcount_inc(&otheru->addr->refcnt);
	smp_store_release(&newu->addr, otheru->addr);

	/* Set credentials */
	copy_peercred(sk, other);

	sock->state = SS_CONNECTED;
	WRITE_ONCE(sk->sk_state, TCP_ESTABLISHED);
	sock_hold(newsk);

	smp_mb__after_atomic();	/* sock_hold() does an atomic_inc() */
	unix_peer(sk) = newsk;

	unix_state_unlock(sk);

	/* take ten and send info to listening sock */
	spin_lock(&other->sk_receive_queue.lock);
	__skb_queue_tail(&other->sk_receive_queue, skb);
	spin_unlock(&other->sk_receive_queue.lock);
	unix_state_unlock(other);
	other->sk_data_ready(other);
	sock_put(other);
	return 0;

out_unlock:
	unix_state_unlock(other);
	sock_put(other);
out_free_skb:
	consume_skb(skb);
out_free_sk:
	unix_release_sock(newsk, 0);
out:
	drop_peercred(&peercred);
	return err;
}

static int unix_socketpair(struct socket *socka, struct socket *sockb)
{
	struct unix_peercred ska_peercred = {}, skb_peercred = {};
	struct sock *ska = socka->sk, *skb = sockb->sk;
	int err;

	err = prepare_peercred(&ska_peercred);
	if (err)
		return err;

	err = prepare_peercred(&skb_peercred);
	if (err) {
		drop_peercred(&ska_peercred);
		return err;
	}

	/* Join our sockets back to back */
	sock_hold(ska);
	sock_hold(skb);
	unix_peer(ska) = skb;
	unix_peer(skb) = ska;
	init_peercred(ska, &ska_peercred);
	init_peercred(skb, &skb_peercred);

	ska->sk_state = TCP_ESTABLISHED;
	skb->sk_state = TCP_ESTABLISHED;
	socka->state = SS_CONNECTED;
	sockb->state = SS_CONNECTED;
	return 0;
}

static int unix_accept(struct socket *sock, struct socket *newsock,
		       struct proto_accept_arg *arg)
{
	struct sock *sk = sock->sk;
	struct sk_buff *skb;
	struct sock *tsk;

	arg->err = -EOPNOTSUPP;
	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
		goto out;

	arg->err = -EINVAL;
	if (READ_ONCE(sk->sk_state) != TCP_LISTEN)
		goto out;

	/* If socket state is TCP_LISTEN it cannot change (for now...),
	 * so that no locks are necessary.
	 */

	skb = skb_recv_datagram(sk, (arg->flags & O_NONBLOCK) ? MSG_DONTWAIT : 0,
				&arg->err);
	if (!skb) {
		/* This means receive shutdown. */
		if (arg->err == 0)
			arg->err = -EINVAL;
		goto out;
	}

	tsk = skb->sk;
	skb_free_datagram(sk, skb);
	wake_up_interruptible(&unix_sk(sk)->peer_wait);

	if (tsk->sk_type == SOCK_STREAM)
		set_bit(SOCK_CUSTOM_SOCKOPT, &newsock->flags);

	/* attach accepted sock to socket */
	unix_state_lock(tsk);
	unix_update_edges(unix_sk(tsk));
	newsock->state = SS_CONNECTED;
	sock_graft(tsk, newsock);
	unix_state_unlock(tsk);
	return 0;

out:
	return arg->err;
}

static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int peer)
{
	struct sock *sk = sock->sk;
	struct unix_address *addr;
	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr);
	int err = 0;

	if (peer) {
		sk = unix_peer_get(sk);

		err = -ENOTCONN;
		if (!sk)
			goto out;
		err = 0;
	} else {
		sock_hold(sk);
	}

	addr = smp_load_acquire(&unix_sk(sk)->addr);
	if (!addr) {
		sunaddr->sun_family = AF_UNIX;
		sunaddr->sun_path[0] = 0;
		err = offsetof(struct sockaddr_un, sun_path);
	} else {
		err = addr->len;
		memcpy(sunaddr, addr->name, addr->len);

		if (peer)
			BPF_CGROUP_RUN_SA_PROG(sk, uaddr, &err,
					       CGROUP_UNIX_GETPEERNAME);
		else
			BPF_CGROUP_RUN_SA_PROG(sk, uaddr, &err,
					       CGROUP_UNIX_GETSOCKNAME);
	}
	sock_put(sk);
out:
	return err;
}

/* The "user->unix_inflight" variable is protected by the garbage
 * collection lock, and we just read it locklessly here.  If you go
 * over the limit, there might be a tiny race in actually noticing
 * it across threads.  Tough.
 */
static inline bool too_many_unix_fds(struct task_struct *p)
{
	struct user_struct *user = current_user();

	if (unlikely(READ_ONCE(user->unix_inflight) > task_rlimit(p, RLIMIT_NOFILE)))
		return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN);
	return false;
}

static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
{
	if (too_many_unix_fds(current))
		return -ETOOMANYREFS;

	UNIXCB(skb).fp = scm->fp;
	scm->fp = NULL;

	if (unix_prepare_fpl(UNIXCB(skb).fp))
		return -ENOMEM;

	return 0;
}

static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
{
	scm->fp = UNIXCB(skb).fp;
	UNIXCB(skb).fp = NULL;

	unix_destroy_fpl(scm->fp);
}

static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb)
{
	scm->fp = scm_fp_dup(UNIXCB(skb).fp);
}

static void unix_destruct_scm(struct sk_buff *skb)
{
	struct scm_cookie scm;

	memset(&scm, 0, sizeof(scm));
	scm.pid = UNIXCB(skb).pid;
	if (UNIXCB(skb).fp)
		unix_detach_fds(&scm, skb);

	/* Alas, it calls VFS */
	/* So fscking what?  fput() had been SMP-safe since the last Summer */
	scm_destroy(&scm);
	sock_wfree(skb);
}

static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
{
	int err = 0;

	UNIXCB(skb).pid = get_pid(scm->pid);
	UNIXCB(skb).uid = scm->creds.uid;
	UNIXCB(skb).gid = scm->creds.gid;
	UNIXCB(skb).fp = NULL;
	unix_get_secdata(scm, skb);
	if (scm->fp && send_fds)
		err = unix_attach_fds(scm, skb);

	skb->destructor = unix_destruct_scm;
	return err;
}

static void unix_skb_to_scm(struct sk_buff *skb, struct scm_cookie *scm)
{
	scm_set_cred(scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
	unix_set_secdata(scm, skb);
}

/**
 * unix_maybe_add_creds() - Adds current task uid/gid and struct pid to skb if needed.
 * @skb: skb to attach creds to.
 * @sk: Sender sock.
 * @other: Receiver sock.
 *
 * Some apps rely on write() giving SCM_CREDENTIALS.
 * We include credentials if the source or destination socket
 * asserted SOCK_PASSCRED.
 *
 * Context: May sleep.
 * Return: On success zero, on error a negative error code is returned.
 */
static int unix_maybe_add_creds(struct sk_buff *skb, const struct sock *sk,
				const struct sock *other)
{
	if (UNIXCB(skb).pid)
		return 0;

	if (unix_may_passcred(sk) || unix_may_passcred(other) ||
	    !other->sk_socket) {
		struct pid *pid;
		int err;

		pid = task_tgid(current);
		err = pidfs_register_pid(pid);
		if (unlikely(err))
			return err;

		UNIXCB(skb).pid = get_pid(pid);
		current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid);
	}

	return 0;
}

static bool unix_skb_scm_eq(struct sk_buff *skb,
			    struct scm_cookie *scm)
{
	return UNIXCB(skb).pid == scm->pid &&
	       uid_eq(UNIXCB(skb).uid, scm->creds.uid) &&
	       gid_eq(UNIXCB(skb).gid, scm->creds.gid) &&
	       unix_secdata_eq(scm, skb);
}

static void scm_stat_add(struct sock *sk, struct sk_buff *skb)
{
	struct scm_fp_list *fp = UNIXCB(skb).fp;
	struct unix_sock *u = unix_sk(sk);

	if (unlikely(fp && fp->count)) {
		atomic_add(fp->count, &u->scm_stat.nr_fds);
		unix_add_edges(fp, u);
	}
}

static void scm_stat_del(struct sock *sk, struct sk_buff *skb)
{
	struct scm_fp_list *fp = UNIXCB(skb).fp;
	struct unix_sock *u = unix_sk(sk);

	if (unlikely(fp && fp->count)) {
		atomic_sub(fp->count, &u->scm_stat.nr_fds);
		unix_del_edges(fp);
	}
}

/*
 *	Send AF_UNIX data.
 */
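
/* (Note: judging by the SOCK_SEQPACKET checks further down, this path is
 * shared by connected SOCK_SEQPACKET sends as well as SOCK_DGRAM.)
 */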
2083 */ 2084 2085 static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg, 2086 size_t len) 2087 { 2088 struct sock *sk = sock->sk, *other = NULL; 2089 struct unix_sock *u = unix_sk(sk); 2090 struct scm_cookie scm; 2091 struct sk_buff *skb; 2092 int data_len = 0; 2093 int sk_locked; 2094 long timeo; 2095 int err; 2096 2097 err = scm_send(sock, msg, &scm, false); 2098 if (err < 0) 2099 return err; 2100 2101 if (msg->msg_flags & MSG_OOB) { 2102 err = -EOPNOTSUPP; 2103 goto out; 2104 } 2105 2106 if (msg->msg_namelen) { 2107 err = unix_validate_addr(msg->msg_name, msg->msg_namelen); 2108 if (err) 2109 goto out; 2110 2111 err = BPF_CGROUP_RUN_PROG_UNIX_SENDMSG_LOCK(sk, 2112 msg->msg_name, 2113 &msg->msg_namelen, 2114 NULL); 2115 if (err) 2116 goto out; 2117 } 2118 2119 if (unix_may_passcred(sk) && !READ_ONCE(u->addr)) { 2120 err = unix_autobind(sk); 2121 if (err) 2122 goto out; 2123 } 2124 2125 if (len > READ_ONCE(sk->sk_sndbuf) - 32) { 2126 err = -EMSGSIZE; 2127 goto out; 2128 } 2129 2130 if (len > SKB_MAX_ALLOC) { 2131 data_len = min_t(size_t, 2132 len - SKB_MAX_ALLOC, 2133 MAX_SKB_FRAGS * PAGE_SIZE); 2134 data_len = PAGE_ALIGN(data_len); 2135 2136 BUILD_BUG_ON(SKB_MAX_ALLOC < PAGE_SIZE); 2137 } 2138 2139 skb = sock_alloc_send_pskb(sk, len - data_len, data_len, 2140 msg->msg_flags & MSG_DONTWAIT, &err, 2141 PAGE_ALLOC_COSTLY_ORDER); 2142 if (!skb) 2143 goto out; 2144 2145 err = unix_scm_to_skb(&scm, skb, true); 2146 if (err < 0) 2147 goto out_free; 2148 2149 skb_put(skb, len - data_len); 2150 skb->data_len = data_len; 2151 skb->len = len; 2152 err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, len); 2153 if (err) 2154 goto out_free; 2155 2156 timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); 2157 2158 if (msg->msg_namelen) { 2159 lookup: 2160 other = unix_find_other(sock_net(sk), msg->msg_name, 2161 msg->msg_namelen, sk->sk_type, 0); 2162 if (IS_ERR(other)) { 2163 err = PTR_ERR(other); 2164 goto out_free; 2165 } 2166 } else { 2167 other = unix_peer_get(sk); 2168 if (!other) { 2169 err = -ENOTCONN; 2170 goto out_free; 2171 } 2172 } 2173 2174 if (sk_filter(other, skb) < 0) { 2175 /* Toss the packet but do not return any error to the sender */ 2176 err = len; 2177 goto out_sock_put; 2178 } 2179 2180 err = unix_maybe_add_creds(skb, sk, other); 2181 if (err) 2182 goto out_sock_put; 2183 2184 restart: 2185 sk_locked = 0; 2186 unix_state_lock(other); 2187 restart_locked: 2188 2189 if (!unix_may_send(sk, other)) { 2190 err = -EPERM; 2191 goto out_unlock; 2192 } 2193 2194 if (unlikely(sock_flag(other, SOCK_DEAD))) { 2195 /* Check with 1003.1g - what should datagram error */ 2196 2197 unix_state_unlock(other); 2198 2199 if (sk->sk_type == SOCK_SEQPACKET) { 2200 /* We are here only when racing with unix_release_sock() 2201 * is clearing @other. Never change state to TCP_CLOSE 2202 * unlike SOCK_DGRAM wants. 
2203 */ 2204 err = -EPIPE; 2205 goto out_sock_put; 2206 } 2207 2208 if (!sk_locked) 2209 unix_state_lock(sk); 2210 2211 if (unix_peer(sk) == other) { 2212 unix_peer(sk) = NULL; 2213 unix_dgram_peer_wake_disconnect_wakeup(sk, other); 2214 2215 WRITE_ONCE(sk->sk_state, TCP_CLOSE); 2216 unix_state_unlock(sk); 2217 2218 unix_dgram_disconnected(sk, other); 2219 sock_put(other); 2220 err = -ECONNREFUSED; 2221 goto out_sock_put; 2222 } 2223 2224 unix_state_unlock(sk); 2225 2226 if (!msg->msg_namelen) { 2227 err = -ECONNRESET; 2228 goto out_sock_put; 2229 } 2230 2231 sock_put(other); 2232 goto lookup; 2233 } 2234 2235 if (other->sk_shutdown & RCV_SHUTDOWN) { 2236 err = -EPIPE; 2237 goto out_unlock; 2238 } 2239 2240 if (UNIXCB(skb).fp && !other->sk_scm_rights) { 2241 err = -EPERM; 2242 goto out_unlock; 2243 } 2244 2245 if (sk->sk_type != SOCK_SEQPACKET) { 2246 err = security_unix_may_send(sk->sk_socket, other->sk_socket); 2247 if (err) 2248 goto out_unlock; 2249 } 2250 2251 /* other == sk && unix_peer(other) != sk if 2252 * - unix_peer(sk) == NULL, destination address bound to sk 2253 * - unix_peer(sk) == sk by time of get but disconnected before lock 2254 */ 2255 if (other != sk && 2256 unlikely(unix_peer(other) != sk && 2257 unix_recvq_full_lockless(other))) { 2258 if (timeo) { 2259 timeo = unix_wait_for_peer(other, timeo); 2260 2261 err = sock_intr_errno(timeo); 2262 if (signal_pending(current)) 2263 goto out_sock_put; 2264 2265 goto restart; 2266 } 2267 2268 if (!sk_locked) { 2269 unix_state_unlock(other); 2270 unix_state_double_lock(sk, other); 2271 } 2272 2273 if (unix_peer(sk) != other || 2274 unix_dgram_peer_wake_me(sk, other)) { 2275 err = -EAGAIN; 2276 sk_locked = 1; 2277 goto out_unlock; 2278 } 2279 2280 if (!sk_locked) { 2281 sk_locked = 1; 2282 goto restart_locked; 2283 } 2284 } 2285 2286 if (unlikely(sk_locked)) 2287 unix_state_unlock(sk); 2288 2289 if (sock_flag(other, SOCK_RCVTSTAMP)) 2290 __net_timestamp(skb); 2291 2292 scm_stat_add(other, skb); 2293 skb_queue_tail(&other->sk_receive_queue, skb); 2294 unix_state_unlock(other); 2295 other->sk_data_ready(other); 2296 sock_put(other); 2297 scm_destroy(&scm); 2298 return len; 2299 2300 out_unlock: 2301 if (sk_locked) 2302 unix_state_unlock(sk); 2303 unix_state_unlock(other); 2304 out_sock_put: 2305 sock_put(other); 2306 out_free: 2307 consume_skb(skb); 2308 out: 2309 scm_destroy(&scm); 2310 return err; 2311 } 2312 2313 /* We use paged skbs for stream sockets, and limit occupancy to 32768 2314 * bytes, and a minimum of a full page. 
2315 */ 2316 #define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768)) 2317 2318 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 2319 static int queue_oob(struct sock *sk, struct msghdr *msg, struct sock *other, 2320 struct scm_cookie *scm, bool fds_sent) 2321 { 2322 struct unix_sock *ousk = unix_sk(other); 2323 struct sk_buff *skb; 2324 int err; 2325 2326 skb = sock_alloc_send_skb(sk, 1, msg->msg_flags & MSG_DONTWAIT, &err); 2327 2328 if (!skb) 2329 return err; 2330 2331 err = unix_scm_to_skb(scm, skb, !fds_sent); 2332 if (err < 0) 2333 goto out; 2334 2335 err = unix_maybe_add_creds(skb, sk, other); 2336 if (err) 2337 goto out; 2338 2339 skb_put(skb, 1); 2340 err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, 1); 2341 2342 if (err) 2343 goto out; 2344 2345 unix_state_lock(other); 2346 2347 if (sock_flag(other, SOCK_DEAD) || 2348 (other->sk_shutdown & RCV_SHUTDOWN)) { 2349 err = -EPIPE; 2350 goto out_unlock; 2351 } 2352 2353 if (UNIXCB(skb).fp && !other->sk_scm_rights) { 2354 err = -EPERM; 2355 goto out_unlock; 2356 } 2357 2358 scm_stat_add(other, skb); 2359 2360 spin_lock(&other->sk_receive_queue.lock); 2361 WRITE_ONCE(ousk->oob_skb, skb); 2362 WRITE_ONCE(ousk->inq_len, ousk->inq_len + 1); 2363 __skb_queue_tail(&other->sk_receive_queue, skb); 2364 spin_unlock(&other->sk_receive_queue.lock); 2365 2366 sk_send_sigurg(other); 2367 unix_state_unlock(other); 2368 other->sk_data_ready(other); 2369 2370 return 0; 2371 out_unlock: 2372 unix_state_unlock(other); 2373 out: 2374 consume_skb(skb); 2375 return err; 2376 } 2377 #endif 2378 2379 static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, 2380 size_t len) 2381 { 2382 struct sock *sk = sock->sk; 2383 struct sk_buff *skb = NULL; 2384 struct sock *other = NULL; 2385 struct unix_sock *otheru; 2386 struct scm_cookie scm; 2387 bool fds_sent = false; 2388 int err, sent = 0; 2389 2390 err = scm_send(sock, msg, &scm, false); 2391 if (err < 0) 2392 return err; 2393 2394 if (msg->msg_flags & MSG_OOB) { 2395 err = -EOPNOTSUPP; 2396 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 2397 if (len) 2398 len--; 2399 else 2400 #endif 2401 goto out_err; 2402 } 2403 2404 if (msg->msg_namelen) { 2405 err = READ_ONCE(sk->sk_state) == TCP_ESTABLISHED ? 
-EISCONN : -EOPNOTSUPP; 2406 goto out_err; 2407 } 2408 2409 other = unix_peer(sk); 2410 if (!other) { 2411 err = -ENOTCONN; 2412 goto out_err; 2413 } 2414 2415 otheru = unix_sk(other); 2416 2417 if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN) 2418 goto out_pipe; 2419 2420 while (sent < len) { 2421 int size = len - sent; 2422 int data_len; 2423 2424 if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) { 2425 skb = sock_alloc_send_pskb(sk, 0, 0, 2426 msg->msg_flags & MSG_DONTWAIT, 2427 &err, 0); 2428 } else { 2429 /* Keep two messages in the pipe so it schedules better */ 2430 size = min_t(int, size, (READ_ONCE(sk->sk_sndbuf) >> 1) - 64); 2431 2432 /* allow fallback to order-0 allocations */ 2433 size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ); 2434 2435 data_len = max_t(int, 0, size - SKB_MAX_HEAD(0)); 2436 2437 data_len = min_t(size_t, size, PAGE_ALIGN(data_len)); 2438 2439 skb = sock_alloc_send_pskb(sk, size - data_len, data_len, 2440 msg->msg_flags & MSG_DONTWAIT, &err, 2441 get_order(UNIX_SKB_FRAGS_SZ)); 2442 } 2443 if (!skb) 2444 goto out_err; 2445 2446 /* Only send the fds in the first buffer */ 2447 err = unix_scm_to_skb(&scm, skb, !fds_sent); 2448 if (err < 0) 2449 goto out_free; 2450 2451 fds_sent = true; 2452 2453 err = unix_maybe_add_creds(skb, sk, other); 2454 if (err) 2455 goto out_free; 2456 2457 if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) { 2458 skb->ip_summed = CHECKSUM_UNNECESSARY; 2459 err = skb_splice_from_iter(skb, &msg->msg_iter, size); 2460 if (err < 0) 2461 goto out_free; 2462 2463 size = err; 2464 refcount_add(size, &sk->sk_wmem_alloc); 2465 } else { 2466 skb_put(skb, size - data_len); 2467 skb->data_len = data_len; 2468 skb->len = size; 2469 err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size); 2470 if (err) 2471 goto out_free; 2472 } 2473 2474 unix_state_lock(other); 2475 2476 if (sock_flag(other, SOCK_DEAD) || 2477 (other->sk_shutdown & RCV_SHUTDOWN)) 2478 goto out_pipe_unlock; 2479 2480 if (UNIXCB(skb).fp && !other->sk_scm_rights) { 2481 unix_state_unlock(other); 2482 err = -EPERM; 2483 goto out_free; 2484 } 2485 2486 scm_stat_add(other, skb); 2487 2488 spin_lock(&other->sk_receive_queue.lock); 2489 WRITE_ONCE(otheru->inq_len, otheru->inq_len + skb->len); 2490 __skb_queue_tail(&other->sk_receive_queue, skb); 2491 spin_unlock(&other->sk_receive_queue.lock); 2492 2493 unix_state_unlock(other); 2494 other->sk_data_ready(other); 2495 sent += size; 2496 } 2497 2498 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 2499 if (msg->msg_flags & MSG_OOB) { 2500 err = queue_oob(sk, msg, other, &scm, fds_sent); 2501 if (err) 2502 goto out_err; 2503 sent++; 2504 } 2505 #endif 2506 2507 scm_destroy(&scm); 2508 2509 return sent; 2510 2511 out_pipe_unlock: 2512 unix_state_unlock(other); 2513 out_pipe: 2514 if (!sent && !(msg->msg_flags & MSG_NOSIGNAL)) 2515 send_sig(SIGPIPE, current, 0); 2516 err = -EPIPE; 2517 out_free: 2518 consume_skb(skb); 2519 out_err: 2520 scm_destroy(&scm); 2521 return sent ? 
: err; 2522 } 2523 2524 static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg, 2525 size_t len) 2526 { 2527 int err; 2528 struct sock *sk = sock->sk; 2529 2530 err = sock_error(sk); 2531 if (err) 2532 return err; 2533 2534 if (READ_ONCE(sk->sk_state) != TCP_ESTABLISHED) 2535 return -ENOTCONN; 2536 2537 if (msg->msg_namelen) 2538 msg->msg_namelen = 0; 2539 2540 return unix_dgram_sendmsg(sock, msg, len); 2541 } 2542 2543 static int unix_seqpacket_recvmsg(struct socket *sock, struct msghdr *msg, 2544 size_t size, int flags) 2545 { 2546 struct sock *sk = sock->sk; 2547 2548 if (READ_ONCE(sk->sk_state) != TCP_ESTABLISHED) 2549 return -ENOTCONN; 2550 2551 return unix_dgram_recvmsg(sock, msg, size, flags); 2552 } 2553 2554 static void unix_copy_addr(struct msghdr *msg, struct sock *sk) 2555 { 2556 struct unix_address *addr = smp_load_acquire(&unix_sk(sk)->addr); 2557 2558 if (addr) { 2559 msg->msg_namelen = addr->len; 2560 memcpy(msg->msg_name, addr->name, addr->len); 2561 } 2562 } 2563 2564 int __unix_dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t size, 2565 int flags) 2566 { 2567 struct scm_cookie scm; 2568 struct socket *sock = sk->sk_socket; 2569 struct unix_sock *u = unix_sk(sk); 2570 struct sk_buff *skb, *last; 2571 long timeo; 2572 int skip; 2573 int err; 2574 2575 err = -EOPNOTSUPP; 2576 if (flags&MSG_OOB) 2577 goto out; 2578 2579 timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); 2580 2581 do { 2582 mutex_lock(&u->iolock); 2583 2584 skip = sk_peek_offset(sk, flags); 2585 skb = __skb_try_recv_datagram(sk, &sk->sk_receive_queue, flags, 2586 &skip, &err, &last); 2587 if (skb) { 2588 if (!(flags & MSG_PEEK)) 2589 scm_stat_del(sk, skb); 2590 break; 2591 } 2592 2593 mutex_unlock(&u->iolock); 2594 2595 if (err != -EAGAIN) 2596 break; 2597 } while (timeo && 2598 !__skb_wait_for_more_packets(sk, &sk->sk_receive_queue, 2599 &err, &timeo, last)); 2600 2601 if (!skb) { /* implies iolock unlocked */ 2602 /* Signal EOF on disconnected non-blocking SEQPACKET socket. */ 2603 if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN && 2604 (READ_ONCE(sk->sk_shutdown) & RCV_SHUTDOWN)) 2605 err = 0; 2606 goto out; 2607 } 2608 2609 if (wq_has_sleeper(&u->peer_wait)) 2610 wake_up_interruptible_sync_poll(&u->peer_wait, 2611 EPOLLOUT | EPOLLWRNORM | 2612 EPOLLWRBAND); 2613 2614 if (msg->msg_name) { 2615 unix_copy_addr(msg, skb->sk); 2616 2617 BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk, 2618 msg->msg_name, 2619 &msg->msg_namelen); 2620 } 2621 2622 if (size > skb->len - skip) 2623 size = skb->len - skip; 2624 else if (size < skb->len - skip) 2625 msg->msg_flags |= MSG_TRUNC; 2626 2627 err = skb_copy_datagram_msg(skb, skip, msg, size); 2628 if (err) 2629 goto out_free; 2630 2631 if (sock_flag(sk, SOCK_RCVTSTAMP)) 2632 __sock_recv_timestamp(msg, sk, skb); 2633 2634 memset(&scm, 0, sizeof(scm)); 2635 2636 unix_skb_to_scm(skb, &scm); 2637 2638 if (!(flags & MSG_PEEK)) { 2639 if (UNIXCB(skb).fp) 2640 unix_detach_fds(&scm, skb); 2641 2642 sk_peek_offset_bwd(sk, skb->len); 2643 } else { 2644 /* It is questionable: on PEEK we could: 2645 - do not return fds - good, but too simple 8) 2646 - return fds, and do not return them on read (old strategy, 2647 apparently wrong) 2648 - clone fds (I chose it for now, it is the most universal 2649 solution) 2650 2651 POSIX 1003.1g does not actually define this clearly 2652 at all. POSIX 1003.1g doesn't define a lot of things 2653 clearly however! 
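
   (One practical consequence of the "clone fds" choice, as an illustration:
   a recvmsg() with MSG_PEEK that supplies control-message space receives
   its own duplicated descriptors via SCM_RIGHTS and must close them; the
   subsequent non-PEEK read delivers another, independent set.)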
2654 2655 */ 2656 2657 sk_peek_offset_fwd(sk, size); 2658 2659 if (UNIXCB(skb).fp) 2660 unix_peek_fds(&scm, skb); 2661 } 2662 err = (flags & MSG_TRUNC) ? skb->len - skip : size; 2663 2664 scm_recv_unix(sock, msg, &scm, flags); 2665 2666 out_free: 2667 skb_free_datagram(sk, skb); 2668 mutex_unlock(&u->iolock); 2669 out: 2670 return err; 2671 } 2672 2673 static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, 2674 int flags) 2675 { 2676 struct sock *sk = sock->sk; 2677 2678 #ifdef CONFIG_BPF_SYSCALL 2679 const struct proto *prot = READ_ONCE(sk->sk_prot); 2680 2681 if (prot != &unix_dgram_proto) 2682 return prot->recvmsg(sk, msg, size, flags, NULL); 2683 #endif 2684 return __unix_dgram_recvmsg(sk, msg, size, flags); 2685 } 2686 2687 static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor) 2688 { 2689 struct unix_sock *u = unix_sk(sk); 2690 struct sk_buff *skb; 2691 int err; 2692 2693 mutex_lock(&u->iolock); 2694 skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err); 2695 mutex_unlock(&u->iolock); 2696 if (!skb) 2697 return err; 2698 2699 return recv_actor(sk, skb); 2700 } 2701 2702 /* 2703 * Sleep until more data has arrived. But check for races.. 2704 */ 2705 static long unix_stream_data_wait(struct sock *sk, long timeo, 2706 struct sk_buff *last, unsigned int last_len, 2707 bool freezable) 2708 { 2709 unsigned int state = TASK_INTERRUPTIBLE | freezable * TASK_FREEZABLE; 2710 struct sk_buff *tail; 2711 DEFINE_WAIT(wait); 2712 2713 unix_state_lock(sk); 2714 2715 for (;;) { 2716 prepare_to_wait(sk_sleep(sk), &wait, state); 2717 2718 tail = skb_peek_tail(&sk->sk_receive_queue); 2719 if (tail != last || 2720 (tail && tail->len != last_len) || 2721 sk->sk_err || 2722 (sk->sk_shutdown & RCV_SHUTDOWN) || 2723 signal_pending(current) || 2724 !timeo) 2725 break; 2726 2727 sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); 2728 unix_state_unlock(sk); 2729 timeo = schedule_timeout(timeo); 2730 unix_state_lock(sk); 2731 2732 if (sock_flag(sk, SOCK_DEAD)) 2733 break; 2734 2735 sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); 2736 } 2737 2738 finish_wait(sk_sleep(sk), &wait); 2739 unix_state_unlock(sk); 2740 return timeo; 2741 } 2742 2743 struct unix_stream_read_state { 2744 int (*recv_actor)(struct sk_buff *, int, int, 2745 struct unix_stream_read_state *); 2746 struct socket *socket; 2747 struct msghdr *msg; 2748 struct pipe_inode_info *pipe; 2749 size_t size; 2750 int flags; 2751 unsigned int splice_flags; 2752 }; 2753 2754 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 2755 static int unix_stream_recv_urg(struct unix_stream_read_state *state) 2756 { 2757 struct sk_buff *oob_skb, *read_skb = NULL; 2758 struct socket *sock = state->socket; 2759 struct sock *sk = sock->sk; 2760 struct unix_sock *u = unix_sk(sk); 2761 int chunk = 1; 2762 2763 mutex_lock(&u->iolock); 2764 unix_state_lock(sk); 2765 spin_lock(&sk->sk_receive_queue.lock); 2766 2767 if (sock_flag(sk, SOCK_URGINLINE) || !u->oob_skb) { 2768 spin_unlock(&sk->sk_receive_queue.lock); 2769 unix_state_unlock(sk); 2770 mutex_unlock(&u->iolock); 2771 return -EINVAL; 2772 } 2773 2774 oob_skb = u->oob_skb; 2775 2776 if (!(state->flags & MSG_PEEK)) { 2777 WRITE_ONCE(u->oob_skb, NULL); 2778 WRITE_ONCE(u->inq_len, u->inq_len - 1); 2779 2780 if (oob_skb->prev != (struct sk_buff *)&sk->sk_receive_queue && 2781 !unix_skb_len(oob_skb->prev)) { 2782 read_skb = oob_skb->prev; 2783 __skb_unlink(read_skb, &sk->sk_receive_queue); 2784 } 2785 } 2786 2787 spin_unlock(&sk->sk_receive_queue.lock); 2788 unix_state_unlock(sk); 2789 2790 chunk = 
state->recv_actor(oob_skb, 0, chunk, state); 2791 2792 if (!(state->flags & MSG_PEEK)) 2793 UNIXCB(oob_skb).consumed += 1; 2794 2795 mutex_unlock(&u->iolock); 2796 2797 consume_skb(read_skb); 2798 2799 if (chunk < 0) 2800 return -EFAULT; 2801 2802 state->msg->msg_flags |= MSG_OOB; 2803 return 1; 2804 } 2805 2806 static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk, 2807 int flags, int copied) 2808 { 2809 struct sk_buff *read_skb = NULL, *unread_skb = NULL; 2810 struct unix_sock *u = unix_sk(sk); 2811 2812 if (likely(unix_skb_len(skb) && skb != READ_ONCE(u->oob_skb))) 2813 return skb; 2814 2815 spin_lock(&sk->sk_receive_queue.lock); 2816 2817 if (!unix_skb_len(skb)) { 2818 if (copied && (!u->oob_skb || skb == u->oob_skb)) { 2819 skb = NULL; 2820 } else if (flags & MSG_PEEK) { 2821 skb = skb_peek_next(skb, &sk->sk_receive_queue); 2822 } else { 2823 read_skb = skb; 2824 skb = skb_peek_next(skb, &sk->sk_receive_queue); 2825 __skb_unlink(read_skb, &sk->sk_receive_queue); 2826 } 2827 2828 if (!skb) 2829 goto unlock; 2830 } 2831 2832 if (skb != u->oob_skb) 2833 goto unlock; 2834 2835 if (copied) { 2836 skb = NULL; 2837 } else if (!(flags & MSG_PEEK)) { 2838 WRITE_ONCE(u->oob_skb, NULL); 2839 2840 if (!sock_flag(sk, SOCK_URGINLINE)) { 2841 __skb_unlink(skb, &sk->sk_receive_queue); 2842 unread_skb = skb; 2843 skb = skb_peek(&sk->sk_receive_queue); 2844 } 2845 } else if (!sock_flag(sk, SOCK_URGINLINE)) { 2846 skb = skb_peek_next(skb, &sk->sk_receive_queue); 2847 } 2848 2849 unlock: 2850 spin_unlock(&sk->sk_receive_queue.lock); 2851 2852 consume_skb(read_skb); 2853 kfree_skb_reason(unread_skb, SKB_DROP_REASON_UNIX_SKIP_OOB); 2854 2855 return skb; 2856 } 2857 #endif 2858 2859 static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor) 2860 { 2861 struct sk_buff_head *queue = &sk->sk_receive_queue; 2862 struct unix_sock *u = unix_sk(sk); 2863 struct sk_buff *skb; 2864 int err; 2865 2866 if (unlikely(READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)) 2867 return -ENOTCONN; 2868 2869 err = sock_error(sk); 2870 if (err) 2871 return err; 2872 2873 mutex_lock(&u->iolock); 2874 spin_lock(&queue->lock); 2875 2876 skb = __skb_dequeue(queue); 2877 if (!skb) { 2878 spin_unlock(&queue->lock); 2879 mutex_unlock(&u->iolock); 2880 return -EAGAIN; 2881 } 2882 2883 WRITE_ONCE(u->inq_len, u->inq_len - skb->len); 2884 2885 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 2886 if (skb == u->oob_skb) { 2887 WRITE_ONCE(u->oob_skb, NULL); 2888 spin_unlock(&queue->lock); 2889 mutex_unlock(&u->iolock); 2890 2891 kfree_skb_reason(skb, SKB_DROP_REASON_UNIX_SKIP_OOB); 2892 return -EAGAIN; 2893 } 2894 #endif 2895 2896 spin_unlock(&queue->lock); 2897 mutex_unlock(&u->iolock); 2898 2899 return recv_actor(sk, skb); 2900 } 2901 2902 static int unix_stream_read_generic(struct unix_stream_read_state *state, 2903 bool freezable) 2904 { 2905 int noblock = state->flags & MSG_DONTWAIT; 2906 struct socket *sock = state->socket; 2907 struct msghdr *msg = state->msg; 2908 struct sock *sk = sock->sk; 2909 size_t size = state->size; 2910 int flags = state->flags; 2911 bool check_creds = false; 2912 struct scm_cookie scm; 2913 unsigned int last_len; 2914 struct unix_sock *u; 2915 int copied = 0; 2916 int err = 0; 2917 long timeo; 2918 int target; 2919 int skip; 2920 2921 if (unlikely(READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)) { 2922 err = -EINVAL; 2923 goto out; 2924 } 2925 2926 if (unlikely(flags & MSG_OOB)) { 2927 err = -EOPNOTSUPP; 2928 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 2929 err = unix_stream_recv_urg(state); 2930 #endif 
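		/* With CONFIG_AF_UNIX_OOB enabled, the -EOPNOTSUPP default set
		 * above is unconditionally overwritten by the return value of
		 * unix_stream_recv_urg().
		 */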
2931 goto out; 2932 } 2933 2934 target = sock_rcvlowat(sk, flags & MSG_WAITALL, size); 2935 timeo = sock_rcvtimeo(sk, noblock); 2936 2937 memset(&scm, 0, sizeof(scm)); 2938 2939 u = unix_sk(sk); 2940 2941 redo: 2942 /* Lock the socket to prevent queue disordering 2943 * while sleeps in memcpy_tomsg 2944 */ 2945 mutex_lock(&u->iolock); 2946 2947 skip = max(sk_peek_offset(sk, flags), 0); 2948 2949 do { 2950 struct sk_buff *skb, *last; 2951 int chunk; 2952 2953 unix_state_lock(sk); 2954 if (sock_flag(sk, SOCK_DEAD)) { 2955 err = -ECONNRESET; 2956 goto unlock; 2957 } 2958 last = skb = skb_peek(&sk->sk_receive_queue); 2959 last_len = last ? last->len : 0; 2960 2961 again: 2962 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 2963 if (skb) { 2964 skb = manage_oob(skb, sk, flags, copied); 2965 if (!skb && copied) { 2966 unix_state_unlock(sk); 2967 break; 2968 } 2969 } 2970 #endif 2971 if (skb == NULL) { 2972 if (copied >= target) 2973 goto unlock; 2974 2975 /* 2976 * POSIX 1003.1g mandates this order. 2977 */ 2978 2979 err = sock_error(sk); 2980 if (err) 2981 goto unlock; 2982 if (sk->sk_shutdown & RCV_SHUTDOWN) 2983 goto unlock; 2984 2985 unix_state_unlock(sk); 2986 if (!timeo) { 2987 err = -EAGAIN; 2988 break; 2989 } 2990 2991 mutex_unlock(&u->iolock); 2992 2993 timeo = unix_stream_data_wait(sk, timeo, last, 2994 last_len, freezable); 2995 2996 if (signal_pending(current)) { 2997 err = sock_intr_errno(timeo); 2998 scm_destroy(&scm); 2999 goto out; 3000 } 3001 3002 goto redo; 3003 unlock: 3004 unix_state_unlock(sk); 3005 break; 3006 } 3007 3008 while (skip >= unix_skb_len(skb)) { 3009 skip -= unix_skb_len(skb); 3010 last = skb; 3011 last_len = skb->len; 3012 skb = skb_peek_next(skb, &sk->sk_receive_queue); 3013 if (!skb) 3014 goto again; 3015 } 3016 3017 unix_state_unlock(sk); 3018 3019 if (check_creds) { 3020 /* Never glue messages from different writers */ 3021 if (!unix_skb_scm_eq(skb, &scm)) 3022 break; 3023 } else if (unix_may_passcred(sk)) { 3024 /* Copy credentials */ 3025 unix_skb_to_scm(skb, &scm); 3026 check_creds = true; 3027 } 3028 3029 /* Copy address just once */ 3030 if (msg && msg->msg_name) { 3031 DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name); 3032 3033 unix_copy_addr(msg, skb->sk); 3034 BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk, msg->msg_name, 3035 &msg->msg_namelen); 3036 3037 sunaddr = NULL; 3038 } 3039 3040 chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size); 3041 chunk = state->recv_actor(skb, skip, chunk, state); 3042 if (chunk < 0) { 3043 if (copied == 0) 3044 copied = -EFAULT; 3045 break; 3046 } 3047 copied += chunk; 3048 size -= chunk; 3049 3050 /* Mark read part of skb as used */ 3051 if (!(flags & MSG_PEEK)) { 3052 UNIXCB(skb).consumed += chunk; 3053 3054 sk_peek_offset_bwd(sk, chunk); 3055 3056 if (UNIXCB(skb).fp) { 3057 scm_stat_del(sk, skb); 3058 unix_detach_fds(&scm, skb); 3059 } 3060 3061 if (unix_skb_len(skb)) 3062 break; 3063 3064 spin_lock(&sk->sk_receive_queue.lock); 3065 WRITE_ONCE(u->inq_len, u->inq_len - skb->len); 3066 __skb_unlink(skb, &sk->sk_receive_queue); 3067 spin_unlock(&sk->sk_receive_queue.lock); 3068 3069 consume_skb(skb); 3070 3071 if (scm.fp) 3072 break; 3073 } else { 3074 /* It is questionable, see note in unix_dgram_recvmsg. 
3075 */ 3076 if (UNIXCB(skb).fp) 3077 unix_peek_fds(&scm, skb); 3078 3079 sk_peek_offset_fwd(sk, chunk); 3080 3081 if (UNIXCB(skb).fp) 3082 break; 3083 3084 skip = 0; 3085 last = skb; 3086 last_len = skb->len; 3087 unix_state_lock(sk); 3088 skb = skb_peek_next(skb, &sk->sk_receive_queue); 3089 if (skb) 3090 goto again; 3091 unix_state_unlock(sk); 3092 break; 3093 } 3094 } while (size); 3095 3096 mutex_unlock(&u->iolock); 3097 if (msg) { 3098 scm_recv_unix(sock, msg, &scm, flags); 3099 3100 if (READ_ONCE(u->recvmsg_inq) || msg->msg_get_inq) { 3101 msg->msg_inq = READ_ONCE(u->inq_len); 3102 put_cmsg(msg, SOL_SOCKET, SCM_INQ, 3103 sizeof(msg->msg_inq), &msg->msg_inq); 3104 } 3105 } else { 3106 scm_destroy(&scm); 3107 } 3108 out: 3109 return copied ? : err; 3110 } 3111 3112 static int unix_stream_read_actor(struct sk_buff *skb, 3113 int skip, int chunk, 3114 struct unix_stream_read_state *state) 3115 { 3116 int ret; 3117 3118 ret = skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip, 3119 state->msg, chunk); 3120 return ret ?: chunk; 3121 } 3122 3123 int __unix_stream_recvmsg(struct sock *sk, struct msghdr *msg, 3124 size_t size, int flags) 3125 { 3126 struct unix_stream_read_state state = { 3127 .recv_actor = unix_stream_read_actor, 3128 .socket = sk->sk_socket, 3129 .msg = msg, 3130 .size = size, 3131 .flags = flags 3132 }; 3133 3134 return unix_stream_read_generic(&state, true); 3135 } 3136 3137 static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg, 3138 size_t size, int flags) 3139 { 3140 struct unix_stream_read_state state = { 3141 .recv_actor = unix_stream_read_actor, 3142 .socket = sock, 3143 .msg = msg, 3144 .size = size, 3145 .flags = flags 3146 }; 3147 3148 #ifdef CONFIG_BPF_SYSCALL 3149 struct sock *sk = sock->sk; 3150 const struct proto *prot = READ_ONCE(sk->sk_prot); 3151 3152 if (prot != &unix_stream_proto) 3153 return prot->recvmsg(sk, msg, size, flags, NULL); 3154 #endif 3155 return unix_stream_read_generic(&state, true); 3156 } 3157 3158 static int unix_stream_splice_actor(struct sk_buff *skb, 3159 int skip, int chunk, 3160 struct unix_stream_read_state *state) 3161 { 3162 return skb_splice_bits(skb, state->socket->sk, 3163 UNIXCB(skb).consumed + skip, 3164 state->pipe, chunk, state->splice_flags); 3165 } 3166 3167 static ssize_t unix_stream_splice_read(struct socket *sock, loff_t *ppos, 3168 struct pipe_inode_info *pipe, 3169 size_t size, unsigned int flags) 3170 { 3171 struct unix_stream_read_state state = { 3172 .recv_actor = unix_stream_splice_actor, 3173 .socket = sock, 3174 .pipe = pipe, 3175 .size = size, 3176 .splice_flags = flags, 3177 }; 3178 3179 if (unlikely(*ppos)) 3180 return -ESPIPE; 3181 3182 if (sock->file->f_flags & O_NONBLOCK || 3183 flags & SPLICE_F_NONBLOCK) 3184 state.flags = MSG_DONTWAIT; 3185 3186 return unix_stream_read_generic(&state, false); 3187 } 3188 3189 static int unix_shutdown(struct socket *sock, int mode) 3190 { 3191 struct sock *sk = sock->sk; 3192 struct sock *other; 3193 3194 if (mode < SHUT_RD || mode > SHUT_RDWR) 3195 return -EINVAL; 3196 /* This maps: 3197 * SHUT_RD (0) -> RCV_SHUTDOWN (1) 3198 * SHUT_WR (1) -> SEND_SHUTDOWN (2) 3199 * SHUT_RDWR (2) -> SHUTDOWN_MASK (3) 3200 */ 3201 ++mode; 3202 3203 unix_state_lock(sk); 3204 WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | mode); 3205 other = unix_peer(sk); 3206 if (other) 3207 sock_hold(other); 3208 unix_state_unlock(sk); 3209 sk->sk_state_change(sk); 3210 3211 if (other && 3212 (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) { 3213 3214 int peer_mode = 
0; 3215 const struct proto *prot = READ_ONCE(other->sk_prot); 3216 3217 if (prot->unhash) 3218 prot->unhash(other); 3219 if (mode&RCV_SHUTDOWN) 3220 peer_mode |= SEND_SHUTDOWN; 3221 if (mode&SEND_SHUTDOWN) 3222 peer_mode |= RCV_SHUTDOWN; 3223 unix_state_lock(other); 3224 WRITE_ONCE(other->sk_shutdown, other->sk_shutdown | peer_mode); 3225 unix_state_unlock(other); 3226 other->sk_state_change(other); 3227 if (peer_mode == SHUTDOWN_MASK) 3228 sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP); 3229 else if (peer_mode & RCV_SHUTDOWN) 3230 sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN); 3231 } 3232 if (other) 3233 sock_put(other); 3234 3235 return 0; 3236 } 3237 3238 long unix_inq_len(struct sock *sk) 3239 { 3240 struct sk_buff *skb; 3241 long amount = 0; 3242 3243 if (READ_ONCE(sk->sk_state) == TCP_LISTEN) 3244 return -EINVAL; 3245 3246 if (sk->sk_type == SOCK_STREAM) 3247 return READ_ONCE(unix_sk(sk)->inq_len); 3248 3249 spin_lock(&sk->sk_receive_queue.lock); 3250 if (sk->sk_type == SOCK_SEQPACKET) { 3251 skb_queue_walk(&sk->sk_receive_queue, skb) 3252 amount += unix_skb_len(skb); 3253 } else { 3254 skb = skb_peek(&sk->sk_receive_queue); 3255 if (skb) 3256 amount = skb->len; 3257 } 3258 spin_unlock(&sk->sk_receive_queue.lock); 3259 3260 return amount; 3261 } 3262 EXPORT_SYMBOL_GPL(unix_inq_len); 3263 3264 long unix_outq_len(struct sock *sk) 3265 { 3266 return sk_wmem_alloc_get(sk); 3267 } 3268 EXPORT_SYMBOL_GPL(unix_outq_len); 3269 3270 static int unix_open_file(struct sock *sk) 3271 { 3272 struct file *f; 3273 int fd; 3274 3275 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) 3276 return -EPERM; 3277 3278 if (!smp_load_acquire(&unix_sk(sk)->addr)) 3279 return -ENOENT; 3280 3281 if (!unix_sk(sk)->path.dentry) 3282 return -ENOENT; 3283 3284 fd = get_unused_fd_flags(O_CLOEXEC); 3285 if (fd < 0) 3286 return fd; 3287 3288 f = dentry_open(&unix_sk(sk)->path, O_PATH, current_cred()); 3289 if (IS_ERR(f)) { 3290 put_unused_fd(fd); 3291 return PTR_ERR(f); 3292 } 3293 3294 fd_install(fd, f); 3295 return fd; 3296 } 3297 3298 static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) 3299 { 3300 struct sock *sk = sock->sk; 3301 long amount = 0; 3302 int err; 3303 3304 switch (cmd) { 3305 case SIOCOUTQ: 3306 amount = unix_outq_len(sk); 3307 err = put_user(amount, (int __user *)arg); 3308 break; 3309 case SIOCINQ: 3310 amount = unix_inq_len(sk); 3311 if (amount < 0) 3312 err = amount; 3313 else 3314 err = put_user(amount, (int __user *)arg); 3315 break; 3316 case SIOCUNIXFILE: 3317 err = unix_open_file(sk); 3318 break; 3319 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 3320 case SIOCATMARK: 3321 { 3322 struct unix_sock *u = unix_sk(sk); 3323 struct sk_buff *skb; 3324 int answ = 0; 3325 3326 mutex_lock(&u->iolock); 3327 3328 skb = skb_peek(&sk->sk_receive_queue); 3329 if (skb) { 3330 struct sk_buff *oob_skb = READ_ONCE(u->oob_skb); 3331 struct sk_buff *next_skb; 3332 3333 next_skb = skb_peek_next(skb, &sk->sk_receive_queue); 3334 3335 if (skb == oob_skb || 3336 (!unix_skb_len(skb) && 3337 (!oob_skb || next_skb == oob_skb))) 3338 answ = 1; 3339 } 3340 3341 mutex_unlock(&u->iolock); 3342 3343 err = put_user(answ, (int __user *)arg); 3344 } 3345 break; 3346 #endif 3347 default: 3348 err = -ENOIOCTLCMD; 3349 break; 3350 } 3351 return err; 3352 } 3353 3354 #ifdef CONFIG_COMPAT 3355 static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) 3356 { 3357 return unix_ioctl(sock, cmd, (unsigned long)compat_ptr(arg)); 3358 } 3359 #endif 3360 3361 static __poll_t 
unix_poll(struct file *file, struct socket *sock, poll_table *wait) 3362 { 3363 struct sock *sk = sock->sk; 3364 unsigned char state; 3365 __poll_t mask; 3366 u8 shutdown; 3367 3368 sock_poll_wait(file, sock, wait); 3369 mask = 0; 3370 shutdown = READ_ONCE(sk->sk_shutdown); 3371 state = READ_ONCE(sk->sk_state); 3372 3373 /* exceptional events? */ 3374 if (READ_ONCE(sk->sk_err)) 3375 mask |= EPOLLERR; 3376 if (shutdown == SHUTDOWN_MASK) 3377 mask |= EPOLLHUP; 3378 if (shutdown & RCV_SHUTDOWN) 3379 mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM; 3380 3381 /* readable? */ 3382 if (!skb_queue_empty_lockless(&sk->sk_receive_queue)) 3383 mask |= EPOLLIN | EPOLLRDNORM; 3384 if (sk_is_readable(sk)) 3385 mask |= EPOLLIN | EPOLLRDNORM; 3386 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 3387 if (READ_ONCE(unix_sk(sk)->oob_skb)) 3388 mask |= EPOLLPRI; 3389 #endif 3390 3391 /* Connection-based need to check for termination and startup */ 3392 if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) && 3393 state == TCP_CLOSE) 3394 mask |= EPOLLHUP; 3395 3396 /* 3397 * we set writable also when the other side has shut down the 3398 * connection. This prevents stuck sockets. 3399 */ 3400 if (unix_writable(sk, state)) 3401 mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND; 3402 3403 return mask; 3404 } 3405 3406 static __poll_t unix_dgram_poll(struct file *file, struct socket *sock, 3407 poll_table *wait) 3408 { 3409 struct sock *sk = sock->sk, *other; 3410 unsigned int writable; 3411 unsigned char state; 3412 __poll_t mask; 3413 u8 shutdown; 3414 3415 sock_poll_wait(file, sock, wait); 3416 mask = 0; 3417 shutdown = READ_ONCE(sk->sk_shutdown); 3418 state = READ_ONCE(sk->sk_state); 3419 3420 /* exceptional events? */ 3421 if (READ_ONCE(sk->sk_err) || 3422 !skb_queue_empty_lockless(&sk->sk_error_queue)) 3423 mask |= EPOLLERR | 3424 (sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0); 3425 3426 if (shutdown & RCV_SHUTDOWN) 3427 mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM; 3428 if (shutdown == SHUTDOWN_MASK) 3429 mask |= EPOLLHUP; 3430 3431 /* readable? */ 3432 if (!skb_queue_empty_lockless(&sk->sk_receive_queue)) 3433 mask |= EPOLLIN | EPOLLRDNORM; 3434 if (sk_is_readable(sk)) 3435 mask |= EPOLLIN | EPOLLRDNORM; 3436 3437 /* Connection-based need to check for termination and startup */ 3438 if (sk->sk_type == SOCK_SEQPACKET && state == TCP_CLOSE) 3439 mask |= EPOLLHUP; 3440 3441 /* No write status requested, avoid expensive OUT tests. 
*/ 3442 if (!(poll_requested_events(wait) & (EPOLLWRBAND|EPOLLWRNORM|EPOLLOUT))) 3443 return mask; 3444 3445 writable = unix_writable(sk, state); 3446 if (writable) { 3447 unix_state_lock(sk); 3448 3449 other = unix_peer(sk); 3450 if (other && unix_peer(other) != sk && 3451 unix_recvq_full_lockless(other) && 3452 unix_dgram_peer_wake_me(sk, other)) 3453 writable = 0; 3454 3455 unix_state_unlock(sk); 3456 } 3457 3458 if (writable) 3459 mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND; 3460 else 3461 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); 3462 3463 return mask; 3464 } 3465 3466 #ifdef CONFIG_PROC_FS 3467 3468 #define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1) 3469 3470 #define get_bucket(x) ((x) >> BUCKET_SPACE) 3471 #define get_offset(x) ((x) & ((1UL << BUCKET_SPACE) - 1)) 3472 #define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o)) 3473 3474 static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos) 3475 { 3476 unsigned long offset = get_offset(*pos); 3477 unsigned long bucket = get_bucket(*pos); 3478 unsigned long count = 0; 3479 struct sock *sk; 3480 3481 for (sk = sk_head(&seq_file_net(seq)->unx.table.buckets[bucket]); 3482 sk; sk = sk_next(sk)) { 3483 if (++count == offset) 3484 break; 3485 } 3486 3487 return sk; 3488 } 3489 3490 static struct sock *unix_get_first(struct seq_file *seq, loff_t *pos) 3491 { 3492 unsigned long bucket = get_bucket(*pos); 3493 struct net *net = seq_file_net(seq); 3494 struct sock *sk; 3495 3496 while (bucket < UNIX_HASH_SIZE) { 3497 spin_lock(&net->unx.table.locks[bucket]); 3498 3499 sk = unix_from_bucket(seq, pos); 3500 if (sk) 3501 return sk; 3502 3503 spin_unlock(&net->unx.table.locks[bucket]); 3504 3505 *pos = set_bucket_offset(++bucket, 1); 3506 } 3507 3508 return NULL; 3509 } 3510 3511 static struct sock *unix_get_next(struct seq_file *seq, struct sock *sk, 3512 loff_t *pos) 3513 { 3514 unsigned long bucket = get_bucket(*pos); 3515 3516 sk = sk_next(sk); 3517 if (sk) 3518 return sk; 3519 3520 3521 spin_unlock(&seq_file_net(seq)->unx.table.locks[bucket]); 3522 3523 *pos = set_bucket_offset(++bucket, 1); 3524 3525 return unix_get_first(seq, pos); 3526 } 3527 3528 static void *unix_seq_start(struct seq_file *seq, loff_t *pos) 3529 { 3530 if (!*pos) 3531 return SEQ_START_TOKEN; 3532 3533 return unix_get_first(seq, pos); 3534 } 3535 3536 static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos) 3537 { 3538 ++*pos; 3539 3540 if (v == SEQ_START_TOKEN) 3541 return unix_get_first(seq, pos); 3542 3543 return unix_get_next(seq, v, pos); 3544 } 3545 3546 static void unix_seq_stop(struct seq_file *seq, void *v) 3547 { 3548 struct sock *sk = v; 3549 3550 if (sk) 3551 spin_unlock(&seq_file_net(seq)->unx.table.locks[sk->sk_hash]); 3552 } 3553 3554 static int unix_seq_show(struct seq_file *seq, void *v) 3555 { 3556 3557 if (v == SEQ_START_TOKEN) 3558 seq_puts(seq, "Num RefCount Protocol Flags Type St " 3559 "Inode Path\n"); 3560 else { 3561 struct sock *s = v; 3562 struct unix_sock *u = unix_sk(s); 3563 unix_state_lock(s); 3564 3565 seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu", 3566 s, 3567 refcount_read(&s->sk_refcnt), 3568 0, 3569 s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0, 3570 s->sk_type, 3571 s->sk_socket ? 3572 (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) : 3573 (s->sk_state == TCP_ESTABLISHED ? 
SS_CONNECTING : SS_DISCONNECTING), 3574 sock_i_ino(s)); 3575 3576 if (u->addr) { // under a hash table lock here 3577 int i, len; 3578 seq_putc(seq, ' '); 3579 3580 i = 0; 3581 len = u->addr->len - 3582 offsetof(struct sockaddr_un, sun_path); 3583 if (u->addr->name->sun_path[0]) { 3584 len--; 3585 } else { 3586 seq_putc(seq, '@'); 3587 i++; 3588 } 3589 for ( ; i < len; i++) 3590 seq_putc(seq, u->addr->name->sun_path[i] ?: 3591 '@'); 3592 } 3593 unix_state_unlock(s); 3594 seq_putc(seq, '\n'); 3595 } 3596 3597 return 0; 3598 } 3599 3600 static const struct seq_operations unix_seq_ops = { 3601 .start = unix_seq_start, 3602 .next = unix_seq_next, 3603 .stop = unix_seq_stop, 3604 .show = unix_seq_show, 3605 }; 3606 3607 #ifdef CONFIG_BPF_SYSCALL 3608 struct bpf_unix_iter_state { 3609 struct seq_net_private p; 3610 unsigned int cur_sk; 3611 unsigned int end_sk; 3612 unsigned int max_sk; 3613 struct sock **batch; 3614 bool st_bucket_done; 3615 }; 3616 3617 struct bpf_iter__unix { 3618 __bpf_md_ptr(struct bpf_iter_meta *, meta); 3619 __bpf_md_ptr(struct unix_sock *, unix_sk); 3620 uid_t uid __aligned(8); 3621 }; 3622 3623 static int unix_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta, 3624 struct unix_sock *unix_sk, uid_t uid) 3625 { 3626 struct bpf_iter__unix ctx; 3627 3628 meta->seq_num--; /* skip SEQ_START_TOKEN */ 3629 ctx.meta = meta; 3630 ctx.unix_sk = unix_sk; 3631 ctx.uid = uid; 3632 return bpf_iter_run_prog(prog, &ctx); 3633 } 3634 3635 static int bpf_iter_unix_hold_batch(struct seq_file *seq, struct sock *start_sk) 3636 3637 { 3638 struct bpf_unix_iter_state *iter = seq->private; 3639 unsigned int expected = 1; 3640 struct sock *sk; 3641 3642 sock_hold(start_sk); 3643 iter->batch[iter->end_sk++] = start_sk; 3644 3645 for (sk = sk_next(start_sk); sk; sk = sk_next(sk)) { 3646 if (iter->end_sk < iter->max_sk) { 3647 sock_hold(sk); 3648 iter->batch[iter->end_sk++] = sk; 3649 } 3650 3651 expected++; 3652 } 3653 3654 spin_unlock(&seq_file_net(seq)->unx.table.locks[start_sk->sk_hash]); 3655 3656 return expected; 3657 } 3658 3659 static void bpf_iter_unix_put_batch(struct bpf_unix_iter_state *iter) 3660 { 3661 while (iter->cur_sk < iter->end_sk) 3662 sock_put(iter->batch[iter->cur_sk++]); 3663 } 3664 3665 static int bpf_iter_unix_realloc_batch(struct bpf_unix_iter_state *iter, 3666 unsigned int new_batch_sz) 3667 { 3668 struct sock **new_batch; 3669 3670 new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz, 3671 GFP_USER | __GFP_NOWARN); 3672 if (!new_batch) 3673 return -ENOMEM; 3674 3675 bpf_iter_unix_put_batch(iter); 3676 kvfree(iter->batch); 3677 iter->batch = new_batch; 3678 iter->max_sk = new_batch_sz; 3679 3680 return 0; 3681 } 3682 3683 static struct sock *bpf_iter_unix_batch(struct seq_file *seq, 3684 loff_t *pos) 3685 { 3686 struct bpf_unix_iter_state *iter = seq->private; 3687 unsigned int expected; 3688 bool resized = false; 3689 struct sock *sk; 3690 3691 if (iter->st_bucket_done) 3692 *pos = set_bucket_offset(get_bucket(*pos) + 1, 1); 3693 3694 again: 3695 /* Get a new batch */ 3696 iter->cur_sk = 0; 3697 iter->end_sk = 0; 3698 3699 sk = unix_get_first(seq, pos); 3700 if (!sk) 3701 return NULL; /* Done */ 3702 3703 expected = bpf_iter_unix_hold_batch(seq, sk); 3704 3705 if (iter->end_sk == expected) { 3706 iter->st_bucket_done = true; 3707 return sk; 3708 } 3709 3710 if (!resized && !bpf_iter_unix_realloc_batch(iter, expected * 3 / 2)) { 3711 resized = true; 3712 goto again; 3713 } 3714 3715 return sk; 3716 } 3717 3718 static void *bpf_iter_unix_seq_start(struct 
seq_file *seq, loff_t *pos) 3719 { 3720 if (!*pos) 3721 return SEQ_START_TOKEN; 3722 3723 /* bpf iter does not support lseek, so it always 3724 * continue from where it was stop()-ped. 3725 */ 3726 return bpf_iter_unix_batch(seq, pos); 3727 } 3728 3729 static void *bpf_iter_unix_seq_next(struct seq_file *seq, void *v, loff_t *pos) 3730 { 3731 struct bpf_unix_iter_state *iter = seq->private; 3732 struct sock *sk; 3733 3734 /* Whenever seq_next() is called, the iter->cur_sk is 3735 * done with seq_show(), so advance to the next sk in 3736 * the batch. 3737 */ 3738 if (iter->cur_sk < iter->end_sk) 3739 sock_put(iter->batch[iter->cur_sk++]); 3740 3741 ++*pos; 3742 3743 if (iter->cur_sk < iter->end_sk) 3744 sk = iter->batch[iter->cur_sk]; 3745 else 3746 sk = bpf_iter_unix_batch(seq, pos); 3747 3748 return sk; 3749 } 3750 3751 static int bpf_iter_unix_seq_show(struct seq_file *seq, void *v) 3752 { 3753 struct bpf_iter_meta meta; 3754 struct bpf_prog *prog; 3755 struct sock *sk = v; 3756 uid_t uid; 3757 bool slow; 3758 int ret; 3759 3760 if (v == SEQ_START_TOKEN) 3761 return 0; 3762 3763 slow = lock_sock_fast(sk); 3764 3765 if (unlikely(sk_unhashed(sk))) { 3766 ret = SEQ_SKIP; 3767 goto unlock; 3768 } 3769 3770 uid = from_kuid_munged(seq_user_ns(seq), sk_uid(sk)); 3771 meta.seq = seq; 3772 prog = bpf_iter_get_info(&meta, false); 3773 ret = unix_prog_seq_show(prog, &meta, v, uid); 3774 unlock: 3775 unlock_sock_fast(sk, slow); 3776 return ret; 3777 } 3778 3779 static void bpf_iter_unix_seq_stop(struct seq_file *seq, void *v) 3780 { 3781 struct bpf_unix_iter_state *iter = seq->private; 3782 struct bpf_iter_meta meta; 3783 struct bpf_prog *prog; 3784 3785 if (!v) { 3786 meta.seq = seq; 3787 prog = bpf_iter_get_info(&meta, true); 3788 if (prog) 3789 (void)unix_prog_seq_show(prog, &meta, v, 0); 3790 } 3791 3792 if (iter->cur_sk < iter->end_sk) 3793 bpf_iter_unix_put_batch(iter); 3794 } 3795 3796 static const struct seq_operations bpf_iter_unix_seq_ops = { 3797 .start = bpf_iter_unix_seq_start, 3798 .next = bpf_iter_unix_seq_next, 3799 .stop = bpf_iter_unix_seq_stop, 3800 .show = bpf_iter_unix_seq_show, 3801 }; 3802 #endif 3803 #endif 3804 3805 static const struct net_proto_family unix_family_ops = { 3806 .family = PF_UNIX, 3807 .create = unix_create, 3808 .owner = THIS_MODULE, 3809 }; 3810 3811 3812 static int __net_init unix_net_init(struct net *net) 3813 { 3814 int i; 3815 3816 net->unx.sysctl_max_dgram_qlen = 10; 3817 if (unix_sysctl_register(net)) 3818 goto out; 3819 3820 #ifdef CONFIG_PROC_FS 3821 if (!proc_create_net("unix", 0, net->proc_net, &unix_seq_ops, 3822 sizeof(struct seq_net_private))) 3823 goto err_sysctl; 3824 #endif 3825 3826 net->unx.table.locks = kvmalloc_array(UNIX_HASH_SIZE, 3827 sizeof(spinlock_t), GFP_KERNEL); 3828 if (!net->unx.table.locks) 3829 goto err_proc; 3830 3831 net->unx.table.buckets = kvmalloc_array(UNIX_HASH_SIZE, 3832 sizeof(struct hlist_head), 3833 GFP_KERNEL); 3834 if (!net->unx.table.buckets) 3835 goto free_locks; 3836 3837 for (i = 0; i < UNIX_HASH_SIZE; i++) { 3838 spin_lock_init(&net->unx.table.locks[i]); 3839 lock_set_cmp_fn(&net->unx.table.locks[i], unix_table_lock_cmp_fn, NULL); 3840 INIT_HLIST_HEAD(&net->unx.table.buckets[i]); 3841 } 3842 3843 return 0; 3844 3845 free_locks: 3846 kvfree(net->unx.table.locks); 3847 err_proc: 3848 #ifdef CONFIG_PROC_FS 3849 remove_proc_entry("unix", net->proc_net); 3850 err_sysctl: 3851 #endif 3852 unix_sysctl_unregister(net); 3853 out: 3854 return -ENOMEM; 3855 } 3856 3857 static void __net_exit unix_net_exit(struct net 
*net) 3858 { 3859 kvfree(net->unx.table.buckets); 3860 kvfree(net->unx.table.locks); 3861 unix_sysctl_unregister(net); 3862 remove_proc_entry("unix", net->proc_net); 3863 } 3864 3865 static struct pernet_operations unix_net_ops = { 3866 .init = unix_net_init, 3867 .exit = unix_net_exit, 3868 }; 3869 3870 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) 3871 DEFINE_BPF_ITER_FUNC(unix, struct bpf_iter_meta *meta, 3872 struct unix_sock *unix_sk, uid_t uid) 3873 3874 #define INIT_BATCH_SZ 16 3875 3876 static int bpf_iter_init_unix(void *priv_data, struct bpf_iter_aux_info *aux) 3877 { 3878 struct bpf_unix_iter_state *iter = priv_data; 3879 int err; 3880 3881 err = bpf_iter_init_seq_net(priv_data, aux); 3882 if (err) 3883 return err; 3884 3885 err = bpf_iter_unix_realloc_batch(iter, INIT_BATCH_SZ); 3886 if (err) { 3887 bpf_iter_fini_seq_net(priv_data); 3888 return err; 3889 } 3890 3891 return 0; 3892 } 3893 3894 static void bpf_iter_fini_unix(void *priv_data) 3895 { 3896 struct bpf_unix_iter_state *iter = priv_data; 3897 3898 bpf_iter_fini_seq_net(priv_data); 3899 kvfree(iter->batch); 3900 } 3901 3902 static const struct bpf_iter_seq_info unix_seq_info = { 3903 .seq_ops = &bpf_iter_unix_seq_ops, 3904 .init_seq_private = bpf_iter_init_unix, 3905 .fini_seq_private = bpf_iter_fini_unix, 3906 .seq_priv_size = sizeof(struct bpf_unix_iter_state), 3907 }; 3908 3909 static const struct bpf_func_proto * 3910 bpf_iter_unix_get_func_proto(enum bpf_func_id func_id, 3911 const struct bpf_prog *prog) 3912 { 3913 switch (func_id) { 3914 case BPF_FUNC_setsockopt: 3915 return &bpf_sk_setsockopt_proto; 3916 case BPF_FUNC_getsockopt: 3917 return &bpf_sk_getsockopt_proto; 3918 default: 3919 return NULL; 3920 } 3921 } 3922 3923 static struct bpf_iter_reg unix_reg_info = { 3924 .target = "unix", 3925 .ctx_arg_info_size = 1, 3926 .ctx_arg_info = { 3927 { offsetof(struct bpf_iter__unix, unix_sk), 3928 PTR_TO_BTF_ID_OR_NULL }, 3929 }, 3930 .get_func_proto = bpf_iter_unix_get_func_proto, 3931 .seq_info = &unix_seq_info, 3932 }; 3933 3934 static void __init bpf_iter_register(void) 3935 { 3936 unix_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_UNIX]; 3937 if (bpf_iter_reg_target(&unix_reg_info)) 3938 pr_warn("Warning: could not register bpf iterator unix\n"); 3939 } 3940 #endif 3941 3942 static int __init af_unix_init(void) 3943 { 3944 int i, rc = -1; 3945 3946 BUILD_BUG_ON(sizeof(struct unix_skb_parms) > sizeof_field(struct sk_buff, cb)); 3947 3948 for (i = 0; i < UNIX_HASH_SIZE / 2; i++) { 3949 spin_lock_init(&bsd_socket_locks[i]); 3950 INIT_HLIST_HEAD(&bsd_socket_buckets[i]); 3951 } 3952 3953 rc = proto_register(&unix_dgram_proto, 1); 3954 if (rc != 0) { 3955 pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__); 3956 goto out; 3957 } 3958 3959 rc = proto_register(&unix_stream_proto, 1); 3960 if (rc != 0) { 3961 pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__); 3962 proto_unregister(&unix_dgram_proto); 3963 goto out; 3964 } 3965 3966 sock_register(&unix_family_ops); 3967 register_pernet_subsys(&unix_net_ops); 3968 unix_bpf_build_proto(); 3969 3970 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) 3971 bpf_iter_register(); 3972 #endif 3973 3974 out: 3975 return rc; 3976 } 3977 3978 /* Later than subsys_initcall() because we depend on stuff initialised there */ 3979 fs_initcall(af_unix_init); 3980
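
/* Illustrative userspace sketch (not kernel code; all names are
 * example-only): the SCM_RIGHTS handling above (unix_attach_fds(),
 * unix_detach_fds(), scm_stat_add()) is what a sender exercises with
 * roughly the following, assuming sock_fd is a connected AF_UNIX socket
 * and fd_to_pass is the descriptor being transferred:
 *
 *	char cbuf[CMSG_SPACE(sizeof(int))] = { 0 };
 *	struct iovec iov = { .iov_base = "x", .iov_len = 1 };
 *	struct msghdr msg = {
 *		.msg_iov = &iov, .msg_iovlen = 1,
 *		.msg_control = cbuf, .msg_controllen = sizeof(cbuf),
 *	};
 *	struct cmsghdr *cm = CMSG_FIRSTHDR(&msg);
 *
 *	cm->cmsg_level = SOL_SOCKET;
 *	cm->cmsg_type = SCM_RIGHTS;
 *	cm->cmsg_len = CMSG_LEN(sizeof(int));
 *	memcpy(CMSG_DATA(cm), &fd_to_pass, sizeof(int));
 *	sendmsg(sock_fd, &msg, 0);
 */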