// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * NET4:	Implementation of BSD Unix domain sockets.
 *
 * Authors:	Alan Cox, <alan@lxorguk.ukuu.org.uk>
 *
 * Fixes:
 *		Linus Torvalds	:	Assorted bug cures.
 *		Niibe Yutaka	:	async I/O support.
 *		Carsten Paeth	:	PF_UNIX check, address fixes.
 *		Alan Cox	:	Limit size of allocated blocks.
 *		Alan Cox	:	Fixed the stupid socketpair bug.
 *		Alan Cox	:	BSD compatibility fine tuning.
 *		Alan Cox	:	Fixed a bug in connect when interrupted.
 *		Alan Cox	:	Sorted out a proper draft version of
 *					file descriptor passing hacked up from
 *					Mike Shaver's work.
 *		Marty Leisner	:	Fixes to fd passing
 *		Nick Nevin	:	recvmsg bugfix.
 *		Alan Cox	:	Started proper garbage collector
 *		Heiko EiBfeldt	:	Missing verify_area check
 *		Alan Cox	:	Started POSIXisms
 *		Andreas Schwab	:	Replace inode by dentry for proper
 *					reference counting
 *		Kirk Petersen	:	Made this a module
 *	    Christoph Rohland	:	Elegant non-blocking accept/connect algorithm.
 *					Lots of bug fixes.
 *	     Alexey Kuznetsov	:	Repaired (I hope) bugs introduced
 *					by the above two patches.
 *	     Andrea Arcangeli	:	If possible we block in connect(2)
 *					if the max backlog of the listen socket
 *					has been reached. This won't break
 *					old apps and it will avoid huge amounts
 *					of socks hashed (this for unix_gc()
 *					performance reasons).
 *					Security fix that limits the max
 *					number of socks to 2*max_files and
 *					the number of skbs queueable in the
 *					dgram receiver.
 *		Artur Skawina	:	Hash function optimizations
 *	     Alexey Kuznetsov	:	Full scale SMP. Lots of bugs are introduced 8)
 *	      Malcolm Beattie	:	Set peercred for socketpair
 *	     Michal Ostrowski	:	Module initialization cleanup.
 *	     Arnaldo C. Melo	:	Remove MOD_{INC,DEC}_USE_COUNT,
 *					the core infrastructure is doing that
 *					for all net proto families now (2.5.69+)
 *
 * Known differences from reference BSD that was tested:
 *
 *	[TO FIX]
 *	ECONNREFUSED is not returned from one end of a connected() socket to the
 *		other the moment one end closes.
 *	fstat() doesn't return st_dev=0, and gives the blksize as high water mark
 *		and a fake inode identifier (nor the BSD first socket fstat twice bug).
 *	[NOT TO FIX]
 *	accept() returns a path name even if the connecting socket has closed
 *		in the meantime (BSD loses the path and gives up).
 *	accept() returns 0 length path for an unbound connector. BSD returns 16
 *		and a null first byte in the path (but not for gethost/peername - BSD bug??)
 *	socketpair(...SOCK_RAW..) doesn't panic the kernel.
 *	BSD af_unix apparently has connect forgetting to block properly.
 *		(need to check this with the POSIX spec in detail)
 *
 * Differences from 2.0.0-11-... (ANK)
 *	Bug fixes and improvements.
 *		- client shutdown killed server socket.
 *		- removed all useless cli/sti pairs.
 *
 *	Semantic changes/extensions.
 *		- generic control message passing.
 *		- SCM_CREDENTIALS control message.
 *		- "Abstract" (not FS-based) socket bindings.
 *		  Abstract names are sequences of bytes (not zero terminated)
 *		  that start with 0, so that this name space does not
 *		  intersect with BSD names.
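 *
 *		  For example, a minimal userspace sketch (illustrative
 *		  only, not part of this file; fd is an existing AF_UNIX
 *		  socket) binding the abstract name "\0foo":
 *
 *			struct sockaddr_un sun = { .sun_family = AF_UNIX };
 *
 *			memcpy(sun.sun_path + 1, "foo", 3);
 *			bind(fd, (struct sockaddr *)&sun,
 *			     offsetof(struct sockaddr_un, sun_path) + 1 + 3);
 *
 *		  (sun_path[0] remains 0 from the initializer, which is
 *		  what marks the name as abstract.)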
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/bpf-cgroup.h>
#include <linux/btf_ids.h>
#include <linux/dcache.h>
#include <linux/errno.h>
#include <linux/fcntl.h>
#include <linux/file.h>
#include <linux/filter.h>
#include <linux/fs.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/mount.h>
#include <linux/namei.h>
#include <linux/poll.h>
#include <linux/proc_fs.h>
#include <linux/sched/signal.h>
#include <linux/security.h>
#include <linux/seq_file.h>
#include <linux/skbuff.h>
#include <linux/slab.h>
#include <linux/socket.h>
#include <linux/splice.h>
#include <linux/string.h>
#include <linux/uaccess.h>
#include <net/af_unix.h>
#include <net/net_namespace.h>
#include <net/scm.h>
#include <net/tcp_states.h>
#include <uapi/linux/sockios.h>
#include <uapi/linux/termios.h>

#include "af_unix.h"

static atomic_long_t unix_nr_socks;
static struct hlist_head bsd_socket_buckets[UNIX_HASH_SIZE / 2];
static spinlock_t bsd_socket_locks[UNIX_HASH_SIZE / 2];

/* SMP locking strategy:
 *    hash table is protected with spinlock.
 *    each socket state is protected by separate spinlock.
 */
#ifdef CONFIG_PROVE_LOCKING
#define cmp_ptr(l, r)	(((l) > (r)) - ((l) < (r)))

static int unix_table_lock_cmp_fn(const struct lockdep_map *a,
				  const struct lockdep_map *b)
{
	return cmp_ptr(a, b);
}

static int unix_state_lock_cmp_fn(const struct lockdep_map *_a,
				  const struct lockdep_map *_b)
{
	const struct unix_sock *a, *b;

	a = container_of(_a, struct unix_sock, lock.dep_map);
	b = container_of(_b, struct unix_sock, lock.dep_map);

	if (a->sk.sk_state == TCP_LISTEN) {
		/* unix_stream_connect(): Before the 2nd unix_state_lock(),
		 *
		 *   1. a is TCP_LISTEN.
		 *   2. b is not a.
		 *   3. concurrent connect(b -> a) must fail.
		 *
		 * Except for 2. & 3., the b's state can be any possible
		 * value due to concurrent connect() or listen().
		 *
		 * 2. is detected in debug_spin_lock_before(), and 3. cannot
		 * be expressed as lock_cmp_fn.
		 */
		switch (b->sk.sk_state) {
		case TCP_CLOSE:
		case TCP_ESTABLISHED:
		case TCP_LISTEN:
			return -1;
		default:
			/* Invalid case. */
			return 0;
		}
	}

	/* Should never happen.  Just to be symmetric. */
	if (b->sk.sk_state == TCP_LISTEN) {
		switch (b->sk.sk_state) {
		case TCP_CLOSE:
		case TCP_ESTABLISHED:
			return 1;
		default:
			return 0;
		}
	}

	/* unix_state_double_lock(): ascending address order. */
	return cmp_ptr(a, b);
}

static int unix_recvq_lock_cmp_fn(const struct lockdep_map *_a,
				  const struct lockdep_map *_b)
{
	const struct sock *a, *b;

	a = container_of(_a, struct sock, sk_receive_queue.lock.dep_map);
	b = container_of(_b, struct sock, sk_receive_queue.lock.dep_map);

	/* unix_collect_skb(): listener -> embryo order. */
	if (a->sk_state == TCP_LISTEN && unix_sk(b)->listener == a)
		return -1;

	/* Should never happen.  Just to be symmetric.
	 */
	if (b->sk_state == TCP_LISTEN && unix_sk(a)->listener == b)
		return 1;

	return 0;
}
#endif

static unsigned int unix_unbound_hash(struct sock *sk)
{
	unsigned long hash = (unsigned long)sk;

	hash ^= hash >> 16;
	hash ^= hash >> 8;
	hash ^= sk->sk_type;

	return hash & UNIX_HASH_MOD;
}

static unsigned int unix_bsd_hash(struct inode *i)
{
	return i->i_ino & UNIX_HASH_MOD;
}

static unsigned int unix_abstract_hash(struct sockaddr_un *sunaddr,
				       int addr_len, int type)
{
	__wsum csum = csum_partial(sunaddr, addr_len, 0);
	unsigned int hash;

	hash = (__force unsigned int)csum_fold(csum);
	hash ^= hash >> 8;
	hash ^= type;

	return UNIX_HASH_MOD + 1 + (hash & UNIX_HASH_MOD);
}

static void unix_table_double_lock(struct net *net,
				   unsigned int hash1, unsigned int hash2)
{
	if (hash1 == hash2) {
		spin_lock(&net->unx.table.locks[hash1]);
		return;
	}

	if (hash1 > hash2)
		swap(hash1, hash2);

	spin_lock(&net->unx.table.locks[hash1]);
	spin_lock(&net->unx.table.locks[hash2]);
}

static void unix_table_double_unlock(struct net *net,
				     unsigned int hash1, unsigned int hash2)
{
	if (hash1 == hash2) {
		spin_unlock(&net->unx.table.locks[hash1]);
		return;
	}

	spin_unlock(&net->unx.table.locks[hash1]);
	spin_unlock(&net->unx.table.locks[hash2]);
}

#ifdef CONFIG_SECURITY_NETWORK
static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{
	UNIXCB(skb).secid = scm->secid;
}

static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{
	scm->secid = UNIXCB(skb).secid;
}

static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
{
	return (scm->secid == UNIXCB(skb).secid);
}
#else
static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{ }

static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{ }

static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
{
	return true;
}
#endif /* CONFIG_SECURITY_NETWORK */

static inline int unix_may_send(struct sock *sk, struct sock *osk)
{
	return !unix_peer(osk) || unix_peer(osk) == sk;
}

static inline int unix_recvq_full_lockless(const struct sock *sk)
{
	return skb_queue_len_lockless(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
}

struct sock *unix_peer_get(struct sock *s)
{
	struct sock *peer;

	unix_state_lock(s);
	peer = unix_peer(s);
	if (peer)
		sock_hold(peer);
	unix_state_unlock(s);
	return peer;
}
EXPORT_SYMBOL_GPL(unix_peer_get);

static struct unix_address *unix_create_addr(struct sockaddr_un *sunaddr,
					     int addr_len)
{
	struct unix_address *addr;

	addr = kmalloc(sizeof(*addr) + addr_len, GFP_KERNEL);
	if (!addr)
		return NULL;

	refcount_set(&addr->refcnt, 1);
	addr->len = addr_len;
	memcpy(addr->name, sunaddr, addr_len);

	return addr;
}

static inline void unix_release_addr(struct unix_address *addr)
{
	if (refcount_dec_and_test(&addr->refcnt))
		kfree(addr);
}

/*
 *	Check unix socket name:
 *		- should not be zero length.
 *		- if it doesn't start with a zero byte, it should be
 *		  NUL terminated (FS object)
 *		- if it starts with a zero byte, it is an abstract name.
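 *
 *	For example (illustrative only): { AF_UNIX, "/tmp/sock" } with a
 *	trailing NUL names a filesystem object, while { AF_UNIX, "\0sock" }
 *	is an abstract name of length
 *	offsetof(struct sockaddr_un, sun_path) + 5.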
 */

static int unix_validate_addr(struct sockaddr_un *sunaddr, int addr_len)
{
	if (addr_len <= offsetof(struct sockaddr_un, sun_path) ||
	    addr_len > sizeof(*sunaddr))
		return -EINVAL;

	if (sunaddr->sun_family != AF_UNIX)
		return -EINVAL;

	return 0;
}

static int unix_mkname_bsd(struct sockaddr_un *sunaddr, int addr_len)
{
	struct sockaddr_storage *addr = (struct sockaddr_storage *)sunaddr;
	short offset = offsetof(struct sockaddr_storage, __data);

	BUILD_BUG_ON(offset != offsetof(struct sockaddr_un, sun_path));

	/* This may look like an off by one error but it is a bit more
	 * subtle.  108 is the longest valid AF_UNIX path for a binding.
	 * sun_path[108] doesn't as such exist.  However in kernel space
	 * we are guaranteed that it is a valid memory location in our
	 * kernel address buffer because syscall functions always pass
	 * a pointer of struct sockaddr_storage which has a bigger buffer
	 * than 108.  Also, we must terminate sun_path for strlen() in
	 * getname_kernel().
	 */
	addr->__data[addr_len - offset] = 0;

	/* Don't pass sunaddr->sun_path to strlen().  Otherwise, 108 will
	 * cause panic if CONFIG_FORTIFY_SOURCE=y.  Let __fortify_strlen()
	 * know the actual buffer.
	 */
	return strlen(addr->__data) + offset + 1;
}

static void __unix_remove_socket(struct sock *sk)
{
	sk_del_node_init(sk);
}

static void __unix_insert_socket(struct net *net, struct sock *sk)
{
	DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
	sk_add_node(sk, &net->unx.table.buckets[sk->sk_hash]);
}

static void __unix_set_addr_hash(struct net *net, struct sock *sk,
				 struct unix_address *addr, unsigned int hash)
{
	__unix_remove_socket(sk);
	smp_store_release(&unix_sk(sk)->addr, addr);

	sk->sk_hash = hash;
	__unix_insert_socket(net, sk);
}

static void unix_remove_socket(struct net *net, struct sock *sk)
{
	spin_lock(&net->unx.table.locks[sk->sk_hash]);
	__unix_remove_socket(sk);
	spin_unlock(&net->unx.table.locks[sk->sk_hash]);
}

static void unix_insert_unbound_socket(struct net *net, struct sock *sk)
{
	spin_lock(&net->unx.table.locks[sk->sk_hash]);
	__unix_insert_socket(net, sk);
	spin_unlock(&net->unx.table.locks[sk->sk_hash]);
}

static void unix_insert_bsd_socket(struct sock *sk)
{
	spin_lock(&bsd_socket_locks[sk->sk_hash]);
	sk_add_bind_node(sk, &bsd_socket_buckets[sk->sk_hash]);
	spin_unlock(&bsd_socket_locks[sk->sk_hash]);
}

static void unix_remove_bsd_socket(struct sock *sk)
{
	if (!hlist_unhashed(&sk->sk_bind_node)) {
		spin_lock(&bsd_socket_locks[sk->sk_hash]);
		__sk_del_bind_node(sk);
		spin_unlock(&bsd_socket_locks[sk->sk_hash]);

		sk_node_init(&sk->sk_bind_node);
	}
}

static struct sock *__unix_find_socket_byname(struct net *net,
					      struct sockaddr_un *sunname,
					      int len, unsigned int hash)
{
	struct sock *s;

	sk_for_each(s, &net->unx.table.buckets[hash]) {
		struct unix_sock *u = unix_sk(s);

		if (u->addr->len == len &&
		    !memcmp(u->addr->name, sunname, len))
			return s;
	}
	return NULL;
}

static inline struct sock *unix_find_socket_byname(struct net *net,
						   struct sockaddr_un *sunname,
						   int len, unsigned int hash)
{
	struct sock *s;

	spin_lock(&net->unx.table.locks[hash]);
	s = __unix_find_socket_byname(net, sunname, len, hash);
	if (s)
		sock_hold(s);
	spin_unlock(&net->unx.table.locks[hash]);
	return s;
}

static struct sock *unix_find_socket_byinode(struct inode *i)
{
	unsigned int hash = unix_bsd_hash(i);
	struct sock *s;

	spin_lock(&bsd_socket_locks[hash]);
	sk_for_each_bound(s, &bsd_socket_buckets[hash]) {
		struct dentry *dentry = unix_sk(s)->path.dentry;

		if (dentry && d_backing_inode(dentry) == i) {
			sock_hold(s);
			spin_unlock(&bsd_socket_locks[hash]);
			return s;
		}
	}
	spin_unlock(&bsd_socket_locks[hash]);
	return NULL;
}

/* Support code for asymmetrically connected dgram sockets
 *
 * If a datagram socket is connected to a socket not itself connected
 * to the first socket (eg, /dev/log), clients may only enqueue more
 * messages if the present receive queue of the server socket is not
 * "too large". This means there's a second writeability condition
 * poll and sendmsg need to test. The dgram recv code will do a wake
 * up on the peer_wait wait queue of a socket upon reception of a
 * datagram which needs to be propagated to sleeping would-be writers
 * since these might not have sent anything so far. This can't be
 * accomplished via poll_wait because the lifetime of the server
 * socket might be less than that of its clients if these break their
 * association with it or if the server socket is closed while clients
 * are still connected to it and there's no way to inform "a polling
 * implementation" that it should let go of a certain wait queue
 *
 * In order to propagate a wake up, a wait_queue_entry_t of the client
 * socket is enqueued on the peer_wait queue of the server socket
 * whose wake function does a wake_up on the ordinary client socket
 * wait queue. This connection is established whenever a write (or
 * poll for write) hits the flow control condition and broken when the
 * association to the server socket is dissolved or after a wake up
 * was relayed.
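 *
 * As an illustrative walk-through of the mechanism described above:
 * client A is connected to server B, but B is not connected back to A.
 * When A hits B's flow control limit, A's peer_wake entry is parked on
 * B's peer_wait queue. Once B's receive path drains a datagram and
 * wakes peer_wait, unix_dgram_peer_wake_relay() below runs as the wake
 * function and wakes A's own wait queue, so A's poller sees EPOLLOUT.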
 */

static int unix_dgram_peer_wake_relay(wait_queue_entry_t *q, unsigned mode, int flags,
				      void *key)
{
	struct unix_sock *u;
	wait_queue_head_t *u_sleep;

	u = container_of(q, struct unix_sock, peer_wake);

	__remove_wait_queue(&unix_sk(u->peer_wake.private)->peer_wait,
			    q);
	u->peer_wake.private = NULL;

	/* relaying can only happen while the wq still exists */
	u_sleep = sk_sleep(&u->sk);
	if (u_sleep)
		wake_up_interruptible_poll(u_sleep, key_to_poll(key));

	return 0;
}

static int unix_dgram_peer_wake_connect(struct sock *sk, struct sock *other)
{
	struct unix_sock *u, *u_other;
	int rc;

	u = unix_sk(sk);
	u_other = unix_sk(other);
	rc = 0;
	spin_lock(&u_other->peer_wait.lock);

	if (!u->peer_wake.private) {
		u->peer_wake.private = other;
		__add_wait_queue(&u_other->peer_wait, &u->peer_wake);

		rc = 1;
	}

	spin_unlock(&u_other->peer_wait.lock);
	return rc;
}

static void unix_dgram_peer_wake_disconnect(struct sock *sk,
					    struct sock *other)
{
	struct unix_sock *u, *u_other;

	u = unix_sk(sk);
	u_other = unix_sk(other);
	spin_lock(&u_other->peer_wait.lock);

	if (u->peer_wake.private == other) {
		__remove_wait_queue(&u_other->peer_wait, &u->peer_wake);
		u->peer_wake.private = NULL;
	}

	spin_unlock(&u_other->peer_wait.lock);
}

static void unix_dgram_peer_wake_disconnect_wakeup(struct sock *sk,
						   struct sock *other)
{
	unix_dgram_peer_wake_disconnect(sk, other);
	wake_up_interruptible_poll(sk_sleep(sk),
				   EPOLLOUT |
				   EPOLLWRNORM |
				   EPOLLWRBAND);
}

/* preconditions:
 *	- unix_peer(sk) == other
 *	- association is stable
 */
static int unix_dgram_peer_wake_me(struct sock *sk, struct sock *other)
{
	int connected;

	connected = unix_dgram_peer_wake_connect(sk, other);

	/* If other is SOCK_DEAD, we want to make sure we signal
	 * POLLOUT, such that a subsequent write() can get a
	 * -ECONNREFUSED. Otherwise, if we haven't queued any skbs
	 * to other and it's full, we will hang waiting for POLLOUT.
	 */
	if (unix_recvq_full_lockless(other) && !sock_flag(other, SOCK_DEAD))
		return 1;

	if (connected)
		unix_dgram_peer_wake_disconnect(sk, other);

	return 0;
}

static int unix_writable(const struct sock *sk, unsigned char state)
{
	return state != TCP_LISTEN &&
		(refcount_read(&sk->sk_wmem_alloc) << 2) <= READ_ONCE(sk->sk_sndbuf);
}

static void unix_write_space(struct sock *sk)
{
	struct socket_wq *wq;

	rcu_read_lock();
	if (unix_writable(sk, READ_ONCE(sk->sk_state))) {
		wq = rcu_dereference(sk->sk_wq);
		if (skwq_has_sleeper(wq))
			wake_up_interruptible_sync_poll(&wq->wait,
				EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND);
		sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT);
	}
	rcu_read_unlock();
}

/* When a dgram socket disconnects (or changes its peer), we clear its receive
 * queue of packets that arrived from the previous peer. First, this allows us
 * to do flow control based only on wmem_alloc; second, an sk connected to a
 * peer may receive messages only from that peer.
 */
static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
{
	if (!skb_queue_empty(&sk->sk_receive_queue)) {
		skb_queue_purge_reason(&sk->sk_receive_queue,
				       SKB_DROP_REASON_UNIX_DISCONNECT);

		wake_up_interruptible_all(&unix_sk(sk)->peer_wait);

		/* If one link of a bidirectional dgram pipe is disconnected,
		 * we signal an error. Messages are lost. Do not do this when
		 * the peer was not connected to us.
		 */
		if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
			WRITE_ONCE(other->sk_err, ECONNRESET);
			sk_error_report(other);
		}
	}
}

static void unix_sock_destructor(struct sock *sk)
{
	struct unix_sock *u = unix_sk(sk);

	skb_queue_purge_reason(&sk->sk_receive_queue, SKB_DROP_REASON_SOCKET_CLOSE);

	DEBUG_NET_WARN_ON_ONCE(refcount_read(&sk->sk_wmem_alloc));
	DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
	DEBUG_NET_WARN_ON_ONCE(sk->sk_socket);
	if (!sock_flag(sk, SOCK_DEAD)) {
		pr_info("Attempt to release alive unix socket: %p\n", sk);
		return;
	}

	if (u->addr)
		unix_release_addr(u->addr);

	atomic_long_dec(&unix_nr_socks);
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
#ifdef UNIX_REFCNT_DEBUG
	pr_debug("UNIX %p is destroyed, %ld are still alive.\n", sk,
		 atomic_long_read(&unix_nr_socks));
#endif
}

static void unix_release_sock(struct sock *sk, int embrion)
{
	struct unix_sock *u = unix_sk(sk);
	struct sock *skpair;
	struct sk_buff *skb;
	struct path path;
	int state;

	unix_remove_socket(sock_net(sk), sk);
	unix_remove_bsd_socket(sk);

	/* Clear state */
	unix_state_lock(sk);
	sock_orphan(sk);
	WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK);
	path = u->path;
	u->path.dentry = NULL;
	u->path.mnt = NULL;
	state = sk->sk_state;
	WRITE_ONCE(sk->sk_state, TCP_CLOSE);

	skpair = unix_peer(sk);
	unix_peer(sk) = NULL;

	unix_state_unlock(sk);

#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
	u->oob_skb = NULL;
#endif

	wake_up_interruptible_all(&u->peer_wait);

	if (skpair != NULL) {
		if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
			unix_state_lock(skpair);
			/* No more writes */
			WRITE_ONCE(skpair->sk_shutdown, SHUTDOWN_MASK);
			if (!skb_queue_empty_lockless(&sk->sk_receive_queue) || embrion)
				WRITE_ONCE(skpair->sk_err, ECONNRESET);
			unix_state_unlock(skpair);
			skpair->sk_state_change(skpair);
			sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
		}

		unix_dgram_peer_wake_disconnect(sk, skpair);
		sock_put(skpair); /* It may now die */
	}

	/* Try to flush out this socket. Throw out buffers at least */

	while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
		if (state == TCP_LISTEN)
			unix_release_sock(skb->sk, 1);

		/* passed fds are erased in the kfree_skb hook */
		kfree_skb_reason(skb, SKB_DROP_REASON_SOCKET_CLOSE);
	}

	if (path.dentry)
		path_put(&path);

	sock_put(sk);

	/* ---- Socket is dead now and most probably destroyed ---- */

	/*
	 * Fixme: BSD difference: In BSD all sockets connected to us get
	 *	  ECONNRESET and we die on the spot. In Linux we behave
	 *	  like files and pipes do and wait for the last
	 *	  dereference.
	 *
	 * Can't we simply set sock->err?
	 *
	 * What does the above comment talk about?
	 *	--ANK(980817)
	 */

	if (READ_ONCE(unix_tot_inflight))
		unix_gc();		/* Garbage collect fds */
}

static void init_peercred(struct sock *sk)
{
	sk->sk_peer_pid = get_pid(task_tgid(current));
	sk->sk_peer_cred = get_current_cred();
}

static void update_peercred(struct sock *sk)
{
	const struct cred *old_cred;
	struct pid *old_pid;

	spin_lock(&sk->sk_peer_lock);
	old_pid = sk->sk_peer_pid;
	old_cred = sk->sk_peer_cred;
	init_peercred(sk);
	spin_unlock(&sk->sk_peer_lock);

	put_pid(old_pid);
	put_cred(old_cred);
}

static void copy_peercred(struct sock *sk, struct sock *peersk)
{
	lockdep_assert_held(&unix_sk(peersk)->lock);

	spin_lock(&sk->sk_peer_lock);
	sk->sk_peer_pid = get_pid(peersk->sk_peer_pid);
	sk->sk_peer_cred = get_cred(peersk->sk_peer_cred);
	spin_unlock(&sk->sk_peer_lock);
}

static bool unix_may_passcred(const struct sock *sk)
{
	return sk->sk_scm_credentials || sk->sk_scm_pidfd;
}

static int unix_listen(struct socket *sock, int backlog)
{
	int err;
	struct sock *sk = sock->sk;
	struct unix_sock *u = unix_sk(sk);

	err = -EOPNOTSUPP;
	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
		goto out;	/* Only stream/seqpacket sockets accept */
	err = -EINVAL;
	if (!READ_ONCE(u->addr))
		goto out;	/* No listens on an unbound socket */
	unix_state_lock(sk);
	if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
		goto out_unlock;
	if (backlog > sk->sk_max_ack_backlog)
		wake_up_interruptible_all(&u->peer_wait);
	sk->sk_max_ack_backlog = backlog;
	WRITE_ONCE(sk->sk_state, TCP_LISTEN);

	/* set credentials so connect can copy them */
	update_peercred(sk);
	err = 0;

out_unlock:
	unix_state_unlock(sk);
out:
	return err;
}

static int unix_release(struct socket *);
static int unix_bind(struct socket *, struct sockaddr *, int);
static int unix_stream_connect(struct socket *, struct sockaddr *,
			       int addr_len, int flags);
static int unix_socketpair(struct socket *, struct socket *);
static int unix_accept(struct socket *, struct socket *, struct proto_accept_arg *arg);
static int unix_getname(struct socket *, struct sockaddr *, int);
static __poll_t unix_poll(struct file *, struct socket *, poll_table *);
static __poll_t unix_dgram_poll(struct file *, struct socket *,
				poll_table *);
static int unix_ioctl(struct socket *, unsigned int, unsigned long);
#ifdef CONFIG_COMPAT
static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
#endif
static int unix_shutdown(struct socket *, int);
static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t);
static int unix_stream_recvmsg(struct socket *, struct msghdr *, size_t, int);
static ssize_t unix_stream_splice_read(struct socket *, loff_t *ppos,
				       struct pipe_inode_info *, size_t size,
				       unsigned int flags);
static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t);
static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int);
static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
static int unix_dgram_connect(struct socket *, struct sockaddr *,
			      int, int);
static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t);
static int unix_seqpacket_recvmsg(struct socket *, struct msghdr *, size_t,
				  int);

#ifdef CONFIG_PROC_FS
static int unix_count_nr_fds(struct sock *sk)
{
	struct sk_buff *skb;
	struct unix_sock *u;
	int nr_fds = 0;

	spin_lock(&sk->sk_receive_queue.lock);
	skb = skb_peek(&sk->sk_receive_queue);
	while (skb) {
		u = unix_sk(skb->sk);
		nr_fds += atomic_read(&u->scm_stat.nr_fds);
		skb = skb_peek_next(skb, &sk->sk_receive_queue);
	}
	spin_unlock(&sk->sk_receive_queue.lock);

	return nr_fds;
}

static void unix_show_fdinfo(struct seq_file *m, struct socket *sock)
{
	struct sock *sk = sock->sk;
	unsigned char s_state;
	struct unix_sock *u;
	int nr_fds = 0;

	if (sk) {
		s_state = READ_ONCE(sk->sk_state);
		u = unix_sk(sk);

		/* SOCK_STREAM and SOCK_SEQPACKET sockets never change their
		 * sk_state after switching to TCP_ESTABLISHED or TCP_LISTEN.
		 * SOCK_DGRAM is ordinary. So, no lock is needed.
		 */
		if (sock->type == SOCK_DGRAM || s_state == TCP_ESTABLISHED)
			nr_fds = atomic_read(&u->scm_stat.nr_fds);
		else if (s_state == TCP_LISTEN)
			nr_fds = unix_count_nr_fds(sk);

		seq_printf(m, "scm_fds: %u\n", nr_fds);
	}
}
#else
#define unix_show_fdinfo NULL
#endif

static const struct proto_ops unix_stream_ops = {
	.family =	PF_UNIX,
	.owner =	THIS_MODULE,
	.release =	unix_release,
	.bind =		unix_bind,
	.connect =	unix_stream_connect,
	.socketpair =	unix_socketpair,
	.accept =	unix_accept,
	.getname =	unix_getname,
	.poll =		unix_poll,
	.ioctl =	unix_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl =	unix_compat_ioctl,
#endif
	.listen =	unix_listen,
	.shutdown =	unix_shutdown,
	.sendmsg =	unix_stream_sendmsg,
	.recvmsg =	unix_stream_recvmsg,
	.read_skb =	unix_stream_read_skb,
	.mmap =		sock_no_mmap,
	.splice_read =	unix_stream_splice_read,
	.set_peek_off =	sk_set_peek_off,
	.show_fdinfo =	unix_show_fdinfo,
};

static const struct proto_ops unix_dgram_ops = {
	.family =	PF_UNIX,
	.owner =	THIS_MODULE,
	.release =	unix_release,
	.bind =		unix_bind,
	.connect =	unix_dgram_connect,
	.socketpair =	unix_socketpair,
	.accept =	sock_no_accept,
	.getname =	unix_getname,
	.poll =		unix_dgram_poll,
	.ioctl =	unix_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl =	unix_compat_ioctl,
#endif
	.listen =	sock_no_listen,
	.shutdown =	unix_shutdown,
	.sendmsg =	unix_dgram_sendmsg,
	.read_skb =	unix_read_skb,
	.recvmsg =	unix_dgram_recvmsg,
	.mmap =		sock_no_mmap,
	.set_peek_off =	sk_set_peek_off,
	.show_fdinfo =	unix_show_fdinfo,
};

static const struct proto_ops unix_seqpacket_ops = {
	.family =	PF_UNIX,
	.owner =	THIS_MODULE,
	.release =	unix_release,
	.bind =		unix_bind,
	.connect =	unix_stream_connect,
	.socketpair =	unix_socketpair,
	.accept =	unix_accept,
	.getname =	unix_getname,
	.poll =		unix_dgram_poll,
	.ioctl =	unix_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl =	unix_compat_ioctl,
#endif
	.listen =	unix_listen,
	.shutdown =	unix_shutdown,
	.sendmsg =	unix_seqpacket_sendmsg,
	.recvmsg =	unix_seqpacket_recvmsg,
	.mmap =		sock_no_mmap,
	.set_peek_off =	sk_set_peek_off,
	.show_fdinfo =	unix_show_fdinfo,
};

static void unix_close(struct sock *sk, long timeout)
{
	/* Nothing to do here, unix socket does not need a ->close().
	 * This is merely for sockmap.
	 */
}

static bool unix_bpf_bypass_getsockopt(int level, int optname)
{
	if (level == SOL_SOCKET) {
		switch (optname) {
		case SO_PEERPIDFD:
			return true;
		default:
			return false;
		}
	}

	return false;
}

struct proto unix_dgram_proto = {
	.name			= "UNIX",
	.owner			= THIS_MODULE,
	.obj_size		= sizeof(struct unix_sock),
	.close			= unix_close,
	.bpf_bypass_getsockopt	= unix_bpf_bypass_getsockopt,
#ifdef CONFIG_BPF_SYSCALL
	.psock_update_sk_prot	= unix_dgram_bpf_update_proto,
#endif
};

struct proto unix_stream_proto = {
	.name			= "UNIX-STREAM",
	.owner			= THIS_MODULE,
	.obj_size		= sizeof(struct unix_sock),
	.close			= unix_close,
	.bpf_bypass_getsockopt	= unix_bpf_bypass_getsockopt,
#ifdef CONFIG_BPF_SYSCALL
	.psock_update_sk_prot	= unix_stream_bpf_update_proto,
#endif
};

static struct sock *unix_create1(struct net *net, struct socket *sock, int kern, int type)
{
	struct unix_sock *u;
	struct sock *sk;
	int err;

	atomic_long_inc(&unix_nr_socks);
	if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files()) {
		err = -ENFILE;
		goto err;
	}

	if (type == SOCK_STREAM)
		sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_stream_proto, kern);
	else /* dgram and seqpacket */
		sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_dgram_proto, kern);

	if (!sk) {
		err = -ENOMEM;
		goto err;
	}

	sock_init_data(sock, sk);

	sk->sk_scm_rights	= 1;
	sk->sk_hash		= unix_unbound_hash(sk);
	sk->sk_allocation	= GFP_KERNEL_ACCOUNT;
	sk->sk_write_space	= unix_write_space;
	sk->sk_max_ack_backlog	= READ_ONCE(net->unx.sysctl_max_dgram_qlen);
	sk->sk_destruct		= unix_sock_destructor;
	lock_set_cmp_fn(&sk->sk_receive_queue.lock, unix_recvq_lock_cmp_fn, NULL);

	u = unix_sk(sk);
	u->listener = NULL;
	u->vertex = NULL;
	u->path.dentry = NULL;
	u->path.mnt = NULL;
	spin_lock_init(&u->lock);
	lock_set_cmp_fn(&u->lock, unix_state_lock_cmp_fn, NULL);
	mutex_init(&u->iolock); /* single task reading lock */
	mutex_init(&u->bindlock); /* single task binding lock */
	init_waitqueue_head(&u->peer_wait);
	init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay);
	memset(&u->scm_stat, 0, sizeof(struct scm_stat));
	unix_insert_unbound_socket(net, sk);

	sock_prot_inuse_add(net, sk->sk_prot, 1);

	return sk;

err:
	atomic_long_dec(&unix_nr_socks);
	return ERR_PTR(err);
}

static int unix_create(struct net *net, struct socket *sock, int protocol,
		       int kern)
{
	struct sock *sk;

	if (protocol && protocol != PF_UNIX)
		return -EPROTONOSUPPORT;

	sock->state = SS_UNCONNECTED;

	switch (sock->type) {
	case SOCK_STREAM:
		sock->ops = &unix_stream_ops;
		break;
	/*
	 *	Believe it or not BSD has AF_UNIX, SOCK_RAW though
	 *	nothing uses it.
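	 *
	 *	For example (illustrative userspace, not part of this
	 *	file), both calls below are accepted here and silently
	 *	behave as SOCK_DGRAM:
	 *
	 *		socket(AF_UNIX, SOCK_RAW, 0);
	 *		socketpair(AF_UNIX, SOCK_RAW, 0, sv);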
	 */
	case SOCK_RAW:
		sock->type = SOCK_DGRAM;
		fallthrough;
	case SOCK_DGRAM:
		sock->ops = &unix_dgram_ops;
		break;
	case SOCK_SEQPACKET:
		sock->ops = &unix_seqpacket_ops;
		break;
	default:
		return -ESOCKTNOSUPPORT;
	}

	sk = unix_create1(net, sock, kern, sock->type);
	if (IS_ERR(sk))
		return PTR_ERR(sk);

	return 0;
}

static int unix_release(struct socket *sock)
{
	struct sock *sk = sock->sk;

	if (!sk)
		return 0;

	sk->sk_prot->close(sk, 0);
	unix_release_sock(sk, 0);
	sock->sk = NULL;

	return 0;
}

static struct sock *unix_find_bsd(struct sockaddr_un *sunaddr, int addr_len,
				  int type)
{
	struct inode *inode;
	struct path path;
	struct sock *sk;
	int err;

	unix_mkname_bsd(sunaddr, addr_len);
	err = kern_path(sunaddr->sun_path, LOOKUP_FOLLOW, &path);
	if (err)
		goto fail;

	err = path_permission(&path, MAY_WRITE);
	if (err)
		goto path_put;

	err = -ECONNREFUSED;
	inode = d_backing_inode(path.dentry);
	if (!S_ISSOCK(inode->i_mode))
		goto path_put;

	sk = unix_find_socket_byinode(inode);
	if (!sk)
		goto path_put;

	err = -EPROTOTYPE;
	if (sk->sk_type == type)
		touch_atime(&path);
	else
		goto sock_put;

	path_put(&path);

	return sk;

sock_put:
	sock_put(sk);
path_put:
	path_put(&path);
fail:
	return ERR_PTR(err);
}

static struct sock *unix_find_abstract(struct net *net,
				       struct sockaddr_un *sunaddr,
				       int addr_len, int type)
{
	unsigned int hash = unix_abstract_hash(sunaddr, addr_len, type);
	struct dentry *dentry;
	struct sock *sk;

	sk = unix_find_socket_byname(net, sunaddr, addr_len, hash);
	if (!sk)
		return ERR_PTR(-ECONNREFUSED);

	dentry = unix_sk(sk)->path.dentry;
	if (dentry)
		touch_atime(&unix_sk(sk)->path);

	return sk;
}

static struct sock *unix_find_other(struct net *net,
				    struct sockaddr_un *sunaddr,
				    int addr_len, int type)
{
	struct sock *sk;

	if (sunaddr->sun_path[0])
		sk = unix_find_bsd(sunaddr, addr_len, type);
	else
		sk = unix_find_abstract(net, sunaddr, addr_len, type);

	return sk;
}

static int unix_autobind(struct sock *sk)
{
	struct unix_sock *u = unix_sk(sk);
	unsigned int new_hash, old_hash;
	struct net *net = sock_net(sk);
	struct unix_address *addr;
	u32 lastnum, ordernum;
	int err;

	err = mutex_lock_interruptible(&u->bindlock);
	if (err)
		return err;

	if (u->addr)
		goto out;

	err = -ENOMEM;
	addr = kzalloc(sizeof(*addr) +
		       offsetof(struct sockaddr_un, sun_path) + 16, GFP_KERNEL);
	if (!addr)
		goto out;

	addr->len = offsetof(struct sockaddr_un, sun_path) + 6;
	addr->name->sun_family = AF_UNIX;
	refcount_set(&addr->refcnt, 1);

	old_hash = sk->sk_hash;
	ordernum = get_random_u32();
	lastnum = ordernum & 0xFFFFF;
retry:
	ordernum = (ordernum + 1) & 0xFFFFF;
	sprintf(addr->name->sun_path + 1, "%05x", ordernum);

	new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
	unix_table_double_lock(net, old_hash, new_hash);

	if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash)) {
		unix_table_double_unlock(net, old_hash, new_hash);

		/* __unix_find_socket_byname() may take a long time if many
		 * names are already in use.
		 */
		cond_resched();

		if (ordernum == lastnum) {
			/* Give up if all names seem to be in use. */
			err = -ENOSPC;
			unix_release_addr(addr);
			goto out;
		}

		goto retry;
	}

	__unix_set_addr_hash(net, sk, addr, new_hash);
	unix_table_double_unlock(net, old_hash, new_hash);
	err = 0;

out:	mutex_unlock(&u->bindlock);
	return err;
}

static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr,
			 int addr_len)
{
	umode_t mode = S_IFSOCK |
	       (SOCK_INODE(sk->sk_socket)->i_mode & ~current_umask());
	struct unix_sock *u = unix_sk(sk);
	unsigned int new_hash, old_hash;
	struct net *net = sock_net(sk);
	struct mnt_idmap *idmap;
	struct unix_address *addr;
	struct dentry *dentry;
	struct path parent;
	int err;

	addr_len = unix_mkname_bsd(sunaddr, addr_len);
	addr = unix_create_addr(sunaddr, addr_len);
	if (!addr)
		return -ENOMEM;

	/*
	 * Get the parent directory, calculate the hash for last
	 * component.
	 */
	dentry = kern_path_create(AT_FDCWD, addr->name->sun_path, &parent, 0);
	if (IS_ERR(dentry)) {
		err = PTR_ERR(dentry);
		goto out;
	}

	/*
	 * All right, let's create it.
	 */
	idmap = mnt_idmap(parent.mnt);
	err = security_path_mknod(&parent, dentry, mode, 0);
	if (!err)
		err = vfs_mknod(idmap, d_inode(parent.dentry), dentry, mode, 0);
	if (err)
		goto out_path;
	err = mutex_lock_interruptible(&u->bindlock);
	if (err)
		goto out_unlink;
	if (u->addr)
		goto out_unlock;

	old_hash = sk->sk_hash;
	new_hash = unix_bsd_hash(d_backing_inode(dentry));
	unix_table_double_lock(net, old_hash, new_hash);
	u->path.mnt = mntget(parent.mnt);
	u->path.dentry = dget(dentry);
	__unix_set_addr_hash(net, sk, addr, new_hash);
	unix_table_double_unlock(net, old_hash, new_hash);
	unix_insert_bsd_socket(sk);
	mutex_unlock(&u->bindlock);
	done_path_create(&parent, dentry);
	return 0;

out_unlock:
	mutex_unlock(&u->bindlock);
	err = -EINVAL;
out_unlink:
	/* failed after successful mknod?  unlink what we'd created... */
	vfs_unlink(idmap, d_inode(parent.dentry), dentry, NULL);
out_path:
	done_path_create(&parent, dentry);
out:
	unix_release_addr(addr);
	return err == -EEXIST ?
	       -EADDRINUSE : err;
}

static int unix_bind_abstract(struct sock *sk, struct sockaddr_un *sunaddr,
			      int addr_len)
{
	struct unix_sock *u = unix_sk(sk);
	unsigned int new_hash, old_hash;
	struct net *net = sock_net(sk);
	struct unix_address *addr;
	int err;

	addr = unix_create_addr(sunaddr, addr_len);
	if (!addr)
		return -ENOMEM;

	err = mutex_lock_interruptible(&u->bindlock);
	if (err)
		goto out;

	if (u->addr) {
		err = -EINVAL;
		goto out_mutex;
	}

	old_hash = sk->sk_hash;
	new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
	unix_table_double_lock(net, old_hash, new_hash);

	if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash))
		goto out_spin;

	__unix_set_addr_hash(net, sk, addr, new_hash);
	unix_table_double_unlock(net, old_hash, new_hash);
	mutex_unlock(&u->bindlock);
	return 0;

out_spin:
	unix_table_double_unlock(net, old_hash, new_hash);
	err = -EADDRINUSE;
out_mutex:
	mutex_unlock(&u->bindlock);
out:
	unix_release_addr(addr);
	return err;
}

static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
{
	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
	struct sock *sk = sock->sk;
	int err;

	if (addr_len == offsetof(struct sockaddr_un, sun_path) &&
	    sunaddr->sun_family == AF_UNIX)
		return unix_autobind(sk);

	err = unix_validate_addr(sunaddr, addr_len);
	if (err)
		return err;

	if (sunaddr->sun_path[0])
		err = unix_bind_bsd(sk, sunaddr, addr_len);
	else
		err = unix_bind_abstract(sk, sunaddr, addr_len);

	return err;
}

static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)
{
	if (unlikely(sk1 == sk2) || !sk2) {
		unix_state_lock(sk1);
		return;
	}

	if (sk1 > sk2)
		swap(sk1, sk2);

	unix_state_lock(sk1);
	unix_state_lock(sk2);
}

static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2)
{
	if (unlikely(sk1 == sk2) || !sk2) {
		unix_state_unlock(sk1);
		return;
	}
	unix_state_unlock(sk1);
	unix_state_unlock(sk2);
}

static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
			      int alen, int flags)
{
	struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr;
	struct sock *sk = sock->sk;
	struct sock *other;
	int err;

	err = -EINVAL;
	if (alen < offsetofend(struct sockaddr, sa_family))
		goto out;

	if (addr->sa_family != AF_UNSPEC) {
		err = unix_validate_addr(sunaddr, alen);
		if (err)
			goto out;

		err = BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk, addr, &alen);
		if (err)
			goto out;

		if (unix_may_passcred(sk) && !READ_ONCE(unix_sk(sk)->addr)) {
			err = unix_autobind(sk);
			if (err)
				goto out;
		}

restart:
		other = unix_find_other(sock_net(sk), sunaddr, alen, sock->type);
		if (IS_ERR(other)) {
			err = PTR_ERR(other);
			goto out;
		}

		unix_state_double_lock(sk, other);

		/* Apparently VFS overslept socket death. Retry.
		 */
		if (sock_flag(other, SOCK_DEAD)) {
			unix_state_double_unlock(sk, other);
			sock_put(other);
			goto restart;
		}

		err = -EPERM;
		if (!unix_may_send(sk, other))
			goto out_unlock;

		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
		if (err)
			goto out_unlock;

		WRITE_ONCE(sk->sk_state, TCP_ESTABLISHED);
		WRITE_ONCE(other->sk_state, TCP_ESTABLISHED);
	} else {
		/*
		 *	1003.1g breaking connected state with AF_UNSPEC
		 */
		other = NULL;
		unix_state_double_lock(sk, other);
	}

	/*
	 * If it was connected, reconnect.
	 */
	if (unix_peer(sk)) {
		struct sock *old_peer = unix_peer(sk);

		unix_peer(sk) = other;
		if (!other)
			WRITE_ONCE(sk->sk_state, TCP_CLOSE);
		unix_dgram_peer_wake_disconnect_wakeup(sk, old_peer);

		unix_state_double_unlock(sk, other);

		if (other != old_peer) {
			unix_dgram_disconnected(sk, old_peer);

			unix_state_lock(old_peer);
			if (!unix_peer(old_peer))
				WRITE_ONCE(old_peer->sk_state, TCP_CLOSE);
			unix_state_unlock(old_peer);
		}

		sock_put(old_peer);
	} else {
		unix_peer(sk) = other;
		unix_state_double_unlock(sk, other);
	}

	return 0;

out_unlock:
	unix_state_double_unlock(sk, other);
	sock_put(other);
out:
	return err;
}

static long unix_wait_for_peer(struct sock *other, long timeo)
{
	struct unix_sock *u = unix_sk(other);
	int sched;
	DEFINE_WAIT(wait);

	prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);

	sched = !sock_flag(other, SOCK_DEAD) &&
		!(other->sk_shutdown & RCV_SHUTDOWN) &&
		unix_recvq_full_lockless(other);

	unix_state_unlock(other);

	if (sched)
		timeo = schedule_timeout(timeo);

	finish_wait(&u->peer_wait, &wait);
	return timeo;
}

static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
			       int addr_len, int flags)
{
	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
	struct sock *sk = sock->sk, *newsk = NULL, *other = NULL;
	struct unix_sock *u = unix_sk(sk), *newu, *otheru;
	struct net *net = sock_net(sk);
	struct sk_buff *skb = NULL;
	unsigned char state;
	long timeo;
	int err;

	err = unix_validate_addr(sunaddr, addr_len);
	if (err)
		goto out;

	err = BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk, uaddr, &addr_len);
	if (err)
		goto out;

	if (unix_may_passcred(sk) && !READ_ONCE(u->addr)) {
		err = unix_autobind(sk);
		if (err)
			goto out;
	}

	timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);

	/* First of all allocate resources.
	 * If we do it after the state is locked,
	 * we will have to recheck everything again in any case.
	 */

	/* create new sock for complete connection */
	newsk = unix_create1(net, NULL, 0, sock->type);
	if (IS_ERR(newsk)) {
		err = PTR_ERR(newsk);
		goto out;
	}

	/* Allocate skb for sending to listening sock */
	skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
	if (!skb) {
		err = -ENOMEM;
		goto out_free_sk;
	}

restart:
	/* Find listening sock.
	 */
	other = unix_find_other(net, sunaddr, addr_len, sk->sk_type);
	if (IS_ERR(other)) {
		err = PTR_ERR(other);
		goto out_free_skb;
	}

	unix_state_lock(other);

	/* Apparently VFS overslept socket death. Retry. */
	if (sock_flag(other, SOCK_DEAD)) {
		unix_state_unlock(other);
		sock_put(other);
		goto restart;
	}

	if (other->sk_state != TCP_LISTEN ||
	    other->sk_shutdown & RCV_SHUTDOWN) {
		err = -ECONNREFUSED;
		goto out_unlock;
	}

	if (unix_recvq_full_lockless(other)) {
		if (!timeo) {
			err = -EAGAIN;
			goto out_unlock;
		}

		timeo = unix_wait_for_peer(other, timeo);
		sock_put(other);

		err = sock_intr_errno(timeo);
		if (signal_pending(current))
			goto out_free_skb;

		goto restart;
	}

	/* self connect and simultaneous connect are eliminated
	 * by rejecting TCP_LISTEN socket to avoid deadlock.
	 */
	state = READ_ONCE(sk->sk_state);
	if (unlikely(state != TCP_CLOSE)) {
		err = state == TCP_ESTABLISHED ? -EISCONN : -EINVAL;
		goto out_unlock;
	}

	unix_state_lock(sk);

	if (unlikely(sk->sk_state != TCP_CLOSE)) {
		err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EINVAL;
		unix_state_unlock(sk);
		goto out_unlock;
	}

	err = security_unix_stream_connect(sk, other, newsk);
	if (err) {
		unix_state_unlock(sk);
		goto out_unlock;
	}

	/* The way is open! Quickly set all the necessary fields... */

	sock_hold(sk);
	unix_peer(newsk)	= sk;
	newsk->sk_state		= TCP_ESTABLISHED;
	newsk->sk_type		= sk->sk_type;
	newsk->sk_scm_recv_flags = other->sk_scm_recv_flags;
	init_peercred(newsk);

	newu = unix_sk(newsk);
	newu->listener = other;
	RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
	otheru = unix_sk(other);

	/* copy address information from listening to new sock
	 *
	 * The contents of *(otheru->addr) and otheru->path
	 * are seen fully set up here, since we have found
	 * otheru in hash under its lock. Insertion into the
	 * hash chain we'd found it in had been done in an
	 * earlier critical area protected by the chain's lock,
	 * the same one where we'd set *(otheru->addr) contents,
	 * as well as otheru->path and otheru->addr itself.
	 *
	 * Using smp_store_release() here to set newu->addr
	 * is enough to make those stores, as well as stores
	 * to newu->path visible to anyone who gets newu->addr
	 * by smp_load_acquire(). IOW, the same guarantees
	 * as for unix_sock instances bound in unix_bind() or
	 * in unix_autobind().
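	 *
	 * For example, unix_getname() later in this file is one such
	 * acquire side: it fetches the address with
	 * smp_load_acquire(&unix_sk(sk)->addr) and may then safely
	 * read addr->len and addr->name.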
	 */
	if (otheru->path.dentry) {
		path_get(&otheru->path);
		newu->path = otheru->path;
	}
	refcount_inc(&otheru->addr->refcnt);
	smp_store_release(&newu->addr, otheru->addr);

	/* Set credentials */
	copy_peercred(sk, other);

	sock->state	= SS_CONNECTED;
	WRITE_ONCE(sk->sk_state, TCP_ESTABLISHED);
	sock_hold(newsk);

	smp_mb__after_atomic();	/* sock_hold() does an atomic_inc() */
	unix_peer(sk)	= newsk;

	unix_state_unlock(sk);

	/* take ten and send info to listening sock */
	spin_lock(&other->sk_receive_queue.lock);
	__skb_queue_tail(&other->sk_receive_queue, skb);
	spin_unlock(&other->sk_receive_queue.lock);
	unix_state_unlock(other);
	other->sk_data_ready(other);
	sock_put(other);
	return 0;

out_unlock:
	unix_state_unlock(other);
	sock_put(other);
out_free_skb:
	consume_skb(skb);
out_free_sk:
	unix_release_sock(newsk, 0);
out:
	return err;
}

static int unix_socketpair(struct socket *socka, struct socket *sockb)
{
	struct sock *ska = socka->sk, *skb = sockb->sk;

	/* Join our sockets back to back */
	sock_hold(ska);
	sock_hold(skb);
	unix_peer(ska) = skb;
	unix_peer(skb) = ska;
	init_peercred(ska);
	init_peercred(skb);

	ska->sk_state = TCP_ESTABLISHED;
	skb->sk_state = TCP_ESTABLISHED;
	socka->state  = SS_CONNECTED;
	sockb->state  = SS_CONNECTED;
	return 0;
}

static int unix_accept(struct socket *sock, struct socket *newsock,
		       struct proto_accept_arg *arg)
{
	struct sock *sk = sock->sk;
	struct sk_buff *skb;
	struct sock *tsk;

	arg->err = -EOPNOTSUPP;
	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
		goto out;

	arg->err = -EINVAL;
	if (READ_ONCE(sk->sk_state) != TCP_LISTEN)
		goto out;

	/* If socket state is TCP_LISTEN it cannot change (for now...),
	 * so that no locks are necessary.
	 */

	skb = skb_recv_datagram(sk, (arg->flags & O_NONBLOCK) ? MSG_DONTWAIT : 0,
				&arg->err);
	if (!skb) {
		/* This means receive shutdown.
		 */
		if (arg->err == 0)
			arg->err = -EINVAL;
		goto out;
	}

	tsk = skb->sk;
	skb_free_datagram(sk, skb);
	wake_up_interruptible(&unix_sk(sk)->peer_wait);

	/* attach accepted sock to socket */
	unix_state_lock(tsk);
	unix_update_edges(unix_sk(tsk));
	newsock->state = SS_CONNECTED;
	sock_graft(tsk, newsock);
	unix_state_unlock(tsk);
	return 0;

out:
	return arg->err;
}


static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int peer)
{
	struct sock *sk = sock->sk;
	struct unix_address *addr;
	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr);
	int err = 0;

	if (peer) {
		sk = unix_peer_get(sk);

		err = -ENOTCONN;
		if (!sk)
			goto out;
		err = 0;
	} else {
		sock_hold(sk);
	}

	addr = smp_load_acquire(&unix_sk(sk)->addr);
	if (!addr) {
		sunaddr->sun_family = AF_UNIX;
		sunaddr->sun_path[0] = 0;
		err = offsetof(struct sockaddr_un, sun_path);
	} else {
		err = addr->len;
		memcpy(sunaddr, addr->name, addr->len);

		if (peer)
			BPF_CGROUP_RUN_SA_PROG(sk, uaddr, &err,
					       CGROUP_UNIX_GETPEERNAME);
		else
			BPF_CGROUP_RUN_SA_PROG(sk, uaddr, &err,
					       CGROUP_UNIX_GETSOCKNAME);
	}
	sock_put(sk);
out:
	return err;
}

/* The "user->unix_inflight" variable is protected by the garbage
 * collection lock, and we just read it locklessly here. If you go
 * over the limit, there might be a tiny race in actually noticing
 * it across threads. Tough.
 */
static inline bool too_many_unix_fds(struct task_struct *p)
{
	struct user_struct *user = current_user();

	if (unlikely(READ_ONCE(user->unix_inflight) > task_rlimit(p, RLIMIT_NOFILE)))
		return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN);
	return false;
}

static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
{
	if (too_many_unix_fds(current))
		return -ETOOMANYREFS;

	UNIXCB(skb).fp = scm->fp;
	scm->fp = NULL;

	if (unix_prepare_fpl(UNIXCB(skb).fp))
		return -ENOMEM;

	return 0;
}

static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
{
	scm->fp = UNIXCB(skb).fp;
	UNIXCB(skb).fp = NULL;

	unix_destroy_fpl(scm->fp);
}

static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb)
{
	scm->fp = scm_fp_dup(UNIXCB(skb).fp);
}

static void unix_destruct_scm(struct sk_buff *skb)
{
	struct scm_cookie scm;

	memset(&scm, 0, sizeof(scm));
	scm.pid = UNIXCB(skb).pid;
	if (UNIXCB(skb).fp)
		unix_detach_fds(&scm, skb);

	/* Alas, it calls VFS */
	/* So fscking what?
	 * fput() had been SMP-safe since the last Summer */
	scm_destroy(&scm);
	sock_wfree(skb);
}

static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
{
	int err = 0;

	UNIXCB(skb).pid = get_pid(scm->pid);
	UNIXCB(skb).uid = scm->creds.uid;
	UNIXCB(skb).gid = scm->creds.gid;
	UNIXCB(skb).fp = NULL;
	unix_get_secdata(scm, skb);
	if (scm->fp && send_fds)
		err = unix_attach_fds(scm, skb);

	skb->destructor = unix_destruct_scm;
	return err;
}

/*
 * Some apps rely on write() giving SCM_CREDENTIALS.
 * We include credentials if source or destination socket
 * asserted SOCK_PASSCRED.
 */
static void unix_maybe_add_creds(struct sk_buff *skb, const struct sock *sk,
				 const struct sock *other)
{
	if (UNIXCB(skb).pid)
		return;

	if (unix_may_passcred(sk) || unix_may_passcred(other)) {
		UNIXCB(skb).pid = get_pid(task_tgid(current));
		current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid);
	}
}

static bool unix_skb_scm_eq(struct sk_buff *skb,
			    struct scm_cookie *scm)
{
	return UNIXCB(skb).pid == scm->pid &&
	       uid_eq(UNIXCB(skb).uid, scm->creds.uid) &&
	       gid_eq(UNIXCB(skb).gid, scm->creds.gid) &&
	       unix_secdata_eq(scm, skb);
}

static void scm_stat_add(struct sock *sk, struct sk_buff *skb)
{
	struct scm_fp_list *fp = UNIXCB(skb).fp;
	struct unix_sock *u = unix_sk(sk);

	if (unlikely(fp && fp->count)) {
		atomic_add(fp->count, &u->scm_stat.nr_fds);
		unix_add_edges(fp, u);
	}
}

static void scm_stat_del(struct sock *sk, struct sk_buff *skb)
{
	struct scm_fp_list *fp = UNIXCB(skb).fp;
	struct unix_sock *u = unix_sk(sk);

	if (unlikely(fp && fp->count)) {
		atomic_sub(fp->count, &u->scm_stat.nr_fds);
		unix_del_edges(fp);
	}
}

/*
 *	Send AF_UNIX data.
 */

static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
			      size_t len)
{
	struct sock *sk = sock->sk, *other = NULL;
	struct unix_sock *u = unix_sk(sk);
	struct scm_cookie scm;
	struct sk_buff *skb;
	int data_len = 0;
	int sk_locked;
	long timeo;
	int err;

	err = scm_send(sock, msg, &scm, false);
	if (err < 0)
		return err;

	wait_for_unix_gc(scm.fp);

	if (msg->msg_flags & MSG_OOB) {
		err = -EOPNOTSUPP;
		goto out;
	}

	if (msg->msg_namelen) {
		err = unix_validate_addr(msg->msg_name, msg->msg_namelen);
		if (err)
			goto out;

		err = BPF_CGROUP_RUN_PROG_UNIX_SENDMSG_LOCK(sk,
							    msg->msg_name,
							    &msg->msg_namelen,
							    NULL);
		if (err)
			goto out;
	}

	if (unix_may_passcred(sk) && !READ_ONCE(u->addr)) {
		err = unix_autobind(sk);
		if (err)
			goto out;
	}

	if (len > READ_ONCE(sk->sk_sndbuf) - 32) {
		err = -EMSGSIZE;
		goto out;
	}

	if (len > SKB_MAX_ALLOC) {
		data_len = min_t(size_t,
				 len - SKB_MAX_ALLOC,
				 MAX_SKB_FRAGS * PAGE_SIZE);
		data_len = PAGE_ALIGN(data_len);

		BUILD_BUG_ON(SKB_MAX_ALLOC < PAGE_SIZE);
	}

	skb = sock_alloc_send_pskb(sk, len - data_len, data_len,
				   msg->msg_flags & MSG_DONTWAIT, &err,
				   PAGE_ALLOC_COSTLY_ORDER);
	if (!skb)
		goto out;

	err = unix_scm_to_skb(&scm, skb, true);
	if (err < 0)
		goto out_free;

	skb_put(skb, len - data_len);
	skb->data_len = data_len;
	skb->len = len;
	err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, len);
	if (err)
		goto out_free;

	timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);

	if (msg->msg_namelen) {
lookup:
		other = unix_find_other(sock_net(sk), msg->msg_name,
					msg->msg_namelen, sk->sk_type);
		if (IS_ERR(other)) {
			err = PTR_ERR(other);
			goto out_free;
		}
	} else {
		other = unix_peer_get(sk);
		if (!other) {
			err = -ENOTCONN;
			goto out_free;
		}
	}

	if (sk_filter(other, skb) < 0) {
		/* Toss the packet but do not return any error to the sender */
		err = len;
		goto out_sock_put;
	}

restart:
	sk_locked = 0;
	unix_state_lock(other);
restart_locked:

	if (!unix_may_send(sk, other)) {
		err = -EPERM;
		goto out_unlock;
	}

	if (unlikely(sock_flag(other, SOCK_DEAD))) {
		/* Check with 1003.1g - what should datagram error */

		unix_state_unlock(other);

		if (sk->sk_type == SOCK_SEQPACKET) {
			/* We are here only when racing with unix_release_sock()
			 * is clearing @other. Never change state to TCP_CLOSE
			 * unlike SOCK_DGRAM wants.
2040 */ 2041 err = -EPIPE; 2042 goto out_sock_put; 2043 } 2044 2045 if (!sk_locked) 2046 unix_state_lock(sk); 2047 2048 if (unix_peer(sk) == other) { 2049 unix_peer(sk) = NULL; 2050 unix_dgram_peer_wake_disconnect_wakeup(sk, other); 2051 2052 WRITE_ONCE(sk->sk_state, TCP_CLOSE); 2053 unix_state_unlock(sk); 2054 2055 unix_dgram_disconnected(sk, other); 2056 sock_put(other); 2057 err = -ECONNREFUSED; 2058 goto out_sock_put; 2059 } 2060 2061 unix_state_unlock(sk); 2062 2063 if (!msg->msg_namelen) { 2064 err = -ECONNRESET; 2065 goto out_sock_put; 2066 } 2067 2068 sock_put(other); 2069 goto lookup; 2070 } 2071 2072 if (other->sk_shutdown & RCV_SHUTDOWN) { 2073 err = -EPIPE; 2074 goto out_unlock; 2075 } 2076 2077 if (UNIXCB(skb).fp && !other->sk_scm_rights) { 2078 err = -EPERM; 2079 goto out_unlock; 2080 } 2081 2082 if (sk->sk_type != SOCK_SEQPACKET) { 2083 err = security_unix_may_send(sk->sk_socket, other->sk_socket); 2084 if (err) 2085 goto out_unlock; 2086 } 2087 2088 /* other == sk && unix_peer(other) != sk if 2089 * - unix_peer(sk) == NULL, destination address bound to sk 2090 * - unix_peer(sk) == sk by time of get but disconnected before lock 2091 */ 2092 if (other != sk && 2093 unlikely(unix_peer(other) != sk && 2094 unix_recvq_full_lockless(other))) { 2095 if (timeo) { 2096 timeo = unix_wait_for_peer(other, timeo); 2097 2098 err = sock_intr_errno(timeo); 2099 if (signal_pending(current)) 2100 goto out_sock_put; 2101 2102 goto restart; 2103 } 2104 2105 if (!sk_locked) { 2106 unix_state_unlock(other); 2107 unix_state_double_lock(sk, other); 2108 } 2109 2110 if (unix_peer(sk) != other || 2111 unix_dgram_peer_wake_me(sk, other)) { 2112 err = -EAGAIN; 2113 sk_locked = 1; 2114 goto out_unlock; 2115 } 2116 2117 if (!sk_locked) { 2118 sk_locked = 1; 2119 goto restart_locked; 2120 } 2121 } 2122 2123 if (unlikely(sk_locked)) 2124 unix_state_unlock(sk); 2125 2126 if (sock_flag(other, SOCK_RCVTSTAMP)) 2127 __net_timestamp(skb); 2128 2129 unix_maybe_add_creds(skb, sk, other); 2130 scm_stat_add(other, skb); 2131 skb_queue_tail(&other->sk_receive_queue, skb); 2132 unix_state_unlock(other); 2133 other->sk_data_ready(other); 2134 sock_put(other); 2135 scm_destroy(&scm); 2136 return len; 2137 2138 out_unlock: 2139 if (sk_locked) 2140 unix_state_unlock(sk); 2141 unix_state_unlock(other); 2142 out_sock_put: 2143 sock_put(other); 2144 out_free: 2145 consume_skb(skb); 2146 out: 2147 scm_destroy(&scm); 2148 return err; 2149 } 2150 2151 /* We use paged skbs for stream sockets, and limit occupancy to 32768 2152 * bytes, and a minimum of a full page. 
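 *
 * With 4 KiB pages, UNIX_SKB_FRAGS_SZ below works out to
 * PAGE_SIZE << get_order(32768) == 4096 << 3 == 32768, i.e. each
 * stream skb carries at most 32 KiB of paged data on top of its
 * linear head.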
2153 */ 2154 #define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768)) 2155 2156 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 2157 static int queue_oob(struct sock *sk, struct msghdr *msg, struct sock *other, 2158 struct scm_cookie *scm, bool fds_sent) 2159 { 2160 struct unix_sock *ousk = unix_sk(other); 2161 struct sk_buff *skb; 2162 int err; 2163 2164 skb = sock_alloc_send_skb(sk, 1, msg->msg_flags & MSG_DONTWAIT, &err); 2165 2166 if (!skb) 2167 return err; 2168 2169 err = unix_scm_to_skb(scm, skb, !fds_sent); 2170 if (err < 0) 2171 goto out; 2172 2173 skb_put(skb, 1); 2174 err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, 1); 2175 2176 if (err) 2177 goto out; 2178 2179 unix_state_lock(other); 2180 2181 if (sock_flag(other, SOCK_DEAD) || 2182 (other->sk_shutdown & RCV_SHUTDOWN)) { 2183 err = -EPIPE; 2184 goto out_unlock; 2185 } 2186 2187 if (UNIXCB(skb).fp && !other->sk_scm_rights) { 2188 err = -EPERM; 2189 goto out_unlock; 2190 } 2191 2192 unix_maybe_add_creds(skb, sk, other); 2193 scm_stat_add(other, skb); 2194 2195 spin_lock(&other->sk_receive_queue.lock); 2196 WRITE_ONCE(ousk->oob_skb, skb); 2197 __skb_queue_tail(&other->sk_receive_queue, skb); 2198 spin_unlock(&other->sk_receive_queue.lock); 2199 2200 sk_send_sigurg(other); 2201 unix_state_unlock(other); 2202 other->sk_data_ready(other); 2203 2204 return 0; 2205 out_unlock: 2206 unix_state_unlock(other); 2207 out: 2208 consume_skb(skb); 2209 return err; 2210 } 2211 #endif 2212 2213 static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, 2214 size_t len) 2215 { 2216 struct sock *sk = sock->sk; 2217 struct sk_buff *skb = NULL; 2218 struct sock *other = NULL; 2219 struct scm_cookie scm; 2220 bool fds_sent = false; 2221 int err, sent = 0; 2222 2223 err = scm_send(sock, msg, &scm, false); 2224 if (err < 0) 2225 return err; 2226 2227 wait_for_unix_gc(scm.fp); 2228 2229 if (msg->msg_flags & MSG_OOB) { 2230 err = -EOPNOTSUPP; 2231 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 2232 if (len) 2233 len--; 2234 else 2235 #endif 2236 goto out_err; 2237 } 2238 2239 if (msg->msg_namelen) { 2240 err = READ_ONCE(sk->sk_state) == TCP_ESTABLISHED ? 
-EISCONN : -EOPNOTSUPP; 2241 goto out_err; 2242 } else { 2243 other = unix_peer(sk); 2244 if (!other) { 2245 err = -ENOTCONN; 2246 goto out_err; 2247 } 2248 } 2249 2250 if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN) 2251 goto out_pipe; 2252 2253 while (sent < len) { 2254 int size = len - sent; 2255 int data_len; 2256 2257 if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) { 2258 skb = sock_alloc_send_pskb(sk, 0, 0, 2259 msg->msg_flags & MSG_DONTWAIT, 2260 &err, 0); 2261 } else { 2262 /* Keep two messages in the pipe so it schedules better */ 2263 size = min_t(int, size, (READ_ONCE(sk->sk_sndbuf) >> 1) - 64); 2264 2265 /* allow fallback to order-0 allocations */ 2266 size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ); 2267 2268 data_len = max_t(int, 0, size - SKB_MAX_HEAD(0)); 2269 2270 data_len = min_t(size_t, size, PAGE_ALIGN(data_len)); 2271 2272 skb = sock_alloc_send_pskb(sk, size - data_len, data_len, 2273 msg->msg_flags & MSG_DONTWAIT, &err, 2274 get_order(UNIX_SKB_FRAGS_SZ)); 2275 } 2276 if (!skb) 2277 goto out_err; 2278 2279 /* Only send the fds in the first buffer */ 2280 err = unix_scm_to_skb(&scm, skb, !fds_sent); 2281 if (err < 0) 2282 goto out_free; 2283 2284 fds_sent = true; 2285 2286 if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) { 2287 skb->ip_summed = CHECKSUM_UNNECESSARY; 2288 err = skb_splice_from_iter(skb, &msg->msg_iter, size, 2289 sk->sk_allocation); 2290 if (err < 0) 2291 goto out_free; 2292 2293 size = err; 2294 refcount_add(size, &sk->sk_wmem_alloc); 2295 } else { 2296 skb_put(skb, size - data_len); 2297 skb->data_len = data_len; 2298 skb->len = size; 2299 err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size); 2300 if (err) 2301 goto out_free; 2302 } 2303 2304 unix_state_lock(other); 2305 2306 if (sock_flag(other, SOCK_DEAD) || 2307 (other->sk_shutdown & RCV_SHUTDOWN)) 2308 goto out_pipe_unlock; 2309 2310 if (UNIXCB(skb).fp && !other->sk_scm_rights) { 2311 unix_state_unlock(other); 2312 err = -EPERM; 2313 goto out_free; 2314 } 2315 2316 unix_maybe_add_creds(skb, sk, other); 2317 scm_stat_add(other, skb); 2318 skb_queue_tail(&other->sk_receive_queue, skb); 2319 unix_state_unlock(other); 2320 other->sk_data_ready(other); 2321 sent += size; 2322 } 2323 2324 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 2325 if (msg->msg_flags & MSG_OOB) { 2326 err = queue_oob(sk, msg, other, &scm, fds_sent); 2327 if (err) 2328 goto out_err; 2329 sent++; 2330 } 2331 #endif 2332 2333 scm_destroy(&scm); 2334 2335 return sent; 2336 2337 out_pipe_unlock: 2338 unix_state_unlock(other); 2339 out_pipe: 2340 if (!sent && !(msg->msg_flags & MSG_NOSIGNAL)) 2341 send_sig(SIGPIPE, current, 0); 2342 err = -EPIPE; 2343 out_free: 2344 consume_skb(skb); 2345 out_err: 2346 scm_destroy(&scm); 2347 return sent ? 
: err; 2348 } 2349 2350 static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg, 2351 size_t len) 2352 { 2353 int err; 2354 struct sock *sk = sock->sk; 2355 2356 err = sock_error(sk); 2357 if (err) 2358 return err; 2359 2360 if (READ_ONCE(sk->sk_state) != TCP_ESTABLISHED) 2361 return -ENOTCONN; 2362 2363 if (msg->msg_namelen) 2364 msg->msg_namelen = 0; 2365 2366 return unix_dgram_sendmsg(sock, msg, len); 2367 } 2368 2369 static int unix_seqpacket_recvmsg(struct socket *sock, struct msghdr *msg, 2370 size_t size, int flags) 2371 { 2372 struct sock *sk = sock->sk; 2373 2374 if (READ_ONCE(sk->sk_state) != TCP_ESTABLISHED) 2375 return -ENOTCONN; 2376 2377 return unix_dgram_recvmsg(sock, msg, size, flags); 2378 } 2379 2380 static void unix_copy_addr(struct msghdr *msg, struct sock *sk) 2381 { 2382 struct unix_address *addr = smp_load_acquire(&unix_sk(sk)->addr); 2383 2384 if (addr) { 2385 msg->msg_namelen = addr->len; 2386 memcpy(msg->msg_name, addr->name, addr->len); 2387 } 2388 } 2389 2390 int __unix_dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t size, 2391 int flags) 2392 { 2393 struct scm_cookie scm; 2394 struct socket *sock = sk->sk_socket; 2395 struct unix_sock *u = unix_sk(sk); 2396 struct sk_buff *skb, *last; 2397 long timeo; 2398 int skip; 2399 int err; 2400 2401 err = -EOPNOTSUPP; 2402 if (flags&MSG_OOB) 2403 goto out; 2404 2405 timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); 2406 2407 do { 2408 mutex_lock(&u->iolock); 2409 2410 skip = sk_peek_offset(sk, flags); 2411 skb = __skb_try_recv_datagram(sk, &sk->sk_receive_queue, flags, 2412 &skip, &err, &last); 2413 if (skb) { 2414 if (!(flags & MSG_PEEK)) 2415 scm_stat_del(sk, skb); 2416 break; 2417 } 2418 2419 mutex_unlock(&u->iolock); 2420 2421 if (err != -EAGAIN) 2422 break; 2423 } while (timeo && 2424 !__skb_wait_for_more_packets(sk, &sk->sk_receive_queue, 2425 &err, &timeo, last)); 2426 2427 if (!skb) { /* implies iolock unlocked */ 2428 unix_state_lock(sk); 2429 /* Signal EOF on disconnected non-blocking SEQPACKET socket. */ 2430 if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN && 2431 (sk->sk_shutdown & RCV_SHUTDOWN)) 2432 err = 0; 2433 unix_state_unlock(sk); 2434 goto out; 2435 } 2436 2437 if (wq_has_sleeper(&u->peer_wait)) 2438 wake_up_interruptible_sync_poll(&u->peer_wait, 2439 EPOLLOUT | EPOLLWRNORM | 2440 EPOLLWRBAND); 2441 2442 if (msg->msg_name) { 2443 unix_copy_addr(msg, skb->sk); 2444 2445 BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk, 2446 msg->msg_name, 2447 &msg->msg_namelen); 2448 } 2449 2450 if (size > skb->len - skip) 2451 size = skb->len - skip; 2452 else if (size < skb->len - skip) 2453 msg->msg_flags |= MSG_TRUNC; 2454 2455 err = skb_copy_datagram_msg(skb, skip, msg, size); 2456 if (err) 2457 goto out_free; 2458 2459 if (sock_flag(sk, SOCK_RCVTSTAMP)) 2460 __sock_recv_timestamp(msg, sk, skb); 2461 2462 memset(&scm, 0, sizeof(scm)); 2463 2464 scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid); 2465 unix_set_secdata(&scm, skb); 2466 2467 if (!(flags & MSG_PEEK)) { 2468 if (UNIXCB(skb).fp) 2469 unix_detach_fds(&scm, skb); 2470 2471 sk_peek_offset_bwd(sk, skb->len); 2472 } else { 2473 /* It is questionable: on PEEK we could: 2474 - do not return fds - good, but too simple 8) 2475 - return fds, and do not return them on read (old strategy, 2476 apparently wrong) 2477 - clone fds (I chose it for now, it is the most universal 2478 solution) 2479 2480 POSIX 1003.1g does not actually define this clearly 2481 at all. 
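		   ("Clone" here means unix_peek_fds() takes a fresh
		   reference on every file in the message, so each
		   MSG_PEEK hands the reader its own set of fds while
		   the originals stay queued.)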
POSIX 1003.1g doesn't define a lot of things 2482 clearly however! 2483 2484 */ 2485 2486 sk_peek_offset_fwd(sk, size); 2487 2488 if (UNIXCB(skb).fp) 2489 unix_peek_fds(&scm, skb); 2490 } 2491 err = (flags & MSG_TRUNC) ? skb->len - skip : size; 2492 2493 scm_recv_unix(sock, msg, &scm, flags); 2494 2495 out_free: 2496 skb_free_datagram(sk, skb); 2497 mutex_unlock(&u->iolock); 2498 out: 2499 return err; 2500 } 2501 2502 static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, 2503 int flags) 2504 { 2505 struct sock *sk = sock->sk; 2506 2507 #ifdef CONFIG_BPF_SYSCALL 2508 const struct proto *prot = READ_ONCE(sk->sk_prot); 2509 2510 if (prot != &unix_dgram_proto) 2511 return prot->recvmsg(sk, msg, size, flags, NULL); 2512 #endif 2513 return __unix_dgram_recvmsg(sk, msg, size, flags); 2514 } 2515 2516 static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor) 2517 { 2518 struct unix_sock *u = unix_sk(sk); 2519 struct sk_buff *skb; 2520 int err; 2521 2522 mutex_lock(&u->iolock); 2523 skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err); 2524 mutex_unlock(&u->iolock); 2525 if (!skb) 2526 return err; 2527 2528 return recv_actor(sk, skb); 2529 } 2530 2531 /* 2532 * Sleep until more data has arrived. But check for races.. 2533 */ 2534 static long unix_stream_data_wait(struct sock *sk, long timeo, 2535 struct sk_buff *last, unsigned int last_len, 2536 bool freezable) 2537 { 2538 unsigned int state = TASK_INTERRUPTIBLE | freezable * TASK_FREEZABLE; 2539 struct sk_buff *tail; 2540 DEFINE_WAIT(wait); 2541 2542 unix_state_lock(sk); 2543 2544 for (;;) { 2545 prepare_to_wait(sk_sleep(sk), &wait, state); 2546 2547 tail = skb_peek_tail(&sk->sk_receive_queue); 2548 if (tail != last || 2549 (tail && tail->len != last_len) || 2550 sk->sk_err || 2551 (sk->sk_shutdown & RCV_SHUTDOWN) || 2552 signal_pending(current) || 2553 !timeo) 2554 break; 2555 2556 sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); 2557 unix_state_unlock(sk); 2558 timeo = schedule_timeout(timeo); 2559 unix_state_lock(sk); 2560 2561 if (sock_flag(sk, SOCK_DEAD)) 2562 break; 2563 2564 sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); 2565 } 2566 2567 finish_wait(sk_sleep(sk), &wait); 2568 unix_state_unlock(sk); 2569 return timeo; 2570 } 2571 2572 static unsigned int unix_skb_len(const struct sk_buff *skb) 2573 { 2574 return skb->len - UNIXCB(skb).consumed; 2575 } 2576 2577 struct unix_stream_read_state { 2578 int (*recv_actor)(struct sk_buff *, int, int, 2579 struct unix_stream_read_state *); 2580 struct socket *socket; 2581 struct msghdr *msg; 2582 struct pipe_inode_info *pipe; 2583 size_t size; 2584 int flags; 2585 unsigned int splice_flags; 2586 }; 2587 2588 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 2589 static int unix_stream_recv_urg(struct unix_stream_read_state *state) 2590 { 2591 struct socket *sock = state->socket; 2592 struct sock *sk = sock->sk; 2593 struct unix_sock *u = unix_sk(sk); 2594 int chunk = 1; 2595 struct sk_buff *oob_skb; 2596 2597 mutex_lock(&u->iolock); 2598 unix_state_lock(sk); 2599 spin_lock(&sk->sk_receive_queue.lock); 2600 2601 if (sock_flag(sk, SOCK_URGINLINE) || !u->oob_skb) { 2602 spin_unlock(&sk->sk_receive_queue.lock); 2603 unix_state_unlock(sk); 2604 mutex_unlock(&u->iolock); 2605 return -EINVAL; 2606 } 2607 2608 oob_skb = u->oob_skb; 2609 2610 if (!(state->flags & MSG_PEEK)) 2611 WRITE_ONCE(u->oob_skb, NULL); 2612 2613 spin_unlock(&sk->sk_receive_queue.lock); 2614 unix_state_unlock(sk); 2615 2616 chunk = state->recv_actor(oob_skb, 0, chunk, state); 2617 2618 if (!(state->flags & MSG_PEEK)) 2619 
UNIXCB(oob_skb).consumed += 1; 2620 2621 mutex_unlock(&u->iolock); 2622 2623 if (chunk < 0) 2624 return -EFAULT; 2625 2626 state->msg->msg_flags |= MSG_OOB; 2627 return 1; 2628 } 2629 2630 static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk, 2631 int flags, int copied) 2632 { 2633 struct sk_buff *read_skb = NULL, *unread_skb = NULL; 2634 struct unix_sock *u = unix_sk(sk); 2635 2636 if (likely(unix_skb_len(skb) && skb != READ_ONCE(u->oob_skb))) 2637 return skb; 2638 2639 spin_lock(&sk->sk_receive_queue.lock); 2640 2641 if (!unix_skb_len(skb)) { 2642 if (copied && (!u->oob_skb || skb == u->oob_skb)) { 2643 skb = NULL; 2644 } else if (flags & MSG_PEEK) { 2645 skb = skb_peek_next(skb, &sk->sk_receive_queue); 2646 } else { 2647 read_skb = skb; 2648 skb = skb_peek_next(skb, &sk->sk_receive_queue); 2649 __skb_unlink(read_skb, &sk->sk_receive_queue); 2650 } 2651 2652 if (!skb) 2653 goto unlock; 2654 } 2655 2656 if (skb != u->oob_skb) 2657 goto unlock; 2658 2659 if (copied) { 2660 skb = NULL; 2661 } else if (!(flags & MSG_PEEK)) { 2662 WRITE_ONCE(u->oob_skb, NULL); 2663 2664 if (!sock_flag(sk, SOCK_URGINLINE)) { 2665 __skb_unlink(skb, &sk->sk_receive_queue); 2666 unread_skb = skb; 2667 skb = skb_peek(&sk->sk_receive_queue); 2668 } 2669 } else if (!sock_flag(sk, SOCK_URGINLINE)) { 2670 skb = skb_peek_next(skb, &sk->sk_receive_queue); 2671 } 2672 2673 unlock: 2674 spin_unlock(&sk->sk_receive_queue.lock); 2675 2676 consume_skb(read_skb); 2677 kfree_skb_reason(unread_skb, SKB_DROP_REASON_UNIX_SKIP_OOB); 2678 2679 return skb; 2680 } 2681 #endif 2682 2683 static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor) 2684 { 2685 struct unix_sock *u = unix_sk(sk); 2686 struct sk_buff *skb; 2687 int err; 2688 2689 if (unlikely(READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)) 2690 return -ENOTCONN; 2691 2692 mutex_lock(&u->iolock); 2693 skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err); 2694 mutex_unlock(&u->iolock); 2695 if (!skb) 2696 return err; 2697 2698 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 2699 if (unlikely(skb == READ_ONCE(u->oob_skb))) { 2700 bool drop = false; 2701 2702 unix_state_lock(sk); 2703 2704 if (sock_flag(sk, SOCK_DEAD)) { 2705 unix_state_unlock(sk); 2706 kfree_skb_reason(skb, SKB_DROP_REASON_SOCKET_CLOSE); 2707 return -ECONNRESET; 2708 } 2709 2710 spin_lock(&sk->sk_receive_queue.lock); 2711 if (likely(skb == u->oob_skb)) { 2712 WRITE_ONCE(u->oob_skb, NULL); 2713 drop = true; 2714 } 2715 spin_unlock(&sk->sk_receive_queue.lock); 2716 2717 unix_state_unlock(sk); 2718 2719 if (drop) { 2720 kfree_skb_reason(skb, SKB_DROP_REASON_UNIX_SKIP_OOB); 2721 return -EAGAIN; 2722 } 2723 } 2724 #endif 2725 2726 return recv_actor(sk, skb); 2727 } 2728 2729 static int unix_stream_read_generic(struct unix_stream_read_state *state, 2730 bool freezable) 2731 { 2732 struct scm_cookie scm; 2733 struct socket *sock = state->socket; 2734 struct sock *sk = sock->sk; 2735 struct unix_sock *u = unix_sk(sk); 2736 int copied = 0; 2737 int flags = state->flags; 2738 int noblock = flags & MSG_DONTWAIT; 2739 bool check_creds = false; 2740 int target; 2741 int err = 0; 2742 long timeo; 2743 int skip; 2744 size_t size = state->size; 2745 unsigned int last_len; 2746 2747 if (unlikely(READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)) { 2748 err = -EINVAL; 2749 goto out; 2750 } 2751 2752 if (unlikely(flags & MSG_OOB)) { 2753 err = -EOPNOTSUPP; 2754 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 2755 err = unix_stream_recv_urg(state); 2756 #endif 2757 goto out; 2758 } 2759 2760 target = sock_rcvlowat(sk, flags & 
MSG_WAITALL, size);
	timeo = sock_rcvtimeo(sk, noblock);

	memset(&scm, 0, sizeof(scm));

	/* Lock the socket to prevent the receive queue from being
	 * reordered while we sleep copying data out to the caller.
	 */
	mutex_lock(&u->iolock);

	skip = max(sk_peek_offset(sk, flags), 0);

	do {
		struct sk_buff *skb, *last;
		int chunk;

redo:
		unix_state_lock(sk);
		if (sock_flag(sk, SOCK_DEAD)) {
			err = -ECONNRESET;
			goto unlock;
		}
		last = skb = skb_peek(&sk->sk_receive_queue);
		last_len = last ? last->len : 0;

again:
#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
		if (skb) {
			skb = manage_oob(skb, sk, flags, copied);
			if (!skb && copied) {
				unix_state_unlock(sk);
				break;
			}
		}
#endif
		if (skb == NULL) {
			if (copied >= target)
				goto unlock;

			/*
			 *	POSIX 1003.1g mandates this order.
			 */

			err = sock_error(sk);
			if (err)
				goto unlock;
			if (sk->sk_shutdown & RCV_SHUTDOWN)
				goto unlock;

			unix_state_unlock(sk);
			if (!timeo) {
				err = -EAGAIN;
				break;
			}

			mutex_unlock(&u->iolock);

			timeo = unix_stream_data_wait(sk, timeo, last,
						      last_len, freezable);

			if (signal_pending(current)) {
				err = sock_intr_errno(timeo);
				scm_destroy(&scm);
				goto out;
			}

			mutex_lock(&u->iolock);
			goto redo;
unlock:
			unix_state_unlock(sk);
			break;
		}

		while (skip >= unix_skb_len(skb)) {
			skip -= unix_skb_len(skb);
			last = skb;
			last_len = skb->len;
			skb = skb_peek_next(skb, &sk->sk_receive_queue);
			if (!skb)
				goto again;
		}

		unix_state_unlock(sk);

		if (check_creds) {
			/* Never glue messages from different writers */
			if (!unix_skb_scm_eq(skb, &scm))
				break;
		} else if (unix_may_passcred(sk)) {
			/* Copy credentials */
			scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
			unix_set_secdata(&scm, skb);
			check_creds = true;
		}

		/* Copy address just once */
		if (state->msg && state->msg->msg_name) {
			DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr,
					 state->msg->msg_name);
			unix_copy_addr(state->msg, skb->sk);

			BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk,
							      state->msg->msg_name,
							      &state->msg->msg_namelen);

			sunaddr = NULL;
		}

		chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size);
		chunk = state->recv_actor(skb, skip, chunk, state);
		if (chunk < 0) {
			if (copied == 0)
				copied = -EFAULT;
			break;
		}
		copied += chunk;
		size -= chunk;

		/* Mark read part of skb as used */
		if (!(flags & MSG_PEEK)) {
			UNIXCB(skb).consumed += chunk;

			sk_peek_offset_bwd(sk, chunk);

			if (UNIXCB(skb).fp) {
				scm_stat_del(sk, skb);
				unix_detach_fds(&scm, skb);
			}

			if (unix_skb_len(skb))
				break;

			skb_unlink(skb, &sk->sk_receive_queue);
			consume_skb(skb);

			if (scm.fp)
				break;
		} else {
			/* It is questionable, see note in unix_dgram_recvmsg.
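			 *
			 * As there, MSG_PEEK clones the fds via
			 * unix_peek_fds(), and we stop after the first
			 * skb carrying fds so a single recvmsg() never
			 * merges SCM_RIGHTS from two messages.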
2899 */ 2900 if (UNIXCB(skb).fp) 2901 unix_peek_fds(&scm, skb); 2902 2903 sk_peek_offset_fwd(sk, chunk); 2904 2905 if (UNIXCB(skb).fp) 2906 break; 2907 2908 skip = 0; 2909 last = skb; 2910 last_len = skb->len; 2911 unix_state_lock(sk); 2912 skb = skb_peek_next(skb, &sk->sk_receive_queue); 2913 if (skb) 2914 goto again; 2915 unix_state_unlock(sk); 2916 break; 2917 } 2918 } while (size); 2919 2920 mutex_unlock(&u->iolock); 2921 if (state->msg) 2922 scm_recv_unix(sock, state->msg, &scm, flags); 2923 else 2924 scm_destroy(&scm); 2925 out: 2926 return copied ? : err; 2927 } 2928 2929 static int unix_stream_read_actor(struct sk_buff *skb, 2930 int skip, int chunk, 2931 struct unix_stream_read_state *state) 2932 { 2933 int ret; 2934 2935 ret = skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip, 2936 state->msg, chunk); 2937 return ret ?: chunk; 2938 } 2939 2940 int __unix_stream_recvmsg(struct sock *sk, struct msghdr *msg, 2941 size_t size, int flags) 2942 { 2943 struct unix_stream_read_state state = { 2944 .recv_actor = unix_stream_read_actor, 2945 .socket = sk->sk_socket, 2946 .msg = msg, 2947 .size = size, 2948 .flags = flags 2949 }; 2950 2951 return unix_stream_read_generic(&state, true); 2952 } 2953 2954 static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg, 2955 size_t size, int flags) 2956 { 2957 struct unix_stream_read_state state = { 2958 .recv_actor = unix_stream_read_actor, 2959 .socket = sock, 2960 .msg = msg, 2961 .size = size, 2962 .flags = flags 2963 }; 2964 2965 #ifdef CONFIG_BPF_SYSCALL 2966 struct sock *sk = sock->sk; 2967 const struct proto *prot = READ_ONCE(sk->sk_prot); 2968 2969 if (prot != &unix_stream_proto) 2970 return prot->recvmsg(sk, msg, size, flags, NULL); 2971 #endif 2972 return unix_stream_read_generic(&state, true); 2973 } 2974 2975 static int unix_stream_splice_actor(struct sk_buff *skb, 2976 int skip, int chunk, 2977 struct unix_stream_read_state *state) 2978 { 2979 return skb_splice_bits(skb, state->socket->sk, 2980 UNIXCB(skb).consumed + skip, 2981 state->pipe, chunk, state->splice_flags); 2982 } 2983 2984 static ssize_t unix_stream_splice_read(struct socket *sock, loff_t *ppos, 2985 struct pipe_inode_info *pipe, 2986 size_t size, unsigned int flags) 2987 { 2988 struct unix_stream_read_state state = { 2989 .recv_actor = unix_stream_splice_actor, 2990 .socket = sock, 2991 .pipe = pipe, 2992 .size = size, 2993 .splice_flags = flags, 2994 }; 2995 2996 if (unlikely(*ppos)) 2997 return -ESPIPE; 2998 2999 if (sock->file->f_flags & O_NONBLOCK || 3000 flags & SPLICE_F_NONBLOCK) 3001 state.flags = MSG_DONTWAIT; 3002 3003 return unix_stream_read_generic(&state, false); 3004 } 3005 3006 static int unix_shutdown(struct socket *sock, int mode) 3007 { 3008 struct sock *sk = sock->sk; 3009 struct sock *other; 3010 3011 if (mode < SHUT_RD || mode > SHUT_RDWR) 3012 return -EINVAL; 3013 /* This maps: 3014 * SHUT_RD (0) -> RCV_SHUTDOWN (1) 3015 * SHUT_WR (1) -> SEND_SHUTDOWN (2) 3016 * SHUT_RDWR (2) -> SHUTDOWN_MASK (3) 3017 */ 3018 ++mode; 3019 3020 unix_state_lock(sk); 3021 WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | mode); 3022 other = unix_peer(sk); 3023 if (other) 3024 sock_hold(other); 3025 unix_state_unlock(sk); 3026 sk->sk_state_change(sk); 3027 3028 if (other && 3029 (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) { 3030 3031 int peer_mode = 0; 3032 const struct proto *prot = READ_ONCE(other->sk_prot); 3033 3034 if (prot->unhash) 3035 prot->unhash(other); 3036 if (mode&RCV_SHUTDOWN) 3037 peer_mode |= SEND_SHUTDOWN; 3038 if 
(mode&SEND_SHUTDOWN) 3039 peer_mode |= RCV_SHUTDOWN; 3040 unix_state_lock(other); 3041 WRITE_ONCE(other->sk_shutdown, other->sk_shutdown | peer_mode); 3042 unix_state_unlock(other); 3043 other->sk_state_change(other); 3044 if (peer_mode == SHUTDOWN_MASK) 3045 sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP); 3046 else if (peer_mode & RCV_SHUTDOWN) 3047 sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN); 3048 } 3049 if (other) 3050 sock_put(other); 3051 3052 return 0; 3053 } 3054 3055 long unix_inq_len(struct sock *sk) 3056 { 3057 struct sk_buff *skb; 3058 long amount = 0; 3059 3060 if (READ_ONCE(sk->sk_state) == TCP_LISTEN) 3061 return -EINVAL; 3062 3063 spin_lock(&sk->sk_receive_queue.lock); 3064 if (sk->sk_type == SOCK_STREAM || 3065 sk->sk_type == SOCK_SEQPACKET) { 3066 skb_queue_walk(&sk->sk_receive_queue, skb) 3067 amount += unix_skb_len(skb); 3068 } else { 3069 skb = skb_peek(&sk->sk_receive_queue); 3070 if (skb) 3071 amount = skb->len; 3072 } 3073 spin_unlock(&sk->sk_receive_queue.lock); 3074 3075 return amount; 3076 } 3077 EXPORT_SYMBOL_GPL(unix_inq_len); 3078 3079 long unix_outq_len(struct sock *sk) 3080 { 3081 return sk_wmem_alloc_get(sk); 3082 } 3083 EXPORT_SYMBOL_GPL(unix_outq_len); 3084 3085 static int unix_open_file(struct sock *sk) 3086 { 3087 struct path path; 3088 struct file *f; 3089 int fd; 3090 3091 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) 3092 return -EPERM; 3093 3094 if (!smp_load_acquire(&unix_sk(sk)->addr)) 3095 return -ENOENT; 3096 3097 path = unix_sk(sk)->path; 3098 if (!path.dentry) 3099 return -ENOENT; 3100 3101 path_get(&path); 3102 3103 fd = get_unused_fd_flags(O_CLOEXEC); 3104 if (fd < 0) 3105 goto out; 3106 3107 f = dentry_open(&path, O_PATH, current_cred()); 3108 if (IS_ERR(f)) { 3109 put_unused_fd(fd); 3110 fd = PTR_ERR(f); 3111 goto out; 3112 } 3113 3114 fd_install(fd, f); 3115 out: 3116 path_put(&path); 3117 3118 return fd; 3119 } 3120 3121 static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) 3122 { 3123 struct sock *sk = sock->sk; 3124 long amount = 0; 3125 int err; 3126 3127 switch (cmd) { 3128 case SIOCOUTQ: 3129 amount = unix_outq_len(sk); 3130 err = put_user(amount, (int __user *)arg); 3131 break; 3132 case SIOCINQ: 3133 amount = unix_inq_len(sk); 3134 if (amount < 0) 3135 err = amount; 3136 else 3137 err = put_user(amount, (int __user *)arg); 3138 break; 3139 case SIOCUNIXFILE: 3140 err = unix_open_file(sk); 3141 break; 3142 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 3143 case SIOCATMARK: 3144 { 3145 struct unix_sock *u = unix_sk(sk); 3146 struct sk_buff *skb; 3147 int answ = 0; 3148 3149 mutex_lock(&u->iolock); 3150 3151 skb = skb_peek(&sk->sk_receive_queue); 3152 if (skb) { 3153 struct sk_buff *oob_skb = READ_ONCE(u->oob_skb); 3154 struct sk_buff *next_skb; 3155 3156 next_skb = skb_peek_next(skb, &sk->sk_receive_queue); 3157 3158 if (skb == oob_skb || 3159 (!unix_skb_len(skb) && 3160 (!oob_skb || next_skb == oob_skb))) 3161 answ = 1; 3162 } 3163 3164 mutex_unlock(&u->iolock); 3165 3166 err = put_user(answ, (int __user *)arg); 3167 } 3168 break; 3169 #endif 3170 default: 3171 err = -ENOIOCTLCMD; 3172 break; 3173 } 3174 return err; 3175 } 3176 3177 #ifdef CONFIG_COMPAT 3178 static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) 3179 { 3180 return unix_ioctl(sock, cmd, (unsigned long)compat_ptr(arg)); 3181 } 3182 #endif 3183 3184 static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wait) 3185 { 3186 struct sock *sk = sock->sk; 3187 unsigned char state; 3188 __poll_t 
mask;
	u8 shutdown;

	sock_poll_wait(file, sock, wait);
	mask = 0;
	shutdown = READ_ONCE(sk->sk_shutdown);
	state = READ_ONCE(sk->sk_state);

	/* exceptional events? */
	if (READ_ONCE(sk->sk_err))
		mask |= EPOLLERR;
	if (shutdown == SHUTDOWN_MASK)
		mask |= EPOLLHUP;
	if (shutdown & RCV_SHUTDOWN)
		mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;

	/* readable? */
	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
		mask |= EPOLLIN | EPOLLRDNORM;
	if (sk_is_readable(sk))
		mask |= EPOLLIN | EPOLLRDNORM;
#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
	if (READ_ONCE(unix_sk(sk)->oob_skb))
		mask |= EPOLLPRI;
#endif

	/* Connection-based sockets need to check for termination and startup. */
	if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) &&
	    state == TCP_CLOSE)
		mask |= EPOLLHUP;

	/*
	 * We report the socket as writable also when the other side has
	 * shut down the connection; this prevents stuck sockets.
	 */
	if (unix_writable(sk, state))
		mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;

	return mask;
}

static __poll_t unix_dgram_poll(struct file *file, struct socket *sock,
				poll_table *wait)
{
	struct sock *sk = sock->sk, *other;
	unsigned int writable;
	unsigned char state;
	__poll_t mask;
	u8 shutdown;

	sock_poll_wait(file, sock, wait);
	mask = 0;
	shutdown = READ_ONCE(sk->sk_shutdown);
	state = READ_ONCE(sk->sk_state);

	/* exceptional events? */
	if (READ_ONCE(sk->sk_err) ||
	    !skb_queue_empty_lockless(&sk->sk_error_queue))
		mask |= EPOLLERR |
			(sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0);

	if (shutdown & RCV_SHUTDOWN)
		mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
	if (shutdown == SHUTDOWN_MASK)
		mask |= EPOLLHUP;

	/* readable? */
	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
		mask |= EPOLLIN | EPOLLRDNORM;
	if (sk_is_readable(sk))
		mask |= EPOLLIN | EPOLLRDNORM;

	/* Connection-based sockets need to check for termination and startup. */
	if (sk->sk_type == SOCK_SEQPACKET && state == TCP_CLOSE)
		mask |= EPOLLHUP;

	/* No write status requested, avoid expensive OUT tests.
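	 *
	 * Datagram writability may depend on the peer: if its receive
	 * queue is full we must hook ourselves onto the peer's wake
	 * queue via unix_dgram_peer_wake_me(), which is too costly to
	 * do unconditionally on every poll.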
*/ 3265 if (!(poll_requested_events(wait) & (EPOLLWRBAND|EPOLLWRNORM|EPOLLOUT))) 3266 return mask; 3267 3268 writable = unix_writable(sk, state); 3269 if (writable) { 3270 unix_state_lock(sk); 3271 3272 other = unix_peer(sk); 3273 if (other && unix_peer(other) != sk && 3274 unix_recvq_full_lockless(other) && 3275 unix_dgram_peer_wake_me(sk, other)) 3276 writable = 0; 3277 3278 unix_state_unlock(sk); 3279 } 3280 3281 if (writable) 3282 mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND; 3283 else 3284 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); 3285 3286 return mask; 3287 } 3288 3289 #ifdef CONFIG_PROC_FS 3290 3291 #define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1) 3292 3293 #define get_bucket(x) ((x) >> BUCKET_SPACE) 3294 #define get_offset(x) ((x) & ((1UL << BUCKET_SPACE) - 1)) 3295 #define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o)) 3296 3297 static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos) 3298 { 3299 unsigned long offset = get_offset(*pos); 3300 unsigned long bucket = get_bucket(*pos); 3301 unsigned long count = 0; 3302 struct sock *sk; 3303 3304 for (sk = sk_head(&seq_file_net(seq)->unx.table.buckets[bucket]); 3305 sk; sk = sk_next(sk)) { 3306 if (++count == offset) 3307 break; 3308 } 3309 3310 return sk; 3311 } 3312 3313 static struct sock *unix_get_first(struct seq_file *seq, loff_t *pos) 3314 { 3315 unsigned long bucket = get_bucket(*pos); 3316 struct net *net = seq_file_net(seq); 3317 struct sock *sk; 3318 3319 while (bucket < UNIX_HASH_SIZE) { 3320 spin_lock(&net->unx.table.locks[bucket]); 3321 3322 sk = unix_from_bucket(seq, pos); 3323 if (sk) 3324 return sk; 3325 3326 spin_unlock(&net->unx.table.locks[bucket]); 3327 3328 *pos = set_bucket_offset(++bucket, 1); 3329 } 3330 3331 return NULL; 3332 } 3333 3334 static struct sock *unix_get_next(struct seq_file *seq, struct sock *sk, 3335 loff_t *pos) 3336 { 3337 unsigned long bucket = get_bucket(*pos); 3338 3339 sk = sk_next(sk); 3340 if (sk) 3341 return sk; 3342 3343 3344 spin_unlock(&seq_file_net(seq)->unx.table.locks[bucket]); 3345 3346 *pos = set_bucket_offset(++bucket, 1); 3347 3348 return unix_get_first(seq, pos); 3349 } 3350 3351 static void *unix_seq_start(struct seq_file *seq, loff_t *pos) 3352 { 3353 if (!*pos) 3354 return SEQ_START_TOKEN; 3355 3356 return unix_get_first(seq, pos); 3357 } 3358 3359 static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos) 3360 { 3361 ++*pos; 3362 3363 if (v == SEQ_START_TOKEN) 3364 return unix_get_first(seq, pos); 3365 3366 return unix_get_next(seq, v, pos); 3367 } 3368 3369 static void unix_seq_stop(struct seq_file *seq, void *v) 3370 { 3371 struct sock *sk = v; 3372 3373 if (sk) 3374 spin_unlock(&seq_file_net(seq)->unx.table.locks[sk->sk_hash]); 3375 } 3376 3377 static int unix_seq_show(struct seq_file *seq, void *v) 3378 { 3379 3380 if (v == SEQ_START_TOKEN) 3381 seq_puts(seq, "Num RefCount Protocol Flags Type St " 3382 "Inode Path\n"); 3383 else { 3384 struct sock *s = v; 3385 struct unix_sock *u = unix_sk(s); 3386 unix_state_lock(s); 3387 3388 seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu", 3389 s, 3390 refcount_read(&s->sk_refcnt), 3391 0, 3392 s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0, 3393 s->sk_type, 3394 s->sk_socket ? 3395 (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) : 3396 (s->sk_state == TCP_ESTABLISHED ? 
SS_CONNECTING : SS_DISCONNECTING), 3397 sock_i_ino(s)); 3398 3399 if (u->addr) { // under a hash table lock here 3400 int i, len; 3401 seq_putc(seq, ' '); 3402 3403 i = 0; 3404 len = u->addr->len - 3405 offsetof(struct sockaddr_un, sun_path); 3406 if (u->addr->name->sun_path[0]) { 3407 len--; 3408 } else { 3409 seq_putc(seq, '@'); 3410 i++; 3411 } 3412 for ( ; i < len; i++) 3413 seq_putc(seq, u->addr->name->sun_path[i] ?: 3414 '@'); 3415 } 3416 unix_state_unlock(s); 3417 seq_putc(seq, '\n'); 3418 } 3419 3420 return 0; 3421 } 3422 3423 static const struct seq_operations unix_seq_ops = { 3424 .start = unix_seq_start, 3425 .next = unix_seq_next, 3426 .stop = unix_seq_stop, 3427 .show = unix_seq_show, 3428 }; 3429 3430 #ifdef CONFIG_BPF_SYSCALL 3431 struct bpf_unix_iter_state { 3432 struct seq_net_private p; 3433 unsigned int cur_sk; 3434 unsigned int end_sk; 3435 unsigned int max_sk; 3436 struct sock **batch; 3437 bool st_bucket_done; 3438 }; 3439 3440 struct bpf_iter__unix { 3441 __bpf_md_ptr(struct bpf_iter_meta *, meta); 3442 __bpf_md_ptr(struct unix_sock *, unix_sk); 3443 uid_t uid __aligned(8); 3444 }; 3445 3446 static int unix_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta, 3447 struct unix_sock *unix_sk, uid_t uid) 3448 { 3449 struct bpf_iter__unix ctx; 3450 3451 meta->seq_num--; /* skip SEQ_START_TOKEN */ 3452 ctx.meta = meta; 3453 ctx.unix_sk = unix_sk; 3454 ctx.uid = uid; 3455 return bpf_iter_run_prog(prog, &ctx); 3456 } 3457 3458 static int bpf_iter_unix_hold_batch(struct seq_file *seq, struct sock *start_sk) 3459 3460 { 3461 struct bpf_unix_iter_state *iter = seq->private; 3462 unsigned int expected = 1; 3463 struct sock *sk; 3464 3465 sock_hold(start_sk); 3466 iter->batch[iter->end_sk++] = start_sk; 3467 3468 for (sk = sk_next(start_sk); sk; sk = sk_next(sk)) { 3469 if (iter->end_sk < iter->max_sk) { 3470 sock_hold(sk); 3471 iter->batch[iter->end_sk++] = sk; 3472 } 3473 3474 expected++; 3475 } 3476 3477 spin_unlock(&seq_file_net(seq)->unx.table.locks[start_sk->sk_hash]); 3478 3479 return expected; 3480 } 3481 3482 static void bpf_iter_unix_put_batch(struct bpf_unix_iter_state *iter) 3483 { 3484 while (iter->cur_sk < iter->end_sk) 3485 sock_put(iter->batch[iter->cur_sk++]); 3486 } 3487 3488 static int bpf_iter_unix_realloc_batch(struct bpf_unix_iter_state *iter, 3489 unsigned int new_batch_sz) 3490 { 3491 struct sock **new_batch; 3492 3493 new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz, 3494 GFP_USER | __GFP_NOWARN); 3495 if (!new_batch) 3496 return -ENOMEM; 3497 3498 bpf_iter_unix_put_batch(iter); 3499 kvfree(iter->batch); 3500 iter->batch = new_batch; 3501 iter->max_sk = new_batch_sz; 3502 3503 return 0; 3504 } 3505 3506 static struct sock *bpf_iter_unix_batch(struct seq_file *seq, 3507 loff_t *pos) 3508 { 3509 struct bpf_unix_iter_state *iter = seq->private; 3510 unsigned int expected; 3511 bool resized = false; 3512 struct sock *sk; 3513 3514 if (iter->st_bucket_done) 3515 *pos = set_bucket_offset(get_bucket(*pos) + 1, 1); 3516 3517 again: 3518 /* Get a new batch */ 3519 iter->cur_sk = 0; 3520 iter->end_sk = 0; 3521 3522 sk = unix_get_first(seq, pos); 3523 if (!sk) 3524 return NULL; /* Done */ 3525 3526 expected = bpf_iter_unix_hold_batch(seq, sk); 3527 3528 if (iter->end_sk == expected) { 3529 iter->st_bucket_done = true; 3530 return sk; 3531 } 3532 3533 if (!resized && !bpf_iter_unix_realloc_batch(iter, expected * 3 / 2)) { 3534 resized = true; 3535 goto again; 3536 } 3537 3538 return sk; 3539 } 3540 3541 static void *bpf_iter_unix_seq_start(struct 
seq_file *seq, loff_t *pos) 3542 { 3543 if (!*pos) 3544 return SEQ_START_TOKEN; 3545 3546 /* bpf iter does not support lseek, so it always 3547 * continue from where it was stop()-ped. 3548 */ 3549 return bpf_iter_unix_batch(seq, pos); 3550 } 3551 3552 static void *bpf_iter_unix_seq_next(struct seq_file *seq, void *v, loff_t *pos) 3553 { 3554 struct bpf_unix_iter_state *iter = seq->private; 3555 struct sock *sk; 3556 3557 /* Whenever seq_next() is called, the iter->cur_sk is 3558 * done with seq_show(), so advance to the next sk in 3559 * the batch. 3560 */ 3561 if (iter->cur_sk < iter->end_sk) 3562 sock_put(iter->batch[iter->cur_sk++]); 3563 3564 ++*pos; 3565 3566 if (iter->cur_sk < iter->end_sk) 3567 sk = iter->batch[iter->cur_sk]; 3568 else 3569 sk = bpf_iter_unix_batch(seq, pos); 3570 3571 return sk; 3572 } 3573 3574 static int bpf_iter_unix_seq_show(struct seq_file *seq, void *v) 3575 { 3576 struct bpf_iter_meta meta; 3577 struct bpf_prog *prog; 3578 struct sock *sk = v; 3579 uid_t uid; 3580 bool slow; 3581 int ret; 3582 3583 if (v == SEQ_START_TOKEN) 3584 return 0; 3585 3586 slow = lock_sock_fast(sk); 3587 3588 if (unlikely(sk_unhashed(sk))) { 3589 ret = SEQ_SKIP; 3590 goto unlock; 3591 } 3592 3593 uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk)); 3594 meta.seq = seq; 3595 prog = bpf_iter_get_info(&meta, false); 3596 ret = unix_prog_seq_show(prog, &meta, v, uid); 3597 unlock: 3598 unlock_sock_fast(sk, slow); 3599 return ret; 3600 } 3601 3602 static void bpf_iter_unix_seq_stop(struct seq_file *seq, void *v) 3603 { 3604 struct bpf_unix_iter_state *iter = seq->private; 3605 struct bpf_iter_meta meta; 3606 struct bpf_prog *prog; 3607 3608 if (!v) { 3609 meta.seq = seq; 3610 prog = bpf_iter_get_info(&meta, true); 3611 if (prog) 3612 (void)unix_prog_seq_show(prog, &meta, v, 0); 3613 } 3614 3615 if (iter->cur_sk < iter->end_sk) 3616 bpf_iter_unix_put_batch(iter); 3617 } 3618 3619 static const struct seq_operations bpf_iter_unix_seq_ops = { 3620 .start = bpf_iter_unix_seq_start, 3621 .next = bpf_iter_unix_seq_next, 3622 .stop = bpf_iter_unix_seq_stop, 3623 .show = bpf_iter_unix_seq_show, 3624 }; 3625 #endif 3626 #endif 3627 3628 static const struct net_proto_family unix_family_ops = { 3629 .family = PF_UNIX, 3630 .create = unix_create, 3631 .owner = THIS_MODULE, 3632 }; 3633 3634 3635 static int __net_init unix_net_init(struct net *net) 3636 { 3637 int i; 3638 3639 net->unx.sysctl_max_dgram_qlen = 10; 3640 if (unix_sysctl_register(net)) 3641 goto out; 3642 3643 #ifdef CONFIG_PROC_FS 3644 if (!proc_create_net("unix", 0, net->proc_net, &unix_seq_ops, 3645 sizeof(struct seq_net_private))) 3646 goto err_sysctl; 3647 #endif 3648 3649 net->unx.table.locks = kvmalloc_array(UNIX_HASH_SIZE, 3650 sizeof(spinlock_t), GFP_KERNEL); 3651 if (!net->unx.table.locks) 3652 goto err_proc; 3653 3654 net->unx.table.buckets = kvmalloc_array(UNIX_HASH_SIZE, 3655 sizeof(struct hlist_head), 3656 GFP_KERNEL); 3657 if (!net->unx.table.buckets) 3658 goto free_locks; 3659 3660 for (i = 0; i < UNIX_HASH_SIZE; i++) { 3661 spin_lock_init(&net->unx.table.locks[i]); 3662 lock_set_cmp_fn(&net->unx.table.locks[i], unix_table_lock_cmp_fn, NULL); 3663 INIT_HLIST_HEAD(&net->unx.table.buckets[i]); 3664 } 3665 3666 return 0; 3667 3668 free_locks: 3669 kvfree(net->unx.table.locks); 3670 err_proc: 3671 #ifdef CONFIG_PROC_FS 3672 remove_proc_entry("unix", net->proc_net); 3673 err_sysctl: 3674 #endif 3675 unix_sysctl_unregister(net); 3676 out: 3677 return -ENOMEM; 3678 } 3679 3680 static void __net_exit unix_net_exit(struct net 
*net) 3681 { 3682 kvfree(net->unx.table.buckets); 3683 kvfree(net->unx.table.locks); 3684 unix_sysctl_unregister(net); 3685 remove_proc_entry("unix", net->proc_net); 3686 } 3687 3688 static struct pernet_operations unix_net_ops = { 3689 .init = unix_net_init, 3690 .exit = unix_net_exit, 3691 }; 3692 3693 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) 3694 DEFINE_BPF_ITER_FUNC(unix, struct bpf_iter_meta *meta, 3695 struct unix_sock *unix_sk, uid_t uid) 3696 3697 #define INIT_BATCH_SZ 16 3698 3699 static int bpf_iter_init_unix(void *priv_data, struct bpf_iter_aux_info *aux) 3700 { 3701 struct bpf_unix_iter_state *iter = priv_data; 3702 int err; 3703 3704 err = bpf_iter_init_seq_net(priv_data, aux); 3705 if (err) 3706 return err; 3707 3708 err = bpf_iter_unix_realloc_batch(iter, INIT_BATCH_SZ); 3709 if (err) { 3710 bpf_iter_fini_seq_net(priv_data); 3711 return err; 3712 } 3713 3714 return 0; 3715 } 3716 3717 static void bpf_iter_fini_unix(void *priv_data) 3718 { 3719 struct bpf_unix_iter_state *iter = priv_data; 3720 3721 bpf_iter_fini_seq_net(priv_data); 3722 kvfree(iter->batch); 3723 } 3724 3725 static const struct bpf_iter_seq_info unix_seq_info = { 3726 .seq_ops = &bpf_iter_unix_seq_ops, 3727 .init_seq_private = bpf_iter_init_unix, 3728 .fini_seq_private = bpf_iter_fini_unix, 3729 .seq_priv_size = sizeof(struct bpf_unix_iter_state), 3730 }; 3731 3732 static const struct bpf_func_proto * 3733 bpf_iter_unix_get_func_proto(enum bpf_func_id func_id, 3734 const struct bpf_prog *prog) 3735 { 3736 switch (func_id) { 3737 case BPF_FUNC_setsockopt: 3738 return &bpf_sk_setsockopt_proto; 3739 case BPF_FUNC_getsockopt: 3740 return &bpf_sk_getsockopt_proto; 3741 default: 3742 return NULL; 3743 } 3744 } 3745 3746 static struct bpf_iter_reg unix_reg_info = { 3747 .target = "unix", 3748 .ctx_arg_info_size = 1, 3749 .ctx_arg_info = { 3750 { offsetof(struct bpf_iter__unix, unix_sk), 3751 PTR_TO_BTF_ID_OR_NULL }, 3752 }, 3753 .get_func_proto = bpf_iter_unix_get_func_proto, 3754 .seq_info = &unix_seq_info, 3755 }; 3756 3757 static void __init bpf_iter_register(void) 3758 { 3759 unix_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_UNIX]; 3760 if (bpf_iter_reg_target(&unix_reg_info)) 3761 pr_warn("Warning: could not register bpf iterator unix\n"); 3762 } 3763 #endif 3764 3765 static int __init af_unix_init(void) 3766 { 3767 int i, rc = -1; 3768 3769 BUILD_BUG_ON(sizeof(struct unix_skb_parms) > sizeof_field(struct sk_buff, cb)); 3770 3771 for (i = 0; i < UNIX_HASH_SIZE / 2; i++) { 3772 spin_lock_init(&bsd_socket_locks[i]); 3773 INIT_HLIST_HEAD(&bsd_socket_buckets[i]); 3774 } 3775 3776 rc = proto_register(&unix_dgram_proto, 1); 3777 if (rc != 0) { 3778 pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__); 3779 goto out; 3780 } 3781 3782 rc = proto_register(&unix_stream_proto, 1); 3783 if (rc != 0) { 3784 pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__); 3785 proto_unregister(&unix_dgram_proto); 3786 goto out; 3787 } 3788 3789 sock_register(&unix_family_ops); 3790 register_pernet_subsys(&unix_net_ops); 3791 unix_bpf_build_proto(); 3792 3793 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) 3794 bpf_iter_register(); 3795 #endif 3796 3797 out: 3798 return rc; 3799 } 3800 3801 /* Later than subsys_initcall() because we depend on stuff initialised there */ 3802 fs_initcall(af_unix_init); 3803
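/*
 * Illustrative smoke test of the SCM_RIGHTS machinery wired up above.
 * This is a hypothetical userspace snippet, not part of the kernel
 * build: it passes one fd across a socketpair and receives it back.
 * Note a stream socket must carry at least one data byte for the
 * attached fds to be delivered.
 *
 *	int sv[2], fd = open("/dev/null", O_RDONLY);
 *	char x = 'x', cbuf[CMSG_SPACE(sizeof(int))] = {};
 *	struct iovec iov = { .iov_base = &x, .iov_len = 1 };
 *	struct msghdr mh = {
 *		.msg_iov = &iov, .msg_iovlen = 1,
 *		.msg_control = cbuf, .msg_controllen = sizeof(cbuf),
 *	};
 *	struct cmsghdr *c = CMSG_FIRSTHDR(&mh);
 *
 *	socketpair(AF_UNIX, SOCK_STREAM, 0, sv);
 *	c->cmsg_level = SOL_SOCKET;
 *	c->cmsg_type = SCM_RIGHTS;
 *	c->cmsg_len = CMSG_LEN(sizeof(int));
 *	memcpy(CMSG_DATA(c), &fd, sizeof(int));
 *	sendmsg(sv[0], &mh, 0);		- unix_attach_fds() runs here
 *	recvmsg(sv[1], &mh, 0);		- unix_detach_fds() runs here
 */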