// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * NET4:	Implementation of BSD Unix domain sockets.
 *
 * Authors:	Alan Cox, <alan@lxorguk.ukuu.org.uk>
 *
 * Fixes:
 *		Linus Torvalds	:	Assorted bug cures.
 *		Niibe Yutaka	:	async I/O support.
 *		Carsten Paeth	:	PF_UNIX check, address fixes.
 *		Alan Cox	:	Limit size of allocated blocks.
 *		Alan Cox	:	Fixed the stupid socketpair bug.
 *		Alan Cox	:	BSD compatibility fine tuning.
 *		Alan Cox	:	Fixed a bug in connect when interrupted.
 *		Alan Cox	:	Sorted out a proper draft version of
 *					file descriptor passing hacked up from
 *					Mike Shaver's work.
 *		Marty Leisner	:	Fixes to fd passing
 *		Nick Nevin	:	recvmsg bugfix.
 *		Alan Cox	:	Started proper garbage collector
 *		Heiko EiBfeldt	:	Missing verify_area check
 *		Alan Cox	:	Started POSIXisms
 *		Andreas Schwab	:	Replace inode by dentry for proper
 *					reference counting
 *		Kirk Petersen	:	Made this a module
 *	    Christoph Rohland	:	Elegant non-blocking accept/connect algorithm.
 *					Lots of bug fixes.
 *	    Alexey Kuznetsov	:	Repaired (I hope) bugs introduced
 *					by above two patches.
 *	    Andrea Arcangeli	:	If possible we block in connect(2)
 *					if the max backlog of the listen socket
 *					has been reached. This won't break
 *					old apps and it will avoid a huge amount
 *					of socks hashed (this for unix_gc()
 *					performance reasons).
 *					Security fix that limits the max
 *					number of socks to 2*max_files and
 *					the number of skb queueable in the
 *					dgram receiver.
 *	    Artur Skawina	:	Hash function optimizations
 *	    Alexey Kuznetsov	:	Full scale SMP. Lot of bugs are introduced 8)
 *	    Malcolm Beattie	:	Set peercred for socketpair
 *	    Michal Ostrowski	:	Module initialization cleanup.
 *	    Arnaldo C. Melo	:	Remove MOD_{INC,DEC}_USE_COUNT,
 *					the core infrastructure is doing that
 *					for all net proto families now (2.5.69+)
 *
 * Known differences from reference BSD that was tested:
 *
 *	[TO FIX]
 *	ECONNREFUSED is not returned from one end of a connected() socket to the
 *		other the moment one end closes.
 *	fstat() doesn't return st_dev=0, and gives the blksize as high water mark
 *		and a fake inode identifier (nor the BSD first socket fstat twice bug).
 *	[NOT TO FIX]
 *	accept() returns a path name even if the connecting socket has closed
 *		in the meantime (BSD loses the path and gives up).
 *	accept() returns 0 length path for an unbound connector. BSD returns 16
 *		and a null first byte in the path (but not for gethost/peername - BSD bug ??)
 *	socketpair(...SOCK_RAW..) doesn't panic the kernel.
 *	BSD af_unix apparently has connect forgetting to block properly.
 *		(need to check this with the POSIX spec in detail)
 *
 * Differences from 2.0.0-11-... (ANK)
 *	Bug fixes and improvements.
 *		- client shutdown killed server socket.
 *		- removed all useless cli/sti pairs.
 *
 *	Semantic changes/extensions.
 *		- generic control message passing.
 *		- SCM_CREDENTIALS control message.
 *		- "Abstract" (not FS based) socket bindings.
 *		  Abstract names are sequences of bytes (not zero terminated)
 *		  started by 0, so that this name space does not intersect
 *		  with BSD names.
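 *
 *		  For illustration (userspace view, example values only):
 *		  binding { sun_family = AF_UNIX, sun_path = "\0example" }
 *		  with addr_len = offsetof(struct sockaddr_un, sun_path) + 8
 *		  creates an abstract socket, while a NUL-terminated
 *		  "/tmp/sock" path creates a socket object in the filesystem.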
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/bpf-cgroup.h>
#include <linux/btf_ids.h>
#include <linux/dcache.h>
#include <linux/errno.h>
#include <linux/fcntl.h>
#include <linux/file.h>
#include <linux/filter.h>
#include <linux/fs.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/mount.h>
#include <linux/namei.h>
#include <linux/poll.h>
#include <linux/proc_fs.h>
#include <linux/sched/signal.h>
#include <linux/security.h>
#include <linux/seq_file.h>
#include <linux/skbuff.h>
#include <linux/slab.h>
#include <linux/socket.h>
#include <linux/splice.h>
#include <linux/string.h>
#include <linux/uaccess.h>
#include <net/af_unix.h>
#include <net/net_namespace.h>
#include <net/scm.h>
#include <net/tcp_states.h>
#include <uapi/linux/sockios.h>
#include <uapi/linux/termios.h>

#include "af_unix.h"

static atomic_long_t unix_nr_socks;
static struct hlist_head bsd_socket_buckets[UNIX_HASH_SIZE / 2];
static spinlock_t bsd_socket_locks[UNIX_HASH_SIZE / 2];

/* SMP locking strategy:
 *    hash table is protected with spinlock.
 *    each socket state is protected by separate spinlock.
 */
#ifdef CONFIG_PROVE_LOCKING
#define cmp_ptr(l, r)	(((l) > (r)) - ((l) < (r)))

static int unix_table_lock_cmp_fn(const struct lockdep_map *a,
				  const struct lockdep_map *b)
{
	return cmp_ptr(a, b);
}

static int unix_state_lock_cmp_fn(const struct lockdep_map *_a,
				  const struct lockdep_map *_b)
{
	const struct unix_sock *a, *b;

	a = container_of(_a, struct unix_sock, lock.dep_map);
	b = container_of(_b, struct unix_sock, lock.dep_map);

	if (a->sk.sk_state == TCP_LISTEN) {
		/* unix_stream_connect(): Before the 2nd unix_state_lock(),
		 *
		 *   1. a is TCP_LISTEN.
		 *   2. b is not a.
		 *   3. concurrent connect(b -> a) must fail.
		 *
		 * Except for 2. & 3., the b's state can be any possible
		 * value due to concurrent connect() or listen().
		 *
		 * 2. is detected in debug_spin_lock_before(), and 3. cannot
		 * be expressed as lock_cmp_fn.
		 */
		switch (b->sk.sk_state) {
		case TCP_CLOSE:
		case TCP_ESTABLISHED:
		case TCP_LISTEN:
			return -1;
		default:
			/* Invalid case. */
			return 0;
		}
	}

	/* Should never happen.  Just to be symmetric. */
	if (b->sk.sk_state == TCP_LISTEN) {
		switch (a->sk.sk_state) {
		case TCP_CLOSE:
		case TCP_ESTABLISHED:
			return 1;
		default:
			return 0;
		}
	}

	/* unix_state_double_lock(): ascending address order. */
	return cmp_ptr(a, b);
}

static int unix_recvq_lock_cmp_fn(const struct lockdep_map *_a,
				  const struct lockdep_map *_b)
{
	const struct sock *a, *b;

	a = container_of(_a, struct sock, sk_receive_queue.lock.dep_map);
	b = container_of(_b, struct sock, sk_receive_queue.lock.dep_map);

	/* unix_collect_skb(): listener -> embryo order. */
	if (a->sk_state == TCP_LISTEN && unix_sk(b)->listener == a)
		return -1;

	/* Should never happen.  Just to be symmetric. */
	if (b->sk_state == TCP_LISTEN && unix_sk(a)->listener == b)
		return 1;

	return 0;
}
#endif
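
/* Rough hash-space layout implemented by the helpers below (a summary,
 * not an extra invariant): unbound sockets and pathname (BSD) sockets
 * hash into [0, UNIX_HASH_MOD] of the per-netns table, abstract names
 * into [UNIX_HASH_MOD + 1, 2 * UNIX_HASH_MOD + 1].  Pathname sockets
 * are additionally chained in bsd_socket_buckets, keyed by inode, so
 * unix_find_socket_byinode() can find them without a name.
 */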

static unsigned int unix_unbound_hash(struct sock *sk)
{
	unsigned long hash = (unsigned long)sk;

	hash ^= hash >> 16;
	hash ^= hash >> 8;
	hash ^= sk->sk_type;

	return hash & UNIX_HASH_MOD;
}

static unsigned int unix_bsd_hash(struct inode *i)
{
	return i->i_ino & UNIX_HASH_MOD;
}

static unsigned int unix_abstract_hash(struct sockaddr_un *sunaddr,
				       int addr_len, int type)
{
	__wsum csum = csum_partial(sunaddr, addr_len, 0);
	unsigned int hash;

	hash = (__force unsigned int)csum_fold(csum);
	hash ^= hash >> 8;
	hash ^= type;

	return UNIX_HASH_MOD + 1 + (hash & UNIX_HASH_MOD);
}

static void unix_table_double_lock(struct net *net,
				   unsigned int hash1, unsigned int hash2)
{
	if (hash1 == hash2) {
		spin_lock(&net->unx.table.locks[hash1]);
		return;
	}

	if (hash1 > hash2)
		swap(hash1, hash2);

	spin_lock(&net->unx.table.locks[hash1]);
	spin_lock(&net->unx.table.locks[hash2]);
}

static void unix_table_double_unlock(struct net *net,
				     unsigned int hash1, unsigned int hash2)
{
	if (hash1 == hash2) {
		spin_unlock(&net->unx.table.locks[hash1]);
		return;
	}

	spin_unlock(&net->unx.table.locks[hash1]);
	spin_unlock(&net->unx.table.locks[hash2]);
}

#ifdef CONFIG_SECURITY_NETWORK
static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{
	UNIXCB(skb).secid = scm->secid;
}

static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{
	scm->secid = UNIXCB(skb).secid;
}

static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
{
	return (scm->secid == UNIXCB(skb).secid);
}
#else
static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{ }

static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{ }

static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
{
	return true;
}
#endif /* CONFIG_SECURITY_NETWORK */

static inline int unix_may_send(struct sock *sk, struct sock *osk)
{
	return !unix_peer(osk) || unix_peer(osk) == sk;
}

static inline int unix_recvq_full_lockless(const struct sock *sk)
{
	return skb_queue_len_lockless(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
}

struct sock *unix_peer_get(struct sock *s)
{
	struct sock *peer;

	unix_state_lock(s);
	peer = unix_peer(s);
	if (peer)
		sock_hold(peer);
	unix_state_unlock(s);
	return peer;
}
EXPORT_SYMBOL_GPL(unix_peer_get);

static struct unix_address *unix_create_addr(struct sockaddr_un *sunaddr,
					     int addr_len)
{
	struct unix_address *addr;

	addr = kmalloc(sizeof(*addr) + addr_len, GFP_KERNEL);
	if (!addr)
		return NULL;

	refcount_set(&addr->refcnt, 1);
	addr->len = addr_len;
	memcpy(addr->name, sunaddr, addr_len);

	return addr;
}

static inline void unix_release_addr(struct unix_address *addr)
{
	if (refcount_dec_and_test(&addr->refcnt))
		kfree(addr);
}

/*
 *	Check unix socket name:
 *		- should be not zero length.
 *		- if started by not zero, should be NULL terminated (FS object)
 *		- if started by zero, it is abstract name.
 */

static int unix_validate_addr(struct sockaddr_un *sunaddr, int addr_len)
{
	if (addr_len <= offsetof(struct sockaddr_un, sun_path) ||
	    addr_len > sizeof(*sunaddr))
		return -EINVAL;

	if (sunaddr->sun_family != AF_UNIX)
		return -EINVAL;

	return 0;
}

static int unix_mkname_bsd(struct sockaddr_un *sunaddr, int addr_len)
{
	struct sockaddr_storage *addr = (struct sockaddr_storage *)sunaddr;
	short offset = offsetof(struct sockaddr_storage, __data);

	BUILD_BUG_ON(offset != offsetof(struct sockaddr_un, sun_path));

	/* This may look like an off by one error but it is a bit more
	 * subtle.  108 is the longest valid AF_UNIX path for a binding.
	 * sun_path[108] doesn't as such exist.  However in kernel space
	 * we are guaranteed that it is a valid memory location in our
	 * kernel address buffer because syscall functions always pass
	 * a pointer of struct sockaddr_storage which has a bigger buffer
	 * than 108.  Also, we must terminate sun_path for strlen() in
	 * getname_kernel().
	 */
	addr->__data[addr_len - offset] = 0;

	/* Don't pass sunaddr->sun_path to strlen().  Otherwise, 108 will
	 * cause panic if CONFIG_FORTIFY_SOURCE=y.  Let __fortify_strlen()
	 * know the actual buffer.
	 */
	return strlen(addr->__data) + offset + 1;
}
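
/* For illustration: a caller binding "/tmp/x" (7 bytes including the NUL)
 * with addr_len = offsetof(struct sockaddr_un, sun_path) + 7 gets the
 * same length back, because the length is recomputed from the first NUL
 * found in sun_path.  Example values only; see unix_bind_bsd() and
 * unix_find_bsd() for the real callers.
 */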

static void __unix_remove_socket(struct sock *sk)
{
	sk_del_node_init(sk);
}

static void __unix_insert_socket(struct net *net, struct sock *sk)
{
	DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
	sk_add_node(sk, &net->unx.table.buckets[sk->sk_hash]);
}

static void __unix_set_addr_hash(struct net *net, struct sock *sk,
				 struct unix_address *addr, unsigned int hash)
{
	__unix_remove_socket(sk);
	smp_store_release(&unix_sk(sk)->addr, addr);

	sk->sk_hash = hash;
	__unix_insert_socket(net, sk);
}

static void unix_remove_socket(struct net *net, struct sock *sk)
{
	spin_lock(&net->unx.table.locks[sk->sk_hash]);
	__unix_remove_socket(sk);
	spin_unlock(&net->unx.table.locks[sk->sk_hash]);
}

static void unix_insert_unbound_socket(struct net *net, struct sock *sk)
{
	spin_lock(&net->unx.table.locks[sk->sk_hash]);
	__unix_insert_socket(net, sk);
	spin_unlock(&net->unx.table.locks[sk->sk_hash]);
}

static void unix_insert_bsd_socket(struct sock *sk)
{
	spin_lock(&bsd_socket_locks[sk->sk_hash]);
	sk_add_bind_node(sk, &bsd_socket_buckets[sk->sk_hash]);
	spin_unlock(&bsd_socket_locks[sk->sk_hash]);
}

static void unix_remove_bsd_socket(struct sock *sk)
{
	if (!hlist_unhashed(&sk->sk_bind_node)) {
		spin_lock(&bsd_socket_locks[sk->sk_hash]);
		__sk_del_bind_node(sk);
		spin_unlock(&bsd_socket_locks[sk->sk_hash]);

		sk_node_init(&sk->sk_bind_node);
	}
}

static struct sock *__unix_find_socket_byname(struct net *net,
					      struct sockaddr_un *sunname,
					      int len, unsigned int hash)
{
	struct sock *s;

	sk_for_each(s, &net->unx.table.buckets[hash]) {
		struct unix_sock *u = unix_sk(s);

		if (u->addr->len == len &&
		    !memcmp(u->addr->name, sunname, len))
			return s;
	}
	return NULL;
}

static inline struct sock *unix_find_socket_byname(struct net *net,
						   struct sockaddr_un *sunname,
						   int len, unsigned int hash)
{
	struct sock *s;

	spin_lock(&net->unx.table.locks[hash]);
	s = __unix_find_socket_byname(net, sunname, len, hash);
	if (s)
		sock_hold(s);
	spin_unlock(&net->unx.table.locks[hash]);
	return s;
}

static struct sock *unix_find_socket_byinode(struct inode *i)
{
	unsigned int hash = unix_bsd_hash(i);
	struct sock *s;

	spin_lock(&bsd_socket_locks[hash]);
	sk_for_each_bound(s, &bsd_socket_buckets[hash]) {
		struct dentry *dentry = unix_sk(s)->path.dentry;

		if (dentry && d_backing_inode(dentry) == i) {
			sock_hold(s);
			spin_unlock(&bsd_socket_locks[hash]);
			return s;
		}
	}
	spin_unlock(&bsd_socket_locks[hash]);
	return NULL;
}

/* Support code for asymmetrically connected dgram sockets
 *
 * If a datagram socket is connected to a socket not itself connected
 * to the first socket (eg, /dev/log), clients may only enqueue more
 * messages if the present receive queue of the server socket is not
 * "too large".  This means there's a second writeability condition
 * poll and sendmsg need to test.  The dgram recv code will do a wake
 * up on the peer_wait wait queue of a socket upon reception of a
 * datagram which needs to be propagated to sleeping would-be writers
 * since these might not have sent anything so far.  This can't be
 * accomplished via poll_wait because the lifetime of the server
 * socket might be less than that of its clients if these break their
 * association with it or if the server socket is closed while clients
 * are still connected to it and there's no way to inform "a polling
 * implementation" that it should let go of a certain wait queue
 *
 * In order to propagate a wake up, a wait_queue_entry_t of the client
 * socket is enqueued on the peer_wait queue of the server socket
 * whose wake function does a wake_up on the ordinary client socket
 * wait queue.  This connection is established whenever a write (or
 * poll for write) hits the flow control condition and broken when the
 * association to the server socket is dissolved or after a wake up
 * was relayed.
 */
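
/* Rough sequence of the relay described above (a summary of the code
 * below, not an extra invariant): a sender that finds the peer's
 * receive queue full calls unix_dgram_peer_wake_me(), which parks its
 * peer_wake entry on the peer's peer_wait queue; when the receiver
 * dequeues a datagram and wakes peer_wait, unix_dgram_peer_wake_relay()
 * drops that entry again and wakes the sender's own wait queue so
 * poll()/sendmsg() can retry.
 */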

static int unix_dgram_peer_wake_relay(wait_queue_entry_t *q, unsigned mode, int flags,
				      void *key)
{
	struct unix_sock *u;
	wait_queue_head_t *u_sleep;

	u = container_of(q, struct unix_sock, peer_wake);

	__remove_wait_queue(&unix_sk(u->peer_wake.private)->peer_wait,
			    q);
	u->peer_wake.private = NULL;

	/* relaying can only happen while the wq still exists */
	u_sleep = sk_sleep(&u->sk);
	if (u_sleep)
		wake_up_interruptible_poll(u_sleep, key_to_poll(key));

	return 0;
}

static int unix_dgram_peer_wake_connect(struct sock *sk, struct sock *other)
{
	struct unix_sock *u, *u_other;
	int rc;

	u = unix_sk(sk);
	u_other = unix_sk(other);
	rc = 0;
	spin_lock(&u_other->peer_wait.lock);

	if (!u->peer_wake.private) {
		u->peer_wake.private = other;
		__add_wait_queue(&u_other->peer_wait, &u->peer_wake);

		rc = 1;
	}

	spin_unlock(&u_other->peer_wait.lock);
	return rc;
}

static void unix_dgram_peer_wake_disconnect(struct sock *sk,
					    struct sock *other)
{
	struct unix_sock *u, *u_other;

	u = unix_sk(sk);
	u_other = unix_sk(other);
	spin_lock(&u_other->peer_wait.lock);

	if (u->peer_wake.private == other) {
		__remove_wait_queue(&u_other->peer_wait, &u->peer_wake);
		u->peer_wake.private = NULL;
	}

	spin_unlock(&u_other->peer_wait.lock);
}

static void unix_dgram_peer_wake_disconnect_wakeup(struct sock *sk,
						   struct sock *other)
{
	unix_dgram_peer_wake_disconnect(sk, other);
	wake_up_interruptible_poll(sk_sleep(sk),
				   EPOLLOUT |
				   EPOLLWRNORM |
				   EPOLLWRBAND);
}

/* preconditions:
 *	- unix_peer(sk) == other
 *	- association is stable
 */
static int unix_dgram_peer_wake_me(struct sock *sk, struct sock *other)
{
	int connected;

	connected = unix_dgram_peer_wake_connect(sk, other);

	/* If other is SOCK_DEAD, we want to make sure we signal
	 * POLLOUT, such that a subsequent write() can get a
	 * -ECONNREFUSED. Otherwise, if we haven't queued any skbs
	 * to other and it's full, we will hang waiting for POLLOUT.
	 */
	if (unix_recvq_full_lockless(other) && !sock_flag(other, SOCK_DEAD))
		return 1;

	if (connected)
		unix_dgram_peer_wake_disconnect(sk, other);

	return 0;
}

static int unix_writable(const struct sock *sk, unsigned char state)
{
	return state != TCP_LISTEN &&
		(refcount_read(&sk->sk_wmem_alloc) << 2) <= READ_ONCE(sk->sk_sndbuf);
}

static void unix_write_space(struct sock *sk)
{
	struct socket_wq *wq;

	rcu_read_lock();
	if (unix_writable(sk, READ_ONCE(sk->sk_state))) {
		wq = rcu_dereference(sk->sk_wq);
		if (skwq_has_sleeper(wq))
			wake_up_interruptible_sync_poll(&wq->wait,
				EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND);
		sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT);
	}
	rcu_read_unlock();
}

/* When dgram socket disconnects (or changes its peer), we clear its receive
 * queue of packets arrived from previous peer. First, it allows to do
 * flow control based only on wmem_alloc; second, sk connected to peer
 * may receive messages only from that peer.
 */
static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
{
	if (!skb_queue_empty(&sk->sk_receive_queue)) {
		skb_queue_purge_reason(&sk->sk_receive_queue,
				       SKB_DROP_REASON_UNIX_DISCONNECT);

		wake_up_interruptible_all(&unix_sk(sk)->peer_wait);

		/* If one link of bidirectional dgram pipe is disconnected,
		 * we signal error. Messages are lost. Do not make this,
		 * when peer was not connected to us.
		 */
		if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
			WRITE_ONCE(other->sk_err, ECONNRESET);
			sk_error_report(other);
		}
	}
}

static void unix_sock_destructor(struct sock *sk)
{
	struct unix_sock *u = unix_sk(sk);

	skb_queue_purge_reason(&sk->sk_receive_queue, SKB_DROP_REASON_SOCKET_CLOSE);

	DEBUG_NET_WARN_ON_ONCE(refcount_read(&sk->sk_wmem_alloc));
	DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
	DEBUG_NET_WARN_ON_ONCE(sk->sk_socket);
	if (!sock_flag(sk, SOCK_DEAD)) {
		pr_info("Attempt to release alive unix socket: %p\n", sk);
		return;
	}

	if (u->addr)
		unix_release_addr(u->addr);

	atomic_long_dec(&unix_nr_socks);
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
#ifdef UNIX_REFCNT_DEBUG
	pr_debug("UNIX %p is destroyed, %ld are still alive.\n", sk,
		 atomic_long_read(&unix_nr_socks));
#endif
}

static void unix_release_sock(struct sock *sk, int embrion)
{
	struct unix_sock *u = unix_sk(sk);
	struct sock *skpair;
	struct sk_buff *skb;
	struct path path;
	int state;

	unix_remove_socket(sock_net(sk), sk);
	unix_remove_bsd_socket(sk);

	/* Clear state */
	unix_state_lock(sk);
	sock_orphan(sk);
	WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK);
	path = u->path;
	u->path.dentry = NULL;
	u->path.mnt = NULL;
	state = sk->sk_state;
	WRITE_ONCE(sk->sk_state, TCP_CLOSE);

	skpair = unix_peer(sk);
	unix_peer(sk) = NULL;

	unix_state_unlock(sk);

#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
	u->oob_skb = NULL;
#endif

	wake_up_interruptible_all(&u->peer_wait);

	if (skpair != NULL) {
		if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
			unix_state_lock(skpair);
			/* No more writes */
			WRITE_ONCE(skpair->sk_shutdown, SHUTDOWN_MASK);
			if (!skb_queue_empty_lockless(&sk->sk_receive_queue) || embrion)
				WRITE_ONCE(skpair->sk_err, ECONNRESET);
			unix_state_unlock(skpair);
			skpair->sk_state_change(skpair);
			sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
		}

		unix_dgram_peer_wake_disconnect(sk, skpair);
		sock_put(skpair); /* It may now die */
	}

	/* Try to flush out this socket. Throw out buffers at least */

	while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
		if (state == TCP_LISTEN)
			unix_release_sock(skb->sk, 1);

		/* passed fds are erased in the kfree_skb hook */
		kfree_skb_reason(skb, SKB_DROP_REASON_SOCKET_CLOSE);
	}

	if (path.dentry)
		path_put(&path);

	sock_put(sk);

	/* ---- Socket is dead now and most probably destroyed ---- */

	/*
	 * Fixme: BSD difference: In BSD all sockets connected to us get
	 *	  ECONNRESET and we die on the spot. In Linux we behave
	 *	  like files and pipes do and wait for the last
	 *	  dereference.
	 *
	 * Can't we simply set sock->err?
	 *
	 * What the above comment does talk about? --ANK(980817)
	 */

	if (READ_ONCE(unix_tot_inflight))
		unix_gc();	/* Garbage collect fds */
}

static void init_peercred(struct sock *sk)
{
	sk->sk_peer_pid = get_pid(task_tgid(current));
	sk->sk_peer_cred = get_current_cred();
}

static void update_peercred(struct sock *sk)
{
	const struct cred *old_cred;
	struct pid *old_pid;

	spin_lock(&sk->sk_peer_lock);
	old_pid = sk->sk_peer_pid;
	old_cred = sk->sk_peer_cred;
	init_peercred(sk);
	spin_unlock(&sk->sk_peer_lock);

	put_pid(old_pid);
	put_cred(old_cred);
}

static void copy_peercred(struct sock *sk, struct sock *peersk)
{
	lockdep_assert_held(&unix_sk(peersk)->lock);

	spin_lock(&sk->sk_peer_lock);
	sk->sk_peer_pid = get_pid(peersk->sk_peer_pid);
	sk->sk_peer_cred = get_cred(peersk->sk_peer_cred);
	spin_unlock(&sk->sk_peer_lock);
}

static int unix_listen(struct socket *sock, int backlog)
{
	int err;
	struct sock *sk = sock->sk;
	struct unix_sock *u = unix_sk(sk);

	err = -EOPNOTSUPP;
	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
		goto out;	/* Only stream/seqpacket sockets accept */
	err = -EINVAL;
	if (!READ_ONCE(u->addr))
		goto out;	/* No listens on an unbound socket */
	unix_state_lock(sk);
	if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
		goto out_unlock;
	if (backlog > sk->sk_max_ack_backlog)
		wake_up_interruptible_all(&u->peer_wait);
	sk->sk_max_ack_backlog = backlog;
	WRITE_ONCE(sk->sk_state, TCP_LISTEN);

	/* set credentials so connect can copy them */
	update_peercred(sk);
	err = 0;

out_unlock:
	unix_state_unlock(sk);
out:
	return err;
}

static int unix_release(struct socket *);
static int unix_bind(struct socket *, struct sockaddr *, int);
static int unix_stream_connect(struct socket *, struct sockaddr *,
			       int addr_len, int flags);
static int unix_socketpair(struct socket *, struct socket *);
static int unix_accept(struct socket *, struct socket *, struct proto_accept_arg *arg);
static int unix_getname(struct socket *, struct sockaddr *, int);
static __poll_t unix_poll(struct file *, struct socket *, poll_table *);
static __poll_t unix_dgram_poll(struct file *, struct socket *,
				poll_table *);
static int unix_ioctl(struct socket *, unsigned int, unsigned long);
#ifdef CONFIG_COMPAT
static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
#endif
static int unix_shutdown(struct socket *, int);
static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t);
static int unix_stream_recvmsg(struct socket *, struct msghdr *, size_t, int);
static ssize_t unix_stream_splice_read(struct socket *, loff_t *ppos,
				       struct pipe_inode_info *, size_t size,
				       unsigned int flags);
static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t);
static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int);
static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
static int unix_dgram_connect(struct socket *, struct sockaddr *,
			      int, int);
static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t);
static int unix_seqpacket_recvmsg(struct socket *, struct msghdr *, size_t,
				  int);

#ifdef CONFIG_PROC_FS
static int unix_count_nr_fds(struct sock *sk)
{
	struct sk_buff *skb;
	struct unix_sock *u;
	int nr_fds = 0;

	spin_lock(&sk->sk_receive_queue.lock);
	skb = skb_peek(&sk->sk_receive_queue);
	while (skb) {
		u = unix_sk(skb->sk);
		nr_fds += atomic_read(&u->scm_stat.nr_fds);
		skb = skb_peek_next(skb, &sk->sk_receive_queue);
	}
	spin_unlock(&sk->sk_receive_queue.lock);

	return nr_fds;
}

static void unix_show_fdinfo(struct seq_file *m, struct socket *sock)
{
	struct sock *sk = sock->sk;
	unsigned char s_state;
	struct unix_sock *u;
	int nr_fds = 0;

	if (sk) {
		s_state = READ_ONCE(sk->sk_state);
		u = unix_sk(sk);

		/* SOCK_STREAM and SOCK_SEQPACKET sockets never change their
		 * sk_state after switching to TCP_ESTABLISHED or TCP_LISTEN.
		 * SOCK_DGRAM is ordinary. So, no lock is needed.
		 */
		if (sock->type == SOCK_DGRAM || s_state == TCP_ESTABLISHED)
			nr_fds = atomic_read(&u->scm_stat.nr_fds);
		else if (s_state == TCP_LISTEN)
			nr_fds = unix_count_nr_fds(sk);

		seq_printf(m, "scm_fds: %u\n", nr_fds);
	}
}
#else
#define unix_show_fdinfo NULL
#endif

static const struct proto_ops unix_stream_ops = {
	.family =	PF_UNIX,
	.owner =	THIS_MODULE,
	.release =	unix_release,
	.bind =		unix_bind,
	.connect =	unix_stream_connect,
	.socketpair =	unix_socketpair,
	.accept =	unix_accept,
	.getname =	unix_getname,
	.poll =		unix_poll,
	.ioctl =	unix_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl =	unix_compat_ioctl,
#endif
	.listen =	unix_listen,
	.shutdown =	unix_shutdown,
	.sendmsg =	unix_stream_sendmsg,
	.recvmsg =	unix_stream_recvmsg,
	.read_skb =	unix_stream_read_skb,
	.mmap =		sock_no_mmap,
	.splice_read =	unix_stream_splice_read,
	.set_peek_off =	sk_set_peek_off,
	.show_fdinfo =	unix_show_fdinfo,
};

static const struct proto_ops unix_dgram_ops = {
	.family =	PF_UNIX,
	.owner =	THIS_MODULE,
	.release =	unix_release,
	.bind =		unix_bind,
	.connect =	unix_dgram_connect,
	.socketpair =	unix_socketpair,
	.accept =	sock_no_accept,
	.getname =	unix_getname,
	.poll =		unix_dgram_poll,
	.ioctl =	unix_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl =	unix_compat_ioctl,
#endif
	.listen =	sock_no_listen,
	.shutdown =	unix_shutdown,
	.sendmsg =	unix_dgram_sendmsg,
	.read_skb =	unix_read_skb,
	.recvmsg =	unix_dgram_recvmsg,
	.mmap =		sock_no_mmap,
	.set_peek_off =	sk_set_peek_off,
	.show_fdinfo =	unix_show_fdinfo,
};

static const struct proto_ops unix_seqpacket_ops = {
	.family =	PF_UNIX,
	.owner =	THIS_MODULE,
	.release =	unix_release,
	.bind =		unix_bind,
	.connect =	unix_stream_connect,
	.socketpair =	unix_socketpair,
	.accept =	unix_accept,
	.getname =	unix_getname,
	.poll =		unix_dgram_poll,
	.ioctl =	unix_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl =	unix_compat_ioctl,
#endif
	.listen =	unix_listen,
	.shutdown =	unix_shutdown,
	.sendmsg =	unix_seqpacket_sendmsg,
	.recvmsg =	unix_seqpacket_recvmsg,
	.mmap =		sock_no_mmap,
	.set_peek_off =	sk_set_peek_off,
	.show_fdinfo =	unix_show_fdinfo,
};

static void unix_close(struct sock *sk, long timeout)
{
	/* Nothing to do here, unix socket does not need a ->close().
	 * This is merely for sockmap.
	 */
}

static void unix_unhash(struct sock *sk)
{
	/* Nothing to do here, unix socket does not need a ->unhash().
	 * This is merely for sockmap.
	 */
}

static bool unix_bpf_bypass_getsockopt(int level, int optname)
{
	if (level == SOL_SOCKET) {
		switch (optname) {
		case SO_PEERPIDFD:
			return true;
		default:
			return false;
		}
	}

	return false;
}

struct proto unix_dgram_proto = {
	.name			= "UNIX",
	.owner			= THIS_MODULE,
	.obj_size		= sizeof(struct unix_sock),
	.close			= unix_close,
	.bpf_bypass_getsockopt	= unix_bpf_bypass_getsockopt,
#ifdef CONFIG_BPF_SYSCALL
	.psock_update_sk_prot	= unix_dgram_bpf_update_proto,
#endif
};

struct proto unix_stream_proto = {
	.name			= "UNIX-STREAM",
	.owner			= THIS_MODULE,
	.obj_size		= sizeof(struct unix_sock),
	.close			= unix_close,
	.unhash			= unix_unhash,
	.bpf_bypass_getsockopt	= unix_bpf_bypass_getsockopt,
#ifdef CONFIG_BPF_SYSCALL
	.psock_update_sk_prot	= unix_stream_bpf_update_proto,
#endif
};

static struct sock *unix_create1(struct net *net, struct socket *sock, int kern, int type)
{
	struct unix_sock *u;
	struct sock *sk;
	int err;

	atomic_long_inc(&unix_nr_socks);
	if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files()) {
		err = -ENFILE;
		goto err;
	}

	if (type == SOCK_STREAM)
		sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_stream_proto, kern);
	else /* dgram and seqpacket */
		sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_dgram_proto, kern);

	if (!sk) {
		err = -ENOMEM;
		goto err;
	}

	sock_init_data(sock, sk);

	sk->sk_hash		= unix_unbound_hash(sk);
	sk->sk_allocation	= GFP_KERNEL_ACCOUNT;
	sk->sk_write_space	= unix_write_space;
	sk->sk_max_ack_backlog	= READ_ONCE(net->unx.sysctl_max_dgram_qlen);
	sk->sk_destruct		= unix_sock_destructor;
	lock_set_cmp_fn(&sk->sk_receive_queue.lock, unix_recvq_lock_cmp_fn, NULL);

	u = unix_sk(sk);
	u->listener = NULL;
	u->vertex = NULL;
	u->path.dentry = NULL;
	u->path.mnt = NULL;
	spin_lock_init(&u->lock);
	lock_set_cmp_fn(&u->lock, unix_state_lock_cmp_fn, NULL);
	mutex_init(&u->iolock); /* single task reading lock */
	mutex_init(&u->bindlock); /* single task binding lock */
	init_waitqueue_head(&u->peer_wait);
	init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay);
	memset(&u->scm_stat, 0, sizeof(struct scm_stat));
	unix_insert_unbound_socket(net, sk);

	sock_prot_inuse_add(net, sk->sk_prot, 1);

	return sk;

err:
	atomic_long_dec(&unix_nr_socks);
	return ERR_PTR(err);
}

static int unix_create(struct net *net, struct socket *sock, int protocol,
		       int kern)
{
	struct sock *sk;

	if (protocol && protocol != PF_UNIX)
		return -EPROTONOSUPPORT;

	sock->state = SS_UNCONNECTED;

	switch (sock->type) {
	case SOCK_STREAM:
		sock->ops = &unix_stream_ops;
		break;
	/*
	 *	Believe it or not BSD has AF_UNIX, SOCK_RAW though
	 *	nothing uses it.
	 */
	case SOCK_RAW:
		sock->type = SOCK_DGRAM;
		fallthrough;
	case SOCK_DGRAM:
		sock->ops = &unix_dgram_ops;
		break;
	case SOCK_SEQPACKET:
		sock->ops = &unix_seqpacket_ops;
		break;
	default:
		return -ESOCKTNOSUPPORT;
	}

	sk = unix_create1(net, sock, kern, sock->type);
	if (IS_ERR(sk))
		return PTR_ERR(sk);

	return 0;
}

static int unix_release(struct socket *sock)
{
	struct sock *sk = sock->sk;

	if (!sk)
		return 0;

	sk->sk_prot->close(sk, 0);
	unix_release_sock(sk, 0);
	sock->sk = NULL;

	return 0;
}

static struct sock *unix_find_bsd(struct sockaddr_un *sunaddr, int addr_len,
				  int type)
{
	struct inode *inode;
	struct path path;
	struct sock *sk;
	int err;

	unix_mkname_bsd(sunaddr, addr_len);
	err = kern_path(sunaddr->sun_path, LOOKUP_FOLLOW, &path);
	if (err)
		goto fail;

	err = path_permission(&path, MAY_WRITE);
	if (err)
		goto path_put;

	err = -ECONNREFUSED;
	inode = d_backing_inode(path.dentry);
	if (!S_ISSOCK(inode->i_mode))
		goto path_put;

	sk = unix_find_socket_byinode(inode);
	if (!sk)
		goto path_put;

	err = -EPROTOTYPE;
	if (sk->sk_type == type)
		touch_atime(&path);
	else
		goto sock_put;

	path_put(&path);

	return sk;

sock_put:
	sock_put(sk);
path_put:
	path_put(&path);
fail:
	return ERR_PTR(err);
}

static struct sock *unix_find_abstract(struct net *net,
				       struct sockaddr_un *sunaddr,
				       int addr_len, int type)
{
	unsigned int hash = unix_abstract_hash(sunaddr, addr_len, type);
	struct dentry *dentry;
	struct sock *sk;

	sk = unix_find_socket_byname(net, sunaddr, addr_len, hash);
	if (!sk)
		return ERR_PTR(-ECONNREFUSED);

	dentry = unix_sk(sk)->path.dentry;
	if (dentry)
		touch_atime(&unix_sk(sk)->path);

	return sk;
}

static struct sock *unix_find_other(struct net *net,
				    struct sockaddr_un *sunaddr,
				    int addr_len, int type)
{
	struct sock *sk;

	if (sunaddr->sun_path[0])
		sk = unix_find_bsd(sunaddr, addr_len, type);
	else
		sk = unix_find_abstract(net, sunaddr, addr_len, type);

	return sk;
}
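
/* unix_autobind() below assigns an abstract name: a leading NUL byte
 * followed by five hex digits from sprintf("%05x", ...), with
 * addr->len = offsetof(struct sockaddr_un, sun_path) + 6.  So a later
 * getsockname() might report e.g. "\0" "00a2f" (example digits only).
 */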
static int unix_autobind(struct sock *sk)
{
	struct unix_sock *u = unix_sk(sk);
	unsigned int new_hash, old_hash;
	struct net *net = sock_net(sk);
	struct unix_address *addr;
	u32 lastnum, ordernum;
	int err;

	err = mutex_lock_interruptible(&u->bindlock);
	if (err)
		return err;

	if (u->addr)
		goto out;

	err = -ENOMEM;
	addr = kzalloc(sizeof(*addr) +
		       offsetof(struct sockaddr_un, sun_path) + 16, GFP_KERNEL);
	if (!addr)
		goto out;

	addr->len = offsetof(struct sockaddr_un, sun_path) + 6;
	addr->name->sun_family = AF_UNIX;
	refcount_set(&addr->refcnt, 1);

	old_hash = sk->sk_hash;
	ordernum = get_random_u32();
	lastnum = ordernum & 0xFFFFF;
retry:
	ordernum = (ordernum + 1) & 0xFFFFF;
	sprintf(addr->name->sun_path + 1, "%05x", ordernum);

	new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
	unix_table_double_lock(net, old_hash, new_hash);

	if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash)) {
		unix_table_double_unlock(net, old_hash, new_hash);

		/* __unix_find_socket_byname() may take long time if many names
		 * are already in use.
		 */
		cond_resched();

		if (ordernum == lastnum) {
			/* Give up if all names seem to be in use. */
			err = -ENOSPC;
			unix_release_addr(addr);
			goto out;
		}

		goto retry;
	}

	__unix_set_addr_hash(net, sk, addr, new_hash);
	unix_table_double_unlock(net, old_hash, new_hash);
	err = 0;

out:	mutex_unlock(&u->bindlock);
	return err;
}

static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr,
			 int addr_len)
{
	umode_t mode = S_IFSOCK |
	       (SOCK_INODE(sk->sk_socket)->i_mode & ~current_umask());
	struct unix_sock *u = unix_sk(sk);
	unsigned int new_hash, old_hash;
	struct net *net = sock_net(sk);
	struct mnt_idmap *idmap;
	struct unix_address *addr;
	struct dentry *dentry;
	struct path parent;
	int err;

	addr_len = unix_mkname_bsd(sunaddr, addr_len);
	addr = unix_create_addr(sunaddr, addr_len);
	if (!addr)
		return -ENOMEM;

	/*
	 * Get the parent directory, calculate the hash for last
	 * component.
	 */
	dentry = kern_path_create(AT_FDCWD, addr->name->sun_path, &parent, 0);
	if (IS_ERR(dentry)) {
		err = PTR_ERR(dentry);
		goto out;
	}

	/*
	 * All right, let's create it.
	 */
	idmap = mnt_idmap(parent.mnt);
	err = security_path_mknod(&parent, dentry, mode, 0);
	if (!err)
		err = vfs_mknod(idmap, d_inode(parent.dentry), dentry, mode, 0);
	if (err)
		goto out_path;
	err = mutex_lock_interruptible(&u->bindlock);
	if (err)
		goto out_unlink;
	if (u->addr)
		goto out_unlock;

	old_hash = sk->sk_hash;
	new_hash = unix_bsd_hash(d_backing_inode(dentry));
	unix_table_double_lock(net, old_hash, new_hash);
	u->path.mnt = mntget(parent.mnt);
	u->path.dentry = dget(dentry);
	__unix_set_addr_hash(net, sk, addr, new_hash);
	unix_table_double_unlock(net, old_hash, new_hash);
	unix_insert_bsd_socket(sk);
	mutex_unlock(&u->bindlock);
	done_path_create(&parent, dentry);
	return 0;

out_unlock:
	mutex_unlock(&u->bindlock);
	err = -EINVAL;
out_unlink:
	/* failed after successful mknod?  unlink what we'd created... */
	vfs_unlink(idmap, d_inode(parent.dentry), dentry, NULL);
out_path:
	done_path_create(&parent, dentry);
out:
	unix_release_addr(addr);
	return err == -EEXIST ? -EADDRINUSE : err;
}

static int unix_bind_abstract(struct sock *sk, struct sockaddr_un *sunaddr,
			      int addr_len)
{
	struct unix_sock *u = unix_sk(sk);
	unsigned int new_hash, old_hash;
	struct net *net = sock_net(sk);
	struct unix_address *addr;
	int err;

	addr = unix_create_addr(sunaddr, addr_len);
	if (!addr)
		return -ENOMEM;

	err = mutex_lock_interruptible(&u->bindlock);
	if (err)
		goto out;

	if (u->addr) {
		err = -EINVAL;
		goto out_mutex;
	}

	old_hash = sk->sk_hash;
	new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
	unix_table_double_lock(net, old_hash, new_hash);

	if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash))
		goto out_spin;

	__unix_set_addr_hash(net, sk, addr, new_hash);
	unix_table_double_unlock(net, old_hash, new_hash);
	mutex_unlock(&u->bindlock);
	return 0;

out_spin:
	unix_table_double_unlock(net, old_hash, new_hash);
	err = -EADDRINUSE;
out_mutex:
	mutex_unlock(&u->bindlock);
out:
	unix_release_addr(addr);
	return err;
}

static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
{
	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
	struct sock *sk = sock->sk;
	int err;

	if (addr_len == offsetof(struct sockaddr_un, sun_path) &&
	    sunaddr->sun_family == AF_UNIX)
		return unix_autobind(sk);

	err = unix_validate_addr(sunaddr, addr_len);
	if (err)
		return err;

	if (sunaddr->sun_path[0])
		err = unix_bind_bsd(sk, sunaddr, addr_len);
	else
		err = unix_bind_abstract(sk, sunaddr, addr_len);

	return err;
}

static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)
{
	if (unlikely(sk1 == sk2) || !sk2) {
		unix_state_lock(sk1);
		return;
	}

	if (sk1 > sk2)
		swap(sk1, sk2);

	unix_state_lock(sk1);
	unix_state_lock(sk2);
}

static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2)
{
	if (unlikely(sk1 == sk2) || !sk2) {
		unix_state_unlock(sk1);
		return;
	}
	unix_state_unlock(sk1);
	unix_state_unlock(sk2);
}

static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
			      int alen, int flags)
{
	struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr;
	struct sock *sk = sock->sk;
	struct sock *other;
	int err;

	err = -EINVAL;
	if (alen < offsetofend(struct sockaddr, sa_family))
		goto out;

	if (addr->sa_family != AF_UNSPEC) {
		err = unix_validate_addr(sunaddr, alen);
		if (err)
			goto out;

		err = BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk, addr, &alen);
		if (err)
			goto out;

		if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
		     test_bit(SOCK_PASSPIDFD, &sock->flags)) &&
		    !READ_ONCE(unix_sk(sk)->addr)) {
			err = unix_autobind(sk);
			if (err)
				goto out;
		}

restart:
		other = unix_find_other(sock_net(sk), sunaddr, alen, sock->type);
		if (IS_ERR(other)) {
			err = PTR_ERR(other);
			goto out;
		}

		unix_state_double_lock(sk, other);

		/* Apparently VFS overslept socket death. Retry. */
		if (sock_flag(other, SOCK_DEAD)) {
			unix_state_double_unlock(sk, other);
			sock_put(other);
			goto restart;
		}

		err = -EPERM;
		if (!unix_may_send(sk, other))
			goto out_unlock;

		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
		if (err)
			goto out_unlock;

		WRITE_ONCE(sk->sk_state, TCP_ESTABLISHED);
		WRITE_ONCE(other->sk_state, TCP_ESTABLISHED);
	} else {
		/*
		 *	1003.1g breaking connected state with AF_UNSPEC
		 */
		other = NULL;
		unix_state_double_lock(sk, other);
	}

	/*
	 * If it was connected, reconnect.
	 */
	if (unix_peer(sk)) {
		struct sock *old_peer = unix_peer(sk);

		unix_peer(sk) = other;
		if (!other)
			WRITE_ONCE(sk->sk_state, TCP_CLOSE);
		unix_dgram_peer_wake_disconnect_wakeup(sk, old_peer);

		unix_state_double_unlock(sk, other);

		if (other != old_peer) {
			unix_dgram_disconnected(sk, old_peer);

			unix_state_lock(old_peer);
			if (!unix_peer(old_peer))
				WRITE_ONCE(old_peer->sk_state, TCP_CLOSE);
			unix_state_unlock(old_peer);
		}

		sock_put(old_peer);
	} else {
		unix_peer(sk) = other;
		unix_state_double_unlock(sk, other);
	}

	return 0;

out_unlock:
	unix_state_double_unlock(sk, other);
	sock_put(other);
out:
	return err;
}

static long unix_wait_for_peer(struct sock *other, long timeo)
{
	struct unix_sock *u = unix_sk(other);
	int sched;
	DEFINE_WAIT(wait);

	prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);

	sched = !sock_flag(other, SOCK_DEAD) &&
		!(other->sk_shutdown & RCV_SHUTDOWN) &&
		unix_recvq_full_lockless(other);

	unix_state_unlock(other);

	if (sched)
		timeo = schedule_timeout(timeo);

	finish_wait(&u->peer_wait, &wait);
	return timeo;
}
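
/* Roughly, unix_stream_connect() below works like this (a summary of
 * the code, not an extra contract): the client allocates an "embryo"
 * sock (newsk) plus a one-byte skb up front, then, with the listener
 * locked, hands the embryo over by queueing that skb (skb->sk == newsk)
 * on the listener's receive queue; unix_accept() later dequeues the skb
 * and grafts the embryo onto the accepting socket.
 */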

static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
			       int addr_len, int flags)
{
	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
	struct sock *sk = sock->sk, *newsk = NULL, *other = NULL;
	struct unix_sock *u = unix_sk(sk), *newu, *otheru;
	struct net *net = sock_net(sk);
	struct sk_buff *skb = NULL;
	unsigned char state;
	long timeo;
	int err;

	err = unix_validate_addr(sunaddr, addr_len);
	if (err)
		goto out;

	err = BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk, uaddr, &addr_len);
	if (err)
		goto out;

	if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
	     test_bit(SOCK_PASSPIDFD, &sock->flags)) &&
	    !READ_ONCE(u->addr)) {
		err = unix_autobind(sk);
		if (err)
			goto out;
	}

	timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);

	/* First of all allocate resources.
	 * If we will make it after state is locked,
	 * we will have to recheck all again in any case.
	 */

	/* create new sock for complete connection */
	newsk = unix_create1(net, NULL, 0, sock->type);
	if (IS_ERR(newsk)) {
		err = PTR_ERR(newsk);
		goto out;
	}

	/* Allocate skb for sending to listening sock */
	skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
	if (!skb) {
		err = -ENOMEM;
		goto out_free_sk;
	}

restart:
	/* Find listening sock. */
	other = unix_find_other(net, sunaddr, addr_len, sk->sk_type);
	if (IS_ERR(other)) {
		err = PTR_ERR(other);
		goto out_free_skb;
	}

	unix_state_lock(other);

	/* Apparently VFS overslept socket death. Retry. */
	if (sock_flag(other, SOCK_DEAD)) {
		unix_state_unlock(other);
		sock_put(other);
		goto restart;
	}

	if (other->sk_state != TCP_LISTEN ||
	    other->sk_shutdown & RCV_SHUTDOWN) {
		err = -ECONNREFUSED;
		goto out_unlock;
	}

	if (unix_recvq_full_lockless(other)) {
		if (!timeo) {
			err = -EAGAIN;
			goto out_unlock;
		}

		timeo = unix_wait_for_peer(other, timeo);
		sock_put(other);

		err = sock_intr_errno(timeo);
		if (signal_pending(current))
			goto out_free_skb;

		goto restart;
	}

	/* self connect and simultaneous connect are eliminated
	 * by rejecting TCP_LISTEN socket to avoid deadlock.
	 */
	state = READ_ONCE(sk->sk_state);
	if (unlikely(state != TCP_CLOSE)) {
		err = state == TCP_ESTABLISHED ? -EISCONN : -EINVAL;
		goto out_unlock;
	}

	unix_state_lock(sk);

	if (unlikely(sk->sk_state != TCP_CLOSE)) {
		err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EINVAL;
		unix_state_unlock(sk);
		goto out_unlock;
	}

	err = security_unix_stream_connect(sk, other, newsk);
	if (err) {
		unix_state_unlock(sk);
		goto out_unlock;
	}

	/* The way is open! Fastly set all the necessary fields... */

	sock_hold(sk);
	unix_peer(newsk)	= sk;
	newsk->sk_state		= TCP_ESTABLISHED;
	newsk->sk_type		= sk->sk_type;
	init_peercred(newsk);
	newu = unix_sk(newsk);
	newu->listener = other;
	RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
	otheru = unix_sk(other);

	/* copy address information from listening to new sock
	 *
	 * The contents of *(otheru->addr) and otheru->path
	 * are seen fully set up here, since we have found
	 * otheru in hash under its lock.  Insertion into the
	 * hash chain we'd found it in had been done in an
	 * earlier critical area protected by the chain's lock,
	 * the same one where we'd set *(otheru->addr) contents,
	 * as well as otheru->path and otheru->addr itself.
	 *
	 * Using smp_store_release() here to set newu->addr
	 * is enough to make those stores, as well as stores
	 * to newu->path visible to anyone who gets newu->addr
	 * by smp_load_acquire().  IOW, the same warranties
	 * as for unix_sock instances bound in unix_bind() or
	 * in unix_autobind().
	 */
	if (otheru->path.dentry) {
		path_get(&otheru->path);
		newu->path = otheru->path;
	}
	refcount_inc(&otheru->addr->refcnt);
	smp_store_release(&newu->addr, otheru->addr);

	/* Set credentials */
	copy_peercred(sk, other);

	sock->state	= SS_CONNECTED;
	WRITE_ONCE(sk->sk_state, TCP_ESTABLISHED);
	sock_hold(newsk);

	smp_mb__after_atomic();	/* sock_hold() does an atomic_inc() */
	unix_peer(sk)	= newsk;

	unix_state_unlock(sk);

	/* take ten and send info to listening sock */
	spin_lock(&other->sk_receive_queue.lock);
	__skb_queue_tail(&other->sk_receive_queue, skb);
	spin_unlock(&other->sk_receive_queue.lock);
	unix_state_unlock(other);
	other->sk_data_ready(other);
	sock_put(other);
	return 0;

out_unlock:
	unix_state_unlock(other);
	sock_put(other);
out_free_skb:
	consume_skb(skb);
out_free_sk:
	unix_release_sock(newsk, 0);
out:
	return err;
}

static int unix_socketpair(struct socket *socka, struct socket *sockb)
{
	struct sock *ska = socka->sk, *skb = sockb->sk;

	/* Join our sockets back to back */
	sock_hold(ska);
	sock_hold(skb);
	unix_peer(ska) = skb;
	unix_peer(skb) = ska;
	init_peercred(ska);
	init_peercred(skb);

	ska->sk_state = TCP_ESTABLISHED;
	skb->sk_state = TCP_ESTABLISHED;
	socka->state  = SS_CONNECTED;
	sockb->state  = SS_CONNECTED;
	return 0;
}

static void unix_sock_inherit_flags(const struct socket *old,
				    struct socket *new)
{
	if (test_bit(SOCK_PASSCRED, &old->flags))
		set_bit(SOCK_PASSCRED, &new->flags);
	if (test_bit(SOCK_PASSPIDFD, &old->flags))
		set_bit(SOCK_PASSPIDFD, &new->flags);
	if (test_bit(SOCK_PASSSEC, &old->flags))
		set_bit(SOCK_PASSSEC, &new->flags);
}
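
/* The skb queued by unix_stream_connect() carries the new connection:
 * skb->sk is the embryo socket, so skb_recv_datagram() below
 * effectively dequeues one pending connection per call.
 */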
static int unix_accept(struct socket *sock, struct socket *newsock,
		       struct proto_accept_arg *arg)
{
	struct sock *sk = sock->sk;
	struct sk_buff *skb;
	struct sock *tsk;

	arg->err = -EOPNOTSUPP;
	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
		goto out;

	arg->err = -EINVAL;
	if (READ_ONCE(sk->sk_state) != TCP_LISTEN)
		goto out;

	/* If socket state is TCP_LISTEN it cannot change (for now...),
	 * so that no locks are necessary.
	 */

	skb = skb_recv_datagram(sk, (arg->flags & O_NONBLOCK) ? MSG_DONTWAIT : 0,
				&arg->err);
	if (!skb) {
		/* This means receive shutdown. */
		if (arg->err == 0)
			arg->err = -EINVAL;
		goto out;
	}

	tsk = skb->sk;
	skb_free_datagram(sk, skb);
	wake_up_interruptible(&unix_sk(sk)->peer_wait);

	/* attach accepted sock to socket */
	unix_state_lock(tsk);
	unix_update_edges(unix_sk(tsk));
	newsock->state = SS_CONNECTED;
	unix_sock_inherit_flags(sock, newsock);
	sock_graft(tsk, newsock);
	unix_state_unlock(tsk);
	return 0;

out:
	return arg->err;
}


static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int peer)
{
	struct sock *sk = sock->sk;
	struct unix_address *addr;
	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr);
	int err = 0;

	if (peer) {
		sk = unix_peer_get(sk);

		err = -ENOTCONN;
		if (!sk)
			goto out;
		err = 0;
	} else {
		sock_hold(sk);
	}

	addr = smp_load_acquire(&unix_sk(sk)->addr);
	if (!addr) {
		sunaddr->sun_family = AF_UNIX;
		sunaddr->sun_path[0] = 0;
		err = offsetof(struct sockaddr_un, sun_path);
	} else {
		err = addr->len;
		memcpy(sunaddr, addr->name, addr->len);

		if (peer)
			BPF_CGROUP_RUN_SA_PROG(sk, uaddr, &err,
					       CGROUP_UNIX_GETPEERNAME);
		else
			BPF_CGROUP_RUN_SA_PROG(sk, uaddr, &err,
					       CGROUP_UNIX_GETSOCKNAME);
	}
	sock_put(sk);
out:
	return err;
}

/* The "user->unix_inflight" variable is protected by the garbage
 * collection lock, and we just read it locklessly here. If you go
 * over the limit, there might be a tiny race in actually noticing
 * it across threads. Tough.
 */
static inline bool too_many_unix_fds(struct task_struct *p)
{
	struct user_struct *user = current_user();

	if (unlikely(READ_ONCE(user->unix_inflight) > task_rlimit(p, RLIMIT_NOFILE)))
		return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN);
	return false;
}

static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
{
	if (too_many_unix_fds(current))
		return -ETOOMANYREFS;

	UNIXCB(skb).fp = scm->fp;
	scm->fp = NULL;

	if (unix_prepare_fpl(UNIXCB(skb).fp))
		return -ENOMEM;

	return 0;
}

static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
{
	scm->fp = UNIXCB(skb).fp;
	UNIXCB(skb).fp = NULL;

	unix_destroy_fpl(scm->fp);
}

static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb)
{
	scm->fp = scm_fp_dup(UNIXCB(skb).fp);
}

static void unix_destruct_scm(struct sk_buff *skb)
{
	struct scm_cookie scm;

	memset(&scm, 0, sizeof(scm));
	scm.pid = UNIXCB(skb).pid;
	if (UNIXCB(skb).fp)
		unix_detach_fds(&scm, skb);

	/* Alas, it calls VFS */
	/* So fscking what? fput() had been SMP-safe since the last Summer */
	scm_destroy(&scm);
	sock_wfree(skb);
}

static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
{
	int err = 0;

	UNIXCB(skb).pid = get_pid(scm->pid);
	UNIXCB(skb).uid = scm->creds.uid;
	UNIXCB(skb).gid = scm->creds.gid;
	UNIXCB(skb).fp = NULL;
	unix_get_secdata(scm, skb);
	if (scm->fp && send_fds)
		err = unix_attach_fds(scm, skb);

	skb->destructor = unix_destruct_scm;
	return err;
}

static bool unix_passcred_enabled(const struct socket *sock,
				  const struct sock *other)
{
	return test_bit(SOCK_PASSCRED, &sock->flags) ||
	       test_bit(SOCK_PASSPIDFD, &sock->flags) ||
	       !other->sk_socket ||
	       test_bit(SOCK_PASSCRED, &other->sk_socket->flags) ||
	       test_bit(SOCK_PASSPIDFD, &other->sk_socket->flags);
}

/*
 * Some apps rely on write() giving SCM_CREDENTIALS
 * We include credentials if source or destination socket
 * asserted SOCK_PASSCRED.
 */
static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock,
			    const struct sock *other)
{
	if (UNIXCB(skb).pid)
		return;
	if (unix_passcred_enabled(sock, other)) {
		UNIXCB(skb).pid = get_pid(task_tgid(current));
		current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid);
	}
}

static bool unix_skb_scm_eq(struct sk_buff *skb,
			    struct scm_cookie *scm)
{
	return UNIXCB(skb).pid == scm->pid &&
	       uid_eq(UNIXCB(skb).uid, scm->creds.uid) &&
	       gid_eq(UNIXCB(skb).gid, scm->creds.gid) &&
	       unix_secdata_eq(scm, skb);
}

static void scm_stat_add(struct sock *sk, struct sk_buff *skb)
{
	struct scm_fp_list *fp = UNIXCB(skb).fp;
	struct unix_sock *u = unix_sk(sk);

	if (unlikely(fp && fp->count)) {
		atomic_add(fp->count, &u->scm_stat.nr_fds);
		unix_add_edges(fp, u);
	}
}

static void scm_stat_del(struct sock *sk, struct sk_buff *skb)
{
	struct scm_fp_list *fp = UNIXCB(skb).fp;
	struct unix_sock *u = unix_sk(sk);

	if (unlikely(fp && fp->count)) {
		atomic_sub(fp->count, &u->scm_stat.nr_fds);
		unix_del_edges(fp);
	}
}

/*
 *	Send AF_UNIX data.
 */
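
/* A note on the sendmsg path below: for a pair of sockets connected to
 * each other, flow control relies on wmem accounting alone; the
 * receive-queue-length check and the peer_wake machinery above only
 * apply when the destination is not connected back to the sender
 * (the /dev/log case).
 */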

static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
			      size_t len)
{
	struct sock *sk = sock->sk, *other = NULL;
	struct unix_sock *u = unix_sk(sk);
	struct scm_cookie scm;
	struct sk_buff *skb;
	int data_len = 0;
	int sk_locked;
	long timeo;
	int err;

	err = scm_send(sock, msg, &scm, false);
	if (err < 0)
		return err;

	wait_for_unix_gc(scm.fp);

	if (msg->msg_flags & MSG_OOB) {
		err = -EOPNOTSUPP;
		goto out;
	}

	if (msg->msg_namelen) {
		err = unix_validate_addr(msg->msg_name, msg->msg_namelen);
		if (err)
			goto out;

		err = BPF_CGROUP_RUN_PROG_UNIX_SENDMSG_LOCK(sk,
							    msg->msg_name,
							    &msg->msg_namelen,
							    NULL);
		if (err)
			goto out;
	}

	if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
	     test_bit(SOCK_PASSPIDFD, &sock->flags)) &&
	    !READ_ONCE(u->addr)) {
		err = unix_autobind(sk);
		if (err)
			goto out;
	}

	if (len > READ_ONCE(sk->sk_sndbuf) - 32) {
		err = -EMSGSIZE;
		goto out;
	}

	if (len > SKB_MAX_ALLOC) {
		data_len = min_t(size_t,
				 len - SKB_MAX_ALLOC,
				 MAX_SKB_FRAGS * PAGE_SIZE);
		data_len = PAGE_ALIGN(data_len);

		BUILD_BUG_ON(SKB_MAX_ALLOC < PAGE_SIZE);
	}

	skb = sock_alloc_send_pskb(sk, len - data_len, data_len,
				   msg->msg_flags & MSG_DONTWAIT, &err,
				   PAGE_ALLOC_COSTLY_ORDER);
	if (!skb)
		goto out;

	err = unix_scm_to_skb(&scm, skb, true);
	if (err < 0)
		goto out_free;

	skb_put(skb, len - data_len);
	skb->data_len = data_len;
	skb->len = len;
	err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, len);
	if (err)
		goto out_free;

	timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);

	if (msg->msg_namelen) {
lookup:
		other = unix_find_other(sock_net(sk), msg->msg_name,
					msg->msg_namelen, sk->sk_type);
		if (IS_ERR(other)) {
			err = PTR_ERR(other);
			goto out_free;
		}
	} else {
		other = unix_peer_get(sk);
		if (!other) {
			err = -ENOTCONN;
			goto out_free;
		}
	}

	if (sk_filter(other, skb) < 0) {
		/* Toss the packet but do not return any error to the sender */
		err = len;
		goto out_sock_put;
	}

restart:
	sk_locked = 0;
	unix_state_lock(other);
restart_locked:

	if (!unix_may_send(sk, other)) {
		err = -EPERM;
		goto out_unlock;
	}

	if (unlikely(sock_flag(other, SOCK_DEAD))) {
		/* Check with 1003.1g - what should datagram error */

		unix_state_unlock(other);

		if (sk->sk_type == SOCK_SEQPACKET) {
			/* We are here only when racing with unix_release_sock()
			 * is clearing @other. Never change state to TCP_CLOSE
			 * unlike SOCK_DGRAM wants.
			 */
2067 */ 2068 err = -EPIPE; 2069 goto out_sock_put; 2070 } 2071 2072 if (!sk_locked) 2073 unix_state_lock(sk); 2074 2075 if (unix_peer(sk) == other) { 2076 unix_peer(sk) = NULL; 2077 unix_dgram_peer_wake_disconnect_wakeup(sk, other); 2078 2079 WRITE_ONCE(sk->sk_state, TCP_CLOSE); 2080 unix_state_unlock(sk); 2081 2082 unix_dgram_disconnected(sk, other); 2083 sock_put(other); 2084 err = -ECONNREFUSED; 2085 goto out_sock_put; 2086 } 2087 2088 unix_state_unlock(sk); 2089 2090 if (!msg->msg_namelen) { 2091 err = -ECONNRESET; 2092 goto out_sock_put; 2093 } 2094 2095 sock_put(other); 2096 goto lookup; 2097 } 2098 2099 if (other->sk_shutdown & RCV_SHUTDOWN) { 2100 err = -EPIPE; 2101 goto out_unlock; 2102 } 2103 2104 if (sk->sk_type != SOCK_SEQPACKET) { 2105 err = security_unix_may_send(sk->sk_socket, other->sk_socket); 2106 if (err) 2107 goto out_unlock; 2108 } 2109 2110 /* other == sk && unix_peer(other) != sk if 2111 * - unix_peer(sk) == NULL, destination address bound to sk 2112 * - unix_peer(sk) == sk by time of get but disconnected before lock 2113 */ 2114 if (other != sk && 2115 unlikely(unix_peer(other) != sk && 2116 unix_recvq_full_lockless(other))) { 2117 if (timeo) { 2118 timeo = unix_wait_for_peer(other, timeo); 2119 2120 err = sock_intr_errno(timeo); 2121 if (signal_pending(current)) 2122 goto out_sock_put; 2123 2124 goto restart; 2125 } 2126 2127 if (!sk_locked) { 2128 unix_state_unlock(other); 2129 unix_state_double_lock(sk, other); 2130 } 2131 2132 if (unix_peer(sk) != other || 2133 unix_dgram_peer_wake_me(sk, other)) { 2134 err = -EAGAIN; 2135 sk_locked = 1; 2136 goto out_unlock; 2137 } 2138 2139 if (!sk_locked) { 2140 sk_locked = 1; 2141 goto restart_locked; 2142 } 2143 } 2144 2145 if (unlikely(sk_locked)) 2146 unix_state_unlock(sk); 2147 2148 if (sock_flag(other, SOCK_RCVTSTAMP)) 2149 __net_timestamp(skb); 2150 maybe_add_creds(skb, sock, other); 2151 scm_stat_add(other, skb); 2152 skb_queue_tail(&other->sk_receive_queue, skb); 2153 unix_state_unlock(other); 2154 other->sk_data_ready(other); 2155 sock_put(other); 2156 scm_destroy(&scm); 2157 return len; 2158 2159 out_unlock: 2160 if (sk_locked) 2161 unix_state_unlock(sk); 2162 unix_state_unlock(other); 2163 out_sock_put: 2164 sock_put(other); 2165 out_free: 2166 consume_skb(skb); 2167 out: 2168 scm_destroy(&scm); 2169 return err; 2170 } 2171 2172 /* We use paged skbs for stream sockets, and limit occupancy to 32768 2173 * bytes, and a minimum of a full page. 
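 *
 * For example, with 4 KiB pages get_order(32768) is 3, so UNIX_SKB_FRAGS_SZ
 * below is 4096 << 3 = 32768 bytes; with 64 KiB pages get_order(32768) is 0
 * and the limit becomes one full 65536-byte page.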
2174 */ 2175 #define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768)) 2176 2177 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 2178 static int queue_oob(struct socket *sock, struct msghdr *msg, struct sock *other, 2179 struct scm_cookie *scm, bool fds_sent) 2180 { 2181 struct unix_sock *ousk = unix_sk(other); 2182 struct sk_buff *skb; 2183 int err; 2184 2185 skb = sock_alloc_send_skb(sock->sk, 1, msg->msg_flags & MSG_DONTWAIT, &err); 2186 2187 if (!skb) 2188 return err; 2189 2190 err = unix_scm_to_skb(scm, skb, !fds_sent); 2191 if (err < 0) 2192 goto out; 2193 2194 skb_put(skb, 1); 2195 err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, 1); 2196 2197 if (err) 2198 goto out; 2199 2200 unix_state_lock(other); 2201 2202 if (sock_flag(other, SOCK_DEAD) || 2203 (other->sk_shutdown & RCV_SHUTDOWN)) { 2204 unix_state_unlock(other); 2205 err = -EPIPE; 2206 goto out; 2207 } 2208 2209 maybe_add_creds(skb, sock, other); 2210 scm_stat_add(other, skb); 2211 2212 spin_lock(&other->sk_receive_queue.lock); 2213 WRITE_ONCE(ousk->oob_skb, skb); 2214 __skb_queue_tail(&other->sk_receive_queue, skb); 2215 spin_unlock(&other->sk_receive_queue.lock); 2216 2217 sk_send_sigurg(other); 2218 unix_state_unlock(other); 2219 other->sk_data_ready(other); 2220 2221 return 0; 2222 out: 2223 consume_skb(skb); 2224 return err; 2225 } 2226 #endif 2227 2228 static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, 2229 size_t len) 2230 { 2231 struct sock *sk = sock->sk; 2232 struct sk_buff *skb = NULL; 2233 struct sock *other = NULL; 2234 struct scm_cookie scm; 2235 bool fds_sent = false; 2236 int err, sent = 0; 2237 2238 err = scm_send(sock, msg, &scm, false); 2239 if (err < 0) 2240 return err; 2241 2242 wait_for_unix_gc(scm.fp); 2243 2244 if (msg->msg_flags & MSG_OOB) { 2245 err = -EOPNOTSUPP; 2246 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 2247 if (len) 2248 len--; 2249 else 2250 #endif 2251 goto out_err; 2252 } 2253 2254 if (msg->msg_namelen) { 2255 err = READ_ONCE(sk->sk_state) == TCP_ESTABLISHED ? 
-EISCONN : -EOPNOTSUPP; 2256 goto out_err; 2257 } else { 2258 other = unix_peer(sk); 2259 if (!other) { 2260 err = -ENOTCONN; 2261 goto out_err; 2262 } 2263 } 2264 2265 if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN) 2266 goto out_pipe; 2267 2268 while (sent < len) { 2269 int size = len - sent; 2270 int data_len; 2271 2272 if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) { 2273 skb = sock_alloc_send_pskb(sk, 0, 0, 2274 msg->msg_flags & MSG_DONTWAIT, 2275 &err, 0); 2276 } else { 2277 /* Keep two messages in the pipe so it schedules better */ 2278 size = min_t(int, size, (READ_ONCE(sk->sk_sndbuf) >> 1) - 64); 2279 2280 /* allow fallback to order-0 allocations */ 2281 size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ); 2282 2283 data_len = max_t(int, 0, size - SKB_MAX_HEAD(0)); 2284 2285 data_len = min_t(size_t, size, PAGE_ALIGN(data_len)); 2286 2287 skb = sock_alloc_send_pskb(sk, size - data_len, data_len, 2288 msg->msg_flags & MSG_DONTWAIT, &err, 2289 get_order(UNIX_SKB_FRAGS_SZ)); 2290 } 2291 if (!skb) 2292 goto out_err; 2293 2294 /* Only send the fds in the first buffer */ 2295 err = unix_scm_to_skb(&scm, skb, !fds_sent); 2296 if (err < 0) 2297 goto out_free; 2298 2299 fds_sent = true; 2300 2301 if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) { 2302 skb->ip_summed = CHECKSUM_UNNECESSARY; 2303 err = skb_splice_from_iter(skb, &msg->msg_iter, size, 2304 sk->sk_allocation); 2305 if (err < 0) 2306 goto out_free; 2307 2308 size = err; 2309 refcount_add(size, &sk->sk_wmem_alloc); 2310 } else { 2311 skb_put(skb, size - data_len); 2312 skb->data_len = data_len; 2313 skb->len = size; 2314 err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size); 2315 if (err) 2316 goto out_free; 2317 } 2318 2319 unix_state_lock(other); 2320 2321 if (sock_flag(other, SOCK_DEAD) || 2322 (other->sk_shutdown & RCV_SHUTDOWN)) 2323 goto out_pipe_unlock; 2324 2325 maybe_add_creds(skb, sock, other); 2326 scm_stat_add(other, skb); 2327 skb_queue_tail(&other->sk_receive_queue, skb); 2328 unix_state_unlock(other); 2329 other->sk_data_ready(other); 2330 sent += size; 2331 } 2332 2333 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 2334 if (msg->msg_flags & MSG_OOB) { 2335 err = queue_oob(sock, msg, other, &scm, fds_sent); 2336 if (err) 2337 goto out_err; 2338 sent++; 2339 } 2340 #endif 2341 2342 scm_destroy(&scm); 2343 2344 return sent; 2345 2346 out_pipe_unlock: 2347 unix_state_unlock(other); 2348 out_pipe: 2349 if (!sent && !(msg->msg_flags & MSG_NOSIGNAL)) 2350 send_sig(SIGPIPE, current, 0); 2351 err = -EPIPE; 2352 out_free: 2353 consume_skb(skb); 2354 out_err: 2355 scm_destroy(&scm); 2356 return sent ? 
: err; 2357 } 2358 2359 static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg, 2360 size_t len) 2361 { 2362 int err; 2363 struct sock *sk = sock->sk; 2364 2365 err = sock_error(sk); 2366 if (err) 2367 return err; 2368 2369 if (READ_ONCE(sk->sk_state) != TCP_ESTABLISHED) 2370 return -ENOTCONN; 2371 2372 if (msg->msg_namelen) 2373 msg->msg_namelen = 0; 2374 2375 return unix_dgram_sendmsg(sock, msg, len); 2376 } 2377 2378 static int unix_seqpacket_recvmsg(struct socket *sock, struct msghdr *msg, 2379 size_t size, int flags) 2380 { 2381 struct sock *sk = sock->sk; 2382 2383 if (READ_ONCE(sk->sk_state) != TCP_ESTABLISHED) 2384 return -ENOTCONN; 2385 2386 return unix_dgram_recvmsg(sock, msg, size, flags); 2387 } 2388 2389 static void unix_copy_addr(struct msghdr *msg, struct sock *sk) 2390 { 2391 struct unix_address *addr = smp_load_acquire(&unix_sk(sk)->addr); 2392 2393 if (addr) { 2394 msg->msg_namelen = addr->len; 2395 memcpy(msg->msg_name, addr->name, addr->len); 2396 } 2397 } 2398 2399 int __unix_dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t size, 2400 int flags) 2401 { 2402 struct scm_cookie scm; 2403 struct socket *sock = sk->sk_socket; 2404 struct unix_sock *u = unix_sk(sk); 2405 struct sk_buff *skb, *last; 2406 long timeo; 2407 int skip; 2408 int err; 2409 2410 err = -EOPNOTSUPP; 2411 if (flags&MSG_OOB) 2412 goto out; 2413 2414 timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); 2415 2416 do { 2417 mutex_lock(&u->iolock); 2418 2419 skip = sk_peek_offset(sk, flags); 2420 skb = __skb_try_recv_datagram(sk, &sk->sk_receive_queue, flags, 2421 &skip, &err, &last); 2422 if (skb) { 2423 if (!(flags & MSG_PEEK)) 2424 scm_stat_del(sk, skb); 2425 break; 2426 } 2427 2428 mutex_unlock(&u->iolock); 2429 2430 if (err != -EAGAIN) 2431 break; 2432 } while (timeo && 2433 !__skb_wait_for_more_packets(sk, &sk->sk_receive_queue, 2434 &err, &timeo, last)); 2435 2436 if (!skb) { /* implies iolock unlocked */ 2437 unix_state_lock(sk); 2438 /* Signal EOF on disconnected non-blocking SEQPACKET socket. */ 2439 if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN && 2440 (sk->sk_shutdown & RCV_SHUTDOWN)) 2441 err = 0; 2442 unix_state_unlock(sk); 2443 goto out; 2444 } 2445 2446 if (wq_has_sleeper(&u->peer_wait)) 2447 wake_up_interruptible_sync_poll(&u->peer_wait, 2448 EPOLLOUT | EPOLLWRNORM | 2449 EPOLLWRBAND); 2450 2451 if (msg->msg_name) { 2452 unix_copy_addr(msg, skb->sk); 2453 2454 BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk, 2455 msg->msg_name, 2456 &msg->msg_namelen); 2457 } 2458 2459 if (size > skb->len - skip) 2460 size = skb->len - skip; 2461 else if (size < skb->len - skip) 2462 msg->msg_flags |= MSG_TRUNC; 2463 2464 err = skb_copy_datagram_msg(skb, skip, msg, size); 2465 if (err) 2466 goto out_free; 2467 2468 if (sock_flag(sk, SOCK_RCVTSTAMP)) 2469 __sock_recv_timestamp(msg, sk, skb); 2470 2471 memset(&scm, 0, sizeof(scm)); 2472 2473 scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid); 2474 unix_set_secdata(&scm, skb); 2475 2476 if (!(flags & MSG_PEEK)) { 2477 if (UNIXCB(skb).fp) 2478 unix_detach_fds(&scm, skb); 2479 2480 sk_peek_offset_bwd(sk, skb->len); 2481 } else { 2482 /* It is questionable: on PEEK we could: 2483 - do not return fds - good, but too simple 8) 2484 - return fds, and do not return them on read (old strategy, 2485 apparently wrong) 2486 - clone fds (I chose it for now, it is the most universal 2487 solution) 2488 2489 POSIX 1003.1g does not actually define this clearly 2490 at all. 
POSIX 1003.1g doesn't define a lot of things 2491 clearly however! 2492 2493 */ 2494 2495 sk_peek_offset_fwd(sk, size); 2496 2497 if (UNIXCB(skb).fp) 2498 unix_peek_fds(&scm, skb); 2499 } 2500 err = (flags & MSG_TRUNC) ? skb->len - skip : size; 2501 2502 scm_recv_unix(sock, msg, &scm, flags); 2503 2504 out_free: 2505 skb_free_datagram(sk, skb); 2506 mutex_unlock(&u->iolock); 2507 out: 2508 return err; 2509 } 2510 2511 static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, 2512 int flags) 2513 { 2514 struct sock *sk = sock->sk; 2515 2516 #ifdef CONFIG_BPF_SYSCALL 2517 const struct proto *prot = READ_ONCE(sk->sk_prot); 2518 2519 if (prot != &unix_dgram_proto) 2520 return prot->recvmsg(sk, msg, size, flags, NULL); 2521 #endif 2522 return __unix_dgram_recvmsg(sk, msg, size, flags); 2523 } 2524 2525 static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor) 2526 { 2527 struct unix_sock *u = unix_sk(sk); 2528 struct sk_buff *skb; 2529 int err; 2530 2531 mutex_lock(&u->iolock); 2532 skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err); 2533 mutex_unlock(&u->iolock); 2534 if (!skb) 2535 return err; 2536 2537 return recv_actor(sk, skb); 2538 } 2539 2540 /* 2541 * Sleep until more data has arrived. But check for races.. 2542 */ 2543 static long unix_stream_data_wait(struct sock *sk, long timeo, 2544 struct sk_buff *last, unsigned int last_len, 2545 bool freezable) 2546 { 2547 unsigned int state = TASK_INTERRUPTIBLE | freezable * TASK_FREEZABLE; 2548 struct sk_buff *tail; 2549 DEFINE_WAIT(wait); 2550 2551 unix_state_lock(sk); 2552 2553 for (;;) { 2554 prepare_to_wait(sk_sleep(sk), &wait, state); 2555 2556 tail = skb_peek_tail(&sk->sk_receive_queue); 2557 if (tail != last || 2558 (tail && tail->len != last_len) || 2559 sk->sk_err || 2560 (sk->sk_shutdown & RCV_SHUTDOWN) || 2561 signal_pending(current) || 2562 !timeo) 2563 break; 2564 2565 sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); 2566 unix_state_unlock(sk); 2567 timeo = schedule_timeout(timeo); 2568 unix_state_lock(sk); 2569 2570 if (sock_flag(sk, SOCK_DEAD)) 2571 break; 2572 2573 sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); 2574 } 2575 2576 finish_wait(sk_sleep(sk), &wait); 2577 unix_state_unlock(sk); 2578 return timeo; 2579 } 2580 2581 static unsigned int unix_skb_len(const struct sk_buff *skb) 2582 { 2583 return skb->len - UNIXCB(skb).consumed; 2584 } 2585 2586 struct unix_stream_read_state { 2587 int (*recv_actor)(struct sk_buff *, int, int, 2588 struct unix_stream_read_state *); 2589 struct socket *socket; 2590 struct msghdr *msg; 2591 struct pipe_inode_info *pipe; 2592 size_t size; 2593 int flags; 2594 unsigned int splice_flags; 2595 }; 2596 2597 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 2598 static int unix_stream_recv_urg(struct unix_stream_read_state *state) 2599 { 2600 struct socket *sock = state->socket; 2601 struct sock *sk = sock->sk; 2602 struct unix_sock *u = unix_sk(sk); 2603 int chunk = 1; 2604 struct sk_buff *oob_skb; 2605 2606 mutex_lock(&u->iolock); 2607 unix_state_lock(sk); 2608 spin_lock(&sk->sk_receive_queue.lock); 2609 2610 if (sock_flag(sk, SOCK_URGINLINE) || !u->oob_skb) { 2611 spin_unlock(&sk->sk_receive_queue.lock); 2612 unix_state_unlock(sk); 2613 mutex_unlock(&u->iolock); 2614 return -EINVAL; 2615 } 2616 2617 oob_skb = u->oob_skb; 2618 2619 if (!(state->flags & MSG_PEEK)) 2620 WRITE_ONCE(u->oob_skb, NULL); 2621 2622 spin_unlock(&sk->sk_receive_queue.lock); 2623 unix_state_unlock(sk); 2624 2625 chunk = state->recv_actor(oob_skb, 0, chunk, state); 2626 2627 if (!(state->flags & MSG_PEEK)) 2628 
UNIXCB(oob_skb).consumed += 1; 2629 2630 mutex_unlock(&u->iolock); 2631 2632 if (chunk < 0) 2633 return -EFAULT; 2634 2635 state->msg->msg_flags |= MSG_OOB; 2636 return 1; 2637 } 2638 2639 static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk, 2640 int flags, int copied) 2641 { 2642 struct sk_buff *read_skb = NULL, *unread_skb = NULL; 2643 struct unix_sock *u = unix_sk(sk); 2644 2645 if (likely(unix_skb_len(skb) && skb != READ_ONCE(u->oob_skb))) 2646 return skb; 2647 2648 spin_lock(&sk->sk_receive_queue.lock); 2649 2650 if (!unix_skb_len(skb)) { 2651 if (copied && (!u->oob_skb || skb == u->oob_skb)) { 2652 skb = NULL; 2653 } else if (flags & MSG_PEEK) { 2654 skb = skb_peek_next(skb, &sk->sk_receive_queue); 2655 } else { 2656 read_skb = skb; 2657 skb = skb_peek_next(skb, &sk->sk_receive_queue); 2658 __skb_unlink(read_skb, &sk->sk_receive_queue); 2659 } 2660 2661 if (!skb) 2662 goto unlock; 2663 } 2664 2665 if (skb != u->oob_skb) 2666 goto unlock; 2667 2668 if (copied) { 2669 skb = NULL; 2670 } else if (!(flags & MSG_PEEK)) { 2671 WRITE_ONCE(u->oob_skb, NULL); 2672 2673 if (!sock_flag(sk, SOCK_URGINLINE)) { 2674 __skb_unlink(skb, &sk->sk_receive_queue); 2675 unread_skb = skb; 2676 skb = skb_peek(&sk->sk_receive_queue); 2677 } 2678 } else if (!sock_flag(sk, SOCK_URGINLINE)) { 2679 skb = skb_peek_next(skb, &sk->sk_receive_queue); 2680 } 2681 2682 unlock: 2683 spin_unlock(&sk->sk_receive_queue.lock); 2684 2685 consume_skb(read_skb); 2686 kfree_skb_reason(unread_skb, SKB_DROP_REASON_UNIX_SKIP_OOB); 2687 2688 return skb; 2689 } 2690 #endif 2691 2692 static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor) 2693 { 2694 struct unix_sock *u = unix_sk(sk); 2695 struct sk_buff *skb; 2696 int err; 2697 2698 if (unlikely(READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)) 2699 return -ENOTCONN; 2700 2701 mutex_lock(&u->iolock); 2702 skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err); 2703 mutex_unlock(&u->iolock); 2704 if (!skb) 2705 return err; 2706 2707 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 2708 if (unlikely(skb == READ_ONCE(u->oob_skb))) { 2709 bool drop = false; 2710 2711 unix_state_lock(sk); 2712 2713 if (sock_flag(sk, SOCK_DEAD)) { 2714 unix_state_unlock(sk); 2715 kfree_skb_reason(skb, SKB_DROP_REASON_SOCKET_CLOSE); 2716 return -ECONNRESET; 2717 } 2718 2719 spin_lock(&sk->sk_receive_queue.lock); 2720 if (likely(skb == u->oob_skb)) { 2721 WRITE_ONCE(u->oob_skb, NULL); 2722 drop = true; 2723 } 2724 spin_unlock(&sk->sk_receive_queue.lock); 2725 2726 unix_state_unlock(sk); 2727 2728 if (drop) { 2729 kfree_skb_reason(skb, SKB_DROP_REASON_UNIX_SKIP_OOB); 2730 return -EAGAIN; 2731 } 2732 } 2733 #endif 2734 2735 return recv_actor(sk, skb); 2736 } 2737 2738 static int unix_stream_read_generic(struct unix_stream_read_state *state, 2739 bool freezable) 2740 { 2741 struct scm_cookie scm; 2742 struct socket *sock = state->socket; 2743 struct sock *sk = sock->sk; 2744 struct unix_sock *u = unix_sk(sk); 2745 int copied = 0; 2746 int flags = state->flags; 2747 int noblock = flags & MSG_DONTWAIT; 2748 bool check_creds = false; 2749 int target; 2750 int err = 0; 2751 long timeo; 2752 int skip; 2753 size_t size = state->size; 2754 unsigned int last_len; 2755 2756 if (unlikely(READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)) { 2757 err = -EINVAL; 2758 goto out; 2759 } 2760 2761 if (unlikely(flags & MSG_OOB)) { 2762 err = -EOPNOTSUPP; 2763 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 2764 err = unix_stream_recv_urg(state); 2765 #endif 2766 goto out; 2767 } 2768 2769 target = sock_rcvlowat(sk, flags & 
MSG_WAITALL, size); 2770 timeo = sock_rcvtimeo(sk, noblock); 2771 2772 memset(&scm, 0, sizeof(scm)); 2773 2774 /* Lock the socket to prevent queue disordering 2775 * while sleeps in memcpy_tomsg 2776 */ 2777 mutex_lock(&u->iolock); 2778 2779 skip = max(sk_peek_offset(sk, flags), 0); 2780 2781 do { 2782 struct sk_buff *skb, *last; 2783 int chunk; 2784 2785 redo: 2786 unix_state_lock(sk); 2787 if (sock_flag(sk, SOCK_DEAD)) { 2788 err = -ECONNRESET; 2789 goto unlock; 2790 } 2791 last = skb = skb_peek(&sk->sk_receive_queue); 2792 last_len = last ? last->len : 0; 2793 2794 again: 2795 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 2796 if (skb) { 2797 skb = manage_oob(skb, sk, flags, copied); 2798 if (!skb && copied) { 2799 unix_state_unlock(sk); 2800 break; 2801 } 2802 } 2803 #endif 2804 if (skb == NULL) { 2805 if (copied >= target) 2806 goto unlock; 2807 2808 /* 2809 * POSIX 1003.1g mandates this order. 2810 */ 2811 2812 err = sock_error(sk); 2813 if (err) 2814 goto unlock; 2815 if (sk->sk_shutdown & RCV_SHUTDOWN) 2816 goto unlock; 2817 2818 unix_state_unlock(sk); 2819 if (!timeo) { 2820 err = -EAGAIN; 2821 break; 2822 } 2823 2824 mutex_unlock(&u->iolock); 2825 2826 timeo = unix_stream_data_wait(sk, timeo, last, 2827 last_len, freezable); 2828 2829 if (signal_pending(current)) { 2830 err = sock_intr_errno(timeo); 2831 scm_destroy(&scm); 2832 goto out; 2833 } 2834 2835 mutex_lock(&u->iolock); 2836 goto redo; 2837 unlock: 2838 unix_state_unlock(sk); 2839 break; 2840 } 2841 2842 while (skip >= unix_skb_len(skb)) { 2843 skip -= unix_skb_len(skb); 2844 last = skb; 2845 last_len = skb->len; 2846 skb = skb_peek_next(skb, &sk->sk_receive_queue); 2847 if (!skb) 2848 goto again; 2849 } 2850 2851 unix_state_unlock(sk); 2852 2853 if (check_creds) { 2854 /* Never glue messages from different writers */ 2855 if (!unix_skb_scm_eq(skb, &scm)) 2856 break; 2857 } else if (test_bit(SOCK_PASSCRED, &sock->flags) || 2858 test_bit(SOCK_PASSPIDFD, &sock->flags)) { 2859 /* Copy credentials */ 2860 scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid); 2861 unix_set_secdata(&scm, skb); 2862 check_creds = true; 2863 } 2864 2865 /* Copy address just once */ 2866 if (state->msg && state->msg->msg_name) { 2867 DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, 2868 state->msg->msg_name); 2869 unix_copy_addr(state->msg, skb->sk); 2870 2871 BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk, 2872 state->msg->msg_name, 2873 &state->msg->msg_namelen); 2874 2875 sunaddr = NULL; 2876 } 2877 2878 chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size); 2879 chunk = state->recv_actor(skb, skip, chunk, state); 2880 if (chunk < 0) { 2881 if (copied == 0) 2882 copied = -EFAULT; 2883 break; 2884 } 2885 copied += chunk; 2886 size -= chunk; 2887 2888 /* Mark read part of skb as used */ 2889 if (!(flags & MSG_PEEK)) { 2890 UNIXCB(skb).consumed += chunk; 2891 2892 sk_peek_offset_bwd(sk, chunk); 2893 2894 if (UNIXCB(skb).fp) { 2895 scm_stat_del(sk, skb); 2896 unix_detach_fds(&scm, skb); 2897 } 2898 2899 if (unix_skb_len(skb)) 2900 break; 2901 2902 skb_unlink(skb, &sk->sk_receive_queue); 2903 consume_skb(skb); 2904 2905 if (scm.fp) 2906 break; 2907 } else { 2908 /* It is questionable, see note in unix_dgram_recvmsg. 
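 * On MSG_PEEK any attached fds are duplicated into the scm cookie by
 * unix_peek_fds(), and the peek loop stops at the first skb carrying fds
 * (note the break below), so a single recvmsg(MSG_PEEK) never returns
 * descriptors from more than one message.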
2909 */ 2910 if (UNIXCB(skb).fp) 2911 unix_peek_fds(&scm, skb); 2912 2913 sk_peek_offset_fwd(sk, chunk); 2914 2915 if (UNIXCB(skb).fp) 2916 break; 2917 2918 skip = 0; 2919 last = skb; 2920 last_len = skb->len; 2921 unix_state_lock(sk); 2922 skb = skb_peek_next(skb, &sk->sk_receive_queue); 2923 if (skb) 2924 goto again; 2925 unix_state_unlock(sk); 2926 break; 2927 } 2928 } while (size); 2929 2930 mutex_unlock(&u->iolock); 2931 if (state->msg) 2932 scm_recv_unix(sock, state->msg, &scm, flags); 2933 else 2934 scm_destroy(&scm); 2935 out: 2936 return copied ? : err; 2937 } 2938 2939 static int unix_stream_read_actor(struct sk_buff *skb, 2940 int skip, int chunk, 2941 struct unix_stream_read_state *state) 2942 { 2943 int ret; 2944 2945 ret = skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip, 2946 state->msg, chunk); 2947 return ret ?: chunk; 2948 } 2949 2950 int __unix_stream_recvmsg(struct sock *sk, struct msghdr *msg, 2951 size_t size, int flags) 2952 { 2953 struct unix_stream_read_state state = { 2954 .recv_actor = unix_stream_read_actor, 2955 .socket = sk->sk_socket, 2956 .msg = msg, 2957 .size = size, 2958 .flags = flags 2959 }; 2960 2961 return unix_stream_read_generic(&state, true); 2962 } 2963 2964 static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg, 2965 size_t size, int flags) 2966 { 2967 struct unix_stream_read_state state = { 2968 .recv_actor = unix_stream_read_actor, 2969 .socket = sock, 2970 .msg = msg, 2971 .size = size, 2972 .flags = flags 2973 }; 2974 2975 #ifdef CONFIG_BPF_SYSCALL 2976 struct sock *sk = sock->sk; 2977 const struct proto *prot = READ_ONCE(sk->sk_prot); 2978 2979 if (prot != &unix_stream_proto) 2980 return prot->recvmsg(sk, msg, size, flags, NULL); 2981 #endif 2982 return unix_stream_read_generic(&state, true); 2983 } 2984 2985 static int unix_stream_splice_actor(struct sk_buff *skb, 2986 int skip, int chunk, 2987 struct unix_stream_read_state *state) 2988 { 2989 return skb_splice_bits(skb, state->socket->sk, 2990 UNIXCB(skb).consumed + skip, 2991 state->pipe, chunk, state->splice_flags); 2992 } 2993 2994 static ssize_t unix_stream_splice_read(struct socket *sock, loff_t *ppos, 2995 struct pipe_inode_info *pipe, 2996 size_t size, unsigned int flags) 2997 { 2998 struct unix_stream_read_state state = { 2999 .recv_actor = unix_stream_splice_actor, 3000 .socket = sock, 3001 .pipe = pipe, 3002 .size = size, 3003 .splice_flags = flags, 3004 }; 3005 3006 if (unlikely(*ppos)) 3007 return -ESPIPE; 3008 3009 if (sock->file->f_flags & O_NONBLOCK || 3010 flags & SPLICE_F_NONBLOCK) 3011 state.flags = MSG_DONTWAIT; 3012 3013 return unix_stream_read_generic(&state, false); 3014 } 3015 3016 static int unix_shutdown(struct socket *sock, int mode) 3017 { 3018 struct sock *sk = sock->sk; 3019 struct sock *other; 3020 3021 if (mode < SHUT_RD || mode > SHUT_RDWR) 3022 return -EINVAL; 3023 /* This maps: 3024 * SHUT_RD (0) -> RCV_SHUTDOWN (1) 3025 * SHUT_WR (1) -> SEND_SHUTDOWN (2) 3026 * SHUT_RDWR (2) -> SHUTDOWN_MASK (3) 3027 */ 3028 ++mode; 3029 3030 unix_state_lock(sk); 3031 WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | mode); 3032 other = unix_peer(sk); 3033 if (other) 3034 sock_hold(other); 3035 unix_state_unlock(sk); 3036 sk->sk_state_change(sk); 3037 3038 if (other && 3039 (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) { 3040 3041 int peer_mode = 0; 3042 const struct proto *prot = READ_ONCE(other->sk_prot); 3043 3044 if (prot->unhash) 3045 prot->unhash(other); 3046 if (mode&RCV_SHUTDOWN) 3047 peer_mode |= SEND_SHUTDOWN; 3048 if 
(mode&SEND_SHUTDOWN) 3049 peer_mode |= RCV_SHUTDOWN; 3050 unix_state_lock(other); 3051 WRITE_ONCE(other->sk_shutdown, other->sk_shutdown | peer_mode); 3052 unix_state_unlock(other); 3053 other->sk_state_change(other); 3054 if (peer_mode == SHUTDOWN_MASK) 3055 sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP); 3056 else if (peer_mode & RCV_SHUTDOWN) 3057 sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN); 3058 } 3059 if (other) 3060 sock_put(other); 3061 3062 return 0; 3063 } 3064 3065 long unix_inq_len(struct sock *sk) 3066 { 3067 struct sk_buff *skb; 3068 long amount = 0; 3069 3070 if (READ_ONCE(sk->sk_state) == TCP_LISTEN) 3071 return -EINVAL; 3072 3073 spin_lock(&sk->sk_receive_queue.lock); 3074 if (sk->sk_type == SOCK_STREAM || 3075 sk->sk_type == SOCK_SEQPACKET) { 3076 skb_queue_walk(&sk->sk_receive_queue, skb) 3077 amount += unix_skb_len(skb); 3078 } else { 3079 skb = skb_peek(&sk->sk_receive_queue); 3080 if (skb) 3081 amount = skb->len; 3082 } 3083 spin_unlock(&sk->sk_receive_queue.lock); 3084 3085 return amount; 3086 } 3087 EXPORT_SYMBOL_GPL(unix_inq_len); 3088 3089 long unix_outq_len(struct sock *sk) 3090 { 3091 return sk_wmem_alloc_get(sk); 3092 } 3093 EXPORT_SYMBOL_GPL(unix_outq_len); 3094 3095 static int unix_open_file(struct sock *sk) 3096 { 3097 struct path path; 3098 struct file *f; 3099 int fd; 3100 3101 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) 3102 return -EPERM; 3103 3104 if (!smp_load_acquire(&unix_sk(sk)->addr)) 3105 return -ENOENT; 3106 3107 path = unix_sk(sk)->path; 3108 if (!path.dentry) 3109 return -ENOENT; 3110 3111 path_get(&path); 3112 3113 fd = get_unused_fd_flags(O_CLOEXEC); 3114 if (fd < 0) 3115 goto out; 3116 3117 f = dentry_open(&path, O_PATH, current_cred()); 3118 if (IS_ERR(f)) { 3119 put_unused_fd(fd); 3120 fd = PTR_ERR(f); 3121 goto out; 3122 } 3123 3124 fd_install(fd, f); 3125 out: 3126 path_put(&path); 3127 3128 return fd; 3129 } 3130 3131 static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) 3132 { 3133 struct sock *sk = sock->sk; 3134 long amount = 0; 3135 int err; 3136 3137 switch (cmd) { 3138 case SIOCOUTQ: 3139 amount = unix_outq_len(sk); 3140 err = put_user(amount, (int __user *)arg); 3141 break; 3142 case SIOCINQ: 3143 amount = unix_inq_len(sk); 3144 if (amount < 0) 3145 err = amount; 3146 else 3147 err = put_user(amount, (int __user *)arg); 3148 break; 3149 case SIOCUNIXFILE: 3150 err = unix_open_file(sk); 3151 break; 3152 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 3153 case SIOCATMARK: 3154 { 3155 struct unix_sock *u = unix_sk(sk); 3156 struct sk_buff *skb; 3157 int answ = 0; 3158 3159 mutex_lock(&u->iolock); 3160 3161 skb = skb_peek(&sk->sk_receive_queue); 3162 if (skb) { 3163 struct sk_buff *oob_skb = READ_ONCE(u->oob_skb); 3164 struct sk_buff *next_skb; 3165 3166 next_skb = skb_peek_next(skb, &sk->sk_receive_queue); 3167 3168 if (skb == oob_skb || 3169 (!unix_skb_len(skb) && 3170 (!oob_skb || next_skb == oob_skb))) 3171 answ = 1; 3172 } 3173 3174 mutex_unlock(&u->iolock); 3175 3176 err = put_user(answ, (int __user *)arg); 3177 } 3178 break; 3179 #endif 3180 default: 3181 err = -ENOIOCTLCMD; 3182 break; 3183 } 3184 return err; 3185 } 3186 3187 #ifdef CONFIG_COMPAT 3188 static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) 3189 { 3190 return unix_ioctl(sock, cmd, (unsigned long)compat_ptr(arg)); 3191 } 3192 #endif 3193 3194 static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wait) 3195 { 3196 struct sock *sk = sock->sk; 3197 unsigned char state; 3198 __poll_t 
mask; 3199 u8 shutdown; 3200 3201 sock_poll_wait(file, sock, wait); 3202 mask = 0; 3203 shutdown = READ_ONCE(sk->sk_shutdown); 3204 state = READ_ONCE(sk->sk_state); 3205 3206 /* exceptional events? */ 3207 if (READ_ONCE(sk->sk_err)) 3208 mask |= EPOLLERR; 3209 if (shutdown == SHUTDOWN_MASK) 3210 mask |= EPOLLHUP; 3211 if (shutdown & RCV_SHUTDOWN) 3212 mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM; 3213 3214 /* readable? */ 3215 if (!skb_queue_empty_lockless(&sk->sk_receive_queue)) 3216 mask |= EPOLLIN | EPOLLRDNORM; 3217 if (sk_is_readable(sk)) 3218 mask |= EPOLLIN | EPOLLRDNORM; 3219 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 3220 if (READ_ONCE(unix_sk(sk)->oob_skb)) 3221 mask |= EPOLLPRI; 3222 #endif 3223 3224 /* Connection-based need to check for termination and startup */ 3225 if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) && 3226 state == TCP_CLOSE) 3227 mask |= EPOLLHUP; 3228 3229 /* 3230 * we set writable also when the other side has shut down the 3231 * connection. This prevents stuck sockets. 3232 */ 3233 if (unix_writable(sk, state)) 3234 mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND; 3235 3236 return mask; 3237 } 3238 3239 static __poll_t unix_dgram_poll(struct file *file, struct socket *sock, 3240 poll_table *wait) 3241 { 3242 struct sock *sk = sock->sk, *other; 3243 unsigned int writable; 3244 unsigned char state; 3245 __poll_t mask; 3246 u8 shutdown; 3247 3248 sock_poll_wait(file, sock, wait); 3249 mask = 0; 3250 shutdown = READ_ONCE(sk->sk_shutdown); 3251 state = READ_ONCE(sk->sk_state); 3252 3253 /* exceptional events? */ 3254 if (READ_ONCE(sk->sk_err) || 3255 !skb_queue_empty_lockless(&sk->sk_error_queue)) 3256 mask |= EPOLLERR | 3257 (sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0); 3258 3259 if (shutdown & RCV_SHUTDOWN) 3260 mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM; 3261 if (shutdown == SHUTDOWN_MASK) 3262 mask |= EPOLLHUP; 3263 3264 /* readable? */ 3265 if (!skb_queue_empty_lockless(&sk->sk_receive_queue)) 3266 mask |= EPOLLIN | EPOLLRDNORM; 3267 if (sk_is_readable(sk)) 3268 mask |= EPOLLIN | EPOLLRDNORM; 3269 3270 /* Connection-based need to check for termination and startup */ 3271 if (sk->sk_type == SOCK_SEQPACKET && state == TCP_CLOSE) 3272 mask |= EPOLLHUP; 3273 3274 /* No write status requested, avoid expensive OUT tests. 
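 * For example, a caller polling only for EPOLLIN takes the early return
 * below and never pays for the peer receive-queue fullness check.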
*/ 3275 if (!(poll_requested_events(wait) & (EPOLLWRBAND|EPOLLWRNORM|EPOLLOUT))) 3276 return mask; 3277 3278 writable = unix_writable(sk, state); 3279 if (writable) { 3280 unix_state_lock(sk); 3281 3282 other = unix_peer(sk); 3283 if (other && unix_peer(other) != sk && 3284 unix_recvq_full_lockless(other) && 3285 unix_dgram_peer_wake_me(sk, other)) 3286 writable = 0; 3287 3288 unix_state_unlock(sk); 3289 } 3290 3291 if (writable) 3292 mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND; 3293 else 3294 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); 3295 3296 return mask; 3297 } 3298 3299 #ifdef CONFIG_PROC_FS 3300 3301 #define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1) 3302 3303 #define get_bucket(x) ((x) >> BUCKET_SPACE) 3304 #define get_offset(x) ((x) & ((1UL << BUCKET_SPACE) - 1)) 3305 #define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o)) 3306 3307 static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos) 3308 { 3309 unsigned long offset = get_offset(*pos); 3310 unsigned long bucket = get_bucket(*pos); 3311 unsigned long count = 0; 3312 struct sock *sk; 3313 3314 for (sk = sk_head(&seq_file_net(seq)->unx.table.buckets[bucket]); 3315 sk; sk = sk_next(sk)) { 3316 if (++count == offset) 3317 break; 3318 } 3319 3320 return sk; 3321 } 3322 3323 static struct sock *unix_get_first(struct seq_file *seq, loff_t *pos) 3324 { 3325 unsigned long bucket = get_bucket(*pos); 3326 struct net *net = seq_file_net(seq); 3327 struct sock *sk; 3328 3329 while (bucket < UNIX_HASH_SIZE) { 3330 spin_lock(&net->unx.table.locks[bucket]); 3331 3332 sk = unix_from_bucket(seq, pos); 3333 if (sk) 3334 return sk; 3335 3336 spin_unlock(&net->unx.table.locks[bucket]); 3337 3338 *pos = set_bucket_offset(++bucket, 1); 3339 } 3340 3341 return NULL; 3342 } 3343 3344 static struct sock *unix_get_next(struct seq_file *seq, struct sock *sk, 3345 loff_t *pos) 3346 { 3347 unsigned long bucket = get_bucket(*pos); 3348 3349 sk = sk_next(sk); 3350 if (sk) 3351 return sk; 3352 3353 3354 spin_unlock(&seq_file_net(seq)->unx.table.locks[bucket]); 3355 3356 *pos = set_bucket_offset(++bucket, 1); 3357 3358 return unix_get_first(seq, pos); 3359 } 3360 3361 static void *unix_seq_start(struct seq_file *seq, loff_t *pos) 3362 { 3363 if (!*pos) 3364 return SEQ_START_TOKEN; 3365 3366 return unix_get_first(seq, pos); 3367 } 3368 3369 static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos) 3370 { 3371 ++*pos; 3372 3373 if (v == SEQ_START_TOKEN) 3374 return unix_get_first(seq, pos); 3375 3376 return unix_get_next(seq, v, pos); 3377 } 3378 3379 static void unix_seq_stop(struct seq_file *seq, void *v) 3380 { 3381 struct sock *sk = v; 3382 3383 if (sk) 3384 spin_unlock(&seq_file_net(seq)->unx.table.locks[sk->sk_hash]); 3385 } 3386 3387 static int unix_seq_show(struct seq_file *seq, void *v) 3388 { 3389 3390 if (v == SEQ_START_TOKEN) 3391 seq_puts(seq, "Num RefCount Protocol Flags Type St " 3392 "Inode Path\n"); 3393 else { 3394 struct sock *s = v; 3395 struct unix_sock *u = unix_sk(s); 3396 unix_state_lock(s); 3397 3398 seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu", 3399 s, 3400 refcount_read(&s->sk_refcnt), 3401 0, 3402 s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0, 3403 s->sk_type, 3404 s->sk_socket ? 3405 (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) : 3406 (s->sk_state == TCP_ESTABLISHED ? 
SS_CONNECTING : SS_DISCONNECTING), 3407 sock_i_ino(s)); 3408 3409 if (u->addr) { // under a hash table lock here 3410 int i, len; 3411 seq_putc(seq, ' '); 3412 3413 i = 0; 3414 len = u->addr->len - 3415 offsetof(struct sockaddr_un, sun_path); 3416 if (u->addr->name->sun_path[0]) { 3417 len--; 3418 } else { 3419 seq_putc(seq, '@'); 3420 i++; 3421 } 3422 for ( ; i < len; i++) 3423 seq_putc(seq, u->addr->name->sun_path[i] ?: 3424 '@'); 3425 } 3426 unix_state_unlock(s); 3427 seq_putc(seq, '\n'); 3428 } 3429 3430 return 0; 3431 } 3432 3433 static const struct seq_operations unix_seq_ops = { 3434 .start = unix_seq_start, 3435 .next = unix_seq_next, 3436 .stop = unix_seq_stop, 3437 .show = unix_seq_show, 3438 }; 3439 3440 #ifdef CONFIG_BPF_SYSCALL 3441 struct bpf_unix_iter_state { 3442 struct seq_net_private p; 3443 unsigned int cur_sk; 3444 unsigned int end_sk; 3445 unsigned int max_sk; 3446 struct sock **batch; 3447 bool st_bucket_done; 3448 }; 3449 3450 struct bpf_iter__unix { 3451 __bpf_md_ptr(struct bpf_iter_meta *, meta); 3452 __bpf_md_ptr(struct unix_sock *, unix_sk); 3453 uid_t uid __aligned(8); 3454 }; 3455 3456 static int unix_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta, 3457 struct unix_sock *unix_sk, uid_t uid) 3458 { 3459 struct bpf_iter__unix ctx; 3460 3461 meta->seq_num--; /* skip SEQ_START_TOKEN */ 3462 ctx.meta = meta; 3463 ctx.unix_sk = unix_sk; 3464 ctx.uid = uid; 3465 return bpf_iter_run_prog(prog, &ctx); 3466 } 3467 3468 static int bpf_iter_unix_hold_batch(struct seq_file *seq, struct sock *start_sk) 3469 3470 { 3471 struct bpf_unix_iter_state *iter = seq->private; 3472 unsigned int expected = 1; 3473 struct sock *sk; 3474 3475 sock_hold(start_sk); 3476 iter->batch[iter->end_sk++] = start_sk; 3477 3478 for (sk = sk_next(start_sk); sk; sk = sk_next(sk)) { 3479 if (iter->end_sk < iter->max_sk) { 3480 sock_hold(sk); 3481 iter->batch[iter->end_sk++] = sk; 3482 } 3483 3484 expected++; 3485 } 3486 3487 spin_unlock(&seq_file_net(seq)->unx.table.locks[start_sk->sk_hash]); 3488 3489 return expected; 3490 } 3491 3492 static void bpf_iter_unix_put_batch(struct bpf_unix_iter_state *iter) 3493 { 3494 while (iter->cur_sk < iter->end_sk) 3495 sock_put(iter->batch[iter->cur_sk++]); 3496 } 3497 3498 static int bpf_iter_unix_realloc_batch(struct bpf_unix_iter_state *iter, 3499 unsigned int new_batch_sz) 3500 { 3501 struct sock **new_batch; 3502 3503 new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz, 3504 GFP_USER | __GFP_NOWARN); 3505 if (!new_batch) 3506 return -ENOMEM; 3507 3508 bpf_iter_unix_put_batch(iter); 3509 kvfree(iter->batch); 3510 iter->batch = new_batch; 3511 iter->max_sk = new_batch_sz; 3512 3513 return 0; 3514 } 3515 3516 static struct sock *bpf_iter_unix_batch(struct seq_file *seq, 3517 loff_t *pos) 3518 { 3519 struct bpf_unix_iter_state *iter = seq->private; 3520 unsigned int expected; 3521 bool resized = false; 3522 struct sock *sk; 3523 3524 if (iter->st_bucket_done) 3525 *pos = set_bucket_offset(get_bucket(*pos) + 1, 1); 3526 3527 again: 3528 /* Get a new batch */ 3529 iter->cur_sk = 0; 3530 iter->end_sk = 0; 3531 3532 sk = unix_get_first(seq, pos); 3533 if (!sk) 3534 return NULL; /* Done */ 3535 3536 expected = bpf_iter_unix_hold_batch(seq, sk); 3537 3538 if (iter->end_sk == expected) { 3539 iter->st_bucket_done = true; 3540 return sk; 3541 } 3542 3543 if (!resized && !bpf_iter_unix_realloc_batch(iter, expected * 3 / 2)) { 3544 resized = true; 3545 goto again; 3546 } 3547 3548 return sk; 3549 } 3550 3551 static void *bpf_iter_unix_seq_start(struct 
seq_file *seq, loff_t *pos) 3552 { 3553 if (!*pos) 3554 return SEQ_START_TOKEN; 3555 3556 /* bpf iter does not support lseek, so it always 3557 * continue from where it was stop()-ped. 3558 */ 3559 return bpf_iter_unix_batch(seq, pos); 3560 } 3561 3562 static void *bpf_iter_unix_seq_next(struct seq_file *seq, void *v, loff_t *pos) 3563 { 3564 struct bpf_unix_iter_state *iter = seq->private; 3565 struct sock *sk; 3566 3567 /* Whenever seq_next() is called, the iter->cur_sk is 3568 * done with seq_show(), so advance to the next sk in 3569 * the batch. 3570 */ 3571 if (iter->cur_sk < iter->end_sk) 3572 sock_put(iter->batch[iter->cur_sk++]); 3573 3574 ++*pos; 3575 3576 if (iter->cur_sk < iter->end_sk) 3577 sk = iter->batch[iter->cur_sk]; 3578 else 3579 sk = bpf_iter_unix_batch(seq, pos); 3580 3581 return sk; 3582 } 3583 3584 static int bpf_iter_unix_seq_show(struct seq_file *seq, void *v) 3585 { 3586 struct bpf_iter_meta meta; 3587 struct bpf_prog *prog; 3588 struct sock *sk = v; 3589 uid_t uid; 3590 bool slow; 3591 int ret; 3592 3593 if (v == SEQ_START_TOKEN) 3594 return 0; 3595 3596 slow = lock_sock_fast(sk); 3597 3598 if (unlikely(sk_unhashed(sk))) { 3599 ret = SEQ_SKIP; 3600 goto unlock; 3601 } 3602 3603 uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk)); 3604 meta.seq = seq; 3605 prog = bpf_iter_get_info(&meta, false); 3606 ret = unix_prog_seq_show(prog, &meta, v, uid); 3607 unlock: 3608 unlock_sock_fast(sk, slow); 3609 return ret; 3610 } 3611 3612 static void bpf_iter_unix_seq_stop(struct seq_file *seq, void *v) 3613 { 3614 struct bpf_unix_iter_state *iter = seq->private; 3615 struct bpf_iter_meta meta; 3616 struct bpf_prog *prog; 3617 3618 if (!v) { 3619 meta.seq = seq; 3620 prog = bpf_iter_get_info(&meta, true); 3621 if (prog) 3622 (void)unix_prog_seq_show(prog, &meta, v, 0); 3623 } 3624 3625 if (iter->cur_sk < iter->end_sk) 3626 bpf_iter_unix_put_batch(iter); 3627 } 3628 3629 static const struct seq_operations bpf_iter_unix_seq_ops = { 3630 .start = bpf_iter_unix_seq_start, 3631 .next = bpf_iter_unix_seq_next, 3632 .stop = bpf_iter_unix_seq_stop, 3633 .show = bpf_iter_unix_seq_show, 3634 }; 3635 #endif 3636 #endif 3637 3638 static const struct net_proto_family unix_family_ops = { 3639 .family = PF_UNIX, 3640 .create = unix_create, 3641 .owner = THIS_MODULE, 3642 }; 3643 3644 3645 static int __net_init unix_net_init(struct net *net) 3646 { 3647 int i; 3648 3649 net->unx.sysctl_max_dgram_qlen = 10; 3650 if (unix_sysctl_register(net)) 3651 goto out; 3652 3653 #ifdef CONFIG_PROC_FS 3654 if (!proc_create_net("unix", 0, net->proc_net, &unix_seq_ops, 3655 sizeof(struct seq_net_private))) 3656 goto err_sysctl; 3657 #endif 3658 3659 net->unx.table.locks = kvmalloc_array(UNIX_HASH_SIZE, 3660 sizeof(spinlock_t), GFP_KERNEL); 3661 if (!net->unx.table.locks) 3662 goto err_proc; 3663 3664 net->unx.table.buckets = kvmalloc_array(UNIX_HASH_SIZE, 3665 sizeof(struct hlist_head), 3666 GFP_KERNEL); 3667 if (!net->unx.table.buckets) 3668 goto free_locks; 3669 3670 for (i = 0; i < UNIX_HASH_SIZE; i++) { 3671 spin_lock_init(&net->unx.table.locks[i]); 3672 lock_set_cmp_fn(&net->unx.table.locks[i], unix_table_lock_cmp_fn, NULL); 3673 INIT_HLIST_HEAD(&net->unx.table.buckets[i]); 3674 } 3675 3676 return 0; 3677 3678 free_locks: 3679 kvfree(net->unx.table.locks); 3680 err_proc: 3681 #ifdef CONFIG_PROC_FS 3682 remove_proc_entry("unix", net->proc_net); 3683 err_sysctl: 3684 #endif 3685 unix_sysctl_unregister(net); 3686 out: 3687 return -ENOMEM; 3688 } 3689 3690 static void __net_exit unix_net_exit(struct net 
*net) 3691 { 3692 kvfree(net->unx.table.buckets); 3693 kvfree(net->unx.table.locks); 3694 unix_sysctl_unregister(net); 3695 remove_proc_entry("unix", net->proc_net); 3696 } 3697 3698 static struct pernet_operations unix_net_ops = { 3699 .init = unix_net_init, 3700 .exit = unix_net_exit, 3701 }; 3702 3703 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) 3704 DEFINE_BPF_ITER_FUNC(unix, struct bpf_iter_meta *meta, 3705 struct unix_sock *unix_sk, uid_t uid) 3706 3707 #define INIT_BATCH_SZ 16 3708 3709 static int bpf_iter_init_unix(void *priv_data, struct bpf_iter_aux_info *aux) 3710 { 3711 struct bpf_unix_iter_state *iter = priv_data; 3712 int err; 3713 3714 err = bpf_iter_init_seq_net(priv_data, aux); 3715 if (err) 3716 return err; 3717 3718 err = bpf_iter_unix_realloc_batch(iter, INIT_BATCH_SZ); 3719 if (err) { 3720 bpf_iter_fini_seq_net(priv_data); 3721 return err; 3722 } 3723 3724 return 0; 3725 } 3726 3727 static void bpf_iter_fini_unix(void *priv_data) 3728 { 3729 struct bpf_unix_iter_state *iter = priv_data; 3730 3731 bpf_iter_fini_seq_net(priv_data); 3732 kvfree(iter->batch); 3733 } 3734 3735 static const struct bpf_iter_seq_info unix_seq_info = { 3736 .seq_ops = &bpf_iter_unix_seq_ops, 3737 .init_seq_private = bpf_iter_init_unix, 3738 .fini_seq_private = bpf_iter_fini_unix, 3739 .seq_priv_size = sizeof(struct bpf_unix_iter_state), 3740 }; 3741 3742 static const struct bpf_func_proto * 3743 bpf_iter_unix_get_func_proto(enum bpf_func_id func_id, 3744 const struct bpf_prog *prog) 3745 { 3746 switch (func_id) { 3747 case BPF_FUNC_setsockopt: 3748 return &bpf_sk_setsockopt_proto; 3749 case BPF_FUNC_getsockopt: 3750 return &bpf_sk_getsockopt_proto; 3751 default: 3752 return NULL; 3753 } 3754 } 3755 3756 static struct bpf_iter_reg unix_reg_info = { 3757 .target = "unix", 3758 .ctx_arg_info_size = 1, 3759 .ctx_arg_info = { 3760 { offsetof(struct bpf_iter__unix, unix_sk), 3761 PTR_TO_BTF_ID_OR_NULL }, 3762 }, 3763 .get_func_proto = bpf_iter_unix_get_func_proto, 3764 .seq_info = &unix_seq_info, 3765 }; 3766 3767 static void __init bpf_iter_register(void) 3768 { 3769 unix_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_UNIX]; 3770 if (bpf_iter_reg_target(&unix_reg_info)) 3771 pr_warn("Warning: could not register bpf iterator unix\n"); 3772 } 3773 #endif 3774 3775 static int __init af_unix_init(void) 3776 { 3777 int i, rc = -1; 3778 3779 BUILD_BUG_ON(sizeof(struct unix_skb_parms) > sizeof_field(struct sk_buff, cb)); 3780 3781 for (i = 0; i < UNIX_HASH_SIZE / 2; i++) { 3782 spin_lock_init(&bsd_socket_locks[i]); 3783 INIT_HLIST_HEAD(&bsd_socket_buckets[i]); 3784 } 3785 3786 rc = proto_register(&unix_dgram_proto, 1); 3787 if (rc != 0) { 3788 pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__); 3789 goto out; 3790 } 3791 3792 rc = proto_register(&unix_stream_proto, 1); 3793 if (rc != 0) { 3794 pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__); 3795 proto_unregister(&unix_dgram_proto); 3796 goto out; 3797 } 3798 3799 sock_register(&unix_family_ops); 3800 register_pernet_subsys(&unix_net_ops); 3801 unix_bpf_build_proto(); 3802 3803 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) 3804 bpf_iter_register(); 3805 #endif 3806 3807 out: 3808 return rc; 3809 } 3810 3811 /* Later than subsys_initcall() because we depend on stuff initialised there */ 3812 fs_initcall(af_unix_init); 3813