// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * NET4:	Implementation of BSD Unix domain sockets.
 *
 * Authors:	Alan Cox, <alan@lxorguk.ukuu.org.uk>
 *
 * Fixes:
 *	Linus Torvalds	:	Assorted bug cures.
 *	Niibe Yutaka	:	async I/O support.
 *	Carsten Paeth	:	PF_UNIX check, address fixes.
 *	Alan Cox	:	Limit size of allocated blocks.
 *	Alan Cox	:	Fixed the stupid socketpair bug.
 *	Alan Cox	:	BSD compatibility fine tuning.
 *	Alan Cox	:	Fixed a bug in connect when interrupted.
 *	Alan Cox	:	Sorted out a proper draft version of
 *				file descriptor passing hacked up from
 *				Mike Shaver's work.
 *	Marty Leisner	:	Fixes to fd passing.
 *	Nick Nevin	:	recvmsg bugfix.
 *	Alan Cox	:	Started proper garbage collector.
 *	Heiko EiBfeldt	:	Missing verify_area check.
 *	Alan Cox	:	Started POSIXisms.
 *	Andreas Schwab	:	Replace inode by dentry for proper
 *				reference counting.
 *	Kirk Petersen	:	Made this a module.
 *	Christoph Rohland:	Elegant non-blocking accept/connect algorithm.
 *				Lots of bug fixes.
 *	Alexey Kuznetsov:	Repaired (I hope) bugs introduced
 *				by the above two patches.
 *	Andrea Arcangeli:	If possible we block in connect(2)
 *				if the max backlog of the listen socket
 *				has been reached. This won't break
 *				old apps and it avoids a huge amount
 *				of socks hashed (for unix_gc()
 *				performance reasons).
 *				Security fix that limits the max
 *				number of socks to 2*max_files and
 *				the number of skbs queueable in the
 *				dgram receiver.
 *	Artur Skawina	:	Hash function optimizations.
 *	Alexey Kuznetsov:	Full scale SMP. Lots of bugs are introduced 8)
 *	Malcolm Beattie	:	Set peercred for socketpair.
 *	Michal Ostrowski:	Module initialization cleanup.
 *	Arnaldo C. Melo	:	Remove MOD_{INC,DEC}_USE_COUNT,
 *				the core infrastructure is doing that
 *				for all net proto families now (2.5.69+).
 *
 * Known differences from reference BSD that was tested:
 *
 *	[TO FIX]
 *	ECONNREFUSED is not returned from one end of a connected() socket to the
 *		other the moment one end closes.
 *	fstat() doesn't return st_dev=0, and gives the blksize as high water mark
 *		and a fake inode identifier (nor the BSD first socket fstat twice bug).
 *	[NOT TO FIX]
 *	accept() returns a path name even if the connecting socket has closed
 *		in the meantime (BSD loses the path and gives up).
 *	accept() returns 0 length path for an unbound connector. BSD returns 16
 *		and a null first byte in the path (but not for gethost/peername - BSD bug ??)
 *	socketpair(...SOCK_RAW..) doesn't panic the kernel.
 *	BSD af_unix apparently has connect forgetting to block properly.
 *		(need to check this with the POSIX spec in detail)
 *
 * Differences from 2.0.0-11-... (ANK)
 *	Bug fixes and improvements.
 *		- client shutdown killed server socket.
 *		- removed all useless cli/sti pairs.
 *
 *	Semantic changes/extensions.
 *		- generic control message passing.
 *		- SCM_CREDENTIALS control message.
 *		- "Abstract" (not FS based) socket bindings.
 *		  Abstract names are sequences of bytes (not zero terminated),
 *		  starting with a zero byte, so that this name space does not
 *		  intersect with BSD names.
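 *		  For illustration, binding an abstract name from
 *		  userspace typically looks like:
 *
 *			struct sockaddr_un a = { .sun_family = AF_UNIX };
 *
 *			memcpy(a.sun_path, "\0example", 8);
 *			bind(fd, (struct sockaddr *)&a,
 *			     offsetof(struct sockaddr_un, sun_path) + 8);
 *
 *		  The address length passed to bind(2), not a trailing
 *		  NUL byte, delimits the name.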
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/signal.h>
#include <linux/sched/signal.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/stat.h>
#include <linux/dcache.h>
#include <linux/namei.h>
#include <linux/socket.h>
#include <linux/un.h>
#include <linux/fcntl.h>
#include <linux/filter.h>
#include <linux/termios.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/in.h>
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <net/net_namespace.h>
#include <net/sock.h>
#include <net/tcp_states.h>
#include <net/af_unix.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <net/scm.h>
#include <linux/init.h>
#include <linux/poll.h>
#include <linux/rtnetlink.h>
#include <linux/mount.h>
#include <net/checksum.h>
#include <linux/security.h>
#include <linux/splice.h>
#include <linux/freezer.h>
#include <linux/file.h>
#include <linux/btf_ids.h>
#include <linux/bpf-cgroup.h>

static atomic_long_t unix_nr_socks;
static struct hlist_head bsd_socket_buckets[UNIX_HASH_SIZE / 2];
static spinlock_t bsd_socket_locks[UNIX_HASH_SIZE / 2];

/* SMP locking strategy:
 *    hash table is protected with spinlock.
 *    each socket state is protected by separate spinlock.
 */
#ifdef CONFIG_PROVE_LOCKING
#define cmp_ptr(l, r)	(((l) > (r)) - ((l) < (r)))

static int unix_table_lock_cmp_fn(const struct lockdep_map *a,
				  const struct lockdep_map *b)
{
	return cmp_ptr(a, b);
}

static int unix_state_lock_cmp_fn(const struct lockdep_map *_a,
				  const struct lockdep_map *_b)
{
	const struct unix_sock *a, *b;

	a = container_of(_a, struct unix_sock, lock.dep_map);
	b = container_of(_b, struct unix_sock, lock.dep_map);

	if (a->sk.sk_state == TCP_LISTEN) {
		/* unix_stream_connect(): Before the 2nd unix_state_lock(),
		 *
		 *   1. a is TCP_LISTEN.
		 *   2. b is not a.
		 *   3. concurrent connect(b -> a) must fail.
		 *
		 * Except for 2. & 3., the b's state can be any possible
		 * value due to concurrent connect() or listen().
		 *
		 * 2. is detected in debug_spin_lock_before(), and 3. cannot
		 * be expressed as lock_cmp_fn.
		 */
		switch (b->sk.sk_state) {
		case TCP_CLOSE:
		case TCP_ESTABLISHED:
		case TCP_LISTEN:
			return -1;
		default:
			/* Invalid case. */
			return 0;
		}
	}

	/* Should never happen.  Just to be symmetric. */
	if (b->sk.sk_state == TCP_LISTEN) {
		switch (a->sk.sk_state) {
		case TCP_CLOSE:
		case TCP_ESTABLISHED:
			return 1;
		default:
			return 0;
		}
	}

	/* unix_state_double_lock(): ascending address order. */
	return cmp_ptr(a, b);
}

static int unix_recvq_lock_cmp_fn(const struct lockdep_map *_a,
				  const struct lockdep_map *_b)
{
	const struct sock *a, *b;

	a = container_of(_a, struct sock, sk_receive_queue.lock.dep_map);
	b = container_of(_b, struct sock, sk_receive_queue.lock.dep_map);

	/* unix_collect_skb(): listener -> embryo order. */
	if (a->sk_state == TCP_LISTEN && unix_sk(b)->listener == a)
		return -1;

	/* Should never happen.  Just to be symmetric. */
	if (b->sk_state == TCP_LISTEN && unix_sk(a)->listener == b)
		return 1;

	return 0;
}
#endif

static unsigned int unix_unbound_hash(struct sock *sk)
{
	unsigned long hash = (unsigned long)sk;

	hash ^= hash >> 16;
	hash ^= hash >> 8;
	hash ^= sk->sk_type;

	return hash & UNIX_HASH_MOD;
}

static unsigned int unix_bsd_hash(struct inode *i)
{
	return i->i_ino & UNIX_HASH_MOD;
}

static unsigned int unix_abstract_hash(struct sockaddr_un *sunaddr,
				       int addr_len, int type)
{
	__wsum csum = csum_partial(sunaddr, addr_len, 0);
	unsigned int hash;

	hash = (__force unsigned int)csum_fold(csum);
	hash ^= hash >> 8;
	hash ^= type;

	return UNIX_HASH_MOD + 1 + (hash & UNIX_HASH_MOD);
}

static void unix_table_double_lock(struct net *net,
				   unsigned int hash1, unsigned int hash2)
{
	if (hash1 == hash2) {
		spin_lock(&net->unx.table.locks[hash1]);
		return;
	}

	if (hash1 > hash2)
		swap(hash1, hash2);

	spin_lock(&net->unx.table.locks[hash1]);
	spin_lock(&net->unx.table.locks[hash2]);
}

static void unix_table_double_unlock(struct net *net,
				     unsigned int hash1, unsigned int hash2)
{
	if (hash1 == hash2) {
		spin_unlock(&net->unx.table.locks[hash1]);
		return;
	}

	spin_unlock(&net->unx.table.locks[hash1]);
	spin_unlock(&net->unx.table.locks[hash2]);
}

#ifdef CONFIG_SECURITY_NETWORK
static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{
	UNIXCB(skb).secid = scm->secid;
}

static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{
	scm->secid = UNIXCB(skb).secid;
}

static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
{
	return (scm->secid == UNIXCB(skb).secid);
}
#else
static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{ }

static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{ }

static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
{
	return true;
}
#endif /* CONFIG_SECURITY_NETWORK */

static inline int unix_our_peer(struct sock *sk, struct sock *osk)
{
	return unix_peer(osk) == sk;
}

static inline int unix_may_send(struct sock *sk, struct sock *osk)
{
	return unix_peer(osk) == NULL || unix_our_peer(sk, osk);
}

static inline int unix_recvq_full_lockless(const struct sock *sk)
{
	return skb_queue_len_lockless(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
}

struct sock *unix_peer_get(struct sock *s)
{
	struct sock *peer;

	unix_state_lock(s);
	peer = unix_peer(s);
	if (peer)
		sock_hold(peer);
	unix_state_unlock(s);
	return peer;
}
EXPORT_SYMBOL_GPL(unix_peer_get);

static struct unix_address *unix_create_addr(struct sockaddr_un *sunaddr,
					     int addr_len)
{
	struct unix_address *addr;

	addr = kmalloc(sizeof(*addr) + addr_len, GFP_KERNEL);
	if (!addr)
		return NULL;

	refcount_set(&addr->refcnt, 1);
	addr->len = addr_len;
	memcpy(addr->name, sunaddr, addr_len);

	return addr;
}

static inline void unix_release_addr(struct unix_address *addr)
{
	if (refcount_dec_and_test(&addr->refcnt))
		kfree(addr);
}

/*
 * Check unix socket name:
 *	- should be not zero length.
 *	- if started by not zero, should be NULL terminated (FS object)
 *	- if started by zero, it is abstract name.
 */

static int unix_validate_addr(struct sockaddr_un *sunaddr, int addr_len)
{
	if (addr_len <= offsetof(struct sockaddr_un, sun_path) ||
	    addr_len > sizeof(*sunaddr))
		return -EINVAL;

	if (sunaddr->sun_family != AF_UNIX)
		return -EINVAL;

	return 0;
}

static int unix_mkname_bsd(struct sockaddr_un *sunaddr, int addr_len)
{
	struct sockaddr_storage *addr = (struct sockaddr_storage *)sunaddr;
	short offset = offsetof(struct sockaddr_storage, __data);

	BUILD_BUG_ON(offset != offsetof(struct sockaddr_un, sun_path));

	/* This may look like an off by one error but it is a bit more
	 * subtle.  108 is the longest valid AF_UNIX path for a binding.
	 * sun_path[108] doesn't as such exist.  However in kernel space
	 * we are guaranteed that it is a valid memory location in our
	 * kernel address buffer because syscall functions always pass
	 * a pointer of struct sockaddr_storage which has a bigger buffer
	 * than 108.  Also, we must terminate sun_path for strlen() in
	 * getname_kernel().
	 */
	addr->__data[addr_len - offset] = 0;

	/* Don't pass sunaddr->sun_path to strlen().  Otherwise, 108 will
	 * cause panic if CONFIG_FORTIFY_SOURCE=y.  Let __fortify_strlen()
	 * know the actual buffer.
	 */
	return strlen(addr->__data) + offset + 1;
}

static void __unix_remove_socket(struct sock *sk)
{
	sk_del_node_init(sk);
}

static void __unix_insert_socket(struct net *net, struct sock *sk)
{
	DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
	sk_add_node(sk, &net->unx.table.buckets[sk->sk_hash]);
}

static void __unix_set_addr_hash(struct net *net, struct sock *sk,
				 struct unix_address *addr, unsigned int hash)
{
	__unix_remove_socket(sk);
	smp_store_release(&unix_sk(sk)->addr, addr);

	sk->sk_hash = hash;
	__unix_insert_socket(net, sk);
}

static void unix_remove_socket(struct net *net, struct sock *sk)
{
	spin_lock(&net->unx.table.locks[sk->sk_hash]);
	__unix_remove_socket(sk);
	spin_unlock(&net->unx.table.locks[sk->sk_hash]);
}

static void unix_insert_unbound_socket(struct net *net, struct sock *sk)
{
	spin_lock(&net->unx.table.locks[sk->sk_hash]);
	__unix_insert_socket(net, sk);
	spin_unlock(&net->unx.table.locks[sk->sk_hash]);
}

static void unix_insert_bsd_socket(struct sock *sk)
{
	spin_lock(&bsd_socket_locks[sk->sk_hash]);
	sk_add_bind_node(sk, &bsd_socket_buckets[sk->sk_hash]);
	spin_unlock(&bsd_socket_locks[sk->sk_hash]);
}

static void unix_remove_bsd_socket(struct sock *sk)
{
	if (!hlist_unhashed(&sk->sk_bind_node)) {
		spin_lock(&bsd_socket_locks[sk->sk_hash]);
		__sk_del_bind_node(sk);
		spin_unlock(&bsd_socket_locks[sk->sk_hash]);

		sk_node_init(&sk->sk_bind_node);
	}
}

static struct sock *__unix_find_socket_byname(struct net *net,
					      struct sockaddr_un *sunname,
					      int len, unsigned int hash)
{
	struct sock *s;

	sk_for_each(s, &net->unx.table.buckets[hash]) {
		struct unix_sock *u = unix_sk(s);

		if (u->addr->len == len &&
		    !memcmp(u->addr->name, sunname, len))
			return s;
	}
	return NULL;
}
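/* Same as __unix_find_socket_byname(), but takes the hash bucket lock and
 * grabs a reference on the matching socket, which the caller must drop
 * with sock_put().
 */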
static inline struct sock *unix_find_socket_byname(struct net *net,
						   struct sockaddr_un *sunname,
						   int len, unsigned int hash)
{
	struct sock *s;

	spin_lock(&net->unx.table.locks[hash]);
	s = __unix_find_socket_byname(net, sunname, len, hash);
	if (s)
		sock_hold(s);
	spin_unlock(&net->unx.table.locks[hash]);
	return s;
}

static struct sock *unix_find_socket_byinode(struct inode *i)
{
	unsigned int hash = unix_bsd_hash(i);
	struct sock *s;

	spin_lock(&bsd_socket_locks[hash]);
	sk_for_each_bound(s, &bsd_socket_buckets[hash]) {
		struct dentry *dentry = unix_sk(s)->path.dentry;

		if (dentry && d_backing_inode(dentry) == i) {
			sock_hold(s);
			spin_unlock(&bsd_socket_locks[hash]);
			return s;
		}
	}
	spin_unlock(&bsd_socket_locks[hash]);
	return NULL;
}

/* Support code for asymmetrically connected dgram sockets
 *
 * If a datagram socket is connected to a socket not itself connected
 * to the first socket (eg, /dev/log), clients may only enqueue more
 * messages if the present receive queue of the server socket is not
 * "too large". This means there's a second writeability condition
 * poll and sendmsg need to test. The dgram recv code will do a wake
 * up on the peer_wait wait queue of a socket upon reception of a
 * datagram which needs to be propagated to sleeping would-be writers
 * since these might not have sent anything so far. This can't be
 * accomplished via poll_wait because the lifetime of the server
 * socket might be less than that of its clients if these break their
 * association with it or if the server socket is closed while clients
 * are still connected to it and there's no way to inform "a polling
 * implementation" that it should let go of a certain wait queue
 *
 * In order to propagate a wake up, a wait_queue_entry_t of the client
 * socket is enqueued on the peer_wait queue of the server socket
 * whose wake function does a wake_up on the ordinary client socket
 * wait queue. This connection is established whenever a write (or
 * poll for write) hit the flow control condition and broken when the
 * association to the server socket is dissolved or after a wake up
 * was relayed.
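 *
 * For example, many clients may connect(2) their SOCK_DGRAM socket to one
 * bound server socket (a logging daemon, say) that never connects back.
 * When the server's receive queue is full, a client's sendmsg()/poll()
 * parks its peer_wake entry on the server's peer_wait queue via
 * unix_dgram_peer_wake_connect(); once the server reads a datagram, the
 * wakeup is relayed to the client by unix_dgram_peer_wake_relay() below.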
 */

static int unix_dgram_peer_wake_relay(wait_queue_entry_t *q, unsigned mode, int flags,
				      void *key)
{
	struct unix_sock *u;
	wait_queue_head_t *u_sleep;

	u = container_of(q, struct unix_sock, peer_wake);

	__remove_wait_queue(&unix_sk(u->peer_wake.private)->peer_wait,
			    q);
	u->peer_wake.private = NULL;

	/* relaying can only happen while the wq still exists */
	u_sleep = sk_sleep(&u->sk);
	if (u_sleep)
		wake_up_interruptible_poll(u_sleep, key_to_poll(key));

	return 0;
}

static int unix_dgram_peer_wake_connect(struct sock *sk, struct sock *other)
{
	struct unix_sock *u, *u_other;
	int rc;

	u = unix_sk(sk);
	u_other = unix_sk(other);
	rc = 0;
	spin_lock(&u_other->peer_wait.lock);

	if (!u->peer_wake.private) {
		u->peer_wake.private = other;
		__add_wait_queue(&u_other->peer_wait, &u->peer_wake);

		rc = 1;
	}

	spin_unlock(&u_other->peer_wait.lock);
	return rc;
}

static void unix_dgram_peer_wake_disconnect(struct sock *sk,
					    struct sock *other)
{
	struct unix_sock *u, *u_other;

	u = unix_sk(sk);
	u_other = unix_sk(other);
	spin_lock(&u_other->peer_wait.lock);

	if (u->peer_wake.private == other) {
		__remove_wait_queue(&u_other->peer_wait, &u->peer_wake);
		u->peer_wake.private = NULL;
	}

	spin_unlock(&u_other->peer_wait.lock);
}

static void unix_dgram_peer_wake_disconnect_wakeup(struct sock *sk,
						   struct sock *other)
{
	unix_dgram_peer_wake_disconnect(sk, other);
	wake_up_interruptible_poll(sk_sleep(sk),
				   EPOLLOUT |
				   EPOLLWRNORM |
				   EPOLLWRBAND);
}

/* preconditions:
 *	- unix_peer(sk) == other
 *	- association is stable
 */
static int unix_dgram_peer_wake_me(struct sock *sk, struct sock *other)
{
	int connected;

	connected = unix_dgram_peer_wake_connect(sk, other);

	/* If other is SOCK_DEAD, we want to make sure we signal
	 * POLLOUT, such that a subsequent write() can get a
	 * -ECONNREFUSED. Otherwise, if we haven't queued any skbs
	 * to other and its full, we will hang waiting for POLLOUT.
	 */
	if (unix_recvq_full_lockless(other) && !sock_flag(other, SOCK_DEAD))
		return 1;

	if (connected)
		unix_dgram_peer_wake_disconnect(sk, other);

	return 0;
}

static int unix_writable(const struct sock *sk, unsigned char state)
{
	return state != TCP_LISTEN &&
		(refcount_read(&sk->sk_wmem_alloc) << 2) <= READ_ONCE(sk->sk_sndbuf);
}

static void unix_write_space(struct sock *sk)
{
	struct socket_wq *wq;

	rcu_read_lock();
	if (unix_writable(sk, READ_ONCE(sk->sk_state))) {
		wq = rcu_dereference(sk->sk_wq);
		if (skwq_has_sleeper(wq))
			wake_up_interruptible_sync_poll(&wq->wait,
				EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND);
		sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT);
	}
	rcu_read_unlock();
}

/* When dgram socket disconnects (or changes its peer), we clear its receive
 * queue of packets arrived from previous peer. First, it allows to do
 * flow control based only on wmem_alloc; second, sk connected to peer
 * may receive messages only from that peer.
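 *
 * In practice this runs when a connected datagram socket switches to a
 * new peer, drops its association via connect(AF_UNSPEC), or notices that
 * its peer has gone away: queued datagrams from the previous peer are
 * purged and, if that peer is still alive and connected back to us, it is
 * told with ECONNRESET.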
 */
static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
{
	if (!skb_queue_empty(&sk->sk_receive_queue)) {
		skb_queue_purge(&sk->sk_receive_queue);
		wake_up_interruptible_all(&unix_sk(sk)->peer_wait);

		/* If one link of bidirectional dgram pipe is
		 * disconnected, we signal error. Messages are lost.
		 * Do not make this, when peer was not connected to us.
		 */
		if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
			WRITE_ONCE(other->sk_err, ECONNRESET);
			sk_error_report(other);
		}
	}
}

static void unix_sock_destructor(struct sock *sk)
{
	struct unix_sock *u = unix_sk(sk);

	skb_queue_purge(&sk->sk_receive_queue);

	DEBUG_NET_WARN_ON_ONCE(refcount_read(&sk->sk_wmem_alloc));
	DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
	DEBUG_NET_WARN_ON_ONCE(sk->sk_socket);
	if (!sock_flag(sk, SOCK_DEAD)) {
		pr_info("Attempt to release alive unix socket: %p\n", sk);
		return;
	}

	if (u->addr)
		unix_release_addr(u->addr);

	atomic_long_dec(&unix_nr_socks);
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
#ifdef UNIX_REFCNT_DEBUG
	pr_debug("UNIX %p is destroyed, %ld are still alive.\n", sk,
		 atomic_long_read(&unix_nr_socks));
#endif
}

static void unix_release_sock(struct sock *sk, int embrion)
{
	struct unix_sock *u = unix_sk(sk);
	struct sock *skpair;
	struct sk_buff *skb;
	struct path path;
	int state;

	unix_remove_socket(sock_net(sk), sk);
	unix_remove_bsd_socket(sk);

	/* Clear state */
	unix_state_lock(sk);
	sock_orphan(sk);
	WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK);
	path = u->path;
	u->path.dentry = NULL;
	u->path.mnt = NULL;
	state = sk->sk_state;
	WRITE_ONCE(sk->sk_state, TCP_CLOSE);

	skpair = unix_peer(sk);
	unix_peer(sk) = NULL;

	unix_state_unlock(sk);

#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
	if (u->oob_skb) {
		kfree_skb(u->oob_skb);
		u->oob_skb = NULL;
	}
#endif

	wake_up_interruptible_all(&u->peer_wait);

	if (skpair != NULL) {
		if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
			unix_state_lock(skpair);
			/* No more writes */
			WRITE_ONCE(skpair->sk_shutdown, SHUTDOWN_MASK);
			if (!skb_queue_empty_lockless(&sk->sk_receive_queue) || embrion)
				WRITE_ONCE(skpair->sk_err, ECONNRESET);
			unix_state_unlock(skpair);
			skpair->sk_state_change(skpair);
			sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
		}

		unix_dgram_peer_wake_disconnect(sk, skpair);
		sock_put(skpair); /* It may now die */
	}

	/* Try to flush out this socket. Throw out buffers at least */

	while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
		if (state == TCP_LISTEN)
			unix_release_sock(skb->sk, 1);

		/* passed fds are erased in the kfree_skb hook */
		kfree_skb(skb);
	}

	if (path.dentry)
		path_put(&path);

	sock_put(sk);

	/* ---- Socket is dead now and most probably destroyed ---- */

	/*
	 * Fixme: BSD difference: In BSD all sockets connected to us get
	 *	  ECONNRESET and we die on the spot. In Linux we behave
	 *	  like files and pipes do and wait for the last
	 *	  dereference.
	 *
	 * Can't we simply set sock->err?
	 *
	 *	  What the above comment does talk about? --ANK(980817)
	 */

	if (READ_ONCE(unix_tot_inflight))
		unix_gc();	/* Garbage collect fds */
}

static void init_peercred(struct sock *sk)
{
	sk->sk_peer_pid = get_pid(task_tgid(current));
	sk->sk_peer_cred = get_current_cred();
}

static void update_peercred(struct sock *sk)
{
	const struct cred *old_cred;
	struct pid *old_pid;

	spin_lock(&sk->sk_peer_lock);
	old_pid = sk->sk_peer_pid;
	old_cred = sk->sk_peer_cred;
	init_peercred(sk);
	spin_unlock(&sk->sk_peer_lock);

	put_pid(old_pid);
	put_cred(old_cred);
}

static void copy_peercred(struct sock *sk, struct sock *peersk)
{
	lockdep_assert_held(&unix_sk(peersk)->lock);

	spin_lock(&sk->sk_peer_lock);
	sk->sk_peer_pid = get_pid(peersk->sk_peer_pid);
	sk->sk_peer_cred = get_cred(peersk->sk_peer_cred);
	spin_unlock(&sk->sk_peer_lock);
}

static int unix_listen(struct socket *sock, int backlog)
{
	int err;
	struct sock *sk = sock->sk;
	struct unix_sock *u = unix_sk(sk);

	err = -EOPNOTSUPP;
	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
		goto out;	/* Only stream/seqpacket sockets accept */
	err = -EINVAL;
	if (!READ_ONCE(u->addr))
		goto out;	/* No listens on an unbound socket */
	unix_state_lock(sk);
	if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
		goto out_unlock;
	if (backlog > sk->sk_max_ack_backlog)
		wake_up_interruptible_all(&u->peer_wait);
	sk->sk_max_ack_backlog = backlog;
	WRITE_ONCE(sk->sk_state, TCP_LISTEN);

	/* set credentials so connect can copy them */
	update_peercred(sk);
	err = 0;

out_unlock:
	unix_state_unlock(sk);
out:
	return err;
}

static int unix_release(struct socket *);
static int unix_bind(struct socket *, struct sockaddr *, int);
static int unix_stream_connect(struct socket *, struct sockaddr *,
			       int addr_len, int flags);
static int unix_socketpair(struct socket *, struct socket *);
static int unix_accept(struct socket *, struct socket *, struct proto_accept_arg *arg);
static int unix_getname(struct socket *, struct sockaddr *, int);
static __poll_t unix_poll(struct file *, struct socket *, poll_table *);
static __poll_t unix_dgram_poll(struct file *, struct socket *,
				poll_table *);
static int unix_ioctl(struct socket *, unsigned int, unsigned long);
#ifdef CONFIG_COMPAT
static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
#endif
static int unix_shutdown(struct socket *, int);
static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t);
static int unix_stream_recvmsg(struct socket *, struct msghdr *, size_t, int);
static ssize_t unix_stream_splice_read(struct socket *, loff_t *ppos,
				       struct pipe_inode_info *, size_t size,
				       unsigned int flags);
static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t);
static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int);
static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
static int unix_dgram_connect(struct socket *, struct sockaddr *,
			      int, int);
static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t);
static int unix_seqpacket_recvmsg(struct socket *, struct msghdr *, size_t,
				  int);
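/* /proc/<pid>/fdinfo/<fd> support: report how many SCM_RIGHTS file
 * descriptors are sitting in this socket's receive queue(s), printed as
 * a "scm_fds: <n>" line by unix_show_fdinfo() below.
 */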
#ifdef CONFIG_PROC_FS
static int unix_count_nr_fds(struct sock *sk)
{
	struct sk_buff *skb;
	struct unix_sock *u;
	int nr_fds = 0;

	spin_lock(&sk->sk_receive_queue.lock);
	skb = skb_peek(&sk->sk_receive_queue);
	while (skb) {
		u = unix_sk(skb->sk);
		nr_fds += atomic_read(&u->scm_stat.nr_fds);
		skb = skb_peek_next(skb, &sk->sk_receive_queue);
	}
	spin_unlock(&sk->sk_receive_queue.lock);

	return nr_fds;
}

static void unix_show_fdinfo(struct seq_file *m, struct socket *sock)
{
	struct sock *sk = sock->sk;
	unsigned char s_state;
	struct unix_sock *u;
	int nr_fds = 0;

	if (sk) {
		s_state = READ_ONCE(sk->sk_state);
		u = unix_sk(sk);

		/* SOCK_STREAM and SOCK_SEQPACKET sockets never change their
		 * sk_state after switching to TCP_ESTABLISHED or TCP_LISTEN.
		 * SOCK_DGRAM is ordinary. So, no lock is needed.
		 */
		if (sock->type == SOCK_DGRAM || s_state == TCP_ESTABLISHED)
			nr_fds = atomic_read(&u->scm_stat.nr_fds);
		else if (s_state == TCP_LISTEN)
			nr_fds = unix_count_nr_fds(sk);

		seq_printf(m, "scm_fds: %u\n", nr_fds);
	}
}
#else
#define unix_show_fdinfo NULL
#endif

static const struct proto_ops unix_stream_ops = {
	.family = PF_UNIX,
	.owner = THIS_MODULE,
	.release = unix_release,
	.bind = unix_bind,
	.connect = unix_stream_connect,
	.socketpair = unix_socketpair,
	.accept = unix_accept,
	.getname = unix_getname,
	.poll = unix_poll,
	.ioctl = unix_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl = unix_compat_ioctl,
#endif
	.listen = unix_listen,
	.shutdown = unix_shutdown,
	.sendmsg = unix_stream_sendmsg,
	.recvmsg = unix_stream_recvmsg,
	.read_skb = unix_stream_read_skb,
	.mmap = sock_no_mmap,
	.splice_read = unix_stream_splice_read,
	.set_peek_off = sk_set_peek_off,
	.show_fdinfo = unix_show_fdinfo,
};

static const struct proto_ops unix_dgram_ops = {
	.family = PF_UNIX,
	.owner = THIS_MODULE,
	.release = unix_release,
	.bind = unix_bind,
	.connect = unix_dgram_connect,
	.socketpair = unix_socketpair,
	.accept = sock_no_accept,
	.getname = unix_getname,
	.poll = unix_dgram_poll,
	.ioctl = unix_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl = unix_compat_ioctl,
#endif
	.listen = sock_no_listen,
	.shutdown = unix_shutdown,
	.sendmsg = unix_dgram_sendmsg,
	.read_skb = unix_read_skb,
	.recvmsg = unix_dgram_recvmsg,
	.mmap = sock_no_mmap,
	.set_peek_off = sk_set_peek_off,
	.show_fdinfo = unix_show_fdinfo,
};

static const struct proto_ops unix_seqpacket_ops = {
	.family = PF_UNIX,
	.owner = THIS_MODULE,
	.release = unix_release,
	.bind = unix_bind,
	.connect = unix_stream_connect,
	.socketpair = unix_socketpair,
	.accept = unix_accept,
	.getname = unix_getname,
	.poll = unix_dgram_poll,
	.ioctl = unix_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl = unix_compat_ioctl,
#endif
	.listen = unix_listen,
	.shutdown = unix_shutdown,
	.sendmsg = unix_seqpacket_sendmsg,
	.recvmsg = unix_seqpacket_recvmsg,
	.mmap = sock_no_mmap,
	.set_peek_off = sk_set_peek_off,
	.show_fdinfo = unix_show_fdinfo,
};

static void unix_close(struct sock *sk, long timeout)
{
	/* Nothing to do here, unix socket does not need a ->close().
	 * This is merely for sockmap.
	 */
}

static void unix_unhash(struct sock *sk)
{
	/* Nothing to do here, unix socket does not need a ->unhash().
	 * This is merely for sockmap.
	 */
}

static bool unix_bpf_bypass_getsockopt(int level, int optname)
{
	if (level == SOL_SOCKET) {
		switch (optname) {
		case SO_PEERPIDFD:
			return true;
		default:
			return false;
		}
	}

	return false;
}

struct proto unix_dgram_proto = {
	.name = "UNIX",
	.owner = THIS_MODULE,
	.obj_size = sizeof(struct unix_sock),
	.close = unix_close,
	.bpf_bypass_getsockopt = unix_bpf_bypass_getsockopt,
#ifdef CONFIG_BPF_SYSCALL
	.psock_update_sk_prot = unix_dgram_bpf_update_proto,
#endif
};

struct proto unix_stream_proto = {
	.name = "UNIX-STREAM",
	.owner = THIS_MODULE,
	.obj_size = sizeof(struct unix_sock),
	.close = unix_close,
	.unhash = unix_unhash,
	.bpf_bypass_getsockopt = unix_bpf_bypass_getsockopt,
#ifdef CONFIG_BPF_SYSCALL
	.psock_update_sk_prot = unix_stream_bpf_update_proto,
#endif
};

static struct sock *unix_create1(struct net *net, struct socket *sock, int kern, int type)
{
	struct unix_sock *u;
	struct sock *sk;
	int err;

	atomic_long_inc(&unix_nr_socks);
	if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files()) {
		err = -ENFILE;
		goto err;
	}

	if (type == SOCK_STREAM)
		sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_stream_proto, kern);
	else /* dgram and seqpacket */
		sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_dgram_proto, kern);

	if (!sk) {
		err = -ENOMEM;
		goto err;
	}

	sock_init_data(sock, sk);

	sk->sk_hash = unix_unbound_hash(sk);
	sk->sk_allocation = GFP_KERNEL_ACCOUNT;
	sk->sk_write_space = unix_write_space;
	sk->sk_max_ack_backlog = READ_ONCE(net->unx.sysctl_max_dgram_qlen);
	sk->sk_destruct = unix_sock_destructor;
	lock_set_cmp_fn(&sk->sk_receive_queue.lock, unix_recvq_lock_cmp_fn, NULL);

	u = unix_sk(sk);
	u->listener = NULL;
	u->vertex = NULL;
	u->path.dentry = NULL;
	u->path.mnt = NULL;
	spin_lock_init(&u->lock);
	lock_set_cmp_fn(&u->lock, unix_state_lock_cmp_fn, NULL);
	mutex_init(&u->iolock); /* single task reading lock */
	mutex_init(&u->bindlock); /* single task binding lock */
	init_waitqueue_head(&u->peer_wait);
	init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay);
	memset(&u->scm_stat, 0, sizeof(struct scm_stat));
	unix_insert_unbound_socket(net, sk);

	sock_prot_inuse_add(net, sk->sk_prot, 1);

	return sk;

err:
	atomic_long_dec(&unix_nr_socks);
	return ERR_PTR(err);
}

static int unix_create(struct net *net, struct socket *sock, int protocol,
		       int kern)
{
	struct sock *sk;

	if (protocol && protocol != PF_UNIX)
		return -EPROTONOSUPPORT;

	sock->state = SS_UNCONNECTED;

	switch (sock->type) {
	case SOCK_STREAM:
		sock->ops = &unix_stream_ops;
		break;
		/*
		 *	Believe it or not BSD has AF_UNIX, SOCK_RAW though
		 *	nothing uses it.
		 */
	case SOCK_RAW:
		sock->type = SOCK_DGRAM;
		fallthrough;
	case SOCK_DGRAM:
		sock->ops = &unix_dgram_ops;
		break;
	case SOCK_SEQPACKET:
		sock->ops = &unix_seqpacket_ops;
		break;
	default:
		return -ESOCKTNOSUPPORT;
	}

	sk = unix_create1(net, sock, kern, sock->type);
	if (IS_ERR(sk))
		return PTR_ERR(sk);

	return 0;
}

static int unix_release(struct socket *sock)
{
	struct sock *sk = sock->sk;

	if (!sk)
		return 0;

	sk->sk_prot->close(sk, 0);
	unix_release_sock(sk, 0);
	sock->sk = NULL;

	return 0;
}

static struct sock *unix_find_bsd(struct sockaddr_un *sunaddr, int addr_len,
				  int type)
{
	struct inode *inode;
	struct path path;
	struct sock *sk;
	int err;

	unix_mkname_bsd(sunaddr, addr_len);
	err = kern_path(sunaddr->sun_path, LOOKUP_FOLLOW, &path);
	if (err)
		goto fail;

	err = path_permission(&path, MAY_WRITE);
	if (err)
		goto path_put;

	err = -ECONNREFUSED;
	inode = d_backing_inode(path.dentry);
	if (!S_ISSOCK(inode->i_mode))
		goto path_put;

	sk = unix_find_socket_byinode(inode);
	if (!sk)
		goto path_put;

	err = -EPROTOTYPE;
	if (sk->sk_type == type)
		touch_atime(&path);
	else
		goto sock_put;

	path_put(&path);

	return sk;

sock_put:
	sock_put(sk);
path_put:
	path_put(&path);
fail:
	return ERR_PTR(err);
}

static struct sock *unix_find_abstract(struct net *net,
				       struct sockaddr_un *sunaddr,
				       int addr_len, int type)
{
	unsigned int hash = unix_abstract_hash(sunaddr, addr_len, type);
	struct dentry *dentry;
	struct sock *sk;

	sk = unix_find_socket_byname(net, sunaddr, addr_len, hash);
	if (!sk)
		return ERR_PTR(-ECONNREFUSED);

	dentry = unix_sk(sk)->path.dentry;
	if (dentry)
		touch_atime(&unix_sk(sk)->path);

	return sk;
}

static struct sock *unix_find_other(struct net *net,
				    struct sockaddr_un *sunaddr,
				    int addr_len, int type)
{
	struct sock *sk;

	if (sunaddr->sun_path[0])
		sk = unix_find_bsd(sunaddr, addr_len, type);
	else
		sk = unix_find_abstract(net, sunaddr, addr_len, type);

	return sk;
}

static int unix_autobind(struct sock *sk)
{
	struct unix_sock *u = unix_sk(sk);
	unsigned int new_hash, old_hash;
	struct net *net = sock_net(sk);
	struct unix_address *addr;
	u32 lastnum, ordernum;
	int err;

	err = mutex_lock_interruptible(&u->bindlock);
	if (err)
		return err;

	if (u->addr)
		goto out;

	err = -ENOMEM;
	addr = kzalloc(sizeof(*addr) +
		       offsetof(struct sockaddr_un, sun_path) + 16, GFP_KERNEL);
	if (!addr)
		goto out;

	addr->len = offsetof(struct sockaddr_un, sun_path) + 6;
	addr->name->sun_family = AF_UNIX;
	refcount_set(&addr->refcnt, 1);

	old_hash = sk->sk_hash;
	ordernum = get_random_u32();
	lastnum = ordernum & 0xFFFFF;
retry:
	ordernum = (ordernum + 1) & 0xFFFFF;
	sprintf(addr->name->sun_path + 1, "%05x", ordernum);

	new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
	unix_table_double_lock(net, old_hash, new_hash);

	if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash)) {
		unix_table_double_unlock(net, old_hash, new_hash);
		/* __unix_find_socket_byname() may take long time if many names
		 * are already in use.
		 */
		cond_resched();

		if (ordernum == lastnum) {
			/* Give up if all names seems to be in use. */
			err = -ENOSPC;
			unix_release_addr(addr);
			goto out;
		}

		goto retry;
	}

	__unix_set_addr_hash(net, sk, addr, new_hash);
	unix_table_double_unlock(net, old_hash, new_hash);
	err = 0;

out:	mutex_unlock(&u->bindlock);
	return err;
}

static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr,
			 int addr_len)
{
	umode_t mode = S_IFSOCK |
	       (SOCK_INODE(sk->sk_socket)->i_mode & ~current_umask());
	struct unix_sock *u = unix_sk(sk);
	unsigned int new_hash, old_hash;
	struct net *net = sock_net(sk);
	struct mnt_idmap *idmap;
	struct unix_address *addr;
	struct dentry *dentry;
	struct path parent;
	int err;

	addr_len = unix_mkname_bsd(sunaddr, addr_len);
	addr = unix_create_addr(sunaddr, addr_len);
	if (!addr)
		return -ENOMEM;

	/*
	 * Get the parent directory, calculate the hash for last
	 * component.
	 */
	dentry = kern_path_create(AT_FDCWD, addr->name->sun_path, &parent, 0);
	if (IS_ERR(dentry)) {
		err = PTR_ERR(dentry);
		goto out;
	}

	/*
	 * All right, let's create it.
	 */
	idmap = mnt_idmap(parent.mnt);
	err = security_path_mknod(&parent, dentry, mode, 0);
	if (!err)
		err = vfs_mknod(idmap, d_inode(parent.dentry), dentry, mode, 0);
	if (err)
		goto out_path;
	err = mutex_lock_interruptible(&u->bindlock);
	if (err)
		goto out_unlink;
	if (u->addr)
		goto out_unlock;

	old_hash = sk->sk_hash;
	new_hash = unix_bsd_hash(d_backing_inode(dentry));
	unix_table_double_lock(net, old_hash, new_hash);
	u->path.mnt = mntget(parent.mnt);
	u->path.dentry = dget(dentry);
	__unix_set_addr_hash(net, sk, addr, new_hash);
	unix_table_double_unlock(net, old_hash, new_hash);
	unix_insert_bsd_socket(sk);
	mutex_unlock(&u->bindlock);
	done_path_create(&parent, dentry);
	return 0;

out_unlock:
	mutex_unlock(&u->bindlock);
	err = -EINVAL;
out_unlink:
	/* failed after successful mknod?  unlink what we'd created... */
	vfs_unlink(idmap, d_inode(parent.dentry), dentry, NULL);
out_path:
	done_path_create(&parent, dentry);
out:
	unix_release_addr(addr);
	return err == -EEXIST ? -EADDRINUSE : err;
}
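/* Bind to an abstract name.  Unlike unix_bind_bsd() above, no filesystem
 * object is created: the address only occupies an entry in the per-netns
 * hash table (the upper half of the hash space, see unix_abstract_hash())
 * and is released automatically when the socket goes away.
 */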
static int unix_bind_abstract(struct sock *sk, struct sockaddr_un *sunaddr,
			      int addr_len)
{
	struct unix_sock *u = unix_sk(sk);
	unsigned int new_hash, old_hash;
	struct net *net = sock_net(sk);
	struct unix_address *addr;
	int err;

	addr = unix_create_addr(sunaddr, addr_len);
	if (!addr)
		return -ENOMEM;

	err = mutex_lock_interruptible(&u->bindlock);
	if (err)
		goto out;

	if (u->addr) {
		err = -EINVAL;
		goto out_mutex;
	}

	old_hash = sk->sk_hash;
	new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
	unix_table_double_lock(net, old_hash, new_hash);

	if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash))
		goto out_spin;

	__unix_set_addr_hash(net, sk, addr, new_hash);
	unix_table_double_unlock(net, old_hash, new_hash);
	mutex_unlock(&u->bindlock);
	return 0;

out_spin:
	unix_table_double_unlock(net, old_hash, new_hash);
	err = -EADDRINUSE;
out_mutex:
	mutex_unlock(&u->bindlock);
out:
	unix_release_addr(addr);
	return err;
}

static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
{
	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
	struct sock *sk = sock->sk;
	int err;

	if (addr_len == offsetof(struct sockaddr_un, sun_path) &&
	    sunaddr->sun_family == AF_UNIX)
		return unix_autobind(sk);

	err = unix_validate_addr(sunaddr, addr_len);
	if (err)
		return err;

	if (sunaddr->sun_path[0])
		err = unix_bind_bsd(sk, sunaddr, addr_len);
	else
		err = unix_bind_abstract(sk, sunaddr, addr_len);

	return err;
}

static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)
{
	if (unlikely(sk1 == sk2) || !sk2) {
		unix_state_lock(sk1);
		return;
	}

	if (sk1 > sk2)
		swap(sk1, sk2);

	unix_state_lock(sk1);
	unix_state_lock(sk2);
}

static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2)
{
	if (unlikely(sk1 == sk2) || !sk2) {
		unix_state_unlock(sk1);
		return;
	}
	unix_state_unlock(sk1);
	unix_state_unlock(sk2);
}

static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
			      int alen, int flags)
{
	struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr;
	struct sock *sk = sock->sk;
	struct sock *other;
	int err;

	err = -EINVAL;
	if (alen < offsetofend(struct sockaddr, sa_family))
		goto out;

	if (addr->sa_family != AF_UNSPEC) {
		err = unix_validate_addr(sunaddr, alen);
		if (err)
			goto out;

		err = BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk, addr, &alen);
		if (err)
			goto out;

		if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
		     test_bit(SOCK_PASSPIDFD, &sock->flags)) &&
		    !READ_ONCE(unix_sk(sk)->addr)) {
			err = unix_autobind(sk);
			if (err)
				goto out;
		}

restart:
		other = unix_find_other(sock_net(sk), sunaddr, alen, sock->type);
		if (IS_ERR(other)) {
			err = PTR_ERR(other);
			goto out;
		}

		unix_state_double_lock(sk, other);

		/* Apparently VFS overslept socket death. Retry. */
		if (sock_flag(other, SOCK_DEAD)) {
			unix_state_double_unlock(sk, other);
			sock_put(other);
			goto restart;
		}

		err = -EPERM;
		if (!unix_may_send(sk, other))
			goto out_unlock;

		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
		if (err)
			goto out_unlock;

		WRITE_ONCE(sk->sk_state, TCP_ESTABLISHED);
		WRITE_ONCE(other->sk_state, TCP_ESTABLISHED);
	} else {
		/*
		 *	1003.1g breaking connected state with AF_UNSPEC
		 */
		other = NULL;
		unix_state_double_lock(sk, other);
	}

	/*
	 * If it was connected, reconnect.
	 */
	if (unix_peer(sk)) {
		struct sock *old_peer = unix_peer(sk);

		unix_peer(sk) = other;
		if (!other)
			WRITE_ONCE(sk->sk_state, TCP_CLOSE);
		unix_dgram_peer_wake_disconnect_wakeup(sk, old_peer);

		unix_state_double_unlock(sk, other);

		if (other != old_peer) {
			unix_dgram_disconnected(sk, old_peer);

			unix_state_lock(old_peer);
			if (!unix_peer(old_peer))
				WRITE_ONCE(old_peer->sk_state, TCP_CLOSE);
			unix_state_unlock(old_peer);
		}

		sock_put(old_peer);
	} else {
		unix_peer(sk) = other;
		unix_state_double_unlock(sk, other);
	}

	return 0;

out_unlock:
	unix_state_double_unlock(sk, other);
	sock_put(other);
out:
	return err;
}

static long unix_wait_for_peer(struct sock *other, long timeo)
	__releases(&unix_sk(other)->lock)
{
	struct unix_sock *u = unix_sk(other);
	int sched;
	DEFINE_WAIT(wait);

	prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);

	sched = !sock_flag(other, SOCK_DEAD) &&
		!(other->sk_shutdown & RCV_SHUTDOWN) &&
		unix_recvq_full_lockless(other);

	unix_state_unlock(other);

	if (sched)
		timeo = schedule_timeout(timeo);

	finish_wait(&u->peer_wait, &wait);
	return timeo;
}

static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
			       int addr_len, int flags)
{
	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
	struct sock *sk = sock->sk, *newsk = NULL, *other = NULL;
	struct unix_sock *u = unix_sk(sk), *newu, *otheru;
	struct net *net = sock_net(sk);
	struct sk_buff *skb = NULL;
	unsigned char state;
	long timeo;
	int err;

	err = unix_validate_addr(sunaddr, addr_len);
	if (err)
		goto out;

	err = BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk, uaddr, &addr_len);
	if (err)
		goto out;

	if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
	     test_bit(SOCK_PASSPIDFD, &sock->flags)) &&
	    !READ_ONCE(u->addr)) {
		err = unix_autobind(sk);
		if (err)
			goto out;
	}

	timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);

	/* First of all allocate resources.
	 * If we will make it after state is locked,
	 * we will have to recheck all again in any case.
	 */

	/* create new sock for complete connection */
	newsk = unix_create1(net, NULL, 0, sock->type);
	if (IS_ERR(newsk)) {
		err = PTR_ERR(newsk);
		newsk = NULL;
		goto out;
	}

	err = -ENOMEM;

	/* Allocate skb for sending to listening sock */
	skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
	if (skb == NULL)
		goto out;

restart:
	/* Find listening sock. */
	other = unix_find_other(net, sunaddr, addr_len, sk->sk_type);
	if (IS_ERR(other)) {
		err = PTR_ERR(other);
		other = NULL;
		goto out;
	}

	unix_state_lock(other);

	/* Apparently VFS overslept socket death. Retry. */
	if (sock_flag(other, SOCK_DEAD)) {
		unix_state_unlock(other);
		sock_put(other);
		goto restart;
	}

	err = -ECONNREFUSED;
	if (other->sk_state != TCP_LISTEN)
		goto out_unlock;
	if (other->sk_shutdown & RCV_SHUTDOWN)
		goto out_unlock;

	if (unix_recvq_full_lockless(other)) {
		err = -EAGAIN;
		if (!timeo)
			goto out_unlock;

		timeo = unix_wait_for_peer(other, timeo);

		err = sock_intr_errno(timeo);
		if (signal_pending(current))
			goto out;
		sock_put(other);
		goto restart;
	}

	/* self connect and simultaneous connect are eliminated
	 * by rejecting TCP_LISTEN socket to avoid deadlock.
	 */
	state = READ_ONCE(sk->sk_state);
	if (unlikely(state != TCP_CLOSE)) {
		err = state == TCP_ESTABLISHED ? -EISCONN : -EINVAL;
		goto out_unlock;
	}

	unix_state_lock(sk);

	if (unlikely(sk->sk_state != TCP_CLOSE)) {
		err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EINVAL;
		unix_state_unlock(sk);
		goto out_unlock;
	}

	err = security_unix_stream_connect(sk, other, newsk);
	if (err) {
		unix_state_unlock(sk);
		goto out_unlock;
	}

	/* The way is open! Fastly set all the necessary fields... */

	sock_hold(sk);
	unix_peer(newsk) = sk;
	newsk->sk_state = TCP_ESTABLISHED;
	newsk->sk_type = sk->sk_type;
	init_peercred(newsk);
	newu = unix_sk(newsk);
	newu->listener = other;
	RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
	otheru = unix_sk(other);

	/* copy address information from listening to new sock
	 *
	 * The contents of *(otheru->addr) and otheru->path
	 * are seen fully set up here, since we have found
	 * otheru in hash under its lock.  Insertion into the
	 * hash chain we'd found it in had been done in an
	 * earlier critical area protected by the chain's lock,
	 * the same one where we'd set *(otheru->addr) contents,
	 * as well as otheru->path and otheru->addr itself.
	 *
	 * Using smp_store_release() here to set newu->addr
	 * is enough to make those stores, as well as stores
	 * to newu->path visible to anyone who gets newu->addr
	 * by smp_load_acquire().  IOW, the same warranties
	 * as for unix_sock instances bound in unix_bind() or
	 * in unix_autobind().
	 */
	if (otheru->path.dentry) {
		path_get(&otheru->path);
		newu->path = otheru->path;
	}
	refcount_inc(&otheru->addr->refcnt);
	smp_store_release(&newu->addr, otheru->addr);

	/* Set credentials */
	copy_peercred(sk, other);

	sock->state = SS_CONNECTED;
	WRITE_ONCE(sk->sk_state, TCP_ESTABLISHED);
	sock_hold(newsk);

	smp_mb__after_atomic();	/* sock_hold() does an atomic_inc() */
	unix_peer(sk) = newsk;

	unix_state_unlock(sk);

	/* take ten and send info to listening sock */
	spin_lock(&other->sk_receive_queue.lock);
	__skb_queue_tail(&other->sk_receive_queue, skb);
	spin_unlock(&other->sk_receive_queue.lock);
	unix_state_unlock(other);
	other->sk_data_ready(other);
	sock_put(other);
	return 0;

out_unlock:
	if (other)
		unix_state_unlock(other);

out:
	kfree_skb(skb);
	if (newsk)
		unix_release_sock(newsk, 0);
	if (other)
		sock_put(other);
	return err;
}

static int unix_socketpair(struct socket *socka, struct socket *sockb)
{
	struct sock *ska = socka->sk, *skb = sockb->sk;

	/* Join our sockets back to back */
	sock_hold(ska);
	sock_hold(skb);
	unix_peer(ska) = skb;
	unix_peer(skb) = ska;
	init_peercred(ska);
	init_peercred(skb);

	ska->sk_state = TCP_ESTABLISHED;
	skb->sk_state = TCP_ESTABLISHED;
	socka->state = SS_CONNECTED;
	sockb->state = SS_CONNECTED;
	return 0;
}

static void unix_sock_inherit_flags(const struct socket *old,
				    struct socket *new)
{
	if (test_bit(SOCK_PASSCRED, &old->flags))
		set_bit(SOCK_PASSCRED, &new->flags);
	if (test_bit(SOCK_PASSPIDFD, &old->flags))
		set_bit(SOCK_PASSPIDFD, &new->flags);
	if (test_bit(SOCK_PASSSEC, &old->flags))
		set_bit(SOCK_PASSSEC, &new->flags);
}

static int unix_accept(struct socket *sock, struct socket *newsock,
		       struct proto_accept_arg *arg)
{
	struct sock *sk = sock->sk;
	struct sk_buff *skb;
	struct sock *tsk;

	arg->err = -EOPNOTSUPP;
	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
		goto out;

	arg->err = -EINVAL;
	if (READ_ONCE(sk->sk_state) != TCP_LISTEN)
		goto out;

	/* If socket state is TCP_LISTEN it cannot change (for now...),
	 * so that no locks are necessary.
	 */

	skb = skb_recv_datagram(sk, (arg->flags & O_NONBLOCK) ? MSG_DONTWAIT : 0,
				&arg->err);
	if (!skb) {
		/* This means receive shutdown. */
		if (arg->err == 0)
			arg->err = -EINVAL;
		goto out;
	}

	tsk = skb->sk;
	skb_free_datagram(sk, skb);
	wake_up_interruptible(&unix_sk(sk)->peer_wait);

	/* attach accepted sock to socket */
	unix_state_lock(tsk);
	unix_update_edges(unix_sk(tsk));
	newsock->state = SS_CONNECTED;
	unix_sock_inherit_flags(sock, newsock);
	sock_graft(tsk, newsock);
	unix_state_unlock(tsk);
	return 0;

out:
	return arg->err;
}


static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int peer)
{
	struct sock *sk = sock->sk;
	struct unix_address *addr;
	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr);
	int err = 0;

	if (peer) {
		sk = unix_peer_get(sk);

		err = -ENOTCONN;
		if (!sk)
			goto out;
		err = 0;
	} else {
		sock_hold(sk);
	}

	addr = smp_load_acquire(&unix_sk(sk)->addr);
	if (!addr) {
		sunaddr->sun_family = AF_UNIX;
		sunaddr->sun_path[0] = 0;
		err = offsetof(struct sockaddr_un, sun_path);
	} else {
		err = addr->len;
		memcpy(sunaddr, addr->name, addr->len);

		if (peer)
			BPF_CGROUP_RUN_SA_PROG(sk, uaddr, &err,
					       CGROUP_UNIX_GETPEERNAME);
		else
			BPF_CGROUP_RUN_SA_PROG(sk, uaddr, &err,
					       CGROUP_UNIX_GETSOCKNAME);
	}
	sock_put(sk);
out:
	return err;
}

/* The "user->unix_inflight" variable is protected by the garbage
 * collection lock, and we just read it locklessly here. If you go
 * over the limit, there might be a tiny race in actually noticing
 * it across threads. Tough.
 */
static inline bool too_many_unix_fds(struct task_struct *p)
{
	struct user_struct *user = current_user();

	if (unlikely(READ_ONCE(user->unix_inflight) > task_rlimit(p, RLIMIT_NOFILE)))
		return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN);
	return false;
}

static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
{
	if (too_many_unix_fds(current))
		return -ETOOMANYREFS;

	UNIXCB(skb).fp = scm->fp;
	scm->fp = NULL;

	if (unix_prepare_fpl(UNIXCB(skb).fp))
		return -ENOMEM;

	return 0;
}

static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
{
	scm->fp = UNIXCB(skb).fp;
	UNIXCB(skb).fp = NULL;

	unix_destroy_fpl(scm->fp);
}

static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb)
{
	scm->fp = scm_fp_dup(UNIXCB(skb).fp);
}

static void unix_destruct_scm(struct sk_buff *skb)
{
	struct scm_cookie scm;

	memset(&scm, 0, sizeof(scm));
	scm.pid = UNIXCB(skb).pid;
	if (UNIXCB(skb).fp)
		unix_detach_fds(&scm, skb);

	/* Alas, it calls VFS */
	/* So fscking what? fput() had been SMP-safe since the last Summer */
	scm_destroy(&scm);
	sock_wfree(skb);
}

static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
{
	int err = 0;

	UNIXCB(skb).pid = get_pid(scm->pid);
	UNIXCB(skb).uid = scm->creds.uid;
	UNIXCB(skb).gid = scm->creds.gid;
	UNIXCB(skb).fp = NULL;
	unix_get_secdata(scm, skb);
	if (scm->fp && send_fds)
		err = unix_attach_fds(scm, skb);

	skb->destructor = unix_destruct_scm;
	return err;
}

static bool unix_passcred_enabled(const struct socket *sock,
				  const struct sock *other)
{
	return test_bit(SOCK_PASSCRED, &sock->flags) ||
	       test_bit(SOCK_PASSPIDFD, &sock->flags) ||
	       !other->sk_socket ||
	       test_bit(SOCK_PASSCRED, &other->sk_socket->flags) ||
	       test_bit(SOCK_PASSPIDFD, &other->sk_socket->flags);
}

/*
 * Some apps rely on write() giving SCM_CREDENTIALS
 * We include credentials if source or destination socket
 * asserted SOCK_PASSCRED.
 */
static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock,
			    const struct sock *other)
{
	if (UNIXCB(skb).pid)
		return;
	if (unix_passcred_enabled(sock, other)) {
		UNIXCB(skb).pid = get_pid(task_tgid(current));
		current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid);
	}
}

static bool unix_skb_scm_eq(struct sk_buff *skb,
			    struct scm_cookie *scm)
{
	return UNIXCB(skb).pid == scm->pid &&
	       uid_eq(UNIXCB(skb).uid, scm->creds.uid) &&
	       gid_eq(UNIXCB(skb).gid, scm->creds.gid) &&
	       unix_secdata_eq(scm, skb);
}

static void scm_stat_add(struct sock *sk, struct sk_buff *skb)
{
	struct scm_fp_list *fp = UNIXCB(skb).fp;
	struct unix_sock *u = unix_sk(sk);

	if (unlikely(fp && fp->count)) {
		atomic_add(fp->count, &u->scm_stat.nr_fds);
		unix_add_edges(fp, u);
	}
}

static void scm_stat_del(struct sock *sk, struct sk_buff *skb)
{
	struct scm_fp_list *fp = UNIXCB(skb).fp;
	struct unix_sock *u = unix_sk(sk);

	if (unlikely(fp && fp->count)) {
		atomic_sub(fp->count, &u->scm_stat.nr_fds);
		unix_del_edges(fp);
	}
}

/*
 *	Send AF_UNIX data.
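 *
 *	Roughly, unix_dgram_sendmsg() below copies the control data
 *	(credentials, SCM_RIGHTS fds) into the skb, resolves the
 *	destination either from msg_name or from the connected peer, and
 *	queues the skb on the receiver's sk_receive_queue unless flow
 *	control applies, in which case it blocks in unix_wait_for_peer()
 *	or registers on the peer's peer_wait queue for a later EPOLLOUT
 *	wakeup.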
 */

static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
			      size_t len)
{
	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name);
	struct sock *sk = sock->sk, *other = NULL;
	struct unix_sock *u = unix_sk(sk);
	struct scm_cookie scm;
	struct sk_buff *skb;
	int data_len = 0;
	int sk_locked;
	long timeo;
	int err;

	err = scm_send(sock, msg, &scm, false);
	if (err < 0)
		return err;

	wait_for_unix_gc(scm.fp);

	err = -EOPNOTSUPP;
	if (msg->msg_flags&MSG_OOB)
		goto out;

	if (msg->msg_namelen) {
		err = unix_validate_addr(sunaddr, msg->msg_namelen);
		if (err)
			goto out;

		err = BPF_CGROUP_RUN_PROG_UNIX_SENDMSG_LOCK(sk,
							    msg->msg_name,
							    &msg->msg_namelen,
							    NULL);
		if (err)
			goto out;
	} else {
		sunaddr = NULL;
		err = -ENOTCONN;
		other = unix_peer_get(sk);
		if (!other)
			goto out;
	}

	if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
	     test_bit(SOCK_PASSPIDFD, &sock->flags)) &&
	    !READ_ONCE(u->addr)) {
		err = unix_autobind(sk);
		if (err)
			goto out;
	}

	err = -EMSGSIZE;
	if (len > READ_ONCE(sk->sk_sndbuf) - 32)
		goto out;

	if (len > SKB_MAX_ALLOC) {
		data_len = min_t(size_t,
				 len - SKB_MAX_ALLOC,
				 MAX_SKB_FRAGS * PAGE_SIZE);
		data_len = PAGE_ALIGN(data_len);

		BUILD_BUG_ON(SKB_MAX_ALLOC < PAGE_SIZE);
	}

	skb = sock_alloc_send_pskb(sk, len - data_len, data_len,
				   msg->msg_flags & MSG_DONTWAIT, &err,
				   PAGE_ALLOC_COSTLY_ORDER);
	if (skb == NULL)
		goto out;

	err = unix_scm_to_skb(&scm, skb, true);
	if (err < 0)
		goto out_free;

	skb_put(skb, len - data_len);
	skb->data_len = data_len;
	skb->len = len;
	err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, len);
	if (err)
		goto out_free;

	timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);

restart:
	if (!other) {
		err = -ECONNRESET;
		if (sunaddr == NULL)
			goto out_free;

		other = unix_find_other(sock_net(sk), sunaddr, msg->msg_namelen,
					sk->sk_type);
		if (IS_ERR(other)) {
			err = PTR_ERR(other);
			other = NULL;
			goto out_free;
		}
	}

	if (sk_filter(other, skb) < 0) {
		/* Toss the packet but do not return any error to the sender */
		err = len;
		goto out_free;
	}

	sk_locked = 0;
	unix_state_lock(other);
restart_locked:
	err = -EPERM;
	if (!unix_may_send(sk, other))
		goto out_unlock;

	if (unlikely(sock_flag(other, SOCK_DEAD))) {
		/*
		 *	Check with 1003.1g - what should
		 *	datagram error
		 */
		unix_state_unlock(other);
		sock_put(other);

		if (!sk_locked)
			unix_state_lock(sk);

		err = 0;
		if (sk->sk_type == SOCK_SEQPACKET) {
			/* We are here only when racing with unix_release_sock()
			 * is clearing @other. Never change state to TCP_CLOSE
			 * unlike SOCK_DGRAM wants.
2093 */ 2094 unix_state_unlock(sk); 2095 err = -EPIPE; 2096 } else if (unix_peer(sk) == other) { 2097 unix_peer(sk) = NULL; 2098 unix_dgram_peer_wake_disconnect_wakeup(sk, other); 2099 2100 WRITE_ONCE(sk->sk_state, TCP_CLOSE); 2101 unix_state_unlock(sk); 2102 2103 unix_dgram_disconnected(sk, other); 2104 sock_put(other); 2105 err = -ECONNREFUSED; 2106 } else { 2107 unix_state_unlock(sk); 2108 } 2109 2110 other = NULL; 2111 if (err) 2112 goto out_free; 2113 goto restart; 2114 } 2115 2116 err = -EPIPE; 2117 if (other->sk_shutdown & RCV_SHUTDOWN) 2118 goto out_unlock; 2119 2120 if (sk->sk_type != SOCK_SEQPACKET) { 2121 err = security_unix_may_send(sk->sk_socket, other->sk_socket); 2122 if (err) 2123 goto out_unlock; 2124 } 2125 2126 /* other == sk && unix_peer(other) != sk if 2127 * - unix_peer(sk) == NULL, destination address bound to sk 2128 * - unix_peer(sk) == sk by time of get but disconnected before lock 2129 */ 2130 if (other != sk && 2131 unlikely(unix_peer(other) != sk && 2132 unix_recvq_full_lockless(other))) { 2133 if (timeo) { 2134 timeo = unix_wait_for_peer(other, timeo); 2135 2136 err = sock_intr_errno(timeo); 2137 if (signal_pending(current)) 2138 goto out_free; 2139 2140 goto restart; 2141 } 2142 2143 if (!sk_locked) { 2144 unix_state_unlock(other); 2145 unix_state_double_lock(sk, other); 2146 } 2147 2148 if (unix_peer(sk) != other || 2149 unix_dgram_peer_wake_me(sk, other)) { 2150 err = -EAGAIN; 2151 sk_locked = 1; 2152 goto out_unlock; 2153 } 2154 2155 if (!sk_locked) { 2156 sk_locked = 1; 2157 goto restart_locked; 2158 } 2159 } 2160 2161 if (unlikely(sk_locked)) 2162 unix_state_unlock(sk); 2163 2164 if (sock_flag(other, SOCK_RCVTSTAMP)) 2165 __net_timestamp(skb); 2166 maybe_add_creds(skb, sock, other); 2167 scm_stat_add(other, skb); 2168 skb_queue_tail(&other->sk_receive_queue, skb); 2169 unix_state_unlock(other); 2170 other->sk_data_ready(other); 2171 sock_put(other); 2172 scm_destroy(&scm); 2173 return len; 2174 2175 out_unlock: 2176 if (sk_locked) 2177 unix_state_unlock(sk); 2178 unix_state_unlock(other); 2179 out_free: 2180 kfree_skb(skb); 2181 out: 2182 if (other) 2183 sock_put(other); 2184 scm_destroy(&scm); 2185 return err; 2186 } 2187 2188 /* We use paged skbs for stream sockets, and limit occupancy to 32768 2189 * bytes, and a minimum of a full page. 
2190 */ 2191 #define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768)) 2192 2193 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 2194 static int queue_oob(struct socket *sock, struct msghdr *msg, struct sock *other, 2195 struct scm_cookie *scm, bool fds_sent) 2196 { 2197 struct unix_sock *ousk = unix_sk(other); 2198 struct sk_buff *skb; 2199 int err = 0; 2200 2201 skb = sock_alloc_send_skb(sock->sk, 1, msg->msg_flags & MSG_DONTWAIT, &err); 2202 2203 if (!skb) 2204 return err; 2205 2206 err = unix_scm_to_skb(scm, skb, !fds_sent); 2207 if (err < 0) { 2208 kfree_skb(skb); 2209 return err; 2210 } 2211 skb_put(skb, 1); 2212 err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, 1); 2213 2214 if (err) { 2215 kfree_skb(skb); 2216 return err; 2217 } 2218 2219 unix_state_lock(other); 2220 2221 if (sock_flag(other, SOCK_DEAD) || 2222 (other->sk_shutdown & RCV_SHUTDOWN)) { 2223 unix_state_unlock(other); 2224 kfree_skb(skb); 2225 return -EPIPE; 2226 } 2227 2228 maybe_add_creds(skb, sock, other); 2229 skb_get(skb); 2230 2231 scm_stat_add(other, skb); 2232 2233 spin_lock(&other->sk_receive_queue.lock); 2234 if (ousk->oob_skb) 2235 consume_skb(ousk->oob_skb); 2236 WRITE_ONCE(ousk->oob_skb, skb); 2237 __skb_queue_tail(&other->sk_receive_queue, skb); 2238 spin_unlock(&other->sk_receive_queue.lock); 2239 2240 sk_send_sigurg(other); 2241 unix_state_unlock(other); 2242 other->sk_data_ready(other); 2243 2244 return err; 2245 } 2246 #endif 2247 2248 static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, 2249 size_t len) 2250 { 2251 struct sock *sk = sock->sk; 2252 struct sock *other = NULL; 2253 int err, size; 2254 struct sk_buff *skb; 2255 int sent = 0; 2256 struct scm_cookie scm; 2257 bool fds_sent = false; 2258 int data_len; 2259 2260 err = scm_send(sock, msg, &scm, false); 2261 if (err < 0) 2262 return err; 2263 2264 wait_for_unix_gc(scm.fp); 2265 2266 err = -EOPNOTSUPP; 2267 if (msg->msg_flags & MSG_OOB) { 2268 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 2269 if (len) 2270 len--; 2271 else 2272 #endif 2273 goto out_err; 2274 } 2275 2276 if (msg->msg_namelen) { 2277 err = READ_ONCE(sk->sk_state) == TCP_ESTABLISHED ? 
-EISCONN : -EOPNOTSUPP; 2278 goto out_err; 2279 } else { 2280 err = -ENOTCONN; 2281 other = unix_peer(sk); 2282 if (!other) 2283 goto out_err; 2284 } 2285 2286 if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN) 2287 goto pipe_err; 2288 2289 while (sent < len) { 2290 size = len - sent; 2291 2292 if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) { 2293 skb = sock_alloc_send_pskb(sk, 0, 0, 2294 msg->msg_flags & MSG_DONTWAIT, 2295 &err, 0); 2296 } else { 2297 /* Keep two messages in the pipe so it schedules better */ 2298 size = min_t(int, size, (READ_ONCE(sk->sk_sndbuf) >> 1) - 64); 2299 2300 /* allow fallback to order-0 allocations */ 2301 size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ); 2302 2303 data_len = max_t(int, 0, size - SKB_MAX_HEAD(0)); 2304 2305 data_len = min_t(size_t, size, PAGE_ALIGN(data_len)); 2306 2307 skb = sock_alloc_send_pskb(sk, size - data_len, data_len, 2308 msg->msg_flags & MSG_DONTWAIT, &err, 2309 get_order(UNIX_SKB_FRAGS_SZ)); 2310 } 2311 if (!skb) 2312 goto out_err; 2313 2314 /* Only send the fds in the first buffer */ 2315 err = unix_scm_to_skb(&scm, skb, !fds_sent); 2316 if (err < 0) { 2317 kfree_skb(skb); 2318 goto out_err; 2319 } 2320 fds_sent = true; 2321 2322 if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) { 2323 err = skb_splice_from_iter(skb, &msg->msg_iter, size, 2324 sk->sk_allocation); 2325 if (err < 0) { 2326 kfree_skb(skb); 2327 goto out_err; 2328 } 2329 size = err; 2330 refcount_add(size, &sk->sk_wmem_alloc); 2331 } else { 2332 skb_put(skb, size - data_len); 2333 skb->data_len = data_len; 2334 skb->len = size; 2335 err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size); 2336 if (err) { 2337 kfree_skb(skb); 2338 goto out_err; 2339 } 2340 } 2341 2342 unix_state_lock(other); 2343 2344 if (sock_flag(other, SOCK_DEAD) || 2345 (other->sk_shutdown & RCV_SHUTDOWN)) 2346 goto pipe_err_free; 2347 2348 maybe_add_creds(skb, sock, other); 2349 scm_stat_add(other, skb); 2350 skb_queue_tail(&other->sk_receive_queue, skb); 2351 unix_state_unlock(other); 2352 other->sk_data_ready(other); 2353 sent += size; 2354 } 2355 2356 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 2357 if (msg->msg_flags & MSG_OOB) { 2358 err = queue_oob(sock, msg, other, &scm, fds_sent); 2359 if (err) 2360 goto out_err; 2361 sent++; 2362 } 2363 #endif 2364 2365 scm_destroy(&scm); 2366 2367 return sent; 2368 2369 pipe_err_free: 2370 unix_state_unlock(other); 2371 kfree_skb(skb); 2372 pipe_err: 2373 if (sent == 0 && !(msg->msg_flags&MSG_NOSIGNAL)) 2374 send_sig(SIGPIPE, current, 0); 2375 err = -EPIPE; 2376 out_err: 2377 scm_destroy(&scm); 2378 return sent ? 
: err; 2379 } 2380 2381 static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg, 2382 size_t len) 2383 { 2384 int err; 2385 struct sock *sk = sock->sk; 2386 2387 err = sock_error(sk); 2388 if (err) 2389 return err; 2390 2391 if (READ_ONCE(sk->sk_state) != TCP_ESTABLISHED) 2392 return -ENOTCONN; 2393 2394 if (msg->msg_namelen) 2395 msg->msg_namelen = 0; 2396 2397 return unix_dgram_sendmsg(sock, msg, len); 2398 } 2399 2400 static int unix_seqpacket_recvmsg(struct socket *sock, struct msghdr *msg, 2401 size_t size, int flags) 2402 { 2403 struct sock *sk = sock->sk; 2404 2405 if (READ_ONCE(sk->sk_state) != TCP_ESTABLISHED) 2406 return -ENOTCONN; 2407 2408 return unix_dgram_recvmsg(sock, msg, size, flags); 2409 } 2410 2411 static void unix_copy_addr(struct msghdr *msg, struct sock *sk) 2412 { 2413 struct unix_address *addr = smp_load_acquire(&unix_sk(sk)->addr); 2414 2415 if (addr) { 2416 msg->msg_namelen = addr->len; 2417 memcpy(msg->msg_name, addr->name, addr->len); 2418 } 2419 } 2420 2421 int __unix_dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t size, 2422 int flags) 2423 { 2424 struct scm_cookie scm; 2425 struct socket *sock = sk->sk_socket; 2426 struct unix_sock *u = unix_sk(sk); 2427 struct sk_buff *skb, *last; 2428 long timeo; 2429 int skip; 2430 int err; 2431 2432 err = -EOPNOTSUPP; 2433 if (flags&MSG_OOB) 2434 goto out; 2435 2436 timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); 2437 2438 do { 2439 mutex_lock(&u->iolock); 2440 2441 skip = sk_peek_offset(sk, flags); 2442 skb = __skb_try_recv_datagram(sk, &sk->sk_receive_queue, flags, 2443 &skip, &err, &last); 2444 if (skb) { 2445 if (!(flags & MSG_PEEK)) 2446 scm_stat_del(sk, skb); 2447 break; 2448 } 2449 2450 mutex_unlock(&u->iolock); 2451 2452 if (err != -EAGAIN) 2453 break; 2454 } while (timeo && 2455 !__skb_wait_for_more_packets(sk, &sk->sk_receive_queue, 2456 &err, &timeo, last)); 2457 2458 if (!skb) { /* implies iolock unlocked */ 2459 unix_state_lock(sk); 2460 /* Signal EOF on disconnected non-blocking SEQPACKET socket. */ 2461 if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN && 2462 (sk->sk_shutdown & RCV_SHUTDOWN)) 2463 err = 0; 2464 unix_state_unlock(sk); 2465 goto out; 2466 } 2467 2468 if (wq_has_sleeper(&u->peer_wait)) 2469 wake_up_interruptible_sync_poll(&u->peer_wait, 2470 EPOLLOUT | EPOLLWRNORM | 2471 EPOLLWRBAND); 2472 2473 if (msg->msg_name) { 2474 unix_copy_addr(msg, skb->sk); 2475 2476 BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk, 2477 msg->msg_name, 2478 &msg->msg_namelen); 2479 } 2480 2481 if (size > skb->len - skip) 2482 size = skb->len - skip; 2483 else if (size < skb->len - skip) 2484 msg->msg_flags |= MSG_TRUNC; 2485 2486 err = skb_copy_datagram_msg(skb, skip, msg, size); 2487 if (err) 2488 goto out_free; 2489 2490 if (sock_flag(sk, SOCK_RCVTSTAMP)) 2491 __sock_recv_timestamp(msg, sk, skb); 2492 2493 memset(&scm, 0, sizeof(scm)); 2494 2495 scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid); 2496 unix_set_secdata(&scm, skb); 2497 2498 if (!(flags & MSG_PEEK)) { 2499 if (UNIXCB(skb).fp) 2500 unix_detach_fds(&scm, skb); 2501 2502 sk_peek_offset_bwd(sk, skb->len); 2503 } else { 2504 /* It is questionable: on PEEK we could: 2505 - do not return fds - good, but too simple 8) 2506 - return fds, and do not return them on read (old strategy, 2507 apparently wrong) 2508 - clone fds (I chose it for now, it is the most universal 2509 solution) 2510 2511 POSIX 1003.1g does not actually define this clearly 2512 at all. 
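
		   Editor's illustration (not normative): with the clone-on-peek
		   choice, a receiver that peeks sees a fresh copy of any passed
		   descriptors, and the same SCM_RIGHTS data is delivered again
		   by the later non-peeking recvmsg().  A hedged userspace
		   sketch - "sock" is the receiving AF_UNIX socket, everything
		   else is an arbitrary example:

			#include <string.h>
			#include <sys/socket.h>

			// Peek one message and return a passed fd, or -1 if none.
			static int peek_passed_fd(int sock)
			{
				union {
					char buf[CMSG_SPACE(sizeof(int))];
					struct cmsghdr align;
				} u;
				char byte;
				struct iovec iov = { .iov_base = &byte, .iov_len = 1 };
				struct msghdr msg = {
					.msg_iov	= &iov,
					.msg_iovlen	= 1,
					.msg_control	= u.buf,
					.msg_controllen	= sizeof(u.buf),
				};
				struct cmsghdr *cmsg;
				int fd = -1;

				if (recvmsg(sock, &msg, MSG_PEEK) < 0)
					return -1;

				for (cmsg = CMSG_FIRSTHDR(&msg); cmsg;
				     cmsg = CMSG_NXTHDR(&msg, cmsg))
					if (cmsg->cmsg_level == SOL_SOCKET &&
					    cmsg->cmsg_type == SCM_RIGHTS)
						memcpy(&fd, CMSG_DATA(cmsg), sizeof(fd));

				// fd, if >= 0, is a brand-new descriptor; close() it
				// independently of the copy the real read installs.
				return fd;
			}
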
POSIX 1003.1g doesn't define a lot of things 2513 clearly however! 2514 2515 */ 2516 2517 sk_peek_offset_fwd(sk, size); 2518 2519 if (UNIXCB(skb).fp) 2520 unix_peek_fds(&scm, skb); 2521 } 2522 err = (flags & MSG_TRUNC) ? skb->len - skip : size; 2523 2524 scm_recv_unix(sock, msg, &scm, flags); 2525 2526 out_free: 2527 skb_free_datagram(sk, skb); 2528 mutex_unlock(&u->iolock); 2529 out: 2530 return err; 2531 } 2532 2533 static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, 2534 int flags) 2535 { 2536 struct sock *sk = sock->sk; 2537 2538 #ifdef CONFIG_BPF_SYSCALL 2539 const struct proto *prot = READ_ONCE(sk->sk_prot); 2540 2541 if (prot != &unix_dgram_proto) 2542 return prot->recvmsg(sk, msg, size, flags, NULL); 2543 #endif 2544 return __unix_dgram_recvmsg(sk, msg, size, flags); 2545 } 2546 2547 static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor) 2548 { 2549 struct unix_sock *u = unix_sk(sk); 2550 struct sk_buff *skb; 2551 int err; 2552 2553 mutex_lock(&u->iolock); 2554 skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err); 2555 mutex_unlock(&u->iolock); 2556 if (!skb) 2557 return err; 2558 2559 return recv_actor(sk, skb); 2560 } 2561 2562 /* 2563 * Sleep until more data has arrived. But check for races.. 2564 */ 2565 static long unix_stream_data_wait(struct sock *sk, long timeo, 2566 struct sk_buff *last, unsigned int last_len, 2567 bool freezable) 2568 { 2569 unsigned int state = TASK_INTERRUPTIBLE | freezable * TASK_FREEZABLE; 2570 struct sk_buff *tail; 2571 DEFINE_WAIT(wait); 2572 2573 unix_state_lock(sk); 2574 2575 for (;;) { 2576 prepare_to_wait(sk_sleep(sk), &wait, state); 2577 2578 tail = skb_peek_tail(&sk->sk_receive_queue); 2579 if (tail != last || 2580 (tail && tail->len != last_len) || 2581 sk->sk_err || 2582 (sk->sk_shutdown & RCV_SHUTDOWN) || 2583 signal_pending(current) || 2584 !timeo) 2585 break; 2586 2587 sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); 2588 unix_state_unlock(sk); 2589 timeo = schedule_timeout(timeo); 2590 unix_state_lock(sk); 2591 2592 if (sock_flag(sk, SOCK_DEAD)) 2593 break; 2594 2595 sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); 2596 } 2597 2598 finish_wait(sk_sleep(sk), &wait); 2599 unix_state_unlock(sk); 2600 return timeo; 2601 } 2602 2603 static unsigned int unix_skb_len(const struct sk_buff *skb) 2604 { 2605 return skb->len - UNIXCB(skb).consumed; 2606 } 2607 2608 struct unix_stream_read_state { 2609 int (*recv_actor)(struct sk_buff *, int, int, 2610 struct unix_stream_read_state *); 2611 struct socket *socket; 2612 struct msghdr *msg; 2613 struct pipe_inode_info *pipe; 2614 size_t size; 2615 int flags; 2616 unsigned int splice_flags; 2617 }; 2618 2619 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 2620 static int unix_stream_recv_urg(struct unix_stream_read_state *state) 2621 { 2622 struct socket *sock = state->socket; 2623 struct sock *sk = sock->sk; 2624 struct unix_sock *u = unix_sk(sk); 2625 int chunk = 1; 2626 struct sk_buff *oob_skb; 2627 2628 mutex_lock(&u->iolock); 2629 unix_state_lock(sk); 2630 spin_lock(&sk->sk_receive_queue.lock); 2631 2632 if (sock_flag(sk, SOCK_URGINLINE) || !u->oob_skb) { 2633 spin_unlock(&sk->sk_receive_queue.lock); 2634 unix_state_unlock(sk); 2635 mutex_unlock(&u->iolock); 2636 return -EINVAL; 2637 } 2638 2639 oob_skb = u->oob_skb; 2640 2641 if (!(state->flags & MSG_PEEK)) 2642 WRITE_ONCE(u->oob_skb, NULL); 2643 else 2644 skb_get(oob_skb); 2645 2646 spin_unlock(&sk->sk_receive_queue.lock); 2647 unix_state_unlock(sk); 2648 2649 chunk = state->recv_actor(oob_skb, 0, chunk, state); 2650 2651 if 
(!(state->flags & MSG_PEEK)) 2652 UNIXCB(oob_skb).consumed += 1; 2653 2654 consume_skb(oob_skb); 2655 2656 mutex_unlock(&u->iolock); 2657 2658 if (chunk < 0) 2659 return -EFAULT; 2660 2661 state->msg->msg_flags |= MSG_OOB; 2662 return 1; 2663 } 2664 2665 static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk, 2666 int flags, int copied) 2667 { 2668 struct unix_sock *u = unix_sk(sk); 2669 2670 if (!unix_skb_len(skb)) { 2671 struct sk_buff *unlinked_skb = NULL; 2672 2673 spin_lock(&sk->sk_receive_queue.lock); 2674 2675 if (copied && (!u->oob_skb || skb == u->oob_skb)) { 2676 skb = NULL; 2677 } else if (flags & MSG_PEEK) { 2678 skb = skb_peek_next(skb, &sk->sk_receive_queue); 2679 } else { 2680 unlinked_skb = skb; 2681 skb = skb_peek_next(skb, &sk->sk_receive_queue); 2682 __skb_unlink(unlinked_skb, &sk->sk_receive_queue); 2683 } 2684 2685 spin_unlock(&sk->sk_receive_queue.lock); 2686 2687 consume_skb(unlinked_skb); 2688 } else { 2689 struct sk_buff *unlinked_skb = NULL; 2690 2691 spin_lock(&sk->sk_receive_queue.lock); 2692 2693 if (skb == u->oob_skb) { 2694 if (copied) { 2695 skb = NULL; 2696 } else if (!(flags & MSG_PEEK)) { 2697 if (sock_flag(sk, SOCK_URGINLINE)) { 2698 WRITE_ONCE(u->oob_skb, NULL); 2699 consume_skb(skb); 2700 } else { 2701 __skb_unlink(skb, &sk->sk_receive_queue); 2702 WRITE_ONCE(u->oob_skb, NULL); 2703 unlinked_skb = skb; 2704 skb = skb_peek(&sk->sk_receive_queue); 2705 } 2706 } else if (!sock_flag(sk, SOCK_URGINLINE)) { 2707 skb = skb_peek_next(skb, &sk->sk_receive_queue); 2708 } 2709 } 2710 2711 spin_unlock(&sk->sk_receive_queue.lock); 2712 2713 if (unlinked_skb) { 2714 WARN_ON_ONCE(skb_unref(unlinked_skb)); 2715 kfree_skb(unlinked_skb); 2716 } 2717 } 2718 return skb; 2719 } 2720 #endif 2721 2722 static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor) 2723 { 2724 if (unlikely(READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)) 2725 return -ENOTCONN; 2726 2727 return unix_read_skb(sk, recv_actor); 2728 } 2729 2730 static int unix_stream_read_generic(struct unix_stream_read_state *state, 2731 bool freezable) 2732 { 2733 struct scm_cookie scm; 2734 struct socket *sock = state->socket; 2735 struct sock *sk = sock->sk; 2736 struct unix_sock *u = unix_sk(sk); 2737 int copied = 0; 2738 int flags = state->flags; 2739 int noblock = flags & MSG_DONTWAIT; 2740 bool check_creds = false; 2741 int target; 2742 int err = 0; 2743 long timeo; 2744 int skip; 2745 size_t size = state->size; 2746 unsigned int last_len; 2747 2748 if (unlikely(READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)) { 2749 err = -EINVAL; 2750 goto out; 2751 } 2752 2753 if (unlikely(flags & MSG_OOB)) { 2754 err = -EOPNOTSUPP; 2755 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 2756 err = unix_stream_recv_urg(state); 2757 #endif 2758 goto out; 2759 } 2760 2761 target = sock_rcvlowat(sk, flags & MSG_WAITALL, size); 2762 timeo = sock_rcvtimeo(sk, noblock); 2763 2764 memset(&scm, 0, sizeof(scm)); 2765 2766 /* Lock the socket to prevent queue disordering 2767 * while sleeps in memcpy_tomsg 2768 */ 2769 mutex_lock(&u->iolock); 2770 2771 skip = max(sk_peek_offset(sk, flags), 0); 2772 2773 do { 2774 struct sk_buff *skb, *last; 2775 int chunk; 2776 2777 redo: 2778 unix_state_lock(sk); 2779 if (sock_flag(sk, SOCK_DEAD)) { 2780 err = -ECONNRESET; 2781 goto unlock; 2782 } 2783 last = skb = skb_peek(&sk->sk_receive_queue); 2784 last_len = last ? 
last->len : 0; 2785 2786 again: 2787 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 2788 if (skb) { 2789 skb = manage_oob(skb, sk, flags, copied); 2790 if (!skb && copied) { 2791 unix_state_unlock(sk); 2792 break; 2793 } 2794 } 2795 #endif 2796 if (skb == NULL) { 2797 if (copied >= target) 2798 goto unlock; 2799 2800 /* 2801 * POSIX 1003.1g mandates this order. 2802 */ 2803 2804 err = sock_error(sk); 2805 if (err) 2806 goto unlock; 2807 if (sk->sk_shutdown & RCV_SHUTDOWN) 2808 goto unlock; 2809 2810 unix_state_unlock(sk); 2811 if (!timeo) { 2812 err = -EAGAIN; 2813 break; 2814 } 2815 2816 mutex_unlock(&u->iolock); 2817 2818 timeo = unix_stream_data_wait(sk, timeo, last, 2819 last_len, freezable); 2820 2821 if (signal_pending(current)) { 2822 err = sock_intr_errno(timeo); 2823 scm_destroy(&scm); 2824 goto out; 2825 } 2826 2827 mutex_lock(&u->iolock); 2828 goto redo; 2829 unlock: 2830 unix_state_unlock(sk); 2831 break; 2832 } 2833 2834 while (skip >= unix_skb_len(skb)) { 2835 skip -= unix_skb_len(skb); 2836 last = skb; 2837 last_len = skb->len; 2838 skb = skb_peek_next(skb, &sk->sk_receive_queue); 2839 if (!skb) 2840 goto again; 2841 } 2842 2843 unix_state_unlock(sk); 2844 2845 if (check_creds) { 2846 /* Never glue messages from different writers */ 2847 if (!unix_skb_scm_eq(skb, &scm)) 2848 break; 2849 } else if (test_bit(SOCK_PASSCRED, &sock->flags) || 2850 test_bit(SOCK_PASSPIDFD, &sock->flags)) { 2851 /* Copy credentials */ 2852 scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid); 2853 unix_set_secdata(&scm, skb); 2854 check_creds = true; 2855 } 2856 2857 /* Copy address just once */ 2858 if (state->msg && state->msg->msg_name) { 2859 DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, 2860 state->msg->msg_name); 2861 unix_copy_addr(state->msg, skb->sk); 2862 2863 BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk, 2864 state->msg->msg_name, 2865 &state->msg->msg_namelen); 2866 2867 sunaddr = NULL; 2868 } 2869 2870 chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size); 2871 chunk = state->recv_actor(skb, skip, chunk, state); 2872 if (chunk < 0) { 2873 if (copied == 0) 2874 copied = -EFAULT; 2875 break; 2876 } 2877 copied += chunk; 2878 size -= chunk; 2879 2880 /* Mark read part of skb as used */ 2881 if (!(flags & MSG_PEEK)) { 2882 UNIXCB(skb).consumed += chunk; 2883 2884 sk_peek_offset_bwd(sk, chunk); 2885 2886 if (UNIXCB(skb).fp) { 2887 scm_stat_del(sk, skb); 2888 unix_detach_fds(&scm, skb); 2889 } 2890 2891 if (unix_skb_len(skb)) 2892 break; 2893 2894 skb_unlink(skb, &sk->sk_receive_queue); 2895 consume_skb(skb); 2896 2897 if (scm.fp) 2898 break; 2899 } else { 2900 /* It is questionable, see note in unix_dgram_recvmsg. 2901 */ 2902 if (UNIXCB(skb).fp) 2903 unix_peek_fds(&scm, skb); 2904 2905 sk_peek_offset_fwd(sk, chunk); 2906 2907 if (UNIXCB(skb).fp) 2908 break; 2909 2910 skip = 0; 2911 last = skb; 2912 last_len = skb->len; 2913 unix_state_lock(sk); 2914 skb = skb_peek_next(skb, &sk->sk_receive_queue); 2915 if (skb) 2916 goto again; 2917 unix_state_unlock(sk); 2918 break; 2919 } 2920 } while (size); 2921 2922 mutex_unlock(&u->iolock); 2923 if (state->msg) 2924 scm_recv_unix(sock, state->msg, &scm, flags); 2925 else 2926 scm_destroy(&scm); 2927 out: 2928 return copied ? 
: err; 2929 } 2930 2931 static int unix_stream_read_actor(struct sk_buff *skb, 2932 int skip, int chunk, 2933 struct unix_stream_read_state *state) 2934 { 2935 int ret; 2936 2937 ret = skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip, 2938 state->msg, chunk); 2939 return ret ?: chunk; 2940 } 2941 2942 int __unix_stream_recvmsg(struct sock *sk, struct msghdr *msg, 2943 size_t size, int flags) 2944 { 2945 struct unix_stream_read_state state = { 2946 .recv_actor = unix_stream_read_actor, 2947 .socket = sk->sk_socket, 2948 .msg = msg, 2949 .size = size, 2950 .flags = flags 2951 }; 2952 2953 return unix_stream_read_generic(&state, true); 2954 } 2955 2956 static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg, 2957 size_t size, int flags) 2958 { 2959 struct unix_stream_read_state state = { 2960 .recv_actor = unix_stream_read_actor, 2961 .socket = sock, 2962 .msg = msg, 2963 .size = size, 2964 .flags = flags 2965 }; 2966 2967 #ifdef CONFIG_BPF_SYSCALL 2968 struct sock *sk = sock->sk; 2969 const struct proto *prot = READ_ONCE(sk->sk_prot); 2970 2971 if (prot != &unix_stream_proto) 2972 return prot->recvmsg(sk, msg, size, flags, NULL); 2973 #endif 2974 return unix_stream_read_generic(&state, true); 2975 } 2976 2977 static int unix_stream_splice_actor(struct sk_buff *skb, 2978 int skip, int chunk, 2979 struct unix_stream_read_state *state) 2980 { 2981 return skb_splice_bits(skb, state->socket->sk, 2982 UNIXCB(skb).consumed + skip, 2983 state->pipe, chunk, state->splice_flags); 2984 } 2985 2986 static ssize_t unix_stream_splice_read(struct socket *sock, loff_t *ppos, 2987 struct pipe_inode_info *pipe, 2988 size_t size, unsigned int flags) 2989 { 2990 struct unix_stream_read_state state = { 2991 .recv_actor = unix_stream_splice_actor, 2992 .socket = sock, 2993 .pipe = pipe, 2994 .size = size, 2995 .splice_flags = flags, 2996 }; 2997 2998 if (unlikely(*ppos)) 2999 return -ESPIPE; 3000 3001 if (sock->file->f_flags & O_NONBLOCK || 3002 flags & SPLICE_F_NONBLOCK) 3003 state.flags = MSG_DONTWAIT; 3004 3005 return unix_stream_read_generic(&state, false); 3006 } 3007 3008 static int unix_shutdown(struct socket *sock, int mode) 3009 { 3010 struct sock *sk = sock->sk; 3011 struct sock *other; 3012 3013 if (mode < SHUT_RD || mode > SHUT_RDWR) 3014 return -EINVAL; 3015 /* This maps: 3016 * SHUT_RD (0) -> RCV_SHUTDOWN (1) 3017 * SHUT_WR (1) -> SEND_SHUTDOWN (2) 3018 * SHUT_RDWR (2) -> SHUTDOWN_MASK (3) 3019 */ 3020 ++mode; 3021 3022 unix_state_lock(sk); 3023 WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | mode); 3024 other = unix_peer(sk); 3025 if (other) 3026 sock_hold(other); 3027 unix_state_unlock(sk); 3028 sk->sk_state_change(sk); 3029 3030 if (other && 3031 (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) { 3032 3033 int peer_mode = 0; 3034 const struct proto *prot = READ_ONCE(other->sk_prot); 3035 3036 if (prot->unhash) 3037 prot->unhash(other); 3038 if (mode&RCV_SHUTDOWN) 3039 peer_mode |= SEND_SHUTDOWN; 3040 if (mode&SEND_SHUTDOWN) 3041 peer_mode |= RCV_SHUTDOWN; 3042 unix_state_lock(other); 3043 WRITE_ONCE(other->sk_shutdown, other->sk_shutdown | peer_mode); 3044 unix_state_unlock(other); 3045 other->sk_state_change(other); 3046 if (peer_mode == SHUTDOWN_MASK) 3047 sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP); 3048 else if (peer_mode & RCV_SHUTDOWN) 3049 sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN); 3050 } 3051 if (other) 3052 sock_put(other); 3053 3054 return 0; 3055 } 3056 3057 long unix_inq_len(struct sock *sk) 3058 { 3059 struct sk_buff *skb; 3060 long amount = 
0; 3061 3062 if (READ_ONCE(sk->sk_state) == TCP_LISTEN) 3063 return -EINVAL; 3064 3065 spin_lock(&sk->sk_receive_queue.lock); 3066 if (sk->sk_type == SOCK_STREAM || 3067 sk->sk_type == SOCK_SEQPACKET) { 3068 skb_queue_walk(&sk->sk_receive_queue, skb) 3069 amount += unix_skb_len(skb); 3070 } else { 3071 skb = skb_peek(&sk->sk_receive_queue); 3072 if (skb) 3073 amount = skb->len; 3074 } 3075 spin_unlock(&sk->sk_receive_queue.lock); 3076 3077 return amount; 3078 } 3079 EXPORT_SYMBOL_GPL(unix_inq_len); 3080 3081 long unix_outq_len(struct sock *sk) 3082 { 3083 return sk_wmem_alloc_get(sk); 3084 } 3085 EXPORT_SYMBOL_GPL(unix_outq_len); 3086 3087 static int unix_open_file(struct sock *sk) 3088 { 3089 struct path path; 3090 struct file *f; 3091 int fd; 3092 3093 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) 3094 return -EPERM; 3095 3096 if (!smp_load_acquire(&unix_sk(sk)->addr)) 3097 return -ENOENT; 3098 3099 path = unix_sk(sk)->path; 3100 if (!path.dentry) 3101 return -ENOENT; 3102 3103 path_get(&path); 3104 3105 fd = get_unused_fd_flags(O_CLOEXEC); 3106 if (fd < 0) 3107 goto out; 3108 3109 f = dentry_open(&path, O_PATH, current_cred()); 3110 if (IS_ERR(f)) { 3111 put_unused_fd(fd); 3112 fd = PTR_ERR(f); 3113 goto out; 3114 } 3115 3116 fd_install(fd, f); 3117 out: 3118 path_put(&path); 3119 3120 return fd; 3121 } 3122 3123 static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) 3124 { 3125 struct sock *sk = sock->sk; 3126 long amount = 0; 3127 int err; 3128 3129 switch (cmd) { 3130 case SIOCOUTQ: 3131 amount = unix_outq_len(sk); 3132 err = put_user(amount, (int __user *)arg); 3133 break; 3134 case SIOCINQ: 3135 amount = unix_inq_len(sk); 3136 if (amount < 0) 3137 err = amount; 3138 else 3139 err = put_user(amount, (int __user *)arg); 3140 break; 3141 case SIOCUNIXFILE: 3142 err = unix_open_file(sk); 3143 break; 3144 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 3145 case SIOCATMARK: 3146 { 3147 struct unix_sock *u = unix_sk(sk); 3148 struct sk_buff *skb; 3149 int answ = 0; 3150 3151 mutex_lock(&u->iolock); 3152 3153 skb = skb_peek(&sk->sk_receive_queue); 3154 if (skb) { 3155 struct sk_buff *oob_skb = READ_ONCE(u->oob_skb); 3156 3157 if (skb == oob_skb || 3158 (!oob_skb && !unix_skb_len(skb))) 3159 answ = 1; 3160 } 3161 3162 mutex_unlock(&u->iolock); 3163 3164 err = put_user(answ, (int __user *)arg); 3165 } 3166 break; 3167 #endif 3168 default: 3169 err = -ENOIOCTLCMD; 3170 break; 3171 } 3172 return err; 3173 } 3174 3175 #ifdef CONFIG_COMPAT 3176 static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) 3177 { 3178 return unix_ioctl(sock, cmd, (unsigned long)compat_ptr(arg)); 3179 } 3180 #endif 3181 3182 static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wait) 3183 { 3184 struct sock *sk = sock->sk; 3185 unsigned char state; 3186 __poll_t mask; 3187 u8 shutdown; 3188 3189 sock_poll_wait(file, sock, wait); 3190 mask = 0; 3191 shutdown = READ_ONCE(sk->sk_shutdown); 3192 state = READ_ONCE(sk->sk_state); 3193 3194 /* exceptional events? */ 3195 if (READ_ONCE(sk->sk_err)) 3196 mask |= EPOLLERR; 3197 if (shutdown == SHUTDOWN_MASK) 3198 mask |= EPOLLHUP; 3199 if (shutdown & RCV_SHUTDOWN) 3200 mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM; 3201 3202 /* readable? 
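
   Editor's note (illustrative only, not part of this file): a sketch of how
   a userspace reader might consume the bits reported here - ordinary data
   via POLLIN and, with CONFIG_AF_UNIX_OOB, the single out-of-band byte via
   POLLPRI / SIOCATMARK / MSG_OOB (see unix_ioctl() above and the EPOLLPRI
   test just below).  "sock" is assumed to be a connected SOCK_STREAM
   descriptor; error handling is omitted.

	#include <poll.h>
	#include <sys/ioctl.h>
	#include <sys/socket.h>

	static void drain_once(int sock)
	{
		struct pollfd pfd = { .fd = sock, .events = POLLIN | POLLPRI };
		char buf[256];
		char oob;

		poll(&pfd, 1, -1);

		if (pfd.revents & POLLPRI) {
			int at_mark = 0;

			// SIOCATMARK reports whether the next read is at the
			// OOB mark; the byte itself is fetched with MSG_OOB.
			ioctl(sock, SIOCATMARK, &at_mark);
			recv(sock, &oob, 1, MSG_OOB);
		}

		if (pfd.revents & POLLIN)
			recv(sock, buf, sizeof(buf), 0);
	}
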
*/ 3203 if (!skb_queue_empty_lockless(&sk->sk_receive_queue)) 3204 mask |= EPOLLIN | EPOLLRDNORM; 3205 if (sk_is_readable(sk)) 3206 mask |= EPOLLIN | EPOLLRDNORM; 3207 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 3208 if (READ_ONCE(unix_sk(sk)->oob_skb)) 3209 mask |= EPOLLPRI; 3210 #endif 3211 3212 /* Connection-based need to check for termination and startup */ 3213 if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) && 3214 state == TCP_CLOSE) 3215 mask |= EPOLLHUP; 3216 3217 /* 3218 * we set writable also when the other side has shut down the 3219 * connection. This prevents stuck sockets. 3220 */ 3221 if (unix_writable(sk, state)) 3222 mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND; 3223 3224 return mask; 3225 } 3226 3227 static __poll_t unix_dgram_poll(struct file *file, struct socket *sock, 3228 poll_table *wait) 3229 { 3230 struct sock *sk = sock->sk, *other; 3231 unsigned int writable; 3232 unsigned char state; 3233 __poll_t mask; 3234 u8 shutdown; 3235 3236 sock_poll_wait(file, sock, wait); 3237 mask = 0; 3238 shutdown = READ_ONCE(sk->sk_shutdown); 3239 state = READ_ONCE(sk->sk_state); 3240 3241 /* exceptional events? */ 3242 if (READ_ONCE(sk->sk_err) || 3243 !skb_queue_empty_lockless(&sk->sk_error_queue)) 3244 mask |= EPOLLERR | 3245 (sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0); 3246 3247 if (shutdown & RCV_SHUTDOWN) 3248 mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM; 3249 if (shutdown == SHUTDOWN_MASK) 3250 mask |= EPOLLHUP; 3251 3252 /* readable? */ 3253 if (!skb_queue_empty_lockless(&sk->sk_receive_queue)) 3254 mask |= EPOLLIN | EPOLLRDNORM; 3255 if (sk_is_readable(sk)) 3256 mask |= EPOLLIN | EPOLLRDNORM; 3257 3258 /* Connection-based need to check for termination and startup */ 3259 if (sk->sk_type == SOCK_SEQPACKET && state == TCP_CLOSE) 3260 mask |= EPOLLHUP; 3261 3262 /* No write status requested, avoid expensive OUT tests. 
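
   Editor's note (illustrative only): the OUT tests below matter to a
   connected SOCK_DGRAM sender, because writability here depends on the
   peer's receive queue (see unix_dgram_peer_wake_me() above).  A hedged
   userspace sketch of relying on that - "sock" is assumed to be a
   connected, non-blocking AF_UNIX datagram socket:

	#include <errno.h>
	#include <poll.h>
	#include <sys/socket.h>

	// Retry a datagram send, sleeping in poll(POLLOUT) while the
	// peer's receive queue is full instead of spinning on EAGAIN.
	static ssize_t send_when_writable(int sock, const void *buf, size_t len)
	{
		struct pollfd pfd = { .fd = sock, .events = POLLOUT };
		ssize_t ret;

		while ((ret = send(sock, buf, len, MSG_DONTWAIT)) < 0 &&
		       errno == EAGAIN)
			poll(&pfd, 1, -1);

		return ret;
	}
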
*/ 3263 if (!(poll_requested_events(wait) & (EPOLLWRBAND|EPOLLWRNORM|EPOLLOUT))) 3264 return mask; 3265 3266 writable = unix_writable(sk, state); 3267 if (writable) { 3268 unix_state_lock(sk); 3269 3270 other = unix_peer(sk); 3271 if (other && unix_peer(other) != sk && 3272 unix_recvq_full_lockless(other) && 3273 unix_dgram_peer_wake_me(sk, other)) 3274 writable = 0; 3275 3276 unix_state_unlock(sk); 3277 } 3278 3279 if (writable) 3280 mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND; 3281 else 3282 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); 3283 3284 return mask; 3285 } 3286 3287 #ifdef CONFIG_PROC_FS 3288 3289 #define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1) 3290 3291 #define get_bucket(x) ((x) >> BUCKET_SPACE) 3292 #define get_offset(x) ((x) & ((1UL << BUCKET_SPACE) - 1)) 3293 #define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o)) 3294 3295 static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos) 3296 { 3297 unsigned long offset = get_offset(*pos); 3298 unsigned long bucket = get_bucket(*pos); 3299 unsigned long count = 0; 3300 struct sock *sk; 3301 3302 for (sk = sk_head(&seq_file_net(seq)->unx.table.buckets[bucket]); 3303 sk; sk = sk_next(sk)) { 3304 if (++count == offset) 3305 break; 3306 } 3307 3308 return sk; 3309 } 3310 3311 static struct sock *unix_get_first(struct seq_file *seq, loff_t *pos) 3312 { 3313 unsigned long bucket = get_bucket(*pos); 3314 struct net *net = seq_file_net(seq); 3315 struct sock *sk; 3316 3317 while (bucket < UNIX_HASH_SIZE) { 3318 spin_lock(&net->unx.table.locks[bucket]); 3319 3320 sk = unix_from_bucket(seq, pos); 3321 if (sk) 3322 return sk; 3323 3324 spin_unlock(&net->unx.table.locks[bucket]); 3325 3326 *pos = set_bucket_offset(++bucket, 1); 3327 } 3328 3329 return NULL; 3330 } 3331 3332 static struct sock *unix_get_next(struct seq_file *seq, struct sock *sk, 3333 loff_t *pos) 3334 { 3335 unsigned long bucket = get_bucket(*pos); 3336 3337 sk = sk_next(sk); 3338 if (sk) 3339 return sk; 3340 3341 3342 spin_unlock(&seq_file_net(seq)->unx.table.locks[bucket]); 3343 3344 *pos = set_bucket_offset(++bucket, 1); 3345 3346 return unix_get_first(seq, pos); 3347 } 3348 3349 static void *unix_seq_start(struct seq_file *seq, loff_t *pos) 3350 { 3351 if (!*pos) 3352 return SEQ_START_TOKEN; 3353 3354 return unix_get_first(seq, pos); 3355 } 3356 3357 static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos) 3358 { 3359 ++*pos; 3360 3361 if (v == SEQ_START_TOKEN) 3362 return unix_get_first(seq, pos); 3363 3364 return unix_get_next(seq, v, pos); 3365 } 3366 3367 static void unix_seq_stop(struct seq_file *seq, void *v) 3368 { 3369 struct sock *sk = v; 3370 3371 if (sk) 3372 spin_unlock(&seq_file_net(seq)->unx.table.locks[sk->sk_hash]); 3373 } 3374 3375 static int unix_seq_show(struct seq_file *seq, void *v) 3376 { 3377 3378 if (v == SEQ_START_TOKEN) 3379 seq_puts(seq, "Num RefCount Protocol Flags Type St " 3380 "Inode Path\n"); 3381 else { 3382 struct sock *s = v; 3383 struct unix_sock *u = unix_sk(s); 3384 unix_state_lock(s); 3385 3386 seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu", 3387 s, 3388 refcount_read(&s->sk_refcnt), 3389 0, 3390 s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0, 3391 s->sk_type, 3392 s->sk_socket ? 3393 (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) : 3394 (s->sk_state == TCP_ESTABLISHED ? 
SS_CONNECTING : SS_DISCONNECTING), 3395 sock_i_ino(s)); 3396 3397 if (u->addr) { // under a hash table lock here 3398 int i, len; 3399 seq_putc(seq, ' '); 3400 3401 i = 0; 3402 len = u->addr->len - 3403 offsetof(struct sockaddr_un, sun_path); 3404 if (u->addr->name->sun_path[0]) { 3405 len--; 3406 } else { 3407 seq_putc(seq, '@'); 3408 i++; 3409 } 3410 for ( ; i < len; i++) 3411 seq_putc(seq, u->addr->name->sun_path[i] ?: 3412 '@'); 3413 } 3414 unix_state_unlock(s); 3415 seq_putc(seq, '\n'); 3416 } 3417 3418 return 0; 3419 } 3420 3421 static const struct seq_operations unix_seq_ops = { 3422 .start = unix_seq_start, 3423 .next = unix_seq_next, 3424 .stop = unix_seq_stop, 3425 .show = unix_seq_show, 3426 }; 3427 3428 #ifdef CONFIG_BPF_SYSCALL 3429 struct bpf_unix_iter_state { 3430 struct seq_net_private p; 3431 unsigned int cur_sk; 3432 unsigned int end_sk; 3433 unsigned int max_sk; 3434 struct sock **batch; 3435 bool st_bucket_done; 3436 }; 3437 3438 struct bpf_iter__unix { 3439 __bpf_md_ptr(struct bpf_iter_meta *, meta); 3440 __bpf_md_ptr(struct unix_sock *, unix_sk); 3441 uid_t uid __aligned(8); 3442 }; 3443 3444 static int unix_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta, 3445 struct unix_sock *unix_sk, uid_t uid) 3446 { 3447 struct bpf_iter__unix ctx; 3448 3449 meta->seq_num--; /* skip SEQ_START_TOKEN */ 3450 ctx.meta = meta; 3451 ctx.unix_sk = unix_sk; 3452 ctx.uid = uid; 3453 return bpf_iter_run_prog(prog, &ctx); 3454 } 3455 3456 static int bpf_iter_unix_hold_batch(struct seq_file *seq, struct sock *start_sk) 3457 3458 { 3459 struct bpf_unix_iter_state *iter = seq->private; 3460 unsigned int expected = 1; 3461 struct sock *sk; 3462 3463 sock_hold(start_sk); 3464 iter->batch[iter->end_sk++] = start_sk; 3465 3466 for (sk = sk_next(start_sk); sk; sk = sk_next(sk)) { 3467 if (iter->end_sk < iter->max_sk) { 3468 sock_hold(sk); 3469 iter->batch[iter->end_sk++] = sk; 3470 } 3471 3472 expected++; 3473 } 3474 3475 spin_unlock(&seq_file_net(seq)->unx.table.locks[start_sk->sk_hash]); 3476 3477 return expected; 3478 } 3479 3480 static void bpf_iter_unix_put_batch(struct bpf_unix_iter_state *iter) 3481 { 3482 while (iter->cur_sk < iter->end_sk) 3483 sock_put(iter->batch[iter->cur_sk++]); 3484 } 3485 3486 static int bpf_iter_unix_realloc_batch(struct bpf_unix_iter_state *iter, 3487 unsigned int new_batch_sz) 3488 { 3489 struct sock **new_batch; 3490 3491 new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz, 3492 GFP_USER | __GFP_NOWARN); 3493 if (!new_batch) 3494 return -ENOMEM; 3495 3496 bpf_iter_unix_put_batch(iter); 3497 kvfree(iter->batch); 3498 iter->batch = new_batch; 3499 iter->max_sk = new_batch_sz; 3500 3501 return 0; 3502 } 3503 3504 static struct sock *bpf_iter_unix_batch(struct seq_file *seq, 3505 loff_t *pos) 3506 { 3507 struct bpf_unix_iter_state *iter = seq->private; 3508 unsigned int expected; 3509 bool resized = false; 3510 struct sock *sk; 3511 3512 if (iter->st_bucket_done) 3513 *pos = set_bucket_offset(get_bucket(*pos) + 1, 1); 3514 3515 again: 3516 /* Get a new batch */ 3517 iter->cur_sk = 0; 3518 iter->end_sk = 0; 3519 3520 sk = unix_get_first(seq, pos); 3521 if (!sk) 3522 return NULL; /* Done */ 3523 3524 expected = bpf_iter_unix_hold_batch(seq, sk); 3525 3526 if (iter->end_sk == expected) { 3527 iter->st_bucket_done = true; 3528 return sk; 3529 } 3530 3531 if (!resized && !bpf_iter_unix_realloc_batch(iter, expected * 3 / 2)) { 3532 resized = true; 3533 goto again; 3534 } 3535 3536 return sk; 3537 } 3538 3539 static void *bpf_iter_unix_seq_start(struct 
seq_file *seq, loff_t *pos) 3540 { 3541 if (!*pos) 3542 return SEQ_START_TOKEN; 3543 3544 /* bpf iter does not support lseek, so it always 3545 * continue from where it was stop()-ped. 3546 */ 3547 return bpf_iter_unix_batch(seq, pos); 3548 } 3549 3550 static void *bpf_iter_unix_seq_next(struct seq_file *seq, void *v, loff_t *pos) 3551 { 3552 struct bpf_unix_iter_state *iter = seq->private; 3553 struct sock *sk; 3554 3555 /* Whenever seq_next() is called, the iter->cur_sk is 3556 * done with seq_show(), so advance to the next sk in 3557 * the batch. 3558 */ 3559 if (iter->cur_sk < iter->end_sk) 3560 sock_put(iter->batch[iter->cur_sk++]); 3561 3562 ++*pos; 3563 3564 if (iter->cur_sk < iter->end_sk) 3565 sk = iter->batch[iter->cur_sk]; 3566 else 3567 sk = bpf_iter_unix_batch(seq, pos); 3568 3569 return sk; 3570 } 3571 3572 static int bpf_iter_unix_seq_show(struct seq_file *seq, void *v) 3573 { 3574 struct bpf_iter_meta meta; 3575 struct bpf_prog *prog; 3576 struct sock *sk = v; 3577 uid_t uid; 3578 bool slow; 3579 int ret; 3580 3581 if (v == SEQ_START_TOKEN) 3582 return 0; 3583 3584 slow = lock_sock_fast(sk); 3585 3586 if (unlikely(sk_unhashed(sk))) { 3587 ret = SEQ_SKIP; 3588 goto unlock; 3589 } 3590 3591 uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk)); 3592 meta.seq = seq; 3593 prog = bpf_iter_get_info(&meta, false); 3594 ret = unix_prog_seq_show(prog, &meta, v, uid); 3595 unlock: 3596 unlock_sock_fast(sk, slow); 3597 return ret; 3598 } 3599 3600 static void bpf_iter_unix_seq_stop(struct seq_file *seq, void *v) 3601 { 3602 struct bpf_unix_iter_state *iter = seq->private; 3603 struct bpf_iter_meta meta; 3604 struct bpf_prog *prog; 3605 3606 if (!v) { 3607 meta.seq = seq; 3608 prog = bpf_iter_get_info(&meta, true); 3609 if (prog) 3610 (void)unix_prog_seq_show(prog, &meta, v, 0); 3611 } 3612 3613 if (iter->cur_sk < iter->end_sk) 3614 bpf_iter_unix_put_batch(iter); 3615 } 3616 3617 static const struct seq_operations bpf_iter_unix_seq_ops = { 3618 .start = bpf_iter_unix_seq_start, 3619 .next = bpf_iter_unix_seq_next, 3620 .stop = bpf_iter_unix_seq_stop, 3621 .show = bpf_iter_unix_seq_show, 3622 }; 3623 #endif 3624 #endif 3625 3626 static const struct net_proto_family unix_family_ops = { 3627 .family = PF_UNIX, 3628 .create = unix_create, 3629 .owner = THIS_MODULE, 3630 }; 3631 3632 3633 static int __net_init unix_net_init(struct net *net) 3634 { 3635 int i; 3636 3637 net->unx.sysctl_max_dgram_qlen = 10; 3638 if (unix_sysctl_register(net)) 3639 goto out; 3640 3641 #ifdef CONFIG_PROC_FS 3642 if (!proc_create_net("unix", 0, net->proc_net, &unix_seq_ops, 3643 sizeof(struct seq_net_private))) 3644 goto err_sysctl; 3645 #endif 3646 3647 net->unx.table.locks = kvmalloc_array(UNIX_HASH_SIZE, 3648 sizeof(spinlock_t), GFP_KERNEL); 3649 if (!net->unx.table.locks) 3650 goto err_proc; 3651 3652 net->unx.table.buckets = kvmalloc_array(UNIX_HASH_SIZE, 3653 sizeof(struct hlist_head), 3654 GFP_KERNEL); 3655 if (!net->unx.table.buckets) 3656 goto free_locks; 3657 3658 for (i = 0; i < UNIX_HASH_SIZE; i++) { 3659 spin_lock_init(&net->unx.table.locks[i]); 3660 lock_set_cmp_fn(&net->unx.table.locks[i], unix_table_lock_cmp_fn, NULL); 3661 INIT_HLIST_HEAD(&net->unx.table.buckets[i]); 3662 } 3663 3664 return 0; 3665 3666 free_locks: 3667 kvfree(net->unx.table.locks); 3668 err_proc: 3669 #ifdef CONFIG_PROC_FS 3670 remove_proc_entry("unix", net->proc_net); 3671 err_sysctl: 3672 #endif 3673 unix_sysctl_unregister(net); 3674 out: 3675 return -ENOMEM; 3676 } 3677 3678 static void __net_exit unix_net_exit(struct net 
*net) 3679 { 3680 kvfree(net->unx.table.buckets); 3681 kvfree(net->unx.table.locks); 3682 unix_sysctl_unregister(net); 3683 remove_proc_entry("unix", net->proc_net); 3684 } 3685 3686 static struct pernet_operations unix_net_ops = { 3687 .init = unix_net_init, 3688 .exit = unix_net_exit, 3689 }; 3690 3691 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) 3692 DEFINE_BPF_ITER_FUNC(unix, struct bpf_iter_meta *meta, 3693 struct unix_sock *unix_sk, uid_t uid) 3694 3695 #define INIT_BATCH_SZ 16 3696 3697 static int bpf_iter_init_unix(void *priv_data, struct bpf_iter_aux_info *aux) 3698 { 3699 struct bpf_unix_iter_state *iter = priv_data; 3700 int err; 3701 3702 err = bpf_iter_init_seq_net(priv_data, aux); 3703 if (err) 3704 return err; 3705 3706 err = bpf_iter_unix_realloc_batch(iter, INIT_BATCH_SZ); 3707 if (err) { 3708 bpf_iter_fini_seq_net(priv_data); 3709 return err; 3710 } 3711 3712 return 0; 3713 } 3714 3715 static void bpf_iter_fini_unix(void *priv_data) 3716 { 3717 struct bpf_unix_iter_state *iter = priv_data; 3718 3719 bpf_iter_fini_seq_net(priv_data); 3720 kvfree(iter->batch); 3721 } 3722 3723 static const struct bpf_iter_seq_info unix_seq_info = { 3724 .seq_ops = &bpf_iter_unix_seq_ops, 3725 .init_seq_private = bpf_iter_init_unix, 3726 .fini_seq_private = bpf_iter_fini_unix, 3727 .seq_priv_size = sizeof(struct bpf_unix_iter_state), 3728 }; 3729 3730 static const struct bpf_func_proto * 3731 bpf_iter_unix_get_func_proto(enum bpf_func_id func_id, 3732 const struct bpf_prog *prog) 3733 { 3734 switch (func_id) { 3735 case BPF_FUNC_setsockopt: 3736 return &bpf_sk_setsockopt_proto; 3737 case BPF_FUNC_getsockopt: 3738 return &bpf_sk_getsockopt_proto; 3739 default: 3740 return NULL; 3741 } 3742 } 3743 3744 static struct bpf_iter_reg unix_reg_info = { 3745 .target = "unix", 3746 .ctx_arg_info_size = 1, 3747 .ctx_arg_info = { 3748 { offsetof(struct bpf_iter__unix, unix_sk), 3749 PTR_TO_BTF_ID_OR_NULL }, 3750 }, 3751 .get_func_proto = bpf_iter_unix_get_func_proto, 3752 .seq_info = &unix_seq_info, 3753 }; 3754 3755 static void __init bpf_iter_register(void) 3756 { 3757 unix_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_UNIX]; 3758 if (bpf_iter_reg_target(&unix_reg_info)) 3759 pr_warn("Warning: could not register bpf iterator unix\n"); 3760 } 3761 #endif 3762 3763 static int __init af_unix_init(void) 3764 { 3765 int i, rc = -1; 3766 3767 BUILD_BUG_ON(sizeof(struct unix_skb_parms) > sizeof_field(struct sk_buff, cb)); 3768 3769 for (i = 0; i < UNIX_HASH_SIZE / 2; i++) { 3770 spin_lock_init(&bsd_socket_locks[i]); 3771 INIT_HLIST_HEAD(&bsd_socket_buckets[i]); 3772 } 3773 3774 rc = proto_register(&unix_dgram_proto, 1); 3775 if (rc != 0) { 3776 pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__); 3777 goto out; 3778 } 3779 3780 rc = proto_register(&unix_stream_proto, 1); 3781 if (rc != 0) { 3782 pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__); 3783 proto_unregister(&unix_dgram_proto); 3784 goto out; 3785 } 3786 3787 sock_register(&unix_family_ops); 3788 register_pernet_subsys(&unix_net_ops); 3789 unix_bpf_build_proto(); 3790 3791 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) 3792 bpf_iter_register(); 3793 #endif 3794 3795 out: 3796 return rc; 3797 } 3798 3799 /* Later than subsys_initcall() because we depend on stuff initialised there */ 3800 fs_initcall(af_unix_init); 3801
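
/*
 * Editor's note: a hedged sketch (not part of this file) of a BPF program
 * that targets the "unix" iterator registered by bpf_iter_register() above.
 * The section name follows DEFINE_BPF_ITER_FUNC(unix, ...) and the context
 * layout follows struct bpf_iter__unix; the program name and output format
 * are arbitrary examples, and the snippet is kept under #if 0 so it is
 * never built as part of this translation unit.
 */
#if 0 /* illustrative only */
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

char _license[] SEC("license") = "GPL";

SEC("iter/unix")
int dump_unix(struct bpf_iter__unix *ctx)
{
	struct seq_file *seq = ctx->meta->seq;
	struct unix_sock *unix_sk = ctx->unix_sk;
	const char fmt[] = "unix socket owned by uid %u\n";
	__u64 uid = ctx->uid;

	/* A NULL socket signals the end of the iteration, as in seq_show(). */
	if (!unix_sk)
		return 0;

	bpf_seq_printf(seq, fmt, sizeof(fmt), &uid, sizeof(uid));
	return 0;
}
#endif
/*
 * Once loaded, such a program is attached as a BPF iterator link and read
 * like a file, yielding one line per AF_UNIX socket, much as unix_seq_show()
 * above produces /proc/net/unix.
 */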