1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* 3 * NET4: Implementation of BSD Unix domain sockets. 4 * 5 * Authors: Alan Cox, <alan@lxorguk.ukuu.org.uk> 6 * 7 * Fixes: 8 * Linus Torvalds : Assorted bug cures. 9 * Niibe Yutaka : async I/O support. 10 * Carsten Paeth : PF_UNIX check, address fixes. 11 * Alan Cox : Limit size of allocated blocks. 12 * Alan Cox : Fixed the stupid socketpair bug. 13 * Alan Cox : BSD compatibility fine tuning. 14 * Alan Cox : Fixed a bug in connect when interrupted. 15 * Alan Cox : Sorted out a proper draft version of 16 * file descriptor passing hacked up from 17 * Mike Shaver's work. 18 * Marty Leisner : Fixes to fd passing 19 * Nick Nevin : recvmsg bugfix. 20 * Alan Cox : Started proper garbage collector 21 * Heiko EiBfeldt : Missing verify_area check 22 * Alan Cox : Started POSIXisms 23 * Andreas Schwab : Replace inode by dentry for proper 24 * reference counting 25 * Kirk Petersen : Made this a module 26 * Christoph Rohland : Elegant non-blocking accept/connect algorithm. 27 * Lots of bug fixes. 28 * Alexey Kuznetosv : Repaired (I hope) bugs introduces 29 * by above two patches. 30 * Andrea Arcangeli : If possible we block in connect(2) 31 * if the max backlog of the listen socket 32 * is been reached. This won't break 33 * old apps and it will avoid huge amount 34 * of socks hashed (this for unix_gc() 35 * performances reasons). 36 * Security fix that limits the max 37 * number of socks to 2*max_files and 38 * the number of skb queueable in the 39 * dgram receiver. 40 * Artur Skawina : Hash function optimizations 41 * Alexey Kuznetsov : Full scale SMP. Lot of bugs are introduced 8) 42 * Malcolm Beattie : Set peercred for socketpair 43 * Michal Ostrowski : Module initialization cleanup. 44 * Arnaldo C. Melo : Remove MOD_{INC,DEC}_USE_COUNT, 45 * the core infrastructure is doing that 46 * for all net proto families now (2.5.69+) 47 * 48 * Known differences from reference BSD that was tested: 49 * 50 * [TO FIX] 51 * ECONNREFUSED is not returned from one end of a connected() socket to the 52 * other the moment one end closes. 53 * fstat() doesn't return st_dev=0, and give the blksize as high water mark 54 * and a fake inode identifier (nor the BSD first socket fstat twice bug). 55 * [NOT TO FIX] 56 * accept() returns a path name even if the connecting socket has closed 57 * in the meantime (BSD loses the path and gives up). 58 * accept() returns 0 length path for an unbound connector. BSD returns 16 59 * and a null first byte in the path (but not for gethost/peername - BSD bug ??) 60 * socketpair(...SOCK_RAW..) doesn't panic the kernel. 61 * BSD af_unix apparently has connect forgetting to block properly. 62 * (need to check this with the POSIX spec in detail) 63 * 64 * Differences from 2.0.0-11-... (ANK) 65 * Bug fixes and improvements. 66 * - client shutdown killed server socket. 67 * - removed all useless cli/sti pairs. 68 * 69 * Semantic changes/extensions. 70 * - generic control message passing. 71 * - SCM_CREDENTIALS control message. 72 * - "Abstract" (not FS based) socket bindings. 73 * Abstract names are sequences of bytes (not zero terminated) 74 * started by 0, so that this name space does not intersect 75 * with BSD names. 
76 */ 77 78 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 79 80 #include <linux/module.h> 81 #include <linux/kernel.h> 82 #include <linux/signal.h> 83 #include <linux/sched/signal.h> 84 #include <linux/errno.h> 85 #include <linux/string.h> 86 #include <linux/stat.h> 87 #include <linux/dcache.h> 88 #include <linux/namei.h> 89 #include <linux/socket.h> 90 #include <linux/un.h> 91 #include <linux/fcntl.h> 92 #include <linux/filter.h> 93 #include <linux/termios.h> 94 #include <linux/sockios.h> 95 #include <linux/net.h> 96 #include <linux/in.h> 97 #include <linux/fs.h> 98 #include <linux/slab.h> 99 #include <linux/uaccess.h> 100 #include <linux/skbuff.h> 101 #include <linux/netdevice.h> 102 #include <net/net_namespace.h> 103 #include <net/sock.h> 104 #include <net/tcp_states.h> 105 #include <net/af_unix.h> 106 #include <linux/proc_fs.h> 107 #include <linux/seq_file.h> 108 #include <net/scm.h> 109 #include <linux/init.h> 110 #include <linux/poll.h> 111 #include <linux/rtnetlink.h> 112 #include <linux/mount.h> 113 #include <net/checksum.h> 114 #include <linux/security.h> 115 #include <linux/splice.h> 116 #include <linux/freezer.h> 117 #include <linux/file.h> 118 #include <linux/btf_ids.h> 119 #include <linux/bpf-cgroup.h> 120 121 static atomic_long_t unix_nr_socks; 122 static struct hlist_head bsd_socket_buckets[UNIX_HASH_SIZE / 2]; 123 static spinlock_t bsd_socket_locks[UNIX_HASH_SIZE / 2]; 124 125 /* SMP locking strategy: 126 * hash table is protected with spinlock. 127 * each socket state is protected by separate spinlock. 128 */ 129 #ifdef CONFIG_PROVE_LOCKING 130 #define cmp_ptr(l, r) (((l) > (r)) - ((l) < (r))) 131 132 static int unix_table_lock_cmp_fn(const struct lockdep_map *a, 133 const struct lockdep_map *b) 134 { 135 return cmp_ptr(a, b); 136 } 137 138 static int unix_state_lock_cmp_fn(const struct lockdep_map *_a, 139 const struct lockdep_map *_b) 140 { 141 const struct unix_sock *a, *b; 142 143 a = container_of(_a, struct unix_sock, lock.dep_map); 144 b = container_of(_b, struct unix_sock, lock.dep_map); 145 146 if (a->sk.sk_state == TCP_LISTEN) { 147 /* unix_stream_connect(): Before the 2nd unix_state_lock(), 148 * 149 * 1. a is TCP_LISTEN. 150 * 2. b is not a. 151 * 3. concurrent connect(b -> a) must fail. 152 * 153 * Except for 2. & 3., the b's state can be any possible 154 * value due to concurrent connect() or listen(). 155 * 156 * 2. is detected in debug_spin_lock_before(), and 3. cannot 157 * be expressed as lock_cmp_fn. 158 */ 159 switch (b->sk.sk_state) { 160 case TCP_CLOSE: 161 case TCP_ESTABLISHED: 162 case TCP_LISTEN: 163 return -1; 164 default: 165 /* Invalid case. */ 166 return 0; 167 } 168 } 169 170 /* Should never happen. Just to be symmetric. */ 171 if (b->sk.sk_state == TCP_LISTEN) { 172 switch (b->sk.sk_state) { 173 case TCP_CLOSE: 174 case TCP_ESTABLISHED: 175 return 1; 176 default: 177 return 0; 178 } 179 } 180 181 /* unix_state_double_lock(): ascending address order. */ 182 return cmp_ptr(a, b); 183 } 184 185 static int unix_recvq_lock_cmp_fn(const struct lockdep_map *_a, 186 const struct lockdep_map *_b) 187 { 188 const struct sock *a, *b; 189 190 a = container_of(_a, struct sock, sk_receive_queue.lock.dep_map); 191 b = container_of(_b, struct sock, sk_receive_queue.lock.dep_map); 192 193 /* unix_collect_skb(): listener -> embryo order. */ 194 if (a->sk_state == TCP_LISTEN && unix_sk(b)->listener == a) 195 return -1; 196 197 /* Should never happen. Just to be symmetric. 
*/ 198 if (b->sk_state == TCP_LISTEN && unix_sk(a)->listener == b) 199 return 1; 200 201 return 0; 202 } 203 #endif 204 205 static unsigned int unix_unbound_hash(struct sock *sk) 206 { 207 unsigned long hash = (unsigned long)sk; 208 209 hash ^= hash >> 16; 210 hash ^= hash >> 8; 211 hash ^= sk->sk_type; 212 213 return hash & UNIX_HASH_MOD; 214 } 215 216 static unsigned int unix_bsd_hash(struct inode *i) 217 { 218 return i->i_ino & UNIX_HASH_MOD; 219 } 220 221 static unsigned int unix_abstract_hash(struct sockaddr_un *sunaddr, 222 int addr_len, int type) 223 { 224 __wsum csum = csum_partial(sunaddr, addr_len, 0); 225 unsigned int hash; 226 227 hash = (__force unsigned int)csum_fold(csum); 228 hash ^= hash >> 8; 229 hash ^= type; 230 231 return UNIX_HASH_MOD + 1 + (hash & UNIX_HASH_MOD); 232 } 233 234 static void unix_table_double_lock(struct net *net, 235 unsigned int hash1, unsigned int hash2) 236 { 237 if (hash1 == hash2) { 238 spin_lock(&net->unx.table.locks[hash1]); 239 return; 240 } 241 242 if (hash1 > hash2) 243 swap(hash1, hash2); 244 245 spin_lock(&net->unx.table.locks[hash1]); 246 spin_lock(&net->unx.table.locks[hash2]); 247 } 248 249 static void unix_table_double_unlock(struct net *net, 250 unsigned int hash1, unsigned int hash2) 251 { 252 if (hash1 == hash2) { 253 spin_unlock(&net->unx.table.locks[hash1]); 254 return; 255 } 256 257 spin_unlock(&net->unx.table.locks[hash1]); 258 spin_unlock(&net->unx.table.locks[hash2]); 259 } 260 261 #ifdef CONFIG_SECURITY_NETWORK 262 static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb) 263 { 264 UNIXCB(skb).secid = scm->secid; 265 } 266 267 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb) 268 { 269 scm->secid = UNIXCB(skb).secid; 270 } 271 272 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb) 273 { 274 return (scm->secid == UNIXCB(skb).secid); 275 } 276 #else 277 static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb) 278 { } 279 280 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb) 281 { } 282 283 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb) 284 { 285 return true; 286 } 287 #endif /* CONFIG_SECURITY_NETWORK */ 288 289 static inline int unix_our_peer(struct sock *sk, struct sock *osk) 290 { 291 return unix_peer(osk) == sk; 292 } 293 294 static inline int unix_may_send(struct sock *sk, struct sock *osk) 295 { 296 return unix_peer(osk) == NULL || unix_our_peer(sk, osk); 297 } 298 299 static inline int unix_recvq_full_lockless(const struct sock *sk) 300 { 301 return skb_queue_len_lockless(&sk->sk_receive_queue) > sk->sk_max_ack_backlog; 302 } 303 304 struct sock *unix_peer_get(struct sock *s) 305 { 306 struct sock *peer; 307 308 unix_state_lock(s); 309 peer = unix_peer(s); 310 if (peer) 311 sock_hold(peer); 312 unix_state_unlock(s); 313 return peer; 314 } 315 EXPORT_SYMBOL_GPL(unix_peer_get); 316 317 static struct unix_address *unix_create_addr(struct sockaddr_un *sunaddr, 318 int addr_len) 319 { 320 struct unix_address *addr; 321 322 addr = kmalloc(sizeof(*addr) + addr_len, GFP_KERNEL); 323 if (!addr) 324 return NULL; 325 326 refcount_set(&addr->refcnt, 1); 327 addr->len = addr_len; 328 memcpy(addr->name, sunaddr, addr_len); 329 330 return addr; 331 } 332 333 static inline void unix_release_addr(struct unix_address *addr) 334 { 335 if (refcount_dec_and_test(&addr->refcnt)) 336 kfree(addr); 337 } 338 339 /* 340 * Check unix socket name: 341 * - should be not zero length. 
342 * - if started by not zero, should be NULL terminated (FS object) 343 * - if started by zero, it is abstract name. 344 */ 345 346 static int unix_validate_addr(struct sockaddr_un *sunaddr, int addr_len) 347 { 348 if (addr_len <= offsetof(struct sockaddr_un, sun_path) || 349 addr_len > sizeof(*sunaddr)) 350 return -EINVAL; 351 352 if (sunaddr->sun_family != AF_UNIX) 353 return -EINVAL; 354 355 return 0; 356 } 357 358 static int unix_mkname_bsd(struct sockaddr_un *sunaddr, int addr_len) 359 { 360 struct sockaddr_storage *addr = (struct sockaddr_storage *)sunaddr; 361 short offset = offsetof(struct sockaddr_storage, __data); 362 363 BUILD_BUG_ON(offset != offsetof(struct sockaddr_un, sun_path)); 364 365 /* This may look like an off by one error but it is a bit more 366 * subtle. 108 is the longest valid AF_UNIX path for a binding. 367 * sun_path[108] doesn't as such exist. However in kernel space 368 * we are guaranteed that it is a valid memory location in our 369 * kernel address buffer because syscall functions always pass 370 * a pointer of struct sockaddr_storage which has a bigger buffer 371 * than 108. Also, we must terminate sun_path for strlen() in 372 * getname_kernel(). 373 */ 374 addr->__data[addr_len - offset] = 0; 375 376 /* Don't pass sunaddr->sun_path to strlen(). Otherwise, 108 will 377 * cause panic if CONFIG_FORTIFY_SOURCE=y. Let __fortify_strlen() 378 * know the actual buffer. 379 */ 380 return strlen(addr->__data) + offset + 1; 381 } 382 383 static void __unix_remove_socket(struct sock *sk) 384 { 385 sk_del_node_init(sk); 386 } 387 388 static void __unix_insert_socket(struct net *net, struct sock *sk) 389 { 390 DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk)); 391 sk_add_node(sk, &net->unx.table.buckets[sk->sk_hash]); 392 } 393 394 static void __unix_set_addr_hash(struct net *net, struct sock *sk, 395 struct unix_address *addr, unsigned int hash) 396 { 397 __unix_remove_socket(sk); 398 smp_store_release(&unix_sk(sk)->addr, addr); 399 400 sk->sk_hash = hash; 401 __unix_insert_socket(net, sk); 402 } 403 404 static void unix_remove_socket(struct net *net, struct sock *sk) 405 { 406 spin_lock(&net->unx.table.locks[sk->sk_hash]); 407 __unix_remove_socket(sk); 408 spin_unlock(&net->unx.table.locks[sk->sk_hash]); 409 } 410 411 static void unix_insert_unbound_socket(struct net *net, struct sock *sk) 412 { 413 spin_lock(&net->unx.table.locks[sk->sk_hash]); 414 __unix_insert_socket(net, sk); 415 spin_unlock(&net->unx.table.locks[sk->sk_hash]); 416 } 417 418 static void unix_insert_bsd_socket(struct sock *sk) 419 { 420 spin_lock(&bsd_socket_locks[sk->sk_hash]); 421 sk_add_bind_node(sk, &bsd_socket_buckets[sk->sk_hash]); 422 spin_unlock(&bsd_socket_locks[sk->sk_hash]); 423 } 424 425 static void unix_remove_bsd_socket(struct sock *sk) 426 { 427 if (!hlist_unhashed(&sk->sk_bind_node)) { 428 spin_lock(&bsd_socket_locks[sk->sk_hash]); 429 __sk_del_bind_node(sk); 430 spin_unlock(&bsd_socket_locks[sk->sk_hash]); 431 432 sk_node_init(&sk->sk_bind_node); 433 } 434 } 435 436 static struct sock *__unix_find_socket_byname(struct net *net, 437 struct sockaddr_un *sunname, 438 int len, unsigned int hash) 439 { 440 struct sock *s; 441 442 sk_for_each(s, &net->unx.table.buckets[hash]) { 443 struct unix_sock *u = unix_sk(s); 444 445 if (u->addr->len == len && 446 !memcmp(u->addr->name, sunname, len)) 447 return s; 448 } 449 return NULL; 450 } 451 452 static inline struct sock *unix_find_socket_byname(struct net *net, 453 struct sockaddr_un *sunname, 454 int len, unsigned int hash) 455 { 456 struct sock 
*s; 457 458 spin_lock(&net->unx.table.locks[hash]); 459 s = __unix_find_socket_byname(net, sunname, len, hash); 460 if (s) 461 sock_hold(s); 462 spin_unlock(&net->unx.table.locks[hash]); 463 return s; 464 } 465 466 static struct sock *unix_find_socket_byinode(struct inode *i) 467 { 468 unsigned int hash = unix_bsd_hash(i); 469 struct sock *s; 470 471 spin_lock(&bsd_socket_locks[hash]); 472 sk_for_each_bound(s, &bsd_socket_buckets[hash]) { 473 struct dentry *dentry = unix_sk(s)->path.dentry; 474 475 if (dentry && d_backing_inode(dentry) == i) { 476 sock_hold(s); 477 spin_unlock(&bsd_socket_locks[hash]); 478 return s; 479 } 480 } 481 spin_unlock(&bsd_socket_locks[hash]); 482 return NULL; 483 } 484 485 /* Support code for asymmetrically connected dgram sockets 486 * 487 * If a datagram socket is connected to a socket not itself connected 488 * to the first socket (eg, /dev/log), clients may only enqueue more 489 * messages if the present receive queue of the server socket is not 490 * "too large". This means there's a second writeability condition 491 * poll and sendmsg need to test. The dgram recv code will do a wake 492 * up on the peer_wait wait queue of a socket upon reception of a 493 * datagram which needs to be propagated to sleeping would-be writers 494 * since these might not have sent anything so far. This can't be 495 * accomplished via poll_wait because the lifetime of the server 496 * socket might be less than that of its clients if these break their 497 * association with it or if the server socket is closed while clients 498 * are still connected to it and there's no way to inform "a polling 499 * implementation" that it should let go of a certain wait queue 500 * 501 * In order to propagate a wake up, a wait_queue_entry_t of the client 502 * socket is enqueued on the peer_wait queue of the server socket 503 * whose wake function does a wake_up on the ordinary client socket 504 * wait queue. This connection is established whenever a write (or 505 * poll for write) hit the flow control condition and broken when the 506 * association to the server socket is dissolved or after a wake up 507 * was relayed. 
508 */ 509 510 static int unix_dgram_peer_wake_relay(wait_queue_entry_t *q, unsigned mode, int flags, 511 void *key) 512 { 513 struct unix_sock *u; 514 wait_queue_head_t *u_sleep; 515 516 u = container_of(q, struct unix_sock, peer_wake); 517 518 __remove_wait_queue(&unix_sk(u->peer_wake.private)->peer_wait, 519 q); 520 u->peer_wake.private = NULL; 521 522 /* relaying can only happen while the wq still exists */ 523 u_sleep = sk_sleep(&u->sk); 524 if (u_sleep) 525 wake_up_interruptible_poll(u_sleep, key_to_poll(key)); 526 527 return 0; 528 } 529 530 static int unix_dgram_peer_wake_connect(struct sock *sk, struct sock *other) 531 { 532 struct unix_sock *u, *u_other; 533 int rc; 534 535 u = unix_sk(sk); 536 u_other = unix_sk(other); 537 rc = 0; 538 spin_lock(&u_other->peer_wait.lock); 539 540 if (!u->peer_wake.private) { 541 u->peer_wake.private = other; 542 __add_wait_queue(&u_other->peer_wait, &u->peer_wake); 543 544 rc = 1; 545 } 546 547 spin_unlock(&u_other->peer_wait.lock); 548 return rc; 549 } 550 551 static void unix_dgram_peer_wake_disconnect(struct sock *sk, 552 struct sock *other) 553 { 554 struct unix_sock *u, *u_other; 555 556 u = unix_sk(sk); 557 u_other = unix_sk(other); 558 spin_lock(&u_other->peer_wait.lock); 559 560 if (u->peer_wake.private == other) { 561 __remove_wait_queue(&u_other->peer_wait, &u->peer_wake); 562 u->peer_wake.private = NULL; 563 } 564 565 spin_unlock(&u_other->peer_wait.lock); 566 } 567 568 static void unix_dgram_peer_wake_disconnect_wakeup(struct sock *sk, 569 struct sock *other) 570 { 571 unix_dgram_peer_wake_disconnect(sk, other); 572 wake_up_interruptible_poll(sk_sleep(sk), 573 EPOLLOUT | 574 EPOLLWRNORM | 575 EPOLLWRBAND); 576 } 577 578 /* preconditions: 579 * - unix_peer(sk) == other 580 * - association is stable 581 */ 582 static int unix_dgram_peer_wake_me(struct sock *sk, struct sock *other) 583 { 584 int connected; 585 586 connected = unix_dgram_peer_wake_connect(sk, other); 587 588 /* If other is SOCK_DEAD, we want to make sure we signal 589 * POLLOUT, such that a subsequent write() can get a 590 * -ECONNREFUSED. Otherwise, if we haven't queued any skbs 591 * to other and its full, we will hang waiting for POLLOUT. 592 */ 593 if (unix_recvq_full_lockless(other) && !sock_flag(other, SOCK_DEAD)) 594 return 1; 595 596 if (connected) 597 unix_dgram_peer_wake_disconnect(sk, other); 598 599 return 0; 600 } 601 602 static int unix_writable(const struct sock *sk, unsigned char state) 603 { 604 return state != TCP_LISTEN && 605 (refcount_read(&sk->sk_wmem_alloc) << 2) <= READ_ONCE(sk->sk_sndbuf); 606 } 607 608 static void unix_write_space(struct sock *sk) 609 { 610 struct socket_wq *wq; 611 612 rcu_read_lock(); 613 if (unix_writable(sk, READ_ONCE(sk->sk_state))) { 614 wq = rcu_dereference(sk->sk_wq); 615 if (skwq_has_sleeper(wq)) 616 wake_up_interruptible_sync_poll(&wq->wait, 617 EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND); 618 sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT); 619 } 620 rcu_read_unlock(); 621 } 622 623 /* When dgram socket disconnects (or changes its peer), we clear its receive 624 * queue of packets arrived from previous peer. First, it allows to do 625 * flow control based only on wmem_alloc; second, sk connected to peer 626 * may receive messages only from that peer. 
*/ 627 static void unix_dgram_disconnected(struct sock *sk, struct sock *other) 628 { 629 if (!skb_queue_empty(&sk->sk_receive_queue)) { 630 skb_queue_purge(&sk->sk_receive_queue); 631 wake_up_interruptible_all(&unix_sk(sk)->peer_wait); 632 633 /* If one link of bidirectional dgram pipe is disconnected, 634 * we signal error. Messages are lost. Do not make this, 635 * when peer was not connected to us. 636 */ 637 if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) { 638 WRITE_ONCE(other->sk_err, ECONNRESET); 639 sk_error_report(other); 640 } 641 } 642 } 643 644 static void unix_sock_destructor(struct sock *sk) 645 { 646 struct unix_sock *u = unix_sk(sk); 647 648 skb_queue_purge(&sk->sk_receive_queue); 649 650 DEBUG_NET_WARN_ON_ONCE(refcount_read(&sk->sk_wmem_alloc)); 651 DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk)); 652 DEBUG_NET_WARN_ON_ONCE(sk->sk_socket); 653 if (!sock_flag(sk, SOCK_DEAD)) { 654 pr_info("Attempt to release alive unix socket: %p\n", sk); 655 return; 656 } 657 658 if (u->addr) 659 unix_release_addr(u->addr); 660 661 atomic_long_dec(&unix_nr_socks); 662 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); 663 #ifdef UNIX_REFCNT_DEBUG 664 pr_debug("UNIX %p is destroyed, %ld are still alive.\n", sk, 665 atomic_long_read(&unix_nr_socks)); 666 #endif 667 } 668 669 static void unix_release_sock(struct sock *sk, int embrion) 670 { 671 struct unix_sock *u = unix_sk(sk); 672 struct sock *skpair; 673 struct sk_buff *skb; 674 struct path path; 675 int state; 676 677 unix_remove_socket(sock_net(sk), sk); 678 unix_remove_bsd_socket(sk); 679 680 /* Clear state */ 681 unix_state_lock(sk); 682 sock_orphan(sk); 683 WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK); 684 path = u->path; 685 u->path.dentry = NULL; 686 u->path.mnt = NULL; 687 state = sk->sk_state; 688 WRITE_ONCE(sk->sk_state, TCP_CLOSE); 689 690 skpair = unix_peer(sk); 691 unix_peer(sk) = NULL; 692 693 unix_state_unlock(sk); 694 695 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 696 if (u->oob_skb) { 697 kfree_skb(u->oob_skb); 698 u->oob_skb = NULL; 699 } 700 #endif 701 702 wake_up_interruptible_all(&u->peer_wait); 703 704 if (skpair != NULL) { 705 if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) { 706 unix_state_lock(skpair); 707 /* No more writes */ 708 WRITE_ONCE(skpair->sk_shutdown, SHUTDOWN_MASK); 709 if (!skb_queue_empty_lockless(&sk->sk_receive_queue) || embrion) 710 WRITE_ONCE(skpair->sk_err, ECONNRESET); 711 unix_state_unlock(skpair); 712 skpair->sk_state_change(skpair); 713 sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP); 714 } 715 716 unix_dgram_peer_wake_disconnect(sk, skpair); 717 sock_put(skpair); /* It may now die */ 718 } 719 720 /* Try to flush out this socket. Throw out buffers at least */ 721 722 while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) { 723 if (state == TCP_LISTEN) 724 unix_release_sock(skb->sk, 1); 725 726 /* passed fds are erased in the kfree_skb hook */ 727 kfree_skb(skb); 728 } 729 730 if (path.dentry) 731 path_put(&path); 732 733 sock_put(sk); 734 735 /* ---- Socket is dead now and most probably destroyed ---- */ 736 737 /* 738 * Fixme: BSD difference: In BSD all sockets connected to us get 739 * ECONNRESET and we die on the spot. In Linux we behave 740 * like files and pipes do and wait for the last 741 * dereference. 742 * 743 * Can't we simply set sock->err? 744 * 745 * What the above comment does talk about? 
--ANK(980817) 746 */ 747 748 if (READ_ONCE(unix_tot_inflight)) 749 unix_gc(); /* Garbage collect fds */ 750 } 751 752 static void init_peercred(struct sock *sk) 753 { 754 sk->sk_peer_pid = get_pid(task_tgid(current)); 755 sk->sk_peer_cred = get_current_cred(); 756 } 757 758 static void update_peercred(struct sock *sk) 759 { 760 const struct cred *old_cred; 761 struct pid *old_pid; 762 763 spin_lock(&sk->sk_peer_lock); 764 old_pid = sk->sk_peer_pid; 765 old_cred = sk->sk_peer_cred; 766 init_peercred(sk); 767 spin_unlock(&sk->sk_peer_lock); 768 769 put_pid(old_pid); 770 put_cred(old_cred); 771 } 772 773 static void copy_peercred(struct sock *sk, struct sock *peersk) 774 { 775 lockdep_assert_held(&unix_sk(peersk)->lock); 776 777 spin_lock(&sk->sk_peer_lock); 778 sk->sk_peer_pid = get_pid(peersk->sk_peer_pid); 779 sk->sk_peer_cred = get_cred(peersk->sk_peer_cred); 780 spin_unlock(&sk->sk_peer_lock); 781 } 782 783 static int unix_listen(struct socket *sock, int backlog) 784 { 785 int err; 786 struct sock *sk = sock->sk; 787 struct unix_sock *u = unix_sk(sk); 788 789 err = -EOPNOTSUPP; 790 if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET) 791 goto out; /* Only stream/seqpacket sockets accept */ 792 err = -EINVAL; 793 if (!READ_ONCE(u->addr)) 794 goto out; /* No listens on an unbound socket */ 795 unix_state_lock(sk); 796 if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN) 797 goto out_unlock; 798 if (backlog > sk->sk_max_ack_backlog) 799 wake_up_interruptible_all(&u->peer_wait); 800 sk->sk_max_ack_backlog = backlog; 801 WRITE_ONCE(sk->sk_state, TCP_LISTEN); 802 803 /* set credentials so connect can copy them */ 804 update_peercred(sk); 805 err = 0; 806 807 out_unlock: 808 unix_state_unlock(sk); 809 out: 810 return err; 811 } 812 813 static int unix_release(struct socket *); 814 static int unix_bind(struct socket *, struct sockaddr *, int); 815 static int unix_stream_connect(struct socket *, struct sockaddr *, 816 int addr_len, int flags); 817 static int unix_socketpair(struct socket *, struct socket *); 818 static int unix_accept(struct socket *, struct socket *, struct proto_accept_arg *arg); 819 static int unix_getname(struct socket *, struct sockaddr *, int); 820 static __poll_t unix_poll(struct file *, struct socket *, poll_table *); 821 static __poll_t unix_dgram_poll(struct file *, struct socket *, 822 poll_table *); 823 static int unix_ioctl(struct socket *, unsigned int, unsigned long); 824 #ifdef CONFIG_COMPAT 825 static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg); 826 #endif 827 static int unix_shutdown(struct socket *, int); 828 static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t); 829 static int unix_stream_recvmsg(struct socket *, struct msghdr *, size_t, int); 830 static ssize_t unix_stream_splice_read(struct socket *, loff_t *ppos, 831 struct pipe_inode_info *, size_t size, 832 unsigned int flags); 833 static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t); 834 static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int); 835 static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor); 836 static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor); 837 static int unix_dgram_connect(struct socket *, struct sockaddr *, 838 int, int); 839 static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t); 840 static int unix_seqpacket_recvmsg(struct socket *, struct msghdr *, size_t, 841 int); 842 843 #ifdef CONFIG_PROC_FS 844 static 
int unix_count_nr_fds(struct sock *sk) 845 { 846 struct sk_buff *skb; 847 struct unix_sock *u; 848 int nr_fds = 0; 849 850 spin_lock(&sk->sk_receive_queue.lock); 851 skb = skb_peek(&sk->sk_receive_queue); 852 while (skb) { 853 u = unix_sk(skb->sk); 854 nr_fds += atomic_read(&u->scm_stat.nr_fds); 855 skb = skb_peek_next(skb, &sk->sk_receive_queue); 856 } 857 spin_unlock(&sk->sk_receive_queue.lock); 858 859 return nr_fds; 860 } 861 862 static void unix_show_fdinfo(struct seq_file *m, struct socket *sock) 863 { 864 struct sock *sk = sock->sk; 865 unsigned char s_state; 866 struct unix_sock *u; 867 int nr_fds = 0; 868 869 if (sk) { 870 s_state = READ_ONCE(sk->sk_state); 871 u = unix_sk(sk); 872 873 /* SOCK_STREAM and SOCK_SEQPACKET sockets never change their 874 * sk_state after switching to TCP_ESTABLISHED or TCP_LISTEN. 875 * SOCK_DGRAM is ordinary. So, no lock is needed. 876 */ 877 if (sock->type == SOCK_DGRAM || s_state == TCP_ESTABLISHED) 878 nr_fds = atomic_read(&u->scm_stat.nr_fds); 879 else if (s_state == TCP_LISTEN) 880 nr_fds = unix_count_nr_fds(sk); 881 882 seq_printf(m, "scm_fds: %u\n", nr_fds); 883 } 884 } 885 #else 886 #define unix_show_fdinfo NULL 887 #endif 888 889 static const struct proto_ops unix_stream_ops = { 890 .family = PF_UNIX, 891 .owner = THIS_MODULE, 892 .release = unix_release, 893 .bind = unix_bind, 894 .connect = unix_stream_connect, 895 .socketpair = unix_socketpair, 896 .accept = unix_accept, 897 .getname = unix_getname, 898 .poll = unix_poll, 899 .ioctl = unix_ioctl, 900 #ifdef CONFIG_COMPAT 901 .compat_ioctl = unix_compat_ioctl, 902 #endif 903 .listen = unix_listen, 904 .shutdown = unix_shutdown, 905 .sendmsg = unix_stream_sendmsg, 906 .recvmsg = unix_stream_recvmsg, 907 .read_skb = unix_stream_read_skb, 908 .mmap = sock_no_mmap, 909 .splice_read = unix_stream_splice_read, 910 .set_peek_off = sk_set_peek_off, 911 .show_fdinfo = unix_show_fdinfo, 912 }; 913 914 static const struct proto_ops unix_dgram_ops = { 915 .family = PF_UNIX, 916 .owner = THIS_MODULE, 917 .release = unix_release, 918 .bind = unix_bind, 919 .connect = unix_dgram_connect, 920 .socketpair = unix_socketpair, 921 .accept = sock_no_accept, 922 .getname = unix_getname, 923 .poll = unix_dgram_poll, 924 .ioctl = unix_ioctl, 925 #ifdef CONFIG_COMPAT 926 .compat_ioctl = unix_compat_ioctl, 927 #endif 928 .listen = sock_no_listen, 929 .shutdown = unix_shutdown, 930 .sendmsg = unix_dgram_sendmsg, 931 .read_skb = unix_read_skb, 932 .recvmsg = unix_dgram_recvmsg, 933 .mmap = sock_no_mmap, 934 .set_peek_off = sk_set_peek_off, 935 .show_fdinfo = unix_show_fdinfo, 936 }; 937 938 static const struct proto_ops unix_seqpacket_ops = { 939 .family = PF_UNIX, 940 .owner = THIS_MODULE, 941 .release = unix_release, 942 .bind = unix_bind, 943 .connect = unix_stream_connect, 944 .socketpair = unix_socketpair, 945 .accept = unix_accept, 946 .getname = unix_getname, 947 .poll = unix_dgram_poll, 948 .ioctl = unix_ioctl, 949 #ifdef CONFIG_COMPAT 950 .compat_ioctl = unix_compat_ioctl, 951 #endif 952 .listen = unix_listen, 953 .shutdown = unix_shutdown, 954 .sendmsg = unix_seqpacket_sendmsg, 955 .recvmsg = unix_seqpacket_recvmsg, 956 .mmap = sock_no_mmap, 957 .set_peek_off = sk_set_peek_off, 958 .show_fdinfo = unix_show_fdinfo, 959 }; 960 961 static void unix_close(struct sock *sk, long timeout) 962 { 963 /* Nothing to do here, unix socket does not need a ->close(). 964 * This is merely for sockmap. 
965 */ 966 } 967 968 static void unix_unhash(struct sock *sk) 969 { 970 /* Nothing to do here, unix socket does not need a ->unhash(). 971 * This is merely for sockmap. 972 */ 973 } 974 975 static bool unix_bpf_bypass_getsockopt(int level, int optname) 976 { 977 if (level == SOL_SOCKET) { 978 switch (optname) { 979 case SO_PEERPIDFD: 980 return true; 981 default: 982 return false; 983 } 984 } 985 986 return false; 987 } 988 989 struct proto unix_dgram_proto = { 990 .name = "UNIX", 991 .owner = THIS_MODULE, 992 .obj_size = sizeof(struct unix_sock), 993 .close = unix_close, 994 .bpf_bypass_getsockopt = unix_bpf_bypass_getsockopt, 995 #ifdef CONFIG_BPF_SYSCALL 996 .psock_update_sk_prot = unix_dgram_bpf_update_proto, 997 #endif 998 }; 999 1000 struct proto unix_stream_proto = { 1001 .name = "UNIX-STREAM", 1002 .owner = THIS_MODULE, 1003 .obj_size = sizeof(struct unix_sock), 1004 .close = unix_close, 1005 .unhash = unix_unhash, 1006 .bpf_bypass_getsockopt = unix_bpf_bypass_getsockopt, 1007 #ifdef CONFIG_BPF_SYSCALL 1008 .psock_update_sk_prot = unix_stream_bpf_update_proto, 1009 #endif 1010 }; 1011 1012 static struct sock *unix_create1(struct net *net, struct socket *sock, int kern, int type) 1013 { 1014 struct unix_sock *u; 1015 struct sock *sk; 1016 int err; 1017 1018 atomic_long_inc(&unix_nr_socks); 1019 if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files()) { 1020 err = -ENFILE; 1021 goto err; 1022 } 1023 1024 if (type == SOCK_STREAM) 1025 sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_stream_proto, kern); 1026 else /*dgram and seqpacket */ 1027 sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_dgram_proto, kern); 1028 1029 if (!sk) { 1030 err = -ENOMEM; 1031 goto err; 1032 } 1033 1034 sock_init_data(sock, sk); 1035 1036 sk->sk_hash = unix_unbound_hash(sk); 1037 sk->sk_allocation = GFP_KERNEL_ACCOUNT; 1038 sk->sk_write_space = unix_write_space; 1039 sk->sk_max_ack_backlog = READ_ONCE(net->unx.sysctl_max_dgram_qlen); 1040 sk->sk_destruct = unix_sock_destructor; 1041 lock_set_cmp_fn(&sk->sk_receive_queue.lock, unix_recvq_lock_cmp_fn, NULL); 1042 1043 u = unix_sk(sk); 1044 u->listener = NULL; 1045 u->vertex = NULL; 1046 u->path.dentry = NULL; 1047 u->path.mnt = NULL; 1048 spin_lock_init(&u->lock); 1049 lock_set_cmp_fn(&u->lock, unix_state_lock_cmp_fn, NULL); 1050 mutex_init(&u->iolock); /* single task reading lock */ 1051 mutex_init(&u->bindlock); /* single task binding lock */ 1052 init_waitqueue_head(&u->peer_wait); 1053 init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay); 1054 memset(&u->scm_stat, 0, sizeof(struct scm_stat)); 1055 unix_insert_unbound_socket(net, sk); 1056 1057 sock_prot_inuse_add(net, sk->sk_prot, 1); 1058 1059 return sk; 1060 1061 err: 1062 atomic_long_dec(&unix_nr_socks); 1063 return ERR_PTR(err); 1064 } 1065 1066 static int unix_create(struct net *net, struct socket *sock, int protocol, 1067 int kern) 1068 { 1069 struct sock *sk; 1070 1071 if (protocol && protocol != PF_UNIX) 1072 return -EPROTONOSUPPORT; 1073 1074 sock->state = SS_UNCONNECTED; 1075 1076 switch (sock->type) { 1077 case SOCK_STREAM: 1078 sock->ops = &unix_stream_ops; 1079 break; 1080 /* 1081 * Believe it or not BSD has AF_UNIX, SOCK_RAW though 1082 * nothing uses it. 
1083 */ 1084 case SOCK_RAW: 1085 sock->type = SOCK_DGRAM; 1086 fallthrough; 1087 case SOCK_DGRAM: 1088 sock->ops = &unix_dgram_ops; 1089 break; 1090 case SOCK_SEQPACKET: 1091 sock->ops = &unix_seqpacket_ops; 1092 break; 1093 default: 1094 return -ESOCKTNOSUPPORT; 1095 } 1096 1097 sk = unix_create1(net, sock, kern, sock->type); 1098 if (IS_ERR(sk)) 1099 return PTR_ERR(sk); 1100 1101 return 0; 1102 } 1103 1104 static int unix_release(struct socket *sock) 1105 { 1106 struct sock *sk = sock->sk; 1107 1108 if (!sk) 1109 return 0; 1110 1111 sk->sk_prot->close(sk, 0); 1112 unix_release_sock(sk, 0); 1113 sock->sk = NULL; 1114 1115 return 0; 1116 } 1117 1118 static struct sock *unix_find_bsd(struct sockaddr_un *sunaddr, int addr_len, 1119 int type) 1120 { 1121 struct inode *inode; 1122 struct path path; 1123 struct sock *sk; 1124 int err; 1125 1126 unix_mkname_bsd(sunaddr, addr_len); 1127 err = kern_path(sunaddr->sun_path, LOOKUP_FOLLOW, &path); 1128 if (err) 1129 goto fail; 1130 1131 err = path_permission(&path, MAY_WRITE); 1132 if (err) 1133 goto path_put; 1134 1135 err = -ECONNREFUSED; 1136 inode = d_backing_inode(path.dentry); 1137 if (!S_ISSOCK(inode->i_mode)) 1138 goto path_put; 1139 1140 sk = unix_find_socket_byinode(inode); 1141 if (!sk) 1142 goto path_put; 1143 1144 err = -EPROTOTYPE; 1145 if (sk->sk_type == type) 1146 touch_atime(&path); 1147 else 1148 goto sock_put; 1149 1150 path_put(&path); 1151 1152 return sk; 1153 1154 sock_put: 1155 sock_put(sk); 1156 path_put: 1157 path_put(&path); 1158 fail: 1159 return ERR_PTR(err); 1160 } 1161 1162 static struct sock *unix_find_abstract(struct net *net, 1163 struct sockaddr_un *sunaddr, 1164 int addr_len, int type) 1165 { 1166 unsigned int hash = unix_abstract_hash(sunaddr, addr_len, type); 1167 struct dentry *dentry; 1168 struct sock *sk; 1169 1170 sk = unix_find_socket_byname(net, sunaddr, addr_len, hash); 1171 if (!sk) 1172 return ERR_PTR(-ECONNREFUSED); 1173 1174 dentry = unix_sk(sk)->path.dentry; 1175 if (dentry) 1176 touch_atime(&unix_sk(sk)->path); 1177 1178 return sk; 1179 } 1180 1181 static struct sock *unix_find_other(struct net *net, 1182 struct sockaddr_un *sunaddr, 1183 int addr_len, int type) 1184 { 1185 struct sock *sk; 1186 1187 if (sunaddr->sun_path[0]) 1188 sk = unix_find_bsd(sunaddr, addr_len, type); 1189 else 1190 sk = unix_find_abstract(net, sunaddr, addr_len, type); 1191 1192 return sk; 1193 } 1194 1195 static int unix_autobind(struct sock *sk) 1196 { 1197 struct unix_sock *u = unix_sk(sk); 1198 unsigned int new_hash, old_hash; 1199 struct net *net = sock_net(sk); 1200 struct unix_address *addr; 1201 u32 lastnum, ordernum; 1202 int err; 1203 1204 err = mutex_lock_interruptible(&u->bindlock); 1205 if (err) 1206 return err; 1207 1208 if (u->addr) 1209 goto out; 1210 1211 err = -ENOMEM; 1212 addr = kzalloc(sizeof(*addr) + 1213 offsetof(struct sockaddr_un, sun_path) + 16, GFP_KERNEL); 1214 if (!addr) 1215 goto out; 1216 1217 addr->len = offsetof(struct sockaddr_un, sun_path) + 6; 1218 addr->name->sun_family = AF_UNIX; 1219 refcount_set(&addr->refcnt, 1); 1220 1221 old_hash = sk->sk_hash; 1222 ordernum = get_random_u32(); 1223 lastnum = ordernum & 0xFFFFF; 1224 retry: 1225 ordernum = (ordernum + 1) & 0xFFFFF; 1226 sprintf(addr->name->sun_path + 1, "%05x", ordernum); 1227 1228 new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type); 1229 unix_table_double_lock(net, old_hash, new_hash); 1230 1231 if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash)) { 1232 unix_table_double_unlock(net, old_hash, new_hash); 
1233 1234 /* __unix_find_socket_byname() may take long time if many names 1235 * are already in use. 1236 */ 1237 cond_resched(); 1238 1239 if (ordernum == lastnum) { 1240 /* Give up if all names seems to be in use. */ 1241 err = -ENOSPC; 1242 unix_release_addr(addr); 1243 goto out; 1244 } 1245 1246 goto retry; 1247 } 1248 1249 __unix_set_addr_hash(net, sk, addr, new_hash); 1250 unix_table_double_unlock(net, old_hash, new_hash); 1251 err = 0; 1252 1253 out: mutex_unlock(&u->bindlock); 1254 return err; 1255 } 1256 1257 static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr, 1258 int addr_len) 1259 { 1260 umode_t mode = S_IFSOCK | 1261 (SOCK_INODE(sk->sk_socket)->i_mode & ~current_umask()); 1262 struct unix_sock *u = unix_sk(sk); 1263 unsigned int new_hash, old_hash; 1264 struct net *net = sock_net(sk); 1265 struct mnt_idmap *idmap; 1266 struct unix_address *addr; 1267 struct dentry *dentry; 1268 struct path parent; 1269 int err; 1270 1271 addr_len = unix_mkname_bsd(sunaddr, addr_len); 1272 addr = unix_create_addr(sunaddr, addr_len); 1273 if (!addr) 1274 return -ENOMEM; 1275 1276 /* 1277 * Get the parent directory, calculate the hash for last 1278 * component. 1279 */ 1280 dentry = kern_path_create(AT_FDCWD, addr->name->sun_path, &parent, 0); 1281 if (IS_ERR(dentry)) { 1282 err = PTR_ERR(dentry); 1283 goto out; 1284 } 1285 1286 /* 1287 * All right, let's create it. 1288 */ 1289 idmap = mnt_idmap(parent.mnt); 1290 err = security_path_mknod(&parent, dentry, mode, 0); 1291 if (!err) 1292 err = vfs_mknod(idmap, d_inode(parent.dentry), dentry, mode, 0); 1293 if (err) 1294 goto out_path; 1295 err = mutex_lock_interruptible(&u->bindlock); 1296 if (err) 1297 goto out_unlink; 1298 if (u->addr) 1299 goto out_unlock; 1300 1301 old_hash = sk->sk_hash; 1302 new_hash = unix_bsd_hash(d_backing_inode(dentry)); 1303 unix_table_double_lock(net, old_hash, new_hash); 1304 u->path.mnt = mntget(parent.mnt); 1305 u->path.dentry = dget(dentry); 1306 __unix_set_addr_hash(net, sk, addr, new_hash); 1307 unix_table_double_unlock(net, old_hash, new_hash); 1308 unix_insert_bsd_socket(sk); 1309 mutex_unlock(&u->bindlock); 1310 done_path_create(&parent, dentry); 1311 return 0; 1312 1313 out_unlock: 1314 mutex_unlock(&u->bindlock); 1315 err = -EINVAL; 1316 out_unlink: 1317 /* failed after successful mknod? unlink what we'd created... */ 1318 vfs_unlink(idmap, d_inode(parent.dentry), dentry, NULL); 1319 out_path: 1320 done_path_create(&parent, dentry); 1321 out: 1322 unix_release_addr(addr); 1323 return err == -EEXIST ? 
-EADDRINUSE : err; 1324 } 1325 1326 static int unix_bind_abstract(struct sock *sk, struct sockaddr_un *sunaddr, 1327 int addr_len) 1328 { 1329 struct unix_sock *u = unix_sk(sk); 1330 unsigned int new_hash, old_hash; 1331 struct net *net = sock_net(sk); 1332 struct unix_address *addr; 1333 int err; 1334 1335 addr = unix_create_addr(sunaddr, addr_len); 1336 if (!addr) 1337 return -ENOMEM; 1338 1339 err = mutex_lock_interruptible(&u->bindlock); 1340 if (err) 1341 goto out; 1342 1343 if (u->addr) { 1344 err = -EINVAL; 1345 goto out_mutex; 1346 } 1347 1348 old_hash = sk->sk_hash; 1349 new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type); 1350 unix_table_double_lock(net, old_hash, new_hash); 1351 1352 if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash)) 1353 goto out_spin; 1354 1355 __unix_set_addr_hash(net, sk, addr, new_hash); 1356 unix_table_double_unlock(net, old_hash, new_hash); 1357 mutex_unlock(&u->bindlock); 1358 return 0; 1359 1360 out_spin: 1361 unix_table_double_unlock(net, old_hash, new_hash); 1362 err = -EADDRINUSE; 1363 out_mutex: 1364 mutex_unlock(&u->bindlock); 1365 out: 1366 unix_release_addr(addr); 1367 return err; 1368 } 1369 1370 static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) 1371 { 1372 struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr; 1373 struct sock *sk = sock->sk; 1374 int err; 1375 1376 if (addr_len == offsetof(struct sockaddr_un, sun_path) && 1377 sunaddr->sun_family == AF_UNIX) 1378 return unix_autobind(sk); 1379 1380 err = unix_validate_addr(sunaddr, addr_len); 1381 if (err) 1382 return err; 1383 1384 if (sunaddr->sun_path[0]) 1385 err = unix_bind_bsd(sk, sunaddr, addr_len); 1386 else 1387 err = unix_bind_abstract(sk, sunaddr, addr_len); 1388 1389 return err; 1390 } 1391 1392 static void unix_state_double_lock(struct sock *sk1, struct sock *sk2) 1393 { 1394 if (unlikely(sk1 == sk2) || !sk2) { 1395 unix_state_lock(sk1); 1396 return; 1397 } 1398 1399 if (sk1 > sk2) 1400 swap(sk1, sk2); 1401 1402 unix_state_lock(sk1); 1403 unix_state_lock(sk2); 1404 } 1405 1406 static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2) 1407 { 1408 if (unlikely(sk1 == sk2) || !sk2) { 1409 unix_state_unlock(sk1); 1410 return; 1411 } 1412 unix_state_unlock(sk1); 1413 unix_state_unlock(sk2); 1414 } 1415 1416 static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr, 1417 int alen, int flags) 1418 { 1419 struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr; 1420 struct sock *sk = sock->sk; 1421 struct sock *other; 1422 int err; 1423 1424 err = -EINVAL; 1425 if (alen < offsetofend(struct sockaddr, sa_family)) 1426 goto out; 1427 1428 if (addr->sa_family != AF_UNSPEC) { 1429 err = unix_validate_addr(sunaddr, alen); 1430 if (err) 1431 goto out; 1432 1433 err = BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk, addr, &alen); 1434 if (err) 1435 goto out; 1436 1437 if ((test_bit(SOCK_PASSCRED, &sock->flags) || 1438 test_bit(SOCK_PASSPIDFD, &sock->flags)) && 1439 !READ_ONCE(unix_sk(sk)->addr)) { 1440 err = unix_autobind(sk); 1441 if (err) 1442 goto out; 1443 } 1444 1445 restart: 1446 other = unix_find_other(sock_net(sk), sunaddr, alen, sock->type); 1447 if (IS_ERR(other)) { 1448 err = PTR_ERR(other); 1449 goto out; 1450 } 1451 1452 unix_state_double_lock(sk, other); 1453 1454 /* Apparently VFS overslept socket death. Retry. 
*/ 1455 if (sock_flag(other, SOCK_DEAD)) { 1456 unix_state_double_unlock(sk, other); 1457 sock_put(other); 1458 goto restart; 1459 } 1460 1461 err = -EPERM; 1462 if (!unix_may_send(sk, other)) 1463 goto out_unlock; 1464 1465 err = security_unix_may_send(sk->sk_socket, other->sk_socket); 1466 if (err) 1467 goto out_unlock; 1468 1469 WRITE_ONCE(sk->sk_state, TCP_ESTABLISHED); 1470 WRITE_ONCE(other->sk_state, TCP_ESTABLISHED); 1471 } else { 1472 /* 1473 * 1003.1g breaking connected state with AF_UNSPEC 1474 */ 1475 other = NULL; 1476 unix_state_double_lock(sk, other); 1477 } 1478 1479 /* 1480 * If it was connected, reconnect. 1481 */ 1482 if (unix_peer(sk)) { 1483 struct sock *old_peer = unix_peer(sk); 1484 1485 unix_peer(sk) = other; 1486 if (!other) 1487 WRITE_ONCE(sk->sk_state, TCP_CLOSE); 1488 unix_dgram_peer_wake_disconnect_wakeup(sk, old_peer); 1489 1490 unix_state_double_unlock(sk, other); 1491 1492 if (other != old_peer) { 1493 unix_dgram_disconnected(sk, old_peer); 1494 1495 unix_state_lock(old_peer); 1496 if (!unix_peer(old_peer)) 1497 WRITE_ONCE(old_peer->sk_state, TCP_CLOSE); 1498 unix_state_unlock(old_peer); 1499 } 1500 1501 sock_put(old_peer); 1502 } else { 1503 unix_peer(sk) = other; 1504 unix_state_double_unlock(sk, other); 1505 } 1506 1507 return 0; 1508 1509 out_unlock: 1510 unix_state_double_unlock(sk, other); 1511 sock_put(other); 1512 out: 1513 return err; 1514 } 1515 1516 static long unix_wait_for_peer(struct sock *other, long timeo) 1517 __releases(&unix_sk(other)->lock) 1518 { 1519 struct unix_sock *u = unix_sk(other); 1520 int sched; 1521 DEFINE_WAIT(wait); 1522 1523 prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE); 1524 1525 sched = !sock_flag(other, SOCK_DEAD) && 1526 !(other->sk_shutdown & RCV_SHUTDOWN) && 1527 unix_recvq_full_lockless(other); 1528 1529 unix_state_unlock(other); 1530 1531 if (sched) 1532 timeo = schedule_timeout(timeo); 1533 1534 finish_wait(&u->peer_wait, &wait); 1535 return timeo; 1536 } 1537 1538 static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr, 1539 int addr_len, int flags) 1540 { 1541 struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr; 1542 struct sock *sk = sock->sk, *newsk = NULL, *other = NULL; 1543 struct unix_sock *u = unix_sk(sk), *newu, *otheru; 1544 struct net *net = sock_net(sk); 1545 struct sk_buff *skb = NULL; 1546 unsigned char state; 1547 long timeo; 1548 int err; 1549 1550 err = unix_validate_addr(sunaddr, addr_len); 1551 if (err) 1552 goto out; 1553 1554 err = BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk, uaddr, &addr_len); 1555 if (err) 1556 goto out; 1557 1558 if ((test_bit(SOCK_PASSCRED, &sock->flags) || 1559 test_bit(SOCK_PASSPIDFD, &sock->flags)) && 1560 !READ_ONCE(u->addr)) { 1561 err = unix_autobind(sk); 1562 if (err) 1563 goto out; 1564 } 1565 1566 timeo = sock_sndtimeo(sk, flags & O_NONBLOCK); 1567 1568 /* First of all allocate resources. 1569 If we will make it after state is locked, 1570 we will have to recheck all again in any case. 1571 */ 1572 1573 /* create new sock for complete connection */ 1574 newsk = unix_create1(net, NULL, 0, sock->type); 1575 if (IS_ERR(newsk)) { 1576 err = PTR_ERR(newsk); 1577 newsk = NULL; 1578 goto out; 1579 } 1580 1581 err = -ENOMEM; 1582 1583 /* Allocate skb for sending to listening sock */ 1584 skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL); 1585 if (skb == NULL) 1586 goto out; 1587 1588 restart: 1589 /* Find listening sock. 
*/ 1590 other = unix_find_other(net, sunaddr, addr_len, sk->sk_type); 1591 if (IS_ERR(other)) { 1592 err = PTR_ERR(other); 1593 other = NULL; 1594 goto out; 1595 } 1596 1597 unix_state_lock(other); 1598 1599 /* Apparently VFS overslept socket death. Retry. */ 1600 if (sock_flag(other, SOCK_DEAD)) { 1601 unix_state_unlock(other); 1602 sock_put(other); 1603 goto restart; 1604 } 1605 1606 err = -ECONNREFUSED; 1607 if (other->sk_state != TCP_LISTEN) 1608 goto out_unlock; 1609 if (other->sk_shutdown & RCV_SHUTDOWN) 1610 goto out_unlock; 1611 1612 if (unix_recvq_full_lockless(other)) { 1613 err = -EAGAIN; 1614 if (!timeo) 1615 goto out_unlock; 1616 1617 timeo = unix_wait_for_peer(other, timeo); 1618 1619 err = sock_intr_errno(timeo); 1620 if (signal_pending(current)) 1621 goto out; 1622 sock_put(other); 1623 goto restart; 1624 } 1625 1626 /* self connect and simultaneous connect are eliminated 1627 * by rejecting TCP_LISTEN socket to avoid deadlock. 1628 */ 1629 state = READ_ONCE(sk->sk_state); 1630 if (unlikely(state != TCP_CLOSE)) { 1631 err = state == TCP_ESTABLISHED ? -EISCONN : -EINVAL; 1632 goto out_unlock; 1633 } 1634 1635 unix_state_lock(sk); 1636 1637 if (unlikely(sk->sk_state != TCP_CLOSE)) { 1638 err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EINVAL; 1639 unix_state_unlock(sk); 1640 goto out_unlock; 1641 } 1642 1643 err = security_unix_stream_connect(sk, other, newsk); 1644 if (err) { 1645 unix_state_unlock(sk); 1646 goto out_unlock; 1647 } 1648 1649 /* The way is open! Fastly set all the necessary fields... */ 1650 1651 sock_hold(sk); 1652 unix_peer(newsk) = sk; 1653 newsk->sk_state = TCP_ESTABLISHED; 1654 newsk->sk_type = sk->sk_type; 1655 init_peercred(newsk); 1656 newu = unix_sk(newsk); 1657 newu->listener = other; 1658 RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq); 1659 otheru = unix_sk(other); 1660 1661 /* copy address information from listening to new sock 1662 * 1663 * The contents of *(otheru->addr) and otheru->path 1664 * are seen fully set up here, since we have found 1665 * otheru in hash under its lock. Insertion into the 1666 * hash chain we'd found it in had been done in an 1667 * earlier critical area protected by the chain's lock, 1668 * the same one where we'd set *(otheru->addr) contents, 1669 * as well as otheru->path and otheru->addr itself. 1670 * 1671 * Using smp_store_release() here to set newu->addr 1672 * is enough to make those stores, as well as stores 1673 * to newu->path visible to anyone who gets newu->addr 1674 * by smp_load_acquire(). IOW, the same warranties 1675 * as for unix_sock instances bound in unix_bind() or 1676 * in unix_autobind(). 
1677 */ 1678 if (otheru->path.dentry) { 1679 path_get(&otheru->path); 1680 newu->path = otheru->path; 1681 } 1682 refcount_inc(&otheru->addr->refcnt); 1683 smp_store_release(&newu->addr, otheru->addr); 1684 1685 /* Set credentials */ 1686 copy_peercred(sk, other); 1687 1688 sock->state = SS_CONNECTED; 1689 WRITE_ONCE(sk->sk_state, TCP_ESTABLISHED); 1690 sock_hold(newsk); 1691 1692 smp_mb__after_atomic(); /* sock_hold() does an atomic_inc() */ 1693 unix_peer(sk) = newsk; 1694 1695 unix_state_unlock(sk); 1696 1697 /* take ten and send info to listening sock */ 1698 spin_lock(&other->sk_receive_queue.lock); 1699 __skb_queue_tail(&other->sk_receive_queue, skb); 1700 spin_unlock(&other->sk_receive_queue.lock); 1701 unix_state_unlock(other); 1702 other->sk_data_ready(other); 1703 sock_put(other); 1704 return 0; 1705 1706 out_unlock: 1707 if (other) 1708 unix_state_unlock(other); 1709 1710 out: 1711 kfree_skb(skb); 1712 if (newsk) 1713 unix_release_sock(newsk, 0); 1714 if (other) 1715 sock_put(other); 1716 return err; 1717 } 1718 1719 static int unix_socketpair(struct socket *socka, struct socket *sockb) 1720 { 1721 struct sock *ska = socka->sk, *skb = sockb->sk; 1722 1723 /* Join our sockets back to back */ 1724 sock_hold(ska); 1725 sock_hold(skb); 1726 unix_peer(ska) = skb; 1727 unix_peer(skb) = ska; 1728 init_peercred(ska); 1729 init_peercred(skb); 1730 1731 ska->sk_state = TCP_ESTABLISHED; 1732 skb->sk_state = TCP_ESTABLISHED; 1733 socka->state = SS_CONNECTED; 1734 sockb->state = SS_CONNECTED; 1735 return 0; 1736 } 1737 1738 static void unix_sock_inherit_flags(const struct socket *old, 1739 struct socket *new) 1740 { 1741 if (test_bit(SOCK_PASSCRED, &old->flags)) 1742 set_bit(SOCK_PASSCRED, &new->flags); 1743 if (test_bit(SOCK_PASSPIDFD, &old->flags)) 1744 set_bit(SOCK_PASSPIDFD, &new->flags); 1745 if (test_bit(SOCK_PASSSEC, &old->flags)) 1746 set_bit(SOCK_PASSSEC, &new->flags); 1747 } 1748 1749 static int unix_accept(struct socket *sock, struct socket *newsock, 1750 struct proto_accept_arg *arg) 1751 { 1752 struct sock *sk = sock->sk; 1753 struct sk_buff *skb; 1754 struct sock *tsk; 1755 1756 arg->err = -EOPNOTSUPP; 1757 if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET) 1758 goto out; 1759 1760 arg->err = -EINVAL; 1761 if (READ_ONCE(sk->sk_state) != TCP_LISTEN) 1762 goto out; 1763 1764 /* If socket state is TCP_LISTEN it cannot change (for now...), 1765 * so that no locks are necessary. 1766 */ 1767 1768 skb = skb_recv_datagram(sk, (arg->flags & O_NONBLOCK) ? MSG_DONTWAIT : 0, 1769 &arg->err); 1770 if (!skb) { 1771 /* This means receive shutdown. 
*/ 1772 if (arg->err == 0) 1773 arg->err = -EINVAL; 1774 goto out; 1775 } 1776 1777 tsk = skb->sk; 1778 skb_free_datagram(sk, skb); 1779 wake_up_interruptible(&unix_sk(sk)->peer_wait); 1780 1781 /* attach accepted sock to socket */ 1782 unix_state_lock(tsk); 1783 unix_update_edges(unix_sk(tsk)); 1784 newsock->state = SS_CONNECTED; 1785 unix_sock_inherit_flags(sock, newsock); 1786 sock_graft(tsk, newsock); 1787 unix_state_unlock(tsk); 1788 return 0; 1789 1790 out: 1791 return arg->err; 1792 } 1793 1794 1795 static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int peer) 1796 { 1797 struct sock *sk = sock->sk; 1798 struct unix_address *addr; 1799 DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr); 1800 int err = 0; 1801 1802 if (peer) { 1803 sk = unix_peer_get(sk); 1804 1805 err = -ENOTCONN; 1806 if (!sk) 1807 goto out; 1808 err = 0; 1809 } else { 1810 sock_hold(sk); 1811 } 1812 1813 addr = smp_load_acquire(&unix_sk(sk)->addr); 1814 if (!addr) { 1815 sunaddr->sun_family = AF_UNIX; 1816 sunaddr->sun_path[0] = 0; 1817 err = offsetof(struct sockaddr_un, sun_path); 1818 } else { 1819 err = addr->len; 1820 memcpy(sunaddr, addr->name, addr->len); 1821 1822 if (peer) 1823 BPF_CGROUP_RUN_SA_PROG(sk, uaddr, &err, 1824 CGROUP_UNIX_GETPEERNAME); 1825 else 1826 BPF_CGROUP_RUN_SA_PROG(sk, uaddr, &err, 1827 CGROUP_UNIX_GETSOCKNAME); 1828 } 1829 sock_put(sk); 1830 out: 1831 return err; 1832 } 1833 1834 /* The "user->unix_inflight" variable is protected by the garbage 1835 * collection lock, and we just read it locklessly here. If you go 1836 * over the limit, there might be a tiny race in actually noticing 1837 * it across threads. Tough. 1838 */ 1839 static inline bool too_many_unix_fds(struct task_struct *p) 1840 { 1841 struct user_struct *user = current_user(); 1842 1843 if (unlikely(READ_ONCE(user->unix_inflight) > task_rlimit(p, RLIMIT_NOFILE))) 1844 return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN); 1845 return false; 1846 } 1847 1848 static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb) 1849 { 1850 if (too_many_unix_fds(current)) 1851 return -ETOOMANYREFS; 1852 1853 UNIXCB(skb).fp = scm->fp; 1854 scm->fp = NULL; 1855 1856 if (unix_prepare_fpl(UNIXCB(skb).fp)) 1857 return -ENOMEM; 1858 1859 return 0; 1860 } 1861 1862 static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb) 1863 { 1864 scm->fp = UNIXCB(skb).fp; 1865 UNIXCB(skb).fp = NULL; 1866 1867 unix_destroy_fpl(scm->fp); 1868 } 1869 1870 static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb) 1871 { 1872 scm->fp = scm_fp_dup(UNIXCB(skb).fp); 1873 } 1874 1875 static void unix_destruct_scm(struct sk_buff *skb) 1876 { 1877 struct scm_cookie scm; 1878 1879 memset(&scm, 0, sizeof(scm)); 1880 scm.pid = UNIXCB(skb).pid; 1881 if (UNIXCB(skb).fp) 1882 unix_detach_fds(&scm, skb); 1883 1884 /* Alas, it calls VFS */ 1885 /* So fscking what? 
fput() had been SMP-safe since the last Summer */ 1886 scm_destroy(&scm); 1887 sock_wfree(skb); 1888 } 1889 1890 static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds) 1891 { 1892 int err = 0; 1893 1894 UNIXCB(skb).pid = get_pid(scm->pid); 1895 UNIXCB(skb).uid = scm->creds.uid; 1896 UNIXCB(skb).gid = scm->creds.gid; 1897 UNIXCB(skb).fp = NULL; 1898 unix_get_secdata(scm, skb); 1899 if (scm->fp && send_fds) 1900 err = unix_attach_fds(scm, skb); 1901 1902 skb->destructor = unix_destruct_scm; 1903 return err; 1904 } 1905 1906 static bool unix_passcred_enabled(const struct socket *sock, 1907 const struct sock *other) 1908 { 1909 return test_bit(SOCK_PASSCRED, &sock->flags) || 1910 test_bit(SOCK_PASSPIDFD, &sock->flags) || 1911 !other->sk_socket || 1912 test_bit(SOCK_PASSCRED, &other->sk_socket->flags) || 1913 test_bit(SOCK_PASSPIDFD, &other->sk_socket->flags); 1914 } 1915 1916 /* 1917 * Some apps rely on write() giving SCM_CREDENTIALS 1918 * We include credentials if source or destination socket 1919 * asserted SOCK_PASSCRED. 1920 */ 1921 static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock, 1922 const struct sock *other) 1923 { 1924 if (UNIXCB(skb).pid) 1925 return; 1926 if (unix_passcred_enabled(sock, other)) { 1927 UNIXCB(skb).pid = get_pid(task_tgid(current)); 1928 current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid); 1929 } 1930 } 1931 1932 static bool unix_skb_scm_eq(struct sk_buff *skb, 1933 struct scm_cookie *scm) 1934 { 1935 return UNIXCB(skb).pid == scm->pid && 1936 uid_eq(UNIXCB(skb).uid, scm->creds.uid) && 1937 gid_eq(UNIXCB(skb).gid, scm->creds.gid) && 1938 unix_secdata_eq(scm, skb); 1939 } 1940 1941 static void scm_stat_add(struct sock *sk, struct sk_buff *skb) 1942 { 1943 struct scm_fp_list *fp = UNIXCB(skb).fp; 1944 struct unix_sock *u = unix_sk(sk); 1945 1946 if (unlikely(fp && fp->count)) { 1947 atomic_add(fp->count, &u->scm_stat.nr_fds); 1948 unix_add_edges(fp, u); 1949 } 1950 } 1951 1952 static void scm_stat_del(struct sock *sk, struct sk_buff *skb) 1953 { 1954 struct scm_fp_list *fp = UNIXCB(skb).fp; 1955 struct unix_sock *u = unix_sk(sk); 1956 1957 if (unlikely(fp && fp->count)) { 1958 atomic_sub(fp->count, &u->scm_stat.nr_fds); 1959 unix_del_edges(fp); 1960 } 1961 } 1962 1963 /* 1964 * Send AF_UNIX data. 
1965 */ 1966 1967 static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg, 1968 size_t len) 1969 { 1970 DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name); 1971 struct sock *sk = sock->sk, *other = NULL; 1972 struct unix_sock *u = unix_sk(sk); 1973 struct scm_cookie scm; 1974 struct sk_buff *skb; 1975 int data_len = 0; 1976 int sk_locked; 1977 long timeo; 1978 int err; 1979 1980 err = scm_send(sock, msg, &scm, false); 1981 if (err < 0) 1982 return err; 1983 1984 wait_for_unix_gc(scm.fp); 1985 1986 err = -EOPNOTSUPP; 1987 if (msg->msg_flags&MSG_OOB) 1988 goto out; 1989 1990 if (msg->msg_namelen) { 1991 err = unix_validate_addr(sunaddr, msg->msg_namelen); 1992 if (err) 1993 goto out; 1994 1995 err = BPF_CGROUP_RUN_PROG_UNIX_SENDMSG_LOCK(sk, 1996 msg->msg_name, 1997 &msg->msg_namelen, 1998 NULL); 1999 if (err) 2000 goto out; 2001 } else { 2002 sunaddr = NULL; 2003 err = -ENOTCONN; 2004 other = unix_peer_get(sk); 2005 if (!other) 2006 goto out; 2007 } 2008 2009 if ((test_bit(SOCK_PASSCRED, &sock->flags) || 2010 test_bit(SOCK_PASSPIDFD, &sock->flags)) && 2011 !READ_ONCE(u->addr)) { 2012 err = unix_autobind(sk); 2013 if (err) 2014 goto out; 2015 } 2016 2017 err = -EMSGSIZE; 2018 if (len > READ_ONCE(sk->sk_sndbuf) - 32) 2019 goto out; 2020 2021 if (len > SKB_MAX_ALLOC) { 2022 data_len = min_t(size_t, 2023 len - SKB_MAX_ALLOC, 2024 MAX_SKB_FRAGS * PAGE_SIZE); 2025 data_len = PAGE_ALIGN(data_len); 2026 2027 BUILD_BUG_ON(SKB_MAX_ALLOC < PAGE_SIZE); 2028 } 2029 2030 skb = sock_alloc_send_pskb(sk, len - data_len, data_len, 2031 msg->msg_flags & MSG_DONTWAIT, &err, 2032 PAGE_ALLOC_COSTLY_ORDER); 2033 if (skb == NULL) 2034 goto out; 2035 2036 err = unix_scm_to_skb(&scm, skb, true); 2037 if (err < 0) 2038 goto out_free; 2039 2040 skb_put(skb, len - data_len); 2041 skb->data_len = data_len; 2042 skb->len = len; 2043 err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, len); 2044 if (err) 2045 goto out_free; 2046 2047 timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); 2048 2049 restart: 2050 if (!other) { 2051 err = -ECONNRESET; 2052 if (sunaddr == NULL) 2053 goto out_free; 2054 2055 other = unix_find_other(sock_net(sk), sunaddr, msg->msg_namelen, 2056 sk->sk_type); 2057 if (IS_ERR(other)) { 2058 err = PTR_ERR(other); 2059 other = NULL; 2060 goto out_free; 2061 } 2062 } 2063 2064 if (sk_filter(other, skb) < 0) { 2065 /* Toss the packet but do not return any error to the sender */ 2066 err = len; 2067 goto out_free; 2068 } 2069 2070 sk_locked = 0; 2071 unix_state_lock(other); 2072 restart_locked: 2073 err = -EPERM; 2074 if (!unix_may_send(sk, other)) 2075 goto out_unlock; 2076 2077 if (unlikely(sock_flag(other, SOCK_DEAD))) { 2078 /* 2079 * Check with 1003.1g - what should 2080 * datagram error 2081 */ 2082 unix_state_unlock(other); 2083 sock_put(other); 2084 2085 if (!sk_locked) 2086 unix_state_lock(sk); 2087 2088 err = 0; 2089 if (sk->sk_type == SOCK_SEQPACKET) { 2090 /* We are here only when racing with unix_release_sock() 2091 * is clearing @other. Never change state to TCP_CLOSE 2092 * unlike SOCK_DGRAM wants. 
2093 */ 2094 unix_state_unlock(sk); 2095 err = -EPIPE; 2096 } else if (unix_peer(sk) == other) { 2097 unix_peer(sk) = NULL; 2098 unix_dgram_peer_wake_disconnect_wakeup(sk, other); 2099 2100 WRITE_ONCE(sk->sk_state, TCP_CLOSE); 2101 unix_state_unlock(sk); 2102 2103 unix_dgram_disconnected(sk, other); 2104 sock_put(other); 2105 err = -ECONNREFUSED; 2106 } else { 2107 unix_state_unlock(sk); 2108 } 2109 2110 other = NULL; 2111 if (err) 2112 goto out_free; 2113 goto restart; 2114 } 2115 2116 err = -EPIPE; 2117 if (other->sk_shutdown & RCV_SHUTDOWN) 2118 goto out_unlock; 2119 2120 if (sk->sk_type != SOCK_SEQPACKET) { 2121 err = security_unix_may_send(sk->sk_socket, other->sk_socket); 2122 if (err) 2123 goto out_unlock; 2124 } 2125 2126 /* other == sk && unix_peer(other) != sk if 2127 * - unix_peer(sk) == NULL, destination address bound to sk 2128 * - unix_peer(sk) == sk by time of get but disconnected before lock 2129 */ 2130 if (other != sk && 2131 unlikely(unix_peer(other) != sk && 2132 unix_recvq_full_lockless(other))) { 2133 if (timeo) { 2134 timeo = unix_wait_for_peer(other, timeo); 2135 2136 err = sock_intr_errno(timeo); 2137 if (signal_pending(current)) 2138 goto out_free; 2139 2140 goto restart; 2141 } 2142 2143 if (!sk_locked) { 2144 unix_state_unlock(other); 2145 unix_state_double_lock(sk, other); 2146 } 2147 2148 if (unix_peer(sk) != other || 2149 unix_dgram_peer_wake_me(sk, other)) { 2150 err = -EAGAIN; 2151 sk_locked = 1; 2152 goto out_unlock; 2153 } 2154 2155 if (!sk_locked) { 2156 sk_locked = 1; 2157 goto restart_locked; 2158 } 2159 } 2160 2161 if (unlikely(sk_locked)) 2162 unix_state_unlock(sk); 2163 2164 if (sock_flag(other, SOCK_RCVTSTAMP)) 2165 __net_timestamp(skb); 2166 maybe_add_creds(skb, sock, other); 2167 scm_stat_add(other, skb); 2168 skb_queue_tail(&other->sk_receive_queue, skb); 2169 unix_state_unlock(other); 2170 other->sk_data_ready(other); 2171 sock_put(other); 2172 scm_destroy(&scm); 2173 return len; 2174 2175 out_unlock: 2176 if (sk_locked) 2177 unix_state_unlock(sk); 2178 unix_state_unlock(other); 2179 out_free: 2180 kfree_skb(skb); 2181 out: 2182 if (other) 2183 sock_put(other); 2184 scm_destroy(&scm); 2185 return err; 2186 } 2187 2188 /* We use paged skbs for stream sockets, and limit occupancy to 32768 2189 * bytes, and a minimum of a full page. 
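 *
 * Assuming the common 4 KiB PAGE_SIZE, get_order(32768) below is 3, so
 * UNIX_SKB_FRAGS_SZ works out to 8 pages (32 KiB) of page fragments on
 * top of an SKB_MAX_HEAD(0)-sized linear area per skb.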
2190 */ 2191 #define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768)) 2192 2193 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 2194 static int queue_oob(struct socket *sock, struct msghdr *msg, struct sock *other, 2195 struct scm_cookie *scm, bool fds_sent) 2196 { 2197 struct unix_sock *ousk = unix_sk(other); 2198 struct sk_buff *skb; 2199 int err = 0; 2200 2201 skb = sock_alloc_send_skb(sock->sk, 1, msg->msg_flags & MSG_DONTWAIT, &err); 2202 2203 if (!skb) 2204 return err; 2205 2206 err = unix_scm_to_skb(scm, skb, !fds_sent); 2207 if (err < 0) { 2208 kfree_skb(skb); 2209 return err; 2210 } 2211 skb_put(skb, 1); 2212 err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, 1); 2213 2214 if (err) { 2215 kfree_skb(skb); 2216 return err; 2217 } 2218 2219 unix_state_lock(other); 2220 2221 if (sock_flag(other, SOCK_DEAD) || 2222 (other->sk_shutdown & RCV_SHUTDOWN)) { 2223 unix_state_unlock(other); 2224 kfree_skb(skb); 2225 return -EPIPE; 2226 } 2227 2228 maybe_add_creds(skb, sock, other); 2229 skb_get(skb); 2230 2231 scm_stat_add(other, skb); 2232 2233 spin_lock(&other->sk_receive_queue.lock); 2234 if (ousk->oob_skb) 2235 consume_skb(ousk->oob_skb); 2236 WRITE_ONCE(ousk->oob_skb, skb); 2237 __skb_queue_tail(&other->sk_receive_queue, skb); 2238 spin_unlock(&other->sk_receive_queue.lock); 2239 2240 sk_send_sigurg(other); 2241 unix_state_unlock(other); 2242 other->sk_data_ready(other); 2243 2244 return err; 2245 } 2246 #endif 2247 2248 static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, 2249 size_t len) 2250 { 2251 struct sock *sk = sock->sk; 2252 struct sock *other = NULL; 2253 int err, size; 2254 struct sk_buff *skb; 2255 int sent = 0; 2256 struct scm_cookie scm; 2257 bool fds_sent = false; 2258 int data_len; 2259 2260 err = scm_send(sock, msg, &scm, false); 2261 if (err < 0) 2262 return err; 2263 2264 wait_for_unix_gc(scm.fp); 2265 2266 err = -EOPNOTSUPP; 2267 if (msg->msg_flags & MSG_OOB) { 2268 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 2269 if (len) 2270 len--; 2271 else 2272 #endif 2273 goto out_err; 2274 } 2275 2276 if (msg->msg_namelen) { 2277 err = READ_ONCE(sk->sk_state) == TCP_ESTABLISHED ? 
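/*
 * Stream sockets never take a destination address here: the attempt
 * gets -EISCONN if we are already connected, -EOPNOTSUPP otherwise.
 */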
-EISCONN : -EOPNOTSUPP; 2278 goto out_err; 2279 } else { 2280 err = -ENOTCONN; 2281 other = unix_peer(sk); 2282 if (!other) 2283 goto out_err; 2284 } 2285 2286 if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN) 2287 goto pipe_err; 2288 2289 while (sent < len) { 2290 size = len - sent; 2291 2292 if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) { 2293 skb = sock_alloc_send_pskb(sk, 0, 0, 2294 msg->msg_flags & MSG_DONTWAIT, 2295 &err, 0); 2296 } else { 2297 /* Keep two messages in the pipe so it schedules better */ 2298 size = min_t(int, size, (READ_ONCE(sk->sk_sndbuf) >> 1) - 64); 2299 2300 /* allow fallback to order-0 allocations */ 2301 size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ); 2302 2303 data_len = max_t(int, 0, size - SKB_MAX_HEAD(0)); 2304 2305 data_len = min_t(size_t, size, PAGE_ALIGN(data_len)); 2306 2307 skb = sock_alloc_send_pskb(sk, size - data_len, data_len, 2308 msg->msg_flags & MSG_DONTWAIT, &err, 2309 get_order(UNIX_SKB_FRAGS_SZ)); 2310 } 2311 if (!skb) 2312 goto out_err; 2313 2314 /* Only send the fds in the first buffer */ 2315 err = unix_scm_to_skb(&scm, skb, !fds_sent); 2316 if (err < 0) { 2317 kfree_skb(skb); 2318 goto out_err; 2319 } 2320 fds_sent = true; 2321 2322 if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) { 2323 err = skb_splice_from_iter(skb, &msg->msg_iter, size, 2324 sk->sk_allocation); 2325 if (err < 0) { 2326 kfree_skb(skb); 2327 goto out_err; 2328 } 2329 size = err; 2330 refcount_add(size, &sk->sk_wmem_alloc); 2331 } else { 2332 skb_put(skb, size - data_len); 2333 skb->data_len = data_len; 2334 skb->len = size; 2335 err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size); 2336 if (err) { 2337 kfree_skb(skb); 2338 goto out_err; 2339 } 2340 } 2341 2342 unix_state_lock(other); 2343 2344 if (sock_flag(other, SOCK_DEAD) || 2345 (other->sk_shutdown & RCV_SHUTDOWN)) 2346 goto pipe_err_free; 2347 2348 maybe_add_creds(skb, sock, other); 2349 scm_stat_add(other, skb); 2350 skb_queue_tail(&other->sk_receive_queue, skb); 2351 unix_state_unlock(other); 2352 other->sk_data_ready(other); 2353 sent += size; 2354 } 2355 2356 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 2357 if (msg->msg_flags & MSG_OOB) { 2358 err = queue_oob(sock, msg, other, &scm, fds_sent); 2359 if (err) 2360 goto out_err; 2361 sent++; 2362 } 2363 #endif 2364 2365 scm_destroy(&scm); 2366 2367 return sent; 2368 2369 pipe_err_free: 2370 unix_state_unlock(other); 2371 kfree_skb(skb); 2372 pipe_err: 2373 if (sent == 0 && !(msg->msg_flags&MSG_NOSIGNAL)) 2374 send_sig(SIGPIPE, current, 0); 2375 err = -EPIPE; 2376 out_err: 2377 scm_destroy(&scm); 2378 return sent ? 
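/*
 * GNU "x ? : y" shorthand: if some bytes were already queued to the
 * peer, report that count and let the error (e.g. -EPIPE) surface on
 * the next call; only a send that moved nothing returns the error.
 */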
: err; 2379 } 2380 2381 static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg, 2382 size_t len) 2383 { 2384 int err; 2385 struct sock *sk = sock->sk; 2386 2387 err = sock_error(sk); 2388 if (err) 2389 return err; 2390 2391 if (READ_ONCE(sk->sk_state) != TCP_ESTABLISHED) 2392 return -ENOTCONN; 2393 2394 if (msg->msg_namelen) 2395 msg->msg_namelen = 0; 2396 2397 return unix_dgram_sendmsg(sock, msg, len); 2398 } 2399 2400 static int unix_seqpacket_recvmsg(struct socket *sock, struct msghdr *msg, 2401 size_t size, int flags) 2402 { 2403 struct sock *sk = sock->sk; 2404 2405 if (READ_ONCE(sk->sk_state) != TCP_ESTABLISHED) 2406 return -ENOTCONN; 2407 2408 return unix_dgram_recvmsg(sock, msg, size, flags); 2409 } 2410 2411 static void unix_copy_addr(struct msghdr *msg, struct sock *sk) 2412 { 2413 struct unix_address *addr = smp_load_acquire(&unix_sk(sk)->addr); 2414 2415 if (addr) { 2416 msg->msg_namelen = addr->len; 2417 memcpy(msg->msg_name, addr->name, addr->len); 2418 } 2419 } 2420 2421 int __unix_dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t size, 2422 int flags) 2423 { 2424 struct scm_cookie scm; 2425 struct socket *sock = sk->sk_socket; 2426 struct unix_sock *u = unix_sk(sk); 2427 struct sk_buff *skb, *last; 2428 long timeo; 2429 int skip; 2430 int err; 2431 2432 err = -EOPNOTSUPP; 2433 if (flags&MSG_OOB) 2434 goto out; 2435 2436 timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); 2437 2438 do { 2439 mutex_lock(&u->iolock); 2440 2441 skip = sk_peek_offset(sk, flags); 2442 skb = __skb_try_recv_datagram(sk, &sk->sk_receive_queue, flags, 2443 &skip, &err, &last); 2444 if (skb) { 2445 if (!(flags & MSG_PEEK)) 2446 scm_stat_del(sk, skb); 2447 break; 2448 } 2449 2450 mutex_unlock(&u->iolock); 2451 2452 if (err != -EAGAIN) 2453 break; 2454 } while (timeo && 2455 !__skb_wait_for_more_packets(sk, &sk->sk_receive_queue, 2456 &err, &timeo, last)); 2457 2458 if (!skb) { /* implies iolock unlocked */ 2459 unix_state_lock(sk); 2460 /* Signal EOF on disconnected non-blocking SEQPACKET socket. */ 2461 if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN && 2462 (sk->sk_shutdown & RCV_SHUTDOWN)) 2463 err = 0; 2464 unix_state_unlock(sk); 2465 goto out; 2466 } 2467 2468 if (wq_has_sleeper(&u->peer_wait)) 2469 wake_up_interruptible_sync_poll(&u->peer_wait, 2470 EPOLLOUT | EPOLLWRNORM | 2471 EPOLLWRBAND); 2472 2473 if (msg->msg_name) { 2474 unix_copy_addr(msg, skb->sk); 2475 2476 BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk, 2477 msg->msg_name, 2478 &msg->msg_namelen); 2479 } 2480 2481 if (size > skb->len - skip) 2482 size = skb->len - skip; 2483 else if (size < skb->len - skip) 2484 msg->msg_flags |= MSG_TRUNC; 2485 2486 err = skb_copy_datagram_msg(skb, skip, msg, size); 2487 if (err) 2488 goto out_free; 2489 2490 if (sock_flag(sk, SOCK_RCVTSTAMP)) 2491 __sock_recv_timestamp(msg, sk, skb); 2492 2493 memset(&scm, 0, sizeof(scm)); 2494 2495 scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid); 2496 unix_set_secdata(&scm, skb); 2497 2498 if (!(flags & MSG_PEEK)) { 2499 if (UNIXCB(skb).fp) 2500 unix_detach_fds(&scm, skb); 2501 2502 sk_peek_offset_bwd(sk, skb->len); 2503 } else { 2504 /* It is questionable: on PEEK we could: 2505 - do not return fds - good, but too simple 8) 2506 - return fds, and do not return them on read (old strategy, 2507 apparently wrong) 2508 - clone fds (I chose it for now, it is the most universal 2509 solution) 2510 2511 POSIX 1003.1g does not actually define this clearly 2512 at all. 
POSIX 1003.1g doesn't define a lot of things 2513 clearly however! 2514 2515 */ 2516 2517 sk_peek_offset_fwd(sk, size); 2518 2519 if (UNIXCB(skb).fp) 2520 unix_peek_fds(&scm, skb); 2521 } 2522 err = (flags & MSG_TRUNC) ? skb->len - skip : size; 2523 2524 scm_recv_unix(sock, msg, &scm, flags); 2525 2526 out_free: 2527 skb_free_datagram(sk, skb); 2528 mutex_unlock(&u->iolock); 2529 out: 2530 return err; 2531 } 2532 2533 static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, 2534 int flags) 2535 { 2536 struct sock *sk = sock->sk; 2537 2538 #ifdef CONFIG_BPF_SYSCALL 2539 const struct proto *prot = READ_ONCE(sk->sk_prot); 2540 2541 if (prot != &unix_dgram_proto) 2542 return prot->recvmsg(sk, msg, size, flags, NULL); 2543 #endif 2544 return __unix_dgram_recvmsg(sk, msg, size, flags); 2545 } 2546 2547 static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor) 2548 { 2549 struct unix_sock *u = unix_sk(sk); 2550 struct sk_buff *skb; 2551 int err; 2552 2553 mutex_lock(&u->iolock); 2554 skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err); 2555 mutex_unlock(&u->iolock); 2556 if (!skb) 2557 return err; 2558 2559 return recv_actor(sk, skb); 2560 } 2561 2562 /* 2563 * Sleep until more data has arrived. But check for races.. 2564 */ 2565 static long unix_stream_data_wait(struct sock *sk, long timeo, 2566 struct sk_buff *last, unsigned int last_len, 2567 bool freezable) 2568 { 2569 unsigned int state = TASK_INTERRUPTIBLE | freezable * TASK_FREEZABLE; 2570 struct sk_buff *tail; 2571 DEFINE_WAIT(wait); 2572 2573 unix_state_lock(sk); 2574 2575 for (;;) { 2576 prepare_to_wait(sk_sleep(sk), &wait, state); 2577 2578 tail = skb_peek_tail(&sk->sk_receive_queue); 2579 if (tail != last || 2580 (tail && tail->len != last_len) || 2581 sk->sk_err || 2582 (sk->sk_shutdown & RCV_SHUTDOWN) || 2583 signal_pending(current) || 2584 !timeo) 2585 break; 2586 2587 sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); 2588 unix_state_unlock(sk); 2589 timeo = schedule_timeout(timeo); 2590 unix_state_lock(sk); 2591 2592 if (sock_flag(sk, SOCK_DEAD)) 2593 break; 2594 2595 sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); 2596 } 2597 2598 finish_wait(sk_sleep(sk), &wait); 2599 unix_state_unlock(sk); 2600 return timeo; 2601 } 2602 2603 static unsigned int unix_skb_len(const struct sk_buff *skb) 2604 { 2605 return skb->len - UNIXCB(skb).consumed; 2606 } 2607 2608 struct unix_stream_read_state { 2609 int (*recv_actor)(struct sk_buff *, int, int, 2610 struct unix_stream_read_state *); 2611 struct socket *socket; 2612 struct msghdr *msg; 2613 struct pipe_inode_info *pipe; 2614 size_t size; 2615 int flags; 2616 unsigned int splice_flags; 2617 }; 2618 2619 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 2620 static int unix_stream_recv_urg(struct unix_stream_read_state *state) 2621 { 2622 struct socket *sock = state->socket; 2623 struct sock *sk = sock->sk; 2624 struct unix_sock *u = unix_sk(sk); 2625 int chunk = 1; 2626 struct sk_buff *oob_skb; 2627 2628 mutex_lock(&u->iolock); 2629 unix_state_lock(sk); 2630 spin_lock(&sk->sk_receive_queue.lock); 2631 2632 if (sock_flag(sk, SOCK_URGINLINE) || !u->oob_skb) { 2633 spin_unlock(&sk->sk_receive_queue.lock); 2634 unix_state_unlock(sk); 2635 mutex_unlock(&u->iolock); 2636 return -EINVAL; 2637 } 2638 2639 oob_skb = u->oob_skb; 2640 2641 if (!(state->flags & MSG_PEEK)) 2642 WRITE_ONCE(u->oob_skb, NULL); 2643 else 2644 skb_get(oob_skb); 2645 2646 spin_unlock(&sk->sk_receive_queue.lock); 2647 unix_state_unlock(sk); 2648 2649 chunk = state->recv_actor(oob_skb, 0, chunk, state); 2650 2651 if 
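/*
 * Unless this was only a peek, account the single OOB byte as
 * consumed; the skb itself stays on the receive queue and is
 * unlinked later, once the normal read path reaches it.
 */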
(!(state->flags & MSG_PEEK)) 2652 UNIXCB(oob_skb).consumed += 1; 2653 2654 consume_skb(oob_skb); 2655 2656 mutex_unlock(&u->iolock); 2657 2658 if (chunk < 0) 2659 return -EFAULT; 2660 2661 state->msg->msg_flags |= MSG_OOB; 2662 return 1; 2663 } 2664 2665 static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk, 2666 int flags, int copied) 2667 { 2668 struct unix_sock *u = unix_sk(sk); 2669 2670 if (!unix_skb_len(skb)) { 2671 struct sk_buff *unlinked_skb = NULL; 2672 2673 spin_lock(&sk->sk_receive_queue.lock); 2674 2675 if (copied && (!u->oob_skb || skb == u->oob_skb)) { 2676 skb = NULL; 2677 } else if (flags & MSG_PEEK) { 2678 skb = skb_peek_next(skb, &sk->sk_receive_queue); 2679 } else { 2680 unlinked_skb = skb; 2681 skb = skb_peek_next(skb, &sk->sk_receive_queue); 2682 __skb_unlink(unlinked_skb, &sk->sk_receive_queue); 2683 } 2684 2685 spin_unlock(&sk->sk_receive_queue.lock); 2686 2687 consume_skb(unlinked_skb); 2688 } else { 2689 struct sk_buff *unlinked_skb = NULL; 2690 2691 spin_lock(&sk->sk_receive_queue.lock); 2692 2693 if (skb == u->oob_skb) { 2694 if (copied) { 2695 skb = NULL; 2696 } else if (!(flags & MSG_PEEK)) { 2697 if (sock_flag(sk, SOCK_URGINLINE)) { 2698 WRITE_ONCE(u->oob_skb, NULL); 2699 consume_skb(skb); 2700 } else { 2701 __skb_unlink(skb, &sk->sk_receive_queue); 2702 WRITE_ONCE(u->oob_skb, NULL); 2703 unlinked_skb = skb; 2704 skb = skb_peek(&sk->sk_receive_queue); 2705 } 2706 } else if (!sock_flag(sk, SOCK_URGINLINE)) { 2707 skb = skb_peek_next(skb, &sk->sk_receive_queue); 2708 } 2709 } 2710 2711 spin_unlock(&sk->sk_receive_queue.lock); 2712 2713 if (unlinked_skb) { 2714 WARN_ON_ONCE(skb_unref(unlinked_skb)); 2715 kfree_skb(unlinked_skb); 2716 } 2717 } 2718 return skb; 2719 } 2720 #endif 2721 2722 static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor) 2723 { 2724 struct unix_sock *u = unix_sk(sk); 2725 struct sk_buff *skb; 2726 int err; 2727 2728 if (unlikely(READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)) 2729 return -ENOTCONN; 2730 2731 mutex_lock(&u->iolock); 2732 skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err); 2733 mutex_unlock(&u->iolock); 2734 if (!skb) 2735 return err; 2736 2737 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 2738 if (unlikely(skb == READ_ONCE(u->oob_skb))) { 2739 bool drop = false; 2740 2741 unix_state_lock(sk); 2742 2743 if (sock_flag(sk, SOCK_DEAD)) { 2744 unix_state_unlock(sk); 2745 kfree_skb(skb); 2746 return -ECONNRESET; 2747 } 2748 2749 spin_lock(&sk->sk_receive_queue.lock); 2750 if (likely(skb == u->oob_skb)) { 2751 WRITE_ONCE(u->oob_skb, NULL); 2752 drop = true; 2753 } 2754 spin_unlock(&sk->sk_receive_queue.lock); 2755 2756 unix_state_unlock(sk); 2757 2758 if (drop) { 2759 WARN_ON_ONCE(skb_unref(skb)); 2760 kfree_skb(skb); 2761 return -EAGAIN; 2762 } 2763 } 2764 #endif 2765 2766 return recv_actor(sk, skb); 2767 } 2768 2769 static int unix_stream_read_generic(struct unix_stream_read_state *state, 2770 bool freezable) 2771 { 2772 struct scm_cookie scm; 2773 struct socket *sock = state->socket; 2774 struct sock *sk = sock->sk; 2775 struct unix_sock *u = unix_sk(sk); 2776 int copied = 0; 2777 int flags = state->flags; 2778 int noblock = flags & MSG_DONTWAIT; 2779 bool check_creds = false; 2780 int target; 2781 int err = 0; 2782 long timeo; 2783 int skip; 2784 size_t size = state->size; 2785 unsigned int last_len; 2786 2787 if (unlikely(READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)) { 2788 err = -EINVAL; 2789 goto out; 2790 } 2791 2792 if (unlikely(flags & MSG_OOB)) { 2793 err = -EOPNOTSUPP; 2794 #if 
IS_ENABLED(CONFIG_AF_UNIX_OOB) 2795 err = unix_stream_recv_urg(state); 2796 #endif 2797 goto out; 2798 } 2799 2800 target = sock_rcvlowat(sk, flags & MSG_WAITALL, size); 2801 timeo = sock_rcvtimeo(sk, noblock); 2802 2803 memset(&scm, 0, sizeof(scm)); 2804 2805 /* Lock the socket to prevent queue disordering 2806 * while sleeps in memcpy_tomsg 2807 */ 2808 mutex_lock(&u->iolock); 2809 2810 skip = max(sk_peek_offset(sk, flags), 0); 2811 2812 do { 2813 struct sk_buff *skb, *last; 2814 int chunk; 2815 2816 redo: 2817 unix_state_lock(sk); 2818 if (sock_flag(sk, SOCK_DEAD)) { 2819 err = -ECONNRESET; 2820 goto unlock; 2821 } 2822 last = skb = skb_peek(&sk->sk_receive_queue); 2823 last_len = last ? last->len : 0; 2824 2825 again: 2826 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 2827 if (skb) { 2828 skb = manage_oob(skb, sk, flags, copied); 2829 if (!skb && copied) { 2830 unix_state_unlock(sk); 2831 break; 2832 } 2833 } 2834 #endif 2835 if (skb == NULL) { 2836 if (copied >= target) 2837 goto unlock; 2838 2839 /* 2840 * POSIX 1003.1g mandates this order. 2841 */ 2842 2843 err = sock_error(sk); 2844 if (err) 2845 goto unlock; 2846 if (sk->sk_shutdown & RCV_SHUTDOWN) 2847 goto unlock; 2848 2849 unix_state_unlock(sk); 2850 if (!timeo) { 2851 err = -EAGAIN; 2852 break; 2853 } 2854 2855 mutex_unlock(&u->iolock); 2856 2857 timeo = unix_stream_data_wait(sk, timeo, last, 2858 last_len, freezable); 2859 2860 if (signal_pending(current)) { 2861 err = sock_intr_errno(timeo); 2862 scm_destroy(&scm); 2863 goto out; 2864 } 2865 2866 mutex_lock(&u->iolock); 2867 goto redo; 2868 unlock: 2869 unix_state_unlock(sk); 2870 break; 2871 } 2872 2873 while (skip >= unix_skb_len(skb)) { 2874 skip -= unix_skb_len(skb); 2875 last = skb; 2876 last_len = skb->len; 2877 skb = skb_peek_next(skb, &sk->sk_receive_queue); 2878 if (!skb) 2879 goto again; 2880 } 2881 2882 unix_state_unlock(sk); 2883 2884 if (check_creds) { 2885 /* Never glue messages from different writers */ 2886 if (!unix_skb_scm_eq(skb, &scm)) 2887 break; 2888 } else if (test_bit(SOCK_PASSCRED, &sock->flags) || 2889 test_bit(SOCK_PASSPIDFD, &sock->flags)) { 2890 /* Copy credentials */ 2891 scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid); 2892 unix_set_secdata(&scm, skb); 2893 check_creds = true; 2894 } 2895 2896 /* Copy address just once */ 2897 if (state->msg && state->msg->msg_name) { 2898 DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, 2899 state->msg->msg_name); 2900 unix_copy_addr(state->msg, skb->sk); 2901 2902 BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk, 2903 state->msg->msg_name, 2904 &state->msg->msg_namelen); 2905 2906 sunaddr = NULL; 2907 } 2908 2909 chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size); 2910 chunk = state->recv_actor(skb, skip, chunk, state); 2911 if (chunk < 0) { 2912 if (copied == 0) 2913 copied = -EFAULT; 2914 break; 2915 } 2916 copied += chunk; 2917 size -= chunk; 2918 2919 /* Mark read part of skb as used */ 2920 if (!(flags & MSG_PEEK)) { 2921 UNIXCB(skb).consumed += chunk; 2922 2923 sk_peek_offset_bwd(sk, chunk); 2924 2925 if (UNIXCB(skb).fp) { 2926 scm_stat_del(sk, skb); 2927 unix_detach_fds(&scm, skb); 2928 } 2929 2930 if (unix_skb_len(skb)) 2931 break; 2932 2933 skb_unlink(skb, &sk->sk_receive_queue); 2934 consume_skb(skb); 2935 2936 if (scm.fp) 2937 break; 2938 } else { 2939 /* It is questionable, see note in unix_dgram_recvmsg. 
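 * (On MSG_PEEK the passed file descriptors are cloned into this
 * receive as well, instead of being detached from the skb.)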
2940 */ 2941 if (UNIXCB(skb).fp) 2942 unix_peek_fds(&scm, skb); 2943 2944 sk_peek_offset_fwd(sk, chunk); 2945 2946 if (UNIXCB(skb).fp) 2947 break; 2948 2949 skip = 0; 2950 last = skb; 2951 last_len = skb->len; 2952 unix_state_lock(sk); 2953 skb = skb_peek_next(skb, &sk->sk_receive_queue); 2954 if (skb) 2955 goto again; 2956 unix_state_unlock(sk); 2957 break; 2958 } 2959 } while (size); 2960 2961 mutex_unlock(&u->iolock); 2962 if (state->msg) 2963 scm_recv_unix(sock, state->msg, &scm, flags); 2964 else 2965 scm_destroy(&scm); 2966 out: 2967 return copied ? : err; 2968 } 2969 2970 static int unix_stream_read_actor(struct sk_buff *skb, 2971 int skip, int chunk, 2972 struct unix_stream_read_state *state) 2973 { 2974 int ret; 2975 2976 ret = skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip, 2977 state->msg, chunk); 2978 return ret ?: chunk; 2979 } 2980 2981 int __unix_stream_recvmsg(struct sock *sk, struct msghdr *msg, 2982 size_t size, int flags) 2983 { 2984 struct unix_stream_read_state state = { 2985 .recv_actor = unix_stream_read_actor, 2986 .socket = sk->sk_socket, 2987 .msg = msg, 2988 .size = size, 2989 .flags = flags 2990 }; 2991 2992 return unix_stream_read_generic(&state, true); 2993 } 2994 2995 static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg, 2996 size_t size, int flags) 2997 { 2998 struct unix_stream_read_state state = { 2999 .recv_actor = unix_stream_read_actor, 3000 .socket = sock, 3001 .msg = msg, 3002 .size = size, 3003 .flags = flags 3004 }; 3005 3006 #ifdef CONFIG_BPF_SYSCALL 3007 struct sock *sk = sock->sk; 3008 const struct proto *prot = READ_ONCE(sk->sk_prot); 3009 3010 if (prot != &unix_stream_proto) 3011 return prot->recvmsg(sk, msg, size, flags, NULL); 3012 #endif 3013 return unix_stream_read_generic(&state, true); 3014 } 3015 3016 static int unix_stream_splice_actor(struct sk_buff *skb, 3017 int skip, int chunk, 3018 struct unix_stream_read_state *state) 3019 { 3020 return skb_splice_bits(skb, state->socket->sk, 3021 UNIXCB(skb).consumed + skip, 3022 state->pipe, chunk, state->splice_flags); 3023 } 3024 3025 static ssize_t unix_stream_splice_read(struct socket *sock, loff_t *ppos, 3026 struct pipe_inode_info *pipe, 3027 size_t size, unsigned int flags) 3028 { 3029 struct unix_stream_read_state state = { 3030 .recv_actor = unix_stream_splice_actor, 3031 .socket = sock, 3032 .pipe = pipe, 3033 .size = size, 3034 .splice_flags = flags, 3035 }; 3036 3037 if (unlikely(*ppos)) 3038 return -ESPIPE; 3039 3040 if (sock->file->f_flags & O_NONBLOCK || 3041 flags & SPLICE_F_NONBLOCK) 3042 state.flags = MSG_DONTWAIT; 3043 3044 return unix_stream_read_generic(&state, false); 3045 } 3046 3047 static int unix_shutdown(struct socket *sock, int mode) 3048 { 3049 struct sock *sk = sock->sk; 3050 struct sock *other; 3051 3052 if (mode < SHUT_RD || mode > SHUT_RDWR) 3053 return -EINVAL; 3054 /* This maps: 3055 * SHUT_RD (0) -> RCV_SHUTDOWN (1) 3056 * SHUT_WR (1) -> SEND_SHUTDOWN (2) 3057 * SHUT_RDWR (2) -> SHUTDOWN_MASK (3) 3058 */ 3059 ++mode; 3060 3061 unix_state_lock(sk); 3062 WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | mode); 3063 other = unix_peer(sk); 3064 if (other) 3065 sock_hold(other); 3066 unix_state_unlock(sk); 3067 sk->sk_state_change(sk); 3068 3069 if (other && 3070 (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) { 3071 3072 int peer_mode = 0; 3073 const struct proto *prot = READ_ONCE(other->sk_prot); 3074 3075 if (prot->unhash) 3076 prot->unhash(other); 3077 if (mode&RCV_SHUTDOWN) 3078 peer_mode |= SEND_SHUTDOWN; 3079 if 
(mode&SEND_SHUTDOWN) 3080 peer_mode |= RCV_SHUTDOWN; 3081 unix_state_lock(other); 3082 WRITE_ONCE(other->sk_shutdown, other->sk_shutdown | peer_mode); 3083 unix_state_unlock(other); 3084 other->sk_state_change(other); 3085 if (peer_mode == SHUTDOWN_MASK) 3086 sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP); 3087 else if (peer_mode & RCV_SHUTDOWN) 3088 sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN); 3089 } 3090 if (other) 3091 sock_put(other); 3092 3093 return 0; 3094 } 3095 3096 long unix_inq_len(struct sock *sk) 3097 { 3098 struct sk_buff *skb; 3099 long amount = 0; 3100 3101 if (READ_ONCE(sk->sk_state) == TCP_LISTEN) 3102 return -EINVAL; 3103 3104 spin_lock(&sk->sk_receive_queue.lock); 3105 if (sk->sk_type == SOCK_STREAM || 3106 sk->sk_type == SOCK_SEQPACKET) { 3107 skb_queue_walk(&sk->sk_receive_queue, skb) 3108 amount += unix_skb_len(skb); 3109 } else { 3110 skb = skb_peek(&sk->sk_receive_queue); 3111 if (skb) 3112 amount = skb->len; 3113 } 3114 spin_unlock(&sk->sk_receive_queue.lock); 3115 3116 return amount; 3117 } 3118 EXPORT_SYMBOL_GPL(unix_inq_len); 3119 3120 long unix_outq_len(struct sock *sk) 3121 { 3122 return sk_wmem_alloc_get(sk); 3123 } 3124 EXPORT_SYMBOL_GPL(unix_outq_len); 3125 3126 static int unix_open_file(struct sock *sk) 3127 { 3128 struct path path; 3129 struct file *f; 3130 int fd; 3131 3132 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) 3133 return -EPERM; 3134 3135 if (!smp_load_acquire(&unix_sk(sk)->addr)) 3136 return -ENOENT; 3137 3138 path = unix_sk(sk)->path; 3139 if (!path.dentry) 3140 return -ENOENT; 3141 3142 path_get(&path); 3143 3144 fd = get_unused_fd_flags(O_CLOEXEC); 3145 if (fd < 0) 3146 goto out; 3147 3148 f = dentry_open(&path, O_PATH, current_cred()); 3149 if (IS_ERR(f)) { 3150 put_unused_fd(fd); 3151 fd = PTR_ERR(f); 3152 goto out; 3153 } 3154 3155 fd_install(fd, f); 3156 out: 3157 path_put(&path); 3158 3159 return fd; 3160 } 3161 3162 static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) 3163 { 3164 struct sock *sk = sock->sk; 3165 long amount = 0; 3166 int err; 3167 3168 switch (cmd) { 3169 case SIOCOUTQ: 3170 amount = unix_outq_len(sk); 3171 err = put_user(amount, (int __user *)arg); 3172 break; 3173 case SIOCINQ: 3174 amount = unix_inq_len(sk); 3175 if (amount < 0) 3176 err = amount; 3177 else 3178 err = put_user(amount, (int __user *)arg); 3179 break; 3180 case SIOCUNIXFILE: 3181 err = unix_open_file(sk); 3182 break; 3183 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 3184 case SIOCATMARK: 3185 { 3186 struct unix_sock *u = unix_sk(sk); 3187 struct sk_buff *skb; 3188 int answ = 0; 3189 3190 mutex_lock(&u->iolock); 3191 3192 skb = skb_peek(&sk->sk_receive_queue); 3193 if (skb) { 3194 struct sk_buff *oob_skb = READ_ONCE(u->oob_skb); 3195 3196 if (skb == oob_skb || 3197 (!oob_skb && !unix_skb_len(skb))) 3198 answ = 1; 3199 } 3200 3201 mutex_unlock(&u->iolock); 3202 3203 err = put_user(answ, (int __user *)arg); 3204 } 3205 break; 3206 #endif 3207 default: 3208 err = -ENOIOCTLCMD; 3209 break; 3210 } 3211 return err; 3212 } 3213 3214 #ifdef CONFIG_COMPAT 3215 static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) 3216 { 3217 return unix_ioctl(sock, cmd, (unsigned long)compat_ptr(arg)); 3218 } 3219 #endif 3220 3221 static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wait) 3222 { 3223 struct sock *sk = sock->sk; 3224 unsigned char state; 3225 __poll_t mask; 3226 u8 shutdown; 3227 3228 sock_poll_wait(file, sock, wait); 3229 mask = 0; 3230 shutdown = 
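/*
 * Lockless snapshots: poll runs without the unix state lock, so the
 * checks below only see a best-effort view of shutdown and state.
 */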
READ_ONCE(sk->sk_shutdown); 3231 state = READ_ONCE(sk->sk_state); 3232 3233 /* exceptional events? */ 3234 if (READ_ONCE(sk->sk_err)) 3235 mask |= EPOLLERR; 3236 if (shutdown == SHUTDOWN_MASK) 3237 mask |= EPOLLHUP; 3238 if (shutdown & RCV_SHUTDOWN) 3239 mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM; 3240 3241 /* readable? */ 3242 if (!skb_queue_empty_lockless(&sk->sk_receive_queue)) 3243 mask |= EPOLLIN | EPOLLRDNORM; 3244 if (sk_is_readable(sk)) 3245 mask |= EPOLLIN | EPOLLRDNORM; 3246 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 3247 if (READ_ONCE(unix_sk(sk)->oob_skb)) 3248 mask |= EPOLLPRI; 3249 #endif 3250 3251 /* Connection-based need to check for termination and startup */ 3252 if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) && 3253 state == TCP_CLOSE) 3254 mask |= EPOLLHUP; 3255 3256 /* 3257 * we set writable also when the other side has shut down the 3258 * connection. This prevents stuck sockets. 3259 */ 3260 if (unix_writable(sk, state)) 3261 mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND; 3262 3263 return mask; 3264 } 3265 3266 static __poll_t unix_dgram_poll(struct file *file, struct socket *sock, 3267 poll_table *wait) 3268 { 3269 struct sock *sk = sock->sk, *other; 3270 unsigned int writable; 3271 unsigned char state; 3272 __poll_t mask; 3273 u8 shutdown; 3274 3275 sock_poll_wait(file, sock, wait); 3276 mask = 0; 3277 shutdown = READ_ONCE(sk->sk_shutdown); 3278 state = READ_ONCE(sk->sk_state); 3279 3280 /* exceptional events? */ 3281 if (READ_ONCE(sk->sk_err) || 3282 !skb_queue_empty_lockless(&sk->sk_error_queue)) 3283 mask |= EPOLLERR | 3284 (sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0); 3285 3286 if (shutdown & RCV_SHUTDOWN) 3287 mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM; 3288 if (shutdown == SHUTDOWN_MASK) 3289 mask |= EPOLLHUP; 3290 3291 /* readable? */ 3292 if (!skb_queue_empty_lockless(&sk->sk_receive_queue)) 3293 mask |= EPOLLIN | EPOLLRDNORM; 3294 if (sk_is_readable(sk)) 3295 mask |= EPOLLIN | EPOLLRDNORM; 3296 3297 /* Connection-based need to check for termination and startup */ 3298 if (sk->sk_type == SOCK_SEQPACKET && state == TCP_CLOSE) 3299 mask |= EPOLLHUP; 3300 3301 /* No write status requested, avoid expensive OUT tests. 
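 * (The expensive part below takes unix_state_lock() and may hook this
 * socket onto the peer's peer_wait queue via unix_dgram_peer_wake_me().)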
*/ 3302 if (!(poll_requested_events(wait) & (EPOLLWRBAND|EPOLLWRNORM|EPOLLOUT))) 3303 return mask; 3304 3305 writable = unix_writable(sk, state); 3306 if (writable) { 3307 unix_state_lock(sk); 3308 3309 other = unix_peer(sk); 3310 if (other && unix_peer(other) != sk && 3311 unix_recvq_full_lockless(other) && 3312 unix_dgram_peer_wake_me(sk, other)) 3313 writable = 0; 3314 3315 unix_state_unlock(sk); 3316 } 3317 3318 if (writable) 3319 mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND; 3320 else 3321 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); 3322 3323 return mask; 3324 } 3325 3326 #ifdef CONFIG_PROC_FS 3327 3328 #define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1) 3329 3330 #define get_bucket(x) ((x) >> BUCKET_SPACE) 3331 #define get_offset(x) ((x) & ((1UL << BUCKET_SPACE) - 1)) 3332 #define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o)) 3333 3334 static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos) 3335 { 3336 unsigned long offset = get_offset(*pos); 3337 unsigned long bucket = get_bucket(*pos); 3338 unsigned long count = 0; 3339 struct sock *sk; 3340 3341 for (sk = sk_head(&seq_file_net(seq)->unx.table.buckets[bucket]); 3342 sk; sk = sk_next(sk)) { 3343 if (++count == offset) 3344 break; 3345 } 3346 3347 return sk; 3348 } 3349 3350 static struct sock *unix_get_first(struct seq_file *seq, loff_t *pos) 3351 { 3352 unsigned long bucket = get_bucket(*pos); 3353 struct net *net = seq_file_net(seq); 3354 struct sock *sk; 3355 3356 while (bucket < UNIX_HASH_SIZE) { 3357 spin_lock(&net->unx.table.locks[bucket]); 3358 3359 sk = unix_from_bucket(seq, pos); 3360 if (sk) 3361 return sk; 3362 3363 spin_unlock(&net->unx.table.locks[bucket]); 3364 3365 *pos = set_bucket_offset(++bucket, 1); 3366 } 3367 3368 return NULL; 3369 } 3370 3371 static struct sock *unix_get_next(struct seq_file *seq, struct sock *sk, 3372 loff_t *pos) 3373 { 3374 unsigned long bucket = get_bucket(*pos); 3375 3376 sk = sk_next(sk); 3377 if (sk) 3378 return sk; 3379 3380 3381 spin_unlock(&seq_file_net(seq)->unx.table.locks[bucket]); 3382 3383 *pos = set_bucket_offset(++bucket, 1); 3384 3385 return unix_get_first(seq, pos); 3386 } 3387 3388 static void *unix_seq_start(struct seq_file *seq, loff_t *pos) 3389 { 3390 if (!*pos) 3391 return SEQ_START_TOKEN; 3392 3393 return unix_get_first(seq, pos); 3394 } 3395 3396 static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos) 3397 { 3398 ++*pos; 3399 3400 if (v == SEQ_START_TOKEN) 3401 return unix_get_first(seq, pos); 3402 3403 return unix_get_next(seq, v, pos); 3404 } 3405 3406 static void unix_seq_stop(struct seq_file *seq, void *v) 3407 { 3408 struct sock *sk = v; 3409 3410 if (sk) 3411 spin_unlock(&seq_file_net(seq)->unx.table.locks[sk->sk_hash]); 3412 } 3413 3414 static int unix_seq_show(struct seq_file *seq, void *v) 3415 { 3416 3417 if (v == SEQ_START_TOKEN) 3418 seq_puts(seq, "Num RefCount Protocol Flags Type St " 3419 "Inode Path\n"); 3420 else { 3421 struct sock *s = v; 3422 struct unix_sock *u = unix_sk(s); 3423 unix_state_lock(s); 3424 3425 seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu", 3426 s, 3427 refcount_read(&s->sk_refcnt), 3428 0, 3429 s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0, 3430 s->sk_type, 3431 s->sk_socket ? 3432 (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) : 3433 (s->sk_state == TCP_ESTABLISHED ? 
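/* no struct socket attached: an embryo on a listener's
 * queue, or an orphaned socket being torn down
 */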
SS_CONNECTING : SS_DISCONNECTING), 3434 sock_i_ino(s)); 3435 3436 if (u->addr) { // under a hash table lock here 3437 int i, len; 3438 seq_putc(seq, ' '); 3439 3440 i = 0; 3441 len = u->addr->len - 3442 offsetof(struct sockaddr_un, sun_path); 3443 if (u->addr->name->sun_path[0]) { 3444 len--; 3445 } else { 3446 seq_putc(seq, '@'); 3447 i++; 3448 } 3449 for ( ; i < len; i++) 3450 seq_putc(seq, u->addr->name->sun_path[i] ?: 3451 '@'); 3452 } 3453 unix_state_unlock(s); 3454 seq_putc(seq, '\n'); 3455 } 3456 3457 return 0; 3458 } 3459 3460 static const struct seq_operations unix_seq_ops = { 3461 .start = unix_seq_start, 3462 .next = unix_seq_next, 3463 .stop = unix_seq_stop, 3464 .show = unix_seq_show, 3465 }; 3466 3467 #ifdef CONFIG_BPF_SYSCALL 3468 struct bpf_unix_iter_state { 3469 struct seq_net_private p; 3470 unsigned int cur_sk; 3471 unsigned int end_sk; 3472 unsigned int max_sk; 3473 struct sock **batch; 3474 bool st_bucket_done; 3475 }; 3476 3477 struct bpf_iter__unix { 3478 __bpf_md_ptr(struct bpf_iter_meta *, meta); 3479 __bpf_md_ptr(struct unix_sock *, unix_sk); 3480 uid_t uid __aligned(8); 3481 }; 3482 3483 static int unix_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta, 3484 struct unix_sock *unix_sk, uid_t uid) 3485 { 3486 struct bpf_iter__unix ctx; 3487 3488 meta->seq_num--; /* skip SEQ_START_TOKEN */ 3489 ctx.meta = meta; 3490 ctx.unix_sk = unix_sk; 3491 ctx.uid = uid; 3492 return bpf_iter_run_prog(prog, &ctx); 3493 } 3494 3495 static int bpf_iter_unix_hold_batch(struct seq_file *seq, struct sock *start_sk) 3496 3497 { 3498 struct bpf_unix_iter_state *iter = seq->private; 3499 unsigned int expected = 1; 3500 struct sock *sk; 3501 3502 sock_hold(start_sk); 3503 iter->batch[iter->end_sk++] = start_sk; 3504 3505 for (sk = sk_next(start_sk); sk; sk = sk_next(sk)) { 3506 if (iter->end_sk < iter->max_sk) { 3507 sock_hold(sk); 3508 iter->batch[iter->end_sk++] = sk; 3509 } 3510 3511 expected++; 3512 } 3513 3514 spin_unlock(&seq_file_net(seq)->unx.table.locks[start_sk->sk_hash]); 3515 3516 return expected; 3517 } 3518 3519 static void bpf_iter_unix_put_batch(struct bpf_unix_iter_state *iter) 3520 { 3521 while (iter->cur_sk < iter->end_sk) 3522 sock_put(iter->batch[iter->cur_sk++]); 3523 } 3524 3525 static int bpf_iter_unix_realloc_batch(struct bpf_unix_iter_state *iter, 3526 unsigned int new_batch_sz) 3527 { 3528 struct sock **new_batch; 3529 3530 new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz, 3531 GFP_USER | __GFP_NOWARN); 3532 if (!new_batch) 3533 return -ENOMEM; 3534 3535 bpf_iter_unix_put_batch(iter); 3536 kvfree(iter->batch); 3537 iter->batch = new_batch; 3538 iter->max_sk = new_batch_sz; 3539 3540 return 0; 3541 } 3542 3543 static struct sock *bpf_iter_unix_batch(struct seq_file *seq, 3544 loff_t *pos) 3545 { 3546 struct bpf_unix_iter_state *iter = seq->private; 3547 unsigned int expected; 3548 bool resized = false; 3549 struct sock *sk; 3550 3551 if (iter->st_bucket_done) 3552 *pos = set_bucket_offset(get_bucket(*pos) + 1, 1); 3553 3554 again: 3555 /* Get a new batch */ 3556 iter->cur_sk = 0; 3557 iter->end_sk = 0; 3558 3559 sk = unix_get_first(seq, pos); 3560 if (!sk) 3561 return NULL; /* Done */ 3562 3563 expected = bpf_iter_unix_hold_batch(seq, sk); 3564 3565 if (iter->end_sk == expected) { 3566 iter->st_bucket_done = true; 3567 return sk; 3568 } 3569 3570 if (!resized && !bpf_iter_unix_realloc_batch(iter, expected * 3 / 2)) { 3571 resized = true; 3572 goto again; 3573 } 3574 3575 return sk; 3576 } 3577 3578 static void *bpf_iter_unix_seq_start(struct 
seq_file *seq, loff_t *pos) 3579 { 3580 if (!*pos) 3581 return SEQ_START_TOKEN; 3582 3583 /* bpf iter does not support lseek, so it always 3584 * continue from where it was stop()-ped. 3585 */ 3586 return bpf_iter_unix_batch(seq, pos); 3587 } 3588 3589 static void *bpf_iter_unix_seq_next(struct seq_file *seq, void *v, loff_t *pos) 3590 { 3591 struct bpf_unix_iter_state *iter = seq->private; 3592 struct sock *sk; 3593 3594 /* Whenever seq_next() is called, the iter->cur_sk is 3595 * done with seq_show(), so advance to the next sk in 3596 * the batch. 3597 */ 3598 if (iter->cur_sk < iter->end_sk) 3599 sock_put(iter->batch[iter->cur_sk++]); 3600 3601 ++*pos; 3602 3603 if (iter->cur_sk < iter->end_sk) 3604 sk = iter->batch[iter->cur_sk]; 3605 else 3606 sk = bpf_iter_unix_batch(seq, pos); 3607 3608 return sk; 3609 } 3610 3611 static int bpf_iter_unix_seq_show(struct seq_file *seq, void *v) 3612 { 3613 struct bpf_iter_meta meta; 3614 struct bpf_prog *prog; 3615 struct sock *sk = v; 3616 uid_t uid; 3617 bool slow; 3618 int ret; 3619 3620 if (v == SEQ_START_TOKEN) 3621 return 0; 3622 3623 slow = lock_sock_fast(sk); 3624 3625 if (unlikely(sk_unhashed(sk))) { 3626 ret = SEQ_SKIP; 3627 goto unlock; 3628 } 3629 3630 uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk)); 3631 meta.seq = seq; 3632 prog = bpf_iter_get_info(&meta, false); 3633 ret = unix_prog_seq_show(prog, &meta, v, uid); 3634 unlock: 3635 unlock_sock_fast(sk, slow); 3636 return ret; 3637 } 3638 3639 static void bpf_iter_unix_seq_stop(struct seq_file *seq, void *v) 3640 { 3641 struct bpf_unix_iter_state *iter = seq->private; 3642 struct bpf_iter_meta meta; 3643 struct bpf_prog *prog; 3644 3645 if (!v) { 3646 meta.seq = seq; 3647 prog = bpf_iter_get_info(&meta, true); 3648 if (prog) 3649 (void)unix_prog_seq_show(prog, &meta, v, 0); 3650 } 3651 3652 if (iter->cur_sk < iter->end_sk) 3653 bpf_iter_unix_put_batch(iter); 3654 } 3655 3656 static const struct seq_operations bpf_iter_unix_seq_ops = { 3657 .start = bpf_iter_unix_seq_start, 3658 .next = bpf_iter_unix_seq_next, 3659 .stop = bpf_iter_unix_seq_stop, 3660 .show = bpf_iter_unix_seq_show, 3661 }; 3662 #endif 3663 #endif 3664 3665 static const struct net_proto_family unix_family_ops = { 3666 .family = PF_UNIX, 3667 .create = unix_create, 3668 .owner = THIS_MODULE, 3669 }; 3670 3671 3672 static int __net_init unix_net_init(struct net *net) 3673 { 3674 int i; 3675 3676 net->unx.sysctl_max_dgram_qlen = 10; 3677 if (unix_sysctl_register(net)) 3678 goto out; 3679 3680 #ifdef CONFIG_PROC_FS 3681 if (!proc_create_net("unix", 0, net->proc_net, &unix_seq_ops, 3682 sizeof(struct seq_net_private))) 3683 goto err_sysctl; 3684 #endif 3685 3686 net->unx.table.locks = kvmalloc_array(UNIX_HASH_SIZE, 3687 sizeof(spinlock_t), GFP_KERNEL); 3688 if (!net->unx.table.locks) 3689 goto err_proc; 3690 3691 net->unx.table.buckets = kvmalloc_array(UNIX_HASH_SIZE, 3692 sizeof(struct hlist_head), 3693 GFP_KERNEL); 3694 if (!net->unx.table.buckets) 3695 goto free_locks; 3696 3697 for (i = 0; i < UNIX_HASH_SIZE; i++) { 3698 spin_lock_init(&net->unx.table.locks[i]); 3699 lock_set_cmp_fn(&net->unx.table.locks[i], unix_table_lock_cmp_fn, NULL); 3700 INIT_HLIST_HEAD(&net->unx.table.buckets[i]); 3701 } 3702 3703 return 0; 3704 3705 free_locks: 3706 kvfree(net->unx.table.locks); 3707 err_proc: 3708 #ifdef CONFIG_PROC_FS 3709 remove_proc_entry("unix", net->proc_net); 3710 err_sysctl: 3711 #endif 3712 unix_sysctl_unregister(net); 3713 out: 3714 return -ENOMEM; 3715 } 3716 3717 static void __net_exit unix_net_exit(struct net 
*net) 3718 { 3719 kvfree(net->unx.table.buckets); 3720 kvfree(net->unx.table.locks); 3721 unix_sysctl_unregister(net); 3722 remove_proc_entry("unix", net->proc_net); 3723 } 3724 3725 static struct pernet_operations unix_net_ops = { 3726 .init = unix_net_init, 3727 .exit = unix_net_exit, 3728 }; 3729 3730 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) 3731 DEFINE_BPF_ITER_FUNC(unix, struct bpf_iter_meta *meta, 3732 struct unix_sock *unix_sk, uid_t uid) 3733 3734 #define INIT_BATCH_SZ 16 3735 3736 static int bpf_iter_init_unix(void *priv_data, struct bpf_iter_aux_info *aux) 3737 { 3738 struct bpf_unix_iter_state *iter = priv_data; 3739 int err; 3740 3741 err = bpf_iter_init_seq_net(priv_data, aux); 3742 if (err) 3743 return err; 3744 3745 err = bpf_iter_unix_realloc_batch(iter, INIT_BATCH_SZ); 3746 if (err) { 3747 bpf_iter_fini_seq_net(priv_data); 3748 return err; 3749 } 3750 3751 return 0; 3752 } 3753 3754 static void bpf_iter_fini_unix(void *priv_data) 3755 { 3756 struct bpf_unix_iter_state *iter = priv_data; 3757 3758 bpf_iter_fini_seq_net(priv_data); 3759 kvfree(iter->batch); 3760 } 3761 3762 static const struct bpf_iter_seq_info unix_seq_info = { 3763 .seq_ops = &bpf_iter_unix_seq_ops, 3764 .init_seq_private = bpf_iter_init_unix, 3765 .fini_seq_private = bpf_iter_fini_unix, 3766 .seq_priv_size = sizeof(struct bpf_unix_iter_state), 3767 }; 3768 3769 static const struct bpf_func_proto * 3770 bpf_iter_unix_get_func_proto(enum bpf_func_id func_id, 3771 const struct bpf_prog *prog) 3772 { 3773 switch (func_id) { 3774 case BPF_FUNC_setsockopt: 3775 return &bpf_sk_setsockopt_proto; 3776 case BPF_FUNC_getsockopt: 3777 return &bpf_sk_getsockopt_proto; 3778 default: 3779 return NULL; 3780 } 3781 } 3782 3783 static struct bpf_iter_reg unix_reg_info = { 3784 .target = "unix", 3785 .ctx_arg_info_size = 1, 3786 .ctx_arg_info = { 3787 { offsetof(struct bpf_iter__unix, unix_sk), 3788 PTR_TO_BTF_ID_OR_NULL }, 3789 }, 3790 .get_func_proto = bpf_iter_unix_get_func_proto, 3791 .seq_info = &unix_seq_info, 3792 }; 3793 3794 static void __init bpf_iter_register(void) 3795 { 3796 unix_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_UNIX]; 3797 if (bpf_iter_reg_target(&unix_reg_info)) 3798 pr_warn("Warning: could not register bpf iterator unix\n"); 3799 } 3800 #endif 3801 3802 static int __init af_unix_init(void) 3803 { 3804 int i, rc = -1; 3805 3806 BUILD_BUG_ON(sizeof(struct unix_skb_parms) > sizeof_field(struct sk_buff, cb)); 3807 3808 for (i = 0; i < UNIX_HASH_SIZE / 2; i++) { 3809 spin_lock_init(&bsd_socket_locks[i]); 3810 INIT_HLIST_HEAD(&bsd_socket_buckets[i]); 3811 } 3812 3813 rc = proto_register(&unix_dgram_proto, 1); 3814 if (rc != 0) { 3815 pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__); 3816 goto out; 3817 } 3818 3819 rc = proto_register(&unix_stream_proto, 1); 3820 if (rc != 0) { 3821 pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__); 3822 proto_unregister(&unix_dgram_proto); 3823 goto out; 3824 } 3825 3826 sock_register(&unix_family_ops); 3827 register_pernet_subsys(&unix_net_ops); 3828 unix_bpf_build_proto(); 3829 3830 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) 3831 bpf_iter_register(); 3832 #endif 3833 3834 out: 3835 return rc; 3836 } 3837 3838 /* Later than subsys_initcall() because we depend on stuff initialised there */ 3839 fs_initcall(af_unix_init); 3840
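/*
 * Illustrative userspace counterpart (a comment only, not part of the
 * kernel build) of the SCM_RIGHTS descriptor passing that
 * unix_attach_fds()/unix_detach_fds() implement above. The helper name
 * send_fd() and the one-byte payload are arbitrary; <sys/socket.h> and
 * <string.h> are assumed.
 *
 *	int send_fd(int sock, int fd)
 *	{
 *		char byte = 0;
 *		struct iovec iov = { .iov_base = &byte, .iov_len = 1 };
 *		union {
 *			char buf[CMSG_SPACE(sizeof(int))];
 *			struct cmsghdr align;
 *		} u;
 *		struct msghdr msg = {
 *			.msg_iov = &iov, .msg_iovlen = 1,
 *			.msg_control = u.buf, .msg_controllen = sizeof(u.buf),
 *		};
 *		struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
 *
 *		cmsg->cmsg_level = SOL_SOCKET;
 *		cmsg->cmsg_type = SCM_RIGHTS;
 *		cmsg->cmsg_len = CMSG_LEN(sizeof(int));
 *		memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
 *
 *		return sendmsg(sock, &msg, 0);
 *	}
 */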