// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * NET4:	Implementation of BSD Unix domain sockets.
 *
 * Authors:	Alan Cox, <alan@lxorguk.ukuu.org.uk>
 *
 * Fixes:
 *		Linus Torvalds	:	Assorted bug cures.
 *		Niibe Yutaka	:	async I/O support.
 *		Carsten Paeth	:	PF_UNIX check, address fixes.
 *		Alan Cox	:	Limit size of allocated blocks.
 *		Alan Cox	:	Fixed the stupid socketpair bug.
 *		Alan Cox	:	BSD compatibility fine tuning.
 *		Alan Cox	:	Fixed a bug in connect when interrupted.
 *		Alan Cox	:	Sorted out a proper draft version of
 *					file descriptor passing hacked up from
 *					Mike Shaver's work.
 *		Marty Leisner	:	Fixes to fd passing
 *		Nick Nevin	:	recvmsg bugfix.
 *		Alan Cox	:	Started proper garbage collector
 *		Heiko EiBfeldt	:	Missing verify_area check
 *		Alan Cox	:	Started POSIXisms
 *		Andreas Schwab	:	Replace inode by dentry for proper
 *					reference counting
 *		Kirk Petersen	:	Made this a module
 *	    Christoph Rohland	:	Elegant non-blocking accept/connect algorithm.
 *					Lots of bug fixes.
 *	     Alexey Kuznetsov	:	Repaired (I hope) bugs introduced
 *					by above two patches.
 *	     Andrea Arcangeli	:	If possible we block in connect(2)
 *					if the max backlog of the listen socket
 *					has been reached. This won't break
 *					old apps and it will avoid huge amount
 *					of socks hashed (this for unix_gc()
 *					performance reasons).
 *					Security fix that limits the max
 *					number of socks to 2*max_files and
 *					the number of skb queueable in the
 *					dgram receiver.
 *		Artur Skawina	:	Hash function optimizations
 *	     Alexey Kuznetsov	:	Full scale SMP. Lot of bugs are introduced 8)
 *	      Malcolm Beattie	:	Set peercred for socketpair
 *	     Michal Ostrowski	:	Module initialization cleanup.
 *	     Arnaldo C. Melo	:	Remove MOD_{INC,DEC}_USE_COUNT,
 *					the core infrastructure is doing that
 *					for all net proto families now (2.5.69+)
 *
 * Known differences from reference BSD that was tested:
 *
 *	[TO FIX]
 *	ECONNREFUSED is not returned from one end of a connected() socket to the
 *		other the moment one end closes.
 *	fstat() doesn't return st_dev=0, and give the blksize as high water mark
 *		and a fake inode identifier (nor the BSD first socket fstat twice bug).
 *	[NOT TO FIX]
 *	accept() returns a path name even if the connecting socket has closed
 *		in the meantime (BSD loses the path and gives up).
 *	accept() returns 0 length path for an unbound connector. BSD returns 16
 *		and a null first byte in the path (but not for gethost/peername - BSD bug ??)
 *	socketpair(...SOCK_RAW..) doesn't panic the kernel.
 *	BSD af_unix apparently has connect forgetting to block properly.
 *		(need to check this with the POSIX spec in detail)
 *
 * Differences from 2.0.0-11-... (ANK)
 *	Bug fixes and improvements.
 *		- client shutdown killed server socket.
 *		- removed all useless cli/sti pairs.
 *
 *	Semantic changes/extensions.
 *		- generic control message passing.
 *		- SCM_CREDENTIALS control message.
 *		- "Abstract" (not FS based) socket bindings.
 *		  Abstract names are sequences of bytes (not zero terminated)
 *		  started by 0, so that this name space does not intersect
 *		  with BSD names.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/signal.h>
#include <linux/sched/signal.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/stat.h>
#include <linux/dcache.h>
#include <linux/namei.h>
#include <linux/socket.h>
#include <linux/un.h>
#include <linux/fcntl.h>
#include <linux/filter.h>
#include <linux/termios.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/in.h>
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <net/net_namespace.h>
#include <net/sock.h>
#include <net/tcp_states.h>
#include <net/af_unix.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <net/scm.h>
#include <linux/init.h>
#include <linux/poll.h>
#include <linux/rtnetlink.h>
#include <linux/mount.h>
#include <net/checksum.h>
#include <linux/security.h>
#include <linux/splice.h>
#include <linux/freezer.h>
#include <linux/file.h>
#include <linux/btf_ids.h>
#include <linux/bpf-cgroup.h>

static atomic_long_t unix_nr_socks;
static struct hlist_head bsd_socket_buckets[UNIX_HASH_SIZE / 2];
static spinlock_t bsd_socket_locks[UNIX_HASH_SIZE / 2];

/* SMP locking strategy:
 *    hash table is protected with spinlock.
 *    each socket state is protected by separate spinlock.
 */
#ifdef CONFIG_PROVE_LOCKING
#define cmp_ptr(l, r)	(((l) > (r)) - ((l) < (r)))

static int unix_table_lock_cmp_fn(const struct lockdep_map *a,
				  const struct lockdep_map *b)
{
	return cmp_ptr(a, b);
}

static int unix_state_lock_cmp_fn(const struct lockdep_map *_a,
				  const struct lockdep_map *_b)
{
	const struct unix_sock *a, *b;

	a = container_of(_a, struct unix_sock, lock.dep_map);
	b = container_of(_b, struct unix_sock, lock.dep_map);

	if (a->sk.sk_state == TCP_LISTEN) {
		/* unix_stream_connect(): Before the 2nd unix_state_lock(),
		 *
		 *   1. a is TCP_LISTEN.
		 *   2. b is not a.
		 *   3. concurrent connect(b -> a) must fail.
		 *
		 * Except for 2. & 3., the b's state can be any possible
		 * value due to concurrent connect() or listen().
		 *
		 * 2. is detected in debug_spin_lock_before(), and 3. cannot
		 * be expressed as lock_cmp_fn.
		 */
		switch (b->sk.sk_state) {
		case TCP_CLOSE:
		case TCP_ESTABLISHED:
		case TCP_LISTEN:
			return -1;
		default:
			/* Invalid case. */
			return 0;
		}
	}

	/* Should never happen.  Just to be symmetric. */
	if (b->sk.sk_state == TCP_LISTEN) {
		switch (a->sk.sk_state) {
		case TCP_CLOSE:
		case TCP_ESTABLISHED:
			return 1;
		default:
			return 0;
		}
	}

	/* unix_state_double_lock(): ascending address order. */
	return cmp_ptr(a, b);
}

static int unix_recvq_lock_cmp_fn(const struct lockdep_map *_a,
				  const struct lockdep_map *_b)
{
	const struct sock *a, *b;

	a = container_of(_a, struct sock, sk_receive_queue.lock.dep_map);
	b = container_of(_b, struct sock, sk_receive_queue.lock.dep_map);

	/* unix_collect_skb(): listener -> embryo order. */
	if (a->sk_state == TCP_LISTEN && unix_sk(b)->listener == a)
		return -1;

	/* Should never happen.  Just to be symmetric. */
	if (b->sk_state == TCP_LISTEN && unix_sk(a)->listener == b)
		return 1;

	return 0;
}
#endif
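
/* Editor's note (added summary; not in the original source): the helpers
 * below partition the socket hash space.  Unbound sockets hash by their
 * kernel address into [0, UNIX_HASH_MOD], abstract sockets hash by a
 * checksum of their name into the upper half of the table, and filesystem
 * (BSD) sockets hash by inode number and are additionally linked into the
 * bsd_socket_buckets bind table above so they can be looked up by inode.
 *
 * For reference, the abstract name that unix_abstract_hash() checksums is
 * the sockaddr a process binds with a leading NUL byte, e.g. (illustrative
 * userspace sketch only):
 *
 *	struct sockaddr_un a = { .sun_family = AF_UNIX };
 *
 *	memcpy(a.sun_path, "\0example", 8);
 *	bind(fd, (struct sockaddr *)&a,
 *	     offsetof(struct sockaddr_un, sun_path) + 8);
 */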

static unsigned int unix_unbound_hash(struct sock *sk)
{
	unsigned long hash = (unsigned long)sk;

	hash ^= hash >> 16;
	hash ^= hash >> 8;
	hash ^= sk->sk_type;

	return hash & UNIX_HASH_MOD;
}

static unsigned int unix_bsd_hash(struct inode *i)
{
	return i->i_ino & UNIX_HASH_MOD;
}

static unsigned int unix_abstract_hash(struct sockaddr_un *sunaddr,
				       int addr_len, int type)
{
	__wsum csum = csum_partial(sunaddr, addr_len, 0);
	unsigned int hash;

	hash = (__force unsigned int)csum_fold(csum);
	hash ^= hash >> 8;
	hash ^= type;

	return UNIX_HASH_MOD + 1 + (hash & UNIX_HASH_MOD);
}

static void unix_table_double_lock(struct net *net,
				   unsigned int hash1, unsigned int hash2)
{
	if (hash1 == hash2) {
		spin_lock(&net->unx.table.locks[hash1]);
		return;
	}

	if (hash1 > hash2)
		swap(hash1, hash2);

	spin_lock(&net->unx.table.locks[hash1]);
	spin_lock(&net->unx.table.locks[hash2]);
}

static void unix_table_double_unlock(struct net *net,
				     unsigned int hash1, unsigned int hash2)
{
	if (hash1 == hash2) {
		spin_unlock(&net->unx.table.locks[hash1]);
		return;
	}

	spin_unlock(&net->unx.table.locks[hash1]);
	spin_unlock(&net->unx.table.locks[hash2]);
}

#ifdef CONFIG_SECURITY_NETWORK
static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{
	UNIXCB(skb).secid = scm->secid;
}

static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{
	scm->secid = UNIXCB(skb).secid;
}

static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
{
	return (scm->secid == UNIXCB(skb).secid);
}
#else
static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{ }

static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{ }

static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
{
	return true;
}
#endif /* CONFIG_SECURITY_NETWORK */

static inline int unix_may_send(struct sock *sk, struct sock *osk)
{
	return !unix_peer(osk) || unix_peer(osk) == sk;
}

static inline int unix_recvq_full_lockless(const struct sock *sk)
{
	return skb_queue_len_lockless(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
}

struct sock *unix_peer_get(struct sock *s)
{
	struct sock *peer;

	unix_state_lock(s);
	peer = unix_peer(s);
	if (peer)
		sock_hold(peer);
	unix_state_unlock(s);
	return peer;
}
EXPORT_SYMBOL_GPL(unix_peer_get);

static struct unix_address *unix_create_addr(struct sockaddr_un *sunaddr,
					     int addr_len)
{
	struct unix_address *addr;

	addr = kmalloc(sizeof(*addr) + addr_len, GFP_KERNEL);
	if (!addr)
		return NULL;

	refcount_set(&addr->refcnt, 1);
	addr->len = addr_len;
	memcpy(addr->name, sunaddr, addr_len);

	return addr;
}

static inline void unix_release_addr(struct unix_address *addr)
{
	if (refcount_dec_and_test(&addr->refcnt))
		kfree(addr);
}

/*
 *	Check unix socket name:
 *		- should be not zero length.
 *		- if started by not zero, should be NULL terminated (FS object)
 *		- if started by zero, it is abstract name.
 */

static int unix_validate_addr(struct sockaddr_un *sunaddr, int addr_len)
{
	if (addr_len <= offsetof(struct sockaddr_un, sun_path) ||
	    addr_len > sizeof(*sunaddr))
		return -EINVAL;

	if (sunaddr->sun_family != AF_UNIX)
		return -EINVAL;

	return 0;
}

static int unix_mkname_bsd(struct sockaddr_un *sunaddr, int addr_len)
{
	struct sockaddr_storage *addr = (struct sockaddr_storage *)sunaddr;
	short offset = offsetof(struct sockaddr_storage, __data);

	BUILD_BUG_ON(offset != offsetof(struct sockaddr_un, sun_path));

	/* This may look like an off by one error but it is a bit more
	 * subtle.  108 is the longest valid AF_UNIX path for a binding.
	 * sun_path[108] doesn't as such exist.  However in kernel space
	 * we are guaranteed that it is a valid memory location in our
	 * kernel address buffer because syscall functions always pass
	 * a pointer of struct sockaddr_storage which has a bigger buffer
	 * than 108.  Also, we must terminate sun_path for strlen() in
	 * getname_kernel().
	 */
	addr->__data[addr_len - offset] = 0;

	/* Don't pass sunaddr->sun_path to strlen().  Otherwise, 108 will
	 * cause panic if CONFIG_FORTIFY_SOURCE=y.  Let __fortify_strlen()
	 * know the actual buffer.
	 */
	return strlen(addr->__data) + offset + 1;
}

static void __unix_remove_socket(struct sock *sk)
{
	sk_del_node_init(sk);
}

static void __unix_insert_socket(struct net *net, struct sock *sk)
{
	DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
	sk_add_node(sk, &net->unx.table.buckets[sk->sk_hash]);
}

static void __unix_set_addr_hash(struct net *net, struct sock *sk,
				 struct unix_address *addr, unsigned int hash)
{
	__unix_remove_socket(sk);
	smp_store_release(&unix_sk(sk)->addr, addr);

	sk->sk_hash = hash;
	__unix_insert_socket(net, sk);
}

static void unix_remove_socket(struct net *net, struct sock *sk)
{
	spin_lock(&net->unx.table.locks[sk->sk_hash]);
	__unix_remove_socket(sk);
	spin_unlock(&net->unx.table.locks[sk->sk_hash]);
}

static void unix_insert_unbound_socket(struct net *net, struct sock *sk)
{
	spin_lock(&net->unx.table.locks[sk->sk_hash]);
	__unix_insert_socket(net, sk);
	spin_unlock(&net->unx.table.locks[sk->sk_hash]);
}

static void unix_insert_bsd_socket(struct sock *sk)
{
	spin_lock(&bsd_socket_locks[sk->sk_hash]);
	sk_add_bind_node(sk, &bsd_socket_buckets[sk->sk_hash]);
	spin_unlock(&bsd_socket_locks[sk->sk_hash]);
}

static void unix_remove_bsd_socket(struct sock *sk)
{
	if (!hlist_unhashed(&sk->sk_bind_node)) {
		spin_lock(&bsd_socket_locks[sk->sk_hash]);
		__sk_del_bind_node(sk);
		spin_unlock(&bsd_socket_locks[sk->sk_hash]);

		sk_node_init(&sk->sk_bind_node);
	}
}

static struct sock *__unix_find_socket_byname(struct net *net,
					      struct sockaddr_un *sunname,
					      int len, unsigned int hash)
{
	struct sock *s;

	sk_for_each(s, &net->unx.table.buckets[hash]) {
		struct unix_sock *u = unix_sk(s);

		if (u->addr->len == len &&
		    !memcmp(u->addr->name, sunname, len))
			return s;
	}
	return NULL;
}

static inline struct sock *unix_find_socket_byname(struct net *net,
						   struct sockaddr_un *sunname,
						   int len, unsigned int hash)
{
	struct sock *s;

	spin_lock(&net->unx.table.locks[hash]);
	s = __unix_find_socket_byname(net, sunname, len, hash);
	if (s)
		sock_hold(s);
	spin_unlock(&net->unx.table.locks[hash]);
	return s;
}

static struct sock *unix_find_socket_byinode(struct inode *i)
{
	unsigned int hash = unix_bsd_hash(i);
	struct sock *s;

	spin_lock(&bsd_socket_locks[hash]);
	sk_for_each_bound(s, &bsd_socket_buckets[hash]) {
		struct dentry *dentry = unix_sk(s)->path.dentry;

		if (dentry && d_backing_inode(dentry) == i) {
			sock_hold(s);
			spin_unlock(&bsd_socket_locks[hash]);
			return s;
		}
	}
	spin_unlock(&bsd_socket_locks[hash]);
	return NULL;
}

/* Support code for asymmetrically connected dgram sockets
 *
 * If a datagram socket is connected to a socket not itself connected
 * to the first socket (eg, /dev/log), clients may only enqueue more
 * messages if the present receive queue of the server socket is not
 * "too large". This means there's a second writeability condition
 * poll and sendmsg need to test. The dgram recv code will do a wake
 * up on the peer_wait wait queue of a socket upon reception of a
 * datagram which needs to be propagated to sleeping would-be writers
 * since these might not have sent anything so far. This can't be
 * accomplished via poll_wait because the lifetime of the server
 * socket might be less than that of its clients if these break their
 * association with it or if the server socket is closed while clients
 * are still connected to it and there's no way to inform "a polling
 * implementation" that it should let go of a certain wait queue
 *
 * In order to propagate a wake up, a wait_queue_entry_t of the client
 * socket is enqueued on the peer_wait queue of the server socket
 * whose wake function does a wake_up on the ordinary client socket
 * wait queue. This connection is established whenever a write (or
 * poll for write) hit the flow control condition and broken when the
 * association to the server socket is dissolved or after a wake up
 * was relayed.
 */

static int unix_dgram_peer_wake_relay(wait_queue_entry_t *q, unsigned mode, int flags,
				      void *key)
{
	struct unix_sock *u;
	wait_queue_head_t *u_sleep;

	u = container_of(q, struct unix_sock, peer_wake);

	__remove_wait_queue(&unix_sk(u->peer_wake.private)->peer_wait,
			    q);
	u->peer_wake.private = NULL;

	/* relaying can only happen while the wq still exists */
	u_sleep = sk_sleep(&u->sk);
	if (u_sleep)
		wake_up_interruptible_poll(u_sleep, key_to_poll(key));

	return 0;
}

static int unix_dgram_peer_wake_connect(struct sock *sk, struct sock *other)
{
	struct unix_sock *u, *u_other;
	int rc;

	u = unix_sk(sk);
	u_other = unix_sk(other);
	rc = 0;
	spin_lock(&u_other->peer_wait.lock);

	if (!u->peer_wake.private) {
		u->peer_wake.private = other;
		__add_wait_queue(&u_other->peer_wait, &u->peer_wake);

		rc = 1;
	}

	spin_unlock(&u_other->peer_wait.lock);
	return rc;
}

static void unix_dgram_peer_wake_disconnect(struct sock *sk,
					    struct sock *other)
{
	struct unix_sock *u, *u_other;

	u = unix_sk(sk);
	u_other = unix_sk(other);
	spin_lock(&u_other->peer_wait.lock);

	if (u->peer_wake.private == other) {
		__remove_wait_queue(&u_other->peer_wait, &u->peer_wake);
		u->peer_wake.private = NULL;
	}

	spin_unlock(&u_other->peer_wait.lock);
}

static void unix_dgram_peer_wake_disconnect_wakeup(struct sock *sk,
						   struct sock *other)
{
	unix_dgram_peer_wake_disconnect(sk, other);
	wake_up_interruptible_poll(sk_sleep(sk),
				   EPOLLOUT |
				   EPOLLWRNORM |
				   EPOLLWRBAND);
}

/* preconditions:
 *	- unix_peer(sk) == other
 *	- association is stable
 */
static int unix_dgram_peer_wake_me(struct sock *sk, struct sock *other)
{
	int connected;

	connected = unix_dgram_peer_wake_connect(sk, other);

	/* If other is SOCK_DEAD, we want to make sure we signal
	 * POLLOUT, such that a subsequent write() can get a
	 * -ECONNREFUSED. Otherwise, if we haven't queued any skbs
	 * to other and its full, we will hang waiting for POLLOUT.
	 */
	if (unix_recvq_full_lockless(other) && !sock_flag(other, SOCK_DEAD))
		return 1;

	if (connected)
		unix_dgram_peer_wake_disconnect(sk, other);

	return 0;
}

static int unix_writable(const struct sock *sk, unsigned char state)
{
	return state != TCP_LISTEN &&
		(refcount_read(&sk->sk_wmem_alloc) << 2) <= READ_ONCE(sk->sk_sndbuf);
}

static void unix_write_space(struct sock *sk)
{
	struct socket_wq *wq;

	rcu_read_lock();
	if (unix_writable(sk, READ_ONCE(sk->sk_state))) {
		wq = rcu_dereference(sk->sk_wq);
		if (skwq_has_sleeper(wq))
			wake_up_interruptible_sync_poll(&wq->wait,
				EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND);
		sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT);
	}
	rcu_read_unlock();
}

/* When dgram socket disconnects (or changes its peer), we clear its receive
 * queue of packets arrived from previous peer. First, it allows to do
 * flow control based only on wmem_alloc; second, sk connected to peer
 * may receive messages only from that peer.
 */
static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
{
	if (!skb_queue_empty(&sk->sk_receive_queue)) {
		skb_queue_purge_reason(&sk->sk_receive_queue,
				       SKB_DROP_REASON_UNIX_DISCONNECT);

		wake_up_interruptible_all(&unix_sk(sk)->peer_wait);

		/* If one link of bidirectional dgram pipe is disconnected,
		 * we signal error. Messages are lost. Do not make this,
		 * when peer was not connected to us.
		 */
		if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
			WRITE_ONCE(other->sk_err, ECONNRESET);
			sk_error_report(other);
		}
	}
}

static void unix_sock_destructor(struct sock *sk)
{
	struct unix_sock *u = unix_sk(sk);

	skb_queue_purge_reason(&sk->sk_receive_queue, SKB_DROP_REASON_SOCKET_CLOSE);

	DEBUG_NET_WARN_ON_ONCE(refcount_read(&sk->sk_wmem_alloc));
	DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
	DEBUG_NET_WARN_ON_ONCE(sk->sk_socket);
	if (!sock_flag(sk, SOCK_DEAD)) {
		pr_info("Attempt to release alive unix socket: %p\n", sk);
		return;
	}

	if (u->addr)
		unix_release_addr(u->addr);

	atomic_long_dec(&unix_nr_socks);
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
#ifdef UNIX_REFCNT_DEBUG
	pr_debug("UNIX %p is destroyed, %ld are still alive.\n", sk,
		 atomic_long_read(&unix_nr_socks));
#endif
}

static void unix_release_sock(struct sock *sk, int embrion)
{
	struct unix_sock *u = unix_sk(sk);
	struct sock *skpair;
	struct sk_buff *skb;
	struct path path;
	int state;

	unix_remove_socket(sock_net(sk), sk);
	unix_remove_bsd_socket(sk);

	/* Clear state */
	unix_state_lock(sk);
	sock_orphan(sk);
	WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK);
	path = u->path;
	u->path.dentry = NULL;
	u->path.mnt = NULL;
	state = sk->sk_state;
	WRITE_ONCE(sk->sk_state, TCP_CLOSE);

	skpair = unix_peer(sk);
	unix_peer(sk) = NULL;

	unix_state_unlock(sk);

#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
	u->oob_skb = NULL;
#endif

	wake_up_interruptible_all(&u->peer_wait);

	if (skpair != NULL) {
		if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
			unix_state_lock(skpair);
			/* No more writes */
			WRITE_ONCE(skpair->sk_shutdown, SHUTDOWN_MASK);
			if (!skb_queue_empty_lockless(&sk->sk_receive_queue) || embrion)
				WRITE_ONCE(skpair->sk_err, ECONNRESET);
			unix_state_unlock(skpair);
			skpair->sk_state_change(skpair);
			sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
		}

		unix_dgram_peer_wake_disconnect(sk, skpair);
		sock_put(skpair); /* It may now die */
	}

	/* Try to flush out this socket. Throw out buffers at least */

	while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
		if (state == TCP_LISTEN)
			unix_release_sock(skb->sk, 1);

		/* passed fds are erased in the kfree_skb hook */
		kfree_skb_reason(skb, SKB_DROP_REASON_SOCKET_CLOSE);
	}

	if (path.dentry)
		path_put(&path);

	sock_put(sk);

	/* ---- Socket is dead now and most probably destroyed ---- */

	/*
	 * Fixme: BSD difference: In BSD all sockets connected to us get
	 *	  ECONNRESET and we die on the spot. In Linux we behave
	 *	  like files and pipes do and wait for the last
	 *	  dereference.
	 *
	 * Can't we simply set sock->err?
	 *
	 *	  What the above comment does talk about? --ANK(980817)
	 */

	if (READ_ONCE(unix_tot_inflight))
		unix_gc();	/* Garbage collect fds */
}
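
/* Editor's note (added summary, not in the original source): the three
 * helpers below manage SO_PEERCRED state.  init_peercred() stamps the
 * current task's tgid and credentials on a socket (socketpair() and the
 * embryo socket created in unix_stream_connect()), update_peercred()
 * refreshes them under sk_peer_lock when listen() is called, and
 * copy_peercred() copies the listener's credentials to the connecting
 * socket.
 */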

static void init_peercred(struct sock *sk)
{
	sk->sk_peer_pid = get_pid(task_tgid(current));
	sk->sk_peer_cred = get_current_cred();
}

static void update_peercred(struct sock *sk)
{
	const struct cred *old_cred;
	struct pid *old_pid;

	spin_lock(&sk->sk_peer_lock);
	old_pid = sk->sk_peer_pid;
	old_cred = sk->sk_peer_cred;
	init_peercred(sk);
	spin_unlock(&sk->sk_peer_lock);

	put_pid(old_pid);
	put_cred(old_cred);
}

static void copy_peercred(struct sock *sk, struct sock *peersk)
{
	lockdep_assert_held(&unix_sk(peersk)->lock);

	spin_lock(&sk->sk_peer_lock);
	sk->sk_peer_pid = get_pid(peersk->sk_peer_pid);
	sk->sk_peer_cred = get_cred(peersk->sk_peer_cred);
	spin_unlock(&sk->sk_peer_lock);
}

static int unix_listen(struct socket *sock, int backlog)
{
	int err;
	struct sock *sk = sock->sk;
	struct unix_sock *u = unix_sk(sk);

	err = -EOPNOTSUPP;
	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
		goto out;	/* Only stream/seqpacket sockets accept */
	err = -EINVAL;
	if (!READ_ONCE(u->addr))
		goto out;	/* No listens on an unbound socket */
	unix_state_lock(sk);
	if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
		goto out_unlock;
	if (backlog > sk->sk_max_ack_backlog)
		wake_up_interruptible_all(&u->peer_wait);
	sk->sk_max_ack_backlog = backlog;
	WRITE_ONCE(sk->sk_state, TCP_LISTEN);

	/* set credentials so connect can copy them */
	update_peercred(sk);
	err = 0;

out_unlock:
	unix_state_unlock(sk);
out:
	return err;
}

static int unix_release(struct socket *);
static int unix_bind(struct socket *, struct sockaddr *, int);
static int unix_stream_connect(struct socket *, struct sockaddr *,
			       int addr_len, int flags);
static int unix_socketpair(struct socket *, struct socket *);
static int unix_accept(struct socket *, struct socket *, struct proto_accept_arg *arg);
static int unix_getname(struct socket *, struct sockaddr *, int);
static __poll_t unix_poll(struct file *, struct socket *, poll_table *);
static __poll_t unix_dgram_poll(struct file *, struct socket *,
				poll_table *);
static int unix_ioctl(struct socket *, unsigned int, unsigned long);
#ifdef CONFIG_COMPAT
static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
#endif
static int unix_shutdown(struct socket *, int);
static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t);
static int unix_stream_recvmsg(struct socket *, struct msghdr *, size_t, int);
static ssize_t unix_stream_splice_read(struct socket *, loff_t *ppos,
				       struct pipe_inode_info *, size_t size,
				       unsigned int flags);
static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t);
static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int);
static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
static int unix_dgram_connect(struct socket *, struct sockaddr *,
			      int, int);
static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t);
static int unix_seqpacket_recvmsg(struct socket *, struct msghdr *, size_t,
				  int);

#ifdef CONFIG_PROC_FS
static int unix_count_nr_fds(struct sock *sk)
{
	struct sk_buff *skb;
	struct unix_sock *u;
	int nr_fds = 0;

	spin_lock(&sk->sk_receive_queue.lock);
	skb = skb_peek(&sk->sk_receive_queue);
	while (skb) {
		u = unix_sk(skb->sk);
		nr_fds += atomic_read(&u->scm_stat.nr_fds);
		skb = skb_peek_next(skb, &sk->sk_receive_queue);
	}
	spin_unlock(&sk->sk_receive_queue.lock);

	return nr_fds;
}

static void unix_show_fdinfo(struct seq_file *m, struct socket *sock)
{
	struct sock *sk = sock->sk;
	unsigned char s_state;
	struct unix_sock *u;
	int nr_fds = 0;

	if (sk) {
		s_state = READ_ONCE(sk->sk_state);
		u = unix_sk(sk);

		/* SOCK_STREAM and SOCK_SEQPACKET sockets never change their
		 * sk_state after switching to TCP_ESTABLISHED or TCP_LISTEN.
		 * SOCK_DGRAM is ordinary. So, no lock is needed.
		 */
		if (sock->type == SOCK_DGRAM || s_state == TCP_ESTABLISHED)
			nr_fds = atomic_read(&u->scm_stat.nr_fds);
		else if (s_state == TCP_LISTEN)
			nr_fds = unix_count_nr_fds(sk);

		seq_printf(m, "scm_fds: %u\n", nr_fds);
	}
}
#else
#define unix_show_fdinfo NULL
#endif

static const struct proto_ops unix_stream_ops = {
	.family = PF_UNIX,
	.owner = THIS_MODULE,
	.release = unix_release,
	.bind = unix_bind,
	.connect = unix_stream_connect,
	.socketpair = unix_socketpair,
	.accept = unix_accept,
	.getname = unix_getname,
	.poll = unix_poll,
	.ioctl = unix_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl = unix_compat_ioctl,
#endif
	.listen = unix_listen,
	.shutdown = unix_shutdown,
	.sendmsg = unix_stream_sendmsg,
	.recvmsg = unix_stream_recvmsg,
	.read_skb = unix_stream_read_skb,
	.mmap = sock_no_mmap,
	.splice_read = unix_stream_splice_read,
	.set_peek_off = sk_set_peek_off,
	.show_fdinfo = unix_show_fdinfo,
};

static const struct proto_ops unix_dgram_ops = {
	.family = PF_UNIX,
	.owner = THIS_MODULE,
	.release = unix_release,
	.bind = unix_bind,
	.connect = unix_dgram_connect,
	.socketpair = unix_socketpair,
	.accept = sock_no_accept,
	.getname = unix_getname,
	.poll = unix_dgram_poll,
	.ioctl = unix_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl = unix_compat_ioctl,
#endif
	.listen = sock_no_listen,
	.shutdown = unix_shutdown,
	.sendmsg = unix_dgram_sendmsg,
	.read_skb = unix_read_skb,
	.recvmsg = unix_dgram_recvmsg,
	.mmap = sock_no_mmap,
	.set_peek_off = sk_set_peek_off,
	.show_fdinfo = unix_show_fdinfo,
};

static const struct proto_ops unix_seqpacket_ops = {
	.family = PF_UNIX,
	.owner = THIS_MODULE,
	.release = unix_release,
	.bind = unix_bind,
	.connect = unix_stream_connect,
	.socketpair = unix_socketpair,
	.accept = unix_accept,
	.getname = unix_getname,
	.poll = unix_dgram_poll,
	.ioctl = unix_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl = unix_compat_ioctl,
#endif
	.listen = unix_listen,
	.shutdown = unix_shutdown,
	.sendmsg = unix_seqpacket_sendmsg,
	.recvmsg = unix_seqpacket_recvmsg,
	.mmap = sock_no_mmap,
	.set_peek_off = sk_set_peek_off,
	.show_fdinfo = unix_show_fdinfo,
};

static void unix_close(struct sock *sk, long timeout)
{
	/* Nothing to do here, unix socket does not need a ->close().
	 * This is merely for sockmap.
	 */
}

static void unix_unhash(struct sock *sk)
{
	/* Nothing to do here, unix socket does not need a ->unhash().
	 * This is merely for sockmap.
	 */
}

static bool unix_bpf_bypass_getsockopt(int level, int optname)
{
	if (level == SOL_SOCKET) {
		switch (optname) {
		case SO_PEERPIDFD:
			return true;
		default:
			return false;
		}
	}

	return false;
}

struct proto unix_dgram_proto = {
	.name = "UNIX",
	.owner = THIS_MODULE,
	.obj_size = sizeof(struct unix_sock),
	.close = unix_close,
	.bpf_bypass_getsockopt = unix_bpf_bypass_getsockopt,
#ifdef CONFIG_BPF_SYSCALL
	.psock_update_sk_prot = unix_dgram_bpf_update_proto,
#endif
};

struct proto unix_stream_proto = {
	.name = "UNIX-STREAM",
	.owner = THIS_MODULE,
	.obj_size = sizeof(struct unix_sock),
	.close = unix_close,
	.unhash = unix_unhash,
	.bpf_bypass_getsockopt = unix_bpf_bypass_getsockopt,
#ifdef CONFIG_BPF_SYSCALL
	.psock_update_sk_prot = unix_stream_bpf_update_proto,
#endif
};

static struct sock *unix_create1(struct net *net, struct socket *sock, int kern, int type)
{
	struct unix_sock *u;
	struct sock *sk;
	int err;

	atomic_long_inc(&unix_nr_socks);
	if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files()) {
		err = -ENFILE;
		goto err;
	}

	if (type == SOCK_STREAM)
		sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_stream_proto, kern);
	else /*dgram and seqpacket */
		sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_dgram_proto, kern);

	if (!sk) {
		err = -ENOMEM;
		goto err;
	}

	sock_init_data(sock, sk);

	sk->sk_hash = unix_unbound_hash(sk);
	sk->sk_allocation = GFP_KERNEL_ACCOUNT;
	sk->sk_write_space = unix_write_space;
	sk->sk_max_ack_backlog = READ_ONCE(net->unx.sysctl_max_dgram_qlen);
	sk->sk_destruct = unix_sock_destructor;
	lock_set_cmp_fn(&sk->sk_receive_queue.lock, unix_recvq_lock_cmp_fn, NULL);

	u = unix_sk(sk);
	u->listener = NULL;
	u->vertex = NULL;
	u->path.dentry = NULL;
	u->path.mnt = NULL;
	spin_lock_init(&u->lock);
	lock_set_cmp_fn(&u->lock, unix_state_lock_cmp_fn, NULL);
	mutex_init(&u->iolock); /* single task reading lock */
	mutex_init(&u->bindlock); /* single task binding lock */
	init_waitqueue_head(&u->peer_wait);
	init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay);
	memset(&u->scm_stat, 0, sizeof(struct scm_stat));
	unix_insert_unbound_socket(net, sk);

	sock_prot_inuse_add(net, sk->sk_prot, 1);

	return sk;

err:
	atomic_long_dec(&unix_nr_socks);
	return ERR_PTR(err);
}

static int unix_create(struct net *net, struct socket *sock, int protocol,
		       int kern)
{
	struct sock *sk;

	if (protocol && protocol != PF_UNIX)
		return -EPROTONOSUPPORT;

	sock->state = SS_UNCONNECTED;

	switch (sock->type) {
	case SOCK_STREAM:
		sock->ops = &unix_stream_ops;
		break;
		/*
		 *	Believe it or not BSD has AF_UNIX, SOCK_RAW though
		 *	nothing uses it.
1077 */ 1078 case SOCK_RAW: 1079 sock->type = SOCK_DGRAM; 1080 fallthrough; 1081 case SOCK_DGRAM: 1082 sock->ops = &unix_dgram_ops; 1083 break; 1084 case SOCK_SEQPACKET: 1085 sock->ops = &unix_seqpacket_ops; 1086 break; 1087 default: 1088 return -ESOCKTNOSUPPORT; 1089 } 1090 1091 sk = unix_create1(net, sock, kern, sock->type); 1092 if (IS_ERR(sk)) 1093 return PTR_ERR(sk); 1094 1095 return 0; 1096 } 1097 1098 static int unix_release(struct socket *sock) 1099 { 1100 struct sock *sk = sock->sk; 1101 1102 if (!sk) 1103 return 0; 1104 1105 sk->sk_prot->close(sk, 0); 1106 unix_release_sock(sk, 0); 1107 sock->sk = NULL; 1108 1109 return 0; 1110 } 1111 1112 static struct sock *unix_find_bsd(struct sockaddr_un *sunaddr, int addr_len, 1113 int type) 1114 { 1115 struct inode *inode; 1116 struct path path; 1117 struct sock *sk; 1118 int err; 1119 1120 unix_mkname_bsd(sunaddr, addr_len); 1121 err = kern_path(sunaddr->sun_path, LOOKUP_FOLLOW, &path); 1122 if (err) 1123 goto fail; 1124 1125 err = path_permission(&path, MAY_WRITE); 1126 if (err) 1127 goto path_put; 1128 1129 err = -ECONNREFUSED; 1130 inode = d_backing_inode(path.dentry); 1131 if (!S_ISSOCK(inode->i_mode)) 1132 goto path_put; 1133 1134 sk = unix_find_socket_byinode(inode); 1135 if (!sk) 1136 goto path_put; 1137 1138 err = -EPROTOTYPE; 1139 if (sk->sk_type == type) 1140 touch_atime(&path); 1141 else 1142 goto sock_put; 1143 1144 path_put(&path); 1145 1146 return sk; 1147 1148 sock_put: 1149 sock_put(sk); 1150 path_put: 1151 path_put(&path); 1152 fail: 1153 return ERR_PTR(err); 1154 } 1155 1156 static struct sock *unix_find_abstract(struct net *net, 1157 struct sockaddr_un *sunaddr, 1158 int addr_len, int type) 1159 { 1160 unsigned int hash = unix_abstract_hash(sunaddr, addr_len, type); 1161 struct dentry *dentry; 1162 struct sock *sk; 1163 1164 sk = unix_find_socket_byname(net, sunaddr, addr_len, hash); 1165 if (!sk) 1166 return ERR_PTR(-ECONNREFUSED); 1167 1168 dentry = unix_sk(sk)->path.dentry; 1169 if (dentry) 1170 touch_atime(&unix_sk(sk)->path); 1171 1172 return sk; 1173 } 1174 1175 static struct sock *unix_find_other(struct net *net, 1176 struct sockaddr_un *sunaddr, 1177 int addr_len, int type) 1178 { 1179 struct sock *sk; 1180 1181 if (sunaddr->sun_path[0]) 1182 sk = unix_find_bsd(sunaddr, addr_len, type); 1183 else 1184 sk = unix_find_abstract(net, sunaddr, addr_len, type); 1185 1186 return sk; 1187 } 1188 1189 static int unix_autobind(struct sock *sk) 1190 { 1191 struct unix_sock *u = unix_sk(sk); 1192 unsigned int new_hash, old_hash; 1193 struct net *net = sock_net(sk); 1194 struct unix_address *addr; 1195 u32 lastnum, ordernum; 1196 int err; 1197 1198 err = mutex_lock_interruptible(&u->bindlock); 1199 if (err) 1200 return err; 1201 1202 if (u->addr) 1203 goto out; 1204 1205 err = -ENOMEM; 1206 addr = kzalloc(sizeof(*addr) + 1207 offsetof(struct sockaddr_un, sun_path) + 16, GFP_KERNEL); 1208 if (!addr) 1209 goto out; 1210 1211 addr->len = offsetof(struct sockaddr_un, sun_path) + 6; 1212 addr->name->sun_family = AF_UNIX; 1213 refcount_set(&addr->refcnt, 1); 1214 1215 old_hash = sk->sk_hash; 1216 ordernum = get_random_u32(); 1217 lastnum = ordernum & 0xFFFFF; 1218 retry: 1219 ordernum = (ordernum + 1) & 0xFFFFF; 1220 sprintf(addr->name->sun_path + 1, "%05x", ordernum); 1221 1222 new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type); 1223 unix_table_double_lock(net, old_hash, new_hash); 1224 1225 if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash)) { 1226 unix_table_double_unlock(net, old_hash, new_hash); 

		/* __unix_find_socket_byname() may take long time if many names
		 * are already in use.
		 */
		cond_resched();

		if (ordernum == lastnum) {
			/* Give up if all names seems to be in use. */
			err = -ENOSPC;
			unix_release_addr(addr);
			goto out;
		}

		goto retry;
	}

	__unix_set_addr_hash(net, sk, addr, new_hash);
	unix_table_double_unlock(net, old_hash, new_hash);
	err = 0;

out:	mutex_unlock(&u->bindlock);
	return err;
}

static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr,
			 int addr_len)
{
	umode_t mode = S_IFSOCK |
	       (SOCK_INODE(sk->sk_socket)->i_mode & ~current_umask());
	struct unix_sock *u = unix_sk(sk);
	unsigned int new_hash, old_hash;
	struct net *net = sock_net(sk);
	struct mnt_idmap *idmap;
	struct unix_address *addr;
	struct dentry *dentry;
	struct path parent;
	int err;

	addr_len = unix_mkname_bsd(sunaddr, addr_len);
	addr = unix_create_addr(sunaddr, addr_len);
	if (!addr)
		return -ENOMEM;

	/*
	 * Get the parent directory, calculate the hash for last
	 * component.
	 */
	dentry = kern_path_create(AT_FDCWD, addr->name->sun_path, &parent, 0);
	if (IS_ERR(dentry)) {
		err = PTR_ERR(dentry);
		goto out;
	}

	/*
	 * All right, let's create it.
	 */
	idmap = mnt_idmap(parent.mnt);
	err = security_path_mknod(&parent, dentry, mode, 0);
	if (!err)
		err = vfs_mknod(idmap, d_inode(parent.dentry), dentry, mode, 0);
	if (err)
		goto out_path;
	err = mutex_lock_interruptible(&u->bindlock);
	if (err)
		goto out_unlink;
	if (u->addr)
		goto out_unlock;

	old_hash = sk->sk_hash;
	new_hash = unix_bsd_hash(d_backing_inode(dentry));
	unix_table_double_lock(net, old_hash, new_hash);
	u->path.mnt = mntget(parent.mnt);
	u->path.dentry = dget(dentry);
	__unix_set_addr_hash(net, sk, addr, new_hash);
	unix_table_double_unlock(net, old_hash, new_hash);
	unix_insert_bsd_socket(sk);
	mutex_unlock(&u->bindlock);
	done_path_create(&parent, dentry);
	return 0;

out_unlock:
	mutex_unlock(&u->bindlock);
	err = -EINVAL;
out_unlink:
	/* failed after successful mknod?  unlink what we'd created... */
	vfs_unlink(idmap, d_inode(parent.dentry), dentry, NULL);
out_path:
	done_path_create(&parent, dentry);
out:
	unix_release_addr(addr);
	return err == -EEXIST ?
	       -EADDRINUSE : err;
}

static int unix_bind_abstract(struct sock *sk, struct sockaddr_un *sunaddr,
			      int addr_len)
{
	struct unix_sock *u = unix_sk(sk);
	unsigned int new_hash, old_hash;
	struct net *net = sock_net(sk);
	struct unix_address *addr;
	int err;

	addr = unix_create_addr(sunaddr, addr_len);
	if (!addr)
		return -ENOMEM;

	err = mutex_lock_interruptible(&u->bindlock);
	if (err)
		goto out;

	if (u->addr) {
		err = -EINVAL;
		goto out_mutex;
	}

	old_hash = sk->sk_hash;
	new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
	unix_table_double_lock(net, old_hash, new_hash);

	if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash))
		goto out_spin;

	__unix_set_addr_hash(net, sk, addr, new_hash);
	unix_table_double_unlock(net, old_hash, new_hash);
	mutex_unlock(&u->bindlock);
	return 0;

out_spin:
	unix_table_double_unlock(net, old_hash, new_hash);
	err = -EADDRINUSE;
out_mutex:
	mutex_unlock(&u->bindlock);
out:
	unix_release_addr(addr);
	return err;
}

static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
{
	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
	struct sock *sk = sock->sk;
	int err;

	if (addr_len == offsetof(struct sockaddr_un, sun_path) &&
	    sunaddr->sun_family == AF_UNIX)
		return unix_autobind(sk);

	err = unix_validate_addr(sunaddr, addr_len);
	if (err)
		return err;

	if (sunaddr->sun_path[0])
		err = unix_bind_bsd(sk, sunaddr, addr_len);
	else
		err = unix_bind_abstract(sk, sunaddr, addr_len);

	return err;
}

static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)
{
	if (unlikely(sk1 == sk2) || !sk2) {
		unix_state_lock(sk1);
		return;
	}

	if (sk1 > sk2)
		swap(sk1, sk2);

	unix_state_lock(sk1);
	unix_state_lock(sk2);
}

static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2)
{
	if (unlikely(sk1 == sk2) || !sk2) {
		unix_state_unlock(sk1);
		return;
	}
	unix_state_unlock(sk1);
	unix_state_unlock(sk2);
}

static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
			      int alen, int flags)
{
	struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr;
	struct sock *sk = sock->sk;
	struct sock *other;
	int err;

	err = -EINVAL;
	if (alen < offsetofend(struct sockaddr, sa_family))
		goto out;

	if (addr->sa_family != AF_UNSPEC) {
		err = unix_validate_addr(sunaddr, alen);
		if (err)
			goto out;

		err = BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk, addr, &alen);
		if (err)
			goto out;

		if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
		     test_bit(SOCK_PASSPIDFD, &sock->flags)) &&
		    !READ_ONCE(unix_sk(sk)->addr)) {
			err = unix_autobind(sk);
			if (err)
				goto out;
		}

restart:
		other = unix_find_other(sock_net(sk), sunaddr, alen, sock->type);
		if (IS_ERR(other)) {
			err = PTR_ERR(other);
			goto out;
		}

		unix_state_double_lock(sk, other);

		/* Apparently VFS overslept socket death. Retry. */
		if (sock_flag(other, SOCK_DEAD)) {
			unix_state_double_unlock(sk, other);
			sock_put(other);
			goto restart;
		}

		err = -EPERM;
		if (!unix_may_send(sk, other))
			goto out_unlock;

		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
		if (err)
			goto out_unlock;

		WRITE_ONCE(sk->sk_state, TCP_ESTABLISHED);
		WRITE_ONCE(other->sk_state, TCP_ESTABLISHED);
	} else {
		/*
		 *	1003.1g breaking connected state with AF_UNSPEC
		 */
		other = NULL;
		unix_state_double_lock(sk, other);
	}

	/*
	 * If it was connected, reconnect.
	 */
	if (unix_peer(sk)) {
		struct sock *old_peer = unix_peer(sk);

		unix_peer(sk) = other;
		if (!other)
			WRITE_ONCE(sk->sk_state, TCP_CLOSE);
		unix_dgram_peer_wake_disconnect_wakeup(sk, old_peer);

		unix_state_double_unlock(sk, other);

		if (other != old_peer) {
			unix_dgram_disconnected(sk, old_peer);

			unix_state_lock(old_peer);
			if (!unix_peer(old_peer))
				WRITE_ONCE(old_peer->sk_state, TCP_CLOSE);
			unix_state_unlock(old_peer);
		}

		sock_put(old_peer);
	} else {
		unix_peer(sk) = other;
		unix_state_double_unlock(sk, other);
	}

	return 0;

out_unlock:
	unix_state_double_unlock(sk, other);
	sock_put(other);
out:
	return err;
}

static long unix_wait_for_peer(struct sock *other, long timeo)
	__releases(&unix_sk(other)->lock)
{
	struct unix_sock *u = unix_sk(other);
	int sched;
	DEFINE_WAIT(wait);

	prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);

	sched = !sock_flag(other, SOCK_DEAD) &&
		!(other->sk_shutdown & RCV_SHUTDOWN) &&
		unix_recvq_full_lockless(other);

	unix_state_unlock(other);

	if (sched)
		timeo = schedule_timeout(timeo);

	finish_wait(&u->peer_wait, &wait);
	return timeo;
}

static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
			       int addr_len, int flags)
{
	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
	struct sock *sk = sock->sk, *newsk = NULL, *other = NULL;
	struct unix_sock *u = unix_sk(sk), *newu, *otheru;
	struct net *net = sock_net(sk);
	struct sk_buff *skb = NULL;
	unsigned char state;
	long timeo;
	int err;

	err = unix_validate_addr(sunaddr, addr_len);
	if (err)
		goto out;

	err = BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk, uaddr, &addr_len);
	if (err)
		goto out;

	if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
	     test_bit(SOCK_PASSPIDFD, &sock->flags)) &&
	    !READ_ONCE(u->addr)) {
		err = unix_autobind(sk);
		if (err)
			goto out;
	}

	timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);

	/* First of all allocate resources.
	 * If we will make it after state is locked,
	 * we will have to recheck all again in any case.
	 */

	/* create new sock for complete connection */
	newsk = unix_create1(net, NULL, 0, sock->type);
	if (IS_ERR(newsk)) {
		err = PTR_ERR(newsk);
		goto out;
	}

	/* Allocate skb for sending to listening sock */
	skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
	if (!skb) {
		err = -ENOMEM;
		goto out_free_sk;
	}

restart:
	/* Find listening sock. */
	other = unix_find_other(net, sunaddr, addr_len, sk->sk_type);
	if (IS_ERR(other)) {
		err = PTR_ERR(other);
		goto out_free_skb;
	}

	unix_state_lock(other);

	/* Apparently VFS overslept socket death. Retry. */
	if (sock_flag(other, SOCK_DEAD)) {
		unix_state_unlock(other);
		sock_put(other);
		goto restart;
	}

	if (other->sk_state != TCP_LISTEN ||
	    other->sk_shutdown & RCV_SHUTDOWN) {
		err = -ECONNREFUSED;
		goto out_unlock;
	}

	if (unix_recvq_full_lockless(other)) {
		if (!timeo) {
			err = -EAGAIN;
			goto out_unlock;
		}

		timeo = unix_wait_for_peer(other, timeo);
		sock_put(other);

		err = sock_intr_errno(timeo);
		if (signal_pending(current))
			goto out_free_skb;

		goto restart;
	}

	/* self connect and simultaneous connect are eliminated
	 * by rejecting TCP_LISTEN socket to avoid deadlock.
	 */
	state = READ_ONCE(sk->sk_state);
	if (unlikely(state != TCP_CLOSE)) {
		err = state == TCP_ESTABLISHED ? -EISCONN : -EINVAL;
		goto out_unlock;
	}

	unix_state_lock(sk);

	if (unlikely(sk->sk_state != TCP_CLOSE)) {
		err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EINVAL;
		unix_state_unlock(sk);
		goto out_unlock;
	}

	err = security_unix_stream_connect(sk, other, newsk);
	if (err) {
		unix_state_unlock(sk);
		goto out_unlock;
	}

	/* The way is open! Fastly set all the necessary fields... */

	sock_hold(sk);
	unix_peer(newsk) = sk;
	newsk->sk_state = TCP_ESTABLISHED;
	newsk->sk_type = sk->sk_type;
	init_peercred(newsk);
	newu = unix_sk(newsk);
	newu->listener = other;
	RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
	otheru = unix_sk(other);

	/* copy address information from listening to new sock
	 *
	 * The contents of *(otheru->addr) and otheru->path
	 * are seen fully set up here, since we have found
	 * otheru in hash under its lock.  Insertion into the
	 * hash chain we'd found it in had been done in an
	 * earlier critical area protected by the chain's lock,
	 * the same one where we'd set *(otheru->addr) contents,
	 * as well as otheru->path and otheru->addr itself.
	 *
	 * Using smp_store_release() here to set newu->addr
	 * is enough to make those stores, as well as stores
	 * to newu->path visible to anyone who gets newu->addr
	 * by smp_load_acquire().  IOW, the same warranties
	 * as for unix_sock instances bound in unix_bind() or
	 * in unix_autobind().
	 */
	if (otheru->path.dentry) {
		path_get(&otheru->path);
		newu->path = otheru->path;
	}
	refcount_inc(&otheru->addr->refcnt);
	smp_store_release(&newu->addr, otheru->addr);

	/* Set credentials */
	copy_peercred(sk, other);

	sock->state = SS_CONNECTED;
	WRITE_ONCE(sk->sk_state, TCP_ESTABLISHED);
	sock_hold(newsk);

	smp_mb__after_atomic();	/* sock_hold() does an atomic_inc() */
	unix_peer(sk) = newsk;

	unix_state_unlock(sk);

	/* take ten and send info to listening sock */
	spin_lock(&other->sk_receive_queue.lock);
	__skb_queue_tail(&other->sk_receive_queue, skb);
	spin_unlock(&other->sk_receive_queue.lock);
	unix_state_unlock(other);
	other->sk_data_ready(other);
	sock_put(other);
	return 0;

out_unlock:
	unix_state_unlock(other);
	sock_put(other);
out_free_skb:
	consume_skb(skb);
out_free_sk:
	unix_release_sock(newsk, 0);
out:
	return err;
}

static int unix_socketpair(struct socket *socka, struct socket *sockb)
{
	struct sock *ska = socka->sk, *skb = sockb->sk;

	/* Join our sockets back to back */
	sock_hold(ska);
	sock_hold(skb);
	unix_peer(ska) = skb;
	unix_peer(skb) = ska;
	init_peercred(ska);
	init_peercred(skb);

	ska->sk_state = TCP_ESTABLISHED;
	skb->sk_state = TCP_ESTABLISHED;
	socka->state = SS_CONNECTED;
	sockb->state = SS_CONNECTED;
	return 0;
}

static void unix_sock_inherit_flags(const struct socket *old,
				    struct socket *new)
{
	if (test_bit(SOCK_PASSCRED, &old->flags))
		set_bit(SOCK_PASSCRED, &new->flags);
	if (test_bit(SOCK_PASSPIDFD, &old->flags))
		set_bit(SOCK_PASSPIDFD, &new->flags);
	if (test_bit(SOCK_PASSSEC, &old->flags))
		set_bit(SOCK_PASSSEC, &new->flags);
}

static int unix_accept(struct socket *sock, struct socket *newsock,
		       struct proto_accept_arg *arg)
{
	struct sock *sk = sock->sk;
	struct sk_buff *skb;
	struct sock *tsk;

	arg->err = -EOPNOTSUPP;
	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
		goto out;

	arg->err = -EINVAL;
	if (READ_ONCE(sk->sk_state) != TCP_LISTEN)
		goto out;

	/* If socket state is TCP_LISTEN it cannot change (for now...),
	 * so that no locks are necessary.
	 */

	skb = skb_recv_datagram(sk, (arg->flags & O_NONBLOCK) ? MSG_DONTWAIT : 0,
				&arg->err);
	if (!skb) {
		/* This means receive shutdown. */
		if (arg->err == 0)
			arg->err = -EINVAL;
		goto out;
	}

	tsk = skb->sk;
	skb_free_datagram(sk, skb);
	wake_up_interruptible(&unix_sk(sk)->peer_wait);

	/* attach accepted sock to socket */
	unix_state_lock(tsk);
	unix_update_edges(unix_sk(tsk));
	newsock->state = SS_CONNECTED;
	unix_sock_inherit_flags(sock, newsock);
	sock_graft(tsk, newsock);
	unix_state_unlock(tsk);
	return 0;

out:
	return arg->err;
}


static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int peer)
{
	struct sock *sk = sock->sk;
	struct unix_address *addr;
	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr);
	int err = 0;

	if (peer) {
		sk = unix_peer_get(sk);

		err = -ENOTCONN;
		if (!sk)
			goto out;
		err = 0;
	} else {
		sock_hold(sk);
	}

	addr = smp_load_acquire(&unix_sk(sk)->addr);
	if (!addr) {
		sunaddr->sun_family = AF_UNIX;
		sunaddr->sun_path[0] = 0;
		err = offsetof(struct sockaddr_un, sun_path);
	} else {
		err = addr->len;
		memcpy(sunaddr, addr->name, addr->len);

		if (peer)
			BPF_CGROUP_RUN_SA_PROG(sk, uaddr, &err,
					       CGROUP_UNIX_GETPEERNAME);
		else
			BPF_CGROUP_RUN_SA_PROG(sk, uaddr, &err,
					       CGROUP_UNIX_GETSOCKNAME);
	}
	sock_put(sk);
out:
	return err;
}

/* The "user->unix_inflight" variable is protected by the garbage
 * collection lock, and we just read it locklessly here. If you go
 * over the limit, there might be a tiny race in actually noticing
 * it across threads. Tough.
 */
static inline bool too_many_unix_fds(struct task_struct *p)
{
	struct user_struct *user = current_user();

	if (unlikely(READ_ONCE(user->unix_inflight) > task_rlimit(p, RLIMIT_NOFILE)))
		return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN);
	return false;
}

static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
{
	if (too_many_unix_fds(current))
		return -ETOOMANYREFS;

	UNIXCB(skb).fp = scm->fp;
	scm->fp = NULL;

	if (unix_prepare_fpl(UNIXCB(skb).fp))
		return -ENOMEM;

	return 0;
}

static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
{
	scm->fp = UNIXCB(skb).fp;
	UNIXCB(skb).fp = NULL;

	unix_destroy_fpl(scm->fp);
}

static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb)
{
	scm->fp = scm_fp_dup(UNIXCB(skb).fp);
}

static void unix_destruct_scm(struct sk_buff *skb)
{
	struct scm_cookie scm;

	memset(&scm, 0, sizeof(scm));
	scm.pid = UNIXCB(skb).pid;
	if (UNIXCB(skb).fp)
		unix_detach_fds(&scm, skb);

	/* Alas, it calls VFS */
	/* So fscking what? fput() had been SMP-safe since the last Summer */
	scm_destroy(&scm);
	sock_wfree(skb);
}
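
/* Editor's note (added summary, not in the original source): UNIXCB() is the
 * per-skb control block used on the send path.  unix_scm_to_skb() below fills
 * it with the sender's pid, uid/gid, optional attached file list (fp) and the
 * LSM security id, and unix_destruct_scm() above tears that state down again
 * when the skb is freed.
 */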
fput() had been SMP-safe since the last Summer */ 1878 scm_destroy(&scm); 1879 sock_wfree(skb); 1880 } 1881 1882 static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds) 1883 { 1884 int err = 0; 1885 1886 UNIXCB(skb).pid = get_pid(scm->pid); 1887 UNIXCB(skb).uid = scm->creds.uid; 1888 UNIXCB(skb).gid = scm->creds.gid; 1889 UNIXCB(skb).fp = NULL; 1890 unix_get_secdata(scm, skb); 1891 if (scm->fp && send_fds) 1892 err = unix_attach_fds(scm, skb); 1893 1894 skb->destructor = unix_destruct_scm; 1895 return err; 1896 } 1897 1898 static bool unix_passcred_enabled(const struct socket *sock, 1899 const struct sock *other) 1900 { 1901 return test_bit(SOCK_PASSCRED, &sock->flags) || 1902 test_bit(SOCK_PASSPIDFD, &sock->flags) || 1903 !other->sk_socket || 1904 test_bit(SOCK_PASSCRED, &other->sk_socket->flags) || 1905 test_bit(SOCK_PASSPIDFD, &other->sk_socket->flags); 1906 } 1907 1908 /* 1909 * Some apps rely on write() giving SCM_CREDENTIALS 1910 * We include credentials if source or destination socket 1911 * asserted SOCK_PASSCRED. 1912 */ 1913 static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock, 1914 const struct sock *other) 1915 { 1916 if (UNIXCB(skb).pid) 1917 return; 1918 if (unix_passcred_enabled(sock, other)) { 1919 UNIXCB(skb).pid = get_pid(task_tgid(current)); 1920 current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid); 1921 } 1922 } 1923 1924 static bool unix_skb_scm_eq(struct sk_buff *skb, 1925 struct scm_cookie *scm) 1926 { 1927 return UNIXCB(skb).pid == scm->pid && 1928 uid_eq(UNIXCB(skb).uid, scm->creds.uid) && 1929 gid_eq(UNIXCB(skb).gid, scm->creds.gid) && 1930 unix_secdata_eq(scm, skb); 1931 } 1932 1933 static void scm_stat_add(struct sock *sk, struct sk_buff *skb) 1934 { 1935 struct scm_fp_list *fp = UNIXCB(skb).fp; 1936 struct unix_sock *u = unix_sk(sk); 1937 1938 if (unlikely(fp && fp->count)) { 1939 atomic_add(fp->count, &u->scm_stat.nr_fds); 1940 unix_add_edges(fp, u); 1941 } 1942 } 1943 1944 static void scm_stat_del(struct sock *sk, struct sk_buff *skb) 1945 { 1946 struct scm_fp_list *fp = UNIXCB(skb).fp; 1947 struct unix_sock *u = unix_sk(sk); 1948 1949 if (unlikely(fp && fp->count)) { 1950 atomic_sub(fp->count, &u->scm_stat.nr_fds); 1951 unix_del_edges(fp); 1952 } 1953 } 1954 1955 /* 1956 * Send AF_UNIX data. 
1957 */ 1958 1959 static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg, 1960 size_t len) 1961 { 1962 struct sock *sk = sock->sk, *other = NULL; 1963 struct unix_sock *u = unix_sk(sk); 1964 struct scm_cookie scm; 1965 struct sk_buff *skb; 1966 int data_len = 0; 1967 int sk_locked; 1968 long timeo; 1969 int err; 1970 1971 err = scm_send(sock, msg, &scm, false); 1972 if (err < 0) 1973 return err; 1974 1975 wait_for_unix_gc(scm.fp); 1976 1977 if (msg->msg_flags & MSG_OOB) { 1978 err = -EOPNOTSUPP; 1979 goto out; 1980 } 1981 1982 if (msg->msg_namelen) { 1983 err = unix_validate_addr(msg->msg_name, msg->msg_namelen); 1984 if (err) 1985 goto out; 1986 1987 err = BPF_CGROUP_RUN_PROG_UNIX_SENDMSG_LOCK(sk, 1988 msg->msg_name, 1989 &msg->msg_namelen, 1990 NULL); 1991 if (err) 1992 goto out; 1993 } 1994 1995 if ((test_bit(SOCK_PASSCRED, &sock->flags) || 1996 test_bit(SOCK_PASSPIDFD, &sock->flags)) && 1997 !READ_ONCE(u->addr)) { 1998 err = unix_autobind(sk); 1999 if (err) 2000 goto out; 2001 } 2002 2003 if (len > READ_ONCE(sk->sk_sndbuf) - 32) { 2004 err = -EMSGSIZE; 2005 goto out; 2006 } 2007 2008 if (len > SKB_MAX_ALLOC) { 2009 data_len = min_t(size_t, 2010 len - SKB_MAX_ALLOC, 2011 MAX_SKB_FRAGS * PAGE_SIZE); 2012 data_len = PAGE_ALIGN(data_len); 2013 2014 BUILD_BUG_ON(SKB_MAX_ALLOC < PAGE_SIZE); 2015 } 2016 2017 skb = sock_alloc_send_pskb(sk, len - data_len, data_len, 2018 msg->msg_flags & MSG_DONTWAIT, &err, 2019 PAGE_ALLOC_COSTLY_ORDER); 2020 if (!skb) 2021 goto out; 2022 2023 err = unix_scm_to_skb(&scm, skb, true); 2024 if (err < 0) 2025 goto out_free; 2026 2027 skb_put(skb, len - data_len); 2028 skb->data_len = data_len; 2029 skb->len = len; 2030 err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, len); 2031 if (err) 2032 goto out_free; 2033 2034 timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); 2035 2036 if (msg->msg_namelen) { 2037 lookup: 2038 other = unix_find_other(sock_net(sk), msg->msg_name, 2039 msg->msg_namelen, sk->sk_type); 2040 if (IS_ERR(other)) { 2041 err = PTR_ERR(other); 2042 goto out_free; 2043 } 2044 } else { 2045 other = unix_peer_get(sk); 2046 if (!other) { 2047 err = -ENOTCONN; 2048 goto out_free; 2049 } 2050 } 2051 2052 if (sk_filter(other, skb) < 0) { 2053 /* Toss the packet but do not return any error to the sender */ 2054 err = len; 2055 goto out_sock_put; 2056 } 2057 2058 restart: 2059 sk_locked = 0; 2060 unix_state_lock(other); 2061 restart_locked: 2062 2063 if (!unix_may_send(sk, other)) { 2064 err = -EPERM; 2065 goto out_unlock; 2066 } 2067 2068 if (unlikely(sock_flag(other, SOCK_DEAD))) { 2069 /* Check with 1003.1g - what should datagram error */ 2070 2071 unix_state_unlock(other); 2072 2073 if (sk->sk_type == SOCK_SEQPACKET) { 2074 /* We are here only when racing with unix_release_sock() 2075 * is clearing @other. Never change state to TCP_CLOSE 2076 * unlike SOCK_DGRAM wants. 
2077 */ 2078 err = -EPIPE; 2079 goto out_sock_put; 2080 } 2081 2082 if (!sk_locked) 2083 unix_state_lock(sk); 2084 2085 if (unix_peer(sk) == other) { 2086 unix_peer(sk) = NULL; 2087 unix_dgram_peer_wake_disconnect_wakeup(sk, other); 2088 2089 WRITE_ONCE(sk->sk_state, TCP_CLOSE); 2090 unix_state_unlock(sk); 2091 2092 unix_dgram_disconnected(sk, other); 2093 sock_put(other); 2094 err = -ECONNREFUSED; 2095 goto out_sock_put; 2096 } 2097 2098 unix_state_unlock(sk); 2099 2100 if (!msg->msg_namelen) { 2101 err = -ECONNRESET; 2102 goto out_sock_put; 2103 } 2104 2105 sock_put(other); 2106 goto lookup; 2107 } 2108 2109 if (other->sk_shutdown & RCV_SHUTDOWN) { 2110 err = -EPIPE; 2111 goto out_unlock; 2112 } 2113 2114 if (sk->sk_type != SOCK_SEQPACKET) { 2115 err = security_unix_may_send(sk->sk_socket, other->sk_socket); 2116 if (err) 2117 goto out_unlock; 2118 } 2119 2120 /* other == sk && unix_peer(other) != sk if 2121 * - unix_peer(sk) == NULL, destination address bound to sk 2122 * - unix_peer(sk) == sk by time of get but disconnected before lock 2123 */ 2124 if (other != sk && 2125 unlikely(unix_peer(other) != sk && 2126 unix_recvq_full_lockless(other))) { 2127 if (timeo) { 2128 timeo = unix_wait_for_peer(other, timeo); 2129 2130 err = sock_intr_errno(timeo); 2131 if (signal_pending(current)) 2132 goto out_sock_put; 2133 2134 goto restart; 2135 } 2136 2137 if (!sk_locked) { 2138 unix_state_unlock(other); 2139 unix_state_double_lock(sk, other); 2140 } 2141 2142 if (unix_peer(sk) != other || 2143 unix_dgram_peer_wake_me(sk, other)) { 2144 err = -EAGAIN; 2145 sk_locked = 1; 2146 goto out_unlock; 2147 } 2148 2149 if (!sk_locked) { 2150 sk_locked = 1; 2151 goto restart_locked; 2152 } 2153 } 2154 2155 if (unlikely(sk_locked)) 2156 unix_state_unlock(sk); 2157 2158 if (sock_flag(other, SOCK_RCVTSTAMP)) 2159 __net_timestamp(skb); 2160 maybe_add_creds(skb, sock, other); 2161 scm_stat_add(other, skb); 2162 skb_queue_tail(&other->sk_receive_queue, skb); 2163 unix_state_unlock(other); 2164 other->sk_data_ready(other); 2165 sock_put(other); 2166 scm_destroy(&scm); 2167 return len; 2168 2169 out_unlock: 2170 if (sk_locked) 2171 unix_state_unlock(sk); 2172 unix_state_unlock(other); 2173 out_sock_put: 2174 sock_put(other); 2175 out_free: 2176 consume_skb(skb); 2177 out: 2178 scm_destroy(&scm); 2179 return err; 2180 } 2181 2182 /* We use paged skbs for stream sockets, and limit occupancy to 32768 2183 * bytes, and a minimum of a full page. 
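 * UNIX_SKB_FRAGS_SZ below expresses that limit (32768 bytes, never less than
 * one page); unix_stream_sendmsg() uses it to size the paged part of each skb.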
2184 */ 2185 #define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768)) 2186 2187 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 2188 static int queue_oob(struct socket *sock, struct msghdr *msg, struct sock *other, 2189 struct scm_cookie *scm, bool fds_sent) 2190 { 2191 struct unix_sock *ousk = unix_sk(other); 2192 struct sk_buff *skb; 2193 int err; 2194 2195 skb = sock_alloc_send_skb(sock->sk, 1, msg->msg_flags & MSG_DONTWAIT, &err); 2196 2197 if (!skb) 2198 return err; 2199 2200 err = unix_scm_to_skb(scm, skb, !fds_sent); 2201 if (err < 0) 2202 goto out; 2203 2204 skb_put(skb, 1); 2205 err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, 1); 2206 2207 if (err) 2208 goto out; 2209 2210 unix_state_lock(other); 2211 2212 if (sock_flag(other, SOCK_DEAD) || 2213 (other->sk_shutdown & RCV_SHUTDOWN)) { 2214 unix_state_unlock(other); 2215 err = -EPIPE; 2216 goto out; 2217 } 2218 2219 maybe_add_creds(skb, sock, other); 2220 scm_stat_add(other, skb); 2221 2222 spin_lock(&other->sk_receive_queue.lock); 2223 WRITE_ONCE(ousk->oob_skb, skb); 2224 __skb_queue_tail(&other->sk_receive_queue, skb); 2225 spin_unlock(&other->sk_receive_queue.lock); 2226 2227 sk_send_sigurg(other); 2228 unix_state_unlock(other); 2229 other->sk_data_ready(other); 2230 2231 return 0; 2232 out: 2233 consume_skb(skb); 2234 return err; 2235 } 2236 #endif 2237 2238 static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, 2239 size_t len) 2240 { 2241 struct sock *sk = sock->sk; 2242 struct sk_buff *skb = NULL; 2243 struct sock *other = NULL; 2244 struct scm_cookie scm; 2245 bool fds_sent = false; 2246 int err, sent = 0; 2247 2248 err = scm_send(sock, msg, &scm, false); 2249 if (err < 0) 2250 return err; 2251 2252 wait_for_unix_gc(scm.fp); 2253 2254 if (msg->msg_flags & MSG_OOB) { 2255 err = -EOPNOTSUPP; 2256 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 2257 if (len) 2258 len--; 2259 else 2260 #endif 2261 goto out_err; 2262 } 2263 2264 if (msg->msg_namelen) { 2265 err = READ_ONCE(sk->sk_state) == TCP_ESTABLISHED ? 
-EISCONN : -EOPNOTSUPP; 2266 goto out_err; 2267 } else { 2268 other = unix_peer(sk); 2269 if (!other) { 2270 err = -ENOTCONN; 2271 goto out_err; 2272 } 2273 } 2274 2275 if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN) 2276 goto out_pipe; 2277 2278 while (sent < len) { 2279 int size = len - sent; 2280 int data_len; 2281 2282 if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) { 2283 skb = sock_alloc_send_pskb(sk, 0, 0, 2284 msg->msg_flags & MSG_DONTWAIT, 2285 &err, 0); 2286 } else { 2287 /* Keep two messages in the pipe so it schedules better */ 2288 size = min_t(int, size, (READ_ONCE(sk->sk_sndbuf) >> 1) - 64); 2289 2290 /* allow fallback to order-0 allocations */ 2291 size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ); 2292 2293 data_len = max_t(int, 0, size - SKB_MAX_HEAD(0)); 2294 2295 data_len = min_t(size_t, size, PAGE_ALIGN(data_len)); 2296 2297 skb = sock_alloc_send_pskb(sk, size - data_len, data_len, 2298 msg->msg_flags & MSG_DONTWAIT, &err, 2299 get_order(UNIX_SKB_FRAGS_SZ)); 2300 } 2301 if (!skb) 2302 goto out_err; 2303 2304 /* Only send the fds in the first buffer */ 2305 err = unix_scm_to_skb(&scm, skb, !fds_sent); 2306 if (err < 0) 2307 goto out_free; 2308 2309 fds_sent = true; 2310 2311 if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) { 2312 skb->ip_summed = CHECKSUM_UNNECESSARY; 2313 err = skb_splice_from_iter(skb, &msg->msg_iter, size, 2314 sk->sk_allocation); 2315 if (err < 0) 2316 goto out_free; 2317 2318 size = err; 2319 refcount_add(size, &sk->sk_wmem_alloc); 2320 } else { 2321 skb_put(skb, size - data_len); 2322 skb->data_len = data_len; 2323 skb->len = size; 2324 err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size); 2325 if (err) 2326 goto out_free; 2327 } 2328 2329 unix_state_lock(other); 2330 2331 if (sock_flag(other, SOCK_DEAD) || 2332 (other->sk_shutdown & RCV_SHUTDOWN)) 2333 goto out_pipe_unlock; 2334 2335 maybe_add_creds(skb, sock, other); 2336 scm_stat_add(other, skb); 2337 skb_queue_tail(&other->sk_receive_queue, skb); 2338 unix_state_unlock(other); 2339 other->sk_data_ready(other); 2340 sent += size; 2341 } 2342 2343 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 2344 if (msg->msg_flags & MSG_OOB) { 2345 err = queue_oob(sock, msg, other, &scm, fds_sent); 2346 if (err) 2347 goto out_err; 2348 sent++; 2349 } 2350 #endif 2351 2352 scm_destroy(&scm); 2353 2354 return sent; 2355 2356 out_pipe_unlock: 2357 unix_state_unlock(other); 2358 out_pipe: 2359 if (!sent && !(msg->msg_flags & MSG_NOSIGNAL)) 2360 send_sig(SIGPIPE, current, 0); 2361 err = -EPIPE; 2362 out_free: 2363 consume_skb(skb); 2364 out_err: 2365 scm_destroy(&scm); 2366 return sent ? 
: err; 2367 } 2368 2369 static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg, 2370 size_t len) 2371 { 2372 int err; 2373 struct sock *sk = sock->sk; 2374 2375 err = sock_error(sk); 2376 if (err) 2377 return err; 2378 2379 if (READ_ONCE(sk->sk_state) != TCP_ESTABLISHED) 2380 return -ENOTCONN; 2381 2382 if (msg->msg_namelen) 2383 msg->msg_namelen = 0; 2384 2385 return unix_dgram_sendmsg(sock, msg, len); 2386 } 2387 2388 static int unix_seqpacket_recvmsg(struct socket *sock, struct msghdr *msg, 2389 size_t size, int flags) 2390 { 2391 struct sock *sk = sock->sk; 2392 2393 if (READ_ONCE(sk->sk_state) != TCP_ESTABLISHED) 2394 return -ENOTCONN; 2395 2396 return unix_dgram_recvmsg(sock, msg, size, flags); 2397 } 2398 2399 static void unix_copy_addr(struct msghdr *msg, struct sock *sk) 2400 { 2401 struct unix_address *addr = smp_load_acquire(&unix_sk(sk)->addr); 2402 2403 if (addr) { 2404 msg->msg_namelen = addr->len; 2405 memcpy(msg->msg_name, addr->name, addr->len); 2406 } 2407 } 2408 2409 int __unix_dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t size, 2410 int flags) 2411 { 2412 struct scm_cookie scm; 2413 struct socket *sock = sk->sk_socket; 2414 struct unix_sock *u = unix_sk(sk); 2415 struct sk_buff *skb, *last; 2416 long timeo; 2417 int skip; 2418 int err; 2419 2420 err = -EOPNOTSUPP; 2421 if (flags&MSG_OOB) 2422 goto out; 2423 2424 timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); 2425 2426 do { 2427 mutex_lock(&u->iolock); 2428 2429 skip = sk_peek_offset(sk, flags); 2430 skb = __skb_try_recv_datagram(sk, &sk->sk_receive_queue, flags, 2431 &skip, &err, &last); 2432 if (skb) { 2433 if (!(flags & MSG_PEEK)) 2434 scm_stat_del(sk, skb); 2435 break; 2436 } 2437 2438 mutex_unlock(&u->iolock); 2439 2440 if (err != -EAGAIN) 2441 break; 2442 } while (timeo && 2443 !__skb_wait_for_more_packets(sk, &sk->sk_receive_queue, 2444 &err, &timeo, last)); 2445 2446 if (!skb) { /* implies iolock unlocked */ 2447 unix_state_lock(sk); 2448 /* Signal EOF on disconnected non-blocking SEQPACKET socket. */ 2449 if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN && 2450 (sk->sk_shutdown & RCV_SHUTDOWN)) 2451 err = 0; 2452 unix_state_unlock(sk); 2453 goto out; 2454 } 2455 2456 if (wq_has_sleeper(&u->peer_wait)) 2457 wake_up_interruptible_sync_poll(&u->peer_wait, 2458 EPOLLOUT | EPOLLWRNORM | 2459 EPOLLWRBAND); 2460 2461 if (msg->msg_name) { 2462 unix_copy_addr(msg, skb->sk); 2463 2464 BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk, 2465 msg->msg_name, 2466 &msg->msg_namelen); 2467 } 2468 2469 if (size > skb->len - skip) 2470 size = skb->len - skip; 2471 else if (size < skb->len - skip) 2472 msg->msg_flags |= MSG_TRUNC; 2473 2474 err = skb_copy_datagram_msg(skb, skip, msg, size); 2475 if (err) 2476 goto out_free; 2477 2478 if (sock_flag(sk, SOCK_RCVTSTAMP)) 2479 __sock_recv_timestamp(msg, sk, skb); 2480 2481 memset(&scm, 0, sizeof(scm)); 2482 2483 scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid); 2484 unix_set_secdata(&scm, skb); 2485 2486 if (!(flags & MSG_PEEK)) { 2487 if (UNIXCB(skb).fp) 2488 unix_detach_fds(&scm, skb); 2489 2490 sk_peek_offset_bwd(sk, skb->len); 2491 } else { 2492 /* It is questionable: on PEEK we could: 2493 - do not return fds - good, but too simple 8) 2494 - return fds, and do not return them on read (old strategy, 2495 apparently wrong) 2496 - clone fds (I chose it for now, it is the most universal 2497 solution) 2498 2499 POSIX 1003.1g does not actually define this clearly 2500 at all. 
POSIX 1003.1g doesn't define a lot of things 2501 clearly however! 2502 2503 */ 2504 2505 sk_peek_offset_fwd(sk, size); 2506 2507 if (UNIXCB(skb).fp) 2508 unix_peek_fds(&scm, skb); 2509 } 2510 err = (flags & MSG_TRUNC) ? skb->len - skip : size; 2511 2512 scm_recv_unix(sock, msg, &scm, flags); 2513 2514 out_free: 2515 skb_free_datagram(sk, skb); 2516 mutex_unlock(&u->iolock); 2517 out: 2518 return err; 2519 } 2520 2521 static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, 2522 int flags) 2523 { 2524 struct sock *sk = sock->sk; 2525 2526 #ifdef CONFIG_BPF_SYSCALL 2527 const struct proto *prot = READ_ONCE(sk->sk_prot); 2528 2529 if (prot != &unix_dgram_proto) 2530 return prot->recvmsg(sk, msg, size, flags, NULL); 2531 #endif 2532 return __unix_dgram_recvmsg(sk, msg, size, flags); 2533 } 2534 2535 static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor) 2536 { 2537 struct unix_sock *u = unix_sk(sk); 2538 struct sk_buff *skb; 2539 int err; 2540 2541 mutex_lock(&u->iolock); 2542 skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err); 2543 mutex_unlock(&u->iolock); 2544 if (!skb) 2545 return err; 2546 2547 return recv_actor(sk, skb); 2548 } 2549 2550 /* 2551 * Sleep until more data has arrived. But check for races.. 2552 */ 2553 static long unix_stream_data_wait(struct sock *sk, long timeo, 2554 struct sk_buff *last, unsigned int last_len, 2555 bool freezable) 2556 { 2557 unsigned int state = TASK_INTERRUPTIBLE | freezable * TASK_FREEZABLE; 2558 struct sk_buff *tail; 2559 DEFINE_WAIT(wait); 2560 2561 unix_state_lock(sk); 2562 2563 for (;;) { 2564 prepare_to_wait(sk_sleep(sk), &wait, state); 2565 2566 tail = skb_peek_tail(&sk->sk_receive_queue); 2567 if (tail != last || 2568 (tail && tail->len != last_len) || 2569 sk->sk_err || 2570 (sk->sk_shutdown & RCV_SHUTDOWN) || 2571 signal_pending(current) || 2572 !timeo) 2573 break; 2574 2575 sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); 2576 unix_state_unlock(sk); 2577 timeo = schedule_timeout(timeo); 2578 unix_state_lock(sk); 2579 2580 if (sock_flag(sk, SOCK_DEAD)) 2581 break; 2582 2583 sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); 2584 } 2585 2586 finish_wait(sk_sleep(sk), &wait); 2587 unix_state_unlock(sk); 2588 return timeo; 2589 } 2590 2591 static unsigned int unix_skb_len(const struct sk_buff *skb) 2592 { 2593 return skb->len - UNIXCB(skb).consumed; 2594 } 2595 2596 struct unix_stream_read_state { 2597 int (*recv_actor)(struct sk_buff *, int, int, 2598 struct unix_stream_read_state *); 2599 struct socket *socket; 2600 struct msghdr *msg; 2601 struct pipe_inode_info *pipe; 2602 size_t size; 2603 int flags; 2604 unsigned int splice_flags; 2605 }; 2606 2607 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 2608 static int unix_stream_recv_urg(struct unix_stream_read_state *state) 2609 { 2610 struct socket *sock = state->socket; 2611 struct sock *sk = sock->sk; 2612 struct unix_sock *u = unix_sk(sk); 2613 int chunk = 1; 2614 struct sk_buff *oob_skb; 2615 2616 mutex_lock(&u->iolock); 2617 unix_state_lock(sk); 2618 spin_lock(&sk->sk_receive_queue.lock); 2619 2620 if (sock_flag(sk, SOCK_URGINLINE) || !u->oob_skb) { 2621 spin_unlock(&sk->sk_receive_queue.lock); 2622 unix_state_unlock(sk); 2623 mutex_unlock(&u->iolock); 2624 return -EINVAL; 2625 } 2626 2627 oob_skb = u->oob_skb; 2628 2629 if (!(state->flags & MSG_PEEK)) 2630 WRITE_ONCE(u->oob_skb, NULL); 2631 2632 spin_unlock(&sk->sk_receive_queue.lock); 2633 unix_state_unlock(sk); 2634 2635 chunk = state->recv_actor(oob_skb, 0, chunk, state); 2636 2637 if (!(state->flags & MSG_PEEK)) 2638 
UNIXCB(oob_skb).consumed += 1; 2639 2640 mutex_unlock(&u->iolock); 2641 2642 if (chunk < 0) 2643 return -EFAULT; 2644 2645 state->msg->msg_flags |= MSG_OOB; 2646 return 1; 2647 } 2648 2649 static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk, 2650 int flags, int copied) 2651 { 2652 struct sk_buff *read_skb = NULL, *unread_skb = NULL; 2653 struct unix_sock *u = unix_sk(sk); 2654 2655 if (likely(unix_skb_len(skb) && skb != READ_ONCE(u->oob_skb))) 2656 return skb; 2657 2658 spin_lock(&sk->sk_receive_queue.lock); 2659 2660 if (!unix_skb_len(skb)) { 2661 if (copied && (!u->oob_skb || skb == u->oob_skb)) { 2662 skb = NULL; 2663 } else if (flags & MSG_PEEK) { 2664 skb = skb_peek_next(skb, &sk->sk_receive_queue); 2665 } else { 2666 read_skb = skb; 2667 skb = skb_peek_next(skb, &sk->sk_receive_queue); 2668 __skb_unlink(read_skb, &sk->sk_receive_queue); 2669 } 2670 2671 if (!skb) 2672 goto unlock; 2673 } 2674 2675 if (skb != u->oob_skb) 2676 goto unlock; 2677 2678 if (copied) { 2679 skb = NULL; 2680 } else if (!(flags & MSG_PEEK)) { 2681 WRITE_ONCE(u->oob_skb, NULL); 2682 2683 if (!sock_flag(sk, SOCK_URGINLINE)) { 2684 __skb_unlink(skb, &sk->sk_receive_queue); 2685 unread_skb = skb; 2686 skb = skb_peek(&sk->sk_receive_queue); 2687 } 2688 } else if (!sock_flag(sk, SOCK_URGINLINE)) { 2689 skb = skb_peek_next(skb, &sk->sk_receive_queue); 2690 } 2691 2692 unlock: 2693 spin_unlock(&sk->sk_receive_queue.lock); 2694 2695 consume_skb(read_skb); 2696 kfree_skb_reason(unread_skb, SKB_DROP_REASON_UNIX_SKIP_OOB); 2697 2698 return skb; 2699 } 2700 #endif 2701 2702 static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor) 2703 { 2704 struct unix_sock *u = unix_sk(sk); 2705 struct sk_buff *skb; 2706 int err; 2707 2708 if (unlikely(READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)) 2709 return -ENOTCONN; 2710 2711 mutex_lock(&u->iolock); 2712 skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err); 2713 mutex_unlock(&u->iolock); 2714 if (!skb) 2715 return err; 2716 2717 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 2718 if (unlikely(skb == READ_ONCE(u->oob_skb))) { 2719 bool drop = false; 2720 2721 unix_state_lock(sk); 2722 2723 if (sock_flag(sk, SOCK_DEAD)) { 2724 unix_state_unlock(sk); 2725 kfree_skb_reason(skb, SKB_DROP_REASON_SOCKET_CLOSE); 2726 return -ECONNRESET; 2727 } 2728 2729 spin_lock(&sk->sk_receive_queue.lock); 2730 if (likely(skb == u->oob_skb)) { 2731 WRITE_ONCE(u->oob_skb, NULL); 2732 drop = true; 2733 } 2734 spin_unlock(&sk->sk_receive_queue.lock); 2735 2736 unix_state_unlock(sk); 2737 2738 if (drop) { 2739 kfree_skb_reason(skb, SKB_DROP_REASON_UNIX_SKIP_OOB); 2740 return -EAGAIN; 2741 } 2742 } 2743 #endif 2744 2745 return recv_actor(sk, skb); 2746 } 2747 2748 static int unix_stream_read_generic(struct unix_stream_read_state *state, 2749 bool freezable) 2750 { 2751 struct scm_cookie scm; 2752 struct socket *sock = state->socket; 2753 struct sock *sk = sock->sk; 2754 struct unix_sock *u = unix_sk(sk); 2755 int copied = 0; 2756 int flags = state->flags; 2757 int noblock = flags & MSG_DONTWAIT; 2758 bool check_creds = false; 2759 int target; 2760 int err = 0; 2761 long timeo; 2762 int skip; 2763 size_t size = state->size; 2764 unsigned int last_len; 2765 2766 if (unlikely(READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)) { 2767 err = -EINVAL; 2768 goto out; 2769 } 2770 2771 if (unlikely(flags & MSG_OOB)) { 2772 err = -EOPNOTSUPP; 2773 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 2774 err = unix_stream_recv_urg(state); 2775 #endif 2776 goto out; 2777 } 2778 2779 target = sock_rcvlowat(sk, flags & 
MSG_WAITALL, size); 2780 timeo = sock_rcvtimeo(sk, noblock); 2781 2782 memset(&scm, 0, sizeof(scm)); 2783 2784 /* Lock the socket to prevent queue disordering 2785 * while sleeps in memcpy_tomsg 2786 */ 2787 mutex_lock(&u->iolock); 2788 2789 skip = max(sk_peek_offset(sk, flags), 0); 2790 2791 do { 2792 struct sk_buff *skb, *last; 2793 int chunk; 2794 2795 redo: 2796 unix_state_lock(sk); 2797 if (sock_flag(sk, SOCK_DEAD)) { 2798 err = -ECONNRESET; 2799 goto unlock; 2800 } 2801 last = skb = skb_peek(&sk->sk_receive_queue); 2802 last_len = last ? last->len : 0; 2803 2804 again: 2805 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 2806 if (skb) { 2807 skb = manage_oob(skb, sk, flags, copied); 2808 if (!skb && copied) { 2809 unix_state_unlock(sk); 2810 break; 2811 } 2812 } 2813 #endif 2814 if (skb == NULL) { 2815 if (copied >= target) 2816 goto unlock; 2817 2818 /* 2819 * POSIX 1003.1g mandates this order. 2820 */ 2821 2822 err = sock_error(sk); 2823 if (err) 2824 goto unlock; 2825 if (sk->sk_shutdown & RCV_SHUTDOWN) 2826 goto unlock; 2827 2828 unix_state_unlock(sk); 2829 if (!timeo) { 2830 err = -EAGAIN; 2831 break; 2832 } 2833 2834 mutex_unlock(&u->iolock); 2835 2836 timeo = unix_stream_data_wait(sk, timeo, last, 2837 last_len, freezable); 2838 2839 if (signal_pending(current)) { 2840 err = sock_intr_errno(timeo); 2841 scm_destroy(&scm); 2842 goto out; 2843 } 2844 2845 mutex_lock(&u->iolock); 2846 goto redo; 2847 unlock: 2848 unix_state_unlock(sk); 2849 break; 2850 } 2851 2852 while (skip >= unix_skb_len(skb)) { 2853 skip -= unix_skb_len(skb); 2854 last = skb; 2855 last_len = skb->len; 2856 skb = skb_peek_next(skb, &sk->sk_receive_queue); 2857 if (!skb) 2858 goto again; 2859 } 2860 2861 unix_state_unlock(sk); 2862 2863 if (check_creds) { 2864 /* Never glue messages from different writers */ 2865 if (!unix_skb_scm_eq(skb, &scm)) 2866 break; 2867 } else if (test_bit(SOCK_PASSCRED, &sock->flags) || 2868 test_bit(SOCK_PASSPIDFD, &sock->flags)) { 2869 /* Copy credentials */ 2870 scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid); 2871 unix_set_secdata(&scm, skb); 2872 check_creds = true; 2873 } 2874 2875 /* Copy address just once */ 2876 if (state->msg && state->msg->msg_name) { 2877 DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, 2878 state->msg->msg_name); 2879 unix_copy_addr(state->msg, skb->sk); 2880 2881 BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk, 2882 state->msg->msg_name, 2883 &state->msg->msg_namelen); 2884 2885 sunaddr = NULL; 2886 } 2887 2888 chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size); 2889 chunk = state->recv_actor(skb, skip, chunk, state); 2890 if (chunk < 0) { 2891 if (copied == 0) 2892 copied = -EFAULT; 2893 break; 2894 } 2895 copied += chunk; 2896 size -= chunk; 2897 2898 /* Mark read part of skb as used */ 2899 if (!(flags & MSG_PEEK)) { 2900 UNIXCB(skb).consumed += chunk; 2901 2902 sk_peek_offset_bwd(sk, chunk); 2903 2904 if (UNIXCB(skb).fp) { 2905 scm_stat_del(sk, skb); 2906 unix_detach_fds(&scm, skb); 2907 } 2908 2909 if (unix_skb_len(skb)) 2910 break; 2911 2912 skb_unlink(skb, &sk->sk_receive_queue); 2913 consume_skb(skb); 2914 2915 if (scm.fp) 2916 break; 2917 } else { 2918 /* It is questionable, see note in unix_dgram_recvmsg. 
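 * (On MSG_PEEK any passed fds are cloned via unix_peek_fds() below instead of
 * being detached from the skb.)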
2919 */ 2920 if (UNIXCB(skb).fp) 2921 unix_peek_fds(&scm, skb); 2922 2923 sk_peek_offset_fwd(sk, chunk); 2924 2925 if (UNIXCB(skb).fp) 2926 break; 2927 2928 skip = 0; 2929 last = skb; 2930 last_len = skb->len; 2931 unix_state_lock(sk); 2932 skb = skb_peek_next(skb, &sk->sk_receive_queue); 2933 if (skb) 2934 goto again; 2935 unix_state_unlock(sk); 2936 break; 2937 } 2938 } while (size); 2939 2940 mutex_unlock(&u->iolock); 2941 if (state->msg) 2942 scm_recv_unix(sock, state->msg, &scm, flags); 2943 else 2944 scm_destroy(&scm); 2945 out: 2946 return copied ? : err; 2947 } 2948 2949 static int unix_stream_read_actor(struct sk_buff *skb, 2950 int skip, int chunk, 2951 struct unix_stream_read_state *state) 2952 { 2953 int ret; 2954 2955 ret = skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip, 2956 state->msg, chunk); 2957 return ret ?: chunk; 2958 } 2959 2960 int __unix_stream_recvmsg(struct sock *sk, struct msghdr *msg, 2961 size_t size, int flags) 2962 { 2963 struct unix_stream_read_state state = { 2964 .recv_actor = unix_stream_read_actor, 2965 .socket = sk->sk_socket, 2966 .msg = msg, 2967 .size = size, 2968 .flags = flags 2969 }; 2970 2971 return unix_stream_read_generic(&state, true); 2972 } 2973 2974 static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg, 2975 size_t size, int flags) 2976 { 2977 struct unix_stream_read_state state = { 2978 .recv_actor = unix_stream_read_actor, 2979 .socket = sock, 2980 .msg = msg, 2981 .size = size, 2982 .flags = flags 2983 }; 2984 2985 #ifdef CONFIG_BPF_SYSCALL 2986 struct sock *sk = sock->sk; 2987 const struct proto *prot = READ_ONCE(sk->sk_prot); 2988 2989 if (prot != &unix_stream_proto) 2990 return prot->recvmsg(sk, msg, size, flags, NULL); 2991 #endif 2992 return unix_stream_read_generic(&state, true); 2993 } 2994 2995 static int unix_stream_splice_actor(struct sk_buff *skb, 2996 int skip, int chunk, 2997 struct unix_stream_read_state *state) 2998 { 2999 return skb_splice_bits(skb, state->socket->sk, 3000 UNIXCB(skb).consumed + skip, 3001 state->pipe, chunk, state->splice_flags); 3002 } 3003 3004 static ssize_t unix_stream_splice_read(struct socket *sock, loff_t *ppos, 3005 struct pipe_inode_info *pipe, 3006 size_t size, unsigned int flags) 3007 { 3008 struct unix_stream_read_state state = { 3009 .recv_actor = unix_stream_splice_actor, 3010 .socket = sock, 3011 .pipe = pipe, 3012 .size = size, 3013 .splice_flags = flags, 3014 }; 3015 3016 if (unlikely(*ppos)) 3017 return -ESPIPE; 3018 3019 if (sock->file->f_flags & O_NONBLOCK || 3020 flags & SPLICE_F_NONBLOCK) 3021 state.flags = MSG_DONTWAIT; 3022 3023 return unix_stream_read_generic(&state, false); 3024 } 3025 3026 static int unix_shutdown(struct socket *sock, int mode) 3027 { 3028 struct sock *sk = sock->sk; 3029 struct sock *other; 3030 3031 if (mode < SHUT_RD || mode > SHUT_RDWR) 3032 return -EINVAL; 3033 /* This maps: 3034 * SHUT_RD (0) -> RCV_SHUTDOWN (1) 3035 * SHUT_WR (1) -> SEND_SHUTDOWN (2) 3036 * SHUT_RDWR (2) -> SHUTDOWN_MASK (3) 3037 */ 3038 ++mode; 3039 3040 unix_state_lock(sk); 3041 WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | mode); 3042 other = unix_peer(sk); 3043 if (other) 3044 sock_hold(other); 3045 unix_state_unlock(sk); 3046 sk->sk_state_change(sk); 3047 3048 if (other && 3049 (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) { 3050 3051 int peer_mode = 0; 3052 const struct proto *prot = READ_ONCE(other->sk_prot); 3053 3054 if (prot->unhash) 3055 prot->unhash(other); 3056 if (mode&RCV_SHUTDOWN) 3057 peer_mode |= SEND_SHUTDOWN; 3058 if 
(mode&SEND_SHUTDOWN) 3059 peer_mode |= RCV_SHUTDOWN; 3060 unix_state_lock(other); 3061 WRITE_ONCE(other->sk_shutdown, other->sk_shutdown | peer_mode); 3062 unix_state_unlock(other); 3063 other->sk_state_change(other); 3064 if (peer_mode == SHUTDOWN_MASK) 3065 sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP); 3066 else if (peer_mode & RCV_SHUTDOWN) 3067 sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN); 3068 } 3069 if (other) 3070 sock_put(other); 3071 3072 return 0; 3073 } 3074 3075 long unix_inq_len(struct sock *sk) 3076 { 3077 struct sk_buff *skb; 3078 long amount = 0; 3079 3080 if (READ_ONCE(sk->sk_state) == TCP_LISTEN) 3081 return -EINVAL; 3082 3083 spin_lock(&sk->sk_receive_queue.lock); 3084 if (sk->sk_type == SOCK_STREAM || 3085 sk->sk_type == SOCK_SEQPACKET) { 3086 skb_queue_walk(&sk->sk_receive_queue, skb) 3087 amount += unix_skb_len(skb); 3088 } else { 3089 skb = skb_peek(&sk->sk_receive_queue); 3090 if (skb) 3091 amount = skb->len; 3092 } 3093 spin_unlock(&sk->sk_receive_queue.lock); 3094 3095 return amount; 3096 } 3097 EXPORT_SYMBOL_GPL(unix_inq_len); 3098 3099 long unix_outq_len(struct sock *sk) 3100 { 3101 return sk_wmem_alloc_get(sk); 3102 } 3103 EXPORT_SYMBOL_GPL(unix_outq_len); 3104 3105 static int unix_open_file(struct sock *sk) 3106 { 3107 struct path path; 3108 struct file *f; 3109 int fd; 3110 3111 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) 3112 return -EPERM; 3113 3114 if (!smp_load_acquire(&unix_sk(sk)->addr)) 3115 return -ENOENT; 3116 3117 path = unix_sk(sk)->path; 3118 if (!path.dentry) 3119 return -ENOENT; 3120 3121 path_get(&path); 3122 3123 fd = get_unused_fd_flags(O_CLOEXEC); 3124 if (fd < 0) 3125 goto out; 3126 3127 f = dentry_open(&path, O_PATH, current_cred()); 3128 if (IS_ERR(f)) { 3129 put_unused_fd(fd); 3130 fd = PTR_ERR(f); 3131 goto out; 3132 } 3133 3134 fd_install(fd, f); 3135 out: 3136 path_put(&path); 3137 3138 return fd; 3139 } 3140 3141 static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) 3142 { 3143 struct sock *sk = sock->sk; 3144 long amount = 0; 3145 int err; 3146 3147 switch (cmd) { 3148 case SIOCOUTQ: 3149 amount = unix_outq_len(sk); 3150 err = put_user(amount, (int __user *)arg); 3151 break; 3152 case SIOCINQ: 3153 amount = unix_inq_len(sk); 3154 if (amount < 0) 3155 err = amount; 3156 else 3157 err = put_user(amount, (int __user *)arg); 3158 break; 3159 case SIOCUNIXFILE: 3160 err = unix_open_file(sk); 3161 break; 3162 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 3163 case SIOCATMARK: 3164 { 3165 struct unix_sock *u = unix_sk(sk); 3166 struct sk_buff *skb; 3167 int answ = 0; 3168 3169 mutex_lock(&u->iolock); 3170 3171 skb = skb_peek(&sk->sk_receive_queue); 3172 if (skb) { 3173 struct sk_buff *oob_skb = READ_ONCE(u->oob_skb); 3174 struct sk_buff *next_skb; 3175 3176 next_skb = skb_peek_next(skb, &sk->sk_receive_queue); 3177 3178 if (skb == oob_skb || 3179 (!unix_skb_len(skb) && 3180 (!oob_skb || next_skb == oob_skb))) 3181 answ = 1; 3182 } 3183 3184 mutex_unlock(&u->iolock); 3185 3186 err = put_user(answ, (int __user *)arg); 3187 } 3188 break; 3189 #endif 3190 default: 3191 err = -ENOIOCTLCMD; 3192 break; 3193 } 3194 return err; 3195 } 3196 3197 #ifdef CONFIG_COMPAT 3198 static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) 3199 { 3200 return unix_ioctl(sock, cmd, (unsigned long)compat_ptr(arg)); 3201 } 3202 #endif 3203 3204 static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wait) 3205 { 3206 struct sock *sk = sock->sk; 3207 unsigned char state; 3208 __poll_t 
mask; 3209 u8 shutdown; 3210 3211 sock_poll_wait(file, sock, wait); 3212 mask = 0; 3213 shutdown = READ_ONCE(sk->sk_shutdown); 3214 state = READ_ONCE(sk->sk_state); 3215 3216 /* exceptional events? */ 3217 if (READ_ONCE(sk->sk_err)) 3218 mask |= EPOLLERR; 3219 if (shutdown == SHUTDOWN_MASK) 3220 mask |= EPOLLHUP; 3221 if (shutdown & RCV_SHUTDOWN) 3222 mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM; 3223 3224 /* readable? */ 3225 if (!skb_queue_empty_lockless(&sk->sk_receive_queue)) 3226 mask |= EPOLLIN | EPOLLRDNORM; 3227 if (sk_is_readable(sk)) 3228 mask |= EPOLLIN | EPOLLRDNORM; 3229 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 3230 if (READ_ONCE(unix_sk(sk)->oob_skb)) 3231 mask |= EPOLLPRI; 3232 #endif 3233 3234 /* Connection-based need to check for termination and startup */ 3235 if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) && 3236 state == TCP_CLOSE) 3237 mask |= EPOLLHUP; 3238 3239 /* 3240 * we set writable also when the other side has shut down the 3241 * connection. This prevents stuck sockets. 3242 */ 3243 if (unix_writable(sk, state)) 3244 mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND; 3245 3246 return mask; 3247 } 3248 3249 static __poll_t unix_dgram_poll(struct file *file, struct socket *sock, 3250 poll_table *wait) 3251 { 3252 struct sock *sk = sock->sk, *other; 3253 unsigned int writable; 3254 unsigned char state; 3255 __poll_t mask; 3256 u8 shutdown; 3257 3258 sock_poll_wait(file, sock, wait); 3259 mask = 0; 3260 shutdown = READ_ONCE(sk->sk_shutdown); 3261 state = READ_ONCE(sk->sk_state); 3262 3263 /* exceptional events? */ 3264 if (READ_ONCE(sk->sk_err) || 3265 !skb_queue_empty_lockless(&sk->sk_error_queue)) 3266 mask |= EPOLLERR | 3267 (sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0); 3268 3269 if (shutdown & RCV_SHUTDOWN) 3270 mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM; 3271 if (shutdown == SHUTDOWN_MASK) 3272 mask |= EPOLLHUP; 3273 3274 /* readable? */ 3275 if (!skb_queue_empty_lockless(&sk->sk_receive_queue)) 3276 mask |= EPOLLIN | EPOLLRDNORM; 3277 if (sk_is_readable(sk)) 3278 mask |= EPOLLIN | EPOLLRDNORM; 3279 3280 /* Connection-based need to check for termination and startup */ 3281 if (sk->sk_type == SOCK_SEQPACKET && state == TCP_CLOSE) 3282 mask |= EPOLLHUP; 3283 3284 /* No write status requested, avoid expensive OUT tests. 
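(The writable check below takes unix_state_lock() and may register this socket on the peer's wait queue, so it is skipped unless the caller actually polled for EPOLLOUT.)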
*/ 3285 if (!(poll_requested_events(wait) & (EPOLLWRBAND|EPOLLWRNORM|EPOLLOUT))) 3286 return mask; 3287 3288 writable = unix_writable(sk, state); 3289 if (writable) { 3290 unix_state_lock(sk); 3291 3292 other = unix_peer(sk); 3293 if (other && unix_peer(other) != sk && 3294 unix_recvq_full_lockless(other) && 3295 unix_dgram_peer_wake_me(sk, other)) 3296 writable = 0; 3297 3298 unix_state_unlock(sk); 3299 } 3300 3301 if (writable) 3302 mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND; 3303 else 3304 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); 3305 3306 return mask; 3307 } 3308 3309 #ifdef CONFIG_PROC_FS 3310 3311 #define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1) 3312 3313 #define get_bucket(x) ((x) >> BUCKET_SPACE) 3314 #define get_offset(x) ((x) & ((1UL << BUCKET_SPACE) - 1)) 3315 #define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o)) 3316 3317 static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos) 3318 { 3319 unsigned long offset = get_offset(*pos); 3320 unsigned long bucket = get_bucket(*pos); 3321 unsigned long count = 0; 3322 struct sock *sk; 3323 3324 for (sk = sk_head(&seq_file_net(seq)->unx.table.buckets[bucket]); 3325 sk; sk = sk_next(sk)) { 3326 if (++count == offset) 3327 break; 3328 } 3329 3330 return sk; 3331 } 3332 3333 static struct sock *unix_get_first(struct seq_file *seq, loff_t *pos) 3334 { 3335 unsigned long bucket = get_bucket(*pos); 3336 struct net *net = seq_file_net(seq); 3337 struct sock *sk; 3338 3339 while (bucket < UNIX_HASH_SIZE) { 3340 spin_lock(&net->unx.table.locks[bucket]); 3341 3342 sk = unix_from_bucket(seq, pos); 3343 if (sk) 3344 return sk; 3345 3346 spin_unlock(&net->unx.table.locks[bucket]); 3347 3348 *pos = set_bucket_offset(++bucket, 1); 3349 } 3350 3351 return NULL; 3352 } 3353 3354 static struct sock *unix_get_next(struct seq_file *seq, struct sock *sk, 3355 loff_t *pos) 3356 { 3357 unsigned long bucket = get_bucket(*pos); 3358 3359 sk = sk_next(sk); 3360 if (sk) 3361 return sk; 3362 3363 3364 spin_unlock(&seq_file_net(seq)->unx.table.locks[bucket]); 3365 3366 *pos = set_bucket_offset(++bucket, 1); 3367 3368 return unix_get_first(seq, pos); 3369 } 3370 3371 static void *unix_seq_start(struct seq_file *seq, loff_t *pos) 3372 { 3373 if (!*pos) 3374 return SEQ_START_TOKEN; 3375 3376 return unix_get_first(seq, pos); 3377 } 3378 3379 static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos) 3380 { 3381 ++*pos; 3382 3383 if (v == SEQ_START_TOKEN) 3384 return unix_get_first(seq, pos); 3385 3386 return unix_get_next(seq, v, pos); 3387 } 3388 3389 static void unix_seq_stop(struct seq_file *seq, void *v) 3390 { 3391 struct sock *sk = v; 3392 3393 if (sk) 3394 spin_unlock(&seq_file_net(seq)->unx.table.locks[sk->sk_hash]); 3395 } 3396 3397 static int unix_seq_show(struct seq_file *seq, void *v) 3398 { 3399 3400 if (v == SEQ_START_TOKEN) 3401 seq_puts(seq, "Num RefCount Protocol Flags Type St " 3402 "Inode Path\n"); 3403 else { 3404 struct sock *s = v; 3405 struct unix_sock *u = unix_sk(s); 3406 unix_state_lock(s); 3407 3408 seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu", 3409 s, 3410 refcount_read(&s->sk_refcnt), 3411 0, 3412 s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0, 3413 s->sk_type, 3414 s->sk_socket ? 3415 (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) : 3416 (s->sk_state == TCP_ESTABLISHED ? 
SS_CONNECTING : SS_DISCONNECTING), 3417 sock_i_ino(s)); 3418 3419 if (u->addr) { // under a hash table lock here 3420 int i, len; 3421 seq_putc(seq, ' '); 3422 3423 i = 0; 3424 len = u->addr->len - 3425 offsetof(struct sockaddr_un, sun_path); 3426 if (u->addr->name->sun_path[0]) { 3427 len--; 3428 } else { 3429 seq_putc(seq, '@'); 3430 i++; 3431 } 3432 for ( ; i < len; i++) 3433 seq_putc(seq, u->addr->name->sun_path[i] ?: 3434 '@'); 3435 } 3436 unix_state_unlock(s); 3437 seq_putc(seq, '\n'); 3438 } 3439 3440 return 0; 3441 } 3442 3443 static const struct seq_operations unix_seq_ops = { 3444 .start = unix_seq_start, 3445 .next = unix_seq_next, 3446 .stop = unix_seq_stop, 3447 .show = unix_seq_show, 3448 }; 3449 3450 #ifdef CONFIG_BPF_SYSCALL 3451 struct bpf_unix_iter_state { 3452 struct seq_net_private p; 3453 unsigned int cur_sk; 3454 unsigned int end_sk; 3455 unsigned int max_sk; 3456 struct sock **batch; 3457 bool st_bucket_done; 3458 }; 3459 3460 struct bpf_iter__unix { 3461 __bpf_md_ptr(struct bpf_iter_meta *, meta); 3462 __bpf_md_ptr(struct unix_sock *, unix_sk); 3463 uid_t uid __aligned(8); 3464 }; 3465 3466 static int unix_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta, 3467 struct unix_sock *unix_sk, uid_t uid) 3468 { 3469 struct bpf_iter__unix ctx; 3470 3471 meta->seq_num--; /* skip SEQ_START_TOKEN */ 3472 ctx.meta = meta; 3473 ctx.unix_sk = unix_sk; 3474 ctx.uid = uid; 3475 return bpf_iter_run_prog(prog, &ctx); 3476 } 3477 3478 static int bpf_iter_unix_hold_batch(struct seq_file *seq, struct sock *start_sk) 3479 3480 { 3481 struct bpf_unix_iter_state *iter = seq->private; 3482 unsigned int expected = 1; 3483 struct sock *sk; 3484 3485 sock_hold(start_sk); 3486 iter->batch[iter->end_sk++] = start_sk; 3487 3488 for (sk = sk_next(start_sk); sk; sk = sk_next(sk)) { 3489 if (iter->end_sk < iter->max_sk) { 3490 sock_hold(sk); 3491 iter->batch[iter->end_sk++] = sk; 3492 } 3493 3494 expected++; 3495 } 3496 3497 spin_unlock(&seq_file_net(seq)->unx.table.locks[start_sk->sk_hash]); 3498 3499 return expected; 3500 } 3501 3502 static void bpf_iter_unix_put_batch(struct bpf_unix_iter_state *iter) 3503 { 3504 while (iter->cur_sk < iter->end_sk) 3505 sock_put(iter->batch[iter->cur_sk++]); 3506 } 3507 3508 static int bpf_iter_unix_realloc_batch(struct bpf_unix_iter_state *iter, 3509 unsigned int new_batch_sz) 3510 { 3511 struct sock **new_batch; 3512 3513 new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz, 3514 GFP_USER | __GFP_NOWARN); 3515 if (!new_batch) 3516 return -ENOMEM; 3517 3518 bpf_iter_unix_put_batch(iter); 3519 kvfree(iter->batch); 3520 iter->batch = new_batch; 3521 iter->max_sk = new_batch_sz; 3522 3523 return 0; 3524 } 3525 3526 static struct sock *bpf_iter_unix_batch(struct seq_file *seq, 3527 loff_t *pos) 3528 { 3529 struct bpf_unix_iter_state *iter = seq->private; 3530 unsigned int expected; 3531 bool resized = false; 3532 struct sock *sk; 3533 3534 if (iter->st_bucket_done) 3535 *pos = set_bucket_offset(get_bucket(*pos) + 1, 1); 3536 3537 again: 3538 /* Get a new batch */ 3539 iter->cur_sk = 0; 3540 iter->end_sk = 0; 3541 3542 sk = unix_get_first(seq, pos); 3543 if (!sk) 3544 return NULL; /* Done */ 3545 3546 expected = bpf_iter_unix_hold_batch(seq, sk); 3547 3548 if (iter->end_sk == expected) { 3549 iter->st_bucket_done = true; 3550 return sk; 3551 } 3552 3553 if (!resized && !bpf_iter_unix_realloc_batch(iter, expected * 3 / 2)) { 3554 resized = true; 3555 goto again; 3556 } 3557 3558 return sk; 3559 } 3560 3561 static void *bpf_iter_unix_seq_start(struct 
seq_file *seq, loff_t *pos) 3562 { 3563 if (!*pos) 3564 return SEQ_START_TOKEN; 3565 3566 /* bpf iter does not support lseek, so it always 3567 * continue from where it was stop()-ped. 3568 */ 3569 return bpf_iter_unix_batch(seq, pos); 3570 } 3571 3572 static void *bpf_iter_unix_seq_next(struct seq_file *seq, void *v, loff_t *pos) 3573 { 3574 struct bpf_unix_iter_state *iter = seq->private; 3575 struct sock *sk; 3576 3577 /* Whenever seq_next() is called, the iter->cur_sk is 3578 * done with seq_show(), so advance to the next sk in 3579 * the batch. 3580 */ 3581 if (iter->cur_sk < iter->end_sk) 3582 sock_put(iter->batch[iter->cur_sk++]); 3583 3584 ++*pos; 3585 3586 if (iter->cur_sk < iter->end_sk) 3587 sk = iter->batch[iter->cur_sk]; 3588 else 3589 sk = bpf_iter_unix_batch(seq, pos); 3590 3591 return sk; 3592 } 3593 3594 static int bpf_iter_unix_seq_show(struct seq_file *seq, void *v) 3595 { 3596 struct bpf_iter_meta meta; 3597 struct bpf_prog *prog; 3598 struct sock *sk = v; 3599 uid_t uid; 3600 bool slow; 3601 int ret; 3602 3603 if (v == SEQ_START_TOKEN) 3604 return 0; 3605 3606 slow = lock_sock_fast(sk); 3607 3608 if (unlikely(sk_unhashed(sk))) { 3609 ret = SEQ_SKIP; 3610 goto unlock; 3611 } 3612 3613 uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk)); 3614 meta.seq = seq; 3615 prog = bpf_iter_get_info(&meta, false); 3616 ret = unix_prog_seq_show(prog, &meta, v, uid); 3617 unlock: 3618 unlock_sock_fast(sk, slow); 3619 return ret; 3620 } 3621 3622 static void bpf_iter_unix_seq_stop(struct seq_file *seq, void *v) 3623 { 3624 struct bpf_unix_iter_state *iter = seq->private; 3625 struct bpf_iter_meta meta; 3626 struct bpf_prog *prog; 3627 3628 if (!v) { 3629 meta.seq = seq; 3630 prog = bpf_iter_get_info(&meta, true); 3631 if (prog) 3632 (void)unix_prog_seq_show(prog, &meta, v, 0); 3633 } 3634 3635 if (iter->cur_sk < iter->end_sk) 3636 bpf_iter_unix_put_batch(iter); 3637 } 3638 3639 static const struct seq_operations bpf_iter_unix_seq_ops = { 3640 .start = bpf_iter_unix_seq_start, 3641 .next = bpf_iter_unix_seq_next, 3642 .stop = bpf_iter_unix_seq_stop, 3643 .show = bpf_iter_unix_seq_show, 3644 }; 3645 #endif 3646 #endif 3647 3648 static const struct net_proto_family unix_family_ops = { 3649 .family = PF_UNIX, 3650 .create = unix_create, 3651 .owner = THIS_MODULE, 3652 }; 3653 3654 3655 static int __net_init unix_net_init(struct net *net) 3656 { 3657 int i; 3658 3659 net->unx.sysctl_max_dgram_qlen = 10; 3660 if (unix_sysctl_register(net)) 3661 goto out; 3662 3663 #ifdef CONFIG_PROC_FS 3664 if (!proc_create_net("unix", 0, net->proc_net, &unix_seq_ops, 3665 sizeof(struct seq_net_private))) 3666 goto err_sysctl; 3667 #endif 3668 3669 net->unx.table.locks = kvmalloc_array(UNIX_HASH_SIZE, 3670 sizeof(spinlock_t), GFP_KERNEL); 3671 if (!net->unx.table.locks) 3672 goto err_proc; 3673 3674 net->unx.table.buckets = kvmalloc_array(UNIX_HASH_SIZE, 3675 sizeof(struct hlist_head), 3676 GFP_KERNEL); 3677 if (!net->unx.table.buckets) 3678 goto free_locks; 3679 3680 for (i = 0; i < UNIX_HASH_SIZE; i++) { 3681 spin_lock_init(&net->unx.table.locks[i]); 3682 lock_set_cmp_fn(&net->unx.table.locks[i], unix_table_lock_cmp_fn, NULL); 3683 INIT_HLIST_HEAD(&net->unx.table.buckets[i]); 3684 } 3685 3686 return 0; 3687 3688 free_locks: 3689 kvfree(net->unx.table.locks); 3690 err_proc: 3691 #ifdef CONFIG_PROC_FS 3692 remove_proc_entry("unix", net->proc_net); 3693 err_sysctl: 3694 #endif 3695 unix_sysctl_unregister(net); 3696 out: 3697 return -ENOMEM; 3698 } 3699 3700 static void __net_exit unix_net_exit(struct net 
*net) 3701 { 3702 kvfree(net->unx.table.buckets); 3703 kvfree(net->unx.table.locks); 3704 unix_sysctl_unregister(net); 3705 remove_proc_entry("unix", net->proc_net); 3706 } 3707 3708 static struct pernet_operations unix_net_ops = { 3709 .init = unix_net_init, 3710 .exit = unix_net_exit, 3711 }; 3712 3713 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) 3714 DEFINE_BPF_ITER_FUNC(unix, struct bpf_iter_meta *meta, 3715 struct unix_sock *unix_sk, uid_t uid) 3716 3717 #define INIT_BATCH_SZ 16 3718 3719 static int bpf_iter_init_unix(void *priv_data, struct bpf_iter_aux_info *aux) 3720 { 3721 struct bpf_unix_iter_state *iter = priv_data; 3722 int err; 3723 3724 err = bpf_iter_init_seq_net(priv_data, aux); 3725 if (err) 3726 return err; 3727 3728 err = bpf_iter_unix_realloc_batch(iter, INIT_BATCH_SZ); 3729 if (err) { 3730 bpf_iter_fini_seq_net(priv_data); 3731 return err; 3732 } 3733 3734 return 0; 3735 } 3736 3737 static void bpf_iter_fini_unix(void *priv_data) 3738 { 3739 struct bpf_unix_iter_state *iter = priv_data; 3740 3741 bpf_iter_fini_seq_net(priv_data); 3742 kvfree(iter->batch); 3743 } 3744 3745 static const struct bpf_iter_seq_info unix_seq_info = { 3746 .seq_ops = &bpf_iter_unix_seq_ops, 3747 .init_seq_private = bpf_iter_init_unix, 3748 .fini_seq_private = bpf_iter_fini_unix, 3749 .seq_priv_size = sizeof(struct bpf_unix_iter_state), 3750 }; 3751 3752 static const struct bpf_func_proto * 3753 bpf_iter_unix_get_func_proto(enum bpf_func_id func_id, 3754 const struct bpf_prog *prog) 3755 { 3756 switch (func_id) { 3757 case BPF_FUNC_setsockopt: 3758 return &bpf_sk_setsockopt_proto; 3759 case BPF_FUNC_getsockopt: 3760 return &bpf_sk_getsockopt_proto; 3761 default: 3762 return NULL; 3763 } 3764 } 3765 3766 static struct bpf_iter_reg unix_reg_info = { 3767 .target = "unix", 3768 .ctx_arg_info_size = 1, 3769 .ctx_arg_info = { 3770 { offsetof(struct bpf_iter__unix, unix_sk), 3771 PTR_TO_BTF_ID_OR_NULL }, 3772 }, 3773 .get_func_proto = bpf_iter_unix_get_func_proto, 3774 .seq_info = &unix_seq_info, 3775 }; 3776 3777 static void __init bpf_iter_register(void) 3778 { 3779 unix_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_UNIX]; 3780 if (bpf_iter_reg_target(&unix_reg_info)) 3781 pr_warn("Warning: could not register bpf iterator unix\n"); 3782 } 3783 #endif 3784 3785 static int __init af_unix_init(void) 3786 { 3787 int i, rc = -1; 3788 3789 BUILD_BUG_ON(sizeof(struct unix_skb_parms) > sizeof_field(struct sk_buff, cb)); 3790 3791 for (i = 0; i < UNIX_HASH_SIZE / 2; i++) { 3792 spin_lock_init(&bsd_socket_locks[i]); 3793 INIT_HLIST_HEAD(&bsd_socket_buckets[i]); 3794 } 3795 3796 rc = proto_register(&unix_dgram_proto, 1); 3797 if (rc != 0) { 3798 pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__); 3799 goto out; 3800 } 3801 3802 rc = proto_register(&unix_stream_proto, 1); 3803 if (rc != 0) { 3804 pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__); 3805 proto_unregister(&unix_dgram_proto); 3806 goto out; 3807 } 3808 3809 sock_register(&unix_family_ops); 3810 register_pernet_subsys(&unix_net_ops); 3811 unix_bpf_build_proto(); 3812 3813 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) 3814 bpf_iter_register(); 3815 #endif 3816 3817 out: 3818 return rc; 3819 } 3820 3821 /* Later than subsys_initcall() because we depend on stuff initialised there */ 3822 fs_initcall(af_unix_init); 3823
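/*
 * Illustrative userspace sketch (not part of this module, compiled out below):
 * a minimal view of the MSG_OOB path implemented above by queue_oob(),
 * manage_oob(), unix_stream_recv_urg() and the SIOCATMARK ioctl.  It assumes
 * a kernel built with CONFIG_AF_UNIX_OOB and omits error handling for brevity;
 * the in-band read is expected to stop at the mark, SIOCATMARK to report 1,
 * and the MSG_OOB read to return the single '!' byte.
 */
#if 0
#include <stdio.h>
#include <sys/types.h>
#include <sys/ioctl.h>
#include <sys/socket.h>

int main(void)
{
	char buf[16];
	int sv[2], atmark = 0;
	ssize_t n;

	socketpair(AF_UNIX, SOCK_STREAM, 0, sv);

	send(sv[0], "hello", 5, 0);		/* ordinary in-band data */
	send(sv[0], "!", 1, MSG_OOB);		/* queued by queue_oob(), receiver gets SIGURG */

	n = recv(sv[1], buf, sizeof(buf), 0);	/* manage_oob() stops the copy at the OOB mark */
	printf("in-band: %.*s\n", (int)n, buf);

	ioctl(sv[1], SIOCATMARK, &atmark);	/* 1: the next byte to read is the OOB byte */
	printf("at mark: %d\n", atmark);

	recv(sv[1], buf, 1, MSG_OOB);		/* served by unix_stream_recv_urg() */
	printf("oob: %c\n", buf[0]);

	return 0;
}
#endif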