// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * NET4:	Implementation of BSD Unix domain sockets.
 *
 * Authors:	Alan Cox, <alan@lxorguk.ukuu.org.uk>
 *
 * Fixes:
 *		Linus Torvalds	:	Assorted bug cures.
 *		Niibe Yutaka	:	async I/O support.
 *		Carsten Paeth	:	PF_UNIX check, address fixes.
 *		Alan Cox	:	Limit size of allocated blocks.
 *		Alan Cox	:	Fixed the stupid socketpair bug.
 *		Alan Cox	:	BSD compatibility fine tuning.
 *		Alan Cox	:	Fixed a bug in connect when interrupted.
 *		Alan Cox	:	Sorted out a proper draft version of
 *					file descriptor passing hacked up from
 *					Mike Shaver's work.
 *		Marty Leisner	:	Fixes to fd passing.
 *		Nick Nevin	:	recvmsg bugfix.
 *		Alan Cox	:	Started proper garbage collector.
 *		Heiko EiBfeldt	:	Missing verify_area check.
 *		Alan Cox	:	Started POSIXisms.
 *		Andreas Schwab	:	Replace inode by dentry for proper
 *					reference counting.
 *		Kirk Petersen	:	Made this a module.
 *		Christoph Rohland:	Elegant non-blocking accept/connect
 *					algorithm. Lots of bug fixes.
 *		Alexey Kuznetsov:	Repaired (I hope) bugs introduced
 *					by the above two patches.
 *		Andrea Arcangeli:	If possible we block in connect(2)
 *					if the max backlog of the listen socket
 *					has been reached. This won't break
 *					old apps and it will avoid a huge number
 *					of socks being hashed (this for unix_gc()
 *					performance reasons).
 *					Security fix that limits the max
 *					number of socks to 2*max_files and
 *					the number of skbs queueable in the
 *					dgram receiver.
 *		Artur Skawina	:	Hash function optimizations.
 *		Alexey Kuznetsov:	Full scale SMP. Lots of bugs are introduced 8)
 *		Malcolm Beattie	:	Set peercred for socketpair.
 *		Michal Ostrowski:	Module initialization cleanup.
 *		Arnaldo C. Melo	:	Remove MOD_{INC,DEC}_USE_COUNT;
 *					the core infrastructure is doing that
 *					for all net proto families now (2.5.69+).
 *
 * Known differences from reference BSD that was tested:
 *
 *	[TO FIX]
 *	ECONNREFUSED is not returned from one end of a connected() socket to the
 *		other the moment one end closes.
 *	fstat() doesn't return st_dev=0, and gives the blksize as high water mark
 *		and a fake inode identifier (nor the BSD first socket fstat twice bug).
 *	[NOT TO FIX]
 *	accept() returns a path name even if the connecting socket has closed
 *		in the meantime (BSD loses the path and gives up).
 *	accept() returns 0 length path for an unbound connector. BSD returns 16
 *		and a null first byte in the path (but not for gethost/peername -
 *		BSD bug??).
 *	socketpair(...SOCK_RAW..) doesn't panic the kernel.
 *	BSD af_unix apparently has connect forgetting to block properly.
 *		(need to check this with the POSIX spec in detail)
 *
 * Differences from 2.0.0-11-... (ANK)
 *	Bug fixes and improvements.
 *		- client shutdown killed server socket.
 *		- removed all useless cli/sti pairs.
 *
 *	Semantic changes/extensions.
 *		- generic control message passing.
 *		- SCM_CREDENTIALS control message.
 *		- "Abstract" (not FS based) socket bindings.
 *		  Abstract names are sequences of bytes (not zero terminated)
 *		  starting with 0, so that this name space does not intersect
 *		  with BSD names.
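 *
 *		  For illustration only (fd and the name below are just
 *		  examples), an abstract bind from user space might look like:
 *
 *			struct sockaddr_un a = { .sun_family = AF_UNIX };
 *			memcpy(a.sun_path, "\0example", 8);
 *			bind(fd, (struct sockaddr *)&a,
 *			     offsetof(struct sockaddr_un, sun_path) + 8);
 *
 *		  The leading NUL selects the abstract namespace, and the
 *		  address length passed to bind(), not a terminator, delimits
 *		  the name.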
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/signal.h>
#include <linux/sched/signal.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/stat.h>
#include <linux/dcache.h>
#include <linux/namei.h>
#include <linux/socket.h>
#include <linux/un.h>
#include <linux/fcntl.h>
#include <linux/filter.h>
#include <linux/termios.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/in.h>
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <net/net_namespace.h>
#include <net/sock.h>
#include <net/tcp_states.h>
#include <net/af_unix.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <net/scm.h>
#include <linux/init.h>
#include <linux/poll.h>
#include <linux/rtnetlink.h>
#include <linux/mount.h>
#include <net/checksum.h>
#include <linux/security.h>
#include <linux/splice.h>
#include <linux/freezer.h>
#include <linux/file.h>
#include <linux/btf_ids.h>
#include <linux/bpf-cgroup.h>

static atomic_long_t unix_nr_socks;
static struct hlist_head bsd_socket_buckets[UNIX_HASH_SIZE / 2];
static spinlock_t bsd_socket_locks[UNIX_HASH_SIZE / 2];

/* SMP locking strategy:
 *    hash table is protected with spinlock.
 *    each socket state is protected by separate spinlock.
 */
#ifdef CONFIG_PROVE_LOCKING
#define cmp_ptr(l, r)	(((l) > (r)) - ((l) < (r)))

static int unix_table_lock_cmp_fn(const struct lockdep_map *a,
				  const struct lockdep_map *b)
{
	return cmp_ptr(a, b);
}

static int unix_state_lock_cmp_fn(const struct lockdep_map *_a,
				  const struct lockdep_map *_b)
{
	const struct unix_sock *a, *b;

	a = container_of(_a, struct unix_sock, lock.dep_map);
	b = container_of(_b, struct unix_sock, lock.dep_map);

	if (a->sk.sk_state == TCP_LISTEN) {
		/* unix_stream_connect(): Before the 2nd unix_state_lock(),
		 *
		 *   1. a is TCP_LISTEN.
		 *   2. b is not a.
		 *   3. concurrent connect(b -> a) must fail.
		 *
		 * Except for 2. & 3., b's state can be any possible
		 * value due to concurrent connect() or listen().
		 *
		 * 2. is detected in debug_spin_lock_before(), and 3. cannot
		 * be expressed as lock_cmp_fn.
		 */
		switch (b->sk.sk_state) {
		case TCP_CLOSE:
		case TCP_ESTABLISHED:
		case TCP_LISTEN:
			return -1;
		default:
			/* Invalid case. */
			return 0;
		}
	}

	/* Should never happen.  Just to be symmetric. */
	if (b->sk.sk_state == TCP_LISTEN) {
		switch (a->sk.sk_state) {
		case TCP_CLOSE:
		case TCP_ESTABLISHED:
			return 1;
		default:
			return 0;
		}
	}

	/* unix_state_double_lock(): ascending address order. */
	return cmp_ptr(a, b);
}

static int unix_recvq_lock_cmp_fn(const struct lockdep_map *_a,
				  const struct lockdep_map *_b)
{
	const struct sock *a, *b;

	a = container_of(_a, struct sock, sk_receive_queue.lock.dep_map);
	b = container_of(_b, struct sock, sk_receive_queue.lock.dep_map);

	/* unix_collect_skb(): listener -> embryo order. */
	if (a->sk_state == TCP_LISTEN && unix_sk(b)->listener == a)
		return -1;

	/* Should never happen.  Just to be symmetric.
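	 * (unix_collect_skb() takes the listener's queue lock first, as noted
	 *  above, so the embryo-before-listener nesting handled here is not
	 *  expected to occur in practice.)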
	 */
	if (b->sk_state == TCP_LISTEN && unix_sk(a)->listener == b)
		return 1;

	return 0;
}
#endif

static unsigned int unix_unbound_hash(struct sock *sk)
{
	unsigned long hash = (unsigned long)sk;

	hash ^= hash >> 16;
	hash ^= hash >> 8;
	hash ^= sk->sk_type;

	return hash & UNIX_HASH_MOD;
}

static unsigned int unix_bsd_hash(struct inode *i)
{
	return i->i_ino & UNIX_HASH_MOD;
}

static unsigned int unix_abstract_hash(struct sockaddr_un *sunaddr,
				       int addr_len, int type)
{
	__wsum csum = csum_partial(sunaddr, addr_len, 0);
	unsigned int hash;

	hash = (__force unsigned int)csum_fold(csum);
	hash ^= hash >> 8;
	hash ^= type;

	return UNIX_HASH_MOD + 1 + (hash & UNIX_HASH_MOD);
}

static void unix_table_double_lock(struct net *net,
				   unsigned int hash1, unsigned int hash2)
{
	if (hash1 == hash2) {
		spin_lock(&net->unx.table.locks[hash1]);
		return;
	}

	if (hash1 > hash2)
		swap(hash1, hash2);

	spin_lock(&net->unx.table.locks[hash1]);
	spin_lock(&net->unx.table.locks[hash2]);
}

static void unix_table_double_unlock(struct net *net,
				     unsigned int hash1, unsigned int hash2)
{
	if (hash1 == hash2) {
		spin_unlock(&net->unx.table.locks[hash1]);
		return;
	}

	spin_unlock(&net->unx.table.locks[hash1]);
	spin_unlock(&net->unx.table.locks[hash2]);
}

#ifdef CONFIG_SECURITY_NETWORK
static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{
	UNIXCB(skb).secid = scm->secid;
}

static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{
	scm->secid = UNIXCB(skb).secid;
}

static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
{
	return (scm->secid == UNIXCB(skb).secid);
}
#else
static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{ }

static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{ }

static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
{
	return true;
}
#endif /* CONFIG_SECURITY_NETWORK */

static inline int unix_may_send(struct sock *sk, struct sock *osk)
{
	return !unix_peer(osk) || unix_peer(osk) == sk;
}

static inline int unix_recvq_full_lockless(const struct sock *sk)
{
	return skb_queue_len_lockless(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
}

struct sock *unix_peer_get(struct sock *s)
{
	struct sock *peer;

	unix_state_lock(s);
	peer = unix_peer(s);
	if (peer)
		sock_hold(peer);
	unix_state_unlock(s);
	return peer;
}
EXPORT_SYMBOL_GPL(unix_peer_get);

static struct unix_address *unix_create_addr(struct sockaddr_un *sunaddr,
					     int addr_len)
{
	struct unix_address *addr;

	addr = kmalloc(sizeof(*addr) + addr_len, GFP_KERNEL);
	if (!addr)
		return NULL;

	refcount_set(&addr->refcnt, 1);
	addr->len = addr_len;
	memcpy(addr->name, sunaddr, addr_len);

	return addr;
}

static inline void unix_release_addr(struct unix_address *addr)
{
	if (refcount_dec_and_test(&addr->refcnt))
		kfree(addr);
}

/*
 * Check unix socket name:
 *	- it must not be zero length.
 *	- if it does not start with a NUL byte, it names a filesystem object
 *	  and should be NUL terminated.
 *	- if it starts with a NUL byte, it is an abstract name.
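 *
 *	For example, binding to the filesystem path "/tmp/s" typically passes
 *	addr_len == offsetof(struct sockaddr_un, sun_path) + 7 (the path plus
 *	its terminating NUL), while the abstract name "\0s" passes
 *	addr_len == offsetof(struct sockaddr_un, sun_path) + 2.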
339 */ 340 341 static int unix_validate_addr(struct sockaddr_un *sunaddr, int addr_len) 342 { 343 if (addr_len <= offsetof(struct sockaddr_un, sun_path) || 344 addr_len > sizeof(*sunaddr)) 345 return -EINVAL; 346 347 if (sunaddr->sun_family != AF_UNIX) 348 return -EINVAL; 349 350 return 0; 351 } 352 353 static int unix_mkname_bsd(struct sockaddr_un *sunaddr, int addr_len) 354 { 355 struct sockaddr_storage *addr = (struct sockaddr_storage *)sunaddr; 356 short offset = offsetof(struct sockaddr_storage, __data); 357 358 BUILD_BUG_ON(offset != offsetof(struct sockaddr_un, sun_path)); 359 360 /* This may look like an off by one error but it is a bit more 361 * subtle. 108 is the longest valid AF_UNIX path for a binding. 362 * sun_path[108] doesn't as such exist. However in kernel space 363 * we are guaranteed that it is a valid memory location in our 364 * kernel address buffer because syscall functions always pass 365 * a pointer of struct sockaddr_storage which has a bigger buffer 366 * than 108. Also, we must terminate sun_path for strlen() in 367 * getname_kernel(). 368 */ 369 addr->__data[addr_len - offset] = 0; 370 371 /* Don't pass sunaddr->sun_path to strlen(). Otherwise, 108 will 372 * cause panic if CONFIG_FORTIFY_SOURCE=y. Let __fortify_strlen() 373 * know the actual buffer. 374 */ 375 return strlen(addr->__data) + offset + 1; 376 } 377 378 static void __unix_remove_socket(struct sock *sk) 379 { 380 sk_del_node_init(sk); 381 } 382 383 static void __unix_insert_socket(struct net *net, struct sock *sk) 384 { 385 DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk)); 386 sk_add_node(sk, &net->unx.table.buckets[sk->sk_hash]); 387 } 388 389 static void __unix_set_addr_hash(struct net *net, struct sock *sk, 390 struct unix_address *addr, unsigned int hash) 391 { 392 __unix_remove_socket(sk); 393 smp_store_release(&unix_sk(sk)->addr, addr); 394 395 sk->sk_hash = hash; 396 __unix_insert_socket(net, sk); 397 } 398 399 static void unix_remove_socket(struct net *net, struct sock *sk) 400 { 401 spin_lock(&net->unx.table.locks[sk->sk_hash]); 402 __unix_remove_socket(sk); 403 spin_unlock(&net->unx.table.locks[sk->sk_hash]); 404 } 405 406 static void unix_insert_unbound_socket(struct net *net, struct sock *sk) 407 { 408 spin_lock(&net->unx.table.locks[sk->sk_hash]); 409 __unix_insert_socket(net, sk); 410 spin_unlock(&net->unx.table.locks[sk->sk_hash]); 411 } 412 413 static void unix_insert_bsd_socket(struct sock *sk) 414 { 415 spin_lock(&bsd_socket_locks[sk->sk_hash]); 416 sk_add_bind_node(sk, &bsd_socket_buckets[sk->sk_hash]); 417 spin_unlock(&bsd_socket_locks[sk->sk_hash]); 418 } 419 420 static void unix_remove_bsd_socket(struct sock *sk) 421 { 422 if (!hlist_unhashed(&sk->sk_bind_node)) { 423 spin_lock(&bsd_socket_locks[sk->sk_hash]); 424 __sk_del_bind_node(sk); 425 spin_unlock(&bsd_socket_locks[sk->sk_hash]); 426 427 sk_node_init(&sk->sk_bind_node); 428 } 429 } 430 431 static struct sock *__unix_find_socket_byname(struct net *net, 432 struct sockaddr_un *sunname, 433 int len, unsigned int hash) 434 { 435 struct sock *s; 436 437 sk_for_each(s, &net->unx.table.buckets[hash]) { 438 struct unix_sock *u = unix_sk(s); 439 440 if (u->addr->len == len && 441 !memcmp(u->addr->name, sunname, len)) 442 return s; 443 } 444 return NULL; 445 } 446 447 static inline struct sock *unix_find_socket_byname(struct net *net, 448 struct sockaddr_un *sunname, 449 int len, unsigned int hash) 450 { 451 struct sock *s; 452 453 spin_lock(&net->unx.table.locks[hash]); 454 s = __unix_find_socket_byname(net, sunname, len, hash); 455 if 
(s) 456 sock_hold(s); 457 spin_unlock(&net->unx.table.locks[hash]); 458 return s; 459 } 460 461 static struct sock *unix_find_socket_byinode(struct inode *i) 462 { 463 unsigned int hash = unix_bsd_hash(i); 464 struct sock *s; 465 466 spin_lock(&bsd_socket_locks[hash]); 467 sk_for_each_bound(s, &bsd_socket_buckets[hash]) { 468 struct dentry *dentry = unix_sk(s)->path.dentry; 469 470 if (dentry && d_backing_inode(dentry) == i) { 471 sock_hold(s); 472 spin_unlock(&bsd_socket_locks[hash]); 473 return s; 474 } 475 } 476 spin_unlock(&bsd_socket_locks[hash]); 477 return NULL; 478 } 479 480 /* Support code for asymmetrically connected dgram sockets 481 * 482 * If a datagram socket is connected to a socket not itself connected 483 * to the first socket (eg, /dev/log), clients may only enqueue more 484 * messages if the present receive queue of the server socket is not 485 * "too large". This means there's a second writeability condition 486 * poll and sendmsg need to test. The dgram recv code will do a wake 487 * up on the peer_wait wait queue of a socket upon reception of a 488 * datagram which needs to be propagated to sleeping would-be writers 489 * since these might not have sent anything so far. This can't be 490 * accomplished via poll_wait because the lifetime of the server 491 * socket might be less than that of its clients if these break their 492 * association with it or if the server socket is closed while clients 493 * are still connected to it and there's no way to inform "a polling 494 * implementation" that it should let go of a certain wait queue 495 * 496 * In order to propagate a wake up, a wait_queue_entry_t of the client 497 * socket is enqueued on the peer_wait queue of the server socket 498 * whose wake function does a wake_up on the ordinary client socket 499 * wait queue. This connection is established whenever a write (or 500 * poll for write) hit the flow control condition and broken when the 501 * association to the server socket is dissolved or after a wake up 502 * was relayed. 
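 *
 * Roughly, the pieces below fit together as follows:
 *
 *	poll()/sendmsg() against a full peer -> unix_dgram_peer_wake_me()
 *		arms the relay: the client's peer_wake entry is queued on the
 *		server's peer_wait queue.
 *	recvmsg() on the server -> wake_up on peer_wait.
 *	unix_dgram_peer_wake_relay() -> wakes the client's own wait queue and
 *		disarms the relay.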
503 */ 504 505 static int unix_dgram_peer_wake_relay(wait_queue_entry_t *q, unsigned mode, int flags, 506 void *key) 507 { 508 struct unix_sock *u; 509 wait_queue_head_t *u_sleep; 510 511 u = container_of(q, struct unix_sock, peer_wake); 512 513 __remove_wait_queue(&unix_sk(u->peer_wake.private)->peer_wait, 514 q); 515 u->peer_wake.private = NULL; 516 517 /* relaying can only happen while the wq still exists */ 518 u_sleep = sk_sleep(&u->sk); 519 if (u_sleep) 520 wake_up_interruptible_poll(u_sleep, key_to_poll(key)); 521 522 return 0; 523 } 524 525 static int unix_dgram_peer_wake_connect(struct sock *sk, struct sock *other) 526 { 527 struct unix_sock *u, *u_other; 528 int rc; 529 530 u = unix_sk(sk); 531 u_other = unix_sk(other); 532 rc = 0; 533 spin_lock(&u_other->peer_wait.lock); 534 535 if (!u->peer_wake.private) { 536 u->peer_wake.private = other; 537 __add_wait_queue(&u_other->peer_wait, &u->peer_wake); 538 539 rc = 1; 540 } 541 542 spin_unlock(&u_other->peer_wait.lock); 543 return rc; 544 } 545 546 static void unix_dgram_peer_wake_disconnect(struct sock *sk, 547 struct sock *other) 548 { 549 struct unix_sock *u, *u_other; 550 551 u = unix_sk(sk); 552 u_other = unix_sk(other); 553 spin_lock(&u_other->peer_wait.lock); 554 555 if (u->peer_wake.private == other) { 556 __remove_wait_queue(&u_other->peer_wait, &u->peer_wake); 557 u->peer_wake.private = NULL; 558 } 559 560 spin_unlock(&u_other->peer_wait.lock); 561 } 562 563 static void unix_dgram_peer_wake_disconnect_wakeup(struct sock *sk, 564 struct sock *other) 565 { 566 unix_dgram_peer_wake_disconnect(sk, other); 567 wake_up_interruptible_poll(sk_sleep(sk), 568 EPOLLOUT | 569 EPOLLWRNORM | 570 EPOLLWRBAND); 571 } 572 573 /* preconditions: 574 * - unix_peer(sk) == other 575 * - association is stable 576 */ 577 static int unix_dgram_peer_wake_me(struct sock *sk, struct sock *other) 578 { 579 int connected; 580 581 connected = unix_dgram_peer_wake_connect(sk, other); 582 583 /* If other is SOCK_DEAD, we want to make sure we signal 584 * POLLOUT, such that a subsequent write() can get a 585 * -ECONNREFUSED. Otherwise, if we haven't queued any skbs 586 * to other and its full, we will hang waiting for POLLOUT. 587 */ 588 if (unix_recvq_full_lockless(other) && !sock_flag(other, SOCK_DEAD)) 589 return 1; 590 591 if (connected) 592 unix_dgram_peer_wake_disconnect(sk, other); 593 594 return 0; 595 } 596 597 static int unix_writable(const struct sock *sk, unsigned char state) 598 { 599 return state != TCP_LISTEN && 600 (refcount_read(&sk->sk_wmem_alloc) << 2) <= READ_ONCE(sk->sk_sndbuf); 601 } 602 603 static void unix_write_space(struct sock *sk) 604 { 605 struct socket_wq *wq; 606 607 rcu_read_lock(); 608 if (unix_writable(sk, READ_ONCE(sk->sk_state))) { 609 wq = rcu_dereference(sk->sk_wq); 610 if (skwq_has_sleeper(wq)) 611 wake_up_interruptible_sync_poll(&wq->wait, 612 EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND); 613 sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT); 614 } 615 rcu_read_unlock(); 616 } 617 618 /* When dgram socket disconnects (or changes its peer), we clear its receive 619 * queue of packets arrived from previous peer. First, it allows to do 620 * flow control based only on wmem_alloc; second, sk connected to peer 621 * may receive messages only from that peer. 
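 * This is what makes a datagram connect() to a new peer (or to AF_UNSPEC)
 * drop anything still queued from the old peer, and report ECONNRESET to
 * the old peer if it was connected to us.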
*/ 622 static void unix_dgram_disconnected(struct sock *sk, struct sock *other) 623 { 624 if (!skb_queue_empty(&sk->sk_receive_queue)) { 625 skb_queue_purge_reason(&sk->sk_receive_queue, 626 SKB_DROP_REASON_UNIX_DISCONNECT); 627 628 wake_up_interruptible_all(&unix_sk(sk)->peer_wait); 629 630 /* If one link of bidirectional dgram pipe is disconnected, 631 * we signal error. Messages are lost. Do not make this, 632 * when peer was not connected to us. 633 */ 634 if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) { 635 WRITE_ONCE(other->sk_err, ECONNRESET); 636 sk_error_report(other); 637 } 638 } 639 } 640 641 static void unix_sock_destructor(struct sock *sk) 642 { 643 struct unix_sock *u = unix_sk(sk); 644 645 skb_queue_purge_reason(&sk->sk_receive_queue, SKB_DROP_REASON_SOCKET_CLOSE); 646 647 DEBUG_NET_WARN_ON_ONCE(refcount_read(&sk->sk_wmem_alloc)); 648 DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk)); 649 DEBUG_NET_WARN_ON_ONCE(sk->sk_socket); 650 if (!sock_flag(sk, SOCK_DEAD)) { 651 pr_info("Attempt to release alive unix socket: %p\n", sk); 652 return; 653 } 654 655 if (u->addr) 656 unix_release_addr(u->addr); 657 658 atomic_long_dec(&unix_nr_socks); 659 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); 660 #ifdef UNIX_REFCNT_DEBUG 661 pr_debug("UNIX %p is destroyed, %ld are still alive.\n", sk, 662 atomic_long_read(&unix_nr_socks)); 663 #endif 664 } 665 666 static void unix_release_sock(struct sock *sk, int embrion) 667 { 668 struct unix_sock *u = unix_sk(sk); 669 struct sock *skpair; 670 struct sk_buff *skb; 671 struct path path; 672 int state; 673 674 unix_remove_socket(sock_net(sk), sk); 675 unix_remove_bsd_socket(sk); 676 677 /* Clear state */ 678 unix_state_lock(sk); 679 sock_orphan(sk); 680 WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK); 681 path = u->path; 682 u->path.dentry = NULL; 683 u->path.mnt = NULL; 684 state = sk->sk_state; 685 WRITE_ONCE(sk->sk_state, TCP_CLOSE); 686 687 skpair = unix_peer(sk); 688 unix_peer(sk) = NULL; 689 690 unix_state_unlock(sk); 691 692 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 693 u->oob_skb = NULL; 694 #endif 695 696 wake_up_interruptible_all(&u->peer_wait); 697 698 if (skpair != NULL) { 699 if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) { 700 unix_state_lock(skpair); 701 /* No more writes */ 702 WRITE_ONCE(skpair->sk_shutdown, SHUTDOWN_MASK); 703 if (!skb_queue_empty_lockless(&sk->sk_receive_queue) || embrion) 704 WRITE_ONCE(skpair->sk_err, ECONNRESET); 705 unix_state_unlock(skpair); 706 skpair->sk_state_change(skpair); 707 sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP); 708 } 709 710 unix_dgram_peer_wake_disconnect(sk, skpair); 711 sock_put(skpair); /* It may now die */ 712 } 713 714 /* Try to flush out this socket. Throw out buffers at least */ 715 716 while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) { 717 if (state == TCP_LISTEN) 718 unix_release_sock(skb->sk, 1); 719 720 /* passed fds are erased in the kfree_skb hook */ 721 kfree_skb_reason(skb, SKB_DROP_REASON_SOCKET_CLOSE); 722 } 723 724 if (path.dentry) 725 path_put(&path); 726 727 sock_put(sk); 728 729 /* ---- Socket is dead now and most probably destroyed ---- */ 730 731 /* 732 * Fixme: BSD difference: In BSD all sockets connected to us get 733 * ECONNRESET and we die on the spot. In Linux we behave 734 * like files and pipes do and wait for the last 735 * dereference. 736 * 737 * Can't we simply set sock->err? 738 * 739 * What the above comment does talk about? 
--ANK(980817) 740 */ 741 742 if (READ_ONCE(unix_tot_inflight)) 743 unix_gc(); /* Garbage collect fds */ 744 } 745 746 static void init_peercred(struct sock *sk) 747 { 748 sk->sk_peer_pid = get_pid(task_tgid(current)); 749 sk->sk_peer_cred = get_current_cred(); 750 } 751 752 static void update_peercred(struct sock *sk) 753 { 754 const struct cred *old_cred; 755 struct pid *old_pid; 756 757 spin_lock(&sk->sk_peer_lock); 758 old_pid = sk->sk_peer_pid; 759 old_cred = sk->sk_peer_cred; 760 init_peercred(sk); 761 spin_unlock(&sk->sk_peer_lock); 762 763 put_pid(old_pid); 764 put_cred(old_cred); 765 } 766 767 static void copy_peercred(struct sock *sk, struct sock *peersk) 768 { 769 lockdep_assert_held(&unix_sk(peersk)->lock); 770 771 spin_lock(&sk->sk_peer_lock); 772 sk->sk_peer_pid = get_pid(peersk->sk_peer_pid); 773 sk->sk_peer_cred = get_cred(peersk->sk_peer_cred); 774 spin_unlock(&sk->sk_peer_lock); 775 } 776 777 static int unix_listen(struct socket *sock, int backlog) 778 { 779 int err; 780 struct sock *sk = sock->sk; 781 struct unix_sock *u = unix_sk(sk); 782 783 err = -EOPNOTSUPP; 784 if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET) 785 goto out; /* Only stream/seqpacket sockets accept */ 786 err = -EINVAL; 787 if (!READ_ONCE(u->addr)) 788 goto out; /* No listens on an unbound socket */ 789 unix_state_lock(sk); 790 if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN) 791 goto out_unlock; 792 if (backlog > sk->sk_max_ack_backlog) 793 wake_up_interruptible_all(&u->peer_wait); 794 sk->sk_max_ack_backlog = backlog; 795 WRITE_ONCE(sk->sk_state, TCP_LISTEN); 796 797 /* set credentials so connect can copy them */ 798 update_peercred(sk); 799 err = 0; 800 801 out_unlock: 802 unix_state_unlock(sk); 803 out: 804 return err; 805 } 806 807 static int unix_release(struct socket *); 808 static int unix_bind(struct socket *, struct sockaddr *, int); 809 static int unix_stream_connect(struct socket *, struct sockaddr *, 810 int addr_len, int flags); 811 static int unix_socketpair(struct socket *, struct socket *); 812 static int unix_accept(struct socket *, struct socket *, struct proto_accept_arg *arg); 813 static int unix_getname(struct socket *, struct sockaddr *, int); 814 static __poll_t unix_poll(struct file *, struct socket *, poll_table *); 815 static __poll_t unix_dgram_poll(struct file *, struct socket *, 816 poll_table *); 817 static int unix_ioctl(struct socket *, unsigned int, unsigned long); 818 #ifdef CONFIG_COMPAT 819 static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg); 820 #endif 821 static int unix_shutdown(struct socket *, int); 822 static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t); 823 static int unix_stream_recvmsg(struct socket *, struct msghdr *, size_t, int); 824 static ssize_t unix_stream_splice_read(struct socket *, loff_t *ppos, 825 struct pipe_inode_info *, size_t size, 826 unsigned int flags); 827 static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t); 828 static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int); 829 static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor); 830 static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor); 831 static int unix_dgram_connect(struct socket *, struct sockaddr *, 832 int, int); 833 static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t); 834 static int unix_seqpacket_recvmsg(struct socket *, struct msghdr *, size_t, 835 int); 836 837 #ifdef CONFIG_PROC_FS 838 static 
int unix_count_nr_fds(struct sock *sk) 839 { 840 struct sk_buff *skb; 841 struct unix_sock *u; 842 int nr_fds = 0; 843 844 spin_lock(&sk->sk_receive_queue.lock); 845 skb = skb_peek(&sk->sk_receive_queue); 846 while (skb) { 847 u = unix_sk(skb->sk); 848 nr_fds += atomic_read(&u->scm_stat.nr_fds); 849 skb = skb_peek_next(skb, &sk->sk_receive_queue); 850 } 851 spin_unlock(&sk->sk_receive_queue.lock); 852 853 return nr_fds; 854 } 855 856 static void unix_show_fdinfo(struct seq_file *m, struct socket *sock) 857 { 858 struct sock *sk = sock->sk; 859 unsigned char s_state; 860 struct unix_sock *u; 861 int nr_fds = 0; 862 863 if (sk) { 864 s_state = READ_ONCE(sk->sk_state); 865 u = unix_sk(sk); 866 867 /* SOCK_STREAM and SOCK_SEQPACKET sockets never change their 868 * sk_state after switching to TCP_ESTABLISHED or TCP_LISTEN. 869 * SOCK_DGRAM is ordinary. So, no lock is needed. 870 */ 871 if (sock->type == SOCK_DGRAM || s_state == TCP_ESTABLISHED) 872 nr_fds = atomic_read(&u->scm_stat.nr_fds); 873 else if (s_state == TCP_LISTEN) 874 nr_fds = unix_count_nr_fds(sk); 875 876 seq_printf(m, "scm_fds: %u\n", nr_fds); 877 } 878 } 879 #else 880 #define unix_show_fdinfo NULL 881 #endif 882 883 static const struct proto_ops unix_stream_ops = { 884 .family = PF_UNIX, 885 .owner = THIS_MODULE, 886 .release = unix_release, 887 .bind = unix_bind, 888 .connect = unix_stream_connect, 889 .socketpair = unix_socketpair, 890 .accept = unix_accept, 891 .getname = unix_getname, 892 .poll = unix_poll, 893 .ioctl = unix_ioctl, 894 #ifdef CONFIG_COMPAT 895 .compat_ioctl = unix_compat_ioctl, 896 #endif 897 .listen = unix_listen, 898 .shutdown = unix_shutdown, 899 .sendmsg = unix_stream_sendmsg, 900 .recvmsg = unix_stream_recvmsg, 901 .read_skb = unix_stream_read_skb, 902 .mmap = sock_no_mmap, 903 .splice_read = unix_stream_splice_read, 904 .set_peek_off = sk_set_peek_off, 905 .show_fdinfo = unix_show_fdinfo, 906 }; 907 908 static const struct proto_ops unix_dgram_ops = { 909 .family = PF_UNIX, 910 .owner = THIS_MODULE, 911 .release = unix_release, 912 .bind = unix_bind, 913 .connect = unix_dgram_connect, 914 .socketpair = unix_socketpair, 915 .accept = sock_no_accept, 916 .getname = unix_getname, 917 .poll = unix_dgram_poll, 918 .ioctl = unix_ioctl, 919 #ifdef CONFIG_COMPAT 920 .compat_ioctl = unix_compat_ioctl, 921 #endif 922 .listen = sock_no_listen, 923 .shutdown = unix_shutdown, 924 .sendmsg = unix_dgram_sendmsg, 925 .read_skb = unix_read_skb, 926 .recvmsg = unix_dgram_recvmsg, 927 .mmap = sock_no_mmap, 928 .set_peek_off = sk_set_peek_off, 929 .show_fdinfo = unix_show_fdinfo, 930 }; 931 932 static const struct proto_ops unix_seqpacket_ops = { 933 .family = PF_UNIX, 934 .owner = THIS_MODULE, 935 .release = unix_release, 936 .bind = unix_bind, 937 .connect = unix_stream_connect, 938 .socketpair = unix_socketpair, 939 .accept = unix_accept, 940 .getname = unix_getname, 941 .poll = unix_dgram_poll, 942 .ioctl = unix_ioctl, 943 #ifdef CONFIG_COMPAT 944 .compat_ioctl = unix_compat_ioctl, 945 #endif 946 .listen = unix_listen, 947 .shutdown = unix_shutdown, 948 .sendmsg = unix_seqpacket_sendmsg, 949 .recvmsg = unix_seqpacket_recvmsg, 950 .mmap = sock_no_mmap, 951 .set_peek_off = sk_set_peek_off, 952 .show_fdinfo = unix_show_fdinfo, 953 }; 954 955 static void unix_close(struct sock *sk, long timeout) 956 { 957 /* Nothing to do here, unix socket does not need a ->close(). 958 * This is merely for sockmap. 
959 */ 960 } 961 962 static void unix_unhash(struct sock *sk) 963 { 964 /* Nothing to do here, unix socket does not need a ->unhash(). 965 * This is merely for sockmap. 966 */ 967 } 968 969 static bool unix_bpf_bypass_getsockopt(int level, int optname) 970 { 971 if (level == SOL_SOCKET) { 972 switch (optname) { 973 case SO_PEERPIDFD: 974 return true; 975 default: 976 return false; 977 } 978 } 979 980 return false; 981 } 982 983 struct proto unix_dgram_proto = { 984 .name = "UNIX", 985 .owner = THIS_MODULE, 986 .obj_size = sizeof(struct unix_sock), 987 .close = unix_close, 988 .bpf_bypass_getsockopt = unix_bpf_bypass_getsockopt, 989 #ifdef CONFIG_BPF_SYSCALL 990 .psock_update_sk_prot = unix_dgram_bpf_update_proto, 991 #endif 992 }; 993 994 struct proto unix_stream_proto = { 995 .name = "UNIX-STREAM", 996 .owner = THIS_MODULE, 997 .obj_size = sizeof(struct unix_sock), 998 .close = unix_close, 999 .unhash = unix_unhash, 1000 .bpf_bypass_getsockopt = unix_bpf_bypass_getsockopt, 1001 #ifdef CONFIG_BPF_SYSCALL 1002 .psock_update_sk_prot = unix_stream_bpf_update_proto, 1003 #endif 1004 }; 1005 1006 static struct sock *unix_create1(struct net *net, struct socket *sock, int kern, int type) 1007 { 1008 struct unix_sock *u; 1009 struct sock *sk; 1010 int err; 1011 1012 atomic_long_inc(&unix_nr_socks); 1013 if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files()) { 1014 err = -ENFILE; 1015 goto err; 1016 } 1017 1018 if (type == SOCK_STREAM) 1019 sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_stream_proto, kern); 1020 else /*dgram and seqpacket */ 1021 sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_dgram_proto, kern); 1022 1023 if (!sk) { 1024 err = -ENOMEM; 1025 goto err; 1026 } 1027 1028 sock_init_data(sock, sk); 1029 1030 sk->sk_hash = unix_unbound_hash(sk); 1031 sk->sk_allocation = GFP_KERNEL_ACCOUNT; 1032 sk->sk_write_space = unix_write_space; 1033 sk->sk_max_ack_backlog = READ_ONCE(net->unx.sysctl_max_dgram_qlen); 1034 sk->sk_destruct = unix_sock_destructor; 1035 lock_set_cmp_fn(&sk->sk_receive_queue.lock, unix_recvq_lock_cmp_fn, NULL); 1036 1037 u = unix_sk(sk); 1038 u->listener = NULL; 1039 u->vertex = NULL; 1040 u->path.dentry = NULL; 1041 u->path.mnt = NULL; 1042 spin_lock_init(&u->lock); 1043 lock_set_cmp_fn(&u->lock, unix_state_lock_cmp_fn, NULL); 1044 mutex_init(&u->iolock); /* single task reading lock */ 1045 mutex_init(&u->bindlock); /* single task binding lock */ 1046 init_waitqueue_head(&u->peer_wait); 1047 init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay); 1048 memset(&u->scm_stat, 0, sizeof(struct scm_stat)); 1049 unix_insert_unbound_socket(net, sk); 1050 1051 sock_prot_inuse_add(net, sk->sk_prot, 1); 1052 1053 return sk; 1054 1055 err: 1056 atomic_long_dec(&unix_nr_socks); 1057 return ERR_PTR(err); 1058 } 1059 1060 static int unix_create(struct net *net, struct socket *sock, int protocol, 1061 int kern) 1062 { 1063 struct sock *sk; 1064 1065 if (protocol && protocol != PF_UNIX) 1066 return -EPROTONOSUPPORT; 1067 1068 sock->state = SS_UNCONNECTED; 1069 1070 switch (sock->type) { 1071 case SOCK_STREAM: 1072 sock->ops = &unix_stream_ops; 1073 break; 1074 /* 1075 * Believe it or not BSD has AF_UNIX, SOCK_RAW though 1076 * nothing uses it. 
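	 *	Here it is simply accepted and treated as SOCK_DGRAM,
	 *	see the fallthrough below.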
1077 */ 1078 case SOCK_RAW: 1079 sock->type = SOCK_DGRAM; 1080 fallthrough; 1081 case SOCK_DGRAM: 1082 sock->ops = &unix_dgram_ops; 1083 break; 1084 case SOCK_SEQPACKET: 1085 sock->ops = &unix_seqpacket_ops; 1086 break; 1087 default: 1088 return -ESOCKTNOSUPPORT; 1089 } 1090 1091 sk = unix_create1(net, sock, kern, sock->type); 1092 if (IS_ERR(sk)) 1093 return PTR_ERR(sk); 1094 1095 return 0; 1096 } 1097 1098 static int unix_release(struct socket *sock) 1099 { 1100 struct sock *sk = sock->sk; 1101 1102 if (!sk) 1103 return 0; 1104 1105 sk->sk_prot->close(sk, 0); 1106 unix_release_sock(sk, 0); 1107 sock->sk = NULL; 1108 1109 return 0; 1110 } 1111 1112 static struct sock *unix_find_bsd(struct sockaddr_un *sunaddr, int addr_len, 1113 int type) 1114 { 1115 struct inode *inode; 1116 struct path path; 1117 struct sock *sk; 1118 int err; 1119 1120 unix_mkname_bsd(sunaddr, addr_len); 1121 err = kern_path(sunaddr->sun_path, LOOKUP_FOLLOW, &path); 1122 if (err) 1123 goto fail; 1124 1125 err = path_permission(&path, MAY_WRITE); 1126 if (err) 1127 goto path_put; 1128 1129 err = -ECONNREFUSED; 1130 inode = d_backing_inode(path.dentry); 1131 if (!S_ISSOCK(inode->i_mode)) 1132 goto path_put; 1133 1134 sk = unix_find_socket_byinode(inode); 1135 if (!sk) 1136 goto path_put; 1137 1138 err = -EPROTOTYPE; 1139 if (sk->sk_type == type) 1140 touch_atime(&path); 1141 else 1142 goto sock_put; 1143 1144 path_put(&path); 1145 1146 return sk; 1147 1148 sock_put: 1149 sock_put(sk); 1150 path_put: 1151 path_put(&path); 1152 fail: 1153 return ERR_PTR(err); 1154 } 1155 1156 static struct sock *unix_find_abstract(struct net *net, 1157 struct sockaddr_un *sunaddr, 1158 int addr_len, int type) 1159 { 1160 unsigned int hash = unix_abstract_hash(sunaddr, addr_len, type); 1161 struct dentry *dentry; 1162 struct sock *sk; 1163 1164 sk = unix_find_socket_byname(net, sunaddr, addr_len, hash); 1165 if (!sk) 1166 return ERR_PTR(-ECONNREFUSED); 1167 1168 dentry = unix_sk(sk)->path.dentry; 1169 if (dentry) 1170 touch_atime(&unix_sk(sk)->path); 1171 1172 return sk; 1173 } 1174 1175 static struct sock *unix_find_other(struct net *net, 1176 struct sockaddr_un *sunaddr, 1177 int addr_len, int type) 1178 { 1179 struct sock *sk; 1180 1181 if (sunaddr->sun_path[0]) 1182 sk = unix_find_bsd(sunaddr, addr_len, type); 1183 else 1184 sk = unix_find_abstract(net, sunaddr, addr_len, type); 1185 1186 return sk; 1187 } 1188 1189 static int unix_autobind(struct sock *sk) 1190 { 1191 struct unix_sock *u = unix_sk(sk); 1192 unsigned int new_hash, old_hash; 1193 struct net *net = sock_net(sk); 1194 struct unix_address *addr; 1195 u32 lastnum, ordernum; 1196 int err; 1197 1198 err = mutex_lock_interruptible(&u->bindlock); 1199 if (err) 1200 return err; 1201 1202 if (u->addr) 1203 goto out; 1204 1205 err = -ENOMEM; 1206 addr = kzalloc(sizeof(*addr) + 1207 offsetof(struct sockaddr_un, sun_path) + 16, GFP_KERNEL); 1208 if (!addr) 1209 goto out; 1210 1211 addr->len = offsetof(struct sockaddr_un, sun_path) + 6; 1212 addr->name->sun_family = AF_UNIX; 1213 refcount_set(&addr->refcnt, 1); 1214 1215 old_hash = sk->sk_hash; 1216 ordernum = get_random_u32(); 1217 lastnum = ordernum & 0xFFFFF; 1218 retry: 1219 ordernum = (ordernum + 1) & 0xFFFFF; 1220 sprintf(addr->name->sun_path + 1, "%05x", ordernum); 1221 1222 new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type); 1223 unix_table_double_lock(net, old_hash, new_hash); 1224 1225 if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash)) { 1226 unix_table_double_unlock(net, old_hash, new_hash); 
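		/* Another socket already autobound to this name. Autobound
		 * names are a leading NUL plus five hex digits ("%05x" above),
		 * so at most 0x100000 candidates exist before -ENOSPC below.
		 */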
1227 1228 /* __unix_find_socket_byname() may take long time if many names 1229 * are already in use. 1230 */ 1231 cond_resched(); 1232 1233 if (ordernum == lastnum) { 1234 /* Give up if all names seems to be in use. */ 1235 err = -ENOSPC; 1236 unix_release_addr(addr); 1237 goto out; 1238 } 1239 1240 goto retry; 1241 } 1242 1243 __unix_set_addr_hash(net, sk, addr, new_hash); 1244 unix_table_double_unlock(net, old_hash, new_hash); 1245 err = 0; 1246 1247 out: mutex_unlock(&u->bindlock); 1248 return err; 1249 } 1250 1251 static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr, 1252 int addr_len) 1253 { 1254 umode_t mode = S_IFSOCK | 1255 (SOCK_INODE(sk->sk_socket)->i_mode & ~current_umask()); 1256 struct unix_sock *u = unix_sk(sk); 1257 unsigned int new_hash, old_hash; 1258 struct net *net = sock_net(sk); 1259 struct mnt_idmap *idmap; 1260 struct unix_address *addr; 1261 struct dentry *dentry; 1262 struct path parent; 1263 int err; 1264 1265 addr_len = unix_mkname_bsd(sunaddr, addr_len); 1266 addr = unix_create_addr(sunaddr, addr_len); 1267 if (!addr) 1268 return -ENOMEM; 1269 1270 /* 1271 * Get the parent directory, calculate the hash for last 1272 * component. 1273 */ 1274 dentry = kern_path_create(AT_FDCWD, addr->name->sun_path, &parent, 0); 1275 if (IS_ERR(dentry)) { 1276 err = PTR_ERR(dentry); 1277 goto out; 1278 } 1279 1280 /* 1281 * All right, let's create it. 1282 */ 1283 idmap = mnt_idmap(parent.mnt); 1284 err = security_path_mknod(&parent, dentry, mode, 0); 1285 if (!err) 1286 err = vfs_mknod(idmap, d_inode(parent.dentry), dentry, mode, 0); 1287 if (err) 1288 goto out_path; 1289 err = mutex_lock_interruptible(&u->bindlock); 1290 if (err) 1291 goto out_unlink; 1292 if (u->addr) 1293 goto out_unlock; 1294 1295 old_hash = sk->sk_hash; 1296 new_hash = unix_bsd_hash(d_backing_inode(dentry)); 1297 unix_table_double_lock(net, old_hash, new_hash); 1298 u->path.mnt = mntget(parent.mnt); 1299 u->path.dentry = dget(dentry); 1300 __unix_set_addr_hash(net, sk, addr, new_hash); 1301 unix_table_double_unlock(net, old_hash, new_hash); 1302 unix_insert_bsd_socket(sk); 1303 mutex_unlock(&u->bindlock); 1304 done_path_create(&parent, dentry); 1305 return 0; 1306 1307 out_unlock: 1308 mutex_unlock(&u->bindlock); 1309 err = -EINVAL; 1310 out_unlink: 1311 /* failed after successful mknod? unlink what we'd created... */ 1312 vfs_unlink(idmap, d_inode(parent.dentry), dentry, NULL); 1313 out_path: 1314 done_path_create(&parent, dentry); 1315 out: 1316 unix_release_addr(addr); 1317 return err == -EEXIST ? 
-EADDRINUSE : err; 1318 } 1319 1320 static int unix_bind_abstract(struct sock *sk, struct sockaddr_un *sunaddr, 1321 int addr_len) 1322 { 1323 struct unix_sock *u = unix_sk(sk); 1324 unsigned int new_hash, old_hash; 1325 struct net *net = sock_net(sk); 1326 struct unix_address *addr; 1327 int err; 1328 1329 addr = unix_create_addr(sunaddr, addr_len); 1330 if (!addr) 1331 return -ENOMEM; 1332 1333 err = mutex_lock_interruptible(&u->bindlock); 1334 if (err) 1335 goto out; 1336 1337 if (u->addr) { 1338 err = -EINVAL; 1339 goto out_mutex; 1340 } 1341 1342 old_hash = sk->sk_hash; 1343 new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type); 1344 unix_table_double_lock(net, old_hash, new_hash); 1345 1346 if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash)) 1347 goto out_spin; 1348 1349 __unix_set_addr_hash(net, sk, addr, new_hash); 1350 unix_table_double_unlock(net, old_hash, new_hash); 1351 mutex_unlock(&u->bindlock); 1352 return 0; 1353 1354 out_spin: 1355 unix_table_double_unlock(net, old_hash, new_hash); 1356 err = -EADDRINUSE; 1357 out_mutex: 1358 mutex_unlock(&u->bindlock); 1359 out: 1360 unix_release_addr(addr); 1361 return err; 1362 } 1363 1364 static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) 1365 { 1366 struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr; 1367 struct sock *sk = sock->sk; 1368 int err; 1369 1370 if (addr_len == offsetof(struct sockaddr_un, sun_path) && 1371 sunaddr->sun_family == AF_UNIX) 1372 return unix_autobind(sk); 1373 1374 err = unix_validate_addr(sunaddr, addr_len); 1375 if (err) 1376 return err; 1377 1378 if (sunaddr->sun_path[0]) 1379 err = unix_bind_bsd(sk, sunaddr, addr_len); 1380 else 1381 err = unix_bind_abstract(sk, sunaddr, addr_len); 1382 1383 return err; 1384 } 1385 1386 static void unix_state_double_lock(struct sock *sk1, struct sock *sk2) 1387 { 1388 if (unlikely(sk1 == sk2) || !sk2) { 1389 unix_state_lock(sk1); 1390 return; 1391 } 1392 1393 if (sk1 > sk2) 1394 swap(sk1, sk2); 1395 1396 unix_state_lock(sk1); 1397 unix_state_lock(sk2); 1398 } 1399 1400 static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2) 1401 { 1402 if (unlikely(sk1 == sk2) || !sk2) { 1403 unix_state_unlock(sk1); 1404 return; 1405 } 1406 unix_state_unlock(sk1); 1407 unix_state_unlock(sk2); 1408 } 1409 1410 static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr, 1411 int alen, int flags) 1412 { 1413 struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr; 1414 struct sock *sk = sock->sk; 1415 struct sock *other; 1416 int err; 1417 1418 err = -EINVAL; 1419 if (alen < offsetofend(struct sockaddr, sa_family)) 1420 goto out; 1421 1422 if (addr->sa_family != AF_UNSPEC) { 1423 err = unix_validate_addr(sunaddr, alen); 1424 if (err) 1425 goto out; 1426 1427 err = BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk, addr, &alen); 1428 if (err) 1429 goto out; 1430 1431 if ((test_bit(SOCK_PASSCRED, &sock->flags) || 1432 test_bit(SOCK_PASSPIDFD, &sock->flags)) && 1433 !READ_ONCE(unix_sk(sk)->addr)) { 1434 err = unix_autobind(sk); 1435 if (err) 1436 goto out; 1437 } 1438 1439 restart: 1440 other = unix_find_other(sock_net(sk), sunaddr, alen, sock->type); 1441 if (IS_ERR(other)) { 1442 err = PTR_ERR(other); 1443 goto out; 1444 } 1445 1446 unix_state_double_lock(sk, other); 1447 1448 /* Apparently VFS overslept socket death. Retry. 
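		 * unix_find_other() can return a socket that unix_release_sock()
		 * has already marked SOCK_DEAD; drop the reference and look it
		 * up again.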
*/ 1449 if (sock_flag(other, SOCK_DEAD)) { 1450 unix_state_double_unlock(sk, other); 1451 sock_put(other); 1452 goto restart; 1453 } 1454 1455 err = -EPERM; 1456 if (!unix_may_send(sk, other)) 1457 goto out_unlock; 1458 1459 err = security_unix_may_send(sk->sk_socket, other->sk_socket); 1460 if (err) 1461 goto out_unlock; 1462 1463 WRITE_ONCE(sk->sk_state, TCP_ESTABLISHED); 1464 WRITE_ONCE(other->sk_state, TCP_ESTABLISHED); 1465 } else { 1466 /* 1467 * 1003.1g breaking connected state with AF_UNSPEC 1468 */ 1469 other = NULL; 1470 unix_state_double_lock(sk, other); 1471 } 1472 1473 /* 1474 * If it was connected, reconnect. 1475 */ 1476 if (unix_peer(sk)) { 1477 struct sock *old_peer = unix_peer(sk); 1478 1479 unix_peer(sk) = other; 1480 if (!other) 1481 WRITE_ONCE(sk->sk_state, TCP_CLOSE); 1482 unix_dgram_peer_wake_disconnect_wakeup(sk, old_peer); 1483 1484 unix_state_double_unlock(sk, other); 1485 1486 if (other != old_peer) { 1487 unix_dgram_disconnected(sk, old_peer); 1488 1489 unix_state_lock(old_peer); 1490 if (!unix_peer(old_peer)) 1491 WRITE_ONCE(old_peer->sk_state, TCP_CLOSE); 1492 unix_state_unlock(old_peer); 1493 } 1494 1495 sock_put(old_peer); 1496 } else { 1497 unix_peer(sk) = other; 1498 unix_state_double_unlock(sk, other); 1499 } 1500 1501 return 0; 1502 1503 out_unlock: 1504 unix_state_double_unlock(sk, other); 1505 sock_put(other); 1506 out: 1507 return err; 1508 } 1509 1510 static long unix_wait_for_peer(struct sock *other, long timeo) 1511 __releases(&unix_sk(other)->lock) 1512 { 1513 struct unix_sock *u = unix_sk(other); 1514 int sched; 1515 DEFINE_WAIT(wait); 1516 1517 prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE); 1518 1519 sched = !sock_flag(other, SOCK_DEAD) && 1520 !(other->sk_shutdown & RCV_SHUTDOWN) && 1521 unix_recvq_full_lockless(other); 1522 1523 unix_state_unlock(other); 1524 1525 if (sched) 1526 timeo = schedule_timeout(timeo); 1527 1528 finish_wait(&u->peer_wait, &wait); 1529 return timeo; 1530 } 1531 1532 static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr, 1533 int addr_len, int flags) 1534 { 1535 struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr; 1536 struct sock *sk = sock->sk, *newsk = NULL, *other = NULL; 1537 struct unix_sock *u = unix_sk(sk), *newu, *otheru; 1538 struct net *net = sock_net(sk); 1539 struct sk_buff *skb = NULL; 1540 unsigned char state; 1541 long timeo; 1542 int err; 1543 1544 err = unix_validate_addr(sunaddr, addr_len); 1545 if (err) 1546 goto out; 1547 1548 err = BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk, uaddr, &addr_len); 1549 if (err) 1550 goto out; 1551 1552 if ((test_bit(SOCK_PASSCRED, &sock->flags) || 1553 test_bit(SOCK_PASSPIDFD, &sock->flags)) && 1554 !READ_ONCE(u->addr)) { 1555 err = unix_autobind(sk); 1556 if (err) 1557 goto out; 1558 } 1559 1560 timeo = sock_sndtimeo(sk, flags & O_NONBLOCK); 1561 1562 /* First of all allocate resources. 1563 * If we will make it after state is locked, 1564 * we will have to recheck all again in any case. 1565 */ 1566 1567 /* create new sock for complete connection */ 1568 newsk = unix_create1(net, NULL, 0, sock->type); 1569 if (IS_ERR(newsk)) { 1570 err = PTR_ERR(newsk); 1571 goto out; 1572 } 1573 1574 /* Allocate skb for sending to listening sock */ 1575 skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL); 1576 if (!skb) { 1577 err = -ENOMEM; 1578 goto out_free_sk; 1579 } 1580 1581 restart: 1582 /* Find listening sock. 
*/ 1583 other = unix_find_other(net, sunaddr, addr_len, sk->sk_type); 1584 if (IS_ERR(other)) { 1585 err = PTR_ERR(other); 1586 goto out_free_skb; 1587 } 1588 1589 unix_state_lock(other); 1590 1591 /* Apparently VFS overslept socket death. Retry. */ 1592 if (sock_flag(other, SOCK_DEAD)) { 1593 unix_state_unlock(other); 1594 sock_put(other); 1595 goto restart; 1596 } 1597 1598 if (other->sk_state != TCP_LISTEN || 1599 other->sk_shutdown & RCV_SHUTDOWN) { 1600 err = -ECONNREFUSED; 1601 goto out_unlock; 1602 } 1603 1604 if (unix_recvq_full_lockless(other)) { 1605 if (!timeo) { 1606 err = -EAGAIN; 1607 goto out_unlock; 1608 } 1609 1610 timeo = unix_wait_for_peer(other, timeo); 1611 sock_put(other); 1612 1613 err = sock_intr_errno(timeo); 1614 if (signal_pending(current)) 1615 goto out_free_skb; 1616 1617 goto restart; 1618 } 1619 1620 /* self connect and simultaneous connect are eliminated 1621 * by rejecting TCP_LISTEN socket to avoid deadlock. 1622 */ 1623 state = READ_ONCE(sk->sk_state); 1624 if (unlikely(state != TCP_CLOSE)) { 1625 err = state == TCP_ESTABLISHED ? -EISCONN : -EINVAL; 1626 goto out_unlock; 1627 } 1628 1629 unix_state_lock(sk); 1630 1631 if (unlikely(sk->sk_state != TCP_CLOSE)) { 1632 err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EINVAL; 1633 unix_state_unlock(sk); 1634 goto out_unlock; 1635 } 1636 1637 err = security_unix_stream_connect(sk, other, newsk); 1638 if (err) { 1639 unix_state_unlock(sk); 1640 goto out_unlock; 1641 } 1642 1643 /* The way is open! Fastly set all the necessary fields... */ 1644 1645 sock_hold(sk); 1646 unix_peer(newsk) = sk; 1647 newsk->sk_state = TCP_ESTABLISHED; 1648 newsk->sk_type = sk->sk_type; 1649 init_peercred(newsk); 1650 newu = unix_sk(newsk); 1651 newu->listener = other; 1652 RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq); 1653 otheru = unix_sk(other); 1654 1655 /* copy address information from listening to new sock 1656 * 1657 * The contents of *(otheru->addr) and otheru->path 1658 * are seen fully set up here, since we have found 1659 * otheru in hash under its lock. Insertion into the 1660 * hash chain we'd found it in had been done in an 1661 * earlier critical area protected by the chain's lock, 1662 * the same one where we'd set *(otheru->addr) contents, 1663 * as well as otheru->path and otheru->addr itself. 1664 * 1665 * Using smp_store_release() here to set newu->addr 1666 * is enough to make those stores, as well as stores 1667 * to newu->path visible to anyone who gets newu->addr 1668 * by smp_load_acquire(). IOW, the same warranties 1669 * as for unix_sock instances bound in unix_bind() or 1670 * in unix_autobind(). 
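	 *
	 * For instance, unix_getname() reads the address with
	 * smp_load_acquire(&unix_sk(sk)->addr) and may then safely look at
	 * addr->len and addr->name.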
1671 */ 1672 if (otheru->path.dentry) { 1673 path_get(&otheru->path); 1674 newu->path = otheru->path; 1675 } 1676 refcount_inc(&otheru->addr->refcnt); 1677 smp_store_release(&newu->addr, otheru->addr); 1678 1679 /* Set credentials */ 1680 copy_peercred(sk, other); 1681 1682 sock->state = SS_CONNECTED; 1683 WRITE_ONCE(sk->sk_state, TCP_ESTABLISHED); 1684 sock_hold(newsk); 1685 1686 smp_mb__after_atomic(); /* sock_hold() does an atomic_inc() */ 1687 unix_peer(sk) = newsk; 1688 1689 unix_state_unlock(sk); 1690 1691 /* take ten and send info to listening sock */ 1692 spin_lock(&other->sk_receive_queue.lock); 1693 __skb_queue_tail(&other->sk_receive_queue, skb); 1694 spin_unlock(&other->sk_receive_queue.lock); 1695 unix_state_unlock(other); 1696 other->sk_data_ready(other); 1697 sock_put(other); 1698 return 0; 1699 1700 out_unlock: 1701 unix_state_unlock(other); 1702 sock_put(other); 1703 out_free_skb: 1704 consume_skb(skb); 1705 out_free_sk: 1706 unix_release_sock(newsk, 0); 1707 out: 1708 return err; 1709 } 1710 1711 static int unix_socketpair(struct socket *socka, struct socket *sockb) 1712 { 1713 struct sock *ska = socka->sk, *skb = sockb->sk; 1714 1715 /* Join our sockets back to back */ 1716 sock_hold(ska); 1717 sock_hold(skb); 1718 unix_peer(ska) = skb; 1719 unix_peer(skb) = ska; 1720 init_peercred(ska); 1721 init_peercred(skb); 1722 1723 ska->sk_state = TCP_ESTABLISHED; 1724 skb->sk_state = TCP_ESTABLISHED; 1725 socka->state = SS_CONNECTED; 1726 sockb->state = SS_CONNECTED; 1727 return 0; 1728 } 1729 1730 static void unix_sock_inherit_flags(const struct socket *old, 1731 struct socket *new) 1732 { 1733 if (test_bit(SOCK_PASSCRED, &old->flags)) 1734 set_bit(SOCK_PASSCRED, &new->flags); 1735 if (test_bit(SOCK_PASSPIDFD, &old->flags)) 1736 set_bit(SOCK_PASSPIDFD, &new->flags); 1737 if (test_bit(SOCK_PASSSEC, &old->flags)) 1738 set_bit(SOCK_PASSSEC, &new->flags); 1739 } 1740 1741 static int unix_accept(struct socket *sock, struct socket *newsock, 1742 struct proto_accept_arg *arg) 1743 { 1744 struct sock *sk = sock->sk; 1745 struct sk_buff *skb; 1746 struct sock *tsk; 1747 1748 arg->err = -EOPNOTSUPP; 1749 if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET) 1750 goto out; 1751 1752 arg->err = -EINVAL; 1753 if (READ_ONCE(sk->sk_state) != TCP_LISTEN) 1754 goto out; 1755 1756 /* If socket state is TCP_LISTEN it cannot change (for now...), 1757 * so that no locks are necessary. 1758 */ 1759 1760 skb = skb_recv_datagram(sk, (arg->flags & O_NONBLOCK) ? MSG_DONTWAIT : 0, 1761 &arg->err); 1762 if (!skb) { 1763 /* This means receive shutdown. 
*/ 1764 if (arg->err == 0) 1765 arg->err = -EINVAL; 1766 goto out; 1767 } 1768 1769 tsk = skb->sk; 1770 skb_free_datagram(sk, skb); 1771 wake_up_interruptible(&unix_sk(sk)->peer_wait); 1772 1773 /* attach accepted sock to socket */ 1774 unix_state_lock(tsk); 1775 unix_update_edges(unix_sk(tsk)); 1776 newsock->state = SS_CONNECTED; 1777 unix_sock_inherit_flags(sock, newsock); 1778 sock_graft(tsk, newsock); 1779 unix_state_unlock(tsk); 1780 return 0; 1781 1782 out: 1783 return arg->err; 1784 } 1785 1786 1787 static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int peer) 1788 { 1789 struct sock *sk = sock->sk; 1790 struct unix_address *addr; 1791 DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr); 1792 int err = 0; 1793 1794 if (peer) { 1795 sk = unix_peer_get(sk); 1796 1797 err = -ENOTCONN; 1798 if (!sk) 1799 goto out; 1800 err = 0; 1801 } else { 1802 sock_hold(sk); 1803 } 1804 1805 addr = smp_load_acquire(&unix_sk(sk)->addr); 1806 if (!addr) { 1807 sunaddr->sun_family = AF_UNIX; 1808 sunaddr->sun_path[0] = 0; 1809 err = offsetof(struct sockaddr_un, sun_path); 1810 } else { 1811 err = addr->len; 1812 memcpy(sunaddr, addr->name, addr->len); 1813 1814 if (peer) 1815 BPF_CGROUP_RUN_SA_PROG(sk, uaddr, &err, 1816 CGROUP_UNIX_GETPEERNAME); 1817 else 1818 BPF_CGROUP_RUN_SA_PROG(sk, uaddr, &err, 1819 CGROUP_UNIX_GETSOCKNAME); 1820 } 1821 sock_put(sk); 1822 out: 1823 return err; 1824 } 1825 1826 /* The "user->unix_inflight" variable is protected by the garbage 1827 * collection lock, and we just read it locklessly here. If you go 1828 * over the limit, there might be a tiny race in actually noticing 1829 * it across threads. Tough. 1830 */ 1831 static inline bool too_many_unix_fds(struct task_struct *p) 1832 { 1833 struct user_struct *user = current_user(); 1834 1835 if (unlikely(READ_ONCE(user->unix_inflight) > task_rlimit(p, RLIMIT_NOFILE))) 1836 return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN); 1837 return false; 1838 } 1839 1840 static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb) 1841 { 1842 if (too_many_unix_fds(current)) 1843 return -ETOOMANYREFS; 1844 1845 UNIXCB(skb).fp = scm->fp; 1846 scm->fp = NULL; 1847 1848 if (unix_prepare_fpl(UNIXCB(skb).fp)) 1849 return -ENOMEM; 1850 1851 return 0; 1852 } 1853 1854 static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb) 1855 { 1856 scm->fp = UNIXCB(skb).fp; 1857 UNIXCB(skb).fp = NULL; 1858 1859 unix_destroy_fpl(scm->fp); 1860 } 1861 1862 static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb) 1863 { 1864 scm->fp = scm_fp_dup(UNIXCB(skb).fp); 1865 } 1866 1867 static void unix_destruct_scm(struct sk_buff *skb) 1868 { 1869 struct scm_cookie scm; 1870 1871 memset(&scm, 0, sizeof(scm)); 1872 scm.pid = UNIXCB(skb).pid; 1873 if (UNIXCB(skb).fp) 1874 unix_detach_fds(&scm, skb); 1875 1876 /* Alas, it calls VFS */ 1877 /* So fscking what? 
fput() had been SMP-safe since the last Summer */ 1878 scm_destroy(&scm); 1879 sock_wfree(skb); 1880 } 1881 1882 static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds) 1883 { 1884 int err = 0; 1885 1886 UNIXCB(skb).pid = get_pid(scm->pid); 1887 UNIXCB(skb).uid = scm->creds.uid; 1888 UNIXCB(skb).gid = scm->creds.gid; 1889 UNIXCB(skb).fp = NULL; 1890 unix_get_secdata(scm, skb); 1891 if (scm->fp && send_fds) 1892 err = unix_attach_fds(scm, skb); 1893 1894 skb->destructor = unix_destruct_scm; 1895 return err; 1896 } 1897 1898 static bool unix_passcred_enabled(const struct socket *sock, 1899 const struct sock *other) 1900 { 1901 return test_bit(SOCK_PASSCRED, &sock->flags) || 1902 test_bit(SOCK_PASSPIDFD, &sock->flags) || 1903 !other->sk_socket || 1904 test_bit(SOCK_PASSCRED, &other->sk_socket->flags) || 1905 test_bit(SOCK_PASSPIDFD, &other->sk_socket->flags); 1906 } 1907 1908 /* 1909 * Some apps rely on write() giving SCM_CREDENTIALS 1910 * We include credentials if source or destination socket 1911 * asserted SOCK_PASSCRED. 1912 */ 1913 static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock, 1914 const struct sock *other) 1915 { 1916 if (UNIXCB(skb).pid) 1917 return; 1918 if (unix_passcred_enabled(sock, other)) { 1919 UNIXCB(skb).pid = get_pid(task_tgid(current)); 1920 current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid); 1921 } 1922 } 1923 1924 static bool unix_skb_scm_eq(struct sk_buff *skb, 1925 struct scm_cookie *scm) 1926 { 1927 return UNIXCB(skb).pid == scm->pid && 1928 uid_eq(UNIXCB(skb).uid, scm->creds.uid) && 1929 gid_eq(UNIXCB(skb).gid, scm->creds.gid) && 1930 unix_secdata_eq(scm, skb); 1931 } 1932 1933 static void scm_stat_add(struct sock *sk, struct sk_buff *skb) 1934 { 1935 struct scm_fp_list *fp = UNIXCB(skb).fp; 1936 struct unix_sock *u = unix_sk(sk); 1937 1938 if (unlikely(fp && fp->count)) { 1939 atomic_add(fp->count, &u->scm_stat.nr_fds); 1940 unix_add_edges(fp, u); 1941 } 1942 } 1943 1944 static void scm_stat_del(struct sock *sk, struct sk_buff *skb) 1945 { 1946 struct scm_fp_list *fp = UNIXCB(skb).fp; 1947 struct unix_sock *u = unix_sk(sk); 1948 1949 if (unlikely(fp && fp->count)) { 1950 atomic_sub(fp->count, &u->scm_stat.nr_fds); 1951 unix_del_edges(fp); 1952 } 1953 } 1954 1955 /* 1956 * Send AF_UNIX data. 
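 *
 *	unix_dgram_sendmsg() below resolves the destination from msg_name or
 *	from the connected peer, applies receive-queue flow control via
 *	unix_recvq_full_lockless(), and queues the message as a single skb on
 *	the receiver's sk_receive_queue.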
1957 */ 1958 1959 static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg, 1960 size_t len) 1961 { 1962 struct sock *sk = sock->sk, *other = NULL; 1963 struct unix_sock *u = unix_sk(sk); 1964 struct scm_cookie scm; 1965 struct sk_buff *skb; 1966 int data_len = 0; 1967 int sk_locked; 1968 long timeo; 1969 int err; 1970 1971 err = scm_send(sock, msg, &scm, false); 1972 if (err < 0) 1973 return err; 1974 1975 wait_for_unix_gc(scm.fp); 1976 1977 if (msg->msg_flags & MSG_OOB) { 1978 err = -EOPNOTSUPP; 1979 goto out; 1980 } 1981 1982 if (msg->msg_namelen) { 1983 err = unix_validate_addr(msg->msg_name, msg->msg_namelen); 1984 if (err) 1985 goto out; 1986 1987 err = BPF_CGROUP_RUN_PROG_UNIX_SENDMSG_LOCK(sk, 1988 msg->msg_name, 1989 &msg->msg_namelen, 1990 NULL); 1991 if (err) 1992 goto out; 1993 } 1994 1995 if ((test_bit(SOCK_PASSCRED, &sock->flags) || 1996 test_bit(SOCK_PASSPIDFD, &sock->flags)) && 1997 !READ_ONCE(u->addr)) { 1998 err = unix_autobind(sk); 1999 if (err) 2000 goto out; 2001 } 2002 2003 if (len > READ_ONCE(sk->sk_sndbuf) - 32) { 2004 err = -EMSGSIZE; 2005 goto out; 2006 } 2007 2008 if (len > SKB_MAX_ALLOC) { 2009 data_len = min_t(size_t, 2010 len - SKB_MAX_ALLOC, 2011 MAX_SKB_FRAGS * PAGE_SIZE); 2012 data_len = PAGE_ALIGN(data_len); 2013 2014 BUILD_BUG_ON(SKB_MAX_ALLOC < PAGE_SIZE); 2015 } 2016 2017 skb = sock_alloc_send_pskb(sk, len - data_len, data_len, 2018 msg->msg_flags & MSG_DONTWAIT, &err, 2019 PAGE_ALLOC_COSTLY_ORDER); 2020 if (!skb) 2021 goto out; 2022 2023 err = unix_scm_to_skb(&scm, skb, true); 2024 if (err < 0) 2025 goto out_free; 2026 2027 skb_put(skb, len - data_len); 2028 skb->data_len = data_len; 2029 skb->len = len; 2030 err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, len); 2031 if (err) 2032 goto out_free; 2033 2034 timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); 2035 2036 if (msg->msg_namelen) { 2037 lookup: 2038 other = unix_find_other(sock_net(sk), msg->msg_name, 2039 msg->msg_namelen, sk->sk_type); 2040 if (IS_ERR(other)) { 2041 err = PTR_ERR(other); 2042 goto out_free; 2043 } 2044 } else { 2045 other = unix_peer_get(sk); 2046 if (!other) { 2047 err = -ENOTCONN; 2048 goto out_free; 2049 } 2050 } 2051 2052 if (sk_filter(other, skb) < 0) { 2053 /* Toss the packet but do not return any error to the sender */ 2054 err = len; 2055 goto out_sock_put; 2056 } 2057 2058 restart: 2059 sk_locked = 0; 2060 unix_state_lock(other); 2061 restart_locked: 2062 2063 if (!unix_may_send(sk, other)) { 2064 err = -EPERM; 2065 goto out_unlock; 2066 } 2067 2068 if (unlikely(sock_flag(other, SOCK_DEAD))) { 2069 /* Check with 1003.1g - what should datagram error */ 2070 2071 unix_state_unlock(other); 2072 2073 if (sk->sk_type == SOCK_SEQPACKET) { 2074 /* We are here only when racing with unix_release_sock() 2075 * is clearing @other. Never change state to TCP_CLOSE 2076 * unlike SOCK_DGRAM wants. 
2077 */ 2078 err = -EPIPE; 2079 goto out_sock_put; 2080 } 2081 2082 if (!sk_locked) 2083 unix_state_lock(sk); 2084 2085 if (unix_peer(sk) == other) { 2086 unix_peer(sk) = NULL; 2087 unix_dgram_peer_wake_disconnect_wakeup(sk, other); 2088 2089 WRITE_ONCE(sk->sk_state, TCP_CLOSE); 2090 unix_state_unlock(sk); 2091 2092 unix_dgram_disconnected(sk, other); 2093 sock_put(other); 2094 err = -ECONNREFUSED; 2095 goto out_sock_put; 2096 } 2097 2098 unix_state_unlock(sk); 2099 2100 if (!msg->msg_namelen) { 2101 err = -ECONNRESET; 2102 goto out_sock_put; 2103 } 2104 2105 goto lookup; 2106 } 2107 2108 if (other->sk_shutdown & RCV_SHUTDOWN) { 2109 err = -EPIPE; 2110 goto out_unlock; 2111 } 2112 2113 if (sk->sk_type != SOCK_SEQPACKET) { 2114 err = security_unix_may_send(sk->sk_socket, other->sk_socket); 2115 if (err) 2116 goto out_unlock; 2117 } 2118 2119 /* other == sk && unix_peer(other) != sk if 2120 * - unix_peer(sk) == NULL, destination address bound to sk 2121 * - unix_peer(sk) == sk by time of get but disconnected before lock 2122 */ 2123 if (other != sk && 2124 unlikely(unix_peer(other) != sk && 2125 unix_recvq_full_lockless(other))) { 2126 if (timeo) { 2127 timeo = unix_wait_for_peer(other, timeo); 2128 2129 err = sock_intr_errno(timeo); 2130 if (signal_pending(current)) 2131 goto out_sock_put; 2132 2133 goto restart; 2134 } 2135 2136 if (!sk_locked) { 2137 unix_state_unlock(other); 2138 unix_state_double_lock(sk, other); 2139 } 2140 2141 if (unix_peer(sk) != other || 2142 unix_dgram_peer_wake_me(sk, other)) { 2143 err = -EAGAIN; 2144 sk_locked = 1; 2145 goto out_unlock; 2146 } 2147 2148 if (!sk_locked) { 2149 sk_locked = 1; 2150 goto restart_locked; 2151 } 2152 } 2153 2154 if (unlikely(sk_locked)) 2155 unix_state_unlock(sk); 2156 2157 if (sock_flag(other, SOCK_RCVTSTAMP)) 2158 __net_timestamp(skb); 2159 maybe_add_creds(skb, sock, other); 2160 scm_stat_add(other, skb); 2161 skb_queue_tail(&other->sk_receive_queue, skb); 2162 unix_state_unlock(other); 2163 other->sk_data_ready(other); 2164 sock_put(other); 2165 scm_destroy(&scm); 2166 return len; 2167 2168 out_unlock: 2169 if (sk_locked) 2170 unix_state_unlock(sk); 2171 unix_state_unlock(other); 2172 out_sock_put: 2173 sock_put(other); 2174 out_free: 2175 consume_skb(skb); 2176 out: 2177 scm_destroy(&scm); 2178 return err; 2179 } 2180 2181 /* We use paged skbs for stream sockets, and limit occupancy to 32768 2182 * bytes, and a minimum of a full page. 
2183 */ 2184 #define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768)) 2185 2186 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 2187 static int queue_oob(struct socket *sock, struct msghdr *msg, struct sock *other, 2188 struct scm_cookie *scm, bool fds_sent) 2189 { 2190 struct unix_sock *ousk = unix_sk(other); 2191 struct sk_buff *skb; 2192 int err; 2193 2194 skb = sock_alloc_send_skb(sock->sk, 1, msg->msg_flags & MSG_DONTWAIT, &err); 2195 2196 if (!skb) 2197 return err; 2198 2199 err = unix_scm_to_skb(scm, skb, !fds_sent); 2200 if (err < 0) 2201 goto out; 2202 2203 skb_put(skb, 1); 2204 err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, 1); 2205 2206 if (err) 2207 goto out; 2208 2209 unix_state_lock(other); 2210 2211 if (sock_flag(other, SOCK_DEAD) || 2212 (other->sk_shutdown & RCV_SHUTDOWN)) { 2213 unix_state_unlock(other); 2214 err = -EPIPE; 2215 goto out; 2216 } 2217 2218 maybe_add_creds(skb, sock, other); 2219 scm_stat_add(other, skb); 2220 2221 spin_lock(&other->sk_receive_queue.lock); 2222 WRITE_ONCE(ousk->oob_skb, skb); 2223 __skb_queue_tail(&other->sk_receive_queue, skb); 2224 spin_unlock(&other->sk_receive_queue.lock); 2225 2226 sk_send_sigurg(other); 2227 unix_state_unlock(other); 2228 other->sk_data_ready(other); 2229 2230 return 0; 2231 out: 2232 consume_skb(skb); 2233 return err; 2234 } 2235 #endif 2236 2237 static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, 2238 size_t len) 2239 { 2240 struct sock *sk = sock->sk; 2241 struct sk_buff *skb = NULL; 2242 struct sock *other = NULL; 2243 struct scm_cookie scm; 2244 bool fds_sent = false; 2245 int err, sent = 0; 2246 2247 err = scm_send(sock, msg, &scm, false); 2248 if (err < 0) 2249 return err; 2250 2251 wait_for_unix_gc(scm.fp); 2252 2253 if (msg->msg_flags & MSG_OOB) { 2254 err = -EOPNOTSUPP; 2255 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 2256 if (len) 2257 len--; 2258 else 2259 #endif 2260 goto out_err; 2261 } 2262 2263 if (msg->msg_namelen) { 2264 err = READ_ONCE(sk->sk_state) == TCP_ESTABLISHED ? 
-EISCONN : -EOPNOTSUPP; 2265 goto out_err; 2266 } else { 2267 other = unix_peer(sk); 2268 if (!other) { 2269 err = -ENOTCONN; 2270 goto out_err; 2271 } 2272 } 2273 2274 if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN) 2275 goto out_pipe; 2276 2277 while (sent < len) { 2278 int size = len - sent; 2279 int data_len; 2280 2281 if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) { 2282 skb = sock_alloc_send_pskb(sk, 0, 0, 2283 msg->msg_flags & MSG_DONTWAIT, 2284 &err, 0); 2285 } else { 2286 /* Keep two messages in the pipe so it schedules better */ 2287 size = min_t(int, size, (READ_ONCE(sk->sk_sndbuf) >> 1) - 64); 2288 2289 /* allow fallback to order-0 allocations */ 2290 size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ); 2291 2292 data_len = max_t(int, 0, size - SKB_MAX_HEAD(0)); 2293 2294 data_len = min_t(size_t, size, PAGE_ALIGN(data_len)); 2295 2296 skb = sock_alloc_send_pskb(sk, size - data_len, data_len, 2297 msg->msg_flags & MSG_DONTWAIT, &err, 2298 get_order(UNIX_SKB_FRAGS_SZ)); 2299 } 2300 if (!skb) 2301 goto out_err; 2302 2303 /* Only send the fds in the first buffer */ 2304 err = unix_scm_to_skb(&scm, skb, !fds_sent); 2305 if (err < 0) 2306 goto out_free; 2307 2308 fds_sent = true; 2309 2310 if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) { 2311 skb->ip_summed = CHECKSUM_UNNECESSARY; 2312 err = skb_splice_from_iter(skb, &msg->msg_iter, size, 2313 sk->sk_allocation); 2314 if (err < 0) 2315 goto out_free; 2316 2317 size = err; 2318 refcount_add(size, &sk->sk_wmem_alloc); 2319 } else { 2320 skb_put(skb, size - data_len); 2321 skb->data_len = data_len; 2322 skb->len = size; 2323 err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size); 2324 if (err) 2325 goto out_free; 2326 } 2327 2328 unix_state_lock(other); 2329 2330 if (sock_flag(other, SOCK_DEAD) || 2331 (other->sk_shutdown & RCV_SHUTDOWN)) 2332 goto out_pipe_unlock; 2333 2334 maybe_add_creds(skb, sock, other); 2335 scm_stat_add(other, skb); 2336 skb_queue_tail(&other->sk_receive_queue, skb); 2337 unix_state_unlock(other); 2338 other->sk_data_ready(other); 2339 sent += size; 2340 } 2341 2342 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 2343 if (msg->msg_flags & MSG_OOB) { 2344 err = queue_oob(sock, msg, other, &scm, fds_sent); 2345 if (err) 2346 goto out_err; 2347 sent++; 2348 } 2349 #endif 2350 2351 scm_destroy(&scm); 2352 2353 return sent; 2354 2355 out_pipe_unlock: 2356 unix_state_unlock(other); 2357 out_pipe: 2358 if (!sent && !(msg->msg_flags & MSG_NOSIGNAL)) 2359 send_sig(SIGPIPE, current, 0); 2360 err = -EPIPE; 2361 out_free: 2362 consume_skb(skb); 2363 out_err: 2364 scm_destroy(&scm); 2365 return sent ? 
: err; 2366 } 2367 2368 static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg, 2369 size_t len) 2370 { 2371 int err; 2372 struct sock *sk = sock->sk; 2373 2374 err = sock_error(sk); 2375 if (err) 2376 return err; 2377 2378 if (READ_ONCE(sk->sk_state) != TCP_ESTABLISHED) 2379 return -ENOTCONN; 2380 2381 if (msg->msg_namelen) 2382 msg->msg_namelen = 0; 2383 2384 return unix_dgram_sendmsg(sock, msg, len); 2385 } 2386 2387 static int unix_seqpacket_recvmsg(struct socket *sock, struct msghdr *msg, 2388 size_t size, int flags) 2389 { 2390 struct sock *sk = sock->sk; 2391 2392 if (READ_ONCE(sk->sk_state) != TCP_ESTABLISHED) 2393 return -ENOTCONN; 2394 2395 return unix_dgram_recvmsg(sock, msg, size, flags); 2396 } 2397 2398 static void unix_copy_addr(struct msghdr *msg, struct sock *sk) 2399 { 2400 struct unix_address *addr = smp_load_acquire(&unix_sk(sk)->addr); 2401 2402 if (addr) { 2403 msg->msg_namelen = addr->len; 2404 memcpy(msg->msg_name, addr->name, addr->len); 2405 } 2406 } 2407 2408 int __unix_dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t size, 2409 int flags) 2410 { 2411 struct scm_cookie scm; 2412 struct socket *sock = sk->sk_socket; 2413 struct unix_sock *u = unix_sk(sk); 2414 struct sk_buff *skb, *last; 2415 long timeo; 2416 int skip; 2417 int err; 2418 2419 err = -EOPNOTSUPP; 2420 if (flags&MSG_OOB) 2421 goto out; 2422 2423 timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); 2424 2425 do { 2426 mutex_lock(&u->iolock); 2427 2428 skip = sk_peek_offset(sk, flags); 2429 skb = __skb_try_recv_datagram(sk, &sk->sk_receive_queue, flags, 2430 &skip, &err, &last); 2431 if (skb) { 2432 if (!(flags & MSG_PEEK)) 2433 scm_stat_del(sk, skb); 2434 break; 2435 } 2436 2437 mutex_unlock(&u->iolock); 2438 2439 if (err != -EAGAIN) 2440 break; 2441 } while (timeo && 2442 !__skb_wait_for_more_packets(sk, &sk->sk_receive_queue, 2443 &err, &timeo, last)); 2444 2445 if (!skb) { /* implies iolock unlocked */ 2446 unix_state_lock(sk); 2447 /* Signal EOF on disconnected non-blocking SEQPACKET socket. */ 2448 if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN && 2449 (sk->sk_shutdown & RCV_SHUTDOWN)) 2450 err = 0; 2451 unix_state_unlock(sk); 2452 goto out; 2453 } 2454 2455 if (wq_has_sleeper(&u->peer_wait)) 2456 wake_up_interruptible_sync_poll(&u->peer_wait, 2457 EPOLLOUT | EPOLLWRNORM | 2458 EPOLLWRBAND); 2459 2460 if (msg->msg_name) { 2461 unix_copy_addr(msg, skb->sk); 2462 2463 BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk, 2464 msg->msg_name, 2465 &msg->msg_namelen); 2466 } 2467 2468 if (size > skb->len - skip) 2469 size = skb->len - skip; 2470 else if (size < skb->len - skip) 2471 msg->msg_flags |= MSG_TRUNC; 2472 2473 err = skb_copy_datagram_msg(skb, skip, msg, size); 2474 if (err) 2475 goto out_free; 2476 2477 if (sock_flag(sk, SOCK_RCVTSTAMP)) 2478 __sock_recv_timestamp(msg, sk, skb); 2479 2480 memset(&scm, 0, sizeof(scm)); 2481 2482 scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid); 2483 unix_set_secdata(&scm, skb); 2484 2485 if (!(flags & MSG_PEEK)) { 2486 if (UNIXCB(skb).fp) 2487 unix_detach_fds(&scm, skb); 2488 2489 sk_peek_offset_bwd(sk, skb->len); 2490 } else { 2491 /* It is questionable: on PEEK we could: 2492 - do not return fds - good, but too simple 8) 2493 - return fds, and do not return them on read (old strategy, 2494 apparently wrong) 2495 - clone fds (I chose it for now, it is the most universal 2496 solution) 2497 2498 POSIX 1003.1g does not actually define this clearly 2499 at all. 
POSIX 1003.1g doesn't define a lot of things 2500 clearly however! 2501 2502 */ 2503 2504 sk_peek_offset_fwd(sk, size); 2505 2506 if (UNIXCB(skb).fp) 2507 unix_peek_fds(&scm, skb); 2508 } 2509 err = (flags & MSG_TRUNC) ? skb->len - skip : size; 2510 2511 scm_recv_unix(sock, msg, &scm, flags); 2512 2513 out_free: 2514 skb_free_datagram(sk, skb); 2515 mutex_unlock(&u->iolock); 2516 out: 2517 return err; 2518 } 2519 2520 static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, 2521 int flags) 2522 { 2523 struct sock *sk = sock->sk; 2524 2525 #ifdef CONFIG_BPF_SYSCALL 2526 const struct proto *prot = READ_ONCE(sk->sk_prot); 2527 2528 if (prot != &unix_dgram_proto) 2529 return prot->recvmsg(sk, msg, size, flags, NULL); 2530 #endif 2531 return __unix_dgram_recvmsg(sk, msg, size, flags); 2532 } 2533 2534 static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor) 2535 { 2536 struct unix_sock *u = unix_sk(sk); 2537 struct sk_buff *skb; 2538 int err; 2539 2540 mutex_lock(&u->iolock); 2541 skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err); 2542 mutex_unlock(&u->iolock); 2543 if (!skb) 2544 return err; 2545 2546 return recv_actor(sk, skb); 2547 } 2548 2549 /* 2550 * Sleep until more data has arrived. But check for races.. 2551 */ 2552 static long unix_stream_data_wait(struct sock *sk, long timeo, 2553 struct sk_buff *last, unsigned int last_len, 2554 bool freezable) 2555 { 2556 unsigned int state = TASK_INTERRUPTIBLE | freezable * TASK_FREEZABLE; 2557 struct sk_buff *tail; 2558 DEFINE_WAIT(wait); 2559 2560 unix_state_lock(sk); 2561 2562 for (;;) { 2563 prepare_to_wait(sk_sleep(sk), &wait, state); 2564 2565 tail = skb_peek_tail(&sk->sk_receive_queue); 2566 if (tail != last || 2567 (tail && tail->len != last_len) || 2568 sk->sk_err || 2569 (sk->sk_shutdown & RCV_SHUTDOWN) || 2570 signal_pending(current) || 2571 !timeo) 2572 break; 2573 2574 sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); 2575 unix_state_unlock(sk); 2576 timeo = schedule_timeout(timeo); 2577 unix_state_lock(sk); 2578 2579 if (sock_flag(sk, SOCK_DEAD)) 2580 break; 2581 2582 sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); 2583 } 2584 2585 finish_wait(sk_sleep(sk), &wait); 2586 unix_state_unlock(sk); 2587 return timeo; 2588 } 2589 2590 static unsigned int unix_skb_len(const struct sk_buff *skb) 2591 { 2592 return skb->len - UNIXCB(skb).consumed; 2593 } 2594 2595 struct unix_stream_read_state { 2596 int (*recv_actor)(struct sk_buff *, int, int, 2597 struct unix_stream_read_state *); 2598 struct socket *socket; 2599 struct msghdr *msg; 2600 struct pipe_inode_info *pipe; 2601 size_t size; 2602 int flags; 2603 unsigned int splice_flags; 2604 }; 2605 2606 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 2607 static int unix_stream_recv_urg(struct unix_stream_read_state *state) 2608 { 2609 struct socket *sock = state->socket; 2610 struct sock *sk = sock->sk; 2611 struct unix_sock *u = unix_sk(sk); 2612 int chunk = 1; 2613 struct sk_buff *oob_skb; 2614 2615 mutex_lock(&u->iolock); 2616 unix_state_lock(sk); 2617 spin_lock(&sk->sk_receive_queue.lock); 2618 2619 if (sock_flag(sk, SOCK_URGINLINE) || !u->oob_skb) { 2620 spin_unlock(&sk->sk_receive_queue.lock); 2621 unix_state_unlock(sk); 2622 mutex_unlock(&u->iolock); 2623 return -EINVAL; 2624 } 2625 2626 oob_skb = u->oob_skb; 2627 2628 if (!(state->flags & MSG_PEEK)) 2629 WRITE_ONCE(u->oob_skb, NULL); 2630 2631 spin_unlock(&sk->sk_receive_queue.lock); 2632 unix_state_unlock(sk); 2633 2634 chunk = state->recv_actor(oob_skb, 0, chunk, state); 2635 2636 if (!(state->flags & MSG_PEEK)) 2637 
UNIXCB(oob_skb).consumed += 1; 2638 2639 mutex_unlock(&u->iolock); 2640 2641 if (chunk < 0) 2642 return -EFAULT; 2643 2644 state->msg->msg_flags |= MSG_OOB; 2645 return 1; 2646 } 2647 2648 static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk, 2649 int flags, int copied) 2650 { 2651 struct sk_buff *read_skb = NULL, *unread_skb = NULL; 2652 struct unix_sock *u = unix_sk(sk); 2653 2654 if (likely(unix_skb_len(skb) && skb != READ_ONCE(u->oob_skb))) 2655 return skb; 2656 2657 spin_lock(&sk->sk_receive_queue.lock); 2658 2659 if (!unix_skb_len(skb)) { 2660 if (copied && (!u->oob_skb || skb == u->oob_skb)) { 2661 skb = NULL; 2662 } else if (flags & MSG_PEEK) { 2663 skb = skb_peek_next(skb, &sk->sk_receive_queue); 2664 } else { 2665 read_skb = skb; 2666 skb = skb_peek_next(skb, &sk->sk_receive_queue); 2667 __skb_unlink(read_skb, &sk->sk_receive_queue); 2668 } 2669 2670 if (!skb) 2671 goto unlock; 2672 } 2673 2674 if (skb != u->oob_skb) 2675 goto unlock; 2676 2677 if (copied) { 2678 skb = NULL; 2679 } else if (!(flags & MSG_PEEK)) { 2680 WRITE_ONCE(u->oob_skb, NULL); 2681 2682 if (!sock_flag(sk, SOCK_URGINLINE)) { 2683 __skb_unlink(skb, &sk->sk_receive_queue); 2684 unread_skb = skb; 2685 skb = skb_peek(&sk->sk_receive_queue); 2686 } 2687 } else if (!sock_flag(sk, SOCK_URGINLINE)) { 2688 skb = skb_peek_next(skb, &sk->sk_receive_queue); 2689 } 2690 2691 unlock: 2692 spin_unlock(&sk->sk_receive_queue.lock); 2693 2694 consume_skb(read_skb); 2695 kfree_skb_reason(unread_skb, SKB_DROP_REASON_UNIX_SKIP_OOB); 2696 2697 return skb; 2698 } 2699 #endif 2700 2701 static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor) 2702 { 2703 struct unix_sock *u = unix_sk(sk); 2704 struct sk_buff *skb; 2705 int err; 2706 2707 if (unlikely(READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)) 2708 return -ENOTCONN; 2709 2710 mutex_lock(&u->iolock); 2711 skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err); 2712 mutex_unlock(&u->iolock); 2713 if (!skb) 2714 return err; 2715 2716 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 2717 if (unlikely(skb == READ_ONCE(u->oob_skb))) { 2718 bool drop = false; 2719 2720 unix_state_lock(sk); 2721 2722 if (sock_flag(sk, SOCK_DEAD)) { 2723 unix_state_unlock(sk); 2724 kfree_skb_reason(skb, SKB_DROP_REASON_SOCKET_CLOSE); 2725 return -ECONNRESET; 2726 } 2727 2728 spin_lock(&sk->sk_receive_queue.lock); 2729 if (likely(skb == u->oob_skb)) { 2730 WRITE_ONCE(u->oob_skb, NULL); 2731 drop = true; 2732 } 2733 spin_unlock(&sk->sk_receive_queue.lock); 2734 2735 unix_state_unlock(sk); 2736 2737 if (drop) { 2738 kfree_skb_reason(skb, SKB_DROP_REASON_UNIX_SKIP_OOB); 2739 return -EAGAIN; 2740 } 2741 } 2742 #endif 2743 2744 return recv_actor(sk, skb); 2745 } 2746 2747 static int unix_stream_read_generic(struct unix_stream_read_state *state, 2748 bool freezable) 2749 { 2750 struct scm_cookie scm; 2751 struct socket *sock = state->socket; 2752 struct sock *sk = sock->sk; 2753 struct unix_sock *u = unix_sk(sk); 2754 int copied = 0; 2755 int flags = state->flags; 2756 int noblock = flags & MSG_DONTWAIT; 2757 bool check_creds = false; 2758 int target; 2759 int err = 0; 2760 long timeo; 2761 int skip; 2762 size_t size = state->size; 2763 unsigned int last_len; 2764 2765 if (unlikely(READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)) { 2766 err = -EINVAL; 2767 goto out; 2768 } 2769 2770 if (unlikely(flags & MSG_OOB)) { 2771 err = -EOPNOTSUPP; 2772 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 2773 err = unix_stream_recv_urg(state); 2774 #endif 2775 goto out; 2776 } 2777 2778 target = sock_rcvlowat(sk, flags & 
MSG_WAITALL, size); 2779 timeo = sock_rcvtimeo(sk, noblock); 2780 2781 memset(&scm, 0, sizeof(scm)); 2782 2783 /* Lock the socket to prevent queue disordering 2784 * while sleeps in memcpy_tomsg 2785 */ 2786 mutex_lock(&u->iolock); 2787 2788 skip = max(sk_peek_offset(sk, flags), 0); 2789 2790 do { 2791 struct sk_buff *skb, *last; 2792 int chunk; 2793 2794 redo: 2795 unix_state_lock(sk); 2796 if (sock_flag(sk, SOCK_DEAD)) { 2797 err = -ECONNRESET; 2798 goto unlock; 2799 } 2800 last = skb = skb_peek(&sk->sk_receive_queue); 2801 last_len = last ? last->len : 0; 2802 2803 again: 2804 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 2805 if (skb) { 2806 skb = manage_oob(skb, sk, flags, copied); 2807 if (!skb && copied) { 2808 unix_state_unlock(sk); 2809 break; 2810 } 2811 } 2812 #endif 2813 if (skb == NULL) { 2814 if (copied >= target) 2815 goto unlock; 2816 2817 /* 2818 * POSIX 1003.1g mandates this order. 2819 */ 2820 2821 err = sock_error(sk); 2822 if (err) 2823 goto unlock; 2824 if (sk->sk_shutdown & RCV_SHUTDOWN) 2825 goto unlock; 2826 2827 unix_state_unlock(sk); 2828 if (!timeo) { 2829 err = -EAGAIN; 2830 break; 2831 } 2832 2833 mutex_unlock(&u->iolock); 2834 2835 timeo = unix_stream_data_wait(sk, timeo, last, 2836 last_len, freezable); 2837 2838 if (signal_pending(current)) { 2839 err = sock_intr_errno(timeo); 2840 scm_destroy(&scm); 2841 goto out; 2842 } 2843 2844 mutex_lock(&u->iolock); 2845 goto redo; 2846 unlock: 2847 unix_state_unlock(sk); 2848 break; 2849 } 2850 2851 while (skip >= unix_skb_len(skb)) { 2852 skip -= unix_skb_len(skb); 2853 last = skb; 2854 last_len = skb->len; 2855 skb = skb_peek_next(skb, &sk->sk_receive_queue); 2856 if (!skb) 2857 goto again; 2858 } 2859 2860 unix_state_unlock(sk); 2861 2862 if (check_creds) { 2863 /* Never glue messages from different writers */ 2864 if (!unix_skb_scm_eq(skb, &scm)) 2865 break; 2866 } else if (test_bit(SOCK_PASSCRED, &sock->flags) || 2867 test_bit(SOCK_PASSPIDFD, &sock->flags)) { 2868 /* Copy credentials */ 2869 scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid); 2870 unix_set_secdata(&scm, skb); 2871 check_creds = true; 2872 } 2873 2874 /* Copy address just once */ 2875 if (state->msg && state->msg->msg_name) { 2876 DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, 2877 state->msg->msg_name); 2878 unix_copy_addr(state->msg, skb->sk); 2879 2880 BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk, 2881 state->msg->msg_name, 2882 &state->msg->msg_namelen); 2883 2884 sunaddr = NULL; 2885 } 2886 2887 chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size); 2888 chunk = state->recv_actor(skb, skip, chunk, state); 2889 if (chunk < 0) { 2890 if (copied == 0) 2891 copied = -EFAULT; 2892 break; 2893 } 2894 copied += chunk; 2895 size -= chunk; 2896 2897 /* Mark read part of skb as used */ 2898 if (!(flags & MSG_PEEK)) { 2899 UNIXCB(skb).consumed += chunk; 2900 2901 sk_peek_offset_bwd(sk, chunk); 2902 2903 if (UNIXCB(skb).fp) { 2904 scm_stat_del(sk, skb); 2905 unix_detach_fds(&scm, skb); 2906 } 2907 2908 if (unix_skb_len(skb)) 2909 break; 2910 2911 skb_unlink(skb, &sk->sk_receive_queue); 2912 consume_skb(skb); 2913 2914 if (scm.fp) 2915 break; 2916 } else { 2917 /* It is questionable, see note in unix_dgram_recvmsg. 
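			 *
			 * As in the datagram case, a MSG_PEEK reader gets
			 * duplicates of any passed descriptors via
			 * unix_peek_fds(), and the loop stops after an skb
			 * that carries descriptors so two descriptor sets are
			 * never merged into a single control message.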
2918 */ 2919 if (UNIXCB(skb).fp) 2920 unix_peek_fds(&scm, skb); 2921 2922 sk_peek_offset_fwd(sk, chunk); 2923 2924 if (UNIXCB(skb).fp) 2925 break; 2926 2927 skip = 0; 2928 last = skb; 2929 last_len = skb->len; 2930 unix_state_lock(sk); 2931 skb = skb_peek_next(skb, &sk->sk_receive_queue); 2932 if (skb) 2933 goto again; 2934 unix_state_unlock(sk); 2935 break; 2936 } 2937 } while (size); 2938 2939 mutex_unlock(&u->iolock); 2940 if (state->msg) 2941 scm_recv_unix(sock, state->msg, &scm, flags); 2942 else 2943 scm_destroy(&scm); 2944 out: 2945 return copied ? : err; 2946 } 2947 2948 static int unix_stream_read_actor(struct sk_buff *skb, 2949 int skip, int chunk, 2950 struct unix_stream_read_state *state) 2951 { 2952 int ret; 2953 2954 ret = skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip, 2955 state->msg, chunk); 2956 return ret ?: chunk; 2957 } 2958 2959 int __unix_stream_recvmsg(struct sock *sk, struct msghdr *msg, 2960 size_t size, int flags) 2961 { 2962 struct unix_stream_read_state state = { 2963 .recv_actor = unix_stream_read_actor, 2964 .socket = sk->sk_socket, 2965 .msg = msg, 2966 .size = size, 2967 .flags = flags 2968 }; 2969 2970 return unix_stream_read_generic(&state, true); 2971 } 2972 2973 static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg, 2974 size_t size, int flags) 2975 { 2976 struct unix_stream_read_state state = { 2977 .recv_actor = unix_stream_read_actor, 2978 .socket = sock, 2979 .msg = msg, 2980 .size = size, 2981 .flags = flags 2982 }; 2983 2984 #ifdef CONFIG_BPF_SYSCALL 2985 struct sock *sk = sock->sk; 2986 const struct proto *prot = READ_ONCE(sk->sk_prot); 2987 2988 if (prot != &unix_stream_proto) 2989 return prot->recvmsg(sk, msg, size, flags, NULL); 2990 #endif 2991 return unix_stream_read_generic(&state, true); 2992 } 2993 2994 static int unix_stream_splice_actor(struct sk_buff *skb, 2995 int skip, int chunk, 2996 struct unix_stream_read_state *state) 2997 { 2998 return skb_splice_bits(skb, state->socket->sk, 2999 UNIXCB(skb).consumed + skip, 3000 state->pipe, chunk, state->splice_flags); 3001 } 3002 3003 static ssize_t unix_stream_splice_read(struct socket *sock, loff_t *ppos, 3004 struct pipe_inode_info *pipe, 3005 size_t size, unsigned int flags) 3006 { 3007 struct unix_stream_read_state state = { 3008 .recv_actor = unix_stream_splice_actor, 3009 .socket = sock, 3010 .pipe = pipe, 3011 .size = size, 3012 .splice_flags = flags, 3013 }; 3014 3015 if (unlikely(*ppos)) 3016 return -ESPIPE; 3017 3018 if (sock->file->f_flags & O_NONBLOCK || 3019 flags & SPLICE_F_NONBLOCK) 3020 state.flags = MSG_DONTWAIT; 3021 3022 return unix_stream_read_generic(&state, false); 3023 } 3024 3025 static int unix_shutdown(struct socket *sock, int mode) 3026 { 3027 struct sock *sk = sock->sk; 3028 struct sock *other; 3029 3030 if (mode < SHUT_RD || mode > SHUT_RDWR) 3031 return -EINVAL; 3032 /* This maps: 3033 * SHUT_RD (0) -> RCV_SHUTDOWN (1) 3034 * SHUT_WR (1) -> SEND_SHUTDOWN (2) 3035 * SHUT_RDWR (2) -> SHUTDOWN_MASK (3) 3036 */ 3037 ++mode; 3038 3039 unix_state_lock(sk); 3040 WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | mode); 3041 other = unix_peer(sk); 3042 if (other) 3043 sock_hold(other); 3044 unix_state_unlock(sk); 3045 sk->sk_state_change(sk); 3046 3047 if (other && 3048 (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) { 3049 3050 int peer_mode = 0; 3051 const struct proto *prot = READ_ONCE(other->sk_prot); 3052 3053 if (prot->unhash) 3054 prot->unhash(other); 3055 if (mode&RCV_SHUTDOWN) 3056 peer_mode |= SEND_SHUTDOWN; 3057 if 
(mode&SEND_SHUTDOWN) 3058 peer_mode |= RCV_SHUTDOWN; 3059 unix_state_lock(other); 3060 WRITE_ONCE(other->sk_shutdown, other->sk_shutdown | peer_mode); 3061 unix_state_unlock(other); 3062 other->sk_state_change(other); 3063 if (peer_mode == SHUTDOWN_MASK) 3064 sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP); 3065 else if (peer_mode & RCV_SHUTDOWN) 3066 sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN); 3067 } 3068 if (other) 3069 sock_put(other); 3070 3071 return 0; 3072 } 3073 3074 long unix_inq_len(struct sock *sk) 3075 { 3076 struct sk_buff *skb; 3077 long amount = 0; 3078 3079 if (READ_ONCE(sk->sk_state) == TCP_LISTEN) 3080 return -EINVAL; 3081 3082 spin_lock(&sk->sk_receive_queue.lock); 3083 if (sk->sk_type == SOCK_STREAM || 3084 sk->sk_type == SOCK_SEQPACKET) { 3085 skb_queue_walk(&sk->sk_receive_queue, skb) 3086 amount += unix_skb_len(skb); 3087 } else { 3088 skb = skb_peek(&sk->sk_receive_queue); 3089 if (skb) 3090 amount = skb->len; 3091 } 3092 spin_unlock(&sk->sk_receive_queue.lock); 3093 3094 return amount; 3095 } 3096 EXPORT_SYMBOL_GPL(unix_inq_len); 3097 3098 long unix_outq_len(struct sock *sk) 3099 { 3100 return sk_wmem_alloc_get(sk); 3101 } 3102 EXPORT_SYMBOL_GPL(unix_outq_len); 3103 3104 static int unix_open_file(struct sock *sk) 3105 { 3106 struct path path; 3107 struct file *f; 3108 int fd; 3109 3110 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) 3111 return -EPERM; 3112 3113 if (!smp_load_acquire(&unix_sk(sk)->addr)) 3114 return -ENOENT; 3115 3116 path = unix_sk(sk)->path; 3117 if (!path.dentry) 3118 return -ENOENT; 3119 3120 path_get(&path); 3121 3122 fd = get_unused_fd_flags(O_CLOEXEC); 3123 if (fd < 0) 3124 goto out; 3125 3126 f = dentry_open(&path, O_PATH, current_cred()); 3127 if (IS_ERR(f)) { 3128 put_unused_fd(fd); 3129 fd = PTR_ERR(f); 3130 goto out; 3131 } 3132 3133 fd_install(fd, f); 3134 out: 3135 path_put(&path); 3136 3137 return fd; 3138 } 3139 3140 static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) 3141 { 3142 struct sock *sk = sock->sk; 3143 long amount = 0; 3144 int err; 3145 3146 switch (cmd) { 3147 case SIOCOUTQ: 3148 amount = unix_outq_len(sk); 3149 err = put_user(amount, (int __user *)arg); 3150 break; 3151 case SIOCINQ: 3152 amount = unix_inq_len(sk); 3153 if (amount < 0) 3154 err = amount; 3155 else 3156 err = put_user(amount, (int __user *)arg); 3157 break; 3158 case SIOCUNIXFILE: 3159 err = unix_open_file(sk); 3160 break; 3161 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 3162 case SIOCATMARK: 3163 { 3164 struct unix_sock *u = unix_sk(sk); 3165 struct sk_buff *skb; 3166 int answ = 0; 3167 3168 mutex_lock(&u->iolock); 3169 3170 skb = skb_peek(&sk->sk_receive_queue); 3171 if (skb) { 3172 struct sk_buff *oob_skb = READ_ONCE(u->oob_skb); 3173 struct sk_buff *next_skb; 3174 3175 next_skb = skb_peek_next(skb, &sk->sk_receive_queue); 3176 3177 if (skb == oob_skb || 3178 (!unix_skb_len(skb) && 3179 (!oob_skb || next_skb == oob_skb))) 3180 answ = 1; 3181 } 3182 3183 mutex_unlock(&u->iolock); 3184 3185 err = put_user(answ, (int __user *)arg); 3186 } 3187 break; 3188 #endif 3189 default: 3190 err = -ENOIOCTLCMD; 3191 break; 3192 } 3193 return err; 3194 } 3195 3196 #ifdef CONFIG_COMPAT 3197 static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) 3198 { 3199 return unix_ioctl(sock, cmd, (unsigned long)compat_ptr(arg)); 3200 } 3201 #endif 3202 3203 static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wait) 3204 { 3205 struct sock *sk = sock->sk; 3206 unsigned char state; 3207 __poll_t 
mask; 3208 u8 shutdown; 3209 3210 sock_poll_wait(file, sock, wait); 3211 mask = 0; 3212 shutdown = READ_ONCE(sk->sk_shutdown); 3213 state = READ_ONCE(sk->sk_state); 3214 3215 /* exceptional events? */ 3216 if (READ_ONCE(sk->sk_err)) 3217 mask |= EPOLLERR; 3218 if (shutdown == SHUTDOWN_MASK) 3219 mask |= EPOLLHUP; 3220 if (shutdown & RCV_SHUTDOWN) 3221 mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM; 3222 3223 /* readable? */ 3224 if (!skb_queue_empty_lockless(&sk->sk_receive_queue)) 3225 mask |= EPOLLIN | EPOLLRDNORM; 3226 if (sk_is_readable(sk)) 3227 mask |= EPOLLIN | EPOLLRDNORM; 3228 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 3229 if (READ_ONCE(unix_sk(sk)->oob_skb)) 3230 mask |= EPOLLPRI; 3231 #endif 3232 3233 /* Connection-based need to check for termination and startup */ 3234 if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) && 3235 state == TCP_CLOSE) 3236 mask |= EPOLLHUP; 3237 3238 /* 3239 * we set writable also when the other side has shut down the 3240 * connection. This prevents stuck sockets. 3241 */ 3242 if (unix_writable(sk, state)) 3243 mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND; 3244 3245 return mask; 3246 } 3247 3248 static __poll_t unix_dgram_poll(struct file *file, struct socket *sock, 3249 poll_table *wait) 3250 { 3251 struct sock *sk = sock->sk, *other; 3252 unsigned int writable; 3253 unsigned char state; 3254 __poll_t mask; 3255 u8 shutdown; 3256 3257 sock_poll_wait(file, sock, wait); 3258 mask = 0; 3259 shutdown = READ_ONCE(sk->sk_shutdown); 3260 state = READ_ONCE(sk->sk_state); 3261 3262 /* exceptional events? */ 3263 if (READ_ONCE(sk->sk_err) || 3264 !skb_queue_empty_lockless(&sk->sk_error_queue)) 3265 mask |= EPOLLERR | 3266 (sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0); 3267 3268 if (shutdown & RCV_SHUTDOWN) 3269 mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM; 3270 if (shutdown == SHUTDOWN_MASK) 3271 mask |= EPOLLHUP; 3272 3273 /* readable? */ 3274 if (!skb_queue_empty_lockless(&sk->sk_receive_queue)) 3275 mask |= EPOLLIN | EPOLLRDNORM; 3276 if (sk_is_readable(sk)) 3277 mask |= EPOLLIN | EPOLLRDNORM; 3278 3279 /* Connection-based need to check for termination and startup */ 3280 if (sk->sk_type == SOCK_SEQPACKET && state == TCP_CLOSE) 3281 mask |= EPOLLHUP; 3282 3283 /* No write status requested, avoid expensive OUT tests. 
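	 * E.g. a caller that polls only for EPOLLIN | EPOLLPRI skips the
	 * peer receive-queue check under unix_state_lock() below.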
*/ 3284 if (!(poll_requested_events(wait) & (EPOLLWRBAND|EPOLLWRNORM|EPOLLOUT))) 3285 return mask; 3286 3287 writable = unix_writable(sk, state); 3288 if (writable) { 3289 unix_state_lock(sk); 3290 3291 other = unix_peer(sk); 3292 if (other && unix_peer(other) != sk && 3293 unix_recvq_full_lockless(other) && 3294 unix_dgram_peer_wake_me(sk, other)) 3295 writable = 0; 3296 3297 unix_state_unlock(sk); 3298 } 3299 3300 if (writable) 3301 mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND; 3302 else 3303 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); 3304 3305 return mask; 3306 } 3307 3308 #ifdef CONFIG_PROC_FS 3309 3310 #define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1) 3311 3312 #define get_bucket(x) ((x) >> BUCKET_SPACE) 3313 #define get_offset(x) ((x) & ((1UL << BUCKET_SPACE) - 1)) 3314 #define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o)) 3315 3316 static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos) 3317 { 3318 unsigned long offset = get_offset(*pos); 3319 unsigned long bucket = get_bucket(*pos); 3320 unsigned long count = 0; 3321 struct sock *sk; 3322 3323 for (sk = sk_head(&seq_file_net(seq)->unx.table.buckets[bucket]); 3324 sk; sk = sk_next(sk)) { 3325 if (++count == offset) 3326 break; 3327 } 3328 3329 return sk; 3330 } 3331 3332 static struct sock *unix_get_first(struct seq_file *seq, loff_t *pos) 3333 { 3334 unsigned long bucket = get_bucket(*pos); 3335 struct net *net = seq_file_net(seq); 3336 struct sock *sk; 3337 3338 while (bucket < UNIX_HASH_SIZE) { 3339 spin_lock(&net->unx.table.locks[bucket]); 3340 3341 sk = unix_from_bucket(seq, pos); 3342 if (sk) 3343 return sk; 3344 3345 spin_unlock(&net->unx.table.locks[bucket]); 3346 3347 *pos = set_bucket_offset(++bucket, 1); 3348 } 3349 3350 return NULL; 3351 } 3352 3353 static struct sock *unix_get_next(struct seq_file *seq, struct sock *sk, 3354 loff_t *pos) 3355 { 3356 unsigned long bucket = get_bucket(*pos); 3357 3358 sk = sk_next(sk); 3359 if (sk) 3360 return sk; 3361 3362 3363 spin_unlock(&seq_file_net(seq)->unx.table.locks[bucket]); 3364 3365 *pos = set_bucket_offset(++bucket, 1); 3366 3367 return unix_get_first(seq, pos); 3368 } 3369 3370 static void *unix_seq_start(struct seq_file *seq, loff_t *pos) 3371 { 3372 if (!*pos) 3373 return SEQ_START_TOKEN; 3374 3375 return unix_get_first(seq, pos); 3376 } 3377 3378 static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos) 3379 { 3380 ++*pos; 3381 3382 if (v == SEQ_START_TOKEN) 3383 return unix_get_first(seq, pos); 3384 3385 return unix_get_next(seq, v, pos); 3386 } 3387 3388 static void unix_seq_stop(struct seq_file *seq, void *v) 3389 { 3390 struct sock *sk = v; 3391 3392 if (sk) 3393 spin_unlock(&seq_file_net(seq)->unx.table.locks[sk->sk_hash]); 3394 } 3395 3396 static int unix_seq_show(struct seq_file *seq, void *v) 3397 { 3398 3399 if (v == SEQ_START_TOKEN) 3400 seq_puts(seq, "Num RefCount Protocol Flags Type St " 3401 "Inode Path\n"); 3402 else { 3403 struct sock *s = v; 3404 struct unix_sock *u = unix_sk(s); 3405 unix_state_lock(s); 3406 3407 seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu", 3408 s, 3409 refcount_read(&s->sk_refcnt), 3410 0, 3411 s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0, 3412 s->sk_type, 3413 s->sk_socket ? 3414 (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) : 3415 (s->sk_state == TCP_ESTABLISHED ? 
SS_CONNECTING : SS_DISCONNECTING), 3416 sock_i_ino(s)); 3417 3418 if (u->addr) { // under a hash table lock here 3419 int i, len; 3420 seq_putc(seq, ' '); 3421 3422 i = 0; 3423 len = u->addr->len - 3424 offsetof(struct sockaddr_un, sun_path); 3425 if (u->addr->name->sun_path[0]) { 3426 len--; 3427 } else { 3428 seq_putc(seq, '@'); 3429 i++; 3430 } 3431 for ( ; i < len; i++) 3432 seq_putc(seq, u->addr->name->sun_path[i] ?: 3433 '@'); 3434 } 3435 unix_state_unlock(s); 3436 seq_putc(seq, '\n'); 3437 } 3438 3439 return 0; 3440 } 3441 3442 static const struct seq_operations unix_seq_ops = { 3443 .start = unix_seq_start, 3444 .next = unix_seq_next, 3445 .stop = unix_seq_stop, 3446 .show = unix_seq_show, 3447 }; 3448 3449 #ifdef CONFIG_BPF_SYSCALL 3450 struct bpf_unix_iter_state { 3451 struct seq_net_private p; 3452 unsigned int cur_sk; 3453 unsigned int end_sk; 3454 unsigned int max_sk; 3455 struct sock **batch; 3456 bool st_bucket_done; 3457 }; 3458 3459 struct bpf_iter__unix { 3460 __bpf_md_ptr(struct bpf_iter_meta *, meta); 3461 __bpf_md_ptr(struct unix_sock *, unix_sk); 3462 uid_t uid __aligned(8); 3463 }; 3464 3465 static int unix_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta, 3466 struct unix_sock *unix_sk, uid_t uid) 3467 { 3468 struct bpf_iter__unix ctx; 3469 3470 meta->seq_num--; /* skip SEQ_START_TOKEN */ 3471 ctx.meta = meta; 3472 ctx.unix_sk = unix_sk; 3473 ctx.uid = uid; 3474 return bpf_iter_run_prog(prog, &ctx); 3475 } 3476 3477 static int bpf_iter_unix_hold_batch(struct seq_file *seq, struct sock *start_sk) 3478 3479 { 3480 struct bpf_unix_iter_state *iter = seq->private; 3481 unsigned int expected = 1; 3482 struct sock *sk; 3483 3484 sock_hold(start_sk); 3485 iter->batch[iter->end_sk++] = start_sk; 3486 3487 for (sk = sk_next(start_sk); sk; sk = sk_next(sk)) { 3488 if (iter->end_sk < iter->max_sk) { 3489 sock_hold(sk); 3490 iter->batch[iter->end_sk++] = sk; 3491 } 3492 3493 expected++; 3494 } 3495 3496 spin_unlock(&seq_file_net(seq)->unx.table.locks[start_sk->sk_hash]); 3497 3498 return expected; 3499 } 3500 3501 static void bpf_iter_unix_put_batch(struct bpf_unix_iter_state *iter) 3502 { 3503 while (iter->cur_sk < iter->end_sk) 3504 sock_put(iter->batch[iter->cur_sk++]); 3505 } 3506 3507 static int bpf_iter_unix_realloc_batch(struct bpf_unix_iter_state *iter, 3508 unsigned int new_batch_sz) 3509 { 3510 struct sock **new_batch; 3511 3512 new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz, 3513 GFP_USER | __GFP_NOWARN); 3514 if (!new_batch) 3515 return -ENOMEM; 3516 3517 bpf_iter_unix_put_batch(iter); 3518 kvfree(iter->batch); 3519 iter->batch = new_batch; 3520 iter->max_sk = new_batch_sz; 3521 3522 return 0; 3523 } 3524 3525 static struct sock *bpf_iter_unix_batch(struct seq_file *seq, 3526 loff_t *pos) 3527 { 3528 struct bpf_unix_iter_state *iter = seq->private; 3529 unsigned int expected; 3530 bool resized = false; 3531 struct sock *sk; 3532 3533 if (iter->st_bucket_done) 3534 *pos = set_bucket_offset(get_bucket(*pos) + 1, 1); 3535 3536 again: 3537 /* Get a new batch */ 3538 iter->cur_sk = 0; 3539 iter->end_sk = 0; 3540 3541 sk = unix_get_first(seq, pos); 3542 if (!sk) 3543 return NULL; /* Done */ 3544 3545 expected = bpf_iter_unix_hold_batch(seq, sk); 3546 3547 if (iter->end_sk == expected) { 3548 iter->st_bucket_done = true; 3549 return sk; 3550 } 3551 3552 if (!resized && !bpf_iter_unix_realloc_batch(iter, expected * 3 / 2)) { 3553 resized = true; 3554 goto again; 3555 } 3556 3557 return sk; 3558 } 3559 3560 static void *bpf_iter_unix_seq_start(struct 
seq_file *seq, loff_t *pos) 3561 { 3562 if (!*pos) 3563 return SEQ_START_TOKEN; 3564 3565 /* bpf iter does not support lseek, so it always 3566 * continue from where it was stop()-ped. 3567 */ 3568 return bpf_iter_unix_batch(seq, pos); 3569 } 3570 3571 static void *bpf_iter_unix_seq_next(struct seq_file *seq, void *v, loff_t *pos) 3572 { 3573 struct bpf_unix_iter_state *iter = seq->private; 3574 struct sock *sk; 3575 3576 /* Whenever seq_next() is called, the iter->cur_sk is 3577 * done with seq_show(), so advance to the next sk in 3578 * the batch. 3579 */ 3580 if (iter->cur_sk < iter->end_sk) 3581 sock_put(iter->batch[iter->cur_sk++]); 3582 3583 ++*pos; 3584 3585 if (iter->cur_sk < iter->end_sk) 3586 sk = iter->batch[iter->cur_sk]; 3587 else 3588 sk = bpf_iter_unix_batch(seq, pos); 3589 3590 return sk; 3591 } 3592 3593 static int bpf_iter_unix_seq_show(struct seq_file *seq, void *v) 3594 { 3595 struct bpf_iter_meta meta; 3596 struct bpf_prog *prog; 3597 struct sock *sk = v; 3598 uid_t uid; 3599 bool slow; 3600 int ret; 3601 3602 if (v == SEQ_START_TOKEN) 3603 return 0; 3604 3605 slow = lock_sock_fast(sk); 3606 3607 if (unlikely(sk_unhashed(sk))) { 3608 ret = SEQ_SKIP; 3609 goto unlock; 3610 } 3611 3612 uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk)); 3613 meta.seq = seq; 3614 prog = bpf_iter_get_info(&meta, false); 3615 ret = unix_prog_seq_show(prog, &meta, v, uid); 3616 unlock: 3617 unlock_sock_fast(sk, slow); 3618 return ret; 3619 } 3620 3621 static void bpf_iter_unix_seq_stop(struct seq_file *seq, void *v) 3622 { 3623 struct bpf_unix_iter_state *iter = seq->private; 3624 struct bpf_iter_meta meta; 3625 struct bpf_prog *prog; 3626 3627 if (!v) { 3628 meta.seq = seq; 3629 prog = bpf_iter_get_info(&meta, true); 3630 if (prog) 3631 (void)unix_prog_seq_show(prog, &meta, v, 0); 3632 } 3633 3634 if (iter->cur_sk < iter->end_sk) 3635 bpf_iter_unix_put_batch(iter); 3636 } 3637 3638 static const struct seq_operations bpf_iter_unix_seq_ops = { 3639 .start = bpf_iter_unix_seq_start, 3640 .next = bpf_iter_unix_seq_next, 3641 .stop = bpf_iter_unix_seq_stop, 3642 .show = bpf_iter_unix_seq_show, 3643 }; 3644 #endif 3645 #endif 3646 3647 static const struct net_proto_family unix_family_ops = { 3648 .family = PF_UNIX, 3649 .create = unix_create, 3650 .owner = THIS_MODULE, 3651 }; 3652 3653 3654 static int __net_init unix_net_init(struct net *net) 3655 { 3656 int i; 3657 3658 net->unx.sysctl_max_dgram_qlen = 10; 3659 if (unix_sysctl_register(net)) 3660 goto out; 3661 3662 #ifdef CONFIG_PROC_FS 3663 if (!proc_create_net("unix", 0, net->proc_net, &unix_seq_ops, 3664 sizeof(struct seq_net_private))) 3665 goto err_sysctl; 3666 #endif 3667 3668 net->unx.table.locks = kvmalloc_array(UNIX_HASH_SIZE, 3669 sizeof(spinlock_t), GFP_KERNEL); 3670 if (!net->unx.table.locks) 3671 goto err_proc; 3672 3673 net->unx.table.buckets = kvmalloc_array(UNIX_HASH_SIZE, 3674 sizeof(struct hlist_head), 3675 GFP_KERNEL); 3676 if (!net->unx.table.buckets) 3677 goto free_locks; 3678 3679 for (i = 0; i < UNIX_HASH_SIZE; i++) { 3680 spin_lock_init(&net->unx.table.locks[i]); 3681 lock_set_cmp_fn(&net->unx.table.locks[i], unix_table_lock_cmp_fn, NULL); 3682 INIT_HLIST_HEAD(&net->unx.table.buckets[i]); 3683 } 3684 3685 return 0; 3686 3687 free_locks: 3688 kvfree(net->unx.table.locks); 3689 err_proc: 3690 #ifdef CONFIG_PROC_FS 3691 remove_proc_entry("unix", net->proc_net); 3692 err_sysctl: 3693 #endif 3694 unix_sysctl_unregister(net); 3695 out: 3696 return -ENOMEM; 3697 } 3698 3699 static void __net_exit unix_net_exit(struct net 
*net) 3700 { 3701 kvfree(net->unx.table.buckets); 3702 kvfree(net->unx.table.locks); 3703 unix_sysctl_unregister(net); 3704 remove_proc_entry("unix", net->proc_net); 3705 } 3706 3707 static struct pernet_operations unix_net_ops = { 3708 .init = unix_net_init, 3709 .exit = unix_net_exit, 3710 }; 3711 3712 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) 3713 DEFINE_BPF_ITER_FUNC(unix, struct bpf_iter_meta *meta, 3714 struct unix_sock *unix_sk, uid_t uid) 3715 3716 #define INIT_BATCH_SZ 16 3717 3718 static int bpf_iter_init_unix(void *priv_data, struct bpf_iter_aux_info *aux) 3719 { 3720 struct bpf_unix_iter_state *iter = priv_data; 3721 int err; 3722 3723 err = bpf_iter_init_seq_net(priv_data, aux); 3724 if (err) 3725 return err; 3726 3727 err = bpf_iter_unix_realloc_batch(iter, INIT_BATCH_SZ); 3728 if (err) { 3729 bpf_iter_fini_seq_net(priv_data); 3730 return err; 3731 } 3732 3733 return 0; 3734 } 3735 3736 static void bpf_iter_fini_unix(void *priv_data) 3737 { 3738 struct bpf_unix_iter_state *iter = priv_data; 3739 3740 bpf_iter_fini_seq_net(priv_data); 3741 kvfree(iter->batch); 3742 } 3743 3744 static const struct bpf_iter_seq_info unix_seq_info = { 3745 .seq_ops = &bpf_iter_unix_seq_ops, 3746 .init_seq_private = bpf_iter_init_unix, 3747 .fini_seq_private = bpf_iter_fini_unix, 3748 .seq_priv_size = sizeof(struct bpf_unix_iter_state), 3749 }; 3750 3751 static const struct bpf_func_proto * 3752 bpf_iter_unix_get_func_proto(enum bpf_func_id func_id, 3753 const struct bpf_prog *prog) 3754 { 3755 switch (func_id) { 3756 case BPF_FUNC_setsockopt: 3757 return &bpf_sk_setsockopt_proto; 3758 case BPF_FUNC_getsockopt: 3759 return &bpf_sk_getsockopt_proto; 3760 default: 3761 return NULL; 3762 } 3763 } 3764 3765 static struct bpf_iter_reg unix_reg_info = { 3766 .target = "unix", 3767 .ctx_arg_info_size = 1, 3768 .ctx_arg_info = { 3769 { offsetof(struct bpf_iter__unix, unix_sk), 3770 PTR_TO_BTF_ID_OR_NULL }, 3771 }, 3772 .get_func_proto = bpf_iter_unix_get_func_proto, 3773 .seq_info = &unix_seq_info, 3774 }; 3775 3776 static void __init bpf_iter_register(void) 3777 { 3778 unix_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_UNIX]; 3779 if (bpf_iter_reg_target(&unix_reg_info)) 3780 pr_warn("Warning: could not register bpf iterator unix\n"); 3781 } 3782 #endif 3783 3784 static int __init af_unix_init(void) 3785 { 3786 int i, rc = -1; 3787 3788 BUILD_BUG_ON(sizeof(struct unix_skb_parms) > sizeof_field(struct sk_buff, cb)); 3789 3790 for (i = 0; i < UNIX_HASH_SIZE / 2; i++) { 3791 spin_lock_init(&bsd_socket_locks[i]); 3792 INIT_HLIST_HEAD(&bsd_socket_buckets[i]); 3793 } 3794 3795 rc = proto_register(&unix_dgram_proto, 1); 3796 if (rc != 0) { 3797 pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__); 3798 goto out; 3799 } 3800 3801 rc = proto_register(&unix_stream_proto, 1); 3802 if (rc != 0) { 3803 pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__); 3804 proto_unregister(&unix_dgram_proto); 3805 goto out; 3806 } 3807 3808 sock_register(&unix_family_ops); 3809 register_pernet_subsys(&unix_net_ops); 3810 unix_bpf_build_proto(); 3811 3812 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) 3813 bpf_iter_register(); 3814 #endif 3815 3816 out: 3817 return rc; 3818 } 3819 3820 /* Later than subsys_initcall() because we depend on stuff initialised there */ 3821 fs_initcall(af_unix_init); 3822
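/*
 * Illustrative user-space sketch (not part of the kernel build) of the
 * CONFIG_AF_UNIX_OOB machinery above: queue_oob() on the sending side,
 * manage_oob()/unix_stream_recv_urg() and the SIOCATMARK ioctl on the
 * receiving side.  The function name oob_demo() is made up, the two
 * descriptors are assumed to come from socketpair(AF_UNIX, SOCK_STREAM,
 * 0, sv), and error handling is omitted.
 *
 *	#include <sys/ioctl.h>
 *	#include <sys/socket.h>
 *
 *	static void oob_demo(int sender, int receiver)
 *	{
 *		int at_mark = 0;
 *		char c;
 *
 *		send(sender, "ab", 2, 0);
 *		send(sender, "c", 1, MSG_OOB);		// queue_oob()
 *
 *		recv(receiver, &c, 1, 0);		// 'a'
 *		ioctl(receiver, SIOCATMARK, &at_mark);	// 0: not at the mark yet
 *		recv(receiver, &c, 1, 0);		// 'b'
 *		ioctl(receiver, SIOCATMARK, &at_mark);	// 1: next byte is OOB
 *		recv(receiver, &c, 1, MSG_OOB);		// 'c', unix_stream_recv_urg()
 *	}
 */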
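/*
 * Illustrative user-space sketch (not part of the kernel build) of
 * SCM_CREDENTIALS delivery: once SO_PASSCRED is set on either end before a
 * message is queued, maybe_add_creds() above records the sender's
 * pid/uid/gid and scm_recv_unix() hands them to the receiver as ancillary
 * data.  The function name print_peer_creds() is made up and error handling
 * is omitted.
 *
 *	#define _GNU_SOURCE		// for struct ucred / SCM_CREDENTIALS
 *	#include <stdio.h>
 *	#include <string.h>
 *	#include <sys/socket.h>
 *
 *	static void print_peer_creds(int sock)
 *	{
 *		union {
 *			char buf[CMSG_SPACE(sizeof(struct ucred))];
 *			struct cmsghdr align;
 *		} u;
 *		char data[256];
 *		struct iovec iov = { .iov_base = data, .iov_len = sizeof(data) };
 *		struct msghdr msg = {
 *			.msg_iov        = &iov,
 *			.msg_iovlen     = 1,
 *			.msg_control    = u.buf,
 *			.msg_controllen = sizeof(u.buf),
 *		};
 *		struct cmsghdr *cmsg;
 *		int on = 1;
 *
 *		// Enable before the peer sends so the credentials get attached.
 *		setsockopt(sock, SOL_SOCKET, SO_PASSCRED, &on, sizeof(on));
 *		recvmsg(sock, &msg, 0);
 *
 *		for (cmsg = CMSG_FIRSTHDR(&msg); cmsg;
 *		     cmsg = CMSG_NXTHDR(&msg, cmsg)) {
 *			if (cmsg->cmsg_level == SOL_SOCKET &&
 *			    cmsg->cmsg_type == SCM_CREDENTIALS) {
 *				struct ucred uc;
 *
 *				memcpy(&uc, CMSG_DATA(cmsg), sizeof(uc));
 *				printf("pid=%d uid=%u gid=%u\n",
 *				       uc.pid, uc.uid, uc.gid);
 *			}
 *		}
 *	}
 */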