1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* 3 * NET4: Implementation of BSD Unix domain sockets. 4 * 5 * Authors: Alan Cox, <alan@lxorguk.ukuu.org.uk> 6 * 7 * Fixes: 8 * Linus Torvalds : Assorted bug cures. 9 * Niibe Yutaka : async I/O support. 10 * Carsten Paeth : PF_UNIX check, address fixes. 11 * Alan Cox : Limit size of allocated blocks. 12 * Alan Cox : Fixed the stupid socketpair bug. 13 * Alan Cox : BSD compatibility fine tuning. 14 * Alan Cox : Fixed a bug in connect when interrupted. 15 * Alan Cox : Sorted out a proper draft version of 16 * file descriptor passing hacked up from 17 * Mike Shaver's work. 18 * Marty Leisner : Fixes to fd passing 19 * Nick Nevin : recvmsg bugfix. 20 * Alan Cox : Started proper garbage collector 21 * Heiko EiBfeldt : Missing verify_area check 22 * Alan Cox : Started POSIXisms 23 * Andreas Schwab : Replace inode by dentry for proper 24 * reference counting 25 * Kirk Petersen : Made this a module 26 * Christoph Rohland : Elegant non-blocking accept/connect algorithm. 27 * Lots of bug fixes. 28 * Alexey Kuznetosv : Repaired (I hope) bugs introduces 29 * by above two patches. 30 * Andrea Arcangeli : If possible we block in connect(2) 31 * if the max backlog of the listen socket 32 * is been reached. This won't break 33 * old apps and it will avoid huge amount 34 * of socks hashed (this for unix_gc() 35 * performances reasons). 36 * Security fix that limits the max 37 * number of socks to 2*max_files and 38 * the number of skb queueable in the 39 * dgram receiver. 40 * Artur Skawina : Hash function optimizations 41 * Alexey Kuznetsov : Full scale SMP. Lot of bugs are introduced 8) 42 * Malcolm Beattie : Set peercred for socketpair 43 * Michal Ostrowski : Module initialization cleanup. 44 * Arnaldo C. Melo : Remove MOD_{INC,DEC}_USE_COUNT, 45 * the core infrastructure is doing that 46 * for all net proto families now (2.5.69+) 47 * 48 * Known differences from reference BSD that was tested: 49 * 50 * [TO FIX] 51 * ECONNREFUSED is not returned from one end of a connected() socket to the 52 * other the moment one end closes. 53 * fstat() doesn't return st_dev=0, and give the blksize as high water mark 54 * and a fake inode identifier (nor the BSD first socket fstat twice bug). 55 * [NOT TO FIX] 56 * accept() returns a path name even if the connecting socket has closed 57 * in the meantime (BSD loses the path and gives up). 58 * accept() returns 0 length path for an unbound connector. BSD returns 16 59 * and a null first byte in the path (but not for gethost/peername - BSD bug ??) 60 * socketpair(...SOCK_RAW..) doesn't panic the kernel. 61 * BSD af_unix apparently has connect forgetting to block properly. 62 * (need to check this with the POSIX spec in detail) 63 * 64 * Differences from 2.0.0-11-... (ANK) 65 * Bug fixes and improvements. 66 * - client shutdown killed server socket. 67 * - removed all useless cli/sti pairs. 68 * 69 * Semantic changes/extensions. 70 * - generic control message passing. 71 * - SCM_CREDENTIALS control message. 72 * - "Abstract" (not FS based) socket bindings. 73 * Abstract names are sequences of bytes (not zero terminated) 74 * started by 0, so that this name space does not intersect 75 * with BSD names. 
76 */ 77 78 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 79 80 #include <linux/module.h> 81 #include <linux/kernel.h> 82 #include <linux/signal.h> 83 #include <linux/sched/signal.h> 84 #include <linux/errno.h> 85 #include <linux/string.h> 86 #include <linux/stat.h> 87 #include <linux/dcache.h> 88 #include <linux/namei.h> 89 #include <linux/socket.h> 90 #include <linux/un.h> 91 #include <linux/fcntl.h> 92 #include <linux/filter.h> 93 #include <linux/termios.h> 94 #include <linux/sockios.h> 95 #include <linux/net.h> 96 #include <linux/in.h> 97 #include <linux/fs.h> 98 #include <linux/slab.h> 99 #include <linux/uaccess.h> 100 #include <linux/skbuff.h> 101 #include <linux/netdevice.h> 102 #include <net/net_namespace.h> 103 #include <net/sock.h> 104 #include <net/tcp_states.h> 105 #include <net/af_unix.h> 106 #include <linux/proc_fs.h> 107 #include <linux/seq_file.h> 108 #include <net/scm.h> 109 #include <linux/init.h> 110 #include <linux/poll.h> 111 #include <linux/rtnetlink.h> 112 #include <linux/mount.h> 113 #include <net/checksum.h> 114 #include <linux/security.h> 115 #include <linux/splice.h> 116 #include <linux/freezer.h> 117 #include <linux/file.h> 118 #include <linux/btf_ids.h> 119 #include <linux/bpf-cgroup.h> 120 121 static atomic_long_t unix_nr_socks; 122 static struct hlist_head bsd_socket_buckets[UNIX_HASH_SIZE / 2]; 123 static spinlock_t bsd_socket_locks[UNIX_HASH_SIZE / 2]; 124 125 /* SMP locking strategy: 126 * hash table is protected with spinlock. 127 * each socket state is protected by separate spinlock. 128 */ 129 #ifdef CONFIG_PROVE_LOCKING 130 #define cmp_ptr(l, r) (((l) > (r)) - ((l) < (r))) 131 132 static int unix_table_lock_cmp_fn(const struct lockdep_map *a, 133 const struct lockdep_map *b) 134 { 135 return cmp_ptr(a, b); 136 } 137 138 static int unix_state_lock_cmp_fn(const struct lockdep_map *_a, 139 const struct lockdep_map *_b) 140 { 141 const struct unix_sock *a, *b; 142 143 a = container_of(_a, struct unix_sock, lock.dep_map); 144 b = container_of(_b, struct unix_sock, lock.dep_map); 145 146 if (a->sk.sk_state == TCP_LISTEN) { 147 /* unix_stream_connect(): Before the 2nd unix_state_lock(), 148 * 149 * 1. a is TCP_LISTEN. 150 * 2. b is not a. 151 * 3. concurrent connect(b -> a) must fail. 152 * 153 * Except for 2. & 3., the b's state can be any possible 154 * value due to concurrent connect() or listen(). 155 * 156 * 2. is detected in debug_spin_lock_before(), and 3. cannot 157 * be expressed as lock_cmp_fn. 158 */ 159 switch (b->sk.sk_state) { 160 case TCP_CLOSE: 161 case TCP_ESTABLISHED: 162 case TCP_LISTEN: 163 return -1; 164 default: 165 /* Invalid case. */ 166 return 0; 167 } 168 } 169 170 /* Should never happen. Just to be symmetric. */ 171 if (b->sk.sk_state == TCP_LISTEN) { 172 switch (b->sk.sk_state) { 173 case TCP_CLOSE: 174 case TCP_ESTABLISHED: 175 return 1; 176 default: 177 return 0; 178 } 179 } 180 181 /* unix_state_double_lock(): ascending address order. */ 182 return cmp_ptr(a, b); 183 } 184 185 static int unix_recvq_lock_cmp_fn(const struct lockdep_map *_a, 186 const struct lockdep_map *_b) 187 { 188 const struct sock *a, *b; 189 190 a = container_of(_a, struct sock, sk_receive_queue.lock.dep_map); 191 b = container_of(_b, struct sock, sk_receive_queue.lock.dep_map); 192 193 /* unix_collect_skb(): listener -> embryo order. */ 194 if (a->sk_state == TCP_LISTEN && unix_sk(b)->listener == a) 195 return -1; 196 197 /* Should never happen. Just to be symmetric. 
*/ 198 if (b->sk_state == TCP_LISTEN && unix_sk(a)->listener == b) 199 return 1; 200 201 return 0; 202 } 203 #endif 204 205 static unsigned int unix_unbound_hash(struct sock *sk) 206 { 207 unsigned long hash = (unsigned long)sk; 208 209 hash ^= hash >> 16; 210 hash ^= hash >> 8; 211 hash ^= sk->sk_type; 212 213 return hash & UNIX_HASH_MOD; 214 } 215 216 static unsigned int unix_bsd_hash(struct inode *i) 217 { 218 return i->i_ino & UNIX_HASH_MOD; 219 } 220 221 static unsigned int unix_abstract_hash(struct sockaddr_un *sunaddr, 222 int addr_len, int type) 223 { 224 __wsum csum = csum_partial(sunaddr, addr_len, 0); 225 unsigned int hash; 226 227 hash = (__force unsigned int)csum_fold(csum); 228 hash ^= hash >> 8; 229 hash ^= type; 230 231 return UNIX_HASH_MOD + 1 + (hash & UNIX_HASH_MOD); 232 } 233 234 static void unix_table_double_lock(struct net *net, 235 unsigned int hash1, unsigned int hash2) 236 { 237 if (hash1 == hash2) { 238 spin_lock(&net->unx.table.locks[hash1]); 239 return; 240 } 241 242 if (hash1 > hash2) 243 swap(hash1, hash2); 244 245 spin_lock(&net->unx.table.locks[hash1]); 246 spin_lock(&net->unx.table.locks[hash2]); 247 } 248 249 static void unix_table_double_unlock(struct net *net, 250 unsigned int hash1, unsigned int hash2) 251 { 252 if (hash1 == hash2) { 253 spin_unlock(&net->unx.table.locks[hash1]); 254 return; 255 } 256 257 spin_unlock(&net->unx.table.locks[hash1]); 258 spin_unlock(&net->unx.table.locks[hash2]); 259 } 260 261 #ifdef CONFIG_SECURITY_NETWORK 262 static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb) 263 { 264 UNIXCB(skb).secid = scm->secid; 265 } 266 267 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb) 268 { 269 scm->secid = UNIXCB(skb).secid; 270 } 271 272 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb) 273 { 274 return (scm->secid == UNIXCB(skb).secid); 275 } 276 #else 277 static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb) 278 { } 279 280 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb) 281 { } 282 283 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb) 284 { 285 return true; 286 } 287 #endif /* CONFIG_SECURITY_NETWORK */ 288 289 static inline int unix_may_send(struct sock *sk, struct sock *osk) 290 { 291 return !unix_peer(osk) || unix_peer(osk) == sk; 292 } 293 294 static inline int unix_recvq_full_lockless(const struct sock *sk) 295 { 296 return skb_queue_len_lockless(&sk->sk_receive_queue) > sk->sk_max_ack_backlog; 297 } 298 299 struct sock *unix_peer_get(struct sock *s) 300 { 301 struct sock *peer; 302 303 unix_state_lock(s); 304 peer = unix_peer(s); 305 if (peer) 306 sock_hold(peer); 307 unix_state_unlock(s); 308 return peer; 309 } 310 EXPORT_SYMBOL_GPL(unix_peer_get); 311 312 static struct unix_address *unix_create_addr(struct sockaddr_un *sunaddr, 313 int addr_len) 314 { 315 struct unix_address *addr; 316 317 addr = kmalloc(sizeof(*addr) + addr_len, GFP_KERNEL); 318 if (!addr) 319 return NULL; 320 321 refcount_set(&addr->refcnt, 1); 322 addr->len = addr_len; 323 memcpy(addr->name, sunaddr, addr_len); 324 325 return addr; 326 } 327 328 static inline void unix_release_addr(struct unix_address *addr) 329 { 330 if (refcount_dec_and_test(&addr->refcnt)) 331 kfree(addr); 332 } 333 334 /* 335 * Check unix socket name: 336 * - should be not zero length. 337 * - if started by not zero, should be NULL terminated (FS object) 338 * - if started by zero, it is abstract name. 
339 */ 340 341 static int unix_validate_addr(struct sockaddr_un *sunaddr, int addr_len) 342 { 343 if (addr_len <= offsetof(struct sockaddr_un, sun_path) || 344 addr_len > sizeof(*sunaddr)) 345 return -EINVAL; 346 347 if (sunaddr->sun_family != AF_UNIX) 348 return -EINVAL; 349 350 return 0; 351 } 352 353 static int unix_mkname_bsd(struct sockaddr_un *sunaddr, int addr_len) 354 { 355 struct sockaddr_storage *addr = (struct sockaddr_storage *)sunaddr; 356 short offset = offsetof(struct sockaddr_storage, __data); 357 358 BUILD_BUG_ON(offset != offsetof(struct sockaddr_un, sun_path)); 359 360 /* This may look like an off by one error but it is a bit more 361 * subtle. 108 is the longest valid AF_UNIX path for a binding. 362 * sun_path[108] doesn't as such exist. However in kernel space 363 * we are guaranteed that it is a valid memory location in our 364 * kernel address buffer because syscall functions always pass 365 * a pointer of struct sockaddr_storage which has a bigger buffer 366 * than 108. Also, we must terminate sun_path for strlen() in 367 * getname_kernel(). 368 */ 369 addr->__data[addr_len - offset] = 0; 370 371 /* Don't pass sunaddr->sun_path to strlen(). Otherwise, 108 will 372 * cause panic if CONFIG_FORTIFY_SOURCE=y. Let __fortify_strlen() 373 * know the actual buffer. 374 */ 375 return strlen(addr->__data) + offset + 1; 376 } 377 378 static void __unix_remove_socket(struct sock *sk) 379 { 380 sk_del_node_init(sk); 381 } 382 383 static void __unix_insert_socket(struct net *net, struct sock *sk) 384 { 385 DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk)); 386 sk_add_node(sk, &net->unx.table.buckets[sk->sk_hash]); 387 } 388 389 static void __unix_set_addr_hash(struct net *net, struct sock *sk, 390 struct unix_address *addr, unsigned int hash) 391 { 392 __unix_remove_socket(sk); 393 smp_store_release(&unix_sk(sk)->addr, addr); 394 395 sk->sk_hash = hash; 396 __unix_insert_socket(net, sk); 397 } 398 399 static void unix_remove_socket(struct net *net, struct sock *sk) 400 { 401 spin_lock(&net->unx.table.locks[sk->sk_hash]); 402 __unix_remove_socket(sk); 403 spin_unlock(&net->unx.table.locks[sk->sk_hash]); 404 } 405 406 static void unix_insert_unbound_socket(struct net *net, struct sock *sk) 407 { 408 spin_lock(&net->unx.table.locks[sk->sk_hash]); 409 __unix_insert_socket(net, sk); 410 spin_unlock(&net->unx.table.locks[sk->sk_hash]); 411 } 412 413 static void unix_insert_bsd_socket(struct sock *sk) 414 { 415 spin_lock(&bsd_socket_locks[sk->sk_hash]); 416 sk_add_bind_node(sk, &bsd_socket_buckets[sk->sk_hash]); 417 spin_unlock(&bsd_socket_locks[sk->sk_hash]); 418 } 419 420 static void unix_remove_bsd_socket(struct sock *sk) 421 { 422 if (!hlist_unhashed(&sk->sk_bind_node)) { 423 spin_lock(&bsd_socket_locks[sk->sk_hash]); 424 __sk_del_bind_node(sk); 425 spin_unlock(&bsd_socket_locks[sk->sk_hash]); 426 427 sk_node_init(&sk->sk_bind_node); 428 } 429 } 430 431 static struct sock *__unix_find_socket_byname(struct net *net, 432 struct sockaddr_un *sunname, 433 int len, unsigned int hash) 434 { 435 struct sock *s; 436 437 sk_for_each(s, &net->unx.table.buckets[hash]) { 438 struct unix_sock *u = unix_sk(s); 439 440 if (u->addr->len == len && 441 !memcmp(u->addr->name, sunname, len)) 442 return s; 443 } 444 return NULL; 445 } 446 447 static inline struct sock *unix_find_socket_byname(struct net *net, 448 struct sockaddr_un *sunname, 449 int len, unsigned int hash) 450 { 451 struct sock *s; 452 453 spin_lock(&net->unx.table.locks[hash]); 454 s = __unix_find_socket_byname(net, sunname, len, hash); 455 if 
(s) 456 sock_hold(s); 457 spin_unlock(&net->unx.table.locks[hash]); 458 return s; 459 } 460 461 static struct sock *unix_find_socket_byinode(struct inode *i) 462 { 463 unsigned int hash = unix_bsd_hash(i); 464 struct sock *s; 465 466 spin_lock(&bsd_socket_locks[hash]); 467 sk_for_each_bound(s, &bsd_socket_buckets[hash]) { 468 struct dentry *dentry = unix_sk(s)->path.dentry; 469 470 if (dentry && d_backing_inode(dentry) == i) { 471 sock_hold(s); 472 spin_unlock(&bsd_socket_locks[hash]); 473 return s; 474 } 475 } 476 spin_unlock(&bsd_socket_locks[hash]); 477 return NULL; 478 } 479 480 /* Support code for asymmetrically connected dgram sockets 481 * 482 * If a datagram socket is connected to a socket not itself connected 483 * to the first socket (eg, /dev/log), clients may only enqueue more 484 * messages if the present receive queue of the server socket is not 485 * "too large". This means there's a second writeability condition 486 * poll and sendmsg need to test. The dgram recv code will do a wake 487 * up on the peer_wait wait queue of a socket upon reception of a 488 * datagram which needs to be propagated to sleeping would-be writers 489 * since these might not have sent anything so far. This can't be 490 * accomplished via poll_wait because the lifetime of the server 491 * socket might be less than that of its clients if these break their 492 * association with it or if the server socket is closed while clients 493 * are still connected to it and there's no way to inform "a polling 494 * implementation" that it should let go of a certain wait queue 495 * 496 * In order to propagate a wake up, a wait_queue_entry_t of the client 497 * socket is enqueued on the peer_wait queue of the server socket 498 * whose wake function does a wake_up on the ordinary client socket 499 * wait queue. This connection is established whenever a write (or 500 * poll for write) hit the flow control condition and broken when the 501 * association to the server socket is dissolved or after a wake up 502 * was relayed. 
503 */ 504 505 static int unix_dgram_peer_wake_relay(wait_queue_entry_t *q, unsigned mode, int flags, 506 void *key) 507 { 508 struct unix_sock *u; 509 wait_queue_head_t *u_sleep; 510 511 u = container_of(q, struct unix_sock, peer_wake); 512 513 __remove_wait_queue(&unix_sk(u->peer_wake.private)->peer_wait, 514 q); 515 u->peer_wake.private = NULL; 516 517 /* relaying can only happen while the wq still exists */ 518 u_sleep = sk_sleep(&u->sk); 519 if (u_sleep) 520 wake_up_interruptible_poll(u_sleep, key_to_poll(key)); 521 522 return 0; 523 } 524 525 static int unix_dgram_peer_wake_connect(struct sock *sk, struct sock *other) 526 { 527 struct unix_sock *u, *u_other; 528 int rc; 529 530 u = unix_sk(sk); 531 u_other = unix_sk(other); 532 rc = 0; 533 spin_lock(&u_other->peer_wait.lock); 534 535 if (!u->peer_wake.private) { 536 u->peer_wake.private = other; 537 __add_wait_queue(&u_other->peer_wait, &u->peer_wake); 538 539 rc = 1; 540 } 541 542 spin_unlock(&u_other->peer_wait.lock); 543 return rc; 544 } 545 546 static void unix_dgram_peer_wake_disconnect(struct sock *sk, 547 struct sock *other) 548 { 549 struct unix_sock *u, *u_other; 550 551 u = unix_sk(sk); 552 u_other = unix_sk(other); 553 spin_lock(&u_other->peer_wait.lock); 554 555 if (u->peer_wake.private == other) { 556 __remove_wait_queue(&u_other->peer_wait, &u->peer_wake); 557 u->peer_wake.private = NULL; 558 } 559 560 spin_unlock(&u_other->peer_wait.lock); 561 } 562 563 static void unix_dgram_peer_wake_disconnect_wakeup(struct sock *sk, 564 struct sock *other) 565 { 566 unix_dgram_peer_wake_disconnect(sk, other); 567 wake_up_interruptible_poll(sk_sleep(sk), 568 EPOLLOUT | 569 EPOLLWRNORM | 570 EPOLLWRBAND); 571 } 572 573 /* preconditions: 574 * - unix_peer(sk) == other 575 * - association is stable 576 */ 577 static int unix_dgram_peer_wake_me(struct sock *sk, struct sock *other) 578 { 579 int connected; 580 581 connected = unix_dgram_peer_wake_connect(sk, other); 582 583 /* If other is SOCK_DEAD, we want to make sure we signal 584 * POLLOUT, such that a subsequent write() can get a 585 * -ECONNREFUSED. Otherwise, if we haven't queued any skbs 586 * to other and its full, we will hang waiting for POLLOUT. 587 */ 588 if (unix_recvq_full_lockless(other) && !sock_flag(other, SOCK_DEAD)) 589 return 1; 590 591 if (connected) 592 unix_dgram_peer_wake_disconnect(sk, other); 593 594 return 0; 595 } 596 597 static int unix_writable(const struct sock *sk, unsigned char state) 598 { 599 return state != TCP_LISTEN && 600 (refcount_read(&sk->sk_wmem_alloc) << 2) <= READ_ONCE(sk->sk_sndbuf); 601 } 602 603 static void unix_write_space(struct sock *sk) 604 { 605 struct socket_wq *wq; 606 607 rcu_read_lock(); 608 if (unix_writable(sk, READ_ONCE(sk->sk_state))) { 609 wq = rcu_dereference(sk->sk_wq); 610 if (skwq_has_sleeper(wq)) 611 wake_up_interruptible_sync_poll(&wq->wait, 612 EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND); 613 sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT); 614 } 615 rcu_read_unlock(); 616 } 617 618 /* When dgram socket disconnects (or changes its peer), we clear its receive 619 * queue of packets arrived from previous peer. First, it allows to do 620 * flow control based only on wmem_alloc; second, sk connected to peer 621 * may receive messages only from that peer. 
*/ 622 static void unix_dgram_disconnected(struct sock *sk, struct sock *other) 623 { 624 if (!skb_queue_empty(&sk->sk_receive_queue)) { 625 skb_queue_purge(&sk->sk_receive_queue); 626 wake_up_interruptible_all(&unix_sk(sk)->peer_wait); 627 628 /* If one link of bidirectional dgram pipe is disconnected, 629 * we signal error. Messages are lost. Do not make this, 630 * when peer was not connected to us. 631 */ 632 if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) { 633 WRITE_ONCE(other->sk_err, ECONNRESET); 634 sk_error_report(other); 635 } 636 } 637 } 638 639 static void unix_sock_destructor(struct sock *sk) 640 { 641 struct unix_sock *u = unix_sk(sk); 642 643 skb_queue_purge(&sk->sk_receive_queue); 644 645 DEBUG_NET_WARN_ON_ONCE(refcount_read(&sk->sk_wmem_alloc)); 646 DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk)); 647 DEBUG_NET_WARN_ON_ONCE(sk->sk_socket); 648 if (!sock_flag(sk, SOCK_DEAD)) { 649 pr_info("Attempt to release alive unix socket: %p\n", sk); 650 return; 651 } 652 653 if (u->addr) 654 unix_release_addr(u->addr); 655 656 atomic_long_dec(&unix_nr_socks); 657 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); 658 #ifdef UNIX_REFCNT_DEBUG 659 pr_debug("UNIX %p is destroyed, %ld are still alive.\n", sk, 660 atomic_long_read(&unix_nr_socks)); 661 #endif 662 } 663 664 static void unix_release_sock(struct sock *sk, int embrion) 665 { 666 struct unix_sock *u = unix_sk(sk); 667 struct sock *skpair; 668 struct sk_buff *skb; 669 struct path path; 670 int state; 671 672 unix_remove_socket(sock_net(sk), sk); 673 unix_remove_bsd_socket(sk); 674 675 /* Clear state */ 676 unix_state_lock(sk); 677 sock_orphan(sk); 678 WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK); 679 path = u->path; 680 u->path.dentry = NULL; 681 u->path.mnt = NULL; 682 state = sk->sk_state; 683 WRITE_ONCE(sk->sk_state, TCP_CLOSE); 684 685 skpair = unix_peer(sk); 686 unix_peer(sk) = NULL; 687 688 unix_state_unlock(sk); 689 690 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 691 u->oob_skb = NULL; 692 #endif 693 694 wake_up_interruptible_all(&u->peer_wait); 695 696 if (skpair != NULL) { 697 if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) { 698 unix_state_lock(skpair); 699 /* No more writes */ 700 WRITE_ONCE(skpair->sk_shutdown, SHUTDOWN_MASK); 701 if (!skb_queue_empty_lockless(&sk->sk_receive_queue) || embrion) 702 WRITE_ONCE(skpair->sk_err, ECONNRESET); 703 unix_state_unlock(skpair); 704 skpair->sk_state_change(skpair); 705 sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP); 706 } 707 708 unix_dgram_peer_wake_disconnect(sk, skpair); 709 sock_put(skpair); /* It may now die */ 710 } 711 712 /* Try to flush out this socket. Throw out buffers at least */ 713 714 while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) { 715 if (state == TCP_LISTEN) 716 unix_release_sock(skb->sk, 1); 717 718 /* passed fds are erased in the kfree_skb hook */ 719 kfree_skb(skb); 720 } 721 722 if (path.dentry) 723 path_put(&path); 724 725 sock_put(sk); 726 727 /* ---- Socket is dead now and most probably destroyed ---- */ 728 729 /* 730 * Fixme: BSD difference: In BSD all sockets connected to us get 731 * ECONNRESET and we die on the spot. In Linux we behave 732 * like files and pipes do and wait for the last 733 * dereference. 734 * 735 * Can't we simply set sock->err? 736 * 737 * What the above comment does talk about? 
--ANK(980817) 738 */ 739 740 if (READ_ONCE(unix_tot_inflight)) 741 unix_gc(); /* Garbage collect fds */ 742 } 743 744 static void init_peercred(struct sock *sk) 745 { 746 sk->sk_peer_pid = get_pid(task_tgid(current)); 747 sk->sk_peer_cred = get_current_cred(); 748 } 749 750 static void update_peercred(struct sock *sk) 751 { 752 const struct cred *old_cred; 753 struct pid *old_pid; 754 755 spin_lock(&sk->sk_peer_lock); 756 old_pid = sk->sk_peer_pid; 757 old_cred = sk->sk_peer_cred; 758 init_peercred(sk); 759 spin_unlock(&sk->sk_peer_lock); 760 761 put_pid(old_pid); 762 put_cred(old_cred); 763 } 764 765 static void copy_peercred(struct sock *sk, struct sock *peersk) 766 { 767 lockdep_assert_held(&unix_sk(peersk)->lock); 768 769 spin_lock(&sk->sk_peer_lock); 770 sk->sk_peer_pid = get_pid(peersk->sk_peer_pid); 771 sk->sk_peer_cred = get_cred(peersk->sk_peer_cred); 772 spin_unlock(&sk->sk_peer_lock); 773 } 774 775 static int unix_listen(struct socket *sock, int backlog) 776 { 777 int err; 778 struct sock *sk = sock->sk; 779 struct unix_sock *u = unix_sk(sk); 780 781 err = -EOPNOTSUPP; 782 if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET) 783 goto out; /* Only stream/seqpacket sockets accept */ 784 err = -EINVAL; 785 if (!READ_ONCE(u->addr)) 786 goto out; /* No listens on an unbound socket */ 787 unix_state_lock(sk); 788 if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN) 789 goto out_unlock; 790 if (backlog > sk->sk_max_ack_backlog) 791 wake_up_interruptible_all(&u->peer_wait); 792 sk->sk_max_ack_backlog = backlog; 793 WRITE_ONCE(sk->sk_state, TCP_LISTEN); 794 795 /* set credentials so connect can copy them */ 796 update_peercred(sk); 797 err = 0; 798 799 out_unlock: 800 unix_state_unlock(sk); 801 out: 802 return err; 803 } 804 805 static int unix_release(struct socket *); 806 static int unix_bind(struct socket *, struct sockaddr *, int); 807 static int unix_stream_connect(struct socket *, struct sockaddr *, 808 int addr_len, int flags); 809 static int unix_socketpair(struct socket *, struct socket *); 810 static int unix_accept(struct socket *, struct socket *, struct proto_accept_arg *arg); 811 static int unix_getname(struct socket *, struct sockaddr *, int); 812 static __poll_t unix_poll(struct file *, struct socket *, poll_table *); 813 static __poll_t unix_dgram_poll(struct file *, struct socket *, 814 poll_table *); 815 static int unix_ioctl(struct socket *, unsigned int, unsigned long); 816 #ifdef CONFIG_COMPAT 817 static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg); 818 #endif 819 static int unix_shutdown(struct socket *, int); 820 static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t); 821 static int unix_stream_recvmsg(struct socket *, struct msghdr *, size_t, int); 822 static ssize_t unix_stream_splice_read(struct socket *, loff_t *ppos, 823 struct pipe_inode_info *, size_t size, 824 unsigned int flags); 825 static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t); 826 static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int); 827 static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor); 828 static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor); 829 static int unix_dgram_connect(struct socket *, struct sockaddr *, 830 int, int); 831 static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t); 832 static int unix_seqpacket_recvmsg(struct socket *, struct msghdr *, size_t, 833 int); 834 835 #ifdef CONFIG_PROC_FS 836 static 
int unix_count_nr_fds(struct sock *sk) 837 { 838 struct sk_buff *skb; 839 struct unix_sock *u; 840 int nr_fds = 0; 841 842 spin_lock(&sk->sk_receive_queue.lock); 843 skb = skb_peek(&sk->sk_receive_queue); 844 while (skb) { 845 u = unix_sk(skb->sk); 846 nr_fds += atomic_read(&u->scm_stat.nr_fds); 847 skb = skb_peek_next(skb, &sk->sk_receive_queue); 848 } 849 spin_unlock(&sk->sk_receive_queue.lock); 850 851 return nr_fds; 852 } 853 854 static void unix_show_fdinfo(struct seq_file *m, struct socket *sock) 855 { 856 struct sock *sk = sock->sk; 857 unsigned char s_state; 858 struct unix_sock *u; 859 int nr_fds = 0; 860 861 if (sk) { 862 s_state = READ_ONCE(sk->sk_state); 863 u = unix_sk(sk); 864 865 /* SOCK_STREAM and SOCK_SEQPACKET sockets never change their 866 * sk_state after switching to TCP_ESTABLISHED or TCP_LISTEN. 867 * SOCK_DGRAM is ordinary. So, no lock is needed. 868 */ 869 if (sock->type == SOCK_DGRAM || s_state == TCP_ESTABLISHED) 870 nr_fds = atomic_read(&u->scm_stat.nr_fds); 871 else if (s_state == TCP_LISTEN) 872 nr_fds = unix_count_nr_fds(sk); 873 874 seq_printf(m, "scm_fds: %u\n", nr_fds); 875 } 876 } 877 #else 878 #define unix_show_fdinfo NULL 879 #endif 880 881 static const struct proto_ops unix_stream_ops = { 882 .family = PF_UNIX, 883 .owner = THIS_MODULE, 884 .release = unix_release, 885 .bind = unix_bind, 886 .connect = unix_stream_connect, 887 .socketpair = unix_socketpair, 888 .accept = unix_accept, 889 .getname = unix_getname, 890 .poll = unix_poll, 891 .ioctl = unix_ioctl, 892 #ifdef CONFIG_COMPAT 893 .compat_ioctl = unix_compat_ioctl, 894 #endif 895 .listen = unix_listen, 896 .shutdown = unix_shutdown, 897 .sendmsg = unix_stream_sendmsg, 898 .recvmsg = unix_stream_recvmsg, 899 .read_skb = unix_stream_read_skb, 900 .mmap = sock_no_mmap, 901 .splice_read = unix_stream_splice_read, 902 .set_peek_off = sk_set_peek_off, 903 .show_fdinfo = unix_show_fdinfo, 904 }; 905 906 static const struct proto_ops unix_dgram_ops = { 907 .family = PF_UNIX, 908 .owner = THIS_MODULE, 909 .release = unix_release, 910 .bind = unix_bind, 911 .connect = unix_dgram_connect, 912 .socketpair = unix_socketpair, 913 .accept = sock_no_accept, 914 .getname = unix_getname, 915 .poll = unix_dgram_poll, 916 .ioctl = unix_ioctl, 917 #ifdef CONFIG_COMPAT 918 .compat_ioctl = unix_compat_ioctl, 919 #endif 920 .listen = sock_no_listen, 921 .shutdown = unix_shutdown, 922 .sendmsg = unix_dgram_sendmsg, 923 .read_skb = unix_read_skb, 924 .recvmsg = unix_dgram_recvmsg, 925 .mmap = sock_no_mmap, 926 .set_peek_off = sk_set_peek_off, 927 .show_fdinfo = unix_show_fdinfo, 928 }; 929 930 static const struct proto_ops unix_seqpacket_ops = { 931 .family = PF_UNIX, 932 .owner = THIS_MODULE, 933 .release = unix_release, 934 .bind = unix_bind, 935 .connect = unix_stream_connect, 936 .socketpair = unix_socketpair, 937 .accept = unix_accept, 938 .getname = unix_getname, 939 .poll = unix_dgram_poll, 940 .ioctl = unix_ioctl, 941 #ifdef CONFIG_COMPAT 942 .compat_ioctl = unix_compat_ioctl, 943 #endif 944 .listen = unix_listen, 945 .shutdown = unix_shutdown, 946 .sendmsg = unix_seqpacket_sendmsg, 947 .recvmsg = unix_seqpacket_recvmsg, 948 .mmap = sock_no_mmap, 949 .set_peek_off = sk_set_peek_off, 950 .show_fdinfo = unix_show_fdinfo, 951 }; 952 953 static void unix_close(struct sock *sk, long timeout) 954 { 955 /* Nothing to do here, unix socket does not need a ->close(). 956 * This is merely for sockmap. 
957 */ 958 } 959 960 static void unix_unhash(struct sock *sk) 961 { 962 /* Nothing to do here, unix socket does not need a ->unhash(). 963 * This is merely for sockmap. 964 */ 965 } 966 967 static bool unix_bpf_bypass_getsockopt(int level, int optname) 968 { 969 if (level == SOL_SOCKET) { 970 switch (optname) { 971 case SO_PEERPIDFD: 972 return true; 973 default: 974 return false; 975 } 976 } 977 978 return false; 979 } 980 981 struct proto unix_dgram_proto = { 982 .name = "UNIX", 983 .owner = THIS_MODULE, 984 .obj_size = sizeof(struct unix_sock), 985 .close = unix_close, 986 .bpf_bypass_getsockopt = unix_bpf_bypass_getsockopt, 987 #ifdef CONFIG_BPF_SYSCALL 988 .psock_update_sk_prot = unix_dgram_bpf_update_proto, 989 #endif 990 }; 991 992 struct proto unix_stream_proto = { 993 .name = "UNIX-STREAM", 994 .owner = THIS_MODULE, 995 .obj_size = sizeof(struct unix_sock), 996 .close = unix_close, 997 .unhash = unix_unhash, 998 .bpf_bypass_getsockopt = unix_bpf_bypass_getsockopt, 999 #ifdef CONFIG_BPF_SYSCALL 1000 .psock_update_sk_prot = unix_stream_bpf_update_proto, 1001 #endif 1002 }; 1003 1004 static struct sock *unix_create1(struct net *net, struct socket *sock, int kern, int type) 1005 { 1006 struct unix_sock *u; 1007 struct sock *sk; 1008 int err; 1009 1010 atomic_long_inc(&unix_nr_socks); 1011 if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files()) { 1012 err = -ENFILE; 1013 goto err; 1014 } 1015 1016 if (type == SOCK_STREAM) 1017 sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_stream_proto, kern); 1018 else /*dgram and seqpacket */ 1019 sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_dgram_proto, kern); 1020 1021 if (!sk) { 1022 err = -ENOMEM; 1023 goto err; 1024 } 1025 1026 sock_init_data(sock, sk); 1027 1028 sk->sk_hash = unix_unbound_hash(sk); 1029 sk->sk_allocation = GFP_KERNEL_ACCOUNT; 1030 sk->sk_write_space = unix_write_space; 1031 sk->sk_max_ack_backlog = READ_ONCE(net->unx.sysctl_max_dgram_qlen); 1032 sk->sk_destruct = unix_sock_destructor; 1033 lock_set_cmp_fn(&sk->sk_receive_queue.lock, unix_recvq_lock_cmp_fn, NULL); 1034 1035 u = unix_sk(sk); 1036 u->listener = NULL; 1037 u->vertex = NULL; 1038 u->path.dentry = NULL; 1039 u->path.mnt = NULL; 1040 spin_lock_init(&u->lock); 1041 lock_set_cmp_fn(&u->lock, unix_state_lock_cmp_fn, NULL); 1042 mutex_init(&u->iolock); /* single task reading lock */ 1043 mutex_init(&u->bindlock); /* single task binding lock */ 1044 init_waitqueue_head(&u->peer_wait); 1045 init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay); 1046 memset(&u->scm_stat, 0, sizeof(struct scm_stat)); 1047 unix_insert_unbound_socket(net, sk); 1048 1049 sock_prot_inuse_add(net, sk->sk_prot, 1); 1050 1051 return sk; 1052 1053 err: 1054 atomic_long_dec(&unix_nr_socks); 1055 return ERR_PTR(err); 1056 } 1057 1058 static int unix_create(struct net *net, struct socket *sock, int protocol, 1059 int kern) 1060 { 1061 struct sock *sk; 1062 1063 if (protocol && protocol != PF_UNIX) 1064 return -EPROTONOSUPPORT; 1065 1066 sock->state = SS_UNCONNECTED; 1067 1068 switch (sock->type) { 1069 case SOCK_STREAM: 1070 sock->ops = &unix_stream_ops; 1071 break; 1072 /* 1073 * Believe it or not BSD has AF_UNIX, SOCK_RAW though 1074 * nothing uses it. 
1075 */ 1076 case SOCK_RAW: 1077 sock->type = SOCK_DGRAM; 1078 fallthrough; 1079 case SOCK_DGRAM: 1080 sock->ops = &unix_dgram_ops; 1081 break; 1082 case SOCK_SEQPACKET: 1083 sock->ops = &unix_seqpacket_ops; 1084 break; 1085 default: 1086 return -ESOCKTNOSUPPORT; 1087 } 1088 1089 sk = unix_create1(net, sock, kern, sock->type); 1090 if (IS_ERR(sk)) 1091 return PTR_ERR(sk); 1092 1093 return 0; 1094 } 1095 1096 static int unix_release(struct socket *sock) 1097 { 1098 struct sock *sk = sock->sk; 1099 1100 if (!sk) 1101 return 0; 1102 1103 sk->sk_prot->close(sk, 0); 1104 unix_release_sock(sk, 0); 1105 sock->sk = NULL; 1106 1107 return 0; 1108 } 1109 1110 static struct sock *unix_find_bsd(struct sockaddr_un *sunaddr, int addr_len, 1111 int type) 1112 { 1113 struct inode *inode; 1114 struct path path; 1115 struct sock *sk; 1116 int err; 1117 1118 unix_mkname_bsd(sunaddr, addr_len); 1119 err = kern_path(sunaddr->sun_path, LOOKUP_FOLLOW, &path); 1120 if (err) 1121 goto fail; 1122 1123 err = path_permission(&path, MAY_WRITE); 1124 if (err) 1125 goto path_put; 1126 1127 err = -ECONNREFUSED; 1128 inode = d_backing_inode(path.dentry); 1129 if (!S_ISSOCK(inode->i_mode)) 1130 goto path_put; 1131 1132 sk = unix_find_socket_byinode(inode); 1133 if (!sk) 1134 goto path_put; 1135 1136 err = -EPROTOTYPE; 1137 if (sk->sk_type == type) 1138 touch_atime(&path); 1139 else 1140 goto sock_put; 1141 1142 path_put(&path); 1143 1144 return sk; 1145 1146 sock_put: 1147 sock_put(sk); 1148 path_put: 1149 path_put(&path); 1150 fail: 1151 return ERR_PTR(err); 1152 } 1153 1154 static struct sock *unix_find_abstract(struct net *net, 1155 struct sockaddr_un *sunaddr, 1156 int addr_len, int type) 1157 { 1158 unsigned int hash = unix_abstract_hash(sunaddr, addr_len, type); 1159 struct dentry *dentry; 1160 struct sock *sk; 1161 1162 sk = unix_find_socket_byname(net, sunaddr, addr_len, hash); 1163 if (!sk) 1164 return ERR_PTR(-ECONNREFUSED); 1165 1166 dentry = unix_sk(sk)->path.dentry; 1167 if (dentry) 1168 touch_atime(&unix_sk(sk)->path); 1169 1170 return sk; 1171 } 1172 1173 static struct sock *unix_find_other(struct net *net, 1174 struct sockaddr_un *sunaddr, 1175 int addr_len, int type) 1176 { 1177 struct sock *sk; 1178 1179 if (sunaddr->sun_path[0]) 1180 sk = unix_find_bsd(sunaddr, addr_len, type); 1181 else 1182 sk = unix_find_abstract(net, sunaddr, addr_len, type); 1183 1184 return sk; 1185 } 1186 1187 static int unix_autobind(struct sock *sk) 1188 { 1189 struct unix_sock *u = unix_sk(sk); 1190 unsigned int new_hash, old_hash; 1191 struct net *net = sock_net(sk); 1192 struct unix_address *addr; 1193 u32 lastnum, ordernum; 1194 int err; 1195 1196 err = mutex_lock_interruptible(&u->bindlock); 1197 if (err) 1198 return err; 1199 1200 if (u->addr) 1201 goto out; 1202 1203 err = -ENOMEM; 1204 addr = kzalloc(sizeof(*addr) + 1205 offsetof(struct sockaddr_un, sun_path) + 16, GFP_KERNEL); 1206 if (!addr) 1207 goto out; 1208 1209 addr->len = offsetof(struct sockaddr_un, sun_path) + 6; 1210 addr->name->sun_family = AF_UNIX; 1211 refcount_set(&addr->refcnt, 1); 1212 1213 old_hash = sk->sk_hash; 1214 ordernum = get_random_u32(); 1215 lastnum = ordernum & 0xFFFFF; 1216 retry: 1217 ordernum = (ordernum + 1) & 0xFFFFF; 1218 sprintf(addr->name->sun_path + 1, "%05x", ordernum); 1219 1220 new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type); 1221 unix_table_double_lock(net, old_hash, new_hash); 1222 1223 if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash)) { 1224 unix_table_double_unlock(net, old_hash, new_hash); 
1225 1226 /* __unix_find_socket_byname() may take long time if many names 1227 * are already in use. 1228 */ 1229 cond_resched(); 1230 1231 if (ordernum == lastnum) { 1232 /* Give up if all names seems to be in use. */ 1233 err = -ENOSPC; 1234 unix_release_addr(addr); 1235 goto out; 1236 } 1237 1238 goto retry; 1239 } 1240 1241 __unix_set_addr_hash(net, sk, addr, new_hash); 1242 unix_table_double_unlock(net, old_hash, new_hash); 1243 err = 0; 1244 1245 out: mutex_unlock(&u->bindlock); 1246 return err; 1247 } 1248 1249 static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr, 1250 int addr_len) 1251 { 1252 umode_t mode = S_IFSOCK | 1253 (SOCK_INODE(sk->sk_socket)->i_mode & ~current_umask()); 1254 struct unix_sock *u = unix_sk(sk); 1255 unsigned int new_hash, old_hash; 1256 struct net *net = sock_net(sk); 1257 struct mnt_idmap *idmap; 1258 struct unix_address *addr; 1259 struct dentry *dentry; 1260 struct path parent; 1261 int err; 1262 1263 addr_len = unix_mkname_bsd(sunaddr, addr_len); 1264 addr = unix_create_addr(sunaddr, addr_len); 1265 if (!addr) 1266 return -ENOMEM; 1267 1268 /* 1269 * Get the parent directory, calculate the hash for last 1270 * component. 1271 */ 1272 dentry = kern_path_create(AT_FDCWD, addr->name->sun_path, &parent, 0); 1273 if (IS_ERR(dentry)) { 1274 err = PTR_ERR(dentry); 1275 goto out; 1276 } 1277 1278 /* 1279 * All right, let's create it. 1280 */ 1281 idmap = mnt_idmap(parent.mnt); 1282 err = security_path_mknod(&parent, dentry, mode, 0); 1283 if (!err) 1284 err = vfs_mknod(idmap, d_inode(parent.dentry), dentry, mode, 0); 1285 if (err) 1286 goto out_path; 1287 err = mutex_lock_interruptible(&u->bindlock); 1288 if (err) 1289 goto out_unlink; 1290 if (u->addr) 1291 goto out_unlock; 1292 1293 old_hash = sk->sk_hash; 1294 new_hash = unix_bsd_hash(d_backing_inode(dentry)); 1295 unix_table_double_lock(net, old_hash, new_hash); 1296 u->path.mnt = mntget(parent.mnt); 1297 u->path.dentry = dget(dentry); 1298 __unix_set_addr_hash(net, sk, addr, new_hash); 1299 unix_table_double_unlock(net, old_hash, new_hash); 1300 unix_insert_bsd_socket(sk); 1301 mutex_unlock(&u->bindlock); 1302 done_path_create(&parent, dentry); 1303 return 0; 1304 1305 out_unlock: 1306 mutex_unlock(&u->bindlock); 1307 err = -EINVAL; 1308 out_unlink: 1309 /* failed after successful mknod? unlink what we'd created... */ 1310 vfs_unlink(idmap, d_inode(parent.dentry), dentry, NULL); 1311 out_path: 1312 done_path_create(&parent, dentry); 1313 out: 1314 unix_release_addr(addr); 1315 return err == -EEXIST ? 
-EADDRINUSE : err; 1316 } 1317 1318 static int unix_bind_abstract(struct sock *sk, struct sockaddr_un *sunaddr, 1319 int addr_len) 1320 { 1321 struct unix_sock *u = unix_sk(sk); 1322 unsigned int new_hash, old_hash; 1323 struct net *net = sock_net(sk); 1324 struct unix_address *addr; 1325 int err; 1326 1327 addr = unix_create_addr(sunaddr, addr_len); 1328 if (!addr) 1329 return -ENOMEM; 1330 1331 err = mutex_lock_interruptible(&u->bindlock); 1332 if (err) 1333 goto out; 1334 1335 if (u->addr) { 1336 err = -EINVAL; 1337 goto out_mutex; 1338 } 1339 1340 old_hash = sk->sk_hash; 1341 new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type); 1342 unix_table_double_lock(net, old_hash, new_hash); 1343 1344 if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash)) 1345 goto out_spin; 1346 1347 __unix_set_addr_hash(net, sk, addr, new_hash); 1348 unix_table_double_unlock(net, old_hash, new_hash); 1349 mutex_unlock(&u->bindlock); 1350 return 0; 1351 1352 out_spin: 1353 unix_table_double_unlock(net, old_hash, new_hash); 1354 err = -EADDRINUSE; 1355 out_mutex: 1356 mutex_unlock(&u->bindlock); 1357 out: 1358 unix_release_addr(addr); 1359 return err; 1360 } 1361 1362 static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) 1363 { 1364 struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr; 1365 struct sock *sk = sock->sk; 1366 int err; 1367 1368 if (addr_len == offsetof(struct sockaddr_un, sun_path) && 1369 sunaddr->sun_family == AF_UNIX) 1370 return unix_autobind(sk); 1371 1372 err = unix_validate_addr(sunaddr, addr_len); 1373 if (err) 1374 return err; 1375 1376 if (sunaddr->sun_path[0]) 1377 err = unix_bind_bsd(sk, sunaddr, addr_len); 1378 else 1379 err = unix_bind_abstract(sk, sunaddr, addr_len); 1380 1381 return err; 1382 } 1383 1384 static void unix_state_double_lock(struct sock *sk1, struct sock *sk2) 1385 { 1386 if (unlikely(sk1 == sk2) || !sk2) { 1387 unix_state_lock(sk1); 1388 return; 1389 } 1390 1391 if (sk1 > sk2) 1392 swap(sk1, sk2); 1393 1394 unix_state_lock(sk1); 1395 unix_state_lock(sk2); 1396 } 1397 1398 static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2) 1399 { 1400 if (unlikely(sk1 == sk2) || !sk2) { 1401 unix_state_unlock(sk1); 1402 return; 1403 } 1404 unix_state_unlock(sk1); 1405 unix_state_unlock(sk2); 1406 } 1407 1408 static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr, 1409 int alen, int flags) 1410 { 1411 struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr; 1412 struct sock *sk = sock->sk; 1413 struct sock *other; 1414 int err; 1415 1416 err = -EINVAL; 1417 if (alen < offsetofend(struct sockaddr, sa_family)) 1418 goto out; 1419 1420 if (addr->sa_family != AF_UNSPEC) { 1421 err = unix_validate_addr(sunaddr, alen); 1422 if (err) 1423 goto out; 1424 1425 err = BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk, addr, &alen); 1426 if (err) 1427 goto out; 1428 1429 if ((test_bit(SOCK_PASSCRED, &sock->flags) || 1430 test_bit(SOCK_PASSPIDFD, &sock->flags)) && 1431 !READ_ONCE(unix_sk(sk)->addr)) { 1432 err = unix_autobind(sk); 1433 if (err) 1434 goto out; 1435 } 1436 1437 restart: 1438 other = unix_find_other(sock_net(sk), sunaddr, alen, sock->type); 1439 if (IS_ERR(other)) { 1440 err = PTR_ERR(other); 1441 goto out; 1442 } 1443 1444 unix_state_double_lock(sk, other); 1445 1446 /* Apparently VFS overslept socket death. Retry. 
*/ 1447 if (sock_flag(other, SOCK_DEAD)) { 1448 unix_state_double_unlock(sk, other); 1449 sock_put(other); 1450 goto restart; 1451 } 1452 1453 err = -EPERM; 1454 if (!unix_may_send(sk, other)) 1455 goto out_unlock; 1456 1457 err = security_unix_may_send(sk->sk_socket, other->sk_socket); 1458 if (err) 1459 goto out_unlock; 1460 1461 WRITE_ONCE(sk->sk_state, TCP_ESTABLISHED); 1462 WRITE_ONCE(other->sk_state, TCP_ESTABLISHED); 1463 } else { 1464 /* 1465 * 1003.1g breaking connected state with AF_UNSPEC 1466 */ 1467 other = NULL; 1468 unix_state_double_lock(sk, other); 1469 } 1470 1471 /* 1472 * If it was connected, reconnect. 1473 */ 1474 if (unix_peer(sk)) { 1475 struct sock *old_peer = unix_peer(sk); 1476 1477 unix_peer(sk) = other; 1478 if (!other) 1479 WRITE_ONCE(sk->sk_state, TCP_CLOSE); 1480 unix_dgram_peer_wake_disconnect_wakeup(sk, old_peer); 1481 1482 unix_state_double_unlock(sk, other); 1483 1484 if (other != old_peer) { 1485 unix_dgram_disconnected(sk, old_peer); 1486 1487 unix_state_lock(old_peer); 1488 if (!unix_peer(old_peer)) 1489 WRITE_ONCE(old_peer->sk_state, TCP_CLOSE); 1490 unix_state_unlock(old_peer); 1491 } 1492 1493 sock_put(old_peer); 1494 } else { 1495 unix_peer(sk) = other; 1496 unix_state_double_unlock(sk, other); 1497 } 1498 1499 return 0; 1500 1501 out_unlock: 1502 unix_state_double_unlock(sk, other); 1503 sock_put(other); 1504 out: 1505 return err; 1506 } 1507 1508 static long unix_wait_for_peer(struct sock *other, long timeo) 1509 __releases(&unix_sk(other)->lock) 1510 { 1511 struct unix_sock *u = unix_sk(other); 1512 int sched; 1513 DEFINE_WAIT(wait); 1514 1515 prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE); 1516 1517 sched = !sock_flag(other, SOCK_DEAD) && 1518 !(other->sk_shutdown & RCV_SHUTDOWN) && 1519 unix_recvq_full_lockless(other); 1520 1521 unix_state_unlock(other); 1522 1523 if (sched) 1524 timeo = schedule_timeout(timeo); 1525 1526 finish_wait(&u->peer_wait, &wait); 1527 return timeo; 1528 } 1529 1530 static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr, 1531 int addr_len, int flags) 1532 { 1533 struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr; 1534 struct sock *sk = sock->sk, *newsk = NULL, *other = NULL; 1535 struct unix_sock *u = unix_sk(sk), *newu, *otheru; 1536 struct net *net = sock_net(sk); 1537 struct sk_buff *skb = NULL; 1538 unsigned char state; 1539 long timeo; 1540 int err; 1541 1542 err = unix_validate_addr(sunaddr, addr_len); 1543 if (err) 1544 goto out; 1545 1546 err = BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk, uaddr, &addr_len); 1547 if (err) 1548 goto out; 1549 1550 if ((test_bit(SOCK_PASSCRED, &sock->flags) || 1551 test_bit(SOCK_PASSPIDFD, &sock->flags)) && 1552 !READ_ONCE(u->addr)) { 1553 err = unix_autobind(sk); 1554 if (err) 1555 goto out; 1556 } 1557 1558 timeo = sock_sndtimeo(sk, flags & O_NONBLOCK); 1559 1560 /* First of all allocate resources. 1561 * If we will make it after state is locked, 1562 * we will have to recheck all again in any case. 1563 */ 1564 1565 /* create new sock for complete connection */ 1566 newsk = unix_create1(net, NULL, 0, sock->type); 1567 if (IS_ERR(newsk)) { 1568 err = PTR_ERR(newsk); 1569 goto out; 1570 } 1571 1572 /* Allocate skb for sending to listening sock */ 1573 skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL); 1574 if (!skb) { 1575 err = -ENOMEM; 1576 goto out_free_sk; 1577 } 1578 1579 restart: 1580 /* Find listening sock. 
*/ 1581 other = unix_find_other(net, sunaddr, addr_len, sk->sk_type); 1582 if (IS_ERR(other)) { 1583 err = PTR_ERR(other); 1584 goto out_free_skb; 1585 } 1586 1587 unix_state_lock(other); 1588 1589 /* Apparently VFS overslept socket death. Retry. */ 1590 if (sock_flag(other, SOCK_DEAD)) { 1591 unix_state_unlock(other); 1592 sock_put(other); 1593 goto restart; 1594 } 1595 1596 if (other->sk_state != TCP_LISTEN || 1597 other->sk_shutdown & RCV_SHUTDOWN) { 1598 err = -ECONNREFUSED; 1599 goto out_unlock; 1600 } 1601 1602 if (unix_recvq_full_lockless(other)) { 1603 if (!timeo) { 1604 err = -EAGAIN; 1605 goto out_unlock; 1606 } 1607 1608 timeo = unix_wait_for_peer(other, timeo); 1609 sock_put(other); 1610 1611 err = sock_intr_errno(timeo); 1612 if (signal_pending(current)) 1613 goto out_free_skb; 1614 1615 goto restart; 1616 } 1617 1618 /* self connect and simultaneous connect are eliminated 1619 * by rejecting TCP_LISTEN socket to avoid deadlock. 1620 */ 1621 state = READ_ONCE(sk->sk_state); 1622 if (unlikely(state != TCP_CLOSE)) { 1623 err = state == TCP_ESTABLISHED ? -EISCONN : -EINVAL; 1624 goto out_unlock; 1625 } 1626 1627 unix_state_lock(sk); 1628 1629 if (unlikely(sk->sk_state != TCP_CLOSE)) { 1630 err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EINVAL; 1631 unix_state_unlock(sk); 1632 goto out_unlock; 1633 } 1634 1635 err = security_unix_stream_connect(sk, other, newsk); 1636 if (err) { 1637 unix_state_unlock(sk); 1638 goto out_unlock; 1639 } 1640 1641 /* The way is open! Fastly set all the necessary fields... */ 1642 1643 sock_hold(sk); 1644 unix_peer(newsk) = sk; 1645 newsk->sk_state = TCP_ESTABLISHED; 1646 newsk->sk_type = sk->sk_type; 1647 init_peercred(newsk); 1648 newu = unix_sk(newsk); 1649 newu->listener = other; 1650 RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq); 1651 otheru = unix_sk(other); 1652 1653 /* copy address information from listening to new sock 1654 * 1655 * The contents of *(otheru->addr) and otheru->path 1656 * are seen fully set up here, since we have found 1657 * otheru in hash under its lock. Insertion into the 1658 * hash chain we'd found it in had been done in an 1659 * earlier critical area protected by the chain's lock, 1660 * the same one where we'd set *(otheru->addr) contents, 1661 * as well as otheru->path and otheru->addr itself. 1662 * 1663 * Using smp_store_release() here to set newu->addr 1664 * is enough to make those stores, as well as stores 1665 * to newu->path visible to anyone who gets newu->addr 1666 * by smp_load_acquire(). IOW, the same warranties 1667 * as for unix_sock instances bound in unix_bind() or 1668 * in unix_autobind(). 
1669 */ 1670 if (otheru->path.dentry) { 1671 path_get(&otheru->path); 1672 newu->path = otheru->path; 1673 } 1674 refcount_inc(&otheru->addr->refcnt); 1675 smp_store_release(&newu->addr, otheru->addr); 1676 1677 /* Set credentials */ 1678 copy_peercred(sk, other); 1679 1680 sock->state = SS_CONNECTED; 1681 WRITE_ONCE(sk->sk_state, TCP_ESTABLISHED); 1682 sock_hold(newsk); 1683 1684 smp_mb__after_atomic(); /* sock_hold() does an atomic_inc() */ 1685 unix_peer(sk) = newsk; 1686 1687 unix_state_unlock(sk); 1688 1689 /* take ten and send info to listening sock */ 1690 spin_lock(&other->sk_receive_queue.lock); 1691 __skb_queue_tail(&other->sk_receive_queue, skb); 1692 spin_unlock(&other->sk_receive_queue.lock); 1693 unix_state_unlock(other); 1694 other->sk_data_ready(other); 1695 sock_put(other); 1696 return 0; 1697 1698 out_unlock: 1699 unix_state_unlock(other); 1700 sock_put(other); 1701 out_free_skb: 1702 kfree_skb(skb); 1703 out_free_sk: 1704 unix_release_sock(newsk, 0); 1705 out: 1706 return err; 1707 } 1708 1709 static int unix_socketpair(struct socket *socka, struct socket *sockb) 1710 { 1711 struct sock *ska = socka->sk, *skb = sockb->sk; 1712 1713 /* Join our sockets back to back */ 1714 sock_hold(ska); 1715 sock_hold(skb); 1716 unix_peer(ska) = skb; 1717 unix_peer(skb) = ska; 1718 init_peercred(ska); 1719 init_peercred(skb); 1720 1721 ska->sk_state = TCP_ESTABLISHED; 1722 skb->sk_state = TCP_ESTABLISHED; 1723 socka->state = SS_CONNECTED; 1724 sockb->state = SS_CONNECTED; 1725 return 0; 1726 } 1727 1728 static void unix_sock_inherit_flags(const struct socket *old, 1729 struct socket *new) 1730 { 1731 if (test_bit(SOCK_PASSCRED, &old->flags)) 1732 set_bit(SOCK_PASSCRED, &new->flags); 1733 if (test_bit(SOCK_PASSPIDFD, &old->flags)) 1734 set_bit(SOCK_PASSPIDFD, &new->flags); 1735 if (test_bit(SOCK_PASSSEC, &old->flags)) 1736 set_bit(SOCK_PASSSEC, &new->flags); 1737 } 1738 1739 static int unix_accept(struct socket *sock, struct socket *newsock, 1740 struct proto_accept_arg *arg) 1741 { 1742 struct sock *sk = sock->sk; 1743 struct sk_buff *skb; 1744 struct sock *tsk; 1745 1746 arg->err = -EOPNOTSUPP; 1747 if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET) 1748 goto out; 1749 1750 arg->err = -EINVAL; 1751 if (READ_ONCE(sk->sk_state) != TCP_LISTEN) 1752 goto out; 1753 1754 /* If socket state is TCP_LISTEN it cannot change (for now...), 1755 * so that no locks are necessary. 1756 */ 1757 1758 skb = skb_recv_datagram(sk, (arg->flags & O_NONBLOCK) ? MSG_DONTWAIT : 0, 1759 &arg->err); 1760 if (!skb) { 1761 /* This means receive shutdown. 
*/ 1762 if (arg->err == 0) 1763 arg->err = -EINVAL; 1764 goto out; 1765 } 1766 1767 tsk = skb->sk; 1768 skb_free_datagram(sk, skb); 1769 wake_up_interruptible(&unix_sk(sk)->peer_wait); 1770 1771 /* attach accepted sock to socket */ 1772 unix_state_lock(tsk); 1773 unix_update_edges(unix_sk(tsk)); 1774 newsock->state = SS_CONNECTED; 1775 unix_sock_inherit_flags(sock, newsock); 1776 sock_graft(tsk, newsock); 1777 unix_state_unlock(tsk); 1778 return 0; 1779 1780 out: 1781 return arg->err; 1782 } 1783 1784 1785 static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int peer) 1786 { 1787 struct sock *sk = sock->sk; 1788 struct unix_address *addr; 1789 DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr); 1790 int err = 0; 1791 1792 if (peer) { 1793 sk = unix_peer_get(sk); 1794 1795 err = -ENOTCONN; 1796 if (!sk) 1797 goto out; 1798 err = 0; 1799 } else { 1800 sock_hold(sk); 1801 } 1802 1803 addr = smp_load_acquire(&unix_sk(sk)->addr); 1804 if (!addr) { 1805 sunaddr->sun_family = AF_UNIX; 1806 sunaddr->sun_path[0] = 0; 1807 err = offsetof(struct sockaddr_un, sun_path); 1808 } else { 1809 err = addr->len; 1810 memcpy(sunaddr, addr->name, addr->len); 1811 1812 if (peer) 1813 BPF_CGROUP_RUN_SA_PROG(sk, uaddr, &err, 1814 CGROUP_UNIX_GETPEERNAME); 1815 else 1816 BPF_CGROUP_RUN_SA_PROG(sk, uaddr, &err, 1817 CGROUP_UNIX_GETSOCKNAME); 1818 } 1819 sock_put(sk); 1820 out: 1821 return err; 1822 } 1823 1824 /* The "user->unix_inflight" variable is protected by the garbage 1825 * collection lock, and we just read it locklessly here. If you go 1826 * over the limit, there might be a tiny race in actually noticing 1827 * it across threads. Tough. 1828 */ 1829 static inline bool too_many_unix_fds(struct task_struct *p) 1830 { 1831 struct user_struct *user = current_user(); 1832 1833 if (unlikely(READ_ONCE(user->unix_inflight) > task_rlimit(p, RLIMIT_NOFILE))) 1834 return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN); 1835 return false; 1836 } 1837 1838 static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb) 1839 { 1840 if (too_many_unix_fds(current)) 1841 return -ETOOMANYREFS; 1842 1843 UNIXCB(skb).fp = scm->fp; 1844 scm->fp = NULL; 1845 1846 if (unix_prepare_fpl(UNIXCB(skb).fp)) 1847 return -ENOMEM; 1848 1849 return 0; 1850 } 1851 1852 static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb) 1853 { 1854 scm->fp = UNIXCB(skb).fp; 1855 UNIXCB(skb).fp = NULL; 1856 1857 unix_destroy_fpl(scm->fp); 1858 } 1859 1860 static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb) 1861 { 1862 scm->fp = scm_fp_dup(UNIXCB(skb).fp); 1863 } 1864 1865 static void unix_destruct_scm(struct sk_buff *skb) 1866 { 1867 struct scm_cookie scm; 1868 1869 memset(&scm, 0, sizeof(scm)); 1870 scm.pid = UNIXCB(skb).pid; 1871 if (UNIXCB(skb).fp) 1872 unix_detach_fds(&scm, skb); 1873 1874 /* Alas, it calls VFS */ 1875 /* So fscking what? 
fput() had been SMP-safe since the last Summer */ 1876 scm_destroy(&scm); 1877 sock_wfree(skb); 1878 } 1879 1880 static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds) 1881 { 1882 int err = 0; 1883 1884 UNIXCB(skb).pid = get_pid(scm->pid); 1885 UNIXCB(skb).uid = scm->creds.uid; 1886 UNIXCB(skb).gid = scm->creds.gid; 1887 UNIXCB(skb).fp = NULL; 1888 unix_get_secdata(scm, skb); 1889 if (scm->fp && send_fds) 1890 err = unix_attach_fds(scm, skb); 1891 1892 skb->destructor = unix_destruct_scm; 1893 return err; 1894 } 1895 1896 static bool unix_passcred_enabled(const struct socket *sock, 1897 const struct sock *other) 1898 { 1899 return test_bit(SOCK_PASSCRED, &sock->flags) || 1900 test_bit(SOCK_PASSPIDFD, &sock->flags) || 1901 !other->sk_socket || 1902 test_bit(SOCK_PASSCRED, &other->sk_socket->flags) || 1903 test_bit(SOCK_PASSPIDFD, &other->sk_socket->flags); 1904 } 1905 1906 /* 1907 * Some apps rely on write() giving SCM_CREDENTIALS 1908 * We include credentials if source or destination socket 1909 * asserted SOCK_PASSCRED. 1910 */ 1911 static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock, 1912 const struct sock *other) 1913 { 1914 if (UNIXCB(skb).pid) 1915 return; 1916 if (unix_passcred_enabled(sock, other)) { 1917 UNIXCB(skb).pid = get_pid(task_tgid(current)); 1918 current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid); 1919 } 1920 } 1921 1922 static bool unix_skb_scm_eq(struct sk_buff *skb, 1923 struct scm_cookie *scm) 1924 { 1925 return UNIXCB(skb).pid == scm->pid && 1926 uid_eq(UNIXCB(skb).uid, scm->creds.uid) && 1927 gid_eq(UNIXCB(skb).gid, scm->creds.gid) && 1928 unix_secdata_eq(scm, skb); 1929 } 1930 1931 static void scm_stat_add(struct sock *sk, struct sk_buff *skb) 1932 { 1933 struct scm_fp_list *fp = UNIXCB(skb).fp; 1934 struct unix_sock *u = unix_sk(sk); 1935 1936 if (unlikely(fp && fp->count)) { 1937 atomic_add(fp->count, &u->scm_stat.nr_fds); 1938 unix_add_edges(fp, u); 1939 } 1940 } 1941 1942 static void scm_stat_del(struct sock *sk, struct sk_buff *skb) 1943 { 1944 struct scm_fp_list *fp = UNIXCB(skb).fp; 1945 struct unix_sock *u = unix_sk(sk); 1946 1947 if (unlikely(fp && fp->count)) { 1948 atomic_sub(fp->count, &u->scm_stat.nr_fds); 1949 unix_del_edges(fp); 1950 } 1951 } 1952 1953 /* 1954 * Send AF_UNIX data. 
1955 */ 1956 1957 static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg, 1958 size_t len) 1959 { 1960 struct sock *sk = sock->sk, *other = NULL; 1961 struct unix_sock *u = unix_sk(sk); 1962 struct scm_cookie scm; 1963 struct sk_buff *skb; 1964 int data_len = 0; 1965 int sk_locked; 1966 long timeo; 1967 int err; 1968 1969 err = scm_send(sock, msg, &scm, false); 1970 if (err < 0) 1971 return err; 1972 1973 wait_for_unix_gc(scm.fp); 1974 1975 if (msg->msg_flags & MSG_OOB) { 1976 err = -EOPNOTSUPP; 1977 goto out; 1978 } 1979 1980 if (msg->msg_namelen) { 1981 err = unix_validate_addr(msg->msg_name, msg->msg_namelen); 1982 if (err) 1983 goto out; 1984 1985 err = BPF_CGROUP_RUN_PROG_UNIX_SENDMSG_LOCK(sk, 1986 msg->msg_name, 1987 &msg->msg_namelen, 1988 NULL); 1989 if (err) 1990 goto out; 1991 } 1992 1993 if ((test_bit(SOCK_PASSCRED, &sock->flags) || 1994 test_bit(SOCK_PASSPIDFD, &sock->flags)) && 1995 !READ_ONCE(u->addr)) { 1996 err = unix_autobind(sk); 1997 if (err) 1998 goto out; 1999 } 2000 2001 if (len > READ_ONCE(sk->sk_sndbuf) - 32) { 2002 err = -EMSGSIZE; 2003 goto out; 2004 } 2005 2006 if (len > SKB_MAX_ALLOC) { 2007 data_len = min_t(size_t, 2008 len - SKB_MAX_ALLOC, 2009 MAX_SKB_FRAGS * PAGE_SIZE); 2010 data_len = PAGE_ALIGN(data_len); 2011 2012 BUILD_BUG_ON(SKB_MAX_ALLOC < PAGE_SIZE); 2013 } 2014 2015 skb = sock_alloc_send_pskb(sk, len - data_len, data_len, 2016 msg->msg_flags & MSG_DONTWAIT, &err, 2017 PAGE_ALLOC_COSTLY_ORDER); 2018 if (!skb) 2019 goto out; 2020 2021 err = unix_scm_to_skb(&scm, skb, true); 2022 if (err < 0) 2023 goto out_free; 2024 2025 skb_put(skb, len - data_len); 2026 skb->data_len = data_len; 2027 skb->len = len; 2028 err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, len); 2029 if (err) 2030 goto out_free; 2031 2032 timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); 2033 2034 if (msg->msg_namelen) { 2035 lookup: 2036 other = unix_find_other(sock_net(sk), msg->msg_name, 2037 msg->msg_namelen, sk->sk_type); 2038 if (IS_ERR(other)) { 2039 err = PTR_ERR(other); 2040 goto out_free; 2041 } 2042 } else { 2043 other = unix_peer_get(sk); 2044 if (!other) { 2045 err = -ENOTCONN; 2046 goto out_free; 2047 } 2048 } 2049 2050 if (sk_filter(other, skb) < 0) { 2051 /* Toss the packet but do not return any error to the sender */ 2052 err = len; 2053 goto out_sock_put; 2054 } 2055 2056 restart: 2057 sk_locked = 0; 2058 unix_state_lock(other); 2059 restart_locked: 2060 2061 if (!unix_may_send(sk, other)) { 2062 err = -EPERM; 2063 goto out_unlock; 2064 } 2065 2066 if (unlikely(sock_flag(other, SOCK_DEAD))) { 2067 /* Check with 1003.1g - what should datagram error */ 2068 2069 unix_state_unlock(other); 2070 2071 if (sk->sk_type == SOCK_SEQPACKET) { 2072 /* We are here only when racing with unix_release_sock() 2073 * is clearing @other. Never change state to TCP_CLOSE 2074 * unlike SOCK_DGRAM wants. 
2075 */ 2076 err = -EPIPE; 2077 goto out_sock_put; 2078 } 2079 2080 if (!sk_locked) 2081 unix_state_lock(sk); 2082 2083 if (unix_peer(sk) == other) { 2084 unix_peer(sk) = NULL; 2085 unix_dgram_peer_wake_disconnect_wakeup(sk, other); 2086 2087 WRITE_ONCE(sk->sk_state, TCP_CLOSE); 2088 unix_state_unlock(sk); 2089 2090 unix_dgram_disconnected(sk, other); 2091 sock_put(other); 2092 err = -ECONNREFUSED; 2093 goto out_sock_put; 2094 } 2095 2096 unix_state_unlock(sk); 2097 2098 if (!msg->msg_namelen) { 2099 err = -ECONNRESET; 2100 goto out_sock_put; 2101 } 2102 2103 goto lookup; 2104 } 2105 2106 if (other->sk_shutdown & RCV_SHUTDOWN) { 2107 err = -EPIPE; 2108 goto out_unlock; 2109 } 2110 2111 if (sk->sk_type != SOCK_SEQPACKET) { 2112 err = security_unix_may_send(sk->sk_socket, other->sk_socket); 2113 if (err) 2114 goto out_unlock; 2115 } 2116 2117 /* other == sk && unix_peer(other) != sk if 2118 * - unix_peer(sk) == NULL, destination address bound to sk 2119 * - unix_peer(sk) == sk by time of get but disconnected before lock 2120 */ 2121 if (other != sk && 2122 unlikely(unix_peer(other) != sk && 2123 unix_recvq_full_lockless(other))) { 2124 if (timeo) { 2125 timeo = unix_wait_for_peer(other, timeo); 2126 2127 err = sock_intr_errno(timeo); 2128 if (signal_pending(current)) 2129 goto out_sock_put; 2130 2131 goto restart; 2132 } 2133 2134 if (!sk_locked) { 2135 unix_state_unlock(other); 2136 unix_state_double_lock(sk, other); 2137 } 2138 2139 if (unix_peer(sk) != other || 2140 unix_dgram_peer_wake_me(sk, other)) { 2141 err = -EAGAIN; 2142 sk_locked = 1; 2143 goto out_unlock; 2144 } 2145 2146 if (!sk_locked) { 2147 sk_locked = 1; 2148 goto restart_locked; 2149 } 2150 } 2151 2152 if (unlikely(sk_locked)) 2153 unix_state_unlock(sk); 2154 2155 if (sock_flag(other, SOCK_RCVTSTAMP)) 2156 __net_timestamp(skb); 2157 maybe_add_creds(skb, sock, other); 2158 scm_stat_add(other, skb); 2159 skb_queue_tail(&other->sk_receive_queue, skb); 2160 unix_state_unlock(other); 2161 other->sk_data_ready(other); 2162 sock_put(other); 2163 scm_destroy(&scm); 2164 return len; 2165 2166 out_unlock: 2167 if (sk_locked) 2168 unix_state_unlock(sk); 2169 unix_state_unlock(other); 2170 out_sock_put: 2171 sock_put(other); 2172 out_free: 2173 kfree_skb(skb); 2174 out: 2175 scm_destroy(&scm); 2176 return err; 2177 } 2178 2179 /* We use paged skbs for stream sockets, and limit occupancy to 32768 2180 * bytes, and a minimum of a full page. 
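 * For instance, with 4 KiB pages get_order(32768) is 3, so the limit below
 * is 8 pages = 32768 bytes; with 64 KiB pages get_order(32768) is 0 and the
 * limit rounds up to one full 64 KiB page.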
2181 */ 2182 #define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768)) 2183 2184 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 2185 static int queue_oob(struct socket *sock, struct msghdr *msg, struct sock *other, 2186 struct scm_cookie *scm, bool fds_sent) 2187 { 2188 struct unix_sock *ousk = unix_sk(other); 2189 struct sk_buff *skb; 2190 int err = 0; 2191 2192 skb = sock_alloc_send_skb(sock->sk, 1, msg->msg_flags & MSG_DONTWAIT, &err); 2193 2194 if (!skb) 2195 return err; 2196 2197 err = unix_scm_to_skb(scm, skb, !fds_sent); 2198 if (err < 0) { 2199 kfree_skb(skb); 2200 return err; 2201 } 2202 skb_put(skb, 1); 2203 err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, 1); 2204 2205 if (err) { 2206 kfree_skb(skb); 2207 return err; 2208 } 2209 2210 unix_state_lock(other); 2211 2212 if (sock_flag(other, SOCK_DEAD) || 2213 (other->sk_shutdown & RCV_SHUTDOWN)) { 2214 unix_state_unlock(other); 2215 kfree_skb(skb); 2216 return -EPIPE; 2217 } 2218 2219 maybe_add_creds(skb, sock, other); 2220 scm_stat_add(other, skb); 2221 2222 spin_lock(&other->sk_receive_queue.lock); 2223 WRITE_ONCE(ousk->oob_skb, skb); 2224 __skb_queue_tail(&other->sk_receive_queue, skb); 2225 spin_unlock(&other->sk_receive_queue.lock); 2226 2227 sk_send_sigurg(other); 2228 unix_state_unlock(other); 2229 other->sk_data_ready(other); 2230 2231 return err; 2232 } 2233 #endif 2234 2235 static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, 2236 size_t len) 2237 { 2238 struct sock *sk = sock->sk; 2239 struct sock *other = NULL; 2240 int err, size; 2241 struct sk_buff *skb; 2242 int sent = 0; 2243 struct scm_cookie scm; 2244 bool fds_sent = false; 2245 int data_len; 2246 2247 err = scm_send(sock, msg, &scm, false); 2248 if (err < 0) 2249 return err; 2250 2251 wait_for_unix_gc(scm.fp); 2252 2253 if (msg->msg_flags & MSG_OOB) { 2254 err = -EOPNOTSUPP; 2255 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 2256 if (len) 2257 len--; 2258 else 2259 #endif 2260 goto out_err; 2261 } 2262 2263 if (msg->msg_namelen) { 2264 err = READ_ONCE(sk->sk_state) == TCP_ESTABLISHED ? 
-EISCONN : -EOPNOTSUPP; 2265 goto out_err; 2266 } else { 2267 other = unix_peer(sk); 2268 if (!other) { 2269 err = -ENOTCONN; 2270 goto out_err; 2271 } 2272 } 2273 2274 if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN) { 2275 if (!(msg->msg_flags & MSG_NOSIGNAL)) 2276 send_sig(SIGPIPE, current, 0); 2277 2278 err = -EPIPE; 2279 goto out_err; 2280 } 2281 2282 while (sent < len) { 2283 size = len - sent; 2284 2285 if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) { 2286 skb = sock_alloc_send_pskb(sk, 0, 0, 2287 msg->msg_flags & MSG_DONTWAIT, 2288 &err, 0); 2289 } else { 2290 /* Keep two messages in the pipe so it schedules better */ 2291 size = min_t(int, size, (READ_ONCE(sk->sk_sndbuf) >> 1) - 64); 2292 2293 /* allow fallback to order-0 allocations */ 2294 size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ); 2295 2296 data_len = max_t(int, 0, size - SKB_MAX_HEAD(0)); 2297 2298 data_len = min_t(size_t, size, PAGE_ALIGN(data_len)); 2299 2300 skb = sock_alloc_send_pskb(sk, size - data_len, data_len, 2301 msg->msg_flags & MSG_DONTWAIT, &err, 2302 get_order(UNIX_SKB_FRAGS_SZ)); 2303 } 2304 if (!skb) 2305 goto out_err; 2306 2307 /* Only send the fds in the first buffer */ 2308 err = unix_scm_to_skb(&scm, skb, !fds_sent); 2309 if (err < 0) 2310 goto out_free; 2311 2312 fds_sent = true; 2313 2314 if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) { 2315 skb->ip_summed = CHECKSUM_UNNECESSARY; 2316 err = skb_splice_from_iter(skb, &msg->msg_iter, size, 2317 sk->sk_allocation); 2318 if (err < 0) 2319 goto out_free; 2320 2321 size = err; 2322 refcount_add(size, &sk->sk_wmem_alloc); 2323 } else { 2324 skb_put(skb, size - data_len); 2325 skb->data_len = data_len; 2326 skb->len = size; 2327 err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size); 2328 if (err) 2329 goto out_free; 2330 } 2331 2332 unix_state_lock(other); 2333 2334 if (sock_flag(other, SOCK_DEAD) || 2335 (other->sk_shutdown & RCV_SHUTDOWN)) 2336 goto out_pipe; 2337 2338 maybe_add_creds(skb, sock, other); 2339 scm_stat_add(other, skb); 2340 skb_queue_tail(&other->sk_receive_queue, skb); 2341 unix_state_unlock(other); 2342 other->sk_data_ready(other); 2343 sent += size; 2344 } 2345 2346 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 2347 if (msg->msg_flags & MSG_OOB) { 2348 err = queue_oob(sock, msg, other, &scm, fds_sent); 2349 if (err) 2350 goto out_err; 2351 sent++; 2352 } 2353 #endif 2354 2355 scm_destroy(&scm); 2356 2357 return sent; 2358 2359 out_pipe: 2360 unix_state_unlock(other); 2361 if (!sent && !(msg->msg_flags & MSG_NOSIGNAL)) 2362 send_sig(SIGPIPE, current, 0); 2363 err = -EPIPE; 2364 out_free: 2365 kfree_skb(skb); 2366 out_err: 2367 scm_destroy(&scm); 2368 return sent ? 
: err; 2369 } 2370 2371 static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg, 2372 size_t len) 2373 { 2374 int err; 2375 struct sock *sk = sock->sk; 2376 2377 err = sock_error(sk); 2378 if (err) 2379 return err; 2380 2381 if (READ_ONCE(sk->sk_state) != TCP_ESTABLISHED) 2382 return -ENOTCONN; 2383 2384 if (msg->msg_namelen) 2385 msg->msg_namelen = 0; 2386 2387 return unix_dgram_sendmsg(sock, msg, len); 2388 } 2389 2390 static int unix_seqpacket_recvmsg(struct socket *sock, struct msghdr *msg, 2391 size_t size, int flags) 2392 { 2393 struct sock *sk = sock->sk; 2394 2395 if (READ_ONCE(sk->sk_state) != TCP_ESTABLISHED) 2396 return -ENOTCONN; 2397 2398 return unix_dgram_recvmsg(sock, msg, size, flags); 2399 } 2400 2401 static void unix_copy_addr(struct msghdr *msg, struct sock *sk) 2402 { 2403 struct unix_address *addr = smp_load_acquire(&unix_sk(sk)->addr); 2404 2405 if (addr) { 2406 msg->msg_namelen = addr->len; 2407 memcpy(msg->msg_name, addr->name, addr->len); 2408 } 2409 } 2410 2411 int __unix_dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t size, 2412 int flags) 2413 { 2414 struct scm_cookie scm; 2415 struct socket *sock = sk->sk_socket; 2416 struct unix_sock *u = unix_sk(sk); 2417 struct sk_buff *skb, *last; 2418 long timeo; 2419 int skip; 2420 int err; 2421 2422 err = -EOPNOTSUPP; 2423 if (flags&MSG_OOB) 2424 goto out; 2425 2426 timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); 2427 2428 do { 2429 mutex_lock(&u->iolock); 2430 2431 skip = sk_peek_offset(sk, flags); 2432 skb = __skb_try_recv_datagram(sk, &sk->sk_receive_queue, flags, 2433 &skip, &err, &last); 2434 if (skb) { 2435 if (!(flags & MSG_PEEK)) 2436 scm_stat_del(sk, skb); 2437 break; 2438 } 2439 2440 mutex_unlock(&u->iolock); 2441 2442 if (err != -EAGAIN) 2443 break; 2444 } while (timeo && 2445 !__skb_wait_for_more_packets(sk, &sk->sk_receive_queue, 2446 &err, &timeo, last)); 2447 2448 if (!skb) { /* implies iolock unlocked */ 2449 unix_state_lock(sk); 2450 /* Signal EOF on disconnected non-blocking SEQPACKET socket. */ 2451 if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN && 2452 (sk->sk_shutdown & RCV_SHUTDOWN)) 2453 err = 0; 2454 unix_state_unlock(sk); 2455 goto out; 2456 } 2457 2458 if (wq_has_sleeper(&u->peer_wait)) 2459 wake_up_interruptible_sync_poll(&u->peer_wait, 2460 EPOLLOUT | EPOLLWRNORM | 2461 EPOLLWRBAND); 2462 2463 if (msg->msg_name) { 2464 unix_copy_addr(msg, skb->sk); 2465 2466 BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk, 2467 msg->msg_name, 2468 &msg->msg_namelen); 2469 } 2470 2471 if (size > skb->len - skip) 2472 size = skb->len - skip; 2473 else if (size < skb->len - skip) 2474 msg->msg_flags |= MSG_TRUNC; 2475 2476 err = skb_copy_datagram_msg(skb, skip, msg, size); 2477 if (err) 2478 goto out_free; 2479 2480 if (sock_flag(sk, SOCK_RCVTSTAMP)) 2481 __sock_recv_timestamp(msg, sk, skb); 2482 2483 memset(&scm, 0, sizeof(scm)); 2484 2485 scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid); 2486 unix_set_secdata(&scm, skb); 2487 2488 if (!(flags & MSG_PEEK)) { 2489 if (UNIXCB(skb).fp) 2490 unix_detach_fds(&scm, skb); 2491 2492 sk_peek_offset_bwd(sk, skb->len); 2493 } else { 2494 /* It is questionable: on PEEK we could: 2495 - do not return fds - good, but too simple 8) 2496 - return fds, and do not return them on read (old strategy, 2497 apparently wrong) 2498 - clone fds (I chose it for now, it is the most universal 2499 solution) 2500 2501 POSIX 1003.1g does not actually define this clearly 2502 at all. 
POSIX 1003.1g doesn't define a lot of things 2503 clearly however! 2504 2505 */ 2506 2507 sk_peek_offset_fwd(sk, size); 2508 2509 if (UNIXCB(skb).fp) 2510 unix_peek_fds(&scm, skb); 2511 } 2512 err = (flags & MSG_TRUNC) ? skb->len - skip : size; 2513 2514 scm_recv_unix(sock, msg, &scm, flags); 2515 2516 out_free: 2517 skb_free_datagram(sk, skb); 2518 mutex_unlock(&u->iolock); 2519 out: 2520 return err; 2521 } 2522 2523 static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, 2524 int flags) 2525 { 2526 struct sock *sk = sock->sk; 2527 2528 #ifdef CONFIG_BPF_SYSCALL 2529 const struct proto *prot = READ_ONCE(sk->sk_prot); 2530 2531 if (prot != &unix_dgram_proto) 2532 return prot->recvmsg(sk, msg, size, flags, NULL); 2533 #endif 2534 return __unix_dgram_recvmsg(sk, msg, size, flags); 2535 } 2536 2537 static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor) 2538 { 2539 struct unix_sock *u = unix_sk(sk); 2540 struct sk_buff *skb; 2541 int err; 2542 2543 mutex_lock(&u->iolock); 2544 skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err); 2545 mutex_unlock(&u->iolock); 2546 if (!skb) 2547 return err; 2548 2549 return recv_actor(sk, skb); 2550 } 2551 2552 /* 2553 * Sleep until more data has arrived. But check for races.. 2554 */ 2555 static long unix_stream_data_wait(struct sock *sk, long timeo, 2556 struct sk_buff *last, unsigned int last_len, 2557 bool freezable) 2558 { 2559 unsigned int state = TASK_INTERRUPTIBLE | freezable * TASK_FREEZABLE; 2560 struct sk_buff *tail; 2561 DEFINE_WAIT(wait); 2562 2563 unix_state_lock(sk); 2564 2565 for (;;) { 2566 prepare_to_wait(sk_sleep(sk), &wait, state); 2567 2568 tail = skb_peek_tail(&sk->sk_receive_queue); 2569 if (tail != last || 2570 (tail && tail->len != last_len) || 2571 sk->sk_err || 2572 (sk->sk_shutdown & RCV_SHUTDOWN) || 2573 signal_pending(current) || 2574 !timeo) 2575 break; 2576 2577 sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); 2578 unix_state_unlock(sk); 2579 timeo = schedule_timeout(timeo); 2580 unix_state_lock(sk); 2581 2582 if (sock_flag(sk, SOCK_DEAD)) 2583 break; 2584 2585 sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); 2586 } 2587 2588 finish_wait(sk_sleep(sk), &wait); 2589 unix_state_unlock(sk); 2590 return timeo; 2591 } 2592 2593 static unsigned int unix_skb_len(const struct sk_buff *skb) 2594 { 2595 return skb->len - UNIXCB(skb).consumed; 2596 } 2597 2598 struct unix_stream_read_state { 2599 int (*recv_actor)(struct sk_buff *, int, int, 2600 struct unix_stream_read_state *); 2601 struct socket *socket; 2602 struct msghdr *msg; 2603 struct pipe_inode_info *pipe; 2604 size_t size; 2605 int flags; 2606 unsigned int splice_flags; 2607 }; 2608 2609 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 2610 static int unix_stream_recv_urg(struct unix_stream_read_state *state) 2611 { 2612 struct socket *sock = state->socket; 2613 struct sock *sk = sock->sk; 2614 struct unix_sock *u = unix_sk(sk); 2615 int chunk = 1; 2616 struct sk_buff *oob_skb; 2617 2618 mutex_lock(&u->iolock); 2619 unix_state_lock(sk); 2620 spin_lock(&sk->sk_receive_queue.lock); 2621 2622 if (sock_flag(sk, SOCK_URGINLINE) || !u->oob_skb) { 2623 spin_unlock(&sk->sk_receive_queue.lock); 2624 unix_state_unlock(sk); 2625 mutex_unlock(&u->iolock); 2626 return -EINVAL; 2627 } 2628 2629 oob_skb = u->oob_skb; 2630 2631 if (!(state->flags & MSG_PEEK)) 2632 WRITE_ONCE(u->oob_skb, NULL); 2633 2634 spin_unlock(&sk->sk_receive_queue.lock); 2635 unix_state_unlock(sk); 2636 2637 chunk = state->recv_actor(oob_skb, 0, chunk, state); 2638 2639 if (!(state->flags & MSG_PEEK)) 2640 
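		/* A non-PEEK read consumes the one-byte OOB record, so it
		 * cannot be returned again by a later MSG_OOB read or by
		 * the normal stream read path.
		 */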
UNIXCB(oob_skb).consumed += 1; 2641 2642 mutex_unlock(&u->iolock); 2643 2644 if (chunk < 0) 2645 return -EFAULT; 2646 2647 state->msg->msg_flags |= MSG_OOB; 2648 return 1; 2649 } 2650 2651 static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk, 2652 int flags, int copied) 2653 { 2654 struct sk_buff *read_skb = NULL, *unread_skb = NULL; 2655 struct unix_sock *u = unix_sk(sk); 2656 2657 if (likely(unix_skb_len(skb) && skb != READ_ONCE(u->oob_skb))) 2658 return skb; 2659 2660 spin_lock(&sk->sk_receive_queue.lock); 2661 2662 if (!unix_skb_len(skb)) { 2663 if (copied && (!u->oob_skb || skb == u->oob_skb)) { 2664 skb = NULL; 2665 } else if (flags & MSG_PEEK) { 2666 skb = skb_peek_next(skb, &sk->sk_receive_queue); 2667 } else { 2668 read_skb = skb; 2669 skb = skb_peek_next(skb, &sk->sk_receive_queue); 2670 __skb_unlink(read_skb, &sk->sk_receive_queue); 2671 } 2672 2673 if (!skb) 2674 goto unlock; 2675 } 2676 2677 if (skb != u->oob_skb) 2678 goto unlock; 2679 2680 if (copied) { 2681 skb = NULL; 2682 } else if (!(flags & MSG_PEEK)) { 2683 WRITE_ONCE(u->oob_skb, NULL); 2684 2685 if (!sock_flag(sk, SOCK_URGINLINE)) { 2686 __skb_unlink(skb, &sk->sk_receive_queue); 2687 unread_skb = skb; 2688 skb = skb_peek(&sk->sk_receive_queue); 2689 } 2690 } else if (!sock_flag(sk, SOCK_URGINLINE)) { 2691 skb = skb_peek_next(skb, &sk->sk_receive_queue); 2692 } 2693 2694 unlock: 2695 spin_unlock(&sk->sk_receive_queue.lock); 2696 2697 consume_skb(read_skb); 2698 kfree_skb(unread_skb); 2699 2700 return skb; 2701 } 2702 #endif 2703 2704 static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor) 2705 { 2706 struct unix_sock *u = unix_sk(sk); 2707 struct sk_buff *skb; 2708 int err; 2709 2710 if (unlikely(READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)) 2711 return -ENOTCONN; 2712 2713 mutex_lock(&u->iolock); 2714 skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err); 2715 mutex_unlock(&u->iolock); 2716 if (!skb) 2717 return err; 2718 2719 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 2720 if (unlikely(skb == READ_ONCE(u->oob_skb))) { 2721 bool drop = false; 2722 2723 unix_state_lock(sk); 2724 2725 if (sock_flag(sk, SOCK_DEAD)) { 2726 unix_state_unlock(sk); 2727 kfree_skb(skb); 2728 return -ECONNRESET; 2729 } 2730 2731 spin_lock(&sk->sk_receive_queue.lock); 2732 if (likely(skb == u->oob_skb)) { 2733 WRITE_ONCE(u->oob_skb, NULL); 2734 drop = true; 2735 } 2736 spin_unlock(&sk->sk_receive_queue.lock); 2737 2738 unix_state_unlock(sk); 2739 2740 if (drop) { 2741 kfree_skb(skb); 2742 return -EAGAIN; 2743 } 2744 } 2745 #endif 2746 2747 return recv_actor(sk, skb); 2748 } 2749 2750 static int unix_stream_read_generic(struct unix_stream_read_state *state, 2751 bool freezable) 2752 { 2753 struct scm_cookie scm; 2754 struct socket *sock = state->socket; 2755 struct sock *sk = sock->sk; 2756 struct unix_sock *u = unix_sk(sk); 2757 int copied = 0; 2758 int flags = state->flags; 2759 int noblock = flags & MSG_DONTWAIT; 2760 bool check_creds = false; 2761 int target; 2762 int err = 0; 2763 long timeo; 2764 int skip; 2765 size_t size = state->size; 2766 unsigned int last_len; 2767 2768 if (unlikely(READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)) { 2769 err = -EINVAL; 2770 goto out; 2771 } 2772 2773 if (unlikely(flags & MSG_OOB)) { 2774 err = -EOPNOTSUPP; 2775 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 2776 err = unix_stream_recv_urg(state); 2777 #endif 2778 goto out; 2779 } 2780 2781 target = sock_rcvlowat(sk, flags & MSG_WAITALL, size); 2782 timeo = sock_rcvtimeo(sk, noblock); 2783 2784 memset(&scm, 0, sizeof(scm)); 2785 2786 /* Lock 
the socket to prevent queue disordering 2787 * while sleeps in memcpy_tomsg 2788 */ 2789 mutex_lock(&u->iolock); 2790 2791 skip = max(sk_peek_offset(sk, flags), 0); 2792 2793 do { 2794 struct sk_buff *skb, *last; 2795 int chunk; 2796 2797 redo: 2798 unix_state_lock(sk); 2799 if (sock_flag(sk, SOCK_DEAD)) { 2800 err = -ECONNRESET; 2801 goto unlock; 2802 } 2803 last = skb = skb_peek(&sk->sk_receive_queue); 2804 last_len = last ? last->len : 0; 2805 2806 again: 2807 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 2808 if (skb) { 2809 skb = manage_oob(skb, sk, flags, copied); 2810 if (!skb && copied) { 2811 unix_state_unlock(sk); 2812 break; 2813 } 2814 } 2815 #endif 2816 if (skb == NULL) { 2817 if (copied >= target) 2818 goto unlock; 2819 2820 /* 2821 * POSIX 1003.1g mandates this order. 2822 */ 2823 2824 err = sock_error(sk); 2825 if (err) 2826 goto unlock; 2827 if (sk->sk_shutdown & RCV_SHUTDOWN) 2828 goto unlock; 2829 2830 unix_state_unlock(sk); 2831 if (!timeo) { 2832 err = -EAGAIN; 2833 break; 2834 } 2835 2836 mutex_unlock(&u->iolock); 2837 2838 timeo = unix_stream_data_wait(sk, timeo, last, 2839 last_len, freezable); 2840 2841 if (signal_pending(current)) { 2842 err = sock_intr_errno(timeo); 2843 scm_destroy(&scm); 2844 goto out; 2845 } 2846 2847 mutex_lock(&u->iolock); 2848 goto redo; 2849 unlock: 2850 unix_state_unlock(sk); 2851 break; 2852 } 2853 2854 while (skip >= unix_skb_len(skb)) { 2855 skip -= unix_skb_len(skb); 2856 last = skb; 2857 last_len = skb->len; 2858 skb = skb_peek_next(skb, &sk->sk_receive_queue); 2859 if (!skb) 2860 goto again; 2861 } 2862 2863 unix_state_unlock(sk); 2864 2865 if (check_creds) { 2866 /* Never glue messages from different writers */ 2867 if (!unix_skb_scm_eq(skb, &scm)) 2868 break; 2869 } else if (test_bit(SOCK_PASSCRED, &sock->flags) || 2870 test_bit(SOCK_PASSPIDFD, &sock->flags)) { 2871 /* Copy credentials */ 2872 scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid); 2873 unix_set_secdata(&scm, skb); 2874 check_creds = true; 2875 } 2876 2877 /* Copy address just once */ 2878 if (state->msg && state->msg->msg_name) { 2879 DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, 2880 state->msg->msg_name); 2881 unix_copy_addr(state->msg, skb->sk); 2882 2883 BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk, 2884 state->msg->msg_name, 2885 &state->msg->msg_namelen); 2886 2887 sunaddr = NULL; 2888 } 2889 2890 chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size); 2891 chunk = state->recv_actor(skb, skip, chunk, state); 2892 if (chunk < 0) { 2893 if (copied == 0) 2894 copied = -EFAULT; 2895 break; 2896 } 2897 copied += chunk; 2898 size -= chunk; 2899 2900 /* Mark read part of skb as used */ 2901 if (!(flags & MSG_PEEK)) { 2902 UNIXCB(skb).consumed += chunk; 2903 2904 sk_peek_offset_bwd(sk, chunk); 2905 2906 if (UNIXCB(skb).fp) { 2907 scm_stat_del(sk, skb); 2908 unix_detach_fds(&scm, skb); 2909 } 2910 2911 if (unix_skb_len(skb)) 2912 break; 2913 2914 skb_unlink(skb, &sk->sk_receive_queue); 2915 consume_skb(skb); 2916 2917 if (scm.fp) 2918 break; 2919 } else { 2920 /* It is questionable, see note in unix_dgram_recvmsg. 
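			 * On MSG_PEEK the passed files are duplicated into the
			 * receiver's scm (unix_peek_fds()) while staying attached
			 * to the skb, so a later non-PEEK read still detaches and
			 * installs them as usual.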
2921 */ 2922 if (UNIXCB(skb).fp) 2923 unix_peek_fds(&scm, skb); 2924 2925 sk_peek_offset_fwd(sk, chunk); 2926 2927 if (UNIXCB(skb).fp) 2928 break; 2929 2930 skip = 0; 2931 last = skb; 2932 last_len = skb->len; 2933 unix_state_lock(sk); 2934 skb = skb_peek_next(skb, &sk->sk_receive_queue); 2935 if (skb) 2936 goto again; 2937 unix_state_unlock(sk); 2938 break; 2939 } 2940 } while (size); 2941 2942 mutex_unlock(&u->iolock); 2943 if (state->msg) 2944 scm_recv_unix(sock, state->msg, &scm, flags); 2945 else 2946 scm_destroy(&scm); 2947 out: 2948 return copied ? : err; 2949 } 2950 2951 static int unix_stream_read_actor(struct sk_buff *skb, 2952 int skip, int chunk, 2953 struct unix_stream_read_state *state) 2954 { 2955 int ret; 2956 2957 ret = skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip, 2958 state->msg, chunk); 2959 return ret ?: chunk; 2960 } 2961 2962 int __unix_stream_recvmsg(struct sock *sk, struct msghdr *msg, 2963 size_t size, int flags) 2964 { 2965 struct unix_stream_read_state state = { 2966 .recv_actor = unix_stream_read_actor, 2967 .socket = sk->sk_socket, 2968 .msg = msg, 2969 .size = size, 2970 .flags = flags 2971 }; 2972 2973 return unix_stream_read_generic(&state, true); 2974 } 2975 2976 static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg, 2977 size_t size, int flags) 2978 { 2979 struct unix_stream_read_state state = { 2980 .recv_actor = unix_stream_read_actor, 2981 .socket = sock, 2982 .msg = msg, 2983 .size = size, 2984 .flags = flags 2985 }; 2986 2987 #ifdef CONFIG_BPF_SYSCALL 2988 struct sock *sk = sock->sk; 2989 const struct proto *prot = READ_ONCE(sk->sk_prot); 2990 2991 if (prot != &unix_stream_proto) 2992 return prot->recvmsg(sk, msg, size, flags, NULL); 2993 #endif 2994 return unix_stream_read_generic(&state, true); 2995 } 2996 2997 static int unix_stream_splice_actor(struct sk_buff *skb, 2998 int skip, int chunk, 2999 struct unix_stream_read_state *state) 3000 { 3001 return skb_splice_bits(skb, state->socket->sk, 3002 UNIXCB(skb).consumed + skip, 3003 state->pipe, chunk, state->splice_flags); 3004 } 3005 3006 static ssize_t unix_stream_splice_read(struct socket *sock, loff_t *ppos, 3007 struct pipe_inode_info *pipe, 3008 size_t size, unsigned int flags) 3009 { 3010 struct unix_stream_read_state state = { 3011 .recv_actor = unix_stream_splice_actor, 3012 .socket = sock, 3013 .pipe = pipe, 3014 .size = size, 3015 .splice_flags = flags, 3016 }; 3017 3018 if (unlikely(*ppos)) 3019 return -ESPIPE; 3020 3021 if (sock->file->f_flags & O_NONBLOCK || 3022 flags & SPLICE_F_NONBLOCK) 3023 state.flags = MSG_DONTWAIT; 3024 3025 return unix_stream_read_generic(&state, false); 3026 } 3027 3028 static int unix_shutdown(struct socket *sock, int mode) 3029 { 3030 struct sock *sk = sock->sk; 3031 struct sock *other; 3032 3033 if (mode < SHUT_RD || mode > SHUT_RDWR) 3034 return -EINVAL; 3035 /* This maps: 3036 * SHUT_RD (0) -> RCV_SHUTDOWN (1) 3037 * SHUT_WR (1) -> SEND_SHUTDOWN (2) 3038 * SHUT_RDWR (2) -> SHUTDOWN_MASK (3) 3039 */ 3040 ++mode; 3041 3042 unix_state_lock(sk); 3043 WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | mode); 3044 other = unix_peer(sk); 3045 if (other) 3046 sock_hold(other); 3047 unix_state_unlock(sk); 3048 sk->sk_state_change(sk); 3049 3050 if (other && 3051 (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) { 3052 3053 int peer_mode = 0; 3054 const struct proto *prot = READ_ONCE(other->sk_prot); 3055 3056 if (prot->unhash) 3057 prot->unhash(other); 3058 if (mode&RCV_SHUTDOWN) 3059 peer_mode |= SEND_SHUTDOWN; 3060 if 
(mode&SEND_SHUTDOWN) 3061 peer_mode |= RCV_SHUTDOWN; 3062 unix_state_lock(other); 3063 WRITE_ONCE(other->sk_shutdown, other->sk_shutdown | peer_mode); 3064 unix_state_unlock(other); 3065 other->sk_state_change(other); 3066 if (peer_mode == SHUTDOWN_MASK) 3067 sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP); 3068 else if (peer_mode & RCV_SHUTDOWN) 3069 sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN); 3070 } 3071 if (other) 3072 sock_put(other); 3073 3074 return 0; 3075 } 3076 3077 long unix_inq_len(struct sock *sk) 3078 { 3079 struct sk_buff *skb; 3080 long amount = 0; 3081 3082 if (READ_ONCE(sk->sk_state) == TCP_LISTEN) 3083 return -EINVAL; 3084 3085 spin_lock(&sk->sk_receive_queue.lock); 3086 if (sk->sk_type == SOCK_STREAM || 3087 sk->sk_type == SOCK_SEQPACKET) { 3088 skb_queue_walk(&sk->sk_receive_queue, skb) 3089 amount += unix_skb_len(skb); 3090 } else { 3091 skb = skb_peek(&sk->sk_receive_queue); 3092 if (skb) 3093 amount = skb->len; 3094 } 3095 spin_unlock(&sk->sk_receive_queue.lock); 3096 3097 return amount; 3098 } 3099 EXPORT_SYMBOL_GPL(unix_inq_len); 3100 3101 long unix_outq_len(struct sock *sk) 3102 { 3103 return sk_wmem_alloc_get(sk); 3104 } 3105 EXPORT_SYMBOL_GPL(unix_outq_len); 3106 3107 static int unix_open_file(struct sock *sk) 3108 { 3109 struct path path; 3110 struct file *f; 3111 int fd; 3112 3113 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) 3114 return -EPERM; 3115 3116 if (!smp_load_acquire(&unix_sk(sk)->addr)) 3117 return -ENOENT; 3118 3119 path = unix_sk(sk)->path; 3120 if (!path.dentry) 3121 return -ENOENT; 3122 3123 path_get(&path); 3124 3125 fd = get_unused_fd_flags(O_CLOEXEC); 3126 if (fd < 0) 3127 goto out; 3128 3129 f = dentry_open(&path, O_PATH, current_cred()); 3130 if (IS_ERR(f)) { 3131 put_unused_fd(fd); 3132 fd = PTR_ERR(f); 3133 goto out; 3134 } 3135 3136 fd_install(fd, f); 3137 out: 3138 path_put(&path); 3139 3140 return fd; 3141 } 3142 3143 static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) 3144 { 3145 struct sock *sk = sock->sk; 3146 long amount = 0; 3147 int err; 3148 3149 switch (cmd) { 3150 case SIOCOUTQ: 3151 amount = unix_outq_len(sk); 3152 err = put_user(amount, (int __user *)arg); 3153 break; 3154 case SIOCINQ: 3155 amount = unix_inq_len(sk); 3156 if (amount < 0) 3157 err = amount; 3158 else 3159 err = put_user(amount, (int __user *)arg); 3160 break; 3161 case SIOCUNIXFILE: 3162 err = unix_open_file(sk); 3163 break; 3164 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 3165 case SIOCATMARK: 3166 { 3167 struct unix_sock *u = unix_sk(sk); 3168 struct sk_buff *skb; 3169 int answ = 0; 3170 3171 mutex_lock(&u->iolock); 3172 3173 skb = skb_peek(&sk->sk_receive_queue); 3174 if (skb) { 3175 struct sk_buff *oob_skb = READ_ONCE(u->oob_skb); 3176 struct sk_buff *next_skb; 3177 3178 next_skb = skb_peek_next(skb, &sk->sk_receive_queue); 3179 3180 if (skb == oob_skb || 3181 (!unix_skb_len(skb) && 3182 (!oob_skb || next_skb == oob_skb))) 3183 answ = 1; 3184 } 3185 3186 mutex_unlock(&u->iolock); 3187 3188 err = put_user(answ, (int __user *)arg); 3189 } 3190 break; 3191 #endif 3192 default: 3193 err = -ENOIOCTLCMD; 3194 break; 3195 } 3196 return err; 3197 } 3198 3199 #ifdef CONFIG_COMPAT 3200 static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) 3201 { 3202 return unix_ioctl(sock, cmd, (unsigned long)compat_ptr(arg)); 3203 } 3204 #endif 3205 3206 static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wait) 3207 { 3208 struct sock *sk = sock->sk; 3209 unsigned char state; 3210 __poll_t 
mask; 3211 u8 shutdown; 3212 3213 sock_poll_wait(file, sock, wait); 3214 mask = 0; 3215 shutdown = READ_ONCE(sk->sk_shutdown); 3216 state = READ_ONCE(sk->sk_state); 3217 3218 /* exceptional events? */ 3219 if (READ_ONCE(sk->sk_err)) 3220 mask |= EPOLLERR; 3221 if (shutdown == SHUTDOWN_MASK) 3222 mask |= EPOLLHUP; 3223 if (shutdown & RCV_SHUTDOWN) 3224 mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM; 3225 3226 /* readable? */ 3227 if (!skb_queue_empty_lockless(&sk->sk_receive_queue)) 3228 mask |= EPOLLIN | EPOLLRDNORM; 3229 if (sk_is_readable(sk)) 3230 mask |= EPOLLIN | EPOLLRDNORM; 3231 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 3232 if (READ_ONCE(unix_sk(sk)->oob_skb)) 3233 mask |= EPOLLPRI; 3234 #endif 3235 3236 /* Connection-based need to check for termination and startup */ 3237 if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) && 3238 state == TCP_CLOSE) 3239 mask |= EPOLLHUP; 3240 3241 /* 3242 * we set writable also when the other side has shut down the 3243 * connection. This prevents stuck sockets. 3244 */ 3245 if (unix_writable(sk, state)) 3246 mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND; 3247 3248 return mask; 3249 } 3250 3251 static __poll_t unix_dgram_poll(struct file *file, struct socket *sock, 3252 poll_table *wait) 3253 { 3254 struct sock *sk = sock->sk, *other; 3255 unsigned int writable; 3256 unsigned char state; 3257 __poll_t mask; 3258 u8 shutdown; 3259 3260 sock_poll_wait(file, sock, wait); 3261 mask = 0; 3262 shutdown = READ_ONCE(sk->sk_shutdown); 3263 state = READ_ONCE(sk->sk_state); 3264 3265 /* exceptional events? */ 3266 if (READ_ONCE(sk->sk_err) || 3267 !skb_queue_empty_lockless(&sk->sk_error_queue)) 3268 mask |= EPOLLERR | 3269 (sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0); 3270 3271 if (shutdown & RCV_SHUTDOWN) 3272 mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM; 3273 if (shutdown == SHUTDOWN_MASK) 3274 mask |= EPOLLHUP; 3275 3276 /* readable? */ 3277 if (!skb_queue_empty_lockless(&sk->sk_receive_queue)) 3278 mask |= EPOLLIN | EPOLLRDNORM; 3279 if (sk_is_readable(sk)) 3280 mask |= EPOLLIN | EPOLLRDNORM; 3281 3282 /* Connection-based need to check for termination and startup */ 3283 if (sk->sk_type == SOCK_SEQPACKET && state == TCP_CLOSE) 3284 mask |= EPOLLHUP; 3285 3286 /* No write status requested, avoid expensive OUT tests. 
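	 * poll_requested_events() reflects what the caller actually asked
	 * for, so a reader polling only for EPOLLIN skips the peer
	 * receive-queue probe and the peer-wake registration done below.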
*/ 3287 if (!(poll_requested_events(wait) & (EPOLLWRBAND|EPOLLWRNORM|EPOLLOUT))) 3288 return mask; 3289 3290 writable = unix_writable(sk, state); 3291 if (writable) { 3292 unix_state_lock(sk); 3293 3294 other = unix_peer(sk); 3295 if (other && unix_peer(other) != sk && 3296 unix_recvq_full_lockless(other) && 3297 unix_dgram_peer_wake_me(sk, other)) 3298 writable = 0; 3299 3300 unix_state_unlock(sk); 3301 } 3302 3303 if (writable) 3304 mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND; 3305 else 3306 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); 3307 3308 return mask; 3309 } 3310 3311 #ifdef CONFIG_PROC_FS 3312 3313 #define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1) 3314 3315 #define get_bucket(x) ((x) >> BUCKET_SPACE) 3316 #define get_offset(x) ((x) & ((1UL << BUCKET_SPACE) - 1)) 3317 #define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o)) 3318 3319 static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos) 3320 { 3321 unsigned long offset = get_offset(*pos); 3322 unsigned long bucket = get_bucket(*pos); 3323 unsigned long count = 0; 3324 struct sock *sk; 3325 3326 for (sk = sk_head(&seq_file_net(seq)->unx.table.buckets[bucket]); 3327 sk; sk = sk_next(sk)) { 3328 if (++count == offset) 3329 break; 3330 } 3331 3332 return sk; 3333 } 3334 3335 static struct sock *unix_get_first(struct seq_file *seq, loff_t *pos) 3336 { 3337 unsigned long bucket = get_bucket(*pos); 3338 struct net *net = seq_file_net(seq); 3339 struct sock *sk; 3340 3341 while (bucket < UNIX_HASH_SIZE) { 3342 spin_lock(&net->unx.table.locks[bucket]); 3343 3344 sk = unix_from_bucket(seq, pos); 3345 if (sk) 3346 return sk; 3347 3348 spin_unlock(&net->unx.table.locks[bucket]); 3349 3350 *pos = set_bucket_offset(++bucket, 1); 3351 } 3352 3353 return NULL; 3354 } 3355 3356 static struct sock *unix_get_next(struct seq_file *seq, struct sock *sk, 3357 loff_t *pos) 3358 { 3359 unsigned long bucket = get_bucket(*pos); 3360 3361 sk = sk_next(sk); 3362 if (sk) 3363 return sk; 3364 3365 3366 spin_unlock(&seq_file_net(seq)->unx.table.locks[bucket]); 3367 3368 *pos = set_bucket_offset(++bucket, 1); 3369 3370 return unix_get_first(seq, pos); 3371 } 3372 3373 static void *unix_seq_start(struct seq_file *seq, loff_t *pos) 3374 { 3375 if (!*pos) 3376 return SEQ_START_TOKEN; 3377 3378 return unix_get_first(seq, pos); 3379 } 3380 3381 static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos) 3382 { 3383 ++*pos; 3384 3385 if (v == SEQ_START_TOKEN) 3386 return unix_get_first(seq, pos); 3387 3388 return unix_get_next(seq, v, pos); 3389 } 3390 3391 static void unix_seq_stop(struct seq_file *seq, void *v) 3392 { 3393 struct sock *sk = v; 3394 3395 if (sk) 3396 spin_unlock(&seq_file_net(seq)->unx.table.locks[sk->sk_hash]); 3397 } 3398 3399 static int unix_seq_show(struct seq_file *seq, void *v) 3400 { 3401 3402 if (v == SEQ_START_TOKEN) 3403 seq_puts(seq, "Num RefCount Protocol Flags Type St " 3404 "Inode Path\n"); 3405 else { 3406 struct sock *s = v; 3407 struct unix_sock *u = unix_sk(s); 3408 unix_state_lock(s); 3409 3410 seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu", 3411 s, 3412 refcount_read(&s->sk_refcnt), 3413 0, 3414 s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0, 3415 s->sk_type, 3416 s->sk_socket ? 3417 (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) : 3418 (s->sk_state == TCP_ESTABLISHED ? 
SS_CONNECTING : SS_DISCONNECTING), 3419 sock_i_ino(s)); 3420 3421 if (u->addr) { // under a hash table lock here 3422 int i, len; 3423 seq_putc(seq, ' '); 3424 3425 i = 0; 3426 len = u->addr->len - 3427 offsetof(struct sockaddr_un, sun_path); 3428 if (u->addr->name->sun_path[0]) { 3429 len--; 3430 } else { 3431 seq_putc(seq, '@'); 3432 i++; 3433 } 3434 for ( ; i < len; i++) 3435 seq_putc(seq, u->addr->name->sun_path[i] ?: 3436 '@'); 3437 } 3438 unix_state_unlock(s); 3439 seq_putc(seq, '\n'); 3440 } 3441 3442 return 0; 3443 } 3444 3445 static const struct seq_operations unix_seq_ops = { 3446 .start = unix_seq_start, 3447 .next = unix_seq_next, 3448 .stop = unix_seq_stop, 3449 .show = unix_seq_show, 3450 }; 3451 3452 #ifdef CONFIG_BPF_SYSCALL 3453 struct bpf_unix_iter_state { 3454 struct seq_net_private p; 3455 unsigned int cur_sk; 3456 unsigned int end_sk; 3457 unsigned int max_sk; 3458 struct sock **batch; 3459 bool st_bucket_done; 3460 }; 3461 3462 struct bpf_iter__unix { 3463 __bpf_md_ptr(struct bpf_iter_meta *, meta); 3464 __bpf_md_ptr(struct unix_sock *, unix_sk); 3465 uid_t uid __aligned(8); 3466 }; 3467 3468 static int unix_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta, 3469 struct unix_sock *unix_sk, uid_t uid) 3470 { 3471 struct bpf_iter__unix ctx; 3472 3473 meta->seq_num--; /* skip SEQ_START_TOKEN */ 3474 ctx.meta = meta; 3475 ctx.unix_sk = unix_sk; 3476 ctx.uid = uid; 3477 return bpf_iter_run_prog(prog, &ctx); 3478 } 3479 3480 static int bpf_iter_unix_hold_batch(struct seq_file *seq, struct sock *start_sk) 3481 3482 { 3483 struct bpf_unix_iter_state *iter = seq->private; 3484 unsigned int expected = 1; 3485 struct sock *sk; 3486 3487 sock_hold(start_sk); 3488 iter->batch[iter->end_sk++] = start_sk; 3489 3490 for (sk = sk_next(start_sk); sk; sk = sk_next(sk)) { 3491 if (iter->end_sk < iter->max_sk) { 3492 sock_hold(sk); 3493 iter->batch[iter->end_sk++] = sk; 3494 } 3495 3496 expected++; 3497 } 3498 3499 spin_unlock(&seq_file_net(seq)->unx.table.locks[start_sk->sk_hash]); 3500 3501 return expected; 3502 } 3503 3504 static void bpf_iter_unix_put_batch(struct bpf_unix_iter_state *iter) 3505 { 3506 while (iter->cur_sk < iter->end_sk) 3507 sock_put(iter->batch[iter->cur_sk++]); 3508 } 3509 3510 static int bpf_iter_unix_realloc_batch(struct bpf_unix_iter_state *iter, 3511 unsigned int new_batch_sz) 3512 { 3513 struct sock **new_batch; 3514 3515 new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz, 3516 GFP_USER | __GFP_NOWARN); 3517 if (!new_batch) 3518 return -ENOMEM; 3519 3520 bpf_iter_unix_put_batch(iter); 3521 kvfree(iter->batch); 3522 iter->batch = new_batch; 3523 iter->max_sk = new_batch_sz; 3524 3525 return 0; 3526 } 3527 3528 static struct sock *bpf_iter_unix_batch(struct seq_file *seq, 3529 loff_t *pos) 3530 { 3531 struct bpf_unix_iter_state *iter = seq->private; 3532 unsigned int expected; 3533 bool resized = false; 3534 struct sock *sk; 3535 3536 if (iter->st_bucket_done) 3537 *pos = set_bucket_offset(get_bucket(*pos) + 1, 1); 3538 3539 again: 3540 /* Get a new batch */ 3541 iter->cur_sk = 0; 3542 iter->end_sk = 0; 3543 3544 sk = unix_get_first(seq, pos); 3545 if (!sk) 3546 return NULL; /* Done */ 3547 3548 expected = bpf_iter_unix_hold_batch(seq, sk); 3549 3550 if (iter->end_sk == expected) { 3551 iter->st_bucket_done = true; 3552 return sk; 3553 } 3554 3555 if (!resized && !bpf_iter_unix_realloc_batch(iter, expected * 3 / 2)) { 3556 resized = true; 3557 goto again; 3558 } 3559 3560 return sk; 3561 } 3562 3563 static void *bpf_iter_unix_seq_start(struct 
seq_file *seq, loff_t *pos) 3564 { 3565 if (!*pos) 3566 return SEQ_START_TOKEN; 3567 3568 /* bpf iter does not support lseek, so it always 3569 * continue from where it was stop()-ped. 3570 */ 3571 return bpf_iter_unix_batch(seq, pos); 3572 } 3573 3574 static void *bpf_iter_unix_seq_next(struct seq_file *seq, void *v, loff_t *pos) 3575 { 3576 struct bpf_unix_iter_state *iter = seq->private; 3577 struct sock *sk; 3578 3579 /* Whenever seq_next() is called, the iter->cur_sk is 3580 * done with seq_show(), so advance to the next sk in 3581 * the batch. 3582 */ 3583 if (iter->cur_sk < iter->end_sk) 3584 sock_put(iter->batch[iter->cur_sk++]); 3585 3586 ++*pos; 3587 3588 if (iter->cur_sk < iter->end_sk) 3589 sk = iter->batch[iter->cur_sk]; 3590 else 3591 sk = bpf_iter_unix_batch(seq, pos); 3592 3593 return sk; 3594 } 3595 3596 static int bpf_iter_unix_seq_show(struct seq_file *seq, void *v) 3597 { 3598 struct bpf_iter_meta meta; 3599 struct bpf_prog *prog; 3600 struct sock *sk = v; 3601 uid_t uid; 3602 bool slow; 3603 int ret; 3604 3605 if (v == SEQ_START_TOKEN) 3606 return 0; 3607 3608 slow = lock_sock_fast(sk); 3609 3610 if (unlikely(sk_unhashed(sk))) { 3611 ret = SEQ_SKIP; 3612 goto unlock; 3613 } 3614 3615 uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk)); 3616 meta.seq = seq; 3617 prog = bpf_iter_get_info(&meta, false); 3618 ret = unix_prog_seq_show(prog, &meta, v, uid); 3619 unlock: 3620 unlock_sock_fast(sk, slow); 3621 return ret; 3622 } 3623 3624 static void bpf_iter_unix_seq_stop(struct seq_file *seq, void *v) 3625 { 3626 struct bpf_unix_iter_state *iter = seq->private; 3627 struct bpf_iter_meta meta; 3628 struct bpf_prog *prog; 3629 3630 if (!v) { 3631 meta.seq = seq; 3632 prog = bpf_iter_get_info(&meta, true); 3633 if (prog) 3634 (void)unix_prog_seq_show(prog, &meta, v, 0); 3635 } 3636 3637 if (iter->cur_sk < iter->end_sk) 3638 bpf_iter_unix_put_batch(iter); 3639 } 3640 3641 static const struct seq_operations bpf_iter_unix_seq_ops = { 3642 .start = bpf_iter_unix_seq_start, 3643 .next = bpf_iter_unix_seq_next, 3644 .stop = bpf_iter_unix_seq_stop, 3645 .show = bpf_iter_unix_seq_show, 3646 }; 3647 #endif 3648 #endif 3649 3650 static const struct net_proto_family unix_family_ops = { 3651 .family = PF_UNIX, 3652 .create = unix_create, 3653 .owner = THIS_MODULE, 3654 }; 3655 3656 3657 static int __net_init unix_net_init(struct net *net) 3658 { 3659 int i; 3660 3661 net->unx.sysctl_max_dgram_qlen = 10; 3662 if (unix_sysctl_register(net)) 3663 goto out; 3664 3665 #ifdef CONFIG_PROC_FS 3666 if (!proc_create_net("unix", 0, net->proc_net, &unix_seq_ops, 3667 sizeof(struct seq_net_private))) 3668 goto err_sysctl; 3669 #endif 3670 3671 net->unx.table.locks = kvmalloc_array(UNIX_HASH_SIZE, 3672 sizeof(spinlock_t), GFP_KERNEL); 3673 if (!net->unx.table.locks) 3674 goto err_proc; 3675 3676 net->unx.table.buckets = kvmalloc_array(UNIX_HASH_SIZE, 3677 sizeof(struct hlist_head), 3678 GFP_KERNEL); 3679 if (!net->unx.table.buckets) 3680 goto free_locks; 3681 3682 for (i = 0; i < UNIX_HASH_SIZE; i++) { 3683 spin_lock_init(&net->unx.table.locks[i]); 3684 lock_set_cmp_fn(&net->unx.table.locks[i], unix_table_lock_cmp_fn, NULL); 3685 INIT_HLIST_HEAD(&net->unx.table.buckets[i]); 3686 } 3687 3688 return 0; 3689 3690 free_locks: 3691 kvfree(net->unx.table.locks); 3692 err_proc: 3693 #ifdef CONFIG_PROC_FS 3694 remove_proc_entry("unix", net->proc_net); 3695 err_sysctl: 3696 #endif 3697 unix_sysctl_unregister(net); 3698 out: 3699 return -ENOMEM; 3700 } 3701 3702 static void __net_exit unix_net_exit(struct net 
*net) 3703 { 3704 kvfree(net->unx.table.buckets); 3705 kvfree(net->unx.table.locks); 3706 unix_sysctl_unregister(net); 3707 remove_proc_entry("unix", net->proc_net); 3708 } 3709 3710 static struct pernet_operations unix_net_ops = { 3711 .init = unix_net_init, 3712 .exit = unix_net_exit, 3713 }; 3714 3715 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) 3716 DEFINE_BPF_ITER_FUNC(unix, struct bpf_iter_meta *meta, 3717 struct unix_sock *unix_sk, uid_t uid) 3718 3719 #define INIT_BATCH_SZ 16 3720 3721 static int bpf_iter_init_unix(void *priv_data, struct bpf_iter_aux_info *aux) 3722 { 3723 struct bpf_unix_iter_state *iter = priv_data; 3724 int err; 3725 3726 err = bpf_iter_init_seq_net(priv_data, aux); 3727 if (err) 3728 return err; 3729 3730 err = bpf_iter_unix_realloc_batch(iter, INIT_BATCH_SZ); 3731 if (err) { 3732 bpf_iter_fini_seq_net(priv_data); 3733 return err; 3734 } 3735 3736 return 0; 3737 } 3738 3739 static void bpf_iter_fini_unix(void *priv_data) 3740 { 3741 struct bpf_unix_iter_state *iter = priv_data; 3742 3743 bpf_iter_fini_seq_net(priv_data); 3744 kvfree(iter->batch); 3745 } 3746 3747 static const struct bpf_iter_seq_info unix_seq_info = { 3748 .seq_ops = &bpf_iter_unix_seq_ops, 3749 .init_seq_private = bpf_iter_init_unix, 3750 .fini_seq_private = bpf_iter_fini_unix, 3751 .seq_priv_size = sizeof(struct bpf_unix_iter_state), 3752 }; 3753 3754 static const struct bpf_func_proto * 3755 bpf_iter_unix_get_func_proto(enum bpf_func_id func_id, 3756 const struct bpf_prog *prog) 3757 { 3758 switch (func_id) { 3759 case BPF_FUNC_setsockopt: 3760 return &bpf_sk_setsockopt_proto; 3761 case BPF_FUNC_getsockopt: 3762 return &bpf_sk_getsockopt_proto; 3763 default: 3764 return NULL; 3765 } 3766 } 3767 3768 static struct bpf_iter_reg unix_reg_info = { 3769 .target = "unix", 3770 .ctx_arg_info_size = 1, 3771 .ctx_arg_info = { 3772 { offsetof(struct bpf_iter__unix, unix_sk), 3773 PTR_TO_BTF_ID_OR_NULL }, 3774 }, 3775 .get_func_proto = bpf_iter_unix_get_func_proto, 3776 .seq_info = &unix_seq_info, 3777 }; 3778 3779 static void __init bpf_iter_register(void) 3780 { 3781 unix_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_UNIX]; 3782 if (bpf_iter_reg_target(&unix_reg_info)) 3783 pr_warn("Warning: could not register bpf iterator unix\n"); 3784 } 3785 #endif 3786 3787 static int __init af_unix_init(void) 3788 { 3789 int i, rc = -1; 3790 3791 BUILD_BUG_ON(sizeof(struct unix_skb_parms) > sizeof_field(struct sk_buff, cb)); 3792 3793 for (i = 0; i < UNIX_HASH_SIZE / 2; i++) { 3794 spin_lock_init(&bsd_socket_locks[i]); 3795 INIT_HLIST_HEAD(&bsd_socket_buckets[i]); 3796 } 3797 3798 rc = proto_register(&unix_dgram_proto, 1); 3799 if (rc != 0) { 3800 pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__); 3801 goto out; 3802 } 3803 3804 rc = proto_register(&unix_stream_proto, 1); 3805 if (rc != 0) { 3806 pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__); 3807 proto_unregister(&unix_dgram_proto); 3808 goto out; 3809 } 3810 3811 sock_register(&unix_family_ops); 3812 register_pernet_subsys(&unix_net_ops); 3813 unix_bpf_build_proto(); 3814 3815 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) 3816 bpf_iter_register(); 3817 #endif 3818 3819 out: 3820 return rc; 3821 } 3822 3823 /* Later than subsys_initcall() because we depend on stuff initialised there */ 3824 fs_initcall(af_unix_init); 3825
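
/*
 * Illustrative userspace sketch (not part of the kernel build, assumes the
 * usual <sys/socket.h> and <fcntl.h> definitions): passing an open file
 * descriptor across a SOCK_STREAM socketpair with SCM_RIGHTS.  On the send
 * side this exercises unix_scm_to_skb()/unix_attach_fds() and the
 * scm_stat/GC edge accounting above; the receive path (unix_detach_fds() +
 * scm_recv_unix()) installs a new descriptor referring to the same open file.
 *
 *	int sv[2];
 *	int fd_to_send;
 *	char dummy = 'x';
 *	char cbuf[CMSG_SPACE(sizeof(int))];
 *	struct iovec iov = { .iov_base = &dummy, .iov_len = 1 };
 *	struct msghdr msg = {
 *		.msg_iov = &iov, .msg_iovlen = 1,
 *		.msg_control = cbuf, .msg_controllen = sizeof(cbuf),
 *	};
 *	struct cmsghdr *c;
 *
 *	socketpair(AF_UNIX, SOCK_STREAM, 0, sv);
 *	fd_to_send = open("/dev/null", O_RDONLY);
 *
 *	c = CMSG_FIRSTHDR(&msg);
 *	c->cmsg_level = SOL_SOCKET;
 *	c->cmsg_type  = SCM_RIGHTS;
 *	c->cmsg_len   = CMSG_LEN(sizeof(int));
 *	memcpy(CMSG_DATA(c), &fd_to_send, sizeof(int));
 *	sendmsg(sv[0], &msg, 0);
 *
 *	// The peer recvmsg()s on sv[1] with the same msg_control layout and
 *	// finds the new descriptor in CMSG_DATA() of the SCM_RIGHTS cmsg.
 */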