// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * NET4:	Implementation of BSD Unix domain sockets.
 *
 * Authors:	Alan Cox, <alan@lxorguk.ukuu.org.uk>
 *
 * Fixes:
 *		Linus Torvalds	:	Assorted bug cures.
 *		Niibe Yutaka	:	async I/O support.
 *		Carsten Paeth	:	PF_UNIX check, address fixes.
 *		Alan Cox	:	Limit size of allocated blocks.
 *		Alan Cox	:	Fixed the stupid socketpair bug.
 *		Alan Cox	:	BSD compatibility fine tuning.
 *		Alan Cox	:	Fixed a bug in connect when interrupted.
 *		Alan Cox	:	Sorted out a proper draft version of
 *					file descriptor passing hacked up from
 *					Mike Shaver's work.
 *		Marty Leisner	:	Fixes to fd passing
 *		Nick Nevin	:	recvmsg bugfix.
 *		Alan Cox	:	Started proper garbage collector
 *		Heiko EiBfeldt	:	Missing verify_area check
 *		Alan Cox	:	Started POSIXisms
 *		Andreas Schwab	:	Replace inode by dentry for proper
 *					reference counting
 *		Kirk Petersen	:	Made this a module
 *	    Christoph Rohland	:	Elegant non-blocking accept/connect algorithm.
 *					Lots of bug fixes.
 *	     Alexey Kuznetsov	:	Repaired (I hope) bugs introduced
 *					by the above two patches.
 *	     Andrea Arcangeli	:	If possible we block in connect(2)
 *					if the max backlog of the listen socket
 *					has been reached. This won't break
 *					old apps and it will avoid huge amounts
 *					of socks hashed (this for unix_gc()
 *					performance reasons).
 *					Security fix that limits the max
 *					number of socks to 2*max_files and
 *					the number of skbs queueable in the
 *					dgram receiver.
 *		Artur Skawina	:	Hash function optimizations
 *	     Alexey Kuznetsov	:	Full scale SMP. Lots of bugs are introduced 8)
 *	      Malcolm Beattie	:	Set peercred for socketpair
 *	     Michal Ostrowski	:	Module initialization cleanup.
 *	     Arnaldo C. Melo	:	Remove MOD_{INC,DEC}_USE_COUNT,
 *					the core infrastructure is doing that
 *					for all net proto families now (2.5.69+)
 *
 * Known differences from reference BSD that was tested:
 *
 *	[TO FIX]
 *	ECONNREFUSED is not returned from one end of a connected() socket to the
 *		other the moment one end closes.
 *	fstat() doesn't return st_dev=0, and give the blksize as high water mark
 *		and a fake inode identifier (nor the BSD first socket fstat twice bug).
 *	[NOT TO FIX]
 *	accept() returns a path name even if the connecting socket has closed
 *		in the meantime (BSD loses the path and gives up).
 *	accept() returns 0 length path for an unbound connector. BSD returns 16
 *		and a null first byte in the path (but not for gethost/peername - BSD bug ??)
 *	socketpair(...SOCK_RAW..) doesn't panic the kernel.
 *	BSD af_unix apparently has connect forgetting to block properly.
 *		(need to check this with the POSIX spec in detail)
 *
 * Differences from 2.0.0-11-... (ANK)
 *	Bug fixes and improvements.
 *		- client shutdown killed server socket.
 *		- removed all useless cli/sti pairs.
 *
 *	Semantic changes/extensions.
 *		- generic control message passing.
 *		- SCM_CREDENTIALS control message.
 *		- "Abstract" (not FS based) socket bindings.
 *		  Abstract names are sequences of bytes (not zero terminated)
 *		  started by 0, so that this name space does not intersect
 *		  with BSD names.
 */
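/* Editor's note: a minimal userspace sketch (not part of this file) showing
 * the two binding flavours described above. An abstract name starts with a
 * NUL byte and its length is given by the address length, not by string
 * termination; the path "/tmp/demo.sock" and the name "demo" are
 * illustrative only.
 *
 *	#include <stddef.h>
 *	#include <string.h>
 *	#include <sys/socket.h>
 *	#include <sys/un.h>
 *
 *	int bind_unix_demo(int use_abstract)
 *	{
 *		struct sockaddr_un sun;
 *		socklen_t len;
 *		int fd = socket(AF_UNIX, SOCK_STREAM, 0);
 *
 *		memset(&sun, 0, sizeof(sun));
 *		sun.sun_family = AF_UNIX;
 *		if (use_abstract) {
 *			sun.sun_path[0] = '\0';
 *			memcpy(sun.sun_path + 1, "demo", 4);
 *			len = offsetof(struct sockaddr_un, sun_path) + 1 + 4;
 *		} else {
 *			strcpy(sun.sun_path, "/tmp/demo.sock");
 *			len = sizeof(sun);
 *		}
 *		return bind(fd, (struct sockaddr *)&sun, len);
 *	}
 */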
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/signal.h>
#include <linux/sched/signal.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/stat.h>
#include <linux/dcache.h>
#include <linux/namei.h>
#include <linux/socket.h>
#include <linux/un.h>
#include <linux/fcntl.h>
#include <linux/filter.h>
#include <linux/termios.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/in.h>
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <net/net_namespace.h>
#include <net/sock.h>
#include <net/tcp_states.h>
#include <net/af_unix.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <net/scm.h>
#include <linux/init.h>
#include <linux/poll.h>
#include <linux/rtnetlink.h>
#include <linux/mount.h>
#include <net/checksum.h>
#include <linux/security.h>
#include <linux/splice.h>
#include <linux/freezer.h>
#include <linux/file.h>
#include <linux/btf_ids.h>
#include <linux/bpf-cgroup.h>

static atomic_long_t unix_nr_socks;
static struct hlist_head bsd_socket_buckets[UNIX_HASH_SIZE / 2];
static spinlock_t bsd_socket_locks[UNIX_HASH_SIZE / 2];

/* SMP locking strategy:
 *    hash table is protected with spinlock.
 *    each socket state is protected by separate spinlock.
 */

static unsigned int unix_unbound_hash(struct sock *sk)
{
	unsigned long hash = (unsigned long)sk;

	hash ^= hash >> 16;
	hash ^= hash >> 8;
	hash ^= sk->sk_type;

	return hash & UNIX_HASH_MOD;
}

static unsigned int unix_bsd_hash(struct inode *i)
{
	return i->i_ino & UNIX_HASH_MOD;
}

static unsigned int unix_abstract_hash(struct sockaddr_un *sunaddr,
				       int addr_len, int type)
{
	__wsum csum = csum_partial(sunaddr, addr_len, 0);
	unsigned int hash;

	hash = (__force unsigned int)csum_fold(csum);
	hash ^= hash >> 8;
	hash ^= type;

	return UNIX_HASH_MOD + 1 + (hash & UNIX_HASH_MOD);
}

static void unix_table_double_lock(struct net *net,
				   unsigned int hash1, unsigned int hash2)
{
	if (hash1 == hash2) {
		spin_lock(&net->unx.table.locks[hash1]);
		return;
	}

	if (hash1 > hash2)
		swap(hash1, hash2);

	spin_lock(&net->unx.table.locks[hash1]);
	spin_lock_nested(&net->unx.table.locks[hash2], SINGLE_DEPTH_NESTING);
}

static void unix_table_double_unlock(struct net *net,
				     unsigned int hash1, unsigned int hash2)
{
	if (hash1 == hash2) {
		spin_unlock(&net->unx.table.locks[hash1]);
		return;
	}

	spin_unlock(&net->unx.table.locks[hash1]);
	spin_unlock(&net->unx.table.locks[hash2]);
}

#ifdef CONFIG_SECURITY_NETWORK
static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{
	UNIXCB(skb).secid = scm->secid;
}

static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{
	scm->secid = UNIXCB(skb).secid;
}

static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
{
	return (scm->secid == UNIXCB(skb).secid);
}
#else
static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{ }

static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{ }

static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
{
	return true;
}
#endif /* CONFIG_SECURITY_NETWORK */

static inline int unix_our_peer(struct sock *sk, struct sock *osk)
{
	return unix_peer(osk) == sk;
}

static inline int unix_may_send(struct sock *sk, struct sock *osk)
{
	return unix_peer(osk) == NULL || unix_our_peer(sk, osk);
}

static inline int unix_recvq_full_lockless(const struct sock *sk)
{
	return skb_queue_len_lockless(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
}

struct sock *unix_peer_get(struct sock *s)
{
	struct sock *peer;

	unix_state_lock(s);
	peer = unix_peer(s);
	if (peer)
		sock_hold(peer);
	unix_state_unlock(s);
	return peer;
}
EXPORT_SYMBOL_GPL(unix_peer_get);

static struct unix_address *unix_create_addr(struct sockaddr_un *sunaddr,
					     int addr_len)
{
	struct unix_address *addr;

	addr = kmalloc(sizeof(*addr) + addr_len, GFP_KERNEL);
	if (!addr)
		return NULL;

	refcount_set(&addr->refcnt, 1);
	addr->len = addr_len;
	memcpy(addr->name, sunaddr, addr_len);

	return addr;
}

static inline void unix_release_addr(struct unix_address *addr)
{
	if (refcount_dec_and_test(&addr->refcnt))
		kfree(addr);
}

/*
 *	Check unix socket name:
 *		- should be not zero length.
 *		- if started by not zero, should be NULL terminated (FS object)
 *		- if started by zero, it is abstract name.
 */

static int unix_validate_addr(struct sockaddr_un *sunaddr, int addr_len)
{
	if (addr_len <= offsetof(struct sockaddr_un, sun_path) ||
	    addr_len > sizeof(*sunaddr))
		return -EINVAL;

	if (sunaddr->sun_family != AF_UNIX)
		return -EINVAL;

	return 0;
}

static int unix_mkname_bsd(struct sockaddr_un *sunaddr, int addr_len)
{
	struct sockaddr_storage *addr = (struct sockaddr_storage *)sunaddr;
	short offset = offsetof(struct sockaddr_storage, __data);

	BUILD_BUG_ON(offset != offsetof(struct sockaddr_un, sun_path));

	/* This may look like an off by one error but it is a bit more
	 * subtle. 108 is the longest valid AF_UNIX path for a binding.
	 * sun_path[108] doesn't as such exist. However in kernel space
	 * we are guaranteed that it is a valid memory location in our
	 * kernel address buffer because syscall functions always pass
	 * a pointer of struct sockaddr_storage which has a bigger buffer
	 * than 108. Also, we must terminate sun_path for strlen() in
	 * getname_kernel().
	 */
	addr->__data[addr_len - offset] = 0;

	/* Don't pass sunaddr->sun_path to strlen(). Otherwise, 108 will
	 * cause panic if CONFIG_FORTIFY_SOURCE=y. Let __fortify_strlen()
	 * know the actual buffer.
	 */
	return strlen(addr->__data) + offset + 1;
}
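/* Editor's note: a small worked example of the arithmetic above, under the
 * usual layout where sun_path starts at offset 2. If userspace passes
 * sun_path = "/tmp/x" without a trailing NUL and addr_len = 2 + 6 = 8, the
 * store above writes the terminating NUL at __data[8 - 2] = __data[6], and
 * the function returns strlen("/tmp/x") + 2 + 1 = 9, i.e. the address length
 * including the NUL byte.
 */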
static void __unix_remove_socket(struct sock *sk)
{
	sk_del_node_init(sk);
}

static void __unix_insert_socket(struct net *net, struct sock *sk)
{
	DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
	sk_add_node(sk, &net->unx.table.buckets[sk->sk_hash]);
}

static void __unix_set_addr_hash(struct net *net, struct sock *sk,
				 struct unix_address *addr, unsigned int hash)
{
	__unix_remove_socket(sk);
	smp_store_release(&unix_sk(sk)->addr, addr);

	sk->sk_hash = hash;
	__unix_insert_socket(net, sk);
}

static void unix_remove_socket(struct net *net, struct sock *sk)
{
	spin_lock(&net->unx.table.locks[sk->sk_hash]);
	__unix_remove_socket(sk);
	spin_unlock(&net->unx.table.locks[sk->sk_hash]);
}

static void unix_insert_unbound_socket(struct net *net, struct sock *sk)
{
	spin_lock(&net->unx.table.locks[sk->sk_hash]);
	__unix_insert_socket(net, sk);
	spin_unlock(&net->unx.table.locks[sk->sk_hash]);
}

static void unix_insert_bsd_socket(struct sock *sk)
{
	spin_lock(&bsd_socket_locks[sk->sk_hash]);
	sk_add_bind_node(sk, &bsd_socket_buckets[sk->sk_hash]);
	spin_unlock(&bsd_socket_locks[sk->sk_hash]);
}

static void unix_remove_bsd_socket(struct sock *sk)
{
	if (!hlist_unhashed(&sk->sk_bind_node)) {
		spin_lock(&bsd_socket_locks[sk->sk_hash]);
		__sk_del_bind_node(sk);
		spin_unlock(&bsd_socket_locks[sk->sk_hash]);

		sk_node_init(&sk->sk_bind_node);
	}
}

static struct sock *__unix_find_socket_byname(struct net *net,
					      struct sockaddr_un *sunname,
					      int len, unsigned int hash)
{
	struct sock *s;

	sk_for_each(s, &net->unx.table.buckets[hash]) {
		struct unix_sock *u = unix_sk(s);

		if (u->addr->len == len &&
		    !memcmp(u->addr->name, sunname, len))
			return s;
	}
	return NULL;
}

static inline struct sock *unix_find_socket_byname(struct net *net,
						   struct sockaddr_un *sunname,
						   int len, unsigned int hash)
{
	struct sock *s;

	spin_lock(&net->unx.table.locks[hash]);
	s = __unix_find_socket_byname(net, sunname, len, hash);
	if (s)
		sock_hold(s);
	spin_unlock(&net->unx.table.locks[hash]);
	return s;
}

static struct sock *unix_find_socket_byinode(struct inode *i)
{
	unsigned int hash = unix_bsd_hash(i);
	struct sock *s;

	spin_lock(&bsd_socket_locks[hash]);
	sk_for_each_bound(s, &bsd_socket_buckets[hash]) {
		struct dentry *dentry = unix_sk(s)->path.dentry;

		if (dentry && d_backing_inode(dentry) == i) {
			sock_hold(s);
			spin_unlock(&bsd_socket_locks[hash]);
			return s;
		}
	}
	spin_unlock(&bsd_socket_locks[hash]);
	return NULL;
}

/* Support code for asymmetrically connected dgram sockets
 *
 * If a datagram socket is connected to a socket not itself connected
 * to the first socket (eg, /dev/log), clients may only enqueue more
 * messages if the present receive queue of the server socket is not
 * "too large". This means there's a second writeability condition
 * poll and sendmsg need to test. The dgram recv code will do a wake
 * up on the peer_wait wait queue of a socket upon reception of a
 * datagram which needs to be propagated to sleeping would-be writers
 * since these might not have sent anything so far. This can't be
 * accomplished via poll_wait because the lifetime of the server
 * socket might be less than that of its clients if these break their
 * association with it or if the server socket is closed while clients
 * are still connected to it and there's no way to inform "a polling
 * implementation" that it should let go of a certain wait queue
 *
 * In order to propagate a wake up, a wait_queue_entry_t of the client
 * socket is enqueued on the peer_wait queue of the server socket
 * whose wake function does a wake_up on the ordinary client socket
 * wait queue. This connection is established whenever a write (or
 * poll for write) hit the flow control condition and broken when the
 * association to the server socket is dissolved or after a wake up
 * was relayed.
 */
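/* Editor's note: an illustrative userspace view (not part of this file) of
 * the flow-control condition described above. A dgram client connected to a
 * busy receiver such as a syslog socket can wait for the relayed wakeup
 * instead of spinning on -EAGAIN; the helper name is hypothetical.
 *
 *	#include <errno.h>
 *	#include <poll.h>
 *	#include <sys/socket.h>
 *
 *	ssize_t send_when_writable(int fd, const void *buf, size_t len)
 *	{
 *		struct pollfd pfd = { .fd = fd, .events = POLLOUT };
 *		ssize_t ret;
 *
 *		for (;;) {
 *			ret = send(fd, buf, len, MSG_DONTWAIT);
 *			if (ret >= 0 || errno != EAGAIN)
 *				return ret;
 *			poll(&pfd, 1, -1);	// woken by the peer_wait relay
 *		}
 *	}
 */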
static int unix_dgram_peer_wake_relay(wait_queue_entry_t *q, unsigned mode, int flags,
				      void *key)
{
	struct unix_sock *u;
	wait_queue_head_t *u_sleep;

	u = container_of(q, struct unix_sock, peer_wake);

	__remove_wait_queue(&unix_sk(u->peer_wake.private)->peer_wait,
			    q);
	u->peer_wake.private = NULL;

	/* relaying can only happen while the wq still exists */
	u_sleep = sk_sleep(&u->sk);
	if (u_sleep)
		wake_up_interruptible_poll(u_sleep, key_to_poll(key));

	return 0;
}

static int unix_dgram_peer_wake_connect(struct sock *sk, struct sock *other)
{
	struct unix_sock *u, *u_other;
	int rc;

	u = unix_sk(sk);
	u_other = unix_sk(other);
	rc = 0;
	spin_lock(&u_other->peer_wait.lock);

	if (!u->peer_wake.private) {
		u->peer_wake.private = other;
		__add_wait_queue(&u_other->peer_wait, &u->peer_wake);

		rc = 1;
	}

	spin_unlock(&u_other->peer_wait.lock);
	return rc;
}

static void unix_dgram_peer_wake_disconnect(struct sock *sk,
					    struct sock *other)
{
	struct unix_sock *u, *u_other;

	u = unix_sk(sk);
	u_other = unix_sk(other);
	spin_lock(&u_other->peer_wait.lock);

	if (u->peer_wake.private == other) {
		__remove_wait_queue(&u_other->peer_wait, &u->peer_wake);
		u->peer_wake.private = NULL;
	}

	spin_unlock(&u_other->peer_wait.lock);
}

static void unix_dgram_peer_wake_disconnect_wakeup(struct sock *sk,
						   struct sock *other)
{
	unix_dgram_peer_wake_disconnect(sk, other);
	wake_up_interruptible_poll(sk_sleep(sk),
				   EPOLLOUT |
				   EPOLLWRNORM |
				   EPOLLWRBAND);
}

/* preconditions:
 *	- unix_peer(sk) == other
 *	- association is stable
 */
static int unix_dgram_peer_wake_me(struct sock *sk, struct sock *other)
{
	int connected;

	connected = unix_dgram_peer_wake_connect(sk, other);

	/* If other is SOCK_DEAD, we want to make sure we signal
	 * POLLOUT, such that a subsequent write() can get a
	 * -ECONNREFUSED. Otherwise, if we haven't queued any skbs
	 * to other and its full, we will hang waiting for POLLOUT.
	 */
	if (unix_recvq_full_lockless(other) && !sock_flag(other, SOCK_DEAD))
		return 1;

	if (connected)
		unix_dgram_peer_wake_disconnect(sk, other);

	return 0;
}

static int unix_writable(const struct sock *sk, unsigned char state)
{
	return state != TCP_LISTEN &&
		(refcount_read(&sk->sk_wmem_alloc) << 2) <= READ_ONCE(sk->sk_sndbuf);
}

static void unix_write_space(struct sock *sk)
{
	struct socket_wq *wq;

	rcu_read_lock();
	if (unix_writable(sk, READ_ONCE(sk->sk_state))) {
		wq = rcu_dereference(sk->sk_wq);
		if (skwq_has_sleeper(wq))
			wake_up_interruptible_sync_poll(&wq->wait,
				EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND);
		sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT);
	}
	rcu_read_unlock();
}

/* When dgram socket disconnects (or changes its peer), we clear its receive
 * queue of packets arrived from previous peer. First, it allows to do
 * flow control based only on wmem_alloc; second, sk connected to peer
 * may receive messages only from that peer. */
static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
{
	if (!skb_queue_empty(&sk->sk_receive_queue)) {
		skb_queue_purge(&sk->sk_receive_queue);
		wake_up_interruptible_all(&unix_sk(sk)->peer_wait);

		/* If one link of bidirectional dgram pipe is disconnected,
		 * we signal error. Messages are lost. Do not make this,
		 * when peer was not connected to us.
		 */
		if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
			WRITE_ONCE(other->sk_err, ECONNRESET);
			sk_error_report(other);
		}
	}
}

static void unix_sock_destructor(struct sock *sk)
{
	struct unix_sock *u = unix_sk(sk);

	skb_queue_purge(&sk->sk_receive_queue);

	DEBUG_NET_WARN_ON_ONCE(refcount_read(&sk->sk_wmem_alloc));
	DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
	DEBUG_NET_WARN_ON_ONCE(sk->sk_socket);
	if (!sock_flag(sk, SOCK_DEAD)) {
		pr_info("Attempt to release alive unix socket: %p\n", sk);
		return;
	}

	if (u->addr)
		unix_release_addr(u->addr);

	atomic_long_dec(&unix_nr_socks);
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
#ifdef UNIX_REFCNT_DEBUG
	pr_debug("UNIX %p is destroyed, %ld are still alive.\n", sk,
		 atomic_long_read(&unix_nr_socks));
#endif
}

static void unix_release_sock(struct sock *sk, int embrion)
{
	struct unix_sock *u = unix_sk(sk);
	struct sock *skpair;
	struct sk_buff *skb;
	struct path path;
	int state;

	unix_remove_socket(sock_net(sk), sk);
	unix_remove_bsd_socket(sk);

	/* Clear state */
	unix_state_lock(sk);
	sock_orphan(sk);
	WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK);
	path = u->path;
	u->path.dentry = NULL;
	u->path.mnt = NULL;
	state = sk->sk_state;
	WRITE_ONCE(sk->sk_state, TCP_CLOSE);

	skpair = unix_peer(sk);
	unix_peer(sk) = NULL;

	unix_state_unlock(sk);

#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
	if (u->oob_skb) {
		kfree_skb(u->oob_skb);
		u->oob_skb = NULL;
	}
#endif

	wake_up_interruptible_all(&u->peer_wait);

	if (skpair != NULL) {
		if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
			unix_state_lock(skpair);
			/* No more writes */
			WRITE_ONCE(skpair->sk_shutdown, SHUTDOWN_MASK);
			if (!skb_queue_empty_lockless(&sk->sk_receive_queue) || embrion)
				WRITE_ONCE(skpair->sk_err, ECONNRESET);
			unix_state_unlock(skpair);
			skpair->sk_state_change(skpair);
			sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
		}

		unix_dgram_peer_wake_disconnect(sk, skpair);
		sock_put(skpair); /* It may now die */
	}

	/* Try to flush out this socket. Throw out buffers at least */

	while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
		if (state == TCP_LISTEN)
			unix_release_sock(skb->sk, 1);
		/* passed fds are erased in the kfree_skb hook */
		UNIXCB(skb).consumed = skb->len;
		kfree_skb(skb);
	}

	if (path.dentry)
		path_put(&path);

	sock_put(sk);

	/* ---- Socket is dead now and most probably destroyed ---- */

	/*
	 * Fixme: BSD difference: In BSD all sockets connected to us get
	 *	  ECONNRESET and we die on the spot. In Linux we behave
	 *	  like files and pipes do and wait for the last
	 *	  dereference.
	 *
	 * Can't we simply set sock->err?
	 *
	 * What the above comment does talk about? --ANK(980817)
	 */

	if (READ_ONCE(unix_tot_inflight))
		unix_gc();	/* Garbage collect fds */
}

static void init_peercred(struct sock *sk)
{
	const struct cred *old_cred;
	struct pid *old_pid;

	spin_lock(&sk->sk_peer_lock);
	old_pid = sk->sk_peer_pid;
	old_cred = sk->sk_peer_cred;
	sk->sk_peer_pid = get_pid(task_tgid(current));
	sk->sk_peer_cred = get_current_cred();
	spin_unlock(&sk->sk_peer_lock);

	put_pid(old_pid);
	put_cred(old_cred);
}

static void copy_peercred(struct sock *sk, struct sock *peersk)
{
	const struct cred *old_cred;
	struct pid *old_pid;

	if (sk < peersk) {
		spin_lock(&sk->sk_peer_lock);
		spin_lock_nested(&peersk->sk_peer_lock, SINGLE_DEPTH_NESTING);
	} else {
		spin_lock(&peersk->sk_peer_lock);
		spin_lock_nested(&sk->sk_peer_lock, SINGLE_DEPTH_NESTING);
	}
	old_pid = sk->sk_peer_pid;
	old_cred = sk->sk_peer_cred;
	sk->sk_peer_pid = get_pid(peersk->sk_peer_pid);
	sk->sk_peer_cred = get_cred(peersk->sk_peer_cred);

	spin_unlock(&sk->sk_peer_lock);
	spin_unlock(&peersk->sk_peer_lock);

	put_pid(old_pid);
	put_cred(old_cred);
}

static int unix_listen(struct socket *sock, int backlog)
{
	int err;
	struct sock *sk = sock->sk;
	struct unix_sock *u = unix_sk(sk);

	err = -EOPNOTSUPP;
	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
		goto out;	/* Only stream/seqpacket sockets accept */
	err = -EINVAL;
	if (!READ_ONCE(u->addr))
		goto out;	/* No listens on an unbound socket */
	unix_state_lock(sk);
	if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
		goto out_unlock;
	if (backlog > sk->sk_max_ack_backlog)
		wake_up_interruptible_all(&u->peer_wait);
	sk->sk_max_ack_backlog = backlog;
	WRITE_ONCE(sk->sk_state, TCP_LISTEN);

	/* set credentials so connect can copy them */
	init_peercred(sk);
	err = 0;

out_unlock:
	unix_state_unlock(sk);
out:
	return err;
}

static int unix_release(struct socket *);
static int unix_bind(struct socket *, struct sockaddr *, int);
static int unix_stream_connect(struct socket *, struct sockaddr *,
			       int addr_len, int flags);
static int unix_socketpair(struct socket *, struct socket *);
static int unix_accept(struct socket *, struct socket *, struct proto_accept_arg *arg);
static int unix_getname(struct socket *, struct sockaddr *, int);
static __poll_t unix_poll(struct file *, struct socket *, poll_table *);
static __poll_t unix_dgram_poll(struct file *, struct socket *,
				poll_table *);
static int unix_ioctl(struct socket *, unsigned int, unsigned long);
#ifdef CONFIG_COMPAT
static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
#endif
static int unix_shutdown(struct socket *, int);
static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t);
static int unix_stream_recvmsg(struct socket *, struct msghdr *, size_t, int);
static ssize_t unix_stream_splice_read(struct socket *, loff_t *ppos,
				       struct pipe_inode_info *, size_t size,
				       unsigned int flags);
static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t);
static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int);
static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
static int unix_dgram_connect(struct socket *, struct sockaddr *,
			      int, int);
static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t);
static int unix_seqpacket_recvmsg(struct socket *, struct msghdr *, size_t,
				  int);

#ifdef CONFIG_PROC_FS
static int unix_count_nr_fds(struct sock *sk)
{
	struct sk_buff *skb;
	struct unix_sock *u;
	int nr_fds = 0;

	spin_lock(&sk->sk_receive_queue.lock);
	skb = skb_peek(&sk->sk_receive_queue);
	while (skb) {
		u = unix_sk(skb->sk);
		nr_fds += atomic_read(&u->scm_stat.nr_fds);
		skb = skb_peek_next(skb, &sk->sk_receive_queue);
	}
	spin_unlock(&sk->sk_receive_queue.lock);

	return nr_fds;
}

static void unix_show_fdinfo(struct seq_file *m, struct socket *sock)
{
	struct sock *sk = sock->sk;
	unsigned char s_state;
	struct unix_sock *u;
	int nr_fds = 0;

	if (sk) {
		s_state = READ_ONCE(sk->sk_state);
		u = unix_sk(sk);

		/* SOCK_STREAM and SOCK_SEQPACKET sockets never change their
		 * sk_state after switching to TCP_ESTABLISHED or TCP_LISTEN.
		 * SOCK_DGRAM is ordinary. So, no lock is needed.
		 */
		if (sock->type == SOCK_DGRAM || s_state == TCP_ESTABLISHED)
			nr_fds = atomic_read(&u->scm_stat.nr_fds);
		else if (s_state == TCP_LISTEN)
			nr_fds = unix_count_nr_fds(sk);

		seq_printf(m, "scm_fds: %u\n", nr_fds);
	}
}
#else
#define unix_show_fdinfo NULL
#endif

static const struct proto_ops unix_stream_ops = {
	.family =	PF_UNIX,
	.owner =	THIS_MODULE,
	.release =	unix_release,
	.bind =		unix_bind,
	.connect =	unix_stream_connect,
	.socketpair =	unix_socketpair,
	.accept =	unix_accept,
	.getname =	unix_getname,
	.poll =		unix_poll,
	.ioctl =	unix_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl =	unix_compat_ioctl,
#endif
	.listen =	unix_listen,
	.shutdown =	unix_shutdown,
	.sendmsg =	unix_stream_sendmsg,
	.recvmsg =	unix_stream_recvmsg,
	.read_skb =	unix_stream_read_skb,
	.mmap =		sock_no_mmap,
	.splice_read =	unix_stream_splice_read,
	.set_peek_off =	sk_set_peek_off,
	.show_fdinfo =	unix_show_fdinfo,
};

static const struct proto_ops unix_dgram_ops = {
	.family =	PF_UNIX,
	.owner =	THIS_MODULE,
	.release =	unix_release,
	.bind =		unix_bind,
	.connect =	unix_dgram_connect,
	.socketpair =	unix_socketpair,
	.accept =	sock_no_accept,
	.getname =	unix_getname,
	.poll =		unix_dgram_poll,
	.ioctl =	unix_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl =	unix_compat_ioctl,
#endif
	.listen =	sock_no_listen,
	.shutdown =	unix_shutdown,
	.sendmsg =	unix_dgram_sendmsg,
	.read_skb =	unix_read_skb,
	.recvmsg =	unix_dgram_recvmsg,
	.mmap =		sock_no_mmap,
	.set_peek_off =	sk_set_peek_off,
	.show_fdinfo =	unix_show_fdinfo,
};

static const struct proto_ops unix_seqpacket_ops = {
	.family =	PF_UNIX,
	.owner =	THIS_MODULE,
	.release =	unix_release,
	.bind =		unix_bind,
	.connect =	unix_stream_connect,
	.socketpair =	unix_socketpair,
	.accept =	unix_accept,
	.getname =	unix_getname,
	.poll =		unix_dgram_poll,
	.ioctl =	unix_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl =	unix_compat_ioctl,
#endif
	.listen =	unix_listen,
	.shutdown =	unix_shutdown,
	.sendmsg =	unix_seqpacket_sendmsg,
	.recvmsg =	unix_seqpacket_recvmsg,
	.mmap =		sock_no_mmap,
	.set_peek_off =	sk_set_peek_off,
	.show_fdinfo =	unix_show_fdinfo,
};

static void unix_close(struct sock *sk, long timeout)
{
	/* Nothing to do here, unix socket does not need a ->close().
	 * This is merely for sockmap.
	 */
}

static void unix_unhash(struct sock *sk)
{
	/* Nothing to do here, unix socket does not need a ->unhash().
	 * This is merely for sockmap.
	 */
}

static bool unix_bpf_bypass_getsockopt(int level, int optname)
{
	if (level == SOL_SOCKET) {
		switch (optname) {
		case SO_PEERPIDFD:
			return true;
		default:
			return false;
		}
	}

	return false;
}

struct proto unix_dgram_proto = {
	.name			= "UNIX",
	.owner			= THIS_MODULE,
	.obj_size		= sizeof(struct unix_sock),
	.close			= unix_close,
	.bpf_bypass_getsockopt	= unix_bpf_bypass_getsockopt,
#ifdef CONFIG_BPF_SYSCALL
	.psock_update_sk_prot	= unix_dgram_bpf_update_proto,
#endif
};

struct proto unix_stream_proto = {
	.name			= "UNIX-STREAM",
	.owner			= THIS_MODULE,
	.obj_size		= sizeof(struct unix_sock),
	.close			= unix_close,
	.unhash			= unix_unhash,
	.bpf_bypass_getsockopt	= unix_bpf_bypass_getsockopt,
#ifdef CONFIG_BPF_SYSCALL
	.psock_update_sk_prot	= unix_stream_bpf_update_proto,
#endif
};

static struct sock *unix_create1(struct net *net, struct socket *sock, int kern, int type)
{
	struct unix_sock *u;
	struct sock *sk;
	int err;

	atomic_long_inc(&unix_nr_socks);
	if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files()) {
		err = -ENFILE;
		goto err;
	}

	if (type == SOCK_STREAM)
		sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_stream_proto, kern);
	else /*dgram and seqpacket */
		sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_dgram_proto, kern);

	if (!sk) {
		err = -ENOMEM;
		goto err;
	}

	sock_init_data(sock, sk);

	sk->sk_hash		= unix_unbound_hash(sk);
	sk->sk_allocation	= GFP_KERNEL_ACCOUNT;
	sk->sk_write_space	= unix_write_space;
	sk->sk_max_ack_backlog	= READ_ONCE(net->unx.sysctl_max_dgram_qlen);
	sk->sk_destruct		= unix_sock_destructor;
	u = unix_sk(sk);
	u->listener = NULL;
	u->vertex = NULL;
	u->path.dentry = NULL;
	u->path.mnt = NULL;
	spin_lock_init(&u->lock);
	mutex_init(&u->iolock); /* single task reading lock */
	mutex_init(&u->bindlock); /* single task binding lock */
	init_waitqueue_head(&u->peer_wait);
	init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay);
	memset(&u->scm_stat, 0, sizeof(struct scm_stat));
	unix_insert_unbound_socket(net, sk);

	sock_prot_inuse_add(net, sk->sk_prot, 1);

	return sk;

err:
	atomic_long_dec(&unix_nr_socks);
	return ERR_PTR(err);
}

static int unix_create(struct net *net, struct socket *sock, int protocol,
		       int kern)
{
	struct sock *sk;

	if (protocol && protocol != PF_UNIX)
		return -EPROTONOSUPPORT;

	sock->state = SS_UNCONNECTED;

	switch (sock->type) {
	case SOCK_STREAM:
		sock->ops = &unix_stream_ops;
		break;
		/*
		 *	Believe it or not BSD has AF_UNIX, SOCK_RAW though
		 *	nothing uses it.
		 */
	case SOCK_RAW:
		sock->type = SOCK_DGRAM;
		fallthrough;
	case SOCK_DGRAM:
		sock->ops = &unix_dgram_ops;
		break;
	case SOCK_SEQPACKET:
		sock->ops = &unix_seqpacket_ops;
		break;
	default:
		return -ESOCKTNOSUPPORT;
	}

	sk = unix_create1(net, sock, kern, sock->type);
	if (IS_ERR(sk))
		return PTR_ERR(sk);

	return 0;
}

static int unix_release(struct socket *sock)
{
	struct sock *sk = sock->sk;

	if (!sk)
		return 0;

	sk->sk_prot->close(sk, 0);
	unix_release_sock(sk, 0);
	sock->sk = NULL;

	return 0;
}

static struct sock *unix_find_bsd(struct sockaddr_un *sunaddr, int addr_len,
				  int type)
{
	struct inode *inode;
	struct path path;
	struct sock *sk;
	int err;

	unix_mkname_bsd(sunaddr, addr_len);
	err = kern_path(sunaddr->sun_path, LOOKUP_FOLLOW, &path);
	if (err)
		goto fail;

	err = path_permission(&path, MAY_WRITE);
	if (err)
		goto path_put;

	err = -ECONNREFUSED;
	inode = d_backing_inode(path.dentry);
	if (!S_ISSOCK(inode->i_mode))
		goto path_put;

	sk = unix_find_socket_byinode(inode);
	if (!sk)
		goto path_put;

	err = -EPROTOTYPE;
	if (sk->sk_type == type)
		touch_atime(&path);
	else
		goto sock_put;

	path_put(&path);

	return sk;

sock_put:
	sock_put(sk);
path_put:
	path_put(&path);
fail:
	return ERR_PTR(err);
}

static struct sock *unix_find_abstract(struct net *net,
				       struct sockaddr_un *sunaddr,
				       int addr_len, int type)
{
	unsigned int hash = unix_abstract_hash(sunaddr, addr_len, type);
	struct dentry *dentry;
	struct sock *sk;

	sk = unix_find_socket_byname(net, sunaddr, addr_len, hash);
	if (!sk)
		return ERR_PTR(-ECONNREFUSED);

	dentry = unix_sk(sk)->path.dentry;
	if (dentry)
		touch_atime(&unix_sk(sk)->path);

	return sk;
}

static struct sock *unix_find_other(struct net *net,
				    struct sockaddr_un *sunaddr,
				    int addr_len, int type)
{
	struct sock *sk;

	if (sunaddr->sun_path[0])
		sk = unix_find_bsd(sunaddr, addr_len, type);
	else
		sk = unix_find_abstract(net, sunaddr, addr_len, type);

	return sk;
}

static int unix_autobind(struct sock *sk)
{
	struct unix_sock *u = unix_sk(sk);
	unsigned int new_hash, old_hash;
	struct net *net = sock_net(sk);
	struct unix_address *addr;
	u32 lastnum, ordernum;
	int err;

	err = mutex_lock_interruptible(&u->bindlock);
	if (err)
		return err;

	if (u->addr)
		goto out;

	err = -ENOMEM;
	addr = kzalloc(sizeof(*addr) +
		       offsetof(struct sockaddr_un, sun_path) + 16, GFP_KERNEL);
	if (!addr)
		goto out;

	addr->len = offsetof(struct sockaddr_un, sun_path) + 6;
	addr->name->sun_family = AF_UNIX;
	refcount_set(&addr->refcnt, 1);

	old_hash = sk->sk_hash;
	ordernum = get_random_u32();
	lastnum = ordernum & 0xFFFFF;
retry:
	ordernum = (ordernum + 1) & 0xFFFFF;
	sprintf(addr->name->sun_path + 1, "%05x", ordernum);

	new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
	unix_table_double_lock(net, old_hash, new_hash);

	if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash)) {
		unix_table_double_unlock(net, old_hash, new_hash);

		/* __unix_find_socket_byname() may take long time if many names
		 * are already in use.
		 */
		cond_resched();

		if (ordernum == lastnum) {
			/* Give up if all names seems to be in use. */
			err = -ENOSPC;
			unix_release_addr(addr);
			goto out;
		}

		goto retry;
	}

	__unix_set_addr_hash(net, sk, addr, new_hash);
	unix_table_double_unlock(net, old_hash, new_hash);
	err = 0;

out:	mutex_unlock(&u->bindlock);
	return err;
}

static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr,
			 int addr_len)
{
	umode_t mode = S_IFSOCK |
	       (SOCK_INODE(sk->sk_socket)->i_mode & ~current_umask());
	struct unix_sock *u = unix_sk(sk);
	unsigned int new_hash, old_hash;
	struct net *net = sock_net(sk);
	struct mnt_idmap *idmap;
	struct unix_address *addr;
	struct dentry *dentry;
	struct path parent;
	int err;

	addr_len = unix_mkname_bsd(sunaddr, addr_len);
	addr = unix_create_addr(sunaddr, addr_len);
	if (!addr)
		return -ENOMEM;

	/*
	 * Get the parent directory, calculate the hash for last
	 * component.
	 */
	dentry = kern_path_create(AT_FDCWD, addr->name->sun_path, &parent, 0);
	if (IS_ERR(dentry)) {
		err = PTR_ERR(dentry);
		goto out;
	}

	/*
	 * All right, let's create it.
	 */
	idmap = mnt_idmap(parent.mnt);
	err = security_path_mknod(&parent, dentry, mode, 0);
	if (!err)
		err = vfs_mknod(idmap, d_inode(parent.dentry), dentry, mode, 0);
	if (err)
		goto out_path;
	err = mutex_lock_interruptible(&u->bindlock);
	if (err)
		goto out_unlink;
	if (u->addr)
		goto out_unlock;

	old_hash = sk->sk_hash;
	new_hash = unix_bsd_hash(d_backing_inode(dentry));
	unix_table_double_lock(net, old_hash, new_hash);
	u->path.mnt = mntget(parent.mnt);
	u->path.dentry = dget(dentry);
	__unix_set_addr_hash(net, sk, addr, new_hash);
	unix_table_double_unlock(net, old_hash, new_hash);
	unix_insert_bsd_socket(sk);
	mutex_unlock(&u->bindlock);
	done_path_create(&parent, dentry);
	return 0;

out_unlock:
	mutex_unlock(&u->bindlock);
	err = -EINVAL;
out_unlink:
	/* failed after successful mknod?  unlink what we'd created... */
	vfs_unlink(idmap, d_inode(parent.dentry), dentry, NULL);
out_path:
	done_path_create(&parent, dentry);
out:
	unix_release_addr(addr);
	return err == -EEXIST ? -EADDRINUSE : err;
}
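/* Editor's note: an illustrative userspace counterpart (not part of this file)
 * of the filesystem bind path above. bind(2) creates the socket inode via
 * vfs_mknod(), so a second bind to the same path fails with EADDRINUSE
 * (mapped from -EEXIST above) until the file is unlinked; the path used is
 * an example only.
 *
 *	#include <stdio.h>
 *	#include <sys/socket.h>
 *	#include <sys/un.h>
 *	#include <unistd.h>
 *
 *	int bind_fs_socket(const char *path)
 *	{
 *		struct sockaddr_un sun = { .sun_family = AF_UNIX };
 *		int fd = socket(AF_UNIX, SOCK_STREAM, 0);
 *
 *		snprintf(sun.sun_path, sizeof(sun.sun_path), "%s", path);
 *		unlink(path);	// otherwise a stale inode yields EADDRINUSE
 *		return bind(fd, (struct sockaddr *)&sun, sizeof(sun));
 *	}
 */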
static int unix_bind_abstract(struct sock *sk, struct sockaddr_un *sunaddr,
			      int addr_len)
{
	struct unix_sock *u = unix_sk(sk);
	unsigned int new_hash, old_hash;
	struct net *net = sock_net(sk);
	struct unix_address *addr;
	int err;

	addr = unix_create_addr(sunaddr, addr_len);
	if (!addr)
		return -ENOMEM;

	err = mutex_lock_interruptible(&u->bindlock);
	if (err)
		goto out;

	if (u->addr) {
		err = -EINVAL;
		goto out_mutex;
	}

	old_hash = sk->sk_hash;
	new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
	unix_table_double_lock(net, old_hash, new_hash);

	if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash))
		goto out_spin;

	__unix_set_addr_hash(net, sk, addr, new_hash);
	unix_table_double_unlock(net, old_hash, new_hash);
	mutex_unlock(&u->bindlock);
	return 0;

out_spin:
	unix_table_double_unlock(net, old_hash, new_hash);
	err = -EADDRINUSE;
out_mutex:
	mutex_unlock(&u->bindlock);
out:
	unix_release_addr(addr);
	return err;
}

static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
{
	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
	struct sock *sk = sock->sk;
	int err;

	if (addr_len == offsetof(struct sockaddr_un, sun_path) &&
	    sunaddr->sun_family == AF_UNIX)
		return unix_autobind(sk);

	err = unix_validate_addr(sunaddr, addr_len);
	if (err)
		return err;

	if (sunaddr->sun_path[0])
		err = unix_bind_bsd(sk, sunaddr, addr_len);
	else
		err = unix_bind_abstract(sk, sunaddr, addr_len);

	return err;
}

static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)
{
	if (unlikely(sk1 == sk2) || !sk2) {
		unix_state_lock(sk1);
		return;
	}
	if (sk1 > sk2)
		swap(sk1, sk2);

	unix_state_lock(sk1);
	unix_state_lock_nested(sk2, U_LOCK_SECOND);
}

static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2)
{
	if (unlikely(sk1 == sk2) || !sk2) {
		unix_state_unlock(sk1);
		return;
	}
	unix_state_unlock(sk1);
	unix_state_unlock(sk2);
}

static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
			      int alen, int flags)
{
	struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr;
	struct sock *sk = sock->sk;
	struct sock *other;
	int err;

	err = -EINVAL;
	if (alen < offsetofend(struct sockaddr, sa_family))
		goto out;

	if (addr->sa_family != AF_UNSPEC) {
		err = unix_validate_addr(sunaddr, alen);
		if (err)
			goto out;

		err = BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk, addr, &alen);
		if (err)
			goto out;

		if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
		     test_bit(SOCK_PASSPIDFD, &sock->flags)) &&
		    !READ_ONCE(unix_sk(sk)->addr)) {
			err = unix_autobind(sk);
			if (err)
				goto out;
		}

restart:
		other = unix_find_other(sock_net(sk), sunaddr, alen, sock->type);
		if (IS_ERR(other)) {
			err = PTR_ERR(other);
			goto out;
		}

		unix_state_double_lock(sk, other);

		/* Apparently VFS overslept socket death. Retry. */
		if (sock_flag(other, SOCK_DEAD)) {
			unix_state_double_unlock(sk, other);
			sock_put(other);
			goto restart;
		}

		err = -EPERM;
		if (!unix_may_send(sk, other))
			goto out_unlock;

		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
		if (err)
			goto out_unlock;

		WRITE_ONCE(sk->sk_state, TCP_ESTABLISHED);
		WRITE_ONCE(other->sk_state, TCP_ESTABLISHED);
	} else {
		/*
		 *	1003.1g breaking connected state with AF_UNSPEC
		 */
		other = NULL;
		unix_state_double_lock(sk, other);
	}

	/*
	 * If it was connected, reconnect.
	 */
	if (unix_peer(sk)) {
		struct sock *old_peer = unix_peer(sk);

		unix_peer(sk) = other;
		if (!other)
			WRITE_ONCE(sk->sk_state, TCP_CLOSE);
		unix_dgram_peer_wake_disconnect_wakeup(sk, old_peer);

		unix_state_double_unlock(sk, other);

		if (other != old_peer) {
			unix_dgram_disconnected(sk, old_peer);

			unix_state_lock(old_peer);
			if (!unix_peer(old_peer))
				WRITE_ONCE(old_peer->sk_state, TCP_CLOSE);
			unix_state_unlock(old_peer);
		}

		sock_put(old_peer);
	} else {
		unix_peer(sk) = other;
		unix_state_double_unlock(sk, other);
	}

	return 0;

out_unlock:
	unix_state_double_unlock(sk, other);
	sock_put(other);
out:
	return err;
}

static long unix_wait_for_peer(struct sock *other, long timeo)
	__releases(&unix_sk(other)->lock)
{
	struct unix_sock *u = unix_sk(other);
	int sched;
	DEFINE_WAIT(wait);

	prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);

	sched = !sock_flag(other, SOCK_DEAD) &&
		!(other->sk_shutdown & RCV_SHUTDOWN) &&
		unix_recvq_full_lockless(other);

	unix_state_unlock(other);

	if (sched)
		timeo = schedule_timeout(timeo);

	finish_wait(&u->peer_wait, &wait);
	return timeo;
}

static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
			       int addr_len, int flags)
{
	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
	struct sock *sk = sock->sk, *newsk = NULL, *other = NULL;
	struct unix_sock *u = unix_sk(sk), *newu, *otheru;
	struct net *net = sock_net(sk);
	struct sk_buff *skb = NULL;
	long timeo;
	int err;

	err = unix_validate_addr(sunaddr, addr_len);
	if (err)
		goto out;

	err = BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk, uaddr, &addr_len);
	if (err)
		goto out;

	if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
	     test_bit(SOCK_PASSPIDFD, &sock->flags)) &&
	    !READ_ONCE(u->addr)) {
		err = unix_autobind(sk);
		if (err)
			goto out;
	}

	timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);

	/* First of all allocate resources.
	   If we will make it after state is locked,
	   we will have to recheck all again in any case.
	 */

	/* create new sock for complete connection */
	newsk = unix_create1(net, NULL, 0, sock->type);
	if (IS_ERR(newsk)) {
		err = PTR_ERR(newsk);
		newsk = NULL;
		goto out;
	}

	err = -ENOMEM;

	/* Allocate skb for sending to listening sock */
	skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
	if (skb == NULL)
		goto out;

restart:
	/* Find listening sock. */
	other = unix_find_other(net, sunaddr, addr_len, sk->sk_type);
	if (IS_ERR(other)) {
		err = PTR_ERR(other);
		other = NULL;
		goto out;
	}

	/* Latch state of peer */
	unix_state_lock(other);

	/* Apparently VFS overslept socket death. Retry. */
	if (sock_flag(other, SOCK_DEAD)) {
		unix_state_unlock(other);
		sock_put(other);
		goto restart;
	}

	err = -ECONNREFUSED;
	if (other->sk_state != TCP_LISTEN)
		goto out_unlock;
	if (other->sk_shutdown & RCV_SHUTDOWN)
		goto out_unlock;

	if (unix_recvq_full_lockless(other)) {
		err = -EAGAIN;
		if (!timeo)
			goto out_unlock;

		timeo = unix_wait_for_peer(other, timeo);

		err = sock_intr_errno(timeo);
		if (signal_pending(current))
			goto out;
		sock_put(other);
		goto restart;
	}

	/* Latch our state.

	   It is tricky place. We need to grab our state lock and cannot
	   drop lock on peer. It is dangerous because deadlock is
	   possible. Connect to self case and simultaneous
	   attempt to connect are eliminated by checking socket
	   state. other is TCP_LISTEN, if sk is TCP_LISTEN we
	   check this before attempt to grab lock.

	   Well, and we have to recheck the state after socket locked.
	 */
	switch (READ_ONCE(sk->sk_state)) {
	case TCP_CLOSE:
		/* This is ok... continue with connect */
		break;
	case TCP_ESTABLISHED:
		/* Socket is already connected */
		err = -EISCONN;
		goto out_unlock;
	default:
		err = -EINVAL;
		goto out_unlock;
	}

	unix_state_lock_nested(sk, U_LOCK_SECOND);

	if (sk->sk_state != TCP_CLOSE) {
		unix_state_unlock(sk);
		unix_state_unlock(other);
		sock_put(other);
		goto restart;
	}

	err = security_unix_stream_connect(sk, other, newsk);
	if (err) {
		unix_state_unlock(sk);
		goto out_unlock;
	}

	/* The way is open! Fastly set all the necessary fields... */

	sock_hold(sk);
	unix_peer(newsk)	= sk;
	newsk->sk_state		= TCP_ESTABLISHED;
	newsk->sk_type		= sk->sk_type;
	init_peercred(newsk);
	newu = unix_sk(newsk);
	newu->listener = other;
	RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
	otheru = unix_sk(other);

	/* copy address information from listening to new sock
	 *
	 * The contents of *(otheru->addr) and otheru->path
	 * are seen fully set up here, since we have found
	 * otheru in hash under its lock. Insertion into the
	 * hash chain we'd found it in had been done in an
	 * earlier critical area protected by the chain's lock,
	 * the same one where we'd set *(otheru->addr) contents,
	 * as well as otheru->path and otheru->addr itself.
	 *
	 * Using smp_store_release() here to set newu->addr
	 * is enough to make those stores, as well as stores
	 * to newu->path visible to anyone who gets newu->addr
	 * by smp_load_acquire(). IOW, the same warranties
	 * as for unix_sock instances bound in unix_bind() or
	 * in unix_autobind().
	 */
	if (otheru->path.dentry) {
		path_get(&otheru->path);
		newu->path = otheru->path;
	}
	refcount_inc(&otheru->addr->refcnt);
	smp_store_release(&newu->addr, otheru->addr);

	/* Set credentials */
	copy_peercred(sk, other);

	sock->state	= SS_CONNECTED;
	WRITE_ONCE(sk->sk_state, TCP_ESTABLISHED);
	sock_hold(newsk);

	smp_mb__after_atomic();	/* sock_hold() does an atomic_inc() */
	unix_peer(sk)	= newsk;

	unix_state_unlock(sk);

	/* take ten and send info to listening sock */
	spin_lock(&other->sk_receive_queue.lock);
	__skb_queue_tail(&other->sk_receive_queue, skb);
	spin_unlock(&other->sk_receive_queue.lock);
	unix_state_unlock(other);
	other->sk_data_ready(other);
	sock_put(other);
	return 0;

out_unlock:
	if (other)
		unix_state_unlock(other);

out:
	kfree_skb(skb);
	if (newsk)
		unix_release_sock(newsk, 0);
	if (other)
		sock_put(other);
	return err;
}

static int unix_socketpair(struct socket *socka, struct socket *sockb)
{
	struct sock *ska = socka->sk, *skb = sockb->sk;

	/* Join our sockets back to back */
	sock_hold(ska);
	sock_hold(skb);
	unix_peer(ska) = skb;
	unix_peer(skb) = ska;
	init_peercred(ska);
	init_peercred(skb);

	ska->sk_state = TCP_ESTABLISHED;
	skb->sk_state = TCP_ESTABLISHED;
	socka->state  = SS_CONNECTED;
	sockb->state  = SS_CONNECTED;
	return 0;
}

static void unix_sock_inherit_flags(const struct socket *old,
				    struct socket *new)
{
	if (test_bit(SOCK_PASSCRED, &old->flags))
		set_bit(SOCK_PASSCRED, &new->flags);
	if (test_bit(SOCK_PASSPIDFD, &old->flags))
		set_bit(SOCK_PASSPIDFD, &new->flags);
	if (test_bit(SOCK_PASSSEC, &old->flags))
		set_bit(SOCK_PASSSEC, &new->flags);
}

static int unix_accept(struct socket *sock, struct socket *newsock,
		       struct proto_accept_arg *arg)
{
	struct sock *sk = sock->sk;
	struct sk_buff *skb;
	struct sock *tsk;

	arg->err = -EOPNOTSUPP;
	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
		goto out;

	arg->err = -EINVAL;
	if (READ_ONCE(sk->sk_state) != TCP_LISTEN)
		goto out;

	/* If socket state is TCP_LISTEN it cannot change (for now...),
	 * so that no locks are necessary.
	 */

	skb = skb_recv_datagram(sk, (arg->flags & O_NONBLOCK) ? MSG_DONTWAIT : 0,
				&arg->err);
	if (!skb) {
		/* This means receive shutdown. */
		if (arg->err == 0)
			arg->err = -EINVAL;
		goto out;
	}

	tsk = skb->sk;
	skb_free_datagram(sk, skb);
	wake_up_interruptible(&unix_sk(sk)->peer_wait);

	/* attach accepted sock to socket */
	unix_state_lock(tsk);
	unix_update_edges(unix_sk(tsk));
	newsock->state = SS_CONNECTED;
	unix_sock_inherit_flags(sock, newsock);
	sock_graft(tsk, newsock);
	unix_state_unlock(tsk);
	return 0;

out:
	return arg->err;
}


static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int peer)
{
	struct sock *sk = sock->sk;
	struct unix_address *addr;
	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr);
	int err = 0;

	if (peer) {
		sk = unix_peer_get(sk);

		err = -ENOTCONN;
		if (!sk)
			goto out;
		err = 0;
	} else {
		sock_hold(sk);
	}

	addr = smp_load_acquire(&unix_sk(sk)->addr);
	if (!addr) {
		sunaddr->sun_family = AF_UNIX;
		sunaddr->sun_path[0] = 0;
		err = offsetof(struct sockaddr_un, sun_path);
	} else {
		err = addr->len;
		memcpy(sunaddr, addr->name, addr->len);

		if (peer)
			BPF_CGROUP_RUN_SA_PROG(sk, uaddr, &err,
					       CGROUP_UNIX_GETPEERNAME);
		else
			BPF_CGROUP_RUN_SA_PROG(sk, uaddr, &err,
					       CGROUP_UNIX_GETSOCKNAME);
	}
	sock_put(sk);
out:
	return err;
}

/* The "user->unix_inflight" variable is protected by the garbage
 * collection lock, and we just read it locklessly here. If you go
 * over the limit, there might be a tiny race in actually noticing
 * it across threads. Tough.
 */
static inline bool too_many_unix_fds(struct task_struct *p)
{
	struct user_struct *user = current_user();

	if (unlikely(READ_ONCE(user->unix_inflight) > task_rlimit(p, RLIMIT_NOFILE)))
		return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN);
	return false;
}

static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
{
	if (too_many_unix_fds(current))
		return -ETOOMANYREFS;

	UNIXCB(skb).fp = scm->fp;
	scm->fp = NULL;

	if (unix_prepare_fpl(UNIXCB(skb).fp))
		return -ENOMEM;

	return 0;
}

static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
{
	scm->fp = UNIXCB(skb).fp;
	UNIXCB(skb).fp = NULL;

	unix_destroy_fpl(scm->fp);
}

static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb)
{
	scm->fp = scm_fp_dup(UNIXCB(skb).fp);
}

static void unix_destruct_scm(struct sk_buff *skb)
{
	struct scm_cookie scm;

	memset(&scm, 0, sizeof(scm));
	scm.pid = UNIXCB(skb).pid;
	if (UNIXCB(skb).fp)
		unix_detach_fds(&scm, skb);

	/* Alas, it calls VFS */
	/* So fscking what? fput() had been SMP-safe since the last Summer */
	scm_destroy(&scm);
	sock_wfree(skb);
}

static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
{
	int err = 0;

	UNIXCB(skb).pid = get_pid(scm->pid);
	UNIXCB(skb).uid = scm->creds.uid;
	UNIXCB(skb).gid = scm->creds.gid;
	UNIXCB(skb).fp = NULL;
	unix_get_secdata(scm, skb);
	if (scm->fp && send_fds)
		err = unix_attach_fds(scm, skb);

	skb->destructor = unix_destruct_scm;
	return err;
}

static bool unix_passcred_enabled(const struct socket *sock,
				  const struct sock *other)
{
	return test_bit(SOCK_PASSCRED, &sock->flags) ||
	       test_bit(SOCK_PASSPIDFD, &sock->flags) ||
	       !other->sk_socket ||
	       test_bit(SOCK_PASSCRED, &other->sk_socket->flags) ||
	       test_bit(SOCK_PASSPIDFD, &other->sk_socket->flags);
}

/*
 * Some apps rely on write() giving SCM_CREDENTIALS
 * We include credentials if source or destination socket
 * asserted SOCK_PASSCRED.
 */
static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock,
			    const struct sock *other)
{
	if (UNIXCB(skb).pid)
		return;
	if (unix_passcred_enabled(sock, other)) {
		UNIXCB(skb).pid = get_pid(task_tgid(current));
		current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid);
	}
}

static bool unix_skb_scm_eq(struct sk_buff *skb,
			    struct scm_cookie *scm)
{
	return UNIXCB(skb).pid == scm->pid &&
	       uid_eq(UNIXCB(skb).uid, scm->creds.uid) &&
	       gid_eq(UNIXCB(skb).gid, scm->creds.gid) &&
	       unix_secdata_eq(scm, skb);
}

static void scm_stat_add(struct sock *sk, struct sk_buff *skb)
{
	struct scm_fp_list *fp = UNIXCB(skb).fp;
	struct unix_sock *u = unix_sk(sk);

	if (unlikely(fp && fp->count)) {
		atomic_add(fp->count, &u->scm_stat.nr_fds);
		unix_add_edges(fp, u);
	}
}

static void scm_stat_del(struct sock *sk, struct sk_buff *skb)
{
	struct scm_fp_list *fp = UNIXCB(skb).fp;
	struct unix_sock *u = unix_sk(sk);

	if (unlikely(fp && fp->count)) {
		atomic_sub(fp->count, &u->scm_stat.nr_fds);
		unix_del_edges(fp);
	}
}
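/* Editor's note: an illustrative userspace sketch (not part of this file) of
 * the SCM_RIGHTS control message that unix_attach_fds()/unix_detach_fds()
 * service on the kernel side. Error handling is omitted for brevity.
 *
 *	#include <string.h>
 *	#include <sys/socket.h>
 *
 *	int send_fd(int sock, int fd_to_pass)
 *	{
 *		char data = 'x';
 *		struct iovec iov = { .iov_base = &data, .iov_len = 1 };
 *		union { char buf[CMSG_SPACE(sizeof(int))]; struct cmsghdr align; } u;
 *		struct msghdr msg = {
 *			.msg_iov = &iov, .msg_iovlen = 1,
 *			.msg_control = u.buf, .msg_controllen = sizeof(u.buf),
 *		};
 *		struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
 *
 *		cmsg->cmsg_level = SOL_SOCKET;
 *		cmsg->cmsg_type = SCM_RIGHTS;
 *		cmsg->cmsg_len = CMSG_LEN(sizeof(int));
 *		memcpy(CMSG_DATA(cmsg), &fd_to_pass, sizeof(int));
 *		return sendmsg(sock, &msg, 0);
 *	}
 */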
/*
 *	Send AF_UNIX data.
 */

static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
			      size_t len)
{
	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name);
	struct sock *sk = sock->sk, *other = NULL;
	struct unix_sock *u = unix_sk(sk);
	struct scm_cookie scm;
	struct sk_buff *skb;
	int data_len = 0;
	int sk_locked;
	long timeo;
	int err;

	err = scm_send(sock, msg, &scm, false);
	if (err < 0)
		return err;

	wait_for_unix_gc(scm.fp);

	err = -EOPNOTSUPP;
	if (msg->msg_flags&MSG_OOB)
		goto out;

	if (msg->msg_namelen) {
		err = unix_validate_addr(sunaddr, msg->msg_namelen);
		if (err)
			goto out;

		err = BPF_CGROUP_RUN_PROG_UNIX_SENDMSG_LOCK(sk,
							    msg->msg_name,
							    &msg->msg_namelen,
							    NULL);
		if (err)
			goto out;
	} else {
		sunaddr = NULL;
		err = -ENOTCONN;
		other = unix_peer_get(sk);
		if (!other)
			goto out;
	}

	if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
	     test_bit(SOCK_PASSPIDFD, &sock->flags)) &&
	    !READ_ONCE(u->addr)) {
		err = unix_autobind(sk);
		if (err)
			goto out;
	}

	err = -EMSGSIZE;
	if (len > READ_ONCE(sk->sk_sndbuf) - 32)
		goto out;

	if (len > SKB_MAX_ALLOC) {
		data_len = min_t(size_t,
				 len - SKB_MAX_ALLOC,
				 MAX_SKB_FRAGS * PAGE_SIZE);
		data_len = PAGE_ALIGN(data_len);

		BUILD_BUG_ON(SKB_MAX_ALLOC < PAGE_SIZE);
	}

	skb = sock_alloc_send_pskb(sk, len - data_len, data_len,
				   msg->msg_flags & MSG_DONTWAIT, &err,
				   PAGE_ALLOC_COSTLY_ORDER);
	if (skb == NULL)
		goto out;

	err = unix_scm_to_skb(&scm, skb, true);
	if (err < 0)
		goto out_free;

	skb_put(skb, len - data_len);
	skb->data_len = data_len;
	skb->len = len;
	err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, len);
	if (err)
		goto out_free;

	timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);

restart:
	if (!other) {
		err = -ECONNRESET;
		if (sunaddr == NULL)
			goto out_free;

		other = unix_find_other(sock_net(sk), sunaddr, msg->msg_namelen,
					sk->sk_type);
		if (IS_ERR(other)) {
			err = PTR_ERR(other);
			other = NULL;
			goto out_free;
		}
	}

	if (sk_filter(other, skb) < 0) {
		/* Toss the packet but do not return any error to the sender */
		err = len;
		goto out_free;
	}

	sk_locked = 0;
	unix_state_lock(other);
restart_locked:
	err = -EPERM;
	if (!unix_may_send(sk, other))
		goto out_unlock;

	if (unlikely(sock_flag(other, SOCK_DEAD))) {
		/*
		 *	Check with 1003.1g - what should
		 *	datagram error
		 */
		unix_state_unlock(other);
		sock_put(other);

		if (!sk_locked)
			unix_state_lock(sk);

		err = 0;
		if (sk->sk_type == SOCK_SEQPACKET) {
			/* We are here only when racing with unix_release_sock()
			 * is clearing @other. Never change state to TCP_CLOSE
			 * unlike SOCK_DGRAM wants.
2039 */ 2040 unix_state_unlock(sk); 2041 err = -EPIPE; 2042 } else if (unix_peer(sk) == other) { 2043 unix_peer(sk) = NULL; 2044 unix_dgram_peer_wake_disconnect_wakeup(sk, other); 2045 2046 WRITE_ONCE(sk->sk_state, TCP_CLOSE); 2047 unix_state_unlock(sk); 2048 2049 unix_dgram_disconnected(sk, other); 2050 sock_put(other); 2051 err = -ECONNREFUSED; 2052 } else { 2053 unix_state_unlock(sk); 2054 } 2055 2056 other = NULL; 2057 if (err) 2058 goto out_free; 2059 goto restart; 2060 } 2061 2062 err = -EPIPE; 2063 if (other->sk_shutdown & RCV_SHUTDOWN) 2064 goto out_unlock; 2065 2066 if (sk->sk_type != SOCK_SEQPACKET) { 2067 err = security_unix_may_send(sk->sk_socket, other->sk_socket); 2068 if (err) 2069 goto out_unlock; 2070 } 2071 2072 /* other == sk && unix_peer(other) != sk if 2073 * - unix_peer(sk) == NULL, destination address bound to sk 2074 * - unix_peer(sk) == sk by time of get but disconnected before lock 2075 */ 2076 if (other != sk && 2077 unlikely(unix_peer(other) != sk && 2078 unix_recvq_full_lockless(other))) { 2079 if (timeo) { 2080 timeo = unix_wait_for_peer(other, timeo); 2081 2082 err = sock_intr_errno(timeo); 2083 if (signal_pending(current)) 2084 goto out_free; 2085 2086 goto restart; 2087 } 2088 2089 if (!sk_locked) { 2090 unix_state_unlock(other); 2091 unix_state_double_lock(sk, other); 2092 } 2093 2094 if (unix_peer(sk) != other || 2095 unix_dgram_peer_wake_me(sk, other)) { 2096 err = -EAGAIN; 2097 sk_locked = 1; 2098 goto out_unlock; 2099 } 2100 2101 if (!sk_locked) { 2102 sk_locked = 1; 2103 goto restart_locked; 2104 } 2105 } 2106 2107 if (unlikely(sk_locked)) 2108 unix_state_unlock(sk); 2109 2110 if (sock_flag(other, SOCK_RCVTSTAMP)) 2111 __net_timestamp(skb); 2112 maybe_add_creds(skb, sock, other); 2113 scm_stat_add(other, skb); 2114 skb_queue_tail(&other->sk_receive_queue, skb); 2115 unix_state_unlock(other); 2116 other->sk_data_ready(other); 2117 sock_put(other); 2118 scm_destroy(&scm); 2119 return len; 2120 2121 out_unlock: 2122 if (sk_locked) 2123 unix_state_unlock(sk); 2124 unix_state_unlock(other); 2125 out_free: 2126 kfree_skb(skb); 2127 out: 2128 if (other) 2129 sock_put(other); 2130 scm_destroy(&scm); 2131 return err; 2132 } 2133 2134 /* We use paged skbs for stream sockets, and limit occupancy to 32768 2135 * bytes, and a minimum of a full page. 
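 * For example, with 4 KiB pages get_order(32768) is 3, so the
 * UNIX_SKB_FRAGS_SZ below works out to 4096 << 3 == 32768 bytes;
 * with 64 KiB pages get_order(32768) is 0 and the limit is one page.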
2136 */ 2137 #define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768)) 2138 2139 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 2140 static int queue_oob(struct socket *sock, struct msghdr *msg, struct sock *other, 2141 struct scm_cookie *scm, bool fds_sent) 2142 { 2143 struct unix_sock *ousk = unix_sk(other); 2144 struct sk_buff *skb; 2145 int err = 0; 2146 2147 skb = sock_alloc_send_skb(sock->sk, 1, msg->msg_flags & MSG_DONTWAIT, &err); 2148 2149 if (!skb) 2150 return err; 2151 2152 err = unix_scm_to_skb(scm, skb, !fds_sent); 2153 if (err < 0) { 2154 kfree_skb(skb); 2155 return err; 2156 } 2157 skb_put(skb, 1); 2158 err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, 1); 2159 2160 if (err) { 2161 kfree_skb(skb); 2162 return err; 2163 } 2164 2165 unix_state_lock(other); 2166 2167 if (sock_flag(other, SOCK_DEAD) || 2168 (other->sk_shutdown & RCV_SHUTDOWN)) { 2169 unix_state_unlock(other); 2170 kfree_skb(skb); 2171 return -EPIPE; 2172 } 2173 2174 maybe_add_creds(skb, sock, other); 2175 skb_get(skb); 2176 2177 scm_stat_add(other, skb); 2178 2179 spin_lock(&other->sk_receive_queue.lock); 2180 if (ousk->oob_skb) 2181 consume_skb(ousk->oob_skb); 2182 WRITE_ONCE(ousk->oob_skb, skb); 2183 __skb_queue_tail(&other->sk_receive_queue, skb); 2184 spin_unlock(&other->sk_receive_queue.lock); 2185 2186 sk_send_sigurg(other); 2187 unix_state_unlock(other); 2188 other->sk_data_ready(other); 2189 2190 return err; 2191 } 2192 #endif 2193 2194 static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, 2195 size_t len) 2196 { 2197 struct sock *sk = sock->sk; 2198 struct sock *other = NULL; 2199 int err, size; 2200 struct sk_buff *skb; 2201 int sent = 0; 2202 struct scm_cookie scm; 2203 bool fds_sent = false; 2204 int data_len; 2205 2206 err = scm_send(sock, msg, &scm, false); 2207 if (err < 0) 2208 return err; 2209 2210 wait_for_unix_gc(scm.fp); 2211 2212 err = -EOPNOTSUPP; 2213 if (msg->msg_flags & MSG_OOB) { 2214 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 2215 if (len) 2216 len--; 2217 else 2218 #endif 2219 goto out_err; 2220 } 2221 2222 if (msg->msg_namelen) { 2223 err = READ_ONCE(sk->sk_state) == TCP_ESTABLISHED ? 
-EISCONN : -EOPNOTSUPP; 2224 goto out_err; 2225 } else { 2226 err = -ENOTCONN; 2227 other = unix_peer(sk); 2228 if (!other) 2229 goto out_err; 2230 } 2231 2232 if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN) 2233 goto pipe_err; 2234 2235 while (sent < len) { 2236 size = len - sent; 2237 2238 if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) { 2239 skb = sock_alloc_send_pskb(sk, 0, 0, 2240 msg->msg_flags & MSG_DONTWAIT, 2241 &err, 0); 2242 } else { 2243 /* Keep two messages in the pipe so it schedules better */ 2244 size = min_t(int, size, (READ_ONCE(sk->sk_sndbuf) >> 1) - 64); 2245 2246 /* allow fallback to order-0 allocations */ 2247 size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ); 2248 2249 data_len = max_t(int, 0, size - SKB_MAX_HEAD(0)); 2250 2251 data_len = min_t(size_t, size, PAGE_ALIGN(data_len)); 2252 2253 skb = sock_alloc_send_pskb(sk, size - data_len, data_len, 2254 msg->msg_flags & MSG_DONTWAIT, &err, 2255 get_order(UNIX_SKB_FRAGS_SZ)); 2256 } 2257 if (!skb) 2258 goto out_err; 2259 2260 /* Only send the fds in the first buffer */ 2261 err = unix_scm_to_skb(&scm, skb, !fds_sent); 2262 if (err < 0) { 2263 kfree_skb(skb); 2264 goto out_err; 2265 } 2266 fds_sent = true; 2267 2268 if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) { 2269 err = skb_splice_from_iter(skb, &msg->msg_iter, size, 2270 sk->sk_allocation); 2271 if (err < 0) { 2272 kfree_skb(skb); 2273 goto out_err; 2274 } 2275 size = err; 2276 refcount_add(size, &sk->sk_wmem_alloc); 2277 } else { 2278 skb_put(skb, size - data_len); 2279 skb->data_len = data_len; 2280 skb->len = size; 2281 err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size); 2282 if (err) { 2283 kfree_skb(skb); 2284 goto out_err; 2285 } 2286 } 2287 2288 unix_state_lock(other); 2289 2290 if (sock_flag(other, SOCK_DEAD) || 2291 (other->sk_shutdown & RCV_SHUTDOWN)) 2292 goto pipe_err_free; 2293 2294 maybe_add_creds(skb, sock, other); 2295 scm_stat_add(other, skb); 2296 skb_queue_tail(&other->sk_receive_queue, skb); 2297 unix_state_unlock(other); 2298 other->sk_data_ready(other); 2299 sent += size; 2300 } 2301 2302 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 2303 if (msg->msg_flags & MSG_OOB) { 2304 err = queue_oob(sock, msg, other, &scm, fds_sent); 2305 if (err) 2306 goto out_err; 2307 sent++; 2308 } 2309 #endif 2310 2311 scm_destroy(&scm); 2312 2313 return sent; 2314 2315 pipe_err_free: 2316 unix_state_unlock(other); 2317 kfree_skb(skb); 2318 pipe_err: 2319 if (sent == 0 && !(msg->msg_flags&MSG_NOSIGNAL)) 2320 send_sig(SIGPIPE, current, 0); 2321 err = -EPIPE; 2322 out_err: 2323 scm_destroy(&scm); 2324 return sent ? 
: err; 2325 } 2326 2327 static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg, 2328 size_t len) 2329 { 2330 int err; 2331 struct sock *sk = sock->sk; 2332 2333 err = sock_error(sk); 2334 if (err) 2335 return err; 2336 2337 if (READ_ONCE(sk->sk_state) != TCP_ESTABLISHED) 2338 return -ENOTCONN; 2339 2340 if (msg->msg_namelen) 2341 msg->msg_namelen = 0; 2342 2343 return unix_dgram_sendmsg(sock, msg, len); 2344 } 2345 2346 static int unix_seqpacket_recvmsg(struct socket *sock, struct msghdr *msg, 2347 size_t size, int flags) 2348 { 2349 struct sock *sk = sock->sk; 2350 2351 if (READ_ONCE(sk->sk_state) != TCP_ESTABLISHED) 2352 return -ENOTCONN; 2353 2354 return unix_dgram_recvmsg(sock, msg, size, flags); 2355 } 2356 2357 static void unix_copy_addr(struct msghdr *msg, struct sock *sk) 2358 { 2359 struct unix_address *addr = smp_load_acquire(&unix_sk(sk)->addr); 2360 2361 if (addr) { 2362 msg->msg_namelen = addr->len; 2363 memcpy(msg->msg_name, addr->name, addr->len); 2364 } 2365 } 2366 2367 int __unix_dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t size, 2368 int flags) 2369 { 2370 struct scm_cookie scm; 2371 struct socket *sock = sk->sk_socket; 2372 struct unix_sock *u = unix_sk(sk); 2373 struct sk_buff *skb, *last; 2374 long timeo; 2375 int skip; 2376 int err; 2377 2378 err = -EOPNOTSUPP; 2379 if (flags&MSG_OOB) 2380 goto out; 2381 2382 timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); 2383 2384 do { 2385 mutex_lock(&u->iolock); 2386 2387 skip = sk_peek_offset(sk, flags); 2388 skb = __skb_try_recv_datagram(sk, &sk->sk_receive_queue, flags, 2389 &skip, &err, &last); 2390 if (skb) { 2391 if (!(flags & MSG_PEEK)) 2392 scm_stat_del(sk, skb); 2393 break; 2394 } 2395 2396 mutex_unlock(&u->iolock); 2397 2398 if (err != -EAGAIN) 2399 break; 2400 } while (timeo && 2401 !__skb_wait_for_more_packets(sk, &sk->sk_receive_queue, 2402 &err, &timeo, last)); 2403 2404 if (!skb) { /* implies iolock unlocked */ 2405 unix_state_lock(sk); 2406 /* Signal EOF on disconnected non-blocking SEQPACKET socket. */ 2407 if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN && 2408 (sk->sk_shutdown & RCV_SHUTDOWN)) 2409 err = 0; 2410 unix_state_unlock(sk); 2411 goto out; 2412 } 2413 2414 if (wq_has_sleeper(&u->peer_wait)) 2415 wake_up_interruptible_sync_poll(&u->peer_wait, 2416 EPOLLOUT | EPOLLWRNORM | 2417 EPOLLWRBAND); 2418 2419 if (msg->msg_name) { 2420 unix_copy_addr(msg, skb->sk); 2421 2422 BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk, 2423 msg->msg_name, 2424 &msg->msg_namelen); 2425 } 2426 2427 if (size > skb->len - skip) 2428 size = skb->len - skip; 2429 else if (size < skb->len - skip) 2430 msg->msg_flags |= MSG_TRUNC; 2431 2432 err = skb_copy_datagram_msg(skb, skip, msg, size); 2433 if (err) 2434 goto out_free; 2435 2436 if (sock_flag(sk, SOCK_RCVTSTAMP)) 2437 __sock_recv_timestamp(msg, sk, skb); 2438 2439 memset(&scm, 0, sizeof(scm)); 2440 2441 scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid); 2442 unix_set_secdata(&scm, skb); 2443 2444 if (!(flags & MSG_PEEK)) { 2445 if (UNIXCB(skb).fp) 2446 unix_detach_fds(&scm, skb); 2447 2448 sk_peek_offset_bwd(sk, skb->len); 2449 } else { 2450 /* It is questionable: on PEEK we could: 2451 - do not return fds - good, but too simple 8) 2452 - return fds, and do not return them on read (old strategy, 2453 apparently wrong) 2454 - clone fds (I chose it for now, it is the most universal 2455 solution) 2456 2457 POSIX 1003.1g does not actually define this clearly 2458 at all. 
POSIX 1003.1g doesn't define a lot of things 2459 clearly however! 2460 2461 */ 2462 2463 sk_peek_offset_fwd(sk, size); 2464 2465 if (UNIXCB(skb).fp) 2466 unix_peek_fds(&scm, skb); 2467 } 2468 err = (flags & MSG_TRUNC) ? skb->len - skip : size; 2469 2470 scm_recv_unix(sock, msg, &scm, flags); 2471 2472 out_free: 2473 skb_free_datagram(sk, skb); 2474 mutex_unlock(&u->iolock); 2475 out: 2476 return err; 2477 } 2478 2479 static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, 2480 int flags) 2481 { 2482 struct sock *sk = sock->sk; 2483 2484 #ifdef CONFIG_BPF_SYSCALL 2485 const struct proto *prot = READ_ONCE(sk->sk_prot); 2486 2487 if (prot != &unix_dgram_proto) 2488 return prot->recvmsg(sk, msg, size, flags, NULL); 2489 #endif 2490 return __unix_dgram_recvmsg(sk, msg, size, flags); 2491 } 2492 2493 static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor) 2494 { 2495 struct unix_sock *u = unix_sk(sk); 2496 struct sk_buff *skb; 2497 int err; 2498 2499 mutex_lock(&u->iolock); 2500 skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err); 2501 mutex_unlock(&u->iolock); 2502 if (!skb) 2503 return err; 2504 2505 return recv_actor(sk, skb); 2506 } 2507 2508 /* 2509 * Sleep until more data has arrived. But check for races.. 2510 */ 2511 static long unix_stream_data_wait(struct sock *sk, long timeo, 2512 struct sk_buff *last, unsigned int last_len, 2513 bool freezable) 2514 { 2515 unsigned int state = TASK_INTERRUPTIBLE | freezable * TASK_FREEZABLE; 2516 struct sk_buff *tail; 2517 DEFINE_WAIT(wait); 2518 2519 unix_state_lock(sk); 2520 2521 for (;;) { 2522 prepare_to_wait(sk_sleep(sk), &wait, state); 2523 2524 tail = skb_peek_tail(&sk->sk_receive_queue); 2525 if (tail != last || 2526 (tail && tail->len != last_len) || 2527 sk->sk_err || 2528 (sk->sk_shutdown & RCV_SHUTDOWN) || 2529 signal_pending(current) || 2530 !timeo) 2531 break; 2532 2533 sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); 2534 unix_state_unlock(sk); 2535 timeo = schedule_timeout(timeo); 2536 unix_state_lock(sk); 2537 2538 if (sock_flag(sk, SOCK_DEAD)) 2539 break; 2540 2541 sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); 2542 } 2543 2544 finish_wait(sk_sleep(sk), &wait); 2545 unix_state_unlock(sk); 2546 return timeo; 2547 } 2548 2549 static unsigned int unix_skb_len(const struct sk_buff *skb) 2550 { 2551 return skb->len - UNIXCB(skb).consumed; 2552 } 2553 2554 struct unix_stream_read_state { 2555 int (*recv_actor)(struct sk_buff *, int, int, 2556 struct unix_stream_read_state *); 2557 struct socket *socket; 2558 struct msghdr *msg; 2559 struct pipe_inode_info *pipe; 2560 size_t size; 2561 int flags; 2562 unsigned int splice_flags; 2563 }; 2564 2565 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 2566 static int unix_stream_recv_urg(struct unix_stream_read_state *state) 2567 { 2568 struct socket *sock = state->socket; 2569 struct sock *sk = sock->sk; 2570 struct unix_sock *u = unix_sk(sk); 2571 int chunk = 1; 2572 struct sk_buff *oob_skb; 2573 2574 mutex_lock(&u->iolock); 2575 unix_state_lock(sk); 2576 spin_lock(&sk->sk_receive_queue.lock); 2577 2578 if (sock_flag(sk, SOCK_URGINLINE) || !u->oob_skb) { 2579 spin_unlock(&sk->sk_receive_queue.lock); 2580 unix_state_unlock(sk); 2581 mutex_unlock(&u->iolock); 2582 return -EINVAL; 2583 } 2584 2585 oob_skb = u->oob_skb; 2586 2587 if (!(state->flags & MSG_PEEK)) 2588 WRITE_ONCE(u->oob_skb, NULL); 2589 else 2590 skb_get(oob_skb); 2591 2592 spin_unlock(&sk->sk_receive_queue.lock); 2593 unix_state_unlock(sk); 2594 2595 chunk = state->recv_actor(oob_skb, 0, chunk, state); 2596 2597 if 
(!(state->flags & MSG_PEEK)) 2598 UNIXCB(oob_skb).consumed += 1; 2599 2600 consume_skb(oob_skb); 2601 2602 mutex_unlock(&u->iolock); 2603 2604 if (chunk < 0) 2605 return -EFAULT; 2606 2607 state->msg->msg_flags |= MSG_OOB; 2608 return 1; 2609 } 2610 2611 static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk, 2612 int flags, int copied) 2613 { 2614 struct unix_sock *u = unix_sk(sk); 2615 2616 if (!unix_skb_len(skb)) { 2617 struct sk_buff *unlinked_skb = NULL; 2618 2619 spin_lock(&sk->sk_receive_queue.lock); 2620 2621 if (copied && (!u->oob_skb || skb == u->oob_skb)) { 2622 skb = NULL; 2623 } else if (flags & MSG_PEEK) { 2624 skb = skb_peek_next(skb, &sk->sk_receive_queue); 2625 } else { 2626 unlinked_skb = skb; 2627 skb = skb_peek_next(skb, &sk->sk_receive_queue); 2628 __skb_unlink(unlinked_skb, &sk->sk_receive_queue); 2629 } 2630 2631 spin_unlock(&sk->sk_receive_queue.lock); 2632 2633 consume_skb(unlinked_skb); 2634 } else { 2635 struct sk_buff *unlinked_skb = NULL; 2636 2637 spin_lock(&sk->sk_receive_queue.lock); 2638 2639 if (skb == u->oob_skb) { 2640 if (copied) { 2641 skb = NULL; 2642 } else if (!(flags & MSG_PEEK)) { 2643 if (sock_flag(sk, SOCK_URGINLINE)) { 2644 WRITE_ONCE(u->oob_skb, NULL); 2645 consume_skb(skb); 2646 } else { 2647 __skb_unlink(skb, &sk->sk_receive_queue); 2648 WRITE_ONCE(u->oob_skb, NULL); 2649 unlinked_skb = skb; 2650 skb = skb_peek(&sk->sk_receive_queue); 2651 } 2652 } else if (!sock_flag(sk, SOCK_URGINLINE)) { 2653 skb = skb_peek_next(skb, &sk->sk_receive_queue); 2654 } 2655 } 2656 2657 spin_unlock(&sk->sk_receive_queue.lock); 2658 2659 if (unlinked_skb) { 2660 WARN_ON_ONCE(skb_unref(unlinked_skb)); 2661 kfree_skb(unlinked_skb); 2662 } 2663 } 2664 return skb; 2665 } 2666 #endif 2667 2668 static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor) 2669 { 2670 if (unlikely(READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)) 2671 return -ENOTCONN; 2672 2673 return unix_read_skb(sk, recv_actor); 2674 } 2675 2676 static int unix_stream_read_generic(struct unix_stream_read_state *state, 2677 bool freezable) 2678 { 2679 struct scm_cookie scm; 2680 struct socket *sock = state->socket; 2681 struct sock *sk = sock->sk; 2682 struct unix_sock *u = unix_sk(sk); 2683 int copied = 0; 2684 int flags = state->flags; 2685 int noblock = flags & MSG_DONTWAIT; 2686 bool check_creds = false; 2687 int target; 2688 int err = 0; 2689 long timeo; 2690 int skip; 2691 size_t size = state->size; 2692 unsigned int last_len; 2693 2694 if (unlikely(READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)) { 2695 err = -EINVAL; 2696 goto out; 2697 } 2698 2699 if (unlikely(flags & MSG_OOB)) { 2700 err = -EOPNOTSUPP; 2701 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 2702 err = unix_stream_recv_urg(state); 2703 #endif 2704 goto out; 2705 } 2706 2707 target = sock_rcvlowat(sk, flags & MSG_WAITALL, size); 2708 timeo = sock_rcvtimeo(sk, noblock); 2709 2710 memset(&scm, 0, sizeof(scm)); 2711 2712 /* Lock the socket to prevent queue disordering 2713 * while sleeps in memcpy_tomsg 2714 */ 2715 mutex_lock(&u->iolock); 2716 2717 skip = max(sk_peek_offset(sk, flags), 0); 2718 2719 do { 2720 int chunk; 2721 bool drop_skb; 2722 struct sk_buff *skb, *last; 2723 2724 redo: 2725 unix_state_lock(sk); 2726 if (sock_flag(sk, SOCK_DEAD)) { 2727 err = -ECONNRESET; 2728 goto unlock; 2729 } 2730 last = skb = skb_peek(&sk->sk_receive_queue); 2731 last_len = last ? 
last->len : 0; 2732 2733 again: 2734 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 2735 if (skb) { 2736 skb = manage_oob(skb, sk, flags, copied); 2737 if (!skb && copied) { 2738 unix_state_unlock(sk); 2739 break; 2740 } 2741 } 2742 #endif 2743 if (skb == NULL) { 2744 if (copied >= target) 2745 goto unlock; 2746 2747 /* 2748 * POSIX 1003.1g mandates this order. 2749 */ 2750 2751 err = sock_error(sk); 2752 if (err) 2753 goto unlock; 2754 if (sk->sk_shutdown & RCV_SHUTDOWN) 2755 goto unlock; 2756 2757 unix_state_unlock(sk); 2758 if (!timeo) { 2759 err = -EAGAIN; 2760 break; 2761 } 2762 2763 mutex_unlock(&u->iolock); 2764 2765 timeo = unix_stream_data_wait(sk, timeo, last, 2766 last_len, freezable); 2767 2768 if (signal_pending(current)) { 2769 err = sock_intr_errno(timeo); 2770 scm_destroy(&scm); 2771 goto out; 2772 } 2773 2774 mutex_lock(&u->iolock); 2775 goto redo; 2776 unlock: 2777 unix_state_unlock(sk); 2778 break; 2779 } 2780 2781 while (skip >= unix_skb_len(skb)) { 2782 skip -= unix_skb_len(skb); 2783 last = skb; 2784 last_len = skb->len; 2785 skb = skb_peek_next(skb, &sk->sk_receive_queue); 2786 if (!skb) 2787 goto again; 2788 } 2789 2790 unix_state_unlock(sk); 2791 2792 if (check_creds) { 2793 /* Never glue messages from different writers */ 2794 if (!unix_skb_scm_eq(skb, &scm)) 2795 break; 2796 } else if (test_bit(SOCK_PASSCRED, &sock->flags) || 2797 test_bit(SOCK_PASSPIDFD, &sock->flags)) { 2798 /* Copy credentials */ 2799 scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid); 2800 unix_set_secdata(&scm, skb); 2801 check_creds = true; 2802 } 2803 2804 /* Copy address just once */ 2805 if (state->msg && state->msg->msg_name) { 2806 DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, 2807 state->msg->msg_name); 2808 unix_copy_addr(state->msg, skb->sk); 2809 2810 BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk, 2811 state->msg->msg_name, 2812 &state->msg->msg_namelen); 2813 2814 sunaddr = NULL; 2815 } 2816 2817 chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size); 2818 skb_get(skb); 2819 chunk = state->recv_actor(skb, skip, chunk, state); 2820 drop_skb = !unix_skb_len(skb); 2821 /* skb is only safe to use if !drop_skb */ 2822 consume_skb(skb); 2823 if (chunk < 0) { 2824 if (copied == 0) 2825 copied = -EFAULT; 2826 break; 2827 } 2828 copied += chunk; 2829 size -= chunk; 2830 2831 if (drop_skb) { 2832 /* the skb was touched by a concurrent reader; 2833 * we should not expect anything from this skb 2834 * anymore and assume it invalid - we can be 2835 * sure it was dropped from the socket queue 2836 * 2837 * let's report a short read 2838 */ 2839 err = 0; 2840 break; 2841 } 2842 2843 /* Mark read part of skb as used */ 2844 if (!(flags & MSG_PEEK)) { 2845 UNIXCB(skb).consumed += chunk; 2846 2847 sk_peek_offset_bwd(sk, chunk); 2848 2849 if (UNIXCB(skb).fp) { 2850 scm_stat_del(sk, skb); 2851 unix_detach_fds(&scm, skb); 2852 } 2853 2854 if (unix_skb_len(skb)) 2855 break; 2856 2857 skb_unlink(skb, &sk->sk_receive_queue); 2858 consume_skb(skb); 2859 2860 if (scm.fp) 2861 break; 2862 } else { 2863 /* It is questionable, see note in unix_dgram_recvmsg. 
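			 * Here we clone the file references with unix_peek_fds()
			 * and stop after the first skb that carries descriptors,
			 * so one MSG_PEEK call returns at most one SCM_RIGHTS
			 * message.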
2864 */ 2865 if (UNIXCB(skb).fp) 2866 unix_peek_fds(&scm, skb); 2867 2868 sk_peek_offset_fwd(sk, chunk); 2869 2870 if (UNIXCB(skb).fp) 2871 break; 2872 2873 skip = 0; 2874 last = skb; 2875 last_len = skb->len; 2876 unix_state_lock(sk); 2877 skb = skb_peek_next(skb, &sk->sk_receive_queue); 2878 if (skb) 2879 goto again; 2880 unix_state_unlock(sk); 2881 break; 2882 } 2883 } while (size); 2884 2885 mutex_unlock(&u->iolock); 2886 if (state->msg) 2887 scm_recv_unix(sock, state->msg, &scm, flags); 2888 else 2889 scm_destroy(&scm); 2890 out: 2891 return copied ? : err; 2892 } 2893 2894 static int unix_stream_read_actor(struct sk_buff *skb, 2895 int skip, int chunk, 2896 struct unix_stream_read_state *state) 2897 { 2898 int ret; 2899 2900 ret = skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip, 2901 state->msg, chunk); 2902 return ret ?: chunk; 2903 } 2904 2905 int __unix_stream_recvmsg(struct sock *sk, struct msghdr *msg, 2906 size_t size, int flags) 2907 { 2908 struct unix_stream_read_state state = { 2909 .recv_actor = unix_stream_read_actor, 2910 .socket = sk->sk_socket, 2911 .msg = msg, 2912 .size = size, 2913 .flags = flags 2914 }; 2915 2916 return unix_stream_read_generic(&state, true); 2917 } 2918 2919 static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg, 2920 size_t size, int flags) 2921 { 2922 struct unix_stream_read_state state = { 2923 .recv_actor = unix_stream_read_actor, 2924 .socket = sock, 2925 .msg = msg, 2926 .size = size, 2927 .flags = flags 2928 }; 2929 2930 #ifdef CONFIG_BPF_SYSCALL 2931 struct sock *sk = sock->sk; 2932 const struct proto *prot = READ_ONCE(sk->sk_prot); 2933 2934 if (prot != &unix_stream_proto) 2935 return prot->recvmsg(sk, msg, size, flags, NULL); 2936 #endif 2937 return unix_stream_read_generic(&state, true); 2938 } 2939 2940 static int unix_stream_splice_actor(struct sk_buff *skb, 2941 int skip, int chunk, 2942 struct unix_stream_read_state *state) 2943 { 2944 return skb_splice_bits(skb, state->socket->sk, 2945 UNIXCB(skb).consumed + skip, 2946 state->pipe, chunk, state->splice_flags); 2947 } 2948 2949 static ssize_t unix_stream_splice_read(struct socket *sock, loff_t *ppos, 2950 struct pipe_inode_info *pipe, 2951 size_t size, unsigned int flags) 2952 { 2953 struct unix_stream_read_state state = { 2954 .recv_actor = unix_stream_splice_actor, 2955 .socket = sock, 2956 .pipe = pipe, 2957 .size = size, 2958 .splice_flags = flags, 2959 }; 2960 2961 if (unlikely(*ppos)) 2962 return -ESPIPE; 2963 2964 if (sock->file->f_flags & O_NONBLOCK || 2965 flags & SPLICE_F_NONBLOCK) 2966 state.flags = MSG_DONTWAIT; 2967 2968 return unix_stream_read_generic(&state, false); 2969 } 2970 2971 static int unix_shutdown(struct socket *sock, int mode) 2972 { 2973 struct sock *sk = sock->sk; 2974 struct sock *other; 2975 2976 if (mode < SHUT_RD || mode > SHUT_RDWR) 2977 return -EINVAL; 2978 /* This maps: 2979 * SHUT_RD (0) -> RCV_SHUTDOWN (1) 2980 * SHUT_WR (1) -> SEND_SHUTDOWN (2) 2981 * SHUT_RDWR (2) -> SHUTDOWN_MASK (3) 2982 */ 2983 ++mode; 2984 2985 unix_state_lock(sk); 2986 WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | mode); 2987 other = unix_peer(sk); 2988 if (other) 2989 sock_hold(other); 2990 unix_state_unlock(sk); 2991 sk->sk_state_change(sk); 2992 2993 if (other && 2994 (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) { 2995 2996 int peer_mode = 0; 2997 const struct proto *prot = READ_ONCE(other->sk_prot); 2998 2999 if (prot->unhash) 3000 prot->unhash(other); 3001 if (mode&RCV_SHUTDOWN) 3002 peer_mode |= SEND_SHUTDOWN; 3003 if 
(mode&SEND_SHUTDOWN) 3004 peer_mode |= RCV_SHUTDOWN; 3005 unix_state_lock(other); 3006 WRITE_ONCE(other->sk_shutdown, other->sk_shutdown | peer_mode); 3007 unix_state_unlock(other); 3008 other->sk_state_change(other); 3009 if (peer_mode == SHUTDOWN_MASK) 3010 sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP); 3011 else if (peer_mode & RCV_SHUTDOWN) 3012 sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN); 3013 } 3014 if (other) 3015 sock_put(other); 3016 3017 return 0; 3018 } 3019 3020 long unix_inq_len(struct sock *sk) 3021 { 3022 struct sk_buff *skb; 3023 long amount = 0; 3024 3025 if (READ_ONCE(sk->sk_state) == TCP_LISTEN) 3026 return -EINVAL; 3027 3028 spin_lock(&sk->sk_receive_queue.lock); 3029 if (sk->sk_type == SOCK_STREAM || 3030 sk->sk_type == SOCK_SEQPACKET) { 3031 skb_queue_walk(&sk->sk_receive_queue, skb) 3032 amount += unix_skb_len(skb); 3033 } else { 3034 skb = skb_peek(&sk->sk_receive_queue); 3035 if (skb) 3036 amount = skb->len; 3037 } 3038 spin_unlock(&sk->sk_receive_queue.lock); 3039 3040 return amount; 3041 } 3042 EXPORT_SYMBOL_GPL(unix_inq_len); 3043 3044 long unix_outq_len(struct sock *sk) 3045 { 3046 return sk_wmem_alloc_get(sk); 3047 } 3048 EXPORT_SYMBOL_GPL(unix_outq_len); 3049 3050 static int unix_open_file(struct sock *sk) 3051 { 3052 struct path path; 3053 struct file *f; 3054 int fd; 3055 3056 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) 3057 return -EPERM; 3058 3059 if (!smp_load_acquire(&unix_sk(sk)->addr)) 3060 return -ENOENT; 3061 3062 path = unix_sk(sk)->path; 3063 if (!path.dentry) 3064 return -ENOENT; 3065 3066 path_get(&path); 3067 3068 fd = get_unused_fd_flags(O_CLOEXEC); 3069 if (fd < 0) 3070 goto out; 3071 3072 f = dentry_open(&path, O_PATH, current_cred()); 3073 if (IS_ERR(f)) { 3074 put_unused_fd(fd); 3075 fd = PTR_ERR(f); 3076 goto out; 3077 } 3078 3079 fd_install(fd, f); 3080 out: 3081 path_put(&path); 3082 3083 return fd; 3084 } 3085 3086 static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) 3087 { 3088 struct sock *sk = sock->sk; 3089 long amount = 0; 3090 int err; 3091 3092 switch (cmd) { 3093 case SIOCOUTQ: 3094 amount = unix_outq_len(sk); 3095 err = put_user(amount, (int __user *)arg); 3096 break; 3097 case SIOCINQ: 3098 amount = unix_inq_len(sk); 3099 if (amount < 0) 3100 err = amount; 3101 else 3102 err = put_user(amount, (int __user *)arg); 3103 break; 3104 case SIOCUNIXFILE: 3105 err = unix_open_file(sk); 3106 break; 3107 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 3108 case SIOCATMARK: 3109 { 3110 struct unix_sock *u = unix_sk(sk); 3111 struct sk_buff *skb; 3112 int answ = 0; 3113 3114 mutex_lock(&u->iolock); 3115 3116 skb = skb_peek(&sk->sk_receive_queue); 3117 if (skb) { 3118 struct sk_buff *oob_skb = READ_ONCE(u->oob_skb); 3119 3120 if (skb == oob_skb || 3121 (!oob_skb && !unix_skb_len(skb))) 3122 answ = 1; 3123 } 3124 3125 mutex_unlock(&u->iolock); 3126 3127 err = put_user(answ, (int __user *)arg); 3128 } 3129 break; 3130 #endif 3131 default: 3132 err = -ENOIOCTLCMD; 3133 break; 3134 } 3135 return err; 3136 } 3137 3138 #ifdef CONFIG_COMPAT 3139 static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) 3140 { 3141 return unix_ioctl(sock, cmd, (unsigned long)compat_ptr(arg)); 3142 } 3143 #endif 3144 3145 static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wait) 3146 { 3147 struct sock *sk = sock->sk; 3148 unsigned char state; 3149 __poll_t mask; 3150 u8 shutdown; 3151 3152 sock_poll_wait(file, sock, wait); 3153 mask = 0; 3154 shutdown = 
READ_ONCE(sk->sk_shutdown);
	state = READ_ONCE(sk->sk_state);

	/* exceptional events? */
	if (READ_ONCE(sk->sk_err))
		mask |= EPOLLERR;
	if (shutdown == SHUTDOWN_MASK)
		mask |= EPOLLHUP;
	if (shutdown & RCV_SHUTDOWN)
		mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;

	/* readable? */
	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
		mask |= EPOLLIN | EPOLLRDNORM;
	if (sk_is_readable(sk))
		mask |= EPOLLIN | EPOLLRDNORM;
#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
	if (READ_ONCE(unix_sk(sk)->oob_skb))
		mask |= EPOLLPRI;
#endif

	/* Connection-based sockets need to check for termination and startup */
	if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) &&
	    state == TCP_CLOSE)
		mask |= EPOLLHUP;

	/*
	 * We set writable also when the other side has shut down the
	 * connection. This prevents stuck sockets.
	 */
	if (unix_writable(sk, state))
		mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;

	return mask;
}

static __poll_t unix_dgram_poll(struct file *file, struct socket *sock,
				poll_table *wait)
{
	struct sock *sk = sock->sk, *other;
	unsigned int writable;
	unsigned char state;
	__poll_t mask;
	u8 shutdown;

	sock_poll_wait(file, sock, wait);
	mask = 0;
	shutdown = READ_ONCE(sk->sk_shutdown);
	state = READ_ONCE(sk->sk_state);

	/* exceptional events? */
	if (READ_ONCE(sk->sk_err) ||
	    !skb_queue_empty_lockless(&sk->sk_error_queue))
		mask |= EPOLLERR |
			(sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0);

	if (shutdown & RCV_SHUTDOWN)
		mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
	if (shutdown == SHUTDOWN_MASK)
		mask |= EPOLLHUP;

	/* readable? */
	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
		mask |= EPOLLIN | EPOLLRDNORM;
	if (sk_is_readable(sk))
		mask |= EPOLLIN | EPOLLRDNORM;

	/* Connection-based sockets need to check for termination and startup */
	if (sk->sk_type == SOCK_SEQPACKET && state == TCP_CLOSE)
		mask |= EPOLLHUP;

	/* No write status requested, avoid expensive OUT tests.
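	 *
	 * For a connected datagram socket the writability check below may
	 * need to take this socket's state lock and look at the peer's
	 * receive queue, so skip it entirely when none of the OUT events
	 * were requested.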
*/ 3226 if (!(poll_requested_events(wait) & (EPOLLWRBAND|EPOLLWRNORM|EPOLLOUT))) 3227 return mask; 3228 3229 writable = unix_writable(sk, state); 3230 if (writable) { 3231 unix_state_lock(sk); 3232 3233 other = unix_peer(sk); 3234 if (other && unix_peer(other) != sk && 3235 unix_recvq_full_lockless(other) && 3236 unix_dgram_peer_wake_me(sk, other)) 3237 writable = 0; 3238 3239 unix_state_unlock(sk); 3240 } 3241 3242 if (writable) 3243 mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND; 3244 else 3245 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); 3246 3247 return mask; 3248 } 3249 3250 #ifdef CONFIG_PROC_FS 3251 3252 #define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1) 3253 3254 #define get_bucket(x) ((x) >> BUCKET_SPACE) 3255 #define get_offset(x) ((x) & ((1UL << BUCKET_SPACE) - 1)) 3256 #define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o)) 3257 3258 static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos) 3259 { 3260 unsigned long offset = get_offset(*pos); 3261 unsigned long bucket = get_bucket(*pos); 3262 unsigned long count = 0; 3263 struct sock *sk; 3264 3265 for (sk = sk_head(&seq_file_net(seq)->unx.table.buckets[bucket]); 3266 sk; sk = sk_next(sk)) { 3267 if (++count == offset) 3268 break; 3269 } 3270 3271 return sk; 3272 } 3273 3274 static struct sock *unix_get_first(struct seq_file *seq, loff_t *pos) 3275 { 3276 unsigned long bucket = get_bucket(*pos); 3277 struct net *net = seq_file_net(seq); 3278 struct sock *sk; 3279 3280 while (bucket < UNIX_HASH_SIZE) { 3281 spin_lock(&net->unx.table.locks[bucket]); 3282 3283 sk = unix_from_bucket(seq, pos); 3284 if (sk) 3285 return sk; 3286 3287 spin_unlock(&net->unx.table.locks[bucket]); 3288 3289 *pos = set_bucket_offset(++bucket, 1); 3290 } 3291 3292 return NULL; 3293 } 3294 3295 static struct sock *unix_get_next(struct seq_file *seq, struct sock *sk, 3296 loff_t *pos) 3297 { 3298 unsigned long bucket = get_bucket(*pos); 3299 3300 sk = sk_next(sk); 3301 if (sk) 3302 return sk; 3303 3304 3305 spin_unlock(&seq_file_net(seq)->unx.table.locks[bucket]); 3306 3307 *pos = set_bucket_offset(++bucket, 1); 3308 3309 return unix_get_first(seq, pos); 3310 } 3311 3312 static void *unix_seq_start(struct seq_file *seq, loff_t *pos) 3313 { 3314 if (!*pos) 3315 return SEQ_START_TOKEN; 3316 3317 return unix_get_first(seq, pos); 3318 } 3319 3320 static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos) 3321 { 3322 ++*pos; 3323 3324 if (v == SEQ_START_TOKEN) 3325 return unix_get_first(seq, pos); 3326 3327 return unix_get_next(seq, v, pos); 3328 } 3329 3330 static void unix_seq_stop(struct seq_file *seq, void *v) 3331 { 3332 struct sock *sk = v; 3333 3334 if (sk) 3335 spin_unlock(&seq_file_net(seq)->unx.table.locks[sk->sk_hash]); 3336 } 3337 3338 static int unix_seq_show(struct seq_file *seq, void *v) 3339 { 3340 3341 if (v == SEQ_START_TOKEN) 3342 seq_puts(seq, "Num RefCount Protocol Flags Type St " 3343 "Inode Path\n"); 3344 else { 3345 struct sock *s = v; 3346 struct unix_sock *u = unix_sk(s); 3347 unix_state_lock(s); 3348 3349 seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu", 3350 s, 3351 refcount_read(&s->sk_refcnt), 3352 0, 3353 s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0, 3354 s->sk_type, 3355 s->sk_socket ? 3356 (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) : 3357 (s->sk_state == TCP_ESTABLISHED ? 
SS_CONNECTING : SS_DISCONNECTING), 3358 sock_i_ino(s)); 3359 3360 if (u->addr) { // under a hash table lock here 3361 int i, len; 3362 seq_putc(seq, ' '); 3363 3364 i = 0; 3365 len = u->addr->len - 3366 offsetof(struct sockaddr_un, sun_path); 3367 if (u->addr->name->sun_path[0]) { 3368 len--; 3369 } else { 3370 seq_putc(seq, '@'); 3371 i++; 3372 } 3373 for ( ; i < len; i++) 3374 seq_putc(seq, u->addr->name->sun_path[i] ?: 3375 '@'); 3376 } 3377 unix_state_unlock(s); 3378 seq_putc(seq, '\n'); 3379 } 3380 3381 return 0; 3382 } 3383 3384 static const struct seq_operations unix_seq_ops = { 3385 .start = unix_seq_start, 3386 .next = unix_seq_next, 3387 .stop = unix_seq_stop, 3388 .show = unix_seq_show, 3389 }; 3390 3391 #ifdef CONFIG_BPF_SYSCALL 3392 struct bpf_unix_iter_state { 3393 struct seq_net_private p; 3394 unsigned int cur_sk; 3395 unsigned int end_sk; 3396 unsigned int max_sk; 3397 struct sock **batch; 3398 bool st_bucket_done; 3399 }; 3400 3401 struct bpf_iter__unix { 3402 __bpf_md_ptr(struct bpf_iter_meta *, meta); 3403 __bpf_md_ptr(struct unix_sock *, unix_sk); 3404 uid_t uid __aligned(8); 3405 }; 3406 3407 static int unix_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta, 3408 struct unix_sock *unix_sk, uid_t uid) 3409 { 3410 struct bpf_iter__unix ctx; 3411 3412 meta->seq_num--; /* skip SEQ_START_TOKEN */ 3413 ctx.meta = meta; 3414 ctx.unix_sk = unix_sk; 3415 ctx.uid = uid; 3416 return bpf_iter_run_prog(prog, &ctx); 3417 } 3418 3419 static int bpf_iter_unix_hold_batch(struct seq_file *seq, struct sock *start_sk) 3420 3421 { 3422 struct bpf_unix_iter_state *iter = seq->private; 3423 unsigned int expected = 1; 3424 struct sock *sk; 3425 3426 sock_hold(start_sk); 3427 iter->batch[iter->end_sk++] = start_sk; 3428 3429 for (sk = sk_next(start_sk); sk; sk = sk_next(sk)) { 3430 if (iter->end_sk < iter->max_sk) { 3431 sock_hold(sk); 3432 iter->batch[iter->end_sk++] = sk; 3433 } 3434 3435 expected++; 3436 } 3437 3438 spin_unlock(&seq_file_net(seq)->unx.table.locks[start_sk->sk_hash]); 3439 3440 return expected; 3441 } 3442 3443 static void bpf_iter_unix_put_batch(struct bpf_unix_iter_state *iter) 3444 { 3445 while (iter->cur_sk < iter->end_sk) 3446 sock_put(iter->batch[iter->cur_sk++]); 3447 } 3448 3449 static int bpf_iter_unix_realloc_batch(struct bpf_unix_iter_state *iter, 3450 unsigned int new_batch_sz) 3451 { 3452 struct sock **new_batch; 3453 3454 new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz, 3455 GFP_USER | __GFP_NOWARN); 3456 if (!new_batch) 3457 return -ENOMEM; 3458 3459 bpf_iter_unix_put_batch(iter); 3460 kvfree(iter->batch); 3461 iter->batch = new_batch; 3462 iter->max_sk = new_batch_sz; 3463 3464 return 0; 3465 } 3466 3467 static struct sock *bpf_iter_unix_batch(struct seq_file *seq, 3468 loff_t *pos) 3469 { 3470 struct bpf_unix_iter_state *iter = seq->private; 3471 unsigned int expected; 3472 bool resized = false; 3473 struct sock *sk; 3474 3475 if (iter->st_bucket_done) 3476 *pos = set_bucket_offset(get_bucket(*pos) + 1, 1); 3477 3478 again: 3479 /* Get a new batch */ 3480 iter->cur_sk = 0; 3481 iter->end_sk = 0; 3482 3483 sk = unix_get_first(seq, pos); 3484 if (!sk) 3485 return NULL; /* Done */ 3486 3487 expected = bpf_iter_unix_hold_batch(seq, sk); 3488 3489 if (iter->end_sk == expected) { 3490 iter->st_bucket_done = true; 3491 return sk; 3492 } 3493 3494 if (!resized && !bpf_iter_unix_realloc_batch(iter, expected * 3 / 2)) { 3495 resized = true; 3496 goto again; 3497 } 3498 3499 return sk; 3500 } 3501 3502 static void *bpf_iter_unix_seq_start(struct 
seq_file *seq, loff_t *pos) 3503 { 3504 if (!*pos) 3505 return SEQ_START_TOKEN; 3506 3507 /* bpf iter does not support lseek, so it always 3508 * continue from where it was stop()-ped. 3509 */ 3510 return bpf_iter_unix_batch(seq, pos); 3511 } 3512 3513 static void *bpf_iter_unix_seq_next(struct seq_file *seq, void *v, loff_t *pos) 3514 { 3515 struct bpf_unix_iter_state *iter = seq->private; 3516 struct sock *sk; 3517 3518 /* Whenever seq_next() is called, the iter->cur_sk is 3519 * done with seq_show(), so advance to the next sk in 3520 * the batch. 3521 */ 3522 if (iter->cur_sk < iter->end_sk) 3523 sock_put(iter->batch[iter->cur_sk++]); 3524 3525 ++*pos; 3526 3527 if (iter->cur_sk < iter->end_sk) 3528 sk = iter->batch[iter->cur_sk]; 3529 else 3530 sk = bpf_iter_unix_batch(seq, pos); 3531 3532 return sk; 3533 } 3534 3535 static int bpf_iter_unix_seq_show(struct seq_file *seq, void *v) 3536 { 3537 struct bpf_iter_meta meta; 3538 struct bpf_prog *prog; 3539 struct sock *sk = v; 3540 uid_t uid; 3541 bool slow; 3542 int ret; 3543 3544 if (v == SEQ_START_TOKEN) 3545 return 0; 3546 3547 slow = lock_sock_fast(sk); 3548 3549 if (unlikely(sk_unhashed(sk))) { 3550 ret = SEQ_SKIP; 3551 goto unlock; 3552 } 3553 3554 uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk)); 3555 meta.seq = seq; 3556 prog = bpf_iter_get_info(&meta, false); 3557 ret = unix_prog_seq_show(prog, &meta, v, uid); 3558 unlock: 3559 unlock_sock_fast(sk, slow); 3560 return ret; 3561 } 3562 3563 static void bpf_iter_unix_seq_stop(struct seq_file *seq, void *v) 3564 { 3565 struct bpf_unix_iter_state *iter = seq->private; 3566 struct bpf_iter_meta meta; 3567 struct bpf_prog *prog; 3568 3569 if (!v) { 3570 meta.seq = seq; 3571 prog = bpf_iter_get_info(&meta, true); 3572 if (prog) 3573 (void)unix_prog_seq_show(prog, &meta, v, 0); 3574 } 3575 3576 if (iter->cur_sk < iter->end_sk) 3577 bpf_iter_unix_put_batch(iter); 3578 } 3579 3580 static const struct seq_operations bpf_iter_unix_seq_ops = { 3581 .start = bpf_iter_unix_seq_start, 3582 .next = bpf_iter_unix_seq_next, 3583 .stop = bpf_iter_unix_seq_stop, 3584 .show = bpf_iter_unix_seq_show, 3585 }; 3586 #endif 3587 #endif 3588 3589 static const struct net_proto_family unix_family_ops = { 3590 .family = PF_UNIX, 3591 .create = unix_create, 3592 .owner = THIS_MODULE, 3593 }; 3594 3595 3596 static int __net_init unix_net_init(struct net *net) 3597 { 3598 int i; 3599 3600 net->unx.sysctl_max_dgram_qlen = 10; 3601 if (unix_sysctl_register(net)) 3602 goto out; 3603 3604 #ifdef CONFIG_PROC_FS 3605 if (!proc_create_net("unix", 0, net->proc_net, &unix_seq_ops, 3606 sizeof(struct seq_net_private))) 3607 goto err_sysctl; 3608 #endif 3609 3610 net->unx.table.locks = kvmalloc_array(UNIX_HASH_SIZE, 3611 sizeof(spinlock_t), GFP_KERNEL); 3612 if (!net->unx.table.locks) 3613 goto err_proc; 3614 3615 net->unx.table.buckets = kvmalloc_array(UNIX_HASH_SIZE, 3616 sizeof(struct hlist_head), 3617 GFP_KERNEL); 3618 if (!net->unx.table.buckets) 3619 goto free_locks; 3620 3621 for (i = 0; i < UNIX_HASH_SIZE; i++) { 3622 spin_lock_init(&net->unx.table.locks[i]); 3623 INIT_HLIST_HEAD(&net->unx.table.buckets[i]); 3624 } 3625 3626 return 0; 3627 3628 free_locks: 3629 kvfree(net->unx.table.locks); 3630 err_proc: 3631 #ifdef CONFIG_PROC_FS 3632 remove_proc_entry("unix", net->proc_net); 3633 err_sysctl: 3634 #endif 3635 unix_sysctl_unregister(net); 3636 out: 3637 return -ENOMEM; 3638 } 3639 3640 static void __net_exit unix_net_exit(struct net *net) 3641 { 3642 kvfree(net->unx.table.buckets); 3643 
kvfree(net->unx.table.locks); 3644 unix_sysctl_unregister(net); 3645 remove_proc_entry("unix", net->proc_net); 3646 } 3647 3648 static struct pernet_operations unix_net_ops = { 3649 .init = unix_net_init, 3650 .exit = unix_net_exit, 3651 }; 3652 3653 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) 3654 DEFINE_BPF_ITER_FUNC(unix, struct bpf_iter_meta *meta, 3655 struct unix_sock *unix_sk, uid_t uid) 3656 3657 #define INIT_BATCH_SZ 16 3658 3659 static int bpf_iter_init_unix(void *priv_data, struct bpf_iter_aux_info *aux) 3660 { 3661 struct bpf_unix_iter_state *iter = priv_data; 3662 int err; 3663 3664 err = bpf_iter_init_seq_net(priv_data, aux); 3665 if (err) 3666 return err; 3667 3668 err = bpf_iter_unix_realloc_batch(iter, INIT_BATCH_SZ); 3669 if (err) { 3670 bpf_iter_fini_seq_net(priv_data); 3671 return err; 3672 } 3673 3674 return 0; 3675 } 3676 3677 static void bpf_iter_fini_unix(void *priv_data) 3678 { 3679 struct bpf_unix_iter_state *iter = priv_data; 3680 3681 bpf_iter_fini_seq_net(priv_data); 3682 kvfree(iter->batch); 3683 } 3684 3685 static const struct bpf_iter_seq_info unix_seq_info = { 3686 .seq_ops = &bpf_iter_unix_seq_ops, 3687 .init_seq_private = bpf_iter_init_unix, 3688 .fini_seq_private = bpf_iter_fini_unix, 3689 .seq_priv_size = sizeof(struct bpf_unix_iter_state), 3690 }; 3691 3692 static const struct bpf_func_proto * 3693 bpf_iter_unix_get_func_proto(enum bpf_func_id func_id, 3694 const struct bpf_prog *prog) 3695 { 3696 switch (func_id) { 3697 case BPF_FUNC_setsockopt: 3698 return &bpf_sk_setsockopt_proto; 3699 case BPF_FUNC_getsockopt: 3700 return &bpf_sk_getsockopt_proto; 3701 default: 3702 return NULL; 3703 } 3704 } 3705 3706 static struct bpf_iter_reg unix_reg_info = { 3707 .target = "unix", 3708 .ctx_arg_info_size = 1, 3709 .ctx_arg_info = { 3710 { offsetof(struct bpf_iter__unix, unix_sk), 3711 PTR_TO_BTF_ID_OR_NULL }, 3712 }, 3713 .get_func_proto = bpf_iter_unix_get_func_proto, 3714 .seq_info = &unix_seq_info, 3715 }; 3716 3717 static void __init bpf_iter_register(void) 3718 { 3719 unix_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_UNIX]; 3720 if (bpf_iter_reg_target(&unix_reg_info)) 3721 pr_warn("Warning: could not register bpf iterator unix\n"); 3722 } 3723 #endif 3724 3725 static int __init af_unix_init(void) 3726 { 3727 int i, rc = -1; 3728 3729 BUILD_BUG_ON(sizeof(struct unix_skb_parms) > sizeof_field(struct sk_buff, cb)); 3730 3731 for (i = 0; i < UNIX_HASH_SIZE / 2; i++) { 3732 spin_lock_init(&bsd_socket_locks[i]); 3733 INIT_HLIST_HEAD(&bsd_socket_buckets[i]); 3734 } 3735 3736 rc = proto_register(&unix_dgram_proto, 1); 3737 if (rc != 0) { 3738 pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__); 3739 goto out; 3740 } 3741 3742 rc = proto_register(&unix_stream_proto, 1); 3743 if (rc != 0) { 3744 pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__); 3745 proto_unregister(&unix_dgram_proto); 3746 goto out; 3747 } 3748 3749 sock_register(&unix_family_ops); 3750 register_pernet_subsys(&unix_net_ops); 3751 unix_bpf_build_proto(); 3752 3753 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) 3754 bpf_iter_register(); 3755 #endif 3756 3757 out: 3758 return rc; 3759 } 3760 3761 /* Later than subsys_initcall() because we depend on stuff initialised there */ 3762 fs_initcall(af_unix_init); 3763
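
/*
 * Illustrative userspace sketch (an assumption for clarity, not code from
 * this file): passing a file descriptor with SCM_RIGHTS, the control-message
 * path that unix_attach_fds()/unix_detach_fds() implement on the kernel
 * side. Error handling is omitted and fd_to_send is assumed to be any open
 * descriptor.
 *
 *	int sv[2];
 *	char dummy = 'x';
 *	struct iovec iov = { .iov_base = &dummy, .iov_len = 1 };
 *	union {
 *		char buf[CMSG_SPACE(sizeof(int))];
 *		struct cmsghdr align;
 *	} u;
 *	struct msghdr msg = {
 *		.msg_iov = &iov, .msg_iovlen = 1,
 *		.msg_control = u.buf, .msg_controllen = sizeof(u.buf),
 *	};
 *	struct cmsghdr *cmsg;
 *
 *	socketpair(AF_UNIX, SOCK_STREAM, 0, sv);
 *	cmsg = CMSG_FIRSTHDR(&msg);
 *	cmsg->cmsg_level = SOL_SOCKET;
 *	cmsg->cmsg_type = SCM_RIGHTS;
 *	cmsg->cmsg_len = CMSG_LEN(sizeof(int));
 *	memcpy(CMSG_DATA(cmsg), &fd_to_send, sizeof(int));
 *	sendmsg(sv[0], &msg, 0);
 *
 * A recvmsg() on sv[1] with a control buffer of the same size then yields a
 * freshly installed descriptor in the received SCM_RIGHTS control message.
 */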