// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * NET4:	Implementation of BSD Unix domain sockets.
 *
 * Authors:	Alan Cox, <alan@lxorguk.ukuu.org.uk>
 *
 * Fixes:
 *		Linus Torvalds	:	Assorted bug cures.
 *		Niibe Yutaka	:	async I/O support.
 *		Carsten Paeth	:	PF_UNIX check, address fixes.
 *		Alan Cox	:	Limit size of allocated blocks.
 *		Alan Cox	:	Fixed the stupid socketpair bug.
 *		Alan Cox	:	BSD compatibility fine tuning.
 *		Alan Cox	:	Fixed a bug in connect when interrupted.
 *		Alan Cox	:	Sorted out a proper draft version of
 *					file descriptor passing hacked up from
 *					Mike Shaver's work.
 *		Marty Leisner	:	Fixes to fd passing
 *		Nick Nevin	:	recvmsg bugfix.
 *		Alan Cox	:	Started proper garbage collector
 *		Heiko Eißfeldt	:	Missing verify_area check
 *		Alan Cox	:	Started POSIXisms
 *		Andreas Schwab	:	Replace inode by dentry for proper
 *					reference counting
 *		Kirk Petersen	:	Made this a module
 *	    Christoph Rohland	:	Elegant non-blocking accept/connect algorithm.
 *					Lots of bug fixes.
 *	    Alexey Kuznetsov	:	Repaired (I hope) bugs introduced
 *					by the above two patches.
 *	    Andrea Arcangeli	:	If possible we block in connect(2)
 *					if the max backlog of the listen socket
 *					has been reached. This won't break
 *					old apps and it will avoid huge amounts
 *					of socks hashed (this is for unix_gc()
 *					performance reasons).
 *					Security fix that limits the max
 *					number of socks to 2*max_files and
 *					the number of skb queueable in the
 *					dgram receiver.
 *	    Artur Skawina	:	Hash function optimizations
 *	    Alexey Kuznetsov	:	Full scale SMP. Lot of bugs are introduced 8)
 *	    Malcolm Beattie	:	Set peercred for socketpair
 *	    Michal Ostrowski	:	Module initialization cleanup.
 *	    Arnaldo C. Melo	:	Remove MOD_{INC,DEC}_USE_COUNT,
 *					the core infrastructure is doing that
 *					for all net proto families now (2.5.69+)
 *
 * Known differences from reference BSD that was tested:
 *
 *	[TO FIX]
 *	ECONNREFUSED is not returned from one end of a connected() socket to the
 *		other the moment one end closes.
 *	fstat() doesn't return st_dev=0, and give the blksize as high water mark
 *		and a fake inode identifier (nor the BSD first socket fstat twice bug).
 *	[NOT TO FIX]
 *	accept() returns a path name even if the connecting socket has closed
 *		in the meantime (BSD loses the path and gives up).
 *	accept() returns 0 length path for an unbound connector. BSD returns 16
 *		and a null first byte in the path (but not for gethost/peername - BSD bug ??)
 *	socketpair(...SOCK_RAW..) doesn't panic the kernel.
 *	BSD af_unix apparently has connect forgetting to block properly.
 *		(need to check this with the POSIX spec in detail)
 *
 * Differences from 2.0.0-11-... (ANK)
 *	Bug fixes and improvements.
 *		- client shutdown killed server socket.
 *		- removed all useless cli/sti pairs.
 *
 *	Semantic changes/extensions.
 *		- generic control message passing.
 *		- SCM_CREDENTIALS control message.
 *		- "Abstract" (not FS based) socket bindings.
 *		  Abstract names are sequences of bytes (not zero terminated)
 *		  starting with 0, so that this name space does not intersect
 *		  with BSD names.
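 *		  (Illustrative note, not part of the original changelog:
 *		  an abstract socket bound with sun_path starting "\0foo"
 *		  and addr_len of offsetof(struct sockaddr_un, sun_path) + 4
 *		  has a four-byte abstract name and never touches the
 *		  filesystem.)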
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/signal.h>
#include <linux/sched/signal.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/stat.h>
#include <linux/dcache.h>
#include <linux/namei.h>
#include <linux/socket.h>
#include <linux/un.h>
#include <linux/fcntl.h>
#include <linux/filter.h>
#include <linux/termios.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/in.h>
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <net/net_namespace.h>
#include <net/sock.h>
#include <net/tcp_states.h>
#include <net/af_unix.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <net/scm.h>
#include <linux/init.h>
#include <linux/poll.h>
#include <linux/rtnetlink.h>
#include <linux/mount.h>
#include <net/checksum.h>
#include <linux/security.h>
#include <linux/splice.h>
#include <linux/freezer.h>
#include <linux/file.h>
#include <linux/btf_ids.h>
#include <linux/bpf-cgroup.h>

#include "scm.h"

static atomic_long_t unix_nr_socks;
static struct hlist_head bsd_socket_buckets[UNIX_HASH_SIZE / 2];
static spinlock_t bsd_socket_locks[UNIX_HASH_SIZE / 2];

/* SMP locking strategy:
 *    hash table is protected with spinlock.
 *    each socket state is protected by separate spinlock.
 */

static unsigned int unix_unbound_hash(struct sock *sk)
{
	unsigned long hash = (unsigned long)sk;

	hash ^= hash >> 16;
	hash ^= hash >> 8;
	hash ^= sk->sk_type;

	return hash & UNIX_HASH_MOD;
}

static unsigned int unix_bsd_hash(struct inode *i)
{
	return i->i_ino & UNIX_HASH_MOD;
}

static unsigned int unix_abstract_hash(struct sockaddr_un *sunaddr,
				       int addr_len, int type)
{
	__wsum csum = csum_partial(sunaddr, addr_len, 0);
	unsigned int hash;

	hash = (__force unsigned int)csum_fold(csum);
	hash ^= hash >> 8;
	hash ^= type;

	return UNIX_HASH_MOD + 1 + (hash & UNIX_HASH_MOD);
}

static void unix_table_double_lock(struct net *net,
				   unsigned int hash1, unsigned int hash2)
{
	if (hash1 == hash2) {
		spin_lock(&net->unx.table.locks[hash1]);
		return;
	}

	if (hash1 > hash2)
		swap(hash1, hash2);

	spin_lock(&net->unx.table.locks[hash1]);
	spin_lock_nested(&net->unx.table.locks[hash2], SINGLE_DEPTH_NESTING);
}

static void unix_table_double_unlock(struct net *net,
				     unsigned int hash1, unsigned int hash2)
{
	if (hash1 == hash2) {
		spin_unlock(&net->unx.table.locks[hash1]);
		return;
	}

	spin_unlock(&net->unx.table.locks[hash1]);
	spin_unlock(&net->unx.table.locks[hash2]);
}

#ifdef CONFIG_SECURITY_NETWORK
static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{
	UNIXCB(skb).secid = scm->secid;
}

static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{
	scm->secid = UNIXCB(skb).secid;
}

static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
{
	return (scm->secid == UNIXCB(skb).secid);
}
#else
static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{ }

static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{ }

static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
{
	return true;
}
#endif /* CONFIG_SECURITY_NETWORK */

static inline int unix_our_peer(struct sock *sk, struct sock *osk)
{
	return unix_peer(osk) == sk;
}

static inline int unix_may_send(struct sock *sk, struct sock *osk)
{
	return unix_peer(osk) == NULL || unix_our_peer(sk, osk);
}

static inline int unix_recvq_full(const struct sock *sk)
{
	return skb_queue_len(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
}

static inline int unix_recvq_full_lockless(const struct sock *sk)
{
	return skb_queue_len_lockless(&sk->sk_receive_queue) >
		READ_ONCE(sk->sk_max_ack_backlog);
}

struct sock *unix_peer_get(struct sock *s)
{
	struct sock *peer;

	unix_state_lock(s);
	peer = unix_peer(s);
	if (peer)
		sock_hold(peer);
	unix_state_unlock(s);
	return peer;
}
EXPORT_SYMBOL_GPL(unix_peer_get);

static struct unix_address *unix_create_addr(struct sockaddr_un *sunaddr,
					     int addr_len)
{
	struct unix_address *addr;

	addr = kmalloc(sizeof(*addr) + addr_len, GFP_KERNEL);
	if (!addr)
		return NULL;

	refcount_set(&addr->refcnt, 1);
	addr->len = addr_len;
	memcpy(addr->name, sunaddr, addr_len);

	return addr;
}

static inline void unix_release_addr(struct unix_address *addr)
{
	if (refcount_dec_and_test(&addr->refcnt))
		kfree(addr);
}

/*
 *	Check unix socket name:
 *		- should not be zero length.
 *		- if it does not start with a zero byte, it should be
 *		  NULL terminated (FS object)
 *		- if it starts with a zero byte, it is an abstract name.
 */

static int unix_validate_addr(struct sockaddr_un *sunaddr, int addr_len)
{
	if (addr_len <= offsetof(struct sockaddr_un, sun_path) ||
	    addr_len > sizeof(*sunaddr))
		return -EINVAL;

	if (sunaddr->sun_family != AF_UNIX)
		return -EINVAL;

	return 0;
}

static int unix_mkname_bsd(struct sockaddr_un *sunaddr, int addr_len)
{
	struct sockaddr_storage *addr = (struct sockaddr_storage *)sunaddr;
	short offset = offsetof(struct sockaddr_storage, __data);

	BUILD_BUG_ON(offset != offsetof(struct sockaddr_un, sun_path));

	/* This may look like an off by one error but it is a bit more
	 * subtle. 108 is the longest valid AF_UNIX path for a binding.
	 * sun_path[108] doesn't as such exist. However in kernel space
	 * we are guaranteed that it is a valid memory location in our
	 * kernel address buffer because syscall functions always pass
	 * a pointer of struct sockaddr_storage which has a bigger buffer
	 * than 108. Also, we must terminate sun_path for strlen() in
	 * getname_kernel().
	 */
	addr->__data[addr_len - offset] = 0;

	/* Don't pass sunaddr->sun_path to strlen(). Otherwise, 108 will
	 * cause panic if CONFIG_FORTIFY_SOURCE=y. Let __fortify_strlen()
	 * know the actual buffer.
	 */
	return strlen(addr->__data) + offset + 1;
}

static void __unix_remove_socket(struct sock *sk)
{
	sk_del_node_init(sk);
}

static void __unix_insert_socket(struct net *net, struct sock *sk)
{
	DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
	sk_add_node(sk, &net->unx.table.buckets[sk->sk_hash]);
}

static void __unix_set_addr_hash(struct net *net, struct sock *sk,
				 struct unix_address *addr, unsigned int hash)
{
	__unix_remove_socket(sk);
	smp_store_release(&unix_sk(sk)->addr, addr);

	sk->sk_hash = hash;
	__unix_insert_socket(net, sk);
}

static void unix_remove_socket(struct net *net, struct sock *sk)
{
	spin_lock(&net->unx.table.locks[sk->sk_hash]);
	__unix_remove_socket(sk);
	spin_unlock(&net->unx.table.locks[sk->sk_hash]);
}

static void unix_insert_unbound_socket(struct net *net, struct sock *sk)
{
	spin_lock(&net->unx.table.locks[sk->sk_hash]);
	__unix_insert_socket(net, sk);
	spin_unlock(&net->unx.table.locks[sk->sk_hash]);
}

static void unix_insert_bsd_socket(struct sock *sk)
{
	spin_lock(&bsd_socket_locks[sk->sk_hash]);
	sk_add_bind_node(sk, &bsd_socket_buckets[sk->sk_hash]);
	spin_unlock(&bsd_socket_locks[sk->sk_hash]);
}

static void unix_remove_bsd_socket(struct sock *sk)
{
	if (!hlist_unhashed(&sk->sk_bind_node)) {
		spin_lock(&bsd_socket_locks[sk->sk_hash]);
		__sk_del_bind_node(sk);
		spin_unlock(&bsd_socket_locks[sk->sk_hash]);

		sk_node_init(&sk->sk_bind_node);
	}
}

static struct sock *__unix_find_socket_byname(struct net *net,
					      struct sockaddr_un *sunname,
					      int len, unsigned int hash)
{
	struct sock *s;

	sk_for_each(s, &net->unx.table.buckets[hash]) {
		struct unix_sock *u = unix_sk(s);

		if (u->addr->len == len &&
		    !memcmp(u->addr->name, sunname, len))
			return s;
	}
	return NULL;
}

static inline struct sock *unix_find_socket_byname(struct net *net,
						   struct sockaddr_un *sunname,
						   int len, unsigned int hash)
{
	struct sock *s;

	spin_lock(&net->unx.table.locks[hash]);
	s = __unix_find_socket_byname(net, sunname, len, hash);
	if (s)
		sock_hold(s);
	spin_unlock(&net->unx.table.locks[hash]);
	return s;
}

static struct sock *unix_find_socket_byinode(struct inode *i)
{
	unsigned int hash = unix_bsd_hash(i);
	struct sock *s;

	spin_lock(&bsd_socket_locks[hash]);
	sk_for_each_bound(s, &bsd_socket_buckets[hash]) {
		struct dentry *dentry = unix_sk(s)->path.dentry;

		if (dentry && d_backing_inode(dentry) == i) {
			sock_hold(s);
			spin_unlock(&bsd_socket_locks[hash]);
			return s;
		}
	}
	spin_unlock(&bsd_socket_locks[hash]);
	return NULL;
}

/* Support code for asymmetrically connected dgram sockets
 *
 * If a datagram socket is connected to a socket not itself connected
 * to the first socket (eg, /dev/log), clients may only enqueue more
 * messages if the present receive queue of the server socket is not
 * "too large". This means there's a second writeability condition
 * poll and sendmsg need to test. The dgram recv code will do a wake
 * up on the peer_wait wait queue of a socket upon reception of a
 * datagram which needs to be propagated to sleeping would-be writers
 * since these might not have sent anything so far. This can't be
 * accomplished via poll_wait because the lifetime of the server
 * socket might be less than that of its clients if these break their
 * association with it or if the server socket is closed while clients
 * are still connected to it and there's no way to inform "a polling
 * implementation" that it should let go of a certain wait queue
 *
 * In order to propagate a wake up, a wait_queue_entry_t of the client
 * socket is enqueued on the peer_wait queue of the server socket
 * whose wake function does a wake_up on the ordinary client socket
 * wait queue. This connection is established whenever a write (or
 * poll for write) hits the flow control condition and broken when the
 * association to the server socket is dissolved or after a wake up
 * was relayed.
 */

static int unix_dgram_peer_wake_relay(wait_queue_entry_t *q, unsigned mode, int flags,
				      void *key)
{
	struct unix_sock *u;
	wait_queue_head_t *u_sleep;

	u = container_of(q, struct unix_sock, peer_wake);

	__remove_wait_queue(&unix_sk(u->peer_wake.private)->peer_wait,
			    q);
	u->peer_wake.private = NULL;

	/* relaying can only happen while the wq still exists */
	u_sleep = sk_sleep(&u->sk);
	if (u_sleep)
		wake_up_interruptible_poll(u_sleep, key_to_poll(key));

	return 0;
}

static int unix_dgram_peer_wake_connect(struct sock *sk, struct sock *other)
{
	struct unix_sock *u, *u_other;
	int rc;

	u = unix_sk(sk);
	u_other = unix_sk(other);
	rc = 0;
	spin_lock(&u_other->peer_wait.lock);

	if (!u->peer_wake.private) {
		u->peer_wake.private = other;
		__add_wait_queue(&u_other->peer_wait, &u->peer_wake);

		rc = 1;
	}

	spin_unlock(&u_other->peer_wait.lock);
	return rc;
}

static void unix_dgram_peer_wake_disconnect(struct sock *sk,
					    struct sock *other)
{
	struct unix_sock *u, *u_other;

	u = unix_sk(sk);
	u_other = unix_sk(other);
	spin_lock(&u_other->peer_wait.lock);

	if (u->peer_wake.private == other) {
		__remove_wait_queue(&u_other->peer_wait, &u->peer_wake);
		u->peer_wake.private = NULL;
	}

	spin_unlock(&u_other->peer_wait.lock);
}

static void unix_dgram_peer_wake_disconnect_wakeup(struct sock *sk,
						   struct sock *other)
{
	unix_dgram_peer_wake_disconnect(sk, other);
	wake_up_interruptible_poll(sk_sleep(sk),
				   EPOLLOUT |
				   EPOLLWRNORM |
				   EPOLLWRBAND);
}

/* preconditions:
 *	- unix_peer(sk) == other
 *	- association is stable
 */
static int unix_dgram_peer_wake_me(struct sock *sk, struct sock *other)
{
	int connected;

	connected = unix_dgram_peer_wake_connect(sk, other);

	/* If other is SOCK_DEAD, we want to make sure we signal
	 * POLLOUT, such that a subsequent write() can get a
	 * -ECONNREFUSED. Otherwise, if we haven't queued any skbs
	 * to other and it's full, we will hang waiting for POLLOUT.
	 */
	if (unix_recvq_full_lockless(other) && !sock_flag(other, SOCK_DEAD))
		return 1;

	if (connected)
		unix_dgram_peer_wake_disconnect(sk, other);

	return 0;
}

static int unix_writable(const struct sock *sk)
{
	return sk->sk_state != TCP_LISTEN &&
	       (refcount_read(&sk->sk_wmem_alloc) << 2) <= sk->sk_sndbuf;
}

static void unix_write_space(struct sock *sk)
{
	struct socket_wq *wq;

	rcu_read_lock();
	if (unix_writable(sk)) {
		wq = rcu_dereference(sk->sk_wq);
		if (skwq_has_sleeper(wq))
			wake_up_interruptible_sync_poll(&wq->wait,
				EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND);
		sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
	}
	rcu_read_unlock();
}

/* When dgram socket disconnects (or changes its peer), we clear its receive
 * queue of packets arrived from previous peer. First, it allows us to do
 * flow control based only on wmem_alloc; second, sk connected to peer
 * may receive messages only from that peer. */
static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
{
	if (!skb_queue_empty(&sk->sk_receive_queue)) {
		skb_queue_purge(&sk->sk_receive_queue);
		wake_up_interruptible_all(&unix_sk(sk)->peer_wait);

		/* If one link of bidirectional dgram pipe is disconnected,
		 * we signal error. Messages are lost. Do not do this when
		 * the peer was not connected to us.
		 */
		if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
			WRITE_ONCE(other->sk_err, ECONNRESET);
			sk_error_report(other);
		}
	}
	other->sk_state = TCP_CLOSE;
}

static void unix_sock_destructor(struct sock *sk)
{
	struct unix_sock *u = unix_sk(sk);

	skb_queue_purge(&sk->sk_receive_queue);

	DEBUG_NET_WARN_ON_ONCE(refcount_read(&sk->sk_wmem_alloc));
	DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
	DEBUG_NET_WARN_ON_ONCE(sk->sk_socket);
	if (!sock_flag(sk, SOCK_DEAD)) {
		pr_info("Attempt to release alive unix socket: %p\n", sk);
		return;
	}

	if (u->addr)
		unix_release_addr(u->addr);

	atomic_long_dec(&unix_nr_socks);
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
#ifdef UNIX_REFCNT_DEBUG
	pr_debug("UNIX %p is destroyed, %ld are still alive.\n", sk,
		atomic_long_read(&unix_nr_socks));
#endif
}

static void unix_release_sock(struct sock *sk, int embrion)
{
	struct unix_sock *u = unix_sk(sk);
	struct sock *skpair;
	struct sk_buff *skb;
	struct path path;
	int state;

	unix_remove_socket(sock_net(sk), sk);
	unix_remove_bsd_socket(sk);

	/* Clear state */
	unix_state_lock(sk);
	sock_orphan(sk);
	WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK);
	path = u->path;
	u->path.dentry = NULL;
	u->path.mnt = NULL;
	state = sk->sk_state;
	sk->sk_state = TCP_CLOSE;

	skpair = unix_peer(sk);
	unix_peer(sk) = NULL;

	unix_state_unlock(sk);

#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
	if (u->oob_skb) {
		kfree_skb(u->oob_skb);
		u->oob_skb = NULL;
	}
#endif

	wake_up_interruptible_all(&u->peer_wait);

	if (skpair != NULL) {
		if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
			unix_state_lock(skpair);
			/* No more writes */
			WRITE_ONCE(skpair->sk_shutdown, SHUTDOWN_MASK);
			if (!skb_queue_empty(&sk->sk_receive_queue) || embrion)
				WRITE_ONCE(skpair->sk_err, ECONNRESET);
			unix_state_unlock(skpair);
			skpair->sk_state_change(skpair);
			sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
		}

		unix_dgram_peer_wake_disconnect(sk, skpair);
		sock_put(skpair); /* It may now die */
	}

	/* Try to flush out this socket. Throw out buffers at least */

	while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
		if (state == TCP_LISTEN)
			unix_release_sock(skb->sk, 1);
		/* passed fds are erased in the kfree_skb hook */
		UNIXCB(skb).consumed = skb->len;
		kfree_skb(skb);
	}

	if (path.dentry)
		path_put(&path);

	sock_put(sk);

	/* ---- Socket is dead now and most probably destroyed ---- */

	/*
	 * Fixme: BSD difference: In BSD all sockets connected to us get
	 *	  ECONNRESET and we die on the spot. In Linux we behave
	 *	  like files and pipes do and wait for the last
	 *	  dereference.
	 *
	 * Can't we simply set sock->err?
	 *
	 * What the above comment does talk about? --ANK(980817)
	 */

	if (READ_ONCE(unix_tot_inflight))
		unix_gc();	/* Garbage collect fds */
}

static void init_peercred(struct sock *sk)
{
	const struct cred *old_cred;
	struct pid *old_pid;

	spin_lock(&sk->sk_peer_lock);
	old_pid = sk->sk_peer_pid;
	old_cred = sk->sk_peer_cred;
	sk->sk_peer_pid = get_pid(task_tgid(current));
	sk->sk_peer_cred = get_current_cred();
	spin_unlock(&sk->sk_peer_lock);

	put_pid(old_pid);
	put_cred(old_cred);
}

static void copy_peercred(struct sock *sk, struct sock *peersk)
{
	const struct cred *old_cred;
	struct pid *old_pid;

	if (sk < peersk) {
		spin_lock(&sk->sk_peer_lock);
		spin_lock_nested(&peersk->sk_peer_lock, SINGLE_DEPTH_NESTING);
	} else {
		spin_lock(&peersk->sk_peer_lock);
		spin_lock_nested(&sk->sk_peer_lock, SINGLE_DEPTH_NESTING);
	}
	old_pid = sk->sk_peer_pid;
	old_cred = sk->sk_peer_cred;
	sk->sk_peer_pid = get_pid(peersk->sk_peer_pid);
	sk->sk_peer_cred = get_cred(peersk->sk_peer_cred);

	spin_unlock(&sk->sk_peer_lock);
	spin_unlock(&peersk->sk_peer_lock);

	put_pid(old_pid);
	put_cred(old_cred);
}

static int unix_listen(struct socket *sock, int backlog)
{
	int err;
	struct sock *sk = sock->sk;
	struct unix_sock *u = unix_sk(sk);

	err = -EOPNOTSUPP;
	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
		goto out;	/* Only stream/seqpacket sockets accept */
	err = -EINVAL;
	if (!u->addr)
		goto out;	/* No listens on an unbound socket */
	unix_state_lock(sk);
	if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
		goto out_unlock;
	if (backlog > sk->sk_max_ack_backlog)
		wake_up_interruptible_all(&u->peer_wait);
	sk->sk_max_ack_backlog	= backlog;
	sk->sk_state		= TCP_LISTEN;
	/* set credentials so connect can copy them */
	init_peercred(sk);
	err = 0;

out_unlock:
	unix_state_unlock(sk);
out:
	return err;
}

static int unix_release(struct socket *);
static int unix_bind(struct socket *, struct sockaddr *, int);
static int unix_stream_connect(struct socket *, struct sockaddr *,
			       int addr_len, int flags);
static int unix_socketpair(struct socket *, struct socket *);
static int unix_accept(struct socket *, struct socket *, int, bool);
static int unix_getname(struct socket *, struct sockaddr *, int);
static __poll_t unix_poll(struct file *, struct socket *, poll_table *);
static __poll_t unix_dgram_poll(struct file *, struct socket *,
				poll_table *);
static int unix_ioctl(struct socket *, unsigned int, unsigned long);
#ifdef CONFIG_COMPAT
static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
#endif
static int unix_shutdown(struct socket *, int);
static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t);
static int unix_stream_recvmsg(struct socket *, struct msghdr *, size_t, int);
static ssize_t unix_stream_splice_read(struct socket *, loff_t *ppos,
				       struct pipe_inode_info *, size_t size,
				       unsigned int flags);
static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t);
static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int);
static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
static int unix_dgram_connect(struct socket *, struct sockaddr *,
			      int, int);
static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t);
static int unix_seqpacket_recvmsg(struct socket *, struct msghdr *, size_t,
				  int);

static int unix_set_peek_off(struct sock *sk, int val)
{
	struct unix_sock *u = unix_sk(sk);

	if (mutex_lock_interruptible(&u->iolock))
		return -EINTR;

	WRITE_ONCE(sk->sk_peek_off, val);
	mutex_unlock(&u->iolock);

	return 0;
}

#ifdef CONFIG_PROC_FS
static int unix_count_nr_fds(struct sock *sk)
{
	struct sk_buff *skb;
	struct unix_sock *u;
	int nr_fds = 0;

	spin_lock(&sk->sk_receive_queue.lock);
	skb = skb_peek(&sk->sk_receive_queue);
	while (skb) {
		u = unix_sk(skb->sk);
		nr_fds += atomic_read(&u->scm_stat.nr_fds);
		skb = skb_peek_next(skb, &sk->sk_receive_queue);
	}
	spin_unlock(&sk->sk_receive_queue.lock);

	return nr_fds;
}

static void unix_show_fdinfo(struct seq_file *m, struct socket *sock)
{
	struct sock *sk = sock->sk;
	unsigned char s_state;
	struct unix_sock *u;
	int nr_fds = 0;

	if (sk) {
		s_state = READ_ONCE(sk->sk_state);
		u = unix_sk(sk);

		/* SOCK_STREAM and SOCK_SEQPACKET sockets never change their
		 * sk_state after switching to TCP_ESTABLISHED or TCP_LISTEN.
		 * SOCK_DGRAM is ordinary. So, no lock is needed.
		 */
		if (sock->type == SOCK_DGRAM || s_state == TCP_ESTABLISHED)
			nr_fds = atomic_read(&u->scm_stat.nr_fds);
		else if (s_state == TCP_LISTEN)
			nr_fds = unix_count_nr_fds(sk);

		seq_printf(m, "scm_fds: %u\n", nr_fds);
	}
}
#else
#define unix_show_fdinfo NULL
#endif

static const struct proto_ops unix_stream_ops = {
	.family =	PF_UNIX,
	.owner =	THIS_MODULE,
	.release =	unix_release,
	.bind =		unix_bind,
	.connect =	unix_stream_connect,
	.socketpair =	unix_socketpair,
	.accept =	unix_accept,
	.getname =	unix_getname,
	.poll =		unix_poll,
	.ioctl =	unix_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl =	unix_compat_ioctl,
#endif
	.listen =	unix_listen,
	.shutdown =	unix_shutdown,
	.sendmsg =	unix_stream_sendmsg,
	.recvmsg =	unix_stream_recvmsg,
	.read_skb =	unix_stream_read_skb,
	.mmap =		sock_no_mmap,
	.splice_read =	unix_stream_splice_read,
	.set_peek_off =	unix_set_peek_off,
	.show_fdinfo =	unix_show_fdinfo,
};

static const struct proto_ops unix_dgram_ops = {
	.family =	PF_UNIX,
	.owner =	THIS_MODULE,
	.release =	unix_release,
	.bind =		unix_bind,
	.connect =	unix_dgram_connect,
	.socketpair =	unix_socketpair,
	.accept =	sock_no_accept,
	.getname =	unix_getname,
	.poll =		unix_dgram_poll,
	.ioctl =	unix_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl =	unix_compat_ioctl,
#endif
	.listen =	sock_no_listen,
	.shutdown =	unix_shutdown,
	.sendmsg =	unix_dgram_sendmsg,
	.read_skb =	unix_read_skb,
	.recvmsg =	unix_dgram_recvmsg,
	.mmap =		sock_no_mmap,
	.set_peek_off =	unix_set_peek_off,
	.show_fdinfo =	unix_show_fdinfo,
};

static const struct proto_ops unix_seqpacket_ops = {
	.family =	PF_UNIX,
	.owner =	THIS_MODULE,
	.release =	unix_release,
	.bind =		unix_bind,
	.connect =	unix_stream_connect,
	.socketpair =	unix_socketpair,
	.accept =	unix_accept,
	.getname =	unix_getname,
	.poll =		unix_dgram_poll,
	.ioctl =	unix_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl =	unix_compat_ioctl,
#endif
	.listen =	unix_listen,
	.shutdown =	unix_shutdown,
	.sendmsg =	unix_seqpacket_sendmsg,
	.recvmsg =	unix_seqpacket_recvmsg,
	.mmap =		sock_no_mmap,
	.set_peek_off =	unix_set_peek_off,
	.show_fdinfo =	unix_show_fdinfo,
};

static void unix_close(struct sock *sk, long timeout)
{
	/* Nothing to do here, unix socket does not need a ->close().
	 * This is merely for sockmap.
	 */
}

static void unix_unhash(struct sock *sk)
{
	/* Nothing to do here, unix socket does not need a ->unhash().
	 * This is merely for sockmap.
	 */
}

static bool unix_bpf_bypass_getsockopt(int level, int optname)
{
	if (level == SOL_SOCKET) {
		switch (optname) {
		case SO_PEERPIDFD:
			return true;
		default:
			return false;
		}
	}

	return false;
}

struct proto unix_dgram_proto = {
	.name			= "UNIX",
	.owner			= THIS_MODULE,
	.obj_size		= sizeof(struct unix_sock),
	.close			= unix_close,
	.bpf_bypass_getsockopt	= unix_bpf_bypass_getsockopt,
#ifdef CONFIG_BPF_SYSCALL
	.psock_update_sk_prot	= unix_dgram_bpf_update_proto,
#endif
};

struct proto unix_stream_proto = {
	.name			= "UNIX-STREAM",
	.owner			= THIS_MODULE,
	.obj_size		= sizeof(struct unix_sock),
	.close			= unix_close,
	.unhash			= unix_unhash,
	.bpf_bypass_getsockopt	= unix_bpf_bypass_getsockopt,
#ifdef CONFIG_BPF_SYSCALL
	.psock_update_sk_prot	= unix_stream_bpf_update_proto,
#endif
};

static struct sock *unix_create1(struct net *net, struct socket *sock, int kern, int type)
{
	struct unix_sock *u;
	struct sock *sk;
	int err;

	atomic_long_inc(&unix_nr_socks);
	if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files()) {
		err = -ENFILE;
		goto err;
	}

	if (type == SOCK_STREAM)
		sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_stream_proto, kern);
	else /* dgram and seqpacket */
		sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_dgram_proto, kern);

	if (!sk) {
		err = -ENOMEM;
		goto err;
	}

	sock_init_data(sock, sk);

	sk->sk_hash		= unix_unbound_hash(sk);
	sk->sk_allocation	= GFP_KERNEL_ACCOUNT;
	sk->sk_write_space	= unix_write_space;
	sk->sk_max_ack_backlog	= net->unx.sysctl_max_dgram_qlen;
	sk->sk_destruct		= unix_sock_destructor;
	u = unix_sk(sk);
	u->path.dentry = NULL;
	u->path.mnt = NULL;
	spin_lock_init(&u->lock);
	atomic_long_set(&u->inflight, 0);
	INIT_LIST_HEAD(&u->link);
	mutex_init(&u->iolock); /* single task reading lock */
	mutex_init(&u->bindlock); /* single task binding lock */
	init_waitqueue_head(&u->peer_wait);
	init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay);
	memset(&u->scm_stat, 0, sizeof(struct scm_stat));
	unix_insert_unbound_socket(net, sk);

	sock_prot_inuse_add(net, sk->sk_prot, 1);

	return sk;

err:
	atomic_long_dec(&unix_nr_socks);
	return ERR_PTR(err);
}

static int unix_create(struct net *net, struct socket *sock, int protocol,
		       int kern)
{
	struct sock *sk;

	if (protocol && protocol != PF_UNIX)
		return -EPROTONOSUPPORT;

	sock->state = SS_UNCONNECTED;

	switch (sock->type) {
	case SOCK_STREAM:
		sock->ops = &unix_stream_ops;
		break;
		/*
		 *	Believe it or not BSD has AF_UNIX, SOCK_RAW though
		 *	nothing uses it.
		 */
	case SOCK_RAW:
		sock->type = SOCK_DGRAM;
		fallthrough;
	case SOCK_DGRAM:
		sock->ops = &unix_dgram_ops;
		break;
	case SOCK_SEQPACKET:
		sock->ops = &unix_seqpacket_ops;
		break;
	default:
		return -ESOCKTNOSUPPORT;
	}

	sk = unix_create1(net, sock, kern, sock->type);
	if (IS_ERR(sk))
		return PTR_ERR(sk);

	return 0;
}

static int unix_release(struct socket *sock)
{
	struct sock *sk = sock->sk;

	if (!sk)
		return 0;

	sk->sk_prot->close(sk, 0);
	unix_release_sock(sk, 0);
	sock->sk = NULL;

	return 0;
}

static struct sock *unix_find_bsd(struct sockaddr_un *sunaddr, int addr_len,
				  int type)
{
	struct inode *inode;
	struct path path;
	struct sock *sk;
	int err;

	unix_mkname_bsd(sunaddr, addr_len);
	err = kern_path(sunaddr->sun_path, LOOKUP_FOLLOW, &path);
	if (err)
		goto fail;

	err = path_permission(&path, MAY_WRITE);
	if (err)
		goto path_put;

	err = -ECONNREFUSED;
	inode = d_backing_inode(path.dentry);
	if (!S_ISSOCK(inode->i_mode))
		goto path_put;

	sk = unix_find_socket_byinode(inode);
	if (!sk)
		goto path_put;

	err = -EPROTOTYPE;
	if (sk->sk_type == type)
		touch_atime(&path);
	else
		goto sock_put;

	path_put(&path);

	return sk;

sock_put:
	sock_put(sk);
path_put:
	path_put(&path);
fail:
	return ERR_PTR(err);
}

static struct sock *unix_find_abstract(struct net *net,
				       struct sockaddr_un *sunaddr,
				       int addr_len, int type)
{
	unsigned int hash = unix_abstract_hash(sunaddr, addr_len, type);
	struct dentry *dentry;
	struct sock *sk;

	sk = unix_find_socket_byname(net, sunaddr, addr_len, hash);
	if (!sk)
		return ERR_PTR(-ECONNREFUSED);

	dentry = unix_sk(sk)->path.dentry;
	if (dentry)
		touch_atime(&unix_sk(sk)->path);

	return sk;
}

static struct sock *unix_find_other(struct net *net,
				    struct sockaddr_un *sunaddr,
				    int addr_len, int type)
{
	struct sock *sk;

	if (sunaddr->sun_path[0])
		sk = unix_find_bsd(sunaddr, addr_len, type);
	else
		sk = unix_find_abstract(net, sunaddr, addr_len, type);

	return sk;
}

static int unix_autobind(struct sock *sk)
{
	unsigned int new_hash, old_hash = sk->sk_hash;
	struct unix_sock *u = unix_sk(sk);
	struct net *net = sock_net(sk);
	struct unix_address *addr;
	u32 lastnum, ordernum;
	int err;

	err = mutex_lock_interruptible(&u->bindlock);
	if (err)
		return err;

	if (u->addr)
		goto out;

	err = -ENOMEM;
	addr = kzalloc(sizeof(*addr) +
		       offsetof(struct sockaddr_un, sun_path) + 16, GFP_KERNEL);
	if (!addr)
		goto out;

	addr->len = offsetof(struct sockaddr_un, sun_path) + 6;
	addr->name->sun_family = AF_UNIX;
	refcount_set(&addr->refcnt, 1);

	ordernum = get_random_u32();
	lastnum = ordernum & 0xFFFFF;
retry:
	ordernum = (ordernum + 1) & 0xFFFFF;
	sprintf(addr->name->sun_path + 1, "%05x", ordernum);

	new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
	unix_table_double_lock(net, old_hash, new_hash);

	if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash)) {
		unix_table_double_unlock(net, old_hash, new_hash);

		/* __unix_find_socket_byname() may take a long time if many
		 * names are already in use.
		 */
		cond_resched();

		if (ordernum == lastnum) {
			/* Give up if all names seem to be in use. */
			err = -ENOSPC;
			unix_release_addr(addr);
			goto out;
		}

		goto retry;
	}

	__unix_set_addr_hash(net, sk, addr, new_hash);
	unix_table_double_unlock(net, old_hash, new_hash);
	err = 0;

out:	mutex_unlock(&u->bindlock);
	return err;
}

static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr,
			 int addr_len)
{
	umode_t mode = S_IFSOCK |
	       (SOCK_INODE(sk->sk_socket)->i_mode & ~current_umask());
	unsigned int new_hash, old_hash = sk->sk_hash;
	struct unix_sock *u = unix_sk(sk);
	struct net *net = sock_net(sk);
	struct mnt_idmap *idmap;
	struct unix_address *addr;
	struct dentry *dentry;
	struct path parent;
	int err;

	addr_len = unix_mkname_bsd(sunaddr, addr_len);
	addr = unix_create_addr(sunaddr, addr_len);
	if (!addr)
		return -ENOMEM;

	/*
	 * Get the parent directory, calculate the hash for last
	 * component.
	 */
	dentry = kern_path_create(AT_FDCWD, addr->name->sun_path, &parent, 0);
	if (IS_ERR(dentry)) {
		err = PTR_ERR(dentry);
		goto out;
	}

	/*
	 * All right, let's create it.
	 */
	idmap = mnt_idmap(parent.mnt);
	err = security_path_mknod(&parent, dentry, mode, 0);
	if (!err)
		err = vfs_mknod(idmap, d_inode(parent.dentry), dentry, mode, 0);
	if (err)
		goto out_path;
	err = mutex_lock_interruptible(&u->bindlock);
	if (err)
		goto out_unlink;
	if (u->addr)
		goto out_unlock;

	new_hash = unix_bsd_hash(d_backing_inode(dentry));
	unix_table_double_lock(net, old_hash, new_hash);
	u->path.mnt = mntget(parent.mnt);
	u->path.dentry = dget(dentry);
	__unix_set_addr_hash(net, sk, addr, new_hash);
	unix_table_double_unlock(net, old_hash, new_hash);
	unix_insert_bsd_socket(sk);
	mutex_unlock(&u->bindlock);
	done_path_create(&parent, dentry);
	return 0;

out_unlock:
	mutex_unlock(&u->bindlock);
	err = -EINVAL;
out_unlink:
	/* failed after successful mknod? unlink what we'd created... */
	vfs_unlink(idmap, d_inode(parent.dentry), dentry, NULL);
out_path:
	done_path_create(&parent, dentry);
out:
	unix_release_addr(addr);
	return err == -EEXIST ? -EADDRINUSE : err;
}

static int unix_bind_abstract(struct sock *sk, struct sockaddr_un *sunaddr,
			      int addr_len)
{
	unsigned int new_hash, old_hash = sk->sk_hash;
	struct unix_sock *u = unix_sk(sk);
	struct net *net = sock_net(sk);
	struct unix_address *addr;
	int err;

	addr = unix_create_addr(sunaddr, addr_len);
	if (!addr)
		return -ENOMEM;

	err = mutex_lock_interruptible(&u->bindlock);
	if (err)
		goto out;

	if (u->addr) {
		err = -EINVAL;
		goto out_mutex;
	}

	new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
	unix_table_double_lock(net, old_hash, new_hash);

	if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash))
		goto out_spin;

	__unix_set_addr_hash(net, sk, addr, new_hash);
	unix_table_double_unlock(net, old_hash, new_hash);
	mutex_unlock(&u->bindlock);
	return 0;

out_spin:
	unix_table_double_unlock(net, old_hash, new_hash);
	err = -EADDRINUSE;
out_mutex:
	mutex_unlock(&u->bindlock);
out:
	unix_release_addr(addr);
	return err;
}

static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
{
	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
	struct sock *sk = sock->sk;
	int err;

	if (addr_len == offsetof(struct sockaddr_un, sun_path) &&
	    sunaddr->sun_family == AF_UNIX)
		return unix_autobind(sk);

	err = unix_validate_addr(sunaddr, addr_len);
	if (err)
		return err;

	if (sunaddr->sun_path[0])
		err = unix_bind_bsd(sk, sunaddr, addr_len);
	else
		err = unix_bind_abstract(sk, sunaddr, addr_len);

	return err;
}

static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)
{
	if (unlikely(sk1 == sk2) || !sk2) {
		unix_state_lock(sk1);
		return;
	}
	if (sk1 > sk2)
		swap(sk1, sk2);

	unix_state_lock(sk1);
	unix_state_lock_nested(sk2, U_LOCK_SECOND);
}

static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2)
{
	if (unlikely(sk1 == sk2) || !sk2) {
		unix_state_unlock(sk1);
		return;
	}
	unix_state_unlock(sk1);
	unix_state_unlock(sk2);
}

static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
			      int alen, int flags)
{
	struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr;
	struct sock *sk = sock->sk;
	struct sock *other;
	int err;

	err = -EINVAL;
	if (alen < offsetofend(struct sockaddr, sa_family))
		goto out;

	if (addr->sa_family != AF_UNSPEC) {
		err = unix_validate_addr(sunaddr, alen);
		if (err)
			goto out;

		err = BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk, addr, &alen);
		if (err)
			goto out;

		if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
		     test_bit(SOCK_PASSPIDFD, &sock->flags)) &&
		    !unix_sk(sk)->addr) {
			err = unix_autobind(sk);
			if (err)
				goto out;
		}

restart:
		other = unix_find_other(sock_net(sk), sunaddr, alen, sock->type);
		if (IS_ERR(other)) {
			err = PTR_ERR(other);
			goto out;
		}

		unix_state_double_lock(sk, other);

		/* Apparently VFS overslept socket death. Retry. */
		if (sock_flag(other, SOCK_DEAD)) {
			unix_state_double_unlock(sk, other);
			sock_put(other);
			goto restart;
		}

		err = -EPERM;
		if (!unix_may_send(sk, other))
			goto out_unlock;

		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
		if (err)
			goto out_unlock;

		sk->sk_state = other->sk_state = TCP_ESTABLISHED;
	} else {
		/*
		 *	1003.1g breaking connected state with AF_UNSPEC
		 */
		other = NULL;
		unix_state_double_lock(sk, other);
	}

	/*
	 * If it was connected, reconnect.
	 */
	if (unix_peer(sk)) {
		struct sock *old_peer = unix_peer(sk);

		unix_peer(sk) = other;
		if (!other)
			sk->sk_state = TCP_CLOSE;
		unix_dgram_peer_wake_disconnect_wakeup(sk, old_peer);

		unix_state_double_unlock(sk, other);

		if (other != old_peer)
			unix_dgram_disconnected(sk, old_peer);
		sock_put(old_peer);
	} else {
		unix_peer(sk) = other;
		unix_state_double_unlock(sk, other);
	}

	return 0;

out_unlock:
	unix_state_double_unlock(sk, other);
	sock_put(other);
out:
	return err;
}

static long unix_wait_for_peer(struct sock *other, long timeo)
	__releases(&unix_sk(other)->lock)
{
	struct unix_sock *u = unix_sk(other);
	int sched;
	DEFINE_WAIT(wait);

	prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);

	sched = !sock_flag(other, SOCK_DEAD) &&
		!(other->sk_shutdown & RCV_SHUTDOWN) &&
		unix_recvq_full_lockless(other);

	unix_state_unlock(other);

	if (sched)
		timeo = schedule_timeout(timeo);

	finish_wait(&u->peer_wait, &wait);
	return timeo;
}

static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
			       int addr_len, int flags)
{
	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
	struct sock *sk = sock->sk, *newsk = NULL, *other = NULL;
	struct unix_sock *u = unix_sk(sk), *newu, *otheru;
	struct net *net = sock_net(sk);
	struct sk_buff *skb = NULL;
	long timeo;
	int err;
	int st;

	err = unix_validate_addr(sunaddr, addr_len);
	if (err)
		goto out;

	err = BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk, uaddr, &addr_len);
	if (err)
		goto out;

	if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
	     test_bit(SOCK_PASSPIDFD, &sock->flags)) && !u->addr) {
		err = unix_autobind(sk);
		if (err)
			goto out;
	}

	timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);

	/* First of all allocate resources.
	 * If we will make it after state is locked,
	 * we will have to recheck all again in any case.
	 */

	/* create new sock for complete connection */
	newsk = unix_create1(net, NULL, 0, sock->type);
	if (IS_ERR(newsk)) {
		err = PTR_ERR(newsk);
		newsk = NULL;
		goto out;
	}

	err = -ENOMEM;

	/* Allocate skb for sending to listening sock */
	skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
	if (skb == NULL)
		goto out;

restart:
	/* Find listening sock. */
	other = unix_find_other(net, sunaddr, addr_len, sk->sk_type);
	if (IS_ERR(other)) {
		err = PTR_ERR(other);
		other = NULL;
		goto out;
	}

	/* Latch state of peer */
	unix_state_lock(other);

	/* Apparently VFS overslept socket death. Retry. */
	if (sock_flag(other, SOCK_DEAD)) {
		unix_state_unlock(other);
		sock_put(other);
		goto restart;
	}

	err = -ECONNREFUSED;
	if (other->sk_state != TCP_LISTEN)
		goto out_unlock;
	if (other->sk_shutdown & RCV_SHUTDOWN)
		goto out_unlock;

	if (unix_recvq_full(other)) {
		err = -EAGAIN;
		if (!timeo)
			goto out_unlock;

		timeo = unix_wait_for_peer(other, timeo);

		err = sock_intr_errno(timeo);
		if (signal_pending(current))
			goto out;
		sock_put(other);
		goto restart;
	}

	/* Latch our state.
	 *
	 * This is a tricky place. We need to grab our state lock and cannot
	 * drop the lock on the peer. It is dangerous because deadlock is
	 * possible. Connect to self case and simultaneous
	 * attempt to connect are eliminated by checking socket
	 * state. other is TCP_LISTEN, if sk is TCP_LISTEN we
	 * check this before attempt to grab lock.
	 *
	 * Well, and we have to recheck the state after socket locked.
	 */
	st = sk->sk_state;

	switch (st) {
	case TCP_CLOSE:
		/* This is ok... continue with connect */
		break;
	case TCP_ESTABLISHED:
		/* Socket is already connected */
		err = -EISCONN;
		goto out_unlock;
	default:
		err = -EINVAL;
		goto out_unlock;
	}

	unix_state_lock_nested(sk, U_LOCK_SECOND);

	if (sk->sk_state != st) {
		unix_state_unlock(sk);
		unix_state_unlock(other);
		sock_put(other);
		goto restart;
	}

	err = security_unix_stream_connect(sk, other, newsk);
	if (err) {
		unix_state_unlock(sk);
		goto out_unlock;
	}

	/* The way is open! Fastly set all the necessary fields... */

	sock_hold(sk);
	unix_peer(newsk)	= sk;
	newsk->sk_state		= TCP_ESTABLISHED;
	newsk->sk_type		= sk->sk_type;
	init_peercred(newsk);
	newu = unix_sk(newsk);
	RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
	otheru = unix_sk(other);

	/* copy address information from listening to new sock
	 *
	 * The contents of *(otheru->addr) and otheru->path
	 * are seen fully set up here, since we have found
	 * otheru in hash under its lock. Insertion into the
	 * hash chain we'd found it in had been done in an
	 * earlier critical area protected by the chain's lock,
	 * the same one where we'd set *(otheru->addr) contents,
	 * as well as otheru->path and otheru->addr itself.
	 *
	 * Using smp_store_release() here to set newu->addr
	 * is enough to make those stores, as well as stores
	 * to newu->path visible to anyone who gets newu->addr
	 * by smp_load_acquire(). IOW, the same guarantees
	 * as for unix_sock instances bound in unix_bind() or
	 * in unix_autobind().
	 */
	if (otheru->path.dentry) {
		path_get(&otheru->path);
		newu->path = otheru->path;
	}
	refcount_inc(&otheru->addr->refcnt);
	smp_store_release(&newu->addr, otheru->addr);

	/* Set credentials */
	copy_peercred(sk, other);

	sock->state	= SS_CONNECTED;
	sk->sk_state	= TCP_ESTABLISHED;
	sock_hold(newsk);

	smp_mb__after_atomic();	/* sock_hold() does an atomic_inc() */
	unix_peer(sk)	= newsk;

	unix_state_unlock(sk);

	/* take ten and send info to listening sock */
	spin_lock(&other->sk_receive_queue.lock);
	__skb_queue_tail(&other->sk_receive_queue, skb);
	spin_unlock(&other->sk_receive_queue.lock);
	unix_state_unlock(other);
	other->sk_data_ready(other);
	sock_put(other);
	return 0;

out_unlock:
	if (other)
		unix_state_unlock(other);

out:
	kfree_skb(skb);
	if (newsk)
		unix_release_sock(newsk, 0);
	if (other)
		sock_put(other);
	return err;
}

static int unix_socketpair(struct socket *socka, struct socket *sockb)
{
	struct sock *ska = socka->sk, *skb = sockb->sk;

	/* Join our sockets back to back */
	sock_hold(ska);
	sock_hold(skb);
	unix_peer(ska) = skb;
	unix_peer(skb) = ska;
	init_peercred(ska);
	init_peercred(skb);

	ska->sk_state = TCP_ESTABLISHED;
	skb->sk_state = TCP_ESTABLISHED;
	socka->state  = SS_CONNECTED;
	sockb->state  = SS_CONNECTED;
	return 0;
}

static void unix_sock_inherit_flags(const struct socket *old,
				    struct socket *new)
{
	if (test_bit(SOCK_PASSCRED, &old->flags))
		set_bit(SOCK_PASSCRED, &new->flags);
	if (test_bit(SOCK_PASSPIDFD, &old->flags))
		set_bit(SOCK_PASSPIDFD, &new->flags);
	if (test_bit(SOCK_PASSSEC, &old->flags))
		set_bit(SOCK_PASSSEC, &new->flags);
}

static int unix_accept(struct socket *sock, struct socket *newsock, int flags,
		       bool kern)
{
	struct sock *sk = sock->sk;
	struct sock *tsk;
	struct sk_buff *skb;
	int err;

	err = -EOPNOTSUPP;
	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
		goto out;

	err = -EINVAL;
	if (sk->sk_state != TCP_LISTEN)
		goto out;

	/* If socket state is TCP_LISTEN it cannot change (for now...),
	 * so that no locks are necessary.
	 */

	skb = skb_recv_datagram(sk, (flags & O_NONBLOCK) ? MSG_DONTWAIT : 0,
				&err);
	if (!skb) {
		/* This means receive shutdown. */
		if (err == 0)
			err = -EINVAL;
		goto out;
	}

	tsk = skb->sk;
	skb_free_datagram(sk, skb);
	wake_up_interruptible(&unix_sk(sk)->peer_wait);

	/* attach accepted sock to socket */
	unix_state_lock(tsk);
	newsock->state = SS_CONNECTED;
	unix_sock_inherit_flags(sock, newsock);
	sock_graft(tsk, newsock);
	unix_state_unlock(tsk);
	return 0;

out:
	return err;
}


static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int peer)
{
	struct sock *sk = sock->sk;
	struct unix_address *addr;
	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr);
	int err = 0;

	if (peer) {
		sk = unix_peer_get(sk);

		err = -ENOTCONN;
		if (!sk)
			goto out;
		err = 0;
	} else {
		sock_hold(sk);
	}

	addr = smp_load_acquire(&unix_sk(sk)->addr);
	if (!addr) {
		sunaddr->sun_family = AF_UNIX;
		sunaddr->sun_path[0] = 0;
		err = offsetof(struct sockaddr_un, sun_path);
	} else {
		err = addr->len;
		memcpy(sunaddr, addr->name, addr->len);

		if (peer)
			BPF_CGROUP_RUN_SA_PROG(sk, uaddr, &err,
					       CGROUP_UNIX_GETPEERNAME);
		else
			BPF_CGROUP_RUN_SA_PROG(sk, uaddr, &err,
					       CGROUP_UNIX_GETSOCKNAME);
	}
	sock_put(sk);
out:
	return err;
}

static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb)
{
	scm->fp = scm_fp_dup(UNIXCB(skb).fp);

	/*
	 * Garbage collection of unix sockets starts by selecting a set of
	 * candidate sockets which have reference only from being in flight
	 * (total_refs == inflight_refs). This condition is checked once during
	 * the candidate collection phase, and candidates are marked as such, so
	 * that non-candidates can later be ignored. While inflight_refs is
	 * protected by unix_gc_lock, total_refs (file count) is not, hence this
	 * is an instantaneous decision.
	 *
	 * Once a candidate, however, the socket must not be reinstalled into a
	 * file descriptor while the garbage collection is in progress.
	 *
	 * If the above conditions are met, then the directed graph of
	 * candidates (*) does not change while unix_gc_lock is held.
	 *
	 * Any operations that change the file count through file descriptors
	 * (dup, close, sendmsg) do not change the graph since candidates are
	 * not installed in fds.
	 *
	 * Dequeuing a candidate via recvmsg would install it into an fd, but
	 * that takes unix_gc_lock to decrement the inflight count, so it's
	 * serialized with garbage collection.
	 *
	 * MSG_PEEK is special in that it does not change the inflight count,
	 * yet does install the socket into an fd. The following lock/unlock
	 * pair is to ensure serialization with garbage collection. It must be
	 * done between incrementing the file count and installing the file into
	 * an fd.
	 *
	 * If garbage collection starts after the barrier provided by the
	 * lock/unlock, then it will see the elevated refcount and not mark this
	 * as a candidate. If a garbage collection is already in progress
	 * before the file count was incremented, then the lock/unlock pair will
	 * ensure that garbage collection is finished before progressing to
	 * installing the fd.
	 *
	 * (*) A -> B where B is on the queue of A or B is on the queue of C
	 * which is on the queue of listening socket A.
	 */
	spin_lock(&unix_gc_lock);
	spin_unlock(&unix_gc_lock);
}

static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
{
	int err = 0;

	UNIXCB(skb).pid = get_pid(scm->pid);
	UNIXCB(skb).uid = scm->creds.uid;
	UNIXCB(skb).gid = scm->creds.gid;
	UNIXCB(skb).fp = NULL;
	unix_get_secdata(scm, skb);
	if (scm->fp && send_fds)
		err = unix_attach_fds(scm, skb);

	skb->destructor = unix_destruct_scm;
	return err;
}

static bool unix_passcred_enabled(const struct socket *sock,
				  const struct sock *other)
{
	return test_bit(SOCK_PASSCRED, &sock->flags) ||
	       test_bit(SOCK_PASSPIDFD, &sock->flags) ||
	       !other->sk_socket ||
	       test_bit(SOCK_PASSCRED, &other->sk_socket->flags) ||
	       test_bit(SOCK_PASSPIDFD, &other->sk_socket->flags);
}

/*
 * Some apps rely on write() giving SCM_CREDENTIALS
 * We include credentials if source or destination socket
 * asserted SOCK_PASSCRED.
 */
static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock,
			    const struct sock *other)
{
	if (UNIXCB(skb).pid)
		return;
	if (unix_passcred_enabled(sock, other)) {
		UNIXCB(skb).pid = get_pid(task_tgid(current));
		current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid);
	}
}

static bool unix_skb_scm_eq(struct sk_buff *skb,
			    struct scm_cookie *scm)
{
	return UNIXCB(skb).pid == scm->pid &&
	       uid_eq(UNIXCB(skb).uid, scm->creds.uid) &&
	       gid_eq(UNIXCB(skb).gid, scm->creds.gid) &&
	       unix_secdata_eq(scm, skb);
}

static void scm_stat_add(struct sock *sk, struct sk_buff *skb)
{
	struct scm_fp_list *fp = UNIXCB(skb).fp;
	struct unix_sock *u = unix_sk(sk);

	if (unlikely(fp && fp->count))
		atomic_add(fp->count, &u->scm_stat.nr_fds);
}

static void scm_stat_del(struct sock *sk, struct sk_buff *skb)
{
	struct scm_fp_list *fp = UNIXCB(skb).fp;
	struct unix_sock *u = unix_sk(sk);

	if (unlikely(fp && fp->count))
		atomic_sub(fp->count, &u->scm_stat.nr_fds);
}

/*
 *	Send AF_UNIX data.
 */

static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
			      size_t len)
{
	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name);
	struct sock *sk = sock->sk, *other = NULL;
	struct unix_sock *u = unix_sk(sk);
	struct scm_cookie scm;
	struct sk_buff *skb;
	int data_len = 0;
	int sk_locked;
	long timeo;
	int err;

	wait_for_unix_gc();
	err = scm_send(sock, msg, &scm, false);
	if (err < 0)
		return err;

	err = -EOPNOTSUPP;
	if (msg->msg_flags & MSG_OOB)
		goto out;

	if (msg->msg_namelen) {
		err = unix_validate_addr(sunaddr, msg->msg_namelen);
		if (err)
			goto out;

		err = BPF_CGROUP_RUN_PROG_UNIX_SENDMSG_LOCK(sk,
							    msg->msg_name,
							    &msg->msg_namelen,
							    NULL);
		if (err)
			goto out;
	} else {
		sunaddr = NULL;
		err = -ENOTCONN;
		other = unix_peer_get(sk);
		if (!other)
			goto out;
	}

	if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
	     test_bit(SOCK_PASSPIDFD, &sock->flags)) && !u->addr) {
		err = unix_autobind(sk);
		if (err)
			goto out;
	}

	err = -EMSGSIZE;
	if (len > sk->sk_sndbuf - 32)
		goto out;

	if (len > SKB_MAX_ALLOC) {
		data_len = min_t(size_t,
				 len - SKB_MAX_ALLOC,
				 MAX_SKB_FRAGS * PAGE_SIZE);
		data_len = PAGE_ALIGN(data_len);

		BUILD_BUG_ON(SKB_MAX_ALLOC < PAGE_SIZE);
	}

	skb = sock_alloc_send_pskb(sk, len - data_len, data_len,
				   msg->msg_flags & MSG_DONTWAIT, &err,
				   PAGE_ALLOC_COSTLY_ORDER);
	if (skb == NULL)
		goto out;

	err = unix_scm_to_skb(&scm, skb, true);
	if (err < 0)
		goto out_free;

	skb_put(skb, len - data_len);
	skb->data_len = data_len;
	skb->len = len;
	err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, len);
	if (err)
		goto out_free;

	timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);

restart:
	if (!other) {
		err = -ECONNRESET;
		if (sunaddr == NULL)
			goto out_free;

		other = unix_find_other(sock_net(sk), sunaddr, msg->msg_namelen,
					sk->sk_type);
		if (IS_ERR(other)) {
			err = PTR_ERR(other);
			other = NULL;
			goto out_free;
		}
	}

	if (sk_filter(other, skb) < 0) {
		/* Toss the packet but do not return any error to the sender */
		err = len;
		goto out_free;
	}

	sk_locked = 0;
	unix_state_lock(other);
restart_locked:
	err = -EPERM;
	if (!unix_may_send(sk, other))
		goto out_unlock;

	if (unlikely(sock_flag(other, SOCK_DEAD))) {
		/*
		 *	Check with 1003.1g - what should
		 *	datagram error
		 */
		unix_state_unlock(other);
		sock_put(other);

		if (!sk_locked)
			unix_state_lock(sk);

		err = 0;
		if (sk->sk_type == SOCK_SEQPACKET) {
			/* We are here only when racing with unix_release_sock()
			 * is clearing @other. Never change state to TCP_CLOSE
			 * unlike SOCK_DGRAM wants.
2035 */ 2036 unix_state_unlock(sk); 2037 err = -EPIPE; 2038 } else if (unix_peer(sk) == other) { 2039 unix_peer(sk) = NULL; 2040 unix_dgram_peer_wake_disconnect_wakeup(sk, other); 2041 2042 sk->sk_state = TCP_CLOSE; 2043 unix_state_unlock(sk); 2044 2045 unix_dgram_disconnected(sk, other); 2046 sock_put(other); 2047 err = -ECONNREFUSED; 2048 } else { 2049 unix_state_unlock(sk); 2050 } 2051 2052 other = NULL; 2053 if (err) 2054 goto out_free; 2055 goto restart; 2056 } 2057 2058 err = -EPIPE; 2059 if (other->sk_shutdown & RCV_SHUTDOWN) 2060 goto out_unlock; 2061 2062 if (sk->sk_type != SOCK_SEQPACKET) { 2063 err = security_unix_may_send(sk->sk_socket, other->sk_socket); 2064 if (err) 2065 goto out_unlock; 2066 } 2067 2068 /* other == sk && unix_peer(other) != sk if 2069 * - unix_peer(sk) == NULL, destination address bound to sk 2070 * - unix_peer(sk) == sk by time of get but disconnected before lock 2071 */ 2072 if (other != sk && 2073 unlikely(unix_peer(other) != sk && 2074 unix_recvq_full_lockless(other))) { 2075 if (timeo) { 2076 timeo = unix_wait_for_peer(other, timeo); 2077 2078 err = sock_intr_errno(timeo); 2079 if (signal_pending(current)) 2080 goto out_free; 2081 2082 goto restart; 2083 } 2084 2085 if (!sk_locked) { 2086 unix_state_unlock(other); 2087 unix_state_double_lock(sk, other); 2088 } 2089 2090 if (unix_peer(sk) != other || 2091 unix_dgram_peer_wake_me(sk, other)) { 2092 err = -EAGAIN; 2093 sk_locked = 1; 2094 goto out_unlock; 2095 } 2096 2097 if (!sk_locked) { 2098 sk_locked = 1; 2099 goto restart_locked; 2100 } 2101 } 2102 2103 if (unlikely(sk_locked)) 2104 unix_state_unlock(sk); 2105 2106 if (sock_flag(other, SOCK_RCVTSTAMP)) 2107 __net_timestamp(skb); 2108 maybe_add_creds(skb, sock, other); 2109 scm_stat_add(other, skb); 2110 skb_queue_tail(&other->sk_receive_queue, skb); 2111 unix_state_unlock(other); 2112 other->sk_data_ready(other); 2113 sock_put(other); 2114 scm_destroy(&scm); 2115 return len; 2116 2117 out_unlock: 2118 if (sk_locked) 2119 unix_state_unlock(sk); 2120 unix_state_unlock(other); 2121 out_free: 2122 kfree_skb(skb); 2123 out: 2124 if (other) 2125 sock_put(other); 2126 scm_destroy(&scm); 2127 return err; 2128 } 2129 2130 /* We use paged skbs for stream sockets, and limit occupancy to 32768 2131 * bytes, and a minimum of a full page. 
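 * UNIX_SKB_FRAGS_SZ below rounds that limit up to a whole number of
 * pages via get_order(), so it is never smaller than PAGE_SIZE.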
2132 */ 2133 #define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768)) 2134 2135 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 2136 static int queue_oob(struct socket *sock, struct msghdr *msg, struct sock *other, 2137 struct scm_cookie *scm, bool fds_sent) 2138 { 2139 struct unix_sock *ousk = unix_sk(other); 2140 struct sk_buff *skb; 2141 int err = 0; 2142 2143 skb = sock_alloc_send_skb(sock->sk, 1, msg->msg_flags & MSG_DONTWAIT, &err); 2144 2145 if (!skb) 2146 return err; 2147 2148 err = unix_scm_to_skb(scm, skb, !fds_sent); 2149 if (err < 0) { 2150 kfree_skb(skb); 2151 return err; 2152 } 2153 skb_put(skb, 1); 2154 err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, 1); 2155 2156 if (err) { 2157 kfree_skb(skb); 2158 return err; 2159 } 2160 2161 unix_state_lock(other); 2162 2163 if (sock_flag(other, SOCK_DEAD) || 2164 (other->sk_shutdown & RCV_SHUTDOWN)) { 2165 unix_state_unlock(other); 2166 kfree_skb(skb); 2167 return -EPIPE; 2168 } 2169 2170 maybe_add_creds(skb, sock, other); 2171 skb_get(skb); 2172 2173 if (ousk->oob_skb) 2174 consume_skb(ousk->oob_skb); 2175 2176 WRITE_ONCE(ousk->oob_skb, skb); 2177 2178 scm_stat_add(other, skb); 2179 skb_queue_tail(&other->sk_receive_queue, skb); 2180 sk_send_sigurg(other); 2181 unix_state_unlock(other); 2182 other->sk_data_ready(other); 2183 2184 return err; 2185 } 2186 #endif 2187 2188 static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, 2189 size_t len) 2190 { 2191 struct sock *sk = sock->sk; 2192 struct sock *other = NULL; 2193 int err, size; 2194 struct sk_buff *skb; 2195 int sent = 0; 2196 struct scm_cookie scm; 2197 bool fds_sent = false; 2198 int data_len; 2199 2200 wait_for_unix_gc(); 2201 err = scm_send(sock, msg, &scm, false); 2202 if (err < 0) 2203 return err; 2204 2205 err = -EOPNOTSUPP; 2206 if (msg->msg_flags & MSG_OOB) { 2207 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 2208 if (len) 2209 len--; 2210 else 2211 #endif 2212 goto out_err; 2213 } 2214 2215 if (msg->msg_namelen) { 2216 err = sk->sk_state == TCP_ESTABLISHED ? 
-EISCONN : -EOPNOTSUPP; 2217 goto out_err; 2218 } else { 2219 err = -ENOTCONN; 2220 other = unix_peer(sk); 2221 if (!other) 2222 goto out_err; 2223 } 2224 2225 if (sk->sk_shutdown & SEND_SHUTDOWN) 2226 goto pipe_err; 2227 2228 while (sent < len) { 2229 size = len - sent; 2230 2231 if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) { 2232 skb = sock_alloc_send_pskb(sk, 0, 0, 2233 msg->msg_flags & MSG_DONTWAIT, 2234 &err, 0); 2235 } else { 2236 /* Keep two messages in the pipe so it schedules better */ 2237 size = min_t(int, size, (sk->sk_sndbuf >> 1) - 64); 2238 2239 /* allow fallback to order-0 allocations */ 2240 size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ); 2241 2242 data_len = max_t(int, 0, size - SKB_MAX_HEAD(0)); 2243 2244 data_len = min_t(size_t, size, PAGE_ALIGN(data_len)); 2245 2246 skb = sock_alloc_send_pskb(sk, size - data_len, data_len, 2247 msg->msg_flags & MSG_DONTWAIT, &err, 2248 get_order(UNIX_SKB_FRAGS_SZ)); 2249 } 2250 if (!skb) 2251 goto out_err; 2252 2253 /* Only send the fds in the first buffer */ 2254 err = unix_scm_to_skb(&scm, skb, !fds_sent); 2255 if (err < 0) { 2256 kfree_skb(skb); 2257 goto out_err; 2258 } 2259 fds_sent = true; 2260 2261 if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) { 2262 err = skb_splice_from_iter(skb, &msg->msg_iter, size, 2263 sk->sk_allocation); 2264 if (err < 0) { 2265 kfree_skb(skb); 2266 goto out_err; 2267 } 2268 size = err; 2269 refcount_add(size, &sk->sk_wmem_alloc); 2270 } else { 2271 skb_put(skb, size - data_len); 2272 skb->data_len = data_len; 2273 skb->len = size; 2274 err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size); 2275 if (err) { 2276 kfree_skb(skb); 2277 goto out_err; 2278 } 2279 } 2280 2281 unix_state_lock(other); 2282 2283 if (sock_flag(other, SOCK_DEAD) || 2284 (other->sk_shutdown & RCV_SHUTDOWN)) 2285 goto pipe_err_free; 2286 2287 maybe_add_creds(skb, sock, other); 2288 scm_stat_add(other, skb); 2289 skb_queue_tail(&other->sk_receive_queue, skb); 2290 unix_state_unlock(other); 2291 other->sk_data_ready(other); 2292 sent += size; 2293 } 2294 2295 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 2296 if (msg->msg_flags & MSG_OOB) { 2297 err = queue_oob(sock, msg, other, &scm, fds_sent); 2298 if (err) 2299 goto out_err; 2300 sent++; 2301 } 2302 #endif 2303 2304 scm_destroy(&scm); 2305 2306 return sent; 2307 2308 pipe_err_free: 2309 unix_state_unlock(other); 2310 kfree_skb(skb); 2311 pipe_err: 2312 if (sent == 0 && !(msg->msg_flags&MSG_NOSIGNAL)) 2313 send_sig(SIGPIPE, current, 0); 2314 err = -EPIPE; 2315 out_err: 2316 scm_destroy(&scm); 2317 return sent ? 
: err; 2318 } 2319 2320 static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg, 2321 size_t len) 2322 { 2323 int err; 2324 struct sock *sk = sock->sk; 2325 2326 err = sock_error(sk); 2327 if (err) 2328 return err; 2329 2330 if (sk->sk_state != TCP_ESTABLISHED) 2331 return -ENOTCONN; 2332 2333 if (msg->msg_namelen) 2334 msg->msg_namelen = 0; 2335 2336 return unix_dgram_sendmsg(sock, msg, len); 2337 } 2338 2339 static int unix_seqpacket_recvmsg(struct socket *sock, struct msghdr *msg, 2340 size_t size, int flags) 2341 { 2342 struct sock *sk = sock->sk; 2343 2344 if (sk->sk_state != TCP_ESTABLISHED) 2345 return -ENOTCONN; 2346 2347 return unix_dgram_recvmsg(sock, msg, size, flags); 2348 } 2349 2350 static void unix_copy_addr(struct msghdr *msg, struct sock *sk) 2351 { 2352 struct unix_address *addr = smp_load_acquire(&unix_sk(sk)->addr); 2353 2354 if (addr) { 2355 msg->msg_namelen = addr->len; 2356 memcpy(msg->msg_name, addr->name, addr->len); 2357 } 2358 } 2359 2360 int __unix_dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t size, 2361 int flags) 2362 { 2363 struct scm_cookie scm; 2364 struct socket *sock = sk->sk_socket; 2365 struct unix_sock *u = unix_sk(sk); 2366 struct sk_buff *skb, *last; 2367 long timeo; 2368 int skip; 2369 int err; 2370 2371 err = -EOPNOTSUPP; 2372 if (flags&MSG_OOB) 2373 goto out; 2374 2375 timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); 2376 2377 do { 2378 mutex_lock(&u->iolock); 2379 2380 skip = sk_peek_offset(sk, flags); 2381 skb = __skb_try_recv_datagram(sk, &sk->sk_receive_queue, flags, 2382 &skip, &err, &last); 2383 if (skb) { 2384 if (!(flags & MSG_PEEK)) 2385 scm_stat_del(sk, skb); 2386 break; 2387 } 2388 2389 mutex_unlock(&u->iolock); 2390 2391 if (err != -EAGAIN) 2392 break; 2393 } while (timeo && 2394 !__skb_wait_for_more_packets(sk, &sk->sk_receive_queue, 2395 &err, &timeo, last)); 2396 2397 if (!skb) { /* implies iolock unlocked */ 2398 unix_state_lock(sk); 2399 /* Signal EOF on disconnected non-blocking SEQPACKET socket. */ 2400 if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN && 2401 (sk->sk_shutdown & RCV_SHUTDOWN)) 2402 err = 0; 2403 unix_state_unlock(sk); 2404 goto out; 2405 } 2406 2407 if (wq_has_sleeper(&u->peer_wait)) 2408 wake_up_interruptible_sync_poll(&u->peer_wait, 2409 EPOLLOUT | EPOLLWRNORM | 2410 EPOLLWRBAND); 2411 2412 if (msg->msg_name) { 2413 unix_copy_addr(msg, skb->sk); 2414 2415 BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk, 2416 msg->msg_name, 2417 &msg->msg_namelen); 2418 } 2419 2420 if (size > skb->len - skip) 2421 size = skb->len - skip; 2422 else if (size < skb->len - skip) 2423 msg->msg_flags |= MSG_TRUNC; 2424 2425 err = skb_copy_datagram_msg(skb, skip, msg, size); 2426 if (err) 2427 goto out_free; 2428 2429 if (sock_flag(sk, SOCK_RCVTSTAMP)) 2430 __sock_recv_timestamp(msg, sk, skb); 2431 2432 memset(&scm, 0, sizeof(scm)); 2433 2434 scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid); 2435 unix_set_secdata(&scm, skb); 2436 2437 if (!(flags & MSG_PEEK)) { 2438 if (UNIXCB(skb).fp) 2439 unix_detach_fds(&scm, skb); 2440 2441 sk_peek_offset_bwd(sk, skb->len); 2442 } else { 2443 /* It is questionable: on PEEK we could: 2444 - do not return fds - good, but too simple 8) 2445 - return fds, and do not return them on read (old strategy, 2446 apparently wrong) 2447 - clone fds (I chose it for now, it is the most universal 2448 solution) 2449 2450 POSIX 1003.1g does not actually define this clearly 2451 at all. POSIX 1003.1g doesn't define a lot of things 2452 clearly however! 
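   With the clone strategy a MSG_PEEK receiver gets its own references
   to the passed files and is responsible for closing them.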
2453 2454 */ 2455 2456 sk_peek_offset_fwd(sk, size); 2457 2458 if (UNIXCB(skb).fp) 2459 unix_peek_fds(&scm, skb); 2460 } 2461 err = (flags & MSG_TRUNC) ? skb->len - skip : size; 2462 2463 scm_recv_unix(sock, msg, &scm, flags); 2464 2465 out_free: 2466 skb_free_datagram(sk, skb); 2467 mutex_unlock(&u->iolock); 2468 out: 2469 return err; 2470 } 2471 2472 static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, 2473 int flags) 2474 { 2475 struct sock *sk = sock->sk; 2476 2477 #ifdef CONFIG_BPF_SYSCALL 2478 const struct proto *prot = READ_ONCE(sk->sk_prot); 2479 2480 if (prot != &unix_dgram_proto) 2481 return prot->recvmsg(sk, msg, size, flags, NULL); 2482 #endif 2483 return __unix_dgram_recvmsg(sk, msg, size, flags); 2484 } 2485 2486 static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor) 2487 { 2488 struct unix_sock *u = unix_sk(sk); 2489 struct sk_buff *skb; 2490 int err; 2491 2492 mutex_lock(&u->iolock); 2493 skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err); 2494 mutex_unlock(&u->iolock); 2495 if (!skb) 2496 return err; 2497 2498 return recv_actor(sk, skb); 2499 } 2500 2501 /* 2502 * Sleep until more data has arrived. But check for races.. 2503 */ 2504 static long unix_stream_data_wait(struct sock *sk, long timeo, 2505 struct sk_buff *last, unsigned int last_len, 2506 bool freezable) 2507 { 2508 unsigned int state = TASK_INTERRUPTIBLE | freezable * TASK_FREEZABLE; 2509 struct sk_buff *tail; 2510 DEFINE_WAIT(wait); 2511 2512 unix_state_lock(sk); 2513 2514 for (;;) { 2515 prepare_to_wait(sk_sleep(sk), &wait, state); 2516 2517 tail = skb_peek_tail(&sk->sk_receive_queue); 2518 if (tail != last || 2519 (tail && tail->len != last_len) || 2520 sk->sk_err || 2521 (sk->sk_shutdown & RCV_SHUTDOWN) || 2522 signal_pending(current) || 2523 !timeo) 2524 break; 2525 2526 sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); 2527 unix_state_unlock(sk); 2528 timeo = schedule_timeout(timeo); 2529 unix_state_lock(sk); 2530 2531 if (sock_flag(sk, SOCK_DEAD)) 2532 break; 2533 2534 sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); 2535 } 2536 2537 finish_wait(sk_sleep(sk), &wait); 2538 unix_state_unlock(sk); 2539 return timeo; 2540 } 2541 2542 static unsigned int unix_skb_len(const struct sk_buff *skb) 2543 { 2544 return skb->len - UNIXCB(skb).consumed; 2545 } 2546 2547 struct unix_stream_read_state { 2548 int (*recv_actor)(struct sk_buff *, int, int, 2549 struct unix_stream_read_state *); 2550 struct socket *socket; 2551 struct msghdr *msg; 2552 struct pipe_inode_info *pipe; 2553 size_t size; 2554 int flags; 2555 unsigned int splice_flags; 2556 }; 2557 2558 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 2559 static int unix_stream_recv_urg(struct unix_stream_read_state *state) 2560 { 2561 struct socket *sock = state->socket; 2562 struct sock *sk = sock->sk; 2563 struct unix_sock *u = unix_sk(sk); 2564 int chunk = 1; 2565 struct sk_buff *oob_skb; 2566 2567 mutex_lock(&u->iolock); 2568 unix_state_lock(sk); 2569 2570 if (sock_flag(sk, SOCK_URGINLINE) || !u->oob_skb) { 2571 unix_state_unlock(sk); 2572 mutex_unlock(&u->iolock); 2573 return -EINVAL; 2574 } 2575 2576 oob_skb = u->oob_skb; 2577 2578 if (!(state->flags & MSG_PEEK)) 2579 WRITE_ONCE(u->oob_skb, NULL); 2580 else 2581 skb_get(oob_skb); 2582 unix_state_unlock(sk); 2583 2584 chunk = state->recv_actor(oob_skb, 0, chunk, state); 2585 2586 if (!(state->flags & MSG_PEEK)) 2587 UNIXCB(oob_skb).consumed += 1; 2588 2589 consume_skb(oob_skb); 2590 2591 mutex_unlock(&u->iolock); 2592 2593 if (chunk < 0) 2594 return -EFAULT; 2595 2596 state->msg->msg_flags |= 
MSG_OOB; 2597 return 1; 2598 } 2599 2600 static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk, 2601 int flags, int copied) 2602 { 2603 struct unix_sock *u = unix_sk(sk); 2604 2605 if (!unix_skb_len(skb) && !(flags & MSG_PEEK)) { 2606 skb_unlink(skb, &sk->sk_receive_queue); 2607 consume_skb(skb); 2608 skb = NULL; 2609 } else { 2610 if (skb == u->oob_skb) { 2611 if (copied) { 2612 skb = NULL; 2613 } else if (sock_flag(sk, SOCK_URGINLINE)) { 2614 if (!(flags & MSG_PEEK)) { 2615 WRITE_ONCE(u->oob_skb, NULL); 2616 consume_skb(skb); 2617 } 2618 } else if (!(flags & MSG_PEEK)) { 2619 skb_unlink(skb, &sk->sk_receive_queue); 2620 consume_skb(skb); 2621 skb = skb_peek(&sk->sk_receive_queue); 2622 } 2623 } 2624 } 2625 return skb; 2626 } 2627 #endif 2628 2629 static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor) 2630 { 2631 if (unlikely(sk->sk_state != TCP_ESTABLISHED)) 2632 return -ENOTCONN; 2633 2634 return unix_read_skb(sk, recv_actor); 2635 } 2636 2637 static int unix_stream_read_generic(struct unix_stream_read_state *state, 2638 bool freezable) 2639 { 2640 struct scm_cookie scm; 2641 struct socket *sock = state->socket; 2642 struct sock *sk = sock->sk; 2643 struct unix_sock *u = unix_sk(sk); 2644 int copied = 0; 2645 int flags = state->flags; 2646 int noblock = flags & MSG_DONTWAIT; 2647 bool check_creds = false; 2648 int target; 2649 int err = 0; 2650 long timeo; 2651 int skip; 2652 size_t size = state->size; 2653 unsigned int last_len; 2654 2655 if (unlikely(sk->sk_state != TCP_ESTABLISHED)) { 2656 err = -EINVAL; 2657 goto out; 2658 } 2659 2660 if (unlikely(flags & MSG_OOB)) { 2661 err = -EOPNOTSUPP; 2662 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 2663 err = unix_stream_recv_urg(state); 2664 #endif 2665 goto out; 2666 } 2667 2668 target = sock_rcvlowat(sk, flags & MSG_WAITALL, size); 2669 timeo = sock_rcvtimeo(sk, noblock); 2670 2671 memset(&scm, 0, sizeof(scm)); 2672 2673 /* Lock the socket to prevent queue disordering 2674 * while we sleep copying data to the message. 2675 */ 2676 mutex_lock(&u->iolock); 2677 2678 skip = max(sk_peek_offset(sk, flags), 0); 2679 2680 do { 2681 int chunk; 2682 bool drop_skb; 2683 struct sk_buff *skb, *last; 2684 2685 redo: 2686 unix_state_lock(sk); 2687 if (sock_flag(sk, SOCK_DEAD)) { 2688 err = -ECONNRESET; 2689 goto unlock; 2690 } 2691 last = skb = skb_peek(&sk->sk_receive_queue); 2692 last_len = last ? last->len : 0; 2693 2694 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 2695 if (skb) { 2696 skb = manage_oob(skb, sk, flags, copied); 2697 if (!skb) { 2698 unix_state_unlock(sk); 2699 if (copied) 2700 break; 2701 goto redo; 2702 } 2703 } 2704 #endif 2705 again: 2706 if (skb == NULL) { 2707 if (copied >= target) 2708 goto unlock; 2709 2710 /* 2711 * POSIX 1003.1g mandates this order.
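 * (a pending socket error is reported before the EOF implied by
 * RCV_SHUTDOWN).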
2712 */ 2713 2714 err = sock_error(sk); 2715 if (err) 2716 goto unlock; 2717 if (sk->sk_shutdown & RCV_SHUTDOWN) 2718 goto unlock; 2719 2720 unix_state_unlock(sk); 2721 if (!timeo) { 2722 err = -EAGAIN; 2723 break; 2724 } 2725 2726 mutex_unlock(&u->iolock); 2727 2728 timeo = unix_stream_data_wait(sk, timeo, last, 2729 last_len, freezable); 2730 2731 if (signal_pending(current)) { 2732 err = sock_intr_errno(timeo); 2733 scm_destroy(&scm); 2734 goto out; 2735 } 2736 2737 mutex_lock(&u->iolock); 2738 goto redo; 2739 unlock: 2740 unix_state_unlock(sk); 2741 break; 2742 } 2743 2744 while (skip >= unix_skb_len(skb)) { 2745 skip -= unix_skb_len(skb); 2746 last = skb; 2747 last_len = skb->len; 2748 skb = skb_peek_next(skb, &sk->sk_receive_queue); 2749 if (!skb) 2750 goto again; 2751 } 2752 2753 unix_state_unlock(sk); 2754 2755 if (check_creds) { 2756 /* Never glue messages from different writers */ 2757 if (!unix_skb_scm_eq(skb, &scm)) 2758 break; 2759 } else if (test_bit(SOCK_PASSCRED, &sock->flags) || 2760 test_bit(SOCK_PASSPIDFD, &sock->flags)) { 2761 /* Copy credentials */ 2762 scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid); 2763 unix_set_secdata(&scm, skb); 2764 check_creds = true; 2765 } 2766 2767 /* Copy address just once */ 2768 if (state->msg && state->msg->msg_name) { 2769 DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, 2770 state->msg->msg_name); 2771 unix_copy_addr(state->msg, skb->sk); 2772 2773 BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk, 2774 state->msg->msg_name, 2775 &state->msg->msg_namelen); 2776 2777 sunaddr = NULL; 2778 } 2779 2780 chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size); 2781 skb_get(skb); 2782 chunk = state->recv_actor(skb, skip, chunk, state); 2783 drop_skb = !unix_skb_len(skb); 2784 /* skb is only safe to use if !drop_skb */ 2785 consume_skb(skb); 2786 if (chunk < 0) { 2787 if (copied == 0) 2788 copied = -EFAULT; 2789 break; 2790 } 2791 copied += chunk; 2792 size -= chunk; 2793 2794 if (drop_skb) { 2795 /* the skb was touched by a concurrent reader; 2796 * we should not expect anything from this skb 2797 * anymore and assume it invalid - we can be 2798 * sure it was dropped from the socket queue 2799 * 2800 * let's report a short read 2801 */ 2802 err = 0; 2803 break; 2804 } 2805 2806 /* Mark read part of skb as used */ 2807 if (!(flags & MSG_PEEK)) { 2808 UNIXCB(skb).consumed += chunk; 2809 2810 sk_peek_offset_bwd(sk, chunk); 2811 2812 if (UNIXCB(skb).fp) { 2813 scm_stat_del(sk, skb); 2814 unix_detach_fds(&scm, skb); 2815 } 2816 2817 if (unix_skb_len(skb)) 2818 break; 2819 2820 skb_unlink(skb, &sk->sk_receive_queue); 2821 consume_skb(skb); 2822 2823 if (scm.fp) 2824 break; 2825 } else { 2826 /* It is questionable, see note in unix_dgram_recvmsg. 2827 */ 2828 if (UNIXCB(skb).fp) 2829 unix_peek_fds(&scm, skb); 2830 2831 sk_peek_offset_fwd(sk, chunk); 2832 2833 if (UNIXCB(skb).fp) 2834 break; 2835 2836 skip = 0; 2837 last = skb; 2838 last_len = skb->len; 2839 unix_state_lock(sk); 2840 skb = skb_peek_next(skb, &sk->sk_receive_queue); 2841 if (skb) 2842 goto again; 2843 unix_state_unlock(sk); 2844 break; 2845 } 2846 } while (size); 2847 2848 mutex_unlock(&u->iolock); 2849 if (state->msg) 2850 scm_recv_unix(sock, state->msg, &scm, flags); 2851 else 2852 scm_destroy(&scm); 2853 out: 2854 return copied ? 
: err; 2855 } 2856 2857 static int unix_stream_read_actor(struct sk_buff *skb, 2858 int skip, int chunk, 2859 struct unix_stream_read_state *state) 2860 { 2861 int ret; 2862 2863 ret = skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip, 2864 state->msg, chunk); 2865 return ret ?: chunk; 2866 } 2867 2868 int __unix_stream_recvmsg(struct sock *sk, struct msghdr *msg, 2869 size_t size, int flags) 2870 { 2871 struct unix_stream_read_state state = { 2872 .recv_actor = unix_stream_read_actor, 2873 .socket = sk->sk_socket, 2874 .msg = msg, 2875 .size = size, 2876 .flags = flags 2877 }; 2878 2879 return unix_stream_read_generic(&state, true); 2880 } 2881 2882 static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg, 2883 size_t size, int flags) 2884 { 2885 struct unix_stream_read_state state = { 2886 .recv_actor = unix_stream_read_actor, 2887 .socket = sock, 2888 .msg = msg, 2889 .size = size, 2890 .flags = flags 2891 }; 2892 2893 #ifdef CONFIG_BPF_SYSCALL 2894 struct sock *sk = sock->sk; 2895 const struct proto *prot = READ_ONCE(sk->sk_prot); 2896 2897 if (prot != &unix_stream_proto) 2898 return prot->recvmsg(sk, msg, size, flags, NULL); 2899 #endif 2900 return unix_stream_read_generic(&state, true); 2901 } 2902 2903 static int unix_stream_splice_actor(struct sk_buff *skb, 2904 int skip, int chunk, 2905 struct unix_stream_read_state *state) 2906 { 2907 return skb_splice_bits(skb, state->socket->sk, 2908 UNIXCB(skb).consumed + skip, 2909 state->pipe, chunk, state->splice_flags); 2910 } 2911 2912 static ssize_t unix_stream_splice_read(struct socket *sock, loff_t *ppos, 2913 struct pipe_inode_info *pipe, 2914 size_t size, unsigned int flags) 2915 { 2916 struct unix_stream_read_state state = { 2917 .recv_actor = unix_stream_splice_actor, 2918 .socket = sock, 2919 .pipe = pipe, 2920 .size = size, 2921 .splice_flags = flags, 2922 }; 2923 2924 if (unlikely(*ppos)) 2925 return -ESPIPE; 2926 2927 if (sock->file->f_flags & O_NONBLOCK || 2928 flags & SPLICE_F_NONBLOCK) 2929 state.flags = MSG_DONTWAIT; 2930 2931 return unix_stream_read_generic(&state, false); 2932 } 2933 2934 static int unix_shutdown(struct socket *sock, int mode) 2935 { 2936 struct sock *sk = sock->sk; 2937 struct sock *other; 2938 2939 if (mode < SHUT_RD || mode > SHUT_RDWR) 2940 return -EINVAL; 2941 /* This maps: 2942 * SHUT_RD (0) -> RCV_SHUTDOWN (1) 2943 * SHUT_WR (1) -> SEND_SHUTDOWN (2) 2944 * SHUT_RDWR (2) -> SHUTDOWN_MASK (3) 2945 */ 2946 ++mode; 2947 2948 unix_state_lock(sk); 2949 WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | mode); 2950 other = unix_peer(sk); 2951 if (other) 2952 sock_hold(other); 2953 unix_state_unlock(sk); 2954 sk->sk_state_change(sk); 2955 2956 if (other && 2957 (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) { 2958 2959 int peer_mode = 0; 2960 const struct proto *prot = READ_ONCE(other->sk_prot); 2961 2962 if (prot->unhash) 2963 prot->unhash(other); 2964 if (mode&RCV_SHUTDOWN) 2965 peer_mode |= SEND_SHUTDOWN; 2966 if (mode&SEND_SHUTDOWN) 2967 peer_mode |= RCV_SHUTDOWN; 2968 unix_state_lock(other); 2969 WRITE_ONCE(other->sk_shutdown, other->sk_shutdown | peer_mode); 2970 unix_state_unlock(other); 2971 other->sk_state_change(other); 2972 if (peer_mode == SHUTDOWN_MASK) 2973 sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP); 2974 else if (peer_mode & RCV_SHUTDOWN) 2975 sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN); 2976 } 2977 if (other) 2978 sock_put(other); 2979 2980 return 0; 2981 } 2982 2983 long unix_inq_len(struct sock *sk) 2984 { 2985 struct sk_buff *skb; 2986 long amount = 
0; 2987 2988 if (sk->sk_state == TCP_LISTEN) 2989 return -EINVAL; 2990 2991 spin_lock(&sk->sk_receive_queue.lock); 2992 if (sk->sk_type == SOCK_STREAM || 2993 sk->sk_type == SOCK_SEQPACKET) { 2994 skb_queue_walk(&sk->sk_receive_queue, skb) 2995 amount += unix_skb_len(skb); 2996 } else { 2997 skb = skb_peek(&sk->sk_receive_queue); 2998 if (skb) 2999 amount = skb->len; 3000 } 3001 spin_unlock(&sk->sk_receive_queue.lock); 3002 3003 return amount; 3004 } 3005 EXPORT_SYMBOL_GPL(unix_inq_len); 3006 3007 long unix_outq_len(struct sock *sk) 3008 { 3009 return sk_wmem_alloc_get(sk); 3010 } 3011 EXPORT_SYMBOL_GPL(unix_outq_len); 3012 3013 static int unix_open_file(struct sock *sk) 3014 { 3015 struct path path; 3016 struct file *f; 3017 int fd; 3018 3019 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) 3020 return -EPERM; 3021 3022 if (!smp_load_acquire(&unix_sk(sk)->addr)) 3023 return -ENOENT; 3024 3025 path = unix_sk(sk)->path; 3026 if (!path.dentry) 3027 return -ENOENT; 3028 3029 path_get(&path); 3030 3031 fd = get_unused_fd_flags(O_CLOEXEC); 3032 if (fd < 0) 3033 goto out; 3034 3035 f = dentry_open(&path, O_PATH, current_cred()); 3036 if (IS_ERR(f)) { 3037 put_unused_fd(fd); 3038 fd = PTR_ERR(f); 3039 goto out; 3040 } 3041 3042 fd_install(fd, f); 3043 out: 3044 path_put(&path); 3045 3046 return fd; 3047 } 3048 3049 static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) 3050 { 3051 struct sock *sk = sock->sk; 3052 long amount = 0; 3053 int err; 3054 3055 switch (cmd) { 3056 case SIOCOUTQ: 3057 amount = unix_outq_len(sk); 3058 err = put_user(amount, (int __user *)arg); 3059 break; 3060 case SIOCINQ: 3061 amount = unix_inq_len(sk); 3062 if (amount < 0) 3063 err = amount; 3064 else 3065 err = put_user(amount, (int __user *)arg); 3066 break; 3067 case SIOCUNIXFILE: 3068 err = unix_open_file(sk); 3069 break; 3070 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 3071 case SIOCATMARK: 3072 { 3073 struct sk_buff *skb; 3074 int answ = 0; 3075 3076 skb = skb_peek(&sk->sk_receive_queue); 3077 if (skb && skb == READ_ONCE(unix_sk(sk)->oob_skb)) 3078 answ = 1; 3079 err = put_user(answ, (int __user *)arg); 3080 } 3081 break; 3082 #endif 3083 default: 3084 err = -ENOIOCTLCMD; 3085 break; 3086 } 3087 return err; 3088 } 3089 3090 #ifdef CONFIG_COMPAT 3091 static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) 3092 { 3093 return unix_ioctl(sock, cmd, (unsigned long)compat_ptr(arg)); 3094 } 3095 #endif 3096 3097 static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wait) 3098 { 3099 struct sock *sk = sock->sk; 3100 __poll_t mask; 3101 u8 shutdown; 3102 3103 sock_poll_wait(file, sock, wait); 3104 mask = 0; 3105 shutdown = READ_ONCE(sk->sk_shutdown); 3106 3107 /* exceptional events? */ 3108 if (READ_ONCE(sk->sk_err)) 3109 mask |= EPOLLERR; 3110 if (shutdown == SHUTDOWN_MASK) 3111 mask |= EPOLLHUP; 3112 if (shutdown & RCV_SHUTDOWN) 3113 mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM; 3114 3115 /* readable? 
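 * Either data is queued on sk_receive_queue, or sk_is_readable()
 * reports data held elsewhere, e.g. by a BPF/sockmap psock.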
*/ 3116 if (!skb_queue_empty_lockless(&sk->sk_receive_queue)) 3117 mask |= EPOLLIN | EPOLLRDNORM; 3118 if (sk_is_readable(sk)) 3119 mask |= EPOLLIN | EPOLLRDNORM; 3120 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 3121 if (READ_ONCE(unix_sk(sk)->oob_skb)) 3122 mask |= EPOLLPRI; 3123 #endif 3124 3125 /* Connection-based sockets need to check for termination and startup */ 3126 if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) && 3127 sk->sk_state == TCP_CLOSE) 3128 mask |= EPOLLHUP; 3129 3130 /* 3131 * We set writable also when the other side has shut down the 3132 * connection. This prevents stuck sockets. 3133 */ 3134 if (unix_writable(sk)) 3135 mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND; 3136 3137 return mask; 3138 } 3139 3140 static __poll_t unix_dgram_poll(struct file *file, struct socket *sock, 3141 poll_table *wait) 3142 { 3143 struct sock *sk = sock->sk, *other; 3144 unsigned int writable; 3145 __poll_t mask; 3146 u8 shutdown; 3147 3148 sock_poll_wait(file, sock, wait); 3149 mask = 0; 3150 shutdown = READ_ONCE(sk->sk_shutdown); 3151 3152 /* exceptional events? */ 3153 if (READ_ONCE(sk->sk_err) || 3154 !skb_queue_empty_lockless(&sk->sk_error_queue)) 3155 mask |= EPOLLERR | 3156 (sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0); 3157 3158 if (shutdown & RCV_SHUTDOWN) 3159 mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM; 3160 if (shutdown == SHUTDOWN_MASK) 3161 mask |= EPOLLHUP; 3162 3163 /* readable? */ 3164 if (!skb_queue_empty_lockless(&sk->sk_receive_queue)) 3165 mask |= EPOLLIN | EPOLLRDNORM; 3166 if (sk_is_readable(sk)) 3167 mask |= EPOLLIN | EPOLLRDNORM; 3168 3169 /* Connection-based sockets need to check for termination and startup */ 3170 if (sk->sk_type == SOCK_SEQPACKET) { 3171 if (sk->sk_state == TCP_CLOSE) 3172 mask |= EPOLLHUP; 3173 /* connection hasn't started yet? */ 3174 if (sk->sk_state == TCP_SYN_SENT) 3175 return mask; 3176 } 3177 3178 /* No write status requested, avoid expensive OUT tests.
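 * (the writability check below takes the unix state lock and looks at
 * the peer's receive queue).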
*/ 3179 if (!(poll_requested_events(wait) & (EPOLLWRBAND|EPOLLWRNORM|EPOLLOUT))) 3180 return mask; 3181 3182 writable = unix_writable(sk); 3183 if (writable) { 3184 unix_state_lock(sk); 3185 3186 other = unix_peer(sk); 3187 if (other && unix_peer(other) != sk && 3188 unix_recvq_full_lockless(other) && 3189 unix_dgram_peer_wake_me(sk, other)) 3190 writable = 0; 3191 3192 unix_state_unlock(sk); 3193 } 3194 3195 if (writable) 3196 mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND; 3197 else 3198 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); 3199 3200 return mask; 3201 } 3202 3203 #ifdef CONFIG_PROC_FS 3204 3205 #define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1) 3206 3207 #define get_bucket(x) ((x) >> BUCKET_SPACE) 3208 #define get_offset(x) ((x) & ((1UL << BUCKET_SPACE) - 1)) 3209 #define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o)) 3210 3211 static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos) 3212 { 3213 unsigned long offset = get_offset(*pos); 3214 unsigned long bucket = get_bucket(*pos); 3215 unsigned long count = 0; 3216 struct sock *sk; 3217 3218 for (sk = sk_head(&seq_file_net(seq)->unx.table.buckets[bucket]); 3219 sk; sk = sk_next(sk)) { 3220 if (++count == offset) 3221 break; 3222 } 3223 3224 return sk; 3225 } 3226 3227 static struct sock *unix_get_first(struct seq_file *seq, loff_t *pos) 3228 { 3229 unsigned long bucket = get_bucket(*pos); 3230 struct net *net = seq_file_net(seq); 3231 struct sock *sk; 3232 3233 while (bucket < UNIX_HASH_SIZE) { 3234 spin_lock(&net->unx.table.locks[bucket]); 3235 3236 sk = unix_from_bucket(seq, pos); 3237 if (sk) 3238 return sk; 3239 3240 spin_unlock(&net->unx.table.locks[bucket]); 3241 3242 *pos = set_bucket_offset(++bucket, 1); 3243 } 3244 3245 return NULL; 3246 } 3247 3248 static struct sock *unix_get_next(struct seq_file *seq, struct sock *sk, 3249 loff_t *pos) 3250 { 3251 unsigned long bucket = get_bucket(*pos); 3252 3253 sk = sk_next(sk); 3254 if (sk) 3255 return sk; 3256 3257 3258 spin_unlock(&seq_file_net(seq)->unx.table.locks[bucket]); 3259 3260 *pos = set_bucket_offset(++bucket, 1); 3261 3262 return unix_get_first(seq, pos); 3263 } 3264 3265 static void *unix_seq_start(struct seq_file *seq, loff_t *pos) 3266 { 3267 if (!*pos) 3268 return SEQ_START_TOKEN; 3269 3270 return unix_get_first(seq, pos); 3271 } 3272 3273 static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos) 3274 { 3275 ++*pos; 3276 3277 if (v == SEQ_START_TOKEN) 3278 return unix_get_first(seq, pos); 3279 3280 return unix_get_next(seq, v, pos); 3281 } 3282 3283 static void unix_seq_stop(struct seq_file *seq, void *v) 3284 { 3285 struct sock *sk = v; 3286 3287 if (sk) 3288 spin_unlock(&seq_file_net(seq)->unx.table.locks[sk->sk_hash]); 3289 } 3290 3291 static int unix_seq_show(struct seq_file *seq, void *v) 3292 { 3293 3294 if (v == SEQ_START_TOKEN) 3295 seq_puts(seq, "Num RefCount Protocol Flags Type St " 3296 "Inode Path\n"); 3297 else { 3298 struct sock *s = v; 3299 struct unix_sock *u = unix_sk(s); 3300 unix_state_lock(s); 3301 3302 seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu", 3303 s, 3304 refcount_read(&s->sk_refcnt), 3305 0, 3306 s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0, 3307 s->sk_type, 3308 s->sk_socket ? 3309 (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) : 3310 (s->sk_state == TCP_ESTABLISHED ? 
SS_CONNECTING : SS_DISCONNECTING), 3311 sock_i_ino(s)); 3312 3313 if (u->addr) { // under a hash table lock here 3314 int i, len; 3315 seq_putc(seq, ' '); 3316 3317 i = 0; 3318 len = u->addr->len - 3319 offsetof(struct sockaddr_un, sun_path); 3320 if (u->addr->name->sun_path[0]) { 3321 len--; 3322 } else { 3323 seq_putc(seq, '@'); 3324 i++; 3325 } 3326 for ( ; i < len; i++) 3327 seq_putc(seq, u->addr->name->sun_path[i] ?: 3328 '@'); 3329 } 3330 unix_state_unlock(s); 3331 seq_putc(seq, '\n'); 3332 } 3333 3334 return 0; 3335 } 3336 3337 static const struct seq_operations unix_seq_ops = { 3338 .start = unix_seq_start, 3339 .next = unix_seq_next, 3340 .stop = unix_seq_stop, 3341 .show = unix_seq_show, 3342 }; 3343 3344 #ifdef CONFIG_BPF_SYSCALL 3345 struct bpf_unix_iter_state { 3346 struct seq_net_private p; 3347 unsigned int cur_sk; 3348 unsigned int end_sk; 3349 unsigned int max_sk; 3350 struct sock **batch; 3351 bool st_bucket_done; 3352 }; 3353 3354 struct bpf_iter__unix { 3355 __bpf_md_ptr(struct bpf_iter_meta *, meta); 3356 __bpf_md_ptr(struct unix_sock *, unix_sk); 3357 uid_t uid __aligned(8); 3358 }; 3359 3360 static int unix_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta, 3361 struct unix_sock *unix_sk, uid_t uid) 3362 { 3363 struct bpf_iter__unix ctx; 3364 3365 meta->seq_num--; /* skip SEQ_START_TOKEN */ 3366 ctx.meta = meta; 3367 ctx.unix_sk = unix_sk; 3368 ctx.uid = uid; 3369 return bpf_iter_run_prog(prog, &ctx); 3370 } 3371 3372 static int bpf_iter_unix_hold_batch(struct seq_file *seq, struct sock *start_sk) 3373 3374 { 3375 struct bpf_unix_iter_state *iter = seq->private; 3376 unsigned int expected = 1; 3377 struct sock *sk; 3378 3379 sock_hold(start_sk); 3380 iter->batch[iter->end_sk++] = start_sk; 3381 3382 for (sk = sk_next(start_sk); sk; sk = sk_next(sk)) { 3383 if (iter->end_sk < iter->max_sk) { 3384 sock_hold(sk); 3385 iter->batch[iter->end_sk++] = sk; 3386 } 3387 3388 expected++; 3389 } 3390 3391 spin_unlock(&seq_file_net(seq)->unx.table.locks[start_sk->sk_hash]); 3392 3393 return expected; 3394 } 3395 3396 static void bpf_iter_unix_put_batch(struct bpf_unix_iter_state *iter) 3397 { 3398 while (iter->cur_sk < iter->end_sk) 3399 sock_put(iter->batch[iter->cur_sk++]); 3400 } 3401 3402 static int bpf_iter_unix_realloc_batch(struct bpf_unix_iter_state *iter, 3403 unsigned int new_batch_sz) 3404 { 3405 struct sock **new_batch; 3406 3407 new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz, 3408 GFP_USER | __GFP_NOWARN); 3409 if (!new_batch) 3410 return -ENOMEM; 3411 3412 bpf_iter_unix_put_batch(iter); 3413 kvfree(iter->batch); 3414 iter->batch = new_batch; 3415 iter->max_sk = new_batch_sz; 3416 3417 return 0; 3418 } 3419 3420 static struct sock *bpf_iter_unix_batch(struct seq_file *seq, 3421 loff_t *pos) 3422 { 3423 struct bpf_unix_iter_state *iter = seq->private; 3424 unsigned int expected; 3425 bool resized = false; 3426 struct sock *sk; 3427 3428 if (iter->st_bucket_done) 3429 *pos = set_bucket_offset(get_bucket(*pos) + 1, 1); 3430 3431 again: 3432 /* Get a new batch */ 3433 iter->cur_sk = 0; 3434 iter->end_sk = 0; 3435 3436 sk = unix_get_first(seq, pos); 3437 if (!sk) 3438 return NULL; /* Done */ 3439 3440 expected = bpf_iter_unix_hold_batch(seq, sk); 3441 3442 if (iter->end_sk == expected) { 3443 iter->st_bucket_done = true; 3444 return sk; 3445 } 3446 3447 if (!resized && !bpf_iter_unix_realloc_batch(iter, expected * 3 / 2)) { 3448 resized = true; 3449 goto again; 3450 } 3451 3452 return sk; 3453 } 3454 3455 static void *bpf_iter_unix_seq_start(struct 
seq_file *seq, loff_t *pos) 3456 { 3457 if (!*pos) 3458 return SEQ_START_TOKEN; 3459 3460 /* bpf iter does not support lseek, so it always 3461 * continue from where it was stop()-ped. 3462 */ 3463 return bpf_iter_unix_batch(seq, pos); 3464 } 3465 3466 static void *bpf_iter_unix_seq_next(struct seq_file *seq, void *v, loff_t *pos) 3467 { 3468 struct bpf_unix_iter_state *iter = seq->private; 3469 struct sock *sk; 3470 3471 /* Whenever seq_next() is called, the iter->cur_sk is 3472 * done with seq_show(), so advance to the next sk in 3473 * the batch. 3474 */ 3475 if (iter->cur_sk < iter->end_sk) 3476 sock_put(iter->batch[iter->cur_sk++]); 3477 3478 ++*pos; 3479 3480 if (iter->cur_sk < iter->end_sk) 3481 sk = iter->batch[iter->cur_sk]; 3482 else 3483 sk = bpf_iter_unix_batch(seq, pos); 3484 3485 return sk; 3486 } 3487 3488 static int bpf_iter_unix_seq_show(struct seq_file *seq, void *v) 3489 { 3490 struct bpf_iter_meta meta; 3491 struct bpf_prog *prog; 3492 struct sock *sk = v; 3493 uid_t uid; 3494 bool slow; 3495 int ret; 3496 3497 if (v == SEQ_START_TOKEN) 3498 return 0; 3499 3500 slow = lock_sock_fast(sk); 3501 3502 if (unlikely(sk_unhashed(sk))) { 3503 ret = SEQ_SKIP; 3504 goto unlock; 3505 } 3506 3507 uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk)); 3508 meta.seq = seq; 3509 prog = bpf_iter_get_info(&meta, false); 3510 ret = unix_prog_seq_show(prog, &meta, v, uid); 3511 unlock: 3512 unlock_sock_fast(sk, slow); 3513 return ret; 3514 } 3515 3516 static void bpf_iter_unix_seq_stop(struct seq_file *seq, void *v) 3517 { 3518 struct bpf_unix_iter_state *iter = seq->private; 3519 struct bpf_iter_meta meta; 3520 struct bpf_prog *prog; 3521 3522 if (!v) { 3523 meta.seq = seq; 3524 prog = bpf_iter_get_info(&meta, true); 3525 if (prog) 3526 (void)unix_prog_seq_show(prog, &meta, v, 0); 3527 } 3528 3529 if (iter->cur_sk < iter->end_sk) 3530 bpf_iter_unix_put_batch(iter); 3531 } 3532 3533 static const struct seq_operations bpf_iter_unix_seq_ops = { 3534 .start = bpf_iter_unix_seq_start, 3535 .next = bpf_iter_unix_seq_next, 3536 .stop = bpf_iter_unix_seq_stop, 3537 .show = bpf_iter_unix_seq_show, 3538 }; 3539 #endif 3540 #endif 3541 3542 static const struct net_proto_family unix_family_ops = { 3543 .family = PF_UNIX, 3544 .create = unix_create, 3545 .owner = THIS_MODULE, 3546 }; 3547 3548 3549 static int __net_init unix_net_init(struct net *net) 3550 { 3551 int i; 3552 3553 net->unx.sysctl_max_dgram_qlen = 10; 3554 if (unix_sysctl_register(net)) 3555 goto out; 3556 3557 #ifdef CONFIG_PROC_FS 3558 if (!proc_create_net("unix", 0, net->proc_net, &unix_seq_ops, 3559 sizeof(struct seq_net_private))) 3560 goto err_sysctl; 3561 #endif 3562 3563 net->unx.table.locks = kvmalloc_array(UNIX_HASH_SIZE, 3564 sizeof(spinlock_t), GFP_KERNEL); 3565 if (!net->unx.table.locks) 3566 goto err_proc; 3567 3568 net->unx.table.buckets = kvmalloc_array(UNIX_HASH_SIZE, 3569 sizeof(struct hlist_head), 3570 GFP_KERNEL); 3571 if (!net->unx.table.buckets) 3572 goto free_locks; 3573 3574 for (i = 0; i < UNIX_HASH_SIZE; i++) { 3575 spin_lock_init(&net->unx.table.locks[i]); 3576 INIT_HLIST_HEAD(&net->unx.table.buckets[i]); 3577 } 3578 3579 return 0; 3580 3581 free_locks: 3582 kvfree(net->unx.table.locks); 3583 err_proc: 3584 #ifdef CONFIG_PROC_FS 3585 remove_proc_entry("unix", net->proc_net); 3586 err_sysctl: 3587 #endif 3588 unix_sysctl_unregister(net); 3589 out: 3590 return -ENOMEM; 3591 } 3592 3593 static void __net_exit unix_net_exit(struct net *net) 3594 { 3595 kvfree(net->unx.table.buckets); 3596 
kvfree(net->unx.table.locks); 3597 unix_sysctl_unregister(net); 3598 remove_proc_entry("unix", net->proc_net); 3599 } 3600 3601 static struct pernet_operations unix_net_ops = { 3602 .init = unix_net_init, 3603 .exit = unix_net_exit, 3604 }; 3605 3606 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) 3607 DEFINE_BPF_ITER_FUNC(unix, struct bpf_iter_meta *meta, 3608 struct unix_sock *unix_sk, uid_t uid) 3609 3610 #define INIT_BATCH_SZ 16 3611 3612 static int bpf_iter_init_unix(void *priv_data, struct bpf_iter_aux_info *aux) 3613 { 3614 struct bpf_unix_iter_state *iter = priv_data; 3615 int err; 3616 3617 err = bpf_iter_init_seq_net(priv_data, aux); 3618 if (err) 3619 return err; 3620 3621 err = bpf_iter_unix_realloc_batch(iter, INIT_BATCH_SZ); 3622 if (err) { 3623 bpf_iter_fini_seq_net(priv_data); 3624 return err; 3625 } 3626 3627 return 0; 3628 } 3629 3630 static void bpf_iter_fini_unix(void *priv_data) 3631 { 3632 struct bpf_unix_iter_state *iter = priv_data; 3633 3634 bpf_iter_fini_seq_net(priv_data); 3635 kvfree(iter->batch); 3636 } 3637 3638 static const struct bpf_iter_seq_info unix_seq_info = { 3639 .seq_ops = &bpf_iter_unix_seq_ops, 3640 .init_seq_private = bpf_iter_init_unix, 3641 .fini_seq_private = bpf_iter_fini_unix, 3642 .seq_priv_size = sizeof(struct bpf_unix_iter_state), 3643 }; 3644 3645 static const struct bpf_func_proto * 3646 bpf_iter_unix_get_func_proto(enum bpf_func_id func_id, 3647 const struct bpf_prog *prog) 3648 { 3649 switch (func_id) { 3650 case BPF_FUNC_setsockopt: 3651 return &bpf_sk_setsockopt_proto; 3652 case BPF_FUNC_getsockopt: 3653 return &bpf_sk_getsockopt_proto; 3654 default: 3655 return NULL; 3656 } 3657 } 3658 3659 static struct bpf_iter_reg unix_reg_info = { 3660 .target = "unix", 3661 .ctx_arg_info_size = 1, 3662 .ctx_arg_info = { 3663 { offsetof(struct bpf_iter__unix, unix_sk), 3664 PTR_TO_BTF_ID_OR_NULL }, 3665 }, 3666 .get_func_proto = bpf_iter_unix_get_func_proto, 3667 .seq_info = &unix_seq_info, 3668 }; 3669 3670 static void __init bpf_iter_register(void) 3671 { 3672 unix_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_UNIX]; 3673 if (bpf_iter_reg_target(&unix_reg_info)) 3674 pr_warn("Warning: could not register bpf iterator unix\n"); 3675 } 3676 #endif 3677 3678 static int __init af_unix_init(void) 3679 { 3680 int i, rc = -1; 3681 3682 BUILD_BUG_ON(sizeof(struct unix_skb_parms) > sizeof_field(struct sk_buff, cb)); 3683 3684 for (i = 0; i < UNIX_HASH_SIZE / 2; i++) { 3685 spin_lock_init(&bsd_socket_locks[i]); 3686 INIT_HLIST_HEAD(&bsd_socket_buckets[i]); 3687 } 3688 3689 rc = proto_register(&unix_dgram_proto, 1); 3690 if (rc != 0) { 3691 pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__); 3692 goto out; 3693 } 3694 3695 rc = proto_register(&unix_stream_proto, 1); 3696 if (rc != 0) { 3697 pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__); 3698 proto_unregister(&unix_dgram_proto); 3699 goto out; 3700 } 3701 3702 sock_register(&unix_family_ops); 3703 register_pernet_subsys(&unix_net_ops); 3704 unix_bpf_build_proto(); 3705 3706 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) 3707 bpf_iter_register(); 3708 #endif 3709 3710 out: 3711 return rc; 3712 } 3713 3714 /* Later than subsys_initcall() because we depend on stuff initialised there */ 3715 fs_initcall(af_unix_init); 3716