// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * NET4:	Implementation of BSD Unix domain sockets.
 *
 * Authors:	Alan Cox, <alan@lxorguk.ukuu.org.uk>
 *
 * Fixes:
 *		Linus Torvalds	:	Assorted bug cures.
 *		Niibe Yutaka	:	async I/O support.
 *		Carsten Paeth	:	PF_UNIX check, address fixes.
 *		Alan Cox	:	Limit size of allocated blocks.
 *		Alan Cox	:	Fixed the stupid socketpair bug.
 *		Alan Cox	:	BSD compatibility fine tuning.
 *		Alan Cox	:	Fixed a bug in connect when interrupted.
 *		Alan Cox	:	Sorted out a proper draft version of
 *					file descriptor passing hacked up from
 *					Mike Shaver's work.
 *		Marty Leisner	:	Fixes to fd passing.
 *		Nick Nevin	:	recvmsg bugfix.
 *		Alan Cox	:	Started proper garbage collector.
 *		Heiko Eißfeldt	:	Missing verify_area check.
 *		Alan Cox	:	Started POSIXisms.
 *		Andreas Schwab	:	Replace inode by dentry for proper
 *					reference counting.
 *		Kirk Petersen	:	Made this a module.
 *		Christoph Rohland :	Elegant non-blocking accept/connect
 *					algorithm. Lots of bug fixes.
 *		Alexey Kuznetsov :	Repaired (I hope) bugs introduced
 *					by the above two patches.
 *		Andrea Arcangeli :	If possible we block in connect(2)
 *					if the max backlog of the listen socket
 *					has been reached. This won't break
 *					old apps and it will avoid a huge amount
 *					of socks being hashed (this is for
 *					unix_gc() performance reasons).
 *					Security fix that limits the max
 *					number of socks to 2*max_files and
 *					the number of skbs queueable in the
 *					dgram receiver.
 *		Artur Skawina	:	Hash function optimizations.
 *		Alexey Kuznetsov :	Full scale SMP. Lots of bugs are introduced 8)
 *		Malcolm Beattie	:	Set peercred for socketpair.
 *		Michal Ostrowski :	Module initialization cleanup.
 *		Arnaldo C. Melo	:	Remove MOD_{INC,DEC}_USE_COUNT,
 *					the core infrastructure is doing that
 *					for all net proto families now (2.5.69+).
 *
 * Known differences from reference BSD that was tested:
 *
 *	[TO FIX]
 *	ECONNREFUSED is not returned from one end of a connected() socket to the
 *		other the moment one end closes.
 *	fstat() doesn't return st_dev=0, and gives the blksize as high water mark
 *		and a fake inode identifier (nor the BSD first socket fstat twice bug).
 *	[NOT TO FIX]
 *	accept() returns a path name even if the connecting socket has closed
 *		in the meantime (BSD loses the path and gives up).
 *	accept() returns 0 length path for an unbound connector. BSD returns 16
 *		and a null first byte in the path (but not for gethost/peername - BSD bug??)
 *	socketpair(...SOCK_RAW..) doesn't panic the kernel.
 *	BSD af_unix apparently has connect forgetting to block properly.
 *		(need to check this with the POSIX spec in detail)
 *
 * Differences from 2.0.0-11-... (ANK)
 *	Bug fixes and improvements.
 *		- client shutdown killed server socket.
 *		- removed all useless cli/sti pairs.
 *
 *	Semantic changes/extensions.
 *		- generic control message passing.
 *		- SCM_CREDENTIALS control message.
 *		- "Abstract" (not FS-based) socket bindings.
 *		  Abstract names are sequences of bytes (not zero terminated)
 *		  starting with a 0 byte, so that this name space does not
 *		  intersect with BSD names.
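 *
 *		  Purely as an illustrative userspace sketch (the descriptor
 *		  "fd" and the name "\0example" here are arbitrary, not part
 *		  of any API), binding into the abstract namespace looks like:
 *
 *			struct sockaddr_un a = { .sun_family = AF_UNIX };
 *
 *			memcpy(a.sun_path, "\0example", 8);
 *			bind(fd, (struct sockaddr *)&a,
 *			     offsetof(struct sockaddr_un, sun_path) + 8);
 *
 *		  The address length passed to bind() delimits the abstract
 *		  name; there is no terminating NUL.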
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/signal.h>
#include <linux/sched/signal.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/stat.h>
#include <linux/dcache.h>
#include <linux/namei.h>
#include <linux/socket.h>
#include <linux/un.h>
#include <linux/fcntl.h>
#include <linux/filter.h>
#include <linux/termios.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/in.h>
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <net/net_namespace.h>
#include <net/sock.h>
#include <net/tcp_states.h>
#include <net/af_unix.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <net/scm.h>
#include <linux/init.h>
#include <linux/poll.h>
#include <linux/rtnetlink.h>
#include <linux/mount.h>
#include <net/checksum.h>
#include <linux/security.h>
#include <linux/splice.h>
#include <linux/freezer.h>
#include <linux/file.h>
#include <linux/btf_ids.h>
#include <linux/bpf-cgroup.h>

#include "scm.h"

static atomic_long_t unix_nr_socks;
static struct hlist_head bsd_socket_buckets[UNIX_HASH_SIZE / 2];
static spinlock_t bsd_socket_locks[UNIX_HASH_SIZE / 2];

/* SMP locking strategy:
 *    the hash table is protected with a spinlock.
 *    each socket's state is protected by a separate spinlock.
 */

static unsigned int unix_unbound_hash(struct sock *sk)
{
	unsigned long hash = (unsigned long)sk;

	hash ^= hash >> 16;
	hash ^= hash >> 8;
	hash ^= sk->sk_type;

	return hash & UNIX_HASH_MOD;
}

static unsigned int unix_bsd_hash(struct inode *i)
{
	return i->i_ino & UNIX_HASH_MOD;
}

static unsigned int unix_abstract_hash(struct sockaddr_un *sunaddr,
				       int addr_len, int type)
{
	__wsum csum = csum_partial(sunaddr, addr_len, 0);
	unsigned int hash;

	hash = (__force unsigned int)csum_fold(csum);
	hash ^= hash >> 8;
	hash ^= type;

	return UNIX_HASH_MOD + 1 + (hash & UNIX_HASH_MOD);
}

static void unix_table_double_lock(struct net *net,
				   unsigned int hash1, unsigned int hash2)
{
	if (hash1 == hash2) {
		spin_lock(&net->unx.table.locks[hash1]);
		return;
	}

	if (hash1 > hash2)
		swap(hash1, hash2);

	spin_lock(&net->unx.table.locks[hash1]);
	spin_lock_nested(&net->unx.table.locks[hash2], SINGLE_DEPTH_NESTING);
}

static void unix_table_double_unlock(struct net *net,
				     unsigned int hash1, unsigned int hash2)
{
	if (hash1 == hash2) {
		spin_unlock(&net->unx.table.locks[hash1]);
		return;
	}

	spin_unlock(&net->unx.table.locks[hash1]);
	spin_unlock(&net->unx.table.locks[hash2]);
}

#ifdef CONFIG_SECURITY_NETWORK
static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{
	UNIXCB(skb).secid = scm->secid;
}

static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{
	scm->secid = UNIXCB(skb).secid;
}

static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
{
	return (scm->secid == UNIXCB(skb).secid);
}
#else
static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{ }

static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{ }

static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
{
	return true;
}
#endif /* CONFIG_SECURITY_NETWORK */

static inline int unix_our_peer(struct sock *sk, struct sock *osk)
{
	return unix_peer(osk) == sk;
}

static inline int unix_may_send(struct sock *sk, struct sock *osk)
{
	return unix_peer(osk) == NULL || unix_our_peer(sk, osk);
}

static inline int unix_recvq_full(const struct sock *sk)
{
	return skb_queue_len(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
}

static inline int unix_recvq_full_lockless(const struct sock *sk)
{
	return skb_queue_len_lockless(&sk->sk_receive_queue) >
		READ_ONCE(sk->sk_max_ack_backlog);
}

struct sock *unix_peer_get(struct sock *s)
{
	struct sock *peer;

	unix_state_lock(s);
	peer = unix_peer(s);
	if (peer)
		sock_hold(peer);
	unix_state_unlock(s);
	return peer;
}
EXPORT_SYMBOL_GPL(unix_peer_get);

static struct unix_address *unix_create_addr(struct sockaddr_un *sunaddr,
					     int addr_len)
{
	struct unix_address *addr;

	addr = kmalloc(sizeof(*addr) + addr_len, GFP_KERNEL);
	if (!addr)
		return NULL;

	refcount_set(&addr->refcnt, 1);
	addr->len = addr_len;
	memcpy(addr->name, sunaddr, addr_len);

	return addr;
}

static inline void unix_release_addr(struct unix_address *addr)
{
	if (refcount_dec_and_test(&addr->refcnt))
		kfree(addr);
}

/*
 *	Check unix socket name:
 *		- it should not be zero length.
 *		- if it does not start with a zero byte, it should be NUL
 *		  terminated (an FS object).
 *		- if it starts with a zero byte, it is an abstract name.
 */

static int unix_validate_addr(struct sockaddr_un *sunaddr, int addr_len)
{
	if (addr_len <= offsetof(struct sockaddr_un, sun_path) ||
	    addr_len > sizeof(*sunaddr))
		return -EINVAL;

	if (sunaddr->sun_family != AF_UNIX)
		return -EINVAL;

	return 0;
}

static int unix_mkname_bsd(struct sockaddr_un *sunaddr, int addr_len)
{
	struct sockaddr_storage *addr = (struct sockaddr_storage *)sunaddr;
	short offset = offsetof(struct sockaddr_storage, __data);

	BUILD_BUG_ON(offset != offsetof(struct sockaddr_un, sun_path));

	/* This may look like an off by one error but it is a bit more
	 * subtle. 108 is the longest valid AF_UNIX path for a binding.
	 * sun_path[108] doesn't as such exist. However in kernel space
	 * we are guaranteed that it is a valid memory location in our
	 * kernel address buffer because syscall functions always pass
	 * a pointer of struct sockaddr_storage which has a bigger buffer
	 * than 108. Also, we must terminate sun_path for strlen() in
	 * getname_kernel().
	 */
	addr->__data[addr_len - offset] = 0;

	/* Don't pass sunaddr->sun_path to strlen(). Otherwise, 108 will
	 * cause panic if CONFIG_FORTIFY_SOURCE=y. Let __fortify_strlen()
	 * know the actual buffer.
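	 *
	 * A worked example (hypothetical input, for illustration only): if
	 * the caller passed "/tmp/foo" (8 bytes, no trailing NUL) so that
	 * addr_len == offset + 8, we store a NUL at __data[8], strlen()
	 * returns 8, and the function reports offset + 9, i.e. the address
	 * length including the terminating NUL just added.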
	 */
	return strlen(addr->__data) + offset + 1;
}

static void __unix_remove_socket(struct sock *sk)
{
	sk_del_node_init(sk);
}

static void __unix_insert_socket(struct net *net, struct sock *sk)
{
	DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
	sk_add_node(sk, &net->unx.table.buckets[sk->sk_hash]);
}

static void __unix_set_addr_hash(struct net *net, struct sock *sk,
				 struct unix_address *addr, unsigned int hash)
{
	__unix_remove_socket(sk);
	smp_store_release(&unix_sk(sk)->addr, addr);

	sk->sk_hash = hash;
	__unix_insert_socket(net, sk);
}

static void unix_remove_socket(struct net *net, struct sock *sk)
{
	spin_lock(&net->unx.table.locks[sk->sk_hash]);
	__unix_remove_socket(sk);
	spin_unlock(&net->unx.table.locks[sk->sk_hash]);
}

static void unix_insert_unbound_socket(struct net *net, struct sock *sk)
{
	spin_lock(&net->unx.table.locks[sk->sk_hash]);
	__unix_insert_socket(net, sk);
	spin_unlock(&net->unx.table.locks[sk->sk_hash]);
}

static void unix_insert_bsd_socket(struct sock *sk)
{
	spin_lock(&bsd_socket_locks[sk->sk_hash]);
	sk_add_bind_node(sk, &bsd_socket_buckets[sk->sk_hash]);
	spin_unlock(&bsd_socket_locks[sk->sk_hash]);
}

static void unix_remove_bsd_socket(struct sock *sk)
{
	if (!hlist_unhashed(&sk->sk_bind_node)) {
		spin_lock(&bsd_socket_locks[sk->sk_hash]);
		__sk_del_bind_node(sk);
		spin_unlock(&bsd_socket_locks[sk->sk_hash]);

		sk_node_init(&sk->sk_bind_node);
	}
}

static struct sock *__unix_find_socket_byname(struct net *net,
					      struct sockaddr_un *sunname,
					      int len, unsigned int hash)
{
	struct sock *s;

	sk_for_each(s, &net->unx.table.buckets[hash]) {
		struct unix_sock *u = unix_sk(s);

		if (u->addr->len == len &&
		    !memcmp(u->addr->name, sunname, len))
			return s;
	}
	return NULL;
}

static inline struct sock *unix_find_socket_byname(struct net *net,
						    struct sockaddr_un *sunname,
						    int len, unsigned int hash)
{
	struct sock *s;

	spin_lock(&net->unx.table.locks[hash]);
	s = __unix_find_socket_byname(net, sunname, len, hash);
	if (s)
		sock_hold(s);
	spin_unlock(&net->unx.table.locks[hash]);
	return s;
}

static struct sock *unix_find_socket_byinode(struct inode *i)
{
	unsigned int hash = unix_bsd_hash(i);
	struct sock *s;

	spin_lock(&bsd_socket_locks[hash]);
	sk_for_each_bound(s, &bsd_socket_buckets[hash]) {
		struct dentry *dentry = unix_sk(s)->path.dentry;

		if (dentry && d_backing_inode(dentry) == i) {
			sock_hold(s);
			spin_unlock(&bsd_socket_locks[hash]);
			return s;
		}
	}
	spin_unlock(&bsd_socket_locks[hash]);
	return NULL;
}

/* Support code for asymmetrically connected dgram sockets
 *
 * If a datagram socket is connected to a socket not itself connected
 * to the first socket (e.g. /dev/log), clients may only enqueue more
 * messages if the present receive queue of the server socket is not
 * "too large". This means there's a second writability condition
 * poll and sendmsg need to test. The dgram recv code will do a wake
 * up on the peer_wait wait queue of a socket upon reception of a
 * datagram, and this needs to be propagated to sleeping would-be
 * writers since these might not have sent anything so far. This can't
 * be accomplished via poll_wait because the lifetime of the server
 * socket might be less than that of its clients if these break their
 * association with it or if the server socket is closed while clients
 * are still connected to it, and there's no way to inform "a polling
 * implementation" that it should let go of a certain wait queue.
 *
 * In order to propagate a wake up, a wait_queue_entry_t of the client
 * socket is enqueued on the peer_wait queue of the server socket
 * whose wake function does a wake_up on the ordinary client socket
 * wait queue. This connection is established whenever a write (or
 * poll for write) hits the flow control condition and is broken when
 * the association to the server socket is dissolved or after a wake
 * up was relayed.
 */

static int unix_dgram_peer_wake_relay(wait_queue_entry_t *q, unsigned mode, int flags,
				      void *key)
{
	struct unix_sock *u;
	wait_queue_head_t *u_sleep;

	u = container_of(q, struct unix_sock, peer_wake);

	__remove_wait_queue(&unix_sk(u->peer_wake.private)->peer_wait,
			    q);
	u->peer_wake.private = NULL;

	/* relaying can only happen while the wq still exists */
	u_sleep = sk_sleep(&u->sk);
	if (u_sleep)
		wake_up_interruptible_poll(u_sleep, key_to_poll(key));

	return 0;
}

static int unix_dgram_peer_wake_connect(struct sock *sk, struct sock *other)
{
	struct unix_sock *u, *u_other;
	int rc;

	u = unix_sk(sk);
	u_other = unix_sk(other);
	rc = 0;
	spin_lock(&u_other->peer_wait.lock);

	if (!u->peer_wake.private) {
		u->peer_wake.private = other;
		__add_wait_queue(&u_other->peer_wait, &u->peer_wake);

		rc = 1;
	}

	spin_unlock(&u_other->peer_wait.lock);
	return rc;
}

static void unix_dgram_peer_wake_disconnect(struct sock *sk,
					    struct sock *other)
{
	struct unix_sock *u, *u_other;

	u = unix_sk(sk);
	u_other = unix_sk(other);
	spin_lock(&u_other->peer_wait.lock);

	if (u->peer_wake.private == other) {
		__remove_wait_queue(&u_other->peer_wait, &u->peer_wake);
		u->peer_wake.private = NULL;
	}

	spin_unlock(&u_other->peer_wait.lock);
}

static void unix_dgram_peer_wake_disconnect_wakeup(struct sock *sk,
						   struct sock *other)
{
	unix_dgram_peer_wake_disconnect(sk, other);
	wake_up_interruptible_poll(sk_sleep(sk),
				   EPOLLOUT |
				   EPOLLWRNORM |
				   EPOLLWRBAND);
}

/* preconditions:
 *	- unix_peer(sk) == other
 *	- association is stable
 */
static int unix_dgram_peer_wake_me(struct sock *sk, struct sock *other)
{
	int connected;

	connected = unix_dgram_peer_wake_connect(sk, other);

	/* If other is SOCK_DEAD, we want to make sure we signal
	 * POLLOUT, such that a subsequent write() can get a
	 * -ECONNREFUSED. Otherwise, if we haven't queued any skbs
	 * to other and it's full, we will hang waiting for POLLOUT.
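	 *
	 * (In other words: returning 1 below means the peer's queue is
	 * still full and the peer is alive, so the caller should report
	 * the socket as not writable and rely on the relayed wakeup;
	 * returning 0 means writability can be signalled, and the relay
	 * entry added by this call, if it was newly added, is removed
	 * again.)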
525 */ 526 if (unix_recvq_full_lockless(other) && !sock_flag(other, SOCK_DEAD)) 527 return 1; 528 529 if (connected) 530 unix_dgram_peer_wake_disconnect(sk, other); 531 532 return 0; 533 } 534 535 static int unix_writable(const struct sock *sk) 536 { 537 return sk->sk_state != TCP_LISTEN && 538 (refcount_read(&sk->sk_wmem_alloc) << 2) <= sk->sk_sndbuf; 539 } 540 541 static void unix_write_space(struct sock *sk) 542 { 543 struct socket_wq *wq; 544 545 rcu_read_lock(); 546 if (unix_writable(sk)) { 547 wq = rcu_dereference(sk->sk_wq); 548 if (skwq_has_sleeper(wq)) 549 wake_up_interruptible_sync_poll(&wq->wait, 550 EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND); 551 sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); 552 } 553 rcu_read_unlock(); 554 } 555 556 /* When dgram socket disconnects (or changes its peer), we clear its receive 557 * queue of packets arrived from previous peer. First, it allows to do 558 * flow control based only on wmem_alloc; second, sk connected to peer 559 * may receive messages only from that peer. */ 560 static void unix_dgram_disconnected(struct sock *sk, struct sock *other) 561 { 562 if (!skb_queue_empty(&sk->sk_receive_queue)) { 563 skb_queue_purge(&sk->sk_receive_queue); 564 wake_up_interruptible_all(&unix_sk(sk)->peer_wait); 565 566 /* If one link of bidirectional dgram pipe is disconnected, 567 * we signal error. Messages are lost. Do not make this, 568 * when peer was not connected to us. 569 */ 570 if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) { 571 WRITE_ONCE(other->sk_err, ECONNRESET); 572 sk_error_report(other); 573 } 574 } 575 other->sk_state = TCP_CLOSE; 576 } 577 578 static void unix_sock_destructor(struct sock *sk) 579 { 580 struct unix_sock *u = unix_sk(sk); 581 582 skb_queue_purge(&sk->sk_receive_queue); 583 584 DEBUG_NET_WARN_ON_ONCE(refcount_read(&sk->sk_wmem_alloc)); 585 DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk)); 586 DEBUG_NET_WARN_ON_ONCE(sk->sk_socket); 587 if (!sock_flag(sk, SOCK_DEAD)) { 588 pr_info("Attempt to release alive unix socket: %p\n", sk); 589 return; 590 } 591 592 if (u->addr) 593 unix_release_addr(u->addr); 594 595 atomic_long_dec(&unix_nr_socks); 596 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); 597 #ifdef UNIX_REFCNT_DEBUG 598 pr_debug("UNIX %p is destroyed, %ld are still alive.\n", sk, 599 atomic_long_read(&unix_nr_socks)); 600 #endif 601 } 602 603 static void unix_release_sock(struct sock *sk, int embrion) 604 { 605 struct unix_sock *u = unix_sk(sk); 606 struct sock *skpair; 607 struct sk_buff *skb; 608 struct path path; 609 int state; 610 611 unix_remove_socket(sock_net(sk), sk); 612 unix_remove_bsd_socket(sk); 613 614 /* Clear state */ 615 unix_state_lock(sk); 616 sock_orphan(sk); 617 WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK); 618 path = u->path; 619 u->path.dentry = NULL; 620 u->path.mnt = NULL; 621 state = sk->sk_state; 622 sk->sk_state = TCP_CLOSE; 623 624 skpair = unix_peer(sk); 625 unix_peer(sk) = NULL; 626 627 unix_state_unlock(sk); 628 629 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 630 if (u->oob_skb) { 631 kfree_skb(u->oob_skb); 632 u->oob_skb = NULL; 633 } 634 #endif 635 636 wake_up_interruptible_all(&u->peer_wait); 637 638 if (skpair != NULL) { 639 if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) { 640 unix_state_lock(skpair); 641 /* No more writes */ 642 WRITE_ONCE(skpair->sk_shutdown, SHUTDOWN_MASK); 643 if (!skb_queue_empty(&sk->sk_receive_queue) || embrion) 644 WRITE_ONCE(skpair->sk_err, ECONNRESET); 645 unix_state_unlock(skpair); 646 skpair->sk_state_change(skpair); 647 sk_wake_async(skpair, 
SOCK_WAKE_WAITD, POLL_HUP); 648 } 649 650 unix_dgram_peer_wake_disconnect(sk, skpair); 651 sock_put(skpair); /* It may now die */ 652 } 653 654 /* Try to flush out this socket. Throw out buffers at least */ 655 656 while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) { 657 if (state == TCP_LISTEN) 658 unix_release_sock(skb->sk, 1); 659 /* passed fds are erased in the kfree_skb hook */ 660 UNIXCB(skb).consumed = skb->len; 661 kfree_skb(skb); 662 } 663 664 if (path.dentry) 665 path_put(&path); 666 667 sock_put(sk); 668 669 /* ---- Socket is dead now and most probably destroyed ---- */ 670 671 /* 672 * Fixme: BSD difference: In BSD all sockets connected to us get 673 * ECONNRESET and we die on the spot. In Linux we behave 674 * like files and pipes do and wait for the last 675 * dereference. 676 * 677 * Can't we simply set sock->err? 678 * 679 * What the above comment does talk about? --ANK(980817) 680 */ 681 682 if (READ_ONCE(unix_tot_inflight)) 683 unix_gc(); /* Garbage collect fds */ 684 } 685 686 static void init_peercred(struct sock *sk) 687 { 688 const struct cred *old_cred; 689 struct pid *old_pid; 690 691 spin_lock(&sk->sk_peer_lock); 692 old_pid = sk->sk_peer_pid; 693 old_cred = sk->sk_peer_cred; 694 sk->sk_peer_pid = get_pid(task_tgid(current)); 695 sk->sk_peer_cred = get_current_cred(); 696 spin_unlock(&sk->sk_peer_lock); 697 698 put_pid(old_pid); 699 put_cred(old_cred); 700 } 701 702 static void copy_peercred(struct sock *sk, struct sock *peersk) 703 { 704 const struct cred *old_cred; 705 struct pid *old_pid; 706 707 if (sk < peersk) { 708 spin_lock(&sk->sk_peer_lock); 709 spin_lock_nested(&peersk->sk_peer_lock, SINGLE_DEPTH_NESTING); 710 } else { 711 spin_lock(&peersk->sk_peer_lock); 712 spin_lock_nested(&sk->sk_peer_lock, SINGLE_DEPTH_NESTING); 713 } 714 old_pid = sk->sk_peer_pid; 715 old_cred = sk->sk_peer_cred; 716 sk->sk_peer_pid = get_pid(peersk->sk_peer_pid); 717 sk->sk_peer_cred = get_cred(peersk->sk_peer_cred); 718 719 spin_unlock(&sk->sk_peer_lock); 720 spin_unlock(&peersk->sk_peer_lock); 721 722 put_pid(old_pid); 723 put_cred(old_cred); 724 } 725 726 static int unix_listen(struct socket *sock, int backlog) 727 { 728 int err; 729 struct sock *sk = sock->sk; 730 struct unix_sock *u = unix_sk(sk); 731 732 err = -EOPNOTSUPP; 733 if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET) 734 goto out; /* Only stream/seqpacket sockets accept */ 735 err = -EINVAL; 736 if (!u->addr) 737 goto out; /* No listens on an unbound socket */ 738 unix_state_lock(sk); 739 if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN) 740 goto out_unlock; 741 if (backlog > sk->sk_max_ack_backlog) 742 wake_up_interruptible_all(&u->peer_wait); 743 sk->sk_max_ack_backlog = backlog; 744 sk->sk_state = TCP_LISTEN; 745 /* set credentials so connect can copy them */ 746 init_peercred(sk); 747 err = 0; 748 749 out_unlock: 750 unix_state_unlock(sk); 751 out: 752 return err; 753 } 754 755 static int unix_release(struct socket *); 756 static int unix_bind(struct socket *, struct sockaddr *, int); 757 static int unix_stream_connect(struct socket *, struct sockaddr *, 758 int addr_len, int flags); 759 static int unix_socketpair(struct socket *, struct socket *); 760 static int unix_accept(struct socket *, struct socket *, int, bool); 761 static int unix_getname(struct socket *, struct sockaddr *, int); 762 static __poll_t unix_poll(struct file *, struct socket *, poll_table *); 763 static __poll_t unix_dgram_poll(struct file *, struct socket *, 764 poll_table *); 765 static int 
unix_ioctl(struct socket *, unsigned int, unsigned long); 766 #ifdef CONFIG_COMPAT 767 static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg); 768 #endif 769 static int unix_shutdown(struct socket *, int); 770 static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t); 771 static int unix_stream_recvmsg(struct socket *, struct msghdr *, size_t, int); 772 static ssize_t unix_stream_splice_read(struct socket *, loff_t *ppos, 773 struct pipe_inode_info *, size_t size, 774 unsigned int flags); 775 static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t); 776 static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int); 777 static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor); 778 static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor); 779 static int unix_dgram_connect(struct socket *, struct sockaddr *, 780 int, int); 781 static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t); 782 static int unix_seqpacket_recvmsg(struct socket *, struct msghdr *, size_t, 783 int); 784 785 static int unix_set_peek_off(struct sock *sk, int val) 786 { 787 struct unix_sock *u = unix_sk(sk); 788 789 if (mutex_lock_interruptible(&u->iolock)) 790 return -EINTR; 791 792 WRITE_ONCE(sk->sk_peek_off, val); 793 mutex_unlock(&u->iolock); 794 795 return 0; 796 } 797 798 #ifdef CONFIG_PROC_FS 799 static int unix_count_nr_fds(struct sock *sk) 800 { 801 struct sk_buff *skb; 802 struct unix_sock *u; 803 int nr_fds = 0; 804 805 spin_lock(&sk->sk_receive_queue.lock); 806 skb = skb_peek(&sk->sk_receive_queue); 807 while (skb) { 808 u = unix_sk(skb->sk); 809 nr_fds += atomic_read(&u->scm_stat.nr_fds); 810 skb = skb_peek_next(skb, &sk->sk_receive_queue); 811 } 812 spin_unlock(&sk->sk_receive_queue.lock); 813 814 return nr_fds; 815 } 816 817 static void unix_show_fdinfo(struct seq_file *m, struct socket *sock) 818 { 819 struct sock *sk = sock->sk; 820 unsigned char s_state; 821 struct unix_sock *u; 822 int nr_fds = 0; 823 824 if (sk) { 825 s_state = READ_ONCE(sk->sk_state); 826 u = unix_sk(sk); 827 828 /* SOCK_STREAM and SOCK_SEQPACKET sockets never change their 829 * sk_state after switching to TCP_ESTABLISHED or TCP_LISTEN. 830 * SOCK_DGRAM is ordinary. So, no lock is needed. 
831 */ 832 if (sock->type == SOCK_DGRAM || s_state == TCP_ESTABLISHED) 833 nr_fds = atomic_read(&u->scm_stat.nr_fds); 834 else if (s_state == TCP_LISTEN) 835 nr_fds = unix_count_nr_fds(sk); 836 837 seq_printf(m, "scm_fds: %u\n", nr_fds); 838 } 839 } 840 #else 841 #define unix_show_fdinfo NULL 842 #endif 843 844 static const struct proto_ops unix_stream_ops = { 845 .family = PF_UNIX, 846 .owner = THIS_MODULE, 847 .release = unix_release, 848 .bind = unix_bind, 849 .connect = unix_stream_connect, 850 .socketpair = unix_socketpair, 851 .accept = unix_accept, 852 .getname = unix_getname, 853 .poll = unix_poll, 854 .ioctl = unix_ioctl, 855 #ifdef CONFIG_COMPAT 856 .compat_ioctl = unix_compat_ioctl, 857 #endif 858 .listen = unix_listen, 859 .shutdown = unix_shutdown, 860 .sendmsg = unix_stream_sendmsg, 861 .recvmsg = unix_stream_recvmsg, 862 .read_skb = unix_stream_read_skb, 863 .mmap = sock_no_mmap, 864 .splice_read = unix_stream_splice_read, 865 .set_peek_off = unix_set_peek_off, 866 .show_fdinfo = unix_show_fdinfo, 867 }; 868 869 static const struct proto_ops unix_dgram_ops = { 870 .family = PF_UNIX, 871 .owner = THIS_MODULE, 872 .release = unix_release, 873 .bind = unix_bind, 874 .connect = unix_dgram_connect, 875 .socketpair = unix_socketpair, 876 .accept = sock_no_accept, 877 .getname = unix_getname, 878 .poll = unix_dgram_poll, 879 .ioctl = unix_ioctl, 880 #ifdef CONFIG_COMPAT 881 .compat_ioctl = unix_compat_ioctl, 882 #endif 883 .listen = sock_no_listen, 884 .shutdown = unix_shutdown, 885 .sendmsg = unix_dgram_sendmsg, 886 .read_skb = unix_read_skb, 887 .recvmsg = unix_dgram_recvmsg, 888 .mmap = sock_no_mmap, 889 .set_peek_off = unix_set_peek_off, 890 .show_fdinfo = unix_show_fdinfo, 891 }; 892 893 static const struct proto_ops unix_seqpacket_ops = { 894 .family = PF_UNIX, 895 .owner = THIS_MODULE, 896 .release = unix_release, 897 .bind = unix_bind, 898 .connect = unix_stream_connect, 899 .socketpair = unix_socketpair, 900 .accept = unix_accept, 901 .getname = unix_getname, 902 .poll = unix_dgram_poll, 903 .ioctl = unix_ioctl, 904 #ifdef CONFIG_COMPAT 905 .compat_ioctl = unix_compat_ioctl, 906 #endif 907 .listen = unix_listen, 908 .shutdown = unix_shutdown, 909 .sendmsg = unix_seqpacket_sendmsg, 910 .recvmsg = unix_seqpacket_recvmsg, 911 .mmap = sock_no_mmap, 912 .set_peek_off = unix_set_peek_off, 913 .show_fdinfo = unix_show_fdinfo, 914 }; 915 916 static void unix_close(struct sock *sk, long timeout) 917 { 918 /* Nothing to do here, unix socket does not need a ->close(). 919 * This is merely for sockmap. 920 */ 921 } 922 923 static void unix_unhash(struct sock *sk) 924 { 925 /* Nothing to do here, unix socket does not need a ->unhash(). 926 * This is merely for sockmap. 
927 */ 928 } 929 930 static bool unix_bpf_bypass_getsockopt(int level, int optname) 931 { 932 if (level == SOL_SOCKET) { 933 switch (optname) { 934 case SO_PEERPIDFD: 935 return true; 936 default: 937 return false; 938 } 939 } 940 941 return false; 942 } 943 944 struct proto unix_dgram_proto = { 945 .name = "UNIX", 946 .owner = THIS_MODULE, 947 .obj_size = sizeof(struct unix_sock), 948 .close = unix_close, 949 .bpf_bypass_getsockopt = unix_bpf_bypass_getsockopt, 950 #ifdef CONFIG_BPF_SYSCALL 951 .psock_update_sk_prot = unix_dgram_bpf_update_proto, 952 #endif 953 }; 954 955 struct proto unix_stream_proto = { 956 .name = "UNIX-STREAM", 957 .owner = THIS_MODULE, 958 .obj_size = sizeof(struct unix_sock), 959 .close = unix_close, 960 .unhash = unix_unhash, 961 .bpf_bypass_getsockopt = unix_bpf_bypass_getsockopt, 962 #ifdef CONFIG_BPF_SYSCALL 963 .psock_update_sk_prot = unix_stream_bpf_update_proto, 964 #endif 965 }; 966 967 static struct sock *unix_create1(struct net *net, struct socket *sock, int kern, int type) 968 { 969 struct unix_sock *u; 970 struct sock *sk; 971 int err; 972 973 atomic_long_inc(&unix_nr_socks); 974 if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files()) { 975 err = -ENFILE; 976 goto err; 977 } 978 979 if (type == SOCK_STREAM) 980 sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_stream_proto, kern); 981 else /*dgram and seqpacket */ 982 sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_dgram_proto, kern); 983 984 if (!sk) { 985 err = -ENOMEM; 986 goto err; 987 } 988 989 sock_init_data(sock, sk); 990 991 sk->sk_hash = unix_unbound_hash(sk); 992 sk->sk_allocation = GFP_KERNEL_ACCOUNT; 993 sk->sk_write_space = unix_write_space; 994 sk->sk_max_ack_backlog = net->unx.sysctl_max_dgram_qlen; 995 sk->sk_destruct = unix_sock_destructor; 996 u = unix_sk(sk); 997 u->path.dentry = NULL; 998 u->path.mnt = NULL; 999 spin_lock_init(&u->lock); 1000 atomic_long_set(&u->inflight, 0); 1001 INIT_LIST_HEAD(&u->link); 1002 mutex_init(&u->iolock); /* single task reading lock */ 1003 mutex_init(&u->bindlock); /* single task binding lock */ 1004 init_waitqueue_head(&u->peer_wait); 1005 init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay); 1006 memset(&u->scm_stat, 0, sizeof(struct scm_stat)); 1007 unix_insert_unbound_socket(net, sk); 1008 1009 sock_prot_inuse_add(net, sk->sk_prot, 1); 1010 1011 return sk; 1012 1013 err: 1014 atomic_long_dec(&unix_nr_socks); 1015 return ERR_PTR(err); 1016 } 1017 1018 static int unix_create(struct net *net, struct socket *sock, int protocol, 1019 int kern) 1020 { 1021 struct sock *sk; 1022 1023 if (protocol && protocol != PF_UNIX) 1024 return -EPROTONOSUPPORT; 1025 1026 sock->state = SS_UNCONNECTED; 1027 1028 switch (sock->type) { 1029 case SOCK_STREAM: 1030 sock->ops = &unix_stream_ops; 1031 break; 1032 /* 1033 * Believe it or not BSD has AF_UNIX, SOCK_RAW though 1034 * nothing uses it. 
1035 */ 1036 case SOCK_RAW: 1037 sock->type = SOCK_DGRAM; 1038 fallthrough; 1039 case SOCK_DGRAM: 1040 sock->ops = &unix_dgram_ops; 1041 break; 1042 case SOCK_SEQPACKET: 1043 sock->ops = &unix_seqpacket_ops; 1044 break; 1045 default: 1046 return -ESOCKTNOSUPPORT; 1047 } 1048 1049 sk = unix_create1(net, sock, kern, sock->type); 1050 if (IS_ERR(sk)) 1051 return PTR_ERR(sk); 1052 1053 return 0; 1054 } 1055 1056 static int unix_release(struct socket *sock) 1057 { 1058 struct sock *sk = sock->sk; 1059 1060 if (!sk) 1061 return 0; 1062 1063 sk->sk_prot->close(sk, 0); 1064 unix_release_sock(sk, 0); 1065 sock->sk = NULL; 1066 1067 return 0; 1068 } 1069 1070 static struct sock *unix_find_bsd(struct sockaddr_un *sunaddr, int addr_len, 1071 int type) 1072 { 1073 struct inode *inode; 1074 struct path path; 1075 struct sock *sk; 1076 int err; 1077 1078 unix_mkname_bsd(sunaddr, addr_len); 1079 err = kern_path(sunaddr->sun_path, LOOKUP_FOLLOW, &path); 1080 if (err) 1081 goto fail; 1082 1083 err = path_permission(&path, MAY_WRITE); 1084 if (err) 1085 goto path_put; 1086 1087 err = -ECONNREFUSED; 1088 inode = d_backing_inode(path.dentry); 1089 if (!S_ISSOCK(inode->i_mode)) 1090 goto path_put; 1091 1092 sk = unix_find_socket_byinode(inode); 1093 if (!sk) 1094 goto path_put; 1095 1096 err = -EPROTOTYPE; 1097 if (sk->sk_type == type) 1098 touch_atime(&path); 1099 else 1100 goto sock_put; 1101 1102 path_put(&path); 1103 1104 return sk; 1105 1106 sock_put: 1107 sock_put(sk); 1108 path_put: 1109 path_put(&path); 1110 fail: 1111 return ERR_PTR(err); 1112 } 1113 1114 static struct sock *unix_find_abstract(struct net *net, 1115 struct sockaddr_un *sunaddr, 1116 int addr_len, int type) 1117 { 1118 unsigned int hash = unix_abstract_hash(sunaddr, addr_len, type); 1119 struct dentry *dentry; 1120 struct sock *sk; 1121 1122 sk = unix_find_socket_byname(net, sunaddr, addr_len, hash); 1123 if (!sk) 1124 return ERR_PTR(-ECONNREFUSED); 1125 1126 dentry = unix_sk(sk)->path.dentry; 1127 if (dentry) 1128 touch_atime(&unix_sk(sk)->path); 1129 1130 return sk; 1131 } 1132 1133 static struct sock *unix_find_other(struct net *net, 1134 struct sockaddr_un *sunaddr, 1135 int addr_len, int type) 1136 { 1137 struct sock *sk; 1138 1139 if (sunaddr->sun_path[0]) 1140 sk = unix_find_bsd(sunaddr, addr_len, type); 1141 else 1142 sk = unix_find_abstract(net, sunaddr, addr_len, type); 1143 1144 return sk; 1145 } 1146 1147 static int unix_autobind(struct sock *sk) 1148 { 1149 unsigned int new_hash, old_hash = sk->sk_hash; 1150 struct unix_sock *u = unix_sk(sk); 1151 struct net *net = sock_net(sk); 1152 struct unix_address *addr; 1153 u32 lastnum, ordernum; 1154 int err; 1155 1156 err = mutex_lock_interruptible(&u->bindlock); 1157 if (err) 1158 return err; 1159 1160 if (u->addr) 1161 goto out; 1162 1163 err = -ENOMEM; 1164 addr = kzalloc(sizeof(*addr) + 1165 offsetof(struct sockaddr_un, sun_path) + 16, GFP_KERNEL); 1166 if (!addr) 1167 goto out; 1168 1169 addr->len = offsetof(struct sockaddr_un, sun_path) + 6; 1170 addr->name->sun_family = AF_UNIX; 1171 refcount_set(&addr->refcnt, 1); 1172 1173 ordernum = get_random_u32(); 1174 lastnum = ordernum & 0xFFFFF; 1175 retry: 1176 ordernum = (ordernum + 1) & 0xFFFFF; 1177 sprintf(addr->name->sun_path + 1, "%05x", ordernum); 1178 1179 new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type); 1180 unix_table_double_lock(net, old_hash, new_hash); 1181 1182 if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash)) { 1183 unix_table_double_unlock(net, old_hash, new_hash); 1184 1185 /* 
__unix_find_socket_byname() may take long time if many names 1186 * are already in use. 1187 */ 1188 cond_resched(); 1189 1190 if (ordernum == lastnum) { 1191 /* Give up if all names seems to be in use. */ 1192 err = -ENOSPC; 1193 unix_release_addr(addr); 1194 goto out; 1195 } 1196 1197 goto retry; 1198 } 1199 1200 __unix_set_addr_hash(net, sk, addr, new_hash); 1201 unix_table_double_unlock(net, old_hash, new_hash); 1202 err = 0; 1203 1204 out: mutex_unlock(&u->bindlock); 1205 return err; 1206 } 1207 1208 static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr, 1209 int addr_len) 1210 { 1211 umode_t mode = S_IFSOCK | 1212 (SOCK_INODE(sk->sk_socket)->i_mode & ~current_umask()); 1213 unsigned int new_hash, old_hash = sk->sk_hash; 1214 struct unix_sock *u = unix_sk(sk); 1215 struct net *net = sock_net(sk); 1216 struct mnt_idmap *idmap; 1217 struct unix_address *addr; 1218 struct dentry *dentry; 1219 struct path parent; 1220 int err; 1221 1222 addr_len = unix_mkname_bsd(sunaddr, addr_len); 1223 addr = unix_create_addr(sunaddr, addr_len); 1224 if (!addr) 1225 return -ENOMEM; 1226 1227 /* 1228 * Get the parent directory, calculate the hash for last 1229 * component. 1230 */ 1231 dentry = kern_path_create(AT_FDCWD, addr->name->sun_path, &parent, 0); 1232 if (IS_ERR(dentry)) { 1233 err = PTR_ERR(dentry); 1234 goto out; 1235 } 1236 1237 /* 1238 * All right, let's create it. 1239 */ 1240 idmap = mnt_idmap(parent.mnt); 1241 err = security_path_mknod(&parent, dentry, mode, 0); 1242 if (!err) 1243 err = vfs_mknod(idmap, d_inode(parent.dentry), dentry, mode, 0); 1244 if (err) 1245 goto out_path; 1246 err = mutex_lock_interruptible(&u->bindlock); 1247 if (err) 1248 goto out_unlink; 1249 if (u->addr) 1250 goto out_unlock; 1251 1252 new_hash = unix_bsd_hash(d_backing_inode(dentry)); 1253 unix_table_double_lock(net, old_hash, new_hash); 1254 u->path.mnt = mntget(parent.mnt); 1255 u->path.dentry = dget(dentry); 1256 __unix_set_addr_hash(net, sk, addr, new_hash); 1257 unix_table_double_unlock(net, old_hash, new_hash); 1258 unix_insert_bsd_socket(sk); 1259 mutex_unlock(&u->bindlock); 1260 done_path_create(&parent, dentry); 1261 return 0; 1262 1263 out_unlock: 1264 mutex_unlock(&u->bindlock); 1265 err = -EINVAL; 1266 out_unlink: 1267 /* failed after successful mknod? unlink what we'd created... */ 1268 vfs_unlink(idmap, d_inode(parent.dentry), dentry, NULL); 1269 out_path: 1270 done_path_create(&parent, dentry); 1271 out: 1272 unix_release_addr(addr); 1273 return err == -EEXIST ? 
-EADDRINUSE : err; 1274 } 1275 1276 static int unix_bind_abstract(struct sock *sk, struct sockaddr_un *sunaddr, 1277 int addr_len) 1278 { 1279 unsigned int new_hash, old_hash = sk->sk_hash; 1280 struct unix_sock *u = unix_sk(sk); 1281 struct net *net = sock_net(sk); 1282 struct unix_address *addr; 1283 int err; 1284 1285 addr = unix_create_addr(sunaddr, addr_len); 1286 if (!addr) 1287 return -ENOMEM; 1288 1289 err = mutex_lock_interruptible(&u->bindlock); 1290 if (err) 1291 goto out; 1292 1293 if (u->addr) { 1294 err = -EINVAL; 1295 goto out_mutex; 1296 } 1297 1298 new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type); 1299 unix_table_double_lock(net, old_hash, new_hash); 1300 1301 if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash)) 1302 goto out_spin; 1303 1304 __unix_set_addr_hash(net, sk, addr, new_hash); 1305 unix_table_double_unlock(net, old_hash, new_hash); 1306 mutex_unlock(&u->bindlock); 1307 return 0; 1308 1309 out_spin: 1310 unix_table_double_unlock(net, old_hash, new_hash); 1311 err = -EADDRINUSE; 1312 out_mutex: 1313 mutex_unlock(&u->bindlock); 1314 out: 1315 unix_release_addr(addr); 1316 return err; 1317 } 1318 1319 static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) 1320 { 1321 struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr; 1322 struct sock *sk = sock->sk; 1323 int err; 1324 1325 if (addr_len == offsetof(struct sockaddr_un, sun_path) && 1326 sunaddr->sun_family == AF_UNIX) 1327 return unix_autobind(sk); 1328 1329 err = unix_validate_addr(sunaddr, addr_len); 1330 if (err) 1331 return err; 1332 1333 if (sunaddr->sun_path[0]) 1334 err = unix_bind_bsd(sk, sunaddr, addr_len); 1335 else 1336 err = unix_bind_abstract(sk, sunaddr, addr_len); 1337 1338 return err; 1339 } 1340 1341 static void unix_state_double_lock(struct sock *sk1, struct sock *sk2) 1342 { 1343 if (unlikely(sk1 == sk2) || !sk2) { 1344 unix_state_lock(sk1); 1345 return; 1346 } 1347 if (sk1 < sk2) { 1348 unix_state_lock(sk1); 1349 unix_state_lock_nested(sk2); 1350 } else { 1351 unix_state_lock(sk2); 1352 unix_state_lock_nested(sk1); 1353 } 1354 } 1355 1356 static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2) 1357 { 1358 if (unlikely(sk1 == sk2) || !sk2) { 1359 unix_state_unlock(sk1); 1360 return; 1361 } 1362 unix_state_unlock(sk1); 1363 unix_state_unlock(sk2); 1364 } 1365 1366 static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr, 1367 int alen, int flags) 1368 { 1369 struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr; 1370 struct sock *sk = sock->sk; 1371 struct sock *other; 1372 int err; 1373 1374 err = -EINVAL; 1375 if (alen < offsetofend(struct sockaddr, sa_family)) 1376 goto out; 1377 1378 if (addr->sa_family != AF_UNSPEC) { 1379 err = unix_validate_addr(sunaddr, alen); 1380 if (err) 1381 goto out; 1382 1383 err = BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk, addr, &alen); 1384 if (err) 1385 goto out; 1386 1387 if ((test_bit(SOCK_PASSCRED, &sock->flags) || 1388 test_bit(SOCK_PASSPIDFD, &sock->flags)) && 1389 !unix_sk(sk)->addr) { 1390 err = unix_autobind(sk); 1391 if (err) 1392 goto out; 1393 } 1394 1395 restart: 1396 other = unix_find_other(sock_net(sk), sunaddr, alen, sock->type); 1397 if (IS_ERR(other)) { 1398 err = PTR_ERR(other); 1399 goto out; 1400 } 1401 1402 unix_state_double_lock(sk, other); 1403 1404 /* Apparently VFS overslept socket death. Retry. 
*/ 1405 if (sock_flag(other, SOCK_DEAD)) { 1406 unix_state_double_unlock(sk, other); 1407 sock_put(other); 1408 goto restart; 1409 } 1410 1411 err = -EPERM; 1412 if (!unix_may_send(sk, other)) 1413 goto out_unlock; 1414 1415 err = security_unix_may_send(sk->sk_socket, other->sk_socket); 1416 if (err) 1417 goto out_unlock; 1418 1419 sk->sk_state = other->sk_state = TCP_ESTABLISHED; 1420 } else { 1421 /* 1422 * 1003.1g breaking connected state with AF_UNSPEC 1423 */ 1424 other = NULL; 1425 unix_state_double_lock(sk, other); 1426 } 1427 1428 /* 1429 * If it was connected, reconnect. 1430 */ 1431 if (unix_peer(sk)) { 1432 struct sock *old_peer = unix_peer(sk); 1433 1434 unix_peer(sk) = other; 1435 if (!other) 1436 sk->sk_state = TCP_CLOSE; 1437 unix_dgram_peer_wake_disconnect_wakeup(sk, old_peer); 1438 1439 unix_state_double_unlock(sk, other); 1440 1441 if (other != old_peer) 1442 unix_dgram_disconnected(sk, old_peer); 1443 sock_put(old_peer); 1444 } else { 1445 unix_peer(sk) = other; 1446 unix_state_double_unlock(sk, other); 1447 } 1448 1449 return 0; 1450 1451 out_unlock: 1452 unix_state_double_unlock(sk, other); 1453 sock_put(other); 1454 out: 1455 return err; 1456 } 1457 1458 static long unix_wait_for_peer(struct sock *other, long timeo) 1459 __releases(&unix_sk(other)->lock) 1460 { 1461 struct unix_sock *u = unix_sk(other); 1462 int sched; 1463 DEFINE_WAIT(wait); 1464 1465 prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE); 1466 1467 sched = !sock_flag(other, SOCK_DEAD) && 1468 !(other->sk_shutdown & RCV_SHUTDOWN) && 1469 unix_recvq_full_lockless(other); 1470 1471 unix_state_unlock(other); 1472 1473 if (sched) 1474 timeo = schedule_timeout(timeo); 1475 1476 finish_wait(&u->peer_wait, &wait); 1477 return timeo; 1478 } 1479 1480 static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr, 1481 int addr_len, int flags) 1482 { 1483 struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr; 1484 struct sock *sk = sock->sk, *newsk = NULL, *other = NULL; 1485 struct unix_sock *u = unix_sk(sk), *newu, *otheru; 1486 struct net *net = sock_net(sk); 1487 struct sk_buff *skb = NULL; 1488 long timeo; 1489 int err; 1490 int st; 1491 1492 err = unix_validate_addr(sunaddr, addr_len); 1493 if (err) 1494 goto out; 1495 1496 err = BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk, uaddr, &addr_len); 1497 if (err) 1498 goto out; 1499 1500 if ((test_bit(SOCK_PASSCRED, &sock->flags) || 1501 test_bit(SOCK_PASSPIDFD, &sock->flags)) && !u->addr) { 1502 err = unix_autobind(sk); 1503 if (err) 1504 goto out; 1505 } 1506 1507 timeo = sock_sndtimeo(sk, flags & O_NONBLOCK); 1508 1509 /* First of all allocate resources. 1510 If we will make it after state is locked, 1511 we will have to recheck all again in any case. 1512 */ 1513 1514 /* create new sock for complete connection */ 1515 newsk = unix_create1(net, NULL, 0, sock->type); 1516 if (IS_ERR(newsk)) { 1517 err = PTR_ERR(newsk); 1518 newsk = NULL; 1519 goto out; 1520 } 1521 1522 err = -ENOMEM; 1523 1524 /* Allocate skb for sending to listening sock */ 1525 skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL); 1526 if (skb == NULL) 1527 goto out; 1528 1529 restart: 1530 /* Find listening sock. */ 1531 other = unix_find_other(net, sunaddr, addr_len, sk->sk_type); 1532 if (IS_ERR(other)) { 1533 err = PTR_ERR(other); 1534 other = NULL; 1535 goto out; 1536 } 1537 1538 /* Latch state of peer */ 1539 unix_state_lock(other); 1540 1541 /* Apparently VFS overslept socket death. Retry. 
*/ 1542 if (sock_flag(other, SOCK_DEAD)) { 1543 unix_state_unlock(other); 1544 sock_put(other); 1545 goto restart; 1546 } 1547 1548 err = -ECONNREFUSED; 1549 if (other->sk_state != TCP_LISTEN) 1550 goto out_unlock; 1551 if (other->sk_shutdown & RCV_SHUTDOWN) 1552 goto out_unlock; 1553 1554 if (unix_recvq_full(other)) { 1555 err = -EAGAIN; 1556 if (!timeo) 1557 goto out_unlock; 1558 1559 timeo = unix_wait_for_peer(other, timeo); 1560 1561 err = sock_intr_errno(timeo); 1562 if (signal_pending(current)) 1563 goto out; 1564 sock_put(other); 1565 goto restart; 1566 } 1567 1568 /* Latch our state. 1569 1570 It is tricky place. We need to grab our state lock and cannot 1571 drop lock on peer. It is dangerous because deadlock is 1572 possible. Connect to self case and simultaneous 1573 attempt to connect are eliminated by checking socket 1574 state. other is TCP_LISTEN, if sk is TCP_LISTEN we 1575 check this before attempt to grab lock. 1576 1577 Well, and we have to recheck the state after socket locked. 1578 */ 1579 st = sk->sk_state; 1580 1581 switch (st) { 1582 case TCP_CLOSE: 1583 /* This is ok... continue with connect */ 1584 break; 1585 case TCP_ESTABLISHED: 1586 /* Socket is already connected */ 1587 err = -EISCONN; 1588 goto out_unlock; 1589 default: 1590 err = -EINVAL; 1591 goto out_unlock; 1592 } 1593 1594 unix_state_lock_nested(sk); 1595 1596 if (sk->sk_state != st) { 1597 unix_state_unlock(sk); 1598 unix_state_unlock(other); 1599 sock_put(other); 1600 goto restart; 1601 } 1602 1603 err = security_unix_stream_connect(sk, other, newsk); 1604 if (err) { 1605 unix_state_unlock(sk); 1606 goto out_unlock; 1607 } 1608 1609 /* The way is open! Fastly set all the necessary fields... */ 1610 1611 sock_hold(sk); 1612 unix_peer(newsk) = sk; 1613 newsk->sk_state = TCP_ESTABLISHED; 1614 newsk->sk_type = sk->sk_type; 1615 init_peercred(newsk); 1616 newu = unix_sk(newsk); 1617 RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq); 1618 otheru = unix_sk(other); 1619 1620 /* copy address information from listening to new sock 1621 * 1622 * The contents of *(otheru->addr) and otheru->path 1623 * are seen fully set up here, since we have found 1624 * otheru in hash under its lock. Insertion into the 1625 * hash chain we'd found it in had been done in an 1626 * earlier critical area protected by the chain's lock, 1627 * the same one where we'd set *(otheru->addr) contents, 1628 * as well as otheru->path and otheru->addr itself. 1629 * 1630 * Using smp_store_release() here to set newu->addr 1631 * is enough to make those stores, as well as stores 1632 * to newu->path visible to anyone who gets newu->addr 1633 * by smp_load_acquire(). IOW, the same warranties 1634 * as for unix_sock instances bound in unix_bind() or 1635 * in unix_autobind(). 
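	 *
	 * (The matching smp_load_acquire() is the one performed on
	 * unix_sk(sk)->addr by readers such as unix_getname() elsewhere
	 * in this file.)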
1636 */ 1637 if (otheru->path.dentry) { 1638 path_get(&otheru->path); 1639 newu->path = otheru->path; 1640 } 1641 refcount_inc(&otheru->addr->refcnt); 1642 smp_store_release(&newu->addr, otheru->addr); 1643 1644 /* Set credentials */ 1645 copy_peercred(sk, other); 1646 1647 sock->state = SS_CONNECTED; 1648 sk->sk_state = TCP_ESTABLISHED; 1649 sock_hold(newsk); 1650 1651 smp_mb__after_atomic(); /* sock_hold() does an atomic_inc() */ 1652 unix_peer(sk) = newsk; 1653 1654 unix_state_unlock(sk); 1655 1656 /* take ten and send info to listening sock */ 1657 spin_lock(&other->sk_receive_queue.lock); 1658 __skb_queue_tail(&other->sk_receive_queue, skb); 1659 spin_unlock(&other->sk_receive_queue.lock); 1660 unix_state_unlock(other); 1661 other->sk_data_ready(other); 1662 sock_put(other); 1663 return 0; 1664 1665 out_unlock: 1666 if (other) 1667 unix_state_unlock(other); 1668 1669 out: 1670 kfree_skb(skb); 1671 if (newsk) 1672 unix_release_sock(newsk, 0); 1673 if (other) 1674 sock_put(other); 1675 return err; 1676 } 1677 1678 static int unix_socketpair(struct socket *socka, struct socket *sockb) 1679 { 1680 struct sock *ska = socka->sk, *skb = sockb->sk; 1681 1682 /* Join our sockets back to back */ 1683 sock_hold(ska); 1684 sock_hold(skb); 1685 unix_peer(ska) = skb; 1686 unix_peer(skb) = ska; 1687 init_peercred(ska); 1688 init_peercred(skb); 1689 1690 ska->sk_state = TCP_ESTABLISHED; 1691 skb->sk_state = TCP_ESTABLISHED; 1692 socka->state = SS_CONNECTED; 1693 sockb->state = SS_CONNECTED; 1694 return 0; 1695 } 1696 1697 static void unix_sock_inherit_flags(const struct socket *old, 1698 struct socket *new) 1699 { 1700 if (test_bit(SOCK_PASSCRED, &old->flags)) 1701 set_bit(SOCK_PASSCRED, &new->flags); 1702 if (test_bit(SOCK_PASSPIDFD, &old->flags)) 1703 set_bit(SOCK_PASSPIDFD, &new->flags); 1704 if (test_bit(SOCK_PASSSEC, &old->flags)) 1705 set_bit(SOCK_PASSSEC, &new->flags); 1706 } 1707 1708 static int unix_accept(struct socket *sock, struct socket *newsock, int flags, 1709 bool kern) 1710 { 1711 struct sock *sk = sock->sk; 1712 struct sock *tsk; 1713 struct sk_buff *skb; 1714 int err; 1715 1716 err = -EOPNOTSUPP; 1717 if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET) 1718 goto out; 1719 1720 err = -EINVAL; 1721 if (sk->sk_state != TCP_LISTEN) 1722 goto out; 1723 1724 /* If socket state is TCP_LISTEN it cannot change (for now...), 1725 * so that no locks are necessary. 1726 */ 1727 1728 skb = skb_recv_datagram(sk, (flags & O_NONBLOCK) ? MSG_DONTWAIT : 0, 1729 &err); 1730 if (!skb) { 1731 /* This means receive shutdown. 
*/ 1732 if (err == 0) 1733 err = -EINVAL; 1734 goto out; 1735 } 1736 1737 tsk = skb->sk; 1738 skb_free_datagram(sk, skb); 1739 wake_up_interruptible(&unix_sk(sk)->peer_wait); 1740 1741 /* attach accepted sock to socket */ 1742 unix_state_lock(tsk); 1743 newsock->state = SS_CONNECTED; 1744 unix_sock_inherit_flags(sock, newsock); 1745 sock_graft(tsk, newsock); 1746 unix_state_unlock(tsk); 1747 return 0; 1748 1749 out: 1750 return err; 1751 } 1752 1753 1754 static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int peer) 1755 { 1756 struct sock *sk = sock->sk; 1757 struct unix_address *addr; 1758 DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr); 1759 int err = 0; 1760 1761 if (peer) { 1762 sk = unix_peer_get(sk); 1763 1764 err = -ENOTCONN; 1765 if (!sk) 1766 goto out; 1767 err = 0; 1768 } else { 1769 sock_hold(sk); 1770 } 1771 1772 addr = smp_load_acquire(&unix_sk(sk)->addr); 1773 if (!addr) { 1774 sunaddr->sun_family = AF_UNIX; 1775 sunaddr->sun_path[0] = 0; 1776 err = offsetof(struct sockaddr_un, sun_path); 1777 } else { 1778 err = addr->len; 1779 memcpy(sunaddr, addr->name, addr->len); 1780 1781 if (peer) 1782 BPF_CGROUP_RUN_SA_PROG(sk, uaddr, &err, 1783 CGROUP_UNIX_GETPEERNAME); 1784 else 1785 BPF_CGROUP_RUN_SA_PROG(sk, uaddr, &err, 1786 CGROUP_UNIX_GETSOCKNAME); 1787 } 1788 sock_put(sk); 1789 out: 1790 return err; 1791 } 1792 1793 static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb) 1794 { 1795 scm->fp = scm_fp_dup(UNIXCB(skb).fp); 1796 1797 /* 1798 * Garbage collection of unix sockets starts by selecting a set of 1799 * candidate sockets which have reference only from being in flight 1800 * (total_refs == inflight_refs). This condition is checked once during 1801 * the candidate collection phase, and candidates are marked as such, so 1802 * that non-candidates can later be ignored. While inflight_refs is 1803 * protected by unix_gc_lock, total_refs (file count) is not, hence this 1804 * is an instantaneous decision. 1805 * 1806 * Once a candidate, however, the socket must not be reinstalled into a 1807 * file descriptor while the garbage collection is in progress. 1808 * 1809 * If the above conditions are met, then the directed graph of 1810 * candidates (*) does not change while unix_gc_lock is held. 1811 * 1812 * Any operations that changes the file count through file descriptors 1813 * (dup, close, sendmsg) does not change the graph since candidates are 1814 * not installed in fds. 1815 * 1816 * Dequeing a candidate via recvmsg would install it into an fd, but 1817 * that takes unix_gc_lock to decrement the inflight count, so it's 1818 * serialized with garbage collection. 1819 * 1820 * MSG_PEEK is special in that it does not change the inflight count, 1821 * yet does install the socket into an fd. The following lock/unlock 1822 * pair is to ensure serialization with garbage collection. It must be 1823 * done between incrementing the file count and installing the file into 1824 * an fd. 1825 * 1826 * If garbage collection starts after the barrier provided by the 1827 * lock/unlock, then it will see the elevated refcount and not mark this 1828 * as a candidate. If a garbage collection is already in progress 1829 * before the file count was incremented, then the lock/unlock pair will 1830 * ensure that garbage collection is finished before progressing to 1831 * installing the fd. 1832 * 1833 * (*) A -> B where B is on the queue of A or B is on the queue of C 1834 * which is on the queue of listening socket A. 
1835 */ 1836 spin_lock(&unix_gc_lock); 1837 spin_unlock(&unix_gc_lock); 1838 } 1839 1840 static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds) 1841 { 1842 int err = 0; 1843 1844 UNIXCB(skb).pid = get_pid(scm->pid); 1845 UNIXCB(skb).uid = scm->creds.uid; 1846 UNIXCB(skb).gid = scm->creds.gid; 1847 UNIXCB(skb).fp = NULL; 1848 unix_get_secdata(scm, skb); 1849 if (scm->fp && send_fds) 1850 err = unix_attach_fds(scm, skb); 1851 1852 skb->destructor = unix_destruct_scm; 1853 return err; 1854 } 1855 1856 static bool unix_passcred_enabled(const struct socket *sock, 1857 const struct sock *other) 1858 { 1859 return test_bit(SOCK_PASSCRED, &sock->flags) || 1860 test_bit(SOCK_PASSPIDFD, &sock->flags) || 1861 !other->sk_socket || 1862 test_bit(SOCK_PASSCRED, &other->sk_socket->flags) || 1863 test_bit(SOCK_PASSPIDFD, &other->sk_socket->flags); 1864 } 1865 1866 /* 1867 * Some apps rely on write() giving SCM_CREDENTIALS 1868 * We include credentials if source or destination socket 1869 * asserted SOCK_PASSCRED. 1870 */ 1871 static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock, 1872 const struct sock *other) 1873 { 1874 if (UNIXCB(skb).pid) 1875 return; 1876 if (unix_passcred_enabled(sock, other)) { 1877 UNIXCB(skb).pid = get_pid(task_tgid(current)); 1878 current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid); 1879 } 1880 } 1881 1882 static bool unix_skb_scm_eq(struct sk_buff *skb, 1883 struct scm_cookie *scm) 1884 { 1885 return UNIXCB(skb).pid == scm->pid && 1886 uid_eq(UNIXCB(skb).uid, scm->creds.uid) && 1887 gid_eq(UNIXCB(skb).gid, scm->creds.gid) && 1888 unix_secdata_eq(scm, skb); 1889 } 1890 1891 static void scm_stat_add(struct sock *sk, struct sk_buff *skb) 1892 { 1893 struct scm_fp_list *fp = UNIXCB(skb).fp; 1894 struct unix_sock *u = unix_sk(sk); 1895 1896 if (unlikely(fp && fp->count)) 1897 atomic_add(fp->count, &u->scm_stat.nr_fds); 1898 } 1899 1900 static void scm_stat_del(struct sock *sk, struct sk_buff *skb) 1901 { 1902 struct scm_fp_list *fp = UNIXCB(skb).fp; 1903 struct unix_sock *u = unix_sk(sk); 1904 1905 if (unlikely(fp && fp->count)) 1906 atomic_sub(fp->count, &u->scm_stat.nr_fds); 1907 } 1908 1909 /* 1910 * Send AF_UNIX data. 
1911 */ 1912 1913 static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg, 1914 size_t len) 1915 { 1916 DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name); 1917 struct sock *sk = sock->sk, *other = NULL; 1918 struct unix_sock *u = unix_sk(sk); 1919 struct scm_cookie scm; 1920 struct sk_buff *skb; 1921 int data_len = 0; 1922 int sk_locked; 1923 long timeo; 1924 int err; 1925 1926 wait_for_unix_gc(); 1927 err = scm_send(sock, msg, &scm, false); 1928 if (err < 0) 1929 return err; 1930 1931 err = -EOPNOTSUPP; 1932 if (msg->msg_flags&MSG_OOB) 1933 goto out; 1934 1935 if (msg->msg_namelen) { 1936 err = unix_validate_addr(sunaddr, msg->msg_namelen); 1937 if (err) 1938 goto out; 1939 1940 err = BPF_CGROUP_RUN_PROG_UNIX_SENDMSG_LOCK(sk, 1941 msg->msg_name, 1942 &msg->msg_namelen, 1943 NULL); 1944 if (err) 1945 goto out; 1946 } else { 1947 sunaddr = NULL; 1948 err = -ENOTCONN; 1949 other = unix_peer_get(sk); 1950 if (!other) 1951 goto out; 1952 } 1953 1954 if ((test_bit(SOCK_PASSCRED, &sock->flags) || 1955 test_bit(SOCK_PASSPIDFD, &sock->flags)) && !u->addr) { 1956 err = unix_autobind(sk); 1957 if (err) 1958 goto out; 1959 } 1960 1961 err = -EMSGSIZE; 1962 if (len > sk->sk_sndbuf - 32) 1963 goto out; 1964 1965 if (len > SKB_MAX_ALLOC) { 1966 data_len = min_t(size_t, 1967 len - SKB_MAX_ALLOC, 1968 MAX_SKB_FRAGS * PAGE_SIZE); 1969 data_len = PAGE_ALIGN(data_len); 1970 1971 BUILD_BUG_ON(SKB_MAX_ALLOC < PAGE_SIZE); 1972 } 1973 1974 skb = sock_alloc_send_pskb(sk, len - data_len, data_len, 1975 msg->msg_flags & MSG_DONTWAIT, &err, 1976 PAGE_ALLOC_COSTLY_ORDER); 1977 if (skb == NULL) 1978 goto out; 1979 1980 err = unix_scm_to_skb(&scm, skb, true); 1981 if (err < 0) 1982 goto out_free; 1983 1984 skb_put(skb, len - data_len); 1985 skb->data_len = data_len; 1986 skb->len = len; 1987 err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, len); 1988 if (err) 1989 goto out_free; 1990 1991 timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); 1992 1993 restart: 1994 if (!other) { 1995 err = -ECONNRESET; 1996 if (sunaddr == NULL) 1997 goto out_free; 1998 1999 other = unix_find_other(sock_net(sk), sunaddr, msg->msg_namelen, 2000 sk->sk_type); 2001 if (IS_ERR(other)) { 2002 err = PTR_ERR(other); 2003 other = NULL; 2004 goto out_free; 2005 } 2006 } 2007 2008 if (sk_filter(other, skb) < 0) { 2009 /* Toss the packet but do not return any error to the sender */ 2010 err = len; 2011 goto out_free; 2012 } 2013 2014 sk_locked = 0; 2015 unix_state_lock(other); 2016 restart_locked: 2017 err = -EPERM; 2018 if (!unix_may_send(sk, other)) 2019 goto out_unlock; 2020 2021 if (unlikely(sock_flag(other, SOCK_DEAD))) { 2022 /* 2023 * Check with 1003.1g - what should 2024 * datagram error 2025 */ 2026 unix_state_unlock(other); 2027 sock_put(other); 2028 2029 if (!sk_locked) 2030 unix_state_lock(sk); 2031 2032 err = 0; 2033 if (sk->sk_type == SOCK_SEQPACKET) { 2034 /* We are here only when racing with unix_release_sock() 2035 * is clearing @other. Never change state to TCP_CLOSE 2036 * unlike SOCK_DGRAM wants. 
2037 */ 2038 unix_state_unlock(sk); 2039 err = -EPIPE; 2040 } else if (unix_peer(sk) == other) { 2041 unix_peer(sk) = NULL; 2042 unix_dgram_peer_wake_disconnect_wakeup(sk, other); 2043 2044 sk->sk_state = TCP_CLOSE; 2045 unix_state_unlock(sk); 2046 2047 unix_dgram_disconnected(sk, other); 2048 sock_put(other); 2049 err = -ECONNREFUSED; 2050 } else { 2051 unix_state_unlock(sk); 2052 } 2053 2054 other = NULL; 2055 if (err) 2056 goto out_free; 2057 goto restart; 2058 } 2059 2060 err = -EPIPE; 2061 if (other->sk_shutdown & RCV_SHUTDOWN) 2062 goto out_unlock; 2063 2064 if (sk->sk_type != SOCK_SEQPACKET) { 2065 err = security_unix_may_send(sk->sk_socket, other->sk_socket); 2066 if (err) 2067 goto out_unlock; 2068 } 2069 2070 /* other == sk && unix_peer(other) != sk if 2071 * - unix_peer(sk) == NULL, destination address bound to sk 2072 * - unix_peer(sk) == sk by time of get but disconnected before lock 2073 */ 2074 if (other != sk && 2075 unlikely(unix_peer(other) != sk && 2076 unix_recvq_full_lockless(other))) { 2077 if (timeo) { 2078 timeo = unix_wait_for_peer(other, timeo); 2079 2080 err = sock_intr_errno(timeo); 2081 if (signal_pending(current)) 2082 goto out_free; 2083 2084 goto restart; 2085 } 2086 2087 if (!sk_locked) { 2088 unix_state_unlock(other); 2089 unix_state_double_lock(sk, other); 2090 } 2091 2092 if (unix_peer(sk) != other || 2093 unix_dgram_peer_wake_me(sk, other)) { 2094 err = -EAGAIN; 2095 sk_locked = 1; 2096 goto out_unlock; 2097 } 2098 2099 if (!sk_locked) { 2100 sk_locked = 1; 2101 goto restart_locked; 2102 } 2103 } 2104 2105 if (unlikely(sk_locked)) 2106 unix_state_unlock(sk); 2107 2108 if (sock_flag(other, SOCK_RCVTSTAMP)) 2109 __net_timestamp(skb); 2110 maybe_add_creds(skb, sock, other); 2111 scm_stat_add(other, skb); 2112 skb_queue_tail(&other->sk_receive_queue, skb); 2113 unix_state_unlock(other); 2114 other->sk_data_ready(other); 2115 sock_put(other); 2116 scm_destroy(&scm); 2117 return len; 2118 2119 out_unlock: 2120 if (sk_locked) 2121 unix_state_unlock(sk); 2122 unix_state_unlock(other); 2123 out_free: 2124 kfree_skb(skb); 2125 out: 2126 if (other) 2127 sock_put(other); 2128 scm_destroy(&scm); 2129 return err; 2130 } 2131 2132 /* We use paged skbs for stream sockets, and limit occupancy to 32768 2133 * bytes, and a minimum of a full page. 
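 *
 * For example, with 4 KiB pages get_order(32768) is 3, so the limit is
 * eight pages (32768 bytes); with 64 KiB pages get_order(32768) is 0
 * and the limit becomes one full 65536-byte page, which is where the
 * "minimum of a full page" above comes from.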
2134 */ 2135 #define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768)) 2136 2137 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 2138 static int queue_oob(struct socket *sock, struct msghdr *msg, struct sock *other, 2139 struct scm_cookie *scm, bool fds_sent) 2140 { 2141 struct unix_sock *ousk = unix_sk(other); 2142 struct sk_buff *skb; 2143 int err = 0; 2144 2145 skb = sock_alloc_send_skb(sock->sk, 1, msg->msg_flags & MSG_DONTWAIT, &err); 2146 2147 if (!skb) 2148 return err; 2149 2150 err = unix_scm_to_skb(scm, skb, !fds_sent); 2151 if (err < 0) { 2152 kfree_skb(skb); 2153 return err; 2154 } 2155 skb_put(skb, 1); 2156 err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, 1); 2157 2158 if (err) { 2159 kfree_skb(skb); 2160 return err; 2161 } 2162 2163 unix_state_lock(other); 2164 2165 if (sock_flag(other, SOCK_DEAD) || 2166 (other->sk_shutdown & RCV_SHUTDOWN)) { 2167 unix_state_unlock(other); 2168 kfree_skb(skb); 2169 return -EPIPE; 2170 } 2171 2172 maybe_add_creds(skb, sock, other); 2173 skb_get(skb); 2174 2175 if (ousk->oob_skb) 2176 consume_skb(ousk->oob_skb); 2177 2178 WRITE_ONCE(ousk->oob_skb, skb); 2179 2180 scm_stat_add(other, skb); 2181 skb_queue_tail(&other->sk_receive_queue, skb); 2182 sk_send_sigurg(other); 2183 unix_state_unlock(other); 2184 other->sk_data_ready(other); 2185 2186 return err; 2187 } 2188 #endif 2189 2190 static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, 2191 size_t len) 2192 { 2193 struct sock *sk = sock->sk; 2194 struct sock *other = NULL; 2195 int err, size; 2196 struct sk_buff *skb; 2197 int sent = 0; 2198 struct scm_cookie scm; 2199 bool fds_sent = false; 2200 int data_len; 2201 2202 wait_for_unix_gc(); 2203 err = scm_send(sock, msg, &scm, false); 2204 if (err < 0) 2205 return err; 2206 2207 err = -EOPNOTSUPP; 2208 if (msg->msg_flags & MSG_OOB) { 2209 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 2210 if (len) 2211 len--; 2212 else 2213 #endif 2214 goto out_err; 2215 } 2216 2217 if (msg->msg_namelen) { 2218 err = sk->sk_state == TCP_ESTABLISHED ? 
-EISCONN : -EOPNOTSUPP; 2219 goto out_err; 2220 } else { 2221 err = -ENOTCONN; 2222 other = unix_peer(sk); 2223 if (!other) 2224 goto out_err; 2225 } 2226 2227 if (sk->sk_shutdown & SEND_SHUTDOWN) 2228 goto pipe_err; 2229 2230 while (sent < len) { 2231 size = len - sent; 2232 2233 if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) { 2234 skb = sock_alloc_send_pskb(sk, 0, 0, 2235 msg->msg_flags & MSG_DONTWAIT, 2236 &err, 0); 2237 } else { 2238 /* Keep two messages in the pipe so it schedules better */ 2239 size = min_t(int, size, (sk->sk_sndbuf >> 1) - 64); 2240 2241 /* allow fallback to order-0 allocations */ 2242 size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ); 2243 2244 data_len = max_t(int, 0, size - SKB_MAX_HEAD(0)); 2245 2246 data_len = min_t(size_t, size, PAGE_ALIGN(data_len)); 2247 2248 skb = sock_alloc_send_pskb(sk, size - data_len, data_len, 2249 msg->msg_flags & MSG_DONTWAIT, &err, 2250 get_order(UNIX_SKB_FRAGS_SZ)); 2251 } 2252 if (!skb) 2253 goto out_err; 2254 2255 /* Only send the fds in the first buffer */ 2256 err = unix_scm_to_skb(&scm, skb, !fds_sent); 2257 if (err < 0) { 2258 kfree_skb(skb); 2259 goto out_err; 2260 } 2261 fds_sent = true; 2262 2263 if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) { 2264 err = skb_splice_from_iter(skb, &msg->msg_iter, size, 2265 sk->sk_allocation); 2266 if (err < 0) { 2267 kfree_skb(skb); 2268 goto out_err; 2269 } 2270 size = err; 2271 refcount_add(size, &sk->sk_wmem_alloc); 2272 } else { 2273 skb_put(skb, size - data_len); 2274 skb->data_len = data_len; 2275 skb->len = size; 2276 err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size); 2277 if (err) { 2278 kfree_skb(skb); 2279 goto out_err; 2280 } 2281 } 2282 2283 unix_state_lock(other); 2284 2285 if (sock_flag(other, SOCK_DEAD) || 2286 (other->sk_shutdown & RCV_SHUTDOWN)) 2287 goto pipe_err_free; 2288 2289 maybe_add_creds(skb, sock, other); 2290 scm_stat_add(other, skb); 2291 skb_queue_tail(&other->sk_receive_queue, skb); 2292 unix_state_unlock(other); 2293 other->sk_data_ready(other); 2294 sent += size; 2295 } 2296 2297 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 2298 if (msg->msg_flags & MSG_OOB) { 2299 err = queue_oob(sock, msg, other, &scm, fds_sent); 2300 if (err) 2301 goto out_err; 2302 sent++; 2303 } 2304 #endif 2305 2306 scm_destroy(&scm); 2307 2308 return sent; 2309 2310 pipe_err_free: 2311 unix_state_unlock(other); 2312 kfree_skb(skb); 2313 pipe_err: 2314 if (sent == 0 && !(msg->msg_flags&MSG_NOSIGNAL)) 2315 send_sig(SIGPIPE, current, 0); 2316 err = -EPIPE; 2317 out_err: 2318 scm_destroy(&scm); 2319 return sent ? 
: err; 2320 } 2321 2322 static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg, 2323 size_t len) 2324 { 2325 int err; 2326 struct sock *sk = sock->sk; 2327 2328 err = sock_error(sk); 2329 if (err) 2330 return err; 2331 2332 if (sk->sk_state != TCP_ESTABLISHED) 2333 return -ENOTCONN; 2334 2335 if (msg->msg_namelen) 2336 msg->msg_namelen = 0; 2337 2338 return unix_dgram_sendmsg(sock, msg, len); 2339 } 2340 2341 static int unix_seqpacket_recvmsg(struct socket *sock, struct msghdr *msg, 2342 size_t size, int flags) 2343 { 2344 struct sock *sk = sock->sk; 2345 2346 if (sk->sk_state != TCP_ESTABLISHED) 2347 return -ENOTCONN; 2348 2349 return unix_dgram_recvmsg(sock, msg, size, flags); 2350 } 2351 2352 static void unix_copy_addr(struct msghdr *msg, struct sock *sk) 2353 { 2354 struct unix_address *addr = smp_load_acquire(&unix_sk(sk)->addr); 2355 2356 if (addr) { 2357 msg->msg_namelen = addr->len; 2358 memcpy(msg->msg_name, addr->name, addr->len); 2359 } 2360 } 2361 2362 int __unix_dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t size, 2363 int flags) 2364 { 2365 struct scm_cookie scm; 2366 struct socket *sock = sk->sk_socket; 2367 struct unix_sock *u = unix_sk(sk); 2368 struct sk_buff *skb, *last; 2369 long timeo; 2370 int skip; 2371 int err; 2372 2373 err = -EOPNOTSUPP; 2374 if (flags&MSG_OOB) 2375 goto out; 2376 2377 timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); 2378 2379 do { 2380 mutex_lock(&u->iolock); 2381 2382 skip = sk_peek_offset(sk, flags); 2383 skb = __skb_try_recv_datagram(sk, &sk->sk_receive_queue, flags, 2384 &skip, &err, &last); 2385 if (skb) { 2386 if (!(flags & MSG_PEEK)) 2387 scm_stat_del(sk, skb); 2388 break; 2389 } 2390 2391 mutex_unlock(&u->iolock); 2392 2393 if (err != -EAGAIN) 2394 break; 2395 } while (timeo && 2396 !__skb_wait_for_more_packets(sk, &sk->sk_receive_queue, 2397 &err, &timeo, last)); 2398 2399 if (!skb) { /* implies iolock unlocked */ 2400 unix_state_lock(sk); 2401 /* Signal EOF on disconnected non-blocking SEQPACKET socket. */ 2402 if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN && 2403 (sk->sk_shutdown & RCV_SHUTDOWN)) 2404 err = 0; 2405 unix_state_unlock(sk); 2406 goto out; 2407 } 2408 2409 if (wq_has_sleeper(&u->peer_wait)) 2410 wake_up_interruptible_sync_poll(&u->peer_wait, 2411 EPOLLOUT | EPOLLWRNORM | 2412 EPOLLWRBAND); 2413 2414 if (msg->msg_name) { 2415 unix_copy_addr(msg, skb->sk); 2416 2417 BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk, 2418 msg->msg_name, 2419 &msg->msg_namelen); 2420 } 2421 2422 if (size > skb->len - skip) 2423 size = skb->len - skip; 2424 else if (size < skb->len - skip) 2425 msg->msg_flags |= MSG_TRUNC; 2426 2427 err = skb_copy_datagram_msg(skb, skip, msg, size); 2428 if (err) 2429 goto out_free; 2430 2431 if (sock_flag(sk, SOCK_RCVTSTAMP)) 2432 __sock_recv_timestamp(msg, sk, skb); 2433 2434 memset(&scm, 0, sizeof(scm)); 2435 2436 scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid); 2437 unix_set_secdata(&scm, skb); 2438 2439 if (!(flags & MSG_PEEK)) { 2440 if (UNIXCB(skb).fp) 2441 unix_detach_fds(&scm, skb); 2442 2443 sk_peek_offset_bwd(sk, skb->len); 2444 } else { 2445 /* It is questionable: on PEEK we could: 2446 - do not return fds - good, but too simple 8) 2447 - return fds, and do not return them on read (old strategy, 2448 apparently wrong) 2449 - clone fds (I chose it for now, it is the most universal 2450 solution) 2451 2452 POSIX 1003.1g does not actually define this clearly 2453 at all. POSIX 1003.1g doesn't define a lot of things 2454 clearly however! 
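		   In practice the "clone" choice means a MSG_PEEK caller may
		   be handed its own duplicates of any SCM_RIGHTS descriptors,
		   and a later non-peeking recvmsg() installs a second set, so
		   userspace must close both.  Illustrative sketch (msghdr
		   setup and error handling elided):

			recvmsg(fd, &msg, MSG_PEEK);	// fds may be installed here
			recvmsg(fd, &msg, 0);		// and installed again here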
2455 2456 */ 2457 2458 sk_peek_offset_fwd(sk, size); 2459 2460 if (UNIXCB(skb).fp) 2461 unix_peek_fds(&scm, skb); 2462 } 2463 err = (flags & MSG_TRUNC) ? skb->len - skip : size; 2464 2465 scm_recv_unix(sock, msg, &scm, flags); 2466 2467 out_free: 2468 skb_free_datagram(sk, skb); 2469 mutex_unlock(&u->iolock); 2470 out: 2471 return err; 2472 } 2473 2474 static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, 2475 int flags) 2476 { 2477 struct sock *sk = sock->sk; 2478 2479 #ifdef CONFIG_BPF_SYSCALL 2480 const struct proto *prot = READ_ONCE(sk->sk_prot); 2481 2482 if (prot != &unix_dgram_proto) 2483 return prot->recvmsg(sk, msg, size, flags, NULL); 2484 #endif 2485 return __unix_dgram_recvmsg(sk, msg, size, flags); 2486 } 2487 2488 static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor) 2489 { 2490 struct unix_sock *u = unix_sk(sk); 2491 struct sk_buff *skb; 2492 int err; 2493 2494 mutex_lock(&u->iolock); 2495 skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err); 2496 mutex_unlock(&u->iolock); 2497 if (!skb) 2498 return err; 2499 2500 return recv_actor(sk, skb); 2501 } 2502 2503 /* 2504 * Sleep until more data has arrived. But check for races.. 2505 */ 2506 static long unix_stream_data_wait(struct sock *sk, long timeo, 2507 struct sk_buff *last, unsigned int last_len, 2508 bool freezable) 2509 { 2510 unsigned int state = TASK_INTERRUPTIBLE | freezable * TASK_FREEZABLE; 2511 struct sk_buff *tail; 2512 DEFINE_WAIT(wait); 2513 2514 unix_state_lock(sk); 2515 2516 for (;;) { 2517 prepare_to_wait(sk_sleep(sk), &wait, state); 2518 2519 tail = skb_peek_tail(&sk->sk_receive_queue); 2520 if (tail != last || 2521 (tail && tail->len != last_len) || 2522 sk->sk_err || 2523 (sk->sk_shutdown & RCV_SHUTDOWN) || 2524 signal_pending(current) || 2525 !timeo) 2526 break; 2527 2528 sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); 2529 unix_state_unlock(sk); 2530 timeo = schedule_timeout(timeo); 2531 unix_state_lock(sk); 2532 2533 if (sock_flag(sk, SOCK_DEAD)) 2534 break; 2535 2536 sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); 2537 } 2538 2539 finish_wait(sk_sleep(sk), &wait); 2540 unix_state_unlock(sk); 2541 return timeo; 2542 } 2543 2544 static unsigned int unix_skb_len(const struct sk_buff *skb) 2545 { 2546 return skb->len - UNIXCB(skb).consumed; 2547 } 2548 2549 struct unix_stream_read_state { 2550 int (*recv_actor)(struct sk_buff *, int, int, 2551 struct unix_stream_read_state *); 2552 struct socket *socket; 2553 struct msghdr *msg; 2554 struct pipe_inode_info *pipe; 2555 size_t size; 2556 int flags; 2557 unsigned int splice_flags; 2558 }; 2559 2560 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 2561 static int unix_stream_recv_urg(struct unix_stream_read_state *state) 2562 { 2563 struct socket *sock = state->socket; 2564 struct sock *sk = sock->sk; 2565 struct unix_sock *u = unix_sk(sk); 2566 int chunk = 1; 2567 struct sk_buff *oob_skb; 2568 2569 mutex_lock(&u->iolock); 2570 unix_state_lock(sk); 2571 2572 if (sock_flag(sk, SOCK_URGINLINE) || !u->oob_skb) { 2573 unix_state_unlock(sk); 2574 mutex_unlock(&u->iolock); 2575 return -EINVAL; 2576 } 2577 2578 oob_skb = u->oob_skb; 2579 2580 if (!(state->flags & MSG_PEEK)) 2581 WRITE_ONCE(u->oob_skb, NULL); 2582 else 2583 skb_get(oob_skb); 2584 unix_state_unlock(sk); 2585 2586 chunk = state->recv_actor(oob_skb, 0, chunk, state); 2587 2588 if (!(state->flags & MSG_PEEK)) 2589 UNIXCB(oob_skb).consumed += 1; 2590 2591 consume_skb(oob_skb); 2592 2593 mutex_unlock(&u->iolock); 2594 2595 if (chunk < 0) 2596 return -EFAULT; 2597 2598 state->msg->msg_flags |= 
MSG_OOB; 2599 return 1; 2600 } 2601 2602 static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk, 2603 int flags, int copied) 2604 { 2605 struct unix_sock *u = unix_sk(sk); 2606 2607 if (!unix_skb_len(skb) && !(flags & MSG_PEEK)) { 2608 skb_unlink(skb, &sk->sk_receive_queue); 2609 consume_skb(skb); 2610 skb = NULL; 2611 } else { 2612 if (skb == u->oob_skb) { 2613 if (copied) { 2614 skb = NULL; 2615 } else if (sock_flag(sk, SOCK_URGINLINE)) { 2616 if (!(flags & MSG_PEEK)) { 2617 WRITE_ONCE(u->oob_skb, NULL); 2618 consume_skb(skb); 2619 } 2620 } else if (!(flags & MSG_PEEK)) { 2621 skb_unlink(skb, &sk->sk_receive_queue); 2622 consume_skb(skb); 2623 skb = skb_peek(&sk->sk_receive_queue); 2624 } 2625 } 2626 } 2627 return skb; 2628 } 2629 #endif 2630 2631 static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor) 2632 { 2633 if (unlikely(sk->sk_state != TCP_ESTABLISHED)) 2634 return -ENOTCONN; 2635 2636 return unix_read_skb(sk, recv_actor); 2637 } 2638 2639 static int unix_stream_read_generic(struct unix_stream_read_state *state, 2640 bool freezable) 2641 { 2642 struct scm_cookie scm; 2643 struct socket *sock = state->socket; 2644 struct sock *sk = sock->sk; 2645 struct unix_sock *u = unix_sk(sk); 2646 int copied = 0; 2647 int flags = state->flags; 2648 int noblock = flags & MSG_DONTWAIT; 2649 bool check_creds = false; 2650 int target; 2651 int err = 0; 2652 long timeo; 2653 int skip; 2654 size_t size = state->size; 2655 unsigned int last_len; 2656 2657 if (unlikely(sk->sk_state != TCP_ESTABLISHED)) { 2658 err = -EINVAL; 2659 goto out; 2660 } 2661 2662 if (unlikely(flags & MSG_OOB)) { 2663 err = -EOPNOTSUPP; 2664 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 2665 err = unix_stream_recv_urg(state); 2666 #endif 2667 goto out; 2668 } 2669 2670 target = sock_rcvlowat(sk, flags & MSG_WAITALL, size); 2671 timeo = sock_rcvtimeo(sk, noblock); 2672 2673 memset(&scm, 0, sizeof(scm)); 2674 2675 /* Lock the socket to prevent queue disordering 2676 * while sleeps in memcpy_tomsg 2677 */ 2678 mutex_lock(&u->iolock); 2679 2680 skip = max(sk_peek_offset(sk, flags), 0); 2681 2682 do { 2683 int chunk; 2684 bool drop_skb; 2685 struct sk_buff *skb, *last; 2686 2687 redo: 2688 unix_state_lock(sk); 2689 if (sock_flag(sk, SOCK_DEAD)) { 2690 err = -ECONNRESET; 2691 goto unlock; 2692 } 2693 last = skb = skb_peek(&sk->sk_receive_queue); 2694 last_len = last ? last->len : 0; 2695 2696 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 2697 if (skb) { 2698 skb = manage_oob(skb, sk, flags, copied); 2699 if (!skb) { 2700 unix_state_unlock(sk); 2701 if (copied) 2702 break; 2703 goto redo; 2704 } 2705 } 2706 #endif 2707 again: 2708 if (skb == NULL) { 2709 if (copied >= target) 2710 goto unlock; 2711 2712 /* 2713 * POSIX 1003.1g mandates this order. 
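 * (report a pending socket error before signalling EOF from a
 * receive-side shutdown; only when neither applies do we go on to
 * block waiting for more data)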
2714 */ 2715 2716 err = sock_error(sk); 2717 if (err) 2718 goto unlock; 2719 if (sk->sk_shutdown & RCV_SHUTDOWN) 2720 goto unlock; 2721 2722 unix_state_unlock(sk); 2723 if (!timeo) { 2724 err = -EAGAIN; 2725 break; 2726 } 2727 2728 mutex_unlock(&u->iolock); 2729 2730 timeo = unix_stream_data_wait(sk, timeo, last, 2731 last_len, freezable); 2732 2733 if (signal_pending(current)) { 2734 err = sock_intr_errno(timeo); 2735 scm_destroy(&scm); 2736 goto out; 2737 } 2738 2739 mutex_lock(&u->iolock); 2740 goto redo; 2741 unlock: 2742 unix_state_unlock(sk); 2743 break; 2744 } 2745 2746 while (skip >= unix_skb_len(skb)) { 2747 skip -= unix_skb_len(skb); 2748 last = skb; 2749 last_len = skb->len; 2750 skb = skb_peek_next(skb, &sk->sk_receive_queue); 2751 if (!skb) 2752 goto again; 2753 } 2754 2755 unix_state_unlock(sk); 2756 2757 if (check_creds) { 2758 /* Never glue messages from different writers */ 2759 if (!unix_skb_scm_eq(skb, &scm)) 2760 break; 2761 } else if (test_bit(SOCK_PASSCRED, &sock->flags) || 2762 test_bit(SOCK_PASSPIDFD, &sock->flags)) { 2763 /* Copy credentials */ 2764 scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid); 2765 unix_set_secdata(&scm, skb); 2766 check_creds = true; 2767 } 2768 2769 /* Copy address just once */ 2770 if (state->msg && state->msg->msg_name) { 2771 DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, 2772 state->msg->msg_name); 2773 unix_copy_addr(state->msg, skb->sk); 2774 2775 BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk, 2776 state->msg->msg_name, 2777 &state->msg->msg_namelen); 2778 2779 sunaddr = NULL; 2780 } 2781 2782 chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size); 2783 skb_get(skb); 2784 chunk = state->recv_actor(skb, skip, chunk, state); 2785 drop_skb = !unix_skb_len(skb); 2786 /* skb is only safe to use if !drop_skb */ 2787 consume_skb(skb); 2788 if (chunk < 0) { 2789 if (copied == 0) 2790 copied = -EFAULT; 2791 break; 2792 } 2793 copied += chunk; 2794 size -= chunk; 2795 2796 if (drop_skb) { 2797 /* the skb was touched by a concurrent reader; 2798 * we should not expect anything from this skb 2799 * anymore and assume it invalid - we can be 2800 * sure it was dropped from the socket queue 2801 * 2802 * let's report a short read 2803 */ 2804 err = 0; 2805 break; 2806 } 2807 2808 /* Mark read part of skb as used */ 2809 if (!(flags & MSG_PEEK)) { 2810 UNIXCB(skb).consumed += chunk; 2811 2812 sk_peek_offset_bwd(sk, chunk); 2813 2814 if (UNIXCB(skb).fp) { 2815 scm_stat_del(sk, skb); 2816 unix_detach_fds(&scm, skb); 2817 } 2818 2819 if (unix_skb_len(skb)) 2820 break; 2821 2822 skb_unlink(skb, &sk->sk_receive_queue); 2823 consume_skb(skb); 2824 2825 if (scm.fp) 2826 break; 2827 } else { 2828 /* It is questionable, see note in unix_dgram_recvmsg. 2829 */ 2830 if (UNIXCB(skb).fp) 2831 unix_peek_fds(&scm, skb); 2832 2833 sk_peek_offset_fwd(sk, chunk); 2834 2835 if (UNIXCB(skb).fp) 2836 break; 2837 2838 skip = 0; 2839 last = skb; 2840 last_len = skb->len; 2841 unix_state_lock(sk); 2842 skb = skb_peek_next(skb, &sk->sk_receive_queue); 2843 if (skb) 2844 goto again; 2845 unix_state_unlock(sk); 2846 break; 2847 } 2848 } while (size); 2849 2850 mutex_unlock(&u->iolock); 2851 if (state->msg) 2852 scm_recv_unix(sock, state->msg, &scm, flags); 2853 else 2854 scm_destroy(&scm); 2855 out: 2856 return copied ? 
: err; 2857 } 2858 2859 static int unix_stream_read_actor(struct sk_buff *skb, 2860 int skip, int chunk, 2861 struct unix_stream_read_state *state) 2862 { 2863 int ret; 2864 2865 ret = skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip, 2866 state->msg, chunk); 2867 return ret ?: chunk; 2868 } 2869 2870 int __unix_stream_recvmsg(struct sock *sk, struct msghdr *msg, 2871 size_t size, int flags) 2872 { 2873 struct unix_stream_read_state state = { 2874 .recv_actor = unix_stream_read_actor, 2875 .socket = sk->sk_socket, 2876 .msg = msg, 2877 .size = size, 2878 .flags = flags 2879 }; 2880 2881 return unix_stream_read_generic(&state, true); 2882 } 2883 2884 static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg, 2885 size_t size, int flags) 2886 { 2887 struct unix_stream_read_state state = { 2888 .recv_actor = unix_stream_read_actor, 2889 .socket = sock, 2890 .msg = msg, 2891 .size = size, 2892 .flags = flags 2893 }; 2894 2895 #ifdef CONFIG_BPF_SYSCALL 2896 struct sock *sk = sock->sk; 2897 const struct proto *prot = READ_ONCE(sk->sk_prot); 2898 2899 if (prot != &unix_stream_proto) 2900 return prot->recvmsg(sk, msg, size, flags, NULL); 2901 #endif 2902 return unix_stream_read_generic(&state, true); 2903 } 2904 2905 static int unix_stream_splice_actor(struct sk_buff *skb, 2906 int skip, int chunk, 2907 struct unix_stream_read_state *state) 2908 { 2909 return skb_splice_bits(skb, state->socket->sk, 2910 UNIXCB(skb).consumed + skip, 2911 state->pipe, chunk, state->splice_flags); 2912 } 2913 2914 static ssize_t unix_stream_splice_read(struct socket *sock, loff_t *ppos, 2915 struct pipe_inode_info *pipe, 2916 size_t size, unsigned int flags) 2917 { 2918 struct unix_stream_read_state state = { 2919 .recv_actor = unix_stream_splice_actor, 2920 .socket = sock, 2921 .pipe = pipe, 2922 .size = size, 2923 .splice_flags = flags, 2924 }; 2925 2926 if (unlikely(*ppos)) 2927 return -ESPIPE; 2928 2929 if (sock->file->f_flags & O_NONBLOCK || 2930 flags & SPLICE_F_NONBLOCK) 2931 state.flags = MSG_DONTWAIT; 2932 2933 return unix_stream_read_generic(&state, false); 2934 } 2935 2936 static int unix_shutdown(struct socket *sock, int mode) 2937 { 2938 struct sock *sk = sock->sk; 2939 struct sock *other; 2940 2941 if (mode < SHUT_RD || mode > SHUT_RDWR) 2942 return -EINVAL; 2943 /* This maps: 2944 * SHUT_RD (0) -> RCV_SHUTDOWN (1) 2945 * SHUT_WR (1) -> SEND_SHUTDOWN (2) 2946 * SHUT_RDWR (2) -> SHUTDOWN_MASK (3) 2947 */ 2948 ++mode; 2949 2950 unix_state_lock(sk); 2951 WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | mode); 2952 other = unix_peer(sk); 2953 if (other) 2954 sock_hold(other); 2955 unix_state_unlock(sk); 2956 sk->sk_state_change(sk); 2957 2958 if (other && 2959 (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) { 2960 2961 int peer_mode = 0; 2962 const struct proto *prot = READ_ONCE(other->sk_prot); 2963 2964 if (prot->unhash) 2965 prot->unhash(other); 2966 if (mode&RCV_SHUTDOWN) 2967 peer_mode |= SEND_SHUTDOWN; 2968 if (mode&SEND_SHUTDOWN) 2969 peer_mode |= RCV_SHUTDOWN; 2970 unix_state_lock(other); 2971 WRITE_ONCE(other->sk_shutdown, other->sk_shutdown | peer_mode); 2972 unix_state_unlock(other); 2973 other->sk_state_change(other); 2974 if (peer_mode == SHUTDOWN_MASK) 2975 sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP); 2976 else if (peer_mode & RCV_SHUTDOWN) 2977 sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN); 2978 } 2979 if (other) 2980 sock_put(other); 2981 2982 return 0; 2983 } 2984 2985 long unix_inq_len(struct sock *sk) 2986 { 2987 struct sk_buff *skb; 2988 long amount = 
0; 2989 2990 if (sk->sk_state == TCP_LISTEN) 2991 return -EINVAL; 2992 2993 spin_lock(&sk->sk_receive_queue.lock); 2994 if (sk->sk_type == SOCK_STREAM || 2995 sk->sk_type == SOCK_SEQPACKET) { 2996 skb_queue_walk(&sk->sk_receive_queue, skb) 2997 amount += unix_skb_len(skb); 2998 } else { 2999 skb = skb_peek(&sk->sk_receive_queue); 3000 if (skb) 3001 amount = skb->len; 3002 } 3003 spin_unlock(&sk->sk_receive_queue.lock); 3004 3005 return amount; 3006 } 3007 EXPORT_SYMBOL_GPL(unix_inq_len); 3008 3009 long unix_outq_len(struct sock *sk) 3010 { 3011 return sk_wmem_alloc_get(sk); 3012 } 3013 EXPORT_SYMBOL_GPL(unix_outq_len); 3014 3015 static int unix_open_file(struct sock *sk) 3016 { 3017 struct path path; 3018 struct file *f; 3019 int fd; 3020 3021 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) 3022 return -EPERM; 3023 3024 if (!smp_load_acquire(&unix_sk(sk)->addr)) 3025 return -ENOENT; 3026 3027 path = unix_sk(sk)->path; 3028 if (!path.dentry) 3029 return -ENOENT; 3030 3031 path_get(&path); 3032 3033 fd = get_unused_fd_flags(O_CLOEXEC); 3034 if (fd < 0) 3035 goto out; 3036 3037 f = dentry_open(&path, O_PATH, current_cred()); 3038 if (IS_ERR(f)) { 3039 put_unused_fd(fd); 3040 fd = PTR_ERR(f); 3041 goto out; 3042 } 3043 3044 fd_install(fd, f); 3045 out: 3046 path_put(&path); 3047 3048 return fd; 3049 } 3050 3051 static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) 3052 { 3053 struct sock *sk = sock->sk; 3054 long amount = 0; 3055 int err; 3056 3057 switch (cmd) { 3058 case SIOCOUTQ: 3059 amount = unix_outq_len(sk); 3060 err = put_user(amount, (int __user *)arg); 3061 break; 3062 case SIOCINQ: 3063 amount = unix_inq_len(sk); 3064 if (amount < 0) 3065 err = amount; 3066 else 3067 err = put_user(amount, (int __user *)arg); 3068 break; 3069 case SIOCUNIXFILE: 3070 err = unix_open_file(sk); 3071 break; 3072 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 3073 case SIOCATMARK: 3074 { 3075 struct sk_buff *skb; 3076 int answ = 0; 3077 3078 skb = skb_peek(&sk->sk_receive_queue); 3079 if (skb && skb == READ_ONCE(unix_sk(sk)->oob_skb)) 3080 answ = 1; 3081 err = put_user(answ, (int __user *)arg); 3082 } 3083 break; 3084 #endif 3085 default: 3086 err = -ENOIOCTLCMD; 3087 break; 3088 } 3089 return err; 3090 } 3091 3092 #ifdef CONFIG_COMPAT 3093 static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) 3094 { 3095 return unix_ioctl(sock, cmd, (unsigned long)compat_ptr(arg)); 3096 } 3097 #endif 3098 3099 static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wait) 3100 { 3101 struct sock *sk = sock->sk; 3102 __poll_t mask; 3103 u8 shutdown; 3104 3105 sock_poll_wait(file, sock, wait); 3106 mask = 0; 3107 shutdown = READ_ONCE(sk->sk_shutdown); 3108 3109 /* exceptional events? */ 3110 if (READ_ONCE(sk->sk_err)) 3111 mask |= EPOLLERR; 3112 if (shutdown == SHUTDOWN_MASK) 3113 mask |= EPOLLHUP; 3114 if (shutdown & RCV_SHUTDOWN) 3115 mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM; 3116 3117 /* readable? 
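 * (either skbs sitting on the receive queue, or data held for a BPF
 * sockmap psock, which sk_is_readable() reports)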
*/ 3118 if (!skb_queue_empty_lockless(&sk->sk_receive_queue)) 3119 mask |= EPOLLIN | EPOLLRDNORM; 3120 if (sk_is_readable(sk)) 3121 mask |= EPOLLIN | EPOLLRDNORM; 3122 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 3123 if (READ_ONCE(unix_sk(sk)->oob_skb)) 3124 mask |= EPOLLPRI; 3125 #endif 3126 3127 /* Connection-based need to check for termination and startup */ 3128 if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) && 3129 sk->sk_state == TCP_CLOSE) 3130 mask |= EPOLLHUP; 3131 3132 /* 3133 * we set writable also when the other side has shut down the 3134 * connection. This prevents stuck sockets. 3135 */ 3136 if (unix_writable(sk)) 3137 mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND; 3138 3139 return mask; 3140 } 3141 3142 static __poll_t unix_dgram_poll(struct file *file, struct socket *sock, 3143 poll_table *wait) 3144 { 3145 struct sock *sk = sock->sk, *other; 3146 unsigned int writable; 3147 __poll_t mask; 3148 u8 shutdown; 3149 3150 sock_poll_wait(file, sock, wait); 3151 mask = 0; 3152 shutdown = READ_ONCE(sk->sk_shutdown); 3153 3154 /* exceptional events? */ 3155 if (READ_ONCE(sk->sk_err) || 3156 !skb_queue_empty_lockless(&sk->sk_error_queue)) 3157 mask |= EPOLLERR | 3158 (sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0); 3159 3160 if (shutdown & RCV_SHUTDOWN) 3161 mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM; 3162 if (shutdown == SHUTDOWN_MASK) 3163 mask |= EPOLLHUP; 3164 3165 /* readable? */ 3166 if (!skb_queue_empty_lockless(&sk->sk_receive_queue)) 3167 mask |= EPOLLIN | EPOLLRDNORM; 3168 if (sk_is_readable(sk)) 3169 mask |= EPOLLIN | EPOLLRDNORM; 3170 3171 /* Connection-based need to check for termination and startup */ 3172 if (sk->sk_type == SOCK_SEQPACKET) { 3173 if (sk->sk_state == TCP_CLOSE) 3174 mask |= EPOLLHUP; 3175 /* connection hasn't started yet? */ 3176 if (sk->sk_state == TCP_SYN_SENT) 3177 return mask; 3178 } 3179 3180 /* No write status requested, avoid expensive OUT tests. 
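 * (the writability check below takes unix_state_lock() and may register
 * this socket on the peer's wait queue via unix_dgram_peer_wake_me(),
 * so skip it unless the caller actually polled for EPOLLOUT)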
*/ 3181 if (!(poll_requested_events(wait) & (EPOLLWRBAND|EPOLLWRNORM|EPOLLOUT))) 3182 return mask; 3183 3184 writable = unix_writable(sk); 3185 if (writable) { 3186 unix_state_lock(sk); 3187 3188 other = unix_peer(sk); 3189 if (other && unix_peer(other) != sk && 3190 unix_recvq_full_lockless(other) && 3191 unix_dgram_peer_wake_me(sk, other)) 3192 writable = 0; 3193 3194 unix_state_unlock(sk); 3195 } 3196 3197 if (writable) 3198 mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND; 3199 else 3200 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); 3201 3202 return mask; 3203 } 3204 3205 #ifdef CONFIG_PROC_FS 3206 3207 #define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1) 3208 3209 #define get_bucket(x) ((x) >> BUCKET_SPACE) 3210 #define get_offset(x) ((x) & ((1UL << BUCKET_SPACE) - 1)) 3211 #define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o)) 3212 3213 static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos) 3214 { 3215 unsigned long offset = get_offset(*pos); 3216 unsigned long bucket = get_bucket(*pos); 3217 unsigned long count = 0; 3218 struct sock *sk; 3219 3220 for (sk = sk_head(&seq_file_net(seq)->unx.table.buckets[bucket]); 3221 sk; sk = sk_next(sk)) { 3222 if (++count == offset) 3223 break; 3224 } 3225 3226 return sk; 3227 } 3228 3229 static struct sock *unix_get_first(struct seq_file *seq, loff_t *pos) 3230 { 3231 unsigned long bucket = get_bucket(*pos); 3232 struct net *net = seq_file_net(seq); 3233 struct sock *sk; 3234 3235 while (bucket < UNIX_HASH_SIZE) { 3236 spin_lock(&net->unx.table.locks[bucket]); 3237 3238 sk = unix_from_bucket(seq, pos); 3239 if (sk) 3240 return sk; 3241 3242 spin_unlock(&net->unx.table.locks[bucket]); 3243 3244 *pos = set_bucket_offset(++bucket, 1); 3245 } 3246 3247 return NULL; 3248 } 3249 3250 static struct sock *unix_get_next(struct seq_file *seq, struct sock *sk, 3251 loff_t *pos) 3252 { 3253 unsigned long bucket = get_bucket(*pos); 3254 3255 sk = sk_next(sk); 3256 if (sk) 3257 return sk; 3258 3259 3260 spin_unlock(&seq_file_net(seq)->unx.table.locks[bucket]); 3261 3262 *pos = set_bucket_offset(++bucket, 1); 3263 3264 return unix_get_first(seq, pos); 3265 } 3266 3267 static void *unix_seq_start(struct seq_file *seq, loff_t *pos) 3268 { 3269 if (!*pos) 3270 return SEQ_START_TOKEN; 3271 3272 return unix_get_first(seq, pos); 3273 } 3274 3275 static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos) 3276 { 3277 ++*pos; 3278 3279 if (v == SEQ_START_TOKEN) 3280 return unix_get_first(seq, pos); 3281 3282 return unix_get_next(seq, v, pos); 3283 } 3284 3285 static void unix_seq_stop(struct seq_file *seq, void *v) 3286 { 3287 struct sock *sk = v; 3288 3289 if (sk) 3290 spin_unlock(&seq_file_net(seq)->unx.table.locks[sk->sk_hash]); 3291 } 3292 3293 static int unix_seq_show(struct seq_file *seq, void *v) 3294 { 3295 3296 if (v == SEQ_START_TOKEN) 3297 seq_puts(seq, "Num RefCount Protocol Flags Type St " 3298 "Inode Path\n"); 3299 else { 3300 struct sock *s = v; 3301 struct unix_sock *u = unix_sk(s); 3302 unix_state_lock(s); 3303 3304 seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu", 3305 s, 3306 refcount_read(&s->sk_refcnt), 3307 0, 3308 s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0, 3309 s->sk_type, 3310 s->sk_socket ? 3311 (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) : 3312 (s->sk_state == TCP_ESTABLISHED ? 
SS_CONNECTING : SS_DISCONNECTING), 3313 sock_i_ino(s)); 3314 3315 if (u->addr) { // under a hash table lock here 3316 int i, len; 3317 seq_putc(seq, ' '); 3318 3319 i = 0; 3320 len = u->addr->len - 3321 offsetof(struct sockaddr_un, sun_path); 3322 if (u->addr->name->sun_path[0]) { 3323 len--; 3324 } else { 3325 seq_putc(seq, '@'); 3326 i++; 3327 } 3328 for ( ; i < len; i++) 3329 seq_putc(seq, u->addr->name->sun_path[i] ?: 3330 '@'); 3331 } 3332 unix_state_unlock(s); 3333 seq_putc(seq, '\n'); 3334 } 3335 3336 return 0; 3337 } 3338 3339 static const struct seq_operations unix_seq_ops = { 3340 .start = unix_seq_start, 3341 .next = unix_seq_next, 3342 .stop = unix_seq_stop, 3343 .show = unix_seq_show, 3344 }; 3345 3346 #ifdef CONFIG_BPF_SYSCALL 3347 struct bpf_unix_iter_state { 3348 struct seq_net_private p; 3349 unsigned int cur_sk; 3350 unsigned int end_sk; 3351 unsigned int max_sk; 3352 struct sock **batch; 3353 bool st_bucket_done; 3354 }; 3355 3356 struct bpf_iter__unix { 3357 __bpf_md_ptr(struct bpf_iter_meta *, meta); 3358 __bpf_md_ptr(struct unix_sock *, unix_sk); 3359 uid_t uid __aligned(8); 3360 }; 3361 3362 static int unix_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta, 3363 struct unix_sock *unix_sk, uid_t uid) 3364 { 3365 struct bpf_iter__unix ctx; 3366 3367 meta->seq_num--; /* skip SEQ_START_TOKEN */ 3368 ctx.meta = meta; 3369 ctx.unix_sk = unix_sk; 3370 ctx.uid = uid; 3371 return bpf_iter_run_prog(prog, &ctx); 3372 } 3373 3374 static int bpf_iter_unix_hold_batch(struct seq_file *seq, struct sock *start_sk) 3375 3376 { 3377 struct bpf_unix_iter_state *iter = seq->private; 3378 unsigned int expected = 1; 3379 struct sock *sk; 3380 3381 sock_hold(start_sk); 3382 iter->batch[iter->end_sk++] = start_sk; 3383 3384 for (sk = sk_next(start_sk); sk; sk = sk_next(sk)) { 3385 if (iter->end_sk < iter->max_sk) { 3386 sock_hold(sk); 3387 iter->batch[iter->end_sk++] = sk; 3388 } 3389 3390 expected++; 3391 } 3392 3393 spin_unlock(&seq_file_net(seq)->unx.table.locks[start_sk->sk_hash]); 3394 3395 return expected; 3396 } 3397 3398 static void bpf_iter_unix_put_batch(struct bpf_unix_iter_state *iter) 3399 { 3400 while (iter->cur_sk < iter->end_sk) 3401 sock_put(iter->batch[iter->cur_sk++]); 3402 } 3403 3404 static int bpf_iter_unix_realloc_batch(struct bpf_unix_iter_state *iter, 3405 unsigned int new_batch_sz) 3406 { 3407 struct sock **new_batch; 3408 3409 new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz, 3410 GFP_USER | __GFP_NOWARN); 3411 if (!new_batch) 3412 return -ENOMEM; 3413 3414 bpf_iter_unix_put_batch(iter); 3415 kvfree(iter->batch); 3416 iter->batch = new_batch; 3417 iter->max_sk = new_batch_sz; 3418 3419 return 0; 3420 } 3421 3422 static struct sock *bpf_iter_unix_batch(struct seq_file *seq, 3423 loff_t *pos) 3424 { 3425 struct bpf_unix_iter_state *iter = seq->private; 3426 unsigned int expected; 3427 bool resized = false; 3428 struct sock *sk; 3429 3430 if (iter->st_bucket_done) 3431 *pos = set_bucket_offset(get_bucket(*pos) + 1, 1); 3432 3433 again: 3434 /* Get a new batch */ 3435 iter->cur_sk = 0; 3436 iter->end_sk = 0; 3437 3438 sk = unix_get_first(seq, pos); 3439 if (!sk) 3440 return NULL; /* Done */ 3441 3442 expected = bpf_iter_unix_hold_batch(seq, sk); 3443 3444 if (iter->end_sk == expected) { 3445 iter->st_bucket_done = true; 3446 return sk; 3447 } 3448 3449 if (!resized && !bpf_iter_unix_realloc_batch(iter, expected * 3 / 2)) { 3450 resized = true; 3451 goto again; 3452 } 3453 3454 return sk; 3455 } 3456 3457 static void *bpf_iter_unix_seq_start(struct 
seq_file *seq, loff_t *pos) 3458 { 3459 if (!*pos) 3460 return SEQ_START_TOKEN; 3461 3462 /* bpf iter does not support lseek, so it always 3463 * continue from where it was stop()-ped. 3464 */ 3465 return bpf_iter_unix_batch(seq, pos); 3466 } 3467 3468 static void *bpf_iter_unix_seq_next(struct seq_file *seq, void *v, loff_t *pos) 3469 { 3470 struct bpf_unix_iter_state *iter = seq->private; 3471 struct sock *sk; 3472 3473 /* Whenever seq_next() is called, the iter->cur_sk is 3474 * done with seq_show(), so advance to the next sk in 3475 * the batch. 3476 */ 3477 if (iter->cur_sk < iter->end_sk) 3478 sock_put(iter->batch[iter->cur_sk++]); 3479 3480 ++*pos; 3481 3482 if (iter->cur_sk < iter->end_sk) 3483 sk = iter->batch[iter->cur_sk]; 3484 else 3485 sk = bpf_iter_unix_batch(seq, pos); 3486 3487 return sk; 3488 } 3489 3490 static int bpf_iter_unix_seq_show(struct seq_file *seq, void *v) 3491 { 3492 struct bpf_iter_meta meta; 3493 struct bpf_prog *prog; 3494 struct sock *sk = v; 3495 uid_t uid; 3496 bool slow; 3497 int ret; 3498 3499 if (v == SEQ_START_TOKEN) 3500 return 0; 3501 3502 slow = lock_sock_fast(sk); 3503 3504 if (unlikely(sk_unhashed(sk))) { 3505 ret = SEQ_SKIP; 3506 goto unlock; 3507 } 3508 3509 uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk)); 3510 meta.seq = seq; 3511 prog = bpf_iter_get_info(&meta, false); 3512 ret = unix_prog_seq_show(prog, &meta, v, uid); 3513 unlock: 3514 unlock_sock_fast(sk, slow); 3515 return ret; 3516 } 3517 3518 static void bpf_iter_unix_seq_stop(struct seq_file *seq, void *v) 3519 { 3520 struct bpf_unix_iter_state *iter = seq->private; 3521 struct bpf_iter_meta meta; 3522 struct bpf_prog *prog; 3523 3524 if (!v) { 3525 meta.seq = seq; 3526 prog = bpf_iter_get_info(&meta, true); 3527 if (prog) 3528 (void)unix_prog_seq_show(prog, &meta, v, 0); 3529 } 3530 3531 if (iter->cur_sk < iter->end_sk) 3532 bpf_iter_unix_put_batch(iter); 3533 } 3534 3535 static const struct seq_operations bpf_iter_unix_seq_ops = { 3536 .start = bpf_iter_unix_seq_start, 3537 .next = bpf_iter_unix_seq_next, 3538 .stop = bpf_iter_unix_seq_stop, 3539 .show = bpf_iter_unix_seq_show, 3540 }; 3541 #endif 3542 #endif 3543 3544 static const struct net_proto_family unix_family_ops = { 3545 .family = PF_UNIX, 3546 .create = unix_create, 3547 .owner = THIS_MODULE, 3548 }; 3549 3550 3551 static int __net_init unix_net_init(struct net *net) 3552 { 3553 int i; 3554 3555 net->unx.sysctl_max_dgram_qlen = 10; 3556 if (unix_sysctl_register(net)) 3557 goto out; 3558 3559 #ifdef CONFIG_PROC_FS 3560 if (!proc_create_net("unix", 0, net->proc_net, &unix_seq_ops, 3561 sizeof(struct seq_net_private))) 3562 goto err_sysctl; 3563 #endif 3564 3565 net->unx.table.locks = kvmalloc_array(UNIX_HASH_SIZE, 3566 sizeof(spinlock_t), GFP_KERNEL); 3567 if (!net->unx.table.locks) 3568 goto err_proc; 3569 3570 net->unx.table.buckets = kvmalloc_array(UNIX_HASH_SIZE, 3571 sizeof(struct hlist_head), 3572 GFP_KERNEL); 3573 if (!net->unx.table.buckets) 3574 goto free_locks; 3575 3576 for (i = 0; i < UNIX_HASH_SIZE; i++) { 3577 spin_lock_init(&net->unx.table.locks[i]); 3578 INIT_HLIST_HEAD(&net->unx.table.buckets[i]); 3579 } 3580 3581 return 0; 3582 3583 free_locks: 3584 kvfree(net->unx.table.locks); 3585 err_proc: 3586 #ifdef CONFIG_PROC_FS 3587 remove_proc_entry("unix", net->proc_net); 3588 err_sysctl: 3589 #endif 3590 unix_sysctl_unregister(net); 3591 out: 3592 return -ENOMEM; 3593 } 3594 3595 static void __net_exit unix_net_exit(struct net *net) 3596 { 3597 kvfree(net->unx.table.buckets); 3598 
kvfree(net->unx.table.locks); 3599 unix_sysctl_unregister(net); 3600 remove_proc_entry("unix", net->proc_net); 3601 } 3602 3603 static struct pernet_operations unix_net_ops = { 3604 .init = unix_net_init, 3605 .exit = unix_net_exit, 3606 }; 3607 3608 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) 3609 DEFINE_BPF_ITER_FUNC(unix, struct bpf_iter_meta *meta, 3610 struct unix_sock *unix_sk, uid_t uid) 3611 3612 #define INIT_BATCH_SZ 16 3613 3614 static int bpf_iter_init_unix(void *priv_data, struct bpf_iter_aux_info *aux) 3615 { 3616 struct bpf_unix_iter_state *iter = priv_data; 3617 int err; 3618 3619 err = bpf_iter_init_seq_net(priv_data, aux); 3620 if (err) 3621 return err; 3622 3623 err = bpf_iter_unix_realloc_batch(iter, INIT_BATCH_SZ); 3624 if (err) { 3625 bpf_iter_fini_seq_net(priv_data); 3626 return err; 3627 } 3628 3629 return 0; 3630 } 3631 3632 static void bpf_iter_fini_unix(void *priv_data) 3633 { 3634 struct bpf_unix_iter_state *iter = priv_data; 3635 3636 bpf_iter_fini_seq_net(priv_data); 3637 kvfree(iter->batch); 3638 } 3639 3640 static const struct bpf_iter_seq_info unix_seq_info = { 3641 .seq_ops = &bpf_iter_unix_seq_ops, 3642 .init_seq_private = bpf_iter_init_unix, 3643 .fini_seq_private = bpf_iter_fini_unix, 3644 .seq_priv_size = sizeof(struct bpf_unix_iter_state), 3645 }; 3646 3647 static const struct bpf_func_proto * 3648 bpf_iter_unix_get_func_proto(enum bpf_func_id func_id, 3649 const struct bpf_prog *prog) 3650 { 3651 switch (func_id) { 3652 case BPF_FUNC_setsockopt: 3653 return &bpf_sk_setsockopt_proto; 3654 case BPF_FUNC_getsockopt: 3655 return &bpf_sk_getsockopt_proto; 3656 default: 3657 return NULL; 3658 } 3659 } 3660 3661 static struct bpf_iter_reg unix_reg_info = { 3662 .target = "unix", 3663 .ctx_arg_info_size = 1, 3664 .ctx_arg_info = { 3665 { offsetof(struct bpf_iter__unix, unix_sk), 3666 PTR_TO_BTF_ID_OR_NULL }, 3667 }, 3668 .get_func_proto = bpf_iter_unix_get_func_proto, 3669 .seq_info = &unix_seq_info, 3670 }; 3671 3672 static void __init bpf_iter_register(void) 3673 { 3674 unix_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_UNIX]; 3675 if (bpf_iter_reg_target(&unix_reg_info)) 3676 pr_warn("Warning: could not register bpf iterator unix\n"); 3677 } 3678 #endif 3679 3680 static int __init af_unix_init(void) 3681 { 3682 int i, rc = -1; 3683 3684 BUILD_BUG_ON(sizeof(struct unix_skb_parms) > sizeof_field(struct sk_buff, cb)); 3685 3686 for (i = 0; i < UNIX_HASH_SIZE / 2; i++) { 3687 spin_lock_init(&bsd_socket_locks[i]); 3688 INIT_HLIST_HEAD(&bsd_socket_buckets[i]); 3689 } 3690 3691 rc = proto_register(&unix_dgram_proto, 1); 3692 if (rc != 0) { 3693 pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__); 3694 goto out; 3695 } 3696 3697 rc = proto_register(&unix_stream_proto, 1); 3698 if (rc != 0) { 3699 pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__); 3700 proto_unregister(&unix_dgram_proto); 3701 goto out; 3702 } 3703 3704 sock_register(&unix_family_ops); 3705 register_pernet_subsys(&unix_net_ops); 3706 unix_bpf_build_proto(); 3707 3708 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) 3709 bpf_iter_register(); 3710 #endif 3711 3712 out: 3713 return rc; 3714 } 3715 3716 /* Later than subsys_initcall() because we depend on stuff initialised there */ 3717 fs_initcall(af_unix_init); 3718