// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * NET4:	Implementation of BSD Unix domain sockets.
 *
 * Authors:	Alan Cox, <alan@lxorguk.ukuu.org.uk>
 *
 * Fixes:
 *		Linus Torvalds	:	Assorted bug cures.
 *		Niibe Yutaka	:	async I/O support.
 *		Carsten Paeth	:	PF_UNIX check, address fixes.
 *		Alan Cox	:	Limit size of allocated blocks.
 *		Alan Cox	:	Fixed the stupid socketpair bug.
 *		Alan Cox	:	BSD compatibility fine tuning.
 *		Alan Cox	:	Fixed a bug in connect when interrupted.
 *		Alan Cox	:	Sorted out a proper draft version of
 *					file descriptor passing hacked up from
 *					Mike Shaver's work.
 *		Marty Leisner	:	Fixes to fd passing.
 *		Nick Nevin	:	recvmsg bugfix.
 *		Alan Cox	:	Started proper garbage collector.
 *		Heiko EiBfeldt	:	Missing verify_area check.
 *		Alan Cox	:	Started POSIXisms.
 *		Andreas Schwab	:	Replace inode by dentry for proper
 *					reference counting.
 *		Kirk Petersen	:	Made this a module.
 *		Christoph Rohland :	Elegant non-blocking accept/connect
 *					algorithm. Lots of bug fixes.
 *		Alexey Kuznetsov :	Repaired (I hope) bugs introduced
 *					by the above two patches.
 *		Andrea Arcangeli :	If possible we block in connect(2)
 *					if the max backlog of the listen socket
 *					has been reached. This won't break
 *					old apps and it will avoid huge amounts
 *					of socks hashed (this for unix_gc()
 *					performance reasons).
 *					Security fix that limits the max
 *					number of socks to 2*max_files and
 *					the number of skbs queueable in the
 *					dgram receiver.
 *		Artur Skawina	:	Hash function optimizations.
 *		Alexey Kuznetsov :	Full scale SMP. Lots of bugs are introduced 8)
 *		Malcolm Beattie	:	Set peercred for socketpair.
 *		Michal Ostrowski :	Module initialization cleanup.
 *		Arnaldo C. Melo	:	Remove MOD_{INC,DEC}_USE_COUNT,
 *					the core infrastructure is doing that
 *					for all net proto families now (2.5.69+).
 *
 * Known differences from reference BSD that was tested:
 *
 *	[TO FIX]
 *	ECONNREFUSED is not returned from one end of a connected() socket to the
 *		other the moment one end closes.
 *	fstat() doesn't return st_dev=0, and gives the blksize as high water mark
 *		and a fake inode identifier (nor the BSD first socket fstat twice bug).
 *	[NOT TO FIX]
 *	accept() returns a path name even if the connecting socket has closed
 *		in the meantime (BSD loses the path and gives up).
 *	accept() returns 0 length path for an unbound connector. BSD returns 16
 *		and a null first byte in the path (but not for gethost/peername - BSD bug ??)
 *	socketpair(...SOCK_RAW..) doesn't panic the kernel.
 *	BSD af_unix apparently has connect forgetting to block properly.
 *		(need to check this with the POSIX spec in detail)
 *
 * Differences from 2.0.0-11-... (ANK)
 *	Bug fixes and improvements.
 *		- client shutdown killed server socket.
 *		- removed all useless cli/sti pairs.
 *
 *	Semantic changes/extensions.
 *		- generic control message passing.
 *		- SCM_CREDENTIALS control message.
 *		- "Abstract" (not FS based) socket bindings.
 *		  Abstract names are sequences of bytes (not zero terminated)
 *		  starting with 0, so that this name space does not intersect
 *		  with BSD names.
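 *		  For example, the 4-byte abstract name "\0foo" is bound with
 *		  addr_len = offsetof(struct sockaddr_un, sun_path) + 4; every
 *		  one of those bytes is significant, so "\0foo" and "\0foo\0"
 *		  are distinct names.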
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/signal.h>
#include <linux/sched/signal.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/stat.h>
#include <linux/dcache.h>
#include <linux/namei.h>
#include <linux/socket.h>
#include <linux/un.h>
#include <linux/fcntl.h>
#include <linux/filter.h>
#include <linux/termios.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/in.h>
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <net/net_namespace.h>
#include <net/sock.h>
#include <net/tcp_states.h>
#include <net/af_unix.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <net/scm.h>
#include <linux/init.h>
#include <linux/poll.h>
#include <linux/rtnetlink.h>
#include <linux/mount.h>
#include <net/checksum.h>
#include <linux/security.h>
#include <linux/splice.h>
#include <linux/freezer.h>
#include <linux/file.h>
#include <linux/btf_ids.h>
#include <linux/bpf-cgroup.h>

static atomic_long_t unix_nr_socks;
static struct hlist_head bsd_socket_buckets[UNIX_HASH_SIZE / 2];
static spinlock_t bsd_socket_locks[UNIX_HASH_SIZE / 2];

/* SMP locking strategy:
 *    hash table is protected with spinlock.
 *    each socket state is protected by separate spinlock.
 */

static unsigned int unix_unbound_hash(struct sock *sk)
{
	unsigned long hash = (unsigned long)sk;

	hash ^= hash >> 16;
	hash ^= hash >> 8;
	hash ^= sk->sk_type;

	return hash & UNIX_HASH_MOD;
}

static unsigned int unix_bsd_hash(struct inode *i)
{
	return i->i_ino & UNIX_HASH_MOD;
}

static unsigned int unix_abstract_hash(struct sockaddr_un *sunaddr,
				       int addr_len, int type)
{
	__wsum csum = csum_partial(sunaddr, addr_len, 0);
	unsigned int hash;

	hash = (__force unsigned int)csum_fold(csum);
	hash ^= hash >> 8;
	hash ^= type;

	return UNIX_HASH_MOD + 1 + (hash & UNIX_HASH_MOD);
}

static void unix_table_double_lock(struct net *net,
				   unsigned int hash1, unsigned int hash2)
{
	if (hash1 == hash2) {
		spin_lock(&net->unx.table.locks[hash1]);
		return;
	}

	if (hash1 > hash2)
		swap(hash1, hash2);

	spin_lock(&net->unx.table.locks[hash1]);
	spin_lock_nested(&net->unx.table.locks[hash2], SINGLE_DEPTH_NESTING);
}

static void unix_table_double_unlock(struct net *net,
				     unsigned int hash1, unsigned int hash2)
{
	if (hash1 == hash2) {
		spin_unlock(&net->unx.table.locks[hash1]);
		return;
	}

	spin_unlock(&net->unx.table.locks[hash1]);
	spin_unlock(&net->unx.table.locks[hash2]);
}

#ifdef CONFIG_SECURITY_NETWORK
static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{
	UNIXCB(skb).secid = scm->secid;
}

static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{
	scm->secid = UNIXCB(skb).secid;
}

static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
{
	return (scm->secid == UNIXCB(skb).secid);
}
#else
static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{ }

static inline void unix_set_secdata(struct scm_cookie *scm, struct
sk_buff *skb) 206 { } 207 208 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb) 209 { 210 return true; 211 } 212 #endif /* CONFIG_SECURITY_NETWORK */ 213 214 static inline int unix_our_peer(struct sock *sk, struct sock *osk) 215 { 216 return unix_peer(osk) == sk; 217 } 218 219 static inline int unix_may_send(struct sock *sk, struct sock *osk) 220 { 221 return unix_peer(osk) == NULL || unix_our_peer(sk, osk); 222 } 223 224 static inline int unix_recvq_full(const struct sock *sk) 225 { 226 return skb_queue_len(&sk->sk_receive_queue) > sk->sk_max_ack_backlog; 227 } 228 229 static inline int unix_recvq_full_lockless(const struct sock *sk) 230 { 231 return skb_queue_len_lockless(&sk->sk_receive_queue) > 232 READ_ONCE(sk->sk_max_ack_backlog); 233 } 234 235 struct sock *unix_peer_get(struct sock *s) 236 { 237 struct sock *peer; 238 239 unix_state_lock(s); 240 peer = unix_peer(s); 241 if (peer) 242 sock_hold(peer); 243 unix_state_unlock(s); 244 return peer; 245 } 246 EXPORT_SYMBOL_GPL(unix_peer_get); 247 248 static struct unix_address *unix_create_addr(struct sockaddr_un *sunaddr, 249 int addr_len) 250 { 251 struct unix_address *addr; 252 253 addr = kmalloc(sizeof(*addr) + addr_len, GFP_KERNEL); 254 if (!addr) 255 return NULL; 256 257 refcount_set(&addr->refcnt, 1); 258 addr->len = addr_len; 259 memcpy(addr->name, sunaddr, addr_len); 260 261 return addr; 262 } 263 264 static inline void unix_release_addr(struct unix_address *addr) 265 { 266 if (refcount_dec_and_test(&addr->refcnt)) 267 kfree(addr); 268 } 269 270 /* 271 * Check unix socket name: 272 * - should be not zero length. 273 * - if started by not zero, should be NULL terminated (FS object) 274 * - if started by zero, it is abstract name. 275 */ 276 277 static int unix_validate_addr(struct sockaddr_un *sunaddr, int addr_len) 278 { 279 if (addr_len <= offsetof(struct sockaddr_un, sun_path) || 280 addr_len > sizeof(*sunaddr)) 281 return -EINVAL; 282 283 if (sunaddr->sun_family != AF_UNIX) 284 return -EINVAL; 285 286 return 0; 287 } 288 289 static int unix_mkname_bsd(struct sockaddr_un *sunaddr, int addr_len) 290 { 291 struct sockaddr_storage *addr = (struct sockaddr_storage *)sunaddr; 292 short offset = offsetof(struct sockaddr_storage, __data); 293 294 BUILD_BUG_ON(offset != offsetof(struct sockaddr_un, sun_path)); 295 296 /* This may look like an off by one error but it is a bit more 297 * subtle. 108 is the longest valid AF_UNIX path for a binding. 298 * sun_path[108] doesn't as such exist. However in kernel space 299 * we are guaranteed that it is a valid memory location in our 300 * kernel address buffer because syscall functions always pass 301 * a pointer of struct sockaddr_storage which has a bigger buffer 302 * than 108. Also, we must terminate sun_path for strlen() in 303 * getname_kernel(). 304 */ 305 addr->__data[addr_len - offset] = 0; 306 307 /* Don't pass sunaddr->sun_path to strlen(). Otherwise, 108 will 308 * cause panic if CONFIG_FORTIFY_SOURCE=y. Let __fortify_strlen() 309 * know the actual buffer. 
310 */ 311 return strlen(addr->__data) + offset + 1; 312 } 313 314 static void __unix_remove_socket(struct sock *sk) 315 { 316 sk_del_node_init(sk); 317 } 318 319 static void __unix_insert_socket(struct net *net, struct sock *sk) 320 { 321 DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk)); 322 sk_add_node(sk, &net->unx.table.buckets[sk->sk_hash]); 323 } 324 325 static void __unix_set_addr_hash(struct net *net, struct sock *sk, 326 struct unix_address *addr, unsigned int hash) 327 { 328 __unix_remove_socket(sk); 329 smp_store_release(&unix_sk(sk)->addr, addr); 330 331 sk->sk_hash = hash; 332 __unix_insert_socket(net, sk); 333 } 334 335 static void unix_remove_socket(struct net *net, struct sock *sk) 336 { 337 spin_lock(&net->unx.table.locks[sk->sk_hash]); 338 __unix_remove_socket(sk); 339 spin_unlock(&net->unx.table.locks[sk->sk_hash]); 340 } 341 342 static void unix_insert_unbound_socket(struct net *net, struct sock *sk) 343 { 344 spin_lock(&net->unx.table.locks[sk->sk_hash]); 345 __unix_insert_socket(net, sk); 346 spin_unlock(&net->unx.table.locks[sk->sk_hash]); 347 } 348 349 static void unix_insert_bsd_socket(struct sock *sk) 350 { 351 spin_lock(&bsd_socket_locks[sk->sk_hash]); 352 sk_add_bind_node(sk, &bsd_socket_buckets[sk->sk_hash]); 353 spin_unlock(&bsd_socket_locks[sk->sk_hash]); 354 } 355 356 static void unix_remove_bsd_socket(struct sock *sk) 357 { 358 if (!hlist_unhashed(&sk->sk_bind_node)) { 359 spin_lock(&bsd_socket_locks[sk->sk_hash]); 360 __sk_del_bind_node(sk); 361 spin_unlock(&bsd_socket_locks[sk->sk_hash]); 362 363 sk_node_init(&sk->sk_bind_node); 364 } 365 } 366 367 static struct sock *__unix_find_socket_byname(struct net *net, 368 struct sockaddr_un *sunname, 369 int len, unsigned int hash) 370 { 371 struct sock *s; 372 373 sk_for_each(s, &net->unx.table.buckets[hash]) { 374 struct unix_sock *u = unix_sk(s); 375 376 if (u->addr->len == len && 377 !memcmp(u->addr->name, sunname, len)) 378 return s; 379 } 380 return NULL; 381 } 382 383 static inline struct sock *unix_find_socket_byname(struct net *net, 384 struct sockaddr_un *sunname, 385 int len, unsigned int hash) 386 { 387 struct sock *s; 388 389 spin_lock(&net->unx.table.locks[hash]); 390 s = __unix_find_socket_byname(net, sunname, len, hash); 391 if (s) 392 sock_hold(s); 393 spin_unlock(&net->unx.table.locks[hash]); 394 return s; 395 } 396 397 static struct sock *unix_find_socket_byinode(struct inode *i) 398 { 399 unsigned int hash = unix_bsd_hash(i); 400 struct sock *s; 401 402 spin_lock(&bsd_socket_locks[hash]); 403 sk_for_each_bound(s, &bsd_socket_buckets[hash]) { 404 struct dentry *dentry = unix_sk(s)->path.dentry; 405 406 if (dentry && d_backing_inode(dentry) == i) { 407 sock_hold(s); 408 spin_unlock(&bsd_socket_locks[hash]); 409 return s; 410 } 411 } 412 spin_unlock(&bsd_socket_locks[hash]); 413 return NULL; 414 } 415 416 /* Support code for asymmetrically connected dgram sockets 417 * 418 * If a datagram socket is connected to a socket not itself connected 419 * to the first socket (eg, /dev/log), clients may only enqueue more 420 * messages if the present receive queue of the server socket is not 421 * "too large". This means there's a second writeability condition 422 * poll and sendmsg need to test. The dgram recv code will do a wake 423 * up on the peer_wait wait queue of a socket upon reception of a 424 * datagram which needs to be propagated to sleeping would-be writers 425 * since these might not have sent anything so far. 
This can't be
 * accomplished via poll_wait because the lifetime of the server
 * socket might be less than that of its clients if these break their
 * association with it or if the server socket is closed while clients
 * are still connected to it and there's no way to inform "a polling
 * implementation" that it should let go of a certain wait queue.
 *
 * In order to propagate a wake up, a wait_queue_entry_t of the client
 * socket is enqueued on the peer_wait queue of the server socket
 * whose wake function does a wake_up on the ordinary client socket
 * wait queue. This connection is established whenever a write (or
 * poll for write) hits the flow control condition, and is broken when
 * the association to the server socket is dissolved or after a wake up
 * was relayed.
 */

static int unix_dgram_peer_wake_relay(wait_queue_entry_t *q, unsigned mode, int flags,
				      void *key)
{
	struct unix_sock *u;
	wait_queue_head_t *u_sleep;

	u = container_of(q, struct unix_sock, peer_wake);

	__remove_wait_queue(&unix_sk(u->peer_wake.private)->peer_wait,
			    q);
	u->peer_wake.private = NULL;

	/* relaying can only happen while the wq still exists */
	u_sleep = sk_sleep(&u->sk);
	if (u_sleep)
		wake_up_interruptible_poll(u_sleep, key_to_poll(key));

	return 0;
}

static int unix_dgram_peer_wake_connect(struct sock *sk, struct sock *other)
{
	struct unix_sock *u, *u_other;
	int rc;

	u = unix_sk(sk);
	u_other = unix_sk(other);
	rc = 0;
	spin_lock(&u_other->peer_wait.lock);

	if (!u->peer_wake.private) {
		u->peer_wake.private = other;
		__add_wait_queue(&u_other->peer_wait, &u->peer_wake);

		rc = 1;
	}

	spin_unlock(&u_other->peer_wait.lock);
	return rc;
}

static void unix_dgram_peer_wake_disconnect(struct sock *sk,
					    struct sock *other)
{
	struct unix_sock *u, *u_other;

	u = unix_sk(sk);
	u_other = unix_sk(other);
	spin_lock(&u_other->peer_wait.lock);

	if (u->peer_wake.private == other) {
		__remove_wait_queue(&u_other->peer_wait, &u->peer_wake);
		u->peer_wake.private = NULL;
	}

	spin_unlock(&u_other->peer_wait.lock);
}

static void unix_dgram_peer_wake_disconnect_wakeup(struct sock *sk,
						   struct sock *other)
{
	unix_dgram_peer_wake_disconnect(sk, other);
	wake_up_interruptible_poll(sk_sleep(sk),
				   EPOLLOUT |
				   EPOLLWRNORM |
				   EPOLLWRBAND);
}

/* preconditions:
 *	- unix_peer(sk) == other
 *	- association is stable
 */
static int unix_dgram_peer_wake_me(struct sock *sk, struct sock *other)
{
	int connected;

	connected = unix_dgram_peer_wake_connect(sk, other);

	/* If other is SOCK_DEAD, we want to make sure we signal
	 * POLLOUT, such that a subsequent write() can get a
	 * -ECONNREFUSED. Otherwise, if we haven't queued any skbs
	 * to other and it's full, we will hang waiting for POLLOUT.
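	 * (The wait-queue entry registered by unix_dgram_peer_wake_connect()
	 * above is what unix_dgram_peer_wake_relay() later uses to forward
	 * the peer's wakeup back to this socket's own wait queue.)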
523 */ 524 if (unix_recvq_full_lockless(other) && !sock_flag(other, SOCK_DEAD)) 525 return 1; 526 527 if (connected) 528 unix_dgram_peer_wake_disconnect(sk, other); 529 530 return 0; 531 } 532 533 static int unix_writable(const struct sock *sk) 534 { 535 return sk->sk_state != TCP_LISTEN && 536 (refcount_read(&sk->sk_wmem_alloc) << 2) <= sk->sk_sndbuf; 537 } 538 539 static void unix_write_space(struct sock *sk) 540 { 541 struct socket_wq *wq; 542 543 rcu_read_lock(); 544 if (unix_writable(sk)) { 545 wq = rcu_dereference(sk->sk_wq); 546 if (skwq_has_sleeper(wq)) 547 wake_up_interruptible_sync_poll(&wq->wait, 548 EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND); 549 sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); 550 } 551 rcu_read_unlock(); 552 } 553 554 /* When dgram socket disconnects (or changes its peer), we clear its receive 555 * queue of packets arrived from previous peer. First, it allows to do 556 * flow control based only on wmem_alloc; second, sk connected to peer 557 * may receive messages only from that peer. */ 558 static void unix_dgram_disconnected(struct sock *sk, struct sock *other) 559 { 560 if (!skb_queue_empty(&sk->sk_receive_queue)) { 561 skb_queue_purge(&sk->sk_receive_queue); 562 wake_up_interruptible_all(&unix_sk(sk)->peer_wait); 563 564 /* If one link of bidirectional dgram pipe is disconnected, 565 * we signal error. Messages are lost. Do not make this, 566 * when peer was not connected to us. 567 */ 568 if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) { 569 WRITE_ONCE(other->sk_err, ECONNRESET); 570 sk_error_report(other); 571 } 572 } 573 other->sk_state = TCP_CLOSE; 574 } 575 576 static void unix_sock_destructor(struct sock *sk) 577 { 578 struct unix_sock *u = unix_sk(sk); 579 580 skb_queue_purge(&sk->sk_receive_queue); 581 582 DEBUG_NET_WARN_ON_ONCE(refcount_read(&sk->sk_wmem_alloc)); 583 DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk)); 584 DEBUG_NET_WARN_ON_ONCE(sk->sk_socket); 585 if (!sock_flag(sk, SOCK_DEAD)) { 586 pr_info("Attempt to release alive unix socket: %p\n", sk); 587 return; 588 } 589 590 if (u->addr) 591 unix_release_addr(u->addr); 592 593 atomic_long_dec(&unix_nr_socks); 594 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); 595 #ifdef UNIX_REFCNT_DEBUG 596 pr_debug("UNIX %p is destroyed, %ld are still alive.\n", sk, 597 atomic_long_read(&unix_nr_socks)); 598 #endif 599 } 600 601 static void unix_release_sock(struct sock *sk, int embrion) 602 { 603 struct unix_sock *u = unix_sk(sk); 604 struct sock *skpair; 605 struct sk_buff *skb; 606 struct path path; 607 int state; 608 609 unix_remove_socket(sock_net(sk), sk); 610 unix_remove_bsd_socket(sk); 611 612 /* Clear state */ 613 unix_state_lock(sk); 614 sock_orphan(sk); 615 WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK); 616 path = u->path; 617 u->path.dentry = NULL; 618 u->path.mnt = NULL; 619 state = sk->sk_state; 620 sk->sk_state = TCP_CLOSE; 621 622 skpair = unix_peer(sk); 623 unix_peer(sk) = NULL; 624 625 unix_state_unlock(sk); 626 627 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 628 if (u->oob_skb) { 629 kfree_skb(u->oob_skb); 630 u->oob_skb = NULL; 631 } 632 #endif 633 634 wake_up_interruptible_all(&u->peer_wait); 635 636 if (skpair != NULL) { 637 if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) { 638 unix_state_lock(skpair); 639 /* No more writes */ 640 WRITE_ONCE(skpair->sk_shutdown, SHUTDOWN_MASK); 641 if (!skb_queue_empty(&sk->sk_receive_queue) || embrion) 642 WRITE_ONCE(skpair->sk_err, ECONNRESET); 643 unix_state_unlock(skpair); 644 skpair->sk_state_change(skpair); 645 sk_wake_async(skpair, 
SOCK_WAKE_WAITD, POLL_HUP); 646 } 647 648 unix_dgram_peer_wake_disconnect(sk, skpair); 649 sock_put(skpair); /* It may now die */ 650 } 651 652 /* Try to flush out this socket. Throw out buffers at least */ 653 654 while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) { 655 if (state == TCP_LISTEN) 656 unix_release_sock(skb->sk, 1); 657 /* passed fds are erased in the kfree_skb hook */ 658 UNIXCB(skb).consumed = skb->len; 659 kfree_skb(skb); 660 } 661 662 if (path.dentry) 663 path_put(&path); 664 665 sock_put(sk); 666 667 /* ---- Socket is dead now and most probably destroyed ---- */ 668 669 /* 670 * Fixme: BSD difference: In BSD all sockets connected to us get 671 * ECONNRESET and we die on the spot. In Linux we behave 672 * like files and pipes do and wait for the last 673 * dereference. 674 * 675 * Can't we simply set sock->err? 676 * 677 * What the above comment does talk about? --ANK(980817) 678 */ 679 680 if (READ_ONCE(unix_tot_inflight)) 681 unix_gc(); /* Garbage collect fds */ 682 } 683 684 static void init_peercred(struct sock *sk) 685 { 686 const struct cred *old_cred; 687 struct pid *old_pid; 688 689 spin_lock(&sk->sk_peer_lock); 690 old_pid = sk->sk_peer_pid; 691 old_cred = sk->sk_peer_cred; 692 sk->sk_peer_pid = get_pid(task_tgid(current)); 693 sk->sk_peer_cred = get_current_cred(); 694 spin_unlock(&sk->sk_peer_lock); 695 696 put_pid(old_pid); 697 put_cred(old_cred); 698 } 699 700 static void copy_peercred(struct sock *sk, struct sock *peersk) 701 { 702 const struct cred *old_cred; 703 struct pid *old_pid; 704 705 if (sk < peersk) { 706 spin_lock(&sk->sk_peer_lock); 707 spin_lock_nested(&peersk->sk_peer_lock, SINGLE_DEPTH_NESTING); 708 } else { 709 spin_lock(&peersk->sk_peer_lock); 710 spin_lock_nested(&sk->sk_peer_lock, SINGLE_DEPTH_NESTING); 711 } 712 old_pid = sk->sk_peer_pid; 713 old_cred = sk->sk_peer_cred; 714 sk->sk_peer_pid = get_pid(peersk->sk_peer_pid); 715 sk->sk_peer_cred = get_cred(peersk->sk_peer_cred); 716 717 spin_unlock(&sk->sk_peer_lock); 718 spin_unlock(&peersk->sk_peer_lock); 719 720 put_pid(old_pid); 721 put_cred(old_cred); 722 } 723 724 static int unix_listen(struct socket *sock, int backlog) 725 { 726 int err; 727 struct sock *sk = sock->sk; 728 struct unix_sock *u = unix_sk(sk); 729 730 err = -EOPNOTSUPP; 731 if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET) 732 goto out; /* Only stream/seqpacket sockets accept */ 733 err = -EINVAL; 734 if (!u->addr) 735 goto out; /* No listens on an unbound socket */ 736 unix_state_lock(sk); 737 if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN) 738 goto out_unlock; 739 if (backlog > sk->sk_max_ack_backlog) 740 wake_up_interruptible_all(&u->peer_wait); 741 sk->sk_max_ack_backlog = backlog; 742 sk->sk_state = TCP_LISTEN; 743 /* set credentials so connect can copy them */ 744 init_peercred(sk); 745 err = 0; 746 747 out_unlock: 748 unix_state_unlock(sk); 749 out: 750 return err; 751 } 752 753 static int unix_release(struct socket *); 754 static int unix_bind(struct socket *, struct sockaddr *, int); 755 static int unix_stream_connect(struct socket *, struct sockaddr *, 756 int addr_len, int flags); 757 static int unix_socketpair(struct socket *, struct socket *); 758 static int unix_accept(struct socket *, struct socket *, int, bool); 759 static int unix_getname(struct socket *, struct sockaddr *, int); 760 static __poll_t unix_poll(struct file *, struct socket *, poll_table *); 761 static __poll_t unix_dgram_poll(struct file *, struct socket *, 762 poll_table *); 763 static int 
unix_ioctl(struct socket *, unsigned int, unsigned long); 764 #ifdef CONFIG_COMPAT 765 static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg); 766 #endif 767 static int unix_shutdown(struct socket *, int); 768 static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t); 769 static int unix_stream_recvmsg(struct socket *, struct msghdr *, size_t, int); 770 static ssize_t unix_stream_splice_read(struct socket *, loff_t *ppos, 771 struct pipe_inode_info *, size_t size, 772 unsigned int flags); 773 static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t); 774 static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int); 775 static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor); 776 static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor); 777 static int unix_dgram_connect(struct socket *, struct sockaddr *, 778 int, int); 779 static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t); 780 static int unix_seqpacket_recvmsg(struct socket *, struct msghdr *, size_t, 781 int); 782 783 static int unix_set_peek_off(struct sock *sk, int val) 784 { 785 struct unix_sock *u = unix_sk(sk); 786 787 if (mutex_lock_interruptible(&u->iolock)) 788 return -EINTR; 789 790 WRITE_ONCE(sk->sk_peek_off, val); 791 mutex_unlock(&u->iolock); 792 793 return 0; 794 } 795 796 #ifdef CONFIG_PROC_FS 797 static int unix_count_nr_fds(struct sock *sk) 798 { 799 struct sk_buff *skb; 800 struct unix_sock *u; 801 int nr_fds = 0; 802 803 spin_lock(&sk->sk_receive_queue.lock); 804 skb = skb_peek(&sk->sk_receive_queue); 805 while (skb) { 806 u = unix_sk(skb->sk); 807 nr_fds += atomic_read(&u->scm_stat.nr_fds); 808 skb = skb_peek_next(skb, &sk->sk_receive_queue); 809 } 810 spin_unlock(&sk->sk_receive_queue.lock); 811 812 return nr_fds; 813 } 814 815 static void unix_show_fdinfo(struct seq_file *m, struct socket *sock) 816 { 817 struct sock *sk = sock->sk; 818 unsigned char s_state; 819 struct unix_sock *u; 820 int nr_fds = 0; 821 822 if (sk) { 823 s_state = READ_ONCE(sk->sk_state); 824 u = unix_sk(sk); 825 826 /* SOCK_STREAM and SOCK_SEQPACKET sockets never change their 827 * sk_state after switching to TCP_ESTABLISHED or TCP_LISTEN. 828 * SOCK_DGRAM is ordinary. So, no lock is needed. 
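		 * In the TCP_LISTEN case the embryo sockets sit on the
		 * listener's receive queue, so unix_count_nr_fds() walks
		 * that queue under its lock.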
829 */ 830 if (sock->type == SOCK_DGRAM || s_state == TCP_ESTABLISHED) 831 nr_fds = atomic_read(&u->scm_stat.nr_fds); 832 else if (s_state == TCP_LISTEN) 833 nr_fds = unix_count_nr_fds(sk); 834 835 seq_printf(m, "scm_fds: %u\n", nr_fds); 836 } 837 } 838 #else 839 #define unix_show_fdinfo NULL 840 #endif 841 842 static const struct proto_ops unix_stream_ops = { 843 .family = PF_UNIX, 844 .owner = THIS_MODULE, 845 .release = unix_release, 846 .bind = unix_bind, 847 .connect = unix_stream_connect, 848 .socketpair = unix_socketpair, 849 .accept = unix_accept, 850 .getname = unix_getname, 851 .poll = unix_poll, 852 .ioctl = unix_ioctl, 853 #ifdef CONFIG_COMPAT 854 .compat_ioctl = unix_compat_ioctl, 855 #endif 856 .listen = unix_listen, 857 .shutdown = unix_shutdown, 858 .sendmsg = unix_stream_sendmsg, 859 .recvmsg = unix_stream_recvmsg, 860 .read_skb = unix_stream_read_skb, 861 .mmap = sock_no_mmap, 862 .splice_read = unix_stream_splice_read, 863 .set_peek_off = unix_set_peek_off, 864 .show_fdinfo = unix_show_fdinfo, 865 }; 866 867 static const struct proto_ops unix_dgram_ops = { 868 .family = PF_UNIX, 869 .owner = THIS_MODULE, 870 .release = unix_release, 871 .bind = unix_bind, 872 .connect = unix_dgram_connect, 873 .socketpair = unix_socketpair, 874 .accept = sock_no_accept, 875 .getname = unix_getname, 876 .poll = unix_dgram_poll, 877 .ioctl = unix_ioctl, 878 #ifdef CONFIG_COMPAT 879 .compat_ioctl = unix_compat_ioctl, 880 #endif 881 .listen = sock_no_listen, 882 .shutdown = unix_shutdown, 883 .sendmsg = unix_dgram_sendmsg, 884 .read_skb = unix_read_skb, 885 .recvmsg = unix_dgram_recvmsg, 886 .mmap = sock_no_mmap, 887 .set_peek_off = unix_set_peek_off, 888 .show_fdinfo = unix_show_fdinfo, 889 }; 890 891 static const struct proto_ops unix_seqpacket_ops = { 892 .family = PF_UNIX, 893 .owner = THIS_MODULE, 894 .release = unix_release, 895 .bind = unix_bind, 896 .connect = unix_stream_connect, 897 .socketpair = unix_socketpair, 898 .accept = unix_accept, 899 .getname = unix_getname, 900 .poll = unix_dgram_poll, 901 .ioctl = unix_ioctl, 902 #ifdef CONFIG_COMPAT 903 .compat_ioctl = unix_compat_ioctl, 904 #endif 905 .listen = unix_listen, 906 .shutdown = unix_shutdown, 907 .sendmsg = unix_seqpacket_sendmsg, 908 .recvmsg = unix_seqpacket_recvmsg, 909 .mmap = sock_no_mmap, 910 .set_peek_off = unix_set_peek_off, 911 .show_fdinfo = unix_show_fdinfo, 912 }; 913 914 static void unix_close(struct sock *sk, long timeout) 915 { 916 /* Nothing to do here, unix socket does not need a ->close(). 917 * This is merely for sockmap. 918 */ 919 } 920 921 static void unix_unhash(struct sock *sk) 922 { 923 /* Nothing to do here, unix socket does not need a ->unhash(). 924 * This is merely for sockmap. 
925 */ 926 } 927 928 static bool unix_bpf_bypass_getsockopt(int level, int optname) 929 { 930 if (level == SOL_SOCKET) { 931 switch (optname) { 932 case SO_PEERPIDFD: 933 return true; 934 default: 935 return false; 936 } 937 } 938 939 return false; 940 } 941 942 struct proto unix_dgram_proto = { 943 .name = "UNIX", 944 .owner = THIS_MODULE, 945 .obj_size = sizeof(struct unix_sock), 946 .close = unix_close, 947 .bpf_bypass_getsockopt = unix_bpf_bypass_getsockopt, 948 #ifdef CONFIG_BPF_SYSCALL 949 .psock_update_sk_prot = unix_dgram_bpf_update_proto, 950 #endif 951 }; 952 953 struct proto unix_stream_proto = { 954 .name = "UNIX-STREAM", 955 .owner = THIS_MODULE, 956 .obj_size = sizeof(struct unix_sock), 957 .close = unix_close, 958 .unhash = unix_unhash, 959 .bpf_bypass_getsockopt = unix_bpf_bypass_getsockopt, 960 #ifdef CONFIG_BPF_SYSCALL 961 .psock_update_sk_prot = unix_stream_bpf_update_proto, 962 #endif 963 }; 964 965 static struct sock *unix_create1(struct net *net, struct socket *sock, int kern, int type) 966 { 967 struct unix_sock *u; 968 struct sock *sk; 969 int err; 970 971 atomic_long_inc(&unix_nr_socks); 972 if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files()) { 973 err = -ENFILE; 974 goto err; 975 } 976 977 if (type == SOCK_STREAM) 978 sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_stream_proto, kern); 979 else /*dgram and seqpacket */ 980 sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_dgram_proto, kern); 981 982 if (!sk) { 983 err = -ENOMEM; 984 goto err; 985 } 986 987 sock_init_data(sock, sk); 988 989 sk->sk_hash = unix_unbound_hash(sk); 990 sk->sk_allocation = GFP_KERNEL_ACCOUNT; 991 sk->sk_write_space = unix_write_space; 992 sk->sk_max_ack_backlog = net->unx.sysctl_max_dgram_qlen; 993 sk->sk_destruct = unix_sock_destructor; 994 u = unix_sk(sk); 995 u->inflight = 0; 996 u->path.dentry = NULL; 997 u->path.mnt = NULL; 998 spin_lock_init(&u->lock); 999 INIT_LIST_HEAD(&u->link); 1000 mutex_init(&u->iolock); /* single task reading lock */ 1001 mutex_init(&u->bindlock); /* single task binding lock */ 1002 init_waitqueue_head(&u->peer_wait); 1003 init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay); 1004 memset(&u->scm_stat, 0, sizeof(struct scm_stat)); 1005 unix_insert_unbound_socket(net, sk); 1006 1007 sock_prot_inuse_add(net, sk->sk_prot, 1); 1008 1009 return sk; 1010 1011 err: 1012 atomic_long_dec(&unix_nr_socks); 1013 return ERR_PTR(err); 1014 } 1015 1016 static int unix_create(struct net *net, struct socket *sock, int protocol, 1017 int kern) 1018 { 1019 struct sock *sk; 1020 1021 if (protocol && protocol != PF_UNIX) 1022 return -EPROTONOSUPPORT; 1023 1024 sock->state = SS_UNCONNECTED; 1025 1026 switch (sock->type) { 1027 case SOCK_STREAM: 1028 sock->ops = &unix_stream_ops; 1029 break; 1030 /* 1031 * Believe it or not BSD has AF_UNIX, SOCK_RAW though 1032 * nothing uses it. 
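	 *	We accept it and quietly map it to SOCK_DGRAM below.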
1033 */ 1034 case SOCK_RAW: 1035 sock->type = SOCK_DGRAM; 1036 fallthrough; 1037 case SOCK_DGRAM: 1038 sock->ops = &unix_dgram_ops; 1039 break; 1040 case SOCK_SEQPACKET: 1041 sock->ops = &unix_seqpacket_ops; 1042 break; 1043 default: 1044 return -ESOCKTNOSUPPORT; 1045 } 1046 1047 sk = unix_create1(net, sock, kern, sock->type); 1048 if (IS_ERR(sk)) 1049 return PTR_ERR(sk); 1050 1051 return 0; 1052 } 1053 1054 static int unix_release(struct socket *sock) 1055 { 1056 struct sock *sk = sock->sk; 1057 1058 if (!sk) 1059 return 0; 1060 1061 sk->sk_prot->close(sk, 0); 1062 unix_release_sock(sk, 0); 1063 sock->sk = NULL; 1064 1065 return 0; 1066 } 1067 1068 static struct sock *unix_find_bsd(struct sockaddr_un *sunaddr, int addr_len, 1069 int type) 1070 { 1071 struct inode *inode; 1072 struct path path; 1073 struct sock *sk; 1074 int err; 1075 1076 unix_mkname_bsd(sunaddr, addr_len); 1077 err = kern_path(sunaddr->sun_path, LOOKUP_FOLLOW, &path); 1078 if (err) 1079 goto fail; 1080 1081 err = path_permission(&path, MAY_WRITE); 1082 if (err) 1083 goto path_put; 1084 1085 err = -ECONNREFUSED; 1086 inode = d_backing_inode(path.dentry); 1087 if (!S_ISSOCK(inode->i_mode)) 1088 goto path_put; 1089 1090 sk = unix_find_socket_byinode(inode); 1091 if (!sk) 1092 goto path_put; 1093 1094 err = -EPROTOTYPE; 1095 if (sk->sk_type == type) 1096 touch_atime(&path); 1097 else 1098 goto sock_put; 1099 1100 path_put(&path); 1101 1102 return sk; 1103 1104 sock_put: 1105 sock_put(sk); 1106 path_put: 1107 path_put(&path); 1108 fail: 1109 return ERR_PTR(err); 1110 } 1111 1112 static struct sock *unix_find_abstract(struct net *net, 1113 struct sockaddr_un *sunaddr, 1114 int addr_len, int type) 1115 { 1116 unsigned int hash = unix_abstract_hash(sunaddr, addr_len, type); 1117 struct dentry *dentry; 1118 struct sock *sk; 1119 1120 sk = unix_find_socket_byname(net, sunaddr, addr_len, hash); 1121 if (!sk) 1122 return ERR_PTR(-ECONNREFUSED); 1123 1124 dentry = unix_sk(sk)->path.dentry; 1125 if (dentry) 1126 touch_atime(&unix_sk(sk)->path); 1127 1128 return sk; 1129 } 1130 1131 static struct sock *unix_find_other(struct net *net, 1132 struct sockaddr_un *sunaddr, 1133 int addr_len, int type) 1134 { 1135 struct sock *sk; 1136 1137 if (sunaddr->sun_path[0]) 1138 sk = unix_find_bsd(sunaddr, addr_len, type); 1139 else 1140 sk = unix_find_abstract(net, sunaddr, addr_len, type); 1141 1142 return sk; 1143 } 1144 1145 static int unix_autobind(struct sock *sk) 1146 { 1147 unsigned int new_hash, old_hash = sk->sk_hash; 1148 struct unix_sock *u = unix_sk(sk); 1149 struct net *net = sock_net(sk); 1150 struct unix_address *addr; 1151 u32 lastnum, ordernum; 1152 int err; 1153 1154 err = mutex_lock_interruptible(&u->bindlock); 1155 if (err) 1156 return err; 1157 1158 if (u->addr) 1159 goto out; 1160 1161 err = -ENOMEM; 1162 addr = kzalloc(sizeof(*addr) + 1163 offsetof(struct sockaddr_un, sun_path) + 16, GFP_KERNEL); 1164 if (!addr) 1165 goto out; 1166 1167 addr->len = offsetof(struct sockaddr_un, sun_path) + 6; 1168 addr->name->sun_family = AF_UNIX; 1169 refcount_set(&addr->refcnt, 1); 1170 1171 ordernum = get_random_u32(); 1172 lastnum = ordernum & 0xFFFFF; 1173 retry: 1174 ordernum = (ordernum + 1) & 0xFFFFF; 1175 sprintf(addr->name->sun_path + 1, "%05x", ordernum); 1176 1177 new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type); 1178 unix_table_double_lock(net, old_hash, new_hash); 1179 1180 if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash)) { 1181 unix_table_double_unlock(net, old_hash, new_hash); 1182 1183 /* 
__unix_find_socket_byname() may take long time if many names 1184 * are already in use. 1185 */ 1186 cond_resched(); 1187 1188 if (ordernum == lastnum) { 1189 /* Give up if all names seems to be in use. */ 1190 err = -ENOSPC; 1191 unix_release_addr(addr); 1192 goto out; 1193 } 1194 1195 goto retry; 1196 } 1197 1198 __unix_set_addr_hash(net, sk, addr, new_hash); 1199 unix_table_double_unlock(net, old_hash, new_hash); 1200 err = 0; 1201 1202 out: mutex_unlock(&u->bindlock); 1203 return err; 1204 } 1205 1206 static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr, 1207 int addr_len) 1208 { 1209 umode_t mode = S_IFSOCK | 1210 (SOCK_INODE(sk->sk_socket)->i_mode & ~current_umask()); 1211 unsigned int new_hash, old_hash = sk->sk_hash; 1212 struct unix_sock *u = unix_sk(sk); 1213 struct net *net = sock_net(sk); 1214 struct mnt_idmap *idmap; 1215 struct unix_address *addr; 1216 struct dentry *dentry; 1217 struct path parent; 1218 int err; 1219 1220 addr_len = unix_mkname_bsd(sunaddr, addr_len); 1221 addr = unix_create_addr(sunaddr, addr_len); 1222 if (!addr) 1223 return -ENOMEM; 1224 1225 /* 1226 * Get the parent directory, calculate the hash for last 1227 * component. 1228 */ 1229 dentry = kern_path_create(AT_FDCWD, addr->name->sun_path, &parent, 0); 1230 if (IS_ERR(dentry)) { 1231 err = PTR_ERR(dentry); 1232 goto out; 1233 } 1234 1235 /* 1236 * All right, let's create it. 1237 */ 1238 idmap = mnt_idmap(parent.mnt); 1239 err = security_path_mknod(&parent, dentry, mode, 0); 1240 if (!err) 1241 err = vfs_mknod(idmap, d_inode(parent.dentry), dentry, mode, 0); 1242 if (err) 1243 goto out_path; 1244 err = mutex_lock_interruptible(&u->bindlock); 1245 if (err) 1246 goto out_unlink; 1247 if (u->addr) 1248 goto out_unlock; 1249 1250 new_hash = unix_bsd_hash(d_backing_inode(dentry)); 1251 unix_table_double_lock(net, old_hash, new_hash); 1252 u->path.mnt = mntget(parent.mnt); 1253 u->path.dentry = dget(dentry); 1254 __unix_set_addr_hash(net, sk, addr, new_hash); 1255 unix_table_double_unlock(net, old_hash, new_hash); 1256 unix_insert_bsd_socket(sk); 1257 mutex_unlock(&u->bindlock); 1258 done_path_create(&parent, dentry); 1259 return 0; 1260 1261 out_unlock: 1262 mutex_unlock(&u->bindlock); 1263 err = -EINVAL; 1264 out_unlink: 1265 /* failed after successful mknod? unlink what we'd created... */ 1266 vfs_unlink(idmap, d_inode(parent.dentry), dentry, NULL); 1267 out_path: 1268 done_path_create(&parent, dentry); 1269 out: 1270 unix_release_addr(addr); 1271 return err == -EEXIST ? 
-EADDRINUSE : err; 1272 } 1273 1274 static int unix_bind_abstract(struct sock *sk, struct sockaddr_un *sunaddr, 1275 int addr_len) 1276 { 1277 unsigned int new_hash, old_hash = sk->sk_hash; 1278 struct unix_sock *u = unix_sk(sk); 1279 struct net *net = sock_net(sk); 1280 struct unix_address *addr; 1281 int err; 1282 1283 addr = unix_create_addr(sunaddr, addr_len); 1284 if (!addr) 1285 return -ENOMEM; 1286 1287 err = mutex_lock_interruptible(&u->bindlock); 1288 if (err) 1289 goto out; 1290 1291 if (u->addr) { 1292 err = -EINVAL; 1293 goto out_mutex; 1294 } 1295 1296 new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type); 1297 unix_table_double_lock(net, old_hash, new_hash); 1298 1299 if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash)) 1300 goto out_spin; 1301 1302 __unix_set_addr_hash(net, sk, addr, new_hash); 1303 unix_table_double_unlock(net, old_hash, new_hash); 1304 mutex_unlock(&u->bindlock); 1305 return 0; 1306 1307 out_spin: 1308 unix_table_double_unlock(net, old_hash, new_hash); 1309 err = -EADDRINUSE; 1310 out_mutex: 1311 mutex_unlock(&u->bindlock); 1312 out: 1313 unix_release_addr(addr); 1314 return err; 1315 } 1316 1317 static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) 1318 { 1319 struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr; 1320 struct sock *sk = sock->sk; 1321 int err; 1322 1323 if (addr_len == offsetof(struct sockaddr_un, sun_path) && 1324 sunaddr->sun_family == AF_UNIX) 1325 return unix_autobind(sk); 1326 1327 err = unix_validate_addr(sunaddr, addr_len); 1328 if (err) 1329 return err; 1330 1331 if (sunaddr->sun_path[0]) 1332 err = unix_bind_bsd(sk, sunaddr, addr_len); 1333 else 1334 err = unix_bind_abstract(sk, sunaddr, addr_len); 1335 1336 return err; 1337 } 1338 1339 static void unix_state_double_lock(struct sock *sk1, struct sock *sk2) 1340 { 1341 if (unlikely(sk1 == sk2) || !sk2) { 1342 unix_state_lock(sk1); 1343 return; 1344 } 1345 if (sk1 < sk2) { 1346 unix_state_lock(sk1); 1347 unix_state_lock_nested(sk2); 1348 } else { 1349 unix_state_lock(sk2); 1350 unix_state_lock_nested(sk1); 1351 } 1352 } 1353 1354 static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2) 1355 { 1356 if (unlikely(sk1 == sk2) || !sk2) { 1357 unix_state_unlock(sk1); 1358 return; 1359 } 1360 unix_state_unlock(sk1); 1361 unix_state_unlock(sk2); 1362 } 1363 1364 static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr, 1365 int alen, int flags) 1366 { 1367 struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr; 1368 struct sock *sk = sock->sk; 1369 struct sock *other; 1370 int err; 1371 1372 err = -EINVAL; 1373 if (alen < offsetofend(struct sockaddr, sa_family)) 1374 goto out; 1375 1376 if (addr->sa_family != AF_UNSPEC) { 1377 err = unix_validate_addr(sunaddr, alen); 1378 if (err) 1379 goto out; 1380 1381 err = BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk, addr, &alen); 1382 if (err) 1383 goto out; 1384 1385 if ((test_bit(SOCK_PASSCRED, &sock->flags) || 1386 test_bit(SOCK_PASSPIDFD, &sock->flags)) && 1387 !unix_sk(sk)->addr) { 1388 err = unix_autobind(sk); 1389 if (err) 1390 goto out; 1391 } 1392 1393 restart: 1394 other = unix_find_other(sock_net(sk), sunaddr, alen, sock->type); 1395 if (IS_ERR(other)) { 1396 err = PTR_ERR(other); 1397 goto out; 1398 } 1399 1400 unix_state_double_lock(sk, other); 1401 1402 /* Apparently VFS overslept socket death. Retry. 
*/ 1403 if (sock_flag(other, SOCK_DEAD)) { 1404 unix_state_double_unlock(sk, other); 1405 sock_put(other); 1406 goto restart; 1407 } 1408 1409 err = -EPERM; 1410 if (!unix_may_send(sk, other)) 1411 goto out_unlock; 1412 1413 err = security_unix_may_send(sk->sk_socket, other->sk_socket); 1414 if (err) 1415 goto out_unlock; 1416 1417 sk->sk_state = other->sk_state = TCP_ESTABLISHED; 1418 } else { 1419 /* 1420 * 1003.1g breaking connected state with AF_UNSPEC 1421 */ 1422 other = NULL; 1423 unix_state_double_lock(sk, other); 1424 } 1425 1426 /* 1427 * If it was connected, reconnect. 1428 */ 1429 if (unix_peer(sk)) { 1430 struct sock *old_peer = unix_peer(sk); 1431 1432 unix_peer(sk) = other; 1433 if (!other) 1434 sk->sk_state = TCP_CLOSE; 1435 unix_dgram_peer_wake_disconnect_wakeup(sk, old_peer); 1436 1437 unix_state_double_unlock(sk, other); 1438 1439 if (other != old_peer) 1440 unix_dgram_disconnected(sk, old_peer); 1441 sock_put(old_peer); 1442 } else { 1443 unix_peer(sk) = other; 1444 unix_state_double_unlock(sk, other); 1445 } 1446 1447 return 0; 1448 1449 out_unlock: 1450 unix_state_double_unlock(sk, other); 1451 sock_put(other); 1452 out: 1453 return err; 1454 } 1455 1456 static long unix_wait_for_peer(struct sock *other, long timeo) 1457 __releases(&unix_sk(other)->lock) 1458 { 1459 struct unix_sock *u = unix_sk(other); 1460 int sched; 1461 DEFINE_WAIT(wait); 1462 1463 prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE); 1464 1465 sched = !sock_flag(other, SOCK_DEAD) && 1466 !(other->sk_shutdown & RCV_SHUTDOWN) && 1467 unix_recvq_full_lockless(other); 1468 1469 unix_state_unlock(other); 1470 1471 if (sched) 1472 timeo = schedule_timeout(timeo); 1473 1474 finish_wait(&u->peer_wait, &wait); 1475 return timeo; 1476 } 1477 1478 static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr, 1479 int addr_len, int flags) 1480 { 1481 struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr; 1482 struct sock *sk = sock->sk, *newsk = NULL, *other = NULL; 1483 struct unix_sock *u = unix_sk(sk), *newu, *otheru; 1484 struct net *net = sock_net(sk); 1485 struct sk_buff *skb = NULL; 1486 long timeo; 1487 int err; 1488 int st; 1489 1490 err = unix_validate_addr(sunaddr, addr_len); 1491 if (err) 1492 goto out; 1493 1494 err = BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk, uaddr, &addr_len); 1495 if (err) 1496 goto out; 1497 1498 if ((test_bit(SOCK_PASSCRED, &sock->flags) || 1499 test_bit(SOCK_PASSPIDFD, &sock->flags)) && !u->addr) { 1500 err = unix_autobind(sk); 1501 if (err) 1502 goto out; 1503 } 1504 1505 timeo = sock_sndtimeo(sk, flags & O_NONBLOCK); 1506 1507 /* First of all allocate resources. 1508 If we will make it after state is locked, 1509 we will have to recheck all again in any case. 1510 */ 1511 1512 /* create new sock for complete connection */ 1513 newsk = unix_create1(net, NULL, 0, sock->type); 1514 if (IS_ERR(newsk)) { 1515 err = PTR_ERR(newsk); 1516 newsk = NULL; 1517 goto out; 1518 } 1519 1520 err = -ENOMEM; 1521 1522 /* Allocate skb for sending to listening sock */ 1523 skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL); 1524 if (skb == NULL) 1525 goto out; 1526 1527 restart: 1528 /* Find listening sock. */ 1529 other = unix_find_other(net, sunaddr, addr_len, sk->sk_type); 1530 if (IS_ERR(other)) { 1531 err = PTR_ERR(other); 1532 other = NULL; 1533 goto out; 1534 } 1535 1536 /* Latch state of peer */ 1537 unix_state_lock(other); 1538 1539 /* Apparently VFS overslept socket death. Retry. 
*/ 1540 if (sock_flag(other, SOCK_DEAD)) { 1541 unix_state_unlock(other); 1542 sock_put(other); 1543 goto restart; 1544 } 1545 1546 err = -ECONNREFUSED; 1547 if (other->sk_state != TCP_LISTEN) 1548 goto out_unlock; 1549 if (other->sk_shutdown & RCV_SHUTDOWN) 1550 goto out_unlock; 1551 1552 if (unix_recvq_full(other)) { 1553 err = -EAGAIN; 1554 if (!timeo) 1555 goto out_unlock; 1556 1557 timeo = unix_wait_for_peer(other, timeo); 1558 1559 err = sock_intr_errno(timeo); 1560 if (signal_pending(current)) 1561 goto out; 1562 sock_put(other); 1563 goto restart; 1564 } 1565 1566 /* Latch our state. 1567 1568 It is tricky place. We need to grab our state lock and cannot 1569 drop lock on peer. It is dangerous because deadlock is 1570 possible. Connect to self case and simultaneous 1571 attempt to connect are eliminated by checking socket 1572 state. other is TCP_LISTEN, if sk is TCP_LISTEN we 1573 check this before attempt to grab lock. 1574 1575 Well, and we have to recheck the state after socket locked. 1576 */ 1577 st = sk->sk_state; 1578 1579 switch (st) { 1580 case TCP_CLOSE: 1581 /* This is ok... continue with connect */ 1582 break; 1583 case TCP_ESTABLISHED: 1584 /* Socket is already connected */ 1585 err = -EISCONN; 1586 goto out_unlock; 1587 default: 1588 err = -EINVAL; 1589 goto out_unlock; 1590 } 1591 1592 unix_state_lock_nested(sk); 1593 1594 if (sk->sk_state != st) { 1595 unix_state_unlock(sk); 1596 unix_state_unlock(other); 1597 sock_put(other); 1598 goto restart; 1599 } 1600 1601 err = security_unix_stream_connect(sk, other, newsk); 1602 if (err) { 1603 unix_state_unlock(sk); 1604 goto out_unlock; 1605 } 1606 1607 /* The way is open! Fastly set all the necessary fields... */ 1608 1609 sock_hold(sk); 1610 unix_peer(newsk) = sk; 1611 newsk->sk_state = TCP_ESTABLISHED; 1612 newsk->sk_type = sk->sk_type; 1613 init_peercred(newsk); 1614 newu = unix_sk(newsk); 1615 RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq); 1616 otheru = unix_sk(other); 1617 1618 /* copy address information from listening to new sock 1619 * 1620 * The contents of *(otheru->addr) and otheru->path 1621 * are seen fully set up here, since we have found 1622 * otheru in hash under its lock. Insertion into the 1623 * hash chain we'd found it in had been done in an 1624 * earlier critical area protected by the chain's lock, 1625 * the same one where we'd set *(otheru->addr) contents, 1626 * as well as otheru->path and otheru->addr itself. 1627 * 1628 * Using smp_store_release() here to set newu->addr 1629 * is enough to make those stores, as well as stores 1630 * to newu->path visible to anyone who gets newu->addr 1631 * by smp_load_acquire(). IOW, the same warranties 1632 * as for unix_sock instances bound in unix_bind() or 1633 * in unix_autobind(). 
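	 * (unix_getname(), for example, pairs with this via
	 * smp_load_acquire(&unix_sk(sk)->addr).)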
1634 */ 1635 if (otheru->path.dentry) { 1636 path_get(&otheru->path); 1637 newu->path = otheru->path; 1638 } 1639 refcount_inc(&otheru->addr->refcnt); 1640 smp_store_release(&newu->addr, otheru->addr); 1641 1642 /* Set credentials */ 1643 copy_peercred(sk, other); 1644 1645 sock->state = SS_CONNECTED; 1646 sk->sk_state = TCP_ESTABLISHED; 1647 sock_hold(newsk); 1648 1649 smp_mb__after_atomic(); /* sock_hold() does an atomic_inc() */ 1650 unix_peer(sk) = newsk; 1651 1652 unix_state_unlock(sk); 1653 1654 /* take ten and send info to listening sock */ 1655 spin_lock(&other->sk_receive_queue.lock); 1656 __skb_queue_tail(&other->sk_receive_queue, skb); 1657 spin_unlock(&other->sk_receive_queue.lock); 1658 unix_state_unlock(other); 1659 other->sk_data_ready(other); 1660 sock_put(other); 1661 return 0; 1662 1663 out_unlock: 1664 if (other) 1665 unix_state_unlock(other); 1666 1667 out: 1668 kfree_skb(skb); 1669 if (newsk) 1670 unix_release_sock(newsk, 0); 1671 if (other) 1672 sock_put(other); 1673 return err; 1674 } 1675 1676 static int unix_socketpair(struct socket *socka, struct socket *sockb) 1677 { 1678 struct sock *ska = socka->sk, *skb = sockb->sk; 1679 1680 /* Join our sockets back to back */ 1681 sock_hold(ska); 1682 sock_hold(skb); 1683 unix_peer(ska) = skb; 1684 unix_peer(skb) = ska; 1685 init_peercred(ska); 1686 init_peercred(skb); 1687 1688 ska->sk_state = TCP_ESTABLISHED; 1689 skb->sk_state = TCP_ESTABLISHED; 1690 socka->state = SS_CONNECTED; 1691 sockb->state = SS_CONNECTED; 1692 return 0; 1693 } 1694 1695 static void unix_sock_inherit_flags(const struct socket *old, 1696 struct socket *new) 1697 { 1698 if (test_bit(SOCK_PASSCRED, &old->flags)) 1699 set_bit(SOCK_PASSCRED, &new->flags); 1700 if (test_bit(SOCK_PASSPIDFD, &old->flags)) 1701 set_bit(SOCK_PASSPIDFD, &new->flags); 1702 if (test_bit(SOCK_PASSSEC, &old->flags)) 1703 set_bit(SOCK_PASSSEC, &new->flags); 1704 } 1705 1706 static int unix_accept(struct socket *sock, struct socket *newsock, int flags, 1707 bool kern) 1708 { 1709 struct sock *sk = sock->sk; 1710 struct sock *tsk; 1711 struct sk_buff *skb; 1712 int err; 1713 1714 err = -EOPNOTSUPP; 1715 if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET) 1716 goto out; 1717 1718 err = -EINVAL; 1719 if (sk->sk_state != TCP_LISTEN) 1720 goto out; 1721 1722 /* If socket state is TCP_LISTEN it cannot change (for now...), 1723 * so that no locks are necessary. 1724 */ 1725 1726 skb = skb_recv_datagram(sk, (flags & O_NONBLOCK) ? MSG_DONTWAIT : 0, 1727 &err); 1728 if (!skb) { 1729 /* This means receive shutdown. 
*/ 1730 if (err == 0) 1731 err = -EINVAL; 1732 goto out; 1733 } 1734 1735 tsk = skb->sk; 1736 skb_free_datagram(sk, skb); 1737 wake_up_interruptible(&unix_sk(sk)->peer_wait); 1738 1739 /* attach accepted sock to socket */ 1740 unix_state_lock(tsk); 1741 newsock->state = SS_CONNECTED; 1742 unix_sock_inherit_flags(sock, newsock); 1743 sock_graft(tsk, newsock); 1744 unix_state_unlock(tsk); 1745 return 0; 1746 1747 out: 1748 return err; 1749 } 1750 1751 1752 static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int peer) 1753 { 1754 struct sock *sk = sock->sk; 1755 struct unix_address *addr; 1756 DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr); 1757 int err = 0; 1758 1759 if (peer) { 1760 sk = unix_peer_get(sk); 1761 1762 err = -ENOTCONN; 1763 if (!sk) 1764 goto out; 1765 err = 0; 1766 } else { 1767 sock_hold(sk); 1768 } 1769 1770 addr = smp_load_acquire(&unix_sk(sk)->addr); 1771 if (!addr) { 1772 sunaddr->sun_family = AF_UNIX; 1773 sunaddr->sun_path[0] = 0; 1774 err = offsetof(struct sockaddr_un, sun_path); 1775 } else { 1776 err = addr->len; 1777 memcpy(sunaddr, addr->name, addr->len); 1778 1779 if (peer) 1780 BPF_CGROUP_RUN_SA_PROG(sk, uaddr, &err, 1781 CGROUP_UNIX_GETPEERNAME); 1782 else 1783 BPF_CGROUP_RUN_SA_PROG(sk, uaddr, &err, 1784 CGROUP_UNIX_GETSOCKNAME); 1785 } 1786 sock_put(sk); 1787 out: 1788 return err; 1789 } 1790 1791 /* The "user->unix_inflight" variable is protected by the garbage 1792 * collection lock, and we just read it locklessly here. If you go 1793 * over the limit, there might be a tiny race in actually noticing 1794 * it across threads. Tough. 1795 */ 1796 static inline bool too_many_unix_fds(struct task_struct *p) 1797 { 1798 struct user_struct *user = current_user(); 1799 1800 if (unlikely(READ_ONCE(user->unix_inflight) > task_rlimit(p, RLIMIT_NOFILE))) 1801 return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN); 1802 return false; 1803 } 1804 1805 static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb) 1806 { 1807 int i; 1808 1809 if (too_many_unix_fds(current)) 1810 return -ETOOMANYREFS; 1811 1812 /* Need to duplicate file references for the sake of garbage 1813 * collection. Otherwise a socket in the fps might become a 1814 * candidate for GC while the skb is not yet queued. 1815 */ 1816 UNIXCB(skb).fp = scm_fp_dup(scm->fp); 1817 if (!UNIXCB(skb).fp) 1818 return -ENOMEM; 1819 1820 for (i = scm->fp->count - 1; i >= 0; i--) 1821 unix_inflight(scm->fp->user, scm->fp->fp[i]); 1822 1823 return 0; 1824 } 1825 1826 static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb) 1827 { 1828 int i; 1829 1830 scm->fp = UNIXCB(skb).fp; 1831 UNIXCB(skb).fp = NULL; 1832 1833 for (i = scm->fp->count - 1; i >= 0; i--) 1834 unix_notinflight(scm->fp->user, scm->fp->fp[i]); 1835 } 1836 1837 static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb) 1838 { 1839 scm->fp = scm_fp_dup(UNIXCB(skb).fp); 1840 1841 /* 1842 * Garbage collection of unix sockets starts by selecting a set of 1843 * candidate sockets which have reference only from being in flight 1844 * (total_refs == inflight_refs). This condition is checked once during 1845 * the candidate collection phase, and candidates are marked as such, so 1846 * that non-candidates can later be ignored. While inflight_refs is 1847 * protected by unix_gc_lock, total_refs (file count) is not, hence this 1848 * is an instantaneous decision. 
	 *
	 * Once a candidate, however, the socket must not be reinstalled into a
	 * file descriptor while the garbage collection is in progress.
	 *
	 * If the above conditions are met, then the directed graph of
	 * candidates (*) does not change while unix_gc_lock is held.
	 *
	 * Any operation that changes the file count through file descriptors
	 * (dup, close, sendmsg) does not change the graph since candidates are
	 * not installed in fds.
	 *
	 * Dequeuing a candidate via recvmsg would install it into an fd, but
	 * that takes unix_gc_lock to decrement the inflight count, so it's
	 * serialized with garbage collection.
	 *
	 * MSG_PEEK is special in that it does not change the inflight count,
	 * yet does install the socket into an fd. The following lock/unlock
	 * pair is to ensure serialization with garbage collection. It must be
	 * done between incrementing the file count and installing the file into
	 * an fd.
	 *
	 * If garbage collection starts after the barrier provided by the
	 * lock/unlock, then it will see the elevated refcount and not mark this
	 * as a candidate. If a garbage collection is already in progress
	 * before the file count was incremented, then the lock/unlock pair will
	 * ensure that garbage collection is finished before progressing to
	 * installing the fd.
	 *
	 * (*) A -> B where B is on the queue of A or B is on the queue of C
	 * which is on the queue of listening socket A.
	 */
	spin_lock(&unix_gc_lock);
	spin_unlock(&unix_gc_lock);
}

static void unix_destruct_scm(struct sk_buff *skb)
{
	struct scm_cookie scm;

	memset(&scm, 0, sizeof(scm));
	scm.pid = UNIXCB(skb).pid;
	if (UNIXCB(skb).fp)
		unix_detach_fds(&scm, skb);

	/* Alas, it calls VFS */
	/* So fscking what? fput() had been SMP-safe since the last Summer */
	scm_destroy(&scm);
	sock_wfree(skb);
}

static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
{
	int err = 0;

	UNIXCB(skb).pid = get_pid(scm->pid);
	UNIXCB(skb).uid = scm->creds.uid;
	UNIXCB(skb).gid = scm->creds.gid;
	UNIXCB(skb).fp = NULL;
	unix_get_secdata(scm, skb);
	if (scm->fp && send_fds)
		err = unix_attach_fds(scm, skb);

	skb->destructor = unix_destruct_scm;
	return err;
}

static bool unix_passcred_enabled(const struct socket *sock,
				  const struct sock *other)
{
	return test_bit(SOCK_PASSCRED, &sock->flags) ||
	       test_bit(SOCK_PASSPIDFD, &sock->flags) ||
	       !other->sk_socket ||
	       test_bit(SOCK_PASSCRED, &other->sk_socket->flags) ||
	       test_bit(SOCK_PASSPIDFD, &other->sk_socket->flags);
}

/*
 * Some apps rely on write() giving SCM_CREDENTIALS.
 * We include credentials if the source or destination socket
 * asserted SOCK_PASSCRED.
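 * (unix_passcred_enabled() above also honors SOCK_PASSPIDFD.)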
1929 */ 1930 static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock, 1931 const struct sock *other) 1932 { 1933 if (UNIXCB(skb).pid) 1934 return; 1935 if (unix_passcred_enabled(sock, other)) { 1936 UNIXCB(skb).pid = get_pid(task_tgid(current)); 1937 current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid); 1938 } 1939 } 1940 1941 static bool unix_skb_scm_eq(struct sk_buff *skb, 1942 struct scm_cookie *scm) 1943 { 1944 return UNIXCB(skb).pid == scm->pid && 1945 uid_eq(UNIXCB(skb).uid, scm->creds.uid) && 1946 gid_eq(UNIXCB(skb).gid, scm->creds.gid) && 1947 unix_secdata_eq(scm, skb); 1948 } 1949 1950 static void scm_stat_add(struct sock *sk, struct sk_buff *skb) 1951 { 1952 struct scm_fp_list *fp = UNIXCB(skb).fp; 1953 struct unix_sock *u = unix_sk(sk); 1954 1955 if (unlikely(fp && fp->count)) 1956 atomic_add(fp->count, &u->scm_stat.nr_fds); 1957 } 1958 1959 static void scm_stat_del(struct sock *sk, struct sk_buff *skb) 1960 { 1961 struct scm_fp_list *fp = UNIXCB(skb).fp; 1962 struct unix_sock *u = unix_sk(sk); 1963 1964 if (unlikely(fp && fp->count)) 1965 atomic_sub(fp->count, &u->scm_stat.nr_fds); 1966 } 1967 1968 /* 1969 * Send AF_UNIX data. 1970 */ 1971 1972 static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg, 1973 size_t len) 1974 { 1975 DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name); 1976 struct sock *sk = sock->sk, *other = NULL; 1977 struct unix_sock *u = unix_sk(sk); 1978 struct scm_cookie scm; 1979 struct sk_buff *skb; 1980 int data_len = 0; 1981 int sk_locked; 1982 long timeo; 1983 int err; 1984 1985 err = scm_send(sock, msg, &scm, false); 1986 if (err < 0) 1987 return err; 1988 1989 wait_for_unix_gc(scm.fp); 1990 1991 err = -EOPNOTSUPP; 1992 if (msg->msg_flags&MSG_OOB) 1993 goto out; 1994 1995 if (msg->msg_namelen) { 1996 err = unix_validate_addr(sunaddr, msg->msg_namelen); 1997 if (err) 1998 goto out; 1999 2000 err = BPF_CGROUP_RUN_PROG_UNIX_SENDMSG_LOCK(sk, 2001 msg->msg_name, 2002 &msg->msg_namelen, 2003 NULL); 2004 if (err) 2005 goto out; 2006 } else { 2007 sunaddr = NULL; 2008 err = -ENOTCONN; 2009 other = unix_peer_get(sk); 2010 if (!other) 2011 goto out; 2012 } 2013 2014 if ((test_bit(SOCK_PASSCRED, &sock->flags) || 2015 test_bit(SOCK_PASSPIDFD, &sock->flags)) && !u->addr) { 2016 err = unix_autobind(sk); 2017 if (err) 2018 goto out; 2019 } 2020 2021 err = -EMSGSIZE; 2022 if (len > sk->sk_sndbuf - 32) 2023 goto out; 2024 2025 if (len > SKB_MAX_ALLOC) { 2026 data_len = min_t(size_t, 2027 len - SKB_MAX_ALLOC, 2028 MAX_SKB_FRAGS * PAGE_SIZE); 2029 data_len = PAGE_ALIGN(data_len); 2030 2031 BUILD_BUG_ON(SKB_MAX_ALLOC < PAGE_SIZE); 2032 } 2033 2034 skb = sock_alloc_send_pskb(sk, len - data_len, data_len, 2035 msg->msg_flags & MSG_DONTWAIT, &err, 2036 PAGE_ALLOC_COSTLY_ORDER); 2037 if (skb == NULL) 2038 goto out; 2039 2040 err = unix_scm_to_skb(&scm, skb, true); 2041 if (err < 0) 2042 goto out_free; 2043 2044 skb_put(skb, len - data_len); 2045 skb->data_len = data_len; 2046 skb->len = len; 2047 err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, len); 2048 if (err) 2049 goto out_free; 2050 2051 timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); 2052 2053 restart: 2054 if (!other) { 2055 err = -ECONNRESET; 2056 if (sunaddr == NULL) 2057 goto out_free; 2058 2059 other = unix_find_other(sock_net(sk), sunaddr, msg->msg_namelen, 2060 sk->sk_type); 2061 if (IS_ERR(other)) { 2062 err = PTR_ERR(other); 2063 other = NULL; 2064 goto out_free; 2065 } 2066 } 2067 2068 if (sk_filter(other, skb) < 0) { 2069 /* Toss the packet but do 
not return any error to the sender */ 2070 err = len; 2071 goto out_free; 2072 } 2073 2074 sk_locked = 0; 2075 unix_state_lock(other); 2076 restart_locked: 2077 err = -EPERM; 2078 if (!unix_may_send(sk, other)) 2079 goto out_unlock; 2080 2081 if (unlikely(sock_flag(other, SOCK_DEAD))) { 2082 /* 2083 * Check with 1003.1g - what should 2084 * datagram error 2085 */ 2086 unix_state_unlock(other); 2087 sock_put(other); 2088 2089 if (!sk_locked) 2090 unix_state_lock(sk); 2091 2092 err = 0; 2093 if (sk->sk_type == SOCK_SEQPACKET) { 2094 /* We are here only when racing with unix_release_sock() 2095 * is clearing @other. Never change state to TCP_CLOSE 2096 * unlike SOCK_DGRAM wants. 2097 */ 2098 unix_state_unlock(sk); 2099 err = -EPIPE; 2100 } else if (unix_peer(sk) == other) { 2101 unix_peer(sk) = NULL; 2102 unix_dgram_peer_wake_disconnect_wakeup(sk, other); 2103 2104 sk->sk_state = TCP_CLOSE; 2105 unix_state_unlock(sk); 2106 2107 unix_dgram_disconnected(sk, other); 2108 sock_put(other); 2109 err = -ECONNREFUSED; 2110 } else { 2111 unix_state_unlock(sk); 2112 } 2113 2114 other = NULL; 2115 if (err) 2116 goto out_free; 2117 goto restart; 2118 } 2119 2120 err = -EPIPE; 2121 if (other->sk_shutdown & RCV_SHUTDOWN) 2122 goto out_unlock; 2123 2124 if (sk->sk_type != SOCK_SEQPACKET) { 2125 err = security_unix_may_send(sk->sk_socket, other->sk_socket); 2126 if (err) 2127 goto out_unlock; 2128 } 2129 2130 /* other == sk && unix_peer(other) != sk if 2131 * - unix_peer(sk) == NULL, destination address bound to sk 2132 * - unix_peer(sk) == sk by time of get but disconnected before lock 2133 */ 2134 if (other != sk && 2135 unlikely(unix_peer(other) != sk && 2136 unix_recvq_full_lockless(other))) { 2137 if (timeo) { 2138 timeo = unix_wait_for_peer(other, timeo); 2139 2140 err = sock_intr_errno(timeo); 2141 if (signal_pending(current)) 2142 goto out_free; 2143 2144 goto restart; 2145 } 2146 2147 if (!sk_locked) { 2148 unix_state_unlock(other); 2149 unix_state_double_lock(sk, other); 2150 } 2151 2152 if (unix_peer(sk) != other || 2153 unix_dgram_peer_wake_me(sk, other)) { 2154 err = -EAGAIN; 2155 sk_locked = 1; 2156 goto out_unlock; 2157 } 2158 2159 if (!sk_locked) { 2160 sk_locked = 1; 2161 goto restart_locked; 2162 } 2163 } 2164 2165 if (unlikely(sk_locked)) 2166 unix_state_unlock(sk); 2167 2168 if (sock_flag(other, SOCK_RCVTSTAMP)) 2169 __net_timestamp(skb); 2170 maybe_add_creds(skb, sock, other); 2171 scm_stat_add(other, skb); 2172 skb_queue_tail(&other->sk_receive_queue, skb); 2173 unix_state_unlock(other); 2174 other->sk_data_ready(other); 2175 sock_put(other); 2176 scm_destroy(&scm); 2177 return len; 2178 2179 out_unlock: 2180 if (sk_locked) 2181 unix_state_unlock(sk); 2182 unix_state_unlock(other); 2183 out_free: 2184 kfree_skb(skb); 2185 out: 2186 if (other) 2187 sock_put(other); 2188 scm_destroy(&scm); 2189 return err; 2190 } 2191 2192 /* We use paged skbs for stream sockets, and limit occupancy to 32768 2193 * bytes, and a minimum of a full page. 
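 * UNIX_SKB_FRAGS_SZ below rounds that cap to whole pages: with 4 KiB pages
 * get_order(32768) is 3, so the frag budget is PAGE_SIZE << 3 = 32 KiB;
 * with 64 KiB pages it is simply one page (illustrative arithmetic for
 * common page sizes).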
2194 */ 2195 #define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768)) 2196 2197 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 2198 static int queue_oob(struct socket *sock, struct msghdr *msg, struct sock *other, 2199 struct scm_cookie *scm, bool fds_sent) 2200 { 2201 struct unix_sock *ousk = unix_sk(other); 2202 struct sk_buff *skb; 2203 int err = 0; 2204 2205 skb = sock_alloc_send_skb(sock->sk, 1, msg->msg_flags & MSG_DONTWAIT, &err); 2206 2207 if (!skb) 2208 return err; 2209 2210 err = unix_scm_to_skb(scm, skb, !fds_sent); 2211 if (err < 0) { 2212 kfree_skb(skb); 2213 return err; 2214 } 2215 skb_put(skb, 1); 2216 err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, 1); 2217 2218 if (err) { 2219 kfree_skb(skb); 2220 return err; 2221 } 2222 2223 unix_state_lock(other); 2224 2225 if (sock_flag(other, SOCK_DEAD) || 2226 (other->sk_shutdown & RCV_SHUTDOWN)) { 2227 unix_state_unlock(other); 2228 kfree_skb(skb); 2229 return -EPIPE; 2230 } 2231 2232 maybe_add_creds(skb, sock, other); 2233 skb_get(skb); 2234 2235 if (ousk->oob_skb) 2236 consume_skb(ousk->oob_skb); 2237 2238 WRITE_ONCE(ousk->oob_skb, skb); 2239 2240 scm_stat_add(other, skb); 2241 skb_queue_tail(&other->sk_receive_queue, skb); 2242 sk_send_sigurg(other); 2243 unix_state_unlock(other); 2244 other->sk_data_ready(other); 2245 2246 return err; 2247 } 2248 #endif 2249 2250 static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, 2251 size_t len) 2252 { 2253 struct sock *sk = sock->sk; 2254 struct sock *other = NULL; 2255 int err, size; 2256 struct sk_buff *skb; 2257 int sent = 0; 2258 struct scm_cookie scm; 2259 bool fds_sent = false; 2260 int data_len; 2261 2262 err = scm_send(sock, msg, &scm, false); 2263 if (err < 0) 2264 return err; 2265 2266 wait_for_unix_gc(scm.fp); 2267 2268 err = -EOPNOTSUPP; 2269 if (msg->msg_flags & MSG_OOB) { 2270 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 2271 if (len) 2272 len--; 2273 else 2274 #endif 2275 goto out_err; 2276 } 2277 2278 if (msg->msg_namelen) { 2279 err = sk->sk_state == TCP_ESTABLISHED ? 
-EISCONN : -EOPNOTSUPP; 2280 goto out_err; 2281 } else { 2282 err = -ENOTCONN; 2283 other = unix_peer(sk); 2284 if (!other) 2285 goto out_err; 2286 } 2287 2288 if (sk->sk_shutdown & SEND_SHUTDOWN) 2289 goto pipe_err; 2290 2291 while (sent < len) { 2292 size = len - sent; 2293 2294 if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) { 2295 skb = sock_alloc_send_pskb(sk, 0, 0, 2296 msg->msg_flags & MSG_DONTWAIT, 2297 &err, 0); 2298 } else { 2299 /* Keep two messages in the pipe so it schedules better */ 2300 size = min_t(int, size, (sk->sk_sndbuf >> 1) - 64); 2301 2302 /* allow fallback to order-0 allocations */ 2303 size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ); 2304 2305 data_len = max_t(int, 0, size - SKB_MAX_HEAD(0)); 2306 2307 data_len = min_t(size_t, size, PAGE_ALIGN(data_len)); 2308 2309 skb = sock_alloc_send_pskb(sk, size - data_len, data_len, 2310 msg->msg_flags & MSG_DONTWAIT, &err, 2311 get_order(UNIX_SKB_FRAGS_SZ)); 2312 } 2313 if (!skb) 2314 goto out_err; 2315 2316 /* Only send the fds in the first buffer */ 2317 err = unix_scm_to_skb(&scm, skb, !fds_sent); 2318 if (err < 0) { 2319 kfree_skb(skb); 2320 goto out_err; 2321 } 2322 fds_sent = true; 2323 2324 if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) { 2325 err = skb_splice_from_iter(skb, &msg->msg_iter, size, 2326 sk->sk_allocation); 2327 if (err < 0) { 2328 kfree_skb(skb); 2329 goto out_err; 2330 } 2331 size = err; 2332 refcount_add(size, &sk->sk_wmem_alloc); 2333 } else { 2334 skb_put(skb, size - data_len); 2335 skb->data_len = data_len; 2336 skb->len = size; 2337 err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size); 2338 if (err) { 2339 kfree_skb(skb); 2340 goto out_err; 2341 } 2342 } 2343 2344 unix_state_lock(other); 2345 2346 if (sock_flag(other, SOCK_DEAD) || 2347 (other->sk_shutdown & RCV_SHUTDOWN)) 2348 goto pipe_err_free; 2349 2350 maybe_add_creds(skb, sock, other); 2351 scm_stat_add(other, skb); 2352 skb_queue_tail(&other->sk_receive_queue, skb); 2353 unix_state_unlock(other); 2354 other->sk_data_ready(other); 2355 sent += size; 2356 } 2357 2358 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 2359 if (msg->msg_flags & MSG_OOB) { 2360 err = queue_oob(sock, msg, other, &scm, fds_sent); 2361 if (err) 2362 goto out_err; 2363 sent++; 2364 } 2365 #endif 2366 2367 scm_destroy(&scm); 2368 2369 return sent; 2370 2371 pipe_err_free: 2372 unix_state_unlock(other); 2373 kfree_skb(skb); 2374 pipe_err: 2375 if (sent == 0 && !(msg->msg_flags&MSG_NOSIGNAL)) 2376 send_sig(SIGPIPE, current, 0); 2377 err = -EPIPE; 2378 out_err: 2379 scm_destroy(&scm); 2380 return sent ? 
: err; 2381 } 2382 2383 static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg, 2384 size_t len) 2385 { 2386 int err; 2387 struct sock *sk = sock->sk; 2388 2389 err = sock_error(sk); 2390 if (err) 2391 return err; 2392 2393 if (sk->sk_state != TCP_ESTABLISHED) 2394 return -ENOTCONN; 2395 2396 if (msg->msg_namelen) 2397 msg->msg_namelen = 0; 2398 2399 return unix_dgram_sendmsg(sock, msg, len); 2400 } 2401 2402 static int unix_seqpacket_recvmsg(struct socket *sock, struct msghdr *msg, 2403 size_t size, int flags) 2404 { 2405 struct sock *sk = sock->sk; 2406 2407 if (sk->sk_state != TCP_ESTABLISHED) 2408 return -ENOTCONN; 2409 2410 return unix_dgram_recvmsg(sock, msg, size, flags); 2411 } 2412 2413 static void unix_copy_addr(struct msghdr *msg, struct sock *sk) 2414 { 2415 struct unix_address *addr = smp_load_acquire(&unix_sk(sk)->addr); 2416 2417 if (addr) { 2418 msg->msg_namelen = addr->len; 2419 memcpy(msg->msg_name, addr->name, addr->len); 2420 } 2421 } 2422 2423 int __unix_dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t size, 2424 int flags) 2425 { 2426 struct scm_cookie scm; 2427 struct socket *sock = sk->sk_socket; 2428 struct unix_sock *u = unix_sk(sk); 2429 struct sk_buff *skb, *last; 2430 long timeo; 2431 int skip; 2432 int err; 2433 2434 err = -EOPNOTSUPP; 2435 if (flags&MSG_OOB) 2436 goto out; 2437 2438 timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); 2439 2440 do { 2441 mutex_lock(&u->iolock); 2442 2443 skip = sk_peek_offset(sk, flags); 2444 skb = __skb_try_recv_datagram(sk, &sk->sk_receive_queue, flags, 2445 &skip, &err, &last); 2446 if (skb) { 2447 if (!(flags & MSG_PEEK)) 2448 scm_stat_del(sk, skb); 2449 break; 2450 } 2451 2452 mutex_unlock(&u->iolock); 2453 2454 if (err != -EAGAIN) 2455 break; 2456 } while (timeo && 2457 !__skb_wait_for_more_packets(sk, &sk->sk_receive_queue, 2458 &err, &timeo, last)); 2459 2460 if (!skb) { /* implies iolock unlocked */ 2461 unix_state_lock(sk); 2462 /* Signal EOF on disconnected non-blocking SEQPACKET socket. */ 2463 if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN && 2464 (sk->sk_shutdown & RCV_SHUTDOWN)) 2465 err = 0; 2466 unix_state_unlock(sk); 2467 goto out; 2468 } 2469 2470 if (wq_has_sleeper(&u->peer_wait)) 2471 wake_up_interruptible_sync_poll(&u->peer_wait, 2472 EPOLLOUT | EPOLLWRNORM | 2473 EPOLLWRBAND); 2474 2475 if (msg->msg_name) { 2476 unix_copy_addr(msg, skb->sk); 2477 2478 BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk, 2479 msg->msg_name, 2480 &msg->msg_namelen); 2481 } 2482 2483 if (size > skb->len - skip) 2484 size = skb->len - skip; 2485 else if (size < skb->len - skip) 2486 msg->msg_flags |= MSG_TRUNC; 2487 2488 err = skb_copy_datagram_msg(skb, skip, msg, size); 2489 if (err) 2490 goto out_free; 2491 2492 if (sock_flag(sk, SOCK_RCVTSTAMP)) 2493 __sock_recv_timestamp(msg, sk, skb); 2494 2495 memset(&scm, 0, sizeof(scm)); 2496 2497 scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid); 2498 unix_set_secdata(&scm, skb); 2499 2500 if (!(flags & MSG_PEEK)) { 2501 if (UNIXCB(skb).fp) 2502 unix_detach_fds(&scm, skb); 2503 2504 sk_peek_offset_bwd(sk, skb->len); 2505 } else { 2506 /* It is questionable: on PEEK we could: 2507 - do not return fds - good, but too simple 8) 2508 - return fds, and do not return them on read (old strategy, 2509 apparently wrong) 2510 - clone fds (I chose it for now, it is the most universal 2511 solution) 2512 2513 POSIX 1003.1g does not actually define this clearly 2514 at all. POSIX 1003.1g doesn't define a lot of things 2515 clearly however! 
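
	   ("Clone" here means unix_peek_fds() hands the peeking reader
	    duplicated file references while the originals stay queued, so a
	    later non-PEEK read still receives its own descriptors.)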
2516 2517 */ 2518 2519 sk_peek_offset_fwd(sk, size); 2520 2521 if (UNIXCB(skb).fp) 2522 unix_peek_fds(&scm, skb); 2523 } 2524 err = (flags & MSG_TRUNC) ? skb->len - skip : size; 2525 2526 scm_recv_unix(sock, msg, &scm, flags); 2527 2528 out_free: 2529 skb_free_datagram(sk, skb); 2530 mutex_unlock(&u->iolock); 2531 out: 2532 return err; 2533 } 2534 2535 static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, 2536 int flags) 2537 { 2538 struct sock *sk = sock->sk; 2539 2540 #ifdef CONFIG_BPF_SYSCALL 2541 const struct proto *prot = READ_ONCE(sk->sk_prot); 2542 2543 if (prot != &unix_dgram_proto) 2544 return prot->recvmsg(sk, msg, size, flags, NULL); 2545 #endif 2546 return __unix_dgram_recvmsg(sk, msg, size, flags); 2547 } 2548 2549 static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor) 2550 { 2551 struct unix_sock *u = unix_sk(sk); 2552 struct sk_buff *skb; 2553 int err; 2554 2555 mutex_lock(&u->iolock); 2556 skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err); 2557 mutex_unlock(&u->iolock); 2558 if (!skb) 2559 return err; 2560 2561 return recv_actor(sk, skb); 2562 } 2563 2564 /* 2565 * Sleep until more data has arrived. But check for races.. 2566 */ 2567 static long unix_stream_data_wait(struct sock *sk, long timeo, 2568 struct sk_buff *last, unsigned int last_len, 2569 bool freezable) 2570 { 2571 unsigned int state = TASK_INTERRUPTIBLE | freezable * TASK_FREEZABLE; 2572 struct sk_buff *tail; 2573 DEFINE_WAIT(wait); 2574 2575 unix_state_lock(sk); 2576 2577 for (;;) { 2578 prepare_to_wait(sk_sleep(sk), &wait, state); 2579 2580 tail = skb_peek_tail(&sk->sk_receive_queue); 2581 if (tail != last || 2582 (tail && tail->len != last_len) || 2583 sk->sk_err || 2584 (sk->sk_shutdown & RCV_SHUTDOWN) || 2585 signal_pending(current) || 2586 !timeo) 2587 break; 2588 2589 sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); 2590 unix_state_unlock(sk); 2591 timeo = schedule_timeout(timeo); 2592 unix_state_lock(sk); 2593 2594 if (sock_flag(sk, SOCK_DEAD)) 2595 break; 2596 2597 sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); 2598 } 2599 2600 finish_wait(sk_sleep(sk), &wait); 2601 unix_state_unlock(sk); 2602 return timeo; 2603 } 2604 2605 static unsigned int unix_skb_len(const struct sk_buff *skb) 2606 { 2607 return skb->len - UNIXCB(skb).consumed; 2608 } 2609 2610 struct unix_stream_read_state { 2611 int (*recv_actor)(struct sk_buff *, int, int, 2612 struct unix_stream_read_state *); 2613 struct socket *socket; 2614 struct msghdr *msg; 2615 struct pipe_inode_info *pipe; 2616 size_t size; 2617 int flags; 2618 unsigned int splice_flags; 2619 }; 2620 2621 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 2622 static int unix_stream_recv_urg(struct unix_stream_read_state *state) 2623 { 2624 struct socket *sock = state->socket; 2625 struct sock *sk = sock->sk; 2626 struct unix_sock *u = unix_sk(sk); 2627 int chunk = 1; 2628 struct sk_buff *oob_skb; 2629 2630 mutex_lock(&u->iolock); 2631 unix_state_lock(sk); 2632 2633 if (sock_flag(sk, SOCK_URGINLINE) || !u->oob_skb) { 2634 unix_state_unlock(sk); 2635 mutex_unlock(&u->iolock); 2636 return -EINVAL; 2637 } 2638 2639 oob_skb = u->oob_skb; 2640 2641 if (!(state->flags & MSG_PEEK)) 2642 WRITE_ONCE(u->oob_skb, NULL); 2643 else 2644 skb_get(oob_skb); 2645 unix_state_unlock(sk); 2646 2647 chunk = state->recv_actor(oob_skb, 0, chunk, state); 2648 2649 if (!(state->flags & MSG_PEEK)) 2650 UNIXCB(oob_skb).consumed += 1; 2651 2652 consume_skb(oob_skb); 2653 2654 mutex_unlock(&u->iolock); 2655 2656 if (chunk < 0) 2657 return -EFAULT; 2658 2659 state->msg->msg_flags |= 
MSG_OOB; 2660 return 1; 2661 } 2662 2663 static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk, 2664 int flags, int copied) 2665 { 2666 struct unix_sock *u = unix_sk(sk); 2667 2668 if (!unix_skb_len(skb) && !(flags & MSG_PEEK)) { 2669 skb_unlink(skb, &sk->sk_receive_queue); 2670 consume_skb(skb); 2671 skb = NULL; 2672 } else { 2673 if (skb == u->oob_skb) { 2674 if (copied) { 2675 skb = NULL; 2676 } else if (sock_flag(sk, SOCK_URGINLINE)) { 2677 if (!(flags & MSG_PEEK)) { 2678 WRITE_ONCE(u->oob_skb, NULL); 2679 consume_skb(skb); 2680 } 2681 } else if (!(flags & MSG_PEEK)) { 2682 skb_unlink(skb, &sk->sk_receive_queue); 2683 consume_skb(skb); 2684 skb = skb_peek(&sk->sk_receive_queue); 2685 } 2686 } 2687 } 2688 return skb; 2689 } 2690 #endif 2691 2692 static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor) 2693 { 2694 if (unlikely(sk->sk_state != TCP_ESTABLISHED)) 2695 return -ENOTCONN; 2696 2697 return unix_read_skb(sk, recv_actor); 2698 } 2699 2700 static int unix_stream_read_generic(struct unix_stream_read_state *state, 2701 bool freezable) 2702 { 2703 struct scm_cookie scm; 2704 struct socket *sock = state->socket; 2705 struct sock *sk = sock->sk; 2706 struct unix_sock *u = unix_sk(sk); 2707 int copied = 0; 2708 int flags = state->flags; 2709 int noblock = flags & MSG_DONTWAIT; 2710 bool check_creds = false; 2711 int target; 2712 int err = 0; 2713 long timeo; 2714 int skip; 2715 size_t size = state->size; 2716 unsigned int last_len; 2717 2718 if (unlikely(sk->sk_state != TCP_ESTABLISHED)) { 2719 err = -EINVAL; 2720 goto out; 2721 } 2722 2723 if (unlikely(flags & MSG_OOB)) { 2724 err = -EOPNOTSUPP; 2725 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 2726 err = unix_stream_recv_urg(state); 2727 #endif 2728 goto out; 2729 } 2730 2731 target = sock_rcvlowat(sk, flags & MSG_WAITALL, size); 2732 timeo = sock_rcvtimeo(sk, noblock); 2733 2734 memset(&scm, 0, sizeof(scm)); 2735 2736 /* Lock the socket to prevent queue disordering 2737 * while sleeps in memcpy_tomsg 2738 */ 2739 mutex_lock(&u->iolock); 2740 2741 skip = max(sk_peek_offset(sk, flags), 0); 2742 2743 do { 2744 int chunk; 2745 bool drop_skb; 2746 struct sk_buff *skb, *last; 2747 2748 redo: 2749 unix_state_lock(sk); 2750 if (sock_flag(sk, SOCK_DEAD)) { 2751 err = -ECONNRESET; 2752 goto unlock; 2753 } 2754 last = skb = skb_peek(&sk->sk_receive_queue); 2755 last_len = last ? last->len : 0; 2756 2757 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 2758 if (skb) { 2759 skb = manage_oob(skb, sk, flags, copied); 2760 if (!skb) { 2761 unix_state_unlock(sk); 2762 if (copied) 2763 break; 2764 goto redo; 2765 } 2766 } 2767 #endif 2768 again: 2769 if (skb == NULL) { 2770 if (copied >= target) 2771 goto unlock; 2772 2773 /* 2774 * POSIX 1003.1g mandates this order. 
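	 * That is: report any pending socket error first, then signal EOF
	 * once the receive side has been shut down, and only after that
	 * block or return -EAGAIN for a non-blocking read.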
2775 */ 2776 2777 err = sock_error(sk); 2778 if (err) 2779 goto unlock; 2780 if (sk->sk_shutdown & RCV_SHUTDOWN) 2781 goto unlock; 2782 2783 unix_state_unlock(sk); 2784 if (!timeo) { 2785 err = -EAGAIN; 2786 break; 2787 } 2788 2789 mutex_unlock(&u->iolock); 2790 2791 timeo = unix_stream_data_wait(sk, timeo, last, 2792 last_len, freezable); 2793 2794 if (signal_pending(current)) { 2795 err = sock_intr_errno(timeo); 2796 scm_destroy(&scm); 2797 goto out; 2798 } 2799 2800 mutex_lock(&u->iolock); 2801 goto redo; 2802 unlock: 2803 unix_state_unlock(sk); 2804 break; 2805 } 2806 2807 while (skip >= unix_skb_len(skb)) { 2808 skip -= unix_skb_len(skb); 2809 last = skb; 2810 last_len = skb->len; 2811 skb = skb_peek_next(skb, &sk->sk_receive_queue); 2812 if (!skb) 2813 goto again; 2814 } 2815 2816 unix_state_unlock(sk); 2817 2818 if (check_creds) { 2819 /* Never glue messages from different writers */ 2820 if (!unix_skb_scm_eq(skb, &scm)) 2821 break; 2822 } else if (test_bit(SOCK_PASSCRED, &sock->flags) || 2823 test_bit(SOCK_PASSPIDFD, &sock->flags)) { 2824 /* Copy credentials */ 2825 scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid); 2826 unix_set_secdata(&scm, skb); 2827 check_creds = true; 2828 } 2829 2830 /* Copy address just once */ 2831 if (state->msg && state->msg->msg_name) { 2832 DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, 2833 state->msg->msg_name); 2834 unix_copy_addr(state->msg, skb->sk); 2835 2836 BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk, 2837 state->msg->msg_name, 2838 &state->msg->msg_namelen); 2839 2840 sunaddr = NULL; 2841 } 2842 2843 chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size); 2844 skb_get(skb); 2845 chunk = state->recv_actor(skb, skip, chunk, state); 2846 drop_skb = !unix_skb_len(skb); 2847 /* skb is only safe to use if !drop_skb */ 2848 consume_skb(skb); 2849 if (chunk < 0) { 2850 if (copied == 0) 2851 copied = -EFAULT; 2852 break; 2853 } 2854 copied += chunk; 2855 size -= chunk; 2856 2857 if (drop_skb) { 2858 /* the skb was touched by a concurrent reader; 2859 * we should not expect anything from this skb 2860 * anymore and assume it invalid - we can be 2861 * sure it was dropped from the socket queue 2862 * 2863 * let's report a short read 2864 */ 2865 err = 0; 2866 break; 2867 } 2868 2869 /* Mark read part of skb as used */ 2870 if (!(flags & MSG_PEEK)) { 2871 UNIXCB(skb).consumed += chunk; 2872 2873 sk_peek_offset_bwd(sk, chunk); 2874 2875 if (UNIXCB(skb).fp) { 2876 scm_stat_del(sk, skb); 2877 unix_detach_fds(&scm, skb); 2878 } 2879 2880 if (unix_skb_len(skb)) 2881 break; 2882 2883 skb_unlink(skb, &sk->sk_receive_queue); 2884 consume_skb(skb); 2885 2886 if (scm.fp) 2887 break; 2888 } else { 2889 /* It is questionable, see note in unix_dgram_recvmsg. 2890 */ 2891 if (UNIXCB(skb).fp) 2892 unix_peek_fds(&scm, skb); 2893 2894 sk_peek_offset_fwd(sk, chunk); 2895 2896 if (UNIXCB(skb).fp) 2897 break; 2898 2899 skip = 0; 2900 last = skb; 2901 last_len = skb->len; 2902 unix_state_lock(sk); 2903 skb = skb_peek_next(skb, &sk->sk_receive_queue); 2904 if (skb) 2905 goto again; 2906 unix_state_unlock(sk); 2907 break; 2908 } 2909 } while (size); 2910 2911 mutex_unlock(&u->iolock); 2912 if (state->msg) 2913 scm_recv_unix(sock, state->msg, &scm, flags); 2914 else 2915 scm_destroy(&scm); 2916 out: 2917 return copied ? 
: err; 2918 } 2919 2920 static int unix_stream_read_actor(struct sk_buff *skb, 2921 int skip, int chunk, 2922 struct unix_stream_read_state *state) 2923 { 2924 int ret; 2925 2926 ret = skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip, 2927 state->msg, chunk); 2928 return ret ?: chunk; 2929 } 2930 2931 int __unix_stream_recvmsg(struct sock *sk, struct msghdr *msg, 2932 size_t size, int flags) 2933 { 2934 struct unix_stream_read_state state = { 2935 .recv_actor = unix_stream_read_actor, 2936 .socket = sk->sk_socket, 2937 .msg = msg, 2938 .size = size, 2939 .flags = flags 2940 }; 2941 2942 return unix_stream_read_generic(&state, true); 2943 } 2944 2945 static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg, 2946 size_t size, int flags) 2947 { 2948 struct unix_stream_read_state state = { 2949 .recv_actor = unix_stream_read_actor, 2950 .socket = sock, 2951 .msg = msg, 2952 .size = size, 2953 .flags = flags 2954 }; 2955 2956 #ifdef CONFIG_BPF_SYSCALL 2957 struct sock *sk = sock->sk; 2958 const struct proto *prot = READ_ONCE(sk->sk_prot); 2959 2960 if (prot != &unix_stream_proto) 2961 return prot->recvmsg(sk, msg, size, flags, NULL); 2962 #endif 2963 return unix_stream_read_generic(&state, true); 2964 } 2965 2966 static int unix_stream_splice_actor(struct sk_buff *skb, 2967 int skip, int chunk, 2968 struct unix_stream_read_state *state) 2969 { 2970 return skb_splice_bits(skb, state->socket->sk, 2971 UNIXCB(skb).consumed + skip, 2972 state->pipe, chunk, state->splice_flags); 2973 } 2974 2975 static ssize_t unix_stream_splice_read(struct socket *sock, loff_t *ppos, 2976 struct pipe_inode_info *pipe, 2977 size_t size, unsigned int flags) 2978 { 2979 struct unix_stream_read_state state = { 2980 .recv_actor = unix_stream_splice_actor, 2981 .socket = sock, 2982 .pipe = pipe, 2983 .size = size, 2984 .splice_flags = flags, 2985 }; 2986 2987 if (unlikely(*ppos)) 2988 return -ESPIPE; 2989 2990 if (sock->file->f_flags & O_NONBLOCK || 2991 flags & SPLICE_F_NONBLOCK) 2992 state.flags = MSG_DONTWAIT; 2993 2994 return unix_stream_read_generic(&state, false); 2995 } 2996 2997 static int unix_shutdown(struct socket *sock, int mode) 2998 { 2999 struct sock *sk = sock->sk; 3000 struct sock *other; 3001 3002 if (mode < SHUT_RD || mode > SHUT_RDWR) 3003 return -EINVAL; 3004 /* This maps: 3005 * SHUT_RD (0) -> RCV_SHUTDOWN (1) 3006 * SHUT_WR (1) -> SEND_SHUTDOWN (2) 3007 * SHUT_RDWR (2) -> SHUTDOWN_MASK (3) 3008 */ 3009 ++mode; 3010 3011 unix_state_lock(sk); 3012 WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | mode); 3013 other = unix_peer(sk); 3014 if (other) 3015 sock_hold(other); 3016 unix_state_unlock(sk); 3017 sk->sk_state_change(sk); 3018 3019 if (other && 3020 (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) { 3021 3022 int peer_mode = 0; 3023 const struct proto *prot = READ_ONCE(other->sk_prot); 3024 3025 if (prot->unhash) 3026 prot->unhash(other); 3027 if (mode&RCV_SHUTDOWN) 3028 peer_mode |= SEND_SHUTDOWN; 3029 if (mode&SEND_SHUTDOWN) 3030 peer_mode |= RCV_SHUTDOWN; 3031 unix_state_lock(other); 3032 WRITE_ONCE(other->sk_shutdown, other->sk_shutdown | peer_mode); 3033 unix_state_unlock(other); 3034 other->sk_state_change(other); 3035 if (peer_mode == SHUTDOWN_MASK) 3036 sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP); 3037 else if (peer_mode & RCV_SHUTDOWN) 3038 sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN); 3039 } 3040 if (other) 3041 sock_put(other); 3042 3043 return 0; 3044 } 3045 3046 long unix_inq_len(struct sock *sk) 3047 { 3048 struct sk_buff *skb; 3049 long amount = 
0; 3050 3051 if (sk->sk_state == TCP_LISTEN) 3052 return -EINVAL; 3053 3054 spin_lock(&sk->sk_receive_queue.lock); 3055 if (sk->sk_type == SOCK_STREAM || 3056 sk->sk_type == SOCK_SEQPACKET) { 3057 skb_queue_walk(&sk->sk_receive_queue, skb) 3058 amount += unix_skb_len(skb); 3059 } else { 3060 skb = skb_peek(&sk->sk_receive_queue); 3061 if (skb) 3062 amount = skb->len; 3063 } 3064 spin_unlock(&sk->sk_receive_queue.lock); 3065 3066 return amount; 3067 } 3068 EXPORT_SYMBOL_GPL(unix_inq_len); 3069 3070 long unix_outq_len(struct sock *sk) 3071 { 3072 return sk_wmem_alloc_get(sk); 3073 } 3074 EXPORT_SYMBOL_GPL(unix_outq_len); 3075 3076 static int unix_open_file(struct sock *sk) 3077 { 3078 struct path path; 3079 struct file *f; 3080 int fd; 3081 3082 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) 3083 return -EPERM; 3084 3085 if (!smp_load_acquire(&unix_sk(sk)->addr)) 3086 return -ENOENT; 3087 3088 path = unix_sk(sk)->path; 3089 if (!path.dentry) 3090 return -ENOENT; 3091 3092 path_get(&path); 3093 3094 fd = get_unused_fd_flags(O_CLOEXEC); 3095 if (fd < 0) 3096 goto out; 3097 3098 f = dentry_open(&path, O_PATH, current_cred()); 3099 if (IS_ERR(f)) { 3100 put_unused_fd(fd); 3101 fd = PTR_ERR(f); 3102 goto out; 3103 } 3104 3105 fd_install(fd, f); 3106 out: 3107 path_put(&path); 3108 3109 return fd; 3110 } 3111 3112 static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) 3113 { 3114 struct sock *sk = sock->sk; 3115 long amount = 0; 3116 int err; 3117 3118 switch (cmd) { 3119 case SIOCOUTQ: 3120 amount = unix_outq_len(sk); 3121 err = put_user(amount, (int __user *)arg); 3122 break; 3123 case SIOCINQ: 3124 amount = unix_inq_len(sk); 3125 if (amount < 0) 3126 err = amount; 3127 else 3128 err = put_user(amount, (int __user *)arg); 3129 break; 3130 case SIOCUNIXFILE: 3131 err = unix_open_file(sk); 3132 break; 3133 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 3134 case SIOCATMARK: 3135 { 3136 struct sk_buff *skb; 3137 int answ = 0; 3138 3139 skb = skb_peek(&sk->sk_receive_queue); 3140 if (skb && skb == READ_ONCE(unix_sk(sk)->oob_skb)) 3141 answ = 1; 3142 err = put_user(answ, (int __user *)arg); 3143 } 3144 break; 3145 #endif 3146 default: 3147 err = -ENOIOCTLCMD; 3148 break; 3149 } 3150 return err; 3151 } 3152 3153 #ifdef CONFIG_COMPAT 3154 static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) 3155 { 3156 return unix_ioctl(sock, cmd, (unsigned long)compat_ptr(arg)); 3157 } 3158 #endif 3159 3160 static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wait) 3161 { 3162 struct sock *sk = sock->sk; 3163 __poll_t mask; 3164 u8 shutdown; 3165 3166 sock_poll_wait(file, sock, wait); 3167 mask = 0; 3168 shutdown = READ_ONCE(sk->sk_shutdown); 3169 3170 /* exceptional events? */ 3171 if (READ_ONCE(sk->sk_err)) 3172 mask |= EPOLLERR; 3173 if (shutdown == SHUTDOWN_MASK) 3174 mask |= EPOLLHUP; 3175 if (shutdown & RCV_SHUTDOWN) 3176 mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM; 3177 3178 /* readable? 
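	 * Either plain skbs sitting on the receive queue, or data reported
	 * by sk_is_readable(), e.g. when a BPF sockmap psock is attached.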
	 */
	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
		mask |= EPOLLIN | EPOLLRDNORM;
	if (sk_is_readable(sk))
		mask |= EPOLLIN | EPOLLRDNORM;
#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
	if (READ_ONCE(unix_sk(sk)->oob_skb))
		mask |= EPOLLPRI;
#endif

	/* Connection-based need to check for termination and startup */
	if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) &&
	    sk->sk_state == TCP_CLOSE)
		mask |= EPOLLHUP;

	/*
	 * we set writable also when the other side has shut down the
	 * connection. This prevents stuck sockets.
	 */
	if (unix_writable(sk))
		mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;

	return mask;
}

static __poll_t unix_dgram_poll(struct file *file, struct socket *sock,
				poll_table *wait)
{
	struct sock *sk = sock->sk, *other;
	unsigned int writable;
	__poll_t mask;
	u8 shutdown;

	sock_poll_wait(file, sock, wait);
	mask = 0;
	shutdown = READ_ONCE(sk->sk_shutdown);

	/* exceptional events? */
	if (READ_ONCE(sk->sk_err) ||
	    !skb_queue_empty_lockless(&sk->sk_error_queue))
		mask |= EPOLLERR |
			(sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0);

	if (shutdown & RCV_SHUTDOWN)
		mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
	if (shutdown == SHUTDOWN_MASK)
		mask |= EPOLLHUP;

	/* readable? */
	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
		mask |= EPOLLIN | EPOLLRDNORM;
	if (sk_is_readable(sk))
		mask |= EPOLLIN | EPOLLRDNORM;

	/* Connection-based need to check for termination and startup */
	if (sk->sk_type == SOCK_SEQPACKET) {
		if (sk->sk_state == TCP_CLOSE)
			mask |= EPOLLHUP;
		/* connection hasn't started yet? */
		if (sk->sk_state == TCP_SYN_SENT)
			return mask;
	}

	/* No write status requested, avoid expensive OUT tests.
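	 * For a connected datagram socket the writability check below also
	 * looks at the peer's receive queue and unix_dgram_peer_wake_me(),
	 * which is the comparatively expensive part skipped here.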
*/ 3242 if (!(poll_requested_events(wait) & (EPOLLWRBAND|EPOLLWRNORM|EPOLLOUT))) 3243 return mask; 3244 3245 writable = unix_writable(sk); 3246 if (writable) { 3247 unix_state_lock(sk); 3248 3249 other = unix_peer(sk); 3250 if (other && unix_peer(other) != sk && 3251 unix_recvq_full_lockless(other) && 3252 unix_dgram_peer_wake_me(sk, other)) 3253 writable = 0; 3254 3255 unix_state_unlock(sk); 3256 } 3257 3258 if (writable) 3259 mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND; 3260 else 3261 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); 3262 3263 return mask; 3264 } 3265 3266 #ifdef CONFIG_PROC_FS 3267 3268 #define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1) 3269 3270 #define get_bucket(x) ((x) >> BUCKET_SPACE) 3271 #define get_offset(x) ((x) & ((1UL << BUCKET_SPACE) - 1)) 3272 #define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o)) 3273 3274 static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos) 3275 { 3276 unsigned long offset = get_offset(*pos); 3277 unsigned long bucket = get_bucket(*pos); 3278 unsigned long count = 0; 3279 struct sock *sk; 3280 3281 for (sk = sk_head(&seq_file_net(seq)->unx.table.buckets[bucket]); 3282 sk; sk = sk_next(sk)) { 3283 if (++count == offset) 3284 break; 3285 } 3286 3287 return sk; 3288 } 3289 3290 static struct sock *unix_get_first(struct seq_file *seq, loff_t *pos) 3291 { 3292 unsigned long bucket = get_bucket(*pos); 3293 struct net *net = seq_file_net(seq); 3294 struct sock *sk; 3295 3296 while (bucket < UNIX_HASH_SIZE) { 3297 spin_lock(&net->unx.table.locks[bucket]); 3298 3299 sk = unix_from_bucket(seq, pos); 3300 if (sk) 3301 return sk; 3302 3303 spin_unlock(&net->unx.table.locks[bucket]); 3304 3305 *pos = set_bucket_offset(++bucket, 1); 3306 } 3307 3308 return NULL; 3309 } 3310 3311 static struct sock *unix_get_next(struct seq_file *seq, struct sock *sk, 3312 loff_t *pos) 3313 { 3314 unsigned long bucket = get_bucket(*pos); 3315 3316 sk = sk_next(sk); 3317 if (sk) 3318 return sk; 3319 3320 3321 spin_unlock(&seq_file_net(seq)->unx.table.locks[bucket]); 3322 3323 *pos = set_bucket_offset(++bucket, 1); 3324 3325 return unix_get_first(seq, pos); 3326 } 3327 3328 static void *unix_seq_start(struct seq_file *seq, loff_t *pos) 3329 { 3330 if (!*pos) 3331 return SEQ_START_TOKEN; 3332 3333 return unix_get_first(seq, pos); 3334 } 3335 3336 static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos) 3337 { 3338 ++*pos; 3339 3340 if (v == SEQ_START_TOKEN) 3341 return unix_get_first(seq, pos); 3342 3343 return unix_get_next(seq, v, pos); 3344 } 3345 3346 static void unix_seq_stop(struct seq_file *seq, void *v) 3347 { 3348 struct sock *sk = v; 3349 3350 if (sk) 3351 spin_unlock(&seq_file_net(seq)->unx.table.locks[sk->sk_hash]); 3352 } 3353 3354 static int unix_seq_show(struct seq_file *seq, void *v) 3355 { 3356 3357 if (v == SEQ_START_TOKEN) 3358 seq_puts(seq, "Num RefCount Protocol Flags Type St " 3359 "Inode Path\n"); 3360 else { 3361 struct sock *s = v; 3362 struct unix_sock *u = unix_sk(s); 3363 unix_state_lock(s); 3364 3365 seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu", 3366 s, 3367 refcount_read(&s->sk_refcnt), 3368 0, 3369 s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0, 3370 s->sk_type, 3371 s->sk_socket ? 3372 (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) : 3373 (s->sk_state == TCP_ESTABLISHED ? 
SS_CONNECTING : SS_DISCONNECTING), 3374 sock_i_ino(s)); 3375 3376 if (u->addr) { // under a hash table lock here 3377 int i, len; 3378 seq_putc(seq, ' '); 3379 3380 i = 0; 3381 len = u->addr->len - 3382 offsetof(struct sockaddr_un, sun_path); 3383 if (u->addr->name->sun_path[0]) { 3384 len--; 3385 } else { 3386 seq_putc(seq, '@'); 3387 i++; 3388 } 3389 for ( ; i < len; i++) 3390 seq_putc(seq, u->addr->name->sun_path[i] ?: 3391 '@'); 3392 } 3393 unix_state_unlock(s); 3394 seq_putc(seq, '\n'); 3395 } 3396 3397 return 0; 3398 } 3399 3400 static const struct seq_operations unix_seq_ops = { 3401 .start = unix_seq_start, 3402 .next = unix_seq_next, 3403 .stop = unix_seq_stop, 3404 .show = unix_seq_show, 3405 }; 3406 3407 #ifdef CONFIG_BPF_SYSCALL 3408 struct bpf_unix_iter_state { 3409 struct seq_net_private p; 3410 unsigned int cur_sk; 3411 unsigned int end_sk; 3412 unsigned int max_sk; 3413 struct sock **batch; 3414 bool st_bucket_done; 3415 }; 3416 3417 struct bpf_iter__unix { 3418 __bpf_md_ptr(struct bpf_iter_meta *, meta); 3419 __bpf_md_ptr(struct unix_sock *, unix_sk); 3420 uid_t uid __aligned(8); 3421 }; 3422 3423 static int unix_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta, 3424 struct unix_sock *unix_sk, uid_t uid) 3425 { 3426 struct bpf_iter__unix ctx; 3427 3428 meta->seq_num--; /* skip SEQ_START_TOKEN */ 3429 ctx.meta = meta; 3430 ctx.unix_sk = unix_sk; 3431 ctx.uid = uid; 3432 return bpf_iter_run_prog(prog, &ctx); 3433 } 3434 3435 static int bpf_iter_unix_hold_batch(struct seq_file *seq, struct sock *start_sk) 3436 3437 { 3438 struct bpf_unix_iter_state *iter = seq->private; 3439 unsigned int expected = 1; 3440 struct sock *sk; 3441 3442 sock_hold(start_sk); 3443 iter->batch[iter->end_sk++] = start_sk; 3444 3445 for (sk = sk_next(start_sk); sk; sk = sk_next(sk)) { 3446 if (iter->end_sk < iter->max_sk) { 3447 sock_hold(sk); 3448 iter->batch[iter->end_sk++] = sk; 3449 } 3450 3451 expected++; 3452 } 3453 3454 spin_unlock(&seq_file_net(seq)->unx.table.locks[start_sk->sk_hash]); 3455 3456 return expected; 3457 } 3458 3459 static void bpf_iter_unix_put_batch(struct bpf_unix_iter_state *iter) 3460 { 3461 while (iter->cur_sk < iter->end_sk) 3462 sock_put(iter->batch[iter->cur_sk++]); 3463 } 3464 3465 static int bpf_iter_unix_realloc_batch(struct bpf_unix_iter_state *iter, 3466 unsigned int new_batch_sz) 3467 { 3468 struct sock **new_batch; 3469 3470 new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz, 3471 GFP_USER | __GFP_NOWARN); 3472 if (!new_batch) 3473 return -ENOMEM; 3474 3475 bpf_iter_unix_put_batch(iter); 3476 kvfree(iter->batch); 3477 iter->batch = new_batch; 3478 iter->max_sk = new_batch_sz; 3479 3480 return 0; 3481 } 3482 3483 static struct sock *bpf_iter_unix_batch(struct seq_file *seq, 3484 loff_t *pos) 3485 { 3486 struct bpf_unix_iter_state *iter = seq->private; 3487 unsigned int expected; 3488 bool resized = false; 3489 struct sock *sk; 3490 3491 if (iter->st_bucket_done) 3492 *pos = set_bucket_offset(get_bucket(*pos) + 1, 1); 3493 3494 again: 3495 /* Get a new batch */ 3496 iter->cur_sk = 0; 3497 iter->end_sk = 0; 3498 3499 sk = unix_get_first(seq, pos); 3500 if (!sk) 3501 return NULL; /* Done */ 3502 3503 expected = bpf_iter_unix_hold_batch(seq, sk); 3504 3505 if (iter->end_sk == expected) { 3506 iter->st_bucket_done = true; 3507 return sk; 3508 } 3509 3510 if (!resized && !bpf_iter_unix_realloc_batch(iter, expected * 3 / 2)) { 3511 resized = true; 3512 goto again; 3513 } 3514 3515 return sk; 3516 } 3517 3518 static void *bpf_iter_unix_seq_start(struct 
seq_file *seq, loff_t *pos) 3519 { 3520 if (!*pos) 3521 return SEQ_START_TOKEN; 3522 3523 /* bpf iter does not support lseek, so it always 3524 * continue from where it was stop()-ped. 3525 */ 3526 return bpf_iter_unix_batch(seq, pos); 3527 } 3528 3529 static void *bpf_iter_unix_seq_next(struct seq_file *seq, void *v, loff_t *pos) 3530 { 3531 struct bpf_unix_iter_state *iter = seq->private; 3532 struct sock *sk; 3533 3534 /* Whenever seq_next() is called, the iter->cur_sk is 3535 * done with seq_show(), so advance to the next sk in 3536 * the batch. 3537 */ 3538 if (iter->cur_sk < iter->end_sk) 3539 sock_put(iter->batch[iter->cur_sk++]); 3540 3541 ++*pos; 3542 3543 if (iter->cur_sk < iter->end_sk) 3544 sk = iter->batch[iter->cur_sk]; 3545 else 3546 sk = bpf_iter_unix_batch(seq, pos); 3547 3548 return sk; 3549 } 3550 3551 static int bpf_iter_unix_seq_show(struct seq_file *seq, void *v) 3552 { 3553 struct bpf_iter_meta meta; 3554 struct bpf_prog *prog; 3555 struct sock *sk = v; 3556 uid_t uid; 3557 bool slow; 3558 int ret; 3559 3560 if (v == SEQ_START_TOKEN) 3561 return 0; 3562 3563 slow = lock_sock_fast(sk); 3564 3565 if (unlikely(sk_unhashed(sk))) { 3566 ret = SEQ_SKIP; 3567 goto unlock; 3568 } 3569 3570 uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk)); 3571 meta.seq = seq; 3572 prog = bpf_iter_get_info(&meta, false); 3573 ret = unix_prog_seq_show(prog, &meta, v, uid); 3574 unlock: 3575 unlock_sock_fast(sk, slow); 3576 return ret; 3577 } 3578 3579 static void bpf_iter_unix_seq_stop(struct seq_file *seq, void *v) 3580 { 3581 struct bpf_unix_iter_state *iter = seq->private; 3582 struct bpf_iter_meta meta; 3583 struct bpf_prog *prog; 3584 3585 if (!v) { 3586 meta.seq = seq; 3587 prog = bpf_iter_get_info(&meta, true); 3588 if (prog) 3589 (void)unix_prog_seq_show(prog, &meta, v, 0); 3590 } 3591 3592 if (iter->cur_sk < iter->end_sk) 3593 bpf_iter_unix_put_batch(iter); 3594 } 3595 3596 static const struct seq_operations bpf_iter_unix_seq_ops = { 3597 .start = bpf_iter_unix_seq_start, 3598 .next = bpf_iter_unix_seq_next, 3599 .stop = bpf_iter_unix_seq_stop, 3600 .show = bpf_iter_unix_seq_show, 3601 }; 3602 #endif 3603 #endif 3604 3605 static const struct net_proto_family unix_family_ops = { 3606 .family = PF_UNIX, 3607 .create = unix_create, 3608 .owner = THIS_MODULE, 3609 }; 3610 3611 3612 static int __net_init unix_net_init(struct net *net) 3613 { 3614 int i; 3615 3616 net->unx.sysctl_max_dgram_qlen = 10; 3617 if (unix_sysctl_register(net)) 3618 goto out; 3619 3620 #ifdef CONFIG_PROC_FS 3621 if (!proc_create_net("unix", 0, net->proc_net, &unix_seq_ops, 3622 sizeof(struct seq_net_private))) 3623 goto err_sysctl; 3624 #endif 3625 3626 net->unx.table.locks = kvmalloc_array(UNIX_HASH_SIZE, 3627 sizeof(spinlock_t), GFP_KERNEL); 3628 if (!net->unx.table.locks) 3629 goto err_proc; 3630 3631 net->unx.table.buckets = kvmalloc_array(UNIX_HASH_SIZE, 3632 sizeof(struct hlist_head), 3633 GFP_KERNEL); 3634 if (!net->unx.table.buckets) 3635 goto free_locks; 3636 3637 for (i = 0; i < UNIX_HASH_SIZE; i++) { 3638 spin_lock_init(&net->unx.table.locks[i]); 3639 INIT_HLIST_HEAD(&net->unx.table.buckets[i]); 3640 } 3641 3642 return 0; 3643 3644 free_locks: 3645 kvfree(net->unx.table.locks); 3646 err_proc: 3647 #ifdef CONFIG_PROC_FS 3648 remove_proc_entry("unix", net->proc_net); 3649 err_sysctl: 3650 #endif 3651 unix_sysctl_unregister(net); 3652 out: 3653 return -ENOMEM; 3654 } 3655 3656 static void __net_exit unix_net_exit(struct net *net) 3657 { 3658 kvfree(net->unx.table.buckets); 3659 
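	/* Undo unix_net_init(): the bucket array was freed above, the lock
	 * array goes next, then the sysctl table and /proc/net/unix entry.
	 */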
kvfree(net->unx.table.locks); 3660 unix_sysctl_unregister(net); 3661 remove_proc_entry("unix", net->proc_net); 3662 } 3663 3664 static struct pernet_operations unix_net_ops = { 3665 .init = unix_net_init, 3666 .exit = unix_net_exit, 3667 }; 3668 3669 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) 3670 DEFINE_BPF_ITER_FUNC(unix, struct bpf_iter_meta *meta, 3671 struct unix_sock *unix_sk, uid_t uid) 3672 3673 #define INIT_BATCH_SZ 16 3674 3675 static int bpf_iter_init_unix(void *priv_data, struct bpf_iter_aux_info *aux) 3676 { 3677 struct bpf_unix_iter_state *iter = priv_data; 3678 int err; 3679 3680 err = bpf_iter_init_seq_net(priv_data, aux); 3681 if (err) 3682 return err; 3683 3684 err = bpf_iter_unix_realloc_batch(iter, INIT_BATCH_SZ); 3685 if (err) { 3686 bpf_iter_fini_seq_net(priv_data); 3687 return err; 3688 } 3689 3690 return 0; 3691 } 3692 3693 static void bpf_iter_fini_unix(void *priv_data) 3694 { 3695 struct bpf_unix_iter_state *iter = priv_data; 3696 3697 bpf_iter_fini_seq_net(priv_data); 3698 kvfree(iter->batch); 3699 } 3700 3701 static const struct bpf_iter_seq_info unix_seq_info = { 3702 .seq_ops = &bpf_iter_unix_seq_ops, 3703 .init_seq_private = bpf_iter_init_unix, 3704 .fini_seq_private = bpf_iter_fini_unix, 3705 .seq_priv_size = sizeof(struct bpf_unix_iter_state), 3706 }; 3707 3708 static const struct bpf_func_proto * 3709 bpf_iter_unix_get_func_proto(enum bpf_func_id func_id, 3710 const struct bpf_prog *prog) 3711 { 3712 switch (func_id) { 3713 case BPF_FUNC_setsockopt: 3714 return &bpf_sk_setsockopt_proto; 3715 case BPF_FUNC_getsockopt: 3716 return &bpf_sk_getsockopt_proto; 3717 default: 3718 return NULL; 3719 } 3720 } 3721 3722 static struct bpf_iter_reg unix_reg_info = { 3723 .target = "unix", 3724 .ctx_arg_info_size = 1, 3725 .ctx_arg_info = { 3726 { offsetof(struct bpf_iter__unix, unix_sk), 3727 PTR_TO_BTF_ID_OR_NULL }, 3728 }, 3729 .get_func_proto = bpf_iter_unix_get_func_proto, 3730 .seq_info = &unix_seq_info, 3731 }; 3732 3733 static void __init bpf_iter_register(void) 3734 { 3735 unix_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_UNIX]; 3736 if (bpf_iter_reg_target(&unix_reg_info)) 3737 pr_warn("Warning: could not register bpf iterator unix\n"); 3738 } 3739 #endif 3740 3741 static int __init af_unix_init(void) 3742 { 3743 int i, rc = -1; 3744 3745 BUILD_BUG_ON(sizeof(struct unix_skb_parms) > sizeof_field(struct sk_buff, cb)); 3746 3747 for (i = 0; i < UNIX_HASH_SIZE / 2; i++) { 3748 spin_lock_init(&bsd_socket_locks[i]); 3749 INIT_HLIST_HEAD(&bsd_socket_buckets[i]); 3750 } 3751 3752 rc = proto_register(&unix_dgram_proto, 1); 3753 if (rc != 0) { 3754 pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__); 3755 goto out; 3756 } 3757 3758 rc = proto_register(&unix_stream_proto, 1); 3759 if (rc != 0) { 3760 pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__); 3761 proto_unregister(&unix_dgram_proto); 3762 goto out; 3763 } 3764 3765 sock_register(&unix_family_ops); 3766 register_pernet_subsys(&unix_net_ops); 3767 unix_bpf_build_proto(); 3768 3769 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) 3770 bpf_iter_register(); 3771 #endif 3772 3773 out: 3774 return rc; 3775 } 3776 3777 /* Later than subsys_initcall() because we depend on stuff initialised there */ 3778 fs_initcall(af_unix_init); 3779
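/*
 * Illustrative userspace sketch (not part of the kernel build, error
 * handling omitted) of the CONFIG_AF_UNIX_OOB paths implemented above:
 * queue_oob() stores a single out-of-band byte per socket, SIOCATMARK
 * reports whether that byte is next in the stream, and recv(..., MSG_OOB)
 * fetches it via unix_stream_recv_urg().
 *
 *	int sv[2], atmark = 0;
 *	char c;
 *
 *	socketpair(AF_UNIX, SOCK_STREAM, 0, sv);
 *	send(sv[0], "a", 1, 0);
 *	send(sv[0], "b", 1, MSG_OOB);		// queued by queue_oob()
 *	recv(sv[1], &c, 1, 0);			// c == 'a'
 *	ioctl(sv[1], SIOCATMARK, &atmark);	// atmark == 1, mark is next
 *	recv(sv[1], &c, 1, MSG_OOB);		// c == 'b'
 */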