1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* 3 * NET4: Implementation of BSD Unix domain sockets. 4 * 5 * Authors: Alan Cox, <alan@lxorguk.ukuu.org.uk> 6 * 7 * Fixes: 8 * Linus Torvalds : Assorted bug cures. 9 * Niibe Yutaka : async I/O support. 10 * Carsten Paeth : PF_UNIX check, address fixes. 11 * Alan Cox : Limit size of allocated blocks. 12 * Alan Cox : Fixed the stupid socketpair bug. 13 * Alan Cox : BSD compatibility fine tuning. 14 * Alan Cox : Fixed a bug in connect when interrupted. 15 * Alan Cox : Sorted out a proper draft version of 16 * file descriptor passing hacked up from 17 * Mike Shaver's work. 18 * Marty Leisner : Fixes to fd passing 19 * Nick Nevin : recvmsg bugfix. 20 * Alan Cox : Started proper garbage collector 21 * Heiko EiBfeldt : Missing verify_area check 22 * Alan Cox : Started POSIXisms 23 * Andreas Schwab : Replace inode by dentry for proper 24 * reference counting 25 * Kirk Petersen : Made this a module 26 * Christoph Rohland : Elegant non-blocking accept/connect algorithm. 27 * Lots of bug fixes. 28 * Alexey Kuznetosv : Repaired (I hope) bugs introduces 29 * by above two patches. 30 * Andrea Arcangeli : If possible we block in connect(2) 31 * if the max backlog of the listen socket 32 * is been reached. This won't break 33 * old apps and it will avoid huge amount 34 * of socks hashed (this for unix_gc() 35 * performances reasons). 36 * Security fix that limits the max 37 * number of socks to 2*max_files and 38 * the number of skb queueable in the 39 * dgram receiver. 40 * Artur Skawina : Hash function optimizations 41 * Alexey Kuznetsov : Full scale SMP. Lot of bugs are introduced 8) 42 * Malcolm Beattie : Set peercred for socketpair 43 * Michal Ostrowski : Module initialization cleanup. 44 * Arnaldo C. Melo : Remove MOD_{INC,DEC}_USE_COUNT, 45 * the core infrastructure is doing that 46 * for all net proto families now (2.5.69+) 47 * 48 * Known differences from reference BSD that was tested: 49 * 50 * [TO FIX] 51 * ECONNREFUSED is not returned from one end of a connected() socket to the 52 * other the moment one end closes. 53 * fstat() doesn't return st_dev=0, and give the blksize as high water mark 54 * and a fake inode identifier (nor the BSD first socket fstat twice bug). 55 * [NOT TO FIX] 56 * accept() returns a path name even if the connecting socket has closed 57 * in the meantime (BSD loses the path and gives up). 58 * accept() returns 0 length path for an unbound connector. BSD returns 16 59 * and a null first byte in the path (but not for gethost/peername - BSD bug ??) 60 * socketpair(...SOCK_RAW..) doesn't panic the kernel. 61 * BSD af_unix apparently has connect forgetting to block properly. 62 * (need to check this with the POSIX spec in detail) 63 * 64 * Differences from 2.0.0-11-... (ANK) 65 * Bug fixes and improvements. 66 * - client shutdown killed server socket. 67 * - removed all useless cli/sti pairs. 68 * 69 * Semantic changes/extensions. 70 * - generic control message passing. 71 * - SCM_CREDENTIALS control message. 72 * - "Abstract" (not FS based) socket bindings. 73 * Abstract names are sequences of bytes (not zero terminated) 74 * started by 0, so that this name space does not intersect 75 * with BSD names. 76 */ 77 78 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 79 80 #include <linux/module.h> 81 #include <linux/kernel.h> 82 #include <linux/signal.h> 83 #include <linux/sched/signal.h> 84 #include <linux/errno.h> 85 #include <linux/string.h> 86 #include <linux/stat.h> 87 #include <linux/dcache.h> 88 #include <linux/namei.h> 89 #include <linux/socket.h> 90 #include <linux/un.h> 91 #include <linux/fcntl.h> 92 #include <linux/filter.h> 93 #include <linux/termios.h> 94 #include <linux/sockios.h> 95 #include <linux/net.h> 96 #include <linux/in.h> 97 #include <linux/fs.h> 98 #include <linux/slab.h> 99 #include <linux/uaccess.h> 100 #include <linux/skbuff.h> 101 #include <linux/netdevice.h> 102 #include <net/net_namespace.h> 103 #include <net/sock.h> 104 #include <net/tcp_states.h> 105 #include <net/af_unix.h> 106 #include <linux/proc_fs.h> 107 #include <linux/seq_file.h> 108 #include <net/scm.h> 109 #include <linux/init.h> 110 #include <linux/poll.h> 111 #include <linux/rtnetlink.h> 112 #include <linux/mount.h> 113 #include <net/checksum.h> 114 #include <linux/security.h> 115 #include <linux/splice.h> 116 #include <linux/freezer.h> 117 #include <linux/file.h> 118 #include <linux/btf_ids.h> 119 #include <linux/bpf-cgroup.h> 120 121 static atomic_long_t unix_nr_socks; 122 static struct hlist_head bsd_socket_buckets[UNIX_HASH_SIZE / 2]; 123 static spinlock_t bsd_socket_locks[UNIX_HASH_SIZE / 2]; 124 125 /* SMP locking strategy: 126 * hash table is protected with spinlock. 127 * each socket state is protected by separate spinlock. 128 */ 129 130 static unsigned int unix_unbound_hash(struct sock *sk) 131 { 132 unsigned long hash = (unsigned long)sk; 133 134 hash ^= hash >> 16; 135 hash ^= hash >> 8; 136 hash ^= sk->sk_type; 137 138 return hash & UNIX_HASH_MOD; 139 } 140 141 static unsigned int unix_bsd_hash(struct inode *i) 142 { 143 return i->i_ino & UNIX_HASH_MOD; 144 } 145 146 static unsigned int unix_abstract_hash(struct sockaddr_un *sunaddr, 147 int addr_len, int type) 148 { 149 __wsum csum = csum_partial(sunaddr, addr_len, 0); 150 unsigned int hash; 151 152 hash = (__force unsigned int)csum_fold(csum); 153 hash ^= hash >> 8; 154 hash ^= type; 155 156 return UNIX_HASH_MOD + 1 + (hash & UNIX_HASH_MOD); 157 } 158 159 static void unix_table_double_lock(struct net *net, 160 unsigned int hash1, unsigned int hash2) 161 { 162 if (hash1 == hash2) { 163 spin_lock(&net->unx.table.locks[hash1]); 164 return; 165 } 166 167 if (hash1 > hash2) 168 swap(hash1, hash2); 169 170 spin_lock(&net->unx.table.locks[hash1]); 171 spin_lock_nested(&net->unx.table.locks[hash2], SINGLE_DEPTH_NESTING); 172 } 173 174 static void unix_table_double_unlock(struct net *net, 175 unsigned int hash1, unsigned int hash2) 176 { 177 if (hash1 == hash2) { 178 spin_unlock(&net->unx.table.locks[hash1]); 179 return; 180 } 181 182 spin_unlock(&net->unx.table.locks[hash1]); 183 spin_unlock(&net->unx.table.locks[hash2]); 184 } 185 186 #ifdef CONFIG_SECURITY_NETWORK 187 static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb) 188 { 189 UNIXCB(skb).secid = scm->secid; 190 } 191 192 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb) 193 { 194 scm->secid = UNIXCB(skb).secid; 195 } 196 197 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb) 198 { 199 return (scm->secid == UNIXCB(skb).secid); 200 } 201 #else 202 static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb) 203 { } 204 205 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb) 206 { } 207 208 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb) 209 { 210 return true; 211 } 212 #endif /* CONFIG_SECURITY_NETWORK */ 213 214 static inline int unix_our_peer(struct sock *sk, struct sock *osk) 215 { 216 return unix_peer(osk) == sk; 217 } 218 219 static inline int unix_may_send(struct sock *sk, struct sock *osk) 220 { 221 return unix_peer(osk) == NULL || unix_our_peer(sk, osk); 222 } 223 224 static inline int unix_recvq_full(const struct sock *sk) 225 { 226 return skb_queue_len(&sk->sk_receive_queue) > sk->sk_max_ack_backlog; 227 } 228 229 static inline int unix_recvq_full_lockless(const struct sock *sk) 230 { 231 return skb_queue_len_lockless(&sk->sk_receive_queue) > 232 READ_ONCE(sk->sk_max_ack_backlog); 233 } 234 235 struct sock *unix_peer_get(struct sock *s) 236 { 237 struct sock *peer; 238 239 unix_state_lock(s); 240 peer = unix_peer(s); 241 if (peer) 242 sock_hold(peer); 243 unix_state_unlock(s); 244 return peer; 245 } 246 EXPORT_SYMBOL_GPL(unix_peer_get); 247 248 static struct unix_address *unix_create_addr(struct sockaddr_un *sunaddr, 249 int addr_len) 250 { 251 struct unix_address *addr; 252 253 addr = kmalloc(sizeof(*addr) + addr_len, GFP_KERNEL); 254 if (!addr) 255 return NULL; 256 257 refcount_set(&addr->refcnt, 1); 258 addr->len = addr_len; 259 memcpy(addr->name, sunaddr, addr_len); 260 261 return addr; 262 } 263 264 static inline void unix_release_addr(struct unix_address *addr) 265 { 266 if (refcount_dec_and_test(&addr->refcnt)) 267 kfree(addr); 268 } 269 270 /* 271 * Check unix socket name: 272 * - should be not zero length. 273 * - if started by not zero, should be NULL terminated (FS object) 274 * - if started by zero, it is abstract name. 275 */ 276 277 static int unix_validate_addr(struct sockaddr_un *sunaddr, int addr_len) 278 { 279 if (addr_len <= offsetof(struct sockaddr_un, sun_path) || 280 addr_len > sizeof(*sunaddr)) 281 return -EINVAL; 282 283 if (sunaddr->sun_family != AF_UNIX) 284 return -EINVAL; 285 286 return 0; 287 } 288 289 static int unix_mkname_bsd(struct sockaddr_un *sunaddr, int addr_len) 290 { 291 struct sockaddr_storage *addr = (struct sockaddr_storage *)sunaddr; 292 short offset = offsetof(struct sockaddr_storage, __data); 293 294 BUILD_BUG_ON(offset != offsetof(struct sockaddr_un, sun_path)); 295 296 /* This may look like an off by one error but it is a bit more 297 * subtle. 108 is the longest valid AF_UNIX path for a binding. 298 * sun_path[108] doesn't as such exist. However in kernel space 299 * we are guaranteed that it is a valid memory location in our 300 * kernel address buffer because syscall functions always pass 301 * a pointer of struct sockaddr_storage which has a bigger buffer 302 * than 108. Also, we must terminate sun_path for strlen() in 303 * getname_kernel(). 304 */ 305 addr->__data[addr_len - offset] = 0; 306 307 /* Don't pass sunaddr->sun_path to strlen(). Otherwise, 108 will 308 * cause panic if CONFIG_FORTIFY_SOURCE=y. Let __fortify_strlen() 309 * know the actual buffer. 310 */ 311 return strlen(addr->__data) + offset + 1; 312 } 313 314 static void __unix_remove_socket(struct sock *sk) 315 { 316 sk_del_node_init(sk); 317 } 318 319 static void __unix_insert_socket(struct net *net, struct sock *sk) 320 { 321 DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk)); 322 sk_add_node(sk, &net->unx.table.buckets[sk->sk_hash]); 323 } 324 325 static void __unix_set_addr_hash(struct net *net, struct sock *sk, 326 struct unix_address *addr, unsigned int hash) 327 { 328 __unix_remove_socket(sk); 329 smp_store_release(&unix_sk(sk)->addr, addr); 330 331 sk->sk_hash = hash; 332 __unix_insert_socket(net, sk); 333 } 334 335 static void unix_remove_socket(struct net *net, struct sock *sk) 336 { 337 spin_lock(&net->unx.table.locks[sk->sk_hash]); 338 __unix_remove_socket(sk); 339 spin_unlock(&net->unx.table.locks[sk->sk_hash]); 340 } 341 342 static void unix_insert_unbound_socket(struct net *net, struct sock *sk) 343 { 344 spin_lock(&net->unx.table.locks[sk->sk_hash]); 345 __unix_insert_socket(net, sk); 346 spin_unlock(&net->unx.table.locks[sk->sk_hash]); 347 } 348 349 static void unix_insert_bsd_socket(struct sock *sk) 350 { 351 spin_lock(&bsd_socket_locks[sk->sk_hash]); 352 sk_add_bind_node(sk, &bsd_socket_buckets[sk->sk_hash]); 353 spin_unlock(&bsd_socket_locks[sk->sk_hash]); 354 } 355 356 static void unix_remove_bsd_socket(struct sock *sk) 357 { 358 if (!hlist_unhashed(&sk->sk_bind_node)) { 359 spin_lock(&bsd_socket_locks[sk->sk_hash]); 360 __sk_del_bind_node(sk); 361 spin_unlock(&bsd_socket_locks[sk->sk_hash]); 362 363 sk_node_init(&sk->sk_bind_node); 364 } 365 } 366 367 static struct sock *__unix_find_socket_byname(struct net *net, 368 struct sockaddr_un *sunname, 369 int len, unsigned int hash) 370 { 371 struct sock *s; 372 373 sk_for_each(s, &net->unx.table.buckets[hash]) { 374 struct unix_sock *u = unix_sk(s); 375 376 if (u->addr->len == len && 377 !memcmp(u->addr->name, sunname, len)) 378 return s; 379 } 380 return NULL; 381 } 382 383 static inline struct sock *unix_find_socket_byname(struct net *net, 384 struct sockaddr_un *sunname, 385 int len, unsigned int hash) 386 { 387 struct sock *s; 388 389 spin_lock(&net->unx.table.locks[hash]); 390 s = __unix_find_socket_byname(net, sunname, len, hash); 391 if (s) 392 sock_hold(s); 393 spin_unlock(&net->unx.table.locks[hash]); 394 return s; 395 } 396 397 static struct sock *unix_find_socket_byinode(struct inode *i) 398 { 399 unsigned int hash = unix_bsd_hash(i); 400 struct sock *s; 401 402 spin_lock(&bsd_socket_locks[hash]); 403 sk_for_each_bound(s, &bsd_socket_buckets[hash]) { 404 struct dentry *dentry = unix_sk(s)->path.dentry; 405 406 if (dentry && d_backing_inode(dentry) == i) { 407 sock_hold(s); 408 spin_unlock(&bsd_socket_locks[hash]); 409 return s; 410 } 411 } 412 spin_unlock(&bsd_socket_locks[hash]); 413 return NULL; 414 } 415 416 /* Support code for asymmetrically connected dgram sockets 417 * 418 * If a datagram socket is connected to a socket not itself connected 419 * to the first socket (eg, /dev/log), clients may only enqueue more 420 * messages if the present receive queue of the server socket is not 421 * "too large". This means there's a second writeability condition 422 * poll and sendmsg need to test. The dgram recv code will do a wake 423 * up on the peer_wait wait queue of a socket upon reception of a 424 * datagram which needs to be propagated to sleeping would-be writers 425 * since these might not have sent anything so far. This can't be 426 * accomplished via poll_wait because the lifetime of the server 427 * socket might be less than that of its clients if these break their 428 * association with it or if the server socket is closed while clients 429 * are still connected to it and there's no way to inform "a polling 430 * implementation" that it should let go of a certain wait queue 431 * 432 * In order to propagate a wake up, a wait_queue_entry_t of the client 433 * socket is enqueued on the peer_wait queue of the server socket 434 * whose wake function does a wake_up on the ordinary client socket 435 * wait queue. This connection is established whenever a write (or 436 * poll for write) hit the flow control condition and broken when the 437 * association to the server socket is dissolved or after a wake up 438 * was relayed. 439 */ 440 441 static int unix_dgram_peer_wake_relay(wait_queue_entry_t *q, unsigned mode, int flags, 442 void *key) 443 { 444 struct unix_sock *u; 445 wait_queue_head_t *u_sleep; 446 447 u = container_of(q, struct unix_sock, peer_wake); 448 449 __remove_wait_queue(&unix_sk(u->peer_wake.private)->peer_wait, 450 q); 451 u->peer_wake.private = NULL; 452 453 /* relaying can only happen while the wq still exists */ 454 u_sleep = sk_sleep(&u->sk); 455 if (u_sleep) 456 wake_up_interruptible_poll(u_sleep, key_to_poll(key)); 457 458 return 0; 459 } 460 461 static int unix_dgram_peer_wake_connect(struct sock *sk, struct sock *other) 462 { 463 struct unix_sock *u, *u_other; 464 int rc; 465 466 u = unix_sk(sk); 467 u_other = unix_sk(other); 468 rc = 0; 469 spin_lock(&u_other->peer_wait.lock); 470 471 if (!u->peer_wake.private) { 472 u->peer_wake.private = other; 473 __add_wait_queue(&u_other->peer_wait, &u->peer_wake); 474 475 rc = 1; 476 } 477 478 spin_unlock(&u_other->peer_wait.lock); 479 return rc; 480 } 481 482 static void unix_dgram_peer_wake_disconnect(struct sock *sk, 483 struct sock *other) 484 { 485 struct unix_sock *u, *u_other; 486 487 u = unix_sk(sk); 488 u_other = unix_sk(other); 489 spin_lock(&u_other->peer_wait.lock); 490 491 if (u->peer_wake.private == other) { 492 __remove_wait_queue(&u_other->peer_wait, &u->peer_wake); 493 u->peer_wake.private = NULL; 494 } 495 496 spin_unlock(&u_other->peer_wait.lock); 497 } 498 499 static void unix_dgram_peer_wake_disconnect_wakeup(struct sock *sk, 500 struct sock *other) 501 { 502 unix_dgram_peer_wake_disconnect(sk, other); 503 wake_up_interruptible_poll(sk_sleep(sk), 504 EPOLLOUT | 505 EPOLLWRNORM | 506 EPOLLWRBAND); 507 } 508 509 /* preconditions: 510 * - unix_peer(sk) == other 511 * - association is stable 512 */ 513 static int unix_dgram_peer_wake_me(struct sock *sk, struct sock *other) 514 { 515 int connected; 516 517 connected = unix_dgram_peer_wake_connect(sk, other); 518 519 /* If other is SOCK_DEAD, we want to make sure we signal 520 * POLLOUT, such that a subsequent write() can get a 521 * -ECONNREFUSED. Otherwise, if we haven't queued any skbs 522 * to other and its full, we will hang waiting for POLLOUT. 523 */ 524 if (unix_recvq_full_lockless(other) && !sock_flag(other, SOCK_DEAD)) 525 return 1; 526 527 if (connected) 528 unix_dgram_peer_wake_disconnect(sk, other); 529 530 return 0; 531 } 532 533 static int unix_writable(const struct sock *sk) 534 { 535 return sk->sk_state != TCP_LISTEN && 536 (refcount_read(&sk->sk_wmem_alloc) << 2) <= sk->sk_sndbuf; 537 } 538 539 static void unix_write_space(struct sock *sk) 540 { 541 struct socket_wq *wq; 542 543 rcu_read_lock(); 544 if (unix_writable(sk)) { 545 wq = rcu_dereference(sk->sk_wq); 546 if (skwq_has_sleeper(wq)) 547 wake_up_interruptible_sync_poll(&wq->wait, 548 EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND); 549 sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); 550 } 551 rcu_read_unlock(); 552 } 553 554 /* When dgram socket disconnects (or changes its peer), we clear its receive 555 * queue of packets arrived from previous peer. First, it allows to do 556 * flow control based only on wmem_alloc; second, sk connected to peer 557 * may receive messages only from that peer. */ 558 static void unix_dgram_disconnected(struct sock *sk, struct sock *other) 559 { 560 if (!skb_queue_empty(&sk->sk_receive_queue)) { 561 skb_queue_purge(&sk->sk_receive_queue); 562 wake_up_interruptible_all(&unix_sk(sk)->peer_wait); 563 564 /* If one link of bidirectional dgram pipe is disconnected, 565 * we signal error. Messages are lost. Do not make this, 566 * when peer was not connected to us. 567 */ 568 if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) { 569 WRITE_ONCE(other->sk_err, ECONNRESET); 570 sk_error_report(other); 571 } 572 } 573 other->sk_state = TCP_CLOSE; 574 } 575 576 static void unix_sock_destructor(struct sock *sk) 577 { 578 struct unix_sock *u = unix_sk(sk); 579 580 skb_queue_purge(&sk->sk_receive_queue); 581 582 DEBUG_NET_WARN_ON_ONCE(refcount_read(&sk->sk_wmem_alloc)); 583 DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk)); 584 DEBUG_NET_WARN_ON_ONCE(sk->sk_socket); 585 if (!sock_flag(sk, SOCK_DEAD)) { 586 pr_info("Attempt to release alive unix socket: %p\n", sk); 587 return; 588 } 589 590 if (u->addr) 591 unix_release_addr(u->addr); 592 593 atomic_long_dec(&unix_nr_socks); 594 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); 595 #ifdef UNIX_REFCNT_DEBUG 596 pr_debug("UNIX %p is destroyed, %ld are still alive.\n", sk, 597 atomic_long_read(&unix_nr_socks)); 598 #endif 599 } 600 601 static void unix_release_sock(struct sock *sk, int embrion) 602 { 603 struct unix_sock *u = unix_sk(sk); 604 struct sock *skpair; 605 struct sk_buff *skb; 606 struct path path; 607 int state; 608 609 unix_remove_socket(sock_net(sk), sk); 610 unix_remove_bsd_socket(sk); 611 612 /* Clear state */ 613 unix_state_lock(sk); 614 sock_orphan(sk); 615 WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK); 616 path = u->path; 617 u->path.dentry = NULL; 618 u->path.mnt = NULL; 619 state = sk->sk_state; 620 sk->sk_state = TCP_CLOSE; 621 622 skpair = unix_peer(sk); 623 unix_peer(sk) = NULL; 624 625 unix_state_unlock(sk); 626 627 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 628 if (u->oob_skb) { 629 kfree_skb(u->oob_skb); 630 u->oob_skb = NULL; 631 } 632 #endif 633 634 wake_up_interruptible_all(&u->peer_wait); 635 636 if (skpair != NULL) { 637 if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) { 638 unix_state_lock(skpair); 639 /* No more writes */ 640 WRITE_ONCE(skpair->sk_shutdown, SHUTDOWN_MASK); 641 if (!skb_queue_empty(&sk->sk_receive_queue) || embrion) 642 WRITE_ONCE(skpair->sk_err, ECONNRESET); 643 unix_state_unlock(skpair); 644 skpair->sk_state_change(skpair); 645 sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP); 646 } 647 648 unix_dgram_peer_wake_disconnect(sk, skpair); 649 sock_put(skpair); /* It may now die */ 650 } 651 652 /* Try to flush out this socket. Throw out buffers at least */ 653 654 while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) { 655 if (state == TCP_LISTEN) 656 unix_release_sock(skb->sk, 1); 657 /* passed fds are erased in the kfree_skb hook */ 658 UNIXCB(skb).consumed = skb->len; 659 kfree_skb(skb); 660 } 661 662 if (path.dentry) 663 path_put(&path); 664 665 sock_put(sk); 666 667 /* ---- Socket is dead now and most probably destroyed ---- */ 668 669 /* 670 * Fixme: BSD difference: In BSD all sockets connected to us get 671 * ECONNRESET and we die on the spot. In Linux we behave 672 * like files and pipes do and wait for the last 673 * dereference. 674 * 675 * Can't we simply set sock->err? 676 * 677 * What the above comment does talk about? --ANK(980817) 678 */ 679 680 if (READ_ONCE(unix_tot_inflight)) 681 unix_gc(); /* Garbage collect fds */ 682 } 683 684 static void init_peercred(struct sock *sk) 685 { 686 const struct cred *old_cred; 687 struct pid *old_pid; 688 689 spin_lock(&sk->sk_peer_lock); 690 old_pid = sk->sk_peer_pid; 691 old_cred = sk->sk_peer_cred; 692 sk->sk_peer_pid = get_pid(task_tgid(current)); 693 sk->sk_peer_cred = get_current_cred(); 694 spin_unlock(&sk->sk_peer_lock); 695 696 put_pid(old_pid); 697 put_cred(old_cred); 698 } 699 700 static void copy_peercred(struct sock *sk, struct sock *peersk) 701 { 702 const struct cred *old_cred; 703 struct pid *old_pid; 704 705 if (sk < peersk) { 706 spin_lock(&sk->sk_peer_lock); 707 spin_lock_nested(&peersk->sk_peer_lock, SINGLE_DEPTH_NESTING); 708 } else { 709 spin_lock(&peersk->sk_peer_lock); 710 spin_lock_nested(&sk->sk_peer_lock, SINGLE_DEPTH_NESTING); 711 } 712 old_pid = sk->sk_peer_pid; 713 old_cred = sk->sk_peer_cred; 714 sk->sk_peer_pid = get_pid(peersk->sk_peer_pid); 715 sk->sk_peer_cred = get_cred(peersk->sk_peer_cred); 716 717 spin_unlock(&sk->sk_peer_lock); 718 spin_unlock(&peersk->sk_peer_lock); 719 720 put_pid(old_pid); 721 put_cred(old_cred); 722 } 723 724 static int unix_listen(struct socket *sock, int backlog) 725 { 726 int err; 727 struct sock *sk = sock->sk; 728 struct unix_sock *u = unix_sk(sk); 729 730 err = -EOPNOTSUPP; 731 if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET) 732 goto out; /* Only stream/seqpacket sockets accept */ 733 err = -EINVAL; 734 if (!u->addr) 735 goto out; /* No listens on an unbound socket */ 736 unix_state_lock(sk); 737 if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN) 738 goto out_unlock; 739 if (backlog > sk->sk_max_ack_backlog) 740 wake_up_interruptible_all(&u->peer_wait); 741 sk->sk_max_ack_backlog = backlog; 742 sk->sk_state = TCP_LISTEN; 743 /* set credentials so connect can copy them */ 744 init_peercred(sk); 745 err = 0; 746 747 out_unlock: 748 unix_state_unlock(sk); 749 out: 750 return err; 751 } 752 753 static int unix_release(struct socket *); 754 static int unix_bind(struct socket *, struct sockaddr *, int); 755 static int unix_stream_connect(struct socket *, struct sockaddr *, 756 int addr_len, int flags); 757 static int unix_socketpair(struct socket *, struct socket *); 758 static int unix_accept(struct socket *, struct socket *, int, bool); 759 static int unix_getname(struct socket *, struct sockaddr *, int); 760 static __poll_t unix_poll(struct file *, struct socket *, poll_table *); 761 static __poll_t unix_dgram_poll(struct file *, struct socket *, 762 poll_table *); 763 static int unix_ioctl(struct socket *, unsigned int, unsigned long); 764 #ifdef CONFIG_COMPAT 765 static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg); 766 #endif 767 static int unix_shutdown(struct socket *, int); 768 static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t); 769 static int unix_stream_recvmsg(struct socket *, struct msghdr *, size_t, int); 770 static ssize_t unix_stream_splice_read(struct socket *, loff_t *ppos, 771 struct pipe_inode_info *, size_t size, 772 unsigned int flags); 773 static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t); 774 static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int); 775 static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor); 776 static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor); 777 static int unix_dgram_connect(struct socket *, struct sockaddr *, 778 int, int); 779 static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t); 780 static int unix_seqpacket_recvmsg(struct socket *, struct msghdr *, size_t, 781 int); 782 783 static int unix_set_peek_off(struct sock *sk, int val) 784 { 785 struct unix_sock *u = unix_sk(sk); 786 787 if (mutex_lock_interruptible(&u->iolock)) 788 return -EINTR; 789 790 WRITE_ONCE(sk->sk_peek_off, val); 791 mutex_unlock(&u->iolock); 792 793 return 0; 794 } 795 796 #ifdef CONFIG_PROC_FS 797 static int unix_count_nr_fds(struct sock *sk) 798 { 799 struct sk_buff *skb; 800 struct unix_sock *u; 801 int nr_fds = 0; 802 803 spin_lock(&sk->sk_receive_queue.lock); 804 skb = skb_peek(&sk->sk_receive_queue); 805 while (skb) { 806 u = unix_sk(skb->sk); 807 nr_fds += atomic_read(&u->scm_stat.nr_fds); 808 skb = skb_peek_next(skb, &sk->sk_receive_queue); 809 } 810 spin_unlock(&sk->sk_receive_queue.lock); 811 812 return nr_fds; 813 } 814 815 static void unix_show_fdinfo(struct seq_file *m, struct socket *sock) 816 { 817 struct sock *sk = sock->sk; 818 unsigned char s_state; 819 struct unix_sock *u; 820 int nr_fds = 0; 821 822 if (sk) { 823 s_state = READ_ONCE(sk->sk_state); 824 u = unix_sk(sk); 825 826 /* SOCK_STREAM and SOCK_SEQPACKET sockets never change their 827 * sk_state after switching to TCP_ESTABLISHED or TCP_LISTEN. 828 * SOCK_DGRAM is ordinary. So, no lock is needed. 829 */ 830 if (sock->type == SOCK_DGRAM || s_state == TCP_ESTABLISHED) 831 nr_fds = atomic_read(&u->scm_stat.nr_fds); 832 else if (s_state == TCP_LISTEN) 833 nr_fds = unix_count_nr_fds(sk); 834 835 seq_printf(m, "scm_fds: %u\n", nr_fds); 836 } 837 } 838 #else 839 #define unix_show_fdinfo NULL 840 #endif 841 842 static const struct proto_ops unix_stream_ops = { 843 .family = PF_UNIX, 844 .owner = THIS_MODULE, 845 .release = unix_release, 846 .bind = unix_bind, 847 .connect = unix_stream_connect, 848 .socketpair = unix_socketpair, 849 .accept = unix_accept, 850 .getname = unix_getname, 851 .poll = unix_poll, 852 .ioctl = unix_ioctl, 853 #ifdef CONFIG_COMPAT 854 .compat_ioctl = unix_compat_ioctl, 855 #endif 856 .listen = unix_listen, 857 .shutdown = unix_shutdown, 858 .sendmsg = unix_stream_sendmsg, 859 .recvmsg = unix_stream_recvmsg, 860 .read_skb = unix_stream_read_skb, 861 .mmap = sock_no_mmap, 862 .splice_read = unix_stream_splice_read, 863 .set_peek_off = unix_set_peek_off, 864 .show_fdinfo = unix_show_fdinfo, 865 }; 866 867 static const struct proto_ops unix_dgram_ops = { 868 .family = PF_UNIX, 869 .owner = THIS_MODULE, 870 .release = unix_release, 871 .bind = unix_bind, 872 .connect = unix_dgram_connect, 873 .socketpair = unix_socketpair, 874 .accept = sock_no_accept, 875 .getname = unix_getname, 876 .poll = unix_dgram_poll, 877 .ioctl = unix_ioctl, 878 #ifdef CONFIG_COMPAT 879 .compat_ioctl = unix_compat_ioctl, 880 #endif 881 .listen = sock_no_listen, 882 .shutdown = unix_shutdown, 883 .sendmsg = unix_dgram_sendmsg, 884 .read_skb = unix_read_skb, 885 .recvmsg = unix_dgram_recvmsg, 886 .mmap = sock_no_mmap, 887 .set_peek_off = unix_set_peek_off, 888 .show_fdinfo = unix_show_fdinfo, 889 }; 890 891 static const struct proto_ops unix_seqpacket_ops = { 892 .family = PF_UNIX, 893 .owner = THIS_MODULE, 894 .release = unix_release, 895 .bind = unix_bind, 896 .connect = unix_stream_connect, 897 .socketpair = unix_socketpair, 898 .accept = unix_accept, 899 .getname = unix_getname, 900 .poll = unix_dgram_poll, 901 .ioctl = unix_ioctl, 902 #ifdef CONFIG_COMPAT 903 .compat_ioctl = unix_compat_ioctl, 904 #endif 905 .listen = unix_listen, 906 .shutdown = unix_shutdown, 907 .sendmsg = unix_seqpacket_sendmsg, 908 .recvmsg = unix_seqpacket_recvmsg, 909 .mmap = sock_no_mmap, 910 .set_peek_off = unix_set_peek_off, 911 .show_fdinfo = unix_show_fdinfo, 912 }; 913 914 static void unix_close(struct sock *sk, long timeout) 915 { 916 /* Nothing to do here, unix socket does not need a ->close(). 917 * This is merely for sockmap. 918 */ 919 } 920 921 static void unix_unhash(struct sock *sk) 922 { 923 /* Nothing to do here, unix socket does not need a ->unhash(). 924 * This is merely for sockmap. 925 */ 926 } 927 928 static bool unix_bpf_bypass_getsockopt(int level, int optname) 929 { 930 if (level == SOL_SOCKET) { 931 switch (optname) { 932 case SO_PEERPIDFD: 933 return true; 934 default: 935 return false; 936 } 937 } 938 939 return false; 940 } 941 942 struct proto unix_dgram_proto = { 943 .name = "UNIX", 944 .owner = THIS_MODULE, 945 .obj_size = sizeof(struct unix_sock), 946 .close = unix_close, 947 .bpf_bypass_getsockopt = unix_bpf_bypass_getsockopt, 948 #ifdef CONFIG_BPF_SYSCALL 949 .psock_update_sk_prot = unix_dgram_bpf_update_proto, 950 #endif 951 }; 952 953 struct proto unix_stream_proto = { 954 .name = "UNIX-STREAM", 955 .owner = THIS_MODULE, 956 .obj_size = sizeof(struct unix_sock), 957 .close = unix_close, 958 .unhash = unix_unhash, 959 .bpf_bypass_getsockopt = unix_bpf_bypass_getsockopt, 960 #ifdef CONFIG_BPF_SYSCALL 961 .psock_update_sk_prot = unix_stream_bpf_update_proto, 962 #endif 963 }; 964 965 static struct sock *unix_create1(struct net *net, struct socket *sock, int kern, int type) 966 { 967 struct unix_sock *u; 968 struct sock *sk; 969 int err; 970 971 atomic_long_inc(&unix_nr_socks); 972 if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files()) { 973 err = -ENFILE; 974 goto err; 975 } 976 977 if (type == SOCK_STREAM) 978 sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_stream_proto, kern); 979 else /*dgram and seqpacket */ 980 sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_dgram_proto, kern); 981 982 if (!sk) { 983 err = -ENOMEM; 984 goto err; 985 } 986 987 sock_init_data(sock, sk); 988 989 sk->sk_hash = unix_unbound_hash(sk); 990 sk->sk_allocation = GFP_KERNEL_ACCOUNT; 991 sk->sk_write_space = unix_write_space; 992 sk->sk_max_ack_backlog = net->unx.sysctl_max_dgram_qlen; 993 sk->sk_destruct = unix_sock_destructor; 994 u = unix_sk(sk); 995 u->inflight = 0; 996 u->path.dentry = NULL; 997 u->path.mnt = NULL; 998 spin_lock_init(&u->lock); 999 INIT_LIST_HEAD(&u->link); 1000 mutex_init(&u->iolock); /* single task reading lock */ 1001 mutex_init(&u->bindlock); /* single task binding lock */ 1002 init_waitqueue_head(&u->peer_wait); 1003 init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay); 1004 memset(&u->scm_stat, 0, sizeof(struct scm_stat)); 1005 unix_insert_unbound_socket(net, sk); 1006 1007 sock_prot_inuse_add(net, sk->sk_prot, 1); 1008 1009 return sk; 1010 1011 err: 1012 atomic_long_dec(&unix_nr_socks); 1013 return ERR_PTR(err); 1014 } 1015 1016 static int unix_create(struct net *net, struct socket *sock, int protocol, 1017 int kern) 1018 { 1019 struct sock *sk; 1020 1021 if (protocol && protocol != PF_UNIX) 1022 return -EPROTONOSUPPORT; 1023 1024 sock->state = SS_UNCONNECTED; 1025 1026 switch (sock->type) { 1027 case SOCK_STREAM: 1028 sock->ops = &unix_stream_ops; 1029 break; 1030 /* 1031 * Believe it or not BSD has AF_UNIX, SOCK_RAW though 1032 * nothing uses it. 1033 */ 1034 case SOCK_RAW: 1035 sock->type = SOCK_DGRAM; 1036 fallthrough; 1037 case SOCK_DGRAM: 1038 sock->ops = &unix_dgram_ops; 1039 break; 1040 case SOCK_SEQPACKET: 1041 sock->ops = &unix_seqpacket_ops; 1042 break; 1043 default: 1044 return -ESOCKTNOSUPPORT; 1045 } 1046 1047 sk = unix_create1(net, sock, kern, sock->type); 1048 if (IS_ERR(sk)) 1049 return PTR_ERR(sk); 1050 1051 return 0; 1052 } 1053 1054 static int unix_release(struct socket *sock) 1055 { 1056 struct sock *sk = sock->sk; 1057 1058 if (!sk) 1059 return 0; 1060 1061 sk->sk_prot->close(sk, 0); 1062 unix_release_sock(sk, 0); 1063 sock->sk = NULL; 1064 1065 return 0; 1066 } 1067 1068 static struct sock *unix_find_bsd(struct sockaddr_un *sunaddr, int addr_len, 1069 int type) 1070 { 1071 struct inode *inode; 1072 struct path path; 1073 struct sock *sk; 1074 int err; 1075 1076 unix_mkname_bsd(sunaddr, addr_len); 1077 err = kern_path(sunaddr->sun_path, LOOKUP_FOLLOW, &path); 1078 if (err) 1079 goto fail; 1080 1081 err = path_permission(&path, MAY_WRITE); 1082 if (err) 1083 goto path_put; 1084 1085 err = -ECONNREFUSED; 1086 inode = d_backing_inode(path.dentry); 1087 if (!S_ISSOCK(inode->i_mode)) 1088 goto path_put; 1089 1090 sk = unix_find_socket_byinode(inode); 1091 if (!sk) 1092 goto path_put; 1093 1094 err = -EPROTOTYPE; 1095 if (sk->sk_type == type) 1096 touch_atime(&path); 1097 else 1098 goto sock_put; 1099 1100 path_put(&path); 1101 1102 return sk; 1103 1104 sock_put: 1105 sock_put(sk); 1106 path_put: 1107 path_put(&path); 1108 fail: 1109 return ERR_PTR(err); 1110 } 1111 1112 static struct sock *unix_find_abstract(struct net *net, 1113 struct sockaddr_un *sunaddr, 1114 int addr_len, int type) 1115 { 1116 unsigned int hash = unix_abstract_hash(sunaddr, addr_len, type); 1117 struct dentry *dentry; 1118 struct sock *sk; 1119 1120 sk = unix_find_socket_byname(net, sunaddr, addr_len, hash); 1121 if (!sk) 1122 return ERR_PTR(-ECONNREFUSED); 1123 1124 dentry = unix_sk(sk)->path.dentry; 1125 if (dentry) 1126 touch_atime(&unix_sk(sk)->path); 1127 1128 return sk; 1129 } 1130 1131 static struct sock *unix_find_other(struct net *net, 1132 struct sockaddr_un *sunaddr, 1133 int addr_len, int type) 1134 { 1135 struct sock *sk; 1136 1137 if (sunaddr->sun_path[0]) 1138 sk = unix_find_bsd(sunaddr, addr_len, type); 1139 else 1140 sk = unix_find_abstract(net, sunaddr, addr_len, type); 1141 1142 return sk; 1143 } 1144 1145 static int unix_autobind(struct sock *sk) 1146 { 1147 unsigned int new_hash, old_hash = sk->sk_hash; 1148 struct unix_sock *u = unix_sk(sk); 1149 struct net *net = sock_net(sk); 1150 struct unix_address *addr; 1151 u32 lastnum, ordernum; 1152 int err; 1153 1154 err = mutex_lock_interruptible(&u->bindlock); 1155 if (err) 1156 return err; 1157 1158 if (u->addr) 1159 goto out; 1160 1161 err = -ENOMEM; 1162 addr = kzalloc(sizeof(*addr) + 1163 offsetof(struct sockaddr_un, sun_path) + 16, GFP_KERNEL); 1164 if (!addr) 1165 goto out; 1166 1167 addr->len = offsetof(struct sockaddr_un, sun_path) + 6; 1168 addr->name->sun_family = AF_UNIX; 1169 refcount_set(&addr->refcnt, 1); 1170 1171 ordernum = get_random_u32(); 1172 lastnum = ordernum & 0xFFFFF; 1173 retry: 1174 ordernum = (ordernum + 1) & 0xFFFFF; 1175 sprintf(addr->name->sun_path + 1, "%05x", ordernum); 1176 1177 new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type); 1178 unix_table_double_lock(net, old_hash, new_hash); 1179 1180 if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash)) { 1181 unix_table_double_unlock(net, old_hash, new_hash); 1182 1183 /* __unix_find_socket_byname() may take long time if many names 1184 * are already in use. 1185 */ 1186 cond_resched(); 1187 1188 if (ordernum == lastnum) { 1189 /* Give up if all names seems to be in use. */ 1190 err = -ENOSPC; 1191 unix_release_addr(addr); 1192 goto out; 1193 } 1194 1195 goto retry; 1196 } 1197 1198 __unix_set_addr_hash(net, sk, addr, new_hash); 1199 unix_table_double_unlock(net, old_hash, new_hash); 1200 err = 0; 1201 1202 out: mutex_unlock(&u->bindlock); 1203 return err; 1204 } 1205 1206 static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr, 1207 int addr_len) 1208 { 1209 umode_t mode = S_IFSOCK | 1210 (SOCK_INODE(sk->sk_socket)->i_mode & ~current_umask()); 1211 unsigned int new_hash, old_hash = sk->sk_hash; 1212 struct unix_sock *u = unix_sk(sk); 1213 struct net *net = sock_net(sk); 1214 struct mnt_idmap *idmap; 1215 struct unix_address *addr; 1216 struct dentry *dentry; 1217 struct path parent; 1218 int err; 1219 1220 addr_len = unix_mkname_bsd(sunaddr, addr_len); 1221 addr = unix_create_addr(sunaddr, addr_len); 1222 if (!addr) 1223 return -ENOMEM; 1224 1225 /* 1226 * Get the parent directory, calculate the hash for last 1227 * component. 1228 */ 1229 dentry = kern_path_create(AT_FDCWD, addr->name->sun_path, &parent, 0); 1230 if (IS_ERR(dentry)) { 1231 err = PTR_ERR(dentry); 1232 goto out; 1233 } 1234 1235 /* 1236 * All right, let's create it. 1237 */ 1238 idmap = mnt_idmap(parent.mnt); 1239 err = security_path_mknod(&parent, dentry, mode, 0); 1240 if (!err) 1241 err = vfs_mknod(idmap, d_inode(parent.dentry), dentry, mode, 0); 1242 if (err) 1243 goto out_path; 1244 err = mutex_lock_interruptible(&u->bindlock); 1245 if (err) 1246 goto out_unlink; 1247 if (u->addr) 1248 goto out_unlock; 1249 1250 new_hash = unix_bsd_hash(d_backing_inode(dentry)); 1251 unix_table_double_lock(net, old_hash, new_hash); 1252 u->path.mnt = mntget(parent.mnt); 1253 u->path.dentry = dget(dentry); 1254 __unix_set_addr_hash(net, sk, addr, new_hash); 1255 unix_table_double_unlock(net, old_hash, new_hash); 1256 unix_insert_bsd_socket(sk); 1257 mutex_unlock(&u->bindlock); 1258 done_path_create(&parent, dentry); 1259 return 0; 1260 1261 out_unlock: 1262 mutex_unlock(&u->bindlock); 1263 err = -EINVAL; 1264 out_unlink: 1265 /* failed after successful mknod? unlink what we'd created... */ 1266 vfs_unlink(idmap, d_inode(parent.dentry), dentry, NULL); 1267 out_path: 1268 done_path_create(&parent, dentry); 1269 out: 1270 unix_release_addr(addr); 1271 return err == -EEXIST ? -EADDRINUSE : err; 1272 } 1273 1274 static int unix_bind_abstract(struct sock *sk, struct sockaddr_un *sunaddr, 1275 int addr_len) 1276 { 1277 unsigned int new_hash, old_hash = sk->sk_hash; 1278 struct unix_sock *u = unix_sk(sk); 1279 struct net *net = sock_net(sk); 1280 struct unix_address *addr; 1281 int err; 1282 1283 addr = unix_create_addr(sunaddr, addr_len); 1284 if (!addr) 1285 return -ENOMEM; 1286 1287 err = mutex_lock_interruptible(&u->bindlock); 1288 if (err) 1289 goto out; 1290 1291 if (u->addr) { 1292 err = -EINVAL; 1293 goto out_mutex; 1294 } 1295 1296 new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type); 1297 unix_table_double_lock(net, old_hash, new_hash); 1298 1299 if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash)) 1300 goto out_spin; 1301 1302 __unix_set_addr_hash(net, sk, addr, new_hash); 1303 unix_table_double_unlock(net, old_hash, new_hash); 1304 mutex_unlock(&u->bindlock); 1305 return 0; 1306 1307 out_spin: 1308 unix_table_double_unlock(net, old_hash, new_hash); 1309 err = -EADDRINUSE; 1310 out_mutex: 1311 mutex_unlock(&u->bindlock); 1312 out: 1313 unix_release_addr(addr); 1314 return err; 1315 } 1316 1317 static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) 1318 { 1319 struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr; 1320 struct sock *sk = sock->sk; 1321 int err; 1322 1323 if (addr_len == offsetof(struct sockaddr_un, sun_path) && 1324 sunaddr->sun_family == AF_UNIX) 1325 return unix_autobind(sk); 1326 1327 err = unix_validate_addr(sunaddr, addr_len); 1328 if (err) 1329 return err; 1330 1331 if (sunaddr->sun_path[0]) 1332 err = unix_bind_bsd(sk, sunaddr, addr_len); 1333 else 1334 err = unix_bind_abstract(sk, sunaddr, addr_len); 1335 1336 return err; 1337 } 1338 1339 static void unix_state_double_lock(struct sock *sk1, struct sock *sk2) 1340 { 1341 if (unlikely(sk1 == sk2) || !sk2) { 1342 unix_state_lock(sk1); 1343 return; 1344 } 1345 if (sk1 > sk2) 1346 swap(sk1, sk2); 1347 1348 unix_state_lock(sk1); 1349 unix_state_lock_nested(sk2, U_LOCK_SECOND); 1350 } 1351 1352 static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2) 1353 { 1354 if (unlikely(sk1 == sk2) || !sk2) { 1355 unix_state_unlock(sk1); 1356 return; 1357 } 1358 unix_state_unlock(sk1); 1359 unix_state_unlock(sk2); 1360 } 1361 1362 static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr, 1363 int alen, int flags) 1364 { 1365 struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr; 1366 struct sock *sk = sock->sk; 1367 struct sock *other; 1368 int err; 1369 1370 err = -EINVAL; 1371 if (alen < offsetofend(struct sockaddr, sa_family)) 1372 goto out; 1373 1374 if (addr->sa_family != AF_UNSPEC) { 1375 err = unix_validate_addr(sunaddr, alen); 1376 if (err) 1377 goto out; 1378 1379 err = BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk, addr, &alen); 1380 if (err) 1381 goto out; 1382 1383 if ((test_bit(SOCK_PASSCRED, &sock->flags) || 1384 test_bit(SOCK_PASSPIDFD, &sock->flags)) && 1385 !unix_sk(sk)->addr) { 1386 err = unix_autobind(sk); 1387 if (err) 1388 goto out; 1389 } 1390 1391 restart: 1392 other = unix_find_other(sock_net(sk), sunaddr, alen, sock->type); 1393 if (IS_ERR(other)) { 1394 err = PTR_ERR(other); 1395 goto out; 1396 } 1397 1398 unix_state_double_lock(sk, other); 1399 1400 /* Apparently VFS overslept socket death. Retry. */ 1401 if (sock_flag(other, SOCK_DEAD)) { 1402 unix_state_double_unlock(sk, other); 1403 sock_put(other); 1404 goto restart; 1405 } 1406 1407 err = -EPERM; 1408 if (!unix_may_send(sk, other)) 1409 goto out_unlock; 1410 1411 err = security_unix_may_send(sk->sk_socket, other->sk_socket); 1412 if (err) 1413 goto out_unlock; 1414 1415 sk->sk_state = other->sk_state = TCP_ESTABLISHED; 1416 } else { 1417 /* 1418 * 1003.1g breaking connected state with AF_UNSPEC 1419 */ 1420 other = NULL; 1421 unix_state_double_lock(sk, other); 1422 } 1423 1424 /* 1425 * If it was connected, reconnect. 1426 */ 1427 if (unix_peer(sk)) { 1428 struct sock *old_peer = unix_peer(sk); 1429 1430 unix_peer(sk) = other; 1431 if (!other) 1432 sk->sk_state = TCP_CLOSE; 1433 unix_dgram_peer_wake_disconnect_wakeup(sk, old_peer); 1434 1435 unix_state_double_unlock(sk, other); 1436 1437 if (other != old_peer) 1438 unix_dgram_disconnected(sk, old_peer); 1439 sock_put(old_peer); 1440 } else { 1441 unix_peer(sk) = other; 1442 unix_state_double_unlock(sk, other); 1443 } 1444 1445 return 0; 1446 1447 out_unlock: 1448 unix_state_double_unlock(sk, other); 1449 sock_put(other); 1450 out: 1451 return err; 1452 } 1453 1454 static long unix_wait_for_peer(struct sock *other, long timeo) 1455 __releases(&unix_sk(other)->lock) 1456 { 1457 struct unix_sock *u = unix_sk(other); 1458 int sched; 1459 DEFINE_WAIT(wait); 1460 1461 prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE); 1462 1463 sched = !sock_flag(other, SOCK_DEAD) && 1464 !(other->sk_shutdown & RCV_SHUTDOWN) && 1465 unix_recvq_full_lockless(other); 1466 1467 unix_state_unlock(other); 1468 1469 if (sched) 1470 timeo = schedule_timeout(timeo); 1471 1472 finish_wait(&u->peer_wait, &wait); 1473 return timeo; 1474 } 1475 1476 static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr, 1477 int addr_len, int flags) 1478 { 1479 struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr; 1480 struct sock *sk = sock->sk, *newsk = NULL, *other = NULL; 1481 struct unix_sock *u = unix_sk(sk), *newu, *otheru; 1482 struct net *net = sock_net(sk); 1483 struct sk_buff *skb = NULL; 1484 long timeo; 1485 int err; 1486 int st; 1487 1488 err = unix_validate_addr(sunaddr, addr_len); 1489 if (err) 1490 goto out; 1491 1492 err = BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk, uaddr, &addr_len); 1493 if (err) 1494 goto out; 1495 1496 if ((test_bit(SOCK_PASSCRED, &sock->flags) || 1497 test_bit(SOCK_PASSPIDFD, &sock->flags)) && !u->addr) { 1498 err = unix_autobind(sk); 1499 if (err) 1500 goto out; 1501 } 1502 1503 timeo = sock_sndtimeo(sk, flags & O_NONBLOCK); 1504 1505 /* First of all allocate resources. 1506 If we will make it after state is locked, 1507 we will have to recheck all again in any case. 1508 */ 1509 1510 /* create new sock for complete connection */ 1511 newsk = unix_create1(net, NULL, 0, sock->type); 1512 if (IS_ERR(newsk)) { 1513 err = PTR_ERR(newsk); 1514 newsk = NULL; 1515 goto out; 1516 } 1517 1518 err = -ENOMEM; 1519 1520 /* Allocate skb for sending to listening sock */ 1521 skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL); 1522 if (skb == NULL) 1523 goto out; 1524 1525 restart: 1526 /* Find listening sock. */ 1527 other = unix_find_other(net, sunaddr, addr_len, sk->sk_type); 1528 if (IS_ERR(other)) { 1529 err = PTR_ERR(other); 1530 other = NULL; 1531 goto out; 1532 } 1533 1534 /* Latch state of peer */ 1535 unix_state_lock(other); 1536 1537 /* Apparently VFS overslept socket death. Retry. */ 1538 if (sock_flag(other, SOCK_DEAD)) { 1539 unix_state_unlock(other); 1540 sock_put(other); 1541 goto restart; 1542 } 1543 1544 err = -ECONNREFUSED; 1545 if (other->sk_state != TCP_LISTEN) 1546 goto out_unlock; 1547 if (other->sk_shutdown & RCV_SHUTDOWN) 1548 goto out_unlock; 1549 1550 if (unix_recvq_full(other)) { 1551 err = -EAGAIN; 1552 if (!timeo) 1553 goto out_unlock; 1554 1555 timeo = unix_wait_for_peer(other, timeo); 1556 1557 err = sock_intr_errno(timeo); 1558 if (signal_pending(current)) 1559 goto out; 1560 sock_put(other); 1561 goto restart; 1562 } 1563 1564 /* Latch our state. 1565 1566 It is tricky place. We need to grab our state lock and cannot 1567 drop lock on peer. It is dangerous because deadlock is 1568 possible. Connect to self case and simultaneous 1569 attempt to connect are eliminated by checking socket 1570 state. other is TCP_LISTEN, if sk is TCP_LISTEN we 1571 check this before attempt to grab lock. 1572 1573 Well, and we have to recheck the state after socket locked. 1574 */ 1575 st = sk->sk_state; 1576 1577 switch (st) { 1578 case TCP_CLOSE: 1579 /* This is ok... continue with connect */ 1580 break; 1581 case TCP_ESTABLISHED: 1582 /* Socket is already connected */ 1583 err = -EISCONN; 1584 goto out_unlock; 1585 default: 1586 err = -EINVAL; 1587 goto out_unlock; 1588 } 1589 1590 unix_state_lock_nested(sk, U_LOCK_SECOND); 1591 1592 if (sk->sk_state != st) { 1593 unix_state_unlock(sk); 1594 unix_state_unlock(other); 1595 sock_put(other); 1596 goto restart; 1597 } 1598 1599 err = security_unix_stream_connect(sk, other, newsk); 1600 if (err) { 1601 unix_state_unlock(sk); 1602 goto out_unlock; 1603 } 1604 1605 /* The way is open! Fastly set all the necessary fields... */ 1606 1607 sock_hold(sk); 1608 unix_peer(newsk) = sk; 1609 newsk->sk_state = TCP_ESTABLISHED; 1610 newsk->sk_type = sk->sk_type; 1611 init_peercred(newsk); 1612 newu = unix_sk(newsk); 1613 RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq); 1614 otheru = unix_sk(other); 1615 1616 /* copy address information from listening to new sock 1617 * 1618 * The contents of *(otheru->addr) and otheru->path 1619 * are seen fully set up here, since we have found 1620 * otheru in hash under its lock. Insertion into the 1621 * hash chain we'd found it in had been done in an 1622 * earlier critical area protected by the chain's lock, 1623 * the same one where we'd set *(otheru->addr) contents, 1624 * as well as otheru->path and otheru->addr itself. 1625 * 1626 * Using smp_store_release() here to set newu->addr 1627 * is enough to make those stores, as well as stores 1628 * to newu->path visible to anyone who gets newu->addr 1629 * by smp_load_acquire(). IOW, the same warranties 1630 * as for unix_sock instances bound in unix_bind() or 1631 * in unix_autobind(). 1632 */ 1633 if (otheru->path.dentry) { 1634 path_get(&otheru->path); 1635 newu->path = otheru->path; 1636 } 1637 refcount_inc(&otheru->addr->refcnt); 1638 smp_store_release(&newu->addr, otheru->addr); 1639 1640 /* Set credentials */ 1641 copy_peercred(sk, other); 1642 1643 sock->state = SS_CONNECTED; 1644 sk->sk_state = TCP_ESTABLISHED; 1645 sock_hold(newsk); 1646 1647 smp_mb__after_atomic(); /* sock_hold() does an atomic_inc() */ 1648 unix_peer(sk) = newsk; 1649 1650 unix_state_unlock(sk); 1651 1652 /* take ten and send info to listening sock */ 1653 spin_lock(&other->sk_receive_queue.lock); 1654 __skb_queue_tail(&other->sk_receive_queue, skb); 1655 spin_unlock(&other->sk_receive_queue.lock); 1656 unix_state_unlock(other); 1657 other->sk_data_ready(other); 1658 sock_put(other); 1659 return 0; 1660 1661 out_unlock: 1662 if (other) 1663 unix_state_unlock(other); 1664 1665 out: 1666 kfree_skb(skb); 1667 if (newsk) 1668 unix_release_sock(newsk, 0); 1669 if (other) 1670 sock_put(other); 1671 return err; 1672 } 1673 1674 static int unix_socketpair(struct socket *socka, struct socket *sockb) 1675 { 1676 struct sock *ska = socka->sk, *skb = sockb->sk; 1677 1678 /* Join our sockets back to back */ 1679 sock_hold(ska); 1680 sock_hold(skb); 1681 unix_peer(ska) = skb; 1682 unix_peer(skb) = ska; 1683 init_peercred(ska); 1684 init_peercred(skb); 1685 1686 ska->sk_state = TCP_ESTABLISHED; 1687 skb->sk_state = TCP_ESTABLISHED; 1688 socka->state = SS_CONNECTED; 1689 sockb->state = SS_CONNECTED; 1690 return 0; 1691 } 1692 1693 static void unix_sock_inherit_flags(const struct socket *old, 1694 struct socket *new) 1695 { 1696 if (test_bit(SOCK_PASSCRED, &old->flags)) 1697 set_bit(SOCK_PASSCRED, &new->flags); 1698 if (test_bit(SOCK_PASSPIDFD, &old->flags)) 1699 set_bit(SOCK_PASSPIDFD, &new->flags); 1700 if (test_bit(SOCK_PASSSEC, &old->flags)) 1701 set_bit(SOCK_PASSSEC, &new->flags); 1702 } 1703 1704 static int unix_accept(struct socket *sock, struct socket *newsock, int flags, 1705 bool kern) 1706 { 1707 struct sock *sk = sock->sk; 1708 struct sock *tsk; 1709 struct sk_buff *skb; 1710 int err; 1711 1712 err = -EOPNOTSUPP; 1713 if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET) 1714 goto out; 1715 1716 err = -EINVAL; 1717 if (sk->sk_state != TCP_LISTEN) 1718 goto out; 1719 1720 /* If socket state is TCP_LISTEN it cannot change (for now...), 1721 * so that no locks are necessary. 1722 */ 1723 1724 skb = skb_recv_datagram(sk, (flags & O_NONBLOCK) ? MSG_DONTWAIT : 0, 1725 &err); 1726 if (!skb) { 1727 /* This means receive shutdown. */ 1728 if (err == 0) 1729 err = -EINVAL; 1730 goto out; 1731 } 1732 1733 tsk = skb->sk; 1734 skb_free_datagram(sk, skb); 1735 wake_up_interruptible(&unix_sk(sk)->peer_wait); 1736 1737 /* attach accepted sock to socket */ 1738 unix_state_lock(tsk); 1739 newsock->state = SS_CONNECTED; 1740 unix_sock_inherit_flags(sock, newsock); 1741 sock_graft(tsk, newsock); 1742 unix_state_unlock(tsk); 1743 return 0; 1744 1745 out: 1746 return err; 1747 } 1748 1749 1750 static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int peer) 1751 { 1752 struct sock *sk = sock->sk; 1753 struct unix_address *addr; 1754 DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr); 1755 int err = 0; 1756 1757 if (peer) { 1758 sk = unix_peer_get(sk); 1759 1760 err = -ENOTCONN; 1761 if (!sk) 1762 goto out; 1763 err = 0; 1764 } else { 1765 sock_hold(sk); 1766 } 1767 1768 addr = smp_load_acquire(&unix_sk(sk)->addr); 1769 if (!addr) { 1770 sunaddr->sun_family = AF_UNIX; 1771 sunaddr->sun_path[0] = 0; 1772 err = offsetof(struct sockaddr_un, sun_path); 1773 } else { 1774 err = addr->len; 1775 memcpy(sunaddr, addr->name, addr->len); 1776 1777 if (peer) 1778 BPF_CGROUP_RUN_SA_PROG(sk, uaddr, &err, 1779 CGROUP_UNIX_GETPEERNAME); 1780 else 1781 BPF_CGROUP_RUN_SA_PROG(sk, uaddr, &err, 1782 CGROUP_UNIX_GETSOCKNAME); 1783 } 1784 sock_put(sk); 1785 out: 1786 return err; 1787 } 1788 1789 /* The "user->unix_inflight" variable is protected by the garbage 1790 * collection lock, and we just read it locklessly here. If you go 1791 * over the limit, there might be a tiny race in actually noticing 1792 * it across threads. Tough. 1793 */ 1794 static inline bool too_many_unix_fds(struct task_struct *p) 1795 { 1796 struct user_struct *user = current_user(); 1797 1798 if (unlikely(READ_ONCE(user->unix_inflight) > task_rlimit(p, RLIMIT_NOFILE))) 1799 return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN); 1800 return false; 1801 } 1802 1803 static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb) 1804 { 1805 int i; 1806 1807 if (too_many_unix_fds(current)) 1808 return -ETOOMANYREFS; 1809 1810 /* Need to duplicate file references for the sake of garbage 1811 * collection. Otherwise a socket in the fps might become a 1812 * candidate for GC while the skb is not yet queued. 1813 */ 1814 UNIXCB(skb).fp = scm_fp_dup(scm->fp); 1815 if (!UNIXCB(skb).fp) 1816 return -ENOMEM; 1817 1818 for (i = scm->fp->count - 1; i >= 0; i--) 1819 unix_inflight(scm->fp->user, scm->fp->fp[i]); 1820 1821 return 0; 1822 } 1823 1824 static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb) 1825 { 1826 int i; 1827 1828 scm->fp = UNIXCB(skb).fp; 1829 UNIXCB(skb).fp = NULL; 1830 1831 for (i = scm->fp->count - 1; i >= 0; i--) 1832 unix_notinflight(scm->fp->user, scm->fp->fp[i]); 1833 } 1834 1835 static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb) 1836 { 1837 scm->fp = scm_fp_dup(UNIXCB(skb).fp); 1838 1839 /* 1840 * Garbage collection of unix sockets starts by selecting a set of 1841 * candidate sockets which have reference only from being in flight 1842 * (total_refs == inflight_refs). This condition is checked once during 1843 * the candidate collection phase, and candidates are marked as such, so 1844 * that non-candidates can later be ignored. While inflight_refs is 1845 * protected by unix_gc_lock, total_refs (file count) is not, hence this 1846 * is an instantaneous decision. 1847 * 1848 * Once a candidate, however, the socket must not be reinstalled into a 1849 * file descriptor while the garbage collection is in progress. 1850 * 1851 * If the above conditions are met, then the directed graph of 1852 * candidates (*) does not change while unix_gc_lock is held. 1853 * 1854 * Any operations that changes the file count through file descriptors 1855 * (dup, close, sendmsg) does not change the graph since candidates are 1856 * not installed in fds. 1857 * 1858 * Dequeing a candidate via recvmsg would install it into an fd, but 1859 * that takes unix_gc_lock to decrement the inflight count, so it's 1860 * serialized with garbage collection. 1861 * 1862 * MSG_PEEK is special in that it does not change the inflight count, 1863 * yet does install the socket into an fd. The following lock/unlock 1864 * pair is to ensure serialization with garbage collection. It must be 1865 * done between incrementing the file count and installing the file into 1866 * an fd. 1867 * 1868 * If garbage collection starts after the barrier provided by the 1869 * lock/unlock, then it will see the elevated refcount and not mark this 1870 * as a candidate. If a garbage collection is already in progress 1871 * before the file count was incremented, then the lock/unlock pair will 1872 * ensure that garbage collection is finished before progressing to 1873 * installing the fd. 1874 * 1875 * (*) A -> B where B is on the queue of A or B is on the queue of C 1876 * which is on the queue of listening socket A. 1877 */ 1878 spin_lock(&unix_gc_lock); 1879 spin_unlock(&unix_gc_lock); 1880 } 1881 1882 static void unix_destruct_scm(struct sk_buff *skb) 1883 { 1884 struct scm_cookie scm; 1885 1886 memset(&scm, 0, sizeof(scm)); 1887 scm.pid = UNIXCB(skb).pid; 1888 if (UNIXCB(skb).fp) 1889 unix_detach_fds(&scm, skb); 1890 1891 /* Alas, it calls VFS */ 1892 /* So fscking what? fput() had been SMP-safe since the last Summer */ 1893 scm_destroy(&scm); 1894 sock_wfree(skb); 1895 } 1896 1897 static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds) 1898 { 1899 int err = 0; 1900 1901 UNIXCB(skb).pid = get_pid(scm->pid); 1902 UNIXCB(skb).uid = scm->creds.uid; 1903 UNIXCB(skb).gid = scm->creds.gid; 1904 UNIXCB(skb).fp = NULL; 1905 unix_get_secdata(scm, skb); 1906 if (scm->fp && send_fds) 1907 err = unix_attach_fds(scm, skb); 1908 1909 skb->destructor = unix_destruct_scm; 1910 return err; 1911 } 1912 1913 static bool unix_passcred_enabled(const struct socket *sock, 1914 const struct sock *other) 1915 { 1916 return test_bit(SOCK_PASSCRED, &sock->flags) || 1917 test_bit(SOCK_PASSPIDFD, &sock->flags) || 1918 !other->sk_socket || 1919 test_bit(SOCK_PASSCRED, &other->sk_socket->flags) || 1920 test_bit(SOCK_PASSPIDFD, &other->sk_socket->flags); 1921 } 1922 1923 /* 1924 * Some apps rely on write() giving SCM_CREDENTIALS 1925 * We include credentials if source or destination socket 1926 * asserted SOCK_PASSCRED. 1927 */ 1928 static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock, 1929 const struct sock *other) 1930 { 1931 if (UNIXCB(skb).pid) 1932 return; 1933 if (unix_passcred_enabled(sock, other)) { 1934 UNIXCB(skb).pid = get_pid(task_tgid(current)); 1935 current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid); 1936 } 1937 } 1938 1939 static bool unix_skb_scm_eq(struct sk_buff *skb, 1940 struct scm_cookie *scm) 1941 { 1942 return UNIXCB(skb).pid == scm->pid && 1943 uid_eq(UNIXCB(skb).uid, scm->creds.uid) && 1944 gid_eq(UNIXCB(skb).gid, scm->creds.gid) && 1945 unix_secdata_eq(scm, skb); 1946 } 1947 1948 static void scm_stat_add(struct sock *sk, struct sk_buff *skb) 1949 { 1950 struct scm_fp_list *fp = UNIXCB(skb).fp; 1951 struct unix_sock *u = unix_sk(sk); 1952 1953 if (unlikely(fp && fp->count)) 1954 atomic_add(fp->count, &u->scm_stat.nr_fds); 1955 } 1956 1957 static void scm_stat_del(struct sock *sk, struct sk_buff *skb) 1958 { 1959 struct scm_fp_list *fp = UNIXCB(skb).fp; 1960 struct unix_sock *u = unix_sk(sk); 1961 1962 if (unlikely(fp && fp->count)) 1963 atomic_sub(fp->count, &u->scm_stat.nr_fds); 1964 } 1965 1966 /* 1967 * Send AF_UNIX data. 1968 */ 1969 1970 static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg, 1971 size_t len) 1972 { 1973 DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name); 1974 struct sock *sk = sock->sk, *other = NULL; 1975 struct unix_sock *u = unix_sk(sk); 1976 struct scm_cookie scm; 1977 struct sk_buff *skb; 1978 int data_len = 0; 1979 int sk_locked; 1980 long timeo; 1981 int err; 1982 1983 err = scm_send(sock, msg, &scm, false); 1984 if (err < 0) 1985 return err; 1986 1987 wait_for_unix_gc(scm.fp); 1988 1989 err = -EOPNOTSUPP; 1990 if (msg->msg_flags&MSG_OOB) 1991 goto out; 1992 1993 if (msg->msg_namelen) { 1994 err = unix_validate_addr(sunaddr, msg->msg_namelen); 1995 if (err) 1996 goto out; 1997 1998 err = BPF_CGROUP_RUN_PROG_UNIX_SENDMSG_LOCK(sk, 1999 msg->msg_name, 2000 &msg->msg_namelen, 2001 NULL); 2002 if (err) 2003 goto out; 2004 } else { 2005 sunaddr = NULL; 2006 err = -ENOTCONN; 2007 other = unix_peer_get(sk); 2008 if (!other) 2009 goto out; 2010 } 2011 2012 if ((test_bit(SOCK_PASSCRED, &sock->flags) || 2013 test_bit(SOCK_PASSPIDFD, &sock->flags)) && !u->addr) { 2014 err = unix_autobind(sk); 2015 if (err) 2016 goto out; 2017 } 2018 2019 err = -EMSGSIZE; 2020 if (len > sk->sk_sndbuf - 32) 2021 goto out; 2022 2023 if (len > SKB_MAX_ALLOC) { 2024 data_len = min_t(size_t, 2025 len - SKB_MAX_ALLOC, 2026 MAX_SKB_FRAGS * PAGE_SIZE); 2027 data_len = PAGE_ALIGN(data_len); 2028 2029 BUILD_BUG_ON(SKB_MAX_ALLOC < PAGE_SIZE); 2030 } 2031 2032 skb = sock_alloc_send_pskb(sk, len - data_len, data_len, 2033 msg->msg_flags & MSG_DONTWAIT, &err, 2034 PAGE_ALLOC_COSTLY_ORDER); 2035 if (skb == NULL) 2036 goto out; 2037 2038 err = unix_scm_to_skb(&scm, skb, true); 2039 if (err < 0) 2040 goto out_free; 2041 2042 skb_put(skb, len - data_len); 2043 skb->data_len = data_len; 2044 skb->len = len; 2045 err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, len); 2046 if (err) 2047 goto out_free; 2048 2049 timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); 2050 2051 restart: 2052 if (!other) { 2053 err = -ECONNRESET; 2054 if (sunaddr == NULL) 2055 goto out_free; 2056 2057 other = unix_find_other(sock_net(sk), sunaddr, msg->msg_namelen, 2058 sk->sk_type); 2059 if (IS_ERR(other)) { 2060 err = PTR_ERR(other); 2061 other = NULL; 2062 goto out_free; 2063 } 2064 } 2065 2066 if (sk_filter(other, skb) < 0) { 2067 /* Toss the packet but do not return any error to the sender */ 2068 err = len; 2069 goto out_free; 2070 } 2071 2072 sk_locked = 0; 2073 unix_state_lock(other); 2074 restart_locked: 2075 err = -EPERM; 2076 if (!unix_may_send(sk, other)) 2077 goto out_unlock; 2078 2079 if (unlikely(sock_flag(other, SOCK_DEAD))) { 2080 /* 2081 * Check with 1003.1g - what should 2082 * datagram error 2083 */ 2084 unix_state_unlock(other); 2085 sock_put(other); 2086 2087 if (!sk_locked) 2088 unix_state_lock(sk); 2089 2090 err = 0; 2091 if (sk->sk_type == SOCK_SEQPACKET) { 2092 /* We are here only when racing with unix_release_sock() 2093 * is clearing @other. Never change state to TCP_CLOSE 2094 * unlike SOCK_DGRAM wants. 2095 */ 2096 unix_state_unlock(sk); 2097 err = -EPIPE; 2098 } else if (unix_peer(sk) == other) { 2099 unix_peer(sk) = NULL; 2100 unix_dgram_peer_wake_disconnect_wakeup(sk, other); 2101 2102 sk->sk_state = TCP_CLOSE; 2103 unix_state_unlock(sk); 2104 2105 unix_dgram_disconnected(sk, other); 2106 sock_put(other); 2107 err = -ECONNREFUSED; 2108 } else { 2109 unix_state_unlock(sk); 2110 } 2111 2112 other = NULL; 2113 if (err) 2114 goto out_free; 2115 goto restart; 2116 } 2117 2118 err = -EPIPE; 2119 if (other->sk_shutdown & RCV_SHUTDOWN) 2120 goto out_unlock; 2121 2122 if (sk->sk_type != SOCK_SEQPACKET) { 2123 err = security_unix_may_send(sk->sk_socket, other->sk_socket); 2124 if (err) 2125 goto out_unlock; 2126 } 2127 2128 /* other == sk && unix_peer(other) != sk if 2129 * - unix_peer(sk) == NULL, destination address bound to sk 2130 * - unix_peer(sk) == sk by time of get but disconnected before lock 2131 */ 2132 if (other != sk && 2133 unlikely(unix_peer(other) != sk && 2134 unix_recvq_full_lockless(other))) { 2135 if (timeo) { 2136 timeo = unix_wait_for_peer(other, timeo); 2137 2138 err = sock_intr_errno(timeo); 2139 if (signal_pending(current)) 2140 goto out_free; 2141 2142 goto restart; 2143 } 2144 2145 if (!sk_locked) { 2146 unix_state_unlock(other); 2147 unix_state_double_lock(sk, other); 2148 } 2149 2150 if (unix_peer(sk) != other || 2151 unix_dgram_peer_wake_me(sk, other)) { 2152 err = -EAGAIN; 2153 sk_locked = 1; 2154 goto out_unlock; 2155 } 2156 2157 if (!sk_locked) { 2158 sk_locked = 1; 2159 goto restart_locked; 2160 } 2161 } 2162 2163 if (unlikely(sk_locked)) 2164 unix_state_unlock(sk); 2165 2166 if (sock_flag(other, SOCK_RCVTSTAMP)) 2167 __net_timestamp(skb); 2168 maybe_add_creds(skb, sock, other); 2169 scm_stat_add(other, skb); 2170 skb_queue_tail(&other->sk_receive_queue, skb); 2171 unix_state_unlock(other); 2172 other->sk_data_ready(other); 2173 sock_put(other); 2174 scm_destroy(&scm); 2175 return len; 2176 2177 out_unlock: 2178 if (sk_locked) 2179 unix_state_unlock(sk); 2180 unix_state_unlock(other); 2181 out_free: 2182 kfree_skb(skb); 2183 out: 2184 if (other) 2185 sock_put(other); 2186 scm_destroy(&scm); 2187 return err; 2188 } 2189 2190 /* We use paged skbs for stream sockets, and limit occupancy to 32768 2191 * bytes, and a minimum of a full page. 2192 */ 2193 #define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768)) 2194 2195 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 2196 static int queue_oob(struct socket *sock, struct msghdr *msg, struct sock *other, 2197 struct scm_cookie *scm, bool fds_sent) 2198 { 2199 struct unix_sock *ousk = unix_sk(other); 2200 struct sk_buff *skb; 2201 int err = 0; 2202 2203 skb = sock_alloc_send_skb(sock->sk, 1, msg->msg_flags & MSG_DONTWAIT, &err); 2204 2205 if (!skb) 2206 return err; 2207 2208 err = unix_scm_to_skb(scm, skb, !fds_sent); 2209 if (err < 0) { 2210 kfree_skb(skb); 2211 return err; 2212 } 2213 skb_put(skb, 1); 2214 err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, 1); 2215 2216 if (err) { 2217 kfree_skb(skb); 2218 return err; 2219 } 2220 2221 unix_state_lock(other); 2222 2223 if (sock_flag(other, SOCK_DEAD) || 2224 (other->sk_shutdown & RCV_SHUTDOWN)) { 2225 unix_state_unlock(other); 2226 kfree_skb(skb); 2227 return -EPIPE; 2228 } 2229 2230 maybe_add_creds(skb, sock, other); 2231 skb_get(skb); 2232 2233 if (ousk->oob_skb) 2234 consume_skb(ousk->oob_skb); 2235 2236 WRITE_ONCE(ousk->oob_skb, skb); 2237 2238 scm_stat_add(other, skb); 2239 skb_queue_tail(&other->sk_receive_queue, skb); 2240 sk_send_sigurg(other); 2241 unix_state_unlock(other); 2242 other->sk_data_ready(other); 2243 2244 return err; 2245 } 2246 #endif 2247 2248 static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, 2249 size_t len) 2250 { 2251 struct sock *sk = sock->sk; 2252 struct sock *other = NULL; 2253 int err, size; 2254 struct sk_buff *skb; 2255 int sent = 0; 2256 struct scm_cookie scm; 2257 bool fds_sent = false; 2258 int data_len; 2259 2260 err = scm_send(sock, msg, &scm, false); 2261 if (err < 0) 2262 return err; 2263 2264 wait_for_unix_gc(scm.fp); 2265 2266 err = -EOPNOTSUPP; 2267 if (msg->msg_flags & MSG_OOB) { 2268 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 2269 if (len) 2270 len--; 2271 else 2272 #endif 2273 goto out_err; 2274 } 2275 2276 if (msg->msg_namelen) { 2277 err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP; 2278 goto out_err; 2279 } else { 2280 err = -ENOTCONN; 2281 other = unix_peer(sk); 2282 if (!other) 2283 goto out_err; 2284 } 2285 2286 if (sk->sk_shutdown & SEND_SHUTDOWN) 2287 goto pipe_err; 2288 2289 while (sent < len) { 2290 size = len - sent; 2291 2292 if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) { 2293 skb = sock_alloc_send_pskb(sk, 0, 0, 2294 msg->msg_flags & MSG_DONTWAIT, 2295 &err, 0); 2296 } else { 2297 /* Keep two messages in the pipe so it schedules better */ 2298 size = min_t(int, size, (sk->sk_sndbuf >> 1) - 64); 2299 2300 /* allow fallback to order-0 allocations */ 2301 size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ); 2302 2303 data_len = max_t(int, 0, size - SKB_MAX_HEAD(0)); 2304 2305 data_len = min_t(size_t, size, PAGE_ALIGN(data_len)); 2306 2307 skb = sock_alloc_send_pskb(sk, size - data_len, data_len, 2308 msg->msg_flags & MSG_DONTWAIT, &err, 2309 get_order(UNIX_SKB_FRAGS_SZ)); 2310 } 2311 if (!skb) 2312 goto out_err; 2313 2314 /* Only send the fds in the first buffer */ 2315 err = unix_scm_to_skb(&scm, skb, !fds_sent); 2316 if (err < 0) { 2317 kfree_skb(skb); 2318 goto out_err; 2319 } 2320 fds_sent = true; 2321 2322 if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) { 2323 err = skb_splice_from_iter(skb, &msg->msg_iter, size, 2324 sk->sk_allocation); 2325 if (err < 0) { 2326 kfree_skb(skb); 2327 goto out_err; 2328 } 2329 size = err; 2330 refcount_add(size, &sk->sk_wmem_alloc); 2331 } else { 2332 skb_put(skb, size - data_len); 2333 skb->data_len = data_len; 2334 skb->len = size; 2335 err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size); 2336 if (err) { 2337 kfree_skb(skb); 2338 goto out_err; 2339 } 2340 } 2341 2342 unix_state_lock(other); 2343 2344 if (sock_flag(other, SOCK_DEAD) || 2345 (other->sk_shutdown & RCV_SHUTDOWN)) 2346 goto pipe_err_free; 2347 2348 maybe_add_creds(skb, sock, other); 2349 scm_stat_add(other, skb); 2350 skb_queue_tail(&other->sk_receive_queue, skb); 2351 unix_state_unlock(other); 2352 other->sk_data_ready(other); 2353 sent += size; 2354 } 2355 2356 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 2357 if (msg->msg_flags & MSG_OOB) { 2358 err = queue_oob(sock, msg, other, &scm, fds_sent); 2359 if (err) 2360 goto out_err; 2361 sent++; 2362 } 2363 #endif 2364 2365 scm_destroy(&scm); 2366 2367 return sent; 2368 2369 pipe_err_free: 2370 unix_state_unlock(other); 2371 kfree_skb(skb); 2372 pipe_err: 2373 if (sent == 0 && !(msg->msg_flags&MSG_NOSIGNAL)) 2374 send_sig(SIGPIPE, current, 0); 2375 err = -EPIPE; 2376 out_err: 2377 scm_destroy(&scm); 2378 return sent ? : err; 2379 } 2380 2381 static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg, 2382 size_t len) 2383 { 2384 int err; 2385 struct sock *sk = sock->sk; 2386 2387 err = sock_error(sk); 2388 if (err) 2389 return err; 2390 2391 if (sk->sk_state != TCP_ESTABLISHED) 2392 return -ENOTCONN; 2393 2394 if (msg->msg_namelen) 2395 msg->msg_namelen = 0; 2396 2397 return unix_dgram_sendmsg(sock, msg, len); 2398 } 2399 2400 static int unix_seqpacket_recvmsg(struct socket *sock, struct msghdr *msg, 2401 size_t size, int flags) 2402 { 2403 struct sock *sk = sock->sk; 2404 2405 if (sk->sk_state != TCP_ESTABLISHED) 2406 return -ENOTCONN; 2407 2408 return unix_dgram_recvmsg(sock, msg, size, flags); 2409 } 2410 2411 static void unix_copy_addr(struct msghdr *msg, struct sock *sk) 2412 { 2413 struct unix_address *addr = smp_load_acquire(&unix_sk(sk)->addr); 2414 2415 if (addr) { 2416 msg->msg_namelen = addr->len; 2417 memcpy(msg->msg_name, addr->name, addr->len); 2418 } 2419 } 2420 2421 int __unix_dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t size, 2422 int flags) 2423 { 2424 struct scm_cookie scm; 2425 struct socket *sock = sk->sk_socket; 2426 struct unix_sock *u = unix_sk(sk); 2427 struct sk_buff *skb, *last; 2428 long timeo; 2429 int skip; 2430 int err; 2431 2432 err = -EOPNOTSUPP; 2433 if (flags&MSG_OOB) 2434 goto out; 2435 2436 timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); 2437 2438 do { 2439 mutex_lock(&u->iolock); 2440 2441 skip = sk_peek_offset(sk, flags); 2442 skb = __skb_try_recv_datagram(sk, &sk->sk_receive_queue, flags, 2443 &skip, &err, &last); 2444 if (skb) { 2445 if (!(flags & MSG_PEEK)) 2446 scm_stat_del(sk, skb); 2447 break; 2448 } 2449 2450 mutex_unlock(&u->iolock); 2451 2452 if (err != -EAGAIN) 2453 break; 2454 } while (timeo && 2455 !__skb_wait_for_more_packets(sk, &sk->sk_receive_queue, 2456 &err, &timeo, last)); 2457 2458 if (!skb) { /* implies iolock unlocked */ 2459 unix_state_lock(sk); 2460 /* Signal EOF on disconnected non-blocking SEQPACKET socket. */ 2461 if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN && 2462 (sk->sk_shutdown & RCV_SHUTDOWN)) 2463 err = 0; 2464 unix_state_unlock(sk); 2465 goto out; 2466 } 2467 2468 if (wq_has_sleeper(&u->peer_wait)) 2469 wake_up_interruptible_sync_poll(&u->peer_wait, 2470 EPOLLOUT | EPOLLWRNORM | 2471 EPOLLWRBAND); 2472 2473 if (msg->msg_name) { 2474 unix_copy_addr(msg, skb->sk); 2475 2476 BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk, 2477 msg->msg_name, 2478 &msg->msg_namelen); 2479 } 2480 2481 if (size > skb->len - skip) 2482 size = skb->len - skip; 2483 else if (size < skb->len - skip) 2484 msg->msg_flags |= MSG_TRUNC; 2485 2486 err = skb_copy_datagram_msg(skb, skip, msg, size); 2487 if (err) 2488 goto out_free; 2489 2490 if (sock_flag(sk, SOCK_RCVTSTAMP)) 2491 __sock_recv_timestamp(msg, sk, skb); 2492 2493 memset(&scm, 0, sizeof(scm)); 2494 2495 scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid); 2496 unix_set_secdata(&scm, skb); 2497 2498 if (!(flags & MSG_PEEK)) { 2499 if (UNIXCB(skb).fp) 2500 unix_detach_fds(&scm, skb); 2501 2502 sk_peek_offset_bwd(sk, skb->len); 2503 } else { 2504 /* It is questionable: on PEEK we could: 2505 - do not return fds - good, but too simple 8) 2506 - return fds, and do not return them on read (old strategy, 2507 apparently wrong) 2508 - clone fds (I chose it for now, it is the most universal 2509 solution) 2510 2511 POSIX 1003.1g does not actually define this clearly 2512 at all. POSIX 1003.1g doesn't define a lot of things 2513 clearly however! 2514 2515 */ 2516 2517 sk_peek_offset_fwd(sk, size); 2518 2519 if (UNIXCB(skb).fp) 2520 unix_peek_fds(&scm, skb); 2521 } 2522 err = (flags & MSG_TRUNC) ? skb->len - skip : size; 2523 2524 scm_recv_unix(sock, msg, &scm, flags); 2525 2526 out_free: 2527 skb_free_datagram(sk, skb); 2528 mutex_unlock(&u->iolock); 2529 out: 2530 return err; 2531 } 2532 2533 static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, 2534 int flags) 2535 { 2536 struct sock *sk = sock->sk; 2537 2538 #ifdef CONFIG_BPF_SYSCALL 2539 const struct proto *prot = READ_ONCE(sk->sk_prot); 2540 2541 if (prot != &unix_dgram_proto) 2542 return prot->recvmsg(sk, msg, size, flags, NULL); 2543 #endif 2544 return __unix_dgram_recvmsg(sk, msg, size, flags); 2545 } 2546 2547 static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor) 2548 { 2549 struct unix_sock *u = unix_sk(sk); 2550 struct sk_buff *skb; 2551 int err; 2552 2553 mutex_lock(&u->iolock); 2554 skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err); 2555 mutex_unlock(&u->iolock); 2556 if (!skb) 2557 return err; 2558 2559 return recv_actor(sk, skb); 2560 } 2561 2562 /* 2563 * Sleep until more data has arrived. But check for races.. 2564 */ 2565 static long unix_stream_data_wait(struct sock *sk, long timeo, 2566 struct sk_buff *last, unsigned int last_len, 2567 bool freezable) 2568 { 2569 unsigned int state = TASK_INTERRUPTIBLE | freezable * TASK_FREEZABLE; 2570 struct sk_buff *tail; 2571 DEFINE_WAIT(wait); 2572 2573 unix_state_lock(sk); 2574 2575 for (;;) { 2576 prepare_to_wait(sk_sleep(sk), &wait, state); 2577 2578 tail = skb_peek_tail(&sk->sk_receive_queue); 2579 if (tail != last || 2580 (tail && tail->len != last_len) || 2581 sk->sk_err || 2582 (sk->sk_shutdown & RCV_SHUTDOWN) || 2583 signal_pending(current) || 2584 !timeo) 2585 break; 2586 2587 sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); 2588 unix_state_unlock(sk); 2589 timeo = schedule_timeout(timeo); 2590 unix_state_lock(sk); 2591 2592 if (sock_flag(sk, SOCK_DEAD)) 2593 break; 2594 2595 sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); 2596 } 2597 2598 finish_wait(sk_sleep(sk), &wait); 2599 unix_state_unlock(sk); 2600 return timeo; 2601 } 2602 2603 static unsigned int unix_skb_len(const struct sk_buff *skb) 2604 { 2605 return skb->len - UNIXCB(skb).consumed; 2606 } 2607 2608 struct unix_stream_read_state { 2609 int (*recv_actor)(struct sk_buff *, int, int, 2610 struct unix_stream_read_state *); 2611 struct socket *socket; 2612 struct msghdr *msg; 2613 struct pipe_inode_info *pipe; 2614 size_t size; 2615 int flags; 2616 unsigned int splice_flags; 2617 }; 2618 2619 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 2620 static int unix_stream_recv_urg(struct unix_stream_read_state *state) 2621 { 2622 struct socket *sock = state->socket; 2623 struct sock *sk = sock->sk; 2624 struct unix_sock *u = unix_sk(sk); 2625 int chunk = 1; 2626 struct sk_buff *oob_skb; 2627 2628 mutex_lock(&u->iolock); 2629 unix_state_lock(sk); 2630 2631 if (sock_flag(sk, SOCK_URGINLINE) || !u->oob_skb) { 2632 unix_state_unlock(sk); 2633 mutex_unlock(&u->iolock); 2634 return -EINVAL; 2635 } 2636 2637 oob_skb = u->oob_skb; 2638 2639 if (!(state->flags & MSG_PEEK)) 2640 WRITE_ONCE(u->oob_skb, NULL); 2641 else 2642 skb_get(oob_skb); 2643 unix_state_unlock(sk); 2644 2645 chunk = state->recv_actor(oob_skb, 0, chunk, state); 2646 2647 if (!(state->flags & MSG_PEEK)) 2648 UNIXCB(oob_skb).consumed += 1; 2649 2650 consume_skb(oob_skb); 2651 2652 mutex_unlock(&u->iolock); 2653 2654 if (chunk < 0) 2655 return -EFAULT; 2656 2657 state->msg->msg_flags |= MSG_OOB; 2658 return 1; 2659 } 2660 2661 static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk, 2662 int flags, int copied) 2663 { 2664 struct unix_sock *u = unix_sk(sk); 2665 2666 if (!unix_skb_len(skb) && !(flags & MSG_PEEK)) { 2667 skb_unlink(skb, &sk->sk_receive_queue); 2668 consume_skb(skb); 2669 skb = NULL; 2670 } else { 2671 if (skb == u->oob_skb) { 2672 if (copied) { 2673 skb = NULL; 2674 } else if (sock_flag(sk, SOCK_URGINLINE)) { 2675 if (!(flags & MSG_PEEK)) { 2676 WRITE_ONCE(u->oob_skb, NULL); 2677 consume_skb(skb); 2678 } 2679 } else if (!(flags & MSG_PEEK)) { 2680 skb_unlink(skb, &sk->sk_receive_queue); 2681 consume_skb(skb); 2682 skb = skb_peek(&sk->sk_receive_queue); 2683 } 2684 } 2685 } 2686 return skb; 2687 } 2688 #endif 2689 2690 static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor) 2691 { 2692 if (unlikely(sk->sk_state != TCP_ESTABLISHED)) 2693 return -ENOTCONN; 2694 2695 return unix_read_skb(sk, recv_actor); 2696 } 2697 2698 static int unix_stream_read_generic(struct unix_stream_read_state *state, 2699 bool freezable) 2700 { 2701 struct scm_cookie scm; 2702 struct socket *sock = state->socket; 2703 struct sock *sk = sock->sk; 2704 struct unix_sock *u = unix_sk(sk); 2705 int copied = 0; 2706 int flags = state->flags; 2707 int noblock = flags & MSG_DONTWAIT; 2708 bool check_creds = false; 2709 int target; 2710 int err = 0; 2711 long timeo; 2712 int skip; 2713 size_t size = state->size; 2714 unsigned int last_len; 2715 2716 if (unlikely(sk->sk_state != TCP_ESTABLISHED)) { 2717 err = -EINVAL; 2718 goto out; 2719 } 2720 2721 if (unlikely(flags & MSG_OOB)) { 2722 err = -EOPNOTSUPP; 2723 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 2724 err = unix_stream_recv_urg(state); 2725 #endif 2726 goto out; 2727 } 2728 2729 target = sock_rcvlowat(sk, flags & MSG_WAITALL, size); 2730 timeo = sock_rcvtimeo(sk, noblock); 2731 2732 memset(&scm, 0, sizeof(scm)); 2733 2734 /* Lock the socket to prevent queue disordering 2735 * while sleeps in memcpy_tomsg 2736 */ 2737 mutex_lock(&u->iolock); 2738 2739 skip = max(sk_peek_offset(sk, flags), 0); 2740 2741 do { 2742 int chunk; 2743 bool drop_skb; 2744 struct sk_buff *skb, *last; 2745 2746 redo: 2747 unix_state_lock(sk); 2748 if (sock_flag(sk, SOCK_DEAD)) { 2749 err = -ECONNRESET; 2750 goto unlock; 2751 } 2752 last = skb = skb_peek(&sk->sk_receive_queue); 2753 last_len = last ? last->len : 0; 2754 2755 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 2756 if (skb) { 2757 skb = manage_oob(skb, sk, flags, copied); 2758 if (!skb) { 2759 unix_state_unlock(sk); 2760 if (copied) 2761 break; 2762 goto redo; 2763 } 2764 } 2765 #endif 2766 again: 2767 if (skb == NULL) { 2768 if (copied >= target) 2769 goto unlock; 2770 2771 /* 2772 * POSIX 1003.1g mandates this order. 2773 */ 2774 2775 err = sock_error(sk); 2776 if (err) 2777 goto unlock; 2778 if (sk->sk_shutdown & RCV_SHUTDOWN) 2779 goto unlock; 2780 2781 unix_state_unlock(sk); 2782 if (!timeo) { 2783 err = -EAGAIN; 2784 break; 2785 } 2786 2787 mutex_unlock(&u->iolock); 2788 2789 timeo = unix_stream_data_wait(sk, timeo, last, 2790 last_len, freezable); 2791 2792 if (signal_pending(current)) { 2793 err = sock_intr_errno(timeo); 2794 scm_destroy(&scm); 2795 goto out; 2796 } 2797 2798 mutex_lock(&u->iolock); 2799 goto redo; 2800 unlock: 2801 unix_state_unlock(sk); 2802 break; 2803 } 2804 2805 while (skip >= unix_skb_len(skb)) { 2806 skip -= unix_skb_len(skb); 2807 last = skb; 2808 last_len = skb->len; 2809 skb = skb_peek_next(skb, &sk->sk_receive_queue); 2810 if (!skb) 2811 goto again; 2812 } 2813 2814 unix_state_unlock(sk); 2815 2816 if (check_creds) { 2817 /* Never glue messages from different writers */ 2818 if (!unix_skb_scm_eq(skb, &scm)) 2819 break; 2820 } else if (test_bit(SOCK_PASSCRED, &sock->flags) || 2821 test_bit(SOCK_PASSPIDFD, &sock->flags)) { 2822 /* Copy credentials */ 2823 scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid); 2824 unix_set_secdata(&scm, skb); 2825 check_creds = true; 2826 } 2827 2828 /* Copy address just once */ 2829 if (state->msg && state->msg->msg_name) { 2830 DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, 2831 state->msg->msg_name); 2832 unix_copy_addr(state->msg, skb->sk); 2833 2834 BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk, 2835 state->msg->msg_name, 2836 &state->msg->msg_namelen); 2837 2838 sunaddr = NULL; 2839 } 2840 2841 chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size); 2842 skb_get(skb); 2843 chunk = state->recv_actor(skb, skip, chunk, state); 2844 drop_skb = !unix_skb_len(skb); 2845 /* skb is only safe to use if !drop_skb */ 2846 consume_skb(skb); 2847 if (chunk < 0) { 2848 if (copied == 0) 2849 copied = -EFAULT; 2850 break; 2851 } 2852 copied += chunk; 2853 size -= chunk; 2854 2855 if (drop_skb) { 2856 /* the skb was touched by a concurrent reader; 2857 * we should not expect anything from this skb 2858 * anymore and assume it invalid - we can be 2859 * sure it was dropped from the socket queue 2860 * 2861 * let's report a short read 2862 */ 2863 err = 0; 2864 break; 2865 } 2866 2867 /* Mark read part of skb as used */ 2868 if (!(flags & MSG_PEEK)) { 2869 UNIXCB(skb).consumed += chunk; 2870 2871 sk_peek_offset_bwd(sk, chunk); 2872 2873 if (UNIXCB(skb).fp) { 2874 scm_stat_del(sk, skb); 2875 unix_detach_fds(&scm, skb); 2876 } 2877 2878 if (unix_skb_len(skb)) 2879 break; 2880 2881 skb_unlink(skb, &sk->sk_receive_queue); 2882 consume_skb(skb); 2883 2884 if (scm.fp) 2885 break; 2886 } else { 2887 /* It is questionable, see note in unix_dgram_recvmsg. 2888 */ 2889 if (UNIXCB(skb).fp) 2890 unix_peek_fds(&scm, skb); 2891 2892 sk_peek_offset_fwd(sk, chunk); 2893 2894 if (UNIXCB(skb).fp) 2895 break; 2896 2897 skip = 0; 2898 last = skb; 2899 last_len = skb->len; 2900 unix_state_lock(sk); 2901 skb = skb_peek_next(skb, &sk->sk_receive_queue); 2902 if (skb) 2903 goto again; 2904 unix_state_unlock(sk); 2905 break; 2906 } 2907 } while (size); 2908 2909 mutex_unlock(&u->iolock); 2910 if (state->msg) 2911 scm_recv_unix(sock, state->msg, &scm, flags); 2912 else 2913 scm_destroy(&scm); 2914 out: 2915 return copied ? : err; 2916 } 2917 2918 static int unix_stream_read_actor(struct sk_buff *skb, 2919 int skip, int chunk, 2920 struct unix_stream_read_state *state) 2921 { 2922 int ret; 2923 2924 ret = skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip, 2925 state->msg, chunk); 2926 return ret ?: chunk; 2927 } 2928 2929 int __unix_stream_recvmsg(struct sock *sk, struct msghdr *msg, 2930 size_t size, int flags) 2931 { 2932 struct unix_stream_read_state state = { 2933 .recv_actor = unix_stream_read_actor, 2934 .socket = sk->sk_socket, 2935 .msg = msg, 2936 .size = size, 2937 .flags = flags 2938 }; 2939 2940 return unix_stream_read_generic(&state, true); 2941 } 2942 2943 static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg, 2944 size_t size, int flags) 2945 { 2946 struct unix_stream_read_state state = { 2947 .recv_actor = unix_stream_read_actor, 2948 .socket = sock, 2949 .msg = msg, 2950 .size = size, 2951 .flags = flags 2952 }; 2953 2954 #ifdef CONFIG_BPF_SYSCALL 2955 struct sock *sk = sock->sk; 2956 const struct proto *prot = READ_ONCE(sk->sk_prot); 2957 2958 if (prot != &unix_stream_proto) 2959 return prot->recvmsg(sk, msg, size, flags, NULL); 2960 #endif 2961 return unix_stream_read_generic(&state, true); 2962 } 2963 2964 static int unix_stream_splice_actor(struct sk_buff *skb, 2965 int skip, int chunk, 2966 struct unix_stream_read_state *state) 2967 { 2968 return skb_splice_bits(skb, state->socket->sk, 2969 UNIXCB(skb).consumed + skip, 2970 state->pipe, chunk, state->splice_flags); 2971 } 2972 2973 static ssize_t unix_stream_splice_read(struct socket *sock, loff_t *ppos, 2974 struct pipe_inode_info *pipe, 2975 size_t size, unsigned int flags) 2976 { 2977 struct unix_stream_read_state state = { 2978 .recv_actor = unix_stream_splice_actor, 2979 .socket = sock, 2980 .pipe = pipe, 2981 .size = size, 2982 .splice_flags = flags, 2983 }; 2984 2985 if (unlikely(*ppos)) 2986 return -ESPIPE; 2987 2988 if (sock->file->f_flags & O_NONBLOCK || 2989 flags & SPLICE_F_NONBLOCK) 2990 state.flags = MSG_DONTWAIT; 2991 2992 return unix_stream_read_generic(&state, false); 2993 } 2994 2995 static int unix_shutdown(struct socket *sock, int mode) 2996 { 2997 struct sock *sk = sock->sk; 2998 struct sock *other; 2999 3000 if (mode < SHUT_RD || mode > SHUT_RDWR) 3001 return -EINVAL; 3002 /* This maps: 3003 * SHUT_RD (0) -> RCV_SHUTDOWN (1) 3004 * SHUT_WR (1) -> SEND_SHUTDOWN (2) 3005 * SHUT_RDWR (2) -> SHUTDOWN_MASK (3) 3006 */ 3007 ++mode; 3008 3009 unix_state_lock(sk); 3010 WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | mode); 3011 other = unix_peer(sk); 3012 if (other) 3013 sock_hold(other); 3014 unix_state_unlock(sk); 3015 sk->sk_state_change(sk); 3016 3017 if (other && 3018 (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) { 3019 3020 int peer_mode = 0; 3021 const struct proto *prot = READ_ONCE(other->sk_prot); 3022 3023 if (prot->unhash) 3024 prot->unhash(other); 3025 if (mode&RCV_SHUTDOWN) 3026 peer_mode |= SEND_SHUTDOWN; 3027 if (mode&SEND_SHUTDOWN) 3028 peer_mode |= RCV_SHUTDOWN; 3029 unix_state_lock(other); 3030 WRITE_ONCE(other->sk_shutdown, other->sk_shutdown | peer_mode); 3031 unix_state_unlock(other); 3032 other->sk_state_change(other); 3033 if (peer_mode == SHUTDOWN_MASK) 3034 sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP); 3035 else if (peer_mode & RCV_SHUTDOWN) 3036 sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN); 3037 } 3038 if (other) 3039 sock_put(other); 3040 3041 return 0; 3042 } 3043 3044 long unix_inq_len(struct sock *sk) 3045 { 3046 struct sk_buff *skb; 3047 long amount = 0; 3048 3049 if (sk->sk_state == TCP_LISTEN) 3050 return -EINVAL; 3051 3052 spin_lock(&sk->sk_receive_queue.lock); 3053 if (sk->sk_type == SOCK_STREAM || 3054 sk->sk_type == SOCK_SEQPACKET) { 3055 skb_queue_walk(&sk->sk_receive_queue, skb) 3056 amount += unix_skb_len(skb); 3057 } else { 3058 skb = skb_peek(&sk->sk_receive_queue); 3059 if (skb) 3060 amount = skb->len; 3061 } 3062 spin_unlock(&sk->sk_receive_queue.lock); 3063 3064 return amount; 3065 } 3066 EXPORT_SYMBOL_GPL(unix_inq_len); 3067 3068 long unix_outq_len(struct sock *sk) 3069 { 3070 return sk_wmem_alloc_get(sk); 3071 } 3072 EXPORT_SYMBOL_GPL(unix_outq_len); 3073 3074 static int unix_open_file(struct sock *sk) 3075 { 3076 struct path path; 3077 struct file *f; 3078 int fd; 3079 3080 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) 3081 return -EPERM; 3082 3083 if (!smp_load_acquire(&unix_sk(sk)->addr)) 3084 return -ENOENT; 3085 3086 path = unix_sk(sk)->path; 3087 if (!path.dentry) 3088 return -ENOENT; 3089 3090 path_get(&path); 3091 3092 fd = get_unused_fd_flags(O_CLOEXEC); 3093 if (fd < 0) 3094 goto out; 3095 3096 f = dentry_open(&path, O_PATH, current_cred()); 3097 if (IS_ERR(f)) { 3098 put_unused_fd(fd); 3099 fd = PTR_ERR(f); 3100 goto out; 3101 } 3102 3103 fd_install(fd, f); 3104 out: 3105 path_put(&path); 3106 3107 return fd; 3108 } 3109 3110 static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) 3111 { 3112 struct sock *sk = sock->sk; 3113 long amount = 0; 3114 int err; 3115 3116 switch (cmd) { 3117 case SIOCOUTQ: 3118 amount = unix_outq_len(sk); 3119 err = put_user(amount, (int __user *)arg); 3120 break; 3121 case SIOCINQ: 3122 amount = unix_inq_len(sk); 3123 if (amount < 0) 3124 err = amount; 3125 else 3126 err = put_user(amount, (int __user *)arg); 3127 break; 3128 case SIOCUNIXFILE: 3129 err = unix_open_file(sk); 3130 break; 3131 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 3132 case SIOCATMARK: 3133 { 3134 struct sk_buff *skb; 3135 int answ = 0; 3136 3137 skb = skb_peek(&sk->sk_receive_queue); 3138 if (skb && skb == READ_ONCE(unix_sk(sk)->oob_skb)) 3139 answ = 1; 3140 err = put_user(answ, (int __user *)arg); 3141 } 3142 break; 3143 #endif 3144 default: 3145 err = -ENOIOCTLCMD; 3146 break; 3147 } 3148 return err; 3149 } 3150 3151 #ifdef CONFIG_COMPAT 3152 static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) 3153 { 3154 return unix_ioctl(sock, cmd, (unsigned long)compat_ptr(arg)); 3155 } 3156 #endif 3157 3158 static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wait) 3159 { 3160 struct sock *sk = sock->sk; 3161 __poll_t mask; 3162 u8 shutdown; 3163 3164 sock_poll_wait(file, sock, wait); 3165 mask = 0; 3166 shutdown = READ_ONCE(sk->sk_shutdown); 3167 3168 /* exceptional events? */ 3169 if (READ_ONCE(sk->sk_err)) 3170 mask |= EPOLLERR; 3171 if (shutdown == SHUTDOWN_MASK) 3172 mask |= EPOLLHUP; 3173 if (shutdown & RCV_SHUTDOWN) 3174 mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM; 3175 3176 /* readable? */ 3177 if (!skb_queue_empty_lockless(&sk->sk_receive_queue)) 3178 mask |= EPOLLIN | EPOLLRDNORM; 3179 if (sk_is_readable(sk)) 3180 mask |= EPOLLIN | EPOLLRDNORM; 3181 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 3182 if (READ_ONCE(unix_sk(sk)->oob_skb)) 3183 mask |= EPOLLPRI; 3184 #endif 3185 3186 /* Connection-based need to check for termination and startup */ 3187 if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) && 3188 sk->sk_state == TCP_CLOSE) 3189 mask |= EPOLLHUP; 3190 3191 /* 3192 * we set writable also when the other side has shut down the 3193 * connection. This prevents stuck sockets. 3194 */ 3195 if (unix_writable(sk)) 3196 mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND; 3197 3198 return mask; 3199 } 3200 3201 static __poll_t unix_dgram_poll(struct file *file, struct socket *sock, 3202 poll_table *wait) 3203 { 3204 struct sock *sk = sock->sk, *other; 3205 unsigned int writable; 3206 __poll_t mask; 3207 u8 shutdown; 3208 3209 sock_poll_wait(file, sock, wait); 3210 mask = 0; 3211 shutdown = READ_ONCE(sk->sk_shutdown); 3212 3213 /* exceptional events? */ 3214 if (READ_ONCE(sk->sk_err) || 3215 !skb_queue_empty_lockless(&sk->sk_error_queue)) 3216 mask |= EPOLLERR | 3217 (sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0); 3218 3219 if (shutdown & RCV_SHUTDOWN) 3220 mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM; 3221 if (shutdown == SHUTDOWN_MASK) 3222 mask |= EPOLLHUP; 3223 3224 /* readable? */ 3225 if (!skb_queue_empty_lockless(&sk->sk_receive_queue)) 3226 mask |= EPOLLIN | EPOLLRDNORM; 3227 if (sk_is_readable(sk)) 3228 mask |= EPOLLIN | EPOLLRDNORM; 3229 3230 /* Connection-based need to check for termination and startup */ 3231 if (sk->sk_type == SOCK_SEQPACKET) { 3232 if (sk->sk_state == TCP_CLOSE) 3233 mask |= EPOLLHUP; 3234 /* connection hasn't started yet? */ 3235 if (sk->sk_state == TCP_SYN_SENT) 3236 return mask; 3237 } 3238 3239 /* No write status requested, avoid expensive OUT tests. */ 3240 if (!(poll_requested_events(wait) & (EPOLLWRBAND|EPOLLWRNORM|EPOLLOUT))) 3241 return mask; 3242 3243 writable = unix_writable(sk); 3244 if (writable) { 3245 unix_state_lock(sk); 3246 3247 other = unix_peer(sk); 3248 if (other && unix_peer(other) != sk && 3249 unix_recvq_full_lockless(other) && 3250 unix_dgram_peer_wake_me(sk, other)) 3251 writable = 0; 3252 3253 unix_state_unlock(sk); 3254 } 3255 3256 if (writable) 3257 mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND; 3258 else 3259 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); 3260 3261 return mask; 3262 } 3263 3264 #ifdef CONFIG_PROC_FS 3265 3266 #define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1) 3267 3268 #define get_bucket(x) ((x) >> BUCKET_SPACE) 3269 #define get_offset(x) ((x) & ((1UL << BUCKET_SPACE) - 1)) 3270 #define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o)) 3271 3272 static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos) 3273 { 3274 unsigned long offset = get_offset(*pos); 3275 unsigned long bucket = get_bucket(*pos); 3276 unsigned long count = 0; 3277 struct sock *sk; 3278 3279 for (sk = sk_head(&seq_file_net(seq)->unx.table.buckets[bucket]); 3280 sk; sk = sk_next(sk)) { 3281 if (++count == offset) 3282 break; 3283 } 3284 3285 return sk; 3286 } 3287 3288 static struct sock *unix_get_first(struct seq_file *seq, loff_t *pos) 3289 { 3290 unsigned long bucket = get_bucket(*pos); 3291 struct net *net = seq_file_net(seq); 3292 struct sock *sk; 3293 3294 while (bucket < UNIX_HASH_SIZE) { 3295 spin_lock(&net->unx.table.locks[bucket]); 3296 3297 sk = unix_from_bucket(seq, pos); 3298 if (sk) 3299 return sk; 3300 3301 spin_unlock(&net->unx.table.locks[bucket]); 3302 3303 *pos = set_bucket_offset(++bucket, 1); 3304 } 3305 3306 return NULL; 3307 } 3308 3309 static struct sock *unix_get_next(struct seq_file *seq, struct sock *sk, 3310 loff_t *pos) 3311 { 3312 unsigned long bucket = get_bucket(*pos); 3313 3314 sk = sk_next(sk); 3315 if (sk) 3316 return sk; 3317 3318 3319 spin_unlock(&seq_file_net(seq)->unx.table.locks[bucket]); 3320 3321 *pos = set_bucket_offset(++bucket, 1); 3322 3323 return unix_get_first(seq, pos); 3324 } 3325 3326 static void *unix_seq_start(struct seq_file *seq, loff_t *pos) 3327 { 3328 if (!*pos) 3329 return SEQ_START_TOKEN; 3330 3331 return unix_get_first(seq, pos); 3332 } 3333 3334 static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos) 3335 { 3336 ++*pos; 3337 3338 if (v == SEQ_START_TOKEN) 3339 return unix_get_first(seq, pos); 3340 3341 return unix_get_next(seq, v, pos); 3342 } 3343 3344 static void unix_seq_stop(struct seq_file *seq, void *v) 3345 { 3346 struct sock *sk = v; 3347 3348 if (sk) 3349 spin_unlock(&seq_file_net(seq)->unx.table.locks[sk->sk_hash]); 3350 } 3351 3352 static int unix_seq_show(struct seq_file *seq, void *v) 3353 { 3354 3355 if (v == SEQ_START_TOKEN) 3356 seq_puts(seq, "Num RefCount Protocol Flags Type St " 3357 "Inode Path\n"); 3358 else { 3359 struct sock *s = v; 3360 struct unix_sock *u = unix_sk(s); 3361 unix_state_lock(s); 3362 3363 seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu", 3364 s, 3365 refcount_read(&s->sk_refcnt), 3366 0, 3367 s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0, 3368 s->sk_type, 3369 s->sk_socket ? 3370 (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) : 3371 (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING), 3372 sock_i_ino(s)); 3373 3374 if (u->addr) { // under a hash table lock here 3375 int i, len; 3376 seq_putc(seq, ' '); 3377 3378 i = 0; 3379 len = u->addr->len - 3380 offsetof(struct sockaddr_un, sun_path); 3381 if (u->addr->name->sun_path[0]) { 3382 len--; 3383 } else { 3384 seq_putc(seq, '@'); 3385 i++; 3386 } 3387 for ( ; i < len; i++) 3388 seq_putc(seq, u->addr->name->sun_path[i] ?: 3389 '@'); 3390 } 3391 unix_state_unlock(s); 3392 seq_putc(seq, '\n'); 3393 } 3394 3395 return 0; 3396 } 3397 3398 static const struct seq_operations unix_seq_ops = { 3399 .start = unix_seq_start, 3400 .next = unix_seq_next, 3401 .stop = unix_seq_stop, 3402 .show = unix_seq_show, 3403 }; 3404 3405 #ifdef CONFIG_BPF_SYSCALL 3406 struct bpf_unix_iter_state { 3407 struct seq_net_private p; 3408 unsigned int cur_sk; 3409 unsigned int end_sk; 3410 unsigned int max_sk; 3411 struct sock **batch; 3412 bool st_bucket_done; 3413 }; 3414 3415 struct bpf_iter__unix { 3416 __bpf_md_ptr(struct bpf_iter_meta *, meta); 3417 __bpf_md_ptr(struct unix_sock *, unix_sk); 3418 uid_t uid __aligned(8); 3419 }; 3420 3421 static int unix_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta, 3422 struct unix_sock *unix_sk, uid_t uid) 3423 { 3424 struct bpf_iter__unix ctx; 3425 3426 meta->seq_num--; /* skip SEQ_START_TOKEN */ 3427 ctx.meta = meta; 3428 ctx.unix_sk = unix_sk; 3429 ctx.uid = uid; 3430 return bpf_iter_run_prog(prog, &ctx); 3431 } 3432 3433 static int bpf_iter_unix_hold_batch(struct seq_file *seq, struct sock *start_sk) 3434 3435 { 3436 struct bpf_unix_iter_state *iter = seq->private; 3437 unsigned int expected = 1; 3438 struct sock *sk; 3439 3440 sock_hold(start_sk); 3441 iter->batch[iter->end_sk++] = start_sk; 3442 3443 for (sk = sk_next(start_sk); sk; sk = sk_next(sk)) { 3444 if (iter->end_sk < iter->max_sk) { 3445 sock_hold(sk); 3446 iter->batch[iter->end_sk++] = sk; 3447 } 3448 3449 expected++; 3450 } 3451 3452 spin_unlock(&seq_file_net(seq)->unx.table.locks[start_sk->sk_hash]); 3453 3454 return expected; 3455 } 3456 3457 static void bpf_iter_unix_put_batch(struct bpf_unix_iter_state *iter) 3458 { 3459 while (iter->cur_sk < iter->end_sk) 3460 sock_put(iter->batch[iter->cur_sk++]); 3461 } 3462 3463 static int bpf_iter_unix_realloc_batch(struct bpf_unix_iter_state *iter, 3464 unsigned int new_batch_sz) 3465 { 3466 struct sock **new_batch; 3467 3468 new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz, 3469 GFP_USER | __GFP_NOWARN); 3470 if (!new_batch) 3471 return -ENOMEM; 3472 3473 bpf_iter_unix_put_batch(iter); 3474 kvfree(iter->batch); 3475 iter->batch = new_batch; 3476 iter->max_sk = new_batch_sz; 3477 3478 return 0; 3479 } 3480 3481 static struct sock *bpf_iter_unix_batch(struct seq_file *seq, 3482 loff_t *pos) 3483 { 3484 struct bpf_unix_iter_state *iter = seq->private; 3485 unsigned int expected; 3486 bool resized = false; 3487 struct sock *sk; 3488 3489 if (iter->st_bucket_done) 3490 *pos = set_bucket_offset(get_bucket(*pos) + 1, 1); 3491 3492 again: 3493 /* Get a new batch */ 3494 iter->cur_sk = 0; 3495 iter->end_sk = 0; 3496 3497 sk = unix_get_first(seq, pos); 3498 if (!sk) 3499 return NULL; /* Done */ 3500 3501 expected = bpf_iter_unix_hold_batch(seq, sk); 3502 3503 if (iter->end_sk == expected) { 3504 iter->st_bucket_done = true; 3505 return sk; 3506 } 3507 3508 if (!resized && !bpf_iter_unix_realloc_batch(iter, expected * 3 / 2)) { 3509 resized = true; 3510 goto again; 3511 } 3512 3513 return sk; 3514 } 3515 3516 static void *bpf_iter_unix_seq_start(struct seq_file *seq, loff_t *pos) 3517 { 3518 if (!*pos) 3519 return SEQ_START_TOKEN; 3520 3521 /* bpf iter does not support lseek, so it always 3522 * continue from where it was stop()-ped. 3523 */ 3524 return bpf_iter_unix_batch(seq, pos); 3525 } 3526 3527 static void *bpf_iter_unix_seq_next(struct seq_file *seq, void *v, loff_t *pos) 3528 { 3529 struct bpf_unix_iter_state *iter = seq->private; 3530 struct sock *sk; 3531 3532 /* Whenever seq_next() is called, the iter->cur_sk is 3533 * done with seq_show(), so advance to the next sk in 3534 * the batch. 3535 */ 3536 if (iter->cur_sk < iter->end_sk) 3537 sock_put(iter->batch[iter->cur_sk++]); 3538 3539 ++*pos; 3540 3541 if (iter->cur_sk < iter->end_sk) 3542 sk = iter->batch[iter->cur_sk]; 3543 else 3544 sk = bpf_iter_unix_batch(seq, pos); 3545 3546 return sk; 3547 } 3548 3549 static int bpf_iter_unix_seq_show(struct seq_file *seq, void *v) 3550 { 3551 struct bpf_iter_meta meta; 3552 struct bpf_prog *prog; 3553 struct sock *sk = v; 3554 uid_t uid; 3555 bool slow; 3556 int ret; 3557 3558 if (v == SEQ_START_TOKEN) 3559 return 0; 3560 3561 slow = lock_sock_fast(sk); 3562 3563 if (unlikely(sk_unhashed(sk))) { 3564 ret = SEQ_SKIP; 3565 goto unlock; 3566 } 3567 3568 uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk)); 3569 meta.seq = seq; 3570 prog = bpf_iter_get_info(&meta, false); 3571 ret = unix_prog_seq_show(prog, &meta, v, uid); 3572 unlock: 3573 unlock_sock_fast(sk, slow); 3574 return ret; 3575 } 3576 3577 static void bpf_iter_unix_seq_stop(struct seq_file *seq, void *v) 3578 { 3579 struct bpf_unix_iter_state *iter = seq->private; 3580 struct bpf_iter_meta meta; 3581 struct bpf_prog *prog; 3582 3583 if (!v) { 3584 meta.seq = seq; 3585 prog = bpf_iter_get_info(&meta, true); 3586 if (prog) 3587 (void)unix_prog_seq_show(prog, &meta, v, 0); 3588 } 3589 3590 if (iter->cur_sk < iter->end_sk) 3591 bpf_iter_unix_put_batch(iter); 3592 } 3593 3594 static const struct seq_operations bpf_iter_unix_seq_ops = { 3595 .start = bpf_iter_unix_seq_start, 3596 .next = bpf_iter_unix_seq_next, 3597 .stop = bpf_iter_unix_seq_stop, 3598 .show = bpf_iter_unix_seq_show, 3599 }; 3600 #endif 3601 #endif 3602 3603 static const struct net_proto_family unix_family_ops = { 3604 .family = PF_UNIX, 3605 .create = unix_create, 3606 .owner = THIS_MODULE, 3607 }; 3608 3609 3610 static int __net_init unix_net_init(struct net *net) 3611 { 3612 int i; 3613 3614 net->unx.sysctl_max_dgram_qlen = 10; 3615 if (unix_sysctl_register(net)) 3616 goto out; 3617 3618 #ifdef CONFIG_PROC_FS 3619 if (!proc_create_net("unix", 0, net->proc_net, &unix_seq_ops, 3620 sizeof(struct seq_net_private))) 3621 goto err_sysctl; 3622 #endif 3623 3624 net->unx.table.locks = kvmalloc_array(UNIX_HASH_SIZE, 3625 sizeof(spinlock_t), GFP_KERNEL); 3626 if (!net->unx.table.locks) 3627 goto err_proc; 3628 3629 net->unx.table.buckets = kvmalloc_array(UNIX_HASH_SIZE, 3630 sizeof(struct hlist_head), 3631 GFP_KERNEL); 3632 if (!net->unx.table.buckets) 3633 goto free_locks; 3634 3635 for (i = 0; i < UNIX_HASH_SIZE; i++) { 3636 spin_lock_init(&net->unx.table.locks[i]); 3637 INIT_HLIST_HEAD(&net->unx.table.buckets[i]); 3638 } 3639 3640 return 0; 3641 3642 free_locks: 3643 kvfree(net->unx.table.locks); 3644 err_proc: 3645 #ifdef CONFIG_PROC_FS 3646 remove_proc_entry("unix", net->proc_net); 3647 err_sysctl: 3648 #endif 3649 unix_sysctl_unregister(net); 3650 out: 3651 return -ENOMEM; 3652 } 3653 3654 static void __net_exit unix_net_exit(struct net *net) 3655 { 3656 kvfree(net->unx.table.buckets); 3657 kvfree(net->unx.table.locks); 3658 unix_sysctl_unregister(net); 3659 remove_proc_entry("unix", net->proc_net); 3660 } 3661 3662 static struct pernet_operations unix_net_ops = { 3663 .init = unix_net_init, 3664 .exit = unix_net_exit, 3665 }; 3666 3667 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) 3668 DEFINE_BPF_ITER_FUNC(unix, struct bpf_iter_meta *meta, 3669 struct unix_sock *unix_sk, uid_t uid) 3670 3671 #define INIT_BATCH_SZ 16 3672 3673 static int bpf_iter_init_unix(void *priv_data, struct bpf_iter_aux_info *aux) 3674 { 3675 struct bpf_unix_iter_state *iter = priv_data; 3676 int err; 3677 3678 err = bpf_iter_init_seq_net(priv_data, aux); 3679 if (err) 3680 return err; 3681 3682 err = bpf_iter_unix_realloc_batch(iter, INIT_BATCH_SZ); 3683 if (err) { 3684 bpf_iter_fini_seq_net(priv_data); 3685 return err; 3686 } 3687 3688 return 0; 3689 } 3690 3691 static void bpf_iter_fini_unix(void *priv_data) 3692 { 3693 struct bpf_unix_iter_state *iter = priv_data; 3694 3695 bpf_iter_fini_seq_net(priv_data); 3696 kvfree(iter->batch); 3697 } 3698 3699 static const struct bpf_iter_seq_info unix_seq_info = { 3700 .seq_ops = &bpf_iter_unix_seq_ops, 3701 .init_seq_private = bpf_iter_init_unix, 3702 .fini_seq_private = bpf_iter_fini_unix, 3703 .seq_priv_size = sizeof(struct bpf_unix_iter_state), 3704 }; 3705 3706 static const struct bpf_func_proto * 3707 bpf_iter_unix_get_func_proto(enum bpf_func_id func_id, 3708 const struct bpf_prog *prog) 3709 { 3710 switch (func_id) { 3711 case BPF_FUNC_setsockopt: 3712 return &bpf_sk_setsockopt_proto; 3713 case BPF_FUNC_getsockopt: 3714 return &bpf_sk_getsockopt_proto; 3715 default: 3716 return NULL; 3717 } 3718 } 3719 3720 static struct bpf_iter_reg unix_reg_info = { 3721 .target = "unix", 3722 .ctx_arg_info_size = 1, 3723 .ctx_arg_info = { 3724 { offsetof(struct bpf_iter__unix, unix_sk), 3725 PTR_TO_BTF_ID_OR_NULL }, 3726 }, 3727 .get_func_proto = bpf_iter_unix_get_func_proto, 3728 .seq_info = &unix_seq_info, 3729 }; 3730 3731 static void __init bpf_iter_register(void) 3732 { 3733 unix_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_UNIX]; 3734 if (bpf_iter_reg_target(&unix_reg_info)) 3735 pr_warn("Warning: could not register bpf iterator unix\n"); 3736 } 3737 #endif 3738 3739 static int __init af_unix_init(void) 3740 { 3741 int i, rc = -1; 3742 3743 BUILD_BUG_ON(sizeof(struct unix_skb_parms) > sizeof_field(struct sk_buff, cb)); 3744 3745 for (i = 0; i < UNIX_HASH_SIZE / 2; i++) { 3746 spin_lock_init(&bsd_socket_locks[i]); 3747 INIT_HLIST_HEAD(&bsd_socket_buckets[i]); 3748 } 3749 3750 rc = proto_register(&unix_dgram_proto, 1); 3751 if (rc != 0) { 3752 pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__); 3753 goto out; 3754 } 3755 3756 rc = proto_register(&unix_stream_proto, 1); 3757 if (rc != 0) { 3758 pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__); 3759 proto_unregister(&unix_dgram_proto); 3760 goto out; 3761 } 3762 3763 sock_register(&unix_family_ops); 3764 register_pernet_subsys(&unix_net_ops); 3765 unix_bpf_build_proto(); 3766 3767 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) 3768 bpf_iter_register(); 3769 #endif 3770 3771 out: 3772 return rc; 3773 } 3774 3775 /* Later than subsys_initcall() because we depend on stuff initialised there */ 3776 fs_initcall(af_unix_init); 3777