// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * NET4:	Implementation of BSD Unix domain sockets.
 *
 * Authors:	Alan Cox, <alan@lxorguk.ukuu.org.uk>
 *
 * Fixes:
 *		Linus Torvalds	:	Assorted bug cures.
 *		Niibe Yutaka	:	async I/O support.
 *		Carsten Paeth	:	PF_UNIX check, address fixes.
 *		Alan Cox	:	Limit size of allocated blocks.
 *		Alan Cox	:	Fixed the stupid socketpair bug.
 *		Alan Cox	:	BSD compatibility fine tuning.
 *		Alan Cox	:	Fixed a bug in connect when interrupted.
 *		Alan Cox	:	Sorted out a proper draft version of
 *					file descriptor passing hacked up from
 *					Mike Shaver's work.
 *		Marty Leisner	:	Fixes to fd passing
 *		Nick Nevin	:	recvmsg bugfix.
 *		Alan Cox	:	Started proper garbage collector
 *		Heiko EiBfeldt	:	Missing verify_area check
 *		Alan Cox	:	Started POSIXisms
 *		Andreas Schwab	:	Replace inode by dentry for proper
 *					reference counting
 *		Kirk Petersen	:	Made this a module
 *	    Christoph Rohland	:	Elegant non-blocking accept/connect algorithm.
 *					Lots of bug fixes.
 *	     Alexey Kuznetsov	:	Repaired (I hope) bugs introduced
 *					by above two patches.
 *	     Andrea Arcangeli	:	If possible we block in connect(2)
 *					if the max backlog of the listen socket
 *					has been reached. This won't break
 *					old apps and it will avoid huge amount
 *					of socks hashed (this for unix_gc()
 *					performance reasons).
 *					Security fix that limits the max
 *					number of socks to 2*max_files and
 *					the number of skb queueable in the
 *					dgram receiver.
 *	     Artur Skawina	:	Hash function optimizations
 *	     Alexey Kuznetsov	:	Full scale SMP. Lot of bugs are introduced 8)
 *	     Malcolm Beattie	:	Set peercred for socketpair
 *	     Michal Ostrowski	:	Module initialization cleanup.
 *	     Arnaldo C. Melo	:	Remove MOD_{INC,DEC}_USE_COUNT,
 *					the core infrastructure is doing that
 *					for all net proto families now (2.5.69+)
 *
 * Known differences from reference BSD that was tested:
 *
 *	[TO FIX]
 *	ECONNREFUSED is not returned from one end of a connected() socket to the
 *		other the moment one end closes.
 *	fstat() doesn't return st_dev=0, and gives the blksize as high water mark
 *		and a fake inode identifier (nor the BSD first socket fstat twice bug).
 *	[NOT TO FIX]
 *	accept() returns a path name even if the connecting socket has closed
 *		in the meantime (BSD loses the path and gives up).
 *	accept() returns 0 length path for an unbound connector. BSD returns 16
 *		and a null first byte in the path (but not for gethost/peername - BSD bug ??)
 *	socketpair(...SOCK_RAW..) doesn't panic the kernel.
 *	BSD af_unix apparently has connect forgetting to block properly.
 *		(need to check this with the POSIX spec in detail)
 *
 * Differences from 2.0.0-11-... (ANK)
 *	Bug fixes and improvements.
 *		- client shutdown killed server socket.
 *		- removed all useless cli/sti pairs.
 *
 *	Semantic changes/extensions.
 *		- generic control message passing.
 *		- SCM_CREDENTIALS control message.
 *		- "Abstract" (not FS based) socket bindings.
 *		  Abstract names are sequences of bytes (not zero terminated)
 *		  started by 0, so that this name space does not intersect
 *		  with BSD names.
 */
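/* Illustrative userspace sketch (hypothetical fd, not used by this file):
 * how the two kinds of names described above look from bind(2).  A
 * filesystem name uses a NUL-terminated sun_path; an abstract name starts
 * with a zero byte and its length is given solely by the address length:
 *
 *	struct sockaddr_un a = { .sun_family = AF_UNIX };
 *
 *	strcpy(a.sun_path, "/tmp/sock");		// filesystem (BSD) name
 *	bind(fd, (struct sockaddr *)&a, sizeof(a));
 *
 *	a.sun_path[0] = '\0';				// abstract name "\0name"
 *	memcpy(a.sun_path + 1, "name", 4);
 *	bind(fd, (struct sockaddr *)&a,
 *	     offsetof(struct sockaddr_un, sun_path) + 1 + 4);
 */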

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/signal.h>
#include <linux/sched/signal.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/stat.h>
#include <linux/dcache.h>
#include <linux/namei.h>
#include <linux/socket.h>
#include <linux/un.h>
#include <linux/fcntl.h>
#include <linux/filter.h>
#include <linux/termios.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/in.h>
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <net/net_namespace.h>
#include <net/sock.h>
#include <net/tcp_states.h>
#include <net/af_unix.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <net/scm.h>
#include <linux/init.h>
#include <linux/poll.h>
#include <linux/rtnetlink.h>
#include <linux/mount.h>
#include <net/checksum.h>
#include <linux/security.h>
#include <linux/splice.h>
#include <linux/freezer.h>
#include <linux/file.h>
#include <linux/btf_ids.h>
#include <linux/bpf-cgroup.h>

static atomic_long_t unix_nr_socks;
static struct hlist_head bsd_socket_buckets[UNIX_HASH_SIZE / 2];
static spinlock_t bsd_socket_locks[UNIX_HASH_SIZE / 2];

/* SMP locking strategy:
 *    hash table is protected with spinlock.
 *    each socket state is protected by separate spinlock.
 */

static unsigned int unix_unbound_hash(struct sock *sk)
{
	unsigned long hash = (unsigned long)sk;

	hash ^= hash >> 16;
	hash ^= hash >> 8;
	hash ^= sk->sk_type;

	return hash & UNIX_HASH_MOD;
}

static unsigned int unix_bsd_hash(struct inode *i)
{
	return i->i_ino & UNIX_HASH_MOD;
}

static unsigned int unix_abstract_hash(struct sockaddr_un *sunaddr,
				       int addr_len, int type)
{
	__wsum csum = csum_partial(sunaddr, addr_len, 0);
	unsigned int hash;

	hash = (__force unsigned int)csum_fold(csum);
	hash ^= hash >> 8;
	hash ^= type;

	return UNIX_HASH_MOD + 1 + (hash & UNIX_HASH_MOD);
}
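/* Note on the helpers above: unix_unbound_hash() and unix_bsd_hash() both
 * yield a slot in [0, UNIX_HASH_MOD], while unix_abstract_hash() yields a
 * slot in [UNIX_HASH_MOD + 1, UNIX_HASH_MOD * 2 + 1], so abstract sockets
 * never share a bucket (or a bucket lock) with unbound or pathname sockets.
 * Pathname sockets are additionally indexed by inode in the global
 * bsd_socket_buckets[] above, which is what unix_find_socket_byinode()
 * walks when connecting or sending to a filesystem address.
 */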

static void unix_table_double_lock(struct net *net,
				   unsigned int hash1, unsigned int hash2)
{
	if (hash1 == hash2) {
		spin_lock(&net->unx.table.locks[hash1]);
		return;
	}

	if (hash1 > hash2)
		swap(hash1, hash2);

	spin_lock(&net->unx.table.locks[hash1]);
	spin_lock_nested(&net->unx.table.locks[hash2], SINGLE_DEPTH_NESTING);
}

static void unix_table_double_unlock(struct net *net,
				     unsigned int hash1, unsigned int hash2)
{
	if (hash1 == hash2) {
		spin_unlock(&net->unx.table.locks[hash1]);
		return;
	}

	spin_unlock(&net->unx.table.locks[hash1]);
	spin_unlock(&net->unx.table.locks[hash2]);
}

#ifdef CONFIG_SECURITY_NETWORK
static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{
	UNIXCB(skb).secid = scm->secid;
}

static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{
	scm->secid = UNIXCB(skb).secid;
}

static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
{
	return (scm->secid == UNIXCB(skb).secid);
}
#else
static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{ }

static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{ }

static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
{
	return true;
}
#endif /* CONFIG_SECURITY_NETWORK */

static inline int unix_our_peer(struct sock *sk, struct sock *osk)
{
	return unix_peer(osk) == sk;
}

static inline int unix_may_send(struct sock *sk, struct sock *osk)
{
	return unix_peer(osk) == NULL || unix_our_peer(sk, osk);
}

static inline int unix_recvq_full(const struct sock *sk)
{
	return skb_queue_len(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
}

static inline int unix_recvq_full_lockless(const struct sock *sk)
{
	return skb_queue_len_lockless(&sk->sk_receive_queue) >
		READ_ONCE(sk->sk_max_ack_backlog);
}

struct sock *unix_peer_get(struct sock *s)
{
	struct sock *peer;

	unix_state_lock(s);
	peer = unix_peer(s);
	if (peer)
		sock_hold(peer);
	unix_state_unlock(s);
	return peer;
}
EXPORT_SYMBOL_GPL(unix_peer_get);

static struct unix_address *unix_create_addr(struct sockaddr_un *sunaddr,
					     int addr_len)
{
	struct unix_address *addr;

	addr = kmalloc(sizeof(*addr) + addr_len, GFP_KERNEL);
	if (!addr)
		return NULL;

	refcount_set(&addr->refcnt, 1);
	addr->len = addr_len;
	memcpy(addr->name, sunaddr, addr_len);

	return addr;
}

static inline void unix_release_addr(struct unix_address *addr)
{
	if (refcount_dec_and_test(&addr->refcnt))
		kfree(addr);
}

/*
 *	Check unix socket name:
 *		- should be not zero length.
 *		- if started by not zero, should be NULL terminated (FS object)
 *		- if started by zero, it is abstract name.
 */

static int unix_validate_addr(struct sockaddr_un *sunaddr, int addr_len)
{
	if (addr_len <= offsetof(struct sockaddr_un, sun_path) ||
	    addr_len > sizeof(*sunaddr))
		return -EINVAL;

	if (sunaddr->sun_family != AF_UNIX)
		return -EINVAL;

	return 0;
}

static int unix_mkname_bsd(struct sockaddr_un *sunaddr, int addr_len)
{
	struct sockaddr_storage *addr = (struct sockaddr_storage *)sunaddr;
	short offset = offsetof(struct sockaddr_storage, __data);

	BUILD_BUG_ON(offset != offsetof(struct sockaddr_un, sun_path));

	/* This may look like an off by one error but it is a bit more
	 * subtle.  108 is the longest valid AF_UNIX path for a binding.
	 * sun_path[108] doesn't as such exist.  However in kernel space
	 * we are guaranteed that it is a valid memory location in our
	 * kernel address buffer because syscall functions always pass
	 * a pointer of struct sockaddr_storage which has a bigger buffer
	 * than 108.  Also, we must terminate sun_path for strlen() in
	 * getname_kernel().
	 */
	addr->__data[addr_len - offset] = 0;

	/* Don't pass sunaddr->sun_path to strlen().  Otherwise, 108 will
	 * cause panic if CONFIG_FORTIFY_SOURCE=y.  Let __fortify_strlen()
	 * know the actual buffer.
	 */
	return strlen(addr->__data) + offset + 1;
}

static void __unix_remove_socket(struct sock *sk)
{
	sk_del_node_init(sk);
}

static void __unix_insert_socket(struct net *net, struct sock *sk)
{
	DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
	sk_add_node(sk, &net->unx.table.buckets[sk->sk_hash]);
}

static void __unix_set_addr_hash(struct net *net, struct sock *sk,
				 struct unix_address *addr, unsigned int hash)
{
	__unix_remove_socket(sk);
	smp_store_release(&unix_sk(sk)->addr, addr);

	sk->sk_hash = hash;
	__unix_insert_socket(net, sk);
}

static void unix_remove_socket(struct net *net, struct sock *sk)
{
	spin_lock(&net->unx.table.locks[sk->sk_hash]);
	__unix_remove_socket(sk);
	spin_unlock(&net->unx.table.locks[sk->sk_hash]);
}

static void unix_insert_unbound_socket(struct net *net, struct sock *sk)
{
	spin_lock(&net->unx.table.locks[sk->sk_hash]);
	__unix_insert_socket(net, sk);
	spin_unlock(&net->unx.table.locks[sk->sk_hash]);
}

static void unix_insert_bsd_socket(struct sock *sk)
{
	spin_lock(&bsd_socket_locks[sk->sk_hash]);
	sk_add_bind_node(sk, &bsd_socket_buckets[sk->sk_hash]);
	spin_unlock(&bsd_socket_locks[sk->sk_hash]);
}

static void unix_remove_bsd_socket(struct sock *sk)
{
	if (!hlist_unhashed(&sk->sk_bind_node)) {
		spin_lock(&bsd_socket_locks[sk->sk_hash]);
		__sk_del_bind_node(sk);
		spin_unlock(&bsd_socket_locks[sk->sk_hash]);

		sk_node_init(&sk->sk_bind_node);
	}
}

static struct sock *__unix_find_socket_byname(struct net *net,
					      struct sockaddr_un *sunname,
					      int len, unsigned int hash)
{
	struct sock *s;

	sk_for_each(s, &net->unx.table.buckets[hash]) {
		struct unix_sock *u = unix_sk(s);

		if (u->addr->len == len &&
		    !memcmp(u->addr->name, sunname, len))
			return s;
	}
	return NULL;
}

static inline struct sock *unix_find_socket_byname(struct net *net,
						   struct sockaddr_un *sunname,
						   int len, unsigned int hash)
{
	struct sock *s;

	spin_lock(&net->unx.table.locks[hash]);
	s = __unix_find_socket_byname(net, sunname, len, hash);
	if (s)
		sock_hold(s);
	spin_unlock(&net->unx.table.locks[hash]);
	return s;
}

static struct sock *unix_find_socket_byinode(struct inode *i)
{
	unsigned int hash = unix_bsd_hash(i);
	struct sock *s;

	spin_lock(&bsd_socket_locks[hash]);
	sk_for_each_bound(s, &bsd_socket_buckets[hash]) {
		struct dentry *dentry = unix_sk(s)->path.dentry;

		if (dentry && d_backing_inode(dentry) == i) {
			sock_hold(s);
			spin_unlock(&bsd_socket_locks[hash]);
			return s;
		}
	}
	spin_unlock(&bsd_socket_locks[hash]);
	return NULL;
}

/* Support code for asymmetrically connected dgram sockets
 *
 * If a datagram socket is connected to a socket not itself connected
 * to the first socket (eg, /dev/log), clients may only enqueue more
 * messages if the present receive queue of the server socket is not
 * "too large". This means there's a second writeability condition
 * poll and sendmsg need to test. The dgram recv code will do a wake
 * up on the peer_wait wait queue of a socket upon reception of a
 * datagram which needs to be propagated to sleeping would-be writers
 * since these might not have sent anything so far. This can't be
 * accomplished via poll_wait because the lifetime of the server
 * socket might be less than that of its clients if these break their
 * association with it or if the server socket is closed while clients
 * are still connected to it and there's no way to inform "a polling
 * implementation" that it should let go of a certain wait queue
 *
 * In order to propagate a wake up, a wait_queue_entry_t of the client
 * socket is enqueued on the peer_wait queue of the server socket
 * whose wake function does a wake_up on the ordinary client socket
 * wait queue. This connection is established whenever a write (or
 * poll for write) hits the flow control condition and broken when the
 * association to the server socket is dissolved or after a wake up
 * was relayed.
 */
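/* In short: when a write (or poll for write) hits the flow-control limit,
 * unix_dgram_peer_wake_me() registers the sender's u->peer_wake entry on
 * the peer's peer_wait queue.  Once the receiver dequeues a datagram and
 * wakes peer_wait, unix_dgram_peer_wake_relay() forwards the wake-up to
 * the sender's own wait queue and drops the registration again, so each
 * registration relays at most one wake-up.
 */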

static int unix_dgram_peer_wake_relay(wait_queue_entry_t *q, unsigned mode, int flags,
				      void *key)
{
	struct unix_sock *u;
	wait_queue_head_t *u_sleep;

	u = container_of(q, struct unix_sock, peer_wake);

	__remove_wait_queue(&unix_sk(u->peer_wake.private)->peer_wait,
			    q);
	u->peer_wake.private = NULL;

	/* relaying can only happen while the wq still exists */
	u_sleep = sk_sleep(&u->sk);
	if (u_sleep)
		wake_up_interruptible_poll(u_sleep, key_to_poll(key));

	return 0;
}

static int unix_dgram_peer_wake_connect(struct sock *sk, struct sock *other)
{
	struct unix_sock *u, *u_other;
	int rc;

	u = unix_sk(sk);
	u_other = unix_sk(other);
	rc = 0;
	spin_lock(&u_other->peer_wait.lock);

	if (!u->peer_wake.private) {
		u->peer_wake.private = other;
		__add_wait_queue(&u_other->peer_wait, &u->peer_wake);

		rc = 1;
	}

	spin_unlock(&u_other->peer_wait.lock);
	return rc;
}

static void unix_dgram_peer_wake_disconnect(struct sock *sk,
					    struct sock *other)
{
	struct unix_sock *u, *u_other;

	u = unix_sk(sk);
	u_other = unix_sk(other);
	spin_lock(&u_other->peer_wait.lock);

	if (u->peer_wake.private == other) {
		__remove_wait_queue(&u_other->peer_wait, &u->peer_wake);
		u->peer_wake.private = NULL;
	}

	spin_unlock(&u_other->peer_wait.lock);
}

static void unix_dgram_peer_wake_disconnect_wakeup(struct sock *sk,
						   struct sock *other)
{
	unix_dgram_peer_wake_disconnect(sk, other);
	wake_up_interruptible_poll(sk_sleep(sk),
				   EPOLLOUT |
				   EPOLLWRNORM |
				   EPOLLWRBAND);
}

/* preconditions:
 *	- unix_peer(sk) == other
 *	- association is stable
 */
static int unix_dgram_peer_wake_me(struct sock *sk, struct sock *other)
{
	int connected;

	connected = unix_dgram_peer_wake_connect(sk, other);

	/* If other is SOCK_DEAD, we want to make sure we signal
	 * POLLOUT, such that a subsequent write() can get a
	 * -ECONNREFUSED. Otherwise, if we haven't queued any skbs
	 * to other and it's full, we will hang waiting for POLLOUT.
	 */
	if (unix_recvq_full_lockless(other) && !sock_flag(other, SOCK_DEAD))
		return 1;

	if (connected)
		unix_dgram_peer_wake_disconnect(sk, other);

	return 0;
}

static int unix_writable(const struct sock *sk)
{
	return sk->sk_state != TCP_LISTEN &&
	       (refcount_read(&sk->sk_wmem_alloc) << 2) <= sk->sk_sndbuf;
}

static void unix_write_space(struct sock *sk)
{
	struct socket_wq *wq;

	rcu_read_lock();
	if (unix_writable(sk)) {
		wq = rcu_dereference(sk->sk_wq);
		if (skwq_has_sleeper(wq))
			wake_up_interruptible_sync_poll(&wq->wait,
				EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND);
		sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT);
	}
	rcu_read_unlock();
}

/* When dgram socket disconnects (or changes its peer), we clear its receive
 * queue of packets arrived from previous peer. First, it allows to do
 * flow control based only on wmem_alloc; second, sk connected to peer
 * may receive messages only from that peer. */
static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
{
	if (!skb_queue_empty(&sk->sk_receive_queue)) {
		skb_queue_purge(&sk->sk_receive_queue);
		wake_up_interruptible_all(&unix_sk(sk)->peer_wait);

		/* If one link of bidirectional dgram pipe is disconnected,
		 * we signal error. Messages are lost. Do not make this,
		 * when peer was not connected to us.
		 */
		if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
			WRITE_ONCE(other->sk_err, ECONNRESET);
			sk_error_report(other);
		}
	}
	other->sk_state = TCP_CLOSE;
}

static void unix_sock_destructor(struct sock *sk)
{
	struct unix_sock *u = unix_sk(sk);

	skb_queue_purge(&sk->sk_receive_queue);

	DEBUG_NET_WARN_ON_ONCE(refcount_read(&sk->sk_wmem_alloc));
	DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
	DEBUG_NET_WARN_ON_ONCE(sk->sk_socket);
	if (!sock_flag(sk, SOCK_DEAD)) {
		pr_info("Attempt to release alive unix socket: %p\n", sk);
		return;
	}

	if (u->addr)
		unix_release_addr(u->addr);

	atomic_long_dec(&unix_nr_socks);
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
#ifdef UNIX_REFCNT_DEBUG
	pr_debug("UNIX %p is destroyed, %ld are still alive.\n", sk,
		atomic_long_read(&unix_nr_socks));
#endif
}

static void unix_release_sock(struct sock *sk, int embrion)
{
	struct unix_sock *u = unix_sk(sk);
	struct sock *skpair;
	struct sk_buff *skb;
	struct path path;
	int state;

	unix_remove_socket(sock_net(sk), sk);
	unix_remove_bsd_socket(sk);

	/* Clear state */
	unix_state_lock(sk);
	sock_orphan(sk);
	WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK);
	path = u->path;
	u->path.dentry = NULL;
	u->path.mnt = NULL;
	state = sk->sk_state;
	sk->sk_state = TCP_CLOSE;

	skpair = unix_peer(sk);
	unix_peer(sk) = NULL;

	unix_state_unlock(sk);

#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
	if (u->oob_skb) {
		kfree_skb(u->oob_skb);
		u->oob_skb = NULL;
	}
#endif

	wake_up_interruptible_all(&u->peer_wait);

	if (skpair != NULL) {
		if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
			unix_state_lock(skpair);
			/* No more writes */
			WRITE_ONCE(skpair->sk_shutdown, SHUTDOWN_MASK);
			if (!skb_queue_empty(&sk->sk_receive_queue) || embrion)
				WRITE_ONCE(skpair->sk_err, ECONNRESET);
			unix_state_unlock(skpair);
			skpair->sk_state_change(skpair);
			sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
		}

		unix_dgram_peer_wake_disconnect(sk, skpair);
		sock_put(skpair); /* It may now die */
	}

	/* Try to flush out this socket. Throw out buffers at least */

	while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
		if (state == TCP_LISTEN)
			unix_release_sock(skb->sk, 1);
		/* passed fds are erased in the kfree_skb hook */
		UNIXCB(skb).consumed = skb->len;
		kfree_skb(skb);
	}

	if (path.dentry)
		path_put(&path);

	sock_put(sk);

	/* ---- Socket is dead now and most probably destroyed ---- */

	/*
	 * Fixme: BSD difference: In BSD all sockets connected to us get
	 *	  ECONNRESET and we die on the spot. In Linux we behave
	 *	  like files and pipes do and wait for the last
	 *	  dereference.
	 *
	 * Can't we simply set sock->err?
	 *
	 * What the above comment does talk about? --ANK(980817)
	 */

	if (READ_ONCE(unix_tot_inflight))
		unix_gc();		/* Garbage collect fds */
}

static void init_peercred(struct sock *sk)
{
	const struct cred *old_cred;
	struct pid *old_pid;

	spin_lock(&sk->sk_peer_lock);
	old_pid = sk->sk_peer_pid;
	old_cred = sk->sk_peer_cred;
	sk->sk_peer_pid = get_pid(task_tgid(current));
	sk->sk_peer_cred = get_current_cred();
	spin_unlock(&sk->sk_peer_lock);

	put_pid(old_pid);
	put_cred(old_cred);
}

static void copy_peercred(struct sock *sk, struct sock *peersk)
{
	const struct cred *old_cred;
	struct pid *old_pid;

	if (sk < peersk) {
		spin_lock(&sk->sk_peer_lock);
		spin_lock_nested(&peersk->sk_peer_lock, SINGLE_DEPTH_NESTING);
	} else {
		spin_lock(&peersk->sk_peer_lock);
		spin_lock_nested(&sk->sk_peer_lock, SINGLE_DEPTH_NESTING);
	}
	old_pid = sk->sk_peer_pid;
	old_cred = sk->sk_peer_cred;
	sk->sk_peer_pid = get_pid(peersk->sk_peer_pid);
	sk->sk_peer_cred = get_cred(peersk->sk_peer_cred);

	spin_unlock(&sk->sk_peer_lock);
	spin_unlock(&peersk->sk_peer_lock);

	put_pid(old_pid);
	put_cred(old_cred);
}

static int unix_listen(struct socket *sock, int backlog)
{
	int err;
	struct sock *sk = sock->sk;
	struct unix_sock *u = unix_sk(sk);

	err = -EOPNOTSUPP;
	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
		goto out;	/* Only stream/seqpacket sockets accept */
	err = -EINVAL;
	if (!READ_ONCE(u->addr))
		goto out;	/* No listens on an unbound socket */
	unix_state_lock(sk);
	if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
		goto out_unlock;
	if (backlog > sk->sk_max_ack_backlog)
		wake_up_interruptible_all(&u->peer_wait);
	sk->sk_max_ack_backlog	= backlog;
	sk->sk_state		= TCP_LISTEN;
	/* set credentials so connect can copy them */
	init_peercred(sk);
	err = 0;

out_unlock:
	unix_state_unlock(sk);
out:
	return err;
}

static int unix_release(struct socket *);
static int unix_bind(struct socket *, struct sockaddr *, int);
static int unix_stream_connect(struct socket *, struct sockaddr *,
			       int addr_len, int flags);
static int unix_socketpair(struct socket *, struct socket *);
static int unix_accept(struct socket *, struct socket *, struct proto_accept_arg *arg);
static int unix_getname(struct socket *, struct sockaddr *, int);
static __poll_t unix_poll(struct file *, struct socket *, poll_table *);
static __poll_t unix_dgram_poll(struct file *, struct socket *,
				poll_table *);
static int unix_ioctl(struct socket *, unsigned int, unsigned long);
#ifdef CONFIG_COMPAT
static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
#endif
static int unix_shutdown(struct socket *, int);
static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t);
static int unix_stream_recvmsg(struct socket *, struct msghdr *, size_t, int);
static ssize_t unix_stream_splice_read(struct socket *, loff_t *ppos,
				       struct pipe_inode_info *, size_t size,
				       unsigned int flags);
static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t);
static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int);
static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
static int unix_dgram_connect(struct socket *, struct sockaddr *,
			      int, int);
static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t);
static int unix_seqpacket_recvmsg(struct socket *, struct msghdr *, size_t,
				  int);

#ifdef CONFIG_PROC_FS
static int unix_count_nr_fds(struct sock *sk)
{
	struct sk_buff *skb;
	struct unix_sock *u;
	int nr_fds = 0;

	spin_lock(&sk->sk_receive_queue.lock);
	skb = skb_peek(&sk->sk_receive_queue);
	while (skb) {
		u = unix_sk(skb->sk);
		nr_fds += atomic_read(&u->scm_stat.nr_fds);
		skb = skb_peek_next(skb, &sk->sk_receive_queue);
	}
	spin_unlock(&sk->sk_receive_queue.lock);

	return nr_fds;
}

static void unix_show_fdinfo(struct seq_file *m, struct socket *sock)
{
	struct sock *sk = sock->sk;
	unsigned char s_state;
	struct unix_sock *u;
	int nr_fds = 0;

	if (sk) {
		s_state = READ_ONCE(sk->sk_state);
		u = unix_sk(sk);

		/* SOCK_STREAM and SOCK_SEQPACKET sockets never change their
		 * sk_state after switching to TCP_ESTABLISHED or TCP_LISTEN.
		 * SOCK_DGRAM is ordinary. So, no lock is needed.
		 */
		if (sock->type == SOCK_DGRAM || s_state == TCP_ESTABLISHED)
			nr_fds = atomic_read(&u->scm_stat.nr_fds);
		else if (s_state == TCP_LISTEN)
			nr_fds = unix_count_nr_fds(sk);

		seq_printf(m, "scm_fds: %u\n", nr_fds);
	}
}
#else
#define unix_show_fdinfo NULL
#endif

static const struct proto_ops unix_stream_ops = {
	.family =	PF_UNIX,
	.owner =	THIS_MODULE,
	.release =	unix_release,
	.bind =		unix_bind,
	.connect =	unix_stream_connect,
	.socketpair =	unix_socketpair,
	.accept =	unix_accept,
	.getname =	unix_getname,
	.poll =		unix_poll,
	.ioctl =	unix_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl =	unix_compat_ioctl,
#endif
	.listen =	unix_listen,
	.shutdown =	unix_shutdown,
	.sendmsg =	unix_stream_sendmsg,
	.recvmsg =	unix_stream_recvmsg,
	.read_skb =	unix_stream_read_skb,
	.mmap =		sock_no_mmap,
	.splice_read =	unix_stream_splice_read,
	.set_peek_off =	sk_set_peek_off,
	.show_fdinfo =	unix_show_fdinfo,
};

static const struct proto_ops unix_dgram_ops = {
	.family =	PF_UNIX,
	.owner =	THIS_MODULE,
	.release =	unix_release,
	.bind =		unix_bind,
	.connect =	unix_dgram_connect,
	.socketpair =	unix_socketpair,
	.accept =	sock_no_accept,
	.getname =	unix_getname,
	.poll =		unix_dgram_poll,
	.ioctl =	unix_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl =	unix_compat_ioctl,
#endif
	.listen =	sock_no_listen,
	.shutdown =	unix_shutdown,
	.sendmsg =	unix_dgram_sendmsg,
	.read_skb =	unix_read_skb,
	.recvmsg =	unix_dgram_recvmsg,
	.mmap =		sock_no_mmap,
	.set_peek_off =	sk_set_peek_off,
	.show_fdinfo =	unix_show_fdinfo,
};

static const struct proto_ops unix_seqpacket_ops = {
	.family =	PF_UNIX,
	.owner =	THIS_MODULE,
	.release =	unix_release,
	.bind =		unix_bind,
	.connect =	unix_stream_connect,
	.socketpair =	unix_socketpair,
	.accept =	unix_accept,
	.getname =	unix_getname,
	.poll =		unix_dgram_poll,
	.ioctl =	unix_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl =	unix_compat_ioctl,
#endif
	.listen =	unix_listen,
	.shutdown =	unix_shutdown,
	.sendmsg =	unix_seqpacket_sendmsg,
	.recvmsg =	unix_seqpacket_recvmsg,
	.mmap =		sock_no_mmap,
	.set_peek_off =	sk_set_peek_off,
	.show_fdinfo =	unix_show_fdinfo,
};

static void unix_close(struct sock *sk, long timeout)
{
	/* Nothing to do here, unix socket does not need a ->close().
	 * This is merely for sockmap.
	 */
}

static void unix_unhash(struct sock *sk)
{
	/* Nothing to do here, unix socket does not need a ->unhash().
	 * This is merely for sockmap.
	 */
}

static bool unix_bpf_bypass_getsockopt(int level, int optname)
{
	if (level == SOL_SOCKET) {
		switch (optname) {
		case SO_PEERPIDFD:
			return true;
		default:
			return false;
		}
	}

	return false;
}

struct proto unix_dgram_proto = {
	.name			= "UNIX",
	.owner			= THIS_MODULE,
	.obj_size		= sizeof(struct unix_sock),
	.close			= unix_close,
	.bpf_bypass_getsockopt	= unix_bpf_bypass_getsockopt,
#ifdef CONFIG_BPF_SYSCALL
	.psock_update_sk_prot	= unix_dgram_bpf_update_proto,
#endif
};

struct proto unix_stream_proto = {
	.name			= "UNIX-STREAM",
	.owner			= THIS_MODULE,
	.obj_size		= sizeof(struct unix_sock),
	.close			= unix_close,
	.unhash			= unix_unhash,
	.bpf_bypass_getsockopt	= unix_bpf_bypass_getsockopt,
#ifdef CONFIG_BPF_SYSCALL
	.psock_update_sk_prot	= unix_stream_bpf_update_proto,
#endif
};

static struct sock *unix_create1(struct net *net, struct socket *sock, int kern, int type)
{
	struct unix_sock *u;
	struct sock *sk;
	int err;

	atomic_long_inc(&unix_nr_socks);
	if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files()) {
		err = -ENFILE;
		goto err;
	}

	if (type == SOCK_STREAM)
		sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_stream_proto, kern);
	else /*dgram and seqpacket */
		sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_dgram_proto, kern);

	if (!sk) {
		err = -ENOMEM;
		goto err;
	}

	sock_init_data(sock, sk);

	sk->sk_hash		= unix_unbound_hash(sk);
	sk->sk_allocation	= GFP_KERNEL_ACCOUNT;
	sk->sk_write_space	= unix_write_space;
	sk->sk_max_ack_backlog	= net->unx.sysctl_max_dgram_qlen;
	sk->sk_destruct		= unix_sock_destructor;
	u = unix_sk(sk);
	u->listener = NULL;
	u->vertex = NULL;
	u->path.dentry = NULL;
	u->path.mnt = NULL;
	spin_lock_init(&u->lock);
	mutex_init(&u->iolock); /* single task reading lock */
	mutex_init(&u->bindlock); /* single task binding lock */
	init_waitqueue_head(&u->peer_wait);
	init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay);
	memset(&u->scm_stat, 0, sizeof(struct scm_stat));
	unix_insert_unbound_socket(net, sk);

	sock_prot_inuse_add(net, sk->sk_prot, 1);

	return sk;

err:
	atomic_long_dec(&unix_nr_socks);
	return ERR_PTR(err);
}

static int unix_create(struct net *net, struct socket *sock, int protocol,
		       int kern)
{
	struct sock *sk;

	if (protocol && protocol != PF_UNIX)
		return -EPROTONOSUPPORT;

	sock->state = SS_UNCONNECTED;

	switch (sock->type) {
	case SOCK_STREAM:
		sock->ops = &unix_stream_ops;
		break;
	/*
	 *	Believe it or not BSD has AF_UNIX, SOCK_RAW though
	 *	nothing uses it.
	 */
	case SOCK_RAW:
		sock->type = SOCK_DGRAM;
		fallthrough;
	case SOCK_DGRAM:
		sock->ops = &unix_dgram_ops;
		break;
	case SOCK_SEQPACKET:
		sock->ops = &unix_seqpacket_ops;
		break;
	default:
		return -ESOCKTNOSUPPORT;
	}

	sk = unix_create1(net, sock, kern, sock->type);
	if (IS_ERR(sk))
		return PTR_ERR(sk);

	return 0;
}

static int unix_release(struct socket *sock)
{
	struct sock *sk = sock->sk;

	if (!sk)
		return 0;

	sk->sk_prot->close(sk, 0);
	unix_release_sock(sk, 0);
	sock->sk = NULL;

	return 0;
}

static struct sock *unix_find_bsd(struct sockaddr_un *sunaddr, int addr_len,
				  int type)
{
	struct inode *inode;
	struct path path;
	struct sock *sk;
	int err;

	unix_mkname_bsd(sunaddr, addr_len);
	err = kern_path(sunaddr->sun_path, LOOKUP_FOLLOW, &path);
	if (err)
		goto fail;

	err = path_permission(&path, MAY_WRITE);
	if (err)
		goto path_put;

	err = -ECONNREFUSED;
	inode = d_backing_inode(path.dentry);
	if (!S_ISSOCK(inode->i_mode))
		goto path_put;

	sk = unix_find_socket_byinode(inode);
	if (!sk)
		goto path_put;

	err = -EPROTOTYPE;
	if (sk->sk_type == type)
		touch_atime(&path);
	else
		goto sock_put;

	path_put(&path);

	return sk;

sock_put:
	sock_put(sk);
path_put:
	path_put(&path);
fail:
	return ERR_PTR(err);
}

static struct sock *unix_find_abstract(struct net *net,
				       struct sockaddr_un *sunaddr,
				       int addr_len, int type)
{
	unsigned int hash = unix_abstract_hash(sunaddr, addr_len, type);
	struct dentry *dentry;
	struct sock *sk;

	sk = unix_find_socket_byname(net, sunaddr, addr_len, hash);
	if (!sk)
		return ERR_PTR(-ECONNREFUSED);

	dentry = unix_sk(sk)->path.dentry;
	if (dentry)
		touch_atime(&unix_sk(sk)->path);

	return sk;
}

static struct sock *unix_find_other(struct net *net,
				    struct sockaddr_un *sunaddr,
				    int addr_len, int type)
{
	struct sock *sk;

	if (sunaddr->sun_path[0])
		sk = unix_find_bsd(sunaddr, addr_len, type);
	else
		sk = unix_find_abstract(net, sunaddr, addr_len, type);

	return sk;
}

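/* Bind the socket to an autogenerated abstract name ("\0" followed by five
 * hex digits).  This is used for bind(2) with an empty sun_path and, when
 * SOCK_PASSCRED or SOCK_PASSPIDFD is set, by the connect/sendmsg paths for
 * a still-unbound socket, so the peer sees a usable source address.
 */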
static int unix_autobind(struct sock *sk)
{
	struct unix_sock *u = unix_sk(sk);
	unsigned int new_hash, old_hash;
	struct net *net = sock_net(sk);
	struct unix_address *addr;
	u32 lastnum, ordernum;
	int err;

	err = mutex_lock_interruptible(&u->bindlock);
	if (err)
		return err;

	if (u->addr)
		goto out;

	err = -ENOMEM;
	addr = kzalloc(sizeof(*addr) +
		       offsetof(struct sockaddr_un, sun_path) + 16, GFP_KERNEL);
	if (!addr)
		goto out;

	addr->len = offsetof(struct sockaddr_un, sun_path) + 6;
	addr->name->sun_family = AF_UNIX;
	refcount_set(&addr->refcnt, 1);

	old_hash = sk->sk_hash;
	ordernum = get_random_u32();
	lastnum = ordernum & 0xFFFFF;
retry:
	ordernum = (ordernum + 1) & 0xFFFFF;
	sprintf(addr->name->sun_path + 1, "%05x", ordernum);

	new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
	unix_table_double_lock(net, old_hash, new_hash);

	if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash)) {
		unix_table_double_unlock(net, old_hash, new_hash);

		/* __unix_find_socket_byname() may take long time if many names
		 * are already in use.
		 */
		cond_resched();

		if (ordernum == lastnum) {
			/* Give up if all names seem to be in use. */
			err = -ENOSPC;
			unix_release_addr(addr);
			goto out;
		}

		goto retry;
	}

	__unix_set_addr_hash(net, sk, addr, new_hash);
	unix_table_double_unlock(net, old_hash, new_hash);
	err = 0;

out:	mutex_unlock(&u->bindlock);
	return err;
}

static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr,
			 int addr_len)
{
	umode_t mode = S_IFSOCK |
	       (SOCK_INODE(sk->sk_socket)->i_mode & ~current_umask());
	struct unix_sock *u = unix_sk(sk);
	unsigned int new_hash, old_hash;
	struct net *net = sock_net(sk);
	struct mnt_idmap *idmap;
	struct unix_address *addr;
	struct dentry *dentry;
	struct path parent;
	int err;

	addr_len = unix_mkname_bsd(sunaddr, addr_len);
	addr = unix_create_addr(sunaddr, addr_len);
	if (!addr)
		return -ENOMEM;

	/*
	 * Get the parent directory, calculate the hash for last
	 * component.
	 */
	dentry = kern_path_create(AT_FDCWD, addr->name->sun_path, &parent, 0);
	if (IS_ERR(dentry)) {
		err = PTR_ERR(dentry);
		goto out;
	}

	/*
	 * All right, let's create it.
	 */
	idmap = mnt_idmap(parent.mnt);
	err = security_path_mknod(&parent, dentry, mode, 0);
	if (!err)
		err = vfs_mknod(idmap, d_inode(parent.dentry), dentry, mode, 0);
	if (err)
		goto out_path;
	err = mutex_lock_interruptible(&u->bindlock);
	if (err)
		goto out_unlink;
	if (u->addr)
		goto out_unlock;

	old_hash = sk->sk_hash;
	new_hash = unix_bsd_hash(d_backing_inode(dentry));
	unix_table_double_lock(net, old_hash, new_hash);
	u->path.mnt = mntget(parent.mnt);
	u->path.dentry = dget(dentry);
	__unix_set_addr_hash(net, sk, addr, new_hash);
	unix_table_double_unlock(net, old_hash, new_hash);
	unix_insert_bsd_socket(sk);
	mutex_unlock(&u->bindlock);
	done_path_create(&parent, dentry);
	return 0;

out_unlock:
	mutex_unlock(&u->bindlock);
	err = -EINVAL;
out_unlink:
	/* failed after successful mknod?  unlink what we'd created... */
	vfs_unlink(idmap, d_inode(parent.dentry), dentry, NULL);
out_path:
	done_path_create(&parent, dentry);
out:
	unix_release_addr(addr);
	return err == -EEXIST ? -EADDRINUSE : err;
}

static int unix_bind_abstract(struct sock *sk, struct sockaddr_un *sunaddr,
			      int addr_len)
{
	struct unix_sock *u = unix_sk(sk);
	unsigned int new_hash, old_hash;
	struct net *net = sock_net(sk);
	struct unix_address *addr;
	int err;

	addr = unix_create_addr(sunaddr, addr_len);
	if (!addr)
		return -ENOMEM;

	err = mutex_lock_interruptible(&u->bindlock);
	if (err)
		goto out;

	if (u->addr) {
		err = -EINVAL;
		goto out_mutex;
	}

	old_hash = sk->sk_hash;
	new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
	unix_table_double_lock(net, old_hash, new_hash);

	if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash))
		goto out_spin;

	__unix_set_addr_hash(net, sk, addr, new_hash);
	unix_table_double_unlock(net, old_hash, new_hash);
	mutex_unlock(&u->bindlock);
	return 0;

out_spin:
	unix_table_double_unlock(net, old_hash, new_hash);
	err = -EADDRINUSE;
out_mutex:
	mutex_unlock(&u->bindlock);
out:
	unix_release_addr(addr);
	return err;
}

static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
{
	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
	struct sock *sk = sock->sk;
	int err;

	if (addr_len == offsetof(struct sockaddr_un, sun_path) &&
	    sunaddr->sun_family == AF_UNIX)
		return unix_autobind(sk);

	err = unix_validate_addr(sunaddr, addr_len);
	if (err)
		return err;

	if (sunaddr->sun_path[0])
		err = unix_bind_bsd(sk, sunaddr, addr_len);
	else
		err = unix_bind_abstract(sk, sunaddr, addr_len);

	return err;
}

static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)
{
	if (unlikely(sk1 == sk2) || !sk2) {
		unix_state_lock(sk1);
		return;
	}
	if (sk1 > sk2)
		swap(sk1, sk2);

	unix_state_lock(sk1);
	unix_state_lock_nested(sk2, U_LOCK_SECOND);
}

static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2)
{
	if (unlikely(sk1 == sk2) || !sk2) {
		unix_state_unlock(sk1);
		return;
	}
	unix_state_unlock(sk1);
	unix_state_unlock(sk2);
}

static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
			      int alen, int flags)
{
	struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr;
	struct sock *sk = sock->sk;
	struct sock *other;
	int err;

	err = -EINVAL;
	if (alen < offsetofend(struct sockaddr, sa_family))
		goto out;

	if (addr->sa_family != AF_UNSPEC) {
		err = unix_validate_addr(sunaddr, alen);
		if (err)
			goto out;

		err = BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk, addr, &alen);
		if (err)
			goto out;

		if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
		     test_bit(SOCK_PASSPIDFD, &sock->flags)) &&
		    !READ_ONCE(unix_sk(sk)->addr)) {
			err = unix_autobind(sk);
			if (err)
				goto out;
		}

restart:
		other = unix_find_other(sock_net(sk), sunaddr, alen, sock->type);
		if (IS_ERR(other)) {
			err = PTR_ERR(other);
			goto out;
		}

		unix_state_double_lock(sk, other);

		/* Apparently VFS overslept socket death. Retry. */
		if (sock_flag(other, SOCK_DEAD)) {
			unix_state_double_unlock(sk, other);
			sock_put(other);
			goto restart;
		}

		err = -EPERM;
		if (!unix_may_send(sk, other))
			goto out_unlock;

		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
		if (err)
			goto out_unlock;

		sk->sk_state = other->sk_state = TCP_ESTABLISHED;
	} else {
		/*
		 *	1003.1g breaking connected state with AF_UNSPEC
		 */
		other = NULL;
		unix_state_double_lock(sk, other);
	}

	/*
	 * If it was connected, reconnect.
	 */
	if (unix_peer(sk)) {
		struct sock *old_peer = unix_peer(sk);

		unix_peer(sk) = other;
		if (!other)
			sk->sk_state = TCP_CLOSE;
		unix_dgram_peer_wake_disconnect_wakeup(sk, old_peer);

		unix_state_double_unlock(sk, other);

		if (other != old_peer)
			unix_dgram_disconnected(sk, old_peer);
		sock_put(old_peer);
	} else {
		unix_peer(sk) = other;
		unix_state_double_unlock(sk, other);
	}

	return 0;

out_unlock:
	unix_state_double_unlock(sk, other);
	sock_put(other);
out:
	return err;
}

static long unix_wait_for_peer(struct sock *other, long timeo)
	__releases(&unix_sk(other)->lock)
{
	struct unix_sock *u = unix_sk(other);
	int sched;
	DEFINE_WAIT(wait);

	prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);

	sched = !sock_flag(other, SOCK_DEAD) &&
		!(other->sk_shutdown & RCV_SHUTDOWN) &&
		unix_recvq_full_lockless(other);

	unix_state_unlock(other);

	if (sched)
		timeo = schedule_timeout(timeo);

	finish_wait(&u->peer_wait, &wait);
	return timeo;
}

static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
			       int addr_len, int flags)
{
	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
	struct sock *sk = sock->sk, *newsk = NULL, *other = NULL;
	struct unix_sock *u = unix_sk(sk), *newu, *otheru;
	struct net *net = sock_net(sk);
	struct sk_buff *skb = NULL;
	long timeo;
	int err;
	int st;

	err = unix_validate_addr(sunaddr, addr_len);
	if (err)
		goto out;

	err = BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk, uaddr, &addr_len);
	if (err)
		goto out;

	if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
	     test_bit(SOCK_PASSPIDFD, &sock->flags)) &&
	    !READ_ONCE(u->addr)) {
		err = unix_autobind(sk);
		if (err)
			goto out;
	}

	timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);

	/* First of all allocate resources.
	   If we will make it after state is locked,
	   we will have to recheck all again in any case.
	 */

	/* create new sock for complete connection */
	newsk = unix_create1(net, NULL, 0, sock->type);
	if (IS_ERR(newsk)) {
		err = PTR_ERR(newsk);
		newsk = NULL;
		goto out;
	}

	err = -ENOMEM;

	/* Allocate skb for sending to listening sock */
	skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
	if (skb == NULL)
		goto out;

restart:
	/* Find listening sock. */
	other = unix_find_other(net, sunaddr, addr_len, sk->sk_type);
	if (IS_ERR(other)) {
		err = PTR_ERR(other);
		other = NULL;
		goto out;
	}

	/* Latch state of peer */
	unix_state_lock(other);

	/* Apparently VFS overslept socket death. Retry. */
	if (sock_flag(other, SOCK_DEAD)) {
		unix_state_unlock(other);
		sock_put(other);
		goto restart;
	}

	err = -ECONNREFUSED;
	if (other->sk_state != TCP_LISTEN)
		goto out_unlock;
	if (other->sk_shutdown & RCV_SHUTDOWN)
		goto out_unlock;

	if (unix_recvq_full(other)) {
		err = -EAGAIN;
		if (!timeo)
			goto out_unlock;

		timeo = unix_wait_for_peer(other, timeo);

		err = sock_intr_errno(timeo);
		if (signal_pending(current))
			goto out;
		sock_put(other);
		goto restart;
	}

	/* Latch our state.

	   It is tricky place. We need to grab our state lock and cannot
	   drop lock on peer. It is dangerous because deadlock is
	   possible. Connect to self case and simultaneous
	   attempt to connect are eliminated by checking socket
	   state. other is TCP_LISTEN, if sk is TCP_LISTEN we
	   check this before attempt to grab lock.

	   Well, and we have to recheck the state after socket locked.
	 */
	st = sk->sk_state;

	switch (st) {
	case TCP_CLOSE:
		/* This is ok... continue with connect */
		break;
	case TCP_ESTABLISHED:
		/* Socket is already connected */
		err = -EISCONN;
		goto out_unlock;
	default:
		err = -EINVAL;
		goto out_unlock;
	}

	unix_state_lock_nested(sk, U_LOCK_SECOND);

	if (sk->sk_state != st) {
		unix_state_unlock(sk);
		unix_state_unlock(other);
		sock_put(other);
		goto restart;
	}

	err = security_unix_stream_connect(sk, other, newsk);
	if (err) {
		unix_state_unlock(sk);
		goto out_unlock;
	}

	/* The way is open! Fastly set all the necessary fields... */

	sock_hold(sk);
	unix_peer(newsk)	= sk;
	newsk->sk_state		= TCP_ESTABLISHED;
	newsk->sk_type		= sk->sk_type;
	init_peercred(newsk);
	newu = unix_sk(newsk);
	newu->listener = other;
	RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
	otheru = unix_sk(other);

	/* copy address information from listening to new sock
	 *
	 * The contents of *(otheru->addr) and otheru->path
	 * are seen fully set up here, since we have found
	 * otheru in hash under its lock.  Insertion into the
	 * hash chain we'd found it in had been done in an
	 * earlier critical area protected by the chain's lock,
	 * the same one where we'd set *(otheru->addr) contents,
	 * as well as otheru->path and otheru->addr itself.
	 *
	 * Using smp_store_release() here to set newu->addr
	 * is enough to make those stores, as well as stores
	 * to newu->path visible to anyone who gets newu->addr
	 * by smp_load_acquire().  IOW, the same warranties
	 * as for unix_sock instances bound in unix_bind() or
	 * in unix_autobind().
	 */
	if (otheru->path.dentry) {
		path_get(&otheru->path);
		newu->path = otheru->path;
	}
	refcount_inc(&otheru->addr->refcnt);
	smp_store_release(&newu->addr, otheru->addr);

	/* Set credentials */
	copy_peercred(sk, other);

	sock->state	= SS_CONNECTED;
	sk->sk_state	= TCP_ESTABLISHED;
	sock_hold(newsk);

	smp_mb__after_atomic();	/* sock_hold() does an atomic_inc() */
	unix_peer(sk)	= newsk;

	unix_state_unlock(sk);

	/* take ten and send info to listening sock */
	spin_lock(&other->sk_receive_queue.lock);
	__skb_queue_tail(&other->sk_receive_queue, skb);
	spin_unlock(&other->sk_receive_queue.lock);
	unix_state_unlock(other);
	other->sk_data_ready(other);
	sock_put(other);
	return 0;

out_unlock:
	if (other)
		unix_state_unlock(other);

out:
	kfree_skb(skb);
	if (newsk)
		unix_release_sock(newsk, 0);
	if (other)
		sock_put(other);
	return err;
}

static int unix_socketpair(struct socket *socka, struct socket *sockb)
{
	struct sock *ska = socka->sk, *skb = sockb->sk;

	/* Join our sockets back to back */
	sock_hold(ska);
	sock_hold(skb);
	unix_peer(ska) = skb;
	unix_peer(skb) = ska;
	init_peercred(ska);
	init_peercred(skb);

	ska->sk_state = TCP_ESTABLISHED;
	skb->sk_state = TCP_ESTABLISHED;
	socka->state  = SS_CONNECTED;
	sockb->state  = SS_CONNECTED;
	return 0;
}

static void unix_sock_inherit_flags(const struct socket *old,
				    struct socket *new)
{
	if (test_bit(SOCK_PASSCRED, &old->flags))
		set_bit(SOCK_PASSCRED, &new->flags);
	if (test_bit(SOCK_PASSPIDFD, &old->flags))
		set_bit(SOCK_PASSPIDFD, &new->flags);
	if (test_bit(SOCK_PASSSEC, &old->flags))
		set_bit(SOCK_PASSSEC, &new->flags);
}

static int unix_accept(struct socket *sock, struct socket *newsock,
		       struct proto_accept_arg *arg)
{
	struct sock *sk = sock->sk;
	struct sk_buff *skb;
	struct sock *tsk;

	arg->err = -EOPNOTSUPP;
	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
		goto out;

	arg->err = -EINVAL;
	if (sk->sk_state != TCP_LISTEN)
		goto out;

	/* If socket state is TCP_LISTEN it cannot change (for now...),
	 * so that no locks are necessary.
	 */

	skb = skb_recv_datagram(sk, (arg->flags & O_NONBLOCK) ? MSG_DONTWAIT : 0,
				&arg->err);
	if (!skb) {
		/* This means receive shutdown. */
		if (arg->err == 0)
			arg->err = -EINVAL;
		goto out;
	}

	tsk = skb->sk;
	skb_free_datagram(sk, skb);
	wake_up_interruptible(&unix_sk(sk)->peer_wait);

	/* attach accepted sock to socket */
	unix_state_lock(tsk);
	unix_update_edges(unix_sk(tsk));
	newsock->state = SS_CONNECTED;
	unix_sock_inherit_flags(sock, newsock);
	sock_graft(tsk, newsock);
	unix_state_unlock(tsk);
	return 0;

out:
	return arg->err;
}


static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int peer)
{
	struct sock *sk = sock->sk;
	struct unix_address *addr;
	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr);
	int err = 0;

	if (peer) {
		sk = unix_peer_get(sk);

		err = -ENOTCONN;
		if (!sk)
			goto out;
		err = 0;
	} else {
		sock_hold(sk);
	}

	addr = smp_load_acquire(&unix_sk(sk)->addr);
	if (!addr) {
		sunaddr->sun_family = AF_UNIX;
		sunaddr->sun_path[0] = 0;
		err = offsetof(struct sockaddr_un, sun_path);
	} else {
		err = addr->len;
		memcpy(sunaddr, addr->name, addr->len);

		if (peer)
			BPF_CGROUP_RUN_SA_PROG(sk, uaddr, &err,
					       CGROUP_UNIX_GETPEERNAME);
		else
			BPF_CGROUP_RUN_SA_PROG(sk, uaddr, &err,
					       CGROUP_UNIX_GETSOCKNAME);
	}
	sock_put(sk);
out:
	return err;
}

/* The "user->unix_inflight" variable is protected by the garbage
 * collection lock, and we just read it locklessly here. If you go
 * over the limit, there might be a tiny race in actually noticing
 * it across threads. Tough.
 */
static inline bool too_many_unix_fds(struct task_struct *p)
{
	struct user_struct *user = current_user();

	if (unlikely(READ_ONCE(user->unix_inflight) > task_rlimit(p, RLIMIT_NOFILE)))
		return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN);
	return false;
}

static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
{
	if (too_many_unix_fds(current))
		return -ETOOMANYREFS;

	UNIXCB(skb).fp = scm->fp;
	scm->fp = NULL;

	if (unix_prepare_fpl(UNIXCB(skb).fp))
		return -ENOMEM;

	return 0;
}

static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
{
	scm->fp = UNIXCB(skb).fp;
	UNIXCB(skb).fp = NULL;

	unix_destroy_fpl(scm->fp);
}

static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb)
{
	scm->fp = scm_fp_dup(UNIXCB(skb).fp);
}

static void unix_destruct_scm(struct sk_buff *skb)
{
	struct scm_cookie scm;

	memset(&scm, 0, sizeof(scm));
	scm.pid = UNIXCB(skb).pid;
	if (UNIXCB(skb).fp)
		unix_detach_fds(&scm, skb);

	/* Alas, it calls VFS */
	/* So fscking what? fput() had been SMP-safe since the last Summer */
	scm_destroy(&scm);
	sock_wfree(skb);
}

static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
{
	int err = 0;

	UNIXCB(skb).pid = get_pid(scm->pid);
	UNIXCB(skb).uid = scm->creds.uid;
	UNIXCB(skb).gid = scm->creds.gid;
	UNIXCB(skb).fp = NULL;
	unix_get_secdata(scm, skb);
	if (scm->fp && send_fds)
		err = unix_attach_fds(scm, skb);

	skb->destructor = unix_destruct_scm;
	return err;
}

static bool unix_passcred_enabled(const struct socket *sock,
				  const struct sock *other)
{
	return test_bit(SOCK_PASSCRED, &sock->flags) ||
	       test_bit(SOCK_PASSPIDFD, &sock->flags) ||
	       !other->sk_socket ||
	       test_bit(SOCK_PASSCRED, &other->sk_socket->flags) ||
	       test_bit(SOCK_PASSPIDFD, &other->sk_socket->flags);
}

/*
 * Some apps rely on write() giving SCM_CREDENTIALS
 * We include credentials if source or destination socket
 * asserted SOCK_PASSCRED.
 */
static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock,
			    const struct sock *other)
{
	if (UNIXCB(skb).pid)
		return;
	if (unix_passcred_enabled(sock, other)) {
		UNIXCB(skb).pid = get_pid(task_tgid(current));
		current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid);
	}
}

static bool unix_skb_scm_eq(struct sk_buff *skb,
			    struct scm_cookie *scm)
{
	return UNIXCB(skb).pid == scm->pid &&
	       uid_eq(UNIXCB(skb).uid, scm->creds.uid) &&
	       gid_eq(UNIXCB(skb).gid, scm->creds.gid) &&
	       unix_secdata_eq(scm, skb);
}

static void scm_stat_add(struct sock *sk, struct sk_buff *skb)
{
	struct scm_fp_list *fp = UNIXCB(skb).fp;
	struct unix_sock *u = unix_sk(sk);

	if (unlikely(fp && fp->count)) {
		atomic_add(fp->count, &u->scm_stat.nr_fds);
		unix_add_edges(fp, u);
	}
}

static void scm_stat_del(struct sock *sk, struct sk_buff *skb)
{
	struct scm_fp_list *fp = UNIXCB(skb).fp;
	struct unix_sock *u = unix_sk(sk);

	if (unlikely(fp && fp->count)) {
		atomic_sub(fp->count, &u->scm_stat.nr_fds);
		unix_del_edges(fp);
	}
}

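/* scm_stat_add()/scm_stat_del() above keep u->scm_stat.nr_fds in step with
 * the SCM_RIGHTS descriptors queued on a socket (reported through
 * unix_show_fdinfo()) and register/unregister the skb's file list with the
 * in-flight fd garbage collector via unix_add_edges()/unix_del_edges().
 */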
/*
 *	Send AF_UNIX data.
 */

static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
			      size_t len)
{
	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name);
	struct sock *sk = sock->sk, *other = NULL;
	struct unix_sock *u = unix_sk(sk);
	struct scm_cookie scm;
	struct sk_buff *skb;
	int data_len = 0;
	int sk_locked;
	long timeo;
	int err;

	err = scm_send(sock, msg, &scm, false);
	if (err < 0)
		return err;

	wait_for_unix_gc(scm.fp);

	err = -EOPNOTSUPP;
	if (msg->msg_flags&MSG_OOB)
		goto out;

	if (msg->msg_namelen) {
		err = unix_validate_addr(sunaddr, msg->msg_namelen);
		if (err)
			goto out;

		err = BPF_CGROUP_RUN_PROG_UNIX_SENDMSG_LOCK(sk,
							    msg->msg_name,
							    &msg->msg_namelen,
							    NULL);
		if (err)
			goto out;
	} else {
		sunaddr = NULL;
		err = -ENOTCONN;
		other = unix_peer_get(sk);
		if (!other)
			goto out;
	}

	if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
	     test_bit(SOCK_PASSPIDFD, &sock->flags)) &&
	    !READ_ONCE(u->addr)) {
		err = unix_autobind(sk);
		if (err)
			goto out;
	}

	err = -EMSGSIZE;
	if (len > sk->sk_sndbuf - 32)
		goto out;

	if (len > SKB_MAX_ALLOC) {
		data_len = min_t(size_t,
				 len - SKB_MAX_ALLOC,
				 MAX_SKB_FRAGS * PAGE_SIZE);
		data_len = PAGE_ALIGN(data_len);

		BUILD_BUG_ON(SKB_MAX_ALLOC < PAGE_SIZE);
	}

	skb = sock_alloc_send_pskb(sk, len - data_len, data_len,
				   msg->msg_flags & MSG_DONTWAIT, &err,
				   PAGE_ALLOC_COSTLY_ORDER);
	if (skb == NULL)
		goto out;

	err = unix_scm_to_skb(&scm, skb, true);
	if (err < 0)
		goto out_free;

	skb_put(skb, len - data_len);
	skb->data_len = data_len;
	skb->len = len;
	err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, len);
	if (err)
		goto out_free;

	timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);

restart:
	if (!other) {
		err = -ECONNRESET;
		if (sunaddr == NULL)
			goto out_free;

		other = unix_find_other(sock_net(sk), sunaddr, msg->msg_namelen,
					sk->sk_type);
		if (IS_ERR(other)) {
			err = PTR_ERR(other);
			other = NULL;
			goto out_free;
		}
	}

	if (sk_filter(other, skb) < 0) {
		/* Toss the packet but do not return any error to the sender */
		err = len;
		goto out_free;
	}

	sk_locked = 0;
	unix_state_lock(other);
restart_locked:
	err = -EPERM;
	if (!unix_may_send(sk, other))
		goto out_unlock;

	if (unlikely(sock_flag(other, SOCK_DEAD))) {
		/*
		 *	Check with 1003.1g - what should
		 *	datagram error
		 */
		unix_state_unlock(other);
		sock_put(other);

		if (!sk_locked)
			unix_state_lock(sk);

		err = 0;
		if (sk->sk_type == SOCK_SEQPACKET) {
			/* We are here only when racing with unix_release_sock()
			 * is clearing @other. Never change state to TCP_CLOSE
			 * unlike SOCK_DGRAM wants.
2040 */ 2041 unix_state_unlock(sk); 2042 err = -EPIPE; 2043 } else if (unix_peer(sk) == other) { 2044 unix_peer(sk) = NULL; 2045 unix_dgram_peer_wake_disconnect_wakeup(sk, other); 2046 2047 sk->sk_state = TCP_CLOSE; 2048 unix_state_unlock(sk); 2049 2050 unix_dgram_disconnected(sk, other); 2051 sock_put(other); 2052 err = -ECONNREFUSED; 2053 } else { 2054 unix_state_unlock(sk); 2055 } 2056 2057 other = NULL; 2058 if (err) 2059 goto out_free; 2060 goto restart; 2061 } 2062 2063 err = -EPIPE; 2064 if (other->sk_shutdown & RCV_SHUTDOWN) 2065 goto out_unlock; 2066 2067 if (sk->sk_type != SOCK_SEQPACKET) { 2068 err = security_unix_may_send(sk->sk_socket, other->sk_socket); 2069 if (err) 2070 goto out_unlock; 2071 } 2072 2073 /* other == sk && unix_peer(other) != sk if 2074 * - unix_peer(sk) == NULL, destination address bound to sk 2075 * - unix_peer(sk) == sk by time of get but disconnected before lock 2076 */ 2077 if (other != sk && 2078 unlikely(unix_peer(other) != sk && 2079 unix_recvq_full_lockless(other))) { 2080 if (timeo) { 2081 timeo = unix_wait_for_peer(other, timeo); 2082 2083 err = sock_intr_errno(timeo); 2084 if (signal_pending(current)) 2085 goto out_free; 2086 2087 goto restart; 2088 } 2089 2090 if (!sk_locked) { 2091 unix_state_unlock(other); 2092 unix_state_double_lock(sk, other); 2093 } 2094 2095 if (unix_peer(sk) != other || 2096 unix_dgram_peer_wake_me(sk, other)) { 2097 err = -EAGAIN; 2098 sk_locked = 1; 2099 goto out_unlock; 2100 } 2101 2102 if (!sk_locked) { 2103 sk_locked = 1; 2104 goto restart_locked; 2105 } 2106 } 2107 2108 if (unlikely(sk_locked)) 2109 unix_state_unlock(sk); 2110 2111 if (sock_flag(other, SOCK_RCVTSTAMP)) 2112 __net_timestamp(skb); 2113 maybe_add_creds(skb, sock, other); 2114 scm_stat_add(other, skb); 2115 skb_queue_tail(&other->sk_receive_queue, skb); 2116 unix_state_unlock(other); 2117 other->sk_data_ready(other); 2118 sock_put(other); 2119 scm_destroy(&scm); 2120 return len; 2121 2122 out_unlock: 2123 if (sk_locked) 2124 unix_state_unlock(sk); 2125 unix_state_unlock(other); 2126 out_free: 2127 kfree_skb(skb); 2128 out: 2129 if (other) 2130 sock_put(other); 2131 scm_destroy(&scm); 2132 return err; 2133 } 2134 2135 /* We use paged skbs for stream sockets, and limit occupancy to 32768 2136 * bytes, and a minimum of a full page. 
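 *
 * UNIX_SKB_FRAGS_SZ below rounds that 32768-byte budget up to whole pages:
 * with 4 KiB pages it is eight pages (order 3), while with 64 KiB pages the
 * "minimum of a full page" case applies and it is a single page.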
2137 */ 2138 #define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768)) 2139 2140 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 2141 static int queue_oob(struct socket *sock, struct msghdr *msg, struct sock *other, 2142 struct scm_cookie *scm, bool fds_sent) 2143 { 2144 struct unix_sock *ousk = unix_sk(other); 2145 struct sk_buff *skb; 2146 int err = 0; 2147 2148 skb = sock_alloc_send_skb(sock->sk, 1, msg->msg_flags & MSG_DONTWAIT, &err); 2149 2150 if (!skb) 2151 return err; 2152 2153 err = unix_scm_to_skb(scm, skb, !fds_sent); 2154 if (err < 0) { 2155 kfree_skb(skb); 2156 return err; 2157 } 2158 skb_put(skb, 1); 2159 err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, 1); 2160 2161 if (err) { 2162 kfree_skb(skb); 2163 return err; 2164 } 2165 2166 unix_state_lock(other); 2167 2168 if (sock_flag(other, SOCK_DEAD) || 2169 (other->sk_shutdown & RCV_SHUTDOWN)) { 2170 unix_state_unlock(other); 2171 kfree_skb(skb); 2172 return -EPIPE; 2173 } 2174 2175 maybe_add_creds(skb, sock, other); 2176 skb_get(skb); 2177 2178 scm_stat_add(other, skb); 2179 2180 spin_lock(&other->sk_receive_queue.lock); 2181 if (ousk->oob_skb) 2182 consume_skb(ousk->oob_skb); 2183 WRITE_ONCE(ousk->oob_skb, skb); 2184 __skb_queue_tail(&other->sk_receive_queue, skb); 2185 spin_unlock(&other->sk_receive_queue.lock); 2186 2187 sk_send_sigurg(other); 2188 unix_state_unlock(other); 2189 other->sk_data_ready(other); 2190 2191 return err; 2192 } 2193 #endif 2194 2195 static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, 2196 size_t len) 2197 { 2198 struct sock *sk = sock->sk; 2199 struct sock *other = NULL; 2200 int err, size; 2201 struct sk_buff *skb; 2202 int sent = 0; 2203 struct scm_cookie scm; 2204 bool fds_sent = false; 2205 int data_len; 2206 2207 err = scm_send(sock, msg, &scm, false); 2208 if (err < 0) 2209 return err; 2210 2211 wait_for_unix_gc(scm.fp); 2212 2213 err = -EOPNOTSUPP; 2214 if (msg->msg_flags & MSG_OOB) { 2215 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 2216 if (len) 2217 len--; 2218 else 2219 #endif 2220 goto out_err; 2221 } 2222 2223 if (msg->msg_namelen) { 2224 err = sk->sk_state == TCP_ESTABLISHED ? 
-EISCONN : -EOPNOTSUPP; 2225 goto out_err; 2226 } else { 2227 err = -ENOTCONN; 2228 other = unix_peer(sk); 2229 if (!other) 2230 goto out_err; 2231 } 2232 2233 if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN) 2234 goto pipe_err; 2235 2236 while (sent < len) { 2237 size = len - sent; 2238 2239 if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) { 2240 skb = sock_alloc_send_pskb(sk, 0, 0, 2241 msg->msg_flags & MSG_DONTWAIT, 2242 &err, 0); 2243 } else { 2244 /* Keep two messages in the pipe so it schedules better */ 2245 size = min_t(int, size, (sk->sk_sndbuf >> 1) - 64); 2246 2247 /* allow fallback to order-0 allocations */ 2248 size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ); 2249 2250 data_len = max_t(int, 0, size - SKB_MAX_HEAD(0)); 2251 2252 data_len = min_t(size_t, size, PAGE_ALIGN(data_len)); 2253 2254 skb = sock_alloc_send_pskb(sk, size - data_len, data_len, 2255 msg->msg_flags & MSG_DONTWAIT, &err, 2256 get_order(UNIX_SKB_FRAGS_SZ)); 2257 } 2258 if (!skb) 2259 goto out_err; 2260 2261 /* Only send the fds in the first buffer */ 2262 err = unix_scm_to_skb(&scm, skb, !fds_sent); 2263 if (err < 0) { 2264 kfree_skb(skb); 2265 goto out_err; 2266 } 2267 fds_sent = true; 2268 2269 if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) { 2270 err = skb_splice_from_iter(skb, &msg->msg_iter, size, 2271 sk->sk_allocation); 2272 if (err < 0) { 2273 kfree_skb(skb); 2274 goto out_err; 2275 } 2276 size = err; 2277 refcount_add(size, &sk->sk_wmem_alloc); 2278 } else { 2279 skb_put(skb, size - data_len); 2280 skb->data_len = data_len; 2281 skb->len = size; 2282 err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size); 2283 if (err) { 2284 kfree_skb(skb); 2285 goto out_err; 2286 } 2287 } 2288 2289 unix_state_lock(other); 2290 2291 if (sock_flag(other, SOCK_DEAD) || 2292 (other->sk_shutdown & RCV_SHUTDOWN)) 2293 goto pipe_err_free; 2294 2295 maybe_add_creds(skb, sock, other); 2296 scm_stat_add(other, skb); 2297 skb_queue_tail(&other->sk_receive_queue, skb); 2298 unix_state_unlock(other); 2299 other->sk_data_ready(other); 2300 sent += size; 2301 } 2302 2303 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 2304 if (msg->msg_flags & MSG_OOB) { 2305 err = queue_oob(sock, msg, other, &scm, fds_sent); 2306 if (err) 2307 goto out_err; 2308 sent++; 2309 } 2310 #endif 2311 2312 scm_destroy(&scm); 2313 2314 return sent; 2315 2316 pipe_err_free: 2317 unix_state_unlock(other); 2318 kfree_skb(skb); 2319 pipe_err: 2320 if (sent == 0 && !(msg->msg_flags&MSG_NOSIGNAL)) 2321 send_sig(SIGPIPE, current, 0); 2322 err = -EPIPE; 2323 out_err: 2324 scm_destroy(&scm); 2325 return sent ? 
: err; 2326 } 2327 2328 static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg, 2329 size_t len) 2330 { 2331 int err; 2332 struct sock *sk = sock->sk; 2333 2334 err = sock_error(sk); 2335 if (err) 2336 return err; 2337 2338 if (sk->sk_state != TCP_ESTABLISHED) 2339 return -ENOTCONN; 2340 2341 if (msg->msg_namelen) 2342 msg->msg_namelen = 0; 2343 2344 return unix_dgram_sendmsg(sock, msg, len); 2345 } 2346 2347 static int unix_seqpacket_recvmsg(struct socket *sock, struct msghdr *msg, 2348 size_t size, int flags) 2349 { 2350 struct sock *sk = sock->sk; 2351 2352 if (sk->sk_state != TCP_ESTABLISHED) 2353 return -ENOTCONN; 2354 2355 return unix_dgram_recvmsg(sock, msg, size, flags); 2356 } 2357 2358 static void unix_copy_addr(struct msghdr *msg, struct sock *sk) 2359 { 2360 struct unix_address *addr = smp_load_acquire(&unix_sk(sk)->addr); 2361 2362 if (addr) { 2363 msg->msg_namelen = addr->len; 2364 memcpy(msg->msg_name, addr->name, addr->len); 2365 } 2366 } 2367 2368 int __unix_dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t size, 2369 int flags) 2370 { 2371 struct scm_cookie scm; 2372 struct socket *sock = sk->sk_socket; 2373 struct unix_sock *u = unix_sk(sk); 2374 struct sk_buff *skb, *last; 2375 long timeo; 2376 int skip; 2377 int err; 2378 2379 err = -EOPNOTSUPP; 2380 if (flags&MSG_OOB) 2381 goto out; 2382 2383 timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); 2384 2385 do { 2386 mutex_lock(&u->iolock); 2387 2388 skip = sk_peek_offset(sk, flags); 2389 skb = __skb_try_recv_datagram(sk, &sk->sk_receive_queue, flags, 2390 &skip, &err, &last); 2391 if (skb) { 2392 if (!(flags & MSG_PEEK)) 2393 scm_stat_del(sk, skb); 2394 break; 2395 } 2396 2397 mutex_unlock(&u->iolock); 2398 2399 if (err != -EAGAIN) 2400 break; 2401 } while (timeo && 2402 !__skb_wait_for_more_packets(sk, &sk->sk_receive_queue, 2403 &err, &timeo, last)); 2404 2405 if (!skb) { /* implies iolock unlocked */ 2406 unix_state_lock(sk); 2407 /* Signal EOF on disconnected non-blocking SEQPACKET socket. */ 2408 if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN && 2409 (sk->sk_shutdown & RCV_SHUTDOWN)) 2410 err = 0; 2411 unix_state_unlock(sk); 2412 goto out; 2413 } 2414 2415 if (wq_has_sleeper(&u->peer_wait)) 2416 wake_up_interruptible_sync_poll(&u->peer_wait, 2417 EPOLLOUT | EPOLLWRNORM | 2418 EPOLLWRBAND); 2419 2420 if (msg->msg_name) { 2421 unix_copy_addr(msg, skb->sk); 2422 2423 BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk, 2424 msg->msg_name, 2425 &msg->msg_namelen); 2426 } 2427 2428 if (size > skb->len - skip) 2429 size = skb->len - skip; 2430 else if (size < skb->len - skip) 2431 msg->msg_flags |= MSG_TRUNC; 2432 2433 err = skb_copy_datagram_msg(skb, skip, msg, size); 2434 if (err) 2435 goto out_free; 2436 2437 if (sock_flag(sk, SOCK_RCVTSTAMP)) 2438 __sock_recv_timestamp(msg, sk, skb); 2439 2440 memset(&scm, 0, sizeof(scm)); 2441 2442 scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid); 2443 unix_set_secdata(&scm, skb); 2444 2445 if (!(flags & MSG_PEEK)) { 2446 if (UNIXCB(skb).fp) 2447 unix_detach_fds(&scm, skb); 2448 2449 sk_peek_offset_bwd(sk, skb->len); 2450 } else { 2451 /* It is questionable: on PEEK we could: 2452 - do not return fds - good, but too simple 8) 2453 - return fds, and do not return them on read (old strategy, 2454 apparently wrong) 2455 - clone fds (I chose it for now, it is the most universal 2456 solution) 2457 2458 POSIX 1003.1g does not actually define this clearly 2459 at all. POSIX 1003.1g doesn't define a lot of things 2460 clearly however! 
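		   (Cloning means a MSG_PEEK reader is handed duplicate
		   references to the passed files while the descriptors stay
		   attached to the queued skb, so a later non-PEEK read still
		   receives them.)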
2461 2462 */ 2463 2464 sk_peek_offset_fwd(sk, size); 2465 2466 if (UNIXCB(skb).fp) 2467 unix_peek_fds(&scm, skb); 2468 } 2469 err = (flags & MSG_TRUNC) ? skb->len - skip : size; 2470 2471 scm_recv_unix(sock, msg, &scm, flags); 2472 2473 out_free: 2474 skb_free_datagram(sk, skb); 2475 mutex_unlock(&u->iolock); 2476 out: 2477 return err; 2478 } 2479 2480 static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, 2481 int flags) 2482 { 2483 struct sock *sk = sock->sk; 2484 2485 #ifdef CONFIG_BPF_SYSCALL 2486 const struct proto *prot = READ_ONCE(sk->sk_prot); 2487 2488 if (prot != &unix_dgram_proto) 2489 return prot->recvmsg(sk, msg, size, flags, NULL); 2490 #endif 2491 return __unix_dgram_recvmsg(sk, msg, size, flags); 2492 } 2493 2494 static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor) 2495 { 2496 struct unix_sock *u = unix_sk(sk); 2497 struct sk_buff *skb; 2498 int err; 2499 2500 mutex_lock(&u->iolock); 2501 skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err); 2502 mutex_unlock(&u->iolock); 2503 if (!skb) 2504 return err; 2505 2506 return recv_actor(sk, skb); 2507 } 2508 2509 /* 2510 * Sleep until more data has arrived. But check for races.. 2511 */ 2512 static long unix_stream_data_wait(struct sock *sk, long timeo, 2513 struct sk_buff *last, unsigned int last_len, 2514 bool freezable) 2515 { 2516 unsigned int state = TASK_INTERRUPTIBLE | freezable * TASK_FREEZABLE; 2517 struct sk_buff *tail; 2518 DEFINE_WAIT(wait); 2519 2520 unix_state_lock(sk); 2521 2522 for (;;) { 2523 prepare_to_wait(sk_sleep(sk), &wait, state); 2524 2525 tail = skb_peek_tail(&sk->sk_receive_queue); 2526 if (tail != last || 2527 (tail && tail->len != last_len) || 2528 sk->sk_err || 2529 (sk->sk_shutdown & RCV_SHUTDOWN) || 2530 signal_pending(current) || 2531 !timeo) 2532 break; 2533 2534 sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); 2535 unix_state_unlock(sk); 2536 timeo = schedule_timeout(timeo); 2537 unix_state_lock(sk); 2538 2539 if (sock_flag(sk, SOCK_DEAD)) 2540 break; 2541 2542 sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); 2543 } 2544 2545 finish_wait(sk_sleep(sk), &wait); 2546 unix_state_unlock(sk); 2547 return timeo; 2548 } 2549 2550 static unsigned int unix_skb_len(const struct sk_buff *skb) 2551 { 2552 return skb->len - UNIXCB(skb).consumed; 2553 } 2554 2555 struct unix_stream_read_state { 2556 int (*recv_actor)(struct sk_buff *, int, int, 2557 struct unix_stream_read_state *); 2558 struct socket *socket; 2559 struct msghdr *msg; 2560 struct pipe_inode_info *pipe; 2561 size_t size; 2562 int flags; 2563 unsigned int splice_flags; 2564 }; 2565 2566 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 2567 static int unix_stream_recv_urg(struct unix_stream_read_state *state) 2568 { 2569 struct socket *sock = state->socket; 2570 struct sock *sk = sock->sk; 2571 struct unix_sock *u = unix_sk(sk); 2572 int chunk = 1; 2573 struct sk_buff *oob_skb; 2574 2575 mutex_lock(&u->iolock); 2576 unix_state_lock(sk); 2577 spin_lock(&sk->sk_receive_queue.lock); 2578 2579 if (sock_flag(sk, SOCK_URGINLINE) || !u->oob_skb) { 2580 spin_unlock(&sk->sk_receive_queue.lock); 2581 unix_state_unlock(sk); 2582 mutex_unlock(&u->iolock); 2583 return -EINVAL; 2584 } 2585 2586 oob_skb = u->oob_skb; 2587 2588 if (!(state->flags & MSG_PEEK)) 2589 WRITE_ONCE(u->oob_skb, NULL); 2590 else 2591 skb_get(oob_skb); 2592 2593 spin_unlock(&sk->sk_receive_queue.lock); 2594 unix_state_unlock(sk); 2595 2596 chunk = state->recv_actor(oob_skb, 0, chunk, state); 2597 2598 if (!(state->flags & MSG_PEEK)) 2599 UNIXCB(oob_skb).consumed += 1; 2600 2601 
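	/* Drop the reference that u->oob_skb held (or the extra one taken
	 * above for MSG_PEEK); the skb itself stays on the receive queue
	 * until a regular read reaps it.
	 */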
consume_skb(oob_skb); 2602 2603 mutex_unlock(&u->iolock); 2604 2605 if (chunk < 0) 2606 return -EFAULT; 2607 2608 state->msg->msg_flags |= MSG_OOB; 2609 return 1; 2610 } 2611 2612 static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk, 2613 int flags, int copied) 2614 { 2615 struct unix_sock *u = unix_sk(sk); 2616 2617 if (!unix_skb_len(skb) && !(flags & MSG_PEEK)) { 2618 skb_unlink(skb, &sk->sk_receive_queue); 2619 consume_skb(skb); 2620 skb = NULL; 2621 } else { 2622 struct sk_buff *unlinked_skb = NULL; 2623 2624 spin_lock(&sk->sk_receive_queue.lock); 2625 2626 if (skb == u->oob_skb) { 2627 if (copied) { 2628 skb = NULL; 2629 } else if (sock_flag(sk, SOCK_URGINLINE)) { 2630 if (!(flags & MSG_PEEK)) { 2631 WRITE_ONCE(u->oob_skb, NULL); 2632 consume_skb(skb); 2633 } 2634 } else if (flags & MSG_PEEK) { 2635 skb = NULL; 2636 } else { 2637 __skb_unlink(skb, &sk->sk_receive_queue); 2638 WRITE_ONCE(u->oob_skb, NULL); 2639 unlinked_skb = skb; 2640 skb = skb_peek(&sk->sk_receive_queue); 2641 } 2642 } 2643 2644 spin_unlock(&sk->sk_receive_queue.lock); 2645 2646 if (unlinked_skb) { 2647 WARN_ON_ONCE(skb_unref(unlinked_skb)); 2648 kfree_skb(unlinked_skb); 2649 } 2650 } 2651 return skb; 2652 } 2653 #endif 2654 2655 static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor) 2656 { 2657 if (unlikely(sk->sk_state != TCP_ESTABLISHED)) 2658 return -ENOTCONN; 2659 2660 return unix_read_skb(sk, recv_actor); 2661 } 2662 2663 static int unix_stream_read_generic(struct unix_stream_read_state *state, 2664 bool freezable) 2665 { 2666 struct scm_cookie scm; 2667 struct socket *sock = state->socket; 2668 struct sock *sk = sock->sk; 2669 struct unix_sock *u = unix_sk(sk); 2670 int copied = 0; 2671 int flags = state->flags; 2672 int noblock = flags & MSG_DONTWAIT; 2673 bool check_creds = false; 2674 int target; 2675 int err = 0; 2676 long timeo; 2677 int skip; 2678 size_t size = state->size; 2679 unsigned int last_len; 2680 2681 if (unlikely(sk->sk_state != TCP_ESTABLISHED)) { 2682 err = -EINVAL; 2683 goto out; 2684 } 2685 2686 if (unlikely(flags & MSG_OOB)) { 2687 err = -EOPNOTSUPP; 2688 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 2689 err = unix_stream_recv_urg(state); 2690 #endif 2691 goto out; 2692 } 2693 2694 target = sock_rcvlowat(sk, flags & MSG_WAITALL, size); 2695 timeo = sock_rcvtimeo(sk, noblock); 2696 2697 memset(&scm, 0, sizeof(scm)); 2698 2699 /* Lock the socket to prevent queue disordering 2700 * while sleeps in memcpy_tomsg 2701 */ 2702 mutex_lock(&u->iolock); 2703 2704 skip = max(sk_peek_offset(sk, flags), 0); 2705 2706 do { 2707 int chunk; 2708 bool drop_skb; 2709 struct sk_buff *skb, *last; 2710 2711 redo: 2712 unix_state_lock(sk); 2713 if (sock_flag(sk, SOCK_DEAD)) { 2714 err = -ECONNRESET; 2715 goto unlock; 2716 } 2717 last = skb = skb_peek(&sk->sk_receive_queue); 2718 last_len = last ? last->len : 0; 2719 2720 again: 2721 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 2722 if (skb) { 2723 skb = manage_oob(skb, sk, flags, copied); 2724 if (!skb && copied) { 2725 unix_state_unlock(sk); 2726 break; 2727 } 2728 } 2729 #endif 2730 if (skb == NULL) { 2731 if (copied >= target) 2732 goto unlock; 2733 2734 /* 2735 * POSIX 1003.1g mandates this order. 
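			 *
			 *	(A pending socket error must be reported before
			 *	the EOF implied by a receive-side shutdown.)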
2736 */ 2737 2738 err = sock_error(sk); 2739 if (err) 2740 goto unlock; 2741 if (sk->sk_shutdown & RCV_SHUTDOWN) 2742 goto unlock; 2743 2744 unix_state_unlock(sk); 2745 if (!timeo) { 2746 err = -EAGAIN; 2747 break; 2748 } 2749 2750 mutex_unlock(&u->iolock); 2751 2752 timeo = unix_stream_data_wait(sk, timeo, last, 2753 last_len, freezable); 2754 2755 if (signal_pending(current)) { 2756 err = sock_intr_errno(timeo); 2757 scm_destroy(&scm); 2758 goto out; 2759 } 2760 2761 mutex_lock(&u->iolock); 2762 goto redo; 2763 unlock: 2764 unix_state_unlock(sk); 2765 break; 2766 } 2767 2768 while (skip >= unix_skb_len(skb)) { 2769 skip -= unix_skb_len(skb); 2770 last = skb; 2771 last_len = skb->len; 2772 skb = skb_peek_next(skb, &sk->sk_receive_queue); 2773 if (!skb) 2774 goto again; 2775 } 2776 2777 unix_state_unlock(sk); 2778 2779 if (check_creds) { 2780 /* Never glue messages from different writers */ 2781 if (!unix_skb_scm_eq(skb, &scm)) 2782 break; 2783 } else if (test_bit(SOCK_PASSCRED, &sock->flags) || 2784 test_bit(SOCK_PASSPIDFD, &sock->flags)) { 2785 /* Copy credentials */ 2786 scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid); 2787 unix_set_secdata(&scm, skb); 2788 check_creds = true; 2789 } 2790 2791 /* Copy address just once */ 2792 if (state->msg && state->msg->msg_name) { 2793 DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, 2794 state->msg->msg_name); 2795 unix_copy_addr(state->msg, skb->sk); 2796 2797 BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk, 2798 state->msg->msg_name, 2799 &state->msg->msg_namelen); 2800 2801 sunaddr = NULL; 2802 } 2803 2804 chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size); 2805 skb_get(skb); 2806 chunk = state->recv_actor(skb, skip, chunk, state); 2807 drop_skb = !unix_skb_len(skb); 2808 /* skb is only safe to use if !drop_skb */ 2809 consume_skb(skb); 2810 if (chunk < 0) { 2811 if (copied == 0) 2812 copied = -EFAULT; 2813 break; 2814 } 2815 copied += chunk; 2816 size -= chunk; 2817 2818 if (drop_skb) { 2819 /* the skb was touched by a concurrent reader; 2820 * we should not expect anything from this skb 2821 * anymore and assume it invalid - we can be 2822 * sure it was dropped from the socket queue 2823 * 2824 * let's report a short read 2825 */ 2826 err = 0; 2827 break; 2828 } 2829 2830 /* Mark read part of skb as used */ 2831 if (!(flags & MSG_PEEK)) { 2832 UNIXCB(skb).consumed += chunk; 2833 2834 sk_peek_offset_bwd(sk, chunk); 2835 2836 if (UNIXCB(skb).fp) { 2837 scm_stat_del(sk, skb); 2838 unix_detach_fds(&scm, skb); 2839 } 2840 2841 if (unix_skb_len(skb)) 2842 break; 2843 2844 skb_unlink(skb, &sk->sk_receive_queue); 2845 consume_skb(skb); 2846 2847 if (scm.fp) 2848 break; 2849 } else { 2850 /* It is questionable, see note in unix_dgram_recvmsg. 2851 */ 2852 if (UNIXCB(skb).fp) 2853 unix_peek_fds(&scm, skb); 2854 2855 sk_peek_offset_fwd(sk, chunk); 2856 2857 if (UNIXCB(skb).fp) 2858 break; 2859 2860 skip = 0; 2861 last = skb; 2862 last_len = skb->len; 2863 unix_state_lock(sk); 2864 skb = skb_peek_next(skb, &sk->sk_receive_queue); 2865 if (skb) 2866 goto again; 2867 unix_state_unlock(sk); 2868 break; 2869 } 2870 } while (size); 2871 2872 mutex_unlock(&u->iolock); 2873 if (state->msg) 2874 scm_recv_unix(sock, state->msg, &scm, flags); 2875 else 2876 scm_destroy(&scm); 2877 out: 2878 return copied ? 
: err; 2879 } 2880 2881 static int unix_stream_read_actor(struct sk_buff *skb, 2882 int skip, int chunk, 2883 struct unix_stream_read_state *state) 2884 { 2885 int ret; 2886 2887 ret = skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip, 2888 state->msg, chunk); 2889 return ret ?: chunk; 2890 } 2891 2892 int __unix_stream_recvmsg(struct sock *sk, struct msghdr *msg, 2893 size_t size, int flags) 2894 { 2895 struct unix_stream_read_state state = { 2896 .recv_actor = unix_stream_read_actor, 2897 .socket = sk->sk_socket, 2898 .msg = msg, 2899 .size = size, 2900 .flags = flags 2901 }; 2902 2903 return unix_stream_read_generic(&state, true); 2904 } 2905 2906 static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg, 2907 size_t size, int flags) 2908 { 2909 struct unix_stream_read_state state = { 2910 .recv_actor = unix_stream_read_actor, 2911 .socket = sock, 2912 .msg = msg, 2913 .size = size, 2914 .flags = flags 2915 }; 2916 2917 #ifdef CONFIG_BPF_SYSCALL 2918 struct sock *sk = sock->sk; 2919 const struct proto *prot = READ_ONCE(sk->sk_prot); 2920 2921 if (prot != &unix_stream_proto) 2922 return prot->recvmsg(sk, msg, size, flags, NULL); 2923 #endif 2924 return unix_stream_read_generic(&state, true); 2925 } 2926 2927 static int unix_stream_splice_actor(struct sk_buff *skb, 2928 int skip, int chunk, 2929 struct unix_stream_read_state *state) 2930 { 2931 return skb_splice_bits(skb, state->socket->sk, 2932 UNIXCB(skb).consumed + skip, 2933 state->pipe, chunk, state->splice_flags); 2934 } 2935 2936 static ssize_t unix_stream_splice_read(struct socket *sock, loff_t *ppos, 2937 struct pipe_inode_info *pipe, 2938 size_t size, unsigned int flags) 2939 { 2940 struct unix_stream_read_state state = { 2941 .recv_actor = unix_stream_splice_actor, 2942 .socket = sock, 2943 .pipe = pipe, 2944 .size = size, 2945 .splice_flags = flags, 2946 }; 2947 2948 if (unlikely(*ppos)) 2949 return -ESPIPE; 2950 2951 if (sock->file->f_flags & O_NONBLOCK || 2952 flags & SPLICE_F_NONBLOCK) 2953 state.flags = MSG_DONTWAIT; 2954 2955 return unix_stream_read_generic(&state, false); 2956 } 2957 2958 static int unix_shutdown(struct socket *sock, int mode) 2959 { 2960 struct sock *sk = sock->sk; 2961 struct sock *other; 2962 2963 if (mode < SHUT_RD || mode > SHUT_RDWR) 2964 return -EINVAL; 2965 /* This maps: 2966 * SHUT_RD (0) -> RCV_SHUTDOWN (1) 2967 * SHUT_WR (1) -> SEND_SHUTDOWN (2) 2968 * SHUT_RDWR (2) -> SHUTDOWN_MASK (3) 2969 */ 2970 ++mode; 2971 2972 unix_state_lock(sk); 2973 WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | mode); 2974 other = unix_peer(sk); 2975 if (other) 2976 sock_hold(other); 2977 unix_state_unlock(sk); 2978 sk->sk_state_change(sk); 2979 2980 if (other && 2981 (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) { 2982 2983 int peer_mode = 0; 2984 const struct proto *prot = READ_ONCE(other->sk_prot); 2985 2986 if (prot->unhash) 2987 prot->unhash(other); 2988 if (mode&RCV_SHUTDOWN) 2989 peer_mode |= SEND_SHUTDOWN; 2990 if (mode&SEND_SHUTDOWN) 2991 peer_mode |= RCV_SHUTDOWN; 2992 unix_state_lock(other); 2993 WRITE_ONCE(other->sk_shutdown, other->sk_shutdown | peer_mode); 2994 unix_state_unlock(other); 2995 other->sk_state_change(other); 2996 if (peer_mode == SHUTDOWN_MASK) 2997 sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP); 2998 else if (peer_mode & RCV_SHUTDOWN) 2999 sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN); 3000 } 3001 if (other) 3002 sock_put(other); 3003 3004 return 0; 3005 } 3006 3007 long unix_inq_len(struct sock *sk) 3008 { 3009 struct sk_buff *skb; 3010 long amount = 
0; 3011 3012 if (sk->sk_state == TCP_LISTEN) 3013 return -EINVAL; 3014 3015 spin_lock(&sk->sk_receive_queue.lock); 3016 if (sk->sk_type == SOCK_STREAM || 3017 sk->sk_type == SOCK_SEQPACKET) { 3018 skb_queue_walk(&sk->sk_receive_queue, skb) 3019 amount += unix_skb_len(skb); 3020 } else { 3021 skb = skb_peek(&sk->sk_receive_queue); 3022 if (skb) 3023 amount = skb->len; 3024 } 3025 spin_unlock(&sk->sk_receive_queue.lock); 3026 3027 return amount; 3028 } 3029 EXPORT_SYMBOL_GPL(unix_inq_len); 3030 3031 long unix_outq_len(struct sock *sk) 3032 { 3033 return sk_wmem_alloc_get(sk); 3034 } 3035 EXPORT_SYMBOL_GPL(unix_outq_len); 3036 3037 static int unix_open_file(struct sock *sk) 3038 { 3039 struct path path; 3040 struct file *f; 3041 int fd; 3042 3043 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) 3044 return -EPERM; 3045 3046 if (!smp_load_acquire(&unix_sk(sk)->addr)) 3047 return -ENOENT; 3048 3049 path = unix_sk(sk)->path; 3050 if (!path.dentry) 3051 return -ENOENT; 3052 3053 path_get(&path); 3054 3055 fd = get_unused_fd_flags(O_CLOEXEC); 3056 if (fd < 0) 3057 goto out; 3058 3059 f = dentry_open(&path, O_PATH, current_cred()); 3060 if (IS_ERR(f)) { 3061 put_unused_fd(fd); 3062 fd = PTR_ERR(f); 3063 goto out; 3064 } 3065 3066 fd_install(fd, f); 3067 out: 3068 path_put(&path); 3069 3070 return fd; 3071 } 3072 3073 static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) 3074 { 3075 struct sock *sk = sock->sk; 3076 long amount = 0; 3077 int err; 3078 3079 switch (cmd) { 3080 case SIOCOUTQ: 3081 amount = unix_outq_len(sk); 3082 err = put_user(amount, (int __user *)arg); 3083 break; 3084 case SIOCINQ: 3085 amount = unix_inq_len(sk); 3086 if (amount < 0) 3087 err = amount; 3088 else 3089 err = put_user(amount, (int __user *)arg); 3090 break; 3091 case SIOCUNIXFILE: 3092 err = unix_open_file(sk); 3093 break; 3094 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 3095 case SIOCATMARK: 3096 { 3097 struct sk_buff *skb; 3098 int answ = 0; 3099 3100 skb = skb_peek(&sk->sk_receive_queue); 3101 if (skb && skb == READ_ONCE(unix_sk(sk)->oob_skb)) 3102 answ = 1; 3103 err = put_user(answ, (int __user *)arg); 3104 } 3105 break; 3106 #endif 3107 default: 3108 err = -ENOIOCTLCMD; 3109 break; 3110 } 3111 return err; 3112 } 3113 3114 #ifdef CONFIG_COMPAT 3115 static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) 3116 { 3117 return unix_ioctl(sock, cmd, (unsigned long)compat_ptr(arg)); 3118 } 3119 #endif 3120 3121 static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wait) 3122 { 3123 struct sock *sk = sock->sk; 3124 __poll_t mask; 3125 u8 shutdown; 3126 3127 sock_poll_wait(file, sock, wait); 3128 mask = 0; 3129 shutdown = READ_ONCE(sk->sk_shutdown); 3130 3131 /* exceptional events? */ 3132 if (READ_ONCE(sk->sk_err)) 3133 mask |= EPOLLERR; 3134 if (shutdown == SHUTDOWN_MASK) 3135 mask |= EPOLLHUP; 3136 if (shutdown & RCV_SHUTDOWN) 3137 mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM; 3138 3139 /* readable? 
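	   (sk_is_readable() also reports data parked in a BPF psock,
	   e.g. via sockmap, not just the plain receive queue)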
*/ 3140 if (!skb_queue_empty_lockless(&sk->sk_receive_queue)) 3141 mask |= EPOLLIN | EPOLLRDNORM; 3142 if (sk_is_readable(sk)) 3143 mask |= EPOLLIN | EPOLLRDNORM; 3144 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 3145 if (READ_ONCE(unix_sk(sk)->oob_skb)) 3146 mask |= EPOLLPRI; 3147 #endif 3148 3149 /* Connection-based need to check for termination and startup */ 3150 if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) && 3151 sk->sk_state == TCP_CLOSE) 3152 mask |= EPOLLHUP; 3153 3154 /* 3155 * we set writable also when the other side has shut down the 3156 * connection. This prevents stuck sockets. 3157 */ 3158 if (unix_writable(sk)) 3159 mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND; 3160 3161 return mask; 3162 } 3163 3164 static __poll_t unix_dgram_poll(struct file *file, struct socket *sock, 3165 poll_table *wait) 3166 { 3167 struct sock *sk = sock->sk, *other; 3168 unsigned int writable; 3169 __poll_t mask; 3170 u8 shutdown; 3171 3172 sock_poll_wait(file, sock, wait); 3173 mask = 0; 3174 shutdown = READ_ONCE(sk->sk_shutdown); 3175 3176 /* exceptional events? */ 3177 if (READ_ONCE(sk->sk_err) || 3178 !skb_queue_empty_lockless(&sk->sk_error_queue)) 3179 mask |= EPOLLERR | 3180 (sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0); 3181 3182 if (shutdown & RCV_SHUTDOWN) 3183 mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM; 3184 if (shutdown == SHUTDOWN_MASK) 3185 mask |= EPOLLHUP; 3186 3187 /* readable? */ 3188 if (!skb_queue_empty_lockless(&sk->sk_receive_queue)) 3189 mask |= EPOLLIN | EPOLLRDNORM; 3190 if (sk_is_readable(sk)) 3191 mask |= EPOLLIN | EPOLLRDNORM; 3192 3193 /* Connection-based need to check for termination and startup */ 3194 if (sk->sk_type == SOCK_SEQPACKET) { 3195 if (sk->sk_state == TCP_CLOSE) 3196 mask |= EPOLLHUP; 3197 /* connection hasn't started yet? */ 3198 if (sk->sk_state == TCP_SYN_SENT) 3199 return mask; 3200 } 3201 3202 /* No write status requested, avoid expensive OUT tests. 
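	   The writability check further down may take the socket lock and
	   register on the peer's wait queue, so it is skipped unless write
	   events were actually requested.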
*/ 3203 if (!(poll_requested_events(wait) & (EPOLLWRBAND|EPOLLWRNORM|EPOLLOUT))) 3204 return mask; 3205 3206 writable = unix_writable(sk); 3207 if (writable) { 3208 unix_state_lock(sk); 3209 3210 other = unix_peer(sk); 3211 if (other && unix_peer(other) != sk && 3212 unix_recvq_full_lockless(other) && 3213 unix_dgram_peer_wake_me(sk, other)) 3214 writable = 0; 3215 3216 unix_state_unlock(sk); 3217 } 3218 3219 if (writable) 3220 mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND; 3221 else 3222 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); 3223 3224 return mask; 3225 } 3226 3227 #ifdef CONFIG_PROC_FS 3228 3229 #define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1) 3230 3231 #define get_bucket(x) ((x) >> BUCKET_SPACE) 3232 #define get_offset(x) ((x) & ((1UL << BUCKET_SPACE) - 1)) 3233 #define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o)) 3234 3235 static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos) 3236 { 3237 unsigned long offset = get_offset(*pos); 3238 unsigned long bucket = get_bucket(*pos); 3239 unsigned long count = 0; 3240 struct sock *sk; 3241 3242 for (sk = sk_head(&seq_file_net(seq)->unx.table.buckets[bucket]); 3243 sk; sk = sk_next(sk)) { 3244 if (++count == offset) 3245 break; 3246 } 3247 3248 return sk; 3249 } 3250 3251 static struct sock *unix_get_first(struct seq_file *seq, loff_t *pos) 3252 { 3253 unsigned long bucket = get_bucket(*pos); 3254 struct net *net = seq_file_net(seq); 3255 struct sock *sk; 3256 3257 while (bucket < UNIX_HASH_SIZE) { 3258 spin_lock(&net->unx.table.locks[bucket]); 3259 3260 sk = unix_from_bucket(seq, pos); 3261 if (sk) 3262 return sk; 3263 3264 spin_unlock(&net->unx.table.locks[bucket]); 3265 3266 *pos = set_bucket_offset(++bucket, 1); 3267 } 3268 3269 return NULL; 3270 } 3271 3272 static struct sock *unix_get_next(struct seq_file *seq, struct sock *sk, 3273 loff_t *pos) 3274 { 3275 unsigned long bucket = get_bucket(*pos); 3276 3277 sk = sk_next(sk); 3278 if (sk) 3279 return sk; 3280 3281 3282 spin_unlock(&seq_file_net(seq)->unx.table.locks[bucket]); 3283 3284 *pos = set_bucket_offset(++bucket, 1); 3285 3286 return unix_get_first(seq, pos); 3287 } 3288 3289 static void *unix_seq_start(struct seq_file *seq, loff_t *pos) 3290 { 3291 if (!*pos) 3292 return SEQ_START_TOKEN; 3293 3294 return unix_get_first(seq, pos); 3295 } 3296 3297 static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos) 3298 { 3299 ++*pos; 3300 3301 if (v == SEQ_START_TOKEN) 3302 return unix_get_first(seq, pos); 3303 3304 return unix_get_next(seq, v, pos); 3305 } 3306 3307 static void unix_seq_stop(struct seq_file *seq, void *v) 3308 { 3309 struct sock *sk = v; 3310 3311 if (sk) 3312 spin_unlock(&seq_file_net(seq)->unx.table.locks[sk->sk_hash]); 3313 } 3314 3315 static int unix_seq_show(struct seq_file *seq, void *v) 3316 { 3317 3318 if (v == SEQ_START_TOKEN) 3319 seq_puts(seq, "Num RefCount Protocol Flags Type St " 3320 "Inode Path\n"); 3321 else { 3322 struct sock *s = v; 3323 struct unix_sock *u = unix_sk(s); 3324 unix_state_lock(s); 3325 3326 seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu", 3327 s, 3328 refcount_read(&s->sk_refcnt), 3329 0, 3330 s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0, 3331 s->sk_type, 3332 s->sk_socket ? 3333 (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) : 3334 (s->sk_state == TCP_ESTABLISHED ? 
SS_CONNECTING : SS_DISCONNECTING), 3335 sock_i_ino(s)); 3336 3337 if (u->addr) { // under a hash table lock here 3338 int i, len; 3339 seq_putc(seq, ' '); 3340 3341 i = 0; 3342 len = u->addr->len - 3343 offsetof(struct sockaddr_un, sun_path); 3344 if (u->addr->name->sun_path[0]) { 3345 len--; 3346 } else { 3347 seq_putc(seq, '@'); 3348 i++; 3349 } 3350 for ( ; i < len; i++) 3351 seq_putc(seq, u->addr->name->sun_path[i] ?: 3352 '@'); 3353 } 3354 unix_state_unlock(s); 3355 seq_putc(seq, '\n'); 3356 } 3357 3358 return 0; 3359 } 3360 3361 static const struct seq_operations unix_seq_ops = { 3362 .start = unix_seq_start, 3363 .next = unix_seq_next, 3364 .stop = unix_seq_stop, 3365 .show = unix_seq_show, 3366 }; 3367 3368 #ifdef CONFIG_BPF_SYSCALL 3369 struct bpf_unix_iter_state { 3370 struct seq_net_private p; 3371 unsigned int cur_sk; 3372 unsigned int end_sk; 3373 unsigned int max_sk; 3374 struct sock **batch; 3375 bool st_bucket_done; 3376 }; 3377 3378 struct bpf_iter__unix { 3379 __bpf_md_ptr(struct bpf_iter_meta *, meta); 3380 __bpf_md_ptr(struct unix_sock *, unix_sk); 3381 uid_t uid __aligned(8); 3382 }; 3383 3384 static int unix_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta, 3385 struct unix_sock *unix_sk, uid_t uid) 3386 { 3387 struct bpf_iter__unix ctx; 3388 3389 meta->seq_num--; /* skip SEQ_START_TOKEN */ 3390 ctx.meta = meta; 3391 ctx.unix_sk = unix_sk; 3392 ctx.uid = uid; 3393 return bpf_iter_run_prog(prog, &ctx); 3394 } 3395 3396 static int bpf_iter_unix_hold_batch(struct seq_file *seq, struct sock *start_sk) 3397 3398 { 3399 struct bpf_unix_iter_state *iter = seq->private; 3400 unsigned int expected = 1; 3401 struct sock *sk; 3402 3403 sock_hold(start_sk); 3404 iter->batch[iter->end_sk++] = start_sk; 3405 3406 for (sk = sk_next(start_sk); sk; sk = sk_next(sk)) { 3407 if (iter->end_sk < iter->max_sk) { 3408 sock_hold(sk); 3409 iter->batch[iter->end_sk++] = sk; 3410 } 3411 3412 expected++; 3413 } 3414 3415 spin_unlock(&seq_file_net(seq)->unx.table.locks[start_sk->sk_hash]); 3416 3417 return expected; 3418 } 3419 3420 static void bpf_iter_unix_put_batch(struct bpf_unix_iter_state *iter) 3421 { 3422 while (iter->cur_sk < iter->end_sk) 3423 sock_put(iter->batch[iter->cur_sk++]); 3424 } 3425 3426 static int bpf_iter_unix_realloc_batch(struct bpf_unix_iter_state *iter, 3427 unsigned int new_batch_sz) 3428 { 3429 struct sock **new_batch; 3430 3431 new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz, 3432 GFP_USER | __GFP_NOWARN); 3433 if (!new_batch) 3434 return -ENOMEM; 3435 3436 bpf_iter_unix_put_batch(iter); 3437 kvfree(iter->batch); 3438 iter->batch = new_batch; 3439 iter->max_sk = new_batch_sz; 3440 3441 return 0; 3442 } 3443 3444 static struct sock *bpf_iter_unix_batch(struct seq_file *seq, 3445 loff_t *pos) 3446 { 3447 struct bpf_unix_iter_state *iter = seq->private; 3448 unsigned int expected; 3449 bool resized = false; 3450 struct sock *sk; 3451 3452 if (iter->st_bucket_done) 3453 *pos = set_bucket_offset(get_bucket(*pos) + 1, 1); 3454 3455 again: 3456 /* Get a new batch */ 3457 iter->cur_sk = 0; 3458 iter->end_sk = 0; 3459 3460 sk = unix_get_first(seq, pos); 3461 if (!sk) 3462 return NULL; /* Done */ 3463 3464 expected = bpf_iter_unix_hold_batch(seq, sk); 3465 3466 if (iter->end_sk == expected) { 3467 iter->st_bucket_done = true; 3468 return sk; 3469 } 3470 3471 if (!resized && !bpf_iter_unix_realloc_batch(iter, expected * 3 / 2)) { 3472 resized = true; 3473 goto again; 3474 } 3475 3476 return sk; 3477 } 3478 3479 static void *bpf_iter_unix_seq_start(struct 
seq_file *seq, loff_t *pos) 3480 { 3481 if (!*pos) 3482 return SEQ_START_TOKEN; 3483 3484 /* bpf iter does not support lseek, so it always 3485 * continue from where it was stop()-ped. 3486 */ 3487 return bpf_iter_unix_batch(seq, pos); 3488 } 3489 3490 static void *bpf_iter_unix_seq_next(struct seq_file *seq, void *v, loff_t *pos) 3491 { 3492 struct bpf_unix_iter_state *iter = seq->private; 3493 struct sock *sk; 3494 3495 /* Whenever seq_next() is called, the iter->cur_sk is 3496 * done with seq_show(), so advance to the next sk in 3497 * the batch. 3498 */ 3499 if (iter->cur_sk < iter->end_sk) 3500 sock_put(iter->batch[iter->cur_sk++]); 3501 3502 ++*pos; 3503 3504 if (iter->cur_sk < iter->end_sk) 3505 sk = iter->batch[iter->cur_sk]; 3506 else 3507 sk = bpf_iter_unix_batch(seq, pos); 3508 3509 return sk; 3510 } 3511 3512 static int bpf_iter_unix_seq_show(struct seq_file *seq, void *v) 3513 { 3514 struct bpf_iter_meta meta; 3515 struct bpf_prog *prog; 3516 struct sock *sk = v; 3517 uid_t uid; 3518 bool slow; 3519 int ret; 3520 3521 if (v == SEQ_START_TOKEN) 3522 return 0; 3523 3524 slow = lock_sock_fast(sk); 3525 3526 if (unlikely(sk_unhashed(sk))) { 3527 ret = SEQ_SKIP; 3528 goto unlock; 3529 } 3530 3531 uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk)); 3532 meta.seq = seq; 3533 prog = bpf_iter_get_info(&meta, false); 3534 ret = unix_prog_seq_show(prog, &meta, v, uid); 3535 unlock: 3536 unlock_sock_fast(sk, slow); 3537 return ret; 3538 } 3539 3540 static void bpf_iter_unix_seq_stop(struct seq_file *seq, void *v) 3541 { 3542 struct bpf_unix_iter_state *iter = seq->private; 3543 struct bpf_iter_meta meta; 3544 struct bpf_prog *prog; 3545 3546 if (!v) { 3547 meta.seq = seq; 3548 prog = bpf_iter_get_info(&meta, true); 3549 if (prog) 3550 (void)unix_prog_seq_show(prog, &meta, v, 0); 3551 } 3552 3553 if (iter->cur_sk < iter->end_sk) 3554 bpf_iter_unix_put_batch(iter); 3555 } 3556 3557 static const struct seq_operations bpf_iter_unix_seq_ops = { 3558 .start = bpf_iter_unix_seq_start, 3559 .next = bpf_iter_unix_seq_next, 3560 .stop = bpf_iter_unix_seq_stop, 3561 .show = bpf_iter_unix_seq_show, 3562 }; 3563 #endif 3564 #endif 3565 3566 static const struct net_proto_family unix_family_ops = { 3567 .family = PF_UNIX, 3568 .create = unix_create, 3569 .owner = THIS_MODULE, 3570 }; 3571 3572 3573 static int __net_init unix_net_init(struct net *net) 3574 { 3575 int i; 3576 3577 net->unx.sysctl_max_dgram_qlen = 10; 3578 if (unix_sysctl_register(net)) 3579 goto out; 3580 3581 #ifdef CONFIG_PROC_FS 3582 if (!proc_create_net("unix", 0, net->proc_net, &unix_seq_ops, 3583 sizeof(struct seq_net_private))) 3584 goto err_sysctl; 3585 #endif 3586 3587 net->unx.table.locks = kvmalloc_array(UNIX_HASH_SIZE, 3588 sizeof(spinlock_t), GFP_KERNEL); 3589 if (!net->unx.table.locks) 3590 goto err_proc; 3591 3592 net->unx.table.buckets = kvmalloc_array(UNIX_HASH_SIZE, 3593 sizeof(struct hlist_head), 3594 GFP_KERNEL); 3595 if (!net->unx.table.buckets) 3596 goto free_locks; 3597 3598 for (i = 0; i < UNIX_HASH_SIZE; i++) { 3599 spin_lock_init(&net->unx.table.locks[i]); 3600 INIT_HLIST_HEAD(&net->unx.table.buckets[i]); 3601 } 3602 3603 return 0; 3604 3605 free_locks: 3606 kvfree(net->unx.table.locks); 3607 err_proc: 3608 #ifdef CONFIG_PROC_FS 3609 remove_proc_entry("unix", net->proc_net); 3610 err_sysctl: 3611 #endif 3612 unix_sysctl_unregister(net); 3613 out: 3614 return -ENOMEM; 3615 } 3616 3617 static void __net_exit unix_net_exit(struct net *net) 3618 { 3619 kvfree(net->unx.table.buckets); 3620 
kvfree(net->unx.table.locks); 3621 unix_sysctl_unregister(net); 3622 remove_proc_entry("unix", net->proc_net); 3623 } 3624 3625 static struct pernet_operations unix_net_ops = { 3626 .init = unix_net_init, 3627 .exit = unix_net_exit, 3628 }; 3629 3630 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) 3631 DEFINE_BPF_ITER_FUNC(unix, struct bpf_iter_meta *meta, 3632 struct unix_sock *unix_sk, uid_t uid) 3633 3634 #define INIT_BATCH_SZ 16 3635 3636 static int bpf_iter_init_unix(void *priv_data, struct bpf_iter_aux_info *aux) 3637 { 3638 struct bpf_unix_iter_state *iter = priv_data; 3639 int err; 3640 3641 err = bpf_iter_init_seq_net(priv_data, aux); 3642 if (err) 3643 return err; 3644 3645 err = bpf_iter_unix_realloc_batch(iter, INIT_BATCH_SZ); 3646 if (err) { 3647 bpf_iter_fini_seq_net(priv_data); 3648 return err; 3649 } 3650 3651 return 0; 3652 } 3653 3654 static void bpf_iter_fini_unix(void *priv_data) 3655 { 3656 struct bpf_unix_iter_state *iter = priv_data; 3657 3658 bpf_iter_fini_seq_net(priv_data); 3659 kvfree(iter->batch); 3660 } 3661 3662 static const struct bpf_iter_seq_info unix_seq_info = { 3663 .seq_ops = &bpf_iter_unix_seq_ops, 3664 .init_seq_private = bpf_iter_init_unix, 3665 .fini_seq_private = bpf_iter_fini_unix, 3666 .seq_priv_size = sizeof(struct bpf_unix_iter_state), 3667 }; 3668 3669 static const struct bpf_func_proto * 3670 bpf_iter_unix_get_func_proto(enum bpf_func_id func_id, 3671 const struct bpf_prog *prog) 3672 { 3673 switch (func_id) { 3674 case BPF_FUNC_setsockopt: 3675 return &bpf_sk_setsockopt_proto; 3676 case BPF_FUNC_getsockopt: 3677 return &bpf_sk_getsockopt_proto; 3678 default: 3679 return NULL; 3680 } 3681 } 3682 3683 static struct bpf_iter_reg unix_reg_info = { 3684 .target = "unix", 3685 .ctx_arg_info_size = 1, 3686 .ctx_arg_info = { 3687 { offsetof(struct bpf_iter__unix, unix_sk), 3688 PTR_TO_BTF_ID_OR_NULL }, 3689 }, 3690 .get_func_proto = bpf_iter_unix_get_func_proto, 3691 .seq_info = &unix_seq_info, 3692 }; 3693 3694 static void __init bpf_iter_register(void) 3695 { 3696 unix_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_UNIX]; 3697 if (bpf_iter_reg_target(&unix_reg_info)) 3698 pr_warn("Warning: could not register bpf iterator unix\n"); 3699 } 3700 #endif 3701 3702 static int __init af_unix_init(void) 3703 { 3704 int i, rc = -1; 3705 3706 BUILD_BUG_ON(sizeof(struct unix_skb_parms) > sizeof_field(struct sk_buff, cb)); 3707 3708 for (i = 0; i < UNIX_HASH_SIZE / 2; i++) { 3709 spin_lock_init(&bsd_socket_locks[i]); 3710 INIT_HLIST_HEAD(&bsd_socket_buckets[i]); 3711 } 3712 3713 rc = proto_register(&unix_dgram_proto, 1); 3714 if (rc != 0) { 3715 pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__); 3716 goto out; 3717 } 3718 3719 rc = proto_register(&unix_stream_proto, 1); 3720 if (rc != 0) { 3721 pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__); 3722 proto_unregister(&unix_dgram_proto); 3723 goto out; 3724 } 3725 3726 sock_register(&unix_family_ops); 3727 register_pernet_subsys(&unix_net_ops); 3728 unix_bpf_build_proto(); 3729 3730 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) 3731 bpf_iter_register(); 3732 #endif 3733 3734 out: 3735 return rc; 3736 } 3737 3738 /* Later than subsys_initcall() because we depend on stuff initialised there */ 3739 fs_initcall(af_unix_init); 3740
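/*
 * The SCM_RIGHTS machinery handled above (unix_scm_to_skb(),
 * unix_attach_fds()/unix_detach_fds() and the scm_stat_* accounting)
 * services ancillary data built in user space.  A minimal, purely
 * illustrative sender sketch follows; the function name is made up, error
 * handling is omitted, and it assumes <string.h> and <sys/socket.h>:
 *
 *	static ssize_t send_one_fd(int sock, int fd_to_pass)
 *	{
 *		char data = 'x';
 *		struct iovec iov = { .iov_base = &data, .iov_len = 1 };
 *		union {
 *			char buf[CMSG_SPACE(sizeof(int))];
 *			struct cmsghdr align;
 *		} u;
 *		struct msghdr msg = {
 *			.msg_iov = &iov, .msg_iovlen = 1,
 *			.msg_control = u.buf,
 *			.msg_controllen = sizeof(u.buf),
 *		};
 *		struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
 *
 *		cmsg->cmsg_level = SOL_SOCKET;
 *		cmsg->cmsg_type = SCM_RIGHTS;
 *		cmsg->cmsg_len = CMSG_LEN(sizeof(int));
 *		memcpy(CMSG_DATA(cmsg), &fd_to_pass, sizeof(int));
 *
 *		return sendmsg(sock, &msg, 0);
 *	}
 *
 * On the receiving socket the descriptor arrives as an SCM_RIGHTS control
 * message; unix_detach_fds() above moves it off the queued skb before
 * scm_recv_unix() installs it in the reading process.
 */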