1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * FUSE: Filesystem in Userspace 4 * Copyright (c) 2023-2024 DataDirect Networks. 5 */ 6 7 #include "fuse_i.h" 8 #include "dev_uring_i.h" 9 #include "fuse_dev_i.h" 10 #include "fuse_trace.h" 11 12 #include <linux/fs.h> 13 #include <linux/io_uring/cmd.h> 14 15 static bool __read_mostly enable_uring; 16 module_param(enable_uring, bool, 0644); 17 MODULE_PARM_DESC(enable_uring, 18 "Enable userspace communication through io-uring"); 19 20 #define FUSE_URING_IOV_SEGS 2 /* header and payload */ 21 22 23 bool fuse_uring_enabled(void) 24 { 25 return enable_uring; 26 } 27 28 struct fuse_uring_pdu { 29 struct fuse_ring_ent *ent; 30 }; 31 32 static const struct fuse_iqueue_ops fuse_io_uring_ops; 33 34 static void uring_cmd_set_ring_ent(struct io_uring_cmd *cmd, 35 struct fuse_ring_ent *ring_ent) 36 { 37 struct fuse_uring_pdu *pdu = 38 io_uring_cmd_to_pdu(cmd, struct fuse_uring_pdu); 39 40 pdu->ent = ring_ent; 41 } 42 43 static struct fuse_ring_ent *uring_cmd_to_ring_ent(struct io_uring_cmd *cmd) 44 { 45 struct fuse_uring_pdu *pdu = 46 io_uring_cmd_to_pdu(cmd, struct fuse_uring_pdu); 47 48 return pdu->ent; 49 } 50 51 static void fuse_uring_flush_bg(struct fuse_ring_queue *queue) 52 { 53 struct fuse_ring *ring = queue->ring; 54 struct fuse_conn *fc = ring->fc; 55 56 lockdep_assert_held(&queue->lock); 57 lockdep_assert_held(&fc->bg_lock); 58 59 /* 60 * Allow one bg request per queue, ignoring global fc limits. 61 * This prevents a single queue from consuming all resources and 62 * eliminates the need for remote queue wake-ups when global 63 * limits are met but this queue has no more waiting requests. 
64 */ 65 while ((fc->active_background < fc->max_background || 66 !queue->active_background) && 67 (!list_empty(&queue->fuse_req_bg_queue))) { 68 struct fuse_req *req; 69 70 req = list_first_entry(&queue->fuse_req_bg_queue, 71 struct fuse_req, list); 72 fc->active_background++; 73 queue->active_background++; 74 75 list_move_tail(&req->list, &queue->fuse_req_queue); 76 } 77 } 78 79 static void fuse_uring_req_end(struct fuse_ring_ent *ent, struct fuse_req *req, 80 int error) 81 { 82 struct fuse_ring_queue *queue = ent->queue; 83 struct fuse_ring *ring = queue->ring; 84 struct fuse_conn *fc = ring->fc; 85 86 lockdep_assert_not_held(&queue->lock); 87 spin_lock(&queue->lock); 88 ent->fuse_req = NULL; 89 list_del_init(&req->list); 90 if (test_bit(FR_BACKGROUND, &req->flags)) { 91 queue->active_background--; 92 spin_lock(&fc->bg_lock); 93 fuse_uring_flush_bg(queue); 94 spin_unlock(&fc->bg_lock); 95 } 96 97 spin_unlock(&queue->lock); 98 99 if (error) 100 req->out.h.error = error; 101 102 clear_bit(FR_SENT, &req->flags); 103 fuse_request_end(req); 104 } 105 106 /* Abort all list queued request on the given ring queue */ 107 static void fuse_uring_abort_end_queue_requests(struct fuse_ring_queue *queue) 108 { 109 struct fuse_req *req; 110 LIST_HEAD(req_list); 111 112 spin_lock(&queue->lock); 113 list_for_each_entry(req, &queue->fuse_req_queue, list) 114 clear_bit(FR_PENDING, &req->flags); 115 list_splice_init(&queue->fuse_req_queue, &req_list); 116 spin_unlock(&queue->lock); 117 118 /* must not hold queue lock to avoid order issues with fi->lock */ 119 fuse_dev_end_requests(&req_list); 120 } 121 122 void fuse_uring_abort_end_requests(struct fuse_ring *ring) 123 { 124 int qid; 125 struct fuse_ring_queue *queue; 126 struct fuse_conn *fc = ring->fc; 127 128 for (qid = 0; qid < ring->nr_queues; qid++) { 129 queue = READ_ONCE(ring->queues[qid]); 130 if (!queue) 131 continue; 132 133 queue->stopped = true; 134 135 WARN_ON_ONCE(ring->fc->max_background != UINT_MAX); 136 
spin_lock(&queue->lock); 137 spin_lock(&fc->bg_lock); 138 fuse_uring_flush_bg(queue); 139 spin_unlock(&fc->bg_lock); 140 spin_unlock(&queue->lock); 141 fuse_uring_abort_end_queue_requests(queue); 142 } 143 } 144 145 static bool ent_list_request_expired(struct fuse_conn *fc, struct list_head *list) 146 { 147 struct fuse_ring_ent *ent; 148 struct fuse_req *req; 149 150 ent = list_first_entry_or_null(list, struct fuse_ring_ent, list); 151 if (!ent) 152 return false; 153 154 req = ent->fuse_req; 155 156 return time_is_before_jiffies(req->create_time + 157 fc->timeout.req_timeout); 158 } 159 160 bool fuse_uring_request_expired(struct fuse_conn *fc) 161 { 162 struct fuse_ring *ring = fc->ring; 163 struct fuse_ring_queue *queue; 164 int qid; 165 166 if (!ring) 167 return false; 168 169 for (qid = 0; qid < ring->nr_queues; qid++) { 170 queue = READ_ONCE(ring->queues[qid]); 171 if (!queue) 172 continue; 173 174 spin_lock(&queue->lock); 175 if (fuse_request_expired(fc, &queue->fuse_req_queue) || 176 fuse_request_expired(fc, &queue->fuse_req_bg_queue) || 177 ent_list_request_expired(fc, &queue->ent_w_req_queue) || 178 ent_list_request_expired(fc, &queue->ent_in_userspace)) { 179 spin_unlock(&queue->lock); 180 return true; 181 } 182 spin_unlock(&queue->lock); 183 } 184 185 return false; 186 } 187 188 void fuse_uring_destruct(struct fuse_conn *fc) 189 { 190 struct fuse_ring *ring = fc->ring; 191 int qid; 192 193 if (!ring) 194 return; 195 196 for (qid = 0; qid < ring->nr_queues; qid++) { 197 struct fuse_ring_queue *queue = ring->queues[qid]; 198 struct fuse_ring_ent *ent, *next; 199 200 if (!queue) 201 continue; 202 203 WARN_ON(!list_empty(&queue->ent_avail_queue)); 204 WARN_ON(!list_empty(&queue->ent_w_req_queue)); 205 WARN_ON(!list_empty(&queue->ent_commit_queue)); 206 WARN_ON(!list_empty(&queue->ent_in_userspace)); 207 208 list_for_each_entry_safe(ent, next, &queue->ent_released, 209 list) { 210 list_del_init(&ent->list); 211 kfree(ent); 212 } 213 214 
kfree(queue->fpq.processing); 215 kfree(queue); 216 ring->queues[qid] = NULL; 217 } 218 219 kfree(ring->queues); 220 kfree(ring); 221 fc->ring = NULL; 222 } 223 224 /* 225 * Basic ring setup for this connection based on the provided configuration 226 */ 227 static struct fuse_ring *fuse_uring_create(struct fuse_conn *fc) 228 { 229 struct fuse_ring *ring; 230 size_t nr_queues = num_possible_cpus(); 231 struct fuse_ring *res = NULL; 232 size_t max_payload_size; 233 234 ring = kzalloc_obj(*fc->ring, GFP_KERNEL_ACCOUNT); 235 if (!ring) 236 return NULL; 237 238 ring->queues = kzalloc_objs(struct fuse_ring_queue *, nr_queues, 239 GFP_KERNEL_ACCOUNT); 240 if (!ring->queues) 241 goto out_err; 242 243 max_payload_size = max(FUSE_MIN_READ_BUFFER, fc->max_write); 244 max_payload_size = max(max_payload_size, fc->max_pages * PAGE_SIZE); 245 246 spin_lock(&fc->lock); 247 if (fc->ring) { 248 /* race, another thread created the ring in the meantime */ 249 spin_unlock(&fc->lock); 250 res = fc->ring; 251 goto out_err; 252 } 253 254 init_waitqueue_head(&ring->stop_waitq); 255 256 ring->nr_queues = nr_queues; 257 ring->fc = fc; 258 ring->max_payload_sz = max_payload_size; 259 smp_store_release(&fc->ring, ring); 260 261 spin_unlock(&fc->lock); 262 return ring; 263 264 out_err: 265 kfree(ring->queues); 266 kfree(ring); 267 return res; 268 } 269 270 static struct fuse_ring_queue *fuse_uring_create_queue(struct fuse_ring *ring, 271 int qid) 272 { 273 struct fuse_conn *fc = ring->fc; 274 struct fuse_ring_queue *queue; 275 struct list_head *pq; 276 277 queue = kzalloc_obj(*queue, GFP_KERNEL_ACCOUNT); 278 if (!queue) 279 return NULL; 280 pq = kzalloc_objs(struct list_head, FUSE_PQ_HASH_SIZE, GFP_KERNEL); 281 if (!pq) { 282 kfree(queue); 283 return NULL; 284 } 285 286 queue->qid = qid; 287 queue->ring = ring; 288 spin_lock_init(&queue->lock); 289 290 INIT_LIST_HEAD(&queue->ent_avail_queue); 291 INIT_LIST_HEAD(&queue->ent_commit_queue); 292 INIT_LIST_HEAD(&queue->ent_w_req_queue); 293 
INIT_LIST_HEAD(&queue->ent_in_userspace); 294 INIT_LIST_HEAD(&queue->fuse_req_queue); 295 INIT_LIST_HEAD(&queue->fuse_req_bg_queue); 296 INIT_LIST_HEAD(&queue->ent_released); 297 298 queue->fpq.processing = pq; 299 fuse_pqueue_init(&queue->fpq); 300 301 spin_lock(&fc->lock); 302 if (ring->queues[qid]) { 303 spin_unlock(&fc->lock); 304 kfree(queue->fpq.processing); 305 kfree(queue); 306 return ring->queues[qid]; 307 } 308 309 /* 310 * write_once and lock as the caller mostly doesn't take the lock at all 311 */ 312 WRITE_ONCE(ring->queues[qid], queue); 313 spin_unlock(&fc->lock); 314 315 return queue; 316 } 317 318 static void fuse_uring_stop_fuse_req_end(struct fuse_req *req) 319 { 320 clear_bit(FR_SENT, &req->flags); 321 req->out.h.error = -ECONNABORTED; 322 fuse_request_end(req); 323 } 324 325 /* 326 * Release a request/entry on connection tear down 327 */ 328 static void fuse_uring_entry_teardown(struct fuse_ring_ent *ent) 329 { 330 struct fuse_req *req; 331 struct io_uring_cmd *cmd; 332 333 struct fuse_ring_queue *queue = ent->queue; 334 335 spin_lock(&queue->lock); 336 cmd = ent->cmd; 337 ent->cmd = NULL; 338 req = ent->fuse_req; 339 ent->fuse_req = NULL; 340 if (req) { 341 /* remove entry from queue->fpq->processing */ 342 list_del_init(&req->list); 343 } 344 345 /* 346 * The entry must not be freed immediately, due to access of direct 347 * pointer access of entries through IO_URING_F_CANCEL - there is a risk 348 * of race between daemon termination (which triggers IO_URING_F_CANCEL 349 * and accesses entries without checking the list state first 350 */ 351 list_move(&ent->list, &queue->ent_released); 352 ent->state = FRRS_RELEASED; 353 spin_unlock(&queue->lock); 354 355 if (cmd) 356 io_uring_cmd_done(cmd, -ENOTCONN, IO_URING_F_UNLOCKED); 357 358 if (req) 359 fuse_uring_stop_fuse_req_end(req); 360 } 361 362 static void fuse_uring_stop_list_entries(struct list_head *head, 363 struct fuse_ring_queue *queue, 364 enum fuse_ring_req_state exp_state) 365 { 366 
struct fuse_ring *ring = queue->ring; 367 struct fuse_ring_ent *ent, *next; 368 ssize_t queue_refs = SSIZE_MAX; 369 LIST_HEAD(to_teardown); 370 371 spin_lock(&queue->lock); 372 list_for_each_entry_safe(ent, next, head, list) { 373 if (ent->state != exp_state) { 374 pr_warn("entry teardown qid=%d state=%d expected=%d", 375 queue->qid, ent->state, exp_state); 376 continue; 377 } 378 379 ent->state = FRRS_TEARDOWN; 380 list_move(&ent->list, &to_teardown); 381 } 382 spin_unlock(&queue->lock); 383 384 /* no queue lock to avoid lock order issues */ 385 list_for_each_entry_safe(ent, next, &to_teardown, list) { 386 fuse_uring_entry_teardown(ent); 387 queue_refs = atomic_dec_return(&ring->queue_refs); 388 WARN_ON_ONCE(queue_refs < 0); 389 } 390 } 391 392 static void fuse_uring_teardown_entries(struct fuse_ring_queue *queue) 393 { 394 fuse_uring_stop_list_entries(&queue->ent_in_userspace, queue, 395 FRRS_USERSPACE); 396 fuse_uring_stop_list_entries(&queue->ent_avail_queue, queue, 397 FRRS_AVAILABLE); 398 } 399 400 /* 401 * Log state debug info 402 */ 403 static void fuse_uring_log_ent_state(struct fuse_ring *ring) 404 { 405 int qid; 406 struct fuse_ring_ent *ent; 407 408 for (qid = 0; qid < ring->nr_queues; qid++) { 409 struct fuse_ring_queue *queue = ring->queues[qid]; 410 411 if (!queue) 412 continue; 413 414 spin_lock(&queue->lock); 415 /* 416 * Log entries from the intermediate queue, the other queues 417 * should be empty 418 */ 419 list_for_each_entry(ent, &queue->ent_w_req_queue, list) { 420 pr_info(" ent-req-queue ring=%p qid=%d ent=%p state=%d\n", 421 ring, qid, ent, ent->state); 422 } 423 list_for_each_entry(ent, &queue->ent_commit_queue, list) { 424 pr_info(" ent-commit-queue ring=%p qid=%d ent=%p state=%d\n", 425 ring, qid, ent, ent->state); 426 } 427 spin_unlock(&queue->lock); 428 } 429 ring->stop_debug_log = 1; 430 } 431 432 static void fuse_uring_async_stop_queues(struct work_struct *work) 433 { 434 int qid; 435 struct fuse_ring *ring = 436 container_of(work, 
struct fuse_ring, async_teardown_work.work); 437 438 /* XXX code dup */ 439 for (qid = 0; qid < ring->nr_queues; qid++) { 440 struct fuse_ring_queue *queue = READ_ONCE(ring->queues[qid]); 441 442 if (!queue) 443 continue; 444 445 fuse_uring_teardown_entries(queue); 446 } 447 448 /* 449 * Some ring entries might be in the middle of IO operations, 450 * i.e. in process to get handled by file_operations::uring_cmd 451 * or on the way to userspace - we could handle that with conditions in 452 * run time code, but easier/cleaner to have an async tear down handler 453 * If there are still queue references left 454 */ 455 if (atomic_read(&ring->queue_refs) > 0) { 456 if (time_after(jiffies, 457 ring->teardown_time + FUSE_URING_TEARDOWN_TIMEOUT)) 458 fuse_uring_log_ent_state(ring); 459 460 schedule_delayed_work(&ring->async_teardown_work, 461 FUSE_URING_TEARDOWN_INTERVAL); 462 } else { 463 wake_up_all(&ring->stop_waitq); 464 } 465 } 466 467 /* 468 * Stop the ring queues 469 */ 470 void fuse_uring_stop_queues(struct fuse_ring *ring) 471 { 472 int qid; 473 474 for (qid = 0; qid < ring->nr_queues; qid++) { 475 struct fuse_ring_queue *queue = READ_ONCE(ring->queues[qid]); 476 477 if (!queue) 478 continue; 479 480 fuse_uring_teardown_entries(queue); 481 } 482 483 if (atomic_read(&ring->queue_refs) > 0) { 484 ring->teardown_time = jiffies; 485 INIT_DELAYED_WORK(&ring->async_teardown_work, 486 fuse_uring_async_stop_queues); 487 schedule_delayed_work(&ring->async_teardown_work, 488 FUSE_URING_TEARDOWN_INTERVAL); 489 } else { 490 wake_up_all(&ring->stop_waitq); 491 } 492 } 493 494 /* 495 * Handle IO_URING_F_CANCEL, typically should come on daemon termination. 
 *
 * Releasing the last entry should trigger fuse_dev_release() if
 * the daemon was terminated
 */
static void fuse_uring_cancel(struct io_uring_cmd *cmd,
			      unsigned int issue_flags)
{
	struct fuse_ring_ent *ent = uring_cmd_to_ring_ent(cmd);
	struct fuse_ring_queue *queue;
	bool need_cmd_done = false;

	/*
	 * direct access on ent - it must not be destructed as long as
	 * IO_URING_F_CANCEL might come up
	 */
	queue = ent->queue;
	spin_lock(&queue->lock);
	if (ent->state == FRRS_AVAILABLE) {
		/* only idle entries are completed; busy ones finish normally */
		ent->state = FRRS_USERSPACE;
		list_move_tail(&ent->list, &queue->ent_in_userspace);
		need_cmd_done = true;
		ent->cmd = NULL;
	}
	spin_unlock(&queue->lock);

	if (need_cmd_done) {
		/* no queue lock to avoid lock order issues */
		io_uring_cmd_done(cmd, -ENOTCONN, issue_flags);
	}
}

/* Attach the entry to the command and make the command cancelable */
static void fuse_uring_prepare_cancel(struct io_uring_cmd *cmd, int issue_flags,
				      struct fuse_ring_ent *ring_ent)
{
	uring_cmd_set_ring_ent(cmd, ring_ent);
	io_uring_cmd_mark_cancelable(cmd, issue_flags);
}

/*
 * Checks for errors and stores it into the request
 */
static int fuse_uring_out_header_has_err(struct fuse_out_header *oh,
					 struct fuse_req *req,
					 struct fuse_conn *fc)
{
	int err;

	err = -EINVAL;
	if (oh->unique == 0) {
		/* Not supported through io-uring yet */
		pr_warn_once("notify through fuse-io-uring not supported\n");
		goto err;
	}

	/* error must be a negative errno, -ERESTARTSYS and above reserved */
	if (oh->error <= -ERESTARTSYS || oh->error > 0)
		goto err;

	if (oh->error) {
		err = oh->error;
		goto err;
	}

	err = -ENOENT;
	if ((oh->unique & ~FUSE_INT_REQ_BIT) != req->in.h.unique) {
		pr_warn_ratelimited("unique mismatch, expected: %llu got %llu\n",
				    req->in.h.unique,
				    oh->unique & ~FUSE_INT_REQ_BIT);
		goto err;
	}

	/*
	 * Is it an interrupt reply ID?
	 * XXX: Not supported through fuse-io-uring yet, it should not even
	 * find the request - should not happen.
	 */
	WARN_ON_ONCE(oh->unique & FUSE_INT_REQ_BIT);

	err = 0;
err:
	return err;
}

/*
 * Copy the userspace reply payload from the ring buffer into the fuse
 * request's output arguments.
 */
static int fuse_uring_copy_from_ring(struct fuse_ring *ring,
				     struct fuse_req *req,
				     struct fuse_ring_ent *ent)
{
	struct fuse_copy_state cs;
	struct fuse_args *args = req->args;
	struct iov_iter iter;
	int err;
	struct fuse_uring_ent_in_out ring_in_out;

	err = copy_from_user(&ring_in_out, &ent->headers->ring_ent_in_out,
			     sizeof(ring_in_out));
	if (err)
		return -EFAULT;

	err = import_ubuf(ITER_SOURCE, ent->payload, ring->max_payload_sz,
			  &iter);
	if (err)
		return err;

	fuse_copy_init(&cs, false, &iter);
	cs.is_uring = true;
	cs.req = req;

	err = fuse_copy_out_args(&cs, args, ring_in_out.payload_sz);
	fuse_copy_finish(&cs);
	return err;
}

/*
 * Copy data from the req to the ring buffer
 */
static int fuse_uring_args_to_ring(struct fuse_ring *ring, struct fuse_req *req,
				   struct fuse_ring_ent *ent)
{
	struct fuse_copy_state cs;
	struct fuse_args *args = req->args;
	struct fuse_in_arg *in_args = args->in_args;
	int num_args = args->in_numargs;
	int err;
	struct iov_iter iter;
	struct fuse_uring_ent_in_out ent_in_out = {
		.flags = 0,
		.commit_id = req->in.h.unique,
	};

	err = import_ubuf(ITER_DEST, ent->payload, ring->max_payload_sz, &iter);
	if (err) {
		pr_info_ratelimited("fuse: Import of user buffer failed\n");
		return err;
	}

	fuse_copy_init(&cs, true, &iter);
	cs.is_uring = true;
	cs.req = req;

	if (num_args > 0) {
		/*
		 * Expectation is that the first argument is the per op header.
		 * Some op code have that as zero size.
		 */
		if (args->in_args[0].size > 0) {
			err = copy_to_user(&ent->headers->op_in, in_args->value,
					   in_args->size);
			if (err) {
				pr_info_ratelimited(
					"Copying the header failed.\n");
				return -EFAULT;
			}
		}
		in_args++;
		num_args--;
	}

	/* copy the payload */
	err = fuse_copy_args(&cs, num_args, args->in_pages,
			     (struct fuse_arg *)in_args, 0);
	fuse_copy_finish(&cs);
	if (err) {
		pr_info_ratelimited("%s fuse_copy_args failed\n", __func__);
		return err;
	}

	ent_in_out.payload_sz = cs.ring.copied_sz;
	err = copy_to_user(&ent->headers->ring_ent_in_out, &ent_in_out,
			   sizeof(ent_in_out));
	return err ? -EFAULT : 0;
}

/*
 * Copy the full fuse request (in-header plus arguments) to the entry's
 * userspace-mapped ring buffers.  Requires the entry to be in
 * FRRS_FUSE_REQ state.
 */
static int fuse_uring_copy_to_ring(struct fuse_ring_ent *ent,
				   struct fuse_req *req)
{
	struct fuse_ring_queue *queue = ent->queue;
	struct fuse_ring *ring = queue->ring;
	int err;

	err = -EIO;
	if (WARN_ON(ent->state != FRRS_FUSE_REQ)) {
		pr_err("qid=%d ring-req=%p invalid state %d on send\n",
		       queue->qid, ent, ent->state);
		return err;
	}

	err = -EINVAL;
	if (WARN_ON(req->in.h.unique == 0))
		return err;

	/* copy the request */
	err = fuse_uring_args_to_ring(ring, req, ent);
	if (unlikely(err)) {
		pr_info_ratelimited("Copy to ring failed: %d\n", err);
		return err;
	}

	/* copy fuse_in_header */
	err = copy_to_user(&ent->headers->in_out, &req->in.h,
			   sizeof(req->in.h));
	if (err) {
		err = -EFAULT;
		return err;
	}

	return 0;
}

/*
 * Copy the request to the ring; mark it FR_SENT on success, otherwise
 * end it with the copy error.
 */
static int fuse_uring_prepare_send(struct fuse_ring_ent *ent,
				   struct fuse_req *req)
{
	int err;

	err = fuse_uring_copy_to_ring(ent, req);
	if (!err)
		set_bit(FR_SENT, &req->flags);
	else
		fuse_uring_req_end(ent, req, err);

	return err;
}

/*
 * Write data to the ring buffer and send the request to userspace,
 * userspace will read it
 * This is comparable with classical
read(/dev/fuse) 721 */ 722 static int fuse_uring_send_next_to_ring(struct fuse_ring_ent *ent, 723 struct fuse_req *req, 724 unsigned int issue_flags) 725 { 726 struct fuse_ring_queue *queue = ent->queue; 727 int err; 728 struct io_uring_cmd *cmd; 729 730 err = fuse_uring_prepare_send(ent, req); 731 if (err) 732 return err; 733 734 spin_lock(&queue->lock); 735 cmd = ent->cmd; 736 ent->cmd = NULL; 737 ent->state = FRRS_USERSPACE; 738 list_move_tail(&ent->list, &queue->ent_in_userspace); 739 spin_unlock(&queue->lock); 740 741 io_uring_cmd_done(cmd, 0, issue_flags); 742 return 0; 743 } 744 745 /* 746 * Make a ring entry available for fuse_req assignment 747 */ 748 static void fuse_uring_ent_avail(struct fuse_ring_ent *ent, 749 struct fuse_ring_queue *queue) 750 { 751 WARN_ON_ONCE(!ent->cmd); 752 list_move(&ent->list, &queue->ent_avail_queue); 753 ent->state = FRRS_AVAILABLE; 754 } 755 756 /* Used to find the request on SQE commit */ 757 static void fuse_uring_add_to_pq(struct fuse_ring_ent *ent, 758 struct fuse_req *req) 759 { 760 struct fuse_ring_queue *queue = ent->queue; 761 struct fuse_pqueue *fpq = &queue->fpq; 762 unsigned int hash; 763 764 req->ring_entry = ent; 765 hash = fuse_req_hash(req->in.h.unique); 766 list_move_tail(&req->list, &fpq->processing[hash]); 767 } 768 769 /* 770 * Assign a fuse queue entry to the given entry 771 */ 772 static void fuse_uring_add_req_to_ring_ent(struct fuse_ring_ent *ent, 773 struct fuse_req *req) 774 { 775 struct fuse_ring_queue *queue = ent->queue; 776 777 lockdep_assert_held(&queue->lock); 778 779 if (WARN_ON_ONCE(ent->state != FRRS_AVAILABLE && 780 ent->state != FRRS_COMMIT)) { 781 pr_warn("%s qid=%d state=%d\n", __func__, ent->queue->qid, 782 ent->state); 783 } 784 785 clear_bit(FR_PENDING, &req->flags); 786 ent->fuse_req = req; 787 ent->state = FRRS_FUSE_REQ; 788 list_move_tail(&ent->list, &queue->ent_w_req_queue); 789 fuse_uring_add_to_pq(ent, req); 790 } 791 792 /* Fetch the next fuse request if available */ 793 static 
struct fuse_req *fuse_uring_ent_assign_req(struct fuse_ring_ent *ent) 794 __must_hold(&queue->lock) 795 { 796 struct fuse_req *req; 797 struct fuse_ring_queue *queue = ent->queue; 798 struct list_head *req_queue = &queue->fuse_req_queue; 799 800 lockdep_assert_held(&queue->lock); 801 802 /* get and assign the next entry while it is still holding the lock */ 803 req = list_first_entry_or_null(req_queue, struct fuse_req, list); 804 if (req) 805 fuse_uring_add_req_to_ring_ent(ent, req); 806 807 return req; 808 } 809 810 /* 811 * Read data from the ring buffer, which user space has written to 812 * This is comparible with handling of classical write(/dev/fuse). 813 * Also make the ring request available again for new fuse requests. 814 */ 815 static void fuse_uring_commit(struct fuse_ring_ent *ent, struct fuse_req *req, 816 unsigned int issue_flags) 817 { 818 struct fuse_ring *ring = ent->queue->ring; 819 struct fuse_conn *fc = ring->fc; 820 ssize_t err = 0; 821 822 err = copy_from_user(&req->out.h, &ent->headers->in_out, 823 sizeof(req->out.h)); 824 if (err) { 825 req->out.h.error = -EFAULT; 826 goto out; 827 } 828 829 err = fuse_uring_out_header_has_err(&req->out.h, req, fc); 830 if (err) { 831 /* req->out.h.error already set */ 832 goto out; 833 } 834 835 err = fuse_uring_copy_from_ring(ring, req, ent); 836 out: 837 fuse_uring_req_end(ent, req, err); 838 } 839 840 /* 841 * Get the next fuse req and send it 842 */ 843 static void fuse_uring_next_fuse_req(struct fuse_ring_ent *ent, 844 struct fuse_ring_queue *queue, 845 unsigned int issue_flags) 846 { 847 int err; 848 struct fuse_req *req; 849 850 retry: 851 spin_lock(&queue->lock); 852 fuse_uring_ent_avail(ent, queue); 853 req = fuse_uring_ent_assign_req(ent); 854 spin_unlock(&queue->lock); 855 856 if (req) { 857 err = fuse_uring_send_next_to_ring(ent, req, issue_flags); 858 if (err) 859 goto retry; 860 } 861 } 862 863 static int fuse_ring_ent_set_commit(struct fuse_ring_ent *ent) 864 { 865 struct fuse_ring_queue 
*queue = ent->queue; 866 867 lockdep_assert_held(&queue->lock); 868 869 if (WARN_ON_ONCE(ent->state != FRRS_USERSPACE)) 870 return -EIO; 871 872 ent->state = FRRS_COMMIT; 873 list_move(&ent->list, &queue->ent_commit_queue); 874 875 return 0; 876 } 877 878 /* FUSE_URING_CMD_COMMIT_AND_FETCH handler */ 879 static int fuse_uring_commit_fetch(struct io_uring_cmd *cmd, int issue_flags, 880 struct fuse_conn *fc) 881 { 882 const struct fuse_uring_cmd_req *cmd_req = io_uring_sqe128_cmd(cmd->sqe, 883 struct fuse_uring_cmd_req); 884 struct fuse_ring_ent *ent; 885 int err; 886 struct fuse_ring *ring = fc->ring; 887 struct fuse_ring_queue *queue; 888 uint64_t commit_id = READ_ONCE(cmd_req->commit_id); 889 unsigned int qid = READ_ONCE(cmd_req->qid); 890 struct fuse_pqueue *fpq; 891 struct fuse_req *req; 892 893 err = -ENOTCONN; 894 if (!ring) 895 return err; 896 897 if (qid >= ring->nr_queues) 898 return -EINVAL; 899 900 queue = ring->queues[qid]; 901 if (!queue) 902 return err; 903 fpq = &queue->fpq; 904 905 if (!READ_ONCE(fc->connected) || READ_ONCE(queue->stopped)) 906 return err; 907 908 spin_lock(&queue->lock); 909 /* Find a request based on the unique ID of the fuse request 910 * This should get revised, as it needs a hash calculation and list 911 * search. And full struct fuse_pqueue is needed (memory overhead). 912 * As well as the link from req to ring_ent. 
913 */ 914 req = fuse_request_find(fpq, commit_id); 915 err = -ENOENT; 916 if (!req) { 917 pr_info("qid=%d commit_id %llu not found\n", queue->qid, 918 commit_id); 919 spin_unlock(&queue->lock); 920 return err; 921 } 922 list_del_init(&req->list); 923 ent = req->ring_entry; 924 req->ring_entry = NULL; 925 926 err = fuse_ring_ent_set_commit(ent); 927 if (err != 0) { 928 pr_info_ratelimited("qid=%d commit_id %llu state %d", 929 queue->qid, commit_id, ent->state); 930 spin_unlock(&queue->lock); 931 req->out.h.error = err; 932 clear_bit(FR_SENT, &req->flags); 933 fuse_request_end(req); 934 return err; 935 } 936 937 ent->cmd = cmd; 938 spin_unlock(&queue->lock); 939 940 /* without the queue lock, as other locks are taken */ 941 fuse_uring_prepare_cancel(cmd, issue_flags, ent); 942 fuse_uring_commit(ent, req, issue_flags); 943 944 /* 945 * Fetching the next request is absolutely required as queued 946 * fuse requests would otherwise not get processed - committing 947 * and fetching is done in one step vs legacy fuse, which has separated 948 * read (fetch request) and write (commit result). 
949 */ 950 fuse_uring_next_fuse_req(ent, queue, issue_flags); 951 return 0; 952 } 953 954 static bool is_ring_ready(struct fuse_ring *ring, int current_qid) 955 { 956 int qid; 957 struct fuse_ring_queue *queue; 958 bool ready = true; 959 960 for (qid = 0; qid < ring->nr_queues && ready; qid++) { 961 if (current_qid == qid) 962 continue; 963 964 queue = ring->queues[qid]; 965 if (!queue) { 966 ready = false; 967 break; 968 } 969 970 spin_lock(&queue->lock); 971 if (list_empty(&queue->ent_avail_queue)) 972 ready = false; 973 spin_unlock(&queue->lock); 974 } 975 976 return ready; 977 } 978 979 /* 980 * fuse_uring_req_fetch command handling 981 */ 982 static void fuse_uring_do_register(struct fuse_ring_ent *ent, 983 struct io_uring_cmd *cmd, 984 unsigned int issue_flags) 985 { 986 struct fuse_ring_queue *queue = ent->queue; 987 struct fuse_ring *ring = queue->ring; 988 struct fuse_conn *fc = ring->fc; 989 struct fuse_iqueue *fiq = &fc->iq; 990 991 fuse_uring_prepare_cancel(cmd, issue_flags, ent); 992 993 spin_lock(&queue->lock); 994 ent->cmd = cmd; 995 fuse_uring_ent_avail(ent, queue); 996 spin_unlock(&queue->lock); 997 998 if (!ring->ready) { 999 bool ready = is_ring_ready(ring, queue->qid); 1000 1001 if (ready) { 1002 WRITE_ONCE(fiq->ops, &fuse_io_uring_ops); 1003 WRITE_ONCE(ring->ready, true); 1004 wake_up_all(&fc->blocked_waitq); 1005 } 1006 } 1007 } 1008 1009 /* 1010 * sqe->addr is a ptr to an iovec array, iov[0] has the headers, iov[1] 1011 * the payload 1012 */ 1013 static int fuse_uring_get_iovec_from_sqe(const struct io_uring_sqe *sqe, 1014 struct iovec iov[FUSE_URING_IOV_SEGS]) 1015 { 1016 struct iovec __user *uiov = u64_to_user_ptr(READ_ONCE(sqe->addr)); 1017 struct iov_iter iter; 1018 ssize_t ret; 1019 1020 if (sqe->len != FUSE_URING_IOV_SEGS) 1021 return -EINVAL; 1022 1023 /* 1024 * Direction for buffer access will actually be READ and WRITE, 1025 * using write for the import should include READ access as well. 
1026 */ 1027 ret = import_iovec(WRITE, uiov, FUSE_URING_IOV_SEGS, 1028 FUSE_URING_IOV_SEGS, &iov, &iter); 1029 if (ret < 0) 1030 return ret; 1031 1032 return 0; 1033 } 1034 1035 static struct fuse_ring_ent * 1036 fuse_uring_create_ring_ent(struct io_uring_cmd *cmd, 1037 struct fuse_ring_queue *queue) 1038 { 1039 struct fuse_ring *ring = queue->ring; 1040 struct fuse_ring_ent *ent; 1041 size_t payload_size; 1042 struct iovec iov[FUSE_URING_IOV_SEGS]; 1043 int err; 1044 1045 err = fuse_uring_get_iovec_from_sqe(cmd->sqe, iov); 1046 if (err) { 1047 pr_info_ratelimited("Failed to get iovec from sqe, err=%d\n", 1048 err); 1049 return ERR_PTR(err); 1050 } 1051 1052 err = -EINVAL; 1053 if (iov[0].iov_len < sizeof(struct fuse_uring_req_header)) { 1054 pr_info_ratelimited("Invalid header len %zu\n", iov[0].iov_len); 1055 return ERR_PTR(err); 1056 } 1057 1058 payload_size = iov[1].iov_len; 1059 if (payload_size < ring->max_payload_sz) { 1060 pr_info_ratelimited("Invalid req payload len %zu\n", 1061 payload_size); 1062 return ERR_PTR(err); 1063 } 1064 1065 err = -ENOMEM; 1066 ent = kzalloc_obj(*ent, GFP_KERNEL_ACCOUNT); 1067 if (!ent) 1068 return ERR_PTR(err); 1069 1070 INIT_LIST_HEAD(&ent->list); 1071 1072 ent->queue = queue; 1073 ent->headers = iov[0].iov_base; 1074 ent->payload = iov[1].iov_base; 1075 1076 atomic_inc(&ring->queue_refs); 1077 return ent; 1078 } 1079 1080 /* 1081 * Register header and payload buffer with the kernel and puts the 1082 * entry as "ready to get fuse requests" on the queue 1083 */ 1084 static int fuse_uring_register(struct io_uring_cmd *cmd, 1085 unsigned int issue_flags, struct fuse_conn *fc) 1086 { 1087 const struct fuse_uring_cmd_req *cmd_req = io_uring_sqe128_cmd(cmd->sqe, 1088 struct fuse_uring_cmd_req); 1089 struct fuse_ring *ring = smp_load_acquire(&fc->ring); 1090 struct fuse_ring_queue *queue; 1091 struct fuse_ring_ent *ent; 1092 int err; 1093 unsigned int qid = READ_ONCE(cmd_req->qid); 1094 1095 err = -ENOMEM; 1096 if (!ring) { 1097 ring 
= fuse_uring_create(fc); 1098 if (!ring) 1099 return err; 1100 } 1101 1102 if (qid >= ring->nr_queues) { 1103 pr_info_ratelimited("fuse: Invalid ring qid %u\n", qid); 1104 return -EINVAL; 1105 } 1106 1107 queue = ring->queues[qid]; 1108 if (!queue) { 1109 queue = fuse_uring_create_queue(ring, qid); 1110 if (!queue) 1111 return err; 1112 } 1113 1114 /* 1115 * The created queue above does not need to be destructed in 1116 * case of entry errors below, will be done at ring destruction time. 1117 */ 1118 1119 ent = fuse_uring_create_ring_ent(cmd, queue); 1120 if (IS_ERR(ent)) 1121 return PTR_ERR(ent); 1122 1123 fuse_uring_do_register(ent, cmd, issue_flags); 1124 1125 return 0; 1126 } 1127 1128 /* 1129 * Entry function from io_uring to handle the given passthrough command 1130 * (op code IORING_OP_URING_CMD) 1131 */ 1132 int fuse_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) 1133 { 1134 struct fuse_dev *fud; 1135 struct fuse_conn *fc; 1136 u32 cmd_op = cmd->cmd_op; 1137 int err; 1138 1139 if ((unlikely(issue_flags & IO_URING_F_CANCEL))) { 1140 fuse_uring_cancel(cmd, issue_flags); 1141 return 0; 1142 } 1143 1144 /* This extra SQE size holds struct fuse_uring_cmd_req */ 1145 if (!(issue_flags & IO_URING_F_SQE128)) 1146 return -EINVAL; 1147 1148 fud = fuse_get_dev(cmd->file); 1149 if (IS_ERR(fud)) { 1150 pr_info_ratelimited("No fuse device found\n"); 1151 return PTR_ERR(fud); 1152 } 1153 fc = fud->fc; 1154 1155 /* Once a connection has io-uring enabled on it, it can't be disabled */ 1156 if (!enable_uring && !fc->io_uring) { 1157 pr_info_ratelimited("fuse-io-uring is disabled\n"); 1158 return -EOPNOTSUPP; 1159 } 1160 1161 if (fc->aborted) 1162 return -ECONNABORTED; 1163 if (!fc->connected) 1164 return -ENOTCONN; 1165 1166 /* 1167 * fuse_uring_register() needs the ring to be initialized, 1168 * we need to know the max payload size 1169 */ 1170 if (!fc->initialized) 1171 return -EAGAIN; 1172 1173 switch (cmd_op) { 1174 case FUSE_IO_URING_CMD_REGISTER: 1175 
err = fuse_uring_register(cmd, issue_flags, fc); 1176 if (err) { 1177 pr_info_once("FUSE_IO_URING_CMD_REGISTER failed err=%d\n", 1178 err); 1179 fc->io_uring = 0; 1180 wake_up_all(&fc->blocked_waitq); 1181 return err; 1182 } 1183 break; 1184 case FUSE_IO_URING_CMD_COMMIT_AND_FETCH: 1185 err = fuse_uring_commit_fetch(cmd, issue_flags, fc); 1186 if (err) { 1187 pr_info_once("FUSE_IO_URING_COMMIT_AND_FETCH failed err=%d\n", 1188 err); 1189 return err; 1190 } 1191 break; 1192 default: 1193 return -EINVAL; 1194 } 1195 1196 return -EIOCBQUEUED; 1197 } 1198 1199 static void fuse_uring_send(struct fuse_ring_ent *ent, struct io_uring_cmd *cmd, 1200 ssize_t ret, unsigned int issue_flags) 1201 { 1202 struct fuse_ring_queue *queue = ent->queue; 1203 1204 spin_lock(&queue->lock); 1205 ent->state = FRRS_USERSPACE; 1206 list_move_tail(&ent->list, &queue->ent_in_userspace); 1207 ent->cmd = NULL; 1208 spin_unlock(&queue->lock); 1209 1210 io_uring_cmd_done(cmd, ret, issue_flags); 1211 } 1212 1213 /* 1214 * This prepares and sends the ring request in fuse-uring task context. 1215 * User buffers are not mapped yet - the application does not have permission 1216 * to write to it - this has to be executed in ring task context. 
 */
static void fuse_uring_send_in_task(struct io_tw_req tw_req, io_tw_token_t tw)
{
	unsigned int issue_flags = IO_URING_CMD_TASK_WORK_ISSUE_FLAGS;
	struct io_uring_cmd *cmd = io_uring_cmd_from_tw(tw_req);
	struct fuse_ring_ent *ent = uring_cmd_to_ring_ent(cmd);
	struct fuse_ring_queue *queue = ent->queue;
	int err;

	if (!tw.cancel) {
		err = fuse_uring_prepare_send(ent, ent->fuse_req);
		if (err) {
			/* preparation failed, move on to the next request */
			fuse_uring_next_fuse_req(ent, queue, issue_flags);
			return;
		}
	} else {
		/* task work got canceled, complete the CQE with an error */
		err = -ECANCELED;
	}

	fuse_uring_send(ent, cmd, err, issue_flags);
}

/*
 * Map the current task's CPU to a ring queue (one queue per core).
 * Falls back to qid 0 (with a one-time warning) if the core number
 * exceeds the number of queues; may return NULL if the queue for the
 * qid was never created.
 */
static struct fuse_ring_queue *fuse_uring_task_to_queue(struct fuse_ring *ring)
{
	unsigned int qid;
	struct fuse_ring_queue *queue;

	qid = task_cpu(current);

	if (WARN_ONCE(qid >= ring->nr_queues,
		      "Core number (%u) exceeds nr queues (%zu)\n", qid,
		      ring->nr_queues))
		qid = 0;

	queue = ring->queues[qid];
	WARN_ONCE(!queue, "Missing queue for qid %d\n", qid);

	return queue;
}

/*
 * Hand the entry's command over to task work so the actual send runs
 * in the ring task's context (see fuse_uring_send_in_task()).
 */
static void fuse_uring_dispatch_ent(struct fuse_ring_ent *ent)
{
	struct io_uring_cmd *cmd = ent->cmd;

	uring_cmd_set_ring_ent(cmd, ent);
	io_uring_cmd_complete_in_task(cmd, fuse_uring_send_in_task);
}

/* queue a fuse request and send it if a ring entry is available */
void fuse_uring_queue_fuse_req(struct fuse_iqueue *fiq, struct fuse_req *req)
{
	struct fuse_conn *fc = req->fm->fc;
	struct fuse_ring *ring = fc->ring;
	struct fuse_ring_queue *queue;
	struct fuse_ring_ent *ent = NULL;
	int err;

	err = -EINVAL;
	queue = fuse_uring_task_to_queue(ring);
	if (!queue)
		goto err;

	fuse_request_assign_unique(fiq, req);

	spin_lock(&queue->lock);
	err = -ENOTCONN;
	if (unlikely(queue->stopped))
		goto err_unlock;

	set_bit(FR_URING, &req->flags);
	req->ring_queue =
queue;
	ent = list_first_entry_or_null(&queue->ent_avail_queue,
				       struct fuse_ring_ent, list);
	if (ent)
		fuse_uring_add_req_to_ring_ent(ent, req);
	else
		/* no free ring entry, park the request until one commits */
		list_add_tail(&req->list, &queue->fuse_req_queue);
	spin_unlock(&queue->lock);

	/* dispatch happens outside of queue->lock */
	if (ent)
		fuse_uring_dispatch_ent(ent);

	return;

err_unlock:
	spin_unlock(&queue->lock);
err:
	req->out.h.error = err;
	clear_bit(FR_PENDING, &req->flags);
	fuse_request_end(req);
}

/*
 * Queue a background fuse request on the per-CPU ring queue.
 * Returns false if no queue is available or the queue is stopped,
 * in which case the caller has to fall back to another path.
 */
bool fuse_uring_queue_bq_req(struct fuse_req *req)
{
	struct fuse_conn *fc = req->fm->fc;
	struct fuse_ring *ring = fc->ring;
	struct fuse_ring_queue *queue;
	struct fuse_ring_ent *ent = NULL;

	queue = fuse_uring_task_to_queue(ring);
	if (!queue)
		return false;

	spin_lock(&queue->lock);
	if (unlikely(queue->stopped)) {
		spin_unlock(&queue->lock);
		return false;
	}

	set_bit(FR_URING, &req->flags);
	req->ring_queue = queue;
	list_add_tail(&req->list, &queue->fuse_req_bg_queue);

	ent = list_first_entry_or_null(&queue->ent_avail_queue,
				       struct fuse_ring_ent, list);
	/* bg accounting: fc->bg_lock nests inside queue->lock */
	spin_lock(&fc->bg_lock);
	fc->num_background++;
	if (fc->num_background == fc->max_background)
		fc->blocked = 1;
	fuse_uring_flush_bg(queue);
	spin_unlock(&fc->bg_lock);

	/*
	 * Due to bg_queue flush limits there might be other bg requests
	 * in the queue that need to be handled first. Or no further req
	 * might be available.
 */
	req = list_first_entry_or_null(&queue->fuse_req_queue, struct fuse_req,
				       list);
	if (ent && req) {
		fuse_uring_add_req_to_ring_ent(ent, req);
		spin_unlock(&queue->lock);

		/* dispatch happens outside of queue->lock */
		fuse_uring_dispatch_ent(ent);
	} else {
		spin_unlock(&queue->lock);
	}

	return true;
}

/*
 * Remove a request that is still pending on its ring queue.
 * Returns the result of fuse_remove_pending_req() taken under the
 * queue lock the request was assigned to.
 */
bool fuse_uring_remove_pending_req(struct fuse_req *req)
{
	struct fuse_ring_queue *queue = req->ring_queue;

	return fuse_remove_pending_req(req, &queue->lock);
}

/* iqueue ops used once a connection communicates through io-uring */
static const struct fuse_iqueue_ops fuse_io_uring_ops = {
	/* should be send over io-uring as enhancement */
	.send_forget = fuse_dev_queue_forget,

	/*
	 * could be send over io-uring, but interrupts should be rare,
	 * no need to make the code complex
	 */
	.send_interrupt = fuse_dev_queue_interrupt,
	.send_req = fuse_uring_queue_fuse_req,
};