1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * FUSE: Filesystem in Userspace 4 * Copyright (c) 2023-2024 DataDirect Networks. 5 */ 6 7 #include "fuse_i.h" 8 #include "dev_uring_i.h" 9 #include "fuse_dev_i.h" 10 #include "fuse_trace.h" 11 12 #include <linux/fs.h> 13 #include <linux/io_uring/cmd.h> 14 15 static bool __read_mostly enable_uring; 16 module_param(enable_uring, bool, 0644); 17 MODULE_PARM_DESC(enable_uring, 18 "Enable userspace communication through io-uring"); 19 20 #define FUSE_URING_IOV_SEGS 2 /* header and payload */ 21 22 23 bool fuse_uring_enabled(void) 24 { 25 return enable_uring; 26 } 27 28 struct fuse_uring_pdu { 29 struct fuse_ring_ent *ent; 30 }; 31 32 static const struct fuse_iqueue_ops fuse_io_uring_ops; 33 34 static void uring_cmd_set_ring_ent(struct io_uring_cmd *cmd, 35 struct fuse_ring_ent *ring_ent) 36 { 37 struct fuse_uring_pdu *pdu = 38 io_uring_cmd_to_pdu(cmd, struct fuse_uring_pdu); 39 40 pdu->ent = ring_ent; 41 } 42 43 static struct fuse_ring_ent *uring_cmd_to_ring_ent(struct io_uring_cmd *cmd) 44 { 45 struct fuse_uring_pdu *pdu = 46 io_uring_cmd_to_pdu(cmd, struct fuse_uring_pdu); 47 48 return pdu->ent; 49 } 50 51 static void fuse_uring_flush_bg(struct fuse_ring_queue *queue) 52 { 53 struct fuse_ring *ring = queue->ring; 54 struct fuse_conn *fc = ring->fc; 55 56 lockdep_assert_held(&queue->lock); 57 lockdep_assert_held(&fc->bg_lock); 58 59 /* 60 * Allow one bg request per queue, ignoring global fc limits. 61 * This prevents a single queue from consuming all resources and 62 * eliminates the need for remote queue wake-ups when global 63 * limits are met but this queue has no more waiting requests. 64 */ 65 while ((fc->active_background < fc->max_background || 66 !queue->active_background) && 67 (!list_empty(&queue->fuse_req_bg_queue))) { 68 struct fuse_req *req; 69 70 req = list_first_entry(&queue->fuse_req_bg_queue, 71 struct fuse_req, list); 72 fc->active_background++; 73 queue->active_background++; 74 75 list_move_tail(&req->list, &queue->fuse_req_queue); 76 } 77 } 78 79 static void fuse_uring_req_end(struct fuse_ring_ent *ent, struct fuse_req *req, 80 int error) 81 { 82 struct fuse_ring_queue *queue = ent->queue; 83 struct fuse_ring *ring = queue->ring; 84 struct fuse_conn *fc = ring->fc; 85 86 lockdep_assert_not_held(&queue->lock); 87 spin_lock(&queue->lock); 88 ent->fuse_req = NULL; 89 if (test_bit(FR_BACKGROUND, &req->flags)) { 90 queue->active_background--; 91 spin_lock(&fc->bg_lock); 92 fuse_uring_flush_bg(queue); 93 spin_unlock(&fc->bg_lock); 94 } 95 96 spin_unlock(&queue->lock); 97 98 if (error) 99 req->out.h.error = error; 100 101 clear_bit(FR_SENT, &req->flags); 102 fuse_request_end(req); 103 } 104 105 /* Abort all list queued request on the given ring queue */ 106 static void fuse_uring_abort_end_queue_requests(struct fuse_ring_queue *queue) 107 { 108 struct fuse_req *req; 109 LIST_HEAD(req_list); 110 111 spin_lock(&queue->lock); 112 list_for_each_entry(req, &queue->fuse_req_queue, list) 113 clear_bit(FR_PENDING, &req->flags); 114 list_splice_init(&queue->fuse_req_queue, &req_list); 115 spin_unlock(&queue->lock); 116 117 /* must not hold queue lock to avoid order issues with fi->lock */ 118 fuse_dev_end_requests(&req_list); 119 } 120 121 void fuse_uring_abort_end_requests(struct fuse_ring *ring) 122 { 123 int qid; 124 struct fuse_ring_queue *queue; 125 struct fuse_conn *fc = ring->fc; 126 127 for (qid = 0; qid < ring->nr_queues; qid++) { 128 queue = READ_ONCE(ring->queues[qid]); 129 if (!queue) 130 continue; 131 132 queue->stopped = true; 133 134 WARN_ON_ONCE(ring->fc->max_background != UINT_MAX); 135 spin_lock(&queue->lock); 136 spin_lock(&fc->bg_lock); 137 fuse_uring_flush_bg(queue); 138 spin_unlock(&fc->bg_lock); 139 spin_unlock(&queue->lock); 140 fuse_uring_abort_end_queue_requests(queue); 141 } 142 } 143 144 static bool ent_list_request_expired(struct fuse_conn *fc, struct list_head *list) 145 { 146 struct fuse_ring_ent *ent; 147 struct fuse_req *req; 148 149 ent = list_first_entry_or_null(list, struct fuse_ring_ent, list); 150 if (!ent) 151 return false; 152 153 req = ent->fuse_req; 154 155 return time_is_before_jiffies(req->create_time + 156 fc->timeout.req_timeout); 157 } 158 159 bool fuse_uring_request_expired(struct fuse_conn *fc) 160 { 161 struct fuse_ring *ring = fc->ring; 162 struct fuse_ring_queue *queue; 163 int qid; 164 165 if (!ring) 166 return false; 167 168 for (qid = 0; qid < ring->nr_queues; qid++) { 169 queue = READ_ONCE(ring->queues[qid]); 170 if (!queue) 171 continue; 172 173 spin_lock(&queue->lock); 174 if (fuse_request_expired(fc, &queue->fuse_req_queue) || 175 fuse_request_expired(fc, &queue->fuse_req_bg_queue) || 176 ent_list_request_expired(fc, &queue->ent_w_req_queue) || 177 ent_list_request_expired(fc, &queue->ent_in_userspace)) { 178 spin_unlock(&queue->lock); 179 return true; 180 } 181 spin_unlock(&queue->lock); 182 } 183 184 return false; 185 } 186 187 void fuse_uring_destruct(struct fuse_conn *fc) 188 { 189 struct fuse_ring *ring = fc->ring; 190 int qid; 191 192 if (!ring) 193 return; 194 195 for (qid = 0; qid < ring->nr_queues; qid++) { 196 struct fuse_ring_queue *queue = ring->queues[qid]; 197 struct fuse_ring_ent *ent, *next; 198 199 if (!queue) 200 continue; 201 202 WARN_ON(!list_empty(&queue->ent_avail_queue)); 203 WARN_ON(!list_empty(&queue->ent_w_req_queue)); 204 WARN_ON(!list_empty(&queue->ent_commit_queue)); 205 WARN_ON(!list_empty(&queue->ent_in_userspace)); 206 207 list_for_each_entry_safe(ent, next, &queue->ent_released, 208 list) { 209 list_del_init(&ent->list); 210 kfree(ent); 211 } 212 213 kfree(queue->fpq.processing); 214 kfree(queue); 215 ring->queues[qid] = NULL; 216 } 217 218 kfree(ring->queues); 219 kfree(ring); 220 fc->ring = NULL; 221 } 222 223 /* 224 * Basic ring setup for this connection based on the provided configuration 225 */ 226 static struct fuse_ring *fuse_uring_create(struct fuse_conn *fc) 227 { 228 struct fuse_ring *ring; 229 size_t nr_queues = num_possible_cpus(); 230 struct fuse_ring *res = NULL; 231 size_t max_payload_size; 232 233 ring = kzalloc(sizeof(*fc->ring), GFP_KERNEL_ACCOUNT); 234 if (!ring) 235 return NULL; 236 237 ring->queues = kcalloc(nr_queues, sizeof(struct fuse_ring_queue *), 238 GFP_KERNEL_ACCOUNT); 239 if (!ring->queues) 240 goto out_err; 241 242 max_payload_size = max(FUSE_MIN_READ_BUFFER, fc->max_write); 243 max_payload_size = max(max_payload_size, fc->max_pages * PAGE_SIZE); 244 245 spin_lock(&fc->lock); 246 if (fc->ring) { 247 /* race, another thread created the ring in the meantime */ 248 spin_unlock(&fc->lock); 249 res = fc->ring; 250 goto out_err; 251 } 252 253 init_waitqueue_head(&ring->stop_waitq); 254 255 ring->nr_queues = nr_queues; 256 ring->fc = fc; 257 ring->max_payload_sz = max_payload_size; 258 smp_store_release(&fc->ring, ring); 259 260 spin_unlock(&fc->lock); 261 return ring; 262 263 out_err: 264 kfree(ring->queues); 265 kfree(ring); 266 return res; 267 } 268 269 static struct fuse_ring_queue *fuse_uring_create_queue(struct fuse_ring *ring, 270 int qid) 271 { 272 struct fuse_conn *fc = ring->fc; 273 struct fuse_ring_queue *queue; 274 struct list_head *pq; 275 276 queue = kzalloc(sizeof(*queue), GFP_KERNEL_ACCOUNT); 277 if (!queue) 278 return NULL; 279 pq = kcalloc(FUSE_PQ_HASH_SIZE, sizeof(struct list_head), GFP_KERNEL); 280 if (!pq) { 281 kfree(queue); 282 return NULL; 283 } 284 285 queue->qid = qid; 286 queue->ring = ring; 287 spin_lock_init(&queue->lock); 288 289 INIT_LIST_HEAD(&queue->ent_avail_queue); 290 INIT_LIST_HEAD(&queue->ent_commit_queue); 291 INIT_LIST_HEAD(&queue->ent_w_req_queue); 292 INIT_LIST_HEAD(&queue->ent_in_userspace); 293 INIT_LIST_HEAD(&queue->fuse_req_queue); 294 INIT_LIST_HEAD(&queue->fuse_req_bg_queue); 295 INIT_LIST_HEAD(&queue->ent_released); 296 297 queue->fpq.processing = pq; 298 fuse_pqueue_init(&queue->fpq); 299 300 spin_lock(&fc->lock); 301 if (ring->queues[qid]) { 302 spin_unlock(&fc->lock); 303 kfree(queue->fpq.processing); 304 kfree(queue); 305 return ring->queues[qid]; 306 } 307 308 /* 309 * write_once and lock as the caller mostly doesn't take the lock at all 310 */ 311 WRITE_ONCE(ring->queues[qid], queue); 312 spin_unlock(&fc->lock); 313 314 return queue; 315 } 316 317 static void fuse_uring_stop_fuse_req_end(struct fuse_req *req) 318 { 319 clear_bit(FR_SENT, &req->flags); 320 req->out.h.error = -ECONNABORTED; 321 fuse_request_end(req); 322 } 323 324 /* 325 * Release a request/entry on connection tear down 326 */ 327 static void fuse_uring_entry_teardown(struct fuse_ring_ent *ent) 328 { 329 struct fuse_req *req; 330 struct io_uring_cmd *cmd; 331 332 struct fuse_ring_queue *queue = ent->queue; 333 334 spin_lock(&queue->lock); 335 cmd = ent->cmd; 336 ent->cmd = NULL; 337 req = ent->fuse_req; 338 ent->fuse_req = NULL; 339 if (req) { 340 /* remove entry from queue->fpq->processing */ 341 list_del_init(&req->list); 342 } 343 344 /* 345 * The entry must not be freed immediately, due to access of direct 346 * pointer access of entries through IO_URING_F_CANCEL - there is a risk 347 * of race between daemon termination (which triggers IO_URING_F_CANCEL 348 * and accesses entries without checking the list state first 349 */ 350 list_move(&ent->list, &queue->ent_released); 351 ent->state = FRRS_RELEASED; 352 spin_unlock(&queue->lock); 353 354 if (cmd) 355 io_uring_cmd_done(cmd, -ENOTCONN, IO_URING_F_UNLOCKED); 356 357 if (req) 358 fuse_uring_stop_fuse_req_end(req); 359 } 360 361 static void fuse_uring_stop_list_entries(struct list_head *head, 362 struct fuse_ring_queue *queue, 363 enum fuse_ring_req_state exp_state) 364 { 365 struct fuse_ring *ring = queue->ring; 366 struct fuse_ring_ent *ent, *next; 367 ssize_t queue_refs = SSIZE_MAX; 368 LIST_HEAD(to_teardown); 369 370 spin_lock(&queue->lock); 371 list_for_each_entry_safe(ent, next, head, list) { 372 if (ent->state != exp_state) { 373 pr_warn("entry teardown qid=%d state=%d expected=%d", 374 queue->qid, ent->state, exp_state); 375 continue; 376 } 377 378 ent->state = FRRS_TEARDOWN; 379 list_move(&ent->list, &to_teardown); 380 } 381 spin_unlock(&queue->lock); 382 383 /* no queue lock to avoid lock order issues */ 384 list_for_each_entry_safe(ent, next, &to_teardown, list) { 385 fuse_uring_entry_teardown(ent); 386 queue_refs = atomic_dec_return(&ring->queue_refs); 387 WARN_ON_ONCE(queue_refs < 0); 388 } 389 } 390 391 static void fuse_uring_teardown_entries(struct fuse_ring_queue *queue) 392 { 393 fuse_uring_stop_list_entries(&queue->ent_in_userspace, queue, 394 FRRS_USERSPACE); 395 fuse_uring_stop_list_entries(&queue->ent_avail_queue, queue, 396 FRRS_AVAILABLE); 397 } 398 399 /* 400 * Log state debug info 401 */ 402 static void fuse_uring_log_ent_state(struct fuse_ring *ring) 403 { 404 int qid; 405 struct fuse_ring_ent *ent; 406 407 for (qid = 0; qid < ring->nr_queues; qid++) { 408 struct fuse_ring_queue *queue = ring->queues[qid]; 409 410 if (!queue) 411 continue; 412 413 spin_lock(&queue->lock); 414 /* 415 * Log entries from the intermediate queue, the other queues 416 * should be empty 417 */ 418 list_for_each_entry(ent, &queue->ent_w_req_queue, list) { 419 pr_info(" ent-req-queue ring=%p qid=%d ent=%p state=%d\n", 420 ring, qid, ent, ent->state); 421 } 422 list_for_each_entry(ent, &queue->ent_commit_queue, list) { 423 pr_info(" ent-commit-queue ring=%p qid=%d ent=%p state=%d\n", 424 ring, qid, ent, ent->state); 425 } 426 spin_unlock(&queue->lock); 427 } 428 ring->stop_debug_log = 1; 429 } 430 431 static void fuse_uring_async_stop_queues(struct work_struct *work) 432 { 433 int qid; 434 struct fuse_ring *ring = 435 container_of(work, struct fuse_ring, async_teardown_work.work); 436 437 /* XXX code dup */ 438 for (qid = 0; qid < ring->nr_queues; qid++) { 439 struct fuse_ring_queue *queue = READ_ONCE(ring->queues[qid]); 440 441 if (!queue) 442 continue; 443 444 fuse_uring_teardown_entries(queue); 445 } 446 447 /* 448 * Some ring entries might be in the middle of IO operations, 449 * i.e. in process to get handled by file_operations::uring_cmd 450 * or on the way to userspace - we could handle that with conditions in 451 * run time code, but easier/cleaner to have an async tear down handler 452 * If there are still queue references left 453 */ 454 if (atomic_read(&ring->queue_refs) > 0) { 455 if (time_after(jiffies, 456 ring->teardown_time + FUSE_URING_TEARDOWN_TIMEOUT)) 457 fuse_uring_log_ent_state(ring); 458 459 schedule_delayed_work(&ring->async_teardown_work, 460 FUSE_URING_TEARDOWN_INTERVAL); 461 } else { 462 wake_up_all(&ring->stop_waitq); 463 } 464 } 465 466 /* 467 * Stop the ring queues 468 */ 469 void fuse_uring_stop_queues(struct fuse_ring *ring) 470 { 471 int qid; 472 473 for (qid = 0; qid < ring->nr_queues; qid++) { 474 struct fuse_ring_queue *queue = READ_ONCE(ring->queues[qid]); 475 476 if (!queue) 477 continue; 478 479 fuse_uring_teardown_entries(queue); 480 } 481 482 if (atomic_read(&ring->queue_refs) > 0) { 483 ring->teardown_time = jiffies; 484 INIT_DELAYED_WORK(&ring->async_teardown_work, 485 fuse_uring_async_stop_queues); 486 schedule_delayed_work(&ring->async_teardown_work, 487 FUSE_URING_TEARDOWN_INTERVAL); 488 } else { 489 wake_up_all(&ring->stop_waitq); 490 } 491 } 492 493 /* 494 * Handle IO_URING_F_CANCEL, typically should come on daemon termination. 495 * 496 * Releasing the last entry should trigger fuse_dev_release() if 497 * the daemon was terminated 498 */ 499 static void fuse_uring_cancel(struct io_uring_cmd *cmd, 500 unsigned int issue_flags) 501 { 502 struct fuse_ring_ent *ent = uring_cmd_to_ring_ent(cmd); 503 struct fuse_ring_queue *queue; 504 bool need_cmd_done = false; 505 506 /* 507 * direct access on ent - it must not be destructed as long as 508 * IO_URING_F_CANCEL might come up 509 */ 510 queue = ent->queue; 511 spin_lock(&queue->lock); 512 if (ent->state == FRRS_AVAILABLE) { 513 ent->state = FRRS_USERSPACE; 514 list_move_tail(&ent->list, &queue->ent_in_userspace); 515 need_cmd_done = true; 516 ent->cmd = NULL; 517 } 518 spin_unlock(&queue->lock); 519 520 if (need_cmd_done) { 521 /* no queue lock to avoid lock order issues */ 522 io_uring_cmd_done(cmd, -ENOTCONN, issue_flags); 523 } 524 } 525 526 static void fuse_uring_prepare_cancel(struct io_uring_cmd *cmd, int issue_flags, 527 struct fuse_ring_ent *ring_ent) 528 { 529 uring_cmd_set_ring_ent(cmd, ring_ent); 530 io_uring_cmd_mark_cancelable(cmd, issue_flags); 531 } 532 533 /* 534 * Checks for errors and stores it into the request 535 */ 536 static int fuse_uring_out_header_has_err(struct fuse_out_header *oh, 537 struct fuse_req *req, 538 struct fuse_conn *fc) 539 { 540 int err; 541 542 err = -EINVAL; 543 if (oh->unique == 0) { 544 /* Not supported through io-uring yet */ 545 pr_warn_once("notify through fuse-io-uring not supported\n"); 546 goto err; 547 } 548 549 if (oh->error <= -ERESTARTSYS || oh->error > 0) 550 goto err; 551 552 if (oh->error) { 553 err = oh->error; 554 goto err; 555 } 556 557 err = -ENOENT; 558 if ((oh->unique & ~FUSE_INT_REQ_BIT) != req->in.h.unique) { 559 pr_warn_ratelimited("unique mismatch, expected: %llu got %llu\n", 560 req->in.h.unique, 561 oh->unique & ~FUSE_INT_REQ_BIT); 562 goto err; 563 } 564 565 /* 566 * Is it an interrupt reply ID? 567 * XXX: Not supported through fuse-io-uring yet, it should not even 568 * find the request - should not happen. 569 */ 570 WARN_ON_ONCE(oh->unique & FUSE_INT_REQ_BIT); 571 572 err = 0; 573 err: 574 return err; 575 } 576 577 static int fuse_uring_copy_from_ring(struct fuse_ring *ring, 578 struct fuse_req *req, 579 struct fuse_ring_ent *ent) 580 { 581 struct fuse_copy_state cs; 582 struct fuse_args *args = req->args; 583 struct iov_iter iter; 584 int err; 585 struct fuse_uring_ent_in_out ring_in_out; 586 587 err = copy_from_user(&ring_in_out, &ent->headers->ring_ent_in_out, 588 sizeof(ring_in_out)); 589 if (err) 590 return -EFAULT; 591 592 err = import_ubuf(ITER_SOURCE, ent->payload, ring->max_payload_sz, 593 &iter); 594 if (err) 595 return err; 596 597 fuse_copy_init(&cs, false, &iter); 598 cs.is_uring = true; 599 cs.req = req; 600 601 return fuse_copy_out_args(&cs, args, ring_in_out.payload_sz); 602 } 603 604 /* 605 * Copy data from the req to the ring buffer 606 */ 607 static int fuse_uring_args_to_ring(struct fuse_ring *ring, struct fuse_req *req, 608 struct fuse_ring_ent *ent) 609 { 610 struct fuse_copy_state cs; 611 struct fuse_args *args = req->args; 612 struct fuse_in_arg *in_args = args->in_args; 613 int num_args = args->in_numargs; 614 int err; 615 struct iov_iter iter; 616 struct fuse_uring_ent_in_out ent_in_out = { 617 .flags = 0, 618 .commit_id = req->in.h.unique, 619 }; 620 621 err = import_ubuf(ITER_DEST, ent->payload, ring->max_payload_sz, &iter); 622 if (err) { 623 pr_info_ratelimited("fuse: Import of user buffer failed\n"); 624 return err; 625 } 626 627 fuse_copy_init(&cs, true, &iter); 628 cs.is_uring = true; 629 cs.req = req; 630 631 if (num_args > 0) { 632 /* 633 * Expectation is that the first argument is the per op header. 634 * Some op code have that as zero size. 635 */ 636 if (args->in_args[0].size > 0) { 637 err = copy_to_user(&ent->headers->op_in, in_args->value, 638 in_args->size); 639 if (err) { 640 pr_info_ratelimited( 641 "Copying the header failed.\n"); 642 return -EFAULT; 643 } 644 } 645 in_args++; 646 num_args--; 647 } 648 649 /* copy the payload */ 650 err = fuse_copy_args(&cs, num_args, args->in_pages, 651 (struct fuse_arg *)in_args, 0); 652 if (err) { 653 pr_info_ratelimited("%s fuse_copy_args failed\n", __func__); 654 return err; 655 } 656 657 ent_in_out.payload_sz = cs.ring.copied_sz; 658 err = copy_to_user(&ent->headers->ring_ent_in_out, &ent_in_out, 659 sizeof(ent_in_out)); 660 return err ? -EFAULT : 0; 661 } 662 663 static int fuse_uring_copy_to_ring(struct fuse_ring_ent *ent, 664 struct fuse_req *req) 665 { 666 struct fuse_ring_queue *queue = ent->queue; 667 struct fuse_ring *ring = queue->ring; 668 int err; 669 670 err = -EIO; 671 if (WARN_ON(ent->state != FRRS_FUSE_REQ)) { 672 pr_err("qid=%d ring-req=%p invalid state %d on send\n", 673 queue->qid, ent, ent->state); 674 return err; 675 } 676 677 err = -EINVAL; 678 if (WARN_ON(req->in.h.unique == 0)) 679 return err; 680 681 /* copy the request */ 682 err = fuse_uring_args_to_ring(ring, req, ent); 683 if (unlikely(err)) { 684 pr_info_ratelimited("Copy to ring failed: %d\n", err); 685 return err; 686 } 687 688 /* copy fuse_in_header */ 689 err = copy_to_user(&ent->headers->in_out, &req->in.h, 690 sizeof(req->in.h)); 691 if (err) { 692 err = -EFAULT; 693 return err; 694 } 695 696 return 0; 697 } 698 699 static int fuse_uring_prepare_send(struct fuse_ring_ent *ent, 700 struct fuse_req *req) 701 { 702 int err; 703 704 err = fuse_uring_copy_to_ring(ent, req); 705 if (!err) 706 set_bit(FR_SENT, &req->flags); 707 else 708 fuse_uring_req_end(ent, req, err); 709 710 return err; 711 } 712 713 /* 714 * Write data to the ring buffer and send the request to userspace, 715 * userspace will read it 716 * This is comparable with classical read(/dev/fuse) 717 */ 718 static int fuse_uring_send_next_to_ring(struct fuse_ring_ent *ent, 719 struct fuse_req *req, 720 unsigned int issue_flags) 721 { 722 struct fuse_ring_queue *queue = ent->queue; 723 int err; 724 struct io_uring_cmd *cmd; 725 726 err = fuse_uring_prepare_send(ent, req); 727 if (err) 728 return err; 729 730 spin_lock(&queue->lock); 731 cmd = ent->cmd; 732 ent->cmd = NULL; 733 ent->state = FRRS_USERSPACE; 734 list_move_tail(&ent->list, &queue->ent_in_userspace); 735 spin_unlock(&queue->lock); 736 737 io_uring_cmd_done(cmd, 0, issue_flags); 738 return 0; 739 } 740 741 /* 742 * Make a ring entry available for fuse_req assignment 743 */ 744 static void fuse_uring_ent_avail(struct fuse_ring_ent *ent, 745 struct fuse_ring_queue *queue) 746 { 747 WARN_ON_ONCE(!ent->cmd); 748 list_move(&ent->list, &queue->ent_avail_queue); 749 ent->state = FRRS_AVAILABLE; 750 } 751 752 /* Used to find the request on SQE commit */ 753 static void fuse_uring_add_to_pq(struct fuse_ring_ent *ent, 754 struct fuse_req *req) 755 { 756 struct fuse_ring_queue *queue = ent->queue; 757 struct fuse_pqueue *fpq = &queue->fpq; 758 unsigned int hash; 759 760 req->ring_entry = ent; 761 hash = fuse_req_hash(req->in.h.unique); 762 list_move_tail(&req->list, &fpq->processing[hash]); 763 } 764 765 /* 766 * Assign a fuse queue entry to the given entry 767 */ 768 static void fuse_uring_add_req_to_ring_ent(struct fuse_ring_ent *ent, 769 struct fuse_req *req) 770 { 771 struct fuse_ring_queue *queue = ent->queue; 772 773 lockdep_assert_held(&queue->lock); 774 775 if (WARN_ON_ONCE(ent->state != FRRS_AVAILABLE && 776 ent->state != FRRS_COMMIT)) { 777 pr_warn("%s qid=%d state=%d\n", __func__, ent->queue->qid, 778 ent->state); 779 } 780 781 clear_bit(FR_PENDING, &req->flags); 782 ent->fuse_req = req; 783 ent->state = FRRS_FUSE_REQ; 784 list_move_tail(&ent->list, &queue->ent_w_req_queue); 785 fuse_uring_add_to_pq(ent, req); 786 } 787 788 /* Fetch the next fuse request if available */ 789 static struct fuse_req *fuse_uring_ent_assign_req(struct fuse_ring_ent *ent) 790 __must_hold(&queue->lock) 791 { 792 struct fuse_req *req; 793 struct fuse_ring_queue *queue = ent->queue; 794 struct list_head *req_queue = &queue->fuse_req_queue; 795 796 lockdep_assert_held(&queue->lock); 797 798 /* get and assign the next entry while it is still holding the lock */ 799 req = list_first_entry_or_null(req_queue, struct fuse_req, list); 800 if (req) 801 fuse_uring_add_req_to_ring_ent(ent, req); 802 803 return req; 804 } 805 806 /* 807 * Read data from the ring buffer, which user space has written to 808 * This is comparible with handling of classical write(/dev/fuse). 809 * Also make the ring request available again for new fuse requests. 810 */ 811 static void fuse_uring_commit(struct fuse_ring_ent *ent, struct fuse_req *req, 812 unsigned int issue_flags) 813 { 814 struct fuse_ring *ring = ent->queue->ring; 815 struct fuse_conn *fc = ring->fc; 816 ssize_t err = 0; 817 818 err = copy_from_user(&req->out.h, &ent->headers->in_out, 819 sizeof(req->out.h)); 820 if (err) { 821 req->out.h.error = -EFAULT; 822 goto out; 823 } 824 825 err = fuse_uring_out_header_has_err(&req->out.h, req, fc); 826 if (err) { 827 /* req->out.h.error already set */ 828 goto out; 829 } 830 831 err = fuse_uring_copy_from_ring(ring, req, ent); 832 out: 833 fuse_uring_req_end(ent, req, err); 834 } 835 836 /* 837 * Get the next fuse req and send it 838 */ 839 static void fuse_uring_next_fuse_req(struct fuse_ring_ent *ent, 840 struct fuse_ring_queue *queue, 841 unsigned int issue_flags) 842 { 843 int err; 844 struct fuse_req *req; 845 846 retry: 847 spin_lock(&queue->lock); 848 fuse_uring_ent_avail(ent, queue); 849 req = fuse_uring_ent_assign_req(ent); 850 spin_unlock(&queue->lock); 851 852 if (req) { 853 err = fuse_uring_send_next_to_ring(ent, req, issue_flags); 854 if (err) 855 goto retry; 856 } 857 } 858 859 static int fuse_ring_ent_set_commit(struct fuse_ring_ent *ent) 860 { 861 struct fuse_ring_queue *queue = ent->queue; 862 863 lockdep_assert_held(&queue->lock); 864 865 if (WARN_ON_ONCE(ent->state != FRRS_USERSPACE)) 866 return -EIO; 867 868 ent->state = FRRS_COMMIT; 869 list_move(&ent->list, &queue->ent_commit_queue); 870 871 return 0; 872 } 873 874 /* FUSE_URING_CMD_COMMIT_AND_FETCH handler */ 875 static int fuse_uring_commit_fetch(struct io_uring_cmd *cmd, int issue_flags, 876 struct fuse_conn *fc) 877 { 878 const struct fuse_uring_cmd_req *cmd_req = io_uring_sqe_cmd(cmd->sqe); 879 struct fuse_ring_ent *ent; 880 int err; 881 struct fuse_ring *ring = fc->ring; 882 struct fuse_ring_queue *queue; 883 uint64_t commit_id = READ_ONCE(cmd_req->commit_id); 884 unsigned int qid = READ_ONCE(cmd_req->qid); 885 struct fuse_pqueue *fpq; 886 struct fuse_req *req; 887 888 err = -ENOTCONN; 889 if (!ring) 890 return err; 891 892 if (qid >= ring->nr_queues) 893 return -EINVAL; 894 895 queue = ring->queues[qid]; 896 if (!queue) 897 return err; 898 fpq = &queue->fpq; 899 900 if (!READ_ONCE(fc->connected) || READ_ONCE(queue->stopped)) 901 return err; 902 903 spin_lock(&queue->lock); 904 /* Find a request based on the unique ID of the fuse request 905 * This should get revised, as it needs a hash calculation and list 906 * search. And full struct fuse_pqueue is needed (memory overhead). 907 * As well as the link from req to ring_ent. 908 */ 909 req = fuse_request_find(fpq, commit_id); 910 err = -ENOENT; 911 if (!req) { 912 pr_info("qid=%d commit_id %llu not found\n", queue->qid, 913 commit_id); 914 spin_unlock(&queue->lock); 915 return err; 916 } 917 list_del_init(&req->list); 918 ent = req->ring_entry; 919 req->ring_entry = NULL; 920 921 err = fuse_ring_ent_set_commit(ent); 922 if (err != 0) { 923 pr_info_ratelimited("qid=%d commit_id %llu state %d", 924 queue->qid, commit_id, ent->state); 925 spin_unlock(&queue->lock); 926 req->out.h.error = err; 927 clear_bit(FR_SENT, &req->flags); 928 fuse_request_end(req); 929 return err; 930 } 931 932 ent->cmd = cmd; 933 spin_unlock(&queue->lock); 934 935 /* without the queue lock, as other locks are taken */ 936 fuse_uring_prepare_cancel(cmd, issue_flags, ent); 937 fuse_uring_commit(ent, req, issue_flags); 938 939 /* 940 * Fetching the next request is absolutely required as queued 941 * fuse requests would otherwise not get processed - committing 942 * and fetching is done in one step vs legacy fuse, which has separated 943 * read (fetch request) and write (commit result). 944 */ 945 fuse_uring_next_fuse_req(ent, queue, issue_flags); 946 return 0; 947 } 948 949 static bool is_ring_ready(struct fuse_ring *ring, int current_qid) 950 { 951 int qid; 952 struct fuse_ring_queue *queue; 953 bool ready = true; 954 955 for (qid = 0; qid < ring->nr_queues && ready; qid++) { 956 if (current_qid == qid) 957 continue; 958 959 queue = ring->queues[qid]; 960 if (!queue) { 961 ready = false; 962 break; 963 } 964 965 spin_lock(&queue->lock); 966 if (list_empty(&queue->ent_avail_queue)) 967 ready = false; 968 spin_unlock(&queue->lock); 969 } 970 971 return ready; 972 } 973 974 /* 975 * fuse_uring_req_fetch command handling 976 */ 977 static void fuse_uring_do_register(struct fuse_ring_ent *ent, 978 struct io_uring_cmd *cmd, 979 unsigned int issue_flags) 980 { 981 struct fuse_ring_queue *queue = ent->queue; 982 struct fuse_ring *ring = queue->ring; 983 struct fuse_conn *fc = ring->fc; 984 struct fuse_iqueue *fiq = &fc->iq; 985 986 fuse_uring_prepare_cancel(cmd, issue_flags, ent); 987 988 spin_lock(&queue->lock); 989 ent->cmd = cmd; 990 fuse_uring_ent_avail(ent, queue); 991 spin_unlock(&queue->lock); 992 993 if (!ring->ready) { 994 bool ready = is_ring_ready(ring, queue->qid); 995 996 if (ready) { 997 WRITE_ONCE(fiq->ops, &fuse_io_uring_ops); 998 WRITE_ONCE(ring->ready, true); 999 wake_up_all(&fc->blocked_waitq); 1000 } 1001 } 1002 } 1003 1004 /* 1005 * sqe->addr is a ptr to an iovec array, iov[0] has the headers, iov[1] 1006 * the payload 1007 */ 1008 static int fuse_uring_get_iovec_from_sqe(const struct io_uring_sqe *sqe, 1009 struct iovec iov[FUSE_URING_IOV_SEGS]) 1010 { 1011 struct iovec __user *uiov = u64_to_user_ptr(READ_ONCE(sqe->addr)); 1012 struct iov_iter iter; 1013 ssize_t ret; 1014 1015 if (sqe->len != FUSE_URING_IOV_SEGS) 1016 return -EINVAL; 1017 1018 /* 1019 * Direction for buffer access will actually be READ and WRITE, 1020 * using write for the import should include READ access as well. 1021 */ 1022 ret = import_iovec(WRITE, uiov, FUSE_URING_IOV_SEGS, 1023 FUSE_URING_IOV_SEGS, &iov, &iter); 1024 if (ret < 0) 1025 return ret; 1026 1027 return 0; 1028 } 1029 1030 static struct fuse_ring_ent * 1031 fuse_uring_create_ring_ent(struct io_uring_cmd *cmd, 1032 struct fuse_ring_queue *queue) 1033 { 1034 struct fuse_ring *ring = queue->ring; 1035 struct fuse_ring_ent *ent; 1036 size_t payload_size; 1037 struct iovec iov[FUSE_URING_IOV_SEGS]; 1038 int err; 1039 1040 err = fuse_uring_get_iovec_from_sqe(cmd->sqe, iov); 1041 if (err) { 1042 pr_info_ratelimited("Failed to get iovec from sqe, err=%d\n", 1043 err); 1044 return ERR_PTR(err); 1045 } 1046 1047 err = -EINVAL; 1048 if (iov[0].iov_len < sizeof(struct fuse_uring_req_header)) { 1049 pr_info_ratelimited("Invalid header len %zu\n", iov[0].iov_len); 1050 return ERR_PTR(err); 1051 } 1052 1053 payload_size = iov[1].iov_len; 1054 if (payload_size < ring->max_payload_sz) { 1055 pr_info_ratelimited("Invalid req payload len %zu\n", 1056 payload_size); 1057 return ERR_PTR(err); 1058 } 1059 1060 err = -ENOMEM; 1061 ent = kzalloc(sizeof(*ent), GFP_KERNEL_ACCOUNT); 1062 if (!ent) 1063 return ERR_PTR(err); 1064 1065 INIT_LIST_HEAD(&ent->list); 1066 1067 ent->queue = queue; 1068 ent->headers = iov[0].iov_base; 1069 ent->payload = iov[1].iov_base; 1070 1071 atomic_inc(&ring->queue_refs); 1072 return ent; 1073 } 1074 1075 /* 1076 * Register header and payload buffer with the kernel and puts the 1077 * entry as "ready to get fuse requests" on the queue 1078 */ 1079 static int fuse_uring_register(struct io_uring_cmd *cmd, 1080 unsigned int issue_flags, struct fuse_conn *fc) 1081 { 1082 const struct fuse_uring_cmd_req *cmd_req = io_uring_sqe_cmd(cmd->sqe); 1083 struct fuse_ring *ring = smp_load_acquire(&fc->ring); 1084 struct fuse_ring_queue *queue; 1085 struct fuse_ring_ent *ent; 1086 int err; 1087 unsigned int qid = READ_ONCE(cmd_req->qid); 1088 1089 err = -ENOMEM; 1090 if (!ring) { 1091 ring = fuse_uring_create(fc); 1092 if (!ring) 1093 return err; 1094 } 1095 1096 if (qid >= ring->nr_queues) { 1097 pr_info_ratelimited("fuse: Invalid ring qid %u\n", qid); 1098 return -EINVAL; 1099 } 1100 1101 queue = ring->queues[qid]; 1102 if (!queue) { 1103 queue = fuse_uring_create_queue(ring, qid); 1104 if (!queue) 1105 return err; 1106 } 1107 1108 /* 1109 * The created queue above does not need to be destructed in 1110 * case of entry errors below, will be done at ring destruction time. 1111 */ 1112 1113 ent = fuse_uring_create_ring_ent(cmd, queue); 1114 if (IS_ERR(ent)) 1115 return PTR_ERR(ent); 1116 1117 fuse_uring_do_register(ent, cmd, issue_flags); 1118 1119 return 0; 1120 } 1121 1122 /* 1123 * Entry function from io_uring to handle the given passthrough command 1124 * (op code IORING_OP_URING_CMD) 1125 */ 1126 int fuse_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) 1127 { 1128 struct fuse_dev *fud; 1129 struct fuse_conn *fc; 1130 u32 cmd_op = cmd->cmd_op; 1131 int err; 1132 1133 if ((unlikely(issue_flags & IO_URING_F_CANCEL))) { 1134 fuse_uring_cancel(cmd, issue_flags); 1135 return 0; 1136 } 1137 1138 /* This extra SQE size holds struct fuse_uring_cmd_req */ 1139 if (!(issue_flags & IO_URING_F_SQE128)) 1140 return -EINVAL; 1141 1142 fud = fuse_get_dev(cmd->file); 1143 if (IS_ERR(fud)) { 1144 pr_info_ratelimited("No fuse device found\n"); 1145 return PTR_ERR(fud); 1146 } 1147 fc = fud->fc; 1148 1149 /* Once a connection has io-uring enabled on it, it can't be disabled */ 1150 if (!enable_uring && !fc->io_uring) { 1151 pr_info_ratelimited("fuse-io-uring is disabled\n"); 1152 return -EOPNOTSUPP; 1153 } 1154 1155 if (fc->aborted) 1156 return -ECONNABORTED; 1157 if (!fc->connected) 1158 return -ENOTCONN; 1159 1160 /* 1161 * fuse_uring_register() needs the ring to be initialized, 1162 * we need to know the max payload size 1163 */ 1164 if (!fc->initialized) 1165 return -EAGAIN; 1166 1167 switch (cmd_op) { 1168 case FUSE_IO_URING_CMD_REGISTER: 1169 err = fuse_uring_register(cmd, issue_flags, fc); 1170 if (err) { 1171 pr_info_once("FUSE_IO_URING_CMD_REGISTER failed err=%d\n", 1172 err); 1173 fc->io_uring = 0; 1174 wake_up_all(&fc->blocked_waitq); 1175 return err; 1176 } 1177 break; 1178 case FUSE_IO_URING_CMD_COMMIT_AND_FETCH: 1179 err = fuse_uring_commit_fetch(cmd, issue_flags, fc); 1180 if (err) { 1181 pr_info_once("FUSE_IO_URING_COMMIT_AND_FETCH failed err=%d\n", 1182 err); 1183 return err; 1184 } 1185 break; 1186 default: 1187 return -EINVAL; 1188 } 1189 1190 return -EIOCBQUEUED; 1191 } 1192 1193 static void fuse_uring_send(struct fuse_ring_ent *ent, struct io_uring_cmd *cmd, 1194 ssize_t ret, unsigned int issue_flags) 1195 { 1196 struct fuse_ring_queue *queue = ent->queue; 1197 1198 spin_lock(&queue->lock); 1199 ent->state = FRRS_USERSPACE; 1200 list_move_tail(&ent->list, &queue->ent_in_userspace); 1201 ent->cmd = NULL; 1202 spin_unlock(&queue->lock); 1203 1204 io_uring_cmd_done(cmd, ret, issue_flags); 1205 } 1206 1207 /* 1208 * This prepares and sends the ring request in fuse-uring task context. 1209 * User buffers are not mapped yet - the application does not have permission 1210 * to write to it - this has to be executed in ring task context. 1211 */ 1212 static void fuse_uring_send_in_task(struct io_uring_cmd *cmd, 1213 unsigned int issue_flags) 1214 { 1215 struct fuse_ring_ent *ent = uring_cmd_to_ring_ent(cmd); 1216 struct fuse_ring_queue *queue = ent->queue; 1217 int err; 1218 1219 if (!(issue_flags & IO_URING_F_TASK_DEAD)) { 1220 err = fuse_uring_prepare_send(ent, ent->fuse_req); 1221 if (err) { 1222 fuse_uring_next_fuse_req(ent, queue, issue_flags); 1223 return; 1224 } 1225 } else { 1226 err = -ECANCELED; 1227 } 1228 1229 fuse_uring_send(ent, cmd, err, issue_flags); 1230 } 1231 1232 static struct fuse_ring_queue *fuse_uring_task_to_queue(struct fuse_ring *ring) 1233 { 1234 unsigned int qid; 1235 struct fuse_ring_queue *queue; 1236 1237 qid = task_cpu(current); 1238 1239 if (WARN_ONCE(qid >= ring->nr_queues, 1240 "Core number (%u) exceeds nr queues (%zu)\n", qid, 1241 ring->nr_queues)) 1242 qid = 0; 1243 1244 queue = ring->queues[qid]; 1245 WARN_ONCE(!queue, "Missing queue for qid %d\n", qid); 1246 1247 return queue; 1248 } 1249 1250 static void fuse_uring_dispatch_ent(struct fuse_ring_ent *ent) 1251 { 1252 struct io_uring_cmd *cmd = ent->cmd; 1253 1254 uring_cmd_set_ring_ent(cmd, ent); 1255 io_uring_cmd_complete_in_task(cmd, fuse_uring_send_in_task); 1256 } 1257 1258 /* queue a fuse request and send it if a ring entry is available */ 1259 void fuse_uring_queue_fuse_req(struct fuse_iqueue *fiq, struct fuse_req *req) 1260 { 1261 struct fuse_conn *fc = req->fm->fc; 1262 struct fuse_ring *ring = fc->ring; 1263 struct fuse_ring_queue *queue; 1264 struct fuse_ring_ent *ent = NULL; 1265 int err; 1266 1267 err = -EINVAL; 1268 queue = fuse_uring_task_to_queue(ring); 1269 if (!queue) 1270 goto err; 1271 1272 fuse_request_assign_unique(fiq, req); 1273 1274 spin_lock(&queue->lock); 1275 err = -ENOTCONN; 1276 if (unlikely(queue->stopped)) 1277 goto err_unlock; 1278 1279 set_bit(FR_URING, &req->flags); 1280 req->ring_queue = queue; 1281 ent = list_first_entry_or_null(&queue->ent_avail_queue, 1282 struct fuse_ring_ent, list); 1283 if (ent) 1284 fuse_uring_add_req_to_ring_ent(ent, req); 1285 else 1286 list_add_tail(&req->list, &queue->fuse_req_queue); 1287 spin_unlock(&queue->lock); 1288 1289 if (ent) 1290 fuse_uring_dispatch_ent(ent); 1291 1292 return; 1293 1294 err_unlock: 1295 spin_unlock(&queue->lock); 1296 err: 1297 req->out.h.error = err; 1298 clear_bit(FR_PENDING, &req->flags); 1299 fuse_request_end(req); 1300 } 1301 1302 bool fuse_uring_queue_bq_req(struct fuse_req *req) 1303 { 1304 struct fuse_conn *fc = req->fm->fc; 1305 struct fuse_ring *ring = fc->ring; 1306 struct fuse_ring_queue *queue; 1307 struct fuse_ring_ent *ent = NULL; 1308 1309 queue = fuse_uring_task_to_queue(ring); 1310 if (!queue) 1311 return false; 1312 1313 spin_lock(&queue->lock); 1314 if (unlikely(queue->stopped)) { 1315 spin_unlock(&queue->lock); 1316 return false; 1317 } 1318 1319 set_bit(FR_URING, &req->flags); 1320 req->ring_queue = queue; 1321 list_add_tail(&req->list, &queue->fuse_req_bg_queue); 1322 1323 ent = list_first_entry_or_null(&queue->ent_avail_queue, 1324 struct fuse_ring_ent, list); 1325 spin_lock(&fc->bg_lock); 1326 fc->num_background++; 1327 if (fc->num_background == fc->max_background) 1328 fc->blocked = 1; 1329 fuse_uring_flush_bg(queue); 1330 spin_unlock(&fc->bg_lock); 1331 1332 /* 1333 * Due to bg_queue flush limits there might be other bg requests 1334 * in the queue that need to be handled first. Or no further req 1335 * might be available. 1336 */ 1337 req = list_first_entry_or_null(&queue->fuse_req_queue, struct fuse_req, 1338 list); 1339 if (ent && req) { 1340 fuse_uring_add_req_to_ring_ent(ent, req); 1341 spin_unlock(&queue->lock); 1342 1343 fuse_uring_dispatch_ent(ent); 1344 } else { 1345 spin_unlock(&queue->lock); 1346 } 1347 1348 return true; 1349 } 1350 1351 bool fuse_uring_remove_pending_req(struct fuse_req *req) 1352 { 1353 struct fuse_ring_queue *queue = req->ring_queue; 1354 1355 return fuse_remove_pending_req(req, &queue->lock); 1356 } 1357 1358 static const struct fuse_iqueue_ops fuse_io_uring_ops = { 1359 /* should be send over io-uring as enhancement */ 1360 .send_forget = fuse_dev_queue_forget, 1361 1362 /* 1363 * could be send over io-uring, but interrupts should be rare, 1364 * no need to make the code complex 1365 */ 1366 .send_interrupt = fuse_dev_queue_interrupt, 1367 .send_req = fuse_uring_queue_fuse_req, 1368 }; 1369