1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 FUSE: Filesystem in Userspace 4 Copyright (C) 2001-2008 Miklos Szeredi <miklos@szeredi.hu> 5 */ 6 7 #include "fuse_i.h" 8 #include "dev.h" 9 10 #include <linux/pagemap.h> 11 #include <linux/slab.h> 12 #include <linux/kernel.h> 13 #include <linux/sched.h> 14 #include <linux/sched/signal.h> 15 #include <linux/module.h> 16 #include <linux/swap.h> 17 #include <linux/falloc.h> 18 #include <linux/uio.h> 19 #include <linux/fs.h> 20 #include <linux/filelock.h> 21 #include <linux/splice.h> 22 #include <linux/task_io_accounting_ops.h> 23 #include <linux/iomap.h> 24 25 static int fuse_send_open(struct fuse_mount *fm, u64 nodeid, 26 unsigned int open_flags, int opcode, 27 struct fuse_open_out *outargp) 28 { 29 struct fuse_open_in inarg; 30 FUSE_ARGS(args); 31 32 memset(&inarg, 0, sizeof(inarg)); 33 inarg.flags = open_flags & ~(O_CREAT | O_EXCL | O_NOCTTY); 34 if (!fm->fc->atomic_o_trunc) 35 inarg.flags &= ~O_TRUNC; 36 37 if (fm->fc->handle_killpriv_v2 && 38 (inarg.flags & O_TRUNC) && !capable(CAP_FSETID)) { 39 inarg.open_flags |= FUSE_OPEN_KILL_SUIDGID; 40 } 41 42 args.opcode = opcode; 43 args.nodeid = nodeid; 44 args.in_numargs = 1; 45 args.in_args[0].size = sizeof(inarg); 46 args.in_args[0].value = &inarg; 47 args.out_numargs = 1; 48 args.out_args[0].size = sizeof(*outargp); 49 args.out_args[0].value = outargp; 50 51 return fuse_simple_request(fm, &args); 52 } 53 54 struct fuse_file *fuse_file_alloc(struct fuse_mount *fm, bool release) 55 { 56 struct fuse_file *ff; 57 58 ff = kzalloc_obj(struct fuse_file, GFP_KERNEL_ACCOUNT); 59 if (unlikely(!ff)) 60 return NULL; 61 62 ff->fm = fm; 63 if (release) { 64 ff->args = kzalloc_obj(*ff->args, GFP_KERNEL_ACCOUNT); 65 if (!ff->args) { 66 kfree(ff); 67 return NULL; 68 } 69 } 70 71 INIT_LIST_HEAD(&ff->write_entry); 72 refcount_set(&ff->count, 1); 73 RB_CLEAR_NODE(&ff->polled_node); 74 init_waitqueue_head(&ff->poll_wait); 75 76 ff->kh = atomic64_inc_return(&fm->fc->khctr); 77 78 return ff; 79 } 80 81 void fuse_file_free(struct fuse_file *ff) 82 { 83 kfree(ff->args); 84 kfree(ff); 85 } 86 87 static struct fuse_file *fuse_file_get(struct fuse_file *ff) 88 { 89 refcount_inc(&ff->count); 90 return ff; 91 } 92 93 static void fuse_release_end(struct fuse_args *args, int error) 94 { 95 struct fuse_release_args *ra = container_of(args, typeof(*ra), args); 96 97 iput(ra->inode); 98 kfree(ra); 99 } 100 101 static void fuse_file_put(struct fuse_file *ff, bool sync) 102 { 103 if (refcount_dec_and_test(&ff->count)) { 104 struct fuse_release_args *ra = &ff->args->release_args; 105 struct fuse_args *args = (ra ? &ra->args : NULL); 106 107 if (ra && ra->inode) 108 fuse_file_io_release(ff, ra->inode); 109 110 if (!args) { 111 /* Do nothing when server does not implement 'opendir' */ 112 } else if (args->opcode == FUSE_RELEASE && ff->fm->fc->no_open) { 113 fuse_release_end(args, 0); 114 } else if (sync) { 115 fuse_simple_request(ff->fm, args); 116 fuse_release_end(args, 0); 117 } else { 118 /* 119 * DAX inodes may need to issue a number of synchronous 120 * request for clearing the mappings. 121 */ 122 if (ra && ra->inode && FUSE_IS_DAX(ra->inode)) 123 args->may_block = true; 124 args->end = fuse_release_end; 125 if (fuse_simple_background(ff->fm, args, 126 GFP_KERNEL | __GFP_NOFAIL)) 127 fuse_release_end(args, -ENOTCONN); 128 } 129 kfree(ff); 130 } 131 } 132 133 struct fuse_file *fuse_file_open(struct fuse_mount *fm, u64 nodeid, 134 unsigned int open_flags, bool isdir) 135 { 136 struct fuse_conn *fc = fm->fc; 137 struct fuse_file *ff; 138 int opcode = isdir ? FUSE_OPENDIR : FUSE_OPEN; 139 bool open = isdir ? !fc->no_opendir : !fc->no_open; 140 bool release = !isdir || open; 141 142 /* 143 * ff->args->release_args still needs to be allocated (so we can hold an 144 * inode reference while there are pending inflight file operations when 145 * ->release() is called, see fuse_prepare_release()) even if 146 * fc->no_open is set else it becomes possible for reclaim to deadlock 147 * if while servicing the readahead request the server triggers reclaim 148 * and reclaim evicts the inode of the file being read ahead. 149 */ 150 ff = fuse_file_alloc(fm, release); 151 if (!ff) 152 return ERR_PTR(-ENOMEM); 153 154 ff->fh = 0; 155 /* Default for no-open */ 156 ff->open_flags = FOPEN_KEEP_CACHE | (isdir ? FOPEN_CACHE_DIR : 0); 157 if (open) { 158 /* Store outarg for fuse_finish_open() */ 159 struct fuse_open_out *outargp = &ff->args->open_outarg; 160 int err; 161 162 err = fuse_send_open(fm, nodeid, open_flags, opcode, outargp); 163 if (!err) { 164 ff->fh = outargp->fh; 165 ff->open_flags = outargp->open_flags; 166 } else if (err != -ENOSYS) { 167 fuse_file_free(ff); 168 return ERR_PTR(err); 169 } else { 170 if (isdir) { 171 /* No release needed */ 172 kfree(ff->args); 173 ff->args = NULL; 174 fc->no_opendir = 1; 175 } else { 176 fc->no_open = 1; 177 } 178 } 179 } 180 181 if (isdir) 182 ff->open_flags &= ~FOPEN_DIRECT_IO; 183 184 ff->nodeid = nodeid; 185 186 return ff; 187 } 188 189 int fuse_do_open(struct fuse_mount *fm, u64 nodeid, struct file *file, 190 bool isdir) 191 { 192 struct fuse_file *ff = fuse_file_open(fm, nodeid, file->f_flags, isdir); 193 194 if (!IS_ERR(ff)) 195 file->private_data = ff; 196 197 return PTR_ERR_OR_ZERO(ff); 198 } 199 EXPORT_SYMBOL_GPL(fuse_do_open); 200 201 static void fuse_link_write_file(struct file *file) 202 { 203 struct inode *inode = file_inode(file); 204 struct fuse_inode *fi = get_fuse_inode(inode); 205 struct fuse_file *ff = file->private_data; 206 /* 207 * file may be written through mmap, so chain it onto the 208 * inodes's write_file list 209 */ 210 spin_lock(&fi->lock); 211 if (list_empty(&ff->write_entry)) 212 list_add(&ff->write_entry, &fi->write_files); 213 spin_unlock(&fi->lock); 214 } 215 216 int fuse_finish_open(struct inode *inode, struct file *file) 217 { 218 struct fuse_file *ff = file->private_data; 219 struct fuse_conn *fc = get_fuse_conn(inode); 220 int err; 221 222 err = fuse_file_io_open(file, inode); 223 if (err) 224 return err; 225 226 if (ff->open_flags & FOPEN_STREAM) 227 stream_open(inode, file); 228 else if (ff->open_flags & FOPEN_NONSEEKABLE) 229 nonseekable_open(inode, file); 230 231 if ((file->f_mode & FMODE_WRITE) && fc->writeback_cache) 232 fuse_link_write_file(file); 233 234 return 0; 235 } 236 237 static void fuse_truncate_update_attr(struct inode *inode, struct file *file) 238 { 239 struct fuse_conn *fc = get_fuse_conn(inode); 240 struct fuse_inode *fi = get_fuse_inode(inode); 241 242 spin_lock(&fi->lock); 243 fi->attr_version = atomic64_inc_return(&fc->attr_version); 244 i_size_write(inode, 0); 245 spin_unlock(&fi->lock); 246 file_update_time(file); 247 fuse_invalidate_attr_mask(inode, FUSE_STATX_MODSIZE); 248 } 249 250 static int fuse_open(struct inode *inode, struct file *file) 251 { 252 struct fuse_mount *fm = get_fuse_mount(inode); 253 struct fuse_inode *fi = get_fuse_inode(inode); 254 struct fuse_conn *fc = fm->fc; 255 struct fuse_file *ff; 256 int err; 257 bool is_truncate = (file->f_flags & O_TRUNC) && fc->atomic_o_trunc; 258 bool is_wb_truncate = is_truncate && fc->writeback_cache; 259 bool dax_truncate = is_truncate && FUSE_IS_DAX(inode); 260 261 if (fuse_is_bad(inode)) 262 return -EIO; 263 264 err = generic_file_open(inode, file); 265 if (err) 266 return err; 267 268 if (is_wb_truncate || dax_truncate) 269 inode_lock(inode); 270 271 if (dax_truncate) { 272 filemap_invalidate_lock(inode->i_mapping); 273 err = fuse_dax_break_layouts(inode, 0, -1); 274 if (err) 275 goto out_inode_unlock; 276 } 277 278 if (is_wb_truncate || dax_truncate) 279 fuse_set_nowrite(inode); 280 281 err = fuse_do_open(fm, get_node_id(inode), file, false); 282 if (!err) { 283 ff = file->private_data; 284 err = fuse_finish_open(inode, file); 285 if (err) 286 fuse_sync_release(fi, ff, file->f_flags); 287 else if (is_truncate) 288 fuse_truncate_update_attr(inode, file); 289 } 290 291 if (is_wb_truncate || dax_truncate) 292 fuse_release_nowrite(inode); 293 if (!err) { 294 if (is_truncate) 295 truncate_pagecache(inode, 0); 296 else if (!(ff->open_flags & FOPEN_KEEP_CACHE)) 297 invalidate_inode_pages2(inode->i_mapping); 298 } 299 if (dax_truncate) 300 filemap_invalidate_unlock(inode->i_mapping); 301 out_inode_unlock: 302 if (is_wb_truncate || dax_truncate) 303 inode_unlock(inode); 304 305 return err; 306 } 307 308 static void fuse_prepare_release(struct fuse_inode *fi, struct fuse_file *ff, 309 unsigned int flags, int opcode, bool sync) 310 { 311 struct fuse_conn *fc = ff->fm->fc; 312 struct fuse_release_args *ra = &ff->args->release_args; 313 314 if (fuse_file_passthrough(ff)) 315 fuse_passthrough_release(ff, fuse_inode_backing(fi)); 316 317 /* Inode is NULL on error path of fuse_create_open() */ 318 if (likely(fi)) { 319 spin_lock(&fi->lock); 320 list_del(&ff->write_entry); 321 spin_unlock(&fi->lock); 322 } 323 spin_lock(&fc->lock); 324 if (!RB_EMPTY_NODE(&ff->polled_node)) 325 rb_erase(&ff->polled_node, &fc->polled_files); 326 spin_unlock(&fc->lock); 327 328 wake_up_interruptible_all(&ff->poll_wait); 329 330 if (!ra) 331 return; 332 333 /* ff->args was used for open outarg */ 334 memset(ff->args, 0, sizeof(*ff->args)); 335 ra->inarg.fh = ff->fh; 336 ra->inarg.flags = flags; 337 ra->args.in_numargs = 1; 338 ra->args.in_args[0].size = sizeof(struct fuse_release_in); 339 ra->args.in_args[0].value = &ra->inarg; 340 ra->args.opcode = opcode; 341 ra->args.nodeid = ff->nodeid; 342 ra->args.force = true; 343 ra->args.nocreds = true; 344 345 /* 346 * Hold inode until release is finished. 347 * From fuse_sync_release() the refcount is 1 and everything's 348 * synchronous, so we are fine with not doing igrab() here. 349 */ 350 ra->inode = sync ? NULL : igrab(&fi->inode); 351 } 352 353 void fuse_file_release(struct inode *inode, struct fuse_file *ff, 354 unsigned int open_flags, fl_owner_t id, bool isdir) 355 { 356 struct fuse_inode *fi = get_fuse_inode(inode); 357 struct fuse_release_args *ra = &ff->args->release_args; 358 int opcode = isdir ? FUSE_RELEASEDIR : FUSE_RELEASE; 359 360 fuse_prepare_release(fi, ff, open_flags, opcode, false); 361 362 if (ra && ff->flock) { 363 ra->inarg.release_flags |= FUSE_RELEASE_FLOCK_UNLOCK; 364 ra->inarg.lock_owner = fuse_lock_owner_id(ff->fm->fc, id); 365 } 366 367 /* 368 * Normally this will send the RELEASE request, however if 369 * some asynchronous READ or WRITE requests are outstanding, 370 * the sending will be delayed. 371 * 372 * Make the release synchronous if this is a fuseblk mount, 373 * synchronous RELEASE is allowed (and desirable) in this case 374 * because the server can be trusted not to screw up. 375 * 376 * Always use the asynchronous file put because the current thread 377 * might be the fuse server. This can happen if a process starts some 378 * aio and closes the fd before the aio completes. Since aio takes its 379 * own ref to the file, the IO completion has to drop the ref, which is 380 * how the fuse server can end up closing its clients' files. 381 * 382 * Exception is virtio-fs, which is not affected by the above (server is 383 * on host, cannot close open files in guest). Virtio-fs needs sync 384 * release, because the num_waiting mechanism to wait for all requests 385 * before commencing with fs shutdown doesn't work if submounts are 386 * used. 387 */ 388 fuse_file_put(ff, ff->fm->fc->auto_submounts); 389 } 390 391 void fuse_release_common(struct file *file, bool isdir) 392 { 393 fuse_file_release(file_inode(file), file->private_data, file->f_flags, 394 (fl_owner_t) file, isdir); 395 } 396 397 static int fuse_release(struct inode *inode, struct file *file) 398 { 399 struct fuse_conn *fc = get_fuse_conn(inode); 400 401 /* 402 * Dirty pages might remain despite write_inode_now() call from 403 * fuse_flush() due to writes racing with the close. 404 */ 405 if (fc->writeback_cache) 406 write_inode_now(inode, 1); 407 408 fuse_release_common(file, false); 409 410 /* return value is ignored by VFS */ 411 return 0; 412 } 413 414 void fuse_sync_release(struct fuse_inode *fi, struct fuse_file *ff, 415 unsigned int flags) 416 { 417 WARN_ON(refcount_read(&ff->count) > 1); 418 fuse_prepare_release(fi, ff, flags, FUSE_RELEASE, true); 419 fuse_file_put(ff, true); 420 } 421 EXPORT_SYMBOL_GPL(fuse_sync_release); 422 423 /* 424 * Scramble the ID space with XTEA, so that the value of the files_struct 425 * pointer is not exposed to userspace. 426 */ 427 u64 fuse_lock_owner_id(struct fuse_conn *fc, fl_owner_t id) 428 { 429 u32 *k = fc->scramble_key; 430 u64 v = (unsigned long) id; 431 u32 v0 = v; 432 u32 v1 = v >> 32; 433 u32 sum = 0; 434 int i; 435 436 for (i = 0; i < 32; i++) { 437 v0 += ((v1 << 4 ^ v1 >> 5) + v1) ^ (sum + k[sum & 3]); 438 sum += 0x9E3779B9; 439 v1 += ((v0 << 4 ^ v0 >> 5) + v0) ^ (sum + k[sum>>11 & 3]); 440 } 441 442 return (u64) v0 + ((u64) v1 << 32); 443 } 444 445 struct fuse_writepage_args { 446 struct fuse_io_args ia; 447 struct list_head queue_entry; 448 struct inode *inode; 449 struct fuse_sync_bucket *bucket; 450 }; 451 452 /* 453 * Wait for all pending writepages on the inode to finish. 454 * 455 * This is currently done by blocking further writes with FUSE_NOWRITE 456 * and waiting for all sent writes to complete. 457 * 458 * This must be called under i_mutex, otherwise the FUSE_NOWRITE usage 459 * could conflict with truncation. 460 */ 461 static void fuse_sync_writes(struct inode *inode) 462 { 463 fuse_set_nowrite(inode); 464 fuse_release_nowrite(inode); 465 } 466 467 static int fuse_flush(struct file *file, fl_owner_t id) 468 { 469 struct inode *inode = file_inode(file); 470 struct fuse_mount *fm = get_fuse_mount(inode); 471 struct fuse_file *ff = file->private_data; 472 struct fuse_flush_in inarg; 473 FUSE_ARGS(args); 474 int err; 475 476 if (fuse_is_bad(inode)) 477 return -EIO; 478 479 if (ff->open_flags & FOPEN_NOFLUSH && !fm->fc->writeback_cache) 480 return 0; 481 482 err = write_inode_now(inode, 1); 483 if (err) 484 return err; 485 486 err = filemap_check_errors(file->f_mapping); 487 if (err) 488 return err; 489 490 err = 0; 491 if (fm->fc->no_flush) 492 goto inval_attr_out; 493 494 memset(&inarg, 0, sizeof(inarg)); 495 inarg.fh = ff->fh; 496 inarg.lock_owner = fuse_lock_owner_id(fm->fc, id); 497 args.opcode = FUSE_FLUSH; 498 args.nodeid = get_node_id(inode); 499 args.in_numargs = 1; 500 args.in_args[0].size = sizeof(inarg); 501 args.in_args[0].value = &inarg; 502 args.force = true; 503 504 err = fuse_simple_request(fm, &args); 505 if (err == -ENOSYS) { 506 fm->fc->no_flush = 1; 507 err = 0; 508 } 509 510 inval_attr_out: 511 /* 512 * In memory i_blocks is not maintained by fuse, if writeback cache is 513 * enabled, i_blocks from cached attr may not be accurate. 514 */ 515 if (!err && fm->fc->writeback_cache) 516 fuse_invalidate_attr_mask(inode, STATX_BLOCKS); 517 return err; 518 } 519 520 int fuse_fsync_common(struct file *file, loff_t start, loff_t end, 521 int datasync, int opcode) 522 { 523 struct inode *inode = file->f_mapping->host; 524 struct fuse_mount *fm = get_fuse_mount(inode); 525 struct fuse_file *ff = file->private_data; 526 FUSE_ARGS(args); 527 struct fuse_fsync_in inarg; 528 529 memset(&inarg, 0, sizeof(inarg)); 530 inarg.fh = ff->fh; 531 inarg.fsync_flags = datasync ? FUSE_FSYNC_FDATASYNC : 0; 532 args.opcode = opcode; 533 args.nodeid = get_node_id(inode); 534 args.in_numargs = 1; 535 args.in_args[0].size = sizeof(inarg); 536 args.in_args[0].value = &inarg; 537 return fuse_simple_request(fm, &args); 538 } 539 540 static int fuse_fsync(struct file *file, loff_t start, loff_t end, 541 int datasync) 542 { 543 struct inode *inode = file->f_mapping->host; 544 struct fuse_conn *fc = get_fuse_conn(inode); 545 int err; 546 547 if (fuse_is_bad(inode)) 548 return -EIO; 549 550 inode_lock(inode); 551 552 /* 553 * Start writeback against all dirty pages of the inode, then 554 * wait for all outstanding writes, before sending the FSYNC 555 * request. 556 */ 557 err = file_write_and_wait_range(file, start, end); 558 if (err) 559 goto out; 560 561 fuse_sync_writes(inode); 562 563 /* 564 * Due to implementation of fuse writeback 565 * file_write_and_wait_range() does not catch errors. 566 * We have to do this directly after fuse_sync_writes() 567 */ 568 err = file_check_and_advance_wb_err(file); 569 if (err) 570 goto out; 571 572 err = sync_inode_metadata(inode, 1); 573 if (err) 574 goto out; 575 576 if (fc->no_fsync) 577 goto out; 578 579 err = fuse_fsync_common(file, start, end, datasync, FUSE_FSYNC); 580 if (err == -ENOSYS) { 581 fc->no_fsync = 1; 582 err = 0; 583 } 584 out: 585 inode_unlock(inode); 586 587 return err; 588 } 589 590 void fuse_read_args_fill(struct fuse_io_args *ia, struct file *file, loff_t pos, 591 size_t count, int opcode) 592 { 593 struct fuse_file *ff = file->private_data; 594 struct fuse_args *args = &ia->ap.args; 595 596 ia->read.in.fh = ff->fh; 597 ia->read.in.offset = pos; 598 ia->read.in.size = count; 599 ia->read.in.flags = file->f_flags; 600 args->opcode = opcode; 601 args->nodeid = ff->nodeid; 602 args->in_numargs = 1; 603 args->in_args[0].size = sizeof(ia->read.in); 604 args->in_args[0].value = &ia->read.in; 605 args->out_argvar = true; 606 args->out_numargs = 1; 607 args->out_args[0].size = count; 608 } 609 610 static void fuse_release_user_pages(struct fuse_args_pages *ap, ssize_t nres, 611 bool should_dirty) 612 { 613 unsigned int i; 614 615 for (i = 0; i < ap->num_folios; i++) { 616 if (should_dirty) 617 folio_mark_dirty_lock(ap->folios[i]); 618 if (ap->args.is_pinned) 619 unpin_folio(ap->folios[i]); 620 } 621 622 if (nres > 0 && ap->args.invalidate_vmap) 623 invalidate_kernel_vmap_range(ap->args.vmap_base, nres); 624 } 625 626 static void fuse_io_release(struct kref *kref) 627 { 628 kfree(container_of(kref, struct fuse_io_priv, refcnt)); 629 } 630 631 static ssize_t fuse_get_res_by_io(struct fuse_io_priv *io) 632 { 633 if (io->err) 634 return io->err; 635 636 if (io->bytes >= 0 && io->write) 637 return -EIO; 638 639 return io->bytes < 0 ? io->size : io->bytes; 640 } 641 642 static void fuse_aio_invalidate_worker(struct work_struct *work) 643 { 644 struct fuse_io_priv *io = container_of(work, struct fuse_io_priv, work); 645 struct address_space *mapping = io->iocb->ki_filp->f_mapping; 646 ssize_t res = fuse_get_res_by_io(io); 647 pgoff_t start = io->offset >> PAGE_SHIFT; 648 pgoff_t end = (io->offset + res - 1) >> PAGE_SHIFT; 649 650 invalidate_inode_pages2_range(mapping, start, end); 651 io->iocb->ki_complete(io->iocb, res); 652 kref_put(&io->refcnt, fuse_io_release); 653 } 654 655 /* 656 * In case of short read, the caller sets 'pos' to the position of 657 * actual end of fuse request in IO request. Otherwise, if bytes_requested 658 * == bytes_transferred or rw == WRITE, the caller sets 'pos' to -1. 659 * 660 * An example: 661 * User requested DIO read of 64K. It was split into two 32K fuse requests, 662 * both submitted asynchronously. The first of them was ACKed by userspace as 663 * fully completed (req->out.args[0].size == 32K) resulting in pos == -1. The 664 * second request was ACKed as short, e.g. only 1K was read, resulting in 665 * pos == 33K. 666 * 667 * Thus, when all fuse requests are completed, the minimal non-negative 'pos' 668 * will be equal to the length of the longest contiguous fragment of 669 * transferred data starting from the beginning of IO request. 670 */ 671 static void fuse_aio_complete(struct fuse_io_priv *io, int err, ssize_t pos) 672 { 673 int left; 674 675 spin_lock(&io->lock); 676 if (err) 677 io->err = io->err ? : err; 678 else if (pos >= 0 && (io->bytes < 0 || pos < io->bytes)) 679 io->bytes = pos; 680 681 left = --io->reqs; 682 if (!left && io->blocking) 683 complete(io->done); 684 spin_unlock(&io->lock); 685 686 if (!left && !io->blocking) { 687 struct inode *inode = file_inode(io->iocb->ki_filp); 688 struct address_space *mapping = io->iocb->ki_filp->f_mapping; 689 ssize_t res = fuse_get_res_by_io(io); 690 691 if (res >= 0) { 692 struct fuse_conn *fc = get_fuse_conn(inode); 693 struct fuse_inode *fi = get_fuse_inode(inode); 694 695 spin_lock(&fi->lock); 696 fi->attr_version = atomic64_inc_return(&fc->attr_version); 697 spin_unlock(&fi->lock); 698 } 699 700 if (io->write && res > 0 && mapping->nrpages) { 701 /* 702 * As in generic_file_direct_write(), invalidate after the 703 * write, to invalidate read-ahead cache that may have competed 704 * with the write. 705 */ 706 INIT_WORK(&io->work, fuse_aio_invalidate_worker); 707 queue_work(inode->i_sb->s_dio_done_wq, &io->work); 708 return; 709 } 710 711 io->iocb->ki_complete(io->iocb, res); 712 } 713 714 kref_put(&io->refcnt, fuse_io_release); 715 } 716 717 static struct fuse_io_args *fuse_io_alloc(struct fuse_io_priv *io, 718 unsigned int nfolios) 719 { 720 struct fuse_io_args *ia; 721 722 ia = kzalloc_obj(*ia); 723 if (ia) { 724 ia->io = io; 725 ia->ap.folios = fuse_folios_alloc(nfolios, GFP_KERNEL, 726 &ia->ap.descs); 727 if (!ia->ap.folios) { 728 kfree(ia); 729 ia = NULL; 730 } 731 } 732 return ia; 733 } 734 735 static void fuse_io_free(struct fuse_io_args *ia) 736 { 737 kfree(ia->ap.folios); 738 kfree(ia); 739 } 740 741 static void fuse_aio_complete_req(struct fuse_args *args, int err) 742 { 743 struct fuse_io_args *ia = container_of(args, typeof(*ia), ap.args); 744 struct fuse_io_priv *io = ia->io; 745 ssize_t pos = -1; 746 size_t nres; 747 748 if (err) { 749 /* Nothing */ 750 } else if (io->write) { 751 if (ia->write.out.size > ia->write.in.size) { 752 err = -EIO; 753 } else { 754 nres = ia->write.out.size; 755 if (ia->write.in.size != ia->write.out.size) 756 pos = ia->write.in.offset - io->offset + 757 ia->write.out.size; 758 } 759 } else { 760 u32 outsize = args->out_args[0].size; 761 762 nres = outsize; 763 if (ia->read.in.size != outsize) 764 pos = ia->read.in.offset - io->offset + outsize; 765 } 766 767 fuse_release_user_pages(&ia->ap, err ?: nres, io->should_dirty); 768 769 fuse_aio_complete(io, err, pos); 770 fuse_io_free(ia); 771 } 772 773 static ssize_t fuse_async_req_send(struct fuse_mount *fm, 774 struct fuse_io_args *ia, size_t num_bytes) 775 { 776 ssize_t err; 777 struct fuse_io_priv *io = ia->io; 778 779 spin_lock(&io->lock); 780 kref_get(&io->refcnt); 781 io->size += num_bytes; 782 io->reqs++; 783 spin_unlock(&io->lock); 784 785 ia->ap.args.end = fuse_aio_complete_req; 786 ia->ap.args.may_block = io->should_dirty; 787 err = fuse_simple_background(fm, &ia->ap.args, GFP_KERNEL); 788 if (err) 789 fuse_aio_complete_req(&ia->ap.args, err); 790 791 return num_bytes; 792 } 793 794 static ssize_t fuse_send_read(struct fuse_io_args *ia, loff_t pos, size_t count, 795 fl_owner_t owner) 796 { 797 struct file *file = ia->io->iocb->ki_filp; 798 struct fuse_file *ff = file->private_data; 799 struct fuse_mount *fm = ff->fm; 800 801 fuse_read_args_fill(ia, file, pos, count, FUSE_READ); 802 if (owner != NULL) { 803 ia->read.in.read_flags |= FUSE_READ_LOCKOWNER; 804 ia->read.in.lock_owner = fuse_lock_owner_id(fm->fc, owner); 805 } 806 807 if (ia->io->async) 808 return fuse_async_req_send(fm, ia, count); 809 810 return fuse_simple_request(fm, &ia->ap.args); 811 } 812 813 static void fuse_read_update_size(struct inode *inode, loff_t size, 814 u64 attr_ver) 815 { 816 struct fuse_conn *fc = get_fuse_conn(inode); 817 struct fuse_inode *fi = get_fuse_inode(inode); 818 819 spin_lock(&fi->lock); 820 if (attr_ver >= fi->attr_version && size < inode->i_size && 821 !test_bit(FUSE_I_SIZE_UNSTABLE, &fi->state)) { 822 fi->attr_version = atomic64_inc_return(&fc->attr_version); 823 i_size_write(inode, size); 824 } 825 spin_unlock(&fi->lock); 826 } 827 828 static void fuse_short_read(struct inode *inode, u64 attr_ver, size_t num_read, 829 struct fuse_args_pages *ap) 830 { 831 struct fuse_conn *fc = get_fuse_conn(inode); 832 833 /* 834 * If writeback_cache is enabled, a short read means there's a hole in 835 * the file. Some data after the hole is in page cache, but has not 836 * reached the client fs yet. So the hole is not present there. 837 */ 838 if (!fc->writeback_cache) { 839 loff_t pos = folio_pos(ap->folios[0]) + num_read; 840 fuse_read_update_size(inode, pos, attr_ver); 841 } 842 } 843 844 static int fuse_do_readfolio(struct file *file, struct folio *folio, 845 size_t off, size_t len) 846 { 847 struct inode *inode = folio->mapping->host; 848 struct fuse_mount *fm = get_fuse_mount(inode); 849 loff_t pos = folio_pos(folio) + off; 850 struct fuse_folio_desc desc = { 851 .offset = off, 852 .length = len, 853 }; 854 struct fuse_io_args ia = { 855 .ap.args.page_zeroing = true, 856 .ap.args.out_pages = true, 857 .ap.num_folios = 1, 858 .ap.folios = &folio, 859 .ap.descs = &desc, 860 }; 861 ssize_t res; 862 u64 attr_ver; 863 864 attr_ver = fuse_get_attr_version(fm->fc); 865 866 /* Don't overflow end offset */ 867 if (pos + (desc.length - 1) == LLONG_MAX) 868 desc.length--; 869 870 fuse_read_args_fill(&ia, file, pos, desc.length, FUSE_READ); 871 res = fuse_simple_request(fm, &ia.ap.args); 872 if (res < 0) 873 return res; 874 /* 875 * Short read means EOF. If file size is larger, truncate it 876 */ 877 if (res < desc.length) 878 fuse_short_read(inode, attr_ver, res, &ia.ap); 879 880 return 0; 881 } 882 883 static int fuse_iomap_begin(struct inode *inode, loff_t offset, loff_t length, 884 unsigned int flags, struct iomap *iomap, 885 struct iomap *srcmap) 886 { 887 iomap->type = IOMAP_MAPPED; 888 iomap->length = length; 889 iomap->offset = offset; 890 return 0; 891 } 892 893 static const struct iomap_ops fuse_iomap_ops = { 894 .iomap_begin = fuse_iomap_begin, 895 }; 896 897 struct fuse_fill_read_data { 898 struct file *file; 899 900 /* Fields below are used if sending the read request asynchronously */ 901 struct fuse_conn *fc; 902 struct fuse_io_args *ia; 903 unsigned int nr_bytes; 904 }; 905 906 /* forward declarations */ 907 static bool fuse_folios_need_send(struct fuse_conn *fc, loff_t pos, 908 unsigned len, struct fuse_args_pages *ap, 909 unsigned cur_bytes, bool write); 910 static void fuse_send_readpages(struct fuse_io_args *ia, struct file *file, 911 unsigned int count, bool async); 912 913 static int fuse_handle_readahead(struct folio *folio, 914 struct readahead_control *rac, 915 struct fuse_fill_read_data *data, loff_t pos, 916 size_t len) 917 { 918 struct fuse_io_args *ia = data->ia; 919 size_t off = offset_in_folio(folio, pos); 920 struct fuse_conn *fc = data->fc; 921 struct fuse_args_pages *ap; 922 unsigned int nr_pages; 923 924 if (ia && fuse_folios_need_send(fc, pos, len, &ia->ap, data->nr_bytes, 925 false)) { 926 fuse_send_readpages(ia, data->file, data->nr_bytes, 927 fc->async_read); 928 data->nr_bytes = 0; 929 data->ia = NULL; 930 ia = NULL; 931 } 932 if (!ia) { 933 if (fuse_chan_num_background(fc->chan) >= fc->congestion_threshold && 934 rac->ra->async_size >= readahead_count(rac)) 935 /* 936 * Congested and only async pages left, so skip the 937 * rest. 938 */ 939 return -EAGAIN; 940 941 nr_pages = min(fc->max_pages, readahead_count(rac)); 942 data->ia = fuse_io_alloc(NULL, nr_pages); 943 if (!data->ia) 944 return -ENOMEM; 945 ia = data->ia; 946 } 947 folio_get(folio); 948 ap = &ia->ap; 949 ap->folios[ap->num_folios] = folio; 950 ap->descs[ap->num_folios].offset = off; 951 ap->descs[ap->num_folios].length = len; 952 data->nr_bytes += len; 953 ap->num_folios++; 954 955 return 0; 956 } 957 958 static int fuse_iomap_read_folio_range_async(const struct iomap_iter *iter, 959 struct iomap_read_folio_ctx *ctx, 960 size_t len) 961 { 962 struct fuse_fill_read_data *data = ctx->read_ctx; 963 struct folio *folio = ctx->cur_folio; 964 loff_t pos = iter->pos; 965 size_t off = offset_in_folio(folio, pos); 966 struct file *file = data->file; 967 int ret; 968 969 if (ctx->rac) { 970 ret = fuse_handle_readahead(folio, ctx->rac, data, pos, len); 971 } else { 972 /* 973 * for non-readahead read requests, do reads synchronously 974 * since it's not guaranteed that the server can handle 975 * out-of-order reads 976 */ 977 ret = fuse_do_readfolio(file, folio, off, len); 978 if (!ret) 979 iomap_finish_folio_read(folio, off, len, ret); 980 } 981 return ret; 982 } 983 984 static void fuse_iomap_submit_read(const struct iomap_iter *iter, 985 struct iomap_read_folio_ctx *ctx) 986 { 987 struct fuse_fill_read_data *data = ctx->read_ctx; 988 989 if (data->ia) 990 fuse_send_readpages(data->ia, data->file, data->nr_bytes, 991 data->fc->async_read); 992 } 993 994 static const struct iomap_read_ops fuse_iomap_read_ops = { 995 .read_folio_range = fuse_iomap_read_folio_range_async, 996 .submit_read = fuse_iomap_submit_read, 997 }; 998 999 static int fuse_read_folio(struct file *file, struct folio *folio) 1000 { 1001 struct inode *inode = folio->mapping->host; 1002 struct fuse_fill_read_data data = { 1003 .file = file, 1004 }; 1005 struct iomap_read_folio_ctx ctx = { 1006 .cur_folio = folio, 1007 .ops = &fuse_iomap_read_ops, 1008 .read_ctx = &data, 1009 1010 }; 1011 1012 if (fuse_is_bad(inode)) { 1013 folio_unlock(folio); 1014 return -EIO; 1015 } 1016 1017 iomap_read_folio(&fuse_iomap_ops, &ctx, NULL); 1018 fuse_invalidate_atime(inode); 1019 return 0; 1020 } 1021 1022 static int fuse_iomap_read_folio_range(const struct iomap_iter *iter, 1023 struct folio *folio, loff_t pos, 1024 size_t len) 1025 { 1026 struct file *file = iter->private; 1027 size_t off = offset_in_folio(folio, pos); 1028 1029 return fuse_do_readfolio(file, folio, off, len); 1030 } 1031 1032 static void fuse_readpages_end(struct fuse_args *args, int err) 1033 { 1034 int i; 1035 struct fuse_io_args *ia = container_of(args, typeof(*ia), ap.args); 1036 struct fuse_args_pages *ap = &ia->ap; 1037 size_t count = ia->read.in.size; 1038 size_t num_read = args->out_args[0].size; 1039 struct address_space *mapping; 1040 struct inode *inode; 1041 1042 WARN_ON_ONCE(!ap->num_folios); 1043 mapping = ap->folios[0]->mapping; 1044 inode = mapping->host; 1045 1046 /* 1047 * Short read means EOF. If file size is larger, truncate it 1048 */ 1049 if (!err && num_read < count) 1050 fuse_short_read(inode, ia->read.attr_ver, num_read, ap); 1051 1052 fuse_invalidate_atime(inode); 1053 1054 for (i = 0; i < ap->num_folios; i++) { 1055 iomap_finish_folio_read(ap->folios[i], ap->descs[i].offset, 1056 ap->descs[i].length, err); 1057 folio_put(ap->folios[i]); 1058 } 1059 if (ia->ff) 1060 fuse_file_put(ia->ff, false); 1061 1062 fuse_io_free(ia); 1063 } 1064 1065 static void fuse_send_readpages(struct fuse_io_args *ia, struct file *file, 1066 unsigned int count, bool async) 1067 { 1068 struct fuse_file *ff = file->private_data; 1069 struct fuse_mount *fm = ff->fm; 1070 struct fuse_args_pages *ap = &ia->ap; 1071 loff_t pos = folio_pos(ap->folios[0]); 1072 ssize_t res; 1073 int err; 1074 1075 ap->args.out_pages = true; 1076 ap->args.page_zeroing = true; 1077 ap->args.page_replace = true; 1078 1079 /* Don't overflow end offset */ 1080 if (pos + (count - 1) == LLONG_MAX) { 1081 count--; 1082 ap->descs[ap->num_folios - 1].length--; 1083 } 1084 WARN_ON((loff_t) (pos + count) < 0); 1085 1086 fuse_read_args_fill(ia, file, pos, count, FUSE_READ); 1087 ia->read.attr_ver = fuse_get_attr_version(fm->fc); 1088 if (async) { 1089 ia->ff = fuse_file_get(ff); 1090 ap->args.end = fuse_readpages_end; 1091 err = fuse_simple_background(fm, &ap->args, GFP_KERNEL); 1092 if (!err) 1093 return; 1094 } else { 1095 res = fuse_simple_request(fm, &ap->args); 1096 err = res < 0 ? res : 0; 1097 } 1098 fuse_readpages_end(&ap->args, err); 1099 } 1100 1101 static void fuse_readahead(struct readahead_control *rac) 1102 { 1103 struct inode *inode = rac->mapping->host; 1104 struct fuse_conn *fc = get_fuse_conn(inode); 1105 struct fuse_fill_read_data data = { 1106 .file = rac->file, 1107 .fc = fc, 1108 }; 1109 struct iomap_read_folio_ctx ctx = { 1110 .ops = &fuse_iomap_read_ops, 1111 .rac = rac, 1112 .read_ctx = &data 1113 }; 1114 1115 if (fuse_is_bad(inode)) 1116 return; 1117 1118 iomap_readahead(&fuse_iomap_ops, &ctx, NULL); 1119 } 1120 1121 static ssize_t fuse_cache_read_iter(struct kiocb *iocb, struct iov_iter *to) 1122 { 1123 struct inode *inode = iocb->ki_filp->f_mapping->host; 1124 struct fuse_conn *fc = get_fuse_conn(inode); 1125 1126 /* 1127 * In auto invalidate mode, always update attributes on read. 1128 * Otherwise, only update if we attempt to read past EOF (to ensure 1129 * i_size is up to date). 1130 */ 1131 if (fc->auto_inval_data || 1132 (iocb->ki_pos + iov_iter_count(to) > i_size_read(inode))) { 1133 int err; 1134 err = fuse_update_attributes(inode, iocb->ki_filp, STATX_SIZE); 1135 if (err) 1136 return err; 1137 } 1138 1139 return generic_file_read_iter(iocb, to); 1140 } 1141 1142 static void fuse_write_args_fill(struct fuse_io_args *ia, struct fuse_file *ff, 1143 loff_t pos, size_t count) 1144 { 1145 struct fuse_args *args = &ia->ap.args; 1146 1147 ia->write.in.fh = ff->fh; 1148 ia->write.in.offset = pos; 1149 ia->write.in.size = count; 1150 args->opcode = FUSE_WRITE; 1151 args->nodeid = ff->nodeid; 1152 args->in_numargs = 2; 1153 if (ff->fm->fc->minor < 9) 1154 args->in_args[0].size = FUSE_COMPAT_WRITE_IN_SIZE; 1155 else 1156 args->in_args[0].size = sizeof(ia->write.in); 1157 args->in_args[0].value = &ia->write.in; 1158 args->in_args[1].size = count; 1159 args->out_numargs = 1; 1160 args->out_args[0].size = sizeof(ia->write.out); 1161 args->out_args[0].value = &ia->write.out; 1162 } 1163 1164 static unsigned int fuse_write_flags(struct kiocb *iocb) 1165 { 1166 unsigned int flags = iocb->ki_filp->f_flags; 1167 1168 if (iocb_is_dsync(iocb)) 1169 flags |= O_DSYNC; 1170 if (iocb->ki_flags & IOCB_SYNC) 1171 flags |= O_SYNC; 1172 1173 return flags; 1174 } 1175 1176 static ssize_t fuse_send_write(struct fuse_io_args *ia, loff_t pos, 1177 size_t count, fl_owner_t owner) 1178 { 1179 struct kiocb *iocb = ia->io->iocb; 1180 struct file *file = iocb->ki_filp; 1181 struct fuse_file *ff = file->private_data; 1182 struct fuse_mount *fm = ff->fm; 1183 struct fuse_write_in *inarg = &ia->write.in; 1184 ssize_t err; 1185 1186 fuse_write_args_fill(ia, ff, pos, count); 1187 inarg->flags = fuse_write_flags(iocb); 1188 if (owner != NULL) { 1189 inarg->write_flags |= FUSE_WRITE_LOCKOWNER; 1190 inarg->lock_owner = fuse_lock_owner_id(fm->fc, owner); 1191 } 1192 1193 if (ia->io->async) 1194 return fuse_async_req_send(fm, ia, count); 1195 1196 err = fuse_simple_request(fm, &ia->ap.args); 1197 if (!err && ia->write.out.size > count) 1198 err = -EIO; 1199 1200 return err ?: ia->write.out.size; 1201 } 1202 1203 bool fuse_write_update_attr(struct inode *inode, loff_t pos, ssize_t written) 1204 { 1205 struct fuse_conn *fc = get_fuse_conn(inode); 1206 struct fuse_inode *fi = get_fuse_inode(inode); 1207 bool ret = false; 1208 1209 spin_lock(&fi->lock); 1210 fi->attr_version = atomic64_inc_return(&fc->attr_version); 1211 if (written > 0 && pos > inode->i_size) { 1212 i_size_write(inode, pos); 1213 ret = true; 1214 } 1215 spin_unlock(&fi->lock); 1216 1217 fuse_invalidate_attr_mask(inode, FUSE_STATX_MODSIZE); 1218 1219 return ret; 1220 } 1221 1222 static ssize_t fuse_send_write_pages(struct fuse_io_args *ia, 1223 struct kiocb *iocb, struct inode *inode, 1224 loff_t pos, size_t count) 1225 { 1226 struct fuse_args_pages *ap = &ia->ap; 1227 struct file *file = iocb->ki_filp; 1228 struct fuse_file *ff = file->private_data; 1229 struct fuse_mount *fm = ff->fm; 1230 unsigned int offset, i; 1231 bool short_write; 1232 int err; 1233 1234 for (i = 0; i < ap->num_folios; i++) 1235 folio_wait_writeback(ap->folios[i]); 1236 1237 fuse_write_args_fill(ia, ff, pos, count); 1238 ia->write.in.flags = fuse_write_flags(iocb); 1239 if (fm->fc->handle_killpriv_v2 && !capable(CAP_FSETID)) 1240 ia->write.in.write_flags |= FUSE_WRITE_KILL_SUIDGID; 1241 1242 err = fuse_simple_request(fm, &ap->args); 1243 if (!err && ia->write.out.size > count) 1244 err = -EIO; 1245 1246 short_write = ia->write.out.size < count; 1247 offset = ap->descs[0].offset; 1248 count = ia->write.out.size; 1249 for (i = 0; i < ap->num_folios; i++) { 1250 struct folio *folio = ap->folios[i]; 1251 1252 if (err) { 1253 folio_clear_uptodate(folio); 1254 } else { 1255 if (count >= folio_size(folio) - offset) 1256 count -= folio_size(folio) - offset; 1257 else { 1258 if (short_write) 1259 folio_clear_uptodate(folio); 1260 count = 0; 1261 } 1262 offset = 0; 1263 } 1264 if (ia->write.folio_locked && (i == ap->num_folios - 1)) 1265 folio_unlock(folio); 1266 folio_put(folio); 1267 } 1268 1269 return err; 1270 } 1271 1272 static ssize_t fuse_fill_write_pages(struct fuse_io_args *ia, 1273 struct address_space *mapping, 1274 struct iov_iter *ii, loff_t pos, 1275 unsigned int max_folios) 1276 { 1277 struct fuse_args_pages *ap = &ia->ap; 1278 struct fuse_conn *fc = get_fuse_conn(mapping->host); 1279 size_t count = 0; 1280 unsigned int num; 1281 int err = 0; 1282 1283 num = min(iov_iter_count(ii), fc->max_write); 1284 1285 ap->args.in_pages = true; 1286 1287 while (num && ap->num_folios < max_folios) { 1288 size_t tmp; 1289 struct folio *folio; 1290 pgoff_t index = pos >> PAGE_SHIFT; 1291 unsigned int bytes; 1292 unsigned int folio_offset; 1293 1294 again: 1295 folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN, 1296 mapping_gfp_mask(mapping)); 1297 if (IS_ERR(folio)) { 1298 err = PTR_ERR(folio); 1299 break; 1300 } 1301 1302 if (mapping_writably_mapped(mapping)) 1303 flush_dcache_folio(folio); 1304 1305 folio_offset = offset_in_folio(folio, pos); 1306 bytes = min(folio_size(folio) - folio_offset, num); 1307 1308 tmp = copy_folio_from_iter_atomic(folio, folio_offset, bytes, ii); 1309 flush_dcache_folio(folio); 1310 1311 if (!tmp) { 1312 folio_unlock(folio); 1313 folio_put(folio); 1314 1315 /* 1316 * Ensure forward progress by faulting in 1317 * while not holding the folio lock: 1318 */ 1319 if (fault_in_iov_iter_readable(ii, bytes)) { 1320 err = -EFAULT; 1321 break; 1322 } 1323 1324 goto again; 1325 } 1326 1327 ap->folios[ap->num_folios] = folio; 1328 ap->descs[ap->num_folios].offset = folio_offset; 1329 ap->descs[ap->num_folios].length = tmp; 1330 ap->num_folios++; 1331 1332 count += tmp; 1333 pos += tmp; 1334 num -= tmp; 1335 1336 /* If we copied full folio, mark it uptodate */ 1337 if (tmp == folio_size(folio)) 1338 folio_mark_uptodate(folio); 1339 1340 if (folio_test_uptodate(folio)) { 1341 folio_unlock(folio); 1342 } else { 1343 ia->write.folio_locked = true; 1344 break; 1345 } 1346 if (!fc->big_writes) 1347 break; 1348 if (folio_offset + tmp != folio_size(folio)) 1349 break; 1350 } 1351 1352 return count > 0 ? count : err; 1353 } 1354 1355 static inline unsigned int fuse_wr_pages(loff_t pos, size_t len, 1356 unsigned int max_pages) 1357 { 1358 unsigned int pages = ((pos + len - 1) >> PAGE_SHIFT) - 1359 (pos >> PAGE_SHIFT) + 1; 1360 1361 return min(pages, max_pages); 1362 } 1363 1364 static ssize_t fuse_perform_write(struct kiocb *iocb, struct iov_iter *ii) 1365 { 1366 struct address_space *mapping = iocb->ki_filp->f_mapping; 1367 struct inode *inode = mapping->host; 1368 struct fuse_conn *fc = get_fuse_conn(inode); 1369 struct fuse_inode *fi = get_fuse_inode(inode); 1370 loff_t pos = iocb->ki_pos; 1371 int err = 0; 1372 ssize_t res = 0; 1373 1374 if (inode->i_size < pos + iov_iter_count(ii)) 1375 set_bit(FUSE_I_SIZE_UNSTABLE, &fi->state); 1376 1377 do { 1378 ssize_t count; 1379 struct fuse_io_args ia = {}; 1380 struct fuse_args_pages *ap = &ia.ap; 1381 unsigned int nr_pages = fuse_wr_pages(pos, iov_iter_count(ii), 1382 fc->max_pages); 1383 1384 ap->folios = fuse_folios_alloc(nr_pages, GFP_KERNEL, &ap->descs); 1385 if (!ap->folios) { 1386 err = -ENOMEM; 1387 break; 1388 } 1389 1390 count = fuse_fill_write_pages(&ia, mapping, ii, pos, nr_pages); 1391 if (count <= 0) { 1392 err = count; 1393 } else { 1394 err = fuse_send_write_pages(&ia, iocb, inode, 1395 pos, count); 1396 if (!err) { 1397 size_t num_written = ia.write.out.size; 1398 1399 res += num_written; 1400 pos += num_written; 1401 1402 /* break out of the loop on short write */ 1403 if (num_written != count) 1404 err = -EIO; 1405 } 1406 } 1407 kfree(ap->folios); 1408 } while (!err && iov_iter_count(ii)); 1409 1410 fuse_write_update_attr(inode, pos, res); 1411 clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state); 1412 1413 if (!res) 1414 return err; 1415 iocb->ki_pos += res; 1416 return res; 1417 } 1418 1419 static bool fuse_io_past_eof(struct kiocb *iocb, struct iov_iter *iter) 1420 { 1421 struct inode *inode = file_inode(iocb->ki_filp); 1422 1423 return iocb->ki_pos + iov_iter_count(iter) > i_size_read(inode); 1424 } 1425 1426 /* 1427 * @return true if an exclusive lock for direct IO writes is needed 1428 */ 1429 static bool fuse_dio_wr_exclusive_lock(struct kiocb *iocb, struct iov_iter *from) 1430 { 1431 struct file *file = iocb->ki_filp; 1432 struct fuse_file *ff = file->private_data; 1433 struct inode *inode = file_inode(iocb->ki_filp); 1434 struct fuse_inode *fi = get_fuse_inode(inode); 1435 1436 /* Server side has to advise that it supports parallel dio writes. */ 1437 if (!(ff->open_flags & FOPEN_PARALLEL_DIRECT_WRITES)) 1438 return true; 1439 1440 /* 1441 * Append will need to know the eventual EOF - always needs an 1442 * exclusive lock. 1443 */ 1444 if (iocb->ki_flags & IOCB_APPEND) 1445 return true; 1446 1447 /* shared locks are not allowed with parallel page cache IO */ 1448 if (test_bit(FUSE_I_CACHE_IO_MODE, &fi->state)) 1449 return true; 1450 1451 /* Parallel dio beyond EOF is not supported, at least for now. */ 1452 if (fuse_io_past_eof(iocb, from)) 1453 return true; 1454 1455 return false; 1456 } 1457 1458 static void fuse_dio_lock(struct kiocb *iocb, struct iov_iter *from, 1459 bool *exclusive) 1460 { 1461 struct inode *inode = file_inode(iocb->ki_filp); 1462 struct fuse_inode *fi = get_fuse_inode(inode); 1463 1464 *exclusive = fuse_dio_wr_exclusive_lock(iocb, from); 1465 if (*exclusive) { 1466 inode_lock(inode); 1467 } else { 1468 inode_lock_shared(inode); 1469 /* 1470 * New parallal dio allowed only if inode is not in caching 1471 * mode and denies new opens in caching mode. This check 1472 * should be performed only after taking shared inode lock. 1473 * Previous past eof check was without inode lock and might 1474 * have raced, so check it again. 1475 */ 1476 if (fuse_io_past_eof(iocb, from) || 1477 fuse_inode_uncached_io_start(fi, NULL) != 0) { 1478 inode_unlock_shared(inode); 1479 inode_lock(inode); 1480 *exclusive = true; 1481 } 1482 } 1483 } 1484 1485 static void fuse_dio_unlock(struct kiocb *iocb, bool exclusive) 1486 { 1487 struct inode *inode = file_inode(iocb->ki_filp); 1488 struct fuse_inode *fi = get_fuse_inode(inode); 1489 1490 if (exclusive) { 1491 inode_unlock(inode); 1492 } else { 1493 /* Allow opens in caching mode after last parallel dio end */ 1494 fuse_inode_uncached_io_end(fi); 1495 inode_unlock_shared(inode); 1496 } 1497 } 1498 1499 static const struct iomap_write_ops fuse_iomap_write_ops = { 1500 .read_folio_range = fuse_iomap_read_folio_range, 1501 }; 1502 1503 static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from) 1504 { 1505 struct file *file = iocb->ki_filp; 1506 struct mnt_idmap *idmap = file_mnt_idmap(file); 1507 struct address_space *mapping = file->f_mapping; 1508 ssize_t written = 0; 1509 struct inode *inode = mapping->host; 1510 ssize_t err, count; 1511 struct fuse_conn *fc = get_fuse_conn(inode); 1512 bool writeback = false; 1513 1514 if (fc->writeback_cache) { 1515 /* Update size (EOF optimization) and mode (SUID clearing) */ 1516 err = fuse_update_attributes(mapping->host, file, 1517 STATX_SIZE | STATX_MODE); 1518 if (err) 1519 return err; 1520 1521 if (!fc->handle_killpriv_v2 || 1522 !setattr_should_drop_suidgid(idmap, file_inode(file))) 1523 writeback = true; 1524 } 1525 1526 inode_lock(inode); 1527 1528 err = count = generic_write_checks(iocb, from); 1529 if (err <= 0) 1530 goto out; 1531 1532 task_io_account_write(count); 1533 1534 err = kiocb_modified(iocb); 1535 if (err) 1536 goto out; 1537 1538 if (iocb->ki_flags & IOCB_DIRECT) { 1539 written = generic_file_direct_write(iocb, from); 1540 if (written < 0 || !iov_iter_count(from)) 1541 goto out; 1542 written = direct_write_fallback(iocb, from, written, 1543 fuse_perform_write(iocb, from)); 1544 } else if (writeback) { 1545 /* 1546 * Use iomap so that we can do granular uptodate reads 1547 * and granular dirty tracking for large folios. 1548 */ 1549 written = iomap_file_buffered_write(iocb, from, 1550 &fuse_iomap_ops, 1551 &fuse_iomap_write_ops, 1552 file); 1553 } else { 1554 written = fuse_perform_write(iocb, from); 1555 } 1556 out: 1557 inode_unlock(inode); 1558 if (written > 0) 1559 written = generic_write_sync(iocb, written); 1560 1561 return written ? written : err; 1562 } 1563 1564 static inline unsigned long fuse_get_user_addr(const struct iov_iter *ii) 1565 { 1566 return (unsigned long)iter_iov(ii)->iov_base + ii->iov_offset; 1567 } 1568 1569 static inline size_t fuse_get_frag_size(const struct iov_iter *ii, 1570 size_t max_size) 1571 { 1572 return min(iov_iter_single_seg_count(ii), max_size); 1573 } 1574 1575 static int fuse_get_user_pages(struct fuse_args_pages *ap, struct iov_iter *ii, 1576 size_t *nbytesp, int write, 1577 unsigned int max_pages, 1578 bool use_pages_for_kvec_io) 1579 { 1580 bool flush_or_invalidate = false; 1581 unsigned int nr_pages = 0; 1582 size_t nbytes = 0; /* # bytes already packed in req */ 1583 ssize_t ret = 0; 1584 1585 /* Special case for kernel I/O: can copy directly into the buffer. 1586 * However if the implementation of fuse_conn requires pages instead of 1587 * pointer (e.g., virtio-fs), use iov_iter_extract_pages() instead. 1588 */ 1589 if (iov_iter_is_kvec(ii)) { 1590 void *user_addr = (void *)fuse_get_user_addr(ii); 1591 1592 if (!use_pages_for_kvec_io) { 1593 size_t frag_size = fuse_get_frag_size(ii, *nbytesp); 1594 1595 if (write) 1596 ap->args.in_args[1].value = user_addr; 1597 else 1598 ap->args.out_args[0].value = user_addr; 1599 1600 iov_iter_advance(ii, frag_size); 1601 *nbytesp = frag_size; 1602 return 0; 1603 } 1604 1605 if (is_vmalloc_addr(user_addr)) { 1606 ap->args.vmap_base = user_addr; 1607 flush_or_invalidate = true; 1608 } 1609 } 1610 1611 /* 1612 * Until there is support for iov_iter_extract_folios(), we have to 1613 * manually extract pages using iov_iter_extract_pages() and then 1614 * copy that to a folios array. 1615 */ 1616 struct page **pages = kcalloc(max_pages, sizeof(struct page *), 1617 GFP_KERNEL); 1618 if (!pages) { 1619 ret = -ENOMEM; 1620 goto out; 1621 } 1622 1623 while (nbytes < *nbytesp && nr_pages < max_pages) { 1624 unsigned nfolios, i; 1625 size_t start; 1626 1627 ret = iov_iter_extract_pages(ii, &pages, 1628 *nbytesp - nbytes, 1629 max_pages - nr_pages, 1630 0, &start); 1631 if (ret < 0) 1632 break; 1633 1634 nbytes += ret; 1635 1636 nfolios = DIV_ROUND_UP(ret + start, PAGE_SIZE); 1637 1638 for (i = 0; i < nfolios; i++) { 1639 struct folio *folio = page_folio(pages[i]); 1640 unsigned int offset = start + 1641 (folio_page_idx(folio, pages[i]) << PAGE_SHIFT); 1642 unsigned int len = umin(ret, PAGE_SIZE - start); 1643 1644 ap->descs[ap->num_folios].offset = offset; 1645 ap->descs[ap->num_folios].length = len; 1646 ap->folios[ap->num_folios] = folio; 1647 start = 0; 1648 ret -= len; 1649 ap->num_folios++; 1650 } 1651 1652 nr_pages += nfolios; 1653 } 1654 kfree(pages); 1655 1656 if (write && flush_or_invalidate) 1657 flush_kernel_vmap_range(ap->args.vmap_base, nbytes); 1658 1659 ap->args.invalidate_vmap = !write && flush_or_invalidate; 1660 ap->args.is_pinned = iov_iter_extract_will_pin(ii); 1661 ap->args.user_pages = true; 1662 if (write) 1663 ap->args.in_pages = true; 1664 else 1665 ap->args.out_pages = true; 1666 1667 out: 1668 *nbytesp = nbytes; 1669 1670 return ret < 0 ? ret : 0; 1671 } 1672 1673 ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, 1674 loff_t *ppos, int flags) 1675 { 1676 int write = flags & FUSE_DIO_WRITE; 1677 int cuse = flags & FUSE_DIO_CUSE; 1678 struct file *file = io->iocb->ki_filp; 1679 struct address_space *mapping = file->f_mapping; 1680 struct inode *inode = mapping->host; 1681 struct fuse_file *ff = file->private_data; 1682 struct fuse_conn *fc = ff->fm->fc; 1683 size_t nmax = write ? fc->max_write : fc->max_read; 1684 loff_t pos = *ppos; 1685 size_t count = iov_iter_count(iter); 1686 pgoff_t idx_from = pos >> PAGE_SHIFT; 1687 pgoff_t idx_to = (pos + count - 1) >> PAGE_SHIFT; 1688 ssize_t res = 0; 1689 int err = 0; 1690 struct fuse_io_args *ia; 1691 unsigned int max_pages; 1692 bool fopen_direct_io = ff->open_flags & FOPEN_DIRECT_IO; 1693 1694 max_pages = iov_iter_npages(iter, fc->max_pages); 1695 ia = fuse_io_alloc(io, max_pages); 1696 if (!ia) 1697 return -ENOMEM; 1698 1699 if (fopen_direct_io) { 1700 res = filemap_write_and_wait_range(mapping, pos, pos + count - 1); 1701 if (res) { 1702 fuse_io_free(ia); 1703 return res; 1704 } 1705 } 1706 if (!cuse && filemap_range_has_writeback(mapping, pos, (pos + count - 1))) { 1707 if (!write) 1708 inode_lock(inode); 1709 fuse_sync_writes(inode); 1710 if (!write) 1711 inode_unlock(inode); 1712 } 1713 1714 if (fopen_direct_io && write) { 1715 res = invalidate_inode_pages2_range(mapping, idx_from, idx_to); 1716 if (res) { 1717 fuse_io_free(ia); 1718 return res; 1719 } 1720 } 1721 1722 io->should_dirty = !write && user_backed_iter(iter); 1723 while (count) { 1724 ssize_t nres; 1725 fl_owner_t owner = current->files; 1726 size_t nbytes = min(count, nmax); 1727 1728 err = fuse_get_user_pages(&ia->ap, iter, &nbytes, write, 1729 max_pages, fc->use_pages_for_kvec_io); 1730 if (err && !nbytes) 1731 break; 1732 1733 if (write) { 1734 if (!capable(CAP_FSETID)) 1735 ia->write.in.write_flags |= FUSE_WRITE_KILL_SUIDGID; 1736 1737 nres = fuse_send_write(ia, pos, nbytes, owner); 1738 } else { 1739 nres = fuse_send_read(ia, pos, nbytes, owner); 1740 } 1741 1742 if (!io->async || nres < 0) { 1743 fuse_release_user_pages(&ia->ap, nres, io->should_dirty); 1744 fuse_io_free(ia); 1745 } 1746 ia = NULL; 1747 if (nres < 0) { 1748 iov_iter_revert(iter, nbytes); 1749 err = nres; 1750 break; 1751 } 1752 WARN_ON(nres > nbytes); 1753 1754 count -= nres; 1755 res += nres; 1756 pos += nres; 1757 if (nres != nbytes) { 1758 iov_iter_revert(iter, nbytes - nres); 1759 break; 1760 } 1761 if (count) { 1762 max_pages = iov_iter_npages(iter, fc->max_pages); 1763 ia = fuse_io_alloc(io, max_pages); 1764 if (!ia) 1765 break; 1766 } 1767 } 1768 if (ia) 1769 fuse_io_free(ia); 1770 if (res > 0) 1771 *ppos = pos; 1772 1773 return res > 0 ? res : err; 1774 } 1775 EXPORT_SYMBOL_GPL(fuse_direct_io); 1776 1777 static ssize_t __fuse_direct_read(struct fuse_io_priv *io, 1778 struct iov_iter *iter, 1779 loff_t *ppos) 1780 { 1781 ssize_t res; 1782 struct inode *inode = file_inode(io->iocb->ki_filp); 1783 1784 res = fuse_direct_io(io, iter, ppos, 0); 1785 1786 fuse_invalidate_atime(inode); 1787 1788 return res; 1789 } 1790 1791 static ssize_t fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter); 1792 1793 static ssize_t fuse_direct_read_iter(struct kiocb *iocb, struct iov_iter *to) 1794 { 1795 ssize_t res; 1796 1797 if (!is_sync_kiocb(iocb)) { 1798 res = fuse_direct_IO(iocb, to); 1799 } else { 1800 struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb); 1801 1802 res = __fuse_direct_read(&io, to, &iocb->ki_pos); 1803 } 1804 1805 return res; 1806 } 1807 1808 static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from) 1809 { 1810 struct inode *inode = file_inode(iocb->ki_filp); 1811 struct address_space *mapping = inode->i_mapping; 1812 loff_t pos = iocb->ki_pos; 1813 ssize_t res; 1814 bool exclusive; 1815 1816 fuse_dio_lock(iocb, from, &exclusive); 1817 res = generic_write_checks(iocb, from); 1818 if (res > 0) { 1819 task_io_account_write(res); 1820 if (!is_sync_kiocb(iocb)) { 1821 res = fuse_direct_IO(iocb, from); 1822 } else { 1823 struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb); 1824 1825 res = fuse_direct_io(&io, from, &iocb->ki_pos, 1826 FUSE_DIO_WRITE); 1827 fuse_write_update_attr(inode, iocb->ki_pos, res); 1828 } 1829 if (res > 0 && mapping->nrpages) { 1830 /* 1831 * As in generic_file_direct_write(), invalidate after 1832 * write, to invalidate read-ahead cache that may have 1833 * with the write. 1834 */ 1835 invalidate_inode_pages2_range(mapping, 1836 pos >> PAGE_SHIFT, 1837 (pos + res - 1) >> PAGE_SHIFT); 1838 } 1839 } 1840 fuse_dio_unlock(iocb, exclusive); 1841 1842 return res; 1843 } 1844 1845 static ssize_t fuse_file_read_iter(struct kiocb *iocb, struct iov_iter *to) 1846 { 1847 struct file *file = iocb->ki_filp; 1848 struct fuse_file *ff = file->private_data; 1849 struct inode *inode = file_inode(file); 1850 1851 if (fuse_is_bad(inode)) 1852 return -EIO; 1853 1854 if (FUSE_IS_DAX(inode)) 1855 return fuse_dax_read_iter(iocb, to); 1856 1857 /* FOPEN_DIRECT_IO overrides FOPEN_PASSTHROUGH */ 1858 if (ff->open_flags & FOPEN_DIRECT_IO) 1859 return fuse_direct_read_iter(iocb, to); 1860 else if (fuse_file_passthrough(ff)) 1861 return fuse_passthrough_read_iter(iocb, to); 1862 else 1863 return fuse_cache_read_iter(iocb, to); 1864 } 1865 1866 static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from) 1867 { 1868 struct file *file = iocb->ki_filp; 1869 struct fuse_file *ff = file->private_data; 1870 struct inode *inode = file_inode(file); 1871 1872 if (fuse_is_bad(inode)) 1873 return -EIO; 1874 1875 if (FUSE_IS_DAX(inode)) 1876 return fuse_dax_write_iter(iocb, from); 1877 1878 /* FOPEN_DIRECT_IO overrides FOPEN_PASSTHROUGH */ 1879 if (ff->open_flags & FOPEN_DIRECT_IO) 1880 return fuse_direct_write_iter(iocb, from); 1881 else if (fuse_file_passthrough(ff)) 1882 return fuse_passthrough_write_iter(iocb, from); 1883 else 1884 return fuse_cache_write_iter(iocb, from); 1885 } 1886 1887 static ssize_t fuse_splice_read(struct file *in, loff_t *ppos, 1888 struct pipe_inode_info *pipe, size_t len, 1889 unsigned int flags) 1890 { 1891 struct fuse_file *ff = in->private_data; 1892 1893 /* FOPEN_DIRECT_IO overrides FOPEN_PASSTHROUGH */ 1894 if (fuse_file_passthrough(ff) && !(ff->open_flags & FOPEN_DIRECT_IO)) 1895 return fuse_passthrough_splice_read(in, ppos, pipe, len, flags); 1896 else 1897 return filemap_splice_read(in, ppos, pipe, len, flags); 1898 } 1899 1900 static ssize_t fuse_splice_write(struct pipe_inode_info *pipe, struct file *out, 1901 loff_t *ppos, size_t len, unsigned int flags) 1902 { 1903 struct fuse_file *ff = out->private_data; 1904 1905 /* FOPEN_DIRECT_IO overrides FOPEN_PASSTHROUGH */ 1906 if (fuse_file_passthrough(ff) && !(ff->open_flags & FOPEN_DIRECT_IO)) 1907 return fuse_passthrough_splice_write(pipe, out, ppos, len, flags); 1908 else 1909 return iter_file_splice_write(pipe, out, ppos, len, flags); 1910 } 1911 1912 static void fuse_writepage_free(struct fuse_writepage_args *wpa) 1913 { 1914 struct fuse_args_pages *ap = &wpa->ia.ap; 1915 1916 if (wpa->bucket) 1917 fuse_sync_bucket_dec(wpa->bucket); 1918 1919 fuse_file_put(wpa->ia.ff, false); 1920 1921 kfree(ap->folios); 1922 kfree(wpa); 1923 } 1924 1925 static void fuse_writepage_finish(struct fuse_writepage_args *wpa) 1926 { 1927 struct fuse_args_pages *ap = &wpa->ia.ap; 1928 struct inode *inode = wpa->inode; 1929 struct fuse_inode *fi = get_fuse_inode(inode); 1930 int i; 1931 1932 for (i = 0; i < ap->num_folios; i++) 1933 /* 1934 * Benchmarks showed that ending writeback within the 1935 * scope of the fi->lock alleviates xarray lock 1936 * contention and noticeably improves performance. 1937 */ 1938 iomap_finish_folio_write(inode, ap->folios[i], 1939 ap->descs[i].length); 1940 1941 wake_up(&fi->page_waitq); 1942 } 1943 1944 /* Called under fi->lock, may release and reacquire it */ 1945 static void fuse_send_writepage(struct fuse_mount *fm, 1946 struct fuse_writepage_args *wpa, loff_t size) 1947 __releases(fi->lock) 1948 __acquires(fi->lock) 1949 { 1950 struct fuse_inode *fi = get_fuse_inode(wpa->inode); 1951 struct fuse_args_pages *ap = &wpa->ia.ap; 1952 struct fuse_write_in *inarg = &wpa->ia.write.in; 1953 struct fuse_args *args = &ap->args; 1954 __u64 data_size = 0; 1955 int err, i; 1956 1957 for (i = 0; i < ap->num_folios; i++) 1958 data_size += ap->descs[i].length; 1959 1960 fi->writectr++; 1961 if (inarg->offset + data_size <= size) { 1962 inarg->size = data_size; 1963 } else if (inarg->offset < size) { 1964 inarg->size = size - inarg->offset; 1965 } else { 1966 /* Got truncated off completely */ 1967 goto out_free; 1968 } 1969 1970 args->in_args[1].size = inarg->size; 1971 args->force = true; 1972 args->nocreds = true; 1973 1974 err = fuse_simple_background(fm, args, GFP_ATOMIC); 1975 if (err == -ENOMEM) { 1976 spin_unlock(&fi->lock); 1977 err = fuse_simple_background(fm, args, GFP_NOFS | __GFP_NOFAIL); 1978 spin_lock(&fi->lock); 1979 } 1980 1981 /* Fails on broken connection only */ 1982 if (unlikely(err)) 1983 goto out_free; 1984 1985 return; 1986 1987 out_free: 1988 fi->writectr--; 1989 fuse_writepage_finish(wpa); 1990 spin_unlock(&fi->lock); 1991 fuse_writepage_free(wpa); 1992 spin_lock(&fi->lock); 1993 } 1994 1995 /* 1996 * If fi->writectr is positive (no truncate or fsync going on) send 1997 * all queued writepage requests. 1998 * 1999 * Called with fi->lock 2000 */ 2001 void fuse_flush_writepages(struct inode *inode) 2002 __releases(fi->lock) 2003 __acquires(fi->lock) 2004 { 2005 struct fuse_mount *fm = get_fuse_mount(inode); 2006 struct fuse_inode *fi = get_fuse_inode(inode); 2007 loff_t crop = i_size_read(inode); 2008 struct fuse_writepage_args *wpa; 2009 2010 while (fi->writectr >= 0 && !list_empty(&fi->queued_writes)) { 2011 wpa = list_entry(fi->queued_writes.next, 2012 struct fuse_writepage_args, queue_entry); 2013 list_del_init(&wpa->queue_entry); 2014 fuse_send_writepage(fm, wpa, crop); 2015 } 2016 } 2017 2018 static void fuse_writepage_end(struct fuse_args *args, int error) 2019 { 2020 struct fuse_writepage_args *wpa = 2021 container_of(args, typeof(*wpa), ia.ap.args); 2022 struct inode *inode = wpa->inode; 2023 struct fuse_inode *fi = get_fuse_inode(inode); 2024 struct fuse_conn *fc = get_fuse_conn(inode); 2025 2026 mapping_set_error(inode->i_mapping, error); 2027 /* 2028 * A writeback finished and this might have updated mtime/ctime on 2029 * server making local mtime/ctime stale. Hence invalidate attrs. 2030 * Do this only if writeback_cache is not enabled. If writeback_cache 2031 * is enabled, we trust local ctime/mtime. 2032 */ 2033 if (!fc->writeback_cache) 2034 fuse_invalidate_attr_mask(inode, FUSE_STATX_MODIFY); 2035 spin_lock(&fi->lock); 2036 fi->writectr--; 2037 fuse_writepage_finish(wpa); 2038 spin_unlock(&fi->lock); 2039 fuse_writepage_free(wpa); 2040 } 2041 2042 static struct fuse_file *__fuse_write_file_get(struct fuse_inode *fi) 2043 { 2044 struct fuse_file *ff; 2045 2046 spin_lock(&fi->lock); 2047 ff = list_first_entry_or_null(&fi->write_files, struct fuse_file, 2048 write_entry); 2049 if (ff) 2050 fuse_file_get(ff); 2051 spin_unlock(&fi->lock); 2052 2053 return ff; 2054 } 2055 2056 static struct fuse_file *fuse_write_file_get(struct fuse_inode *fi) 2057 { 2058 struct fuse_file *ff = __fuse_write_file_get(fi); 2059 WARN_ON(!ff); 2060 return ff; 2061 } 2062 2063 int fuse_write_inode(struct inode *inode, struct writeback_control *wbc) 2064 { 2065 struct fuse_inode *fi = get_fuse_inode(inode); 2066 struct fuse_file *ff; 2067 int err; 2068 2069 ff = __fuse_write_file_get(fi); 2070 err = fuse_flush_times(inode, ff); 2071 if (ff) 2072 fuse_file_put(ff, false); 2073 2074 return err; 2075 } 2076 2077 static struct fuse_writepage_args *fuse_writepage_args_alloc(void) 2078 { 2079 struct fuse_writepage_args *wpa; 2080 struct fuse_args_pages *ap; 2081 2082 wpa = kzalloc_obj(*wpa, GFP_NOFS); 2083 if (wpa) { 2084 ap = &wpa->ia.ap; 2085 ap->num_folios = 0; 2086 ap->folios = fuse_folios_alloc(1, GFP_NOFS, &ap->descs); 2087 if (!ap->folios) { 2088 kfree(wpa); 2089 wpa = NULL; 2090 } 2091 } 2092 return wpa; 2093 2094 } 2095 2096 static void fuse_writepage_add_to_bucket(struct fuse_conn *fc, 2097 struct fuse_writepage_args *wpa) 2098 { 2099 if (!fc->sync_fs) 2100 return; 2101 2102 rcu_read_lock(); 2103 /* Prevent resurrection of dead bucket in unlikely race with syncfs */ 2104 do { 2105 wpa->bucket = rcu_dereference(fc->curr_bucket); 2106 } while (unlikely(!atomic_inc_not_zero(&wpa->bucket->count))); 2107 rcu_read_unlock(); 2108 } 2109 2110 static void fuse_writepage_args_page_fill(struct fuse_writepage_args *wpa, struct folio *folio, 2111 uint32_t folio_index, loff_t offset, unsigned len) 2112 { 2113 struct fuse_args_pages *ap = &wpa->ia.ap; 2114 2115 ap->folios[folio_index] = folio; 2116 ap->descs[folio_index].offset = offset; 2117 ap->descs[folio_index].length = len; 2118 } 2119 2120 static struct fuse_writepage_args *fuse_writepage_args_setup(struct folio *folio, 2121 size_t offset, 2122 struct fuse_file *ff) 2123 { 2124 struct inode *inode = folio->mapping->host; 2125 struct fuse_conn *fc = get_fuse_conn(inode); 2126 struct fuse_writepage_args *wpa; 2127 struct fuse_args_pages *ap; 2128 2129 wpa = fuse_writepage_args_alloc(); 2130 if (!wpa) 2131 return NULL; 2132 2133 fuse_writepage_add_to_bucket(fc, wpa); 2134 fuse_write_args_fill(&wpa->ia, ff, folio_pos(folio) + offset, 0); 2135 wpa->ia.write.in.write_flags |= FUSE_WRITE_CACHE; 2136 wpa->inode = inode; 2137 wpa->ia.ff = ff; 2138 2139 ap = &wpa->ia.ap; 2140 ap->args.in_pages = true; 2141 ap->args.end = fuse_writepage_end; 2142 2143 return wpa; 2144 } 2145 2146 struct fuse_fill_wb_data { 2147 struct fuse_writepage_args *wpa; 2148 struct fuse_file *ff; 2149 unsigned int max_folios; 2150 /* 2151 * nr_bytes won't overflow since fuse_folios_need_send() caps 2152 * wb requests to never exceed fc->max_pages (which has an upper bound 2153 * of U16_MAX). 2154 */ 2155 unsigned int nr_bytes; 2156 }; 2157 2158 static bool fuse_pages_realloc(struct fuse_fill_wb_data *data, 2159 unsigned int max_pages) 2160 { 2161 struct fuse_args_pages *ap = &data->wpa->ia.ap; 2162 struct folio **folios; 2163 struct fuse_folio_desc *descs; 2164 unsigned int nfolios = min_t(unsigned int, 2165 max_t(unsigned int, data->max_folios * 2, 2166 FUSE_DEFAULT_MAX_PAGES_PER_REQ), 2167 max_pages); 2168 WARN_ON(nfolios <= data->max_folios); 2169 2170 folios = fuse_folios_alloc(nfolios, GFP_NOFS, &descs); 2171 if (!folios) 2172 return false; 2173 2174 memcpy(folios, ap->folios, sizeof(struct folio *) * ap->num_folios); 2175 memcpy(descs, ap->descs, sizeof(struct fuse_folio_desc) * ap->num_folios); 2176 kfree(ap->folios); 2177 ap->folios = folios; 2178 ap->descs = descs; 2179 data->max_folios = nfolios; 2180 2181 return true; 2182 } 2183 2184 static void fuse_writepages_send(struct inode *inode, 2185 struct fuse_fill_wb_data *data) 2186 { 2187 struct fuse_writepage_args *wpa = data->wpa; 2188 struct fuse_inode *fi = get_fuse_inode(inode); 2189 2190 spin_lock(&fi->lock); 2191 list_add_tail(&wpa->queue_entry, &fi->queued_writes); 2192 fuse_flush_writepages(inode); 2193 spin_unlock(&fi->lock); 2194 } 2195 2196 static bool fuse_folios_need_send(struct fuse_conn *fc, loff_t pos, 2197 unsigned len, struct fuse_args_pages *ap, 2198 unsigned cur_bytes, bool write) 2199 { 2200 struct folio *prev_folio; 2201 struct fuse_folio_desc prev_desc; 2202 unsigned bytes = cur_bytes + len; 2203 loff_t prev_pos; 2204 size_t max_bytes = write ? fc->max_write : fc->max_read; 2205 2206 WARN_ON(!ap->num_folios); 2207 2208 /* Reached max pages or max folio slots */ 2209 if (ap->num_folios >= fc->max_pages) 2210 return true; 2211 2212 if (DIV_ROUND_UP(bytes, PAGE_SIZE) > fc->max_pages) 2213 return true; 2214 2215 if (bytes > max_bytes) 2216 return true; 2217 2218 /* Discontinuity */ 2219 prev_folio = ap->folios[ap->num_folios - 1]; 2220 prev_desc = ap->descs[ap->num_folios - 1]; 2221 prev_pos = folio_pos(prev_folio) + prev_desc.offset + prev_desc.length; 2222 if (prev_pos != pos) 2223 return true; 2224 2225 return false; 2226 } 2227 2228 static ssize_t fuse_iomap_writeback_range(struct iomap_writepage_ctx *wpc, 2229 struct folio *folio, u64 pos, 2230 unsigned len, u64 end_pos) 2231 { 2232 struct fuse_fill_wb_data *data = wpc->wb_ctx; 2233 struct fuse_writepage_args *wpa = data->wpa; 2234 struct fuse_args_pages *ap = &wpa->ia.ap; 2235 struct inode *inode = wpc->inode; 2236 struct fuse_inode *fi = get_fuse_inode(inode); 2237 struct fuse_conn *fc = get_fuse_conn(inode); 2238 loff_t offset = offset_in_folio(folio, pos); 2239 2240 WARN_ON_ONCE(!data); 2241 2242 if (!data->ff) { 2243 data->ff = fuse_write_file_get(fi); 2244 if (!data->ff) 2245 return -EIO; 2246 } 2247 2248 if (wpa) { 2249 bool send = fuse_folios_need_send(fc, pos, len, ap, 2250 data->nr_bytes, true); 2251 2252 if (!send) { 2253 /* 2254 * Need to grow the pages array? If so, did the 2255 * expansion fail? 2256 */ 2257 send = (ap->num_folios == data->max_folios) && 2258 !fuse_pages_realloc(data, fc->max_pages); 2259 } 2260 2261 if (send) { 2262 fuse_writepages_send(inode, data); 2263 data->wpa = NULL; 2264 data->nr_bytes = 0; 2265 } 2266 } 2267 2268 if (data->wpa == NULL) { 2269 wpa = fuse_writepage_args_setup(folio, offset, data->ff); 2270 if (!wpa) 2271 return -ENOMEM; 2272 fuse_file_get(wpa->ia.ff); 2273 data->max_folios = 1; 2274 ap = &wpa->ia.ap; 2275 } 2276 2277 fuse_writepage_args_page_fill(wpa, folio, ap->num_folios, 2278 offset, len); 2279 data->nr_bytes += len; 2280 2281 ap->num_folios++; 2282 if (!data->wpa) 2283 data->wpa = wpa; 2284 2285 return len; 2286 } 2287 2288 static int fuse_iomap_writeback_submit(struct iomap_writepage_ctx *wpc, 2289 int error) 2290 { 2291 struct fuse_fill_wb_data *data = wpc->wb_ctx; 2292 2293 WARN_ON_ONCE(!data); 2294 2295 if (data->wpa) { 2296 WARN_ON(!data->wpa->ia.ap.num_folios); 2297 fuse_writepages_send(wpc->inode, data); 2298 } 2299 2300 if (data->ff) 2301 fuse_file_put(data->ff, false); 2302 2303 return error; 2304 } 2305 2306 static const struct iomap_writeback_ops fuse_writeback_ops = { 2307 .writeback_range = fuse_iomap_writeback_range, 2308 .writeback_submit = fuse_iomap_writeback_submit, 2309 }; 2310 2311 static int fuse_writepages(struct address_space *mapping, 2312 struct writeback_control *wbc) 2313 { 2314 struct inode *inode = mapping->host; 2315 struct fuse_conn *fc = get_fuse_conn(inode); 2316 struct fuse_fill_wb_data data = {}; 2317 struct iomap_writepage_ctx wpc = { 2318 .inode = inode, 2319 .iomap.type = IOMAP_MAPPED, 2320 .wbc = wbc, 2321 .ops = &fuse_writeback_ops, 2322 .wb_ctx = &data, 2323 }; 2324 2325 if (fuse_is_bad(inode)) 2326 return -EIO; 2327 2328 if (wbc->sync_mode == WB_SYNC_NONE && 2329 fuse_chan_num_background(fc->chan) >= fc->congestion_threshold) 2330 return 0; 2331 2332 return iomap_writepages(&wpc); 2333 } 2334 2335 static int fuse_launder_folio(struct folio *folio) 2336 { 2337 int err = 0; 2338 struct fuse_fill_wb_data data = {}; 2339 struct iomap_writepage_ctx wpc = { 2340 .inode = folio->mapping->host, 2341 .iomap.type = IOMAP_MAPPED, 2342 .ops = &fuse_writeback_ops, 2343 .wb_ctx = &data, 2344 }; 2345 2346 if (folio_clear_dirty_for_io(folio)) { 2347 err = iomap_writeback_folio(&wpc, folio); 2348 err = fuse_iomap_writeback_submit(&wpc, err); 2349 if (!err) 2350 folio_wait_writeback(folio); 2351 } 2352 return err; 2353 } 2354 2355 /* 2356 * Write back dirty data/metadata now (there may not be any suitable 2357 * open files later for data) 2358 */ 2359 static void fuse_vma_close(struct vm_area_struct *vma) 2360 { 2361 int err; 2362 2363 err = write_inode_now(vma->vm_file->f_mapping->host, 1); 2364 mapping_set_error(vma->vm_file->f_mapping, err); 2365 } 2366 2367 /* 2368 * Wait for writeback against this page to complete before allowing it 2369 * to be marked dirty again, and hence written back again, possibly 2370 * before the previous writepage completed. 2371 * 2372 * Block here, instead of in ->writepage(), so that the userspace fs 2373 * can only block processes actually operating on the filesystem. 2374 * 2375 * Otherwise unprivileged userspace fs would be able to block 2376 * unrelated: 2377 * 2378 * - page migration 2379 * - sync(2) 2380 * - try_to_free_pages() with order > PAGE_ALLOC_COSTLY_ORDER 2381 */ 2382 static vm_fault_t fuse_page_mkwrite(struct vm_fault *vmf) 2383 { 2384 struct folio *folio = page_folio(vmf->page); 2385 struct inode *inode = file_inode(vmf->vma->vm_file); 2386 2387 file_update_time(vmf->vma->vm_file); 2388 folio_lock(folio); 2389 if (folio->mapping != inode->i_mapping) { 2390 folio_unlock(folio); 2391 return VM_FAULT_NOPAGE; 2392 } 2393 2394 folio_wait_writeback(folio); 2395 return VM_FAULT_LOCKED; 2396 } 2397 2398 static const struct vm_operations_struct fuse_file_vm_ops = { 2399 .close = fuse_vma_close, 2400 .fault = filemap_fault, 2401 .map_pages = filemap_map_pages, 2402 .page_mkwrite = fuse_page_mkwrite, 2403 }; 2404 2405 static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma) 2406 { 2407 struct fuse_file *ff = file->private_data; 2408 struct fuse_conn *fc = ff->fm->fc; 2409 struct inode *inode = file_inode(file); 2410 int rc; 2411 2412 /* DAX mmap is superior to direct_io mmap */ 2413 if (FUSE_IS_DAX(inode)) 2414 return fuse_dax_mmap(file, vma); 2415 2416 /* 2417 * If inode is in passthrough io mode, because it has some file open 2418 * in passthrough mode, either mmap to backing file or fail mmap, 2419 * because mixing cached mmap and passthrough io mode is not allowed. 2420 */ 2421 if (fuse_file_passthrough(ff)) 2422 return fuse_passthrough_mmap(file, vma); 2423 else if (fuse_inode_backing(get_fuse_inode(inode))) 2424 return -ENODEV; 2425 2426 /* 2427 * FOPEN_DIRECT_IO handling is special compared to O_DIRECT, 2428 * as does not allow MAP_SHARED mmap without FUSE_DIRECT_IO_ALLOW_MMAP. 2429 */ 2430 if (ff->open_flags & FOPEN_DIRECT_IO) { 2431 /* 2432 * Can't provide the coherency needed for MAP_SHARED 2433 * if FUSE_DIRECT_IO_ALLOW_MMAP isn't set. 2434 */ 2435 if ((vma->vm_flags & VM_MAYSHARE) && !fc->direct_io_allow_mmap) 2436 return -ENODEV; 2437 2438 invalidate_inode_pages2(file->f_mapping); 2439 2440 if (!(vma->vm_flags & VM_MAYSHARE)) { 2441 /* MAP_PRIVATE */ 2442 return generic_file_mmap(file, vma); 2443 } 2444 2445 /* 2446 * First mmap of direct_io file enters caching inode io mode. 2447 * Also waits for parallel dio writers to go into serial mode 2448 * (exclusive instead of shared lock). 2449 * After first mmap, the inode stays in caching io mode until 2450 * the direct_io file release. 2451 */ 2452 rc = fuse_file_cached_io_open(inode, ff); 2453 if (rc) 2454 return rc; 2455 } 2456 2457 if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) 2458 fuse_link_write_file(file); 2459 2460 file_accessed(file); 2461 vma->vm_ops = &fuse_file_vm_ops; 2462 return 0; 2463 } 2464 2465 static int convert_fuse_file_lock(struct fuse_conn *fc, 2466 const struct fuse_file_lock *ffl, 2467 struct file_lock *fl) 2468 { 2469 switch (ffl->type) { 2470 case F_UNLCK: 2471 break; 2472 2473 case F_RDLCK: 2474 case F_WRLCK: 2475 if (ffl->start > OFFSET_MAX || ffl->end > OFFSET_MAX || 2476 ffl->end < ffl->start) 2477 return -EIO; 2478 2479 fl->fl_start = ffl->start; 2480 fl->fl_end = ffl->end; 2481 2482 /* 2483 * Convert pid into init's pid namespace. The locks API will 2484 * translate it into the caller's pid namespace. 2485 */ 2486 rcu_read_lock(); 2487 fl->c.flc_pid = pid_nr_ns(find_pid_ns(ffl->pid, fc->pid_ns), &init_pid_ns); 2488 rcu_read_unlock(); 2489 break; 2490 2491 default: 2492 return -EIO; 2493 } 2494 fl->c.flc_type = ffl->type; 2495 return 0; 2496 } 2497 2498 static void fuse_lk_fill(struct fuse_args *args, struct file *file, 2499 const struct file_lock *fl, int opcode, pid_t pid, 2500 int flock, struct fuse_lk_in *inarg) 2501 { 2502 struct inode *inode = file_inode(file); 2503 struct fuse_conn *fc = get_fuse_conn(inode); 2504 struct fuse_file *ff = file->private_data; 2505 2506 memset(inarg, 0, sizeof(*inarg)); 2507 inarg->fh = ff->fh; 2508 inarg->owner = fuse_lock_owner_id(fc, fl->c.flc_owner); 2509 inarg->lk.start = fl->fl_start; 2510 inarg->lk.end = fl->fl_end; 2511 inarg->lk.type = fl->c.flc_type; 2512 inarg->lk.pid = pid; 2513 if (flock) 2514 inarg->lk_flags |= FUSE_LK_FLOCK; 2515 args->opcode = opcode; 2516 args->nodeid = get_node_id(inode); 2517 args->in_numargs = 1; 2518 args->in_args[0].size = sizeof(*inarg); 2519 args->in_args[0].value = inarg; 2520 } 2521 2522 static int fuse_getlk(struct file *file, struct file_lock *fl) 2523 { 2524 struct inode *inode = file_inode(file); 2525 struct fuse_mount *fm = get_fuse_mount(inode); 2526 FUSE_ARGS(args); 2527 struct fuse_lk_in inarg; 2528 struct fuse_lk_out outarg; 2529 int err; 2530 2531 fuse_lk_fill(&args, file, fl, FUSE_GETLK, 0, 0, &inarg); 2532 args.out_numargs = 1; 2533 args.out_args[0].size = sizeof(outarg); 2534 args.out_args[0].value = &outarg; 2535 err = fuse_simple_request(fm, &args); 2536 if (!err) 2537 err = convert_fuse_file_lock(fm->fc, &outarg.lk, fl); 2538 2539 return err; 2540 } 2541 2542 static int fuse_setlk(struct file *file, struct file_lock *fl, int flock) 2543 { 2544 struct inode *inode = file_inode(file); 2545 struct fuse_mount *fm = get_fuse_mount(inode); 2546 FUSE_ARGS(args); 2547 struct fuse_lk_in inarg; 2548 int opcode = (fl->c.flc_flags & FL_SLEEP) ? FUSE_SETLKW : FUSE_SETLK; 2549 struct pid *pid = fl->c.flc_type != F_UNLCK ? task_tgid(current) : NULL; 2550 pid_t pid_nr = pid_nr_ns(pid, fm->fc->pid_ns); 2551 int err; 2552 2553 if (fl->fl_lmops && fl->fl_lmops->lm_grant) { 2554 /* NLM needs asynchronous locks, which we don't support yet */ 2555 return -ENOLCK; 2556 } 2557 2558 fuse_lk_fill(&args, file, fl, opcode, pid_nr, flock, &inarg); 2559 err = fuse_simple_request(fm, &args); 2560 2561 /* locking is restartable */ 2562 if (err == -EINTR) 2563 err = -ERESTARTSYS; 2564 2565 return err; 2566 } 2567 2568 static int fuse_file_lock(struct file *file, int cmd, struct file_lock *fl) 2569 { 2570 struct inode *inode = file_inode(file); 2571 struct fuse_conn *fc = get_fuse_conn(inode); 2572 int err; 2573 2574 if (cmd == F_CANCELLK) { 2575 err = 0; 2576 } else if (cmd == F_GETLK) { 2577 if (fc->no_lock) { 2578 posix_test_lock(file, fl); 2579 err = 0; 2580 } else 2581 err = fuse_getlk(file, fl); 2582 } else { 2583 if (fc->no_lock) 2584 err = posix_lock_file(file, fl, NULL); 2585 else 2586 err = fuse_setlk(file, fl, 0); 2587 } 2588 return err; 2589 } 2590 2591 static int fuse_file_flock(struct file *file, int cmd, struct file_lock *fl) 2592 { 2593 struct inode *inode = file_inode(file); 2594 struct fuse_conn *fc = get_fuse_conn(inode); 2595 int err; 2596 2597 if (fc->no_flock) { 2598 err = locks_lock_file_wait(file, fl); 2599 } else { 2600 struct fuse_file *ff = file->private_data; 2601 2602 /* emulate flock with POSIX locks */ 2603 err = fuse_setlk(file, fl, 1); 2604 if (!err) 2605 ff->flock = true; 2606 } 2607 2608 return err; 2609 } 2610 2611 static sector_t fuse_bmap(struct address_space *mapping, sector_t block) 2612 { 2613 struct inode *inode = mapping->host; 2614 struct fuse_mount *fm = get_fuse_mount(inode); 2615 FUSE_ARGS(args); 2616 struct fuse_bmap_in inarg; 2617 struct fuse_bmap_out outarg; 2618 int err; 2619 2620 if (!inode->i_sb->s_bdev || fm->fc->no_bmap) 2621 return 0; 2622 2623 memset(&inarg, 0, sizeof(inarg)); 2624 inarg.block = block; 2625 inarg.blocksize = inode->i_sb->s_blocksize; 2626 args.opcode = FUSE_BMAP; 2627 args.nodeid = get_node_id(inode); 2628 args.in_numargs = 1; 2629 args.in_args[0].size = sizeof(inarg); 2630 args.in_args[0].value = &inarg; 2631 args.out_numargs = 1; 2632 args.out_args[0].size = sizeof(outarg); 2633 args.out_args[0].value = &outarg; 2634 err = fuse_simple_request(fm, &args); 2635 if (err == -ENOSYS) 2636 fm->fc->no_bmap = 1; 2637 2638 return err ? 0 : outarg.block; 2639 } 2640 2641 static loff_t fuse_lseek(struct file *file, loff_t offset, int whence) 2642 { 2643 struct inode *inode = file->f_mapping->host; 2644 struct fuse_mount *fm = get_fuse_mount(inode); 2645 struct fuse_file *ff = file->private_data; 2646 FUSE_ARGS(args); 2647 struct fuse_lseek_in inarg = { 2648 .fh = ff->fh, 2649 .offset = offset, 2650 .whence = whence 2651 }; 2652 struct fuse_lseek_out outarg; 2653 int err; 2654 2655 if (fm->fc->no_lseek) 2656 goto fallback; 2657 2658 args.opcode = FUSE_LSEEK; 2659 args.nodeid = ff->nodeid; 2660 args.in_numargs = 1; 2661 args.in_args[0].size = sizeof(inarg); 2662 args.in_args[0].value = &inarg; 2663 args.out_numargs = 1; 2664 args.out_args[0].size = sizeof(outarg); 2665 args.out_args[0].value = &outarg; 2666 err = fuse_simple_request(fm, &args); 2667 if (err) { 2668 if (err == -ENOSYS) { 2669 fm->fc->no_lseek = 1; 2670 goto fallback; 2671 } 2672 return err; 2673 } 2674 2675 return vfs_setpos(file, outarg.offset, inode->i_sb->s_maxbytes); 2676 2677 fallback: 2678 err = fuse_update_attributes(inode, file, STATX_SIZE); 2679 if (!err) 2680 return generic_file_llseek(file, offset, whence); 2681 else 2682 return err; 2683 } 2684 2685 static loff_t fuse_file_llseek(struct file *file, loff_t offset, int whence) 2686 { 2687 loff_t retval; 2688 struct inode *inode = file_inode(file); 2689 2690 switch (whence) { 2691 case SEEK_SET: 2692 case SEEK_CUR: 2693 /* No i_mutex protection necessary for SEEK_CUR and SEEK_SET */ 2694 retval = generic_file_llseek(file, offset, whence); 2695 break; 2696 case SEEK_END: 2697 inode_lock(inode); 2698 retval = fuse_update_attributes(inode, file, STATX_SIZE); 2699 if (!retval) 2700 retval = generic_file_llseek(file, offset, whence); 2701 inode_unlock(inode); 2702 break; 2703 case SEEK_HOLE: 2704 case SEEK_DATA: 2705 inode_lock(inode); 2706 retval = fuse_lseek(file, offset, whence); 2707 inode_unlock(inode); 2708 break; 2709 default: 2710 retval = -EINVAL; 2711 } 2712 2713 return retval; 2714 } 2715 2716 static void fuse_do_truncate(struct file *file) 2717 { 2718 struct inode *inode = file->f_mapping->host; 2719 struct iattr attr; 2720 2721 attr.ia_valid = ATTR_SIZE; 2722 attr.ia_size = i_size_read(inode); 2723 2724 attr.ia_file = file; 2725 attr.ia_valid |= ATTR_FILE; 2726 2727 fuse_do_setattr(file_mnt_idmap(file), file_dentry(file), &attr, file); 2728 } 2729 2730 static inline loff_t fuse_round_up(struct fuse_conn *fc, loff_t off) 2731 { 2732 return round_up(off, fc->max_pages << PAGE_SHIFT); 2733 } 2734 2735 static ssize_t 2736 fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter) 2737 { 2738 DECLARE_COMPLETION_ONSTACK(wait); 2739 ssize_t ret = 0; 2740 struct file *file = iocb->ki_filp; 2741 struct fuse_file *ff = file->private_data; 2742 loff_t pos = 0; 2743 struct inode *inode; 2744 loff_t i_size; 2745 size_t count = iov_iter_count(iter), shortened = 0; 2746 loff_t offset = iocb->ki_pos; 2747 struct fuse_io_priv *io; 2748 bool async = ff->fm->fc->async_dio; 2749 2750 pos = offset; 2751 inode = file->f_mapping->host; 2752 i_size = i_size_read(inode); 2753 2754 if ((iov_iter_rw(iter) == READ) && (offset >= i_size)) 2755 return 0; 2756 2757 if ((iov_iter_rw(iter) == WRITE) && async && !inode->i_sb->s_dio_done_wq) { 2758 ret = sb_init_dio_done_wq(inode->i_sb); 2759 if (ret < 0) 2760 return ret; 2761 } 2762 2763 io = kmalloc_obj(struct fuse_io_priv); 2764 if (!io) 2765 return -ENOMEM; 2766 spin_lock_init(&io->lock); 2767 kref_init(&io->refcnt); 2768 io->reqs = 1; 2769 io->bytes = -1; 2770 io->size = 0; 2771 io->offset = offset; 2772 io->write = (iov_iter_rw(iter) == WRITE); 2773 io->err = 0; 2774 /* 2775 * By default, we want to optimize all I/Os with async request 2776 * submission to the client filesystem if supported. 2777 */ 2778 io->async = async; 2779 io->iocb = iocb; 2780 io->blocking = is_sync_kiocb(iocb); 2781 2782 /* optimization for short read */ 2783 if (io->async && !io->write && offset + count > i_size) { 2784 iov_iter_truncate(iter, fuse_round_up(ff->fm->fc, i_size - offset)); 2785 shortened = count - iov_iter_count(iter); 2786 count -= shortened; 2787 } 2788 2789 /* 2790 * We cannot asynchronously extend the size of a file. 2791 * In such case the aio will behave exactly like sync io. 2792 */ 2793 if ((offset + count > i_size) && io->write) 2794 io->blocking = true; 2795 2796 if (io->async && io->blocking) { 2797 /* 2798 * Additional reference to keep io around after 2799 * calling fuse_aio_complete() 2800 */ 2801 kref_get(&io->refcnt); 2802 io->done = &wait; 2803 } 2804 2805 if (iov_iter_rw(iter) == WRITE) { 2806 ret = fuse_direct_io(io, iter, &pos, FUSE_DIO_WRITE); 2807 fuse_invalidate_attr_mask(inode, FUSE_STATX_MODSIZE); 2808 } else { 2809 ret = __fuse_direct_read(io, iter, &pos); 2810 } 2811 iov_iter_reexpand(iter, iov_iter_count(iter) + shortened); 2812 2813 if (io->async) { 2814 bool blocking = io->blocking; 2815 2816 fuse_aio_complete(io, ret < 0 ? ret : 0, -1); 2817 2818 /* we have a non-extending, async request, so return */ 2819 if (!blocking) 2820 return -EIOCBQUEUED; 2821 2822 wait_for_completion(&wait); 2823 ret = fuse_get_res_by_io(io); 2824 } 2825 2826 kref_put(&io->refcnt, fuse_io_release); 2827 2828 if (iov_iter_rw(iter) == WRITE) { 2829 fuse_write_update_attr(inode, pos, ret); 2830 /* For extending writes we already hold exclusive lock */ 2831 if (ret < 0 && offset + count > i_size) 2832 fuse_do_truncate(file); 2833 } 2834 2835 return ret; 2836 } 2837 2838 static int fuse_writeback_range(struct inode *inode, loff_t start, loff_t end) 2839 { 2840 int err = filemap_write_and_wait_range(inode->i_mapping, start, LLONG_MAX); 2841 2842 if (!err) 2843 fuse_sync_writes(inode); 2844 2845 return err; 2846 } 2847 2848 static long fuse_file_fallocate(struct file *file, int mode, loff_t offset, 2849 loff_t length) 2850 { 2851 struct fuse_file *ff = file->private_data; 2852 struct inode *inode = file_inode(file); 2853 struct fuse_inode *fi = get_fuse_inode(inode); 2854 struct fuse_mount *fm = ff->fm; 2855 FUSE_ARGS(args); 2856 struct fuse_fallocate_in inarg = { 2857 .fh = ff->fh, 2858 .offset = offset, 2859 .length = length, 2860 .mode = mode 2861 }; 2862 int err; 2863 bool block_faults = FUSE_IS_DAX(inode) && 2864 (!(mode & FALLOC_FL_KEEP_SIZE) || 2865 (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE))); 2866 2867 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | 2868 FALLOC_FL_ZERO_RANGE)) 2869 return -EOPNOTSUPP; 2870 2871 if (fm->fc->no_fallocate) 2872 return -EOPNOTSUPP; 2873 2874 inode_lock(inode); 2875 if (block_faults) { 2876 filemap_invalidate_lock(inode->i_mapping); 2877 err = fuse_dax_break_layouts(inode, 0, -1); 2878 if (err) 2879 goto out; 2880 } 2881 2882 if (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE)) { 2883 loff_t endbyte = offset + length - 1; 2884 2885 err = fuse_writeback_range(inode, offset, endbyte); 2886 if (err) 2887 goto out; 2888 } 2889 2890 if (!(mode & FALLOC_FL_KEEP_SIZE) && 2891 offset + length > i_size_read(inode)) { 2892 err = inode_newsize_ok(inode, offset + length); 2893 if (err) 2894 goto out; 2895 } 2896 2897 err = file_modified(file); 2898 if (err) 2899 goto out; 2900 2901 if (!(mode & FALLOC_FL_KEEP_SIZE)) 2902 set_bit(FUSE_I_SIZE_UNSTABLE, &fi->state); 2903 2904 args.opcode = FUSE_FALLOCATE; 2905 args.nodeid = ff->nodeid; 2906 args.in_numargs = 1; 2907 args.in_args[0].size = sizeof(inarg); 2908 args.in_args[0].value = &inarg; 2909 err = fuse_simple_request(fm, &args); 2910 if (err == -ENOSYS) { 2911 fm->fc->no_fallocate = 1; 2912 err = -EOPNOTSUPP; 2913 } 2914 if (err) 2915 goto out; 2916 2917 /* we could have extended the file */ 2918 if (!(mode & FALLOC_FL_KEEP_SIZE)) { 2919 if (fuse_write_update_attr(inode, offset + length, length)) 2920 file_update_time(file); 2921 } 2922 2923 if (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE)) 2924 truncate_pagecache_range(inode, offset, offset + length - 1); 2925 2926 fuse_invalidate_attr_mask(inode, FUSE_STATX_MODSIZE); 2927 2928 out: 2929 if (!(mode & FALLOC_FL_KEEP_SIZE)) 2930 clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state); 2931 2932 if (block_faults) 2933 filemap_invalidate_unlock(inode->i_mapping); 2934 2935 inode_unlock(inode); 2936 2937 fuse_flush_time_update(inode); 2938 2939 return err; 2940 } 2941 2942 static ssize_t __fuse_copy_file_range(struct file *file_in, loff_t pos_in, 2943 struct file *file_out, loff_t pos_out, 2944 size_t len, unsigned int flags) 2945 { 2946 struct fuse_file *ff_in = file_in->private_data; 2947 struct fuse_file *ff_out = file_out->private_data; 2948 struct inode *inode_in = file_inode(file_in); 2949 struct inode *inode_out = file_inode(file_out); 2950 struct fuse_inode *fi_out = get_fuse_inode(inode_out); 2951 struct fuse_mount *fm = ff_in->fm; 2952 struct fuse_conn *fc = fm->fc; 2953 FUSE_ARGS(args); 2954 struct fuse_copy_file_range_in inarg = { 2955 .fh_in = ff_in->fh, 2956 .off_in = pos_in, 2957 .nodeid_out = ff_out->nodeid, 2958 .fh_out = ff_out->fh, 2959 .off_out = pos_out, 2960 .len = len, 2961 .flags = flags 2962 }; 2963 struct fuse_write_out outarg; 2964 struct fuse_copy_file_range_out outarg_64; 2965 u64 bytes_copied; 2966 ssize_t err; 2967 /* mark unstable when write-back is not used, and file_out gets 2968 * extended */ 2969 bool is_unstable = (!fc->writeback_cache) && 2970 ((pos_out + len) > inode_out->i_size); 2971 2972 if (fc->no_copy_file_range) 2973 return -EOPNOTSUPP; 2974 2975 if (file_inode(file_in)->i_sb != file_inode(file_out)->i_sb) 2976 return -EXDEV; 2977 2978 inode_lock(inode_in); 2979 err = fuse_writeback_range(inode_in, pos_in, pos_in + len - 1); 2980 inode_unlock(inode_in); 2981 if (err) 2982 return err; 2983 2984 inode_lock(inode_out); 2985 2986 err = file_modified(file_out); 2987 if (err) 2988 goto out; 2989 2990 /* 2991 * Write out dirty pages in the destination file before sending the COPY 2992 * request to userspace. After the request is completed, truncate off 2993 * pages (including partial ones) from the cache that have been copied, 2994 * since these contain stale data at that point. 2995 * 2996 * This should be mostly correct, but if the COPY writes to partial 2997 * pages (at the start or end) and the parts not covered by the COPY are 2998 * written through a memory map after calling fuse_writeback_range(), 2999 * then these partial page modifications will be lost on truncation. 3000 * 3001 * It is unlikely that someone would rely on such mixed style 3002 * modifications. Yet this does give less guarantees than if the 3003 * copying was performed with write(2). 3004 * 3005 * To fix this a mapping->invalidate_lock could be used to prevent new 3006 * faults while the copy is ongoing. 3007 */ 3008 err = fuse_writeback_range(inode_out, pos_out, pos_out + len - 1); 3009 if (err) 3010 goto out; 3011 3012 if (is_unstable) 3013 set_bit(FUSE_I_SIZE_UNSTABLE, &fi_out->state); 3014 3015 args.opcode = FUSE_COPY_FILE_RANGE_64; 3016 args.nodeid = ff_in->nodeid; 3017 args.in_numargs = 1; 3018 args.in_args[0].size = sizeof(inarg); 3019 args.in_args[0].value = &inarg; 3020 args.out_numargs = 1; 3021 args.out_args[0].size = sizeof(outarg_64); 3022 args.out_args[0].value = &outarg_64; 3023 if (fc->no_copy_file_range_64) { 3024 fallback: 3025 /* Fall back to old op that can't handle large copy length */ 3026 args.opcode = FUSE_COPY_FILE_RANGE; 3027 args.out_args[0].size = sizeof(outarg); 3028 args.out_args[0].value = &outarg; 3029 inarg.len = len = min_t(size_t, len, UINT_MAX & PAGE_MASK); 3030 } 3031 err = fuse_simple_request(fm, &args); 3032 if (err == -ENOSYS) { 3033 if (fc->no_copy_file_range_64) { 3034 fc->no_copy_file_range = 1; 3035 err = -EOPNOTSUPP; 3036 } else { 3037 fc->no_copy_file_range_64 = 1; 3038 goto fallback; 3039 } 3040 } 3041 if (err) 3042 goto out; 3043 3044 bytes_copied = fc->no_copy_file_range_64 ? 3045 outarg.size : outarg_64.bytes_copied; 3046 3047 if (bytes_copied > len) { 3048 err = -EIO; 3049 goto out; 3050 } 3051 3052 truncate_inode_pages_range(inode_out->i_mapping, 3053 ALIGN_DOWN(pos_out, PAGE_SIZE), 3054 ALIGN(pos_out + bytes_copied, PAGE_SIZE) - 1); 3055 3056 file_update_time(file_out); 3057 fuse_write_update_attr(inode_out, pos_out + bytes_copied, bytes_copied); 3058 3059 err = bytes_copied; 3060 out: 3061 if (is_unstable) 3062 clear_bit(FUSE_I_SIZE_UNSTABLE, &fi_out->state); 3063 3064 inode_unlock(inode_out); 3065 file_accessed(file_in); 3066 3067 fuse_flush_time_update(inode_out); 3068 3069 return err; 3070 } 3071 3072 static ssize_t fuse_copy_file_range(struct file *src_file, loff_t src_off, 3073 struct file *dst_file, loff_t dst_off, 3074 size_t len, unsigned int flags) 3075 { 3076 ssize_t ret; 3077 3078 ret = __fuse_copy_file_range(src_file, src_off, dst_file, dst_off, 3079 len, flags); 3080 3081 if (ret == -EOPNOTSUPP || ret == -EXDEV) 3082 ret = splice_copy_file_range(src_file, src_off, dst_file, 3083 dst_off, len); 3084 return ret; 3085 } 3086 3087 static const struct file_operations fuse_file_operations = { 3088 .llseek = fuse_file_llseek, 3089 .read_iter = fuse_file_read_iter, 3090 .write_iter = fuse_file_write_iter, 3091 .mmap = fuse_file_mmap, 3092 .open = fuse_open, 3093 .flush = fuse_flush, 3094 .release = fuse_release, 3095 .fsync = fuse_fsync, 3096 .lock = fuse_file_lock, 3097 .get_unmapped_area = thp_get_unmapped_area, 3098 .flock = fuse_file_flock, 3099 .splice_read = fuse_splice_read, 3100 .splice_write = fuse_splice_write, 3101 .unlocked_ioctl = fuse_file_ioctl, 3102 .compat_ioctl = fuse_file_compat_ioctl, 3103 .poll = fuse_file_poll, 3104 .fallocate = fuse_file_fallocate, 3105 .copy_file_range = fuse_copy_file_range, 3106 .setlease = generic_setlease, 3107 }; 3108 3109 static const struct address_space_operations fuse_file_aops = { 3110 .read_folio = fuse_read_folio, 3111 .readahead = fuse_readahead, 3112 .writepages = fuse_writepages, 3113 .launder_folio = fuse_launder_folio, 3114 .dirty_folio = iomap_dirty_folio, 3115 .release_folio = iomap_release_folio, 3116 .invalidate_folio = iomap_invalidate_folio, 3117 .is_partially_uptodate = iomap_is_partially_uptodate, 3118 .migrate_folio = filemap_migrate_folio, 3119 .bmap = fuse_bmap, 3120 .direct_IO = fuse_direct_IO, 3121 }; 3122 3123 void fuse_init_file_inode(struct inode *inode, unsigned int flags) 3124 { 3125 struct fuse_inode *fi = get_fuse_inode(inode); 3126 struct fuse_conn *fc = get_fuse_conn(inode); 3127 3128 inode->i_fop = &fuse_file_operations; 3129 inode->i_data.a_ops = &fuse_file_aops; 3130 if (fc->writeback_cache) 3131 mapping_set_writeback_may_deadlock_on_reclaim(&inode->i_data); 3132 3133 INIT_LIST_HEAD(&fi->write_files); 3134 INIT_LIST_HEAD(&fi->queued_writes); 3135 fi->writectr = 0; 3136 fi->iocachectr = 0; 3137 init_waitqueue_head(&fi->page_waitq); 3138 init_waitqueue_head(&fi->direct_io_waitq); 3139 3140 if (IS_ENABLED(CONFIG_FUSE_DAX)) 3141 fuse_dax_inode_init(inode, flags); 3142 } 3143