/*
  FUSE: Filesystem in Userspace
  Copyright (C) 2001-2008  Miklos Szeredi <miklos@szeredi.hu>

  This program can be distributed under the terms of the GNU GPL.
  See the file COPYING.
*/

#include "fuse_i.h"

#include <linux/pagemap.h>
#include <linux/slab.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/sched/signal.h>
#include <linux/module.h>
#include <linux/swap.h>
#include <linux/falloc.h>
#include <linux/uio.h>
#include <linux/fs.h>
#include <linux/filelock.h>
#include <linux/splice.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/iomap.h>

static int fuse_send_open(struct fuse_mount *fm, u64 nodeid,
			  unsigned int open_flags, int opcode,
			  struct fuse_open_out *outargp)
{
	struct fuse_open_in inarg;
	FUSE_ARGS(args);

	memset(&inarg, 0, sizeof(inarg));
	inarg.flags = open_flags & ~(O_CREAT | O_EXCL | O_NOCTTY);
	if (!fm->fc->atomic_o_trunc)
		inarg.flags &= ~O_TRUNC;

	if (fm->fc->handle_killpriv_v2 &&
	    (inarg.flags & O_TRUNC) && !capable(CAP_FSETID)) {
		inarg.open_flags |= FUSE_OPEN_KILL_SUIDGID;
	}

	args.opcode = opcode;
	args.nodeid = nodeid;
	args.in_numargs = 1;
	args.in_args[0].size = sizeof(inarg);
	args.in_args[0].value = &inarg;
	args.out_numargs = 1;
	args.out_args[0].size = sizeof(*outargp);
	args.out_args[0].value = outargp;

	return fuse_simple_request(fm, &args);
}
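
/*
 * For context, the FUSE_OPEN request built above is answered by the
 * userspace server.  An illustrative libfuse low-level handler (a sketch,
 * not part of this file; backing_fd_for() is a hypothetical helper) would
 * fill in the handle that comes back as fuse_open_out::fh:
 *
 *	static void demo_open(fuse_req_t req, fuse_ino_t ino,
 *			      struct fuse_file_info *fi)
 *	{
 *		fi->fh = backing_fd_for(ino);
 *		fuse_reply_open(req, fi);
 *	}
 */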

struct fuse_file *fuse_file_alloc(struct fuse_mount *fm, bool release)
{
	struct fuse_file *ff;

	ff = kzalloc_obj(struct fuse_file, GFP_KERNEL_ACCOUNT);
	if (unlikely(!ff))
		return NULL;

	ff->fm = fm;
	if (release) {
		ff->args = kzalloc_obj(*ff->args, GFP_KERNEL_ACCOUNT);
		if (!ff->args) {
			kfree(ff);
			return NULL;
		}
	}

	INIT_LIST_HEAD(&ff->write_entry);
	refcount_set(&ff->count, 1);
	RB_CLEAR_NODE(&ff->polled_node);
	init_waitqueue_head(&ff->poll_wait);

	ff->kh = atomic64_inc_return(&fm->fc->khctr);

	return ff;
}

void fuse_file_free(struct fuse_file *ff)
{
	kfree(ff->args);
	kfree(ff);
}

static struct fuse_file *fuse_file_get(struct fuse_file *ff)
{
	refcount_inc(&ff->count);
	return ff;
}

static void fuse_release_end(struct fuse_mount *fm, struct fuse_args *args,
			     int error)
{
	struct fuse_release_args *ra = container_of(args, typeof(*ra), args);

	iput(ra->inode);
	kfree(ra);
}

static void fuse_file_put(struct fuse_file *ff, bool sync)
{
	if (refcount_dec_and_test(&ff->count)) {
		struct fuse_release_args *ra = &ff->args->release_args;
		struct fuse_args *args = (ra ? &ra->args : NULL);

		if (ra && ra->inode)
			fuse_file_io_release(ff, ra->inode);

		if (!args) {
			/* Do nothing when server does not implement 'opendir' */
		} else if (args->opcode == FUSE_RELEASE && ff->fm->fc->no_open) {
			fuse_release_end(ff->fm, args, 0);
		} else if (sync) {
			fuse_simple_request(ff->fm, args);
			fuse_release_end(ff->fm, args, 0);
		} else {
			/*
			 * DAX inodes may need to issue a number of synchronous
			 * requests for clearing the mappings.
			 */
			if (ra && ra->inode && FUSE_IS_DAX(ra->inode))
				args->may_block = true;
			args->end = fuse_release_end;
			if (fuse_simple_background(ff->fm, args,
						   GFP_KERNEL | __GFP_NOFAIL))
				fuse_release_end(ff->fm, args, -ENOTCONN);
		}
		kfree(ff);
	}
}

struct fuse_file *fuse_file_open(struct fuse_mount *fm, u64 nodeid,
				 unsigned int open_flags, bool isdir)
{
	struct fuse_conn *fc = fm->fc;
	struct fuse_file *ff;
	int opcode = isdir ? FUSE_OPENDIR : FUSE_OPEN;
	bool open = isdir ? !fc->no_opendir : !fc->no_open;
	bool release = !isdir || open;

	/*
	 * ff->args->release_args still needs to be allocated (so we can hold
	 * an inode reference while there are pending inflight file operations
	 * when ->release() is called, see fuse_prepare_release()) even if
	 * fc->no_open is set, else it becomes possible for reclaim to
	 * deadlock: while servicing the readahead request the server may
	 * trigger reclaim, and reclaim may evict the inode of the file being
	 * read ahead.
	 */
	ff = fuse_file_alloc(fm, release);
	if (!ff)
		return ERR_PTR(-ENOMEM);

	ff->fh = 0;
	/* Default for no-open */
	ff->open_flags = FOPEN_KEEP_CACHE | (isdir ? FOPEN_CACHE_DIR : 0);
	if (open) {
		/* Store outarg for fuse_finish_open() */
		struct fuse_open_out *outargp = &ff->args->open_outarg;
		int err;

		err = fuse_send_open(fm, nodeid, open_flags, opcode, outargp);
		if (!err) {
			ff->fh = outargp->fh;
			ff->open_flags = outargp->open_flags;
		} else if (err != -ENOSYS) {
			fuse_file_free(ff);
			return ERR_PTR(err);
		} else {
			if (isdir) {
				/* No release needed */
				kfree(ff->args);
				ff->args = NULL;
				fc->no_opendir = 1;
			} else {
				fc->no_open = 1;
			}
		}
	}

	if (isdir)
		ff->open_flags &= ~FOPEN_DIRECT_IO;

	ff->nodeid = nodeid;

	return ff;
}
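
/*
 * The FOPEN_* bits stored in ff->open_flags above are chosen by the
 * server in fuse_open_out::open_flags.  As an illustrative libfuse
 * sketch (not part of this file), a server opts into them through
 * fields of struct fuse_file_info before replying, e.g.:
 *
 *	fi->keep_cache = 1;	sets FOPEN_KEEP_CACHE
 *	fi->direct_io = 1;	sets FOPEN_DIRECT_IO
 *	fuse_reply_open(req, fi);
 */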

int fuse_do_open(struct fuse_mount *fm, u64 nodeid, struct file *file,
		 bool isdir)
{
	struct fuse_file *ff = fuse_file_open(fm, nodeid, file->f_flags, isdir);

	if (!IS_ERR(ff))
		file->private_data = ff;

	return PTR_ERR_OR_ZERO(ff);
}
EXPORT_SYMBOL_GPL(fuse_do_open);

static void fuse_link_write_file(struct file *file)
{
	struct inode *inode = file_inode(file);
	struct fuse_inode *fi = get_fuse_inode(inode);
	struct fuse_file *ff = file->private_data;
	/*
	 * file may be written through mmap, so chain it onto the
	 * inode's write_files list
	 */
	spin_lock(&fi->lock);
	if (list_empty(&ff->write_entry))
		list_add(&ff->write_entry, &fi->write_files);
	spin_unlock(&fi->lock);
}

int fuse_finish_open(struct inode *inode, struct file *file)
{
	struct fuse_file *ff = file->private_data;
	struct fuse_conn *fc = get_fuse_conn(inode);
	int err;

	err = fuse_file_io_open(file, inode);
	if (err)
		return err;

	if (ff->open_flags & FOPEN_STREAM)
		stream_open(inode, file);
	else if (ff->open_flags & FOPEN_NONSEEKABLE)
		nonseekable_open(inode, file);

	if ((file->f_mode & FMODE_WRITE) && fc->writeback_cache)
		fuse_link_write_file(file);

	return 0;
}

static void fuse_truncate_update_attr(struct inode *inode, struct file *file)
{
	struct fuse_conn *fc = get_fuse_conn(inode);
	struct fuse_inode *fi = get_fuse_inode(inode);

	spin_lock(&fi->lock);
	fi->attr_version = atomic64_inc_return(&fc->attr_version);
	i_size_write(inode, 0);
	spin_unlock(&fi->lock);
	file_update_time(file);
	fuse_invalidate_attr_mask(inode, FUSE_STATX_MODSIZE);
}

static int fuse_open(struct inode *inode, struct file *file)
{
	struct fuse_mount *fm = get_fuse_mount(inode);
	struct fuse_inode *fi = get_fuse_inode(inode);
	struct fuse_conn *fc = fm->fc;
	struct fuse_file *ff;
	int err;
	bool is_truncate = (file->f_flags & O_TRUNC) && fc->atomic_o_trunc;
	bool is_wb_truncate = is_truncate && fc->writeback_cache;
	bool dax_truncate = is_truncate && FUSE_IS_DAX(inode);

	if (fuse_is_bad(inode))
		return -EIO;

	err = generic_file_open(inode, file);
	if (err)
		return err;

	if (is_wb_truncate || dax_truncate)
		inode_lock(inode);

	if (dax_truncate) {
		filemap_invalidate_lock(inode->i_mapping);
		err = fuse_dax_break_layouts(inode, 0, -1);
		if (err)
			goto out_inode_unlock;
	}

	if (is_wb_truncate || dax_truncate)
		fuse_set_nowrite(inode);

	err = fuse_do_open(fm, get_node_id(inode), file, false);
	if (!err) {
		ff = file->private_data;
		err = fuse_finish_open(inode, file);
		if (err)
			fuse_sync_release(fi, ff, file->f_flags);
		else if (is_truncate)
			fuse_truncate_update_attr(inode, file);
	}

	if (is_wb_truncate || dax_truncate)
		fuse_release_nowrite(inode);
	if (!err) {
		if (is_truncate)
			truncate_pagecache(inode, 0);
		else if (!(ff->open_flags & FOPEN_KEEP_CACHE))
			invalidate_inode_pages2(inode->i_mapping);
	}
	if (dax_truncate)
		filemap_invalidate_unlock(inode->i_mapping);
out_inode_unlock:
	if (is_wb_truncate || dax_truncate)
		inode_unlock(inode);

	return err;
}

static void fuse_prepare_release(struct fuse_inode *fi, struct fuse_file *ff,
				 unsigned int flags, int opcode, bool sync)
{
	struct fuse_conn *fc = ff->fm->fc;
	struct fuse_release_args *ra = &ff->args->release_args;

	if (fuse_file_passthrough(ff))
		fuse_passthrough_release(ff, fuse_inode_backing(fi));

	/* Inode is NULL on error path of fuse_create_open() */
	if (likely(fi)) {
		spin_lock(&fi->lock);
		list_del(&ff->write_entry);
		spin_unlock(&fi->lock);
	}
	spin_lock(&fc->lock);
	if (!RB_EMPTY_NODE(&ff->polled_node))
		rb_erase(&ff->polled_node, &fc->polled_files);
	spin_unlock(&fc->lock);

	wake_up_interruptible_all(&ff->poll_wait);

	if (!ra)
		return;

	/* ff->args was used for open outarg */
	memset(ff->args, 0, sizeof(*ff->args));
	ra->inarg.fh = ff->fh;
	ra->inarg.flags = flags;
	ra->args.in_numargs = 1;
	ra->args.in_args[0].size = sizeof(struct fuse_release_in);
	ra->args.in_args[0].value = &ra->inarg;
	ra->args.opcode = opcode;
	ra->args.nodeid = ff->nodeid;
	ra->args.force = true;
	ra->args.nocreds = true;

	/*
	 * Hold inode until release is finished.
	 * From fuse_sync_release() the refcount is 1 and everything's
	 * synchronous, so we are fine with not doing igrab() here.
	 */
	ra->inode = sync ? NULL : igrab(&fi->inode);
}

void fuse_file_release(struct inode *inode, struct fuse_file *ff,
		       unsigned int open_flags, fl_owner_t id, bool isdir)
{
	struct fuse_inode *fi = get_fuse_inode(inode);
	struct fuse_release_args *ra = &ff->args->release_args;
	int opcode = isdir ? FUSE_RELEASEDIR : FUSE_RELEASE;

	fuse_prepare_release(fi, ff, open_flags, opcode, false);

	if (ra && ff->flock) {
		ra->inarg.release_flags |= FUSE_RELEASE_FLOCK_UNLOCK;
		ra->inarg.lock_owner = fuse_lock_owner_id(ff->fm->fc, id);
	}

	/*
	 * Normally this will send the RELEASE request, however if
	 * some asynchronous READ or WRITE requests are outstanding,
	 * the sending will be delayed.
	 *
	 * A synchronous RELEASE would be allowed (and desirable) on a
	 * fuseblk mount, because the server can be trusted not to screw up.
	 * Even so, always use the asynchronous file put, because the current
	 * thread might be the fuse server.  This can happen if a process
	 * starts some aio and closes the fd before the aio completes.  Since
	 * aio takes its own ref to the file, the IO completion has to drop
	 * the ref, which is how the fuse server can end up closing its
	 * clients' files.
	 */
	fuse_file_put(ff, false);
}

void fuse_release_common(struct file *file, bool isdir)
{
	fuse_file_release(file_inode(file), file->private_data, file->f_flags,
			  (fl_owner_t) file, isdir);
}

static int fuse_release(struct inode *inode, struct file *file)
{
	struct fuse_conn *fc = get_fuse_conn(inode);

	/*
	 * Dirty pages might remain despite write_inode_now() call from
	 * fuse_flush() due to writes racing with the close.
	 */
	if (fc->writeback_cache)
		write_inode_now(inode, 1);

	fuse_release_common(file, false);

	/* return value is ignored by VFS */
	return 0;
}

void fuse_sync_release(struct fuse_inode *fi, struct fuse_file *ff,
		       unsigned int flags)
{
	WARN_ON(refcount_read(&ff->count) > 1);
	fuse_prepare_release(fi, ff, flags, FUSE_RELEASE, true);
	fuse_file_put(ff, true);
}
EXPORT_SYMBOL_GPL(fuse_sync_release);

/*
 * Scramble the ID space with XTEA, so that the value of the files_struct
 * pointer is not exposed to userspace.
 */
u64 fuse_lock_owner_id(struct fuse_conn *fc, fl_owner_t id)
{
	u32 *k = fc->scramble_key;
	u64 v = (unsigned long) id;
	u32 v0 = v;
	u32 v1 = v >> 32;
	u32 sum = 0;
	int i;

	for (i = 0; i < 32; i++) {
		v0 += ((v1 << 4 ^ v1 >> 5) + v1) ^ (sum + k[sum & 3]);
		sum += 0x9E3779B9;
		v1 += ((v0 << 4 ^ v0 >> 5) + v0) ^ (sum + k[sum >> 11 & 3]);
	}

	return (u64) v0 + ((u64) v1 << 32);
}
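
/*
 * The loop above is the standard 32-round XTEA encipher (delta
 * 0x9E3779B9) of the 64-bit owner value under the 128-bit
 * fc->scramble_key.  An equivalent standalone model (an illustrative
 * sketch, not part of this file):
 *
 *	static uint64_t xtea_scramble(const uint32_t k[4], uint64_t v)
 *	{
 *		uint32_t v0 = v, v1 = v >> 32, sum = 0;
 *		int i;
 *
 *		for (i = 0; i < 32; i++) {
 *			v0 += ((v1 << 4 ^ v1 >> 5) + v1) ^ (sum + k[sum & 3]);
 *			sum += 0x9E3779B9;
 *			v1 += ((v0 << 4 ^ v0 >> 5) + v0) ^ (sum + k[sum >> 11 & 3]);
 *		}
 *		return (uint64_t)v0 + ((uint64_t)v1 << 32);
 *	}
 *
 * Equal lock owners always map to equal IDs, while the raw pointer
 * value is never exposed to the server.
 */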

struct fuse_writepage_args {
	struct fuse_io_args ia;
	struct list_head queue_entry;
	struct inode *inode;
	struct fuse_sync_bucket *bucket;
};

/*
 * Wait for all pending writepages on the inode to finish.
 *
 * This is currently done by blocking further writes with FUSE_NOWRITE
 * and waiting for all sent writes to complete.
 *
 * This must be called under i_mutex, otherwise the FUSE_NOWRITE usage
 * could conflict with truncation.
 */
static void fuse_sync_writes(struct inode *inode)
{
	fuse_set_nowrite(inode);
	fuse_release_nowrite(inode);
}

static int fuse_flush(struct file *file, fl_owner_t id)
{
	struct inode *inode = file_inode(file);
	struct fuse_mount *fm = get_fuse_mount(inode);
	struct fuse_file *ff = file->private_data;
	struct fuse_flush_in inarg;
	FUSE_ARGS(args);
	int err;

	if (fuse_is_bad(inode))
		return -EIO;

	if (ff->open_flags & FOPEN_NOFLUSH && !fm->fc->writeback_cache)
		return 0;

	err = write_inode_now(inode, 1);
	if (err)
		return err;

	err = filemap_check_errors(file->f_mapping);
	if (err)
		return err;

	err = 0;
	if (fm->fc->no_flush)
		goto inval_attr_out;

	memset(&inarg, 0, sizeof(inarg));
	inarg.fh = ff->fh;
	inarg.lock_owner = fuse_lock_owner_id(fm->fc, id);
	args.opcode = FUSE_FLUSH;
	args.nodeid = get_node_id(inode);
	args.in_numargs = 1;
	args.in_args[0].size = sizeof(inarg);
	args.in_args[0].value = &inarg;
	args.force = true;

	err = fuse_simple_request(fm, &args);
	if (err == -ENOSYS) {
		fm->fc->no_flush = 1;
		err = 0;
	}

inval_attr_out:
	/*
	 * In-memory i_blocks is not maintained by fuse; if writeback cache is
	 * enabled, i_blocks from the cached attr may not be accurate.
	 */
	if (!err && fm->fc->writeback_cache)
		fuse_invalidate_attr_mask(inode, STATX_BLOCKS);
	return err;
}

int fuse_fsync_common(struct file *file, loff_t start, loff_t end,
		      int datasync, int opcode)
{
	struct inode *inode = file->f_mapping->host;
	struct fuse_mount *fm = get_fuse_mount(inode);
	struct fuse_file *ff = file->private_data;
	FUSE_ARGS(args);
	struct fuse_fsync_in inarg;

	memset(&inarg, 0, sizeof(inarg));
	inarg.fh = ff->fh;
	inarg.fsync_flags = datasync ? FUSE_FSYNC_FDATASYNC : 0;
	args.opcode = opcode;
	args.nodeid = get_node_id(inode);
	args.in_numargs = 1;
	args.in_args[0].size = sizeof(inarg);
	args.in_args[0].value = &inarg;
	return fuse_simple_request(fm, &args);
}

static int fuse_fsync(struct file *file, loff_t start, loff_t end,
		      int datasync)
{
	struct inode *inode = file->f_mapping->host;
	struct fuse_conn *fc = get_fuse_conn(inode);
	int err;

	if (fuse_is_bad(inode))
		return -EIO;

	inode_lock(inode);

	/*
	 * Start writeback against all dirty pages of the inode, then
	 * wait for all outstanding writes, before sending the FSYNC
	 * request.
	 */
	err = file_write_and_wait_range(file, start, end);
	if (err)
		goto out;

	fuse_sync_writes(inode);

	/*
	 * Due to the implementation of fuse writeback,
	 * file_write_and_wait_range() does not catch errors.
	 * We have to do this directly after fuse_sync_writes().
	 */
	err = file_check_and_advance_wb_err(file);
	if (err)
		goto out;

	err = sync_inode_metadata(inode, 1);
	if (err)
		goto out;

	if (fc->no_fsync)
		goto out;

	err = fuse_fsync_common(file, start, end, datasync, FUSE_FSYNC);
	if (err == -ENOSYS) {
		fc->no_fsync = 1;
		err = 0;
	}
out:
	inode_unlock(inode);

	return err;
}

void fuse_read_args_fill(struct fuse_io_args *ia, struct file *file, loff_t pos,
			 size_t count, int opcode)
{
	struct fuse_file *ff = file->private_data;
	struct fuse_args *args = &ia->ap.args;

	ia->read.in.fh = ff->fh;
	ia->read.in.offset = pos;
	ia->read.in.size = count;
	ia->read.in.flags = file->f_flags;
	args->opcode = opcode;
	args->nodeid = ff->nodeid;
	args->in_numargs = 1;
	args->in_args[0].size = sizeof(ia->read.in);
	args->in_args[0].value = &ia->read.in;
	args->out_argvar = true;
	args->out_numargs = 1;
	args->out_args[0].size = count;
}

static void fuse_release_user_pages(struct fuse_args_pages *ap, ssize_t nres,
				    bool should_dirty)
{
	unsigned int i;

	for (i = 0; i < ap->num_folios; i++) {
		if (should_dirty)
			folio_mark_dirty_lock(ap->folios[i]);
		if (ap->args.is_pinned)
			unpin_folio(ap->folios[i]);
	}

	if (nres > 0 && ap->args.invalidate_vmap)
		invalidate_kernel_vmap_range(ap->args.vmap_base, nres);
}

static void fuse_io_release(struct kref *kref)
{
	kfree(container_of(kref, struct fuse_io_priv, refcnt));
}

static ssize_t fuse_get_res_by_io(struct fuse_io_priv *io)
{
	if (io->err)
		return io->err;

	if (io->bytes >= 0 && io->write)
		return -EIO;

	return io->bytes < 0 ? io->size : io->bytes;
}

/*
 * In case of short read, the caller sets 'pos' to the position of
 * actual end of fuse request in IO request. Otherwise, if bytes_requested
 * == bytes_transferred or rw == WRITE, the caller sets 'pos' to -1.
 *
 * An example:
 * User requested DIO read of 64K. It was split into two 32K fuse requests,
 * both submitted asynchronously. The first of them was ACKed by userspace as
 * fully completed (req->out.args[0].size == 32K) resulting in pos == -1. The
 * second request was ACKed as short, e.g. only 1K was read, resulting in
 * pos == 33K.
 *
 * Thus, when all fuse requests are completed, the minimal non-negative 'pos'
 * will be equal to the length of the longest contiguous fragment of
 * transferred data starting from the beginning of IO request.
 */
static void fuse_aio_complete(struct fuse_io_priv *io, int err, ssize_t pos)
{
	int left;

	spin_lock(&io->lock);
	if (err)
		io->err = io->err ?: err;
	else if (pos >= 0 && (io->bytes < 0 || pos < io->bytes))
		io->bytes = pos;

	left = --io->reqs;
	if (!left && io->blocking)
		complete(io->done);
	spin_unlock(&io->lock);

	if (!left && !io->blocking) {
		ssize_t res = fuse_get_res_by_io(io);

		if (res >= 0) {
			struct inode *inode = file_inode(io->iocb->ki_filp);
			struct fuse_conn *fc = get_fuse_conn(inode);
			struct fuse_inode *fi = get_fuse_inode(inode);

			spin_lock(&fi->lock);
			fi->attr_version = atomic64_inc_return(&fc->attr_version);
			spin_unlock(&fi->lock);
		}

		io->iocb->ki_complete(io->iocb, res);
	}

	kref_put(&io->refcnt, fuse_io_release);
}
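
/*
 * Illustrative model of the rule described above (a sketch, not part of
 * this file): each request reports pos == -1 when fully completed, or
 * the end of its transferred prefix within the whole IO otherwise.  The
 * aggregated result is the smallest non-negative pos, or the full size
 * when every request completed:
 *
 *	static ssize_t io_result(const ssize_t *pos, int nreqs, size_t size)
 *	{
 *		ssize_t res = -1;
 *		int i;
 *
 *		for (i = 0; i < nreqs; i++)
 *			if (pos[i] >= 0 && (res < 0 || pos[i] < res))
 *				res = pos[i];
 *		return res < 0 ? size : res;
 *	}
 *
 * For the 64K example above, pos = { -1, 33K } yields 33K, matching
 * what fuse_get_res_by_io() computes from io->bytes and io->size.
 */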

static struct fuse_io_args *fuse_io_alloc(struct fuse_io_priv *io,
					  unsigned int nfolios)
{
	struct fuse_io_args *ia;

	ia = kzalloc_obj(*ia, GFP_KERNEL);
	if (ia) {
		ia->io = io;
		ia->ap.folios = fuse_folios_alloc(nfolios, GFP_KERNEL,
						  &ia->ap.descs);
		if (!ia->ap.folios) {
			kfree(ia);
			ia = NULL;
		}
	}
	return ia;
}

static void fuse_io_free(struct fuse_io_args *ia)
{
	kfree(ia->ap.folios);
	kfree(ia);
}

static void fuse_aio_complete_req(struct fuse_mount *fm, struct fuse_args *args,
				  int err)
{
	struct fuse_io_args *ia = container_of(args, typeof(*ia), ap.args);
	struct fuse_io_priv *io = ia->io;
	ssize_t pos = -1;
	size_t nres;

	if (err) {
		/* Nothing */
	} else if (io->write) {
		if (ia->write.out.size > ia->write.in.size) {
			err = -EIO;
		} else {
			nres = ia->write.out.size;
			if (ia->write.in.size != ia->write.out.size)
				pos = ia->write.in.offset - io->offset +
				      ia->write.out.size;
		}
	} else {
		u32 outsize = args->out_args[0].size;

		nres = outsize;
		if (ia->read.in.size != outsize)
			pos = ia->read.in.offset - io->offset + outsize;
	}

	fuse_release_user_pages(&ia->ap, err ?: nres, io->should_dirty);

	fuse_aio_complete(io, err, pos);
	fuse_io_free(ia);
}

static ssize_t fuse_async_req_send(struct fuse_mount *fm,
				   struct fuse_io_args *ia, size_t num_bytes)
{
	ssize_t err;
	struct fuse_io_priv *io = ia->io;

	spin_lock(&io->lock);
	kref_get(&io->refcnt);
	io->size += num_bytes;
	io->reqs++;
	spin_unlock(&io->lock);

	ia->ap.args.end = fuse_aio_complete_req;
	ia->ap.args.may_block = io->should_dirty;
	err = fuse_simple_background(fm, &ia->ap.args, GFP_KERNEL);
	if (err)
		fuse_aio_complete_req(fm, &ia->ap.args, err);

	return num_bytes;
}

static ssize_t fuse_send_read(struct fuse_io_args *ia, loff_t pos, size_t count,
			      fl_owner_t owner)
{
	struct file *file = ia->io->iocb->ki_filp;
	struct fuse_file *ff = file->private_data;
	struct fuse_mount *fm = ff->fm;

	fuse_read_args_fill(ia, file, pos, count, FUSE_READ);
	if (owner != NULL) {
		ia->read.in.read_flags |= FUSE_READ_LOCKOWNER;
		ia->read.in.lock_owner = fuse_lock_owner_id(fm->fc, owner);
	}

	if (ia->io->async)
		return fuse_async_req_send(fm, ia, count);

	return fuse_simple_request(fm, &ia->ap.args);
}

static void fuse_read_update_size(struct inode *inode, loff_t size,
				  u64 attr_ver)
{
	struct fuse_conn *fc = get_fuse_conn(inode);
	struct fuse_inode *fi = get_fuse_inode(inode);

	spin_lock(&fi->lock);
	if (attr_ver >= fi->attr_version && size < inode->i_size &&
	    !test_bit(FUSE_I_SIZE_UNSTABLE, &fi->state)) {
		fi->attr_version = atomic64_inc_return(&fc->attr_version);
		i_size_write(inode, size);
	}
	spin_unlock(&fi->lock);
}

static void fuse_short_read(struct inode *inode, u64 attr_ver, size_t num_read,
			    struct fuse_args_pages *ap)
{
	struct fuse_conn *fc = get_fuse_conn(inode);

	/*
	 * If writeback_cache is enabled, a short read means there's a hole in
	 * the file. Some data after the hole is in page cache, but has not
	 * reached the client fs yet. So the hole is not present there.
	 */
	if (!fc->writeback_cache) {
		loff_t pos = folio_pos(ap->folios[0]) + num_read;
		fuse_read_update_size(inode, pos, attr_ver);
	}
}

static int fuse_do_readfolio(struct file *file, struct folio *folio,
			     size_t off, size_t len)
{
	struct inode *inode = folio->mapping->host;
	struct fuse_mount *fm = get_fuse_mount(inode);
	loff_t pos = folio_pos(folio) + off;
	struct fuse_folio_desc desc = {
		.offset = off,
		.length = len,
	};
	struct fuse_io_args ia = {
		.ap.args.page_zeroing = true,
		.ap.args.out_pages = true,
		.ap.num_folios = 1,
		.ap.folios = &folio,
		.ap.descs = &desc,
	};
	ssize_t res;
	u64 attr_ver;

	attr_ver = fuse_get_attr_version(fm->fc);

	/* Don't overflow end offset */
	if (pos + (desc.length - 1) == LLONG_MAX)
		desc.length--;

	fuse_read_args_fill(&ia, file, pos, desc.length, FUSE_READ);
	res = fuse_simple_request(fm, &ia.ap.args);
	if (res < 0)
		return res;
	/*
	 * Short read means EOF. If file size is larger, truncate it
	 */
	if (res < desc.length)
		fuse_short_read(inode, attr_ver, res, &ia.ap);

	return 0;
}

static int fuse_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
			    unsigned int flags, struct iomap *iomap,
			    struct iomap *srcmap)
{
	iomap->type = IOMAP_MAPPED;
	iomap->length = length;
	iomap->offset = offset;
	return 0;
}

static const struct iomap_ops fuse_iomap_ops = {
	.iomap_begin = fuse_iomap_begin,
};

struct fuse_fill_read_data {
	struct file *file;

	/* Fields below are used if sending the read request asynchronously */
	struct fuse_conn *fc;
	struct fuse_io_args *ia;
	unsigned int nr_bytes;
};

/* forward declarations */
static bool fuse_folios_need_send(struct fuse_conn *fc, loff_t pos,
				  unsigned len, struct fuse_args_pages *ap,
				  unsigned cur_bytes, bool write);
static void fuse_send_readpages(struct fuse_io_args *ia, struct file *file,
				unsigned int count, bool async);

static int fuse_handle_readahead(struct folio *folio,
				 struct readahead_control *rac,
				 struct fuse_fill_read_data *data, loff_t pos,
				 size_t len)
{
	struct fuse_io_args *ia = data->ia;
	size_t off = offset_in_folio(folio, pos);
	struct fuse_conn *fc = data->fc;
	struct fuse_args_pages *ap;
	unsigned int nr_pages;

	if (ia && fuse_folios_need_send(fc, pos, len, &ia->ap, data->nr_bytes,
					false)) {
		fuse_send_readpages(ia, data->file, data->nr_bytes,
				    fc->async_read);
		data->nr_bytes = 0;
		data->ia = NULL;
		ia = NULL;
	}
	if (!ia) {
		if (fc->num_background >= fc->congestion_threshold &&
		    rac->ra->async_size >= readahead_count(rac))
			/*
			 * Congested and only async pages left, so skip the
			 * rest.
			 */
			return -EAGAIN;

		nr_pages = min(fc->max_pages, readahead_count(rac));
		data->ia = fuse_io_alloc(NULL, nr_pages);
		if (!data->ia)
			return -ENOMEM;
		ia = data->ia;
	}
	folio_get(folio);
	ap = &ia->ap;
	ap->folios[ap->num_folios] = folio;
	ap->descs[ap->num_folios].offset = off;
	ap->descs[ap->num_folios].length = len;
	data->nr_bytes += len;
	ap->num_folios++;

	return 0;
}

static int fuse_iomap_read_folio_range_async(const struct iomap_iter *iter,
					     struct iomap_read_folio_ctx *ctx,
					     size_t len)
{
	struct fuse_fill_read_data *data = ctx->read_ctx;
	struct folio *folio = ctx->cur_folio;
	loff_t pos = iter->pos;
	size_t off = offset_in_folio(folio, pos);
	struct file *file = data->file;
	int ret;

	if (ctx->rac) {
		ret = fuse_handle_readahead(folio, ctx->rac, data, pos, len);
	} else {
		/*
		 * For non-readahead read requests, do reads synchronously
		 * since it's not guaranteed that the server can handle
		 * out-of-order reads.
		 */
		ret = fuse_do_readfolio(file, folio, off, len);
		if (!ret)
			iomap_finish_folio_read(folio, off, len, ret);
	}
	return ret;
}

static void fuse_iomap_submit_read(const struct iomap_iter *iter,
				   struct iomap_read_folio_ctx *ctx)
{
	struct fuse_fill_read_data *data = ctx->read_ctx;

	if (data->ia)
		fuse_send_readpages(data->ia, data->file, data->nr_bytes,
				    data->fc->async_read);
}

static const struct iomap_read_ops fuse_iomap_read_ops = {
	.read_folio_range = fuse_iomap_read_folio_range_async,
	.submit_read = fuse_iomap_submit_read,
};

static int fuse_read_folio(struct file *file, struct folio *folio)
{
	struct inode *inode = folio->mapping->host;
	struct fuse_fill_read_data data = {
		.file = file,
	};
	struct iomap_read_folio_ctx ctx = {
		.cur_folio = folio,
		.ops = &fuse_iomap_read_ops,
		.read_ctx = &data,
	};

	if (fuse_is_bad(inode)) {
		folio_unlock(folio);
		return -EIO;
	}

	iomap_read_folio(&fuse_iomap_ops, &ctx, NULL);
	fuse_invalidate_atime(inode);
	return 0;
}

static int fuse_iomap_read_folio_range(const struct iomap_iter *iter,
				       struct folio *folio, loff_t pos,
				       size_t len)
{
	struct file *file = iter->private;
	size_t off = offset_in_folio(folio, pos);

	return fuse_do_readfolio(file, folio, off, len);
}

static void fuse_readpages_end(struct fuse_mount *fm, struct fuse_args *args,
			       int err)
{
	int i;
	struct fuse_io_args *ia = container_of(args, typeof(*ia), ap.args);
	struct fuse_args_pages *ap = &ia->ap;
	size_t count = ia->read.in.size;
	size_t num_read = args->out_args[0].size;
	struct address_space *mapping;
	struct inode *inode;

	WARN_ON_ONCE(!ap->num_folios);
	mapping = ap->folios[0]->mapping;
	inode = mapping->host;

	/*
	 * Short read means EOF. If file size is larger, truncate it
	 */
	if (!err && num_read < count)
		fuse_short_read(inode, ia->read.attr_ver, num_read, ap);

	fuse_invalidate_atime(inode);

	for (i = 0; i < ap->num_folios; i++) {
		iomap_finish_folio_read(ap->folios[i], ap->descs[i].offset,
					ap->descs[i].length, err);
		folio_put(ap->folios[i]);
	}
	if (ia->ff)
		fuse_file_put(ia->ff, false);

	fuse_io_free(ia);
}

static void fuse_send_readpages(struct fuse_io_args *ia, struct file *file,
				unsigned int count, bool async)
{
	struct fuse_file *ff = file->private_data;
	struct fuse_mount *fm = ff->fm;
	struct fuse_args_pages *ap = &ia->ap;
	loff_t pos = folio_pos(ap->folios[0]);
	ssize_t res;
	int err;

	ap->args.out_pages = true;
	ap->args.page_zeroing = true;
	ap->args.page_replace = true;

	/* Don't overflow end offset */
	if (pos + (count - 1) == LLONG_MAX) {
		count--;
		ap->descs[ap->num_folios - 1].length--;
	}
	WARN_ON((loff_t) (pos + count) < 0);

	fuse_read_args_fill(ia, file, pos, count, FUSE_READ);
	ia->read.attr_ver = fuse_get_attr_version(fm->fc);
	if (async) {
		ia->ff = fuse_file_get(ff);
		ap->args.end = fuse_readpages_end;
		err = fuse_simple_background(fm, &ap->args, GFP_KERNEL);
		if (!err)
			return;
	} else {
		res = fuse_simple_request(fm, &ap->args);
		err = res < 0 ? res : 0;
	}
	fuse_readpages_end(fm, &ap->args, err);
}

static void fuse_readahead(struct readahead_control *rac)
{
	struct inode *inode = rac->mapping->host;
	struct fuse_conn *fc = get_fuse_conn(inode);
	struct fuse_fill_read_data data = {
		.file = rac->file,
		.fc = fc,
	};
	struct iomap_read_folio_ctx ctx = {
		.ops = &fuse_iomap_read_ops,
		.rac = rac,
		.read_ctx = &data
	};

	if (fuse_is_bad(inode))
		return;

	iomap_readahead(&fuse_iomap_ops, &ctx, NULL);
}

static ssize_t fuse_cache_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
	struct inode *inode = iocb->ki_filp->f_mapping->host;
	struct fuse_conn *fc = get_fuse_conn(inode);

	/*
	 * In auto invalidate mode, always update attributes on read.
	 * Otherwise, only update if we attempt to read past EOF (to ensure
	 * i_size is up to date).
	 */
	if (fc->auto_inval_data ||
	    (iocb->ki_pos + iov_iter_count(to) > i_size_read(inode))) {
		int err;
		err = fuse_update_attributes(inode, iocb->ki_filp, STATX_SIZE);
		if (err)
			return err;
	}

	return generic_file_read_iter(iocb, to);
}

static void fuse_write_args_fill(struct fuse_io_args *ia, struct fuse_file *ff,
				 loff_t pos, size_t count)
{
	struct fuse_args *args = &ia->ap.args;

	ia->write.in.fh = ff->fh;
	ia->write.in.offset = pos;
	ia->write.in.size = count;
	args->opcode = FUSE_WRITE;
	args->nodeid = ff->nodeid;
	args->in_numargs = 2;
	if (ff->fm->fc->minor < 9)
		args->in_args[0].size = FUSE_COMPAT_WRITE_IN_SIZE;
	else
		args->in_args[0].size = sizeof(ia->write.in);
	args->in_args[0].value = &ia->write.in;
	args->in_args[1].size = count;
	args->out_numargs = 1;
	args->out_args[0].size = sizeof(ia->write.out);
	args->out_args[0].value = &ia->write.out;
}

static unsigned int fuse_write_flags(struct kiocb *iocb)
{
	unsigned int flags = iocb->ki_filp->f_flags;

	if (iocb_is_dsync(iocb))
		flags |= O_DSYNC;
	if (iocb->ki_flags & IOCB_SYNC)
		flags |= O_SYNC;

	return flags;
}

static ssize_t fuse_send_write(struct fuse_io_args *ia, loff_t pos,
			       size_t count, fl_owner_t owner)
{
	struct kiocb *iocb = ia->io->iocb;
	struct file *file = iocb->ki_filp;
	struct fuse_file *ff = file->private_data;
	struct fuse_mount *fm = ff->fm;
	struct fuse_write_in *inarg = &ia->write.in;
	ssize_t err;

	fuse_write_args_fill(ia, ff, pos, count);
	inarg->flags = fuse_write_flags(iocb);
	if (owner != NULL) {
		inarg->write_flags |= FUSE_WRITE_LOCKOWNER;
		inarg->lock_owner = fuse_lock_owner_id(fm->fc, owner);
	}

	if (ia->io->async)
		return fuse_async_req_send(fm, ia, count);

	err = fuse_simple_request(fm, &ia->ap.args);
	if (!err && ia->write.out.size > count)
		err = -EIO;

	return err ?: ia->write.out.size;
}

bool fuse_write_update_attr(struct inode *inode, loff_t pos, ssize_t written)
{
	struct fuse_conn *fc = get_fuse_conn(inode);
	struct fuse_inode *fi = get_fuse_inode(inode);
	bool ret = false;

	spin_lock(&fi->lock);
	fi->attr_version = atomic64_inc_return(&fc->attr_version);
	if (written > 0 && pos > inode->i_size) {
		i_size_write(inode, pos);
		ret = true;
	}
	spin_unlock(&fi->lock);

	fuse_invalidate_attr_mask(inode, FUSE_STATX_MODSIZE);

	return ret;
}

static ssize_t fuse_send_write_pages(struct fuse_io_args *ia,
				     struct kiocb *iocb, struct inode *inode,
				     loff_t pos, size_t count)
{
	struct fuse_args_pages *ap = &ia->ap;
	struct file *file = iocb->ki_filp;
	struct fuse_file *ff = file->private_data;
	struct fuse_mount *fm = ff->fm;
	unsigned int offset, i;
	bool short_write;
	int err;

	for (i = 0; i < ap->num_folios; i++)
		folio_wait_writeback(ap->folios[i]);

	fuse_write_args_fill(ia, ff, pos, count);
	ia->write.in.flags = fuse_write_flags(iocb);
	if (fm->fc->handle_killpriv_v2 && !capable(CAP_FSETID))
		ia->write.in.write_flags |= FUSE_WRITE_KILL_SUIDGID;

	err = fuse_simple_request(fm, &ap->args);
	if (!err && ia->write.out.size > count)
		err = -EIO;

	short_write = ia->write.out.size < count;
	offset = ap->descs[0].offset;
	count = ia->write.out.size;
	for (i = 0; i < ap->num_folios; i++) {
		struct folio *folio = ap->folios[i];

		if (err) {
			folio_clear_uptodate(folio);
		} else {
			if (count >= folio_size(folio) - offset) {
				count -= folio_size(folio) - offset;
			} else {
				if (short_write)
					folio_clear_uptodate(folio);
				count = 0;
			}
			offset = 0;
		}
		if (ia->write.folio_locked && (i == ap->num_folios - 1))
			folio_unlock(folio);
		folio_put(folio);
	}

	return err;
}

static ssize_t fuse_fill_write_pages(struct fuse_io_args *ia,
				     struct address_space *mapping,
				     struct iov_iter *ii, loff_t pos,
				     unsigned int max_folios)
{
	struct fuse_args_pages *ap = &ia->ap;
	struct fuse_conn *fc = get_fuse_conn(mapping->host);
	size_t count = 0;
	unsigned int num;
	int err = 0;

	num = min(iov_iter_count(ii), fc->max_write);

	ap->args.in_pages = true;

	while (num && ap->num_folios < max_folios) {
		size_t tmp;
		struct folio *folio;
		pgoff_t index = pos >> PAGE_SHIFT;
		unsigned int bytes;
		unsigned int folio_offset;

again:
		folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN,
					    mapping_gfp_mask(mapping));
		if (IS_ERR(folio)) {
			err = PTR_ERR(folio);
			break;
		}

		if (mapping_writably_mapped(mapping))
			flush_dcache_folio(folio);

		folio_offset = offset_in_folio(folio, pos);
		bytes = min(folio_size(folio) - folio_offset, num);

		tmp = copy_folio_from_iter_atomic(folio, folio_offset, bytes, ii);
		flush_dcache_folio(folio);

		if (!tmp) {
			folio_unlock(folio);
			folio_put(folio);

			/*
			 * Ensure forward progress by faulting in
			 * while not holding the folio lock:
			 */
			if (fault_in_iov_iter_readable(ii, bytes)) {
				err = -EFAULT;
				break;
			}

			goto again;
		}

		ap->folios[ap->num_folios] = folio;
		ap->descs[ap->num_folios].offset = folio_offset;
		ap->descs[ap->num_folios].length = tmp;
		ap->num_folios++;

		count += tmp;
		pos += tmp;
		num -= tmp;

		/* If we copied full folio, mark it uptodate */
		if (tmp == folio_size(folio))
			folio_mark_uptodate(folio);

		if (folio_test_uptodate(folio)) {
			folio_unlock(folio);
		} else {
			ia->write.folio_locked = true;
			break;
		}
		if (!fc->big_writes)
			break;
		if (folio_offset + tmp != folio_size(folio))
			break;
	}

	return count > 0 ? count : err;
}

static inline unsigned int fuse_wr_pages(loff_t pos, size_t len,
					 unsigned int max_pages)
{
	unsigned int pages = ((pos + len - 1) >> PAGE_SHIFT) -
			     (pos >> PAGE_SHIFT) + 1;

	return min(pages, max_pages);
}
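
/*
 * Worked example for fuse_wr_pages() (illustrative, assuming
 * PAGE_SIZE == 4096): pos = 4000, len = 300 spans two pages even
 * though len < PAGE_SIZE:
 *
 *	((4000 + 300 - 1) >> 12) - (4000 >> 12) + 1 == 1 - 0 + 1 == 2
 *
 * and the result is then clamped to max_pages.
 */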

static ssize_t fuse_perform_write(struct kiocb *iocb, struct iov_iter *ii)
{
	struct address_space *mapping = iocb->ki_filp->f_mapping;
	struct inode *inode = mapping->host;
	struct fuse_conn *fc = get_fuse_conn(inode);
	struct fuse_inode *fi = get_fuse_inode(inode);
	loff_t pos = iocb->ki_pos;
	int err = 0;
	ssize_t res = 0;

	if (inode->i_size < pos + iov_iter_count(ii))
		set_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);

	do {
		ssize_t count;
		struct fuse_io_args ia = {};
		struct fuse_args_pages *ap = &ia.ap;
		unsigned int nr_pages = fuse_wr_pages(pos, iov_iter_count(ii),
						      fc->max_pages);

		ap->folios = fuse_folios_alloc(nr_pages, GFP_KERNEL, &ap->descs);
		if (!ap->folios) {
			err = -ENOMEM;
			break;
		}

		count = fuse_fill_write_pages(&ia, mapping, ii, pos, nr_pages);
		if (count <= 0) {
			err = count;
		} else {
			err = fuse_send_write_pages(&ia, iocb, inode,
						    pos, count);
			if (!err) {
				size_t num_written = ia.write.out.size;

				res += num_written;
				pos += num_written;

				/* break out of the loop on short write */
				if (num_written != count)
					err = -EIO;
			}
		}
		kfree(ap->folios);
	} while (!err && iov_iter_count(ii));

	fuse_write_update_attr(inode, pos, res);
	clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);

	if (!res)
		return err;
	iocb->ki_pos += res;
	return res;
}

static bool fuse_io_past_eof(struct kiocb *iocb, struct iov_iter *iter)
{
	struct inode *inode = file_inode(iocb->ki_filp);

	return iocb->ki_pos + iov_iter_count(iter) > i_size_read(inode);
}

/*
 * @return true if an exclusive lock for direct IO writes is needed
 */
static bool fuse_dio_wr_exclusive_lock(struct kiocb *iocb, struct iov_iter *from)
{
	struct file *file = iocb->ki_filp;
	struct fuse_file *ff = file->private_data;
	struct inode *inode = file_inode(iocb->ki_filp);
	struct fuse_inode *fi = get_fuse_inode(inode);

	/* Server side has to advise that it supports parallel dio writes. */
	if (!(ff->open_flags & FOPEN_PARALLEL_DIRECT_WRITES))
		return true;

	/*
	 * Append will need to know the eventual EOF - always needs an
	 * exclusive lock.
	 */
	if (iocb->ki_flags & IOCB_APPEND)
		return true;

	/* shared locks are not allowed with parallel page cache IO */
	if (test_bit(FUSE_I_CACHE_IO_MODE, &fi->state))
		return true;

	/* Parallel dio beyond EOF is not supported, at least for now. */
	if (fuse_io_past_eof(iocb, from))
		return true;

	return false;
}

static void fuse_dio_lock(struct kiocb *iocb, struct iov_iter *from,
			  bool *exclusive)
{
	struct inode *inode = file_inode(iocb->ki_filp);
	struct fuse_inode *fi = get_fuse_inode(inode);

	*exclusive = fuse_dio_wr_exclusive_lock(iocb, from);
	if (*exclusive) {
		inode_lock(inode);
	} else {
		inode_lock_shared(inode);
		/*
		 * New parallel dio is allowed only if the inode is not in
		 * caching mode; starting it also denies new opens in caching
		 * mode.  This check should be performed only after taking the
		 * shared inode lock.  The previous past-eof check was done
		 * without the inode lock and might have raced, so check it
		 * again.
		 */
		if (fuse_io_past_eof(iocb, from) ||
		    fuse_inode_uncached_io_start(fi, NULL) != 0) {
			inode_unlock_shared(inode);
			inode_lock(inode);
			*exclusive = true;
		}
	}
}

static void fuse_dio_unlock(struct kiocb *iocb, bool exclusive)
{
	struct inode *inode = file_inode(iocb->ki_filp);
	struct fuse_inode *fi = get_fuse_inode(inode);

	if (exclusive) {
		inode_unlock(inode);
	} else {
		/* Allow opens in caching mode after last parallel dio end */
		fuse_inode_uncached_io_end(fi);
		inode_unlock_shared(inode);
	}
}
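
/*
 * Summary of the direct IO write locking decisions above (illustrative):
 *
 *	FOPEN_PARALLEL_DIRECT_WRITES not set	-> exclusive
 *	IOCB_APPEND				-> exclusive
 *	inode in caching mode			-> exclusive
 *	write extends beyond EOF		-> exclusive
 *	otherwise				-> shared, with the EOF and
 *						   caching-mode checks redone
 *						   under the shared lock
 */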

static const struct iomap_write_ops fuse_iomap_write_ops = {
	.read_folio_range = fuse_iomap_read_folio_range,
};

static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
	struct file *file = iocb->ki_filp;
	struct mnt_idmap *idmap = file_mnt_idmap(file);
	struct address_space *mapping = file->f_mapping;
	ssize_t written = 0;
	struct inode *inode = mapping->host;
	ssize_t err, count;
	struct fuse_conn *fc = get_fuse_conn(inode);
	bool writeback = false;

	if (fc->writeback_cache) {
		/* Update size (EOF optimization) and mode (SUID clearing) */
		err = fuse_update_attributes(mapping->host, file,
					     STATX_SIZE | STATX_MODE);
		if (err)
			return err;

		if (!fc->handle_killpriv_v2 ||
		    !setattr_should_drop_suidgid(idmap, file_inode(file)))
			writeback = true;
	}

	inode_lock(inode);

	err = count = generic_write_checks(iocb, from);
	if (err <= 0)
		goto out;

	task_io_account_write(count);

	err = kiocb_modified(iocb);
	if (err)
		goto out;

	if (iocb->ki_flags & IOCB_DIRECT) {
		written = generic_file_direct_write(iocb, from);
		if (written < 0 || !iov_iter_count(from))
			goto out;
		written = direct_write_fallback(iocb, from, written,
						fuse_perform_write(iocb, from));
	} else if (writeback) {
		/*
		 * Use iomap so that we can do granular uptodate reads
		 * and granular dirty tracking for large folios.
		 */
		written = iomap_file_buffered_write(iocb, from,
						    &fuse_iomap_ops,
						    &fuse_iomap_write_ops,
						    file);
	} else {
		written = fuse_perform_write(iocb, from);
	}
out:
	inode_unlock(inode);
	if (written > 0)
		written = generic_write_sync(iocb, written);

	return written ? written : err;
}

static inline unsigned long fuse_get_user_addr(const struct iov_iter *ii)
{
	return (unsigned long)iter_iov(ii)->iov_base + ii->iov_offset;
}

static inline size_t fuse_get_frag_size(const struct iov_iter *ii,
					size_t max_size)
{
	return min(iov_iter_single_seg_count(ii), max_size);
}

static int fuse_get_user_pages(struct fuse_args_pages *ap, struct iov_iter *ii,
			       size_t *nbytesp, int write,
			       unsigned int max_pages,
			       bool use_pages_for_kvec_io)
{
	bool flush_or_invalidate = false;
	unsigned int nr_pages = 0;
	size_t nbytes = 0;  /* # bytes already packed in req */
	ssize_t ret = 0;

	/*
	 * Special case for kernel I/O: can copy directly into the buffer.
	 * However, if the implementation of fuse_conn requires pages instead
	 * of a pointer (e.g., virtio-fs), use iov_iter_extract_pages()
	 * instead.
	 */
	if (iov_iter_is_kvec(ii)) {
		void *user_addr = (void *)fuse_get_user_addr(ii);

		if (!use_pages_for_kvec_io) {
			size_t frag_size = fuse_get_frag_size(ii, *nbytesp);

			if (write)
				ap->args.in_args[1].value = user_addr;
			else
				ap->args.out_args[0].value = user_addr;

			iov_iter_advance(ii, frag_size);
			*nbytesp = frag_size;
			return 0;
		}

		if (is_vmalloc_addr(user_addr)) {
			ap->args.vmap_base = user_addr;
			flush_or_invalidate = true;
		}
	}

	/*
	 * Until there is support for iov_iter_extract_folios(), we have to
	 * manually extract pages using iov_iter_extract_pages() and then
	 * copy that to a folios array.
	 */
	struct page **pages = kzalloc(max_pages * sizeof(struct page *),
				      GFP_KERNEL);
	if (!pages) {
		ret = -ENOMEM;
		goto out;
	}

	while (nbytes < *nbytesp && nr_pages < max_pages) {
		unsigned nfolios, i;
		size_t start;

		ret = iov_iter_extract_pages(ii, &pages,
					     *nbytesp - nbytes,
					     max_pages - nr_pages,
					     0, &start);
		if (ret < 0)
			break;

		nbytes += ret;

		nfolios = DIV_ROUND_UP(ret + start, PAGE_SIZE);

		for (i = 0; i < nfolios; i++) {
			struct folio *folio = page_folio(pages[i]);
			unsigned int offset = start +
				(folio_page_idx(folio, pages[i]) << PAGE_SHIFT);
			unsigned int len = umin(ret, PAGE_SIZE - start);

			ap->descs[ap->num_folios].offset = offset;
			ap->descs[ap->num_folios].length = len;
			ap->folios[ap->num_folios] = folio;
			start = 0;
			ret -= len;
			ap->num_folios++;
		}

		nr_pages += nfolios;
	}
	kfree(pages);

	if (write && flush_or_invalidate)
		flush_kernel_vmap_range(ap->args.vmap_base, nbytes);

	ap->args.invalidate_vmap = !write && flush_or_invalidate;
	ap->args.is_pinned = iov_iter_extract_will_pin(ii);
	ap->args.user_pages = true;
	if (write)
		ap->args.in_pages = true;
	else
		ap->args.out_pages = true;

out:
	*nbytesp = nbytes;

	return ret < 0 ? ret : 0;
}
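
/*
 * Worked example for the extraction loop above (illustrative, assuming
 * PAGE_SIZE == 4096 and single-page folios): if
 * iov_iter_extract_pages() returns ret = 10000 bytes with start = 512
 * inside the first page, then nfolios = DIV_ROUND_UP(10512, 4096) = 3
 * and the resulting descriptors are:
 *
 *	{ .offset = 512, .length = 3584 }
 *	{ .offset =   0, .length = 4096 }
 *	{ .offset =   0, .length = 2320 }
 *
 * which together cover exactly the 10000 extracted bytes.
 */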

ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter,
		       loff_t *ppos, int flags)
{
	int write = flags & FUSE_DIO_WRITE;
	int cuse = flags & FUSE_DIO_CUSE;
	struct file *file = io->iocb->ki_filp;
	struct address_space *mapping = file->f_mapping;
	struct inode *inode = mapping->host;
	struct fuse_file *ff = file->private_data;
	struct fuse_conn *fc = ff->fm->fc;
	size_t nmax = write ? fc->max_write : fc->max_read;
	loff_t pos = *ppos;
	size_t count = iov_iter_count(iter);
	pgoff_t idx_from = pos >> PAGE_SHIFT;
	pgoff_t idx_to = (pos + count - 1) >> PAGE_SHIFT;
	ssize_t res = 0;
	int err = 0;
	struct fuse_io_args *ia;
	unsigned int max_pages;
	bool fopen_direct_io = ff->open_flags & FOPEN_DIRECT_IO;

	max_pages = iov_iter_npages(iter, fc->max_pages);
	ia = fuse_io_alloc(io, max_pages);
	if (!ia)
		return -ENOMEM;

	if (fopen_direct_io) {
		res = filemap_write_and_wait_range(mapping, pos, pos + count - 1);
		if (res) {
			fuse_io_free(ia);
			return res;
		}
	}
	if (!cuse && filemap_range_has_writeback(mapping, pos, (pos + count - 1))) {
		if (!write)
			inode_lock(inode);
		fuse_sync_writes(inode);
		if (!write)
			inode_unlock(inode);
	}

	if (fopen_direct_io && write) {
		res = invalidate_inode_pages2_range(mapping, idx_from, idx_to);
		if (res) {
			fuse_io_free(ia);
			return res;
		}
	}

	io->should_dirty = !write && user_backed_iter(iter);
	while (count) {
		ssize_t nres;
		fl_owner_t owner = current->files;
		size_t nbytes = min(count, nmax);

		err = fuse_get_user_pages(&ia->ap, iter, &nbytes, write,
					  max_pages, fc->use_pages_for_kvec_io);
		if (err && !nbytes)
			break;

		if (write) {
			if (!capable(CAP_FSETID))
				ia->write.in.write_flags |= FUSE_WRITE_KILL_SUIDGID;

			nres = fuse_send_write(ia, pos, nbytes, owner);
		} else {
			nres = fuse_send_read(ia, pos, nbytes, owner);
		}

		if (!io->async || nres < 0) {
			fuse_release_user_pages(&ia->ap, nres, io->should_dirty);
			fuse_io_free(ia);
		}
		ia = NULL;
		if (nres < 0) {
			iov_iter_revert(iter, nbytes);
			err = nres;
			break;
		}
		WARN_ON(nres > nbytes);

		count -= nres;
		res += nres;
		pos += nres;
		if (nres != nbytes) {
			iov_iter_revert(iter, nbytes - nres);
			break;
		}
		if (count) {
			max_pages = iov_iter_npages(iter, fc->max_pages);
			ia = fuse_io_alloc(io, max_pages);
			if (!ia)
				break;
		}
	}
	if (ia)
		fuse_io_free(ia);
	if (res > 0)
		*ppos = pos;

	if (res > 0 && write && fopen_direct_io) {
		/*
		 * As in generic_file_direct_write(), invalidate after the
		 * write, to invalidate read-ahead cache that may have competed
		 * with the write.
		 */
		invalidate_inode_pages2_range(mapping, idx_from, idx_to);
	}

	return res > 0 ? res : err;
}
EXPORT_SYMBOL_GPL(fuse_direct_io);

static ssize_t __fuse_direct_read(struct fuse_io_priv *io,
				  struct iov_iter *iter,
				  loff_t *ppos)
{
	ssize_t res;
	struct inode *inode = file_inode(io->iocb->ki_filp);

	res = fuse_direct_io(io, iter, ppos, 0);

	fuse_invalidate_atime(inode);

	return res;
}

static ssize_t fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter);

static ssize_t fuse_direct_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
	ssize_t res;

	if (!is_sync_kiocb(iocb)) {
		res = fuse_direct_IO(iocb, to);
	} else {
		struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb);

		res = __fuse_direct_read(&io, to, &iocb->ki_pos);
	}

	return res;
}

static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
	struct inode *inode = file_inode(iocb->ki_filp);
	ssize_t res;
	bool exclusive;

	fuse_dio_lock(iocb, from, &exclusive);
	res = generic_write_checks(iocb, from);
	if (res > 0) {
		task_io_account_write(res);
		if (!is_sync_kiocb(iocb)) {
			res = fuse_direct_IO(iocb, from);
		} else {
			struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb);

			res = fuse_direct_io(&io, from, &iocb->ki_pos,
					     FUSE_DIO_WRITE);
			fuse_write_update_attr(inode, iocb->ki_pos, res);
		}
	}
	fuse_dio_unlock(iocb, exclusive);

	return res;
}

static ssize_t fuse_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
	struct file *file = iocb->ki_filp;
	struct fuse_file *ff = file->private_data;
	struct inode *inode = file_inode(file);

	if (fuse_is_bad(inode))
		return -EIO;

	if (FUSE_IS_DAX(inode))
		return fuse_dax_read_iter(iocb, to);

	/* FOPEN_DIRECT_IO overrides FOPEN_PASSTHROUGH */
	if (ff->open_flags & FOPEN_DIRECT_IO)
		return fuse_direct_read_iter(iocb, to);
	else if (fuse_file_passthrough(ff))
		return fuse_passthrough_read_iter(iocb, to);
	else
		return fuse_cache_read_iter(iocb, to);
}

static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
	struct file *file = iocb->ki_filp;
	struct fuse_file *ff = file->private_data;
	struct inode *inode = file_inode(file);

	if (fuse_is_bad(inode))
		return -EIO;

	if (FUSE_IS_DAX(inode))
		return fuse_dax_write_iter(iocb, from);

	/* FOPEN_DIRECT_IO overrides FOPEN_PASSTHROUGH */
	if (ff->open_flags & FOPEN_DIRECT_IO)
		return fuse_direct_write_iter(iocb, from);
	else if (fuse_file_passthrough(ff))
		return fuse_passthrough_write_iter(iocb, from);
	else
		return fuse_cache_write_iter(iocb, from);
}

static ssize_t fuse_splice_read(struct file *in, loff_t *ppos,
				struct pipe_inode_info *pipe, size_t len,
				unsigned int flags)
{
	struct fuse_file *ff = in->private_data;

	/* FOPEN_DIRECT_IO overrides FOPEN_PASSTHROUGH */
	if (fuse_file_passthrough(ff) && !(ff->open_flags & FOPEN_DIRECT_IO))
		return fuse_passthrough_splice_read(in, ppos, pipe, len, flags);
	else
		return filemap_splice_read(in, ppos, pipe, len, flags);
}

static ssize_t fuse_splice_write(struct pipe_inode_info *pipe, struct file *out,
				 loff_t *ppos, size_t len, unsigned int flags)
{
	struct fuse_file *ff = out->private_data;

	/* FOPEN_DIRECT_IO overrides FOPEN_PASSTHROUGH */
	if (fuse_file_passthrough(ff) && !(ff->open_flags & FOPEN_DIRECT_IO))
		return fuse_passthrough_splice_write(pipe, out, ppos, len, flags);
	else
		return iter_file_splice_write(pipe, out, ppos, len, flags);
}

static void fuse_writepage_free(struct fuse_writepage_args *wpa)
{
	struct fuse_args_pages *ap = &wpa->ia.ap;

	if (wpa->bucket)
		fuse_sync_bucket_dec(wpa->bucket);

	fuse_file_put(wpa->ia.ff, false);

	kfree(ap->folios);
	kfree(wpa);
}

static void fuse_writepage_finish(struct fuse_writepage_args *wpa)
{
	struct fuse_args_pages *ap = &wpa->ia.ap;
	struct inode *inode = wpa->inode;
	struct fuse_inode *fi = get_fuse_inode(inode);
	int i;

	for (i = 0; i < ap->num_folios; i++)
		/*
		 * Benchmarks showed that ending writeback within the
		 * scope of the fi->lock alleviates xarray lock
		 * contention and noticeably improves performance.
		 */
		iomap_finish_folio_write(inode, ap->folios[i],
					 ap->descs[i].length);

	wake_up(&fi->page_waitq);
}

/* Called under fi->lock, may release and reacquire it */
static void fuse_send_writepage(struct fuse_mount *fm,
				struct fuse_writepage_args *wpa, loff_t size)
__releases(fi->lock)
__acquires(fi->lock)
{
	struct fuse_inode *fi = get_fuse_inode(wpa->inode);
	struct fuse_args_pages *ap = &wpa->ia.ap;
	struct fuse_write_in *inarg = &wpa->ia.write.in;
	struct fuse_args *args = &ap->args;
	__u64 data_size = 0;
	int err, i;

	for (i = 0; i < ap->num_folios; i++)
		data_size += ap->descs[i].length;

	fi->writectr++;
	if (inarg->offset + data_size <= size) {
		inarg->size = data_size;
	} else if (inarg->offset < size) {
		inarg->size = size - inarg->offset;
	} else {
		/* Got truncated off completely */
		goto out_free;
	}

	args->in_args[1].size = inarg->size;
	args->force = true;
	args->nocreds = true;

	err = fuse_simple_background(fm, args, GFP_ATOMIC);
	if (err == -ENOMEM) {
		spin_unlock(&fi->lock);
		err = fuse_simple_background(fm, args, GFP_NOFS | __GFP_NOFAIL);
		spin_lock(&fi->lock);
	}

	/* Fails on broken connection only */
	if (unlikely(err))
		goto out_free;

	return;

out_free:
	fi->writectr--;
	fuse_writepage_finish(wpa);
	spin_unlock(&fi->lock);
	fuse_writepage_free(wpa);
	spin_lock(&fi->lock);
}
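
/*
 * Worked example for the cropping above (illustrative): if a queued
 * write of data_size = 8192 at inarg->offset = 4096 races with a
 * truncate to size = 6000, then offset + data_size > size but
 * offset < size, so only inarg->size = 6000 - 4096 = 1904 bytes are
 * sent.  A write that lies entirely beyond the new size is dropped
 * via out_free.
 */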

/*
 * If fi->writectr is positive (no truncate or fsync going on) send
 * all queued writepage requests.
 *
 * Called with fi->lock
 */
void fuse_flush_writepages(struct inode *inode)
__releases(fi->lock)
__acquires(fi->lock)
{
	struct fuse_mount *fm = get_fuse_mount(inode);
	struct fuse_inode *fi = get_fuse_inode(inode);
	loff_t crop = i_size_read(inode);
	struct fuse_writepage_args *wpa;

	while (fi->writectr >= 0 && !list_empty(&fi->queued_writes)) {
		wpa = list_entry(fi->queued_writes.next,
				 struct fuse_writepage_args, queue_entry);
		list_del_init(&wpa->queue_entry);
		fuse_send_writepage(fm, wpa, crop);
	}
}

static void fuse_writepage_end(struct fuse_mount *fm, struct fuse_args *args,
			       int error)
{
	struct fuse_writepage_args *wpa =
		container_of(args, typeof(*wpa), ia.ap.args);
	struct inode *inode = wpa->inode;
	struct fuse_inode *fi = get_fuse_inode(inode);
	struct fuse_conn *fc = get_fuse_conn(inode);

	mapping_set_error(inode->i_mapping, error);
	/*
	 * A writeback finished and this might have updated mtime/ctime on
	 * server making local mtime/ctime stale. Hence invalidate attrs.
	 * Do this only if writeback_cache is not enabled. If writeback_cache
	 * is enabled, we trust local ctime/mtime.
	 */
	if (!fc->writeback_cache)
		fuse_invalidate_attr_mask(inode, FUSE_STATX_MODIFY);
	spin_lock(&fi->lock);
	fi->writectr--;
	fuse_writepage_finish(wpa);
	spin_unlock(&fi->lock);
	fuse_writepage_free(wpa);
}

static struct fuse_file *__fuse_write_file_get(struct fuse_inode *fi)
{
	struct fuse_file *ff;

	spin_lock(&fi->lock);
	ff = list_first_entry_or_null(&fi->write_files, struct fuse_file,
				      write_entry);
	if (ff)
		fuse_file_get(ff);
	spin_unlock(&fi->lock);

	return ff;
}

static struct fuse_file *fuse_write_file_get(struct fuse_inode *fi)
{
	struct fuse_file *ff = __fuse_write_file_get(fi);

	WARN_ON(!ff);
	return ff;
}

int fuse_write_inode(struct inode *inode, struct writeback_control *wbc)
{
	struct fuse_inode *fi = get_fuse_inode(inode);
	struct fuse_file *ff;
	int err;

	ff = __fuse_write_file_get(fi);
	err = fuse_flush_times(inode, ff);
	if (ff)
		fuse_file_put(ff, false);

	return err;
}

static struct fuse_writepage_args *fuse_writepage_args_alloc(void)
{
	struct fuse_writepage_args *wpa;
	struct fuse_args_pages *ap;

	wpa = kzalloc_obj(*wpa, GFP_NOFS);
	if (wpa) {
		ap = &wpa->ia.ap;
		ap->num_folios = 0;
		ap->folios = fuse_folios_alloc(1, GFP_NOFS, &ap->descs);
		if (!ap->folios) {
			kfree(wpa);
			wpa = NULL;
		}
	}
	return wpa;
}
} 2090 2091 static struct fuse_writepage_args *fuse_writepage_args_setup(struct folio *folio, 2092 size_t offset, 2093 struct fuse_file *ff) 2094 { 2095 struct inode *inode = folio->mapping->host; 2096 struct fuse_conn *fc = get_fuse_conn(inode); 2097 struct fuse_writepage_args *wpa; 2098 struct fuse_args_pages *ap; 2099 2100 wpa = fuse_writepage_args_alloc(); 2101 if (!wpa) 2102 return NULL; 2103 2104 fuse_writepage_add_to_bucket(fc, wpa); 2105 fuse_write_args_fill(&wpa->ia, ff, folio_pos(folio) + offset, 0); 2106 wpa->ia.write.in.write_flags |= FUSE_WRITE_CACHE; 2107 wpa->inode = inode; 2108 wpa->ia.ff = ff; 2109 2110 ap = &wpa->ia.ap; 2111 ap->args.in_pages = true; 2112 ap->args.end = fuse_writepage_end; 2113 2114 return wpa; 2115 } 2116 2117 struct fuse_fill_wb_data { 2118 struct fuse_writepage_args *wpa; 2119 struct fuse_file *ff; 2120 unsigned int max_folios; 2121 /* 2122 * nr_bytes won't overflow since fuse_folios_need_send() caps 2123 * wb requests to never exceed fc->max_pages (which has an upper bound 2124 * of U16_MAX). 2125 */ 2126 unsigned int nr_bytes; 2127 }; 2128 2129 static bool fuse_pages_realloc(struct fuse_fill_wb_data *data, 2130 unsigned int max_pages) 2131 { 2132 struct fuse_args_pages *ap = &data->wpa->ia.ap; 2133 struct folio **folios; 2134 struct fuse_folio_desc *descs; 2135 unsigned int nfolios = min_t(unsigned int, 2136 max_t(unsigned int, data->max_folios * 2, 2137 FUSE_DEFAULT_MAX_PAGES_PER_REQ), 2138 max_pages); 2139 WARN_ON(nfolios <= data->max_folios); 2140 2141 folios = fuse_folios_alloc(nfolios, GFP_NOFS, &descs); 2142 if (!folios) 2143 return false; 2144 2145 memcpy(folios, ap->folios, sizeof(struct folio *) * ap->num_folios); 2146 memcpy(descs, ap->descs, sizeof(struct fuse_folio_desc) * ap->num_folios); 2147 kfree(ap->folios); 2148 ap->folios = folios; 2149 ap->descs = descs; 2150 data->max_folios = nfolios; 2151 2152 return true; 2153 } 2154 2155 static void fuse_writepages_send(struct inode *inode, 2156 struct fuse_fill_wb_data *data) 2157 { 2158 struct fuse_writepage_args *wpa = data->wpa; 2159 struct fuse_inode *fi = get_fuse_inode(inode); 2160 2161 spin_lock(&fi->lock); 2162 list_add_tail(&wpa->queue_entry, &fi->queued_writes); 2163 fuse_flush_writepages(inode); 2164 spin_unlock(&fi->lock); 2165 } 2166 2167 static bool fuse_folios_need_send(struct fuse_conn *fc, loff_t pos, 2168 unsigned len, struct fuse_args_pages *ap, 2169 unsigned cur_bytes, bool write) 2170 { 2171 struct folio *prev_folio; 2172 struct fuse_folio_desc prev_desc; 2173 unsigned bytes = cur_bytes + len; 2174 loff_t prev_pos; 2175 size_t max_bytes = write ? 
fc->max_write : fc->max_read; 2176 2177 WARN_ON(!ap->num_folios); 2178 2179 /* Reached max pages */ 2180 if (DIV_ROUND_UP(bytes, PAGE_SIZE) > fc->max_pages) 2181 return true; 2182 2183 if (bytes > max_bytes) 2184 return true; 2185 2186 /* Discontinuity */ 2187 prev_folio = ap->folios[ap->num_folios - 1]; 2188 prev_desc = ap->descs[ap->num_folios - 1]; 2189 prev_pos = folio_pos(prev_folio) + prev_desc.offset + prev_desc.length; 2190 if (prev_pos != pos) 2191 return true; 2192 2193 return false; 2194 } 2195 2196 static ssize_t fuse_iomap_writeback_range(struct iomap_writepage_ctx *wpc, 2197 struct folio *folio, u64 pos, 2198 unsigned len, u64 end_pos) 2199 { 2200 struct fuse_fill_wb_data *data = wpc->wb_ctx; 2201 struct fuse_writepage_args *wpa = data->wpa; 2202 struct fuse_args_pages *ap = &wpa->ia.ap; 2203 struct inode *inode = wpc->inode; 2204 struct fuse_inode *fi = get_fuse_inode(inode); 2205 struct fuse_conn *fc = get_fuse_conn(inode); 2206 loff_t offset = offset_in_folio(folio, pos); 2207 2208 WARN_ON_ONCE(!data); 2209 2210 if (!data->ff) { 2211 data->ff = fuse_write_file_get(fi); 2212 if (!data->ff) 2213 return -EIO; 2214 } 2215 2216 if (wpa) { 2217 bool send = fuse_folios_need_send(fc, pos, len, ap, 2218 data->nr_bytes, true); 2219 2220 if (!send) { 2221 /* 2222 * Need to grow the pages array? If so, did the 2223 * expansion fail? 2224 */ 2225 send = (ap->num_folios == data->max_folios) && 2226 !fuse_pages_realloc(data, fc->max_pages); 2227 } 2228 2229 if (send) { 2230 fuse_writepages_send(inode, data); 2231 data->wpa = NULL; 2232 data->nr_bytes = 0; 2233 } 2234 } 2235 2236 if (data->wpa == NULL) { 2237 wpa = fuse_writepage_args_setup(folio, offset, data->ff); 2238 if (!wpa) 2239 return -ENOMEM; 2240 fuse_file_get(wpa->ia.ff); 2241 data->max_folios = 1; 2242 ap = &wpa->ia.ap; 2243 } 2244 2245 fuse_writepage_args_page_fill(wpa, folio, ap->num_folios, 2246 offset, len); 2247 data->nr_bytes += len; 2248 2249 ap->num_folios++; 2250 if (!data->wpa) 2251 data->wpa = wpa; 2252 2253 return len; 2254 } 2255 2256 static int fuse_iomap_writeback_submit(struct iomap_writepage_ctx *wpc, 2257 int error) 2258 { 2259 struct fuse_fill_wb_data *data = wpc->wb_ctx; 2260 2261 WARN_ON_ONCE(!data); 2262 2263 if (data->wpa) { 2264 WARN_ON(!data->wpa->ia.ap.num_folios); 2265 fuse_writepages_send(wpc->inode, data); 2266 } 2267 2268 if (data->ff) 2269 fuse_file_put(data->ff, false); 2270 2271 return error; 2272 } 2273 2274 static const struct iomap_writeback_ops fuse_writeback_ops = { 2275 .writeback_range = fuse_iomap_writeback_range, 2276 .writeback_submit = fuse_iomap_writeback_submit, 2277 }; 2278 2279 static int fuse_writepages(struct address_space *mapping, 2280 struct writeback_control *wbc) 2281 { 2282 struct inode *inode = mapping->host; 2283 struct fuse_conn *fc = get_fuse_conn(inode); 2284 struct fuse_fill_wb_data data = {}; 2285 struct iomap_writepage_ctx wpc = { 2286 .inode = inode, 2287 .iomap.type = IOMAP_MAPPED, 2288 .wbc = wbc, 2289 .ops = &fuse_writeback_ops, 2290 .wb_ctx = &data, 2291 }; 2292 2293 if (fuse_is_bad(inode)) 2294 return -EIO; 2295 2296 if (wbc->sync_mode == WB_SYNC_NONE && 2297 fc->num_background >= fc->congestion_threshold) 2298 return 0; 2299 2300 return iomap_writepages(&wpc); 2301 } 2302 2303 static int fuse_launder_folio(struct folio *folio) 2304 { 2305 int err = 0; 2306 struct fuse_fill_wb_data data = {}; 2307 struct iomap_writepage_ctx wpc = { 2308 .inode = folio->mapping->host, 2309 .iomap.type = IOMAP_MAPPED, 2310 .ops = &fuse_writeback_ops, 2311 .wb_ctx = &data, 2312 
}; 2313 2314 if (folio_clear_dirty_for_io(folio)) { 2315 err = iomap_writeback_folio(&wpc, folio); 2316 err = fuse_iomap_writeback_submit(&wpc, err); 2317 if (!err) 2318 folio_wait_writeback(folio); 2319 } 2320 return err; 2321 } 2322 2323 /* 2324 * Write back dirty data/metadata now (there may not be any suitable 2325 * open files later for data) 2326 */ 2327 static void fuse_vma_close(struct vm_area_struct *vma) 2328 { 2329 int err; 2330 2331 err = write_inode_now(vma->vm_file->f_mapping->host, 1); 2332 mapping_set_error(vma->vm_file->f_mapping, err); 2333 } 2334 2335 /* 2336 * Wait for writeback against this page to complete before allowing it 2337 * to be marked dirty again, and hence written back again, possibly 2338 * before the previous writepage completed. 2339 * 2340 * Block here, instead of in the writeback path, so that the userspace fs 2341 * can only block processes actually operating on the filesystem. 2342 * 2343 * Otherwise an unprivileged userspace fs would be able to block 2344 * unrelated operations: 2345 * 2346 * - page migration 2347 * - sync(2) 2348 * - try_to_free_pages() with order > PAGE_ALLOC_COSTLY_ORDER 2349 */ 2350 static vm_fault_t fuse_page_mkwrite(struct vm_fault *vmf) 2351 { 2352 struct folio *folio = page_folio(vmf->page); 2353 struct inode *inode = file_inode(vmf->vma->vm_file); 2354 2355 file_update_time(vmf->vma->vm_file); 2356 folio_lock(folio); 2357 if (folio->mapping != inode->i_mapping) { 2358 folio_unlock(folio); 2359 return VM_FAULT_NOPAGE; 2360 } 2361 2362 folio_wait_writeback(folio); 2363 return VM_FAULT_LOCKED; 2364 } 2365 2366 static const struct vm_operations_struct fuse_file_vm_ops = { 2367 .close = fuse_vma_close, 2368 .fault = filemap_fault, 2369 .map_pages = filemap_map_pages, 2370 .page_mkwrite = fuse_page_mkwrite, 2371 }; 2372 2373 static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma) 2374 { 2375 struct fuse_file *ff = file->private_data; 2376 struct fuse_conn *fc = ff->fm->fc; 2377 struct inode *inode = file_inode(file); 2378 int rc; 2379 2380 /* DAX mmap is superior to direct_io mmap */ 2381 if (FUSE_IS_DAX(inode)) 2382 return fuse_dax_mmap(file, vma); 2383 2384 /* 2385 * If the inode is in passthrough io mode because it has some file open 2386 * in passthrough mode, either mmap to the backing file or fail the 2387 * mmap: mixing cached mmap and passthrough io mode is not allowed. 2388 */ 2389 if (fuse_file_passthrough(ff)) 2390 return fuse_passthrough_mmap(file, vma); 2391 else if (fuse_inode_backing(get_fuse_inode(inode))) 2392 return -ENODEV; 2393 2394 /* 2395 * FOPEN_DIRECT_IO handling is special compared to O_DIRECT, 2396 * as it does not allow MAP_SHARED mmap without FUSE_DIRECT_IO_ALLOW_MMAP. 2397 */ 2398 if (ff->open_flags & FOPEN_DIRECT_IO) { 2399 /* 2400 * Can't provide the coherency needed for MAP_SHARED 2401 * if FUSE_DIRECT_IO_ALLOW_MMAP isn't set. 2402 */ 2403 if ((vma->vm_flags & VM_MAYSHARE) && !fc->direct_io_allow_mmap) 2404 return -ENODEV; 2405 2406 invalidate_inode_pages2(file->f_mapping); 2407 2408 if (!(vma->vm_flags & VM_MAYSHARE)) { 2409 /* MAP_PRIVATE */ 2410 return generic_file_mmap(file, vma); 2411 } 2412 2413 /* 2414 * First mmap of direct_io file enters caching inode io mode. 2415 * Also waits for parallel dio writers to go into serial mode 2416 * (exclusive instead of shared lock). 2417 * After first mmap, the inode stays in caching io mode until 2418 * the direct_io file release.
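*
* As an illustrative sketch (hypothetical userspace sequence, assuming
* the server opened the file with FOPEN_DIRECT_IO and the connection
* negotiated FUSE_DIRECT_IO_ALLOW_MMAP):
*
*   int fd = open("/mnt/fuse/f", O_RDWR);
*   void *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
*                  MAP_SHARED, fd, 0);   // inode enters caching io mode
*   ...                                  // parallel dio now serialized
*   munmap(p, 4096);
*   close(fd);                           // mode dropped at file release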
2419 */ 2420 rc = fuse_file_cached_io_open(inode, ff); 2421 if (rc) 2422 return rc; 2423 } 2424 2425 if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) 2426 fuse_link_write_file(file); 2427 2428 file_accessed(file); 2429 vma->vm_ops = &fuse_file_vm_ops; 2430 return 0; 2431 } 2432 2433 static int convert_fuse_file_lock(struct fuse_conn *fc, 2434 const struct fuse_file_lock *ffl, 2435 struct file_lock *fl) 2436 { 2437 switch (ffl->type) { 2438 case F_UNLCK: 2439 break; 2440 2441 case F_RDLCK: 2442 case F_WRLCK: 2443 if (ffl->start > OFFSET_MAX || ffl->end > OFFSET_MAX || 2444 ffl->end < ffl->start) 2445 return -EIO; 2446 2447 fl->fl_start = ffl->start; 2448 fl->fl_end = ffl->end; 2449 2450 /* 2451 * Convert pid into init's pid namespace. The locks API will 2452 * translate it into the caller's pid namespace. 2453 */ 2454 rcu_read_lock(); 2455 fl->c.flc_pid = pid_nr_ns(find_pid_ns(ffl->pid, fc->pid_ns), &init_pid_ns); 2456 rcu_read_unlock(); 2457 break; 2458 2459 default: 2460 return -EIO; 2461 } 2462 fl->c.flc_type = ffl->type; 2463 return 0; 2464 } 2465 2466 static void fuse_lk_fill(struct fuse_args *args, struct file *file, 2467 const struct file_lock *fl, int opcode, pid_t pid, 2468 int flock, struct fuse_lk_in *inarg) 2469 { 2470 struct inode *inode = file_inode(file); 2471 struct fuse_conn *fc = get_fuse_conn(inode); 2472 struct fuse_file *ff = file->private_data; 2473 2474 memset(inarg, 0, sizeof(*inarg)); 2475 inarg->fh = ff->fh; 2476 inarg->owner = fuse_lock_owner_id(fc, fl->c.flc_owner); 2477 inarg->lk.start = fl->fl_start; 2478 inarg->lk.end = fl->fl_end; 2479 inarg->lk.type = fl->c.flc_type; 2480 inarg->lk.pid = pid; 2481 if (flock) 2482 inarg->lk_flags |= FUSE_LK_FLOCK; 2483 args->opcode = opcode; 2484 args->nodeid = get_node_id(inode); 2485 args->in_numargs = 1; 2486 args->in_args[0].size = sizeof(*inarg); 2487 args->in_args[0].value = inarg; 2488 } 2489 2490 static int fuse_getlk(struct file *file, struct file_lock *fl) 2491 { 2492 struct inode *inode = file_inode(file); 2493 struct fuse_mount *fm = get_fuse_mount(inode); 2494 FUSE_ARGS(args); 2495 struct fuse_lk_in inarg; 2496 struct fuse_lk_out outarg; 2497 int err; 2498 2499 fuse_lk_fill(&args, file, fl, FUSE_GETLK, 0, 0, &inarg); 2500 args.out_numargs = 1; 2501 args.out_args[0].size = sizeof(outarg); 2502 args.out_args[0].value = &outarg; 2503 err = fuse_simple_request(fm, &args); 2504 if (!err) 2505 err = convert_fuse_file_lock(fm->fc, &outarg.lk, fl); 2506 2507 return err; 2508 } 2509 2510 static int fuse_setlk(struct file *file, struct file_lock *fl, int flock) 2511 { 2512 struct inode *inode = file_inode(file); 2513 struct fuse_mount *fm = get_fuse_mount(inode); 2514 FUSE_ARGS(args); 2515 struct fuse_lk_in inarg; 2516 int opcode = (fl->c.flc_flags & FL_SLEEP) ? FUSE_SETLKW : FUSE_SETLK; 2517 struct pid *pid = fl->c.flc_type != F_UNLCK ? 
task_tgid(current) : NULL; 2518 pid_t pid_nr = pid_nr_ns(pid, fm->fc->pid_ns); 2519 int err; 2520 2521 if (fl->fl_lmops && fl->fl_lmops->lm_grant) { 2522 /* NLM needs asynchronous locks, which we don't support yet */ 2523 return -ENOLCK; 2524 } 2525 2526 fuse_lk_fill(&args, file, fl, opcode, pid_nr, flock, &inarg); 2527 err = fuse_simple_request(fm, &args); 2528 2529 /* locking is restartable */ 2530 if (err == -EINTR) 2531 err = -ERESTARTSYS; 2532 2533 return err; 2534 } 2535 2536 static int fuse_file_lock(struct file *file, int cmd, struct file_lock *fl) 2537 { 2538 struct inode *inode = file_inode(file); 2539 struct fuse_conn *fc = get_fuse_conn(inode); 2540 int err; 2541 2542 if (cmd == F_CANCELLK) { 2543 err = 0; 2544 } else if (cmd == F_GETLK) { 2545 if (fc->no_lock) { 2546 posix_test_lock(file, fl); 2547 err = 0; 2548 } else 2549 err = fuse_getlk(file, fl); 2550 } else { 2551 if (fc->no_lock) 2552 err = posix_lock_file(file, fl, NULL); 2553 else 2554 err = fuse_setlk(file, fl, 0); 2555 } 2556 return err; 2557 } 2558 2559 static int fuse_file_flock(struct file *file, int cmd, struct file_lock *fl) 2560 { 2561 struct inode *inode = file_inode(file); 2562 struct fuse_conn *fc = get_fuse_conn(inode); 2563 int err; 2564 2565 if (fc->no_flock) { 2566 err = locks_lock_file_wait(file, fl); 2567 } else { 2568 struct fuse_file *ff = file->private_data; 2569 2570 /* emulate flock with POSIX locks */ 2571 ff->flock = true; 2572 err = fuse_setlk(file, fl, 1); 2573 } 2574 2575 return err; 2576 } 2577 2578 static sector_t fuse_bmap(struct address_space *mapping, sector_t block) 2579 { 2580 struct inode *inode = mapping->host; 2581 struct fuse_mount *fm = get_fuse_mount(inode); 2582 FUSE_ARGS(args); 2583 struct fuse_bmap_in inarg; 2584 struct fuse_bmap_out outarg; 2585 int err; 2586 2587 if (!inode->i_sb->s_bdev || fm->fc->no_bmap) 2588 return 0; 2589 2590 memset(&inarg, 0, sizeof(inarg)); 2591 inarg.block = block; 2592 inarg.blocksize = inode->i_sb->s_blocksize; 2593 args.opcode = FUSE_BMAP; 2594 args.nodeid = get_node_id(inode); 2595 args.in_numargs = 1; 2596 args.in_args[0].size = sizeof(inarg); 2597 args.in_args[0].value = &inarg; 2598 args.out_numargs = 1; 2599 args.out_args[0].size = sizeof(outarg); 2600 args.out_args[0].value = &outarg; 2601 err = fuse_simple_request(fm, &args); 2602 if (err == -ENOSYS) 2603 fm->fc->no_bmap = 1; 2604 2605 return err ? 
0 : outarg.block; 2606 } 2607 2608 static loff_t fuse_lseek(struct file *file, loff_t offset, int whence) 2609 { 2610 struct inode *inode = file->f_mapping->host; 2611 struct fuse_mount *fm = get_fuse_mount(inode); 2612 struct fuse_file *ff = file->private_data; 2613 FUSE_ARGS(args); 2614 struct fuse_lseek_in inarg = { 2615 .fh = ff->fh, 2616 .offset = offset, 2617 .whence = whence 2618 }; 2619 struct fuse_lseek_out outarg; 2620 int err; 2621 2622 if (fm->fc->no_lseek) 2623 goto fallback; 2624 2625 args.opcode = FUSE_LSEEK; 2626 args.nodeid = ff->nodeid; 2627 args.in_numargs = 1; 2628 args.in_args[0].size = sizeof(inarg); 2629 args.in_args[0].value = &inarg; 2630 args.out_numargs = 1; 2631 args.out_args[0].size = sizeof(outarg); 2632 args.out_args[0].value = &outarg; 2633 err = fuse_simple_request(fm, &args); 2634 if (err) { 2635 if (err == -ENOSYS) { 2636 fm->fc->no_lseek = 1; 2637 goto fallback; 2638 } 2639 return err; 2640 } 2641 2642 return vfs_setpos(file, outarg.offset, inode->i_sb->s_maxbytes); 2643 2644 fallback: 2645 err = fuse_update_attributes(inode, file, STATX_SIZE); 2646 if (!err) 2647 return generic_file_llseek(file, offset, whence); 2648 else 2649 return err; 2650 } 2651 2652 static loff_t fuse_file_llseek(struct file *file, loff_t offset, int whence) 2653 { 2654 loff_t retval; 2655 struct inode *inode = file_inode(file); 2656 2657 switch (whence) { 2658 case SEEK_SET: 2659 case SEEK_CUR: 2660 /* No i_mutex protection necessary for SEEK_CUR and SEEK_SET */ 2661 retval = generic_file_llseek(file, offset, whence); 2662 break; 2663 case SEEK_END: 2664 inode_lock(inode); 2665 retval = fuse_update_attributes(inode, file, STATX_SIZE); 2666 if (!retval) 2667 retval = generic_file_llseek(file, offset, whence); 2668 inode_unlock(inode); 2669 break; 2670 case SEEK_HOLE: 2671 case SEEK_DATA: 2672 inode_lock(inode); 2673 retval = fuse_lseek(file, offset, whence); 2674 inode_unlock(inode); 2675 break; 2676 default: 2677 retval = -EINVAL; 2678 } 2679 2680 return retval; 2681 } 2682 2683 /* 2684 * All files which have been polled are linked to RB tree 2685 * fuse_conn->polled_files which is indexed by kh. Walk the tree and 2686 * find the matching one. 2687 */ 2688 static struct rb_node **fuse_find_polled_node(struct fuse_conn *fc, u64 kh, 2689 struct rb_node **parent_out) 2690 { 2691 struct rb_node **link = &fc->polled_files.rb_node; 2692 struct rb_node *last = NULL; 2693 2694 while (*link) { 2695 struct fuse_file *ff; 2696 2697 last = *link; 2698 ff = rb_entry(last, struct fuse_file, polled_node); 2699 2700 if (kh < ff->kh) 2701 link = &last->rb_left; 2702 else if (kh > ff->kh) 2703 link = &last->rb_right; 2704 else 2705 return link; 2706 } 2707 2708 if (parent_out) 2709 *parent_out = last; 2710 return link; 2711 } 2712 2713 /* 2714 * The file is about to be polled. Make sure it's on the polled_files 2715 * RB tree. Note that files once added to the polled_files tree are 2716 * not removed before the file is released. This is because a file 2717 * polled once is likely to be polled again. 
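*
* The tree is keyed by the kernel-assigned handle ff->kh, so a poll
* wakeup notification carrying a kh can be resolved with an O(log n)
* walk under fc->lock (see fuse_find_polled_node() above).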
2718 */ 2719 static void fuse_register_polled_file(struct fuse_conn *fc, 2720 struct fuse_file *ff) 2721 { 2722 spin_lock(&fc->lock); 2723 if (RB_EMPTY_NODE(&ff->polled_node)) { 2724 struct rb_node **link, *parent; 2725 2726 link = fuse_find_polled_node(fc, ff->kh, &parent); 2727 BUG_ON(*link); 2728 rb_link_node(&ff->polled_node, parent, link); 2729 rb_insert_color(&ff->polled_node, &fc->polled_files); 2730 } 2731 spin_unlock(&fc->lock); 2732 } 2733 2734 __poll_t fuse_file_poll(struct file *file, poll_table *wait) 2735 { 2736 struct fuse_file *ff = file->private_data; 2737 struct fuse_mount *fm = ff->fm; 2738 struct fuse_poll_in inarg = { .fh = ff->fh, .kh = ff->kh }; 2739 struct fuse_poll_out outarg; 2740 FUSE_ARGS(args); 2741 int err; 2742 2743 if (fm->fc->no_poll) 2744 return DEFAULT_POLLMASK; 2745 2746 poll_wait(file, &ff->poll_wait, wait); 2747 inarg.events = mangle_poll(poll_requested_events(wait)); 2748 2749 /* 2750 * Ask for notification iff there's someone waiting for it. 2751 * The client may ignore the flag and always notify. 2752 */ 2753 if (waitqueue_active(&ff->poll_wait)) { 2754 inarg.flags |= FUSE_POLL_SCHEDULE_NOTIFY; 2755 fuse_register_polled_file(fm->fc, ff); 2756 } 2757 2758 args.opcode = FUSE_POLL; 2759 args.nodeid = ff->nodeid; 2760 args.in_numargs = 1; 2761 args.in_args[0].size = sizeof(inarg); 2762 args.in_args[0].value = &inarg; 2763 args.out_numargs = 1; 2764 args.out_args[0].size = sizeof(outarg); 2765 args.out_args[0].value = &outarg; 2766 err = fuse_simple_request(fm, &args); 2767 2768 if (!err) 2769 return demangle_poll(outarg.revents); 2770 if (err == -ENOSYS) { 2771 fm->fc->no_poll = 1; 2772 return DEFAULT_POLLMASK; 2773 } 2774 return EPOLLERR; 2775 } 2776 EXPORT_SYMBOL_GPL(fuse_file_poll); 2777 2778 /* 2779 * This is called from fuse_handle_notify() on FUSE_NOTIFY_POLL and 2780 * wakes up the poll waiters. 
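*
* As a rough sketch (illustrative, not from the original file), a server
* that saw FUSE_POLL_SCHEDULE_NOTIFY for a file signals readiness by
* writing a notification with the matching kh to the fuse device:
*
*   struct fuse_out_header oh = {
*           .len = sizeof(oh) + sizeof(struct fuse_notify_poll_wakeup_out),
*           .error = FUSE_NOTIFY_POLL,
*           .unique = 0,    // notifications carry no request id
*   };
*   struct fuse_notify_poll_wakeup_out out = { .kh = kh };
*   ...then oh followed by out is written to the device fd in one write...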
2781 */ 2782 int fuse_notify_poll_wakeup(struct fuse_conn *fc, 2783 struct fuse_notify_poll_wakeup_out *outarg) 2784 { 2785 u64 kh = outarg->kh; 2786 struct rb_node **link; 2787 2788 spin_lock(&fc->lock); 2789 2790 link = fuse_find_polled_node(fc, kh, NULL); 2791 if (*link) { 2792 struct fuse_file *ff; 2793 2794 ff = rb_entry(*link, struct fuse_file, polled_node); 2795 wake_up_interruptible_sync(&ff->poll_wait); 2796 } 2797 2798 spin_unlock(&fc->lock); 2799 return 0; 2800 } 2801 2802 static void fuse_do_truncate(struct file *file) 2803 { 2804 struct inode *inode = file->f_mapping->host; 2805 struct iattr attr; 2806 2807 attr.ia_valid = ATTR_SIZE; 2808 attr.ia_size = i_size_read(inode); 2809 2810 attr.ia_file = file; 2811 attr.ia_valid |= ATTR_FILE; 2812 2813 fuse_do_setattr(file_mnt_idmap(file), file_dentry(file), &attr, file); 2814 } 2815 2816 static inline loff_t fuse_round_up(struct fuse_conn *fc, loff_t off) 2817 { 2818 return round_up(off, fc->max_pages << PAGE_SHIFT); 2819 } 2820 2821 static ssize_t 2822 fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter) 2823 { 2824 DECLARE_COMPLETION_ONSTACK(wait); 2825 ssize_t ret = 0; 2826 struct file *file = iocb->ki_filp; 2827 struct fuse_file *ff = file->private_data; 2828 loff_t pos = 0; 2829 struct inode *inode; 2830 loff_t i_size; 2831 size_t count = iov_iter_count(iter), shortened = 0; 2832 loff_t offset = iocb->ki_pos; 2833 struct fuse_io_priv *io; 2834 2835 pos = offset; 2836 inode = file->f_mapping->host; 2837 i_size = i_size_read(inode); 2838 2839 if ((iov_iter_rw(iter) == READ) && (offset >= i_size)) 2840 return 0; 2841 2842 io = kmalloc_obj(struct fuse_io_priv, GFP_KERNEL); 2843 if (!io) 2844 return -ENOMEM; 2845 spin_lock_init(&io->lock); 2846 kref_init(&io->refcnt); 2847 io->reqs = 1; 2848 io->bytes = -1; 2849 io->size = 0; 2850 io->offset = offset; 2851 io->write = (iov_iter_rw(iter) == WRITE); 2852 io->err = 0; 2853 /* 2854 * By default, we want to optimize all I/Os with async request 2855 * submission to the client filesystem if supported. 2856 */ 2857 io->async = ff->fm->fc->async_dio; 2858 io->iocb = iocb; 2859 io->blocking = is_sync_kiocb(iocb); 2860 2861 /* optimization for short read */ 2862 if (io->async && !io->write && offset + count > i_size) { 2863 iov_iter_truncate(iter, fuse_round_up(ff->fm->fc, i_size - offset)); 2864 shortened = count - iov_iter_count(iter); 2865 count -= shortened; 2866 } 2867 2868 /* 2869 * We cannot asynchronously extend the size of a file. 2870 * In that case the aio will behave exactly like sync io. 2871 */ 2872 if ((offset + count > i_size) && io->write) 2873 io->blocking = true; 2874 2875 if (io->async && io->blocking) { 2876 /* 2877 * Additional reference to keep io around after 2878 * calling fuse_aio_complete() 2879 */ 2880 kref_get(&io->refcnt); 2881 io->done = &wait; 2882 } 2883 2884 if (iov_iter_rw(iter) == WRITE) { 2885 ret = fuse_direct_io(io, iter, &pos, FUSE_DIO_WRITE); 2886 fuse_invalidate_attr_mask(inode, FUSE_STATX_MODSIZE); 2887 } else { 2888 ret = __fuse_direct_read(io, iter, &pos); 2889 } 2890 iov_iter_reexpand(iter, iov_iter_count(iter) + shortened); 2891 2892 if (io->async) { 2893 bool blocking = io->blocking; 2894 2895 fuse_aio_complete(io, ret < 0 ?
ret : 0, -1); 2896 2897 /* we have a non-extending, async request, so return */ 2898 if (!blocking) 2899 return -EIOCBQUEUED; 2900 2901 wait_for_completion(&wait); 2902 ret = fuse_get_res_by_io(io); 2903 } 2904 2905 kref_put(&io->refcnt, fuse_io_release); 2906 2907 if (iov_iter_rw(iter) == WRITE) { 2908 fuse_write_update_attr(inode, pos, ret); 2909 /* For extending writes we already hold exclusive lock */ 2910 if (ret < 0 && offset + count > i_size) 2911 fuse_do_truncate(file); 2912 } 2913 2914 return ret; 2915 } 2916 /* * Write back everything from @start to EOF rather than just up to @end: * writes that extend the file past @end also update its size on the * server, which the callers (fallocate, copy_file_range) may depend on. */ 2917 static int fuse_writeback_range(struct inode *inode, loff_t start, loff_t end) 2918 { 2919 int err = filemap_write_and_wait_range(inode->i_mapping, start, LLONG_MAX); 2920 2921 if (!err) 2922 fuse_sync_writes(inode); 2923 2924 return err; 2925 } 2926 2927 static long fuse_file_fallocate(struct file *file, int mode, loff_t offset, 2928 loff_t length) 2929 { 2930 struct fuse_file *ff = file->private_data; 2931 struct inode *inode = file_inode(file); 2932 struct fuse_inode *fi = get_fuse_inode(inode); 2933 struct fuse_mount *fm = ff->fm; 2934 FUSE_ARGS(args); 2935 struct fuse_fallocate_in inarg = { 2936 .fh = ff->fh, 2937 .offset = offset, 2938 .length = length, 2939 .mode = mode 2940 }; 2941 int err; 2942 bool block_faults = FUSE_IS_DAX(inode) && 2943 (!(mode & FALLOC_FL_KEEP_SIZE) || 2944 (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE))); 2945 2946 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | 2947 FALLOC_FL_ZERO_RANGE)) 2948 return -EOPNOTSUPP; 2949 2950 if (fm->fc->no_fallocate) 2951 return -EOPNOTSUPP; 2952 2953 inode_lock(inode); 2954 if (block_faults) { 2955 filemap_invalidate_lock(inode->i_mapping); 2956 err = fuse_dax_break_layouts(inode, 0, -1); 2957 if (err) 2958 goto out; 2959 } 2960 2961 if (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE)) { 2962 loff_t endbyte = offset + length - 1; 2963 2964 err = fuse_writeback_range(inode, offset, endbyte); 2965 if (err) 2966 goto out; 2967 } 2968 2969 if (!(mode & FALLOC_FL_KEEP_SIZE) && 2970 offset + length > i_size_read(inode)) { 2971 err = inode_newsize_ok(inode, offset + length); 2972 if (err) 2973 goto out; 2974 } 2975 2976 err = file_modified(file); 2977 if (err) 2978 goto out; 2979 2980 if (!(mode & FALLOC_FL_KEEP_SIZE)) 2981 set_bit(FUSE_I_SIZE_UNSTABLE, &fi->state); 2982 2983 args.opcode = FUSE_FALLOCATE; 2984 args.nodeid = ff->nodeid; 2985 args.in_numargs = 1; 2986 args.in_args[0].size = sizeof(inarg); 2987 args.in_args[0].value = &inarg; 2988 err = fuse_simple_request(fm, &args); 2989 if (err == -ENOSYS) { 2990 fm->fc->no_fallocate = 1; 2991 err = -EOPNOTSUPP; 2992 } 2993 if (err) 2994 goto out; 2995 2996 /* we could have extended the file */ 2997 if (!(mode & FALLOC_FL_KEEP_SIZE)) { 2998 if (fuse_write_update_attr(inode, offset + length, length)) 2999 file_update_time(file); 3000 } 3001 3002 if (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE)) 3003 truncate_pagecache_range(inode, offset, offset + length - 1); 3004 3005 fuse_invalidate_attr_mask(inode, FUSE_STATX_MODSIZE); 3006 3007 out: 3008 if (!(mode & FALLOC_FL_KEEP_SIZE)) 3009 clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state); 3010 3011 if (block_faults) 3012 filemap_invalidate_unlock(inode->i_mapping); 3013 3014 inode_unlock(inode); 3015 3016 fuse_flush_time_update(inode); 3017 3018 return err; 3019 } 3020 3021 static ssize_t __fuse_copy_file_range(struct file *file_in, loff_t pos_in, 3022 struct file *file_out, loff_t pos_out, 3023 size_t len, unsigned int flags) 3024 { 3025 struct fuse_file *ff_in = file_in->private_data; 3026
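/*
 * Illustrative caller's view (hypothetical example, not from the
 * original file): this path is typically reached via
 *
 *   ssize_t n = copy_file_range(fd_in, &off_in, fd_out, &off_out,
 *                               len, 0);
 *
 * with both descriptors on the same fuse superblock; a cross-sb copy
 * returns -EXDEV and fuse_copy_file_range() then falls back to
 * splice_copy_file_range().
 */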
struct fuse_file *ff_out = file_out->private_data; 3027 struct inode *inode_in = file_inode(file_in); 3028 struct inode *inode_out = file_inode(file_out); 3029 struct fuse_inode *fi_out = get_fuse_inode(inode_out); 3030 struct fuse_mount *fm = ff_in->fm; 3031 struct fuse_conn *fc = fm->fc; 3032 FUSE_ARGS(args); 3033 struct fuse_copy_file_range_in inarg = { 3034 .fh_in = ff_in->fh, 3035 .off_in = pos_in, 3036 .nodeid_out = ff_out->nodeid, 3037 .fh_out = ff_out->fh, 3038 .off_out = pos_out, 3039 .len = len, 3040 .flags = flags 3041 }; 3042 struct fuse_write_out outarg; 3043 struct fuse_copy_file_range_out outarg_64; 3044 u64 bytes_copied; 3045 ssize_t err; 3046 /* mark unstable when write-back is not used, and file_out gets 3047 * extended */ 3048 bool is_unstable = (!fc->writeback_cache) && 3049 ((pos_out + len) > inode_out->i_size); 3050 3051 if (fc->no_copy_file_range) 3052 return -EOPNOTSUPP; 3053 3054 if (file_inode(file_in)->i_sb != file_inode(file_out)->i_sb) 3055 return -EXDEV; 3056 3057 inode_lock(inode_in); 3058 err = fuse_writeback_range(inode_in, pos_in, pos_in + len - 1); 3059 inode_unlock(inode_in); 3060 if (err) 3061 return err; 3062 3063 inode_lock(inode_out); 3064 3065 err = file_modified(file_out); 3066 if (err) 3067 goto out; 3068 3069 /* 3070 * Write out dirty pages in the destination file before sending the COPY 3071 * request to userspace. After the request is completed, truncate off 3072 * pages (including partial ones) from the cache that have been copied, 3073 * since these contain stale data at that point. 3074 * 3075 * This should be mostly correct, but if the COPY writes to partial 3076 * pages (at the start or end) and the parts not covered by the COPY are 3077 * written through a memory map after calling fuse_writeback_range(), 3078 * then these partial page modifications will be lost on truncation. 3079 * 3080 * It is unlikely that someone would rely on such mixed style 3081 * modifications. Yet this does give less guarantees than if the 3082 * copying was performed with write(2). 3083 * 3084 * To fix this a mapping->invalidate_lock could be used to prevent new 3085 * faults while the copy is ongoing. 3086 */ 3087 err = fuse_writeback_range(inode_out, pos_out, pos_out + len - 1); 3088 if (err) 3089 goto out; 3090 3091 if (is_unstable) 3092 set_bit(FUSE_I_SIZE_UNSTABLE, &fi_out->state); 3093 3094 args.opcode = FUSE_COPY_FILE_RANGE_64; 3095 args.nodeid = ff_in->nodeid; 3096 args.in_numargs = 1; 3097 args.in_args[0].size = sizeof(inarg); 3098 args.in_args[0].value = &inarg; 3099 args.out_numargs = 1; 3100 args.out_args[0].size = sizeof(outarg_64); 3101 args.out_args[0].value = &outarg_64; 3102 if (fc->no_copy_file_range_64) { 3103 fallback: 3104 /* Fall back to old op that can't handle large copy length */ 3105 args.opcode = FUSE_COPY_FILE_RANGE; 3106 args.out_args[0].size = sizeof(outarg); 3107 args.out_args[0].value = &outarg; 3108 inarg.len = len = min_t(size_t, len, UINT_MAX & PAGE_MASK); 3109 } 3110 err = fuse_simple_request(fm, &args); 3111 if (err == -ENOSYS) { 3112 if (fc->no_copy_file_range_64) { 3113 fc->no_copy_file_range = 1; 3114 err = -EOPNOTSUPP; 3115 } else { 3116 fc->no_copy_file_range_64 = 1; 3117 goto fallback; 3118 } 3119 } 3120 if (err) 3121 goto out; 3122 3123 bytes_copied = fc->no_copy_file_range_64 ? 
3124 outarg.size : outarg_64.bytes_copied; 3125 3126 if (bytes_copied > len) { 3127 err = -EIO; 3128 goto out; 3129 } 3130 3131 truncate_inode_pages_range(inode_out->i_mapping, 3132 ALIGN_DOWN(pos_out, PAGE_SIZE), 3133 ALIGN(pos_out + bytes_copied, PAGE_SIZE) - 1); 3134 3135 file_update_time(file_out); 3136 fuse_write_update_attr(inode_out, pos_out + bytes_copied, bytes_copied); 3137 3138 err = bytes_copied; 3139 out: 3140 if (is_unstable) 3141 clear_bit(FUSE_I_SIZE_UNSTABLE, &fi_out->state); 3142 3143 inode_unlock(inode_out); 3144 file_accessed(file_in); 3145 3146 fuse_flush_time_update(inode_out); 3147 3148 return err; 3149 } 3150 3151 static ssize_t fuse_copy_file_range(struct file *src_file, loff_t src_off, 3152 struct file *dst_file, loff_t dst_off, 3153 size_t len, unsigned int flags) 3154 { 3155 ssize_t ret; 3156 3157 ret = __fuse_copy_file_range(src_file, src_off, dst_file, dst_off, 3158 len, flags); 3159 3160 if (ret == -EOPNOTSUPP || ret == -EXDEV) 3161 ret = splice_copy_file_range(src_file, src_off, dst_file, 3162 dst_off, len); 3163 return ret; 3164 } 3165 3166 static const struct file_operations fuse_file_operations = { 3167 .llseek = fuse_file_llseek, 3168 .read_iter = fuse_file_read_iter, 3169 .write_iter = fuse_file_write_iter, 3170 .mmap = fuse_file_mmap, 3171 .open = fuse_open, 3172 .flush = fuse_flush, 3173 .release = fuse_release, 3174 .fsync = fuse_fsync, 3175 .lock = fuse_file_lock, 3176 .get_unmapped_area = thp_get_unmapped_area, 3177 .flock = fuse_file_flock, 3178 .splice_read = fuse_splice_read, 3179 .splice_write = fuse_splice_write, 3180 .unlocked_ioctl = fuse_file_ioctl, 3181 .compat_ioctl = fuse_file_compat_ioctl, 3182 .poll = fuse_file_poll, 3183 .fallocate = fuse_file_fallocate, 3184 .copy_file_range = fuse_copy_file_range, 3185 .setlease = generic_setlease, 3186 }; 3187 3188 static const struct address_space_operations fuse_file_aops = { 3189 .read_folio = fuse_read_folio, 3190 .readahead = fuse_readahead, 3191 .writepages = fuse_writepages, 3192 .launder_folio = fuse_launder_folio, 3193 .dirty_folio = iomap_dirty_folio, 3194 .release_folio = iomap_release_folio, 3195 .invalidate_folio = iomap_invalidate_folio, 3196 .is_partially_uptodate = iomap_is_partially_uptodate, 3197 .migrate_folio = filemap_migrate_folio, 3198 .bmap = fuse_bmap, 3199 .direct_IO = fuse_direct_IO, 3200 }; 3201 3202 void fuse_init_file_inode(struct inode *inode, unsigned int flags) 3203 { 3204 struct fuse_inode *fi = get_fuse_inode(inode); 3205 struct fuse_conn *fc = get_fuse_conn(inode); 3206 3207 inode->i_fop = &fuse_file_operations; 3208 inode->i_data.a_ops = &fuse_file_aops; 3209 if (fc->writeback_cache) 3210 mapping_set_writeback_may_deadlock_on_reclaim(&inode->i_data); 3211 3212 INIT_LIST_HEAD(&fi->write_files); 3213 INIT_LIST_HEAD(&fi->queued_writes); 3214 fi->writectr = 0; 3215 fi->iocachectr = 0; 3216 init_waitqueue_head(&fi->page_waitq); 3217 init_waitqueue_head(&fi->direct_io_waitq); 3218 3219 if (IS_ENABLED(CONFIG_FUSE_DAX)) 3220 fuse_dax_inode_init(inode, flags); 3221 } 3222
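/*
 * Summary sketch of the writeback pipeline wired up above (descriptive
 * note, not from the original file):
 *
 *   fuse_writepages()
 *     -> iomap_writepages(&wpc)
 *          -> fuse_iomap_writeback_range()   (batches folios; full or
 *                                             discontiguous batches are
 *                                             flushed as they fill)
 *          -> fuse_iomap_writeback_submit()  (sends the final batch)
 *               -> fuse_writepages_send()
 *                    -> fuse_flush_writepages()     (under fi->lock)
 *                         -> fuse_send_writepage()  (FUSE_WRITE request)
 *                              -> fuse_writepage_end() on completion
 */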