xref: /linux/fs/fuse/file.c (revision 4232da23d75d173195c6766729e51947b64f83cd)
1  /*
2    FUSE: Filesystem in Userspace
3    Copyright (C) 2001-2008  Miklos Szeredi <miklos@szeredi.hu>
4  
5    This program can be distributed under the terms of the GNU GPL.
6    See the file COPYING.
7  */
8  
9  #include "fuse_i.h"
10  
11  #include <linux/pagemap.h>
12  #include <linux/slab.h>
13  #include <linux/kernel.h>
14  #include <linux/sched.h>
15  #include <linux/sched/signal.h>
16  #include <linux/module.h>
17  #include <linux/swap.h>
18  #include <linux/falloc.h>
19  #include <linux/uio.h>
20  #include <linux/fs.h>
21  #include <linux/filelock.h>
22  #include <linux/splice.h>
23  #include <linux/task_io_accounting_ops.h>
24  
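/* Send an open request (FUSE_OPEN or FUSE_OPENDIR) to the server and collect the reply in @outargp */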
25  static int fuse_send_open(struct fuse_mount *fm, u64 nodeid,
26  			  unsigned int open_flags, int opcode,
27  			  struct fuse_open_out *outargp)
28  {
29  	struct fuse_open_in inarg;
30  	FUSE_ARGS(args);
31  
32  	memset(&inarg, 0, sizeof(inarg));
33  	inarg.flags = open_flags & ~(O_CREAT | O_EXCL | O_NOCTTY);
34  	if (!fm->fc->atomic_o_trunc)
35  		inarg.flags &= ~O_TRUNC;
36  
37  	if (fm->fc->handle_killpriv_v2 &&
38  	    (inarg.flags & O_TRUNC) && !capable(CAP_FSETID)) {
39  		inarg.open_flags |= FUSE_OPEN_KILL_SUIDGID;
40  	}
41  
42  	args.opcode = opcode;
43  	args.nodeid = nodeid;
44  	args.in_numargs = 1;
45  	args.in_args[0].size = sizeof(inarg);
46  	args.in_args[0].value = &inarg;
47  	args.out_numargs = 1;
48  	args.out_args[0].size = sizeof(*outargp);
49  	args.out_args[0].value = outargp;
50  
51  	return fuse_simple_request(fm, &args);
52  }
53  
54  struct fuse_file *fuse_file_alloc(struct fuse_mount *fm, bool release)
55  {
56  	struct fuse_file *ff;
57  
58  	ff = kzalloc(sizeof(struct fuse_file), GFP_KERNEL_ACCOUNT);
59  	if (unlikely(!ff))
60  		return NULL;
61  
62  	ff->fm = fm;
63  	if (release) {
64  		ff->args = kzalloc(sizeof(*ff->args), GFP_KERNEL_ACCOUNT);
65  		if (!ff->args) {
66  			kfree(ff);
67  			return NULL;
68  		}
69  	}
70  
71  	INIT_LIST_HEAD(&ff->write_entry);
72  	refcount_set(&ff->count, 1);
73  	RB_CLEAR_NODE(&ff->polled_node);
74  	init_waitqueue_head(&ff->poll_wait);
75  
76  	ff->kh = atomic64_inc_return(&fm->fc->khctr);
77  
78  	return ff;
79  }
80  
81  void fuse_file_free(struct fuse_file *ff)
82  {
83  	kfree(ff->args);
84  	kfree(ff);
85  }
86  
87  static struct fuse_file *fuse_file_get(struct fuse_file *ff)
88  {
89  	refcount_inc(&ff->count);
90  	return ff;
91  }
92  
93  static void fuse_release_end(struct fuse_mount *fm, struct fuse_args *args,
94  			     int error)
95  {
96  	struct fuse_release_args *ra = container_of(args, typeof(*ra), args);
97  
98  	iput(ra->inode);
99  	kfree(ra);
100  }
101  
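/*
 * Drop a reference to the fuse_file.  On the last put, send the prepared
 * RELEASE request (synchronously if @sync is true) and free the fuse_file.
 */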
102  static void fuse_file_put(struct fuse_file *ff, bool sync)
103  {
104  	if (refcount_dec_and_test(&ff->count)) {
105  		struct fuse_release_args *ra = &ff->args->release_args;
106  		struct fuse_args *args = (ra ? &ra->args : NULL);
107  
108  		if (ra && ra->inode)
109  			fuse_file_io_release(ff, ra->inode);
110  
111  		if (!args) {
112  			/* Do nothing when server does not implement 'open' */
113  		} else if (sync) {
114  			fuse_simple_request(ff->fm, args);
115  			fuse_release_end(ff->fm, args, 0);
116  		} else {
117  			args->end = fuse_release_end;
118  			if (fuse_simple_background(ff->fm, args,
119  						   GFP_KERNEL | __GFP_NOFAIL))
120  				fuse_release_end(ff->fm, args, -ENOTCONN);
121  		}
122  		kfree(ff);
123  	}
124  }
125  
126  struct fuse_file *fuse_file_open(struct fuse_mount *fm, u64 nodeid,
127  				 unsigned int open_flags, bool isdir)
128  {
129  	struct fuse_conn *fc = fm->fc;
130  	struct fuse_file *ff;
131  	int opcode = isdir ? FUSE_OPENDIR : FUSE_OPEN;
132  	bool open = isdir ? !fc->no_opendir : !fc->no_open;
133  
134  	ff = fuse_file_alloc(fm, open);
135  	if (!ff)
136  		return ERR_PTR(-ENOMEM);
137  
138  	ff->fh = 0;
139  	/* Default for no-open */
140  	ff->open_flags = FOPEN_KEEP_CACHE | (isdir ? FOPEN_CACHE_DIR : 0);
141  	if (open) {
142  		/* Store outarg for fuse_finish_open() */
143  		struct fuse_open_out *outargp = &ff->args->open_outarg;
144  		int err;
145  
146  		err = fuse_send_open(fm, nodeid, open_flags, opcode, outargp);
147  		if (!err) {
148  			ff->fh = outargp->fh;
149  			ff->open_flags = outargp->open_flags;
150  		} else if (err != -ENOSYS) {
151  			fuse_file_free(ff);
152  			return ERR_PTR(err);
153  		} else {
154  			/* No release needed */
155  			kfree(ff->args);
156  			ff->args = NULL;
157  			if (isdir)
158  				fc->no_opendir = 1;
159  			else
160  				fc->no_open = 1;
161  		}
162  	}
163  
164  	if (isdir)
165  		ff->open_flags &= ~FOPEN_DIRECT_IO;
166  
167  	ff->nodeid = nodeid;
168  
169  	return ff;
170  }
171  
172  int fuse_do_open(struct fuse_mount *fm, u64 nodeid, struct file *file,
173  		 bool isdir)
174  {
175  	struct fuse_file *ff = fuse_file_open(fm, nodeid, file->f_flags, isdir);
176  
177  	if (!IS_ERR(ff))
178  		file->private_data = ff;
179  
180  	return PTR_ERR_OR_ZERO(ff);
181  }
182  EXPORT_SYMBOL_GPL(fuse_do_open);
183  
184  static void fuse_link_write_file(struct file *file)
185  {
186  	struct inode *inode = file_inode(file);
187  	struct fuse_inode *fi = get_fuse_inode(inode);
188  	struct fuse_file *ff = file->private_data;
189  	/*
190  	 * The file may be written through mmap, so chain it onto the
191  	 * inode's write_files list
192  	 */
193  	spin_lock(&fi->lock);
194  	if (list_empty(&ff->write_entry))
195  		list_add(&ff->write_entry, &fi->write_files);
196  	spin_unlock(&fi->lock);
197  }
198  
199  int fuse_finish_open(struct inode *inode, struct file *file)
200  {
201  	struct fuse_file *ff = file->private_data;
202  	struct fuse_conn *fc = get_fuse_conn(inode);
203  	int err;
204  
205  	err = fuse_file_io_open(file, inode);
206  	if (err)
207  		return err;
208  
209  	if (ff->open_flags & FOPEN_STREAM)
210  		stream_open(inode, file);
211  	else if (ff->open_flags & FOPEN_NONSEEKABLE)
212  		nonseekable_open(inode, file);
213  
214  	if ((file->f_mode & FMODE_WRITE) && fc->writeback_cache)
215  		fuse_link_write_file(file);
216  
217  	return 0;
218  }
219  
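/* An O_TRUNC open succeeded: reset the cached i_size to zero and invalidate size/mtime attributes */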
220  static void fuse_truncate_update_attr(struct inode *inode, struct file *file)
221  {
222  	struct fuse_conn *fc = get_fuse_conn(inode);
223  	struct fuse_inode *fi = get_fuse_inode(inode);
224  
225  	spin_lock(&fi->lock);
226  	fi->attr_version = atomic64_inc_return(&fc->attr_version);
227  	i_size_write(inode, 0);
228  	spin_unlock(&fi->lock);
229  	file_update_time(file);
230  	fuse_invalidate_attr_mask(inode, FUSE_STATX_MODSIZE);
231  }
232  
233  static int fuse_open(struct inode *inode, struct file *file)
234  {
235  	struct fuse_mount *fm = get_fuse_mount(inode);
236  	struct fuse_inode *fi = get_fuse_inode(inode);
237  	struct fuse_conn *fc = fm->fc;
238  	struct fuse_file *ff;
239  	int err;
240  	bool is_truncate = (file->f_flags & O_TRUNC) && fc->atomic_o_trunc;
241  	bool is_wb_truncate = is_truncate && fc->writeback_cache;
242  	bool dax_truncate = is_truncate && FUSE_IS_DAX(inode);
243  
244  	if (fuse_is_bad(inode))
245  		return -EIO;
246  
247  	err = generic_file_open(inode, file);
248  	if (err)
249  		return err;
250  
251  	if (is_wb_truncate || dax_truncate)
252  		inode_lock(inode);
253  
254  	if (dax_truncate) {
255  		filemap_invalidate_lock(inode->i_mapping);
256  		err = fuse_dax_break_layouts(inode, 0, 0);
257  		if (err)
258  			goto out_inode_unlock;
259  	}
260  
261  	if (is_wb_truncate || dax_truncate)
262  		fuse_set_nowrite(inode);
263  
264  	err = fuse_do_open(fm, get_node_id(inode), file, false);
265  	if (!err) {
266  		ff = file->private_data;
267  		err = fuse_finish_open(inode, file);
268  		if (err)
269  			fuse_sync_release(fi, ff, file->f_flags);
270  		else if (is_truncate)
271  			fuse_truncate_update_attr(inode, file);
272  	}
273  
274  	if (is_wb_truncate || dax_truncate)
275  		fuse_release_nowrite(inode);
276  	if (!err) {
277  		if (is_truncate)
278  			truncate_pagecache(inode, 0);
279  		else if (!(ff->open_flags & FOPEN_KEEP_CACHE))
280  			invalidate_inode_pages2(inode->i_mapping);
281  	}
282  	if (dax_truncate)
283  		filemap_invalidate_unlock(inode->i_mapping);
284  out_inode_unlock:
285  	if (is_wb_truncate || dax_truncate)
286  		inode_unlock(inode);
287  
288  	return err;
289  }
290  
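/*
 * Detach the fuse_file from the inode and fill in the RELEASE/RELEASEDIR
 * arguments in ff->args; the request itself is sent later by fuse_file_put().
 */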
291  static void fuse_prepare_release(struct fuse_inode *fi, struct fuse_file *ff,
292  				 unsigned int flags, int opcode, bool sync)
293  {
294  	struct fuse_conn *fc = ff->fm->fc;
295  	struct fuse_release_args *ra = &ff->args->release_args;
296  
297  	if (fuse_file_passthrough(ff))
298  		fuse_passthrough_release(ff, fuse_inode_backing(fi));
299  
300  	/* Inode is NULL on error path of fuse_create_open() */
301  	if (likely(fi)) {
302  		spin_lock(&fi->lock);
303  		list_del(&ff->write_entry);
304  		spin_unlock(&fi->lock);
305  	}
306  	spin_lock(&fc->lock);
307  	if (!RB_EMPTY_NODE(&ff->polled_node))
308  		rb_erase(&ff->polled_node, &fc->polled_files);
309  	spin_unlock(&fc->lock);
310  
311  	wake_up_interruptible_all(&ff->poll_wait);
312  
313  	if (!ra)
314  		return;
315  
316  	/* ff->args was used for open outarg */
317  	memset(ff->args, 0, sizeof(*ff->args));
318  	ra->inarg.fh = ff->fh;
319  	ra->inarg.flags = flags;
320  	ra->args.in_numargs = 1;
321  	ra->args.in_args[0].size = sizeof(struct fuse_release_in);
322  	ra->args.in_args[0].value = &ra->inarg;
323  	ra->args.opcode = opcode;
324  	ra->args.nodeid = ff->nodeid;
325  	ra->args.force = true;
326  	ra->args.nocreds = true;
327  
328  	/*
329  	 * Hold inode until release is finished.
330  	 * From fuse_sync_release() the refcount is 1 and everything's
331  	 * synchronous, so we are fine with not doing igrab() here.
332  	 */
333  	ra->inode = sync ? NULL : igrab(&fi->inode);
334  }
335  
336  void fuse_file_release(struct inode *inode, struct fuse_file *ff,
337  		       unsigned int open_flags, fl_owner_t id, bool isdir)
338  {
339  	struct fuse_inode *fi = get_fuse_inode(inode);
340  	struct fuse_release_args *ra = &ff->args->release_args;
341  	int opcode = isdir ? FUSE_RELEASEDIR : FUSE_RELEASE;
342  
343  	fuse_prepare_release(fi, ff, open_flags, opcode, false);
344  
345  	if (ra && ff->flock) {
346  		ra->inarg.release_flags |= FUSE_RELEASE_FLOCK_UNLOCK;
347  		ra->inarg.lock_owner = fuse_lock_owner_id(ff->fm->fc, id);
348  	}
349  
350  	/*
351  	 * Normally this will send the RELEASE request, however if
352  	 * some asynchronous READ or WRITE requests are outstanding,
353  	 * the sending will be delayed.
354  	 *
355  	 * Make the release synchronous if this is a fuseblk mount;
356  	 * synchronous RELEASE is allowed (and desirable) in this case
357  	 * because the server can be trusted not to screw up.
358  	 */
359  	fuse_file_put(ff, ff->fm->fc->destroy);
360  }
361  
362  void fuse_release_common(struct file *file, bool isdir)
363  {
364  	fuse_file_release(file_inode(file), file->private_data, file->f_flags,
365  			  (fl_owner_t) file, isdir);
366  }
367  
368  static int fuse_release(struct inode *inode, struct file *file)
369  {
370  	struct fuse_conn *fc = get_fuse_conn(inode);
371  
372  	/*
373  	 * Dirty pages might remain despite write_inode_now() call from
374  	 * fuse_flush() due to writes racing with the close.
375  	 */
376  	if (fc->writeback_cache)
377  		write_inode_now(inode, 1);
378  
379  	fuse_release_common(file, false);
380  
381  	/* return value is ignored by VFS */
382  	return 0;
383  }
384  
385  void fuse_sync_release(struct fuse_inode *fi, struct fuse_file *ff,
386  		       unsigned int flags)
387  {
388  	WARN_ON(refcount_read(&ff->count) > 1);
389  	fuse_prepare_release(fi, ff, flags, FUSE_RELEASE, true);
390  	fuse_file_put(ff, true);
391  }
392  EXPORT_SYMBOL_GPL(fuse_sync_release);
393  
394  /*
395   * Scramble the ID space with XTEA, so that the value of the files_struct
396   * pointer is not exposed to userspace.
397   */
398  u64 fuse_lock_owner_id(struct fuse_conn *fc, fl_owner_t id)
399  {
400  	u32 *k = fc->scramble_key;
401  	u64 v = (unsigned long) id;
402  	u32 v0 = v;
403  	u32 v1 = v >> 32;
404  	u32 sum = 0;
405  	int i;
406  
407  	for (i = 0; i < 32; i++) {
408  		v0 += ((v1 << 4 ^ v1 >> 5) + v1) ^ (sum + k[sum & 3]);
409  		sum += 0x9E3779B9;
410  		v1 += ((v0 << 4 ^ v0 >> 5) + v0) ^ (sum + k[sum>>11 & 3]);
411  	}
412  
413  	return (u64) v0 + ((u64) v1 << 32);
414  }
415  
416  struct fuse_writepage_args {
417  	struct fuse_io_args ia;
418  	struct rb_node writepages_entry;
419  	struct list_head queue_entry;
420  	struct fuse_writepage_args *next;
421  	struct inode *inode;
422  	struct fuse_sync_bucket *bucket;
423  };
424  
425  static struct fuse_writepage_args *fuse_find_writeback(struct fuse_inode *fi,
426  					    pgoff_t idx_from, pgoff_t idx_to)
427  {
428  	struct rb_node *n;
429  
430  	n = fi->writepages.rb_node;
431  
432  	while (n) {
433  		struct fuse_writepage_args *wpa;
434  		pgoff_t curr_index;
435  
436  		wpa = rb_entry(n, struct fuse_writepage_args, writepages_entry);
437  		WARN_ON(get_fuse_inode(wpa->inode) != fi);
438  		curr_index = wpa->ia.write.in.offset >> PAGE_SHIFT;
439  		if (idx_from >= curr_index + wpa->ia.ap.num_pages)
440  			n = n->rb_right;
441  		else if (idx_to < curr_index)
442  			n = n->rb_left;
443  		else
444  			return wpa;
445  	}
446  	return NULL;
447  }
448  
449  /*
450   * Check if any page in a range is under writeback
451   *
452   * This is currently done by walking the list of writepage requests
453   * for the inode, which can be pretty inefficient.
454   */
455  static bool fuse_range_is_writeback(struct inode *inode, pgoff_t idx_from,
456  				   pgoff_t idx_to)
457  {
458  	struct fuse_inode *fi = get_fuse_inode(inode);
459  	bool found;
460  
461  	spin_lock(&fi->lock);
462  	found = fuse_find_writeback(fi, idx_from, idx_to);
463  	spin_unlock(&fi->lock);
464  
465  	return found;
466  }
467  
468  static inline bool fuse_page_is_writeback(struct inode *inode, pgoff_t index)
469  {
470  	return fuse_range_is_writeback(inode, index, index);
471  }
472  
473  /*
474   * Wait for page writeback to be completed.
475   *
476   * Since fuse doesn't rely on the VM writeback tracking, this has to
477   * use some other means.
478   */
479  static void fuse_wait_on_page_writeback(struct inode *inode, pgoff_t index)
480  {
481  	struct fuse_inode *fi = get_fuse_inode(inode);
482  
483  	wait_event(fi->page_waitq, !fuse_page_is_writeback(inode, index));
484  }
485  
486  /*
487   * Wait for all pending writepages on the inode to finish.
488   *
489   * This is currently done by blocking further writes with FUSE_NOWRITE
490   * and waiting for all sent writes to complete.
491   *
492   * This must be called under i_mutex, otherwise the FUSE_NOWRITE usage
493   * could conflict with truncation.
494   */
495  static void fuse_sync_writes(struct inode *inode)
496  {
497  	fuse_set_nowrite(inode);
498  	fuse_release_nowrite(inode);
499  }
500  
501  static int fuse_flush(struct file *file, fl_owner_t id)
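/*
 * ->flush(): write back dirty data, wait for in-flight writepages, then send
 * FUSE_FLUSH with the lock owner id (unless the server does not implement it).
 */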
502  {
503  	struct inode *inode = file_inode(file);
504  	struct fuse_mount *fm = get_fuse_mount(inode);
505  	struct fuse_file *ff = file->private_data;
506  	struct fuse_flush_in inarg;
507  	FUSE_ARGS(args);
508  	int err;
509  
510  	if (fuse_is_bad(inode))
511  		return -EIO;
512  
513  	if (ff->open_flags & FOPEN_NOFLUSH && !fm->fc->writeback_cache)
514  		return 0;
515  
516  	err = write_inode_now(inode, 1);
517  	if (err)
518  		return err;
519  
520  	inode_lock(inode);
521  	fuse_sync_writes(inode);
522  	inode_unlock(inode);
523  
524  	err = filemap_check_errors(file->f_mapping);
525  	if (err)
526  		return err;
527  
528  	err = 0;
529  	if (fm->fc->no_flush)
530  		goto inval_attr_out;
531  
532  	memset(&inarg, 0, sizeof(inarg));
533  	inarg.fh = ff->fh;
534  	inarg.lock_owner = fuse_lock_owner_id(fm->fc, id);
535  	args.opcode = FUSE_FLUSH;
536  	args.nodeid = get_node_id(inode);
537  	args.in_numargs = 1;
538  	args.in_args[0].size = sizeof(inarg);
539  	args.in_args[0].value = &inarg;
540  	args.force = true;
541  
542  	err = fuse_simple_request(fm, &args);
543  	if (err == -ENOSYS) {
544  		fm->fc->no_flush = 1;
545  		err = 0;
546  	}
547  
548  inval_attr_out:
549  	/*
550  	 * The in-memory i_blocks is not maintained by fuse; if writeback cache
551  	 * is enabled, i_blocks from the cached attr may not be accurate.
552  	 */
553  	if (!err && fm->fc->writeback_cache)
554  		fuse_invalidate_attr_mask(inode, STATX_BLOCKS);
555  	return err;
556  }
557  
558  int fuse_fsync_common(struct file *file, loff_t start, loff_t end,
559  		      int datasync, int opcode)
560  {
561  	struct inode *inode = file->f_mapping->host;
562  	struct fuse_mount *fm = get_fuse_mount(inode);
563  	struct fuse_file *ff = file->private_data;
564  	FUSE_ARGS(args);
565  	struct fuse_fsync_in inarg;
566  
567  	memset(&inarg, 0, sizeof(inarg));
568  	inarg.fh = ff->fh;
569  	inarg.fsync_flags = datasync ? FUSE_FSYNC_FDATASYNC : 0;
570  	args.opcode = opcode;
571  	args.nodeid = get_node_id(inode);
572  	args.in_numargs = 1;
573  	args.in_args[0].size = sizeof(inarg);
574  	args.in_args[0].value = &inarg;
575  	return fuse_simple_request(fm, &args);
576  }
577  
578  static int fuse_fsync(struct file *file, loff_t start, loff_t end,
579  		      int datasync)
580  {
581  	struct inode *inode = file->f_mapping->host;
582  	struct fuse_conn *fc = get_fuse_conn(inode);
583  	int err;
584  
585  	if (fuse_is_bad(inode))
586  		return -EIO;
587  
588  	inode_lock(inode);
589  
590  	/*
591  	 * Start writeback against all dirty pages of the inode, then
592  	 * wait for all outstanding writes, before sending the FSYNC
593  	 * request.
594  	 */
595  	err = file_write_and_wait_range(file, start, end);
596  	if (err)
597  		goto out;
598  
599  	fuse_sync_writes(inode);
600  
601  	/*
602  	 * Due to the implementation of fuse writeback,
603  	 * file_write_and_wait_range() does not catch errors.
604  	 * We have to do this directly after fuse_sync_writes().
605  	 */
606  	err = file_check_and_advance_wb_err(file);
607  	if (err)
608  		goto out;
609  
610  	err = sync_inode_metadata(inode, 1);
611  	if (err)
612  		goto out;
613  
614  	if (fc->no_fsync)
615  		goto out;
616  
617  	err = fuse_fsync_common(file, start, end, datasync, FUSE_FSYNC);
618  	if (err == -ENOSYS) {
619  		fc->no_fsync = 1;
620  		err = 0;
621  	}
622  out:
623  	inode_unlock(inode);
624  
625  	return err;
626  }
627  
628  void fuse_read_args_fill(struct fuse_io_args *ia, struct file *file, loff_t pos,
629  			 size_t count, int opcode)
630  {
631  	struct fuse_file *ff = file->private_data;
632  	struct fuse_args *args = &ia->ap.args;
633  
634  	ia->read.in.fh = ff->fh;
635  	ia->read.in.offset = pos;
636  	ia->read.in.size = count;
637  	ia->read.in.flags = file->f_flags;
638  	args->opcode = opcode;
639  	args->nodeid = ff->nodeid;
640  	args->in_numargs = 1;
641  	args->in_args[0].size = sizeof(ia->read.in);
642  	args->in_args[0].value = &ia->read.in;
643  	args->out_argvar = true;
644  	args->out_numargs = 1;
645  	args->out_args[0].size = count;
646  }
647  
648  static void fuse_release_user_pages(struct fuse_args_pages *ap,
649  				    bool should_dirty)
650  {
651  	unsigned int i;
652  
653  	for (i = 0; i < ap->num_pages; i++) {
654  		if (should_dirty)
655  			set_page_dirty_lock(ap->pages[i]);
656  		if (ap->args.is_pinned)
657  			unpin_user_page(ap->pages[i]);
658  	}
659  }
660  
661  static void fuse_io_release(struct kref *kref)
662  {
663  	kfree(container_of(kref, struct fuse_io_priv, refcnt));
664  }
665  
666  static ssize_t fuse_get_res_by_io(struct fuse_io_priv *io)
667  {
668  	if (io->err)
669  		return io->err;
670  
671  	if (io->bytes >= 0 && io->write)
672  		return -EIO;
673  
674  	return io->bytes < 0 ? io->size : io->bytes;
675  }
676  
677  /*
678   * In case of a short read, the caller sets 'pos' to the position of the
679   * actual end of the fuse request within the IO request. Otherwise, if
680   * bytes_requested == bytes_transferred or rw == WRITE, the caller sets 'pos' to -1.
681   *
682   * An example:
683   * User requested DIO read of 64K. It was split into two 32K fuse requests,
684   * both submitted asynchronously. The first of them was ACKed by userspace as
685   * fully completed (req->out.args[0].size == 32K) resulting in pos == -1. The
686   * second request was ACKed as short, e.g. only 1K was read, resulting in
687   * pos == 33K.
688   *
689   * Thus, when all fuse requests are completed, the minimal non-negative 'pos'
690   * will be equal to the length of the longest contiguous fragment of
691   * transferred data starting from the beginning of IO request.
692   */
693  static void fuse_aio_complete(struct fuse_io_priv *io, int err, ssize_t pos)
694  {
695  	int left;
696  
697  	spin_lock(&io->lock);
698  	if (err)
699  		io->err = io->err ? : err;
700  	else if (pos >= 0 && (io->bytes < 0 || pos < io->bytes))
701  		io->bytes = pos;
702  
703  	left = --io->reqs;
704  	if (!left && io->blocking)
705  		complete(io->done);
706  	spin_unlock(&io->lock);
707  
708  	if (!left && !io->blocking) {
709  		ssize_t res = fuse_get_res_by_io(io);
710  
711  		if (res >= 0) {
712  			struct inode *inode = file_inode(io->iocb->ki_filp);
713  			struct fuse_conn *fc = get_fuse_conn(inode);
714  			struct fuse_inode *fi = get_fuse_inode(inode);
715  
716  			spin_lock(&fi->lock);
717  			fi->attr_version = atomic64_inc_return(&fc->attr_version);
718  			spin_unlock(&fi->lock);
719  		}
720  
721  		io->iocb->ki_complete(io->iocb, res);
722  	}
723  
724  	kref_put(&io->refcnt, fuse_io_release);
725  }
726  
727  static struct fuse_io_args *fuse_io_alloc(struct fuse_io_priv *io,
728  					  unsigned int npages)
729  {
730  	struct fuse_io_args *ia;
731  
732  	ia = kzalloc(sizeof(*ia), GFP_KERNEL);
733  	if (ia) {
734  		ia->io = io;
735  		ia->ap.pages = fuse_pages_alloc(npages, GFP_KERNEL,
736  						&ia->ap.descs);
737  		if (!ia->ap.pages) {
738  			kfree(ia);
739  			ia = NULL;
740  		}
741  	}
742  	return ia;
743  }
744  
745  static void fuse_io_free(struct fuse_io_args *ia)
746  {
747  	kfree(ia->ap.pages);
748  	kfree(ia);
749  }
750  
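/*
 * Completion callback for async direct I/O requests: release the user pages,
 * translate a short transfer into the 'pos' convention described above and
 * feed the result to fuse_aio_complete().
 */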
751  static void fuse_aio_complete_req(struct fuse_mount *fm, struct fuse_args *args,
752  				  int err)
753  {
754  	struct fuse_io_args *ia = container_of(args, typeof(*ia), ap.args);
755  	struct fuse_io_priv *io = ia->io;
756  	ssize_t pos = -1;
757  
758  	fuse_release_user_pages(&ia->ap, io->should_dirty);
759  
760  	if (err) {
761  		/* Nothing */
762  	} else if (io->write) {
763  		if (ia->write.out.size > ia->write.in.size) {
764  			err = -EIO;
765  		} else if (ia->write.in.size != ia->write.out.size) {
766  			pos = ia->write.in.offset - io->offset +
767  				ia->write.out.size;
768  		}
769  	} else {
770  		u32 outsize = args->out_args[0].size;
771  
772  		if (ia->read.in.size != outsize)
773  			pos = ia->read.in.offset - io->offset + outsize;
774  	}
775  
776  	fuse_aio_complete(io, err, pos);
777  	fuse_io_free(ia);
778  }
779  
780  static ssize_t fuse_async_req_send(struct fuse_mount *fm,
781  				   struct fuse_io_args *ia, size_t num_bytes)
782  {
783  	ssize_t err;
784  	struct fuse_io_priv *io = ia->io;
785  
786  	spin_lock(&io->lock);
787  	kref_get(&io->refcnt);
788  	io->size += num_bytes;
789  	io->reqs++;
790  	spin_unlock(&io->lock);
791  
792  	ia->ap.args.end = fuse_aio_complete_req;
793  	ia->ap.args.may_block = io->should_dirty;
794  	err = fuse_simple_background(fm, &ia->ap.args, GFP_KERNEL);
795  	if (err)
796  		fuse_aio_complete_req(fm, &ia->ap.args, err);
797  
798  	return num_bytes;
799  }
800  
801  static ssize_t fuse_send_read(struct fuse_io_args *ia, loff_t pos, size_t count,
802  			      fl_owner_t owner)
803  {
804  	struct file *file = ia->io->iocb->ki_filp;
805  	struct fuse_file *ff = file->private_data;
806  	struct fuse_mount *fm = ff->fm;
807  
808  	fuse_read_args_fill(ia, file, pos, count, FUSE_READ);
809  	if (owner != NULL) {
810  		ia->read.in.read_flags |= FUSE_READ_LOCKOWNER;
811  		ia->read.in.lock_owner = fuse_lock_owner_id(fm->fc, owner);
812  	}
813  
814  	if (ia->io->async)
815  		return fuse_async_req_send(fm, ia, count);
816  
817  	return fuse_simple_request(fm, &ia->ap.args);
818  }
819  
820  static void fuse_read_update_size(struct inode *inode, loff_t size,
821  				  u64 attr_ver)
822  {
823  	struct fuse_conn *fc = get_fuse_conn(inode);
824  	struct fuse_inode *fi = get_fuse_inode(inode);
825  
826  	spin_lock(&fi->lock);
827  	if (attr_ver >= fi->attr_version && size < inode->i_size &&
828  	    !test_bit(FUSE_I_SIZE_UNSTABLE, &fi->state)) {
829  		fi->attr_version = atomic64_inc_return(&fc->attr_version);
830  		i_size_write(inode, size);
831  	}
832  	spin_unlock(&fi->lock);
833  }
834  
835  static void fuse_short_read(struct inode *inode, u64 attr_ver, size_t num_read,
836  			    struct fuse_args_pages *ap)
837  {
838  	struct fuse_conn *fc = get_fuse_conn(inode);
839  
840  	/*
841  	 * If writeback_cache is enabled, a short read means there's a hole in
842  	 * the file.  Some data after the hole is in page cache, but has not
843  	 * reached the client fs yet.  So the hole is not present there.
844  	 */
845  	if (!fc->writeback_cache) {
846  		loff_t pos = page_offset(ap->pages[0]) + num_read;
847  		fuse_read_update_size(inode, pos, attr_ver);
848  	}
849  }
850  
851  static int fuse_do_readpage(struct file *file, struct page *page)
852  {
853  	struct inode *inode = page->mapping->host;
854  	struct fuse_mount *fm = get_fuse_mount(inode);
855  	loff_t pos = page_offset(page);
856  	struct fuse_page_desc desc = { .length = PAGE_SIZE };
857  	struct fuse_io_args ia = {
858  		.ap.args.page_zeroing = true,
859  		.ap.args.out_pages = true,
860  		.ap.num_pages = 1,
861  		.ap.pages = &page,
862  		.ap.descs = &desc,
863  	};
864  	ssize_t res;
865  	u64 attr_ver;
866  
867  	/*
868  	 * Page writeback can extend beyond the lifetime of the
869  	 * page-cache page, so make sure we read a properly synced
870  	 * page.
871  	 */
872  	fuse_wait_on_page_writeback(inode, page->index);
873  
874  	attr_ver = fuse_get_attr_version(fm->fc);
875  
876  	/* Don't overflow end offset */
877  	if (pos + (desc.length - 1) == LLONG_MAX)
878  		desc.length--;
879  
880  	fuse_read_args_fill(&ia, file, pos, desc.length, FUSE_READ);
881  	res = fuse_simple_request(fm, &ia.ap.args);
882  	if (res < 0)
883  		return res;
884  	/*
885  	 * Short read means EOF.  If file size is larger, truncate it
886  	 */
887  	if (res < desc.length)
888  		fuse_short_read(inode, attr_ver, res, &ia.ap);
889  
890  	SetPageUptodate(page);
891  
892  	return 0;
893  }
894  
895  static int fuse_read_folio(struct file *file, struct folio *folio)
896  {
897  	struct page *page = &folio->page;
898  	struct inode *inode = page->mapping->host;
899  	int err;
900  
901  	err = -EIO;
902  	if (fuse_is_bad(inode))
903  		goto out;
904  
905  	err = fuse_do_readpage(file, page);
906  	fuse_invalidate_atime(inode);
907   out:
908  	unlock_page(page);
909  	return err;
910  }
911  
912  static void fuse_readpages_end(struct fuse_mount *fm, struct fuse_args *args,
913  			       int err)
914  {
915  	int i;
916  	struct fuse_io_args *ia = container_of(args, typeof(*ia), ap.args);
917  	struct fuse_args_pages *ap = &ia->ap;
918  	size_t count = ia->read.in.size;
919  	size_t num_read = args->out_args[0].size;
920  	struct address_space *mapping = NULL;
921  
922  	for (i = 0; mapping == NULL && i < ap->num_pages; i++)
923  		mapping = ap->pages[i]->mapping;
924  
925  	if (mapping) {
926  		struct inode *inode = mapping->host;
927  
928  		/*
929  		 * Short read means EOF. If file size is larger, truncate it
930  		 */
931  		if (!err && num_read < count)
932  			fuse_short_read(inode, ia->read.attr_ver, num_read, ap);
933  
934  		fuse_invalidate_atime(inode);
935  	}
936  
937  	for (i = 0; i < ap->num_pages; i++) {
938  		struct page *page = ap->pages[i];
939  
940  		if (!err)
941  			SetPageUptodate(page);
942  		else
943  			SetPageError(page);
944  		unlock_page(page);
945  		put_page(page);
946  	}
947  	if (ia->ff)
948  		fuse_file_put(ia->ff, false);
949  
950  	fuse_io_free(ia);
951  }
952  
953  static void fuse_send_readpages(struct fuse_io_args *ia, struct file *file)
954  {
955  	struct fuse_file *ff = file->private_data;
956  	struct fuse_mount *fm = ff->fm;
957  	struct fuse_args_pages *ap = &ia->ap;
958  	loff_t pos = page_offset(ap->pages[0]);
959  	size_t count = ap->num_pages << PAGE_SHIFT;
960  	ssize_t res;
961  	int err;
962  
963  	ap->args.out_pages = true;
964  	ap->args.page_zeroing = true;
965  	ap->args.page_replace = true;
966  
967  	/* Don't overflow end offset */
968  	if (pos + (count - 1) == LLONG_MAX) {
969  		count--;
970  		ap->descs[ap->num_pages - 1].length--;
971  	}
972  	WARN_ON((loff_t) (pos + count) < 0);
973  
974  	fuse_read_args_fill(ia, file, pos, count, FUSE_READ);
975  	ia->read.attr_ver = fuse_get_attr_version(fm->fc);
976  	if (fm->fc->async_read) {
977  		ia->ff = fuse_file_get(ff);
978  		ap->args.end = fuse_readpages_end;
979  		err = fuse_simple_background(fm, &ap->args, GFP_KERNEL);
980  		if (!err)
981  			return;
982  	} else {
983  		res = fuse_simple_request(fm, &ap->args);
984  		err = res < 0 ? res : 0;
985  	}
986  	fuse_readpages_end(fm, &ap->args, err);
987  }
988  
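/*
 * Batch the readahead window into FUSE_READ requests of at most max_pages
 * pages each (further limited by max_read), sent in the background when the
 * server supports async reads.
 */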
989  static void fuse_readahead(struct readahead_control *rac)
990  {
991  	struct inode *inode = rac->mapping->host;
992  	struct fuse_conn *fc = get_fuse_conn(inode);
993  	unsigned int i, max_pages, nr_pages = 0;
994  
995  	if (fuse_is_bad(inode))
996  		return;
997  
998  	max_pages = min_t(unsigned int, fc->max_pages,
999  			fc->max_read / PAGE_SIZE);
1000  
1001  	for (;;) {
1002  		struct fuse_io_args *ia;
1003  		struct fuse_args_pages *ap;
1004  
1005  		if (fc->num_background >= fc->congestion_threshold &&
1006  		    rac->ra->async_size >= readahead_count(rac))
1007  			/*
1008  			 * Congested and only async pages left, so skip the
1009  			 * rest.
1010  			 */
1011  			break;
1012  
1013  		nr_pages = readahead_count(rac) - nr_pages;
1014  		if (nr_pages > max_pages)
1015  			nr_pages = max_pages;
1016  		if (nr_pages == 0)
1017  			break;
1018  		ia = fuse_io_alloc(NULL, nr_pages);
1019  		if (!ia)
1020  			return;
1021  		ap = &ia->ap;
1022  		nr_pages = __readahead_batch(rac, ap->pages, nr_pages);
1023  		for (i = 0; i < nr_pages; i++) {
1024  			fuse_wait_on_page_writeback(inode,
1025  						    readahead_index(rac) + i);
1026  			ap->descs[i].length = PAGE_SIZE;
1027  		}
1028  		ap->num_pages = nr_pages;
1029  		fuse_send_readpages(ia, rac->file);
1030  	}
1031  }
1032  
1033  static ssize_t fuse_cache_read_iter(struct kiocb *iocb, struct iov_iter *to)
1034  {
1035  	struct inode *inode = iocb->ki_filp->f_mapping->host;
1036  	struct fuse_conn *fc = get_fuse_conn(inode);
1037  
1038  	/*
1039  	 * In auto invalidate mode, always update attributes on read.
1040  	 * Otherwise, only update if we attempt to read past EOF (to ensure
1041  	 * i_size is up to date).
1042  	 */
1043  	if (fc->auto_inval_data ||
1044  	    (iocb->ki_pos + iov_iter_count(to) > i_size_read(inode))) {
1045  		int err;
1046  		err = fuse_update_attributes(inode, iocb->ki_filp, STATX_SIZE);
1047  		if (err)
1048  			return err;
1049  	}
1050  
1051  	return generic_file_read_iter(iocb, to);
1052  }
1053  
1054  static void fuse_write_args_fill(struct fuse_io_args *ia, struct fuse_file *ff,
1055  				 loff_t pos, size_t count)
1056  {
1057  	struct fuse_args *args = &ia->ap.args;
1058  
1059  	ia->write.in.fh = ff->fh;
1060  	ia->write.in.offset = pos;
1061  	ia->write.in.size = count;
1062  	args->opcode = FUSE_WRITE;
1063  	args->nodeid = ff->nodeid;
1064  	args->in_numargs = 2;
1065  	if (ff->fm->fc->minor < 9)
1066  		args->in_args[0].size = FUSE_COMPAT_WRITE_IN_SIZE;
1067  	else
1068  		args->in_args[0].size = sizeof(ia->write.in);
1069  	args->in_args[0].value = &ia->write.in;
1070  	args->in_args[1].size = count;
1071  	args->out_numargs = 1;
1072  	args->out_args[0].size = sizeof(ia->write.out);
1073  	args->out_args[0].value = &ia->write.out;
1074  }
1075  
1076  static unsigned int fuse_write_flags(struct kiocb *iocb)
1077  {
1078  	unsigned int flags = iocb->ki_filp->f_flags;
1079  
1080  	if (iocb_is_dsync(iocb))
1081  		flags |= O_DSYNC;
1082  	if (iocb->ki_flags & IOCB_SYNC)
1083  		flags |= O_SYNC;
1084  
1085  	return flags;
1086  }
1087  
1088  static ssize_t fuse_send_write(struct fuse_io_args *ia, loff_t pos,
1089  			       size_t count, fl_owner_t owner)
1090  {
1091  	struct kiocb *iocb = ia->io->iocb;
1092  	struct file *file = iocb->ki_filp;
1093  	struct fuse_file *ff = file->private_data;
1094  	struct fuse_mount *fm = ff->fm;
1095  	struct fuse_write_in *inarg = &ia->write.in;
1096  	ssize_t err;
1097  
1098  	fuse_write_args_fill(ia, ff, pos, count);
1099  	inarg->flags = fuse_write_flags(iocb);
1100  	if (owner != NULL) {
1101  		inarg->write_flags |= FUSE_WRITE_LOCKOWNER;
1102  		inarg->lock_owner = fuse_lock_owner_id(fm->fc, owner);
1103  	}
1104  
1105  	if (ia->io->async)
1106  		return fuse_async_req_send(fm, ia, count);
1107  
1108  	err = fuse_simple_request(fm, &ia->ap.args);
1109  	if (!err && ia->write.out.size > count)
1110  		err = -EIO;
1111  
1112  	return err ?: ia->write.out.size;
1113  }
1114  
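/*
 * Bump attr_version and extend i_size after a write that went past EOF.
 * Returns true if i_size was updated; size/mtime attributes are invalidated
 * in either case.
 */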
1115  bool fuse_write_update_attr(struct inode *inode, loff_t pos, ssize_t written)
1116  {
1117  	struct fuse_conn *fc = get_fuse_conn(inode);
1118  	struct fuse_inode *fi = get_fuse_inode(inode);
1119  	bool ret = false;
1120  
1121  	spin_lock(&fi->lock);
1122  	fi->attr_version = atomic64_inc_return(&fc->attr_version);
1123  	if (written > 0 && pos > inode->i_size) {
1124  		i_size_write(inode, pos);
1125  		ret = true;
1126  	}
1127  	spin_unlock(&fi->lock);
1128  
1129  	fuse_invalidate_attr_mask(inode, FUSE_STATX_MODSIZE);
1130  
1131  	return ret;
1132  }
1133  
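/*
 * Send one FUSE_WRITE request covering the pages gathered by
 * fuse_fill_write_pages() and release the page references, clearing the
 * uptodate flag on pages that were not written in full.
 */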
1134  static ssize_t fuse_send_write_pages(struct fuse_io_args *ia,
1135  				     struct kiocb *iocb, struct inode *inode,
1136  				     loff_t pos, size_t count)
1137  {
1138  	struct fuse_args_pages *ap = &ia->ap;
1139  	struct file *file = iocb->ki_filp;
1140  	struct fuse_file *ff = file->private_data;
1141  	struct fuse_mount *fm = ff->fm;
1142  	unsigned int offset, i;
1143  	bool short_write;
1144  	int err;
1145  
1146  	for (i = 0; i < ap->num_pages; i++)
1147  		fuse_wait_on_page_writeback(inode, ap->pages[i]->index);
1148  
1149  	fuse_write_args_fill(ia, ff, pos, count);
1150  	ia->write.in.flags = fuse_write_flags(iocb);
1151  	if (fm->fc->handle_killpriv_v2 && !capable(CAP_FSETID))
1152  		ia->write.in.write_flags |= FUSE_WRITE_KILL_SUIDGID;
1153  
1154  	err = fuse_simple_request(fm, &ap->args);
1155  	if (!err && ia->write.out.size > count)
1156  		err = -EIO;
1157  
1158  	short_write = ia->write.out.size < count;
1159  	offset = ap->descs[0].offset;
1160  	count = ia->write.out.size;
1161  	for (i = 0; i < ap->num_pages; i++) {
1162  		struct page *page = ap->pages[i];
1163  
1164  		if (err) {
1165  			ClearPageUptodate(page);
1166  		} else {
1167  			if (count >= PAGE_SIZE - offset)
1168  				count -= PAGE_SIZE - offset;
1169  			else {
1170  				if (short_write)
1171  					ClearPageUptodate(page);
1172  				count = 0;
1173  			}
1174  			offset = 0;
1175  		}
1176  		if (ia->write.page_locked && (i == ap->num_pages - 1))
1177  			unlock_page(page);
1178  		put_page(page);
1179  	}
1180  
1181  	return err;
1182  }
1183  
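/*
 * Grab page-cache pages for the write range and copy data into them from the
 * iov_iter, up to max_pages pages or max_write bytes, building up ap->pages.
 */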
1184  static ssize_t fuse_fill_write_pages(struct fuse_io_args *ia,
1185  				     struct address_space *mapping,
1186  				     struct iov_iter *ii, loff_t pos,
1187  				     unsigned int max_pages)
1188  {
1189  	struct fuse_args_pages *ap = &ia->ap;
1190  	struct fuse_conn *fc = get_fuse_conn(mapping->host);
1191  	unsigned offset = pos & (PAGE_SIZE - 1);
1192  	size_t count = 0;
1193  	int err;
1194  
1195  	ap->args.in_pages = true;
1196  	ap->descs[0].offset = offset;
1197  
1198  	do {
1199  		size_t tmp;
1200  		struct page *page;
1201  		pgoff_t index = pos >> PAGE_SHIFT;
1202  		size_t bytes = min_t(size_t, PAGE_SIZE - offset,
1203  				     iov_iter_count(ii));
1204  
1205  		bytes = min_t(size_t, bytes, fc->max_write - count);
1206  
1207   again:
1208  		err = -EFAULT;
1209  		if (fault_in_iov_iter_readable(ii, bytes))
1210  			break;
1211  
1212  		err = -ENOMEM;
1213  		page = grab_cache_page_write_begin(mapping, index);
1214  		if (!page)
1215  			break;
1216  
1217  		if (mapping_writably_mapped(mapping))
1218  			flush_dcache_page(page);
1219  
1220  		tmp = copy_page_from_iter_atomic(page, offset, bytes, ii);
1221  		flush_dcache_page(page);
1222  
1223  		if (!tmp) {
1224  			unlock_page(page);
1225  			put_page(page);
1226  			goto again;
1227  		}
1228  
1229  		err = 0;
1230  		ap->pages[ap->num_pages] = page;
1231  		ap->descs[ap->num_pages].length = tmp;
1232  		ap->num_pages++;
1233  
1234  		count += tmp;
1235  		pos += tmp;
1236  		offset += tmp;
1237  		if (offset == PAGE_SIZE)
1238  			offset = 0;
1239  
1240  		/* If we copied a full page, mark it uptodate */
1241  		if (tmp == PAGE_SIZE)
1242  			SetPageUptodate(page);
1243  
1244  		if (PageUptodate(page)) {
1245  			unlock_page(page);
1246  		} else {
1247  			ia->write.page_locked = true;
1248  			break;
1249  		}
1250  		if (!fc->big_writes)
1251  			break;
1252  	} while (iov_iter_count(ii) && count < fc->max_write &&
1253  		 ap->num_pages < max_pages && offset == 0);
1254  
1255  	return count > 0 ? count : err;
1256  }
1257  
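/*
 * Number of page-cache pages a write of @len bytes at @pos will touch,
 * clamped to @max_pages.  E.g. with 4K pages, pos = 4094 and len = 10 spans
 * pages 0 and 1, so this returns min(2, max_pages).
 */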
1258  static inline unsigned int fuse_wr_pages(loff_t pos, size_t len,
1259  				     unsigned int max_pages)
1260  {
1261  	return min_t(unsigned int,
1262  		     ((pos + len - 1) >> PAGE_SHIFT) -
1263  		     (pos >> PAGE_SHIFT) + 1,
1264  		     max_pages);
1265  }
1266  
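/*
 * Write-through path: copy data from the iov_iter into page-cache pages and
 * push them to the server in FUSE_WRITE requests, then update i_size.
 */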
1267  static ssize_t fuse_perform_write(struct kiocb *iocb, struct iov_iter *ii)
1268  {
1269  	struct address_space *mapping = iocb->ki_filp->f_mapping;
1270  	struct inode *inode = mapping->host;
1271  	struct fuse_conn *fc = get_fuse_conn(inode);
1272  	struct fuse_inode *fi = get_fuse_inode(inode);
1273  	loff_t pos = iocb->ki_pos;
1274  	int err = 0;
1275  	ssize_t res = 0;
1276  
1277  	if (inode->i_size < pos + iov_iter_count(ii))
1278  		set_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
1279  
1280  	do {
1281  		ssize_t count;
1282  		struct fuse_io_args ia = {};
1283  		struct fuse_args_pages *ap = &ia.ap;
1284  		unsigned int nr_pages = fuse_wr_pages(pos, iov_iter_count(ii),
1285  						      fc->max_pages);
1286  
1287  		ap->pages = fuse_pages_alloc(nr_pages, GFP_KERNEL, &ap->descs);
1288  		if (!ap->pages) {
1289  			err = -ENOMEM;
1290  			break;
1291  		}
1292  
1293  		count = fuse_fill_write_pages(&ia, mapping, ii, pos, nr_pages);
1294  		if (count <= 0) {
1295  			err = count;
1296  		} else {
1297  			err = fuse_send_write_pages(&ia, iocb, inode,
1298  						    pos, count);
1299  			if (!err) {
1300  				size_t num_written = ia.write.out.size;
1301  
1302  				res += num_written;
1303  				pos += num_written;
1304  
1305  				/* break out of the loop on short write */
1306  				if (num_written != count)
1307  					err = -EIO;
1308  			}
1309  		}
1310  		kfree(ap->pages);
1311  	} while (!err && iov_iter_count(ii));
1312  
1313  	fuse_write_update_attr(inode, pos, res);
1314  	clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
1315  
1316  	if (!res)
1317  		return err;
1318  	iocb->ki_pos += res;
1319  	return res;
1320  }
1321  
1322  static bool fuse_io_past_eof(struct kiocb *iocb, struct iov_iter *iter)
1323  {
1324  	struct inode *inode = file_inode(iocb->ki_filp);
1325  
1326  	return iocb->ki_pos + iov_iter_count(iter) > i_size_read(inode);
1327  }
1328  
1329  /*
1330   * @return true if an exclusive lock for direct IO writes is needed
1331   */
1332  static bool fuse_dio_wr_exclusive_lock(struct kiocb *iocb, struct iov_iter *from)
1333  {
1334  	struct file *file = iocb->ki_filp;
1335  	struct fuse_file *ff = file->private_data;
1336  	struct inode *inode = file_inode(iocb->ki_filp);
1337  	struct fuse_inode *fi = get_fuse_inode(inode);
1338  
1339  	/* Server side has to advise that it supports parallel dio writes. */
1340  	if (!(ff->open_flags & FOPEN_PARALLEL_DIRECT_WRITES))
1341  		return true;
1342  
1343  	/*
1344  	 * Append will need to know the eventual EOF - always needs an
1345  	 * exclusive lock.
1346  	 */
1347  	if (iocb->ki_flags & IOCB_APPEND)
1348  		return true;
1349  
1350  	/* shared locks are not allowed with parallel page cache IO */
1351  	if (test_bit(FUSE_I_CACHE_IO_MODE, &fi->state))
1352  		return false;
1353  
1354  	/* Parallel dio beyond EOF is not supported, at least for now. */
1355  	if (fuse_io_past_eof(iocb, from))
1356  		return true;
1357  
1358  	return false;
1359  }
1360  
1361  static void fuse_dio_lock(struct kiocb *iocb, struct iov_iter *from,
1362  			  bool *exclusive)
1363  {
1364  	struct inode *inode = file_inode(iocb->ki_filp);
1365  	struct fuse_inode *fi = get_fuse_inode(inode);
1366  
1367  	*exclusive = fuse_dio_wr_exclusive_lock(iocb, from);
1368  	if (*exclusive) {
1369  		inode_lock(inode);
1370  	} else {
1371  		inode_lock_shared(inode);
1372  		/*
1373  		 * New parallel dio is allowed only if the inode is not in
1374  		 * caching mode, and it denies new opens in caching mode. This
1375  		 * check should be performed only after taking the shared inode
1376  		 * lock. The previous past-EOF check was done without the inode
1377  		 * lock and might have raced, so check it again.
1378  		 */
1379  		if (fuse_io_past_eof(iocb, from) ||
1380  		    fuse_inode_uncached_io_start(fi, NULL) != 0) {
1381  			inode_unlock_shared(inode);
1382  			inode_lock(inode);
1383  			*exclusive = true;
1384  		}
1385  	}
1386  }
1387  
1388  static void fuse_dio_unlock(struct kiocb *iocb, bool exclusive)
1389  {
1390  	struct inode *inode = file_inode(iocb->ki_filp);
1391  	struct fuse_inode *fi = get_fuse_inode(inode);
1392  
1393  	if (exclusive) {
1394  		inode_unlock(inode);
1395  	} else {
1396  		/* Allow opens in caching mode after last parallel dio end */
1397  		fuse_inode_uncached_io_end(fi);
1398  		inode_unlock_shared(inode);
1399  	}
1400  }
1401  
1402  static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from)
1403  {
1404  	struct file *file = iocb->ki_filp;
1405  	struct address_space *mapping = file->f_mapping;
1406  	ssize_t written = 0;
1407  	struct inode *inode = mapping->host;
1408  	ssize_t err, count;
1409  	struct fuse_conn *fc = get_fuse_conn(inode);
1410  
1411  	if (fc->writeback_cache) {
1412  		/* Update size (EOF optimization) and mode (SUID clearing) */
1413  		err = fuse_update_attributes(mapping->host, file,
1414  					     STATX_SIZE | STATX_MODE);
1415  		if (err)
1416  			return err;
1417  
1418  		if (fc->handle_killpriv_v2 &&
1419  		    setattr_should_drop_suidgid(&nop_mnt_idmap,
1420  						file_inode(file))) {
1421  			goto writethrough;
1422  		}
1423  
1424  		return generic_file_write_iter(iocb, from);
1425  	}
1426  
1427  writethrough:
1428  	inode_lock(inode);
1429  
1430  	err = count = generic_write_checks(iocb, from);
1431  	if (err <= 0)
1432  		goto out;
1433  
1434  	task_io_account_write(count);
1435  
1436  	err = file_remove_privs(file);
1437  	if (err)
1438  		goto out;
1439  
1440  	err = file_update_time(file);
1441  	if (err)
1442  		goto out;
1443  
1444  	if (iocb->ki_flags & IOCB_DIRECT) {
1445  		written = generic_file_direct_write(iocb, from);
1446  		if (written < 0 || !iov_iter_count(from))
1447  			goto out;
1448  		written = direct_write_fallback(iocb, from, written,
1449  				fuse_perform_write(iocb, from));
1450  	} else {
1451  		written = fuse_perform_write(iocb, from);
1452  	}
1453  out:
1454  	inode_unlock(inode);
1455  	if (written > 0)
1456  		written = generic_write_sync(iocb, written);
1457  
1458  	return written ? written : err;
1459  }
1460  
1461  static inline unsigned long fuse_get_user_addr(const struct iov_iter *ii)
1462  {
1463  	return (unsigned long)iter_iov(ii)->iov_base + ii->iov_offset;
1464  }
1465  
1466  static inline size_t fuse_get_frag_size(const struct iov_iter *ii,
1467  					size_t max_size)
1468  {
1469  	return min(iov_iter_single_seg_count(ii), max_size);
1470  }
1471  
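/*
 * Extract (and possibly pin) the user pages backing the next chunk of the
 * iov_iter into ap->pages; kvec iterators are passed through as plain kernel
 * pointers instead.
 */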
1472  static int fuse_get_user_pages(struct fuse_args_pages *ap, struct iov_iter *ii,
1473  			       size_t *nbytesp, int write,
1474  			       unsigned int max_pages)
1475  {
1476  	size_t nbytes = 0;  /* # bytes already packed in req */
1477  	ssize_t ret = 0;
1478  
1479  	/* Special case for kernel I/O: can copy directly into the buffer */
1480  	if (iov_iter_is_kvec(ii)) {
1481  		unsigned long user_addr = fuse_get_user_addr(ii);
1482  		size_t frag_size = fuse_get_frag_size(ii, *nbytesp);
1483  
1484  		if (write)
1485  			ap->args.in_args[1].value = (void *) user_addr;
1486  		else
1487  			ap->args.out_args[0].value = (void *) user_addr;
1488  
1489  		iov_iter_advance(ii, frag_size);
1490  		*nbytesp = frag_size;
1491  		return 0;
1492  	}
1493  
1494  	while (nbytes < *nbytesp && ap->num_pages < max_pages) {
1495  		unsigned npages;
1496  		size_t start;
1497  		struct page **pt_pages;
1498  
1499  		pt_pages = &ap->pages[ap->num_pages];
1500  		ret = iov_iter_extract_pages(ii, &pt_pages,
1501  					     *nbytesp - nbytes,
1502  					     max_pages - ap->num_pages,
1503  					     0, &start);
1504  		if (ret < 0)
1505  			break;
1506  
1507  		nbytes += ret;
1508  
1509  		ret += start;
1510  		npages = DIV_ROUND_UP(ret, PAGE_SIZE);
1511  
1512  		ap->descs[ap->num_pages].offset = start;
1513  		fuse_page_descs_length_init(ap->descs, ap->num_pages, npages);
1514  
1515  		ap->num_pages += npages;
1516  		ap->descs[ap->num_pages - 1].length -=
1517  			(PAGE_SIZE - ret) & (PAGE_SIZE - 1);
1518  	}
1519  
1520  	ap->args.is_pinned = iov_iter_extract_will_pin(ii);
1521  	ap->args.user_pages = true;
1522  	if (write)
1523  		ap->args.in_pages = true;
1524  	else
1525  		ap->args.out_pages = true;
1526  
1527  	*nbytesp = nbytes;
1528  
1529  	return ret < 0 ? ret : 0;
1530  }
1531  
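/*
 * Core direct I/O loop: pin the user pages backing @iter and send them in
 * FUSE_READ/FUSE_WRITE requests of at most max_read/max_write bytes each,
 * either synchronously or as background requests for async (AIO) contexts.
 */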
1532  ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter,
1533  		       loff_t *ppos, int flags)
1534  {
1535  	int write = flags & FUSE_DIO_WRITE;
1536  	int cuse = flags & FUSE_DIO_CUSE;
1537  	struct file *file = io->iocb->ki_filp;
1538  	struct address_space *mapping = file->f_mapping;
1539  	struct inode *inode = mapping->host;
1540  	struct fuse_file *ff = file->private_data;
1541  	struct fuse_conn *fc = ff->fm->fc;
1542  	size_t nmax = write ? fc->max_write : fc->max_read;
1543  	loff_t pos = *ppos;
1544  	size_t count = iov_iter_count(iter);
1545  	pgoff_t idx_from = pos >> PAGE_SHIFT;
1546  	pgoff_t idx_to = (pos + count - 1) >> PAGE_SHIFT;
1547  	ssize_t res = 0;
1548  	int err = 0;
1549  	struct fuse_io_args *ia;
1550  	unsigned int max_pages;
1551  	bool fopen_direct_io = ff->open_flags & FOPEN_DIRECT_IO;
1552  
1553  	max_pages = iov_iter_npages(iter, fc->max_pages);
1554  	ia = fuse_io_alloc(io, max_pages);
1555  	if (!ia)
1556  		return -ENOMEM;
1557  
1558  	if (fopen_direct_io && fc->direct_io_allow_mmap) {
1559  		res = filemap_write_and_wait_range(mapping, pos, pos + count - 1);
1560  		if (res) {
1561  			fuse_io_free(ia);
1562  			return res;
1563  		}
1564  	}
1565  	if (!cuse && fuse_range_is_writeback(inode, idx_from, idx_to)) {
1566  		if (!write)
1567  			inode_lock(inode);
1568  		fuse_sync_writes(inode);
1569  		if (!write)
1570  			inode_unlock(inode);
1571  	}
1572  
1573  	if (fopen_direct_io && write) {
1574  		res = invalidate_inode_pages2_range(mapping, idx_from, idx_to);
1575  		if (res) {
1576  			fuse_io_free(ia);
1577  			return res;
1578  		}
1579  	}
1580  
1581  	io->should_dirty = !write && user_backed_iter(iter);
1582  	while (count) {
1583  		ssize_t nres;
1584  		fl_owner_t owner = current->files;
1585  		size_t nbytes = min(count, nmax);
1586  
1587  		err = fuse_get_user_pages(&ia->ap, iter, &nbytes, write,
1588  					  max_pages);
1589  		if (err && !nbytes)
1590  			break;
1591  
1592  		if (write) {
1593  			if (!capable(CAP_FSETID))
1594  				ia->write.in.write_flags |= FUSE_WRITE_KILL_SUIDGID;
1595  
1596  			nres = fuse_send_write(ia, pos, nbytes, owner);
1597  		} else {
1598  			nres = fuse_send_read(ia, pos, nbytes, owner);
1599  		}
1600  
1601  		if (!io->async || nres < 0) {
1602  			fuse_release_user_pages(&ia->ap, io->should_dirty);
1603  			fuse_io_free(ia);
1604  		}
1605  		ia = NULL;
1606  		if (nres < 0) {
1607  			iov_iter_revert(iter, nbytes);
1608  			err = nres;
1609  			break;
1610  		}
1611  		WARN_ON(nres > nbytes);
1612  
1613  		count -= nres;
1614  		res += nres;
1615  		pos += nres;
1616  		if (nres != nbytes) {
1617  			iov_iter_revert(iter, nbytes - nres);
1618  			break;
1619  		}
1620  		if (count) {
1621  			max_pages = iov_iter_npages(iter, fc->max_pages);
1622  			ia = fuse_io_alloc(io, max_pages);
1623  			if (!ia)
1624  				break;
1625  		}
1626  	}
1627  	if (ia)
1628  		fuse_io_free(ia);
1629  	if (res > 0)
1630  		*ppos = pos;
1631  
1632  	return res > 0 ? res : err;
1633  }
1634  EXPORT_SYMBOL_GPL(fuse_direct_io);
1635  
1636  static ssize_t __fuse_direct_read(struct fuse_io_priv *io,
1637  				  struct iov_iter *iter,
1638  				  loff_t *ppos)
1639  {
1640  	ssize_t res;
1641  	struct inode *inode = file_inode(io->iocb->ki_filp);
1642  
1643  	res = fuse_direct_io(io, iter, ppos, 0);
1644  
1645  	fuse_invalidate_atime(inode);
1646  
1647  	return res;
1648  }
1649  
1650  static ssize_t fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter);
1651  
1652  static ssize_t fuse_direct_read_iter(struct kiocb *iocb, struct iov_iter *to)
1653  {
1654  	ssize_t res;
1655  
1656  	if (!is_sync_kiocb(iocb) && iocb->ki_flags & IOCB_DIRECT) {
1657  		res = fuse_direct_IO(iocb, to);
1658  	} else {
1659  		struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb);
1660  
1661  		res = __fuse_direct_read(&io, to, &iocb->ki_pos);
1662  	}
1663  
1664  	return res;
1665  }
1666  
1667  static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from)
1668  {
1669  	struct inode *inode = file_inode(iocb->ki_filp);
1670  	struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb);
1671  	ssize_t res;
1672  	bool exclusive;
1673  
1674  	fuse_dio_lock(iocb, from, &exclusive);
1675  	res = generic_write_checks(iocb, from);
1676  	if (res > 0) {
1677  		task_io_account_write(res);
1678  		if (!is_sync_kiocb(iocb) && iocb->ki_flags & IOCB_DIRECT) {
1679  			res = fuse_direct_IO(iocb, from);
1680  		} else {
1681  			res = fuse_direct_io(&io, from, &iocb->ki_pos,
1682  					     FUSE_DIO_WRITE);
1683  			fuse_write_update_attr(inode, iocb->ki_pos, res);
1684  		}
1685  	}
1686  	fuse_dio_unlock(iocb, exclusive);
1687  
1688  	return res;
1689  }
1690  
1691  static ssize_t fuse_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
1692  {
1693  	struct file *file = iocb->ki_filp;
1694  	struct fuse_file *ff = file->private_data;
1695  	struct inode *inode = file_inode(file);
1696  
1697  	if (fuse_is_bad(inode))
1698  		return -EIO;
1699  
1700  	if (FUSE_IS_DAX(inode))
1701  		return fuse_dax_read_iter(iocb, to);
1702  
1703  	/* FOPEN_DIRECT_IO overrides FOPEN_PASSTHROUGH */
1704  	if (ff->open_flags & FOPEN_DIRECT_IO)
1705  		return fuse_direct_read_iter(iocb, to);
1706  	else if (fuse_file_passthrough(ff))
1707  		return fuse_passthrough_read_iter(iocb, to);
1708  	else
1709  		return fuse_cache_read_iter(iocb, to);
1710  }
1711  
1712  static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
1713  {
1714  	struct file *file = iocb->ki_filp;
1715  	struct fuse_file *ff = file->private_data;
1716  	struct inode *inode = file_inode(file);
1717  
1718  	if (fuse_is_bad(inode))
1719  		return -EIO;
1720  
1721  	if (FUSE_IS_DAX(inode))
1722  		return fuse_dax_write_iter(iocb, from);
1723  
1724  	/* FOPEN_DIRECT_IO overrides FOPEN_PASSTHROUGH */
1725  	if (ff->open_flags & FOPEN_DIRECT_IO)
1726  		return fuse_direct_write_iter(iocb, from);
1727  	else if (fuse_file_passthrough(ff))
1728  		return fuse_passthrough_write_iter(iocb, from);
1729  	else
1730  		return fuse_cache_write_iter(iocb, from);
1731  }
1732  
1733  static ssize_t fuse_splice_read(struct file *in, loff_t *ppos,
1734  				struct pipe_inode_info *pipe, size_t len,
1735  				unsigned int flags)
1736  {
1737  	struct fuse_file *ff = in->private_data;
1738  
1739  	/* FOPEN_DIRECT_IO overrides FOPEN_PASSTHROUGH */
1740  	if (fuse_file_passthrough(ff) && !(ff->open_flags & FOPEN_DIRECT_IO))
1741  		return fuse_passthrough_splice_read(in, ppos, pipe, len, flags);
1742  	else
1743  		return filemap_splice_read(in, ppos, pipe, len, flags);
1744  }
1745  
1746  static ssize_t fuse_splice_write(struct pipe_inode_info *pipe, struct file *out,
1747  				 loff_t *ppos, size_t len, unsigned int flags)
1748  {
1749  	struct fuse_file *ff = out->private_data;
1750  
1751  	/* FOPEN_DIRECT_IO overrides FOPEN_PASSTHROUGH */
1752  	if (fuse_file_passthrough(ff) && !(ff->open_flags & FOPEN_DIRECT_IO))
1753  		return fuse_passthrough_splice_write(pipe, out, ppos, len, flags);
1754  	else
1755  		return iter_file_splice_write(pipe, out, ppos, len, flags);
1756  }
1757  
1758  static void fuse_writepage_free(struct fuse_writepage_args *wpa)
1759  {
1760  	struct fuse_args_pages *ap = &wpa->ia.ap;
1761  	int i;
1762  
1763  	if (wpa->bucket)
1764  		fuse_sync_bucket_dec(wpa->bucket);
1765  
1766  	for (i = 0; i < ap->num_pages; i++)
1767  		__free_page(ap->pages[i]);
1768  
1769  	if (wpa->ia.ff)
1770  		fuse_file_put(wpa->ia.ff, false);
1771  
1772  	kfree(ap->pages);
1773  	kfree(wpa);
1774  }
1775  
1776  static void fuse_writepage_finish(struct fuse_mount *fm,
1777  				  struct fuse_writepage_args *wpa)
1778  {
1779  	struct fuse_args_pages *ap = &wpa->ia.ap;
1780  	struct inode *inode = wpa->inode;
1781  	struct fuse_inode *fi = get_fuse_inode(inode);
1782  	struct backing_dev_info *bdi = inode_to_bdi(inode);
1783  	int i;
1784  
1785  	for (i = 0; i < ap->num_pages; i++) {
1786  		dec_wb_stat(&bdi->wb, WB_WRITEBACK);
1787  		dec_node_page_state(ap->pages[i], NR_WRITEBACK_TEMP);
1788  		wb_writeout_inc(&bdi->wb);
1789  	}
1790  	wake_up(&fi->page_waitq);
1791  }
1792  
1793  /* Called under fi->lock, may release and reacquire it */
1794  static void fuse_send_writepage(struct fuse_mount *fm,
1795  				struct fuse_writepage_args *wpa, loff_t size)
1796  __releases(fi->lock)
1797  __acquires(fi->lock)
1798  {
1799  	struct fuse_writepage_args *aux, *next;
1800  	struct fuse_inode *fi = get_fuse_inode(wpa->inode);
1801  	struct fuse_write_in *inarg = &wpa->ia.write.in;
1802  	struct fuse_args *args = &wpa->ia.ap.args;
1803  	__u64 data_size = wpa->ia.ap.num_pages * PAGE_SIZE;
1804  	int err;
1805  
1806  	fi->writectr++;
1807  	if (inarg->offset + data_size <= size) {
1808  		inarg->size = data_size;
1809  	} else if (inarg->offset < size) {
1810  		inarg->size = size - inarg->offset;
1811  	} else {
1812  		/* Got truncated off completely */
1813  		goto out_free;
1814  	}
1815  
1816  	args->in_args[1].size = inarg->size;
1817  	args->force = true;
1818  	args->nocreds = true;
1819  
1820  	err = fuse_simple_background(fm, args, GFP_ATOMIC);
1821  	if (err == -ENOMEM) {
1822  		spin_unlock(&fi->lock);
1823  		err = fuse_simple_background(fm, args, GFP_NOFS | __GFP_NOFAIL);
1824  		spin_lock(&fi->lock);
1825  	}
1826  
1827  	/* Fails on broken connection only */
1828  	if (unlikely(err))
1829  		goto out_free;
1830  
1831  	return;
1832  
1833   out_free:
1834  	fi->writectr--;
1835  	rb_erase(&wpa->writepages_entry, &fi->writepages);
1836  	fuse_writepage_finish(fm, wpa);
1837  	spin_unlock(&fi->lock);
1838  
1839  	/* After fuse_writepage_finish() the aux request list is private */
1840  	for (aux = wpa->next; aux; aux = next) {
1841  		next = aux->next;
1842  		aux->next = NULL;
1843  		fuse_writepage_free(aux);
1844  	}
1845  
1846  	fuse_writepage_free(wpa);
1847  	spin_lock(&fi->lock);
1848  }
1849  
1850  /*
1851   * If fi->writectr is positive (no truncate or fsync going on) send
1852   * all queued writepage requests.
1853   *
1854   * Called with fi->lock
1855   */
1856  void fuse_flush_writepages(struct inode *inode)
1857  __releases(fi->lock)
1858  __acquires(fi->lock)
1859  {
1860  	struct fuse_mount *fm = get_fuse_mount(inode);
1861  	struct fuse_inode *fi = get_fuse_inode(inode);
1862  	loff_t crop = i_size_read(inode);
1863  	struct fuse_writepage_args *wpa;
1864  
1865  	while (fi->writectr >= 0 && !list_empty(&fi->queued_writes)) {
1866  		wpa = list_entry(fi->queued_writes.next,
1867  				 struct fuse_writepage_args, queue_entry);
1868  		list_del_init(&wpa->queue_entry);
1869  		fuse_send_writepage(fm, wpa, crop);
1870  	}
1871  }
1872  
1873  static struct fuse_writepage_args *fuse_insert_writeback(struct rb_root *root,
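/*
 * Insert @wpa into the per-inode writepages rb_tree, keyed by the page
 * index range it covers.  If an existing request already overlaps that
 * range, nothing is inserted and the overlapping request is returned.
 * For example (assuming 4K pages), a request at offset 40960 covering 4
 * pages spans indexes 10..13; a new single-page request for index 12
 * overlaps it, so the existing request is returned instead.
 */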
1874  						struct fuse_writepage_args *wpa)
1875  {
1876  	pgoff_t idx_from = wpa->ia.write.in.offset >> PAGE_SHIFT;
1877  	pgoff_t idx_to = idx_from + wpa->ia.ap.num_pages - 1;
1878  	struct rb_node **p = &root->rb_node;
1879  	struct rb_node  *parent = NULL;
1880  
1881  	WARN_ON(!wpa->ia.ap.num_pages);
1882  	while (*p) {
1883  		struct fuse_writepage_args *curr;
1884  		pgoff_t curr_index;
1885  
1886  		parent = *p;
1887  		curr = rb_entry(parent, struct fuse_writepage_args,
1888  				writepages_entry);
1889  		WARN_ON(curr->inode != wpa->inode);
1890  		curr_index = curr->ia.write.in.offset >> PAGE_SHIFT;
1891  
1892  		if (idx_from >= curr_index + curr->ia.ap.num_pages)
1893  			p = &(*p)->rb_right;
1894  		else if (idx_to < curr_index)
1895  			p = &(*p)->rb_left;
1896  		else
1897  			return curr;
1898  	}
1899  
1900  	rb_link_node(&wpa->writepages_entry, parent, p);
1901  	rb_insert_color(&wpa->writepages_entry, root);
1902  	return NULL;
1903  }
1904  
1905  static void tree_insert(struct rb_root *root, struct fuse_writepage_args *wpa)
1906  {
1907  	WARN_ON(fuse_insert_writeback(root, wpa));
1908  }
1909  
1910  static void fuse_writepage_end(struct fuse_mount *fm, struct fuse_args *args,
1911  			       int error)
1912  {
1913  	struct fuse_writepage_args *wpa =
1914  		container_of(args, typeof(*wpa), ia.ap.args);
1915  	struct inode *inode = wpa->inode;
1916  	struct fuse_inode *fi = get_fuse_inode(inode);
1917  	struct fuse_conn *fc = get_fuse_conn(inode);
1918  
1919  	mapping_set_error(inode->i_mapping, error);
1920  	/*
1921  	 * A writeback finished and this might have updated mtime/ctime on
1922  	 * server making local mtime/ctime stale.  Hence invalidate attrs.
1923  	 * Do this only if writeback_cache is not enabled.  If writeback_cache
1924  	 * is enabled, we trust local ctime/mtime.
1925  	 */
1926  	if (!fc->writeback_cache)
1927  		fuse_invalidate_attr_mask(inode, FUSE_STATX_MODIFY);
1928  	spin_lock(&fi->lock);
1929  	rb_erase(&wpa->writepages_entry, &fi->writepages);
1930  	while (wpa->next) {
1931  		struct fuse_mount *fm = get_fuse_mount(inode);
1932  		struct fuse_write_in *inarg = &wpa->ia.write.in;
1933  		struct fuse_writepage_args *next = wpa->next;
1934  
1935  		wpa->next = next->next;
1936  		next->next = NULL;
1937  		next->ia.ff = fuse_file_get(wpa->ia.ff);
1938  		tree_insert(&fi->writepages, next);
1939  
1940  		/*
1941  		 * Skip fuse_flush_writepages() to make it easy to crop requests
1942  		 * based on primary request size.
1943  		 *
1944  		 * 1st case (trivial): there are no concurrent activities using
1945  		 * fuse_set/release_nowrite.  Then we're on safe side because
1946  		 * fuse_flush_writepages() would call fuse_send_writepage()
1947  		 * anyway.
1948  		 *
1949  		 * 2nd case: someone called fuse_set_nowrite and it is waiting
1950  		 * now for completion of all in-flight requests.  This happens
1951  		 * rarely and no more than once per page, so this should be
1952  		 * okay.
1953  		 *
1954  		 * 3rd case: someone (e.g. fuse_do_setattr()) is in the middle
1955  		 * of fuse_set_nowrite..fuse_release_nowrite section.  The fact
1956  		 * that fuse_set_nowrite returned implies that all in-flight
1957  		 * requests were completed along with all of their secondary
1958  		 * requests.  Further primary requests are blocked by negative
1959  		 * writectr.  Hence there cannot be any in-flight requests and
1960  		 * no invocations of fuse_writepage_end() while we're in
1961  		 * fuse_set_nowrite..fuse_release_nowrite section.
1962  		 */
1963  		fuse_send_writepage(fm, next, inarg->offset + inarg->size);
1964  	}
1965  	fi->writectr--;
1966  	fuse_writepage_finish(fm, wpa);
1967  	spin_unlock(&fi->lock);
1968  	fuse_writepage_free(wpa);
1969  }
1970  
1971  static struct fuse_file *__fuse_write_file_get(struct fuse_inode *fi)
1972  {
1973  	struct fuse_file *ff;
1974  
1975  	spin_lock(&fi->lock);
1976  	ff = list_first_entry_or_null(&fi->write_files, struct fuse_file,
1977  				      write_entry);
1978  	if (ff)
1979  		fuse_file_get(ff);
1980  	spin_unlock(&fi->lock);
1981  
1982  	return ff;
1983  }
1984  
1985  static struct fuse_file *fuse_write_file_get(struct fuse_inode *fi)
1986  {
1987  	struct fuse_file *ff = __fuse_write_file_get(fi);
1988  	WARN_ON(!ff);
1989  	return ff;
1990  }
1991  
1992  int fuse_write_inode(struct inode *inode, struct writeback_control *wbc)
1993  {
1994  	struct fuse_inode *fi = get_fuse_inode(inode);
1995  	struct fuse_file *ff;
1996  	int err;
1997  
1998  	/*
1999  	 * Inode is always written before the last reference is dropped and
2000  	 * hence this should not be reached from reclaim.
2001  	 *
2002  	 * Writing back the inode from reclaim can deadlock if the request
2003  	 * processing itself needs an allocation.  Allocations triggering
2004  	 * reclaim while serving a request can't be prevented, because it can
2005  	 * involve any number of unrelated userspace processes.
2006  	 */
2007  	WARN_ON(wbc->for_reclaim);
2008  
2009  	ff = __fuse_write_file_get(fi);
2010  	err = fuse_flush_times(inode, ff);
2011  	if (ff)
2012  		fuse_file_put(ff, false);
2013  
2014  	return err;
2015  }
2016  
2017  static struct fuse_writepage_args *fuse_writepage_args_alloc(void)
2018  {
2019  	struct fuse_writepage_args *wpa;
2020  	struct fuse_args_pages *ap;
2021  
2022  	wpa = kzalloc(sizeof(*wpa), GFP_NOFS);
2023  	if (wpa) {
2024  		ap = &wpa->ia.ap;
2025  		ap->num_pages = 0;
2026  		ap->pages = fuse_pages_alloc(1, GFP_NOFS, &ap->descs);
2027  		if (!ap->pages) {
2028  			kfree(wpa);
2029  			wpa = NULL;
2030  		}
2031  	}
2032  	return wpa;
2033  
2034  }
2035  
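/*
 * Account this writepage request against the current syncfs bucket, so
 * that a concurrent syncfs can wait for writeback submitted before it.
 * The atomic_inc_not_zero() loop re-reads fc->curr_bucket under RCU
 * until a reference is taken on a bucket that has not already been
 * drained (i.e. whose count has not dropped to zero).
 */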
2036  static void fuse_writepage_add_to_bucket(struct fuse_conn *fc,
2037  					 struct fuse_writepage_args *wpa)
2038  {
2039  	if (!fc->sync_fs)
2040  		return;
2041  
2042  	rcu_read_lock();
2043  	/* Prevent resurrection of dead bucket in unlikely race with syncfs */
2044  	do {
2045  		wpa->bucket = rcu_dereference(fc->curr_bucket);
2046  	} while (unlikely(!atomic_inc_not_zero(&wpa->bucket->count)));
2047  	rcu_read_unlock();
2048  }
2049  
2050  static int fuse_writepage_locked(struct folio *folio)
2051  {
2052  	struct address_space *mapping = folio->mapping;
2053  	struct inode *inode = mapping->host;
2054  	struct fuse_conn *fc = get_fuse_conn(inode);
2055  	struct fuse_inode *fi = get_fuse_inode(inode);
2056  	struct fuse_writepage_args *wpa;
2057  	struct fuse_args_pages *ap;
2058  	struct folio *tmp_folio;
2059  	int error = -ENOMEM;
2060  
2061  	folio_start_writeback(folio);
2062  
2063  	wpa = fuse_writepage_args_alloc();
2064  	if (!wpa)
2065  		goto err;
2066  	ap = &wpa->ia.ap;
2067  
2068  	tmp_folio = folio_alloc(GFP_NOFS | __GFP_HIGHMEM, 0);
2069  	if (!tmp_folio)
2070  		goto err_free;
2071  
2072  	error = -EIO;
2073  	wpa->ia.ff = fuse_write_file_get(fi);
2074  	if (!wpa->ia.ff)
2075  		goto err_nofile;
2076  
2077  	fuse_writepage_add_to_bucket(fc, wpa);
2078  	fuse_write_args_fill(&wpa->ia, wpa->ia.ff, folio_pos(folio), 0);
2079  
2080  	folio_copy(tmp_folio, folio);
2081  	wpa->ia.write.in.write_flags |= FUSE_WRITE_CACHE;
2082  	wpa->next = NULL;
2083  	ap->args.in_pages = true;
2084  	ap->num_pages = 1;
2085  	ap->pages[0] = &tmp_folio->page;
2086  	ap->descs[0].offset = 0;
2087  	ap->descs[0].length = PAGE_SIZE;
2088  	ap->args.end = fuse_writepage_end;
2089  	wpa->inode = inode;
2090  
2091  	inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK);
2092  	node_stat_add_folio(tmp_folio, NR_WRITEBACK_TEMP);
2093  
2094  	spin_lock(&fi->lock);
2095  	tree_insert(&fi->writepages, wpa);
2096  	list_add_tail(&wpa->queue_entry, &fi->queued_writes);
2097  	fuse_flush_writepages(inode);
2098  	spin_unlock(&fi->lock);
2099  
2100  	folio_end_writeback(folio);
2101  
2102  	return 0;
2103  
2104  err_nofile:
2105  	folio_put(tmp_folio);
2106  err_free:
2107  	kfree(wpa);
2108  err:
2109  	mapping_set_error(folio->mapping, error);
2110  	folio_end_writeback(folio);
2111  	return error;
2112  }
2113  
2114  struct fuse_fill_wb_data {
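/*
 * State carried across one fuse_writepages() call: the request currently
 * being assembled (@wpa), the fuse_file used for all of the writes (@ff),
 * the original page pointers so that writeback on them can be ended once
 * the request has been queued, and the current capacity of the request's
 * page array (@max_pages).
 */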
2115  	struct fuse_writepage_args *wpa;
2116  	struct fuse_file *ff;
2117  	struct inode *inode;
2118  	struct page **orig_pages;
2119  	unsigned int max_pages;
2120  };
2121  
2122  static bool fuse_pages_realloc(struct fuse_fill_wb_data *data)
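/*
 * Grow the page array of the request being assembled.  The new size is
 * roughly double the old one, but at least FUSE_DEFAULT_MAX_PAGES_PER_REQ
 * (32 by default) and never more than fc->max_pages.  Starting from a
 * single-page request the capacity thus typically grows 1 -> 32 -> 64 ->
 * ... until it hits the connection limit.
 */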
2123  {
2124  	struct fuse_args_pages *ap = &data->wpa->ia.ap;
2125  	struct fuse_conn *fc = get_fuse_conn(data->inode);
2126  	struct page **pages;
2127  	struct fuse_page_desc *descs;
2128  	unsigned int npages = min_t(unsigned int,
2129  				    max_t(unsigned int, data->max_pages * 2,
2130  					  FUSE_DEFAULT_MAX_PAGES_PER_REQ),
2131  				    fc->max_pages);
2132  	WARN_ON(npages <= data->max_pages);
2133  
2134  	pages = fuse_pages_alloc(npages, GFP_NOFS, &descs);
2135  	if (!pages)
2136  		return false;
2137  
2138  	memcpy(pages, ap->pages, sizeof(struct page *) * ap->num_pages);
2139  	memcpy(descs, ap->descs, sizeof(struct fuse_page_desc) * ap->num_pages);
2140  	kfree(ap->pages);
2141  	ap->pages = pages;
2142  	ap->descs = descs;
2143  	data->max_pages = npages;
2144  
2145  	return true;
2146  }
2147  
2148  static void fuse_writepages_send(struct fuse_fill_wb_data *data)
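/*
 * Queue the assembled request on fi->queued_writes and kick
 * fuse_flush_writepages() under fi->lock.  The data has already been
 * copied into temporary pages, so writeback on the original pages can be
 * ended here; the temporary pages stay accounted as NR_WRITEBACK_TEMP
 * until the reply to the write request arrives.
 */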
2149  {
2150  	struct fuse_writepage_args *wpa = data->wpa;
2151  	struct inode *inode = data->inode;
2152  	struct fuse_inode *fi = get_fuse_inode(inode);
2153  	int num_pages = wpa->ia.ap.num_pages;
2154  	int i;
2155  
2156  	wpa->ia.ff = fuse_file_get(data->ff);
2157  	spin_lock(&fi->lock);
2158  	list_add_tail(&wpa->queue_entry, &fi->queued_writes);
2159  	fuse_flush_writepages(inode);
2160  	spin_unlock(&fi->lock);
2161  
2162  	for (i = 0; i < num_pages; i++)
2163  		end_page_writeback(data->orig_pages[i]);
2164  }
2165  
2166  /*
2167   * Check under fi->lock if the page is under writeback, and insert it onto the
2168   * rb_tree if not.  Otherwise iterate over the auxiliary write requests to
2169   * see if there's one already added for a page at this offset.  If none, insert
2170   * this new request onto the auxiliary list, otherwise reuse the existing one by
2171   * swapping the new temp page with the old one.
2172   */
2173  static bool fuse_writepage_add(struct fuse_writepage_args *new_wpa,
2174  			       struct page *page)
2175  {
2176  	struct fuse_inode *fi = get_fuse_inode(new_wpa->inode);
2177  	struct fuse_writepage_args *tmp;
2178  	struct fuse_writepage_args *old_wpa;
2179  	struct fuse_args_pages *new_ap = &new_wpa->ia.ap;
2180  
2181  	WARN_ON(new_ap->num_pages != 0);
2182  	new_ap->num_pages = 1;
2183  
2184  	spin_lock(&fi->lock);
2185  	old_wpa = fuse_insert_writeback(&fi->writepages, new_wpa);
2186  	if (!old_wpa) {
2187  		spin_unlock(&fi->lock);
2188  		return true;
2189  	}
2190  
2191  	for (tmp = old_wpa->next; tmp; tmp = tmp->next) {
2192  		pgoff_t curr_index;
2193  
2194  		WARN_ON(tmp->inode != new_wpa->inode);
2195  		curr_index = tmp->ia.write.in.offset >> PAGE_SHIFT;
2196  		if (curr_index == page->index) {
2197  			WARN_ON(tmp->ia.ap.num_pages != 1);
2198  			swap(tmp->ia.ap.pages[0], new_ap->pages[0]);
2199  			break;
2200  		}
2201  	}
2202  
2203  	if (!tmp) {
2204  		new_wpa->next = old_wpa->next;
2205  		old_wpa->next = new_wpa;
2206  	}
2207  
2208  	spin_unlock(&fi->lock);
2209  
2210  	if (tmp) {
2211  		struct backing_dev_info *bdi = inode_to_bdi(new_wpa->inode);
2212  
2213  		dec_wb_stat(&bdi->wb, WB_WRITEBACK);
2214  		dec_node_page_state(new_ap->pages[0], NR_WRITEBACK_TEMP);
2215  		wb_writeout_inc(&bdi->wb);
2216  		fuse_writepage_free(new_wpa);
2217  	}
2218  
2219  	return false;
2220  }
2221  
2222  static bool fuse_writepage_need_send(struct fuse_conn *fc, struct page *page,
2223  				     struct fuse_args_pages *ap,
2224  				     struct fuse_fill_wb_data *data)
2225  {
2226  	WARN_ON(!ap->num_pages);
2227  
2228  	/*
2229  	 * Being under writeback is unlikely but possible.  For example, a
2230  	 * direct read to an mmapped fuse file will set the page dirty twice:
2231  	 * once when the pages are faulted in with get_user_pages(), and again
2232  	 * after the read has completed.
2233  	 */
2234  	if (fuse_page_is_writeback(data->inode, page->index))
2235  		return true;
2236  
2237  	/* Reached max pages */
2238  	if (ap->num_pages == fc->max_pages)
2239  		return true;
2240  
2241  	/* Reached max write bytes */
2242  	if ((ap->num_pages + 1) * PAGE_SIZE > fc->max_write)
2243  		return true;
2244  
2245  	/* Discontinuity */
2246  	if (data->orig_pages[ap->num_pages - 1]->index + 1 != page->index)
2247  		return true;
2248  
2249  	/* Need to grow the pages array?  If so, did the expansion fail? */
2250  	if (ap->num_pages == data->max_pages && !fuse_pages_realloc(data))
2251  		return true;
2252  
2253  	return false;
2254  }
2255  
2256  static int fuse_writepages_fill(struct folio *folio,
2257  		struct writeback_control *wbc, void *_data)
2258  {
2259  	struct fuse_fill_wb_data *data = _data;
2260  	struct fuse_writepage_args *wpa = data->wpa;
2261  	struct fuse_args_pages *ap = &wpa->ia.ap;
2262  	struct inode *inode = data->inode;
2263  	struct fuse_inode *fi = get_fuse_inode(inode);
2264  	struct fuse_conn *fc = get_fuse_conn(inode);
2265  	struct page *tmp_page;
2266  	int err;
2267  
2268  	if (!data->ff) {
2269  		err = -EIO;
2270  		data->ff = fuse_write_file_get(fi);
2271  		if (!data->ff)
2272  			goto out_unlock;
2273  	}
2274  
2275  	if (wpa && fuse_writepage_need_send(fc, &folio->page, ap, data)) {
2276  		fuse_writepages_send(data);
2277  		data->wpa = NULL;
2278  	}
2279  
2280  	err = -ENOMEM;
2281  	tmp_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
2282  	if (!tmp_page)
2283  		goto out_unlock;
2284  
2285  	/*
2286  	 * The page must not be redirtied until the writeout is completed
2287  	 * (i.e. userspace has sent a reply to the write request).  Otherwise
2288  	 * there could be more than one temporary page instance for each real
2289  	 * page.
2290  	 *
2291  	 * This is ensured by holding the page lock in page_mkwrite() while
2292  	 * checking fuse_page_is_writeback().  We already hold the page lock
2293  	 * since clear_page_dirty_for_io() and keep it held until we add the
2294  	 * request to the fi->writepages list and increment ap->num_pages.
2295  	 * After this fuse_page_is_writeback() will indicate that the page is
2296  	 * under writeback, so we can release the page lock.
2297  	 */
2298  	if (data->wpa == NULL) {
2299  		err = -ENOMEM;
2300  		wpa = fuse_writepage_args_alloc();
2301  		if (!wpa) {
2302  			__free_page(tmp_page);
2303  			goto out_unlock;
2304  		}
2305  		fuse_writepage_add_to_bucket(fc, wpa);
2306  
2307  		data->max_pages = 1;
2308  
2309  		ap = &wpa->ia.ap;
2310  		fuse_write_args_fill(&wpa->ia, data->ff, folio_pos(folio), 0);
2311  		wpa->ia.write.in.write_flags |= FUSE_WRITE_CACHE;
2312  		wpa->next = NULL;
2313  		ap->args.in_pages = true;
2314  		ap->args.end = fuse_writepage_end;
2315  		ap->num_pages = 0;
2316  		wpa->inode = inode;
2317  	}
2318  	folio_start_writeback(folio);
2319  
2320  	copy_highpage(tmp_page, &folio->page);
2321  	ap->pages[ap->num_pages] = tmp_page;
2322  	ap->descs[ap->num_pages].offset = 0;
2323  	ap->descs[ap->num_pages].length = PAGE_SIZE;
2324  	data->orig_pages[ap->num_pages] = &folio->page;
2325  
2326  	inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK);
2327  	inc_node_page_state(tmp_page, NR_WRITEBACK_TEMP);
2328  
2329  	err = 0;
2330  	if (data->wpa) {
2331  		/*
2332  		 * Protected by fi->lock against concurrent access by
2333  		 * fuse_page_is_writeback().
2334  		 */
2335  		spin_lock(&fi->lock);
2336  		ap->num_pages++;
2337  		spin_unlock(&fi->lock);
2338  	} else if (fuse_writepage_add(wpa, &folio->page)) {
2339  		data->wpa = wpa;
2340  	} else {
2341  		folio_end_writeback(folio);
2342  	}
2343  out_unlock:
2344  	folio_unlock(folio);
2345  
2346  	return err;
2347  }
2348  
2349  static int fuse_writepages(struct address_space *mapping,
2350  			   struct writeback_control *wbc)
2351  {
2352  	struct inode *inode = mapping->host;
2353  	struct fuse_conn *fc = get_fuse_conn(inode);
2354  	struct fuse_fill_wb_data data;
2355  	int err;
2356  
2357  	err = -EIO;
2358  	if (fuse_is_bad(inode))
2359  		goto out;
2360  
2361  	if (wbc->sync_mode == WB_SYNC_NONE &&
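	/*
	 * For opportunistic (WB_SYNC_NONE) writeback, bail out early if the
	 * connection is already congested with background requests; the dirty
	 * pages remain and will be picked up by a later writeback pass.
	 */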
2362  	    fc->num_background >= fc->congestion_threshold)
2363  		return 0;
2364  
2365  	data.inode = inode;
2366  	data.wpa = NULL;
2367  	data.ff = NULL;
2368  
2369  	err = -ENOMEM;
2370  	data.orig_pages = kcalloc(fc->max_pages,
2371  				  sizeof(struct page *),
2372  				  GFP_NOFS);
2373  	if (!data.orig_pages)
2374  		goto out;
2375  
2376  	err = write_cache_pages(mapping, wbc, fuse_writepages_fill, &data);
2377  	if (data.wpa) {
2378  		WARN_ON(!data.wpa->ia.ap.num_pages);
2379  		fuse_writepages_send(&data);
2380  	}
2381  	if (data.ff)
2382  		fuse_file_put(data.ff, false);
2383  
2384  	kfree(data.orig_pages);
2385  out:
2386  	return err;
2387  }
2388  
2389  /*
2390   * It would be worth making sure that space is reserved on disk for the write,
2391   * but how to do that without killing performance needs more thought.
2392   */
2393  static int fuse_write_begin(struct file *file, struct address_space *mapping,
2394  		loff_t pos, unsigned len, struct page **pagep, void **fsdata)
2395  {
2396  	pgoff_t index = pos >> PAGE_SHIFT;
2397  	struct fuse_conn *fc = get_fuse_conn(file_inode(file));
2398  	struct page *page;
2399  	loff_t fsize;
2400  	int err = -ENOMEM;
2401  
2402  	WARN_ON(!fc->writeback_cache);
2403  
2404  	page = grab_cache_page_write_begin(mapping, index);
2405  	if (!page)
2406  		goto error;
2407  
2408  	fuse_wait_on_page_writeback(mapping->host, page->index);
2409  
2410  	if (PageUptodate(page) || len == PAGE_SIZE)
2411  		goto success;
2412  	/*
2413  	 * Check if the start of this page comes after the end of file, in which
2414  	 * case the readpage can be optimized away.
2415  	 */
2416  	fsize = i_size_read(mapping->host);
2417  	if (fsize <= (pos & PAGE_MASK)) {
2418  		size_t off = pos & ~PAGE_MASK;
2419  		if (off)
2420  			zero_user_segment(page, 0, off);
2421  		goto success;
2422  	}
2423  	err = fuse_do_readpage(file, page);
2424  	if (err)
2425  		goto cleanup;
2426  success:
2427  	*pagep = page;
2428  	return 0;
2429  
2430  cleanup:
2431  	unlock_page(page);
2432  	put_page(page);
2433  error:
2434  	return err;
2435  }
2436  
2437  static int fuse_write_end(struct file *file, struct address_space *mapping,
2438  		loff_t pos, unsigned len, unsigned copied,
2439  		struct page *page, void *fsdata)
2440  {
2441  	struct inode *inode = page->mapping->host;
2442  
2443  	/* Haven't copied anything?  Skip zeroing, size extending, dirtying. */
2444  	if (!copied)
2445  		goto unlock;
2446  
2447  	pos += copied;
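	/*
	 * For example, copying 100 bytes into a not-uptodate page at pos 0
	 * advances pos to 100; the remaining bytes 100..PAGE_SIZE-1 are zeroed
	 * below so that the page can safely be marked uptodate.
	 */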
2448  	if (!PageUptodate(page)) {
2449  		/* Zero any unwritten bytes at the end of the page */
2450  		size_t endoff = pos & ~PAGE_MASK;
2451  		if (endoff)
2452  			zero_user_segment(page, endoff, PAGE_SIZE);
2453  		SetPageUptodate(page);
2454  	}
2455  
2456  	if (pos > inode->i_size)
2457  		i_size_write(inode, pos);
2458  
2459  	set_page_dirty(page);
2460  
2461  unlock:
2462  	unlock_page(page);
2463  	put_page(page);
2464  
2465  	return copied;
2466  }
2467  
2468  static int fuse_launder_folio(struct folio *folio)
2469  {
2470  	int err = 0;
2471  	if (folio_clear_dirty_for_io(folio)) {
2472  		struct inode *inode = folio->mapping->host;
2473  
2474  		/* Serialize with pending writeback for the same page */
2475  		fuse_wait_on_page_writeback(inode, folio->index);
2476  		err = fuse_writepage_locked(folio);
2477  		if (!err)
2478  			fuse_wait_on_page_writeback(inode, folio->index);
2479  	}
2480  	return err;
2481  }
2482  
2483  /*
2484   * Write back dirty data/metadata now (there may not be any suitable
2485   * open files later for data)
2486   */
2487  static void fuse_vma_close(struct vm_area_struct *vma)
2488  {
2489  	int err;
2490  
2491  	err = write_inode_now(vma->vm_file->f_mapping->host, 1);
2492  	mapping_set_error(vma->vm_file->f_mapping, err);
2493  }
2494  
2495  /*
2496   * Wait for writeback against this page to complete before allowing it
2497   * to be marked dirty again, and hence written back again, possibly
2498   * before the previous writepage completed.
2499   *
2500   * Block here, instead of in ->writepage(), so that the userspace fs
2501   * can only block processes actually operating on the filesystem.
2502   *
2503   * Otherwise an unprivileged userspace fs would be able to block
2504   * unrelated operations, such as:
2505   *
2506   * - page migration
2507   * - sync(2)
2508   * - try_to_free_pages() with order > PAGE_ALLOC_COSTLY_ORDER
2509   */
2510  static vm_fault_t fuse_page_mkwrite(struct vm_fault *vmf)
2511  {
2512  	struct page *page = vmf->page;
2513  	struct inode *inode = file_inode(vmf->vma->vm_file);
2514  
2515  	file_update_time(vmf->vma->vm_file);
2516  	lock_page(page);
2517  	if (page->mapping != inode->i_mapping) {
2518  		unlock_page(page);
2519  		return VM_FAULT_NOPAGE;
2520  	}
2521  
2522  	fuse_wait_on_page_writeback(inode, page->index);
2523  	return VM_FAULT_LOCKED;
2524  }
2525  
2526  static const struct vm_operations_struct fuse_file_vm_ops = {
2527  	.close		= fuse_vma_close,
2528  	.fault		= filemap_fault,
2529  	.map_pages	= filemap_map_pages,
2530  	.page_mkwrite	= fuse_page_mkwrite,
2531  };
2532  
2533  static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma)
2534  {
2535  	struct fuse_file *ff = file->private_data;
2536  	struct fuse_conn *fc = ff->fm->fc;
2537  	struct inode *inode = file_inode(file);
2538  	int rc;
2539  
2540  	/* DAX mmap is superior to direct_io mmap */
2541  	if (FUSE_IS_DAX(inode))
2542  		return fuse_dax_mmap(file, vma);
2543  
2544  	/*
2545  	 * If the inode is in passthrough io mode because it has some file open
2546  	 * in passthrough mode, either mmap to the backing file or fail the
2547  	 * mmap, since mixing cached mmap and passthrough io mode is not allowed.
2548  	 */
2549  	if (fuse_file_passthrough(ff))
2550  		return fuse_passthrough_mmap(file, vma);
2551  	else if (fuse_inode_backing(get_fuse_inode(inode)))
2552  		return -ENODEV;
2553  
2554  	/*
2555  	 * FOPEN_DIRECT_IO handling is special compared to O_DIRECT,
2556  	 * as it does not allow MAP_SHARED mmap without FUSE_DIRECT_IO_ALLOW_MMAP.
2557  	 */
2558  	if (ff->open_flags & FOPEN_DIRECT_IO) {
2559  		/*
2560  		 * Can't provide the coherency needed for MAP_SHARED
2561  		 * if FUSE_DIRECT_IO_ALLOW_MMAP isn't set.
2562  		 */
2563  		if ((vma->vm_flags & VM_MAYSHARE) && !fc->direct_io_allow_mmap)
2564  			return -ENODEV;
2565  
2566  		invalidate_inode_pages2(file->f_mapping);
2567  
2568  		if (!(vma->vm_flags & VM_MAYSHARE)) {
2569  			/* MAP_PRIVATE */
2570  			return generic_file_mmap(file, vma);
2571  		}
2572  
2573  		/*
2574  		 * The first mmap of a direct_io file puts the inode into caching
2575  		 * io mode.  It also waits for parallel dio writers to switch to
2576  		 * serial mode (exclusive instead of shared lock).
2577  		 * After the first mmap, the inode stays in caching io mode until
2578  		 * the direct_io file is released.
2579  		 */
2580  		rc = fuse_file_cached_io_open(inode, ff);
2581  		if (rc)
2582  			return rc;
2583  	}
2584  
2585  	if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE))
2586  		fuse_link_write_file(file);
2587  
2588  	file_accessed(file);
2589  	vma->vm_ops = &fuse_file_vm_ops;
2590  	return 0;
2591  }
2592  
2593  static int convert_fuse_file_lock(struct fuse_conn *fc,
2594  				  const struct fuse_file_lock *ffl,
2595  				  struct file_lock *fl)
2596  {
2597  	switch (ffl->type) {
2598  	case F_UNLCK:
2599  		break;
2600  
2601  	case F_RDLCK:
2602  	case F_WRLCK:
2603  		if (ffl->start > OFFSET_MAX || ffl->end > OFFSET_MAX ||
2604  		    ffl->end < ffl->start)
2605  			return -EIO;
2606  
2607  		fl->fl_start = ffl->start;
2608  		fl->fl_end = ffl->end;
2609  
2610  		/*
2611  		 * Convert pid into init's pid namespace.  The locks API will
2612  		 * translate it into the caller's pid namespace.
2613  		 */
2614  		rcu_read_lock();
2615  		fl->c.flc_pid = pid_nr_ns(find_pid_ns(ffl->pid, fc->pid_ns), &init_pid_ns);
2616  		rcu_read_unlock();
2617  		break;
2618  
2619  	default:
2620  		return -EIO;
2621  	}
2622  	fl->c.flc_type = ffl->type;
2623  	return 0;
2624  }
2625  
2626  static void fuse_lk_fill(struct fuse_args *args, struct file *file,
2627  			 const struct file_lock *fl, int opcode, pid_t pid,
2628  			 int flock, struct fuse_lk_in *inarg)
2629  {
2630  	struct inode *inode = file_inode(file);
2631  	struct fuse_conn *fc = get_fuse_conn(inode);
2632  	struct fuse_file *ff = file->private_data;
2633  
2634  	memset(inarg, 0, sizeof(*inarg));
2635  	inarg->fh = ff->fh;
2636  	inarg->owner = fuse_lock_owner_id(fc, fl->c.flc_owner);
2637  	inarg->lk.start = fl->fl_start;
2638  	inarg->lk.end = fl->fl_end;
2639  	inarg->lk.type = fl->c.flc_type;
2640  	inarg->lk.pid = pid;
2641  	if (flock)
2642  		inarg->lk_flags |= FUSE_LK_FLOCK;
2643  	args->opcode = opcode;
2644  	args->nodeid = get_node_id(inode);
2645  	args->in_numargs = 1;
2646  	args->in_args[0].size = sizeof(*inarg);
2647  	args->in_args[0].value = inarg;
2648  }
2649  
2650  static int fuse_getlk(struct file *file, struct file_lock *fl)
2651  {
2652  	struct inode *inode = file_inode(file);
2653  	struct fuse_mount *fm = get_fuse_mount(inode);
2654  	FUSE_ARGS(args);
2655  	struct fuse_lk_in inarg;
2656  	struct fuse_lk_out outarg;
2657  	int err;
2658  
2659  	fuse_lk_fill(&args, file, fl, FUSE_GETLK, 0, 0, &inarg);
2660  	args.out_numargs = 1;
2661  	args.out_args[0].size = sizeof(outarg);
2662  	args.out_args[0].value = &outarg;
2663  	err = fuse_simple_request(fm, &args);
2664  	if (!err)
2665  		err = convert_fuse_file_lock(fm->fc, &outarg.lk, fl);
2666  
2667  	return err;
2668  }
2669  
2670  static int fuse_setlk(struct file *file, struct file_lock *fl, int flock)
2671  {
2672  	struct inode *inode = file_inode(file);
2673  	struct fuse_mount *fm = get_fuse_mount(inode);
2674  	FUSE_ARGS(args);
2675  	struct fuse_lk_in inarg;
2676  	int opcode = (fl->c.flc_flags & FL_SLEEP) ? FUSE_SETLKW : FUSE_SETLK;
2677  	struct pid *pid = fl->c.flc_type != F_UNLCK ? task_tgid(current) : NULL;
2678  	pid_t pid_nr = pid_nr_ns(pid, fm->fc->pid_ns);
2679  	int err;
2680  
2681  	if (fl->fl_lmops && fl->fl_lmops->lm_grant) {
2682  		/* NLM needs asynchronous locks, which we don't support yet */
2683  		return -ENOLCK;
2684  	}
2685  
2686  	fuse_lk_fill(&args, file, fl, opcode, pid_nr, flock, &inarg);
2687  	err = fuse_simple_request(fm, &args);
2688  
2689  	/* locking is restartable */
2690  	if (err == -EINTR)
2691  		err = -ERESTARTSYS;
2692  
2693  	return err;
2694  }
2695  
2696  static int fuse_file_lock(struct file *file, int cmd, struct file_lock *fl)
2697  {
2698  	struct inode *inode = file_inode(file);
2699  	struct fuse_conn *fc = get_fuse_conn(inode);
2700  	int err;
2701  
2702  	if (cmd == F_CANCELLK) {
2703  		err = 0;
2704  	} else if (cmd == F_GETLK) {
2705  		if (fc->no_lock) {
2706  			posix_test_lock(file, fl);
2707  			err = 0;
2708  		} else
2709  			err = fuse_getlk(file, fl);
2710  	} else {
2711  		if (fc->no_lock)
2712  			err = posix_lock_file(file, fl, NULL);
2713  		else
2714  			err = fuse_setlk(file, fl, 0);
2715  	}
2716  	return err;
2717  }
2718  
2719  static int fuse_file_flock(struct file *file, int cmd, struct file_lock *fl)
2720  {
2721  	struct inode *inode = file_inode(file);
2722  	struct fuse_conn *fc = get_fuse_conn(inode);
2723  	int err;
2724  
2725  	if (fc->no_flock) {
2726  		err = locks_lock_file_wait(file, fl);
2727  	} else {
2728  		struct fuse_file *ff = file->private_data;
2729  
2730  		/* emulate flock with POSIX locks */
2731  		ff->flock = true;
2732  		err = fuse_setlk(file, fl, 1);
2733  	}
2734  
2735  	return err;
2736  }
2737  
2738  static sector_t fuse_bmap(struct address_space *mapping, sector_t block)
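/*
 * Map a logical block of the file to a physical block via FUSE_BMAP.
 * This is only meaningful for filesystems backed by a block device;
 * otherwise, or when the server does not implement the opcode, zero
 * ("no mapping") is returned.
 */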
2739  {
2740  	struct inode *inode = mapping->host;
2741  	struct fuse_mount *fm = get_fuse_mount(inode);
2742  	FUSE_ARGS(args);
2743  	struct fuse_bmap_in inarg;
2744  	struct fuse_bmap_out outarg;
2745  	int err;
2746  
2747  	if (!inode->i_sb->s_bdev || fm->fc->no_bmap)
2748  		return 0;
2749  
2750  	memset(&inarg, 0, sizeof(inarg));
2751  	inarg.block = block;
2752  	inarg.blocksize = inode->i_sb->s_blocksize;
2753  	args.opcode = FUSE_BMAP;
2754  	args.nodeid = get_node_id(inode);
2755  	args.in_numargs = 1;
2756  	args.in_args[0].size = sizeof(inarg);
2757  	args.in_args[0].value = &inarg;
2758  	args.out_numargs = 1;
2759  	args.out_args[0].size = sizeof(outarg);
2760  	args.out_args[0].value = &outarg;
2761  	err = fuse_simple_request(fm, &args);
2762  	if (err == -ENOSYS)
2763  		fm->fc->no_bmap = 1;
2764  
2765  	return err ? 0 : outarg.block;
2766  }
2767  
2768  static loff_t fuse_lseek(struct file *file, loff_t offset, int whence)
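/*
 * SEEK_HOLE/SEEK_DATA via FUSE_LSEEK.  If the server does not implement
 * the opcode (-ENOSYS), remember that in no_lseek and fall back to
 * refreshing the size attribute and using generic_file_llseek(), which
 * effectively treats the whole file as data with a hole at EOF.
 */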
2769  {
2770  	struct inode *inode = file->f_mapping->host;
2771  	struct fuse_mount *fm = get_fuse_mount(inode);
2772  	struct fuse_file *ff = file->private_data;
2773  	FUSE_ARGS(args);
2774  	struct fuse_lseek_in inarg = {
2775  		.fh = ff->fh,
2776  		.offset = offset,
2777  		.whence = whence
2778  	};
2779  	struct fuse_lseek_out outarg;
2780  	int err;
2781  
2782  	if (fm->fc->no_lseek)
2783  		goto fallback;
2784  
2785  	args.opcode = FUSE_LSEEK;
2786  	args.nodeid = ff->nodeid;
2787  	args.in_numargs = 1;
2788  	args.in_args[0].size = sizeof(inarg);
2789  	args.in_args[0].value = &inarg;
2790  	args.out_numargs = 1;
2791  	args.out_args[0].size = sizeof(outarg);
2792  	args.out_args[0].value = &outarg;
2793  	err = fuse_simple_request(fm, &args);
2794  	if (err) {
2795  		if (err == -ENOSYS) {
2796  			fm->fc->no_lseek = 1;
2797  			goto fallback;
2798  		}
2799  		return err;
2800  	}
2801  
2802  	return vfs_setpos(file, outarg.offset, inode->i_sb->s_maxbytes);
2803  
2804  fallback:
2805  	err = fuse_update_attributes(inode, file, STATX_SIZE);
2806  	if (!err)
2807  		return generic_file_llseek(file, offset, whence);
2808  	else
2809  		return err;
2810  }
2811  
2812  static loff_t fuse_file_llseek(struct file *file, loff_t offset, int whence)
2813  {
2814  	loff_t retval;
2815  	struct inode *inode = file_inode(file);
2816  
2817  	switch (whence) {
2818  	case SEEK_SET:
2819  	case SEEK_CUR:
2820  		 /* No i_mutex protection necessary for SEEK_CUR and SEEK_SET */
2821  		retval = generic_file_llseek(file, offset, whence);
2822  		break;
2823  	case SEEK_END:
2824  		inode_lock(inode);
2825  		retval = fuse_update_attributes(inode, file, STATX_SIZE);
2826  		if (!retval)
2827  			retval = generic_file_llseek(file, offset, whence);
2828  		inode_unlock(inode);
2829  		break;
2830  	case SEEK_HOLE:
2831  	case SEEK_DATA:
2832  		inode_lock(inode);
2833  		retval = fuse_lseek(file, offset, whence);
2834  		inode_unlock(inode);
2835  		break;
2836  	default:
2837  		retval = -EINVAL;
2838  	}
2839  
2840  	return retval;
2841  }
2842  
2843  /*
2844   * All files which have been polled are linked to the RB tree
2845   * fuse_conn->polled_files, which is indexed by kh.  Walk the tree and
2846   * find the matching one.
2847   */
2848  static struct rb_node **fuse_find_polled_node(struct fuse_conn *fc, u64 kh,
2849  					      struct rb_node **parent_out)
2850  {
2851  	struct rb_node **link = &fc->polled_files.rb_node;
2852  	struct rb_node *last = NULL;
2853  
2854  	while (*link) {
2855  		struct fuse_file *ff;
2856  
2857  		last = *link;
2858  		ff = rb_entry(last, struct fuse_file, polled_node);
2859  
2860  		if (kh < ff->kh)
2861  			link = &last->rb_left;
2862  		else if (kh > ff->kh)
2863  			link = &last->rb_right;
2864  		else
2865  			return link;
2866  	}
2867  
2868  	if (parent_out)
2869  		*parent_out = last;
2870  	return link;
2871  }
2872  
2873  /*
2874   * The file is about to be polled.  Make sure it's on the polled_files
2875   * RB tree.  Note that files once added to the polled_files tree are
2876   * not removed before the file is released.  This is because a file
2877   * polled once is likely to be polled again.
2878   */
2879  static void fuse_register_polled_file(struct fuse_conn *fc,
2880  				      struct fuse_file *ff)
2881  {
2882  	spin_lock(&fc->lock);
2883  	if (RB_EMPTY_NODE(&ff->polled_node)) {
2884  		struct rb_node **link, *parent;
2885  
2886  		link = fuse_find_polled_node(fc, ff->kh, &parent);
2887  		BUG_ON(*link);
2888  		rb_link_node(&ff->polled_node, parent, link);
2889  		rb_insert_color(&ff->polled_node, &fc->polled_files);
2890  	}
2891  	spin_unlock(&fc->lock);
2892  }
2893  
2894  __poll_t fuse_file_poll(struct file *file, poll_table *wait)
2895  {
2896  	struct fuse_file *ff = file->private_data;
2897  	struct fuse_mount *fm = ff->fm;
2898  	struct fuse_poll_in inarg = { .fh = ff->fh, .kh = ff->kh };
2899  	struct fuse_poll_out outarg;
2900  	FUSE_ARGS(args);
2901  	int err;
2902  
2903  	if (fm->fc->no_poll)
2904  		return DEFAULT_POLLMASK;
2905  
2906  	poll_wait(file, &ff->poll_wait, wait);
2907  	inarg.events = mangle_poll(poll_requested_events(wait));
2908  
2909  	/*
2910  	 * Ask for notification iff there's someone waiting for it.
2911  	 * The client may ignore the flag and always notify.
2912  	 */
2913  	if (waitqueue_active(&ff->poll_wait)) {
2914  		inarg.flags |= FUSE_POLL_SCHEDULE_NOTIFY;
2915  		fuse_register_polled_file(fm->fc, ff);
2916  	}
2917  
2918  	args.opcode = FUSE_POLL;
2919  	args.nodeid = ff->nodeid;
2920  	args.in_numargs = 1;
2921  	args.in_args[0].size = sizeof(inarg);
2922  	args.in_args[0].value = &inarg;
2923  	args.out_numargs = 1;
2924  	args.out_args[0].size = sizeof(outarg);
2925  	args.out_args[0].value = &outarg;
2926  	err = fuse_simple_request(fm, &args);
2927  
2928  	if (!err)
2929  		return demangle_poll(outarg.revents);
2930  	if (err == -ENOSYS) {
2931  		fm->fc->no_poll = 1;
2932  		return DEFAULT_POLLMASK;
2933  	}
2934  	return EPOLLERR;
2935  }
2936  EXPORT_SYMBOL_GPL(fuse_file_poll);
2937  
2938  /*
2939   * This is called from fuse_handle_notify() on FUSE_NOTIFY_POLL and
2940   * wakes up the poll waiters.
2941   */
2942  int fuse_notify_poll_wakeup(struct fuse_conn *fc,
2943  			    struct fuse_notify_poll_wakeup_out *outarg)
2944  {
2945  	u64 kh = outarg->kh;
2946  	struct rb_node **link;
2947  
2948  	spin_lock(&fc->lock);
2949  
2950  	link = fuse_find_polled_node(fc, kh, NULL);
2951  	if (*link) {
2952  		struct fuse_file *ff;
2953  
2954  		ff = rb_entry(*link, struct fuse_file, polled_node);
2955  		wake_up_interruptible_sync(&ff->poll_wait);
2956  	}
2957  
2958  	spin_unlock(&fc->lock);
2959  	return 0;
2960  }
2961  
2962  static void fuse_do_truncate(struct file *file)
2963  {
2964  	struct inode *inode = file->f_mapping->host;
2965  	struct iattr attr;
2966  
2967  	attr.ia_valid = ATTR_SIZE;
2968  	attr.ia_size = i_size_read(inode);
2969  
2970  	attr.ia_file = file;
2971  	attr.ia_valid |= ATTR_FILE;
2972  
2973  	fuse_do_setattr(file_dentry(file), &attr, file);
2974  }
2975  
2976  static inline loff_t fuse_round_up(struct fuse_conn *fc, loff_t off)
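/*
 * Round @off up to a multiple of the maximum request payload
 * (fc->max_pages pages).  For example, assuming 4K pages and
 * fc->max_pages == 256, the granularity is 1 MiB, so an offset of 1000
 * rounds up to 1048576.
 */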
2977  {
2978  	return round_up(off, fc->max_pages << PAGE_SHIFT);
2979  }
2980  
2981  static ssize_t
2982  fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
2983  {
2984  	DECLARE_COMPLETION_ONSTACK(wait);
2985  	ssize_t ret = 0;
2986  	struct file *file = iocb->ki_filp;
2987  	struct fuse_file *ff = file->private_data;
2988  	loff_t pos = 0;
2989  	struct inode *inode;
2990  	loff_t i_size;
2991  	size_t count = iov_iter_count(iter), shortened = 0;
2992  	loff_t offset = iocb->ki_pos;
2993  	struct fuse_io_priv *io;
2994  
2995  	pos = offset;
2996  	inode = file->f_mapping->host;
2997  	i_size = i_size_read(inode);
2998  
2999  	if ((iov_iter_rw(iter) == READ) && (offset >= i_size))
3000  		return 0;
3001  
3002  	io = kmalloc(sizeof(struct fuse_io_priv), GFP_KERNEL);
3003  	if (!io)
3004  		return -ENOMEM;
3005  	spin_lock_init(&io->lock);
3006  	kref_init(&io->refcnt);
3007  	io->reqs = 1;
3008  	io->bytes = -1;
3009  	io->size = 0;
3010  	io->offset = offset;
3011  	io->write = (iov_iter_rw(iter) == WRITE);
3012  	io->err = 0;
3013  	/*
3014  	 * By default, we want to optimize all I/Os with async request
3015  	 * submission to the client filesystem if supported.
3016  	 */
3017  	io->async = ff->fm->fc->async_dio;
3018  	io->iocb = iocb;
3019  	io->blocking = is_sync_kiocb(iocb);
3020  
3021  	/* optimization for short read */
3022  	if (io->async && !io->write && offset + count > i_size) {
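	/*
	 * For an async read extending past EOF, truncate the iterator to the
	 * readable bytes (i_size - offset) rounded up to the maximum request
	 * size, so that no requests are issued for data far beyond i_size;
	 * 'shortened' records how much to re-expand the iterator afterwards.
	 */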
3023  		iov_iter_truncate(iter, fuse_round_up(ff->fm->fc, i_size - offset));
3024  		shortened = count - iov_iter_count(iter);
3025  		count -= shortened;
3026  	}
3027  
3028  	/*
3029  	 * We cannot asynchronously extend the size of a file.
3030  	 * In such case the aio will behave exactly like sync io.
3031  	 */
3032  	if ((offset + count > i_size) && io->write)
3033  		io->blocking = true;
3034  
3035  	if (io->async && io->blocking) {
3036  		/*
3037  		 * Additional reference to keep io around after
3038  		 * calling fuse_aio_complete()
3039  		 */
3040  		kref_get(&io->refcnt);
3041  		io->done = &wait;
3042  	}
3043  
3044  	if (iov_iter_rw(iter) == WRITE) {
3045  		ret = fuse_direct_io(io, iter, &pos, FUSE_DIO_WRITE);
3046  		fuse_invalidate_attr_mask(inode, FUSE_STATX_MODSIZE);
3047  	} else {
3048  		ret = __fuse_direct_read(io, iter, &pos);
3049  	}
3050  	iov_iter_reexpand(iter, iov_iter_count(iter) + shortened);
3051  
3052  	if (io->async) {
3053  		bool blocking = io->blocking;
3054  
3055  		fuse_aio_complete(io, ret < 0 ? ret : 0, -1);
3056  
3057  		/* we have a non-extending, async request, so return */
3058  		if (!blocking)
3059  			return -EIOCBQUEUED;
3060  
3061  		wait_for_completion(&wait);
3062  		ret = fuse_get_res_by_io(io);
3063  	}
3064  
3065  	kref_put(&io->refcnt, fuse_io_release);
3066  
3067  	if (iov_iter_rw(iter) == WRITE) {
3068  		fuse_write_update_attr(inode, pos, ret);
3069  		/* For extending writes we already hold exclusive lock */
3070  		if (ret < 0 && offset + count > i_size)
3071  			fuse_do_truncate(file);
3072  	}
3073  
3074  	return ret;
3075  }
3076  
3077  static int fuse_writeback_range(struct inode *inode, loff_t start, loff_t end)
3078  {
3079  	int err = filemap_write_and_wait_range(inode->i_mapping, start, LLONG_MAX);
3080  
3081  	if (!err)
3082  		fuse_sync_writes(inode);
3083  
3084  	return err;
3085  }
3086  
3087  static long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
3088  				loff_t length)
3089  {
3090  	struct fuse_file *ff = file->private_data;
3091  	struct inode *inode = file_inode(file);
3092  	struct fuse_inode *fi = get_fuse_inode(inode);
3093  	struct fuse_mount *fm = ff->fm;
3094  	FUSE_ARGS(args);
3095  	struct fuse_fallocate_in inarg = {
3096  		.fh = ff->fh,
3097  		.offset = offset,
3098  		.length = length,
3099  		.mode = mode
3100  	};
3101  	int err;
3102  	bool block_faults = FUSE_IS_DAX(inode) &&
3103  		(!(mode & FALLOC_FL_KEEP_SIZE) ||
3104  		 (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE)));
3105  
3106  	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
3107  		     FALLOC_FL_ZERO_RANGE))
3108  		return -EOPNOTSUPP;
3109  
3110  	if (fm->fc->no_fallocate)
3111  		return -EOPNOTSUPP;
3112  
3113  	inode_lock(inode);
3114  	if (block_faults) {
3115  		filemap_invalidate_lock(inode->i_mapping);
3116  		err = fuse_dax_break_layouts(inode, 0, 0);
3117  		if (err)
3118  			goto out;
3119  	}
3120  
3121  	if (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE)) {
3122  		loff_t endbyte = offset + length - 1;
3123  
3124  		err = fuse_writeback_range(inode, offset, endbyte);
3125  		if (err)
3126  			goto out;
3127  	}
3128  
3129  	if (!(mode & FALLOC_FL_KEEP_SIZE) &&
3130  	    offset + length > i_size_read(inode)) {
3131  		err = inode_newsize_ok(inode, offset + length);
3132  		if (err)
3133  			goto out;
3134  	}
3135  
3136  	err = file_modified(file);
3137  	if (err)
3138  		goto out;
3139  
3140  	if (!(mode & FALLOC_FL_KEEP_SIZE))
3141  		set_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
3142  
3143  	args.opcode = FUSE_FALLOCATE;
3144  	args.nodeid = ff->nodeid;
3145  	args.in_numargs = 1;
3146  	args.in_args[0].size = sizeof(inarg);
3147  	args.in_args[0].value = &inarg;
3148  	err = fuse_simple_request(fm, &args);
3149  	if (err == -ENOSYS) {
3150  		fm->fc->no_fallocate = 1;
3151  		err = -EOPNOTSUPP;
3152  	}
3153  	if (err)
3154  		goto out;
3155  
3156  	/* we could have extended the file */
3157  	if (!(mode & FALLOC_FL_KEEP_SIZE)) {
3158  		if (fuse_write_update_attr(inode, offset + length, length))
3159  			file_update_time(file);
3160  	}
3161  
3162  	if (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE))
3163  		truncate_pagecache_range(inode, offset, offset + length - 1);
3164  
3165  	fuse_invalidate_attr_mask(inode, FUSE_STATX_MODSIZE);
3166  
3167  out:
3168  	if (!(mode & FALLOC_FL_KEEP_SIZE))
3169  		clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
3170  
3171  	if (block_faults)
3172  		filemap_invalidate_unlock(inode->i_mapping);
3173  
3174  	inode_unlock(inode);
3175  
3176  	fuse_flush_time_update(inode);
3177  
3178  	return err;
3179  }
3180  
3181  static ssize_t __fuse_copy_file_range(struct file *file_in, loff_t pos_in,
3182  				      struct file *file_out, loff_t pos_out,
3183  				      size_t len, unsigned int flags)
3184  {
3185  	struct fuse_file *ff_in = file_in->private_data;
3186  	struct fuse_file *ff_out = file_out->private_data;
3187  	struct inode *inode_in = file_inode(file_in);
3188  	struct inode *inode_out = file_inode(file_out);
3189  	struct fuse_inode *fi_out = get_fuse_inode(inode_out);
3190  	struct fuse_mount *fm = ff_in->fm;
3191  	struct fuse_conn *fc = fm->fc;
3192  	FUSE_ARGS(args);
3193  	struct fuse_copy_file_range_in inarg = {
3194  		.fh_in = ff_in->fh,
3195  		.off_in = pos_in,
3196  		.nodeid_out = ff_out->nodeid,
3197  		.fh_out = ff_out->fh,
3198  		.off_out = pos_out,
3199  		.len = len,
3200  		.flags = flags
3201  	};
3202  	struct fuse_write_out outarg;
3203  	ssize_t err;
3204  	/* mark unstable when write-back is not used, and file_out gets
3205  	 * extended */
3206  	bool is_unstable = (!fc->writeback_cache) &&
3207  			   ((pos_out + len) > inode_out->i_size);
3208  
3209  	if (fc->no_copy_file_range)
3210  		return -EOPNOTSUPP;
3211  
3212  	if (file_inode(file_in)->i_sb != file_inode(file_out)->i_sb)
3213  		return -EXDEV;
3214  
3215  	inode_lock(inode_in);
3216  	err = fuse_writeback_range(inode_in, pos_in, pos_in + len - 1);
3217  	inode_unlock(inode_in);
3218  	if (err)
3219  		return err;
3220  
3221  	inode_lock(inode_out);
3222  
3223  	err = file_modified(file_out);
3224  	if (err)
3225  		goto out;
3226  
3227  	/*
3228  	 * Write out dirty pages in the destination file before sending the COPY
3229  	 * request to userspace.  After the request is completed, truncate off
3230  	 * pages (including partial ones) from the cache that have been copied,
3231  	 * since these contain stale data at that point.
3232  	 *
3233  	 * This should be mostly correct, but if the COPY writes to partial
3234  	 * pages (at the start or end) and the parts not covered by the COPY are
3235  	 * written through a memory map after calling fuse_writeback_range(),
3236  	 * then these partial page modifications will be lost on truncation.
3237  	 *
3238  	 * It is unlikely that someone would rely on such mixed style
3239  	 * modifications.  Yet this does give less guarantees than if the
3240  	 * copying was performed with write(2).
3241  	 *
3242  	 * To fix this a mapping->invalidate_lock could be used to prevent new
3243  	 * faults while the copy is ongoing.
3244  	 */
3245  	err = fuse_writeback_range(inode_out, pos_out, pos_out + len - 1);
3246  	if (err)
3247  		goto out;
3248  
3249  	if (is_unstable)
3250  		set_bit(FUSE_I_SIZE_UNSTABLE, &fi_out->state);
3251  
3252  	args.opcode = FUSE_COPY_FILE_RANGE;
3253  	args.nodeid = ff_in->nodeid;
3254  	args.in_numargs = 1;
3255  	args.in_args[0].size = sizeof(inarg);
3256  	args.in_args[0].value = &inarg;
3257  	args.out_numargs = 1;
3258  	args.out_args[0].size = sizeof(outarg);
3259  	args.out_args[0].value = &outarg;
3260  	err = fuse_simple_request(fm, &args);
3261  	if (err == -ENOSYS) {
3262  		fc->no_copy_file_range = 1;
3263  		err = -EOPNOTSUPP;
3264  	}
3265  	if (err)
3266  		goto out;
3267  
3268  	truncate_inode_pages_range(inode_out->i_mapping,
3269  				   ALIGN_DOWN(pos_out, PAGE_SIZE),
3270  				   ALIGN(pos_out + outarg.size, PAGE_SIZE) - 1);
3271  
3272  	file_update_time(file_out);
3273  	fuse_write_update_attr(inode_out, pos_out + outarg.size, outarg.size);
3274  
3275  	err = outarg.size;
3276  out:
3277  	if (is_unstable)
3278  		clear_bit(FUSE_I_SIZE_UNSTABLE, &fi_out->state);
3279  
3280  	inode_unlock(inode_out);
3281  	file_accessed(file_in);
3282  
3283  	fuse_flush_time_update(inode_out);
3284  
3285  	return err;
3286  }
3287  
3288  static ssize_t fuse_copy_file_range(struct file *src_file, loff_t src_off,
3289  				    struct file *dst_file, loff_t dst_off,
3290  				    size_t len, unsigned int flags)
3291  {
3292  	ssize_t ret;
3293  
3294  	ret = __fuse_copy_file_range(src_file, src_off, dst_file, dst_off,
3295  				     len, flags);
3296  
3297  	if (ret == -EOPNOTSUPP || ret == -EXDEV)
3298  		ret = splice_copy_file_range(src_file, src_off, dst_file,
3299  					     dst_off, len);
3300  	return ret;
3301  }
3302  
3303  static const struct file_operations fuse_file_operations = {
3304  	.llseek		= fuse_file_llseek,
3305  	.read_iter	= fuse_file_read_iter,
3306  	.write_iter	= fuse_file_write_iter,
3307  	.mmap		= fuse_file_mmap,
3308  	.open		= fuse_open,
3309  	.flush		= fuse_flush,
3310  	.release	= fuse_release,
3311  	.fsync		= fuse_fsync,
3312  	.lock		= fuse_file_lock,
3313  	.get_unmapped_area = thp_get_unmapped_area,
3314  	.flock		= fuse_file_flock,
3315  	.splice_read	= fuse_splice_read,
3316  	.splice_write	= fuse_splice_write,
3317  	.unlocked_ioctl	= fuse_file_ioctl,
3318  	.compat_ioctl	= fuse_file_compat_ioctl,
3319  	.poll		= fuse_file_poll,
3320  	.fallocate	= fuse_file_fallocate,
3321  	.copy_file_range = fuse_copy_file_range,
3322  };
3323  
3324  static const struct address_space_operations fuse_file_aops  = {
3325  	.read_folio	= fuse_read_folio,
3326  	.readahead	= fuse_readahead,
3327  	.writepages	= fuse_writepages,
3328  	.launder_folio	= fuse_launder_folio,
3329  	.dirty_folio	= filemap_dirty_folio,
3330  	.migrate_folio	= filemap_migrate_folio,
3331  	.bmap		= fuse_bmap,
3332  	.direct_IO	= fuse_direct_IO,
3333  	.write_begin	= fuse_write_begin,
3334  	.write_end	= fuse_write_end,
3335  };
3336  
3337  void fuse_init_file_inode(struct inode *inode, unsigned int flags)
3338  {
3339  	struct fuse_inode *fi = get_fuse_inode(inode);
3340  
3341  	inode->i_fop = &fuse_file_operations;
3342  	inode->i_data.a_ops = &fuse_file_aops;
3343  
3344  	INIT_LIST_HEAD(&fi->write_files);
3345  	INIT_LIST_HEAD(&fi->queued_writes);
3346  	fi->writectr = 0;
3347  	fi->iocachectr = 0;
3348  	init_waitqueue_head(&fi->page_waitq);
3349  	init_waitqueue_head(&fi->direct_io_waitq);
3350  	fi->writepages = RB_ROOT;
3351  
3352  	if (IS_ENABLED(CONFIG_FUSE_DAX))
3353  		fuse_dax_inode_init(inode, flags);
3354  }
3355