xref: /linux/fs/nfs/localio.c (revision 1309c52de15b6a4204e569ea1b181c4e9dc25927)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * NFS client support for local clients to bypass network stack
4  *
5  * Copyright (C) 2014 Weston Andros Adamson <dros@primarydata.com>
6  * Copyright (C) 2019 Trond Myklebust <trond.myklebust@hammerspace.com>
7  * Copyright (C) 2024 Mike Snitzer <snitzer@hammerspace.com>
8  * Copyright (C) 2024 NeilBrown <neilb@suse.de>
9  */
10 
11 #include <linux/module.h>
12 #include <linux/errno.h>
13 #include <linux/vfs.h>
14 #include <linux/file.h>
15 #include <linux/inet.h>
16 #include <linux/sunrpc/addr.h>
17 #include <linux/inetdevice.h>
18 #include <net/addrconf.h>
19 #include <linux/nfs_common.h>
20 #include <linux/nfslocalio.h>
21 #include <linux/bvec.h>
22 
23 #include <linux/nfs.h>
24 #include <linux/nfs_fs.h>
25 #include <linux/nfs_xdr.h>
26 
27 #include "internal.h"
28 #include "pnfs.h"
29 #include "nfstrace.h"
30 
31 #define NFSDBG_FACILITY		NFSDBG_VFS
32 
33 #define NFSLOCAL_MAX_IOS	3
34 
/*
 * Per-request state for one LOCALIO read or write issued directly
 * against the server-side file (bypassing the RPC transport).
 */
struct nfs_local_kiocb {
	struct kiocb		kiocb;		/* submitted to the local file */
	struct bio_vec		*bvec;		/* maps hdr->page_array pages */
	struct nfs_pgio_header	*hdr;		/* originating pageio request */
	struct work_struct	work;		/* submit work, reused for AIO completion */
	void (*aio_complete_work)(struct work_struct *);
	struct nfsd_file	*localio;	/* reference put on release */
	/* Begin mostly DIO-specific members */
	size_t                  end_len;
	short int		end_iter_index;	/* index of last iter to issue; -1 if unset */
	atomic_t		n_iters;	/* iters still outstanding for this iocb */
	struct iov_iter		iters[NFSLOCAL_MAX_IOS];
	bool			iter_is_dio_aligned[NFSLOCAL_MAX_IOS];
	/* End mostly DIO-specific members */
};
50 
/* Context for one commit (fsync) executed on the localio workqueue. */
struct nfs_local_fsync_ctx {
	struct nfsd_file	*localio;	/* reference put when the work finishes */
	struct nfs_commit_data	*data;
	struct work_struct	work;
	struct completion	*done;		/* non-NULL only for FLUSH_SYNC waiters */
};
57 
/* Runtime on/off switch for the LOCALIO bypass (module param, 0644). */
static bool localio_enabled __read_mostly = true;
module_param(localio_enabled, bool, 0644);

/* Forward declarations: needed by nfs_local_pgio_restart() below. */
static void nfs_local_do_read(struct nfs_local_kiocb *iocb,
			      const struct rpc_call_ops *call_ops);
static void nfs_local_do_write(struct nfs_local_kiocb *iocb,
			       const struct rpc_call_ops *call_ops);
65 
66 static inline bool nfs_client_is_local(const struct nfs_client *clp)
67 {
68 	return !!rcu_access_pointer(clp->cl_uuid.net);
69 }
70 
71 bool nfs_server_is_local(const struct nfs_client *clp)
72 {
73 	return nfs_client_is_local(clp) && localio_enabled;
74 }
75 EXPORT_SYMBOL_GPL(nfs_server_is_local);
76 
77 /*
78  * UUID_IS_LOCAL XDR functions
79  */
80 
81 static void localio_xdr_enc_uuidargs(struct rpc_rqst *req,
82 				     struct xdr_stream *xdr,
83 				     const void *data)
84 {
85 	const u8 *uuid = data;
86 
87 	encode_opaque_fixed(xdr, uuid, UUID_SIZE);
88 }
89 
/* The UUID_IS_LOCAL reply carries no body, so there is nothing to decode. */
static int localio_xdr_dec_uuidres(struct rpc_rqst *req,
				   struct xdr_stream *xdr,
				   void *result)
{
	return 0;
}
97 
/*
 * LOCALIO procedure table.  The protocol has a single procedure,
 * UUID_IS_LOCAL, used by nfs_server_uuid_is_local() to hand the client's
 * uuid to the server so it can decide whether they share a kernel.
 */
static const struct rpc_procinfo nfs_localio_procedures[] = {
	[LOCALIOPROC_UUID_IS_LOCAL] = {
		.p_proc = LOCALIOPROC_UUID_IS_LOCAL,
		.p_encode = localio_xdr_enc_uuidargs,
		.p_decode = localio_xdr_dec_uuidres,
		.p_arglen = XDR_QUADLEN(UUID_SIZE),
		.p_replen = 0,
		.p_statidx = LOCALIOPROC_UUID_IS_LOCAL,
		.p_name = "UUID_IS_LOCAL",
	},
};
109 
/* Per-procedure call counters for /proc/net/rpc statistics. */
static unsigned int nfs_localio_counts[ARRAY_SIZE(nfs_localio_procedures)];
/* Version 1 is the only LOCALIO protocol version. */
static const struct rpc_version nfslocalio_version1 = {
	.number			= 1,
	.nrprocs		= ARRAY_SIZE(nfs_localio_procedures),
	.procs			= nfs_localio_procedures,
	.counts			= nfs_localio_counts,
};

static const struct rpc_version *nfslocalio_version[] = {
       [1]			= &nfslocalio_version1,
};
121 
/* Forward declaration so nfslocalio_rpcstat can reference the program. */
extern const struct rpc_program nfslocalio_program;
static struct rpc_stat		nfslocalio_rpcstat = { &nfslocalio_program };

const struct rpc_program nfslocalio_program = {
	.name			= "nfslocalio",
	.number			= NFS_LOCALIO_PROGRAM,
	.nrvers			= ARRAY_SIZE(nfslocalio_version),
	.version		= nfslocalio_version,
	.stats			= &nfslocalio_rpcstat,
};
132 
133 /*
134  * nfs_init_localioclient - Initialise an NFS localio client connection
135  */
136 static struct rpc_clnt *nfs_init_localioclient(struct nfs_client *clp)
137 {
138 	struct rpc_clnt *rpcclient_localio;
139 
140 	rpcclient_localio = rpc_bind_new_program(clp->cl_rpcclient,
141 						 &nfslocalio_program, 1);
142 
143 	dprintk_rcu("%s: server (%s) %s NFS LOCALIO.\n",
144 		__func__, rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR),
145 		(IS_ERR(rpcclient_localio) ? "does not support" : "supports"));
146 
147 	return rpcclient_localio;
148 }
149 
150 static bool nfs_server_uuid_is_local(struct nfs_client *clp)
151 {
152 	u8 uuid[UUID_SIZE];
153 	struct rpc_message msg = {
154 		.rpc_argp = &uuid,
155 	};
156 	struct rpc_clnt *rpcclient_localio;
157 	int status;
158 
159 	rpcclient_localio = nfs_init_localioclient(clp);
160 	if (IS_ERR(rpcclient_localio))
161 		return false;
162 
163 	export_uuid(uuid, &clp->cl_uuid.uuid);
164 
165 	msg.rpc_proc = &nfs_localio_procedures[LOCALIOPROC_UUID_IS_LOCAL];
166 	status = rpc_call_sync(rpcclient_localio, &msg, 0);
167 	dprintk("%s: NFS reply UUID_IS_LOCAL: status=%d\n",
168 		__func__, status);
169 	rpc_shutdown_client(rpcclient_localio);
170 
171 	/* Server is only local if it initialized required struct members */
172 	if (status || !rcu_access_pointer(clp->cl_uuid.net) || !clp->cl_uuid.dom)
173 		return false;
174 
175 	return true;
176 }
177 
178 /*
179  * nfs_local_probe - probe local i/o support for an nfs_server and nfs_client
180  * - called after alloc_client and init_client (so cl_rpcclient exists)
181  * - this function is idempotent, it can be called for old or new clients
182  */
static void nfs_local_probe(struct nfs_client *clp)
{
	/* Disallow localio if disabled via sysfs or AUTH_SYS isn't used */
	if (!localio_enabled ||
	    clp->cl_rpcclient->cl_auth->au_flavor != RPC_AUTH_UNIX) {
		nfs_localio_disable_client(clp);
		return;
	}

	/* Already marked local: nothing to (re)probe. */
	if (nfs_client_is_local(clp))
		return;

	/* nfs_uuid_begin() presumably serializes concurrent probes of the
	 * same uuid — TODO confirm against fs/nfs_common/nfslocalio.c.
	 */
	if (!nfs_uuid_begin(&clp->cl_uuid))
		return;
	if (nfs_server_uuid_is_local(clp))
		nfs_localio_enable_client(clp);
	nfs_uuid_end(&clp->cl_uuid);
}
201 
202 void nfs_local_probe_async_work(struct work_struct *work)
203 {
204 	struct nfs_client *clp =
205 		container_of(work, struct nfs_client, cl_local_probe_work);
206 
207 	if (!refcount_inc_not_zero(&clp->cl_count))
208 		return;
209 	nfs_local_probe(clp);
210 	nfs_put_client(clp);
211 }
212 
/*
 * Queue an asynchronous LOCALIO probe for @clp; the work item
 * (nfs_local_probe_async_work) takes its own client reference.
 */
void nfs_local_probe_async(struct nfs_client *clp)
{
	queue_work(nfsiod_workqueue, &clp->cl_local_probe_work);
}
EXPORT_SYMBOL_GPL(nfs_local_probe_async);
218 
/* Drop a reference obtained via nfs_open_local_fh()/nfs_local_open_fh(). */
static inline void nfs_local_file_put(struct nfsd_file *localio)
{
	/* nfs_to_nfsd_file_put_local() expects an __rcu pointer
	 * but we have a __kernel pointer.  It is always safe
	 * to cast a __kernel pointer to an __rcu pointer
	 * because the cast only weakens what is known about the pointer.
	 */
	struct nfsd_file __rcu *nf = (struct nfsd_file __rcu*) localio;

	nfs_to_nfsd_file_put_local(&nf);
}
230 
231 /*
232  * __nfs_local_open_fh - open a local filehandle in terms of nfsd_file.
233  *
234  * Returns a pointer to a struct nfsd_file or ERR_PTR.
235  * Caller must release returned nfsd_file with nfs_to_nfsd_file_put_local().
236  */
237 static struct nfsd_file *
238 __nfs_local_open_fh(struct nfs_client *clp, const struct cred *cred,
239 		    struct nfs_fh *fh, struct nfs_file_localio *nfl,
240 		    struct nfsd_file __rcu **pnf,
241 		    const fmode_t mode)
242 {
243 	int status = 0;
244 	struct nfsd_file *localio;
245 
246 	localio = nfs_open_local_fh(&clp->cl_uuid, clp->cl_rpcclient,
247 				    cred, fh, nfl, pnf, mode);
248 	if (IS_ERR(localio)) {
249 		status = PTR_ERR(localio);
250 		switch (status) {
251 		case -ENOMEM:
252 		case -ENXIO:
253 		case -ENOENT:
254 			/* Revalidate localio */
255 			nfs_localio_disable_client(clp);
256 			nfs_local_probe(clp);
257 		}
258 	}
259 	trace_nfs_local_open_fh(fh, mode, status);
260 	return localio;
261 }
262 
263 /*
264  * nfs_local_open_fh - open a local filehandle in terms of nfsd_file.
265  * First checking if the open nfsd_file is already cached, otherwise
266  * must __nfs_local_open_fh and insert the nfsd_file in nfs_file_localio.
267  *
268  * Returns a pointer to a struct nfsd_file or NULL.
269  */
270 struct nfsd_file *
271 nfs_local_open_fh(struct nfs_client *clp, const struct cred *cred,
272 		  struct nfs_fh *fh, struct nfs_file_localio *nfl,
273 		  const fmode_t mode)
274 {
275 	struct nfsd_file *nf, __rcu **pnf;
276 
277 	if (!nfs_server_is_local(clp))
278 		return NULL;
279 	if (mode & ~(FMODE_READ | FMODE_WRITE))
280 		return NULL;
281 
282 	if (mode & FMODE_WRITE)
283 		pnf = &nfl->rw_file;
284 	else
285 		pnf = &nfl->ro_file;
286 
287 	nf = __nfs_local_open_fh(clp, cred, fh, nfl, pnf, mode);
288 	if (IS_ERR(nf))
289 		return NULL;
290 	return nf;
291 }
292 EXPORT_SYMBOL_GPL(nfs_local_open_fh);
293 
294 /*
295  * Ensure all page cache allocations are done from GFP_NOFS context to
296  * prevent direct reclaim recursion back into NFS via nfs_writepages.
297  */
298 static void
299 nfs_local_mapping_set_gfp_nofs_context(struct address_space *m)
300 {
301 	gfp_t gfp_mask = mapping_gfp_mask(m);
302 
303 	mapping_set_gfp_mask(m, (gfp_mask & ~(__GFP_FS)));
304 }
305 
306 static void
307 nfs_local_iocb_free(struct nfs_local_kiocb *iocb)
308 {
309 	kfree(iocb->bvec);
310 	kfree(iocb);
311 }
312 
313 static struct nfs_local_kiocb *
314 nfs_local_iocb_alloc(struct nfs_pgio_header *hdr,
315 		     struct file *file, gfp_t flags)
316 {
317 	struct nfs_local_kiocb *iocb;
318 
319 	iocb = kzalloc(sizeof(*iocb), flags);
320 	if (iocb == NULL)
321 		return NULL;
322 
323 	iocb->bvec = kmalloc_array(hdr->page_array.npages,
324 				   sizeof(struct bio_vec), flags);
325 	if (iocb->bvec == NULL) {
326 		kfree(iocb);
327 		return NULL;
328 	}
329 
330 	nfs_local_mapping_set_gfp_nofs_context(file->f_mapping);
331 	init_sync_kiocb(&iocb->kiocb, file);
332 
333 	iocb->hdr = hdr;
334 	iocb->kiocb.ki_pos = hdr->args.offset;
335 	iocb->kiocb.ki_flags &= ~IOCB_APPEND;
336 	iocb->kiocb.ki_complete = NULL;
337 	iocb->aio_complete_work = NULL;
338 
339 	iocb->end_iter_index = -1;
340 
341 	return iocb;
342 }
343 
/*
 * Decide whether a DIO fast path is possible for this I/O and, if so,
 * carve the byte range into up to three extents in @local_dio: a
 * misaligned start, a DIO-aligned middle, and a misaligned end.
 * Returns false when the file's DIO alignment constraints rule it out.
 */
static bool
nfs_is_local_dio_possible(struct nfs_local_kiocb *iocb, int rw,
			  size_t len, struct nfs_local_dio *local_dio)
{
	struct nfs_pgio_header *hdr = iocb->hdr;
	loff_t offset = hdr->args.offset;
	u32 nf_dio_mem_align, nf_dio_offset_align, nf_dio_read_offset_align;
	loff_t start_end, orig_end, middle_end;

	nfs_to->nfsd_file_dio_alignment(iocb->localio, &nf_dio_mem_align,
			&nf_dio_offset_align, &nf_dio_read_offset_align);
	/* Reads use their own (possibly different) offset alignment. */
	if (rw == ITER_DEST)
		nf_dio_offset_align = nf_dio_read_offset_align;

	/* A zero alignment means the underlying file doesn't support DIO. */
	if (unlikely(!nf_dio_mem_align || !nf_dio_offset_align))
		return false;
	if (unlikely(len < nf_dio_offset_align))
		return false;

	local_dio->mem_align = nf_dio_mem_align;
	local_dio->offset_align = nf_dio_offset_align;

	/* Round inward so the middle extent is aligned at both ends. */
	start_end = round_up(offset, nf_dio_offset_align);
	orig_end = offset + len;
	middle_end = round_down(orig_end, nf_dio_offset_align);

	local_dio->middle_offset = start_end;
	local_dio->end_offset = middle_end;

	local_dio->start_len = start_end - offset;
	local_dio->middle_len = middle_end - start_end;
	local_dio->end_len = orig_end - middle_end;

	if (rw == ITER_DEST)
		trace_nfs_local_dio_read(hdr->inode, offset, len, local_dio);
	else
		trace_nfs_local_dio_write(hdr->inode, offset, len, local_dio);
	return true;
}
383 
/*
 * Check that every segment of a bvec iterator satisfies the given
 * address and length masks (each mask is alignment - 1, see callers).
 * Returns true when the whole iterator is suitably aligned for DIO.
 */
static bool nfs_iov_iter_aligned_bvec(const struct iov_iter *i,
		unsigned int addr_mask, unsigned int len_mask)
{
	const struct bio_vec *bvec = i->bvec;
	size_t skip = i->iov_offset;
	size_t size = i->count;

	/* Total length must itself be aligned. */
	if (size & len_mask)
		return false;
	do {
		size_t len = bvec->bv_len;

		if (len > size)
			len = size;
		/* skip only offsets into the first segment. */
		if ((unsigned long)(bvec->bv_offset + skip) & addr_mask)
			return false;
		bvec++;
		size -= len;
		skip = 0;
	} while (size);

	return true;
}
407 
408 static void
409 nfs_local_iter_setup(struct iov_iter *iter, int rw, struct bio_vec *bvec,
410 		     unsigned int nvecs, unsigned long total,
411 		     size_t start, size_t len)
412 {
413 	iov_iter_bvec(iter, rw, bvec, nvecs, total);
414 	if (start)
415 		iov_iter_advance(iter, start);
416 	iov_iter_truncate(iter, len);
417 }
418 
419 /*
420  * Setup as many as 3 iov_iter based on extents described by @local_dio.
421  * Returns the number of iov_iter that were setup.
422  */
static int
nfs_local_iters_setup_dio(struct nfs_local_kiocb *iocb, int rw,
			  unsigned int nvecs, unsigned long total,
			  struct nfs_local_dio *local_dio)
{
	int n_iters = 0;
	struct iov_iter *iters = iocb->iters;

	/* Setup misaligned start? */
	if (local_dio->start_len) {
		nfs_local_iter_setup(&iters[n_iters], rw, iocb->bvec,
				     nvecs, total, 0, local_dio->start_len);
		++n_iters;
	}

	/*
	 * Setup DIO-aligned middle, if there is no misaligned end (below)
	 * then AIO completion is used, see nfs_local_call_{read,write}
	 */
	nfs_local_iter_setup(&iters[n_iters], rw, iocb->bvec, nvecs,
			     total, local_dio->start_len, local_dio->middle_len);

	/* Memory alignment depends on the bvec layout, so verify it here. */
	iocb->iter_is_dio_aligned[n_iters] =
		nfs_iov_iter_aligned_bvec(&iters[n_iters],
			local_dio->mem_align-1, local_dio->offset_align-1);

	if (unlikely(!iocb->iter_is_dio_aligned[n_iters])) {
		trace_nfs_local_dio_misaligned(iocb->hdr->inode,
			local_dio->start_len, local_dio->middle_len, local_dio);
		return 0; /* no DIO-aligned IO possible */
	}
	/* end_iter_index tracks the last iter that will be issued. */
	iocb->end_iter_index = n_iters;
	++n_iters;

	/* Setup misaligned end? */
	if (local_dio->end_len) {
		nfs_local_iter_setup(&iters[n_iters], rw, iocb->bvec,
				     nvecs, total, local_dio->start_len +
				     local_dio->middle_len, local_dio->end_len);
		iocb->end_iter_index = n_iters;
		++n_iters;
	}

	atomic_set(&iocb->n_iters, n_iters);
	return n_iters;
}
469 
/*
 * Convert @hdr's pagelist into bio_vecs and build the iov_iter(s) that
 * nfs_local_call_{read,write} will submit: either a multi-extent DIO
 * setup, or a single buffered iterator as the fallback.
 */
static noinline_for_stack void
nfs_local_iters_init(struct nfs_local_kiocb *iocb, int rw)
{
	struct nfs_pgio_header *hdr = iocb->hdr;
	struct page **pagevec = hdr->page_array.pagevec;
	unsigned long v, total;
	unsigned int base;
	size_t len;

	/* Map each page (honoring the initial pgbase offset) to a bio_vec. */
	v = 0;
	total = hdr->args.count;
	base = hdr->args.pgbase;
	pagevec += base >> PAGE_SHIFT;
	base &= ~PAGE_MASK;
	while (total && v < hdr->page_array.npages) {
		len = min_t(size_t, total, PAGE_SIZE - base);
		bvec_set_page(&iocb->bvec[v], *pagevec, len, base);
		total -= len;
		++pagevec;
		++v;
		base = 0;	/* only the first page carries an offset */
	}
	len = hdr->args.count - total;	/* bytes actually mapped */

	/*
	 * For each iocb, iocb->n_iters is always at least 1 and we always
	 * end io after first nfs_local_pgio_done call unless misaligned DIO.
	 */
	atomic_set(&iocb->n_iters, 1);

	if (test_bit(NFS_IOHDR_ODIRECT, &hdr->flags)) {
		struct nfs_local_dio local_dio;

		if (nfs_is_local_dio_possible(iocb, rw, len, &local_dio) &&
		    nfs_local_iters_setup_dio(iocb, rw, v, len, &local_dio) != 0) {
			/* Ensure DIO WRITE's IO on stable storage upon completion */
			if (rw == ITER_SOURCE)
				iocb->kiocb.ki_flags |= IOCB_DSYNC|IOCB_SYNC;
			return; /* is DIO-aligned */
		}
	}

	/* Use buffered IO */
	iov_iter_bvec(&iocb->iters[0], rw, iocb->bvec, v, len);
}
515 
/* Finish @hdr via the normal RPC completion callbacks (done then release). */
static void
nfs_local_hdr_release(struct nfs_pgio_header *hdr,
		const struct rpc_call_ops *call_ops)
{
	call_ops->rpc_call_done(&hdr->task, hdr);
	call_ops->rpc_release(hdr);
}
523 
524 static void
525 nfs_local_pgio_init(struct nfs_pgio_header *hdr,
526 		const struct rpc_call_ops *call_ops)
527 {
528 	hdr->task.tk_ops = call_ops;
529 	if (!hdr->task.tk_start)
530 		hdr->task.tk_start = ktime_get();
531 }
532 
/*
 * Fold one iov_iter's completion status into @hdr.  Returns true when
 * this was the last outstanding iter, i.e. the caller must now finish
 * the request.
 */
static bool nfs_local_pgio_done(struct nfs_local_kiocb *iocb, long status)
{
	struct nfs_pgio_header *hdr = iocb->hdr;

	/* Must handle partial completions */
	if (status >= 0) {
		hdr->res.count += status;
		/* @hdr was initialized to 0 (zeroed during allocation) */
		if (hdr->task.tk_status == 0)
			hdr->res.op_status = NFS4_OK;
	} else {
		hdr->res.op_status = nfs_localio_errno_to_nfs4_stat(status);
		hdr->task.tk_status = status;
	}

	BUG_ON(atomic_read(&iocb->n_iters) <= 0);
	return atomic_dec_and_test(&iocb->n_iters);
}
551 
/* Drop the nfsd_file reference, then free the iocb itself. */
static void
nfs_local_iocb_release(struct nfs_local_kiocb *iocb)
{
	nfs_local_file_put(iocb->localio);
	nfs_local_iocb_free(iocb);
}
558 
/*
 * Re-drive the I/O described by @hdr (requested by call_ops setting
 * tk_action in nfs_local_pgio_release()).  The kiocb is reset to its
 * initial, non-DIO state before the iterators are rebuilt.
 */
static void nfs_local_pgio_restart(struct nfs_local_kiocb *iocb,
				   struct nfs_pgio_header *hdr)
{
	int status = 0;

	iocb->kiocb.ki_pos = hdr->args.offset;
	iocb->kiocb.ki_flags &= ~(IOCB_DSYNC | IOCB_SYNC | IOCB_DIRECT);
	iocb->kiocb.ki_complete = NULL;
	iocb->aio_complete_work = NULL;
	iocb->end_iter_index = -1;

	switch (hdr->rw_mode) {
	case FMODE_READ:
		nfs_local_iters_init(iocb, ITER_DEST);
		nfs_local_do_read(iocb, hdr->task.tk_ops);
		break;
	case FMODE_WRITE:
		nfs_local_iters_init(iocb, ITER_SOURCE);
		nfs_local_do_write(iocb, hdr->task.tk_ops);
		break;
	default:
		status = -EOPNOTSUPP;
	}

	/* On unsupported modes, fail the request through the normal path. */
	if (unlikely(status != 0)) {
		nfs_local_iocb_release(iocb);
		hdr->task.tk_status = status;
		nfs_local_hdr_release(hdr, hdr->task.tk_ops);
	}
}
589 
static void nfs_local_pgio_release(struct nfs_local_kiocb *iocb)
{
	struct nfs_pgio_header *hdr = iocb->hdr;
	struct rpc_task *task = &hdr->task;

	/* rpc_call_done() may set tk_action non-NULL to request that the
	 * I/O be restarted; a NULL tk_action afterwards means we're done.
	 */
	task->tk_action = NULL;
	task->tk_ops->rpc_call_done(task, hdr);

	if (task->tk_action == NULL) {
		nfs_local_iocb_release(iocb);
		task->tk_ops->rpc_release(hdr);
	} else
		nfs_local_pgio_restart(iocb, hdr);
}
604 
605 /*
606  * Complete the I/O from iocb->kiocb.ki_complete()
607  *
608  * Note that this function can be called from a bottom half context,
609  * hence we need to queue the rpc_call_done() etc to a workqueue
610  */
static inline void nfs_local_pgio_aio_complete(struct nfs_local_kiocb *iocb)
{
	/* Re-purpose iocb->work to run the completion in process context. */
	INIT_WORK(&iocb->work, iocb->aio_complete_work);
	queue_work(nfsiod_workqueue, &iocb->work);
}
616 
/* Post-process a completed local read: fix up res fields and detect EOF. */
static void nfs_local_read_done(struct nfs_local_kiocb *iocb)
{
	struct nfs_pgio_header *hdr = iocb->hdr;
	struct file *filp = iocb->kiocb.ki_filp;
	long status = hdr->task.tk_status;

	if ((iocb->kiocb.ki_flags & IOCB_DIRECT) && status == -EINVAL) {
		/* Underlying FS will return -EINVAL if misaligned DIO is attempted. */
		pr_info_ratelimited("nfs: Unexpected direct I/O read alignment failure\n");
	}

	/*
	 * Must clear replen otherwise NFSv3 data corruption will occur
	 * if/when switching from LOCALIO back to using normal RPC.
	 */
	hdr->res.replen = 0;

	/* nfs_readpage_result() handles short read */

	if (hdr->args.offset + hdr->res.count >= i_size_read(file_inode(filp)))
		hdr->res.eof = true;

	dprintk("%s: read %ld bytes eof %d.\n", __func__,
			status > 0 ? status : 0, hdr->res.eof);
}
642 
/* Common completion path shared by sync and AIO local reads. */
static inline void nfs_local_read_iocb_done(struct nfs_local_kiocb *iocb)
{
	nfs_local_read_done(iocb);
	nfs_local_pgio_release(iocb);
}
648 
/* Process-context half of AIO read completion, queued from ki_complete. */
static void nfs_local_read_aio_complete_work(struct work_struct *work)
{
	struct nfs_local_kiocb *iocb =
		container_of(work, struct nfs_local_kiocb, work);

	nfs_local_read_iocb_done(iocb);
}
656 
/* ki_complete callback for DIO reads; may run in bottom-half context. */
static void nfs_local_read_aio_complete(struct kiocb *kiocb, long ret)
{
	struct nfs_local_kiocb *iocb =
		container_of(kiocb, struct nfs_local_kiocb, kiocb);

	/* AIO completion of DIO read should always be last to complete */
	if (unlikely(!nfs_local_pgio_done(iocb, ret)))
		return;

	nfs_local_pgio_aio_complete(iocb); /* Calls nfs_local_read_aio_complete_work */
}
668 
/*
 * Workqueue body: submit each prepared iov_iter to the local file.
 * DIO-aligned segments are issued with IOCB_DIRECT; when the final
 * segment is DIO it completes asynchronously via ki_complete.
 */
static void nfs_local_call_read(struct work_struct *work)
{
	struct nfs_local_kiocb *iocb =
		container_of(work, struct nfs_local_kiocb, work);
	struct file *filp = iocb->kiocb.ki_filp;
	ssize_t status;
	int n_iters;

	n_iters = atomic_read(&iocb->n_iters);
	for (int i = 0; i < n_iters ; i++) {
		if (iocb->iter_is_dio_aligned[i]) {
			iocb->kiocb.ki_flags |= IOCB_DIRECT;
			/* Only use AIO completion if DIO-aligned segment is last */
			if (i == iocb->end_iter_index) {
				iocb->kiocb.ki_complete = nfs_local_read_aio_complete;
				iocb->aio_complete_work = nfs_local_read_aio_complete_work;
			}
		} else
			iocb->kiocb.ki_flags &= ~IOCB_DIRECT;

		/* Issue the read with the nfsd_file's credentials. */
		scoped_with_creds(filp->f_cred)
			status = filp->f_op->read_iter(&iocb->kiocb, &iocb->iters[i]);

		if (status == -EIOCBQUEUED)
			continue;
		/* Break on completion, errors, or short reads */
		if (nfs_local_pgio_done(iocb, status) || status < 0 ||
		    (size_t)status < iov_iter_count(&iocb->iters[i])) {
			nfs_local_read_iocb_done(iocb);
			break;
		}
	}
}
702 
/* Kick off a local read: set up the rpc task bits and queue the work. */
static void nfs_local_do_read(struct nfs_local_kiocb *iocb,
			      const struct rpc_call_ops *call_ops)
{
	struct nfs_pgio_header *hdr = iocb->hdr;

	dprintk("%s: vfs_read count=%u pos=%llu\n",
		__func__, hdr->args.count, hdr->args.offset);

	nfs_local_pgio_init(hdr, call_ops);
	hdr->res.eof = false;

	INIT_WORK(&iocb->work, nfs_local_call_read);
	queue_work(nfslocaliod_workqueue, &iocb->work);
}
717 
/* Copy the client's boot-time write verifier; seqlock retry avoids
 * tearing against a concurrent nfs_reset_boot_verifier().
 */
static void
nfs_copy_boot_verifier(struct nfs_write_verifier *verifier, struct inode *inode)
{
	struct nfs_client *clp = NFS_SERVER(inode)->nfs_client;
	u32 *verf = (u32 *)verifier->data;
	unsigned int seq;

	do {
		seq = read_seqbegin(&clp->cl_boot_lock);
		verf[0] = (u32)clp->cl_nfssvc_boot.tv_sec;
		verf[1] = (u32)clp->cl_nfssvc_boot.tv_nsec;
	} while (read_seqretry(&clp->cl_boot_lock, seq));
}
731 
/* Refresh the boot verifier; callers do this after failed writes/commits
 * so subsequent verifier comparisons mismatch and data gets resent.
 */
static void
nfs_reset_boot_verifier(struct inode *inode)
{
	struct nfs_client *clp = NFS_SERVER(inode)->nfs_client;

	write_seqlock(&clp->cl_boot_lock);
	ktime_get_real_ts64(&clp->cl_nfssvc_boot);
	write_sequnlock(&clp->cl_boot_lock);
}
741 
742 static void
743 nfs_set_local_verifier(struct inode *inode,
744 		struct nfs_writeverf *verf,
745 		enum nfs3_stable_how how)
746 {
747 	nfs_copy_boot_verifier(&verf->verifier, inode);
748 	verf->committed = how;
749 }
750 
751 /* Factored out from fs/nfsd/vfs.h:fh_getattr() */
752 static int __vfs_getattr(const struct path *p, struct kstat *stat, int version)
753 {
754 	u32 request_mask = STATX_BASIC_STATS;
755 
756 	if (version == 4)
757 		request_mask |= (STATX_BTIME | STATX_CHANGE_COOKIE);
758 	return vfs_getattr(p, stat, request_mask, AT_STATX_SYNC_AS_STAT);
759 }
760 
761 /* Copied from fs/nfsd/nfsfh.c:nfsd4_change_attribute() */
static u64 __nfsd4_change_attribute(const struct kstat *stat,
				    const struct inode *inode)
{
	u64 chattr;

	if (stat->result_mask & STATX_CHANGE_COOKIE) {
		chattr = stat->change_cookie;
		/* Fold ctime into the cookie when it isn't monotonic. */
		if (S_ISREG(inode->i_mode) &&
		    !(stat->attributes & STATX_ATTR_CHANGE_MONOTONIC)) {
			chattr += (u64)stat->ctime.tv_sec << 30;
			chattr += stat->ctime.tv_nsec;
		}
	} else {
		/* No change cookie available: synthesize one from ctime. */
		chattr = time_to_chattr(&stat->ctime);
	}
	return chattr;
}
779 
/*
 * Refresh hdr->res.fattr straight from the local file after a write so
 * the client's attribute cache stays coherent.  Silently a no-op when
 * no fattr was supplied or the getattr fails.
 */
static void nfs_local_vfs_getattr(struct nfs_local_kiocb *iocb)
{
	struct kstat stat;
	struct file *filp = iocb->kiocb.ki_filp;
	struct nfs_pgio_header *hdr = iocb->hdr;
	struct nfs_fattr *fattr = hdr->res.fattr;
	int version = NFS_PROTO(hdr->inode)->version;

	if (unlikely(!fattr) || __vfs_getattr(&filp->f_path, &stat, version))
		return;

	fattr->valid = (NFS_ATTR_FATTR_FILEID |
			NFS_ATTR_FATTR_CHANGE |
			NFS_ATTR_FATTR_SIZE |
			NFS_ATTR_FATTR_ATIME |
			NFS_ATTR_FATTR_MTIME |
			NFS_ATTR_FATTR_CTIME |
			NFS_ATTR_FATTR_SPACE_USED);

	fattr->fileid = stat.ino;
	fattr->size = stat.size;
	fattr->atime = stat.atime;
	fattr->mtime = stat.mtime;
	fattr->ctime = stat.ctime;
	/* v4 uses the nfsd-style change attribute; v3 derives it from ctime. */
	if (version == 4) {
		fattr->change_attr =
			__nfsd4_change_attribute(&stat, file_inode(filp));
	} else
		fattr->change_attr = nfs_timespec_to_change_attr(&fattr->ctime);
	fattr->du.nfs3.used = stat.blocks << 9;	/* 512-byte blocks to bytes */
}
811 
/* Post-process a completed local write. */
static void nfs_local_write_done(struct nfs_local_kiocb *iocb)
{
	struct nfs_pgio_header *hdr = iocb->hdr;
	long status = hdr->task.tk_status;

	dprintk("%s: wrote %ld bytes.\n", __func__, status > 0 ? status : 0);

	if ((iocb->kiocb.ki_flags & IOCB_DIRECT) && status == -EINVAL) {
		/* Underlying FS will return -EINVAL if misaligned DIO is attempted. */
		pr_info_ratelimited("nfs: Unexpected direct I/O write alignment failure\n");
	}

	/* A failed write invalidates the verifier so writes are resent. */
	if (status < 0)
		nfs_reset_boot_verifier(hdr->inode);
}
827 
/* Common completion path shared by sync and AIO local writes. */
static inline void nfs_local_write_iocb_done(struct nfs_local_kiocb *iocb)
{
	nfs_local_write_done(iocb);
	nfs_local_vfs_getattr(iocb);
	nfs_local_pgio_release(iocb);
}
834 
/* Process-context half of AIO write completion, queued from ki_complete. */
static void nfs_local_write_aio_complete_work(struct work_struct *work)
{
	struct nfs_local_kiocb *iocb =
		container_of(work, struct nfs_local_kiocb, work);

	nfs_local_write_iocb_done(iocb);
}
842 
/* ki_complete callback for DIO writes; may run in bottom-half context. */
static void nfs_local_write_aio_complete(struct kiocb *kiocb, long ret)
{
	struct nfs_local_kiocb *iocb =
		container_of(kiocb, struct nfs_local_kiocb, kiocb);

	/* AIO completion of DIO write should always be last to complete */
	if (unlikely(!nfs_local_pgio_done(iocb, ret)))
		return;

	nfs_local_pgio_aio_complete(iocb); /* Calls nfs_local_write_aio_complete_work */
}
854 
/*
 * Workqueue body: submit each prepared iov_iter as a write to the local
 * file.  PF_MEMALLOC_NOIO forces GFP_NOIO allocations and
 * PF_LOCAL_THROTTLE limits writeback throttling while writing on
 * behalf of the NFS client.
 */
static void nfs_local_call_write(struct work_struct *work)
{
	struct nfs_local_kiocb *iocb =
		container_of(work, struct nfs_local_kiocb, work);
	struct file *filp = iocb->kiocb.ki_filp;
	unsigned long old_flags = current->flags;
	ssize_t status;
	int n_iters;

	current->flags |= PF_LOCAL_THROTTLE | PF_MEMALLOC_NOIO;

	file_start_write(filp);
	n_iters = atomic_read(&iocb->n_iters);
	for (int i = 0; i < n_iters ; i++) {
		if (iocb->iter_is_dio_aligned[i]) {
			iocb->kiocb.ki_flags |= IOCB_DIRECT;
			/* Only use AIO completion if DIO-aligned segment is last */
			if (i == iocb->end_iter_index) {
				iocb->kiocb.ki_complete = nfs_local_write_aio_complete;
				iocb->aio_complete_work = nfs_local_write_aio_complete_work;
			}
		} else
			iocb->kiocb.ki_flags &= ~IOCB_DIRECT;

		/* Issue the write with the nfsd_file's credentials. */
		scoped_with_creds(filp->f_cred)
			status = filp->f_op->write_iter(&iocb->kiocb, &iocb->iters[i]);

		if (status == -EIOCBQUEUED)
			continue;
		/* Break on completion, errors, or short writes */
		if (nfs_local_pgio_done(iocb, status) || status < 0 ||
		    (size_t)status < iov_iter_count(&iocb->iters[i])) {
			nfs_local_write_iocb_done(iocb);
			break;
		}
	}
	file_end_write(filp);

	current->flags = old_flags;
}
895 
/* Kick off a local write: map stable_how to sync flags and queue the work. */
static void nfs_local_do_write(struct nfs_local_kiocb *iocb,
			       const struct rpc_call_ops *call_ops)
{
	struct nfs_pgio_header *hdr = iocb->hdr;

	dprintk("%s: vfs_write count=%u pos=%llu %s\n",
		__func__, hdr->args.count, hdr->args.offset,
		(hdr->args.stable == NFS_UNSTABLE) ?  "unstable" : "stable");

	/* NFS_DATA_SYNC maps to O_DSYNC semantics; NFS_FILE_SYNC to O_SYNC. */
	switch (hdr->args.stable) {
	default:
		break;
	case NFS_DATA_SYNC:
		iocb->kiocb.ki_flags |= IOCB_DSYNC;
		break;
	case NFS_FILE_SYNC:
		iocb->kiocb.ki_flags |= IOCB_DSYNC|IOCB_SYNC;
	}

	nfs_local_pgio_init(hdr, call_ops);

	nfs_set_local_verifier(hdr->inode, hdr->res.verf, hdr->args.stable);

	INIT_WORK(&iocb->work, nfs_local_call_write);
	queue_work(nfslocaliod_workqueue, &iocb->work);
}
922 
923 static struct nfs_local_kiocb *
924 nfs_local_iocb_init(struct nfs_pgio_header *hdr, struct nfsd_file *localio)
925 {
926 	struct file *file = nfs_to->nfsd_file_file(localio);
927 	struct nfs_local_kiocb *iocb;
928 	gfp_t gfp_mask;
929 	int rw;
930 
931 	if (hdr->rw_mode & FMODE_READ) {
932 		if (!file->f_op->read_iter)
933 			return ERR_PTR(-EOPNOTSUPP);
934 		gfp_mask = GFP_KERNEL;
935 		rw = ITER_DEST;
936 	} else {
937 		if (!file->f_op->write_iter)
938 			return ERR_PTR(-EOPNOTSUPP);
939 		gfp_mask = GFP_NOIO;
940 		rw = ITER_SOURCE;
941 	}
942 
943 	iocb = nfs_local_iocb_alloc(hdr, file, gfp_mask);
944 	if (iocb == NULL)
945 		return ERR_PTR(-ENOMEM);
946 	iocb->hdr = hdr;
947 	iocb->localio = localio;
948 
949 	nfs_local_iters_init(iocb, rw);
950 
951 	return iocb;
952 }
953 
/*
 * nfs_local_doio - issue an NFS read or write pageio request locally
 *
 * Returns 0 when the I/O was queued (or the request was empty).  On
 * failure the completion callbacks are invoked with the error before
 * the error is returned.
 */
int nfs_local_doio(struct nfs_client *clp, struct nfsd_file *localio,
		   struct nfs_pgio_header *hdr,
		   const struct rpc_call_ops *call_ops)
{
	struct nfs_local_kiocb *iocb;
	int status = 0;

	/* Nothing to transfer. */
	if (!hdr->args.count)
		return 0;

	iocb = nfs_local_iocb_init(hdr, localio);
	if (IS_ERR(iocb))
		return PTR_ERR(iocb);

	switch (hdr->rw_mode) {
	case FMODE_READ:
		nfs_local_do_read(iocb, call_ops);
		break;
	case FMODE_WRITE:
		nfs_local_do_write(iocb, call_ops);
		break;
	default:
		dprintk("%s: invalid mode: %d\n", __func__,
			hdr->rw_mode);
		status = -EOPNOTSUPP;
	}

	if (unlikely(status != 0)) {
		nfs_local_iocb_release(iocb);
		hdr->task.tk_status = status;
		nfs_local_hdr_release(hdr, call_ops);
	}
	return status;
}
988 
/* Attach the caller's completion ops to the commit's embedded rpc_task. */
static void
nfs_local_init_commit(struct nfs_commit_data *data,
		const struct rpc_call_ops *call_ops)
{
	data->task.tk_ops = call_ops;
}
995 
/* fsync the byte range covered by @data (to EOF when count == 0). */
static int
nfs_local_run_commit(struct file *filp, struct nfs_commit_data *data)
{
	loff_t start = data->args.offset;
	loff_t end = LLONG_MAX;

	if (data->args.count > 0) {
		end = start + data->args.count - 1;
		/* Guard against offset + count overflowing loff_t. */
		if (end < start)
			end = LLONG_MAX;
	}

	nfs_local_mapping_set_gfp_nofs_context(filp->f_mapping);

	dprintk("%s: commit %llu - %llu\n", __func__, start, end);
	return vfs_fsync_range(filp, start, end, 0);
}
1013 
1014 static void
1015 nfs_local_commit_done(struct nfs_commit_data *data, int status)
1016 {
1017 	if (status >= 0) {
1018 		nfs_set_local_verifier(data->inode,
1019 				data->res.verf,
1020 				NFS_FILE_SYNC);
1021 		data->res.op_status = NFS4_OK;
1022 		data->task.tk_status = 0;
1023 	} else {
1024 		nfs_reset_boot_verifier(data->inode);
1025 		data->res.op_status = nfs_localio_errno_to_nfs4_stat(status);
1026 		data->task.tk_status = status;
1027 	}
1028 }
1029 
/* Drop the file reference, then run the commit completion callbacks. */
static void
nfs_local_release_commit_data(struct nfsd_file *localio,
		struct nfs_commit_data *data,
		const struct rpc_call_ops *call_ops)
{
	nfs_local_file_put(localio);
	call_ops->rpc_call_done(&data->task, data);
	call_ops->rpc_release(data);
}
1039 
/* Finish the commit via its callbacks and free the work context. */
static void
nfs_local_fsync_ctx_free(struct nfs_local_fsync_ctx *ctx)
{
	nfs_local_release_commit_data(ctx->localio, ctx->data,
				      ctx->data->task.tk_ops);
	kfree(ctx);
}
1047 
/* Workqueue body: run the commit, record status, wake any FLUSH_SYNC
 * waiter, then release the context.
 */
static void
nfs_local_fsync_work(struct work_struct *work)
{
	unsigned long old_flags = current->flags;
	struct nfs_local_fsync_ctx *ctx;
	int status;

	ctx = container_of(work, struct nfs_local_fsync_ctx, work);

	/* Same reclaim-safety flags as the local write path. */
	current->flags |= PF_LOCAL_THROTTLE | PF_MEMALLOC_NOIO;

	status = nfs_local_run_commit(nfs_to->nfsd_file_file(ctx->localio),
				      ctx->data);
	nfs_local_commit_done(ctx->data, status);
	/* The completion lives on the waiter's stack: signal before free. */
	if (ctx->done != NULL)
		complete(ctx->done);
	nfs_local_fsync_ctx_free(ctx);

	current->flags = old_flags;
}
1068 
1069 static struct nfs_local_fsync_ctx *
1070 nfs_local_fsync_ctx_alloc(struct nfs_commit_data *data,
1071 			  struct nfsd_file *localio, gfp_t flags)
1072 {
1073 	struct nfs_local_fsync_ctx *ctx = kmalloc(sizeof(*ctx), flags);
1074 
1075 	if (ctx != NULL) {
1076 		ctx->localio = localio;
1077 		ctx->data = data;
1078 		INIT_WORK(&ctx->work, nfs_local_fsync_work);
1079 		ctx->done = NULL;
1080 	}
1081 	return ctx;
1082 }
1083 
/*
 * nfs_local_commit - commit (fsync) local file data
 *
 * With FLUSH_SYNC the caller blocks on an on-stack completion until the
 * workqueue item finishes; otherwise the commit runs fully async.
 * Returns 0 on success, -ENOMEM if the work context cannot be allocated
 * (in which case completion callbacks fire with the error first).
 */
int nfs_local_commit(struct nfsd_file *localio,
		     struct nfs_commit_data *data,
		     const struct rpc_call_ops *call_ops, int how)
{
	struct nfs_local_fsync_ctx *ctx;

	ctx = nfs_local_fsync_ctx_alloc(data, localio, GFP_NOIO);
	if (!ctx) {
		nfs_local_commit_done(data, -ENOMEM);
		nfs_local_release_commit_data(localio, data, call_ops);
		return -ENOMEM;
	}

	nfs_local_init_commit(data, call_ops);

	if (how & FLUSH_SYNC) {
		DECLARE_COMPLETION_ONSTACK(done);
		ctx->done = &done;
		queue_work(nfslocaliod_workqueue, &ctx->work);
		wait_for_completion(&done);
	} else
		queue_work(nfslocaliod_workqueue, &ctx->work);

	return 0;
}
1109