// SPDX-License-Identifier: GPL-2.0-only
/*
 * linux/fs/nfs/direct.c
 *
 * Copyright (C) 2003 by Chuck Lever <cel@netapp.com>
 *
 * High-performance uncached I/O for the Linux NFS client
 *
 * There are important applications whose performance or correctness
 * depends on uncached access to file data.  Database clusters
 * (multiple copies of the same instance running on separate hosts)
 * implement their own cache coherency protocol that subsumes file
 * system cache protocols.  Applications that process datasets
 * considerably larger than the client's memory do not always benefit
 * from a local cache.  A streaming video server, for instance, has no
 * need to cache the contents of a file.
 *
 * When an application requests uncached I/O, all read and write requests
 * are made directly to the server; data stored or fetched via these
 * requests is not cached in the Linux page cache.  The client does not
 * correct unaligned requests from applications.  All requested bytes are
 * held on permanent storage before a direct write system call returns to
 * an application.
 *
 * Solaris implements an uncached I/O facility called directio() that
 * is used for backups and sequential I/O to very large files.  Solaris
 * also supports uncaching whole NFS partitions with "-o forcedirectio,"
 * an undocumented mount option.
 *
 * Designed by Jeff Kimmel, Chuck Lever, and Trond Myklebust, with
 * help from Andrew Morton.
 *
 * 18 Dec 2001	Initial implementation for 2.4  --cel
 * 08 Jul 2002	Version for 2.4.19, with bug fixes --trondmy
 * 08 Jun 2003	Port to 2.5 APIs  --cel
 * 31 Mar 2004	Handle direct I/O without VFS support  --cel
 * 15 Sep 2004	Parallel async reads  --cel
 * 04 May 2005	support O_DIRECT with aio  --cel
 *
 */

#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/file.h>
#include <linux/pagemap.h>
#include <linux/kref.h>
#include <linux/slab.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/module.h>

#include <linux/nfs_fs.h>
#include <linux/nfs_page.h>
#include <linux/sunrpc/clnt.h>

#include <linux/uaccess.h>
#include <linux/atomic.h>

#include "delegation.h"
#include "internal.h"
#include "iostat.h"
#include "pnfs.h"
#include "fscache.h"
#include "nfstrace.h"

#define NFSDBG_FACILITY		NFSDBG_VFS

static struct kmem_cache *nfs_direct_cachep;

static const struct nfs_pgio_completion_ops nfs_direct_write_completion_ops;
static const struct nfs_commit_completion_ops nfs_direct_commit_completion_ops;
static void nfs_direct_write_complete(struct nfs_direct_req *dreq);
static void nfs_direct_write_schedule_work(struct work_struct *work);

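/*
 * Each in-flight RPC holds a reference on the nfs_direct_req via
 * io_count.  get_dreq() takes such a reference; put_dreq() drops it
 * and returns true when the last reference goes away, at which point
 * the caller is responsible for completing the request.
 */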
static inline void get_dreq(struct nfs_direct_req *dreq)
{
	atomic_inc(&dreq->io_count);
}

static inline int put_dreq(struct nfs_direct_req *dreq)
{
	return atomic_dec_and_test(&dreq->io_count);
}

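/*
 * If the server returned a short transfer (EOF) or an error, clamp
 * dreq->max_count and dreq->count so that the byte count eventually
 * reported to the caller does not extend past the truncation point.
 * The first error encountered is recorded in dreq->error.
 */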
static void
nfs_direct_handle_truncated(struct nfs_direct_req *dreq,
			    const struct nfs_pgio_header *hdr,
			    ssize_t dreq_len)
{
	if (!(test_bit(NFS_IOHDR_ERROR, &hdr->flags) ||
	      test_bit(NFS_IOHDR_EOF, &hdr->flags)))
		return;
	if (dreq->max_count >= dreq_len) {
		dreq->max_count = dreq_len;
		if (dreq->count > dreq_len)
			dreq->count = dreq_len;
	}

	if (test_bit(NFS_IOHDR_ERROR, &hdr->flags) && !dreq->error)
		dreq->error = hdr->error;
}

static void
nfs_direct_count_bytes(struct nfs_direct_req *dreq,
		       const struct nfs_pgio_header *hdr)
{
	loff_t hdr_end = hdr->io_start + hdr->good_bytes;
	ssize_t dreq_len = 0;

	if (hdr_end > dreq->io_start)
		dreq_len = hdr_end - dreq->io_start;

	nfs_direct_handle_truncated(dreq, hdr, dreq_len);

	if (dreq_len > dreq->max_count)
		dreq_len = dreq->max_count;

	if (dreq->count < dreq_len)
		dreq->count = dreq_len;
}

static void nfs_direct_truncate_request(struct nfs_direct_req *dreq,
					struct nfs_page *req)
{
	loff_t offs = req_offset(req);
	size_t req_start = (size_t)(offs - dreq->io_start);

	if (req_start < dreq->max_count)
		dreq->max_count = req_start;
	if (req_start < dreq->count)
		dreq->count = req_start;
}

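/*
 * Grow the cached inode size after a direct write that extended the
 * file.  Caller must hold inode->i_lock.
 */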
static void nfs_direct_file_adjust_size_locked(struct inode *inode,
					       loff_t offset, size_t count)
{
	loff_t newsize = offset + (loff_t)count;
	loff_t oldsize = i_size_read(inode);

	if (newsize > oldsize) {
		i_size_write(inode, newsize);
		NFS_I(inode)->cache_validity &= ~NFS_INO_INVALID_SIZE;
		trace_nfs_size_grow(inode, newsize);
		nfs_inc_stats(inode, NFSIOS_EXTENDWRITE);
	}
}

/**
 * nfs_swap_rw - NFS address space operation for swap I/O
 * @iocb: target I/O control block
 * @iter: I/O buffer
 *
 * Perform IO to the swap-file.  This is much like direct IO.
 */
int nfs_swap_rw(struct kiocb *iocb, struct iov_iter *iter)
{
	ssize_t ret;

	if (iov_iter_rw(iter) == READ)
		ret = nfs_file_direct_read(iocb, iter, true);
	else
		ret = nfs_file_direct_write(iocb, iter, true);
	if (ret < 0)
		return ret;
	return 0;
}

static void nfs_direct_release_pages(struct page **pages, unsigned int npages)
{
	unsigned int i;
	for (i = 0; i < npages; i++)
		put_page(pages[i]);
}

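/*
 * Set up the commit info used to track unstable writes issued on
 * behalf of this direct request.
 */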
void nfs_init_cinfo_from_dreq(struct nfs_commit_info *cinfo,
			      struct nfs_direct_req *dreq)
{
	cinfo->inode = dreq->inode;
	cinfo->mds = &dreq->mds_cinfo;
	cinfo->ds = &dreq->ds_cinfo;
	cinfo->dreq = dreq;
	cinfo->completion_ops = &nfs_direct_commit_completion_ops;
}

static inline struct nfs_direct_req *nfs_direct_req_alloc(void)
{
	struct nfs_direct_req *dreq;

	dreq = kmem_cache_zalloc(nfs_direct_cachep, GFP_KERNEL);
	if (!dreq)
		return NULL;

	kref_init(&dreq->kref);
	kref_get(&dreq->kref);
	init_completion(&dreq->completion);
	INIT_LIST_HEAD(&dreq->mds_cinfo.list);
	pnfs_init_ds_commit_info(&dreq->ds_cinfo);
	INIT_WORK(&dreq->work, nfs_direct_write_schedule_work);
	spin_lock_init(&dreq->lock);

	return dreq;
}

static void nfs_direct_req_free(struct kref *kref)
{
	struct nfs_direct_req *dreq = container_of(kref, struct nfs_direct_req, kref);

	pnfs_release_ds_info(&dreq->ds_cinfo, dreq->inode);
	if (dreq->l_ctx != NULL)
		nfs_put_lock_context(dreq->l_ctx);
	if (dreq->ctx != NULL)
		put_nfs_open_context(dreq->ctx);
	kmem_cache_free(nfs_direct_cachep, dreq);
}

static void nfs_direct_req_release(struct nfs_direct_req *dreq)
{
	kref_put(&dreq->kref, nfs_direct_req_free);
}

ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq, loff_t offset)
{
	loff_t start = offset - dreq->io_start;
	return dreq->max_count - start;
}
EXPORT_SYMBOL_GPL(nfs_dreq_bytes_left);

/*
 * Collects and returns the final error value/byte-count.
 */
static ssize_t nfs_direct_wait(struct nfs_direct_req *dreq)
{
	ssize_t result = -EIOCBQUEUED;

	/* Async requests don't wait here */
	if (dreq->iocb)
		goto out;

	result = wait_for_completion_killable(&dreq->completion);

	if (!result) {
		result = dreq->count;
		WARN_ON_ONCE(dreq->count < 0);
	}
	if (!result)
		result = dreq->error;

out:
	return (ssize_t) result;
}

/*
 * Synchronous I/O uses a stack-allocated iocb.  Thus we can't trust
 * the iocb is still valid here if this is a synchronous request.
 */
static void nfs_direct_complete(struct nfs_direct_req *dreq)
{
	struct inode *inode = dreq->inode;

	inode_dio_end(inode);

	if (dreq->iocb) {
		long res = (long) dreq->error;
		if (dreq->count != 0) {
			res = (long) dreq->count;
			WARN_ON_ONCE(dreq->count < 0);
		}
		dreq->iocb->ki_complete(dreq->iocb, res);
	}

	complete(&dreq->completion);

	nfs_direct_req_release(dreq);
}

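/*
 * Per-RPC read completion: account the bytes that were transferred,
 * dirty the pages that received data (when NFS_ODIRECT_SHOULD_DIRTY
 * is set), and release the page requests.  The dreq is completed once
 * the last outstanding reference is dropped.
 */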
static void nfs_direct_read_completion(struct nfs_pgio_header *hdr)
{
	unsigned long bytes = 0;
	struct nfs_direct_req *dreq = hdr->dreq;

	spin_lock(&dreq->lock);
	if (test_bit(NFS_IOHDR_REDO, &hdr->flags)) {
		spin_unlock(&dreq->lock);
		goto out_put;
	}

	nfs_direct_count_bytes(dreq, hdr);
	spin_unlock(&dreq->lock);

	nfs_update_delegated_atime(dreq->inode);

	while (!list_empty(&hdr->pages)) {
		struct nfs_page *req = nfs_list_entry(hdr->pages.next);
		struct page *page = req->wb_page;

		if (!PageCompound(page) && bytes < hdr->good_bytes &&
		    (dreq->flags == NFS_ODIRECT_SHOULD_DIRTY))
			set_page_dirty(page);
		bytes += req->wb_bytes;
		nfs_list_remove_request(req);
		nfs_release_request(req);
	}
out_put:
	if (put_dreq(dreq))
		nfs_direct_complete(dreq);
	hdr->release(hdr);
}

static void nfs_read_sync_pgio_error(struct list_head *head, int error)
{
	struct nfs_page *req;

	while (!list_empty(head)) {
		req = nfs_list_entry(head->next);
		nfs_list_remove_request(req);
		nfs_release_request(req);
	}
}

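/*
 * Each pageio header generated for this request holds a reference on
 * the dreq until its completion callback runs.
 */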
static void nfs_direct_pgio_init(struct nfs_pgio_header *hdr)
{
	get_dreq(hdr->dreq);
	set_bit(NFS_IOHDR_ODIRECT, &hdr->flags);
}

static const struct nfs_pgio_completion_ops nfs_direct_read_completion_ops = {
	.error_cleanup = nfs_read_sync_pgio_error,
	.init_hdr = nfs_direct_pgio_init,
	.completion = nfs_direct_read_completion,
};

/*
 * For each rsize'd chunk of the user's buffer, dispatch an NFS READ
 * operation.  If nfs_readdata_alloc() or get_user_pages() fails,
 * bail and stop sending more reads.  Read length accounting is
 * handled automatically by nfs_direct_read_result().  Otherwise, if
 * no requests have been sent, just return an error.
 */

static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq,
					      struct iov_iter *iter,
					      loff_t pos)
{
	struct nfs_pageio_descriptor desc;
	struct inode *inode = dreq->inode;
	ssize_t result = -EINVAL;
	size_t requested_bytes = 0;
	size_t rsize = max_t(size_t, NFS_SERVER(inode)->rsize, PAGE_SIZE);

	nfs_pageio_init_read(&desc, dreq->inode, false,
			     &nfs_direct_read_completion_ops);
	get_dreq(dreq);
	desc.pg_dreq = dreq;
	inode_dio_begin(inode);

	while (iov_iter_count(iter)) {
		struct page **pagevec;
		size_t bytes;
		size_t pgbase;
		unsigned npages, i;

		result = iov_iter_get_pages_alloc2(iter, &pagevec,
						  rsize, &pgbase);
		if (result < 0)
			break;

		bytes = result;
		npages = (result + pgbase + PAGE_SIZE - 1) / PAGE_SIZE;
		for (i = 0; i < npages; i++) {
			struct nfs_page *req;
			unsigned int req_len = min_t(size_t, bytes, PAGE_SIZE - pgbase);
			/* XXX do we need to do the eof zeroing found in async_filler? */
			req = nfs_page_create_from_page(dreq->ctx, pagevec[i],
							pgbase, pos, req_len);
			if (IS_ERR(req)) {
				result = PTR_ERR(req);
				break;
			}
			if (!nfs_pageio_add_request(&desc, req)) {
				result = desc.pg_error;
				nfs_release_request(req);
				break;
			}
			pgbase = 0;
			bytes -= req_len;
			requested_bytes += req_len;
			pos += req_len;
		}
		nfs_direct_release_pages(pagevec, npages);
		kvfree(pagevec);
		if (result < 0)
			break;
	}

	nfs_pageio_complete(&desc);

	/*
	 * If no bytes were started, return the error, and let the
	 * generic layer handle the completion.
	 */
	if (requested_bytes == 0) {
		inode_dio_end(inode);
		nfs_direct_req_release(dreq);
		return result < 0 ? result : -EIO;
	}

	if (put_dreq(dreq))
		nfs_direct_complete(dreq);
	return requested_bytes;
}

/**
 * nfs_file_direct_read - file direct read operation for NFS files
 * @iocb: target I/O control block
 * @iter: vector of user buffers into which to read data
 * @swap: flag indicating this is swap IO, not O_DIRECT IO
 *
 * We use this function for direct reads instead of calling
 * generic_file_aio_read() in order to avoid gfar's check to see if
 * the request starts before the end of the file.  For that check
 * to work, we must generate a GETATTR before each direct read, and
 * even then there is a window between the GETATTR and the subsequent
 * READ where the file size could change.  Our preference is simply
 * to do all reads the application wants, and the server will take
 * care of managing the end of file boundary.
 *
 * This function also eliminates unnecessarily updating the file's
 * atime locally, as the NFS server sets the file's atime, and this
 * client must read the updated atime from the server back into its
 * cache.
 */
ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter,
			     bool swap)
{
	struct file *file = iocb->ki_filp;
	struct address_space *mapping = file->f_mapping;
	struct inode *inode = mapping->host;
	struct nfs_direct_req *dreq;
	struct nfs_lock_context *l_ctx;
	ssize_t result, requested;
	size_t count = iov_iter_count(iter);
	nfs_add_stats(mapping->host, NFSIOS_DIRECTREADBYTES, count);

	dfprintk(FILE, "NFS: direct read(%pD2, %zd@%Ld)\n",
		file, count, (long long) iocb->ki_pos);

	result = 0;
	if (!count)
		goto out;

	task_io_account_read(count);

	result = -ENOMEM;
	dreq = nfs_direct_req_alloc();
	if (dreq == NULL)
		goto out;

	dreq->inode = inode;
	dreq->max_count = count;
	dreq->io_start = iocb->ki_pos;
	dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp));
	l_ctx = nfs_get_lock_context(dreq->ctx);
	if (IS_ERR(l_ctx)) {
		result = PTR_ERR(l_ctx);
		nfs_direct_req_release(dreq);
		goto out_release;
	}
	dreq->l_ctx = l_ctx;
	if (!is_sync_kiocb(iocb))
		dreq->iocb = iocb;

	if (user_backed_iter(iter))
		dreq->flags = NFS_ODIRECT_SHOULD_DIRTY;

	if (!swap) {
		result = nfs_start_io_direct(inode);
		if (result) {
			/* release the reference that would usually be
			 * consumed by nfs_direct_read_schedule_iovec()
			 */
			nfs_direct_req_release(dreq);
			goto out_release;
		}
	}

	NFS_I(inode)->read_io += count;
	requested = nfs_direct_read_schedule_iovec(dreq, iter, iocb->ki_pos);

	if (!swap)
		nfs_end_io_direct(inode);

	if (requested > 0) {
		result = nfs_direct_wait(dreq);
		if (result > 0) {
			requested -= result;
			iocb->ki_pos += result;
		}
		iov_iter_revert(iter, requested);
	} else {
		result = requested;
	}

out_release:
	nfs_direct_req_release(dreq);
out:
	return result;
}

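/*
 * When rescheduling, a subrequest may arrive without its page group
 * head on the list.  Lock the head and add it so that the group can
 * be rejoined, taking the references needed to keep it pinned until
 * the group is joined and released.
 */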
static void nfs_direct_add_page_head(struct list_head *list,
				     struct nfs_page *req)
{
	struct nfs_page *head = req->wb_head;

	if (!list_empty(&head->wb_list) || !nfs_lock_request(head))
		return;
	if (!list_empty(&head->wb_list)) {
		nfs_unlock_request(head);
		return;
	}
	list_add(&head->wb_list, list);
	kref_get(&head->wb_kref);
	kref_get(&head->wb_kref);
}

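/*
 * Collapse each page group on @list back into a single request that
 * covers the whole group, so it can be resent as one unit.
 */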
static void nfs_direct_join_group(struct list_head *list,
				  struct nfs_commit_info *cinfo,
				  struct inode *inode)
{
	struct nfs_page *req, *subreq;

	list_for_each_entry(req, list, wb_list) {
		if (req->wb_head != req) {
			nfs_direct_add_page_head(&req->wb_list, req);
			continue;
		}
		subreq = req->wb_this_page;
		if (subreq == req)
			continue;
		do {
			/*
			 * Remove subrequests from this list before freeing
			 * them in the call to nfs_join_page_group().
			 */
			if (!list_empty(&subreq->wb_list)) {
				nfs_list_remove_request(subreq);
				nfs_release_request(subreq);
			}
		} while ((subreq = subreq->wb_this_page) != req);
		nfs_join_page_group(req, cinfo, inode);
	}
}

static void
nfs_direct_write_scan_commit_list(struct inode *inode,
				  struct list_head *list,
				  struct nfs_commit_info *cinfo)
{
	mutex_lock(&NFS_I(cinfo->inode)->commit_mutex);
	pnfs_recover_commit_reqs(list, cinfo);
	nfs_scan_commit_list(&cinfo->mds->list, list, cinfo, 0);
	mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex);
}

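/*
 * Resend the writes that still sit on the commit lists, e.g. after a
 * commit verifier mismatch indicated the server rebooted before
 * committing the data.  Requests that cannot be resent are truncated
 * out of the final byte count.
 */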
static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
{
	struct nfs_pageio_descriptor desc;
	struct nfs_page *req;
	LIST_HEAD(reqs);
	struct nfs_commit_info cinfo;

	nfs_init_cinfo_from_dreq(&cinfo, dreq);
	nfs_direct_write_scan_commit_list(dreq->inode, &reqs, &cinfo);

	nfs_direct_join_group(&reqs, &cinfo, dreq->inode);

	nfs_clear_pnfs_ds_commit_verifiers(&dreq->ds_cinfo);
	get_dreq(dreq);

	nfs_pageio_init_write(&desc, dreq->inode, FLUSH_STABLE, false,
			      &nfs_direct_write_completion_ops);
	desc.pg_dreq = dreq;

	while (!list_empty(&reqs)) {
		req = nfs_list_entry(reqs.next);
		/* Bump the transmission count */
		req->wb_nio++;
		if (!nfs_pageio_add_request(&desc, req)) {
			spin_lock(&dreq->lock);
			if (dreq->error < 0) {
				desc.pg_error = dreq->error;
			} else if (desc.pg_error != -EAGAIN) {
				dreq->flags = 0;
				if (!desc.pg_error)
					desc.pg_error = -EIO;
				dreq->error = desc.pg_error;
			} else
				dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
			spin_unlock(&dreq->lock);
			break;
		}
		nfs_release_request(req);
	}
	nfs_pageio_complete(&desc);

	while (!list_empty(&reqs)) {
		req = nfs_list_entry(reqs.next);
		nfs_list_remove_request(req);
		nfs_unlock_and_release_request(req);
		if (desc.pg_error == -EAGAIN) {
			nfs_mark_request_commit(req, NULL, &cinfo, 0);
		} else {
			spin_lock(&dreq->lock);
			nfs_direct_truncate_request(dreq, req);
			spin_unlock(&dreq->lock);
			nfs_release_request(req);
		}
	}

	if (put_dreq(dreq))
		nfs_direct_write_complete(dreq);
}

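/*
 * COMMIT completion: a commit error is fatal for the whole request;
 * otherwise any request whose write verifier no longer matches the
 * commit verifier is queued to be rewritten.
 */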
static void nfs_direct_commit_complete(struct nfs_commit_data *data)
{
	const struct nfs_writeverf *verf = data->res.verf;
	struct nfs_direct_req *dreq = data->dreq;
	struct nfs_commit_info cinfo;
	struct nfs_page *req;
	int status = data->task.tk_status;

	trace_nfs_direct_commit_complete(dreq);

	spin_lock(&dreq->lock);
	if (status < 0) {
		/* Errors in commit are fatal */
		dreq->error = status;
		dreq->flags = NFS_ODIRECT_DONE;
	} else {
		status = dreq->error;
	}
	spin_unlock(&dreq->lock);

	nfs_init_cinfo_from_dreq(&cinfo, dreq);

	while (!list_empty(&data->pages)) {
		req = nfs_list_entry(data->pages.next);
		nfs_list_remove_request(req);
		if (status < 0) {
			spin_lock(&dreq->lock);
			nfs_direct_truncate_request(dreq, req);
			spin_unlock(&dreq->lock);
			nfs_release_request(req);
		} else if (!nfs_write_match_verf(verf, req)) {
			spin_lock(&dreq->lock);
			if (dreq->flags == 0)
				dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
			spin_unlock(&dreq->lock);
			/*
			 * Despite the reboot, the write was successful,
			 * so reset wb_nio.
			 */
			req->wb_nio = 0;
			nfs_mark_request_commit(req, NULL, &cinfo, 0);
		} else
			nfs_release_request(req);
		nfs_unlock_and_release_request(req);
	}

	if (nfs_commit_end(cinfo.mds))
		nfs_direct_write_complete(dreq);
}

static void nfs_direct_resched_write(struct nfs_commit_info *cinfo,
		struct nfs_page *req)
{
	struct nfs_direct_req *dreq = cinfo->dreq;

	trace_nfs_direct_resched_write(dreq);

	spin_lock(&dreq->lock);
	if (dreq->flags != NFS_ODIRECT_DONE)
		dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
	spin_unlock(&dreq->lock);
	nfs_mark_request_commit(req, NULL, cinfo, 0);
}

static const struct nfs_commit_completion_ops nfs_direct_commit_completion_ops = {
	.completion = nfs_direct_commit_complete,
	.resched_write = nfs_direct_resched_write,
};

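/*
 * Send COMMIT RPCs for all requests on this dreq's commit lists.  On
 * -ENOMEM the requests are left queued and the writes will be
 * rescheduled instead.
 */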
static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq)
{
	int res;
	struct nfs_commit_info cinfo;
	LIST_HEAD(mds_list);

	nfs_init_cinfo_from_dreq(&cinfo, dreq);
	nfs_commit_begin(cinfo.mds);
	nfs_scan_commit(dreq->inode, &mds_list, &cinfo);
	res = nfs_generic_commit_list(dreq->inode, &mds_list, 0, &cinfo);
	if (res < 0) { /* res == -ENOMEM */
		spin_lock(&dreq->lock);
		if (dreq->flags == 0)
			dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
		spin_unlock(&dreq->lock);
	}
	if (nfs_commit_end(cinfo.mds))
		nfs_direct_write_complete(dreq);
}

static void nfs_direct_write_clear_reqs(struct nfs_direct_req *dreq)
{
	struct nfs_commit_info cinfo;
	struct nfs_page *req;
	LIST_HEAD(reqs);

	nfs_init_cinfo_from_dreq(&cinfo, dreq);
	nfs_direct_write_scan_commit_list(dreq->inode, &reqs, &cinfo);

	while (!list_empty(&reqs)) {
		req = nfs_list_entry(reqs.next);
		nfs_list_remove_request(req);
		nfs_direct_truncate_request(dreq, req);
		nfs_release_request(req);
		nfs_unlock_and_release_request(req);
	}
}

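/*
 * Work item run after the last write RPC completes.  Depending on the
 * state the request ended up in, either send a COMMIT, resend the
 * unstable writes, or tear everything down and complete the request.
 */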
static void nfs_direct_write_schedule_work(struct work_struct *work)
{
	struct nfs_direct_req *dreq = container_of(work, struct nfs_direct_req, work);
	int flags = dreq->flags;

	dreq->flags = 0;
	switch (flags) {
		case NFS_ODIRECT_DO_COMMIT:
			nfs_direct_commit_schedule(dreq);
			break;
		case NFS_ODIRECT_RESCHED_WRITES:
			nfs_direct_write_reschedule(dreq);
			break;
		default:
			nfs_direct_write_clear_reqs(dreq);
			nfs_zap_mapping(dreq->inode, dreq->inode->i_mapping);
			nfs_direct_complete(dreq);
	}
}

static void nfs_direct_write_complete(struct nfs_direct_req *dreq)
{
	trace_nfs_direct_write_complete(dreq);
	queue_work(nfsiod_workqueue, &dreq->work); /* Calls nfs_direct_write_schedule_work */
}

static void nfs_direct_write_completion(struct nfs_pgio_header *hdr)
{
	struct nfs_direct_req *dreq = hdr->dreq;
	struct nfs_commit_info cinfo;
	struct nfs_page *req = nfs_list_entry(hdr->pages.next);
	struct inode *inode = dreq->inode;
	int flags = NFS_ODIRECT_DONE;

	trace_nfs_direct_write_completion(dreq);

	nfs_init_cinfo_from_dreq(&cinfo, dreq);

	spin_lock(&dreq->lock);
	if (test_bit(NFS_IOHDR_REDO, &hdr->flags)) {
		spin_unlock(&dreq->lock);
		goto out_put;
	}

	nfs_direct_count_bytes(dreq, hdr);
	if (test_bit(NFS_IOHDR_UNSTABLE_WRITES, &hdr->flags) &&
	    !test_bit(NFS_IOHDR_ERROR, &hdr->flags)) {
		if (!dreq->flags)
			dreq->flags = NFS_ODIRECT_DO_COMMIT;
		flags = dreq->flags;
	}
	spin_unlock(&dreq->lock);

	spin_lock(&inode->i_lock);
	nfs_direct_file_adjust_size_locked(inode, dreq->io_start, dreq->count);
	nfs_update_delegated_mtime_locked(dreq->inode);
	spin_unlock(&inode->i_lock);

	while (!list_empty(&hdr->pages)) {

		req = nfs_list_entry(hdr->pages.next);
		nfs_list_remove_request(req);
		if (flags == NFS_ODIRECT_DO_COMMIT) {
			kref_get(&req->wb_kref);
			memcpy(&req->wb_verf, &hdr->verf.verifier,
			       sizeof(req->wb_verf));
			nfs_mark_request_commit(req, hdr->lseg, &cinfo,
				hdr->ds_commit_idx);
		} else if (flags == NFS_ODIRECT_RESCHED_WRITES) {
			kref_get(&req->wb_kref);
			nfs_mark_request_commit(req, NULL, &cinfo, 0);
		}
		nfs_unlock_and_release_request(req);
	}

out_put:
	if (put_dreq(dreq))
		nfs_direct_write_complete(dreq);
	hdr->release(hdr);
}

static void nfs_write_sync_pgio_error(struct list_head *head, int error)
{
	struct nfs_page *req;

	while (!list_empty(head)) {
		req = nfs_list_entry(head->next);
		nfs_list_remove_request(req);
		nfs_unlock_and_release_request(req);
	}
}

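/*
 * A write RPC failed in a retryable way: flag the dreq for
 * rescheduling and park the affected requests on the commit list so
 * nfs_direct_write_reschedule() can resend them.
 */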
static void nfs_direct_write_reschedule_io(struct nfs_pgio_header *hdr)
{
	struct nfs_direct_req *dreq = hdr->dreq;
	struct nfs_page *req;
	struct nfs_commit_info cinfo;

	trace_nfs_direct_write_reschedule_io(dreq);

	nfs_init_cinfo_from_dreq(&cinfo, dreq);
	spin_lock(&dreq->lock);
	if (dreq->error == 0)
		dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
	set_bit(NFS_IOHDR_REDO, &hdr->flags);
	spin_unlock(&dreq->lock);
	while (!list_empty(&hdr->pages)) {
		req = nfs_list_entry(hdr->pages.next);
		nfs_list_remove_request(req);
		nfs_unlock_request(req);
		nfs_mark_request_commit(req, NULL, &cinfo, 0);
	}
}

static const struct nfs_pgio_completion_ops nfs_direct_write_completion_ops = {
	.error_cleanup = nfs_write_sync_pgio_error,
	.init_hdr = nfs_direct_pgio_init,
	.completion = nfs_direct_write_completion,
	.reschedule_io = nfs_direct_write_reschedule_io,
};


/*
 * NB: Return the value of the first error return code.  Subsequent
 *     errors after the first one are ignored.
 */
/*
 * For each wsize'd chunk of the user's buffer, dispatch an NFS WRITE
 * operation.  If nfs_writedata_alloc() or get_user_pages() fails,
 * bail and stop sending more writes.  Write length accounting is
 * handled automatically by nfs_direct_write_result().  Otherwise, if
 * no requests have been sent, just return an error.
 */
static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
					       struct iov_iter *iter,
					       loff_t pos, int ioflags)
{
	struct nfs_pageio_descriptor desc;
	struct inode *inode = dreq->inode;
	struct nfs_commit_info cinfo;
	ssize_t result = 0;
	size_t requested_bytes = 0;
	size_t wsize = max_t(size_t, NFS_SERVER(inode)->wsize, PAGE_SIZE);
	bool defer = false;

	trace_nfs_direct_write_schedule_iovec(dreq);

	nfs_pageio_init_write(&desc, inode, ioflags, false,
			      &nfs_direct_write_completion_ops);
	desc.pg_dreq = dreq;
	get_dreq(dreq);
	inode_dio_begin(inode);

	NFS_I(inode)->write_io += iov_iter_count(iter);
	while (iov_iter_count(iter)) {
		struct page **pagevec;
		size_t bytes;
		size_t pgbase;
		unsigned npages, i;

		result = iov_iter_get_pages_alloc2(iter, &pagevec,
						  wsize, &pgbase);
		if (result < 0)
			break;

		bytes = result;
		npages = (result + pgbase + PAGE_SIZE - 1) / PAGE_SIZE;
		for (i = 0; i < npages; i++) {
			struct nfs_page *req;
			unsigned int req_len = min_t(size_t, bytes, PAGE_SIZE - pgbase);

			req = nfs_page_create_from_page(dreq->ctx, pagevec[i],
							pgbase, pos, req_len);
			if (IS_ERR(req)) {
				result = PTR_ERR(req);
				break;
			}

			if (desc.pg_error < 0) {
				nfs_free_request(req);
				result = desc.pg_error;
				break;
			}

			pgbase = 0;
			bytes -= req_len;
			requested_bytes += req_len;
			pos += req_len;

			if (defer) {
				nfs_mark_request_commit(req, NULL, &cinfo, 0);
				continue;
			}

			nfs_lock_request(req);
			if (nfs_pageio_add_request(&desc, req))
				continue;

			/* Exit on hard errors */
			if (desc.pg_error < 0 && desc.pg_error != -EAGAIN) {
				result = desc.pg_error;
				nfs_unlock_and_release_request(req);
				break;
			}

			/* If the error is soft, defer remaining requests */
			nfs_init_cinfo_from_dreq(&cinfo, dreq);
			spin_lock(&dreq->lock);
			dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
			spin_unlock(&dreq->lock);
			nfs_unlock_request(req);
			nfs_mark_request_commit(req, NULL, &cinfo, 0);
			desc.pg_error = 0;
			defer = true;
		}
		nfs_direct_release_pages(pagevec, npages);
		kvfree(pagevec);
		if (result < 0)
			break;
	}
	nfs_pageio_complete(&desc);

	/*
	 * If no bytes were started, return the error, and let the
	 * generic layer handle the completion.
	 */
	if (requested_bytes == 0) {
		inode_dio_end(inode);
		nfs_direct_req_release(dreq);
		return result < 0 ? result : -EIO;
	}

	if (put_dreq(dreq))
		nfs_direct_write_complete(dreq);
	return requested_bytes;
}

/**
 * nfs_file_direct_write - file direct write operation for NFS files
 * @iocb: target I/O control block
 * @iter: vector of user buffers from which to write data
 * @swap: flag indicating this is swap IO, not O_DIRECT IO
 *
 * We use this function for direct writes instead of calling
 * generic_file_aio_write() in order to avoid taking the inode
 * semaphore and updating the i_size.  The NFS server will set
 * the new i_size and this client must read the updated size
 * back into its cache.  We let the server do generic write
 * parameter checking and report problems.
 *
 * We eliminate local atime updates, see direct read above.
 *
 * We avoid unnecessary page cache invalidations for normal cached
 * readers of this file.
 *
 * Note that O_APPEND is not supported for NFS direct writes, as there
 * is no atomic O_APPEND write facility in the NFS protocol.
 */
ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter,
			      bool swap)
{
	ssize_t result, requested;
	size_t count;
	struct file *file = iocb->ki_filp;
	struct address_space *mapping = file->f_mapping;
	struct inode *inode = mapping->host;
	struct nfs_direct_req *dreq;
	struct nfs_lock_context *l_ctx;
	loff_t pos, end;

	dfprintk(FILE, "NFS: direct write(%pD2, %zd@%Ld)\n",
		file, iov_iter_count(iter), (long long) iocb->ki_pos);

	if (swap)
		/* bypass generic checks */
		result = iov_iter_count(iter);
	else
		result = generic_write_checks(iocb, iter);
	if (result <= 0)
		return result;
	count = result;
	nfs_add_stats(mapping->host, NFSIOS_DIRECTWRITTENBYTES, count);

	pos = iocb->ki_pos;
	end = (pos + iov_iter_count(iter) - 1) >> PAGE_SHIFT;

	task_io_account_write(count);

	result = -ENOMEM;
	dreq = nfs_direct_req_alloc();
	if (!dreq)
		goto out;

	dreq->inode = inode;
	dreq->max_count = count;
	dreq->io_start = pos;
	dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp));
	l_ctx = nfs_get_lock_context(dreq->ctx);
	if (IS_ERR(l_ctx)) {
		result = PTR_ERR(l_ctx);
		nfs_direct_req_release(dreq);
		goto out_release;
	}
	dreq->l_ctx = l_ctx;
	if (!is_sync_kiocb(iocb))
		dreq->iocb = iocb;
	pnfs_init_ds_commit_info_ops(&dreq->ds_cinfo, inode);

	if (swap) {
		requested = nfs_direct_write_schedule_iovec(dreq, iter, pos,
							    FLUSH_STABLE);
	} else {
		result = nfs_start_io_direct(inode);
		if (result) {
			/* release the reference that would usually be
			 * consumed by nfs_direct_write_schedule_iovec()
			 */
			nfs_direct_req_release(dreq);
			goto out_release;
		}

		requested = nfs_direct_write_schedule_iovec(dreq, iter, pos,
							    FLUSH_COND_STABLE);

		if (mapping->nrpages) {
			invalidate_inode_pages2_range(mapping,
						      pos >> PAGE_SHIFT, end);
		}

		nfs_end_io_direct(inode);
	}

	if (requested > 0) {
		result = nfs_direct_wait(dreq);
		if (result > 0) {
			requested -= result;
			iocb->ki_pos = pos + result;
			/* XXX: should check the generic_write_sync retval */
			generic_write_sync(iocb, result);
		}
		iov_iter_revert(iter, requested);
	} else {
		result = requested;
	}
	nfs_fscache_invalidate(inode, FSCACHE_INVAL_DIO_WRITE);
out_release:
	nfs_direct_req_release(dreq);
out:
	return result;
}

/**
 * nfs_init_directcache - create a slab cache for nfs_direct_req structures
 *
 */
int __init nfs_init_directcache(void)
{
	nfs_direct_cachep = kmem_cache_create("nfs_direct_cache",
						sizeof(struct nfs_direct_req),
						0, SLAB_RECLAIM_ACCOUNT,
						NULL);
	if (nfs_direct_cachep == NULL)
		return -ENOMEM;

	return 0;
}

/**
 * nfs_destroy_directcache - destroy the slab cache for nfs_direct_req structures
 *
 */
void nfs_destroy_directcache(void)
{
	kmem_cache_destroy(nfs_direct_cachep);
}