xref: /linux/fs/nfs/direct.c (revision 878ba61aa98cbb97a513757800e77613f856a029)
1 /*
2  * linux/fs/nfs/direct.c
3  *
4  * Copyright (C) 2003 by Chuck Lever <cel@netapp.com>
5  *
6  * High-performance uncached I/O for the Linux NFS client
7  *
8  * There are important applications whose performance or correctness
9  * depends on uncached access to file data.  Database clusters
10  * (multiple copies of the same instance running on separate hosts)
11  * implement their own cache coherency protocol that subsumes file
12  * system cache protocols.  Applications that process datasets
13  * considerably larger than the client's memory do not always benefit
14  * from a local cache.  A streaming video server, for instance, has no
15  * need to cache the contents of a file.
16  *
17  * When an application requests uncached I/O, all read and write requests
18  * are made directly to the server; data stored or fetched via these
19  * requests is not cached in the Linux page cache.  The client does not
20  * correct unaligned requests from applications.  All requested bytes are
21  * held on permanent storage before a direct write system call returns to
22  * an application.
23  *
24  * Solaris implements an uncached I/O facility called directio() that
25  * is used for backups and sequential I/O to very large files.  Solaris
26  * also supports uncaching whole NFS partitions with "-o forcedirectio,"
27  * an undocumented mount option.
28  *
29  * Designed by Jeff Kimmel, Chuck Lever, and Trond Myklebust, with
30  * help from Andrew Morton.
31  *
32  * 18 Dec 2001	Initial implementation for 2.4  --cel
33  * 08 Jul 2002	Version for 2.4.19, with bug fixes --trondmy
34  * 08 Jun 2003	Port to 2.5 APIs  --cel
35  * 31 Mar 2004	Handle direct I/O without VFS support  --cel
36  * 15 Sep 2004	Parallel async reads  --cel
37  * 04 May 2005	support O_DIRECT with aio  --cel
38  *
39  */
40 
41 #include <linux/errno.h>
42 #include <linux/sched.h>
43 #include <linux/kernel.h>
44 #include <linux/file.h>
45 #include <linux/pagemap.h>
46 #include <linux/kref.h>
47 #include <linux/slab.h>
48 #include <linux/task_io_accounting_ops.h>
49 #include <linux/module.h>
50 
51 #include <linux/nfs_fs.h>
52 #include <linux/nfs_page.h>
53 #include <linux/sunrpc/clnt.h>
54 
55 #include <asm/uaccess.h>
56 #include <linux/atomic.h>
57 
58 #include "internal.h"
59 #include "iostat.h"
60 #include "pnfs.h"
61 
#define NFSDBG_FACILITY		NFSDBG_VFS

/*
 * Slab cache backing every struct nfs_direct_req; created in
 * nfs_init_directcache() and torn down in nfs_destroy_directcache().
 */
static struct kmem_cache *nfs_direct_cachep;
65 
/*
 * Per-mirror byte accounting for one direct I/O request; updated by
 * nfs_direct_good_bytes() and reset by nfs_direct_write_reschedule().
 */
struct nfs_direct_mirror {
	ssize_t count;		/* bytes accounted as good on this mirror */
};

/*
 * This represents a set of asynchronous requests that we're waiting on
 */
struct nfs_direct_req {
	struct kref		kref;		/* release manager */

	/* I/O parameters */
	struct nfs_open_context	*ctx;		/* file open context info */
	struct nfs_lock_context *l_ctx;		/* Lock context info */
	struct kiocb *		iocb;		/* controlling i/o request */
	struct inode *		inode;		/* target file of i/o */

	/* completion state */
	atomic_t		io_count;	/* i/os we're waiting for */
	spinlock_t		lock;		/* protect completion state */

	/* one accounting slot per mirror; only mirror_count of them are used */
	struct nfs_direct_mirror mirrors[NFS_PAGEIO_DESCRIPTOR_MIRROR_MAX];
	int			mirror_count;

	ssize_t			count,		/* bytes actually processed */
				bytes_left,	/* bytes left to be sent */
				io_start,	/* start of IO */
				error;		/* any reported error */
	struct completion	completion;	/* wait for i/o completion */

	/* commit state */
	struct nfs_mds_commit_info mds_cinfo;	/* Storage for cinfo */
	struct pnfs_ds_commit_info ds_cinfo;	/* Storage for cinfo */
	struct work_struct	work;		/* runs nfs_direct_write_schedule_work() */
	int			flags;		/* 0 or exactly one of the values below */
#define NFS_ODIRECT_DO_COMMIT		(1)	/* an unstable reply was received */
#define NFS_ODIRECT_RESCHED_WRITES	(2)	/* write verification failed */
	struct nfs_writeverf	verf;		/* unstable write verifier */
};
104 
/*
 * Forward declarations: these objects and functions are defined further
 * down in this file but are referenced by code that precedes them.
 */
static const struct nfs_pgio_completion_ops nfs_direct_write_completion_ops;
static const struct nfs_commit_completion_ops nfs_direct_commit_completion_ops;
static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode);
static void nfs_direct_write_schedule_work(struct work_struct *work);
109 
110 static inline void get_dreq(struct nfs_direct_req *dreq)
111 {
112 	atomic_inc(&dreq->io_count);
113 }
114 
115 static inline int put_dreq(struct nfs_direct_req *dreq)
116 {
117 	return atomic_dec_and_test(&dreq->io_count);
118 }
119 
120 void nfs_direct_set_resched_writes(struct nfs_direct_req *dreq)
121 {
122 	dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
123 }
124 EXPORT_SYMBOL_GPL(nfs_direct_set_resched_writes);
125 
126 static void
127 nfs_direct_good_bytes(struct nfs_direct_req *dreq, struct nfs_pgio_header *hdr)
128 {
129 	int i;
130 	ssize_t count;
131 
132 	WARN_ON_ONCE(hdr->pgio_mirror_idx >= dreq->mirror_count);
133 
134 	count = dreq->mirrors[hdr->pgio_mirror_idx].count;
135 	if (count + dreq->io_start < hdr->io_start + hdr->good_bytes) {
136 		count = hdr->io_start + hdr->good_bytes - dreq->io_start;
137 		dreq->mirrors[hdr->pgio_mirror_idx].count = count;
138 	}
139 
140 	/* update the dreq->count by finding the minimum agreed count from all
141 	 * mirrors */
142 	count = dreq->mirrors[0].count;
143 
144 	for (i = 1; i < dreq->mirror_count; i++)
145 		count = min(count, dreq->mirrors[i].count);
146 
147 	dreq->count = count;
148 }
149 
150 /*
151  * nfs_direct_select_verf - select the right verifier
152  * @dreq - direct request possibly spanning multiple servers
153  * @ds_clp - nfs_client of data server or NULL if MDS / non-pnfs
154  * @commit_idx - commit bucket index for the DS
155  *
156  * returns the correct verifier to use given the role of the server
157  */
158 static struct nfs_writeverf *
159 nfs_direct_select_verf(struct nfs_direct_req *dreq,
160 		       struct nfs_client *ds_clp,
161 		       int commit_idx)
162 {
163 	struct nfs_writeverf *verfp = &dreq->verf;
164 
165 #ifdef CONFIG_NFS_V4_1
166 	if (ds_clp) {
167 		/* pNFS is in use, use the DS verf */
168 		if (commit_idx >= 0 && commit_idx < dreq->ds_cinfo.nbuckets)
169 			verfp = &dreq->ds_cinfo.buckets[commit_idx].direct_verf;
170 		else
171 			WARN_ON_ONCE(1);
172 	}
173 #endif
174 	return verfp;
175 }
176 
177 
178 /*
179  * nfs_direct_set_hdr_verf - set the write/commit verifier
180  * @dreq - direct request possibly spanning multiple servers
181  * @hdr - pageio header to validate against previously seen verfs
182  *
183  * Set the server's (MDS or DS) "seen" verifier
184  */
185 static void nfs_direct_set_hdr_verf(struct nfs_direct_req *dreq,
186 				    struct nfs_pgio_header *hdr)
187 {
188 	struct nfs_writeverf *verfp;
189 
190 	verfp = nfs_direct_select_verf(dreq, hdr->ds_clp, hdr->ds_commit_idx);
191 	WARN_ON_ONCE(verfp->committed >= 0);
192 	memcpy(verfp, &hdr->verf, sizeof(struct nfs_writeverf));
193 	WARN_ON_ONCE(verfp->committed < 0);
194 }
195 
196 /*
197  * nfs_direct_cmp_hdr_verf - compare verifier for pgio header
198  * @dreq - direct request possibly spanning multiple servers
199  * @hdr - pageio header to validate against previously seen verf
200  *
201  * set the server's "seen" verf if not initialized.
202  * returns result of comparison between @hdr->verf and the "seen"
203  * verf of the server used by @hdr (DS or MDS)
204  */
205 static int nfs_direct_set_or_cmp_hdr_verf(struct nfs_direct_req *dreq,
206 					  struct nfs_pgio_header *hdr)
207 {
208 	struct nfs_writeverf *verfp;
209 
210 	verfp = nfs_direct_select_verf(dreq, hdr->ds_clp, hdr->ds_commit_idx);
211 	if (verfp->committed < 0) {
212 		nfs_direct_set_hdr_verf(dreq, hdr);
213 		return 0;
214 	}
215 	return memcmp(verfp, &hdr->verf, sizeof(struct nfs_writeverf));
216 }
217 
218 /*
219  * nfs_direct_cmp_commit_data_verf - compare verifier for commit data
220  * @dreq - direct request possibly spanning multiple servers
221  * @data - commit data to validate against previously seen verf
222  *
223  * returns result of comparison between @data->verf and the verf of
224  * the server used by @data (DS or MDS)
225  */
226 static int nfs_direct_cmp_commit_data_verf(struct nfs_direct_req *dreq,
227 					   struct nfs_commit_data *data)
228 {
229 	struct nfs_writeverf *verfp;
230 
231 	verfp = nfs_direct_select_verf(dreq, data->ds_clp,
232 					 data->ds_commit_index);
233 
234 	/* verifier not set so always fail */
235 	if (verfp->committed < 0)
236 		return 1;
237 
238 	return memcmp(verfp, &data->verf, sizeof(struct nfs_writeverf));
239 }
240 
241 /**
242  * nfs_direct_IO - NFS address space operation for direct I/O
243  * @rw: direction (read or write)
244  * @iocb: target I/O control block
245  * @iov: array of vectors that define I/O buffer
246  * @pos: offset in file to begin the operation
247  * @nr_segs: size of iovec array
248  *
249  * The presence of this routine in the address space ops vector means
250  * the NFS client supports direct I/O. However, for most direct IO, we
251  * shunt off direct read and write requests before the VFS gets them,
252  * so this method is only ever called for swap.
253  */
254 ssize_t nfs_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter, loff_t pos)
255 {
256 	struct inode *inode = iocb->ki_filp->f_mapping->host;
257 
258 	/* we only support swap file calling nfs_direct_IO */
259 	if (!IS_SWAPFILE(inode))
260 		return 0;
261 
262 #ifndef CONFIG_NFS_SWAP
263 	dprintk("NFS: nfs_direct_IO (%pD) off/no(%Ld/%lu) EINVAL\n",
264 			iocb->ki_filp, (long long) pos, iter->nr_segs);
265 
266 	return -EINVAL;
267 #else
268 	VM_BUG_ON(iocb->ki_nbytes != PAGE_SIZE);
269 
270 	if (rw == READ)
271 		return nfs_file_direct_read(iocb, iter, pos);
272 	return nfs_file_direct_write(iocb, iter, pos);
273 #endif /* CONFIG_NFS_SWAP */
274 }
275 
/* Call page_cache_release() on each of the first @npages entries of @pages. */
static void nfs_direct_release_pages(struct page **pages, unsigned int npages)
{
	struct page **cursor = pages;
	unsigned int remaining;

	for (remaining = npages; remaining != 0; remaining--, cursor++)
		page_cache_release(*cursor);
}
282 
283 void nfs_init_cinfo_from_dreq(struct nfs_commit_info *cinfo,
284 			      struct nfs_direct_req *dreq)
285 {
286 	cinfo->lock = &dreq->lock;
287 	cinfo->mds = &dreq->mds_cinfo;
288 	cinfo->ds = &dreq->ds_cinfo;
289 	cinfo->dreq = dreq;
290 	cinfo->completion_ops = &nfs_direct_commit_completion_ops;
291 }
292 
293 static inline void nfs_direct_setup_mirroring(struct nfs_direct_req *dreq,
294 					     struct nfs_pageio_descriptor *pgio,
295 					     struct nfs_page *req)
296 {
297 	int mirror_count = 1;
298 
299 	if (pgio->pg_ops->pg_get_mirror_count)
300 		mirror_count = pgio->pg_ops->pg_get_mirror_count(pgio, req);
301 
302 	dreq->mirror_count = mirror_count;
303 }
304 
305 static inline struct nfs_direct_req *nfs_direct_req_alloc(void)
306 {
307 	struct nfs_direct_req *dreq;
308 
309 	dreq = kmem_cache_zalloc(nfs_direct_cachep, GFP_KERNEL);
310 	if (!dreq)
311 		return NULL;
312 
313 	kref_init(&dreq->kref);
314 	kref_get(&dreq->kref);
315 	init_completion(&dreq->completion);
316 	INIT_LIST_HEAD(&dreq->mds_cinfo.list);
317 	dreq->verf.committed = NFS_INVALID_STABLE_HOW;	/* not set yet */
318 	INIT_WORK(&dreq->work, nfs_direct_write_schedule_work);
319 	dreq->mirror_count = 1;
320 	spin_lock_init(&dreq->lock);
321 
322 	return dreq;
323 }
324 
325 static void nfs_direct_req_free(struct kref *kref)
326 {
327 	struct nfs_direct_req *dreq = container_of(kref, struct nfs_direct_req, kref);
328 
329 	nfs_free_pnfs_ds_cinfo(&dreq->ds_cinfo);
330 	if (dreq->l_ctx != NULL)
331 		nfs_put_lock_context(dreq->l_ctx);
332 	if (dreq->ctx != NULL)
333 		put_nfs_open_context(dreq->ctx);
334 	kmem_cache_free(nfs_direct_cachep, dreq);
335 }
336 
337 static void nfs_direct_req_release(struct nfs_direct_req *dreq)
338 {
339 	kref_put(&dreq->kref, nfs_direct_req_free);
340 }
341 
342 ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq)
343 {
344 	return dreq->bytes_left;
345 }
346 EXPORT_SYMBOL_GPL(nfs_dreq_bytes_left);
347 
348 /*
349  * Collects and returns the final error value/byte-count.
350  */
351 static ssize_t nfs_direct_wait(struct nfs_direct_req *dreq)
352 {
353 	ssize_t result = -EIOCBQUEUED;
354 
355 	/* Async requests don't wait here */
356 	if (dreq->iocb)
357 		goto out;
358 
359 	result = wait_for_completion_killable(&dreq->completion);
360 
361 	if (!result)
362 		result = dreq->error;
363 	if (!result)
364 		result = dreq->count;
365 
366 out:
367 	return (ssize_t) result;
368 }
369 
370 /*
371  * Synchronous I/O uses a stack-allocated iocb.  Thus we can't trust
372  * the iocb is still valid here if this is a synchronous request.
373  */
374 static void nfs_direct_complete(struct nfs_direct_req *dreq, bool write)
375 {
376 	struct inode *inode = dreq->inode;
377 
378 	if (dreq->iocb && write) {
379 		loff_t pos = dreq->iocb->ki_pos + dreq->count;
380 
381 		spin_lock(&inode->i_lock);
382 		if (i_size_read(inode) < pos)
383 			i_size_write(inode, pos);
384 		spin_unlock(&inode->i_lock);
385 	}
386 
387 	if (write)
388 		nfs_zap_mapping(inode, inode->i_mapping);
389 
390 	inode_dio_done(inode);
391 
392 	if (dreq->iocb) {
393 		long res = (long) dreq->error;
394 		if (!res)
395 			res = (long) dreq->count;
396 		aio_complete(dreq->iocb, res, 0);
397 	}
398 
399 	complete_all(&dreq->completion);
400 
401 	nfs_direct_req_release(dreq);
402 }
403 
404 static void nfs_direct_readpage_release(struct nfs_page *req)
405 {
406 	dprintk("NFS: direct read done (%s/%llu %d@%lld)\n",
407 		req->wb_context->dentry->d_inode->i_sb->s_id,
408 		(unsigned long long)NFS_FILEID(req->wb_context->dentry->d_inode),
409 		req->wb_bytes,
410 		(long long)req_offset(req));
411 	nfs_release_request(req);
412 }
413 
414 static void nfs_direct_read_completion(struct nfs_pgio_header *hdr)
415 {
416 	unsigned long bytes = 0;
417 	struct nfs_direct_req *dreq = hdr->dreq;
418 
419 	if (test_bit(NFS_IOHDR_REDO, &hdr->flags))
420 		goto out_put;
421 
422 	spin_lock(&dreq->lock);
423 	if (test_bit(NFS_IOHDR_ERROR, &hdr->flags) && (hdr->good_bytes == 0))
424 		dreq->error = hdr->error;
425 	else
426 		nfs_direct_good_bytes(dreq, hdr);
427 
428 	spin_unlock(&dreq->lock);
429 
430 	while (!list_empty(&hdr->pages)) {
431 		struct nfs_page *req = nfs_list_entry(hdr->pages.next);
432 		struct page *page = req->wb_page;
433 
434 		if (!PageCompound(page) && bytes < hdr->good_bytes)
435 			set_page_dirty(page);
436 		bytes += req->wb_bytes;
437 		nfs_list_remove_request(req);
438 		nfs_direct_readpage_release(req);
439 	}
440 out_put:
441 	if (put_dreq(dreq))
442 		nfs_direct_complete(dreq, false);
443 	hdr->release(hdr);
444 }
445 
446 static void nfs_read_sync_pgio_error(struct list_head *head)
447 {
448 	struct nfs_page *req;
449 
450 	while (!list_empty(head)) {
451 		req = nfs_list_entry(head->next);
452 		nfs_list_remove_request(req);
453 		nfs_release_request(req);
454 	}
455 }
456 
457 static void nfs_direct_pgio_init(struct nfs_pgio_header *hdr)
458 {
459 	get_dreq(hdr->dreq);
460 }
461 
462 static const struct nfs_pgio_completion_ops nfs_direct_read_completion_ops = {
463 	.error_cleanup = nfs_read_sync_pgio_error,
464 	.init_hdr = nfs_direct_pgio_init,
465 	.completion = nfs_direct_read_completion,
466 };
467 
/*
 * Queue READ requests for everything left in @iter, starting at file
 * offset @pos.
 *
 * The user's buffer is consumed in chunks of at most
 * max(rsize, PAGE_SIZE) bytes.  For each chunk the pages are obtained
 * with iov_iter_get_pages_alloc() and one nfs_page request per page is
 * added to a read pageio descriptor.  Scheduling stops at the first
 * failure of iov_iter_get_pages_alloc(), nfs_create_request() or
 * nfs_pageio_add_request().
 *
 * Returns 0 if at least one byte was queued; results are then reported
 * through nfs_direct_read_completion_ops.  If nothing was queued, the dio
 * count and one reference on @dreq are dropped here and the error (or
 * -EIO when no error code is available) is returned.
 */

static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq,
					      struct iov_iter *iter,
					      loff_t pos)
{
	struct nfs_pageio_descriptor desc;
	struct inode *inode = dreq->inode;
	ssize_t result = -EINVAL;
	size_t requested_bytes = 0;
	size_t rsize = max_t(size_t, NFS_SERVER(inode)->rsize, PAGE_SIZE);

	nfs_pageio_init_read(&desc, dreq->inode, false,
			     &nfs_direct_read_completion_ops);
	/* hold io_count while scheduling; dropped by put_dreq() at the end */
	get_dreq(dreq);
	desc.pg_dreq = dreq;
	/* balanced by inode_dio_done() below or in nfs_direct_complete() */
	atomic_inc(&inode->i_dio_count);

	while (iov_iter_count(iter)) {
		struct page **pagevec;
		size_t bytes;
		size_t pgbase;
		unsigned npages, i;

		result = iov_iter_get_pages_alloc(iter, &pagevec,
						  rsize, &pgbase);
		if (result < 0)
			break;

		bytes = result;
		iov_iter_advance(iter, bytes);
		/* pages spanned by [pgbase, pgbase + result) */
		npages = (result + pgbase + PAGE_SIZE - 1) / PAGE_SIZE;
		for (i = 0; i < npages; i++) {
			struct nfs_page *req;
			/* only the first page of a chunk starts at a non-zero pgbase */
			unsigned int req_len = min_t(size_t, bytes, PAGE_SIZE - pgbase);
			/* XXX do we need to do the eof zeroing found in async_filler? */
			req = nfs_create_request(dreq->ctx, pagevec[i], NULL,
						 pgbase, req_len);
			if (IS_ERR(req)) {
				result = PTR_ERR(req);
				break;
			}
			/* the request describes a file position, not a page cache page */
			req->wb_index = pos >> PAGE_SHIFT;
			req->wb_offset = pos & ~PAGE_MASK;
			if (!nfs_pageio_add_request(&desc, req)) {
				result = desc.pg_error;
				nfs_release_request(req);
				break;
			}
			pgbase = 0;
			bytes -= req_len;
			requested_bytes += req_len;
			pos += req_len;
			dreq->bytes_left -= req_len;
		}
		/*
		 * Drop the page references of this chunk and free the vector;
		 * presumably each nfs_page created above holds its own page
		 * reference -- confirm against nfs_create_request().
		 */
		nfs_direct_release_pages(pagevec, npages);
		kvfree(pagevec);
		if (result < 0)
			break;
	}

	nfs_pageio_complete(&desc);

	/*
	 * If no bytes were started, return the error, and let the
	 * generic layer handle the completion.
	 */
	if (requested_bytes == 0) {
		inode_dio_done(inode);
		nfs_direct_req_release(dreq);
		return result < 0 ? result : -EIO;
	}

	/* scheduling is done: complete now if every header already finished */
	if (put_dreq(dreq))
		nfs_direct_complete(dreq, false);
	return 0;
}
551 
552 /**
553  * nfs_file_direct_read - file direct read operation for NFS files
554  * @iocb: target I/O control block
555  * @iter: vector of user buffers into which to read data
556  * @pos: byte offset in file where reading starts
557  *
558  * We use this function for direct reads instead of calling
559  * generic_file_aio_read() in order to avoid gfar's check to see if
560  * the request starts before the end of the file.  For that check
561  * to work, we must generate a GETATTR before each direct read, and
562  * even then there is a window between the GETATTR and the subsequent
563  * READ where the file size could change.  Our preference is simply
564  * to do all reads the application wants, and the server will take
565  * care of managing the end of file boundary.
566  *
567  * This function also eliminates unnecessarily updating the file's
568  * atime locally, as the NFS server sets the file's atime, and this
569  * client must read the updated atime from the server back into its
570  * cache.
571  */
572 ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter,
573 				loff_t pos)
574 {
575 	struct file *file = iocb->ki_filp;
576 	struct address_space *mapping = file->f_mapping;
577 	struct inode *inode = mapping->host;
578 	struct nfs_direct_req *dreq;
579 	struct nfs_lock_context *l_ctx;
580 	ssize_t result = -EINVAL;
581 	size_t count = iov_iter_count(iter);
582 	nfs_add_stats(mapping->host, NFSIOS_DIRECTREADBYTES, count);
583 
584 	dfprintk(FILE, "NFS: direct read(%pD2, %zd@%Ld)\n",
585 		file, count, (long long) pos);
586 
587 	result = 0;
588 	if (!count)
589 		goto out;
590 
591 	mutex_lock(&inode->i_mutex);
592 	result = nfs_sync_mapping(mapping);
593 	if (result)
594 		goto out_unlock;
595 
596 	task_io_account_read(count);
597 
598 	result = -ENOMEM;
599 	dreq = nfs_direct_req_alloc();
600 	if (dreq == NULL)
601 		goto out_unlock;
602 
603 	dreq->inode = inode;
604 	dreq->bytes_left = count;
605 	dreq->io_start = pos;
606 	dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp));
607 	l_ctx = nfs_get_lock_context(dreq->ctx);
608 	if (IS_ERR(l_ctx)) {
609 		result = PTR_ERR(l_ctx);
610 		goto out_release;
611 	}
612 	dreq->l_ctx = l_ctx;
613 	if (!is_sync_kiocb(iocb))
614 		dreq->iocb = iocb;
615 
616 	NFS_I(inode)->read_io += count;
617 	result = nfs_direct_read_schedule_iovec(dreq, iter, pos);
618 
619 	mutex_unlock(&inode->i_mutex);
620 
621 	if (!result) {
622 		result = nfs_direct_wait(dreq);
623 		if (result > 0)
624 			iocb->ki_pos = pos + result;
625 	}
626 
627 	nfs_direct_req_release(dreq);
628 	return result;
629 
630 out_release:
631 	nfs_direct_req_release(dreq);
632 out_unlock:
633 	mutex_unlock(&inode->i_mutex);
634 out:
635 	return result;
636 }
637 
638 static void
639 nfs_direct_write_scan_commit_list(struct inode *inode,
640 				  struct list_head *list,
641 				  struct nfs_commit_info *cinfo)
642 {
643 	spin_lock(cinfo->lock);
644 #ifdef CONFIG_NFS_V4_1
645 	if (cinfo->ds != NULL && cinfo->ds->nwritten != 0)
646 		NFS_SERVER(inode)->pnfs_curr_ld->recover_commit_reqs(list, cinfo);
647 #endif
648 	nfs_scan_commit_list(&cinfo->mds->list, list, cinfo, 0);
649 	spin_unlock(cinfo->lock);
650 }
651 
/*
 * Resend every request that is waiting for a commit, this time as
 * FLUSH_STABLE writes.
 *
 * The byte counters are reset because the resent writes are accounted
 * again by nfs_direct_write_completion().  Requests that cannot be
 * re-queued are collected on a local list, -EIO is recorded on @dreq, and
 * they are unlocked and released once the descriptor has been completed.
 */
static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
{
	struct nfs_pageio_descriptor desc;
	struct nfs_page *req, *tmp;
	LIST_HEAD(reqs);
	struct nfs_commit_info cinfo;
	LIST_HEAD(failed);
	int i;

	nfs_init_cinfo_from_dreq(&cinfo, dreq);
	nfs_direct_write_scan_commit_list(dreq->inode, &reqs, &cinfo);

	/* start accounting from scratch for the rewrite */
	dreq->count = 0;
	for (i = 0; i < dreq->mirror_count; i++)
		dreq->mirrors[i].count = 0;
	/* hold io_count while rescheduling; dropped by put_dreq() below */
	get_dreq(dreq);

	nfs_pageio_init_write(&desc, dreq->inode, FLUSH_STABLE, false,
			      &nfs_direct_write_completion_ops);
	desc.pg_dreq = dreq;

	/*
	 * NOTE(review): this takes the first entry without checking that
	 * "reqs" is non-empty; on an empty list "req" would not be a real
	 * request -- confirm callers always have something to resend.
	 */
	req = nfs_list_entry(reqs.next);
	nfs_direct_setup_mirroring(dreq, &desc, req);

	list_for_each_entry_safe(req, tmp, &reqs, wb_list) {
		if (!nfs_pageio_add_request(&desc, req)) {
			/* could not queue: park it and fail the whole request */
			nfs_list_remove_request(req);
			nfs_list_add_request(req, &failed);
			spin_lock(cinfo.lock);
			dreq->flags = 0;
			dreq->error = -EIO;
			spin_unlock(cinfo.lock);
		}
		nfs_release_request(req);
	}
	nfs_pageio_complete(&desc);

	while (!list_empty(&failed)) {
		req = nfs_list_entry(failed.next);
		nfs_list_remove_request(req);
		nfs_unlock_and_release_request(req);
	}

	if (put_dreq(dreq))
		nfs_direct_write_complete(dreq, dreq->inode);
}
698 
/*
 * Completion callback for a COMMIT issued on behalf of a direct write.
 *
 * If the COMMIT failed, or its verifier does not match the one recorded
 * from the WRITE replies, the request is flagged
 * NFS_ODIRECT_RESCHED_WRITES and every page request is put back on the
 * commit list so that nfs_direct_write_reschedule() can resend it.
 * Otherwise the page requests are released.  When the last outstanding
 * commit RPC finishes, the write-completion work is scheduled.
 */
static void nfs_direct_commit_complete(struct nfs_commit_data *data)
{
	struct nfs_direct_req *dreq = data->dreq;
	struct nfs_commit_info cinfo;
	struct nfs_page *req;
	int status = data->task.tk_status;

	nfs_init_cinfo_from_dreq(&cinfo, dreq);
	if (status < 0) {
		dprintk("NFS: %5u commit failed with error %d.\n",
			data->task.tk_pid, status);
		dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
	} else if (nfs_direct_cmp_commit_data_verf(dreq, data)) {
		dprintk("NFS: %5u commit verify failed\n", data->task.tk_pid);
		dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
	}

	dprintk("NFS: %5u commit returned %d\n", data->task.tk_pid, status);
	while (!list_empty(&data->pages)) {
		req = nfs_list_entry(data->pages.next);
		nfs_list_remove_request(req);
		if (dreq->flags == NFS_ODIRECT_RESCHED_WRITES) {
			/* Note the rewrite will go through mds */
			nfs_mark_request_commit(req, NULL, &cinfo, 0);
		} else
			/*
			 * presumably drops the extra reference taken with
			 * kref_get() in nfs_direct_write_completion() -- confirm
			 */
			nfs_release_request(req);
		nfs_unlock_and_release_request(req);
	}

	/* last commit RPC for this request: hand over to the work handler */
	if (atomic_dec_and_test(&cinfo.mds->rpcs_out))
		nfs_direct_write_complete(dreq, data->inode);
}
731 
/*
 * Commit error_cleanup callback for direct I/O.  Intentionally empty:
 * there is no lock to clear.
 */
static void nfs_direct_error_cleanup(struct nfs_inode *nfsi)
{
}
736 
737 static const struct nfs_commit_completion_ops nfs_direct_commit_completion_ops = {
738 	.completion = nfs_direct_commit_complete,
739 	.error_cleanup = nfs_direct_error_cleanup,
740 };
741 
742 static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq)
743 {
744 	int res;
745 	struct nfs_commit_info cinfo;
746 	LIST_HEAD(mds_list);
747 
748 	nfs_init_cinfo_from_dreq(&cinfo, dreq);
749 	nfs_scan_commit(dreq->inode, &mds_list, &cinfo);
750 	res = nfs_generic_commit_list(dreq->inode, &mds_list, 0, &cinfo);
751 	if (res < 0) /* res == -ENOMEM */
752 		nfs_direct_write_reschedule(dreq);
753 }
754 
755 static void nfs_direct_write_schedule_work(struct work_struct *work)
756 {
757 	struct nfs_direct_req *dreq = container_of(work, struct nfs_direct_req, work);
758 	int flags = dreq->flags;
759 
760 	dreq->flags = 0;
761 	switch (flags) {
762 		case NFS_ODIRECT_DO_COMMIT:
763 			nfs_direct_commit_schedule(dreq);
764 			break;
765 		case NFS_ODIRECT_RESCHED_WRITES:
766 			nfs_direct_write_reschedule(dreq);
767 			break;
768 		default:
769 			nfs_direct_complete(dreq, true);
770 	}
771 }
772 
773 static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode)
774 {
775 	schedule_work(&dreq->work); /* Calls nfs_direct_write_schedule_work */
776 }
777 
778 static void nfs_direct_write_completion(struct nfs_pgio_header *hdr)
779 {
780 	struct nfs_direct_req *dreq = hdr->dreq;
781 	struct nfs_commit_info cinfo;
782 	bool request_commit = false;
783 	struct nfs_page *req = nfs_list_entry(hdr->pages.next);
784 
785 	if (test_bit(NFS_IOHDR_REDO, &hdr->flags))
786 		goto out_put;
787 
788 	nfs_init_cinfo_from_dreq(&cinfo, dreq);
789 
790 	spin_lock(&dreq->lock);
791 
792 	if (test_bit(NFS_IOHDR_ERROR, &hdr->flags)) {
793 		dreq->flags = 0;
794 		dreq->error = hdr->error;
795 	}
796 	if (dreq->error == 0) {
797 		nfs_direct_good_bytes(dreq, hdr);
798 		if (nfs_write_need_commit(hdr)) {
799 			if (dreq->flags == NFS_ODIRECT_RESCHED_WRITES)
800 				request_commit = true;
801 			else if (dreq->flags == 0) {
802 				nfs_direct_set_hdr_verf(dreq, hdr);
803 				request_commit = true;
804 				dreq->flags = NFS_ODIRECT_DO_COMMIT;
805 			} else if (dreq->flags == NFS_ODIRECT_DO_COMMIT) {
806 				request_commit = true;
807 				if (nfs_direct_set_or_cmp_hdr_verf(dreq, hdr))
808 					dreq->flags =
809 						NFS_ODIRECT_RESCHED_WRITES;
810 			}
811 		}
812 	}
813 	spin_unlock(&dreq->lock);
814 
815 	while (!list_empty(&hdr->pages)) {
816 
817 		req = nfs_list_entry(hdr->pages.next);
818 		nfs_list_remove_request(req);
819 		if (request_commit) {
820 			kref_get(&req->wb_kref);
821 			nfs_mark_request_commit(req, hdr->lseg, &cinfo,
822 				hdr->ds_commit_idx);
823 		}
824 		nfs_unlock_and_release_request(req);
825 	}
826 
827 out_put:
828 	if (put_dreq(dreq))
829 		nfs_direct_write_complete(dreq, hdr->inode);
830 	hdr->release(hdr);
831 }
832 
833 static void nfs_write_sync_pgio_error(struct list_head *head)
834 {
835 	struct nfs_page *req;
836 
837 	while (!list_empty(head)) {
838 		req = nfs_list_entry(head->next);
839 		nfs_list_remove_request(req);
840 		nfs_unlock_and_release_request(req);
841 	}
842 }
843 
844 static const struct nfs_pgio_completion_ops nfs_direct_write_completion_ops = {
845 	.error_cleanup = nfs_write_sync_pgio_error,
846 	.init_hdr = nfs_direct_pgio_init,
847 	.completion = nfs_direct_write_completion,
848 };
849 
850 
/*
 * Queue WRITE requests for everything left in @iter, starting at file
 * offset @pos.
 *
 * The user's buffer is consumed in chunks of at most
 * max(wsize, PAGE_SIZE) bytes.  For each chunk the pages are obtained
 * with iov_iter_get_pages_alloc() and one locked nfs_page request per
 * page is added to a FLUSH_COND_STABLE write pageio descriptor.
 * Scheduling stops at the first failure of iov_iter_get_pages_alloc(),
 * nfs_create_request() or nfs_pageio_add_request().
 *
 * NB: only the first error matters here; once anything has been queued,
 *     0 is returned and results are reported through
 *     nfs_direct_write_completion_ops.  If nothing was queued, the dio
 *     count and one reference on @dreq are dropped here and the error (or
 *     -EIO when no error code is available) is returned.
 */
static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
					       struct iov_iter *iter,
					       loff_t pos)
{
	struct nfs_pageio_descriptor desc;
	struct inode *inode = dreq->inode;
	ssize_t result = 0;
	size_t requested_bytes = 0;
	size_t wsize = max_t(size_t, NFS_SERVER(inode)->wsize, PAGE_SIZE);

	nfs_pageio_init_write(&desc, inode, FLUSH_COND_STABLE, false,
			      &nfs_direct_write_completion_ops);
	desc.pg_dreq = dreq;
	/* hold io_count while scheduling; dropped by put_dreq() at the end */
	get_dreq(dreq);
	/* balanced by inode_dio_done() below or in nfs_direct_complete() */
	atomic_inc(&inode->i_dio_count);

	NFS_I(inode)->write_io += iov_iter_count(iter);
	while (iov_iter_count(iter)) {
		struct page **pagevec;
		size_t bytes;
		size_t pgbase;
		unsigned npages, i;

		result = iov_iter_get_pages_alloc(iter, &pagevec,
						  wsize, &pgbase);
		if (result < 0)
			break;

		bytes = result;
		iov_iter_advance(iter, bytes);
		/* pages spanned by [pgbase, pgbase + result) */
		npages = (result + pgbase + PAGE_SIZE - 1) / PAGE_SIZE;
		for (i = 0; i < npages; i++) {
			struct nfs_page *req;
			/* only the first page of a chunk starts at a non-zero pgbase */
			unsigned int req_len = min_t(size_t, bytes, PAGE_SIZE - pgbase);

			req = nfs_create_request(dreq->ctx, pagevec[i], NULL,
						 pgbase, req_len);
			if (IS_ERR(req)) {
				result = PTR_ERR(req);
				break;
			}

			/* (re)computed for every request; stores the mirror count in dreq */
			nfs_direct_setup_mirroring(dreq, &desc, req);

			nfs_lock_request(req);
			/* the request describes a file position, not a page cache page */
			req->wb_index = pos >> PAGE_SHIFT;
			req->wb_offset = pos & ~PAGE_MASK;
			if (!nfs_pageio_add_request(&desc, req)) {
				result = desc.pg_error;
				nfs_unlock_and_release_request(req);
				break;
			}
			pgbase = 0;
			bytes -= req_len;
			requested_bytes += req_len;
			pos += req_len;
			dreq->bytes_left -= req_len;
		}
		/*
		 * Drop the page references of this chunk and free the vector;
		 * presumably each nfs_page created above holds its own page
		 * reference -- confirm against nfs_create_request().
		 */
		nfs_direct_release_pages(pagevec, npages);
		kvfree(pagevec);
		if (result < 0)
			break;
	}
	nfs_pageio_complete(&desc);

	/*
	 * If no bytes were started, return the error, and let the
	 * generic layer handle the completion.
	 */
	if (requested_bytes == 0) {
		inode_dio_done(inode);
		nfs_direct_req_release(dreq);
		return result < 0 ? result : -EIO;
	}

	/* scheduling is done: move on if every header already finished */
	if (put_dreq(dreq))
		nfs_direct_write_complete(dreq, dreq->inode);
	return 0;
}
941 
942 /**
943  * nfs_file_direct_write - file direct write operation for NFS files
944  * @iocb: target I/O control block
945  * @iter: vector of user buffers from which to write data
946  * @pos: byte offset in file where writing starts
947  *
948  * We use this function for direct writes instead of calling
949  * generic_file_aio_write() in order to avoid taking the inode
950  * semaphore and updating the i_size.  The NFS server will set
951  * the new i_size and this client must read the updated size
952  * back into its cache.  We let the server do generic write
953  * parameter checking and report problems.
954  *
955  * We eliminate local atime updates, see direct read above.
956  *
957  * We avoid unnecessary page cache invalidations for normal cached
958  * readers of this file.
959  *
960  * Note that O_APPEND is not supported for NFS direct writes, as there
961  * is no atomic O_APPEND write facility in the NFS protocol.
962  */
963 ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter,
964 				loff_t pos)
965 {
966 	ssize_t result = -EINVAL;
967 	struct file *file = iocb->ki_filp;
968 	struct address_space *mapping = file->f_mapping;
969 	struct inode *inode = mapping->host;
970 	struct nfs_direct_req *dreq;
971 	struct nfs_lock_context *l_ctx;
972 	loff_t end;
973 	size_t count = iov_iter_count(iter);
974 	end = (pos + count - 1) >> PAGE_CACHE_SHIFT;
975 
976 	nfs_add_stats(mapping->host, NFSIOS_DIRECTWRITTENBYTES, count);
977 
978 	dfprintk(FILE, "NFS: direct write(%pD2, %zd@%Ld)\n",
979 		file, count, (long long) pos);
980 
981 	result = generic_write_checks(file, &pos, &count, 0);
982 	if (result)
983 		goto out;
984 
985 	result = -EINVAL;
986 	if ((ssize_t) count < 0)
987 		goto out;
988 	result = 0;
989 	if (!count)
990 		goto out;
991 
992 	mutex_lock(&inode->i_mutex);
993 
994 	result = nfs_sync_mapping(mapping);
995 	if (result)
996 		goto out_unlock;
997 
998 	if (mapping->nrpages) {
999 		result = invalidate_inode_pages2_range(mapping,
1000 					pos >> PAGE_CACHE_SHIFT, end);
1001 		if (result)
1002 			goto out_unlock;
1003 	}
1004 
1005 	task_io_account_write(count);
1006 
1007 	result = -ENOMEM;
1008 	dreq = nfs_direct_req_alloc();
1009 	if (!dreq)
1010 		goto out_unlock;
1011 
1012 	dreq->inode = inode;
1013 	dreq->bytes_left = count;
1014 	dreq->io_start = pos;
1015 	dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp));
1016 	l_ctx = nfs_get_lock_context(dreq->ctx);
1017 	if (IS_ERR(l_ctx)) {
1018 		result = PTR_ERR(l_ctx);
1019 		goto out_release;
1020 	}
1021 	dreq->l_ctx = l_ctx;
1022 	if (!is_sync_kiocb(iocb))
1023 		dreq->iocb = iocb;
1024 
1025 	result = nfs_direct_write_schedule_iovec(dreq, iter, pos);
1026 
1027 	if (mapping->nrpages) {
1028 		invalidate_inode_pages2_range(mapping,
1029 					      pos >> PAGE_CACHE_SHIFT, end);
1030 	}
1031 
1032 	mutex_unlock(&inode->i_mutex);
1033 
1034 	if (!result) {
1035 		result = nfs_direct_wait(dreq);
1036 		if (result > 0) {
1037 			struct inode *inode = mapping->host;
1038 
1039 			iocb->ki_pos = pos + result;
1040 			spin_lock(&inode->i_lock);
1041 			if (i_size_read(inode) < iocb->ki_pos)
1042 				i_size_write(inode, iocb->ki_pos);
1043 			spin_unlock(&inode->i_lock);
1044 		}
1045 	}
1046 	nfs_direct_req_release(dreq);
1047 	return result;
1048 
1049 out_release:
1050 	nfs_direct_req_release(dreq);
1051 out_unlock:
1052 	mutex_unlock(&inode->i_mutex);
1053 out:
1054 	return result;
1055 }
1056 
1057 /**
1058  * nfs_init_directcache - create a slab cache for nfs_direct_req structures
1059  *
1060  */
1061 int __init nfs_init_directcache(void)
1062 {
1063 	nfs_direct_cachep = kmem_cache_create("nfs_direct_cache",
1064 						sizeof(struct nfs_direct_req),
1065 						0, (SLAB_RECLAIM_ACCOUNT|
1066 							SLAB_MEM_SPREAD),
1067 						NULL);
1068 	if (nfs_direct_cachep == NULL)
1069 		return -ENOMEM;
1070 
1071 	return 0;
1072 }
1073 
1074 /**
1075  * nfs_destroy_directcache - destroy the slab cache for nfs_direct_req structures
1076  *
1077  */
1078 void nfs_destroy_directcache(void)
1079 {
1080 	kmem_cache_destroy(nfs_direct_cachep);
1081 }
1082