1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * linux/fs/nfs/direct.c 4 * 5 * Copyright (C) 2003 by Chuck Lever <cel@netapp.com> 6 * 7 * High-performance uncached I/O for the Linux NFS client 8 * 9 * There are important applications whose performance or correctness 10 * depends on uncached access to file data. Database clusters 11 * (multiple copies of the same instance running on separate hosts) 12 * implement their own cache coherency protocol that subsumes file 13 * system cache protocols. Applications that process datasets 14 * considerably larger than the client's memory do not always benefit 15 * from a local cache. A streaming video server, for instance, has no 16 * need to cache the contents of a file. 17 * 18 * When an application requests uncached I/O, all read and write requests 19 * are made directly to the server; data stored or fetched via these 20 * requests is not cached in the Linux page cache. The client does not 21 * correct unaligned requests from applications. All requested bytes are 22 * held on permanent storage before a direct write system call returns to 23 * an application. 24 * 25 * Solaris implements an uncached I/O facility called directio() that 26 * is used for backups and sequential I/O to very large files. Solaris 27 * also supports uncaching whole NFS partitions with "-o forcedirectio," 28 * an undocumented mount option. 29 * 30 * Designed by Jeff Kimmel, Chuck Lever, and Trond Myklebust, with 31 * help from Andrew Morton. 32 * 33 * 18 Dec 2001 Initial implementation for 2.4 --cel 34 * 08 Jul 2002 Version for 2.4.19, with bug fixes --trondmy 35 * 08 Jun 2003 Port to 2.5 APIs --cel 36 * 31 Mar 2004 Handle direct I/O without VFS support --cel 37 * 15 Sep 2004 Parallel async reads --cel 38 * 04 May 2005 support O_DIRECT with aio --cel 39 * 40 */ 41 42 #include <linux/errno.h> 43 #include <linux/sched.h> 44 #include <linux/kernel.h> 45 #include <linux/file.h> 46 #include <linux/pagemap.h> 47 #include <linux/kref.h> 48 #include <linux/slab.h> 49 #include <linux/task_io_accounting_ops.h> 50 #include <linux/module.h> 51 52 #include <linux/nfs_fs.h> 53 #include <linux/nfs_page.h> 54 #include <linux/sunrpc/clnt.h> 55 56 #include <linux/uaccess.h> 57 #include <linux/atomic.h> 58 59 #include "delegation.h" 60 #include "internal.h" 61 #include "iostat.h" 62 #include "pnfs.h" 63 #include "fscache.h" 64 #include "nfstrace.h" 65 66 #define NFSDBG_FACILITY NFSDBG_VFS 67 68 static struct kmem_cache *nfs_direct_cachep; 69 70 static const struct nfs_pgio_completion_ops nfs_direct_write_completion_ops; 71 static const struct nfs_commit_completion_ops nfs_direct_commit_completion_ops; 72 static void nfs_direct_write_complete(struct nfs_direct_req *dreq); 73 static void nfs_direct_write_schedule_work(struct work_struct *work); 74 75 static inline void get_dreq(struct nfs_direct_req *dreq) 76 { 77 atomic_inc(&dreq->io_count); 78 } 79 80 static inline int put_dreq(struct nfs_direct_req *dreq) 81 { 82 return atomic_dec_and_test(&dreq->io_count); 83 } 84 85 static void 86 nfs_direct_handle_truncated(struct nfs_direct_req *dreq, 87 const struct nfs_pgio_header *hdr, 88 ssize_t dreq_len) 89 { 90 if (!(test_bit(NFS_IOHDR_ERROR, &hdr->flags) || 91 test_bit(NFS_IOHDR_EOF, &hdr->flags))) 92 return; 93 if (dreq->max_count >= dreq_len) { 94 dreq->max_count = dreq_len; 95 if (dreq->count > dreq_len) 96 dreq->count = dreq_len; 97 } 98 99 if (test_bit(NFS_IOHDR_ERROR, &hdr->flags) && !dreq->error) 100 dreq->error = hdr->error; 101 } 102 103 static void 104 nfs_direct_count_bytes(struct nfs_direct_req *dreq, 105 const struct nfs_pgio_header *hdr) 106 { 107 loff_t hdr_end = hdr->io_start + hdr->good_bytes; 108 ssize_t dreq_len = 0; 109 110 if (hdr_end > dreq->io_start) 111 dreq_len = hdr_end - dreq->io_start; 112 113 nfs_direct_handle_truncated(dreq, hdr, dreq_len); 114 115 if (dreq_len > dreq->max_count) 116 dreq_len = dreq->max_count; 117 118 if (dreq->count < dreq_len) 119 dreq->count = dreq_len; 120 } 121 122 static void nfs_direct_truncate_request(struct nfs_direct_req *dreq, 123 struct nfs_page *req) 124 { 125 loff_t offs = req_offset(req); 126 size_t req_start = (size_t)(offs - dreq->io_start); 127 128 if (req_start < dreq->max_count) 129 dreq->max_count = req_start; 130 if (req_start < dreq->count) 131 dreq->count = req_start; 132 } 133 134 static void nfs_direct_file_adjust_size_locked(struct inode *inode, 135 loff_t offset, size_t count) 136 { 137 loff_t newsize = offset + (loff_t)count; 138 loff_t oldsize = i_size_read(inode); 139 140 if (newsize > oldsize) { 141 i_size_write(inode, newsize); 142 NFS_I(inode)->cache_validity &= ~NFS_INO_INVALID_SIZE; 143 trace_nfs_size_grow(inode, newsize); 144 nfs_inc_stats(inode, NFSIOS_EXTENDWRITE); 145 } 146 } 147 148 /** 149 * nfs_swap_rw - NFS address space operation for swap I/O 150 * @iocb: target I/O control block 151 * @iter: I/O buffer 152 * 153 * Perform IO to the swap-file. This is much like direct IO. 154 */ 155 int nfs_swap_rw(struct kiocb *iocb, struct iov_iter *iter) 156 { 157 ssize_t ret; 158 159 if (iov_iter_rw(iter) == READ) 160 ret = nfs_file_direct_read(iocb, iter, true); 161 else 162 ret = nfs_file_direct_write(iocb, iter, true); 163 if (ret < 0) 164 return ret; 165 return 0; 166 } 167 168 static void nfs_direct_release_pages(struct page **pages, unsigned int npages) 169 { 170 unsigned int i; 171 for (i = 0; i < npages; i++) 172 put_page(pages[i]); 173 } 174 175 void nfs_init_cinfo_from_dreq(struct nfs_commit_info *cinfo, 176 struct nfs_direct_req *dreq) 177 { 178 cinfo->inode = dreq->inode; 179 cinfo->mds = &dreq->mds_cinfo; 180 cinfo->ds = &dreq->ds_cinfo; 181 cinfo->dreq = dreq; 182 cinfo->completion_ops = &nfs_direct_commit_completion_ops; 183 } 184 185 static inline struct nfs_direct_req *nfs_direct_req_alloc(void) 186 { 187 struct nfs_direct_req *dreq; 188 189 dreq = kmem_cache_zalloc(nfs_direct_cachep, GFP_KERNEL); 190 if (!dreq) 191 return NULL; 192 193 kref_init(&dreq->kref); 194 kref_get(&dreq->kref); 195 init_completion(&dreq->completion); 196 INIT_LIST_HEAD(&dreq->mds_cinfo.list); 197 pnfs_init_ds_commit_info(&dreq->ds_cinfo); 198 INIT_WORK(&dreq->work, nfs_direct_write_schedule_work); 199 spin_lock_init(&dreq->lock); 200 201 return dreq; 202 } 203 204 static void nfs_direct_req_free(struct kref *kref) 205 { 206 struct nfs_direct_req *dreq = container_of(kref, struct nfs_direct_req, kref); 207 208 pnfs_release_ds_info(&dreq->ds_cinfo, dreq->inode); 209 if (dreq->l_ctx != NULL) 210 nfs_put_lock_context(dreq->l_ctx); 211 if (dreq->ctx != NULL) 212 put_nfs_open_context(dreq->ctx); 213 kmem_cache_free(nfs_direct_cachep, dreq); 214 } 215 216 static void nfs_direct_req_release(struct nfs_direct_req *dreq) 217 { 218 kref_put(&dreq->kref, nfs_direct_req_free); 219 } 220 221 ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq, loff_t offset) 222 { 223 loff_t start = offset - dreq->io_start; 224 return dreq->max_count - start; 225 } 226 EXPORT_SYMBOL_GPL(nfs_dreq_bytes_left); 227 228 /* 229 * Collects and returns the final error value/byte-count. 230 */ 231 static ssize_t nfs_direct_wait(struct nfs_direct_req *dreq) 232 { 233 ssize_t result = -EIOCBQUEUED; 234 235 /* Async requests don't wait here */ 236 if (dreq->iocb) 237 goto out; 238 239 result = wait_for_completion_killable(&dreq->completion); 240 241 if (!result) { 242 result = dreq->count; 243 WARN_ON_ONCE(dreq->count < 0); 244 } 245 if (!result) 246 result = dreq->error; 247 248 out: 249 return (ssize_t) result; 250 } 251 252 /* 253 * Synchronous I/O uses a stack-allocated iocb. Thus we can't trust 254 * the iocb is still valid here if this is a synchronous request. 255 */ 256 static void nfs_direct_complete(struct nfs_direct_req *dreq) 257 { 258 struct inode *inode = dreq->inode; 259 260 inode_dio_end(inode); 261 262 if (dreq->iocb) { 263 long res = (long) dreq->error; 264 if (dreq->count != 0) { 265 res = (long) dreq->count; 266 WARN_ON_ONCE(dreq->count < 0); 267 } 268 dreq->iocb->ki_complete(dreq->iocb, res); 269 } 270 271 complete(&dreq->completion); 272 273 nfs_direct_req_release(dreq); 274 } 275 276 static void nfs_direct_read_completion(struct nfs_pgio_header *hdr) 277 { 278 unsigned long bytes = 0; 279 struct nfs_direct_req *dreq = hdr->dreq; 280 281 spin_lock(&dreq->lock); 282 if (test_bit(NFS_IOHDR_REDO, &hdr->flags)) { 283 spin_unlock(&dreq->lock); 284 goto out_put; 285 } 286 287 nfs_direct_count_bytes(dreq, hdr); 288 spin_unlock(&dreq->lock); 289 290 nfs_update_delegated_atime(dreq->inode); 291 292 while (!list_empty(&hdr->pages)) { 293 struct nfs_page *req = nfs_list_entry(hdr->pages.next); 294 struct page *page = req->wb_page; 295 296 if (!PageCompound(page) && bytes < hdr->good_bytes && 297 (dreq->flags == NFS_ODIRECT_SHOULD_DIRTY)) 298 set_page_dirty(page); 299 bytes += req->wb_bytes; 300 nfs_list_remove_request(req); 301 nfs_release_request(req); 302 } 303 out_put: 304 if (put_dreq(dreq)) 305 nfs_direct_complete(dreq); 306 hdr->release(hdr); 307 } 308 309 static void nfs_read_sync_pgio_error(struct list_head *head, int error) 310 { 311 struct nfs_page *req; 312 313 while (!list_empty(head)) { 314 req = nfs_list_entry(head->next); 315 nfs_list_remove_request(req); 316 nfs_release_request(req); 317 } 318 } 319 320 static void nfs_direct_pgio_init(struct nfs_pgio_header *hdr) 321 { 322 get_dreq(hdr->dreq); 323 set_bit(NFS_IOHDR_ODIRECT, &hdr->flags); 324 } 325 326 static const struct nfs_pgio_completion_ops nfs_direct_read_completion_ops = { 327 .error_cleanup = nfs_read_sync_pgio_error, 328 .init_hdr = nfs_direct_pgio_init, 329 .completion = nfs_direct_read_completion, 330 }; 331 332 /* 333 * For each rsize'd chunk of the user's buffer, dispatch an NFS READ 334 * operation. If nfs_readdata_alloc() or get_user_pages() fails, 335 * bail and stop sending more reads. Read length accounting is 336 * handled automatically by nfs_direct_read_result(). Otherwise, if 337 * no requests have been sent, just return an error. 338 */ 339 340 static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq, 341 struct iov_iter *iter, 342 loff_t pos) 343 { 344 struct nfs_pageio_descriptor desc; 345 struct inode *inode = dreq->inode; 346 ssize_t result = -EINVAL; 347 size_t requested_bytes = 0; 348 size_t rsize = max_t(size_t, NFS_SERVER(inode)->rsize, PAGE_SIZE); 349 350 nfs_pageio_init_read(&desc, dreq->inode, false, 351 &nfs_direct_read_completion_ops); 352 get_dreq(dreq); 353 desc.pg_dreq = dreq; 354 inode_dio_begin(inode); 355 356 while (iov_iter_count(iter)) { 357 struct page **pagevec; 358 size_t bytes; 359 size_t pgbase; 360 unsigned npages, i; 361 362 result = iov_iter_get_pages_alloc2(iter, &pagevec, 363 rsize, &pgbase); 364 if (result < 0) 365 break; 366 367 bytes = result; 368 npages = (result + pgbase + PAGE_SIZE - 1) / PAGE_SIZE; 369 for (i = 0; i < npages; i++) { 370 struct nfs_page *req; 371 unsigned int req_len = min_t(size_t, bytes, PAGE_SIZE - pgbase); 372 /* XXX do we need to do the eof zeroing found in async_filler? */ 373 req = nfs_page_create_from_page(dreq->ctx, pagevec[i], 374 pgbase, pos, req_len); 375 if (IS_ERR(req)) { 376 result = PTR_ERR(req); 377 break; 378 } 379 if (!nfs_pageio_add_request(&desc, req)) { 380 result = desc.pg_error; 381 nfs_release_request(req); 382 break; 383 } 384 pgbase = 0; 385 bytes -= req_len; 386 requested_bytes += req_len; 387 pos += req_len; 388 } 389 nfs_direct_release_pages(pagevec, npages); 390 kvfree(pagevec); 391 if (result < 0) 392 break; 393 } 394 395 nfs_pageio_complete(&desc); 396 397 /* 398 * If no bytes were started, return the error, and let the 399 * generic layer handle the completion. 400 */ 401 if (requested_bytes == 0) { 402 inode_dio_end(inode); 403 nfs_direct_req_release(dreq); 404 return result < 0 ? result : -EIO; 405 } 406 407 if (put_dreq(dreq)) 408 nfs_direct_complete(dreq); 409 return requested_bytes; 410 } 411 412 /** 413 * nfs_file_direct_read - file direct read operation for NFS files 414 * @iocb: target I/O control block 415 * @iter: vector of user buffers into which to read data 416 * @swap: flag indicating this is swap IO, not O_DIRECT IO 417 * 418 * We use this function for direct reads instead of calling 419 * generic_file_aio_read() in order to avoid gfar's check to see if 420 * the request starts before the end of the file. For that check 421 * to work, we must generate a GETATTR before each direct read, and 422 * even then there is a window between the GETATTR and the subsequent 423 * READ where the file size could change. Our preference is simply 424 * to do all reads the application wants, and the server will take 425 * care of managing the end of file boundary. 426 * 427 * This function also eliminates unnecessarily updating the file's 428 * atime locally, as the NFS server sets the file's atime, and this 429 * client must read the updated atime from the server back into its 430 * cache. 431 */ 432 ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter, 433 bool swap) 434 { 435 struct file *file = iocb->ki_filp; 436 struct address_space *mapping = file->f_mapping; 437 struct inode *inode = mapping->host; 438 struct nfs_direct_req *dreq; 439 struct nfs_lock_context *l_ctx; 440 ssize_t result, requested; 441 size_t count = iov_iter_count(iter); 442 nfs_add_stats(mapping->host, NFSIOS_DIRECTREADBYTES, count); 443 444 dfprintk(FILE, "NFS: direct read(%pD2, %zd@%Ld)\n", 445 file, count, (long long) iocb->ki_pos); 446 447 result = 0; 448 if (!count) 449 goto out; 450 451 task_io_account_read(count); 452 453 result = -ENOMEM; 454 dreq = nfs_direct_req_alloc(); 455 if (dreq == NULL) 456 goto out; 457 458 dreq->inode = inode; 459 dreq->max_count = count; 460 dreq->io_start = iocb->ki_pos; 461 dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp)); 462 l_ctx = nfs_get_lock_context(dreq->ctx); 463 if (IS_ERR(l_ctx)) { 464 result = PTR_ERR(l_ctx); 465 nfs_direct_req_release(dreq); 466 goto out_release; 467 } 468 dreq->l_ctx = l_ctx; 469 if (!is_sync_kiocb(iocb)) { 470 dreq->iocb = iocb; 471 } else if (iocb->ki_flags & IOCB_NOWAIT) { 472 result = -EAGAIN; 473 nfs_direct_req_release(dreq); 474 goto out_release; 475 } 476 477 if (user_backed_iter(iter)) 478 dreq->flags = NFS_ODIRECT_SHOULD_DIRTY; 479 480 if (!swap) { 481 if (iocb->ki_flags & IOCB_NOWAIT) 482 result = nfs_start_io_direct_nowait(inode); 483 else 484 result = nfs_start_io_direct(inode); 485 if (result) { 486 /* release the reference that would usually be 487 * consumed by nfs_direct_read_schedule_iovec() 488 */ 489 nfs_direct_req_release(dreq); 490 goto out_release; 491 } 492 } 493 494 NFS_I(inode)->read_io += count; 495 requested = nfs_direct_read_schedule_iovec(dreq, iter, iocb->ki_pos); 496 497 if (!swap) 498 nfs_end_io_direct(inode); 499 500 if (requested > 0) { 501 result = nfs_direct_wait(dreq); 502 if (result > 0) { 503 requested -= result; 504 iocb->ki_pos += result; 505 } 506 iov_iter_revert(iter, requested); 507 } else { 508 result = requested; 509 } 510 511 out_release: 512 nfs_direct_req_release(dreq); 513 out: 514 return result; 515 } 516 517 static void nfs_direct_add_page_head(struct list_head *list, 518 struct nfs_page *req) 519 { 520 struct nfs_page *head = req->wb_head; 521 522 if (!list_empty(&head->wb_list) || !nfs_lock_request(head)) 523 return; 524 if (!list_empty(&head->wb_list)) { 525 nfs_unlock_request(head); 526 return; 527 } 528 list_add(&head->wb_list, list); 529 kref_get(&head->wb_kref); 530 kref_get(&head->wb_kref); 531 } 532 533 static void nfs_direct_join_group(struct list_head *list, 534 struct nfs_commit_info *cinfo, 535 struct inode *inode) 536 { 537 struct nfs_page *req, *subreq; 538 539 list_for_each_entry(req, list, wb_list) { 540 if (req->wb_head != req) { 541 nfs_direct_add_page_head(&req->wb_list, req); 542 continue; 543 } 544 subreq = req->wb_this_page; 545 if (subreq == req) 546 continue; 547 do { 548 /* 549 * Remove subrequests from this list before freeing 550 * them in the call to nfs_join_page_group(). 551 */ 552 if (!list_empty(&subreq->wb_list)) { 553 nfs_list_remove_request(subreq); 554 nfs_release_request(subreq); 555 } 556 } while ((subreq = subreq->wb_this_page) != req); 557 nfs_join_page_group(req, cinfo, inode); 558 } 559 } 560 561 static void 562 nfs_direct_write_scan_commit_list(struct inode *inode, 563 struct list_head *list, 564 struct nfs_commit_info *cinfo) 565 { 566 mutex_lock(&NFS_I(cinfo->inode)->commit_mutex); 567 pnfs_recover_commit_reqs(list, cinfo); 568 nfs_scan_commit_list(&cinfo->mds->list, list, cinfo, 0); 569 mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex); 570 } 571 572 static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq) 573 { 574 struct nfs_pageio_descriptor desc; 575 struct nfs_page *req; 576 LIST_HEAD(reqs); 577 struct nfs_commit_info cinfo; 578 579 nfs_init_cinfo_from_dreq(&cinfo, dreq); 580 nfs_direct_write_scan_commit_list(dreq->inode, &reqs, &cinfo); 581 582 nfs_direct_join_group(&reqs, &cinfo, dreq->inode); 583 584 nfs_clear_pnfs_ds_commit_verifiers(&dreq->ds_cinfo); 585 get_dreq(dreq); 586 587 nfs_pageio_init_write(&desc, dreq->inode, FLUSH_STABLE, false, 588 &nfs_direct_write_completion_ops); 589 desc.pg_dreq = dreq; 590 591 while (!list_empty(&reqs)) { 592 req = nfs_list_entry(reqs.next); 593 /* Bump the transmission count */ 594 req->wb_nio++; 595 if (!nfs_pageio_add_request(&desc, req)) { 596 spin_lock(&dreq->lock); 597 if (dreq->error < 0) { 598 desc.pg_error = dreq->error; 599 } else if (desc.pg_error != -EAGAIN) { 600 dreq->flags = 0; 601 if (!desc.pg_error) 602 desc.pg_error = -EIO; 603 dreq->error = desc.pg_error; 604 } else 605 dreq->flags = NFS_ODIRECT_RESCHED_WRITES; 606 spin_unlock(&dreq->lock); 607 break; 608 } 609 nfs_release_request(req); 610 } 611 nfs_pageio_complete(&desc); 612 613 while (!list_empty(&reqs)) { 614 req = nfs_list_entry(reqs.next); 615 nfs_list_remove_request(req); 616 nfs_unlock_and_release_request(req); 617 if (desc.pg_error == -EAGAIN) { 618 nfs_mark_request_commit(req, NULL, &cinfo, 0); 619 } else { 620 spin_lock(&dreq->lock); 621 nfs_direct_truncate_request(dreq, req); 622 spin_unlock(&dreq->lock); 623 nfs_release_request(req); 624 } 625 } 626 627 if (put_dreq(dreq)) 628 nfs_direct_write_complete(dreq); 629 } 630 631 static void nfs_direct_commit_complete(struct nfs_commit_data *data) 632 { 633 const struct nfs_writeverf *verf = data->res.verf; 634 struct nfs_direct_req *dreq = data->dreq; 635 struct nfs_commit_info cinfo; 636 struct nfs_page *req; 637 int status = data->task.tk_status; 638 639 trace_nfs_direct_commit_complete(dreq); 640 641 spin_lock(&dreq->lock); 642 if (status < 0) { 643 /* Errors in commit are fatal */ 644 dreq->error = status; 645 dreq->flags = NFS_ODIRECT_DONE; 646 } else { 647 status = dreq->error; 648 } 649 spin_unlock(&dreq->lock); 650 651 nfs_init_cinfo_from_dreq(&cinfo, dreq); 652 653 while (!list_empty(&data->pages)) { 654 req = nfs_list_entry(data->pages.next); 655 nfs_list_remove_request(req); 656 if (status < 0) { 657 spin_lock(&dreq->lock); 658 nfs_direct_truncate_request(dreq, req); 659 spin_unlock(&dreq->lock); 660 nfs_release_request(req); 661 } else if (!nfs_write_match_verf(verf, req)) { 662 spin_lock(&dreq->lock); 663 if (dreq->flags == 0) 664 dreq->flags = NFS_ODIRECT_RESCHED_WRITES; 665 spin_unlock(&dreq->lock); 666 /* 667 * Despite the reboot, the write was successful, 668 * so reset wb_nio. 669 */ 670 req->wb_nio = 0; 671 nfs_mark_request_commit(req, NULL, &cinfo, 0); 672 } else 673 nfs_release_request(req); 674 nfs_unlock_and_release_request(req); 675 } 676 677 if (nfs_commit_end(cinfo.mds)) 678 nfs_direct_write_complete(dreq); 679 } 680 681 static void nfs_direct_resched_write(struct nfs_commit_info *cinfo, 682 struct nfs_page *req) 683 { 684 struct nfs_direct_req *dreq = cinfo->dreq; 685 686 trace_nfs_direct_resched_write(dreq); 687 688 spin_lock(&dreq->lock); 689 if (dreq->flags != NFS_ODIRECT_DONE) 690 dreq->flags = NFS_ODIRECT_RESCHED_WRITES; 691 spin_unlock(&dreq->lock); 692 nfs_mark_request_commit(req, NULL, cinfo, 0); 693 } 694 695 static const struct nfs_commit_completion_ops nfs_direct_commit_completion_ops = { 696 .completion = nfs_direct_commit_complete, 697 .resched_write = nfs_direct_resched_write, 698 }; 699 700 static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq) 701 { 702 int res; 703 struct nfs_commit_info cinfo; 704 LIST_HEAD(mds_list); 705 706 nfs_init_cinfo_from_dreq(&cinfo, dreq); 707 nfs_commit_begin(cinfo.mds); 708 nfs_scan_commit(dreq->inode, &mds_list, &cinfo); 709 res = nfs_generic_commit_list(dreq->inode, &mds_list, 0, &cinfo); 710 if (res < 0) { /* res == -ENOMEM */ 711 spin_lock(&dreq->lock); 712 if (dreq->flags == 0) 713 dreq->flags = NFS_ODIRECT_RESCHED_WRITES; 714 spin_unlock(&dreq->lock); 715 } 716 if (nfs_commit_end(cinfo.mds)) 717 nfs_direct_write_complete(dreq); 718 } 719 720 static void nfs_direct_write_clear_reqs(struct nfs_direct_req *dreq) 721 { 722 struct nfs_commit_info cinfo; 723 struct nfs_page *req; 724 LIST_HEAD(reqs); 725 726 nfs_init_cinfo_from_dreq(&cinfo, dreq); 727 nfs_direct_write_scan_commit_list(dreq->inode, &reqs, &cinfo); 728 729 while (!list_empty(&reqs)) { 730 req = nfs_list_entry(reqs.next); 731 nfs_list_remove_request(req); 732 nfs_direct_truncate_request(dreq, req); 733 nfs_release_request(req); 734 nfs_unlock_and_release_request(req); 735 } 736 } 737 738 static void nfs_direct_write_schedule_work(struct work_struct *work) 739 { 740 struct nfs_direct_req *dreq = container_of(work, struct nfs_direct_req, work); 741 int flags = dreq->flags; 742 743 dreq->flags = 0; 744 switch (flags) { 745 case NFS_ODIRECT_DO_COMMIT: 746 nfs_direct_commit_schedule(dreq); 747 break; 748 case NFS_ODIRECT_RESCHED_WRITES: 749 nfs_direct_write_reschedule(dreq); 750 break; 751 default: 752 nfs_direct_write_clear_reqs(dreq); 753 nfs_zap_mapping(dreq->inode, dreq->inode->i_mapping); 754 nfs_direct_complete(dreq); 755 } 756 } 757 758 static void nfs_direct_write_complete(struct nfs_direct_req *dreq) 759 { 760 trace_nfs_direct_write_complete(dreq); 761 queue_work(nfsiod_workqueue, &dreq->work); /* Calls nfs_direct_write_schedule_work */ 762 } 763 764 static void nfs_direct_write_completion(struct nfs_pgio_header *hdr) 765 { 766 struct nfs_direct_req *dreq = hdr->dreq; 767 struct nfs_commit_info cinfo; 768 struct inode *inode = dreq->inode; 769 int flags = NFS_ODIRECT_DONE; 770 771 trace_nfs_direct_write_completion(dreq); 772 773 nfs_init_cinfo_from_dreq(&cinfo, dreq); 774 775 spin_lock(&dreq->lock); 776 if (test_bit(NFS_IOHDR_REDO, &hdr->flags)) { 777 spin_unlock(&dreq->lock); 778 goto out_put; 779 } 780 781 nfs_direct_count_bytes(dreq, hdr); 782 if (test_bit(NFS_IOHDR_UNSTABLE_WRITES, &hdr->flags) && 783 !test_bit(NFS_IOHDR_ERROR, &hdr->flags)) { 784 if (!dreq->flags) 785 dreq->flags = NFS_ODIRECT_DO_COMMIT; 786 flags = dreq->flags; 787 } 788 spin_unlock(&dreq->lock); 789 790 spin_lock(&inode->i_lock); 791 nfs_direct_file_adjust_size_locked(inode, dreq->io_start, dreq->count); 792 nfs_update_delegated_mtime_locked(dreq->inode); 793 spin_unlock(&inode->i_lock); 794 795 while (!list_empty(&hdr->pages)) { 796 struct nfs_page *req; 797 798 req = nfs_list_entry(hdr->pages.next); 799 nfs_list_remove_request(req); 800 if (flags == NFS_ODIRECT_DO_COMMIT) { 801 kref_get(&req->wb_kref); 802 memcpy(&req->wb_verf, &hdr->verf.verifier, 803 sizeof(req->wb_verf)); 804 nfs_mark_request_commit(req, hdr->lseg, &cinfo, 805 hdr->ds_commit_idx); 806 } else if (flags == NFS_ODIRECT_RESCHED_WRITES) { 807 kref_get(&req->wb_kref); 808 nfs_mark_request_commit(req, NULL, &cinfo, 0); 809 } 810 nfs_unlock_and_release_request(req); 811 } 812 813 out_put: 814 if (put_dreq(dreq)) 815 nfs_direct_write_complete(dreq); 816 hdr->release(hdr); 817 } 818 819 static void nfs_write_sync_pgio_error(struct list_head *head, int error) 820 { 821 struct nfs_page *req; 822 823 while (!list_empty(head)) { 824 req = nfs_list_entry(head->next); 825 nfs_list_remove_request(req); 826 nfs_unlock_and_release_request(req); 827 } 828 } 829 830 static void nfs_direct_write_reschedule_io(struct nfs_pgio_header *hdr) 831 { 832 struct nfs_direct_req *dreq = hdr->dreq; 833 struct nfs_page *req; 834 struct nfs_commit_info cinfo; 835 836 trace_nfs_direct_write_reschedule_io(dreq); 837 838 nfs_init_cinfo_from_dreq(&cinfo, dreq); 839 spin_lock(&dreq->lock); 840 if (dreq->error == 0) 841 dreq->flags = NFS_ODIRECT_RESCHED_WRITES; 842 set_bit(NFS_IOHDR_REDO, &hdr->flags); 843 spin_unlock(&dreq->lock); 844 while (!list_empty(&hdr->pages)) { 845 req = nfs_list_entry(hdr->pages.next); 846 nfs_list_remove_request(req); 847 nfs_unlock_request(req); 848 nfs_mark_request_commit(req, NULL, &cinfo, 0); 849 } 850 } 851 852 static const struct nfs_pgio_completion_ops nfs_direct_write_completion_ops = { 853 .error_cleanup = nfs_write_sync_pgio_error, 854 .init_hdr = nfs_direct_pgio_init, 855 .completion = nfs_direct_write_completion, 856 .reschedule_io = nfs_direct_write_reschedule_io, 857 }; 858 859 860 /* 861 * NB: Return the value of the first error return code. Subsequent 862 * errors after the first one are ignored. 863 */ 864 /* 865 * For each wsize'd chunk of the user's buffer, dispatch an NFS WRITE 866 * operation. If nfs_writedata_alloc() or get_user_pages() fails, 867 * bail and stop sending more writes. Write length accounting is 868 * handled automatically by nfs_direct_write_result(). Otherwise, if 869 * no requests have been sent, just return an error. 870 */ 871 static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq, 872 struct iov_iter *iter, 873 loff_t pos, int ioflags) 874 { 875 struct nfs_pageio_descriptor desc; 876 struct inode *inode = dreq->inode; 877 struct nfs_commit_info cinfo; 878 ssize_t result = 0; 879 size_t requested_bytes = 0; 880 size_t wsize = max_t(size_t, NFS_SERVER(inode)->wsize, PAGE_SIZE); 881 bool defer = false; 882 883 trace_nfs_direct_write_schedule_iovec(dreq); 884 885 nfs_pageio_init_write(&desc, inode, ioflags, false, 886 &nfs_direct_write_completion_ops); 887 desc.pg_dreq = dreq; 888 get_dreq(dreq); 889 inode_dio_begin(inode); 890 891 NFS_I(inode)->write_io += iov_iter_count(iter); 892 while (iov_iter_count(iter)) { 893 struct page **pagevec; 894 size_t bytes; 895 size_t pgbase; 896 unsigned npages, i; 897 898 result = iov_iter_get_pages_alloc2(iter, &pagevec, 899 wsize, &pgbase); 900 if (result < 0) 901 break; 902 903 bytes = result; 904 npages = (result + pgbase + PAGE_SIZE - 1) / PAGE_SIZE; 905 for (i = 0; i < npages; i++) { 906 struct nfs_page *req; 907 unsigned int req_len = min_t(size_t, bytes, PAGE_SIZE - pgbase); 908 909 req = nfs_page_create_from_page(dreq->ctx, pagevec[i], 910 pgbase, pos, req_len); 911 if (IS_ERR(req)) { 912 result = PTR_ERR(req); 913 break; 914 } 915 916 if (desc.pg_error < 0) { 917 nfs_free_request(req); 918 result = desc.pg_error; 919 break; 920 } 921 922 pgbase = 0; 923 bytes -= req_len; 924 requested_bytes += req_len; 925 pos += req_len; 926 927 if (defer) { 928 nfs_mark_request_commit(req, NULL, &cinfo, 0); 929 continue; 930 } 931 932 nfs_lock_request(req); 933 if (nfs_pageio_add_request(&desc, req)) 934 continue; 935 936 /* Exit on hard errors */ 937 if (desc.pg_error < 0 && desc.pg_error != -EAGAIN) { 938 result = desc.pg_error; 939 nfs_unlock_and_release_request(req); 940 break; 941 } 942 943 /* If the error is soft, defer remaining requests */ 944 nfs_init_cinfo_from_dreq(&cinfo, dreq); 945 spin_lock(&dreq->lock); 946 dreq->flags = NFS_ODIRECT_RESCHED_WRITES; 947 spin_unlock(&dreq->lock); 948 nfs_unlock_request(req); 949 nfs_mark_request_commit(req, NULL, &cinfo, 0); 950 desc.pg_error = 0; 951 defer = true; 952 } 953 nfs_direct_release_pages(pagevec, npages); 954 kvfree(pagevec); 955 if (result < 0) 956 break; 957 } 958 nfs_pageio_complete(&desc); 959 960 /* 961 * If no bytes were started, return the error, and let the 962 * generic layer handle the completion. 963 */ 964 if (requested_bytes == 0) { 965 inode_dio_end(inode); 966 nfs_direct_req_release(dreq); 967 return result < 0 ? result : -EIO; 968 } 969 970 if (put_dreq(dreq)) 971 nfs_direct_write_complete(dreq); 972 return requested_bytes; 973 } 974 975 /** 976 * nfs_file_direct_write - file direct write operation for NFS files 977 * @iocb: target I/O control block 978 * @iter: vector of user buffers from which to write data 979 * @swap: flag indicating this is swap IO, not O_DIRECT IO 980 * 981 * We use this function for direct writes instead of calling 982 * generic_file_aio_write() in order to avoid taking the inode 983 * semaphore and updating the i_size. The NFS server will set 984 * the new i_size and this client must read the updated size 985 * back into its cache. We let the server do generic write 986 * parameter checking and report problems. 987 * 988 * We eliminate local atime updates, see direct read above. 989 * 990 * We avoid unnecessary page cache invalidations for normal cached 991 * readers of this file. 992 * 993 * Note that O_APPEND is not supported for NFS direct writes, as there 994 * is no atomic O_APPEND write facility in the NFS protocol. 995 */ 996 ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter, 997 bool swap) 998 { 999 ssize_t result, requested; 1000 size_t count; 1001 struct file *file = iocb->ki_filp; 1002 struct address_space *mapping = file->f_mapping; 1003 struct inode *inode = mapping->host; 1004 struct nfs_direct_req *dreq; 1005 struct nfs_lock_context *l_ctx; 1006 loff_t pos, end; 1007 1008 dfprintk(FILE, "NFS: direct write(%pD2, %zd@%Ld)\n", 1009 file, iov_iter_count(iter), (long long) iocb->ki_pos); 1010 1011 if (swap) 1012 /* bypass generic checks */ 1013 result = iov_iter_count(iter); 1014 else 1015 result = generic_write_checks(iocb, iter); 1016 if (result <= 0) 1017 return result; 1018 count = result; 1019 nfs_add_stats(mapping->host, NFSIOS_DIRECTWRITTENBYTES, count); 1020 1021 pos = iocb->ki_pos; 1022 end = (pos + iov_iter_count(iter) - 1) >> PAGE_SHIFT; 1023 1024 task_io_account_write(count); 1025 1026 result = -ENOMEM; 1027 dreq = nfs_direct_req_alloc(); 1028 if (!dreq) 1029 goto out; 1030 1031 dreq->inode = inode; 1032 dreq->max_count = count; 1033 dreq->io_start = pos; 1034 dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp)); 1035 l_ctx = nfs_get_lock_context(dreq->ctx); 1036 if (IS_ERR(l_ctx)) { 1037 result = PTR_ERR(l_ctx); 1038 nfs_direct_req_release(dreq); 1039 goto out_release; 1040 } 1041 dreq->l_ctx = l_ctx; 1042 if (!is_sync_kiocb(iocb)) 1043 dreq->iocb = iocb; 1044 pnfs_init_ds_commit_info_ops(&dreq->ds_cinfo, inode); 1045 1046 if (swap) { 1047 requested = nfs_direct_write_schedule_iovec(dreq, iter, pos, 1048 FLUSH_STABLE); 1049 } else { 1050 result = nfs_start_io_direct(inode); 1051 if (result) { 1052 /* release the reference that would usually be 1053 * consumed by nfs_direct_write_schedule_iovec() 1054 */ 1055 nfs_direct_req_release(dreq); 1056 goto out_release; 1057 } 1058 1059 requested = nfs_direct_write_schedule_iovec(dreq, iter, pos, 1060 FLUSH_COND_STABLE); 1061 1062 if (mapping->nrpages) { 1063 invalidate_inode_pages2_range(mapping, 1064 pos >> PAGE_SHIFT, end); 1065 } 1066 1067 nfs_end_io_direct(inode); 1068 } 1069 1070 if (requested > 0) { 1071 result = nfs_direct_wait(dreq); 1072 if (result > 0) { 1073 requested -= result; 1074 iocb->ki_pos = pos + result; 1075 /* XXX: should check the generic_write_sync retval */ 1076 generic_write_sync(iocb, result); 1077 } 1078 iov_iter_revert(iter, requested); 1079 } else { 1080 result = requested; 1081 } 1082 nfs_fscache_invalidate(inode, FSCACHE_INVAL_DIO_WRITE); 1083 out_release: 1084 nfs_direct_req_release(dreq); 1085 out: 1086 return result; 1087 } 1088 1089 /** 1090 * nfs_init_directcache - create a slab cache for nfs_direct_req structures 1091 * 1092 */ 1093 int __init nfs_init_directcache(void) 1094 { 1095 nfs_direct_cachep = kmem_cache_create("nfs_direct_cache", 1096 sizeof(struct nfs_direct_req), 1097 0, SLAB_RECLAIM_ACCOUNT, 1098 NULL); 1099 if (nfs_direct_cachep == NULL) 1100 return -ENOMEM; 1101 1102 return 0; 1103 } 1104 1105 /** 1106 * nfs_destroy_directcache - destroy the slab cache for nfs_direct_req structures 1107 * 1108 */ 1109 void nfs_destroy_directcache(void) 1110 { 1111 kmem_cache_destroy(nfs_direct_cachep); 1112 } 1113