/*
 * linux/fs/nfs/direct.c
 *
 * Copyright (C) 2003 by Chuck Lever <cel@netapp.com>
 *
 * High-performance uncached I/O for the Linux NFS client
 *
 * There are important applications whose performance or correctness
 * depends on uncached access to file data.  Database clusters
 * (multiple copies of the same instance running on separate hosts)
 * implement their own cache coherency protocol that subsumes file
 * system cache protocols.  Applications that process datasets
 * considerably larger than the client's memory do not always benefit
 * from a local cache.  A streaming video server, for instance, has no
 * need to cache the contents of a file.
 *
 * When an application requests uncached I/O, all read and write requests
 * are made directly to the server; data stored or fetched via these
 * requests is not cached in the Linux page cache.  The client does not
 * correct unaligned requests from applications.  All requested bytes are
 * held on permanent storage before a direct write system call returns to
 * an application.
 *
 * Solaris implements an uncached I/O facility called directio() that
 * is used for backups and sequential I/O to very large files.  Solaris
 * also supports uncaching whole NFS partitions with "-o forcedirectio,"
 * an undocumented mount option.
 *
 * Designed by Jeff Kimmel, Chuck Lever, and Trond Myklebust, with
 * help from Andrew Morton.
 *
 * 18 Dec 2001	Initial implementation for 2.4  --cel
 * 08 Jul 2002	Version for 2.4.19, with bug fixes --trondmy
 * 08 Jun 2003	Port to 2.5 APIs  --cel
 * 31 Mar 2004	Handle direct I/O without VFS support  --cel
 * 15 Sep 2004	Parallel async reads  --cel
 *
 */

#include <linux/config.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/smp_lock.h>
#include <linux/file.h>
#include <linux/pagemap.h>
#include <linux/kref.h>

#include <linux/nfs_fs.h>
#include <linux/nfs_page.h>
#include <linux/sunrpc/clnt.h>

#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/atomic.h>

#define NFSDBG_FACILITY		NFSDBG_VFS
#define MAX_DIRECTIO_SIZE	(4096UL << PAGE_SHIFT)

static void nfs_free_user_pages(struct page **pages, int npages, int do_dirty);
static kmem_cache_t *nfs_direct_cachep;

/*
 * This represents a set of asynchronous requests that we're waiting on
 */
struct nfs_direct_req {
	struct kref		kref;		/* release manager */
	struct list_head	list;		/* nfs_read_data structs */
	wait_queue_head_t	wait;		/* wait for i/o completion */
	struct page		**pages;	/* pages in our buffer */
	unsigned int		npages;		/* count of pages */
	atomic_t		complete,	/* i/os we're waiting for */
				count,		/* bytes actually processed */
				error;		/* any reported error */
};


/**
 * nfs_get_user_pages - find and set up pages underlying user's buffer
 * @rw: direction (read or write)
 * @user_addr: starting address of this segment of user's buffer
 * @size: size of this segment
 * @pages: returned array of page struct pointers underlying user's buffer
 */
static inline int
nfs_get_user_pages(int rw, unsigned long user_addr, size_t size,
		struct page ***pages)
{
	int result = -ENOMEM;
	unsigned long page_count;
	size_t array_size;

	/* set an arbitrary limit to prevent type overflow */
	/* XXX: this can probably be as large as INT_MAX */
	if (size > MAX_DIRECTIO_SIZE) {
		*pages = NULL;
		return -EFBIG;
	}

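	/*
	 * A worked example of the arithmetic below (illustrative values,
	 * not from the original source): with PAGE_SIZE = 4096, a
	 * 10000-byte segment starting at user address 0x10ff0 touches
	 * pages 0x10 through 0x13, so
	 *
	 *	page_count = ((0x10ff0 + 10000 + 4095) >> 12)
	 *			- (0x10ff0 >> 12)
	 *		   = 20 - 16 = 4
	 */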
	page_count = (user_addr + size + PAGE_SIZE - 1) >> PAGE_SHIFT;
	page_count -= user_addr >> PAGE_SHIFT;

	array_size = (page_count * sizeof(struct page *));
	*pages = kmalloc(array_size, GFP_KERNEL);
	if (*pages) {
		down_read(&current->mm->mmap_sem);
		result = get_user_pages(current, current->mm, user_addr,
					page_count, (rw == READ), 0,
					*pages, NULL);
		up_read(&current->mm->mmap_sem);
		/*
		 * If we got fewer pages than expected from get_user_pages(),
		 * the user buffer runs off the end of a mapping; return EFAULT.
		 */
		if (result >= 0 && result < page_count) {
			nfs_free_user_pages(*pages, result, 0);
			*pages = NULL;
			result = -EFAULT;
		}
	}
	return result;
}

/**
 * nfs_free_user_pages - tear down page struct array
 * @pages: array of page struct pointers underlying target buffer
 * @npages: number of pages in the array
 * @do_dirty: dirty the pages as we release them
 */
static void
nfs_free_user_pages(struct page **pages, int npages, int do_dirty)
{
	int i;
	for (i = 0; i < npages; i++) {
		struct page *page = pages[i];
		if (do_dirty && !PageCompound(page))
			set_page_dirty_lock(page);
		page_cache_release(page);
	}
	kfree(pages);
}

/**
 * nfs_direct_req_release - release nfs_direct_req structure for direct read
 * @kref: kref object embedded in an nfs_direct_req structure
 *
 */
static void nfs_direct_req_release(struct kref *kref)
{
	struct nfs_direct_req *dreq = container_of(kref, struct nfs_direct_req, kref);
	kmem_cache_free(nfs_direct_cachep, dreq);
}

/**
 * nfs_direct_read_alloc - allocate nfs_read_data structures for direct read
 * @nbytes: count of bytes for the read request
 * @rsize: local rsize setting
 *
 * Note we also set the number of requests we have in the dreq when we are
 * done.  This prevents races with I/O completion so we will always wait
 * until all requests have been dispatched and completed.
 */
static struct nfs_direct_req *nfs_direct_read_alloc(size_t nbytes, unsigned int rsize)
{
	struct list_head *list;
	struct nfs_direct_req *dreq;
	unsigned int reads = 0;
	unsigned int rpages = (rsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;

	dreq = kmem_cache_alloc(nfs_direct_cachep, SLAB_KERNEL);
	if (!dreq)
		return NULL;

	kref_init(&dreq->kref);
	init_waitqueue_head(&dreq->wait);
	INIT_LIST_HEAD(&dreq->list);
	atomic_set(&dreq->count, 0);
	atomic_set(&dreq->error, 0);

	list = &dreq->list;
	for (;;) {
		struct nfs_read_data *data = nfs_readdata_alloc(rpages);

		if (unlikely(!data)) {
			while (!list_empty(list)) {
				data = list_entry(list->next,
						  struct nfs_read_data, pages);
				list_del(&data->pages);
				nfs_readdata_free(data);
			}
			kref_put(&dreq->kref, nfs_direct_req_release);
			return NULL;
		}

		INIT_LIST_HEAD(&data->pages);
		list_add(&data->pages, list);

		data->req = (struct nfs_page *) dreq;
		reads++;
		if (nbytes <= rsize)
			break;
		nbytes -= rsize;
	}
	kref_get(&dreq->kref);
	atomic_set(&dreq->complete, reads);
	return dreq;
}

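/*
 * Worked example for the allocation loop above (illustrative numbers,
 * not from the original source): a 70000-byte request against
 * rsize = 32768 splits as 70000 = 32768 + 32768 + 4464, so
 * nfs_direct_read_alloc() queues three nfs_read_data structs and
 * dreq->complete starts at 3; the wakeup in nfs_direct_read_result()
 * fires only after the third RPC completes.
 */
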
/**
 * nfs_direct_read_result - handle a read reply for a direct read request
 * @data: address of NFS READ operation control block
 * @status: status of this NFS READ operation
 *
 * We must hold a reference to all the pages in this direct read request
 * until the RPCs complete.  This could be long *after* we are woken up in
 * nfs_direct_read_wait (for instance, if someone hits ^C on a slow server).
 */
static void nfs_direct_read_result(struct nfs_read_data *data, int status)
{
	struct nfs_direct_req *dreq = (struct nfs_direct_req *) data->req;

	if (likely(status >= 0))
		atomic_add(data->res.count, &dreq->count);
	else
		atomic_set(&dreq->error, status);

	if (unlikely(atomic_dec_and_test(&dreq->complete))) {
		nfs_free_user_pages(dreq->pages, dreq->npages, 1);
		wake_up(&dreq->wait);
		kref_put(&dreq->kref, nfs_direct_req_release);
	}
}

/**
 * nfs_direct_read_schedule - dispatch NFS READ operations for a direct read
 * @dreq: address of nfs_direct_req struct for this request
 * @inode: target inode
 * @ctx: target file open context
 * @user_addr: starting address of this segment of user's buffer
 * @count: size of this segment
 * @file_offset: offset in file to begin the operation
 *
 * For each nfs_read_data struct that was allocated on the list, dispatch
 * an NFS READ operation
 */
static void nfs_direct_read_schedule(struct nfs_direct_req *dreq,
		struct inode *inode, struct nfs_open_context *ctx,
		unsigned long user_addr, size_t count, loff_t file_offset)
{
	struct list_head *list = &dreq->list;
	struct page **pages = dreq->pages;
	unsigned int curpage, pgbase;
	unsigned int rsize = NFS_SERVER(inode)->rsize;

	curpage = 0;
	pgbase = user_addr & ~PAGE_MASK;
	do {
		struct nfs_read_data *data;
		unsigned int bytes;

		bytes = rsize;
		if (count < rsize)
			bytes = count;

		data = list_entry(list->next, struct nfs_read_data, pages);
		list_del_init(&data->pages);

		data->inode = inode;
		data->cred = ctx->cred;
		data->args.fh = NFS_FH(inode);
		data->args.context = ctx;
		data->args.offset = file_offset;
		data->args.pgbase = pgbase;
		data->args.pages = &pages[curpage];
		data->args.count = bytes;
		data->res.fattr = &data->fattr;
		data->res.eof = 0;
		data->res.count = bytes;

		NFS_PROTO(inode)->read_setup(data);

		data->task.tk_cookie = (unsigned long) inode;
		data->complete = nfs_direct_read_result;

		lock_kernel();
		rpc_execute(&data->task);
		unlock_kernel();

		dfprintk(VFS, "NFS: %4d initiated direct read call (req %s/%Ld, %u bytes @ offset %Lu)\n",
				data->task.tk_pid,
				inode->i_sb->s_id,
				(long long)NFS_FILEID(inode),
				bytes,
				(unsigned long long)data->args.offset);

		file_offset += bytes;
		pgbase += bytes;
		curpage += pgbase >> PAGE_SHIFT;
		pgbase &= ~PAGE_MASK;

		count -= bytes;
	} while (count != 0);
}

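/*
 * Page-cursor example for the scheduling loop above (illustrative,
 * assuming PAGE_SIZE = 4096): with rsize = 16384 and a user buffer
 * starting 1000 bytes into its first page, the first RPC sends
 * pgbase = 1000 over pages[0..4]; afterwards pgbase grows to 17384,
 * curpage advances by 17384 >> PAGE_SHIFT = 4, and pgbase wraps back
 * to 1000 for the next chunk.
 */
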
/**
 * nfs_direct_read_wait - wait for I/O completion for direct reads
 * @dreq: request on which we are to wait
 * @intr: whether or not this wait can be interrupted
 *
 * Collects and returns the final error value/byte-count.
 */
static ssize_t nfs_direct_read_wait(struct nfs_direct_req *dreq, int intr)
{
	int result = 0;

	if (intr) {
		result = wait_event_interruptible(dreq->wait,
					(atomic_read(&dreq->complete) == 0));
	} else {
		wait_event(dreq->wait, (atomic_read(&dreq->complete) == 0));
	}

	if (!result)
		result = atomic_read(&dreq->error);
	if (!result)
		result = atomic_read(&dreq->count);

	kref_put(&dreq->kref, nfs_direct_req_release);
	return (ssize_t) result;
}

/**
 * nfs_direct_read_seg - Read in one iov segment.  Generate separate
 *			  read RPCs for each "rsize" bytes.
 * @inode: target inode
 * @ctx: target file open context
 * @user_addr: starting address of this segment of user's buffer
 * @count: size of this segment
 * @file_offset: offset in file to begin the operation
 * @pages: array of addresses of page structs defining user's buffer
 * @nr_pages: number of pages in the array
 *
 */
static ssize_t nfs_direct_read_seg(struct inode *inode,
		struct nfs_open_context *ctx, unsigned long user_addr,
		size_t count, loff_t file_offset, struct page **pages,
		unsigned int nr_pages)
{
	ssize_t result;
	sigset_t oldset;
	struct rpc_clnt *clnt = NFS_CLIENT(inode);
	struct nfs_direct_req *dreq;

	dreq = nfs_direct_read_alloc(count, NFS_SERVER(inode)->rsize);
	if (!dreq)
		return -ENOMEM;

	dreq->pages = pages;
	dreq->npages = nr_pages;

	rpc_clnt_sigmask(clnt, &oldset);
	nfs_direct_read_schedule(dreq, inode, ctx, user_addr, count,
				 file_offset);
	result = nfs_direct_read_wait(dreq, clnt->cl_intr);
	rpc_clnt_sigunmask(clnt, &oldset);

	return result;
}

/**
 * nfs_direct_read - For each iov segment, map the user's buffer
 *		      then generate read RPCs.
 * @inode: target inode
 * @ctx: target file open context
 * @iov: array of vectors that define I/O buffer
 * @file_offset: offset in file to begin the operation
 * @nr_segs: size of iovec array
 *
 * We've already pushed out any non-direct writes so that this read
 * will see them when we read from the server.
 */
static ssize_t
nfs_direct_read(struct inode *inode, struct nfs_open_context *ctx,
		const struct iovec *iov, loff_t file_offset,
		unsigned long nr_segs)
{
	ssize_t tot_bytes = 0;
	unsigned long seg = 0;

	while ((seg < nr_segs) && (tot_bytes >= 0)) {
		ssize_t result;
		int page_count;
		struct page **pages;
		const struct iovec *vec = &iov[seg++];
		unsigned long user_addr = (unsigned long) vec->iov_base;
		size_t size = vec->iov_len;

		page_count = nfs_get_user_pages(READ, user_addr, size, &pages);
		if (page_count < 0) {
			nfs_free_user_pages(pages, 0, 0);
			if (tot_bytes > 0)
				break;
			return page_count;
		}

		result = nfs_direct_read_seg(inode, ctx, user_addr, size,
				file_offset, pages, page_count);

		if (result <= 0) {
			if (tot_bytes > 0)
				break;
			return result;
		}
		tot_bytes += result;
		file_offset += result;
		if (result < size)
			break;
	}

	return tot_bytes;
}

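/*
 * Short-read semantics of the segment loop above (illustrative
 * scenario): if a two-segment iovec of 8192 + 8192 bytes hits
 * end-of-file after 4096 bytes of the first segment,
 * nfs_direct_read_seg() returns 4096, result < size stops the loop,
 * and the application sees a 4096-byte read rather than an error.
 */
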
/**
 * nfs_direct_write_seg - Write out one iov segment.  Generate separate
 *			   write RPCs for each "wsize" bytes, then commit.
 * @inode: target inode
 * @ctx: target file open context
 * @user_addr: starting address of this segment of user's buffer
 * @count: size of this segment
 * @file_offset: offset in file to begin the operation
 * @pages: array of addresses of page structs defining user's buffer
 * @nr_pages: size of pages array
 */
static ssize_t nfs_direct_write_seg(struct inode *inode,
		struct nfs_open_context *ctx, unsigned long user_addr,
		size_t count, loff_t file_offset, struct page **pages,
		int nr_pages)
{
	const unsigned int wsize = NFS_SERVER(inode)->wsize;
	size_t request;
	int curpage, need_commit;
	ssize_t result, tot_bytes;
	struct nfs_writeverf first_verf;
	struct nfs_write_data *wdata;

	wdata = nfs_writedata_alloc(NFS_SERVER(inode)->wpages);
	if (!wdata)
		return -ENOMEM;

	wdata->inode = inode;
	wdata->cred = ctx->cred;
	wdata->args.fh = NFS_FH(inode);
	wdata->args.context = ctx;
	wdata->args.stable = NFS_UNSTABLE;
	if (IS_SYNC(inode) || NFS_PROTO(inode)->version == 2 || count <= wsize)
		wdata->args.stable = NFS_FILE_SYNC;
	wdata->res.fattr = &wdata->fattr;
	wdata->res.verf = &wdata->verf;

	nfs_begin_data_update(inode);
retry:
	need_commit = 0;
	tot_bytes = 0;
	curpage = 0;
	request = count;
	wdata->args.pgbase = user_addr & ~PAGE_MASK;
	wdata->args.offset = file_offset;
	do {
		wdata->args.count = request;
		if (wdata->args.count > wsize)
			wdata->args.count = wsize;
		wdata->args.pages = &pages[curpage];

		dprintk("NFS: direct write: c=%u o=%Ld ua=%lu, pb=%u, cp=%u\n",
			wdata->args.count, (long long) wdata->args.offset,
			user_addr + tot_bytes, wdata->args.pgbase, curpage);

		lock_kernel();
		result = NFS_PROTO(inode)->write(wdata);
		unlock_kernel();

		if (result <= 0) {
			if (tot_bytes > 0)
				break;
			goto out;
		}

		if (tot_bytes == 0)
			memcpy(&first_verf.verifier, &wdata->verf.verifier,
						sizeof(first_verf.verifier));
		if (wdata->verf.committed != NFS_FILE_SYNC) {
			need_commit = 1;
			if (memcmp(&first_verf.verifier, &wdata->verf.verifier,
					sizeof(first_verf.verifier)))
				goto sync_retry;
		}

		tot_bytes += result;

		/* in case of a short write: stop now, let the app recover */
		if (result < wdata->args.count)
			break;

		wdata->args.offset += result;
		wdata->args.pgbase += result;
		curpage += wdata->args.pgbase >> PAGE_SHIFT;
		wdata->args.pgbase &= ~PAGE_MASK;
		request -= result;
	} while (request != 0);

	/*
	 * Commit data written so far, even in the event of an error
	 */
	if (need_commit) {
		wdata->args.count = tot_bytes;
		wdata->args.offset = file_offset;

		lock_kernel();
		result = NFS_PROTO(inode)->commit(wdata);
		unlock_kernel();

		if (result < 0 || memcmp(&first_verf.verifier,
					 &wdata->verf.verifier,
					 sizeof(first_verf.verifier)) != 0)
			goto sync_retry;
	}
	result = tot_bytes;

out:
	nfs_end_data_update(inode);
	nfs_writedata_free(wdata);
	return result;

sync_retry:
	wdata->args.stable = NFS_FILE_SYNC;
	goto retry;
}

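/*
 * Commit/verifier example for the write path above (illustrative
 * numbers): a 100000-byte segment with wsize = 32768 goes out as four
 * UNSTABLE WRITEs.  If the server reboots between them, its write
 * verifier changes, the memcmp() against first_verf fails, and
 * sync_retry resends the whole segment with stable = NFS_FILE_SYNC
 * instead of trusting a COMMIT of possibly lost data.
 */
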
/**
 * nfs_direct_write - For each iov segment, map the user's buffer
 *		       then generate write and commit RPCs.
 * @inode: target inode
 * @ctx: target file open context
 * @iov: array of vectors that define I/O buffer
 * @file_offset: offset in file to begin the operation
 * @nr_segs: size of iovec array
 *
 * Upon return, generic_file_direct_IO invalidates any cached pages
 * that non-direct readers might access, so they will pick up these
 * writes immediately.
 */
static ssize_t nfs_direct_write(struct inode *inode,
		struct nfs_open_context *ctx, const struct iovec *iov,
		loff_t file_offset, unsigned long nr_segs)
{
	ssize_t tot_bytes = 0;
	unsigned long seg = 0;

	while ((seg < nr_segs) && (tot_bytes >= 0)) {
		ssize_t result;
		int page_count;
		struct page **pages;
		const struct iovec *vec = &iov[seg++];
		unsigned long user_addr = (unsigned long) vec->iov_base;
		size_t size = vec->iov_len;

		page_count = nfs_get_user_pages(WRITE, user_addr, size, &pages);
		if (page_count < 0) {
			nfs_free_user_pages(pages, 0, 0);
			if (tot_bytes > 0)
				break;
			return page_count;
		}

		result = nfs_direct_write_seg(inode, ctx, user_addr, size,
				file_offset, pages, page_count);
		nfs_free_user_pages(pages, page_count, 0);

		if (result <= 0) {
			if (tot_bytes > 0)
				break;
			return result;
		}
		tot_bytes += result;
		file_offset += result;
		if (result < size)
			break;
	}
	return tot_bytes;
}

/**
 * nfs_direct_IO - NFS address space operation for direct I/O
 * @rw: direction (read or write)
 * @iocb: target I/O control block
 * @iov: array of vectors that define I/O buffer
 * @file_offset: offset in file to begin the operation
 * @nr_segs: size of iovec array
 *
 */
ssize_t
nfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
		loff_t file_offset, unsigned long nr_segs)
{
	ssize_t result = -EINVAL;
	struct file *file = iocb->ki_filp;
	struct nfs_open_context *ctx;
	struct dentry *dentry = file->f_dentry;
	struct inode *inode = dentry->d_inode;

	/*
	 * No support for async yet
	 */
	if (!is_sync_kiocb(iocb))
		return result;

	ctx = (struct nfs_open_context *)file->private_data;
	switch (rw) {
	case READ:
		dprintk("NFS: direct_IO(read) (%s) off/no(%Lu/%lu)\n",
				dentry->d_name.name, file_offset, nr_segs);

		result = nfs_direct_read(inode, ctx, iov,
				file_offset, nr_segs);
		break;
	case WRITE:
		dprintk("NFS: direct_IO(write) (%s) off/no(%Lu/%lu)\n",
				dentry->d_name.name, file_offset, nr_segs);

		result = nfs_direct_write(inode, ctx, iov,
				file_offset, nr_segs);
		break;
	default:
		break;
	}
	return result;
}

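/*
 * Illustrative application-level usage (hypothetical path and sizes,
 * not part of this file): a process that opens a file on an NFS mount
 * with O_DIRECT,
 *
 *	int fd = open("/mnt/nfs/dbfile", O_RDWR | O_DIRECT);
 *	read(fd, buf, 65536);
 *
 * is expected to reach the entry points below.  As the header comment
 * notes, the client sends such requests as-is and does not correct
 * unaligned buffers.
 */
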
/**
 * nfs_file_direct_read - file direct read operation for NFS files
 * @iocb: target I/O control block
 * @buf: user's buffer into which to read data
 * @count: number of bytes to read
 * @pos: byte offset in file where reading starts
 *
 * We use this function for direct reads instead of calling
 * generic_file_aio_read() in order to avoid gfar's check to see if
 * the request starts before the end of the file.  For that check
 * to work, we must generate a GETATTR before each direct read, and
 * even then there is a window between the GETATTR and the subsequent
 * READ where the file size could change.  So our preference is simply
 * to do all reads the application wants, and the server will take
 * care of managing the end of file boundary.
 *
 * This function also eliminates unnecessarily updating the file's
 * atime locally, as the NFS server sets the file's atime, and this
 * client must read the updated atime from the server back into its
 * cache.
 */
ssize_t
nfs_file_direct_read(struct kiocb *iocb, char __user *buf, size_t count, loff_t pos)
{
	ssize_t retval = -EINVAL;
	loff_t *ppos = &iocb->ki_pos;
	struct file *file = iocb->ki_filp;
	struct nfs_open_context *ctx =
			(struct nfs_open_context *) file->private_data;
	struct address_space *mapping = file->f_mapping;
	struct inode *inode = mapping->host;
	struct iovec iov = {
		.iov_base = buf,
		.iov_len = count,
	};

	dprintk("nfs: direct read(%s/%s, %lu@%Ld)\n",
		file->f_dentry->d_parent->d_name.name,
		file->f_dentry->d_name.name,
		(unsigned long) count, (long long) pos);

	if (!is_sync_kiocb(iocb))
		goto out;
	if ((ssize_t) count < 0)
		goto out;
	retval = -EFAULT;
	if (!access_ok(VERIFY_WRITE, iov.iov_base, iov.iov_len))
		goto out;
	retval = 0;
	if (!count)
		goto out;

	retval = nfs_sync_mapping(mapping);
	if (retval)
		goto out;

	retval = nfs_direct_read(inode, ctx, &iov, pos, 1);
	if (retval > 0)
		*ppos = pos + retval;

out:
	return retval;
}

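/*
 * Ordering example for the read entry point above (illustrative
 * scenario): if another file descriptor has written this file through
 * the page cache, nfs_sync_mapping() flushes those dirty pages to the
 * server before the direct READ is sent, so the uncached read observes
 * the buffered data it would otherwise race with.
 */
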
/**
 * nfs_file_direct_write - file direct write operation for NFS files
 * @iocb: target I/O control block
 * @buf: user's buffer from which to write data
 * @count: number of bytes to write
 * @pos: byte offset in file where writing starts
 *
 * We use this function for direct writes instead of calling
 * generic_file_aio_write() in order to avoid taking the inode
 * semaphore and updating the i_size.  The NFS server will set
 * the new i_size and this client must read the updated size
 * back into its cache.  We let the server do generic write
 * parameter checking and report problems.
 *
 * We also avoid an unnecessary invocation of generic_osync_inode(),
 * as it is fairly meaningless to sync the metadata of an NFS file.
 *
 * We eliminate local atime updates, see direct read above.
 *
 * We avoid unnecessary page cache invalidations for normal cached
 * readers of this file.
 *
 * Note that O_APPEND is not supported for NFS direct writes, as there
 * is no atomic O_APPEND write facility in the NFS protocol.
 */
ssize_t
nfs_file_direct_write(struct kiocb *iocb, const char __user *buf, size_t count, loff_t pos)
{
	ssize_t retval;
	struct file *file = iocb->ki_filp;
	struct nfs_open_context *ctx =
			(struct nfs_open_context *) file->private_data;
	struct address_space *mapping = file->f_mapping;
	struct inode *inode = mapping->host;
	struct iovec iov = {
		.iov_base = (char __user *)buf,
	};

	dfprintk(VFS, "nfs: direct write(%s/%s, %lu@%Ld)\n",
		file->f_dentry->d_parent->d_name.name,
		file->f_dentry->d_name.name,
		(unsigned long) count, (long long) pos);

	retval = -EINVAL;
	if (!is_sync_kiocb(iocb))
		goto out;

	retval = generic_write_checks(file, &pos, &count, 0);
	if (retval)
		goto out;

	retval = -EINVAL;
	if ((ssize_t) count < 0)
		goto out;
	retval = 0;
	if (!count)
		goto out;
	iov.iov_len = count;

	retval = -EFAULT;
	if (!access_ok(VERIFY_READ, iov.iov_base, iov.iov_len))
		goto out;

	retval = nfs_sync_mapping(mapping);
	if (retval)
		goto out;

	retval = nfs_direct_write(inode, ctx, &iov, pos, 1);
	if (mapping->nrpages)
		invalidate_inode_pages2(mapping);
	if (retval > 0)
		iocb->ki_pos = pos + retval;

out:
	return retval;
}

int nfs_init_directcache(void)
{
	nfs_direct_cachep = kmem_cache_create("nfs_direct_cache",
						sizeof(struct nfs_direct_req),
						0, SLAB_RECLAIM_ACCOUNT,
						NULL, NULL);
	if (nfs_direct_cachep == NULL)
		return -ENOMEM;

	return 0;
}

void nfs_destroy_directcache(void)
{
	if (kmem_cache_destroy(nfs_direct_cachep))
		printk(KERN_INFO "nfs_direct_cache: not all structures were freed\n");
}
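
/*
 * Illustrative init-time pairing for the two functions above
 * (hypothetical caller shown only for clarity; the real call sites
 * live in the client's module init and exit paths):
 *
 *	if (nfs_init_directcache() != 0)
 *		goto out_err;
 *	...
 *	nfs_destroy_directcache();
 */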