/*
 * linux/fs/nfs/read.c
 *
 * Block I/O for NFS
 *
 * Partial copy of Linus' read cache modifications to fs/nfs/file.c
 * modified for async RPC by okir@monad.swb.de
 *
 * We do an ugly hack here in order to return proper error codes to the
 * user program when a read request failed: since generic_file_read
 * only checks the return value of inode->i_op->readpage() which is always 0
 * for async RPC, we set the error bit of the page to 1 when an error occurs,
 * and make nfs_readpage transmit requests synchronously when encountering this.
 * This is only a small problem, though, since we now retry all operations
 * within the RPC code when root squashing is suspected.
 */

#include <linux/time.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/fcntl.h>
#include <linux/stat.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/pagemap.h>
#include <linux/sunrpc/clnt.h>
#include <linux/nfs_fs.h>
#include <linux/nfs_page.h>
#include <linux/smp_lock.h>

#include <asm/system.h>

#include "iostat.h"

#define NFSDBG_FACILITY		NFSDBG_PAGECACHE

static int nfs_pagein_one(struct list_head *, struct inode *);
static const struct rpc_call_ops nfs_read_partial_ops;
static const struct rpc_call_ops nfs_read_full_ops;

static kmem_cache_t *nfs_rdata_cachep;
static mempool_t *nfs_rdata_mempool;

#define MIN_POOL_READ	(32)

/* Allocate a read descriptor with room for @pagecount page pointers. */
struct nfs_read_data *nfs_readdata_alloc(unsigned int pagecount)
{
	struct nfs_read_data *p = mempool_alloc(nfs_rdata_mempool, SLAB_NOFS);

	if (p) {
		memset(p, 0, sizeof(*p));
		INIT_LIST_HEAD(&p->pages);
		if (pagecount <= ARRAY_SIZE(p->page_array))
			p->pagevec = p->page_array;
		else {
			p->pagevec = kcalloc(pagecount, sizeof(struct page *), GFP_NOFS);
			if (!p->pagevec) {
				mempool_free(p, nfs_rdata_mempool);
				p = NULL;
			}
		}
	}
	return p;
}

void nfs_readdata_free(struct nfs_read_data *p)
{
	if (p && (p->pagevec != &p->page_array[0]))
		kfree(p->pagevec);
	mempool_free(p, nfs_rdata_mempool);
}

void nfs_readdata_release(void *data)
{
	nfs_readdata_free(data);
}

/*
 * Return the number of bytes of @page that lie within i_size,
 * or zero if the page is entirely beyond end-of-file.
 */
static
unsigned int nfs_page_length(struct inode *inode, struct page *page)
{
	loff_t i_size = i_size_read(inode);
	unsigned long idx;

	if (i_size <= 0)
		return 0;
	idx = (i_size - 1) >> PAGE_CACHE_SHIFT;
	if (page->index > idx)
		return 0;
	if (page->index != idx)
		return PAGE_CACHE_SIZE;
	return 1 + ((i_size - 1) & (PAGE_CACHE_SIZE - 1));
}

/* Zero-fill a page that lies entirely beyond end-of-file and mark it uptodate. */
static
int nfs_return_empty_page(struct page *page)
{
	memclear_highpage_flush(page, 0, PAGE_CACHE_SIZE);
	SetPageUptodate(page);
	unlock_page(page);
	return 0;
}

/*
 * Zero the tail of the requested range that the server did not return
 * because it hit end-of-file.
 */
static void nfs_readpage_truncate_uninitialised_page(struct nfs_read_data *data)
{
	unsigned int remainder = data->args.count - data->res.count;
	unsigned int base = data->args.pgbase + data->res.count;
	unsigned int pglen;
	struct page **pages;

	if (data->res.eof == 0 || remainder == 0)
		return;
	/*
	 * Note: "remainder" can never be negative, since we check for
	 * this in the XDR code.
	 */
	pages = &data->args.pages[base >> PAGE_CACHE_SHIFT];
	base &= ~PAGE_CACHE_MASK;
	pglen = PAGE_CACHE_SIZE - base;
	if (pglen < remainder)
		memclear_highpage_flush(*pages, base, pglen);
	else
		memclear_highpage_flush(*pages, base, remainder);
}

/*
 * Read a page synchronously.
 */
static int nfs_readpage_sync(struct nfs_open_context *ctx, struct inode *inode,
		struct page *page)
{
	unsigned int rsize = NFS_SERVER(inode)->rsize;
	unsigned int count = PAGE_CACHE_SIZE;
	int result;
	struct nfs_read_data *rdata;

	rdata = nfs_readdata_alloc(1);
	if (!rdata)
		return -ENOMEM;

	memset(rdata, 0, sizeof(*rdata));
	rdata->flags = (IS_SWAPFILE(inode)? NFS_RPC_SWAPFLAGS : 0);
	rdata->cred = ctx->cred;
	rdata->inode = inode;
	INIT_LIST_HEAD(&rdata->pages);
	rdata->args.fh = NFS_FH(inode);
	rdata->args.context = ctx;
	rdata->args.pages = &page;
	rdata->args.pgbase = 0UL;
	rdata->args.count = rsize;
	rdata->res.fattr = &rdata->fattr;

	dprintk("NFS: nfs_readpage_sync(%p)\n", page);

	/*
	 * This works now because the socket layer never tries to DMA
	 * into this buffer directly.
	 */
	do {
		if (count < rsize)
			rdata->args.count = count;
		rdata->res.count = rdata->args.count;
		rdata->args.offset = page_offset(page) + rdata->args.pgbase;

		dprintk("NFS: nfs_proc_read(%s, (%s/%Ld), %Lu, %u)\n",
			NFS_SERVER(inode)->hostname,
			inode->i_sb->s_id,
			(long long)NFS_FILEID(inode),
			(unsigned long long)rdata->args.pgbase,
			rdata->args.count);

		lock_kernel();
		result = NFS_PROTO(inode)->read(rdata);
		unlock_kernel();

		/*
		 * Even if we had a partial success we can't mark the page
		 * cache valid.
		 */
		if (result < 0) {
			if (result == -EISDIR)
				result = -EINVAL;
			goto io_error;
		}
		count -= result;
		rdata->args.pgbase += result;
		nfs_add_stats(inode, NFSIOS_SERVERREADBYTES, result);

		/* Note: result == 0 should only happen if we're caching
		 * a write that extends the file and punches a hole.
		 */
		if (rdata->res.eof != 0 || result == 0)
			break;
	} while (count);
	spin_lock(&inode->i_lock);
	NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ATIME;
	spin_unlock(&inode->i_lock);

	nfs_readpage_truncate_uninitialised_page(rdata);
	if (rdata->res.eof || rdata->res.count == rdata->args.count)
		SetPageUptodate(page);
	result = 0;

io_error:
	unlock_page(page);
	nfs_readdata_free(rdata);
	return result;
}

static int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,
		struct page *page)
{
	LIST_HEAD(one_request);
	struct nfs_page *new;
	unsigned int len;

	len = nfs_page_length(inode, page);
	if (len == 0)
		return nfs_return_empty_page(page);
	new = nfs_create_request(ctx, inode, page, 0, len);
	if (IS_ERR(new)) {
		unlock_page(page);
		return PTR_ERR(new);
	}
	if (len < PAGE_CACHE_SIZE)
		memclear_highpage_flush(page, len, PAGE_CACHE_SIZE - len);

	nfs_list_add_request(new, &one_request);
	nfs_pagein_one(&one_request, inode);
	return 0;
}

static void nfs_readpage_release(struct nfs_page *req)
{
	unlock_page(req->wb_page);

	dprintk("NFS: read done (%s/%Ld %d@%Ld)\n",
			req->wb_context->dentry->d_inode->i_sb->s_id,
			(long long)NFS_FILEID(req->wb_context->dentry->d_inode),
			req->wb_bytes,
			(long long)req_offset(req));
	nfs_clear_request(req);
	nfs_release_request(req);
}

/*
 * Set up the NFS read request struct
 */
static void nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data,
		const struct rpc_call_ops *call_ops,
		unsigned int count, unsigned int offset)
{
	struct inode *inode;
	int flags;

	data->req = req;
	data->inode = inode = req->wb_context->dentry->d_inode;
	data->cred = req->wb_context->cred;

	data->args.fh = NFS_FH(inode);
	data->args.offset = req_offset(req) + offset;
	data->args.pgbase = req->wb_pgbase + offset;
	data->args.pages = data->pagevec;
	data->args.count = count;
	data->args.context = req->wb_context;

	data->res.fattr = &data->fattr;
	data->res.count = count;
	data->res.eof = 0;
	nfs_fattr_init(&data->fattr);

	/* Set up the initial task struct. */
	flags = RPC_TASK_ASYNC | (IS_SWAPFILE(inode)? NFS_RPC_SWAPFLAGS : 0);
	rpc_init_task(&data->task, NFS_CLIENT(inode), flags, call_ops, data);
	NFS_PROTO(inode)->read_setup(data);

	data->task.tk_cookie = (unsigned long)inode;

	dprintk("NFS: %4d initiated read call (req %s/%Ld, %u bytes @ offset %Lu)\n",
			data->task.tk_pid,
			inode->i_sb->s_id,
			(long long)NFS_FILEID(inode),
			count,
			(unsigned long long)data->args.offset);
}

static void
nfs_async_read_error(struct list_head *head)
{
	struct nfs_page *req;

	while (!list_empty(head)) {
		req = nfs_list_entry(head->next);
		nfs_list_remove_request(req);
		SetPageError(req->wb_page);
		nfs_readpage_release(req);
	}
}

/*
 * Start an async read operation
 */
static void nfs_execute_read(struct nfs_read_data *data)
{
	struct rpc_clnt *clnt = NFS_CLIENT(data->inode);
	sigset_t oldset;

	rpc_clnt_sigmask(clnt, &oldset);
	lock_kernel();
	rpc_execute(&data->task);
	unlock_kernel();
	rpc_clnt_sigunmask(clnt, &oldset);
}

/*
 * Generate multiple requests to fill a single page.
 *
 * We optimize to reduce the number of read operations on the wire.
 * If we detect that we're reading a page, or an area of a page, that is past
 * the end of file, we do not generate NFS read operations but just clear the
 * parts of the page that would have come back zero from the server anyway.
 *
 * We rely on the cached value of i_size to make this determination; another
 * client can fill pages on the server past our cached end-of-file, but we
 * won't see the new data until our attribute cache is updated.  This is more
 * or less conventional NFS client behavior.
 */
static int nfs_pagein_multi(struct list_head *head, struct inode *inode)
{
	struct nfs_page *req = nfs_list_entry(head->next);
	struct page *page = req->wb_page;
	struct nfs_read_data *data;
	unsigned int rsize = NFS_SERVER(inode)->rsize;
	unsigned int nbytes, offset;
	int requests = 0;
	LIST_HEAD(list);

	nfs_list_remove_request(req);

	nbytes = req->wb_bytes;
	for(;;) {
		data = nfs_readdata_alloc(1);
		if (!data)
			goto out_bad;
		INIT_LIST_HEAD(&data->pages);
		list_add(&data->pages, &list);
		requests++;
		if (nbytes <= rsize)
			break;
		nbytes -= rsize;
	}
	atomic_set(&req->wb_complete, requests);

	ClearPageError(page);
	offset = 0;
	nbytes = req->wb_bytes;
	do {
		data = list_entry(list.next, struct nfs_read_data, pages);
		list_del_init(&data->pages);

		data->pagevec[0] = page;

		if (nbytes > rsize) {
			nfs_read_rpcsetup(req, data, &nfs_read_partial_ops,
					rsize, offset);
			offset += rsize;
			nbytes -= rsize;
		} else {
			nfs_read_rpcsetup(req, data, &nfs_read_partial_ops,
					nbytes, offset);
			nbytes = 0;
		}
		nfs_execute_read(data);
	} while (nbytes != 0);

	return 0;

out_bad:
	while (!list_empty(&list)) {
		data = list_entry(list.next, struct nfs_read_data, pages);
		list_del(&data->pages);
		nfs_readdata_free(data);
	}
	SetPageError(page);
	nfs_readpage_release(req);
	return -ENOMEM;
}

static int nfs_pagein_one(struct list_head *head, struct inode *inode)
{
	struct nfs_page *req;
	struct page **pages;
	struct nfs_read_data *data;
	unsigned int count;

	if (NFS_SERVER(inode)->rsize < PAGE_CACHE_SIZE)
		return nfs_pagein_multi(head, inode);

	data = nfs_readdata_alloc(NFS_SERVER(inode)->rpages);
	if (!data)
		goto out_bad;

	INIT_LIST_HEAD(&data->pages);
	pages = data->pagevec;
	count = 0;
	while (!list_empty(head)) {
		req = nfs_list_entry(head->next);
		nfs_list_remove_request(req);
		nfs_list_add_request(req, &data->pages);
		ClearPageError(req->wb_page);
		*pages++ = req->wb_page;
		count += req->wb_bytes;
	}
	req = nfs_list_entry(data->pages.next);

	nfs_read_rpcsetup(req, data, &nfs_read_full_ops, count, 0);

	nfs_execute_read(data);
	return 0;
out_bad:
	nfs_async_read_error(head);
	return -ENOMEM;
}

static int
nfs_pagein_list(struct list_head *head, int rpages)
{
	LIST_HEAD(one_request);
	struct nfs_page *req;
	int error = 0;
	unsigned int pages = 0;

	while (!list_empty(head)) {
		pages += nfs_coalesce_requests(head, &one_request, rpages);
		req = nfs_list_entry(one_request.next);
		error = nfs_pagein_one(&one_request, req->wb_context->dentry->d_inode);
		if (error < 0)
			break;
	}
	if (error >= 0)
		return pages;

	nfs_async_read_error(head);
	return error;
}

/*
 * Handle a read reply that fills part of a page.
 */
static void nfs_readpage_result_partial(struct rpc_task *task, void *calldata)
{
	struct nfs_read_data *data = calldata;
	struct nfs_page *req = data->req;
	struct page *page = req->wb_page;

	if (likely(task->tk_status >= 0))
		nfs_readpage_truncate_uninitialised_page(data);
	else
		SetPageError(page);
	if (nfs_readpage_result(task, data) != 0)
		return;
	if (atomic_dec_and_test(&req->wb_complete)) {
		if (!PageError(page))
			SetPageUptodate(page);
		nfs_readpage_release(req);
	}
}

static const struct rpc_call_ops nfs_read_partial_ops = {
	.rpc_call_done = nfs_readpage_result_partial,
	.rpc_release = nfs_readdata_release,
};

static void nfs_readpage_set_pages_uptodate(struct nfs_read_data *data)
{
	unsigned int count = data->res.count;
	unsigned int base = data->args.pgbase;
	struct page **pages;

	if (unlikely(count == 0))
		return;
	pages = &data->args.pages[base >> PAGE_CACHE_SHIFT];
	base &= ~PAGE_CACHE_MASK;
	count += base;
	for (;count >= PAGE_CACHE_SIZE; count -= PAGE_CACHE_SIZE, pages++)
		SetPageUptodate(*pages);
	/*
	 * Was this an eof or a short read? If the latter, don't mark the page
	 * as uptodate yet.
	 */
	if (count > 0 && (data->res.eof || data->args.count == data->res.count))
		SetPageUptodate(*pages);
}

static void nfs_readpage_set_pages_error(struct nfs_read_data *data)
{
	unsigned int count = data->args.count;
	unsigned int base = data->args.pgbase;
	struct page **pages;

	pages = &data->args.pages[base >> PAGE_CACHE_SHIFT];
	base &= ~PAGE_CACHE_MASK;
	count += base;
	for (;count >= PAGE_CACHE_SIZE; count -= PAGE_CACHE_SIZE, pages++)
		SetPageError(*pages);
}

/*
 * This is the callback from RPC telling us whether a reply was
 * received or some error occurred (timeout or socket shutdown).
 */
static void nfs_readpage_result_full(struct rpc_task *task, void *calldata)
{
	struct nfs_read_data *data = calldata;

	/*
	 * Note: nfs_readpage_result may change the values of
	 * data->args. In the multi-page case, we therefore need
	 * to ensure that we call nfs_readpage_set_pages_uptodate()
	 * first.
	 */
	if (likely(task->tk_status >= 0)) {
		nfs_readpage_truncate_uninitialised_page(data);
		nfs_readpage_set_pages_uptodate(data);
	} else
		nfs_readpage_set_pages_error(data);
	if (nfs_readpage_result(task, data) != 0)
		return;
	while (!list_empty(&data->pages)) {
		struct nfs_page *req = nfs_list_entry(data->pages.next);

		nfs_list_remove_request(req);
		nfs_readpage_release(req);
	}
}

static const struct rpc_call_ops nfs_read_full_ops = {
	.rpc_call_done = nfs_readpage_result_full,
	.rpc_release = nfs_readdata_release,
};

/*
 * Shared completion handling for the "full" and "partial" read callbacks
 * above: account the read, retry short reads, and invalidate the cached atime.
 */
int nfs_readpage_result(struct rpc_task *task, struct nfs_read_data *data)
{
	struct nfs_readargs *argp = &data->args;
	struct nfs_readres *resp = &data->res;
	int status;

	dprintk("NFS: %4d nfs_readpage_result, (status %d)\n",
		task->tk_pid, task->tk_status);

	status = NFS_PROTO(data->inode)->read_done(task, data);
	if (status != 0)
		return status;

	nfs_add_stats(data->inode, NFSIOS_SERVERREADBYTES, resp->count);

	/* Is this a short read? */
	if (task->tk_status >= 0 && resp->count < argp->count && !resp->eof) {
		nfs_inc_stats(data->inode, NFSIOS_SHORTREAD);
		/* Has the server at least made some progress? */
		if (resp->count != 0) {
			/* Yes, so retry the read at the end of the data */
			argp->offset += resp->count;
			argp->pgbase += resp->count;
			argp->count -= resp->count;
			rpc_restart_call(task);
			return -EAGAIN;
		}
		task->tk_status = -EIO;
	}
	spin_lock(&data->inode->i_lock);
	NFS_I(data->inode)->cache_validity |= NFS_INO_INVALID_ATIME;
	spin_unlock(&data->inode->i_lock);
	return 0;
}

/*
 * Read a page over NFS.
 * We read the page synchronously in the following case:
 *  -	The error flag is set for this page. This happens only when a
 *	previous async read operation failed.
 */
int nfs_readpage(struct file *file, struct page *page)
{
	struct nfs_open_context *ctx;
	struct inode *inode = page->mapping->host;
	int error;

	dprintk("NFS: nfs_readpage (%p %ld@%lu)\n",
		page, PAGE_CACHE_SIZE, page->index);
	nfs_inc_stats(inode, NFSIOS_VFSREADPAGE);
	nfs_add_stats(inode, NFSIOS_READPAGES, 1);

	/*
	 * Try to flush any pending writes to the file..
	 *
	 * NOTE! Because we own the page lock, there cannot
	 * be any new pending writes generated at this point
	 * for this page (other pages can be written to).
	 */
	error = nfs_wb_page(inode, page);
	if (error)
		goto out_error;

	if (file == NULL) {
		ctx = nfs_find_open_context(inode, NULL, FMODE_READ);
		if (ctx == NULL)
			return -EBADF;
	} else
		ctx = get_nfs_open_context((struct nfs_open_context *)
				file->private_data);
	if (!IS_SYNC(inode)) {
		error = nfs_readpage_async(ctx, inode, page);
		goto out;
	}

	error = nfs_readpage_sync(ctx, inode, page);
	if (error < 0 && IS_SWAPFILE(inode))
		printk("Aiee.. nfs swap-in of page failed!\n");
out:
	put_nfs_open_context(ctx);
	return error;

out_error:
	unlock_page(page);
	return error;
}

struct nfs_readdesc {
	struct list_head *head;
	struct nfs_open_context *ctx;
};

static int
readpage_async_filler(void *data, struct page *page)
{
	struct nfs_readdesc *desc = (struct nfs_readdesc *)data;
	struct inode *inode = page->mapping->host;
	struct nfs_page *new;
	unsigned int len;

	nfs_wb_page(inode, page);
	len = nfs_page_length(inode, page);
	if (len == 0)
		return nfs_return_empty_page(page);
	new = nfs_create_request(desc->ctx, inode, page, 0, len);
	if (IS_ERR(new)) {
		SetPageError(page);
		unlock_page(page);
		return PTR_ERR(new);
	}
	if (len < PAGE_CACHE_SIZE)
		memclear_highpage_flush(page, len, PAGE_CACHE_SIZE - len);
	nfs_list_add_request(new, desc->head);
	return 0;
}

int nfs_readpages(struct file *filp, struct address_space *mapping,
		struct list_head *pages, unsigned nr_pages)
{
	LIST_HEAD(head);
	struct nfs_readdesc desc = {
		.head = &head,
	};
	struct inode *inode = mapping->host;
	struct nfs_server *server = NFS_SERVER(inode);
	int ret;

	dprintk("NFS: nfs_readpages (%s/%Ld %d)\n",
			inode->i_sb->s_id,
			(long long)NFS_FILEID(inode),
			nr_pages);
	nfs_inc_stats(inode, NFSIOS_VFSREADPAGES);

	if (filp == NULL) {
		desc.ctx = nfs_find_open_context(inode, NULL, FMODE_READ);
		if (desc.ctx == NULL)
			return -EBADF;
	} else
		desc.ctx = get_nfs_open_context((struct nfs_open_context *)
				filp->private_data);
	ret = read_cache_pages(mapping, pages, readpage_async_filler, &desc);
	if (!list_empty(&head)) {
		int err = nfs_pagein_list(&head, server->rpages);
		if (!ret)
			nfs_add_stats(inode, NFSIOS_READPAGES, err);
		ret = err;
	}
	put_nfs_open_context(desc.ctx);
	return ret;
}

int __init nfs_init_readpagecache(void)
{
	nfs_rdata_cachep = kmem_cache_create("nfs_read_data",
					     sizeof(struct nfs_read_data),
					     0, SLAB_HWCACHE_ALIGN,
					     NULL, NULL);
	if (nfs_rdata_cachep == NULL)
		return -ENOMEM;

	nfs_rdata_mempool = mempool_create_slab_pool(MIN_POOL_READ,
						     nfs_rdata_cachep);
	if (nfs_rdata_mempool == NULL)
		return -ENOMEM;

	return 0;
}

void nfs_destroy_readpagecache(void)
{
	mempool_destroy(nfs_rdata_mempool);
	if (kmem_cache_destroy(nfs_rdata_cachep))
		printk(KERN_INFO "nfs_read_data: not all structures were freed\n");
}