1 /* handling of writes to regular files and writing back to the server 2 * 3 * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. 4 * Written by David Howells (dhowells@redhat.com) 5 * 6 * This program is free software; you can redistribute it and/or 7 * modify it under the terms of the GNU General Public License 8 * as published by the Free Software Foundation; either version 9 * 2 of the License, or (at your option) any later version. 10 */ 11 #include <linux/backing-dev.h> 12 #include <linux/slab.h> 13 #include <linux/fs.h> 14 #include <linux/pagemap.h> 15 #include <linux/writeback.h> 16 #include <linux/pagevec.h> 17 #include <linux/aio.h> 18 #include "internal.h" 19 20 static int afs_write_back_from_locked_page(struct afs_writeback *wb, 21 struct page *page); 22 23 /* 24 * mark a page as having been made dirty and thus needing writeback 25 */ 26 int afs_set_page_dirty(struct page *page) 27 { 28 _enter(""); 29 return __set_page_dirty_nobuffers(page); 30 } 31 32 /* 33 * unlink a writeback record because its usage has reached zero 34 * - must be called with the wb->vnode->writeback_lock held 35 */ 36 static void afs_unlink_writeback(struct afs_writeback *wb) 37 { 38 struct afs_writeback *front; 39 struct afs_vnode *vnode = wb->vnode; 40 41 list_del_init(&wb->link); 42 if (!list_empty(&vnode->writebacks)) { 43 /* if an fsync rises to the front of the queue then wake it 44 * up */ 45 front = list_entry(vnode->writebacks.next, 46 struct afs_writeback, link); 47 if (front->state == AFS_WBACK_SYNCING) { 48 _debug("wake up sync"); 49 front->state = AFS_WBACK_COMPLETE; 50 wake_up(&front->waitq); 51 } 52 } 53 } 54 55 /* 56 * free a writeback record 57 */ 58 static void afs_free_writeback(struct afs_writeback *wb) 59 { 60 _enter(""); 61 key_put(wb->key); 62 kfree(wb); 63 } 64 65 /* 66 * dispose of a reference to a writeback record 67 */ 68 void afs_put_writeback(struct afs_writeback *wb) 69 { 70 struct afs_vnode *vnode = wb->vnode; 71 72 _enter("{%d}", wb->usage); 73 74 spin_lock(&vnode->writeback_lock); 75 if (--wb->usage == 0) 76 afs_unlink_writeback(wb); 77 else 78 wb = NULL; 79 spin_unlock(&vnode->writeback_lock); 80 if (wb) 81 afs_free_writeback(wb); 82 } 83 84 /* 85 * partly or wholly fill a page that's under preparation for writing 86 */ 87 static int afs_fill_page(struct afs_vnode *vnode, struct key *key, 88 loff_t pos, struct page *page) 89 { 90 loff_t i_size; 91 int ret; 92 int len; 93 94 _enter(",,%llu", (unsigned long long)pos); 95 96 i_size = i_size_read(&vnode->vfs_inode); 97 if (pos + PAGE_CACHE_SIZE > i_size) 98 len = i_size - pos; 99 else 100 len = PAGE_CACHE_SIZE; 101 102 ret = afs_vnode_fetch_data(vnode, key, pos, len, page); 103 if (ret < 0) { 104 if (ret == -ENOENT) { 105 _debug("got NOENT from server" 106 " - marking file deleted and stale"); 107 set_bit(AFS_VNODE_DELETED, &vnode->flags); 108 ret = -ESTALE; 109 } 110 } 111 112 _leave(" = %d", ret); 113 return ret; 114 } 115 116 /* 117 * prepare to perform part of a write to a page 118 */ 119 int afs_write_begin(struct file *file, struct address_space *mapping, 120 loff_t pos, unsigned len, unsigned flags, 121 struct page **pagep, void **fsdata) 122 { 123 struct afs_writeback *candidate, *wb; 124 struct afs_vnode *vnode = AFS_FS_I(file_inode(file)); 125 struct page *page; 126 struct key *key = file->private_data; 127 unsigned from = pos & (PAGE_CACHE_SIZE - 1); 128 unsigned to = from + len; 129 pgoff_t index = pos >> PAGE_CACHE_SHIFT; 130 int ret; 131 132 _enter("{%x:%u},{%lx},%u,%u", 133 vnode->fid.vid, vnode->fid.vnode, index, from, to); 134 135 candidate = kzalloc(sizeof(*candidate), GFP_KERNEL); 136 if (!candidate) 137 return -ENOMEM; 138 candidate->vnode = vnode; 139 candidate->first = candidate->last = index; 140 candidate->offset_first = from; 141 candidate->to_last = to; 142 INIT_LIST_HEAD(&candidate->link); 143 candidate->usage = 1; 144 candidate->state = AFS_WBACK_PENDING; 145 init_waitqueue_head(&candidate->waitq); 146 147 page = grab_cache_page_write_begin(mapping, index, flags); 148 if (!page) { 149 kfree(candidate); 150 return -ENOMEM; 151 } 152 *pagep = page; 153 /* page won't leak in error case: it eventually gets cleaned off LRU */ 154 155 if (!PageUptodate(page) && len != PAGE_CACHE_SIZE) { 156 ret = afs_fill_page(vnode, key, index << PAGE_CACHE_SHIFT, page); 157 if (ret < 0) { 158 kfree(candidate); 159 _leave(" = %d [prep]", ret); 160 return ret; 161 } 162 SetPageUptodate(page); 163 } 164 165 try_again: 166 spin_lock(&vnode->writeback_lock); 167 168 /* see if this page is already pending a writeback under a suitable key 169 * - if so we can just join onto that one */ 170 wb = (struct afs_writeback *) page_private(page); 171 if (wb) { 172 if (wb->key == key && wb->state == AFS_WBACK_PENDING) 173 goto subsume_in_current_wb; 174 goto flush_conflicting_wb; 175 } 176 177 if (index > 0) { 178 /* see if we can find an already pending writeback that we can 179 * append this page to */ 180 list_for_each_entry(wb, &vnode->writebacks, link) { 181 if (wb->last == index - 1 && wb->key == key && 182 wb->state == AFS_WBACK_PENDING) 183 goto append_to_previous_wb; 184 } 185 } 186 187 list_add_tail(&candidate->link, &vnode->writebacks); 188 candidate->key = key_get(key); 189 spin_unlock(&vnode->writeback_lock); 190 SetPagePrivate(page); 191 set_page_private(page, (unsigned long) candidate); 192 _leave(" = 0 [new]"); 193 return 0; 194 195 subsume_in_current_wb: 196 _debug("subsume"); 197 ASSERTRANGE(wb->first, <=, index, <=, wb->last); 198 if (index == wb->first && from < wb->offset_first) 199 wb->offset_first = from; 200 if (index == wb->last && to > wb->to_last) 201 wb->to_last = to; 202 spin_unlock(&vnode->writeback_lock); 203 kfree(candidate); 204 _leave(" = 0 [sub]"); 205 return 0; 206 207 append_to_previous_wb: 208 _debug("append into %lx-%lx", wb->first, wb->last); 209 wb->usage++; 210 wb->last++; 211 wb->to_last = to; 212 spin_unlock(&vnode->writeback_lock); 213 SetPagePrivate(page); 214 set_page_private(page, (unsigned long) wb); 215 kfree(candidate); 216 _leave(" = 0 [app]"); 217 return 0; 218 219 /* the page is currently bound to another context, so if it's dirty we 220 * need to flush it before we can use the new context */ 221 flush_conflicting_wb: 222 _debug("flush conflict"); 223 if (wb->state == AFS_WBACK_PENDING) 224 wb->state = AFS_WBACK_CONFLICTING; 225 spin_unlock(&vnode->writeback_lock); 226 if (PageDirty(page)) { 227 ret = afs_write_back_from_locked_page(wb, page); 228 if (ret < 0) { 229 afs_put_writeback(candidate); 230 _leave(" = %d", ret); 231 return ret; 232 } 233 } 234 235 /* the page holds a ref on the writeback record */ 236 afs_put_writeback(wb); 237 set_page_private(page, 0); 238 ClearPagePrivate(page); 239 goto try_again; 240 } 241 242 /* 243 * finalise part of a write to a page 244 */ 245 int afs_write_end(struct file *file, struct address_space *mapping, 246 loff_t pos, unsigned len, unsigned copied, 247 struct page *page, void *fsdata) 248 { 249 struct afs_vnode *vnode = AFS_FS_I(file_inode(file)); 250 loff_t i_size, maybe_i_size; 251 252 _enter("{%x:%u},{%lx}", 253 vnode->fid.vid, vnode->fid.vnode, page->index); 254 255 maybe_i_size = pos + copied; 256 257 i_size = i_size_read(&vnode->vfs_inode); 258 if (maybe_i_size > i_size) { 259 spin_lock(&vnode->writeback_lock); 260 i_size = i_size_read(&vnode->vfs_inode); 261 if (maybe_i_size > i_size) 262 i_size_write(&vnode->vfs_inode, maybe_i_size); 263 spin_unlock(&vnode->writeback_lock); 264 } 265 266 set_page_dirty(page); 267 if (PageDirty(page)) 268 _debug("dirtied"); 269 unlock_page(page); 270 page_cache_release(page); 271 272 return copied; 273 } 274 275 /* 276 * kill all the pages in the given range 277 */ 278 static void afs_kill_pages(struct afs_vnode *vnode, bool error, 279 pgoff_t first, pgoff_t last) 280 { 281 struct pagevec pv; 282 unsigned count, loop; 283 284 _enter("{%x:%u},%lx-%lx", 285 vnode->fid.vid, vnode->fid.vnode, first, last); 286 287 pagevec_init(&pv, 0); 288 289 do { 290 _debug("kill %lx-%lx", first, last); 291 292 count = last - first + 1; 293 if (count > PAGEVEC_SIZE) 294 count = PAGEVEC_SIZE; 295 pv.nr = find_get_pages_contig(vnode->vfs_inode.i_mapping, 296 first, count, pv.pages); 297 ASSERTCMP(pv.nr, ==, count); 298 299 for (loop = 0; loop < count; loop++) { 300 ClearPageUptodate(pv.pages[loop]); 301 if (error) 302 SetPageError(pv.pages[loop]); 303 end_page_writeback(pv.pages[loop]); 304 } 305 306 __pagevec_release(&pv); 307 } while (first < last); 308 309 _leave(""); 310 } 311 312 /* 313 * synchronously write back the locked page and any subsequent non-locked dirty 314 * pages also covered by the same writeback record 315 */ 316 static int afs_write_back_from_locked_page(struct afs_writeback *wb, 317 struct page *primary_page) 318 { 319 struct page *pages[8], *page; 320 unsigned long count; 321 unsigned n, offset, to; 322 pgoff_t start, first, last; 323 int loop, ret; 324 325 _enter(",%lx", primary_page->index); 326 327 count = 1; 328 if (!clear_page_dirty_for_io(primary_page)) 329 BUG(); 330 if (test_set_page_writeback(primary_page)) 331 BUG(); 332 333 /* find all consecutive lockable dirty pages, stopping when we find a 334 * page that is not immediately lockable, is not dirty or is missing, 335 * or we reach the end of the range */ 336 start = primary_page->index; 337 if (start >= wb->last) 338 goto no_more; 339 start++; 340 do { 341 _debug("more %lx [%lx]", start, count); 342 n = wb->last - start + 1; 343 if (n > ARRAY_SIZE(pages)) 344 n = ARRAY_SIZE(pages); 345 n = find_get_pages_contig(wb->vnode->vfs_inode.i_mapping, 346 start, n, pages); 347 _debug("fgpc %u", n); 348 if (n == 0) 349 goto no_more; 350 if (pages[0]->index != start) { 351 do { 352 put_page(pages[--n]); 353 } while (n > 0); 354 goto no_more; 355 } 356 357 for (loop = 0; loop < n; loop++) { 358 page = pages[loop]; 359 if (page->index > wb->last) 360 break; 361 if (!trylock_page(page)) 362 break; 363 if (!PageDirty(page) || 364 page_private(page) != (unsigned long) wb) { 365 unlock_page(page); 366 break; 367 } 368 if (!clear_page_dirty_for_io(page)) 369 BUG(); 370 if (test_set_page_writeback(page)) 371 BUG(); 372 unlock_page(page); 373 put_page(page); 374 } 375 count += loop; 376 if (loop < n) { 377 for (; loop < n; loop++) 378 put_page(pages[loop]); 379 goto no_more; 380 } 381 382 start += loop; 383 } while (start <= wb->last && count < 65536); 384 385 no_more: 386 /* we now have a contiguous set of dirty pages, each with writeback set 387 * and the dirty mark cleared; the first page is locked and must remain 388 * so, all the rest are unlocked */ 389 first = primary_page->index; 390 last = first + count - 1; 391 392 offset = (first == wb->first) ? wb->offset_first : 0; 393 to = (last == wb->last) ? wb->to_last : PAGE_SIZE; 394 395 _debug("write back %lx[%u..] to %lx[..%u]", first, offset, last, to); 396 397 ret = afs_vnode_store_data(wb, first, last, offset, to); 398 if (ret < 0) { 399 switch (ret) { 400 case -EDQUOT: 401 case -ENOSPC: 402 set_bit(AS_ENOSPC, 403 &wb->vnode->vfs_inode.i_mapping->flags); 404 break; 405 case -EROFS: 406 case -EIO: 407 case -EREMOTEIO: 408 case -EFBIG: 409 case -ENOENT: 410 case -ENOMEDIUM: 411 case -ENXIO: 412 afs_kill_pages(wb->vnode, true, first, last); 413 set_bit(AS_EIO, &wb->vnode->vfs_inode.i_mapping->flags); 414 break; 415 case -EACCES: 416 case -EPERM: 417 case -ENOKEY: 418 case -EKEYEXPIRED: 419 case -EKEYREJECTED: 420 case -EKEYREVOKED: 421 afs_kill_pages(wb->vnode, false, first, last); 422 break; 423 default: 424 break; 425 } 426 } else { 427 ret = count; 428 } 429 430 _leave(" = %d", ret); 431 return ret; 432 } 433 434 /* 435 * write a page back to the server 436 * - the caller locked the page for us 437 */ 438 int afs_writepage(struct page *page, struct writeback_control *wbc) 439 { 440 struct afs_writeback *wb; 441 int ret; 442 443 _enter("{%lx},", page->index); 444 445 wb = (struct afs_writeback *) page_private(page); 446 ASSERT(wb != NULL); 447 448 ret = afs_write_back_from_locked_page(wb, page); 449 unlock_page(page); 450 if (ret < 0) { 451 _leave(" = %d", ret); 452 return 0; 453 } 454 455 wbc->nr_to_write -= ret; 456 457 _leave(" = 0"); 458 return 0; 459 } 460 461 /* 462 * write a region of pages back to the server 463 */ 464 static int afs_writepages_region(struct address_space *mapping, 465 struct writeback_control *wbc, 466 pgoff_t index, pgoff_t end, pgoff_t *_next) 467 { 468 struct afs_writeback *wb; 469 struct page *page; 470 int ret, n; 471 472 _enter(",,%lx,%lx,", index, end); 473 474 do { 475 n = find_get_pages_tag(mapping, &index, PAGECACHE_TAG_DIRTY, 476 1, &page); 477 if (!n) 478 break; 479 480 _debug("wback %lx", page->index); 481 482 if (page->index > end) { 483 *_next = index; 484 page_cache_release(page); 485 _leave(" = 0 [%lx]", *_next); 486 return 0; 487 } 488 489 /* at this point we hold neither mapping->tree_lock nor lock on 490 * the page itself: the page may be truncated or invalidated 491 * (changing page->mapping to NULL), or even swizzled back from 492 * swapper_space to tmpfs file mapping 493 */ 494 lock_page(page); 495 496 if (page->mapping != mapping) { 497 unlock_page(page); 498 page_cache_release(page); 499 continue; 500 } 501 502 if (wbc->sync_mode != WB_SYNC_NONE) 503 wait_on_page_writeback(page); 504 505 if (PageWriteback(page) || !PageDirty(page)) { 506 unlock_page(page); 507 continue; 508 } 509 510 wb = (struct afs_writeback *) page_private(page); 511 ASSERT(wb != NULL); 512 513 spin_lock(&wb->vnode->writeback_lock); 514 wb->state = AFS_WBACK_WRITING; 515 spin_unlock(&wb->vnode->writeback_lock); 516 517 ret = afs_write_back_from_locked_page(wb, page); 518 unlock_page(page); 519 page_cache_release(page); 520 if (ret < 0) { 521 _leave(" = %d", ret); 522 return ret; 523 } 524 525 wbc->nr_to_write -= ret; 526 527 cond_resched(); 528 } while (index < end && wbc->nr_to_write > 0); 529 530 *_next = index; 531 _leave(" = 0 [%lx]", *_next); 532 return 0; 533 } 534 535 /* 536 * write some of the pending data back to the server 537 */ 538 int afs_writepages(struct address_space *mapping, 539 struct writeback_control *wbc) 540 { 541 pgoff_t start, end, next; 542 int ret; 543 544 _enter(""); 545 546 if (wbc->range_cyclic) { 547 start = mapping->writeback_index; 548 end = -1; 549 ret = afs_writepages_region(mapping, wbc, start, end, &next); 550 if (start > 0 && wbc->nr_to_write > 0 && ret == 0) 551 ret = afs_writepages_region(mapping, wbc, 0, start, 552 &next); 553 mapping->writeback_index = next; 554 } else if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) { 555 end = (pgoff_t)(LLONG_MAX >> PAGE_CACHE_SHIFT); 556 ret = afs_writepages_region(mapping, wbc, 0, end, &next); 557 if (wbc->nr_to_write > 0) 558 mapping->writeback_index = next; 559 } else { 560 start = wbc->range_start >> PAGE_CACHE_SHIFT; 561 end = wbc->range_end >> PAGE_CACHE_SHIFT; 562 ret = afs_writepages_region(mapping, wbc, start, end, &next); 563 } 564 565 _leave(" = %d", ret); 566 return ret; 567 } 568 569 /* 570 * completion of write to server 571 */ 572 void afs_pages_written_back(struct afs_vnode *vnode, struct afs_call *call) 573 { 574 struct afs_writeback *wb = call->wb; 575 struct pagevec pv; 576 unsigned count, loop; 577 pgoff_t first = call->first, last = call->last; 578 bool free_wb; 579 580 _enter("{%x:%u},{%lx-%lx}", 581 vnode->fid.vid, vnode->fid.vnode, first, last); 582 583 ASSERT(wb != NULL); 584 585 pagevec_init(&pv, 0); 586 587 do { 588 _debug("done %lx-%lx", first, last); 589 590 count = last - first + 1; 591 if (count > PAGEVEC_SIZE) 592 count = PAGEVEC_SIZE; 593 pv.nr = find_get_pages_contig(call->mapping, first, count, 594 pv.pages); 595 ASSERTCMP(pv.nr, ==, count); 596 597 spin_lock(&vnode->writeback_lock); 598 for (loop = 0; loop < count; loop++) { 599 struct page *page = pv.pages[loop]; 600 end_page_writeback(page); 601 if (page_private(page) == (unsigned long) wb) { 602 set_page_private(page, 0); 603 ClearPagePrivate(page); 604 wb->usage--; 605 } 606 } 607 free_wb = false; 608 if (wb->usage == 0) { 609 afs_unlink_writeback(wb); 610 free_wb = true; 611 } 612 spin_unlock(&vnode->writeback_lock); 613 first += count; 614 if (free_wb) { 615 afs_free_writeback(wb); 616 wb = NULL; 617 } 618 619 __pagevec_release(&pv); 620 } while (first <= last); 621 622 _leave(""); 623 } 624 625 /* 626 * write to an AFS file 627 */ 628 ssize_t afs_file_write(struct kiocb *iocb, struct iov_iter *from) 629 { 630 struct afs_vnode *vnode = AFS_FS_I(file_inode(iocb->ki_filp)); 631 ssize_t result; 632 size_t count = iov_iter_count(from); 633 634 _enter("{%x.%u},{%zu},", 635 vnode->fid.vid, vnode->fid.vnode, count); 636 637 if (IS_SWAPFILE(&vnode->vfs_inode)) { 638 printk(KERN_INFO 639 "AFS: Attempt to write to active swap file!\n"); 640 return -EBUSY; 641 } 642 643 if (!count) 644 return 0; 645 646 result = generic_file_write_iter(iocb, from); 647 if (IS_ERR_VALUE(result)) { 648 _leave(" = %zd", result); 649 return result; 650 } 651 652 _leave(" = %zd", result); 653 return result; 654 } 655 656 /* 657 * flush the vnode to the fileserver 658 */ 659 int afs_writeback_all(struct afs_vnode *vnode) 660 { 661 struct address_space *mapping = vnode->vfs_inode.i_mapping; 662 struct writeback_control wbc = { 663 .sync_mode = WB_SYNC_ALL, 664 .nr_to_write = LONG_MAX, 665 .range_cyclic = 1, 666 }; 667 int ret; 668 669 _enter(""); 670 671 ret = mapping->a_ops->writepages(mapping, &wbc); 672 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); 673 674 _leave(" = %d", ret); 675 return ret; 676 } 677 678 /* 679 * flush any dirty pages for this process, and check for write errors. 680 * - the return status from this call provides a reliable indication of 681 * whether any write errors occurred for this process. 682 */ 683 int afs_fsync(struct file *file, loff_t start, loff_t end, int datasync) 684 { 685 struct inode *inode = file_inode(file); 686 struct afs_writeback *wb, *xwb; 687 struct afs_vnode *vnode = AFS_FS_I(inode); 688 int ret; 689 690 _enter("{%x:%u},{n=%pD},%d", 691 vnode->fid.vid, vnode->fid.vnode, file, 692 datasync); 693 694 ret = filemap_write_and_wait_range(inode->i_mapping, start, end); 695 if (ret) 696 return ret; 697 mutex_lock(&inode->i_mutex); 698 699 /* use a writeback record as a marker in the queue - when this reaches 700 * the front of the queue, all the outstanding writes are either 701 * completed or rejected */ 702 wb = kzalloc(sizeof(*wb), GFP_KERNEL); 703 if (!wb) { 704 ret = -ENOMEM; 705 goto out; 706 } 707 wb->vnode = vnode; 708 wb->first = 0; 709 wb->last = -1; 710 wb->offset_first = 0; 711 wb->to_last = PAGE_SIZE; 712 wb->usage = 1; 713 wb->state = AFS_WBACK_SYNCING; 714 init_waitqueue_head(&wb->waitq); 715 716 spin_lock(&vnode->writeback_lock); 717 list_for_each_entry(xwb, &vnode->writebacks, link) { 718 if (xwb->state == AFS_WBACK_PENDING) 719 xwb->state = AFS_WBACK_CONFLICTING; 720 } 721 list_add_tail(&wb->link, &vnode->writebacks); 722 spin_unlock(&vnode->writeback_lock); 723 724 /* push all the outstanding writebacks to the server */ 725 ret = afs_writeback_all(vnode); 726 if (ret < 0) { 727 afs_put_writeback(wb); 728 _leave(" = %d [wb]", ret); 729 goto out; 730 } 731 732 /* wait for the preceding writes to actually complete */ 733 ret = wait_event_interruptible(wb->waitq, 734 wb->state == AFS_WBACK_COMPLETE || 735 vnode->writebacks.next == &wb->link); 736 afs_put_writeback(wb); 737 _leave(" = %d", ret); 738 out: 739 mutex_unlock(&inode->i_mutex); 740 return ret; 741 } 742 743 /* 744 * notification that a previously read-only page is about to become writable 745 * - if it returns an error, the caller will deliver a bus error signal 746 */ 747 int afs_page_mkwrite(struct vm_area_struct *vma, struct page *page) 748 { 749 struct afs_vnode *vnode = AFS_FS_I(vma->vm_file->f_mapping->host); 750 751 _enter("{{%x:%u}},{%lx}", 752 vnode->fid.vid, vnode->fid.vnode, page->index); 753 754 /* wait for the page to be written to the cache before we allow it to 755 * be modified */ 756 #ifdef CONFIG_AFS_FSCACHE 757 fscache_wait_on_page_write(vnode->cache, page); 758 #endif 759 760 _leave(" = 0"); 761 return 0; 762 } 763