/*
 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
 * All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it would be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
 */
#include "xfs.h"
#include "xfs_log.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_trans.h"
#include "xfs_mount.h"
#include "xfs_bmap_btree.h"
#include "xfs_dinode.h"
#include "xfs_inode.h"
#include "xfs_inode_item.h"
#include "xfs_alloc.h"
#include "xfs_error.h"
#include "xfs_iomap.h"
#include "xfs_vnodeops.h"
#include "xfs_trace.h"
#include "xfs_bmap.h"
#include <linux/gfp.h>
#include <linux/mpage.h>
#include <linux/pagevec.h>
#include <linux/writeback.h>

void
xfs_count_page_state(
	struct page *page,
	int *delalloc,
	int *unwritten)
{
	struct buffer_head *bh, *head;

	*delalloc = *unwritten = 0;

	bh = head = page_buffers(page);
	do {
		if (buffer_unwritten(bh))
			(*unwritten) = 1;
		else if (buffer_delay(bh))
			(*delalloc) = 1;
	} while ((bh = bh->b_this_page) != head);
}

STATIC struct block_device *
xfs_find_bdev_for_inode(
	struct inode *inode)
{
	struct xfs_inode *ip = XFS_I(inode);
	struct xfs_mount *mp = ip->i_mount;

	if (XFS_IS_REALTIME_INODE(ip))
		return mp->m_rtdev_targp->bt_bdev;
	else
		return mp->m_ddev_targp->bt_bdev;
}

/*
 * We're now finished for good with this ioend structure.
 * Update the page state via the associated buffer_heads,
 * release holds on the inode and bio, and finally free
 * up memory. Do not use the ioend after this.
 */
STATIC void
xfs_destroy_ioend(
	xfs_ioend_t *ioend)
{
	struct buffer_head *bh, *next;

	for (bh = ioend->io_buffer_head; bh; bh = next) {
		next = bh->b_private;
		bh->b_end_io(bh, !ioend->io_error);
	}

	if (ioend->io_iocb) {
		if (ioend->io_isasync) {
			aio_complete(ioend->io_iocb, ioend->io_error ?
					ioend->io_error : ioend->io_result, 0);
		}
		inode_dio_done(ioend->io_inode);
	}

	mempool_free(ioend, xfs_ioend_pool);
}

/*
 * Fast and loose check if this write could update the on-disk inode size.
 */
static inline bool xfs_ioend_is_append(struct xfs_ioend *ioend)
{
	return ioend->io_offset + ioend->io_size >
		XFS_I(ioend->io_inode)->i_d.di_size;
}

STATIC int
xfs_setfilesize_trans_alloc(
	struct xfs_ioend *ioend)
{
	struct xfs_mount *mp = XFS_I(ioend->io_inode)->i_mount;
	struct xfs_trans *tp;
	int error;

	tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);

	error = xfs_trans_reserve(tp, 0, XFS_FSYNC_TS_LOG_RES(mp), 0, 0, 0);
	if (error) {
		xfs_trans_cancel(tp, 0);
		return error;
	}

	ioend->io_append_trans = tp;

	/*
	 * We will pass freeze protection with a transaction. So tell lockdep
	 * we released it.
	 */
	rwsem_release(&ioend->io_inode->i_sb->s_writers.lock_map[SB_FREEZE_FS-1],
		      1, _THIS_IP_);
	/*
	 * We hand off the transaction to the completion thread now, so
	 * clear the flag here.
	 */
	current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
	return 0;
}

/*
 * Update on-disk file size now that data has been written to disk.
 */
STATIC int
xfs_setfilesize(
	struct xfs_ioend *ioend)
{
	struct xfs_inode *ip = XFS_I(ioend->io_inode);
	struct xfs_trans *tp = ioend->io_append_trans;
	xfs_fsize_t isize;

	/*
	 * The transaction was allocated in the I/O submission thread,
	 * thus we need to mark ourselves as being in a transaction
	 * manually.
	 */
	current_set_flags_nested(&tp->t_pflags, PF_FSTRANS);

	xfs_ilock(ip, XFS_ILOCK_EXCL);
	isize = xfs_new_eof(ip, ioend->io_offset + ioend->io_size);
	if (!isize) {
		xfs_iunlock(ip, XFS_ILOCK_EXCL);
		xfs_trans_cancel(tp, 0);
		return 0;
	}

	trace_xfs_setfilesize(ip, ioend->io_offset, ioend->io_size);

	ip->i_d.di_size = isize;
	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);

	return xfs_trans_commit(tp, 0);
}

/*
 * Schedule IO completion handling on the final put of an ioend.
 *
 * If there is no work to do we might as well call it a day and free the
 * ioend right now.
 */
STATIC void
xfs_finish_ioend(
	struct xfs_ioend *ioend)
{
	if (atomic_dec_and_test(&ioend->io_remaining)) {
		struct xfs_mount *mp = XFS_I(ioend->io_inode)->i_mount;

		if (ioend->io_type == XFS_IO_UNWRITTEN)
			queue_work(mp->m_unwritten_workqueue, &ioend->io_work);
		else if (ioend->io_append_trans)
			queue_work(mp->m_data_workqueue, &ioend->io_work);
		else
			xfs_destroy_ioend(ioend);
	}
}

/*
 * IO write completion.
 */
STATIC void
xfs_end_io(
	struct work_struct *work)
{
	xfs_ioend_t *ioend = container_of(work, xfs_ioend_t, io_work);
	struct xfs_inode *ip = XFS_I(ioend->io_inode);
	int error = 0;

	if (ioend->io_append_trans) {
		/*
		 * We've got freeze protection passed with the transaction.
		 * Tell lockdep about it.
		 */
		rwsem_acquire_read(
			&ioend->io_inode->i_sb->s_writers.lock_map[SB_FREEZE_FS-1],
			0, 1, _THIS_IP_);
	}
	if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
		ioend->io_error = -EIO;
		goto done;
	}
	if (ioend->io_error)
		goto done;

	/*
	 * For unwritten extents we need to issue transactions to convert a
	 * range to normal written extents after the data I/O has finished.
	 */
	if (ioend->io_type == XFS_IO_UNWRITTEN) {
		/*
		 * For buffered I/O we never preallocate a transaction when
		 * doing the unwritten extent conversion, but for direct I/O
		 * we do not know if we are converting an unwritten extent
		 * or not at the point where we preallocate the transaction.
		 */
		if (ioend->io_append_trans) {
			ASSERT(ioend->io_isdirect);

			current_set_flags_nested(
				&ioend->io_append_trans->t_pflags, PF_FSTRANS);
			xfs_trans_cancel(ioend->io_append_trans, 0);
		}

		error = xfs_iomap_write_unwritten(ip, ioend->io_offset,
						  ioend->io_size);
		if (error) {
			ioend->io_error = -error;
			goto done;
		}
	} else if (ioend->io_append_trans) {
		error = xfs_setfilesize(ioend);
		if (error)
			ioend->io_error = -error;
	} else {
		ASSERT(!xfs_ioend_is_append(ioend));
	}

done:
	xfs_destroy_ioend(ioend);
}

/*
 * Call IO completion handling in caller context on the final put of an ioend.
 */
STATIC void
xfs_finish_ioend_sync(
	struct xfs_ioend *ioend)
{
	if (atomic_dec_and_test(&ioend->io_remaining))
		xfs_end_io(&ioend->io_work);
}

/*
 * Allocate and initialise an IO completion structure.
 * We need to track unwritten extent write completion here initially.
 * We'll need to extend this for updating the ondisk inode size later
 * (vs. incore size).
 */
STATIC xfs_ioend_t *
xfs_alloc_ioend(
	struct inode *inode,
	unsigned int type)
{
	xfs_ioend_t *ioend;

	ioend = mempool_alloc(xfs_ioend_pool, GFP_NOFS);

	/*
	 * Set the count to 1 initially, which will prevent an I/O
	 * completion callback from happening before we have started
	 * all the I/O from calling the completion routine too early.
	 */
	atomic_set(&ioend->io_remaining, 1);
	ioend->io_isasync = 0;
	ioend->io_isdirect = 0;
	ioend->io_error = 0;
	ioend->io_list = NULL;
	ioend->io_type = type;
	ioend->io_inode = inode;
	ioend->io_buffer_head = NULL;
	ioend->io_buffer_tail = NULL;
	ioend->io_offset = 0;
	ioend->io_size = 0;
	ioend->io_iocb = NULL;
	ioend->io_result = 0;
	ioend->io_append_trans = NULL;

	INIT_WORK(&ioend->io_work, xfs_end_io);
	return ioend;
}

STATIC int
xfs_map_blocks(
	struct inode *inode,
	loff_t offset,
	struct xfs_bmbt_irec *imap,
	int type,
	int nonblocking)
{
	struct xfs_inode *ip = XFS_I(inode);
	struct xfs_mount *mp = ip->i_mount;
	ssize_t count = 1 << inode->i_blkbits;
	xfs_fileoff_t offset_fsb, end_fsb;
	int error = 0;
	int bmapi_flags = XFS_BMAPI_ENTIRE;
	int nimaps = 1;

	if (XFS_FORCED_SHUTDOWN(mp))
		return -XFS_ERROR(EIO);

	if (type == XFS_IO_UNWRITTEN)
		bmapi_flags |= XFS_BMAPI_IGSTATE;

	if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) {
		if (nonblocking)
			return -XFS_ERROR(EAGAIN);
		xfs_ilock(ip, XFS_ILOCK_SHARED);
	}

	ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
	       (ip->i_df.if_flags & XFS_IFEXTENTS));
	ASSERT(offset <= mp->m_super->s_maxbytes);

	if (offset + count > mp->m_super->s_maxbytes)
		count = mp->m_super->s_maxbytes - offset;
	end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count);
	offset_fsb = XFS_B_TO_FSBT(mp, offset);
	error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb,
				imap, &nimaps, bmapi_flags);
	xfs_iunlock(ip, XFS_ILOCK_SHARED);

	if (error)
		return -XFS_ERROR(error);

	if (type == XFS_IO_DELALLOC &&
	    (!nimaps || isnullstartblock(imap->br_startblock))) {
		error = xfs_iomap_write_allocate(ip, offset, count, imap);
		if (!error)
			trace_xfs_map_blocks_alloc(ip, offset, count, type, imap);
		return -XFS_ERROR(error);
	}

#ifdef DEBUG
	if (type == XFS_IO_UNWRITTEN) {
		ASSERT(nimaps);
		ASSERT(imap->br_startblock != HOLESTARTBLOCK);
		ASSERT(imap->br_startblock != DELAYSTARTBLOCK);
	}
#endif
	if (nimaps)
		trace_xfs_map_blocks_found(ip, offset, count, type, imap);
	return 0;
}

STATIC int
xfs_imap_valid(
	struct inode *inode,
	struct xfs_bmbt_irec *imap,
	xfs_off_t offset)
{
	offset >>= inode->i_blkbits;

	return offset >= imap->br_startoff &&
		offset < imap->br_startoff + imap->br_blockcount;
}

/*
 * BIO completion handler for buffered IO.
 */
STATIC void
xfs_end_bio(
	struct bio *bio,
	int error)
{
	xfs_ioend_t *ioend = bio->bi_private;

	ASSERT(atomic_read(&bio->bi_cnt) >= 1);
	ioend->io_error = test_bit(BIO_UPTODATE, &bio->bi_flags) ? 0 : error;

	/* Toss bio and pass work off to an xfsdatad thread */
	bio->bi_private = NULL;
	bio->bi_end_io = NULL;
	bio_put(bio);

	xfs_finish_ioend(ioend);
}

STATIC void
xfs_submit_ioend_bio(
	struct writeback_control *wbc,
	xfs_ioend_t *ioend,
	struct bio *bio)
{
	atomic_inc(&ioend->io_remaining);
	bio->bi_private = ioend;
	bio->bi_end_io = xfs_end_bio;
	submit_bio(wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE, bio);
}

STATIC struct bio *
xfs_alloc_ioend_bio(
	struct buffer_head *bh)
{
	int nvecs = bio_get_nr_vecs(bh->b_bdev);
	struct bio *bio = bio_alloc(GFP_NOIO, nvecs);

	ASSERT(bio->bi_private == NULL);
	bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
	bio->bi_bdev = bh->b_bdev;
	return bio;
}

STATIC void
xfs_start_buffer_writeback(
	struct buffer_head *bh)
{
	ASSERT(buffer_mapped(bh));
	ASSERT(buffer_locked(bh));
	ASSERT(!buffer_delay(bh));
	ASSERT(!buffer_unwritten(bh));

	mark_buffer_async_write(bh);
	set_buffer_uptodate(bh);
	clear_buffer_dirty(bh);
}

STATIC void
xfs_start_page_writeback(
	struct page *page,
	int clear_dirty,
	int buffers)
{
	ASSERT(PageLocked(page));
	ASSERT(!PageWriteback(page));
	if (clear_dirty)
		clear_page_dirty_for_io(page);
	set_page_writeback(page);
	unlock_page(page);
	/* If no buffers on the page are to be written, finish it here */
	if (!buffers)
		end_page_writeback(page);
}

static inline int bio_add_buffer(struct bio *bio, struct buffer_head *bh)
{
	return bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh));
}

/*
 * Submit all of the bios for all of the ioends we have saved up, covering the
 * initial writepage page and also any probed pages.
 *
 * Because we may have multiple ioends spanning a page, we need to start
 * writeback on all the buffers before we submit them for I/O. If we mark the
 * buffers as we go, we can end up with a page that only has some of its
 * buffers marked async write, and I/O completion can occur before we mark the
 * remaining buffers async write.
 *
 * The end result of this is that we trip a bug in end_page_writeback() because
 * we call it twice for the one page as the code in end_buffer_async_write()
 * assumes that all buffers on the page are started at the same time.
 *
 * The fix is two passes across the ioend list - one to start writeback on the
 * buffer_heads, and then submit them for I/O on the second pass.
 */
STATIC void
xfs_submit_ioend(
	struct writeback_control *wbc,
	xfs_ioend_t *ioend)
{
	xfs_ioend_t *head = ioend;
	xfs_ioend_t *next;
	struct buffer_head *bh;
	struct bio *bio;
	sector_t lastblock = 0;

	/* Pass 1 - start writeback */
	do {
		next = ioend->io_list;
		for (bh = ioend->io_buffer_head; bh; bh = bh->b_private)
			xfs_start_buffer_writeback(bh);
	} while ((ioend = next) != NULL);

	/* Pass 2 - submit I/O */
	ioend = head;
	do {
		next = ioend->io_list;
		bio = NULL;

		for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) {

			if (!bio) {
 retry:
				bio = xfs_alloc_ioend_bio(bh);
			} else if (bh->b_blocknr != lastblock + 1) {
				xfs_submit_ioend_bio(wbc, ioend, bio);
				goto retry;
			}

			if (bio_add_buffer(bio, bh) != bh->b_size) {
				xfs_submit_ioend_bio(wbc, ioend, bio);
				goto retry;
			}

			lastblock = bh->b_blocknr;
		}
		if (bio)
			xfs_submit_ioend_bio(wbc, ioend, bio);
		xfs_finish_ioend(ioend);
	} while ((ioend = next) != NULL);
}

/*
 * Cancel submission of all buffer_heads so far in this ioend.
 * Toss the ioend too. Only ever called for the initial page
 * in a writepage request, so only ever one page.
 */
STATIC void
xfs_cancel_ioend(
	xfs_ioend_t *ioend)
{
	xfs_ioend_t *next;
	struct buffer_head *bh, *next_bh;

	do {
		next = ioend->io_list;
		bh = ioend->io_buffer_head;
		do {
			next_bh = bh->b_private;
			clear_buffer_async_write(bh);
			unlock_buffer(bh);
		} while ((bh = next_bh) != NULL);

		mempool_free(ioend, xfs_ioend_pool);
	} while ((ioend = next) != NULL);
}

/*
 * Test to see if we've been building up a completion structure for
 * earlier buffers -- if so, we try to append to this ioend if we
 * can, otherwise we finish off any current ioend and start another.
 * The new or existing ioend is passed back to the caller via *result.
 */
STATIC void
xfs_add_to_ioend(
	struct inode *inode,
	struct buffer_head *bh,
	xfs_off_t offset,
	unsigned int type,
	xfs_ioend_t **result,
	int need_ioend)
{
	xfs_ioend_t *ioend = *result;

	if (!ioend || need_ioend || type != ioend->io_type) {
		xfs_ioend_t *previous = *result;

		ioend = xfs_alloc_ioend(inode, type);
		ioend->io_offset = offset;
		ioend->io_buffer_head = bh;
		ioend->io_buffer_tail = bh;
		if (previous)
			previous->io_list = ioend;
		*result = ioend;
	} else {
		ioend->io_buffer_tail->b_private = bh;
		ioend->io_buffer_tail = bh;
	}

	bh->b_private = NULL;
	ioend->io_size += bh->b_size;
}

STATIC void
xfs_map_buffer(
	struct inode *inode,
	struct buffer_head *bh,
	struct xfs_bmbt_irec *imap,
	xfs_off_t offset)
{
	sector_t bn;
	struct xfs_mount *m = XFS_I(inode)->i_mount;
	xfs_off_t iomap_offset = XFS_FSB_TO_B(m, imap->br_startoff);
	xfs_daddr_t iomap_bn = xfs_fsb_to_db(XFS_I(inode), imap->br_startblock);

	ASSERT(imap->br_startblock != HOLESTARTBLOCK);
	ASSERT(imap->br_startblock != DELAYSTARTBLOCK);

	bn = (iomap_bn >> (inode->i_blkbits - BBSHIFT)) +
	      ((offset - iomap_offset) >> inode->i_blkbits);

	ASSERT(bn || XFS_IS_REALTIME_INODE(XFS_I(inode)));

	bh->b_blocknr = bn;
	set_buffer_mapped(bh);
}

STATIC void
xfs_map_at_offset(
	struct inode *inode,
	struct buffer_head *bh,
	struct xfs_bmbt_irec *imap,
	xfs_off_t offset)
{
	ASSERT(imap->br_startblock != HOLESTARTBLOCK);
	ASSERT(imap->br_startblock != DELAYSTARTBLOCK);

	xfs_map_buffer(inode, bh, imap, offset);
	set_buffer_mapped(bh);
	clear_buffer_delay(bh);
	clear_buffer_unwritten(bh);
}

/*
 * Test if a given page is suitable for writing as part of an unwritten
 * or delayed allocate extent.
 */
STATIC int
xfs_check_page_type(
	struct page *page,
	unsigned int type)
{
	if (PageWriteback(page))
		return 0;

	if (page->mapping && page_has_buffers(page)) {
		struct buffer_head *bh, *head;
		int acceptable = 0;

		bh = head = page_buffers(page);
		do {
			if (buffer_unwritten(bh))
				acceptable += (type == XFS_IO_UNWRITTEN);
			else if (buffer_delay(bh))
				acceptable += (type == XFS_IO_DELALLOC);
			else if (buffer_dirty(bh) && buffer_mapped(bh))
				acceptable += (type == XFS_IO_OVERWRITE);
			else
				break;
		} while ((bh = bh->b_this_page) != head);

		if (acceptable)
			return 1;
	}

	return 0;
}

/*
 * Allocate & map buffers for page given the extent map. Write it out.
 * Except for the original page of a writepage, this is called on
 * delalloc/unwritten pages only; for the original page it is possible
 * that the page has no mapping at all.
 */
STATIC int
xfs_convert_page(
	struct inode *inode,
	struct page *page,
	loff_t tindex,
	struct xfs_bmbt_irec *imap,
	xfs_ioend_t **ioendp,
	struct writeback_control *wbc)
{
	struct buffer_head *bh, *head;
	xfs_off_t end_offset;
	unsigned long p_offset;
	unsigned int type;
	int len, page_dirty;
	int count = 0, done = 0, uptodate = 1;
	xfs_off_t offset = page_offset(page);

	if (page->index != tindex)
		goto fail;
	if (!trylock_page(page))
		goto fail;
	if (PageWriteback(page))
		goto fail_unlock_page;
	if (page->mapping != inode->i_mapping)
		goto fail_unlock_page;
	if (!xfs_check_page_type(page, (*ioendp)->io_type))
		goto fail_unlock_page;

	/*
	 * page_dirty is initially a count of buffers on the page before
	 * EOF and is decremented as we move each into a cleanable state.
	 *
	 * Derivation:
	 *
	 * End offset is the highest offset that this page should represent.
	 * If we are on the last page, (end_offset & (PAGE_CACHE_SIZE - 1))
	 * will evaluate non-zero and be less than PAGE_CACHE_SIZE and
	 * hence give us the correct page_dirty count. On any other page,
	 * it will be zero and in that case we need page_dirty to be the
	 * count of buffers on the page.
	 */
	end_offset = min_t(unsigned long long,
			(xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT,
			i_size_read(inode));

	len = 1 << inode->i_blkbits;
	p_offset = min_t(unsigned long, end_offset & (PAGE_CACHE_SIZE - 1),
					PAGE_CACHE_SIZE);
	p_offset = p_offset ? roundup(p_offset, len) : PAGE_CACHE_SIZE;
	page_dirty = p_offset / len;

	bh = head = page_buffers(page);
	do {
		if (offset >= end_offset)
			break;
		if (!buffer_uptodate(bh))
			uptodate = 0;
		if (!(PageUptodate(page) || buffer_uptodate(bh))) {
			done = 1;
			continue;
		}

		if (buffer_unwritten(bh) || buffer_delay(bh) ||
		    buffer_mapped(bh)) {
			if (buffer_unwritten(bh))
				type = XFS_IO_UNWRITTEN;
			else if (buffer_delay(bh))
				type = XFS_IO_DELALLOC;
			else
				type = XFS_IO_OVERWRITE;

			if (!xfs_imap_valid(inode, imap, offset)) {
				done = 1;
				continue;
			}

			lock_buffer(bh);
			if (type != XFS_IO_OVERWRITE)
				xfs_map_at_offset(inode, bh, imap, offset);
			xfs_add_to_ioend(inode, bh, offset, type,
					 ioendp, done);

			page_dirty--;
			count++;
		} else {
			done = 1;
		}
	} while (offset += len, (bh = bh->b_this_page) != head);

	if (uptodate && bh == head)
		SetPageUptodate(page);

	if (count) {
		if (--wbc->nr_to_write <= 0 &&
		    wbc->sync_mode == WB_SYNC_NONE)
			done = 1;
	}
	xfs_start_page_writeback(page, !page_dirty, count);

	return done;
 fail_unlock_page:
	unlock_page(page);
 fail:
	return 1;
}

/*
 * Convert & write out a cluster of pages in the same extent as defined
 * by imap and following the start page.
 */
STATIC void
xfs_cluster_write(
	struct inode *inode,
	pgoff_t tindex,
	struct xfs_bmbt_irec *imap,
	xfs_ioend_t **ioendp,
	struct writeback_control *wbc,
	pgoff_t tlast)
{
	struct pagevec pvec;
	int done = 0, i;

	pagevec_init(&pvec, 0);
	while (!done && tindex <= tlast) {
		unsigned len = min_t(pgoff_t, PAGEVEC_SIZE, tlast - tindex + 1);

		if (!pagevec_lookup(&pvec, inode->i_mapping, tindex, len))
			break;

		for (i = 0; i < pagevec_count(&pvec); i++) {
			done = xfs_convert_page(inode, pvec.pages[i], tindex++,
					imap, ioendp, wbc);
			if (done)
				break;
		}

		pagevec_release(&pvec);
		cond_resched();
	}
}

STATIC void
xfs_vm_invalidatepage(
	struct page *page,
	unsigned long offset)
{
	trace_xfs_invalidatepage(page->mapping->host, page, offset);
	block_invalidatepage(page, offset);
}

/*
 * If the page has delalloc buffers on it, we need to punch them out before we
 * invalidate the page. If we don't, we leave a stale delalloc mapping on the
 * inode that can trip a BUG() in xfs_get_blocks() later on if a direct IO read
 * is done on that same region - the delalloc extent is returned when none is
 * supposed to be there.
 *
 * We prevent this by truncating away the delalloc regions on the page before
 * invalidating it. Because they are delalloc, we can do this without needing a
 * transaction. Indeed - if we get ENOSPC errors, we have to be able to do this
 * truncation without a transaction as there is no space left for block
 * reservation (typically why we see an ENOSPC in writeback).
 *
 * This is not a performance critical path, so for now just do the punching a
 * buffer head at a time.
 */
STATIC void
xfs_aops_discard_page(
	struct page *page)
{
	struct inode *inode = page->mapping->host;
	struct xfs_inode *ip = XFS_I(inode);
	struct buffer_head *bh, *head;
	loff_t offset = page_offset(page);

	if (!xfs_check_page_type(page, XFS_IO_DELALLOC))
		goto out_invalidate;

	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
		goto out_invalidate;

	xfs_alert(ip->i_mount,
		"page discard on page %p, inode 0x%llx, offset %llu.",
			page, ip->i_ino, offset);

	xfs_ilock(ip, XFS_ILOCK_EXCL);
	bh = head = page_buffers(page);
	do {
		int error;
		xfs_fileoff_t start_fsb;

		if (!buffer_delay(bh))
			goto next_buffer;

		start_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
		error = xfs_bmap_punch_delalloc_range(ip, start_fsb, 1);
		if (error) {
			/* something screwed, just bail */
			if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
				xfs_alert(ip->i_mount,
			"page discard unable to remove delalloc mapping.");
			}
			break;
		}
next_buffer:
		offset += 1 << inode->i_blkbits;

	} while ((bh = bh->b_this_page) != head);

	xfs_iunlock(ip, XFS_ILOCK_EXCL);
out_invalidate:
	xfs_vm_invalidatepage(page, 0);
	return;
}

/*
 * Write out a dirty page.
 *
 * For delalloc space on the page we need to allocate space and flush it.
 * For unwritten space on the page we need to start the conversion to
 * regular allocated space.
 * For any other dirty buffer heads on the page we should flush them.
 */
STATIC int
xfs_vm_writepage(
	struct page *page,
	struct writeback_control *wbc)
{
	struct inode *inode = page->mapping->host;
	struct buffer_head *bh, *head;
	struct xfs_bmbt_irec imap;
	xfs_ioend_t *ioend = NULL, *iohead = NULL;
	loff_t offset;
	unsigned int type;
	__uint64_t end_offset;
	pgoff_t end_index, last_index;
	ssize_t len;
	int err, imap_valid = 0, uptodate = 1;
	int count = 0;
	int nonblocking = 0;

	trace_xfs_writepage(inode, page, 0);

	ASSERT(page_has_buffers(page));

	/*
	 * Refuse to write the page out if we are called from reclaim context.
	 *
	 * This avoids stack overflows when called from deeply used stacks in
	 * random callers for direct reclaim or memcg reclaim. We explicitly
	 * allow reclaim from kswapd as the stack usage there is relatively low.
	 *
	 * This should never happen except in the case of a VM regression so
	 * warn about it.
	 */
	if (WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD)) ==
			PF_MEMALLOC))
		goto redirty;

	/*
	 * Given that we do not allow direct reclaim to call us, we should
	 * never be called while in a filesystem transaction.
	 */
	if (WARN_ON(current->flags & PF_FSTRANS))
		goto redirty;

	/* Is this page beyond the end of the file? */
	offset = i_size_read(inode);
	end_index = offset >> PAGE_CACHE_SHIFT;
	last_index = (offset - 1) >> PAGE_CACHE_SHIFT;
	if (page->index >= end_index) {
		unsigned offset_into_page = offset & (PAGE_CACHE_SIZE - 1);

		/*
		 * Just skip the page if it is fully outside i_size, e.g. due
		 * to a truncate operation that is in progress.
		 */
		if (page->index >= end_index + 1 || offset_into_page == 0) {
			unlock_page(page);
			return 0;
		}

		/*
		 * The page straddles i_size. It must be zeroed out on each
		 * and every writepage invocation because it may be mmapped.
		 * "A file is mapped in multiples of the page size. For a file
		 * that is not a multiple of the page size, the remaining
		 * memory is zeroed when mapped, and writes to that region are
		 * not written out to the file."
		 */
		zero_user_segment(page, offset_into_page, PAGE_CACHE_SIZE);
	}

	end_offset = min_t(unsigned long long,
			(xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT,
			offset);
	len = 1 << inode->i_blkbits;

	bh = head = page_buffers(page);
	offset = page_offset(page);
	type = XFS_IO_OVERWRITE;

	if (wbc->sync_mode == WB_SYNC_NONE)
		nonblocking = 1;

	do {
		int new_ioend = 0;

		if (offset >= end_offset)
			break;
		if (!buffer_uptodate(bh))
			uptodate = 0;

		/*
		 * set_page_dirty dirties all buffers in a page, independent
		 * of their state. The dirty state however is entirely
		 * meaningless for holes (!mapped && uptodate), so skip
		 * buffers covering holes here.
		 */
		if (!buffer_mapped(bh) && buffer_uptodate(bh)) {
			imap_valid = 0;
			continue;
		}

		if (buffer_unwritten(bh)) {
			if (type != XFS_IO_UNWRITTEN) {
				type = XFS_IO_UNWRITTEN;
				imap_valid = 0;
			}
		} else if (buffer_delay(bh)) {
			if (type != XFS_IO_DELALLOC) {
				type = XFS_IO_DELALLOC;
				imap_valid = 0;
			}
		} else if (buffer_uptodate(bh)) {
			if (type != XFS_IO_OVERWRITE) {
				type = XFS_IO_OVERWRITE;
				imap_valid = 0;
			}
		} else {
			if (PageUptodate(page))
				ASSERT(buffer_mapped(bh));
			/*
			 * This buffer is not uptodate and will not be
			 * written to disk. Ensure that we will put any
			 * subsequent writeable buffers into a new
			 * ioend.
			 */
			imap_valid = 0;
			continue;
		}

		if (imap_valid)
			imap_valid = xfs_imap_valid(inode, &imap, offset);
		if (!imap_valid) {
			/*
			 * If we didn't have a valid mapping then we need to
			 * put the new mapping into a separate ioend structure.
			 * This ensures non-contiguous extents always have
			 * separate ioends, which is particularly important
			 * for unwritten extent conversion at I/O completion
			 * time.
			 */
			new_ioend = 1;
			err = xfs_map_blocks(inode, offset, &imap, type,
					     nonblocking);
			if (err)
				goto error;
			imap_valid = xfs_imap_valid(inode, &imap, offset);
		}
		if (imap_valid) {
			lock_buffer(bh);
			if (type != XFS_IO_OVERWRITE)
				xfs_map_at_offset(inode, bh, &imap, offset);
			xfs_add_to_ioend(inode, bh, offset, type, &ioend,
					 new_ioend);
			count++;
		}

		if (!iohead)
			iohead = ioend;

	} while (offset += len, ((bh = bh->b_this_page) != head));

	if (uptodate && bh == head)
		SetPageUptodate(page);

	xfs_start_page_writeback(page, 1, count);

	if (ioend && imap_valid) {
		xfs_off_t end_index;

		end_index = imap.br_startoff + imap.br_blockcount;

		/* to bytes */
		end_index <<= inode->i_blkbits;

		/* to pages */
		end_index = (end_index - 1) >> PAGE_CACHE_SHIFT;

		/* check against file size */
		if (end_index > last_index)
			end_index = last_index;

		xfs_cluster_write(inode, page->index + 1, &imap, &ioend,
				  wbc, end_index);
	}

	if (iohead) {
		/*
		 * Reserve log space if we might write beyond the on-disk
		 * inode size.
		 */
		if (ioend->io_type != XFS_IO_UNWRITTEN &&
		    xfs_ioend_is_append(ioend)) {
			err = xfs_setfilesize_trans_alloc(ioend);
			if (err)
				goto error;
		}

		xfs_submit_ioend(wbc, iohead);
	}

	return 0;

error:
	if (iohead)
		xfs_cancel_ioend(iohead);

	if (err == -EAGAIN)
		goto redirty;

	xfs_aops_discard_page(page);
	ClearPageUptodate(page);
	unlock_page(page);
	return err;

redirty:
	redirty_page_for_writepage(wbc, page);
	unlock_page(page);
	return 0;
}

STATIC int
xfs_vm_writepages(
	struct address_space *mapping,
	struct writeback_control *wbc)
{
	xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED);
	return generic_writepages(mapping, wbc);
}

/*
 * Called to move a page into cleanable state - and from there
 * to be released. The page should already be clean. We always
 * have buffer heads in this call.
 *
 * Returns 1 if the page is ok to release, 0 otherwise.
 */
STATIC int
xfs_vm_releasepage(
	struct page *page,
	gfp_t gfp_mask)
{
	int delalloc, unwritten;

	trace_xfs_releasepage(page->mapping->host, page, 0);

	xfs_count_page_state(page, &delalloc, &unwritten);

	if (WARN_ON(delalloc))
		return 0;
	if (WARN_ON(unwritten))
		return 0;

	return try_to_free_buffers(page);
}

STATIC int
__xfs_get_blocks(
	struct inode *inode,
	sector_t iblock,
	struct buffer_head *bh_result,
	int create,
	int direct)
{
	struct xfs_inode *ip = XFS_I(inode);
	struct xfs_mount *mp = ip->i_mount;
	xfs_fileoff_t offset_fsb, end_fsb;
	int error = 0;
	int lockmode = 0;
	struct xfs_bmbt_irec imap;
	int nimaps = 1;
	xfs_off_t offset;
	ssize_t size;
	int new = 0;

	if (XFS_FORCED_SHUTDOWN(mp))
		return -XFS_ERROR(EIO);

	offset = (xfs_off_t)iblock << inode->i_blkbits;
	ASSERT(bh_result->b_size >= (1 << inode->i_blkbits));
	size = bh_result->b_size;

	if (!create && direct && offset >= i_size_read(inode))
		return 0;

	/*
	 * Direct I/O is usually done on preallocated files, so try getting
	 * a block mapping without an exclusive lock first. For buffered
	 * writes we already have the exclusive iolock anyway, so avoiding
	 * a lock roundtrip here by taking the ilock exclusive from the
	 * beginning is a useful micro optimization.
	 */
	if (create && !direct) {
		lockmode = XFS_ILOCK_EXCL;
		xfs_ilock(ip, lockmode);
	} else {
		lockmode = xfs_ilock_map_shared(ip);
	}

	ASSERT(offset <= mp->m_super->s_maxbytes);
	if (offset + size > mp->m_super->s_maxbytes)
		size = mp->m_super->s_maxbytes - offset;
	end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + size);
	offset_fsb = XFS_B_TO_FSBT(mp, offset);

	error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb,
				&imap, &nimaps, XFS_BMAPI_ENTIRE);
	if (error)
		goto out_unlock;

	if (create &&
	    (!nimaps ||
	     (imap.br_startblock == HOLESTARTBLOCK ||
	      imap.br_startblock == DELAYSTARTBLOCK))) {
		if (direct || xfs_get_extsz_hint(ip)) {
			/*
			 * Drop the ilock in preparation for starting the block
			 * allocation transaction. It will be retaken
			 * exclusively inside xfs_iomap_write_direct for the
			 * actual allocation.
			 */
			xfs_iunlock(ip, lockmode);
			error = xfs_iomap_write_direct(ip, offset, size,
						       &imap, nimaps);
			if (error)
				return -error;
			new = 1;
		} else {
			/*
			 * Delalloc reservations do not require a transaction,
			 * we can go on without dropping the lock here. If we
			 * are allocating a new delalloc block, make sure that
			 * we set the new flag so the buffer is marked new and
			 * we know it is newly allocated if the write fails.
			 */
			if (nimaps && imap.br_startblock == HOLESTARTBLOCK)
				new = 1;
			error = xfs_iomap_write_delay(ip, offset, size, &imap);
			if (error)
				goto out_unlock;

			xfs_iunlock(ip, lockmode);
		}

		trace_xfs_get_blocks_alloc(ip, offset, size, 0, &imap);
	} else if (nimaps) {
		trace_xfs_get_blocks_found(ip, offset, size, 0, &imap);
		xfs_iunlock(ip, lockmode);
	} else {
		trace_xfs_get_blocks_notfound(ip, offset, size);
		goto out_unlock;
	}

	if (imap.br_startblock != HOLESTARTBLOCK &&
	    imap.br_startblock != DELAYSTARTBLOCK) {
		/*
		 * For unwritten extents do not report a disk address on
		 * the read case (treat as if we're reading into a hole).
		 */
		if (create || !ISUNWRITTEN(&imap))
			xfs_map_buffer(inode, bh_result, &imap, offset);
		if (create && ISUNWRITTEN(&imap)) {
			if (direct)
				bh_result->b_private = inode;
			set_buffer_unwritten(bh_result);
		}
	}

	/*
	 * If this is a realtime file, data may be on a different device
	 * to that pointed to from the buffer_head b_bdev currently.
	 */
	bh_result->b_bdev = xfs_find_bdev_for_inode(inode);

	/*
	 * If we previously allocated a block out beyond eof and we are now
	 * coming back to use it then we will need to flag it as new even if it
	 * has a disk address.
	 *
	 * With sub-block writes into unwritten extents we also need to mark
	 * the buffer as new so that the unwritten parts of the buffer get
	 * correctly zeroed.
	 */
	if (create &&
	    ((!buffer_mapped(bh_result) && !buffer_uptodate(bh_result)) ||
	     (offset >= i_size_read(inode)) ||
	     (new || ISUNWRITTEN(&imap))))
		set_buffer_new(bh_result);

	if (imap.br_startblock == DELAYSTARTBLOCK) {
		BUG_ON(direct);
		if (create) {
			set_buffer_uptodate(bh_result);
			set_buffer_mapped(bh_result);
			set_buffer_delay(bh_result);
		}
	}

	/*
	 * If this is O_DIRECT or the mpage code calling, tell them how large
	 * the mapping is so that we can avoid repeated get_blocks calls.
	 */
	if (direct || size > (1 << inode->i_blkbits)) {
		xfs_off_t mapping_size;

		mapping_size = imap.br_startoff + imap.br_blockcount - iblock;
		mapping_size <<= inode->i_blkbits;

		ASSERT(mapping_size > 0);
		if (mapping_size > size)
			mapping_size = size;
		if (mapping_size > LONG_MAX)
			mapping_size = LONG_MAX;

		bh_result->b_size = mapping_size;
	}

	return 0;

out_unlock:
	xfs_iunlock(ip, lockmode);
	return -error;
}

int
xfs_get_blocks(
	struct inode *inode,
	sector_t iblock,
	struct buffer_head *bh_result,
	int create)
{
	return __xfs_get_blocks(inode, iblock, bh_result, create, 0);
}

STATIC int
xfs_get_blocks_direct(
	struct inode *inode,
	sector_t iblock,
	struct buffer_head *bh_result,
	int create)
{
	return __xfs_get_blocks(inode, iblock, bh_result, create, 1);
}

/*
 * Complete a direct I/O write request.
 *
 * If the private argument is non-NULL __xfs_get_blocks signals us that we
 * need to issue a transaction to convert the range from unwritten to written
 * extents. In case this is regular synchronous I/O we just call xfs_end_io
 * to do this and we are done. But in case this was a successful AIO
 * request this handler is called from interrupt context, from which we
 * can't start transactions. In that case offload the I/O completion to
 * the workqueues we also use for buffered I/O completion.
 */
STATIC void
xfs_end_io_direct_write(
	struct kiocb *iocb,
	loff_t offset,
	ssize_t size,
	void *private,
	int ret,
	bool is_async)
{
	struct xfs_ioend *ioend = iocb->private;

	/*
	 * While the generic direct I/O code updates the inode size, it does
	 * so only after the end_io handler is called, which means our
	 * end_io handler thinks the on-disk size is outside the in-core
	 * size. To prevent this just update it a little bit earlier here.
	 */
	if (offset + size > i_size_read(ioend->io_inode))
		i_size_write(ioend->io_inode, offset + size);

	/*
	 * blockdev_direct_IO can return an error even after the I/O
	 * completion handler was called. Thus we need to protect
	 * against double-freeing.
	 */
	iocb->private = NULL;

	ioend->io_offset = offset;
	ioend->io_size = size;
	ioend->io_iocb = iocb;
	ioend->io_result = ret;
	if (private && size > 0)
		ioend->io_type = XFS_IO_UNWRITTEN;

	if (is_async) {
		ioend->io_isasync = 1;
		xfs_finish_ioend(ioend);
	} else {
		xfs_finish_ioend_sync(ioend);
	}
}

STATIC ssize_t
xfs_vm_direct_IO(
	int rw,
	struct kiocb *iocb,
	const struct iovec *iov,
	loff_t offset,
	unsigned long nr_segs)
{
	struct inode *inode = iocb->ki_filp->f_mapping->host;
	struct block_device *bdev = xfs_find_bdev_for_inode(inode);
	struct xfs_ioend *ioend = NULL;
	ssize_t ret;

	if (rw & WRITE) {
		size_t size = iov_length(iov, nr_segs);

		/*
		 * We need to preallocate a transaction for a size update
		 * here. In the case that this write both updates the size
		 * and converts at least one unwritten extent we will cancel
		 * the still clean transaction after the I/O has finished.
		 */
		iocb->private = ioend = xfs_alloc_ioend(inode, XFS_IO_DIRECT);
		if (offset + size > XFS_I(inode)->i_d.di_size) {
			ret = xfs_setfilesize_trans_alloc(ioend);
			if (ret)
				goto out_destroy_ioend;
			ioend->io_isdirect = 1;
		}

		ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov,
					    offset, nr_segs,
					    xfs_get_blocks_direct,
					    xfs_end_io_direct_write, NULL, 0);
		if (ret != -EIOCBQUEUED && iocb->private)
			goto out_trans_cancel;
	} else {
		ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov,
					    offset, nr_segs,
					    xfs_get_blocks_direct,
					    NULL, NULL, 0);
	}

	return ret;

out_trans_cancel:
	if (ioend->io_append_trans) {
		current_set_flags_nested(&ioend->io_append_trans->t_pflags,
					 PF_FSTRANS);
		rwsem_acquire_read(
			&inode->i_sb->s_writers.lock_map[SB_FREEZE_FS-1],
			0, 1, _THIS_IP_);
		xfs_trans_cancel(ioend->io_append_trans, 0);
	}
out_destroy_ioend:
	xfs_destroy_ioend(ioend);
	return ret;
}

/*
 * Punch out the delalloc blocks we have already allocated.
 *
 * Don't bother with xfs_setattr given that nothing can have made it to disk yet
 * as the page is still locked at this point.
 */
STATIC void
xfs_vm_kill_delalloc_range(
	struct inode *inode,
	loff_t start,
	loff_t end)
{
	struct xfs_inode *ip = XFS_I(inode);
	xfs_fileoff_t start_fsb;
	xfs_fileoff_t end_fsb;
	int error;

	start_fsb = XFS_B_TO_FSB(ip->i_mount, start);
	end_fsb = XFS_B_TO_FSB(ip->i_mount, end);
	if (end_fsb <= start_fsb)
		return;

	xfs_ilock(ip, XFS_ILOCK_EXCL);
	error = xfs_bmap_punch_delalloc_range(ip, start_fsb,
						end_fsb - start_fsb);
	if (error) {
		/* something screwed, just bail */
		if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
			xfs_alert(ip->i_mount,
		"xfs_vm_write_failed: unable to clean up ino %lld",
					ip->i_ino);
		}
	}
	xfs_iunlock(ip, XFS_ILOCK_EXCL);
}

STATIC void
xfs_vm_write_failed(
	struct inode *inode,
	struct page *page,
	loff_t pos,
	unsigned len)
{
	loff_t block_offset = pos & PAGE_MASK;
	loff_t block_start;
	loff_t block_end;
	loff_t from = pos & (PAGE_CACHE_SIZE - 1);
	loff_t to = from + len;
	struct buffer_head *bh, *head;

	ASSERT(block_offset + from == pos);

	head = page_buffers(page);
	block_start = 0;
	for (bh = head; bh != head || !block_start;
	     bh = bh->b_this_page, block_start = block_end,
				   block_offset += bh->b_size) {
		block_end = block_start + bh->b_size;

		/* skip buffers before the write */
		if (block_end <= from)
			continue;

		/* if the buffer is after the write, we're done */
		if (block_start >= to)
			break;

		if (!buffer_delay(bh))
			continue;

		if (!buffer_new(bh) && block_offset < i_size_read(inode))
			continue;

		xfs_vm_kill_delalloc_range(inode, block_offset,
					   block_offset + bh->b_size);
	}

}

/*
 * This used to call block_write_begin(), but it unlocks and releases the page
 * on error, and we need that page to be able to punch stale delalloc blocks out
 * on failure. Hence we copy-n-waste it here and call xfs_vm_write_failed() at
 * the appropriate point.
 */
STATIC int
xfs_vm_write_begin(
	struct file *file,
	struct address_space *mapping,
	loff_t pos,
	unsigned len,
	unsigned flags,
	struct page **pagep,
	void **fsdata)
{
	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
	struct page *page;
	int status;

	ASSERT(len <= PAGE_CACHE_SIZE);

	page = grab_cache_page_write_begin(mapping, index,
					   flags | AOP_FLAG_NOFS);
	if (!page)
		return -ENOMEM;

	status = __block_write_begin(page, pos, len, xfs_get_blocks);
	if (unlikely(status)) {
		struct inode *inode = mapping->host;

		xfs_vm_write_failed(inode, page, pos, len);
		unlock_page(page);

		if (pos + len > i_size_read(inode))
			truncate_pagecache(inode, pos + len, i_size_read(inode));

		page_cache_release(page);
		page = NULL;
	}

	*pagep = page;
	return status;
}

/*
 * On failure, we only need to kill delalloc blocks beyond EOF because they
 * will never be written. For blocks within EOF, generic_write_end() zeros them
 * so they are safe to leave alone and be written with all the other valid data.
 */
STATIC int
xfs_vm_write_end(
	struct file *file,
	struct address_space *mapping,
	loff_t pos,
	unsigned len,
	unsigned copied,
	struct page *page,
	void *fsdata)
{
	int ret;

	ASSERT(len <= PAGE_CACHE_SIZE);

	ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata);
	if (unlikely(ret < len)) {
		struct inode *inode = mapping->host;
		size_t isize = i_size_read(inode);
		loff_t to = pos + len;

		if (to > isize) {
			truncate_pagecache(inode, to, isize);
			xfs_vm_kill_delalloc_range(inode, isize, to);
		}
	}
	return ret;
}

STATIC sector_t
xfs_vm_bmap(
	struct address_space *mapping,
	sector_t block)
{
	struct inode *inode = (struct inode *)mapping->host;
	struct xfs_inode *ip = XFS_I(inode);

	trace_xfs_vm_bmap(XFS_I(inode));
	xfs_ilock(ip, XFS_IOLOCK_SHARED);
	xfs_flush_pages(ip, (xfs_off_t)0, -1, 0, FI_REMAPF);
	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
	return generic_block_bmap(mapping, block, xfs_get_blocks);
}

STATIC int
xfs_vm_readpage(
	struct file *unused,
	struct page *page)
{
	return mpage_readpage(page, xfs_get_blocks);
}

STATIC int
xfs_vm_readpages(
	struct file *unused,
	struct address_space *mapping,
	struct list_head *pages,
	unsigned nr_pages)
{
	return mpage_readpages(mapping, pages, nr_pages, xfs_get_blocks);
}

const struct address_space_operations xfs_address_space_operations = {
	.readpage		= xfs_vm_readpage,
	.readpages		= xfs_vm_readpages,
	.writepage		= xfs_vm_writepage,
	.writepages		= xfs_vm_writepages,
	.releasepage		= xfs_vm_releasepage,
	.invalidatepage		= xfs_vm_invalidatepage,
	.write_begin		= xfs_vm_write_begin,
	.write_end		= xfs_vm_write_end,
	.bmap			= xfs_vm_bmap,
	.direct_IO		= xfs_vm_direct_IO,
	.migratepage		= buffer_migrate_page,
	.is_partially_uptodate	= block_is_partially_uptodate,
	.error_remove_page	= generic_error_remove_page,
};