/*
 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
 * All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it would be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
 */
#include "xfs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
#include "xfs_inode_item.h"
#include "xfs_alloc.h"
#include "xfs_error.h"
#include "xfs_iomap.h"
#include "xfs_trace.h"
#include "xfs_bmap.h"
#include "xfs_bmap_util.h"
#include "xfs_bmap_btree.h"
#include <linux/gfp.h>
#include <linux/mpage.h>
#include <linux/pagevec.h>
#include <linux/writeback.h>

/* Report whether the page has any delalloc or unwritten buffers. */
void
xfs_count_page_state(
	struct page		*page,
	int			*delalloc,
	int			*unwritten)
{
	struct buffer_head	*bh, *head;

	*delalloc = *unwritten = 0;

	bh = head = page_buffers(page);
	do {
		if (buffer_unwritten(bh))
			(*unwritten) = 1;
		else if (buffer_delay(bh))
			(*delalloc) = 1;
	} while ((bh = bh->b_this_page) != head);
}

/* Return the block device backing this inode's data (realtime or data device). */
STATIC struct block_device *
xfs_find_bdev_for_inode(
	struct inode		*inode)
{
	struct xfs_inode	*ip = XFS_I(inode);
	struct xfs_mount	*mp = ip->i_mount;

	if (XFS_IS_REALTIME_INODE(ip))
		return mp->m_rtdev_targp->bt_bdev;
	else
		return mp->m_ddev_targp->bt_bdev;
}

/*
 * We're now finished for good with this ioend structure.
 * Update the page state via the associated buffer_heads,
 * release holds on the inode and bio, and finally free
 * up memory.  Do not use the ioend after this.
 */
STATIC void
xfs_destroy_ioend(
	xfs_ioend_t		*ioend)
{
	struct buffer_head	*bh, *next;

	for (bh = ioend->io_buffer_head; bh; bh = next) {
		next = bh->b_private;
		bh->b_end_io(bh, !ioend->io_error);
	}

	mempool_free(ioend, xfs_ioend_pool);
}

/*
 * Fast and loose check if this write could update the on-disk inode size.
 */
static inline bool xfs_ioend_is_append(struct xfs_ioend *ioend)
{
	return ioend->io_offset + ioend->io_size >
		XFS_I(ioend->io_inode)->i_d.di_size;
}

STATIC int
xfs_setfilesize_trans_alloc(
	struct xfs_ioend	*ioend)
{
	struct xfs_mount	*mp = XFS_I(ioend->io_inode)->i_mount;
	struct xfs_trans	*tp;
	int			error;

	tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);

	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_fsyncts, 0, 0);
	if (error) {
		xfs_trans_cancel(tp, 0);
		return error;
	}

	ioend->io_append_trans = tp;

	/*
	 * We may pass freeze protection with a transaction.  So tell lockdep
	 * we released it.
	 */
	rwsem_release(&ioend->io_inode->i_sb->s_writers.lock_map[SB_FREEZE_FS-1],
		      1, _THIS_IP_);
	/*
	 * We hand off the transaction to the completion thread now, so
	 * clear the flag here.
	 */
	current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
	return 0;
}

/*
 * Update on-disk file size now that data has been written to disk.
 */
STATIC int
xfs_setfilesize(
	struct xfs_inode	*ip,
	struct xfs_trans	*tp,
	xfs_off_t		offset,
	size_t			size)
{
	xfs_fsize_t		isize;

	xfs_ilock(ip, XFS_ILOCK_EXCL);
	isize = xfs_new_eof(ip, offset + size);
	if (!isize) {
		xfs_iunlock(ip, XFS_ILOCK_EXCL);
		xfs_trans_cancel(tp, 0);
		return 0;
	}

	trace_xfs_setfilesize(ip, offset, size);

	ip->i_d.di_size = isize;
	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);

	return xfs_trans_commit(tp, 0);
}

STATIC int
xfs_setfilesize_ioend(
	struct xfs_ioend	*ioend)
{
	struct xfs_inode	*ip = XFS_I(ioend->io_inode);
	struct xfs_trans	*tp = ioend->io_append_trans;

	/*
	 * The transaction may have been allocated in the I/O submission
	 * thread, thus we need to mark ourselves as being in a transaction
	 * manually.  Similarly for freeze protection.
	 */
	current_set_flags_nested(&tp->t_pflags, PF_FSTRANS);
	rwsem_acquire_read(&VFS_I(ip)->i_sb->s_writers.lock_map[SB_FREEZE_FS-1],
			   0, 1, _THIS_IP_);

	return xfs_setfilesize(ip, tp, ioend->io_offset, ioend->io_size);
}

/*
 * Schedule IO completion handling on the final put of an ioend.
 *
 * If there is no work to do we might as well call it a day and free the
 * ioend right now.
 */
STATIC void
xfs_finish_ioend(
	struct xfs_ioend	*ioend)
{
	if (atomic_dec_and_test(&ioend->io_remaining)) {
		struct xfs_mount	*mp = XFS_I(ioend->io_inode)->i_mount;

		if (ioend->io_type == XFS_IO_UNWRITTEN)
			queue_work(mp->m_unwritten_workqueue, &ioend->io_work);
		else if (ioend->io_append_trans)
			queue_work(mp->m_data_workqueue, &ioend->io_work);
		else
			xfs_destroy_ioend(ioend);
	}
}

/*
 * IO write completion.
 */
STATIC void
xfs_end_io(
	struct work_struct	*work)
{
	xfs_ioend_t		*ioend = container_of(work, xfs_ioend_t, io_work);
	struct xfs_inode	*ip = XFS_I(ioend->io_inode);
	int			error = 0;

	if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
		ioend->io_error = -EIO;
		goto done;
	}
	if (ioend->io_error)
		goto done;

	/*
	 * For unwritten extents we need to issue transactions to convert a
	 * range to normal written extents after the data I/O has finished.
	 */
	if (ioend->io_type == XFS_IO_UNWRITTEN) {
		error = xfs_iomap_write_unwritten(ip, ioend->io_offset,
						  ioend->io_size);
	} else if (ioend->io_append_trans) {
		error = xfs_setfilesize_ioend(ioend);
	} else {
		ASSERT(!xfs_ioend_is_append(ioend));
	}

done:
	if (error)
		ioend->io_error = error;
	xfs_destroy_ioend(ioend);
}

/*
 * Allocate and initialise an IO completion structure.
 * We need to track unwritten extent write completion here initially.
 * We'll need to extend this for updating the ondisk inode size later
 * (vs. incore size).
 */
STATIC xfs_ioend_t *
xfs_alloc_ioend(
	struct inode		*inode,
	unsigned int		type)
{
	xfs_ioend_t		*ioend;

	ioend = mempool_alloc(xfs_ioend_pool, GFP_NOFS);

	/*
	 * Set the count to 1 initially, so that an I/O completion callback
	 * cannot call the completion routine before we have started all of
	 * the I/O.
	 */
	atomic_set(&ioend->io_remaining, 1);
	ioend->io_error = 0;
	ioend->io_list = NULL;
	ioend->io_type = type;
	ioend->io_inode = inode;
	ioend->io_buffer_head = NULL;
	ioend->io_buffer_tail = NULL;
	ioend->io_offset = 0;
	ioend->io_size = 0;
	ioend->io_append_trans = NULL;

	INIT_WORK(&ioend->io_work, xfs_end_io);
	return ioend;
}

STATIC int
xfs_map_blocks(
	struct inode		*inode,
	loff_t			offset,
	struct xfs_bmbt_irec	*imap,
	int			type,
	int			nonblocking)
{
	struct xfs_inode	*ip = XFS_I(inode);
	struct xfs_mount	*mp = ip->i_mount;
	ssize_t			count = 1 << inode->i_blkbits;
	xfs_fileoff_t		offset_fsb, end_fsb;
	int			error = 0;
	int			bmapi_flags = XFS_BMAPI_ENTIRE;
	int			nimaps = 1;

	if (XFS_FORCED_SHUTDOWN(mp))
		return -EIO;

	if (type == XFS_IO_UNWRITTEN)
		bmapi_flags |= XFS_BMAPI_IGSTATE;

	if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) {
		if (nonblocking)
			return -EAGAIN;
		xfs_ilock(ip, XFS_ILOCK_SHARED);
	}

	ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
	       (ip->i_df.if_flags & XFS_IFEXTENTS));
	ASSERT(offset <= mp->m_super->s_maxbytes);

	if (offset + count > mp->m_super->s_maxbytes)
		count = mp->m_super->s_maxbytes - offset;
	end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count);
	offset_fsb = XFS_B_TO_FSBT(mp, offset);
	error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb,
				imap, &nimaps, bmapi_flags);
	xfs_iunlock(ip, XFS_ILOCK_SHARED);

	if (error)
		return error;

	if (type == XFS_IO_DELALLOC &&
	    (!nimaps || isnullstartblock(imap->br_startblock))) {
		error = xfs_iomap_write_allocate(ip, offset, imap);
		if (!error)
			trace_xfs_map_blocks_alloc(ip, offset, count, type, imap);
		return error;
	}

#ifdef DEBUG
	if (type == XFS_IO_UNWRITTEN) {
		ASSERT(nimaps);
		ASSERT(imap->br_startblock != HOLESTARTBLOCK);
		ASSERT(imap->br_startblock != DELAYSTARTBLOCK);
	}
#endif
	if (nimaps)
		trace_xfs_map_blocks_found(ip, offset, count, type, imap);
	return 0;
}

/* Return true if @offset lies within the extent described by @imap. */
STATIC int
xfs_imap_valid(
	struct inode		*inode,
	struct xfs_bmbt_irec	*imap,
	xfs_off_t		offset)
{
	offset >>= inode->i_blkbits;

	return offset >= imap->br_startoff &&
		offset < imap->br_startoff + imap->br_blockcount;
}

/*
 * BIO completion handler for buffered IO.
 */
STATIC void
xfs_end_bio(
	struct bio		*bio,
	int			error)
{
	xfs_ioend_t		*ioend = bio->bi_private;

	ASSERT(atomic_read(&bio->bi_cnt) >= 1);
	ioend->io_error = test_bit(BIO_UPTODATE, &bio->bi_flags) ? 0 : error;

	/* Toss bio and pass work off to an xfsdatad thread */
	bio->bi_private = NULL;
	bio->bi_end_io = NULL;
	bio_put(bio);

	xfs_finish_ioend(ioend);
}

STATIC void
xfs_submit_ioend_bio(
	struct writeback_control *wbc,
	xfs_ioend_t		*ioend,
	struct bio		*bio)
{
	atomic_inc(&ioend->io_remaining);
	bio->bi_private = ioend;
	bio->bi_end_io = xfs_end_bio;
	submit_bio(wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE, bio);
}

STATIC struct bio *
xfs_alloc_ioend_bio(
	struct buffer_head	*bh)
{
	int			nvecs = bio_get_nr_vecs(bh->b_bdev);
	struct bio		*bio = bio_alloc(GFP_NOIO, nvecs);

	ASSERT(bio->bi_private == NULL);
	bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
	bio->bi_bdev = bh->b_bdev;
	return bio;
}

STATIC void
xfs_start_buffer_writeback(
	struct buffer_head	*bh)
{
	ASSERT(buffer_mapped(bh));
	ASSERT(buffer_locked(bh));
	ASSERT(!buffer_delay(bh));
	ASSERT(!buffer_unwritten(bh));

	mark_buffer_async_write(bh);
	set_buffer_uptodate(bh);
	clear_buffer_dirty(bh);
}

STATIC void
xfs_start_page_writeback(
	struct page		*page,
	int			clear_dirty,
	int			buffers)
{
	ASSERT(PageLocked(page));
	ASSERT(!PageWriteback(page));

	/*
	 * if the page was not fully cleaned, we need to ensure that the
	 * higher layers come back to it correctly.  That means we need to
	 * keep the page dirty, and for WB_SYNC_ALL writeback we need to
	 * ensure the PAGECACHE_TAG_TOWRITE index mark is not removed so
	 * another attempt to write this page in this writeback sweep will
	 * be made.
	 */
	if (clear_dirty) {
		clear_page_dirty_for_io(page);
		set_page_writeback(page);
	} else
		set_page_writeback_keepwrite(page);

	unlock_page(page);

	/* If no buffers on the page are to be written, finish it here */
	if (!buffers)
		end_page_writeback(page);
}

static inline int xfs_bio_add_buffer(struct bio *bio, struct buffer_head *bh)
{
	return bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh));
}

/*
 * Submit all of the bios for all of the ioends we have saved up, covering the
 * initial writepage page and also any probed pages.
 *
 * Because we may have multiple ioends spanning a page, we need to start
 * writeback on all the buffers before we submit them for I/O.  If we mark the
 * buffers as we get them, we can end up with a page that only has some of its
 * buffers marked async write, and I/O completion on those buffers can occur
 * before we mark the remaining buffers async write.
 *
 * The end result of this is that we trip a bug in end_page_writeback() because
 * we call it twice for the one page as the code in end_buffer_async_write()
 * assumes that all buffers on the page are started at the same time.
 *
 * The fix is two passes across the ioend list - one to start writeback on the
 * buffer_heads, and then submit them for I/O on the second pass.
 *
 * If @fail is non-zero, it means that we have a situation where some part of
 * the submission process has failed after we have marked pages for writeback
 * and unlocked them.  In this situation, we need to fail the ioend chain
 * rather than submit it to IO.  This typically only happens on a filesystem
 * shutdown.
 */
STATIC void
xfs_submit_ioend(
	struct writeback_control *wbc,
	xfs_ioend_t		*ioend,
	int			fail)
{
	xfs_ioend_t		*head = ioend;
	xfs_ioend_t		*next;
	struct buffer_head	*bh;
	struct bio		*bio;
	sector_t		lastblock = 0;

	/* Pass 1 - start writeback */
	do {
		next = ioend->io_list;
		for (bh = ioend->io_buffer_head; bh; bh = bh->b_private)
			xfs_start_buffer_writeback(bh);
	} while ((ioend = next) != NULL);

	/* Pass 2 - submit I/O */
	ioend = head;
	do {
		next = ioend->io_list;
		bio = NULL;

		/*
		 * If we are failing the IO now, just mark the ioend with an
		 * error and finish it.  This will run IO completion immediately
		 * as there is only one reference to the ioend at this point in
		 * time.
		 */
		if (fail) {
			ioend->io_error = fail;
			xfs_finish_ioend(ioend);
			continue;
		}

		for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) {

			if (!bio) {
 retry:
				bio = xfs_alloc_ioend_bio(bh);
			} else if (bh->b_blocknr != lastblock + 1) {
				xfs_submit_ioend_bio(wbc, ioend, bio);
				goto retry;
			}

			if (xfs_bio_add_buffer(bio, bh) != bh->b_size) {
				xfs_submit_ioend_bio(wbc, ioend, bio);
				goto retry;
			}

			lastblock = bh->b_blocknr;
		}
		if (bio)
			xfs_submit_ioend_bio(wbc, ioend, bio);
		xfs_finish_ioend(ioend);
	} while ((ioend = next) != NULL);
}

/*
 * Cancel submission of all buffer_heads so far in this ioend.
 * Toss the ioend too.  Only ever called for the initial page
 * in a writepage request, so only ever one page.
 */
STATIC void
xfs_cancel_ioend(
	xfs_ioend_t		*ioend)
{
	xfs_ioend_t		*next;
	struct buffer_head	*bh, *next_bh;

	do {
		next = ioend->io_list;
		bh = ioend->io_buffer_head;
		do {
			next_bh = bh->b_private;
			clear_buffer_async_write(bh);
			/*
			 * The unwritten flag is cleared when added to the
			 * ioend.  We're not submitting for I/O so mark the
			 * buffer unwritten again for next time around.
			 */
			if (ioend->io_type == XFS_IO_UNWRITTEN)
				set_buffer_unwritten(bh);
			unlock_buffer(bh);
		} while ((bh = next_bh) != NULL);

		mempool_free(ioend, xfs_ioend_pool);
	} while ((ioend = next) != NULL);
}

/*
 * Test to see if we've been building up a completion structure for
 * earlier buffers -- if so, we try to append to this ioend if we
 * can, otherwise we finish off any current ioend and start another.
 * On return, *result points to the ioend the buffer was added to.
 */
STATIC void
xfs_add_to_ioend(
	struct inode		*inode,
	struct buffer_head	*bh,
	xfs_off_t		offset,
	unsigned int		type,
	xfs_ioend_t		**result,
	int			need_ioend)
{
	xfs_ioend_t		*ioend = *result;

	if (!ioend || need_ioend || type != ioend->io_type) {
		xfs_ioend_t	*previous = *result;

		ioend = xfs_alloc_ioend(inode, type);
		ioend->io_offset = offset;
		ioend->io_buffer_head = bh;
		ioend->io_buffer_tail = bh;
		if (previous)
			previous->io_list = ioend;
		*result = ioend;
	} else {
		ioend->io_buffer_tail->b_private = bh;
		ioend->io_buffer_tail = bh;
	}

	bh->b_private = NULL;
	ioend->io_size += bh->b_size;
}

/* Map the buffer to the on-disk block described by @imap. */
STATIC void
xfs_map_buffer(
	struct inode		*inode,
	struct buffer_head	*bh,
	struct xfs_bmbt_irec	*imap,
	xfs_off_t		offset)
{
	sector_t		bn;
	struct xfs_mount	*m = XFS_I(inode)->i_mount;
	xfs_off_t		iomap_offset = XFS_FSB_TO_B(m, imap->br_startoff);
	xfs_daddr_t		iomap_bn = xfs_fsb_to_db(XFS_I(inode), imap->br_startblock);

	ASSERT(imap->br_startblock != HOLESTARTBLOCK);
	ASSERT(imap->br_startblock != DELAYSTARTBLOCK);

	bn = (iomap_bn >> (inode->i_blkbits - BBSHIFT)) +
	      ((offset - iomap_offset) >> inode->i_blkbits);

	ASSERT(bn || XFS_IS_REALTIME_INODE(XFS_I(inode)));

	bh->b_blocknr = bn;
	set_buffer_mapped(bh);
}

STATIC void
xfs_map_at_offset(
	struct inode		*inode,
	struct buffer_head	*bh,
	struct xfs_bmbt_irec	*imap,
	xfs_off_t		offset)
{
	ASSERT(imap->br_startblock != HOLESTARTBLOCK);
	ASSERT(imap->br_startblock != DELAYSTARTBLOCK);

	xfs_map_buffer(inode, bh, imap, offset);
	set_buffer_mapped(bh);
	clear_buffer_delay(bh);
	clear_buffer_unwritten(bh);
}

/*
 * Test if a given page contains at least one buffer of a given @type.
 * If @check_all_buffers is true, then we walk all the buffers in the page to
 * try to find one of the type passed in.  If it is not set, then the caller
 * only needs to check the first buffer on the page for a match.
 */
STATIC bool
xfs_check_page_type(
	struct page		*page,
	unsigned int		type,
	bool			check_all_buffers)
{
	struct buffer_head	*bh;
	struct buffer_head	*head;

	if (PageWriteback(page))
		return false;
	if (!page->mapping)
		return false;
	if (!page_has_buffers(page))
		return false;

	bh = head = page_buffers(page);
	do {
		if (buffer_unwritten(bh)) {
			if (type == XFS_IO_UNWRITTEN)
				return true;
		} else if (buffer_delay(bh)) {
			if (type == XFS_IO_DELALLOC)
				return true;
		} else if (buffer_dirty(bh) && buffer_mapped(bh)) {
			if (type == XFS_IO_OVERWRITE)
				return true;
		}

		/* If we are only checking the first buffer, we are done now. */
		if (!check_all_buffers)
			break;
	} while ((bh = bh->b_this_page) != head);

	return false;
}

/*
 * Allocate & map buffers for page given the extent map.  Write it out.
 * Except for the original page of a writepage, this is called on
 * delalloc/unwritten pages only; for the original page it is possible
 * that the page has no mapping at all.
 */
STATIC int
xfs_convert_page(
	struct inode		*inode,
	struct page		*page,
	loff_t			tindex,
	struct xfs_bmbt_irec	*imap,
	xfs_ioend_t		**ioendp,
	struct writeback_control *wbc)
{
	struct buffer_head	*bh, *head;
	xfs_off_t		end_offset;
	unsigned long		p_offset;
	unsigned int		type;
	int			len, page_dirty;
	int			count = 0, done = 0, uptodate = 1;
	xfs_off_t		offset = page_offset(page);

	if (page->index != tindex)
		goto fail;
	if (!trylock_page(page))
		goto fail;
	if (PageWriteback(page))
		goto fail_unlock_page;
	if (page->mapping != inode->i_mapping)
		goto fail_unlock_page;
	if (!xfs_check_page_type(page, (*ioendp)->io_type, false))
		goto fail_unlock_page;

	/*
	 * page_dirty is initially a count of buffers on the page before
	 * EOF and is decremented as we move each into a cleanable state.
	 *
	 * Derivation:
	 *
	 * End offset is the highest offset that this page should represent.
	 * If we are on the last page, (end_offset & (PAGE_CACHE_SIZE - 1))
	 * will evaluate non-zero and be less than PAGE_CACHE_SIZE and
	 * hence give us the correct page_dirty count.  On any other page,
	 * it will be zero and in that case we need page_dirty to be the
	 * count of buffers on the page.
	 */
	end_offset = min_t(unsigned long long,
			(xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT,
			i_size_read(inode));

	/*
	 * If the current map does not span the entire page we are about to try
	 * to write, then give up.  The only way we can write a page that spans
	 * multiple mappings in a single writeback iteration is via the
	 * xfs_vm_writepage() function.  Data integrity writeback requires the
	 * entire page to be written in a single attempt, otherwise the part of
	 * the page we don't write here doesn't get written as part of the data
	 * integrity sync.
	 *
	 * For normal writeback, we also don't attempt to write partial pages
	 * here as it simply means that write_cache_pages() will see it under
	 * writeback and ignore the page until some point in the future, at
	 * which time this will be the only page in the file that needs
	 * writeback.  Hence for more optimal IO patterns, we should always
	 * avoid partial page writeback due to multiple mappings on a page here.
	 */
	if (!xfs_imap_valid(inode, imap, end_offset))
		goto fail_unlock_page;

	len = 1 << inode->i_blkbits;
	p_offset = min_t(unsigned long, end_offset & (PAGE_CACHE_SIZE - 1),
					PAGE_CACHE_SIZE);
	p_offset = p_offset ? roundup(p_offset, len) : PAGE_CACHE_SIZE;
	page_dirty = p_offset / len;

	/*
	 * The moment we find a buffer that doesn't match our current type
	 * specification or can't be written, abort the loop and start
	 * writeback.  As per the above xfs_imap_valid() check, only
	 * xfs_vm_writepage() can handle partial page writeback fully - we are
	 * limited here to the buffers that are contiguous with the current
	 * ioend, and hence a buffer we can't write breaks that contiguity and
	 * we have to defer the rest of the IO to xfs_vm_writepage().
	 */
	bh = head = page_buffers(page);
	do {
		if (offset >= end_offset)
			break;
		if (!buffer_uptodate(bh))
			uptodate = 0;
		if (!(PageUptodate(page) || buffer_uptodate(bh))) {
			done = 1;
			break;
		}

		if (buffer_unwritten(bh) || buffer_delay(bh) ||
		    buffer_mapped(bh)) {
			if (buffer_unwritten(bh))
				type = XFS_IO_UNWRITTEN;
			else if (buffer_delay(bh))
				type = XFS_IO_DELALLOC;
			else
				type = XFS_IO_OVERWRITE;

			/*
			 * imap should always be valid because of the above
			 * partial page end_offset check on the imap.
			 */
			ASSERT(xfs_imap_valid(inode, imap, offset));

			lock_buffer(bh);
			if (type != XFS_IO_OVERWRITE)
				xfs_map_at_offset(inode, bh, imap, offset);
			xfs_add_to_ioend(inode, bh, offset, type,
					 ioendp, done);

			page_dirty--;
			count++;
		} else {
			done = 1;
			break;
		}
	} while (offset += len, (bh = bh->b_this_page) != head);

	if (uptodate && bh == head)
		SetPageUptodate(page);

	if (count) {
		if (--wbc->nr_to_write <= 0 &&
		    wbc->sync_mode == WB_SYNC_NONE)
			done = 1;
	}
	xfs_start_page_writeback(page, !page_dirty, count);

	return done;
 fail_unlock_page:
	unlock_page(page);
 fail:
	return 1;
}

/*
 * Convert & write out a cluster of pages in the same extent as defined
 * by mp and following the start page.
 */
STATIC void
xfs_cluster_write(
	struct inode		*inode,
	pgoff_t			tindex,
	struct xfs_bmbt_irec	*imap,
	xfs_ioend_t		**ioendp,
	struct writeback_control *wbc,
	pgoff_t			tlast)
{
	struct pagevec		pvec;
	int			done = 0, i;

	pagevec_init(&pvec, 0);
	while (!done && tindex <= tlast) {
		unsigned len = min_t(pgoff_t, PAGEVEC_SIZE, tlast - tindex + 1);

		if (!pagevec_lookup(&pvec, inode->i_mapping, tindex, len))
			break;

		for (i = 0; i < pagevec_count(&pvec); i++) {
			done = xfs_convert_page(inode, pvec.pages[i], tindex++,
						imap, ioendp, wbc);
			if (done)
				break;
		}

		pagevec_release(&pvec);
		cond_resched();
	}
}

STATIC void
xfs_vm_invalidatepage(
	struct page		*page,
	unsigned int		offset,
	unsigned int		length)
{
	trace_xfs_invalidatepage(page->mapping->host, page, offset,
				 length);
	block_invalidatepage(page, offset, length);
}

/*
 * If the page has delalloc buffers on it, we need to punch them out before we
 * invalidate the page.  If we don't, we leave a stale delalloc mapping on the
 * inode that can trip a BUG() in xfs_get_blocks() later on if a direct IO read
 * is done on that same region - the delalloc extent is returned when none is
 * supposed to be there.
 *
 * We prevent this by truncating away the delalloc regions on the page before
 * invalidating it.  Because they are delalloc, we can do this without needing
 * a transaction.  Indeed - if we get ENOSPC errors, we have to be able to do
 * this truncation without a transaction as there is no space left for block
 * reservation (typically why we see an ENOSPC in writeback).
 *
 * This is not a performance critical path, so for now just do the punching a
 * buffer head at a time.
 */
STATIC void
xfs_aops_discard_page(
	struct page		*page)
{
	struct inode		*inode = page->mapping->host;
	struct xfs_inode	*ip = XFS_I(inode);
	struct buffer_head	*bh, *head;
	loff_t			offset = page_offset(page);

	if (!xfs_check_page_type(page, XFS_IO_DELALLOC, true))
		goto out_invalidate;

	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
		goto out_invalidate;

	xfs_alert(ip->i_mount,
		"page discard on page %p, inode 0x%llx, offset %llu.",
			page, ip->i_ino, offset);

	xfs_ilock(ip, XFS_ILOCK_EXCL);
	bh = head = page_buffers(page);
	do {
		int		error;
		xfs_fileoff_t	start_fsb;

		if (!buffer_delay(bh))
			goto next_buffer;

		start_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
		error = xfs_bmap_punch_delalloc_range(ip, start_fsb, 1);
		if (error) {
			/* something screwed, just bail */
			if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
				xfs_alert(ip->i_mount,
			"page discard unable to remove delalloc mapping.");
			}
			break;
		}
next_buffer:
		offset += 1 << inode->i_blkbits;

	} while ((bh = bh->b_this_page) != head);

	xfs_iunlock(ip, XFS_ILOCK_EXCL);
out_invalidate:
	xfs_vm_invalidatepage(page, 0, PAGE_CACHE_SIZE);
	return;
}

/*
 * Write out a dirty page.
 *
 * For delalloc space on the page we need to allocate space and flush it.
 * For unwritten space on the page we need to start the conversion to
 * regular allocated space.
 * For any other dirty buffer heads on the page we should flush them.
 */
STATIC int
xfs_vm_writepage(
	struct page		*page,
	struct writeback_control *wbc)
{
	struct inode		*inode = page->mapping->host;
	struct buffer_head	*bh, *head;
	struct xfs_bmbt_irec	imap;
	xfs_ioend_t		*ioend = NULL, *iohead = NULL;
	loff_t			offset;
	unsigned int		type;
	__uint64_t		end_offset;
	pgoff_t			end_index, last_index;
	ssize_t			len;
	int			err, imap_valid = 0, uptodate = 1;
	int			count = 0;
	int			nonblocking = 0;

	trace_xfs_writepage(inode, page, 0, 0);

	ASSERT(page_has_buffers(page));

	/*
	 * Refuse to write the page out if we are called from reclaim context.
	 *
	 * This avoids stack overflows when called from deeply used stacks in
	 * random callers for direct reclaim or memcg reclaim.  We explicitly
	 * allow reclaim from kswapd as the stack usage there is relatively low.
	 *
	 * This should never happen except in the case of a VM regression so
	 * warn about it.
	 */
	if (WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD)) ==
			PF_MEMALLOC))
		goto redirty;

	/*
	 * Given that we do not allow direct reclaim to call us, we should
	 * never be called while in a filesystem transaction.
	 */
	if (WARN_ON_ONCE(current->flags & PF_FSTRANS))
		goto redirty;

	/* Is this page beyond the end of the file? */
	offset = i_size_read(inode);
	end_index = offset >> PAGE_CACHE_SHIFT;
	last_index = (offset - 1) >> PAGE_CACHE_SHIFT;

	/*
	 * The page index is less than the end_index, adjust the end_offset
	 * to the highest offset that this page should represent.
	 * -----------------------------------------------------
	 * |			file mapping	       | <EOF> |
	 * -----------------------------------------------------
	 * | Page ... | Page N-2 | Page N-1 |  Page N  |       |
	 * ^--------------------------------^----------|--------
	 * |     desired writeback range    |      see else    |
	 * ---------------------------------^------------------|
	 */
	if (page->index < end_index)
		end_offset = (xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT;
	else {
		/*
		 * Check whether the page to write out is beyond or straddles
		 * i_size or not.
		 * -------------------------------------------------------
		 * |		file mapping		        | <EOF>  |
		 * -------------------------------------------------------
		 * | Page ... | Page N-2 | Page N-1 |  Page N   | Beyond |
		 * ^--------------------------------^-----------|---------
		 * |				    | Straddles  |
		 * ---------------------------------^-----------|--------|
		 */
		unsigned offset_into_page = offset & (PAGE_CACHE_SIZE - 1);

		/*
		 * Skip the page if it is fully outside i_size, e.g. due to a
		 * truncate operation that is in progress.  We must redirty the
		 * page so that reclaim stops reclaiming it.  Otherwise
		 * xfs_vm_releasepage() is called on it and gets confused.
		 *
		 * Note that end_index is an unsigned long.  If the given
		 * offset is greater than 16TB on a 32-bit system and we
		 * checked whether the page is fully outside i_size via
		 * "if (page->index >= end_index + 1)", then "end_index + 1"
		 * would evaluate to 0.  Hence this page would be redirtied
		 * and written out repeatedly, resulting in an infinite loop;
		 * the user program performing this operation would hang.
		 * Instead, we can detect this situation by checking if the
		 * page to write is totally beyond i_size or if its offset is
		 * just equal to the EOF.
		 */
		if (page->index > end_index ||
		    (page->index == end_index && offset_into_page == 0))
			goto redirty;

		/*
		 * The page straddles i_size.  It must be zeroed out on each
		 * and every writepage invocation because it may be mmapped.
		 * "A file is mapped in multiples of the page size.  For a file
		 * that is not a multiple of the page size, the remaining
		 * memory is zeroed when mapped, and writes to that region are
		 * not written out to the file."
		 */
		zero_user_segment(page, offset_into_page, PAGE_CACHE_SIZE);

		/* Adjust the end_offset to the end of file */
		end_offset = offset;
	}

	len = 1 << inode->i_blkbits;

	bh = head = page_buffers(page);
	offset = page_offset(page);
	type = XFS_IO_OVERWRITE;

	if (wbc->sync_mode == WB_SYNC_NONE)
		nonblocking = 1;

	do {
		int new_ioend = 0;

		if (offset >= end_offset)
			break;
		if (!buffer_uptodate(bh))
			uptodate = 0;

		/*
		 * set_page_dirty dirties all buffers in a page, independent
		 * of their state.  The dirty state however is entirely
		 * meaningless for holes (!mapped && uptodate), so skip
		 * buffers covering holes here.
		 */
		if (!buffer_mapped(bh) && buffer_uptodate(bh)) {
			imap_valid = 0;
			continue;
		}

		if (buffer_unwritten(bh)) {
			if (type != XFS_IO_UNWRITTEN) {
				type = XFS_IO_UNWRITTEN;
				imap_valid = 0;
			}
		} else if (buffer_delay(bh)) {
			if (type != XFS_IO_DELALLOC) {
				type = XFS_IO_DELALLOC;
				imap_valid = 0;
			}
		} else if (buffer_uptodate(bh)) {
			if (type != XFS_IO_OVERWRITE) {
				type = XFS_IO_OVERWRITE;
				imap_valid = 0;
			}
		} else {
			if (PageUptodate(page))
				ASSERT(buffer_mapped(bh));
			/*
			 * This buffer is not uptodate and will not be
			 * written to disk.  Ensure that we will put any
			 * subsequent writeable buffers into a new
			 * ioend.
			 */
			imap_valid = 0;
			continue;
		}

		if (imap_valid)
			imap_valid = xfs_imap_valid(inode, &imap, offset);
		if (!imap_valid) {
			/*
			 * If we didn't have a valid mapping then we need to
			 * put the new mapping into a separate ioend structure.
			 * This ensures non-contiguous extents always have
			 * separate ioends, which is particularly important
			 * for unwritten extent conversion at I/O completion
			 * time.
			 */
			new_ioend = 1;
			err = xfs_map_blocks(inode, offset, &imap, type,
					     nonblocking);
			if (err)
				goto error;
			imap_valid = xfs_imap_valid(inode, &imap, offset);
		}
		if (imap_valid) {
			lock_buffer(bh);
			if (type != XFS_IO_OVERWRITE)
				xfs_map_at_offset(inode, bh, &imap, offset);
			xfs_add_to_ioend(inode, bh, offset, type, &ioend,
					 new_ioend);
			count++;
		}

		if (!iohead)
			iohead = ioend;

	} while (offset += len, ((bh = bh->b_this_page) != head));

	if (uptodate && bh == head)
		SetPageUptodate(page);

	xfs_start_page_writeback(page, 1, count);

	/* if there is no IO to be submitted for this page, we are done */
	if (!ioend)
		return 0;

	ASSERT(iohead);

	/*
	 * Any errors from this point onwards need to be reported through the
	 * IO completion path as we have marked the initial page as under
	 * writeback and unlocked it.
	 */
	if (imap_valid) {
		xfs_off_t		end_index;

		end_index = imap.br_startoff + imap.br_blockcount;

		/* to bytes */
		end_index <<= inode->i_blkbits;

		/* to pages */
		end_index = (end_index - 1) >> PAGE_CACHE_SHIFT;

		/* check against file size */
		if (end_index > last_index)
			end_index = last_index;

		xfs_cluster_write(inode, page->index + 1, &imap, &ioend,
				  wbc, end_index);
	}


	/*
	 * Reserve log space if we might write beyond the on-disk inode size.
	 */
	err = 0;
	if (ioend->io_type != XFS_IO_UNWRITTEN && xfs_ioend_is_append(ioend))
		err = xfs_setfilesize_trans_alloc(ioend);

	xfs_submit_ioend(wbc, iohead, err);

	return 0;

error:
	if (iohead)
		xfs_cancel_ioend(iohead);

	if (err == -EAGAIN)
		goto redirty;

	xfs_aops_discard_page(page);
	ClearPageUptodate(page);
	unlock_page(page);
	return err;

redirty:
	redirty_page_for_writepage(wbc, page);
	unlock_page(page);
	return 0;
}

STATIC int
xfs_vm_writepages(
	struct address_space	*mapping,
	struct writeback_control *wbc)
{
	xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED);
	return generic_writepages(mapping, wbc);
}

/*
 * Called to move a page into cleanable state - and from there
 * to be released.  The page should already be clean.  We always
 * have buffer heads in this call.
 *
 * Returns 1 if the page is ok to release, 0 otherwise.
 */
STATIC int
xfs_vm_releasepage(
	struct page		*page,
	gfp_t			gfp_mask)
{
	int			delalloc, unwritten;

	trace_xfs_releasepage(page->mapping->host, page, 0, 0);

	xfs_count_page_state(page, &delalloc, &unwritten);

	if (WARN_ON_ONCE(delalloc))
		return 0;
	if (WARN_ON_ONCE(unwritten))
		return 0;

	return try_to_free_buffers(page);
}

/*
 * When we map a DIO buffer, we may need to attach an ioend that describes the
 * type of write IO we are doing.  This passes to the completion function the
 * operations it needs to perform.  If the mapping is for an overwrite wholly
 * within the EOF then we don't need an ioend and so we don't allocate one.
 * This avoids the unnecessary overhead of allocating and freeing ioends for
 * workloads that don't require transactions on IO completion.
 *
 * If we get multiple mappings in a single IO, we might be mapping different
 * types.  But because the direct IO can only have a single private pointer, we
 * need to ensure that:
 *
 * a) i) the ioend spans the entire region of unwritten mappings; or
 *    ii) the ioend spans all the mappings that cross or are beyond EOF; and
 * b) if it contains unwritten extents, it is *permanently* marked as such
 *
 * We could do this by chaining ioends like buffered IO does, but we only
 * actually get one IO completion callback from the direct IO, and that spans
 * the entire IO regardless of how many mappings and IOs are needed to complete
 * the DIO.  There is only going to be one reference to the ioend and its life
 * cycle is constrained by the DIO completion code.  Hence we don't need
 * reference counting here.
 */
static void
xfs_map_direct(
	struct inode		*inode,
	struct buffer_head	*bh_result,
	struct xfs_bmbt_irec	*imap,
	xfs_off_t		offset)
{
	struct xfs_ioend	*ioend;
	xfs_off_t		size = bh_result->b_size;
	int			type;

	if (ISUNWRITTEN(imap))
		type = XFS_IO_UNWRITTEN;
	else
		type = XFS_IO_OVERWRITE;

	trace_xfs_gbmap_direct(XFS_I(inode), offset, size, type, imap);

	if (bh_result->b_private) {
		ioend = bh_result->b_private;
		ASSERT(ioend->io_size > 0);
		ASSERT(offset >= ioend->io_offset);
		if (offset + size > ioend->io_offset + ioend->io_size)
			ioend->io_size = offset - ioend->io_offset + size;

		if (type == XFS_IO_UNWRITTEN && type != ioend->io_type)
			ioend->io_type = XFS_IO_UNWRITTEN;

		trace_xfs_gbmap_direct_update(XFS_I(inode), ioend->io_offset,
					      ioend->io_size, ioend->io_type,
					      imap);
	} else if (type == XFS_IO_UNWRITTEN ||
		   offset + size > i_size_read(inode)) {
		ioend = xfs_alloc_ioend(inode, type);
		ioend->io_offset = offset;
		ioend->io_size = size;

		bh_result->b_private = ioend;
		set_buffer_defer_completion(bh_result);

		trace_xfs_gbmap_direct_new(XFS_I(inode), offset, size, type,
					   imap);
	} else {
		trace_xfs_gbmap_direct_none(XFS_I(inode), offset, size, type,
					    imap);
	}
}

/*
 * If this is O_DIRECT or the mpage code calling, tell them how large the
 * mapping is so that we can avoid repeated get_blocks calls.
 *
 * If the mapping spans EOF, then we have to break the mapping up as the
 * mapping for blocks beyond EOF must be marked new so that sub block regions
 * can be correctly zeroed.  We can't do this for mappings within EOF unless
 * the mapping was just allocated or is unwritten, otherwise the callers would
 * overwrite existing data with zeros.  Hence we have to split the mapping into
 * a range up to and including EOF, and a second mapping for beyond EOF.
 */
static void
xfs_map_trim_size(
	struct inode		*inode,
	sector_t		iblock,
	struct buffer_head	*bh_result,
	struct xfs_bmbt_irec	*imap,
	xfs_off_t		offset,
	ssize_t			size)
{
	xfs_off_t		mapping_size;

	mapping_size = imap->br_startoff + imap->br_blockcount - iblock;
	mapping_size <<= inode->i_blkbits;

	ASSERT(mapping_size > 0);
	if (mapping_size > size)
		mapping_size = size;
	if (offset < i_size_read(inode) &&
	    offset + mapping_size >= i_size_read(inode)) {
		/* limit mapping to block that spans EOF */
		mapping_size = roundup_64(i_size_read(inode) - offset,
					  1 << inode->i_blkbits);
	}
	if (mapping_size > LONG_MAX)
		mapping_size = LONG_MAX;

	bh_result->b_size = mapping_size;
}

STATIC int
__xfs_get_blocks(
	struct inode		*inode,
	sector_t		iblock,
	struct buffer_head	*bh_result,
	int			create,
	int			direct)
{
	struct xfs_inode	*ip = XFS_I(inode);
	struct xfs_mount	*mp = ip->i_mount;
	xfs_fileoff_t		offset_fsb, end_fsb;
	int			error = 0;
	int			lockmode = 0;
	struct xfs_bmbt_irec	imap;
	int			nimaps = 1;
	xfs_off_t		offset;
	ssize_t			size;
	int			new = 0;

	if (XFS_FORCED_SHUTDOWN(mp))
		return -EIO;

	offset = (xfs_off_t)iblock << inode->i_blkbits;
	ASSERT(bh_result->b_size >= (1 << inode->i_blkbits));
	size = bh_result->b_size;

	if (!create && direct && offset >= i_size_read(inode))
		return 0;

	/*
	 * Direct I/O is usually done on preallocated files, so try getting
	 * a block mapping without an exclusive lock first.  For buffered
	 * writes we already have the exclusive iolock anyway, so avoiding
	 * a lock roundtrip here by taking the ilock exclusive from the
	 * beginning is a useful micro optimization.
	 */
	if (create && !direct) {
		lockmode = XFS_ILOCK_EXCL;
		xfs_ilock(ip, lockmode);
	} else {
		lockmode = xfs_ilock_data_map_shared(ip);
	}

	ASSERT(offset <= mp->m_super->s_maxbytes);
	if (offset + size > mp->m_super->s_maxbytes)
		size = mp->m_super->s_maxbytes - offset;
	end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + size);
	offset_fsb = XFS_B_TO_FSBT(mp, offset);

	error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb,
				&imap, &nimaps, XFS_BMAPI_ENTIRE);
	if (error)
		goto out_unlock;

	if (create &&
	    (!nimaps ||
	     (imap.br_startblock == HOLESTARTBLOCK ||
	      imap.br_startblock == DELAYSTARTBLOCK))) {
		if (direct || xfs_get_extsz_hint(ip)) {
			/*
			 * Drop the ilock in preparation for starting the block
			 * allocation transaction.  It will be retaken
			 * exclusively inside xfs_iomap_write_direct for the
			 * actual allocation.
			 */
			xfs_iunlock(ip, lockmode);
			error = xfs_iomap_write_direct(ip, offset, size,
						       &imap, nimaps);
			if (error)
				return error;
			new = 1;
		} else {
			/*
			 * Delalloc reservations do not require a transaction,
			 * we can go on without dropping the lock here.  If we
			 * are allocating a new delalloc block, make sure that
			 * we set the new flag so that we know the buffer is
			 * newly allocated if the write fails.
			 */
			if (nimaps && imap.br_startblock == HOLESTARTBLOCK)
				new = 1;
			error = xfs_iomap_write_delay(ip, offset, size, &imap);
			if (error)
				goto out_unlock;

			xfs_iunlock(ip, lockmode);
		}
		trace_xfs_get_blocks_alloc(ip, offset, size,
				ISUNWRITTEN(&imap) ? XFS_IO_UNWRITTEN
						   : XFS_IO_DELALLOC, &imap);
	} else if (nimaps) {
		trace_xfs_get_blocks_found(ip, offset, size,
				ISUNWRITTEN(&imap) ? XFS_IO_UNWRITTEN
						   : XFS_IO_OVERWRITE, &imap);
		xfs_iunlock(ip, lockmode);
	} else {
		trace_xfs_get_blocks_notfound(ip, offset, size);
		goto out_unlock;
	}

	/* trim mapping down to size requested */
	if (direct || size > (1 << inode->i_blkbits))
		xfs_map_trim_size(inode, iblock, bh_result,
				  &imap, offset, size);

	/*
	 * For unwritten extents do not report a disk address in the buffered
	 * read case (treat as if we're reading into a hole).
	 */
	if (imap.br_startblock != HOLESTARTBLOCK &&
	    imap.br_startblock != DELAYSTARTBLOCK &&
	    (create || !ISUNWRITTEN(&imap))) {
		xfs_map_buffer(inode, bh_result, &imap, offset);
		if (ISUNWRITTEN(&imap))
			set_buffer_unwritten(bh_result);
		/* direct IO needs special help */
		if (create && direct)
			xfs_map_direct(inode, bh_result, &imap, offset);
	}

	/*
	 * If this is a realtime file, data may be on a different device to
	 * that currently pointed to by the buffer_head's b_bdev.
	 */
	bh_result->b_bdev = xfs_find_bdev_for_inode(inode);

	/*
	 * If we previously allocated a block out beyond eof and we are now
	 * coming back to use it then we will need to flag it as new even if it
	 * has a disk address.
	 *
	 * With sub-block writes into unwritten extents we also need to mark
	 * the buffer as new so that the unwritten parts of the buffer gets
	 * correctly zeroed.
	 */
	if (create &&
	    ((!buffer_mapped(bh_result) && !buffer_uptodate(bh_result)) ||
	     (offset >= i_size_read(inode)) ||
	     (new || ISUNWRITTEN(&imap))))
		set_buffer_new(bh_result);

	if (imap.br_startblock == DELAYSTARTBLOCK) {
		BUG_ON(direct);
		if (create) {
			set_buffer_uptodate(bh_result);
			set_buffer_mapped(bh_result);
			set_buffer_delay(bh_result);
		}
	}

	return 0;

out_unlock:
	xfs_iunlock(ip, lockmode);
	return error;
}

int
xfs_get_blocks(
	struct inode		*inode,
	sector_t		iblock,
	struct buffer_head	*bh_result,
	int			create)
{
	return __xfs_get_blocks(inode, iblock, bh_result, create, 0);
}

STATIC int
xfs_get_blocks_direct(
	struct inode		*inode,
	sector_t		iblock,
	struct buffer_head	*bh_result,
	int			create)
{
	return __xfs_get_blocks(inode, iblock, bh_result, create, 1);
}

/*
 * Complete a direct I/O write request.
 *
 * The ioend structure is passed from __xfs_get_blocks() to tell us what to do.
 * If no ioend exists (i.e. @private == NULL) then the write IO is an overwrite
 * wholly within the EOF and so there is nothing for us to do.  Note that in
 * this case the completion can be called in interrupt context, whereas if we
 * have an ioend we will always be called in task context (i.e. from a
 * workqueue).
 */
STATIC void
xfs_end_io_direct_write(
	struct kiocb		*iocb,
	loff_t			offset,
	ssize_t			size,
	void			*private)
{
	struct inode		*inode = file_inode(iocb->ki_filp);
	struct xfs_inode	*ip = XFS_I(inode);
	struct xfs_mount	*mp = ip->i_mount;
	struct xfs_ioend	*ioend = private;

	trace_xfs_gbmap_direct_endio(ip, offset, size,
				     ioend ? ioend->io_type : 0, NULL);

	if (!ioend) {
		ASSERT(offset + size <= i_size_read(inode));
		return;
	}

	if (XFS_FORCED_SHUTDOWN(mp))
		goto out_end_io;

	/*
	 * dio completion end_io functions are only called on writes if more
	 * than 0 bytes were written.
	 */
	ASSERT(size > 0);

	/*
	 * The ioend only maps whole blocks, while the IO may be sector aligned.
	 * Hence the ioend offset/size may not match the IO offset/size exactly.
	 * Because we don't map overwrites within EOF into the ioend, the offset
	 * may not match, but only if the endio spans EOF.  Either way, write
	 * the IO sizes into the ioend so that completion processing does the
	 * right thing.
	 */
	ASSERT(offset + size <= ioend->io_offset + ioend->io_size);
	ioend->io_size = size;
	ioend->io_offset = offset;

	/*
	 * The ioend tells us whether we are doing unwritten extent conversion
	 * or an append transaction that updates the on-disk file size.  These
	 * cases are the only cases where we should *potentially* be needing
	 * to update the VFS inode size.
	 *
	 * We need to update the in-core inode size here so that we don't end up
	 * with the on-disk inode size being outside the in-core inode size.  We
	 * have no other method of updating EOF for AIO, so always do it here
	 * if necessary.
	 *
	 * We need to lock the test/set EOF update as we can be racing with
	 * other IO completions here to update the EOF.  Failing to serialise
	 * here can result in EOF moving backwards and Bad Things Happen when
	 * that occurs.
	 */
	spin_lock(&ip->i_flags_lock);
	if (offset + size > i_size_read(inode))
		i_size_write(inode, offset + size);
	spin_unlock(&ip->i_flags_lock);

	/*
	 * If we are doing an append IO that needs to update the EOF on disk,
	 * do the transaction reserve now so we can use common end io
	 * processing.  Stashing the error (if there is one) in the ioend will
	 * result in the ioend processing passing on the error if it is
	 * possible as we can't return it from here.
	 */
	if (ioend->io_type == XFS_IO_OVERWRITE)
		ioend->io_error = xfs_setfilesize_trans_alloc(ioend);

out_end_io:
	xfs_end_io(&ioend->io_work);
	return;
}

STATIC ssize_t
xfs_vm_direct_IO(
	struct kiocb		*iocb,
	struct iov_iter		*iter,
	loff_t			offset)
{
	struct inode		*inode = iocb->ki_filp->f_mapping->host;
	struct block_device	*bdev = xfs_find_bdev_for_inode(inode);

	if (iov_iter_rw(iter) == WRITE) {
		return __blockdev_direct_IO(iocb, inode, bdev, iter, offset,
					    xfs_get_blocks_direct,
					    xfs_end_io_direct_write, NULL,
					    DIO_ASYNC_EXTEND);
	}
	return __blockdev_direct_IO(iocb, inode, bdev, iter, offset,
				    xfs_get_blocks_direct, NULL, NULL, 0);
}

/*
 * Punch out the delalloc blocks we have already allocated.
 *
 * Don't bother with xfs_setattr given that nothing can have made it to disk
 * yet as the page is still locked at this point.
 */
STATIC void
xfs_vm_kill_delalloc_range(
	struct inode		*inode,
	loff_t			start,
	loff_t			end)
{
	struct xfs_inode	*ip = XFS_I(inode);
	xfs_fileoff_t		start_fsb;
	xfs_fileoff_t		end_fsb;
	int			error;

	start_fsb = XFS_B_TO_FSB(ip->i_mount, start);
	end_fsb = XFS_B_TO_FSB(ip->i_mount, end);
	if (end_fsb <= start_fsb)
		return;

	xfs_ilock(ip, XFS_ILOCK_EXCL);
	error = xfs_bmap_punch_delalloc_range(ip, start_fsb,
						end_fsb - start_fsb);
	if (error) {
		/* something screwed, just bail */
		if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
			xfs_alert(ip->i_mount,
		"xfs_vm_write_failed: unable to clean up ino %lld",
					ip->i_ino);
		}
	}
	xfs_iunlock(ip, XFS_ILOCK_EXCL);
}

STATIC void
xfs_vm_write_failed(
	struct inode		*inode,
	struct page		*page,
	loff_t			pos,
	unsigned		len)
{
	loff_t			block_offset;
	loff_t			block_start;
	loff_t			block_end;
	loff_t			from = pos & (PAGE_CACHE_SIZE - 1);
	loff_t			to = from + len;
	struct buffer_head	*bh, *head;

	/*
	 * The request pos offset might be 32 or 64 bit, this is all fine
	 * on 64-bit platform.  However, for 64-bit pos request on 32-bit
	 * platform, the high 32-bit will be masked off if we evaluate the
	 * block_offset via (pos & PAGE_MASK) because the PAGE_MASK is
	 * 0xfffff000 as an unsigned long, hence the result is incorrect,
	 * which could cause the following ASSERT to fail in most cases.
	 * In order to avoid this, we can evaluate the block_offset of the
	 * start of the page by using shifts rather than masks, which avoids
	 * the mismatch problem.
	 */
	block_offset = (pos >> PAGE_CACHE_SHIFT) << PAGE_CACHE_SHIFT;

	ASSERT(block_offset + from == pos);

	head = page_buffers(page);
	block_start = 0;
	for (bh = head; bh != head || !block_start;
	     bh = bh->b_this_page, block_start = block_end,
				   block_offset += bh->b_size) {
		block_end = block_start + bh->b_size;

		/* skip buffers before the write */
		if (block_end <= from)
			continue;

		/* if the buffer is after the write, we're done */
		if (block_start >= to)
			break;

		if (!buffer_delay(bh))
			continue;

		if (!buffer_new(bh) && block_offset < i_size_read(inode))
			continue;

		xfs_vm_kill_delalloc_range(inode, block_offset,
					   block_offset + bh->b_size);

		/*
		 * This buffer does not contain data anymore.  Make sure anyone
		 * who finds it knows that for certain.
		 */
		clear_buffer_delay(bh);
		clear_buffer_uptodate(bh);
		clear_buffer_mapped(bh);
		clear_buffer_new(bh);
		clear_buffer_dirty(bh);
	}

}

/*
 * This used to call block_write_begin(), but it unlocks and releases the page
 * on error, and we need that page to be able to punch stale delalloc blocks
 * out on failure.  Hence we copy-n-waste it here and call xfs_vm_write_failed()
 * at the appropriate point.
 */
STATIC int
xfs_vm_write_begin(
	struct file		*file,
	struct address_space	*mapping,
	loff_t			pos,
	unsigned		len,
	unsigned		flags,
	struct page		**pagep,
	void			**fsdata)
{
	pgoff_t			index = pos >> PAGE_CACHE_SHIFT;
	struct page		*page;
	int			status;

	ASSERT(len <= PAGE_CACHE_SIZE);

	page = grab_cache_page_write_begin(mapping, index, flags);
	if (!page)
		return -ENOMEM;

	status = __block_write_begin(page, pos, len, xfs_get_blocks);
	if (unlikely(status)) {
		struct inode	*inode = mapping->host;
		size_t		isize = i_size_read(inode);

		xfs_vm_write_failed(inode, page, pos, len);
		unlock_page(page);

		/*
		 * If the write is beyond EOF, we only want to kill blocks
		 * allocated in this write, not blocks that were previously
		 * written successfully.
		 */
		if (pos + len > isize) {
			ssize_t start = max_t(ssize_t, pos, isize);

			truncate_pagecache_range(inode, start, pos + len);
		}

		page_cache_release(page);
		page = NULL;
	}

	*pagep = page;
	return status;
}

/*
 * On failure, we only need to kill delalloc blocks beyond EOF in the range of
 * this specific write because they will never be written.  Previous writes
 * beyond EOF where block allocation succeeded do not need to be trashed, so
 * only new blocks from this write should be trashed.  For blocks within
 * EOF, generic_write_end() zeros them so they are safe to leave alone and be
 * written with all the other valid data.
 */
STATIC int
xfs_vm_write_end(
	struct file		*file,
	struct address_space	*mapping,
	loff_t			pos,
	unsigned		len,
	unsigned		copied,
	struct page		*page,
	void			*fsdata)
{
	int			ret;

	ASSERT(len <= PAGE_CACHE_SIZE);

	ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata);
	if (unlikely(ret < len)) {
		struct inode	*inode = mapping->host;
		size_t		isize = i_size_read(inode);
		loff_t		to = pos + len;

		if (to > isize) {
			/* only kill blocks in this write beyond EOF */
			if (pos > isize)
				isize = pos;
			xfs_vm_kill_delalloc_range(inode, isize, to);
			truncate_pagecache_range(inode, isize, to);
		}
	}
	return ret;
}

STATIC sector_t
xfs_vm_bmap(
	struct address_space	*mapping,
	sector_t		block)
{
	struct inode		*inode = (struct inode *)mapping->host;
	struct xfs_inode	*ip = XFS_I(inode);

	trace_xfs_vm_bmap(XFS_I(inode));
	xfs_ilock(ip, XFS_IOLOCK_SHARED);
	filemap_write_and_wait(mapping);
	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
	return generic_block_bmap(mapping, block, xfs_get_blocks);
}

STATIC int
xfs_vm_readpage(
	struct file		*unused,
	struct page		*page)
{
	return mpage_readpage(page, xfs_get_blocks);
}

STATIC int
xfs_vm_readpages(
	struct file		*unused,
	struct address_space	*mapping,
	struct list_head	*pages,
	unsigned		nr_pages)
{
	return mpage_readpages(mapping, pages, nr_pages, xfs_get_blocks);
}

/*
 * This is basically a copy of __set_page_dirty_buffers() with one
 * small tweak: buffers beyond EOF do not get marked dirty.  If we mark them
 * dirty, we'll never be able to clean them because we don't write buffers
 * beyond EOF, and that means we can't invalidate pages that span EOF
 * that have been marked dirty.  Further, the dirty state can leak into
 * the file interior if the file is extended, resulting in all sorts of
 * bad things happening as the state does not match the underlying data.
 *
 * XXX: this really indicates that bufferheads in XFS need to die.  Warts like
 * this only exist because of bufferheads and how the generic code manages them.
 */
STATIC int
xfs_vm_set_page_dirty(
	struct page		*page)
{
	struct address_space	*mapping = page->mapping;
	struct inode		*inode = mapping->host;
	loff_t			end_offset;
	loff_t			offset;
	int			newly_dirty;

	if (unlikely(!mapping))
		return !TestSetPageDirty(page);

	end_offset = i_size_read(inode);
	offset = page_offset(page);

	spin_lock(&mapping->private_lock);
	if (page_has_buffers(page)) {
		struct buffer_head *head = page_buffers(page);
		struct buffer_head *bh = head;

		do {
			if (offset < end_offset)
				set_buffer_dirty(bh);
			bh = bh->b_this_page;
			offset += 1 << inode->i_blkbits;
		} while (bh != head);
	}
	newly_dirty = !TestSetPageDirty(page);
	spin_unlock(&mapping->private_lock);

	if (newly_dirty) {
		/* sigh - __set_page_dirty() is static, so copy it here, too */
		unsigned long flags;

		spin_lock_irqsave(&mapping->tree_lock, flags);
		if (page->mapping) {	/* Race with truncate? */
			WARN_ON_ONCE(!PageUptodate(page));
			account_page_dirtied(page, mapping);
			radix_tree_tag_set(&mapping->page_tree,
					page_index(page), PAGECACHE_TAG_DIRTY);
		}
		spin_unlock_irqrestore(&mapping->tree_lock, flags);
		__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
	}
	return newly_dirty;
}

const struct address_space_operations xfs_address_space_operations = {
	.readpage		= xfs_vm_readpage,
	.readpages		= xfs_vm_readpages,
	.writepage		= xfs_vm_writepage,
	.writepages		= xfs_vm_writepages,
	.set_page_dirty		= xfs_vm_set_page_dirty,
	.releasepage		= xfs_vm_releasepage,
	.invalidatepage		= xfs_vm_invalidatepage,
	.write_begin		= xfs_vm_write_begin,
	.write_end		= xfs_vm_write_end,
	.bmap			= xfs_vm_bmap,
	.direct_IO		= xfs_vm_direct_IO,
	.migratepage		= buffer_migrate_page,
	.is_partially_uptodate	= block_is_partially_uptodate,
	.error_remove_page	= generic_error_remove_page,
};