/*
 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
 * All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it would be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write the Free Software Foundation,
 * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 */
#include "xfs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
#include "xfs_inode_item.h"
#include "xfs_alloc.h"
#include "xfs_error.h"
#include "xfs_iomap.h"
#include "xfs_trace.h"
#include "xfs_bmap.h"
#include "xfs_bmap_util.h"
#include "xfs_bmap_btree.h"
#include <linux/gfp.h>
#include <linux/mpage.h>
#include <linux/pagevec.h>
#include <linux/writeback.h>

/* flags for direct write completions */
#define XFS_DIO_FLAG_UNWRITTEN	(1 << 0)
#define XFS_DIO_FLAG_APPEND	(1 << 1)

/*
 * structure owned by writepages passed to individual writepage calls
 */
struct xfs_writepage_ctx {
	struct xfs_bmbt_irec	imap;
	bool			imap_valid;
	unsigned int		io_type;
	struct xfs_ioend	*ioend;
	sector_t		last_block;
};

void
xfs_count_page_state(
	struct page		*page,
	int			*delalloc,
	int			*unwritten)
{
	struct buffer_head	*bh, *head;

	*delalloc = *unwritten = 0;

	bh = head = page_buffers(page);
	do {
		if (buffer_unwritten(bh))
			(*unwritten) = 1;
		else if (buffer_delay(bh))
			(*delalloc) = 1;
	} while ((bh = bh->b_this_page) != head);
}

struct block_device *
xfs_find_bdev_for_inode(
	struct inode		*inode)
{
	struct xfs_inode	*ip = XFS_I(inode);
	struct xfs_mount	*mp = ip->i_mount;

	if (XFS_IS_REALTIME_INODE(ip))
		return mp->m_rtdev_targp->bt_bdev;
	else
		return mp->m_ddev_targp->bt_bdev;
}

/*
 * We're now finished for good with this ioend structure.
 * Update the page state via the associated buffer_heads,
 * release holds on the inode and bio, and finally free
 * up memory.  Do not use the ioend after this.
 */
STATIC void
xfs_destroy_ioend(
	xfs_ioend_t		*ioend)
{
	struct buffer_head	*bh, *next;

	for (bh = ioend->io_buffer_head; bh; bh = next) {
		next = bh->b_private;
		bh->b_end_io(bh, !ioend->io_error);
	}

	mempool_free(ioend, xfs_ioend_pool);
}
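
/*
 * Note: buffers belonging to an ioend are chained together through
 * bh->b_private (see xfs_add_to_ioend() below), with io_buffer_head and
 * io_buffer_tail marking the ends of the chain. That is why the loop in
 * xfs_destroy_ioend() above follows bh->b_private rather than b_this_page.
 */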

/*
 * Fast and loose check if this write could update the on-disk inode size.
 */
static inline bool xfs_ioend_is_append(struct xfs_ioend *ioend)
{
	return ioend->io_offset + ioend->io_size >
		XFS_I(ioend->io_inode)->i_d.di_size;
}

STATIC int
xfs_setfilesize_trans_alloc(
	struct xfs_ioend	*ioend)
{
	struct xfs_mount	*mp = XFS_I(ioend->io_inode)->i_mount;
	struct xfs_trans	*tp;
	int			error;

	tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);

	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_fsyncts, 0, 0);
	if (error) {
		xfs_trans_cancel(tp);
		return error;
	}

	ioend->io_append_trans = tp;

	/*
	 * We may pass freeze protection with a transaction.  So tell lockdep
	 * we released it.
	 */
	__sb_writers_release(ioend->io_inode->i_sb, SB_FREEZE_FS);
	/*
	 * We hand off the transaction to the completion thread now, so
	 * clear the flag here.
	 */
	current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
	return 0;
}

/*
 * Update on-disk file size now that data has been written to disk.
 */
STATIC int
xfs_setfilesize(
	struct xfs_inode	*ip,
	struct xfs_trans	*tp,
	xfs_off_t		offset,
	size_t			size)
{
	xfs_fsize_t		isize;

	xfs_ilock(ip, XFS_ILOCK_EXCL);
	isize = xfs_new_eof(ip, offset + size);
	if (!isize) {
		xfs_iunlock(ip, XFS_ILOCK_EXCL);
		xfs_trans_cancel(tp);
		return 0;
	}

	trace_xfs_setfilesize(ip, offset, size);

	ip->i_d.di_size = isize;
	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);

	return xfs_trans_commit(tp);
}

STATIC int
xfs_setfilesize_ioend(
	struct xfs_ioend	*ioend)
{
	struct xfs_inode	*ip = XFS_I(ioend->io_inode);
	struct xfs_trans	*tp = ioend->io_append_trans;

	/*
	 * The transaction may have been allocated in the I/O submission thread,
	 * thus we need to mark ourselves as being in a transaction manually.
	 * Similarly for freeze protection.
	 */
	current_set_flags_nested(&tp->t_pflags, PF_FSTRANS);
	__sb_writers_acquired(VFS_I(ip)->i_sb, SB_FREEZE_FS);

	/* we abort the update if there was an IO error */
	if (ioend->io_error) {
		xfs_trans_cancel(tp);
		return ioend->io_error;
	}

	return xfs_setfilesize(ip, tp, ioend->io_offset, ioend->io_size);
}

/*
 * Schedule IO completion handling on the final put of an ioend.
 *
 * If there is no work to do we might as well call it a day and free the
 * ioend right now.
 */
STATIC void
xfs_finish_ioend(
	struct xfs_ioend	*ioend)
{
	if (atomic_dec_and_test(&ioend->io_remaining)) {
		struct xfs_mount	*mp = XFS_I(ioend->io_inode)->i_mount;

		if (ioend->io_type == XFS_IO_UNWRITTEN)
			queue_work(mp->m_unwritten_workqueue, &ioend->io_work);
		else if (ioend->io_append_trans)
			queue_work(mp->m_data_workqueue, &ioend->io_work);
		else
			xfs_destroy_ioend(ioend);
	}
}
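
/*
 * Note: xfs_finish_ioend() runs from bio completion context via
 * xfs_end_bio(), so any completion work that needs a transaction
 * (unwritten extent conversion, on-disk size updates) is deferred to the
 * per-mount workqueues above and performed by xfs_end_io() in process
 * context. Rough flow for such an ioend:
 *
 *	bio completion -> xfs_end_bio() -> xfs_finish_ioend()
 *		-> queue_work() -> xfs_end_io()
 */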

/*
 * IO write completion.
 */
STATIC void
xfs_end_io(
	struct work_struct *work)
{
	xfs_ioend_t		*ioend = container_of(work, xfs_ioend_t, io_work);
	struct xfs_inode	*ip = XFS_I(ioend->io_inode);
	int			error = 0;

	/*
	 * Set an error if the mount has shut down and proceed with end I/O
	 * processing so it can perform whatever cleanups are necessary.
	 */
	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
		ioend->io_error = -EIO;

	/*
	 * For unwritten extents we need to issue transactions to convert a
	 * range to normal written extents after the data I/O has finished.
	 * Detecting and handling completion IO errors is done individually
	 * for each case as different cleanup operations need to be performed
	 * on error.
	 */
	if (ioend->io_type == XFS_IO_UNWRITTEN) {
		if (ioend->io_error)
			goto done;
		error = xfs_iomap_write_unwritten(ip, ioend->io_offset,
						  ioend->io_size);
	} else if (ioend->io_append_trans) {
		error = xfs_setfilesize_ioend(ioend);
	} else {
		ASSERT(!xfs_ioend_is_append(ioend));
	}

done:
	if (error)
		ioend->io_error = error;
	xfs_destroy_ioend(ioend);
}

/*
 * Allocate and initialise an IO completion structure.
 * We need to track unwritten extent write completion here initially.
 * We'll need to extend this for updating the ondisk inode size later
 * (vs. incore size).
 */
STATIC xfs_ioend_t *
xfs_alloc_ioend(
	struct inode		*inode,
	unsigned int		type)
{
	xfs_ioend_t		*ioend;

	ioend = mempool_alloc(xfs_ioend_pool, GFP_NOFS);

	/*
	 * Set the count to 1 initially, which will prevent an I/O
	 * completion callback that happens before we have started
	 * all the I/O from calling the completion routine too early.
	 */
	atomic_set(&ioend->io_remaining, 1);
	ioend->io_error = 0;
	INIT_LIST_HEAD(&ioend->io_list);
	ioend->io_type = type;
	ioend->io_inode = inode;
	ioend->io_buffer_head = NULL;
	ioend->io_buffer_tail = NULL;
	ioend->io_offset = 0;
	ioend->io_size = 0;
	ioend->io_append_trans = NULL;

	INIT_WORK(&ioend->io_work, xfs_end_io);
	return ioend;
}

STATIC int
xfs_map_blocks(
	struct inode		*inode,
	loff_t			offset,
	struct xfs_bmbt_irec	*imap,
	int			type)
{
	struct xfs_inode	*ip = XFS_I(inode);
	struct xfs_mount	*mp = ip->i_mount;
	ssize_t			count = 1 << inode->i_blkbits;
	xfs_fileoff_t		offset_fsb, end_fsb;
	int			error = 0;
	int			bmapi_flags = XFS_BMAPI_ENTIRE;
	int			nimaps = 1;

	if (XFS_FORCED_SHUTDOWN(mp))
		return -EIO;

	if (type == XFS_IO_UNWRITTEN)
		bmapi_flags |= XFS_BMAPI_IGSTATE;

	xfs_ilock(ip, XFS_ILOCK_SHARED);
	ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
	       (ip->i_df.if_flags & XFS_IFEXTENTS));
	ASSERT(offset <= mp->m_super->s_maxbytes);

	if (offset + count > mp->m_super->s_maxbytes)
		count = mp->m_super->s_maxbytes - offset;
	end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count);
	offset_fsb = XFS_B_TO_FSBT(mp, offset);
	error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb,
				imap, &nimaps, bmapi_flags);
	xfs_iunlock(ip, XFS_ILOCK_SHARED);

	if (error)
		return error;

	if (type == XFS_IO_DELALLOC &&
	    (!nimaps || isnullstartblock(imap->br_startblock))) {
		error = xfs_iomap_write_allocate(ip, offset, imap);
		if (!error)
			trace_xfs_map_blocks_alloc(ip, offset, count, type, imap);
		return error;
	}

#ifdef DEBUG
	if (type == XFS_IO_UNWRITTEN) {
		ASSERT(nimaps);
		ASSERT(imap->br_startblock != HOLESTARTBLOCK);
		ASSERT(imap->br_startblock != DELAYSTARTBLOCK);
	}
#endif
	if (nimaps)
		trace_xfs_map_blocks_found(ip, offset, count, type, imap);
	return 0;
}
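
/*
 * Note: xfs_imap_valid() compares in units of filesystem blocks; the byte
 * offset is converted with a shift by i_blkbits to match the units of
 * br_startoff and br_blockcount in the cached struct xfs_bmbt_irec.
 */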

STATIC bool
xfs_imap_valid(
	struct inode		*inode,
	struct xfs_bmbt_irec	*imap,
	xfs_off_t		offset)
{
	offset >>= inode->i_blkbits;

	return offset >= imap->br_startoff &&
		offset < imap->br_startoff + imap->br_blockcount;
}

/*
 * BIO completion handler for buffered IO.
 */
STATIC void
xfs_end_bio(
	struct bio		*bio)
{
	xfs_ioend_t		*ioend = bio->bi_private;

	if (!ioend->io_error)
		ioend->io_error = bio->bi_error;

	/* Toss bio and pass work off to an xfsdatad thread */
	bio->bi_private = NULL;
	bio->bi_end_io = NULL;
	bio_put(bio);

	xfs_finish_ioend(ioend);
}

STATIC void
xfs_submit_ioend_bio(
	struct writeback_control *wbc,
	xfs_ioend_t		*ioend,
	struct bio		*bio)
{
	atomic_inc(&ioend->io_remaining);
	bio->bi_private = ioend;
	bio->bi_end_io = xfs_end_bio;
	submit_bio(wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE, bio);
}

STATIC struct bio *
xfs_alloc_ioend_bio(
	struct buffer_head	*bh)
{
	struct bio		*bio = bio_alloc(GFP_NOIO, BIO_MAX_PAGES);

	ASSERT(bio->bi_private == NULL);
	bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
	bio->bi_bdev = bh->b_bdev;
	return bio;
}

STATIC void
xfs_start_buffer_writeback(
	struct buffer_head	*bh)
{
	ASSERT(buffer_mapped(bh));
	ASSERT(buffer_locked(bh));
	ASSERT(!buffer_delay(bh));
	ASSERT(!buffer_unwritten(bh));

	mark_buffer_async_write(bh);
	set_buffer_uptodate(bh);
	clear_buffer_dirty(bh);
}

STATIC void
xfs_start_page_writeback(
	struct page		*page,
	int			clear_dirty)
{
	ASSERT(PageLocked(page));
	ASSERT(!PageWriteback(page));

	/*
	 * if the page was not fully cleaned, we need to ensure that the higher
	 * layers come back to it correctly. That means we need to keep the page
	 * dirty, and for WB_SYNC_ALL writeback we need to ensure the
	 * PAGECACHE_TAG_TOWRITE index mark is not removed so another attempt to
	 * write this page in this writeback sweep will be made.
	 */
	if (clear_dirty) {
		clear_page_dirty_for_io(page);
		set_page_writeback(page);
	} else
		set_page_writeback_keepwrite(page);

	unlock_page(page);
}
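
/*
 * bio_add_page() returns the number of bytes actually added, which can be
 * less than bh->b_size once the bio is full. xfs_submit_ioend() below relies
 * on that short return to decide when to submit the current bio and retry
 * the buffer with a freshly allocated one.
 */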

static inline int xfs_bio_add_buffer(struct bio *bio, struct buffer_head *bh)
{
	return bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh));
}

/*
 * Submit all of the bios for an ioend. We are only passed a single ioend at a
 * time; the caller is responsible for chaining prior to submission.
 *
 * If @status is non-zero, it means that we have a situation where some part of
 * the submission process has failed after we have marked pages for writeback
 * and unlocked them. In this situation, we need to fail the ioend chain rather
 * than submit it to IO. This typically only happens on a filesystem shutdown.
 */
STATIC int
xfs_submit_ioend(
	struct writeback_control *wbc,
	xfs_ioend_t		*ioend,
	int			status)
{
	struct buffer_head	*bh;
	struct bio		*bio;
	sector_t		lastblock = 0;

	/* Reserve log space if we might write beyond the on-disk inode size. */
	if (!status &&
	    ioend->io_type != XFS_IO_UNWRITTEN && xfs_ioend_is_append(ioend))
		status = xfs_setfilesize_trans_alloc(ioend);
	/*
	 * If we are failing the IO now, just mark the ioend with an
	 * error and finish it. This will run IO completion immediately
	 * as there is only one reference to the ioend at this point in
	 * time.
	 */
	if (status) {
		ioend->io_error = status;
		xfs_finish_ioend(ioend);
		return status;
	}

	bio = NULL;
	for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) {

		if (!bio) {
retry:
			bio = xfs_alloc_ioend_bio(bh);
		} else if (bh->b_blocknr != lastblock + 1) {
			xfs_submit_ioend_bio(wbc, ioend, bio);
			goto retry;
		}

		if (xfs_bio_add_buffer(bio, bh) != bh->b_size) {
			xfs_submit_ioend_bio(wbc, ioend, bio);
			goto retry;
		}

		lastblock = bh->b_blocknr;
	}
	if (bio)
		xfs_submit_ioend_bio(wbc, ioend, bio);
	xfs_finish_ioend(ioend);
	return 0;
}

/*
 * Test to see if we've been building up a completion structure for
 * earlier buffers -- if so, we try to append to this ioend if we
 * can, otherwise we finish off any current ioend and start another.
 * The finished ioend is queued on @iolist so that the caller can
 * submit it once it has finished processing the dirty page.
 */
STATIC void
xfs_add_to_ioend(
	struct inode		*inode,
	struct buffer_head	*bh,
	xfs_off_t		offset,
	struct xfs_writepage_ctx *wpc,
	struct list_head	*iolist)
{
	if (!wpc->ioend || wpc->io_type != wpc->ioend->io_type ||
	    bh->b_blocknr != wpc->last_block + 1 ||
	    offset != wpc->ioend->io_offset + wpc->ioend->io_size) {
		struct xfs_ioend	*new;

		if (wpc->ioend)
			list_add(&wpc->ioend->io_list, iolist);

		new = xfs_alloc_ioend(inode, wpc->io_type);
		new->io_offset = offset;
		new->io_buffer_head = bh;
		new->io_buffer_tail = bh;
		wpc->ioend = new;
	} else {
		wpc->ioend->io_buffer_tail->b_private = bh;
		wpc->ioend->io_buffer_tail = bh;
	}

	bh->b_private = NULL;
	wpc->ioend->io_size += bh->b_size;
	wpc->last_block = bh->b_blocknr;
	xfs_start_buffer_writeback(bh);
}
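
/*
 * Note on the arithmetic in xfs_map_buffer() below: iomap_bn is a disk
 * address in 512 byte basic blocks (BBSHIFT == 9), while bh->b_blocknr is
 * in filesystem block units (see xfs_alloc_ioend_bio() above, which converts
 * it back to a sector). For example, with 4k blocks (i_blkbits == 12) the
 * daddr is shifted right by 3 to convert it to filesystem blocks, and the
 * byte distance into the extent is shifted right by 12 to get the block
 * index within the mapping.
 */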

STATIC void
xfs_map_buffer(
	struct inode		*inode,
	struct buffer_head	*bh,
	struct xfs_bmbt_irec	*imap,
	xfs_off_t		offset)
{
	sector_t		bn;
	struct xfs_mount	*m = XFS_I(inode)->i_mount;
	xfs_off_t		iomap_offset = XFS_FSB_TO_B(m, imap->br_startoff);
	xfs_daddr_t		iomap_bn = xfs_fsb_to_db(XFS_I(inode), imap->br_startblock);

	ASSERT(imap->br_startblock != HOLESTARTBLOCK);
	ASSERT(imap->br_startblock != DELAYSTARTBLOCK);

	bn = (iomap_bn >> (inode->i_blkbits - BBSHIFT)) +
	      ((offset - iomap_offset) >> inode->i_blkbits);

	ASSERT(bn || XFS_IS_REALTIME_INODE(XFS_I(inode)));

	bh->b_blocknr = bn;
	set_buffer_mapped(bh);
}

STATIC void
xfs_map_at_offset(
	struct inode		*inode,
	struct buffer_head	*bh,
	struct xfs_bmbt_irec	*imap,
	xfs_off_t		offset)
{
	ASSERT(imap->br_startblock != HOLESTARTBLOCK);
	ASSERT(imap->br_startblock != DELAYSTARTBLOCK);

	xfs_map_buffer(inode, bh, imap, offset);
	set_buffer_mapped(bh);
	clear_buffer_delay(bh);
	clear_buffer_unwritten(bh);
}

/*
 * Test if a given page contains at least one buffer of a given @type.
 * If @check_all_buffers is true, then we walk all the buffers in the page to
 * try to find one of the type passed in. If it is not set, then we only check
 * the first buffer on the page for a match.
 */
STATIC bool
xfs_check_page_type(
	struct page		*page,
	unsigned int		type,
	bool			check_all_buffers)
{
	struct buffer_head	*bh;
	struct buffer_head	*head;

	if (PageWriteback(page))
		return false;
	if (!page->mapping)
		return false;
	if (!page_has_buffers(page))
		return false;

	bh = head = page_buffers(page);
	do {
		if (buffer_unwritten(bh)) {
			if (type == XFS_IO_UNWRITTEN)
				return true;
		} else if (buffer_delay(bh)) {
			if (type == XFS_IO_DELALLOC)
				return true;
		} else if (buffer_dirty(bh) && buffer_mapped(bh)) {
			if (type == XFS_IO_OVERWRITE)
				return true;
		}

		/* If we are only checking the first buffer, we are done now. */
		if (!check_all_buffers)
			break;
	} while ((bh = bh->b_this_page) != head);

	return false;
}

STATIC void
xfs_vm_invalidatepage(
	struct page		*page,
	unsigned int		offset,
	unsigned int		length)
{
	trace_xfs_invalidatepage(page->mapping->host, page, offset,
				 length);
	block_invalidatepage(page, offset, length);
}

/*
 * If the page has delalloc buffers on it, we need to punch them out before we
 * invalidate the page. If we don't, we leave a stale delalloc mapping on the
 * inode that can trip a BUG() in xfs_get_blocks() later on if a direct IO read
 * is done on that same region - the delalloc extent is returned when none is
 * supposed to be there.
 *
 * We prevent this by truncating away the delalloc regions on the page before
 * invalidating it. Because they are delalloc, we can do this without needing a
 * transaction. Indeed - if we get ENOSPC errors, we have to be able to do this
 * truncation without a transaction as there is no space left for block
 * reservation (typically why we see an ENOSPC in writeback).
 *
 * This is not a performance critical path, so for now just do the punching a
 * buffer head at a time.
 */
STATIC void
xfs_aops_discard_page(
	struct page		*page)
{
	struct inode		*inode = page->mapping->host;
	struct xfs_inode	*ip = XFS_I(inode);
	struct buffer_head	*bh, *head;
	loff_t			offset = page_offset(page);

	if (!xfs_check_page_type(page, XFS_IO_DELALLOC, true))
		goto out_invalidate;

	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
		goto out_invalidate;

	xfs_alert(ip->i_mount,
		"page discard on page %p, inode 0x%llx, offset %llu.",
			page, ip->i_ino, offset);

	xfs_ilock(ip, XFS_ILOCK_EXCL);
	bh = head = page_buffers(page);
	do {
		int		error;
		xfs_fileoff_t	start_fsb;

		if (!buffer_delay(bh))
			goto next_buffer;

		start_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
		error = xfs_bmap_punch_delalloc_range(ip, start_fsb, 1);
		if (error) {
			/* something screwed, just bail */
			if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
				xfs_alert(ip->i_mount,
			"page discard unable to remove delalloc mapping.");
			}
			break;
		}
next_buffer:
		offset += 1 << inode->i_blkbits;

	} while ((bh = bh->b_this_page) != head);

	xfs_iunlock(ip, XFS_ILOCK_EXCL);
out_invalidate:
	xfs_vm_invalidatepage(page, 0, PAGE_SIZE);
	return;
}

/*
 * We implement an immediate ioend submission policy here to avoid needing to
 * chain multiple ioends and hence nest mempool allocations which can violate
 * forward progress guarantees we need to provide. The current ioend we are
 * adding buffers to is cached on the writepage context, and if the new buffer
 * does not append to the cached ioend it will create a new ioend and cache that
 * instead.
 *
 * If a new ioend is created and cached, the old ioend is returned and queued
 * locally for submission once the entire page is processed or an error has been
 * detected. While ioends are submitted immediately after they are completed,
 * batching optimisations are provided by higher level block plugging.
 *
 * At the end of a writeback pass, there will be a cached ioend remaining on the
 * writepage context that the caller will need to submit.
 */
static int
xfs_writepage_map(
	struct xfs_writepage_ctx *wpc,
	struct writeback_control *wbc,
	struct inode		*inode,
	struct page		*page,
	loff_t			offset,
	__uint64_t		end_offset)
{
	LIST_HEAD(submit_list);
	struct xfs_ioend	*ioend, *next;
	struct buffer_head	*bh, *head;
	ssize_t			len = 1 << inode->i_blkbits;
	int			error = 0;
	int			count = 0;
	int			uptodate = 1;

	bh = head = page_buffers(page);
	offset = page_offset(page);
	do {
		if (offset >= end_offset)
			break;
		if (!buffer_uptodate(bh))
			uptodate = 0;

		/*
		 * set_page_dirty dirties all buffers in a page, independent
		 * of their state.  The dirty state however is entirely
		 * meaningless for holes (!mapped && uptodate), so skip
		 * buffers covering holes here.
		 */
		if (!buffer_mapped(bh) && buffer_uptodate(bh)) {
			wpc->imap_valid = false;
			continue;
		}

		if (buffer_unwritten(bh)) {
			if (wpc->io_type != XFS_IO_UNWRITTEN) {
				wpc->io_type = XFS_IO_UNWRITTEN;
				wpc->imap_valid = false;
			}
		} else if (buffer_delay(bh)) {
			if (wpc->io_type != XFS_IO_DELALLOC) {
				wpc->io_type = XFS_IO_DELALLOC;
				wpc->imap_valid = false;
			}
		} else if (buffer_uptodate(bh)) {
			if (wpc->io_type != XFS_IO_OVERWRITE) {
				wpc->io_type = XFS_IO_OVERWRITE;
				wpc->imap_valid = false;
			}
		} else {
			if (PageUptodate(page))
				ASSERT(buffer_mapped(bh));
			/*
			 * This buffer is not uptodate and will not be
			 * written to disk.  Ensure that we will put any
			 * subsequent writeable buffers into a new
			 * ioend.
			 */
			wpc->imap_valid = false;
			continue;
		}

		if (wpc->imap_valid)
			wpc->imap_valid = xfs_imap_valid(inode, &wpc->imap,
							 offset);
		if (!wpc->imap_valid) {
			error = xfs_map_blocks(inode, offset, &wpc->imap,
					     wpc->io_type);
			if (error)
				goto out;
			wpc->imap_valid = xfs_imap_valid(inode, &wpc->imap,
							 offset);
		}
		if (wpc->imap_valid) {
			lock_buffer(bh);
			if (wpc->io_type != XFS_IO_OVERWRITE)
				xfs_map_at_offset(inode, bh, &wpc->imap, offset);
			xfs_add_to_ioend(inode, bh, offset, wpc, &submit_list);
			count++;
		}

	} while (offset += len, ((bh = bh->b_this_page) != head));

	if (uptodate && bh == head)
		SetPageUptodate(page);

	ASSERT(wpc->ioend || list_empty(&submit_list));

out:
	/*
	 * On error, we have to fail the ioend here because we have locked
	 * buffers in the ioend. If we don't do this, we'll deadlock
	 * invalidating the page as that tries to lock the buffers on the page.
	 * Also, because we may have set pages under writeback, we have to make
	 * sure we run IO completion to mark the error state of the IO
	 * appropriately, so we can't cancel the ioend directly here. That means
	 * we have to mark this page as under writeback if we included any
	 * buffers from it in the ioend chain so that completion treats it
	 * correctly.
	 *
	 * If we didn't include the page in the ioend, then on error we can
	 * simply discard and unlock it as there are no other users of the page
	 * or its buffers right now. The caller will still need to trigger
	 * submission of outstanding ioends on the writepage context so they are
	 * treated correctly on error.
	 */
	if (count) {
		xfs_start_page_writeback(page, !error);

		/*
		 * Preserve the original error if there was one, otherwise catch
		 * submission errors here and propagate into subsequent ioend
		 * submissions.
		 */
		list_for_each_entry_safe(ioend, next, &submit_list, io_list) {
			int error2;

			list_del_init(&ioend->io_list);
			error2 = xfs_submit_ioend(wbc, ioend, error);
			if (error2 && !error)
				error = error2;
		}
	} else if (error) {
		xfs_aops_discard_page(page);
		ClearPageUptodate(page);
		unlock_page(page);
	} else {
		/*
		 * We can end up here with no error and nothing to write if we
		 * race with a partial page truncate on a sub-page block sized
		 * filesystem. In that case we need to mark the page clean.
		 */
		xfs_start_page_writeback(page, 1);
		end_page_writeback(page);
	}

	mapping_set_error(page->mapping, error);
	return error;
}
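
/*
 * xfs_do_writepage() below is the common helper behind both ->writepage and
 * ->writepages: xfs_vm_writepage() calls it directly, while
 * xfs_vm_writepages() passes it to write_cache_pages(). In both cases the
 * caller submits whatever ioend is left cached on the writepage context
 * once the walk has finished.
 */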

/*
 * Write out a dirty page.
 *
 * For delalloc space on the page we need to allocate space and flush it.
 * For unwritten space on the page we need to start the conversion to
 * regular allocated space.
 * For any other dirty buffer heads on the page we should flush them.
 */
STATIC int
xfs_do_writepage(
	struct page		*page,
	struct writeback_control *wbc,
	void			*data)
{
	struct xfs_writepage_ctx *wpc = data;
	struct inode		*inode = page->mapping->host;
	loff_t			offset;
	__uint64_t		end_offset;
	pgoff_t			end_index;

	trace_xfs_writepage(inode, page, 0, 0);

	ASSERT(page_has_buffers(page));

	/*
	 * Refuse to write the page out if we are called from reclaim context.
	 *
	 * This avoids stack overflows when called from deeply used stacks in
	 * random callers for direct reclaim or memcg reclaim.  We explicitly
	 * allow reclaim from kswapd as the stack usage there is relatively low.
	 *
	 * This should never happen except in the case of a VM regression so
	 * warn about it.
	 */
	if (WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD)) ==
			PF_MEMALLOC))
		goto redirty;

	/*
	 * Given that we do not allow direct reclaim to call us, we should
	 * never be called while in a filesystem transaction.
	 */
	if (WARN_ON_ONCE(current->flags & PF_FSTRANS))
		goto redirty;

	/*
	 * Is this page beyond the end of the file?
	 *
	 * The page index is less than the end_index, adjust the end_offset
	 * to the highest offset that this page should represent.
	 * -----------------------------------------------------
	 * |			file mapping	       | <EOF> |
	 * -----------------------------------------------------
	 * | Page ... | Page N-2 | Page N-1 |  Page N  |       |
	 * ^--------------------------------^----------|--------
	 * |     desired writeback range    |      see else    |
	 * ---------------------------------^------------------|
	 */
	offset = i_size_read(inode);
	end_index = offset >> PAGE_SHIFT;
	if (page->index < end_index)
		end_offset = (xfs_off_t)(page->index + 1) << PAGE_SHIFT;
	else {
		/*
		 * Check whether the page to write out is beyond or straddles
		 * i_size or not.
		 * -------------------------------------------------------
		 * |		file mapping		    | <EOF>    |
		 * -------------------------------------------------------
		 * | Page ... | Page N-2 | Page N-1 |  Page N  | Beyond |
		 * ^--------------------------------^----------|---------
		 * |				    | Straddles |
		 * ---------------------------------^----------|--------|
		 */
		unsigned offset_into_page = offset & (PAGE_SIZE - 1);

		/*
		 * Skip the page if it is fully outside i_size, e.g. due to a
		 * truncate operation that is in progress. We must redirty the
		 * page so that reclaim stops reclaiming it. Otherwise
		 * xfs_vm_releasepage() is called on it and gets confused.
		 *
		 * Note that the end_index is unsigned long, it would overflow
		 * if the given offset is greater than 16TB on a 32-bit system
		 * and if we do check the page is fully outside i_size or not
		 * via "if (page->index >= end_index + 1)" as "end_index + 1"
		 * will be evaluated to 0.  Hence this page would be redirtied
		 * and written out repeatedly, which would result in an
		 * infinite loop; the user program that performs this operation
		 * will hang.  Instead, we can detect this situation by
		 * checking if the page to write is totally beyond the i_size
		 * or if its offset is just equal to the EOF.
		 */
		if (page->index > end_index ||
		    (page->index == end_index && offset_into_page == 0))
			goto redirty;

		/*
		 * The page straddles i_size.  It must be zeroed out on each
		 * and every writepage invocation because it may be mmapped.
		 * "A file is mapped in multiples of the page size.  For a file
		 * that is not a multiple of the page size, the remaining
		 * memory is zeroed when mapped, and writes to that region are
		 * not written out to the file."
		 */
		zero_user_segment(page, offset_into_page, PAGE_SIZE);

		/* Adjust the end_offset to the end of file */
		end_offset = offset;
	}

	return xfs_writepage_map(wpc, wbc, inode, page, offset, end_offset);

redirty:
	redirty_page_for_writepage(wbc, page);
	unlock_page(page);
	return 0;
}

STATIC int
xfs_vm_writepage(
	struct page		*page,
	struct writeback_control *wbc)
{
	struct xfs_writepage_ctx wpc = {
		.io_type = XFS_IO_INVALID,
	};
	int			ret;

	ret = xfs_do_writepage(page, wbc, &wpc);
	if (wpc.ioend)
		ret = xfs_submit_ioend(wbc, wpc.ioend, ret);
	return ret;
}

STATIC int
xfs_vm_writepages(
	struct address_space	*mapping,
	struct writeback_control *wbc)
{
	struct xfs_writepage_ctx wpc = {
		.io_type = XFS_IO_INVALID,
	};
	int			ret;

	xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED);
	if (dax_mapping(mapping))
		return dax_writeback_mapping_range(mapping,
				xfs_find_bdev_for_inode(mapping->host), wbc);

	ret = write_cache_pages(mapping, wbc, xfs_do_writepage, &wpc);
	if (wpc.ioend)
		ret = xfs_submit_ioend(wbc, wpc.ioend, ret);
	return ret;
}

/*
 * Called to move a page into cleanable state - and from there
 * to be released. The page should already be clean. We always
 * have buffer heads in this call.
 *
 * Returns 1 if the page is ok to release, 0 otherwise.
 */
STATIC int
xfs_vm_releasepage(
	struct page		*page,
	gfp_t			gfp_mask)
{
	int			delalloc, unwritten;

	trace_xfs_releasepage(page->mapping->host, page, 0, 0);

	xfs_count_page_state(page, &delalloc, &unwritten);

	if (WARN_ON_ONCE(delalloc))
		return 0;
	if (WARN_ON_ONCE(unwritten))
		return 0;

	return try_to_free_buffers(page);
}
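
/*
 * Note: the XFS_DIO_FLAG_* bits defined at the top of this file are stored
 * directly in bh_result->b_private as an integer value (not a pointer) by
 * xfs_map_direct() below, and decoded again from the "private" completion
 * argument in xfs_end_io_direct_write().
 */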

/*
 * When we map a DIO buffer, we may need to pass flags to
 * xfs_end_io_direct_write to tell it what kind of write IO we are doing.
 *
 * Note that for DIO, an IO to the highest supported file block offset (i.e.
 * 2^63 - 1FSB bytes) will result in the offset + count overflowing a signed 64
 * bit variable. Hence if we see this overflow, we have to assume that the IO is
 * extending the file size. We won't know for sure until IO completion is run
 * and the actual max write offset is communicated to the IO completion
 * routine.
 */
static void
xfs_map_direct(
	struct inode		*inode,
	struct buffer_head	*bh_result,
	struct xfs_bmbt_irec	*imap,
	xfs_off_t		offset)
{
	uintptr_t		*flags = (uintptr_t *)&bh_result->b_private;
	xfs_off_t		size = bh_result->b_size;

	trace_xfs_get_blocks_map_direct(XFS_I(inode), offset, size,
		ISUNWRITTEN(imap) ? XFS_IO_UNWRITTEN : XFS_IO_OVERWRITE, imap);

	if (ISUNWRITTEN(imap)) {
		*flags |= XFS_DIO_FLAG_UNWRITTEN;
		set_buffer_defer_completion(bh_result);
	} else if (offset + size > i_size_read(inode) || offset + size < 0) {
		*flags |= XFS_DIO_FLAG_APPEND;
		set_buffer_defer_completion(bh_result);
	}
}

/*
 * If this is O_DIRECT or the mpage code calling, tell them how large the
 * mapping is so that we can avoid repeated get_blocks calls.
 *
 * If the mapping spans EOF, then we have to break the mapping up as the
 * mapping for blocks beyond EOF must be marked new so that sub block regions
 * can be correctly zeroed. We can't do this for mappings within EOF unless
 * the mapping was just allocated or is unwritten, otherwise the callers would
 * overwrite existing data with zeros. Hence we have to split the mapping into
 * a range up to and including EOF, and a second mapping for beyond EOF.
 */
static void
xfs_map_trim_size(
	struct inode		*inode,
	sector_t		iblock,
	struct buffer_head	*bh_result,
	struct xfs_bmbt_irec	*imap,
	xfs_off_t		offset,
	ssize_t			size)
{
	xfs_off_t		mapping_size;

	mapping_size = imap->br_startoff + imap->br_blockcount - iblock;
	mapping_size <<= inode->i_blkbits;

	ASSERT(mapping_size > 0);
	if (mapping_size > size)
		mapping_size = size;
	if (offset < i_size_read(inode) &&
	    offset + mapping_size >= i_size_read(inode)) {
		/* limit mapping to block that spans EOF */
		mapping_size = roundup_64(i_size_read(inode) - offset,
					  1 << inode->i_blkbits);
	}
	if (mapping_size > LONG_MAX)
		mapping_size = LONG_MAX;

	bh_result->b_size = mapping_size;
}

STATIC int
__xfs_get_blocks(
	struct inode		*inode,
	sector_t		iblock,
	struct buffer_head	*bh_result,
	int			create,
	bool			direct,
	bool			dax_fault)
{
	struct xfs_inode	*ip = XFS_I(inode);
	struct xfs_mount	*mp = ip->i_mount;
	xfs_fileoff_t		offset_fsb, end_fsb;
	int			error = 0;
	int			lockmode = 0;
	struct xfs_bmbt_irec	imap;
	int			nimaps = 1;
	xfs_off_t		offset;
	ssize_t			size;
	int			new = 0;

	if (XFS_FORCED_SHUTDOWN(mp))
		return -EIO;

	offset = (xfs_off_t)iblock << inode->i_blkbits;
	ASSERT(bh_result->b_size >= (1 << inode->i_blkbits));
	size = bh_result->b_size;

	if (!create && direct && offset >= i_size_read(inode))
		return 0;

	/*
	 * Direct I/O is usually done on preallocated files, so try getting
	 * a block mapping without an exclusive lock first.  For buffered
	 * writes we already have the exclusive iolock anyway, so avoiding
	 * a lock roundtrip here by taking the ilock exclusive from the
	 * beginning is a useful micro optimization.
	 */
	if (create && !direct) {
		lockmode = XFS_ILOCK_EXCL;
		xfs_ilock(ip, lockmode);
	} else {
		lockmode = xfs_ilock_data_map_shared(ip);
	}

	ASSERT(offset <= mp->m_super->s_maxbytes);
	if (offset + size > mp->m_super->s_maxbytes)
		size = mp->m_super->s_maxbytes - offset;
	end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + size);
	offset_fsb = XFS_B_TO_FSBT(mp, offset);

	error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb,
				&imap, &nimaps, XFS_BMAPI_ENTIRE);
	if (error)
		goto out_unlock;

	/* for DAX, we convert unwritten extents directly */
	if (create &&
	    (!nimaps ||
	     (imap.br_startblock == HOLESTARTBLOCK ||
	      imap.br_startblock == DELAYSTARTBLOCK) ||
	     (IS_DAX(inode) && ISUNWRITTEN(&imap)))) {
		if (direct || xfs_get_extsz_hint(ip)) {
			/*
			 * xfs_iomap_write_direct() expects the shared lock. It
			 * is unlocked on return.
			 */
			if (lockmode == XFS_ILOCK_EXCL)
				xfs_ilock_demote(ip, lockmode);

			error = xfs_iomap_write_direct(ip, offset, size,
						       &imap, nimaps);
			if (error)
				return error;
			new = 1;

		} else {
			/*
			 * Delalloc reservations do not require a transaction,
			 * we can go on without dropping the lock here. If we
			 * are allocating a new delalloc block, make sure that
			 * we set the new flag so that the buffer is marked
			 * new and we know that it is newly allocated if the
			 * write fails.
			 */
			if (nimaps && imap.br_startblock == HOLESTARTBLOCK)
				new = 1;
			error = xfs_iomap_write_delay(ip, offset, size, &imap);
			if (error)
				goto out_unlock;

			xfs_iunlock(ip, lockmode);
		}
		trace_xfs_get_blocks_alloc(ip, offset, size,
				ISUNWRITTEN(&imap) ? XFS_IO_UNWRITTEN
						   : XFS_IO_DELALLOC, &imap);
	} else if (nimaps) {
		trace_xfs_get_blocks_found(ip, offset, size,
				ISUNWRITTEN(&imap) ? XFS_IO_UNWRITTEN
						   : XFS_IO_OVERWRITE, &imap);
		xfs_iunlock(ip, lockmode);
	} else {
		trace_xfs_get_blocks_notfound(ip, offset, size);
		goto out_unlock;
	}

	if (IS_DAX(inode) && create) {
		ASSERT(!ISUNWRITTEN(&imap));
		/* zeroing is not needed at a higher layer */
		new = 0;
	}

	/* trim mapping down to size requested */
	if (direct || size > (1 << inode->i_blkbits))
		xfs_map_trim_size(inode, iblock, bh_result,
				  &imap, offset, size);

	/*
	 * For unwritten extents do not report a disk address in the buffered
	 * read case (treat as if we're reading into a hole).
	 */
	if (imap.br_startblock != HOLESTARTBLOCK &&
	    imap.br_startblock != DELAYSTARTBLOCK &&
	    (create || !ISUNWRITTEN(&imap))) {
		xfs_map_buffer(inode, bh_result, &imap, offset);
		if (ISUNWRITTEN(&imap))
			set_buffer_unwritten(bh_result);
		/* direct IO needs special help */
		if (create && direct) {
			if (dax_fault)
				ASSERT(!ISUNWRITTEN(&imap));
			else
				xfs_map_direct(inode, bh_result, &imap, offset);
		}
	}

	/*
	 * If this is a realtime file, data may be on a different device
	 * to that pointed to from the buffer_head b_bdev currently.
	 */
	bh_result->b_bdev = xfs_find_bdev_for_inode(inode);

	/*
	 * If we previously allocated a block out beyond eof and we are now
	 * coming back to use it then we will need to flag it as new even if it
	 * has a disk address.
	 *
	 * With sub-block writes into unwritten extents we also need to mark
	 * the buffer as new so that the unwritten parts of the buffer get
	 * correctly zeroed.
	 */
	if (create &&
	    ((!buffer_mapped(bh_result) && !buffer_uptodate(bh_result)) ||
	     (offset >= i_size_read(inode)) ||
	     (new || ISUNWRITTEN(&imap))))
		set_buffer_new(bh_result);

	if (imap.br_startblock == DELAYSTARTBLOCK) {
		BUG_ON(direct);
		if (create) {
			set_buffer_uptodate(bh_result);
			set_buffer_mapped(bh_result);
			set_buffer_delay(bh_result);
		}
	}

	return 0;

out_unlock:
	xfs_iunlock(ip, lockmode);
	return error;
}
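
/*
 * The wrappers below select the __xfs_get_blocks() behaviour:
 * xfs_get_blocks() is the buffered path (direct == false),
 * xfs_get_blocks_direct() is used for direct I/O, and
 * xfs_get_blocks_dax_fault() is the direct variant used from DAX page
 * faults (dax_fault == true).
 */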

int
xfs_get_blocks(
	struct inode		*inode,
	sector_t		iblock,
	struct buffer_head	*bh_result,
	int			create)
{
	return __xfs_get_blocks(inode, iblock, bh_result, create, false, false);
}

int
xfs_get_blocks_direct(
	struct inode		*inode,
	sector_t		iblock,
	struct buffer_head	*bh_result,
	int			create)
{
	return __xfs_get_blocks(inode, iblock, bh_result, create, true, false);
}

int
xfs_get_blocks_dax_fault(
	struct inode		*inode,
	sector_t		iblock,
	struct buffer_head	*bh_result,
	int			create)
{
	return __xfs_get_blocks(inode, iblock, bh_result, create, true, true);
}

/*
 * Complete a direct I/O write request.
 *
 * xfs_map_direct passes us some flags in the private data to tell us what to
 * do.  If no flags are set, then the write IO is an overwrite wholly within
 * the existing allocated file size and so there is nothing for us to do.
 *
 * Note that in this case the completion can be called in interrupt context,
 * whereas if we have flags set we will always be called in task context
 * (i.e. from a workqueue).
 */
STATIC int
xfs_end_io_direct_write(
	struct kiocb		*iocb,
	loff_t			offset,
	ssize_t			size,
	void			*private)
{
	struct inode		*inode = file_inode(iocb->ki_filp);
	struct xfs_inode	*ip = XFS_I(inode);
	struct xfs_mount	*mp = ip->i_mount;
	uintptr_t		flags = (uintptr_t)private;
	int			error = 0;

	trace_xfs_end_io_direct_write(ip, offset, size);

	if (XFS_FORCED_SHUTDOWN(mp))
		return -EIO;

	if (size <= 0)
		return size;

	/*
	 * The flags tell us whether we are doing unwritten extent conversions
	 * or an append transaction that updates the on-disk file size. These
	 * cases are the only cases where we should *potentially* be needing
	 * to update the VFS inode size.
	 */
	if (flags == 0) {
		ASSERT(offset + size <= i_size_read(inode));
		return 0;
	}

	/*
	 * We need to update the in-core inode size here so that we don't end up
	 * with the on-disk inode size being outside the in-core inode size. We
	 * have no other method of updating EOF for AIO, so always do it here
	 * if necessary.
	 *
	 * We need to lock the test/set EOF update as we can be racing with
	 * other IO completions here to update the EOF. Failing to serialise
	 * here can result in EOF moving backwards and Bad Things Happen when
	 * that occurs.
	 */
	spin_lock(&ip->i_flags_lock);
	if (offset + size > i_size_read(inode))
		i_size_write(inode, offset + size);
	spin_unlock(&ip->i_flags_lock);

	if (flags & XFS_DIO_FLAG_UNWRITTEN) {
		trace_xfs_end_io_direct_write_unwritten(ip, offset, size);

		error = xfs_iomap_write_unwritten(ip, offset, size);
	} else if (flags & XFS_DIO_FLAG_APPEND) {
		struct xfs_trans *tp;

		trace_xfs_end_io_direct_write_append(ip, offset, size);

		tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);
		error = xfs_trans_reserve(tp, &M_RES(mp)->tr_fsyncts, 0, 0);
		if (error) {
			xfs_trans_cancel(tp);
			return error;
		}
		error = xfs_setfilesize(ip, tp, offset, size);
	}

	return error;
}

STATIC ssize_t
xfs_vm_direct_IO(
	struct kiocb		*iocb,
	struct iov_iter		*iter,
	loff_t			offset)
{
	struct inode		*inode = iocb->ki_filp->f_mapping->host;
	dio_iodone_t		*endio = NULL;
	int			flags = 0;
	struct block_device	*bdev;

	if (iov_iter_rw(iter) == WRITE) {
		endio = xfs_end_io_direct_write;
		flags = DIO_ASYNC_EXTEND;
	}

	if (IS_DAX(inode)) {
		return dax_do_io(iocb, inode, iter, offset,
				 xfs_get_blocks_direct, endio, 0);
	}

	bdev = xfs_find_bdev_for_inode(inode);
	return __blockdev_direct_IO(iocb, inode, bdev, iter, offset,
			xfs_get_blocks_direct, endio, NULL, flags);
}

/*
 * Punch out the delalloc blocks we have already allocated.
 *
 * Don't bother with xfs_setattr given that nothing can have made it to disk
 * yet as the page is still locked at this point.
 */
STATIC void
xfs_vm_kill_delalloc_range(
	struct inode		*inode,
	loff_t			start,
	loff_t			end)
{
	struct xfs_inode	*ip = XFS_I(inode);
	xfs_fileoff_t		start_fsb;
	xfs_fileoff_t		end_fsb;
	int			error;

	start_fsb = XFS_B_TO_FSB(ip->i_mount, start);
	end_fsb = XFS_B_TO_FSB(ip->i_mount, end);
	if (end_fsb <= start_fsb)
		return;

	xfs_ilock(ip, XFS_ILOCK_EXCL);
	error = xfs_bmap_punch_delalloc_range(ip, start_fsb,
						end_fsb - start_fsb);
	if (error) {
		/* something screwed, just bail */
		if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
			xfs_alert(ip->i_mount,
		"xfs_vm_write_failed: unable to clean up ino %lld",
					ip->i_ino);
		}
	}
	xfs_iunlock(ip, XFS_ILOCK_EXCL);
}

STATIC void
xfs_vm_write_failed(
	struct inode		*inode,
	struct page		*page,
	loff_t			pos,
	unsigned		len)
{
	loff_t			block_offset;
	loff_t			block_start;
	loff_t			block_end;
	loff_t			from = pos & (PAGE_SIZE - 1);
	loff_t			to = from + len;
	struct buffer_head	*bh, *head;
	struct xfs_mount	*mp = XFS_I(inode)->i_mount;

	/*
	 * The request pos offset might be 32 or 64 bit, this is all fine
	 * on 64-bit platforms.  However, for a 64-bit pos request on a
	 * 32-bit platform, the high 32 bits will be masked off if we
	 * evaluate the block_offset via (pos & PAGE_MASK) because the
	 * PAGE_MASK is 0xfffff000 as an unsigned long, hence the result is
	 * incorrect, which could cause the following ASSERT to fail in most
	 * cases.  In order to avoid this, we evaluate the block_offset of
	 * the start of the page by using shifts rather than masks, which
	 * avoids the mismatch problem.
	 */
	block_offset = (pos >> PAGE_SHIFT) << PAGE_SHIFT;

	ASSERT(block_offset + from == pos);

	head = page_buffers(page);
	block_start = 0;
	for (bh = head; bh != head || !block_start;
	     bh = bh->b_this_page, block_start = block_end,
				   block_offset += bh->b_size) {
		block_end = block_start + bh->b_size;

		/* skip buffers before the write */
		if (block_end <= from)
			continue;

		/* if the buffer is after the write, we're done */
		if (block_start >= to)
			break;

		/*
		 * Process delalloc and unwritten buffers beyond EOF. We can
		 * encounter unwritten buffers in the event that a file has
		 * post-EOF unwritten extents and an extending write happens to
		 * fail (e.g., an unaligned write that also involves a delalloc
		 * to the same page).
		 */
		if (!buffer_delay(bh) && !buffer_unwritten(bh))
			continue;

		if (!xfs_mp_fail_writes(mp) && !buffer_new(bh) &&
		    block_offset < i_size_read(inode))
			continue;

		if (buffer_delay(bh))
			xfs_vm_kill_delalloc_range(inode, block_offset,
						   block_offset + bh->b_size);

		/*
		 * This buffer does not contain data anymore. Make sure anyone
		 * who finds it knows that for certain.
		 */
		clear_buffer_delay(bh);
		clear_buffer_uptodate(bh);
		clear_buffer_mapped(bh);
		clear_buffer_new(bh);
		clear_buffer_dirty(bh);
		clear_buffer_unwritten(bh);
	}

}
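
/*
 * Note: xfs_mp_fail_writes(), checked above and in xfs_vm_write_begin()
 * below, appears to be a debug-only error injection hook used to exercise
 * these write failure paths in testing.
 */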

/*
 * This used to call block_write_begin(), but it unlocks and releases the page
 * on error, and we need that page to be able to punch stale delalloc blocks
 * out on failure. Hence we copy-n-waste it here and call xfs_vm_write_failed()
 * at the appropriate point.
 */
STATIC int
xfs_vm_write_begin(
	struct file		*file,
	struct address_space	*mapping,
	loff_t			pos,
	unsigned		len,
	unsigned		flags,
	struct page		**pagep,
	void			**fsdata)
{
	pgoff_t			index = pos >> PAGE_SHIFT;
	struct page		*page;
	int			status;
	struct xfs_mount	*mp = XFS_I(mapping->host)->i_mount;

	ASSERT(len <= PAGE_SIZE);

	page = grab_cache_page_write_begin(mapping, index, flags);
	if (!page)
		return -ENOMEM;

	status = __block_write_begin(page, pos, len, xfs_get_blocks);
	if (xfs_mp_fail_writes(mp))
		status = -EIO;
	if (unlikely(status)) {
		struct inode	*inode = mapping->host;
		size_t		isize = i_size_read(inode);

		xfs_vm_write_failed(inode, page, pos, len);
		unlock_page(page);

		/*
		 * If the write is beyond EOF, we only want to kill blocks
		 * allocated in this write, not blocks that were previously
		 * written successfully.
		 */
		if (xfs_mp_fail_writes(mp))
			isize = 0;
		if (pos + len > isize) {
			ssize_t start = max_t(ssize_t, pos, isize);

			truncate_pagecache_range(inode, start, pos + len);
		}

		put_page(page);
		page = NULL;
	}

	*pagep = page;
	return status;
}

/*
 * On failure, we only need to kill delalloc blocks beyond EOF in the range of
 * this specific write because they will never be written. Previous writes
 * beyond EOF where block allocation succeeded do not need to be trashed, so
 * only new blocks from this write should be trashed. For blocks within
 * EOF, generic_write_end() zeros them so they are safe to leave alone and be
 * written with all the other valid data.
 */
STATIC int
xfs_vm_write_end(
	struct file		*file,
	struct address_space	*mapping,
	loff_t			pos,
	unsigned		len,
	unsigned		copied,
	struct page		*page,
	void			*fsdata)
{
	int			ret;

	ASSERT(len <= PAGE_SIZE);

	ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata);
	if (unlikely(ret < len)) {
		struct inode	*inode = mapping->host;
		size_t		isize = i_size_read(inode);
		loff_t		to = pos + len;

		if (to > isize) {
			/* only kill blocks in this write beyond EOF */
			if (pos > isize)
				isize = pos;
			xfs_vm_kill_delalloc_range(inode, isize, to);
			truncate_pagecache_range(inode, isize, to);
		}
	}
	return ret;
}
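
/*
 * Note: xfs_vm_bmap() below flushes dirty data under XFS_IOLOCK_SHARED
 * before calling generic_block_bmap(), presumably so that the mapping
 * reported to bmap() users (e.g. FIBMAP) reflects blocks that have actually
 * been allocated on disk.
 */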

STATIC sector_t
xfs_vm_bmap(
	struct address_space	*mapping,
	sector_t		block)
{
	struct inode		*inode = (struct inode *)mapping->host;
	struct xfs_inode	*ip = XFS_I(inode);

	trace_xfs_vm_bmap(XFS_I(inode));
	xfs_ilock(ip, XFS_IOLOCK_SHARED);
	filemap_write_and_wait(mapping);
	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
	return generic_block_bmap(mapping, block, xfs_get_blocks);
}

STATIC int
xfs_vm_readpage(
	struct file		*unused,
	struct page		*page)
{
	trace_xfs_vm_readpage(page->mapping->host, 1);
	return mpage_readpage(page, xfs_get_blocks);
}

STATIC int
xfs_vm_readpages(
	struct file		*unused,
	struct address_space	*mapping,
	struct list_head	*pages,
	unsigned		nr_pages)
{
	trace_xfs_vm_readpages(mapping->host, nr_pages);
	return mpage_readpages(mapping, pages, nr_pages, xfs_get_blocks);
}

/*
 * This is basically a copy of __set_page_dirty_buffers() with one
 * small tweak: buffers beyond EOF do not get marked dirty. If we mark them
 * dirty, we'll never be able to clean them because we don't write buffers
 * beyond EOF, and that means we can't invalidate pages that span EOF
 * that have been marked dirty. Further, the dirty state can leak into
 * the file interior if the file is extended, resulting in all sorts of
 * bad things happening as the state does not match the underlying data.
 *
 * XXX: this really indicates that bufferheads in XFS need to die. Warts like
 * this only exist because of bufferheads and how the generic code manages them.
 */
STATIC int
xfs_vm_set_page_dirty(
	struct page		*page)
{
	struct address_space	*mapping = page->mapping;
	struct inode		*inode = mapping->host;
	loff_t			end_offset;
	loff_t			offset;
	int			newly_dirty;

	if (unlikely(!mapping))
		return !TestSetPageDirty(page);

	end_offset = i_size_read(inode);
	offset = page_offset(page);

	spin_lock(&mapping->private_lock);
	if (page_has_buffers(page)) {
		struct buffer_head *head = page_buffers(page);
		struct buffer_head *bh = head;

		do {
			if (offset < end_offset)
				set_buffer_dirty(bh);
			bh = bh->b_this_page;
			offset += 1 << inode->i_blkbits;
		} while (bh != head);
	}
	/*
	 * Lock out page->mem_cgroup migration to keep PageDirty
	 * synchronized with per-memcg dirty page counters.
	 */
	lock_page_memcg(page);
	newly_dirty = !TestSetPageDirty(page);
	spin_unlock(&mapping->private_lock);

	if (newly_dirty) {
		/* sigh - __set_page_dirty() is static, so copy it here, too */
		unsigned long flags;

		spin_lock_irqsave(&mapping->tree_lock, flags);
		if (page->mapping) {	/* Race with truncate? */
			WARN_ON_ONCE(!PageUptodate(page));
			account_page_dirtied(page, mapping);
			radix_tree_tag_set(&mapping->page_tree,
					page_index(page), PAGECACHE_TAG_DIRTY);
		}
		spin_unlock_irqrestore(&mapping->tree_lock, flags);
	}
	unlock_page_memcg(page);
	if (newly_dirty)
		__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
	return newly_dirty;
}

const struct address_space_operations xfs_address_space_operations = {
	.readpage		= xfs_vm_readpage,
	.readpages		= xfs_vm_readpages,
	.writepage		= xfs_vm_writepage,
	.writepages		= xfs_vm_writepages,
	.set_page_dirty		= xfs_vm_set_page_dirty,
	.releasepage		= xfs_vm_releasepage,
	.invalidatepage		= xfs_vm_invalidatepage,
	.write_begin		= xfs_vm_write_begin,
	.write_end		= xfs_vm_write_end,
	.bmap			= xfs_vm_bmap,
	.direct_IO		= xfs_vm_direct_IO,
	.migratepage		= buffer_migrate_page,
	.is_partially_uptodate	= block_is_partially_uptodate,
	.error_remove_page	= generic_error_remove_page,
};