/*
 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
 * All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it would be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
 */
#include "xfs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
#include "xfs_inode_item.h"
#include "xfs_alloc.h"
#include "xfs_error.h"
#include "xfs_iomap.h"
#include "xfs_trace.h"
#include "xfs_bmap.h"
#include "xfs_bmap_util.h"
#include "xfs_bmap_btree.h"
#include <linux/gfp.h>
#include <linux/mpage.h>
#include <linux/pagevec.h>
#include <linux/writeback.h>

/* flags for direct write completions */
#define XFS_DIO_FLAG_UNWRITTEN	(1 << 0)
#define XFS_DIO_FLAG_APPEND	(1 << 1)

/*
 * structure owned by writepages passed to individual writepage calls
 */
struct xfs_writepage_ctx {
	struct xfs_bmbt_irec	imap;
	bool			imap_valid;
	unsigned int		io_type;
	struct xfs_ioend	*ioend;
	sector_t		last_block;
};

void
xfs_count_page_state(
	struct page		*page,
	int			*delalloc,
	int			*unwritten)
{
	struct buffer_head	*bh, *head;

	*delalloc = *unwritten = 0;

	bh = head = page_buffers(page);
	do {
		if (buffer_unwritten(bh))
			(*unwritten) = 1;
		else if (buffer_delay(bh))
			(*delalloc) = 1;
	} while ((bh = bh->b_this_page) != head);
}

struct block_device *
xfs_find_bdev_for_inode(
	struct inode		*inode)
{
	struct xfs_inode	*ip = XFS_I(inode);
	struct xfs_mount	*mp = ip->i_mount;

	if (XFS_IS_REALTIME_INODE(ip))
		return mp->m_rtdev_targp->bt_bdev;
	else
		return mp->m_ddev_targp->bt_bdev;
}
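
/*
 * Rough overview of the buffered writeback path implemented below:
 *
 *   xfs_vm_writepage()/xfs_vm_writepages()
 *     -> xfs_do_writepage() for each dirty page
 *       -> xfs_writepage_map() walks the bufferheads, maps them via
 *          xfs_map_blocks() and batches them into ioends with
 *          xfs_add_to_ioend()
 *     -> xfs_submit_ioend() issues the bios
 *       -> xfs_end_bio()/xfs_end_io() run completion processing
 *          (unwritten extent conversion, on-disk size updates) and
 *          xfs_destroy_ioend() finally ends page writeback.
 */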

/*
 * We're now finished for good with this page.  Update the page state via the
 * associated buffer_heads, paying attention to the start and end offsets that
 * we need to process on the page.
 *
 * Landmine Warning: bh->b_end_io() will call end_page_writeback() on the last
 * buffer in the IO. Once it does this, it is unsafe to access the bufferhead or
 * the page at all, as we may be racing with memory reclaim and it can free both
 * the bufferhead chain and the page as it will see the page as clean and
 * unused.
 */
static void
xfs_finish_page_writeback(
	struct inode		*inode,
	struct bio_vec		*bvec,
	int			error)
{
	unsigned int		end = bvec->bv_offset + bvec->bv_len - 1;
	struct buffer_head	*head, *bh, *next;
	unsigned int		off = 0;
	unsigned int		bsize;

	ASSERT(bvec->bv_offset < PAGE_SIZE);
	ASSERT((bvec->bv_offset & ((1 << inode->i_blkbits) - 1)) == 0);
	ASSERT(end < PAGE_SIZE);
	ASSERT((bvec->bv_len & ((1 << inode->i_blkbits) - 1)) == 0);

	bh = head = page_buffers(bvec->bv_page);

	bsize = bh->b_size;
	do {
		next = bh->b_this_page;
		if (off < bvec->bv_offset)
			goto next_bh;
		if (off > end)
			break;
		bh->b_end_io(bh, !error);
next_bh:
		off += bsize;
	} while ((bh = next) != head);
}

/*
 * We're now finished for good with this ioend structure.  Update the page
 * state, release holds on bios, and finally free up memory.  Do not use the
 * ioend after this.
 */
STATIC void
xfs_destroy_ioend(
	struct xfs_ioend	*ioend,
	int			error)
{
	struct inode		*inode = ioend->io_inode;
	struct bio		*last = ioend->io_bio;
	struct bio		*bio, *next;

	for (bio = &ioend->io_inline_bio; bio; bio = next) {
		struct bio_vec	*bvec;
		int		i;

		/*
		 * For the last bio, bi_private points to the ioend, so we
		 * need to explicitly end the iteration here.
		 */
		if (bio == last)
			next = NULL;
		else
			next = bio->bi_private;

		/* walk each page on bio, ending page IO on them */
		bio_for_each_segment_all(bvec, bio, i)
			xfs_finish_page_writeback(inode, bvec, error);

		bio_put(bio);
	}
}

/*
 * Fast and loose check if this write could update the on-disk inode size.
 */
static inline bool xfs_ioend_is_append(struct xfs_ioend *ioend)
{
	return ioend->io_offset + ioend->io_size >
		XFS_I(ioend->io_inode)->i_d.di_size;
}

STATIC int
xfs_setfilesize_trans_alloc(
	struct xfs_ioend	*ioend)
{
	struct xfs_mount	*mp = XFS_I(ioend->io_inode)->i_mount;
	struct xfs_trans	*tp;
	int			error;

	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0, 0, &tp);
	if (error)
		return error;

	ioend->io_append_trans = tp;

	/*
	 * We may pass freeze protection with a transaction.  So tell lockdep
	 * we released it.
	 */
	__sb_writers_release(ioend->io_inode->i_sb, SB_FREEZE_FS);
	/*
	 * We hand off the transaction to the completion thread now, so
	 * clear the flag here.
	 */
	current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
	return 0;
}
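
/*
 * The append transaction allocated above is handed off to the I/O completion
 * side: lockdep is told the freeze protection taken at transaction allocation
 * time was released here, and xfs_setfilesize_ioend() re-acquires it (and
 * PF_FSTRANS) before committing the size update via __xfs_setfilesize() or
 * cancelling the transaction on I/O error.
 */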

/*
 * Update on-disk file size now that data has been written to disk.
 */
STATIC int
__xfs_setfilesize(
	struct xfs_inode	*ip,
	struct xfs_trans	*tp,
	xfs_off_t		offset,
	size_t			size)
{
	xfs_fsize_t		isize;

	xfs_ilock(ip, XFS_ILOCK_EXCL);
	isize = xfs_new_eof(ip, offset + size);
	if (!isize) {
		xfs_iunlock(ip, XFS_ILOCK_EXCL);
		xfs_trans_cancel(tp);
		return 0;
	}

	trace_xfs_setfilesize(ip, offset, size);

	ip->i_d.di_size = isize;
	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);

	return xfs_trans_commit(tp);
}

int
xfs_setfilesize(
	struct xfs_inode	*ip,
	xfs_off_t		offset,
	size_t			size)
{
	struct xfs_mount	*mp = ip->i_mount;
	struct xfs_trans	*tp;
	int			error;

	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0, 0, &tp);
	if (error)
		return error;

	return __xfs_setfilesize(ip, tp, offset, size);
}

STATIC int
xfs_setfilesize_ioend(
	struct xfs_ioend	*ioend,
	int			error)
{
	struct xfs_inode	*ip = XFS_I(ioend->io_inode);
	struct xfs_trans	*tp = ioend->io_append_trans;

	/*
	 * The transaction may have been allocated in the I/O submission thread,
	 * thus we need to mark ourselves as being in a transaction manually.
	 * Similarly for freeze protection.
	 */
	current_set_flags_nested(&tp->t_pflags, PF_FSTRANS);
	__sb_writers_acquired(VFS_I(ip)->i_sb, SB_FREEZE_FS);

	/* we abort the update if there was an IO error */
	if (error) {
		xfs_trans_cancel(tp);
		return error;
	}

	return __xfs_setfilesize(ip, tp, ioend->io_offset, ioend->io_size);
}

/*
 * IO write completion.
 */
STATIC void
xfs_end_io(
	struct work_struct *work)
{
	struct xfs_ioend	*ioend =
		container_of(work, struct xfs_ioend, io_work);
	struct xfs_inode	*ip = XFS_I(ioend->io_inode);
	int			error = ioend->io_bio->bi_error;

	/*
	 * Set an error if the mount has shut down and proceed with end I/O
	 * processing so it can perform whatever cleanups are necessary.
	 */
	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
		error = -EIO;

	/*
	 * For unwritten extents we need to issue transactions to convert a
	 * range to normal written extents after the data I/O has finished.
	 * Detecting and handling completion IO errors is done individually
	 * for each case as different cleanup operations need to be performed
	 * on error.
	 */
	if (ioend->io_type == XFS_IO_UNWRITTEN) {
		if (error)
			goto done;
		error = xfs_iomap_write_unwritten(ip, ioend->io_offset,
						  ioend->io_size);
	} else if (ioend->io_append_trans) {
		error = xfs_setfilesize_ioend(ioend, error);
	} else {
		ASSERT(!xfs_ioend_is_append(ioend));
	}

done:
	xfs_destroy_ioend(ioend, error);
}

STATIC void
xfs_end_bio(
	struct bio		*bio)
{
	struct xfs_ioend	*ioend = bio->bi_private;
	struct xfs_mount	*mp = XFS_I(ioend->io_inode)->i_mount;

	if (ioend->io_type == XFS_IO_UNWRITTEN)
		queue_work(mp->m_unwritten_workqueue, &ioend->io_work);
	else if (ioend->io_append_trans)
		queue_work(mp->m_data_workqueue, &ioend->io_work);
	else
		xfs_destroy_ioend(ioend, bio->bi_error);
}

STATIC int
xfs_map_blocks(
	struct inode		*inode,
	loff_t			offset,
	struct xfs_bmbt_irec	*imap,
	int			type)
{
	struct xfs_inode	*ip = XFS_I(inode);
	struct xfs_mount	*mp = ip->i_mount;
	ssize_t			count = 1 << inode->i_blkbits;
	xfs_fileoff_t		offset_fsb, end_fsb;
	int			error = 0;
	int			bmapi_flags = XFS_BMAPI_ENTIRE;
	int			nimaps = 1;

	if (XFS_FORCED_SHUTDOWN(mp))
		return -EIO;

	if (type == XFS_IO_UNWRITTEN)
		bmapi_flags |= XFS_BMAPI_IGSTATE;

	xfs_ilock(ip, XFS_ILOCK_SHARED);
	ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
	       (ip->i_df.if_flags & XFS_IFEXTENTS));
	ASSERT(offset <= mp->m_super->s_maxbytes);

	if (offset + count > mp->m_super->s_maxbytes)
		count = mp->m_super->s_maxbytes - offset;
	end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count);
	offset_fsb = XFS_B_TO_FSBT(mp, offset);
	error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb,
				imap, &nimaps, bmapi_flags);
	xfs_iunlock(ip, XFS_ILOCK_SHARED);

	if (error)
		return error;

	if (type == XFS_IO_DELALLOC &&
	    (!nimaps || isnullstartblock(imap->br_startblock))) {
		error = xfs_iomap_write_allocate(ip, offset, imap);
		if (!error)
			trace_xfs_map_blocks_alloc(ip, offset, count, type, imap);
		return error;
	}

#ifdef DEBUG
	if (type == XFS_IO_UNWRITTEN) {
		ASSERT(nimaps);
		ASSERT(imap->br_startblock != HOLESTARTBLOCK);
		ASSERT(imap->br_startblock != DELAYSTARTBLOCK);
	}
#endif
	if (nimaps)
		trace_xfs_map_blocks_found(ip, offset, count, type, imap);
	return 0;
}

STATIC bool
xfs_imap_valid(
	struct inode		*inode,
	struct xfs_bmbt_irec	*imap,
	xfs_off_t		offset)
{
	offset >>= inode->i_blkbits;

	return offset >= imap->br_startoff &&
		offset < imap->br_startoff + imap->br_blockcount;
}

STATIC void
xfs_start_buffer_writeback(
	struct buffer_head	*bh)
{
	ASSERT(buffer_mapped(bh));
	ASSERT(buffer_locked(bh));
	ASSERT(!buffer_delay(bh));
	ASSERT(!buffer_unwritten(bh));

	mark_buffer_async_write(bh);
	set_buffer_uptodate(bh);
	clear_buffer_dirty(bh);
}

STATIC void
xfs_start_page_writeback(
	struct page		*page,
	int			clear_dirty)
{
	ASSERT(PageLocked(page));
	ASSERT(!PageWriteback(page));

	/*
	 * if the page was not fully cleaned, we need to ensure that the higher
	 * layers come back to it correctly.  That means we need to keep the page
	 * dirty, and for WB_SYNC_ALL writeback we need to ensure the
	 * PAGECACHE_TAG_TOWRITE index mark is not removed so another attempt to
	 * write this page in this writeback sweep will be made.
	 */
	if (clear_dirty) {
		clear_page_dirty_for_io(page);
		set_page_writeback(page);
	} else
		set_page_writeback_keepwrite(page);

	unlock_page(page);
}

static inline int xfs_bio_add_buffer(struct bio *bio, struct buffer_head *bh)
{
	return bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh));
}

/*
 * Submit the bio for an ioend. We are passed an ioend with a bio attached to
 * it, and we submit that bio. The ioend may be used for multiple bio
 * submissions, so we only want to allocate an append transaction for the ioend
 * once. In the case of multiple bio submission, each bio will take an IO
 * reference to the ioend to ensure that the ioend completion is only done once
 * all bios have been submitted and the ioend is really done.
 *
 * If @status is non-zero, it means that we have a situation where some part of
 * the submission process has failed after we have marked pages for writeback
 * and unlocked them. In this situation, we need to fail the bio and ioend
 * rather than submit it to IO. This typically only happens on a filesystem
 * shutdown.
 */
STATIC int
xfs_submit_ioend(
	struct writeback_control *wbc,
	struct xfs_ioend	*ioend,
	int			status)
{
	/* Reserve log space if we might write beyond the on-disk inode size. */
	if (!status &&
	    ioend->io_type != XFS_IO_UNWRITTEN &&
	    xfs_ioend_is_append(ioend) &&
	    !ioend->io_append_trans)
		status = xfs_setfilesize_trans_alloc(ioend);

	ioend->io_bio->bi_private = ioend;
	ioend->io_bio->bi_end_io = xfs_end_bio;
	bio_set_op_attrs(ioend->io_bio, REQ_OP_WRITE,
			 (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : 0);
	/*
	 * If we are failing the IO now, just mark the ioend with an
	 * error and finish it.  This will run IO completion immediately
	 * as there is only one reference to the ioend at this point in
	 * time.
	 */
	if (status) {
		ioend->io_bio->bi_error = status;
		bio_endio(ioend->io_bio);
		return status;
	}

	submit_bio(ioend->io_bio);
	return 0;
}

static void
xfs_init_bio_from_bh(
	struct bio		*bio,
	struct buffer_head	*bh)
{
	bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
	bio->bi_bdev = bh->b_bdev;
}

static struct xfs_ioend *
xfs_alloc_ioend(
	struct inode		*inode,
	unsigned int		type,
	xfs_off_t		offset,
	struct buffer_head	*bh)
{
	struct xfs_ioend	*ioend;
	struct bio		*bio;

	bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES, xfs_ioend_bioset);
	xfs_init_bio_from_bh(bio, bh);

	ioend = container_of(bio, struct xfs_ioend, io_inline_bio);
	INIT_LIST_HEAD(&ioend->io_list);
	ioend->io_type = type;
	ioend->io_inode = inode;
	ioend->io_size = 0;
	ioend->io_offset = offset;
	INIT_WORK(&ioend->io_work, xfs_end_io);
	ioend->io_append_trans = NULL;
	ioend->io_bio = bio;
	return ioend;
}
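
/*
 * Note that the ioend is not a separate allocation: struct xfs_ioend embeds
 * its first bio (io_inline_bio), which is what the container_of() above
 * relies on.  Additional bios are attached by xfs_chain_bio() below, and
 * xfs_destroy_ioend() releases everything by dropping the bio references.
 */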

/*
 * Allocate a new bio, and chain the old bio to the new one.
 *
 * Note that we have to perform the chaining in this unintuitive order
 * so that the bi_private linkage is set up in the right direction for the
 * traversal in xfs_destroy_ioend().
 */
static void
xfs_chain_bio(
	struct xfs_ioend	*ioend,
	struct writeback_control *wbc,
	struct buffer_head	*bh)
{
	struct bio *new;

	new = bio_alloc(GFP_NOFS, BIO_MAX_PAGES);
	xfs_init_bio_from_bh(new, bh);

	bio_chain(ioend->io_bio, new);
	bio_get(ioend->io_bio);		/* for xfs_destroy_ioend */
	bio_set_op_attrs(ioend->io_bio, REQ_OP_WRITE,
			 (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : 0);
	submit_bio(ioend->io_bio);
	ioend->io_bio = new;
}

/*
 * Test to see if we've been building up a completion structure for
 * earlier buffers -- if so, we try to append to this ioend if we
 * can, otherwise we finish off any current ioend and start another.
 * Any ioend we finish off is added to @iolist so that the caller can
 * submit it once it has finished processing the dirty page.
 */
STATIC void
xfs_add_to_ioend(
	struct inode		*inode,
	struct buffer_head	*bh,
	xfs_off_t		offset,
	struct xfs_writepage_ctx *wpc,
	struct writeback_control *wbc,
	struct list_head	*iolist)
{
	if (!wpc->ioend || wpc->io_type != wpc->ioend->io_type ||
	    bh->b_blocknr != wpc->last_block + 1 ||
	    offset != wpc->ioend->io_offset + wpc->ioend->io_size) {
		if (wpc->ioend)
			list_add(&wpc->ioend->io_list, iolist);
		wpc->ioend = xfs_alloc_ioend(inode, wpc->io_type, offset, bh);
	}

	/*
	 * If the buffer doesn't fit into the bio we need to allocate a new
	 * one.  This shouldn't happen more than once for a given buffer.
	 */
	while (xfs_bio_add_buffer(wpc->ioend->io_bio, bh) != bh->b_size)
		xfs_chain_bio(wpc->ioend, wbc, bh);

	wpc->ioend->io_size += bh->b_size;
	wpc->last_block = bh->b_blocknr;
	xfs_start_buffer_writeback(bh);
}

STATIC void
xfs_map_buffer(
	struct inode		*inode,
	struct buffer_head	*bh,
	struct xfs_bmbt_irec	*imap,
	xfs_off_t		offset)
{
	sector_t		bn;
	struct xfs_mount	*m = XFS_I(inode)->i_mount;
	xfs_off_t		iomap_offset = XFS_FSB_TO_B(m, imap->br_startoff);
	xfs_daddr_t		iomap_bn = xfs_fsb_to_db(XFS_I(inode), imap->br_startblock);

	ASSERT(imap->br_startblock != HOLESTARTBLOCK);
	ASSERT(imap->br_startblock != DELAYSTARTBLOCK);

	bn = (iomap_bn >> (inode->i_blkbits - BBSHIFT)) +
	      ((offset - iomap_offset) >> inode->i_blkbits);

	ASSERT(bn || XFS_IS_REALTIME_INODE(XFS_I(inode)));

	bh->b_blocknr = bn;
	set_buffer_mapped(bh);
}

STATIC void
xfs_map_at_offset(
	struct inode		*inode,
	struct buffer_head	*bh,
	struct xfs_bmbt_irec	*imap,
	xfs_off_t		offset)
{
	ASSERT(imap->br_startblock != HOLESTARTBLOCK);
	ASSERT(imap->br_startblock != DELAYSTARTBLOCK);

	xfs_map_buffer(inode, bh, imap, offset);
	set_buffer_mapped(bh);
	clear_buffer_delay(bh);
	clear_buffer_unwritten(bh);
}

/*
 * Test if a given page contains at least one buffer of a given @type.
 * If @check_all_buffers is true, then we walk all the buffers in the page to
 * try to find one of the type passed in. If it is not set, then the caller only
 * needs to check the first buffer on the page for a match.
 */
STATIC bool
xfs_check_page_type(
	struct page		*page,
	unsigned int		type,
	bool			check_all_buffers)
{
	struct buffer_head	*bh;
	struct buffer_head	*head;

	if (PageWriteback(page))
		return false;
	if (!page->mapping)
		return false;
	if (!page_has_buffers(page))
		return false;

	bh = head = page_buffers(page);
	do {
		if (buffer_unwritten(bh)) {
			if (type == XFS_IO_UNWRITTEN)
				return true;
		} else if (buffer_delay(bh)) {
			if (type == XFS_IO_DELALLOC)
				return true;
		} else if (buffer_dirty(bh) && buffer_mapped(bh)) {
			if (type == XFS_IO_OVERWRITE)
				return true;
		}

		/* If we are only checking the first buffer, we are done now. */
		if (!check_all_buffers)
			break;
	} while ((bh = bh->b_this_page) != head);

	return false;
}

STATIC void
xfs_vm_invalidatepage(
	struct page		*page,
	unsigned int		offset,
	unsigned int		length)
{
	trace_xfs_invalidatepage(page->mapping->host, page, offset,
				 length);
	block_invalidatepage(page, offset, length);
}

/*
 * If the page has delalloc buffers on it, we need to punch them out before we
 * invalidate the page.  If we don't, we leave a stale delalloc mapping on the
 * inode that can trip a BUG() in xfs_get_blocks() later on if a direct IO read
 * is done on that same region - the delalloc extent is returned when none is
 * supposed to be there.
 *
 * We prevent this by truncating away the delalloc regions on the page before
 * invalidating it. Because they are delalloc, we can do this without needing a
 * transaction. Indeed - if we get ENOSPC errors, we have to be able to do this
 * truncation without a transaction as there is no space left for block
 * reservation (typically why we see an ENOSPC in writeback).
 *
 * This is not a performance critical path, so for now just do the punching a
 * buffer head at a time.
 */
STATIC void
xfs_aops_discard_page(
	struct page		*page)
{
	struct inode		*inode = page->mapping->host;
	struct xfs_inode	*ip = XFS_I(inode);
	struct buffer_head	*bh, *head;
	loff_t			offset = page_offset(page);

	if (!xfs_check_page_type(page, XFS_IO_DELALLOC, true))
		goto out_invalidate;

	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
		goto out_invalidate;

	xfs_alert(ip->i_mount,
		"page discard on page %p, inode 0x%llx, offset %llu.",
			page, ip->i_ino, offset);

	xfs_ilock(ip, XFS_ILOCK_EXCL);
	bh = head = page_buffers(page);
	do {
		int		error;
		xfs_fileoff_t	start_fsb;

		if (!buffer_delay(bh))
			goto next_buffer;

		start_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
		error = xfs_bmap_punch_delalloc_range(ip, start_fsb, 1);
		if (error) {
			/* something screwed, just bail */
			if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
				xfs_alert(ip->i_mount,
			"page discard unable to remove delalloc mapping.");
			}
			break;
		}
next_buffer:
		offset += 1 << inode->i_blkbits;

	} while ((bh = bh->b_this_page) != head);

	xfs_iunlock(ip, XFS_ILOCK_EXCL);
out_invalidate:
	xfs_vm_invalidatepage(page, 0, PAGE_SIZE);
	return;
}

/*
 * We implement an immediate ioend submission policy here to avoid needing to
 * chain multiple ioends and hence nest mempool allocations which can violate
 * forward progress guarantees we need to provide. The current ioend we are
 * adding buffers to is cached on the writepage context, and if the new buffer
 * does not append to the cached ioend it will create a new ioend and cache that
 * instead.
 *
 * If a new ioend is created and cached, the old ioend is returned and queued
 * locally for submission once the entire page is processed or an error has been
 * detected.  While ioends are submitted immediately after they are completed,
 * batching optimisations are provided by higher level block plugging.
 *
 * At the end of a writeback pass, there will be a cached ioend remaining on the
 * writepage context that the caller will need to submit.
 */
static int
xfs_writepage_map(
	struct xfs_writepage_ctx *wpc,
	struct writeback_control *wbc,
	struct inode		*inode,
	struct page		*page,
	loff_t			offset,
	__uint64_t		end_offset)
{
	LIST_HEAD(submit_list);
	struct xfs_ioend	*ioend, *next;
	struct buffer_head	*bh, *head;
	ssize_t			len = 1 << inode->i_blkbits;
	int			error = 0;
	int			count = 0;
	int			uptodate = 1;

	bh = head = page_buffers(page);
	offset = page_offset(page);
	do {
		if (offset >= end_offset)
			break;
		if (!buffer_uptodate(bh))
			uptodate = 0;

		/*
		 * set_page_dirty dirties all buffers in a page, independent
		 * of their state.  The dirty state however is entirely
		 * meaningless for holes (!mapped && uptodate), so skip
		 * buffers covering holes here.
		 */
		if (!buffer_mapped(bh) && buffer_uptodate(bh)) {
			wpc->imap_valid = false;
			continue;
		}

		if (buffer_unwritten(bh)) {
			if (wpc->io_type != XFS_IO_UNWRITTEN) {
				wpc->io_type = XFS_IO_UNWRITTEN;
				wpc->imap_valid = false;
			}
		} else if (buffer_delay(bh)) {
			if (wpc->io_type != XFS_IO_DELALLOC) {
				wpc->io_type = XFS_IO_DELALLOC;
				wpc->imap_valid = false;
			}
		} else if (buffer_uptodate(bh)) {
			if (wpc->io_type != XFS_IO_OVERWRITE) {
				wpc->io_type = XFS_IO_OVERWRITE;
				wpc->imap_valid = false;
			}
		} else {
			if (PageUptodate(page))
				ASSERT(buffer_mapped(bh));
			/*
			 * This buffer is not uptodate and will not be
			 * written to disk.  Ensure that we will put any
			 * subsequent writeable buffers into a new
			 * ioend.
			 */
			wpc->imap_valid = false;
			continue;
		}

		if (wpc->imap_valid)
			wpc->imap_valid = xfs_imap_valid(inode, &wpc->imap,
							 offset);
		if (!wpc->imap_valid) {
			error = xfs_map_blocks(inode, offset, &wpc->imap,
					     wpc->io_type);
			if (error)
				goto out;
			wpc->imap_valid = xfs_imap_valid(inode, &wpc->imap,
							 offset);
		}
		if (wpc->imap_valid) {
			lock_buffer(bh);
			if (wpc->io_type != XFS_IO_OVERWRITE)
				xfs_map_at_offset(inode, bh, &wpc->imap, offset);
			xfs_add_to_ioend(inode, bh, offset, wpc, wbc, &submit_list);
			count++;
		}

	} while (offset += len, ((bh = bh->b_this_page) != head));

	if (uptodate && bh == head)
		SetPageUptodate(page);

	ASSERT(wpc->ioend || list_empty(&submit_list));

out:
	/*
	 * On error, we have to fail the ioend here because we have locked
	 * buffers in the ioend. If we don't do this, we'll deadlock
	 * invalidating the page as that tries to lock the buffers on the page.
	 * Also, because we may have set pages under writeback, we have to make
	 * sure we run IO completion to mark the error state of the IO
	 * appropriately, so we can't cancel the ioend directly here.  That means
	 * we have to mark this page as under writeback if we included any
	 * buffers from it in the ioend chain so that completion treats it
	 * correctly.
	 *
	 * If we didn't include the page in the ioend, then on error we can
	 * simply discard and unlock it as there are no other users of the page
	 * or its buffers right now. The caller will still need to trigger
	 * submission of outstanding ioends on the writepage context so they are
	 * treated correctly on error.
	 */
	if (count) {
		xfs_start_page_writeback(page, !error);

		/*
		 * Preserve the original error if there was one, otherwise catch
		 * submission errors here and propagate into subsequent ioend
		 * submissions.
		 */
		list_for_each_entry_safe(ioend, next, &submit_list, io_list) {
			int error2;

			list_del_init(&ioend->io_list);
			error2 = xfs_submit_ioend(wbc, ioend, error);
			if (error2 && !error)
				error = error2;
		}
	} else if (error) {
		xfs_aops_discard_page(page);
		ClearPageUptodate(page);
		unlock_page(page);
	} else {
		/*
		 * We can end up here with no error and nothing to write if we
		 * race with a partial page truncate on a sub-page block sized
		 * filesystem. In that case we need to mark the page clean.
		 */
		xfs_start_page_writeback(page, 1);
		end_page_writeback(page);
	}

	mapping_set_error(page->mapping, error);
	return error;
}

/*
 * Write out a dirty page.
 *
 * For delalloc space on the page we need to allocate space and flush it.
 * For unwritten space on the page we need to start the conversion to
 * regular allocated space.
 * For any other dirty buffer heads on the page we should flush them.
 */
STATIC int
xfs_do_writepage(
	struct page		*page,
	struct writeback_control *wbc,
	void			*data)
{
	struct xfs_writepage_ctx *wpc = data;
	struct inode		*inode = page->mapping->host;
	loff_t			offset;
	__uint64_t		end_offset;
	pgoff_t			end_index;

	trace_xfs_writepage(inode, page, 0, 0);

	ASSERT(page_has_buffers(page));

	/*
	 * Refuse to write the page out if we are called from reclaim context.
	 *
	 * This avoids stack overflows when called from deeply used stacks in
	 * random callers for direct reclaim or memcg reclaim.  We explicitly
	 * allow reclaim from kswapd as the stack usage there is relatively low.
	 *
	 * This should never happen except in the case of a VM regression so
	 * warn about it.
	 */
	if (WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD)) ==
			PF_MEMALLOC))
		goto redirty;

	/*
	 * Given that we do not allow direct reclaim to call us, we should
	 * never be called while in a filesystem transaction.
	 */
	if (WARN_ON_ONCE(current->flags & PF_FSTRANS))
		goto redirty;

	/*
	 * Is this page beyond the end of the file?
	 *
	 * The page index is less than the end_index, adjust the end_offset
	 * to the highest offset that this page should represent.
	 * -----------------------------------------------------
	 * |			file mapping	       | <EOF> |
	 * -----------------------------------------------------
	 * | Page ... | Page N-2 | Page N-1 |  Page N  |       |
	 * ^--------------------------------^----------|--------
	 * |     desired writeback range    |      see else    |
	 * ---------------------------------^------------------|
	 */
	offset = i_size_read(inode);
	end_index = offset >> PAGE_SHIFT;
	if (page->index < end_index)
		end_offset = (xfs_off_t)(page->index + 1) << PAGE_SHIFT;
	else {
		/*
		 * Check whether the page to write out is beyond or straddles
		 * i_size or not.
		 * -------------------------------------------------------
		 * |		file mapping		        | <EOF>  |
		 * -------------------------------------------------------
		 * | Page ... | Page N-2 | Page N-1 |  Page N   | Beyond |
		 * ^--------------------------------^-----------|---------
		 * |				    |      Straddles     |
		 * ---------------------------------^-----------|--------|
		 */
		unsigned offset_into_page = offset & (PAGE_SIZE - 1);

		/*
		 * Skip the page if it is fully outside i_size, e.g. due to a
		 * truncate operation that is in progress. We must redirty the
		 * page so that reclaim stops reclaiming it. Otherwise
		 * xfs_vm_releasepage() is called on it and gets confused.
		 *
		 * Note that end_index is unsigned long.  If the given offset is
		 * greater than 16TB on a 32-bit system, it would overflow, and
		 * if we checked whether the page is fully outside i_size with
		 * "if (page->index >= end_index + 1)", "end_index + 1" would
		 * evaluate to 0.  This page would then be redirtied and written
		 * out repeatedly, resulting in an infinite loop; the user
		 * program performing this operation would hang.  Instead, we
		 * can detect this situation by checking if the page to write is
		 * totally beyond i_size or if its offset is just equal to EOF.
		 */
		if (page->index > end_index ||
		    (page->index == end_index && offset_into_page == 0))
			goto redirty;

		/*
		 * The page straddles i_size.  It must be zeroed out on each
		 * and every writepage invocation because it may be mmapped.
		 * "A file is mapped in multiples of the page size.  For a file
		 * that is not a multiple of the page size, the remaining
		 * memory is zeroed when mapped, and writes to that region are
		 * not written out to the file."
		 */
		zero_user_segment(page, offset_into_page, PAGE_SIZE);

		/* Adjust the end_offset to the end of file */
		end_offset = offset;
	}

	return xfs_writepage_map(wpc, wbc, inode, page, offset, end_offset);

redirty:
	redirty_page_for_writepage(wbc, page);
	unlock_page(page);
	return 0;
}

STATIC int
xfs_vm_writepage(
	struct page		*page,
	struct writeback_control *wbc)
{
	struct xfs_writepage_ctx wpc = {
		.io_type = XFS_IO_INVALID,
	};
	int			ret;

	ret = xfs_do_writepage(page, wbc, &wpc);
	if (wpc.ioend)
		ret = xfs_submit_ioend(wbc, wpc.ioend, ret);
	return ret;
}

STATIC int
xfs_vm_writepages(
	struct address_space	*mapping,
	struct writeback_control *wbc)
{
	struct xfs_writepage_ctx wpc = {
		.io_type = XFS_IO_INVALID,
	};
	int			ret;

	xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED);
	if (dax_mapping(mapping))
		return dax_writeback_mapping_range(mapping,
				xfs_find_bdev_for_inode(mapping->host), wbc);

	ret = write_cache_pages(mapping, wbc, xfs_do_writepage, &wpc);
	if (wpc.ioend)
		ret = xfs_submit_ioend(wbc, wpc.ioend, ret);
	return ret;
}

/*
 * Called to move a page into cleanable state - and from there
 * to be released. The page should already be clean. We always
 * have buffer heads in this call.
 *
 * Returns 1 if the page is ok to release, 0 otherwise.
 */
STATIC int
xfs_vm_releasepage(
	struct page		*page,
	gfp_t			gfp_mask)
{
	int			delalloc, unwritten;

	trace_xfs_releasepage(page->mapping->host, page, 0, 0);

	/*
	 * mm accommodates an old ext3 case where clean pages might not have had
	 * the dirty bit cleared. Thus, it can send actual dirty pages to
	 * ->releasepage() via shrink_active_list(). Conversely,
	 * block_invalidatepage() can send pages that are still marked dirty
	 * but otherwise have invalidated buffers.
	 *
	 * We've historically freed buffers on the latter. Instead, quietly
	 * filter out all dirty pages to avoid spurious buffer state warnings.
	 * This can likely be removed once shrink_active_list() is fixed.
	 */
	if (PageDirty(page))
		return 0;

	xfs_count_page_state(page, &delalloc, &unwritten);

	if (WARN_ON_ONCE(delalloc))
		return 0;
	if (WARN_ON_ONCE(unwritten))
		return 0;

	return try_to_free_buffers(page);
}

/*
 * When we map a DIO buffer, we may need to pass flags to
 * xfs_end_io_direct_write to tell it what kind of write IO we are doing.
 *
 * Note that for DIO, an IO to the highest supported file block offset (i.e.
 * 2^63 - 1FSB bytes) will result in the offset + count overflowing a signed 64
 * bit variable. Hence if we see this overflow, we have to assume that the IO is
 * extending the file size.  We won't know for sure until IO completion is run
 * and the actual max write offset is communicated to the IO completion
 * routine.
 */
static void
xfs_map_direct(
	struct inode		*inode,
	struct buffer_head	*bh_result,
	struct xfs_bmbt_irec	*imap,
	xfs_off_t		offset)
{
	uintptr_t		*flags = (uintptr_t *)&bh_result->b_private;
	xfs_off_t		size = bh_result->b_size;

	trace_xfs_get_blocks_map_direct(XFS_I(inode), offset, size,
		ISUNWRITTEN(imap) ? XFS_IO_UNWRITTEN : XFS_IO_OVERWRITE, imap);

	if (ISUNWRITTEN(imap)) {
		*flags |= XFS_DIO_FLAG_UNWRITTEN;
		set_buffer_defer_completion(bh_result);
	} else if (offset + size > i_size_read(inode) || offset + size < 0) {
		*flags |= XFS_DIO_FLAG_APPEND;
		set_buffer_defer_completion(bh_result);
	}
}

/*
 * If this is O_DIRECT or the mpage code calling, tell them how large the
 * mapping is so that we can avoid repeated get_blocks calls.
 *
 * If the mapping spans EOF, then we have to break the mapping up as the mapping
 * for blocks beyond EOF must be marked new so that sub block regions can be
 * correctly zeroed. We can't do this for mappings within EOF unless the mapping
 * was just allocated or is unwritten, otherwise the callers would overwrite
 * existing data with zeros. Hence we have to split the mapping into a range up
 * to and including EOF, and a second mapping for beyond EOF.
 */
static void
xfs_map_trim_size(
	struct inode		*inode,
	sector_t		iblock,
	struct buffer_head	*bh_result,
	struct xfs_bmbt_irec	*imap,
	xfs_off_t		offset,
	ssize_t			size)
{
	xfs_off_t		mapping_size;

	mapping_size = imap->br_startoff + imap->br_blockcount - iblock;
	mapping_size <<= inode->i_blkbits;

	ASSERT(mapping_size > 0);
	if (mapping_size > size)
		mapping_size = size;
	if (offset < i_size_read(inode) &&
	    offset + mapping_size >= i_size_read(inode)) {
		/* limit mapping to block that spans EOF */
		mapping_size = roundup_64(i_size_read(inode) - offset,
					  1 << inode->i_blkbits);
	}
	if (mapping_size > LONG_MAX)
		mapping_size = LONG_MAX;

	bh_result->b_size = mapping_size;
}

STATIC int
__xfs_get_blocks(
	struct inode		*inode,
	sector_t		iblock,
	struct buffer_head	*bh_result,
	int			create,
	bool			direct,
	bool			dax_fault)
{
	struct xfs_inode	*ip = XFS_I(inode);
	struct xfs_mount	*mp = ip->i_mount;
	xfs_fileoff_t		offset_fsb, end_fsb;
	int			error = 0;
	int			lockmode = 0;
	struct xfs_bmbt_irec	imap;
	int			nimaps = 1;
	xfs_off_t		offset;
	ssize_t			size;
	int			new = 0;

	BUG_ON(create && !direct);

	if (XFS_FORCED_SHUTDOWN(mp))
		return -EIO;

	offset = (xfs_off_t)iblock << inode->i_blkbits;
	ASSERT(bh_result->b_size >= (1 << inode->i_blkbits));
	size = bh_result->b_size;

	if (!create && offset >= i_size_read(inode))
		return 0;

	/*
	 * Direct I/O is usually done on preallocated files, so try getting
	 * a block mapping without an exclusive lock first.
	 */
	lockmode = xfs_ilock_data_map_shared(ip);

	ASSERT(offset <= mp->m_super->s_maxbytes);
	if (offset + size > mp->m_super->s_maxbytes)
		size = mp->m_super->s_maxbytes - offset;
	end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + size);
	offset_fsb = XFS_B_TO_FSBT(mp, offset);

	error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb,
				&imap, &nimaps, XFS_BMAPI_ENTIRE);
	if (error)
		goto out_unlock;

	/* for DAX, we convert unwritten extents directly */
	if (create &&
	    (!nimaps ||
	     (imap.br_startblock == HOLESTARTBLOCK ||
	      imap.br_startblock == DELAYSTARTBLOCK) ||
	     (IS_DAX(inode) && ISUNWRITTEN(&imap)))) {
		/*
		 * xfs_iomap_write_direct() expects the shared lock. It
		 * is unlocked on return.
		 */
		if (lockmode == XFS_ILOCK_EXCL)
			xfs_ilock_demote(ip, lockmode);

		error = xfs_iomap_write_direct(ip, offset, size,
					       &imap, nimaps);
		if (error)
			return error;
		new = 1;

		trace_xfs_get_blocks_alloc(ip, offset, size,
				ISUNWRITTEN(&imap) ? XFS_IO_UNWRITTEN
						   : XFS_IO_DELALLOC, &imap);
	} else if (nimaps) {
		trace_xfs_get_blocks_found(ip, offset, size,
				ISUNWRITTEN(&imap) ? XFS_IO_UNWRITTEN
						   : XFS_IO_OVERWRITE, &imap);
		xfs_iunlock(ip, lockmode);
	} else {
		trace_xfs_get_blocks_notfound(ip, offset, size);
		goto out_unlock;
	}

	if (IS_DAX(inode) && create) {
		ASSERT(!ISUNWRITTEN(&imap));
		/* zeroing is not needed at a higher layer */
		new = 0;
	}

	/* trim mapping down to size requested */
	xfs_map_trim_size(inode, iblock, bh_result, &imap, offset, size);

	/*
	 * For unwritten extents do not report a disk address in the buffered
	 * read case (treat as if we're reading into a hole).
	 */
	if (imap.br_startblock != HOLESTARTBLOCK &&
	    imap.br_startblock != DELAYSTARTBLOCK &&
	    (create || !ISUNWRITTEN(&imap))) {
		xfs_map_buffer(inode, bh_result, &imap, offset);
		if (ISUNWRITTEN(&imap))
			set_buffer_unwritten(bh_result);
		/* direct IO needs special help */
		if (create) {
			if (dax_fault)
				ASSERT(!ISUNWRITTEN(&imap));
			else
				xfs_map_direct(inode, bh_result, &imap, offset);
		}
	}

	/*
	 * If this is a realtime file, data may be on a different device
	 * to that pointed to from the buffer_head b_bdev currently.
	 */
	bh_result->b_bdev = xfs_find_bdev_for_inode(inode);

	/*
	 * If we previously allocated a block out beyond eof and we are now
	 * coming back to use it then we will need to flag it as new even if it
	 * has a disk address.
	 *
	 * With sub-block writes into unwritten extents we also need to mark
	 * the buffer as new so that the unwritten parts of the buffer get
	 * correctly zeroed.
	 */
	if (create &&
	    ((!buffer_mapped(bh_result) && !buffer_uptodate(bh_result)) ||
	     (offset >= i_size_read(inode)) ||
	     (new || ISUNWRITTEN(&imap))))
		set_buffer_new(bh_result);

	BUG_ON(direct && imap.br_startblock == DELAYSTARTBLOCK);

	return 0;

out_unlock:
	xfs_iunlock(ip, lockmode);
	return error;
}

int
xfs_get_blocks(
	struct inode		*inode,
	sector_t		iblock,
	struct buffer_head	*bh_result,
	int			create)
{
	return __xfs_get_blocks(inode, iblock, bh_result, create, false, false);
}

int
xfs_get_blocks_direct(
	struct inode		*inode,
	sector_t		iblock,
	struct buffer_head	*bh_result,
	int			create)
{
	return __xfs_get_blocks(inode, iblock, bh_result, create, true, false);
}

int
xfs_get_blocks_dax_fault(
	struct inode		*inode,
	sector_t		iblock,
	struct buffer_head	*bh_result,
	int			create)
{
	return __xfs_get_blocks(inode, iblock, bh_result, create, true, true);
}

/*
 * Complete a direct I/O write request.
 *
 * xfs_map_direct passes us some flags in the private data to tell us what to
 * do.  If no flags are set, then the write IO is an overwrite wholly within
 * the existing allocated file size and so there is nothing for us to do.
 *
 * Note that in this case the completion can be called in interrupt context,
 * whereas if we have flags set we will always be called in task context
 * (i.e. from a workqueue).
 */
int
xfs_end_io_direct_write(
	struct kiocb		*iocb,
	loff_t			offset,
	ssize_t			size,
	void			*private)
{
	struct inode		*inode = file_inode(iocb->ki_filp);
	struct xfs_inode	*ip = XFS_I(inode);
	uintptr_t		flags = (uintptr_t)private;
	int			error = 0;

	trace_xfs_end_io_direct_write(ip, offset, size);

	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
		return -EIO;

	if (size <= 0)
		return size;

	/*
	 * The flags tell us whether we are doing unwritten extent conversions
	 * or an append transaction that updates the on-disk file size. These
	 * cases are the only cases where we should *potentially* be needing
	 * to update the VFS inode size.
	 */
	if (flags == 0) {
		ASSERT(offset + size <= i_size_read(inode));
		return 0;
	}

	/*
	 * We need to update the in-core inode size here so that we don't end up
	 * with the on-disk inode size being outside the in-core inode size. We
	 * have no other method of updating EOF for AIO, so always do it here
	 * if necessary.
	 *
	 * We need to lock the test/set EOF update as we can be racing with
	 * other IO completions here to update the EOF. Failing to serialise
	 * here can result in EOF moving backwards and Bad Things Happen when
	 * that occurs.
	 */
	spin_lock(&ip->i_flags_lock);
	if (offset + size > i_size_read(inode))
		i_size_write(inode, offset + size);
	spin_unlock(&ip->i_flags_lock);

	if (flags & XFS_DIO_FLAG_UNWRITTEN) {
		trace_xfs_end_io_direct_write_unwritten(ip, offset, size);

		error = xfs_iomap_write_unwritten(ip, offset, size);
	} else if (flags & XFS_DIO_FLAG_APPEND) {
		trace_xfs_end_io_direct_write_append(ip, offset, size);

		error = xfs_setfilesize(ip, offset, size);
	}

	return error;
}

STATIC ssize_t
xfs_vm_direct_IO(
	struct kiocb		*iocb,
	struct iov_iter		*iter)
{
	/*
	 * We just need the method present so that open/fcntl allow direct I/O.
	 */
	return -EINVAL;
}

STATIC sector_t
xfs_vm_bmap(
	struct address_space	*mapping,
	sector_t		block)
{
	struct inode		*inode = (struct inode *)mapping->host;
	struct xfs_inode	*ip = XFS_I(inode);

	trace_xfs_vm_bmap(XFS_I(inode));
	xfs_ilock(ip, XFS_IOLOCK_SHARED);
	filemap_write_and_wait(mapping);
	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
	return generic_block_bmap(mapping, block, xfs_get_blocks);
}

STATIC int
xfs_vm_readpage(
	struct file		*unused,
	struct page		*page)
{
	trace_xfs_vm_readpage(page->mapping->host, 1);
	return mpage_readpage(page, xfs_get_blocks);
}

STATIC int
xfs_vm_readpages(
	struct file		*unused,
	struct address_space	*mapping,
	struct list_head	*pages,
	unsigned		nr_pages)
{
	trace_xfs_vm_readpages(mapping->host, nr_pages);
	return mpage_readpages(mapping, pages, nr_pages, xfs_get_blocks);
}

/*
 * This is basically a copy of __set_page_dirty_buffers() with one
 * small tweak: buffers beyond EOF do not get marked dirty. If we mark them
 * dirty, we'll never be able to clean them because we don't write buffers
 * beyond EOF, and that means we can't invalidate pages that span EOF
 * that have been marked dirty. Further, the dirty state can leak into
 * the file interior if the file is extended, resulting in all sorts of
 * bad things happening as the state does not match the underlying data.
 *
 * XXX: this really indicates that bufferheads in XFS need to die. Warts like
 * this only exist because of bufferheads and how the generic code manages them.
 */
STATIC int
xfs_vm_set_page_dirty(
	struct page		*page)
{
	struct address_space	*mapping = page->mapping;
	struct inode		*inode = mapping->host;
	loff_t			end_offset;
	loff_t			offset;
	int			newly_dirty;

	if (unlikely(!mapping))
		return !TestSetPageDirty(page);

	end_offset = i_size_read(inode);
	offset = page_offset(page);

	spin_lock(&mapping->private_lock);
	if (page_has_buffers(page)) {
		struct buffer_head *head = page_buffers(page);
		struct buffer_head *bh = head;

		do {
			if (offset < end_offset)
				set_buffer_dirty(bh);
			bh = bh->b_this_page;
			offset += 1 << inode->i_blkbits;
		} while (bh != head);
	}
	/*
	 * Lock out page->mem_cgroup migration to keep PageDirty
	 * synchronized with per-memcg dirty page counters.
	 */
	lock_page_memcg(page);
	newly_dirty = !TestSetPageDirty(page);
	spin_unlock(&mapping->private_lock);

	if (newly_dirty) {
		/* sigh - __set_page_dirty() is static, so copy it here, too */
		unsigned long flags;

		spin_lock_irqsave(&mapping->tree_lock, flags);
		if (page->mapping) {	/* Race with truncate? */
			WARN_ON_ONCE(!PageUptodate(page));
			account_page_dirtied(page, mapping);
			radix_tree_tag_set(&mapping->page_tree,
					page_index(page), PAGECACHE_TAG_DIRTY);
		}
		spin_unlock_irqrestore(&mapping->tree_lock, flags);
	}
	unlock_page_memcg(page);
	if (newly_dirty)
		__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
	return newly_dirty;
}

const struct address_space_operations xfs_address_space_operations = {
	.readpage		= xfs_vm_readpage,
	.readpages		= xfs_vm_readpages,
	.writepage		= xfs_vm_writepage,
	.writepages		= xfs_vm_writepages,
	.set_page_dirty		= xfs_vm_set_page_dirty,
	.releasepage		= xfs_vm_releasepage,
	.invalidatepage		= xfs_vm_invalidatepage,
	.bmap			= xfs_vm_bmap,
	.direct_IO		= xfs_vm_direct_IO,
	.migratepage		= buffer_migrate_page,
	.is_partially_uptodate	= block_is_partially_uptodate,
	.error_remove_page	= generic_error_remove_page,
};