1 /* 2 * Copyright (c) 2000-2005 Silicon Graphics, Inc. 3 * All Rights Reserved. 4 * 5 * This program is free software; you can redistribute it and/or 6 * modify it under the terms of the GNU General Public License as 7 * published by the Free Software Foundation. 8 * 9 * This program is distributed in the hope that it would be useful, 10 * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 * GNU General Public License for more details. 13 * 14 * You should have received a copy of the GNU General Public License 15 * along with this program; if not, write the Free Software Foundation, 16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 17 */ 18 #include "xfs.h" 19 #include "xfs_shared.h" 20 #include "xfs_format.h" 21 #include "xfs_log_format.h" 22 #include "xfs_trans_resv.h" 23 #include "xfs_mount.h" 24 #include "xfs_inode.h" 25 #include "xfs_trans.h" 26 #include "xfs_inode_item.h" 27 #include "xfs_alloc.h" 28 #include "xfs_error.h" 29 #include "xfs_iomap.h" 30 #include "xfs_trace.h" 31 #include "xfs_bmap.h" 32 #include "xfs_bmap_util.h" 33 #include "xfs_bmap_btree.h" 34 #include <linux/gfp.h> 35 #include <linux/mpage.h> 36 #include <linux/pagevec.h> 37 #include <linux/writeback.h> 38 39 void 40 xfs_count_page_state( 41 struct page *page, 42 int *delalloc, 43 int *unwritten) 44 { 45 struct buffer_head *bh, *head; 46 47 *delalloc = *unwritten = 0; 48 49 bh = head = page_buffers(page); 50 do { 51 if (buffer_unwritten(bh)) 52 (*unwritten) = 1; 53 else if (buffer_delay(bh)) 54 (*delalloc) = 1; 55 } while ((bh = bh->b_this_page) != head); 56 } 57 58 STATIC struct block_device * 59 xfs_find_bdev_for_inode( 60 struct inode *inode) 61 { 62 struct xfs_inode *ip = XFS_I(inode); 63 struct xfs_mount *mp = ip->i_mount; 64 65 if (XFS_IS_REALTIME_INODE(ip)) 66 return mp->m_rtdev_targp->bt_bdev; 67 else 68 return mp->m_ddev_targp->bt_bdev; 69 } 70 71 /* 72 * We're now finished for good with this ioend structure. 73 * Update the page state via the associated buffer_heads, 74 * release holds on the inode and bio, and finally free 75 * up memory. Do not use the ioend after this. 76 */ 77 STATIC void 78 xfs_destroy_ioend( 79 xfs_ioend_t *ioend) 80 { 81 struct buffer_head *bh, *next; 82 83 for (bh = ioend->io_buffer_head; bh; bh = next) { 84 next = bh->b_private; 85 bh->b_end_io(bh, !ioend->io_error); 86 } 87 88 mempool_free(ioend, xfs_ioend_pool); 89 } 90 91 /* 92 * Fast and loose check if this write could update the on-disk inode size. 93 */ 94 static inline bool xfs_ioend_is_append(struct xfs_ioend *ioend) 95 { 96 return ioend->io_offset + ioend->io_size > 97 XFS_I(ioend->io_inode)->i_d.di_size; 98 } 99 100 STATIC int 101 xfs_setfilesize_trans_alloc( 102 struct xfs_ioend *ioend) 103 { 104 struct xfs_mount *mp = XFS_I(ioend->io_inode)->i_mount; 105 struct xfs_trans *tp; 106 int error; 107 108 tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS); 109 110 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_fsyncts, 0, 0); 111 if (error) { 112 xfs_trans_cancel(tp); 113 return error; 114 } 115 116 ioend->io_append_trans = tp; 117 118 /* 119 * We may pass freeze protection with a transaction. So tell lockdep 120 * we released it. 121 */ 122 __sb_writers_release(ioend->io_inode->i_sb, SB_FREEZE_FS); 123 /* 124 * We hand off the transaction to the completion thread now, so 125 * clear the flag here. 
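 *
 * Illustrative sketch (not extra kernel code): the handoff pattern is "the
 * submitting thread drops the in-transaction and freeze-protection state, the
 * completion worker re-takes it before using the transaction", i.e.:
 *
 *	// submission context (this function)
 *	__sb_writers_release(sb, SB_FREEZE_FS);
 *	current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
 *	ioend->io_append_trans = tp;		// hand off to completion
 *
 *	// completion context (xfs_setfilesize_ioend())
 *	current_set_flags_nested(&tp->t_pflags, PF_FSTRANS);
 *	__sb_writers_acquired(sb, SB_FREEZE_FS);
 *	// ... then xfs_trans_commit(tp) or xfs_trans_cancel(tp)
 *
 * so lockdep and the freeze code always see the state held by whichever
 * thread currently owns the transaction.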
 */
	current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
	return 0;
}

/*
 * Update on-disk file size now that data has been written to disk.
 */
STATIC int
xfs_setfilesize(
	struct xfs_inode	*ip,
	struct xfs_trans	*tp,
	xfs_off_t		offset,
	size_t			size)
{
	xfs_fsize_t		isize;

	xfs_ilock(ip, XFS_ILOCK_EXCL);
	isize = xfs_new_eof(ip, offset + size);
	if (!isize) {
		xfs_iunlock(ip, XFS_ILOCK_EXCL);
		xfs_trans_cancel(tp);
		return 0;
	}

	trace_xfs_setfilesize(ip, offset, size);

	ip->i_d.di_size = isize;
	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);

	return xfs_trans_commit(tp);
}

STATIC int
xfs_setfilesize_ioend(
	struct xfs_ioend	*ioend)
{
	struct xfs_inode	*ip = XFS_I(ioend->io_inode);
	struct xfs_trans	*tp = ioend->io_append_trans;

	/*
	 * The transaction may have been allocated in the I/O submission thread,
	 * thus we need to mark ourselves as being in a transaction manually.
	 * Similarly for freeze protection.
	 */
	current_set_flags_nested(&tp->t_pflags, PF_FSTRANS);
	__sb_writers_acquired(VFS_I(ip)->i_sb, SB_FREEZE_FS);

	/* we abort the update if there was an IO error */
	if (ioend->io_error) {
		xfs_trans_cancel(tp);
		return ioend->io_error;
	}

	return xfs_setfilesize(ip, tp, ioend->io_offset, ioend->io_size);
}

/*
 * Schedule IO completion handling on the final put of an ioend.
 *
 * If there is no work to do we might as well call it a day and free the
 * ioend right now.
 */
STATIC void
xfs_finish_ioend(
	struct xfs_ioend	*ioend)
{
	if (atomic_dec_and_test(&ioend->io_remaining)) {
		struct xfs_mount	*mp = XFS_I(ioend->io_inode)->i_mount;

		if (ioend->io_type == XFS_IO_UNWRITTEN)
			queue_work(mp->m_unwritten_workqueue, &ioend->io_work);
		else if (ioend->io_append_trans)
			queue_work(mp->m_data_workqueue, &ioend->io_work);
		else
			xfs_destroy_ioend(ioend);
	}
}

/*
 * IO write completion.
 */
STATIC void
xfs_end_io(
	struct work_struct *work)
{
	xfs_ioend_t	*ioend = container_of(work, xfs_ioend_t, io_work);
	struct xfs_inode *ip = XFS_I(ioend->io_inode);
	int		error = 0;

	if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
		ioend->io_error = -EIO;
		goto done;
	}

	/*
	 * For unwritten extents we need to issue transactions to convert a
	 * range to normal written extents after the data I/O has finished.
	 * Detecting and handling completion IO errors is done individually
	 * for each case as different cleanup operations need to be performed
	 * on error.
	 */
	if (ioend->io_type == XFS_IO_UNWRITTEN) {
		if (ioend->io_error)
			goto done;
		error = xfs_iomap_write_unwritten(ip, ioend->io_offset,
						  ioend->io_size);
	} else if (ioend->io_append_trans) {
		error = xfs_setfilesize_ioend(ioend);
	} else {
		ASSERT(!xfs_ioend_is_append(ioend));
	}

done:
	if (error)
		ioend->io_error = error;
	xfs_destroy_ioend(ioend);
}

/*
 * Allocate and initialise an IO completion structure.
 * We need to track unwritten extent write completion here initially.
 * We'll need to extend this for updating the ondisk inode size later
 * (vs. incore size).
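 *
 * Illustrative sketch (not part of xfs_aops.c): io_remaining uses the usual
 * "bias reference" pattern - it starts at 1 so that bios completing while the
 * chain is still being built cannot drop the count to zero, and the builder
 * drops that initial reference only once submission is finished. In plain,
 * non-atomic C the shape is roughly:
 *
 *	int remaining = 1;			// builder's bias reference
 *	for (each bio we build and submit)
 *		remaining++;			// one reference per bio
 *	// each bio completion does: if (--remaining == 0) destroy();
 *	if (--remaining == 0)			// drop the bias reference
 *		destroy();			// cf. xfs_destroy_ioend()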
251 */ 252 STATIC xfs_ioend_t * 253 xfs_alloc_ioend( 254 struct inode *inode, 255 unsigned int type) 256 { 257 xfs_ioend_t *ioend; 258 259 ioend = mempool_alloc(xfs_ioend_pool, GFP_NOFS); 260 261 /* 262 * Set the count to 1 initially, which will prevent an I/O 263 * completion callback from happening before we have started 264 * all the I/O from calling the completion routine too early. 265 */ 266 atomic_set(&ioend->io_remaining, 1); 267 ioend->io_error = 0; 268 ioend->io_list = NULL; 269 ioend->io_type = type; 270 ioend->io_inode = inode; 271 ioend->io_buffer_head = NULL; 272 ioend->io_buffer_tail = NULL; 273 ioend->io_offset = 0; 274 ioend->io_size = 0; 275 ioend->io_append_trans = NULL; 276 277 INIT_WORK(&ioend->io_work, xfs_end_io); 278 return ioend; 279 } 280 281 STATIC int 282 xfs_map_blocks( 283 struct inode *inode, 284 loff_t offset, 285 struct xfs_bmbt_irec *imap, 286 int type, 287 int nonblocking) 288 { 289 struct xfs_inode *ip = XFS_I(inode); 290 struct xfs_mount *mp = ip->i_mount; 291 ssize_t count = 1 << inode->i_blkbits; 292 xfs_fileoff_t offset_fsb, end_fsb; 293 int error = 0; 294 int bmapi_flags = XFS_BMAPI_ENTIRE; 295 int nimaps = 1; 296 297 if (XFS_FORCED_SHUTDOWN(mp)) 298 return -EIO; 299 300 if (type == XFS_IO_UNWRITTEN) 301 bmapi_flags |= XFS_BMAPI_IGSTATE; 302 303 if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) { 304 if (nonblocking) 305 return -EAGAIN; 306 xfs_ilock(ip, XFS_ILOCK_SHARED); 307 } 308 309 ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE || 310 (ip->i_df.if_flags & XFS_IFEXTENTS)); 311 ASSERT(offset <= mp->m_super->s_maxbytes); 312 313 if (offset + count > mp->m_super->s_maxbytes) 314 count = mp->m_super->s_maxbytes - offset; 315 end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count); 316 offset_fsb = XFS_B_TO_FSBT(mp, offset); 317 error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, 318 imap, &nimaps, bmapi_flags); 319 xfs_iunlock(ip, XFS_ILOCK_SHARED); 320 321 if (error) 322 return error; 323 324 if (type == XFS_IO_DELALLOC && 325 (!nimaps || isnullstartblock(imap->br_startblock))) { 326 error = xfs_iomap_write_allocate(ip, offset, imap); 327 if (!error) 328 trace_xfs_map_blocks_alloc(ip, offset, count, type, imap); 329 return error; 330 } 331 332 #ifdef DEBUG 333 if (type == XFS_IO_UNWRITTEN) { 334 ASSERT(nimaps); 335 ASSERT(imap->br_startblock != HOLESTARTBLOCK); 336 ASSERT(imap->br_startblock != DELAYSTARTBLOCK); 337 } 338 #endif 339 if (nimaps) 340 trace_xfs_map_blocks_found(ip, offset, count, type, imap); 341 return 0; 342 } 343 344 STATIC int 345 xfs_imap_valid( 346 struct inode *inode, 347 struct xfs_bmbt_irec *imap, 348 xfs_off_t offset) 349 { 350 offset >>= inode->i_blkbits; 351 352 return offset >= imap->br_startoff && 353 offset < imap->br_startoff + imap->br_blockcount; 354 } 355 356 /* 357 * BIO completion handler for buffered IO. 358 */ 359 STATIC void 360 xfs_end_bio( 361 struct bio *bio) 362 { 363 xfs_ioend_t *ioend = bio->bi_private; 364 365 if (!ioend->io_error) 366 ioend->io_error = bio->bi_error; 367 368 /* Toss bio and pass work off to an xfsdatad thread */ 369 bio->bi_private = NULL; 370 bio->bi_end_io = NULL; 371 bio_put(bio); 372 373 xfs_finish_ioend(ioend); 374 } 375 376 STATIC void 377 xfs_submit_ioend_bio( 378 struct writeback_control *wbc, 379 xfs_ioend_t *ioend, 380 struct bio *bio) 381 { 382 atomic_inc(&ioend->io_remaining); 383 bio->bi_private = ioend; 384 bio->bi_end_io = xfs_end_bio; 385 submit_bio(wbc->sync_mode == WB_SYNC_ALL ? 
		   WRITE_SYNC : WRITE, bio);
}

STATIC struct bio *
xfs_alloc_ioend_bio(
	struct buffer_head	*bh)
{
	struct bio		*bio = bio_alloc(GFP_NOIO, BIO_MAX_PAGES);

	ASSERT(bio->bi_private == NULL);
	bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
	bio->bi_bdev = bh->b_bdev;
	return bio;
}

STATIC void
xfs_start_buffer_writeback(
	struct buffer_head	*bh)
{
	ASSERT(buffer_mapped(bh));
	ASSERT(buffer_locked(bh));
	ASSERT(!buffer_delay(bh));
	ASSERT(!buffer_unwritten(bh));

	mark_buffer_async_write(bh);
	set_buffer_uptodate(bh);
	clear_buffer_dirty(bh);
}

STATIC void
xfs_start_page_writeback(
	struct page		*page,
	int			clear_dirty,
	int			buffers)
{
	ASSERT(PageLocked(page));
	ASSERT(!PageWriteback(page));

	/*
	 * if the page was not fully cleaned, we need to ensure that the higher
	 * layers come back to it correctly. That means we need to keep the page
	 * dirty, and for WB_SYNC_ALL writeback we need to ensure the
	 * PAGECACHE_TAG_TOWRITE index mark is not removed so another attempt to
	 * write this page in this writeback sweep will be made.
	 */
	if (clear_dirty) {
		clear_page_dirty_for_io(page);
		set_page_writeback(page);
	} else
		set_page_writeback_keepwrite(page);

	unlock_page(page);

	/* If no buffers on the page are to be written, finish it here */
	if (!buffers)
		end_page_writeback(page);
}

static inline int xfs_bio_add_buffer(struct bio *bio, struct buffer_head *bh)
{
	return bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh));
}

/*
 * Submit all of the bios for all of the ioends we have saved up, covering the
 * initial writepage page and also any probed pages.
 *
 * Because we may have multiple ioends spanning a page, we need to start
 * writeback on all the buffers before we submit them for I/O. If we mark the
 * buffers as we go, we can end up with a page that only has some buffers
 * marked async write, and I/O completion can occur before we mark the other
 * buffers async write.
 *
 * The end result of this is that we trip a bug in end_page_writeback() because
 * we call it twice for the one page as the code in end_buffer_async_write()
 * assumes that all buffers on the page are started at the same time.
 *
 * The fix is two passes across the ioend list - one to start writeback on the
 * buffer_heads, and then submit them for I/O on the second pass.
 *
 * If @fail is non-zero, it means that we have a situation where some part of
 * the submission process has failed after we have marked pages for writeback
 * and unlocked them. In this situation, we need to fail the ioend chain rather
 * than submit it to IO. This typically only happens on a filesystem shutdown.
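 *
 * Illustrative sketch (not kernel code) of the two-pass shape described
 * above, using hypothetical for_each_ioend()/for_each_buffer() helpers:
 *
 *	// pass 1: mark every buffer in every ioend async write first
 *	for_each_ioend(ioend, head)
 *		for_each_buffer(bh, ioend)
 *			mark_buffer_async_write(bh);
 *
 *	// pass 2: only now build and submit bios; any I/O completion that
 *	// runs from here on already sees all of a page's buffers marked
 *	for_each_ioend(ioend, head)
 *		for_each_buffer(bh, ioend)
 *			add_to_bio_and_submit(bh);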
469 */ 470 STATIC void 471 xfs_submit_ioend( 472 struct writeback_control *wbc, 473 xfs_ioend_t *ioend, 474 int fail) 475 { 476 xfs_ioend_t *head = ioend; 477 xfs_ioend_t *next; 478 struct buffer_head *bh; 479 struct bio *bio; 480 sector_t lastblock = 0; 481 482 /* Pass 1 - start writeback */ 483 do { 484 next = ioend->io_list; 485 for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) 486 xfs_start_buffer_writeback(bh); 487 } while ((ioend = next) != NULL); 488 489 /* Pass 2 - submit I/O */ 490 ioend = head; 491 do { 492 next = ioend->io_list; 493 bio = NULL; 494 495 /* 496 * If we are failing the IO now, just mark the ioend with an 497 * error and finish it. This will run IO completion immediately 498 * as there is only one reference to the ioend at this point in 499 * time. 500 */ 501 if (fail) { 502 ioend->io_error = fail; 503 xfs_finish_ioend(ioend); 504 continue; 505 } 506 507 for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) { 508 509 if (!bio) { 510 retry: 511 bio = xfs_alloc_ioend_bio(bh); 512 } else if (bh->b_blocknr != lastblock + 1) { 513 xfs_submit_ioend_bio(wbc, ioend, bio); 514 goto retry; 515 } 516 517 if (xfs_bio_add_buffer(bio, bh) != bh->b_size) { 518 xfs_submit_ioend_bio(wbc, ioend, bio); 519 goto retry; 520 } 521 522 lastblock = bh->b_blocknr; 523 } 524 if (bio) 525 xfs_submit_ioend_bio(wbc, ioend, bio); 526 xfs_finish_ioend(ioend); 527 } while ((ioend = next) != NULL); 528 } 529 530 /* 531 * Cancel submission of all buffer_heads so far in this endio. 532 * Toss the endio too. Only ever called for the initial page 533 * in a writepage request, so only ever one page. 534 */ 535 STATIC void 536 xfs_cancel_ioend( 537 xfs_ioend_t *ioend) 538 { 539 xfs_ioend_t *next; 540 struct buffer_head *bh, *next_bh; 541 542 do { 543 next = ioend->io_list; 544 bh = ioend->io_buffer_head; 545 do { 546 next_bh = bh->b_private; 547 clear_buffer_async_write(bh); 548 /* 549 * The unwritten flag is cleared when added to the 550 * ioend. We're not submitting for I/O so mark the 551 * buffer unwritten again for next time around. 552 */ 553 if (ioend->io_type == XFS_IO_UNWRITTEN) 554 set_buffer_unwritten(bh); 555 unlock_buffer(bh); 556 } while ((bh = next_bh) != NULL); 557 558 mempool_free(ioend, xfs_ioend_pool); 559 } while ((ioend = next) != NULL); 560 } 561 562 /* 563 * Test to see if we've been building up a completion structure for 564 * earlier buffers -- if so, we try to append to this ioend if we 565 * can, otherwise we finish off any current ioend and start another. 566 * Return true if we've finished the given ioend. 
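 *
 * Illustrative note (not from the kernel sources): what gets built here is a
 * chain of ioends, each carrying a singly linked list of buffer_heads that is
 * threaded through bh->b_private, roughly:
 *
 *	ioend0 --io_list--> ioend1 --io_list--> NULL
 *	ioend0->io_buffer_head -> bh -> bh -> ... -> NULL   (links: b_private)
 *
 * A new ioend is started whenever the IO type changes, the buffers stop being
 * contiguous, or the caller asks for one; xfs_submit_ioend() then walks this
 * exact structure in its two passes, and xfs_destroy_ioend() uses the same
 * b_private links to run b_end_io on every buffer.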
567 */ 568 STATIC void 569 xfs_add_to_ioend( 570 struct inode *inode, 571 struct buffer_head *bh, 572 xfs_off_t offset, 573 unsigned int type, 574 xfs_ioend_t **result, 575 int need_ioend) 576 { 577 xfs_ioend_t *ioend = *result; 578 579 if (!ioend || need_ioend || type != ioend->io_type) { 580 xfs_ioend_t *previous = *result; 581 582 ioend = xfs_alloc_ioend(inode, type); 583 ioend->io_offset = offset; 584 ioend->io_buffer_head = bh; 585 ioend->io_buffer_tail = bh; 586 if (previous) 587 previous->io_list = ioend; 588 *result = ioend; 589 } else { 590 ioend->io_buffer_tail->b_private = bh; 591 ioend->io_buffer_tail = bh; 592 } 593 594 bh->b_private = NULL; 595 ioend->io_size += bh->b_size; 596 } 597 598 STATIC void 599 xfs_map_buffer( 600 struct inode *inode, 601 struct buffer_head *bh, 602 struct xfs_bmbt_irec *imap, 603 xfs_off_t offset) 604 { 605 sector_t bn; 606 struct xfs_mount *m = XFS_I(inode)->i_mount; 607 xfs_off_t iomap_offset = XFS_FSB_TO_B(m, imap->br_startoff); 608 xfs_daddr_t iomap_bn = xfs_fsb_to_db(XFS_I(inode), imap->br_startblock); 609 610 ASSERT(imap->br_startblock != HOLESTARTBLOCK); 611 ASSERT(imap->br_startblock != DELAYSTARTBLOCK); 612 613 bn = (iomap_bn >> (inode->i_blkbits - BBSHIFT)) + 614 ((offset - iomap_offset) >> inode->i_blkbits); 615 616 ASSERT(bn || XFS_IS_REALTIME_INODE(XFS_I(inode))); 617 618 bh->b_blocknr = bn; 619 set_buffer_mapped(bh); 620 } 621 622 STATIC void 623 xfs_map_at_offset( 624 struct inode *inode, 625 struct buffer_head *bh, 626 struct xfs_bmbt_irec *imap, 627 xfs_off_t offset) 628 { 629 ASSERT(imap->br_startblock != HOLESTARTBLOCK); 630 ASSERT(imap->br_startblock != DELAYSTARTBLOCK); 631 632 xfs_map_buffer(inode, bh, imap, offset); 633 set_buffer_mapped(bh); 634 clear_buffer_delay(bh); 635 clear_buffer_unwritten(bh); 636 } 637 638 /* 639 * Test if a given page contains at least one buffer of a given @type. 640 * If @check_all_buffers is true, then we walk all the buffers in the page to 641 * try to find one of the type passed in. If it is not set, then the caller only 642 * needs to check the first buffer on the page for a match. 643 */ 644 STATIC bool 645 xfs_check_page_type( 646 struct page *page, 647 unsigned int type, 648 bool check_all_buffers) 649 { 650 struct buffer_head *bh; 651 struct buffer_head *head; 652 653 if (PageWriteback(page)) 654 return false; 655 if (!page->mapping) 656 return false; 657 if (!page_has_buffers(page)) 658 return false; 659 660 bh = head = page_buffers(page); 661 do { 662 if (buffer_unwritten(bh)) { 663 if (type == XFS_IO_UNWRITTEN) 664 return true; 665 } else if (buffer_delay(bh)) { 666 if (type == XFS_IO_DELALLOC) 667 return true; 668 } else if (buffer_dirty(bh) && buffer_mapped(bh)) { 669 if (type == XFS_IO_OVERWRITE) 670 return true; 671 } 672 673 /* If we are only checking the first buffer, we are done now. */ 674 if (!check_all_buffers) 675 break; 676 } while ((bh = bh->b_this_page) != head); 677 678 return false; 679 } 680 681 /* 682 * Allocate & map buffers for page given the extent map. Write it out. 683 * except for the original page of a writepage, this is called on 684 * delalloc/unwritten pages only, for the original page it is possible 685 * that the page has no mapping at all. 
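 *
 * Worked example for the xfs_imap_valid() checks used below (illustrative,
 * assuming 4k filesystem blocks, i.e. i_blkbits == 12):
 *
 *	imap->br_startoff   = 100;	// first file system block mapped
 *	imap->br_blockcount = 16;	// so the map covers FSBs [100, 116)
 *
 *	offset = 450560;		// byte offset; 450560 >> 12 == 110,
 *					// inside [100, 116)  -> imap is valid
 *	offset = 475136;		// 475136 >> 12 == 116, one past the
 *					// mapping            -> need a new imap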
686 */ 687 STATIC int 688 xfs_convert_page( 689 struct inode *inode, 690 struct page *page, 691 loff_t tindex, 692 struct xfs_bmbt_irec *imap, 693 xfs_ioend_t **ioendp, 694 struct writeback_control *wbc) 695 { 696 struct buffer_head *bh, *head; 697 xfs_off_t end_offset; 698 unsigned long p_offset; 699 unsigned int type; 700 int len, page_dirty; 701 int count = 0, done = 0, uptodate = 1; 702 xfs_off_t offset = page_offset(page); 703 704 if (page->index != tindex) 705 goto fail; 706 if (!trylock_page(page)) 707 goto fail; 708 if (PageWriteback(page)) 709 goto fail_unlock_page; 710 if (page->mapping != inode->i_mapping) 711 goto fail_unlock_page; 712 if (!xfs_check_page_type(page, (*ioendp)->io_type, false)) 713 goto fail_unlock_page; 714 715 /* 716 * page_dirty is initially a count of buffers on the page before 717 * EOF and is decremented as we move each into a cleanable state. 718 * 719 * Derivation: 720 * 721 * End offset is the highest offset that this page should represent. 722 * If we are on the last page, (end_offset & (PAGE_CACHE_SIZE - 1)) 723 * will evaluate non-zero and be less than PAGE_CACHE_SIZE and 724 * hence give us the correct page_dirty count. On any other page, 725 * it will be zero and in that case we need page_dirty to be the 726 * count of buffers on the page. 727 */ 728 end_offset = min_t(unsigned long long, 729 (xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT, 730 i_size_read(inode)); 731 732 /* 733 * If the current map does not span the entire page we are about to try 734 * to write, then give up. The only way we can write a page that spans 735 * multiple mappings in a single writeback iteration is via the 736 * xfs_vm_writepage() function. Data integrity writeback requires the 737 * entire page to be written in a single attempt, otherwise the part of 738 * the page we don't write here doesn't get written as part of the data 739 * integrity sync. 740 * 741 * For normal writeback, we also don't attempt to write partial pages 742 * here as it simply means that write_cache_pages() will see it under 743 * writeback and ignore the page until some point in the future, at 744 * which time this will be the only page in the file that needs 745 * writeback. Hence for more optimal IO patterns, we should always 746 * avoid partial page writeback due to multiple mappings on a page here. 747 */ 748 if (!xfs_imap_valid(inode, imap, end_offset)) 749 goto fail_unlock_page; 750 751 len = 1 << inode->i_blkbits; 752 p_offset = min_t(unsigned long, end_offset & (PAGE_CACHE_SIZE - 1), 753 PAGE_CACHE_SIZE); 754 p_offset = p_offset ? roundup(p_offset, len) : PAGE_CACHE_SIZE; 755 page_dirty = p_offset / len; 756 757 /* 758 * The moment we find a buffer that doesn't match our current type 759 * specification or can't be written, abort the loop and start 760 * writeback. As per the above xfs_imap_valid() check, only 761 * xfs_vm_writepage() can handle partial page writeback fully - we are 762 * limited here to the buffers that are contiguous with the current 763 * ioend, and hence a buffer we can't write breaks that contiguity and 764 * we have to defer the rest of the IO to xfs_vm_writepage(). 
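 *
 * Worked example for the page_dirty derivation above (illustrative, assuming
 * PAGE_CACHE_SIZE == 4096 and 1k blocks, so len == 1024):
 *
 *	i_size_read(inode) == 10000, page->index == 2	// page covers [8192, 12288)
 *	end_offset = min(3 << 12, 10000)   = 10000
 *	p_offset   = 10000 & 4095          = 1808	// bytes of this page below EOF
 *	p_offset   = roundup(1808, 1024)   = 2048
 *	page_dirty = 2048 / 1024           = 2		// two buffers sit before EOF
 *
 * For a page wholly inside EOF, end_offset & 4095 is 0, p_offset becomes
 * PAGE_CACHE_SIZE and page_dirty counts every buffer on the page.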
765 */ 766 bh = head = page_buffers(page); 767 do { 768 if (offset >= end_offset) 769 break; 770 if (!buffer_uptodate(bh)) 771 uptodate = 0; 772 if (!(PageUptodate(page) || buffer_uptodate(bh))) { 773 done = 1; 774 break; 775 } 776 777 if (buffer_unwritten(bh) || buffer_delay(bh) || 778 buffer_mapped(bh)) { 779 if (buffer_unwritten(bh)) 780 type = XFS_IO_UNWRITTEN; 781 else if (buffer_delay(bh)) 782 type = XFS_IO_DELALLOC; 783 else 784 type = XFS_IO_OVERWRITE; 785 786 /* 787 * imap should always be valid because of the above 788 * partial page end_offset check on the imap. 789 */ 790 ASSERT(xfs_imap_valid(inode, imap, offset)); 791 792 lock_buffer(bh); 793 if (type != XFS_IO_OVERWRITE) 794 xfs_map_at_offset(inode, bh, imap, offset); 795 xfs_add_to_ioend(inode, bh, offset, type, 796 ioendp, done); 797 798 page_dirty--; 799 count++; 800 } else { 801 done = 1; 802 break; 803 } 804 } while (offset += len, (bh = bh->b_this_page) != head); 805 806 if (uptodate && bh == head) 807 SetPageUptodate(page); 808 809 if (count) { 810 if (--wbc->nr_to_write <= 0 && 811 wbc->sync_mode == WB_SYNC_NONE) 812 done = 1; 813 } 814 xfs_start_page_writeback(page, !page_dirty, count); 815 816 return done; 817 fail_unlock_page: 818 unlock_page(page); 819 fail: 820 return 1; 821 } 822 823 /* 824 * Convert & write out a cluster of pages in the same extent as defined 825 * by mp and following the start page. 826 */ 827 STATIC void 828 xfs_cluster_write( 829 struct inode *inode, 830 pgoff_t tindex, 831 struct xfs_bmbt_irec *imap, 832 xfs_ioend_t **ioendp, 833 struct writeback_control *wbc, 834 pgoff_t tlast) 835 { 836 struct pagevec pvec; 837 int done = 0, i; 838 839 pagevec_init(&pvec, 0); 840 while (!done && tindex <= tlast) { 841 unsigned len = min_t(pgoff_t, PAGEVEC_SIZE, tlast - tindex + 1); 842 843 if (!pagevec_lookup(&pvec, inode->i_mapping, tindex, len)) 844 break; 845 846 for (i = 0; i < pagevec_count(&pvec); i++) { 847 done = xfs_convert_page(inode, pvec.pages[i], tindex++, 848 imap, ioendp, wbc); 849 if (done) 850 break; 851 } 852 853 pagevec_release(&pvec); 854 cond_resched(); 855 } 856 } 857 858 STATIC void 859 xfs_vm_invalidatepage( 860 struct page *page, 861 unsigned int offset, 862 unsigned int length) 863 { 864 trace_xfs_invalidatepage(page->mapping->host, page, offset, 865 length); 866 block_invalidatepage(page, offset, length); 867 } 868 869 /* 870 * If the page has delalloc buffers on it, we need to punch them out before we 871 * invalidate the page. If we don't, we leave a stale delalloc mapping on the 872 * inode that can trip a BUG() in xfs_get_blocks() later on if a direct IO read 873 * is done on that same region - the delalloc extent is returned when none is 874 * supposed to be there. 875 * 876 * We prevent this by truncating away the delalloc regions on the page before 877 * invalidating it. Because they are delalloc, we can do this without needing a 878 * transaction. Indeed - if we get ENOSPC errors, we have to be able to do this 879 * truncation without a transaction as there is no space left for block 880 * reservation (typically why we see a ENOSPC in writeback). 881 * 882 * This is not a performance critical path, so for now just do the punching a 883 * buffer head at a time. 
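 *
 * Illustrative note (not from the kernel sources): "a buffer head at a time"
 * means one filesystem block per xfs_bmap_punch_delalloc_range() call. With
 * 4k blocks the loop below effectively does, for every delalloc buffer:
 *
 *	start_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);	// byte offset divided
 *							// by the block size
 *	xfs_bmap_punch_delalloc_range(ip, start_fsb, 1);
 *	offset += 4096;					// next buffer
 *
 * rather than coalescing the whole delalloc range into a single punch.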
884 */ 885 STATIC void 886 xfs_aops_discard_page( 887 struct page *page) 888 { 889 struct inode *inode = page->mapping->host; 890 struct xfs_inode *ip = XFS_I(inode); 891 struct buffer_head *bh, *head; 892 loff_t offset = page_offset(page); 893 894 if (!xfs_check_page_type(page, XFS_IO_DELALLOC, true)) 895 goto out_invalidate; 896 897 if (XFS_FORCED_SHUTDOWN(ip->i_mount)) 898 goto out_invalidate; 899 900 xfs_alert(ip->i_mount, 901 "page discard on page %p, inode 0x%llx, offset %llu.", 902 page, ip->i_ino, offset); 903 904 xfs_ilock(ip, XFS_ILOCK_EXCL); 905 bh = head = page_buffers(page); 906 do { 907 int error; 908 xfs_fileoff_t start_fsb; 909 910 if (!buffer_delay(bh)) 911 goto next_buffer; 912 913 start_fsb = XFS_B_TO_FSBT(ip->i_mount, offset); 914 error = xfs_bmap_punch_delalloc_range(ip, start_fsb, 1); 915 if (error) { 916 /* something screwed, just bail */ 917 if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { 918 xfs_alert(ip->i_mount, 919 "page discard unable to remove delalloc mapping."); 920 } 921 break; 922 } 923 next_buffer: 924 offset += 1 << inode->i_blkbits; 925 926 } while ((bh = bh->b_this_page) != head); 927 928 xfs_iunlock(ip, XFS_ILOCK_EXCL); 929 out_invalidate: 930 xfs_vm_invalidatepage(page, 0, PAGE_CACHE_SIZE); 931 return; 932 } 933 934 /* 935 * Write out a dirty page. 936 * 937 * For delalloc space on the page we need to allocate space and flush it. 938 * For unwritten space on the page we need to start the conversion to 939 * regular allocated space. 940 * For any other dirty buffer heads on the page we should flush them. 941 */ 942 STATIC int 943 xfs_vm_writepage( 944 struct page *page, 945 struct writeback_control *wbc) 946 { 947 struct inode *inode = page->mapping->host; 948 struct buffer_head *bh, *head; 949 struct xfs_bmbt_irec imap; 950 xfs_ioend_t *ioend = NULL, *iohead = NULL; 951 loff_t offset; 952 unsigned int type; 953 __uint64_t end_offset; 954 pgoff_t end_index, last_index; 955 ssize_t len; 956 int err, imap_valid = 0, uptodate = 1; 957 int count = 0; 958 int nonblocking = 0; 959 960 trace_xfs_writepage(inode, page, 0, 0); 961 962 ASSERT(page_has_buffers(page)); 963 964 /* 965 * Refuse to write the page out if we are called from reclaim context. 966 * 967 * This avoids stack overflows when called from deeply used stacks in 968 * random callers for direct reclaim or memcg reclaim. We explicitly 969 * allow reclaim from kswapd as the stack usage there is relatively low. 970 * 971 * This should never happen except in the case of a VM regression so 972 * warn about it. 973 */ 974 if (WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD)) == 975 PF_MEMALLOC)) 976 goto redirty; 977 978 /* 979 * Given that we do not allow direct reclaim to call us, we should 980 * never be called while in a filesystem transaction. 981 */ 982 if (WARN_ON_ONCE(current->flags & PF_FSTRANS)) 983 goto redirty; 984 985 /* Is this page beyond the end of the file? */ 986 offset = i_size_read(inode); 987 end_index = offset >> PAGE_CACHE_SHIFT; 988 last_index = (offset - 1) >> PAGE_CACHE_SHIFT; 989 990 /* 991 * The page index is less than the end_index, adjust the end_offset 992 * to the highest offset that this page should represent. 993 * ----------------------------------------------------- 994 * | file mapping | <EOF> | 995 * ----------------------------------------------------- 996 * | Page ... 
| Page N-2 | Page N-1 | Page N | | 997 * ^--------------------------------^----------|-------- 998 * | desired writeback range | see else | 999 * ---------------------------------^------------------| 1000 */ 1001 if (page->index < end_index) 1002 end_offset = (xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT; 1003 else { 1004 /* 1005 * Check whether the page to write out is beyond or straddles 1006 * i_size or not. 1007 * ------------------------------------------------------- 1008 * | file mapping | <EOF> | 1009 * ------------------------------------------------------- 1010 * | Page ... | Page N-2 | Page N-1 | Page N | Beyond | 1011 * ^--------------------------------^-----------|--------- 1012 * | | Straddles | 1013 * ---------------------------------^-----------|--------| 1014 */ 1015 unsigned offset_into_page = offset & (PAGE_CACHE_SIZE - 1); 1016 1017 /* 1018 * Skip the page if it is fully outside i_size, e.g. due to a 1019 * truncate operation that is in progress. We must redirty the 1020 * page so that reclaim stops reclaiming it. Otherwise 1021 * xfs_vm_releasepage() is called on it and gets confused. 1022 * 1023 * Note that the end_index is unsigned long, it would overflow 1024 * if the given offset is greater than 16TB on 32-bit system 1025 * and if we do check the page is fully outside i_size or not 1026 * via "if (page->index >= end_index + 1)" as "end_index + 1" 1027 * will be evaluated to 0. Hence this page will be redirtied 1028 * and be written out repeatedly which would result in an 1029 * infinite loop, the user program that perform this operation 1030 * will hang. Instead, we can verify this situation by checking 1031 * if the page to write is totally beyond the i_size or if it's 1032 * offset is just equal to the EOF. 1033 */ 1034 if (page->index > end_index || 1035 (page->index == end_index && offset_into_page == 0)) 1036 goto redirty; 1037 1038 /* 1039 * The page straddles i_size. It must be zeroed out on each 1040 * and every writepage invocation because it may be mmapped. 1041 * "A file is mapped in multiples of the page size. For a file 1042 * that is not a multiple of the page size, the remaining 1043 * memory is zeroed when mapped, and writes to that region are 1044 * not written out to the file." 1045 */ 1046 zero_user_segment(page, offset_into_page, PAGE_CACHE_SIZE); 1047 1048 /* Adjust the end_offset to the end of file */ 1049 end_offset = offset; 1050 } 1051 1052 len = 1 << inode->i_blkbits; 1053 1054 bh = head = page_buffers(page); 1055 offset = page_offset(page); 1056 type = XFS_IO_OVERWRITE; 1057 1058 if (wbc->sync_mode == WB_SYNC_NONE) 1059 nonblocking = 1; 1060 1061 do { 1062 int new_ioend = 0; 1063 1064 if (offset >= end_offset) 1065 break; 1066 if (!buffer_uptodate(bh)) 1067 uptodate = 0; 1068 1069 /* 1070 * set_page_dirty dirties all buffers in a page, independent 1071 * of their state. The dirty state however is entirely 1072 * meaningless for holes (!mapped && uptodate), so skip 1073 * buffers covering holes here. 
 */
		if (!buffer_mapped(bh) && buffer_uptodate(bh)) {
			imap_valid = 0;
			continue;
		}

		if (buffer_unwritten(bh)) {
			if (type != XFS_IO_UNWRITTEN) {
				type = XFS_IO_UNWRITTEN;
				imap_valid = 0;
			}
		} else if (buffer_delay(bh)) {
			if (type != XFS_IO_DELALLOC) {
				type = XFS_IO_DELALLOC;
				imap_valid = 0;
			}
		} else if (buffer_uptodate(bh)) {
			if (type != XFS_IO_OVERWRITE) {
				type = XFS_IO_OVERWRITE;
				imap_valid = 0;
			}
		} else {
			if (PageUptodate(page))
				ASSERT(buffer_mapped(bh));
			/*
			 * This buffer is not uptodate and will not be
			 * written to disk. Ensure that we will put any
			 * subsequent writeable buffers into a new
			 * ioend.
			 */
			imap_valid = 0;
			continue;
		}

		if (imap_valid)
			imap_valid = xfs_imap_valid(inode, &imap, offset);
		if (!imap_valid) {
			/*
			 * If we didn't have a valid mapping then we need to
			 * put the new mapping into a separate ioend structure.
			 * This ensures non-contiguous extents always have
			 * separate ioends, which is particularly important
			 * for unwritten extent conversion at I/O completion
			 * time.
			 */
			new_ioend = 1;
			err = xfs_map_blocks(inode, offset, &imap, type,
					     nonblocking);
			if (err)
				goto error;
			imap_valid = xfs_imap_valid(inode, &imap, offset);
		}
		if (imap_valid) {
			lock_buffer(bh);
			if (type != XFS_IO_OVERWRITE)
				xfs_map_at_offset(inode, bh, &imap, offset);
			xfs_add_to_ioend(inode, bh, offset, type, &ioend,
					 new_ioend);
			count++;
		}

		if (!iohead)
			iohead = ioend;

	} while (offset += len, ((bh = bh->b_this_page) != head));

	if (uptodate && bh == head)
		SetPageUptodate(page);

	xfs_start_page_writeback(page, 1, count);

	/* if there is no IO to be submitted for this page, we are done */
	if (!ioend)
		return 0;

	ASSERT(iohead);

	/*
	 * Any errors from this point onwards need to be reported through the
	 * IO completion path as we have marked the initial page as under
	 * writeback and unlocked it.
	 */
	if (imap_valid) {
		xfs_off_t		end_index;

		end_index = imap.br_startoff + imap.br_blockcount;

		/* to bytes */
		end_index <<= inode->i_blkbits;

		/* to pages */
		end_index = (end_index - 1) >> PAGE_CACHE_SHIFT;

		/* check against file size */
		if (end_index > last_index)
			end_index = last_index;

		xfs_cluster_write(inode, page->index + 1, &imap, &ioend,
				  wbc, end_index);
	}


	/*
	 * Reserve log space if we might write beyond the on-disk inode size.
1178 */ 1179 err = 0; 1180 if (ioend->io_type != XFS_IO_UNWRITTEN && xfs_ioend_is_append(ioend)) 1181 err = xfs_setfilesize_trans_alloc(ioend); 1182 1183 xfs_submit_ioend(wbc, iohead, err); 1184 1185 return 0; 1186 1187 error: 1188 if (iohead) 1189 xfs_cancel_ioend(iohead); 1190 1191 if (err == -EAGAIN) 1192 goto redirty; 1193 1194 xfs_aops_discard_page(page); 1195 ClearPageUptodate(page); 1196 unlock_page(page); 1197 return err; 1198 1199 redirty: 1200 redirty_page_for_writepage(wbc, page); 1201 unlock_page(page); 1202 return 0; 1203 } 1204 1205 STATIC int 1206 xfs_vm_writepages( 1207 struct address_space *mapping, 1208 struct writeback_control *wbc) 1209 { 1210 xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED); 1211 return generic_writepages(mapping, wbc); 1212 } 1213 1214 /* 1215 * Called to move a page into cleanable state - and from there 1216 * to be released. The page should already be clean. We always 1217 * have buffer heads in this call. 1218 * 1219 * Returns 1 if the page is ok to release, 0 otherwise. 1220 */ 1221 STATIC int 1222 xfs_vm_releasepage( 1223 struct page *page, 1224 gfp_t gfp_mask) 1225 { 1226 int delalloc, unwritten; 1227 1228 trace_xfs_releasepage(page->mapping->host, page, 0, 0); 1229 1230 xfs_count_page_state(page, &delalloc, &unwritten); 1231 1232 if (WARN_ON_ONCE(delalloc)) 1233 return 0; 1234 if (WARN_ON_ONCE(unwritten)) 1235 return 0; 1236 1237 return try_to_free_buffers(page); 1238 } 1239 1240 /* 1241 * When we map a DIO buffer, we may need to attach an ioend that describes the 1242 * type of write IO we are doing. This passes to the completion function the 1243 * operations it needs to perform. If the mapping is for an overwrite wholly 1244 * within the EOF then we don't need an ioend and so we don't allocate one. 1245 * This avoids the unnecessary overhead of allocating and freeing ioends for 1246 * workloads that don't require transactions on IO completion. 1247 * 1248 * If we get multiple mappings in a single IO, we might be mapping different 1249 * types. But because the direct IO can only have a single private pointer, we 1250 * need to ensure that: 1251 * 1252 * a) i) the ioend spans the entire region of unwritten mappings; or 1253 * ii) the ioend spans all the mappings that cross or are beyond EOF; and 1254 * b) if it contains unwritten extents, it is *permanently* marked as such 1255 * 1256 * We could do this by chaining ioends like buffered IO does, but we only 1257 * actually get one IO completion callback from the direct IO, and that spans 1258 * the entire IO regardless of how many mappings and IOs are needed to complete 1259 * the DIO. There is only going to be one reference to the ioend and its life 1260 * cycle is constrained by the DIO completion code. hence we don't need 1261 * reference counting here. 1262 * 1263 * Note that for DIO, an IO to the highest supported file block offset (i.e. 1264 * 2^63 - 1FSB bytes) will result in the offset + count overflowing a signed 64 1265 * bit variable. Hence if we see this overflow, we have to assume that the IO is 1266 * extending the file size. We won't know for sure until IO completion is run 1267 * and the actual max write offset is communicated to the IO completion 1268 * routine. 1269 * 1270 * For DAX page faults, we are preparing to never see unwritten extents here, 1271 * nor should we ever extend the inode size. 
Hence we will soon have nothing to 1272 * do here for this case, ensuring we don't have to provide an IO completion 1273 * callback to free an ioend that we don't actually need for a fault into the 1274 * page at offset (2^63 - 1FSB) bytes. 1275 */ 1276 1277 static void 1278 xfs_map_direct( 1279 struct inode *inode, 1280 struct buffer_head *bh_result, 1281 struct xfs_bmbt_irec *imap, 1282 xfs_off_t offset, 1283 bool dax_fault) 1284 { 1285 struct xfs_ioend *ioend; 1286 xfs_off_t size = bh_result->b_size; 1287 int type; 1288 1289 if (ISUNWRITTEN(imap)) 1290 type = XFS_IO_UNWRITTEN; 1291 else 1292 type = XFS_IO_OVERWRITE; 1293 1294 trace_xfs_gbmap_direct(XFS_I(inode), offset, size, type, imap); 1295 1296 if (dax_fault) { 1297 ASSERT(type == XFS_IO_OVERWRITE); 1298 trace_xfs_gbmap_direct_none(XFS_I(inode), offset, size, type, 1299 imap); 1300 return; 1301 } 1302 1303 if (bh_result->b_private) { 1304 ioend = bh_result->b_private; 1305 ASSERT(ioend->io_size > 0); 1306 ASSERT(offset >= ioend->io_offset); 1307 if (offset + size > ioend->io_offset + ioend->io_size) 1308 ioend->io_size = offset - ioend->io_offset + size; 1309 1310 if (type == XFS_IO_UNWRITTEN && type != ioend->io_type) 1311 ioend->io_type = XFS_IO_UNWRITTEN; 1312 1313 trace_xfs_gbmap_direct_update(XFS_I(inode), ioend->io_offset, 1314 ioend->io_size, ioend->io_type, 1315 imap); 1316 } else if (type == XFS_IO_UNWRITTEN || 1317 offset + size > i_size_read(inode) || 1318 offset + size < 0) { 1319 ioend = xfs_alloc_ioend(inode, type); 1320 ioend->io_offset = offset; 1321 ioend->io_size = size; 1322 1323 bh_result->b_private = ioend; 1324 set_buffer_defer_completion(bh_result); 1325 1326 trace_xfs_gbmap_direct_new(XFS_I(inode), offset, size, type, 1327 imap); 1328 } else { 1329 trace_xfs_gbmap_direct_none(XFS_I(inode), offset, size, type, 1330 imap); 1331 } 1332 } 1333 1334 /* 1335 * If this is O_DIRECT or the mpage code calling tell them how large the mapping 1336 * is, so that we can avoid repeated get_blocks calls. 1337 * 1338 * If the mapping spans EOF, then we have to break the mapping up as the mapping 1339 * for blocks beyond EOF must be marked new so that sub block regions can be 1340 * correctly zeroed. We can't do this for mappings within EOF unless the mapping 1341 * was just allocated or is unwritten, otherwise the callers would overwrite 1342 * existing data with zeros. Hence we have to split the mapping into a range up 1343 * to and including EOF, and a second mapping for beyond EOF. 
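 *
 * Worked example for the EOF trimming done by xfs_map_trim_size() below
 * (illustrative, assuming 4k blocks): with i_size == 1048676 bytes and a
 * mapping that could satisfy offset == 1044480 for 16k,
 *
 *	offset + mapping_size == 1060864 > i_size	// mapping spans EOF
 *	mapping_size = roundup_64(1048676 - 1044480, 4096)
 *		     = roundup_64(4196, 4096) = 8192
 *
 * i.e. the mapping handed back ends at the block containing EOF, and the part
 * beyond EOF is left for a later get_blocks call that can mark it new.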
1344 */ 1345 static void 1346 xfs_map_trim_size( 1347 struct inode *inode, 1348 sector_t iblock, 1349 struct buffer_head *bh_result, 1350 struct xfs_bmbt_irec *imap, 1351 xfs_off_t offset, 1352 ssize_t size) 1353 { 1354 xfs_off_t mapping_size; 1355 1356 mapping_size = imap->br_startoff + imap->br_blockcount - iblock; 1357 mapping_size <<= inode->i_blkbits; 1358 1359 ASSERT(mapping_size > 0); 1360 if (mapping_size > size) 1361 mapping_size = size; 1362 if (offset < i_size_read(inode) && 1363 offset + mapping_size >= i_size_read(inode)) { 1364 /* limit mapping to block that spans EOF */ 1365 mapping_size = roundup_64(i_size_read(inode) - offset, 1366 1 << inode->i_blkbits); 1367 } 1368 if (mapping_size > LONG_MAX) 1369 mapping_size = LONG_MAX; 1370 1371 bh_result->b_size = mapping_size; 1372 } 1373 1374 STATIC int 1375 __xfs_get_blocks( 1376 struct inode *inode, 1377 sector_t iblock, 1378 struct buffer_head *bh_result, 1379 int create, 1380 bool direct, 1381 bool dax_fault) 1382 { 1383 struct xfs_inode *ip = XFS_I(inode); 1384 struct xfs_mount *mp = ip->i_mount; 1385 xfs_fileoff_t offset_fsb, end_fsb; 1386 int error = 0; 1387 int lockmode = 0; 1388 struct xfs_bmbt_irec imap; 1389 int nimaps = 1; 1390 xfs_off_t offset; 1391 ssize_t size; 1392 int new = 0; 1393 1394 if (XFS_FORCED_SHUTDOWN(mp)) 1395 return -EIO; 1396 1397 offset = (xfs_off_t)iblock << inode->i_blkbits; 1398 ASSERT(bh_result->b_size >= (1 << inode->i_blkbits)); 1399 size = bh_result->b_size; 1400 1401 if (!create && direct && offset >= i_size_read(inode)) 1402 return 0; 1403 1404 /* 1405 * Direct I/O is usually done on preallocated files, so try getting 1406 * a block mapping without an exclusive lock first. For buffered 1407 * writes we already have the exclusive iolock anyway, so avoiding 1408 * a lock roundtrip here by taking the ilock exclusive from the 1409 * beginning is a useful micro optimization. 1410 */ 1411 if (create && !direct) { 1412 lockmode = XFS_ILOCK_EXCL; 1413 xfs_ilock(ip, lockmode); 1414 } else { 1415 lockmode = xfs_ilock_data_map_shared(ip); 1416 } 1417 1418 ASSERT(offset <= mp->m_super->s_maxbytes); 1419 if (offset + size > mp->m_super->s_maxbytes) 1420 size = mp->m_super->s_maxbytes - offset; 1421 end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + size); 1422 offset_fsb = XFS_B_TO_FSBT(mp, offset); 1423 1424 error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, 1425 &imap, &nimaps, XFS_BMAPI_ENTIRE); 1426 if (error) 1427 goto out_unlock; 1428 1429 /* for DAX, we convert unwritten extents directly */ 1430 if (create && 1431 (!nimaps || 1432 (imap.br_startblock == HOLESTARTBLOCK || 1433 imap.br_startblock == DELAYSTARTBLOCK) || 1434 (IS_DAX(inode) && ISUNWRITTEN(&imap)))) { 1435 if (direct || xfs_get_extsz_hint(ip)) { 1436 /* 1437 * xfs_iomap_write_direct() expects the shared lock. It 1438 * is unlocked on return. 1439 */ 1440 if (lockmode == XFS_ILOCK_EXCL) 1441 xfs_ilock_demote(ip, lockmode); 1442 1443 error = xfs_iomap_write_direct(ip, offset, size, 1444 &imap, nimaps); 1445 if (error) 1446 return error; 1447 new = 1; 1448 1449 } else { 1450 /* 1451 * Delalloc reservations do not require a transaction, 1452 * we can go on without dropping the lock here. If we 1453 * are allocating a new delalloc block, make sure that 1454 * we set the new flag so that we mark the buffer new so 1455 * that we know that it is newly allocated if the write 1456 * fails. 
1457 */ 1458 if (nimaps && imap.br_startblock == HOLESTARTBLOCK) 1459 new = 1; 1460 error = xfs_iomap_write_delay(ip, offset, size, &imap); 1461 if (error) 1462 goto out_unlock; 1463 1464 xfs_iunlock(ip, lockmode); 1465 } 1466 trace_xfs_get_blocks_alloc(ip, offset, size, 1467 ISUNWRITTEN(&imap) ? XFS_IO_UNWRITTEN 1468 : XFS_IO_DELALLOC, &imap); 1469 } else if (nimaps) { 1470 trace_xfs_get_blocks_found(ip, offset, size, 1471 ISUNWRITTEN(&imap) ? XFS_IO_UNWRITTEN 1472 : XFS_IO_OVERWRITE, &imap); 1473 xfs_iunlock(ip, lockmode); 1474 } else { 1475 trace_xfs_get_blocks_notfound(ip, offset, size); 1476 goto out_unlock; 1477 } 1478 1479 if (IS_DAX(inode) && create) { 1480 ASSERT(!ISUNWRITTEN(&imap)); 1481 /* zeroing is not needed at a higher layer */ 1482 new = 0; 1483 } 1484 1485 /* trim mapping down to size requested */ 1486 if (direct || size > (1 << inode->i_blkbits)) 1487 xfs_map_trim_size(inode, iblock, bh_result, 1488 &imap, offset, size); 1489 1490 /* 1491 * For unwritten extents do not report a disk address in the buffered 1492 * read case (treat as if we're reading into a hole). 1493 */ 1494 if (imap.br_startblock != HOLESTARTBLOCK && 1495 imap.br_startblock != DELAYSTARTBLOCK && 1496 (create || !ISUNWRITTEN(&imap))) { 1497 xfs_map_buffer(inode, bh_result, &imap, offset); 1498 if (ISUNWRITTEN(&imap)) 1499 set_buffer_unwritten(bh_result); 1500 /* direct IO needs special help */ 1501 if (create && direct) 1502 xfs_map_direct(inode, bh_result, &imap, offset, 1503 dax_fault); 1504 } 1505 1506 /* 1507 * If this is a realtime file, data may be on a different device. 1508 * to that pointed to from the buffer_head b_bdev currently. 1509 */ 1510 bh_result->b_bdev = xfs_find_bdev_for_inode(inode); 1511 1512 /* 1513 * If we previously allocated a block out beyond eof and we are now 1514 * coming back to use it then we will need to flag it as new even if it 1515 * has a disk address. 1516 * 1517 * With sub-block writes into unwritten extents we also need to mark 1518 * the buffer as new so that the unwritten parts of the buffer gets 1519 * correctly zeroed. 
1520 */ 1521 if (create && 1522 ((!buffer_mapped(bh_result) && !buffer_uptodate(bh_result)) || 1523 (offset >= i_size_read(inode)) || 1524 (new || ISUNWRITTEN(&imap)))) 1525 set_buffer_new(bh_result); 1526 1527 if (imap.br_startblock == DELAYSTARTBLOCK) { 1528 BUG_ON(direct); 1529 if (create) { 1530 set_buffer_uptodate(bh_result); 1531 set_buffer_mapped(bh_result); 1532 set_buffer_delay(bh_result); 1533 } 1534 } 1535 1536 return 0; 1537 1538 out_unlock: 1539 xfs_iunlock(ip, lockmode); 1540 return error; 1541 } 1542 1543 int 1544 xfs_get_blocks( 1545 struct inode *inode, 1546 sector_t iblock, 1547 struct buffer_head *bh_result, 1548 int create) 1549 { 1550 return __xfs_get_blocks(inode, iblock, bh_result, create, false, false); 1551 } 1552 1553 int 1554 xfs_get_blocks_direct( 1555 struct inode *inode, 1556 sector_t iblock, 1557 struct buffer_head *bh_result, 1558 int create) 1559 { 1560 return __xfs_get_blocks(inode, iblock, bh_result, create, true, false); 1561 } 1562 1563 int 1564 xfs_get_blocks_dax_fault( 1565 struct inode *inode, 1566 sector_t iblock, 1567 struct buffer_head *bh_result, 1568 int create) 1569 { 1570 return __xfs_get_blocks(inode, iblock, bh_result, create, true, true); 1571 } 1572 1573 static void 1574 __xfs_end_io_direct_write( 1575 struct inode *inode, 1576 struct xfs_ioend *ioend, 1577 loff_t offset, 1578 ssize_t size) 1579 { 1580 struct xfs_mount *mp = XFS_I(inode)->i_mount; 1581 1582 if (XFS_FORCED_SHUTDOWN(mp) || ioend->io_error) 1583 goto out_end_io; 1584 1585 /* 1586 * dio completion end_io functions are only called on writes if more 1587 * than 0 bytes was written. 1588 */ 1589 ASSERT(size > 0); 1590 1591 /* 1592 * The ioend only maps whole blocks, while the IO may be sector aligned. 1593 * Hence the ioend offset/size may not match the IO offset/size exactly. 1594 * Because we don't map overwrites within EOF into the ioend, the offset 1595 * may not match, but only if the endio spans EOF. Either way, write 1596 * the IO sizes into the ioend so that completion processing does the 1597 * right thing. 1598 */ 1599 ASSERT(offset + size <= ioend->io_offset + ioend->io_size); 1600 ioend->io_size = size; 1601 ioend->io_offset = offset; 1602 1603 /* 1604 * The ioend tells us whether we are doing unwritten extent conversion 1605 * or an append transaction that updates the on-disk file size. These 1606 * cases are the only cases where we should *potentially* be needing 1607 * to update the VFS inode size. 1608 * 1609 * We need to update the in-core inode size here so that we don't end up 1610 * with the on-disk inode size being outside the in-core inode size. We 1611 * have no other method of updating EOF for AIO, so always do it here 1612 * if necessary. 1613 * 1614 * We need to lock the test/set EOF update as we can be racing with 1615 * other IO completions here to update the EOF. Failing to serialise 1616 * here can result in EOF moving backwards and Bad Things Happen when 1617 * that occurs. 1618 */ 1619 spin_lock(&XFS_I(inode)->i_flags_lock); 1620 if (offset + size > i_size_read(inode)) 1621 i_size_write(inode, offset + size); 1622 spin_unlock(&XFS_I(inode)->i_flags_lock); 1623 1624 /* 1625 * If we are doing an append IO that needs to update the EOF on disk, 1626 * do the transaction reserve now so we can use common end io 1627 * processing. Stashing the error (if there is one) in the ioend will 1628 * result in the ioend processing passing on the error if it is 1629 * possible as we can't return it from here. 
1630 */ 1631 if (ioend->io_type == XFS_IO_OVERWRITE) 1632 ioend->io_error = xfs_setfilesize_trans_alloc(ioend); 1633 1634 out_end_io: 1635 xfs_end_io(&ioend->io_work); 1636 return; 1637 } 1638 1639 /* 1640 * Complete a direct I/O write request. 1641 * 1642 * The ioend structure is passed from __xfs_get_blocks() to tell us what to do. 1643 * If no ioend exists (i.e. @private == NULL) then the write IO is an overwrite 1644 * wholly within the EOF and so there is nothing for us to do. Note that in this 1645 * case the completion can be called in interrupt context, whereas if we have an 1646 * ioend we will always be called in task context (i.e. from a workqueue). 1647 */ 1648 STATIC void 1649 xfs_end_io_direct_write( 1650 struct kiocb *iocb, 1651 loff_t offset, 1652 ssize_t size, 1653 void *private) 1654 { 1655 struct inode *inode = file_inode(iocb->ki_filp); 1656 struct xfs_ioend *ioend = private; 1657 1658 trace_xfs_gbmap_direct_endio(XFS_I(inode), offset, size, 1659 ioend ? ioend->io_type : 0, NULL); 1660 1661 if (!ioend) { 1662 ASSERT(offset + size <= i_size_read(inode)); 1663 return; 1664 } 1665 1666 __xfs_end_io_direct_write(inode, ioend, offset, size); 1667 } 1668 1669 static inline ssize_t 1670 xfs_vm_do_dio( 1671 struct inode *inode, 1672 struct kiocb *iocb, 1673 struct iov_iter *iter, 1674 loff_t offset, 1675 void (*endio)(struct kiocb *iocb, 1676 loff_t offset, 1677 ssize_t size, 1678 void *private), 1679 int flags) 1680 { 1681 struct block_device *bdev; 1682 1683 if (IS_DAX(inode)) 1684 return dax_do_io(iocb, inode, iter, offset, 1685 xfs_get_blocks_direct, endio, 0); 1686 1687 bdev = xfs_find_bdev_for_inode(inode); 1688 return __blockdev_direct_IO(iocb, inode, bdev, iter, offset, 1689 xfs_get_blocks_direct, endio, NULL, flags); 1690 } 1691 1692 STATIC ssize_t 1693 xfs_vm_direct_IO( 1694 struct kiocb *iocb, 1695 struct iov_iter *iter, 1696 loff_t offset) 1697 { 1698 struct inode *inode = iocb->ki_filp->f_mapping->host; 1699 1700 if (iov_iter_rw(iter) == WRITE) 1701 return xfs_vm_do_dio(inode, iocb, iter, offset, 1702 xfs_end_io_direct_write, DIO_ASYNC_EXTEND); 1703 return xfs_vm_do_dio(inode, iocb, iter, offset, NULL, 0); 1704 } 1705 1706 /* 1707 * Punch out the delalloc blocks we have already allocated. 1708 * 1709 * Don't bother with xfs_setattr given that nothing can have made it to disk yet 1710 * as the page is still locked at this point. 
1711 */ 1712 STATIC void 1713 xfs_vm_kill_delalloc_range( 1714 struct inode *inode, 1715 loff_t start, 1716 loff_t end) 1717 { 1718 struct xfs_inode *ip = XFS_I(inode); 1719 xfs_fileoff_t start_fsb; 1720 xfs_fileoff_t end_fsb; 1721 int error; 1722 1723 start_fsb = XFS_B_TO_FSB(ip->i_mount, start); 1724 end_fsb = XFS_B_TO_FSB(ip->i_mount, end); 1725 if (end_fsb <= start_fsb) 1726 return; 1727 1728 xfs_ilock(ip, XFS_ILOCK_EXCL); 1729 error = xfs_bmap_punch_delalloc_range(ip, start_fsb, 1730 end_fsb - start_fsb); 1731 if (error) { 1732 /* something screwed, just bail */ 1733 if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { 1734 xfs_alert(ip->i_mount, 1735 "xfs_vm_write_failed: unable to clean up ino %lld", 1736 ip->i_ino); 1737 } 1738 } 1739 xfs_iunlock(ip, XFS_ILOCK_EXCL); 1740 } 1741 1742 STATIC void 1743 xfs_vm_write_failed( 1744 struct inode *inode, 1745 struct page *page, 1746 loff_t pos, 1747 unsigned len) 1748 { 1749 loff_t block_offset; 1750 loff_t block_start; 1751 loff_t block_end; 1752 loff_t from = pos & (PAGE_CACHE_SIZE - 1); 1753 loff_t to = from + len; 1754 struct buffer_head *bh, *head; 1755 1756 /* 1757 * The request pos offset might be 32 or 64 bit, this is all fine 1758 * on 64-bit platform. However, for 64-bit pos request on 32-bit 1759 * platform, the high 32-bit will be masked off if we evaluate the 1760 * block_offset via (pos & PAGE_MASK) because the PAGE_MASK is 1761 * 0xfffff000 as an unsigned long, hence the result is incorrect 1762 * which could cause the following ASSERT failed in most cases. 1763 * In order to avoid this, we can evaluate the block_offset of the 1764 * start of the page by using shifts rather than masks the mismatch 1765 * problem. 1766 */ 1767 block_offset = (pos >> PAGE_CACHE_SHIFT) << PAGE_CACHE_SHIFT; 1768 1769 ASSERT(block_offset + from == pos); 1770 1771 head = page_buffers(page); 1772 block_start = 0; 1773 for (bh = head; bh != head || !block_start; 1774 bh = bh->b_this_page, block_start = block_end, 1775 block_offset += bh->b_size) { 1776 block_end = block_start + bh->b_size; 1777 1778 /* skip buffers before the write */ 1779 if (block_end <= from) 1780 continue; 1781 1782 /* if the buffer is after the write, we're done */ 1783 if (block_start >= to) 1784 break; 1785 1786 if (!buffer_delay(bh)) 1787 continue; 1788 1789 if (!buffer_new(bh) && block_offset < i_size_read(inode)) 1790 continue; 1791 1792 xfs_vm_kill_delalloc_range(inode, block_offset, 1793 block_offset + bh->b_size); 1794 1795 /* 1796 * This buffer does not contain data anymore. make sure anyone 1797 * who finds it knows that for certain. 1798 */ 1799 clear_buffer_delay(bh); 1800 clear_buffer_uptodate(bh); 1801 clear_buffer_mapped(bh); 1802 clear_buffer_new(bh); 1803 clear_buffer_dirty(bh); 1804 } 1805 1806 } 1807 1808 /* 1809 * This used to call block_write_begin(), but it unlocks and releases the page 1810 * on error, and we need that page to be able to punch stale delalloc blocks out 1811 * on failure. hence we copy-n-waste it here and call xfs_vm_write_failed() at 1812 * the appropriate point. 
1813 */ 1814 STATIC int 1815 xfs_vm_write_begin( 1816 struct file *file, 1817 struct address_space *mapping, 1818 loff_t pos, 1819 unsigned len, 1820 unsigned flags, 1821 struct page **pagep, 1822 void **fsdata) 1823 { 1824 pgoff_t index = pos >> PAGE_CACHE_SHIFT; 1825 struct page *page; 1826 int status; 1827 1828 ASSERT(len <= PAGE_CACHE_SIZE); 1829 1830 page = grab_cache_page_write_begin(mapping, index, flags); 1831 if (!page) 1832 return -ENOMEM; 1833 1834 status = __block_write_begin(page, pos, len, xfs_get_blocks); 1835 if (unlikely(status)) { 1836 struct inode *inode = mapping->host; 1837 size_t isize = i_size_read(inode); 1838 1839 xfs_vm_write_failed(inode, page, pos, len); 1840 unlock_page(page); 1841 1842 /* 1843 * If the write is beyond EOF, we only want to kill blocks 1844 * allocated in this write, not blocks that were previously 1845 * written successfully. 1846 */ 1847 if (pos + len > isize) { 1848 ssize_t start = max_t(ssize_t, pos, isize); 1849 1850 truncate_pagecache_range(inode, start, pos + len); 1851 } 1852 1853 page_cache_release(page); 1854 page = NULL; 1855 } 1856 1857 *pagep = page; 1858 return status; 1859 } 1860 1861 /* 1862 * On failure, we only need to kill delalloc blocks beyond EOF in the range of 1863 * this specific write because they will never be written. Previous writes 1864 * beyond EOF where block allocation succeeded do not need to be trashed, so 1865 * only new blocks from this write should be trashed. For blocks within 1866 * EOF, generic_write_end() zeros them so they are safe to leave alone and be 1867 * written with all the other valid data. 1868 */ 1869 STATIC int 1870 xfs_vm_write_end( 1871 struct file *file, 1872 struct address_space *mapping, 1873 loff_t pos, 1874 unsigned len, 1875 unsigned copied, 1876 struct page *page, 1877 void *fsdata) 1878 { 1879 int ret; 1880 1881 ASSERT(len <= PAGE_CACHE_SIZE); 1882 1883 ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata); 1884 if (unlikely(ret < len)) { 1885 struct inode *inode = mapping->host; 1886 size_t isize = i_size_read(inode); 1887 loff_t to = pos + len; 1888 1889 if (to > isize) { 1890 /* only kill blocks in this write beyond EOF */ 1891 if (pos > isize) 1892 isize = pos; 1893 xfs_vm_kill_delalloc_range(inode, isize, to); 1894 truncate_pagecache_range(inode, isize, to); 1895 } 1896 } 1897 return ret; 1898 } 1899 1900 STATIC sector_t 1901 xfs_vm_bmap( 1902 struct address_space *mapping, 1903 sector_t block) 1904 { 1905 struct inode *inode = (struct inode *)mapping->host; 1906 struct xfs_inode *ip = XFS_I(inode); 1907 1908 trace_xfs_vm_bmap(XFS_I(inode)); 1909 xfs_ilock(ip, XFS_IOLOCK_SHARED); 1910 filemap_write_and_wait(mapping); 1911 xfs_iunlock(ip, XFS_IOLOCK_SHARED); 1912 return generic_block_bmap(mapping, block, xfs_get_blocks); 1913 } 1914 1915 STATIC int 1916 xfs_vm_readpage( 1917 struct file *unused, 1918 struct page *page) 1919 { 1920 trace_xfs_vm_readpage(page->mapping->host, 1); 1921 return mpage_readpage(page, xfs_get_blocks); 1922 } 1923 1924 STATIC int 1925 xfs_vm_readpages( 1926 struct file *unused, 1927 struct address_space *mapping, 1928 struct list_head *pages, 1929 unsigned nr_pages) 1930 { 1931 trace_xfs_vm_readpages(mapping->host, nr_pages); 1932 return mpage_readpages(mapping, pages, nr_pages, xfs_get_blocks); 1933 } 1934 1935 /* 1936 * This is basically a copy of __set_page_dirty_buffers() with one 1937 * small tweak: buffers beyond EOF do not get marked dirty. 
If we mark them
 * dirty, we'll never be able to clean them because we don't write buffers
 * beyond EOF, and that means we can't invalidate pages that span EOF
 * that have been marked dirty. Further, the dirty state can leak into
 * the file interior if the file is extended, resulting in all sorts of
 * bad things happening as the state does not match the underlying data.
 *
 * XXX: this really indicates that bufferheads in XFS need to die. Warts like
 * this only exist because of bufferheads and how the generic code manages them.
 */
STATIC int
xfs_vm_set_page_dirty(
	struct page		*page)
{
	struct address_space	*mapping = page->mapping;
	struct inode		*inode = mapping->host;
	loff_t			end_offset;
	loff_t			offset;
	int			newly_dirty;
	struct mem_cgroup	*memcg;

	if (unlikely(!mapping))
		return !TestSetPageDirty(page);

	end_offset = i_size_read(inode);
	offset = page_offset(page);

	spin_lock(&mapping->private_lock);
	if (page_has_buffers(page)) {
		struct buffer_head *head = page_buffers(page);
		struct buffer_head *bh = head;

		do {
			if (offset < end_offset)
				set_buffer_dirty(bh);
			bh = bh->b_this_page;
			offset += 1 << inode->i_blkbits;
		} while (bh != head);
	}
	/*
	 * Use mem_cgroup_begin_page_stat() to keep PageDirty synchronized with
	 * per-memcg dirty page counters.
	 */
	memcg = mem_cgroup_begin_page_stat(page);
	newly_dirty = !TestSetPageDirty(page);
	spin_unlock(&mapping->private_lock);

	if (newly_dirty) {
		/* sigh - __set_page_dirty() is static, so copy it here, too */
		unsigned long flags;

		spin_lock_irqsave(&mapping->tree_lock, flags);
		if (page->mapping) {	/* Race with truncate? */
			WARN_ON_ONCE(!PageUptodate(page));
			account_page_dirtied(page, mapping, memcg);
			radix_tree_tag_set(&mapping->page_tree,
					   page_index(page), PAGECACHE_TAG_DIRTY);
		}
		spin_unlock_irqrestore(&mapping->tree_lock, flags);
	}
	mem_cgroup_end_page_stat(memcg);
	if (newly_dirty)
		__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
	return newly_dirty;
}

const struct address_space_operations xfs_address_space_operations = {
	.readpage		= xfs_vm_readpage,
	.readpages		= xfs_vm_readpages,
	.writepage		= xfs_vm_writepage,
	.writepages		= xfs_vm_writepages,
	.set_page_dirty		= xfs_vm_set_page_dirty,
	.releasepage		= xfs_vm_releasepage,
	.invalidatepage		= xfs_vm_invalidatepage,
	.write_begin		= xfs_vm_write_begin,
	.write_end		= xfs_vm_write_end,
	.bmap			= xfs_vm_bmap,
	.direct_IO		= xfs_vm_direct_IO,
	.migratepage		= buffer_migrate_page,
	.is_partially_uptodate	= block_is_partially_uptodate,
	.error_remove_page	= generic_error_remove_page,
};
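
/*
 * Illustrative note (not part of xfs_aops.c): this table is what ties the
 * routines above into the VFS and the VM. XFS points a regular file's page
 * cache mapping at it when the inode is set up, conceptually:
 *
 *	inode->i_mapping->a_ops = &xfs_address_space_operations;
 *
 * after which writeback enters through ->writepage()/->writepages(), buffered
 * reads through ->readpage()/->readpages(), and direct IO through
 * ->direct_IO(), all implemented in this file.
 */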