1 /* 2 * Copyright (c) 2000-2005 Silicon Graphics, Inc. 3 * All Rights Reserved. 4 * 5 * This program is free software; you can redistribute it and/or 6 * modify it under the terms of the GNU General Public License as 7 * published by the Free Software Foundation. 8 * 9 * This program is distributed in the hope that it would be useful, 10 * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 * GNU General Public License for more details. 13 * 14 * You should have received a copy of the GNU General Public License 15 * along with this program; if not, write the Free Software Foundation, 16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 17 */ 18 #include "xfs.h" 19 #include "xfs_shared.h" 20 #include "xfs_format.h" 21 #include "xfs_log_format.h" 22 #include "xfs_trans_resv.h" 23 #include "xfs_mount.h" 24 #include "xfs_inode.h" 25 #include "xfs_trans.h" 26 #include "xfs_inode_item.h" 27 #include "xfs_alloc.h" 28 #include "xfs_error.h" 29 #include "xfs_iomap.h" 30 #include "xfs_trace.h" 31 #include "xfs_bmap.h" 32 #include "xfs_bmap_util.h" 33 #include "xfs_bmap_btree.h" 34 #include <linux/gfp.h> 35 #include <linux/mpage.h> 36 #include <linux/pagevec.h> 37 #include <linux/writeback.h> 38 39 void 40 xfs_count_page_state( 41 struct page *page, 42 int *delalloc, 43 int *unwritten) 44 { 45 struct buffer_head *bh, *head; 46 47 *delalloc = *unwritten = 0; 48 49 bh = head = page_buffers(page); 50 do { 51 if (buffer_unwritten(bh)) 52 (*unwritten) = 1; 53 else if (buffer_delay(bh)) 54 (*delalloc) = 1; 55 } while ((bh = bh->b_this_page) != head); 56 } 57 58 STATIC struct block_device * 59 xfs_find_bdev_for_inode( 60 struct inode *inode) 61 { 62 struct xfs_inode *ip = XFS_I(inode); 63 struct xfs_mount *mp = ip->i_mount; 64 65 if (XFS_IS_REALTIME_INODE(ip)) 66 return mp->m_rtdev_targp->bt_bdev; 67 else 68 return mp->m_ddev_targp->bt_bdev; 69 } 70 71 /* 72 * We're now finished for good with this ioend structure. 73 * Update the page state via the associated buffer_heads, 74 * release holds on the inode and bio, and finally free 75 * up memory. Do not use the ioend after this. 76 */ 77 STATIC void 78 xfs_destroy_ioend( 79 xfs_ioend_t *ioend) 80 { 81 struct buffer_head *bh, *next; 82 83 for (bh = ioend->io_buffer_head; bh; bh = next) { 84 next = bh->b_private; 85 bh->b_end_io(bh, !ioend->io_error); 86 } 87 88 mempool_free(ioend, xfs_ioend_pool); 89 } 90 91 /* 92 * Fast and loose check if this write could update the on-disk inode size. 93 */ 94 static inline bool xfs_ioend_is_append(struct xfs_ioend *ioend) 95 { 96 return ioend->io_offset + ioend->io_size > 97 XFS_I(ioend->io_inode)->i_d.di_size; 98 } 99 100 STATIC int 101 xfs_setfilesize_trans_alloc( 102 struct xfs_ioend *ioend) 103 { 104 struct xfs_mount *mp = XFS_I(ioend->io_inode)->i_mount; 105 struct xfs_trans *tp; 106 int error; 107 108 tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS); 109 110 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_fsyncts, 0, 0); 111 if (error) { 112 xfs_trans_cancel(tp); 113 return error; 114 } 115 116 ioend->io_append_trans = tp; 117 118 /* 119 * We may pass freeze protection with a transaction. So tell lockdep 120 * we released it. 121 */ 122 rwsem_release(&ioend->io_inode->i_sb->s_writers.lock_map[SB_FREEZE_FS-1], 123 1, _THIS_IP_); 124 /* 125 * We hand off the transaction to the completion thread now, so 126 * clear the flag here. 
127 */ 128 current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); 129 return 0; 130 } 131 132 /* 133 * Update on-disk file size now that data has been written to disk. 134 */ 135 STATIC int 136 xfs_setfilesize( 137 struct xfs_inode *ip, 138 struct xfs_trans *tp, 139 xfs_off_t offset, 140 size_t size) 141 { 142 xfs_fsize_t isize; 143 144 xfs_ilock(ip, XFS_ILOCK_EXCL); 145 isize = xfs_new_eof(ip, offset + size); 146 if (!isize) { 147 xfs_iunlock(ip, XFS_ILOCK_EXCL); 148 xfs_trans_cancel(tp); 149 return 0; 150 } 151 152 trace_xfs_setfilesize(ip, offset, size); 153 154 ip->i_d.di_size = isize; 155 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); 156 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 157 158 return xfs_trans_commit(tp); 159 } 160 161 STATIC int 162 xfs_setfilesize_ioend( 163 struct xfs_ioend *ioend) 164 { 165 struct xfs_inode *ip = XFS_I(ioend->io_inode); 166 struct xfs_trans *tp = ioend->io_append_trans; 167 168 /* 169 * The transaction may have been allocated in the I/O submission thread, 170 * thus we need to mark ourselves as being in a transaction manually. 171 * Similarly for freeze protection. 172 */ 173 current_set_flags_nested(&tp->t_pflags, PF_FSTRANS); 174 rwsem_acquire_read(&VFS_I(ip)->i_sb->s_writers.lock_map[SB_FREEZE_FS-1], 175 0, 1, _THIS_IP_); 176 177 return xfs_setfilesize(ip, tp, ioend->io_offset, ioend->io_size); 178 } 179 180 /* 181 * Schedule IO completion handling on the final put of an ioend. 182 * 183 * If there is no work to do we might as well call it a day and free the 184 * ioend right now. 185 */ 186 STATIC void 187 xfs_finish_ioend( 188 struct xfs_ioend *ioend) 189 { 190 if (atomic_dec_and_test(&ioend->io_remaining)) { 191 struct xfs_mount *mp = XFS_I(ioend->io_inode)->i_mount; 192 193 if (ioend->io_type == XFS_IO_UNWRITTEN) 194 queue_work(mp->m_unwritten_workqueue, &ioend->io_work); 195 else if (ioend->io_append_trans) 196 queue_work(mp->m_data_workqueue, &ioend->io_work); 197 else 198 xfs_destroy_ioend(ioend); 199 } 200 } 201 202 /* 203 * IO write completion. 204 */ 205 STATIC void 206 xfs_end_io( 207 struct work_struct *work) 208 { 209 xfs_ioend_t *ioend = container_of(work, xfs_ioend_t, io_work); 210 struct xfs_inode *ip = XFS_I(ioend->io_inode); 211 int error = 0; 212 213 if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { 214 ioend->io_error = -EIO; 215 goto done; 216 } 217 if (ioend->io_error) 218 goto done; 219 220 /* 221 * For unwritten extents we need to issue transactions to convert a 222 * range to normal written extens after the data I/O has finished. 223 */ 224 if (ioend->io_type == XFS_IO_UNWRITTEN) { 225 error = xfs_iomap_write_unwritten(ip, ioend->io_offset, 226 ioend->io_size); 227 } else if (ioend->io_append_trans) { 228 error = xfs_setfilesize_ioend(ioend); 229 } else { 230 ASSERT(!xfs_ioend_is_append(ioend)); 231 } 232 233 done: 234 if (error) 235 ioend->io_error = error; 236 xfs_destroy_ioend(ioend); 237 } 238 239 /* 240 * Allocate and initialise an IO completion structure. 241 * We need to track unwritten extent write completion here initially. 242 * We'll need to extend this for updating the ondisk inode size later 243 * (vs. incore size). 244 */ 245 STATIC xfs_ioend_t * 246 xfs_alloc_ioend( 247 struct inode *inode, 248 unsigned int type) 249 { 250 xfs_ioend_t *ioend; 251 252 ioend = mempool_alloc(xfs_ioend_pool, GFP_NOFS); 253 254 /* 255 * Set the count to 1 initially, which will prevent an I/O 256 * completion callback from happening before we have started 257 * all the I/O from calling the completion routine too early. 
258 */ 259 atomic_set(&ioend->io_remaining, 1); 260 ioend->io_error = 0; 261 ioend->io_list = NULL; 262 ioend->io_type = type; 263 ioend->io_inode = inode; 264 ioend->io_buffer_head = NULL; 265 ioend->io_buffer_tail = NULL; 266 ioend->io_offset = 0; 267 ioend->io_size = 0; 268 ioend->io_append_trans = NULL; 269 270 INIT_WORK(&ioend->io_work, xfs_end_io); 271 return ioend; 272 } 273 274 STATIC int 275 xfs_map_blocks( 276 struct inode *inode, 277 loff_t offset, 278 struct xfs_bmbt_irec *imap, 279 int type, 280 int nonblocking) 281 { 282 struct xfs_inode *ip = XFS_I(inode); 283 struct xfs_mount *mp = ip->i_mount; 284 ssize_t count = 1 << inode->i_blkbits; 285 xfs_fileoff_t offset_fsb, end_fsb; 286 int error = 0; 287 int bmapi_flags = XFS_BMAPI_ENTIRE; 288 int nimaps = 1; 289 290 if (XFS_FORCED_SHUTDOWN(mp)) 291 return -EIO; 292 293 if (type == XFS_IO_UNWRITTEN) 294 bmapi_flags |= XFS_BMAPI_IGSTATE; 295 296 if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) { 297 if (nonblocking) 298 return -EAGAIN; 299 xfs_ilock(ip, XFS_ILOCK_SHARED); 300 } 301 302 ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE || 303 (ip->i_df.if_flags & XFS_IFEXTENTS)); 304 ASSERT(offset <= mp->m_super->s_maxbytes); 305 306 if (offset + count > mp->m_super->s_maxbytes) 307 count = mp->m_super->s_maxbytes - offset; 308 end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count); 309 offset_fsb = XFS_B_TO_FSBT(mp, offset); 310 error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, 311 imap, &nimaps, bmapi_flags); 312 xfs_iunlock(ip, XFS_ILOCK_SHARED); 313 314 if (error) 315 return error; 316 317 if (type == XFS_IO_DELALLOC && 318 (!nimaps || isnullstartblock(imap->br_startblock))) { 319 error = xfs_iomap_write_allocate(ip, offset, imap); 320 if (!error) 321 trace_xfs_map_blocks_alloc(ip, offset, count, type, imap); 322 return error; 323 } 324 325 #ifdef DEBUG 326 if (type == XFS_IO_UNWRITTEN) { 327 ASSERT(nimaps); 328 ASSERT(imap->br_startblock != HOLESTARTBLOCK); 329 ASSERT(imap->br_startblock != DELAYSTARTBLOCK); 330 } 331 #endif 332 if (nimaps) 333 trace_xfs_map_blocks_found(ip, offset, count, type, imap); 334 return 0; 335 } 336 337 STATIC int 338 xfs_imap_valid( 339 struct inode *inode, 340 struct xfs_bmbt_irec *imap, 341 xfs_off_t offset) 342 { 343 offset >>= inode->i_blkbits; 344 345 return offset >= imap->br_startoff && 346 offset < imap->br_startoff + imap->br_blockcount; 347 } 348 349 /* 350 * BIO completion handler for buffered IO. 351 */ 352 STATIC void 353 xfs_end_bio( 354 struct bio *bio) 355 { 356 xfs_ioend_t *ioend = bio->bi_private; 357 358 ioend->io_error = bio->bi_error; 359 360 /* Toss bio and pass work off to an xfsdatad thread */ 361 bio->bi_private = NULL; 362 bio->bi_end_io = NULL; 363 bio_put(bio); 364 365 xfs_finish_ioend(ioend); 366 } 367 368 STATIC void 369 xfs_submit_ioend_bio( 370 struct writeback_control *wbc, 371 xfs_ioend_t *ioend, 372 struct bio *bio) 373 { 374 atomic_inc(&ioend->io_remaining); 375 bio->bi_private = ioend; 376 bio->bi_end_io = xfs_end_bio; 377 submit_bio(wbc->sync_mode == WB_SYNC_ALL ? 
		   WRITE_SYNC : WRITE, bio);
}

STATIC struct bio *
xfs_alloc_ioend_bio(
	struct buffer_head *bh)
{
	struct bio *bio = bio_alloc(GFP_NOIO, BIO_MAX_PAGES);

	ASSERT(bio->bi_private == NULL);
	bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
	bio->bi_bdev = bh->b_bdev;
	return bio;
}

STATIC void
xfs_start_buffer_writeback(
	struct buffer_head *bh)
{
	ASSERT(buffer_mapped(bh));
	ASSERT(buffer_locked(bh));
	ASSERT(!buffer_delay(bh));
	ASSERT(!buffer_unwritten(bh));

	mark_buffer_async_write(bh);
	set_buffer_uptodate(bh);
	clear_buffer_dirty(bh);
}

STATIC void
xfs_start_page_writeback(
	struct page *page,
	int clear_dirty,
	int buffers)
{
	ASSERT(PageLocked(page));
	ASSERT(!PageWriteback(page));

	/*
	 * If the page was not fully cleaned, we need to ensure that the higher
	 * layers come back to it correctly. That means we need to keep the page
	 * dirty, and for WB_SYNC_ALL writeback we need to ensure the
	 * PAGECACHE_TAG_TOWRITE index mark is not removed so another attempt to
	 * write this page in this writeback sweep will be made.
	 */
	if (clear_dirty) {
		clear_page_dirty_for_io(page);
		set_page_writeback(page);
	} else
		set_page_writeback_keepwrite(page);

	unlock_page(page);

	/* If no buffers on the page are to be written, finish it here */
	if (!buffers)
		end_page_writeback(page);
}

static inline int xfs_bio_add_buffer(struct bio *bio, struct buffer_head *bh)
{
	return bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh));
}

/*
 * Submit all of the bios for all of the ioends we have saved up, covering the
 * initial writepage page and also any probed pages.
 *
 * Because we may have multiple ioends spanning a page, we need to start
 * writeback on all the buffers before we submit them for I/O. If we mark the
 * buffers as we go, we can end up with a page that only has some buffers
 * marked async write, and I/O completion on those buffers can occur before we
 * mark the remaining buffers on the page async write.
 *
 * The end result of this is that we trip a bug in end_page_writeback() because
 * we call it twice for the one page as the code in end_buffer_async_write()
 * assumes that all buffers on the page are started at the same time.
 *
 * The fix is two passes across the ioend list - one to start writeback on the
 * buffer_heads, and then submit them for I/O on the second pass.
 *
 * If @fail is non-zero, it means that we have a situation where some part of
 * the submission process has failed after we have marked pages for writeback
 * and unlocked them. In this situation, we need to fail the ioend chain rather
 * than submit it to IO. This typically only happens on a filesystem shutdown.
 */
STATIC void
xfs_submit_ioend(
	struct writeback_control *wbc,
	xfs_ioend_t *ioend,
	int fail)
{
	xfs_ioend_t *head = ioend;
	xfs_ioend_t *next;
	struct buffer_head *bh;
	struct bio *bio;
	sector_t lastblock = 0;

	/* Pass 1 - start writeback */
	do {
		next = ioend->io_list;
		for (bh = ioend->io_buffer_head; bh; bh = bh->b_private)
			xfs_start_buffer_writeback(bh);
	} while ((ioend = next) != NULL);

	/* Pass 2 - submit I/O */
	ioend = head;
	do {
		next = ioend->io_list;
		bio = NULL;

		/*
		 * If we are failing the IO now, just mark the ioend with an
		 * error and finish it. This will run IO completion immediately
		 * as there is only one reference to the ioend at this point in
		 * time.
		 */
		if (fail) {
			ioend->io_error = fail;
			xfs_finish_ioend(ioend);
			continue;
		}

		for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) {

			if (!bio) {
 retry:
				bio = xfs_alloc_ioend_bio(bh);
			} else if (bh->b_blocknr != lastblock + 1) {
				xfs_submit_ioend_bio(wbc, ioend, bio);
				goto retry;
			}

			if (xfs_bio_add_buffer(bio, bh) != bh->b_size) {
				xfs_submit_ioend_bio(wbc, ioend, bio);
				goto retry;
			}

			lastblock = bh->b_blocknr;
		}
		if (bio)
			xfs_submit_ioend_bio(wbc, ioend, bio);
		xfs_finish_ioend(ioend);
	} while ((ioend = next) != NULL);
}

/*
 * Cancel submission of all buffer_heads so far in this ioend.
 * Toss the ioend too. Only ever called for the initial page
 * in a writepage request, so only ever one page.
 */
STATIC void
xfs_cancel_ioend(
	xfs_ioend_t *ioend)
{
	xfs_ioend_t *next;
	struct buffer_head *bh, *next_bh;

	do {
		next = ioend->io_list;
		bh = ioend->io_buffer_head;
		do {
			next_bh = bh->b_private;
			clear_buffer_async_write(bh);
			/*
			 * The unwritten flag is cleared when added to the
			 * ioend. We're not submitting for I/O so mark the
			 * buffer unwritten again for next time around.
			 */
			if (ioend->io_type == XFS_IO_UNWRITTEN)
				set_buffer_unwritten(bh);
			unlock_buffer(bh);
		} while ((bh = next_bh) != NULL);

		mempool_free(ioend, xfs_ioend_pool);
	} while ((ioend = next) != NULL);
}

/*
 * Test to see if we've been building up a completion structure for
 * earlier buffers -- if so, we try to append to this ioend if we
 * can, otherwise we start a new ioend, link it after any current one
 * via io_list, and return it in *result.
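 *
 * Within each ioend the buffers are chained through b_private, with
 * io_buffer_head and io_buffer_tail pointing at the first and last buffer;
 * xfs_submit_ioend() later walks these chains to build and submit the bios.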
559 */ 560 STATIC void 561 xfs_add_to_ioend( 562 struct inode *inode, 563 struct buffer_head *bh, 564 xfs_off_t offset, 565 unsigned int type, 566 xfs_ioend_t **result, 567 int need_ioend) 568 { 569 xfs_ioend_t *ioend = *result; 570 571 if (!ioend || need_ioend || type != ioend->io_type) { 572 xfs_ioend_t *previous = *result; 573 574 ioend = xfs_alloc_ioend(inode, type); 575 ioend->io_offset = offset; 576 ioend->io_buffer_head = bh; 577 ioend->io_buffer_tail = bh; 578 if (previous) 579 previous->io_list = ioend; 580 *result = ioend; 581 } else { 582 ioend->io_buffer_tail->b_private = bh; 583 ioend->io_buffer_tail = bh; 584 } 585 586 bh->b_private = NULL; 587 ioend->io_size += bh->b_size; 588 } 589 590 STATIC void 591 xfs_map_buffer( 592 struct inode *inode, 593 struct buffer_head *bh, 594 struct xfs_bmbt_irec *imap, 595 xfs_off_t offset) 596 { 597 sector_t bn; 598 struct xfs_mount *m = XFS_I(inode)->i_mount; 599 xfs_off_t iomap_offset = XFS_FSB_TO_B(m, imap->br_startoff); 600 xfs_daddr_t iomap_bn = xfs_fsb_to_db(XFS_I(inode), imap->br_startblock); 601 602 ASSERT(imap->br_startblock != HOLESTARTBLOCK); 603 ASSERT(imap->br_startblock != DELAYSTARTBLOCK); 604 605 bn = (iomap_bn >> (inode->i_blkbits - BBSHIFT)) + 606 ((offset - iomap_offset) >> inode->i_blkbits); 607 608 ASSERT(bn || XFS_IS_REALTIME_INODE(XFS_I(inode))); 609 610 bh->b_blocknr = bn; 611 set_buffer_mapped(bh); 612 } 613 614 STATIC void 615 xfs_map_at_offset( 616 struct inode *inode, 617 struct buffer_head *bh, 618 struct xfs_bmbt_irec *imap, 619 xfs_off_t offset) 620 { 621 ASSERT(imap->br_startblock != HOLESTARTBLOCK); 622 ASSERT(imap->br_startblock != DELAYSTARTBLOCK); 623 624 xfs_map_buffer(inode, bh, imap, offset); 625 set_buffer_mapped(bh); 626 clear_buffer_delay(bh); 627 clear_buffer_unwritten(bh); 628 } 629 630 /* 631 * Test if a given page contains at least one buffer of a given @type. 632 * If @check_all_buffers is true, then we walk all the buffers in the page to 633 * try to find one of the type passed in. If it is not set, then the caller only 634 * needs to check the first buffer on the page for a match. 635 */ 636 STATIC bool 637 xfs_check_page_type( 638 struct page *page, 639 unsigned int type, 640 bool check_all_buffers) 641 { 642 struct buffer_head *bh; 643 struct buffer_head *head; 644 645 if (PageWriteback(page)) 646 return false; 647 if (!page->mapping) 648 return false; 649 if (!page_has_buffers(page)) 650 return false; 651 652 bh = head = page_buffers(page); 653 do { 654 if (buffer_unwritten(bh)) { 655 if (type == XFS_IO_UNWRITTEN) 656 return true; 657 } else if (buffer_delay(bh)) { 658 if (type == XFS_IO_DELALLOC) 659 return true; 660 } else if (buffer_dirty(bh) && buffer_mapped(bh)) { 661 if (type == XFS_IO_OVERWRITE) 662 return true; 663 } 664 665 /* If we are only checking the first buffer, we are done now. */ 666 if (!check_all_buffers) 667 break; 668 } while ((bh = bh->b_this_page) != head); 669 670 return false; 671 } 672 673 /* 674 * Allocate & map buffers for page given the extent map. Write it out. 675 * except for the original page of a writepage, this is called on 676 * delalloc/unwritten pages only, for the original page it is possible 677 * that the page has no mapping at all. 
678 */ 679 STATIC int 680 xfs_convert_page( 681 struct inode *inode, 682 struct page *page, 683 loff_t tindex, 684 struct xfs_bmbt_irec *imap, 685 xfs_ioend_t **ioendp, 686 struct writeback_control *wbc) 687 { 688 struct buffer_head *bh, *head; 689 xfs_off_t end_offset; 690 unsigned long p_offset; 691 unsigned int type; 692 int len, page_dirty; 693 int count = 0, done = 0, uptodate = 1; 694 xfs_off_t offset = page_offset(page); 695 696 if (page->index != tindex) 697 goto fail; 698 if (!trylock_page(page)) 699 goto fail; 700 if (PageWriteback(page)) 701 goto fail_unlock_page; 702 if (page->mapping != inode->i_mapping) 703 goto fail_unlock_page; 704 if (!xfs_check_page_type(page, (*ioendp)->io_type, false)) 705 goto fail_unlock_page; 706 707 /* 708 * page_dirty is initially a count of buffers on the page before 709 * EOF and is decremented as we move each into a cleanable state. 710 * 711 * Derivation: 712 * 713 * End offset is the highest offset that this page should represent. 714 * If we are on the last page, (end_offset & (PAGE_CACHE_SIZE - 1)) 715 * will evaluate non-zero and be less than PAGE_CACHE_SIZE and 716 * hence give us the correct page_dirty count. On any other page, 717 * it will be zero and in that case we need page_dirty to be the 718 * count of buffers on the page. 719 */ 720 end_offset = min_t(unsigned long long, 721 (xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT, 722 i_size_read(inode)); 723 724 /* 725 * If the current map does not span the entire page we are about to try 726 * to write, then give up. The only way we can write a page that spans 727 * multiple mappings in a single writeback iteration is via the 728 * xfs_vm_writepage() function. Data integrity writeback requires the 729 * entire page to be written in a single attempt, otherwise the part of 730 * the page we don't write here doesn't get written as part of the data 731 * integrity sync. 732 * 733 * For normal writeback, we also don't attempt to write partial pages 734 * here as it simply means that write_cache_pages() will see it under 735 * writeback and ignore the page until some point in the future, at 736 * which time this will be the only page in the file that needs 737 * writeback. Hence for more optimal IO patterns, we should always 738 * avoid partial page writeback due to multiple mappings on a page here. 739 */ 740 if (!xfs_imap_valid(inode, imap, end_offset)) 741 goto fail_unlock_page; 742 743 len = 1 << inode->i_blkbits; 744 p_offset = min_t(unsigned long, end_offset & (PAGE_CACHE_SIZE - 1), 745 PAGE_CACHE_SIZE); 746 p_offset = p_offset ? roundup(p_offset, len) : PAGE_CACHE_SIZE; 747 page_dirty = p_offset / len; 748 749 /* 750 * The moment we find a buffer that doesn't match our current type 751 * specification or can't be written, abort the loop and start 752 * writeback. As per the above xfs_imap_valid() check, only 753 * xfs_vm_writepage() can handle partial page writeback fully - we are 754 * limited here to the buffers that are contiguous with the current 755 * ioend, and hence a buffer we can't write breaks that contiguity and 756 * we have to defer the rest of the IO to xfs_vm_writepage(). 
757 */ 758 bh = head = page_buffers(page); 759 do { 760 if (offset >= end_offset) 761 break; 762 if (!buffer_uptodate(bh)) 763 uptodate = 0; 764 if (!(PageUptodate(page) || buffer_uptodate(bh))) { 765 done = 1; 766 break; 767 } 768 769 if (buffer_unwritten(bh) || buffer_delay(bh) || 770 buffer_mapped(bh)) { 771 if (buffer_unwritten(bh)) 772 type = XFS_IO_UNWRITTEN; 773 else if (buffer_delay(bh)) 774 type = XFS_IO_DELALLOC; 775 else 776 type = XFS_IO_OVERWRITE; 777 778 /* 779 * imap should always be valid because of the above 780 * partial page end_offset check on the imap. 781 */ 782 ASSERT(xfs_imap_valid(inode, imap, offset)); 783 784 lock_buffer(bh); 785 if (type != XFS_IO_OVERWRITE) 786 xfs_map_at_offset(inode, bh, imap, offset); 787 xfs_add_to_ioend(inode, bh, offset, type, 788 ioendp, done); 789 790 page_dirty--; 791 count++; 792 } else { 793 done = 1; 794 break; 795 } 796 } while (offset += len, (bh = bh->b_this_page) != head); 797 798 if (uptodate && bh == head) 799 SetPageUptodate(page); 800 801 if (count) { 802 if (--wbc->nr_to_write <= 0 && 803 wbc->sync_mode == WB_SYNC_NONE) 804 done = 1; 805 } 806 xfs_start_page_writeback(page, !page_dirty, count); 807 808 return done; 809 fail_unlock_page: 810 unlock_page(page); 811 fail: 812 return 1; 813 } 814 815 /* 816 * Convert & write out a cluster of pages in the same extent as defined 817 * by mp and following the start page. 818 */ 819 STATIC void 820 xfs_cluster_write( 821 struct inode *inode, 822 pgoff_t tindex, 823 struct xfs_bmbt_irec *imap, 824 xfs_ioend_t **ioendp, 825 struct writeback_control *wbc, 826 pgoff_t tlast) 827 { 828 struct pagevec pvec; 829 int done = 0, i; 830 831 pagevec_init(&pvec, 0); 832 while (!done && tindex <= tlast) { 833 unsigned len = min_t(pgoff_t, PAGEVEC_SIZE, tlast - tindex + 1); 834 835 if (!pagevec_lookup(&pvec, inode->i_mapping, tindex, len)) 836 break; 837 838 for (i = 0; i < pagevec_count(&pvec); i++) { 839 done = xfs_convert_page(inode, pvec.pages[i], tindex++, 840 imap, ioendp, wbc); 841 if (done) 842 break; 843 } 844 845 pagevec_release(&pvec); 846 cond_resched(); 847 } 848 } 849 850 STATIC void 851 xfs_vm_invalidatepage( 852 struct page *page, 853 unsigned int offset, 854 unsigned int length) 855 { 856 trace_xfs_invalidatepage(page->mapping->host, page, offset, 857 length); 858 block_invalidatepage(page, offset, length); 859 } 860 861 /* 862 * If the page has delalloc buffers on it, we need to punch them out before we 863 * invalidate the page. If we don't, we leave a stale delalloc mapping on the 864 * inode that can trip a BUG() in xfs_get_blocks() later on if a direct IO read 865 * is done on that same region - the delalloc extent is returned when none is 866 * supposed to be there. 867 * 868 * We prevent this by truncating away the delalloc regions on the page before 869 * invalidating it. Because they are delalloc, we can do this without needing a 870 * transaction. Indeed - if we get ENOSPC errors, we have to be able to do this 871 * truncation without a transaction as there is no space left for block 872 * reservation (typically why we see a ENOSPC in writeback). 873 * 874 * This is not a performance critical path, so for now just do the punching a 875 * buffer head at a time. 
876 */ 877 STATIC void 878 xfs_aops_discard_page( 879 struct page *page) 880 { 881 struct inode *inode = page->mapping->host; 882 struct xfs_inode *ip = XFS_I(inode); 883 struct buffer_head *bh, *head; 884 loff_t offset = page_offset(page); 885 886 if (!xfs_check_page_type(page, XFS_IO_DELALLOC, true)) 887 goto out_invalidate; 888 889 if (XFS_FORCED_SHUTDOWN(ip->i_mount)) 890 goto out_invalidate; 891 892 xfs_alert(ip->i_mount, 893 "page discard on page %p, inode 0x%llx, offset %llu.", 894 page, ip->i_ino, offset); 895 896 xfs_ilock(ip, XFS_ILOCK_EXCL); 897 bh = head = page_buffers(page); 898 do { 899 int error; 900 xfs_fileoff_t start_fsb; 901 902 if (!buffer_delay(bh)) 903 goto next_buffer; 904 905 start_fsb = XFS_B_TO_FSBT(ip->i_mount, offset); 906 error = xfs_bmap_punch_delalloc_range(ip, start_fsb, 1); 907 if (error) { 908 /* something screwed, just bail */ 909 if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { 910 xfs_alert(ip->i_mount, 911 "page discard unable to remove delalloc mapping."); 912 } 913 break; 914 } 915 next_buffer: 916 offset += 1 << inode->i_blkbits; 917 918 } while ((bh = bh->b_this_page) != head); 919 920 xfs_iunlock(ip, XFS_ILOCK_EXCL); 921 out_invalidate: 922 xfs_vm_invalidatepage(page, 0, PAGE_CACHE_SIZE); 923 return; 924 } 925 926 /* 927 * Write out a dirty page. 928 * 929 * For delalloc space on the page we need to allocate space and flush it. 930 * For unwritten space on the page we need to start the conversion to 931 * regular allocated space. 932 * For any other dirty buffer heads on the page we should flush them. 933 */ 934 STATIC int 935 xfs_vm_writepage( 936 struct page *page, 937 struct writeback_control *wbc) 938 { 939 struct inode *inode = page->mapping->host; 940 struct buffer_head *bh, *head; 941 struct xfs_bmbt_irec imap; 942 xfs_ioend_t *ioend = NULL, *iohead = NULL; 943 loff_t offset; 944 unsigned int type; 945 __uint64_t end_offset; 946 pgoff_t end_index, last_index; 947 ssize_t len; 948 int err, imap_valid = 0, uptodate = 1; 949 int count = 0; 950 int nonblocking = 0; 951 952 trace_xfs_writepage(inode, page, 0, 0); 953 954 ASSERT(page_has_buffers(page)); 955 956 /* 957 * Refuse to write the page out if we are called from reclaim context. 958 * 959 * This avoids stack overflows when called from deeply used stacks in 960 * random callers for direct reclaim or memcg reclaim. We explicitly 961 * allow reclaim from kswapd as the stack usage there is relatively low. 962 * 963 * This should never happen except in the case of a VM regression so 964 * warn about it. 965 */ 966 if (WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD)) == 967 PF_MEMALLOC)) 968 goto redirty; 969 970 /* 971 * Given that we do not allow direct reclaim to call us, we should 972 * never be called while in a filesystem transaction. 973 */ 974 if (WARN_ON_ONCE(current->flags & PF_FSTRANS)) 975 goto redirty; 976 977 /* Is this page beyond the end of the file? */ 978 offset = i_size_read(inode); 979 end_index = offset >> PAGE_CACHE_SHIFT; 980 last_index = (offset - 1) >> PAGE_CACHE_SHIFT; 981 982 /* 983 * The page index is less than the end_index, adjust the end_offset 984 * to the highest offset that this page should represent. 985 * ----------------------------------------------------- 986 * | file mapping | <EOF> | 987 * ----------------------------------------------------- 988 * | Page ... 
| Page N-2 | Page N-1 | Page N | | 989 * ^--------------------------------^----------|-------- 990 * | desired writeback range | see else | 991 * ---------------------------------^------------------| 992 */ 993 if (page->index < end_index) 994 end_offset = (xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT; 995 else { 996 /* 997 * Check whether the page to write out is beyond or straddles 998 * i_size or not. 999 * ------------------------------------------------------- 1000 * | file mapping | <EOF> | 1001 * ------------------------------------------------------- 1002 * | Page ... | Page N-2 | Page N-1 | Page N | Beyond | 1003 * ^--------------------------------^-----------|--------- 1004 * | | Straddles | 1005 * ---------------------------------^-----------|--------| 1006 */ 1007 unsigned offset_into_page = offset & (PAGE_CACHE_SIZE - 1); 1008 1009 /* 1010 * Skip the page if it is fully outside i_size, e.g. due to a 1011 * truncate operation that is in progress. We must redirty the 1012 * page so that reclaim stops reclaiming it. Otherwise 1013 * xfs_vm_releasepage() is called on it and gets confused. 1014 * 1015 * Note that the end_index is unsigned long, it would overflow 1016 * if the given offset is greater than 16TB on 32-bit system 1017 * and if we do check the page is fully outside i_size or not 1018 * via "if (page->index >= end_index + 1)" as "end_index + 1" 1019 * will be evaluated to 0. Hence this page will be redirtied 1020 * and be written out repeatedly which would result in an 1021 * infinite loop, the user program that perform this operation 1022 * will hang. Instead, we can verify this situation by checking 1023 * if the page to write is totally beyond the i_size or if it's 1024 * offset is just equal to the EOF. 1025 */ 1026 if (page->index > end_index || 1027 (page->index == end_index && offset_into_page == 0)) 1028 goto redirty; 1029 1030 /* 1031 * The page straddles i_size. It must be zeroed out on each 1032 * and every writepage invocation because it may be mmapped. 1033 * "A file is mapped in multiples of the page size. For a file 1034 * that is not a multiple of the page size, the remaining 1035 * memory is zeroed when mapped, and writes to that region are 1036 * not written out to the file." 1037 */ 1038 zero_user_segment(page, offset_into_page, PAGE_CACHE_SIZE); 1039 1040 /* Adjust the end_offset to the end of file */ 1041 end_offset = offset; 1042 } 1043 1044 len = 1 << inode->i_blkbits; 1045 1046 bh = head = page_buffers(page); 1047 offset = page_offset(page); 1048 type = XFS_IO_OVERWRITE; 1049 1050 if (wbc->sync_mode == WB_SYNC_NONE) 1051 nonblocking = 1; 1052 1053 do { 1054 int new_ioend = 0; 1055 1056 if (offset >= end_offset) 1057 break; 1058 if (!buffer_uptodate(bh)) 1059 uptodate = 0; 1060 1061 /* 1062 * set_page_dirty dirties all buffers in a page, independent 1063 * of their state. The dirty state however is entirely 1064 * meaningless for holes (!mapped && uptodate), so skip 1065 * buffers covering holes here. 
		 */
		if (!buffer_mapped(bh) && buffer_uptodate(bh)) {
			imap_valid = 0;
			continue;
		}

		if (buffer_unwritten(bh)) {
			if (type != XFS_IO_UNWRITTEN) {
				type = XFS_IO_UNWRITTEN;
				imap_valid = 0;
			}
		} else if (buffer_delay(bh)) {
			if (type != XFS_IO_DELALLOC) {
				type = XFS_IO_DELALLOC;
				imap_valid = 0;
			}
		} else if (buffer_uptodate(bh)) {
			if (type != XFS_IO_OVERWRITE) {
				type = XFS_IO_OVERWRITE;
				imap_valid = 0;
			}
		} else {
			if (PageUptodate(page))
				ASSERT(buffer_mapped(bh));
			/*
			 * This buffer is not uptodate and will not be
			 * written to disk. Ensure that we will put any
			 * subsequent writeable buffers into a new
			 * ioend.
			 */
			imap_valid = 0;
			continue;
		}

		if (imap_valid)
			imap_valid = xfs_imap_valid(inode, &imap, offset);
		if (!imap_valid) {
			/*
			 * If we didn't have a valid mapping then we need to
			 * put the new mapping into a separate ioend structure.
			 * This ensures non-contiguous extents always have
			 * separate ioends, which is particularly important
			 * for unwritten extent conversion at I/O completion
			 * time.
			 */
			new_ioend = 1;
			err = xfs_map_blocks(inode, offset, &imap, type,
					     nonblocking);
			if (err)
				goto error;
			imap_valid = xfs_imap_valid(inode, &imap, offset);
		}
		if (imap_valid) {
			lock_buffer(bh);
			if (type != XFS_IO_OVERWRITE)
				xfs_map_at_offset(inode, bh, &imap, offset);
			xfs_add_to_ioend(inode, bh, offset, type, &ioend,
					 new_ioend);
			count++;
		}

		if (!iohead)
			iohead = ioend;

	} while (offset += len, ((bh = bh->b_this_page) != head));

	if (uptodate && bh == head)
		SetPageUptodate(page);

	xfs_start_page_writeback(page, 1, count);

	/* if there is no IO to be submitted for this page, we are done */
	if (!ioend)
		return 0;

	ASSERT(iohead);

	/*
	 * Any errors from this point onwards need to be reported through the
	 * IO completion path as we have marked the initial page as under
	 * writeback and unlocked it.
	 */
	if (imap_valid) {
		xfs_off_t end_index;

		end_index = imap.br_startoff + imap.br_blockcount;

		/* to bytes */
		end_index <<= inode->i_blkbits;

		/* to pages */
		end_index = (end_index - 1) >> PAGE_CACHE_SHIFT;

		/* check against file size */
		if (end_index > last_index)
			end_index = last_index;

		xfs_cluster_write(inode, page->index + 1, &imap, &ioend,
				  wbc, end_index);
	}

	/*
	 * Reserve log space if we might write beyond the on-disk inode size.
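	 * The transaction is allocated and reserved here, in the submission
	 * path, and stashed in ioend->io_append_trans; the on-disk size
	 * update is then committed from the completion workqueue via
	 * xfs_setfilesize_ioend().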
1170 */ 1171 err = 0; 1172 if (ioend->io_type != XFS_IO_UNWRITTEN && xfs_ioend_is_append(ioend)) 1173 err = xfs_setfilesize_trans_alloc(ioend); 1174 1175 xfs_submit_ioend(wbc, iohead, err); 1176 1177 return 0; 1178 1179 error: 1180 if (iohead) 1181 xfs_cancel_ioend(iohead); 1182 1183 if (err == -EAGAIN) 1184 goto redirty; 1185 1186 xfs_aops_discard_page(page); 1187 ClearPageUptodate(page); 1188 unlock_page(page); 1189 return err; 1190 1191 redirty: 1192 redirty_page_for_writepage(wbc, page); 1193 unlock_page(page); 1194 return 0; 1195 } 1196 1197 STATIC int 1198 xfs_vm_writepages( 1199 struct address_space *mapping, 1200 struct writeback_control *wbc) 1201 { 1202 xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED); 1203 return generic_writepages(mapping, wbc); 1204 } 1205 1206 /* 1207 * Called to move a page into cleanable state - and from there 1208 * to be released. The page should already be clean. We always 1209 * have buffer heads in this call. 1210 * 1211 * Returns 1 if the page is ok to release, 0 otherwise. 1212 */ 1213 STATIC int 1214 xfs_vm_releasepage( 1215 struct page *page, 1216 gfp_t gfp_mask) 1217 { 1218 int delalloc, unwritten; 1219 1220 trace_xfs_releasepage(page->mapping->host, page, 0, 0); 1221 1222 xfs_count_page_state(page, &delalloc, &unwritten); 1223 1224 if (WARN_ON_ONCE(delalloc)) 1225 return 0; 1226 if (WARN_ON_ONCE(unwritten)) 1227 return 0; 1228 1229 return try_to_free_buffers(page); 1230 } 1231 1232 /* 1233 * When we map a DIO buffer, we may need to attach an ioend that describes the 1234 * type of write IO we are doing. This passes to the completion function the 1235 * operations it needs to perform. If the mapping is for an overwrite wholly 1236 * within the EOF then we don't need an ioend and so we don't allocate one. 1237 * This avoids the unnecessary overhead of allocating and freeing ioends for 1238 * workloads that don't require transactions on IO completion. 1239 * 1240 * If we get multiple mappings in a single IO, we might be mapping different 1241 * types. But because the direct IO can only have a single private pointer, we 1242 * need to ensure that: 1243 * 1244 * a) i) the ioend spans the entire region of unwritten mappings; or 1245 * ii) the ioend spans all the mappings that cross or are beyond EOF; and 1246 * b) if it contains unwritten extents, it is *permanently* marked as such 1247 * 1248 * We could do this by chaining ioends like buffered IO does, but we only 1249 * actually get one IO completion callback from the direct IO, and that spans 1250 * the entire IO regardless of how many mappings and IOs are needed to complete 1251 * the DIO. There is only going to be one reference to the ioend and its life 1252 * cycle is constrained by the DIO completion code. hence we don't need 1253 * reference counting here. 
1254 */ 1255 static void 1256 xfs_map_direct( 1257 struct inode *inode, 1258 struct buffer_head *bh_result, 1259 struct xfs_bmbt_irec *imap, 1260 xfs_off_t offset) 1261 { 1262 struct xfs_ioend *ioend; 1263 xfs_off_t size = bh_result->b_size; 1264 int type; 1265 1266 if (ISUNWRITTEN(imap)) 1267 type = XFS_IO_UNWRITTEN; 1268 else 1269 type = XFS_IO_OVERWRITE; 1270 1271 trace_xfs_gbmap_direct(XFS_I(inode), offset, size, type, imap); 1272 1273 if (bh_result->b_private) { 1274 ioend = bh_result->b_private; 1275 ASSERT(ioend->io_size > 0); 1276 ASSERT(offset >= ioend->io_offset); 1277 if (offset + size > ioend->io_offset + ioend->io_size) 1278 ioend->io_size = offset - ioend->io_offset + size; 1279 1280 if (type == XFS_IO_UNWRITTEN && type != ioend->io_type) 1281 ioend->io_type = XFS_IO_UNWRITTEN; 1282 1283 trace_xfs_gbmap_direct_update(XFS_I(inode), ioend->io_offset, 1284 ioend->io_size, ioend->io_type, 1285 imap); 1286 } else if (type == XFS_IO_UNWRITTEN || 1287 offset + size > i_size_read(inode)) { 1288 ioend = xfs_alloc_ioend(inode, type); 1289 ioend->io_offset = offset; 1290 ioend->io_size = size; 1291 1292 bh_result->b_private = ioend; 1293 set_buffer_defer_completion(bh_result); 1294 1295 trace_xfs_gbmap_direct_new(XFS_I(inode), offset, size, type, 1296 imap); 1297 } else { 1298 trace_xfs_gbmap_direct_none(XFS_I(inode), offset, size, type, 1299 imap); 1300 } 1301 } 1302 1303 /* 1304 * If this is O_DIRECT or the mpage code calling tell them how large the mapping 1305 * is, so that we can avoid repeated get_blocks calls. 1306 * 1307 * If the mapping spans EOF, then we have to break the mapping up as the mapping 1308 * for blocks beyond EOF must be marked new so that sub block regions can be 1309 * correctly zeroed. We can't do this for mappings within EOF unless the mapping 1310 * was just allocated or is unwritten, otherwise the callers would overwrite 1311 * existing data with zeros. Hence we have to split the mapping into a range up 1312 * to and including EOF, and a second mapping for beyond EOF. 
1313 */ 1314 static void 1315 xfs_map_trim_size( 1316 struct inode *inode, 1317 sector_t iblock, 1318 struct buffer_head *bh_result, 1319 struct xfs_bmbt_irec *imap, 1320 xfs_off_t offset, 1321 ssize_t size) 1322 { 1323 xfs_off_t mapping_size; 1324 1325 mapping_size = imap->br_startoff + imap->br_blockcount - iblock; 1326 mapping_size <<= inode->i_blkbits; 1327 1328 ASSERT(mapping_size > 0); 1329 if (mapping_size > size) 1330 mapping_size = size; 1331 if (offset < i_size_read(inode) && 1332 offset + mapping_size >= i_size_read(inode)) { 1333 /* limit mapping to block that spans EOF */ 1334 mapping_size = roundup_64(i_size_read(inode) - offset, 1335 1 << inode->i_blkbits); 1336 } 1337 if (mapping_size > LONG_MAX) 1338 mapping_size = LONG_MAX; 1339 1340 bh_result->b_size = mapping_size; 1341 } 1342 1343 STATIC int 1344 __xfs_get_blocks( 1345 struct inode *inode, 1346 sector_t iblock, 1347 struct buffer_head *bh_result, 1348 int create, 1349 bool direct) 1350 { 1351 struct xfs_inode *ip = XFS_I(inode); 1352 struct xfs_mount *mp = ip->i_mount; 1353 xfs_fileoff_t offset_fsb, end_fsb; 1354 int error = 0; 1355 int lockmode = 0; 1356 struct xfs_bmbt_irec imap; 1357 int nimaps = 1; 1358 xfs_off_t offset; 1359 ssize_t size; 1360 int new = 0; 1361 1362 if (XFS_FORCED_SHUTDOWN(mp)) 1363 return -EIO; 1364 1365 offset = (xfs_off_t)iblock << inode->i_blkbits; 1366 ASSERT(bh_result->b_size >= (1 << inode->i_blkbits)); 1367 size = bh_result->b_size; 1368 1369 if (!create && direct && offset >= i_size_read(inode)) 1370 return 0; 1371 1372 /* 1373 * Direct I/O is usually done on preallocated files, so try getting 1374 * a block mapping without an exclusive lock first. For buffered 1375 * writes we already have the exclusive iolock anyway, so avoiding 1376 * a lock roundtrip here by taking the ilock exclusive from the 1377 * beginning is a useful micro optimization. 1378 */ 1379 if (create && !direct) { 1380 lockmode = XFS_ILOCK_EXCL; 1381 xfs_ilock(ip, lockmode); 1382 } else { 1383 lockmode = xfs_ilock_data_map_shared(ip); 1384 } 1385 1386 ASSERT(offset <= mp->m_super->s_maxbytes); 1387 if (offset + size > mp->m_super->s_maxbytes) 1388 size = mp->m_super->s_maxbytes - offset; 1389 end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + size); 1390 offset_fsb = XFS_B_TO_FSBT(mp, offset); 1391 1392 error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, 1393 &imap, &nimaps, XFS_BMAPI_ENTIRE); 1394 if (error) 1395 goto out_unlock; 1396 1397 if (create && 1398 (!nimaps || 1399 (imap.br_startblock == HOLESTARTBLOCK || 1400 imap.br_startblock == DELAYSTARTBLOCK))) { 1401 if (direct || xfs_get_extsz_hint(ip)) { 1402 /* 1403 * Drop the ilock in preparation for starting the block 1404 * allocation transaction. It will be retaken 1405 * exclusively inside xfs_iomap_write_direct for the 1406 * actual allocation. 1407 */ 1408 xfs_iunlock(ip, lockmode); 1409 error = xfs_iomap_write_direct(ip, offset, size, 1410 &imap, nimaps); 1411 if (error) 1412 return error; 1413 new = 1; 1414 1415 } else { 1416 /* 1417 * Delalloc reservations do not require a transaction, 1418 * we can go on without dropping the lock here. If we 1419 * are allocating a new delalloc block, make sure that 1420 * we set the new flag so that we mark the buffer new so 1421 * that we know that it is newly allocated if the write 1422 * fails. 
1423 */ 1424 if (nimaps && imap.br_startblock == HOLESTARTBLOCK) 1425 new = 1; 1426 error = xfs_iomap_write_delay(ip, offset, size, &imap); 1427 if (error) 1428 goto out_unlock; 1429 1430 xfs_iunlock(ip, lockmode); 1431 } 1432 trace_xfs_get_blocks_alloc(ip, offset, size, 1433 ISUNWRITTEN(&imap) ? XFS_IO_UNWRITTEN 1434 : XFS_IO_DELALLOC, &imap); 1435 } else if (nimaps) { 1436 trace_xfs_get_blocks_found(ip, offset, size, 1437 ISUNWRITTEN(&imap) ? XFS_IO_UNWRITTEN 1438 : XFS_IO_OVERWRITE, &imap); 1439 xfs_iunlock(ip, lockmode); 1440 } else { 1441 trace_xfs_get_blocks_notfound(ip, offset, size); 1442 goto out_unlock; 1443 } 1444 1445 /* trim mapping down to size requested */ 1446 if (direct || size > (1 << inode->i_blkbits)) 1447 xfs_map_trim_size(inode, iblock, bh_result, 1448 &imap, offset, size); 1449 1450 /* 1451 * For unwritten extents do not report a disk address in the buffered 1452 * read case (treat as if we're reading into a hole). 1453 */ 1454 if (imap.br_startblock != HOLESTARTBLOCK && 1455 imap.br_startblock != DELAYSTARTBLOCK && 1456 (create || !ISUNWRITTEN(&imap))) { 1457 xfs_map_buffer(inode, bh_result, &imap, offset); 1458 if (ISUNWRITTEN(&imap)) 1459 set_buffer_unwritten(bh_result); 1460 /* direct IO needs special help */ 1461 if (create && direct) 1462 xfs_map_direct(inode, bh_result, &imap, offset); 1463 } 1464 1465 /* 1466 * If this is a realtime file, data may be on a different device. 1467 * to that pointed to from the buffer_head b_bdev currently. 1468 */ 1469 bh_result->b_bdev = xfs_find_bdev_for_inode(inode); 1470 1471 /* 1472 * If we previously allocated a block out beyond eof and we are now 1473 * coming back to use it then we will need to flag it as new even if it 1474 * has a disk address. 1475 * 1476 * With sub-block writes into unwritten extents we also need to mark 1477 * the buffer as new so that the unwritten parts of the buffer gets 1478 * correctly zeroed. 1479 */ 1480 if (create && 1481 ((!buffer_mapped(bh_result) && !buffer_uptodate(bh_result)) || 1482 (offset >= i_size_read(inode)) || 1483 (new || ISUNWRITTEN(&imap)))) 1484 set_buffer_new(bh_result); 1485 1486 if (imap.br_startblock == DELAYSTARTBLOCK) { 1487 BUG_ON(direct); 1488 if (create) { 1489 set_buffer_uptodate(bh_result); 1490 set_buffer_mapped(bh_result); 1491 set_buffer_delay(bh_result); 1492 } 1493 } 1494 1495 return 0; 1496 1497 out_unlock: 1498 xfs_iunlock(ip, lockmode); 1499 return error; 1500 } 1501 1502 int 1503 xfs_get_blocks( 1504 struct inode *inode, 1505 sector_t iblock, 1506 struct buffer_head *bh_result, 1507 int create) 1508 { 1509 return __xfs_get_blocks(inode, iblock, bh_result, create, false); 1510 } 1511 1512 int 1513 xfs_get_blocks_direct( 1514 struct inode *inode, 1515 sector_t iblock, 1516 struct buffer_head *bh_result, 1517 int create) 1518 { 1519 return __xfs_get_blocks(inode, iblock, bh_result, create, true); 1520 } 1521 1522 static void 1523 __xfs_end_io_direct_write( 1524 struct inode *inode, 1525 struct xfs_ioend *ioend, 1526 loff_t offset, 1527 ssize_t size) 1528 { 1529 struct xfs_mount *mp = XFS_I(inode)->i_mount; 1530 1531 if (XFS_FORCED_SHUTDOWN(mp) || ioend->io_error) 1532 goto out_end_io; 1533 1534 /* 1535 * dio completion end_io functions are only called on writes if more 1536 * than 0 bytes was written. 1537 */ 1538 ASSERT(size > 0); 1539 1540 /* 1541 * The ioend only maps whole blocks, while the IO may be sector aligned. 1542 * Hence the ioend offset/size may not match the IO offset/size exactly. 
1543 * Because we don't map overwrites within EOF into the ioend, the offset 1544 * may not match, but only if the endio spans EOF. Either way, write 1545 * the IO sizes into the ioend so that completion processing does the 1546 * right thing. 1547 */ 1548 ASSERT(offset + size <= ioend->io_offset + ioend->io_size); 1549 ioend->io_size = size; 1550 ioend->io_offset = offset; 1551 1552 /* 1553 * The ioend tells us whether we are doing unwritten extent conversion 1554 * or an append transaction that updates the on-disk file size. These 1555 * cases are the only cases where we should *potentially* be needing 1556 * to update the VFS inode size. 1557 * 1558 * We need to update the in-core inode size here so that we don't end up 1559 * with the on-disk inode size being outside the in-core inode size. We 1560 * have no other method of updating EOF for AIO, so always do it here 1561 * if necessary. 1562 * 1563 * We need to lock the test/set EOF update as we can be racing with 1564 * other IO completions here to update the EOF. Failing to serialise 1565 * here can result in EOF moving backwards and Bad Things Happen when 1566 * that occurs. 1567 */ 1568 spin_lock(&XFS_I(inode)->i_flags_lock); 1569 if (offset + size > i_size_read(inode)) 1570 i_size_write(inode, offset + size); 1571 spin_unlock(&XFS_I(inode)->i_flags_lock); 1572 1573 /* 1574 * If we are doing an append IO that needs to update the EOF on disk, 1575 * do the transaction reserve now so we can use common end io 1576 * processing. Stashing the error (if there is one) in the ioend will 1577 * result in the ioend processing passing on the error if it is 1578 * possible as we can't return it from here. 1579 */ 1580 if (ioend->io_type == XFS_IO_OVERWRITE) 1581 ioend->io_error = xfs_setfilesize_trans_alloc(ioend); 1582 1583 out_end_io: 1584 xfs_end_io(&ioend->io_work); 1585 return; 1586 } 1587 1588 /* 1589 * Complete a direct I/O write request. 1590 * 1591 * The ioend structure is passed from __xfs_get_blocks() to tell us what to do. 1592 * If no ioend exists (i.e. @private == NULL) then the write IO is an overwrite 1593 * wholly within the EOF and so there is nothing for us to do. Note that in this 1594 * case the completion can be called in interrupt context, whereas if we have an 1595 * ioend we will always be called in task context (i.e. from a workqueue). 1596 */ 1597 STATIC void 1598 xfs_end_io_direct_write( 1599 struct kiocb *iocb, 1600 loff_t offset, 1601 ssize_t size, 1602 void *private) 1603 { 1604 struct inode *inode = file_inode(iocb->ki_filp); 1605 struct xfs_ioend *ioend = private; 1606 1607 trace_xfs_gbmap_direct_endio(XFS_I(inode), offset, size, 1608 ioend ? ioend->io_type : 0, NULL); 1609 1610 if (!ioend) { 1611 ASSERT(offset + size <= i_size_read(inode)); 1612 return; 1613 } 1614 1615 __xfs_end_io_direct_write(inode, ioend, offset, size); 1616 } 1617 1618 /* 1619 * For DAX we need a mapping buffer callback for unwritten extent conversion 1620 * when page faults allocate blocks and then zero them. Note that in this 1621 * case the mapping indicated by the ioend may extend beyond EOF. We most 1622 * definitely do not want to extend EOF here, so we trim back the ioend size to 1623 * EOF. 
1624 */ 1625 #ifdef CONFIG_FS_DAX 1626 void 1627 xfs_end_io_dax_write( 1628 struct buffer_head *bh, 1629 int uptodate) 1630 { 1631 struct xfs_ioend *ioend = bh->b_private; 1632 struct inode *inode = ioend->io_inode; 1633 ssize_t size = ioend->io_size; 1634 1635 ASSERT(IS_DAX(ioend->io_inode)); 1636 1637 /* if there was an error zeroing, then don't convert it */ 1638 if (!uptodate) 1639 ioend->io_error = -EIO; 1640 1641 /* 1642 * Trim update to EOF, so we don't extend EOF during unwritten extent 1643 * conversion of partial EOF blocks. 1644 */ 1645 spin_lock(&XFS_I(inode)->i_flags_lock); 1646 if (ioend->io_offset + size > i_size_read(inode)) 1647 size = i_size_read(inode) - ioend->io_offset; 1648 spin_unlock(&XFS_I(inode)->i_flags_lock); 1649 1650 __xfs_end_io_direct_write(inode, ioend, ioend->io_offset, size); 1651 1652 } 1653 #else 1654 void xfs_end_io_dax_write(struct buffer_head *bh, int uptodate) { } 1655 #endif 1656 1657 static inline ssize_t 1658 xfs_vm_do_dio( 1659 struct inode *inode, 1660 struct kiocb *iocb, 1661 struct iov_iter *iter, 1662 loff_t offset, 1663 void (*endio)(struct kiocb *iocb, 1664 loff_t offset, 1665 ssize_t size, 1666 void *private), 1667 int flags) 1668 { 1669 struct block_device *bdev; 1670 1671 if (IS_DAX(inode)) 1672 return dax_do_io(iocb, inode, iter, offset, 1673 xfs_get_blocks_direct, endio, 0); 1674 1675 bdev = xfs_find_bdev_for_inode(inode); 1676 return __blockdev_direct_IO(iocb, inode, bdev, iter, offset, 1677 xfs_get_blocks_direct, endio, NULL, flags); 1678 } 1679 1680 STATIC ssize_t 1681 xfs_vm_direct_IO( 1682 struct kiocb *iocb, 1683 struct iov_iter *iter, 1684 loff_t offset) 1685 { 1686 struct inode *inode = iocb->ki_filp->f_mapping->host; 1687 1688 if (iov_iter_rw(iter) == WRITE) 1689 return xfs_vm_do_dio(inode, iocb, iter, offset, 1690 xfs_end_io_direct_write, DIO_ASYNC_EXTEND); 1691 return xfs_vm_do_dio(inode, iocb, iter, offset, NULL, 0); 1692 } 1693 1694 /* 1695 * Punch out the delalloc blocks we have already allocated. 1696 * 1697 * Don't bother with xfs_setattr given that nothing can have made it to disk yet 1698 * as the page is still locked at this point. 1699 */ 1700 STATIC void 1701 xfs_vm_kill_delalloc_range( 1702 struct inode *inode, 1703 loff_t start, 1704 loff_t end) 1705 { 1706 struct xfs_inode *ip = XFS_I(inode); 1707 xfs_fileoff_t start_fsb; 1708 xfs_fileoff_t end_fsb; 1709 int error; 1710 1711 start_fsb = XFS_B_TO_FSB(ip->i_mount, start); 1712 end_fsb = XFS_B_TO_FSB(ip->i_mount, end); 1713 if (end_fsb <= start_fsb) 1714 return; 1715 1716 xfs_ilock(ip, XFS_ILOCK_EXCL); 1717 error = xfs_bmap_punch_delalloc_range(ip, start_fsb, 1718 end_fsb - start_fsb); 1719 if (error) { 1720 /* something screwed, just bail */ 1721 if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { 1722 xfs_alert(ip->i_mount, 1723 "xfs_vm_write_failed: unable to clean up ino %lld", 1724 ip->i_ino); 1725 } 1726 } 1727 xfs_iunlock(ip, XFS_ILOCK_EXCL); 1728 } 1729 1730 STATIC void 1731 xfs_vm_write_failed( 1732 struct inode *inode, 1733 struct page *page, 1734 loff_t pos, 1735 unsigned len) 1736 { 1737 loff_t block_offset; 1738 loff_t block_start; 1739 loff_t block_end; 1740 loff_t from = pos & (PAGE_CACHE_SIZE - 1); 1741 loff_t to = from + len; 1742 struct buffer_head *bh, *head; 1743 1744 /* 1745 * The request pos offset might be 32 or 64 bit, this is all fine 1746 * on 64-bit platform. 
However, for 64-bit pos request on 32-bit 1747 * platform, the high 32-bit will be masked off if we evaluate the 1748 * block_offset via (pos & PAGE_MASK) because the PAGE_MASK is 1749 * 0xfffff000 as an unsigned long, hence the result is incorrect 1750 * which could cause the following ASSERT failed in most cases. 1751 * In order to avoid this, we can evaluate the block_offset of the 1752 * start of the page by using shifts rather than masks the mismatch 1753 * problem. 1754 */ 1755 block_offset = (pos >> PAGE_CACHE_SHIFT) << PAGE_CACHE_SHIFT; 1756 1757 ASSERT(block_offset + from == pos); 1758 1759 head = page_buffers(page); 1760 block_start = 0; 1761 for (bh = head; bh != head || !block_start; 1762 bh = bh->b_this_page, block_start = block_end, 1763 block_offset += bh->b_size) { 1764 block_end = block_start + bh->b_size; 1765 1766 /* skip buffers before the write */ 1767 if (block_end <= from) 1768 continue; 1769 1770 /* if the buffer is after the write, we're done */ 1771 if (block_start >= to) 1772 break; 1773 1774 if (!buffer_delay(bh)) 1775 continue; 1776 1777 if (!buffer_new(bh) && block_offset < i_size_read(inode)) 1778 continue; 1779 1780 xfs_vm_kill_delalloc_range(inode, block_offset, 1781 block_offset + bh->b_size); 1782 1783 /* 1784 * This buffer does not contain data anymore. make sure anyone 1785 * who finds it knows that for certain. 1786 */ 1787 clear_buffer_delay(bh); 1788 clear_buffer_uptodate(bh); 1789 clear_buffer_mapped(bh); 1790 clear_buffer_new(bh); 1791 clear_buffer_dirty(bh); 1792 } 1793 1794 } 1795 1796 /* 1797 * This used to call block_write_begin(), but it unlocks and releases the page 1798 * on error, and we need that page to be able to punch stale delalloc blocks out 1799 * on failure. hence we copy-n-waste it here and call xfs_vm_write_failed() at 1800 * the appropriate point. 1801 */ 1802 STATIC int 1803 xfs_vm_write_begin( 1804 struct file *file, 1805 struct address_space *mapping, 1806 loff_t pos, 1807 unsigned len, 1808 unsigned flags, 1809 struct page **pagep, 1810 void **fsdata) 1811 { 1812 pgoff_t index = pos >> PAGE_CACHE_SHIFT; 1813 struct page *page; 1814 int status; 1815 1816 ASSERT(len <= PAGE_CACHE_SIZE); 1817 1818 page = grab_cache_page_write_begin(mapping, index, flags); 1819 if (!page) 1820 return -ENOMEM; 1821 1822 status = __block_write_begin(page, pos, len, xfs_get_blocks); 1823 if (unlikely(status)) { 1824 struct inode *inode = mapping->host; 1825 size_t isize = i_size_read(inode); 1826 1827 xfs_vm_write_failed(inode, page, pos, len); 1828 unlock_page(page); 1829 1830 /* 1831 * If the write is beyond EOF, we only want to kill blocks 1832 * allocated in this write, not blocks that were previously 1833 * written successfully. 1834 */ 1835 if (pos + len > isize) { 1836 ssize_t start = max_t(ssize_t, pos, isize); 1837 1838 truncate_pagecache_range(inode, start, pos + len); 1839 } 1840 1841 page_cache_release(page); 1842 page = NULL; 1843 } 1844 1845 *pagep = page; 1846 return status; 1847 } 1848 1849 /* 1850 * On failure, we only need to kill delalloc blocks beyond EOF in the range of 1851 * this specific write because they will never be written. Previous writes 1852 * beyond EOF where block allocation succeeded do not need to be trashed, so 1853 * only new blocks from this write should be trashed. For blocks within 1854 * EOF, generic_write_end() zeros them so they are safe to leave alone and be 1855 * written with all the other valid data. 
 */
STATIC int
xfs_vm_write_end(
	struct file *file,
	struct address_space *mapping,
	loff_t pos,
	unsigned len,
	unsigned copied,
	struct page *page,
	void *fsdata)
{
	int ret;

	ASSERT(len <= PAGE_CACHE_SIZE);

	ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata);
	if (unlikely(ret < len)) {
		struct inode *inode = mapping->host;
		size_t isize = i_size_read(inode);
		loff_t to = pos + len;

		if (to > isize) {
			/* only kill blocks in this write beyond EOF */
			if (pos > isize)
				isize = pos;
			xfs_vm_kill_delalloc_range(inode, isize, to);
			truncate_pagecache_range(inode, isize, to);
		}
	}
	return ret;
}

STATIC sector_t
xfs_vm_bmap(
	struct address_space *mapping,
	sector_t block)
{
	struct inode *inode = (struct inode *)mapping->host;
	struct xfs_inode *ip = XFS_I(inode);

	trace_xfs_vm_bmap(XFS_I(inode));
	xfs_ilock(ip, XFS_IOLOCK_SHARED);
	filemap_write_and_wait(mapping);
	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
	return generic_block_bmap(mapping, block, xfs_get_blocks);
}

STATIC int
xfs_vm_readpage(
	struct file *unused,
	struct page *page)
{
	return mpage_readpage(page, xfs_get_blocks);
}

STATIC int
xfs_vm_readpages(
	struct file *unused,
	struct address_space *mapping,
	struct list_head *pages,
	unsigned nr_pages)
{
	return mpage_readpages(mapping, pages, nr_pages, xfs_get_blocks);
}

/*
 * This is basically a copy of __set_page_dirty_buffers() with one
 * small tweak: buffers beyond EOF do not get marked dirty. If we mark them
 * dirty, we'll never be able to clean them because we don't write buffers
 * beyond EOF, and that means we can't invalidate pages that span EOF
 * that have been marked dirty. Further, the dirty state can leak into
 * the file interior if the file is extended, resulting in all sorts of
 * bad things happening as the state does not match the underlying data.
 *
 * XXX: this really indicates that bufferheads in XFS need to die. Warts like
 * this only exist because of bufferheads and how the generic code manages them.
 */
STATIC int
xfs_vm_set_page_dirty(
	struct page *page)
{
	struct address_space *mapping = page->mapping;
	struct inode *inode = mapping->host;
	loff_t end_offset;
	loff_t offset;
	int newly_dirty;
	struct mem_cgroup *memcg;

	if (unlikely(!mapping))
		return !TestSetPageDirty(page);

	end_offset = i_size_read(inode);
	offset = page_offset(page);

	spin_lock(&mapping->private_lock);
	if (page_has_buffers(page)) {
		struct buffer_head *head = page_buffers(page);
		struct buffer_head *bh = head;

		do {
			if (offset < end_offset)
				set_buffer_dirty(bh);
			bh = bh->b_this_page;
			offset += 1 << inode->i_blkbits;
		} while (bh != head);
	}
	/*
	 * Use mem_cgroup_begin_page_stat() to keep PageDirty synchronized with
	 * per-memcg dirty page counters.
1965 */ 1966 memcg = mem_cgroup_begin_page_stat(page); 1967 newly_dirty = !TestSetPageDirty(page); 1968 spin_unlock(&mapping->private_lock); 1969 1970 if (newly_dirty) { 1971 /* sigh - __set_page_dirty() is static, so copy it here, too */ 1972 unsigned long flags; 1973 1974 spin_lock_irqsave(&mapping->tree_lock, flags); 1975 if (page->mapping) { /* Race with truncate? */ 1976 WARN_ON_ONCE(!PageUptodate(page)); 1977 account_page_dirtied(page, mapping, memcg); 1978 radix_tree_tag_set(&mapping->page_tree, 1979 page_index(page), PAGECACHE_TAG_DIRTY); 1980 } 1981 spin_unlock_irqrestore(&mapping->tree_lock, flags); 1982 } 1983 mem_cgroup_end_page_stat(memcg); 1984 if (newly_dirty) 1985 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); 1986 return newly_dirty; 1987 } 1988 1989 const struct address_space_operations xfs_address_space_operations = { 1990 .readpage = xfs_vm_readpage, 1991 .readpages = xfs_vm_readpages, 1992 .writepage = xfs_vm_writepage, 1993 .writepages = xfs_vm_writepages, 1994 .set_page_dirty = xfs_vm_set_page_dirty, 1995 .releasepage = xfs_vm_releasepage, 1996 .invalidatepage = xfs_vm_invalidatepage, 1997 .write_begin = xfs_vm_write_begin, 1998 .write_end = xfs_vm_write_end, 1999 .bmap = xfs_vm_bmap, 2000 .direct_IO = xfs_vm_direct_IO, 2001 .migratepage = buffer_migrate_page, 2002 .is_partially_uptodate = block_is_partially_uptodate, 2003 .error_remove_page = generic_error_remove_page, 2004 }; 2005