1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright (c) 2000-2006 Silicon Graphics, Inc. 4 * Copyright (c) 2016-2018 Christoph Hellwig. 5 * All Rights Reserved. 6 */ 7 #include "xfs.h" 8 #include "xfs_fs.h" 9 #include "xfs_shared.h" 10 #include "xfs_format.h" 11 #include "xfs_log_format.h" 12 #include "xfs_trans_resv.h" 13 #include "xfs_mount.h" 14 #include "xfs_inode.h" 15 #include "xfs_btree.h" 16 #include "xfs_bmap_btree.h" 17 #include "xfs_bmap.h" 18 #include "xfs_bmap_util.h" 19 #include "xfs_errortag.h" 20 #include "xfs_error.h" 21 #include "xfs_trans.h" 22 #include "xfs_trans_space.h" 23 #include "xfs_inode_item.h" 24 #include "xfs_iomap.h" 25 #include "xfs_trace.h" 26 #include "xfs_quota.h" 27 #include "xfs_rtgroup.h" 28 #include "xfs_dquot_item.h" 29 #include "xfs_dquot.h" 30 #include "xfs_reflink.h" 31 #include "xfs_health.h" 32 #include "xfs_rtbitmap.h" 33 #include "xfs_icache.h" 34 #include "xfs_zone_alloc.h" 35 36 #define XFS_ALLOC_ALIGN(mp, off) \ 37 (((off) >> mp->m_allocsize_log) << mp->m_allocsize_log) 38 39 static int 40 xfs_alert_fsblock_zero( 41 xfs_inode_t *ip, 42 xfs_bmbt_irec_t *imap) 43 { 44 xfs_alert_tag(ip->i_mount, XFS_PTAG_FSBLOCK_ZERO, 45 "Access to block zero in inode %llu " 46 "start_block: %llx start_off: %llx " 47 "blkcnt: %llx extent-state: %x", 48 (unsigned long long)ip->i_ino, 49 (unsigned long long)imap->br_startblock, 50 (unsigned long long)imap->br_startoff, 51 (unsigned long long)imap->br_blockcount, 52 imap->br_state); 53 xfs_bmap_mark_sick(ip, XFS_DATA_FORK); 54 return -EFSCORRUPTED; 55 } 56 57 u64 58 xfs_iomap_inode_sequence( 59 struct xfs_inode *ip, 60 u16 iomap_flags) 61 { 62 u64 cookie = 0; 63 64 if (iomap_flags & IOMAP_F_XATTR) 65 return READ_ONCE(ip->i_af.if_seq); 66 if ((iomap_flags & IOMAP_F_SHARED) && ip->i_cowfp) 67 cookie = (u64)READ_ONCE(ip->i_cowfp->if_seq) << 32; 68 return cookie | READ_ONCE(ip->i_df.if_seq); 69 } 70 71 /* 72 * Check that the iomap passed to us is still valid for the given offset and 73 * length. 74 */ 75 static bool 76 xfs_iomap_valid( 77 struct inode *inode, 78 const struct iomap *iomap) 79 { 80 struct xfs_inode *ip = XFS_I(inode); 81 82 if (iomap->type == IOMAP_HOLE) 83 return true; 84 85 if (iomap->validity_cookie != 86 xfs_iomap_inode_sequence(ip, iomap->flags)) { 87 trace_xfs_iomap_invalid(ip, iomap); 88 return false; 89 } 90 91 XFS_ERRORTAG_DELAY(ip->i_mount, XFS_ERRTAG_WRITE_DELAY_MS); 92 return true; 93 } 94 95 const struct iomap_write_ops xfs_iomap_write_ops = { 96 .iomap_valid = xfs_iomap_valid, 97 }; 98 99 int 100 xfs_bmbt_to_iomap( 101 struct xfs_inode *ip, 102 struct iomap *iomap, 103 struct xfs_bmbt_irec *imap, 104 unsigned int mapping_flags, 105 u16 iomap_flags, 106 u64 sequence_cookie) 107 { 108 struct xfs_mount *mp = ip->i_mount; 109 struct xfs_buftarg *target = xfs_inode_buftarg(ip); 110 111 if (unlikely(!xfs_valid_startblock(ip, imap->br_startblock))) { 112 xfs_bmap_mark_sick(ip, XFS_DATA_FORK); 113 return xfs_alert_fsblock_zero(ip, imap); 114 } 115 116 if (imap->br_startblock == HOLESTARTBLOCK) { 117 iomap->addr = IOMAP_NULL_ADDR; 118 iomap->type = IOMAP_HOLE; 119 } else if (imap->br_startblock == DELAYSTARTBLOCK || 120 isnullstartblock(imap->br_startblock)) { 121 iomap->addr = IOMAP_NULL_ADDR; 122 iomap->type = IOMAP_DELALLOC; 123 } else { 124 xfs_daddr_t daddr = xfs_fsb_to_db(ip, imap->br_startblock); 125 126 iomap->addr = BBTOB(daddr); 127 if (mapping_flags & IOMAP_DAX) 128 iomap->addr += target->bt_dax_part_off; 129 130 if (imap->br_state == XFS_EXT_UNWRITTEN) 131 iomap->type = IOMAP_UNWRITTEN; 132 else 133 iomap->type = IOMAP_MAPPED; 134 135 /* 136 * Mark iomaps starting at the first sector of a RTG as merge 137 * boundary so that each I/O completions is contained to a 138 * single RTG. 139 */ 140 if (XFS_IS_REALTIME_INODE(ip) && xfs_has_rtgroups(mp) && 141 xfs_rtbno_is_group_start(mp, imap->br_startblock)) 142 iomap->flags |= IOMAP_F_BOUNDARY; 143 } 144 iomap->offset = XFS_FSB_TO_B(mp, imap->br_startoff); 145 iomap->length = XFS_FSB_TO_B(mp, imap->br_blockcount); 146 if (mapping_flags & IOMAP_DAX) 147 iomap->dax_dev = target->bt_daxdev; 148 else 149 iomap->bdev = target->bt_bdev; 150 iomap->flags = iomap_flags; 151 152 if (xfs_ipincount(ip) && 153 (ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP)) 154 iomap->flags |= IOMAP_F_DIRTY; 155 156 iomap->validity_cookie = sequence_cookie; 157 return 0; 158 } 159 160 static void 161 xfs_hole_to_iomap( 162 struct xfs_inode *ip, 163 struct iomap *iomap, 164 xfs_fileoff_t offset_fsb, 165 xfs_fileoff_t end_fsb) 166 { 167 struct xfs_buftarg *target = xfs_inode_buftarg(ip); 168 169 iomap->addr = IOMAP_NULL_ADDR; 170 iomap->type = IOMAP_HOLE; 171 iomap->offset = XFS_FSB_TO_B(ip->i_mount, offset_fsb); 172 iomap->length = XFS_FSB_TO_B(ip->i_mount, end_fsb - offset_fsb); 173 iomap->bdev = target->bt_bdev; 174 iomap->dax_dev = target->bt_daxdev; 175 } 176 177 static inline xfs_fileoff_t 178 xfs_iomap_end_fsb( 179 struct xfs_mount *mp, 180 loff_t offset, 181 loff_t count) 182 { 183 ASSERT(offset <= mp->m_super->s_maxbytes); 184 return min(XFS_B_TO_FSB(mp, offset + count), 185 XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes)); 186 } 187 188 static xfs_extlen_t 189 xfs_eof_alignment( 190 struct xfs_inode *ip) 191 { 192 struct xfs_mount *mp = ip->i_mount; 193 xfs_extlen_t align = 0; 194 195 if (!XFS_IS_REALTIME_INODE(ip)) { 196 /* 197 * Round up the allocation request to a stripe unit 198 * (m_dalign) boundary if the file size is >= stripe unit 199 * size, and we are allocating past the allocation eof. 200 * 201 * If mounted with the "-o swalloc" option the alignment is 202 * increased from the strip unit size to the stripe width. 203 */ 204 if (mp->m_swidth && xfs_has_swalloc(mp)) 205 align = mp->m_swidth; 206 else if (mp->m_dalign) 207 align = mp->m_dalign; 208 209 if (align && XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, align)) 210 align = 0; 211 } 212 213 return align; 214 } 215 216 /* 217 * Check if last_fsb is outside the last extent, and if so grow it to the next 218 * stripe unit boundary. 219 */ 220 xfs_fileoff_t 221 xfs_iomap_eof_align_last_fsb( 222 struct xfs_inode *ip, 223 xfs_fileoff_t end_fsb) 224 { 225 struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK); 226 xfs_extlen_t extsz = xfs_get_extsz_hint(ip); 227 xfs_extlen_t align = xfs_eof_alignment(ip); 228 struct xfs_bmbt_irec irec; 229 struct xfs_iext_cursor icur; 230 231 ASSERT(!xfs_need_iread_extents(ifp)); 232 233 /* 234 * Always round up the allocation request to the extent hint boundary. 235 */ 236 if (extsz) { 237 if (align) 238 align = roundup_64(align, extsz); 239 else 240 align = extsz; 241 } 242 243 if (align) { 244 xfs_fileoff_t aligned_end_fsb = roundup_64(end_fsb, align); 245 246 xfs_iext_last(ifp, &icur); 247 if (!xfs_iext_get_extent(ifp, &icur, &irec) || 248 aligned_end_fsb >= irec.br_startoff + irec.br_blockcount) 249 return aligned_end_fsb; 250 } 251 252 return end_fsb; 253 } 254 255 int 256 xfs_iomap_write_direct( 257 struct xfs_inode *ip, 258 xfs_fileoff_t offset_fsb, 259 xfs_fileoff_t count_fsb, 260 unsigned int flags, 261 struct xfs_bmbt_irec *imap, 262 u64 *seq) 263 { 264 struct xfs_mount *mp = ip->i_mount; 265 struct xfs_trans *tp; 266 xfs_filblks_t resaligned; 267 int nimaps; 268 unsigned int dblocks, rblocks; 269 bool force = false; 270 int error; 271 int bmapi_flags = XFS_BMAPI_PREALLOC; 272 int nr_exts = XFS_IEXT_ADD_NOSPLIT_CNT; 273 274 ASSERT(count_fsb > 0); 275 276 resaligned = xfs_aligned_fsb_count(offset_fsb, count_fsb, 277 xfs_get_extsz_hint(ip)); 278 if (unlikely(XFS_IS_REALTIME_INODE(ip))) { 279 dblocks = XFS_DIOSTRAT_SPACE_RES(mp, 0); 280 rblocks = resaligned; 281 } else { 282 dblocks = XFS_DIOSTRAT_SPACE_RES(mp, resaligned); 283 rblocks = 0; 284 } 285 286 error = xfs_qm_dqattach(ip); 287 if (error) 288 return error; 289 290 /* 291 * For DAX, we do not allocate unwritten extents, but instead we zero 292 * the block before we commit the transaction. Ideally we'd like to do 293 * this outside the transaction context, but if we commit and then crash 294 * we may not have zeroed the blocks and this will be exposed on 295 * recovery of the allocation. Hence we must zero before commit. 296 * 297 * Further, if we are mapping unwritten extents here, we need to zero 298 * and convert them to written so that we don't need an unwritten extent 299 * callback for DAX. This also means that we need to be able to dip into 300 * the reserve block pool for bmbt block allocation if there is no space 301 * left but we need to do unwritten extent conversion. 302 */ 303 if (flags & IOMAP_DAX) { 304 bmapi_flags = XFS_BMAPI_CONVERT | XFS_BMAPI_ZERO; 305 if (imap->br_state == XFS_EXT_UNWRITTEN) { 306 force = true; 307 nr_exts = XFS_IEXT_WRITE_UNWRITTEN_CNT; 308 dblocks = XFS_DIOSTRAT_SPACE_RES(mp, 0) << 1; 309 } 310 } 311 312 error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, dblocks, 313 rblocks, force, &tp); 314 if (error) 315 return error; 316 317 error = xfs_iext_count_extend(tp, ip, XFS_DATA_FORK, nr_exts); 318 if (error) 319 goto out_trans_cancel; 320 321 /* 322 * From this point onwards we overwrite the imap pointer that the 323 * caller gave to us. 324 */ 325 nimaps = 1; 326 error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb, bmapi_flags, 0, 327 imap, &nimaps); 328 if (error) 329 goto out_trans_cancel; 330 331 /* 332 * Complete the transaction 333 */ 334 error = xfs_trans_commit(tp); 335 if (error) 336 goto out_unlock; 337 338 if (unlikely(!xfs_valid_startblock(ip, imap->br_startblock))) { 339 xfs_bmap_mark_sick(ip, XFS_DATA_FORK); 340 error = xfs_alert_fsblock_zero(ip, imap); 341 } 342 343 out_unlock: 344 *seq = xfs_iomap_inode_sequence(ip, 0); 345 xfs_iunlock(ip, XFS_ILOCK_EXCL); 346 return error; 347 348 out_trans_cancel: 349 xfs_trans_cancel(tp); 350 goto out_unlock; 351 } 352 353 STATIC bool 354 xfs_quota_need_throttle( 355 struct xfs_inode *ip, 356 xfs_dqtype_t type, 357 xfs_fsblock_t alloc_blocks) 358 { 359 struct xfs_dquot *dq = xfs_inode_dquot(ip, type); 360 struct xfs_dquot_res *res; 361 struct xfs_dquot_pre *pre; 362 363 if (!dq || !xfs_this_quota_on(ip->i_mount, type)) 364 return false; 365 366 if (XFS_IS_REALTIME_INODE(ip)) { 367 res = &dq->q_rtb; 368 pre = &dq->q_rtb_prealloc; 369 } else { 370 res = &dq->q_blk; 371 pre = &dq->q_blk_prealloc; 372 } 373 374 /* no hi watermark, no throttle */ 375 if (!pre->q_prealloc_hi_wmark) 376 return false; 377 378 /* under the lo watermark, no throttle */ 379 if (res->reserved + alloc_blocks < pre->q_prealloc_lo_wmark) 380 return false; 381 382 return true; 383 } 384 385 STATIC void 386 xfs_quota_calc_throttle( 387 struct xfs_inode *ip, 388 xfs_dqtype_t type, 389 xfs_fsblock_t *qblocks, 390 int *qshift, 391 int64_t *qfreesp) 392 { 393 struct xfs_dquot *dq = xfs_inode_dquot(ip, type); 394 struct xfs_dquot_res *res; 395 struct xfs_dquot_pre *pre; 396 int64_t freesp; 397 int shift = 0; 398 399 if (!dq) { 400 res = NULL; 401 pre = NULL; 402 } else if (XFS_IS_REALTIME_INODE(ip)) { 403 res = &dq->q_rtb; 404 pre = &dq->q_rtb_prealloc; 405 } else { 406 res = &dq->q_blk; 407 pre = &dq->q_blk_prealloc; 408 } 409 410 /* no dq, or over hi wmark, squash the prealloc completely */ 411 if (!res || res->reserved >= pre->q_prealloc_hi_wmark) { 412 *qblocks = 0; 413 *qfreesp = 0; 414 return; 415 } 416 417 freesp = pre->q_prealloc_hi_wmark - res->reserved; 418 if (freesp < pre->q_low_space[XFS_QLOWSP_5_PCNT]) { 419 shift = 2; 420 if (freesp < pre->q_low_space[XFS_QLOWSP_3_PCNT]) 421 shift += 2; 422 if (freesp < pre->q_low_space[XFS_QLOWSP_1_PCNT]) 423 shift += 2; 424 } 425 426 if (freesp < *qfreesp) 427 *qfreesp = freesp; 428 429 /* only overwrite the throttle values if we are more aggressive */ 430 if ((freesp >> shift) < (*qblocks >> *qshift)) { 431 *qblocks = freesp; 432 *qshift = shift; 433 } 434 } 435 436 static int64_t 437 xfs_iomap_freesp( 438 struct xfs_mount *mp, 439 unsigned int idx, 440 uint64_t low_space[XFS_LOWSP_MAX], 441 int *shift) 442 { 443 int64_t freesp; 444 445 freesp = xfs_estimate_freecounter(mp, idx); 446 if (freesp < low_space[XFS_LOWSP_5_PCNT]) { 447 *shift = 2; 448 if (freesp < low_space[XFS_LOWSP_4_PCNT]) 449 (*shift)++; 450 if (freesp < low_space[XFS_LOWSP_3_PCNT]) 451 (*shift)++; 452 if (freesp < low_space[XFS_LOWSP_2_PCNT]) 453 (*shift)++; 454 if (freesp < low_space[XFS_LOWSP_1_PCNT]) 455 (*shift)++; 456 } 457 return freesp; 458 } 459 460 /* 461 * If we don't have a user specified preallocation size, dynamically increase 462 * the preallocation size as the size of the file grows. Cap the maximum size 463 * at a single extent or less if the filesystem is near full. The closer the 464 * filesystem is to being full, the smaller the maximum preallocation. 465 */ 466 STATIC xfs_fsblock_t 467 xfs_iomap_prealloc_size( 468 struct xfs_inode *ip, 469 int whichfork, 470 loff_t offset, 471 loff_t count, 472 struct xfs_iext_cursor *icur) 473 { 474 struct xfs_iext_cursor ncur = *icur; 475 struct xfs_bmbt_irec prev, got; 476 struct xfs_mount *mp = ip->i_mount; 477 struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); 478 xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); 479 int64_t freesp; 480 xfs_fsblock_t qblocks; 481 xfs_fsblock_t alloc_blocks = 0; 482 xfs_extlen_t plen; 483 int shift = 0; 484 int qshift = 0; 485 486 /* 487 * As an exception we don't do any preallocation at all if the file is 488 * smaller than the minimum preallocation and we are using the default 489 * dynamic preallocation scheme, as it is likely this is the only write 490 * to the file that is going to be done. 491 */ 492 if (XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, mp->m_allocsize_blocks)) 493 return 0; 494 495 /* 496 * Use the minimum preallocation size for small files or if we are 497 * writing right after a hole. 498 */ 499 if (XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, mp->m_dalign) || 500 !xfs_iext_prev_extent(ifp, &ncur, &prev) || 501 prev.br_startoff + prev.br_blockcount < offset_fsb) 502 return mp->m_allocsize_blocks; 503 504 /* 505 * Take the size of the preceding data extents as the basis for the 506 * preallocation size. Note that we don't care if the previous extents 507 * are written or not. 508 */ 509 plen = prev.br_blockcount; 510 while (xfs_iext_prev_extent(ifp, &ncur, &got)) { 511 if (plen > XFS_MAX_BMBT_EXTLEN / 2 || 512 isnullstartblock(got.br_startblock) || 513 got.br_startoff + got.br_blockcount != prev.br_startoff || 514 got.br_startblock + got.br_blockcount != prev.br_startblock) 515 break; 516 plen += got.br_blockcount; 517 prev = got; 518 } 519 520 /* 521 * If the size of the extents is greater than half the maximum extent 522 * length, then use the current offset as the basis. This ensures that 523 * for large files the preallocation size always extends to 524 * XFS_BMBT_MAX_EXTLEN rather than falling short due to things like stripe 525 * unit/width alignment of real extents. 526 */ 527 alloc_blocks = plen * 2; 528 if (alloc_blocks > XFS_MAX_BMBT_EXTLEN) 529 alloc_blocks = XFS_B_TO_FSB(mp, offset); 530 qblocks = alloc_blocks; 531 532 /* 533 * XFS_BMBT_MAX_EXTLEN is not a power of two value but we round the prealloc 534 * down to the nearest power of two value after throttling. To prevent 535 * the round down from unconditionally reducing the maximum supported 536 * prealloc size, we round up first, apply appropriate throttling, round 537 * down and cap the value to XFS_BMBT_MAX_EXTLEN. 538 */ 539 alloc_blocks = XFS_FILEOFF_MIN(roundup_pow_of_two(XFS_MAX_BMBT_EXTLEN), 540 alloc_blocks); 541 542 if (unlikely(XFS_IS_REALTIME_INODE(ip))) 543 freesp = xfs_rtbxlen_to_blen(mp, 544 xfs_iomap_freesp(mp, XC_FREE_RTEXTENTS, 545 mp->m_low_rtexts, &shift)); 546 else 547 freesp = xfs_iomap_freesp(mp, XC_FREE_BLOCKS, mp->m_low_space, 548 &shift); 549 550 /* 551 * Check each quota to cap the prealloc size, provide a shift value to 552 * throttle with and adjust amount of available space. 553 */ 554 if (xfs_quota_need_throttle(ip, XFS_DQTYPE_USER, alloc_blocks)) 555 xfs_quota_calc_throttle(ip, XFS_DQTYPE_USER, &qblocks, &qshift, 556 &freesp); 557 if (xfs_quota_need_throttle(ip, XFS_DQTYPE_GROUP, alloc_blocks)) 558 xfs_quota_calc_throttle(ip, XFS_DQTYPE_GROUP, &qblocks, &qshift, 559 &freesp); 560 if (xfs_quota_need_throttle(ip, XFS_DQTYPE_PROJ, alloc_blocks)) 561 xfs_quota_calc_throttle(ip, XFS_DQTYPE_PROJ, &qblocks, &qshift, 562 &freesp); 563 564 /* 565 * The final prealloc size is set to the minimum of free space available 566 * in each of the quotas and the overall filesystem. 567 * 568 * The shift throttle value is set to the maximum value as determined by 569 * the global low free space values and per-quota low free space values. 570 */ 571 alloc_blocks = min(alloc_blocks, qblocks); 572 shift = max(shift, qshift); 573 574 if (shift) 575 alloc_blocks >>= shift; 576 /* 577 * rounddown_pow_of_two() returns an undefined result if we pass in 578 * alloc_blocks = 0. 579 */ 580 if (alloc_blocks) 581 alloc_blocks = rounddown_pow_of_two(alloc_blocks); 582 if (alloc_blocks > XFS_MAX_BMBT_EXTLEN) 583 alloc_blocks = XFS_MAX_BMBT_EXTLEN; 584 585 /* 586 * If we are still trying to allocate more space than is 587 * available, squash the prealloc hard. This can happen if we 588 * have a large file on a small filesystem and the above 589 * lowspace thresholds are smaller than XFS_BMBT_MAX_EXTLEN. 590 */ 591 while (alloc_blocks && alloc_blocks >= freesp) 592 alloc_blocks >>= 4; 593 if (alloc_blocks < mp->m_allocsize_blocks) 594 alloc_blocks = mp->m_allocsize_blocks; 595 trace_xfs_iomap_prealloc_size(ip, alloc_blocks, shift, 596 mp->m_allocsize_blocks); 597 return alloc_blocks; 598 } 599 600 int 601 xfs_iomap_write_unwritten( 602 xfs_inode_t *ip, 603 xfs_off_t offset, 604 xfs_off_t count, 605 bool update_isize) 606 { 607 xfs_mount_t *mp = ip->i_mount; 608 xfs_fileoff_t offset_fsb; 609 xfs_filblks_t count_fsb; 610 xfs_filblks_t numblks_fsb; 611 int nimaps; 612 xfs_trans_t *tp; 613 xfs_bmbt_irec_t imap; 614 struct inode *inode = VFS_I(ip); 615 xfs_fsize_t i_size; 616 uint resblks; 617 int error; 618 619 trace_xfs_unwritten_convert(ip, offset, count); 620 621 offset_fsb = XFS_B_TO_FSBT(mp, offset); 622 count_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count); 623 count_fsb = (xfs_filblks_t)(count_fsb - offset_fsb); 624 625 /* 626 * Reserve enough blocks in this transaction for two complete extent 627 * btree splits. We may be converting the middle part of an unwritten 628 * extent and in this case we will insert two new extents in the btree 629 * each of which could cause a full split. 630 * 631 * This reservation amount will be used in the first call to 632 * xfs_bmbt_split() to select an AG with enough space to satisfy the 633 * rest of the operation. 634 */ 635 resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0) << 1; 636 637 /* Attach dquots so that bmbt splits are accounted correctly. */ 638 error = xfs_qm_dqattach(ip); 639 if (error) 640 return error; 641 642 do { 643 /* 644 * Set up a transaction to convert the range of extents 645 * from unwritten to real. Do allocations in a loop until 646 * we have covered the range passed in. 647 * 648 * Note that we can't risk to recursing back into the filesystem 649 * here as we might be asked to write out the same inode that we 650 * complete here and might deadlock on the iolock. 651 */ 652 error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, resblks, 653 0, true, &tp); 654 if (error) 655 return error; 656 657 error = xfs_iext_count_extend(tp, ip, XFS_DATA_FORK, 658 XFS_IEXT_WRITE_UNWRITTEN_CNT); 659 if (error) 660 goto error_on_bmapi_transaction; 661 662 /* 663 * Modify the unwritten extent state of the buffer. 664 */ 665 nimaps = 1; 666 error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb, 667 XFS_BMAPI_CONVERT, resblks, &imap, 668 &nimaps); 669 if (error) 670 goto error_on_bmapi_transaction; 671 672 /* 673 * Log the updated inode size as we go. We have to be careful 674 * to only log it up to the actual write offset if it is 675 * halfway into a block. 676 */ 677 i_size = XFS_FSB_TO_B(mp, offset_fsb + count_fsb); 678 if (i_size > offset + count) 679 i_size = offset + count; 680 if (update_isize && i_size > i_size_read(inode)) 681 i_size_write(inode, i_size); 682 i_size = xfs_new_eof(ip, i_size); 683 if (i_size) { 684 ip->i_disk_size = i_size; 685 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 686 } 687 688 error = xfs_trans_commit(tp); 689 xfs_iunlock(ip, XFS_ILOCK_EXCL); 690 if (error) 691 return error; 692 693 if (unlikely(!xfs_valid_startblock(ip, imap.br_startblock))) { 694 xfs_bmap_mark_sick(ip, XFS_DATA_FORK); 695 return xfs_alert_fsblock_zero(ip, &imap); 696 } 697 698 if ((numblks_fsb = imap.br_blockcount) == 0) { 699 /* 700 * The numblks_fsb value should always get 701 * smaller, otherwise the loop is stuck. 702 */ 703 ASSERT(imap.br_blockcount); 704 break; 705 } 706 offset_fsb += numblks_fsb; 707 count_fsb -= numblks_fsb; 708 } while (count_fsb > 0); 709 710 return 0; 711 712 error_on_bmapi_transaction: 713 xfs_trans_cancel(tp); 714 xfs_iunlock(ip, XFS_ILOCK_EXCL); 715 return error; 716 } 717 718 static inline bool 719 imap_needs_alloc( 720 struct inode *inode, 721 unsigned flags, 722 struct xfs_bmbt_irec *imap, 723 int nimaps) 724 { 725 /* don't allocate blocks when just zeroing */ 726 if (flags & IOMAP_ZERO) 727 return false; 728 if (!nimaps || 729 imap->br_startblock == HOLESTARTBLOCK || 730 imap->br_startblock == DELAYSTARTBLOCK) 731 return true; 732 /* we convert unwritten extents before copying the data for DAX */ 733 if ((flags & IOMAP_DAX) && imap->br_state == XFS_EXT_UNWRITTEN) 734 return true; 735 return false; 736 } 737 738 static inline bool 739 imap_needs_cow( 740 struct xfs_inode *ip, 741 unsigned int flags, 742 struct xfs_bmbt_irec *imap, 743 int nimaps) 744 { 745 if (!xfs_is_cow_inode(ip)) 746 return false; 747 748 /* when zeroing we don't have to COW holes or unwritten extents */ 749 if (flags & (IOMAP_UNSHARE | IOMAP_ZERO)) { 750 if (!nimaps || 751 imap->br_startblock == HOLESTARTBLOCK || 752 imap->br_state == XFS_EXT_UNWRITTEN) 753 return false; 754 } 755 756 return true; 757 } 758 759 /* 760 * Extents not yet cached requires exclusive access, don't block for 761 * IOMAP_NOWAIT. 762 * 763 * This is basically an opencoded xfs_ilock_data_map_shared() call, but with 764 * support for IOMAP_NOWAIT. 765 */ 766 static int 767 xfs_ilock_for_iomap( 768 struct xfs_inode *ip, 769 unsigned flags, 770 unsigned *lockmode) 771 { 772 if (flags & IOMAP_NOWAIT) { 773 if (xfs_need_iread_extents(&ip->i_df)) 774 return -EAGAIN; 775 if (!xfs_ilock_nowait(ip, *lockmode)) 776 return -EAGAIN; 777 } else { 778 if (xfs_need_iread_extents(&ip->i_df)) 779 *lockmode = XFS_ILOCK_EXCL; 780 xfs_ilock(ip, *lockmode); 781 } 782 783 return 0; 784 } 785 786 /* 787 * Check that the imap we are going to return to the caller spans the entire 788 * range that the caller requested for the IO. 789 */ 790 static bool 791 imap_spans_range( 792 struct xfs_bmbt_irec *imap, 793 xfs_fileoff_t offset_fsb, 794 xfs_fileoff_t end_fsb) 795 { 796 if (imap->br_startoff > offset_fsb) 797 return false; 798 if (imap->br_startoff + imap->br_blockcount < end_fsb) 799 return false; 800 return true; 801 } 802 803 static bool 804 xfs_bmap_hw_atomic_write_possible( 805 struct xfs_inode *ip, 806 struct xfs_bmbt_irec *imap, 807 xfs_fileoff_t offset_fsb, 808 xfs_fileoff_t end_fsb) 809 { 810 struct xfs_mount *mp = ip->i_mount; 811 xfs_fsize_t len = XFS_FSB_TO_B(mp, end_fsb - offset_fsb); 812 813 /* 814 * atomic writes are required to be naturally aligned for disk blocks, 815 * which ensures that we adhere to block layer rules that we won't 816 * straddle any boundary or violate write alignment requirement. 817 */ 818 if (!IS_ALIGNED(imap->br_startblock, imap->br_blockcount)) 819 return false; 820 821 /* 822 * Spanning multiple extents would mean that multiple BIOs would be 823 * issued, and so would lose atomicity required for REQ_ATOMIC-based 824 * atomics. 825 */ 826 if (!imap_spans_range(imap, offset_fsb, end_fsb)) 827 return false; 828 829 /* 830 * The ->iomap_begin caller should ensure this, but check anyway. 831 */ 832 return len <= xfs_inode_buftarg(ip)->bt_awu_max; 833 } 834 835 static int 836 xfs_direct_write_iomap_begin( 837 struct inode *inode, 838 loff_t offset, 839 loff_t length, 840 unsigned flags, 841 struct iomap *iomap, 842 struct iomap *srcmap) 843 { 844 struct xfs_inode *ip = XFS_I(inode); 845 struct xfs_mount *mp = ip->i_mount; 846 struct xfs_bmbt_irec imap, cmap; 847 xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); 848 xfs_fileoff_t end_fsb = xfs_iomap_end_fsb(mp, offset, length); 849 xfs_fileoff_t orig_end_fsb = end_fsb; 850 int nimaps = 1, error = 0; 851 bool shared = false; 852 u16 iomap_flags = 0; 853 bool needs_alloc; 854 unsigned int lockmode; 855 u64 seq; 856 857 ASSERT(flags & (IOMAP_WRITE | IOMAP_ZERO)); 858 859 if (xfs_is_shutdown(mp)) 860 return -EIO; 861 862 /* 863 * Writes that span EOF might trigger an IO size update on completion, 864 * so consider them to be dirty for the purposes of O_DSYNC even if 865 * there is no other metadata changes pending or have been made here. 866 */ 867 if (offset + length > i_size_read(inode)) 868 iomap_flags |= IOMAP_F_DIRTY; 869 870 /* HW-offload atomics are always used in this path */ 871 if (flags & IOMAP_ATOMIC) 872 iomap_flags |= IOMAP_F_ATOMIC_BIO; 873 874 /* 875 * COW writes may allocate delalloc space or convert unwritten COW 876 * extents, so we need to make sure to take the lock exclusively here. 877 */ 878 if (xfs_is_cow_inode(ip)) 879 lockmode = XFS_ILOCK_EXCL; 880 else 881 lockmode = XFS_ILOCK_SHARED; 882 883 relock: 884 error = xfs_ilock_for_iomap(ip, flags, &lockmode); 885 if (error) 886 return error; 887 888 /* 889 * The reflink iflag could have changed since the earlier unlocked 890 * check, check if it again and relock if needed. 891 */ 892 if (xfs_is_cow_inode(ip) && lockmode == XFS_ILOCK_SHARED) { 893 xfs_iunlock(ip, lockmode); 894 lockmode = XFS_ILOCK_EXCL; 895 goto relock; 896 } 897 898 error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap, 899 &nimaps, 0); 900 if (error) 901 goto out_unlock; 902 903 if (imap_needs_cow(ip, flags, &imap, nimaps)) { 904 error = -EAGAIN; 905 if (flags & IOMAP_NOWAIT) 906 goto out_unlock; 907 908 /* may drop and re-acquire the ilock */ 909 error = xfs_reflink_allocate_cow(ip, &imap, &cmap, &shared, 910 &lockmode, 911 (flags & IOMAP_DIRECT) || IS_DAX(inode)); 912 if (error) 913 goto out_unlock; 914 if (shared) { 915 if ((flags & IOMAP_ATOMIC) && 916 !xfs_bmap_hw_atomic_write_possible(ip, &cmap, 917 offset_fsb, end_fsb)) { 918 error = -ENOPROTOOPT; 919 goto out_unlock; 920 } 921 goto out_found_cow; 922 } 923 end_fsb = imap.br_startoff + imap.br_blockcount; 924 length = XFS_FSB_TO_B(mp, end_fsb) - offset; 925 } 926 927 needs_alloc = imap_needs_alloc(inode, flags, &imap, nimaps); 928 929 if (flags & IOMAP_ATOMIC) { 930 error = -ENOPROTOOPT; 931 /* 932 * If we allocate less than what is required for the write 933 * then we may end up with multiple extents, which means that 934 * REQ_ATOMIC-based cannot be used, so avoid this possibility. 935 */ 936 if (needs_alloc && orig_end_fsb - offset_fsb > 1) 937 goto out_unlock; 938 939 if (!xfs_bmap_hw_atomic_write_possible(ip, &imap, offset_fsb, 940 orig_end_fsb)) 941 goto out_unlock; 942 } 943 944 if (needs_alloc) 945 goto allocate_blocks; 946 947 /* 948 * NOWAIT and OVERWRITE I/O needs to span the entire requested I/O with 949 * a single map so that we avoid partial IO failures due to the rest of 950 * the I/O range not covered by this map triggering an EAGAIN condition 951 * when it is subsequently mapped and aborting the I/O. 952 */ 953 if (flags & (IOMAP_NOWAIT | IOMAP_OVERWRITE_ONLY)) { 954 error = -EAGAIN; 955 if (!imap_spans_range(&imap, offset_fsb, end_fsb)) 956 goto out_unlock; 957 } 958 959 /* 960 * For overwrite only I/O, we cannot convert unwritten extents without 961 * requiring sub-block zeroing. This can only be done under an 962 * exclusive IOLOCK, hence return -EAGAIN if this is not a written 963 * extent to tell the caller to try again. 964 */ 965 if (flags & IOMAP_OVERWRITE_ONLY) { 966 error = -EAGAIN; 967 if (imap.br_state != XFS_EXT_NORM && 968 ((offset | length) & mp->m_blockmask)) 969 goto out_unlock; 970 } 971 972 seq = xfs_iomap_inode_sequence(ip, iomap_flags); 973 xfs_iunlock(ip, lockmode); 974 trace_xfs_iomap_found(ip, offset, length, XFS_DATA_FORK, &imap); 975 return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, iomap_flags, seq); 976 977 allocate_blocks: 978 error = -EAGAIN; 979 if (flags & (IOMAP_NOWAIT | IOMAP_OVERWRITE_ONLY)) 980 goto out_unlock; 981 982 /* 983 * We cap the maximum length we map to a sane size to keep the chunks 984 * of work done where somewhat symmetric with the work writeback does. 985 * This is a completely arbitrary number pulled out of thin air as a 986 * best guess for initial testing. 987 * 988 * Note that the values needs to be less than 32-bits wide until the 989 * lower level functions are updated. 990 */ 991 length = min_t(loff_t, length, 1024 * PAGE_SIZE); 992 end_fsb = xfs_iomap_end_fsb(mp, offset, length); 993 994 if (offset + length > XFS_ISIZE(ip)) 995 end_fsb = xfs_iomap_eof_align_last_fsb(ip, end_fsb); 996 else if (nimaps && imap.br_startblock == HOLESTARTBLOCK) 997 end_fsb = min(end_fsb, imap.br_startoff + imap.br_blockcount); 998 xfs_iunlock(ip, lockmode); 999 1000 error = xfs_iomap_write_direct(ip, offset_fsb, end_fsb - offset_fsb, 1001 flags, &imap, &seq); 1002 if (error) 1003 return error; 1004 1005 trace_xfs_iomap_alloc(ip, offset, length, XFS_DATA_FORK, &imap); 1006 return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 1007 iomap_flags | IOMAP_F_NEW, seq); 1008 1009 out_found_cow: 1010 length = XFS_FSB_TO_B(mp, cmap.br_startoff + cmap.br_blockcount); 1011 trace_xfs_iomap_found(ip, offset, length - offset, XFS_COW_FORK, &cmap); 1012 if (imap.br_startblock != HOLESTARTBLOCK) { 1013 seq = xfs_iomap_inode_sequence(ip, 0); 1014 error = xfs_bmbt_to_iomap(ip, srcmap, &imap, flags, 0, seq); 1015 if (error) 1016 goto out_unlock; 1017 } 1018 seq = xfs_iomap_inode_sequence(ip, IOMAP_F_SHARED); 1019 xfs_iunlock(ip, lockmode); 1020 return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, IOMAP_F_SHARED, seq); 1021 1022 out_unlock: 1023 if (lockmode) 1024 xfs_iunlock(ip, lockmode); 1025 return error; 1026 } 1027 1028 const struct iomap_ops xfs_direct_write_iomap_ops = { 1029 .iomap_begin = xfs_direct_write_iomap_begin, 1030 }; 1031 1032 #ifdef CONFIG_XFS_RT 1033 /* 1034 * This is really simple. The space has already been reserved before taking the 1035 * IOLOCK, the actual block allocation is done just before submitting the bio 1036 * and only recorded in the extent map on I/O completion. 1037 */ 1038 static int 1039 xfs_zoned_direct_write_iomap_begin( 1040 struct inode *inode, 1041 loff_t offset, 1042 loff_t length, 1043 unsigned flags, 1044 struct iomap *iomap, 1045 struct iomap *srcmap) 1046 { 1047 struct xfs_inode *ip = XFS_I(inode); 1048 int error; 1049 1050 ASSERT(!(flags & IOMAP_OVERWRITE_ONLY)); 1051 1052 /* 1053 * Needs to be pushed down into the allocator so that only writes into 1054 * a single zone can be supported. 1055 */ 1056 if (flags & IOMAP_NOWAIT) 1057 return -EAGAIN; 1058 1059 /* 1060 * Ensure the extent list is in memory in so that we don't have to do 1061 * read it from the I/O completion handler. 1062 */ 1063 if (xfs_need_iread_extents(&ip->i_df)) { 1064 xfs_ilock(ip, XFS_ILOCK_EXCL); 1065 error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK); 1066 xfs_iunlock(ip, XFS_ILOCK_EXCL); 1067 if (error) 1068 return error; 1069 } 1070 1071 iomap->type = IOMAP_MAPPED; 1072 iomap->flags = IOMAP_F_DIRTY; 1073 iomap->bdev = ip->i_mount->m_rtdev_targp->bt_bdev; 1074 iomap->offset = offset; 1075 iomap->length = length; 1076 iomap->flags = IOMAP_F_ANON_WRITE; 1077 return 0; 1078 } 1079 1080 const struct iomap_ops xfs_zoned_direct_write_iomap_ops = { 1081 .iomap_begin = xfs_zoned_direct_write_iomap_begin, 1082 }; 1083 #endif /* CONFIG_XFS_RT */ 1084 1085 static int 1086 xfs_atomic_write_cow_iomap_begin( 1087 struct inode *inode, 1088 loff_t offset, 1089 loff_t length, 1090 unsigned flags, 1091 struct iomap *iomap, 1092 struct iomap *srcmap) 1093 { 1094 struct xfs_inode *ip = XFS_I(inode); 1095 struct xfs_mount *mp = ip->i_mount; 1096 const xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); 1097 xfs_fileoff_t end_fsb = xfs_iomap_end_fsb(mp, offset, length); 1098 xfs_filblks_t count_fsb = end_fsb - offset_fsb; 1099 int nmaps = 1; 1100 xfs_filblks_t resaligned; 1101 struct xfs_bmbt_irec cmap; 1102 struct xfs_iext_cursor icur; 1103 struct xfs_trans *tp; 1104 unsigned int dblocks = 0, rblocks = 0; 1105 int error; 1106 u64 seq; 1107 1108 ASSERT(flags & IOMAP_WRITE); 1109 ASSERT(flags & IOMAP_DIRECT); 1110 1111 if (xfs_is_shutdown(mp)) 1112 return -EIO; 1113 1114 if (!xfs_can_sw_atomic_write(mp)) { 1115 ASSERT(xfs_can_sw_atomic_write(mp)); 1116 return -EINVAL; 1117 } 1118 1119 /* blocks are always allocated in this path */ 1120 if (flags & IOMAP_NOWAIT) 1121 return -EAGAIN; 1122 1123 trace_xfs_iomap_atomic_write_cow(ip, offset, length); 1124 1125 xfs_ilock(ip, XFS_ILOCK_EXCL); 1126 1127 if (!ip->i_cowfp) { 1128 ASSERT(!xfs_is_reflink_inode(ip)); 1129 xfs_ifork_init_cow(ip); 1130 } 1131 1132 if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &cmap)) 1133 cmap.br_startoff = end_fsb; 1134 if (cmap.br_startoff <= offset_fsb) { 1135 xfs_trim_extent(&cmap, offset_fsb, count_fsb); 1136 goto found; 1137 } 1138 1139 end_fsb = cmap.br_startoff; 1140 count_fsb = end_fsb - offset_fsb; 1141 1142 resaligned = xfs_aligned_fsb_count(offset_fsb, count_fsb, 1143 xfs_get_cowextsz_hint(ip)); 1144 xfs_iunlock(ip, XFS_ILOCK_EXCL); 1145 1146 if (XFS_IS_REALTIME_INODE(ip)) { 1147 dblocks = XFS_DIOSTRAT_SPACE_RES(mp, 0); 1148 rblocks = resaligned; 1149 } else { 1150 dblocks = XFS_DIOSTRAT_SPACE_RES(mp, resaligned); 1151 rblocks = 0; 1152 } 1153 1154 error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, dblocks, 1155 rblocks, false, &tp); 1156 if (error) 1157 return error; 1158 1159 /* extent layout could have changed since the unlock, so check again */ 1160 if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &cmap)) 1161 cmap.br_startoff = end_fsb; 1162 if (cmap.br_startoff <= offset_fsb) { 1163 xfs_trim_extent(&cmap, offset_fsb, count_fsb); 1164 xfs_trans_cancel(tp); 1165 goto found; 1166 } 1167 1168 /* 1169 * Allocate the entire reservation as unwritten blocks. 1170 * 1171 * Use XFS_BMAPI_EXTSZALIGN to hint at aligning new extents according to 1172 * extszhint, such that there will be a greater chance that future 1173 * atomic writes to that same range will be aligned (and don't require 1174 * this COW-based method). 1175 */ 1176 error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb, 1177 XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC | 1178 XFS_BMAPI_EXTSZALIGN, 0, &cmap, &nmaps); 1179 if (error) { 1180 xfs_trans_cancel(tp); 1181 goto out_unlock; 1182 } 1183 1184 xfs_inode_set_cowblocks_tag(ip); 1185 error = xfs_trans_commit(tp); 1186 if (error) 1187 goto out_unlock; 1188 1189 found: 1190 if (cmap.br_state != XFS_EXT_NORM) { 1191 error = xfs_reflink_convert_cow_locked(ip, offset_fsb, 1192 count_fsb); 1193 if (error) 1194 goto out_unlock; 1195 cmap.br_state = XFS_EXT_NORM; 1196 } 1197 1198 length = XFS_FSB_TO_B(mp, cmap.br_startoff + cmap.br_blockcount); 1199 trace_xfs_iomap_found(ip, offset, length - offset, XFS_COW_FORK, &cmap); 1200 seq = xfs_iomap_inode_sequence(ip, IOMAP_F_SHARED); 1201 xfs_iunlock(ip, XFS_ILOCK_EXCL); 1202 return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, IOMAP_F_SHARED, seq); 1203 1204 out_unlock: 1205 xfs_iunlock(ip, XFS_ILOCK_EXCL); 1206 return error; 1207 } 1208 1209 const struct iomap_ops xfs_atomic_write_cow_iomap_ops = { 1210 .iomap_begin = xfs_atomic_write_cow_iomap_begin, 1211 }; 1212 1213 static int 1214 xfs_dax_write_iomap_end( 1215 struct inode *inode, 1216 loff_t pos, 1217 loff_t length, 1218 ssize_t written, 1219 unsigned flags, 1220 struct iomap *iomap) 1221 { 1222 struct xfs_inode *ip = XFS_I(inode); 1223 1224 if (!xfs_is_cow_inode(ip)) 1225 return 0; 1226 1227 if (!written) 1228 return xfs_reflink_cancel_cow_range(ip, pos, length, true); 1229 1230 return xfs_reflink_end_cow(ip, pos, written); 1231 } 1232 1233 const struct iomap_ops xfs_dax_write_iomap_ops = { 1234 .iomap_begin = xfs_direct_write_iomap_begin, 1235 .iomap_end = xfs_dax_write_iomap_end, 1236 }; 1237 1238 /* 1239 * Convert a hole to a delayed allocation. 1240 */ 1241 static void 1242 xfs_bmap_add_extent_hole_delay( 1243 struct xfs_inode *ip, /* incore inode pointer */ 1244 int whichfork, 1245 struct xfs_iext_cursor *icur, 1246 struct xfs_bmbt_irec *new) /* new data to add to file extents */ 1247 { 1248 struct xfs_ifork *ifp; /* inode fork pointer */ 1249 xfs_bmbt_irec_t left; /* left neighbor extent entry */ 1250 xfs_filblks_t newlen=0; /* new indirect size */ 1251 xfs_filblks_t oldlen=0; /* old indirect size */ 1252 xfs_bmbt_irec_t right; /* right neighbor extent entry */ 1253 uint32_t state = xfs_bmap_fork_to_state(whichfork); 1254 xfs_filblks_t temp; /* temp for indirect calculations */ 1255 1256 ifp = xfs_ifork_ptr(ip, whichfork); 1257 ASSERT(isnullstartblock(new->br_startblock)); 1258 1259 /* 1260 * Check and set flags if this segment has a left neighbor 1261 */ 1262 if (xfs_iext_peek_prev_extent(ifp, icur, &left)) { 1263 state |= BMAP_LEFT_VALID; 1264 if (isnullstartblock(left.br_startblock)) 1265 state |= BMAP_LEFT_DELAY; 1266 } 1267 1268 /* 1269 * Check and set flags if the current (right) segment exists. 1270 * If it doesn't exist, we're converting the hole at end-of-file. 1271 */ 1272 if (xfs_iext_get_extent(ifp, icur, &right)) { 1273 state |= BMAP_RIGHT_VALID; 1274 if (isnullstartblock(right.br_startblock)) 1275 state |= BMAP_RIGHT_DELAY; 1276 } 1277 1278 /* 1279 * Set contiguity flags on the left and right neighbors. 1280 * Don't let extents get too large, even if the pieces are contiguous. 1281 */ 1282 if ((state & BMAP_LEFT_VALID) && (state & BMAP_LEFT_DELAY) && 1283 left.br_startoff + left.br_blockcount == new->br_startoff && 1284 left.br_blockcount + new->br_blockcount <= XFS_MAX_BMBT_EXTLEN) 1285 state |= BMAP_LEFT_CONTIG; 1286 1287 if ((state & BMAP_RIGHT_VALID) && (state & BMAP_RIGHT_DELAY) && 1288 new->br_startoff + new->br_blockcount == right.br_startoff && 1289 new->br_blockcount + right.br_blockcount <= XFS_MAX_BMBT_EXTLEN && 1290 (!(state & BMAP_LEFT_CONTIG) || 1291 (left.br_blockcount + new->br_blockcount + 1292 right.br_blockcount <= XFS_MAX_BMBT_EXTLEN))) 1293 state |= BMAP_RIGHT_CONTIG; 1294 1295 /* 1296 * Switch out based on the contiguity flags. 1297 */ 1298 switch (state & (BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG)) { 1299 case BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG: 1300 /* 1301 * New allocation is contiguous with delayed allocations 1302 * on the left and on the right. 1303 * Merge all three into a single extent record. 1304 */ 1305 temp = left.br_blockcount + new->br_blockcount + 1306 right.br_blockcount; 1307 1308 oldlen = startblockval(left.br_startblock) + 1309 startblockval(new->br_startblock) + 1310 startblockval(right.br_startblock); 1311 newlen = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), 1312 oldlen); 1313 left.br_startblock = nullstartblock(newlen); 1314 left.br_blockcount = temp; 1315 1316 xfs_iext_remove(ip, icur, state); 1317 xfs_iext_prev(ifp, icur); 1318 xfs_iext_update_extent(ip, state, icur, &left); 1319 break; 1320 1321 case BMAP_LEFT_CONTIG: 1322 /* 1323 * New allocation is contiguous with a delayed allocation 1324 * on the left. 1325 * Merge the new allocation with the left neighbor. 1326 */ 1327 temp = left.br_blockcount + new->br_blockcount; 1328 1329 oldlen = startblockval(left.br_startblock) + 1330 startblockval(new->br_startblock); 1331 newlen = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), 1332 oldlen); 1333 left.br_blockcount = temp; 1334 left.br_startblock = nullstartblock(newlen); 1335 1336 xfs_iext_prev(ifp, icur); 1337 xfs_iext_update_extent(ip, state, icur, &left); 1338 break; 1339 1340 case BMAP_RIGHT_CONTIG: 1341 /* 1342 * New allocation is contiguous with a delayed allocation 1343 * on the right. 1344 * Merge the new allocation with the right neighbor. 1345 */ 1346 temp = new->br_blockcount + right.br_blockcount; 1347 oldlen = startblockval(new->br_startblock) + 1348 startblockval(right.br_startblock); 1349 newlen = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), 1350 oldlen); 1351 right.br_startoff = new->br_startoff; 1352 right.br_startblock = nullstartblock(newlen); 1353 right.br_blockcount = temp; 1354 xfs_iext_update_extent(ip, state, icur, &right); 1355 break; 1356 1357 case 0: 1358 /* 1359 * New allocation is not contiguous with another 1360 * delayed allocation. 1361 * Insert a new entry. 1362 */ 1363 oldlen = newlen = 0; 1364 xfs_iext_insert(ip, icur, new, state); 1365 break; 1366 } 1367 if (oldlen != newlen) { 1368 ASSERT(oldlen > newlen); 1369 xfs_add_fdblocks(ip->i_mount, oldlen - newlen); 1370 1371 /* 1372 * Nothing to do for disk quota accounting here. 1373 */ 1374 xfs_mod_delalloc(ip, 0, (int64_t)newlen - oldlen); 1375 } 1376 } 1377 1378 /* 1379 * Add a delayed allocation extent to an inode. Blocks are reserved from the 1380 * global pool and the extent inserted into the inode in-core extent tree. 1381 * 1382 * On entry, got refers to the first extent beyond the offset of the extent to 1383 * allocate or eof is specified if no such extent exists. On return, got refers 1384 * to the extent record that was inserted to the inode fork. 1385 * 1386 * Note that the allocated extent may have been merged with contiguous extents 1387 * during insertion into the inode fork. Thus, got does not reflect the current 1388 * state of the inode fork on return. If necessary, the caller can use lastx to 1389 * look up the updated record in the inode fork. 1390 */ 1391 static int 1392 xfs_bmapi_reserve_delalloc( 1393 struct xfs_inode *ip, 1394 int whichfork, 1395 xfs_fileoff_t off, 1396 xfs_filblks_t len, 1397 xfs_filblks_t prealloc, 1398 struct xfs_bmbt_irec *got, 1399 struct xfs_iext_cursor *icur, 1400 int eof) 1401 { 1402 struct xfs_mount *mp = ip->i_mount; 1403 struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); 1404 xfs_extlen_t alen; 1405 xfs_extlen_t indlen; 1406 uint64_t fdblocks; 1407 int error; 1408 xfs_fileoff_t aoff; 1409 bool use_cowextszhint = 1410 whichfork == XFS_COW_FORK && !prealloc; 1411 1412 retry: 1413 /* 1414 * Cap the alloc length. Keep track of prealloc so we know whether to 1415 * tag the inode before we return. 1416 */ 1417 aoff = off; 1418 alen = XFS_FILBLKS_MIN(len + prealloc, XFS_MAX_BMBT_EXTLEN); 1419 if (!eof) 1420 alen = XFS_FILBLKS_MIN(alen, got->br_startoff - aoff); 1421 if (prealloc && alen >= len) 1422 prealloc = alen - len; 1423 1424 /* 1425 * If we're targetting the COW fork but aren't creating a speculative 1426 * posteof preallocation, try to expand the reservation to align with 1427 * the COW extent size hint if there's sufficient free space. 1428 * 1429 * Unlike the data fork, the CoW cancellation functions will free all 1430 * the reservations at inactivation, so we don't require that every 1431 * delalloc reservation have a dirty pagecache. 1432 */ 1433 if (use_cowextszhint) { 1434 struct xfs_bmbt_irec prev; 1435 xfs_extlen_t extsz = xfs_get_cowextsz_hint(ip); 1436 1437 if (!xfs_iext_peek_prev_extent(ifp, icur, &prev)) 1438 prev.br_startoff = NULLFILEOFF; 1439 1440 error = xfs_bmap_extsize_align(mp, got, &prev, extsz, 0, eof, 1441 1, 0, &aoff, &alen); 1442 ASSERT(!error); 1443 } 1444 1445 /* 1446 * Make a transaction-less quota reservation for delayed allocation 1447 * blocks. This number gets adjusted later. We return if we haven't 1448 * allocated blocks already inside this loop. 1449 */ 1450 error = xfs_quota_reserve_blkres(ip, alen); 1451 if (error) 1452 goto out; 1453 1454 /* 1455 * Split changing sb for alen and indlen since they could be coming 1456 * from different places. 1457 */ 1458 indlen = (xfs_extlen_t)xfs_bmap_worst_indlen(ip, alen); 1459 ASSERT(indlen > 0); 1460 1461 fdblocks = indlen; 1462 if (XFS_IS_REALTIME_INODE(ip)) { 1463 ASSERT(!xfs_is_zoned_inode(ip)); 1464 error = xfs_dec_frextents(mp, xfs_blen_to_rtbxlen(mp, alen)); 1465 if (error) 1466 goto out_unreserve_quota; 1467 } else { 1468 fdblocks += alen; 1469 } 1470 1471 error = xfs_dec_fdblocks(mp, fdblocks, false); 1472 if (error) 1473 goto out_unreserve_frextents; 1474 1475 ip->i_delayed_blks += alen; 1476 xfs_mod_delalloc(ip, alen, indlen); 1477 1478 got->br_startoff = aoff; 1479 got->br_startblock = nullstartblock(indlen); 1480 got->br_blockcount = alen; 1481 got->br_state = XFS_EXT_NORM; 1482 1483 xfs_bmap_add_extent_hole_delay(ip, whichfork, icur, got); 1484 1485 /* 1486 * Tag the inode if blocks were preallocated. Note that COW fork 1487 * preallocation can occur at the start or end of the extent, even when 1488 * prealloc == 0, so we must also check the aligned offset and length. 1489 */ 1490 if (whichfork == XFS_DATA_FORK && prealloc) 1491 xfs_inode_set_eofblocks_tag(ip); 1492 if (whichfork == XFS_COW_FORK && (prealloc || aoff < off || alen > len)) 1493 xfs_inode_set_cowblocks_tag(ip); 1494 1495 return 0; 1496 1497 out_unreserve_frextents: 1498 if (XFS_IS_REALTIME_INODE(ip)) 1499 xfs_add_frextents(mp, xfs_blen_to_rtbxlen(mp, alen)); 1500 out_unreserve_quota: 1501 if (XFS_IS_QUOTA_ON(mp)) 1502 xfs_quota_unreserve_blkres(ip, alen); 1503 out: 1504 if (error == -ENOSPC || error == -EDQUOT) { 1505 trace_xfs_delalloc_enospc(ip, off, len); 1506 1507 if (prealloc || use_cowextszhint) { 1508 /* retry without any preallocation */ 1509 use_cowextszhint = false; 1510 prealloc = 0; 1511 goto retry; 1512 } 1513 } 1514 return error; 1515 } 1516 1517 static int 1518 xfs_zoned_buffered_write_iomap_begin( 1519 struct inode *inode, 1520 loff_t offset, 1521 loff_t count, 1522 unsigned flags, 1523 struct iomap *iomap, 1524 struct iomap *srcmap) 1525 { 1526 struct iomap_iter *iter = 1527 container_of(iomap, struct iomap_iter, iomap); 1528 struct xfs_zone_alloc_ctx *ac = iter->private; 1529 struct xfs_inode *ip = XFS_I(inode); 1530 struct xfs_mount *mp = ip->i_mount; 1531 xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); 1532 xfs_fileoff_t end_fsb = xfs_iomap_end_fsb(mp, offset, count); 1533 u16 iomap_flags = IOMAP_F_SHARED; 1534 unsigned int lockmode = XFS_ILOCK_EXCL; 1535 xfs_filblks_t count_fsb; 1536 xfs_extlen_t indlen; 1537 struct xfs_bmbt_irec got; 1538 struct xfs_iext_cursor icur; 1539 int error = 0; 1540 1541 ASSERT(!xfs_get_extsz_hint(ip)); 1542 ASSERT(!(flags & IOMAP_UNSHARE)); 1543 ASSERT(ac); 1544 1545 if (xfs_is_shutdown(mp)) 1546 return -EIO; 1547 1548 error = xfs_qm_dqattach(ip); 1549 if (error) 1550 return error; 1551 1552 error = xfs_ilock_for_iomap(ip, flags, &lockmode); 1553 if (error) 1554 return error; 1555 1556 if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(&ip->i_df)) || 1557 XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) { 1558 xfs_bmap_mark_sick(ip, XFS_DATA_FORK); 1559 error = -EFSCORRUPTED; 1560 goto out_unlock; 1561 } 1562 1563 XFS_STATS_INC(mp, xs_blk_mapw); 1564 1565 error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK); 1566 if (error) 1567 goto out_unlock; 1568 1569 /* 1570 * For zeroing operations check if there is any data to zero first. 1571 * 1572 * For regular writes we always need to allocate new blocks, but need to 1573 * provide the source mapping when the range is unaligned to support 1574 * read-modify-write of the whole block in the page cache. 1575 * 1576 * In either case we need to limit the reported range to the boundaries 1577 * of the source map in the data fork. 1578 */ 1579 if (!IS_ALIGNED(offset, mp->m_sb.sb_blocksize) || 1580 !IS_ALIGNED(offset + count, mp->m_sb.sb_blocksize) || 1581 (flags & IOMAP_ZERO)) { 1582 struct xfs_bmbt_irec smap; 1583 struct xfs_iext_cursor scur; 1584 1585 if (!xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &scur, 1586 &smap)) 1587 smap.br_startoff = end_fsb; /* fake hole until EOF */ 1588 if (smap.br_startoff > offset_fsb) { 1589 /* 1590 * We never need to allocate blocks for zeroing a hole. 1591 */ 1592 if (flags & IOMAP_ZERO) { 1593 xfs_hole_to_iomap(ip, iomap, offset_fsb, 1594 smap.br_startoff); 1595 goto out_unlock; 1596 } 1597 end_fsb = min(end_fsb, smap.br_startoff); 1598 } else { 1599 end_fsb = min(end_fsb, 1600 smap.br_startoff + smap.br_blockcount); 1601 xfs_trim_extent(&smap, offset_fsb, 1602 end_fsb - offset_fsb); 1603 error = xfs_bmbt_to_iomap(ip, srcmap, &smap, flags, 0, 1604 xfs_iomap_inode_sequence(ip, 0)); 1605 if (error) 1606 goto out_unlock; 1607 } 1608 } 1609 1610 if (!ip->i_cowfp) 1611 xfs_ifork_init_cow(ip); 1612 1613 if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &got)) 1614 got.br_startoff = end_fsb; 1615 if (got.br_startoff <= offset_fsb) { 1616 trace_xfs_reflink_cow_found(ip, &got); 1617 goto done; 1618 } 1619 1620 /* 1621 * Cap the maximum length to keep the chunks of work done here somewhat 1622 * symmetric with the work writeback does. 1623 */ 1624 end_fsb = min(end_fsb, got.br_startoff); 1625 count_fsb = min3(end_fsb - offset_fsb, XFS_MAX_BMBT_EXTLEN, 1626 XFS_B_TO_FSB(mp, 1024 * PAGE_SIZE)); 1627 1628 /* 1629 * The block reservation is supposed to cover all blocks that the 1630 * operation could possible write, but there is a nasty corner case 1631 * where blocks could be stolen from underneath us: 1632 * 1633 * 1) while this thread iterates over a larger buffered write, 1634 * 2) another thread is causing a write fault that calls into 1635 * ->page_mkwrite in range this thread writes to, using up the 1636 * delalloc reservation created by a previous call to this function. 1637 * 3) another thread does direct I/O on the range that the write fault 1638 * happened on, which causes writeback of the dirty data. 1639 * 4) this then set the stale flag, which cuts the current iomap 1640 * iteration short, causing the new call to ->iomap_begin that gets 1641 * us here again, but now without a sufficient reservation. 1642 * 1643 * This is a very unusual I/O pattern, and nothing but generic/095 is 1644 * known to hit it. There's not really much we can do here, so turn this 1645 * into a short write. 1646 */ 1647 if (count_fsb > ac->reserved_blocks) { 1648 xfs_warn_ratelimited(mp, 1649 "Short write on ino 0x%llx comm %.20s due to three-way race with write fault and direct I/O", 1650 ip->i_ino, current->comm); 1651 count_fsb = ac->reserved_blocks; 1652 if (!count_fsb) { 1653 error = -EIO; 1654 goto out_unlock; 1655 } 1656 } 1657 1658 error = xfs_quota_reserve_blkres(ip, count_fsb); 1659 if (error) 1660 goto out_unlock; 1661 1662 indlen = xfs_bmap_worst_indlen(ip, count_fsb); 1663 error = xfs_dec_fdblocks(mp, indlen, false); 1664 if (error) 1665 goto out_unlock; 1666 ip->i_delayed_blks += count_fsb; 1667 xfs_mod_delalloc(ip, count_fsb, indlen); 1668 1669 got.br_startoff = offset_fsb; 1670 got.br_startblock = nullstartblock(indlen); 1671 got.br_blockcount = count_fsb; 1672 got.br_state = XFS_EXT_NORM; 1673 xfs_bmap_add_extent_hole_delay(ip, XFS_COW_FORK, &icur, &got); 1674 ac->reserved_blocks -= count_fsb; 1675 iomap_flags |= IOMAP_F_NEW; 1676 1677 trace_xfs_iomap_alloc(ip, offset, XFS_FSB_TO_B(mp, count_fsb), 1678 XFS_COW_FORK, &got); 1679 done: 1680 error = xfs_bmbt_to_iomap(ip, iomap, &got, flags, iomap_flags, 1681 xfs_iomap_inode_sequence(ip, IOMAP_F_SHARED)); 1682 out_unlock: 1683 xfs_iunlock(ip, lockmode); 1684 return error; 1685 } 1686 1687 static int 1688 xfs_buffered_write_iomap_begin( 1689 struct inode *inode, 1690 loff_t offset, 1691 loff_t count, 1692 unsigned flags, 1693 struct iomap *iomap, 1694 struct iomap *srcmap) 1695 { 1696 struct xfs_inode *ip = XFS_I(inode); 1697 struct xfs_mount *mp = ip->i_mount; 1698 xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); 1699 xfs_fileoff_t end_fsb = xfs_iomap_end_fsb(mp, offset, count); 1700 struct xfs_bmbt_irec imap, cmap; 1701 struct xfs_iext_cursor icur, ccur; 1702 xfs_fsblock_t prealloc_blocks = 0; 1703 bool eof = false, cow_eof = false, shared = false; 1704 int allocfork = XFS_DATA_FORK; 1705 int error = 0; 1706 unsigned int lockmode = XFS_ILOCK_EXCL; 1707 unsigned int iomap_flags = 0; 1708 u64 seq; 1709 1710 if (xfs_is_shutdown(mp)) 1711 return -EIO; 1712 1713 if (xfs_is_zoned_inode(ip)) 1714 return xfs_zoned_buffered_write_iomap_begin(inode, offset, 1715 count, flags, iomap, srcmap); 1716 1717 /* we can't use delayed allocations when using extent size hints */ 1718 if (xfs_get_extsz_hint(ip)) 1719 return xfs_direct_write_iomap_begin(inode, offset, count, 1720 flags, iomap, srcmap); 1721 1722 error = xfs_qm_dqattach(ip); 1723 if (error) 1724 return error; 1725 1726 error = xfs_ilock_for_iomap(ip, flags, &lockmode); 1727 if (error) 1728 return error; 1729 1730 if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(&ip->i_df)) || 1731 XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) { 1732 xfs_bmap_mark_sick(ip, XFS_DATA_FORK); 1733 error = -EFSCORRUPTED; 1734 goto out_unlock; 1735 } 1736 1737 XFS_STATS_INC(mp, xs_blk_mapw); 1738 1739 error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK); 1740 if (error) 1741 goto out_unlock; 1742 1743 /* 1744 * Search the data fork first to look up our source mapping. We 1745 * always need the data fork map, as we have to return it to the 1746 * iomap code so that the higher level write code can read data in to 1747 * perform read-modify-write cycles for unaligned writes. 1748 */ 1749 eof = !xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &icur, &imap); 1750 if (eof) 1751 imap.br_startoff = end_fsb; /* fake hole until the end */ 1752 1753 /* We never need to allocate blocks for zeroing or unsharing a hole. */ 1754 if ((flags & (IOMAP_UNSHARE | IOMAP_ZERO)) && 1755 imap.br_startoff > offset_fsb) { 1756 xfs_hole_to_iomap(ip, iomap, offset_fsb, imap.br_startoff); 1757 goto out_unlock; 1758 } 1759 1760 /* 1761 * For zeroing, trim a delalloc extent that extends beyond the EOF 1762 * block. If it starts beyond the EOF block, convert it to an 1763 * unwritten extent. 1764 */ 1765 if ((flags & IOMAP_ZERO) && imap.br_startoff <= offset_fsb && 1766 isnullstartblock(imap.br_startblock)) { 1767 xfs_fileoff_t eof_fsb = XFS_B_TO_FSB(mp, XFS_ISIZE(ip)); 1768 1769 if (offset_fsb >= eof_fsb) 1770 goto convert_delay; 1771 if (end_fsb > eof_fsb) { 1772 end_fsb = eof_fsb; 1773 xfs_trim_extent(&imap, offset_fsb, 1774 end_fsb - offset_fsb); 1775 } 1776 } 1777 1778 /* 1779 * Search the COW fork extent list even if we did not find a data fork 1780 * extent. This serves two purposes: first this implements the 1781 * speculative preallocation using cowextsize, so that we also unshare 1782 * block adjacent to shared blocks instead of just the shared blocks 1783 * themselves. Second the lookup in the extent list is generally faster 1784 * than going out to the shared extent tree. 1785 */ 1786 if (xfs_is_cow_inode(ip)) { 1787 if (!ip->i_cowfp) { 1788 ASSERT(!xfs_is_reflink_inode(ip)); 1789 xfs_ifork_init_cow(ip); 1790 } 1791 cow_eof = !xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, 1792 &ccur, &cmap); 1793 if (!cow_eof && cmap.br_startoff <= offset_fsb) { 1794 trace_xfs_reflink_cow_found(ip, &cmap); 1795 goto found_cow; 1796 } 1797 } 1798 1799 if (imap.br_startoff <= offset_fsb) { 1800 /* 1801 * For reflink files we may need a delalloc reservation when 1802 * overwriting shared extents. This includes zeroing of 1803 * existing extents that contain data. 1804 */ 1805 if (!xfs_is_cow_inode(ip) || 1806 ((flags & IOMAP_ZERO) && imap.br_state != XFS_EXT_NORM)) { 1807 trace_xfs_iomap_found(ip, offset, count, XFS_DATA_FORK, 1808 &imap); 1809 goto found_imap; 1810 } 1811 1812 xfs_trim_extent(&imap, offset_fsb, end_fsb - offset_fsb); 1813 1814 /* Trim the mapping to the nearest shared extent boundary. */ 1815 error = xfs_bmap_trim_cow(ip, &imap, &shared); 1816 if (error) 1817 goto out_unlock; 1818 1819 /* Not shared? Just report the (potentially capped) extent. */ 1820 if (!shared) { 1821 trace_xfs_iomap_found(ip, offset, count, XFS_DATA_FORK, 1822 &imap); 1823 goto found_imap; 1824 } 1825 1826 /* 1827 * Fork all the shared blocks from our write offset until the 1828 * end of the extent. 1829 */ 1830 allocfork = XFS_COW_FORK; 1831 end_fsb = imap.br_startoff + imap.br_blockcount; 1832 } else { 1833 /* 1834 * We cap the maximum length we map here to MAX_WRITEBACK_PAGES 1835 * pages to keep the chunks of work done where somewhat 1836 * symmetric with the work writeback does. This is a completely 1837 * arbitrary number pulled out of thin air. 1838 * 1839 * Note that the values needs to be less than 32-bits wide until 1840 * the lower level functions are updated. 1841 */ 1842 count = min_t(loff_t, count, 1024 * PAGE_SIZE); 1843 end_fsb = xfs_iomap_end_fsb(mp, offset, count); 1844 1845 if (xfs_is_always_cow_inode(ip)) 1846 allocfork = XFS_COW_FORK; 1847 } 1848 1849 if (eof && offset + count > XFS_ISIZE(ip)) { 1850 /* 1851 * Determine the initial size of the preallocation. 1852 * We clean up any extra preallocation when the file is closed. 1853 */ 1854 if (xfs_has_allocsize(mp)) 1855 prealloc_blocks = mp->m_allocsize_blocks; 1856 else if (allocfork == XFS_DATA_FORK) 1857 prealloc_blocks = xfs_iomap_prealloc_size(ip, allocfork, 1858 offset, count, &icur); 1859 else 1860 prealloc_blocks = xfs_iomap_prealloc_size(ip, allocfork, 1861 offset, count, &ccur); 1862 if (prealloc_blocks) { 1863 xfs_extlen_t align; 1864 xfs_off_t end_offset; 1865 xfs_fileoff_t p_end_fsb; 1866 1867 end_offset = XFS_ALLOC_ALIGN(mp, offset + count - 1); 1868 p_end_fsb = XFS_B_TO_FSBT(mp, end_offset) + 1869 prealloc_blocks; 1870 1871 align = xfs_eof_alignment(ip); 1872 if (align) 1873 p_end_fsb = roundup_64(p_end_fsb, align); 1874 1875 p_end_fsb = min(p_end_fsb, 1876 XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes)); 1877 ASSERT(p_end_fsb > offset_fsb); 1878 prealloc_blocks = p_end_fsb - end_fsb; 1879 } 1880 } 1881 1882 /* 1883 * Flag newly allocated delalloc blocks with IOMAP_F_NEW so we punch 1884 * them out if the write happens to fail. 1885 */ 1886 iomap_flags |= IOMAP_F_NEW; 1887 if (allocfork == XFS_COW_FORK) { 1888 error = xfs_bmapi_reserve_delalloc(ip, allocfork, offset_fsb, 1889 end_fsb - offset_fsb, prealloc_blocks, &cmap, 1890 &ccur, cow_eof); 1891 if (error) 1892 goto out_unlock; 1893 1894 trace_xfs_iomap_alloc(ip, offset, count, allocfork, &cmap); 1895 goto found_cow; 1896 } 1897 1898 error = xfs_bmapi_reserve_delalloc(ip, allocfork, offset_fsb, 1899 end_fsb - offset_fsb, prealloc_blocks, &imap, &icur, 1900 eof); 1901 if (error) 1902 goto out_unlock; 1903 1904 trace_xfs_iomap_alloc(ip, offset, count, allocfork, &imap); 1905 found_imap: 1906 seq = xfs_iomap_inode_sequence(ip, iomap_flags); 1907 xfs_iunlock(ip, lockmode); 1908 return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, iomap_flags, seq); 1909 1910 convert_delay: 1911 xfs_iunlock(ip, lockmode); 1912 truncate_pagecache(inode, offset); 1913 error = xfs_bmapi_convert_delalloc(ip, XFS_DATA_FORK, offset, 1914 iomap, NULL); 1915 if (error) 1916 return error; 1917 1918 trace_xfs_iomap_alloc(ip, offset, count, XFS_DATA_FORK, &imap); 1919 return 0; 1920 1921 found_cow: 1922 if (imap.br_startoff <= offset_fsb) { 1923 error = xfs_bmbt_to_iomap(ip, srcmap, &imap, flags, 0, 1924 xfs_iomap_inode_sequence(ip, 0)); 1925 if (error) 1926 goto out_unlock; 1927 } else { 1928 xfs_trim_extent(&cmap, offset_fsb, 1929 imap.br_startoff - offset_fsb); 1930 } 1931 1932 iomap_flags |= IOMAP_F_SHARED; 1933 seq = xfs_iomap_inode_sequence(ip, iomap_flags); 1934 xfs_iunlock(ip, lockmode); 1935 return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, iomap_flags, seq); 1936 1937 out_unlock: 1938 xfs_iunlock(ip, lockmode); 1939 return error; 1940 } 1941 1942 static void 1943 xfs_buffered_write_delalloc_punch( 1944 struct inode *inode, 1945 loff_t offset, 1946 loff_t length, 1947 struct iomap *iomap) 1948 { 1949 struct iomap_iter *iter = 1950 container_of(iomap, struct iomap_iter, iomap); 1951 1952 xfs_bmap_punch_delalloc_range(XFS_I(inode), 1953 (iomap->flags & IOMAP_F_SHARED) ? 1954 XFS_COW_FORK : XFS_DATA_FORK, 1955 offset, offset + length, iter->private); 1956 } 1957 1958 static int 1959 xfs_buffered_write_iomap_end( 1960 struct inode *inode, 1961 loff_t offset, 1962 loff_t length, 1963 ssize_t written, 1964 unsigned flags, 1965 struct iomap *iomap) 1966 { 1967 loff_t start_byte, end_byte; 1968 1969 /* If we didn't reserve the blocks, we're not allowed to punch them. */ 1970 if (iomap->type != IOMAP_DELALLOC || !(iomap->flags & IOMAP_F_NEW)) 1971 return 0; 1972 1973 /* 1974 * iomap_page_mkwrite() will never fail in a way that requires delalloc 1975 * extents that it allocated to be revoked. Hence never try to release 1976 * them here. 1977 */ 1978 if (flags & IOMAP_FAULT) 1979 return 0; 1980 1981 /* Nothing to do if we've written the entire delalloc extent */ 1982 start_byte = iomap_last_written_block(inode, offset, written); 1983 end_byte = round_up(offset + length, i_blocksize(inode)); 1984 if (start_byte >= end_byte) 1985 return 0; 1986 1987 /* For zeroing operations the callers already hold invalidate_lock. */ 1988 if (flags & (IOMAP_UNSHARE | IOMAP_ZERO)) { 1989 rwsem_assert_held_write(&inode->i_mapping->invalidate_lock); 1990 iomap_write_delalloc_release(inode, start_byte, end_byte, flags, 1991 iomap, xfs_buffered_write_delalloc_punch); 1992 } else { 1993 filemap_invalidate_lock(inode->i_mapping); 1994 iomap_write_delalloc_release(inode, start_byte, end_byte, flags, 1995 iomap, xfs_buffered_write_delalloc_punch); 1996 filemap_invalidate_unlock(inode->i_mapping); 1997 } 1998 1999 return 0; 2000 } 2001 2002 const struct iomap_ops xfs_buffered_write_iomap_ops = { 2003 .iomap_begin = xfs_buffered_write_iomap_begin, 2004 .iomap_end = xfs_buffered_write_iomap_end, 2005 }; 2006 2007 static int 2008 xfs_read_iomap_begin( 2009 struct inode *inode, 2010 loff_t offset, 2011 loff_t length, 2012 unsigned flags, 2013 struct iomap *iomap, 2014 struct iomap *srcmap) 2015 { 2016 struct xfs_inode *ip = XFS_I(inode); 2017 struct xfs_mount *mp = ip->i_mount; 2018 struct xfs_bmbt_irec imap; 2019 xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); 2020 xfs_fileoff_t end_fsb = xfs_iomap_end_fsb(mp, offset, length); 2021 int nimaps = 1, error = 0; 2022 bool shared = false; 2023 unsigned int lockmode = XFS_ILOCK_SHARED; 2024 u64 seq; 2025 2026 ASSERT(!(flags & (IOMAP_WRITE | IOMAP_ZERO))); 2027 2028 if (xfs_is_shutdown(mp)) 2029 return -EIO; 2030 2031 error = xfs_ilock_for_iomap(ip, flags, &lockmode); 2032 if (error) 2033 return error; 2034 error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap, 2035 &nimaps, 0); 2036 if (!error && ((flags & IOMAP_REPORT) || IS_DAX(inode))) 2037 error = xfs_reflink_trim_around_shared(ip, &imap, &shared); 2038 seq = xfs_iomap_inode_sequence(ip, shared ? IOMAP_F_SHARED : 0); 2039 xfs_iunlock(ip, lockmode); 2040 2041 if (error) 2042 return error; 2043 trace_xfs_iomap_found(ip, offset, length, XFS_DATA_FORK, &imap); 2044 return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 2045 shared ? IOMAP_F_SHARED : 0, seq); 2046 } 2047 2048 const struct iomap_ops xfs_read_iomap_ops = { 2049 .iomap_begin = xfs_read_iomap_begin, 2050 }; 2051 2052 static int 2053 xfs_seek_iomap_begin( 2054 struct inode *inode, 2055 loff_t offset, 2056 loff_t length, 2057 unsigned flags, 2058 struct iomap *iomap, 2059 struct iomap *srcmap) 2060 { 2061 struct xfs_inode *ip = XFS_I(inode); 2062 struct xfs_mount *mp = ip->i_mount; 2063 xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); 2064 xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + length); 2065 xfs_fileoff_t cow_fsb = NULLFILEOFF, data_fsb = NULLFILEOFF; 2066 struct xfs_iext_cursor icur; 2067 struct xfs_bmbt_irec imap, cmap; 2068 int error = 0; 2069 unsigned lockmode; 2070 u64 seq; 2071 2072 if (xfs_is_shutdown(mp)) 2073 return -EIO; 2074 2075 lockmode = xfs_ilock_data_map_shared(ip); 2076 error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK); 2077 if (error) 2078 goto out_unlock; 2079 2080 if (xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &icur, &imap)) { 2081 /* 2082 * If we found a data extent we are done. 2083 */ 2084 if (imap.br_startoff <= offset_fsb) 2085 goto done; 2086 data_fsb = imap.br_startoff; 2087 } else { 2088 /* 2089 * Fake a hole until the end of the file. 2090 */ 2091 data_fsb = xfs_iomap_end_fsb(mp, offset, length); 2092 } 2093 2094 /* 2095 * If a COW fork extent covers the hole, report it - capped to the next 2096 * data fork extent: 2097 */ 2098 if (xfs_inode_has_cow_data(ip) && 2099 xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &cmap)) 2100 cow_fsb = cmap.br_startoff; 2101 if (cow_fsb != NULLFILEOFF && cow_fsb <= offset_fsb) { 2102 if (data_fsb < cow_fsb + cmap.br_blockcount) 2103 end_fsb = min(end_fsb, data_fsb); 2104 xfs_trim_extent(&cmap, offset_fsb, end_fsb - offset_fsb); 2105 seq = xfs_iomap_inode_sequence(ip, IOMAP_F_SHARED); 2106 error = xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, 2107 IOMAP_F_SHARED, seq); 2108 /* 2109 * This is a COW extent, so we must probe the page cache 2110 * because there could be dirty page cache being backed 2111 * by this extent. 2112 */ 2113 iomap->type = IOMAP_UNWRITTEN; 2114 goto out_unlock; 2115 } 2116 2117 /* 2118 * Else report a hole, capped to the next found data or COW extent. 2119 */ 2120 if (cow_fsb != NULLFILEOFF && cow_fsb < data_fsb) 2121 imap.br_blockcount = cow_fsb - offset_fsb; 2122 else 2123 imap.br_blockcount = data_fsb - offset_fsb; 2124 imap.br_startoff = offset_fsb; 2125 imap.br_startblock = HOLESTARTBLOCK; 2126 imap.br_state = XFS_EXT_NORM; 2127 done: 2128 seq = xfs_iomap_inode_sequence(ip, 0); 2129 xfs_trim_extent(&imap, offset_fsb, end_fsb - offset_fsb); 2130 error = xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0, seq); 2131 out_unlock: 2132 xfs_iunlock(ip, lockmode); 2133 return error; 2134 } 2135 2136 const struct iomap_ops xfs_seek_iomap_ops = { 2137 .iomap_begin = xfs_seek_iomap_begin, 2138 }; 2139 2140 static int 2141 xfs_xattr_iomap_begin( 2142 struct inode *inode, 2143 loff_t offset, 2144 loff_t length, 2145 unsigned flags, 2146 struct iomap *iomap, 2147 struct iomap *srcmap) 2148 { 2149 struct xfs_inode *ip = XFS_I(inode); 2150 struct xfs_mount *mp = ip->i_mount; 2151 xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); 2152 xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + length); 2153 struct xfs_bmbt_irec imap; 2154 int nimaps = 1, error = 0; 2155 unsigned lockmode; 2156 int seq; 2157 2158 if (xfs_is_shutdown(mp)) 2159 return -EIO; 2160 2161 lockmode = xfs_ilock_attr_map_shared(ip); 2162 2163 /* if there are no attribute fork or extents, return ENOENT */ 2164 if (!xfs_inode_has_attr_fork(ip) || !ip->i_af.if_nextents) { 2165 error = -ENOENT; 2166 goto out_unlock; 2167 } 2168 2169 ASSERT(ip->i_af.if_format != XFS_DINODE_FMT_LOCAL); 2170 error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap, 2171 &nimaps, XFS_BMAPI_ATTRFORK); 2172 out_unlock: 2173 2174 seq = xfs_iomap_inode_sequence(ip, IOMAP_F_XATTR); 2175 xfs_iunlock(ip, lockmode); 2176 2177 if (error) 2178 return error; 2179 ASSERT(nimaps); 2180 return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, IOMAP_F_XATTR, seq); 2181 } 2182 2183 const struct iomap_ops xfs_xattr_iomap_ops = { 2184 .iomap_begin = xfs_xattr_iomap_begin, 2185 }; 2186 2187 int 2188 xfs_zero_range( 2189 struct xfs_inode *ip, 2190 loff_t pos, 2191 loff_t len, 2192 struct xfs_zone_alloc_ctx *ac, 2193 bool *did_zero) 2194 { 2195 struct inode *inode = VFS_I(ip); 2196 2197 xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL); 2198 2199 if (IS_DAX(inode)) 2200 return dax_zero_range(inode, pos, len, did_zero, 2201 &xfs_dax_write_iomap_ops); 2202 return iomap_zero_range(inode, pos, len, did_zero, 2203 &xfs_buffered_write_iomap_ops, &xfs_iomap_write_ops, 2204 ac); 2205 } 2206 2207 int 2208 xfs_truncate_page( 2209 struct xfs_inode *ip, 2210 loff_t pos, 2211 struct xfs_zone_alloc_ctx *ac, 2212 bool *did_zero) 2213 { 2214 struct inode *inode = VFS_I(ip); 2215 2216 if (IS_DAX(inode)) 2217 return dax_truncate_page(inode, pos, did_zero, 2218 &xfs_dax_write_iomap_ops); 2219 return iomap_truncate_page(inode, pos, did_zero, 2220 &xfs_buffered_write_iomap_ops, &xfs_iomap_write_ops, 2221 ac); 2222 } 2223