1 // SPDX-License-Identifier: GPL-2.0 2 3 #include <linux/fsverity.h> 4 #include <linux/iomap.h> 5 #include "ctree.h" 6 #include "delalloc-space.h" 7 #include "direct-io.h" 8 #include "extent-tree.h" 9 #include "file.h" 10 #include "fs.h" 11 #include "transaction.h" 12 #include "volumes.h" 13 #include "bio.h" 14 #include "ordered-data.h" 15 16 struct btrfs_dio_data { 17 ssize_t submitted; 18 struct extent_changeset *data_reserved; 19 struct btrfs_ordered_extent *ordered; 20 bool data_space_reserved; 21 bool nocow_done; 22 }; 23 24 struct btrfs_dio_private { 25 /* Range of I/O */ 26 u64 file_offset; 27 u32 bytes; 28 29 /* This must be last */ 30 struct btrfs_bio bbio; 31 }; 32 33 static struct bio_set btrfs_dio_bioset; 34 35 static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, 36 struct extent_state **cached_state, 37 unsigned int iomap_flags) 38 { 39 const bool writing = (iomap_flags & IOMAP_WRITE); 40 const bool nowait = (iomap_flags & IOMAP_NOWAIT); 41 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 42 struct btrfs_ordered_extent *ordered; 43 int ret = 0; 44 45 /* Direct lock must be taken before the extent lock. */ 46 if (nowait) { 47 if (!btrfs_try_lock_dio_extent(io_tree, lockstart, lockend, cached_state)) 48 return -EAGAIN; 49 } else { 50 btrfs_lock_dio_extent(io_tree, lockstart, lockend, cached_state); 51 } 52 53 while (1) { 54 if (nowait) { 55 if (!btrfs_try_lock_extent(io_tree, lockstart, lockend, 56 cached_state)) { 57 ret = -EAGAIN; 58 break; 59 } 60 } else { 61 btrfs_lock_extent(io_tree, lockstart, lockend, cached_state); 62 } 63 /* 64 * We're concerned with the entire range that we're going to be 65 * doing DIO to, so we need to make sure there's no ordered 66 * extents in this range. 67 */ 68 ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), lockstart, 69 lockend - lockstart + 1); 70 71 /* 72 * We need to make sure there are no buffered pages in this 73 * range either, we could have raced between the invalidate in 74 * generic_file_direct_write and locking the extent. The 75 * invalidate needs to happen so that reads after a write do not 76 * get stale data. 77 */ 78 if (!ordered && 79 (!writing || !filemap_range_has_page(inode->i_mapping, 80 lockstart, lockend))) 81 break; 82 83 btrfs_unlock_extent(io_tree, lockstart, lockend, cached_state); 84 85 if (ordered) { 86 if (nowait) { 87 btrfs_put_ordered_extent(ordered); 88 ret = -EAGAIN; 89 break; 90 } 91 /* 92 * If we are doing a DIO read and the ordered extent we 93 * found is for a buffered write, we can not wait for it 94 * to complete and retry, because if we do so we can 95 * deadlock with concurrent buffered writes on page 96 * locks. This happens only if our DIO read covers more 97 * than one extent map, if at this point has already 98 * created an ordered extent for a previous extent map 99 * and locked its range in the inode's io tree, and a 100 * concurrent write against that previous extent map's 101 * range and this range started (we unlock the ranges 102 * in the io tree only when the bios complete and 103 * buffered writes always lock pages before attempting 104 * to lock range in the io tree). 105 */ 106 if (writing || 107 test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags)) 108 btrfs_start_ordered_extent(ordered); 109 else 110 ret = nowait ? -EAGAIN : -ENOTBLK; 111 btrfs_put_ordered_extent(ordered); 112 } else { 113 /* 114 * We could trigger writeback for this range (and wait 115 * for it to complete) and then invalidate the pages for 116 * this range (through invalidate_inode_pages2_range()), 117 * but that can lead us to a deadlock with a concurrent 118 * call to readahead (a buffered read or a defrag call 119 * triggered a readahead) on a page lock due to an 120 * ordered dio extent we created before but did not have 121 * yet a corresponding bio submitted (whence it can not 122 * complete), which makes readahead wait for that 123 * ordered extent to complete while holding a lock on 124 * that page. 125 */ 126 ret = nowait ? -EAGAIN : -ENOTBLK; 127 } 128 129 if (ret) 130 break; 131 132 cond_resched(); 133 } 134 135 if (ret) 136 btrfs_unlock_dio_extent(io_tree, lockstart, lockend, cached_state); 137 return ret; 138 } 139 140 static struct extent_map *btrfs_create_dio_extent(struct btrfs_inode *inode, 141 struct btrfs_dio_data *dio_data, 142 const u64 start, 143 const struct btrfs_file_extent *file_extent, 144 const int type) 145 { 146 struct extent_map *em = NULL; 147 struct btrfs_ordered_extent *ordered; 148 149 if (type != BTRFS_ORDERED_NOCOW) { 150 em = btrfs_create_io_em(inode, start, file_extent, type); 151 if (IS_ERR(em)) 152 goto out; 153 } 154 155 ordered = btrfs_alloc_ordered_extent(inode, start, file_extent, 156 (1U << type) | 157 (1U << BTRFS_ORDERED_DIRECT)); 158 if (IS_ERR(ordered)) { 159 if (em) { 160 btrfs_free_extent_map(em); 161 btrfs_drop_extent_map_range(inode, start, 162 start + file_extent->num_bytes - 1, false); 163 } 164 em = ERR_CAST(ordered); 165 } else { 166 ASSERT(!dio_data->ordered); 167 dio_data->ordered = ordered; 168 } 169 out: 170 171 return em; 172 } 173 174 static struct extent_map *btrfs_new_extent_direct(struct btrfs_inode *inode, 175 struct btrfs_dio_data *dio_data, 176 u64 start, u64 len) 177 { 178 struct btrfs_root *root = inode->root; 179 struct btrfs_fs_info *fs_info = root->fs_info; 180 struct btrfs_file_extent file_extent; 181 struct extent_map *em; 182 struct btrfs_key ins; 183 u64 alloc_hint; 184 int ret; 185 186 alloc_hint = btrfs_get_extent_allocation_hint(inode, start, len); 187 again: 188 ret = btrfs_reserve_extent(root, len, len, fs_info->sectorsize, 189 0, alloc_hint, &ins, true, true); 190 if (ret == -EAGAIN) { 191 ASSERT(btrfs_is_zoned(fs_info)); 192 wait_on_bit_io(&inode->root->fs_info->flags, BTRFS_FS_NEED_ZONE_FINISH, 193 TASK_UNINTERRUPTIBLE); 194 goto again; 195 } 196 if (ret) 197 return ERR_PTR(ret); 198 199 file_extent.disk_bytenr = ins.objectid; 200 file_extent.disk_num_bytes = ins.offset; 201 file_extent.num_bytes = ins.offset; 202 file_extent.ram_bytes = ins.offset; 203 file_extent.offset = 0; 204 file_extent.compression = BTRFS_COMPRESS_NONE; 205 em = btrfs_create_dio_extent(inode, dio_data, start, &file_extent, 206 BTRFS_ORDERED_REGULAR); 207 btrfs_dec_block_group_reservations(fs_info, ins.objectid); 208 if (IS_ERR(em)) 209 btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, true); 210 211 return em; 212 } 213 214 static int btrfs_get_blocks_direct_write(struct extent_map **map, 215 struct inode *inode, 216 struct btrfs_dio_data *dio_data, 217 u64 start, u64 *lenp, 218 unsigned int iomap_flags) 219 { 220 const bool nowait = (iomap_flags & IOMAP_NOWAIT); 221 struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); 222 struct btrfs_file_extent file_extent; 223 struct extent_map *em = *map; 224 int type; 225 u64 block_start; 226 struct btrfs_block_group *bg; 227 bool can_nocow = false; 228 bool space_reserved = false; 229 u64 len = *lenp; 230 u64 prev_len; 231 int ret = 0; 232 233 /* 234 * We don't allocate a new extent in the following cases 235 * 236 * 1) The inode is marked as NODATACOW. In this case we'll just use the 237 * existing extent. 238 * 2) The extent is marked as PREALLOC. We're good to go here and can 239 * just use the extent. 240 * 241 */ 242 if ((em->flags & EXTENT_FLAG_PREALLOC) || 243 ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) && 244 em->disk_bytenr != EXTENT_MAP_HOLE)) { 245 if (em->flags & EXTENT_FLAG_PREALLOC) 246 type = BTRFS_ORDERED_PREALLOC; 247 else 248 type = BTRFS_ORDERED_NOCOW; 249 len = min(len, em->len - (start - em->start)); 250 block_start = btrfs_extent_map_block_start(em) + (start - em->start); 251 252 if (can_nocow_extent(BTRFS_I(inode), start, &len, &file_extent, 253 false) == 1) { 254 bg = btrfs_inc_nocow_writers(fs_info, block_start); 255 if (bg) 256 can_nocow = true; 257 } 258 } 259 260 prev_len = len; 261 if (can_nocow) { 262 struct extent_map *em2; 263 264 /* We can NOCOW, so only need to reserve metadata space. */ 265 ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len, len, 266 nowait); 267 if (ret < 0) { 268 /* Our caller expects us to free the input extent map. */ 269 btrfs_free_extent_map(em); 270 *map = NULL; 271 btrfs_dec_nocow_writers(bg); 272 if (nowait && (ret == -ENOSPC || ret == -EDQUOT)) 273 ret = -EAGAIN; 274 goto out; 275 } 276 space_reserved = true; 277 278 em2 = btrfs_create_dio_extent(BTRFS_I(inode), dio_data, start, 279 &file_extent, type); 280 btrfs_dec_nocow_writers(bg); 281 if (type == BTRFS_ORDERED_PREALLOC) { 282 btrfs_free_extent_map(em); 283 *map = em2; 284 em = em2; 285 } 286 287 if (IS_ERR(em2)) { 288 ret = PTR_ERR(em2); 289 goto out; 290 } 291 292 dio_data->nocow_done = true; 293 } else { 294 /* Our caller expects us to free the input extent map. */ 295 btrfs_free_extent_map(em); 296 *map = NULL; 297 298 if (nowait) { 299 ret = -EAGAIN; 300 goto out; 301 } 302 303 /* 304 * If we could not allocate data space before locking the file 305 * range and we can't do a NOCOW write, then we have to fail. 306 */ 307 if (!dio_data->data_space_reserved) { 308 ret = -ENOSPC; 309 goto out; 310 } 311 312 /* 313 * We have to COW and we have already reserved data space before, 314 * so now we reserve only metadata. 315 */ 316 ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len, len, 317 false); 318 if (ret < 0) 319 goto out; 320 space_reserved = true; 321 322 em = btrfs_new_extent_direct(BTRFS_I(inode), dio_data, start, len); 323 if (IS_ERR(em)) { 324 ret = PTR_ERR(em); 325 goto out; 326 } 327 *map = em; 328 len = min(len, em->len - (start - em->start)); 329 if (len < prev_len) 330 btrfs_delalloc_release_metadata(BTRFS_I(inode), 331 prev_len - len, true); 332 } 333 334 /* 335 * We have created our ordered extent, so we can now release our reservation 336 * for an outstanding extent. 337 */ 338 btrfs_delalloc_release_extents(BTRFS_I(inode), prev_len); 339 340 /* 341 * Need to update the i_size under the extent lock so buffered 342 * readers will get the updated i_size when we unlock. 343 */ 344 if (start + len > i_size_read(inode)) 345 i_size_write(inode, start + len); 346 out: 347 if (ret && space_reserved) { 348 btrfs_delalloc_release_extents(BTRFS_I(inode), len); 349 btrfs_delalloc_release_metadata(BTRFS_I(inode), len, true); 350 } 351 *lenp = len; 352 return ret; 353 } 354 355 static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, 356 loff_t length, unsigned int flags, struct iomap *iomap, 357 struct iomap *srcmap) 358 { 359 struct iomap_iter *iter = container_of(iomap, struct iomap_iter, iomap); 360 struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); 361 struct extent_map *em; 362 struct extent_state *cached_state = NULL; 363 struct btrfs_dio_data *dio_data = iter->private; 364 u64 lockstart, lockend; 365 const bool write = !!(flags & IOMAP_WRITE); 366 int ret = 0; 367 u64 len = length; 368 const u64 data_alloc_len = length; 369 u32 unlock_bits = EXTENT_LOCKED; 370 371 /* 372 * We could potentially fault if we have a buffer > PAGE_SIZE, and if 373 * we're NOWAIT we may submit a bio for a partial range and return 374 * EIOCBQUEUED, which would result in an errant short read. 375 * 376 * The best way to handle this would be to allow for partial completions 377 * of iocb's, so we could submit the partial bio, return and fault in 378 * the rest of the pages, and then submit the io for the rest of the 379 * range. However we don't have that currently, so simply return 380 * -EAGAIN at this point so that the normal path is used. 381 */ 382 if (!write && (flags & IOMAP_NOWAIT) && length > PAGE_SIZE) 383 return -EAGAIN; 384 385 /* 386 * Cap the size of reads to that usually seen in buffered I/O as we need 387 * to allocate a contiguous array for the checksums. 388 */ 389 if (!write) 390 len = min_t(u64, len, fs_info->sectorsize * BIO_MAX_VECS); 391 392 lockstart = start; 393 lockend = start + len - 1; 394 395 /* 396 * iomap_dio_rw() only does filemap_write_and_wait_range(), which isn't 397 * enough if we've written compressed pages to this area, so we need to 398 * flush the dirty pages again to make absolutely sure that any 399 * outstanding dirty pages are on disk - the first flush only starts 400 * compression on the data, while keeping the pages locked, so by the 401 * time the second flush returns we know bios for the compressed pages 402 * were submitted and finished, and the pages no longer under writeback. 403 * 404 * If we have a NOWAIT request and we have any pages in the range that 405 * are locked, likely due to compression still in progress, we don't want 406 * to block on page locks. We also don't want to block on pages marked as 407 * dirty or under writeback (same as for the non-compression case). 408 * iomap_dio_rw() did the same check, but after that and before we got 409 * here, mmap'ed writes may have happened or buffered reads started 410 * (readpage() and readahead(), which lock pages), as we haven't locked 411 * the file range yet. 412 */ 413 if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, 414 &BTRFS_I(inode)->runtime_flags)) { 415 if (flags & IOMAP_NOWAIT) { 416 if (filemap_range_needs_writeback(inode->i_mapping, 417 lockstart, lockend)) 418 return -EAGAIN; 419 } else { 420 ret = filemap_fdatawrite_range(inode->i_mapping, start, 421 start + length - 1); 422 if (ret) 423 return ret; 424 } 425 } 426 427 memset(dio_data, 0, sizeof(*dio_data)); 428 429 /* 430 * We always try to allocate data space and must do it before locking 431 * the file range, to avoid deadlocks with concurrent writes to the same 432 * range if the range has several extents and the writes don't expand the 433 * current i_size (the inode lock is taken in shared mode). If we fail to 434 * allocate data space here we continue and later, after locking the 435 * file range, we fail with ENOSPC only if we figure out we can not do a 436 * NOCOW write. 437 */ 438 if (write && !(flags & IOMAP_NOWAIT)) { 439 ret = btrfs_check_data_free_space(BTRFS_I(inode), 440 &dio_data->data_reserved, 441 start, data_alloc_len, false); 442 if (!ret) 443 dio_data->data_space_reserved = true; 444 else if (!(BTRFS_I(inode)->flags & 445 (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC))) 446 goto err; 447 } 448 449 /* 450 * If this errors out it's because we couldn't invalidate pagecache for 451 * this range and we need to fallback to buffered IO, or we are doing a 452 * NOWAIT read/write and we need to block. 453 */ 454 ret = lock_extent_direct(inode, lockstart, lockend, &cached_state, flags); 455 if (ret < 0) 456 goto err; 457 458 em = btrfs_get_extent(BTRFS_I(inode), NULL, start, len); 459 if (IS_ERR(em)) { 460 ret = PTR_ERR(em); 461 goto unlock_err; 462 } 463 464 /* 465 * Ok for INLINE and COMPRESSED extents we need to fallback on buffered 466 * io. INLINE is special, and we could probably kludge it in here, but 467 * it's still buffered so for safety lets just fall back to the generic 468 * buffered path. 469 * 470 * For COMPRESSED we _have_ to read the entire extent in so we can 471 * decompress it, so there will be buffering required no matter what we 472 * do, so go ahead and fallback to buffered. 473 * 474 * We return -ENOTBLK because that's what makes DIO go ahead and go back 475 * to buffered IO. Don't blame me, this is the price we pay for using 476 * the generic code. 477 */ 478 if (btrfs_extent_map_is_compressed(em) || em->disk_bytenr == EXTENT_MAP_INLINE) { 479 btrfs_free_extent_map(em); 480 /* 481 * If we are in a NOWAIT context, return -EAGAIN in order to 482 * fallback to buffered IO. This is not only because we can 483 * block with buffered IO (no support for NOWAIT semantics at 484 * the moment) but also to avoid returning short reads to user 485 * space - this happens if we were able to read some data from 486 * previous non-compressed extents and then when we fallback to 487 * buffered IO, at btrfs_file_read_iter() by calling 488 * filemap_read(), we fail to fault in pages for the read buffer, 489 * in which case filemap_read() returns a short read (the number 490 * of bytes previously read is > 0, so it does not return -EFAULT). 491 */ 492 ret = (flags & IOMAP_NOWAIT) ? -EAGAIN : -ENOTBLK; 493 goto unlock_err; 494 } 495 496 len = min(len, em->len - (start - em->start)); 497 498 /* 499 * If we have a NOWAIT request and the range contains multiple extents 500 * (or a mix of extents and holes), then we return -EAGAIN to make the 501 * caller fallback to a context where it can do a blocking (without 502 * NOWAIT) request. This way we avoid doing partial IO and returning 503 * success to the caller, which is not optimal for writes and for reads 504 * it can result in unexpected behaviour for an application. 505 * 506 * When doing a read, because we use IOMAP_DIO_PARTIAL when calling 507 * iomap_dio_rw(), we can end up returning less data then what the caller 508 * asked for, resulting in an unexpected, and incorrect, short read. 509 * That is, the caller asked to read N bytes and we return less than that, 510 * which is wrong unless we are crossing EOF. This happens if we get a 511 * page fault error when trying to fault in pages for the buffer that is 512 * associated to the struct iov_iter passed to iomap_dio_rw(), and we 513 * have previously submitted bios for other extents in the range, in 514 * which case iomap_dio_rw() may return us EIOCBQUEUED if not all of 515 * those bios have completed by the time we get the page fault error, 516 * which we return back to our caller - we should only return EIOCBQUEUED 517 * after we have submitted bios for all the extents in the range. 518 */ 519 if ((flags & IOMAP_NOWAIT) && len < length) { 520 btrfs_free_extent_map(em); 521 ret = -EAGAIN; 522 goto unlock_err; 523 } 524 525 if (write) { 526 ret = btrfs_get_blocks_direct_write(&em, inode, dio_data, 527 start, &len, flags); 528 if (ret < 0) 529 goto unlock_err; 530 /* Recalc len in case the new em is smaller than requested */ 531 len = min(len, em->len - (start - em->start)); 532 if (dio_data->data_space_reserved) { 533 u64 release_offset; 534 u64 release_len = 0; 535 536 if (dio_data->nocow_done) { 537 release_offset = start; 538 release_len = data_alloc_len; 539 } else if (len < data_alloc_len) { 540 release_offset = start + len; 541 release_len = data_alloc_len - len; 542 } 543 544 if (release_len > 0) 545 btrfs_free_reserved_data_space(BTRFS_I(inode), 546 dio_data->data_reserved, 547 release_offset, 548 release_len); 549 } 550 } 551 552 /* 553 * Translate extent map information to iomap. 554 * We trim the extents (and move the addr) even though iomap code does 555 * that, since we have locked only the parts we are performing I/O in. 556 */ 557 if ((em->disk_bytenr == EXTENT_MAP_HOLE) || 558 ((em->flags & EXTENT_FLAG_PREALLOC) && !write)) { 559 iomap->addr = IOMAP_NULL_ADDR; 560 iomap->type = IOMAP_HOLE; 561 } else { 562 iomap->addr = btrfs_extent_map_block_start(em) + (start - em->start); 563 iomap->type = IOMAP_MAPPED; 564 } 565 iomap->offset = start; 566 iomap->bdev = fs_info->fs_devices->latest_dev->bdev; 567 iomap->length = len; 568 btrfs_free_extent_map(em); 569 570 /* 571 * Reads will hold the EXTENT_DIO_LOCKED bit until the io is completed, 572 * writes only hold it for this part. We hold the extent lock until 573 * we're completely done with the extent map to make sure it remains 574 * valid. 575 */ 576 if (write) 577 unlock_bits |= EXTENT_DIO_LOCKED; 578 579 btrfs_clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend, 580 unlock_bits, &cached_state); 581 582 /* We didn't use everything, unlock the dio extent for the remainder. */ 583 if (!write && (start + len) < lockend) 584 btrfs_unlock_dio_extent(&BTRFS_I(inode)->io_tree, start + len, 585 lockend, NULL); 586 587 return 0; 588 589 unlock_err: 590 /* 591 * Don't use EXTENT_LOCK_BITS here in case we extend it later and forget 592 * to update this, be explicit that we expect EXTENT_LOCKED and 593 * EXTENT_DIO_LOCKED to be set here, and so that's what we're clearing. 594 */ 595 btrfs_clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend, 596 EXTENT_LOCKED | EXTENT_DIO_LOCKED, &cached_state); 597 err: 598 if (dio_data->data_space_reserved) { 599 btrfs_free_reserved_data_space(BTRFS_I(inode), 600 dio_data->data_reserved, 601 start, data_alloc_len); 602 extent_changeset_free(dio_data->data_reserved); 603 } 604 605 return ret; 606 } 607 608 static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length, 609 ssize_t written, unsigned int flags, struct iomap *iomap) 610 { 611 struct iomap_iter *iter = container_of(iomap, struct iomap_iter, iomap); 612 struct btrfs_dio_data *dio_data = iter->private; 613 size_t submitted = dio_data->submitted; 614 const bool write = !!(flags & IOMAP_WRITE); 615 int ret = 0; 616 617 if (!write && (iomap->type == IOMAP_HOLE)) { 618 /* If reading from a hole, unlock and return */ 619 btrfs_unlock_dio_extent(&BTRFS_I(inode)->io_tree, pos, 620 pos + length - 1, NULL); 621 return 0; 622 } 623 624 if (submitted < length) { 625 pos += submitted; 626 length -= submitted; 627 if (write) 628 btrfs_finish_ordered_extent(dio_data->ordered, NULL, 629 pos, length, false); 630 else 631 btrfs_unlock_dio_extent(&BTRFS_I(inode)->io_tree, pos, 632 pos + length - 1, NULL); 633 ret = -ENOTBLK; 634 } 635 if (write) { 636 btrfs_put_ordered_extent(dio_data->ordered); 637 dio_data->ordered = NULL; 638 } 639 640 if (write) 641 extent_changeset_free(dio_data->data_reserved); 642 return ret; 643 } 644 645 static void btrfs_dio_end_io(struct btrfs_bio *bbio) 646 { 647 struct btrfs_dio_private *dip = 648 container_of(bbio, struct btrfs_dio_private, bbio); 649 struct btrfs_inode *inode = bbio->inode; 650 struct bio *bio = &bbio->bio; 651 652 if (bio->bi_status) { 653 btrfs_warn(inode->root->fs_info, 654 "direct IO failed ino %llu op 0x%0x offset %#llx len %u err no %d", 655 btrfs_ino(inode), bio->bi_opf, 656 dip->file_offset, dip->bytes, bio->bi_status); 657 } 658 659 if (btrfs_op(bio) == BTRFS_MAP_WRITE) { 660 btrfs_finish_ordered_extent(bbio->ordered, NULL, 661 dip->file_offset, dip->bytes, 662 !bio->bi_status); 663 } else { 664 btrfs_unlock_dio_extent(&inode->io_tree, dip->file_offset, 665 dip->file_offset + dip->bytes - 1, NULL); 666 } 667 668 bbio->bio.bi_private = bbio->private; 669 iomap_dio_bio_end_io(bio); 670 } 671 672 static int btrfs_extract_ordered_extent(struct btrfs_bio *bbio, 673 struct btrfs_ordered_extent *ordered) 674 { 675 u64 start = (u64)bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT; 676 u64 len = bbio->bio.bi_iter.bi_size; 677 struct btrfs_ordered_extent *new; 678 int ret; 679 680 /* Must always be called for the beginning of an ordered extent. */ 681 if (WARN_ON_ONCE(start != ordered->disk_bytenr)) 682 return -EINVAL; 683 684 /* No need to split if the ordered extent covers the entire bio. */ 685 if (ordered->disk_num_bytes == len) { 686 refcount_inc(&ordered->refs); 687 bbio->ordered = ordered; 688 return 0; 689 } 690 691 /* 692 * Don't split the extent_map for NOCOW extents, as we're writing into 693 * a pre-existing one. 694 */ 695 if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) { 696 ret = btrfs_split_extent_map(bbio->inode, bbio->file_offset, 697 ordered->num_bytes, len, 698 ordered->disk_bytenr); 699 if (ret) 700 return ret; 701 } 702 703 new = btrfs_split_ordered_extent(ordered, len); 704 if (IS_ERR(new)) 705 return PTR_ERR(new); 706 bbio->ordered = new; 707 return 0; 708 } 709 710 static void btrfs_dio_submit_io(const struct iomap_iter *iter, struct bio *bio, 711 loff_t file_offset) 712 { 713 struct btrfs_bio *bbio = btrfs_bio(bio); 714 struct btrfs_dio_private *dip = 715 container_of(bbio, struct btrfs_dio_private, bbio); 716 struct btrfs_dio_data *dio_data = iter->private; 717 718 btrfs_bio_init(bbio, BTRFS_I(iter->inode), file_offset, 719 btrfs_dio_end_io, bio->bi_private); 720 721 dip->file_offset = file_offset; 722 dip->bytes = bio->bi_iter.bi_size; 723 724 dio_data->submitted += bio->bi_iter.bi_size; 725 726 /* 727 * Check if we are doing a partial write. If we are, we need to split 728 * the ordered extent to match the submitted bio. Hang on to the 729 * remaining unfinishable ordered_extent in dio_data so that it can be 730 * cancelled in iomap_end to avoid a deadlock wherein faulting the 731 * remaining pages is blocked on the outstanding ordered extent. 732 */ 733 if (iter->flags & IOMAP_WRITE) { 734 int ret; 735 736 ret = btrfs_extract_ordered_extent(bbio, dio_data->ordered); 737 if (ret) { 738 btrfs_finish_ordered_extent(dio_data->ordered, NULL, 739 file_offset, dip->bytes, 740 !ret); 741 bio->bi_status = errno_to_blk_status(ret); 742 iomap_dio_bio_end_io(bio); 743 return; 744 } 745 } 746 747 btrfs_submit_bbio(bbio, 0); 748 } 749 750 static const struct iomap_ops btrfs_dio_iomap_ops = { 751 .iomap_begin = btrfs_dio_iomap_begin, 752 .iomap_end = btrfs_dio_iomap_end, 753 }; 754 755 static const struct iomap_dio_ops btrfs_dio_ops = { 756 .submit_io = btrfs_dio_submit_io, 757 .bio_set = &btrfs_dio_bioset, 758 }; 759 760 static ssize_t btrfs_dio_read(struct kiocb *iocb, struct iov_iter *iter, 761 size_t done_before) 762 { 763 struct btrfs_dio_data data = { 0 }; 764 765 return iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops, 766 IOMAP_DIO_PARTIAL | IOMAP_DIO_FSBLOCK_ALIGNED, &data, done_before); 767 } 768 769 static struct iomap_dio *btrfs_dio_write(struct kiocb *iocb, struct iov_iter *iter, 770 size_t done_before) 771 { 772 struct btrfs_dio_data data = { 0 }; 773 774 return __iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops, 775 IOMAP_DIO_PARTIAL | IOMAP_DIO_FSBLOCK_ALIGNED, &data, done_before); 776 } 777 778 static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info, 779 const struct iov_iter *iter, loff_t offset) 780 { 781 const u32 blocksize_mask = fs_info->sectorsize - 1; 782 783 if (offset & blocksize_mask) 784 return -EINVAL; 785 786 if (iov_iter_alignment(iter) & blocksize_mask) 787 return -EINVAL; 788 return 0; 789 } 790 791 ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from) 792 { 793 struct file *file = iocb->ki_filp; 794 struct inode *inode = file_inode(file); 795 struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); 796 loff_t pos; 797 ssize_t written = 0; 798 ssize_t written_buffered; 799 size_t prev_left = 0; 800 loff_t endbyte; 801 ssize_t ret; 802 unsigned int ilock_flags = 0; 803 struct iomap_dio *dio; 804 const u64 data_profile = btrfs_data_alloc_profile(fs_info) & 805 BTRFS_BLOCK_GROUP_PROFILE_MASK; 806 807 if (iocb->ki_flags & IOCB_NOWAIT) 808 ilock_flags |= BTRFS_ILOCK_TRY; 809 810 /* 811 * If the write DIO is within EOF, use a shared lock and also only if 812 * security bits will likely not be dropped by file_remove_privs() called 813 * from btrfs_write_check(). Either will need to be rechecked after the 814 * lock was acquired. 815 */ 816 if (iocb->ki_pos + iov_iter_count(from) <= i_size_read(inode) && IS_NOSEC(inode)) 817 ilock_flags |= BTRFS_ILOCK_SHARED; 818 819 /* 820 * If our data profile has duplication (either extra mirrors or RAID56), 821 * we can not trust the direct IO buffer, the content may change during 822 * writeback and cause different contents written to different mirrors. 823 * 824 * Thus only RAID0 and SINGLE can go true zero-copy direct IO. 825 */ 826 if (data_profile != BTRFS_BLOCK_GROUP_RAID0 && data_profile != 0) 827 goto buffered; 828 829 relock: 830 ret = btrfs_inode_lock(BTRFS_I(inode), ilock_flags); 831 if (ret < 0) 832 return ret; 833 834 /* Shared lock cannot be used with security bits set. */ 835 if ((ilock_flags & BTRFS_ILOCK_SHARED) && !IS_NOSEC(inode)) { 836 btrfs_inode_unlock(BTRFS_I(inode), ilock_flags); 837 ilock_flags &= ~BTRFS_ILOCK_SHARED; 838 goto relock; 839 } 840 841 ret = generic_write_checks(iocb, from); 842 if (ret <= 0) { 843 btrfs_inode_unlock(BTRFS_I(inode), ilock_flags); 844 return ret; 845 } 846 847 ret = btrfs_write_check(iocb, ret); 848 if (ret < 0) { 849 btrfs_inode_unlock(BTRFS_I(inode), ilock_flags); 850 goto out; 851 } 852 853 pos = iocb->ki_pos; 854 /* 855 * Re-check since file size may have changed just before taking the 856 * lock or pos may have changed because of O_APPEND in generic_write_check() 857 */ 858 if ((ilock_flags & BTRFS_ILOCK_SHARED) && 859 pos + iov_iter_count(from) > i_size_read(inode)) { 860 btrfs_inode_unlock(BTRFS_I(inode), ilock_flags); 861 ilock_flags &= ~BTRFS_ILOCK_SHARED; 862 goto relock; 863 } 864 865 if (check_direct_IO(fs_info, from, pos)) { 866 btrfs_inode_unlock(BTRFS_I(inode), ilock_flags); 867 goto buffered; 868 } 869 /* 870 * We can't control the folios being passed in, applications can write 871 * to them while a direct IO write is in progress. This means the 872 * content might change after we calculated the data checksum. 873 * Therefore we can end up storing a checksum that doesn't match the 874 * persisted data. 875 * 876 * To be extra safe and avoid false data checksum mismatch, if the 877 * inode requires data checksum, just fallback to buffered IO. 878 * For buffered IO we have full control of page cache and can ensure 879 * no one is modifying the content during writeback. 880 */ 881 if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { 882 btrfs_inode_unlock(BTRFS_I(inode), ilock_flags); 883 goto buffered; 884 } 885 886 /* 887 * The iov_iter can be mapped to the same file range we are writing to. 888 * If that's the case, then we will deadlock in the iomap code, because 889 * it first calls our callback btrfs_dio_iomap_begin(), which will create 890 * an ordered extent, and after that it will fault in the pages that the 891 * iov_iter refers to. During the fault in we end up in the readahead 892 * pages code (starting at btrfs_readahead()), which will lock the range, 893 * find that ordered extent and then wait for it to complete (at 894 * btrfs_lock_and_flush_ordered_range()), resulting in a deadlock since 895 * obviously the ordered extent can never complete as we didn't submit 896 * yet the respective bio(s). This always happens when the buffer is 897 * memory mapped to the same file range, since the iomap DIO code always 898 * invalidates pages in the target file range (after starting and waiting 899 * for any writeback). 900 * 901 * So here we disable page faults in the iov_iter and then retry if we 902 * got -EFAULT, faulting in the pages before the retry. 903 */ 904 again: 905 from->nofault = true; 906 dio = btrfs_dio_write(iocb, from, written); 907 from->nofault = false; 908 909 if (IS_ERR_OR_NULL(dio)) { 910 ret = PTR_ERR_OR_ZERO(dio); 911 } else { 912 /* 913 * If we have a synchronous write, we must make sure the fsync 914 * triggered by the iomap_dio_complete() call below doesn't 915 * deadlock on the inode lock - we are already holding it and we 916 * can't call it after unlocking because we may need to complete 917 * partial writes due to the input buffer (or parts of it) not 918 * being already faulted in. 919 */ 920 ASSERT(current->journal_info == NULL); 921 current->journal_info = BTRFS_TRANS_DIO_WRITE_STUB; 922 ret = iomap_dio_complete(dio); 923 current->journal_info = NULL; 924 } 925 926 /* No increment (+=) because iomap returns a cumulative value. */ 927 if (ret > 0) 928 written = ret; 929 930 if (iov_iter_count(from) > 0 && (ret == -EFAULT || ret > 0)) { 931 const size_t left = iov_iter_count(from); 932 /* 933 * We have more data left to write. Try to fault in as many as 934 * possible of the remainder pages and retry. We do this without 935 * releasing and locking again the inode, to prevent races with 936 * truncate. 937 * 938 * Also, in case the iov refers to pages in the file range of the 939 * file we want to write to (due to a mmap), we could enter an 940 * infinite loop if we retry after faulting the pages in, since 941 * iomap will invalidate any pages in the range early on, before 942 * it tries to fault in the pages of the iov. So we keep track of 943 * how much was left of iov in the previous EFAULT and fallback 944 * to buffered IO in case we haven't made any progress. 945 */ 946 if (left == prev_left) { 947 ret = -ENOTBLK; 948 } else { 949 fault_in_iov_iter_readable(from, left); 950 prev_left = left; 951 goto again; 952 } 953 } 954 955 btrfs_inode_unlock(BTRFS_I(inode), ilock_flags); 956 957 /* 958 * If 'ret' is -ENOTBLK or we have not written all data, then it means 959 * we must fallback to buffered IO. 960 */ 961 if ((ret < 0 && ret != -ENOTBLK) || !iov_iter_count(from)) 962 goto out; 963 964 buffered: 965 /* 966 * If we are in a NOWAIT context, then return -EAGAIN to signal the caller 967 * it must retry the operation in a context where blocking is acceptable, 968 * because even if we end up not blocking during the buffered IO attempt 969 * below, we will block when flushing and waiting for the IO. 970 */ 971 if (iocb->ki_flags & IOCB_NOWAIT) { 972 ret = -EAGAIN; 973 goto out; 974 } 975 976 pos = iocb->ki_pos; 977 written_buffered = btrfs_buffered_write(iocb, from); 978 if (written_buffered < 0) { 979 ret = written_buffered; 980 goto out; 981 } 982 /* 983 * Ensure all data is persisted. We want the next direct IO read to be 984 * able to read what was just written. 985 */ 986 endbyte = pos + written_buffered - 1; 987 ret = btrfs_fdatawrite_range(BTRFS_I(inode), pos, endbyte); 988 if (ret) 989 goto out; 990 ret = filemap_fdatawait_range(inode->i_mapping, pos, endbyte); 991 if (ret) 992 goto out; 993 written += written_buffered; 994 iocb->ki_pos = pos + written_buffered; 995 invalidate_mapping_pages(file->f_mapping, pos >> PAGE_SHIFT, 996 endbyte >> PAGE_SHIFT); 997 out: 998 return ret < 0 ? ret : written; 999 } 1000 1001 static int check_direct_read(struct btrfs_fs_info *fs_info, 1002 const struct iov_iter *iter, loff_t offset) 1003 { 1004 int ret; 1005 int i, seg; 1006 1007 ret = check_direct_IO(fs_info, iter, offset); 1008 if (ret < 0) 1009 return ret; 1010 1011 if (!iter_is_iovec(iter)) 1012 return 0; 1013 1014 for (seg = 0; seg < iter->nr_segs; seg++) { 1015 for (i = seg + 1; i < iter->nr_segs; i++) { 1016 const struct iovec *iov1 = iter_iov(iter) + seg; 1017 const struct iovec *iov2 = iter_iov(iter) + i; 1018 1019 if (iov1->iov_base == iov2->iov_base) 1020 return -EINVAL; 1021 } 1022 } 1023 return 0; 1024 } 1025 1026 ssize_t btrfs_direct_read(struct kiocb *iocb, struct iov_iter *to) 1027 { 1028 struct inode *inode = file_inode(iocb->ki_filp); 1029 size_t prev_left = 0; 1030 ssize_t read = 0; 1031 ssize_t ret; 1032 1033 if (fsverity_active(inode)) 1034 return 0; 1035 1036 if (check_direct_read(inode_to_fs_info(inode), to, iocb->ki_pos)) 1037 return 0; 1038 1039 btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_SHARED); 1040 again: 1041 /* 1042 * This is similar to what we do for direct IO writes, see the comment 1043 * at btrfs_direct_write(), but we also disable page faults in addition 1044 * to disabling them only at the iov_iter level. This is because when 1045 * reading from a hole or prealloc extent, iomap calls iov_iter_zero(), 1046 * which can still trigger page fault ins despite having set ->nofault 1047 * to true of our 'to' iov_iter. 1048 * 1049 * The difference to direct IO writes is that we deadlock when trying 1050 * to lock the extent range in the inode's tree during he page reads 1051 * triggered by the fault in (while for writes it is due to waiting for 1052 * our own ordered extent). This is because for direct IO reads, 1053 * btrfs_dio_iomap_begin() returns with the extent range locked, which 1054 * is only unlocked in the endio callback (end_bio_extent_readpage()). 1055 */ 1056 pagefault_disable(); 1057 to->nofault = true; 1058 ret = btrfs_dio_read(iocb, to, read); 1059 to->nofault = false; 1060 pagefault_enable(); 1061 1062 /* No increment (+=) because iomap returns a cumulative value. */ 1063 if (ret > 0) 1064 read = ret; 1065 1066 if (iov_iter_count(to) > 0 && (ret == -EFAULT || ret > 0)) { 1067 const size_t left = iov_iter_count(to); 1068 1069 if (left == prev_left) { 1070 /* 1071 * We didn't make any progress since the last attempt, 1072 * fallback to a buffered read for the remainder of the 1073 * range. This is just to avoid any possibility of looping 1074 * for too long. 1075 */ 1076 ret = read; 1077 } else { 1078 /* 1079 * We made some progress since the last retry or this is 1080 * the first time we are retrying. Fault in as many pages 1081 * as possible and retry. 1082 */ 1083 fault_in_iov_iter_writeable(to, left); 1084 prev_left = left; 1085 goto again; 1086 } 1087 } 1088 btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_SHARED); 1089 return ret < 0 ? ret : read; 1090 } 1091 1092 int __init btrfs_init_dio(void) 1093 { 1094 if (bioset_init(&btrfs_dio_bioset, BIO_POOL_SIZE, 1095 offsetof(struct btrfs_dio_private, bbio.bio), 1096 BIOSET_NEED_BVECS)) 1097 return -ENOMEM; 1098 1099 return 0; 1100 } 1101 1102 void __cold btrfs_destroy_dio(void) 1103 { 1104 bioset_exit(&btrfs_dio_bioset); 1105 } 1106