// SPDX-License-Identifier: GPL-2.0

#include <linux/fsverity.h>
#include <linux/iomap.h>
#include "ctree.h"
#include "delalloc-space.h"
#include "direct-io.h"
#include "extent-tree.h"
#include "file.h"
#include "fs.h"
#include "transaction.h"
#include "volumes.h"

/*
 * Direct IO state shared between the iomap callbacks of a single read or
 * write. It is handed to iomap as the private pointer and zeroed at the start
 * of every btrfs_dio_iomap_begin() call.
 */
struct btrfs_dio_data {
	/* Bytes accumulated by btrfs_dio_submit_io() since the last iomap_begin. */
	ssize_t submitted;
	/* Changeset filled in by btrfs_check_data_free_space() for writes. */
	struct extent_changeset *data_reserved;
	/* Ordered extent created for the write mapped by the last iomap_begin. */
	struct btrfs_ordered_extent *ordered;
	/* True if data space was successfully reserved before locking the range. */
	bool data_space_reserved;
	/* True if the write was set up as NOCOW (no new data extent needed). */
	bool nocow_done;
};

/*
 * Per-bio direct IO context. Bios are allocated from btrfs_dio_bioset, whose
 * front padding is sized so that the bio is embedded in this structure.
 */
struct btrfs_dio_private {
	/* Range of I/O */
	u64 file_offset;
	u32 bytes;

	/* This must be last */
	struct btrfs_bio bbio;
};

static struct bio_set btrfs_dio_bioset;

/*
 * Lock the file range [lockstart, lockend] for direct IO.
 *
 * The dio extent lock is taken first, then the extent lock. The extent lock is
 * dropped and retaken in a loop until there is no ordered extent in the range
 * and, for writes, no page cache page in the range.
 *
 * Returns 0 with both locks held. On failure neither lock is held and the
 * return value is -EAGAIN (NOWAIT request that would have to block) or
 * -ENOTBLK (which callers use to fall back to buffered IO).
 */
static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
			      struct extent_state **cached_state,
			      unsigned int iomap_flags)
{
	const bool writing = (iomap_flags & IOMAP_WRITE);
	const bool nowait = (iomap_flags & IOMAP_NOWAIT);
	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
	struct btrfs_ordered_extent *ordered;
	int ret = 0;

	/* Direct lock must be taken before the extent lock. */
	if (nowait) {
		if (!try_lock_dio_extent(io_tree, lockstart, lockend, cached_state))
			return -EAGAIN;
	} else {
		lock_dio_extent(io_tree, lockstart, lockend, cached_state);
	}

	while (1) {
		if (nowait) {
			if (!try_lock_extent(io_tree, lockstart, lockend,
					     cached_state)) {
				ret = -EAGAIN;
				break;
			}
		} else {
			lock_extent(io_tree, lockstart, lockend, cached_state);
		}
		/*
		 * We're concerned with the entire range that we're going to be
		 * doing DIO to, so we need to make sure there's no ordered
		 * extents in this range.
		 */
		ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), lockstart,
						     lockend - lockstart + 1);

		/*
		 * We need to make sure there are no buffered pages in this
		 * range either, we could have raced between the invalidate in
		 * generic_file_direct_write and locking the extent.  The
		 * invalidate needs to happen so that reads after a write do not
		 * get stale data.
		 */
		if (!ordered &&
		    (!writing || !filemap_range_has_page(inode->i_mapping,
							 lockstart, lockend)))
			break;

		/* Something is in the way: drop the extent lock before dealing with it. */
		unlock_extent(io_tree, lockstart, lockend, cached_state);

		if (ordered) {
			if (nowait) {
				btrfs_put_ordered_extent(ordered);
				ret = -EAGAIN;
				break;
			}
			/*
			 * If we are doing a DIO read and the ordered extent we
			 * found is for a buffered write, we can not wait for it
			 * to complete and retry, because if we do so we can
			 * deadlock with concurrent buffered writes on page
			 * locks. This happens only if our DIO read covers more
			 * than one extent map, if at this point has already
			 * created an ordered extent for a previous extent map
			 * and locked its range in the inode's io tree, and a
			 * concurrent write against that previous extent map's
			 * range and this range started (we unlock the ranges
			 * in the io tree only when the bios complete and
			 * buffered writes always lock pages before attempting
			 * to lock range in the io tree).
			 */
			if (writing ||
			    test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags))
				btrfs_start_ordered_extent(ordered);
			else
				ret = nowait ? -EAGAIN : -ENOTBLK;
			btrfs_put_ordered_extent(ordered);
		} else {
			/*
			 * We could trigger writeback for this range (and wait
			 * for it to complete) and then invalidate the pages for
			 * this range (through invalidate_inode_pages2_range()),
			 * but that can lead us to a deadlock with a concurrent
			 * call to readahead (a buffered read or a defrag call
			 * triggered a readahead) on a page lock due to an
			 * ordered dio extent we created before but did not have
			 * yet a corresponding bio submitted (whence it can not
			 * complete), which makes readahead wait for that
			 * ordered extent to complete while holding a lock on
			 * that page.
			 */
			ret = nowait ? -EAGAIN : -ENOTBLK;
		}

		if (ret)
			break;

		cond_resched();
	}

	/* On failure the extent lock is already released, drop the dio lock too. */
	if (ret)
		unlock_dio_extent(io_tree, lockstart, lockend, cached_state);
	return ret;
}

/*
 * Create the ordered extent for a direct IO write starting at @start and
 * described by @file_extent, and store it in @dio_data->ordered.
 *
 * For every type except BTRFS_ORDERED_NOCOW an IO extent map is created first
 * and returned. For BTRFS_ORDERED_NOCOW no extent map is created and NULL is
 * returned on success. On failure an ERR_PTR() is returned; an extent map
 * created here is freed and dropped from the inode's range in that case.
 */
static struct extent_map *btrfs_create_dio_extent(struct btrfs_inode *inode,
						  struct btrfs_dio_data *dio_data,
						  const u64 start,
						  const struct btrfs_file_extent *file_extent,
						  const int type)
{
	struct extent_map *em = NULL;
	struct btrfs_ordered_extent *ordered;

	if (type != BTRFS_ORDERED_NOCOW) {
		em = btrfs_create_io_em(inode, start, file_extent, type);
		if (IS_ERR(em))
			goto out;
	}

	/* The ordered extent is flagged with both its type and BTRFS_ORDERED_DIRECT. */
	ordered = btrfs_alloc_ordered_extent(inode, start, file_extent,
					     (1 << type) |
					     (1 << BTRFS_ORDERED_DIRECT));
	if (IS_ERR(ordered)) {
		if (em) {
			free_extent_map(em);
			btrfs_drop_extent_map_range(inode, start,
					start + file_extent->num_bytes - 1, false);
		}
		em = ERR_CAST(ordered);
	} else {
		/* Only one ordered extent may be tracked per iomap_begin call. */
		ASSERT(!dio_data->ordered);
		dio_data->ordered = ordered;
	}
out:

	return em;
}

/*
 * Reserve a new data extent of up to @len bytes for a COW direct IO write at
 * file offset @start and create the matching extent map and ordered extent.
 *
 * Returns the new extent map, or an ERR_PTR() on failure. If creating the dio
 * extent fails, the reserved extent is freed again.
 */
static struct extent_map *btrfs_new_extent_direct(struct btrfs_inode *inode,
						  struct btrfs_dio_data *dio_data,
						  u64 start, u64 len)
{
	struct btrfs_root *root = inode->root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_file_extent file_extent;
	struct extent_map *em;
	struct btrfs_key ins;
	u64 alloc_hint;
	int ret;

	alloc_hint = btrfs_get_extent_allocation_hint(inode, start, len);
again:
	ret = btrfs_reserve_extent(root, len, len, fs_info->sectorsize,
				   0, alloc_hint, &ins, 1, 1);
	if (ret == -EAGAIN) {
		/*
		 * -EAGAIN is only expected on zoned filesystems: wait for the
		 * BTRFS_FS_NEED_ZONE_FINISH bit to clear and retry.
		 */
		ASSERT(btrfs_is_zoned(fs_info));
		wait_on_bit_io(&inode->root->fs_info->flags, BTRFS_FS_NEED_ZONE_FINISH,
			       TASK_UNINTERRUPTIBLE);
		goto again;
	}
	if (ret)
		return ERR_PTR(ret);

	/* The reserved extent is in @ins: objectid is the start, offset the size. */
	file_extent.disk_bytenr = ins.objectid;
	file_extent.disk_num_bytes = ins.offset;
	file_extent.num_bytes = ins.offset;
	file_extent.ram_bytes = ins.offset;
	file_extent.offset = 0;
	file_extent.compression = BTRFS_COMPRESS_NONE;
	em = btrfs_create_dio_extent(inode, dio_data, start, &file_extent,
				     BTRFS_ORDERED_REGULAR);
	btrfs_dec_block_group_reservations(fs_info, ins.objectid);
	if (IS_ERR(em))
		btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset,
					   1);

	return em;
}

/*
 * Set up a direct IO write of up to *@lenp bytes at file offset @start.
 *
 * @map points to the extent map covering @start. Either the existing extent is
 * reused (NOCOW or PREALLOC) or a new extent is allocated (COW). In both cases
 * an ordered extent is created and metadata space is reserved. On return @map
 * may have been replaced, or set to NULL on some error paths, and *@lenp holds
 * the length that will actually be written.
 *
 * Returns 0 on success or a negative errno.
 */
static int btrfs_get_blocks_direct_write(struct extent_map **map,
					 struct inode *inode,
					 struct btrfs_dio_data *dio_data,
					 u64 start, u64 *lenp,
					 unsigned int iomap_flags)
{
	const bool nowait = (iomap_flags & IOMAP_NOWAIT);
	struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
	struct btrfs_file_extent file_extent;
	struct extent_map *em = *map;
	int type;
	u64 block_start;
	struct btrfs_block_group *bg;
	bool can_nocow = false;
	bool space_reserved = false;
	u64 len = *lenp;
	u64 prev_len;
	int ret = 0;

	/*
	 * We don't allocate a new extent in the following cases
	 *
	 * 1) The inode is marked as NODATACOW. In this case we'll just use the
	 * existing extent.
	 * 2) The extent is marked as PREALLOC. We're good to go here and can
	 * just use the extent.
	 *
	 */
	if ((em->flags & EXTENT_FLAG_PREALLOC) ||
	    ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
	     em->disk_bytenr != EXTENT_MAP_HOLE)) {
		if (em->flags & EXTENT_FLAG_PREALLOC)
			type = BTRFS_ORDERED_PREALLOC;
		else
			type = BTRFS_ORDERED_NOCOW;
		len = min(len, em->len - (start - em->start));
		block_start = extent_map_block_start(em) + (start - em->start);

		/*
		 * NOCOW is only used if the extent check passes and we could
		 * register as a NOCOW writer on the block group.
		 */
		if (can_nocow_extent(inode, start, &len,
				     &file_extent, false, false) == 1) {
			bg = btrfs_inc_nocow_writers(fs_info, block_start);
			if (bg)
				can_nocow = true;
		}
	}

	/* Remember the reserved length, 'len' may shrink in the COW path below. */
	prev_len = len;
	if (can_nocow) {
		struct extent_map *em2;

		/* We can NOCOW, so only need to reserve metadata space. */
		ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len, len,
						      nowait);
		if (ret < 0) {
			/* Our caller expects us to free the input extent map. */
			free_extent_map(em);
			*map = NULL;
			btrfs_dec_nocow_writers(bg);
			if (nowait && (ret == -ENOSPC || ret == -EDQUOT))
				ret = -EAGAIN;
			goto out;
		}
		space_reserved = true;

		em2 = btrfs_create_dio_extent(BTRFS_I(inode), dio_data, start,
					      &file_extent, type);
		btrfs_dec_nocow_writers(bg);
		/*
		 * For PREALLOC a new extent map was created, so it replaces the
		 * input one. For NOCOW em2 is NULL on success and the input
		 * extent map stays in use.
		 */
		if (type == BTRFS_ORDERED_PREALLOC) {
			free_extent_map(em);
			*map = em2;
			em = em2;
		}

		if (IS_ERR(em2)) {
			ret = PTR_ERR(em2);
			goto out;
		}

		dio_data->nocow_done = true;
	} else {
		/* Our caller expects us to free the input extent map. */
		free_extent_map(em);
		*map = NULL;

		if (nowait) {
			ret = -EAGAIN;
			goto out;
		}

		/*
		 * If we could not allocate data space before locking the file
		 * range and we can't do a NOCOW write, then we have to fail.
		 */
		if (!dio_data->data_space_reserved) {
			ret = -ENOSPC;
			goto out;
		}

		/*
		 * We have to COW and we have already reserved data space before,
		 * so now we reserve only metadata.
		 */
		ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len, len,
						      false);
		if (ret < 0)
			goto out;
		space_reserved = true;

		em = btrfs_new_extent_direct(BTRFS_I(inode), dio_data, start, len);
		if (IS_ERR(em)) {
			ret = PTR_ERR(em);
			goto out;
		}
		*map = em;
		len = min(len, em->len - (start - em->start));
		/* The new extent is shorter than requested: give back the excess. */
		if (len < prev_len)
			btrfs_delalloc_release_metadata(BTRFS_I(inode),
							prev_len - len, true);
	}

	/*
	 * We have created our ordered extent, so we can now release our reservation
	 * for an outstanding extent.
	 */
	btrfs_delalloc_release_extents(BTRFS_I(inode), prev_len);

	/*
	 * Need to update the i_size under the extent lock so buffered
	 * readers will get the updated i_size when we unlock.
	 */
	if (start + len > i_size_read(inode))
		i_size_write(inode, start + len);
out:
	/* Undo the metadata reservation if we failed after taking it. */
	if (ret && space_reserved) {
		btrfs_delalloc_release_extents(BTRFS_I(inode), len);
		btrfs_delalloc_release_metadata(BTRFS_I(inode), len, true);
	}
	*lenp = len;
	return ret;
}

/*
 * iomap_begin callback for direct IO.
 *
 * Locks the file range, looks up the extent map at @start and fills @iomap
 * with a mapping of at most @length bytes. For writes, data space is reserved
 * before locking and btrfs_get_blocks_direct_write() sets up the ordered
 * extent.
 *
 * Returns 0 on success, or a negative errno. -ENOTBLK and -EAGAIN are used to
 * signal that the IO can not be done as (NOWAIT) direct IO.
 */
static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
		loff_t length, unsigned int flags, struct iomap *iomap,
		struct iomap *srcmap)
{
	struct iomap_iter *iter = container_of(iomap, struct iomap_iter, iomap);
	struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
	struct extent_map *em;
	struct extent_state *cached_state = NULL;
	struct btrfs_dio_data *dio_data = iter->private;
	u64 lockstart, lockend;
	const bool write = !!(flags & IOMAP_WRITE);
	int ret = 0;
	u64 len = length;
	const u64 data_alloc_len = length;
	u32 unlock_bits = EXTENT_LOCKED;

	/*
	 * We could potentially fault if we have a buffer > PAGE_SIZE, and if
	 * we're NOWAIT we may submit a bio for a partial range and return
	 * EIOCBQUEUED, which would result in an errant short read.
	 *
	 * The best way to handle this would be to allow for partial completions
	 * of iocb's, so we could submit the partial bio, return and fault in
	 * the rest of the pages, and then submit the io for the rest of the
	 * range.  However we don't have that currently, so simply return
	 * -EAGAIN at this point so that the normal path is used.
	 */
	if (!write && (flags & IOMAP_NOWAIT) && length > PAGE_SIZE)
		return -EAGAIN;

	/*
	 * Cap the size of reads to that usually seen in buffered I/O as we need
	 * to allocate a contiguous array for the checksums.
	 */
	if (!write)
		len = min_t(u64, len, fs_info->sectorsize * BTRFS_MAX_BIO_SECTORS);

	lockstart = start;
	lockend = start + len - 1;

	/*
	 * iomap_dio_rw() only does filemap_write_and_wait_range(), which isn't
	 * enough if we've written compressed pages to this area, so we need to
	 * flush the dirty pages again to make absolutely sure that any
	 * outstanding dirty pages are on disk - the first flush only starts
	 * compression on the data, while keeping the pages locked, so by the
	 * time the second flush returns we know bios for the compressed pages
	 * were submitted and finished, and the pages no longer under writeback.
	 *
	 * If we have a NOWAIT request and we have any pages in the range that
	 * are locked, likely due to compression still in progress, we don't want
	 * to block on page locks. We also don't want to block on pages marked as
	 * dirty or under writeback (same as for the non-compression case).
	 * iomap_dio_rw() did the same check, but after that and before we got
	 * here, mmap'ed writes may have happened or buffered reads started
	 * (readpage() and readahead(), which lock pages), as we haven't locked
	 * the file range yet.
	 */
	if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
		     &BTRFS_I(inode)->runtime_flags)) {
		if (flags & IOMAP_NOWAIT) {
			if (filemap_range_needs_writeback(inode->i_mapping,
							  lockstart, lockend))
				return -EAGAIN;
		} else {
			ret = filemap_fdatawrite_range(inode->i_mapping, start,
						       start + length - 1);
			if (ret)
				return ret;
		}
	}

	/* Start every mapping with clean per-call state. */
	memset(dio_data, 0, sizeof(*dio_data));

	/*
	 * We always try to allocate data space and must do it before locking
	 * the file range, to avoid deadlocks with concurrent writes to the same
	 * range if the range has several extents and the writes don't expand the
	 * current i_size (the inode lock is taken in shared mode). If we fail to
	 * allocate data space here we continue and later, after locking the
	 * file range, we fail with ENOSPC only if we figure out we can not do a
	 * NOCOW write.
	 */
	if (write && !(flags & IOMAP_NOWAIT)) {
		ret = btrfs_check_data_free_space(BTRFS_I(inode),
						  &dio_data->data_reserved,
						  start, data_alloc_len, false);
		if (!ret)
			dio_data->data_space_reserved = true;
		else if (ret && !(BTRFS_I(inode)->flags &
				  (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)))
			goto err;
	}

	/*
	 * If this errors out it's because we couldn't invalidate pagecache for
	 * this range and we need to fallback to buffered IO, or we are doing a
	 * NOWAIT read/write and we need to block.
	 */
	ret = lock_extent_direct(inode, lockstart, lockend, &cached_state, flags);
	if (ret < 0)
		goto err;

	em = btrfs_get_extent(BTRFS_I(inode), NULL, start, len);
	if (IS_ERR(em)) {
		ret = PTR_ERR(em);
		goto unlock_err;
	}

	/*
	 * Ok for INLINE and COMPRESSED extents we need to fallback on buffered
	 * io.  INLINE is special, and we could probably kludge it in here, but
	 * it's still buffered so for safety let's just fall back to the generic
	 * buffered path.
	 *
	 * For COMPRESSED we _have_ to read the entire extent in so we can
	 * decompress it, so there will be buffering required no matter what we
	 * do, so go ahead and fallback to buffered.
	 *
	 * We return -ENOTBLK because that's what makes DIO go ahead and go back
	 * to buffered IO.  Don't blame me, this is the price we pay for using
	 * the generic code.
	 */
	if (extent_map_is_compressed(em) || em->disk_bytenr == EXTENT_MAP_INLINE) {
		free_extent_map(em);
		/*
		 * If we are in a NOWAIT context, return -EAGAIN in order to
		 * fallback to buffered IO. This is not only because we can
		 * block with buffered IO (no support for NOWAIT semantics at
		 * the moment) but also to avoid returning short reads to user
		 * space - this happens if we were able to read some data from
		 * previous non-compressed extents and then when we fallback to
		 * buffered IO, at btrfs_file_read_iter() by calling
		 * filemap_read(), we fail to fault in pages for the read buffer,
		 * in which case filemap_read() returns a short read (the number
		 * of bytes previously read is > 0, so it does not return -EFAULT).
		 */
		ret = (flags & IOMAP_NOWAIT) ? -EAGAIN : -ENOTBLK;
		goto unlock_err;
	}

	/* Never map past the end of the extent map we found. */
	len = min(len, em->len - (start - em->start));

	/*
	 * If we have a NOWAIT request and the range contains multiple extents
	 * (or a mix of extents and holes), then we return -EAGAIN to make the
	 * caller fallback to a context where it can do a blocking (without
	 * NOWAIT) request. This way we avoid doing partial IO and returning
	 * success to the caller, which is not optimal for writes and for reads
	 * it can result in unexpected behaviour for an application.
	 *
	 * When doing a read, because we use IOMAP_DIO_PARTIAL when calling
	 * iomap_dio_rw(), we can end up returning less data than what the caller
	 * asked for, resulting in an unexpected, and incorrect, short read.
	 * That is, the caller asked to read N bytes and we return less than that,
	 * which is wrong unless we are crossing EOF. This happens if we get a
	 * page fault error when trying to fault in pages for the buffer that is
	 * associated to the struct iov_iter passed to iomap_dio_rw(), and we
	 * have previously submitted bios for other extents in the range, in
	 * which case iomap_dio_rw() may return us EIOCBQUEUED if not all of
	 * those bios have completed by the time we get the page fault error,
	 * which we return back to our caller - we should only return EIOCBQUEUED
	 * after we have submitted bios for all the extents in the range.
	 */
	if ((flags & IOMAP_NOWAIT) && len < length) {
		free_extent_map(em);
		ret = -EAGAIN;
		goto unlock_err;
	}

	if (write) {
		ret = btrfs_get_blocks_direct_write(&em, inode, dio_data,
						    start, &len, flags);
		if (ret < 0)
			goto unlock_err;
		/* Recalc len in case the new em is smaller than requested */
		len = min(len, em->len - (start - em->start));
		if (dio_data->data_space_reserved) {
			u64 release_offset;
			u64 release_len = 0;

			/*
			 * A NOCOW write needs none of the data space reserved
			 * above. A COW write only needs it for the part of the
			 * range that is actually being mapped.
			 */
			if (dio_data->nocow_done) {
				release_offset = start;
				release_len = data_alloc_len;
			} else if (len < data_alloc_len) {
				release_offset = start + len;
				release_len = data_alloc_len - len;
			}

			if (release_len > 0)
				btrfs_free_reserved_data_space(BTRFS_I(inode),
							       dio_data->data_reserved,
							       release_offset,
							       release_len);
		}
	}

	/*
	 * Translate extent map information to iomap.
	 * We trim the extents (and move the addr) even though iomap code does
	 * that, since we have locked only the parts we are performing I/O in.
	 */
	if ((em->disk_bytenr == EXTENT_MAP_HOLE) ||
	    ((em->flags & EXTENT_FLAG_PREALLOC) && !write)) {
		iomap->addr = IOMAP_NULL_ADDR;
		iomap->type = IOMAP_HOLE;
	} else {
		iomap->addr = extent_map_block_start(em) + (start - em->start);
		iomap->type = IOMAP_MAPPED;
	}
	iomap->offset = start;
	iomap->bdev = fs_info->fs_devices->latest_dev->bdev;
	iomap->length = len;
	free_extent_map(em);

	/*
	 * Reads will hold the EXTENT_DIO_LOCKED bit until the io is completed,
	 * writes only hold it for this part.  We hold the extent lock until
	 * we're completely done with the extent map to make sure it remains
	 * valid.
	 */
	if (write)
		unlock_bits |= EXTENT_DIO_LOCKED;

	clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
			 unlock_bits, &cached_state);

	/* We didn't use everything, unlock the dio extent for the remainder. */
	if (!write && (start + len) < lockend)
		unlock_dio_extent(&BTRFS_I(inode)->io_tree, start + len,
				  lockend, NULL);

	return 0;

unlock_err:
	/*
	 * Don't use EXTENT_LOCK_BITS here in case we extend it later and forget
	 * to update this, be explicit that we expect EXTENT_LOCKED and
	 * EXTENT_DIO_LOCKED to be set here, and so that's what we're clearing.
	 */
	clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
			 EXTENT_LOCKED | EXTENT_DIO_LOCKED, &cached_state);
err:
	/* Give back the data space reserved before locking, if any. */
	if (dio_data->data_space_reserved) {
		btrfs_free_reserved_data_space(BTRFS_I(inode),
					       dio_data->data_reserved,
					       start, data_alloc_len);
		extent_changeset_free(dio_data->data_reserved);
	}

	return ret;
}

/*
 * iomap_end callback for direct IO.
 *
 * Cleans up after a mapping returned by btrfs_dio_iomap_begin(). If fewer
 * bytes were submitted than were mapped, the unsubmitted tail is finished as
 * failed (writes) or unlocked (reads) and -ENOTBLK is returned. For writes the
 * reference on the ordered extent and the data reservation changeset are
 * released.
 */
static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length,
		ssize_t written, unsigned int flags, struct iomap *iomap)
{
	struct iomap_iter *iter = container_of(iomap, struct iomap_iter, iomap);
	struct btrfs_dio_data *dio_data = iter->private;
	size_t submitted = dio_data->submitted;
	const bool write = !!(flags & IOMAP_WRITE);
	int ret = 0;

	if (!write && (iomap->type == IOMAP_HOLE)) {
		/* If reading from a hole, unlock and return */
		unlock_dio_extent(&BTRFS_I(inode)->io_tree, pos,
				  pos + length - 1, NULL);
		return 0;
	}

	if (submitted < length) {
		/* Only the part that no bio was submitted for is handled here. */
		pos += submitted;
		length -= submitted;
		if (write)
			btrfs_finish_ordered_extent(dio_data->ordered, NULL,
						    pos, length, false);
		else
			unlock_dio_extent(&BTRFS_I(inode)->io_tree, pos,
					  pos + length - 1, NULL);
		ret = -ENOTBLK;
	}
	if (write) {
		btrfs_put_ordered_extent(dio_data->ordered);
		dio_data->ordered = NULL;
	}

	if (write)
		extent_changeset_free(dio_data->data_reserved);
	return ret;
}

/*
 * Completion handler for direct IO bios, installed by btrfs_dio_submit_io().
 *
 * Logs a warning on IO error, then finishes the ordered extent range (writes)
 * or unlocks the dio extent range (reads), restores the bio's private pointer
 * and passes the bio on to iomap's own completion handler.
 */
static void btrfs_dio_end_io(struct btrfs_bio *bbio)
{
	struct btrfs_dio_private *dip =
		container_of(bbio, struct btrfs_dio_private, bbio);
	struct btrfs_inode *inode = bbio->inode;
	struct bio *bio = &bbio->bio;

	if (bio->bi_status) {
		btrfs_warn(inode->root->fs_info,
		"direct IO failed ino %llu op 0x%0x offset %#llx len %u err no %d",
			   btrfs_ino(inode), bio->bi_opf,
			   dip->file_offset, dip->bytes, bio->bi_status);
	}

	if (btrfs_op(bio) == BTRFS_MAP_WRITE) {
		btrfs_finish_ordered_extent(bbio->ordered, NULL,
					    dip->file_offset, dip->bytes,
					    !bio->bi_status);
	} else {
		unlock_dio_extent(&inode->io_tree, dip->file_offset,
				  dip->file_offset + dip->bytes - 1, NULL);
	}

	/* Hand the bio back to iomap with the private pointer it originally had. */
	bbio->bio.bi_private = bbio->private;
	iomap_dio_bio_end_io(bio);
}

/*
 * Attach to @bbio the ordered extent that covers exactly the bio's range.
 *
 * If @ordered has the same size as the bio, it is used directly with an extra
 * reference taken. Otherwise @ordered is split and the new ordered extent
 * returned by the split is attached. For non-NOCOW ordered extents the extent
 * map is split as well.
 *
 * Returns 0 on success or a negative errno.
 */
static int btrfs_extract_ordered_extent(struct btrfs_bio *bbio,
					struct btrfs_ordered_extent *ordered)
{
	u64 start = (u64)bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT;
	u64 len = bbio->bio.bi_iter.bi_size;
	struct btrfs_ordered_extent *new;
	int ret;

	/* Must always be called for the beginning of an ordered extent. */
	if (WARN_ON_ONCE(start != ordered->disk_bytenr))
		return -EINVAL;

	/* No need to split if the ordered extent covers the entire bio. */
	if (ordered->disk_num_bytes == len) {
		refcount_inc(&ordered->refs);
		bbio->ordered = ordered;
		return 0;
	}

	/*
	 * Don't split the extent_map for NOCOW extents, as we're writing into
	 * a pre-existing one.
	 */
	if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) {
		ret = split_extent_map(bbio->inode, bbio->file_offset,
				       ordered->num_bytes, len,
				       ordered->disk_bytenr);
		if (ret)
			return ret;
	}

	new = btrfs_split_ordered_extent(ordered, len);
	if (IS_ERR(new))
		return PTR_ERR(new);
	bbio->ordered = new;
	return 0;
}

/*
 * submit_io callback of btrfs_dio_ops.
 *
 * Initializes the btrfs_bio wrapping @bio, records the file range in the
 * enclosing btrfs_dio_private, accounts the bytes in dio_data->submitted and
 * submits the bio. For writes the matching ordered extent is attached first.
 */
static void btrfs_dio_submit_io(const struct iomap_iter *iter, struct bio *bio,
				loff_t file_offset)
{
	struct btrfs_bio *bbio = btrfs_bio(bio);
	struct btrfs_dio_private *dip =
		container_of(bbio, struct btrfs_dio_private, bbio);
	struct btrfs_dio_data *dio_data = iter->private;

	/* The bio's current private pointer is saved so end_io can restore it. */
	btrfs_bio_init(bbio, BTRFS_I(iter->inode)->root->fs_info,
		       btrfs_dio_end_io, bio->bi_private);
	bbio->inode = BTRFS_I(iter->inode);
	bbio->file_offset = file_offset;

	dip->file_offset = file_offset;
	dip->bytes = bio->bi_iter.bi_size;

	dio_data->submitted += bio->bi_iter.bi_size;

	/*
	 * Check if we are doing a partial write.  If we are, we need to split
	 * the ordered extent to match the submitted bio.  Hang on to the
	 * remaining unfinishable ordered_extent in dio_data so that it can be
	 * cancelled in iomap_end to avoid a deadlock wherein faulting the
	 * remaining pages is blocked on the outstanding ordered extent.
	 */
	if (iter->flags & IOMAP_WRITE) {
		int ret;

		ret = btrfs_extract_ordered_extent(bbio, dio_data->ordered);
		if (ret) {
			/*
			 * 'ret' is non-zero here, so '!ret' marks this range of
			 * the ordered extent as failed.
			 */
			btrfs_finish_ordered_extent(dio_data->ordered, NULL,
						    file_offset, dip->bytes,
						    !ret);
			bio->bi_status = errno_to_blk_status(ret);
			iomap_dio_bio_end_io(bio);
			return;
		}
	}

	btrfs_submit_bbio(bbio, 0);
}

/* iomap mapping callbacks used for both direct reads and direct writes. */
static const struct iomap_ops btrfs_dio_iomap_ops = {
	.iomap_begin            = btrfs_dio_iomap_begin,
	.iomap_end              = btrfs_dio_iomap_end,
};

/* Direct IO hooks: our own bio submission and the bio set to allocate from. */
static const struct iomap_dio_ops btrfs_dio_ops = {
	.submit_io		= btrfs_dio_submit_io,
	.bio_set		= &btrfs_dio_bioset,
};

/*
 * Run a direct IO read through iomap with partial completion allowed
 * (IOMAP_DIO_PARTIAL). @done_before is passed through to iomap_dio_rw().
 */
static ssize_t btrfs_dio_read(struct kiocb *iocb, struct iov_iter *iter,
			      size_t done_before)
{
	struct btrfs_dio_data data = { 0 };

	return iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
			    IOMAP_DIO_PARTIAL, &data, done_before);
}

/*
 * Start a direct IO write through iomap with partial completion allowed.
 * Unlike the read side this uses __iomap_dio_rw() and returns its result
 * as is; the caller below completes it with iomap_dio_complete().
 */
static struct iomap_dio *btrfs_dio_write(struct kiocb *iocb, struct iov_iter *iter,
					 size_t done_before)
{
	struct btrfs_dio_data data = { 0 };

	return __iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
			      IOMAP_DIO_PARTIAL, &data, done_before);
}

/*
 * Check that both the file offset and the memory addresses/lengths of @iter
 * are aligned to the filesystem sector size.
 *
 * Returns 0 if aligned, -EINVAL otherwise.
 */
static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info,
			       const struct iov_iter *iter, loff_t offset)
{
	const u32 blocksize_mask = fs_info->sectorsize - 1;

	if (offset & blocksize_mask)
		return -EINVAL;

	if (iov_iter_alignment(iter) & blocksize_mask)
		return -EINVAL;

	return 0;
}

/*
 * Direct IO write entry point.
 *
 * Takes the inode lock (shared when the write stays within i_size and no
 * security bits need to be dropped), attempts the write as direct IO and falls
 * back to a buffered write, followed by a flush and page cache invalidation,
 * for whatever could not be written directly.
 *
 * Returns the number of bytes written, or a negative errno.
 */
ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
{
	struct file *file = iocb->ki_filp;
	struct inode *inode = file_inode(file);
	struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
	loff_t pos;
	ssize_t written = 0;
	ssize_t written_buffered;
	size_t prev_left = 0;
	loff_t endbyte;
	ssize_t ret;
	unsigned int ilock_flags = 0;
	struct iomap_dio *dio;

	if (iocb->ki_flags & IOCB_NOWAIT)
		ilock_flags |= BTRFS_ILOCK_TRY;

	/*
	 * If the write DIO is within EOF, use a shared lock and also only if
	 * security bits will likely not be dropped by file_remove_privs() called
	 * from btrfs_write_check(). Either will need to be rechecked after the
	 * lock was acquired.
	 */
	if (iocb->ki_pos + iov_iter_count(from) <= i_size_read(inode) && IS_NOSEC(inode))
		ilock_flags |= BTRFS_ILOCK_SHARED;

relock:
	ret = btrfs_inode_lock(BTRFS_I(inode), ilock_flags);
	if (ret < 0)
		return ret;

	/* Shared lock cannot be used with security bits set. */
	if ((ilock_flags & BTRFS_ILOCK_SHARED) && !IS_NOSEC(inode)) {
		btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
		ilock_flags &= ~BTRFS_ILOCK_SHARED;
		goto relock;
	}

	ret = generic_write_checks(iocb, from);
	if (ret <= 0) {
		btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
		return ret;
	}

	ret = btrfs_write_check(iocb, ret);
	if (ret < 0) {
		btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
		goto out;
	}

	pos = iocb->ki_pos;
	/*
	 * Re-check since file size may have changed just before taking the
	 * lock or pos may have changed because of O_APPEND in generic_write_check()
	 */
	if ((ilock_flags & BTRFS_ILOCK_SHARED) &&
	    pos + iov_iter_count(from) > i_size_read(inode)) {
		btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
		ilock_flags &= ~BTRFS_ILOCK_SHARED;
		goto relock;
	}

	/* Unaligned requests can not be done as direct IO, do them buffered. */
	if (check_direct_IO(fs_info, from, pos)) {
		btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
		goto buffered;
	}

	/*
	 * The iov_iter can be mapped to the same file range we are writing to.
	 * If that's the case, then we will deadlock in the iomap code, because
	 * it first calls our callback btrfs_dio_iomap_begin(), which will create
	 * an ordered extent, and after that it will fault in the pages that the
	 * iov_iter refers to. During the fault in we end up in the readahead
	 * pages code (starting at btrfs_readahead()), which will lock the range,
	 * find that ordered extent and then wait for it to complete (at
	 * btrfs_lock_and_flush_ordered_range()), resulting in a deadlock since
	 * obviously the ordered extent can never complete as we didn't submit
	 * yet the respective bio(s). This always happens when the buffer is
	 * memory mapped to the same file range, since the iomap DIO code always
	 * invalidates pages in the target file range (after starting and waiting
	 * for any writeback).
	 *
	 * So here we disable page faults in the iov_iter and then retry if we
	 * got -EFAULT, faulting in the pages before the retry.
	 */
again:
	from->nofault = true;
	dio = btrfs_dio_write(iocb, from, written);
	from->nofault = false;

	if (IS_ERR_OR_NULL(dio)) {
		ret = PTR_ERR_OR_ZERO(dio);
	} else {
		/*
		 * If we have a synchronous write, we must make sure the fsync
		 * triggered by the iomap_dio_complete() call below doesn't
		 * deadlock on the inode lock - we are already holding it and we
		 * can't call it after unlocking because we may need to complete
		 * partial writes due to the input buffer (or parts of it) not
		 * being already faulted in.
		 */
		ASSERT(current->journal_info == NULL);
		current->journal_info = BTRFS_TRANS_DIO_WRITE_STUB;
		ret = iomap_dio_complete(dio);
		current->journal_info = NULL;
	}

	/* No increment (+=) because iomap returns a cumulative value. */
	if (ret > 0)
		written = ret;

	if (iov_iter_count(from) > 0 && (ret == -EFAULT || ret > 0)) {
		const size_t left = iov_iter_count(from);
		/*
		 * We have more data left to write. Try to fault in as many as
		 * possible of the remainder pages and retry. We do this without
		 * releasing and locking again the inode, to prevent races with
		 * truncate.
		 *
		 * Also, in case the iov refers to pages in the file range of the
		 * file we want to write to (due to a mmap), we could enter an
		 * infinite loop if we retry after faulting the pages in, since
		 * iomap will invalidate any pages in the range early on, before
		 * it tries to fault in the pages of the iov. So we keep track of
		 * how much was left of iov in the previous EFAULT and fallback
		 * to buffered IO in case we haven't made any progress.
		 */
		if (left == prev_left) {
			ret = -ENOTBLK;
		} else {
			fault_in_iov_iter_readable(from, left);
			prev_left = left;
			goto again;
		}
	}

	btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);

	/*
	 * If 'ret' is -ENOTBLK or we have not written all data, then it means
	 * we must fallback to buffered IO.
	 */
	if ((ret < 0 && ret != -ENOTBLK) || !iov_iter_count(from))
		goto out;

buffered:
	/*
	 * If we are in a NOWAIT context, then return -EAGAIN to signal the caller
	 * it must retry the operation in a context where blocking is acceptable,
	 * because even if we end up not blocking during the buffered IO attempt
	 * below, we will block when flushing and waiting for the IO.
	 */
	if (iocb->ki_flags & IOCB_NOWAIT) {
		ret = -EAGAIN;
		goto out;
	}

	pos = iocb->ki_pos;
	written_buffered = btrfs_buffered_write(iocb, from);
	if (written_buffered < 0) {
		ret = written_buffered;
		goto out;
	}
	/*
	 * Ensure all data is persisted. We want the next direct IO read to be
	 * able to read what was just written.
	 */
	endbyte = pos + written_buffered - 1;
	ret = btrfs_fdatawrite_range(BTRFS_I(inode), pos, endbyte);
	if (ret)
		goto out;
	ret = filemap_fdatawait_range(inode->i_mapping, pos, endbyte);
	if (ret)
		goto out;
	written += written_buffered;
	iocb->ki_pos = pos + written_buffered;
	/* Drop the pages just written from the page cache. */
	invalidate_mapping_pages(file->f_mapping, pos >> PAGE_SHIFT,
				 endbyte >> PAGE_SHIFT);
out:
	return ret < 0 ? ret : written;
}

/*
 * Check whether a read can be done as direct IO.
 *
 * In addition to the alignment checks of check_direct_IO(), an iovec backed
 * iterator is rejected if any two of its segments start at the same user
 * address.
 *
 * Returns 0 if direct IO can be used, -EINVAL otherwise.
 */
static int check_direct_read(struct btrfs_fs_info *fs_info,
			     const struct iov_iter *iter, loff_t offset)
{
	int ret;
	int i, seg;

	ret = check_direct_IO(fs_info, iter, offset);
	if (ret < 0)
		return ret;

	if (!iter_is_iovec(iter))
		return 0;

	/* Compare every pair of segments for an identical base address. */
	for (seg = 0; seg < iter->nr_segs; seg++) {
		for (i = seg + 1; i < iter->nr_segs; i++) {
			const struct iovec *iov1 = iter_iov(iter) + seg;
			const struct iovec *iov2 = iter_iov(iter) + i;

			if (iov1->iov_base == iov2->iov_base)
				return -EINVAL;
		}
	}
	return 0;
}

/*
 * Direct IO read entry point.
 *
 * Returns 0 without reading anything if fsverity is active on the inode or the
 * request fails check_direct_read() (NOTE(review): the caller presumably does
 * a buffered read for what is left in @to - confirm against the caller).
 * Otherwise returns the number of bytes read, which may be short, or a
 * negative errno.
 */
ssize_t btrfs_direct_read(struct kiocb *iocb, struct iov_iter *to)
{
	struct inode *inode = file_inode(iocb->ki_filp);
	size_t prev_left = 0;
	ssize_t read = 0;
	ssize_t ret;

	if (fsverity_active(inode))
		return 0;

	if (check_direct_read(inode_to_fs_info(inode), to, iocb->ki_pos))
		return 0;

	btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_SHARED);
again:
	/*
	 * This is similar to what we do for direct IO writes, see the comment
	 * at btrfs_direct_write(), but we also disable page faults in addition
	 * to disabling them only at the iov_iter level. This is because when
	 * reading from a hole or prealloc extent, iomap calls iov_iter_zero(),
	 * which can still trigger page fault ins despite having set ->nofault
	 * to true of our 'to' iov_iter.
	 *
	 * The difference to direct IO writes is that we deadlock when trying
	 * to lock the extent range in the inode's tree during the page reads
	 * triggered by the fault in (while for writes it is due to waiting for
	 * our own ordered extent). This is because for direct IO reads,
	 * btrfs_dio_iomap_begin() returns with the extent range locked, which
	 * is only unlocked in the endio callback (end_bio_extent_readpage()).
	 */
	pagefault_disable();
	to->nofault = true;
	ret = btrfs_dio_read(iocb, to, read);
	to->nofault = false;
	pagefault_enable();

	/* No increment (+=) because iomap returns a cumulative value. */
	if (ret > 0)
		read = ret;

	if (iov_iter_count(to) > 0 && (ret == -EFAULT || ret > 0)) {
		const size_t left = iov_iter_count(to);

		if (left == prev_left) {
			/*
			 * We didn't make any progress since the last attempt,
			 * fallback to a buffered read for the remainder of the
			 * range. This is just to avoid any possibility of looping
			 * for too long.
			 */
			ret = read;
		} else {
			/*
			 * We made some progress since the last retry or this is
			 * the first time we are retrying. Fault in as many pages
			 * as possible and retry.
			 */
			fault_in_iov_iter_writeable(to, left);
			prev_left = left;
			goto again;
		}
	}
	btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_SHARED);
	return ret < 0 ? ret : read;
}

/*
 * Initialize the bio set used for direct IO bios. The front padding makes
 * every bio allocated from it part of a struct btrfs_dio_private.
 *
 * Returns 0 on success, -ENOMEM if the bio set could not be initialized.
 */
int __init btrfs_init_dio(void)
{
	if (bioset_init(&btrfs_dio_bioset, BIO_POOL_SIZE,
			offsetof(struct btrfs_dio_private, bbio.bio),
			BIOSET_NEED_BVECS))
		return -ENOMEM;

	return 0;
}

/* Release the direct IO bio set set up by btrfs_init_dio(). */
void __cold btrfs_destroy_dio(void)
{
	bioset_exit(&btrfs_dio_bioset);
}