// SPDX-License-Identifier: GPL-2.0

#include <linux/fsverity.h>
#include <linux/iomap.h>
#include "ctree.h"
#include "delalloc-space.h"
#include "direct-io.h"
#include "extent-tree.h"
#include "file.h"
#include "fs.h"
#include "transaction.h"
#include "volumes.h"

struct btrfs_dio_data {
	ssize_t submitted;
	struct extent_changeset *data_reserved;
	struct btrfs_ordered_extent *ordered;
	bool data_space_reserved;
	bool nocow_done;
};

struct btrfs_dio_private {
	/* Range of I/O */
	u64 file_offset;
	u32 bytes;

	/* This must be last */
	struct btrfs_bio bbio;
};

static struct bio_set btrfs_dio_bioset;

static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
			      struct extent_state **cached_state,
			      unsigned int iomap_flags)
{
	const bool writing = (iomap_flags & IOMAP_WRITE);
	const bool nowait = (iomap_flags & IOMAP_NOWAIT);
	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
	struct btrfs_ordered_extent *ordered;
	int ret = 0;

	while (1) {
		if (nowait) {
			if (!try_lock_extent(io_tree, lockstart, lockend,
					     cached_state))
				return -EAGAIN;
		} else {
			lock_extent(io_tree, lockstart, lockend, cached_state);
		}
		/*
		 * We're concerned with the entire range that we're going to be
		 * doing DIO to, so we need to make sure there are no ordered
		 * extents in this range.
		 */
		ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), lockstart,
						     lockend - lockstart + 1);

		/*
		 * We need to make sure there are no buffered pages in this
		 * range either, we could have raced between the invalidate in
		 * generic_file_direct_write and locking the extent. The
		 * invalidate needs to happen so that reads after a write do not
		 * get stale data.
		 */
		if (!ordered &&
		    (!writing || !filemap_range_has_page(inode->i_mapping,
							 lockstart, lockend)))
			break;

		unlock_extent(io_tree, lockstart, lockend, cached_state);

		if (ordered) {
			if (nowait) {
				btrfs_put_ordered_extent(ordered);
				ret = -EAGAIN;
				break;
			}
			/*
			 * If we are doing a DIO read and the ordered extent we
			 * found is for a buffered write, we can not wait for it
			 * to complete and retry, because if we do so we can
			 * deadlock with concurrent buffered writes on page
			 * locks. This happens only if our DIO read covers more
			 * than one extent map, if at this point it has already
			 * created an ordered extent for a previous extent map
			 * and locked its range in the inode's io tree, and a
			 * concurrent write against that previous extent map's
			 * range and this range started (we unlock the ranges
			 * in the io tree only when the bios complete and
			 * buffered writes always lock pages before attempting
			 * to lock range in the io tree).
			 */
			if (writing ||
			    test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags))
				btrfs_start_ordered_extent(ordered);
			else
				ret = nowait ? -EAGAIN : -ENOTBLK;
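			/* Drop the ref taken by btrfs_lookup_ordered_range() above. */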
			btrfs_put_ordered_extent(ordered);
		} else {
			/*
			 * We could trigger writeback for this range (and wait
			 * for it to complete) and then invalidate the pages for
			 * this range (through invalidate_inode_pages2_range()),
			 * but that can lead us to a deadlock with a concurrent
			 * call to readahead (a buffered read or a defrag call
			 * triggered a readahead) on a page lock due to an
			 * ordered dio extent we created before but did not have
			 * yet a corresponding bio submitted (whence it can not
			 * complete), which makes readahead wait for that
			 * ordered extent to complete while holding a lock on
			 * that page.
			 */
			ret = nowait ? -EAGAIN : -ENOTBLK;
		}

		if (ret)
			break;

		cond_resched();
	}

	return ret;
}

static struct extent_map *btrfs_create_dio_extent(struct btrfs_inode *inode,
						  struct btrfs_dio_data *dio_data,
						  const u64 start,
						  const struct btrfs_file_extent *file_extent,
						  const int type)
{
	struct extent_map *em = NULL;
	struct btrfs_ordered_extent *ordered;

	if (type != BTRFS_ORDERED_NOCOW) {
		em = btrfs_create_io_em(inode, start, file_extent, type);
		if (IS_ERR(em))
			goto out;
	}

	ordered = btrfs_alloc_ordered_extent(inode, start, file_extent,
					     (1 << type) |
					     (1 << BTRFS_ORDERED_DIRECT));
	if (IS_ERR(ordered)) {
		if (em) {
			free_extent_map(em);
			btrfs_drop_extent_map_range(inode, start,
					start + file_extent->num_bytes - 1, false);
		}
		em = ERR_CAST(ordered);
	} else {
		ASSERT(!dio_data->ordered);
		dio_data->ordered = ordered;
	}
out:

	return em;
}

static struct extent_map *btrfs_new_extent_direct(struct btrfs_inode *inode,
						  struct btrfs_dio_data *dio_data,
						  u64 start, u64 len)
{
	struct btrfs_root *root = inode->root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_file_extent file_extent;
	struct extent_map *em;
	struct btrfs_key ins;
	u64 alloc_hint;
	int ret;

	alloc_hint = btrfs_get_extent_allocation_hint(inode, start, len);
again:
	ret = btrfs_reserve_extent(root, len, len, fs_info->sectorsize,
				   0, alloc_hint, &ins, 1, 1);
	if (ret == -EAGAIN) {
		ASSERT(btrfs_is_zoned(fs_info));
		wait_on_bit_io(&inode->root->fs_info->flags, BTRFS_FS_NEED_ZONE_FINISH,
			       TASK_UNINTERRUPTIBLE);
		goto again;
	}
	if (ret)
		return ERR_PTR(ret);

	file_extent.disk_bytenr = ins.objectid;
	file_extent.disk_num_bytes = ins.offset;
	file_extent.num_bytes = ins.offset;
	file_extent.ram_bytes = ins.offset;
	file_extent.offset = 0;
	file_extent.compression = BTRFS_COMPRESS_NONE;
	em = btrfs_create_dio_extent(inode, dio_data, start, &file_extent,
				     BTRFS_ORDERED_REGULAR);
	btrfs_dec_block_group_reservations(fs_info, ins.objectid);
	if (IS_ERR(em))
		btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset,
					   1);

	return em;
}

static int btrfs_get_blocks_direct_write(struct extent_map **map,
					 struct inode *inode,
					 struct btrfs_dio_data *dio_data,
					 u64 start, u64 *lenp,
					 unsigned int iomap_flags)
{
	const bool nowait = (iomap_flags & IOMAP_NOWAIT);
	struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
	struct btrfs_file_extent file_extent;
	struct extent_map *em = *map;
	int type;
	u64 block_start;
	struct btrfs_block_group *bg;
	bool can_nocow = false;
	bool space_reserved = false;
	u64 len = *lenp;
	u64 prev_len;
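	/*
	 * prev_len remembers the length at reservation time so any excess
	 * metadata reservation can be released if the final extent is shorter.
	 */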
	int ret = 0;

	/*
	 * We don't allocate a new extent in the following cases
	 *
	 * 1) The inode is marked as NODATACOW. In this case we'll just use the
	 *    existing extent.
	 * 2) The extent is marked as PREALLOC. We're good to go here and can
	 *    just use the extent.
	 *
	 */
	if ((em->flags & EXTENT_FLAG_PREALLOC) ||
	    ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
	     em->disk_bytenr != EXTENT_MAP_HOLE)) {
		if (em->flags & EXTENT_FLAG_PREALLOC)
			type = BTRFS_ORDERED_PREALLOC;
		else
			type = BTRFS_ORDERED_NOCOW;
		len = min(len, em->len - (start - em->start));
		block_start = extent_map_block_start(em) + (start - em->start);

		if (can_nocow_extent(inode, start, &len,
				     &file_extent, false, false) == 1) {
			bg = btrfs_inc_nocow_writers(fs_info, block_start);
			if (bg)
				can_nocow = true;
		}
	}

	prev_len = len;
	if (can_nocow) {
		struct extent_map *em2;

		/* We can NOCOW, so only need to reserve metadata space. */
		ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len, len,
						      nowait);
		if (ret < 0) {
			/* Our caller expects us to free the input extent map. */
			free_extent_map(em);
			*map = NULL;
			btrfs_dec_nocow_writers(bg);
			if (nowait && (ret == -ENOSPC || ret == -EDQUOT))
				ret = -EAGAIN;
			goto out;
		}
		space_reserved = true;

		em2 = btrfs_create_dio_extent(BTRFS_I(inode), dio_data, start,
					      &file_extent, type);
		btrfs_dec_nocow_writers(bg);
		if (type == BTRFS_ORDERED_PREALLOC) {
			free_extent_map(em);
			*map = em2;
			em = em2;
		}

		if (IS_ERR(em2)) {
			ret = PTR_ERR(em2);
			goto out;
		}

		dio_data->nocow_done = true;
	} else {
		/* Our caller expects us to free the input extent map. */
		free_extent_map(em);
		*map = NULL;

		if (nowait) {
			ret = -EAGAIN;
			goto out;
		}

		/*
		 * If we could not allocate data space before locking the file
		 * range and we can't do a NOCOW write, then we have to fail.
		 */
		if (!dio_data->data_space_reserved) {
			ret = -ENOSPC;
			goto out;
		}

		/*
		 * We have to COW and we have already reserved data space before,
		 * so now we reserve only metadata.
		 */
		ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len, len,
						      false);
		if (ret < 0)
			goto out;
		space_reserved = true;

		em = btrfs_new_extent_direct(BTRFS_I(inode), dio_data, start, len);
		if (IS_ERR(em)) {
			ret = PTR_ERR(em);
			goto out;
		}
		*map = em;
		len = min(len, em->len - (start - em->start));
		if (len < prev_len)
			btrfs_delalloc_release_metadata(BTRFS_I(inode),
							prev_len - len, true);
	}

	/*
	 * We have created our ordered extent, so we can now release our reservation
	 * for an outstanding extent.
	 */
	btrfs_delalloc_release_extents(BTRFS_I(inode), prev_len);

	/*
	 * Need to update the i_size under the extent lock so buffered
	 * readers will get the updated i_size when we unlock.
	 */
	if (start + len > i_size_read(inode))
		i_size_write(inode, start + len);
out:
	if (ret && space_reserved) {
		btrfs_delalloc_release_extents(BTRFS_I(inode), len);
		btrfs_delalloc_release_metadata(BTRFS_I(inode), len, true);
	}
	*lenp = len;
	return ret;
}

static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
		loff_t length, unsigned int flags, struct iomap *iomap,
		struct iomap *srcmap)
{
	struct iomap_iter *iter = container_of(iomap, struct iomap_iter, iomap);
	struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
	struct extent_map *em;
	struct extent_state *cached_state = NULL;
	struct btrfs_dio_data *dio_data = iter->private;
	u64 lockstart, lockend;
	const bool write = !!(flags & IOMAP_WRITE);
	int ret = 0;
	u64 len = length;
	const u64 data_alloc_len = length;
	bool unlock_extents = false;

	/*
	 * We could potentially fault if we have a buffer > PAGE_SIZE, and if
	 * we're NOWAIT we may submit a bio for a partial range and return
	 * EIOCBQUEUED, which would result in an errant short read.
	 *
	 * The best way to handle this would be to allow for partial completions
	 * of iocb's, so we could submit the partial bio, return and fault in
	 * the rest of the pages, and then submit the io for the rest of the
	 * range. However we don't have that currently, so simply return
	 * -EAGAIN at this point so that the normal path is used.
	 */
	if (!write && (flags & IOMAP_NOWAIT) && length > PAGE_SIZE)
		return -EAGAIN;

	/*
	 * Cap the size of reads to that usually seen in buffered I/O as we need
	 * to allocate a contiguous array for the checksums.
	 */
	if (!write)
		len = min_t(u64, len, fs_info->sectorsize * BTRFS_MAX_BIO_SECTORS);

	lockstart = start;
	lockend = start + len - 1;

	/*
	 * iomap_dio_rw() only does filemap_write_and_wait_range(), which isn't
	 * enough if we've written compressed pages to this area, so we need to
	 * flush the dirty pages again to make absolutely sure that any
	 * outstanding dirty pages are on disk - the first flush only starts
	 * compression on the data, while keeping the pages locked, so by the
	 * time the second flush returns we know bios for the compressed pages
	 * were submitted and finished, and the pages are no longer under
	 * writeback.
	 *
	 * If we have a NOWAIT request and we have any pages in the range that
	 * are locked, likely due to compression still in progress, we don't want
	 * to block on page locks. We also don't want to block on pages marked as
	 * dirty or under writeback (same as for the non-compression case).
	 * iomap_dio_rw() did the same check, but after that and before we got
	 * here, mmap'ed writes may have happened or buffered reads started
	 * (readpage() and readahead(), which lock pages), as we haven't locked
	 * the file range yet.
	 */
	if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
		     &BTRFS_I(inode)->runtime_flags)) {
		if (flags & IOMAP_NOWAIT) {
			if (filemap_range_needs_writeback(inode->i_mapping,
							  lockstart, lockend))
				return -EAGAIN;
		} else {
			ret = filemap_fdatawrite_range(inode->i_mapping, start,
						       start + length - 1);
			if (ret)
				return ret;
		}
	}

	memset(dio_data, 0, sizeof(*dio_data));

	/*
	 * We always try to allocate data space and must do it before locking
	 * the file range, to avoid deadlocks with concurrent writes to the same
	 * range if the range has several extents and the writes don't expand the
	 * current i_size (the inode lock is taken in shared mode). If we fail to
	 * allocate data space here we continue and later, after locking the
	 * file range, we fail with ENOSPC only if we figure out we can not do a
	 * NOCOW write.
	 */
	if (write && !(flags & IOMAP_NOWAIT)) {
		ret = btrfs_check_data_free_space(BTRFS_I(inode),
						  &dio_data->data_reserved,
						  start, data_alloc_len, false);
		if (!ret)
			dio_data->data_space_reserved = true;
		else if (ret && !(BTRFS_I(inode)->flags &
				  (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)))
			goto err;
	}

	/*
	 * If this errors out it's because we couldn't invalidate pagecache for
	 * this range and we need to fallback to buffered IO, or we are doing a
	 * NOWAIT read/write and we need to block.
	 */
	ret = lock_extent_direct(inode, lockstart, lockend, &cached_state, flags);
	if (ret < 0)
		goto err;

	em = btrfs_get_extent(BTRFS_I(inode), NULL, start, len);
	if (IS_ERR(em)) {
		ret = PTR_ERR(em);
		goto unlock_err;
	}

	/*
	 * Ok for INLINE and COMPRESSED extents we need to fallback on buffered
	 * io. INLINE is special, and we could probably kludge it in here, but
	 * it's still buffered so for safety let's just fall back to the generic
	 * buffered path.
	 *
	 * For COMPRESSED we _have_ to read the entire extent in so we can
	 * decompress it, so there will be buffering required no matter what we
	 * do, so go ahead and fallback to buffered.
	 *
	 * We return -ENOTBLK because that's what makes DIO go ahead and go back
	 * to buffered IO. Don't blame me, this is the price we pay for using
	 * the generic code.
	 */
	if (extent_map_is_compressed(em) || em->disk_bytenr == EXTENT_MAP_INLINE) {
		free_extent_map(em);
		/*
		 * If we are in a NOWAIT context, return -EAGAIN in order to
		 * fallback to buffered IO. This is not only because we can
		 * block with buffered IO (no support for NOWAIT semantics at
		 * the moment) but also to avoid returning short reads to user
		 * space - this happens if we were able to read some data from
		 * previous non-compressed extents and then when we fallback to
		 * buffered IO, at btrfs_file_read_iter() by calling
		 * filemap_read(), we fail to fault in pages for the read buffer,
		 * in which case filemap_read() returns a short read (the number
		 * of bytes previously read is > 0, so it does not return -EFAULT).
		 */
		ret = (flags & IOMAP_NOWAIT) ? -EAGAIN : -ENOTBLK;
		goto unlock_err;
	}

	len = min(len, em->len - (start - em->start));

	/*
	 * If we have a NOWAIT request and the range contains multiple extents
	 * (or a mix of extents and holes), then we return -EAGAIN to make the
	 * caller fallback to a context where it can do a blocking (without
	 * NOWAIT) request. This way we avoid doing partial IO and returning
	 * success to the caller, which is not optimal for writes and for reads
	 * it can result in unexpected behaviour for an application.
	 *
	 * When doing a read, because we use IOMAP_DIO_PARTIAL when calling
	 * iomap_dio_rw(), we can end up returning less data than what the caller
	 * asked for, resulting in an unexpected, and incorrect, short read.
	 * That is, the caller asked to read N bytes and we return less than that,
	 * which is wrong unless we are crossing EOF. This happens if we get a
	 * page fault error when trying to fault in pages for the buffer that is
	 * associated to the struct iov_iter passed to iomap_dio_rw(), and we
	 * have previously submitted bios for other extents in the range, in
	 * which case iomap_dio_rw() may return us EIOCBQUEUED if not all of
	 * those bios have completed by the time we get the page fault error,
	 * which we return back to our caller - we should only return EIOCBQUEUED
	 * after we have submitted bios for all the extents in the range.
	 */
	if ((flags & IOMAP_NOWAIT) && len < length) {
		free_extent_map(em);
		ret = -EAGAIN;
		goto unlock_err;
	}

	if (write) {
		ret = btrfs_get_blocks_direct_write(&em, inode, dio_data,
						    start, &len, flags);
		if (ret < 0)
			goto unlock_err;
		unlock_extents = true;
		/* Recalc len in case the new em is smaller than requested */
		len = min(len, em->len - (start - em->start));
		if (dio_data->data_space_reserved) {
			u64 release_offset;
			u64 release_len = 0;

			if (dio_data->nocow_done) {
				release_offset = start;
				release_len = data_alloc_len;
			} else if (len < data_alloc_len) {
				release_offset = start + len;
				release_len = data_alloc_len - len;
			}

			if (release_len > 0)
				btrfs_free_reserved_data_space(BTRFS_I(inode),
							       dio_data->data_reserved,
							       release_offset,
							       release_len);
		}
	} else {
		/*
		 * We need to unlock only the end area that we aren't using.
		 * The rest is going to be unlocked by the endio routine.
		 */
		lockstart = start + len;
		if (lockstart < lockend)
			unlock_extents = true;
	}

	if (unlock_extents)
		unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
			      &cached_state);
	else
		free_extent_state(cached_state);

	/*
	 * Translate extent map information to iomap.
	 * We trim the extents (and move the addr) even though iomap code does
	 * that, since we have locked only the parts we are performing I/O in.
	 */
	if ((em->disk_bytenr == EXTENT_MAP_HOLE) ||
	    ((em->flags & EXTENT_FLAG_PREALLOC) && !write)) {
		iomap->addr = IOMAP_NULL_ADDR;
		iomap->type = IOMAP_HOLE;
	} else {
		iomap->addr = extent_map_block_start(em) + (start - em->start);
		iomap->type = IOMAP_MAPPED;
	}
	iomap->offset = start;
	iomap->bdev = fs_info->fs_devices->latest_dev->bdev;
	iomap->length = len;
	free_extent_map(em);

	return 0;

unlock_err:
	unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
		      &cached_state);
err:
	if (dio_data->data_space_reserved) {
		btrfs_free_reserved_data_space(BTRFS_I(inode),
					       dio_data->data_reserved,
					       start, data_alloc_len);
		extent_changeset_free(dio_data->data_reserved);
	}

	return ret;
}

static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length,
			       ssize_t written, unsigned int flags,
			       struct iomap *iomap)
{
	struct iomap_iter *iter = container_of(iomap, struct iomap_iter, iomap);
	struct btrfs_dio_data *dio_data = iter->private;
	size_t submitted = dio_data->submitted;
	const bool write = !!(flags & IOMAP_WRITE);
	int ret = 0;

	if (!write && (iomap->type == IOMAP_HOLE)) {
		/* If reading from a hole, unlock and return */
		unlock_extent(&BTRFS_I(inode)->io_tree, pos, pos + length - 1,
			      NULL);
		return 0;
	}

	if (submitted < length) {
		pos += submitted;
		length -= submitted;
		if (write)
			btrfs_finish_ordered_extent(dio_data->ordered, NULL,
						    pos, length, false);
		else
			unlock_extent(&BTRFS_I(inode)->io_tree, pos,
				      pos + length - 1, NULL);
		ret = -ENOTBLK;
	}
	if (write) {
		btrfs_put_ordered_extent(dio_data->ordered);
		dio_data->ordered = NULL;
	}

	if (write)
		extent_changeset_free(dio_data->data_reserved);
	return ret;
}

static void btrfs_dio_end_io(struct btrfs_bio *bbio)
{
	struct btrfs_dio_private *dip =
		container_of(bbio, struct btrfs_dio_private, bbio);
	struct btrfs_inode *inode = bbio->inode;
	struct bio *bio = &bbio->bio;

	if (bio->bi_status) {
		btrfs_warn(inode->root->fs_info,
		"direct IO failed ino %llu op 0x%0x offset %#llx len %u err no %d",
			   btrfs_ino(inode), bio->bi_opf,
			   dip->file_offset, dip->bytes, bio->bi_status);
	}

	if (btrfs_op(bio) == BTRFS_MAP_WRITE) {
		btrfs_finish_ordered_extent(bbio->ordered, NULL,
					    dip->file_offset, dip->bytes,
					    !bio->bi_status);
	} else {
		unlock_extent(&inode->io_tree, dip->file_offset,
			      dip->file_offset + dip->bytes - 1, NULL);
	}

	bbio->bio.bi_private = bbio->private;
	iomap_dio_bio_end_io(bio);
}

static int btrfs_extract_ordered_extent(struct btrfs_bio *bbio,
					struct btrfs_ordered_extent *ordered)
{
	u64 start = (u64)bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT;
	u64 len = bbio->bio.bi_iter.bi_size;
	struct btrfs_ordered_extent *new;
	int ret;

	/* Must always be called for the beginning of an ordered extent. */
	if (WARN_ON_ONCE(start != ordered->disk_bytenr))
		return -EINVAL;

	/* No need to split if the ordered extent covers the entire bio. */
	if (ordered->disk_num_bytes == len) {
		refcount_inc(&ordered->refs);
		bbio->ordered = ordered;
		return 0;
	}

	/*
	 * Don't split the extent_map for NOCOW extents, as we're writing into
	 * a pre-existing one.
	 */
	if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) {
		ret = split_extent_map(bbio->inode, bbio->file_offset,
				       ordered->num_bytes, len,
				       ordered->disk_bytenr);
		if (ret)
			return ret;
	}

	new = btrfs_split_ordered_extent(ordered, len);
	if (IS_ERR(new))
		return PTR_ERR(new);
	bbio->ordered = new;
	return 0;
}

static void btrfs_dio_submit_io(const struct iomap_iter *iter, struct bio *bio,
				loff_t file_offset)
{
	struct btrfs_bio *bbio = btrfs_bio(bio);
	struct btrfs_dio_private *dip =
		container_of(bbio, struct btrfs_dio_private, bbio);
	struct btrfs_dio_data *dio_data = iter->private;

	btrfs_bio_init(bbio, BTRFS_I(iter->inode)->root->fs_info,
		       btrfs_dio_end_io, bio->bi_private);
	bbio->inode = BTRFS_I(iter->inode);
	bbio->file_offset = file_offset;

	dip->file_offset = file_offset;
	dip->bytes = bio->bi_iter.bi_size;

	dio_data->submitted += bio->bi_iter.bi_size;

	/*
	 * Check if we are doing a partial write. If we are, we need to split
	 * the ordered extent to match the submitted bio. Hang on to the
	 * remaining unfinishable ordered_extent in dio_data so that it can be
	 * cancelled in iomap_end to avoid a deadlock wherein faulting the
	 * remaining pages is blocked on the outstanding ordered extent.
	 */
	if (iter->flags & IOMAP_WRITE) {
		int ret;

		ret = btrfs_extract_ordered_extent(bbio, dio_data->ordered);
		if (ret) {
			btrfs_finish_ordered_extent(dio_data->ordered, NULL,
						    file_offset, dip->bytes,
						    !ret);
			bio->bi_status = errno_to_blk_status(ret);
			iomap_dio_bio_end_io(bio);
			return;
		}
	}

	btrfs_submit_bio(bbio, 0);
}

static const struct iomap_ops btrfs_dio_iomap_ops = {
	.iomap_begin = btrfs_dio_iomap_begin,
	.iomap_end = btrfs_dio_iomap_end,
};

static const struct iomap_dio_ops btrfs_dio_ops = {
	.submit_io = btrfs_dio_submit_io,
	.bio_set = &btrfs_dio_bioset,
};

static ssize_t btrfs_dio_read(struct kiocb *iocb, struct iov_iter *iter,
			      size_t done_before)
{
	struct btrfs_dio_data data = { 0 };

	return iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
			    IOMAP_DIO_PARTIAL, &data, done_before);
}

static struct iomap_dio *btrfs_dio_write(struct kiocb *iocb, struct iov_iter *iter,
					 size_t done_before)
{
	struct btrfs_dio_data data = { 0 };

	return __iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
			      IOMAP_DIO_PARTIAL, &data, done_before);
}

static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info,
			       const struct iov_iter *iter, loff_t offset)
{
	const u32 blocksize_mask = fs_info->sectorsize - 1;

	if (offset & blocksize_mask)
		return -EINVAL;

	if (iov_iter_alignment(iter) & blocksize_mask)
		return -EINVAL;

	return 0;
}

ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
{
	struct file *file = iocb->ki_filp;
	struct inode *inode = file_inode(file);
	struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
	loff_t pos;
	ssize_t written = 0;
	ssize_t written_buffered;
	size_t prev_left = 0;
	loff_t endbyte;
	ssize_t ret;
	unsigned int ilock_flags = 0;
	struct iomap_dio *dio;

	if (iocb->ki_flags & IOCB_NOWAIT)
		ilock_flags |= BTRFS_ILOCK_TRY;

	/*
	 * If the write DIO is within EOF, use a shared lock and also only if
	 * security bits will likely not be dropped by file_remove_privs() called
	 * from btrfs_write_check(). Either will need to be rechecked after the
	 * lock is acquired.
	 */
	if (iocb->ki_pos + iov_iter_count(from) <= i_size_read(inode) && IS_NOSEC(inode))
		ilock_flags |= BTRFS_ILOCK_SHARED;

relock:
	ret = btrfs_inode_lock(BTRFS_I(inode), ilock_flags);
	if (ret < 0)
		return ret;

	/* Shared lock cannot be used with security bits set. */
	if ((ilock_flags & BTRFS_ILOCK_SHARED) && !IS_NOSEC(inode)) {
		btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
		ilock_flags &= ~BTRFS_ILOCK_SHARED;
		goto relock;
	}

	ret = generic_write_checks(iocb, from);
	if (ret <= 0) {
		btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
		return ret;
	}

	ret = btrfs_write_check(iocb, from, ret);
	if (ret < 0) {
		btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
		goto out;
	}

	pos = iocb->ki_pos;
	/*
	 * Re-check since file size may have changed just before taking the
	 * lock or pos may have changed because of O_APPEND in
	 * generic_write_checks().
	 */
	if ((ilock_flags & BTRFS_ILOCK_SHARED) &&
	    pos + iov_iter_count(from) > i_size_read(inode)) {
		btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
		ilock_flags &= ~BTRFS_ILOCK_SHARED;
		goto relock;
	}

	if (check_direct_IO(fs_info, from, pos)) {
		btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
		goto buffered;
	}

	/*
	 * The iov_iter can be mapped to the same file range we are writing to.
	 * If that's the case, then we will deadlock in the iomap code, because
	 * it first calls our callback btrfs_dio_iomap_begin(), which will create
	 * an ordered extent, and after that it will fault in the pages that the
	 * iov_iter refers to. During the fault in we end up in the readahead
	 * pages code (starting at btrfs_readahead()), which will lock the range,
	 * find that ordered extent and then wait for it to complete (at
	 * btrfs_lock_and_flush_ordered_range()), resulting in a deadlock since
	 * obviously the ordered extent can never complete as we didn't submit
	 * yet the respective bio(s). This always happens when the buffer is
	 * memory mapped to the same file range, since the iomap DIO code always
	 * invalidates pages in the target file range (after starting and waiting
	 * for any writeback).
	 *
	 * So here we disable page faults in the iov_iter and then retry if we
	 * got -EFAULT, faulting in the pages before the retry.
	 */
again:
	from->nofault = true;
	dio = btrfs_dio_write(iocb, from, written);
	from->nofault = false;

	if (IS_ERR_OR_NULL(dio)) {
		ret = PTR_ERR_OR_ZERO(dio);
	} else {
		struct btrfs_file_private stack_private = { 0 };
		struct btrfs_file_private *private;
		const bool have_private = (file->private_data != NULL);

		if (!have_private)
			file->private_data = &stack_private;

		/*
		 * If we have a synchronous write, we must make sure the fsync
		 * triggered by the iomap_dio_complete() call below doesn't
		 * deadlock on the inode lock - we are already holding it and we
		 * can't call it after unlocking because we may need to complete
		 * partial writes due to the input buffer (or parts of it) not
		 * being already faulted in.
		 */
		private = file->private_data;
		private->fsync_skip_inode_lock = true;
		ret = iomap_dio_complete(dio);
		private->fsync_skip_inode_lock = false;

		if (!have_private)
			file->private_data = NULL;
	}

	/* No increment (+=) because iomap returns a cumulative value. */
	if (ret > 0)
		written = ret;

	if (iov_iter_count(from) > 0 && (ret == -EFAULT || ret > 0)) {
		const size_t left = iov_iter_count(from);
		/*
		 * We have more data left to write. Try to fault in as many as
		 * possible of the remainder pages and retry. We do this without
		 * releasing and locking again the inode, to prevent races with
		 * truncate.
		 *
		 * Also, in case the iov refers to pages in the file range of the
		 * file we want to write to (due to a mmap), we could enter an
		 * infinite loop if we retry after faulting the pages in, since
		 * iomap will invalidate any pages in the range early on, before
		 * it tries to fault in the pages of the iov. So we keep track of
		 * how much was left of iov in the previous EFAULT and fallback
		 * to buffered IO in case we haven't made any progress.
		 */
		if (left == prev_left) {
			ret = -ENOTBLK;
		} else {
			fault_in_iov_iter_readable(from, left);
			prev_left = left;
			goto again;
		}
	}

	btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);

	/*
	 * If 'ret' is -ENOTBLK or we have not written all data, then it means
	 * we must fallback to buffered IO.
	 */
	if ((ret < 0 && ret != -ENOTBLK) || !iov_iter_count(from))
		goto out;

buffered:
	/*
	 * If we are in a NOWAIT context, then return -EAGAIN to signal the caller
	 * it must retry the operation in a context where blocking is acceptable,
	 * because even if we end up not blocking during the buffered IO attempt
	 * below, we will block when flushing and waiting for the IO.
	 */
	if (iocb->ki_flags & IOCB_NOWAIT) {
		ret = -EAGAIN;
		goto out;
	}

	pos = iocb->ki_pos;
	written_buffered = btrfs_buffered_write(iocb, from);
	if (written_buffered < 0) {
		ret = written_buffered;
		goto out;
	}
	/*
	 * Ensure all data is persisted. We want the next direct IO read to be
	 * able to read what was just written.
	 */
	endbyte = pos + written_buffered - 1;
	ret = btrfs_fdatawrite_range(BTRFS_I(inode), pos, endbyte);
	if (ret)
		goto out;
	ret = filemap_fdatawait_range(inode->i_mapping, pos, endbyte);
	if (ret)
		goto out;
	written += written_buffered;
	iocb->ki_pos = pos + written_buffered;
	invalidate_mapping_pages(file->f_mapping, pos >> PAGE_SHIFT,
				 endbyte >> PAGE_SHIFT);
out:
	return ret < 0 ? ret : written;
}

static int check_direct_read(struct btrfs_fs_info *fs_info,
			     const struct iov_iter *iter, loff_t offset)
{
	int ret;
	int i, seg;

	ret = check_direct_IO(fs_info, iter, offset);
	if (ret < 0)
		return ret;

	if (!iter_is_iovec(iter))
		return 0;

	for (seg = 0; seg < iter->nr_segs; seg++) {
		for (i = seg + 1; i < iter->nr_segs; i++) {
			const struct iovec *iov1 = iter_iov(iter) + seg;
			const struct iovec *iov2 = iter_iov(iter) + i;

			if (iov1->iov_base == iov2->iov_base)
				return -EINVAL;
		}
	}
	return 0;
}

ssize_t btrfs_direct_read(struct kiocb *iocb, struct iov_iter *to)
{
	struct inode *inode = file_inode(iocb->ki_filp);
	size_t prev_left = 0;
	ssize_t read = 0;
	ssize_t ret;

	if (fsverity_active(inode))
		return 0;

	if (check_direct_read(inode_to_fs_info(inode), to, iocb->ki_pos))
		return 0;

	btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_SHARED);
again:
	/*
	 * This is similar to what we do for direct IO writes, see the comment
	 * at btrfs_direct_write(), but we also disable page faults in addition
	 * to disabling them only at the iov_iter level. This is because when
	 * reading from a hole or prealloc extent, iomap calls iov_iter_zero(),
	 * which can still trigger page fault ins despite having set ->nofault
	 * to true of our 'to' iov_iter.
	 *
	 * The difference to direct IO writes is that we deadlock when trying
	 * to lock the extent range in the inode's tree during the page reads
	 * triggered by the fault in (while for writes it is due to waiting for
	 * our own ordered extent). This is because for direct IO reads,
	 * btrfs_dio_iomap_begin() returns with the extent range locked, which
	 * is only unlocked in the endio callback (end_bio_extent_readpage()).
	 */
	pagefault_disable();
	to->nofault = true;
	ret = btrfs_dio_read(iocb, to, read);
	to->nofault = false;
	pagefault_enable();

	/* No increment (+=) because iomap returns a cumulative value. */
	if (ret > 0)
		read = ret;

	if (iov_iter_count(to) > 0 && (ret == -EFAULT || ret > 0)) {
		const size_t left = iov_iter_count(to);

		if (left == prev_left) {
			/*
			 * We didn't make any progress since the last attempt,
			 * fallback to a buffered read for the remainder of the
			 * range. This is just to avoid any possibility of looping
			 * for too long.
			 */
			ret = read;
		} else {
			/*
			 * We made some progress since the last retry or this is
			 * the first time we are retrying. Fault in as many pages
			 * as possible and retry.
			 */
			fault_in_iov_iter_writeable(to, left);
			prev_left = left;
			goto again;
		}
	}
	btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_SHARED);
	return ret < 0 ? ret : read;
}

int __init btrfs_init_dio(void)
{
	if (bioset_init(&btrfs_dio_bioset, BIO_POOL_SIZE,
			offsetof(struct btrfs_dio_private, bbio.bio),
			BIOSET_NEED_BVECS))
		return -ENOMEM;

	return 0;
}

void __cold btrfs_destroy_dio(void)
{
	bioset_exit(&btrfs_dio_bioset);
}