// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007 Oracle. All rights reserved.
 */

#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/writeback.h>
#include <linux/sched/mm.h>
#include "ctree.h"
#include "transaction.h"
#include "btrfs_inode.h"
#include "extent_io.h"
#include "disk-io.h"
#include "compression.h"

static struct kmem_cache *btrfs_ordered_extent_cache;

static u64 entry_end(struct btrfs_ordered_extent *entry)
{
	if (entry->file_offset + entry->len < entry->file_offset)
		return (u64)-1;
	return entry->file_offset + entry->len;
}

/* returns NULL if the insertion worked, or it returns the node it did find
 * in the tree
 */
static struct rb_node *tree_insert(struct rb_root *root, u64 file_offset,
				   struct rb_node *node)
{
	struct rb_node **p = &root->rb_node;
	struct rb_node *parent = NULL;
	struct btrfs_ordered_extent *entry;

	while (*p) {
		parent = *p;
		entry = rb_entry(parent, struct btrfs_ordered_extent, rb_node);

		if (file_offset < entry->file_offset)
			p = &(*p)->rb_left;
		else if (file_offset >= entry_end(entry))
			p = &(*p)->rb_right;
		else
			return parent;
	}

	rb_link_node(node, parent, p);
	rb_insert_color(node, root);
	return NULL;
}

static void ordered_data_tree_panic(struct inode *inode, int errno,
				    u64 offset)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);

	btrfs_panic(fs_info, errno,
		    "Inconsistency in ordered tree at offset %llu", offset);
}

/*
 * look for a given offset in the tree, and if it can't be found return the
 * first lesser offset
 */
static struct rb_node *__tree_search(struct rb_root *root, u64 file_offset,
				     struct rb_node **prev_ret)
{
	struct rb_node *n = root->rb_node;
	struct rb_node *prev = NULL;
	struct rb_node *test;
	struct btrfs_ordered_extent *entry;
	struct btrfs_ordered_extent *prev_entry = NULL;

	while (n) {
		entry = rb_entry(n, struct btrfs_ordered_extent, rb_node);
		prev = n;
		prev_entry = entry;

		if (file_offset < entry->file_offset)
			n = n->rb_left;
		else if (file_offset >= entry_end(entry))
			n = n->rb_right;
		else
			return n;
	}
	if (!prev_ret)
		return NULL;

	while (prev && file_offset >= entry_end(prev_entry)) {
		test = rb_next(prev);
		if (!test)
			break;
		prev_entry = rb_entry(test, struct btrfs_ordered_extent,
				      rb_node);
		if (file_offset < entry_end(prev_entry))
			break;

		prev = test;
	}
	if (prev)
		prev_entry = rb_entry(prev, struct btrfs_ordered_extent,
				      rb_node);
	while (prev && file_offset < entry_end(prev_entry)) {
		test = rb_prev(prev);
		if (!test)
			break;
		prev_entry = rb_entry(test, struct btrfs_ordered_extent,
				      rb_node);
		prev = test;
	}
	*prev_ret = prev;
	return NULL;
}

/*
 * helper to check if a given offset is inside a given entry
 */
static int offset_in_entry(struct btrfs_ordered_extent *entry, u64 file_offset)
{
	if (file_offset < entry->file_offset ||
	    entry->file_offset + entry->len <= file_offset)
		return 0;
	return 1;
}

static int range_overlaps(struct btrfs_ordered_extent *entry, u64 file_offset,
			  u64 len)
{
	if (file_offset + len <= entry->file_offset ||
	    entry->file_offset + entry->len <= file_offset)
		return 0;
	return 1;
}
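/*
 * Note on the two helpers above: an ordered extent is treated as the
 * half-open byte range [file_offset, file_offset + len). For example
 * (illustrative numbers only), an entry with file_offset == 4096 and
 * len == 4096 covers bytes 4096..8191, so offset_in_entry() returns 0 for
 * offset 8192 and range_overlaps() returns 0 for a range starting at 8192.
 */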
/*
 * Find the first ordered struct that covers this offset, otherwise
 * the first one less than this offset.
 */
static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
					  u64 file_offset)
{
	struct rb_root *root = &tree->tree;
	struct rb_node *prev = NULL;
	struct rb_node *ret;
	struct btrfs_ordered_extent *entry;

	if (tree->last) {
		entry = rb_entry(tree->last, struct btrfs_ordered_extent,
				 rb_node);
		if (offset_in_entry(entry, file_offset))
			return tree->last;
	}
	ret = __tree_search(root, file_offset, &prev);
	if (!ret)
		ret = prev;
	if (ret)
		tree->last = ret;
	return ret;
}

/* allocate and add a new ordered_extent into the per-inode tree.
 * file_offset is the logical offset in the file
 *
 * start is the disk block number of an extent already reserved in the
 * extent allocation tree
 *
 * len is the length of the extent
 *
 * The tree is given a single reference on the ordered extent that was
 * inserted.
 */
static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
				      u64 start, u64 len, u64 disk_len,
				      int type, int dio, int compress_type)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_ordered_inode_tree *tree;
	struct rb_node *node;
	struct btrfs_ordered_extent *entry;

	tree = &BTRFS_I(inode)->ordered_tree;
	entry = kmem_cache_zalloc(btrfs_ordered_extent_cache, GFP_NOFS);
	if (!entry)
		return -ENOMEM;

	entry->file_offset = file_offset;
	entry->start = start;
	entry->len = len;
	entry->disk_len = disk_len;
	entry->bytes_left = len;
	entry->inode = igrab(inode);
	entry->compress_type = compress_type;
	entry->truncated_len = (u64)-1;
	if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE)
		set_bit(type, &entry->flags);

	if (dio) {
		percpu_counter_add_batch(&fs_info->dio_bytes, len,
					 fs_info->delalloc_batch);
		set_bit(BTRFS_ORDERED_DIRECT, &entry->flags);
	}

	/* one ref for the tree */
	refcount_set(&entry->refs, 1);
	init_waitqueue_head(&entry->wait);
	INIT_LIST_HEAD(&entry->list);
	INIT_LIST_HEAD(&entry->root_extent_list);
	INIT_LIST_HEAD(&entry->work_list);
	init_completion(&entry->completion);
	INIT_LIST_HEAD(&entry->log_list);
	INIT_LIST_HEAD(&entry->trans_list);

	trace_btrfs_ordered_extent_add(inode, entry);

	spin_lock_irq(&tree->lock);
	node = tree_insert(&tree->tree, file_offset,
			   &entry->rb_node);
	if (node)
		ordered_data_tree_panic(inode, -EEXIST, file_offset);
	spin_unlock_irq(&tree->lock);

	spin_lock(&root->ordered_extent_lock);
	list_add_tail(&entry->root_extent_list,
		      &root->ordered_extents);
	root->nr_ordered_extents++;
	if (root->nr_ordered_extents == 1) {
		spin_lock(&fs_info->ordered_root_lock);
		BUG_ON(!list_empty(&root->ordered_root));
		list_add_tail(&root->ordered_root, &fs_info->ordered_roots);
		spin_unlock(&fs_info->ordered_root_lock);
	}
	spin_unlock(&root->ordered_extent_lock);

	/*
	 * We don't need the count_max_extents here, we can assume that all of
	 * that work has been done at higher layers, so this is truly the
	 * smallest the extent is going to get.
	 */
	spin_lock(&BTRFS_I(inode)->lock);
	btrfs_mod_outstanding_extents(BTRFS_I(inode), 1);
	spin_unlock(&BTRFS_I(inode)->lock);

	return 0;
}
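/*
 * Lifetime sketch for the entry created above: the tree holds the single
 * reference set with refcount_set(&entry->refs, 1). Lookup helpers such as
 * btrfs_lookup_ordered_extent() take additional references with
 * refcount_inc(), and every reference is dropped through
 * btrfs_put_ordered_extent(), which frees the entry (and queues the delayed
 * iput for the inode grabbed with igrab()) once the count reaches zero.
 */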
int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
			     u64 start, u64 len, u64 disk_len, int type)
{
	return __btrfs_add_ordered_extent(inode, file_offset, start, len,
					  disk_len, type, 0,
					  BTRFS_COMPRESS_NONE);
}

int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset,
				 u64 start, u64 len, u64 disk_len, int type)
{
	return __btrfs_add_ordered_extent(inode, file_offset, start, len,
					  disk_len, type, 1,
					  BTRFS_COMPRESS_NONE);
}

int btrfs_add_ordered_extent_compress(struct inode *inode, u64 file_offset,
				      u64 start, u64 len, u64 disk_len,
				      int type, int compress_type)
{
	return __btrfs_add_ordered_extent(inode, file_offset, start, len,
					  disk_len, type, 0,
					  compress_type);
}

/*
 * Add a struct btrfs_ordered_sum into the list of checksums to be inserted
 * when an ordered extent is finished. If the list covers more than one
 * ordered extent, it is split across multiples.
 */
void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry,
			   struct btrfs_ordered_sum *sum)
{
	struct btrfs_ordered_inode_tree *tree;

	tree = &BTRFS_I(entry->inode)->ordered_tree;
	spin_lock_irq(&tree->lock);
	list_add_tail(&sum->list, &entry->list);
	spin_unlock_irq(&tree->lock);
}
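/*
 * Illustrative use of the wrappers above (the values are a sketch, not a
 * real call site): after reserving a 1MiB extent on disk for file range
 * [0, 1MiB), a caller in the write-out path would record it with something
 * like
 *
 *	ret = btrfs_add_ordered_extent(inode, 0, ins.objectid, SZ_1M, SZ_1M,
 *				       type);
 *
 * where 'type' is one of the BTRFS_ORDERED_* flag bits (for example
 * BTRFS_ORDERED_PREALLOC or BTRFS_ORDERED_NOCOW) describing how the extent
 * was allocated. The extent is later completed piece by piece via the
 * btrfs_dec_test_*() helpers below.
 */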
/*
 * this is used to account for finished IO across a given range
 * of the file. The IO may span ordered extents. If
 * a given ordered_extent is completely done, 1 is returned, otherwise
 * 0.
 *
 * test_and_set_bit on a flag in the struct btrfs_ordered_extent is used
 * to make sure this function only returns 1 once for a given ordered extent.
 *
 * file_offset is updated to one byte past the range that is recorded as
 * complete. This allows you to walk forward in the file.
 */
int btrfs_dec_test_first_ordered_pending(struct inode *inode,
				   struct btrfs_ordered_extent **cached,
				   u64 *file_offset, u64 io_size, int uptodate)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	struct btrfs_ordered_inode_tree *tree;
	struct rb_node *node;
	struct btrfs_ordered_extent *entry = NULL;
	int ret;
	unsigned long flags;
	u64 dec_end;
	u64 dec_start;
	u64 to_dec;

	tree = &BTRFS_I(inode)->ordered_tree;
	spin_lock_irqsave(&tree->lock, flags);
	node = tree_search(tree, *file_offset);
	if (!node) {
		ret = 1;
		goto out;
	}

	entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
	if (!offset_in_entry(entry, *file_offset)) {
		ret = 1;
		goto out;
	}

	dec_start = max(*file_offset, entry->file_offset);
	dec_end = min(*file_offset + io_size, entry->file_offset +
		      entry->len);
	*file_offset = dec_end;
	if (dec_start > dec_end) {
		btrfs_crit(fs_info, "bad ordering dec_start %llu end %llu",
			   dec_start, dec_end);
	}
	to_dec = dec_end - dec_start;
	if (to_dec > entry->bytes_left) {
		btrfs_crit(fs_info,
			   "bad ordered accounting left %llu size %llu",
			   entry->bytes_left, to_dec);
	}
	entry->bytes_left -= to_dec;
	if (!uptodate)
		set_bit(BTRFS_ORDERED_IOERR, &entry->flags);

	if (entry->bytes_left == 0) {
		ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
		/* test_and_set_bit implies a barrier */
		cond_wake_up_nomb(&entry->wait);
	} else {
		ret = 1;
	}
out:
	if (!ret && cached && entry) {
		*cached = entry;
		refcount_inc(&entry->refs);
	}
	spin_unlock_irqrestore(&tree->lock, flags);
	return ret == 0;
}
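/*
 * Worked example for the helper above (illustrative numbers): with an
 * ordered extent covering [0, 64K) and bytes_left == 64K, finishing IO for
 * [0, 16K) subtracts 16K from bytes_left, advances *file_offset to 16K and
 * returns 0 because the extent is not fully written yet. Only the call that
 * drives bytes_left to 0 wins the test_and_set_bit() race and returns 1.
 */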
/*
 * this is used to account for finished IO across a given range
 * of the file. The IO should not span ordered extents. If
 * a given ordered_extent is completely done, 1 is returned, otherwise
 * 0.
 *
 * test_and_set_bit on a flag in the struct btrfs_ordered_extent is used
 * to make sure this function only returns 1 once for a given ordered extent.
 */
int btrfs_dec_test_ordered_pending(struct inode *inode,
				   struct btrfs_ordered_extent **cached,
				   u64 file_offset, u64 io_size, int uptodate)
{
	struct btrfs_ordered_inode_tree *tree;
	struct rb_node *node;
	struct btrfs_ordered_extent *entry = NULL;
	unsigned long flags;
	int ret;

	tree = &BTRFS_I(inode)->ordered_tree;
	spin_lock_irqsave(&tree->lock, flags);
	if (cached && *cached) {
		entry = *cached;
		goto have_entry;
	}

	node = tree_search(tree, file_offset);
	if (!node) {
		ret = 1;
		goto out;
	}

	entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
have_entry:
	if (!offset_in_entry(entry, file_offset)) {
		ret = 1;
		goto out;
	}

	if (io_size > entry->bytes_left) {
		btrfs_crit(BTRFS_I(inode)->root->fs_info,
			   "bad ordered accounting left %llu size %llu",
			   entry->bytes_left, io_size);
	}
	entry->bytes_left -= io_size;
	if (!uptodate)
		set_bit(BTRFS_ORDERED_IOERR, &entry->flags);

	if (entry->bytes_left == 0) {
		ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
		/* test_and_set_bit implies a barrier */
		cond_wake_up_nomb(&entry->wait);
	} else {
		ret = 1;
	}
out:
	if (!ret && cached && entry) {
		*cached = entry;
		refcount_inc(&entry->refs);
	}
	spin_unlock_irqrestore(&tree->lock, flags);
	return ret == 0;
}

/*
 * used to drop a reference on an ordered extent. This will free
 * the extent if the last reference is dropped
 */
void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
{
	struct list_head *cur;
	struct btrfs_ordered_sum *sum;

	trace_btrfs_ordered_extent_put(entry->inode, entry);

	if (refcount_dec_and_test(&entry->refs)) {
		ASSERT(list_empty(&entry->log_list));
		ASSERT(list_empty(&entry->trans_list));
		ASSERT(list_empty(&entry->root_extent_list));
		ASSERT(RB_EMPTY_NODE(&entry->rb_node));
		if (entry->inode)
			btrfs_add_delayed_iput(entry->inode);
		while (!list_empty(&entry->list)) {
			cur = entry->list.next;
			sum = list_entry(cur, struct btrfs_ordered_sum, list);
			list_del(&sum->list);
			kvfree(sum);
		}
		kmem_cache_free(btrfs_ordered_extent_cache, entry);
	}
}
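/*
 * Note on the put path above: the inode reference taken with igrab() in
 * __btrfs_add_ordered_extent() is not dropped with a plain iput() here.
 * btrfs_put_ordered_extent() can run from IO completion context, where a
 * final iput() (which may evict the inode and do more IO) is not safe, so
 * the reference is handed to btrfs_add_delayed_iput() instead.
 */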
/*
 * remove an ordered extent from the tree. No references are dropped,
 * but any waiters are woken.
 */
void btrfs_remove_ordered_extent(struct inode *inode,
				 struct btrfs_ordered_extent *entry)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	struct btrfs_ordered_inode_tree *tree;
	struct btrfs_inode *btrfs_inode = BTRFS_I(inode);
	struct btrfs_root *root = btrfs_inode->root;
	struct rb_node *node;

	/* This is paired with btrfs_add_ordered_extent. */
	spin_lock(&btrfs_inode->lock);
	btrfs_mod_outstanding_extents(btrfs_inode, -1);
	spin_unlock(&btrfs_inode->lock);
	if (root != fs_info->tree_root)
		btrfs_delalloc_release_metadata(btrfs_inode, entry->len, false);

	if (test_bit(BTRFS_ORDERED_DIRECT, &entry->flags))
		percpu_counter_add_batch(&fs_info->dio_bytes, -entry->len,
					 fs_info->delalloc_batch);

	tree = &btrfs_inode->ordered_tree;
	spin_lock_irq(&tree->lock);
	node = &entry->rb_node;
	rb_erase(node, &tree->tree);
	RB_CLEAR_NODE(node);
	if (tree->last == node)
		tree->last = NULL;
	set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
	spin_unlock_irq(&tree->lock);

	spin_lock(&root->ordered_extent_lock);
	list_del_init(&entry->root_extent_list);
	root->nr_ordered_extents--;

	trace_btrfs_ordered_extent_remove(inode, entry);

	if (!root->nr_ordered_extents) {
		spin_lock(&fs_info->ordered_root_lock);
		BUG_ON(list_empty(&root->ordered_root));
		list_del_init(&root->ordered_root);
		spin_unlock(&fs_info->ordered_root_lock);
	}
	spin_unlock(&root->ordered_extent_lock);
	wake_up(&entry->wait);
}

static void btrfs_run_ordered_extent_work(struct btrfs_work *work)
{
	struct btrfs_ordered_extent *ordered;

	ordered = container_of(work, struct btrfs_ordered_extent, flush_work);
	btrfs_start_ordered_extent(ordered->inode, ordered, 1);
	complete(&ordered->completion);
}

/*
 * wait for all the ordered extents in a root. This is done when balancing
 * space between drives.
 */
u64 btrfs_wait_ordered_extents(struct btrfs_root *root, u64 nr,
			       const u64 range_start, const u64 range_len)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	LIST_HEAD(splice);
	LIST_HEAD(skipped);
	LIST_HEAD(works);
	struct btrfs_ordered_extent *ordered, *next;
	u64 count = 0;
	const u64 range_end = range_start + range_len;

	mutex_lock(&root->ordered_extent_mutex);
	spin_lock(&root->ordered_extent_lock);
	list_splice_init(&root->ordered_extents, &splice);
	while (!list_empty(&splice) && nr) {
		ordered = list_first_entry(&splice, struct btrfs_ordered_extent,
					   root_extent_list);

		if (range_end <= ordered->start ||
		    ordered->start + ordered->disk_len <= range_start) {
			list_move_tail(&ordered->root_extent_list, &skipped);
			cond_resched_lock(&root->ordered_extent_lock);
			continue;
		}

		list_move_tail(&ordered->root_extent_list,
			       &root->ordered_extents);
		refcount_inc(&ordered->refs);
		spin_unlock(&root->ordered_extent_lock);

		btrfs_init_work(&ordered->flush_work,
				btrfs_flush_delalloc_helper,
				btrfs_run_ordered_extent_work, NULL, NULL);
		list_add_tail(&ordered->work_list, &works);
		btrfs_queue_work(fs_info->flush_workers, &ordered->flush_work);

		cond_resched();
		spin_lock(&root->ordered_extent_lock);
		if (nr != U64_MAX)
			nr--;
		count++;
	}
	list_splice_tail(&skipped, &root->ordered_extents);
	list_splice_tail(&splice, &root->ordered_extents);
	spin_unlock(&root->ordered_extent_lock);

	list_for_each_entry_safe(ordered, next, &works, work_list) {
		list_del_init(&ordered->work_list);
		wait_for_completion(&ordered->completion);
		btrfs_put_ordered_extent(ordered);
		cond_resched();
	}
	mutex_unlock(&root->ordered_extent_mutex);

	return count;
}
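/*
 * Illustrative call (not taken from this file): waiting for every ordered
 * extent in a root, with no limit on how many and no disk range restriction,
 * would look like
 *
 *	btrfs_wait_ordered_extents(root, U64_MAX, 0, (u64)-1);
 *
 * nr == U64_MAX means "no limit"; range_start/range_len restrict the wait to
 * ordered extents whose reserved disk range [start, start + disk_len)
 * overlaps [range_start, range_start + range_len).
 */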
u64 btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr,
			     const u64 range_start, const u64 range_len)
{
	struct btrfs_root *root;
	struct list_head splice;
	u64 total_done = 0;
	u64 done;

	INIT_LIST_HEAD(&splice);

	mutex_lock(&fs_info->ordered_operations_mutex);
	spin_lock(&fs_info->ordered_root_lock);
	list_splice_init(&fs_info->ordered_roots, &splice);
	while (!list_empty(&splice) && nr) {
		root = list_first_entry(&splice, struct btrfs_root,
					ordered_root);
		root = btrfs_grab_fs_root(root);
		BUG_ON(!root);
		list_move_tail(&root->ordered_root,
			       &fs_info->ordered_roots);
		spin_unlock(&fs_info->ordered_root_lock);

		done = btrfs_wait_ordered_extents(root, nr,
						  range_start, range_len);
		btrfs_put_fs_root(root);
		total_done += done;

		spin_lock(&fs_info->ordered_root_lock);
		if (nr != U64_MAX) {
			nr -= done;
		}
	}
	list_splice_tail(&splice, &fs_info->ordered_roots);
	spin_unlock(&fs_info->ordered_root_lock);
	mutex_unlock(&fs_info->ordered_operations_mutex);

	return total_done;
}

/*
 * Used to start IO or wait for a given ordered extent to finish.
 *
 * If wait is one, this effectively waits on page writeback for all the pages
 * in the extent and on the io completion code to insert metadata into the
 * btree corresponding to the extent.
 */
void btrfs_start_ordered_extent(struct inode *inode,
				struct btrfs_ordered_extent *entry,
				int wait)
{
	u64 start = entry->file_offset;
	u64 end = start + entry->len - 1;

	trace_btrfs_ordered_extent_start(inode, entry);

	/*
	 * pages in the range can be dirty, clean or writeback. We
	 * start IO on any dirty ones so the wait doesn't stall waiting
	 * for the flusher thread to find them
	 */
	if (!test_bit(BTRFS_ORDERED_DIRECT, &entry->flags))
		filemap_fdatawrite_range(inode->i_mapping, start, end);
	if (wait) {
		wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE,
						 &entry->flags));
	}
}
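/*
 * Note on the BTRFS_ORDERED_DIRECT check above: direct IO does not dirty the
 * page cache for the range, so filemap_fdatawrite_range() has nothing useful
 * to do for such extents and is skipped; with wait set we then only wait for
 * BTRFS_ORDERED_COMPLETE.
 */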
/*
 * Used to wait on ordered extents across a large range of bytes.
 */
int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
{
	int ret = 0;
	int ret_wb = 0;
	u64 end;
	u64 orig_end;
	struct btrfs_ordered_extent *ordered;

	if (start + len < start) {
		orig_end = INT_LIMIT(loff_t);
	} else {
		orig_end = start + len - 1;
		if (orig_end > INT_LIMIT(loff_t))
			orig_end = INT_LIMIT(loff_t);
	}

	/* start IO across the range first to instantiate any delalloc
	 * extents
	 */
	ret = btrfs_fdatawrite_range(inode, start, orig_end);
	if (ret)
		return ret;

	/*
	 * If we have a writeback error don't return immediately. Wait first
	 * for any ordered extents that haven't completed yet. This is to make
	 * sure no one can dirty the same page ranges and call writepages()
	 * before the ordered extents complete - to avoid failures (-EEXIST)
	 * when adding the new ordered extents to the ordered tree.
	 */
	ret_wb = filemap_fdatawait_range(inode->i_mapping, start, orig_end);

	end = orig_end;
	while (1) {
		ordered = btrfs_lookup_first_ordered_extent(inode, end);
		if (!ordered)
			break;
		if (ordered->file_offset > orig_end) {
			btrfs_put_ordered_extent(ordered);
			break;
		}
		if (ordered->file_offset + ordered->len <= start) {
			btrfs_put_ordered_extent(ordered);
			break;
		}
		btrfs_start_ordered_extent(inode, ordered, 1);
		end = ordered->file_offset;
		if (test_bit(BTRFS_ORDERED_IOERR, &ordered->flags))
			ret = -EIO;
		btrfs_put_ordered_extent(ordered);
		if (ret || end == 0 || end == start)
			break;
		end--;
	}
	return ret_wb ? ret_wb : ret;
}

/*
 * find an ordered extent corresponding to file_offset. return NULL if
 * nothing is found, otherwise take a reference on the extent and return it
 */
struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode,
							 u64 file_offset)
{
	struct btrfs_ordered_inode_tree *tree;
	struct rb_node *node;
	struct btrfs_ordered_extent *entry = NULL;

	tree = &BTRFS_I(inode)->ordered_tree;
	spin_lock_irq(&tree->lock);
	node = tree_search(tree, file_offset);
	if (!node)
		goto out;

	entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
	if (!offset_in_entry(entry, file_offset))
		entry = NULL;
	if (entry)
		refcount_inc(&entry->refs);
out:
	spin_unlock_irq(&tree->lock);
	return entry;
}

/* Since the DIO code tries to lock a wide area we need to look for any ordered
 * extents that exist in the range, rather than just the start of the range.
 */
struct btrfs_ordered_extent *btrfs_lookup_ordered_range(
		struct btrfs_inode *inode, u64 file_offset, u64 len)
{
	struct btrfs_ordered_inode_tree *tree;
	struct rb_node *node;
	struct btrfs_ordered_extent *entry = NULL;

	tree = &inode->ordered_tree;
	spin_lock_irq(&tree->lock);
	node = tree_search(tree, file_offset);
	if (!node) {
		node = tree_search(tree, file_offset + len);
		if (!node)
			goto out;
	}

	while (1) {
		entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
		if (range_overlaps(entry, file_offset, len))
			break;

		if (entry->file_offset >= file_offset + len) {
			entry = NULL;
			break;
		}
		entry = NULL;
		node = rb_next(node);
		if (!node)
			break;
	}
out:
	if (entry)
		refcount_inc(&entry->refs);
	spin_unlock_irq(&tree->lock);
	return entry;
}

/*
 * lookup and return any extent before 'file_offset'. NULL is returned
 * if none is found
 */
struct btrfs_ordered_extent *
btrfs_lookup_first_ordered_extent(struct inode *inode, u64 file_offset)
{
	struct btrfs_ordered_inode_tree *tree;
	struct rb_node *node;
	struct btrfs_ordered_extent *entry = NULL;

	tree = &BTRFS_I(inode)->ordered_tree;
	spin_lock_irq(&tree->lock);
	node = tree_search(tree, file_offset);
	if (!node)
		goto out;

	entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
	refcount_inc(&entry->refs);
out:
	spin_unlock_irq(&tree->lock);
	return entry;
}
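/*
 * Common pattern for the three lookup helpers above (sketch only): each
 * successful lookup returns the entry with an extra reference that the
 * caller must drop, e.g.
 *
 *	ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), pos, len);
 *	if (ordered) {
 *		// deal with the pending ordered extent
 *		btrfs_put_ordered_extent(ordered);
 *	}
 */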
/*
 * After an extent is done, call this to conditionally update the on disk
 * i_size. i_size is updated to cover any fully written part of the file.
 */
int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
				struct btrfs_ordered_extent *ordered)
{
	struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
	u64 disk_i_size;
	u64 new_i_size;
	u64 i_size = i_size_read(inode);
	struct rb_node *node;
	struct rb_node *prev = NULL;
	struct btrfs_ordered_extent *test;
	int ret = 1;
	u64 orig_offset = offset;

	spin_lock_irq(&tree->lock);
	if (ordered) {
		offset = entry_end(ordered);
		if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags))
			offset = min(offset,
				     ordered->file_offset +
				     ordered->truncated_len);
	} else {
		offset = ALIGN(offset, btrfs_inode_sectorsize(inode));
	}
	disk_i_size = BTRFS_I(inode)->disk_i_size;

	/*
	 * truncate file.
	 * If ordered is not NULL, then this is called from endio and
	 * disk_i_size will be updated by either truncate itself or any
	 * in-flight IOs which are inside the disk_i_size.
	 *
	 * Because btrfs_setsize() may set i_size with disk_i_size if truncate
	 * fails somehow, we need to make sure we have a precise disk_i_size by
	 * updating it as usual.
	 */
	if (!ordered && disk_i_size > i_size) {
		BTRFS_I(inode)->disk_i_size = orig_offset;
		ret = 0;
		goto out;
	}

	/*
	 * if the disk i_size is already at the inode->i_size, or
	 * this ordered extent is inside the disk i_size, we're done
	 */
	if (disk_i_size == i_size)
		goto out;

	/*
	 * We still need to update disk_i_size if outstanding_isize is greater
	 * than disk_i_size.
	 */
	if (offset <= disk_i_size &&
	    (!ordered || ordered->outstanding_isize <= disk_i_size))
		goto out;

	/*
	 * walk backward from this ordered extent to disk_i_size.
	 * if we find an ordered extent then we can't update disk i_size
	 * yet
	 */
	if (ordered) {
		node = rb_prev(&ordered->rb_node);
	} else {
		prev = tree_search(tree, offset);
		/*
		 * we insert file extents without involving ordered struct,
		 * so there should be no ordered struct covering this offset
		 */
		if (prev) {
			test = rb_entry(prev, struct btrfs_ordered_extent,
					rb_node);
			BUG_ON(offset_in_entry(test, offset));
		}
		node = prev;
	}
	for (; node; node = rb_prev(node)) {
		test = rb_entry(node, struct btrfs_ordered_extent, rb_node);

		/* We treat this entry as if it doesn't exist */
		if (test_bit(BTRFS_ORDERED_UPDATED_ISIZE, &test->flags))
			continue;

		if (entry_end(test) <= disk_i_size)
			break;
		if (test->file_offset >= i_size)
			break;

		/*
		 * We don't update disk_i_size now, so record this pending
		 * i_size here, or we will not know the real i_size.
		 */
		if (test->outstanding_isize < offset)
			test->outstanding_isize = offset;
		if (ordered &&
		    ordered->outstanding_isize > test->outstanding_isize)
			test->outstanding_isize = ordered->outstanding_isize;
		goto out;
	}
	new_i_size = min_t(u64, offset, i_size);
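	/*
	 * At this point no pending ordered extent sits between disk_i_size
	 * and 'offset', so disk_i_size may move forward to min(offset,
	 * i_size). Example with illustrative numbers: i_size == 100K,
	 * disk_i_size == 0 and a just-finished ordered extent covering
	 * [0, 64K) gives offset == 64K, so disk_i_size becomes 64K.
	 */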
	/*
	 * Some ordered extents may have completed before the current one, and
	 * we hold the real i_size in ->outstanding_isize.
	 */
	if (ordered && ordered->outstanding_isize > new_i_size)
		new_i_size = min_t(u64, ordered->outstanding_isize, i_size);
	BTRFS_I(inode)->disk_i_size = new_i_size;
	ret = 0;
out:
	/*
	 * We need to do this because we can't remove ordered extents until
	 * after the i_disk_size has been updated and then the inode has been
	 * updated to reflect the change, so we need to tell anybody who finds
	 * this ordered extent that we've already done all the real work, we
	 * just haven't completed all the other work.
	 */
	if (ordered)
		set_bit(BTRFS_ORDERED_UPDATED_ISIZE, &ordered->flags);
	spin_unlock_irq(&tree->lock);
	return ret;
}

/*
 * search the ordered extents for one corresponding to 'offset' and
 * try to find a checksum. This is used because we allow pages to
 * be reclaimed before their checksum is actually put into the btree
 */
int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
			   u32 *sum, int len)
{
	struct btrfs_ordered_sum *ordered_sum;
	struct btrfs_ordered_extent *ordered;
	struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
	unsigned long num_sectors;
	unsigned long i;
	u32 sectorsize = btrfs_inode_sectorsize(inode);
	int index = 0;

	ordered = btrfs_lookup_ordered_extent(inode, offset);
	if (!ordered)
		return 0;

	spin_lock_irq(&tree->lock);
	list_for_each_entry_reverse(ordered_sum, &ordered->list, list) {
		if (disk_bytenr >= ordered_sum->bytenr &&
		    disk_bytenr < ordered_sum->bytenr + ordered_sum->len) {
			i = (disk_bytenr - ordered_sum->bytenr) >>
			    inode->i_sb->s_blocksize_bits;
			num_sectors = ordered_sum->len >>
				      inode->i_sb->s_blocksize_bits;
			num_sectors = min_t(int, len - index, num_sectors - i);
			memcpy(sum + index, ordered_sum->sums + i,
			       num_sectors);

			index += (int)num_sectors;
			if (index == len)
				goto out;
			disk_bytenr += num_sectors * sectorsize;
		}
	}
out:
	spin_unlock_irq(&tree->lock);
	btrfs_put_ordered_extent(ordered);
	return index;
}

int __init ordered_data_init(void)
{
	btrfs_ordered_extent_cache = kmem_cache_create("btrfs_ordered_extent",
				     sizeof(struct btrfs_ordered_extent), 0,
				     SLAB_MEM_SPREAD,
				     NULL);
	if (!btrfs_ordered_extent_cache)
		return -ENOMEM;

	return 0;
}

void __cold ordered_data_exit(void)
{
	kmem_cache_destroy(btrfs_ordered_extent_cache);
}
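/*
 * ordered_data_init() and ordered_data_exit() are expected to be called once
 * from the btrfs module init/exit path (init_btrfs_fs()/exit_btrfs_fs() in
 * super.c in this tree) to set up and tear down the slab cache used for
 * struct btrfs_ordered_extent.
 */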