1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright (C) 2007 Oracle. All rights reserved. 4 */ 5 6 #include <linux/slab.h> 7 #include <linux/blkdev.h> 8 #include <linux/writeback.h> 9 #include <linux/sched/mm.h> 10 #include "messages.h" 11 #include "misc.h" 12 #include "ctree.h" 13 #include "transaction.h" 14 #include "btrfs_inode.h" 15 #include "extent_io.h" 16 #include "disk-io.h" 17 #include "compression.h" 18 #include "delalloc-space.h" 19 #include "qgroup.h" 20 #include "subpage.h" 21 #include "file.h" 22 #include "block-group.h" 23 24 static struct kmem_cache *btrfs_ordered_extent_cache; 25 26 static u64 entry_end(struct btrfs_ordered_extent *entry) 27 { 28 if (entry->file_offset + entry->num_bytes < entry->file_offset) 29 return (u64)-1; 30 return entry->file_offset + entry->num_bytes; 31 } 32 33 /* returns NULL if the insertion worked, or it returns the node it did find 34 * in the tree 35 */ 36 static struct rb_node *tree_insert(struct rb_root *root, u64 file_offset, 37 struct rb_node *node) 38 { 39 struct rb_node **p = &root->rb_node; 40 struct rb_node *parent = NULL; 41 struct btrfs_ordered_extent *entry; 42 43 while (*p) { 44 parent = *p; 45 entry = rb_entry(parent, struct btrfs_ordered_extent, rb_node); 46 47 if (file_offset < entry->file_offset) 48 p = &(*p)->rb_left; 49 else if (file_offset >= entry_end(entry)) 50 p = &(*p)->rb_right; 51 else 52 return parent; 53 } 54 55 rb_link_node(node, parent, p); 56 rb_insert_color(node, root); 57 return NULL; 58 } 59 60 /* 61 * look for a given offset in the tree, and if it can't be found return the 62 * first lesser offset 63 */ 64 static struct rb_node *__tree_search(struct rb_root *root, u64 file_offset, 65 struct rb_node **prev_ret) 66 { 67 struct rb_node *n = root->rb_node; 68 struct rb_node *prev = NULL; 69 struct rb_node *test; 70 struct btrfs_ordered_extent *entry; 71 struct btrfs_ordered_extent *prev_entry = NULL; 72 73 while (n) { 74 entry = rb_entry(n, struct btrfs_ordered_extent, rb_node); 75 prev = n; 76 prev_entry = entry; 77 78 if (file_offset < entry->file_offset) 79 n = n->rb_left; 80 else if (file_offset >= entry_end(entry)) 81 n = n->rb_right; 82 else 83 return n; 84 } 85 if (!prev_ret) 86 return NULL; 87 88 while (prev && file_offset >= entry_end(prev_entry)) { 89 test = rb_next(prev); 90 if (!test) 91 break; 92 prev_entry = rb_entry(test, struct btrfs_ordered_extent, 93 rb_node); 94 if (file_offset < entry_end(prev_entry)) 95 break; 96 97 prev = test; 98 } 99 if (prev) 100 prev_entry = rb_entry(prev, struct btrfs_ordered_extent, 101 rb_node); 102 while (prev && file_offset < entry_end(prev_entry)) { 103 test = rb_prev(prev); 104 if (!test) 105 break; 106 prev_entry = rb_entry(test, struct btrfs_ordered_extent, 107 rb_node); 108 prev = test; 109 } 110 *prev_ret = prev; 111 return NULL; 112 } 113 114 static int btrfs_range_overlaps(struct btrfs_ordered_extent *entry, u64 file_offset, 115 u64 len) 116 { 117 if (file_offset + len <= entry->file_offset || 118 entry->file_offset + entry->num_bytes <= file_offset) 119 return 0; 120 return 1; 121 } 122 123 /* 124 * look find the first ordered struct that has this offset, otherwise 125 * the first one less than this offset 126 */ 127 static inline struct rb_node *ordered_tree_search(struct btrfs_inode *inode, 128 u64 file_offset) 129 { 130 struct rb_node *prev = NULL; 131 struct rb_node *ret; 132 struct btrfs_ordered_extent *entry; 133 134 if (inode->ordered_tree_last) { 135 entry = rb_entry(inode->ordered_tree_last, struct btrfs_ordered_extent, 136 rb_node); 137 if (in_range(file_offset, entry->file_offset, entry->num_bytes)) 138 return inode->ordered_tree_last; 139 } 140 ret = __tree_search(&inode->ordered_tree, file_offset, &prev); 141 if (!ret) 142 ret = prev; 143 if (ret) 144 inode->ordered_tree_last = ret; 145 return ret; 146 } 147 148 static struct btrfs_ordered_extent *alloc_ordered_extent( 149 struct btrfs_inode *inode, u64 file_offset, u64 num_bytes, 150 u64 ram_bytes, u64 disk_bytenr, u64 disk_num_bytes, 151 u64 offset, unsigned long flags, int compress_type) 152 { 153 struct btrfs_ordered_extent *entry; 154 int ret; 155 u64 qgroup_rsv = 0; 156 const bool is_nocow = (flags & 157 ((1U << BTRFS_ORDERED_NOCOW) | (1U << BTRFS_ORDERED_PREALLOC))); 158 159 /* Only one type flag can be set. */ 160 ASSERT(has_single_bit_set(flags & BTRFS_ORDERED_EXCLUSIVE_FLAGS), 161 "flags=0x%lx", flags); 162 163 /* DIRECT cannot be set with COMPRESSED nor ENCODED. */ 164 if (test_bit(BTRFS_ORDERED_DIRECT, &flags)) { 165 ASSERT(!test_bit(BTRFS_ORDERED_COMPRESSED, &flags)); 166 ASSERT(!test_bit(BTRFS_ORDERED_ENCODED, &flags)); 167 } 168 169 /* ENCODED must be set with COMPRESSED. */ 170 if (test_bit(BTRFS_ORDERED_ENCODED, &flags)) 171 ASSERT(test_bit(BTRFS_ORDERED_COMPRESSED, &flags)); 172 173 /* 174 * For a NOCOW write we can free the qgroup reserve right now. For a COW 175 * one we transfer the reserved space from the inode's iotree into the 176 * ordered extent by calling btrfs_qgroup_release_data() and tracking 177 * the qgroup reserved amount in the ordered extent, so that later after 178 * completing the ordered extent, when running the data delayed ref it 179 * creates, we free the reserved data with btrfs_qgroup_free_refroot(). 180 */ 181 if (is_nocow) 182 ret = btrfs_qgroup_free_data(inode, NULL, file_offset, num_bytes, &qgroup_rsv); 183 else 184 ret = btrfs_qgroup_release_data(inode, file_offset, num_bytes, &qgroup_rsv); 185 186 if (ret < 0) 187 return ERR_PTR(ret); 188 189 entry = kmem_cache_zalloc(btrfs_ordered_extent_cache, GFP_NOFS); 190 if (!entry) { 191 entry = ERR_PTR(-ENOMEM); 192 goto out; 193 } 194 195 entry->file_offset = file_offset; 196 entry->num_bytes = num_bytes; 197 entry->ram_bytes = ram_bytes; 198 entry->disk_bytenr = disk_bytenr; 199 entry->disk_num_bytes = disk_num_bytes; 200 entry->offset = offset; 201 entry->bytes_left = num_bytes; 202 if (WARN_ON_ONCE(!igrab(&inode->vfs_inode))) { 203 kmem_cache_free(btrfs_ordered_extent_cache, entry); 204 entry = ERR_PTR(-ESTALE); 205 goto out; 206 } 207 entry->inode = inode; 208 entry->compress_type = compress_type; 209 entry->truncated_len = (u64)-1; 210 entry->qgroup_rsv = qgroup_rsv; 211 entry->flags = flags; 212 refcount_set(&entry->refs, 1); 213 init_waitqueue_head(&entry->wait); 214 INIT_LIST_HEAD(&entry->csum_list); 215 INIT_LIST_HEAD(&entry->log_list); 216 INIT_LIST_HEAD(&entry->root_extent_list); 217 INIT_LIST_HEAD(&entry->work_list); 218 INIT_LIST_HEAD(&entry->bioc_list); 219 init_completion(&entry->completion); 220 221 /* 222 * We don't need the count_max_extents here, we can assume that all of 223 * that work has been done at higher layers, so this is truly the 224 * smallest the extent is going to get. 225 */ 226 spin_lock(&inode->lock); 227 btrfs_mod_outstanding_extents(inode, 1); 228 spin_unlock(&inode->lock); 229 230 out: 231 if (IS_ERR(entry) && !is_nocow) 232 btrfs_qgroup_free_refroot(inode->root->fs_info, 233 btrfs_root_id(inode->root), 234 qgroup_rsv, BTRFS_QGROUP_RSV_DATA); 235 236 return entry; 237 } 238 239 static void insert_ordered_extent(struct btrfs_ordered_extent *entry) 240 { 241 struct btrfs_inode *inode = entry->inode; 242 struct btrfs_root *root = inode->root; 243 struct btrfs_fs_info *fs_info = root->fs_info; 244 struct rb_node *node; 245 246 trace_btrfs_ordered_extent_add(inode, entry); 247 248 percpu_counter_add_batch(&fs_info->ordered_bytes, entry->num_bytes, 249 fs_info->delalloc_batch); 250 251 /* One ref for the tree. */ 252 refcount_inc(&entry->refs); 253 254 spin_lock(&inode->ordered_tree_lock); 255 node = tree_insert(&inode->ordered_tree, entry->file_offset, 256 &entry->rb_node); 257 if (unlikely(node)) { 258 struct btrfs_ordered_extent *exist = 259 rb_entry(node, struct btrfs_ordered_extent, rb_node); 260 261 btrfs_panic(fs_info, -EEXIST, 262 "overlapping ordered extents, existing oe file_offset %llu num_bytes %llu flags 0x%lx, new oe file_offset %llu num_bytes %llu flags 0x%lx", 263 exist->file_offset, exist->num_bytes, exist->flags, 264 entry->file_offset, entry->num_bytes, entry->flags); 265 } 266 spin_unlock(&inode->ordered_tree_lock); 267 268 spin_lock(&root->ordered_extent_lock); 269 list_add_tail(&entry->root_extent_list, 270 &root->ordered_extents); 271 root->nr_ordered_extents++; 272 if (root->nr_ordered_extents == 1) { 273 spin_lock(&fs_info->ordered_root_lock); 274 BUG_ON(!list_empty(&root->ordered_root)); 275 list_add_tail(&root->ordered_root, &fs_info->ordered_roots); 276 spin_unlock(&fs_info->ordered_root_lock); 277 } 278 spin_unlock(&root->ordered_extent_lock); 279 } 280 281 /* 282 * Add an ordered extent to the per-inode tree. 283 * 284 * @inode: Inode that this extent is for. 285 * @file_offset: Logical offset in file where the extent starts. 286 * @num_bytes: Logical length of extent in file. 287 * @ram_bytes: Full length of unencoded data. 288 * @disk_bytenr: Offset of extent on disk. 289 * @disk_num_bytes: Size of extent on disk. 290 * @offset: Offset into unencoded data where file data starts. 291 * @flags: Flags specifying type of extent (1U << BTRFS_ORDERED_*). 292 * @compress_type: Compression algorithm used for data. 293 * 294 * Most of these parameters correspond to &struct btrfs_file_extent_item. The 295 * tree is given a single reference on the ordered extent that was inserted, and 296 * the returned pointer is given a second reference. 297 * 298 * Return: the new ordered extent or error pointer. 299 */ 300 struct btrfs_ordered_extent *btrfs_alloc_ordered_extent( 301 struct btrfs_inode *inode, u64 file_offset, 302 const struct btrfs_file_extent *file_extent, unsigned long flags) 303 { 304 struct btrfs_ordered_extent *entry; 305 306 ASSERT((flags & ~BTRFS_ORDERED_TYPE_FLAGS) == 0, "flags=0x%lx", flags); 307 308 /* 309 * For regular writes, we just use the members in @file_extent. 310 * 311 * For NOCOW, we don't really care about the numbers except @start and 312 * file_extent->num_bytes, as we won't insert a file extent item at all. 313 * 314 * For PREALLOC, we do not use ordered extent members, but 315 * btrfs_mark_extent_written() handles everything. 316 * 317 * So here we always pass 0 as offset for NOCOW/PREALLOC ordered extents, 318 * or btrfs_split_ordered_extent() cannot handle it correctly. 319 */ 320 if (flags & ((1U << BTRFS_ORDERED_NOCOW) | (1U << BTRFS_ORDERED_PREALLOC))) 321 entry = alloc_ordered_extent(inode, file_offset, 322 file_extent->num_bytes, 323 file_extent->num_bytes, 324 file_extent->disk_bytenr + file_extent->offset, 325 file_extent->num_bytes, 0, flags, 326 file_extent->compression); 327 else 328 entry = alloc_ordered_extent(inode, file_offset, 329 file_extent->num_bytes, 330 file_extent->ram_bytes, 331 file_extent->disk_bytenr, 332 file_extent->disk_num_bytes, 333 file_extent->offset, flags, 334 file_extent->compression); 335 if (!IS_ERR(entry)) 336 insert_ordered_extent(entry); 337 return entry; 338 } 339 340 /* 341 * Add a struct btrfs_ordered_sum into the list of checksums to be inserted 342 * when an ordered extent is finished. If the list covers more than one 343 * ordered extent, it is split across multiples. 344 */ 345 void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry, 346 struct btrfs_ordered_sum *sum) 347 { 348 struct btrfs_inode *inode = entry->inode; 349 350 spin_lock(&inode->ordered_tree_lock); 351 list_add_tail(&sum->list, &entry->csum_list); 352 spin_unlock(&inode->ordered_tree_lock); 353 } 354 355 void btrfs_mark_ordered_extent_error(struct btrfs_ordered_extent *ordered) 356 { 357 if (!test_and_set_bit(BTRFS_ORDERED_IOERR, &ordered->flags)) 358 mapping_set_error(ordered->inode->vfs_inode.i_mapping, -EIO); 359 } 360 361 void btrfs_mark_ordered_extent_truncated(struct btrfs_ordered_extent *ordered, 362 u64 truncate_len) 363 { 364 struct btrfs_inode *inode = ordered->inode; 365 366 ASSERT(truncate_len <= ordered->num_bytes); 367 spin_lock(&inode->ordered_tree_lock); 368 set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags); 369 ordered->truncated_len = min(ordered->truncated_len, truncate_len); 370 spin_unlock(&inode->ordered_tree_lock); 371 } 372 373 static void finish_ordered_fn(struct btrfs_work *work) 374 { 375 struct btrfs_ordered_extent *ordered_extent; 376 377 ordered_extent = container_of(work, struct btrfs_ordered_extent, work); 378 btrfs_finish_ordered_io(ordered_extent); 379 } 380 381 static bool can_finish_ordered_extent(struct btrfs_ordered_extent *ordered, 382 u64 file_offset, u64 len, bool uptodate) 383 { 384 struct btrfs_inode *inode = ordered->inode; 385 struct btrfs_fs_info *fs_info = inode->root->fs_info; 386 387 lockdep_assert_held(&inode->ordered_tree_lock); 388 389 /* Now we're fine to update the accounting. */ 390 if (WARN_ON_ONCE(len > ordered->bytes_left)) { 391 btrfs_crit(fs_info, 392 "bad ordered extent accounting, root=%llu ino=%llu OE offset=%llu OE len=%llu to_dec=%llu left=%llu", 393 btrfs_root_id(inode->root), btrfs_ino(inode), 394 ordered->file_offset, ordered->num_bytes, 395 len, ordered->bytes_left); 396 ordered->bytes_left = 0; 397 } else { 398 ordered->bytes_left -= len; 399 } 400 401 if (!uptodate) 402 btrfs_mark_ordered_extent_error(ordered); 403 404 if (ordered->bytes_left) 405 return false; 406 407 /* 408 * All the IO of the ordered extent is finished, we need to queue 409 * the finish_func to be executed. 410 */ 411 set_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags); 412 cond_wake_up(&ordered->wait); 413 refcount_inc(&ordered->refs); 414 trace_btrfs_ordered_extent_mark_finished(inode, ordered); 415 return true; 416 } 417 418 static void btrfs_queue_ordered_fn(struct btrfs_ordered_extent *ordered) 419 { 420 struct btrfs_inode *inode = ordered->inode; 421 struct btrfs_fs_info *fs_info = inode->root->fs_info; 422 struct btrfs_workqueue *wq = btrfs_is_free_space_inode(inode) ? 423 fs_info->endio_freespace_worker : fs_info->endio_write_workers; 424 425 btrfs_init_work(&ordered->work, finish_ordered_fn, NULL); 426 btrfs_queue_work(wq, &ordered->work); 427 } 428 429 void btrfs_finish_ordered_extent(struct btrfs_ordered_extent *ordered, 430 u64 file_offset, u64 len, bool uptodate) 431 { 432 struct btrfs_inode *inode = ordered->inode; 433 bool ret; 434 435 trace_btrfs_finish_ordered_extent(inode, file_offset, len, uptodate); 436 437 spin_lock(&inode->ordered_tree_lock); 438 ret = can_finish_ordered_extent(ordered, file_offset, len, 439 uptodate); 440 spin_unlock(&inode->ordered_tree_lock); 441 442 /* 443 * If this is a COW write it means we created new extent maps for the 444 * range and they point to unwritten locations if we got an error either 445 * before submitting a bio or during IO. 446 * 447 * We have marked the ordered extent with BTRFS_ORDERED_IOERR, and we 448 * are queuing its completion below. During completion, at 449 * btrfs_finish_one_ordered(), we will drop the extent maps for the 450 * unwritten extents. 451 * 452 * However because completion runs in a work queue we can end up having 453 * a fast fsync running before that. In the case of direct IO, once we 454 * unlock the inode the fsync might start, and we queue the completion 455 * before unlocking the inode. In the case of buffered IO when writeback 456 * finishes (end_bbio_data_write()) we queue the completion, so if the 457 * writeback was triggered by a fast fsync, the fsync might start 458 * logging before ordered extent completion runs in the work queue. 459 * 460 * The fast fsync will log file extent items based on the extent maps it 461 * finds, so if by the time it collects extent maps the ordered extent 462 * completion didn't happen yet, it will log file extent items that 463 * point to unwritten extents, resulting in a corruption if a crash 464 * happens and the log tree is replayed. Note that a fast fsync does not 465 * wait for completion of ordered extents in order to reduce latency. 466 * 467 * Set a flag in the inode so that the next fast fsync will wait for 468 * ordered extents to complete before starting to log. 469 */ 470 if (!uptodate && !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) 471 set_bit(BTRFS_INODE_COW_WRITE_ERROR, &inode->runtime_flags); 472 473 if (ret) 474 btrfs_queue_ordered_fn(ordered); 475 } 476 477 /* 478 * Mark all ordered extents io inside the specified range finished. 479 * 480 * @folio: The involved folio for the operation. 481 * For uncompressed buffered IO, the folio status also needs to be 482 * updated to indicate whether the pending ordered io is finished. 483 * Can be NULL for direct IO and compressed write. 484 * For these cases, callers are ensured they won't execute the 485 * endio function twice. 486 * 487 * This function is called for endio, thus the range must have ordered 488 * extent(s) covering it. 489 */ 490 void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode, 491 u64 file_offset, u64 num_bytes, bool uptodate) 492 { 493 struct rb_node *node; 494 struct btrfs_ordered_extent *entry = NULL; 495 u64 cur = file_offset; 496 const u64 end = file_offset + num_bytes; 497 498 trace_btrfs_writepage_end_io_hook(inode, file_offset, end - 1, uptodate); 499 500 spin_lock(&inode->ordered_tree_lock); 501 while (cur < end) { 502 u64 entry_end; 503 u64 this_end; 504 u64 len; 505 506 node = ordered_tree_search(inode, cur); 507 /* No ordered extents at all */ 508 if (!node) 509 break; 510 511 entry = rb_entry(node, struct btrfs_ordered_extent, rb_node); 512 entry_end = entry->file_offset + entry->num_bytes; 513 /* 514 * |<-- OE --->| | 515 * cur 516 * Go to next OE. 517 */ 518 if (cur >= entry_end) { 519 node = rb_next(node); 520 /* No more ordered extents, exit */ 521 if (!node) 522 break; 523 entry = rb_entry(node, struct btrfs_ordered_extent, 524 rb_node); 525 526 /* Go to next ordered extent and continue */ 527 cur = entry->file_offset; 528 continue; 529 } 530 /* 531 * | |<--- OE --->| 532 * cur 533 * Go to the start of OE. 534 */ 535 if (cur < entry->file_offset) { 536 cur = entry->file_offset; 537 continue; 538 } 539 540 /* 541 * Now we are definitely inside one ordered extent. 542 * 543 * |<--- OE --->| 544 * | 545 * cur 546 */ 547 this_end = min(entry_end, end); 548 len = this_end - cur; 549 ASSERT(len < U32_MAX); 550 551 if (can_finish_ordered_extent(entry, cur, len, uptodate)) { 552 spin_unlock(&inode->ordered_tree_lock); 553 btrfs_queue_ordered_fn(entry); 554 spin_lock(&inode->ordered_tree_lock); 555 } 556 cur += len; 557 } 558 spin_unlock(&inode->ordered_tree_lock); 559 } 560 561 /* 562 * Finish IO for one ordered extent across a given range. The range can only 563 * contain one ordered extent. 564 * 565 * @cached: The cached ordered extent. If not NULL, we can skip the tree 566 * search and use the ordered extent directly. 567 * Will be also used to store the finished ordered extent. 568 * @file_offset: File offset for the finished IO 569 * @io_size: Length of the finish IO range 570 * 571 * Return true if the ordered extent is finished in the range, and update 572 * @cached. 573 * Return false otherwise. 574 * 575 * NOTE: The range can NOT cross multiple ordered extents. 576 * Thus caller should ensure the range doesn't cross ordered extents. 577 */ 578 bool btrfs_dec_test_ordered_pending(struct btrfs_inode *inode, 579 struct btrfs_ordered_extent **cached, 580 u64 file_offset, u64 io_size) 581 { 582 struct rb_node *node; 583 struct btrfs_ordered_extent *entry = NULL; 584 bool finished = false; 585 586 spin_lock(&inode->ordered_tree_lock); 587 if (cached && *cached) { 588 entry = *cached; 589 goto have_entry; 590 } 591 592 node = ordered_tree_search(inode, file_offset); 593 if (!node) 594 goto out; 595 596 entry = rb_entry(node, struct btrfs_ordered_extent, rb_node); 597 have_entry: 598 if (!in_range(file_offset, entry->file_offset, entry->num_bytes)) 599 goto out; 600 601 if (io_size > entry->bytes_left) 602 btrfs_crit(inode->root->fs_info, 603 "bad ordered accounting left %llu size %llu", 604 entry->bytes_left, io_size); 605 606 entry->bytes_left -= io_size; 607 608 if (entry->bytes_left == 0) { 609 /* 610 * Ensure only one caller can set the flag and finished_ret 611 * accordingly 612 */ 613 finished = !test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags); 614 /* test_and_set_bit implies a barrier */ 615 cond_wake_up_nomb(&entry->wait); 616 } 617 out: 618 if (finished && cached && entry) { 619 *cached = entry; 620 refcount_inc(&entry->refs); 621 trace_btrfs_ordered_extent_dec_test_pending(inode, entry); 622 } 623 spin_unlock(&inode->ordered_tree_lock); 624 return finished; 625 } 626 627 /* 628 * used to drop a reference on an ordered extent. This will free 629 * the extent if the last reference is dropped 630 */ 631 void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry) 632 { 633 trace_btrfs_ordered_extent_put(entry->inode, entry); 634 635 if (refcount_dec_and_test(&entry->refs)) { 636 struct btrfs_ordered_sum *sum; 637 struct btrfs_ordered_sum *tmp; 638 639 ASSERT(list_empty(&entry->root_extent_list)); 640 ASSERT(list_empty(&entry->log_list)); 641 ASSERT(RB_EMPTY_NODE(&entry->rb_node)); 642 btrfs_add_delayed_iput(entry->inode); 643 list_for_each_entry_safe(sum, tmp, &entry->csum_list, list) 644 kvfree(sum); 645 kmem_cache_free(btrfs_ordered_extent_cache, entry); 646 } 647 } 648 649 /* 650 * remove an ordered extent from the tree. No references are dropped 651 * and waiters are woken up. 652 */ 653 void btrfs_remove_ordered_extent(struct btrfs_ordered_extent *entry) 654 { 655 struct btrfs_inode *btrfs_inode = entry->inode; 656 struct btrfs_root *root = btrfs_inode->root; 657 struct btrfs_fs_info *fs_info = root->fs_info; 658 struct rb_node *node; 659 bool pending; 660 bool freespace_inode; 661 662 /* 663 * If this is a free space inode the thread has not acquired the ordered 664 * extents lockdep map. 665 */ 666 freespace_inode = btrfs_is_free_space_inode(btrfs_inode); 667 668 btrfs_lockdep_acquire(fs_info, btrfs_trans_pending_ordered); 669 /* This is paired with alloc_ordered_extent(). */ 670 spin_lock(&btrfs_inode->lock); 671 btrfs_mod_outstanding_extents(btrfs_inode, -1); 672 spin_unlock(&btrfs_inode->lock); 673 if (root != fs_info->tree_root) { 674 u64 release; 675 676 if (test_bit(BTRFS_ORDERED_ENCODED, &entry->flags)) 677 release = entry->disk_num_bytes; 678 else 679 release = entry->num_bytes; 680 btrfs_delalloc_release_metadata(btrfs_inode, release, 681 test_bit(BTRFS_ORDERED_IOERR, 682 &entry->flags)); 683 } 684 685 percpu_counter_add_batch(&fs_info->ordered_bytes, -entry->num_bytes, 686 fs_info->delalloc_batch); 687 688 spin_lock(&btrfs_inode->ordered_tree_lock); 689 node = &entry->rb_node; 690 rb_erase(node, &btrfs_inode->ordered_tree); 691 RB_CLEAR_NODE(node); 692 if (btrfs_inode->ordered_tree_last == node) 693 btrfs_inode->ordered_tree_last = NULL; 694 set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags); 695 pending = test_and_clear_bit(BTRFS_ORDERED_PENDING, &entry->flags); 696 spin_unlock(&btrfs_inode->ordered_tree_lock); 697 698 /* 699 * The current running transaction is waiting on us, we need to let it 700 * know that we're complete and wake it up. 701 */ 702 if (pending) { 703 struct btrfs_transaction *trans; 704 705 /* 706 * The checks for trans are just a formality, it should be set, 707 * but if it isn't we don't want to deref/assert under the spin 708 * lock, so be nice and check if trans is set, but ASSERT() so 709 * if it isn't set a developer will notice. 710 */ 711 spin_lock(&fs_info->trans_lock); 712 trans = fs_info->running_transaction; 713 if (trans) 714 refcount_inc(&trans->use_count); 715 spin_unlock(&fs_info->trans_lock); 716 717 ASSERT(trans || BTRFS_FS_ERROR(fs_info)); 718 if (trans) { 719 if (atomic_dec_and_test(&trans->pending_ordered)) 720 wake_up(&trans->pending_wait); 721 btrfs_put_transaction(trans); 722 } 723 } 724 725 btrfs_lockdep_release(fs_info, btrfs_trans_pending_ordered); 726 727 spin_lock(&root->ordered_extent_lock); 728 list_del_init(&entry->root_extent_list); 729 root->nr_ordered_extents--; 730 731 trace_btrfs_ordered_extent_remove(btrfs_inode, entry); 732 733 if (!root->nr_ordered_extents) { 734 spin_lock(&fs_info->ordered_root_lock); 735 BUG_ON(list_empty(&root->ordered_root)); 736 list_del_init(&root->ordered_root); 737 spin_unlock(&fs_info->ordered_root_lock); 738 } 739 spin_unlock(&root->ordered_extent_lock); 740 wake_up(&entry->wait); 741 if (!freespace_inode) 742 btrfs_lockdep_release(fs_info, btrfs_ordered_extent); 743 } 744 745 static void btrfs_run_ordered_extent_work(struct btrfs_work *work) 746 { 747 struct btrfs_ordered_extent *ordered; 748 749 ordered = container_of(work, struct btrfs_ordered_extent, flush_work); 750 btrfs_start_ordered_extent(ordered); 751 complete(&ordered->completion); 752 } 753 754 /* 755 * Wait for all the ordered extents in a root. Use @bg as range or do whole 756 * range if it's NULL. 757 */ 758 u64 btrfs_wait_ordered_extents(struct btrfs_root *root, u64 nr, 759 const struct btrfs_block_group *bg) 760 { 761 struct btrfs_fs_info *fs_info = root->fs_info; 762 LIST_HEAD(splice); 763 LIST_HEAD(skipped); 764 LIST_HEAD(works); 765 struct btrfs_ordered_extent *ordered, *next; 766 u64 count = 0; 767 u64 range_start, range_len; 768 u64 range_end; 769 770 if (bg) { 771 range_start = bg->start; 772 range_len = bg->length; 773 } else { 774 range_start = 0; 775 range_len = U64_MAX; 776 } 777 range_end = range_start + range_len; 778 779 mutex_lock(&root->ordered_extent_mutex); 780 spin_lock(&root->ordered_extent_lock); 781 list_splice_init(&root->ordered_extents, &splice); 782 while (!list_empty(&splice) && nr) { 783 ordered = list_first_entry(&splice, struct btrfs_ordered_extent, 784 root_extent_list); 785 786 if (range_end <= ordered->disk_bytenr || 787 ordered->disk_bytenr + ordered->disk_num_bytes <= range_start) { 788 list_move_tail(&ordered->root_extent_list, &skipped); 789 cond_resched_lock(&root->ordered_extent_lock); 790 continue; 791 } 792 793 list_move_tail(&ordered->root_extent_list, 794 &root->ordered_extents); 795 refcount_inc(&ordered->refs); 796 spin_unlock(&root->ordered_extent_lock); 797 798 btrfs_init_work(&ordered->flush_work, 799 btrfs_run_ordered_extent_work, NULL); 800 list_add_tail(&ordered->work_list, &works); 801 btrfs_queue_work(fs_info->flush_workers, &ordered->flush_work); 802 803 cond_resched(); 804 if (nr != U64_MAX) 805 nr--; 806 count++; 807 spin_lock(&root->ordered_extent_lock); 808 } 809 list_splice_tail(&skipped, &root->ordered_extents); 810 list_splice_tail(&splice, &root->ordered_extents); 811 spin_unlock(&root->ordered_extent_lock); 812 813 list_for_each_entry_safe(ordered, next, &works, work_list) { 814 list_del_init(&ordered->work_list); 815 wait_for_completion(&ordered->completion); 816 btrfs_put_ordered_extent(ordered); 817 cond_resched(); 818 } 819 mutex_unlock(&root->ordered_extent_mutex); 820 821 return count; 822 } 823 824 /* 825 * Wait for @nr ordered extents that intersect the @bg, or the whole range of 826 * the filesystem if @bg is NULL. 827 */ 828 void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr, 829 const struct btrfs_block_group *bg) 830 { 831 struct btrfs_root *root; 832 LIST_HEAD(splice); 833 u64 done; 834 835 mutex_lock(&fs_info->ordered_operations_mutex); 836 spin_lock(&fs_info->ordered_root_lock); 837 list_splice_init(&fs_info->ordered_roots, &splice); 838 while (!list_empty(&splice) && nr) { 839 root = list_first_entry(&splice, struct btrfs_root, 840 ordered_root); 841 root = btrfs_grab_root(root); 842 BUG_ON(!root); 843 list_move_tail(&root->ordered_root, 844 &fs_info->ordered_roots); 845 spin_unlock(&fs_info->ordered_root_lock); 846 847 done = btrfs_wait_ordered_extents(root, nr, bg); 848 btrfs_put_root(root); 849 850 if (nr != U64_MAX) 851 nr -= done; 852 853 spin_lock(&fs_info->ordered_root_lock); 854 } 855 list_splice_tail(&splice, &fs_info->ordered_roots); 856 spin_unlock(&fs_info->ordered_root_lock); 857 mutex_unlock(&fs_info->ordered_operations_mutex); 858 } 859 860 /* 861 * Start IO and wait for a given ordered extent to finish. 862 * 863 * Wait on page writeback for all the pages in the extent but not in 864 * [@nowriteback_start, @nowriteback_start + @nowriteback_len) and the 865 * IO completion code to insert metadata into the btree corresponding to the extent. 866 */ 867 void btrfs_start_ordered_extent_nowriteback(struct btrfs_ordered_extent *entry, 868 u64 nowriteback_start, u32 nowriteback_len) 869 { 870 u64 start = entry->file_offset; 871 u64 end = start + entry->num_bytes - 1; 872 struct btrfs_inode *inode = entry->inode; 873 bool freespace_inode; 874 875 trace_btrfs_ordered_extent_start(inode, entry); 876 877 /* 878 * If this is a free space inode do not take the ordered extents lockdep 879 * map. 880 */ 881 freespace_inode = btrfs_is_free_space_inode(inode); 882 883 /* 884 * pages in the range can be dirty, clean or writeback. We 885 * start IO on any dirty ones so the wait doesn't stall waiting 886 * for the flusher thread to find them 887 */ 888 if (!test_bit(BTRFS_ORDERED_DIRECT, &entry->flags)) { 889 if (!nowriteback_len) { 890 filemap_fdatawrite_range(inode->vfs_inode.i_mapping, start, end); 891 } else { 892 if (start < nowriteback_start) 893 filemap_fdatawrite_range(inode->vfs_inode.i_mapping, start, 894 nowriteback_start - 1); 895 if (nowriteback_start + nowriteback_len < end) 896 filemap_fdatawrite_range(inode->vfs_inode.i_mapping, 897 nowriteback_start + nowriteback_len, 898 end); 899 } 900 } 901 902 if (!freespace_inode) 903 btrfs_might_wait_for_event(inode->root->fs_info, btrfs_ordered_extent); 904 wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE, &entry->flags)); 905 } 906 907 /* 908 * Used to wait on ordered extents across a large range of bytes. 909 */ 910 int btrfs_wait_ordered_range(struct btrfs_inode *inode, u64 start, u64 len) 911 { 912 int ret = 0; 913 int ret_wb = 0; 914 u64 end; 915 u64 orig_end; 916 struct btrfs_ordered_extent *ordered; 917 918 if (start + len < start) { 919 orig_end = OFFSET_MAX; 920 } else { 921 orig_end = start + len - 1; 922 if (orig_end > OFFSET_MAX) 923 orig_end = OFFSET_MAX; 924 } 925 926 /* start IO across the range first to instantiate any delalloc 927 * extents 928 */ 929 ret = btrfs_fdatawrite_range(inode, start, orig_end); 930 if (ret) 931 return ret; 932 933 /* 934 * If we have a writeback error don't return immediately. Wait first 935 * for any ordered extents that haven't completed yet. This is to make 936 * sure no one can dirty the same page ranges and call writepages() 937 * before the ordered extents complete - to avoid failures (-EEXIST) 938 * when adding the new ordered extents to the ordered tree. 939 */ 940 ret_wb = filemap_fdatawait_range(inode->vfs_inode.i_mapping, start, orig_end); 941 942 end = orig_end; 943 while (1) { 944 ordered = btrfs_lookup_first_ordered_extent(inode, end); 945 if (!ordered) 946 break; 947 if (ordered->file_offset > orig_end) { 948 btrfs_put_ordered_extent(ordered); 949 break; 950 } 951 if (ordered->file_offset + ordered->num_bytes <= start) { 952 btrfs_put_ordered_extent(ordered); 953 break; 954 } 955 btrfs_start_ordered_extent(ordered); 956 end = ordered->file_offset; 957 /* 958 * If the ordered extent had an error save the error but don't 959 * exit without waiting first for all other ordered extents in 960 * the range to complete. 961 */ 962 if (test_bit(BTRFS_ORDERED_IOERR, &ordered->flags)) 963 ret = -EIO; 964 btrfs_put_ordered_extent(ordered); 965 if (end == 0 || end == start) 966 break; 967 end--; 968 } 969 return ret_wb ? ret_wb : ret; 970 } 971 972 /* 973 * find an ordered extent corresponding to file_offset. return NULL if 974 * nothing is found, otherwise take a reference on the extent and return it 975 */ 976 struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *inode, 977 u64 file_offset) 978 { 979 struct rb_node *node; 980 struct btrfs_ordered_extent *entry = NULL; 981 982 spin_lock(&inode->ordered_tree_lock); 983 node = ordered_tree_search(inode, file_offset); 984 if (!node) 985 goto out; 986 987 entry = rb_entry(node, struct btrfs_ordered_extent, rb_node); 988 if (!in_range(file_offset, entry->file_offset, entry->num_bytes)) 989 entry = NULL; 990 if (entry) { 991 refcount_inc(&entry->refs); 992 trace_btrfs_ordered_extent_lookup(inode, entry); 993 } 994 out: 995 spin_unlock(&inode->ordered_tree_lock); 996 return entry; 997 } 998 999 /* Since the DIO code tries to lock a wide area we need to look for any ordered 1000 * extents that exist in the range, rather than just the start of the range. 1001 */ 1002 struct btrfs_ordered_extent *btrfs_lookup_ordered_range( 1003 struct btrfs_inode *inode, u64 file_offset, u64 len) 1004 { 1005 struct rb_node *node; 1006 struct btrfs_ordered_extent *entry = NULL; 1007 1008 spin_lock(&inode->ordered_tree_lock); 1009 node = ordered_tree_search(inode, file_offset); 1010 if (!node) { 1011 node = ordered_tree_search(inode, file_offset + len); 1012 if (!node) 1013 goto out; 1014 } 1015 1016 while (1) { 1017 entry = rb_entry(node, struct btrfs_ordered_extent, rb_node); 1018 if (btrfs_range_overlaps(entry, file_offset, len)) 1019 break; 1020 1021 if (entry->file_offset >= file_offset + len) { 1022 entry = NULL; 1023 break; 1024 } 1025 entry = NULL; 1026 node = rb_next(node); 1027 if (!node) 1028 break; 1029 } 1030 out: 1031 if (entry) { 1032 refcount_inc(&entry->refs); 1033 trace_btrfs_ordered_extent_lookup_range(inode, entry); 1034 } 1035 spin_unlock(&inode->ordered_tree_lock); 1036 return entry; 1037 } 1038 1039 /* 1040 * Adds all ordered extents to the given list. The list ends up sorted by the 1041 * file_offset of the ordered extents. 1042 */ 1043 void btrfs_get_ordered_extents_for_logging(struct btrfs_inode *inode, 1044 struct list_head *list) 1045 { 1046 struct rb_node *n; 1047 1048 btrfs_assert_inode_locked(inode); 1049 1050 spin_lock(&inode->ordered_tree_lock); 1051 for (n = rb_first(&inode->ordered_tree); n; n = rb_next(n)) { 1052 struct btrfs_ordered_extent *ordered; 1053 1054 ordered = rb_entry(n, struct btrfs_ordered_extent, rb_node); 1055 1056 if (test_bit(BTRFS_ORDERED_LOGGED, &ordered->flags)) 1057 continue; 1058 1059 ASSERT(list_empty(&ordered->log_list)); 1060 list_add_tail(&ordered->log_list, list); 1061 refcount_inc(&ordered->refs); 1062 trace_btrfs_ordered_extent_lookup_for_logging(inode, ordered); 1063 } 1064 spin_unlock(&inode->ordered_tree_lock); 1065 } 1066 1067 /* 1068 * lookup and return any extent before 'file_offset'. NULL is returned 1069 * if none is found 1070 */ 1071 struct btrfs_ordered_extent * 1072 btrfs_lookup_first_ordered_extent(struct btrfs_inode *inode, u64 file_offset) 1073 { 1074 struct rb_node *node; 1075 struct btrfs_ordered_extent *entry = NULL; 1076 1077 spin_lock(&inode->ordered_tree_lock); 1078 node = ordered_tree_search(inode, file_offset); 1079 if (!node) 1080 goto out; 1081 1082 entry = rb_entry(node, struct btrfs_ordered_extent, rb_node); 1083 refcount_inc(&entry->refs); 1084 trace_btrfs_ordered_extent_lookup_first(inode, entry); 1085 out: 1086 spin_unlock(&inode->ordered_tree_lock); 1087 return entry; 1088 } 1089 1090 /* 1091 * Lookup the first ordered extent that overlaps the range 1092 * [@file_offset, @file_offset + @len). 1093 * 1094 * The difference between this and btrfs_lookup_first_ordered_extent() is 1095 * that this one won't return any ordered extent that does not overlap the range. 1096 * And the difference against btrfs_lookup_ordered_extent() is, this function 1097 * ensures the first ordered extent gets returned. 1098 */ 1099 struct btrfs_ordered_extent *btrfs_lookup_first_ordered_range( 1100 struct btrfs_inode *inode, u64 file_offset, u64 len) 1101 { 1102 struct rb_node *node; 1103 struct rb_node *cur; 1104 struct rb_node *prev; 1105 struct rb_node *next; 1106 struct btrfs_ordered_extent *entry = NULL; 1107 1108 spin_lock(&inode->ordered_tree_lock); 1109 node = inode->ordered_tree.rb_node; 1110 /* 1111 * Here we don't want to use tree_search() which will use tree->last 1112 * and screw up the search order. 1113 * And __tree_search() can't return the adjacent ordered extents 1114 * either, thus here we do our own search. 1115 */ 1116 while (node) { 1117 entry = rb_entry(node, struct btrfs_ordered_extent, rb_node); 1118 1119 if (file_offset < entry->file_offset) { 1120 node = node->rb_left; 1121 } else if (file_offset >= entry_end(entry)) { 1122 node = node->rb_right; 1123 } else { 1124 /* 1125 * Direct hit, got an ordered extent that starts at 1126 * @file_offset 1127 */ 1128 goto out; 1129 } 1130 } 1131 if (!entry) { 1132 /* Empty tree */ 1133 goto out; 1134 } 1135 1136 cur = &entry->rb_node; 1137 /* We got an entry around @file_offset, check adjacent entries */ 1138 if (entry->file_offset < file_offset) { 1139 prev = cur; 1140 next = rb_next(cur); 1141 } else { 1142 prev = rb_prev(cur); 1143 next = cur; 1144 } 1145 if (prev) { 1146 entry = rb_entry(prev, struct btrfs_ordered_extent, rb_node); 1147 if (btrfs_range_overlaps(entry, file_offset, len)) 1148 goto out; 1149 } 1150 if (next) { 1151 entry = rb_entry(next, struct btrfs_ordered_extent, rb_node); 1152 if (btrfs_range_overlaps(entry, file_offset, len)) 1153 goto out; 1154 } 1155 /* No ordered extent in the range */ 1156 entry = NULL; 1157 out: 1158 if (entry) { 1159 refcount_inc(&entry->refs); 1160 trace_btrfs_ordered_extent_lookup_first_range(inode, entry); 1161 } 1162 1163 spin_unlock(&inode->ordered_tree_lock); 1164 return entry; 1165 } 1166 1167 /* 1168 * Lock the passed range and ensures all pending ordered extents in it are run 1169 * to completion. 1170 * 1171 * @inode: Inode whose ordered tree is to be searched 1172 * @start: Beginning of range to flush 1173 * @end: Last byte of range to lock 1174 * @cached_state: If passed, will return the extent state responsible for the 1175 * locked range. It's the caller's responsibility to free the 1176 * cached state. 1177 * 1178 * Always return with the given range locked, ensuring after it's called no 1179 * order extent can be pending. 1180 */ 1181 void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start, 1182 u64 end, 1183 struct extent_state **cached_state) 1184 { 1185 struct btrfs_ordered_extent *ordered; 1186 struct extent_state *cache = NULL; 1187 struct extent_state **cachedp = &cache; 1188 1189 if (cached_state) 1190 cachedp = cached_state; 1191 1192 while (1) { 1193 btrfs_lock_extent(&inode->io_tree, start, end, cachedp); 1194 ordered = btrfs_lookup_ordered_range(inode, start, 1195 end - start + 1); 1196 if (!ordered) { 1197 /* 1198 * If no external cached_state has been passed then 1199 * decrement the extra ref taken for cachedp since we 1200 * aren't exposing it outside of this function 1201 */ 1202 if (!cached_state) 1203 refcount_dec(&cache->refs); 1204 break; 1205 } 1206 btrfs_unlock_extent(&inode->io_tree, start, end, cachedp); 1207 btrfs_start_ordered_extent(ordered); 1208 btrfs_put_ordered_extent(ordered); 1209 } 1210 } 1211 1212 /* 1213 * Lock the passed range and ensure all pending ordered extents in it are run 1214 * to completion in nowait mode. 1215 * 1216 * Return true if btrfs_lock_ordered_range does not return any extents, 1217 * otherwise false. 1218 */ 1219 bool btrfs_try_lock_ordered_range(struct btrfs_inode *inode, u64 start, u64 end, 1220 struct extent_state **cached_state) 1221 { 1222 struct btrfs_ordered_extent *ordered; 1223 1224 if (!btrfs_try_lock_extent(&inode->io_tree, start, end, cached_state)) 1225 return false; 1226 1227 ordered = btrfs_lookup_ordered_range(inode, start, end - start + 1); 1228 if (!ordered) 1229 return true; 1230 1231 btrfs_put_ordered_extent(ordered); 1232 btrfs_unlock_extent(&inode->io_tree, start, end, cached_state); 1233 1234 return false; 1235 } 1236 1237 /* Split out a new ordered extent for this first @len bytes of @ordered. */ 1238 struct btrfs_ordered_extent *btrfs_split_ordered_extent( 1239 struct btrfs_ordered_extent *ordered, u64 len) 1240 { 1241 struct btrfs_inode *inode = ordered->inode; 1242 struct btrfs_root *root = inode->root; 1243 struct btrfs_fs_info *fs_info = root->fs_info; 1244 u64 file_offset = ordered->file_offset; 1245 u64 disk_bytenr = ordered->disk_bytenr; 1246 unsigned long flags = ordered->flags; 1247 struct btrfs_ordered_sum *sum, *tmpsum; 1248 struct btrfs_ordered_extent *new; 1249 struct rb_node *node; 1250 u64 offset = 0; 1251 1252 trace_btrfs_ordered_extent_split(inode, ordered); 1253 1254 ASSERT(!(flags & (1U << BTRFS_ORDERED_COMPRESSED)), "flags=0x%lx", flags); 1255 1256 /* 1257 * The entire bio must be covered by the ordered extent, but we can't 1258 * reduce the original extent to a zero length either. 1259 */ 1260 if (WARN_ON_ONCE(len >= ordered->num_bytes)) 1261 return ERR_PTR(-EINVAL); 1262 /* 1263 * If our ordered extent had an error there's no point in continuing. 1264 * The error may have come from a transaction abort done either by this 1265 * task or some other concurrent task, and the transaction abort path 1266 * iterates over all existing ordered extents and sets the flag 1267 * BTRFS_ORDERED_IOERR on them. 1268 */ 1269 if (unlikely(flags & (1U << BTRFS_ORDERED_IOERR))) { 1270 const int fs_error = BTRFS_FS_ERROR(fs_info); 1271 1272 return fs_error ? ERR_PTR(fs_error) : ERR_PTR(-EIO); 1273 } 1274 /* We cannot split partially completed ordered extents. */ 1275 if (ordered->bytes_left) { 1276 ASSERT(!(flags & ~BTRFS_ORDERED_TYPE_FLAGS), "flags=0x%lx", flags); 1277 if (WARN_ON_ONCE(ordered->bytes_left != ordered->disk_num_bytes)) 1278 return ERR_PTR(-EINVAL); 1279 } 1280 /* We cannot split a compressed ordered extent. */ 1281 if (WARN_ON_ONCE(ordered->disk_num_bytes != ordered->num_bytes)) 1282 return ERR_PTR(-EINVAL); 1283 1284 new = alloc_ordered_extent(inode, file_offset, len, len, disk_bytenr, 1285 len, 0, flags, ordered->compress_type); 1286 if (IS_ERR(new)) 1287 return new; 1288 1289 /* One ref for the tree. */ 1290 refcount_inc(&new->refs); 1291 1292 /* 1293 * Take the root's ordered_extent_lock to avoid a race with 1294 * btrfs_wait_ordered_extents() when updating the disk_bytenr and 1295 * disk_num_bytes fields of the ordered extent below. 1296 * 1297 * There's no concern about a previous caller of 1298 * btrfs_wait_ordered_extents() getting the trimmed ordered extent 1299 * before we insert the new one, because even if it gets the ordered 1300 * extent before it's trimmed and the new one inserted, right before it 1301 * uses it or during its use, the ordered extent might have been 1302 * trimmed in the meanwhile, and it missed the new ordered extent. 1303 * There's no way around this and it's harmless for current use cases, 1304 * so we take the root's ordered_extent_lock to fix that race during 1305 * trimming and silence tools like KCSAN. 1306 */ 1307 spin_lock_irq(&root->ordered_extent_lock); 1308 spin_lock(&inode->ordered_tree_lock); 1309 1310 /* 1311 * We don't have overlapping ordered extents (that would imply double 1312 * allocation of extents) and we checked above that the split length 1313 * does not cross the ordered extent's num_bytes field, so there's 1314 * no need to remove it and re-insert it in the tree. 1315 */ 1316 ordered->file_offset += len; 1317 ordered->disk_bytenr += len; 1318 ordered->num_bytes -= len; 1319 ordered->disk_num_bytes -= len; 1320 ordered->ram_bytes -= len; 1321 1322 if (test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags)) { 1323 ASSERT(ordered->bytes_left == 0, "ordered->bytes_left=%llu", 1324 ordered->bytes_left); 1325 new->bytes_left = 0; 1326 } else { 1327 ordered->bytes_left -= len; 1328 } 1329 1330 if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags)) { 1331 if (ordered->truncated_len > len) { 1332 ordered->truncated_len -= len; 1333 } else { 1334 new->truncated_len = ordered->truncated_len; 1335 ordered->truncated_len = 0; 1336 } 1337 } 1338 1339 list_for_each_entry_safe(sum, tmpsum, &ordered->csum_list, list) { 1340 if (offset == len) 1341 break; 1342 list_move_tail(&sum->list, &new->csum_list); 1343 offset += sum->len; 1344 } 1345 1346 node = tree_insert(&inode->ordered_tree, new->file_offset, &new->rb_node); 1347 if (unlikely(node)) 1348 btrfs_panic(fs_info, -EEXIST, 1349 "inconsistency in ordered tree at offset %llu after split", 1350 new->file_offset); 1351 spin_unlock(&inode->ordered_tree_lock); 1352 1353 list_add_tail(&new->root_extent_list, &root->ordered_extents); 1354 root->nr_ordered_extents++; 1355 spin_unlock_irq(&root->ordered_extent_lock); 1356 return new; 1357 } 1358 1359 int __init ordered_data_init(void) 1360 { 1361 btrfs_ordered_extent_cache = KMEM_CACHE(btrfs_ordered_extent, 0); 1362 if (!btrfs_ordered_extent_cache) 1363 return -ENOMEM; 1364 1365 return 0; 1366 } 1367 1368 void __cold ordered_data_exit(void) 1369 { 1370 kmem_cache_destroy(btrfs_ordered_extent_cache); 1371 } 1372