// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007 Oracle. All rights reserved.
 */

#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/writeback.h>
#include <linux/sched/mm.h>
#include "messages.h"
#include "misc.h"
#include "ctree.h"
#include "transaction.h"
#include "btrfs_inode.h"
#include "extent_io.h"
#include "disk-io.h"
#include "compression.h"
#include "delalloc-space.h"
#include "qgroup.h"
#include "subpage.h"
#include "file.h"
#include "block-group.h"

static struct kmem_cache *btrfs_ordered_extent_cache;

static u64 entry_end(struct btrfs_ordered_extent *entry)
{
	if (entry->file_offset + entry->num_bytes < entry->file_offset)
		return (u64)-1;
	return entry->file_offset + entry->num_bytes;
}

/*
 * Return NULL if the insertion worked, otherwise return the node that was
 * found in the tree.
 */
static struct rb_node *tree_insert(struct rb_root *root, u64 file_offset,
				   struct rb_node *node)
{
	struct rb_node **p = &root->rb_node;
	struct rb_node *parent = NULL;
	struct btrfs_ordered_extent *entry;

	while (*p) {
		parent = *p;
		entry = rb_entry(parent, struct btrfs_ordered_extent, rb_node);

		if (file_offset < entry->file_offset)
			p = &(*p)->rb_left;
		else if (file_offset >= entry_end(entry))
			p = &(*p)->rb_right;
		else
			return parent;
	}

	rb_link_node(node, parent, p);
	rb_insert_color(node, root);
	return NULL;
}

/*
 * Look for a given offset in the tree, and if it can't be found, return the
 * node with the first lesser offset.
 */
static struct rb_node *__tree_search(struct rb_root *root, u64 file_offset,
				     struct rb_node **prev_ret)
{
	struct rb_node *n = root->rb_node;
	struct rb_node *prev = NULL;
	struct rb_node *test;
	struct btrfs_ordered_extent *entry;
	struct btrfs_ordered_extent *prev_entry = NULL;

	while (n) {
		entry = rb_entry(n, struct btrfs_ordered_extent, rb_node);
		prev = n;
		prev_entry = entry;

		if (file_offset < entry->file_offset)
			n = n->rb_left;
		else if (file_offset >= entry_end(entry))
			n = n->rb_right;
		else
			return n;
	}
	if (!prev_ret)
		return NULL;

	while (prev && file_offset >= entry_end(prev_entry)) {
		test = rb_next(prev);
		if (!test)
			break;
		prev_entry = rb_entry(test, struct btrfs_ordered_extent,
				      rb_node);
		if (file_offset < entry_end(prev_entry))
			break;

		prev = test;
	}
	if (prev)
		prev_entry = rb_entry(prev, struct btrfs_ordered_extent,
				      rb_node);
	while (prev && file_offset < entry_end(prev_entry)) {
		test = rb_prev(prev);
		if (!test)
			break;
		prev_entry = rb_entry(test, struct btrfs_ordered_extent,
				      rb_node);
		prev = test;
	}
	*prev_ret = prev;
	return NULL;
}

static int btrfs_range_overlaps(struct btrfs_ordered_extent *entry, u64 file_offset,
				u64 len)
{
	if (file_offset + len <= entry->file_offset ||
	    entry->file_offset + entry->num_bytes <= file_offset)
		return 0;
	return 1;
}
/*
 * Look for the first ordered struct that covers the given offset, otherwise
 * the first one less than this offset.
 */
static inline struct rb_node *ordered_tree_search(struct btrfs_inode *inode,
						  u64 file_offset)
{
	struct rb_node *prev = NULL;
	struct rb_node *ret;
	struct btrfs_ordered_extent *entry;

	if (inode->ordered_tree_last) {
		entry = rb_entry(inode->ordered_tree_last, struct btrfs_ordered_extent,
				 rb_node);
		if (in_range(file_offset, entry->file_offset, entry->num_bytes))
			return inode->ordered_tree_last;
	}
	ret = __tree_search(&inode->ordered_tree, file_offset, &prev);
	if (!ret)
		ret = prev;
	if (ret)
		inode->ordered_tree_last = ret;
	return ret;
}
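/*
 * A worked example of the search contract above (hypothetical layout): with
 * ordered extents covering [0, 4K) and [8K, 12K), a search for offset 6K has
 * no exact hit, so ordered_tree_search() falls back to the node for [0, 4K),
 * the closest entry that ends at or before 6K. Callers that need an exact
 * containment check, such as btrfs_lookup_ordered_extent(), must therefore
 * re-check the returned entry with in_range() before using it.
 */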
static struct btrfs_ordered_extent *alloc_ordered_extent(
			struct btrfs_inode *inode, u64 file_offset, u64 num_bytes,
			u64 ram_bytes, u64 disk_bytenr, u64 disk_num_bytes,
			u64 offset, unsigned long flags, int compress_type)
{
	struct btrfs_ordered_extent *entry;
	int ret;
	u64 qgroup_rsv = 0;
	const bool is_nocow = (flags &
			       ((1U << BTRFS_ORDERED_NOCOW) | (1U << BTRFS_ORDERED_PREALLOC)));

	/* Only one type flag can be set. */
	ASSERT(has_single_bit_set(flags & BTRFS_ORDERED_EXCLUSIVE_FLAGS));

	/* DIRECT cannot be set with COMPRESSED nor ENCODED. */
	if (test_bit(BTRFS_ORDERED_DIRECT, &flags)) {
		ASSERT(!test_bit(BTRFS_ORDERED_COMPRESSED, &flags));
		ASSERT(!test_bit(BTRFS_ORDERED_ENCODED, &flags));
	}

	/* ENCODED must be set with COMPRESSED. */
	if (test_bit(BTRFS_ORDERED_ENCODED, &flags))
		ASSERT(test_bit(BTRFS_ORDERED_COMPRESSED, &flags));

	/*
	 * For a NOCOW write we can free the qgroup reserve right now. For a COW
	 * one we transfer the reserved space from the inode's iotree into the
	 * ordered extent by calling btrfs_qgroup_release_data() and tracking
	 * the qgroup reserved amount in the ordered extent, so that later after
	 * completing the ordered extent, when running the data delayed ref it
	 * creates, we free the reserved data with btrfs_qgroup_free_refroot().
	 */
	if (is_nocow)
		ret = btrfs_qgroup_free_data(inode, NULL, file_offset, num_bytes, &qgroup_rsv);
	else
		ret = btrfs_qgroup_release_data(inode, file_offset, num_bytes, &qgroup_rsv);

	if (ret < 0)
		return ERR_PTR(ret);

	entry = kmem_cache_zalloc(btrfs_ordered_extent_cache, GFP_NOFS);
	if (!entry) {
		entry = ERR_PTR(-ENOMEM);
		goto out;
	}

	entry->file_offset = file_offset;
	entry->num_bytes = num_bytes;
	entry->ram_bytes = ram_bytes;
	entry->disk_bytenr = disk_bytenr;
	entry->disk_num_bytes = disk_num_bytes;
	entry->offset = offset;
	entry->bytes_left = num_bytes;
	if (WARN_ON_ONCE(!igrab(&inode->vfs_inode))) {
		kmem_cache_free(btrfs_ordered_extent_cache, entry);
		entry = ERR_PTR(-ESTALE);
		goto out;
	}
	entry->inode = inode;
	entry->compress_type = compress_type;
	entry->truncated_len = (u64)-1;
	entry->qgroup_rsv = qgroup_rsv;
	entry->flags = flags;
	refcount_set(&entry->refs, 1);
	init_waitqueue_head(&entry->wait);
	INIT_LIST_HEAD(&entry->csum_list);
	INIT_LIST_HEAD(&entry->log_list);
	INIT_LIST_HEAD(&entry->root_extent_list);
	INIT_LIST_HEAD(&entry->work_list);
	INIT_LIST_HEAD(&entry->bioc_list);
	init_completion(&entry->completion);

	/*
	 * We don't need the count_max_extents here, we can assume that all of
	 * that work has been done at higher layers, so this is truly the
	 * smallest the extent is going to get.
	 */
	spin_lock(&inode->lock);
	btrfs_mod_outstanding_extents(inode, 1);
	spin_unlock(&inode->lock);

out:
	if (IS_ERR(entry) && !is_nocow)
		btrfs_qgroup_free_refroot(inode->root->fs_info,
					  btrfs_root_id(inode->root),
					  qgroup_rsv, BTRFS_QGROUP_RSV_DATA);

	return entry;
}

static void insert_ordered_extent(struct btrfs_ordered_extent *entry)
{
	struct btrfs_inode *inode = entry->inode;
	struct btrfs_root *root = inode->root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct rb_node *node;

	trace_btrfs_ordered_extent_add(inode, entry);

	percpu_counter_add_batch(&fs_info->ordered_bytes, entry->num_bytes,
				 fs_info->delalloc_batch);

	/* One ref for the tree. */
	refcount_inc(&entry->refs);

	spin_lock(&inode->ordered_tree_lock);
	node = tree_insert(&inode->ordered_tree, entry->file_offset,
			   &entry->rb_node);
	if (unlikely(node)) {
		struct btrfs_ordered_extent *exist =
			rb_entry(node, struct btrfs_ordered_extent, rb_node);

		btrfs_panic(fs_info, -EEXIST,
"overlapping ordered extents, existing oe file_offset %llu num_bytes %llu flags 0x%lx, new oe file_offset %llu num_bytes %llu flags 0x%lx",
			    exist->file_offset, exist->num_bytes, exist->flags,
			    entry->file_offset, entry->num_bytes, entry->flags);
	}
	spin_unlock(&inode->ordered_tree_lock);

	spin_lock(&root->ordered_extent_lock);
	list_add_tail(&entry->root_extent_list,
		      &root->ordered_extents);
	root->nr_ordered_extents++;
	if (root->nr_ordered_extents == 1) {
		spin_lock(&fs_info->ordered_root_lock);
		BUG_ON(!list_empty(&root->ordered_root));
		list_add_tail(&root->ordered_root, &fs_info->ordered_roots);
		spin_unlock(&fs_info->ordered_root_lock);
	}
	spin_unlock(&root->ordered_extent_lock);
}
/*
 * Add an ordered extent to the per-inode tree.
 *
 * @inode:          Inode that this extent is for.
 * @file_offset:    Logical offset in file where the extent starts.
 * @num_bytes:      Logical length of extent in file.
 * @ram_bytes:      Full length of unencoded data.
 * @disk_bytenr:    Offset of extent on disk.
 * @disk_num_bytes: Size of extent on disk.
 * @offset:         Offset into unencoded data where file data starts.
 * @flags:          Flags specifying type of extent (1U << BTRFS_ORDERED_*).
 * @compress_type:  Compression algorithm used for data.
 *
 * Most of these parameters correspond to &struct btrfs_file_extent_item. The
 * tree is given a single reference on the ordered extent that was inserted,
 * and the returned pointer is given a second reference.
 *
 * Return: the new ordered extent or error pointer.
 */
struct btrfs_ordered_extent *btrfs_alloc_ordered_extent(
			struct btrfs_inode *inode, u64 file_offset,
			const struct btrfs_file_extent *file_extent, unsigned long flags)
{
	struct btrfs_ordered_extent *entry;

	ASSERT((flags & ~BTRFS_ORDERED_TYPE_FLAGS) == 0);

	/*
	 * For regular writes, we just use the members in @file_extent.
	 *
	 * For NOCOW, we don't really care about the numbers except @start and
	 * file_extent->num_bytes, as we won't insert a file extent item at all.
	 *
	 * For PREALLOC, we do not use ordered extent members, but
	 * btrfs_mark_extent_written() handles everything.
	 *
	 * So here we always pass 0 as offset for NOCOW/PREALLOC ordered extents,
	 * or btrfs_split_ordered_extent() cannot handle it correctly.
	 */
	if (flags & ((1U << BTRFS_ORDERED_NOCOW) | (1U << BTRFS_ORDERED_PREALLOC)))
		entry = alloc_ordered_extent(inode, file_offset,
					     file_extent->num_bytes,
					     file_extent->num_bytes,
					     file_extent->disk_bytenr + file_extent->offset,
					     file_extent->num_bytes, 0, flags,
					     file_extent->compression);
	else
		entry = alloc_ordered_extent(inode, file_offset,
					     file_extent->num_bytes,
					     file_extent->ram_bytes,
					     file_extent->disk_bytenr,
					     file_extent->disk_num_bytes,
					     file_extent->offset, flags,
					     file_extent->compression);
	if (!IS_ERR(entry))
		insert_ordered_extent(entry);
	return entry;
}
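/*
 * Illustrative usage sketch (assumed caller, not part of this file): a COW
 * write path that has just allocated an extent and filled in a struct
 * btrfs_file_extent 'fe' could create and later release the ordered extent
 * like this:
 *
 *	struct btrfs_ordered_extent *oe;
 *
 *	oe = btrfs_alloc_ordered_extent(inode, start, &fe,
 *					1U << BTRFS_ORDERED_REGULAR);
 *	if (IS_ERR(oe))
 *		return PTR_ERR(oe);
 *	... submit the write bios for the range ...
 *	btrfs_put_ordered_extent(oe);
 *
 * The tree keeps its own reference, so dropping the caller's reference does
 * not free the ordered extent until it completes and is removed.
 */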
/*
 * Add a struct btrfs_ordered_sum into the list of checksums to be inserted
 * when an ordered extent is finished. If the list covers more than one
 * ordered extent, it is split across multiple ordered extents.
 */
void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry,
			   struct btrfs_ordered_sum *sum)
{
	struct btrfs_inode *inode = entry->inode;

	spin_lock(&inode->ordered_tree_lock);
	list_add_tail(&sum->list, &entry->csum_list);
	spin_unlock(&inode->ordered_tree_lock);
}

void btrfs_mark_ordered_extent_error(struct btrfs_ordered_extent *ordered)
{
	if (!test_and_set_bit(BTRFS_ORDERED_IOERR, &ordered->flags))
		mapping_set_error(ordered->inode->vfs_inode.i_mapping, -EIO);
}

static void finish_ordered_fn(struct btrfs_work *work)
{
	struct btrfs_ordered_extent *ordered_extent;

	ordered_extent = container_of(work, struct btrfs_ordered_extent, work);
	btrfs_finish_ordered_io(ordered_extent);
}

static bool can_finish_ordered_extent(struct btrfs_ordered_extent *ordered,
				      u64 file_offset, u64 len, bool uptodate)
{
	struct btrfs_inode *inode = ordered->inode;
	struct btrfs_fs_info *fs_info = inode->root->fs_info;

	lockdep_assert_held(&inode->ordered_tree_lock);

	/* Now we're fine to update the accounting. */
	if (WARN_ON_ONCE(len > ordered->bytes_left)) {
		btrfs_crit(fs_info,
"bad ordered extent accounting, root=%llu ino=%llu OE offset=%llu OE len=%llu to_dec=%llu left=%llu",
			   btrfs_root_id(inode->root), btrfs_ino(inode),
			   ordered->file_offset, ordered->num_bytes,
			   len, ordered->bytes_left);
		ordered->bytes_left = 0;
	} else {
		ordered->bytes_left -= len;
	}

	if (!uptodate)
		btrfs_mark_ordered_extent_error(ordered);

	if (ordered->bytes_left)
		return false;

	/*
	 * All the IO of the ordered extent is finished, we need to queue
	 * the finish_func to be executed.
	 */
	set_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags);
	cond_wake_up(&ordered->wait);
	refcount_inc(&ordered->refs);
	trace_btrfs_ordered_extent_mark_finished(inode, ordered);
	return true;
}

static void btrfs_queue_ordered_fn(struct btrfs_ordered_extent *ordered)
{
	struct btrfs_inode *inode = ordered->inode;
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct btrfs_workqueue *wq = btrfs_is_free_space_inode(inode) ?
		fs_info->endio_freespace_worker : fs_info->endio_write_workers;

	btrfs_init_work(&ordered->work, finish_ordered_fn, NULL);
	btrfs_queue_work(wq, &ordered->work);
}

void btrfs_finish_ordered_extent(struct btrfs_ordered_extent *ordered,
				 u64 file_offset, u64 len, bool uptodate)
{
	struct btrfs_inode *inode = ordered->inode;
	bool ret;

	trace_btrfs_finish_ordered_extent(inode, file_offset, len, uptodate);

	spin_lock(&inode->ordered_tree_lock);
	ret = can_finish_ordered_extent(ordered, file_offset, len, uptodate);
	spin_unlock(&inode->ordered_tree_lock);

	/*
	 * If this is a COW write it means we created new extent maps for the
	 * range and they point to unwritten locations if we got an error either
	 * before submitting a bio or during IO.
	 *
	 * We have marked the ordered extent with BTRFS_ORDERED_IOERR, and we
	 * are queuing its completion below. During completion, at
	 * btrfs_finish_one_ordered(), we will drop the extent maps for the
	 * unwritten extents.
	 *
	 * However because completion runs in a work queue we can end up having
	 * a fast fsync running before that. In the case of direct IO, once we
	 * unlock the inode the fsync might start, and we queue the completion
	 * before unlocking the inode. In the case of buffered IO when writeback
	 * finishes (end_bbio_data_write()) we queue the completion, so if the
	 * writeback was triggered by a fast fsync, the fsync might start
	 * logging before ordered extent completion runs in the work queue.
	 *
	 * The fast fsync will log file extent items based on the extent maps it
	 * finds, so if by the time it collects extent maps the ordered extent
	 * completion didn't happen yet, it will log file extent items that
	 * point to unwritten extents, resulting in a corruption if a crash
	 * happens and the log tree is replayed. Note that a fast fsync does not
	 * wait for completion of ordered extents in order to reduce latency.
	 *
	 * Set a flag in the inode so that the next fast fsync will wait for
	 * ordered extents to complete before starting to log.
	 */
	if (!uptodate && !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags))
		set_bit(BTRFS_INODE_COW_WRITE_ERROR, &inode->runtime_flags);

	if (ret)
		btrfs_queue_ordered_fn(ordered);
}
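/*
 * Illustrative sketch (assumed caller shape): a write bio completion handler
 * reports each completed sub-range against the ordered extent the bio was
 * submitted under, e.g.:
 *
 *	btrfs_finish_ordered_extent(ordered, file_offset, len,
 *				    !bio->bi_status);
 *
 * Once all of the ordered extent's bytes have been reported this way, the
 * final call queues finish_ordered_fn() and btrfs_finish_ordered_io() runs
 * from the endio workqueue.
 */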
/*
 * Mark the IO of all ordered extents inside the specified range as finished.
 *
 * This function is called for endio, thus the range must have ordered
 * extent(s) covering it. Callers must ensure the endio function is not
 * executed twice for the same range (this is guaranteed for direct IO and
 * compressed writes).
 */
void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode,
				    u64 file_offset, u64 num_bytes, bool uptodate)
{
	struct rb_node *node;
	struct btrfs_ordered_extent *entry = NULL;
	u64 cur = file_offset;
	const u64 end = file_offset + num_bytes;

	trace_btrfs_writepage_end_io_hook(inode, file_offset, end - 1, uptodate);

	spin_lock(&inode->ordered_tree_lock);
	while (cur < end) {
		u64 entry_end;
		u64 this_end;
		u64 len;

		node = ordered_tree_search(inode, cur);
		/* No ordered extents at all */
		if (!node)
			break;

		entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
		entry_end = entry->file_offset + entry->num_bytes;
		/*
		 * |<-- OE --->|  |
		 *		  cur
		 * Go to next OE.
		 */
		if (cur >= entry_end) {
			node = rb_next(node);
			/* No more ordered extents, exit */
			if (!node)
				break;
			entry = rb_entry(node, struct btrfs_ordered_extent,
					 rb_node);

			/* Go to next ordered extent and continue */
			cur = entry->file_offset;
			continue;
		}
		/*
		 * |	|<--- OE --->|
		 * cur
		 * Go to the start of OE.
		 */
		if (cur < entry->file_offset) {
			cur = entry->file_offset;
			continue;
		}

		/*
		 * Now we are definitely inside one ordered extent.
		 *
		 * |<--- OE --->|
		 *	|
		 *	cur
		 */
		this_end = min(entry_end, end);
		len = this_end - cur;
		ASSERT(len < U32_MAX);

		if (can_finish_ordered_extent(entry, cur, len, uptodate)) {
			spin_unlock(&inode->ordered_tree_lock);
			btrfs_queue_ordered_fn(entry);
			spin_lock(&inode->ordered_tree_lock);
		}
		cur += len;
	}
	spin_unlock(&inode->ordered_tree_lock);
}
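/*
 * Worked example for the walk above (hypothetical layout): finishing the
 * range [0, 24K) when it is covered by OE1 [0, 16K) and OE2 [16K, 32K)
 * decrements 16K from OE1, completing it and queuing its finish work, and
 * then 8K from OE2, which stays pending until its remaining 8K is reported.
 */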
/*
 * Finish IO for one ordered extent across a given range. The range can only
 * contain one ordered extent.
 *
 * @cached:      The cached ordered extent. If not NULL, we can skip the tree
 *               search and use the ordered extent directly.
 *               Will also be used to store the finished ordered extent.
 * @file_offset: File offset for the finished IO
 * @io_size:     Length of the finished IO range
 *
 * Return true if the ordered extent is finished in the range, and update
 * @cached.
 * Return false otherwise.
 *
 * NOTE: The range can NOT cross multiple ordered extents.
 * Thus the caller should ensure the range doesn't cross ordered extents.
 */
bool btrfs_dec_test_ordered_pending(struct btrfs_inode *inode,
				    struct btrfs_ordered_extent **cached,
				    u64 file_offset, u64 io_size)
{
	struct rb_node *node;
	struct btrfs_ordered_extent *entry = NULL;
	bool finished = false;

	spin_lock(&inode->ordered_tree_lock);
	if (cached && *cached) {
		entry = *cached;
		goto have_entry;
	}

	node = ordered_tree_search(inode, file_offset);
	if (!node)
		goto out;

	entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
have_entry:
	if (!in_range(file_offset, entry->file_offset, entry->num_bytes))
		goto out;

	if (io_size > entry->bytes_left)
		btrfs_crit(inode->root->fs_info,
			   "bad ordered accounting left %llu size %llu",
			   entry->bytes_left, io_size);

	entry->bytes_left -= io_size;

	if (entry->bytes_left == 0) {
		/*
		 * Ensure only one caller can set the flag and update
		 * 'finished' accordingly.
		 */
		finished = !test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
		/* test_and_set_bit implies a barrier */
		cond_wake_up_nomb(&entry->wait);
	}
out:
	if (finished && cached && entry) {
		*cached = entry;
		refcount_inc(&entry->refs);
		trace_btrfs_ordered_extent_dec_test_pending(inode, entry);
	}
	spin_unlock(&inode->ordered_tree_lock);
	return finished;
}
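/*
 * Illustrative usage (assumed caller, not part of this file):
 *
 *	struct btrfs_ordered_extent *cached = NULL;
 *
 *	if (btrfs_dec_test_ordered_pending(inode, &cached, offset, len)) {
 *		btrfs_finish_ordered_io(cached);
 *		btrfs_put_ordered_extent(cached);
 *	}
 *
 * On success @cached holds an extra reference that the caller must drop with
 * btrfs_put_ordered_extent().
 */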
/*
 * Used to drop a reference on an ordered extent. This will free the extent
 * if the last reference is dropped.
 */
void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
{
	trace_btrfs_ordered_extent_put(entry->inode, entry);

	if (refcount_dec_and_test(&entry->refs)) {
		struct btrfs_ordered_sum *sum;
		struct btrfs_ordered_sum *tmp;

		ASSERT(list_empty(&entry->root_extent_list));
		ASSERT(list_empty(&entry->log_list));
		ASSERT(RB_EMPTY_NODE(&entry->rb_node));
		btrfs_add_delayed_iput(entry->inode);
		list_for_each_entry_safe(sum, tmp, &entry->csum_list, list)
			kvfree(sum);
		kmem_cache_free(btrfs_ordered_extent_cache, entry);
	}
}

/*
 * Remove an ordered extent from the tree. No references are dropped, but any
 * waiters are woken up.
 */
void btrfs_remove_ordered_extent(struct btrfs_ordered_extent *entry)
{
	struct btrfs_inode *btrfs_inode = entry->inode;
	struct btrfs_root *root = btrfs_inode->root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct rb_node *node;
	bool pending;
	bool freespace_inode;

	/*
	 * If this is a free space inode the thread has not acquired the ordered
	 * extents lockdep map.
	 */
	freespace_inode = btrfs_is_free_space_inode(btrfs_inode);

	btrfs_lockdep_acquire(fs_info, btrfs_trans_pending_ordered);
	/* This is paired with alloc_ordered_extent(). */
	spin_lock(&btrfs_inode->lock);
	btrfs_mod_outstanding_extents(btrfs_inode, -1);
	spin_unlock(&btrfs_inode->lock);
	if (root != fs_info->tree_root) {
		u64 release;

		if (test_bit(BTRFS_ORDERED_ENCODED, &entry->flags))
			release = entry->disk_num_bytes;
		else
			release = entry->num_bytes;
		btrfs_delalloc_release_metadata(btrfs_inode, release,
						test_bit(BTRFS_ORDERED_IOERR,
							 &entry->flags));
	}

	percpu_counter_add_batch(&fs_info->ordered_bytes, -entry->num_bytes,
				 fs_info->delalloc_batch);

	spin_lock(&btrfs_inode->ordered_tree_lock);
	node = &entry->rb_node;
	rb_erase(node, &btrfs_inode->ordered_tree);
	RB_CLEAR_NODE(node);
	if (btrfs_inode->ordered_tree_last == node)
		btrfs_inode->ordered_tree_last = NULL;
	set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
	pending = test_and_clear_bit(BTRFS_ORDERED_PENDING, &entry->flags);
	spin_unlock(&btrfs_inode->ordered_tree_lock);

	/*
	 * The current running transaction is waiting on us, we need to let it
	 * know that we're complete and wake it up.
	 */
	if (pending) {
		struct btrfs_transaction *trans;

		/*
		 * The checks for trans are just a formality, it should be set,
		 * but if it isn't we don't want to deref/assert under the spin
		 * lock, so be nice and check if trans is set, but ASSERT() so
		 * if it isn't set a developer will notice.
		 */
		spin_lock(&fs_info->trans_lock);
		trans = fs_info->running_transaction;
		if (trans)
			refcount_inc(&trans->use_count);
		spin_unlock(&fs_info->trans_lock);

		ASSERT(trans || BTRFS_FS_ERROR(fs_info));
		if (trans) {
			if (atomic_dec_and_test(&trans->pending_ordered))
				wake_up(&trans->pending_wait);
			btrfs_put_transaction(trans);
		}
	}

	btrfs_lockdep_release(fs_info, btrfs_trans_pending_ordered);

	spin_lock(&root->ordered_extent_lock);
	list_del_init(&entry->root_extent_list);
	root->nr_ordered_extents--;

	trace_btrfs_ordered_extent_remove(btrfs_inode, entry);

	if (!root->nr_ordered_extents) {
		spin_lock(&fs_info->ordered_root_lock);
		BUG_ON(list_empty(&root->ordered_root));
		list_del_init(&root->ordered_root);
		spin_unlock(&fs_info->ordered_root_lock);
	}
	spin_unlock(&root->ordered_extent_lock);
	wake_up(&entry->wait);
	if (!freespace_inode)
		btrfs_lockdep_release(fs_info, btrfs_ordered_extent);
}
static void btrfs_run_ordered_extent_work(struct btrfs_work *work)
{
	struct btrfs_ordered_extent *ordered;

	ordered = container_of(work, struct btrfs_ordered_extent, flush_work);
	btrfs_start_ordered_extent(ordered);
	complete(&ordered->completion);
}

/*
 * Wait for all the ordered extents in a root, restricted to the range of
 * @bg, or the whole range if @bg is NULL.
 */
u64 btrfs_wait_ordered_extents(struct btrfs_root *root, u64 nr,
			       const struct btrfs_block_group *bg)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	LIST_HEAD(splice);
	LIST_HEAD(skipped);
	LIST_HEAD(works);
	struct btrfs_ordered_extent *ordered, *next;
	u64 count = 0;
	u64 range_start, range_len;
	u64 range_end;

	if (bg) {
		range_start = bg->start;
		range_len = bg->length;
	} else {
		range_start = 0;
		range_len = U64_MAX;
	}
	range_end = range_start + range_len;

	mutex_lock(&root->ordered_extent_mutex);
	spin_lock(&root->ordered_extent_lock);
	list_splice_init(&root->ordered_extents, &splice);
	while (!list_empty(&splice) && nr) {
		ordered = list_first_entry(&splice, struct btrfs_ordered_extent,
					   root_extent_list);

		if (range_end <= ordered->disk_bytenr ||
		    ordered->disk_bytenr + ordered->disk_num_bytes <= range_start) {
			list_move_tail(&ordered->root_extent_list, &skipped);
			cond_resched_lock(&root->ordered_extent_lock);
			continue;
		}

		list_move_tail(&ordered->root_extent_list,
			       &root->ordered_extents);
		refcount_inc(&ordered->refs);
		spin_unlock(&root->ordered_extent_lock);

		btrfs_init_work(&ordered->flush_work,
				btrfs_run_ordered_extent_work, NULL);
		list_add_tail(&ordered->work_list, &works);
		btrfs_queue_work(fs_info->flush_workers, &ordered->flush_work);

		cond_resched();
		if (nr != U64_MAX)
			nr--;
		count++;
		spin_lock(&root->ordered_extent_lock);
	}
	list_splice_tail(&skipped, &root->ordered_extents);
	list_splice_tail(&splice, &root->ordered_extents);
	spin_unlock(&root->ordered_extent_lock);

	list_for_each_entry_safe(ordered, next, &works, work_list) {
		list_del_init(&ordered->work_list);
		wait_for_completion(&ordered->completion);
		btrfs_put_ordered_extent(ordered);
		cond_resched();
	}
	mutex_unlock(&root->ordered_extent_mutex);

	return count;
}

/*
 * Wait for @nr ordered extents that intersect the @bg, or the whole range of
 * the filesystem if @bg is NULL.
 */
void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr,
			      const struct btrfs_block_group *bg)
{
	struct btrfs_root *root;
	LIST_HEAD(splice);
	u64 done;

	mutex_lock(&fs_info->ordered_operations_mutex);
	spin_lock(&fs_info->ordered_root_lock);
	list_splice_init(&fs_info->ordered_roots, &splice);
	while (!list_empty(&splice) && nr) {
		root = list_first_entry(&splice, struct btrfs_root,
					ordered_root);
		root = btrfs_grab_root(root);
		BUG_ON(!root);
		list_move_tail(&root->ordered_root,
			       &fs_info->ordered_roots);
		spin_unlock(&fs_info->ordered_root_lock);

		done = btrfs_wait_ordered_extents(root, nr, bg);
		btrfs_put_root(root);

		if (nr != U64_MAX)
			nr -= done;

		spin_lock(&fs_info->ordered_root_lock);
	}
	list_splice_tail(&splice, &fs_info->ordered_roots);
	spin_unlock(&fs_info->ordered_root_lock);
	mutex_unlock(&fs_info->ordered_operations_mutex);
}
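/*
 * For example, to flush and wait for every ordered extent on the filesystem
 * (no count limit, no block group restriction), a caller can do:
 *
 *	btrfs_wait_ordered_roots(fs_info, U64_MAX, NULL);
 */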
/*
 * Start IO and wait for a given ordered extent to finish.
 *
 * Wait on page writeback for all the pages in the extent but not in
 * [@nowriteback_start, @nowriteback_start + @nowriteback_len), and wait for
 * the IO completion code to insert metadata into the btree corresponding to
 * the extent.
 */
void btrfs_start_ordered_extent_nowriteback(struct btrfs_ordered_extent *entry,
					    u64 nowriteback_start, u32 nowriteback_len)
{
	u64 start = entry->file_offset;
	u64 end = start + entry->num_bytes - 1;
	struct btrfs_inode *inode = entry->inode;
	bool freespace_inode;

	trace_btrfs_ordered_extent_start(inode, entry);

	/*
	 * If this is a free space inode do not take the ordered extents lockdep
	 * map.
	 */
	freespace_inode = btrfs_is_free_space_inode(inode);

	/*
	 * Pages in the range can be dirty, clean or writeback. We start IO on
	 * any dirty ones so the wait doesn't stall waiting for the flusher
	 * thread to find them.
	 */
	if (!test_bit(BTRFS_ORDERED_DIRECT, &entry->flags)) {
		if (!nowriteback_len) {
			filemap_fdatawrite_range(inode->vfs_inode.i_mapping, start, end);
		} else {
			if (start < nowriteback_start)
				filemap_fdatawrite_range(inode->vfs_inode.i_mapping, start,
							 nowriteback_start - 1);
			if (nowriteback_start + nowriteback_len < end)
				filemap_fdatawrite_range(inode->vfs_inode.i_mapping,
							 nowriteback_start + nowriteback_len,
							 end);
		}
	}

	if (!freespace_inode)
		btrfs_might_wait_for_event(inode->root->fs_info, btrfs_ordered_extent);
	wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE, &entry->flags));
}

/*
 * Used to wait on ordered extents across a large range of bytes.
 */
int btrfs_wait_ordered_range(struct btrfs_inode *inode, u64 start, u64 len)
{
	int ret = 0;
	int ret_wb = 0;
	u64 end;
	u64 orig_end;
	struct btrfs_ordered_extent *ordered;

	if (start + len < start) {
		orig_end = OFFSET_MAX;
	} else {
		orig_end = start + len - 1;
		if (orig_end > OFFSET_MAX)
			orig_end = OFFSET_MAX;
	}

	/*
	 * Start IO across the range first to instantiate any delalloc
	 * extents.
	 */
	ret = btrfs_fdatawrite_range(inode, start, orig_end);
	if (ret)
		return ret;

	/*
	 * If we have a writeback error don't return immediately. Wait first
	 * for any ordered extents that haven't completed yet. This is to make
	 * sure no one can dirty the same page ranges and call writepages()
	 * before the ordered extents complete - to avoid failures (-EEXIST)
	 * when adding the new ordered extents to the ordered tree.
	 */
	ret_wb = filemap_fdatawait_range(inode->vfs_inode.i_mapping, start, orig_end);

	end = orig_end;
	while (1) {
		ordered = btrfs_lookup_first_ordered_extent(inode, end);
		if (!ordered)
			break;
		if (ordered->file_offset > orig_end) {
			btrfs_put_ordered_extent(ordered);
			break;
		}
		if (ordered->file_offset + ordered->num_bytes <= start) {
			btrfs_put_ordered_extent(ordered);
			break;
		}
		btrfs_start_ordered_extent(ordered);
		end = ordered->file_offset;
		/*
		 * If the ordered extent had an error save the error but don't
		 * exit without waiting first for all other ordered extents in
		 * the range to complete.
		 */
		if (test_bit(BTRFS_ORDERED_IOERR, &ordered->flags))
			ret = -EIO;
		btrfs_put_ordered_extent(ordered);
		if (end == 0 || end == start)
			break;
		end--;
	}
	return ret_wb ? ret_wb : ret;
}
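/*
 * Illustrative usage (assumed caller): fsync-style paths wait for a byte
 * range to be fully persisted before examining the resulting metadata:
 *
 *	ret = btrfs_wait_ordered_range(inode, start, end - start + 1);
 *	if (ret)
 *		return ret;
 */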
/*
 * Find an ordered extent corresponding to @file_offset. Return NULL if
 * nothing is found, otherwise take a reference on the extent and return it.
 */
struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *inode,
							 u64 file_offset)
{
	struct rb_node *node;
	struct btrfs_ordered_extent *entry = NULL;

	spin_lock(&inode->ordered_tree_lock);
	node = ordered_tree_search(inode, file_offset);
	if (!node)
		goto out;

	entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
	if (!in_range(file_offset, entry->file_offset, entry->num_bytes))
		entry = NULL;
	if (entry) {
		refcount_inc(&entry->refs);
		trace_btrfs_ordered_extent_lookup(inode, entry);
	}
out:
	spin_unlock(&inode->ordered_tree_lock);
	return entry;
}

/*
 * Since the DIO code tries to lock a wide area we need to look for any ordered
 * extents that exist in the range, rather than just the start of the range.
 */
struct btrfs_ordered_extent *btrfs_lookup_ordered_range(
		struct btrfs_inode *inode, u64 file_offset, u64 len)
{
	struct rb_node *node;
	struct btrfs_ordered_extent *entry = NULL;

	spin_lock(&inode->ordered_tree_lock);
	node = ordered_tree_search(inode, file_offset);
	if (!node) {
		node = ordered_tree_search(inode, file_offset + len);
		if (!node)
			goto out;
	}

	while (1) {
		entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
		if (btrfs_range_overlaps(entry, file_offset, len))
			break;

		if (entry->file_offset >= file_offset + len) {
			entry = NULL;
			break;
		}
		entry = NULL;
		node = rb_next(node);
		if (!node)
			break;
	}
out:
	if (entry) {
		refcount_inc(&entry->refs);
		trace_btrfs_ordered_extent_lookup_range(inode, entry);
	}
	spin_unlock(&inode->ordered_tree_lock);
	return entry;
}
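/*
 * Worked contrast between the two lookups above (hypothetical layout): with
 * a single ordered extent covering [8K, 16K),
 * btrfs_lookup_ordered_extent(inode, 4K) returns NULL because offset 4K is
 * not inside any ordered extent, while
 * btrfs_lookup_ordered_range(inode, 4K, 8K) returns the [8K, 16K) extent
 * because the queried range [4K, 12K) overlaps it.
 */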
/*
 * Adds all ordered extents to the given list. The list ends up sorted by the
 * file_offset of the ordered extents.
 */
void btrfs_get_ordered_extents_for_logging(struct btrfs_inode *inode,
					   struct list_head *list)
{
	struct rb_node *n;

	btrfs_assert_inode_locked(inode);

	spin_lock(&inode->ordered_tree_lock);
	for (n = rb_first(&inode->ordered_tree); n; n = rb_next(n)) {
		struct btrfs_ordered_extent *ordered;

		ordered = rb_entry(n, struct btrfs_ordered_extent, rb_node);

		if (test_bit(BTRFS_ORDERED_LOGGED, &ordered->flags))
			continue;

		ASSERT(list_empty(&ordered->log_list));
		list_add_tail(&ordered->log_list, list);
		refcount_inc(&ordered->refs);
		trace_btrfs_ordered_extent_lookup_for_logging(inode, ordered);
	}
	spin_unlock(&inode->ordered_tree_lock);
}

/*
 * Look up and return any ordered extent found before @file_offset. NULL is
 * returned if none is found.
 */
struct btrfs_ordered_extent *
btrfs_lookup_first_ordered_extent(struct btrfs_inode *inode, u64 file_offset)
{
	struct rb_node *node;
	struct btrfs_ordered_extent *entry = NULL;

	spin_lock(&inode->ordered_tree_lock);
	node = ordered_tree_search(inode, file_offset);
	if (!node)
		goto out;

	entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
	refcount_inc(&entry->refs);
	trace_btrfs_ordered_extent_lookup_first(inode, entry);
out:
	spin_unlock(&inode->ordered_tree_lock);
	return entry;
}

/*
 * Lookup the first ordered extent that overlaps the range
 * [@file_offset, @file_offset + @len).
 *
 * The difference between this and btrfs_lookup_first_ordered_extent() is
 * that this one won't return any ordered extent that does not overlap the
 * range. And the difference against btrfs_lookup_ordered_extent() is that
 * this function ensures the first ordered extent in the range gets returned.
 */
struct btrfs_ordered_extent *btrfs_lookup_first_ordered_range(
		struct btrfs_inode *inode, u64 file_offset, u64 len)
{
	struct rb_node *node;
	struct rb_node *cur;
	struct rb_node *prev;
	struct rb_node *next;
	struct btrfs_ordered_extent *entry = NULL;

	spin_lock(&inode->ordered_tree_lock);
	node = inode->ordered_tree.rb_node;
	/*
	 * Here we don't want to use ordered_tree_search(), which will use the
	 * cached ordered_tree_last node and screw up the search order.
	 * And __tree_search() can't return the adjacent ordered extents
	 * either, thus here we do our own search.
	 */
	while (node) {
		entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);

		if (file_offset < entry->file_offset) {
			node = node->rb_left;
		} else if (file_offset >= entry_end(entry)) {
			node = node->rb_right;
		} else {
			/*
			 * Direct hit, got an ordered extent that covers
			 * @file_offset.
			 */
			goto out;
		}
	}
	if (!entry) {
		/* Empty tree */
		goto out;
	}

	cur = &entry->rb_node;
	/* We got an entry around @file_offset, check adjacent entries */
	if (entry->file_offset < file_offset) {
		prev = cur;
		next = rb_next(cur);
	} else {
		prev = rb_prev(cur);
		next = cur;
	}
	if (prev) {
		entry = rb_entry(prev, struct btrfs_ordered_extent, rb_node);
		if (btrfs_range_overlaps(entry, file_offset, len))
			goto out;
	}
	if (next) {
		entry = rb_entry(next, struct btrfs_ordered_extent, rb_node);
		if (btrfs_range_overlaps(entry, file_offset, len))
			goto out;
	}
	/* No ordered extent in the range */
	entry = NULL;
out:
	if (entry) {
		refcount_inc(&entry->refs);
		trace_btrfs_ordered_extent_lookup_first_range(inode, entry);
	}

	spin_unlock(&inode->ordered_tree_lock);
	return entry;
}

/*
 * Lock the passed range and ensure all pending ordered extents in it are run
 * to completion.
 *
 * @inode:        Inode whose ordered tree is to be searched
 * @start:        Beginning of range to flush
 * @end:          Last byte of range to lock
 * @cached_state: If passed, will return the extent state responsible for the
 *                locked range. It's the caller's responsibility to free the
 *                cached state.
 *
 * Always return with the given range locked, ensuring that after it's called
 * no ordered extent can be pending.
 */
void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start,
					u64 end,
					struct extent_state **cached_state)
{
	struct btrfs_ordered_extent *ordered;
	struct extent_state *cache = NULL;
	struct extent_state **cachedp = &cache;

	if (cached_state)
		cachedp = cached_state;

	while (1) {
		btrfs_lock_extent(&inode->io_tree, start, end, cachedp);
		ordered = btrfs_lookup_ordered_range(inode, start,
						     end - start + 1);
		if (!ordered) {
			/*
			 * If no external cached_state has been passed then
			 * decrement the extra ref taken for cachedp since we
			 * aren't exposing it outside of this function
			 */
			if (!cached_state)
				refcount_dec(&cache->refs);
			break;
		}
		btrfs_unlock_extent(&inode->io_tree, start, end, cachedp);
		btrfs_start_ordered_extent(ordered);
		btrfs_put_ordered_extent(ordered);
	}
}
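/*
 * Illustrative usage (assumed caller): read paths that must not race with
 * pending ordered extents can lock and drain the range like this:
 *
 *	struct extent_state *cached_state = NULL;
 *
 *	btrfs_lock_and_flush_ordered_range(inode, start, end, &cached_state);
 *	... operate on the now quiescent range ...
 *	btrfs_unlock_extent(&inode->io_tree, start, end, &cached_state);
 */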
/*
 * Lock the passed range and ensure all pending ordered extents in it are run
 * to completion in nowait mode.
 *
 * Return true if the range was locked and no pending ordered extents were
 * found in it, otherwise false.
 */
bool btrfs_try_lock_ordered_range(struct btrfs_inode *inode, u64 start, u64 end,
				  struct extent_state **cached_state)
{
	struct btrfs_ordered_extent *ordered;

	if (!btrfs_try_lock_extent(&inode->io_tree, start, end, cached_state))
		return false;

	ordered = btrfs_lookup_ordered_range(inode, start, end - start + 1);
	if (!ordered)
		return true;

	btrfs_put_ordered_extent(ordered);
	btrfs_unlock_extent(&inode->io_tree, start, end, cached_state);

	return false;
}

/* Split out a new ordered extent for the first @len bytes of @ordered. */
struct btrfs_ordered_extent *btrfs_split_ordered_extent(
			struct btrfs_ordered_extent *ordered, u64 len)
{
	struct btrfs_inode *inode = ordered->inode;
	struct btrfs_root *root = inode->root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 file_offset = ordered->file_offset;
	u64 disk_bytenr = ordered->disk_bytenr;
	unsigned long flags = ordered->flags;
	struct btrfs_ordered_sum *sum, *tmpsum;
	struct btrfs_ordered_extent *new;
	struct rb_node *node;
	u64 offset = 0;

	trace_btrfs_ordered_extent_split(inode, ordered);

	ASSERT(!(flags & (1U << BTRFS_ORDERED_COMPRESSED)));

	/*
	 * The entire bio must be covered by the ordered extent, but we can't
	 * reduce the original extent to a zero length either.
	 */
	if (WARN_ON_ONCE(len >= ordered->num_bytes))
		return ERR_PTR(-EINVAL);
	/*
	 * If our ordered extent had an error there's no point in continuing.
	 * The error may have come from a transaction abort done either by this
	 * task or some other concurrent task, and the transaction abort path
	 * iterates over all existing ordered extents and sets the flag
	 * BTRFS_ORDERED_IOERR on them.
	 */
	if (unlikely(flags & (1U << BTRFS_ORDERED_IOERR))) {
		const int fs_error = BTRFS_FS_ERROR(fs_info);

		return fs_error ? ERR_PTR(fs_error) : ERR_PTR(-EIO);
	}
	/* We cannot split partially completed ordered extents. */
	if (ordered->bytes_left) {
		ASSERT(!(flags & ~BTRFS_ORDERED_TYPE_FLAGS));
		if (WARN_ON_ONCE(ordered->bytes_left != ordered->disk_num_bytes))
			return ERR_PTR(-EINVAL);
	}
	/* We cannot split a compressed ordered extent. */
	if (WARN_ON_ONCE(ordered->disk_num_bytes != ordered->num_bytes))
		return ERR_PTR(-EINVAL);

	new = alloc_ordered_extent(inode, file_offset, len, len, disk_bytenr,
				   len, 0, flags, ordered->compress_type);
	if (IS_ERR(new))
		return new;

	/* One ref for the tree. */
	refcount_inc(&new->refs);

	/*
	 * Take the root's ordered_extent_lock to avoid a race with
	 * btrfs_wait_ordered_extents() when updating the disk_bytenr and
	 * disk_num_bytes fields of the ordered extent below.
	 *
	 * There's no concern about a previous caller of
	 * btrfs_wait_ordered_extents() getting the trimmed ordered extent
	 * before we insert the new one, because even if it gets the ordered
	 * extent before it's trimmed and the new one inserted, right before it
	 * uses it or during its use, the ordered extent might have been
	 * trimmed in the meanwhile, and it missed the new ordered extent.
	 * There's no way around this and it's harmless for current use cases,
	 * so we take the root's ordered_extent_lock to fix that race during
	 * trimming and silence tools like KCSAN.
	 */
	spin_lock_irq(&root->ordered_extent_lock);
	spin_lock(&inode->ordered_tree_lock);

	/*
	 * We don't have overlapping ordered extents (that would imply double
	 * allocation of extents) and we checked above that the split length
	 * does not cross the ordered extent's num_bytes field, so there's
	 * no need to remove it and re-insert it in the tree.
	 */
	ordered->file_offset += len;
	ordered->disk_bytenr += len;
	ordered->num_bytes -= len;
	ordered->disk_num_bytes -= len;
	ordered->ram_bytes -= len;

	if (test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags)) {
		ASSERT(ordered->bytes_left == 0);
		new->bytes_left = 0;
	} else {
		ordered->bytes_left -= len;
	}

	if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags)) {
		if (ordered->truncated_len > len) {
			ordered->truncated_len -= len;
		} else {
			new->truncated_len = ordered->truncated_len;
			ordered->truncated_len = 0;
		}
	}

	list_for_each_entry_safe(sum, tmpsum, &ordered->csum_list, list) {
		if (offset == len)
			break;
		list_move_tail(&sum->list, &new->csum_list);
		offset += sum->len;
	}

	node = tree_insert(&inode->ordered_tree, new->file_offset, &new->rb_node);
	if (unlikely(node))
		btrfs_panic(fs_info, -EEXIST,
			    "inconsistency in ordered tree at offset %llu after split",
			    new->file_offset);
	spin_unlock(&inode->ordered_tree_lock);

	list_add_tail(&new->root_extent_list, &root->ordered_extents);
	root->nr_ordered_extents++;
	spin_unlock_irq(&root->ordered_extent_lock);
	return new;
}
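/*
 * Illustrative sketch (assumed numbers): if an ordered extent covers 128K of
 * a file but the bio being submitted may only span the first 64K (e.g. due
 * to a device limit on zoned filesystems), the submission path splits first:
 *
 *	new = btrfs_split_ordered_extent(ordered, SZ_64K);
 *	if (IS_ERR(new))
 *		return PTR_ERR(new);
 *
 * Afterwards 'new' covers the first 64K while 'ordered' is trimmed to the
 * remaining 64K, with its file_offset and disk_bytenr advanced accordingly.
 */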
int __init ordered_data_init(void)
{
	btrfs_ordered_extent_cache = KMEM_CACHE(btrfs_ordered_extent, 0);
	if (!btrfs_ordered_extent_cache)
		return -ENOMEM;

	return 0;
}

void __cold ordered_data_exit(void)
{
	kmem_cache_destroy(btrfs_ordered_extent_cache);
}