// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007 Oracle. All rights reserved.
 */

#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/writeback.h>
#include <linux/sched/mm.h>
#include "messages.h"
#include "misc.h"
#include "ctree.h"
#include "transaction.h"
#include "btrfs_inode.h"
#include "extent_io.h"
#include "disk-io.h"
#include "compression.h"
#include "delalloc-space.h"
#include "qgroup.h"
#include "subpage.h"
#include "file.h"

static struct kmem_cache *btrfs_ordered_extent_cache;

static u64 entry_end(struct btrfs_ordered_extent *entry)
{
	if (entry->file_offset + entry->num_bytes < entry->file_offset)
		return (u64)-1;
	return entry->file_offset + entry->num_bytes;
}

/* returns NULL if the insertion worked, or it returns the node it did find
 * in the tree
 */
static struct rb_node *tree_insert(struct rb_root *root, u64 file_offset,
				   struct rb_node *node)
{
	struct rb_node **p = &root->rb_node;
	struct rb_node *parent = NULL;
	struct btrfs_ordered_extent *entry;

	while (*p) {
		parent = *p;
		entry = rb_entry(parent, struct btrfs_ordered_extent, rb_node);

		if (file_offset < entry->file_offset)
			p = &(*p)->rb_left;
		else if (file_offset >= entry_end(entry))
			p = &(*p)->rb_right;
		else
			return parent;
	}

	rb_link_node(node, parent, p);
	rb_insert_color(node, root);
	return NULL;
}

/*
 * look for a given offset in the tree, and if it can't be found return the
 * first lesser offset
 */
static struct rb_node *__tree_search(struct rb_root *root, u64 file_offset,
				     struct rb_node **prev_ret)
{
	struct rb_node *n = root->rb_node;
	struct rb_node *prev = NULL;
	struct rb_node *test;
	struct btrfs_ordered_extent *entry;
	struct btrfs_ordered_extent *prev_entry = NULL;

	while (n) {
		entry = rb_entry(n, struct btrfs_ordered_extent, rb_node);
		prev = n;
		prev_entry = entry;

		if (file_offset < entry->file_offset)
			n = n->rb_left;
		else if (file_offset >= entry_end(entry))
			n = n->rb_right;
		else
			return n;
	}
	if (!prev_ret)
		return NULL;

	while (prev && file_offset >= entry_end(prev_entry)) {
		test = rb_next(prev);
		if (!test)
			break;
		prev_entry = rb_entry(test, struct btrfs_ordered_extent,
				      rb_node);
		if (file_offset < entry_end(prev_entry))
			break;

		prev = test;
	}
	if (prev)
		prev_entry = rb_entry(prev, struct btrfs_ordered_extent,
				      rb_node);
	while (prev && file_offset < entry_end(prev_entry)) {
		test = rb_prev(prev);
		if (!test)
			break;
		prev_entry = rb_entry(test, struct btrfs_ordered_extent,
				      rb_node);
		prev = test;
	}
	*prev_ret = prev;
	return NULL;
}

static int range_overlaps(struct btrfs_ordered_extent *entry, u64 file_offset,
			  u64 len)
{
	if (file_offset + len <= entry->file_offset ||
	    entry->file_offset + entry->num_bytes <= file_offset)
		return 0;
	return 1;
}

/*
 * Look for the first ordered struct that has this offset, otherwise return
 * the first one less than this offset.
 */
static inline struct rb_node *ordered_tree_search(struct btrfs_inode *inode,
						  u64 file_offset)
{
	struct rb_node *prev = NULL;
	struct rb_node *ret;
	struct btrfs_ordered_extent *entry;

	if (inode->ordered_tree_last) {
		entry = rb_entry(inode->ordered_tree_last, struct btrfs_ordered_extent,
				 rb_node);
		if (in_range(file_offset, entry->file_offset, entry->num_bytes))
			return inode->ordered_tree_last;
	}
	ret = __tree_search(&inode->ordered_tree, file_offset, &prev);
	if (!ret)
		ret = prev;
	if (ret)
		inode->ordered_tree_last = ret;
	return ret;
}

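/*
 * Allocate and initialize an ordered extent for the given file range.
 *
 * The qgroup reservation for the range is released (or freed for
 * nocow/prealloc writes), an inode reference is taken and one outstanding
 * extent is accounted. The entry is not yet inserted into the inode's
 * ordered tree, see insert_ordered_extent().
 */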
static struct btrfs_ordered_extent *alloc_ordered_extent(
			struct btrfs_inode *inode, u64 file_offset, u64 num_bytes,
			u64 ram_bytes, u64 disk_bytenr, u64 disk_num_bytes,
			u64 offset, unsigned long flags, int compress_type)
{
	struct btrfs_ordered_extent *entry;
	int ret;
	u64 qgroup_rsv = 0;

	if (flags &
	    ((1 << BTRFS_ORDERED_NOCOW) | (1 << BTRFS_ORDERED_PREALLOC))) {
		/* For nocow write, we can release the qgroup rsv right now */
		ret = btrfs_qgroup_free_data(inode, NULL, file_offset, num_bytes, &qgroup_rsv);
		if (ret < 0)
			return ERR_PTR(ret);
	} else {
		/*
		 * The ordered extent has reserved qgroup space, release now
		 * and pass the reserved number for qgroup_record to free.
		 */
		ret = btrfs_qgroup_release_data(inode, file_offset, num_bytes, &qgroup_rsv);
		if (ret < 0)
			return ERR_PTR(ret);
	}
	entry = kmem_cache_zalloc(btrfs_ordered_extent_cache, GFP_NOFS);
	if (!entry)
		return ERR_PTR(-ENOMEM);

	entry->file_offset = file_offset;
	entry->num_bytes = num_bytes;
	entry->ram_bytes = ram_bytes;
	entry->disk_bytenr = disk_bytenr;
	entry->disk_num_bytes = disk_num_bytes;
	entry->offset = offset;
	entry->bytes_left = num_bytes;
	entry->inode = igrab(&inode->vfs_inode);
	entry->compress_type = compress_type;
	entry->truncated_len = (u64)-1;
	entry->qgroup_rsv = qgroup_rsv;
	entry->flags = flags;
	refcount_set(&entry->refs, 1);
	init_waitqueue_head(&entry->wait);
	INIT_LIST_HEAD(&entry->list);
	INIT_LIST_HEAD(&entry->log_list);
	INIT_LIST_HEAD(&entry->root_extent_list);
	INIT_LIST_HEAD(&entry->work_list);
	INIT_LIST_HEAD(&entry->bioc_list);
	init_completion(&entry->completion);

	/*
	 * We don't need the count_max_extents here, we can assume that all of
	 * that work has been done at higher layers, so this is truly the
	 * smallest the extent is going to get.
	 */
	spin_lock(&inode->lock);
	btrfs_mod_outstanding_extents(inode, 1);
	spin_unlock(&inode->lock);

	return entry;
}

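/*
 * Insert an already allocated ordered extent into the inode's ordered tree
 * and into the per-root list of ordered extents. The tree takes its own
 * reference on the entry.
 */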
static void insert_ordered_extent(struct btrfs_ordered_extent *entry)
{
	struct btrfs_inode *inode = BTRFS_I(entry->inode);
	struct btrfs_root *root = inode->root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct rb_node *node;

	trace_btrfs_ordered_extent_add(inode, entry);

	percpu_counter_add_batch(&fs_info->ordered_bytes, entry->num_bytes,
				 fs_info->delalloc_batch);

	/* One ref for the tree. */
	refcount_inc(&entry->refs);

	spin_lock_irq(&inode->ordered_tree_lock);
	node = tree_insert(&inode->ordered_tree, entry->file_offset,
			   &entry->rb_node);
	if (node)
		btrfs_panic(fs_info, -EEXIST,
			    "inconsistency in ordered tree at offset %llu",
			    entry->file_offset);
	spin_unlock_irq(&inode->ordered_tree_lock);

	spin_lock(&root->ordered_extent_lock);
	list_add_tail(&entry->root_extent_list,
		      &root->ordered_extents);
	root->nr_ordered_extents++;
	if (root->nr_ordered_extents == 1) {
		spin_lock(&fs_info->ordered_root_lock);
		BUG_ON(!list_empty(&root->ordered_root));
		list_add_tail(&root->ordered_root, &fs_info->ordered_roots);
		spin_unlock(&fs_info->ordered_root_lock);
	}
	spin_unlock(&root->ordered_extent_lock);
}

/*
 * Add an ordered extent to the per-inode tree.
 *
 * @inode:           Inode that this extent is for.
 * @file_offset:     Logical offset in file where the extent starts.
 * @num_bytes:       Logical length of extent in file.
 * @ram_bytes:       Full length of unencoded data.
 * @disk_bytenr:     Offset of extent on disk.
 * @disk_num_bytes:  Size of extent on disk.
 * @offset:          Offset into unencoded data where file data starts.
 * @flags:           Flags specifying type of extent (1 << BTRFS_ORDERED_*).
 * @compress_type:   Compression algorithm used for data.
 *
 * Most of these parameters correspond to &struct btrfs_file_extent_item. The
 * tree is given a single reference on the ordered extent that was inserted, and
 * the returned pointer is given a second reference.
 *
 * Return: the new ordered extent or error pointer.
 */
struct btrfs_ordered_extent *btrfs_alloc_ordered_extent(
			struct btrfs_inode *inode, u64 file_offset,
			u64 num_bytes, u64 ram_bytes, u64 disk_bytenr,
			u64 disk_num_bytes, u64 offset, unsigned long flags,
			int compress_type)
{
	struct btrfs_ordered_extent *entry;

	ASSERT((flags & ~BTRFS_ORDERED_TYPE_FLAGS) == 0);

	entry = alloc_ordered_extent(inode, file_offset, num_bytes, ram_bytes,
				     disk_bytenr, disk_num_bytes, offset, flags,
				     compress_type);
	if (!IS_ERR(entry))
		insert_ordered_extent(entry);
	return entry;
}

/*
 * Add a struct btrfs_ordered_sum into the list of checksums to be inserted
 * when an ordered extent is finished. If the list covers more than one
 * ordered extent, it is split across multiples.
 */
void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry,
			   struct btrfs_ordered_sum *sum)
{
	struct btrfs_inode *inode = BTRFS_I(entry->inode);

	spin_lock_irq(&inode->ordered_tree_lock);
	list_add_tail(&sum->list, &entry->list);
	spin_unlock_irq(&inode->ordered_tree_lock);
}

void btrfs_mark_ordered_extent_error(struct btrfs_ordered_extent *ordered)
{
	if (!test_and_set_bit(BTRFS_ORDERED_IOERR, &ordered->flags))
		mapping_set_error(ordered->inode->i_mapping, -EIO);
}

static void finish_ordered_fn(struct btrfs_work *work)
{
	struct btrfs_ordered_extent *ordered_extent;

	ordered_extent = container_of(work, struct btrfs_ordered_extent, work);
	btrfs_finish_ordered_io(ordered_extent);
}

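/*
 * Decrement the pending I/O accounting of @ordered by @len bytes and tell
 * whether the whole ordered extent has now completed.
 *
 * Must be called with the inode's ordered_tree_lock held. When this returns
 * true, an extra reference has been taken on @ordered for the completion work.
 */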
static bool can_finish_ordered_extent(struct btrfs_ordered_extent *ordered,
				      struct page *page, u64 file_offset,
				      u64 len, bool uptodate)
{
	struct btrfs_inode *inode = BTRFS_I(ordered->inode);
	struct btrfs_fs_info *fs_info = inode->root->fs_info;

	lockdep_assert_held(&inode->ordered_tree_lock);

	if (page) {
		ASSERT(page->mapping);
		ASSERT(page_offset(page) <= file_offset);
		ASSERT(file_offset + len <= page_offset(page) + PAGE_SIZE);

		/*
		 * Ordered (Private2) bit indicates whether we still have
		 * pending io unfinished for the ordered extent.
		 *
		 * If there's no such bit, we need to skip to next range.
		 */
		if (!btrfs_folio_test_ordered(fs_info, page_folio(page),
					      file_offset, len))
			return false;
		btrfs_folio_clear_ordered(fs_info, page_folio(page), file_offset, len);
	}

	/* Now we're fine to update the accounting. */
	if (WARN_ON_ONCE(len > ordered->bytes_left)) {
		btrfs_crit(fs_info,
			   "bad ordered extent accounting, root=%llu ino=%llu OE offset=%llu OE len=%llu to_dec=%llu left=%llu",
			   btrfs_root_id(inode->root), btrfs_ino(inode),
			   ordered->file_offset, ordered->num_bytes,
			   len, ordered->bytes_left);
		ordered->bytes_left = 0;
	} else {
		ordered->bytes_left -= len;
	}

	if (!uptodate)
		set_bit(BTRFS_ORDERED_IOERR, &ordered->flags);

	if (ordered->bytes_left)
		return false;

	/*
	 * All the IO of the ordered extent is finished, we need to queue
	 * the finish_func to be executed.
	 */
	set_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags);
	cond_wake_up(&ordered->wait);
	refcount_inc(&ordered->refs);
	trace_btrfs_ordered_extent_mark_finished(inode, ordered);
	return true;
}

static void btrfs_queue_ordered_fn(struct btrfs_ordered_extent *ordered)
{
	struct btrfs_inode *inode = BTRFS_I(ordered->inode);
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct btrfs_workqueue *wq = btrfs_is_free_space_inode(inode) ?
		fs_info->endio_freespace_worker : fs_info->endio_write_workers;

	btrfs_init_work(&ordered->work, finish_ordered_fn, NULL);
	btrfs_queue_work(wq, &ordered->work);
}

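/*
 * Finish the given range of an ordered extent after the I/O for it has
 * completed, and queue the ordered extent for completion if this was its
 * last pending range.
 *
 * Return true if the completion work was queued, false otherwise.
 */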
bool btrfs_finish_ordered_extent(struct btrfs_ordered_extent *ordered,
				 struct page *page, u64 file_offset, u64 len,
				 bool uptodate)
{
	struct btrfs_inode *inode = BTRFS_I(ordered->inode);
	unsigned long flags;
	bool ret;

	trace_btrfs_finish_ordered_extent(inode, file_offset, len, uptodate);

	spin_lock_irqsave(&inode->ordered_tree_lock, flags);
	ret = can_finish_ordered_extent(ordered, page, file_offset, len, uptodate);
	spin_unlock_irqrestore(&inode->ordered_tree_lock, flags);

	/*
	 * If this is a COW write it means we created new extent maps for the
	 * range and they point to unwritten locations if we got an error either
	 * before submitting a bio or during IO.
	 *
	 * We have marked the ordered extent with BTRFS_ORDERED_IOERR, and we
	 * are queuing its completion below. During completion, at
	 * btrfs_finish_one_ordered(), we will drop the extent maps for the
	 * unwritten extents.
	 *
	 * However because completion runs in a work queue we can end up having
	 * a fast fsync running before that. In the case of direct IO, once we
	 * unlock the inode the fsync might start, and we queue the completion
	 * before unlocking the inode. In the case of buffered IO when writeback
	 * finishes (end_bbio_data_write()) we queue the completion, so if the
	 * writeback was triggered by a fast fsync, the fsync might start
	 * logging before ordered extent completion runs in the work queue.
	 *
	 * The fast fsync will log file extent items based on the extent maps it
	 * finds, so if by the time it collects extent maps the ordered extent
	 * completion didn't happen yet, it will log file extent items that
	 * point to unwritten extents, resulting in a corruption if a crash
	 * happens and the log tree is replayed. Note that a fast fsync does not
	 * wait for completion of ordered extents in order to reduce latency.
	 *
	 * Set a flag in the inode so that the next fast fsync will wait for
	 * ordered extents to complete before starting to log.
	 */
	if (!uptodate && !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags))
		set_bit(BTRFS_INODE_COW_WRITE_ERROR, &inode->runtime_flags);

	if (ret)
		btrfs_queue_ordered_fn(ordered);
	return ret;
}

/*
 * Mark all ordered extents io inside the specified range finished.
 *
 * @page:	 The involved page for the operation.
 *		 For uncompressed buffered IO, the page status also needs to be
 *		 updated to indicate whether the pending ordered io is finished.
 *		 Can be NULL for direct IO and compressed write.
 *		 For these cases, the callers ensure that the endio function
 *		 won't be executed twice.
 *
 * This function is called for endio, thus the range must have ordered
 * extent(s) covering it.
 */
void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode,
				    struct page *page, u64 file_offset,
				    u64 num_bytes, bool uptodate)
{
	struct rb_node *node;
	struct btrfs_ordered_extent *entry = NULL;
	unsigned long flags;
	u64 cur = file_offset;

	trace_btrfs_writepage_end_io_hook(inode, file_offset,
					  file_offset + num_bytes - 1,
					  uptodate);

	spin_lock_irqsave(&inode->ordered_tree_lock, flags);
	while (cur < file_offset + num_bytes) {
		u64 entry_end;
		u64 end;
		u32 len;

		node = ordered_tree_search(inode, cur);
		/* No ordered extents at all */
		if (!node)
			break;

		entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
		entry_end = entry->file_offset + entry->num_bytes;
		/*
		 * |<-- OE --->|  |
		 *		  cur
		 * Go to next OE.
		 */
		if (cur >= entry_end) {
			node = rb_next(node);
			/* No more ordered extents, exit */
			if (!node)
				break;
			entry = rb_entry(node, struct btrfs_ordered_extent,
					 rb_node);

			/* Go to next ordered extent and continue */
			cur = entry->file_offset;
			continue;
		}
		/*
		 * |	|<--- OE --->|
		 * cur
		 * Go to the start of OE.
		 */
		if (cur < entry->file_offset) {
			cur = entry->file_offset;
			continue;
		}

		/*
		 * Now we are definitely inside one ordered extent.
		 *
		 * |<--- OE --->|
		 *	|
		 *	cur
		 */
		end = min(entry->file_offset + entry->num_bytes,
			  file_offset + num_bytes) - 1;
		ASSERT(end + 1 - cur < U32_MAX);
		len = end + 1 - cur;

		if (can_finish_ordered_extent(entry, page, cur, len, uptodate)) {
			spin_unlock_irqrestore(&inode->ordered_tree_lock, flags);
			btrfs_queue_ordered_fn(entry);
			spin_lock_irqsave(&inode->ordered_tree_lock, flags);
		}
		cur += len;
	}
	spin_unlock_irqrestore(&inode->ordered_tree_lock, flags);
}

/*
 * Finish IO for one ordered extent across a given range. The range can only
 * contain one ordered extent.
 *
 * @cached:	 The cached ordered extent. If not NULL, we can skip the tree
 *		 search and use the ordered extent directly.
 *		 Will be also used to store the finished ordered extent.
 * @file_offset: File offset for the finished IO
 * @io_size:	 Length of the finish IO range
 *
 * Return true if the ordered extent is finished in the range, and update
 * @cached.
 * Return false otherwise.
 *
 * NOTE: The range can NOT cross multiple ordered extents.
 * Thus caller should ensure the range doesn't cross ordered extents.
 */
bool btrfs_dec_test_ordered_pending(struct btrfs_inode *inode,
				    struct btrfs_ordered_extent **cached,
				    u64 file_offset, u64 io_size)
{
	struct rb_node *node;
	struct btrfs_ordered_extent *entry = NULL;
	unsigned long flags;
	bool finished = false;

	spin_lock_irqsave(&inode->ordered_tree_lock, flags);
	if (cached && *cached) {
		entry = *cached;
		goto have_entry;
	}

	node = ordered_tree_search(inode, file_offset);
	if (!node)
		goto out;

	entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
have_entry:
	if (!in_range(file_offset, entry->file_offset, entry->num_bytes))
		goto out;

	if (io_size > entry->bytes_left)
		btrfs_crit(inode->root->fs_info,
			   "bad ordered accounting left %llu size %llu",
			   entry->bytes_left, io_size);

	entry->bytes_left -= io_size;

	if (entry->bytes_left == 0) {
		/*
		 * Ensure only one caller can set the flag and finished_ret
		 * accordingly
		 */
		finished = !test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
		/* test_and_set_bit implies a barrier */
		cond_wake_up_nomb(&entry->wait);
	}
out:
	if (finished && cached && entry) {
		*cached = entry;
		refcount_inc(&entry->refs);
		trace_btrfs_ordered_extent_dec_test_pending(inode, entry);
	}
	spin_unlock_irqrestore(&inode->ordered_tree_lock, flags);
	return finished;
}

/*
 * used to drop a reference on an ordered extent. This will free
 * the extent if the last reference is dropped
 */
void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
{
	struct list_head *cur;
	struct btrfs_ordered_sum *sum;

	trace_btrfs_ordered_extent_put(BTRFS_I(entry->inode), entry);

	if (refcount_dec_and_test(&entry->refs)) {
		ASSERT(list_empty(&entry->root_extent_list));
		ASSERT(list_empty(&entry->log_list));
		ASSERT(RB_EMPTY_NODE(&entry->rb_node));
		if (entry->inode)
			btrfs_add_delayed_iput(BTRFS_I(entry->inode));
		while (!list_empty(&entry->list)) {
			cur = entry->list.next;
			sum = list_entry(cur, struct btrfs_ordered_sum, list);
			list_del(&sum->list);
			kvfree(sum);
		}
		kmem_cache_free(btrfs_ordered_extent_cache, entry);
	}
}

/*
 * remove an ordered extent from the tree. No references are dropped
 * and waiters are woken up.
 */
void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode,
				 struct btrfs_ordered_extent *entry)
{
	struct btrfs_root *root = btrfs_inode->root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct rb_node *node;
	bool pending;
	bool freespace_inode;

	/*
	 * If this is a free space inode the thread has not acquired the ordered
	 * extents lockdep map.
	 */
	freespace_inode = btrfs_is_free_space_inode(btrfs_inode);

	btrfs_lockdep_acquire(fs_info, btrfs_trans_pending_ordered);
	/* This is paired with btrfs_alloc_ordered_extent. */
	spin_lock(&btrfs_inode->lock);
	btrfs_mod_outstanding_extents(btrfs_inode, -1);
	spin_unlock(&btrfs_inode->lock);
	if (root != fs_info->tree_root) {
		u64 release;

		if (test_bit(BTRFS_ORDERED_ENCODED, &entry->flags))
			release = entry->disk_num_bytes;
		else
			release = entry->num_bytes;
		btrfs_delalloc_release_metadata(btrfs_inode, release,
						test_bit(BTRFS_ORDERED_IOERR,
							 &entry->flags));
	}

	percpu_counter_add_batch(&fs_info->ordered_bytes, -entry->num_bytes,
				 fs_info->delalloc_batch);

	spin_lock_irq(&btrfs_inode->ordered_tree_lock);
	node = &entry->rb_node;
	rb_erase(node, &btrfs_inode->ordered_tree);
	RB_CLEAR_NODE(node);
	if (btrfs_inode->ordered_tree_last == node)
		btrfs_inode->ordered_tree_last = NULL;
	set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
	pending = test_and_clear_bit(BTRFS_ORDERED_PENDING, &entry->flags);
	spin_unlock_irq(&btrfs_inode->ordered_tree_lock);

	/*
	 * The current running transaction is waiting on us, we need to let it
	 * know that we're complete and wake it up.
	 */
	if (pending) {
		struct btrfs_transaction *trans;

		/*
		 * The checks for trans are just a formality, it should be set,
		 * but if it isn't we don't want to deref/assert under the spin
		 * lock, so be nice and check if trans is set, but ASSERT() so
		 * if it isn't set a developer will notice.
		 */
		spin_lock(&fs_info->trans_lock);
		trans = fs_info->running_transaction;
		if (trans)
			refcount_inc(&trans->use_count);
		spin_unlock(&fs_info->trans_lock);

		ASSERT(trans || BTRFS_FS_ERROR(fs_info));
		if (trans) {
			if (atomic_dec_and_test(&trans->pending_ordered))
				wake_up(&trans->pending_wait);
			btrfs_put_transaction(trans);
		}
	}

	btrfs_lockdep_release(fs_info, btrfs_trans_pending_ordered);

	spin_lock(&root->ordered_extent_lock);
	list_del_init(&entry->root_extent_list);
	root->nr_ordered_extents--;

	trace_btrfs_ordered_extent_remove(btrfs_inode, entry);

	if (!root->nr_ordered_extents) {
		spin_lock(&fs_info->ordered_root_lock);
		BUG_ON(list_empty(&root->ordered_root));
		list_del_init(&root->ordered_root);
		spin_unlock(&fs_info->ordered_root_lock);
	}
	spin_unlock(&root->ordered_extent_lock);
	wake_up(&entry->wait);
	if (!freespace_inode)
		btrfs_lockdep_release(fs_info, btrfs_ordered_extent);
}

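/*
 * Work helper for btrfs_wait_ordered_extents(): start IO for one ordered
 * extent, wait for it to complete and then signal the waiter through the
 * completion.
 */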
static void btrfs_run_ordered_extent_work(struct btrfs_work *work)
{
	struct btrfs_ordered_extent *ordered;

	ordered = container_of(work, struct btrfs_ordered_extent, flush_work);
	btrfs_start_ordered_extent(ordered);
	complete(&ordered->completion);
}

/*
 * wait for all the ordered extents in a root. This is done when balancing
 * space between drives.
 */
u64 btrfs_wait_ordered_extents(struct btrfs_root *root, u64 nr,
			       const u64 range_start, const u64 range_len)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	LIST_HEAD(splice);
	LIST_HEAD(skipped);
	LIST_HEAD(works);
	struct btrfs_ordered_extent *ordered, *next;
	u64 count = 0;
	const u64 range_end = range_start + range_len;

	mutex_lock(&root->ordered_extent_mutex);
	spin_lock(&root->ordered_extent_lock);
	list_splice_init(&root->ordered_extents, &splice);
	while (!list_empty(&splice) && nr) {
		ordered = list_first_entry(&splice, struct btrfs_ordered_extent,
					   root_extent_list);

		if (range_end <= ordered->disk_bytenr ||
		    ordered->disk_bytenr + ordered->disk_num_bytes <= range_start) {
			list_move_tail(&ordered->root_extent_list, &skipped);
			cond_resched_lock(&root->ordered_extent_lock);
			continue;
		}

		list_move_tail(&ordered->root_extent_list,
			       &root->ordered_extents);
		refcount_inc(&ordered->refs);
		spin_unlock(&root->ordered_extent_lock);

		btrfs_init_work(&ordered->flush_work,
				btrfs_run_ordered_extent_work, NULL);
		list_add_tail(&ordered->work_list, &works);
		btrfs_queue_work(fs_info->flush_workers, &ordered->flush_work);

		cond_resched();
		spin_lock(&root->ordered_extent_lock);
		if (nr != U64_MAX)
			nr--;
		count++;
	}
	list_splice_tail(&skipped, &root->ordered_extents);
	list_splice_tail(&splice, &root->ordered_extents);
	spin_unlock(&root->ordered_extent_lock);

	list_for_each_entry_safe(ordered, next, &works, work_list) {
		list_del_init(&ordered->work_list);
		wait_for_completion(&ordered->completion);
		btrfs_put_ordered_extent(ordered);
		cond_resched();
	}
	mutex_unlock(&root->ordered_extent_mutex);

	return count;
}

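/*
 * Wait for ordered extents in all roots, limited to at most @nr extents and
 * to extents whose disk byte range overlaps [@range_start, @range_start +
 * @range_len). A @nr of U64_MAX means no limit.
 */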
void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr,
			      const u64 range_start, const u64 range_len)
{
	struct btrfs_root *root;
	LIST_HEAD(splice);
	u64 done;

	mutex_lock(&fs_info->ordered_operations_mutex);
	spin_lock(&fs_info->ordered_root_lock);
	list_splice_init(&fs_info->ordered_roots, &splice);
	while (!list_empty(&splice) && nr) {
		root = list_first_entry(&splice, struct btrfs_root,
					ordered_root);
		root = btrfs_grab_root(root);
		BUG_ON(!root);
		list_move_tail(&root->ordered_root,
			       &fs_info->ordered_roots);
		spin_unlock(&fs_info->ordered_root_lock);

		done = btrfs_wait_ordered_extents(root, nr,
						  range_start, range_len);
		btrfs_put_root(root);

		spin_lock(&fs_info->ordered_root_lock);
		if (nr != U64_MAX) {
			nr -= done;
		}
	}
	list_splice_tail(&splice, &fs_info->ordered_roots);
	spin_unlock(&fs_info->ordered_root_lock);
	mutex_unlock(&fs_info->ordered_operations_mutex);
}

/*
 * Start IO and wait for a given ordered extent to finish.
 *
 * Wait on page writeback for all the pages in the extent and the IO completion
 * code to insert metadata into the btree corresponding to the extent.
 */
void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry)
{
	u64 start = entry->file_offset;
	u64 end = start + entry->num_bytes - 1;
	struct btrfs_inode *inode = BTRFS_I(entry->inode);
	bool freespace_inode;

	trace_btrfs_ordered_extent_start(inode, entry);

	/*
	 * If this is a free space inode do not take the ordered extents lockdep
	 * map.
	 */
	freespace_inode = btrfs_is_free_space_inode(inode);

	/*
	 * pages in the range can be dirty, clean or writeback. We
	 * start IO on any dirty ones so the wait doesn't stall waiting
	 * for the flusher thread to find them
	 */
	if (!test_bit(BTRFS_ORDERED_DIRECT, &entry->flags))
		filemap_fdatawrite_range(inode->vfs_inode.i_mapping, start, end);

	if (!freespace_inode)
		btrfs_might_wait_for_event(inode->root->fs_info, btrfs_ordered_extent);
	wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE, &entry->flags));
}

/*
 * Used to wait on ordered extents across a large range of bytes.
 */
int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
{
	int ret = 0;
	int ret_wb = 0;
	u64 end;
	u64 orig_end;
	struct btrfs_ordered_extent *ordered;

	if (start + len < start) {
		orig_end = OFFSET_MAX;
	} else {
		orig_end = start + len - 1;
		if (orig_end > OFFSET_MAX)
			orig_end = OFFSET_MAX;
	}

	/* start IO across the range first to instantiate any delalloc
	 * extents
	 */
	ret = btrfs_fdatawrite_range(inode, start, orig_end);
	if (ret)
		return ret;

	/*
	 * If we have a writeback error don't return immediately. Wait first
	 * for any ordered extents that haven't completed yet. This is to make
	 * sure no one can dirty the same page ranges and call writepages()
	 * before the ordered extents complete - to avoid failures (-EEXIST)
	 * when adding the new ordered extents to the ordered tree.
	 */
	ret_wb = filemap_fdatawait_range(inode->i_mapping, start, orig_end);

	end = orig_end;
	while (1) {
		ordered = btrfs_lookup_first_ordered_extent(BTRFS_I(inode), end);
		if (!ordered)
			break;
		if (ordered->file_offset > orig_end) {
			btrfs_put_ordered_extent(ordered);
			break;
		}
		if (ordered->file_offset + ordered->num_bytes <= start) {
			btrfs_put_ordered_extent(ordered);
			break;
		}
		btrfs_start_ordered_extent(ordered);
		end = ordered->file_offset;
		/*
		 * If the ordered extent had an error save the error but don't
		 * exit without waiting first for all other ordered extents in
		 * the range to complete.
		 */
		if (test_bit(BTRFS_ORDERED_IOERR, &ordered->flags))
			ret = -EIO;
		btrfs_put_ordered_extent(ordered);
		if (end == 0 || end == start)
			break;
		end--;
	}
	return ret_wb ? ret_wb : ret;
}

/*
 * find an ordered extent corresponding to file_offset. return NULL if
 * nothing is found, otherwise take a reference on the extent and return it
 */
struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *inode,
							 u64 file_offset)
{
	struct rb_node *node;
	struct btrfs_ordered_extent *entry = NULL;
	unsigned long flags;

	spin_lock_irqsave(&inode->ordered_tree_lock, flags);
	node = ordered_tree_search(inode, file_offset);
	if (!node)
		goto out;

	entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
	if (!in_range(file_offset, entry->file_offset, entry->num_bytes))
		entry = NULL;
	if (entry) {
		refcount_inc(&entry->refs);
		trace_btrfs_ordered_extent_lookup(inode, entry);
	}
out:
	spin_unlock_irqrestore(&inode->ordered_tree_lock, flags);
	return entry;
}

/* Since the DIO code tries to lock a wide area we need to look for any ordered
 * extents that exist in the range, rather than just the start of the range.
 */
struct btrfs_ordered_extent *btrfs_lookup_ordered_range(
		struct btrfs_inode *inode, u64 file_offset, u64 len)
{
	struct rb_node *node;
	struct btrfs_ordered_extent *entry = NULL;

	spin_lock_irq(&inode->ordered_tree_lock);
	node = ordered_tree_search(inode, file_offset);
	if (!node) {
		node = ordered_tree_search(inode, file_offset + len);
		if (!node)
			goto out;
	}

	while (1) {
		entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
		if (range_overlaps(entry, file_offset, len))
			break;

		if (entry->file_offset >= file_offset + len) {
			entry = NULL;
			break;
		}
		entry = NULL;
		node = rb_next(node);
		if (!node)
			break;
	}
out:
	if (entry) {
		refcount_inc(&entry->refs);
		trace_btrfs_ordered_extent_lookup_range(inode, entry);
	}
	spin_unlock_irq(&inode->ordered_tree_lock);
	return entry;
}

/*
 * Adds all ordered extents to the given list. The list ends up sorted by the
 * file_offset of the ordered extents.
 */
void btrfs_get_ordered_extents_for_logging(struct btrfs_inode *inode,
					   struct list_head *list)
{
	struct rb_node *n;

	ASSERT(inode_is_locked(&inode->vfs_inode));

	spin_lock_irq(&inode->ordered_tree_lock);
	for (n = rb_first(&inode->ordered_tree); n; n = rb_next(n)) {
		struct btrfs_ordered_extent *ordered;

		ordered = rb_entry(n, struct btrfs_ordered_extent, rb_node);

		if (test_bit(BTRFS_ORDERED_LOGGED, &ordered->flags))
			continue;

		ASSERT(list_empty(&ordered->log_list));
		list_add_tail(&ordered->log_list, list);
		refcount_inc(&ordered->refs);
		trace_btrfs_ordered_extent_lookup_for_logging(inode, ordered);
	}
	spin_unlock_irq(&inode->ordered_tree_lock);
}

/*
 * lookup and return any extent before 'file_offset'. NULL is returned
 * if none is found
 */
struct btrfs_ordered_extent *
btrfs_lookup_first_ordered_extent(struct btrfs_inode *inode, u64 file_offset)
{
	struct rb_node *node;
	struct btrfs_ordered_extent *entry = NULL;

	spin_lock_irq(&inode->ordered_tree_lock);
	node = ordered_tree_search(inode, file_offset);
	if (!node)
		goto out;

	entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
	refcount_inc(&entry->refs);
	trace_btrfs_ordered_extent_lookup_first(inode, entry);
out:
	spin_unlock_irq(&inode->ordered_tree_lock);
	return entry;
}

/*
 * Lookup the first ordered extent that overlaps the range
 * [@file_offset, @file_offset + @len).
 *
 * The difference between this and btrfs_lookup_first_ordered_extent() is
 * that this one won't return any ordered extent that does not overlap the range.
 * And the difference against btrfs_lookup_ordered_extent() is, this function
 * ensures the first ordered extent gets returned.
 */
struct btrfs_ordered_extent *btrfs_lookup_first_ordered_range(
			struct btrfs_inode *inode, u64 file_offset, u64 len)
{
	struct rb_node *node;
	struct rb_node *cur;
	struct rb_node *prev;
	struct rb_node *next;
	struct btrfs_ordered_extent *entry = NULL;

	spin_lock_irq(&inode->ordered_tree_lock);
	node = inode->ordered_tree.rb_node;
	/*
	 * Here we don't want to use ordered_tree_search(), which would use the
	 * cached ordered_tree_last and screw up the search order.
	 * And __tree_search() can't return the adjacent ordered extents
	 * either, thus here we do our own search.
	 */
	while (node) {
		entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);

		if (file_offset < entry->file_offset) {
			node = node->rb_left;
		} else if (file_offset >= entry_end(entry)) {
			node = node->rb_right;
		} else {
			/*
			 * Direct hit, got an ordered extent that starts at
			 * @file_offset
			 */
			goto out;
		}
	}
	if (!entry) {
		/* Empty tree */
		goto out;
	}

	cur = &entry->rb_node;
	/* We got an entry around @file_offset, check adjacent entries */
	if (entry->file_offset < file_offset) {
		prev = cur;
		next = rb_next(cur);
	} else {
		prev = rb_prev(cur);
		next = cur;
	}
	if (prev) {
		entry = rb_entry(prev, struct btrfs_ordered_extent, rb_node);
		if (range_overlaps(entry, file_offset, len))
			goto out;
	}
	if (next) {
		entry = rb_entry(next, struct btrfs_ordered_extent, rb_node);
		if (range_overlaps(entry, file_offset, len))
			goto out;
	}
	/* No ordered extent in the range */
	entry = NULL;
out:
	if (entry) {
		refcount_inc(&entry->refs);
		trace_btrfs_ordered_extent_lookup_first_range(inode, entry);
	}

	spin_unlock_irq(&inode->ordered_tree_lock);
	return entry;
}

/*
 * Lock the passed range and ensure all pending ordered extents in it are run
 * to completion.
 *
 * @inode:        Inode whose ordered tree is to be searched
 * @start:        Beginning of range to flush
 * @end:          Last byte of range to lock
 * @cached_state: If passed, will return the extent state responsible for the
 *                locked range. It's the caller's responsibility to free the
 *                cached state.
 *
 * Always return with the given range locked, ensuring that after it's called
 * no ordered extent can be pending.
 */
void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start,
					u64 end,
					struct extent_state **cached_state)
{
	struct btrfs_ordered_extent *ordered;
	struct extent_state *cache = NULL;
	struct extent_state **cachedp = &cache;

	if (cached_state)
		cachedp = cached_state;

	while (1) {
		lock_extent(&inode->io_tree, start, end, cachedp);
		ordered = btrfs_lookup_ordered_range(inode, start,
						     end - start + 1);
		if (!ordered) {
			/*
			 * If no external cached_state has been passed then
			 * decrement the extra ref taken for cachedp since we
			 * aren't exposing it outside of this function
			 */
			if (!cached_state)
				refcount_dec(&cache->refs);
			break;
		}
		unlock_extent(&inode->io_tree, start, end, cachedp);
		btrfs_start_ordered_extent(ordered);
		btrfs_put_ordered_extent(ordered);
	}
}

/*
 * Lock the passed range and ensure all pending ordered extents in it are run
 * to completion in nowait mode.
 *
 * Return true if the range was locked and no ordered extents were found in
 * it, false otherwise.
 */
bool btrfs_try_lock_ordered_range(struct btrfs_inode *inode, u64 start, u64 end,
				  struct extent_state **cached_state)
{
	struct btrfs_ordered_extent *ordered;

	if (!try_lock_extent(&inode->io_tree, start, end, cached_state))
		return false;

	ordered = btrfs_lookup_ordered_range(inode, start, end - start + 1);
	if (!ordered)
		return true;

	btrfs_put_ordered_extent(ordered);
	unlock_extent(&inode->io_tree, start, end, cached_state);

	return false;
}

/* Split out a new ordered extent for the first @len bytes of @ordered. */
struct btrfs_ordered_extent *btrfs_split_ordered_extent(
			struct btrfs_ordered_extent *ordered, u64 len)
{
	struct btrfs_inode *inode = BTRFS_I(ordered->inode);
	struct btrfs_root *root = inode->root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 file_offset = ordered->file_offset;
	u64 disk_bytenr = ordered->disk_bytenr;
	unsigned long flags = ordered->flags;
	struct btrfs_ordered_sum *sum, *tmpsum;
	struct btrfs_ordered_extent *new;
	struct rb_node *node;
	u64 offset = 0;

	trace_btrfs_ordered_extent_split(inode, ordered);

	ASSERT(!(flags & (1U << BTRFS_ORDERED_COMPRESSED)));

	/*
	 * The entire bio must be covered by the ordered extent, but we can't
	 * reduce the original extent to a zero length either.
	 */
	if (WARN_ON_ONCE(len >= ordered->num_bytes))
		return ERR_PTR(-EINVAL);
	/* We cannot split partially completed ordered extents. */
	if (ordered->bytes_left) {
		ASSERT(!(flags & ~BTRFS_ORDERED_TYPE_FLAGS));
		if (WARN_ON_ONCE(ordered->bytes_left != ordered->disk_num_bytes))
			return ERR_PTR(-EINVAL);
	}
	/* We cannot split a compressed ordered extent. */
	if (WARN_ON_ONCE(ordered->disk_num_bytes != ordered->num_bytes))
		return ERR_PTR(-EINVAL);

	new = alloc_ordered_extent(inode, file_offset, len, len, disk_bytenr,
				   len, 0, flags, ordered->compress_type);
	if (IS_ERR(new))
		return new;

	/* One ref for the tree. */
	refcount_inc(&new->refs);

	spin_lock_irq(&root->ordered_extent_lock);
	spin_lock(&inode->ordered_tree_lock);
	/* Remove from tree once */
	node = &ordered->rb_node;
	rb_erase(node, &inode->ordered_tree);
	RB_CLEAR_NODE(node);
	if (inode->ordered_tree_last == node)
		inode->ordered_tree_last = NULL;

	ordered->file_offset += len;
	ordered->disk_bytenr += len;
	ordered->num_bytes -= len;
	ordered->disk_num_bytes -= len;
	ordered->ram_bytes -= len;

	if (test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags)) {
		ASSERT(ordered->bytes_left == 0);
		new->bytes_left = 0;
	} else {
		ordered->bytes_left -= len;
	}

	if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags)) {
		if (ordered->truncated_len > len) {
			ordered->truncated_len -= len;
		} else {
			new->truncated_len = ordered->truncated_len;
			ordered->truncated_len = 0;
		}
	}

	list_for_each_entry_safe(sum, tmpsum, &ordered->list, list) {
		if (offset == len)
			break;
		list_move_tail(&sum->list, &new->list);
		offset += sum->len;
	}

	/* Re-insert the node */
	node = tree_insert(&inode->ordered_tree, ordered->file_offset,
			   &ordered->rb_node);
	if (node)
		btrfs_panic(fs_info, -EEXIST,
			    "zoned: inconsistency in ordered tree at offset %llu",
			    ordered->file_offset);

	node = tree_insert(&inode->ordered_tree, new->file_offset, &new->rb_node);
	if (node)
		btrfs_panic(fs_info, -EEXIST,
			    "zoned: inconsistency in ordered tree at offset %llu",
			    new->file_offset);
	spin_unlock(&inode->ordered_tree_lock);

	list_add_tail(&new->root_extent_list, &root->ordered_extents);
	root->nr_ordered_extents++;
	spin_unlock_irq(&root->ordered_extent_lock);
	return new;
}

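/* Set up the slab cache for struct btrfs_ordered_extent at module init time. */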
int __init ordered_data_init(void)
{
	btrfs_ordered_extent_cache = KMEM_CACHE(btrfs_ordered_extent, 0);
	if (!btrfs_ordered_extent_cache)
		return -ENOMEM;

	return 0;
}

void __cold ordered_data_exit(void)
{
	kmem_cache_destroy(btrfs_ordered_extent_cache);
}