// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007 Oracle. All rights reserved.
 */

#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/writeback.h>
#include <linux/pagemap.h>
#include <linux/blkdev.h>
#include <linux/uuid.h>
#include <linux/timekeeping.h>
#include "misc.h"
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "locking.h"
#include "tree-log.h"
#include "volumes.h"
#include "dev-replace.h"
#include "qgroup.h"
#include "block-group.h"
#include "space-info.h"
#include "zoned.h"
#include "fs.h"
#include "accessors.h"
#include "extent-tree.h"
#include "root-tree.h"
#include "defrag.h"
#include "dir-item.h"
#include "uuid-tree.h"
#include "ioctl.h"
#include "relocation.h"
#include "scrub.h"

static struct kmem_cache *btrfs_trans_handle_cachep;

/*
 * Transaction states and transitions
 *
 * No running transaction (fs tree blocks are not modified)
 * |
 * | To next stage:
 * |  Call start_transaction() variants, except btrfs_join_transaction_nostart().
 * V
 * Transaction N [[TRANS_STATE_RUNNING]]
 * |
 * | New trans handles can be attached to transaction N by calling all
 * | start_transaction() variants.
 * |
 * | To next stage:
 * |  Call btrfs_commit_transaction() on any trans handle attached to
 * |  transaction N
 * V
 * Transaction N [[TRANS_STATE_COMMIT_PREP]]
 * |
 * | If there are simultaneous calls to btrfs_commit_transaction() one will win
 * | the race and the rest will wait for the winner to commit the transaction.
 * |
 * | The winner will wait for the previous running transaction to completely
 * | finish if there is one.
 * |
 * Transaction N [[TRANS_STATE_COMMIT_START]]
 * |
 * | Then one of the following happens:
 * | - Wait for all other trans handle holders to release.
 * |   The btrfs_commit_transaction() caller will do the commit work.
 * | - Wait for current transaction to be committed by others.
 * |   Other btrfs_commit_transaction() caller will do the commit work.
 * |
 * | At this stage, only btrfs_join_transaction*() variants can attach
 * | to this running transaction.
 * | All other variants will wait for current one to finish and attach to
 * | transaction N+1.
 * |
 * | To next stage:
 * |  The caller is chosen to commit transaction N, and all other trans
 * |  handles have been released.
 * V
 * Transaction N [[TRANS_STATE_COMMIT_DOING]]
 * |
 * | The heavy lifting transaction work is started.
 * | From running delayed refs (modifying extent tree) to creating pending
 * | snapshots, running qgroups.
 * | In short, modify supporting trees to reflect modifications of subvolume
 * | trees.
 * |
 * | At this stage, all start_transaction() calls will wait for this
 * | transaction to finish and attach to transaction N+1.
 * |
 * | To next stage:
 * |  Until all supporting trees are updated.
 * V
 * Transaction N [[TRANS_STATE_UNBLOCKED]]
 * |                                                 Transaction N+1
 * | All needed trees are modified, thus we only     [[TRANS_STATE_RUNNING]]
 * | need to write them back to disk and update      |
 * | super blocks.                                   |
 * |                                                 |
 * | At this stage, new transaction is allowed to    |
 * | start.                                          |
 * | All new start_transaction() calls will be       |
 * | attached to transid N+1.                        |
 * |                                                 |
 * | To next stage:                                  |
 * |  Until all tree blocks and super blocks are     |
 * |  written to block devices                       |
 * V                                                 |
 * Transaction N [[TRANS_STATE_COMPLETED]]           V
 *   All tree blocks and super blocks are written.   Transaction N+1
 *   This transaction is finished and all its        [[TRANS_STATE_COMMIT_START]]
 *   data structures will be cleaned up.             | Life goes on
 */
static const unsigned int btrfs_blocked_trans_types[TRANS_STATE_MAX] = {
	[TRANS_STATE_RUNNING]		= 0U,
	[TRANS_STATE_COMMIT_PREP]	= 0U,
	[TRANS_STATE_COMMIT_START]	= (__TRANS_START | __TRANS_ATTACH),
	[TRANS_STATE_COMMIT_DOING]	= (__TRANS_START |
					   __TRANS_ATTACH |
					   __TRANS_JOIN |
					   __TRANS_JOIN_NOSTART),
	[TRANS_STATE_UNBLOCKED]		= (__TRANS_START |
					   __TRANS_ATTACH |
					   __TRANS_JOIN |
					   __TRANS_JOIN_NOLOCK |
					   __TRANS_JOIN_NOSTART),
	[TRANS_STATE_SUPER_COMMITTED]	= (__TRANS_START |
					   __TRANS_ATTACH |
					   __TRANS_JOIN |
					   __TRANS_JOIN_NOLOCK |
					   __TRANS_JOIN_NOSTART),
	[TRANS_STATE_COMPLETED]		= (__TRANS_START |
					   __TRANS_ATTACH |
					   __TRANS_JOIN |
					   __TRANS_JOIN_NOLOCK |
					   __TRANS_JOIN_NOSTART),
};

void btrfs_put_transaction(struct btrfs_transaction *transaction)
{
	WARN_ON(refcount_read(&transaction->use_count) == 0);
	if (refcount_dec_and_test(&transaction->use_count)) {
		BUG_ON(!list_empty(&transaction->list));
		WARN_ON(!RB_EMPTY_ROOT(
				&transaction->delayed_refs.href_root.rb_root));
		WARN_ON(!RB_EMPTY_ROOT(
				&transaction->delayed_refs.dirty_extent_root));
		if (transaction->delayed_refs.pending_csums)
			btrfs_err(transaction->fs_info,
				  "pending csums is %llu",
				  transaction->delayed_refs.pending_csums);
		/*
		 * If any block groups are found in ->deleted_bgs then it's
		 * because the transaction was aborted and a commit did not
		 * happen (things failed before writing the new superblock
		 * and calling btrfs_finish_extent_commit()), so we can not
		 * discard the physical locations of the block groups.
		 */
		while (!list_empty(&transaction->deleted_bgs)) {
			struct btrfs_block_group *cache;

			cache = list_first_entry(&transaction->deleted_bgs,
						 struct btrfs_block_group,
						 bg_list);
			list_del_init(&cache->bg_list);
			btrfs_unfreeze_block_group(cache);
			btrfs_put_block_group(cache);
		}
		WARN_ON(!list_empty(&transaction->dev_update_list));
		kfree(transaction);
	}
}

static noinline void switch_commit_roots(struct btrfs_trans_handle *trans)
{
	struct btrfs_transaction *cur_trans = trans->transaction;
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_root *root, *tmp;

	/*
	 * At this point no one can be using this transaction to modify any tree
	 * and no one can start another transaction to modify any tree either.
	 */
	ASSERT(cur_trans->state == TRANS_STATE_COMMIT_DOING);

	down_write(&fs_info->commit_root_sem);

	if (test_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags))
		fs_info->last_reloc_trans = trans->transid;

	list_for_each_entry_safe(root, tmp, &cur_trans->switch_commits,
				 dirty_list) {
		list_del_init(&root->dirty_list);
		free_extent_buffer(root->commit_root);
		root->commit_root = btrfs_root_node(root);
		extent_io_tree_release(&root->dirty_log_pages);
		btrfs_qgroup_clean_swapped_blocks(root);
	}

	/* We can free old roots now. */
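	/*
	 * Process the dropped roots one at a time: each entry is removed from
	 * the list first and the spinlock is released while btrfs_free_log()
	 * and btrfs_drop_and_free_fs_root() run, then the lock is re-acquired
	 * to look at the next entry.
	 */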
	spin_lock(&cur_trans->dropped_roots_lock);
	while (!list_empty(&cur_trans->dropped_roots)) {
		root = list_first_entry(&cur_trans->dropped_roots,
					struct btrfs_root, root_list);
		list_del_init(&root->root_list);
		spin_unlock(&cur_trans->dropped_roots_lock);
		btrfs_free_log(trans, root);
		btrfs_drop_and_free_fs_root(fs_info, root);
		spin_lock(&cur_trans->dropped_roots_lock);
	}
	spin_unlock(&cur_trans->dropped_roots_lock);

	up_write(&fs_info->commit_root_sem);
}

static inline void extwriter_counter_inc(struct btrfs_transaction *trans,
					 unsigned int type)
{
	if (type & TRANS_EXTWRITERS)
		atomic_inc(&trans->num_extwriters);
}

static inline void extwriter_counter_dec(struct btrfs_transaction *trans,
					 unsigned int type)
{
	if (type & TRANS_EXTWRITERS)
		atomic_dec(&trans->num_extwriters);
}

static inline void extwriter_counter_init(struct btrfs_transaction *trans,
					  unsigned int type)
{
	atomic_set(&trans->num_extwriters, ((type & TRANS_EXTWRITERS) ? 1 : 0));
}

static inline int extwriter_counter_read(struct btrfs_transaction *trans)
{
	return atomic_read(&trans->num_extwriters);
}

/*
 * To be called after doing the chunk btree updates right after allocating a new
 * chunk (after btrfs_chunk_alloc_add_chunk_item() is called), when removing a
 * chunk after all chunk btree updates and after finishing the second phase of
 * chunk allocation (btrfs_create_pending_block_groups()) in case some block
 * group had its chunk item insertion delayed to the second phase.
 */
void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;

	if (!trans->chunk_bytes_reserved)
		return;

	btrfs_block_rsv_release(fs_info, &fs_info->chunk_block_rsv,
				trans->chunk_bytes_reserved, NULL);
	trans->chunk_bytes_reserved = 0;
}

/*
 * either allocate a new transaction or hop into the existing one
 */
static noinline int join_transaction(struct btrfs_fs_info *fs_info,
				     unsigned int type)
{
	struct btrfs_transaction *cur_trans;

	spin_lock(&fs_info->trans_lock);
loop:
	/* The file system has been taken offline. No new transactions. */
	if (BTRFS_FS_ERROR(fs_info)) {
		spin_unlock(&fs_info->trans_lock);
		return -EROFS;
	}

	cur_trans = fs_info->running_transaction;
	if (cur_trans) {
		if (TRANS_ABORTED(cur_trans)) {
			spin_unlock(&fs_info->trans_lock);
			return cur_trans->aborted;
		}
		if (btrfs_blocked_trans_types[cur_trans->state] & type) {
			spin_unlock(&fs_info->trans_lock);
			return -EBUSY;
		}
		refcount_inc(&cur_trans->use_count);
		atomic_inc(&cur_trans->num_writers);
		extwriter_counter_inc(cur_trans, type);
		spin_unlock(&fs_info->trans_lock);
		btrfs_lockdep_acquire(fs_info, btrfs_trans_num_writers);
		btrfs_lockdep_acquire(fs_info, btrfs_trans_num_extwriters);
		return 0;
	}
	spin_unlock(&fs_info->trans_lock);

	/*
	 * If we are ATTACH or TRANS_JOIN_NOSTART, we just want to catch the
	 * current transaction, and commit it. If there is no transaction, just
	 * return ENOENT.
	 */
	if (type == TRANS_ATTACH || type == TRANS_JOIN_NOSTART)
		return -ENOENT;

	/*
	 * JOIN_NOLOCK only happens during the transaction commit, so
	 * it is impossible that ->running_transaction is NULL
	 */
	BUG_ON(type == TRANS_JOIN_NOLOCK);

	cur_trans = kmalloc(sizeof(*cur_trans), GFP_NOFS);
	if (!cur_trans)
		return -ENOMEM;

	btrfs_lockdep_acquire(fs_info, btrfs_trans_num_writers);
	btrfs_lockdep_acquire(fs_info, btrfs_trans_num_extwriters);

	spin_lock(&fs_info->trans_lock);
	if (fs_info->running_transaction) {
		/*
		 * someone started a transaction after we unlocked. Make sure
		 * to redo the checks above
		 */
		btrfs_lockdep_release(fs_info, btrfs_trans_num_extwriters);
		btrfs_lockdep_release(fs_info, btrfs_trans_num_writers);
		kfree(cur_trans);
		goto loop;
	} else if (BTRFS_FS_ERROR(fs_info)) {
		spin_unlock(&fs_info->trans_lock);
		btrfs_lockdep_release(fs_info, btrfs_trans_num_extwriters);
		btrfs_lockdep_release(fs_info, btrfs_trans_num_writers);
		kfree(cur_trans);
		return -EROFS;
	}

	cur_trans->fs_info = fs_info;
	atomic_set(&cur_trans->pending_ordered, 0);
	init_waitqueue_head(&cur_trans->pending_wait);
	atomic_set(&cur_trans->num_writers, 1);
	extwriter_counter_init(cur_trans, type);
	init_waitqueue_head(&cur_trans->writer_wait);
	init_waitqueue_head(&cur_trans->commit_wait);
	cur_trans->state = TRANS_STATE_RUNNING;
	/*
	 * One for this trans handle, one so it will live on until we
	 * commit the transaction.
	 */
	refcount_set(&cur_trans->use_count, 2);
	cur_trans->flags = 0;
	cur_trans->start_time = ktime_get_seconds();

	memset(&cur_trans->delayed_refs, 0, sizeof(cur_trans->delayed_refs));

	cur_trans->delayed_refs.href_root = RB_ROOT_CACHED;
	cur_trans->delayed_refs.dirty_extent_root = RB_ROOT;
	atomic_set(&cur_trans->delayed_refs.num_entries, 0);

	/*
	 * although the tree mod log is per file system and not per transaction,
	 * the log must never go across transaction boundaries.
	 */
	smp_mb();
	if (!list_empty(&fs_info->tree_mod_seq_list))
		WARN(1, KERN_ERR "BTRFS: tree_mod_seq_list not empty when creating a fresh transaction\n");
	if (!RB_EMPTY_ROOT(&fs_info->tree_mod_log))
		WARN(1, KERN_ERR "BTRFS: tree_mod_log rb tree not empty when creating a fresh transaction\n");
	atomic64_set(&fs_info->tree_mod_seq, 0);

	spin_lock_init(&cur_trans->delayed_refs.lock);

	INIT_LIST_HEAD(&cur_trans->pending_snapshots);
	INIT_LIST_HEAD(&cur_trans->dev_update_list);
	INIT_LIST_HEAD(&cur_trans->switch_commits);
	INIT_LIST_HEAD(&cur_trans->dirty_bgs);
	INIT_LIST_HEAD(&cur_trans->io_bgs);
	INIT_LIST_HEAD(&cur_trans->dropped_roots);
	mutex_init(&cur_trans->cache_write_mutex);
	spin_lock_init(&cur_trans->dirty_bgs_lock);
	INIT_LIST_HEAD(&cur_trans->deleted_bgs);
	spin_lock_init(&cur_trans->dropped_roots_lock);
	list_add_tail(&cur_trans->list, &fs_info->trans_list);
	extent_io_tree_init(fs_info, &cur_trans->dirty_pages,
			    IO_TREE_TRANS_DIRTY_PAGES);
	extent_io_tree_init(fs_info, &cur_trans->pinned_extents,
			    IO_TREE_FS_PINNED_EXTENTS);
	btrfs_set_fs_generation(fs_info, fs_info->generation + 1);
	cur_trans->transid = fs_info->generation;
	fs_info->running_transaction = cur_trans;
	cur_trans->aborted = 0;
	spin_unlock(&fs_info->trans_lock);

	return 0;
}

/*
 * This does all the record keeping required to make sure that a shareable root
 * is properly recorded in a given transaction. This is required to make sure
 * the old root from before we joined the transaction is deleted when the
 * transaction commits.
 */
static int record_root_in_trans(struct btrfs_trans_handle *trans,
				struct btrfs_root *root,
				int force)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	int ret = 0;

	if ((test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
	    root->last_trans < trans->transid) || force) {
		WARN_ON(!force && root->commit_root != root->node);

		/*
		 * see below for IN_TRANS_SETUP usage rules
		 * we have the reloc mutex held now, so there
		 * is only one writer in this function
		 */
		set_bit(BTRFS_ROOT_IN_TRANS_SETUP, &root->state);

		/* make sure readers find IN_TRANS_SETUP before
		 * they find our root->last_trans update
		 */
		smp_wmb();

		spin_lock(&fs_info->fs_roots_radix_lock);
		if (root->last_trans == trans->transid && !force) {
			spin_unlock(&fs_info->fs_roots_radix_lock);
			return 0;
		}
		radix_tree_tag_set(&fs_info->fs_roots_radix,
				   (unsigned long)root->root_key.objectid,
				   BTRFS_ROOT_TRANS_TAG);
		spin_unlock(&fs_info->fs_roots_radix_lock);
		root->last_trans = trans->transid;

		/* this is pretty tricky.  We don't want to
		 * take the relocation lock in btrfs_record_root_in_trans
		 * unless we're really doing the first setup for this root in
		 * this transaction.
		 *
		 * Normally we'd use root->last_trans as a flag to decide
		 * if we want to take the expensive mutex.
		 *
		 * But, we have to set root->last_trans before we
		 * init the relocation root, otherwise, we trip over warnings
		 * in ctree.c.  The solution used here is to flag ourselves
		 * with root IN_TRANS_SETUP.  When this is 1, we're still
		 * fixing up the reloc trees and everyone must wait.
		 *
		 * When this is zero, they can trust root->last_trans and fly
		 * through btrfs_record_root_in_trans without having to take the
		 * lock.  smp_wmb() makes sure that all the writes above are
		 * done before we pop in the zero below
		 */
		ret = btrfs_init_reloc_root(trans, root);
		smp_mb__before_atomic();
		clear_bit(BTRFS_ROOT_IN_TRANS_SETUP, &root->state);
	}
	return ret;
}

void btrfs_add_dropped_root(struct btrfs_trans_handle *trans,
			    struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_transaction *cur_trans = trans->transaction;

	/* Add ourselves to the transaction dropped list */
	spin_lock(&cur_trans->dropped_roots_lock);
	list_add_tail(&root->root_list, &cur_trans->dropped_roots);
	spin_unlock(&cur_trans->dropped_roots_lock);

	/* Make sure we don't try to update the root at commit time */
	spin_lock(&fs_info->fs_roots_radix_lock);
	radix_tree_tag_clear(&fs_info->fs_roots_radix,
			     (unsigned long)root->root_key.objectid,
			     BTRFS_ROOT_TRANS_TAG);
	spin_unlock(&fs_info->fs_roots_radix_lock);
}

int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	int ret;

	if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
		return 0;

	/*
	 * see record_root_in_trans for comments about IN_TRANS_SETUP usage
	 * and barriers
	 */
	smp_rmb();
	if (root->last_trans == trans->transid &&
	    !test_bit(BTRFS_ROOT_IN_TRANS_SETUP, &root->state))
		return 0;

	mutex_lock(&fs_info->reloc_mutex);
	ret = record_root_in_trans(trans, root, 0);
	mutex_unlock(&fs_info->reloc_mutex);

	return ret;
}

static inline int is_transaction_blocked(struct btrfs_transaction *trans)
{
	return (trans->state >= TRANS_STATE_COMMIT_START &&
		trans->state < TRANS_STATE_UNBLOCKED &&
		!TRANS_ABORTED(trans));
}

/* Wait for a commit against the current transaction to become unblocked.
 * When this is done, it is safe to start a new transaction, but the current
 * transaction might not be fully on disk.
 */
static void wait_current_trans(struct btrfs_fs_info *fs_info)
{
	struct btrfs_transaction *cur_trans;

	spin_lock(&fs_info->trans_lock);
	cur_trans = fs_info->running_transaction;
	if (cur_trans && is_transaction_blocked(cur_trans)) {
		refcount_inc(&cur_trans->use_count);
		spin_unlock(&fs_info->trans_lock);

		btrfs_might_wait_for_state(fs_info, BTRFS_LOCKDEP_TRANS_UNBLOCKED);
		wait_event(fs_info->transaction_wait,
			   cur_trans->state >= TRANS_STATE_UNBLOCKED ||
			   TRANS_ABORTED(cur_trans));
		btrfs_put_transaction(cur_trans);
	} else {
		spin_unlock(&fs_info->trans_lock);
	}
}

static int may_wait_transaction(struct btrfs_fs_info *fs_info, int type)
{
	if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags))
		return 0;

	if (type == TRANS_START)
		return 1;

	return 0;
}

static inline bool need_reserve_reloc_root(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;

	if (!fs_info->reloc_ctl ||
	    !test_bit(BTRFS_ROOT_SHAREABLE, &root->state) ||
	    root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
	    root->reloc_root)
		return false;

	return true;
}

static int btrfs_reserve_trans_metadata(struct btrfs_fs_info *fs_info,
					enum btrfs_reserve_flush_enum flush,
					u64 num_bytes,
					u64 *delayed_refs_bytes)
{
	struct btrfs_space_info *si = fs_info->trans_block_rsv.space_info;
	u64 bytes = num_bytes + *delayed_refs_bytes;
	int ret;

	/*
	 * We want to reserve all the bytes we may need all at once, so we only
	 * do 1 enospc flushing cycle per transaction start.
	 */
	ret = btrfs_reserve_metadata_bytes(fs_info, si, bytes, flush);

	/*
	 * If we are an emergency flush, which can steal from the global block
	 * reserve, then attempt to not reserve space for the delayed refs, as
	 * we will consume space for them from the global block reserve.
	 */
	if (ret && flush == BTRFS_RESERVE_FLUSH_ALL_STEAL) {
		bytes -= *delayed_refs_bytes;
		*delayed_refs_bytes = 0;
		ret = btrfs_reserve_metadata_bytes(fs_info, si, bytes, flush);
	}

	return ret;
}

static struct btrfs_trans_handle *
start_transaction(struct btrfs_root *root, unsigned int num_items,
		  unsigned int type, enum btrfs_reserve_flush_enum flush,
		  bool enforce_qgroups)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
	struct btrfs_block_rsv *trans_rsv = &fs_info->trans_block_rsv;
	struct btrfs_trans_handle *h;
	struct btrfs_transaction *cur_trans;
	u64 num_bytes = 0;
	u64 qgroup_reserved = 0;
	u64 delayed_refs_bytes = 0;
	bool reloc_reserved = false;
	bool do_chunk_alloc = false;
	int ret;

	if (BTRFS_FS_ERROR(fs_info))
		return ERR_PTR(-EROFS);

	if (current->journal_info) {
		WARN_ON(type & TRANS_EXTWRITERS);
		h = current->journal_info;
		refcount_inc(&h->use_count);
		WARN_ON(refcount_read(&h->use_count) > 2);
		h->orig_rsv = h->block_rsv;
		h->block_rsv = NULL;
		goto got_it;
	}

	/*
	 * Do the reservation before we join the transaction so we can do all
	 * the appropriate flushing if need be.
	 */
	if (num_items && root != fs_info->chunk_root) {
		qgroup_reserved = num_items * fs_info->nodesize;
		/*
		 * Use prealloc for now, as there might be a currently running
		 * transaction that could free this reserved space prematurely
		 * by committing.
		 */
		ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_reserved,
							 enforce_qgroups, false);
		if (ret)
			return ERR_PTR(ret);

		num_bytes = btrfs_calc_insert_metadata_size(fs_info, num_items);
		/*
		 * If we plan to insert/update/delete "num_items" from a btree,
		 * we will also generate delayed refs for extent buffers in the
		 * respective btree paths, so reserve space for the delayed refs
		 * that will be generated by the caller as it modifies btrees.
		 * Try to reserve them to avoid excessive use of the global
		 * block reserve.
		 */
		delayed_refs_bytes = btrfs_calc_delayed_ref_bytes(fs_info, num_items);

		/*
		 * Do the reservation for the relocation root creation
		 */
		if (need_reserve_reloc_root(root)) {
			num_bytes += fs_info->nodesize;
			reloc_reserved = true;
		}

		ret = btrfs_reserve_trans_metadata(fs_info, flush, num_bytes,
						   &delayed_refs_bytes);
		if (ret)
			goto reserve_fail;

		btrfs_block_rsv_add_bytes(trans_rsv, num_bytes, true);

		if (trans_rsv->space_info->force_alloc)
			do_chunk_alloc = true;
	} else if (num_items == 0 && flush == BTRFS_RESERVE_FLUSH_ALL &&
		   !btrfs_block_rsv_full(delayed_refs_rsv)) {
		/*
		 * Some people call with btrfs_start_transaction(root, 0)
		 * because they can be throttled, but have some other mechanism
		 * for reserving space.  We still want these guys to refill the
		 * delayed block_rsv so just add 1 item's worth of reservation
		 * here.
		 */
		ret = btrfs_delayed_refs_rsv_refill(fs_info, flush);
		if (ret)
			goto reserve_fail;
	}
again:
	h = kmem_cache_zalloc(btrfs_trans_handle_cachep, GFP_NOFS);
	if (!h) {
		ret = -ENOMEM;
		goto alloc_fail;
	}

	/*
	 * If we are JOIN_NOLOCK we're already committing a transaction and
	 * waiting on this guy, so we don't need to do the sb_start_intwrite
	 * because we're already holding a ref.  We need this because we could
	 * have raced in and done an fsync() on a file which can kick a commit
	 * and then we deadlock with somebody doing a freeze.
	 *
	 * If we are ATTACH, it means we just want to catch the current
	 * transaction and commit it, so we needn't do sb_start_intwrite().
	 */
	if (type & __TRANS_FREEZABLE)
		sb_start_intwrite(fs_info->sb);

	if (may_wait_transaction(fs_info, type))
		wait_current_trans(fs_info);

	do {
		ret = join_transaction(fs_info, type);
		if (ret == -EBUSY) {
			wait_current_trans(fs_info);
			if (unlikely(type == TRANS_ATTACH ||
				     type == TRANS_JOIN_NOSTART))
				ret = -ENOENT;
		}
	} while (ret == -EBUSY);

	if (ret < 0)
		goto join_fail;

	cur_trans = fs_info->running_transaction;

	h->transid = cur_trans->transid;
	h->transaction = cur_trans;
	refcount_set(&h->use_count, 1);
	h->fs_info = root->fs_info;

	h->type = type;
	INIT_LIST_HEAD(&h->new_bgs);
	btrfs_init_metadata_block_rsv(fs_info, &h->delayed_rsv, BTRFS_BLOCK_RSV_DELOPS);

	smp_mb();
	if (cur_trans->state >= TRANS_STATE_COMMIT_START &&
	    may_wait_transaction(fs_info, type)) {
		current->journal_info = h;
		btrfs_commit_transaction(h);
		goto again;
	}

	if (num_bytes) {
		trace_btrfs_space_reservation(fs_info, "transaction",
					      h->transid, num_bytes, 1);
		h->block_rsv = trans_rsv;
		h->bytes_reserved = num_bytes;
		if (delayed_refs_bytes > 0) {
			trace_btrfs_space_reservation(fs_info,
						      "local_delayed_refs_rsv",
						      h->transid,
						      delayed_refs_bytes, 1);
			h->delayed_refs_bytes_reserved = delayed_refs_bytes;
			btrfs_block_rsv_add_bytes(&h->delayed_rsv, delayed_refs_bytes, true);
			delayed_refs_bytes = 0;
		}
		h->reloc_reserved = reloc_reserved;
	}

	/*
	 * Now that we have found a transaction to be a part of, convert the
	 * qgroup reservation from prealloc to pertrans.  A different transaction
	 * can't race in and free our pertrans out from under us.
	 */
	if (qgroup_reserved)
		btrfs_qgroup_convert_reserved_meta(root, qgroup_reserved);

got_it:
	if (!current->journal_info)
		current->journal_info = h;

	/*
	 * If the space_info is marked ALLOC_FORCE then we'll get upgraded to
	 * ALLOC_FORCE the first run through, and then we won't allocate for
	 * anybody else who races in later.  We don't care about the return
	 * value here.
	 */
	if (do_chunk_alloc && num_bytes) {
		u64 flags = h->block_rsv->space_info->flags;

		btrfs_chunk_alloc(h, btrfs_get_alloc_profile(fs_info, flags),
				  CHUNK_ALLOC_NO_FORCE);
	}

	/*
	 * btrfs_record_root_in_trans() needs to alloc new extents, and may
	 * call btrfs_join_transaction() while we're also starting a
	 * transaction.
	 *
	 * Thus it needs to be called after current->journal_info is initialized,
	 * or we can deadlock.
	 */
	ret = btrfs_record_root_in_trans(h, root);
	if (ret) {
		/*
		 * The transaction handle is fully initialized and linked with
		 * other structures so it needs to be ended in case of errors,
		 * not just freed.
		 */
		btrfs_end_transaction(h);
		return ERR_PTR(ret);
	}

	return h;

join_fail:
	if (type & __TRANS_FREEZABLE)
		sb_end_intwrite(fs_info->sb);
	kmem_cache_free(btrfs_trans_handle_cachep, h);
alloc_fail:
	if (num_bytes)
		btrfs_block_rsv_release(fs_info, trans_rsv, num_bytes, NULL);
	if (delayed_refs_bytes)
		btrfs_space_info_free_bytes_may_use(fs_info, trans_rsv->space_info,
						    delayed_refs_bytes);
reserve_fail:
	btrfs_qgroup_free_meta_prealloc(root, qgroup_reserved);
	return ERR_PTR(ret);
}

struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
						   unsigned int num_items)
{
	return start_transaction(root, num_items, TRANS_START,
				 BTRFS_RESERVE_FLUSH_ALL, true);
}

struct btrfs_trans_handle *btrfs_start_transaction_fallback_global_rsv(
					struct btrfs_root *root,
					unsigned int num_items)
{
	return start_transaction(root, num_items, TRANS_START,
				 BTRFS_RESERVE_FLUSH_ALL_STEAL, false);
}

struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root)
{
	return start_transaction(root, 0, TRANS_JOIN, BTRFS_RESERVE_NO_FLUSH,
				 true);
}

struct btrfs_trans_handle *btrfs_join_transaction_spacecache(struct btrfs_root *root)
{
	return start_transaction(root, 0, TRANS_JOIN_NOLOCK,
				 BTRFS_RESERVE_NO_FLUSH, true);
}

/*
 * Similar to regular join but it never starts a transaction when none is
 * running or when there's a running one at a state >= TRANS_STATE_UNBLOCKED.
 * This is similar to btrfs_attach_transaction() but it allows the join to
 * happen if the transaction commit already started but it's not yet in the
 * "doing" phase (the state is < TRANS_STATE_COMMIT_DOING).
 */
struct btrfs_trans_handle *btrfs_join_transaction_nostart(struct btrfs_root *root)
{
	return start_transaction(root, 0, TRANS_JOIN_NOSTART,
				 BTRFS_RESERVE_NO_FLUSH, true);
}

/*
 * Catch the running transaction.
 *
 * It is used when we want to commit the current transaction, but
 * don't want to start a new one.
 *
 * Note: If this function returns -ENOENT, it just means there is no
 * running transaction.  But it is possible that the inactive transaction
 * is still in memory, not fully on disk.  If you hope there is no
 * inactive transaction in the fs when -ENOENT is returned, you should
 * invoke
 *     btrfs_attach_transaction_barrier()
 */
struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root)
{
	return start_transaction(root, 0, TRANS_ATTACH,
				 BTRFS_RESERVE_NO_FLUSH, true);
}

/*
 * Catch the running transaction.
 *
 * It is similar to the above function, the difference is this one
 * will wait for all the inactive transactions until they fully
 * complete.
 */
struct btrfs_trans_handle *
btrfs_attach_transaction_barrier(struct btrfs_root *root)
{
	struct btrfs_trans_handle *trans;

	trans = start_transaction(root, 0, TRANS_ATTACH,
				  BTRFS_RESERVE_NO_FLUSH, true);
	if (trans == ERR_PTR(-ENOENT)) {
		int ret;

		ret = btrfs_wait_for_commit(root->fs_info, 0);
		if (ret)
			return ERR_PTR(ret);
	}

	return trans;
}

/* Wait for a transaction commit to reach at least the given state. */
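/*
 * When waiting for TRANS_STATE_COMPLETED, the loop below also waits out any
 * older transactions still on fs_info->trans_list, so callers such as
 * btrfs_wait_for_commit() can treat "completed" as meaning everything up to
 * and including this transid has finished.
 */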
static noinline void wait_for_commit(struct btrfs_transaction *commit,
				     const enum btrfs_trans_state min_state)
{
	struct btrfs_fs_info *fs_info = commit->fs_info;
	u64 transid = commit->transid;
	bool put = false;

	/*
	 * At the moment this function is called with min_state either being
	 * TRANS_STATE_COMPLETED or TRANS_STATE_SUPER_COMMITTED.
	 */
	if (min_state == TRANS_STATE_COMPLETED)
		btrfs_might_wait_for_state(fs_info, BTRFS_LOCKDEP_TRANS_COMPLETED);
	else
		btrfs_might_wait_for_state(fs_info, BTRFS_LOCKDEP_TRANS_SUPER_COMMITTED);

	while (1) {
		wait_event(commit->commit_wait, commit->state >= min_state);
		if (put)
			btrfs_put_transaction(commit);

		if (min_state < TRANS_STATE_COMPLETED)
			break;

		/*
		 * A transaction isn't really completed until all of the
		 * previous transactions are completed, but with fsync we can
		 * end up with SUPER_COMMITTED transactions before a COMPLETED
		 * transaction. Wait for those.
		 */

		spin_lock(&fs_info->trans_lock);
		commit = list_first_entry_or_null(&fs_info->trans_list,
						  struct btrfs_transaction,
						  list);
		if (!commit || commit->transid > transid) {
			spin_unlock(&fs_info->trans_lock);
			break;
		}
		refcount_inc(&commit->use_count);
		put = true;
		spin_unlock(&fs_info->trans_lock);
	}
}

int btrfs_wait_for_commit(struct btrfs_fs_info *fs_info, u64 transid)
{
	struct btrfs_transaction *cur_trans = NULL, *t;
	int ret = 0;

	if (transid) {
		if (transid <= btrfs_get_last_trans_committed(fs_info))
			goto out;

		/* find specified transaction */
		spin_lock(&fs_info->trans_lock);
		list_for_each_entry(t, &fs_info->trans_list, list) {
			if (t->transid == transid) {
				cur_trans = t;
				refcount_inc(&cur_trans->use_count);
				ret = 0;
				break;
			}
			if (t->transid > transid) {
				ret = 0;
				break;
			}
		}
		spin_unlock(&fs_info->trans_lock);

		/*
		 * The specified transaction doesn't exist, or we
		 * raced with btrfs_commit_transaction
		 */
		if (!cur_trans) {
			if (transid > btrfs_get_last_trans_committed(fs_info))
				ret = -EINVAL;
			goto out;
		}
	} else {
		/* find newest transaction that is committing | committed */
		spin_lock(&fs_info->trans_lock);
		list_for_each_entry_reverse(t, &fs_info->trans_list,
					    list) {
			if (t->state >= TRANS_STATE_COMMIT_START) {
				if (t->state == TRANS_STATE_COMPLETED)
					break;
				cur_trans = t;
				refcount_inc(&cur_trans->use_count);
				break;
			}
		}
		spin_unlock(&fs_info->trans_lock);
		if (!cur_trans)
			goto out;	/* nothing committing|committed */
	}

	wait_for_commit(cur_trans, TRANS_STATE_COMPLETED);
	ret = cur_trans->aborted;
	btrfs_put_transaction(cur_trans);
out:
	return ret;
}

void btrfs_throttle(struct btrfs_fs_info *fs_info)
{
	wait_current_trans(fs_info);
}

bool btrfs_should_end_transaction(struct btrfs_trans_handle *trans)
{
	struct btrfs_transaction *cur_trans = trans->transaction;

	if (cur_trans->state >= TRANS_STATE_COMMIT_START ||
	    test_bit(BTRFS_DELAYED_REFS_FLUSHING, &cur_trans->delayed_refs.flags))
		return true;

	if (btrfs_check_space_for_delayed_refs(trans->fs_info))
		return true;

	return !!btrfs_block_rsv_check(&trans->fs_info->global_block_rsv, 50);
}

static void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;

	if (!trans->block_rsv) {
		ASSERT(!trans->bytes_reserved);
		ASSERT(!trans->delayed_refs_bytes_reserved);
		return;
	}

	if (!trans->bytes_reserved) {
		ASSERT(!trans->delayed_refs_bytes_reserved);
		return;
	}

	ASSERT(trans->block_rsv == &fs_info->trans_block_rsv);
	trace_btrfs_space_reservation(fs_info, "transaction",
				      trans->transid, trans->bytes_reserved, 0);
	btrfs_block_rsv_release(fs_info, trans->block_rsv,
				trans->bytes_reserved, NULL);
	trans->bytes_reserved = 0;

	if (!trans->delayed_refs_bytes_reserved)
		return;

	trace_btrfs_space_reservation(fs_info, "local_delayed_refs_rsv",
				      trans->transid,
				      trans->delayed_refs_bytes_reserved, 0);
	btrfs_block_rsv_release(fs_info, &trans->delayed_rsv,
				trans->delayed_refs_bytes_reserved, NULL);
	trans->delayed_refs_bytes_reserved = 0;
}

static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
				   int throttle)
{
	struct btrfs_fs_info *info = trans->fs_info;
	struct btrfs_transaction *cur_trans = trans->transaction;
	int err = 0;

	if (refcount_read(&trans->use_count) > 1) {
		refcount_dec(&trans->use_count);
		trans->block_rsv = trans->orig_rsv;
		return 0;
	}

	btrfs_trans_release_metadata(trans);
	trans->block_rsv = NULL;

	btrfs_create_pending_block_groups(trans);

	btrfs_trans_release_chunk_metadata(trans);

	if (trans->type & __TRANS_FREEZABLE)
		sb_end_intwrite(info->sb);

	WARN_ON(cur_trans != info->running_transaction);
	WARN_ON(atomic_read(&cur_trans->num_writers) < 1);
	atomic_dec(&cur_trans->num_writers);
	extwriter_counter_dec(cur_trans, trans->type);

	cond_wake_up(&cur_trans->writer_wait);

	btrfs_lockdep_release(info, btrfs_trans_num_extwriters);
	btrfs_lockdep_release(info, btrfs_trans_num_writers);

	btrfs_put_transaction(cur_trans);

	if (current->journal_info == trans)
		current->journal_info = NULL;

	if (throttle)
		btrfs_run_delayed_iputs(info);

	if (TRANS_ABORTED(trans) || BTRFS_FS_ERROR(info)) {
		wake_up_process(info->transaction_kthread);
		if (TRANS_ABORTED(trans))
			err = trans->aborted;
		else
			err = -EROFS;
	}

	kmem_cache_free(btrfs_trans_handle_cachep, trans);
	return err;
}

int btrfs_end_transaction(struct btrfs_trans_handle *trans)
{
	return __btrfs_end_transaction(trans, 0);
}

int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans)
{
	return __btrfs_end_transaction(trans, 1);
}

/*
 * when btree blocks are allocated, they have some corresponding bits set for
 * them in one of two extent_io trees.  This is used to make sure all of
 * those extents are sent to disk but does not wait on them
 */
int btrfs_write_marked_extents(struct btrfs_fs_info *fs_info,
			       struct extent_io_tree *dirty_pages, int mark)
{
	int err = 0;
	int werr = 0;
	struct address_space *mapping = fs_info->btree_inode->i_mapping;
	struct extent_state *cached_state = NULL;
	u64 start = 0;
	u64 end;

	while (find_first_extent_bit(dirty_pages, start, &start, &end,
				     mark, &cached_state)) {
		bool wait_writeback = false;

		err = convert_extent_bit(dirty_pages, start, end,
					 EXTENT_NEED_WAIT,
					 mark, &cached_state);
		/*
		 * convert_extent_bit can return -ENOMEM, which is most of the
		 * time a temporary error. So when it happens, ignore the error
		 * and wait for writeback of this range to finish - because we
		 * failed to set the bit EXTENT_NEED_WAIT for the range, a call
		 * to __btrfs_wait_marked_extents() would not know that
		 * writeback for this range started and therefore wouldn't
		 * wait for it to finish - we don't want to commit a
		 * superblock that points to btree nodes/leafs for which
		 * writeback hasn't finished yet (and without errors).
		 * We cleanup any entries left in the io tree when committing
		 * the transaction (through extent_io_tree_release()).
		 */
		if (err == -ENOMEM) {
			err = 0;
			wait_writeback = true;
		}
		if (!err)
			err = filemap_fdatawrite_range(mapping, start, end);
		if (err)
			werr = err;
		else if (wait_writeback)
			werr = filemap_fdatawait_range(mapping, start, end);
		free_extent_state(cached_state);
		cached_state = NULL;
		cond_resched();
		start = end + 1;
	}
	return werr;
}

/*
 * when btree blocks are allocated, they have some corresponding bits set for
 * them in one of two extent_io trees.  This is used to make sure all of
 * those extents are on disk for transaction or log commit.  We wait
 * on all the pages and clear them from the dirty pages state tree
 */
static int __btrfs_wait_marked_extents(struct btrfs_fs_info *fs_info,
				       struct extent_io_tree *dirty_pages)
{
	int err = 0;
	int werr = 0;
	struct address_space *mapping = fs_info->btree_inode->i_mapping;
	struct extent_state *cached_state = NULL;
	u64 start = 0;
	u64 end;

	while (find_first_extent_bit(dirty_pages, start, &start, &end,
				     EXTENT_NEED_WAIT, &cached_state)) {
		/*
		 * Ignore -ENOMEM errors returned by clear_extent_bit().
		 * When committing the transaction, we'll remove any entries
		 * left in the io tree. For a log commit, we don't remove them
		 * after committing the log because the tree can be accessed
		 * concurrently - we do it only at transaction commit time when
		 * it's safe to do it (through extent_io_tree_release()).
		 */
		err = clear_extent_bit(dirty_pages, start, end,
				       EXTENT_NEED_WAIT, &cached_state);
		if (err == -ENOMEM)
			err = 0;
		if (!err)
			err = filemap_fdatawait_range(mapping, start, end);
		if (err)
			werr = err;
		free_extent_state(cached_state);
		cached_state = NULL;
		cond_resched();
		start = end + 1;
	}
	if (err)
		werr = err;
	return werr;
}

static int btrfs_wait_extents(struct btrfs_fs_info *fs_info,
			      struct extent_io_tree *dirty_pages)
{
	bool errors = false;
	int err;

	err = __btrfs_wait_marked_extents(fs_info, dirty_pages);
	if (test_and_clear_bit(BTRFS_FS_BTREE_ERR, &fs_info->flags))
		errors = true;

	if (errors && !err)
		err = -EIO;
	return err;
}

int btrfs_wait_tree_log_extents(struct btrfs_root *log_root, int mark)
{
	struct btrfs_fs_info *fs_info = log_root->fs_info;
	struct extent_io_tree *dirty_pages = &log_root->dirty_log_pages;
	bool errors = false;
	int err;

	ASSERT(log_root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID);

	err = __btrfs_wait_marked_extents(fs_info, dirty_pages);
	if ((mark & EXTENT_DIRTY) &&
	    test_and_clear_bit(BTRFS_FS_LOG1_ERR, &fs_info->flags))
		errors = true;

	if ((mark & EXTENT_NEW) &&
	    test_and_clear_bit(BTRFS_FS_LOG2_ERR, &fs_info->flags))
		errors = true;

	if (errors && !err)
		err = -EIO;
	return err;
}

/*
 * When btree blocks are allocated the corresponding extents are marked dirty.
 * This function ensures such extents are persisted on disk for transaction or
 * log commit.
 *
 * @trans: transaction whose dirty pages we'd like to write
 */
static int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans)
{
	int ret;
	int ret2;
	struct extent_io_tree *dirty_pages = &trans->transaction->dirty_pages;
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct blk_plug plug;

	blk_start_plug(&plug);
	ret = btrfs_write_marked_extents(fs_info, dirty_pages, EXTENT_DIRTY);
	blk_finish_plug(&plug);
	ret2 = btrfs_wait_extents(fs_info, dirty_pages);

	extent_io_tree_release(&trans->transaction->dirty_pages);

	if (ret)
		return ret;
	else if (ret2)
		return ret2;
	else
		return 0;
}

/*
 * this is used to update the root pointer in the tree of tree roots.
 *
 * But, in the case of the extent allocation tree, updating the root
 * pointer may allocate blocks which may change the root of the extent
 * allocation tree.
 *
 * So, this loops and repeats and makes sure the cowonly root didn't
 * change while the root pointer was being updated in the metadata.
 */
static int update_cowonly_root(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root)
{
	int ret;
	u64 old_root_bytenr;
	u64 old_root_used;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_root *tree_root = fs_info->tree_root;

	old_root_used = btrfs_root_used(&root->root_item);

	while (1) {
		old_root_bytenr = btrfs_root_bytenr(&root->root_item);
		if (old_root_bytenr == root->node->start &&
		    old_root_used == btrfs_root_used(&root->root_item))
			break;

		btrfs_set_root_node(&root->root_item, root->node);
		ret = btrfs_update_root(trans, tree_root,
					&root->root_key,
					&root->root_item);
		if (ret)
			return ret;

		old_root_used = btrfs_root_used(&root->root_item);
	}

	return 0;
}

/*
 * update all the cowonly tree roots on disk
 *
 * The error handling in this function may not be obvious. Any of the
 * failures will cause the file system to go offline. We still need
 * to clean up the delayed refs.
 */
static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct list_head *dirty_bgs = &trans->transaction->dirty_bgs;
	struct list_head *io_bgs = &trans->transaction->io_bgs;
	struct list_head *next;
	struct extent_buffer *eb;
	int ret;

	/*
	 * At this point no one can be using this transaction to modify any tree
	 * and no one can start another transaction to modify any tree either.
	 */
	ASSERT(trans->transaction->state == TRANS_STATE_COMMIT_DOING);

	eb = btrfs_lock_root_node(fs_info->tree_root);
	ret = btrfs_cow_block(trans, fs_info->tree_root, eb, NULL,
			      0, &eb, BTRFS_NESTING_COW);
	btrfs_tree_unlock(eb);
	free_extent_buffer(eb);

	if (ret)
		return ret;

	ret = btrfs_run_dev_stats(trans);
	if (ret)
		return ret;
	ret = btrfs_run_dev_replace(trans);
	if (ret)
		return ret;
	ret = btrfs_run_qgroups(trans);
	if (ret)
		return ret;

	ret = btrfs_setup_space_cache(trans);
	if (ret)
		return ret;

again:
	while (!list_empty(&fs_info->dirty_cowonly_roots)) {
		struct btrfs_root *root;
		next = fs_info->dirty_cowonly_roots.next;
		list_del_init(next);
		root = list_entry(next, struct btrfs_root, dirty_list);
		clear_bit(BTRFS_ROOT_DIRTY, &root->state);

		list_add_tail(&root->dirty_list,
			      &trans->transaction->switch_commits);
		ret = update_cowonly_root(trans, root);
		if (ret)
			return ret;
	}

	/* Now flush any delayed refs generated by updating all of the roots */
	ret = btrfs_run_delayed_refs(trans, U64_MAX);
	if (ret)
		return ret;

	while (!list_empty(dirty_bgs) || !list_empty(io_bgs)) {
		ret = btrfs_write_dirty_block_groups(trans);
		if (ret)
			return ret;

		/*
		 * We're writing the dirty block groups, which could generate
		 * delayed refs, which could generate more dirty block groups,
		 * so we want to keep this flushing in this loop to make sure
		 * everything gets run.
		 */
		ret = btrfs_run_delayed_refs(trans, U64_MAX);
		if (ret)
			return ret;
	}

	if (!list_empty(&fs_info->dirty_cowonly_roots))
		goto again;

	/* Update dev-replace pointer once everything is committed */
	fs_info->dev_replace.committed_cursor_left =
		fs_info->dev_replace.cursor_left_last_write_of_item;

	return 0;
}

/*
 * If we had a pending drop we need to see if there are any others left in our
 * dead roots list, and if not clear our bit and wake any waiters.
 */
void btrfs_maybe_wake_unfinished_drop(struct btrfs_fs_info *fs_info)
{
	/*
	 * We put the drop in progress roots at the front of the list, so if the
	 * first entry doesn't have UNFINISHED_DROP set we can wake everybody
	 * up.
	 */
	spin_lock(&fs_info->trans_lock);
	if (!list_empty(&fs_info->dead_roots)) {
		struct btrfs_root *root = list_first_entry(&fs_info->dead_roots,
							   struct btrfs_root,
							   root_list);
		if (test_bit(BTRFS_ROOT_UNFINISHED_DROP, &root->state)) {
			spin_unlock(&fs_info->trans_lock);
			return;
		}
	}
	spin_unlock(&fs_info->trans_lock);

	btrfs_wake_unfinished_drop(fs_info);
}

/*
 * dead roots are old snapshots that need to be deleted.  This allocates
 * a dirty root struct and adds it into the list of dead roots that need to
 * be deleted
 */
void btrfs_add_dead_root(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;

	spin_lock(&fs_info->trans_lock);
	if (list_empty(&root->root_list)) {
		btrfs_grab_root(root);

		/* We want to process the partially complete drops first. */
		if (test_bit(BTRFS_ROOT_UNFINISHED_DROP, &root->state))
			list_add(&root->root_list, &fs_info->dead_roots);
		else
			list_add_tail(&root->root_list, &fs_info->dead_roots);
	}
	spin_unlock(&fs_info->trans_lock);
}

/*
 * Update each subvolume root and its relocation root, if it exists, in the tree
 * of tree roots. Also free log roots if they exist.
 */
static noinline int commit_fs_roots(struct btrfs_trans_handle *trans)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_root *gang[8];
	int i;
	int ret;

	/*
	 * At this point no one can be using this transaction to modify any tree
	 * and no one can start another transaction to modify any tree either.
	 */
	ASSERT(trans->transaction->state == TRANS_STATE_COMMIT_DOING);

	spin_lock(&fs_info->fs_roots_radix_lock);
	while (1) {
		ret = radix_tree_gang_lookup_tag(&fs_info->fs_roots_radix,
						 (void **)gang, 0,
						 ARRAY_SIZE(gang),
						 BTRFS_ROOT_TRANS_TAG);
		if (ret == 0)
			break;
		for (i = 0; i < ret; i++) {
			struct btrfs_root *root = gang[i];
			int ret2;

			/*
			 * At this point we can neither have tasks logging inodes
			 * from a root nor trying to commit a log tree.
			 */
			ASSERT(atomic_read(&root->log_writers) == 0);
			ASSERT(atomic_read(&root->log_commit[0]) == 0);
			ASSERT(atomic_read(&root->log_commit[1]) == 0);

			radix_tree_tag_clear(&fs_info->fs_roots_radix,
					(unsigned long)root->root_key.objectid,
					BTRFS_ROOT_TRANS_TAG);
			spin_unlock(&fs_info->fs_roots_radix_lock);

			btrfs_free_log(trans, root);
			ret2 = btrfs_update_reloc_root(trans, root);
			if (ret2)
				return ret2;

			/* see comments in should_cow_block() */
			clear_bit(BTRFS_ROOT_FORCE_COW, &root->state);
			smp_mb__after_atomic();

			if (root->commit_root != root->node) {
				list_add_tail(&root->dirty_list,
					      &trans->transaction->switch_commits);
				btrfs_set_root_node(&root->root_item,
						    root->node);
			}

			ret2 = btrfs_update_root(trans, fs_info->tree_root,
						 &root->root_key,
						 &root->root_item);
			if (ret2)
				return ret2;
			spin_lock(&fs_info->fs_roots_radix_lock);
			btrfs_qgroup_free_meta_all_pertrans(root);
		}
	}
	spin_unlock(&fs_info->fs_roots_radix_lock);
	return 0;
}

/*
 * Do all the special snapshot-related qgroup dirty hacks.
 *
 * This does all the needed qgroup inheritance and dirty hacks, like switching
 * commit roots inside one transaction and writing all btrees to disk, to make
 * qgroups work.
 */
static int qgroup_account_snapshot(struct btrfs_trans_handle *trans,
				   struct btrfs_root *src,
				   struct btrfs_root *parent,
				   struct btrfs_qgroup_inherit *inherit,
				   u64 dst_objectid)
{
	struct btrfs_fs_info *fs_info = src->fs_info;
	int ret;

	/*
	 * Save some performance in the case that qgroups are not enabled. If
	 * this check races with the ioctl, rescan will kick in anyway.
	 */
	if (!btrfs_qgroup_full_accounting(fs_info))
		return 0;

	/*
	 * Ensure the dirty @src will be committed.  Otherwise, after the
	 * coming commit_fs_roots() and switch_commit_roots(), any dirty but
	 * not recorded root will never be updated again, leaving an outdated
	 * root item.
	 */
	ret = record_root_in_trans(trans, src, 1);
	if (ret)
		return ret;

	/*
	 * btrfs_qgroup_inherit relies on a consistent view of the usage for the
	 * src root, so we must run the delayed refs here.
	 *
	 * However this isn't particularly fool proof, because there's no
	 * synchronization keeping us from changing the tree after this point
	 * before we do the qgroup_inherit, or even from making changes while
	 * we're doing the qgroup_inherit.  But that's a problem for the future,
	 * for now flush the delayed refs to narrow the race window where the
	 * qgroup counters could end up wrong.
	 */
	ret = btrfs_run_delayed_refs(trans, U64_MAX);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		return ret;
	}

	ret = commit_fs_roots(trans);
	if (ret)
		goto out;
	ret = btrfs_qgroup_account_extents(trans);
	if (ret < 0)
		goto out;

	/* Now qgroups are all updated, we can inherit them to the new qgroup */
	ret = btrfs_qgroup_inherit(trans, src->root_key.objectid, dst_objectid,
				   parent->root_key.objectid, inherit);
	if (ret < 0)
		goto out;

	/*
	 * Now we do a simplified commit transaction, which will:
	 * 1) commit all subvolume and extent trees,
	 *    to ensure all subvolume and extent trees have a valid
	 *    commit_root for the later insert_dir_item() accounting
	 * 2) write all btree blocks onto disk,
	 *    to make sure later btree modifications will be COWed;
	 *    otherwise commit_root could be populated and cause wrong qgroup numbers
	 * In this simplified commit, we don't really care about other trees
	 * like chunk and root tree, as they won't affect qgroup.
	 * And we don't write super to avoid a half committed status.
	 */
	ret = commit_cowonly_roots(trans);
	if (ret)
		goto out;
	switch_commit_roots(trans);
	ret = btrfs_write_and_wait_transaction(trans);
	if (ret)
		btrfs_handle_fs_error(fs_info, ret,
			"Error while writing out transaction for qgroup");

out:
	/*
	 * Force the parent root to be updated, as we recorded it before so its
	 * last_trans == cur_transid.
	 * Otherwise it won't be committed to disk again after the later
	 * insert_dir_item().
	 */
	if (!ret)
		ret = record_root_in_trans(trans, parent, 1);
	return ret;
}

/*
 * new snapshots need to be created at a very specific time in the
 * transaction commit.  This does the actual creation.
 *
 * Note:
 * If an error happens that may affect the commit of the current transaction,
 * return the error number. If the error only affects the creation of the
 * pending snapshot, return 0.
 */
static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
					    struct btrfs_pending_snapshot *pending)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_key key;
	struct btrfs_root_item *new_root_item;
	struct btrfs_root *tree_root = fs_info->tree_root;
	struct btrfs_root *root = pending->root;
	struct btrfs_root *parent_root;
	struct btrfs_block_rsv *rsv;
	struct inode *parent_inode = pending->dir;
	struct btrfs_path *path;
	struct btrfs_dir_item *dir_item;
	struct extent_buffer *tmp;
	struct extent_buffer *old;
	struct timespec64 cur_time;
	int ret = 0;
	u64 to_reserve = 0;
	u64 index = 0;
	u64 objectid;
	u64 root_flags;
	unsigned int nofs_flags;
	struct fscrypt_name fname;

	ASSERT(pending->path);
	path = pending->path;

	ASSERT(pending->root_item);
	new_root_item = pending->root_item;

	/*
	 * We're inside a transaction and must make sure that any potential
	 * allocations with GFP_KERNEL in fscrypt won't recurse back to the
	 * filesystem.
	 */
	nofs_flags = memalloc_nofs_save();
	pending->error = fscrypt_setup_filename(parent_inode,
						&pending->dentry->d_name, 0,
						&fname);
	memalloc_nofs_restore(nofs_flags);
	if (pending->error)
		goto free_pending;

	pending->error = btrfs_get_free_objectid(tree_root, &objectid);
	if (pending->error)
		goto free_fname;

	/*
	 * Make qgroup skip the current new snapshot's qgroupid, as it is
	 * accounted by the later btrfs_qgroup_inherit().
	 */
	btrfs_set_skip_qgroup(trans, objectid);

	btrfs_reloc_pre_snapshot(pending, &to_reserve);

	if (to_reserve > 0) {
		pending->error = btrfs_block_rsv_add(fs_info,
						     &pending->block_rsv,
						     to_reserve,
						     BTRFS_RESERVE_NO_FLUSH);
		if (pending->error)
			goto clear_skip_qgroup;
	}

	key.objectid = objectid;
	key.offset = (u64)-1;
	key.type = BTRFS_ROOT_ITEM_KEY;

	rsv = trans->block_rsv;
	trans->block_rsv = &pending->block_rsv;
	trans->bytes_reserved = trans->block_rsv->reserved;
	trace_btrfs_space_reservation(fs_info, "transaction",
				      trans->transid,
				      trans->bytes_reserved, 1);
	parent_root = BTRFS_I(parent_inode)->root;
	ret = record_root_in_trans(trans, parent_root, 0);
	if (ret)
		goto fail;
	cur_time = current_time(parent_inode);

	/*
	 * insert the directory item
	 */
	ret = btrfs_set_inode_index(BTRFS_I(parent_inode), &index);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto fail;
	}

	/* check if there is a file/dir which has the same name. */
	dir_item = btrfs_lookup_dir_item(NULL, parent_root, path,
					 btrfs_ino(BTRFS_I(parent_inode)),
					 &fname.disk_name, 0);
	if (dir_item != NULL && !IS_ERR(dir_item)) {
		pending->error = -EEXIST;
		goto dir_item_existed;
	} else if (IS_ERR(dir_item)) {
		ret = PTR_ERR(dir_item);
		btrfs_abort_transaction(trans, ret);
		goto fail;
	}
	btrfs_release_path(path);

	ret = btrfs_create_qgroup(trans, objectid);
	if (ret && ret != -EEXIST) {
		btrfs_abort_transaction(trans, ret);
		goto fail;
	}

	/*
	 * pull in the delayed directory update
	 * and the delayed inode item
	 * otherwise we corrupt the FS during
	 * snapshot
	 */
	ret = btrfs_run_delayed_items(trans);
	if (ret) {	/* Transaction aborted */
		btrfs_abort_transaction(trans, ret);
		goto fail;
	}

	ret = record_root_in_trans(trans, root, 0);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto fail;
	}
	btrfs_set_root_last_snapshot(&root->root_item, trans->transid);
	memcpy(new_root_item, &root->root_item, sizeof(*new_root_item));
	btrfs_check_and_init_root_item(new_root_item);

	root_flags = btrfs_root_flags(new_root_item);
	if (pending->readonly)
		root_flags |= BTRFS_ROOT_SUBVOL_RDONLY;
	else
		root_flags &= ~BTRFS_ROOT_SUBVOL_RDONLY;
	btrfs_set_root_flags(new_root_item, root_flags);

	btrfs_set_root_generation_v2(new_root_item,
				     trans->transid);
	generate_random_guid(new_root_item->uuid);
	memcpy(new_root_item->parent_uuid, root->root_item.uuid,
	       BTRFS_UUID_SIZE);
	if (!(root_flags & BTRFS_ROOT_SUBVOL_RDONLY)) {
		memset(new_root_item->received_uuid, 0,
		       sizeof(new_root_item->received_uuid));
		memset(&new_root_item->stime, 0, sizeof(new_root_item->stime));
		memset(&new_root_item->rtime, 0, sizeof(new_root_item->rtime));
		btrfs_set_root_stransid(new_root_item, 0);
btrfs_set_root_stransid(new_root_item, 0); 1785 btrfs_set_root_rtransid(new_root_item, 0); 1786 } 1787 btrfs_set_stack_timespec_sec(&new_root_item->otime, cur_time.tv_sec); 1788 btrfs_set_stack_timespec_nsec(&new_root_item->otime, cur_time.tv_nsec); 1789 btrfs_set_root_otransid(new_root_item, trans->transid); 1790 1791 old = btrfs_lock_root_node(root); 1792 ret = btrfs_cow_block(trans, root, old, NULL, 0, &old, 1793 BTRFS_NESTING_COW); 1794 if (ret) { 1795 btrfs_tree_unlock(old); 1796 free_extent_buffer(old); 1797 btrfs_abort_transaction(trans, ret); 1798 goto fail; 1799 } 1800 1801 ret = btrfs_copy_root(trans, root, old, &tmp, objectid); 1802 /* clean up in any case */ 1803 btrfs_tree_unlock(old); 1804 free_extent_buffer(old); 1805 if (ret) { 1806 btrfs_abort_transaction(trans, ret); 1807 goto fail; 1808 } 1809 /* see comments in should_cow_block() */ 1810 set_bit(BTRFS_ROOT_FORCE_COW, &root->state); 1811 smp_wmb(); 1812 1813 btrfs_set_root_node(new_root_item, tmp); 1814 /* record when the snapshot was created in key.offset */ 1815 key.offset = trans->transid; 1816 ret = btrfs_insert_root(trans, tree_root, &key, new_root_item); 1817 btrfs_tree_unlock(tmp); 1818 free_extent_buffer(tmp); 1819 if (ret) { 1820 btrfs_abort_transaction(trans, ret); 1821 goto fail; 1822 } 1823 1824 /* 1825 * insert root back/forward references 1826 */ 1827 ret = btrfs_add_root_ref(trans, objectid, 1828 parent_root->root_key.objectid, 1829 btrfs_ino(BTRFS_I(parent_inode)), index, 1830 &fname.disk_name); 1831 if (ret) { 1832 btrfs_abort_transaction(trans, ret); 1833 goto fail; 1834 } 1835 1836 key.offset = (u64)-1; 1837 pending->snap = btrfs_get_new_fs_root(fs_info, objectid, pending->anon_dev); 1838 if (IS_ERR(pending->snap)) { 1839 ret = PTR_ERR(pending->snap); 1840 pending->snap = NULL; 1841 btrfs_abort_transaction(trans, ret); 1842 goto fail; 1843 } 1844 1845 ret = btrfs_reloc_post_snapshot(trans, pending); 1846 if (ret) { 1847 btrfs_abort_transaction(trans, ret); 1848 goto fail; 1849 } 1850 1851 /* 1852 * Do special qgroup accounting for snapshot, as we do some qgroup 1853 * snapshot hack to do fast snapshot. 1854 * To co-operate with that hack, we do hack again. 1855 * Or snapshot will be greatly slowed down by a subtree qgroup rescan 1856 */ 1857 if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_FULL) 1858 ret = qgroup_account_snapshot(trans, root, parent_root, 1859 pending->inherit, objectid); 1860 else if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE) 1861 ret = btrfs_qgroup_inherit(trans, root->root_key.objectid, objectid, 1862 parent_root->root_key.objectid, pending->inherit); 1863 if (ret < 0) 1864 goto fail; 1865 1866 ret = btrfs_insert_dir_item(trans, &fname.disk_name, 1867 BTRFS_I(parent_inode), &key, BTRFS_FT_DIR, 1868 index); 1869 /* We have check then name at the beginning, so it is impossible. 
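 * (The btrfs_lookup_dir_item() call above already verified that no entry
 * with this name exists in the parent directory, so -EEXIST cannot happen here.)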
*/ 1870 BUG_ON(ret == -EEXIST || ret == -EOVERFLOW); 1871 if (ret) { 1872 btrfs_abort_transaction(trans, ret); 1873 goto fail; 1874 } 1875 1876 btrfs_i_size_write(BTRFS_I(parent_inode), parent_inode->i_size + 1877 fname.disk_name.len * 2); 1878 inode_set_mtime_to_ts(parent_inode, 1879 inode_set_ctime_current(parent_inode)); 1880 ret = btrfs_update_inode_fallback(trans, BTRFS_I(parent_inode)); 1881 if (ret) { 1882 btrfs_abort_transaction(trans, ret); 1883 goto fail; 1884 } 1885 ret = btrfs_uuid_tree_add(trans, new_root_item->uuid, 1886 BTRFS_UUID_KEY_SUBVOL, 1887 objectid); 1888 if (ret) { 1889 btrfs_abort_transaction(trans, ret); 1890 goto fail; 1891 } 1892 if (!btrfs_is_empty_uuid(new_root_item->received_uuid)) { 1893 ret = btrfs_uuid_tree_add(trans, new_root_item->received_uuid, 1894 BTRFS_UUID_KEY_RECEIVED_SUBVOL, 1895 objectid); 1896 if (ret && ret != -EEXIST) { 1897 btrfs_abort_transaction(trans, ret); 1898 goto fail; 1899 } 1900 } 1901 1902 fail: 1903 pending->error = ret; 1904 dir_item_existed: 1905 trans->block_rsv = rsv; 1906 trans->bytes_reserved = 0; 1907 clear_skip_qgroup: 1908 btrfs_clear_skip_qgroup(trans); 1909 free_fname: 1910 fscrypt_free_filename(&fname); 1911 free_pending: 1912 kfree(new_root_item); 1913 pending->root_item = NULL; 1914 btrfs_free_path(path); 1915 pending->path = NULL; 1916 1917 return ret; 1918 } 1919 1920 /* 1921 * create all the snapshots we've scheduled for creation 1922 */ 1923 static noinline int create_pending_snapshots(struct btrfs_trans_handle *trans) 1924 { 1925 struct btrfs_pending_snapshot *pending, *next; 1926 struct list_head *head = &trans->transaction->pending_snapshots; 1927 int ret = 0; 1928 1929 list_for_each_entry_safe(pending, next, head, list) { 1930 list_del(&pending->list); 1931 ret = create_pending_snapshot(trans, pending); 1932 if (ret) 1933 break; 1934 } 1935 return ret; 1936 } 1937 1938 static void update_super_roots(struct btrfs_fs_info *fs_info) 1939 { 1940 struct btrfs_root_item *root_item; 1941 struct btrfs_super_block *super; 1942 1943 super = fs_info->super_copy; 1944 1945 root_item = &fs_info->chunk_root->root_item; 1946 super->chunk_root = root_item->bytenr; 1947 super->chunk_root_generation = root_item->generation; 1948 super->chunk_root_level = root_item->level; 1949 1950 root_item = &fs_info->tree_root->root_item; 1951 super->root = root_item->bytenr; 1952 super->generation = root_item->generation; 1953 super->root_level = root_item->level; 1954 if (btrfs_test_opt(fs_info, SPACE_CACHE)) 1955 super->cache_generation = root_item->generation; 1956 else if (test_bit(BTRFS_FS_CLEANUP_SPACE_CACHE_V1, &fs_info->flags)) 1957 super->cache_generation = 0; 1958 if (test_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags)) 1959 super->uuid_tree_generation = root_item->generation; 1960 } 1961 1962 int btrfs_transaction_in_commit(struct btrfs_fs_info *info) 1963 { 1964 struct btrfs_transaction *trans; 1965 int ret = 0; 1966 1967 spin_lock(&info->trans_lock); 1968 trans = info->running_transaction; 1969 if (trans) 1970 ret = (trans->state >= TRANS_STATE_COMMIT_START); 1971 spin_unlock(&info->trans_lock); 1972 return ret; 1973 } 1974 1975 int btrfs_transaction_blocked(struct btrfs_fs_info *info) 1976 { 1977 struct btrfs_transaction *trans; 1978 int ret = 0; 1979 1980 spin_lock(&info->trans_lock); 1981 trans = info->running_transaction; 1982 if (trans) 1983 ret = is_transaction_blocked(trans); 1984 spin_unlock(&info->trans_lock); 1985 return ret; 1986 } 1987 1988 void btrfs_commit_transaction_async(struct btrfs_trans_handle *trans) 
1989 { 1990 struct btrfs_fs_info *fs_info = trans->fs_info; 1991 struct btrfs_transaction *cur_trans; 1992 1993 /* Kick the transaction kthread. */ 1994 set_bit(BTRFS_FS_COMMIT_TRANS, &fs_info->flags); 1995 wake_up_process(fs_info->transaction_kthread); 1996 1997 /* take transaction reference */ 1998 cur_trans = trans->transaction; 1999 refcount_inc(&cur_trans->use_count); 2000 2001 btrfs_end_transaction(trans); 2002 2003 /* 2004 * Wait for the current transaction commit to start and block 2005 * subsequent transaction joins 2006 */ 2007 btrfs_might_wait_for_state(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_PREP); 2008 wait_event(fs_info->transaction_blocked_wait, 2009 cur_trans->state >= TRANS_STATE_COMMIT_START || 2010 TRANS_ABORTED(cur_trans)); 2011 btrfs_put_transaction(cur_trans); 2012 } 2013 2014 static void cleanup_transaction(struct btrfs_trans_handle *trans, int err) 2015 { 2016 struct btrfs_fs_info *fs_info = trans->fs_info; 2017 struct btrfs_transaction *cur_trans = trans->transaction; 2018 2019 WARN_ON(refcount_read(&trans->use_count) > 1); 2020 2021 btrfs_abort_transaction(trans, err); 2022 2023 spin_lock(&fs_info->trans_lock); 2024 2025 /* 2026 * If the transaction is removed from the list, it means this 2027 * transaction has been committed successfully, so it is impossible 2028 * to call the cleanup function. 2029 */ 2030 BUG_ON(list_empty(&cur_trans->list)); 2031 2032 if (cur_trans == fs_info->running_transaction) { 2033 cur_trans->state = TRANS_STATE_COMMIT_DOING; 2034 spin_unlock(&fs_info->trans_lock); 2035 2036 /* 2037 * The thread has already released the lockdep map as reader 2038 * already in btrfs_commit_transaction(). 2039 */ 2040 btrfs_might_wait_for_event(fs_info, btrfs_trans_num_writers); 2041 wait_event(cur_trans->writer_wait, 2042 atomic_read(&cur_trans->num_writers) == 1); 2043 2044 spin_lock(&fs_info->trans_lock); 2045 } 2046 2047 /* 2048 * Now that we know no one else is still using the transaction we can 2049 * remove the transaction from the list of transactions. This avoids 2050 * the transaction kthread from cleaning up the transaction while some 2051 * other task is still using it, which could result in a use-after-free 2052 * on things like log trees, as it forces the transaction kthread to 2053 * wait for this transaction to be cleaned up by us. 2054 */ 2055 list_del_init(&cur_trans->list); 2056 2057 spin_unlock(&fs_info->trans_lock); 2058 2059 btrfs_cleanup_one_transaction(trans->transaction, fs_info); 2060 2061 spin_lock(&fs_info->trans_lock); 2062 if (cur_trans == fs_info->running_transaction) 2063 fs_info->running_transaction = NULL; 2064 spin_unlock(&fs_info->trans_lock); 2065 2066 if (trans->type & __TRANS_FREEZABLE) 2067 sb_end_intwrite(fs_info->sb); 2068 btrfs_put_transaction(cur_trans); 2069 btrfs_put_transaction(cur_trans); 2070 2071 trace_btrfs_transaction_commit(fs_info); 2072 2073 if (current->journal_info == trans) 2074 current->journal_info = NULL; 2075 2076 /* 2077 * If relocation is running, we can't cancel scrub because that will 2078 * result in a deadlock. Before relocating a block group, relocation 2079 * pauses scrub, then starts and commits a transaction before unpausing 2080 * scrub. 
If the transaction commit is being done by the relocation 2081 * task or triggered by another task and the relocation task is waiting 2082 * for the commit, and we end up here due to an error in the commit 2083 * path, then calling btrfs_scrub_cancel() will deadlock, as we are 2084 * asking for scrub to stop while having it asked to be paused higher 2085 * above in relocation code. 2086 */ 2087 if (!test_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags)) 2088 btrfs_scrub_cancel(fs_info); 2089 2090 kmem_cache_free(btrfs_trans_handle_cachep, trans); 2091 } 2092 2093 /* 2094 * Release the reserved delayed ref space of all pending block groups of the 2095 * transaction and remove them from the list. 2096 */ 2097 static void btrfs_cleanup_pending_block_groups(struct btrfs_trans_handle *trans) 2098 { 2099 struct btrfs_fs_info *fs_info = trans->fs_info; 2100 struct btrfs_block_group *block_group, *tmp; 2101 2102 list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, bg_list) { 2103 btrfs_dec_delayed_refs_rsv_bg_inserts(fs_info); 2104 list_del_init(&block_group->bg_list); 2105 } 2106 } 2107 2108 static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info) 2109 { 2110 /* 2111 * We use try_to_writeback_inodes_sb() here because if we used 2112 * btrfs_start_delalloc_roots we would deadlock with fs freeze. 2113 * We are currently holding the fs freeze lock; if we did an async flush 2114 * we'd do btrfs_join_transaction() and deadlock because we'd need to 2115 * wait for the fs freeze lock. Using the direct flushing we benefit 2116 * from already being in a transaction and our join_transaction doesn't 2117 * have to re-take the fs freeze lock. 2118 * 2119 * Note that try_to_writeback_inodes_sb() will only trigger writeback 2120 * if it can read lock sb->s_umount. It will always be able to lock it, 2121 * except when the filesystem is being unmounted or being frozen, but in 2122 * those cases sync_filesystem() is called, which results in calling 2123 * writeback_inodes_sb() while holding a write lock on sb->s_umount. 2124 * Note that we don't call writeback_inodes_sb() directly, because it 2125 * will emit a warning if sb->s_umount is not locked. 2126 */ 2127 if (btrfs_test_opt(fs_info, FLUSHONCOMMIT)) 2128 try_to_writeback_inodes_sb(fs_info->sb, WB_REASON_SYNC); 2129 return 0; 2130 } 2131 2132 static inline void btrfs_wait_delalloc_flush(struct btrfs_fs_info *fs_info) 2133 { 2134 if (btrfs_test_opt(fs_info, FLUSHONCOMMIT)) 2135 btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1); 2136 } 2137 2138 /* 2139 * Add the pending snapshot associated with the given transaction handle to the 2140 * respective transaction. This must be called after the transaction commit has 2141 * started and while holding fs_info->trans_lock. 2142 * This serves to guarantee a caller of btrfs_commit_transaction() that it can 2143 * safely free the pending snapshot pointer in case btrfs_commit_transaction() 2144 * returns an error.
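 * Both call sites in btrfs_commit_transaction() do this while holding
 * fs_info->trans_lock, which is what the lockdep_assert_held() and the
 * transaction state ASSERT() below check.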
2145 */ 2146 static void add_pending_snapshot(struct btrfs_trans_handle *trans) 2147 { 2148 struct btrfs_transaction *cur_trans = trans->transaction; 2149 2150 if (!trans->pending_snapshot) 2151 return; 2152 2153 lockdep_assert_held(&trans->fs_info->trans_lock); 2154 ASSERT(cur_trans->state >= TRANS_STATE_COMMIT_PREP); 2155 2156 list_add(&trans->pending_snapshot->list, &cur_trans->pending_snapshots); 2157 } 2158 2159 static void update_commit_stats(struct btrfs_fs_info *fs_info, ktime_t interval) 2160 { 2161 fs_info->commit_stats.commit_count++; 2162 fs_info->commit_stats.last_commit_dur = interval; 2163 fs_info->commit_stats.max_commit_dur = 2164 max_t(u64, fs_info->commit_stats.max_commit_dur, interval); 2165 fs_info->commit_stats.total_commit_dur += interval; 2166 } 2167 2168 int btrfs_commit_transaction(struct btrfs_trans_handle *trans) 2169 { 2170 struct btrfs_fs_info *fs_info = trans->fs_info; 2171 struct btrfs_transaction *cur_trans = trans->transaction; 2172 struct btrfs_transaction *prev_trans = NULL; 2173 int ret; 2174 ktime_t start_time; 2175 ktime_t interval; 2176 2177 ASSERT(refcount_read(&trans->use_count) == 1); 2178 btrfs_trans_state_lockdep_acquire(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_PREP); 2179 2180 clear_bit(BTRFS_FS_NEED_TRANS_COMMIT, &fs_info->flags); 2181 2182 /* Stop the commit early if ->aborted is set */ 2183 if (TRANS_ABORTED(cur_trans)) { 2184 ret = cur_trans->aborted; 2185 goto lockdep_trans_commit_start_release; 2186 } 2187 2188 btrfs_trans_release_metadata(trans); 2189 trans->block_rsv = NULL; 2190 2191 /* 2192 * We only want one transaction commit doing the flushing so we do not 2193 * waste a bunch of time on lock contention on the extent root node. 2194 */ 2195 if (!test_and_set_bit(BTRFS_DELAYED_REFS_FLUSHING, 2196 &cur_trans->delayed_refs.flags)) { 2197 /* 2198 * Make a pass through all the delayed refs we have so far. 2199 * Any running threads may add more while we are here. 2200 */ 2201 ret = btrfs_run_delayed_refs(trans, 0); 2202 if (ret) 2203 goto lockdep_trans_commit_start_release; 2204 } 2205 2206 btrfs_create_pending_block_groups(trans); 2207 2208 if (!test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &cur_trans->flags)) { 2209 int run_it = 0; 2210 2211 /* this mutex is also taken before trying to set 2212 * block groups readonly. We need to make sure 2213 * that nobody has set a block group readonly 2214 * after a extents from that block group have been 2215 * allocated for cache files. btrfs_set_block_group_ro 2216 * will wait for the transaction to commit if it 2217 * finds BTRFS_TRANS_DIRTY_BG_RUN set. 2218 * 2219 * The BTRFS_TRANS_DIRTY_BG_RUN flag is also used to make sure 2220 * only one process starts all the block group IO. It wouldn't 2221 * hurt to have more than one go through, but there's no 2222 * real advantage to it either. 
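 * Starting that IO here, while the transaction is still in
 * TRANS_STATE_RUNNING and new writers can join it, keeps most of the
 * block group cache writeout outside the later stages of the commit
 * where joins are blocked.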
2223 */ 2224 mutex_lock(&fs_info->ro_block_group_mutex); 2225 if (!test_and_set_bit(BTRFS_TRANS_DIRTY_BG_RUN, 2226 &cur_trans->flags)) 2227 run_it = 1; 2228 mutex_unlock(&fs_info->ro_block_group_mutex); 2229 2230 if (run_it) { 2231 ret = btrfs_start_dirty_block_groups(trans); 2232 if (ret) 2233 goto lockdep_trans_commit_start_release; 2234 } 2235 } 2236 2237 spin_lock(&fs_info->trans_lock); 2238 if (cur_trans->state >= TRANS_STATE_COMMIT_PREP) { 2239 enum btrfs_trans_state want_state = TRANS_STATE_COMPLETED; 2240 2241 add_pending_snapshot(trans); 2242 2243 spin_unlock(&fs_info->trans_lock); 2244 refcount_inc(&cur_trans->use_count); 2245 2246 if (trans->in_fsync) 2247 want_state = TRANS_STATE_SUPER_COMMITTED; 2248 2249 btrfs_trans_state_lockdep_release(fs_info, 2250 BTRFS_LOCKDEP_TRANS_COMMIT_PREP); 2251 ret = btrfs_end_transaction(trans); 2252 wait_for_commit(cur_trans, want_state); 2253 2254 if (TRANS_ABORTED(cur_trans)) 2255 ret = cur_trans->aborted; 2256 2257 btrfs_put_transaction(cur_trans); 2258 2259 return ret; 2260 } 2261 2262 cur_trans->state = TRANS_STATE_COMMIT_PREP; 2263 wake_up(&fs_info->transaction_blocked_wait); 2264 btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_PREP); 2265 2266 if (cur_trans->list.prev != &fs_info->trans_list) { 2267 enum btrfs_trans_state want_state = TRANS_STATE_COMPLETED; 2268 2269 if (trans->in_fsync) 2270 want_state = TRANS_STATE_SUPER_COMMITTED; 2271 2272 prev_trans = list_entry(cur_trans->list.prev, 2273 struct btrfs_transaction, list); 2274 if (prev_trans->state < want_state) { 2275 refcount_inc(&prev_trans->use_count); 2276 spin_unlock(&fs_info->trans_lock); 2277 2278 wait_for_commit(prev_trans, want_state); 2279 2280 ret = READ_ONCE(prev_trans->aborted); 2281 2282 btrfs_put_transaction(prev_trans); 2283 if (ret) 2284 goto lockdep_release; 2285 spin_lock(&fs_info->trans_lock); 2286 } 2287 } else { 2288 /* 2289 * The previous transaction was aborted and was already removed 2290 * from the list of transactions at fs_info->trans_list. So we 2291 * abort to prevent writing a new superblock that reflects a 2292 * corrupt state (pointing to trees with unwritten nodes/leafs). 2293 */ 2294 if (BTRFS_FS_ERROR(fs_info)) { 2295 spin_unlock(&fs_info->trans_lock); 2296 ret = -EROFS; 2297 goto lockdep_release; 2298 } 2299 } 2300 2301 cur_trans->state = TRANS_STATE_COMMIT_START; 2302 wake_up(&fs_info->transaction_blocked_wait); 2303 spin_unlock(&fs_info->trans_lock); 2304 2305 /* 2306 * Get the time spent on the work done by the commit thread and not 2307 * the time spent waiting on a previous commit 2308 */ 2309 start_time = ktime_get_ns(); 2310 2311 extwriter_counter_dec(cur_trans, trans->type); 2312 2313 ret = btrfs_start_delalloc_flush(fs_info); 2314 if (ret) 2315 goto lockdep_release; 2316 2317 ret = btrfs_run_delayed_items(trans); 2318 if (ret) 2319 goto lockdep_release; 2320 2321 /* 2322 * The thread has started/joined the transaction thus it holds the 2323 * lockdep map as a reader. It has to release it before acquiring the 2324 * lockdep map as a writer. 2325 */ 2326 btrfs_lockdep_release(fs_info, btrfs_trans_num_extwriters); 2327 btrfs_might_wait_for_event(fs_info, btrfs_trans_num_extwriters); 2328 wait_event(cur_trans->writer_wait, 2329 extwriter_counter_read(cur_trans) == 0); 2330 2331 /* some pending stuffs might be added after the previous flush. 
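 * In particular, tasks that were attached as external writers until
 * extwriter_counter_read() dropped to zero above may have queued more
 * delayed inode and dir index items, so run the delayed items once more.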
*/ 2332 ret = btrfs_run_delayed_items(trans); 2333 if (ret) { 2334 btrfs_lockdep_release(fs_info, btrfs_trans_num_writers); 2335 goto cleanup_transaction; 2336 } 2337 2338 btrfs_wait_delalloc_flush(fs_info); 2339 2340 /* 2341 * Wait for all ordered extents started by a fast fsync that joined this 2342 * transaction. Otherwise if this transaction commits before the ordered 2343 * extents complete we lose logged data after a power failure. 2344 */ 2345 btrfs_might_wait_for_event(fs_info, btrfs_trans_pending_ordered); 2346 wait_event(cur_trans->pending_wait, 2347 atomic_read(&cur_trans->pending_ordered) == 0); 2348 2349 btrfs_scrub_pause(fs_info); 2350 /* 2351 * Ok now we need to make sure to block out any other joins while we 2352 * commit the transaction. We could have started a join before setting 2353 * COMMIT_DOING so make sure to wait for num_writers to == 1 again. 2354 */ 2355 spin_lock(&fs_info->trans_lock); 2356 add_pending_snapshot(trans); 2357 cur_trans->state = TRANS_STATE_COMMIT_DOING; 2358 spin_unlock(&fs_info->trans_lock); 2359 2360 /* 2361 * The thread has started/joined the transaction thus it holds the 2362 * lockdep map as a reader. It has to release it before acquiring the 2363 * lockdep map as a writer. 2364 */ 2365 btrfs_lockdep_release(fs_info, btrfs_trans_num_writers); 2366 btrfs_might_wait_for_event(fs_info, btrfs_trans_num_writers); 2367 wait_event(cur_trans->writer_wait, 2368 atomic_read(&cur_trans->num_writers) == 1); 2369 2370 /* 2371 * Make lockdep happy by acquiring the state locks after 2372 * btrfs_trans_num_writers is released. If we acquired the state locks 2373 * before releasing the btrfs_trans_num_writers lock then lockdep would 2374 * complain because we did not follow the reverse order unlocking rule. 2375 */ 2376 btrfs_trans_state_lockdep_acquire(fs_info, BTRFS_LOCKDEP_TRANS_COMPLETED); 2377 btrfs_trans_state_lockdep_acquire(fs_info, BTRFS_LOCKDEP_TRANS_SUPER_COMMITTED); 2378 btrfs_trans_state_lockdep_acquire(fs_info, BTRFS_LOCKDEP_TRANS_UNBLOCKED); 2379 2380 /* 2381 * We've started the commit, clear the flag in case we were triggered to 2382 * do an async commit but somebody else started before the transaction 2383 * kthread could do the work. 2384 */ 2385 clear_bit(BTRFS_FS_COMMIT_TRANS, &fs_info->flags); 2386 2387 if (TRANS_ABORTED(cur_trans)) { 2388 ret = cur_trans->aborted; 2389 btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_UNBLOCKED); 2390 goto scrub_continue; 2391 } 2392 /* 2393 * the reloc mutex makes sure that we stop 2394 * the balancing code from coming in and moving 2395 * extents around in the middle of the commit 2396 */ 2397 mutex_lock(&fs_info->reloc_mutex); 2398 2399 /* 2400 * We needn't worry about the delayed items because we will 2401 * deal with them in create_pending_snapshot(), which is the 2402 * core function of the snapshot creation. 2403 */ 2404 ret = create_pending_snapshots(trans); 2405 if (ret) 2406 goto unlock_reloc; 2407 2408 /* 2409 * We insert the dir indexes of the snapshots and update the inode 2410 * of the snapshots' parents after the snapshot creation, so there 2411 * are some delayed items which are not dealt with. Now deal with 2412 * them. 2413 * 2414 * We needn't worry that this operation will corrupt the snapshots, 2415 * because all the tree which are snapshoted will be forced to COW 2416 * the nodes and leaves. 
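 * (create_pending_snapshot() set BTRFS_ROOT_FORCE_COW on each snapshotted
 * root, so should_cow_block() will COW even blocks that were already
 * written in this transaction.)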
2417 */ 2418 ret = btrfs_run_delayed_items(trans); 2419 if (ret) 2420 goto unlock_reloc; 2421 2422 ret = btrfs_run_delayed_refs(trans, U64_MAX); 2423 if (ret) 2424 goto unlock_reloc; 2425 2426 /* 2427 * make sure none of the code above managed to slip in a 2428 * delayed item 2429 */ 2430 btrfs_assert_delayed_root_empty(fs_info); 2431 2432 WARN_ON(cur_trans != trans->transaction); 2433 2434 ret = commit_fs_roots(trans); 2435 if (ret) 2436 goto unlock_reloc; 2437 2438 /* commit_fs_roots gets rid of all the tree log roots, it is now 2439 * safe to free the root of tree log roots 2440 */ 2441 btrfs_free_log_root_tree(trans, fs_info); 2442 2443 /* 2444 * Since fs roots are all committed, we can get a quite accurate 2445 * new_roots. So let's do quota accounting. 2446 */ 2447 ret = btrfs_qgroup_account_extents(trans); 2448 if (ret < 0) 2449 goto unlock_reloc; 2450 2451 ret = commit_cowonly_roots(trans); 2452 if (ret) 2453 goto unlock_reloc; 2454 2455 /* 2456 * The tasks which save the space cache and inode cache may also 2457 * update ->aborted, check it. 2458 */ 2459 if (TRANS_ABORTED(cur_trans)) { 2460 ret = cur_trans->aborted; 2461 goto unlock_reloc; 2462 } 2463 2464 cur_trans = fs_info->running_transaction; 2465 2466 btrfs_set_root_node(&fs_info->tree_root->root_item, 2467 fs_info->tree_root->node); 2468 list_add_tail(&fs_info->tree_root->dirty_list, 2469 &cur_trans->switch_commits); 2470 2471 btrfs_set_root_node(&fs_info->chunk_root->root_item, 2472 fs_info->chunk_root->node); 2473 list_add_tail(&fs_info->chunk_root->dirty_list, 2474 &cur_trans->switch_commits); 2475 2476 if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) { 2477 btrfs_set_root_node(&fs_info->block_group_root->root_item, 2478 fs_info->block_group_root->node); 2479 list_add_tail(&fs_info->block_group_root->dirty_list, 2480 &cur_trans->switch_commits); 2481 } 2482 2483 switch_commit_roots(trans); 2484 2485 ASSERT(list_empty(&cur_trans->dirty_bgs)); 2486 ASSERT(list_empty(&cur_trans->io_bgs)); 2487 update_super_roots(fs_info); 2488 2489 btrfs_set_super_log_root(fs_info->super_copy, 0); 2490 btrfs_set_super_log_root_level(fs_info->super_copy, 0); 2491 memcpy(fs_info->super_for_commit, fs_info->super_copy, 2492 sizeof(*fs_info->super_copy)); 2493 2494 btrfs_commit_device_sizes(cur_trans); 2495 2496 clear_bit(BTRFS_FS_LOG1_ERR, &fs_info->flags); 2497 clear_bit(BTRFS_FS_LOG2_ERR, &fs_info->flags); 2498 2499 btrfs_trans_release_chunk_metadata(trans); 2500 2501 /* 2502 * Before changing the transaction state to TRANS_STATE_UNBLOCKED and 2503 * setting fs_info->running_transaction to NULL, lock tree_log_mutex to 2504 * make sure that before we commit our superblock, no other task can 2505 * start a new transaction and commit a log tree before we commit our 2506 * superblock. Anyone trying to commit a log tree locks this mutex before 2507 * writing its superblock. 2508 */ 2509 mutex_lock(&fs_info->tree_log_mutex); 2510 2511 spin_lock(&fs_info->trans_lock); 2512 cur_trans->state = TRANS_STATE_UNBLOCKED; 2513 fs_info->running_transaction = NULL; 2514 spin_unlock(&fs_info->trans_lock); 2515 mutex_unlock(&fs_info->reloc_mutex); 2516 2517 wake_up(&fs_info->transaction_wait); 2518 btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_UNBLOCKED); 2519 2520 /* If we have features changed, wake up the cleaner to update sysfs. 
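 * (The cleaner thread is expected to pick up BTRFS_FS_FEATURE_CHANGED and
 * do the sysfs update outside of the commit path.)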
*/ 2521 if (test_bit(BTRFS_FS_FEATURE_CHANGED, &fs_info->flags) && 2522 fs_info->cleaner_kthread) 2523 wake_up_process(fs_info->cleaner_kthread); 2524 2525 ret = btrfs_write_and_wait_transaction(trans); 2526 if (ret) { 2527 btrfs_handle_fs_error(fs_info, ret, 2528 "Error while writing out transaction"); 2529 mutex_unlock(&fs_info->tree_log_mutex); 2530 goto scrub_continue; 2531 } 2532 2533 ret = write_all_supers(fs_info, 0); 2534 /* 2535 * the super is written, we can safely allow the tree-loggers 2536 * to go about their business 2537 */ 2538 mutex_unlock(&fs_info->tree_log_mutex); 2539 if (ret) 2540 goto scrub_continue; 2541 2542 /* 2543 * We needn't acquire the lock here because there is no other task 2544 * which can change it. 2545 */ 2546 cur_trans->state = TRANS_STATE_SUPER_COMMITTED; 2547 wake_up(&cur_trans->commit_wait); 2548 btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_SUPER_COMMITTED); 2549 2550 btrfs_finish_extent_commit(trans); 2551 2552 if (test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &cur_trans->flags)) 2553 btrfs_clear_space_info_full(fs_info); 2554 2555 btrfs_set_last_trans_committed(fs_info, cur_trans->transid); 2556 /* 2557 * We needn't acquire the lock here because there is no other task 2558 * which can change it. 2559 */ 2560 cur_trans->state = TRANS_STATE_COMPLETED; 2561 wake_up(&cur_trans->commit_wait); 2562 btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_COMPLETED); 2563 2564 spin_lock(&fs_info->trans_lock); 2565 list_del_init(&cur_trans->list); 2566 spin_unlock(&fs_info->trans_lock); 2567 2568 btrfs_put_transaction(cur_trans); 2569 btrfs_put_transaction(cur_trans); 2570 2571 if (trans->type & __TRANS_FREEZABLE) 2572 sb_end_intwrite(fs_info->sb); 2573 2574 trace_btrfs_transaction_commit(fs_info); 2575 2576 interval = ktime_get_ns() - start_time; 2577 2578 btrfs_scrub_continue(fs_info); 2579 2580 if (current->journal_info == trans) 2581 current->journal_info = NULL; 2582 2583 kmem_cache_free(btrfs_trans_handle_cachep, trans); 2584 2585 update_commit_stats(fs_info, interval); 2586 2587 return ret; 2588 2589 unlock_reloc: 2590 mutex_unlock(&fs_info->reloc_mutex); 2591 btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_UNBLOCKED); 2592 scrub_continue: 2593 btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_SUPER_COMMITTED); 2594 btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_COMPLETED); 2595 btrfs_scrub_continue(fs_info); 2596 cleanup_transaction: 2597 btrfs_trans_release_metadata(trans); 2598 btrfs_cleanup_pending_block_groups(trans); 2599 btrfs_trans_release_chunk_metadata(trans); 2600 trans->block_rsv = NULL; 2601 btrfs_warn(fs_info, "Skipping commit of aborted transaction."); 2602 if (current->journal_info == trans) 2603 current->journal_info = NULL; 2604 cleanup_transaction(trans, ret); 2605 2606 return ret; 2607 2608 lockdep_release: 2609 btrfs_lockdep_release(fs_info, btrfs_trans_num_extwriters); 2610 btrfs_lockdep_release(fs_info, btrfs_trans_num_writers); 2611 goto cleanup_transaction; 2612 2613 lockdep_trans_commit_start_release: 2614 btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_PREP); 2615 btrfs_end_transaction(trans); 2616 return ret; 2617 } 2618 2619 /* 2620 * return < 0 if error 2621 * 0 if there are no more dead_roots at the time of call 2622 * 1 there are more to be processed, call me again 2623 * 2624 * The return value indicates there are certainly more snapshots to delete, but 2625 * if there comes a new one during processing, it may return 0. 
We don't mind, 2626 * because btrfs_commit_super will poke cleaner thread and it will process it a 2627 * few seconds later. 2628 */ 2629 int btrfs_clean_one_deleted_snapshot(struct btrfs_fs_info *fs_info) 2630 { 2631 struct btrfs_root *root; 2632 int ret; 2633 2634 spin_lock(&fs_info->trans_lock); 2635 if (list_empty(&fs_info->dead_roots)) { 2636 spin_unlock(&fs_info->trans_lock); 2637 return 0; 2638 } 2639 root = list_first_entry(&fs_info->dead_roots, 2640 struct btrfs_root, root_list); 2641 list_del_init(&root->root_list); 2642 spin_unlock(&fs_info->trans_lock); 2643 2644 btrfs_debug(fs_info, "cleaner removing %llu", root->root_key.objectid); 2645 2646 btrfs_kill_all_delayed_nodes(root); 2647 2648 if (btrfs_header_backref_rev(root->node) < 2649 BTRFS_MIXED_BACKREF_REV) 2650 ret = btrfs_drop_snapshot(root, 0, 0); 2651 else 2652 ret = btrfs_drop_snapshot(root, 1, 0); 2653 2654 btrfs_put_root(root); 2655 return (ret < 0) ? 0 : 1; 2656 } 2657 2658 /* 2659 * We only mark the transaction aborted and then set the file system read-only. 2660 * This will prevent new transactions from starting or trying to join this 2661 * one. 2662 * 2663 * This means that error recovery at the call site is limited to freeing 2664 * any local memory allocations and passing the error code up without 2665 * further cleanup. The transaction should complete as it normally would 2666 * in the call path but will return -EIO. 2667 * 2668 * We'll complete the cleanup in btrfs_end_transaction and 2669 * btrfs_commit_transaction. 2670 */ 2671 void __cold __btrfs_abort_transaction(struct btrfs_trans_handle *trans, 2672 const char *function, 2673 unsigned int line, int error, bool first_hit) 2674 { 2675 struct btrfs_fs_info *fs_info = trans->fs_info; 2676 2677 WRITE_ONCE(trans->aborted, error); 2678 WRITE_ONCE(trans->transaction->aborted, error); 2679 if (first_hit && error == -ENOSPC) 2680 btrfs_dump_space_info_for_trans_abort(fs_info); 2681 /* Wake up anybody who may be waiting on this transaction */ 2682 wake_up(&fs_info->transaction_wait); 2683 wake_up(&fs_info->transaction_blocked_wait); 2684 __btrfs_handle_fs_error(fs_info, function, line, error, NULL); 2685 } 2686 2687 int __init btrfs_transaction_init(void) 2688 { 2689 btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle", 2690 sizeof(struct btrfs_trans_handle), 0, 2691 SLAB_TEMPORARY | SLAB_MEM_SPREAD, NULL); 2692 if (!btrfs_trans_handle_cachep) 2693 return -ENOMEM; 2694 return 0; 2695 } 2696 2697 void __cold btrfs_transaction_exit(void) 2698 { 2699 kmem_cache_destroy(btrfs_trans_handle_cachep); 2700 } 2701