/*
 * Copyright (C) 2007 Oracle. All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 021110-1307, USA.
 */

#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/writeback.h>
#include <linux/pagemap.h>
#include <linux/blkdev.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "locking.h"
#include "tree-log.h"
#include "inode-map.h"

#define BTRFS_ROOT_TRANS_TAG 0

static noinline void put_transaction(struct btrfs_transaction *transaction)
{
	WARN_ON(atomic_read(&transaction->use_count) == 0);
	if (atomic_dec_and_test(&transaction->use_count)) {
		BUG_ON(!list_empty(&transaction->list));
		memset(transaction, 0, sizeof(*transaction));
		kmem_cache_free(btrfs_transaction_cachep, transaction);
	}
}

static noinline void switch_commit_root(struct btrfs_root *root)
{
	free_extent_buffer(root->commit_root);
	root->commit_root = btrfs_root_node(root);
}

/*
 * either allocate a new transaction or hop into the existing one
 */
static noinline int join_transaction(struct btrfs_root *root, int nofail)
{
	struct btrfs_transaction *cur_trans;

	spin_lock(&root->fs_info->trans_lock);
	if (root->fs_info->trans_no_join) {
		if (!nofail) {
			spin_unlock(&root->fs_info->trans_lock);
			return -EBUSY;
		}
	}

	cur_trans = root->fs_info->running_transaction;
	if (cur_trans) {
		atomic_inc(&cur_trans->use_count);
		atomic_inc(&cur_trans->num_writers);
		cur_trans->num_joined++;
		spin_unlock(&root->fs_info->trans_lock);
		return 0;
	}
	spin_unlock(&root->fs_info->trans_lock);

	cur_trans = kmem_cache_alloc(btrfs_transaction_cachep, GFP_NOFS);
	if (!cur_trans)
		return -ENOMEM;
	spin_lock(&root->fs_info->trans_lock);
	if (root->fs_info->running_transaction) {
		kmem_cache_free(btrfs_transaction_cachep, cur_trans);
		cur_trans = root->fs_info->running_transaction;
		atomic_inc(&cur_trans->use_count);
		atomic_inc(&cur_trans->num_writers);
		cur_trans->num_joined++;
		spin_unlock(&root->fs_info->trans_lock);
		return 0;
	}
	atomic_set(&cur_trans->num_writers, 1);
	cur_trans->num_joined = 0;
	init_waitqueue_head(&cur_trans->writer_wait);
	init_waitqueue_head(&cur_trans->commit_wait);
	cur_trans->in_commit = 0;
	cur_trans->blocked = 0;
	/*
	 * One for this trans handle, one so it will live on until we
	 * commit the transaction.
	 */
	atomic_set(&cur_trans->use_count, 2);
	cur_trans->commit_done = 0;
	cur_trans->start_time = get_seconds();

	cur_trans->delayed_refs.root = RB_ROOT;
	cur_trans->delayed_refs.num_entries = 0;
	cur_trans->delayed_refs.num_heads_ready = 0;
	cur_trans->delayed_refs.num_heads = 0;
	cur_trans->delayed_refs.flushing = 0;
	cur_trans->delayed_refs.run_delayed_start = 0;
	spin_lock_init(&cur_trans->commit_lock);
	spin_lock_init(&cur_trans->delayed_refs.lock);

	INIT_LIST_HEAD(&cur_trans->pending_snapshots);
	list_add_tail(&cur_trans->list, &root->fs_info->trans_list);
	extent_io_tree_init(&cur_trans->dirty_pages,
			    root->fs_info->btree_inode->i_mapping);
	root->fs_info->generation++;
	cur_trans->transid = root->fs_info->generation;
	root->fs_info->running_transaction = cur_trans;
	spin_unlock(&root->fs_info->trans_lock);

	return 0;
}

/*
 * this does all the record keeping required to make sure that a reference
 * counted root is properly recorded in a given transaction. This is required
 * to make sure the old root from before we joined the transaction is deleted
 * when the transaction commits
 */
int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root)
{
	if (root->ref_cows && root->last_trans < trans->transid) {
		WARN_ON(root == root->fs_info->extent_root);
		WARN_ON(root->commit_root != root->node);

		spin_lock(&root->fs_info->fs_roots_radix_lock);
		if (root->last_trans == trans->transid) {
			spin_unlock(&root->fs_info->fs_roots_radix_lock);
			return 0;
		}
		root->last_trans = trans->transid;
		radix_tree_tag_set(&root->fs_info->fs_roots_radix,
				   (unsigned long)root->root_key.objectid,
				   BTRFS_ROOT_TRANS_TAG);
		spin_unlock(&root->fs_info->fs_roots_radix_lock);
		btrfs_init_reloc_root(trans, root);
	}
	return 0;
}

/* wait for commit against the current transaction to become unblocked
 * when this is done, it is safe to start a new transaction, but the current
 * transaction might not be fully on disk.
 */
static void wait_current_trans(struct btrfs_root *root)
{
	struct btrfs_transaction *cur_trans;

	spin_lock(&root->fs_info->trans_lock);
	cur_trans = root->fs_info->running_transaction;
	if (cur_trans && cur_trans->blocked) {
		DEFINE_WAIT(wait);
		atomic_inc(&cur_trans->use_count);
		spin_unlock(&root->fs_info->trans_lock);
		while (1) {
			prepare_to_wait(&root->fs_info->transaction_wait, &wait,
					TASK_UNINTERRUPTIBLE);
			if (!cur_trans->blocked)
				break;
			schedule();
		}
		finish_wait(&root->fs_info->transaction_wait, &wait);
		put_transaction(cur_trans);
	} else {
		spin_unlock(&root->fs_info->trans_lock);
	}
}

enum btrfs_trans_type {
	TRANS_START,
	TRANS_JOIN,
	TRANS_USERSPACE,
	TRANS_JOIN_NOLOCK,
};

static int may_wait_transaction(struct btrfs_root *root, int type)
{
	if (root->fs_info->log_root_recovering)
		return 0;

	if (type == TRANS_USERSPACE)
		return 1;

	if (type == TRANS_START &&
	    !atomic_read(&root->fs_info->open_ioctl_trans))
		return 1;

	return 0;
}

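/*
 * common code to start a transaction of the given type.  num_items is an
 * estimate of the number of metadata items the caller will modify, and is
 * used to reserve space for them up front via btrfs_trans_reserve_metadata().
 */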
static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
						    u64 num_items, int type)
{
	struct btrfs_trans_handle *h;
	struct btrfs_transaction *cur_trans;
	int retries = 0;
	int ret;

	if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)
		return ERR_PTR(-EROFS);

	if (current->journal_info) {
		WARN_ON(type != TRANS_JOIN && type != TRANS_JOIN_NOLOCK);
		h = current->journal_info;
		h->use_count++;
		h->orig_rsv = h->block_rsv;
		h->block_rsv = NULL;
		goto got_it;
	}
again:
	h = kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS);
	if (!h)
		return ERR_PTR(-ENOMEM);

	if (may_wait_transaction(root, type))
		wait_current_trans(root);

	do {
		ret = join_transaction(root, type == TRANS_JOIN_NOLOCK);
		if (ret == -EBUSY)
			wait_current_trans(root);
	} while (ret == -EBUSY);

	if (ret < 0) {
		kmem_cache_free(btrfs_trans_handle_cachep, h);
		return ERR_PTR(ret);
	}

	cur_trans = root->fs_info->running_transaction;

	h->transid = cur_trans->transid;
	h->transaction = cur_trans;
	h->blocks_used = 0;
	h->bytes_reserved = 0;
	h->delayed_ref_updates = 0;
	h->use_count = 1;
	h->block_rsv = NULL;
	h->orig_rsv = NULL;

	smp_mb();
	if (cur_trans->blocked && may_wait_transaction(root, type)) {
		btrfs_commit_transaction(h, root);
		goto again;
	}

	if (num_items > 0) {
		ret = btrfs_trans_reserve_metadata(h, root, num_items);
		if (ret == -EAGAIN && !retries) {
			retries++;
			btrfs_commit_transaction(h, root);
			goto again;
		} else if (ret == -EAGAIN) {
			/*
			 * We have already retried and got EAGAIN, so really we
			 * don't have space, so set ret to -ENOSPC.
			 */
			ret = -ENOSPC;
		}

		if (ret < 0) {
			btrfs_end_transaction(h, root);
			return ERR_PTR(ret);
		}
	}

got_it:
	btrfs_record_root_in_trans(h, root);

	if (!current->journal_info && type != TRANS_USERSPACE)
		current->journal_info = h;
	return h;
}

struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
						   int num_items)
{
	return start_transaction(root, num_items, TRANS_START);
}

struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root)
{
	return start_transaction(root, 0, TRANS_JOIN);
}

struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root)
{
	return start_transaction(root, 0, TRANS_JOIN_NOLOCK);
}

struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root)
{
	return start_transaction(root, 0, TRANS_USERSPACE);
}

/* wait for a transaction commit to be fully complete */
static noinline int wait_for_commit(struct btrfs_root *root,
				    struct btrfs_transaction *commit)
{
	DEFINE_WAIT(wait);
	while (!commit->commit_done) {
		prepare_to_wait(&commit->commit_wait, &wait,
				TASK_UNINTERRUPTIBLE);
		if (commit->commit_done)
			break;
		schedule();
	}
	finish_wait(&commit->commit_wait, &wait);
	return 0;
}

int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid)
{
	struct btrfs_transaction *cur_trans = NULL, *t;
	int ret;

	ret = 0;
	if (transid) {
		if (transid <= root->fs_info->last_trans_committed)
			goto out;

		/* find specified transaction */
		spin_lock(&root->fs_info->trans_lock);
		list_for_each_entry(t, &root->fs_info->trans_list, list) {
			if (t->transid == transid) {
				cur_trans = t;
				atomic_inc(&cur_trans->use_count);
				break;
			}
			if (t->transid > transid)
				break;
		}
		spin_unlock(&root->fs_info->trans_lock);
		ret = -EINVAL;
		if (!cur_trans)
			goto out;  /* bad transid */
	} else {
		/* find newest transaction that is committing | committed */
		spin_lock(&root->fs_info->trans_lock);
		list_for_each_entry_reverse(t, &root->fs_info->trans_list,
					    list) {
			if (t->in_commit) {
				if (t->commit_done)
					goto out;
				cur_trans = t;
				atomic_inc(&cur_trans->use_count);
				break;
			}
		}
		spin_unlock(&root->fs_info->trans_lock);
		if (!cur_trans)
			goto out;  /* nothing committing|committed */
	}

	wait_for_commit(root, cur_trans);

	put_transaction(cur_trans);
	ret = 0;
out:
	return ret;
}

void btrfs_throttle(struct btrfs_root *root)
{
	if (!atomic_read(&root->fs_info->open_ioctl_trans))
		wait_current_trans(root);
}

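/*
 * check the state of the global block reservation; a non-zero return is used
 * by btrfs_should_end_transaction() and __btrfs_end_transaction() as a hint
 * that the running transaction should be wrapped up soon
 */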
static int should_end_transaction(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root)
{
	int ret;
	ret = btrfs_block_rsv_check(trans, root,
				    &root->fs_info->global_block_rsv, 0, 5);
	return ret ? 1 : 0;
}

int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root)
{
	struct btrfs_transaction *cur_trans = trans->transaction;
	int updates;

	smp_mb();
	if (cur_trans->blocked || cur_trans->delayed_refs.flushing)
		return 1;

	updates = trans->delayed_ref_updates;
	trans->delayed_ref_updates = 0;
	if (updates)
		btrfs_run_delayed_refs(trans, root, updates);

	return should_end_transaction(trans, root);
}

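/*
 * common code to end a transaction handle.  With throttle set, this caller is
 * willing to do extra work here (flush delayed refs, run delayed iputs, and
 * possibly commit); with lock set, the running transaction may be marked
 * blocked and the transaction kthread woken so a commit happens soon.
 */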
static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root, int throttle, int lock)
{
	struct btrfs_transaction *cur_trans = trans->transaction;
	struct btrfs_fs_info *info = root->fs_info;
	int count = 0;

	if (--trans->use_count) {
		trans->block_rsv = trans->orig_rsv;
		return 0;
	}

	while (count < 4) {
		unsigned long cur = trans->delayed_ref_updates;
		trans->delayed_ref_updates = 0;
		if (cur &&
		    trans->transaction->delayed_refs.num_heads_ready > 64) {
			trans->delayed_ref_updates = 0;

			/*
			 * do a full flush if the transaction is trying
			 * to close
			 */
			if (trans->transaction->delayed_refs.flushing)
				cur = 0;
			btrfs_run_delayed_refs(trans, root, cur);
		} else {
			break;
		}
		count++;
	}

	btrfs_trans_release_metadata(trans, root);

	if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) &&
	    should_end_transaction(trans, root)) {
		trans->transaction->blocked = 1;
		smp_wmb();
	}

	if (lock && cur_trans->blocked && !cur_trans->in_commit) {
		if (throttle)
			return btrfs_commit_transaction(trans, root);
		else
			wake_up_process(info->transaction_kthread);
	}

	WARN_ON(cur_trans != info->running_transaction);
	WARN_ON(atomic_read(&cur_trans->num_writers) < 1);
	atomic_dec(&cur_trans->num_writers);

	smp_mb();
	if (waitqueue_active(&cur_trans->writer_wait))
		wake_up(&cur_trans->writer_wait);
	put_transaction(cur_trans);

	if (current->journal_info == trans)
		current->journal_info = NULL;
	memset(trans, 0, sizeof(*trans));
	kmem_cache_free(btrfs_trans_handle_cachep, trans);

	if (throttle)
		btrfs_run_delayed_iputs(root);

	return 0;
}

int btrfs_end_transaction(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root)
{
	int ret;

	ret = __btrfs_end_transaction(trans, root, 0, 1);
	if (ret)
		return ret;
	return 0;
}

int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root)
{
	int ret;

	ret = __btrfs_end_transaction(trans, root, 1, 1);
	if (ret)
		return ret;
	return 0;
}

int btrfs_end_transaction_nolock(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root)
{
	int ret;

	ret = __btrfs_end_transaction(trans, root, 0, 0);
	if (ret)
		return ret;
	return 0;
}

int btrfs_end_transaction_dmeta(struct btrfs_trans_handle *trans,
				struct btrfs_root *root)
{
	return __btrfs_end_transaction(trans, root, 1, 1);
}

/*
 * when btree blocks are allocated, they have some corresponding bits set for
 * them in one of two extent_io trees.  This is used to make sure all of
 * those extents are sent to disk but does not wait on them
 */
int btrfs_write_marked_extents(struct btrfs_root *root,
			       struct extent_io_tree *dirty_pages, int mark)
{
	int ret;
	int err = 0;
	int werr = 0;
	struct page *page;
	struct inode *btree_inode = root->fs_info->btree_inode;
	u64 start = 0;
	u64 end;
	unsigned long index;

	while (1) {
		ret = find_first_extent_bit(dirty_pages, start, &start, &end,
					    mark);
		if (ret)
			break;
		while (start <= end) {
			cond_resched();

			index = start >> PAGE_CACHE_SHIFT;
			start = (u64)(index + 1) << PAGE_CACHE_SHIFT;
			page = find_get_page(btree_inode->i_mapping, index);
			if (!page)
				continue;

			btree_lock_page_hook(page);
			if (!page->mapping) {
				unlock_page(page);
				page_cache_release(page);
				continue;
			}

			if (PageWriteback(page)) {
				if (PageDirty(page))
					wait_on_page_writeback(page);
				else {
					unlock_page(page);
					page_cache_release(page);
					continue;
				}
			}
			err = write_one_page(page, 0);
			if (err)
				werr = err;
			page_cache_release(page);
		}
	}
	if (err)
		werr = err;
	return werr;
}

/*
 * when btree blocks are allocated, they have some corresponding bits set for
 * them in one of two extent_io trees.  This is used to make sure all of
 * those extents are on disk for transaction or log commit.  We wait
 * on all the pages and clear them from the dirty pages state tree
 */
int btrfs_wait_marked_extents(struct btrfs_root *root,
			      struct extent_io_tree *dirty_pages, int mark)
{
	int ret;
	int err = 0;
	int werr = 0;
	struct page *page;
	struct inode *btree_inode = root->fs_info->btree_inode;
	u64 start = 0;
	u64 end;
	unsigned long index;

	while (1) {
		ret = find_first_extent_bit(dirty_pages, start, &start, &end,
					    mark);
		if (ret)
			break;

		clear_extent_bits(dirty_pages, start, end, mark, GFP_NOFS);
		while (start <= end) {
			index = start >> PAGE_CACHE_SHIFT;
			start = (u64)(index + 1) << PAGE_CACHE_SHIFT;
			page = find_get_page(btree_inode->i_mapping, index);
			if (!page)
				continue;
			if (PageDirty(page)) {
				btree_lock_page_hook(page);
				wait_on_page_writeback(page);
				err = write_one_page(page, 0);
				if (err)
					werr = err;
			}
			wait_on_page_writeback(page);
			page_cache_release(page);
			cond_resched();
		}
	}
	if (err)
		werr = err;
	return werr;
}

/*
 * when btree blocks are allocated, they have some corresponding bits set for
 * them in one of two extent_io trees.  This is used to make sure all of
 * those extents are on disk for transaction or log commit
 */
int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
					struct extent_io_tree *dirty_pages, int mark)
{
	int ret;
	int ret2;

	ret = btrfs_write_marked_extents(root, dirty_pages, mark);
	ret2 = btrfs_wait_marked_extents(root, dirty_pages, mark);
	return ret || ret2;
}

int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
				     struct btrfs_root *root)
{
	if (!trans || !trans->transaction) {
		struct inode *btree_inode;
		btree_inode = root->fs_info->btree_inode;
		return filemap_write_and_wait(btree_inode->i_mapping);
	}
	return btrfs_write_and_wait_marked_extents(root,
					&trans->transaction->dirty_pages,
					EXTENT_DIRTY);
}

/*
 * this is used to update the root pointer in the tree of tree roots.
 *
 * But, in the case of the extent allocation tree, updating the root
 * pointer may allocate blocks which may change the root of the extent
 * allocation tree.
 *
 * So, this loops and repeats and makes sure the cowonly root didn't
 * change while the root pointer was being updated in the metadata.
 */
static int update_cowonly_root(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root)
{
	int ret;
	u64 old_root_bytenr;
	u64 old_root_used;
	struct btrfs_root *tree_root = root->fs_info->tree_root;

	old_root_used = btrfs_root_used(&root->root_item);
	btrfs_write_dirty_block_groups(trans, root);

	while (1) {
		old_root_bytenr = btrfs_root_bytenr(&root->root_item);
		if (old_root_bytenr == root->node->start &&
		    old_root_used == btrfs_root_used(&root->root_item))
			break;

		btrfs_set_root_node(&root->root_item, root->node);
		ret = btrfs_update_root(trans, tree_root,
					&root->root_key,
					&root->root_item);
		BUG_ON(ret);

		old_root_used = btrfs_root_used(&root->root_item);
		ret = btrfs_write_dirty_block_groups(trans, root);
		BUG_ON(ret);
	}

	if (root != root->fs_info->extent_root)
		switch_commit_root(root);

	return 0;
}

/*
 * update all the cowonly tree roots on disk
 */
static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
					 struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct list_head *next;
	struct extent_buffer *eb;
	int ret;

	ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
	BUG_ON(ret);

	eb = btrfs_lock_root_node(fs_info->tree_root);
	btrfs_cow_block(trans, fs_info->tree_root, eb, NULL, 0, &eb);
	btrfs_tree_unlock(eb);
	free_extent_buffer(eb);

	ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
	BUG_ON(ret);

	while (!list_empty(&fs_info->dirty_cowonly_roots)) {
		next = fs_info->dirty_cowonly_roots.next;
		list_del_init(next);
		root = list_entry(next, struct btrfs_root, dirty_list);

		update_cowonly_root(trans, root);
	}

	down_write(&fs_info->extent_commit_sem);
	switch_commit_root(fs_info->extent_root);
	up_write(&fs_info->extent_commit_sem);

	return 0;
}

/*
 * dead roots are old snapshots that need to be deleted.  This allocates
 * a dirty root struct and adds it into the list of dead roots that need to
 * be deleted
 */
int btrfs_add_dead_root(struct btrfs_root *root)
{
	spin_lock(&root->fs_info->trans_lock);
	list_add(&root->root_list, &root->fs_info->dead_roots);
	spin_unlock(&root->fs_info->trans_lock);
	return 0;
}

/*
 * update on disk all the fs tree roots that were modified in this transaction
 */
static noinline int commit_fs_roots(struct btrfs_trans_handle *trans,
				    struct btrfs_root *root)
{
	struct btrfs_root *gang[8];
	struct btrfs_fs_info *fs_info = root->fs_info;
	int i;
	int ret;
	int err = 0;

	spin_lock(&fs_info->fs_roots_radix_lock);
	while (1) {
		ret = radix_tree_gang_lookup_tag(&fs_info->fs_roots_radix,
						 (void **)gang, 0,
						 ARRAY_SIZE(gang),
						 BTRFS_ROOT_TRANS_TAG);
		if (ret == 0)
			break;
		for (i = 0; i < ret; i++) {
			root = gang[i];
			radix_tree_tag_clear(&fs_info->fs_roots_radix,
				     (unsigned long)root->root_key.objectid,
				     BTRFS_ROOT_TRANS_TAG);
			spin_unlock(&fs_info->fs_roots_radix_lock);

			btrfs_free_log(trans, root);
			btrfs_update_reloc_root(trans, root);
			btrfs_orphan_commit_root(trans, root);

			btrfs_save_ino_cache(root, trans);

			if (root->commit_root != root->node) {
				mutex_lock(&root->fs_commit_mutex);
				switch_commit_root(root);
				btrfs_unpin_free_ino(root);
				mutex_unlock(&root->fs_commit_mutex);

				btrfs_set_root_node(&root->root_item,
						    root->node);
			}

			err = btrfs_update_root(trans, fs_info->tree_root,
						&root->root_key,
						&root->root_item);
			spin_lock(&fs_info->fs_roots_radix_lock);
			if (err)
				break;
		}
	}
	spin_unlock(&fs_info->fs_roots_radix_lock);
	return err;
}

/*
 * defrag a given btree.  If cacheonly == 1, this won't read from the disk,
 * otherwise every leaf in the btree is read and defragged.
 */
int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
{
	struct btrfs_fs_info *info = root->fs_info;
	struct btrfs_trans_handle *trans;
	int ret;
	unsigned long nr;

	if (xchg(&root->defrag_running, 1))
		return 0;

	while (1) {
		trans = btrfs_start_transaction(root, 0);
		if (IS_ERR(trans))
			return PTR_ERR(trans);

		ret = btrfs_defrag_leaves(trans, root, cacheonly);

		nr = trans->blocks_used;
		btrfs_end_transaction(trans, root);
		btrfs_btree_balance_dirty(info->tree_root, nr);
		cond_resched();

		if (btrfs_fs_closing(root->fs_info) || ret != -EAGAIN)
			break;
	}
	root->defrag_running = 0;
	return ret;
}

/*
 * new snapshots need to be created at a very specific time in the
 * transaction commit.  This does the actual creation
 */
static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
				   struct btrfs_fs_info *fs_info,
				   struct btrfs_pending_snapshot *pending)
{
	struct btrfs_key key;
	struct btrfs_root_item *new_root_item;
	struct btrfs_root *tree_root = fs_info->tree_root;
	struct btrfs_root *root = pending->root;
	struct btrfs_root *parent_root;
	struct inode *parent_inode;
	struct dentry *parent;
	struct dentry *dentry;
	struct extent_buffer *tmp;
	struct extent_buffer *old;
	int ret;
	u64 to_reserve = 0;
	u64 index = 0;
	u64 objectid;
	u64 root_flags;

	new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS);
	if (!new_root_item) {
		pending->error = -ENOMEM;
		goto fail;
	}

	ret = btrfs_find_free_objectid(tree_root, &objectid);
	if (ret) {
		pending->error = ret;
		goto fail;
	}

	btrfs_reloc_pre_snapshot(trans, pending, &to_reserve);
	btrfs_orphan_pre_snapshot(trans, pending, &to_reserve);

	if (to_reserve > 0) {
		ret = btrfs_block_rsv_add(trans, root, &pending->block_rsv,
					  to_reserve);
		if (ret) {
			pending->error = ret;
			goto fail;
		}
	}

	key.objectid = objectid;
	key.offset = (u64)-1;
	key.type = BTRFS_ROOT_ITEM_KEY;

	trans->block_rsv = &pending->block_rsv;

	dentry = pending->dentry;
	parent = dget_parent(dentry);
	parent_inode = parent->d_inode;
	parent_root = BTRFS_I(parent_inode)->root;
	btrfs_record_root_in_trans(trans, parent_root);

	/*
	 * insert the directory item
	 */
	ret = btrfs_set_inode_index(parent_inode, &index);
	BUG_ON(ret);
	ret = btrfs_insert_dir_item(trans, parent_root,
				dentry->d_name.name, dentry->d_name.len,
				parent_inode, &key,
				BTRFS_FT_DIR, index);
	BUG_ON(ret);

	btrfs_i_size_write(parent_inode, parent_inode->i_size +
					 dentry->d_name.len * 2);
	ret = btrfs_update_inode(trans, parent_root, parent_inode);
	BUG_ON(ret);

	btrfs_record_root_in_trans(trans, root);
	btrfs_set_root_last_snapshot(&root->root_item, trans->transid);
	memcpy(new_root_item, &root->root_item, sizeof(*new_root_item));
	btrfs_check_and_init_root_item(new_root_item);

	root_flags = btrfs_root_flags(new_root_item);
	if (pending->readonly)
		root_flags |= BTRFS_ROOT_SUBVOL_RDONLY;
	else
		root_flags &= ~BTRFS_ROOT_SUBVOL_RDONLY;
	btrfs_set_root_flags(new_root_item, root_flags);

	old = btrfs_lock_root_node(root);
	btrfs_cow_block(trans, root, old, NULL, 0, &old);
	btrfs_set_lock_blocking(old);

	btrfs_copy_root(trans, root, old, &tmp, objectid);
	btrfs_tree_unlock(old);
	free_extent_buffer(old);

	btrfs_set_root_node(new_root_item, tmp);
	/* record when the snapshot was created in key.offset */
	key.offset = trans->transid;
	ret = btrfs_insert_root(trans, tree_root, &key, new_root_item);
	btrfs_tree_unlock(tmp);
	free_extent_buffer(tmp);
	BUG_ON(ret);

	/*
	 * insert root back/forward references
	 */
	ret = btrfs_add_root_ref(trans, tree_root, objectid,
				 parent_root->root_key.objectid,
				 btrfs_ino(parent_inode), index,
				 dentry->d_name.name, dentry->d_name.len);
	BUG_ON(ret);
	dput(parent);

	key.offset = (u64)-1;
	pending->snap = btrfs_read_fs_root_no_name(root->fs_info, &key);
	BUG_ON(IS_ERR(pending->snap));

	btrfs_reloc_post_snapshot(trans, pending);
	btrfs_orphan_post_snapshot(trans, pending);
fail:
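	/*
	 * the error paths above jump here with pending->error set; the block
	 * reservation is always released and the root_item copy freed
	 * (kfree of NULL is a no-op)
	 */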
	kfree(new_root_item);
	btrfs_block_rsv_release(root, &pending->block_rsv, (u64)-1);
	return 0;
}

/*
 * create all the snapshots we've scheduled for creation
 */
static noinline int create_pending_snapshots(struct btrfs_trans_handle *trans,
					     struct btrfs_fs_info *fs_info)
{
	struct btrfs_pending_snapshot *pending;
	struct list_head *head = &trans->transaction->pending_snapshots;
	int ret;

	list_for_each_entry(pending, head, list) {
		/*
		 * We must deal with the delayed items before creating
		 * snapshots, or we will create a snapshot with inconsistent
		 * information.
		 */
		ret = btrfs_run_delayed_items(trans, fs_info->fs_root);
		BUG_ON(ret);

		ret = create_pending_snapshot(trans, fs_info, pending);
		BUG_ON(ret);
	}
	return 0;
}

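/*
 * copy the current chunk tree and tree root pointers (and the free space
 * cache generation) into the in-memory copy of the super block so they go
 * to disk with this commit
 */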
static void update_super_roots(struct btrfs_root *root)
{
	struct btrfs_root_item *root_item;
	struct btrfs_super_block *super;

	super = &root->fs_info->super_copy;

	root_item = &root->fs_info->chunk_root->root_item;
	super->chunk_root = root_item->bytenr;
	super->chunk_root_generation = root_item->generation;
	super->chunk_root_level = root_item->level;

	root_item = &root->fs_info->tree_root->root_item;
	super->root = root_item->bytenr;
	super->generation = root_item->generation;
	super->root_level = root_item->level;
	if (super->cache_generation != 0 || btrfs_test_opt(root, SPACE_CACHE))
		super->cache_generation = root_item->generation;
}

int btrfs_transaction_in_commit(struct btrfs_fs_info *info)
{
	int ret = 0;
	spin_lock(&info->trans_lock);
	if (info->running_transaction)
		ret = info->running_transaction->in_commit;
	spin_unlock(&info->trans_lock);
	return ret;
}

int btrfs_transaction_blocked(struct btrfs_fs_info *info)
{
	int ret = 0;
	spin_lock(&info->trans_lock);
	if (info->running_transaction)
		ret = info->running_transaction->blocked;
	spin_unlock(&info->trans_lock);
	return ret;
}

/*
 * wait for the current transaction commit to start and block subsequent
 * transaction joins
 */
static void wait_current_trans_commit_start(struct btrfs_root *root,
					    struct btrfs_transaction *trans)
{
	DEFINE_WAIT(wait);

	if (trans->in_commit)
		return;

	while (1) {
		prepare_to_wait(&root->fs_info->transaction_blocked_wait, &wait,
				TASK_UNINTERRUPTIBLE);
		if (trans->in_commit) {
			finish_wait(&root->fs_info->transaction_blocked_wait,
				    &wait);
			break;
		}
		schedule();
		finish_wait(&root->fs_info->transaction_blocked_wait, &wait);
	}
}

/*
 * wait for the current transaction to start and then become unblocked.
 * caller holds ref.
 */
static void wait_current_trans_commit_start_and_unblock(struct btrfs_root *root,
					 struct btrfs_transaction *trans)
{
	DEFINE_WAIT(wait);

	if (trans->commit_done || (trans->in_commit && !trans->blocked))
		return;

	while (1) {
		prepare_to_wait(&root->fs_info->transaction_wait, &wait,
				TASK_UNINTERRUPTIBLE);
		if (trans->commit_done ||
		    (trans->in_commit && !trans->blocked)) {
			finish_wait(&root->fs_info->transaction_wait,
				    &wait);
			break;
		}
		schedule();
		finish_wait(&root->fs_info->transaction_wait,
			    &wait);
	}
}

/*
 * commit transactions asynchronously.  once btrfs_commit_transaction_async
 * returns, any subsequent transaction will not be allowed to join.
 */
struct btrfs_async_commit {
	struct btrfs_trans_handle *newtrans;
	struct btrfs_root *root;
	struct delayed_work work;
};

static void do_async_commit(struct work_struct *work)
{
	struct btrfs_async_commit *ac =
		container_of(work, struct btrfs_async_commit, work.work);

	btrfs_commit_transaction(ac->newtrans, ac->root);
	kfree(ac);
}

int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root,
				   int wait_for_unblock)
{
	struct btrfs_async_commit *ac;
	struct btrfs_transaction *cur_trans;

	ac = kmalloc(sizeof(*ac), GFP_NOFS);
	if (!ac)
		return -ENOMEM;

	INIT_DELAYED_WORK(&ac->work, do_async_commit);
	ac->root = root;
	ac->newtrans = btrfs_join_transaction(root);
	if (IS_ERR(ac->newtrans)) {
		int err = PTR_ERR(ac->newtrans);
		kfree(ac);
		return err;
	}

	/* take transaction reference */
	cur_trans = trans->transaction;
	atomic_inc(&cur_trans->use_count);

	btrfs_end_transaction(trans, root);
	schedule_delayed_work(&ac->work, 0);

	/* wait for transaction to start and unblock */
	if (wait_for_unblock)
		wait_current_trans_commit_start_and_unblock(root, cur_trans);
	else
		wait_current_trans_commit_start(root, cur_trans);
	put_transaction(cur_trans);

	return 0;
}

/*
 * btrfs_transaction state sequence:
 * in_commit = 0, blocked = 0  (initial)
 * in_commit = 1, blocked = 1
 * blocked = 0
 * commit_done = 1
 */
int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root)
{
	unsigned long joined = 0;
	struct btrfs_transaction *cur_trans;
	struct btrfs_transaction *prev_trans = NULL;
	DEFINE_WAIT(wait);
	int ret;
	int should_grow = 0;
	unsigned long now = get_seconds();
	int flush_on_commit = btrfs_test_opt(root, FLUSHONCOMMIT);

	btrfs_run_ordered_operations(root, 0);

	/* make a pass through all the delayed refs we have so far
	 * any running procs may add more while we are here
	 */
	ret = btrfs_run_delayed_refs(trans, root, 0);
	BUG_ON(ret);

	btrfs_trans_release_metadata(trans, root);

	cur_trans = trans->transaction;
	/*
	 * set the flushing flag so procs in this transaction have to
	 * start sending their work down.
	 */
	cur_trans->delayed_refs.flushing = 1;

	ret = btrfs_run_delayed_refs(trans, root, 0);
	BUG_ON(ret);

	spin_lock(&cur_trans->commit_lock);
	if (cur_trans->in_commit) {
		spin_unlock(&cur_trans->commit_lock);
		atomic_inc(&cur_trans->use_count);
		btrfs_end_transaction(trans, root);

		ret = wait_for_commit(root, cur_trans);
		BUG_ON(ret);

		put_transaction(cur_trans);

		return 0;
	}

	trans->transaction->in_commit = 1;
	trans->transaction->blocked = 1;
	spin_unlock(&cur_trans->commit_lock);
	wake_up(&root->fs_info->transaction_blocked_wait);

	spin_lock(&root->fs_info->trans_lock);
	if (cur_trans->list.prev != &root->fs_info->trans_list) {
		prev_trans = list_entry(cur_trans->list.prev,
					struct btrfs_transaction, list);
		if (!prev_trans->commit_done) {
			atomic_inc(&prev_trans->use_count);
			spin_unlock(&root->fs_info->trans_lock);

			wait_for_commit(root, prev_trans);

			put_transaction(prev_trans);
		} else {
			spin_unlock(&root->fs_info->trans_lock);
		}
	} else {
		spin_unlock(&root->fs_info->trans_lock);
	}

	if (now < cur_trans->start_time || now - cur_trans->start_time < 1)
		should_grow = 1;

	do {
		int snap_pending = 0;

		joined = cur_trans->num_joined;
		if (!list_empty(&trans->transaction->pending_snapshots))
			snap_pending = 1;

		WARN_ON(cur_trans != trans->transaction);

		if (flush_on_commit || snap_pending) {
			btrfs_start_delalloc_inodes(root, 1);
			ret = btrfs_wait_ordered_extents(root, 0, 1);
			BUG_ON(ret);
		}

		ret = btrfs_run_delayed_items(trans, root);
		BUG_ON(ret);

		/*
		 * rename doesn't use btrfs_join_transaction, so once we
		 * set the transaction to blocked above, we aren't going
		 * to get any new ordered operations.  We can safely run
		 * it here and know for sure that nothing new will be added
		 * to the list
		 */
		btrfs_run_ordered_operations(root, 1);

		prepare_to_wait(&cur_trans->writer_wait, &wait,
				TASK_UNINTERRUPTIBLE);

		if (atomic_read(&cur_trans->num_writers) > 1)
			schedule_timeout(MAX_SCHEDULE_TIMEOUT);
		else if (should_grow)
			schedule_timeout(1);

		finish_wait(&cur_trans->writer_wait, &wait);
		spin_lock(&root->fs_info->trans_lock);
		root->fs_info->trans_no_join = 1;
		spin_unlock(&root->fs_info->trans_lock);
	} while (atomic_read(&cur_trans->num_writers) > 1 ||
		 (should_grow && cur_trans->num_joined != joined));

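	/*
	 * trans_no_join is set and we are the only writer left, so no one
	 * else can join this transaction.  It is now safe to create the
	 * pending snapshots and flush out the remaining delayed items and
	 * delayed refs.
	 */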
	ret = create_pending_snapshots(trans, root->fs_info);
	BUG_ON(ret);

	ret = btrfs_run_delayed_items(trans, root);
	BUG_ON(ret);

	ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
	BUG_ON(ret);

	WARN_ON(cur_trans != trans->transaction);

	btrfs_scrub_pause(root);
	/* commit_fs_roots() and commit_cowonly_roots() are responsible for
	 * getting the various roots consistent with each other.  Every pointer
	 * in the tree of tree roots has to point to the most up to date
	 * root for every subvolume and other tree.  So, we have to keep
	 * the tree logging code from jumping in and changing any
	 * of the trees.
	 *
	 * At this point in the commit, there can't be any tree-log
	 * writers, but a little lower down we drop the trans mutex
	 * and let new people in.  By holding the tree_log_mutex
	 * from now until after the super is written, we avoid races
	 * with the tree-log code.
	 */
	mutex_lock(&root->fs_info->tree_log_mutex);

	ret = commit_fs_roots(trans, root);
	BUG_ON(ret);

	/* commit_fs_roots gets rid of all the tree log roots, it is now
	 * safe to free the root of tree log roots
	 */
	btrfs_free_log_root_tree(trans, root->fs_info);

	ret = commit_cowonly_roots(trans, root);
	BUG_ON(ret);

	btrfs_prepare_extent_commit(trans, root);

	cur_trans = root->fs_info->running_transaction;

	btrfs_set_root_node(&root->fs_info->tree_root->root_item,
			    root->fs_info->tree_root->node);
	switch_commit_root(root->fs_info->tree_root);

	btrfs_set_root_node(&root->fs_info->chunk_root->root_item,
			    root->fs_info->chunk_root->node);
	switch_commit_root(root->fs_info->chunk_root);

	update_super_roots(root);

	if (!root->fs_info->log_root_recovering) {
		btrfs_set_super_log_root(&root->fs_info->super_copy, 0);
		btrfs_set_super_log_root_level(&root->fs_info->super_copy, 0);
	}

	memcpy(&root->fs_info->super_for_commit, &root->fs_info->super_copy,
	       sizeof(root->fs_info->super_copy));

	trans->transaction->blocked = 0;
	spin_lock(&root->fs_info->trans_lock);
	root->fs_info->running_transaction = NULL;
	root->fs_info->trans_no_join = 0;
	spin_unlock(&root->fs_info->trans_lock);

	wake_up(&root->fs_info->transaction_wait);

	ret = btrfs_write_and_wait_transaction(trans, root);
	BUG_ON(ret);
	write_ctree_super(trans, root, 0);

	/*
	 * the super is written, we can safely allow the tree-loggers
	 * to go about their business
	 */
	mutex_unlock(&root->fs_info->tree_log_mutex);

	btrfs_finish_extent_commit(trans, root);

	cur_trans->commit_done = 1;

	root->fs_info->last_trans_committed = cur_trans->transid;

	wake_up(&cur_trans->commit_wait);

	spin_lock(&root->fs_info->trans_lock);
	list_del_init(&cur_trans->list);
	spin_unlock(&root->fs_info->trans_lock);

	put_transaction(cur_trans);
	put_transaction(cur_trans);

	trace_btrfs_transaction_commit(root);

	btrfs_scrub_continue(root);

	if (current->journal_info == trans)
		current->journal_info = NULL;

	kmem_cache_free(btrfs_trans_handle_cachep, trans);

	if (current != root->fs_info->transaction_kthread)
		btrfs_run_delayed_iputs(root);

	return ret;
}

/*
 * interface function to delete all the snapshots we have scheduled for deletion
 */
int btrfs_clean_old_snapshots(struct btrfs_root *root)
{
	LIST_HEAD(list);
	struct btrfs_fs_info *fs_info = root->fs_info;

	spin_lock(&fs_info->trans_lock);
	list_splice_init(&fs_info->dead_roots, &list);
	spin_unlock(&fs_info->trans_lock);

	while (!list_empty(&list)) {
		root = list_entry(list.next, struct btrfs_root, root_list);
		list_del(&root->root_list);

		btrfs_kill_all_delayed_nodes(root);

		if (btrfs_header_backref_rev(root->node) <
		    BTRFS_MIXED_BACKREF_REV)
			btrfs_drop_snapshot(root, NULL, 0);
		else
			btrfs_drop_snapshot(root, NULL, 1);
	}
	return 0;
}