/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 021110-1307, USA.
 */

#include <linux/fs.h>
#include <linux/sched.h>
#include <linux/writeback.h>
#include <linux/pagemap.h>
#include <linux/blkdev.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "locking.h"
#include "ref-cache.h"
#include "tree-log.h"

#define BTRFS_ROOT_TRANS_TAG 0

static noinline void put_transaction(struct btrfs_transaction *transaction)
{
	WARN_ON(transaction->use_count == 0);
	transaction->use_count--;
	if (transaction->use_count == 0) {
		list_del_init(&transaction->list);
		memset(transaction, 0, sizeof(*transaction));
		kmem_cache_free(btrfs_transaction_cachep, transaction);
	}
}

/*
 * either allocate a new transaction or hop into the existing one
 */
static noinline int join_transaction(struct btrfs_root *root)
{
	struct btrfs_transaction *cur_trans;
	cur_trans = root->fs_info->running_transaction;
	if (!cur_trans) {
		cur_trans = kmem_cache_alloc(btrfs_transaction_cachep,
					     GFP_NOFS);
		BUG_ON(!cur_trans);
		root->fs_info->generation++;
		root->fs_info->last_alloc = 0;
		root->fs_info->last_data_alloc = 0;
		cur_trans->num_writers = 1;
		cur_trans->num_joined = 0;
		cur_trans->transid = root->fs_info->generation;
		init_waitqueue_head(&cur_trans->writer_wait);
		init_waitqueue_head(&cur_trans->commit_wait);
		cur_trans->in_commit = 0;
		cur_trans->blocked = 0;
		cur_trans->use_count = 1;
		cur_trans->commit_done = 0;
		cur_trans->start_time = get_seconds();
		INIT_LIST_HEAD(&cur_trans->pending_snapshots);
		list_add_tail(&cur_trans->list, &root->fs_info->trans_list);
		extent_io_tree_init(&cur_trans->dirty_pages,
				    root->fs_info->btree_inode->i_mapping,
				    GFP_NOFS);
		spin_lock(&root->fs_info->new_trans_lock);
		root->fs_info->running_transaction = cur_trans;
		spin_unlock(&root->fs_info->new_trans_lock);
	} else {
		cur_trans->num_writers++;
		cur_trans->num_joined++;
	}

	return 0;
}
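
/*
 * A short note on lifetimes: a transaction is created above with
 * use_count == 1, start_transaction() takes an extra reference for every
 * handle it hands out, and each put_transaction() drops one; the struct
 * is freed by put_transaction() once the count reaches zero.
 */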

/*
 * this does all the record keeping required to make sure that a reference
 * counted root is properly recorded in a given transaction.  This is required
 * to make sure the old root from before we joined the transaction is deleted
 * when the transaction commits
 */
noinline int btrfs_record_root_in_trans(struct btrfs_root *root)
{
	struct btrfs_dirty_root *dirty;
	u64 running_trans_id = root->fs_info->running_transaction->transid;
	if (root->ref_cows && root->last_trans < running_trans_id) {
		WARN_ON(root == root->fs_info->extent_root);
		if (root->root_item.refs != 0) {
			radix_tree_tag_set(&root->fs_info->fs_roots_radix,
				   (unsigned long)root->root_key.objectid,
				   BTRFS_ROOT_TRANS_TAG);

			dirty = kmalloc(sizeof(*dirty), GFP_NOFS);
			BUG_ON(!dirty);
			dirty->root = kmalloc(sizeof(*dirty->root), GFP_NOFS);
			BUG_ON(!dirty->root);
			dirty->latest_root = root;
			INIT_LIST_HEAD(&dirty->list);

			root->commit_root = btrfs_root_node(root);

			memcpy(dirty->root, root, sizeof(*root));
			spin_lock_init(&dirty->root->node_lock);
			spin_lock_init(&dirty->root->list_lock);
			mutex_init(&dirty->root->objectid_mutex);
			mutex_init(&dirty->root->log_mutex);
			INIT_LIST_HEAD(&dirty->root->dead_list);
			dirty->root->node = root->commit_root;
			dirty->root->commit_root = NULL;

			spin_lock(&root->list_lock);
			list_add(&dirty->root->dead_list, &root->dead_list);
			spin_unlock(&root->list_lock);

			root->dirty_root = dirty;
		} else {
			WARN_ON(1);
		}
		root->last_trans = running_trans_id;
	}
	return 0;
}

/* wait for commit against the current transaction to become unblocked
 * when this is done, it is safe to start a new transaction, but the current
 * transaction might not be fully on disk.
 */
static void wait_current_trans(struct btrfs_root *root)
{
	struct btrfs_transaction *cur_trans;

	cur_trans = root->fs_info->running_transaction;
	if (cur_trans && cur_trans->blocked) {
		DEFINE_WAIT(wait);
		cur_trans->use_count++;
		while (1) {
			prepare_to_wait(&root->fs_info->transaction_wait, &wait,
					TASK_UNINTERRUPTIBLE);
			if (cur_trans->blocked) {
				mutex_unlock(&root->fs_info->trans_mutex);
				schedule();
				mutex_lock(&root->fs_info->trans_mutex);
				finish_wait(&root->fs_info->transaction_wait,
					    &wait);
			} else {
				finish_wait(&root->fs_info->transaction_wait,
					    &wait);
				break;
			}
		}
		put_transaction(cur_trans);
	}
}

static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
						    int num_blocks, int wait)
{
	struct btrfs_trans_handle *h =
		kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS);
	int ret;

	mutex_lock(&root->fs_info->trans_mutex);
	if (!root->fs_info->log_root_recovering &&
	    ((wait == 1 && !root->fs_info->open_ioctl_trans) || wait == 2))
		wait_current_trans(root);
	ret = join_transaction(root);
	BUG_ON(ret);

	btrfs_record_root_in_trans(root);
	h->transid = root->fs_info->running_transaction->transid;
	h->transaction = root->fs_info->running_transaction;
	h->blocks_reserved = num_blocks;
	h->blocks_used = 0;
	h->block_group = 0;
	h->alloc_exclude_nr = 0;
	h->alloc_exclude_start = 0;
	root->fs_info->running_transaction->use_count++;
	mutex_unlock(&root->fs_info->trans_mutex);
	return h;
}
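
/*
 * The three wrappers below differ only in the wait level they pass to
 * start_transaction(): btrfs_join_transaction() (0) never waits for a
 * blocked commit, btrfs_start_transaction() (1) waits unless a userland
 * ioctl transaction is open, and btrfs_start_ioctl_transaction() (2)
 * always waits.  None of them wait while log recovery is running.
 */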

struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
						   int num_blocks)
{
	return start_transaction(root, num_blocks, 1);
}
struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root,
						  int num_blocks)
{
	return start_transaction(root, num_blocks, 0);
}

struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r,
							  int num_blocks)
{
	return start_transaction(r, num_blocks, 2);
}

/* wait for a transaction commit to be fully complete */
static noinline int wait_for_commit(struct btrfs_root *root,
				    struct btrfs_transaction *commit)
{
	DEFINE_WAIT(wait);
	mutex_lock(&root->fs_info->trans_mutex);
	while (!commit->commit_done) {
		prepare_to_wait(&commit->commit_wait, &wait,
				TASK_UNINTERRUPTIBLE);
		if (commit->commit_done)
			break;
		mutex_unlock(&root->fs_info->trans_mutex);
		schedule();
		mutex_lock(&root->fs_info->trans_mutex);
	}
	mutex_unlock(&root->fs_info->trans_mutex);
	finish_wait(&commit->commit_wait, &wait);
	return 0;
}

/*
 * rate limit against the drop_snapshot code.  This helps to slow down new
 * operations if the drop_snapshot code isn't able to keep up.
 */
static void throttle_on_drops(struct btrfs_root *root)
{
	struct btrfs_fs_info *info = root->fs_info;
	int harder_count = 0;

harder:
	if (atomic_read(&info->throttles)) {
		DEFINE_WAIT(wait);
		int thr;
		thr = atomic_read(&info->throttle_gen);

		do {
			prepare_to_wait(&info->transaction_throttle,
					&wait, TASK_UNINTERRUPTIBLE);
			if (!atomic_read(&info->throttles)) {
				finish_wait(&info->transaction_throttle, &wait);
				break;
			}
			schedule();
			finish_wait(&info->transaction_throttle, &wait);
		} while (thr == atomic_read(&info->throttle_gen));
		harder_count++;

		if (root->fs_info->total_ref_cache_size > 1 * 1024 * 1024 &&
		    harder_count < 2)
			goto harder;

		if (root->fs_info->total_ref_cache_size > 5 * 1024 * 1024 &&
		    harder_count < 10)
			goto harder;

		if (root->fs_info->total_ref_cache_size > 10 * 1024 * 1024 &&
		    harder_count < 20)
			goto harder;
	}
}

void btrfs_throttle(struct btrfs_root *root)
{
	mutex_lock(&root->fs_info->trans_mutex);
	if (!root->fs_info->open_ioctl_trans)
		wait_current_trans(root);
	mutex_unlock(&root->fs_info->trans_mutex);

	throttle_on_drops(root);
}

static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root, int throttle)
{
	struct btrfs_transaction *cur_trans;
	struct btrfs_fs_info *info = root->fs_info;

	mutex_lock(&info->trans_mutex);
	cur_trans = info->running_transaction;
	WARN_ON(cur_trans != trans->transaction);
	WARN_ON(cur_trans->num_writers < 1);
	cur_trans->num_writers--;

	if (waitqueue_active(&cur_trans->writer_wait))
		wake_up(&cur_trans->writer_wait);
	put_transaction(cur_trans);
	mutex_unlock(&info->trans_mutex);
	memset(trans, 0, sizeof(*trans));
	kmem_cache_free(btrfs_trans_handle_cachep, trans);

	if (throttle)
		throttle_on_drops(root);

	return 0;
}

int btrfs_end_transaction(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root)
{
	return __btrfs_end_transaction(trans, root, 0);
}

int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root)
{
	return __btrfs_end_transaction(trans, root, 1);
}
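
/*
 * For illustration only (the block reservation of 1 and the placement are
 * made up for this sketch, not taken from a real caller): a typical user
 * of the handle API above pairs a start with an end on the same root,
 *
 *	trans = btrfs_start_transaction(root, 1);
 *	(modify btree items under this handle)
 *	btrfs_end_transaction(trans, root);
 *
 * with btrfs_end_transaction_throttle() used instead on paths that should
 * slow down while old snapshots are being dropped.
 */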

/*
 * when btree blocks are allocated, they have some corresponding bits set for
 * them in one of two extent_io trees.  This is used to make sure all of
 * those extents are on disk for transaction or log commit
 */
int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
					struct extent_io_tree *dirty_pages)
{
	int ret;
	int err = 0;
	int werr = 0;
	struct page *page;
	struct inode *btree_inode = root->fs_info->btree_inode;
	u64 start = 0;
	u64 end;
	unsigned long index;

	while (1) {
		ret = find_first_extent_bit(dirty_pages, start, &start, &end,
					    EXTENT_DIRTY);
		if (ret)
			break;
		while (start <= end) {
			cond_resched();

			index = start >> PAGE_CACHE_SHIFT;
			start = (u64)(index + 1) << PAGE_CACHE_SHIFT;
			page = find_get_page(btree_inode->i_mapping, index);
			if (!page)
				continue;

			btree_lock_page_hook(page);
			if (!page->mapping) {
				unlock_page(page);
				page_cache_release(page);
				continue;
			}

			if (PageWriteback(page)) {
				if (PageDirty(page))
					wait_on_page_writeback(page);
				else {
					unlock_page(page);
					page_cache_release(page);
					continue;
				}
			}
			err = write_one_page(page, 0);
			if (err)
				werr = err;
			page_cache_release(page);
		}
	}
	while (1) {
		ret = find_first_extent_bit(dirty_pages, 0, &start, &end,
					    EXTENT_DIRTY);
		if (ret)
			break;

		clear_extent_dirty(dirty_pages, start, end, GFP_NOFS);
		while (start <= end) {
			index = start >> PAGE_CACHE_SHIFT;
			start = (u64)(index + 1) << PAGE_CACHE_SHIFT;
			page = find_get_page(btree_inode->i_mapping, index);
			if (!page)
				continue;
			if (PageDirty(page)) {
				btree_lock_page_hook(page);
				wait_on_page_writeback(page);
				err = write_one_page(page, 0);
				if (err)
					werr = err;
			}
			wait_on_page_writeback(page);
			page_cache_release(page);
			cond_resched();
		}
	}
	if (err)
		werr = err;
	return werr;
}
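
/*
 * The two loops above walk a byte range one page at a time: in each pass,
 * index = start >> PAGE_CACHE_SHIFT picks the page that contains "start",
 * and start is then advanced to the first byte of the following page.
 * With 4K pages (PAGE_CACHE_SHIFT == 12), a dirty extent [0, 16383] is
 * visited as page indexes 0 through 3.
 */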

int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
				     struct btrfs_root *root)
{
	if (!trans || !trans->transaction) {
		struct inode *btree_inode;
		btree_inode = root->fs_info->btree_inode;
		return filemap_write_and_wait(btree_inode->i_mapping);
	}
	return btrfs_write_and_wait_marked_extents(root,
					   &trans->transaction->dirty_pages);
}

/*
 * this is used to update the root pointer in the tree of tree roots.
 *
 * But, in the case of the extent allocation tree, updating the root
 * pointer may allocate blocks which may change the root of the extent
 * allocation tree.
 *
 * So, this loops and repeats and makes sure the cowonly root didn't
 * change while the root pointer was being updated in the metadata.
 */
static int update_cowonly_root(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root)
{
	int ret;
	u64 old_root_bytenr;
	struct btrfs_root *tree_root = root->fs_info->tree_root;

	btrfs_extent_post_op(trans, root);
	btrfs_write_dirty_block_groups(trans, root);
	btrfs_extent_post_op(trans, root);

	while (1) {
		old_root_bytenr = btrfs_root_bytenr(&root->root_item);
		if (old_root_bytenr == root->node->start)
			break;
		btrfs_set_root_bytenr(&root->root_item,
				      root->node->start);
		btrfs_set_root_level(&root->root_item,
				     btrfs_header_level(root->node));
		btrfs_set_root_generation(&root->root_item, trans->transid);

		btrfs_extent_post_op(trans, root);

		ret = btrfs_update_root(trans, tree_root,
					&root->root_key,
					&root->root_item);
		BUG_ON(ret);
		btrfs_write_dirty_block_groups(trans, root);
		btrfs_extent_post_op(trans, root);
	}
	return 0;
}

/*
 * update all the cowonly tree roots on disk
 */
int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans,
			    struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct list_head *next;
	struct extent_buffer *eb;

	btrfs_extent_post_op(trans, fs_info->tree_root);

	eb = btrfs_lock_root_node(fs_info->tree_root);
	btrfs_cow_block(trans, fs_info->tree_root, eb, NULL, 0, &eb, 0);
	btrfs_tree_unlock(eb);
	free_extent_buffer(eb);

	btrfs_extent_post_op(trans, fs_info->tree_root);

	while (!list_empty(&fs_info->dirty_cowonly_roots)) {
		next = fs_info->dirty_cowonly_roots.next;
		list_del_init(next);
		root = list_entry(next, struct btrfs_root, dirty_list);

		update_cowonly_root(trans, root);
	}
	return 0;
}

/*
 * dead roots are old snapshots that need to be deleted.  This allocates
 * a dirty root struct and adds it into the list of dead roots that need to
 * be deleted
 */
int btrfs_add_dead_root(struct btrfs_root *root, struct btrfs_root *latest)
{
	struct btrfs_dirty_root *dirty;

	dirty = kmalloc(sizeof(*dirty), GFP_NOFS);
	if (!dirty)
		return -ENOMEM;
	dirty->root = root;
	dirty->latest_root = latest;

	mutex_lock(&root->fs_info->trans_mutex);
	list_add(&dirty->list, &latest->fs_info->dead_roots);
	mutex_unlock(&root->fs_info->trans_mutex);
	return 0;
}

/*
 * at transaction commit time we need to schedule the old roots for
 * deletion via btrfs_drop_snapshot.  This runs through all the
 * reference counted roots that were modified in the current
 * transaction and puts them into the drop list
 */
static noinline int add_dirty_roots(struct btrfs_trans_handle *trans,
				    struct radix_tree_root *radix,
				    struct list_head *list)
{
	struct btrfs_dirty_root *dirty;
	struct btrfs_root *gang[8];
	struct btrfs_root *root;
	int i;
	int ret;
	int err = 0;
	u32 refs;

	while (1) {
		ret = radix_tree_gang_lookup_tag(radix, (void **)gang, 0,
						 ARRAY_SIZE(gang),
						 BTRFS_ROOT_TRANS_TAG);
		if (ret == 0)
			break;
		for (i = 0; i < ret; i++) {
			root = gang[i];
			radix_tree_tag_clear(radix,
				     (unsigned long)root->root_key.objectid,
				     BTRFS_ROOT_TRANS_TAG);

			BUG_ON(!root->ref_tree);
			dirty = root->dirty_root;

			btrfs_free_log(trans, root);
			btrfs_free_reloc_root(trans, root);

			if (root->commit_root == root->node) {
				WARN_ON(root->node->start !=
					btrfs_root_bytenr(&root->root_item));

				free_extent_buffer(root->commit_root);
				root->commit_root = NULL;
				root->dirty_root = NULL;

				spin_lock(&root->list_lock);
				list_del_init(&dirty->root->dead_list);
				spin_unlock(&root->list_lock);

				kfree(dirty->root);
				kfree(dirty);

				/* make sure to update the root on disk
				 * so we get any updates to the block used
				 * counts
				 */
				err = btrfs_update_root(trans,
						root->fs_info->tree_root,
						&root->root_key,
						&root->root_item);
				continue;
			}

			memset(&root->root_item.drop_progress, 0,
			       sizeof(struct btrfs_disk_key));
			root->root_item.drop_level = 0;
			root->commit_root = NULL;
			root->dirty_root = NULL;
			root->root_key.offset = root->fs_info->generation;
			btrfs_set_root_bytenr(&root->root_item,
					      root->node->start);
			btrfs_set_root_level(&root->root_item,
					     btrfs_header_level(root->node));
			btrfs_set_root_generation(&root->root_item,
						  root->root_key.offset);

			err = btrfs_insert_root(trans, root->fs_info->tree_root,
						&root->root_key,
						&root->root_item);
			if (err)
				break;

			refs = btrfs_root_refs(&dirty->root->root_item);
			btrfs_set_root_refs(&dirty->root->root_item, refs - 1);
			err = btrfs_update_root(trans, root->fs_info->tree_root,
						&dirty->root->root_key,
						&dirty->root->root_item);

			BUG_ON(err);
			if (refs == 1) {
				list_add(&dirty->list, list);
			} else {
				WARN_ON(1);
				free_extent_buffer(dirty->root->node);
				kfree(dirty->root);
				kfree(dirty);
			}
		}
	}
	return err;
}
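
/*
 * Note on the refs handling in add_dirty_roots(): the old root's reference
 * count is decremented after the new root item is inserted.  Only when the
 * old count was 1 (this transaction held the last reference) is the dirty
 * root queued on the drop list for btrfs_drop_snapshot(); otherwise its
 * bookkeeping structures are freed immediately.
 */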

/*
 * defrag a given btree.  If cacheonly == 1, this won't read from the disk,
 * otherwise every leaf in the btree is read and defragged.
 */
int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
{
	struct btrfs_fs_info *info = root->fs_info;
	int ret;
	struct btrfs_trans_handle *trans;
	unsigned long nr;

	smp_mb();
	if (root->defrag_running)
		return 0;
	trans = btrfs_start_transaction(root, 1);
	while (1) {
		root->defrag_running = 1;
		ret = btrfs_defrag_leaves(trans, root, cacheonly);
		nr = trans->blocks_used;
		btrfs_end_transaction(trans, root);
		btrfs_btree_balance_dirty(info->tree_root, nr);
		cond_resched();

		trans = btrfs_start_transaction(root, 1);
		if (root->fs_info->closing || ret != -EAGAIN)
			break;
	}
	root->defrag_running = 0;
	smp_mb();
	btrfs_end_transaction(trans, root);
	return 0;
}

/*
 * Given a list of roots that need to be deleted, call btrfs_drop_snapshot on
 * all of them
 */
static noinline int drop_dirty_roots(struct btrfs_root *tree_root,
				     struct list_head *list)
{
	struct btrfs_dirty_root *dirty;
	struct btrfs_trans_handle *trans;
	unsigned long nr;
	u64 num_bytes;
	u64 bytes_used;
	u64 max_useless;
	int ret = 0;
	int err;

	while (!list_empty(list)) {
		struct btrfs_root *root;

		dirty = list_entry(list->prev, struct btrfs_dirty_root, list);
		list_del_init(&dirty->list);

		num_bytes = btrfs_root_used(&dirty->root->root_item);
		root = dirty->latest_root;
		atomic_inc(&root->fs_info->throttles);

		while (1) {
			trans = btrfs_start_transaction(tree_root, 1);
			mutex_lock(&root->fs_info->drop_mutex);
			ret = btrfs_drop_snapshot(trans, dirty->root);
			if (ret != -EAGAIN)
				break;
			mutex_unlock(&root->fs_info->drop_mutex);

			err = btrfs_update_root(trans,
					tree_root,
					&dirty->root->root_key,
					&dirty->root->root_item);
			if (err)
				ret = err;
			nr = trans->blocks_used;
			ret = btrfs_end_transaction(trans, tree_root);
			BUG_ON(ret);

			btrfs_btree_balance_dirty(tree_root, nr);
			cond_resched();
		}
		BUG_ON(ret);
		atomic_dec(&root->fs_info->throttles);
		wake_up(&root->fs_info->transaction_throttle);

		num_bytes -= btrfs_root_used(&dirty->root->root_item);
		bytes_used = btrfs_root_used(&root->root_item);
		if (num_bytes) {
			mutex_lock(&root->fs_info->trans_mutex);
			btrfs_record_root_in_trans(root);
			mutex_unlock(&root->fs_info->trans_mutex);
			btrfs_set_root_used(&root->root_item,
					    bytes_used - num_bytes);
		}

		ret = btrfs_del_root(trans, tree_root, &dirty->root->root_key);
		if (ret) {
			BUG();
			break;
		}
		mutex_unlock(&root->fs_info->drop_mutex);

		spin_lock(&root->list_lock);
		list_del_init(&dirty->root->dead_list);
		if (!list_empty(&root->dead_list)) {
			struct btrfs_root *oldest;
			oldest = list_entry(root->dead_list.prev,
					    struct btrfs_root, dead_list);
			max_useless = oldest->root_key.offset - 1;
		} else {
			max_useless = root->root_key.offset - 1;
		}
		spin_unlock(&root->list_lock);

		nr = trans->blocks_used;
		ret = btrfs_end_transaction(trans, tree_root);
		BUG_ON(ret);

		ret = btrfs_remove_leaf_refs(root, max_useless, 0);
		BUG_ON(ret);

		free_extent_buffer(dirty->root->node);
		kfree(dirty->root);
		kfree(dirty);

		btrfs_btree_balance_dirty(tree_root, nr);
		cond_resched();
	}
	return ret;
}

/*
 * new snapshots need to be created at a very specific time in the
 * transaction commit.  This does the actual creation
 */
static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
				   struct btrfs_fs_info *fs_info,
				   struct btrfs_pending_snapshot *pending)
{
	struct btrfs_key key;
	struct btrfs_root_item *new_root_item;
	struct btrfs_root *tree_root = fs_info->tree_root;
	struct btrfs_root *root = pending->root;
	struct extent_buffer *tmp;
	struct extent_buffer *old;
	int ret;
	u64 objectid;

	new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS);
	if (!new_root_item) {
		ret = -ENOMEM;
		goto fail;
	}
	ret = btrfs_find_free_objectid(trans, tree_root, 0, &objectid);
	if (ret)
		goto fail;

	btrfs_record_root_in_trans(root);
	btrfs_set_root_last_snapshot(&root->root_item, trans->transid);
	memcpy(new_root_item, &root->root_item, sizeof(*new_root_item));

	key.objectid = objectid;
	key.offset = trans->transid;
	btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);

	old = btrfs_lock_root_node(root);
	btrfs_cow_block(trans, root, old, NULL, 0, &old, 0);

	btrfs_copy_root(trans, root, old, &tmp, objectid);
	btrfs_tree_unlock(old);
	free_extent_buffer(old);

	btrfs_set_root_bytenr(new_root_item, tmp->start);
	btrfs_set_root_level(new_root_item, btrfs_header_level(tmp));
	btrfs_set_root_generation(new_root_item, trans->transid);
	ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key,
				new_root_item);
	btrfs_tree_unlock(tmp);
	free_extent_buffer(tmp);
	if (ret)
		goto fail;

	key.offset = (u64)-1;
	memcpy(&pending->root_key, &key, sizeof(key));
fail:
	kfree(new_root_item);
	return ret;
}

static noinline int finish_pending_snapshot(struct btrfs_fs_info *fs_info,
				   struct btrfs_pending_snapshot *pending)
{
	int ret;
	int namelen;
	u64 index = 0;
	struct btrfs_trans_handle *trans;
	struct inode *parent_inode;
	struct inode *inode;
	struct btrfs_root *parent_root;

	parent_inode = pending->dentry->d_parent->d_inode;
	parent_root = BTRFS_I(parent_inode)->root;
	trans = btrfs_join_transaction(parent_root, 1);

	/*
	 * insert the directory item
	 */
	namelen = strlen(pending->name);
	ret = btrfs_set_inode_index(parent_inode, &index);
	ret = btrfs_insert_dir_item(trans, parent_root,
			    pending->name, namelen,
			    parent_inode->i_ino,
			    &pending->root_key, BTRFS_FT_DIR, index);

	if (ret)
		goto fail;

	btrfs_i_size_write(parent_inode, parent_inode->i_size + namelen * 2);
	ret = btrfs_update_inode(trans, parent_root, parent_inode);
	BUG_ON(ret);

	/* add the backref first */
	ret = btrfs_add_root_ref(trans, parent_root->fs_info->tree_root,
				 pending->root_key.objectid,
				 BTRFS_ROOT_BACKREF_KEY,
				 parent_root->root_key.objectid,
				 parent_inode->i_ino, index, pending->name,
				 namelen);

	BUG_ON(ret);

	/* now add the forward ref */
	ret = btrfs_add_root_ref(trans, parent_root->fs_info->tree_root,
				 parent_root->root_key.objectid,
				 BTRFS_ROOT_REF_KEY,
				 pending->root_key.objectid,
				 parent_inode->i_ino, index, pending->name,
				 namelen);

	inode = btrfs_lookup_dentry(parent_inode, pending->dentry);
	d_instantiate(pending->dentry, inode);
fail:
	btrfs_end_transaction(trans, fs_info->fs_root);
	return ret;
}
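
/*
 * Snapshot creation is split into the two helpers above: the root item and
 * the copied tree root are created by create_pending_snapshot() while the
 * transaction is being committed, and the visible directory entry plus the
 * forward/backward root refs are added later by finish_pending_snapshot()
 * in a separate joined transaction.
 */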

/*
 * create all the snapshots we've scheduled for creation
 */
static noinline int create_pending_snapshots(struct btrfs_trans_handle *trans,
					     struct btrfs_fs_info *fs_info)
{
	struct btrfs_pending_snapshot *pending;
	struct list_head *head = &trans->transaction->pending_snapshots;
	int ret;

	list_for_each_entry(pending, head, list) {
		ret = create_pending_snapshot(trans, fs_info, pending);
		BUG_ON(ret);
	}
	return 0;
}

static noinline int finish_pending_snapshots(struct btrfs_trans_handle *trans,
					     struct btrfs_fs_info *fs_info)
{
	struct btrfs_pending_snapshot *pending;
	struct list_head *head = &trans->transaction->pending_snapshots;
	int ret;

	while (!list_empty(head)) {
		pending = list_entry(head->next,
				     struct btrfs_pending_snapshot, list);
		ret = finish_pending_snapshot(fs_info, pending);
		BUG_ON(ret);
		list_del(&pending->list);
		kfree(pending->name);
		kfree(pending);
	}
	return 0;
}

int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root)
{
	unsigned long joined = 0;
	unsigned long timeout = 1;
	struct btrfs_transaction *cur_trans;
	struct btrfs_transaction *prev_trans = NULL;
	struct btrfs_root *chunk_root = root->fs_info->chunk_root;
	struct list_head dirty_fs_roots;
	struct extent_io_tree *pinned_copy;
	DEFINE_WAIT(wait);
	int ret;

	INIT_LIST_HEAD(&dirty_fs_roots);
	mutex_lock(&root->fs_info->trans_mutex);
	if (trans->transaction->in_commit) {
		cur_trans = trans->transaction;
		trans->transaction->use_count++;
		mutex_unlock(&root->fs_info->trans_mutex);
		btrfs_end_transaction(trans, root);

		ret = wait_for_commit(root, cur_trans);
		BUG_ON(ret);

		mutex_lock(&root->fs_info->trans_mutex);
		put_transaction(cur_trans);
		mutex_unlock(&root->fs_info->trans_mutex);

		return 0;
	}

	pinned_copy = kmalloc(sizeof(*pinned_copy), GFP_NOFS);
	if (!pinned_copy)
		return -ENOMEM;

	extent_io_tree_init(pinned_copy,
			    root->fs_info->btree_inode->i_mapping, GFP_NOFS);

	trans->transaction->in_commit = 1;
	trans->transaction->blocked = 1;
	cur_trans = trans->transaction;
	if (cur_trans->list.prev != &root->fs_info->trans_list) {
		prev_trans = list_entry(cur_trans->list.prev,
					struct btrfs_transaction, list);
		if (!prev_trans->commit_done) {
			prev_trans->use_count++;
			mutex_unlock(&root->fs_info->trans_mutex);

			wait_for_commit(root, prev_trans);

			mutex_lock(&root->fs_info->trans_mutex);
			put_transaction(prev_trans);
		}
	}

	do {
		int snap_pending = 0;
		joined = cur_trans->num_joined;
		if (!list_empty(&trans->transaction->pending_snapshots))
			snap_pending = 1;

		WARN_ON(cur_trans != trans->transaction);
		prepare_to_wait(&cur_trans->writer_wait, &wait,
				TASK_UNINTERRUPTIBLE);

		if (cur_trans->num_writers > 1)
			timeout = MAX_SCHEDULE_TIMEOUT;
		else
			timeout = 1;

		mutex_unlock(&root->fs_info->trans_mutex);

		if (snap_pending) {
			ret = btrfs_wait_ordered_extents(root, 1);
			BUG_ON(ret);
		}

		schedule_timeout(timeout);

		mutex_lock(&root->fs_info->trans_mutex);
		finish_wait(&cur_trans->writer_wait, &wait);
	} while (cur_trans->num_writers > 1 ||
		 (cur_trans->num_joined != joined));

	ret = create_pending_snapshots(trans, root->fs_info);
	BUG_ON(ret);

	WARN_ON(cur_trans != trans->transaction);

	/* btrfs_commit_tree_roots is responsible for getting the
	 * various roots consistent with each other.  Every pointer
	 * in the tree of tree roots has to point to the most up to date
	 * root for every subvolume and other tree.  So, we have to keep
	 * the tree logging code from jumping in and changing any
	 * of the trees.
	 *
	 * At this point in the commit, there can't be any tree-log
	 * writers, but a little lower down we drop the trans mutex
	 * and let new people in.  By holding the tree_log_mutex
	 * from now until after the super is written, we avoid races
	 * with the tree-log code.
	 */
	mutex_lock(&root->fs_info->tree_log_mutex);
	/*
	 * keep tree reloc code from adding new reloc trees
	 */
	mutex_lock(&root->fs_info->tree_reloc_mutex);

	ret = add_dirty_roots(trans, &root->fs_info->fs_roots_radix,
			      &dirty_fs_roots);
	BUG_ON(ret);

	/* add_dirty_roots gets rid of all the tree log roots, it is now
	 * safe to free the root of tree log roots
	 */
	btrfs_free_log_root_tree(trans, root->fs_info);

	ret = btrfs_commit_tree_roots(trans, root);
	BUG_ON(ret);

	cur_trans = root->fs_info->running_transaction;
	spin_lock(&root->fs_info->new_trans_lock);
	root->fs_info->running_transaction = NULL;
	spin_unlock(&root->fs_info->new_trans_lock);
	btrfs_set_super_generation(&root->fs_info->super_copy,
				   cur_trans->transid);
	btrfs_set_super_root(&root->fs_info->super_copy,
			     root->fs_info->tree_root->node->start);
	btrfs_set_super_root_level(&root->fs_info->super_copy,
			   btrfs_header_level(root->fs_info->tree_root->node));

	btrfs_set_super_chunk_root(&root->fs_info->super_copy,
				   chunk_root->node->start);
	btrfs_set_super_chunk_root_level(&root->fs_info->super_copy,
					 btrfs_header_level(chunk_root->node));
	btrfs_set_super_chunk_root_generation(&root->fs_info->super_copy,
				btrfs_header_generation(chunk_root->node));

	if (!root->fs_info->log_root_recovering) {
		btrfs_set_super_log_root(&root->fs_info->super_copy, 0);
		btrfs_set_super_log_root_level(&root->fs_info->super_copy, 0);
	}

	memcpy(&root->fs_info->super_for_commit, &root->fs_info->super_copy,
	       sizeof(root->fs_info->super_copy));

	btrfs_copy_pinned(root, pinned_copy);

	trans->transaction->blocked = 0;
	wake_up(&root->fs_info->transaction_throttle);
	wake_up(&root->fs_info->transaction_wait);

	mutex_unlock(&root->fs_info->trans_mutex);
	ret = btrfs_write_and_wait_transaction(trans, root);
	BUG_ON(ret);
	write_ctree_super(trans, root, 0);

	/*
	 * the super is written, we can safely allow the tree-loggers
	 * to go about their business
	 */
	mutex_unlock(&root->fs_info->tree_log_mutex);

	btrfs_finish_extent_commit(trans, root, pinned_copy);
	kfree(pinned_copy);

	btrfs_drop_dead_reloc_roots(root);
	mutex_unlock(&root->fs_info->tree_reloc_mutex);

	/* do the directory inserts of any pending snapshot creations */
	finish_pending_snapshots(trans, root->fs_info);

	mutex_lock(&root->fs_info->trans_mutex);

	cur_trans->commit_done = 1;
	root->fs_info->last_trans_committed = cur_trans->transid;
	wake_up(&cur_trans->commit_wait);

	put_transaction(cur_trans);
	put_transaction(cur_trans);

	list_splice_init(&dirty_fs_roots, &root->fs_info->dead_roots);
	if (root->fs_info->closing)
		list_splice_init(&root->fs_info->dead_roots, &dirty_fs_roots);

	mutex_unlock(&root->fs_info->trans_mutex);

	kmem_cache_free(btrfs_trans_handle_cachep, trans);

	if (root->fs_info->closing)
		drop_dirty_roots(root->fs_info->tree_root, &dirty_fs_roots);
	return ret;
}
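
/*
 * Note that btrfs_commit_transaction() consumes the handle: it is freed
 * just before returning (or via btrfs_end_transaction() when the commit
 * is already in progress), so callers must not touch the trans pointer
 * after a commit.
 */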

/*
 * interface function to delete all the snapshots we have scheduled for
 * deletion
 */
int btrfs_clean_old_snapshots(struct btrfs_root *root)
{
	struct list_head dirty_roots;
	INIT_LIST_HEAD(&dirty_roots);
again:
	mutex_lock(&root->fs_info->trans_mutex);
	list_splice_init(&root->fs_info->dead_roots, &dirty_roots);
	mutex_unlock(&root->fs_info->trans_mutex);

	if (!list_empty(&dirty_roots)) {
		drop_dirty_roots(root, &dirty_roots);
		goto again;
	}
	return 0;
}