1 /* 2 * Copyright (C) 2008 Oracle. All rights reserved. 3 * 4 * This program is free software; you can redistribute it and/or 5 * modify it under the terms of the GNU General Public 6 * License v2 as published by the Free Software Foundation. 7 * 8 * This program is distributed in the hope that it will be useful, 9 * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 11 * General Public License for more details. 12 * 13 * You should have received a copy of the GNU General Public 14 * License along with this program; if not, write to the 15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330, 16 * Boston, MA 021110-1307, USA. 17 */ 18 19 #include <linux/sched.h> 20 #include "ctree.h" 21 #include "transaction.h" 22 #include "disk-io.h" 23 #include "locking.h" 24 #include "print-tree.h" 25 #include "compat.h" 26 #include "tree-log.h" 27 28 /* magic values for the inode_only field in btrfs_log_inode: 29 * 30 * LOG_INODE_ALL means to log everything 31 * LOG_INODE_EXISTS means to log just enough to recreate the inode 32 * during log replay 33 */ 34 #define LOG_INODE_ALL 0 35 #define LOG_INODE_EXISTS 1 36 37 /* 38 * directory trouble cases 39 * 40 * 1) on rename or unlink, if the inode being unlinked isn't in the fsync 41 * log, we must force a full commit before doing an fsync of the directory 42 * where the unlink was done. 43 * ---> record transid of last unlink/rename per directory 44 * 45 * mkdir foo/some_dir 46 * normal commit 47 * rename foo/some_dir foo2/some_dir 48 * mkdir foo/some_dir 49 * fsync foo/some_dir/some_file 50 * 51 * The fsync above will unlink the original some_dir without recording 52 * it in its new location (foo2). After a crash, some_dir will be gone 53 * unless the fsync of some_file forces a full commit 54 * 55 * 2) we must log any new names for any file or dir that is in the fsync 56 * log. ---> check inode while renaming/linking. 57 * 58 * 2a) we must log any new names for any file or dir during rename 59 * when the directory they are being removed from was logged. 60 * ---> check inode and old parent dir during rename 61 * 62 * 2a is actually the more important variant. With the extra logging 63 * a crash might unlink the old name without recreating the new one 64 * 65 * 3) after a crash, we must go through any directories with a link count 66 * of zero and redo the rm -rf 67 * 68 * mkdir f1/foo 69 * normal commit 70 * rm -rf f1/foo 71 * fsync(f1) 72 * 73 * The directory f1 was fully removed from the FS, but fsync was never 74 * called on f1, only its parent dir. After a crash the rm -rf must 75 * be replayed. This must be able to recurse down the entire 76 * directory tree. The inode link count fixup code takes care of the 77 * ugly details. 78 */ 79 80 /* 81 * stages for the tree walking. The first 82 * stage (0) is to only pin down the blocks we find 83 * the second stage (1) is to make sure that all the inodes 84 * we find in the log are created in the subvolume. 85 * 86 * The last stage is to deal with directories and links and extents 87 * and all the other fun semantics 88 */ 89 #define LOG_WALK_PIN_ONLY 0 90 #define LOG_WALK_REPLAY_INODES 1 91 #define LOG_WALK_REPLAY_ALL 2 92 93 static int btrfs_log_inode(struct btrfs_trans_handle *trans, 94 struct btrfs_root *root, struct inode *inode, 95 int inode_only); 96 static int link_to_fixup_dir(struct btrfs_trans_handle *trans, 97 struct btrfs_root *root, 98 struct btrfs_path *path, u64 objectid); 99 static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, 100 struct btrfs_root *root, 101 struct btrfs_root *log, 102 struct btrfs_path *path, 103 u64 dirid, int del_all); 104 105 /* 106 * tree logging is a special write ahead log used to make sure that 107 * fsyncs and O_SYNCs can happen without doing full tree commits. 108 * 109 * Full tree commits are expensive because they require commonly 110 * modified blocks to be recowed, creating many dirty pages in the 111 * extent tree an 4x-6x higher write load than ext3. 112 * 113 * Instead of doing a tree commit on every fsync, we use the 114 * key ranges and transaction ids to find items for a given file or directory 115 * that have changed in this transaction. Those items are copied into 116 * a special tree (one per subvolume root), that tree is written to disk 117 * and then the fsync is considered complete. 118 * 119 * After a crash, items are copied out of the log-tree back into the 120 * subvolume tree. Any file data extents found are recorded in the extent 121 * allocation tree, and the log-tree freed. 122 * 123 * The log tree is read three times, once to pin down all the extents it is 124 * using in ram and once, once to create all the inodes logged in the tree 125 * and once to do all the other items. 126 */ 127 128 /* 129 * start a sub transaction and setup the log tree 130 * this increments the log tree writer count to make the people 131 * syncing the tree wait for us to finish 132 */ 133 static int start_log_trans(struct btrfs_trans_handle *trans, 134 struct btrfs_root *root) 135 { 136 int ret; 137 138 mutex_lock(&root->log_mutex); 139 if (root->log_root) { 140 root->log_batch++; 141 atomic_inc(&root->log_writers); 142 mutex_unlock(&root->log_mutex); 143 return 0; 144 } 145 mutex_lock(&root->fs_info->tree_log_mutex); 146 if (!root->fs_info->log_root_tree) { 147 ret = btrfs_init_log_root_tree(trans, root->fs_info); 148 BUG_ON(ret); 149 } 150 if (!root->log_root) { 151 ret = btrfs_add_log_tree(trans, root); 152 BUG_ON(ret); 153 } 154 mutex_unlock(&root->fs_info->tree_log_mutex); 155 root->log_batch++; 156 atomic_inc(&root->log_writers); 157 mutex_unlock(&root->log_mutex); 158 return 0; 159 } 160 161 /* 162 * returns 0 if there was a log transaction running and we were able 163 * to join, or returns -ENOENT if there were not transactions 164 * in progress 165 */ 166 static int join_running_log_trans(struct btrfs_root *root) 167 { 168 int ret = -ENOENT; 169 170 smp_mb(); 171 if (!root->log_root) 172 return -ENOENT; 173 174 mutex_lock(&root->log_mutex); 175 if (root->log_root) { 176 ret = 0; 177 atomic_inc(&root->log_writers); 178 } 179 mutex_unlock(&root->log_mutex); 180 return ret; 181 } 182 183 /* 184 * This either makes the current running log transaction wait 185 * until you call btrfs_end_log_trans() or it makes any future 186 * log transactions wait until you call btrfs_end_log_trans() 187 */ 188 int btrfs_pin_log_trans(struct btrfs_root *root) 189 { 190 int ret = -ENOENT; 191 192 mutex_lock(&root->log_mutex); 193 atomic_inc(&root->log_writers); 194 mutex_unlock(&root->log_mutex); 195 return ret; 196 } 197 198 /* 199 * indicate we're done making changes to the log tree 200 * and wake up anyone waiting to do a sync 201 */ 202 int btrfs_end_log_trans(struct btrfs_root *root) 203 { 204 if (atomic_dec_and_test(&root->log_writers)) { 205 smp_mb(); 206 if (waitqueue_active(&root->log_writer_wait)) 207 wake_up(&root->log_writer_wait); 208 } 209 return 0; 210 } 211 212 213 /* 214 * the walk control struct is used to pass state down the chain when 215 * processing the log tree. The stage field tells us which part 216 * of the log tree processing we are currently doing. The others 217 * are state fields used for that specific part 218 */ 219 struct walk_control { 220 /* should we free the extent on disk when done? This is used 221 * at transaction commit time while freeing a log tree 222 */ 223 int free; 224 225 /* should we write out the extent buffer? This is used 226 * while flushing the log tree to disk during a sync 227 */ 228 int write; 229 230 /* should we wait for the extent buffer io to finish? Also used 231 * while flushing the log tree to disk for a sync 232 */ 233 int wait; 234 235 /* pin only walk, we record which extents on disk belong to the 236 * log trees 237 */ 238 int pin; 239 240 /* what stage of the replay code we're currently in */ 241 int stage; 242 243 /* the root we are currently replaying */ 244 struct btrfs_root *replay_dest; 245 246 /* the trans handle for the current replay */ 247 struct btrfs_trans_handle *trans; 248 249 /* the function that gets used to process blocks we find in the 250 * tree. Note the extent_buffer might not be up to date when it is 251 * passed in, and it must be checked or read if you need the data 252 * inside it 253 */ 254 int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb, 255 struct walk_control *wc, u64 gen); 256 }; 257 258 /* 259 * process_func used to pin down extents, write them or wait on them 260 */ 261 static int process_one_buffer(struct btrfs_root *log, 262 struct extent_buffer *eb, 263 struct walk_control *wc, u64 gen) 264 { 265 if (wc->pin) 266 btrfs_update_pinned_extents(log->fs_info->extent_root, 267 eb->start, eb->len, 1); 268 269 if (btrfs_buffer_uptodate(eb, gen)) { 270 if (wc->write) 271 btrfs_write_tree_block(eb); 272 if (wc->wait) 273 btrfs_wait_tree_block_writeback(eb); 274 } 275 return 0; 276 } 277 278 /* 279 * Item overwrite used by replay and tree logging. eb, slot and key all refer 280 * to the src data we are copying out. 281 * 282 * root is the tree we are copying into, and path is a scratch 283 * path for use in this function (it should be released on entry and 284 * will be released on exit). 285 * 286 * If the key is already in the destination tree the existing item is 287 * overwritten. If the existing item isn't big enough, it is extended. 288 * If it is too large, it is truncated. 289 * 290 * If the key isn't in the destination yet, a new item is inserted. 291 */ 292 static noinline int overwrite_item(struct btrfs_trans_handle *trans, 293 struct btrfs_root *root, 294 struct btrfs_path *path, 295 struct extent_buffer *eb, int slot, 296 struct btrfs_key *key) 297 { 298 int ret; 299 u32 item_size; 300 u64 saved_i_size = 0; 301 int save_old_i_size = 0; 302 unsigned long src_ptr; 303 unsigned long dst_ptr; 304 int overwrite_root = 0; 305 306 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) 307 overwrite_root = 1; 308 309 item_size = btrfs_item_size_nr(eb, slot); 310 src_ptr = btrfs_item_ptr_offset(eb, slot); 311 312 /* look for the key in the destination tree */ 313 ret = btrfs_search_slot(NULL, root, key, path, 0, 0); 314 if (ret == 0) { 315 char *src_copy; 316 char *dst_copy; 317 u32 dst_size = btrfs_item_size_nr(path->nodes[0], 318 path->slots[0]); 319 if (dst_size != item_size) 320 goto insert; 321 322 if (item_size == 0) { 323 btrfs_release_path(root, path); 324 return 0; 325 } 326 dst_copy = kmalloc(item_size, GFP_NOFS); 327 src_copy = kmalloc(item_size, GFP_NOFS); 328 329 read_extent_buffer(eb, src_copy, src_ptr, item_size); 330 331 dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); 332 read_extent_buffer(path->nodes[0], dst_copy, dst_ptr, 333 item_size); 334 ret = memcmp(dst_copy, src_copy, item_size); 335 336 kfree(dst_copy); 337 kfree(src_copy); 338 /* 339 * they have the same contents, just return, this saves 340 * us from cowing blocks in the destination tree and doing 341 * extra writes that may not have been done by a previous 342 * sync 343 */ 344 if (ret == 0) { 345 btrfs_release_path(root, path); 346 return 0; 347 } 348 349 } 350 insert: 351 btrfs_release_path(root, path); 352 /* try to insert the key into the destination tree */ 353 ret = btrfs_insert_empty_item(trans, root, path, 354 key, item_size); 355 356 /* make sure any existing item is the correct size */ 357 if (ret == -EEXIST) { 358 u32 found_size; 359 found_size = btrfs_item_size_nr(path->nodes[0], 360 path->slots[0]); 361 if (found_size > item_size) { 362 btrfs_truncate_item(trans, root, path, item_size, 1); 363 } else if (found_size < item_size) { 364 ret = btrfs_extend_item(trans, root, path, 365 item_size - found_size); 366 BUG_ON(ret); 367 } 368 } else if (ret) { 369 BUG(); 370 } 371 dst_ptr = btrfs_item_ptr_offset(path->nodes[0], 372 path->slots[0]); 373 374 /* don't overwrite an existing inode if the generation number 375 * was logged as zero. This is done when the tree logging code 376 * is just logging an inode to make sure it exists after recovery. 377 * 378 * Also, don't overwrite i_size on directories during replay. 379 * log replay inserts and removes directory items based on the 380 * state of the tree found in the subvolume, and i_size is modified 381 * as it goes 382 */ 383 if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) { 384 struct btrfs_inode_item *src_item; 385 struct btrfs_inode_item *dst_item; 386 387 src_item = (struct btrfs_inode_item *)src_ptr; 388 dst_item = (struct btrfs_inode_item *)dst_ptr; 389 390 if (btrfs_inode_generation(eb, src_item) == 0) 391 goto no_copy; 392 393 if (overwrite_root && 394 S_ISDIR(btrfs_inode_mode(eb, src_item)) && 395 S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) { 396 save_old_i_size = 1; 397 saved_i_size = btrfs_inode_size(path->nodes[0], 398 dst_item); 399 } 400 } 401 402 copy_extent_buffer(path->nodes[0], eb, dst_ptr, 403 src_ptr, item_size); 404 405 if (save_old_i_size) { 406 struct btrfs_inode_item *dst_item; 407 dst_item = (struct btrfs_inode_item *)dst_ptr; 408 btrfs_set_inode_size(path->nodes[0], dst_item, saved_i_size); 409 } 410 411 /* make sure the generation is filled in */ 412 if (key->type == BTRFS_INODE_ITEM_KEY) { 413 struct btrfs_inode_item *dst_item; 414 dst_item = (struct btrfs_inode_item *)dst_ptr; 415 if (btrfs_inode_generation(path->nodes[0], dst_item) == 0) { 416 btrfs_set_inode_generation(path->nodes[0], dst_item, 417 trans->transid); 418 } 419 } 420 no_copy: 421 btrfs_mark_buffer_dirty(path->nodes[0]); 422 btrfs_release_path(root, path); 423 return 0; 424 } 425 426 /* 427 * simple helper to read an inode off the disk from a given root 428 * This can only be called for subvolume roots and not for the log 429 */ 430 static noinline struct inode *read_one_inode(struct btrfs_root *root, 431 u64 objectid) 432 { 433 struct btrfs_key key; 434 struct inode *inode; 435 436 key.objectid = objectid; 437 key.type = BTRFS_INODE_ITEM_KEY; 438 key.offset = 0; 439 inode = btrfs_iget(root->fs_info->sb, &key, root); 440 if (IS_ERR(inode)) { 441 inode = NULL; 442 } else if (is_bad_inode(inode)) { 443 iput(inode); 444 inode = NULL; 445 } 446 return inode; 447 } 448 449 /* replays a single extent in 'eb' at 'slot' with 'key' into the 450 * subvolume 'root'. path is released on entry and should be released 451 * on exit. 452 * 453 * extents in the log tree have not been allocated out of the extent 454 * tree yet. So, this completes the allocation, taking a reference 455 * as required if the extent already exists or creating a new extent 456 * if it isn't in the extent allocation tree yet. 457 * 458 * The extent is inserted into the file, dropping any existing extents 459 * from the file that overlap the new one. 460 */ 461 static noinline int replay_one_extent(struct btrfs_trans_handle *trans, 462 struct btrfs_root *root, 463 struct btrfs_path *path, 464 struct extent_buffer *eb, int slot, 465 struct btrfs_key *key) 466 { 467 int found_type; 468 u64 mask = root->sectorsize - 1; 469 u64 extent_end; 470 u64 alloc_hint; 471 u64 start = key->offset; 472 u64 saved_nbytes; 473 struct btrfs_file_extent_item *item; 474 struct inode *inode = NULL; 475 unsigned long size; 476 int ret = 0; 477 478 item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item); 479 found_type = btrfs_file_extent_type(eb, item); 480 481 if (found_type == BTRFS_FILE_EXTENT_REG || 482 found_type == BTRFS_FILE_EXTENT_PREALLOC) 483 extent_end = start + btrfs_file_extent_num_bytes(eb, item); 484 else if (found_type == BTRFS_FILE_EXTENT_INLINE) { 485 size = btrfs_file_extent_inline_len(eb, item); 486 extent_end = (start + size + mask) & ~mask; 487 } else { 488 ret = 0; 489 goto out; 490 } 491 492 inode = read_one_inode(root, key->objectid); 493 if (!inode) { 494 ret = -EIO; 495 goto out; 496 } 497 498 /* 499 * first check to see if we already have this extent in the 500 * file. This must be done before the btrfs_drop_extents run 501 * so we don't try to drop this extent. 502 */ 503 ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino, 504 start, 0); 505 506 if (ret == 0 && 507 (found_type == BTRFS_FILE_EXTENT_REG || 508 found_type == BTRFS_FILE_EXTENT_PREALLOC)) { 509 struct btrfs_file_extent_item cmp1; 510 struct btrfs_file_extent_item cmp2; 511 struct btrfs_file_extent_item *existing; 512 struct extent_buffer *leaf; 513 514 leaf = path->nodes[0]; 515 existing = btrfs_item_ptr(leaf, path->slots[0], 516 struct btrfs_file_extent_item); 517 518 read_extent_buffer(eb, &cmp1, (unsigned long)item, 519 sizeof(cmp1)); 520 read_extent_buffer(leaf, &cmp2, (unsigned long)existing, 521 sizeof(cmp2)); 522 523 /* 524 * we already have a pointer to this exact extent, 525 * we don't have to do anything 526 */ 527 if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) { 528 btrfs_release_path(root, path); 529 goto out; 530 } 531 } 532 btrfs_release_path(root, path); 533 534 saved_nbytes = inode_get_bytes(inode); 535 /* drop any overlapping extents */ 536 ret = btrfs_drop_extents(trans, root, inode, 537 start, extent_end, extent_end, start, &alloc_hint); 538 BUG_ON(ret); 539 540 if (found_type == BTRFS_FILE_EXTENT_REG || 541 found_type == BTRFS_FILE_EXTENT_PREALLOC) { 542 u64 offset; 543 unsigned long dest_offset; 544 struct btrfs_key ins; 545 546 ret = btrfs_insert_empty_item(trans, root, path, key, 547 sizeof(*item)); 548 BUG_ON(ret); 549 dest_offset = btrfs_item_ptr_offset(path->nodes[0], 550 path->slots[0]); 551 copy_extent_buffer(path->nodes[0], eb, dest_offset, 552 (unsigned long)item, sizeof(*item)); 553 554 ins.objectid = btrfs_file_extent_disk_bytenr(eb, item); 555 ins.offset = btrfs_file_extent_disk_num_bytes(eb, item); 556 ins.type = BTRFS_EXTENT_ITEM_KEY; 557 offset = key->offset - btrfs_file_extent_offset(eb, item); 558 559 if (ins.objectid > 0) { 560 u64 csum_start; 561 u64 csum_end; 562 LIST_HEAD(ordered_sums); 563 /* 564 * is this extent already allocated in the extent 565 * allocation tree? If so, just add a reference 566 */ 567 ret = btrfs_lookup_extent(root, ins.objectid, 568 ins.offset); 569 if (ret == 0) { 570 ret = btrfs_inc_extent_ref(trans, root, 571 ins.objectid, ins.offset, 572 0, root->root_key.objectid, 573 key->objectid, offset); 574 } else { 575 /* 576 * insert the extent pointer in the extent 577 * allocation tree 578 */ 579 ret = btrfs_alloc_logged_file_extent(trans, 580 root, root->root_key.objectid, 581 key->objectid, offset, &ins); 582 BUG_ON(ret); 583 } 584 btrfs_release_path(root, path); 585 586 if (btrfs_file_extent_compression(eb, item)) { 587 csum_start = ins.objectid; 588 csum_end = csum_start + ins.offset; 589 } else { 590 csum_start = ins.objectid + 591 btrfs_file_extent_offset(eb, item); 592 csum_end = csum_start + 593 btrfs_file_extent_num_bytes(eb, item); 594 } 595 596 ret = btrfs_lookup_csums_range(root->log_root, 597 csum_start, csum_end - 1, 598 &ordered_sums); 599 BUG_ON(ret); 600 while (!list_empty(&ordered_sums)) { 601 struct btrfs_ordered_sum *sums; 602 sums = list_entry(ordered_sums.next, 603 struct btrfs_ordered_sum, 604 list); 605 ret = btrfs_csum_file_blocks(trans, 606 root->fs_info->csum_root, 607 sums); 608 BUG_ON(ret); 609 list_del(&sums->list); 610 kfree(sums); 611 } 612 } else { 613 btrfs_release_path(root, path); 614 } 615 } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { 616 /* inline extents are easy, we just overwrite them */ 617 ret = overwrite_item(trans, root, path, eb, slot, key); 618 BUG_ON(ret); 619 } 620 621 inode_set_bytes(inode, saved_nbytes); 622 btrfs_update_inode(trans, root, inode); 623 out: 624 if (inode) 625 iput(inode); 626 return ret; 627 } 628 629 /* 630 * when cleaning up conflicts between the directory names in the 631 * subvolume, directory names in the log and directory names in the 632 * inode back references, we may have to unlink inodes from directories. 633 * 634 * This is a helper function to do the unlink of a specific directory 635 * item 636 */ 637 static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans, 638 struct btrfs_root *root, 639 struct btrfs_path *path, 640 struct inode *dir, 641 struct btrfs_dir_item *di) 642 { 643 struct inode *inode; 644 char *name; 645 int name_len; 646 struct extent_buffer *leaf; 647 struct btrfs_key location; 648 int ret; 649 650 leaf = path->nodes[0]; 651 652 btrfs_dir_item_key_to_cpu(leaf, di, &location); 653 name_len = btrfs_dir_name_len(leaf, di); 654 name = kmalloc(name_len, GFP_NOFS); 655 read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len); 656 btrfs_release_path(root, path); 657 658 inode = read_one_inode(root, location.objectid); 659 BUG_ON(!inode); 660 661 ret = link_to_fixup_dir(trans, root, path, location.objectid); 662 BUG_ON(ret); 663 664 ret = btrfs_unlink_inode(trans, root, dir, inode, name, name_len); 665 BUG_ON(ret); 666 kfree(name); 667 668 iput(inode); 669 return ret; 670 } 671 672 /* 673 * helper function to see if a given name and sequence number found 674 * in an inode back reference are already in a directory and correctly 675 * point to this inode 676 */ 677 static noinline int inode_in_dir(struct btrfs_root *root, 678 struct btrfs_path *path, 679 u64 dirid, u64 objectid, u64 index, 680 const char *name, int name_len) 681 { 682 struct btrfs_dir_item *di; 683 struct btrfs_key location; 684 int match = 0; 685 686 di = btrfs_lookup_dir_index_item(NULL, root, path, dirid, 687 index, name, name_len, 0); 688 if (di && !IS_ERR(di)) { 689 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location); 690 if (location.objectid != objectid) 691 goto out; 692 } else 693 goto out; 694 btrfs_release_path(root, path); 695 696 di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 0); 697 if (di && !IS_ERR(di)) { 698 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location); 699 if (location.objectid != objectid) 700 goto out; 701 } else 702 goto out; 703 match = 1; 704 out: 705 btrfs_release_path(root, path); 706 return match; 707 } 708 709 /* 710 * helper function to check a log tree for a named back reference in 711 * an inode. This is used to decide if a back reference that is 712 * found in the subvolume conflicts with what we find in the log. 713 * 714 * inode backreferences may have multiple refs in a single item, 715 * during replay we process one reference at a time, and we don't 716 * want to delete valid links to a file from the subvolume if that 717 * link is also in the log. 718 */ 719 static noinline int backref_in_log(struct btrfs_root *log, 720 struct btrfs_key *key, 721 char *name, int namelen) 722 { 723 struct btrfs_path *path; 724 struct btrfs_inode_ref *ref; 725 unsigned long ptr; 726 unsigned long ptr_end; 727 unsigned long name_ptr; 728 int found_name_len; 729 int item_size; 730 int ret; 731 int match = 0; 732 733 path = btrfs_alloc_path(); 734 ret = btrfs_search_slot(NULL, log, key, path, 0, 0); 735 if (ret != 0) 736 goto out; 737 738 item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]); 739 ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); 740 ptr_end = ptr + item_size; 741 while (ptr < ptr_end) { 742 ref = (struct btrfs_inode_ref *)ptr; 743 found_name_len = btrfs_inode_ref_name_len(path->nodes[0], ref); 744 if (found_name_len == namelen) { 745 name_ptr = (unsigned long)(ref + 1); 746 ret = memcmp_extent_buffer(path->nodes[0], name, 747 name_ptr, namelen); 748 if (ret == 0) { 749 match = 1; 750 goto out; 751 } 752 } 753 ptr = (unsigned long)(ref + 1) + found_name_len; 754 } 755 out: 756 btrfs_free_path(path); 757 return match; 758 } 759 760 761 /* 762 * replay one inode back reference item found in the log tree. 763 * eb, slot and key refer to the buffer and key found in the log tree. 764 * root is the destination we are replaying into, and path is for temp 765 * use by this function. (it should be released on return). 766 */ 767 static noinline int add_inode_ref(struct btrfs_trans_handle *trans, 768 struct btrfs_root *root, 769 struct btrfs_root *log, 770 struct btrfs_path *path, 771 struct extent_buffer *eb, int slot, 772 struct btrfs_key *key) 773 { 774 struct inode *dir; 775 int ret; 776 struct btrfs_key location; 777 struct btrfs_inode_ref *ref; 778 struct btrfs_dir_item *di; 779 struct inode *inode; 780 char *name; 781 int namelen; 782 unsigned long ref_ptr; 783 unsigned long ref_end; 784 785 location.objectid = key->objectid; 786 location.type = BTRFS_INODE_ITEM_KEY; 787 location.offset = 0; 788 789 /* 790 * it is possible that we didn't log all the parent directories 791 * for a given inode. If we don't find the dir, just don't 792 * copy the back ref in. The link count fixup code will take 793 * care of the rest 794 */ 795 dir = read_one_inode(root, key->offset); 796 if (!dir) 797 return -ENOENT; 798 799 inode = read_one_inode(root, key->objectid); 800 BUG_ON(!inode); 801 802 ref_ptr = btrfs_item_ptr_offset(eb, slot); 803 ref_end = ref_ptr + btrfs_item_size_nr(eb, slot); 804 805 again: 806 ref = (struct btrfs_inode_ref *)ref_ptr; 807 808 namelen = btrfs_inode_ref_name_len(eb, ref); 809 name = kmalloc(namelen, GFP_NOFS); 810 BUG_ON(!name); 811 812 read_extent_buffer(eb, name, (unsigned long)(ref + 1), namelen); 813 814 /* if we already have a perfect match, we're done */ 815 if (inode_in_dir(root, path, dir->i_ino, inode->i_ino, 816 btrfs_inode_ref_index(eb, ref), 817 name, namelen)) { 818 goto out; 819 } 820 821 /* 822 * look for a conflicting back reference in the metadata. 823 * if we find one we have to unlink that name of the file 824 * before we add our new link. Later on, we overwrite any 825 * existing back reference, and we don't want to create 826 * dangling pointers in the directory. 827 */ 828 conflict_again: 829 ret = btrfs_search_slot(NULL, root, key, path, 0, 0); 830 if (ret == 0) { 831 char *victim_name; 832 int victim_name_len; 833 struct btrfs_inode_ref *victim_ref; 834 unsigned long ptr; 835 unsigned long ptr_end; 836 struct extent_buffer *leaf = path->nodes[0]; 837 838 /* are we trying to overwrite a back ref for the root directory 839 * if so, just jump out, we're done 840 */ 841 if (key->objectid == key->offset) 842 goto out_nowrite; 843 844 /* check all the names in this back reference to see 845 * if they are in the log. if so, we allow them to stay 846 * otherwise they must be unlinked as a conflict 847 */ 848 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); 849 ptr_end = ptr + btrfs_item_size_nr(leaf, path->slots[0]); 850 while (ptr < ptr_end) { 851 victim_ref = (struct btrfs_inode_ref *)ptr; 852 victim_name_len = btrfs_inode_ref_name_len(leaf, 853 victim_ref); 854 victim_name = kmalloc(victim_name_len, GFP_NOFS); 855 BUG_ON(!victim_name); 856 857 read_extent_buffer(leaf, victim_name, 858 (unsigned long)(victim_ref + 1), 859 victim_name_len); 860 861 if (!backref_in_log(log, key, victim_name, 862 victim_name_len)) { 863 btrfs_inc_nlink(inode); 864 btrfs_release_path(root, path); 865 866 ret = btrfs_unlink_inode(trans, root, dir, 867 inode, victim_name, 868 victim_name_len); 869 kfree(victim_name); 870 btrfs_release_path(root, path); 871 goto conflict_again; 872 } 873 kfree(victim_name); 874 ptr = (unsigned long)(victim_ref + 1) + victim_name_len; 875 } 876 BUG_ON(ret); 877 } 878 btrfs_release_path(root, path); 879 880 /* look for a conflicting sequence number */ 881 di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino, 882 btrfs_inode_ref_index(eb, ref), 883 name, namelen, 0); 884 if (di && !IS_ERR(di)) { 885 ret = drop_one_dir_item(trans, root, path, dir, di); 886 BUG_ON(ret); 887 } 888 btrfs_release_path(root, path); 889 890 891 /* look for a conflicting name */ 892 di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino, 893 name, namelen, 0); 894 if (di && !IS_ERR(di)) { 895 ret = drop_one_dir_item(trans, root, path, dir, di); 896 BUG_ON(ret); 897 } 898 btrfs_release_path(root, path); 899 900 /* insert our name */ 901 ret = btrfs_add_link(trans, dir, inode, name, namelen, 0, 902 btrfs_inode_ref_index(eb, ref)); 903 BUG_ON(ret); 904 905 btrfs_update_inode(trans, root, inode); 906 907 out: 908 ref_ptr = (unsigned long)(ref + 1) + namelen; 909 kfree(name); 910 if (ref_ptr < ref_end) 911 goto again; 912 913 /* finally write the back reference in the inode */ 914 ret = overwrite_item(trans, root, path, eb, slot, key); 915 BUG_ON(ret); 916 917 out_nowrite: 918 btrfs_release_path(root, path); 919 iput(dir); 920 iput(inode); 921 return 0; 922 } 923 924 /* 925 * There are a few corners where the link count of the file can't 926 * be properly maintained during replay. So, instead of adding 927 * lots of complexity to the log code, we just scan the backrefs 928 * for any file that has been through replay. 929 * 930 * The scan will update the link count on the inode to reflect the 931 * number of back refs found. If it goes down to zero, the iput 932 * will free the inode. 933 */ 934 static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans, 935 struct btrfs_root *root, 936 struct inode *inode) 937 { 938 struct btrfs_path *path; 939 int ret; 940 struct btrfs_key key; 941 u64 nlink = 0; 942 unsigned long ptr; 943 unsigned long ptr_end; 944 int name_len; 945 946 key.objectid = inode->i_ino; 947 key.type = BTRFS_INODE_REF_KEY; 948 key.offset = (u64)-1; 949 950 path = btrfs_alloc_path(); 951 952 while (1) { 953 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 954 if (ret < 0) 955 break; 956 if (ret > 0) { 957 if (path->slots[0] == 0) 958 break; 959 path->slots[0]--; 960 } 961 btrfs_item_key_to_cpu(path->nodes[0], &key, 962 path->slots[0]); 963 if (key.objectid != inode->i_ino || 964 key.type != BTRFS_INODE_REF_KEY) 965 break; 966 ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); 967 ptr_end = ptr + btrfs_item_size_nr(path->nodes[0], 968 path->slots[0]); 969 while (ptr < ptr_end) { 970 struct btrfs_inode_ref *ref; 971 972 ref = (struct btrfs_inode_ref *)ptr; 973 name_len = btrfs_inode_ref_name_len(path->nodes[0], 974 ref); 975 ptr = (unsigned long)(ref + 1) + name_len; 976 nlink++; 977 } 978 979 if (key.offset == 0) 980 break; 981 key.offset--; 982 btrfs_release_path(root, path); 983 } 984 btrfs_release_path(root, path); 985 if (nlink != inode->i_nlink) { 986 inode->i_nlink = nlink; 987 btrfs_update_inode(trans, root, inode); 988 } 989 BTRFS_I(inode)->index_cnt = (u64)-1; 990 991 if (inode->i_nlink == 0 && S_ISDIR(inode->i_mode)) { 992 ret = replay_dir_deletes(trans, root, NULL, path, 993 inode->i_ino, 1); 994 BUG_ON(ret); 995 } 996 btrfs_free_path(path); 997 998 return 0; 999 } 1000 1001 static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans, 1002 struct btrfs_root *root, 1003 struct btrfs_path *path) 1004 { 1005 int ret; 1006 struct btrfs_key key; 1007 struct inode *inode; 1008 1009 key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID; 1010 key.type = BTRFS_ORPHAN_ITEM_KEY; 1011 key.offset = (u64)-1; 1012 while (1) { 1013 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 1014 if (ret < 0) 1015 break; 1016 1017 if (ret == 1) { 1018 if (path->slots[0] == 0) 1019 break; 1020 path->slots[0]--; 1021 } 1022 1023 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 1024 if (key.objectid != BTRFS_TREE_LOG_FIXUP_OBJECTID || 1025 key.type != BTRFS_ORPHAN_ITEM_KEY) 1026 break; 1027 1028 ret = btrfs_del_item(trans, root, path); 1029 BUG_ON(ret); 1030 1031 btrfs_release_path(root, path); 1032 inode = read_one_inode(root, key.offset); 1033 BUG_ON(!inode); 1034 1035 ret = fixup_inode_link_count(trans, root, inode); 1036 BUG_ON(ret); 1037 1038 iput(inode); 1039 1040 /* 1041 * fixup on a directory may create new entries, 1042 * make sure we always look for the highset possible 1043 * offset 1044 */ 1045 key.offset = (u64)-1; 1046 } 1047 btrfs_release_path(root, path); 1048 return 0; 1049 } 1050 1051 1052 /* 1053 * record a given inode in the fixup dir so we can check its link 1054 * count when replay is done. The link count is incremented here 1055 * so the inode won't go away until we check it 1056 */ 1057 static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans, 1058 struct btrfs_root *root, 1059 struct btrfs_path *path, 1060 u64 objectid) 1061 { 1062 struct btrfs_key key; 1063 int ret = 0; 1064 struct inode *inode; 1065 1066 inode = read_one_inode(root, objectid); 1067 BUG_ON(!inode); 1068 1069 key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID; 1070 btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY); 1071 key.offset = objectid; 1072 1073 ret = btrfs_insert_empty_item(trans, root, path, &key, 0); 1074 1075 btrfs_release_path(root, path); 1076 if (ret == 0) { 1077 btrfs_inc_nlink(inode); 1078 btrfs_update_inode(trans, root, inode); 1079 } else if (ret == -EEXIST) { 1080 ret = 0; 1081 } else { 1082 BUG(); 1083 } 1084 iput(inode); 1085 1086 return ret; 1087 } 1088 1089 /* 1090 * when replaying the log for a directory, we only insert names 1091 * for inodes that actually exist. This means an fsync on a directory 1092 * does not implicitly fsync all the new files in it 1093 */ 1094 static noinline int insert_one_name(struct btrfs_trans_handle *trans, 1095 struct btrfs_root *root, 1096 struct btrfs_path *path, 1097 u64 dirid, u64 index, 1098 char *name, int name_len, u8 type, 1099 struct btrfs_key *location) 1100 { 1101 struct inode *inode; 1102 struct inode *dir; 1103 int ret; 1104 1105 inode = read_one_inode(root, location->objectid); 1106 if (!inode) 1107 return -ENOENT; 1108 1109 dir = read_one_inode(root, dirid); 1110 if (!dir) { 1111 iput(inode); 1112 return -EIO; 1113 } 1114 ret = btrfs_add_link(trans, dir, inode, name, name_len, 1, index); 1115 1116 /* FIXME, put inode into FIXUP list */ 1117 1118 iput(inode); 1119 iput(dir); 1120 return ret; 1121 } 1122 1123 /* 1124 * take a single entry in a log directory item and replay it into 1125 * the subvolume. 1126 * 1127 * if a conflicting item exists in the subdirectory already, 1128 * the inode it points to is unlinked and put into the link count 1129 * fix up tree. 1130 * 1131 * If a name from the log points to a file or directory that does 1132 * not exist in the FS, it is skipped. fsyncs on directories 1133 * do not force down inodes inside that directory, just changes to the 1134 * names or unlinks in a directory. 1135 */ 1136 static noinline int replay_one_name(struct btrfs_trans_handle *trans, 1137 struct btrfs_root *root, 1138 struct btrfs_path *path, 1139 struct extent_buffer *eb, 1140 struct btrfs_dir_item *di, 1141 struct btrfs_key *key) 1142 { 1143 char *name; 1144 int name_len; 1145 struct btrfs_dir_item *dst_di; 1146 struct btrfs_key found_key; 1147 struct btrfs_key log_key; 1148 struct inode *dir; 1149 u8 log_type; 1150 int exists; 1151 int ret; 1152 1153 dir = read_one_inode(root, key->objectid); 1154 BUG_ON(!dir); 1155 1156 name_len = btrfs_dir_name_len(eb, di); 1157 name = kmalloc(name_len, GFP_NOFS); 1158 log_type = btrfs_dir_type(eb, di); 1159 read_extent_buffer(eb, name, (unsigned long)(di + 1), 1160 name_len); 1161 1162 btrfs_dir_item_key_to_cpu(eb, di, &log_key); 1163 exists = btrfs_lookup_inode(trans, root, path, &log_key, 0); 1164 if (exists == 0) 1165 exists = 1; 1166 else 1167 exists = 0; 1168 btrfs_release_path(root, path); 1169 1170 if (key->type == BTRFS_DIR_ITEM_KEY) { 1171 dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid, 1172 name, name_len, 1); 1173 } else if (key->type == BTRFS_DIR_INDEX_KEY) { 1174 dst_di = btrfs_lookup_dir_index_item(trans, root, path, 1175 key->objectid, 1176 key->offset, name, 1177 name_len, 1); 1178 } else { 1179 BUG(); 1180 } 1181 if (!dst_di || IS_ERR(dst_di)) { 1182 /* we need a sequence number to insert, so we only 1183 * do inserts for the BTRFS_DIR_INDEX_KEY types 1184 */ 1185 if (key->type != BTRFS_DIR_INDEX_KEY) 1186 goto out; 1187 goto insert; 1188 } 1189 1190 btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key); 1191 /* the existing item matches the logged item */ 1192 if (found_key.objectid == log_key.objectid && 1193 found_key.type == log_key.type && 1194 found_key.offset == log_key.offset && 1195 btrfs_dir_type(path->nodes[0], dst_di) == log_type) { 1196 goto out; 1197 } 1198 1199 /* 1200 * don't drop the conflicting directory entry if the inode 1201 * for the new entry doesn't exist 1202 */ 1203 if (!exists) 1204 goto out; 1205 1206 ret = drop_one_dir_item(trans, root, path, dir, dst_di); 1207 BUG_ON(ret); 1208 1209 if (key->type == BTRFS_DIR_INDEX_KEY) 1210 goto insert; 1211 out: 1212 btrfs_release_path(root, path); 1213 kfree(name); 1214 iput(dir); 1215 return 0; 1216 1217 insert: 1218 btrfs_release_path(root, path); 1219 ret = insert_one_name(trans, root, path, key->objectid, key->offset, 1220 name, name_len, log_type, &log_key); 1221 1222 BUG_ON(ret && ret != -ENOENT); 1223 goto out; 1224 } 1225 1226 /* 1227 * find all the names in a directory item and reconcile them into 1228 * the subvolume. Only BTRFS_DIR_ITEM_KEY types will have more than 1229 * one name in a directory item, but the same code gets used for 1230 * both directory index types 1231 */ 1232 static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans, 1233 struct btrfs_root *root, 1234 struct btrfs_path *path, 1235 struct extent_buffer *eb, int slot, 1236 struct btrfs_key *key) 1237 { 1238 int ret; 1239 u32 item_size = btrfs_item_size_nr(eb, slot); 1240 struct btrfs_dir_item *di; 1241 int name_len; 1242 unsigned long ptr; 1243 unsigned long ptr_end; 1244 1245 ptr = btrfs_item_ptr_offset(eb, slot); 1246 ptr_end = ptr + item_size; 1247 while (ptr < ptr_end) { 1248 di = (struct btrfs_dir_item *)ptr; 1249 name_len = btrfs_dir_name_len(eb, di); 1250 ret = replay_one_name(trans, root, path, eb, di, key); 1251 BUG_ON(ret); 1252 ptr = (unsigned long)(di + 1); 1253 ptr += name_len; 1254 } 1255 return 0; 1256 } 1257 1258 /* 1259 * directory replay has two parts. There are the standard directory 1260 * items in the log copied from the subvolume, and range items 1261 * created in the log while the subvolume was logged. 1262 * 1263 * The range items tell us which parts of the key space the log 1264 * is authoritative for. During replay, if a key in the subvolume 1265 * directory is in a logged range item, but not actually in the log 1266 * that means it was deleted from the directory before the fsync 1267 * and should be removed. 1268 */ 1269 static noinline int find_dir_range(struct btrfs_root *root, 1270 struct btrfs_path *path, 1271 u64 dirid, int key_type, 1272 u64 *start_ret, u64 *end_ret) 1273 { 1274 struct btrfs_key key; 1275 u64 found_end; 1276 struct btrfs_dir_log_item *item; 1277 int ret; 1278 int nritems; 1279 1280 if (*start_ret == (u64)-1) 1281 return 1; 1282 1283 key.objectid = dirid; 1284 key.type = key_type; 1285 key.offset = *start_ret; 1286 1287 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 1288 if (ret < 0) 1289 goto out; 1290 if (ret > 0) { 1291 if (path->slots[0] == 0) 1292 goto out; 1293 path->slots[0]--; 1294 } 1295 if (ret != 0) 1296 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 1297 1298 if (key.type != key_type || key.objectid != dirid) { 1299 ret = 1; 1300 goto next; 1301 } 1302 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 1303 struct btrfs_dir_log_item); 1304 found_end = btrfs_dir_log_end(path->nodes[0], item); 1305 1306 if (*start_ret >= key.offset && *start_ret <= found_end) { 1307 ret = 0; 1308 *start_ret = key.offset; 1309 *end_ret = found_end; 1310 goto out; 1311 } 1312 ret = 1; 1313 next: 1314 /* check the next slot in the tree to see if it is a valid item */ 1315 nritems = btrfs_header_nritems(path->nodes[0]); 1316 if (path->slots[0] >= nritems) { 1317 ret = btrfs_next_leaf(root, path); 1318 if (ret) 1319 goto out; 1320 } else { 1321 path->slots[0]++; 1322 } 1323 1324 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 1325 1326 if (key.type != key_type || key.objectid != dirid) { 1327 ret = 1; 1328 goto out; 1329 } 1330 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 1331 struct btrfs_dir_log_item); 1332 found_end = btrfs_dir_log_end(path->nodes[0], item); 1333 *start_ret = key.offset; 1334 *end_ret = found_end; 1335 ret = 0; 1336 out: 1337 btrfs_release_path(root, path); 1338 return ret; 1339 } 1340 1341 /* 1342 * this looks for a given directory item in the log. If the directory 1343 * item is not in the log, the item is removed and the inode it points 1344 * to is unlinked 1345 */ 1346 static noinline int check_item_in_log(struct btrfs_trans_handle *trans, 1347 struct btrfs_root *root, 1348 struct btrfs_root *log, 1349 struct btrfs_path *path, 1350 struct btrfs_path *log_path, 1351 struct inode *dir, 1352 struct btrfs_key *dir_key) 1353 { 1354 int ret; 1355 struct extent_buffer *eb; 1356 int slot; 1357 u32 item_size; 1358 struct btrfs_dir_item *di; 1359 struct btrfs_dir_item *log_di; 1360 int name_len; 1361 unsigned long ptr; 1362 unsigned long ptr_end; 1363 char *name; 1364 struct inode *inode; 1365 struct btrfs_key location; 1366 1367 again: 1368 eb = path->nodes[0]; 1369 slot = path->slots[0]; 1370 item_size = btrfs_item_size_nr(eb, slot); 1371 ptr = btrfs_item_ptr_offset(eb, slot); 1372 ptr_end = ptr + item_size; 1373 while (ptr < ptr_end) { 1374 di = (struct btrfs_dir_item *)ptr; 1375 name_len = btrfs_dir_name_len(eb, di); 1376 name = kmalloc(name_len, GFP_NOFS); 1377 if (!name) { 1378 ret = -ENOMEM; 1379 goto out; 1380 } 1381 read_extent_buffer(eb, name, (unsigned long)(di + 1), 1382 name_len); 1383 log_di = NULL; 1384 if (log && dir_key->type == BTRFS_DIR_ITEM_KEY) { 1385 log_di = btrfs_lookup_dir_item(trans, log, log_path, 1386 dir_key->objectid, 1387 name, name_len, 0); 1388 } else if (log && dir_key->type == BTRFS_DIR_INDEX_KEY) { 1389 log_di = btrfs_lookup_dir_index_item(trans, log, 1390 log_path, 1391 dir_key->objectid, 1392 dir_key->offset, 1393 name, name_len, 0); 1394 } 1395 if (!log_di || IS_ERR(log_di)) { 1396 btrfs_dir_item_key_to_cpu(eb, di, &location); 1397 btrfs_release_path(root, path); 1398 btrfs_release_path(log, log_path); 1399 inode = read_one_inode(root, location.objectid); 1400 BUG_ON(!inode); 1401 1402 ret = link_to_fixup_dir(trans, root, 1403 path, location.objectid); 1404 BUG_ON(ret); 1405 btrfs_inc_nlink(inode); 1406 ret = btrfs_unlink_inode(trans, root, dir, inode, 1407 name, name_len); 1408 BUG_ON(ret); 1409 kfree(name); 1410 iput(inode); 1411 1412 /* there might still be more names under this key 1413 * check and repeat if required 1414 */ 1415 ret = btrfs_search_slot(NULL, root, dir_key, path, 1416 0, 0); 1417 if (ret == 0) 1418 goto again; 1419 ret = 0; 1420 goto out; 1421 } 1422 btrfs_release_path(log, log_path); 1423 kfree(name); 1424 1425 ptr = (unsigned long)(di + 1); 1426 ptr += name_len; 1427 } 1428 ret = 0; 1429 out: 1430 btrfs_release_path(root, path); 1431 btrfs_release_path(log, log_path); 1432 return ret; 1433 } 1434 1435 /* 1436 * deletion replay happens before we copy any new directory items 1437 * out of the log or out of backreferences from inodes. It 1438 * scans the log to find ranges of keys that log is authoritative for, 1439 * and then scans the directory to find items in those ranges that are 1440 * not present in the log. 1441 * 1442 * Anything we don't find in the log is unlinked and removed from the 1443 * directory. 1444 */ 1445 static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, 1446 struct btrfs_root *root, 1447 struct btrfs_root *log, 1448 struct btrfs_path *path, 1449 u64 dirid, int del_all) 1450 { 1451 u64 range_start; 1452 u64 range_end; 1453 int key_type = BTRFS_DIR_LOG_ITEM_KEY; 1454 int ret = 0; 1455 struct btrfs_key dir_key; 1456 struct btrfs_key found_key; 1457 struct btrfs_path *log_path; 1458 struct inode *dir; 1459 1460 dir_key.objectid = dirid; 1461 dir_key.type = BTRFS_DIR_ITEM_KEY; 1462 log_path = btrfs_alloc_path(); 1463 if (!log_path) 1464 return -ENOMEM; 1465 1466 dir = read_one_inode(root, dirid); 1467 /* it isn't an error if the inode isn't there, that can happen 1468 * because we replay the deletes before we copy in the inode item 1469 * from the log 1470 */ 1471 if (!dir) { 1472 btrfs_free_path(log_path); 1473 return 0; 1474 } 1475 again: 1476 range_start = 0; 1477 range_end = 0; 1478 while (1) { 1479 if (del_all) 1480 range_end = (u64)-1; 1481 else { 1482 ret = find_dir_range(log, path, dirid, key_type, 1483 &range_start, &range_end); 1484 if (ret != 0) 1485 break; 1486 } 1487 1488 dir_key.offset = range_start; 1489 while (1) { 1490 int nritems; 1491 ret = btrfs_search_slot(NULL, root, &dir_key, path, 1492 0, 0); 1493 if (ret < 0) 1494 goto out; 1495 1496 nritems = btrfs_header_nritems(path->nodes[0]); 1497 if (path->slots[0] >= nritems) { 1498 ret = btrfs_next_leaf(root, path); 1499 if (ret) 1500 break; 1501 } 1502 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 1503 path->slots[0]); 1504 if (found_key.objectid != dirid || 1505 found_key.type != dir_key.type) 1506 goto next_type; 1507 1508 if (found_key.offset > range_end) 1509 break; 1510 1511 ret = check_item_in_log(trans, root, log, path, 1512 log_path, dir, 1513 &found_key); 1514 BUG_ON(ret); 1515 if (found_key.offset == (u64)-1) 1516 break; 1517 dir_key.offset = found_key.offset + 1; 1518 } 1519 btrfs_release_path(root, path); 1520 if (range_end == (u64)-1) 1521 break; 1522 range_start = range_end + 1; 1523 } 1524 1525 next_type: 1526 ret = 0; 1527 if (key_type == BTRFS_DIR_LOG_ITEM_KEY) { 1528 key_type = BTRFS_DIR_LOG_INDEX_KEY; 1529 dir_key.type = BTRFS_DIR_INDEX_KEY; 1530 btrfs_release_path(root, path); 1531 goto again; 1532 } 1533 out: 1534 btrfs_release_path(root, path); 1535 btrfs_free_path(log_path); 1536 iput(dir); 1537 return ret; 1538 } 1539 1540 /* 1541 * the process_func used to replay items from the log tree. This 1542 * gets called in two different stages. The first stage just looks 1543 * for inodes and makes sure they are all copied into the subvolume. 1544 * 1545 * The second stage copies all the other item types from the log into 1546 * the subvolume. The two stage approach is slower, but gets rid of 1547 * lots of complexity around inodes referencing other inodes that exist 1548 * only in the log (references come from either directory items or inode 1549 * back refs). 1550 */ 1551 static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, 1552 struct walk_control *wc, u64 gen) 1553 { 1554 int nritems; 1555 struct btrfs_path *path; 1556 struct btrfs_root *root = wc->replay_dest; 1557 struct btrfs_key key; 1558 u32 item_size; 1559 int level; 1560 int i; 1561 int ret; 1562 1563 btrfs_read_buffer(eb, gen); 1564 1565 level = btrfs_header_level(eb); 1566 1567 if (level != 0) 1568 return 0; 1569 1570 path = btrfs_alloc_path(); 1571 BUG_ON(!path); 1572 1573 nritems = btrfs_header_nritems(eb); 1574 for (i = 0; i < nritems; i++) { 1575 btrfs_item_key_to_cpu(eb, &key, i); 1576 item_size = btrfs_item_size_nr(eb, i); 1577 1578 /* inode keys are done during the first stage */ 1579 if (key.type == BTRFS_INODE_ITEM_KEY && 1580 wc->stage == LOG_WALK_REPLAY_INODES) { 1581 struct inode *inode; 1582 struct btrfs_inode_item *inode_item; 1583 u32 mode; 1584 1585 inode_item = btrfs_item_ptr(eb, i, 1586 struct btrfs_inode_item); 1587 mode = btrfs_inode_mode(eb, inode_item); 1588 if (S_ISDIR(mode)) { 1589 ret = replay_dir_deletes(wc->trans, 1590 root, log, path, key.objectid, 0); 1591 BUG_ON(ret); 1592 } 1593 ret = overwrite_item(wc->trans, root, path, 1594 eb, i, &key); 1595 BUG_ON(ret); 1596 1597 /* for regular files, truncate away 1598 * extents past the new EOF 1599 */ 1600 if (S_ISREG(mode)) { 1601 inode = read_one_inode(root, 1602 key.objectid); 1603 BUG_ON(!inode); 1604 1605 ret = btrfs_truncate_inode_items(wc->trans, 1606 root, inode, inode->i_size, 1607 BTRFS_EXTENT_DATA_KEY); 1608 BUG_ON(ret); 1609 1610 /* if the nlink count is zero here, the iput 1611 * will free the inode. We bump it to make 1612 * sure it doesn't get freed until the link 1613 * count fixup is done 1614 */ 1615 if (inode->i_nlink == 0) { 1616 btrfs_inc_nlink(inode); 1617 btrfs_update_inode(wc->trans, 1618 root, inode); 1619 } 1620 iput(inode); 1621 } 1622 ret = link_to_fixup_dir(wc->trans, root, 1623 path, key.objectid); 1624 BUG_ON(ret); 1625 } 1626 if (wc->stage < LOG_WALK_REPLAY_ALL) 1627 continue; 1628 1629 /* these keys are simply copied */ 1630 if (key.type == BTRFS_XATTR_ITEM_KEY) { 1631 ret = overwrite_item(wc->trans, root, path, 1632 eb, i, &key); 1633 BUG_ON(ret); 1634 } else if (key.type == BTRFS_INODE_REF_KEY) { 1635 ret = add_inode_ref(wc->trans, root, log, path, 1636 eb, i, &key); 1637 BUG_ON(ret && ret != -ENOENT); 1638 } else if (key.type == BTRFS_EXTENT_DATA_KEY) { 1639 ret = replay_one_extent(wc->trans, root, path, 1640 eb, i, &key); 1641 BUG_ON(ret); 1642 } else if (key.type == BTRFS_DIR_ITEM_KEY || 1643 key.type == BTRFS_DIR_INDEX_KEY) { 1644 ret = replay_one_dir_item(wc->trans, root, path, 1645 eb, i, &key); 1646 BUG_ON(ret); 1647 } 1648 } 1649 btrfs_free_path(path); 1650 return 0; 1651 } 1652 1653 static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, 1654 struct btrfs_root *root, 1655 struct btrfs_path *path, int *level, 1656 struct walk_control *wc) 1657 { 1658 u64 root_owner; 1659 u64 root_gen; 1660 u64 bytenr; 1661 u64 ptr_gen; 1662 struct extent_buffer *next; 1663 struct extent_buffer *cur; 1664 struct extent_buffer *parent; 1665 u32 blocksize; 1666 int ret = 0; 1667 1668 WARN_ON(*level < 0); 1669 WARN_ON(*level >= BTRFS_MAX_LEVEL); 1670 1671 while (*level > 0) { 1672 WARN_ON(*level < 0); 1673 WARN_ON(*level >= BTRFS_MAX_LEVEL); 1674 cur = path->nodes[*level]; 1675 1676 if (btrfs_header_level(cur) != *level) 1677 WARN_ON(1); 1678 1679 if (path->slots[*level] >= 1680 btrfs_header_nritems(cur)) 1681 break; 1682 1683 bytenr = btrfs_node_blockptr(cur, path->slots[*level]); 1684 ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]); 1685 blocksize = btrfs_level_size(root, *level - 1); 1686 1687 parent = path->nodes[*level]; 1688 root_owner = btrfs_header_owner(parent); 1689 root_gen = btrfs_header_generation(parent); 1690 1691 next = btrfs_find_create_tree_block(root, bytenr, blocksize); 1692 1693 wc->process_func(root, next, wc, ptr_gen); 1694 1695 if (*level == 1) { 1696 path->slots[*level]++; 1697 if (wc->free) { 1698 btrfs_read_buffer(next, ptr_gen); 1699 1700 btrfs_tree_lock(next); 1701 clean_tree_block(trans, root, next); 1702 btrfs_set_lock_blocking(next); 1703 btrfs_wait_tree_block_writeback(next); 1704 btrfs_tree_unlock(next); 1705 1706 WARN_ON(root_owner != 1707 BTRFS_TREE_LOG_OBJECTID); 1708 ret = btrfs_free_reserved_extent(root, 1709 bytenr, blocksize); 1710 BUG_ON(ret); 1711 } 1712 free_extent_buffer(next); 1713 continue; 1714 } 1715 btrfs_read_buffer(next, ptr_gen); 1716 1717 WARN_ON(*level <= 0); 1718 if (path->nodes[*level-1]) 1719 free_extent_buffer(path->nodes[*level-1]); 1720 path->nodes[*level-1] = next; 1721 *level = btrfs_header_level(next); 1722 path->slots[*level] = 0; 1723 cond_resched(); 1724 } 1725 WARN_ON(*level < 0); 1726 WARN_ON(*level >= BTRFS_MAX_LEVEL); 1727 1728 if (path->nodes[*level] == root->node) 1729 parent = path->nodes[*level]; 1730 else 1731 parent = path->nodes[*level + 1]; 1732 1733 bytenr = path->nodes[*level]->start; 1734 1735 blocksize = btrfs_level_size(root, *level); 1736 root_owner = btrfs_header_owner(parent); 1737 root_gen = btrfs_header_generation(parent); 1738 1739 wc->process_func(root, path->nodes[*level], wc, 1740 btrfs_header_generation(path->nodes[*level])); 1741 1742 if (wc->free) { 1743 next = path->nodes[*level]; 1744 btrfs_tree_lock(next); 1745 clean_tree_block(trans, root, next); 1746 btrfs_set_lock_blocking(next); 1747 btrfs_wait_tree_block_writeback(next); 1748 btrfs_tree_unlock(next); 1749 1750 WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID); 1751 ret = btrfs_free_reserved_extent(root, bytenr, blocksize); 1752 BUG_ON(ret); 1753 } 1754 free_extent_buffer(path->nodes[*level]); 1755 path->nodes[*level] = NULL; 1756 *level += 1; 1757 1758 cond_resched(); 1759 return 0; 1760 } 1761 1762 static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, 1763 struct btrfs_root *root, 1764 struct btrfs_path *path, int *level, 1765 struct walk_control *wc) 1766 { 1767 u64 root_owner; 1768 u64 root_gen; 1769 int i; 1770 int slot; 1771 int ret; 1772 1773 for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) { 1774 slot = path->slots[i]; 1775 if (slot < btrfs_header_nritems(path->nodes[i]) - 1) { 1776 struct extent_buffer *node; 1777 node = path->nodes[i]; 1778 path->slots[i]++; 1779 *level = i; 1780 WARN_ON(*level == 0); 1781 return 0; 1782 } else { 1783 struct extent_buffer *parent; 1784 if (path->nodes[*level] == root->node) 1785 parent = path->nodes[*level]; 1786 else 1787 parent = path->nodes[*level + 1]; 1788 1789 root_owner = btrfs_header_owner(parent); 1790 root_gen = btrfs_header_generation(parent); 1791 wc->process_func(root, path->nodes[*level], wc, 1792 btrfs_header_generation(path->nodes[*level])); 1793 if (wc->free) { 1794 struct extent_buffer *next; 1795 1796 next = path->nodes[*level]; 1797 1798 btrfs_tree_lock(next); 1799 clean_tree_block(trans, root, next); 1800 btrfs_set_lock_blocking(next); 1801 btrfs_wait_tree_block_writeback(next); 1802 btrfs_tree_unlock(next); 1803 1804 WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID); 1805 ret = btrfs_free_reserved_extent(root, 1806 path->nodes[*level]->start, 1807 path->nodes[*level]->len); 1808 BUG_ON(ret); 1809 } 1810 free_extent_buffer(path->nodes[*level]); 1811 path->nodes[*level] = NULL; 1812 *level = i + 1; 1813 } 1814 } 1815 return 1; 1816 } 1817 1818 /* 1819 * drop the reference count on the tree rooted at 'snap'. This traverses 1820 * the tree freeing any blocks that have a ref count of zero after being 1821 * decremented. 1822 */ 1823 static int walk_log_tree(struct btrfs_trans_handle *trans, 1824 struct btrfs_root *log, struct walk_control *wc) 1825 { 1826 int ret = 0; 1827 int wret; 1828 int level; 1829 struct btrfs_path *path; 1830 int i; 1831 int orig_level; 1832 1833 path = btrfs_alloc_path(); 1834 BUG_ON(!path); 1835 1836 level = btrfs_header_level(log->node); 1837 orig_level = level; 1838 path->nodes[level] = log->node; 1839 extent_buffer_get(log->node); 1840 path->slots[level] = 0; 1841 1842 while (1) { 1843 wret = walk_down_log_tree(trans, log, path, &level, wc); 1844 if (wret > 0) 1845 break; 1846 if (wret < 0) 1847 ret = wret; 1848 1849 wret = walk_up_log_tree(trans, log, path, &level, wc); 1850 if (wret > 0) 1851 break; 1852 if (wret < 0) 1853 ret = wret; 1854 } 1855 1856 /* was the root node processed? if not, catch it here */ 1857 if (path->nodes[orig_level]) { 1858 wc->process_func(log, path->nodes[orig_level], wc, 1859 btrfs_header_generation(path->nodes[orig_level])); 1860 if (wc->free) { 1861 struct extent_buffer *next; 1862 1863 next = path->nodes[orig_level]; 1864 1865 btrfs_tree_lock(next); 1866 clean_tree_block(trans, log, next); 1867 btrfs_set_lock_blocking(next); 1868 btrfs_wait_tree_block_writeback(next); 1869 btrfs_tree_unlock(next); 1870 1871 WARN_ON(log->root_key.objectid != 1872 BTRFS_TREE_LOG_OBJECTID); 1873 ret = btrfs_free_reserved_extent(log, next->start, 1874 next->len); 1875 BUG_ON(ret); 1876 } 1877 } 1878 1879 for (i = 0; i <= orig_level; i++) { 1880 if (path->nodes[i]) { 1881 free_extent_buffer(path->nodes[i]); 1882 path->nodes[i] = NULL; 1883 } 1884 } 1885 btrfs_free_path(path); 1886 return ret; 1887 } 1888 1889 /* 1890 * helper function to update the item for a given subvolumes log root 1891 * in the tree of log roots 1892 */ 1893 static int update_log_root(struct btrfs_trans_handle *trans, 1894 struct btrfs_root *log) 1895 { 1896 int ret; 1897 1898 if (log->log_transid == 1) { 1899 /* insert root item on the first sync */ 1900 ret = btrfs_insert_root(trans, log->fs_info->log_root_tree, 1901 &log->root_key, &log->root_item); 1902 } else { 1903 ret = btrfs_update_root(trans, log->fs_info->log_root_tree, 1904 &log->root_key, &log->root_item); 1905 } 1906 return ret; 1907 } 1908 1909 static int wait_log_commit(struct btrfs_trans_handle *trans, 1910 struct btrfs_root *root, unsigned long transid) 1911 { 1912 DEFINE_WAIT(wait); 1913 int index = transid % 2; 1914 1915 /* 1916 * we only allow two pending log transactions at a time, 1917 * so we know that if ours is more than 2 older than the 1918 * current transaction, we're done 1919 */ 1920 do { 1921 prepare_to_wait(&root->log_commit_wait[index], 1922 &wait, TASK_UNINTERRUPTIBLE); 1923 mutex_unlock(&root->log_mutex); 1924 1925 if (root->fs_info->last_trans_log_full_commit != 1926 trans->transid && root->log_transid < transid + 2 && 1927 atomic_read(&root->log_commit[index])) 1928 schedule(); 1929 1930 finish_wait(&root->log_commit_wait[index], &wait); 1931 mutex_lock(&root->log_mutex); 1932 } while (root->log_transid < transid + 2 && 1933 atomic_read(&root->log_commit[index])); 1934 return 0; 1935 } 1936 1937 static int wait_for_writer(struct btrfs_trans_handle *trans, 1938 struct btrfs_root *root) 1939 { 1940 DEFINE_WAIT(wait); 1941 while (atomic_read(&root->log_writers)) { 1942 prepare_to_wait(&root->log_writer_wait, 1943 &wait, TASK_UNINTERRUPTIBLE); 1944 mutex_unlock(&root->log_mutex); 1945 if (root->fs_info->last_trans_log_full_commit != 1946 trans->transid && atomic_read(&root->log_writers)) 1947 schedule(); 1948 mutex_lock(&root->log_mutex); 1949 finish_wait(&root->log_writer_wait, &wait); 1950 } 1951 return 0; 1952 } 1953 1954 /* 1955 * btrfs_sync_log does sends a given tree log down to the disk and 1956 * updates the super blocks to record it. When this call is done, 1957 * you know that any inodes previously logged are safely on disk only 1958 * if it returns 0. 1959 * 1960 * Any other return value means you need to call btrfs_commit_transaction. 1961 * Some of the edge cases for fsyncing directories that have had unlinks 1962 * or renames done in the past mean that sometimes the only safe 1963 * fsync is to commit the whole FS. When btrfs_sync_log returns -EAGAIN, 1964 * that has happened. 1965 */ 1966 int btrfs_sync_log(struct btrfs_trans_handle *trans, 1967 struct btrfs_root *root) 1968 { 1969 int index1; 1970 int index2; 1971 int ret; 1972 struct btrfs_root *log = root->log_root; 1973 struct btrfs_root *log_root_tree = root->fs_info->log_root_tree; 1974 1975 mutex_lock(&root->log_mutex); 1976 index1 = root->log_transid % 2; 1977 if (atomic_read(&root->log_commit[index1])) { 1978 wait_log_commit(trans, root, root->log_transid); 1979 mutex_unlock(&root->log_mutex); 1980 return 0; 1981 } 1982 atomic_set(&root->log_commit[index1], 1); 1983 1984 /* wait for previous tree log sync to complete */ 1985 if (atomic_read(&root->log_commit[(index1 + 1) % 2])) 1986 wait_log_commit(trans, root, root->log_transid - 1); 1987 1988 while (1) { 1989 unsigned long batch = root->log_batch; 1990 mutex_unlock(&root->log_mutex); 1991 schedule_timeout_uninterruptible(1); 1992 mutex_lock(&root->log_mutex); 1993 1994 wait_for_writer(trans, root); 1995 if (batch == root->log_batch) 1996 break; 1997 } 1998 1999 /* bail out if we need to do a full commit */ 2000 if (root->fs_info->last_trans_log_full_commit == trans->transid) { 2001 ret = -EAGAIN; 2002 mutex_unlock(&root->log_mutex); 2003 goto out; 2004 } 2005 2006 ret = btrfs_write_and_wait_marked_extents(log, &log->dirty_log_pages); 2007 BUG_ON(ret); 2008 2009 btrfs_set_root_node(&log->root_item, log->node); 2010 2011 root->log_batch = 0; 2012 root->log_transid++; 2013 log->log_transid = root->log_transid; 2014 smp_mb(); 2015 /* 2016 * log tree has been flushed to disk, new modifications of 2017 * the log will be written to new positions. so it's safe to 2018 * allow log writers to go in. 2019 */ 2020 mutex_unlock(&root->log_mutex); 2021 2022 mutex_lock(&log_root_tree->log_mutex); 2023 log_root_tree->log_batch++; 2024 atomic_inc(&log_root_tree->log_writers); 2025 mutex_unlock(&log_root_tree->log_mutex); 2026 2027 ret = update_log_root(trans, log); 2028 BUG_ON(ret); 2029 2030 mutex_lock(&log_root_tree->log_mutex); 2031 if (atomic_dec_and_test(&log_root_tree->log_writers)) { 2032 smp_mb(); 2033 if (waitqueue_active(&log_root_tree->log_writer_wait)) 2034 wake_up(&log_root_tree->log_writer_wait); 2035 } 2036 2037 index2 = log_root_tree->log_transid % 2; 2038 if (atomic_read(&log_root_tree->log_commit[index2])) { 2039 wait_log_commit(trans, log_root_tree, 2040 log_root_tree->log_transid); 2041 mutex_unlock(&log_root_tree->log_mutex); 2042 goto out; 2043 } 2044 atomic_set(&log_root_tree->log_commit[index2], 1); 2045 2046 if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) { 2047 wait_log_commit(trans, log_root_tree, 2048 log_root_tree->log_transid - 1); 2049 } 2050 2051 wait_for_writer(trans, log_root_tree); 2052 2053 /* 2054 * now that we've moved on to the tree of log tree roots, 2055 * check the full commit flag again 2056 */ 2057 if (root->fs_info->last_trans_log_full_commit == trans->transid) { 2058 mutex_unlock(&log_root_tree->log_mutex); 2059 ret = -EAGAIN; 2060 goto out_wake_log_root; 2061 } 2062 2063 ret = btrfs_write_and_wait_marked_extents(log_root_tree, 2064 &log_root_tree->dirty_log_pages); 2065 BUG_ON(ret); 2066 2067 btrfs_set_super_log_root(&root->fs_info->super_for_commit, 2068 log_root_tree->node->start); 2069 btrfs_set_super_log_root_level(&root->fs_info->super_for_commit, 2070 btrfs_header_level(log_root_tree->node)); 2071 2072 log_root_tree->log_batch = 0; 2073 log_root_tree->log_transid++; 2074 smp_mb(); 2075 2076 mutex_unlock(&log_root_tree->log_mutex); 2077 2078 /* 2079 * nobody else is going to jump in and write the the ctree 2080 * super here because the log_commit atomic below is protecting 2081 * us. We must be called with a transaction handle pinning 2082 * the running transaction open, so a full commit can't hop 2083 * in and cause problems either. 2084 */ 2085 write_ctree_super(trans, root->fs_info->tree_root, 2); 2086 ret = 0; 2087 2088 out_wake_log_root: 2089 atomic_set(&log_root_tree->log_commit[index2], 0); 2090 smp_mb(); 2091 if (waitqueue_active(&log_root_tree->log_commit_wait[index2])) 2092 wake_up(&log_root_tree->log_commit_wait[index2]); 2093 out: 2094 atomic_set(&root->log_commit[index1], 0); 2095 smp_mb(); 2096 if (waitqueue_active(&root->log_commit_wait[index1])) 2097 wake_up(&root->log_commit_wait[index1]); 2098 return 0; 2099 } 2100 2101 /* 2102 * free all the extents used by the tree log. This should be called 2103 * at commit time of the full transaction 2104 */ 2105 int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root) 2106 { 2107 int ret; 2108 struct btrfs_root *log; 2109 struct key; 2110 u64 start; 2111 u64 end; 2112 struct walk_control wc = { 2113 .free = 1, 2114 .process_func = process_one_buffer 2115 }; 2116 2117 if (!root->log_root || root->fs_info->log_root_recovering) 2118 return 0; 2119 2120 log = root->log_root; 2121 ret = walk_log_tree(trans, log, &wc); 2122 BUG_ON(ret); 2123 2124 while (1) { 2125 ret = find_first_extent_bit(&log->dirty_log_pages, 2126 0, &start, &end, EXTENT_DIRTY); 2127 if (ret) 2128 break; 2129 2130 clear_extent_dirty(&log->dirty_log_pages, 2131 start, end, GFP_NOFS); 2132 } 2133 2134 if (log->log_transid > 0) { 2135 ret = btrfs_del_root(trans, root->fs_info->log_root_tree, 2136 &log->root_key); 2137 BUG_ON(ret); 2138 } 2139 root->log_root = NULL; 2140 free_extent_buffer(log->node); 2141 kfree(log); 2142 return 0; 2143 } 2144 2145 /* 2146 * If both a file and directory are logged, and unlinks or renames are 2147 * mixed in, we have a few interesting corners: 2148 * 2149 * create file X in dir Y 2150 * link file X to X.link in dir Y 2151 * fsync file X 2152 * unlink file X but leave X.link 2153 * fsync dir Y 2154 * 2155 * After a crash we would expect only X.link to exist. But file X 2156 * didn't get fsync'd again so the log has back refs for X and X.link. 2157 * 2158 * We solve this by removing directory entries and inode backrefs from the 2159 * log when a file that was logged in the current transaction is 2160 * unlinked. Any later fsync will include the updated log entries, and 2161 * we'll be able to reconstruct the proper directory items from backrefs. 2162 * 2163 * This optimizations allows us to avoid relogging the entire inode 2164 * or the entire directory. 2165 */ 2166 int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, 2167 struct btrfs_root *root, 2168 const char *name, int name_len, 2169 struct inode *dir, u64 index) 2170 { 2171 struct btrfs_root *log; 2172 struct btrfs_dir_item *di; 2173 struct btrfs_path *path; 2174 int ret; 2175 int bytes_del = 0; 2176 2177 if (BTRFS_I(dir)->logged_trans < trans->transid) 2178 return 0; 2179 2180 ret = join_running_log_trans(root); 2181 if (ret) 2182 return 0; 2183 2184 mutex_lock(&BTRFS_I(dir)->log_mutex); 2185 2186 log = root->log_root; 2187 path = btrfs_alloc_path(); 2188 di = btrfs_lookup_dir_item(trans, log, path, dir->i_ino, 2189 name, name_len, -1); 2190 if (di && !IS_ERR(di)) { 2191 ret = btrfs_delete_one_dir_name(trans, log, path, di); 2192 bytes_del += name_len; 2193 BUG_ON(ret); 2194 } 2195 btrfs_release_path(log, path); 2196 di = btrfs_lookup_dir_index_item(trans, log, path, dir->i_ino, 2197 index, name, name_len, -1); 2198 if (di && !IS_ERR(di)) { 2199 ret = btrfs_delete_one_dir_name(trans, log, path, di); 2200 bytes_del += name_len; 2201 BUG_ON(ret); 2202 } 2203 2204 /* update the directory size in the log to reflect the names 2205 * we have removed 2206 */ 2207 if (bytes_del) { 2208 struct btrfs_key key; 2209 2210 key.objectid = dir->i_ino; 2211 key.offset = 0; 2212 key.type = BTRFS_INODE_ITEM_KEY; 2213 btrfs_release_path(log, path); 2214 2215 ret = btrfs_search_slot(trans, log, &key, path, 0, 1); 2216 if (ret == 0) { 2217 struct btrfs_inode_item *item; 2218 u64 i_size; 2219 2220 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 2221 struct btrfs_inode_item); 2222 i_size = btrfs_inode_size(path->nodes[0], item); 2223 if (i_size > bytes_del) 2224 i_size -= bytes_del; 2225 else 2226 i_size = 0; 2227 btrfs_set_inode_size(path->nodes[0], item, i_size); 2228 btrfs_mark_buffer_dirty(path->nodes[0]); 2229 } else 2230 ret = 0; 2231 btrfs_release_path(log, path); 2232 } 2233 2234 btrfs_free_path(path); 2235 mutex_unlock(&BTRFS_I(dir)->log_mutex); 2236 btrfs_end_log_trans(root); 2237 2238 return 0; 2239 } 2240 2241 /* see comments for btrfs_del_dir_entries_in_log */ 2242 int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, 2243 struct btrfs_root *root, 2244 const char *name, int name_len, 2245 struct inode *inode, u64 dirid) 2246 { 2247 struct btrfs_root *log; 2248 u64 index; 2249 int ret; 2250 2251 if (BTRFS_I(inode)->logged_trans < trans->transid) 2252 return 0; 2253 2254 ret = join_running_log_trans(root); 2255 if (ret) 2256 return 0; 2257 log = root->log_root; 2258 mutex_lock(&BTRFS_I(inode)->log_mutex); 2259 2260 ret = btrfs_del_inode_ref(trans, log, name, name_len, inode->i_ino, 2261 dirid, &index); 2262 mutex_unlock(&BTRFS_I(inode)->log_mutex); 2263 btrfs_end_log_trans(root); 2264 2265 return ret; 2266 } 2267 2268 /* 2269 * creates a range item in the log for 'dirid'. first_offset and 2270 * last_offset tell us which parts of the key space the log should 2271 * be considered authoritative for. 2272 */ 2273 static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans, 2274 struct btrfs_root *log, 2275 struct btrfs_path *path, 2276 int key_type, u64 dirid, 2277 u64 first_offset, u64 last_offset) 2278 { 2279 int ret; 2280 struct btrfs_key key; 2281 struct btrfs_dir_log_item *item; 2282 2283 key.objectid = dirid; 2284 key.offset = first_offset; 2285 if (key_type == BTRFS_DIR_ITEM_KEY) 2286 key.type = BTRFS_DIR_LOG_ITEM_KEY; 2287 else 2288 key.type = BTRFS_DIR_LOG_INDEX_KEY; 2289 ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item)); 2290 BUG_ON(ret); 2291 2292 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 2293 struct btrfs_dir_log_item); 2294 btrfs_set_dir_log_end(path->nodes[0], item, last_offset); 2295 btrfs_mark_buffer_dirty(path->nodes[0]); 2296 btrfs_release_path(log, path); 2297 return 0; 2298 } 2299 2300 /* 2301 * log all the items included in the current transaction for a given 2302 * directory. This also creates the range items in the log tree required 2303 * to replay anything deleted before the fsync 2304 */ 2305 static noinline int log_dir_items(struct btrfs_trans_handle *trans, 2306 struct btrfs_root *root, struct inode *inode, 2307 struct btrfs_path *path, 2308 struct btrfs_path *dst_path, int key_type, 2309 u64 min_offset, u64 *last_offset_ret) 2310 { 2311 struct btrfs_key min_key; 2312 struct btrfs_key max_key; 2313 struct btrfs_root *log = root->log_root; 2314 struct extent_buffer *src; 2315 int ret; 2316 int i; 2317 int nritems; 2318 u64 first_offset = min_offset; 2319 u64 last_offset = (u64)-1; 2320 2321 log = root->log_root; 2322 max_key.objectid = inode->i_ino; 2323 max_key.offset = (u64)-1; 2324 max_key.type = key_type; 2325 2326 min_key.objectid = inode->i_ino; 2327 min_key.type = key_type; 2328 min_key.offset = min_offset; 2329 2330 path->keep_locks = 1; 2331 2332 ret = btrfs_search_forward(root, &min_key, &max_key, 2333 path, 0, trans->transid); 2334 2335 /* 2336 * we didn't find anything from this transaction, see if there 2337 * is anything at all 2338 */ 2339 if (ret != 0 || min_key.objectid != inode->i_ino || 2340 min_key.type != key_type) { 2341 min_key.objectid = inode->i_ino; 2342 min_key.type = key_type; 2343 min_key.offset = (u64)-1; 2344 btrfs_release_path(root, path); 2345 ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); 2346 if (ret < 0) { 2347 btrfs_release_path(root, path); 2348 return ret; 2349 } 2350 ret = btrfs_previous_item(root, path, inode->i_ino, key_type); 2351 2352 /* if ret == 0 there are items for this type, 2353 * create a range to tell us the last key of this type. 2354 * otherwise, there are no items in this directory after 2355 * *min_offset, and we create a range to indicate that. 2356 */ 2357 if (ret == 0) { 2358 struct btrfs_key tmp; 2359 btrfs_item_key_to_cpu(path->nodes[0], &tmp, 2360 path->slots[0]); 2361 if (key_type == tmp.type) 2362 first_offset = max(min_offset, tmp.offset) + 1; 2363 } 2364 goto done; 2365 } 2366 2367 /* go backward to find any previous key */ 2368 ret = btrfs_previous_item(root, path, inode->i_ino, key_type); 2369 if (ret == 0) { 2370 struct btrfs_key tmp; 2371 btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]); 2372 if (key_type == tmp.type) { 2373 first_offset = tmp.offset; 2374 ret = overwrite_item(trans, log, dst_path, 2375 path->nodes[0], path->slots[0], 2376 &tmp); 2377 } 2378 } 2379 btrfs_release_path(root, path); 2380 2381 /* find the first key from this transaction again */ 2382 ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); 2383 if (ret != 0) { 2384 WARN_ON(1); 2385 goto done; 2386 } 2387 2388 /* 2389 * we have a block from this transaction, log every item in it 2390 * from our directory 2391 */ 2392 while (1) { 2393 struct btrfs_key tmp; 2394 src = path->nodes[0]; 2395 nritems = btrfs_header_nritems(src); 2396 for (i = path->slots[0]; i < nritems; i++) { 2397 btrfs_item_key_to_cpu(src, &min_key, i); 2398 2399 if (min_key.objectid != inode->i_ino || 2400 min_key.type != key_type) 2401 goto done; 2402 ret = overwrite_item(trans, log, dst_path, src, i, 2403 &min_key); 2404 BUG_ON(ret); 2405 } 2406 path->slots[0] = nritems; 2407 2408 /* 2409 * look ahead to the next item and see if it is also 2410 * from this directory and from this transaction 2411 */ 2412 ret = btrfs_next_leaf(root, path); 2413 if (ret == 1) { 2414 last_offset = (u64)-1; 2415 goto done; 2416 } 2417 btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]); 2418 if (tmp.objectid != inode->i_ino || tmp.type != key_type) { 2419 last_offset = (u64)-1; 2420 goto done; 2421 } 2422 if (btrfs_header_generation(path->nodes[0]) != trans->transid) { 2423 ret = overwrite_item(trans, log, dst_path, 2424 path->nodes[0], path->slots[0], 2425 &tmp); 2426 2427 BUG_ON(ret); 2428 last_offset = tmp.offset; 2429 goto done; 2430 } 2431 } 2432 done: 2433 *last_offset_ret = last_offset; 2434 btrfs_release_path(root, path); 2435 btrfs_release_path(log, dst_path); 2436 2437 /* insert the log range keys to indicate where the log is valid */ 2438 ret = insert_dir_log_key(trans, log, path, key_type, inode->i_ino, 2439 first_offset, last_offset); 2440 BUG_ON(ret); 2441 return 0; 2442 } 2443 2444 /* 2445 * logging directories is very similar to logging inodes, We find all the items 2446 * from the current transaction and write them to the log. 2447 * 2448 * The recovery code scans the directory in the subvolume, and if it finds a 2449 * key in the range logged that is not present in the log tree, then it means 2450 * that dir entry was unlinked during the transaction. 2451 * 2452 * In order for that scan to work, we must include one key smaller than 2453 * the smallest logged by this transaction and one key larger than the largest 2454 * key logged by this transaction. 2455 */ 2456 static noinline int log_directory_changes(struct btrfs_trans_handle *trans, 2457 struct btrfs_root *root, struct inode *inode, 2458 struct btrfs_path *path, 2459 struct btrfs_path *dst_path) 2460 { 2461 u64 min_key; 2462 u64 max_key; 2463 int ret; 2464 int key_type = BTRFS_DIR_ITEM_KEY; 2465 2466 again: 2467 min_key = 0; 2468 max_key = 0; 2469 while (1) { 2470 ret = log_dir_items(trans, root, inode, path, 2471 dst_path, key_type, min_key, 2472 &max_key); 2473 BUG_ON(ret); 2474 if (max_key == (u64)-1) 2475 break; 2476 min_key = max_key + 1; 2477 } 2478 2479 if (key_type == BTRFS_DIR_ITEM_KEY) { 2480 key_type = BTRFS_DIR_INDEX_KEY; 2481 goto again; 2482 } 2483 return 0; 2484 } 2485 2486 /* 2487 * a helper function to drop items from the log before we relog an 2488 * inode. max_key_type indicates the highest item type to remove. 2489 * This cannot be run for file data extents because it does not 2490 * free the extents they point to. 2491 */ 2492 static int drop_objectid_items(struct btrfs_trans_handle *trans, 2493 struct btrfs_root *log, 2494 struct btrfs_path *path, 2495 u64 objectid, int max_key_type) 2496 { 2497 int ret; 2498 struct btrfs_key key; 2499 struct btrfs_key found_key; 2500 2501 key.objectid = objectid; 2502 key.type = max_key_type; 2503 key.offset = (u64)-1; 2504 2505 while (1) { 2506 ret = btrfs_search_slot(trans, log, &key, path, -1, 1); 2507 2508 if (ret != 1) 2509 break; 2510 2511 if (path->slots[0] == 0) 2512 break; 2513 2514 path->slots[0]--; 2515 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 2516 path->slots[0]); 2517 2518 if (found_key.objectid != objectid) 2519 break; 2520 2521 ret = btrfs_del_item(trans, log, path); 2522 BUG_ON(ret); 2523 btrfs_release_path(log, path); 2524 } 2525 btrfs_release_path(log, path); 2526 return 0; 2527 } 2528 2529 static noinline int copy_items(struct btrfs_trans_handle *trans, 2530 struct btrfs_root *log, 2531 struct btrfs_path *dst_path, 2532 struct extent_buffer *src, 2533 int start_slot, int nr, int inode_only) 2534 { 2535 unsigned long src_offset; 2536 unsigned long dst_offset; 2537 struct btrfs_file_extent_item *extent; 2538 struct btrfs_inode_item *inode_item; 2539 int ret; 2540 struct btrfs_key *ins_keys; 2541 u32 *ins_sizes; 2542 char *ins_data; 2543 int i; 2544 struct list_head ordered_sums; 2545 2546 INIT_LIST_HEAD(&ordered_sums); 2547 2548 ins_data = kmalloc(nr * sizeof(struct btrfs_key) + 2549 nr * sizeof(u32), GFP_NOFS); 2550 ins_sizes = (u32 *)ins_data; 2551 ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32)); 2552 2553 for (i = 0; i < nr; i++) { 2554 ins_sizes[i] = btrfs_item_size_nr(src, i + start_slot); 2555 btrfs_item_key_to_cpu(src, ins_keys + i, i + start_slot); 2556 } 2557 ret = btrfs_insert_empty_items(trans, log, dst_path, 2558 ins_keys, ins_sizes, nr); 2559 BUG_ON(ret); 2560 2561 for (i = 0; i < nr; i++, dst_path->slots[0]++) { 2562 dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0], 2563 dst_path->slots[0]); 2564 2565 src_offset = btrfs_item_ptr_offset(src, start_slot + i); 2566 2567 copy_extent_buffer(dst_path->nodes[0], src, dst_offset, 2568 src_offset, ins_sizes[i]); 2569 2570 if (inode_only == LOG_INODE_EXISTS && 2571 ins_keys[i].type == BTRFS_INODE_ITEM_KEY) { 2572 inode_item = btrfs_item_ptr(dst_path->nodes[0], 2573 dst_path->slots[0], 2574 struct btrfs_inode_item); 2575 btrfs_set_inode_size(dst_path->nodes[0], inode_item, 0); 2576 2577 /* set the generation to zero so the recover code 2578 * can tell the difference between an logging 2579 * just to say 'this inode exists' and a logging 2580 * to say 'update this inode with these values' 2581 */ 2582 btrfs_set_inode_generation(dst_path->nodes[0], 2583 inode_item, 0); 2584 } 2585 /* take a reference on file data extents so that truncates 2586 * or deletes of this inode don't have to relog the inode 2587 * again 2588 */ 2589 if (btrfs_key_type(ins_keys + i) == BTRFS_EXTENT_DATA_KEY) { 2590 int found_type; 2591 extent = btrfs_item_ptr(src, start_slot + i, 2592 struct btrfs_file_extent_item); 2593 2594 found_type = btrfs_file_extent_type(src, extent); 2595 if (found_type == BTRFS_FILE_EXTENT_REG || 2596 found_type == BTRFS_FILE_EXTENT_PREALLOC) { 2597 u64 ds, dl, cs, cl; 2598 ds = btrfs_file_extent_disk_bytenr(src, 2599 extent); 2600 /* ds == 0 is a hole */ 2601 if (ds == 0) 2602 continue; 2603 2604 dl = btrfs_file_extent_disk_num_bytes(src, 2605 extent); 2606 cs = btrfs_file_extent_offset(src, extent); 2607 cl = btrfs_file_extent_num_bytes(src, 2608 extent);; 2609 if (btrfs_file_extent_compression(src, 2610 extent)) { 2611 cs = 0; 2612 cl = dl; 2613 } 2614 2615 ret = btrfs_lookup_csums_range( 2616 log->fs_info->csum_root, 2617 ds + cs, ds + cs + cl - 1, 2618 &ordered_sums); 2619 BUG_ON(ret); 2620 } 2621 } 2622 } 2623 2624 btrfs_mark_buffer_dirty(dst_path->nodes[0]); 2625 btrfs_release_path(log, dst_path); 2626 kfree(ins_data); 2627 2628 /* 2629 * we have to do this after the loop above to avoid changing the 2630 * log tree while trying to change the log tree. 2631 */ 2632 while (!list_empty(&ordered_sums)) { 2633 struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next, 2634 struct btrfs_ordered_sum, 2635 list); 2636 ret = btrfs_csum_file_blocks(trans, log, sums); 2637 BUG_ON(ret); 2638 list_del(&sums->list); 2639 kfree(sums); 2640 } 2641 return 0; 2642 } 2643 2644 /* log a single inode in the tree log. 2645 * At least one parent directory for this inode must exist in the tree 2646 * or be logged already. 2647 * 2648 * Any items from this inode changed by the current transaction are copied 2649 * to the log tree. An extra reference is taken on any extents in this 2650 * file, allowing us to avoid a whole pile of corner cases around logging 2651 * blocks that have been removed from the tree. 2652 * 2653 * See LOG_INODE_ALL and related defines for a description of what inode_only 2654 * does. 2655 * 2656 * This handles both files and directories. 2657 */ 2658 static int btrfs_log_inode(struct btrfs_trans_handle *trans, 2659 struct btrfs_root *root, struct inode *inode, 2660 int inode_only) 2661 { 2662 struct btrfs_path *path; 2663 struct btrfs_path *dst_path; 2664 struct btrfs_key min_key; 2665 struct btrfs_key max_key; 2666 struct btrfs_root *log = root->log_root; 2667 struct extent_buffer *src = NULL; 2668 u32 size; 2669 int ret; 2670 int nritems; 2671 int ins_start_slot = 0; 2672 int ins_nr; 2673 2674 log = root->log_root; 2675 2676 path = btrfs_alloc_path(); 2677 dst_path = btrfs_alloc_path(); 2678 2679 min_key.objectid = inode->i_ino; 2680 min_key.type = BTRFS_INODE_ITEM_KEY; 2681 min_key.offset = 0; 2682 2683 max_key.objectid = inode->i_ino; 2684 2685 /* today the code can only do partial logging of directories */ 2686 if (!S_ISDIR(inode->i_mode)) 2687 inode_only = LOG_INODE_ALL; 2688 2689 if (inode_only == LOG_INODE_EXISTS || S_ISDIR(inode->i_mode)) 2690 max_key.type = BTRFS_XATTR_ITEM_KEY; 2691 else 2692 max_key.type = (u8)-1; 2693 max_key.offset = (u64)-1; 2694 2695 mutex_lock(&BTRFS_I(inode)->log_mutex); 2696 2697 /* 2698 * a brute force approach to making sure we get the most uptodate 2699 * copies of everything. 2700 */ 2701 if (S_ISDIR(inode->i_mode)) { 2702 int max_key_type = BTRFS_DIR_LOG_INDEX_KEY; 2703 2704 if (inode_only == LOG_INODE_EXISTS) 2705 max_key_type = BTRFS_XATTR_ITEM_KEY; 2706 ret = drop_objectid_items(trans, log, path, 2707 inode->i_ino, max_key_type); 2708 } else { 2709 ret = btrfs_truncate_inode_items(trans, log, inode, 0, 0); 2710 } 2711 BUG_ON(ret); 2712 path->keep_locks = 1; 2713 2714 while (1) { 2715 ins_nr = 0; 2716 ret = btrfs_search_forward(root, &min_key, &max_key, 2717 path, 0, trans->transid); 2718 if (ret != 0) 2719 break; 2720 again: 2721 /* note, ins_nr might be > 0 here, cleanup outside the loop */ 2722 if (min_key.objectid != inode->i_ino) 2723 break; 2724 if (min_key.type > max_key.type) 2725 break; 2726 2727 src = path->nodes[0]; 2728 size = btrfs_item_size_nr(src, path->slots[0]); 2729 if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) { 2730 ins_nr++; 2731 goto next_slot; 2732 } else if (!ins_nr) { 2733 ins_start_slot = path->slots[0]; 2734 ins_nr = 1; 2735 goto next_slot; 2736 } 2737 2738 ret = copy_items(trans, log, dst_path, src, ins_start_slot, 2739 ins_nr, inode_only); 2740 BUG_ON(ret); 2741 ins_nr = 1; 2742 ins_start_slot = path->slots[0]; 2743 next_slot: 2744 2745 nritems = btrfs_header_nritems(path->nodes[0]); 2746 path->slots[0]++; 2747 if (path->slots[0] < nritems) { 2748 btrfs_item_key_to_cpu(path->nodes[0], &min_key, 2749 path->slots[0]); 2750 goto again; 2751 } 2752 if (ins_nr) { 2753 ret = copy_items(trans, log, dst_path, src, 2754 ins_start_slot, 2755 ins_nr, inode_only); 2756 BUG_ON(ret); 2757 ins_nr = 0; 2758 } 2759 btrfs_release_path(root, path); 2760 2761 if (min_key.offset < (u64)-1) 2762 min_key.offset++; 2763 else if (min_key.type < (u8)-1) 2764 min_key.type++; 2765 else if (min_key.objectid < (u64)-1) 2766 min_key.objectid++; 2767 else 2768 break; 2769 } 2770 if (ins_nr) { 2771 ret = copy_items(trans, log, dst_path, src, 2772 ins_start_slot, 2773 ins_nr, inode_only); 2774 BUG_ON(ret); 2775 ins_nr = 0; 2776 } 2777 WARN_ON(ins_nr); 2778 if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) { 2779 btrfs_release_path(root, path); 2780 btrfs_release_path(log, dst_path); 2781 ret = log_directory_changes(trans, root, inode, path, dst_path); 2782 BUG_ON(ret); 2783 } 2784 BTRFS_I(inode)->logged_trans = trans->transid; 2785 mutex_unlock(&BTRFS_I(inode)->log_mutex); 2786 2787 btrfs_free_path(path); 2788 btrfs_free_path(dst_path); 2789 return 0; 2790 } 2791 2792 /* 2793 * follow the dentry parent pointers up the chain and see if any 2794 * of the directories in it require a full commit before they can 2795 * be logged. Returns zero if nothing special needs to be done or 1 if 2796 * a full commit is required. 2797 */ 2798 static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans, 2799 struct inode *inode, 2800 struct dentry *parent, 2801 struct super_block *sb, 2802 u64 last_committed) 2803 { 2804 int ret = 0; 2805 struct btrfs_root *root; 2806 2807 /* 2808 * for regular files, if its inode is already on disk, we don't 2809 * have to worry about the parents at all. This is because 2810 * we can use the last_unlink_trans field to record renames 2811 * and other fun in this file. 2812 */ 2813 if (S_ISREG(inode->i_mode) && 2814 BTRFS_I(inode)->generation <= last_committed && 2815 BTRFS_I(inode)->last_unlink_trans <= last_committed) 2816 goto out; 2817 2818 if (!S_ISDIR(inode->i_mode)) { 2819 if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb) 2820 goto out; 2821 inode = parent->d_inode; 2822 } 2823 2824 while (1) { 2825 BTRFS_I(inode)->logged_trans = trans->transid; 2826 smp_mb(); 2827 2828 if (BTRFS_I(inode)->last_unlink_trans > last_committed) { 2829 root = BTRFS_I(inode)->root; 2830 2831 /* 2832 * make sure any commits to the log are forced 2833 * to be full commits 2834 */ 2835 root->fs_info->last_trans_log_full_commit = 2836 trans->transid; 2837 ret = 1; 2838 break; 2839 } 2840 2841 if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb) 2842 break; 2843 2844 if (parent == sb->s_root) 2845 break; 2846 2847 parent = parent->d_parent; 2848 inode = parent->d_inode; 2849 2850 } 2851 out: 2852 return ret; 2853 } 2854 2855 /* 2856 * helper function around btrfs_log_inode to make sure newly created 2857 * parent directories also end up in the log. A minimal inode and backref 2858 * only logging is done of any parent directories that are older than 2859 * the last committed transaction 2860 */ 2861 int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, 2862 struct btrfs_root *root, struct inode *inode, 2863 struct dentry *parent, int exists_only) 2864 { 2865 int inode_only = exists_only ? LOG_INODE_EXISTS : LOG_INODE_ALL; 2866 struct super_block *sb; 2867 int ret = 0; 2868 u64 last_committed = root->fs_info->last_trans_committed; 2869 2870 sb = inode->i_sb; 2871 2872 if (btrfs_test_opt(root, NOTREELOG)) { 2873 ret = 1; 2874 goto end_no_trans; 2875 } 2876 2877 if (root->fs_info->last_trans_log_full_commit > 2878 root->fs_info->last_trans_committed) { 2879 ret = 1; 2880 goto end_no_trans; 2881 } 2882 2883 ret = check_parent_dirs_for_sync(trans, inode, parent, 2884 sb, last_committed); 2885 if (ret) 2886 goto end_no_trans; 2887 2888 start_log_trans(trans, root); 2889 2890 ret = btrfs_log_inode(trans, root, inode, inode_only); 2891 BUG_ON(ret); 2892 2893 /* 2894 * for regular files, if its inode is already on disk, we don't 2895 * have to worry about the parents at all. This is because 2896 * we can use the last_unlink_trans field to record renames 2897 * and other fun in this file. 2898 */ 2899 if (S_ISREG(inode->i_mode) && 2900 BTRFS_I(inode)->generation <= last_committed && 2901 BTRFS_I(inode)->last_unlink_trans <= last_committed) 2902 goto no_parent; 2903 2904 inode_only = LOG_INODE_EXISTS; 2905 while (1) { 2906 if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb) 2907 break; 2908 2909 inode = parent->d_inode; 2910 if (BTRFS_I(inode)->generation > 2911 root->fs_info->last_trans_committed) { 2912 ret = btrfs_log_inode(trans, root, inode, inode_only); 2913 BUG_ON(ret); 2914 } 2915 if (parent == sb->s_root) 2916 break; 2917 2918 parent = parent->d_parent; 2919 } 2920 no_parent: 2921 ret = 0; 2922 btrfs_end_log_trans(root); 2923 end_no_trans: 2924 return ret; 2925 } 2926 2927 /* 2928 * it is not safe to log dentry if the chunk root has added new 2929 * chunks. This returns 0 if the dentry was logged, and 1 otherwise. 2930 * If this returns 1, you must commit the transaction to safely get your 2931 * data on disk. 2932 */ 2933 int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, 2934 struct btrfs_root *root, struct dentry *dentry) 2935 { 2936 return btrfs_log_inode_parent(trans, root, dentry->d_inode, 2937 dentry->d_parent, 0); 2938 } 2939 2940 /* 2941 * should be called during mount to recover any replay any log trees 2942 * from the FS 2943 */ 2944 int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) 2945 { 2946 int ret; 2947 struct btrfs_path *path; 2948 struct btrfs_trans_handle *trans; 2949 struct btrfs_key key; 2950 struct btrfs_key found_key; 2951 struct btrfs_key tmp_key; 2952 struct btrfs_root *log; 2953 struct btrfs_fs_info *fs_info = log_root_tree->fs_info; 2954 u64 highest_inode; 2955 struct walk_control wc = { 2956 .process_func = process_one_buffer, 2957 .stage = 0, 2958 }; 2959 2960 fs_info->log_root_recovering = 1; 2961 path = btrfs_alloc_path(); 2962 BUG_ON(!path); 2963 2964 trans = btrfs_start_transaction(fs_info->tree_root, 1); 2965 2966 wc.trans = trans; 2967 wc.pin = 1; 2968 2969 walk_log_tree(trans, log_root_tree, &wc); 2970 2971 again: 2972 key.objectid = BTRFS_TREE_LOG_OBJECTID; 2973 key.offset = (u64)-1; 2974 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); 2975 2976 while (1) { 2977 ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0); 2978 if (ret < 0) 2979 break; 2980 if (ret > 0) { 2981 if (path->slots[0] == 0) 2982 break; 2983 path->slots[0]--; 2984 } 2985 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 2986 path->slots[0]); 2987 btrfs_release_path(log_root_tree, path); 2988 if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID) 2989 break; 2990 2991 log = btrfs_read_fs_root_no_radix(log_root_tree, 2992 &found_key); 2993 BUG_ON(!log); 2994 2995 2996 tmp_key.objectid = found_key.offset; 2997 tmp_key.type = BTRFS_ROOT_ITEM_KEY; 2998 tmp_key.offset = (u64)-1; 2999 3000 wc.replay_dest = btrfs_read_fs_root_no_name(fs_info, &tmp_key); 3001 BUG_ON(!wc.replay_dest); 3002 3003 wc.replay_dest->log_root = log; 3004 btrfs_record_root_in_trans(trans, wc.replay_dest); 3005 ret = walk_log_tree(trans, log, &wc); 3006 BUG_ON(ret); 3007 3008 if (wc.stage == LOG_WALK_REPLAY_ALL) { 3009 ret = fixup_inode_link_counts(trans, wc.replay_dest, 3010 path); 3011 BUG_ON(ret); 3012 } 3013 ret = btrfs_find_highest_inode(wc.replay_dest, &highest_inode); 3014 if (ret == 0) { 3015 wc.replay_dest->highest_inode = highest_inode; 3016 wc.replay_dest->last_inode_alloc = highest_inode; 3017 } 3018 3019 key.offset = found_key.offset - 1; 3020 wc.replay_dest->log_root = NULL; 3021 free_extent_buffer(log->node); 3022 free_extent_buffer(log->commit_root); 3023 kfree(log); 3024 3025 if (found_key.offset == 0) 3026 break; 3027 } 3028 btrfs_release_path(log_root_tree, path); 3029 3030 /* step one is to pin it all, step two is to replay just inodes */ 3031 if (wc.pin) { 3032 wc.pin = 0; 3033 wc.process_func = replay_one_buffer; 3034 wc.stage = LOG_WALK_REPLAY_INODES; 3035 goto again; 3036 } 3037 /* step three is to replay everything */ 3038 if (wc.stage < LOG_WALK_REPLAY_ALL) { 3039 wc.stage++; 3040 goto again; 3041 } 3042 3043 btrfs_free_path(path); 3044 3045 free_extent_buffer(log_root_tree->node); 3046 log_root_tree->log_root = NULL; 3047 fs_info->log_root_recovering = 0; 3048 3049 /* step 4: commit the transaction, which also unpins the blocks */ 3050 btrfs_commit_transaction(trans, fs_info->tree_root); 3051 3052 kfree(log_root_tree); 3053 return 0; 3054 } 3055 3056 /* 3057 * there are some corner cases where we want to force a full 3058 * commit instead of allowing a directory to be logged. 3059 * 3060 * They revolve around files there were unlinked from the directory, and 3061 * this function updates the parent directory so that a full commit is 3062 * properly done if it is fsync'd later after the unlinks are done. 3063 */ 3064 void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans, 3065 struct inode *dir, struct inode *inode, 3066 int for_rename) 3067 { 3068 /* 3069 * when we're logging a file, if it hasn't been renamed 3070 * or unlinked, and its inode is fully committed on disk, 3071 * we don't have to worry about walking up the directory chain 3072 * to log its parents. 3073 * 3074 * So, we use the last_unlink_trans field to put this transid 3075 * into the file. When the file is logged we check it and 3076 * don't log the parents if the file is fully on disk. 3077 */ 3078 if (S_ISREG(inode->i_mode)) 3079 BTRFS_I(inode)->last_unlink_trans = trans->transid; 3080 3081 /* 3082 * if this directory was already logged any new 3083 * names for this file/dir will get recorded 3084 */ 3085 smp_mb(); 3086 if (BTRFS_I(dir)->logged_trans == trans->transid) 3087 return; 3088 3089 /* 3090 * if the inode we're about to unlink was logged, 3091 * the log will be properly updated for any new names 3092 */ 3093 if (BTRFS_I(inode)->logged_trans == trans->transid) 3094 return; 3095 3096 /* 3097 * when renaming files across directories, if the directory 3098 * there we're unlinking from gets fsync'd later on, there's 3099 * no way to find the destination directory later and fsync it 3100 * properly. So, we have to be conservative and force commits 3101 * so the new name gets discovered. 3102 */ 3103 if (for_rename) 3104 goto record; 3105 3106 /* we can safely do the unlink without any special recording */ 3107 return; 3108 3109 record: 3110 BTRFS_I(dir)->last_unlink_trans = trans->transid; 3111 } 3112 3113 /* 3114 * Call this after adding a new name for a file and it will properly 3115 * update the log to reflect the new name. 3116 * 3117 * It will return zero if all goes well, and it will return 1 if a 3118 * full transaction commit is required. 3119 */ 3120 int btrfs_log_new_name(struct btrfs_trans_handle *trans, 3121 struct inode *inode, struct inode *old_dir, 3122 struct dentry *parent) 3123 { 3124 struct btrfs_root * root = BTRFS_I(inode)->root; 3125 3126 /* 3127 * this will force the logging code to walk the dentry chain 3128 * up for the file 3129 */ 3130 if (S_ISREG(inode->i_mode)) 3131 BTRFS_I(inode)->last_unlink_trans = trans->transid; 3132 3133 /* 3134 * if this inode hasn't been logged and directory we're renaming it 3135 * from hasn't been logged, we don't need to log it 3136 */ 3137 if (BTRFS_I(inode)->logged_trans <= 3138 root->fs_info->last_trans_committed && 3139 (!old_dir || BTRFS_I(old_dir)->logged_trans <= 3140 root->fs_info->last_trans_committed)) 3141 return 0; 3142 3143 return btrfs_log_inode_parent(trans, root, inode, parent, 1); 3144 } 3145 3146