/*
 * Copyright (C) 2008 Oracle. All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 021110-1307, USA.
 */

#include <linux/sched.h>
#include "ctree.h"
#include "transaction.h"
#include "disk-io.h"
#include "locking.h"
#include "print-tree.h"
#include "compat.h"
#include "tree-log.h"

/* magic values for the inode_only field in btrfs_log_inode:
 *
 * LOG_INODE_ALL means to log everything
 * LOG_INODE_EXISTS means to log just enough to recreate the inode
 * during log replay
 */
#define LOG_INODE_ALL 0
#define LOG_INODE_EXISTS 1

/*
 * directory trouble cases
 *
 * 1) on rename or unlink, if the inode being unlinked isn't in the fsync
 * log, we must force a full commit before doing an fsync of the directory
 * where the unlink was done.
 * ---> record transid of last unlink/rename per directory
 *
 * mkdir foo/some_dir
 * normal commit
 * rename foo/some_dir foo2/some_dir
 * mkdir foo/some_dir
 * fsync foo/some_dir/some_file
 *
 * The fsync above will unlink the original some_dir without recording
 * it in its new location (foo2). After a crash, some_dir will be gone
 * unless the fsync of some_file forces a full commit
 *
 * 2) we must log any new names for any file or dir that is in the fsync
 * log. ---> check inode while renaming/linking.
 *
 * 2a) we must log any new names for any file or dir during rename
 * when the directory they are being removed from was logged.
 * ---> check inode and old parent dir during rename
 *
 * 2a is actually the more important variant. Without the extra logging
 * a crash might unlink the old name without recreating the new one
 *
 * 3) after a crash, we must go through any directories with a link count
 * of zero and redo the rm -rf
 *
 * mkdir f1/foo
 * normal commit
 * rm -rf f1/foo
 * fsync(f1)
 *
 * The directory f1/foo was fully removed from the FS, but fsync was never
 * called on f1/foo, only its parent dir (f1). After a crash the rm -rf must
 * be replayed. This must be able to recurse down the entire
 * directory tree. The inode link count fixup code takes care of the
 * ugly details.
 */

/*
 * stages for the tree walking. The first
 * stage (0) is to only pin down the blocks we find,
 * the second stage (1) is to make sure that all the inodes
 * we find in the log are created in the subvolume.
 *
 * The last stage is to deal with directories and links and extents
 * and all the other fun semantics
 */
#define LOG_WALK_PIN_ONLY 0
#define LOG_WALK_REPLAY_INODES 1
#define LOG_WALK_REPLAY_ALL 2

/* forward declarations for helpers that are used before they are defined */
static int btrfs_log_inode(struct btrfs_trans_handle *trans,
			   struct btrfs_root *root, struct inode *inode,
			   int inode_only);
static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root,
			     struct btrfs_path *path, u64 objectid);
static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
				       struct btrfs_root *root,
				       struct btrfs_root *log,
				       struct btrfs_path *path,
				       u64 dirid, int del_all);

/*
 * tree logging is a special write ahead log used to make sure that
 * fsyncs and O_SYNCs can happen without doing full tree commits.
108 * 109 * Full tree commits are expensive because they require commonly 110 * modified blocks to be recowed, creating many dirty pages in the 111 * extent tree an 4x-6x higher write load than ext3. 112 * 113 * Instead of doing a tree commit on every fsync, we use the 114 * key ranges and transaction ids to find items for a given file or directory 115 * that have changed in this transaction. Those items are copied into 116 * a special tree (one per subvolume root), that tree is written to disk 117 * and then the fsync is considered complete. 118 * 119 * After a crash, items are copied out of the log-tree back into the 120 * subvolume tree. Any file data extents found are recorded in the extent 121 * allocation tree, and the log-tree freed. 122 * 123 * The log tree is read three times, once to pin down all the extents it is 124 * using in ram and once, once to create all the inodes logged in the tree 125 * and once to do all the other items. 126 */ 127 128 /* 129 * start a sub transaction and setup the log tree 130 * this increments the log tree writer count to make the people 131 * syncing the tree wait for us to finish 132 */ 133 static int start_log_trans(struct btrfs_trans_handle *trans, 134 struct btrfs_root *root) 135 { 136 int ret; 137 138 mutex_lock(&root->log_mutex); 139 if (root->log_root) { 140 if (!root->log_start_pid) { 141 root->log_start_pid = current->pid; 142 root->log_multiple_pids = false; 143 } else if (root->log_start_pid != current->pid) { 144 root->log_multiple_pids = true; 145 } 146 147 root->log_batch++; 148 atomic_inc(&root->log_writers); 149 mutex_unlock(&root->log_mutex); 150 return 0; 151 } 152 root->log_multiple_pids = false; 153 root->log_start_pid = current->pid; 154 mutex_lock(&root->fs_info->tree_log_mutex); 155 if (!root->fs_info->log_root_tree) { 156 ret = btrfs_init_log_root_tree(trans, root->fs_info); 157 BUG_ON(ret); 158 } 159 if (!root->log_root) { 160 ret = btrfs_add_log_tree(trans, root); 161 BUG_ON(ret); 162 } 163 
mutex_unlock(&root->fs_info->tree_log_mutex); 164 root->log_batch++; 165 atomic_inc(&root->log_writers); 166 mutex_unlock(&root->log_mutex); 167 return 0; 168 } 169 170 /* 171 * returns 0 if there was a log transaction running and we were able 172 * to join, or returns -ENOENT if there were not transactions 173 * in progress 174 */ 175 static int join_running_log_trans(struct btrfs_root *root) 176 { 177 int ret = -ENOENT; 178 179 smp_mb(); 180 if (!root->log_root) 181 return -ENOENT; 182 183 mutex_lock(&root->log_mutex); 184 if (root->log_root) { 185 ret = 0; 186 atomic_inc(&root->log_writers); 187 } 188 mutex_unlock(&root->log_mutex); 189 return ret; 190 } 191 192 /* 193 * This either makes the current running log transaction wait 194 * until you call btrfs_end_log_trans() or it makes any future 195 * log transactions wait until you call btrfs_end_log_trans() 196 */ 197 int btrfs_pin_log_trans(struct btrfs_root *root) 198 { 199 int ret = -ENOENT; 200 201 mutex_lock(&root->log_mutex); 202 atomic_inc(&root->log_writers); 203 mutex_unlock(&root->log_mutex); 204 return ret; 205 } 206 207 /* 208 * indicate we're done making changes to the log tree 209 * and wake up anyone waiting to do a sync 210 */ 211 int btrfs_end_log_trans(struct btrfs_root *root) 212 { 213 if (atomic_dec_and_test(&root->log_writers)) { 214 smp_mb(); 215 if (waitqueue_active(&root->log_writer_wait)) 216 wake_up(&root->log_writer_wait); 217 } 218 return 0; 219 } 220 221 222 /* 223 * the walk control struct is used to pass state down the chain when 224 * processing the log tree. The stage field tells us which part 225 * of the log tree processing we are currently doing. The others 226 * are state fields used for that specific part 227 */ 228 struct walk_control { 229 /* should we free the extent on disk when done? This is used 230 * at transaction commit time while freeing a log tree 231 */ 232 int free; 233 234 /* should we write out the extent buffer? 
This is used 235 * while flushing the log tree to disk during a sync 236 */ 237 int write; 238 239 /* should we wait for the extent buffer io to finish? Also used 240 * while flushing the log tree to disk for a sync 241 */ 242 int wait; 243 244 /* pin only walk, we record which extents on disk belong to the 245 * log trees 246 */ 247 int pin; 248 249 /* what stage of the replay code we're currently in */ 250 int stage; 251 252 /* the root we are currently replaying */ 253 struct btrfs_root *replay_dest; 254 255 /* the trans handle for the current replay */ 256 struct btrfs_trans_handle *trans; 257 258 /* the function that gets used to process blocks we find in the 259 * tree. Note the extent_buffer might not be up to date when it is 260 * passed in, and it must be checked or read if you need the data 261 * inside it 262 */ 263 int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb, 264 struct walk_control *wc, u64 gen); 265 }; 266 267 /* 268 * process_func used to pin down extents, write them or wait on them 269 */ 270 static int process_one_buffer(struct btrfs_root *log, 271 struct extent_buffer *eb, 272 struct walk_control *wc, u64 gen) 273 { 274 if (wc->pin) 275 btrfs_pin_extent(log->fs_info->extent_root, 276 eb->start, eb->len, 0); 277 278 if (btrfs_buffer_uptodate(eb, gen)) { 279 if (wc->write) 280 btrfs_write_tree_block(eb); 281 if (wc->wait) 282 btrfs_wait_tree_block_writeback(eb); 283 } 284 return 0; 285 } 286 287 /* 288 * Item overwrite used by replay and tree logging. eb, slot and key all refer 289 * to the src data we are copying out. 290 * 291 * root is the tree we are copying into, and path is a scratch 292 * path for use in this function (it should be released on entry and 293 * will be released on exit). 294 * 295 * If the key is already in the destination tree the existing item is 296 * overwritten. If the existing item isn't big enough, it is extended. 297 * If it is too large, it is truncated. 
298 * 299 * If the key isn't in the destination yet, a new item is inserted. 300 */ 301 static noinline int overwrite_item(struct btrfs_trans_handle *trans, 302 struct btrfs_root *root, 303 struct btrfs_path *path, 304 struct extent_buffer *eb, int slot, 305 struct btrfs_key *key) 306 { 307 int ret; 308 u32 item_size; 309 u64 saved_i_size = 0; 310 int save_old_i_size = 0; 311 unsigned long src_ptr; 312 unsigned long dst_ptr; 313 int overwrite_root = 0; 314 315 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) 316 overwrite_root = 1; 317 318 item_size = btrfs_item_size_nr(eb, slot); 319 src_ptr = btrfs_item_ptr_offset(eb, slot); 320 321 /* look for the key in the destination tree */ 322 ret = btrfs_search_slot(NULL, root, key, path, 0, 0); 323 if (ret == 0) { 324 char *src_copy; 325 char *dst_copy; 326 u32 dst_size = btrfs_item_size_nr(path->nodes[0], 327 path->slots[0]); 328 if (dst_size != item_size) 329 goto insert; 330 331 if (item_size == 0) { 332 btrfs_release_path(root, path); 333 return 0; 334 } 335 dst_copy = kmalloc(item_size, GFP_NOFS); 336 src_copy = kmalloc(item_size, GFP_NOFS); 337 338 read_extent_buffer(eb, src_copy, src_ptr, item_size); 339 340 dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); 341 read_extent_buffer(path->nodes[0], dst_copy, dst_ptr, 342 item_size); 343 ret = memcmp(dst_copy, src_copy, item_size); 344 345 kfree(dst_copy); 346 kfree(src_copy); 347 /* 348 * they have the same contents, just return, this saves 349 * us from cowing blocks in the destination tree and doing 350 * extra writes that may not have been done by a previous 351 * sync 352 */ 353 if (ret == 0) { 354 btrfs_release_path(root, path); 355 return 0; 356 } 357 358 } 359 insert: 360 btrfs_release_path(root, path); 361 /* try to insert the key into the destination tree */ 362 ret = btrfs_insert_empty_item(trans, root, path, 363 key, item_size); 364 365 /* make sure any existing item is the correct size */ 366 if (ret == -EEXIST) { 367 u32 found_size; 
368 found_size = btrfs_item_size_nr(path->nodes[0], 369 path->slots[0]); 370 if (found_size > item_size) { 371 btrfs_truncate_item(trans, root, path, item_size, 1); 372 } else if (found_size < item_size) { 373 ret = btrfs_extend_item(trans, root, path, 374 item_size - found_size); 375 BUG_ON(ret); 376 } 377 } else if (ret) { 378 BUG(); 379 } 380 dst_ptr = btrfs_item_ptr_offset(path->nodes[0], 381 path->slots[0]); 382 383 /* don't overwrite an existing inode if the generation number 384 * was logged as zero. This is done when the tree logging code 385 * is just logging an inode to make sure it exists after recovery. 386 * 387 * Also, don't overwrite i_size on directories during replay. 388 * log replay inserts and removes directory items based on the 389 * state of the tree found in the subvolume, and i_size is modified 390 * as it goes 391 */ 392 if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) { 393 struct btrfs_inode_item *src_item; 394 struct btrfs_inode_item *dst_item; 395 396 src_item = (struct btrfs_inode_item *)src_ptr; 397 dst_item = (struct btrfs_inode_item *)dst_ptr; 398 399 if (btrfs_inode_generation(eb, src_item) == 0) 400 goto no_copy; 401 402 if (overwrite_root && 403 S_ISDIR(btrfs_inode_mode(eb, src_item)) && 404 S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) { 405 save_old_i_size = 1; 406 saved_i_size = btrfs_inode_size(path->nodes[0], 407 dst_item); 408 } 409 } 410 411 copy_extent_buffer(path->nodes[0], eb, dst_ptr, 412 src_ptr, item_size); 413 414 if (save_old_i_size) { 415 struct btrfs_inode_item *dst_item; 416 dst_item = (struct btrfs_inode_item *)dst_ptr; 417 btrfs_set_inode_size(path->nodes[0], dst_item, saved_i_size); 418 } 419 420 /* make sure the generation is filled in */ 421 if (key->type == BTRFS_INODE_ITEM_KEY) { 422 struct btrfs_inode_item *dst_item; 423 dst_item = (struct btrfs_inode_item *)dst_ptr; 424 if (btrfs_inode_generation(path->nodes[0], dst_item) == 0) { 425 btrfs_set_inode_generation(path->nodes[0], dst_item, 
426 trans->transid); 427 } 428 } 429 no_copy: 430 btrfs_mark_buffer_dirty(path->nodes[0]); 431 btrfs_release_path(root, path); 432 return 0; 433 } 434 435 /* 436 * simple helper to read an inode off the disk from a given root 437 * This can only be called for subvolume roots and not for the log 438 */ 439 static noinline struct inode *read_one_inode(struct btrfs_root *root, 440 u64 objectid) 441 { 442 struct btrfs_key key; 443 struct inode *inode; 444 445 key.objectid = objectid; 446 key.type = BTRFS_INODE_ITEM_KEY; 447 key.offset = 0; 448 inode = btrfs_iget(root->fs_info->sb, &key, root); 449 if (IS_ERR(inode)) { 450 inode = NULL; 451 } else if (is_bad_inode(inode)) { 452 iput(inode); 453 inode = NULL; 454 } 455 return inode; 456 } 457 458 /* replays a single extent in 'eb' at 'slot' with 'key' into the 459 * subvolume 'root'. path is released on entry and should be released 460 * on exit. 461 * 462 * extents in the log tree have not been allocated out of the extent 463 * tree yet. So, this completes the allocation, taking a reference 464 * as required if the extent already exists or creating a new extent 465 * if it isn't in the extent allocation tree yet. 466 * 467 * The extent is inserted into the file, dropping any existing extents 468 * from the file that overlap the new one. 
469 */ 470 static noinline int replay_one_extent(struct btrfs_trans_handle *trans, 471 struct btrfs_root *root, 472 struct btrfs_path *path, 473 struct extent_buffer *eb, int slot, 474 struct btrfs_key *key) 475 { 476 int found_type; 477 u64 mask = root->sectorsize - 1; 478 u64 extent_end; 479 u64 alloc_hint; 480 u64 start = key->offset; 481 u64 saved_nbytes; 482 struct btrfs_file_extent_item *item; 483 struct inode *inode = NULL; 484 unsigned long size; 485 int ret = 0; 486 487 item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item); 488 found_type = btrfs_file_extent_type(eb, item); 489 490 if (found_type == BTRFS_FILE_EXTENT_REG || 491 found_type == BTRFS_FILE_EXTENT_PREALLOC) 492 extent_end = start + btrfs_file_extent_num_bytes(eb, item); 493 else if (found_type == BTRFS_FILE_EXTENT_INLINE) { 494 size = btrfs_file_extent_inline_len(eb, item); 495 extent_end = (start + size + mask) & ~mask; 496 } else { 497 ret = 0; 498 goto out; 499 } 500 501 inode = read_one_inode(root, key->objectid); 502 if (!inode) { 503 ret = -EIO; 504 goto out; 505 } 506 507 /* 508 * first check to see if we already have this extent in the 509 * file. This must be done before the btrfs_drop_extents run 510 * so we don't try to drop this extent. 
511 */ 512 ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino, 513 start, 0); 514 515 if (ret == 0 && 516 (found_type == BTRFS_FILE_EXTENT_REG || 517 found_type == BTRFS_FILE_EXTENT_PREALLOC)) { 518 struct btrfs_file_extent_item cmp1; 519 struct btrfs_file_extent_item cmp2; 520 struct btrfs_file_extent_item *existing; 521 struct extent_buffer *leaf; 522 523 leaf = path->nodes[0]; 524 existing = btrfs_item_ptr(leaf, path->slots[0], 525 struct btrfs_file_extent_item); 526 527 read_extent_buffer(eb, &cmp1, (unsigned long)item, 528 sizeof(cmp1)); 529 read_extent_buffer(leaf, &cmp2, (unsigned long)existing, 530 sizeof(cmp2)); 531 532 /* 533 * we already have a pointer to this exact extent, 534 * we don't have to do anything 535 */ 536 if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) { 537 btrfs_release_path(root, path); 538 goto out; 539 } 540 } 541 btrfs_release_path(root, path); 542 543 saved_nbytes = inode_get_bytes(inode); 544 /* drop any overlapping extents */ 545 ret = btrfs_drop_extents(trans, root, inode, 546 start, extent_end, extent_end, start, &alloc_hint, 1); 547 BUG_ON(ret); 548 549 if (found_type == BTRFS_FILE_EXTENT_REG || 550 found_type == BTRFS_FILE_EXTENT_PREALLOC) { 551 u64 offset; 552 unsigned long dest_offset; 553 struct btrfs_key ins; 554 555 ret = btrfs_insert_empty_item(trans, root, path, key, 556 sizeof(*item)); 557 BUG_ON(ret); 558 dest_offset = btrfs_item_ptr_offset(path->nodes[0], 559 path->slots[0]); 560 copy_extent_buffer(path->nodes[0], eb, dest_offset, 561 (unsigned long)item, sizeof(*item)); 562 563 ins.objectid = btrfs_file_extent_disk_bytenr(eb, item); 564 ins.offset = btrfs_file_extent_disk_num_bytes(eb, item); 565 ins.type = BTRFS_EXTENT_ITEM_KEY; 566 offset = key->offset - btrfs_file_extent_offset(eb, item); 567 568 if (ins.objectid > 0) { 569 u64 csum_start; 570 u64 csum_end; 571 LIST_HEAD(ordered_sums); 572 /* 573 * is this extent already allocated in the extent 574 * allocation tree? 
If so, just add a reference 575 */ 576 ret = btrfs_lookup_extent(root, ins.objectid, 577 ins.offset); 578 if (ret == 0) { 579 ret = btrfs_inc_extent_ref(trans, root, 580 ins.objectid, ins.offset, 581 0, root->root_key.objectid, 582 key->objectid, offset); 583 } else { 584 /* 585 * insert the extent pointer in the extent 586 * allocation tree 587 */ 588 ret = btrfs_alloc_logged_file_extent(trans, 589 root, root->root_key.objectid, 590 key->objectid, offset, &ins); 591 BUG_ON(ret); 592 } 593 btrfs_release_path(root, path); 594 595 if (btrfs_file_extent_compression(eb, item)) { 596 csum_start = ins.objectid; 597 csum_end = csum_start + ins.offset; 598 } else { 599 csum_start = ins.objectid + 600 btrfs_file_extent_offset(eb, item); 601 csum_end = csum_start + 602 btrfs_file_extent_num_bytes(eb, item); 603 } 604 605 ret = btrfs_lookup_csums_range(root->log_root, 606 csum_start, csum_end - 1, 607 &ordered_sums); 608 BUG_ON(ret); 609 while (!list_empty(&ordered_sums)) { 610 struct btrfs_ordered_sum *sums; 611 sums = list_entry(ordered_sums.next, 612 struct btrfs_ordered_sum, 613 list); 614 ret = btrfs_csum_file_blocks(trans, 615 root->fs_info->csum_root, 616 sums); 617 BUG_ON(ret); 618 list_del(&sums->list); 619 kfree(sums); 620 } 621 } else { 622 btrfs_release_path(root, path); 623 } 624 } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { 625 /* inline extents are easy, we just overwrite them */ 626 ret = overwrite_item(trans, root, path, eb, slot, key); 627 BUG_ON(ret); 628 } 629 630 inode_set_bytes(inode, saved_nbytes); 631 btrfs_update_inode(trans, root, inode); 632 out: 633 if (inode) 634 iput(inode); 635 return ret; 636 } 637 638 /* 639 * when cleaning up conflicts between the directory names in the 640 * subvolume, directory names in the log and directory names in the 641 * inode back references, we may have to unlink inodes from directories. 
642 * 643 * This is a helper function to do the unlink of a specific directory 644 * item 645 */ 646 static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans, 647 struct btrfs_root *root, 648 struct btrfs_path *path, 649 struct inode *dir, 650 struct btrfs_dir_item *di) 651 { 652 struct inode *inode; 653 char *name; 654 int name_len; 655 struct extent_buffer *leaf; 656 struct btrfs_key location; 657 int ret; 658 659 leaf = path->nodes[0]; 660 661 btrfs_dir_item_key_to_cpu(leaf, di, &location); 662 name_len = btrfs_dir_name_len(leaf, di); 663 name = kmalloc(name_len, GFP_NOFS); 664 read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len); 665 btrfs_release_path(root, path); 666 667 inode = read_one_inode(root, location.objectid); 668 BUG_ON(!inode); 669 670 ret = link_to_fixup_dir(trans, root, path, location.objectid); 671 BUG_ON(ret); 672 673 ret = btrfs_unlink_inode(trans, root, dir, inode, name, name_len); 674 BUG_ON(ret); 675 kfree(name); 676 677 iput(inode); 678 return ret; 679 } 680 681 /* 682 * helper function to see if a given name and sequence number found 683 * in an inode back reference are already in a directory and correctly 684 * point to this inode 685 */ 686 static noinline int inode_in_dir(struct btrfs_root *root, 687 struct btrfs_path *path, 688 u64 dirid, u64 objectid, u64 index, 689 const char *name, int name_len) 690 { 691 struct btrfs_dir_item *di; 692 struct btrfs_key location; 693 int match = 0; 694 695 di = btrfs_lookup_dir_index_item(NULL, root, path, dirid, 696 index, name, name_len, 0); 697 if (di && !IS_ERR(di)) { 698 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location); 699 if (location.objectid != objectid) 700 goto out; 701 } else 702 goto out; 703 btrfs_release_path(root, path); 704 705 di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 0); 706 if (di && !IS_ERR(di)) { 707 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location); 708 if (location.objectid != objectid) 709 goto out; 710 } 
else 711 goto out; 712 match = 1; 713 out: 714 btrfs_release_path(root, path); 715 return match; 716 } 717 718 /* 719 * helper function to check a log tree for a named back reference in 720 * an inode. This is used to decide if a back reference that is 721 * found in the subvolume conflicts with what we find in the log. 722 * 723 * inode backreferences may have multiple refs in a single item, 724 * during replay we process one reference at a time, and we don't 725 * want to delete valid links to a file from the subvolume if that 726 * link is also in the log. 727 */ 728 static noinline int backref_in_log(struct btrfs_root *log, 729 struct btrfs_key *key, 730 char *name, int namelen) 731 { 732 struct btrfs_path *path; 733 struct btrfs_inode_ref *ref; 734 unsigned long ptr; 735 unsigned long ptr_end; 736 unsigned long name_ptr; 737 int found_name_len; 738 int item_size; 739 int ret; 740 int match = 0; 741 742 path = btrfs_alloc_path(); 743 ret = btrfs_search_slot(NULL, log, key, path, 0, 0); 744 if (ret != 0) 745 goto out; 746 747 item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]); 748 ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); 749 ptr_end = ptr + item_size; 750 while (ptr < ptr_end) { 751 ref = (struct btrfs_inode_ref *)ptr; 752 found_name_len = btrfs_inode_ref_name_len(path->nodes[0], ref); 753 if (found_name_len == namelen) { 754 name_ptr = (unsigned long)(ref + 1); 755 ret = memcmp_extent_buffer(path->nodes[0], name, 756 name_ptr, namelen); 757 if (ret == 0) { 758 match = 1; 759 goto out; 760 } 761 } 762 ptr = (unsigned long)(ref + 1) + found_name_len; 763 } 764 out: 765 btrfs_free_path(path); 766 return match; 767 } 768 769 770 /* 771 * replay one inode back reference item found in the log tree. 772 * eb, slot and key refer to the buffer and key found in the log tree. 773 * root is the destination we are replaying into, and path is for temp 774 * use by this function. (it should be released on return). 
775 */ 776 static noinline int add_inode_ref(struct btrfs_trans_handle *trans, 777 struct btrfs_root *root, 778 struct btrfs_root *log, 779 struct btrfs_path *path, 780 struct extent_buffer *eb, int slot, 781 struct btrfs_key *key) 782 { 783 struct inode *dir; 784 int ret; 785 struct btrfs_key location; 786 struct btrfs_inode_ref *ref; 787 struct btrfs_dir_item *di; 788 struct inode *inode; 789 char *name; 790 int namelen; 791 unsigned long ref_ptr; 792 unsigned long ref_end; 793 794 location.objectid = key->objectid; 795 location.type = BTRFS_INODE_ITEM_KEY; 796 location.offset = 0; 797 798 /* 799 * it is possible that we didn't log all the parent directories 800 * for a given inode. If we don't find the dir, just don't 801 * copy the back ref in. The link count fixup code will take 802 * care of the rest 803 */ 804 dir = read_one_inode(root, key->offset); 805 if (!dir) 806 return -ENOENT; 807 808 inode = read_one_inode(root, key->objectid); 809 BUG_ON(!inode); 810 811 ref_ptr = btrfs_item_ptr_offset(eb, slot); 812 ref_end = ref_ptr + btrfs_item_size_nr(eb, slot); 813 814 again: 815 ref = (struct btrfs_inode_ref *)ref_ptr; 816 817 namelen = btrfs_inode_ref_name_len(eb, ref); 818 name = kmalloc(namelen, GFP_NOFS); 819 BUG_ON(!name); 820 821 read_extent_buffer(eb, name, (unsigned long)(ref + 1), namelen); 822 823 /* if we already have a perfect match, we're done */ 824 if (inode_in_dir(root, path, dir->i_ino, inode->i_ino, 825 btrfs_inode_ref_index(eb, ref), 826 name, namelen)) { 827 goto out; 828 } 829 830 /* 831 * look for a conflicting back reference in the metadata. 832 * if we find one we have to unlink that name of the file 833 * before we add our new link. Later on, we overwrite any 834 * existing back reference, and we don't want to create 835 * dangling pointers in the directory. 
836 */ 837 conflict_again: 838 ret = btrfs_search_slot(NULL, root, key, path, 0, 0); 839 if (ret == 0) { 840 char *victim_name; 841 int victim_name_len; 842 struct btrfs_inode_ref *victim_ref; 843 unsigned long ptr; 844 unsigned long ptr_end; 845 struct extent_buffer *leaf = path->nodes[0]; 846 847 /* are we trying to overwrite a back ref for the root directory 848 * if so, just jump out, we're done 849 */ 850 if (key->objectid == key->offset) 851 goto out_nowrite; 852 853 /* check all the names in this back reference to see 854 * if they are in the log. if so, we allow them to stay 855 * otherwise they must be unlinked as a conflict 856 */ 857 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); 858 ptr_end = ptr + btrfs_item_size_nr(leaf, path->slots[0]); 859 while (ptr < ptr_end) { 860 victim_ref = (struct btrfs_inode_ref *)ptr; 861 victim_name_len = btrfs_inode_ref_name_len(leaf, 862 victim_ref); 863 victim_name = kmalloc(victim_name_len, GFP_NOFS); 864 BUG_ON(!victim_name); 865 866 read_extent_buffer(leaf, victim_name, 867 (unsigned long)(victim_ref + 1), 868 victim_name_len); 869 870 if (!backref_in_log(log, key, victim_name, 871 victim_name_len)) { 872 btrfs_inc_nlink(inode); 873 btrfs_release_path(root, path); 874 875 ret = btrfs_unlink_inode(trans, root, dir, 876 inode, victim_name, 877 victim_name_len); 878 kfree(victim_name); 879 btrfs_release_path(root, path); 880 goto conflict_again; 881 } 882 kfree(victim_name); 883 ptr = (unsigned long)(victim_ref + 1) + victim_name_len; 884 } 885 BUG_ON(ret); 886 } 887 btrfs_release_path(root, path); 888 889 /* look for a conflicting sequence number */ 890 di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino, 891 btrfs_inode_ref_index(eb, ref), 892 name, namelen, 0); 893 if (di && !IS_ERR(di)) { 894 ret = drop_one_dir_item(trans, root, path, dir, di); 895 BUG_ON(ret); 896 } 897 btrfs_release_path(root, path); 898 899 900 /* look for a conflicting name */ 901 di = btrfs_lookup_dir_item(trans, root, path, 
dir->i_ino, 902 name, namelen, 0); 903 if (di && !IS_ERR(di)) { 904 ret = drop_one_dir_item(trans, root, path, dir, di); 905 BUG_ON(ret); 906 } 907 btrfs_release_path(root, path); 908 909 /* insert our name */ 910 ret = btrfs_add_link(trans, dir, inode, name, namelen, 0, 911 btrfs_inode_ref_index(eb, ref)); 912 BUG_ON(ret); 913 914 btrfs_update_inode(trans, root, inode); 915 916 out: 917 ref_ptr = (unsigned long)(ref + 1) + namelen; 918 kfree(name); 919 if (ref_ptr < ref_end) 920 goto again; 921 922 /* finally write the back reference in the inode */ 923 ret = overwrite_item(trans, root, path, eb, slot, key); 924 BUG_ON(ret); 925 926 out_nowrite: 927 btrfs_release_path(root, path); 928 iput(dir); 929 iput(inode); 930 return 0; 931 } 932 933 /* 934 * There are a few corners where the link count of the file can't 935 * be properly maintained during replay. So, instead of adding 936 * lots of complexity to the log code, we just scan the backrefs 937 * for any file that has been through replay. 938 * 939 * The scan will update the link count on the inode to reflect the 940 * number of back refs found. If it goes down to zero, the iput 941 * will free the inode. 
942 */ 943 static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans, 944 struct btrfs_root *root, 945 struct inode *inode) 946 { 947 struct btrfs_path *path; 948 int ret; 949 struct btrfs_key key; 950 u64 nlink = 0; 951 unsigned long ptr; 952 unsigned long ptr_end; 953 int name_len; 954 955 key.objectid = inode->i_ino; 956 key.type = BTRFS_INODE_REF_KEY; 957 key.offset = (u64)-1; 958 959 path = btrfs_alloc_path(); 960 961 while (1) { 962 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 963 if (ret < 0) 964 break; 965 if (ret > 0) { 966 if (path->slots[0] == 0) 967 break; 968 path->slots[0]--; 969 } 970 btrfs_item_key_to_cpu(path->nodes[0], &key, 971 path->slots[0]); 972 if (key.objectid != inode->i_ino || 973 key.type != BTRFS_INODE_REF_KEY) 974 break; 975 ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); 976 ptr_end = ptr + btrfs_item_size_nr(path->nodes[0], 977 path->slots[0]); 978 while (ptr < ptr_end) { 979 struct btrfs_inode_ref *ref; 980 981 ref = (struct btrfs_inode_ref *)ptr; 982 name_len = btrfs_inode_ref_name_len(path->nodes[0], 983 ref); 984 ptr = (unsigned long)(ref + 1) + name_len; 985 nlink++; 986 } 987 988 if (key.offset == 0) 989 break; 990 key.offset--; 991 btrfs_release_path(root, path); 992 } 993 btrfs_release_path(root, path); 994 if (nlink != inode->i_nlink) { 995 inode->i_nlink = nlink; 996 btrfs_update_inode(trans, root, inode); 997 } 998 BTRFS_I(inode)->index_cnt = (u64)-1; 999 1000 if (inode->i_nlink == 0 && S_ISDIR(inode->i_mode)) { 1001 ret = replay_dir_deletes(trans, root, NULL, path, 1002 inode->i_ino, 1); 1003 BUG_ON(ret); 1004 } 1005 btrfs_free_path(path); 1006 1007 return 0; 1008 } 1009 1010 static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans, 1011 struct btrfs_root *root, 1012 struct btrfs_path *path) 1013 { 1014 int ret; 1015 struct btrfs_key key; 1016 struct inode *inode; 1017 1018 key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID; 1019 key.type = BTRFS_ORPHAN_ITEM_KEY; 1020 
key.offset = (u64)-1; 1021 while (1) { 1022 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 1023 if (ret < 0) 1024 break; 1025 1026 if (ret == 1) { 1027 if (path->slots[0] == 0) 1028 break; 1029 path->slots[0]--; 1030 } 1031 1032 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 1033 if (key.objectid != BTRFS_TREE_LOG_FIXUP_OBJECTID || 1034 key.type != BTRFS_ORPHAN_ITEM_KEY) 1035 break; 1036 1037 ret = btrfs_del_item(trans, root, path); 1038 BUG_ON(ret); 1039 1040 btrfs_release_path(root, path); 1041 inode = read_one_inode(root, key.offset); 1042 BUG_ON(!inode); 1043 1044 ret = fixup_inode_link_count(trans, root, inode); 1045 BUG_ON(ret); 1046 1047 iput(inode); 1048 1049 /* 1050 * fixup on a directory may create new entries, 1051 * make sure we always look for the highset possible 1052 * offset 1053 */ 1054 key.offset = (u64)-1; 1055 } 1056 btrfs_release_path(root, path); 1057 return 0; 1058 } 1059 1060 1061 /* 1062 * record a given inode in the fixup dir so we can check its link 1063 * count when replay is done. 
The link count is incremented here 1064 * so the inode won't go away until we check it 1065 */ 1066 static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans, 1067 struct btrfs_root *root, 1068 struct btrfs_path *path, 1069 u64 objectid) 1070 { 1071 struct btrfs_key key; 1072 int ret = 0; 1073 struct inode *inode; 1074 1075 inode = read_one_inode(root, objectid); 1076 BUG_ON(!inode); 1077 1078 key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID; 1079 btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY); 1080 key.offset = objectid; 1081 1082 ret = btrfs_insert_empty_item(trans, root, path, &key, 0); 1083 1084 btrfs_release_path(root, path); 1085 if (ret == 0) { 1086 btrfs_inc_nlink(inode); 1087 btrfs_update_inode(trans, root, inode); 1088 } else if (ret == -EEXIST) { 1089 ret = 0; 1090 } else { 1091 BUG(); 1092 } 1093 iput(inode); 1094 1095 return ret; 1096 } 1097 1098 /* 1099 * when replaying the log for a directory, we only insert names 1100 * for inodes that actually exist. This means an fsync on a directory 1101 * does not implicitly fsync all the new files in it 1102 */ 1103 static noinline int insert_one_name(struct btrfs_trans_handle *trans, 1104 struct btrfs_root *root, 1105 struct btrfs_path *path, 1106 u64 dirid, u64 index, 1107 char *name, int name_len, u8 type, 1108 struct btrfs_key *location) 1109 { 1110 struct inode *inode; 1111 struct inode *dir; 1112 int ret; 1113 1114 inode = read_one_inode(root, location->objectid); 1115 if (!inode) 1116 return -ENOENT; 1117 1118 dir = read_one_inode(root, dirid); 1119 if (!dir) { 1120 iput(inode); 1121 return -EIO; 1122 } 1123 ret = btrfs_add_link(trans, dir, inode, name, name_len, 1, index); 1124 1125 /* FIXME, put inode into FIXUP list */ 1126 1127 iput(inode); 1128 iput(dir); 1129 return ret; 1130 } 1131 1132 /* 1133 * take a single entry in a log directory item and replay it into 1134 * the subvolume. 
1135 * 1136 * if a conflicting item exists in the subdirectory already, 1137 * the inode it points to is unlinked and put into the link count 1138 * fix up tree. 1139 * 1140 * If a name from the log points to a file or directory that does 1141 * not exist in the FS, it is skipped. fsyncs on directories 1142 * do not force down inodes inside that directory, just changes to the 1143 * names or unlinks in a directory. 1144 */ 1145 static noinline int replay_one_name(struct btrfs_trans_handle *trans, 1146 struct btrfs_root *root, 1147 struct btrfs_path *path, 1148 struct extent_buffer *eb, 1149 struct btrfs_dir_item *di, 1150 struct btrfs_key *key) 1151 { 1152 char *name; 1153 int name_len; 1154 struct btrfs_dir_item *dst_di; 1155 struct btrfs_key found_key; 1156 struct btrfs_key log_key; 1157 struct inode *dir; 1158 u8 log_type; 1159 int exists; 1160 int ret; 1161 1162 dir = read_one_inode(root, key->objectid); 1163 BUG_ON(!dir); 1164 1165 name_len = btrfs_dir_name_len(eb, di); 1166 name = kmalloc(name_len, GFP_NOFS); 1167 log_type = btrfs_dir_type(eb, di); 1168 read_extent_buffer(eb, name, (unsigned long)(di + 1), 1169 name_len); 1170 1171 btrfs_dir_item_key_to_cpu(eb, di, &log_key); 1172 exists = btrfs_lookup_inode(trans, root, path, &log_key, 0); 1173 if (exists == 0) 1174 exists = 1; 1175 else 1176 exists = 0; 1177 btrfs_release_path(root, path); 1178 1179 if (key->type == BTRFS_DIR_ITEM_KEY) { 1180 dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid, 1181 name, name_len, 1); 1182 } else if (key->type == BTRFS_DIR_INDEX_KEY) { 1183 dst_di = btrfs_lookup_dir_index_item(trans, root, path, 1184 key->objectid, 1185 key->offset, name, 1186 name_len, 1); 1187 } else { 1188 BUG(); 1189 } 1190 if (!dst_di || IS_ERR(dst_di)) { 1191 /* we need a sequence number to insert, so we only 1192 * do inserts for the BTRFS_DIR_INDEX_KEY types 1193 */ 1194 if (key->type != BTRFS_DIR_INDEX_KEY) 1195 goto out; 1196 goto insert; 1197 } 1198 1199 
btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key); 1200 /* the existing item matches the logged item */ 1201 if (found_key.objectid == log_key.objectid && 1202 found_key.type == log_key.type && 1203 found_key.offset == log_key.offset && 1204 btrfs_dir_type(path->nodes[0], dst_di) == log_type) { 1205 goto out; 1206 } 1207 1208 /* 1209 * don't drop the conflicting directory entry if the inode 1210 * for the new entry doesn't exist 1211 */ 1212 if (!exists) 1213 goto out; 1214 1215 ret = drop_one_dir_item(trans, root, path, dir, dst_di); 1216 BUG_ON(ret); 1217 1218 if (key->type == BTRFS_DIR_INDEX_KEY) 1219 goto insert; 1220 out: 1221 btrfs_release_path(root, path); 1222 kfree(name); 1223 iput(dir); 1224 return 0; 1225 1226 insert: 1227 btrfs_release_path(root, path); 1228 ret = insert_one_name(trans, root, path, key->objectid, key->offset, 1229 name, name_len, log_type, &log_key); 1230 1231 BUG_ON(ret && ret != -ENOENT); 1232 goto out; 1233 } 1234 1235 /* 1236 * find all the names in a directory item and reconcile them into 1237 * the subvolume. 
Only BTRFS_DIR_ITEM_KEY types will have more than
 * one name in a directory item, but the same code gets used for
 * both directory index types
 */
static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
					struct btrfs_root *root,
					struct btrfs_path *path,
					struct extent_buffer *eb, int slot,
					struct btrfs_key *key)
{
	u32 total = btrfs_item_size_nr(eb, slot);
	unsigned long cur = btrfs_item_ptr_offset(eb, slot);
	unsigned long end = cur + total;

	/* each packed entry is a btrfs_dir_item header followed by its name */
	while (cur < end) {
		struct btrfs_dir_item *entry = (struct btrfs_dir_item *)cur;
		int entry_name_len = btrfs_dir_name_len(eb, entry);
		int err;

		err = replay_one_name(trans, root, path, eb, entry, key);
		BUG_ON(err);

		cur = (unsigned long)(entry + 1) + entry_name_len;
	}
	return 0;
}

/*
 * directory replay has two parts.  There are the standard directory
 * items in the log copied from the subvolume, and range items
 * created in the log while the subvolume was logged.
 *
 * The range items tell us which parts of the key space the log
 * is authoritative for.  During replay, if a key in the subvolume
 * directory is in a logged range item, but not actually in the log
 * that means it was deleted from the directory before the fsync
 * and should be removed.
1277 */ 1278 static noinline int find_dir_range(struct btrfs_root *root, 1279 struct btrfs_path *path, 1280 u64 dirid, int key_type, 1281 u64 *start_ret, u64 *end_ret) 1282 { 1283 struct btrfs_key key; 1284 u64 found_end; 1285 struct btrfs_dir_log_item *item; 1286 int ret; 1287 int nritems; 1288 1289 if (*start_ret == (u64)-1) 1290 return 1; 1291 1292 key.objectid = dirid; 1293 key.type = key_type; 1294 key.offset = *start_ret; 1295 1296 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 1297 if (ret < 0) 1298 goto out; 1299 if (ret > 0) { 1300 if (path->slots[0] == 0) 1301 goto out; 1302 path->slots[0]--; 1303 } 1304 if (ret != 0) 1305 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 1306 1307 if (key.type != key_type || key.objectid != dirid) { 1308 ret = 1; 1309 goto next; 1310 } 1311 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 1312 struct btrfs_dir_log_item); 1313 found_end = btrfs_dir_log_end(path->nodes[0], item); 1314 1315 if (*start_ret >= key.offset && *start_ret <= found_end) { 1316 ret = 0; 1317 *start_ret = key.offset; 1318 *end_ret = found_end; 1319 goto out; 1320 } 1321 ret = 1; 1322 next: 1323 /* check the next slot in the tree to see if it is a valid item */ 1324 nritems = btrfs_header_nritems(path->nodes[0]); 1325 if (path->slots[0] >= nritems) { 1326 ret = btrfs_next_leaf(root, path); 1327 if (ret) 1328 goto out; 1329 } else { 1330 path->slots[0]++; 1331 } 1332 1333 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 1334 1335 if (key.type != key_type || key.objectid != dirid) { 1336 ret = 1; 1337 goto out; 1338 } 1339 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 1340 struct btrfs_dir_log_item); 1341 found_end = btrfs_dir_log_end(path->nodes[0], item); 1342 *start_ret = key.offset; 1343 *end_ret = found_end; 1344 ret = 0; 1345 out: 1346 btrfs_release_path(root, path); 1347 return ret; 1348 } 1349 1350 /* 1351 * this looks for a given directory item in the log. 
If the directory 1352 * item is not in the log, the item is removed and the inode it points 1353 * to is unlinked 1354 */ 1355 static noinline int check_item_in_log(struct btrfs_trans_handle *trans, 1356 struct btrfs_root *root, 1357 struct btrfs_root *log, 1358 struct btrfs_path *path, 1359 struct btrfs_path *log_path, 1360 struct inode *dir, 1361 struct btrfs_key *dir_key) 1362 { 1363 int ret; 1364 struct extent_buffer *eb; 1365 int slot; 1366 u32 item_size; 1367 struct btrfs_dir_item *di; 1368 struct btrfs_dir_item *log_di; 1369 int name_len; 1370 unsigned long ptr; 1371 unsigned long ptr_end; 1372 char *name; 1373 struct inode *inode; 1374 struct btrfs_key location; 1375 1376 again: 1377 eb = path->nodes[0]; 1378 slot = path->slots[0]; 1379 item_size = btrfs_item_size_nr(eb, slot); 1380 ptr = btrfs_item_ptr_offset(eb, slot); 1381 ptr_end = ptr + item_size; 1382 while (ptr < ptr_end) { 1383 di = (struct btrfs_dir_item *)ptr; 1384 name_len = btrfs_dir_name_len(eb, di); 1385 name = kmalloc(name_len, GFP_NOFS); 1386 if (!name) { 1387 ret = -ENOMEM; 1388 goto out; 1389 } 1390 read_extent_buffer(eb, name, (unsigned long)(di + 1), 1391 name_len); 1392 log_di = NULL; 1393 if (log && dir_key->type == BTRFS_DIR_ITEM_KEY) { 1394 log_di = btrfs_lookup_dir_item(trans, log, log_path, 1395 dir_key->objectid, 1396 name, name_len, 0); 1397 } else if (log && dir_key->type == BTRFS_DIR_INDEX_KEY) { 1398 log_di = btrfs_lookup_dir_index_item(trans, log, 1399 log_path, 1400 dir_key->objectid, 1401 dir_key->offset, 1402 name, name_len, 0); 1403 } 1404 if (!log_di || IS_ERR(log_di)) { 1405 btrfs_dir_item_key_to_cpu(eb, di, &location); 1406 btrfs_release_path(root, path); 1407 btrfs_release_path(log, log_path); 1408 inode = read_one_inode(root, location.objectid); 1409 BUG_ON(!inode); 1410 1411 ret = link_to_fixup_dir(trans, root, 1412 path, location.objectid); 1413 BUG_ON(ret); 1414 btrfs_inc_nlink(inode); 1415 ret = btrfs_unlink_inode(trans, root, dir, inode, 1416 name, name_len); 
1417 BUG_ON(ret); 1418 kfree(name); 1419 iput(inode); 1420 1421 /* there might still be more names under this key 1422 * check and repeat if required 1423 */ 1424 ret = btrfs_search_slot(NULL, root, dir_key, path, 1425 0, 0); 1426 if (ret == 0) 1427 goto again; 1428 ret = 0; 1429 goto out; 1430 } 1431 btrfs_release_path(log, log_path); 1432 kfree(name); 1433 1434 ptr = (unsigned long)(di + 1); 1435 ptr += name_len; 1436 } 1437 ret = 0; 1438 out: 1439 btrfs_release_path(root, path); 1440 btrfs_release_path(log, log_path); 1441 return ret; 1442 } 1443 1444 /* 1445 * deletion replay happens before we copy any new directory items 1446 * out of the log or out of backreferences from inodes. It 1447 * scans the log to find ranges of keys that log is authoritative for, 1448 * and then scans the directory to find items in those ranges that are 1449 * not present in the log. 1450 * 1451 * Anything we don't find in the log is unlinked and removed from the 1452 * directory. 1453 */ 1454 static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, 1455 struct btrfs_root *root, 1456 struct btrfs_root *log, 1457 struct btrfs_path *path, 1458 u64 dirid, int del_all) 1459 { 1460 u64 range_start; 1461 u64 range_end; 1462 int key_type = BTRFS_DIR_LOG_ITEM_KEY; 1463 int ret = 0; 1464 struct btrfs_key dir_key; 1465 struct btrfs_key found_key; 1466 struct btrfs_path *log_path; 1467 struct inode *dir; 1468 1469 dir_key.objectid = dirid; 1470 dir_key.type = BTRFS_DIR_ITEM_KEY; 1471 log_path = btrfs_alloc_path(); 1472 if (!log_path) 1473 return -ENOMEM; 1474 1475 dir = read_one_inode(root, dirid); 1476 /* it isn't an error if the inode isn't there, that can happen 1477 * because we replay the deletes before we copy in the inode item 1478 * from the log 1479 */ 1480 if (!dir) { 1481 btrfs_free_path(log_path); 1482 return 0; 1483 } 1484 again: 1485 range_start = 0; 1486 range_end = 0; 1487 while (1) { 1488 if (del_all) 1489 range_end = (u64)-1; 1490 else { 1491 ret = 
find_dir_range(log, path, dirid, key_type, 1492 &range_start, &range_end); 1493 if (ret != 0) 1494 break; 1495 } 1496 1497 dir_key.offset = range_start; 1498 while (1) { 1499 int nritems; 1500 ret = btrfs_search_slot(NULL, root, &dir_key, path, 1501 0, 0); 1502 if (ret < 0) 1503 goto out; 1504 1505 nritems = btrfs_header_nritems(path->nodes[0]); 1506 if (path->slots[0] >= nritems) { 1507 ret = btrfs_next_leaf(root, path); 1508 if (ret) 1509 break; 1510 } 1511 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 1512 path->slots[0]); 1513 if (found_key.objectid != dirid || 1514 found_key.type != dir_key.type) 1515 goto next_type; 1516 1517 if (found_key.offset > range_end) 1518 break; 1519 1520 ret = check_item_in_log(trans, root, log, path, 1521 log_path, dir, 1522 &found_key); 1523 BUG_ON(ret); 1524 if (found_key.offset == (u64)-1) 1525 break; 1526 dir_key.offset = found_key.offset + 1; 1527 } 1528 btrfs_release_path(root, path); 1529 if (range_end == (u64)-1) 1530 break; 1531 range_start = range_end + 1; 1532 } 1533 1534 next_type: 1535 ret = 0; 1536 if (key_type == BTRFS_DIR_LOG_ITEM_KEY) { 1537 key_type = BTRFS_DIR_LOG_INDEX_KEY; 1538 dir_key.type = BTRFS_DIR_INDEX_KEY; 1539 btrfs_release_path(root, path); 1540 goto again; 1541 } 1542 out: 1543 btrfs_release_path(root, path); 1544 btrfs_free_path(log_path); 1545 iput(dir); 1546 return ret; 1547 } 1548 1549 /* 1550 * the process_func used to replay items from the log tree. This 1551 * gets called in two different stages. The first stage just looks 1552 * for inodes and makes sure they are all copied into the subvolume. 1553 * 1554 * The second stage copies all the other item types from the log into 1555 * the subvolume. The two stage approach is slower, but gets rid of 1556 * lots of complexity around inodes referencing other inodes that exist 1557 * only in the log (references come from either directory items or inode 1558 * back refs). 
1559 */ 1560 static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, 1561 struct walk_control *wc, u64 gen) 1562 { 1563 int nritems; 1564 struct btrfs_path *path; 1565 struct btrfs_root *root = wc->replay_dest; 1566 struct btrfs_key key; 1567 u32 item_size; 1568 int level; 1569 int i; 1570 int ret; 1571 1572 btrfs_read_buffer(eb, gen); 1573 1574 level = btrfs_header_level(eb); 1575 1576 if (level != 0) 1577 return 0; 1578 1579 path = btrfs_alloc_path(); 1580 BUG_ON(!path); 1581 1582 nritems = btrfs_header_nritems(eb); 1583 for (i = 0; i < nritems; i++) { 1584 btrfs_item_key_to_cpu(eb, &key, i); 1585 item_size = btrfs_item_size_nr(eb, i); 1586 1587 /* inode keys are done during the first stage */ 1588 if (key.type == BTRFS_INODE_ITEM_KEY && 1589 wc->stage == LOG_WALK_REPLAY_INODES) { 1590 struct inode *inode; 1591 struct btrfs_inode_item *inode_item; 1592 u32 mode; 1593 1594 inode_item = btrfs_item_ptr(eb, i, 1595 struct btrfs_inode_item); 1596 mode = btrfs_inode_mode(eb, inode_item); 1597 if (S_ISDIR(mode)) { 1598 ret = replay_dir_deletes(wc->trans, 1599 root, log, path, key.objectid, 0); 1600 BUG_ON(ret); 1601 } 1602 ret = overwrite_item(wc->trans, root, path, 1603 eb, i, &key); 1604 BUG_ON(ret); 1605 1606 /* for regular files, truncate away 1607 * extents past the new EOF 1608 */ 1609 if (S_ISREG(mode)) { 1610 inode = read_one_inode(root, 1611 key.objectid); 1612 BUG_ON(!inode); 1613 1614 ret = btrfs_truncate_inode_items(wc->trans, 1615 root, inode, inode->i_size, 1616 BTRFS_EXTENT_DATA_KEY); 1617 BUG_ON(ret); 1618 1619 /* if the nlink count is zero here, the iput 1620 * will free the inode. 
We bump it to make 1621 * sure it doesn't get freed until the link 1622 * count fixup is done 1623 */ 1624 if (inode->i_nlink == 0) { 1625 btrfs_inc_nlink(inode); 1626 btrfs_update_inode(wc->trans, 1627 root, inode); 1628 } 1629 iput(inode); 1630 } 1631 ret = link_to_fixup_dir(wc->trans, root, 1632 path, key.objectid); 1633 BUG_ON(ret); 1634 } 1635 if (wc->stage < LOG_WALK_REPLAY_ALL) 1636 continue; 1637 1638 /* these keys are simply copied */ 1639 if (key.type == BTRFS_XATTR_ITEM_KEY) { 1640 ret = overwrite_item(wc->trans, root, path, 1641 eb, i, &key); 1642 BUG_ON(ret); 1643 } else if (key.type == BTRFS_INODE_REF_KEY) { 1644 ret = add_inode_ref(wc->trans, root, log, path, 1645 eb, i, &key); 1646 BUG_ON(ret && ret != -ENOENT); 1647 } else if (key.type == BTRFS_EXTENT_DATA_KEY) { 1648 ret = replay_one_extent(wc->trans, root, path, 1649 eb, i, &key); 1650 BUG_ON(ret); 1651 } else if (key.type == BTRFS_DIR_ITEM_KEY || 1652 key.type == BTRFS_DIR_INDEX_KEY) { 1653 ret = replay_one_dir_item(wc->trans, root, path, 1654 eb, i, &key); 1655 BUG_ON(ret); 1656 } 1657 } 1658 btrfs_free_path(path); 1659 return 0; 1660 } 1661 1662 static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, 1663 struct btrfs_root *root, 1664 struct btrfs_path *path, int *level, 1665 struct walk_control *wc) 1666 { 1667 u64 root_owner; 1668 u64 root_gen; 1669 u64 bytenr; 1670 u64 ptr_gen; 1671 struct extent_buffer *next; 1672 struct extent_buffer *cur; 1673 struct extent_buffer *parent; 1674 u32 blocksize; 1675 int ret = 0; 1676 1677 WARN_ON(*level < 0); 1678 WARN_ON(*level >= BTRFS_MAX_LEVEL); 1679 1680 while (*level > 0) { 1681 WARN_ON(*level < 0); 1682 WARN_ON(*level >= BTRFS_MAX_LEVEL); 1683 cur = path->nodes[*level]; 1684 1685 if (btrfs_header_level(cur) != *level) 1686 WARN_ON(1); 1687 1688 if (path->slots[*level] >= 1689 btrfs_header_nritems(cur)) 1690 break; 1691 1692 bytenr = btrfs_node_blockptr(cur, path->slots[*level]); 1693 ptr_gen = btrfs_node_ptr_generation(cur, 
path->slots[*level]); 1694 blocksize = btrfs_level_size(root, *level - 1); 1695 1696 parent = path->nodes[*level]; 1697 root_owner = btrfs_header_owner(parent); 1698 root_gen = btrfs_header_generation(parent); 1699 1700 next = btrfs_find_create_tree_block(root, bytenr, blocksize); 1701 1702 wc->process_func(root, next, wc, ptr_gen); 1703 1704 if (*level == 1) { 1705 path->slots[*level]++; 1706 if (wc->free) { 1707 btrfs_read_buffer(next, ptr_gen); 1708 1709 btrfs_tree_lock(next); 1710 clean_tree_block(trans, root, next); 1711 btrfs_set_lock_blocking(next); 1712 btrfs_wait_tree_block_writeback(next); 1713 btrfs_tree_unlock(next); 1714 1715 WARN_ON(root_owner != 1716 BTRFS_TREE_LOG_OBJECTID); 1717 ret = btrfs_free_reserved_extent(root, 1718 bytenr, blocksize); 1719 BUG_ON(ret); 1720 } 1721 free_extent_buffer(next); 1722 continue; 1723 } 1724 btrfs_read_buffer(next, ptr_gen); 1725 1726 WARN_ON(*level <= 0); 1727 if (path->nodes[*level-1]) 1728 free_extent_buffer(path->nodes[*level-1]); 1729 path->nodes[*level-1] = next; 1730 *level = btrfs_header_level(next); 1731 path->slots[*level] = 0; 1732 cond_resched(); 1733 } 1734 WARN_ON(*level < 0); 1735 WARN_ON(*level >= BTRFS_MAX_LEVEL); 1736 1737 if (path->nodes[*level] == root->node) 1738 parent = path->nodes[*level]; 1739 else 1740 parent = path->nodes[*level + 1]; 1741 1742 bytenr = path->nodes[*level]->start; 1743 1744 blocksize = btrfs_level_size(root, *level); 1745 root_owner = btrfs_header_owner(parent); 1746 root_gen = btrfs_header_generation(parent); 1747 1748 wc->process_func(root, path->nodes[*level], wc, 1749 btrfs_header_generation(path->nodes[*level])); 1750 1751 if (wc->free) { 1752 next = path->nodes[*level]; 1753 btrfs_tree_lock(next); 1754 clean_tree_block(trans, root, next); 1755 btrfs_set_lock_blocking(next); 1756 btrfs_wait_tree_block_writeback(next); 1757 btrfs_tree_unlock(next); 1758 1759 WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID); 1760 ret = btrfs_free_reserved_extent(root, bytenr, blocksize); 
1761 BUG_ON(ret); 1762 } 1763 free_extent_buffer(path->nodes[*level]); 1764 path->nodes[*level] = NULL; 1765 *level += 1; 1766 1767 cond_resched(); 1768 return 0; 1769 } 1770 1771 static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, 1772 struct btrfs_root *root, 1773 struct btrfs_path *path, int *level, 1774 struct walk_control *wc) 1775 { 1776 u64 root_owner; 1777 u64 root_gen; 1778 int i; 1779 int slot; 1780 int ret; 1781 1782 for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) { 1783 slot = path->slots[i]; 1784 if (slot < btrfs_header_nritems(path->nodes[i]) - 1) { 1785 struct extent_buffer *node; 1786 node = path->nodes[i]; 1787 path->slots[i]++; 1788 *level = i; 1789 WARN_ON(*level == 0); 1790 return 0; 1791 } else { 1792 struct extent_buffer *parent; 1793 if (path->nodes[*level] == root->node) 1794 parent = path->nodes[*level]; 1795 else 1796 parent = path->nodes[*level + 1]; 1797 1798 root_owner = btrfs_header_owner(parent); 1799 root_gen = btrfs_header_generation(parent); 1800 wc->process_func(root, path->nodes[*level], wc, 1801 btrfs_header_generation(path->nodes[*level])); 1802 if (wc->free) { 1803 struct extent_buffer *next; 1804 1805 next = path->nodes[*level]; 1806 1807 btrfs_tree_lock(next); 1808 clean_tree_block(trans, root, next); 1809 btrfs_set_lock_blocking(next); 1810 btrfs_wait_tree_block_writeback(next); 1811 btrfs_tree_unlock(next); 1812 1813 WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID); 1814 ret = btrfs_free_reserved_extent(root, 1815 path->nodes[*level]->start, 1816 path->nodes[*level]->len); 1817 BUG_ON(ret); 1818 } 1819 free_extent_buffer(path->nodes[*level]); 1820 path->nodes[*level] = NULL; 1821 *level = i + 1; 1822 } 1823 } 1824 return 1; 1825 } 1826 1827 /* 1828 * drop the reference count on the tree rooted at 'snap'. This traverses 1829 * the tree freeing any blocks that have a ref count of zero after being 1830 * decremented. 
1831 */ 1832 static int walk_log_tree(struct btrfs_trans_handle *trans, 1833 struct btrfs_root *log, struct walk_control *wc) 1834 { 1835 int ret = 0; 1836 int wret; 1837 int level; 1838 struct btrfs_path *path; 1839 int i; 1840 int orig_level; 1841 1842 path = btrfs_alloc_path(); 1843 BUG_ON(!path); 1844 1845 level = btrfs_header_level(log->node); 1846 orig_level = level; 1847 path->nodes[level] = log->node; 1848 extent_buffer_get(log->node); 1849 path->slots[level] = 0; 1850 1851 while (1) { 1852 wret = walk_down_log_tree(trans, log, path, &level, wc); 1853 if (wret > 0) 1854 break; 1855 if (wret < 0) 1856 ret = wret; 1857 1858 wret = walk_up_log_tree(trans, log, path, &level, wc); 1859 if (wret > 0) 1860 break; 1861 if (wret < 0) 1862 ret = wret; 1863 } 1864 1865 /* was the root node processed? if not, catch it here */ 1866 if (path->nodes[orig_level]) { 1867 wc->process_func(log, path->nodes[orig_level], wc, 1868 btrfs_header_generation(path->nodes[orig_level])); 1869 if (wc->free) { 1870 struct extent_buffer *next; 1871 1872 next = path->nodes[orig_level]; 1873 1874 btrfs_tree_lock(next); 1875 clean_tree_block(trans, log, next); 1876 btrfs_set_lock_blocking(next); 1877 btrfs_wait_tree_block_writeback(next); 1878 btrfs_tree_unlock(next); 1879 1880 WARN_ON(log->root_key.objectid != 1881 BTRFS_TREE_LOG_OBJECTID); 1882 ret = btrfs_free_reserved_extent(log, next->start, 1883 next->len); 1884 BUG_ON(ret); 1885 } 1886 } 1887 1888 for (i = 0; i <= orig_level; i++) { 1889 if (path->nodes[i]) { 1890 free_extent_buffer(path->nodes[i]); 1891 path->nodes[i] = NULL; 1892 } 1893 } 1894 btrfs_free_path(path); 1895 return ret; 1896 } 1897 1898 /* 1899 * helper function to update the item for a given subvolumes log root 1900 * in the tree of log roots 1901 */ 1902 static int update_log_root(struct btrfs_trans_handle *trans, 1903 struct btrfs_root *log) 1904 { 1905 int ret; 1906 1907 if (log->log_transid == 1) { 1908 /* insert root item on the first sync */ 1909 ret = 
btrfs_insert_root(trans, log->fs_info->log_root_tree, 1910 &log->root_key, &log->root_item); 1911 } else { 1912 ret = btrfs_update_root(trans, log->fs_info->log_root_tree, 1913 &log->root_key, &log->root_item); 1914 } 1915 return ret; 1916 } 1917 1918 static int wait_log_commit(struct btrfs_trans_handle *trans, 1919 struct btrfs_root *root, unsigned long transid) 1920 { 1921 DEFINE_WAIT(wait); 1922 int index = transid % 2; 1923 1924 /* 1925 * we only allow two pending log transactions at a time, 1926 * so we know that if ours is more than 2 older than the 1927 * current transaction, we're done 1928 */ 1929 do { 1930 prepare_to_wait(&root->log_commit_wait[index], 1931 &wait, TASK_UNINTERRUPTIBLE); 1932 mutex_unlock(&root->log_mutex); 1933 1934 if (root->fs_info->last_trans_log_full_commit != 1935 trans->transid && root->log_transid < transid + 2 && 1936 atomic_read(&root->log_commit[index])) 1937 schedule(); 1938 1939 finish_wait(&root->log_commit_wait[index], &wait); 1940 mutex_lock(&root->log_mutex); 1941 } while (root->log_transid < transid + 2 && 1942 atomic_read(&root->log_commit[index])); 1943 return 0; 1944 } 1945 1946 static int wait_for_writer(struct btrfs_trans_handle *trans, 1947 struct btrfs_root *root) 1948 { 1949 DEFINE_WAIT(wait); 1950 while (atomic_read(&root->log_writers)) { 1951 prepare_to_wait(&root->log_writer_wait, 1952 &wait, TASK_UNINTERRUPTIBLE); 1953 mutex_unlock(&root->log_mutex); 1954 if (root->fs_info->last_trans_log_full_commit != 1955 trans->transid && atomic_read(&root->log_writers)) 1956 schedule(); 1957 mutex_lock(&root->log_mutex); 1958 finish_wait(&root->log_writer_wait, &wait); 1959 } 1960 return 0; 1961 } 1962 1963 /* 1964 * btrfs_sync_log does sends a given tree log down to the disk and 1965 * updates the super blocks to record it. When this call is done, 1966 * you know that any inodes previously logged are safely on disk only 1967 * if it returns 0. 
1968 * 1969 * Any other return value means you need to call btrfs_commit_transaction. 1970 * Some of the edge cases for fsyncing directories that have had unlinks 1971 * or renames done in the past mean that sometimes the only safe 1972 * fsync is to commit the whole FS. When btrfs_sync_log returns -EAGAIN, 1973 * that has happened. 1974 */ 1975 int btrfs_sync_log(struct btrfs_trans_handle *trans, 1976 struct btrfs_root *root) 1977 { 1978 int index1; 1979 int index2; 1980 int ret; 1981 struct btrfs_root *log = root->log_root; 1982 struct btrfs_root *log_root_tree = root->fs_info->log_root_tree; 1983 u64 log_transid = 0; 1984 1985 mutex_lock(&root->log_mutex); 1986 index1 = root->log_transid % 2; 1987 if (atomic_read(&root->log_commit[index1])) { 1988 wait_log_commit(trans, root, root->log_transid); 1989 mutex_unlock(&root->log_mutex); 1990 return 0; 1991 } 1992 atomic_set(&root->log_commit[index1], 1); 1993 1994 /* wait for previous tree log sync to complete */ 1995 if (atomic_read(&root->log_commit[(index1 + 1) % 2])) 1996 wait_log_commit(trans, root, root->log_transid - 1); 1997 1998 while (1) { 1999 unsigned long batch = root->log_batch; 2000 if (root->log_multiple_pids) { 2001 mutex_unlock(&root->log_mutex); 2002 schedule_timeout_uninterruptible(1); 2003 mutex_lock(&root->log_mutex); 2004 } 2005 wait_for_writer(trans, root); 2006 if (batch == root->log_batch) 2007 break; 2008 } 2009 2010 /* bail out if we need to do a full commit */ 2011 if (root->fs_info->last_trans_log_full_commit == trans->transid) { 2012 ret = -EAGAIN; 2013 mutex_unlock(&root->log_mutex); 2014 goto out; 2015 } 2016 2017 /* we start IO on all the marked extents here, but we don't actually 2018 * wait for them until later. 
2019 */ 2020 ret = btrfs_write_marked_extents(log, &log->dirty_log_pages); 2021 BUG_ON(ret); 2022 2023 btrfs_set_root_node(&log->root_item, log->node); 2024 2025 root->log_batch = 0; 2026 log_transid = root->log_transid; 2027 root->log_transid++; 2028 log->log_transid = root->log_transid; 2029 root->log_start_pid = 0; 2030 smp_mb(); 2031 /* 2032 * log tree has been flushed to disk, new modifications of 2033 * the log will be written to new positions. so it's safe to 2034 * allow log writers to go in. 2035 */ 2036 mutex_unlock(&root->log_mutex); 2037 2038 mutex_lock(&log_root_tree->log_mutex); 2039 log_root_tree->log_batch++; 2040 atomic_inc(&log_root_tree->log_writers); 2041 mutex_unlock(&log_root_tree->log_mutex); 2042 2043 ret = update_log_root(trans, log); 2044 BUG_ON(ret); 2045 2046 mutex_lock(&log_root_tree->log_mutex); 2047 if (atomic_dec_and_test(&log_root_tree->log_writers)) { 2048 smp_mb(); 2049 if (waitqueue_active(&log_root_tree->log_writer_wait)) 2050 wake_up(&log_root_tree->log_writer_wait); 2051 } 2052 2053 index2 = log_root_tree->log_transid % 2; 2054 if (atomic_read(&log_root_tree->log_commit[index2])) { 2055 btrfs_wait_marked_extents(log, &log->dirty_log_pages); 2056 wait_log_commit(trans, log_root_tree, 2057 log_root_tree->log_transid); 2058 mutex_unlock(&log_root_tree->log_mutex); 2059 goto out; 2060 } 2061 atomic_set(&log_root_tree->log_commit[index2], 1); 2062 2063 if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) { 2064 wait_log_commit(trans, log_root_tree, 2065 log_root_tree->log_transid - 1); 2066 } 2067 2068 wait_for_writer(trans, log_root_tree); 2069 2070 /* 2071 * now that we've moved on to the tree of log tree roots, 2072 * check the full commit flag again 2073 */ 2074 if (root->fs_info->last_trans_log_full_commit == trans->transid) { 2075 btrfs_wait_marked_extents(log, &log->dirty_log_pages); 2076 mutex_unlock(&log_root_tree->log_mutex); 2077 ret = -EAGAIN; 2078 goto out_wake_log_root; 2079 } 2080 2081 ret = 
btrfs_write_and_wait_marked_extents(log_root_tree, 2082 &log_root_tree->dirty_log_pages); 2083 BUG_ON(ret); 2084 btrfs_wait_marked_extents(log, &log->dirty_log_pages); 2085 2086 btrfs_set_super_log_root(&root->fs_info->super_for_commit, 2087 log_root_tree->node->start); 2088 btrfs_set_super_log_root_level(&root->fs_info->super_for_commit, 2089 btrfs_header_level(log_root_tree->node)); 2090 2091 log_root_tree->log_batch = 0; 2092 log_root_tree->log_transid++; 2093 smp_mb(); 2094 2095 mutex_unlock(&log_root_tree->log_mutex); 2096 2097 /* 2098 * nobody else is going to jump in and write the the ctree 2099 * super here because the log_commit atomic below is protecting 2100 * us. We must be called with a transaction handle pinning 2101 * the running transaction open, so a full commit can't hop 2102 * in and cause problems either. 2103 */ 2104 write_ctree_super(trans, root->fs_info->tree_root, 1); 2105 ret = 0; 2106 2107 mutex_lock(&root->log_mutex); 2108 if (root->last_log_commit < log_transid) 2109 root->last_log_commit = log_transid; 2110 mutex_unlock(&root->log_mutex); 2111 2112 out_wake_log_root: 2113 atomic_set(&log_root_tree->log_commit[index2], 0); 2114 smp_mb(); 2115 if (waitqueue_active(&log_root_tree->log_commit_wait[index2])) 2116 wake_up(&log_root_tree->log_commit_wait[index2]); 2117 out: 2118 atomic_set(&root->log_commit[index1], 0); 2119 smp_mb(); 2120 if (waitqueue_active(&root->log_commit_wait[index1])) 2121 wake_up(&root->log_commit_wait[index1]); 2122 return 0; 2123 } 2124 2125 /* 2126 * free all the extents used by the tree log. 
This should be called
 * at commit time of the full transaction
 *
 * Does nothing when the root has no log or while log recovery is
 * running.  Always returns 0.
 */
int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
{
	int ret;
	struct btrfs_root *log;
	u64 start;
	u64 end;
	struct walk_control wc = {
		.free = 1,
		.process_func = process_one_buffer
	};

	if (!root->log_root || root->fs_info->log_root_recovering)
		return 0;

	log = root->log_root;
	ret = walk_log_tree(trans, log, &wc);
	BUG_ON(ret);

	/* forget about every range still marked dirty for the log */
	while (1) {
		ret = find_first_extent_bit(&log->dirty_log_pages,
				    0, &start, &end, EXTENT_DIRTY);
		if (ret)
			break;

		clear_extent_dirty(&log->dirty_log_pages,
				   start, end, GFP_NOFS);
	}

	/* only delete the root item when the log has a nonzero transid */
	if (log->log_transid > 0) {
		ret = btrfs_del_root(trans, root->fs_info->log_root_tree,
				     &log->root_key);
		BUG_ON(ret);
	}
	root->log_root = NULL;
	free_extent_buffer(log->node);
	kfree(log);
	return 0;
}

/*
 * If both a file and directory are logged, and unlinks or renames are
 * mixed in, we have a few interesting corners:
 *
 * create file X in dir Y
 * link file X to X.link in dir Y
 * fsync file X
 * unlink file X but leave X.link
 * fsync dir Y
 *
 * After a crash we would expect only X.link to exist.  But file X
 * didn't get fsync'd again so the log has back refs for X and X.link.
 *
 * We solve this by removing directory entries and inode backrefs from the
 * log when a file that was logged in the current transaction is
 * unlinked.  Any later fsync will include the updated log entries, and
 * we'll be able to reconstruct the proper directory items from backrefs.
 *
 * This optimizations allows us to avoid relogging the entire inode
 * or the entire directory.
2189 */ 2190 int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, 2191 struct btrfs_root *root, 2192 const char *name, int name_len, 2193 struct inode *dir, u64 index) 2194 { 2195 struct btrfs_root *log; 2196 struct btrfs_dir_item *di; 2197 struct btrfs_path *path; 2198 int ret; 2199 int bytes_del = 0; 2200 2201 if (BTRFS_I(dir)->logged_trans < trans->transid) 2202 return 0; 2203 2204 ret = join_running_log_trans(root); 2205 if (ret) 2206 return 0; 2207 2208 mutex_lock(&BTRFS_I(dir)->log_mutex); 2209 2210 log = root->log_root; 2211 path = btrfs_alloc_path(); 2212 di = btrfs_lookup_dir_item(trans, log, path, dir->i_ino, 2213 name, name_len, -1); 2214 if (di && !IS_ERR(di)) { 2215 ret = btrfs_delete_one_dir_name(trans, log, path, di); 2216 bytes_del += name_len; 2217 BUG_ON(ret); 2218 } 2219 btrfs_release_path(log, path); 2220 di = btrfs_lookup_dir_index_item(trans, log, path, dir->i_ino, 2221 index, name, name_len, -1); 2222 if (di && !IS_ERR(di)) { 2223 ret = btrfs_delete_one_dir_name(trans, log, path, di); 2224 bytes_del += name_len; 2225 BUG_ON(ret); 2226 } 2227 2228 /* update the directory size in the log to reflect the names 2229 * we have removed 2230 */ 2231 if (bytes_del) { 2232 struct btrfs_key key; 2233 2234 key.objectid = dir->i_ino; 2235 key.offset = 0; 2236 key.type = BTRFS_INODE_ITEM_KEY; 2237 btrfs_release_path(log, path); 2238 2239 ret = btrfs_search_slot(trans, log, &key, path, 0, 1); 2240 if (ret == 0) { 2241 struct btrfs_inode_item *item; 2242 u64 i_size; 2243 2244 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 2245 struct btrfs_inode_item); 2246 i_size = btrfs_inode_size(path->nodes[0], item); 2247 if (i_size > bytes_del) 2248 i_size -= bytes_del; 2249 else 2250 i_size = 0; 2251 btrfs_set_inode_size(path->nodes[0], item, i_size); 2252 btrfs_mark_buffer_dirty(path->nodes[0]); 2253 } else 2254 ret = 0; 2255 btrfs_release_path(log, path); 2256 } 2257 2258 btrfs_free_path(path); 2259 mutex_unlock(&BTRFS_I(dir)->log_mutex); 2260 
btrfs_end_log_trans(root); 2261 2262 return 0; 2263 } 2264 2265 /* see comments for btrfs_del_dir_entries_in_log */ 2266 int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, 2267 struct btrfs_root *root, 2268 const char *name, int name_len, 2269 struct inode *inode, u64 dirid) 2270 { 2271 struct btrfs_root *log; 2272 u64 index; 2273 int ret; 2274 2275 if (BTRFS_I(inode)->logged_trans < trans->transid) 2276 return 0; 2277 2278 ret = join_running_log_trans(root); 2279 if (ret) 2280 return 0; 2281 log = root->log_root; 2282 mutex_lock(&BTRFS_I(inode)->log_mutex); 2283 2284 ret = btrfs_del_inode_ref(trans, log, name, name_len, inode->i_ino, 2285 dirid, &index); 2286 mutex_unlock(&BTRFS_I(inode)->log_mutex); 2287 btrfs_end_log_trans(root); 2288 2289 return ret; 2290 } 2291 2292 /* 2293 * creates a range item in the log for 'dirid'. first_offset and 2294 * last_offset tell us which parts of the key space the log should 2295 * be considered authoritative for. 2296 */ 2297 static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans, 2298 struct btrfs_root *log, 2299 struct btrfs_path *path, 2300 int key_type, u64 dirid, 2301 u64 first_offset, u64 last_offset) 2302 { 2303 int ret; 2304 struct btrfs_key key; 2305 struct btrfs_dir_log_item *item; 2306 2307 key.objectid = dirid; 2308 key.offset = first_offset; 2309 if (key_type == BTRFS_DIR_ITEM_KEY) 2310 key.type = BTRFS_DIR_LOG_ITEM_KEY; 2311 else 2312 key.type = BTRFS_DIR_LOG_INDEX_KEY; 2313 ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item)); 2314 BUG_ON(ret); 2315 2316 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 2317 struct btrfs_dir_log_item); 2318 btrfs_set_dir_log_end(path->nodes[0], item, last_offset); 2319 btrfs_mark_buffer_dirty(path->nodes[0]); 2320 btrfs_release_path(log, path); 2321 return 0; 2322 } 2323 2324 /* 2325 * log all the items included in the current transaction for a given 2326 * directory. 
This also creates the range items in the log tree required 2327 * to replay anything deleted before the fsync 2328 */ 2329 static noinline int log_dir_items(struct btrfs_trans_handle *trans, 2330 struct btrfs_root *root, struct inode *inode, 2331 struct btrfs_path *path, 2332 struct btrfs_path *dst_path, int key_type, 2333 u64 min_offset, u64 *last_offset_ret) 2334 { 2335 struct btrfs_key min_key; 2336 struct btrfs_key max_key; 2337 struct btrfs_root *log = root->log_root; 2338 struct extent_buffer *src; 2339 int ret; 2340 int i; 2341 int nritems; 2342 u64 first_offset = min_offset; 2343 u64 last_offset = (u64)-1; 2344 2345 log = root->log_root; 2346 max_key.objectid = inode->i_ino; 2347 max_key.offset = (u64)-1; 2348 max_key.type = key_type; 2349 2350 min_key.objectid = inode->i_ino; 2351 min_key.type = key_type; 2352 min_key.offset = min_offset; 2353 2354 path->keep_locks = 1; 2355 2356 ret = btrfs_search_forward(root, &min_key, &max_key, 2357 path, 0, trans->transid); 2358 2359 /* 2360 * we didn't find anything from this transaction, see if there 2361 * is anything at all 2362 */ 2363 if (ret != 0 || min_key.objectid != inode->i_ino || 2364 min_key.type != key_type) { 2365 min_key.objectid = inode->i_ino; 2366 min_key.type = key_type; 2367 min_key.offset = (u64)-1; 2368 btrfs_release_path(root, path); 2369 ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); 2370 if (ret < 0) { 2371 btrfs_release_path(root, path); 2372 return ret; 2373 } 2374 ret = btrfs_previous_item(root, path, inode->i_ino, key_type); 2375 2376 /* if ret == 0 there are items for this type, 2377 * create a range to tell us the last key of this type. 2378 * otherwise, there are no items in this directory after 2379 * *min_offset, and we create a range to indicate that. 
2380 */ 2381 if (ret == 0) { 2382 struct btrfs_key tmp; 2383 btrfs_item_key_to_cpu(path->nodes[0], &tmp, 2384 path->slots[0]); 2385 if (key_type == tmp.type) 2386 first_offset = max(min_offset, tmp.offset) + 1; 2387 } 2388 goto done; 2389 } 2390 2391 /* go backward to find any previous key */ 2392 ret = btrfs_previous_item(root, path, inode->i_ino, key_type); 2393 if (ret == 0) { 2394 struct btrfs_key tmp; 2395 btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]); 2396 if (key_type == tmp.type) { 2397 first_offset = tmp.offset; 2398 ret = overwrite_item(trans, log, dst_path, 2399 path->nodes[0], path->slots[0], 2400 &tmp); 2401 } 2402 } 2403 btrfs_release_path(root, path); 2404 2405 /* find the first key from this transaction again */ 2406 ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); 2407 if (ret != 0) { 2408 WARN_ON(1); 2409 goto done; 2410 } 2411 2412 /* 2413 * we have a block from this transaction, log every item in it 2414 * from our directory 2415 */ 2416 while (1) { 2417 struct btrfs_key tmp; 2418 src = path->nodes[0]; 2419 nritems = btrfs_header_nritems(src); 2420 for (i = path->slots[0]; i < nritems; i++) { 2421 btrfs_item_key_to_cpu(src, &min_key, i); 2422 2423 if (min_key.objectid != inode->i_ino || 2424 min_key.type != key_type) 2425 goto done; 2426 ret = overwrite_item(trans, log, dst_path, src, i, 2427 &min_key); 2428 BUG_ON(ret); 2429 } 2430 path->slots[0] = nritems; 2431 2432 /* 2433 * look ahead to the next item and see if it is also 2434 * from this directory and from this transaction 2435 */ 2436 ret = btrfs_next_leaf(root, path); 2437 if (ret == 1) { 2438 last_offset = (u64)-1; 2439 goto done; 2440 } 2441 btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]); 2442 if (tmp.objectid != inode->i_ino || tmp.type != key_type) { 2443 last_offset = (u64)-1; 2444 goto done; 2445 } 2446 if (btrfs_header_generation(path->nodes[0]) != trans->transid) { 2447 ret = overwrite_item(trans, log, dst_path, 2448 path->nodes[0], 
path->slots[0], 2449 &tmp); 2450 2451 BUG_ON(ret); 2452 last_offset = tmp.offset; 2453 goto done; 2454 } 2455 } 2456 done: 2457 *last_offset_ret = last_offset; 2458 btrfs_release_path(root, path); 2459 btrfs_release_path(log, dst_path); 2460 2461 /* insert the log range keys to indicate where the log is valid */ 2462 ret = insert_dir_log_key(trans, log, path, key_type, inode->i_ino, 2463 first_offset, last_offset); 2464 BUG_ON(ret); 2465 return 0; 2466 } 2467 2468 /* 2469 * logging directories is very similar to logging inodes, We find all the items 2470 * from the current transaction and write them to the log. 2471 * 2472 * The recovery code scans the directory in the subvolume, and if it finds a 2473 * key in the range logged that is not present in the log tree, then it means 2474 * that dir entry was unlinked during the transaction. 2475 * 2476 * In order for that scan to work, we must include one key smaller than 2477 * the smallest logged by this transaction and one key larger than the largest 2478 * key logged by this transaction. 2479 */ 2480 static noinline int log_directory_changes(struct btrfs_trans_handle *trans, 2481 struct btrfs_root *root, struct inode *inode, 2482 struct btrfs_path *path, 2483 struct btrfs_path *dst_path) 2484 { 2485 u64 min_key; 2486 u64 max_key; 2487 int ret; 2488 int key_type = BTRFS_DIR_ITEM_KEY; 2489 2490 again: 2491 min_key = 0; 2492 max_key = 0; 2493 while (1) { 2494 ret = log_dir_items(trans, root, inode, path, 2495 dst_path, key_type, min_key, 2496 &max_key); 2497 BUG_ON(ret); 2498 if (max_key == (u64)-1) 2499 break; 2500 min_key = max_key + 1; 2501 } 2502 2503 if (key_type == BTRFS_DIR_ITEM_KEY) { 2504 key_type = BTRFS_DIR_INDEX_KEY; 2505 goto again; 2506 } 2507 return 0; 2508 } 2509 2510 /* 2511 * a helper function to drop items from the log before we relog an 2512 * inode. max_key_type indicates the highest item type to remove. 
2513 * This cannot be run for file data extents because it does not 2514 * free the extents they point to. 2515 */ 2516 static int drop_objectid_items(struct btrfs_trans_handle *trans, 2517 struct btrfs_root *log, 2518 struct btrfs_path *path, 2519 u64 objectid, int max_key_type) 2520 { 2521 int ret; 2522 struct btrfs_key key; 2523 struct btrfs_key found_key; 2524 2525 key.objectid = objectid; 2526 key.type = max_key_type; 2527 key.offset = (u64)-1; 2528 2529 while (1) { 2530 ret = btrfs_search_slot(trans, log, &key, path, -1, 1); 2531 2532 if (ret != 1) 2533 break; 2534 2535 if (path->slots[0] == 0) 2536 break; 2537 2538 path->slots[0]--; 2539 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 2540 path->slots[0]); 2541 2542 if (found_key.objectid != objectid) 2543 break; 2544 2545 ret = btrfs_del_item(trans, log, path); 2546 BUG_ON(ret); 2547 btrfs_release_path(log, path); 2548 } 2549 btrfs_release_path(log, path); 2550 return 0; 2551 } 2552 2553 static noinline int copy_items(struct btrfs_trans_handle *trans, 2554 struct btrfs_root *log, 2555 struct btrfs_path *dst_path, 2556 struct extent_buffer *src, 2557 int start_slot, int nr, int inode_only) 2558 { 2559 unsigned long src_offset; 2560 unsigned long dst_offset; 2561 struct btrfs_file_extent_item *extent; 2562 struct btrfs_inode_item *inode_item; 2563 int ret; 2564 struct btrfs_key *ins_keys; 2565 u32 *ins_sizes; 2566 char *ins_data; 2567 int i; 2568 struct list_head ordered_sums; 2569 2570 INIT_LIST_HEAD(&ordered_sums); 2571 2572 ins_data = kmalloc(nr * sizeof(struct btrfs_key) + 2573 nr * sizeof(u32), GFP_NOFS); 2574 ins_sizes = (u32 *)ins_data; 2575 ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32)); 2576 2577 for (i = 0; i < nr; i++) { 2578 ins_sizes[i] = btrfs_item_size_nr(src, i + start_slot); 2579 btrfs_item_key_to_cpu(src, ins_keys + i, i + start_slot); 2580 } 2581 ret = btrfs_insert_empty_items(trans, log, dst_path, 2582 ins_keys, ins_sizes, nr); 2583 BUG_ON(ret); 2584 2585 for (i = 0; i < 
nr; i++, dst_path->slots[0]++) { 2586 dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0], 2587 dst_path->slots[0]); 2588 2589 src_offset = btrfs_item_ptr_offset(src, start_slot + i); 2590 2591 copy_extent_buffer(dst_path->nodes[0], src, dst_offset, 2592 src_offset, ins_sizes[i]); 2593 2594 if (inode_only == LOG_INODE_EXISTS && 2595 ins_keys[i].type == BTRFS_INODE_ITEM_KEY) { 2596 inode_item = btrfs_item_ptr(dst_path->nodes[0], 2597 dst_path->slots[0], 2598 struct btrfs_inode_item); 2599 btrfs_set_inode_size(dst_path->nodes[0], inode_item, 0); 2600 2601 /* set the generation to zero so the recover code 2602 * can tell the difference between an logging 2603 * just to say 'this inode exists' and a logging 2604 * to say 'update this inode with these values' 2605 */ 2606 btrfs_set_inode_generation(dst_path->nodes[0], 2607 inode_item, 0); 2608 } 2609 /* take a reference on file data extents so that truncates 2610 * or deletes of this inode don't have to relog the inode 2611 * again 2612 */ 2613 if (btrfs_key_type(ins_keys + i) == BTRFS_EXTENT_DATA_KEY) { 2614 int found_type; 2615 extent = btrfs_item_ptr(src, start_slot + i, 2616 struct btrfs_file_extent_item); 2617 2618 found_type = btrfs_file_extent_type(src, extent); 2619 if (found_type == BTRFS_FILE_EXTENT_REG || 2620 found_type == BTRFS_FILE_EXTENT_PREALLOC) { 2621 u64 ds, dl, cs, cl; 2622 ds = btrfs_file_extent_disk_bytenr(src, 2623 extent); 2624 /* ds == 0 is a hole */ 2625 if (ds == 0) 2626 continue; 2627 2628 dl = btrfs_file_extent_disk_num_bytes(src, 2629 extent); 2630 cs = btrfs_file_extent_offset(src, extent); 2631 cl = btrfs_file_extent_num_bytes(src, 2632 extent); 2633 if (btrfs_file_extent_compression(src, 2634 extent)) { 2635 cs = 0; 2636 cl = dl; 2637 } 2638 2639 ret = btrfs_lookup_csums_range( 2640 log->fs_info->csum_root, 2641 ds + cs, ds + cs + cl - 1, 2642 &ordered_sums); 2643 BUG_ON(ret); 2644 } 2645 } 2646 } 2647 2648 btrfs_mark_buffer_dirty(dst_path->nodes[0]); 2649 btrfs_release_path(log, 
dst_path); 2650 kfree(ins_data); 2651 2652 /* 2653 * we have to do this after the loop above to avoid changing the 2654 * log tree while trying to change the log tree. 2655 */ 2656 while (!list_empty(&ordered_sums)) { 2657 struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next, 2658 struct btrfs_ordered_sum, 2659 list); 2660 ret = btrfs_csum_file_blocks(trans, log, sums); 2661 BUG_ON(ret); 2662 list_del(&sums->list); 2663 kfree(sums); 2664 } 2665 return 0; 2666 } 2667 2668 /* log a single inode in the tree log. 2669 * At least one parent directory for this inode must exist in the tree 2670 * or be logged already. 2671 * 2672 * Any items from this inode changed by the current transaction are copied 2673 * to the log tree. An extra reference is taken on any extents in this 2674 * file, allowing us to avoid a whole pile of corner cases around logging 2675 * blocks that have been removed from the tree. 2676 * 2677 * See LOG_INODE_ALL and related defines for a description of what inode_only 2678 * does. 2679 * 2680 * This handles both files and directories. 
2681 */ 2682 static int btrfs_log_inode(struct btrfs_trans_handle *trans, 2683 struct btrfs_root *root, struct inode *inode, 2684 int inode_only) 2685 { 2686 struct btrfs_path *path; 2687 struct btrfs_path *dst_path; 2688 struct btrfs_key min_key; 2689 struct btrfs_key max_key; 2690 struct btrfs_root *log = root->log_root; 2691 struct extent_buffer *src = NULL; 2692 u32 size; 2693 int ret; 2694 int nritems; 2695 int ins_start_slot = 0; 2696 int ins_nr; 2697 2698 log = root->log_root; 2699 2700 path = btrfs_alloc_path(); 2701 dst_path = btrfs_alloc_path(); 2702 2703 min_key.objectid = inode->i_ino; 2704 min_key.type = BTRFS_INODE_ITEM_KEY; 2705 min_key.offset = 0; 2706 2707 max_key.objectid = inode->i_ino; 2708 2709 /* today the code can only do partial logging of directories */ 2710 if (!S_ISDIR(inode->i_mode)) 2711 inode_only = LOG_INODE_ALL; 2712 2713 if (inode_only == LOG_INODE_EXISTS || S_ISDIR(inode->i_mode)) 2714 max_key.type = BTRFS_XATTR_ITEM_KEY; 2715 else 2716 max_key.type = (u8)-1; 2717 max_key.offset = (u64)-1; 2718 2719 mutex_lock(&BTRFS_I(inode)->log_mutex); 2720 2721 /* 2722 * a brute force approach to making sure we get the most uptodate 2723 * copies of everything. 
2724 */ 2725 if (S_ISDIR(inode->i_mode)) { 2726 int max_key_type = BTRFS_DIR_LOG_INDEX_KEY; 2727 2728 if (inode_only == LOG_INODE_EXISTS) 2729 max_key_type = BTRFS_XATTR_ITEM_KEY; 2730 ret = drop_objectid_items(trans, log, path, 2731 inode->i_ino, max_key_type); 2732 } else { 2733 ret = btrfs_truncate_inode_items(trans, log, inode, 0, 0); 2734 } 2735 BUG_ON(ret); 2736 path->keep_locks = 1; 2737 2738 while (1) { 2739 ins_nr = 0; 2740 ret = btrfs_search_forward(root, &min_key, &max_key, 2741 path, 0, trans->transid); 2742 if (ret != 0) 2743 break; 2744 again: 2745 /* note, ins_nr might be > 0 here, cleanup outside the loop */ 2746 if (min_key.objectid != inode->i_ino) 2747 break; 2748 if (min_key.type > max_key.type) 2749 break; 2750 2751 src = path->nodes[0]; 2752 size = btrfs_item_size_nr(src, path->slots[0]); 2753 if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) { 2754 ins_nr++; 2755 goto next_slot; 2756 } else if (!ins_nr) { 2757 ins_start_slot = path->slots[0]; 2758 ins_nr = 1; 2759 goto next_slot; 2760 } 2761 2762 ret = copy_items(trans, log, dst_path, src, ins_start_slot, 2763 ins_nr, inode_only); 2764 BUG_ON(ret); 2765 ins_nr = 1; 2766 ins_start_slot = path->slots[0]; 2767 next_slot: 2768 2769 nritems = btrfs_header_nritems(path->nodes[0]); 2770 path->slots[0]++; 2771 if (path->slots[0] < nritems) { 2772 btrfs_item_key_to_cpu(path->nodes[0], &min_key, 2773 path->slots[0]); 2774 goto again; 2775 } 2776 if (ins_nr) { 2777 ret = copy_items(trans, log, dst_path, src, 2778 ins_start_slot, 2779 ins_nr, inode_only); 2780 BUG_ON(ret); 2781 ins_nr = 0; 2782 } 2783 btrfs_release_path(root, path); 2784 2785 if (min_key.offset < (u64)-1) 2786 min_key.offset++; 2787 else if (min_key.type < (u8)-1) 2788 min_key.type++; 2789 else if (min_key.objectid < (u64)-1) 2790 min_key.objectid++; 2791 else 2792 break; 2793 } 2794 if (ins_nr) { 2795 ret = copy_items(trans, log, dst_path, src, 2796 ins_start_slot, 2797 ins_nr, inode_only); 2798 BUG_ON(ret); 2799 ins_nr = 0; 2800 
} 2801 WARN_ON(ins_nr); 2802 if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) { 2803 btrfs_release_path(root, path); 2804 btrfs_release_path(log, dst_path); 2805 ret = log_directory_changes(trans, root, inode, path, dst_path); 2806 BUG_ON(ret); 2807 } 2808 BTRFS_I(inode)->logged_trans = trans->transid; 2809 mutex_unlock(&BTRFS_I(inode)->log_mutex); 2810 2811 btrfs_free_path(path); 2812 btrfs_free_path(dst_path); 2813 return 0; 2814 } 2815 2816 /* 2817 * follow the dentry parent pointers up the chain and see if any 2818 * of the directories in it require a full commit before they can 2819 * be logged. Returns zero if nothing special needs to be done or 1 if 2820 * a full commit is required. 2821 */ 2822 static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans, 2823 struct inode *inode, 2824 struct dentry *parent, 2825 struct super_block *sb, 2826 u64 last_committed) 2827 { 2828 int ret = 0; 2829 struct btrfs_root *root; 2830 2831 /* 2832 * for regular files, if its inode is already on disk, we don't 2833 * have to worry about the parents at all. This is because 2834 * we can use the last_unlink_trans field to record renames 2835 * and other fun in this file. 
2836 */ 2837 if (S_ISREG(inode->i_mode) && 2838 BTRFS_I(inode)->generation <= last_committed && 2839 BTRFS_I(inode)->last_unlink_trans <= last_committed) 2840 goto out; 2841 2842 if (!S_ISDIR(inode->i_mode)) { 2843 if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb) 2844 goto out; 2845 inode = parent->d_inode; 2846 } 2847 2848 while (1) { 2849 BTRFS_I(inode)->logged_trans = trans->transid; 2850 smp_mb(); 2851 2852 if (BTRFS_I(inode)->last_unlink_trans > last_committed) { 2853 root = BTRFS_I(inode)->root; 2854 2855 /* 2856 * make sure any commits to the log are forced 2857 * to be full commits 2858 */ 2859 root->fs_info->last_trans_log_full_commit = 2860 trans->transid; 2861 ret = 1; 2862 break; 2863 } 2864 2865 if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb) 2866 break; 2867 2868 if (IS_ROOT(parent)) 2869 break; 2870 2871 parent = parent->d_parent; 2872 inode = parent->d_inode; 2873 2874 } 2875 out: 2876 return ret; 2877 } 2878 2879 static int inode_in_log(struct btrfs_trans_handle *trans, 2880 struct inode *inode) 2881 { 2882 struct btrfs_root *root = BTRFS_I(inode)->root; 2883 int ret = 0; 2884 2885 mutex_lock(&root->log_mutex); 2886 if (BTRFS_I(inode)->logged_trans == trans->transid && 2887 BTRFS_I(inode)->last_sub_trans <= root->last_log_commit) 2888 ret = 1; 2889 mutex_unlock(&root->log_mutex); 2890 return ret; 2891 } 2892 2893 2894 /* 2895 * helper function around btrfs_log_inode to make sure newly created 2896 * parent directories also end up in the log. A minimal inode and backref 2897 * only logging is done of any parent directories that are older than 2898 * the last committed transaction 2899 */ 2900 int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, 2901 struct btrfs_root *root, struct inode *inode, 2902 struct dentry *parent, int exists_only) 2903 { 2904 int inode_only = exists_only ? 
LOG_INODE_EXISTS : LOG_INODE_ALL; 2905 struct super_block *sb; 2906 int ret = 0; 2907 u64 last_committed = root->fs_info->last_trans_committed; 2908 2909 sb = inode->i_sb; 2910 2911 if (btrfs_test_opt(root, NOTREELOG)) { 2912 ret = 1; 2913 goto end_no_trans; 2914 } 2915 2916 if (root->fs_info->last_trans_log_full_commit > 2917 root->fs_info->last_trans_committed) { 2918 ret = 1; 2919 goto end_no_trans; 2920 } 2921 2922 if (root != BTRFS_I(inode)->root || 2923 btrfs_root_refs(&root->root_item) == 0) { 2924 ret = 1; 2925 goto end_no_trans; 2926 } 2927 2928 ret = check_parent_dirs_for_sync(trans, inode, parent, 2929 sb, last_committed); 2930 if (ret) 2931 goto end_no_trans; 2932 2933 if (inode_in_log(trans, inode)) { 2934 ret = BTRFS_NO_LOG_SYNC; 2935 goto end_no_trans; 2936 } 2937 2938 start_log_trans(trans, root); 2939 2940 ret = btrfs_log_inode(trans, root, inode, inode_only); 2941 BUG_ON(ret); 2942 2943 /* 2944 * for regular files, if its inode is already on disk, we don't 2945 * have to worry about the parents at all. This is because 2946 * we can use the last_unlink_trans field to record renames 2947 * and other fun in this file. 
2948 */ 2949 if (S_ISREG(inode->i_mode) && 2950 BTRFS_I(inode)->generation <= last_committed && 2951 BTRFS_I(inode)->last_unlink_trans <= last_committed) 2952 goto no_parent; 2953 2954 inode_only = LOG_INODE_EXISTS; 2955 while (1) { 2956 if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb) 2957 break; 2958 2959 inode = parent->d_inode; 2960 if (root != BTRFS_I(inode)->root) 2961 break; 2962 2963 if (BTRFS_I(inode)->generation > 2964 root->fs_info->last_trans_committed) { 2965 ret = btrfs_log_inode(trans, root, inode, inode_only); 2966 BUG_ON(ret); 2967 } 2968 if (IS_ROOT(parent)) 2969 break; 2970 2971 parent = parent->d_parent; 2972 } 2973 no_parent: 2974 ret = 0; 2975 btrfs_end_log_trans(root); 2976 end_no_trans: 2977 return ret; 2978 } 2979 2980 /* 2981 * it is not safe to log dentry if the chunk root has added new 2982 * chunks. This returns 0 if the dentry was logged, and 1 otherwise. 2983 * If this returns 1, you must commit the transaction to safely get your 2984 * data on disk. 
2985 */ 2986 int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, 2987 struct btrfs_root *root, struct dentry *dentry) 2988 { 2989 return btrfs_log_inode_parent(trans, root, dentry->d_inode, 2990 dentry->d_parent, 0); 2991 } 2992 2993 /* 2994 * should be called during mount to recover any replay any log trees 2995 * from the FS 2996 */ 2997 int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) 2998 { 2999 int ret; 3000 struct btrfs_path *path; 3001 struct btrfs_trans_handle *trans; 3002 struct btrfs_key key; 3003 struct btrfs_key found_key; 3004 struct btrfs_key tmp_key; 3005 struct btrfs_root *log; 3006 struct btrfs_fs_info *fs_info = log_root_tree->fs_info; 3007 struct walk_control wc = { 3008 .process_func = process_one_buffer, 3009 .stage = 0, 3010 }; 3011 3012 fs_info->log_root_recovering = 1; 3013 path = btrfs_alloc_path(); 3014 BUG_ON(!path); 3015 3016 trans = btrfs_start_transaction(fs_info->tree_root, 1); 3017 3018 wc.trans = trans; 3019 wc.pin = 1; 3020 3021 walk_log_tree(trans, log_root_tree, &wc); 3022 3023 again: 3024 key.objectid = BTRFS_TREE_LOG_OBJECTID; 3025 key.offset = (u64)-1; 3026 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); 3027 3028 while (1) { 3029 ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0); 3030 if (ret < 0) 3031 break; 3032 if (ret > 0) { 3033 if (path->slots[0] == 0) 3034 break; 3035 path->slots[0]--; 3036 } 3037 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 3038 path->slots[0]); 3039 btrfs_release_path(log_root_tree, path); 3040 if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID) 3041 break; 3042 3043 log = btrfs_read_fs_root_no_radix(log_root_tree, 3044 &found_key); 3045 BUG_ON(!log); 3046 3047 3048 tmp_key.objectid = found_key.offset; 3049 tmp_key.type = BTRFS_ROOT_ITEM_KEY; 3050 tmp_key.offset = (u64)-1; 3051 3052 wc.replay_dest = btrfs_read_fs_root_no_name(fs_info, &tmp_key); 3053 BUG_ON(!wc.replay_dest); 3054 3055 wc.replay_dest->log_root = log; 3056 btrfs_record_root_in_trans(trans, 
wc.replay_dest); 3057 ret = walk_log_tree(trans, log, &wc); 3058 BUG_ON(ret); 3059 3060 if (wc.stage == LOG_WALK_REPLAY_ALL) { 3061 ret = fixup_inode_link_counts(trans, wc.replay_dest, 3062 path); 3063 BUG_ON(ret); 3064 } 3065 3066 key.offset = found_key.offset - 1; 3067 wc.replay_dest->log_root = NULL; 3068 free_extent_buffer(log->node); 3069 free_extent_buffer(log->commit_root); 3070 kfree(log); 3071 3072 if (found_key.offset == 0) 3073 break; 3074 } 3075 btrfs_release_path(log_root_tree, path); 3076 3077 /* step one is to pin it all, step two is to replay just inodes */ 3078 if (wc.pin) { 3079 wc.pin = 0; 3080 wc.process_func = replay_one_buffer; 3081 wc.stage = LOG_WALK_REPLAY_INODES; 3082 goto again; 3083 } 3084 /* step three is to replay everything */ 3085 if (wc.stage < LOG_WALK_REPLAY_ALL) { 3086 wc.stage++; 3087 goto again; 3088 } 3089 3090 btrfs_free_path(path); 3091 3092 free_extent_buffer(log_root_tree->node); 3093 log_root_tree->log_root = NULL; 3094 fs_info->log_root_recovering = 0; 3095 3096 /* step 4: commit the transaction, which also unpins the blocks */ 3097 btrfs_commit_transaction(trans, fs_info->tree_root); 3098 3099 kfree(log_root_tree); 3100 return 0; 3101 } 3102 3103 /* 3104 * there are some corner cases where we want to force a full 3105 * commit instead of allowing a directory to be logged. 3106 * 3107 * They revolve around files there were unlinked from the directory, and 3108 * this function updates the parent directory so that a full commit is 3109 * properly done if it is fsync'd later after the unlinks are done. 3110 */ 3111 void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans, 3112 struct inode *dir, struct inode *inode, 3113 int for_rename) 3114 { 3115 /* 3116 * when we're logging a file, if it hasn't been renamed 3117 * or unlinked, and its inode is fully committed on disk, 3118 * we don't have to worry about walking up the directory chain 3119 * to log its parents. 
3120 * 3121 * So, we use the last_unlink_trans field to put this transid 3122 * into the file. When the file is logged we check it and 3123 * don't log the parents if the file is fully on disk. 3124 */ 3125 if (S_ISREG(inode->i_mode)) 3126 BTRFS_I(inode)->last_unlink_trans = trans->transid; 3127 3128 /* 3129 * if this directory was already logged any new 3130 * names for this file/dir will get recorded 3131 */ 3132 smp_mb(); 3133 if (BTRFS_I(dir)->logged_trans == trans->transid) 3134 return; 3135 3136 /* 3137 * if the inode we're about to unlink was logged, 3138 * the log will be properly updated for any new names 3139 */ 3140 if (BTRFS_I(inode)->logged_trans == trans->transid) 3141 return; 3142 3143 /* 3144 * when renaming files across directories, if the directory 3145 * there we're unlinking from gets fsync'd later on, there's 3146 * no way to find the destination directory later and fsync it 3147 * properly. So, we have to be conservative and force commits 3148 * so the new name gets discovered. 3149 */ 3150 if (for_rename) 3151 goto record; 3152 3153 /* we can safely do the unlink without any special recording */ 3154 return; 3155 3156 record: 3157 BTRFS_I(dir)->last_unlink_trans = trans->transid; 3158 } 3159 3160 /* 3161 * Call this after adding a new name for a file and it will properly 3162 * update the log to reflect the new name. 3163 * 3164 * It will return zero if all goes well, and it will return 1 if a 3165 * full transaction commit is required. 
3166 */ 3167 int btrfs_log_new_name(struct btrfs_trans_handle *trans, 3168 struct inode *inode, struct inode *old_dir, 3169 struct dentry *parent) 3170 { 3171 struct btrfs_root * root = BTRFS_I(inode)->root; 3172 3173 /* 3174 * this will force the logging code to walk the dentry chain 3175 * up for the file 3176 */ 3177 if (S_ISREG(inode->i_mode)) 3178 BTRFS_I(inode)->last_unlink_trans = trans->transid; 3179 3180 /* 3181 * if this inode hasn't been logged and directory we're renaming it 3182 * from hasn't been logged, we don't need to log it 3183 */ 3184 if (BTRFS_I(inode)->logged_trans <= 3185 root->fs_info->last_trans_committed && 3186 (!old_dir || BTRFS_I(old_dir)->logged_trans <= 3187 root->fs_info->last_trans_committed)) 3188 return 0; 3189 3190 return btrfs_log_inode_parent(trans, root, inode, parent, 1); 3191 } 3192 3193