1 /* 2 * Copyright (C) 2008 Oracle. All rights reserved. 3 * 4 * This program is free software; you can redistribute it and/or 5 * modify it under the terms of the GNU General Public 6 * License v2 as published by the Free Software Foundation. 7 * 8 * This program is distributed in the hope that it will be useful, 9 * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 11 * General Public License for more details. 12 * 13 * You should have received a copy of the GNU General Public 14 * License along with this program; if not, write to the 15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330, 16 * Boston, MA 021110-1307, USA. 17 */ 18 19 #include <linux/sched.h> 20 #include "ctree.h" 21 #include "transaction.h" 22 #include "disk-io.h" 23 #include "locking.h" 24 #include "print-tree.h" 25 #include "compat.h" 26 #include "tree-log.h" 27 28 /* magic values for the inode_only field in btrfs_log_inode: 29 * 30 * LOG_INODE_ALL means to log everything 31 * LOG_INODE_EXISTS means to log just enough to recreate the inode 32 * during log replay 33 */ 34 #define LOG_INODE_ALL 0 35 #define LOG_INODE_EXISTS 1 36 37 /* 38 * directory trouble cases 39 * 40 * 1) on rename or unlink, if the inode being unlinked isn't in the fsync 41 * log, we must force a full commit before doing an fsync of the directory 42 * where the unlink was done. 43 * ---> record transid of last unlink/rename per directory 44 * 45 * mkdir foo/some_dir 46 * normal commit 47 * rename foo/some_dir foo2/some_dir 48 * mkdir foo/some_dir 49 * fsync foo/some_dir/some_file 50 * 51 * The fsync above will unlink the original some_dir without recording 52 * it in its new location (foo2). After a crash, some_dir will be gone 53 * unless the fsync of some_file forces a full commit 54 * 55 * 2) we must log any new names for any file or dir that is in the fsync 56 * log. ---> check inode while renaming/linking. 
57 * 58 * 2a) we must log any new names for any file or dir during rename 59 * when the directory they are being removed from was logged. 60 * ---> check inode and old parent dir during rename 61 * 62 * 2a is actually the more important variant. With the extra logging 63 * a crash might unlink the old name without recreating the new one 64 * 65 * 3) after a crash, we must go through any directories with a link count 66 * of zero and redo the rm -rf 67 * 68 * mkdir f1/foo 69 * normal commit 70 * rm -rf f1/foo 71 * fsync(f1) 72 * 73 * The directory f1 was fully removed from the FS, but fsync was never 74 * called on f1, only its parent dir. After a crash the rm -rf must 75 * be replayed. This must be able to recurse down the entire 76 * directory tree. The inode link count fixup code takes care of the 77 * ugly details. 78 */ 79 80 /* 81 * stages for the tree walking. The first 82 * stage (0) is to only pin down the blocks we find 83 * the second stage (1) is to make sure that all the inodes 84 * we find in the log are created in the subvolume. 85 * 86 * The last stage is to deal with directories and links and extents 87 * and all the other fun semantics 88 */ 89 #define LOG_WALK_PIN_ONLY 0 90 #define LOG_WALK_REPLAY_INODES 1 91 #define LOG_WALK_REPLAY_ALL 2 92 93 static int btrfs_log_inode(struct btrfs_trans_handle *trans, 94 struct btrfs_root *root, struct inode *inode, 95 int inode_only); 96 static int link_to_fixup_dir(struct btrfs_trans_handle *trans, 97 struct btrfs_root *root, 98 struct btrfs_path *path, u64 objectid); 99 static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, 100 struct btrfs_root *root, 101 struct btrfs_root *log, 102 struct btrfs_path *path, 103 u64 dirid, int del_all); 104 105 /* 106 * tree logging is a special write ahead log used to make sure that 107 * fsyncs and O_SYNCs can happen without doing full tree commits. 
108 * 109 * Full tree commits are expensive because they require commonly 110 * modified blocks to be recowed, creating many dirty pages in the 111 * extent tree an 4x-6x higher write load than ext3. 112 * 113 * Instead of doing a tree commit on every fsync, we use the 114 * key ranges and transaction ids to find items for a given file or directory 115 * that have changed in this transaction. Those items are copied into 116 * a special tree (one per subvolume root), that tree is written to disk 117 * and then the fsync is considered complete. 118 * 119 * After a crash, items are copied out of the log-tree back into the 120 * subvolume tree. Any file data extents found are recorded in the extent 121 * allocation tree, and the log-tree freed. 122 * 123 * The log tree is read three times, once to pin down all the extents it is 124 * using in ram and once, once to create all the inodes logged in the tree 125 * and once to do all the other items. 126 */ 127 128 /* 129 * start a sub transaction and setup the log tree 130 * this increments the log tree writer count to make the people 131 * syncing the tree wait for us to finish 132 */ 133 static int start_log_trans(struct btrfs_trans_handle *trans, 134 struct btrfs_root *root) 135 { 136 int ret; 137 138 mutex_lock(&root->log_mutex); 139 if (root->log_root) { 140 if (!root->log_start_pid) { 141 root->log_start_pid = current->pid; 142 root->log_multiple_pids = false; 143 } else if (root->log_start_pid != current->pid) { 144 root->log_multiple_pids = true; 145 } 146 147 root->log_batch++; 148 atomic_inc(&root->log_writers); 149 mutex_unlock(&root->log_mutex); 150 return 0; 151 } 152 root->log_multiple_pids = false; 153 root->log_start_pid = current->pid; 154 mutex_lock(&root->fs_info->tree_log_mutex); 155 if (!root->fs_info->log_root_tree) { 156 ret = btrfs_init_log_root_tree(trans, root->fs_info); 157 BUG_ON(ret); 158 } 159 if (!root->log_root) { 160 ret = btrfs_add_log_tree(trans, root); 161 BUG_ON(ret); 162 } 163 
mutex_unlock(&root->fs_info->tree_log_mutex); 164 root->log_batch++; 165 atomic_inc(&root->log_writers); 166 mutex_unlock(&root->log_mutex); 167 return 0; 168 } 169 170 /* 171 * returns 0 if there was a log transaction running and we were able 172 * to join, or returns -ENOENT if there were not transactions 173 * in progress 174 */ 175 static int join_running_log_trans(struct btrfs_root *root) 176 { 177 int ret = -ENOENT; 178 179 smp_mb(); 180 if (!root->log_root) 181 return -ENOENT; 182 183 mutex_lock(&root->log_mutex); 184 if (root->log_root) { 185 ret = 0; 186 atomic_inc(&root->log_writers); 187 } 188 mutex_unlock(&root->log_mutex); 189 return ret; 190 } 191 192 /* 193 * This either makes the current running log transaction wait 194 * until you call btrfs_end_log_trans() or it makes any future 195 * log transactions wait until you call btrfs_end_log_trans() 196 */ 197 int btrfs_pin_log_trans(struct btrfs_root *root) 198 { 199 int ret = -ENOENT; 200 201 mutex_lock(&root->log_mutex); 202 atomic_inc(&root->log_writers); 203 mutex_unlock(&root->log_mutex); 204 return ret; 205 } 206 207 /* 208 * indicate we're done making changes to the log tree 209 * and wake up anyone waiting to do a sync 210 */ 211 int btrfs_end_log_trans(struct btrfs_root *root) 212 { 213 if (atomic_dec_and_test(&root->log_writers)) { 214 smp_mb(); 215 if (waitqueue_active(&root->log_writer_wait)) 216 wake_up(&root->log_writer_wait); 217 } 218 return 0; 219 } 220 221 222 /* 223 * the walk control struct is used to pass state down the chain when 224 * processing the log tree. The stage field tells us which part 225 * of the log tree processing we are currently doing. The others 226 * are state fields used for that specific part 227 */ 228 struct walk_control { 229 /* should we free the extent on disk when done? This is used 230 * at transaction commit time while freeing a log tree 231 */ 232 int free; 233 234 /* should we write out the extent buffer? 
This is used 235 * while flushing the log tree to disk during a sync 236 */ 237 int write; 238 239 /* should we wait for the extent buffer io to finish? Also used 240 * while flushing the log tree to disk for a sync 241 */ 242 int wait; 243 244 /* pin only walk, we record which extents on disk belong to the 245 * log trees 246 */ 247 int pin; 248 249 /* what stage of the replay code we're currently in */ 250 int stage; 251 252 /* the root we are currently replaying */ 253 struct btrfs_root *replay_dest; 254 255 /* the trans handle for the current replay */ 256 struct btrfs_trans_handle *trans; 257 258 /* the function that gets used to process blocks we find in the 259 * tree. Note the extent_buffer might not be up to date when it is 260 * passed in, and it must be checked or read if you need the data 261 * inside it 262 */ 263 int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb, 264 struct walk_control *wc, u64 gen); 265 }; 266 267 /* 268 * process_func used to pin down extents, write them or wait on them 269 */ 270 static int process_one_buffer(struct btrfs_root *log, 271 struct extent_buffer *eb, 272 struct walk_control *wc, u64 gen) 273 { 274 if (wc->pin) 275 btrfs_pin_extent(log->fs_info->extent_root, 276 eb->start, eb->len, 0); 277 278 if (btrfs_buffer_uptodate(eb, gen)) { 279 if (wc->write) 280 btrfs_write_tree_block(eb); 281 if (wc->wait) 282 btrfs_wait_tree_block_writeback(eb); 283 } 284 return 0; 285 } 286 287 /* 288 * Item overwrite used by replay and tree logging. eb, slot and key all refer 289 * to the src data we are copying out. 290 * 291 * root is the tree we are copying into, and path is a scratch 292 * path for use in this function (it should be released on entry and 293 * will be released on exit). 294 * 295 * If the key is already in the destination tree the existing item is 296 * overwritten. If the existing item isn't big enough, it is extended. 297 * If it is too large, it is truncated. 
298 * 299 * If the key isn't in the destination yet, a new item is inserted. 300 */ 301 static noinline int overwrite_item(struct btrfs_trans_handle *trans, 302 struct btrfs_root *root, 303 struct btrfs_path *path, 304 struct extent_buffer *eb, int slot, 305 struct btrfs_key *key) 306 { 307 int ret; 308 u32 item_size; 309 u64 saved_i_size = 0; 310 int save_old_i_size = 0; 311 unsigned long src_ptr; 312 unsigned long dst_ptr; 313 int overwrite_root = 0; 314 315 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) 316 overwrite_root = 1; 317 318 item_size = btrfs_item_size_nr(eb, slot); 319 src_ptr = btrfs_item_ptr_offset(eb, slot); 320 321 /* look for the key in the destination tree */ 322 ret = btrfs_search_slot(NULL, root, key, path, 0, 0); 323 if (ret == 0) { 324 char *src_copy; 325 char *dst_copy; 326 u32 dst_size = btrfs_item_size_nr(path->nodes[0], 327 path->slots[0]); 328 if (dst_size != item_size) 329 goto insert; 330 331 if (item_size == 0) { 332 btrfs_release_path(root, path); 333 return 0; 334 } 335 dst_copy = kmalloc(item_size, GFP_NOFS); 336 src_copy = kmalloc(item_size, GFP_NOFS); 337 338 read_extent_buffer(eb, src_copy, src_ptr, item_size); 339 340 dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); 341 read_extent_buffer(path->nodes[0], dst_copy, dst_ptr, 342 item_size); 343 ret = memcmp(dst_copy, src_copy, item_size); 344 345 kfree(dst_copy); 346 kfree(src_copy); 347 /* 348 * they have the same contents, just return, this saves 349 * us from cowing blocks in the destination tree and doing 350 * extra writes that may not have been done by a previous 351 * sync 352 */ 353 if (ret == 0) { 354 btrfs_release_path(root, path); 355 return 0; 356 } 357 358 } 359 insert: 360 btrfs_release_path(root, path); 361 /* try to insert the key into the destination tree */ 362 ret = btrfs_insert_empty_item(trans, root, path, 363 key, item_size); 364 365 /* make sure any existing item is the correct size */ 366 if (ret == -EEXIST) { 367 u32 found_size; 
368 found_size = btrfs_item_size_nr(path->nodes[0], 369 path->slots[0]); 370 if (found_size > item_size) { 371 btrfs_truncate_item(trans, root, path, item_size, 1); 372 } else if (found_size < item_size) { 373 ret = btrfs_extend_item(trans, root, path, 374 item_size - found_size); 375 BUG_ON(ret); 376 } 377 } else if (ret) { 378 BUG(); 379 } 380 dst_ptr = btrfs_item_ptr_offset(path->nodes[0], 381 path->slots[0]); 382 383 /* don't overwrite an existing inode if the generation number 384 * was logged as zero. This is done when the tree logging code 385 * is just logging an inode to make sure it exists after recovery. 386 * 387 * Also, don't overwrite i_size on directories during replay. 388 * log replay inserts and removes directory items based on the 389 * state of the tree found in the subvolume, and i_size is modified 390 * as it goes 391 */ 392 if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) { 393 struct btrfs_inode_item *src_item; 394 struct btrfs_inode_item *dst_item; 395 396 src_item = (struct btrfs_inode_item *)src_ptr; 397 dst_item = (struct btrfs_inode_item *)dst_ptr; 398 399 if (btrfs_inode_generation(eb, src_item) == 0) 400 goto no_copy; 401 402 if (overwrite_root && 403 S_ISDIR(btrfs_inode_mode(eb, src_item)) && 404 S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) { 405 save_old_i_size = 1; 406 saved_i_size = btrfs_inode_size(path->nodes[0], 407 dst_item); 408 } 409 } 410 411 copy_extent_buffer(path->nodes[0], eb, dst_ptr, 412 src_ptr, item_size); 413 414 if (save_old_i_size) { 415 struct btrfs_inode_item *dst_item; 416 dst_item = (struct btrfs_inode_item *)dst_ptr; 417 btrfs_set_inode_size(path->nodes[0], dst_item, saved_i_size); 418 } 419 420 /* make sure the generation is filled in */ 421 if (key->type == BTRFS_INODE_ITEM_KEY) { 422 struct btrfs_inode_item *dst_item; 423 dst_item = (struct btrfs_inode_item *)dst_ptr; 424 if (btrfs_inode_generation(path->nodes[0], dst_item) == 0) { 425 btrfs_set_inode_generation(path->nodes[0], dst_item, 
426 trans->transid); 427 } 428 } 429 no_copy: 430 btrfs_mark_buffer_dirty(path->nodes[0]); 431 btrfs_release_path(root, path); 432 return 0; 433 } 434 435 /* 436 * simple helper to read an inode off the disk from a given root 437 * This can only be called for subvolume roots and not for the log 438 */ 439 static noinline struct inode *read_one_inode(struct btrfs_root *root, 440 u64 objectid) 441 { 442 struct btrfs_key key; 443 struct inode *inode; 444 445 key.objectid = objectid; 446 key.type = BTRFS_INODE_ITEM_KEY; 447 key.offset = 0; 448 inode = btrfs_iget(root->fs_info->sb, &key, root); 449 if (IS_ERR(inode)) { 450 inode = NULL; 451 } else if (is_bad_inode(inode)) { 452 iput(inode); 453 inode = NULL; 454 } 455 return inode; 456 } 457 458 /* replays a single extent in 'eb' at 'slot' with 'key' into the 459 * subvolume 'root'. path is released on entry and should be released 460 * on exit. 461 * 462 * extents in the log tree have not been allocated out of the extent 463 * tree yet. So, this completes the allocation, taking a reference 464 * as required if the extent already exists or creating a new extent 465 * if it isn't in the extent allocation tree yet. 466 * 467 * The extent is inserted into the file, dropping any existing extents 468 * from the file that overlap the new one. 
469 */ 470 static noinline int replay_one_extent(struct btrfs_trans_handle *trans, 471 struct btrfs_root *root, 472 struct btrfs_path *path, 473 struct extent_buffer *eb, int slot, 474 struct btrfs_key *key) 475 { 476 int found_type; 477 u64 mask = root->sectorsize - 1; 478 u64 extent_end; 479 u64 alloc_hint; 480 u64 start = key->offset; 481 u64 saved_nbytes; 482 struct btrfs_file_extent_item *item; 483 struct inode *inode = NULL; 484 unsigned long size; 485 int ret = 0; 486 487 item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item); 488 found_type = btrfs_file_extent_type(eb, item); 489 490 if (found_type == BTRFS_FILE_EXTENT_REG || 491 found_type == BTRFS_FILE_EXTENT_PREALLOC) 492 extent_end = start + btrfs_file_extent_num_bytes(eb, item); 493 else if (found_type == BTRFS_FILE_EXTENT_INLINE) { 494 size = btrfs_file_extent_inline_len(eb, item); 495 extent_end = (start + size + mask) & ~mask; 496 } else { 497 ret = 0; 498 goto out; 499 } 500 501 inode = read_one_inode(root, key->objectid); 502 if (!inode) { 503 ret = -EIO; 504 goto out; 505 } 506 507 /* 508 * first check to see if we already have this extent in the 509 * file. This must be done before the btrfs_drop_extents run 510 * so we don't try to drop this extent. 
511 */ 512 ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino, 513 start, 0); 514 515 if (ret == 0 && 516 (found_type == BTRFS_FILE_EXTENT_REG || 517 found_type == BTRFS_FILE_EXTENT_PREALLOC)) { 518 struct btrfs_file_extent_item cmp1; 519 struct btrfs_file_extent_item cmp2; 520 struct btrfs_file_extent_item *existing; 521 struct extent_buffer *leaf; 522 523 leaf = path->nodes[0]; 524 existing = btrfs_item_ptr(leaf, path->slots[0], 525 struct btrfs_file_extent_item); 526 527 read_extent_buffer(eb, &cmp1, (unsigned long)item, 528 sizeof(cmp1)); 529 read_extent_buffer(leaf, &cmp2, (unsigned long)existing, 530 sizeof(cmp2)); 531 532 /* 533 * we already have a pointer to this exact extent, 534 * we don't have to do anything 535 */ 536 if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) { 537 btrfs_release_path(root, path); 538 goto out; 539 } 540 } 541 btrfs_release_path(root, path); 542 543 saved_nbytes = inode_get_bytes(inode); 544 /* drop any overlapping extents */ 545 ret = btrfs_drop_extents(trans, inode, start, extent_end, 546 &alloc_hint, 1); 547 BUG_ON(ret); 548 549 if (found_type == BTRFS_FILE_EXTENT_REG || 550 found_type == BTRFS_FILE_EXTENT_PREALLOC) { 551 u64 offset; 552 unsigned long dest_offset; 553 struct btrfs_key ins; 554 555 ret = btrfs_insert_empty_item(trans, root, path, key, 556 sizeof(*item)); 557 BUG_ON(ret); 558 dest_offset = btrfs_item_ptr_offset(path->nodes[0], 559 path->slots[0]); 560 copy_extent_buffer(path->nodes[0], eb, dest_offset, 561 (unsigned long)item, sizeof(*item)); 562 563 ins.objectid = btrfs_file_extent_disk_bytenr(eb, item); 564 ins.offset = btrfs_file_extent_disk_num_bytes(eb, item); 565 ins.type = BTRFS_EXTENT_ITEM_KEY; 566 offset = key->offset - btrfs_file_extent_offset(eb, item); 567 568 if (ins.objectid > 0) { 569 u64 csum_start; 570 u64 csum_end; 571 LIST_HEAD(ordered_sums); 572 /* 573 * is this extent already allocated in the extent 574 * allocation tree? 
If so, just add a reference 575 */ 576 ret = btrfs_lookup_extent(root, ins.objectid, 577 ins.offset); 578 if (ret == 0) { 579 ret = btrfs_inc_extent_ref(trans, root, 580 ins.objectid, ins.offset, 581 0, root->root_key.objectid, 582 key->objectid, offset); 583 } else { 584 /* 585 * insert the extent pointer in the extent 586 * allocation tree 587 */ 588 ret = btrfs_alloc_logged_file_extent(trans, 589 root, root->root_key.objectid, 590 key->objectid, offset, &ins); 591 BUG_ON(ret); 592 } 593 btrfs_release_path(root, path); 594 595 if (btrfs_file_extent_compression(eb, item)) { 596 csum_start = ins.objectid; 597 csum_end = csum_start + ins.offset; 598 } else { 599 csum_start = ins.objectid + 600 btrfs_file_extent_offset(eb, item); 601 csum_end = csum_start + 602 btrfs_file_extent_num_bytes(eb, item); 603 } 604 605 ret = btrfs_lookup_csums_range(root->log_root, 606 csum_start, csum_end - 1, 607 &ordered_sums); 608 BUG_ON(ret); 609 while (!list_empty(&ordered_sums)) { 610 struct btrfs_ordered_sum *sums; 611 sums = list_entry(ordered_sums.next, 612 struct btrfs_ordered_sum, 613 list); 614 ret = btrfs_csum_file_blocks(trans, 615 root->fs_info->csum_root, 616 sums); 617 BUG_ON(ret); 618 list_del(&sums->list); 619 kfree(sums); 620 } 621 } else { 622 btrfs_release_path(root, path); 623 } 624 } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { 625 /* inline extents are easy, we just overwrite them */ 626 ret = overwrite_item(trans, root, path, eb, slot, key); 627 BUG_ON(ret); 628 } 629 630 inode_set_bytes(inode, saved_nbytes); 631 btrfs_update_inode(trans, root, inode); 632 out: 633 if (inode) 634 iput(inode); 635 return ret; 636 } 637 638 /* 639 * when cleaning up conflicts between the directory names in the 640 * subvolume, directory names in the log and directory names in the 641 * inode back references, we may have to unlink inodes from directories. 
642 * 643 * This is a helper function to do the unlink of a specific directory 644 * item 645 */ 646 static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans, 647 struct btrfs_root *root, 648 struct btrfs_path *path, 649 struct inode *dir, 650 struct btrfs_dir_item *di) 651 { 652 struct inode *inode; 653 char *name; 654 int name_len; 655 struct extent_buffer *leaf; 656 struct btrfs_key location; 657 int ret; 658 659 leaf = path->nodes[0]; 660 661 btrfs_dir_item_key_to_cpu(leaf, di, &location); 662 name_len = btrfs_dir_name_len(leaf, di); 663 name = kmalloc(name_len, GFP_NOFS); 664 read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len); 665 btrfs_release_path(root, path); 666 667 inode = read_one_inode(root, location.objectid); 668 BUG_ON(!inode); 669 670 ret = link_to_fixup_dir(trans, root, path, location.objectid); 671 BUG_ON(ret); 672 673 ret = btrfs_unlink_inode(trans, root, dir, inode, name, name_len); 674 BUG_ON(ret); 675 kfree(name); 676 677 iput(inode); 678 return ret; 679 } 680 681 /* 682 * helper function to see if a given name and sequence number found 683 * in an inode back reference are already in a directory and correctly 684 * point to this inode 685 */ 686 static noinline int inode_in_dir(struct btrfs_root *root, 687 struct btrfs_path *path, 688 u64 dirid, u64 objectid, u64 index, 689 const char *name, int name_len) 690 { 691 struct btrfs_dir_item *di; 692 struct btrfs_key location; 693 int match = 0; 694 695 di = btrfs_lookup_dir_index_item(NULL, root, path, dirid, 696 index, name, name_len, 0); 697 if (di && !IS_ERR(di)) { 698 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location); 699 if (location.objectid != objectid) 700 goto out; 701 } else 702 goto out; 703 btrfs_release_path(root, path); 704 705 di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 0); 706 if (di && !IS_ERR(di)) { 707 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location); 708 if (location.objectid != objectid) 709 goto out; 710 } 
else 711 goto out; 712 match = 1; 713 out: 714 btrfs_release_path(root, path); 715 return match; 716 } 717 718 /* 719 * helper function to check a log tree for a named back reference in 720 * an inode. This is used to decide if a back reference that is 721 * found in the subvolume conflicts with what we find in the log. 722 * 723 * inode backreferences may have multiple refs in a single item, 724 * during replay we process one reference at a time, and we don't 725 * want to delete valid links to a file from the subvolume if that 726 * link is also in the log. 727 */ 728 static noinline int backref_in_log(struct btrfs_root *log, 729 struct btrfs_key *key, 730 char *name, int namelen) 731 { 732 struct btrfs_path *path; 733 struct btrfs_inode_ref *ref; 734 unsigned long ptr; 735 unsigned long ptr_end; 736 unsigned long name_ptr; 737 int found_name_len; 738 int item_size; 739 int ret; 740 int match = 0; 741 742 path = btrfs_alloc_path(); 743 ret = btrfs_search_slot(NULL, log, key, path, 0, 0); 744 if (ret != 0) 745 goto out; 746 747 item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]); 748 ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); 749 ptr_end = ptr + item_size; 750 while (ptr < ptr_end) { 751 ref = (struct btrfs_inode_ref *)ptr; 752 found_name_len = btrfs_inode_ref_name_len(path->nodes[0], ref); 753 if (found_name_len == namelen) { 754 name_ptr = (unsigned long)(ref + 1); 755 ret = memcmp_extent_buffer(path->nodes[0], name, 756 name_ptr, namelen); 757 if (ret == 0) { 758 match = 1; 759 goto out; 760 } 761 } 762 ptr = (unsigned long)(ref + 1) + found_name_len; 763 } 764 out: 765 btrfs_free_path(path); 766 return match; 767 } 768 769 770 /* 771 * replay one inode back reference item found in the log tree. 772 * eb, slot and key refer to the buffer and key found in the log tree. 773 * root is the destination we are replaying into, and path is for temp 774 * use by this function. (it should be released on return). 
775 */ 776 static noinline int add_inode_ref(struct btrfs_trans_handle *trans, 777 struct btrfs_root *root, 778 struct btrfs_root *log, 779 struct btrfs_path *path, 780 struct extent_buffer *eb, int slot, 781 struct btrfs_key *key) 782 { 783 struct inode *dir; 784 int ret; 785 struct btrfs_key location; 786 struct btrfs_inode_ref *ref; 787 struct btrfs_dir_item *di; 788 struct inode *inode; 789 char *name; 790 int namelen; 791 unsigned long ref_ptr; 792 unsigned long ref_end; 793 794 location.objectid = key->objectid; 795 location.type = BTRFS_INODE_ITEM_KEY; 796 location.offset = 0; 797 798 /* 799 * it is possible that we didn't log all the parent directories 800 * for a given inode. If we don't find the dir, just don't 801 * copy the back ref in. The link count fixup code will take 802 * care of the rest 803 */ 804 dir = read_one_inode(root, key->offset); 805 if (!dir) 806 return -ENOENT; 807 808 inode = read_one_inode(root, key->objectid); 809 BUG_ON(!inode); 810 811 ref_ptr = btrfs_item_ptr_offset(eb, slot); 812 ref_end = ref_ptr + btrfs_item_size_nr(eb, slot); 813 814 again: 815 ref = (struct btrfs_inode_ref *)ref_ptr; 816 817 namelen = btrfs_inode_ref_name_len(eb, ref); 818 name = kmalloc(namelen, GFP_NOFS); 819 BUG_ON(!name); 820 821 read_extent_buffer(eb, name, (unsigned long)(ref + 1), namelen); 822 823 /* if we already have a perfect match, we're done */ 824 if (inode_in_dir(root, path, dir->i_ino, inode->i_ino, 825 btrfs_inode_ref_index(eb, ref), 826 name, namelen)) { 827 goto out; 828 } 829 830 /* 831 * look for a conflicting back reference in the metadata. 832 * if we find one we have to unlink that name of the file 833 * before we add our new link. Later on, we overwrite any 834 * existing back reference, and we don't want to create 835 * dangling pointers in the directory. 
836 */ 837 conflict_again: 838 ret = btrfs_search_slot(NULL, root, key, path, 0, 0); 839 if (ret == 0) { 840 char *victim_name; 841 int victim_name_len; 842 struct btrfs_inode_ref *victim_ref; 843 unsigned long ptr; 844 unsigned long ptr_end; 845 struct extent_buffer *leaf = path->nodes[0]; 846 847 /* are we trying to overwrite a back ref for the root directory 848 * if so, just jump out, we're done 849 */ 850 if (key->objectid == key->offset) 851 goto out_nowrite; 852 853 /* check all the names in this back reference to see 854 * if they are in the log. if so, we allow them to stay 855 * otherwise they must be unlinked as a conflict 856 */ 857 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); 858 ptr_end = ptr + btrfs_item_size_nr(leaf, path->slots[0]); 859 while (ptr < ptr_end) { 860 victim_ref = (struct btrfs_inode_ref *)ptr; 861 victim_name_len = btrfs_inode_ref_name_len(leaf, 862 victim_ref); 863 victim_name = kmalloc(victim_name_len, GFP_NOFS); 864 BUG_ON(!victim_name); 865 866 read_extent_buffer(leaf, victim_name, 867 (unsigned long)(victim_ref + 1), 868 victim_name_len); 869 870 if (!backref_in_log(log, key, victim_name, 871 victim_name_len)) { 872 btrfs_inc_nlink(inode); 873 btrfs_release_path(root, path); 874 875 ret = btrfs_unlink_inode(trans, root, dir, 876 inode, victim_name, 877 victim_name_len); 878 kfree(victim_name); 879 btrfs_release_path(root, path); 880 goto conflict_again; 881 } 882 kfree(victim_name); 883 ptr = (unsigned long)(victim_ref + 1) + victim_name_len; 884 } 885 BUG_ON(ret); 886 } 887 btrfs_release_path(root, path); 888 889 /* look for a conflicting sequence number */ 890 di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino, 891 btrfs_inode_ref_index(eb, ref), 892 name, namelen, 0); 893 if (di && !IS_ERR(di)) { 894 ret = drop_one_dir_item(trans, root, path, dir, di); 895 BUG_ON(ret); 896 } 897 btrfs_release_path(root, path); 898 899 900 /* look for a conflicting name */ 901 di = btrfs_lookup_dir_item(trans, root, path, 
dir->i_ino, 902 name, namelen, 0); 903 if (di && !IS_ERR(di)) { 904 ret = drop_one_dir_item(trans, root, path, dir, di); 905 BUG_ON(ret); 906 } 907 btrfs_release_path(root, path); 908 909 /* insert our name */ 910 ret = btrfs_add_link(trans, dir, inode, name, namelen, 0, 911 btrfs_inode_ref_index(eb, ref)); 912 BUG_ON(ret); 913 914 btrfs_update_inode(trans, root, inode); 915 916 out: 917 ref_ptr = (unsigned long)(ref + 1) + namelen; 918 kfree(name); 919 if (ref_ptr < ref_end) 920 goto again; 921 922 /* finally write the back reference in the inode */ 923 ret = overwrite_item(trans, root, path, eb, slot, key); 924 BUG_ON(ret); 925 926 out_nowrite: 927 btrfs_release_path(root, path); 928 iput(dir); 929 iput(inode); 930 return 0; 931 } 932 933 static int insert_orphan_item(struct btrfs_trans_handle *trans, 934 struct btrfs_root *root, u64 offset) 935 { 936 int ret; 937 ret = btrfs_find_orphan_item(root, offset); 938 if (ret > 0) 939 ret = btrfs_insert_orphan_item(trans, root, offset); 940 return ret; 941 } 942 943 944 /* 945 * There are a few corners where the link count of the file can't 946 * be properly maintained during replay. So, instead of adding 947 * lots of complexity to the log code, we just scan the backrefs 948 * for any file that has been through replay. 949 * 950 * The scan will update the link count on the inode to reflect the 951 * number of back refs found. If it goes down to zero, the iput 952 * will free the inode. 
953 */ 954 static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans, 955 struct btrfs_root *root, 956 struct inode *inode) 957 { 958 struct btrfs_path *path; 959 int ret; 960 struct btrfs_key key; 961 u64 nlink = 0; 962 unsigned long ptr; 963 unsigned long ptr_end; 964 int name_len; 965 966 key.objectid = inode->i_ino; 967 key.type = BTRFS_INODE_REF_KEY; 968 key.offset = (u64)-1; 969 970 path = btrfs_alloc_path(); 971 972 while (1) { 973 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 974 if (ret < 0) 975 break; 976 if (ret > 0) { 977 if (path->slots[0] == 0) 978 break; 979 path->slots[0]--; 980 } 981 btrfs_item_key_to_cpu(path->nodes[0], &key, 982 path->slots[0]); 983 if (key.objectid != inode->i_ino || 984 key.type != BTRFS_INODE_REF_KEY) 985 break; 986 ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); 987 ptr_end = ptr + btrfs_item_size_nr(path->nodes[0], 988 path->slots[0]); 989 while (ptr < ptr_end) { 990 struct btrfs_inode_ref *ref; 991 992 ref = (struct btrfs_inode_ref *)ptr; 993 name_len = btrfs_inode_ref_name_len(path->nodes[0], 994 ref); 995 ptr = (unsigned long)(ref + 1) + name_len; 996 nlink++; 997 } 998 999 if (key.offset == 0) 1000 break; 1001 key.offset--; 1002 btrfs_release_path(root, path); 1003 } 1004 btrfs_release_path(root, path); 1005 if (nlink != inode->i_nlink) { 1006 inode->i_nlink = nlink; 1007 btrfs_update_inode(trans, root, inode); 1008 } 1009 BTRFS_I(inode)->index_cnt = (u64)-1; 1010 1011 if (inode->i_nlink == 0) { 1012 if (S_ISDIR(inode->i_mode)) { 1013 ret = replay_dir_deletes(trans, root, NULL, path, 1014 inode->i_ino, 1); 1015 BUG_ON(ret); 1016 } 1017 ret = insert_orphan_item(trans, root, inode->i_ino); 1018 BUG_ON(ret); 1019 } 1020 btrfs_free_path(path); 1021 1022 return 0; 1023 } 1024 1025 static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans, 1026 struct btrfs_root *root, 1027 struct btrfs_path *path) 1028 { 1029 int ret; 1030 struct btrfs_key key; 1031 struct inode 
*inode; 1032 1033 key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID; 1034 key.type = BTRFS_ORPHAN_ITEM_KEY; 1035 key.offset = (u64)-1; 1036 while (1) { 1037 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 1038 if (ret < 0) 1039 break; 1040 1041 if (ret == 1) { 1042 if (path->slots[0] == 0) 1043 break; 1044 path->slots[0]--; 1045 } 1046 1047 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 1048 if (key.objectid != BTRFS_TREE_LOG_FIXUP_OBJECTID || 1049 key.type != BTRFS_ORPHAN_ITEM_KEY) 1050 break; 1051 1052 ret = btrfs_del_item(trans, root, path); 1053 BUG_ON(ret); 1054 1055 btrfs_release_path(root, path); 1056 inode = read_one_inode(root, key.offset); 1057 BUG_ON(!inode); 1058 1059 ret = fixup_inode_link_count(trans, root, inode); 1060 BUG_ON(ret); 1061 1062 iput(inode); 1063 1064 /* 1065 * fixup on a directory may create new entries, 1066 * make sure we always look for the highset possible 1067 * offset 1068 */ 1069 key.offset = (u64)-1; 1070 } 1071 btrfs_release_path(root, path); 1072 return 0; 1073 } 1074 1075 1076 /* 1077 * record a given inode in the fixup dir so we can check its link 1078 * count when replay is done. 
The link count is incremented here 1079 * so the inode won't go away until we check it 1080 */ 1081 static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans, 1082 struct btrfs_root *root, 1083 struct btrfs_path *path, 1084 u64 objectid) 1085 { 1086 struct btrfs_key key; 1087 int ret = 0; 1088 struct inode *inode; 1089 1090 inode = read_one_inode(root, objectid); 1091 BUG_ON(!inode); 1092 1093 key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID; 1094 btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY); 1095 key.offset = objectid; 1096 1097 ret = btrfs_insert_empty_item(trans, root, path, &key, 0); 1098 1099 btrfs_release_path(root, path); 1100 if (ret == 0) { 1101 btrfs_inc_nlink(inode); 1102 btrfs_update_inode(trans, root, inode); 1103 } else if (ret == -EEXIST) { 1104 ret = 0; 1105 } else { 1106 BUG(); 1107 } 1108 iput(inode); 1109 1110 return ret; 1111 } 1112 1113 /* 1114 * when replaying the log for a directory, we only insert names 1115 * for inodes that actually exist. This means an fsync on a directory 1116 * does not implicitly fsync all the new files in it 1117 */ 1118 static noinline int insert_one_name(struct btrfs_trans_handle *trans, 1119 struct btrfs_root *root, 1120 struct btrfs_path *path, 1121 u64 dirid, u64 index, 1122 char *name, int name_len, u8 type, 1123 struct btrfs_key *location) 1124 { 1125 struct inode *inode; 1126 struct inode *dir; 1127 int ret; 1128 1129 inode = read_one_inode(root, location->objectid); 1130 if (!inode) 1131 return -ENOENT; 1132 1133 dir = read_one_inode(root, dirid); 1134 if (!dir) { 1135 iput(inode); 1136 return -EIO; 1137 } 1138 ret = btrfs_add_link(trans, dir, inode, name, name_len, 1, index); 1139 1140 /* FIXME, put inode into FIXUP list */ 1141 1142 iput(inode); 1143 iput(dir); 1144 return ret; 1145 } 1146 1147 /* 1148 * take a single entry in a log directory item and replay it into 1149 * the subvolume. 
1150 * 1151 * if a conflicting item exists in the subdirectory already, 1152 * the inode it points to is unlinked and put into the link count 1153 * fix up tree. 1154 * 1155 * If a name from the log points to a file or directory that does 1156 * not exist in the FS, it is skipped. fsyncs on directories 1157 * do not force down inodes inside that directory, just changes to the 1158 * names or unlinks in a directory. 1159 */ 1160 static noinline int replay_one_name(struct btrfs_trans_handle *trans, 1161 struct btrfs_root *root, 1162 struct btrfs_path *path, 1163 struct extent_buffer *eb, 1164 struct btrfs_dir_item *di, 1165 struct btrfs_key *key) 1166 { 1167 char *name; 1168 int name_len; 1169 struct btrfs_dir_item *dst_di; 1170 struct btrfs_key found_key; 1171 struct btrfs_key log_key; 1172 struct inode *dir; 1173 u8 log_type; 1174 int exists; 1175 int ret; 1176 1177 dir = read_one_inode(root, key->objectid); 1178 BUG_ON(!dir); 1179 1180 name_len = btrfs_dir_name_len(eb, di); 1181 name = kmalloc(name_len, GFP_NOFS); 1182 log_type = btrfs_dir_type(eb, di); 1183 read_extent_buffer(eb, name, (unsigned long)(di + 1), 1184 name_len); 1185 1186 btrfs_dir_item_key_to_cpu(eb, di, &log_key); 1187 exists = btrfs_lookup_inode(trans, root, path, &log_key, 0); 1188 if (exists == 0) 1189 exists = 1; 1190 else 1191 exists = 0; 1192 btrfs_release_path(root, path); 1193 1194 if (key->type == BTRFS_DIR_ITEM_KEY) { 1195 dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid, 1196 name, name_len, 1); 1197 } else if (key->type == BTRFS_DIR_INDEX_KEY) { 1198 dst_di = btrfs_lookup_dir_index_item(trans, root, path, 1199 key->objectid, 1200 key->offset, name, 1201 name_len, 1); 1202 } else { 1203 BUG(); 1204 } 1205 if (!dst_di || IS_ERR(dst_di)) { 1206 /* we need a sequence number to insert, so we only 1207 * do inserts for the BTRFS_DIR_INDEX_KEY types 1208 */ 1209 if (key->type != BTRFS_DIR_INDEX_KEY) 1210 goto out; 1211 goto insert; 1212 } 1213 1214 
btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key); 1215 /* the existing item matches the logged item */ 1216 if (found_key.objectid == log_key.objectid && 1217 found_key.type == log_key.type && 1218 found_key.offset == log_key.offset && 1219 btrfs_dir_type(path->nodes[0], dst_di) == log_type) { 1220 goto out; 1221 } 1222 1223 /* 1224 * don't drop the conflicting directory entry if the inode 1225 * for the new entry doesn't exist 1226 */ 1227 if (!exists) 1228 goto out; 1229 1230 ret = drop_one_dir_item(trans, root, path, dir, dst_di); 1231 BUG_ON(ret); 1232 1233 if (key->type == BTRFS_DIR_INDEX_KEY) 1234 goto insert; 1235 out: 1236 btrfs_release_path(root, path); 1237 kfree(name); 1238 iput(dir); 1239 return 0; 1240 1241 insert: 1242 btrfs_release_path(root, path); 1243 ret = insert_one_name(trans, root, path, key->objectid, key->offset, 1244 name, name_len, log_type, &log_key); 1245 1246 BUG_ON(ret && ret != -ENOENT); 1247 goto out; 1248 } 1249 1250 /* 1251 * find all the names in a directory item and reconcile them into 1252 * the subvolume. 
Only BTRFS_DIR_ITEM_KEY types will have more than 1253 * one name in a directory item, but the same code gets used for 1254 * both directory index types 1255 */ 1256 static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans, 1257 struct btrfs_root *root, 1258 struct btrfs_path *path, 1259 struct extent_buffer *eb, int slot, 1260 struct btrfs_key *key) 1261 { 1262 int ret; 1263 u32 item_size = btrfs_item_size_nr(eb, slot); 1264 struct btrfs_dir_item *di; 1265 int name_len; 1266 unsigned long ptr; 1267 unsigned long ptr_end; 1268 1269 ptr = btrfs_item_ptr_offset(eb, slot); 1270 ptr_end = ptr + item_size; 1271 while (ptr < ptr_end) { 1272 di = (struct btrfs_dir_item *)ptr; 1273 name_len = btrfs_dir_name_len(eb, di); 1274 ret = replay_one_name(trans, root, path, eb, di, key); 1275 BUG_ON(ret); 1276 ptr = (unsigned long)(di + 1); 1277 ptr += name_len; 1278 } 1279 return 0; 1280 } 1281 1282 /* 1283 * directory replay has two parts. There are the standard directory 1284 * items in the log copied from the subvolume, and range items 1285 * created in the log while the subvolume was logged. 1286 * 1287 * The range items tell us which parts of the key space the log 1288 * is authoritative for. During replay, if a key in the subvolume 1289 * directory is in a logged range item, but not actually in the log 1290 * that means it was deleted from the directory before the fsync 1291 * and should be removed. 
1292 */ 1293 static noinline int find_dir_range(struct btrfs_root *root, 1294 struct btrfs_path *path, 1295 u64 dirid, int key_type, 1296 u64 *start_ret, u64 *end_ret) 1297 { 1298 struct btrfs_key key; 1299 u64 found_end; 1300 struct btrfs_dir_log_item *item; 1301 int ret; 1302 int nritems; 1303 1304 if (*start_ret == (u64)-1) 1305 return 1; 1306 1307 key.objectid = dirid; 1308 key.type = key_type; 1309 key.offset = *start_ret; 1310 1311 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 1312 if (ret < 0) 1313 goto out; 1314 if (ret > 0) { 1315 if (path->slots[0] == 0) 1316 goto out; 1317 path->slots[0]--; 1318 } 1319 if (ret != 0) 1320 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 1321 1322 if (key.type != key_type || key.objectid != dirid) { 1323 ret = 1; 1324 goto next; 1325 } 1326 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 1327 struct btrfs_dir_log_item); 1328 found_end = btrfs_dir_log_end(path->nodes[0], item); 1329 1330 if (*start_ret >= key.offset && *start_ret <= found_end) { 1331 ret = 0; 1332 *start_ret = key.offset; 1333 *end_ret = found_end; 1334 goto out; 1335 } 1336 ret = 1; 1337 next: 1338 /* check the next slot in the tree to see if it is a valid item */ 1339 nritems = btrfs_header_nritems(path->nodes[0]); 1340 if (path->slots[0] >= nritems) { 1341 ret = btrfs_next_leaf(root, path); 1342 if (ret) 1343 goto out; 1344 } else { 1345 path->slots[0]++; 1346 } 1347 1348 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 1349 1350 if (key.type != key_type || key.objectid != dirid) { 1351 ret = 1; 1352 goto out; 1353 } 1354 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 1355 struct btrfs_dir_log_item); 1356 found_end = btrfs_dir_log_end(path->nodes[0], item); 1357 *start_ret = key.offset; 1358 *end_ret = found_end; 1359 ret = 0; 1360 out: 1361 btrfs_release_path(root, path); 1362 return ret; 1363 } 1364 1365 /* 1366 * this looks for a given directory item in the log. 
If the directory 1367 * item is not in the log, the item is removed and the inode it points 1368 * to is unlinked 1369 */ 1370 static noinline int check_item_in_log(struct btrfs_trans_handle *trans, 1371 struct btrfs_root *root, 1372 struct btrfs_root *log, 1373 struct btrfs_path *path, 1374 struct btrfs_path *log_path, 1375 struct inode *dir, 1376 struct btrfs_key *dir_key) 1377 { 1378 int ret; 1379 struct extent_buffer *eb; 1380 int slot; 1381 u32 item_size; 1382 struct btrfs_dir_item *di; 1383 struct btrfs_dir_item *log_di; 1384 int name_len; 1385 unsigned long ptr; 1386 unsigned long ptr_end; 1387 char *name; 1388 struct inode *inode; 1389 struct btrfs_key location; 1390 1391 again: 1392 eb = path->nodes[0]; 1393 slot = path->slots[0]; 1394 item_size = btrfs_item_size_nr(eb, slot); 1395 ptr = btrfs_item_ptr_offset(eb, slot); 1396 ptr_end = ptr + item_size; 1397 while (ptr < ptr_end) { 1398 di = (struct btrfs_dir_item *)ptr; 1399 name_len = btrfs_dir_name_len(eb, di); 1400 name = kmalloc(name_len, GFP_NOFS); 1401 if (!name) { 1402 ret = -ENOMEM; 1403 goto out; 1404 } 1405 read_extent_buffer(eb, name, (unsigned long)(di + 1), 1406 name_len); 1407 log_di = NULL; 1408 if (log && dir_key->type == BTRFS_DIR_ITEM_KEY) { 1409 log_di = btrfs_lookup_dir_item(trans, log, log_path, 1410 dir_key->objectid, 1411 name, name_len, 0); 1412 } else if (log && dir_key->type == BTRFS_DIR_INDEX_KEY) { 1413 log_di = btrfs_lookup_dir_index_item(trans, log, 1414 log_path, 1415 dir_key->objectid, 1416 dir_key->offset, 1417 name, name_len, 0); 1418 } 1419 if (!log_di || IS_ERR(log_di)) { 1420 btrfs_dir_item_key_to_cpu(eb, di, &location); 1421 btrfs_release_path(root, path); 1422 btrfs_release_path(log, log_path); 1423 inode = read_one_inode(root, location.objectid); 1424 BUG_ON(!inode); 1425 1426 ret = link_to_fixup_dir(trans, root, 1427 path, location.objectid); 1428 BUG_ON(ret); 1429 btrfs_inc_nlink(inode); 1430 ret = btrfs_unlink_inode(trans, root, dir, inode, 1431 name, name_len); 
1432 BUG_ON(ret); 1433 kfree(name); 1434 iput(inode); 1435 1436 /* there might still be more names under this key 1437 * check and repeat if required 1438 */ 1439 ret = btrfs_search_slot(NULL, root, dir_key, path, 1440 0, 0); 1441 if (ret == 0) 1442 goto again; 1443 ret = 0; 1444 goto out; 1445 } 1446 btrfs_release_path(log, log_path); 1447 kfree(name); 1448 1449 ptr = (unsigned long)(di + 1); 1450 ptr += name_len; 1451 } 1452 ret = 0; 1453 out: 1454 btrfs_release_path(root, path); 1455 btrfs_release_path(log, log_path); 1456 return ret; 1457 } 1458 1459 /* 1460 * deletion replay happens before we copy any new directory items 1461 * out of the log or out of backreferences from inodes. It 1462 * scans the log to find ranges of keys that log is authoritative for, 1463 * and then scans the directory to find items in those ranges that are 1464 * not present in the log. 1465 * 1466 * Anything we don't find in the log is unlinked and removed from the 1467 * directory. 1468 */ 1469 static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, 1470 struct btrfs_root *root, 1471 struct btrfs_root *log, 1472 struct btrfs_path *path, 1473 u64 dirid, int del_all) 1474 { 1475 u64 range_start; 1476 u64 range_end; 1477 int key_type = BTRFS_DIR_LOG_ITEM_KEY; 1478 int ret = 0; 1479 struct btrfs_key dir_key; 1480 struct btrfs_key found_key; 1481 struct btrfs_path *log_path; 1482 struct inode *dir; 1483 1484 dir_key.objectid = dirid; 1485 dir_key.type = BTRFS_DIR_ITEM_KEY; 1486 log_path = btrfs_alloc_path(); 1487 if (!log_path) 1488 return -ENOMEM; 1489 1490 dir = read_one_inode(root, dirid); 1491 /* it isn't an error if the inode isn't there, that can happen 1492 * because we replay the deletes before we copy in the inode item 1493 * from the log 1494 */ 1495 if (!dir) { 1496 btrfs_free_path(log_path); 1497 return 0; 1498 } 1499 again: 1500 range_start = 0; 1501 range_end = 0; 1502 while (1) { 1503 if (del_all) 1504 range_end = (u64)-1; 1505 else { 1506 ret = 
find_dir_range(log, path, dirid, key_type, 1507 &range_start, &range_end); 1508 if (ret != 0) 1509 break; 1510 } 1511 1512 dir_key.offset = range_start; 1513 while (1) { 1514 int nritems; 1515 ret = btrfs_search_slot(NULL, root, &dir_key, path, 1516 0, 0); 1517 if (ret < 0) 1518 goto out; 1519 1520 nritems = btrfs_header_nritems(path->nodes[0]); 1521 if (path->slots[0] >= nritems) { 1522 ret = btrfs_next_leaf(root, path); 1523 if (ret) 1524 break; 1525 } 1526 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 1527 path->slots[0]); 1528 if (found_key.objectid != dirid || 1529 found_key.type != dir_key.type) 1530 goto next_type; 1531 1532 if (found_key.offset > range_end) 1533 break; 1534 1535 ret = check_item_in_log(trans, root, log, path, 1536 log_path, dir, 1537 &found_key); 1538 BUG_ON(ret); 1539 if (found_key.offset == (u64)-1) 1540 break; 1541 dir_key.offset = found_key.offset + 1; 1542 } 1543 btrfs_release_path(root, path); 1544 if (range_end == (u64)-1) 1545 break; 1546 range_start = range_end + 1; 1547 } 1548 1549 next_type: 1550 ret = 0; 1551 if (key_type == BTRFS_DIR_LOG_ITEM_KEY) { 1552 key_type = BTRFS_DIR_LOG_INDEX_KEY; 1553 dir_key.type = BTRFS_DIR_INDEX_KEY; 1554 btrfs_release_path(root, path); 1555 goto again; 1556 } 1557 out: 1558 btrfs_release_path(root, path); 1559 btrfs_free_path(log_path); 1560 iput(dir); 1561 return ret; 1562 } 1563 1564 /* 1565 * the process_func used to replay items from the log tree. This 1566 * gets called in two different stages. The first stage just looks 1567 * for inodes and makes sure they are all copied into the subvolume. 1568 * 1569 * The second stage copies all the other item types from the log into 1570 * the subvolume. The two stage approach is slower, but gets rid of 1571 * lots of complexity around inodes referencing other inodes that exist 1572 * only in the log (references come from either directory items or inode 1573 * back refs). 
1574 */ 1575 static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, 1576 struct walk_control *wc, u64 gen) 1577 { 1578 int nritems; 1579 struct btrfs_path *path; 1580 struct btrfs_root *root = wc->replay_dest; 1581 struct btrfs_key key; 1582 u32 item_size; 1583 int level; 1584 int i; 1585 int ret; 1586 1587 btrfs_read_buffer(eb, gen); 1588 1589 level = btrfs_header_level(eb); 1590 1591 if (level != 0) 1592 return 0; 1593 1594 path = btrfs_alloc_path(); 1595 BUG_ON(!path); 1596 1597 nritems = btrfs_header_nritems(eb); 1598 for (i = 0; i < nritems; i++) { 1599 btrfs_item_key_to_cpu(eb, &key, i); 1600 item_size = btrfs_item_size_nr(eb, i); 1601 1602 /* inode keys are done during the first stage */ 1603 if (key.type == BTRFS_INODE_ITEM_KEY && 1604 wc->stage == LOG_WALK_REPLAY_INODES) { 1605 struct btrfs_inode_item *inode_item; 1606 u32 mode; 1607 1608 inode_item = btrfs_item_ptr(eb, i, 1609 struct btrfs_inode_item); 1610 mode = btrfs_inode_mode(eb, inode_item); 1611 if (S_ISDIR(mode)) { 1612 ret = replay_dir_deletes(wc->trans, 1613 root, log, path, key.objectid, 0); 1614 BUG_ON(ret); 1615 } 1616 ret = overwrite_item(wc->trans, root, path, 1617 eb, i, &key); 1618 BUG_ON(ret); 1619 1620 /* for regular files, make sure corresponding 1621 * orhpan item exist. extents past the new EOF 1622 * will be truncated later by orphan cleanup. 
1623 */ 1624 if (S_ISREG(mode)) { 1625 ret = insert_orphan_item(wc->trans, root, 1626 key.objectid); 1627 BUG_ON(ret); 1628 } 1629 1630 ret = link_to_fixup_dir(wc->trans, root, 1631 path, key.objectid); 1632 BUG_ON(ret); 1633 } 1634 if (wc->stage < LOG_WALK_REPLAY_ALL) 1635 continue; 1636 1637 /* these keys are simply copied */ 1638 if (key.type == BTRFS_XATTR_ITEM_KEY) { 1639 ret = overwrite_item(wc->trans, root, path, 1640 eb, i, &key); 1641 BUG_ON(ret); 1642 } else if (key.type == BTRFS_INODE_REF_KEY) { 1643 ret = add_inode_ref(wc->trans, root, log, path, 1644 eb, i, &key); 1645 BUG_ON(ret && ret != -ENOENT); 1646 } else if (key.type == BTRFS_EXTENT_DATA_KEY) { 1647 ret = replay_one_extent(wc->trans, root, path, 1648 eb, i, &key); 1649 BUG_ON(ret); 1650 } else if (key.type == BTRFS_DIR_ITEM_KEY || 1651 key.type == BTRFS_DIR_INDEX_KEY) { 1652 ret = replay_one_dir_item(wc->trans, root, path, 1653 eb, i, &key); 1654 BUG_ON(ret); 1655 } 1656 } 1657 btrfs_free_path(path); 1658 return 0; 1659 } 1660 1661 static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, 1662 struct btrfs_root *root, 1663 struct btrfs_path *path, int *level, 1664 struct walk_control *wc) 1665 { 1666 u64 root_owner; 1667 u64 root_gen; 1668 u64 bytenr; 1669 u64 ptr_gen; 1670 struct extent_buffer *next; 1671 struct extent_buffer *cur; 1672 struct extent_buffer *parent; 1673 u32 blocksize; 1674 int ret = 0; 1675 1676 WARN_ON(*level < 0); 1677 WARN_ON(*level >= BTRFS_MAX_LEVEL); 1678 1679 while (*level > 0) { 1680 WARN_ON(*level < 0); 1681 WARN_ON(*level >= BTRFS_MAX_LEVEL); 1682 cur = path->nodes[*level]; 1683 1684 if (btrfs_header_level(cur) != *level) 1685 WARN_ON(1); 1686 1687 if (path->slots[*level] >= 1688 btrfs_header_nritems(cur)) 1689 break; 1690 1691 bytenr = btrfs_node_blockptr(cur, path->slots[*level]); 1692 ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]); 1693 blocksize = btrfs_level_size(root, *level - 1); 1694 1695 parent = path->nodes[*level]; 1696 
root_owner = btrfs_header_owner(parent); 1697 root_gen = btrfs_header_generation(parent); 1698 1699 next = btrfs_find_create_tree_block(root, bytenr, blocksize); 1700 1701 wc->process_func(root, next, wc, ptr_gen); 1702 1703 if (*level == 1) { 1704 path->slots[*level]++; 1705 if (wc->free) { 1706 btrfs_read_buffer(next, ptr_gen); 1707 1708 btrfs_tree_lock(next); 1709 clean_tree_block(trans, root, next); 1710 btrfs_set_lock_blocking(next); 1711 btrfs_wait_tree_block_writeback(next); 1712 btrfs_tree_unlock(next); 1713 1714 WARN_ON(root_owner != 1715 BTRFS_TREE_LOG_OBJECTID); 1716 ret = btrfs_free_reserved_extent(root, 1717 bytenr, blocksize); 1718 BUG_ON(ret); 1719 } 1720 free_extent_buffer(next); 1721 continue; 1722 } 1723 btrfs_read_buffer(next, ptr_gen); 1724 1725 WARN_ON(*level <= 0); 1726 if (path->nodes[*level-1]) 1727 free_extent_buffer(path->nodes[*level-1]); 1728 path->nodes[*level-1] = next; 1729 *level = btrfs_header_level(next); 1730 path->slots[*level] = 0; 1731 cond_resched(); 1732 } 1733 WARN_ON(*level < 0); 1734 WARN_ON(*level >= BTRFS_MAX_LEVEL); 1735 1736 if (path->nodes[*level] == root->node) 1737 parent = path->nodes[*level]; 1738 else 1739 parent = path->nodes[*level + 1]; 1740 1741 bytenr = path->nodes[*level]->start; 1742 1743 blocksize = btrfs_level_size(root, *level); 1744 root_owner = btrfs_header_owner(parent); 1745 root_gen = btrfs_header_generation(parent); 1746 1747 wc->process_func(root, path->nodes[*level], wc, 1748 btrfs_header_generation(path->nodes[*level])); 1749 1750 if (wc->free) { 1751 next = path->nodes[*level]; 1752 btrfs_tree_lock(next); 1753 clean_tree_block(trans, root, next); 1754 btrfs_set_lock_blocking(next); 1755 btrfs_wait_tree_block_writeback(next); 1756 btrfs_tree_unlock(next); 1757 1758 WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID); 1759 ret = btrfs_free_reserved_extent(root, bytenr, blocksize); 1760 BUG_ON(ret); 1761 } 1762 free_extent_buffer(path->nodes[*level]); 1763 path->nodes[*level] = NULL; 1764 *level += 1; 
1765 1766 cond_resched(); 1767 return 0; 1768 } 1769 1770 static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, 1771 struct btrfs_root *root, 1772 struct btrfs_path *path, int *level, 1773 struct walk_control *wc) 1774 { 1775 u64 root_owner; 1776 u64 root_gen; 1777 int i; 1778 int slot; 1779 int ret; 1780 1781 for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) { 1782 slot = path->slots[i]; 1783 if (slot < btrfs_header_nritems(path->nodes[i]) - 1) { 1784 struct extent_buffer *node; 1785 node = path->nodes[i]; 1786 path->slots[i]++; 1787 *level = i; 1788 WARN_ON(*level == 0); 1789 return 0; 1790 } else { 1791 struct extent_buffer *parent; 1792 if (path->nodes[*level] == root->node) 1793 parent = path->nodes[*level]; 1794 else 1795 parent = path->nodes[*level + 1]; 1796 1797 root_owner = btrfs_header_owner(parent); 1798 root_gen = btrfs_header_generation(parent); 1799 wc->process_func(root, path->nodes[*level], wc, 1800 btrfs_header_generation(path->nodes[*level])); 1801 if (wc->free) { 1802 struct extent_buffer *next; 1803 1804 next = path->nodes[*level]; 1805 1806 btrfs_tree_lock(next); 1807 clean_tree_block(trans, root, next); 1808 btrfs_set_lock_blocking(next); 1809 btrfs_wait_tree_block_writeback(next); 1810 btrfs_tree_unlock(next); 1811 1812 WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID); 1813 ret = btrfs_free_reserved_extent(root, 1814 path->nodes[*level]->start, 1815 path->nodes[*level]->len); 1816 BUG_ON(ret); 1817 } 1818 free_extent_buffer(path->nodes[*level]); 1819 path->nodes[*level] = NULL; 1820 *level = i + 1; 1821 } 1822 } 1823 return 1; 1824 } 1825 1826 /* 1827 * drop the reference count on the tree rooted at 'snap'. This traverses 1828 * the tree freeing any blocks that have a ref count of zero after being 1829 * decremented. 
1830 */ 1831 static int walk_log_tree(struct btrfs_trans_handle *trans, 1832 struct btrfs_root *log, struct walk_control *wc) 1833 { 1834 int ret = 0; 1835 int wret; 1836 int level; 1837 struct btrfs_path *path; 1838 int i; 1839 int orig_level; 1840 1841 path = btrfs_alloc_path(); 1842 BUG_ON(!path); 1843 1844 level = btrfs_header_level(log->node); 1845 orig_level = level; 1846 path->nodes[level] = log->node; 1847 extent_buffer_get(log->node); 1848 path->slots[level] = 0; 1849 1850 while (1) { 1851 wret = walk_down_log_tree(trans, log, path, &level, wc); 1852 if (wret > 0) 1853 break; 1854 if (wret < 0) 1855 ret = wret; 1856 1857 wret = walk_up_log_tree(trans, log, path, &level, wc); 1858 if (wret > 0) 1859 break; 1860 if (wret < 0) 1861 ret = wret; 1862 } 1863 1864 /* was the root node processed? if not, catch it here */ 1865 if (path->nodes[orig_level]) { 1866 wc->process_func(log, path->nodes[orig_level], wc, 1867 btrfs_header_generation(path->nodes[orig_level])); 1868 if (wc->free) { 1869 struct extent_buffer *next; 1870 1871 next = path->nodes[orig_level]; 1872 1873 btrfs_tree_lock(next); 1874 clean_tree_block(trans, log, next); 1875 btrfs_set_lock_blocking(next); 1876 btrfs_wait_tree_block_writeback(next); 1877 btrfs_tree_unlock(next); 1878 1879 WARN_ON(log->root_key.objectid != 1880 BTRFS_TREE_LOG_OBJECTID); 1881 ret = btrfs_free_reserved_extent(log, next->start, 1882 next->len); 1883 BUG_ON(ret); 1884 } 1885 } 1886 1887 for (i = 0; i <= orig_level; i++) { 1888 if (path->nodes[i]) { 1889 free_extent_buffer(path->nodes[i]); 1890 path->nodes[i] = NULL; 1891 } 1892 } 1893 btrfs_free_path(path); 1894 return ret; 1895 } 1896 1897 /* 1898 * helper function to update the item for a given subvolumes log root 1899 * in the tree of log roots 1900 */ 1901 static int update_log_root(struct btrfs_trans_handle *trans, 1902 struct btrfs_root *log) 1903 { 1904 int ret; 1905 1906 if (log->log_transid == 1) { 1907 /* insert root item on the first sync */ 1908 ret = 
btrfs_insert_root(trans, log->fs_info->log_root_tree, 1909 &log->root_key, &log->root_item); 1910 } else { 1911 ret = btrfs_update_root(trans, log->fs_info->log_root_tree, 1912 &log->root_key, &log->root_item); 1913 } 1914 return ret; 1915 } 1916 1917 static int wait_log_commit(struct btrfs_trans_handle *trans, 1918 struct btrfs_root *root, unsigned long transid) 1919 { 1920 DEFINE_WAIT(wait); 1921 int index = transid % 2; 1922 1923 /* 1924 * we only allow two pending log transactions at a time, 1925 * so we know that if ours is more than 2 older than the 1926 * current transaction, we're done 1927 */ 1928 do { 1929 prepare_to_wait(&root->log_commit_wait[index], 1930 &wait, TASK_UNINTERRUPTIBLE); 1931 mutex_unlock(&root->log_mutex); 1932 1933 if (root->fs_info->last_trans_log_full_commit != 1934 trans->transid && root->log_transid < transid + 2 && 1935 atomic_read(&root->log_commit[index])) 1936 schedule(); 1937 1938 finish_wait(&root->log_commit_wait[index], &wait); 1939 mutex_lock(&root->log_mutex); 1940 } while (root->log_transid < transid + 2 && 1941 atomic_read(&root->log_commit[index])); 1942 return 0; 1943 } 1944 1945 static int wait_for_writer(struct btrfs_trans_handle *trans, 1946 struct btrfs_root *root) 1947 { 1948 DEFINE_WAIT(wait); 1949 while (atomic_read(&root->log_writers)) { 1950 prepare_to_wait(&root->log_writer_wait, 1951 &wait, TASK_UNINTERRUPTIBLE); 1952 mutex_unlock(&root->log_mutex); 1953 if (root->fs_info->last_trans_log_full_commit != 1954 trans->transid && atomic_read(&root->log_writers)) 1955 schedule(); 1956 mutex_lock(&root->log_mutex); 1957 finish_wait(&root->log_writer_wait, &wait); 1958 } 1959 return 0; 1960 } 1961 1962 /* 1963 * btrfs_sync_log does sends a given tree log down to the disk and 1964 * updates the super blocks to record it. When this call is done, 1965 * you know that any inodes previously logged are safely on disk only 1966 * if it returns 0. 
1967 * 1968 * Any other return value means you need to call btrfs_commit_transaction. 1969 * Some of the edge cases for fsyncing directories that have had unlinks 1970 * or renames done in the past mean that sometimes the only safe 1971 * fsync is to commit the whole FS. When btrfs_sync_log returns -EAGAIN, 1972 * that has happened. 1973 */ 1974 int btrfs_sync_log(struct btrfs_trans_handle *trans, 1975 struct btrfs_root *root) 1976 { 1977 int index1; 1978 int index2; 1979 int mark; 1980 int ret; 1981 struct btrfs_root *log = root->log_root; 1982 struct btrfs_root *log_root_tree = root->fs_info->log_root_tree; 1983 unsigned long log_transid = 0; 1984 1985 mutex_lock(&root->log_mutex); 1986 index1 = root->log_transid % 2; 1987 if (atomic_read(&root->log_commit[index1])) { 1988 wait_log_commit(trans, root, root->log_transid); 1989 mutex_unlock(&root->log_mutex); 1990 return 0; 1991 } 1992 atomic_set(&root->log_commit[index1], 1); 1993 1994 /* wait for previous tree log sync to complete */ 1995 if (atomic_read(&root->log_commit[(index1 + 1) % 2])) 1996 wait_log_commit(trans, root, root->log_transid - 1); 1997 1998 while (1) { 1999 unsigned long batch = root->log_batch; 2000 if (root->log_multiple_pids) { 2001 mutex_unlock(&root->log_mutex); 2002 schedule_timeout_uninterruptible(1); 2003 mutex_lock(&root->log_mutex); 2004 } 2005 wait_for_writer(trans, root); 2006 if (batch == root->log_batch) 2007 break; 2008 } 2009 2010 /* bail out if we need to do a full commit */ 2011 if (root->fs_info->last_trans_log_full_commit == trans->transid) { 2012 ret = -EAGAIN; 2013 mutex_unlock(&root->log_mutex); 2014 goto out; 2015 } 2016 2017 log_transid = root->log_transid; 2018 if (log_transid % 2 == 0) 2019 mark = EXTENT_DIRTY; 2020 else 2021 mark = EXTENT_NEW; 2022 2023 /* we start IO on all the marked extents here, but we don't actually 2024 * wait for them until later. 
2025 */ 2026 ret = btrfs_write_marked_extents(log, &log->dirty_log_pages, mark); 2027 BUG_ON(ret); 2028 2029 btrfs_set_root_node(&log->root_item, log->node); 2030 2031 root->log_batch = 0; 2032 root->log_transid++; 2033 log->log_transid = root->log_transid; 2034 root->log_start_pid = 0; 2035 smp_mb(); 2036 /* 2037 * IO has been started, blocks of the log tree have WRITTEN flag set 2038 * in their headers. new modifications of the log will be written to 2039 * new positions. so it's safe to allow log writers to go in. 2040 */ 2041 mutex_unlock(&root->log_mutex); 2042 2043 mutex_lock(&log_root_tree->log_mutex); 2044 log_root_tree->log_batch++; 2045 atomic_inc(&log_root_tree->log_writers); 2046 mutex_unlock(&log_root_tree->log_mutex); 2047 2048 ret = update_log_root(trans, log); 2049 BUG_ON(ret); 2050 2051 mutex_lock(&log_root_tree->log_mutex); 2052 if (atomic_dec_and_test(&log_root_tree->log_writers)) { 2053 smp_mb(); 2054 if (waitqueue_active(&log_root_tree->log_writer_wait)) 2055 wake_up(&log_root_tree->log_writer_wait); 2056 } 2057 2058 index2 = log_root_tree->log_transid % 2; 2059 if (atomic_read(&log_root_tree->log_commit[index2])) { 2060 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); 2061 wait_log_commit(trans, log_root_tree, 2062 log_root_tree->log_transid); 2063 mutex_unlock(&log_root_tree->log_mutex); 2064 goto out; 2065 } 2066 atomic_set(&log_root_tree->log_commit[index2], 1); 2067 2068 if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) { 2069 wait_log_commit(trans, log_root_tree, 2070 log_root_tree->log_transid - 1); 2071 } 2072 2073 wait_for_writer(trans, log_root_tree); 2074 2075 /* 2076 * now that we've moved on to the tree of log tree roots, 2077 * check the full commit flag again 2078 */ 2079 if (root->fs_info->last_trans_log_full_commit == trans->transid) { 2080 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); 2081 mutex_unlock(&log_root_tree->log_mutex); 2082 ret = -EAGAIN; 2083 goto out_wake_log_root; 2084 } 
2085 2086 ret = btrfs_write_and_wait_marked_extents(log_root_tree, 2087 &log_root_tree->dirty_log_pages, 2088 EXTENT_DIRTY | EXTENT_NEW); 2089 BUG_ON(ret); 2090 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); 2091 2092 btrfs_set_super_log_root(&root->fs_info->super_for_commit, 2093 log_root_tree->node->start); 2094 btrfs_set_super_log_root_level(&root->fs_info->super_for_commit, 2095 btrfs_header_level(log_root_tree->node)); 2096 2097 log_root_tree->log_batch = 0; 2098 log_root_tree->log_transid++; 2099 smp_mb(); 2100 2101 mutex_unlock(&log_root_tree->log_mutex); 2102 2103 /* 2104 * nobody else is going to jump in and write the the ctree 2105 * super here because the log_commit atomic below is protecting 2106 * us. We must be called with a transaction handle pinning 2107 * the running transaction open, so a full commit can't hop 2108 * in and cause problems either. 2109 */ 2110 write_ctree_super(trans, root->fs_info->tree_root, 1); 2111 ret = 0; 2112 2113 mutex_lock(&root->log_mutex); 2114 if (root->last_log_commit < log_transid) 2115 root->last_log_commit = log_transid; 2116 mutex_unlock(&root->log_mutex); 2117 2118 out_wake_log_root: 2119 atomic_set(&log_root_tree->log_commit[index2], 0); 2120 smp_mb(); 2121 if (waitqueue_active(&log_root_tree->log_commit_wait[index2])) 2122 wake_up(&log_root_tree->log_commit_wait[index2]); 2123 out: 2124 atomic_set(&root->log_commit[index1], 0); 2125 smp_mb(); 2126 if (waitqueue_active(&root->log_commit_wait[index1])) 2127 wake_up(&root->log_commit_wait[index1]); 2128 return 0; 2129 } 2130 2131 /* 2132 * free all the extents used by the tree log. 
This should be called 2133 * at commit time of the full transaction 2134 */ 2135 int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root) 2136 { 2137 int ret; 2138 struct btrfs_root *log; 2139 struct key; 2140 u64 start; 2141 u64 end; 2142 struct walk_control wc = { 2143 .free = 1, 2144 .process_func = process_one_buffer 2145 }; 2146 2147 if (!root->log_root || root->fs_info->log_root_recovering) 2148 return 0; 2149 2150 log = root->log_root; 2151 ret = walk_log_tree(trans, log, &wc); 2152 BUG_ON(ret); 2153 2154 while (1) { 2155 ret = find_first_extent_bit(&log->dirty_log_pages, 2156 0, &start, &end, EXTENT_DIRTY | EXTENT_NEW); 2157 if (ret) 2158 break; 2159 2160 clear_extent_bits(&log->dirty_log_pages, start, end, 2161 EXTENT_DIRTY | EXTENT_NEW, GFP_NOFS); 2162 } 2163 2164 if (log->log_transid > 0) { 2165 ret = btrfs_del_root(trans, root->fs_info->log_root_tree, 2166 &log->root_key); 2167 BUG_ON(ret); 2168 } 2169 root->log_root = NULL; 2170 free_extent_buffer(log->node); 2171 kfree(log); 2172 return 0; 2173 } 2174 2175 /* 2176 * If both a file and directory are logged, and unlinks or renames are 2177 * mixed in, we have a few interesting corners: 2178 * 2179 * create file X in dir Y 2180 * link file X to X.link in dir Y 2181 * fsync file X 2182 * unlink file X but leave X.link 2183 * fsync dir Y 2184 * 2185 * After a crash we would expect only X.link to exist. But file X 2186 * didn't get fsync'd again so the log has back refs for X and X.link. 2187 * 2188 * We solve this by removing directory entries and inode backrefs from the 2189 * log when a file that was logged in the current transaction is 2190 * unlinked. Any later fsync will include the updated log entries, and 2191 * we'll be able to reconstruct the proper directory items from backrefs. 2192 * 2193 * This optimizations allows us to avoid relogging the entire inode 2194 * or the entire directory. 
2195 */ 2196 int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, 2197 struct btrfs_root *root, 2198 const char *name, int name_len, 2199 struct inode *dir, u64 index) 2200 { 2201 struct btrfs_root *log; 2202 struct btrfs_dir_item *di; 2203 struct btrfs_path *path; 2204 int ret; 2205 int bytes_del = 0; 2206 2207 if (BTRFS_I(dir)->logged_trans < trans->transid) 2208 return 0; 2209 2210 ret = join_running_log_trans(root); 2211 if (ret) 2212 return 0; 2213 2214 mutex_lock(&BTRFS_I(dir)->log_mutex); 2215 2216 log = root->log_root; 2217 path = btrfs_alloc_path(); 2218 di = btrfs_lookup_dir_item(trans, log, path, dir->i_ino, 2219 name, name_len, -1); 2220 if (di && !IS_ERR(di)) { 2221 ret = btrfs_delete_one_dir_name(trans, log, path, di); 2222 bytes_del += name_len; 2223 BUG_ON(ret); 2224 } 2225 btrfs_release_path(log, path); 2226 di = btrfs_lookup_dir_index_item(trans, log, path, dir->i_ino, 2227 index, name, name_len, -1); 2228 if (di && !IS_ERR(di)) { 2229 ret = btrfs_delete_one_dir_name(trans, log, path, di); 2230 bytes_del += name_len; 2231 BUG_ON(ret); 2232 } 2233 2234 /* update the directory size in the log to reflect the names 2235 * we have removed 2236 */ 2237 if (bytes_del) { 2238 struct btrfs_key key; 2239 2240 key.objectid = dir->i_ino; 2241 key.offset = 0; 2242 key.type = BTRFS_INODE_ITEM_KEY; 2243 btrfs_release_path(log, path); 2244 2245 ret = btrfs_search_slot(trans, log, &key, path, 0, 1); 2246 if (ret == 0) { 2247 struct btrfs_inode_item *item; 2248 u64 i_size; 2249 2250 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 2251 struct btrfs_inode_item); 2252 i_size = btrfs_inode_size(path->nodes[0], item); 2253 if (i_size > bytes_del) 2254 i_size -= bytes_del; 2255 else 2256 i_size = 0; 2257 btrfs_set_inode_size(path->nodes[0], item, i_size); 2258 btrfs_mark_buffer_dirty(path->nodes[0]); 2259 } else 2260 ret = 0; 2261 btrfs_release_path(log, path); 2262 } 2263 2264 btrfs_free_path(path); 2265 mutex_unlock(&BTRFS_I(dir)->log_mutex); 2266 
btrfs_end_log_trans(root); 2267 2268 return 0; 2269 } 2270 2271 /* see comments for btrfs_del_dir_entries_in_log */ 2272 int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, 2273 struct btrfs_root *root, 2274 const char *name, int name_len, 2275 struct inode *inode, u64 dirid) 2276 { 2277 struct btrfs_root *log; 2278 u64 index; 2279 int ret; 2280 2281 if (BTRFS_I(inode)->logged_trans < trans->transid) 2282 return 0; 2283 2284 ret = join_running_log_trans(root); 2285 if (ret) 2286 return 0; 2287 log = root->log_root; 2288 mutex_lock(&BTRFS_I(inode)->log_mutex); 2289 2290 ret = btrfs_del_inode_ref(trans, log, name, name_len, inode->i_ino, 2291 dirid, &index); 2292 mutex_unlock(&BTRFS_I(inode)->log_mutex); 2293 btrfs_end_log_trans(root); 2294 2295 return ret; 2296 } 2297 2298 /* 2299 * creates a range item in the log for 'dirid'. first_offset and 2300 * last_offset tell us which parts of the key space the log should 2301 * be considered authoritative for. 2302 */ 2303 static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans, 2304 struct btrfs_root *log, 2305 struct btrfs_path *path, 2306 int key_type, u64 dirid, 2307 u64 first_offset, u64 last_offset) 2308 { 2309 int ret; 2310 struct btrfs_key key; 2311 struct btrfs_dir_log_item *item; 2312 2313 key.objectid = dirid; 2314 key.offset = first_offset; 2315 if (key_type == BTRFS_DIR_ITEM_KEY) 2316 key.type = BTRFS_DIR_LOG_ITEM_KEY; 2317 else 2318 key.type = BTRFS_DIR_LOG_INDEX_KEY; 2319 ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item)); 2320 BUG_ON(ret); 2321 2322 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 2323 struct btrfs_dir_log_item); 2324 btrfs_set_dir_log_end(path->nodes[0], item, last_offset); 2325 btrfs_mark_buffer_dirty(path->nodes[0]); 2326 btrfs_release_path(log, path); 2327 return 0; 2328 } 2329 2330 /* 2331 * log all the items included in the current transaction for a given 2332 * directory. 
This also creates the range items in the log tree required
 * to replay anything deleted before the fsync
 *
 * trans:            running transaction handle
 * root:             subvolume root the directory lives in
 * inode:            the directory being logged
 * path:             scratch path used for searching @root (and, at the
 *                   end, for inserting the range key into the log)
 * dst_path:         scratch path handed to overwrite_item() for the log
 * key_type:         BTRFS_DIR_ITEM_KEY or BTRFS_DIR_INDEX_KEY
 * min_offset:       first key offset this call should consider
 * last_offset_ret:  out: end of the range covered by this call; (u64)-1
 *                   means the rest of the key space was covered
 *
 * Returns 0, or the negative value from btrfs_search_slot() in the
 * "nothing from this transaction" branch.
 * NOTE(review): on that early error return *last_offset_ret is not
 * written and no range key is inserted.
 */
static noinline int log_dir_items(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root, struct inode *inode,
			  struct btrfs_path *path,
			  struct btrfs_path *dst_path, int key_type,
			  u64 min_offset, u64 *last_offset_ret)
{
	struct btrfs_key min_key;
	struct btrfs_key max_key;
	struct btrfs_root *log = root->log_root;
	struct extent_buffer *src;
	int ret;
	int i;
	int nritems;
	u64 first_offset = min_offset;
	u64 last_offset = (u64)-1;

	/* NOTE(review): redundant, log was already initialized above */
	log = root->log_root;
	max_key.objectid = inode->i_ino;
	max_key.offset = (u64)-1;
	max_key.type = key_type;

	min_key.objectid = inode->i_ino;
	min_key.type = key_type;
	min_key.offset = min_offset;

	path->keep_locks = 1;

	/* look for the first matching key touched by this transaction */
	ret = btrfs_search_forward(root, &min_key, &max_key,
				   path, 0, trans->transid);

	/*
	 * we didn't find anything from this transaction, see if there
	 * is anything at all
	 */
	if (ret != 0 || min_key.objectid != inode->i_ino ||
	    min_key.type != key_type) {
		min_key.objectid = inode->i_ino;
		min_key.type = key_type;
		min_key.offset = (u64)-1;
		btrfs_release_path(root, path);
		ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
		if (ret < 0) {
			btrfs_release_path(root, path);
			return ret;
		}
		ret = btrfs_previous_item(root, path, inode->i_ino, key_type);

		/* if ret == 0 there are items for this type,
		 * create a range to tell us the last key of this type.
		 * otherwise, there are no items in this directory after
		 * *min_offset, and we create a range to indicate that.
		 */
		if (ret == 0) {
			struct btrfs_key tmp;
			btrfs_item_key_to_cpu(path->nodes[0], &tmp,
					      path->slots[0]);
			if (key_type == tmp.type)
				first_offset = max(min_offset, tmp.offset) + 1;
		}
		goto done;
	}

	/* go backward to find any previous key */
	ret = btrfs_previous_item(root, path, inode->i_ino, key_type);
	if (ret == 0) {
		struct btrfs_key tmp;
		btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
		if (key_type == tmp.type) {
			first_offset = tmp.offset;
			/*
			 * NOTE(review): this return value is overwritten
			 * by the search below without being checked.
			 */
			ret = overwrite_item(trans, log, dst_path,
					     path->nodes[0], path->slots[0],
					     &tmp);
		}
	}
	btrfs_release_path(root, path);

	/* find the first key from this transaction again */
	ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
	if (ret != 0) {
		WARN_ON(1);
		goto done;
	}

	/*
	 * we have a block from this transaction, log every item in it
	 * from our directory
	 */
	while (1) {
		struct btrfs_key tmp;
		src = path->nodes[0];
		nritems = btrfs_header_nritems(src);
		for (i = path->slots[0]; i < nritems; i++) {
			btrfs_item_key_to_cpu(src, &min_key, i);

			/* ran off the end of this directory / key type */
			if (min_key.objectid != inode->i_ino ||
			    min_key.type != key_type)
				goto done;
			ret = overwrite_item(trans, log, dst_path, src, i,
					     &min_key);
			BUG_ON(ret);
		}
		path->slots[0] = nritems;

		/*
		 * look ahead to the next item and see if it is also
		 * from this directory and from this transaction
		 */
		ret = btrfs_next_leaf(root, path);
		/*
		 * NOTE(review): only ret == 1 is handled here; a negative
		 * return would fall through to the key read below.
		 */
		if (ret == 1) {
			last_offset = (u64)-1;
			goto done;
		}
		btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
		if (tmp.objectid != inode->i_ino || tmp.type != key_type) {
			last_offset = (u64)-1;
			goto done;
		}
		/*
		 * the next leaf was not written by this transaction: log
		 * its first item as the upper boundary and stop here; the
		 * caller restarts after last_offset
		 */
		if (btrfs_header_generation(path->nodes[0]) != trans->transid) {
			ret = overwrite_item(trans, log, dst_path,
					     path->nodes[0], path->slots[0],
					     &tmp);

			BUG_ON(ret);
			last_offset = tmp.offset;
			goto done;
		}
	}
done:
	*last_offset_ret = last_offset;
	btrfs_release_path(root, path);
	btrfs_release_path(log, dst_path);

	/* insert the log range keys to indicate where the log is valid */
	ret = insert_dir_log_key(trans, log, path, key_type, inode->i_ino,
				 first_offset, last_offset);
	BUG_ON(ret);
	return 0;
}

/*
 * logging directories is very similar to logging inodes, We find all the items
 * from the current transaction and write them to the log.
 *
 * The recovery code scans the directory in the subvolume, and if it finds a
 * key in the range logged that is not present in the log tree, then it means
 * that dir entry was unlinked during the transaction.
 *
 * In order for that scan to work, we must include one key smaller than
 * the smallest logged by this transaction and one key larger than the largest
 * key logged by this transaction.
 *
 * The directory is walked twice: first for BTRFS_DIR_ITEM_KEY, then for
 * BTRFS_DIR_INDEX_KEY.  Each pass calls log_dir_items() repeatedly,
 * restarting one past the offset it reported, until it reports (u64)-1.
 * Always returns 0; a failure in log_dir_items() triggers BUG_ON().
 */
static noinline int log_directory_changes(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root, struct inode *inode,
			  struct btrfs_path *path,
			  struct btrfs_path *dst_path)
{
	u64 min_key;
	u64 max_key;
	int ret;
	int key_type = BTRFS_DIR_ITEM_KEY;

again:
	min_key = 0;
	max_key = 0;
	while (1) {
		ret = log_dir_items(trans, root, inode, path,
				    dst_path, key_type, min_key,
				    &max_key);
		BUG_ON(ret);
		/* (u64)-1 means the whole remaining key space is covered */
		if (max_key == (u64)-1)
			break;
		min_key = max_key + 1;
	}

	/* second pass: same walk over the index keys */
	if (key_type == BTRFS_DIR_ITEM_KEY) {
		key_type = BTRFS_DIR_INDEX_KEY;
		goto again;
	}
	return 0;
}

/*
 * a helper function to drop items from the log before we relog an
 * inode.  max_key_type indicates the highest item type to remove.
2519 * This cannot be run for file data extents because it does not 2520 * free the extents they point to. 2521 */ 2522 static int drop_objectid_items(struct btrfs_trans_handle *trans, 2523 struct btrfs_root *log, 2524 struct btrfs_path *path, 2525 u64 objectid, int max_key_type) 2526 { 2527 int ret; 2528 struct btrfs_key key; 2529 struct btrfs_key found_key; 2530 2531 key.objectid = objectid; 2532 key.type = max_key_type; 2533 key.offset = (u64)-1; 2534 2535 while (1) { 2536 ret = btrfs_search_slot(trans, log, &key, path, -1, 1); 2537 2538 if (ret != 1) 2539 break; 2540 2541 if (path->slots[0] == 0) 2542 break; 2543 2544 path->slots[0]--; 2545 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 2546 path->slots[0]); 2547 2548 if (found_key.objectid != objectid) 2549 break; 2550 2551 ret = btrfs_del_item(trans, log, path); 2552 BUG_ON(ret); 2553 btrfs_release_path(log, path); 2554 } 2555 btrfs_release_path(log, path); 2556 return 0; 2557 } 2558 2559 static noinline int copy_items(struct btrfs_trans_handle *trans, 2560 struct btrfs_root *log, 2561 struct btrfs_path *dst_path, 2562 struct extent_buffer *src, 2563 int start_slot, int nr, int inode_only) 2564 { 2565 unsigned long src_offset; 2566 unsigned long dst_offset; 2567 struct btrfs_file_extent_item *extent; 2568 struct btrfs_inode_item *inode_item; 2569 int ret; 2570 struct btrfs_key *ins_keys; 2571 u32 *ins_sizes; 2572 char *ins_data; 2573 int i; 2574 struct list_head ordered_sums; 2575 2576 INIT_LIST_HEAD(&ordered_sums); 2577 2578 ins_data = kmalloc(nr * sizeof(struct btrfs_key) + 2579 nr * sizeof(u32), GFP_NOFS); 2580 ins_sizes = (u32 *)ins_data; 2581 ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32)); 2582 2583 for (i = 0; i < nr; i++) { 2584 ins_sizes[i] = btrfs_item_size_nr(src, i + start_slot); 2585 btrfs_item_key_to_cpu(src, ins_keys + i, i + start_slot); 2586 } 2587 ret = btrfs_insert_empty_items(trans, log, dst_path, 2588 ins_keys, ins_sizes, nr); 2589 BUG_ON(ret); 2590 2591 for (i = 0; i < 
nr; i++, dst_path->slots[0]++) { 2592 dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0], 2593 dst_path->slots[0]); 2594 2595 src_offset = btrfs_item_ptr_offset(src, start_slot + i); 2596 2597 copy_extent_buffer(dst_path->nodes[0], src, dst_offset, 2598 src_offset, ins_sizes[i]); 2599 2600 if (inode_only == LOG_INODE_EXISTS && 2601 ins_keys[i].type == BTRFS_INODE_ITEM_KEY) { 2602 inode_item = btrfs_item_ptr(dst_path->nodes[0], 2603 dst_path->slots[0], 2604 struct btrfs_inode_item); 2605 btrfs_set_inode_size(dst_path->nodes[0], inode_item, 0); 2606 2607 /* set the generation to zero so the recover code 2608 * can tell the difference between an logging 2609 * just to say 'this inode exists' and a logging 2610 * to say 'update this inode with these values' 2611 */ 2612 btrfs_set_inode_generation(dst_path->nodes[0], 2613 inode_item, 0); 2614 } 2615 /* take a reference on file data extents so that truncates 2616 * or deletes of this inode don't have to relog the inode 2617 * again 2618 */ 2619 if (btrfs_key_type(ins_keys + i) == BTRFS_EXTENT_DATA_KEY) { 2620 int found_type; 2621 extent = btrfs_item_ptr(src, start_slot + i, 2622 struct btrfs_file_extent_item); 2623 2624 found_type = btrfs_file_extent_type(src, extent); 2625 if (found_type == BTRFS_FILE_EXTENT_REG || 2626 found_type == BTRFS_FILE_EXTENT_PREALLOC) { 2627 u64 ds, dl, cs, cl; 2628 ds = btrfs_file_extent_disk_bytenr(src, 2629 extent); 2630 /* ds == 0 is a hole */ 2631 if (ds == 0) 2632 continue; 2633 2634 dl = btrfs_file_extent_disk_num_bytes(src, 2635 extent); 2636 cs = btrfs_file_extent_offset(src, extent); 2637 cl = btrfs_file_extent_num_bytes(src, 2638 extent); 2639 if (btrfs_file_extent_compression(src, 2640 extent)) { 2641 cs = 0; 2642 cl = dl; 2643 } 2644 2645 ret = btrfs_lookup_csums_range( 2646 log->fs_info->csum_root, 2647 ds + cs, ds + cs + cl - 1, 2648 &ordered_sums); 2649 BUG_ON(ret); 2650 } 2651 } 2652 } 2653 2654 btrfs_mark_buffer_dirty(dst_path->nodes[0]); 2655 btrfs_release_path(log, 
dst_path); 2656 kfree(ins_data); 2657 2658 /* 2659 * we have to do this after the loop above to avoid changing the 2660 * log tree while trying to change the log tree. 2661 */ 2662 while (!list_empty(&ordered_sums)) { 2663 struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next, 2664 struct btrfs_ordered_sum, 2665 list); 2666 ret = btrfs_csum_file_blocks(trans, log, sums); 2667 BUG_ON(ret); 2668 list_del(&sums->list); 2669 kfree(sums); 2670 } 2671 return 0; 2672 } 2673 2674 /* log a single inode in the tree log. 2675 * At least one parent directory for this inode must exist in the tree 2676 * or be logged already. 2677 * 2678 * Any items from this inode changed by the current transaction are copied 2679 * to the log tree. An extra reference is taken on any extents in this 2680 * file, allowing us to avoid a whole pile of corner cases around logging 2681 * blocks that have been removed from the tree. 2682 * 2683 * See LOG_INODE_ALL and related defines for a description of what inode_only 2684 * does. 2685 * 2686 * This handles both files and directories. 
2687 */ 2688 static int btrfs_log_inode(struct btrfs_trans_handle *trans, 2689 struct btrfs_root *root, struct inode *inode, 2690 int inode_only) 2691 { 2692 struct btrfs_path *path; 2693 struct btrfs_path *dst_path; 2694 struct btrfs_key min_key; 2695 struct btrfs_key max_key; 2696 struct btrfs_root *log = root->log_root; 2697 struct extent_buffer *src = NULL; 2698 u32 size; 2699 int ret; 2700 int nritems; 2701 int ins_start_slot = 0; 2702 int ins_nr; 2703 2704 log = root->log_root; 2705 2706 path = btrfs_alloc_path(); 2707 dst_path = btrfs_alloc_path(); 2708 2709 min_key.objectid = inode->i_ino; 2710 min_key.type = BTRFS_INODE_ITEM_KEY; 2711 min_key.offset = 0; 2712 2713 max_key.objectid = inode->i_ino; 2714 2715 /* today the code can only do partial logging of directories */ 2716 if (!S_ISDIR(inode->i_mode)) 2717 inode_only = LOG_INODE_ALL; 2718 2719 if (inode_only == LOG_INODE_EXISTS || S_ISDIR(inode->i_mode)) 2720 max_key.type = BTRFS_XATTR_ITEM_KEY; 2721 else 2722 max_key.type = (u8)-1; 2723 max_key.offset = (u64)-1; 2724 2725 mutex_lock(&BTRFS_I(inode)->log_mutex); 2726 2727 /* 2728 * a brute force approach to making sure we get the most uptodate 2729 * copies of everything. 
2730 */ 2731 if (S_ISDIR(inode->i_mode)) { 2732 int max_key_type = BTRFS_DIR_LOG_INDEX_KEY; 2733 2734 if (inode_only == LOG_INODE_EXISTS) 2735 max_key_type = BTRFS_XATTR_ITEM_KEY; 2736 ret = drop_objectid_items(trans, log, path, 2737 inode->i_ino, max_key_type); 2738 } else { 2739 ret = btrfs_truncate_inode_items(trans, log, inode, 0, 0); 2740 } 2741 BUG_ON(ret); 2742 path->keep_locks = 1; 2743 2744 while (1) { 2745 ins_nr = 0; 2746 ret = btrfs_search_forward(root, &min_key, &max_key, 2747 path, 0, trans->transid); 2748 if (ret != 0) 2749 break; 2750 again: 2751 /* note, ins_nr might be > 0 here, cleanup outside the loop */ 2752 if (min_key.objectid != inode->i_ino) 2753 break; 2754 if (min_key.type > max_key.type) 2755 break; 2756 2757 src = path->nodes[0]; 2758 size = btrfs_item_size_nr(src, path->slots[0]); 2759 if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) { 2760 ins_nr++; 2761 goto next_slot; 2762 } else if (!ins_nr) { 2763 ins_start_slot = path->slots[0]; 2764 ins_nr = 1; 2765 goto next_slot; 2766 } 2767 2768 ret = copy_items(trans, log, dst_path, src, ins_start_slot, 2769 ins_nr, inode_only); 2770 BUG_ON(ret); 2771 ins_nr = 1; 2772 ins_start_slot = path->slots[0]; 2773 next_slot: 2774 2775 nritems = btrfs_header_nritems(path->nodes[0]); 2776 path->slots[0]++; 2777 if (path->slots[0] < nritems) { 2778 btrfs_item_key_to_cpu(path->nodes[0], &min_key, 2779 path->slots[0]); 2780 goto again; 2781 } 2782 if (ins_nr) { 2783 ret = copy_items(trans, log, dst_path, src, 2784 ins_start_slot, 2785 ins_nr, inode_only); 2786 BUG_ON(ret); 2787 ins_nr = 0; 2788 } 2789 btrfs_release_path(root, path); 2790 2791 if (min_key.offset < (u64)-1) 2792 min_key.offset++; 2793 else if (min_key.type < (u8)-1) 2794 min_key.type++; 2795 else if (min_key.objectid < (u64)-1) 2796 min_key.objectid++; 2797 else 2798 break; 2799 } 2800 if (ins_nr) { 2801 ret = copy_items(trans, log, dst_path, src, 2802 ins_start_slot, 2803 ins_nr, inode_only); 2804 BUG_ON(ret); 2805 ins_nr = 0; 2806 
} 2807 WARN_ON(ins_nr); 2808 if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) { 2809 btrfs_release_path(root, path); 2810 btrfs_release_path(log, dst_path); 2811 ret = log_directory_changes(trans, root, inode, path, dst_path); 2812 BUG_ON(ret); 2813 } 2814 BTRFS_I(inode)->logged_trans = trans->transid; 2815 mutex_unlock(&BTRFS_I(inode)->log_mutex); 2816 2817 btrfs_free_path(path); 2818 btrfs_free_path(dst_path); 2819 return 0; 2820 } 2821 2822 /* 2823 * follow the dentry parent pointers up the chain and see if any 2824 * of the directories in it require a full commit before they can 2825 * be logged. Returns zero if nothing special needs to be done or 1 if 2826 * a full commit is required. 2827 */ 2828 static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans, 2829 struct inode *inode, 2830 struct dentry *parent, 2831 struct super_block *sb, 2832 u64 last_committed) 2833 { 2834 int ret = 0; 2835 struct btrfs_root *root; 2836 2837 /* 2838 * for regular files, if its inode is already on disk, we don't 2839 * have to worry about the parents at all. This is because 2840 * we can use the last_unlink_trans field to record renames 2841 * and other fun in this file. 
2842 */ 2843 if (S_ISREG(inode->i_mode) && 2844 BTRFS_I(inode)->generation <= last_committed && 2845 BTRFS_I(inode)->last_unlink_trans <= last_committed) 2846 goto out; 2847 2848 if (!S_ISDIR(inode->i_mode)) { 2849 if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb) 2850 goto out; 2851 inode = parent->d_inode; 2852 } 2853 2854 while (1) { 2855 BTRFS_I(inode)->logged_trans = trans->transid; 2856 smp_mb(); 2857 2858 if (BTRFS_I(inode)->last_unlink_trans > last_committed) { 2859 root = BTRFS_I(inode)->root; 2860 2861 /* 2862 * make sure any commits to the log are forced 2863 * to be full commits 2864 */ 2865 root->fs_info->last_trans_log_full_commit = 2866 trans->transid; 2867 ret = 1; 2868 break; 2869 } 2870 2871 if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb) 2872 break; 2873 2874 if (IS_ROOT(parent)) 2875 break; 2876 2877 parent = parent->d_parent; 2878 inode = parent->d_inode; 2879 2880 } 2881 out: 2882 return ret; 2883 } 2884 2885 static int inode_in_log(struct btrfs_trans_handle *trans, 2886 struct inode *inode) 2887 { 2888 struct btrfs_root *root = BTRFS_I(inode)->root; 2889 int ret = 0; 2890 2891 mutex_lock(&root->log_mutex); 2892 if (BTRFS_I(inode)->logged_trans == trans->transid && 2893 BTRFS_I(inode)->last_sub_trans <= root->last_log_commit) 2894 ret = 1; 2895 mutex_unlock(&root->log_mutex); 2896 return ret; 2897 } 2898 2899 2900 /* 2901 * helper function around btrfs_log_inode to make sure newly created 2902 * parent directories also end up in the log. A minimal inode and backref 2903 * only logging is done of any parent directories that are older than 2904 * the last committed transaction 2905 */ 2906 int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, 2907 struct btrfs_root *root, struct inode *inode, 2908 struct dentry *parent, int exists_only) 2909 { 2910 int inode_only = exists_only ? 
LOG_INODE_EXISTS : LOG_INODE_ALL; 2911 struct super_block *sb; 2912 int ret = 0; 2913 u64 last_committed = root->fs_info->last_trans_committed; 2914 2915 sb = inode->i_sb; 2916 2917 if (btrfs_test_opt(root, NOTREELOG)) { 2918 ret = 1; 2919 goto end_no_trans; 2920 } 2921 2922 if (root->fs_info->last_trans_log_full_commit > 2923 root->fs_info->last_trans_committed) { 2924 ret = 1; 2925 goto end_no_trans; 2926 } 2927 2928 if (root != BTRFS_I(inode)->root || 2929 btrfs_root_refs(&root->root_item) == 0) { 2930 ret = 1; 2931 goto end_no_trans; 2932 } 2933 2934 ret = check_parent_dirs_for_sync(trans, inode, parent, 2935 sb, last_committed); 2936 if (ret) 2937 goto end_no_trans; 2938 2939 if (inode_in_log(trans, inode)) { 2940 ret = BTRFS_NO_LOG_SYNC; 2941 goto end_no_trans; 2942 } 2943 2944 start_log_trans(trans, root); 2945 2946 ret = btrfs_log_inode(trans, root, inode, inode_only); 2947 BUG_ON(ret); 2948 2949 /* 2950 * for regular files, if its inode is already on disk, we don't 2951 * have to worry about the parents at all. This is because 2952 * we can use the last_unlink_trans field to record renames 2953 * and other fun in this file. 
2954 */ 2955 if (S_ISREG(inode->i_mode) && 2956 BTRFS_I(inode)->generation <= last_committed && 2957 BTRFS_I(inode)->last_unlink_trans <= last_committed) 2958 goto no_parent; 2959 2960 inode_only = LOG_INODE_EXISTS; 2961 while (1) { 2962 if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb) 2963 break; 2964 2965 inode = parent->d_inode; 2966 if (root != BTRFS_I(inode)->root) 2967 break; 2968 2969 if (BTRFS_I(inode)->generation > 2970 root->fs_info->last_trans_committed) { 2971 ret = btrfs_log_inode(trans, root, inode, inode_only); 2972 BUG_ON(ret); 2973 } 2974 if (IS_ROOT(parent)) 2975 break; 2976 2977 parent = parent->d_parent; 2978 } 2979 no_parent: 2980 ret = 0; 2981 btrfs_end_log_trans(root); 2982 end_no_trans: 2983 return ret; 2984 } 2985 2986 /* 2987 * it is not safe to log dentry if the chunk root has added new 2988 * chunks. This returns 0 if the dentry was logged, and 1 otherwise. 2989 * If this returns 1, you must commit the transaction to safely get your 2990 * data on disk. 
*/

/*
 * Thin wrapper: logs dentry->d_inode through btrfs_log_inode_parent()
 * with dentry->d_parent as the parent and exists_only == 0, and returns
 * that function's result unchanged.
 */
int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root, struct dentry *dentry)
{
	return btrfs_log_inode_parent(trans, root, dentry->d_inode,
				      dentry->d_parent, 0);
}

/*
 * should be called during mount to recover and replay any log trees
 * from the FS
 *
 * The log root tree is first walked once with wc.pin set.  Then every
 * BTRFS_TREE_LOG_OBJECTID root item in it is visited, highest offset
 * first, in three passes: process_one_buffer with pin set, then
 * replay_one_buffer at LOG_WALK_REPLAY_INODES, then replay_one_buffer at
 * LOG_WALK_REPLAY_ALL (followed by fixup_inode_link_counts() for each
 * destination root).  Finally the transaction is committed and
 * @log_root_tree is freed.
 *
 * Always returns 0; most failures trigger BUG_ON().
 */
int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
{
	int ret;
	struct btrfs_path *path;
	struct btrfs_trans_handle *trans;
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct btrfs_key tmp_key;
	struct btrfs_root *log;
	struct btrfs_fs_info *fs_info = log_root_tree->fs_info;
	struct walk_control wc = {
		.process_func = process_one_buffer,
		.stage = 0,
	};

	fs_info->log_root_recovering = 1;
	path = btrfs_alloc_path();
	BUG_ON(!path);

	trans = btrfs_start_transaction(fs_info->tree_root, 1);

	wc.trans = trans;
	wc.pin = 1;

	/* NOTE(review): the return value of this walk is ignored */
	walk_log_tree(trans, log_root_tree, &wc);

again:
	/* start from the highest possible log root key and work downwards */
	key.objectid = BTRFS_TREE_LOG_OBJECTID;
	key.offset = (u64)-1;
	btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);

	while (1) {
		ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0);
		/* NOTE(review): a search error silently ends this pass */
		if (ret < 0)
			break;
		if (ret > 0) {
			/* no exact match: use the item just before */
			if (path->slots[0] == 0)
				break;
			path->slots[0]--;
		}
		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
				      path->slots[0]);
		btrfs_release_path(log_root_tree, path);
		if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID)
			break;

		log = btrfs_read_fs_root_no_radix(log_root_tree,
						  &found_key);
		BUG_ON(!log);


		/* the offset of the log root key names the root it belongs to */
		tmp_key.objectid = found_key.offset;
		tmp_key.type = BTRFS_ROOT_ITEM_KEY;
		tmp_key.offset = (u64)-1;

		wc.replay_dest = btrfs_read_fs_root_no_name(fs_info, &tmp_key);
		BUG_ON(!wc.replay_dest);

		wc.replay_dest->log_root = log;
		btrfs_record_root_in_trans(trans, wc.replay_dest);
		ret = walk_log_tree(trans, log, &wc);
		BUG_ON(ret);

		if (wc.stage == LOG_WALK_REPLAY_ALL) {
			ret = fixup_inode_link_counts(trans, wc.replay_dest,
						      path);
			BUG_ON(ret);
		}

		/* next iteration looks for the log root below this one */
		key.offset = found_key.offset - 1;
		wc.replay_dest->log_root = NULL;
		free_extent_buffer(log->node);
		free_extent_buffer(log->commit_root);
		kfree(log);

		/* offset 0 was the last possible key; avoid wrapping */
		if (found_key.offset == 0)
			break;
	}
	btrfs_release_path(log_root_tree, path);

	/* step one is to pin it all, step two is to replay just inodes */
	if (wc.pin) {
		wc.pin = 0;
		wc.process_func = replay_one_buffer;
		wc.stage = LOG_WALK_REPLAY_INODES;
		goto again;
	}
	/* step three is to replay everything */
	if (wc.stage < LOG_WALK_REPLAY_ALL) {
		wc.stage++;
		goto again;
	}

	btrfs_free_path(path);

	free_extent_buffer(log_root_tree->node);
	log_root_tree->log_root = NULL;
	fs_info->log_root_recovering = 0;

	/* step 4: commit the transaction, which also unpins the blocks */
	/* NOTE(review): the commit's return value is ignored */
	btrfs_commit_transaction(trans, fs_info->tree_root);

	kfree(log_root_tree);
	return 0;
}

/*
 * there are some corner cases where we want to force a full
 * commit instead of allowing a directory to be logged.
 *
 * They revolve around files that were unlinked from the directory, and
 * this function updates the parent directory so that a full commit is
 * properly done if it is fsync'd later after the unlinks are done.
 *
 * dir:         directory the name is being removed from
 * inode:       inode being unlinked or renamed
 * for_rename:  non-zero when called for a rename; only then is
 *              dir->last_unlink_trans updated
 */
void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
			     struct inode *dir, struct inode *inode,
			     int for_rename)
{
	/*
	 * when we're logging a file, if it hasn't been renamed
	 * or unlinked, and its inode is fully committed on disk,
	 * we don't have to worry about walking up the directory chain
	 * to log its parents.
	 *
	 * So, we use the last_unlink_trans field to put this transid
	 * into the file.  When the file is logged we check it and
	 * don't log the parents if the file is fully on disk.
	 */
	if (S_ISREG(inode->i_mode))
		BTRFS_I(inode)->last_unlink_trans = trans->transid;

	/*
	 * if this directory was already logged any new
	 * names for this file/dir will get recorded
	 */
	smp_mb();
	if (BTRFS_I(dir)->logged_trans == trans->transid)
		return;

	/*
	 * if the inode we're about to unlink was logged,
	 * the log will be properly updated for any new names
	 */
	if (BTRFS_I(inode)->logged_trans == trans->transid)
		return;

	/*
	 * when renaming files across directories, if the directory
	 * where we're unlinking from gets fsync'd later on, there's
	 * no way to find the destination directory later and fsync it
	 * properly.  So, we have to be conservative and force commits
	 * so the new name gets discovered.
	 */
	if (for_rename)
		goto record;

	/* we can safely do the unlink without any special recording */
	return;

record:
	BTRFS_I(dir)->last_unlink_trans = trans->transid;
}

/*
 * Call this after adding a new name for a file and it will properly
 * update the log to reflect the new name.
 *
 * It will return zero if all goes well, and it will return 1 if a
 * full transaction commit is required.
3172 */ 3173 int btrfs_log_new_name(struct btrfs_trans_handle *trans, 3174 struct inode *inode, struct inode *old_dir, 3175 struct dentry *parent) 3176 { 3177 struct btrfs_root * root = BTRFS_I(inode)->root; 3178 3179 /* 3180 * this will force the logging code to walk the dentry chain 3181 * up for the file 3182 */ 3183 if (S_ISREG(inode->i_mode)) 3184 BTRFS_I(inode)->last_unlink_trans = trans->transid; 3185 3186 /* 3187 * if this inode hasn't been logged and directory we're renaming it 3188 * from hasn't been logged, we don't need to log it 3189 */ 3190 if (BTRFS_I(inode)->logged_trans <= 3191 root->fs_info->last_trans_committed && 3192 (!old_dir || BTRFS_I(old_dir)->logged_trans <= 3193 root->fs_info->last_trans_committed)) 3194 return 0; 3195 3196 return btrfs_log_inode_parent(trans, root, inode, parent, 1); 3197 } 3198 3199