1 /* 2 * Copyright (C) 2008 Oracle. All rights reserved. 3 * 4 * This program is free software; you can redistribute it and/or 5 * modify it under the terms of the GNU General Public 6 * License v2 as published by the Free Software Foundation. 7 * 8 * This program is distributed in the hope that it will be useful, 9 * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 11 * General Public License for more details. 12 * 13 * You should have received a copy of the GNU General Public 14 * License along with this program; if not, write to the 15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330, 16 * Boston, MA 021110-1307, USA. 17 */ 18 19 #include <linux/sched.h> 20 #include <linux/slab.h> 21 #include "ctree.h" 22 #include "transaction.h" 23 #include "disk-io.h" 24 #include "locking.h" 25 #include "print-tree.h" 26 #include "compat.h" 27 #include "tree-log.h" 28 29 /* magic values for the inode_only field in btrfs_log_inode: 30 * 31 * LOG_INODE_ALL means to log everything 32 * LOG_INODE_EXISTS means to log just enough to recreate the inode 33 * during log replay 34 */ 35 #define LOG_INODE_ALL 0 36 #define LOG_INODE_EXISTS 1 37 38 /* 39 * directory trouble cases 40 * 41 * 1) on rename or unlink, if the inode being unlinked isn't in the fsync 42 * log, we must force a full commit before doing an fsync of the directory 43 * where the unlink was done. 44 * ---> record transid of last unlink/rename per directory 45 * 46 * mkdir foo/some_dir 47 * normal commit 48 * rename foo/some_dir foo2/some_dir 49 * mkdir foo/some_dir 50 * fsync foo/some_dir/some_file 51 * 52 * The fsync above will unlink the original some_dir without recording 53 * it in its new location (foo2). After a crash, some_dir will be gone 54 * unless the fsync of some_file forces a full commit 55 * 56 * 2) we must log any new names for any file or dir that is in the fsync 57 * log. ---> check inode while renaming/linking. 58 * 59 * 2a) we must log any new names for any file or dir during rename 60 * when the directory they are being removed from was logged. 61 * ---> check inode and old parent dir during rename 62 * 63 * 2a is actually the more important variant. With the extra logging 64 * a crash might unlink the old name without recreating the new one 65 * 66 * 3) after a crash, we must go through any directories with a link count 67 * of zero and redo the rm -rf 68 * 69 * mkdir f1/foo 70 * normal commit 71 * rm -rf f1/foo 72 * fsync(f1) 73 * 74 * The directory f1 was fully removed from the FS, but fsync was never 75 * called on f1, only its parent dir. After a crash the rm -rf must 76 * be replayed. This must be able to recurse down the entire 77 * directory tree. The inode link count fixup code takes care of the 78 * ugly details. 79 */ 80 81 /* 82 * stages for the tree walking. The first 83 * stage (0) is to only pin down the blocks we find 84 * the second stage (1) is to make sure that all the inodes 85 * we find in the log are created in the subvolume. 86 * 87 * The last stage is to deal with directories and links and extents 88 * and all the other fun semantics 89 */ 90 #define LOG_WALK_PIN_ONLY 0 91 #define LOG_WALK_REPLAY_INODES 1 92 #define LOG_WALK_REPLAY_ALL 2 93 94 static int btrfs_log_inode(struct btrfs_trans_handle *trans, 95 struct btrfs_root *root, struct inode *inode, 96 int inode_only); 97 static int link_to_fixup_dir(struct btrfs_trans_handle *trans, 98 struct btrfs_root *root, 99 struct btrfs_path *path, u64 objectid); 100 static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, 101 struct btrfs_root *root, 102 struct btrfs_root *log, 103 struct btrfs_path *path, 104 u64 dirid, int del_all); 105 106 /* 107 * tree logging is a special write ahead log used to make sure that 108 * fsyncs and O_SYNCs can happen without doing full tree commits. 109 * 110 * Full tree commits are expensive because they require commonly 111 * modified blocks to be recowed, creating many dirty pages in the 112 * extent tree an 4x-6x higher write load than ext3. 113 * 114 * Instead of doing a tree commit on every fsync, we use the 115 * key ranges and transaction ids to find items for a given file or directory 116 * that have changed in this transaction. Those items are copied into 117 * a special tree (one per subvolume root), that tree is written to disk 118 * and then the fsync is considered complete. 119 * 120 * After a crash, items are copied out of the log-tree back into the 121 * subvolume tree. Any file data extents found are recorded in the extent 122 * allocation tree, and the log-tree freed. 123 * 124 * The log tree is read three times, once to pin down all the extents it is 125 * using in ram and once, once to create all the inodes logged in the tree 126 * and once to do all the other items. 127 */ 128 129 /* 130 * start a sub transaction and setup the log tree 131 * this increments the log tree writer count to make the people 132 * syncing the tree wait for us to finish 133 */ 134 static int start_log_trans(struct btrfs_trans_handle *trans, 135 struct btrfs_root *root) 136 { 137 int ret; 138 int err = 0; 139 140 mutex_lock(&root->log_mutex); 141 if (root->log_root) { 142 if (!root->log_start_pid) { 143 root->log_start_pid = current->pid; 144 root->log_multiple_pids = false; 145 } else if (root->log_start_pid != current->pid) { 146 root->log_multiple_pids = true; 147 } 148 149 root->log_batch++; 150 atomic_inc(&root->log_writers); 151 mutex_unlock(&root->log_mutex); 152 return 0; 153 } 154 root->log_multiple_pids = false; 155 root->log_start_pid = current->pid; 156 mutex_lock(&root->fs_info->tree_log_mutex); 157 if (!root->fs_info->log_root_tree) { 158 ret = btrfs_init_log_root_tree(trans, root->fs_info); 159 if (ret) 160 err = ret; 161 } 162 if (err == 0 && !root->log_root) { 163 ret = btrfs_add_log_tree(trans, root); 164 if (ret) 165 err = ret; 166 } 167 mutex_unlock(&root->fs_info->tree_log_mutex); 168 root->log_batch++; 169 atomic_inc(&root->log_writers); 170 mutex_unlock(&root->log_mutex); 171 return err; 172 } 173 174 /* 175 * returns 0 if there was a log transaction running and we were able 176 * to join, or returns -ENOENT if there were not transactions 177 * in progress 178 */ 179 static int join_running_log_trans(struct btrfs_root *root) 180 { 181 int ret = -ENOENT; 182 183 smp_mb(); 184 if (!root->log_root) 185 return -ENOENT; 186 187 mutex_lock(&root->log_mutex); 188 if (root->log_root) { 189 ret = 0; 190 atomic_inc(&root->log_writers); 191 } 192 mutex_unlock(&root->log_mutex); 193 return ret; 194 } 195 196 /* 197 * This either makes the current running log transaction wait 198 * until you call btrfs_end_log_trans() or it makes any future 199 * log transactions wait until you call btrfs_end_log_trans() 200 */ 201 int btrfs_pin_log_trans(struct btrfs_root *root) 202 { 203 int ret = -ENOENT; 204 205 mutex_lock(&root->log_mutex); 206 atomic_inc(&root->log_writers); 207 mutex_unlock(&root->log_mutex); 208 return ret; 209 } 210 211 /* 212 * indicate we're done making changes to the log tree 213 * and wake up anyone waiting to do a sync 214 */ 215 int btrfs_end_log_trans(struct btrfs_root *root) 216 { 217 if (atomic_dec_and_test(&root->log_writers)) { 218 smp_mb(); 219 if (waitqueue_active(&root->log_writer_wait)) 220 wake_up(&root->log_writer_wait); 221 } 222 return 0; 223 } 224 225 226 /* 227 * the walk control struct is used to pass state down the chain when 228 * processing the log tree. The stage field tells us which part 229 * of the log tree processing we are currently doing. The others 230 * are state fields used for that specific part 231 */ 232 struct walk_control { 233 /* should we free the extent on disk when done? This is used 234 * at transaction commit time while freeing a log tree 235 */ 236 int free; 237 238 /* should we write out the extent buffer? This is used 239 * while flushing the log tree to disk during a sync 240 */ 241 int write; 242 243 /* should we wait for the extent buffer io to finish? Also used 244 * while flushing the log tree to disk for a sync 245 */ 246 int wait; 247 248 /* pin only walk, we record which extents on disk belong to the 249 * log trees 250 */ 251 int pin; 252 253 /* what stage of the replay code we're currently in */ 254 int stage; 255 256 /* the root we are currently replaying */ 257 struct btrfs_root *replay_dest; 258 259 /* the trans handle for the current replay */ 260 struct btrfs_trans_handle *trans; 261 262 /* the function that gets used to process blocks we find in the 263 * tree. Note the extent_buffer might not be up to date when it is 264 * passed in, and it must be checked or read if you need the data 265 * inside it 266 */ 267 int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb, 268 struct walk_control *wc, u64 gen); 269 }; 270 271 /* 272 * process_func used to pin down extents, write them or wait on them 273 */ 274 static int process_one_buffer(struct btrfs_root *log, 275 struct extent_buffer *eb, 276 struct walk_control *wc, u64 gen) 277 { 278 if (wc->pin) 279 btrfs_pin_extent(log->fs_info->extent_root, 280 eb->start, eb->len, 0); 281 282 if (btrfs_buffer_uptodate(eb, gen)) { 283 if (wc->write) 284 btrfs_write_tree_block(eb); 285 if (wc->wait) 286 btrfs_wait_tree_block_writeback(eb); 287 } 288 return 0; 289 } 290 291 /* 292 * Item overwrite used by replay and tree logging. eb, slot and key all refer 293 * to the src data we are copying out. 294 * 295 * root is the tree we are copying into, and path is a scratch 296 * path for use in this function (it should be released on entry and 297 * will be released on exit). 298 * 299 * If the key is already in the destination tree the existing item is 300 * overwritten. If the existing item isn't big enough, it is extended. 301 * If it is too large, it is truncated. 302 * 303 * If the key isn't in the destination yet, a new item is inserted. 304 */ 305 static noinline int overwrite_item(struct btrfs_trans_handle *trans, 306 struct btrfs_root *root, 307 struct btrfs_path *path, 308 struct extent_buffer *eb, int slot, 309 struct btrfs_key *key) 310 { 311 int ret; 312 u32 item_size; 313 u64 saved_i_size = 0; 314 int save_old_i_size = 0; 315 unsigned long src_ptr; 316 unsigned long dst_ptr; 317 int overwrite_root = 0; 318 319 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) 320 overwrite_root = 1; 321 322 item_size = btrfs_item_size_nr(eb, slot); 323 src_ptr = btrfs_item_ptr_offset(eb, slot); 324 325 /* look for the key in the destination tree */ 326 ret = btrfs_search_slot(NULL, root, key, path, 0, 0); 327 if (ret == 0) { 328 char *src_copy; 329 char *dst_copy; 330 u32 dst_size = btrfs_item_size_nr(path->nodes[0], 331 path->slots[0]); 332 if (dst_size != item_size) 333 goto insert; 334 335 if (item_size == 0) { 336 btrfs_release_path(root, path); 337 return 0; 338 } 339 dst_copy = kmalloc(item_size, GFP_NOFS); 340 src_copy = kmalloc(item_size, GFP_NOFS); 341 if (!dst_copy || !src_copy) { 342 btrfs_release_path(root, path); 343 kfree(dst_copy); 344 kfree(src_copy); 345 return -ENOMEM; 346 } 347 348 read_extent_buffer(eb, src_copy, src_ptr, item_size); 349 350 dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); 351 read_extent_buffer(path->nodes[0], dst_copy, dst_ptr, 352 item_size); 353 ret = memcmp(dst_copy, src_copy, item_size); 354 355 kfree(dst_copy); 356 kfree(src_copy); 357 /* 358 * they have the same contents, just return, this saves 359 * us from cowing blocks in the destination tree and doing 360 * extra writes that may not have been done by a previous 361 * sync 362 */ 363 if (ret == 0) { 364 btrfs_release_path(root, path); 365 return 0; 366 } 367 368 } 369 insert: 370 btrfs_release_path(root, path); 371 /* try to insert the key into the destination tree */ 372 ret = btrfs_insert_empty_item(trans, root, path, 373 key, item_size); 374 375 /* make sure any existing item is the correct size */ 376 if (ret == -EEXIST) { 377 u32 found_size; 378 found_size = btrfs_item_size_nr(path->nodes[0], 379 path->slots[0]); 380 if (found_size > item_size) { 381 btrfs_truncate_item(trans, root, path, item_size, 1); 382 } else if (found_size < item_size) { 383 ret = btrfs_extend_item(trans, root, path, 384 item_size - found_size); 385 BUG_ON(ret); 386 } 387 } else if (ret) { 388 return ret; 389 } 390 dst_ptr = btrfs_item_ptr_offset(path->nodes[0], 391 path->slots[0]); 392 393 /* don't overwrite an existing inode if the generation number 394 * was logged as zero. This is done when the tree logging code 395 * is just logging an inode to make sure it exists after recovery. 396 * 397 * Also, don't overwrite i_size on directories during replay. 398 * log replay inserts and removes directory items based on the 399 * state of the tree found in the subvolume, and i_size is modified 400 * as it goes 401 */ 402 if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) { 403 struct btrfs_inode_item *src_item; 404 struct btrfs_inode_item *dst_item; 405 406 src_item = (struct btrfs_inode_item *)src_ptr; 407 dst_item = (struct btrfs_inode_item *)dst_ptr; 408 409 if (btrfs_inode_generation(eb, src_item) == 0) 410 goto no_copy; 411 412 if (overwrite_root && 413 S_ISDIR(btrfs_inode_mode(eb, src_item)) && 414 S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) { 415 save_old_i_size = 1; 416 saved_i_size = btrfs_inode_size(path->nodes[0], 417 dst_item); 418 } 419 } 420 421 copy_extent_buffer(path->nodes[0], eb, dst_ptr, 422 src_ptr, item_size); 423 424 if (save_old_i_size) { 425 struct btrfs_inode_item *dst_item; 426 dst_item = (struct btrfs_inode_item *)dst_ptr; 427 btrfs_set_inode_size(path->nodes[0], dst_item, saved_i_size); 428 } 429 430 /* make sure the generation is filled in */ 431 if (key->type == BTRFS_INODE_ITEM_KEY) { 432 struct btrfs_inode_item *dst_item; 433 dst_item = (struct btrfs_inode_item *)dst_ptr; 434 if (btrfs_inode_generation(path->nodes[0], dst_item) == 0) { 435 btrfs_set_inode_generation(path->nodes[0], dst_item, 436 trans->transid); 437 } 438 } 439 no_copy: 440 btrfs_mark_buffer_dirty(path->nodes[0]); 441 btrfs_release_path(root, path); 442 return 0; 443 } 444 445 /* 446 * simple helper to read an inode off the disk from a given root 447 * This can only be called for subvolume roots and not for the log 448 */ 449 static noinline struct inode *read_one_inode(struct btrfs_root *root, 450 u64 objectid) 451 { 452 struct btrfs_key key; 453 struct inode *inode; 454 455 key.objectid = objectid; 456 key.type = BTRFS_INODE_ITEM_KEY; 457 key.offset = 0; 458 inode = btrfs_iget(root->fs_info->sb, &key, root, NULL); 459 if (IS_ERR(inode)) { 460 inode = NULL; 461 } else if (is_bad_inode(inode)) { 462 iput(inode); 463 inode = NULL; 464 } 465 return inode; 466 } 467 468 /* replays a single extent in 'eb' at 'slot' with 'key' into the 469 * subvolume 'root'. path is released on entry and should be released 470 * on exit. 471 * 472 * extents in the log tree have not been allocated out of the extent 473 * tree yet. So, this completes the allocation, taking a reference 474 * as required if the extent already exists or creating a new extent 475 * if it isn't in the extent allocation tree yet. 476 * 477 * The extent is inserted into the file, dropping any existing extents 478 * from the file that overlap the new one. 479 */ 480 static noinline int replay_one_extent(struct btrfs_trans_handle *trans, 481 struct btrfs_root *root, 482 struct btrfs_path *path, 483 struct extent_buffer *eb, int slot, 484 struct btrfs_key *key) 485 { 486 int found_type; 487 u64 mask = root->sectorsize - 1; 488 u64 extent_end; 489 u64 alloc_hint; 490 u64 start = key->offset; 491 u64 saved_nbytes; 492 struct btrfs_file_extent_item *item; 493 struct inode *inode = NULL; 494 unsigned long size; 495 int ret = 0; 496 497 item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item); 498 found_type = btrfs_file_extent_type(eb, item); 499 500 if (found_type == BTRFS_FILE_EXTENT_REG || 501 found_type == BTRFS_FILE_EXTENT_PREALLOC) 502 extent_end = start + btrfs_file_extent_num_bytes(eb, item); 503 else if (found_type == BTRFS_FILE_EXTENT_INLINE) { 504 size = btrfs_file_extent_inline_len(eb, item); 505 extent_end = (start + size + mask) & ~mask; 506 } else { 507 ret = 0; 508 goto out; 509 } 510 511 inode = read_one_inode(root, key->objectid); 512 if (!inode) { 513 ret = -EIO; 514 goto out; 515 } 516 517 /* 518 * first check to see if we already have this extent in the 519 * file. This must be done before the btrfs_drop_extents run 520 * so we don't try to drop this extent. 521 */ 522 ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino, 523 start, 0); 524 525 if (ret == 0 && 526 (found_type == BTRFS_FILE_EXTENT_REG || 527 found_type == BTRFS_FILE_EXTENT_PREALLOC)) { 528 struct btrfs_file_extent_item cmp1; 529 struct btrfs_file_extent_item cmp2; 530 struct btrfs_file_extent_item *existing; 531 struct extent_buffer *leaf; 532 533 leaf = path->nodes[0]; 534 existing = btrfs_item_ptr(leaf, path->slots[0], 535 struct btrfs_file_extent_item); 536 537 read_extent_buffer(eb, &cmp1, (unsigned long)item, 538 sizeof(cmp1)); 539 read_extent_buffer(leaf, &cmp2, (unsigned long)existing, 540 sizeof(cmp2)); 541 542 /* 543 * we already have a pointer to this exact extent, 544 * we don't have to do anything 545 */ 546 if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) { 547 btrfs_release_path(root, path); 548 goto out; 549 } 550 } 551 btrfs_release_path(root, path); 552 553 saved_nbytes = inode_get_bytes(inode); 554 /* drop any overlapping extents */ 555 ret = btrfs_drop_extents(trans, inode, start, extent_end, 556 &alloc_hint, 1); 557 BUG_ON(ret); 558 559 if (found_type == BTRFS_FILE_EXTENT_REG || 560 found_type == BTRFS_FILE_EXTENT_PREALLOC) { 561 u64 offset; 562 unsigned long dest_offset; 563 struct btrfs_key ins; 564 565 ret = btrfs_insert_empty_item(trans, root, path, key, 566 sizeof(*item)); 567 BUG_ON(ret); 568 dest_offset = btrfs_item_ptr_offset(path->nodes[0], 569 path->slots[0]); 570 copy_extent_buffer(path->nodes[0], eb, dest_offset, 571 (unsigned long)item, sizeof(*item)); 572 573 ins.objectid = btrfs_file_extent_disk_bytenr(eb, item); 574 ins.offset = btrfs_file_extent_disk_num_bytes(eb, item); 575 ins.type = BTRFS_EXTENT_ITEM_KEY; 576 offset = key->offset - btrfs_file_extent_offset(eb, item); 577 578 if (ins.objectid > 0) { 579 u64 csum_start; 580 u64 csum_end; 581 LIST_HEAD(ordered_sums); 582 /* 583 * is this extent already allocated in the extent 584 * allocation tree? If so, just add a reference 585 */ 586 ret = btrfs_lookup_extent(root, ins.objectid, 587 ins.offset); 588 if (ret == 0) { 589 ret = btrfs_inc_extent_ref(trans, root, 590 ins.objectid, ins.offset, 591 0, root->root_key.objectid, 592 key->objectid, offset); 593 } else { 594 /* 595 * insert the extent pointer in the extent 596 * allocation tree 597 */ 598 ret = btrfs_alloc_logged_file_extent(trans, 599 root, root->root_key.objectid, 600 key->objectid, offset, &ins); 601 BUG_ON(ret); 602 } 603 btrfs_release_path(root, path); 604 605 if (btrfs_file_extent_compression(eb, item)) { 606 csum_start = ins.objectid; 607 csum_end = csum_start + ins.offset; 608 } else { 609 csum_start = ins.objectid + 610 btrfs_file_extent_offset(eb, item); 611 csum_end = csum_start + 612 btrfs_file_extent_num_bytes(eb, item); 613 } 614 615 ret = btrfs_lookup_csums_range(root->log_root, 616 csum_start, csum_end - 1, 617 &ordered_sums); 618 BUG_ON(ret); 619 while (!list_empty(&ordered_sums)) { 620 struct btrfs_ordered_sum *sums; 621 sums = list_entry(ordered_sums.next, 622 struct btrfs_ordered_sum, 623 list); 624 ret = btrfs_csum_file_blocks(trans, 625 root->fs_info->csum_root, 626 sums); 627 BUG_ON(ret); 628 list_del(&sums->list); 629 kfree(sums); 630 } 631 } else { 632 btrfs_release_path(root, path); 633 } 634 } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { 635 /* inline extents are easy, we just overwrite them */ 636 ret = overwrite_item(trans, root, path, eb, slot, key); 637 BUG_ON(ret); 638 } 639 640 inode_set_bytes(inode, saved_nbytes); 641 btrfs_update_inode(trans, root, inode); 642 out: 643 if (inode) 644 iput(inode); 645 return ret; 646 } 647 648 /* 649 * when cleaning up conflicts between the directory names in the 650 * subvolume, directory names in the log and directory names in the 651 * inode back references, we may have to unlink inodes from directories. 652 * 653 * This is a helper function to do the unlink of a specific directory 654 * item 655 */ 656 static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans, 657 struct btrfs_root *root, 658 struct btrfs_path *path, 659 struct inode *dir, 660 struct btrfs_dir_item *di) 661 { 662 struct inode *inode; 663 char *name; 664 int name_len; 665 struct extent_buffer *leaf; 666 struct btrfs_key location; 667 int ret; 668 669 leaf = path->nodes[0]; 670 671 btrfs_dir_item_key_to_cpu(leaf, di, &location); 672 name_len = btrfs_dir_name_len(leaf, di); 673 name = kmalloc(name_len, GFP_NOFS); 674 if (!name) 675 return -ENOMEM; 676 677 read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len); 678 btrfs_release_path(root, path); 679 680 inode = read_one_inode(root, location.objectid); 681 BUG_ON(!inode); 682 683 ret = link_to_fixup_dir(trans, root, path, location.objectid); 684 BUG_ON(ret); 685 686 ret = btrfs_unlink_inode(trans, root, dir, inode, name, name_len); 687 BUG_ON(ret); 688 kfree(name); 689 690 iput(inode); 691 return ret; 692 } 693 694 /* 695 * helper function to see if a given name and sequence number found 696 * in an inode back reference are already in a directory and correctly 697 * point to this inode 698 */ 699 static noinline int inode_in_dir(struct btrfs_root *root, 700 struct btrfs_path *path, 701 u64 dirid, u64 objectid, u64 index, 702 const char *name, int name_len) 703 { 704 struct btrfs_dir_item *di; 705 struct btrfs_key location; 706 int match = 0; 707 708 di = btrfs_lookup_dir_index_item(NULL, root, path, dirid, 709 index, name, name_len, 0); 710 if (di && !IS_ERR(di)) { 711 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location); 712 if (location.objectid != objectid) 713 goto out; 714 } else 715 goto out; 716 btrfs_release_path(root, path); 717 718 di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 0); 719 if (di && !IS_ERR(di)) { 720 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location); 721 if (location.objectid != objectid) 722 goto out; 723 } else 724 goto out; 725 match = 1; 726 out: 727 btrfs_release_path(root, path); 728 return match; 729 } 730 731 /* 732 * helper function to check a log tree for a named back reference in 733 * an inode. This is used to decide if a back reference that is 734 * found in the subvolume conflicts with what we find in the log. 735 * 736 * inode backreferences may have multiple refs in a single item, 737 * during replay we process one reference at a time, and we don't 738 * want to delete valid links to a file from the subvolume if that 739 * link is also in the log. 740 */ 741 static noinline int backref_in_log(struct btrfs_root *log, 742 struct btrfs_key *key, 743 char *name, int namelen) 744 { 745 struct btrfs_path *path; 746 struct btrfs_inode_ref *ref; 747 unsigned long ptr; 748 unsigned long ptr_end; 749 unsigned long name_ptr; 750 int found_name_len; 751 int item_size; 752 int ret; 753 int match = 0; 754 755 path = btrfs_alloc_path(); 756 if (!path) 757 return -ENOMEM; 758 759 ret = btrfs_search_slot(NULL, log, key, path, 0, 0); 760 if (ret != 0) 761 goto out; 762 763 item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]); 764 ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); 765 ptr_end = ptr + item_size; 766 while (ptr < ptr_end) { 767 ref = (struct btrfs_inode_ref *)ptr; 768 found_name_len = btrfs_inode_ref_name_len(path->nodes[0], ref); 769 if (found_name_len == namelen) { 770 name_ptr = (unsigned long)(ref + 1); 771 ret = memcmp_extent_buffer(path->nodes[0], name, 772 name_ptr, namelen); 773 if (ret == 0) { 774 match = 1; 775 goto out; 776 } 777 } 778 ptr = (unsigned long)(ref + 1) + found_name_len; 779 } 780 out: 781 btrfs_free_path(path); 782 return match; 783 } 784 785 786 /* 787 * replay one inode back reference item found in the log tree. 788 * eb, slot and key refer to the buffer and key found in the log tree. 789 * root is the destination we are replaying into, and path is for temp 790 * use by this function. (it should be released on return). 791 */ 792 static noinline int add_inode_ref(struct btrfs_trans_handle *trans, 793 struct btrfs_root *root, 794 struct btrfs_root *log, 795 struct btrfs_path *path, 796 struct extent_buffer *eb, int slot, 797 struct btrfs_key *key) 798 { 799 struct inode *dir; 800 int ret; 801 struct btrfs_inode_ref *ref; 802 struct inode *inode; 803 char *name; 804 int namelen; 805 unsigned long ref_ptr; 806 unsigned long ref_end; 807 int search_done = 0; 808 809 /* 810 * it is possible that we didn't log all the parent directories 811 * for a given inode. If we don't find the dir, just don't 812 * copy the back ref in. The link count fixup code will take 813 * care of the rest 814 */ 815 dir = read_one_inode(root, key->offset); 816 if (!dir) 817 return -ENOENT; 818 819 inode = read_one_inode(root, key->objectid); 820 BUG_ON(!inode); 821 822 ref_ptr = btrfs_item_ptr_offset(eb, slot); 823 ref_end = ref_ptr + btrfs_item_size_nr(eb, slot); 824 825 again: 826 ref = (struct btrfs_inode_ref *)ref_ptr; 827 828 namelen = btrfs_inode_ref_name_len(eb, ref); 829 name = kmalloc(namelen, GFP_NOFS); 830 BUG_ON(!name); 831 832 read_extent_buffer(eb, name, (unsigned long)(ref + 1), namelen); 833 834 /* if we already have a perfect match, we're done */ 835 if (inode_in_dir(root, path, dir->i_ino, inode->i_ino, 836 btrfs_inode_ref_index(eb, ref), 837 name, namelen)) { 838 goto out; 839 } 840 841 /* 842 * look for a conflicting back reference in the metadata. 843 * if we find one we have to unlink that name of the file 844 * before we add our new link. Later on, we overwrite any 845 * existing back reference, and we don't want to create 846 * dangling pointers in the directory. 847 */ 848 849 if (search_done) 850 goto insert; 851 852 ret = btrfs_search_slot(NULL, root, key, path, 0, 0); 853 if (ret == 0) { 854 char *victim_name; 855 int victim_name_len; 856 struct btrfs_inode_ref *victim_ref; 857 unsigned long ptr; 858 unsigned long ptr_end; 859 struct extent_buffer *leaf = path->nodes[0]; 860 861 /* are we trying to overwrite a back ref for the root directory 862 * if so, just jump out, we're done 863 */ 864 if (key->objectid == key->offset) 865 goto out_nowrite; 866 867 /* check all the names in this back reference to see 868 * if they are in the log. if so, we allow them to stay 869 * otherwise they must be unlinked as a conflict 870 */ 871 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); 872 ptr_end = ptr + btrfs_item_size_nr(leaf, path->slots[0]); 873 while (ptr < ptr_end) { 874 victim_ref = (struct btrfs_inode_ref *)ptr; 875 victim_name_len = btrfs_inode_ref_name_len(leaf, 876 victim_ref); 877 victim_name = kmalloc(victim_name_len, GFP_NOFS); 878 BUG_ON(!victim_name); 879 880 read_extent_buffer(leaf, victim_name, 881 (unsigned long)(victim_ref + 1), 882 victim_name_len); 883 884 if (!backref_in_log(log, key, victim_name, 885 victim_name_len)) { 886 btrfs_inc_nlink(inode); 887 btrfs_release_path(root, path); 888 889 ret = btrfs_unlink_inode(trans, root, dir, 890 inode, victim_name, 891 victim_name_len); 892 } 893 kfree(victim_name); 894 ptr = (unsigned long)(victim_ref + 1) + victim_name_len; 895 } 896 BUG_ON(ret); 897 898 /* 899 * NOTE: we have searched root tree and checked the 900 * coresponding ref, it does not need to check again. 901 */ 902 search_done = 1; 903 } 904 btrfs_release_path(root, path); 905 906 insert: 907 /* insert our name */ 908 ret = btrfs_add_link(trans, dir, inode, name, namelen, 0, 909 btrfs_inode_ref_index(eb, ref)); 910 BUG_ON(ret); 911 912 btrfs_update_inode(trans, root, inode); 913 914 out: 915 ref_ptr = (unsigned long)(ref + 1) + namelen; 916 kfree(name); 917 if (ref_ptr < ref_end) 918 goto again; 919 920 /* finally write the back reference in the inode */ 921 ret = overwrite_item(trans, root, path, eb, slot, key); 922 BUG_ON(ret); 923 924 out_nowrite: 925 btrfs_release_path(root, path); 926 iput(dir); 927 iput(inode); 928 return 0; 929 } 930 931 static int insert_orphan_item(struct btrfs_trans_handle *trans, 932 struct btrfs_root *root, u64 offset) 933 { 934 int ret; 935 ret = btrfs_find_orphan_item(root, offset); 936 if (ret > 0) 937 ret = btrfs_insert_orphan_item(trans, root, offset); 938 return ret; 939 } 940 941 942 /* 943 * There are a few corners where the link count of the file can't 944 * be properly maintained during replay. So, instead of adding 945 * lots of complexity to the log code, we just scan the backrefs 946 * for any file that has been through replay. 947 * 948 * The scan will update the link count on the inode to reflect the 949 * number of back refs found. If it goes down to zero, the iput 950 * will free the inode. 951 */ 952 static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans, 953 struct btrfs_root *root, 954 struct inode *inode) 955 { 956 struct btrfs_path *path; 957 int ret; 958 struct btrfs_key key; 959 u64 nlink = 0; 960 unsigned long ptr; 961 unsigned long ptr_end; 962 int name_len; 963 964 key.objectid = inode->i_ino; 965 key.type = BTRFS_INODE_REF_KEY; 966 key.offset = (u64)-1; 967 968 path = btrfs_alloc_path(); 969 if (!path) 970 return -ENOMEM; 971 972 while (1) { 973 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 974 if (ret < 0) 975 break; 976 if (ret > 0) { 977 if (path->slots[0] == 0) 978 break; 979 path->slots[0]--; 980 } 981 btrfs_item_key_to_cpu(path->nodes[0], &key, 982 path->slots[0]); 983 if (key.objectid != inode->i_ino || 984 key.type != BTRFS_INODE_REF_KEY) 985 break; 986 ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); 987 ptr_end = ptr + btrfs_item_size_nr(path->nodes[0], 988 path->slots[0]); 989 while (ptr < ptr_end) { 990 struct btrfs_inode_ref *ref; 991 992 ref = (struct btrfs_inode_ref *)ptr; 993 name_len = btrfs_inode_ref_name_len(path->nodes[0], 994 ref); 995 ptr = (unsigned long)(ref + 1) + name_len; 996 nlink++; 997 } 998 999 if (key.offset == 0) 1000 break; 1001 key.offset--; 1002 btrfs_release_path(root, path); 1003 } 1004 btrfs_release_path(root, path); 1005 if (nlink != inode->i_nlink) { 1006 inode->i_nlink = nlink; 1007 btrfs_update_inode(trans, root, inode); 1008 } 1009 BTRFS_I(inode)->index_cnt = (u64)-1; 1010 1011 if (inode->i_nlink == 0) { 1012 if (S_ISDIR(inode->i_mode)) { 1013 ret = replay_dir_deletes(trans, root, NULL, path, 1014 inode->i_ino, 1); 1015 BUG_ON(ret); 1016 } 1017 ret = insert_orphan_item(trans, root, inode->i_ino); 1018 BUG_ON(ret); 1019 } 1020 btrfs_free_path(path); 1021 1022 return 0; 1023 } 1024 1025 static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans, 1026 struct btrfs_root *root, 1027 struct btrfs_path *path) 1028 { 1029 int ret; 1030 struct btrfs_key key; 1031 struct inode *inode; 1032 1033 key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID; 1034 key.type = BTRFS_ORPHAN_ITEM_KEY; 1035 key.offset = (u64)-1; 1036 while (1) { 1037 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 1038 if (ret < 0) 1039 break; 1040 1041 if (ret == 1) { 1042 if (path->slots[0] == 0) 1043 break; 1044 path->slots[0]--; 1045 } 1046 1047 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 1048 if (key.objectid != BTRFS_TREE_LOG_FIXUP_OBJECTID || 1049 key.type != BTRFS_ORPHAN_ITEM_KEY) 1050 break; 1051 1052 ret = btrfs_del_item(trans, root, path); 1053 BUG_ON(ret); 1054 1055 btrfs_release_path(root, path); 1056 inode = read_one_inode(root, key.offset); 1057 BUG_ON(!inode); 1058 1059 ret = fixup_inode_link_count(trans, root, inode); 1060 BUG_ON(ret); 1061 1062 iput(inode); 1063 1064 /* 1065 * fixup on a directory may create new entries, 1066 * make sure we always look for the highset possible 1067 * offset 1068 */ 1069 key.offset = (u64)-1; 1070 } 1071 btrfs_release_path(root, path); 1072 return 0; 1073 } 1074 1075 1076 /* 1077 * record a given inode in the fixup dir so we can check its link 1078 * count when replay is done. The link count is incremented here 1079 * so the inode won't go away until we check it 1080 */ 1081 static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans, 1082 struct btrfs_root *root, 1083 struct btrfs_path *path, 1084 u64 objectid) 1085 { 1086 struct btrfs_key key; 1087 int ret = 0; 1088 struct inode *inode; 1089 1090 inode = read_one_inode(root, objectid); 1091 BUG_ON(!inode); 1092 1093 key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID; 1094 btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY); 1095 key.offset = objectid; 1096 1097 ret = btrfs_insert_empty_item(trans, root, path, &key, 0); 1098 1099 btrfs_release_path(root, path); 1100 if (ret == 0) { 1101 btrfs_inc_nlink(inode); 1102 btrfs_update_inode(trans, root, inode); 1103 } else if (ret == -EEXIST) { 1104 ret = 0; 1105 } else { 1106 BUG(); 1107 } 1108 iput(inode); 1109 1110 return ret; 1111 } 1112 1113 /* 1114 * when replaying the log for a directory, we only insert names 1115 * for inodes that actually exist. This means an fsync on a directory 1116 * does not implicitly fsync all the new files in it 1117 */ 1118 static noinline int insert_one_name(struct btrfs_trans_handle *trans, 1119 struct btrfs_root *root, 1120 struct btrfs_path *path, 1121 u64 dirid, u64 index, 1122 char *name, int name_len, u8 type, 1123 struct btrfs_key *location) 1124 { 1125 struct inode *inode; 1126 struct inode *dir; 1127 int ret; 1128 1129 inode = read_one_inode(root, location->objectid); 1130 if (!inode) 1131 return -ENOENT; 1132 1133 dir = read_one_inode(root, dirid); 1134 if (!dir) { 1135 iput(inode); 1136 return -EIO; 1137 } 1138 ret = btrfs_add_link(trans, dir, inode, name, name_len, 1, index); 1139 1140 /* FIXME, put inode into FIXUP list */ 1141 1142 iput(inode); 1143 iput(dir); 1144 return ret; 1145 } 1146 1147 /* 1148 * take a single entry in a log directory item and replay it into 1149 * the subvolume. 1150 * 1151 * if a conflicting item exists in the subdirectory already, 1152 * the inode it points to is unlinked and put into the link count 1153 * fix up tree. 1154 * 1155 * If a name from the log points to a file or directory that does 1156 * not exist in the FS, it is skipped. fsyncs on directories 1157 * do not force down inodes inside that directory, just changes to the 1158 * names or unlinks in a directory. 1159 */ 1160 static noinline int replay_one_name(struct btrfs_trans_handle *trans, 1161 struct btrfs_root *root, 1162 struct btrfs_path *path, 1163 struct extent_buffer *eb, 1164 struct btrfs_dir_item *di, 1165 struct btrfs_key *key) 1166 { 1167 char *name; 1168 int name_len; 1169 struct btrfs_dir_item *dst_di; 1170 struct btrfs_key found_key; 1171 struct btrfs_key log_key; 1172 struct inode *dir; 1173 u8 log_type; 1174 int exists; 1175 int ret; 1176 1177 dir = read_one_inode(root, key->objectid); 1178 BUG_ON(!dir); 1179 1180 name_len = btrfs_dir_name_len(eb, di); 1181 name = kmalloc(name_len, GFP_NOFS); 1182 if (!name) 1183 return -ENOMEM; 1184 1185 log_type = btrfs_dir_type(eb, di); 1186 read_extent_buffer(eb, name, (unsigned long)(di + 1), 1187 name_len); 1188 1189 btrfs_dir_item_key_to_cpu(eb, di, &log_key); 1190 exists = btrfs_lookup_inode(trans, root, path, &log_key, 0); 1191 if (exists == 0) 1192 exists = 1; 1193 else 1194 exists = 0; 1195 btrfs_release_path(root, path); 1196 1197 if (key->type == BTRFS_DIR_ITEM_KEY) { 1198 dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid, 1199 name, name_len, 1); 1200 } else if (key->type == BTRFS_DIR_INDEX_KEY) { 1201 dst_di = btrfs_lookup_dir_index_item(trans, root, path, 1202 key->objectid, 1203 key->offset, name, 1204 name_len, 1); 1205 } else { 1206 BUG(); 1207 } 1208 if (!dst_di || IS_ERR(dst_di)) { 1209 /* we need a sequence number to insert, so we only 1210 * do inserts for the BTRFS_DIR_INDEX_KEY types 1211 */ 1212 if (key->type != BTRFS_DIR_INDEX_KEY) 1213 goto out; 1214 goto insert; 1215 } 1216 1217 btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key); 1218 /* the existing item matches the logged item */ 1219 if (found_key.objectid == log_key.objectid && 1220 found_key.type == log_key.type && 1221 found_key.offset == log_key.offset && 1222 btrfs_dir_type(path->nodes[0], dst_di) == log_type) { 1223 goto out; 1224 } 1225 1226 /* 1227 * don't drop the conflicting directory entry if the inode 1228 * for the new entry doesn't exist 1229 */ 1230 if (!exists) 1231 goto out; 1232 1233 ret = drop_one_dir_item(trans, root, path, dir, dst_di); 1234 BUG_ON(ret); 1235 1236 if (key->type == BTRFS_DIR_INDEX_KEY) 1237 goto insert; 1238 out: 1239 btrfs_release_path(root, path); 1240 kfree(name); 1241 iput(dir); 1242 return 0; 1243 1244 insert: 1245 btrfs_release_path(root, path); 1246 ret = insert_one_name(trans, root, path, key->objectid, key->offset, 1247 name, name_len, log_type, &log_key); 1248 1249 BUG_ON(ret && ret != -ENOENT); 1250 goto out; 1251 } 1252 1253 /* 1254 * find all the names in a directory item and reconcile them into 1255 * the subvolume. Only BTRFS_DIR_ITEM_KEY types will have more than 1256 * one name in a directory item, but the same code gets used for 1257 * both directory index types 1258 */ 1259 static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans, 1260 struct btrfs_root *root, 1261 struct btrfs_path *path, 1262 struct extent_buffer *eb, int slot, 1263 struct btrfs_key *key) 1264 { 1265 int ret; 1266 u32 item_size = btrfs_item_size_nr(eb, slot); 1267 struct btrfs_dir_item *di; 1268 int name_len; 1269 unsigned long ptr; 1270 unsigned long ptr_end; 1271 1272 ptr = btrfs_item_ptr_offset(eb, slot); 1273 ptr_end = ptr + item_size; 1274 while (ptr < ptr_end) { 1275 di = (struct btrfs_dir_item *)ptr; 1276 if (verify_dir_item(root, eb, di)) 1277 return -EIO; 1278 name_len = btrfs_dir_name_len(eb, di); 1279 ret = replay_one_name(trans, root, path, eb, di, key); 1280 BUG_ON(ret); 1281 ptr = (unsigned long)(di + 1); 1282 ptr += name_len; 1283 } 1284 return 0; 1285 } 1286 1287 /* 1288 * directory replay has two parts. There are the standard directory 1289 * items in the log copied from the subvolume, and range items 1290 * created in the log while the subvolume was logged. 1291 * 1292 * The range items tell us which parts of the key space the log 1293 * is authoritative for. During replay, if a key in the subvolume 1294 * directory is in a logged range item, but not actually in the log 1295 * that means it was deleted from the directory before the fsync 1296 * and should be removed. 1297 */ 1298 static noinline int find_dir_range(struct btrfs_root *root, 1299 struct btrfs_path *path, 1300 u64 dirid, int key_type, 1301 u64 *start_ret, u64 *end_ret) 1302 { 1303 struct btrfs_key key; 1304 u64 found_end; 1305 struct btrfs_dir_log_item *item; 1306 int ret; 1307 int nritems; 1308 1309 if (*start_ret == (u64)-1) 1310 return 1; 1311 1312 key.objectid = dirid; 1313 key.type = key_type; 1314 key.offset = *start_ret; 1315 1316 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 1317 if (ret < 0) 1318 goto out; 1319 if (ret > 0) { 1320 if (path->slots[0] == 0) 1321 goto out; 1322 path->slots[0]--; 1323 } 1324 if (ret != 0) 1325 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 1326 1327 if (key.type != key_type || key.objectid != dirid) { 1328 ret = 1; 1329 goto next; 1330 } 1331 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 1332 struct btrfs_dir_log_item); 1333 found_end = btrfs_dir_log_end(path->nodes[0], item); 1334 1335 if (*start_ret >= key.offset && *start_ret <= found_end) { 1336 ret = 0; 1337 *start_ret = key.offset; 1338 *end_ret = found_end; 1339 goto out; 1340 } 1341 ret = 1; 1342 next: 1343 /* check the next slot in the tree to see if it is a valid item */ 1344 nritems = btrfs_header_nritems(path->nodes[0]); 1345 if (path->slots[0] >= nritems) { 1346 ret = btrfs_next_leaf(root, path); 1347 if (ret) 1348 goto out; 1349 } else { 1350 path->slots[0]++; 1351 } 1352 1353 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 1354 1355 if (key.type != key_type || key.objectid != dirid) { 1356 ret = 1; 1357 goto out; 1358 } 1359 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 1360 struct btrfs_dir_log_item); 1361 found_end = btrfs_dir_log_end(path->nodes[0], item); 1362 *start_ret = key.offset; 1363 *end_ret = found_end; 1364 ret = 0; 1365 out: 1366 btrfs_release_path(root, path); 1367 return ret; 1368 } 1369 1370 /* 1371 * this looks for a given directory item in the log. If the directory 1372 * item is not in the log, the item is removed and the inode it points 1373 * to is unlinked 1374 */ 1375 static noinline int check_item_in_log(struct btrfs_trans_handle *trans, 1376 struct btrfs_root *root, 1377 struct btrfs_root *log, 1378 struct btrfs_path *path, 1379 struct btrfs_path *log_path, 1380 struct inode *dir, 1381 struct btrfs_key *dir_key) 1382 { 1383 int ret; 1384 struct extent_buffer *eb; 1385 int slot; 1386 u32 item_size; 1387 struct btrfs_dir_item *di; 1388 struct btrfs_dir_item *log_di; 1389 int name_len; 1390 unsigned long ptr; 1391 unsigned long ptr_end; 1392 char *name; 1393 struct inode *inode; 1394 struct btrfs_key location; 1395 1396 again: 1397 eb = path->nodes[0]; 1398 slot = path->slots[0]; 1399 item_size = btrfs_item_size_nr(eb, slot); 1400 ptr = btrfs_item_ptr_offset(eb, slot); 1401 ptr_end = ptr + item_size; 1402 while (ptr < ptr_end) { 1403 di = (struct btrfs_dir_item *)ptr; 1404 if (verify_dir_item(root, eb, di)) { 1405 ret = -EIO; 1406 goto out; 1407 } 1408 1409 name_len = btrfs_dir_name_len(eb, di); 1410 name = kmalloc(name_len, GFP_NOFS); 1411 if (!name) { 1412 ret = -ENOMEM; 1413 goto out; 1414 } 1415 read_extent_buffer(eb, name, (unsigned long)(di + 1), 1416 name_len); 1417 log_di = NULL; 1418 if (log && dir_key->type == BTRFS_DIR_ITEM_KEY) { 1419 log_di = btrfs_lookup_dir_item(trans, log, log_path, 1420 dir_key->objectid, 1421 name, name_len, 0); 1422 } else if (log && dir_key->type == BTRFS_DIR_INDEX_KEY) { 1423 log_di = btrfs_lookup_dir_index_item(trans, log, 1424 log_path, 1425 dir_key->objectid, 1426 dir_key->offset, 1427 name, name_len, 0); 1428 } 1429 if (!log_di || IS_ERR(log_di)) { 1430 btrfs_dir_item_key_to_cpu(eb, di, &location); 1431 btrfs_release_path(root, path); 1432 btrfs_release_path(log, log_path); 1433 inode = read_one_inode(root, location.objectid); 1434 BUG_ON(!inode); 1435 1436 ret = link_to_fixup_dir(trans, root, 1437 path, location.objectid); 1438 BUG_ON(ret); 1439 btrfs_inc_nlink(inode); 1440 ret = btrfs_unlink_inode(trans, root, dir, inode, 1441 name, name_len); 1442 BUG_ON(ret); 1443 kfree(name); 1444 iput(inode); 1445 1446 /* there might still be more names under this key 1447 * check and repeat if required 1448 */ 1449 ret = btrfs_search_slot(NULL, root, dir_key, path, 1450 0, 0); 1451 if (ret == 0) 1452 goto again; 1453 ret = 0; 1454 goto out; 1455 } 1456 btrfs_release_path(log, log_path); 1457 kfree(name); 1458 1459 ptr = (unsigned long)(di + 1); 1460 ptr += name_len; 1461 } 1462 ret = 0; 1463 out: 1464 btrfs_release_path(root, path); 1465 btrfs_release_path(log, log_path); 1466 return ret; 1467 } 1468 1469 /* 1470 * deletion replay happens before we copy any new directory items 1471 * out of the log or out of backreferences from inodes. It 1472 * scans the log to find ranges of keys that log is authoritative for, 1473 * and then scans the directory to find items in those ranges that are 1474 * not present in the log. 1475 * 1476 * Anything we don't find in the log is unlinked and removed from the 1477 * directory. 1478 */ 1479 static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, 1480 struct btrfs_root *root, 1481 struct btrfs_root *log, 1482 struct btrfs_path *path, 1483 u64 dirid, int del_all) 1484 { 1485 u64 range_start; 1486 u64 range_end; 1487 int key_type = BTRFS_DIR_LOG_ITEM_KEY; 1488 int ret = 0; 1489 struct btrfs_key dir_key; 1490 struct btrfs_key found_key; 1491 struct btrfs_path *log_path; 1492 struct inode *dir; 1493 1494 dir_key.objectid = dirid; 1495 dir_key.type = BTRFS_DIR_ITEM_KEY; 1496 log_path = btrfs_alloc_path(); 1497 if (!log_path) 1498 return -ENOMEM; 1499 1500 dir = read_one_inode(root, dirid); 1501 /* it isn't an error if the inode isn't there, that can happen 1502 * because we replay the deletes before we copy in the inode item 1503 * from the log 1504 */ 1505 if (!dir) { 1506 btrfs_free_path(log_path); 1507 return 0; 1508 } 1509 again: 1510 range_start = 0; 1511 range_end = 0; 1512 while (1) { 1513 if (del_all) 1514 range_end = (u64)-1; 1515 else { 1516 ret = find_dir_range(log, path, dirid, key_type, 1517 &range_start, &range_end); 1518 if (ret != 0) 1519 break; 1520 } 1521 1522 dir_key.offset = range_start; 1523 while (1) { 1524 int nritems; 1525 ret = btrfs_search_slot(NULL, root, &dir_key, path, 1526 0, 0); 1527 if (ret < 0) 1528 goto out; 1529 1530 nritems = btrfs_header_nritems(path->nodes[0]); 1531 if (path->slots[0] >= nritems) { 1532 ret = btrfs_next_leaf(root, path); 1533 if (ret) 1534 break; 1535 } 1536 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 1537 path->slots[0]); 1538 if (found_key.objectid != dirid || 1539 found_key.type != dir_key.type) 1540 goto next_type; 1541 1542 if (found_key.offset > range_end) 1543 break; 1544 1545 ret = check_item_in_log(trans, root, log, path, 1546 log_path, dir, 1547 &found_key); 1548 BUG_ON(ret); 1549 if (found_key.offset == (u64)-1) 1550 break; 1551 dir_key.offset = found_key.offset + 1; 1552 } 1553 btrfs_release_path(root, path); 1554 if (range_end == (u64)-1) 1555 break; 1556 range_start = range_end + 1; 1557 } 1558 1559 next_type: 1560 ret = 0; 1561 if (key_type == BTRFS_DIR_LOG_ITEM_KEY) { 1562 key_type = BTRFS_DIR_LOG_INDEX_KEY; 1563 dir_key.type = BTRFS_DIR_INDEX_KEY; 1564 btrfs_release_path(root, path); 1565 goto again; 1566 } 1567 out: 1568 btrfs_release_path(root, path); 1569 btrfs_free_path(log_path); 1570 iput(dir); 1571 return ret; 1572 } 1573 1574 /* 1575 * the process_func used to replay items from the log tree. This 1576 * gets called in two different stages. The first stage just looks 1577 * for inodes and makes sure they are all copied into the subvolume. 1578 * 1579 * The second stage copies all the other item types from the log into 1580 * the subvolume. The two stage approach is slower, but gets rid of 1581 * lots of complexity around inodes referencing other inodes that exist 1582 * only in the log (references come from either directory items or inode 1583 * back refs). 1584 */ 1585 static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, 1586 struct walk_control *wc, u64 gen) 1587 { 1588 int nritems; 1589 struct btrfs_path *path; 1590 struct btrfs_root *root = wc->replay_dest; 1591 struct btrfs_key key; 1592 int level; 1593 int i; 1594 int ret; 1595 1596 btrfs_read_buffer(eb, gen); 1597 1598 level = btrfs_header_level(eb); 1599 1600 if (level != 0) 1601 return 0; 1602 1603 path = btrfs_alloc_path(); 1604 BUG_ON(!path); 1605 1606 nritems = btrfs_header_nritems(eb); 1607 for (i = 0; i < nritems; i++) { 1608 btrfs_item_key_to_cpu(eb, &key, i); 1609 1610 /* inode keys are done during the first stage */ 1611 if (key.type == BTRFS_INODE_ITEM_KEY && 1612 wc->stage == LOG_WALK_REPLAY_INODES) { 1613 struct btrfs_inode_item *inode_item; 1614 u32 mode; 1615 1616 inode_item = btrfs_item_ptr(eb, i, 1617 struct btrfs_inode_item); 1618 mode = btrfs_inode_mode(eb, inode_item); 1619 if (S_ISDIR(mode)) { 1620 ret = replay_dir_deletes(wc->trans, 1621 root, log, path, key.objectid, 0); 1622 BUG_ON(ret); 1623 } 1624 ret = overwrite_item(wc->trans, root, path, 1625 eb, i, &key); 1626 BUG_ON(ret); 1627 1628 /* for regular files, make sure corresponding 1629 * orhpan item exist. extents past the new EOF 1630 * will be truncated later by orphan cleanup. 1631 */ 1632 if (S_ISREG(mode)) { 1633 ret = insert_orphan_item(wc->trans, root, 1634 key.objectid); 1635 BUG_ON(ret); 1636 } 1637 1638 ret = link_to_fixup_dir(wc->trans, root, 1639 path, key.objectid); 1640 BUG_ON(ret); 1641 } 1642 if (wc->stage < LOG_WALK_REPLAY_ALL) 1643 continue; 1644 1645 /* these keys are simply copied */ 1646 if (key.type == BTRFS_XATTR_ITEM_KEY) { 1647 ret = overwrite_item(wc->trans, root, path, 1648 eb, i, &key); 1649 BUG_ON(ret); 1650 } else if (key.type == BTRFS_INODE_REF_KEY) { 1651 ret = add_inode_ref(wc->trans, root, log, path, 1652 eb, i, &key); 1653 BUG_ON(ret && ret != -ENOENT); 1654 } else if (key.type == BTRFS_EXTENT_DATA_KEY) { 1655 ret = replay_one_extent(wc->trans, root, path, 1656 eb, i, &key); 1657 BUG_ON(ret); 1658 } else if (key.type == BTRFS_DIR_ITEM_KEY || 1659 key.type == BTRFS_DIR_INDEX_KEY) { 1660 ret = replay_one_dir_item(wc->trans, root, path, 1661 eb, i, &key); 1662 BUG_ON(ret); 1663 } 1664 } 1665 btrfs_free_path(path); 1666 return 0; 1667 } 1668 1669 static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, 1670 struct btrfs_root *root, 1671 struct btrfs_path *path, int *level, 1672 struct walk_control *wc) 1673 { 1674 u64 root_owner; 1675 u64 bytenr; 1676 u64 ptr_gen; 1677 struct extent_buffer *next; 1678 struct extent_buffer *cur; 1679 struct extent_buffer *parent; 1680 u32 blocksize; 1681 int ret = 0; 1682 1683 WARN_ON(*level < 0); 1684 WARN_ON(*level >= BTRFS_MAX_LEVEL); 1685 1686 while (*level > 0) { 1687 WARN_ON(*level < 0); 1688 WARN_ON(*level >= BTRFS_MAX_LEVEL); 1689 cur = path->nodes[*level]; 1690 1691 if (btrfs_header_level(cur) != *level) 1692 WARN_ON(1); 1693 1694 if (path->slots[*level] >= 1695 btrfs_header_nritems(cur)) 1696 break; 1697 1698 bytenr = btrfs_node_blockptr(cur, path->slots[*level]); 1699 ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]); 1700 blocksize = btrfs_level_size(root, *level - 1); 1701 1702 parent = path->nodes[*level]; 1703 root_owner = btrfs_header_owner(parent); 1704 1705 next = btrfs_find_create_tree_block(root, bytenr, blocksize); 1706 if (!next) 1707 return -ENOMEM; 1708 1709 if (*level == 1) { 1710 wc->process_func(root, next, wc, ptr_gen); 1711 1712 path->slots[*level]++; 1713 if (wc->free) { 1714 btrfs_read_buffer(next, ptr_gen); 1715 1716 btrfs_tree_lock(next); 1717 clean_tree_block(trans, root, next); 1718 btrfs_set_lock_blocking(next); 1719 btrfs_wait_tree_block_writeback(next); 1720 btrfs_tree_unlock(next); 1721 1722 WARN_ON(root_owner != 1723 BTRFS_TREE_LOG_OBJECTID); 1724 ret = btrfs_free_reserved_extent(root, 1725 bytenr, blocksize); 1726 BUG_ON(ret); 1727 } 1728 free_extent_buffer(next); 1729 continue; 1730 } 1731 btrfs_read_buffer(next, ptr_gen); 1732 1733 WARN_ON(*level <= 0); 1734 if (path->nodes[*level-1]) 1735 free_extent_buffer(path->nodes[*level-1]); 1736 path->nodes[*level-1] = next; 1737 *level = btrfs_header_level(next); 1738 path->slots[*level] = 0; 1739 cond_resched(); 1740 } 1741 WARN_ON(*level < 0); 1742 WARN_ON(*level >= BTRFS_MAX_LEVEL); 1743 1744 path->slots[*level] = btrfs_header_nritems(path->nodes[*level]); 1745 1746 cond_resched(); 1747 return 0; 1748 } 1749 1750 static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, 1751 struct btrfs_root *root, 1752 struct btrfs_path *path, int *level, 1753 struct walk_control *wc) 1754 { 1755 u64 root_owner; 1756 int i; 1757 int slot; 1758 int ret; 1759 1760 for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) { 1761 slot = path->slots[i]; 1762 if (slot + 1 < btrfs_header_nritems(path->nodes[i])) { 1763 path->slots[i]++; 1764 *level = i; 1765 WARN_ON(*level == 0); 1766 return 0; 1767 } else { 1768 struct extent_buffer *parent; 1769 if (path->nodes[*level] == root->node) 1770 parent = path->nodes[*level]; 1771 else 1772 parent = path->nodes[*level + 1]; 1773 1774 root_owner = btrfs_header_owner(parent); 1775 wc->process_func(root, path->nodes[*level], wc, 1776 btrfs_header_generation(path->nodes[*level])); 1777 if (wc->free) { 1778 struct extent_buffer *next; 1779 1780 next = path->nodes[*level]; 1781 1782 btrfs_tree_lock(next); 1783 clean_tree_block(trans, root, next); 1784 btrfs_set_lock_blocking(next); 1785 btrfs_wait_tree_block_writeback(next); 1786 btrfs_tree_unlock(next); 1787 1788 WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID); 1789 ret = btrfs_free_reserved_extent(root, 1790 path->nodes[*level]->start, 1791 path->nodes[*level]->len); 1792 BUG_ON(ret); 1793 } 1794 free_extent_buffer(path->nodes[*level]); 1795 path->nodes[*level] = NULL; 1796 *level = i + 1; 1797 } 1798 } 1799 return 1; 1800 } 1801 1802 /* 1803 * drop the reference count on the tree rooted at 'snap'. This traverses 1804 * the tree freeing any blocks that have a ref count of zero after being 1805 * decremented. 1806 */ 1807 static int walk_log_tree(struct btrfs_trans_handle *trans, 1808 struct btrfs_root *log, struct walk_control *wc) 1809 { 1810 int ret = 0; 1811 int wret; 1812 int level; 1813 struct btrfs_path *path; 1814 int i; 1815 int orig_level; 1816 1817 path = btrfs_alloc_path(); 1818 if (!path) 1819 return -ENOMEM; 1820 1821 level = btrfs_header_level(log->node); 1822 orig_level = level; 1823 path->nodes[level] = log->node; 1824 extent_buffer_get(log->node); 1825 path->slots[level] = 0; 1826 1827 while (1) { 1828 wret = walk_down_log_tree(trans, log, path, &level, wc); 1829 if (wret > 0) 1830 break; 1831 if (wret < 0) 1832 ret = wret; 1833 1834 wret = walk_up_log_tree(trans, log, path, &level, wc); 1835 if (wret > 0) 1836 break; 1837 if (wret < 0) 1838 ret = wret; 1839 } 1840 1841 /* was the root node processed? if not, catch it here */ 1842 if (path->nodes[orig_level]) { 1843 wc->process_func(log, path->nodes[orig_level], wc, 1844 btrfs_header_generation(path->nodes[orig_level])); 1845 if (wc->free) { 1846 struct extent_buffer *next; 1847 1848 next = path->nodes[orig_level]; 1849 1850 btrfs_tree_lock(next); 1851 clean_tree_block(trans, log, next); 1852 btrfs_set_lock_blocking(next); 1853 btrfs_wait_tree_block_writeback(next); 1854 btrfs_tree_unlock(next); 1855 1856 WARN_ON(log->root_key.objectid != 1857 BTRFS_TREE_LOG_OBJECTID); 1858 ret = btrfs_free_reserved_extent(log, next->start, 1859 next->len); 1860 BUG_ON(ret); 1861 } 1862 } 1863 1864 for (i = 0; i <= orig_level; i++) { 1865 if (path->nodes[i]) { 1866 free_extent_buffer(path->nodes[i]); 1867 path->nodes[i] = NULL; 1868 } 1869 } 1870 btrfs_free_path(path); 1871 return ret; 1872 } 1873 1874 /* 1875 * helper function to update the item for a given subvolumes log root 1876 * in the tree of log roots 1877 */ 1878 static int update_log_root(struct btrfs_trans_handle *trans, 1879 struct btrfs_root *log) 1880 { 1881 int ret; 1882 1883 if (log->log_transid == 1) { 1884 /* insert root item on the first sync */ 1885 ret = btrfs_insert_root(trans, log->fs_info->log_root_tree, 1886 &log->root_key, &log->root_item); 1887 } else { 1888 ret = btrfs_update_root(trans, log->fs_info->log_root_tree, 1889 &log->root_key, &log->root_item); 1890 } 1891 return ret; 1892 } 1893 1894 static int wait_log_commit(struct btrfs_trans_handle *trans, 1895 struct btrfs_root *root, unsigned long transid) 1896 { 1897 DEFINE_WAIT(wait); 1898 int index = transid % 2; 1899 1900 /* 1901 * we only allow two pending log transactions at a time, 1902 * so we know that if ours is more than 2 older than the 1903 * current transaction, we're done 1904 */ 1905 do { 1906 prepare_to_wait(&root->log_commit_wait[index], 1907 &wait, TASK_UNINTERRUPTIBLE); 1908 mutex_unlock(&root->log_mutex); 1909 1910 if (root->fs_info->last_trans_log_full_commit != 1911 trans->transid && root->log_transid < transid + 2 && 1912 atomic_read(&root->log_commit[index])) 1913 schedule(); 1914 1915 finish_wait(&root->log_commit_wait[index], &wait); 1916 mutex_lock(&root->log_mutex); 1917 } while (root->log_transid < transid + 2 && 1918 atomic_read(&root->log_commit[index])); 1919 return 0; 1920 } 1921 1922 static int wait_for_writer(struct btrfs_trans_handle *trans, 1923 struct btrfs_root *root) 1924 { 1925 DEFINE_WAIT(wait); 1926 while (atomic_read(&root->log_writers)) { 1927 prepare_to_wait(&root->log_writer_wait, 1928 &wait, TASK_UNINTERRUPTIBLE); 1929 mutex_unlock(&root->log_mutex); 1930 if (root->fs_info->last_trans_log_full_commit != 1931 trans->transid && atomic_read(&root->log_writers)) 1932 schedule(); 1933 mutex_lock(&root->log_mutex); 1934 finish_wait(&root->log_writer_wait, &wait); 1935 } 1936 return 0; 1937 } 1938 1939 /* 1940 * btrfs_sync_log does sends a given tree log down to the disk and 1941 * updates the super blocks to record it. When this call is done, 1942 * you know that any inodes previously logged are safely on disk only 1943 * if it returns 0. 1944 * 1945 * Any other return value means you need to call btrfs_commit_transaction. 1946 * Some of the edge cases for fsyncing directories that have had unlinks 1947 * or renames done in the past mean that sometimes the only safe 1948 * fsync is to commit the whole FS. When btrfs_sync_log returns -EAGAIN, 1949 * that has happened. 1950 */ 1951 int btrfs_sync_log(struct btrfs_trans_handle *trans, 1952 struct btrfs_root *root) 1953 { 1954 int index1; 1955 int index2; 1956 int mark; 1957 int ret; 1958 struct btrfs_root *log = root->log_root; 1959 struct btrfs_root *log_root_tree = root->fs_info->log_root_tree; 1960 unsigned long log_transid = 0; 1961 1962 mutex_lock(&root->log_mutex); 1963 index1 = root->log_transid % 2; 1964 if (atomic_read(&root->log_commit[index1])) { 1965 wait_log_commit(trans, root, root->log_transid); 1966 mutex_unlock(&root->log_mutex); 1967 return 0; 1968 } 1969 atomic_set(&root->log_commit[index1], 1); 1970 1971 /* wait for previous tree log sync to complete */ 1972 if (atomic_read(&root->log_commit[(index1 + 1) % 2])) 1973 wait_log_commit(trans, root, root->log_transid - 1); 1974 1975 while (1) { 1976 unsigned long batch = root->log_batch; 1977 if (root->log_multiple_pids) { 1978 mutex_unlock(&root->log_mutex); 1979 schedule_timeout_uninterruptible(1); 1980 mutex_lock(&root->log_mutex); 1981 } 1982 wait_for_writer(trans, root); 1983 if (batch == root->log_batch) 1984 break; 1985 } 1986 1987 /* bail out if we need to do a full commit */ 1988 if (root->fs_info->last_trans_log_full_commit == trans->transid) { 1989 ret = -EAGAIN; 1990 mutex_unlock(&root->log_mutex); 1991 goto out; 1992 } 1993 1994 log_transid = root->log_transid; 1995 if (log_transid % 2 == 0) 1996 mark = EXTENT_DIRTY; 1997 else 1998 mark = EXTENT_NEW; 1999 2000 /* we start IO on all the marked extents here, but we don't actually 2001 * wait for them until later. 2002 */ 2003 ret = btrfs_write_marked_extents(log, &log->dirty_log_pages, mark); 2004 BUG_ON(ret); 2005 2006 btrfs_set_root_node(&log->root_item, log->node); 2007 2008 root->log_batch = 0; 2009 root->log_transid++; 2010 log->log_transid = root->log_transid; 2011 root->log_start_pid = 0; 2012 smp_mb(); 2013 /* 2014 * IO has been started, blocks of the log tree have WRITTEN flag set 2015 * in their headers. new modifications of the log will be written to 2016 * new positions. so it's safe to allow log writers to go in. 2017 */ 2018 mutex_unlock(&root->log_mutex); 2019 2020 mutex_lock(&log_root_tree->log_mutex); 2021 log_root_tree->log_batch++; 2022 atomic_inc(&log_root_tree->log_writers); 2023 mutex_unlock(&log_root_tree->log_mutex); 2024 2025 ret = update_log_root(trans, log); 2026 2027 mutex_lock(&log_root_tree->log_mutex); 2028 if (atomic_dec_and_test(&log_root_tree->log_writers)) { 2029 smp_mb(); 2030 if (waitqueue_active(&log_root_tree->log_writer_wait)) 2031 wake_up(&log_root_tree->log_writer_wait); 2032 } 2033 2034 if (ret) { 2035 BUG_ON(ret != -ENOSPC); 2036 root->fs_info->last_trans_log_full_commit = trans->transid; 2037 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); 2038 mutex_unlock(&log_root_tree->log_mutex); 2039 ret = -EAGAIN; 2040 goto out; 2041 } 2042 2043 index2 = log_root_tree->log_transid % 2; 2044 if (atomic_read(&log_root_tree->log_commit[index2])) { 2045 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); 2046 wait_log_commit(trans, log_root_tree, 2047 log_root_tree->log_transid); 2048 mutex_unlock(&log_root_tree->log_mutex); 2049 ret = 0; 2050 goto out; 2051 } 2052 atomic_set(&log_root_tree->log_commit[index2], 1); 2053 2054 if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) { 2055 wait_log_commit(trans, log_root_tree, 2056 log_root_tree->log_transid - 1); 2057 } 2058 2059 wait_for_writer(trans, log_root_tree); 2060 2061 /* 2062 * now that we've moved on to the tree of log tree roots, 2063 * check the full commit flag again 2064 */ 2065 if (root->fs_info->last_trans_log_full_commit == trans->transid) { 2066 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); 2067 mutex_unlock(&log_root_tree->log_mutex); 2068 ret = -EAGAIN; 2069 goto out_wake_log_root; 2070 } 2071 2072 ret = btrfs_write_and_wait_marked_extents(log_root_tree, 2073 &log_root_tree->dirty_log_pages, 2074 EXTENT_DIRTY | EXTENT_NEW); 2075 BUG_ON(ret); 2076 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); 2077 2078 btrfs_set_super_log_root(&root->fs_info->super_for_commit, 2079 log_root_tree->node->start); 2080 btrfs_set_super_log_root_level(&root->fs_info->super_for_commit, 2081 btrfs_header_level(log_root_tree->node)); 2082 2083 log_root_tree->log_batch = 0; 2084 log_root_tree->log_transid++; 2085 smp_mb(); 2086 2087 mutex_unlock(&log_root_tree->log_mutex); 2088 2089 /* 2090 * nobody else is going to jump in and write the the ctree 2091 * super here because the log_commit atomic below is protecting 2092 * us. We must be called with a transaction handle pinning 2093 * the running transaction open, so a full commit can't hop 2094 * in and cause problems either. 2095 */ 2096 write_ctree_super(trans, root->fs_info->tree_root, 1); 2097 ret = 0; 2098 2099 mutex_lock(&root->log_mutex); 2100 if (root->last_log_commit < log_transid) 2101 root->last_log_commit = log_transid; 2102 mutex_unlock(&root->log_mutex); 2103 2104 out_wake_log_root: 2105 atomic_set(&log_root_tree->log_commit[index2], 0); 2106 smp_mb(); 2107 if (waitqueue_active(&log_root_tree->log_commit_wait[index2])) 2108 wake_up(&log_root_tree->log_commit_wait[index2]); 2109 out: 2110 atomic_set(&root->log_commit[index1], 0); 2111 smp_mb(); 2112 if (waitqueue_active(&root->log_commit_wait[index1])) 2113 wake_up(&root->log_commit_wait[index1]); 2114 return ret; 2115 } 2116 2117 static void free_log_tree(struct btrfs_trans_handle *trans, 2118 struct btrfs_root *log) 2119 { 2120 int ret; 2121 u64 start; 2122 u64 end; 2123 struct walk_control wc = { 2124 .free = 1, 2125 .process_func = process_one_buffer 2126 }; 2127 2128 ret = walk_log_tree(trans, log, &wc); 2129 BUG_ON(ret); 2130 2131 while (1) { 2132 ret = find_first_extent_bit(&log->dirty_log_pages, 2133 0, &start, &end, EXTENT_DIRTY | EXTENT_NEW); 2134 if (ret) 2135 break; 2136 2137 clear_extent_bits(&log->dirty_log_pages, start, end, 2138 EXTENT_DIRTY | EXTENT_NEW, GFP_NOFS); 2139 } 2140 2141 free_extent_buffer(log->node); 2142 kfree(log); 2143 } 2144 2145 /* 2146 * free all the extents used by the tree log. This should be called 2147 * at commit time of the full transaction 2148 */ 2149 int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root) 2150 { 2151 if (root->log_root) { 2152 free_log_tree(trans, root->log_root); 2153 root->log_root = NULL; 2154 } 2155 return 0; 2156 } 2157 2158 int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans, 2159 struct btrfs_fs_info *fs_info) 2160 { 2161 if (fs_info->log_root_tree) { 2162 free_log_tree(trans, fs_info->log_root_tree); 2163 fs_info->log_root_tree = NULL; 2164 } 2165 return 0; 2166 } 2167 2168 /* 2169 * If both a file and directory are logged, and unlinks or renames are 2170 * mixed in, we have a few interesting corners: 2171 * 2172 * create file X in dir Y 2173 * link file X to X.link in dir Y 2174 * fsync file X 2175 * unlink file X but leave X.link 2176 * fsync dir Y 2177 * 2178 * After a crash we would expect only X.link to exist. But file X 2179 * didn't get fsync'd again so the log has back refs for X and X.link. 2180 * 2181 * We solve this by removing directory entries and inode backrefs from the 2182 * log when a file that was logged in the current transaction is 2183 * unlinked. Any later fsync will include the updated log entries, and 2184 * we'll be able to reconstruct the proper directory items from backrefs. 2185 * 2186 * This optimizations allows us to avoid relogging the entire inode 2187 * or the entire directory. 2188 */ 2189 int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, 2190 struct btrfs_root *root, 2191 const char *name, int name_len, 2192 struct inode *dir, u64 index) 2193 { 2194 struct btrfs_root *log; 2195 struct btrfs_dir_item *di; 2196 struct btrfs_path *path; 2197 int ret; 2198 int err = 0; 2199 int bytes_del = 0; 2200 2201 if (BTRFS_I(dir)->logged_trans < trans->transid) 2202 return 0; 2203 2204 ret = join_running_log_trans(root); 2205 if (ret) 2206 return 0; 2207 2208 mutex_lock(&BTRFS_I(dir)->log_mutex); 2209 2210 log = root->log_root; 2211 path = btrfs_alloc_path(); 2212 if (!path) { 2213 err = -ENOMEM; 2214 goto out_unlock; 2215 } 2216 2217 di = btrfs_lookup_dir_item(trans, log, path, dir->i_ino, 2218 name, name_len, -1); 2219 if (IS_ERR(di)) { 2220 err = PTR_ERR(di); 2221 goto fail; 2222 } 2223 if (di) { 2224 ret = btrfs_delete_one_dir_name(trans, log, path, di); 2225 bytes_del += name_len; 2226 BUG_ON(ret); 2227 } 2228 btrfs_release_path(log, path); 2229 di = btrfs_lookup_dir_index_item(trans, log, path, dir->i_ino, 2230 index, name, name_len, -1); 2231 if (IS_ERR(di)) { 2232 err = PTR_ERR(di); 2233 goto fail; 2234 } 2235 if (di) { 2236 ret = btrfs_delete_one_dir_name(trans, log, path, di); 2237 bytes_del += name_len; 2238 BUG_ON(ret); 2239 } 2240 2241 /* update the directory size in the log to reflect the names 2242 * we have removed 2243 */ 2244 if (bytes_del) { 2245 struct btrfs_key key; 2246 2247 key.objectid = dir->i_ino; 2248 key.offset = 0; 2249 key.type = BTRFS_INODE_ITEM_KEY; 2250 btrfs_release_path(log, path); 2251 2252 ret = btrfs_search_slot(trans, log, &key, path, 0, 1); 2253 if (ret < 0) { 2254 err = ret; 2255 goto fail; 2256 } 2257 if (ret == 0) { 2258 struct btrfs_inode_item *item; 2259 u64 i_size; 2260 2261 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 2262 struct btrfs_inode_item); 2263 i_size = btrfs_inode_size(path->nodes[0], item); 2264 if (i_size > bytes_del) 2265 i_size -= bytes_del; 2266 else 2267 i_size = 0; 2268 btrfs_set_inode_size(path->nodes[0], item, i_size); 2269 btrfs_mark_buffer_dirty(path->nodes[0]); 2270 } else 2271 ret = 0; 2272 btrfs_release_path(log, path); 2273 } 2274 fail: 2275 btrfs_free_path(path); 2276 out_unlock: 2277 mutex_unlock(&BTRFS_I(dir)->log_mutex); 2278 if (ret == -ENOSPC) { 2279 root->fs_info->last_trans_log_full_commit = trans->transid; 2280 ret = 0; 2281 } 2282 btrfs_end_log_trans(root); 2283 2284 return err; 2285 } 2286 2287 /* see comments for btrfs_del_dir_entries_in_log */ 2288 int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, 2289 struct btrfs_root *root, 2290 const char *name, int name_len, 2291 struct inode *inode, u64 dirid) 2292 { 2293 struct btrfs_root *log; 2294 u64 index; 2295 int ret; 2296 2297 if (BTRFS_I(inode)->logged_trans < trans->transid) 2298 return 0; 2299 2300 ret = join_running_log_trans(root); 2301 if (ret) 2302 return 0; 2303 log = root->log_root; 2304 mutex_lock(&BTRFS_I(inode)->log_mutex); 2305 2306 ret = btrfs_del_inode_ref(trans, log, name, name_len, inode->i_ino, 2307 dirid, &index); 2308 mutex_unlock(&BTRFS_I(inode)->log_mutex); 2309 if (ret == -ENOSPC) { 2310 root->fs_info->last_trans_log_full_commit = trans->transid; 2311 ret = 0; 2312 } 2313 btrfs_end_log_trans(root); 2314 2315 return ret; 2316 } 2317 2318 /* 2319 * creates a range item in the log for 'dirid'. first_offset and 2320 * last_offset tell us which parts of the key space the log should 2321 * be considered authoritative for. 2322 */ 2323 static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans, 2324 struct btrfs_root *log, 2325 struct btrfs_path *path, 2326 int key_type, u64 dirid, 2327 u64 first_offset, u64 last_offset) 2328 { 2329 int ret; 2330 struct btrfs_key key; 2331 struct btrfs_dir_log_item *item; 2332 2333 key.objectid = dirid; 2334 key.offset = first_offset; 2335 if (key_type == BTRFS_DIR_ITEM_KEY) 2336 key.type = BTRFS_DIR_LOG_ITEM_KEY; 2337 else 2338 key.type = BTRFS_DIR_LOG_INDEX_KEY; 2339 ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item)); 2340 if (ret) 2341 return ret; 2342 2343 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 2344 struct btrfs_dir_log_item); 2345 btrfs_set_dir_log_end(path->nodes[0], item, last_offset); 2346 btrfs_mark_buffer_dirty(path->nodes[0]); 2347 btrfs_release_path(log, path); 2348 return 0; 2349 } 2350 2351 /* 2352 * log all the items included in the current transaction for a given 2353 * directory. This also creates the range items in the log tree required 2354 * to replay anything deleted before the fsync 2355 */ 2356 static noinline int log_dir_items(struct btrfs_trans_handle *trans, 2357 struct btrfs_root *root, struct inode *inode, 2358 struct btrfs_path *path, 2359 struct btrfs_path *dst_path, int key_type, 2360 u64 min_offset, u64 *last_offset_ret) 2361 { 2362 struct btrfs_key min_key; 2363 struct btrfs_key max_key; 2364 struct btrfs_root *log = root->log_root; 2365 struct extent_buffer *src; 2366 int err = 0; 2367 int ret; 2368 int i; 2369 int nritems; 2370 u64 first_offset = min_offset; 2371 u64 last_offset = (u64)-1; 2372 2373 log = root->log_root; 2374 max_key.objectid = inode->i_ino; 2375 max_key.offset = (u64)-1; 2376 max_key.type = key_type; 2377 2378 min_key.objectid = inode->i_ino; 2379 min_key.type = key_type; 2380 min_key.offset = min_offset; 2381 2382 path->keep_locks = 1; 2383 2384 ret = btrfs_search_forward(root, &min_key, &max_key, 2385 path, 0, trans->transid); 2386 2387 /* 2388 * we didn't find anything from this transaction, see if there 2389 * is anything at all 2390 */ 2391 if (ret != 0 || min_key.objectid != inode->i_ino || 2392 min_key.type != key_type) { 2393 min_key.objectid = inode->i_ino; 2394 min_key.type = key_type; 2395 min_key.offset = (u64)-1; 2396 btrfs_release_path(root, path); 2397 ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); 2398 if (ret < 0) { 2399 btrfs_release_path(root, path); 2400 return ret; 2401 } 2402 ret = btrfs_previous_item(root, path, inode->i_ino, key_type); 2403 2404 /* if ret == 0 there are items for this type, 2405 * create a range to tell us the last key of this type. 2406 * otherwise, there are no items in this directory after 2407 * *min_offset, and we create a range to indicate that. 2408 */ 2409 if (ret == 0) { 2410 struct btrfs_key tmp; 2411 btrfs_item_key_to_cpu(path->nodes[0], &tmp, 2412 path->slots[0]); 2413 if (key_type == tmp.type) 2414 first_offset = max(min_offset, tmp.offset) + 1; 2415 } 2416 goto done; 2417 } 2418 2419 /* go backward to find any previous key */ 2420 ret = btrfs_previous_item(root, path, inode->i_ino, key_type); 2421 if (ret == 0) { 2422 struct btrfs_key tmp; 2423 btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]); 2424 if (key_type == tmp.type) { 2425 first_offset = tmp.offset; 2426 ret = overwrite_item(trans, log, dst_path, 2427 path->nodes[0], path->slots[0], 2428 &tmp); 2429 if (ret) { 2430 err = ret; 2431 goto done; 2432 } 2433 } 2434 } 2435 btrfs_release_path(root, path); 2436 2437 /* find the first key from this transaction again */ 2438 ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); 2439 if (ret != 0) { 2440 WARN_ON(1); 2441 goto done; 2442 } 2443 2444 /* 2445 * we have a block from this transaction, log every item in it 2446 * from our directory 2447 */ 2448 while (1) { 2449 struct btrfs_key tmp; 2450 src = path->nodes[0]; 2451 nritems = btrfs_header_nritems(src); 2452 for (i = path->slots[0]; i < nritems; i++) { 2453 btrfs_item_key_to_cpu(src, &min_key, i); 2454 2455 if (min_key.objectid != inode->i_ino || 2456 min_key.type != key_type) 2457 goto done; 2458 ret = overwrite_item(trans, log, dst_path, src, i, 2459 &min_key); 2460 if (ret) { 2461 err = ret; 2462 goto done; 2463 } 2464 } 2465 path->slots[0] = nritems; 2466 2467 /* 2468 * look ahead to the next item and see if it is also 2469 * from this directory and from this transaction 2470 */ 2471 ret = btrfs_next_leaf(root, path); 2472 if (ret == 1) { 2473 last_offset = (u64)-1; 2474 goto done; 2475 } 2476 btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]); 2477 if (tmp.objectid != inode->i_ino || tmp.type != key_type) { 2478 last_offset = (u64)-1; 2479 goto done; 2480 } 2481 if (btrfs_header_generation(path->nodes[0]) != trans->transid) { 2482 ret = overwrite_item(trans, log, dst_path, 2483 path->nodes[0], path->slots[0], 2484 &tmp); 2485 if (ret) 2486 err = ret; 2487 else 2488 last_offset = tmp.offset; 2489 goto done; 2490 } 2491 } 2492 done: 2493 btrfs_release_path(root, path); 2494 btrfs_release_path(log, dst_path); 2495 2496 if (err == 0) { 2497 *last_offset_ret = last_offset; 2498 /* 2499 * insert the log range keys to indicate where the log 2500 * is valid 2501 */ 2502 ret = insert_dir_log_key(trans, log, path, key_type, 2503 inode->i_ino, first_offset, 2504 last_offset); 2505 if (ret) 2506 err = ret; 2507 } 2508 return err; 2509 } 2510 2511 /* 2512 * logging directories is very similar to logging inodes, We find all the items 2513 * from the current transaction and write them to the log. 2514 * 2515 * The recovery code scans the directory in the subvolume, and if it finds a 2516 * key in the range logged that is not present in the log tree, then it means 2517 * that dir entry was unlinked during the transaction. 2518 * 2519 * In order for that scan to work, we must include one key smaller than 2520 * the smallest logged by this transaction and one key larger than the largest 2521 * key logged by this transaction. 2522 */ 2523 static noinline int log_directory_changes(struct btrfs_trans_handle *trans, 2524 struct btrfs_root *root, struct inode *inode, 2525 struct btrfs_path *path, 2526 struct btrfs_path *dst_path) 2527 { 2528 u64 min_key; 2529 u64 max_key; 2530 int ret; 2531 int key_type = BTRFS_DIR_ITEM_KEY; 2532 2533 again: 2534 min_key = 0; 2535 max_key = 0; 2536 while (1) { 2537 ret = log_dir_items(trans, root, inode, path, 2538 dst_path, key_type, min_key, 2539 &max_key); 2540 if (ret) 2541 return ret; 2542 if (max_key == (u64)-1) 2543 break; 2544 min_key = max_key + 1; 2545 } 2546 2547 if (key_type == BTRFS_DIR_ITEM_KEY) { 2548 key_type = BTRFS_DIR_INDEX_KEY; 2549 goto again; 2550 } 2551 return 0; 2552 } 2553 2554 /* 2555 * a helper function to drop items from the log before we relog an 2556 * inode. max_key_type indicates the highest item type to remove. 2557 * This cannot be run for file data extents because it does not 2558 * free the extents they point to. 2559 */ 2560 static int drop_objectid_items(struct btrfs_trans_handle *trans, 2561 struct btrfs_root *log, 2562 struct btrfs_path *path, 2563 u64 objectid, int max_key_type) 2564 { 2565 int ret; 2566 struct btrfs_key key; 2567 struct btrfs_key found_key; 2568 2569 key.objectid = objectid; 2570 key.type = max_key_type; 2571 key.offset = (u64)-1; 2572 2573 while (1) { 2574 ret = btrfs_search_slot(trans, log, &key, path, -1, 1); 2575 BUG_ON(ret == 0); 2576 if (ret < 0) 2577 break; 2578 2579 if (path->slots[0] == 0) 2580 break; 2581 2582 path->slots[0]--; 2583 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 2584 path->slots[0]); 2585 2586 if (found_key.objectid != objectid) 2587 break; 2588 2589 ret = btrfs_del_item(trans, log, path); 2590 BUG_ON(ret); 2591 btrfs_release_path(log, path); 2592 } 2593 btrfs_release_path(log, path); 2594 return ret; 2595 } 2596 2597 static noinline int copy_items(struct btrfs_trans_handle *trans, 2598 struct btrfs_root *log, 2599 struct btrfs_path *dst_path, 2600 struct extent_buffer *src, 2601 int start_slot, int nr, int inode_only) 2602 { 2603 unsigned long src_offset; 2604 unsigned long dst_offset; 2605 struct btrfs_file_extent_item *extent; 2606 struct btrfs_inode_item *inode_item; 2607 int ret; 2608 struct btrfs_key *ins_keys; 2609 u32 *ins_sizes; 2610 char *ins_data; 2611 int i; 2612 struct list_head ordered_sums; 2613 2614 INIT_LIST_HEAD(&ordered_sums); 2615 2616 ins_data = kmalloc(nr * sizeof(struct btrfs_key) + 2617 nr * sizeof(u32), GFP_NOFS); 2618 if (!ins_data) 2619 return -ENOMEM; 2620 2621 ins_sizes = (u32 *)ins_data; 2622 ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32)); 2623 2624 for (i = 0; i < nr; i++) { 2625 ins_sizes[i] = btrfs_item_size_nr(src, i + start_slot); 2626 btrfs_item_key_to_cpu(src, ins_keys + i, i + start_slot); 2627 } 2628 ret = btrfs_insert_empty_items(trans, log, dst_path, 2629 ins_keys, ins_sizes, nr); 2630 if (ret) { 2631 kfree(ins_data); 2632 return ret; 2633 } 2634 2635 for (i = 0; i < nr; i++, dst_path->slots[0]++) { 2636 dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0], 2637 dst_path->slots[0]); 2638 2639 src_offset = btrfs_item_ptr_offset(src, start_slot + i); 2640 2641 copy_extent_buffer(dst_path->nodes[0], src, dst_offset, 2642 src_offset, ins_sizes[i]); 2643 2644 if (inode_only == LOG_INODE_EXISTS && 2645 ins_keys[i].type == BTRFS_INODE_ITEM_KEY) { 2646 inode_item = btrfs_item_ptr(dst_path->nodes[0], 2647 dst_path->slots[0], 2648 struct btrfs_inode_item); 2649 btrfs_set_inode_size(dst_path->nodes[0], inode_item, 0); 2650 2651 /* set the generation to zero so the recover code 2652 * can tell the difference between an logging 2653 * just to say 'this inode exists' and a logging 2654 * to say 'update this inode with these values' 2655 */ 2656 btrfs_set_inode_generation(dst_path->nodes[0], 2657 inode_item, 0); 2658 } 2659 /* take a reference on file data extents so that truncates 2660 * or deletes of this inode don't have to relog the inode 2661 * again 2662 */ 2663 if (btrfs_key_type(ins_keys + i) == BTRFS_EXTENT_DATA_KEY) { 2664 int found_type; 2665 extent = btrfs_item_ptr(src, start_slot + i, 2666 struct btrfs_file_extent_item); 2667 2668 found_type = btrfs_file_extent_type(src, extent); 2669 if (found_type == BTRFS_FILE_EXTENT_REG || 2670 found_type == BTRFS_FILE_EXTENT_PREALLOC) { 2671 u64 ds, dl, cs, cl; 2672 ds = btrfs_file_extent_disk_bytenr(src, 2673 extent); 2674 /* ds == 0 is a hole */ 2675 if (ds == 0) 2676 continue; 2677 2678 dl = btrfs_file_extent_disk_num_bytes(src, 2679 extent); 2680 cs = btrfs_file_extent_offset(src, extent); 2681 cl = btrfs_file_extent_num_bytes(src, 2682 extent); 2683 if (btrfs_file_extent_compression(src, 2684 extent)) { 2685 cs = 0; 2686 cl = dl; 2687 } 2688 2689 ret = btrfs_lookup_csums_range( 2690 log->fs_info->csum_root, 2691 ds + cs, ds + cs + cl - 1, 2692 &ordered_sums); 2693 BUG_ON(ret); 2694 } 2695 } 2696 } 2697 2698 btrfs_mark_buffer_dirty(dst_path->nodes[0]); 2699 btrfs_release_path(log, dst_path); 2700 kfree(ins_data); 2701 2702 /* 2703 * we have to do this after the loop above to avoid changing the 2704 * log tree while trying to change the log tree. 2705 */ 2706 ret = 0; 2707 while (!list_empty(&ordered_sums)) { 2708 struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next, 2709 struct btrfs_ordered_sum, 2710 list); 2711 if (!ret) 2712 ret = btrfs_csum_file_blocks(trans, log, sums); 2713 list_del(&sums->list); 2714 kfree(sums); 2715 } 2716 return ret; 2717 } 2718 2719 /* log a single inode in the tree log. 2720 * At least one parent directory for this inode must exist in the tree 2721 * or be logged already. 2722 * 2723 * Any items from this inode changed by the current transaction are copied 2724 * to the log tree. An extra reference is taken on any extents in this 2725 * file, allowing us to avoid a whole pile of corner cases around logging 2726 * blocks that have been removed from the tree. 2727 * 2728 * See LOG_INODE_ALL and related defines for a description of what inode_only 2729 * does. 2730 * 2731 * This handles both files and directories. 2732 */ 2733 static int btrfs_log_inode(struct btrfs_trans_handle *trans, 2734 struct btrfs_root *root, struct inode *inode, 2735 int inode_only) 2736 { 2737 struct btrfs_path *path; 2738 struct btrfs_path *dst_path; 2739 struct btrfs_key min_key; 2740 struct btrfs_key max_key; 2741 struct btrfs_root *log = root->log_root; 2742 struct extent_buffer *src = NULL; 2743 int err = 0; 2744 int ret; 2745 int nritems; 2746 int ins_start_slot = 0; 2747 int ins_nr; 2748 2749 log = root->log_root; 2750 2751 path = btrfs_alloc_path(); 2752 if (!path) 2753 return -ENOMEM; 2754 dst_path = btrfs_alloc_path(); 2755 if (!dst_path) { 2756 btrfs_free_path(path); 2757 return -ENOMEM; 2758 } 2759 2760 min_key.objectid = inode->i_ino; 2761 min_key.type = BTRFS_INODE_ITEM_KEY; 2762 min_key.offset = 0; 2763 2764 max_key.objectid = inode->i_ino; 2765 2766 /* today the code can only do partial logging of directories */ 2767 if (!S_ISDIR(inode->i_mode)) 2768 inode_only = LOG_INODE_ALL; 2769 2770 if (inode_only == LOG_INODE_EXISTS || S_ISDIR(inode->i_mode)) 2771 max_key.type = BTRFS_XATTR_ITEM_KEY; 2772 else 2773 max_key.type = (u8)-1; 2774 max_key.offset = (u64)-1; 2775 2776 mutex_lock(&BTRFS_I(inode)->log_mutex); 2777 2778 /* 2779 * a brute force approach to making sure we get the most uptodate 2780 * copies of everything. 2781 */ 2782 if (S_ISDIR(inode->i_mode)) { 2783 int max_key_type = BTRFS_DIR_LOG_INDEX_KEY; 2784 2785 if (inode_only == LOG_INODE_EXISTS) 2786 max_key_type = BTRFS_XATTR_ITEM_KEY; 2787 ret = drop_objectid_items(trans, log, path, 2788 inode->i_ino, max_key_type); 2789 } else { 2790 ret = btrfs_truncate_inode_items(trans, log, inode, 0, 0); 2791 } 2792 if (ret) { 2793 err = ret; 2794 goto out_unlock; 2795 } 2796 path->keep_locks = 1; 2797 2798 while (1) { 2799 ins_nr = 0; 2800 ret = btrfs_search_forward(root, &min_key, &max_key, 2801 path, 0, trans->transid); 2802 if (ret != 0) 2803 break; 2804 again: 2805 /* note, ins_nr might be > 0 here, cleanup outside the loop */ 2806 if (min_key.objectid != inode->i_ino) 2807 break; 2808 if (min_key.type > max_key.type) 2809 break; 2810 2811 src = path->nodes[0]; 2812 if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) { 2813 ins_nr++; 2814 goto next_slot; 2815 } else if (!ins_nr) { 2816 ins_start_slot = path->slots[0]; 2817 ins_nr = 1; 2818 goto next_slot; 2819 } 2820 2821 ret = copy_items(trans, log, dst_path, src, ins_start_slot, 2822 ins_nr, inode_only); 2823 if (ret) { 2824 err = ret; 2825 goto out_unlock; 2826 } 2827 ins_nr = 1; 2828 ins_start_slot = path->slots[0]; 2829 next_slot: 2830 2831 nritems = btrfs_header_nritems(path->nodes[0]); 2832 path->slots[0]++; 2833 if (path->slots[0] < nritems) { 2834 btrfs_item_key_to_cpu(path->nodes[0], &min_key, 2835 path->slots[0]); 2836 goto again; 2837 } 2838 if (ins_nr) { 2839 ret = copy_items(trans, log, dst_path, src, 2840 ins_start_slot, 2841 ins_nr, inode_only); 2842 if (ret) { 2843 err = ret; 2844 goto out_unlock; 2845 } 2846 ins_nr = 0; 2847 } 2848 btrfs_release_path(root, path); 2849 2850 if (min_key.offset < (u64)-1) 2851 min_key.offset++; 2852 else if (min_key.type < (u8)-1) 2853 min_key.type++; 2854 else if (min_key.objectid < (u64)-1) 2855 min_key.objectid++; 2856 else 2857 break; 2858 } 2859 if (ins_nr) { 2860 ret = copy_items(trans, log, dst_path, src, 2861 ins_start_slot, 2862 ins_nr, inode_only); 2863 if (ret) { 2864 err = ret; 2865 goto out_unlock; 2866 } 2867 ins_nr = 0; 2868 } 2869 WARN_ON(ins_nr); 2870 if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) { 2871 btrfs_release_path(root, path); 2872 btrfs_release_path(log, dst_path); 2873 ret = log_directory_changes(trans, root, inode, path, dst_path); 2874 if (ret) { 2875 err = ret; 2876 goto out_unlock; 2877 } 2878 } 2879 BTRFS_I(inode)->logged_trans = trans->transid; 2880 out_unlock: 2881 mutex_unlock(&BTRFS_I(inode)->log_mutex); 2882 2883 btrfs_free_path(path); 2884 btrfs_free_path(dst_path); 2885 return err; 2886 } 2887 2888 /* 2889 * follow the dentry parent pointers up the chain and see if any 2890 * of the directories in it require a full commit before they can 2891 * be logged. Returns zero if nothing special needs to be done or 1 if 2892 * a full commit is required. 2893 */ 2894 static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans, 2895 struct inode *inode, 2896 struct dentry *parent, 2897 struct super_block *sb, 2898 u64 last_committed) 2899 { 2900 int ret = 0; 2901 struct btrfs_root *root; 2902 struct dentry *old_parent = NULL; 2903 2904 /* 2905 * for regular files, if its inode is already on disk, we don't 2906 * have to worry about the parents at all. This is because 2907 * we can use the last_unlink_trans field to record renames 2908 * and other fun in this file. 2909 */ 2910 if (S_ISREG(inode->i_mode) && 2911 BTRFS_I(inode)->generation <= last_committed && 2912 BTRFS_I(inode)->last_unlink_trans <= last_committed) 2913 goto out; 2914 2915 if (!S_ISDIR(inode->i_mode)) { 2916 if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb) 2917 goto out; 2918 inode = parent->d_inode; 2919 } 2920 2921 while (1) { 2922 BTRFS_I(inode)->logged_trans = trans->transid; 2923 smp_mb(); 2924 2925 if (BTRFS_I(inode)->last_unlink_trans > last_committed) { 2926 root = BTRFS_I(inode)->root; 2927 2928 /* 2929 * make sure any commits to the log are forced 2930 * to be full commits 2931 */ 2932 root->fs_info->last_trans_log_full_commit = 2933 trans->transid; 2934 ret = 1; 2935 break; 2936 } 2937 2938 if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb) 2939 break; 2940 2941 if (IS_ROOT(parent)) 2942 break; 2943 2944 parent = dget_parent(parent); 2945 dput(old_parent); 2946 old_parent = parent; 2947 inode = parent->d_inode; 2948 2949 } 2950 dput(old_parent); 2951 out: 2952 return ret; 2953 } 2954 2955 static int inode_in_log(struct btrfs_trans_handle *trans, 2956 struct inode *inode) 2957 { 2958 struct btrfs_root *root = BTRFS_I(inode)->root; 2959 int ret = 0; 2960 2961 mutex_lock(&root->log_mutex); 2962 if (BTRFS_I(inode)->logged_trans == trans->transid && 2963 BTRFS_I(inode)->last_sub_trans <= root->last_log_commit) 2964 ret = 1; 2965 mutex_unlock(&root->log_mutex); 2966 return ret; 2967 } 2968 2969 2970 /* 2971 * helper function around btrfs_log_inode to make sure newly created 2972 * parent directories also end up in the log. A minimal inode and backref 2973 * only logging is done of any parent directories that are older than 2974 * the last committed transaction 2975 */ 2976 int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, 2977 struct btrfs_root *root, struct inode *inode, 2978 struct dentry *parent, int exists_only) 2979 { 2980 int inode_only = exists_only ? LOG_INODE_EXISTS : LOG_INODE_ALL; 2981 struct super_block *sb; 2982 struct dentry *old_parent = NULL; 2983 int ret = 0; 2984 u64 last_committed = root->fs_info->last_trans_committed; 2985 2986 sb = inode->i_sb; 2987 2988 if (btrfs_test_opt(root, NOTREELOG)) { 2989 ret = 1; 2990 goto end_no_trans; 2991 } 2992 2993 if (root->fs_info->last_trans_log_full_commit > 2994 root->fs_info->last_trans_committed) { 2995 ret = 1; 2996 goto end_no_trans; 2997 } 2998 2999 if (root != BTRFS_I(inode)->root || 3000 btrfs_root_refs(&root->root_item) == 0) { 3001 ret = 1; 3002 goto end_no_trans; 3003 } 3004 3005 ret = check_parent_dirs_for_sync(trans, inode, parent, 3006 sb, last_committed); 3007 if (ret) 3008 goto end_no_trans; 3009 3010 if (inode_in_log(trans, inode)) { 3011 ret = BTRFS_NO_LOG_SYNC; 3012 goto end_no_trans; 3013 } 3014 3015 ret = start_log_trans(trans, root); 3016 if (ret) 3017 goto end_trans; 3018 3019 ret = btrfs_log_inode(trans, root, inode, inode_only); 3020 if (ret) 3021 goto end_trans; 3022 3023 /* 3024 * for regular files, if its inode is already on disk, we don't 3025 * have to worry about the parents at all. This is because 3026 * we can use the last_unlink_trans field to record renames 3027 * and other fun in this file. 3028 */ 3029 if (S_ISREG(inode->i_mode) && 3030 BTRFS_I(inode)->generation <= last_committed && 3031 BTRFS_I(inode)->last_unlink_trans <= last_committed) { 3032 ret = 0; 3033 goto end_trans; 3034 } 3035 3036 inode_only = LOG_INODE_EXISTS; 3037 while (1) { 3038 if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb) 3039 break; 3040 3041 inode = parent->d_inode; 3042 if (root != BTRFS_I(inode)->root) 3043 break; 3044 3045 if (BTRFS_I(inode)->generation > 3046 root->fs_info->last_trans_committed) { 3047 ret = btrfs_log_inode(trans, root, inode, inode_only); 3048 if (ret) 3049 goto end_trans; 3050 } 3051 if (IS_ROOT(parent)) 3052 break; 3053 3054 parent = dget_parent(parent); 3055 dput(old_parent); 3056 old_parent = parent; 3057 } 3058 ret = 0; 3059 end_trans: 3060 dput(old_parent); 3061 if (ret < 0) { 3062 BUG_ON(ret != -ENOSPC); 3063 root->fs_info->last_trans_log_full_commit = trans->transid; 3064 ret = 1; 3065 } 3066 btrfs_end_log_trans(root); 3067 end_no_trans: 3068 return ret; 3069 } 3070 3071 /* 3072 * it is not safe to log dentry if the chunk root has added new 3073 * chunks. This returns 0 if the dentry was logged, and 1 otherwise. 3074 * If this returns 1, you must commit the transaction to safely get your 3075 * data on disk. 3076 */ 3077 int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, 3078 struct btrfs_root *root, struct dentry *dentry) 3079 { 3080 struct dentry *parent = dget_parent(dentry); 3081 int ret; 3082 3083 ret = btrfs_log_inode_parent(trans, root, dentry->d_inode, parent, 0); 3084 dput(parent); 3085 3086 return ret; 3087 } 3088 3089 /* 3090 * should be called during mount to recover any replay any log trees 3091 * from the FS 3092 */ 3093 int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) 3094 { 3095 int ret; 3096 struct btrfs_path *path; 3097 struct btrfs_trans_handle *trans; 3098 struct btrfs_key key; 3099 struct btrfs_key found_key; 3100 struct btrfs_key tmp_key; 3101 struct btrfs_root *log; 3102 struct btrfs_fs_info *fs_info = log_root_tree->fs_info; 3103 struct walk_control wc = { 3104 .process_func = process_one_buffer, 3105 .stage = 0, 3106 }; 3107 3108 path = btrfs_alloc_path(); 3109 if (!path) 3110 return -ENOMEM; 3111 3112 fs_info->log_root_recovering = 1; 3113 3114 trans = btrfs_start_transaction(fs_info->tree_root, 0); 3115 BUG_ON(IS_ERR(trans)); 3116 3117 wc.trans = trans; 3118 wc.pin = 1; 3119 3120 ret = walk_log_tree(trans, log_root_tree, &wc); 3121 BUG_ON(ret); 3122 3123 again: 3124 key.objectid = BTRFS_TREE_LOG_OBJECTID; 3125 key.offset = (u64)-1; 3126 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); 3127 3128 while (1) { 3129 ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0); 3130 if (ret < 0) 3131 break; 3132 if (ret > 0) { 3133 if (path->slots[0] == 0) 3134 break; 3135 path->slots[0]--; 3136 } 3137 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 3138 path->slots[0]); 3139 btrfs_release_path(log_root_tree, path); 3140 if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID) 3141 break; 3142 3143 log = btrfs_read_fs_root_no_radix(log_root_tree, 3144 &found_key); 3145 BUG_ON(IS_ERR(log)); 3146 3147 tmp_key.objectid = found_key.offset; 3148 tmp_key.type = BTRFS_ROOT_ITEM_KEY; 3149 tmp_key.offset = (u64)-1; 3150 3151 wc.replay_dest = btrfs_read_fs_root_no_name(fs_info, &tmp_key); 3152 BUG_ON(!wc.replay_dest); 3153 3154 wc.replay_dest->log_root = log; 3155 btrfs_record_root_in_trans(trans, wc.replay_dest); 3156 ret = walk_log_tree(trans, log, &wc); 3157 BUG_ON(ret); 3158 3159 if (wc.stage == LOG_WALK_REPLAY_ALL) { 3160 ret = fixup_inode_link_counts(trans, wc.replay_dest, 3161 path); 3162 BUG_ON(ret); 3163 } 3164 3165 key.offset = found_key.offset - 1; 3166 wc.replay_dest->log_root = NULL; 3167 free_extent_buffer(log->node); 3168 free_extent_buffer(log->commit_root); 3169 kfree(log); 3170 3171 if (found_key.offset == 0) 3172 break; 3173 } 3174 btrfs_release_path(log_root_tree, path); 3175 3176 /* step one is to pin it all, step two is to replay just inodes */ 3177 if (wc.pin) { 3178 wc.pin = 0; 3179 wc.process_func = replay_one_buffer; 3180 wc.stage = LOG_WALK_REPLAY_INODES; 3181 goto again; 3182 } 3183 /* step three is to replay everything */ 3184 if (wc.stage < LOG_WALK_REPLAY_ALL) { 3185 wc.stage++; 3186 goto again; 3187 } 3188 3189 btrfs_free_path(path); 3190 3191 free_extent_buffer(log_root_tree->node); 3192 log_root_tree->log_root = NULL; 3193 fs_info->log_root_recovering = 0; 3194 3195 /* step 4: commit the transaction, which also unpins the blocks */ 3196 btrfs_commit_transaction(trans, fs_info->tree_root); 3197 3198 kfree(log_root_tree); 3199 return 0; 3200 } 3201 3202 /* 3203 * there are some corner cases where we want to force a full 3204 * commit instead of allowing a directory to be logged. 3205 * 3206 * They revolve around files there were unlinked from the directory, and 3207 * this function updates the parent directory so that a full commit is 3208 * properly done if it is fsync'd later after the unlinks are done. 3209 */ 3210 void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans, 3211 struct inode *dir, struct inode *inode, 3212 int for_rename) 3213 { 3214 /* 3215 * when we're logging a file, if it hasn't been renamed 3216 * or unlinked, and its inode is fully committed on disk, 3217 * we don't have to worry about walking up the directory chain 3218 * to log its parents. 3219 * 3220 * So, we use the last_unlink_trans field to put this transid 3221 * into the file. When the file is logged we check it and 3222 * don't log the parents if the file is fully on disk. 3223 */ 3224 if (S_ISREG(inode->i_mode)) 3225 BTRFS_I(inode)->last_unlink_trans = trans->transid; 3226 3227 /* 3228 * if this directory was already logged any new 3229 * names for this file/dir will get recorded 3230 */ 3231 smp_mb(); 3232 if (BTRFS_I(dir)->logged_trans == trans->transid) 3233 return; 3234 3235 /* 3236 * if the inode we're about to unlink was logged, 3237 * the log will be properly updated for any new names 3238 */ 3239 if (BTRFS_I(inode)->logged_trans == trans->transid) 3240 return; 3241 3242 /* 3243 * when renaming files across directories, if the directory 3244 * there we're unlinking from gets fsync'd later on, there's 3245 * no way to find the destination directory later and fsync it 3246 * properly. So, we have to be conservative and force commits 3247 * so the new name gets discovered. 3248 */ 3249 if (for_rename) 3250 goto record; 3251 3252 /* we can safely do the unlink without any special recording */ 3253 return; 3254 3255 record: 3256 BTRFS_I(dir)->last_unlink_trans = trans->transid; 3257 } 3258 3259 /* 3260 * Call this after adding a new name for a file and it will properly 3261 * update the log to reflect the new name. 3262 * 3263 * It will return zero if all goes well, and it will return 1 if a 3264 * full transaction commit is required. 3265 */ 3266 int btrfs_log_new_name(struct btrfs_trans_handle *trans, 3267 struct inode *inode, struct inode *old_dir, 3268 struct dentry *parent) 3269 { 3270 struct btrfs_root * root = BTRFS_I(inode)->root; 3271 3272 /* 3273 * this will force the logging code to walk the dentry chain 3274 * up for the file 3275 */ 3276 if (S_ISREG(inode->i_mode)) 3277 BTRFS_I(inode)->last_unlink_trans = trans->transid; 3278 3279 /* 3280 * if this inode hasn't been logged and directory we're renaming it 3281 * from hasn't been logged, we don't need to log it 3282 */ 3283 if (BTRFS_I(inode)->logged_trans <= 3284 root->fs_info->last_trans_committed && 3285 (!old_dir || BTRFS_I(old_dir)->logged_trans <= 3286 root->fs_info->last_trans_committed)) 3287 return 0; 3288 3289 return btrfs_log_inode_parent(trans, root, inode, parent, 1); 3290 } 3291 3292