/*
 * Copyright (C) 2008 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */

#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/list_sort.h>
#include "tree-log.h"
#include "disk-io.h"
#include "locking.h"
#include "print-tree.h"
#include "backref.h"
#include "hash.h"

/* magic values for the inode_only field in btrfs_log_inode:
 *
 * LOG_INODE_ALL means to log everything
 * LOG_INODE_EXISTS means to log just enough to recreate the inode
 * during log replay
 */
#define LOG_INODE_ALL 0
#define LOG_INODE_EXISTS 1

/*
 * directory trouble cases
 *
 * 1) on rename or unlink, if the inode being unlinked isn't in the fsync
 * log, we must force a full commit before doing an fsync of the directory
 * where the unlink was done.
 * ---> record transid of last unlink/rename per directory
 *
 * mkdir foo/some_dir
 * normal commit
 * rename foo/some_dir foo2/some_dir
 * mkdir foo/some_dir
 * fsync foo/some_dir/some_file
 *
 * The fsync above will unlink the original some_dir without recording
 * it in its new location (foo2).  After a crash, some_dir will be gone
 * unless the fsync of some_file forces a full commit
 *
 * 2) we must log any new names for any file or dir that is in the fsync
 * log. ---> check inode while renaming/linking.
 *
 * 2a) we must log any new names for any file or dir during rename
 * when the directory they are being removed from was logged.
 * ---> check inode and old parent dir during rename
 *
 * 2a is actually the more important variant.  With the extra logging
 * a crash might unlink the old name without recreating the new one
 *
 * 3) after a crash, we must go through any directories with a link count
 * of zero and redo the rm -rf
 *
 * mkdir f1/foo
 * normal commit
 * rm -rf f1/foo
 * fsync(f1)
 *
 * The directory f1 was fully removed from the FS, but fsync was never
 * called on f1, only its parent dir.  After a crash the rm -rf must
 * be replayed.  This must be able to recurse down the entire
 * directory tree.  The inode link count fixup code takes care of the
 * ugly details.
 */
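
/*
 * Illustrative sketch (not in the original source) of how case 1 above
 * is handled: unlink/rename stamp the directory with the current
 * transid, and a later fsync in the same transaction sees the stamp and
 * falls back to a full transaction commit, roughly:
 *
 *	BTRFS_I(dir)->last_unlink_trans = trans->transid;  (unlink/rename)
 *	...
 *	if (BTRFS_I(dir)->last_unlink_trans >= last_committed)
 *		return 1;  (fsync path: force a full commit)
 */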

/*
 * stages for the tree walking.  The first
 * stage (0) is to only pin down the blocks we find;
 * the second stage (1) is to make sure that all the inodes
 * we find in the log are created in the subvolume.
 *
 * The last stage is to deal with directories and links and extents
 * and all the other fun semantics
 */
#define LOG_WALK_PIN_ONLY 0
#define LOG_WALK_REPLAY_INODES 1
#define LOG_WALK_REPLAY_DIR_INDEX 2
#define LOG_WALK_REPLAY_ALL 3

static int btrfs_log_inode(struct btrfs_trans_handle *trans,
			   struct btrfs_root *root, struct inode *inode,
			   int inode_only,
			   const loff_t start,
			   const loff_t end,
			   struct btrfs_log_ctx *ctx);
static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root,
			     struct btrfs_path *path, u64 objectid);
static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
				       struct btrfs_root *root,
				       struct btrfs_root *log,
				       struct btrfs_path *path,
				       u64 dirid, int del_all);

/*
 * tree logging is a special write ahead log used to make sure that
 * fsyncs and O_SYNCs can happen without doing full tree commits.
 *
 * Full tree commits are expensive because they require commonly
 * modified blocks to be recowed, creating many dirty pages in the
 * extent tree and a 4x-6x higher write load than ext3.
 *
 * Instead of doing a tree commit on every fsync, we use the
 * key ranges and transaction ids to find items for a given file or directory
 * that have changed in this transaction.  Those items are copied into
 * a special tree (one per subvolume root), that tree is written to disk
 * and then the fsync is considered complete.
 *
 * After a crash, items are copied out of the log-tree back into the
 * subvolume tree.  Any file data extents found are recorded in the extent
 * allocation tree, and the log-tree is freed.
 *
 * The log tree is read three times: once to pin down all the extents it
 * is using in ram, once to create all the inodes logged in the tree,
 * and once to do all the other items.
 */
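
/*
 * Illustrative sketch (not in the original source): log replay at mount
 * time drives the stages above with one walk per stage, conceptually:
 *
 *	wc.stage = LOG_WALK_PIN_ONLY;
 *	walk_log_tree(trans, log, &wc);          (pin blocks, stage 0)
 *	while (wc.stage < LOG_WALK_REPLAY_ALL) {
 *		wc.stage++;
 *		walk_log_tree(trans, log, &wc);  (replay passes)
 *	}
 *
 * walk_log_tree() and the replay process_func live later in this file.
 */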

/*
 * start a sub transaction and setup the log tree
 * this increments the log tree writer count to make the people
 * syncing the tree wait for us to finish
 */
static int start_log_trans(struct btrfs_trans_handle *trans,
			   struct btrfs_root *root,
			   struct btrfs_log_ctx *ctx)
{
	int index;
	int ret;

	mutex_lock(&root->log_mutex);
	if (root->log_root) {
		if (btrfs_need_log_full_commit(root->fs_info, trans)) {
			ret = -EAGAIN;
			goto out;
		}
		if (!root->log_start_pid) {
			root->log_start_pid = current->pid;
			clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
		} else if (root->log_start_pid != current->pid) {
			set_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
		}

		atomic_inc(&root->log_batch);
		atomic_inc(&root->log_writers);
		if (ctx) {
			index = root->log_transid % 2;
			list_add_tail(&ctx->list, &root->log_ctxs[index]);
			ctx->log_transid = root->log_transid;
		}
		mutex_unlock(&root->log_mutex);
		return 0;
	}

	ret = 0;
	mutex_lock(&root->fs_info->tree_log_mutex);
	if (!root->fs_info->log_root_tree)
		ret = btrfs_init_log_root_tree(trans, root->fs_info);
	mutex_unlock(&root->fs_info->tree_log_mutex);
	if (ret)
		goto out;

	if (!root->log_root) {
		ret = btrfs_add_log_tree(trans, root);
		if (ret)
			goto out;
	}
	clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
	root->log_start_pid = current->pid;
	atomic_inc(&root->log_batch);
	atomic_inc(&root->log_writers);
	if (ctx) {
		index = root->log_transid % 2;
		list_add_tail(&ctx->list, &root->log_ctxs[index]);
		ctx->log_transid = root->log_transid;
	}
out:
	mutex_unlock(&root->log_mutex);
	return ret;
}

/*
 * returns 0 if there was a log transaction running and we were able
 * to join, or returns -ENOENT if there was no transaction
 * in progress
 */
static int join_running_log_trans(struct btrfs_root *root)
{
	int ret = -ENOENT;

	smp_mb();
	if (!root->log_root)
		return -ENOENT;

	mutex_lock(&root->log_mutex);
	if (root->log_root) {
		ret = 0;
		atomic_inc(&root->log_writers);
	}
	mutex_unlock(&root->log_mutex);
	return ret;
}

/*
 * This either makes the current running log transaction wait
 * until you call btrfs_end_log_trans() or it makes any future
 * log transactions wait until you call btrfs_end_log_trans()
 */
int btrfs_pin_log_trans(struct btrfs_root *root)
{
	int ret = -ENOENT;

	mutex_lock(&root->log_mutex);
	atomic_inc(&root->log_writers);
	mutex_unlock(&root->log_mutex);
	return ret;
}

/*
 * indicate we're done making changes to the log tree
 * and wake up anyone waiting to do a sync
 */
void btrfs_end_log_trans(struct btrfs_root *root)
{
	if (atomic_dec_and_test(&root->log_writers)) {
		smp_mb();
		if (waitqueue_active(&root->log_writer_wait))
			wake_up(&root->log_writer_wait);
	}
}
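
/*
 * Illustrative usage (not in the original source): a caller that must
 * keep the log from being committed while it changes directory state,
 * for example the unlink/rename paths, brackets the work with the
 * pin/end pair:
 *
 *	btrfs_pin_log_trans(root);
 *	... modify state the log depends on ...
 *	btrfs_end_log_trans(root);
 */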

/*
 * the walk control struct is used to pass state down the chain when
 * processing the log tree.  The stage field tells us which part
 * of the log tree processing we are currently doing.  The others
 * are state fields used for that specific part
 */
struct walk_control {
	/* should we free the extent on disk when done?  This is used
	 * at transaction commit time while freeing a log tree
	 */
	int free;

	/* should we write out the extent buffer?  This is used
	 * while flushing the log tree to disk during a sync
	 */
	int write;

	/* should we wait for the extent buffer io to finish?  Also used
	 * while flushing the log tree to disk for a sync
	 */
	int wait;

	/* pin only walk, we record which extents on disk belong to the
	 * log trees
	 */
	int pin;

	/* what stage of the replay code we're currently in */
	int stage;

	/* the root we are currently replaying */
	struct btrfs_root *replay_dest;

	/* the trans handle for the current replay */
	struct btrfs_trans_handle *trans;

	/* the function that gets used to process blocks we find in the
	 * tree.  Note the extent_buffer might not be up to date when it is
	 * passed in, and it must be checked or read if you need the data
	 * inside it
	 */
	int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb,
			    struct walk_control *wc, u64 gen);
};

/*
 * process_func used to pin down extents, write them or wait on them
 */
static int process_one_buffer(struct btrfs_root *log,
			      struct extent_buffer *eb,
			      struct walk_control *wc, u64 gen)
{
	int ret = 0;

	/*
	 * If this fs is mixed then we need to be able to process the leaves to
	 * pin down any logged extents, so we have to read the block.
	 */
	if (btrfs_fs_incompat(log->fs_info, MIXED_GROUPS)) {
		ret = btrfs_read_buffer(eb, gen);
		if (ret)
			return ret;
	}

	if (wc->pin)
		ret = btrfs_pin_extent_for_log_replay(log->fs_info->extent_root,
						      eb->start, eb->len);

	if (!ret && btrfs_buffer_uptodate(eb, gen, 0)) {
		if (wc->pin && btrfs_header_level(eb) == 0)
			ret = btrfs_exclude_logged_extents(log, eb);
		if (wc->write)
			btrfs_write_tree_block(eb);
		if (wc->wait)
			btrfs_wait_tree_block_writeback(eb);
	}
	return ret;
}
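
/*
 * Illustrative sketch (not in the original source): the pin-only pass
 * fills in a walk_control along these lines,
 *
 *	struct walk_control wc = {
 *		.pin = 1,
 *		.process_func = process_one_buffer,
 *		.stage = LOG_WALK_PIN_ONLY,
 *	};
 *
 * while a log sync sets .write and .wait instead, so the same walker
 * flushes the dirty log blocks and waits for their IO.
 */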

/*
 * Item overwrite used by replay and tree logging.  eb, slot and key all refer
 * to the src data we are copying out.
 *
 * root is the tree we are copying into, and path is a scratch
 * path for use in this function (it should be released on entry and
 * will be released on exit).
 *
 * If the key is already in the destination tree the existing item is
 * overwritten.  If the existing item isn't big enough, it is extended.
 * If it is too large, it is truncated.
 *
 * If the key isn't in the destination yet, a new item is inserted.
 */
static noinline int overwrite_item(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root,
				   struct btrfs_path *path,
				   struct extent_buffer *eb, int slot,
				   struct btrfs_key *key)
{
	int ret;
	u32 item_size;
	u64 saved_i_size = 0;
	int save_old_i_size = 0;
	unsigned long src_ptr;
	unsigned long dst_ptr;
	int overwrite_root = 0;
	bool inode_item = key->type == BTRFS_INODE_ITEM_KEY;

	if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
		overwrite_root = 1;

	item_size = btrfs_item_size_nr(eb, slot);
	src_ptr = btrfs_item_ptr_offset(eb, slot);

	/* look for the key in the destination tree */
	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
	if (ret < 0)
		return ret;

	if (ret == 0) {
		char *src_copy;
		char *dst_copy;
		u32 dst_size = btrfs_item_size_nr(path->nodes[0],
						  path->slots[0]);
		if (dst_size != item_size)
			goto insert;

		if (item_size == 0) {
			btrfs_release_path(path);
			return 0;
		}
		dst_copy = kmalloc(item_size, GFP_NOFS);
		src_copy = kmalloc(item_size, GFP_NOFS);
		if (!dst_copy || !src_copy) {
			btrfs_release_path(path);
			kfree(dst_copy);
			kfree(src_copy);
			return -ENOMEM;
		}

		read_extent_buffer(eb, src_copy, src_ptr, item_size);

		dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
		read_extent_buffer(path->nodes[0], dst_copy, dst_ptr,
				   item_size);
		ret = memcmp(dst_copy, src_copy, item_size);

		kfree(dst_copy);
		kfree(src_copy);
		/*
		 * they have the same contents, just return, this saves
		 * us from cowing blocks in the destination tree and doing
		 * extra writes that may not have been done by a previous
		 * sync
		 */
		if (ret == 0) {
			btrfs_release_path(path);
			return 0;
		}

		/*
		 * We need to load the old nbytes into the inode so when we
		 * replay the extents we've logged we get the right nbytes.
		 */
		if (inode_item) {
			struct btrfs_inode_item *item;
			u64 nbytes;
			u32 mode;

			item = btrfs_item_ptr(path->nodes[0], path->slots[0],
					      struct btrfs_inode_item);
			nbytes = btrfs_inode_nbytes(path->nodes[0], item);
			item = btrfs_item_ptr(eb, slot,
					      struct btrfs_inode_item);
			btrfs_set_inode_nbytes(eb, item, nbytes);

			/*
			 * If this is a directory we need to reset the i_size to
			 * 0 so that we can set it up properly when replaying
			 * the rest of the items in this log.
			 */
			mode = btrfs_inode_mode(eb, item);
			if (S_ISDIR(mode))
				btrfs_set_inode_size(eb, item, 0);
		}
	} else if (inode_item) {
		struct btrfs_inode_item *item;
		u32 mode;

		/*
		 * New inode, set nbytes to 0 so that the nbytes comes out
		 * properly when we replay the extents.
		 */
		item = btrfs_item_ptr(eb, slot, struct btrfs_inode_item);
		btrfs_set_inode_nbytes(eb, item, 0);

		/*
		 * If this is a directory we need to reset the i_size to 0 so
		 * that we can set it up properly when replaying the rest of
		 * the items in this log.
		 */
		mode = btrfs_inode_mode(eb, item);
		if (S_ISDIR(mode))
			btrfs_set_inode_size(eb, item, 0);
	}
insert:
	btrfs_release_path(path);
	/* try to insert the key into the destination tree */
	path->skip_release_on_error = 1;
	ret = btrfs_insert_empty_item(trans, root, path,
				      key, item_size);
	path->skip_release_on_error = 0;

	/* make sure any existing item is the correct size */
	if (ret == -EEXIST || ret == -EOVERFLOW) {
		u32 found_size;
		found_size = btrfs_item_size_nr(path->nodes[0],
						path->slots[0]);
		if (found_size > item_size)
			btrfs_truncate_item(root, path, item_size, 1);
		else if (found_size < item_size)
			btrfs_extend_item(root, path,
					  item_size - found_size);
	} else if (ret) {
		return ret;
	}
	dst_ptr = btrfs_item_ptr_offset(path->nodes[0],
					path->slots[0]);

	/* don't overwrite an existing inode if the generation number
	 * was logged as zero.  This is done when the tree logging code
	 * is just logging an inode to make sure it exists after recovery.
	 *
	 * Also, don't overwrite i_size on directories during replay.
	 * log replay inserts and removes directory items based on the
	 * state of the tree found in the subvolume, and i_size is modified
	 * as it goes
	 */
	if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) {
		struct btrfs_inode_item *src_item;
		struct btrfs_inode_item *dst_item;

		src_item = (struct btrfs_inode_item *)src_ptr;
		dst_item = (struct btrfs_inode_item *)dst_ptr;

		if (btrfs_inode_generation(eb, src_item) == 0) {
			struct extent_buffer *dst_eb = path->nodes[0];
			const u64 ino_size = btrfs_inode_size(eb, src_item);

			/*
			 * For regular files an ino_size == 0 is used only when
			 * logging that an inode exists, as part of a directory
			 * fsync, and the inode wasn't fsynced before.  In this
			 * case don't set the size of the inode in the fs/subvol
			 * tree, otherwise we would be throwing valid data away.
			 */
			if (S_ISREG(btrfs_inode_mode(eb, src_item)) &&
			    S_ISREG(btrfs_inode_mode(dst_eb, dst_item)) &&
			    ino_size != 0) {
				struct btrfs_map_token token;

				btrfs_init_map_token(&token);
				btrfs_set_token_inode_size(dst_eb, dst_item,
							   ino_size, &token);
			}
			goto no_copy;
		}

		if (overwrite_root &&
		    S_ISDIR(btrfs_inode_mode(eb, src_item)) &&
		    S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) {
			save_old_i_size = 1;
			saved_i_size = btrfs_inode_size(path->nodes[0],
							dst_item);
		}
	}

	copy_extent_buffer(path->nodes[0], eb, dst_ptr,
			   src_ptr, item_size);

	if (save_old_i_size) {
		struct btrfs_inode_item *dst_item;
		dst_item = (struct btrfs_inode_item *)dst_ptr;
		btrfs_set_inode_size(path->nodes[0], dst_item, saved_i_size);
	}

	/* make sure the generation is filled in */
	if (key->type == BTRFS_INODE_ITEM_KEY) {
		struct btrfs_inode_item *dst_item;
		dst_item = (struct btrfs_inode_item *)dst_ptr;
		if (btrfs_inode_generation(path->nodes[0], dst_item) == 0) {
			btrfs_set_inode_generation(path->nodes[0], dst_item,
						   trans->transid);
		}
	}
no_copy:
	btrfs_mark_buffer_dirty(path->nodes[0]);
	btrfs_release_path(path);
	return 0;
}

/*
 * simple helper to read an inode off the disk from a given root
 * This can only be called for subvolume roots and not for the log
 */
static noinline struct inode *read_one_inode(struct btrfs_root *root,
					     u64 objectid)
{
	struct btrfs_key key;
	struct inode *inode;

	key.objectid = objectid;
	key.type = BTRFS_INODE_ITEM_KEY;
	key.offset = 0;
	inode = btrfs_iget(root->fs_info->sb, &key, root, NULL);
	if (IS_ERR(inode)) {
		inode = NULL;
	} else if (is_bad_inode(inode)) {
		iput(inode);
		inode = NULL;
	}
	return inode;
}

/* replays a single extent in 'eb' at 'slot' with 'key' into the
 * subvolume 'root'.  path is released on entry and should be released
 * on exit.
 *
 * extents in the log tree have not been allocated out of the extent
 * tree yet.  So, this completes the allocation, taking a reference
 * as required if the extent already exists or creating a new extent
 * if it isn't in the extent allocation tree yet.
 *
 * The extent is inserted into the file, dropping any existing extents
 * from the file that overlap the new one.
 */
static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root,
				      struct btrfs_path *path,
				      struct extent_buffer *eb, int slot,
				      struct btrfs_key *key)
{
	int found_type;
	u64 extent_end;
	u64 start = key->offset;
	u64 nbytes = 0;
	struct btrfs_file_extent_item *item;
	struct inode *inode = NULL;
	unsigned long size;
	int ret = 0;

	item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
	found_type = btrfs_file_extent_type(eb, item);

	if (found_type == BTRFS_FILE_EXTENT_REG ||
	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
		nbytes = btrfs_file_extent_num_bytes(eb, item);
		extent_end = start + nbytes;

		/*
		 * We don't add to the inodes nbytes if we are prealloc or a
		 * hole.
		 */
		if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
			nbytes = 0;
	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
		size = btrfs_file_extent_inline_len(eb, slot, item);
		nbytes = btrfs_file_extent_ram_bytes(eb, item);
		extent_end = ALIGN(start + size, root->sectorsize);
	} else {
		ret = 0;
		goto out;
	}

	inode = read_one_inode(root, key->objectid);
	if (!inode) {
		ret = -EIO;
		goto out;
	}

	/*
	 * first check to see if we already have this extent in the
	 * file.  This must be done before the btrfs_drop_extents run
	 * so we don't try to drop this extent.
	 */
	ret = btrfs_lookup_file_extent(trans, root, path, btrfs_ino(inode),
				       start, 0);

	if (ret == 0 &&
	    (found_type == BTRFS_FILE_EXTENT_REG ||
	     found_type == BTRFS_FILE_EXTENT_PREALLOC)) {
		struct btrfs_file_extent_item cmp1;
		struct btrfs_file_extent_item cmp2;
		struct btrfs_file_extent_item *existing;
		struct extent_buffer *leaf;

		leaf = path->nodes[0];
		existing = btrfs_item_ptr(leaf, path->slots[0],
					  struct btrfs_file_extent_item);

		read_extent_buffer(eb, &cmp1, (unsigned long)item,
				   sizeof(cmp1));
		read_extent_buffer(leaf, &cmp2, (unsigned long)existing,
				   sizeof(cmp2));

		/*
		 * we already have a pointer to this exact extent,
		 * we don't have to do anything
		 */
		if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) {
			btrfs_release_path(path);
			goto out;
		}
	}
	btrfs_release_path(path);

	/* drop any overlapping extents */
	ret = btrfs_drop_extents(trans, root, inode, start, extent_end, 1);
	if (ret)
		goto out;

	if (found_type == BTRFS_FILE_EXTENT_REG ||
	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
		u64 offset;
		unsigned long dest_offset;
		struct btrfs_key ins;

		ret = btrfs_insert_empty_item(trans, root, path, key,
					      sizeof(*item));
		if (ret)
			goto out;
		dest_offset = btrfs_item_ptr_offset(path->nodes[0],
						    path->slots[0]);
		copy_extent_buffer(path->nodes[0], eb, dest_offset,
				   (unsigned long)item, sizeof(*item));

		ins.objectid = btrfs_file_extent_disk_bytenr(eb, item);
		ins.offset = btrfs_file_extent_disk_num_bytes(eb, item);
		ins.type = BTRFS_EXTENT_ITEM_KEY;
		offset = key->offset - btrfs_file_extent_offset(eb, item);

		if (ins.objectid > 0) {
			u64 csum_start;
			u64 csum_end;
			LIST_HEAD(ordered_sums);
			/*
			 * is this extent already allocated in the extent
			 * allocation tree?  If so, just add a reference
			 */
			ret = btrfs_lookup_data_extent(root, ins.objectid,
						       ins.offset);
			if (ret == 0) {
				ret = btrfs_inc_extent_ref(trans, root,
						ins.objectid, ins.offset,
						0, root->root_key.objectid,
						key->objectid, offset, 0);
				if (ret)
					goto out;
			} else {
				/*
				 * insert the extent pointer in the extent
				 * allocation tree
				 */
				ret = btrfs_alloc_logged_file_extent(trans,
						root, root->root_key.objectid,
						key->objectid, offset, &ins);
				if (ret)
					goto out;
			}
			btrfs_release_path(path);

			if (btrfs_file_extent_compression(eb, item)) {
				csum_start = ins.objectid;
				csum_end = csum_start + ins.offset;
			} else {
				csum_start = ins.objectid +
					btrfs_file_extent_offset(eb, item);
				csum_end = csum_start +
					btrfs_file_extent_num_bytes(eb, item);
			}

			ret = btrfs_lookup_csums_range(root->log_root,
						       csum_start, csum_end - 1,
						       &ordered_sums, 0);
			if (ret)
				goto out;
			while (!list_empty(&ordered_sums)) {
				struct btrfs_ordered_sum *sums;
				sums = list_entry(ordered_sums.next,
						  struct btrfs_ordered_sum,
						  list);
				if (!ret)
					ret = btrfs_csum_file_blocks(trans,
						root->fs_info->csum_root,
						sums);
				list_del(&sums->list);
				kfree(sums);
			}
			if (ret)
				goto out;
		} else {
			btrfs_release_path(path);
		}
	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
		/* inline extents are easy, we just overwrite them */
		ret = overwrite_item(trans, root, path, eb, slot, key);
		if (ret)
			goto out;
	}

	inode_add_bytes(inode, nbytes);
	ret = btrfs_update_inode(trans, root, inode);
out:
	if (inode)
		iput(inode);
	return ret;
}
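
/*
 * Illustrative summary (not in the original source) of the REG/PREALLOC
 * branch above: a logged extent that already exists in the extent
 * allocation tree only needs its reference count bumped, while a brand
 * new one must be inserted there, and in both cases the checksums that
 * were copied into the log are re-inserted into the csum tree.
 */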

/*
 * when cleaning up conflicts between the directory names in the
 * subvolume, directory names in the log and directory names in the
 * inode back references, we may have to unlink inodes from directories.
 *
 * This is a helper function to do the unlink of a specific directory
 * item
 */
static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root,
				      struct btrfs_path *path,
				      struct inode *dir,
				      struct btrfs_dir_item *di)
{
	struct inode *inode;
	char *name;
	int name_len;
	struct extent_buffer *leaf;
	struct btrfs_key location;
	int ret;

	leaf = path->nodes[0];

	btrfs_dir_item_key_to_cpu(leaf, di, &location);
	name_len = btrfs_dir_name_len(leaf, di);
	name = kmalloc(name_len, GFP_NOFS);
	if (!name)
		return -ENOMEM;

	read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len);
	btrfs_release_path(path);

	inode = read_one_inode(root, location.objectid);
	if (!inode) {
		ret = -EIO;
		goto out;
	}

	ret = link_to_fixup_dir(trans, root, path, location.objectid);
	if (ret)
		goto out;

	ret = btrfs_unlink_inode(trans, root, dir, inode, name, name_len);
	if (ret)
		goto out;
	else
		ret = btrfs_run_delayed_items(trans, root);
out:
	kfree(name);
	iput(inode);
	return ret;
}

/*
 * helper function to see if a given name and sequence number found
 * in an inode back reference are already in a directory and correctly
 * point to this inode
 */
static noinline int inode_in_dir(struct btrfs_root *root,
				 struct btrfs_path *path,
				 u64 dirid, u64 objectid, u64 index,
				 const char *name, int name_len)
{
	struct btrfs_dir_item *di;
	struct btrfs_key location;
	int match = 0;

	di = btrfs_lookup_dir_index_item(NULL, root, path, dirid,
					 index, name, name_len, 0);
	if (di && !IS_ERR(di)) {
		btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
		if (location.objectid != objectid)
			goto out;
	} else
		goto out;
	btrfs_release_path(path);

	di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 0);
	if (di && !IS_ERR(di)) {
		btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
		if (location.objectid != objectid)
			goto out;
	} else
		goto out;
	match = 1;
out:
	btrfs_release_path(path);
	return match;
}

/*
 * helper function to check a log tree for a named back reference in
 * an inode.  This is used to decide if a back reference that is
 * found in the subvolume conflicts with what we find in the log.
 *
 * inode backreferences may have multiple refs in a single item,
 * during replay we process one reference at a time, and we don't
 * want to delete valid links to a file from the subvolume if that
 * link is also in the log.
 */
static noinline int backref_in_log(struct btrfs_root *log,
				   struct btrfs_key *key,
				   u64 ref_objectid,
				   const char *name, int namelen)
{
	struct btrfs_path *path;
	struct btrfs_inode_ref *ref;
	unsigned long ptr;
	unsigned long ptr_end;
	unsigned long name_ptr;
	int found_name_len;
	int item_size;
	int ret;
	int match = 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = btrfs_search_slot(NULL, log, key, path, 0, 0);
	if (ret != 0)
		goto out;

	ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);

	if (key->type == BTRFS_INODE_EXTREF_KEY) {
		if (btrfs_find_name_in_ext_backref(path, ref_objectid,
						   name, namelen, NULL))
			match = 1;

		goto out;
	}

	item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
	ptr_end = ptr + item_size;
	while (ptr < ptr_end) {
		ref = (struct btrfs_inode_ref *)ptr;
		found_name_len = btrfs_inode_ref_name_len(path->nodes[0], ref);
		if (found_name_len == namelen) {
			name_ptr = (unsigned long)(ref + 1);
			ret = memcmp_extent_buffer(path->nodes[0], name,
						   name_ptr, namelen);
			if (ret == 0) {
				match = 1;
				goto out;
			}
		}
		ptr = (unsigned long)(ref + 1) + found_name_len;
	}
out:
	btrfs_free_path(path);
	return match;
}
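
/*
 * Illustrative layout note (not in the original source): a single
 * BTRFS_INODE_REF_KEY item can pack several back references, each a
 * struct btrfs_inode_ref immediately followed by the name bytes:
 *
 *	[ btrfs_inode_ref | "foo" | btrfs_inode_ref | "bar" | ... ]
 *
 * which is why backref_in_log() walks the item with a ptr/ptr_end pair
 * instead of assuming a single entry.
 */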

static inline int __add_inode_ref(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root,
				  struct btrfs_path *path,
				  struct btrfs_root *log_root,
				  struct inode *dir, struct inode *inode,
				  struct extent_buffer *eb,
				  u64 inode_objectid, u64 parent_objectid,
				  u64 ref_index, char *name, int namelen,
				  int *search_done)
{
	int ret;
	char *victim_name;
	int victim_name_len;
	struct extent_buffer *leaf;
	struct btrfs_dir_item *di;
	struct btrfs_key search_key;
	struct btrfs_inode_extref *extref;

again:
	/* Search old style refs */
	search_key.objectid = inode_objectid;
	search_key.type = BTRFS_INODE_REF_KEY;
	search_key.offset = parent_objectid;
	ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
	if (ret == 0) {
		struct btrfs_inode_ref *victim_ref;
		unsigned long ptr;
		unsigned long ptr_end;

		leaf = path->nodes[0];

		/* are we trying to overwrite a back ref for the root directory
		 * if so, just jump out, we're done
		 */
		if (search_key.objectid == search_key.offset)
			return 1;

		/* check all the names in this back reference to see
		 * if they are in the log.  if so, we allow them to stay
		 * otherwise they must be unlinked as a conflict
		 */
		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
		ptr_end = ptr + btrfs_item_size_nr(leaf, path->slots[0]);
		while (ptr < ptr_end) {
			victim_ref = (struct btrfs_inode_ref *)ptr;
			victim_name_len = btrfs_inode_ref_name_len(leaf,
								   victim_ref);
			victim_name = kmalloc(victim_name_len, GFP_NOFS);
			if (!victim_name)
				return -ENOMEM;

			read_extent_buffer(leaf, victim_name,
					   (unsigned long)(victim_ref + 1),
					   victim_name_len);

			if (!backref_in_log(log_root, &search_key,
					    parent_objectid,
					    victim_name,
					    victim_name_len)) {
				inc_nlink(inode);
				btrfs_release_path(path);

				ret = btrfs_unlink_inode(trans, root, dir,
							 inode, victim_name,
							 victim_name_len);
				kfree(victim_name);
				if (ret)
					return ret;
				ret = btrfs_run_delayed_items(trans, root);
				if (ret)
					return ret;
				*search_done = 1;
				goto again;
			}
			kfree(victim_name);

			ptr = (unsigned long)(victim_ref + 1) + victim_name_len;
		}

		/*
		 * NOTE: we have searched root tree and checked the
		 * corresponding ref, it does not need to check again.
		 */
		*search_done = 1;
	}
	btrfs_release_path(path);

	/* Same search but for extended refs */
	extref = btrfs_lookup_inode_extref(NULL, root, path, name, namelen,
					   inode_objectid, parent_objectid, 0,
					   0);
	if (!IS_ERR_OR_NULL(extref)) {
		u32 item_size;
		u32 cur_offset = 0;
		unsigned long base;
		struct inode *victim_parent;

		leaf = path->nodes[0];

		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
		base = btrfs_item_ptr_offset(leaf, path->slots[0]);

		while (cur_offset < item_size) {
			extref = (struct btrfs_inode_extref *)(base + cur_offset);

			victim_name_len = btrfs_inode_extref_name_len(leaf, extref);

			if (btrfs_inode_extref_parent(leaf, extref) != parent_objectid)
				goto next;

			victim_name = kmalloc(victim_name_len, GFP_NOFS);
			if (!victim_name)
				return -ENOMEM;
			read_extent_buffer(leaf, victim_name, (unsigned long)&extref->name,
					   victim_name_len);

			search_key.objectid = inode_objectid;
			search_key.type = BTRFS_INODE_EXTREF_KEY;
			search_key.offset = btrfs_extref_hash(parent_objectid,
							      victim_name,
							      victim_name_len);
			ret = 0;
			if (!backref_in_log(log_root, &search_key,
					    parent_objectid, victim_name,
					    victim_name_len)) {
				ret = -ENOENT;
				victim_parent = read_one_inode(root,
							       parent_objectid);
				if (victim_parent) {
					inc_nlink(inode);
					btrfs_release_path(path);

					ret = btrfs_unlink_inode(trans, root,
								 victim_parent,
								 inode,
								 victim_name,
								 victim_name_len);
					if (!ret)
						ret = btrfs_run_delayed_items(
								trans, root);
				}
				iput(victim_parent);
				kfree(victim_name);
				if (ret)
					return ret;
				*search_done = 1;
				goto again;
			}
			kfree(victim_name);
			if (ret)
				return ret;
next:
			cur_offset += victim_name_len + sizeof(*extref);
		}
		*search_done = 1;
	}
	btrfs_release_path(path);

	/* look for a conflicting sequence number */
	di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir),
					 ref_index, name, namelen, 0);
	if (di && !IS_ERR(di)) {
		ret = drop_one_dir_item(trans, root, path, dir, di);
		if (ret)
			return ret;
	}
	btrfs_release_path(path);

	/* look for a conflicting name */
	di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir),
				   name, namelen, 0);
	if (di && !IS_ERR(di)) {
		ret = drop_one_dir_item(trans, root, path, dir, di);
		if (ret)
			return ret;
	}
	btrfs_release_path(path);

	return 0;
}

static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
			     u32 *namelen, char **name, u64 *index,
			     u64 *parent_objectid)
{
	struct btrfs_inode_extref *extref;

	extref = (struct btrfs_inode_extref *)ref_ptr;

	*namelen = btrfs_inode_extref_name_len(eb, extref);
	*name = kmalloc(*namelen, GFP_NOFS);
	if (*name == NULL)
		return -ENOMEM;

	read_extent_buffer(eb, *name, (unsigned long)&extref->name,
			   *namelen);

	*index = btrfs_inode_extref_index(eb, extref);
	if (parent_objectid)
		*parent_objectid = btrfs_inode_extref_parent(eb, extref);

	return 0;
}

static int ref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
			  u32 *namelen, char **name, u64 *index)
{
	struct btrfs_inode_ref *ref;

	ref = (struct btrfs_inode_ref *)ref_ptr;

	*namelen = btrfs_inode_ref_name_len(eb, ref);
	*name = kmalloc(*namelen, GFP_NOFS);
	if (*name == NULL)
		return -ENOMEM;

	read_extent_buffer(eb, *name, (unsigned long)(ref + 1), *namelen);

	*index = btrfs_inode_ref_index(eb, ref);

	return 0;
}
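
/*
 * Illustrative note (not in the original source): the two helpers above
 * differ because the on-disk formats differ.  An old style ref keys the
 * parent directory in key.offset and stores { index, name_len, name }
 * inline, while an extended ref (used when an inode has too many hard
 * links for one ref item) keys a name hash in key.offset and stores the
 * parent objectid in the item itself, so extref_get_fields() must also
 * return *parent_objectid.
 */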

/*
 * replay one inode back reference item found in the log tree.
 * eb, slot and key refer to the buffer and key found in the log tree.
 * root is the destination we are replaying into, and path is for temp
 * use by this function.  (it should be released on return).
 */
static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root,
				  struct btrfs_root *log,
				  struct btrfs_path *path,
				  struct extent_buffer *eb, int slot,
				  struct btrfs_key *key)
{
	struct inode *dir = NULL;
	struct inode *inode = NULL;
	unsigned long ref_ptr;
	unsigned long ref_end;
	char *name = NULL;
	int namelen;
	int ret;
	int search_done = 0;
	int log_ref_ver = 0;
	u64 parent_objectid;
	u64 inode_objectid;
	u64 ref_index = 0;
	int ref_struct_size;

	ref_ptr = btrfs_item_ptr_offset(eb, slot);
	ref_end = ref_ptr + btrfs_item_size_nr(eb, slot);

	if (key->type == BTRFS_INODE_EXTREF_KEY) {
		struct btrfs_inode_extref *r;

		ref_struct_size = sizeof(struct btrfs_inode_extref);
		log_ref_ver = 1;
		r = (struct btrfs_inode_extref *)ref_ptr;
		parent_objectid = btrfs_inode_extref_parent(eb, r);
	} else {
		ref_struct_size = sizeof(struct btrfs_inode_ref);
		parent_objectid = key->offset;
	}
	inode_objectid = key->objectid;

	/*
	 * it is possible that we didn't log all the parent directories
	 * for a given inode.  If we don't find the dir, just don't
	 * copy the back ref in.  The link count fixup code will take
	 * care of the rest
	 */
	dir = read_one_inode(root, parent_objectid);
	if (!dir) {
		ret = -ENOENT;
		goto out;
	}

	inode = read_one_inode(root, inode_objectid);
	if (!inode) {
		ret = -EIO;
		goto out;
	}

	while (ref_ptr < ref_end) {
		if (log_ref_ver) {
			ret = extref_get_fields(eb, ref_ptr, &namelen, &name,
						&ref_index, &parent_objectid);
			/*
			 * parent object can change from one array
			 * item to another.
			 */
			if (!dir)
				dir = read_one_inode(root, parent_objectid);
			if (!dir) {
				ret = -ENOENT;
				goto out;
			}
		} else {
			ret = ref_get_fields(eb, ref_ptr, &namelen, &name,
					     &ref_index);
		}
		if (ret)
			goto out;

		/* if we already have a perfect match, we're done */
		if (!inode_in_dir(root, path, btrfs_ino(dir), btrfs_ino(inode),
				  ref_index, name, namelen)) {
			/*
			 * look for a conflicting back reference in the
			 * metadata.  if we find one we have to unlink that name
			 * of the file before we add our new link.  Later on, we
			 * overwrite any existing back reference, and we don't
			 * want to create dangling pointers in the directory.
			 */

			if (!search_done) {
				ret = __add_inode_ref(trans, root, path, log,
						      dir, inode, eb,
						      inode_objectid,
						      parent_objectid,
						      ref_index, name, namelen,
						      &search_done);
				if (ret) {
					if (ret == 1)
						ret = 0;
					goto out;
				}
			}

			/* insert our name */
			ret = btrfs_add_link(trans, dir, inode, name, namelen,
					     0, ref_index);
			if (ret)
				goto out;

			btrfs_update_inode(trans, root, inode);
		}

		ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + namelen;
		kfree(name);
		name = NULL;
		if (log_ref_ver) {
			iput(dir);
			dir = NULL;
		}
	}

	/* finally write the back reference in the inode */
	ret = overwrite_item(trans, root, path, eb, slot, key);
out:
	btrfs_release_path(path);
	kfree(name);
	iput(dir);
	iput(inode);
	return ret;
}

static int insert_orphan_item(struct btrfs_trans_handle *trans,
			      struct btrfs_root *root, u64 ino)
{
	int ret;

	ret = btrfs_insert_orphan_item(trans, root, ino);
	if (ret == -EEXIST)
		ret = 0;

	return ret;
}

static int count_inode_extrefs(struct btrfs_root *root,
			       struct inode *inode, struct btrfs_path *path)
{
	int ret = 0;
	int name_len;
	unsigned int nlink = 0;
	u32 item_size;
	u32 cur_offset = 0;
	u64 inode_objectid = btrfs_ino(inode);
	u64 offset = 0;
	unsigned long ptr;
	struct btrfs_inode_extref *extref;
	struct extent_buffer *leaf;

	while (1) {
		ret = btrfs_find_one_extref(root, inode_objectid, offset, path,
					    &extref, &offset);
		if (ret)
			break;

		leaf = path->nodes[0];
		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
		cur_offset = 0;

		while (cur_offset < item_size) {
			extref = (struct btrfs_inode_extref *) (ptr + cur_offset);
			name_len = btrfs_inode_extref_name_len(leaf, extref);

			nlink++;

			cur_offset += name_len + sizeof(*extref);
		}

		offset++;
		btrfs_release_path(path);
	}
	btrfs_release_path(path);

	if (ret < 0 && ret != -ENOENT)
		return ret;
	return nlink;
}

static int count_inode_refs(struct btrfs_root *root,
			    struct inode *inode, struct btrfs_path *path)
{
	int ret;
	struct btrfs_key key;
	unsigned int nlink = 0;
	unsigned long ptr;
	unsigned long ptr_end;
	int name_len;
	u64 ino = btrfs_ino(inode);

	key.objectid = ino;
	key.type = BTRFS_INODE_REF_KEY;
	key.offset = (u64)-1;

	while (1) {
		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
		if (ret < 0)
			break;
		if (ret > 0) {
			if (path->slots[0] == 0)
				break;
			path->slots[0]--;
		}
process_slot:
		btrfs_item_key_to_cpu(path->nodes[0], &key,
				      path->slots[0]);
		if (key.objectid != ino ||
		    key.type != BTRFS_INODE_REF_KEY)
			break;
		ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
		ptr_end = ptr + btrfs_item_size_nr(path->nodes[0],
						   path->slots[0]);
		while (ptr < ptr_end) {
			struct btrfs_inode_ref *ref;

			ref = (struct btrfs_inode_ref *)ptr;
			name_len = btrfs_inode_ref_name_len(path->nodes[0],
							    ref);
			ptr = (unsigned long)(ref + 1) + name_len;
			nlink++;
		}

		if (key.offset == 0)
			break;
		if (path->slots[0] > 0) {
			path->slots[0]--;
			goto process_slot;
		}
		key.offset--;
		btrfs_release_path(path);
	}
	btrfs_release_path(path);

	return nlink;
}

/*
 * There are a few corners where the link count of the file can't
 * be properly maintained during replay.  So, instead of adding
 * lots of complexity to the log code, we just scan the backrefs
 * for any file that has been through replay.
 *
 * The scan will update the link count on the inode to reflect the
 * number of back refs found.  If it goes down to zero, the iput
 * will free the inode.
 */
static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
					   struct btrfs_root *root,
					   struct inode *inode)
{
	struct btrfs_path *path;
	int ret;
	u64 nlink = 0;
	u64 ino = btrfs_ino(inode);

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = count_inode_refs(root, inode, path);
	if (ret < 0)
		goto out;

	nlink = ret;

	ret = count_inode_extrefs(root, inode, path);
	if (ret < 0)
		goto out;

	nlink += ret;

	ret = 0;

	if (nlink != inode->i_nlink) {
		set_nlink(inode, nlink);
		btrfs_update_inode(trans, root, inode);
	}
	BTRFS_I(inode)->index_cnt = (u64)-1;

	if (inode->i_nlink == 0) {
		if (S_ISDIR(inode->i_mode)) {
			ret = replay_dir_deletes(trans, root, NULL, path,
						 ino, 1);
			if (ret)
				goto out;
		}
		ret = insert_orphan_item(trans, root, ino);
	}

out:
	btrfs_free_path(path);
	return ret;
}

static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
					    struct btrfs_root *root,
					    struct btrfs_path *path)
{
	int ret;
	struct btrfs_key key;
	struct inode *inode;

	key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
	key.type = BTRFS_ORPHAN_ITEM_KEY;
	key.offset = (u64)-1;
	while (1) {
		ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
		if (ret < 0)
			break;

		if (ret == 1) {
			if (path->slots[0] == 0)
				break;
			path->slots[0]--;
		}

		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
		if (key.objectid != BTRFS_TREE_LOG_FIXUP_OBJECTID ||
		    key.type != BTRFS_ORPHAN_ITEM_KEY)
			break;

		ret = btrfs_del_item(trans, root, path);
		if (ret)
			goto out;

		btrfs_release_path(path);
		inode = read_one_inode(root, key.offset);
		if (!inode)
			return -EIO;

		ret = fixup_inode_link_count(trans, root, inode);
		iput(inode);
		if (ret)
			goto out;

		/*
		 * fixup on a directory may create new entries,
		 * make sure we always look for the highest possible
		 * offset
		 */
		key.offset = (u64)-1;
	}
	ret = 0;
out:
	btrfs_release_path(path);
	return ret;
}

/*
 * record a given inode in the fixup dir so we can check its link
 * count when replay is done.  The link count is incremented here
 * so the inode won't go away until we check it
 */
static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root,
				      struct btrfs_path *path,
				      u64 objectid)
{
	struct btrfs_key key;
	int ret = 0;
	struct inode *inode;

	inode = read_one_inode(root, objectid);
	if (!inode)
		return -EIO;

	key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
	key.type = BTRFS_ORPHAN_ITEM_KEY;
	key.offset = objectid;

	ret = btrfs_insert_empty_item(trans, root, path, &key, 0);

	btrfs_release_path(path);
	if (ret == 0) {
		if (!inode->i_nlink)
			set_nlink(inode, 1);
		else
			inc_nlink(inode);
		ret = btrfs_update_inode(trans, root, inode);
	} else if (ret == -EEXIST) {
		ret = 0;
	} else {
		BUG(); /* Logic Error */
	}
	iput(inode);

	return ret;
}
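
/*
 * Illustrative note (not in the original source): the "fixup dir" is
 * not a real directory.  Each inode touched by replay gets a zero-size
 * orphan item keyed as
 *
 *	(BTRFS_TREE_LOG_FIXUP_OBJECTID, BTRFS_ORPHAN_ITEM_KEY, ino)
 *
 * and fixup_inode_link_counts() later walks those keys, recounting the
 * back references for each recorded inode.
 */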

/*
 * when replaying the log for a directory, we only insert names
 * for inodes that actually exist.  This means an fsync on a directory
 * does not implicitly fsync all the new files in it
 */
static noinline int insert_one_name(struct btrfs_trans_handle *trans,
				    struct btrfs_root *root,
				    struct btrfs_path *path,
				    u64 dirid, u64 index,
				    char *name, int name_len, u8 type,
				    struct btrfs_key *location)
{
	struct inode *inode;
	struct inode *dir;
	int ret;

	inode = read_one_inode(root, location->objectid);
	if (!inode)
		return -ENOENT;

	dir = read_one_inode(root, dirid);
	if (!dir) {
		iput(inode);
		return -EIO;
	}

	ret = btrfs_add_link(trans, dir, inode, name, name_len, 1, index);

	/* FIXME, put inode into FIXUP list */

	iput(inode);
	iput(dir);
	return ret;
}

/*
 * Return true if an inode reference exists in the log for the given name,
 * inode and parent inode.
 */
static bool name_in_log_ref(struct btrfs_root *log_root,
			    const char *name, const int name_len,
			    const u64 dirid, const u64 ino)
{
	struct btrfs_key search_key;

	search_key.objectid = ino;
	search_key.type = BTRFS_INODE_REF_KEY;
	search_key.offset = dirid;
	if (backref_in_log(log_root, &search_key, dirid, name, name_len))
		return true;

	search_key.type = BTRFS_INODE_EXTREF_KEY;
	search_key.offset = btrfs_extref_hash(dirid, name, name_len);
	if (backref_in_log(log_root, &search_key, dirid, name, name_len))
		return true;

	return false;
}

/*
 * take a single entry in a log directory item and replay it into
 * the subvolume.
 *
 * if a conflicting item exists in the subdirectory already,
 * the inode it points to is unlinked and put into the link count
 * fix up tree.
 *
 * If a name from the log points to a file or directory that does
 * not exist in the FS, it is skipped.  fsyncs on directories
 * do not force down inodes inside that directory, just changes to the
 * names or unlinks in a directory.
 */
static noinline int replay_one_name(struct btrfs_trans_handle *trans,
				    struct btrfs_root *root,
				    struct btrfs_path *path,
				    struct extent_buffer *eb,
				    struct btrfs_dir_item *di,
				    struct btrfs_key *key)
{
	char *name;
	int name_len;
	struct btrfs_dir_item *dst_di;
	struct btrfs_key found_key;
	struct btrfs_key log_key;
	struct inode *dir;
	u8 log_type;
	int exists;
	int ret = 0;
	bool update_size = (key->type == BTRFS_DIR_INDEX_KEY);

	dir = read_one_inode(root, key->objectid);
	if (!dir)
		return -EIO;

	name_len = btrfs_dir_name_len(eb, di);
	name = kmalloc(name_len, GFP_NOFS);
	if (!name) {
		ret = -ENOMEM;
		goto out;
	}

	log_type = btrfs_dir_type(eb, di);
	read_extent_buffer(eb, name, (unsigned long)(di + 1),
			   name_len);

	btrfs_dir_item_key_to_cpu(eb, di, &log_key);
	exists = btrfs_lookup_inode(trans, root, path, &log_key, 0);
	if (exists == 0)
		exists = 1;
	else
		exists = 0;
	btrfs_release_path(path);

	if (key->type == BTRFS_DIR_ITEM_KEY) {
		dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid,
					       name, name_len, 1);
	} else if (key->type == BTRFS_DIR_INDEX_KEY) {
		dst_di = btrfs_lookup_dir_index_item(trans, root, path,
						     key->objectid,
						     key->offset, name,
						     name_len, 1);
	} else {
		/* Corruption */
		ret = -EINVAL;
		goto out;
	}
	if (IS_ERR_OR_NULL(dst_di)) {
		/* we need a sequence number to insert, so we only
		 * do inserts for the BTRFS_DIR_INDEX_KEY types
		 */
		if (key->type != BTRFS_DIR_INDEX_KEY)
			goto out;
		goto insert;
	}

	btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key);
	/* the existing item matches the logged item */
	if (found_key.objectid == log_key.objectid &&
	    found_key.type == log_key.type &&
	    found_key.offset == log_key.offset &&
	    btrfs_dir_type(path->nodes[0], dst_di) == log_type) {
		update_size = false;
		goto out;
	}

	/*
	 * don't drop the conflicting directory entry if the inode
	 * for the new entry doesn't exist
	 */
	if (!exists)
		goto out;

	ret = drop_one_dir_item(trans, root, path, dir, dst_di);
	if (ret)
		goto out;

	if (key->type == BTRFS_DIR_INDEX_KEY)
		goto insert;
out:
	btrfs_release_path(path);
	if (!ret && update_size) {
		btrfs_i_size_write(dir, dir->i_size + name_len * 2);
		ret = btrfs_update_inode(trans, root, dir);
	}
	kfree(name);
	iput(dir);
	return ret;

insert:
	if (name_in_log_ref(root->log_root, name, name_len,
			    key->objectid, log_key.objectid)) {
		/* The dentry will be added later. */
		ret = 0;
		update_size = false;
		goto out;
	}
	btrfs_release_path(path);
	ret = insert_one_name(trans, root, path, key->objectid, key->offset,
			      name, name_len, log_type, &log_key);
	if (ret && ret != -ENOENT && ret != -EEXIST)
		goto out;
	update_size = false;
	ret = 0;
	goto out;
}
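
/*
 * Illustrative note (not in the original source): the name_len * 2 in
 * the i_size update above reflects how btrfs accounts directory sizes:
 * every name contributes its length twice, once for the DIR_ITEM and
 * once for the DIR_INDEX entry, so adding the name "foo" grows the
 * directory's i_size by 6.
 */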

/*
 * find all the names in a directory item and reconcile them into
 * the subvolume.  Only BTRFS_DIR_ITEM_KEY types will have more than
 * one name in a directory item, but the same code gets used for
 * both directory index types
 */
static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
					struct btrfs_root *root,
					struct btrfs_path *path,
					struct extent_buffer *eb, int slot,
					struct btrfs_key *key)
{
	int ret;
	u32 item_size = btrfs_item_size_nr(eb, slot);
	struct btrfs_dir_item *di;
	int name_len;
	unsigned long ptr;
	unsigned long ptr_end;

	ptr = btrfs_item_ptr_offset(eb, slot);
	ptr_end = ptr + item_size;
	while (ptr < ptr_end) {
		di = (struct btrfs_dir_item *)ptr;
		if (verify_dir_item(root, eb, di))
			return -EIO;
		name_len = btrfs_dir_name_len(eb, di);
		ret = replay_one_name(trans, root, path, eb, di, key);
		if (ret)
			return ret;
		ptr = (unsigned long)(di + 1);
		ptr += name_len;
	}
	return 0;
}

/*
 * directory replay has two parts.  There are the standard directory
 * items in the log copied from the subvolume, and range items
 * created in the log while the subvolume was logged.
 *
 * The range items tell us which parts of the key space the log
 * is authoritative for.  During replay, if a key in the subvolume
 * directory is in a logged range item, but not actually in the log
 * that means it was deleted from the directory before the fsync
 * and should be removed.
 */
static noinline int find_dir_range(struct btrfs_root *root,
				   struct btrfs_path *path,
				   u64 dirid, int key_type,
				   u64 *start_ret, u64 *end_ret)
{
	struct btrfs_key key;
	u64 found_end;
	struct btrfs_dir_log_item *item;
	int ret;
	int nritems;

	if (*start_ret == (u64)-1)
		return 1;

	key.objectid = dirid;
	key.type = key_type;
	key.offset = *start_ret;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto out;
	if (ret > 0) {
		if (path->slots[0] == 0)
			goto out;
		path->slots[0]--;
	}
	if (ret != 0)
		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);

	if (key.type != key_type || key.objectid != dirid) {
		ret = 1;
		goto next;
	}
	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
			      struct btrfs_dir_log_item);
	found_end = btrfs_dir_log_end(path->nodes[0], item);

	if (*start_ret >= key.offset && *start_ret <= found_end) {
		ret = 0;
		*start_ret = key.offset;
		*end_ret = found_end;
		goto out;
	}
	ret = 1;
next:
	/* check the next slot in the tree to see if it is a valid item */
	nritems = btrfs_header_nritems(path->nodes[0]);
	if (path->slots[0] >= nritems) {
		ret = btrfs_next_leaf(root, path);
		if (ret)
			goto out;
	} else {
		path->slots[0]++;
	}

	btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);

	if (key.type != key_type || key.objectid != dirid) {
		ret = 1;
		goto out;
	}
	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
			      struct btrfs_dir_log_item);
	found_end = btrfs_dir_log_end(path->nodes[0], item);
	*start_ret = key.offset;
	*end_ret = found_end;
	ret = 0;
out:
	btrfs_release_path(path);
	return ret;
}
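
/*
 * Illustrative example (not in the original source): a range item with
 * key (dirid, BTRFS_DIR_LOG_INDEX_KEY, 2) whose btrfs_dir_log_end is 5
 * declares the log authoritative for directory indexes 2 through 5, so
 * an index-3 entry found in the subvolume but not in the log must have
 * been deleted before the fsync and is removed during replay.
 */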

/*
 * this looks for a given directory item in the log.  If the directory
 * item is not in the log, the item is removed and the inode it points
 * to is unlinked
 */
static noinline int check_item_in_log(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root,
				      struct btrfs_root *log,
				      struct btrfs_path *path,
				      struct btrfs_path *log_path,
				      struct inode *dir,
				      struct btrfs_key *dir_key)
{
	int ret;
	struct extent_buffer *eb;
	int slot;
	u32 item_size;
	struct btrfs_dir_item *di;
	struct btrfs_dir_item *log_di;
	int name_len;
	unsigned long ptr;
	unsigned long ptr_end;
	char *name;
	struct inode *inode;
	struct btrfs_key location;

again:
	eb = path->nodes[0];
	slot = path->slots[0];
	item_size = btrfs_item_size_nr(eb, slot);
	ptr = btrfs_item_ptr_offset(eb, slot);
	ptr_end = ptr + item_size;
	while (ptr < ptr_end) {
		di = (struct btrfs_dir_item *)ptr;
		if (verify_dir_item(root, eb, di)) {
			ret = -EIO;
			goto out;
		}

		name_len = btrfs_dir_name_len(eb, di);
		name = kmalloc(name_len, GFP_NOFS);
		if (!name) {
			ret = -ENOMEM;
			goto out;
		}
		read_extent_buffer(eb, name, (unsigned long)(di + 1),
				   name_len);
		log_di = NULL;
		if (log && dir_key->type == BTRFS_DIR_ITEM_KEY) {
			log_di = btrfs_lookup_dir_item(trans, log, log_path,
						       dir_key->objectid,
						       name, name_len, 0);
		} else if (log && dir_key->type == BTRFS_DIR_INDEX_KEY) {
			log_di = btrfs_lookup_dir_index_item(trans, log,
							     log_path,
							     dir_key->objectid,
							     dir_key->offset,
							     name, name_len, 0);
		}
		if (!log_di || (IS_ERR(log_di) && PTR_ERR(log_di) == -ENOENT)) {
			btrfs_dir_item_key_to_cpu(eb, di, &location);
			btrfs_release_path(path);
			btrfs_release_path(log_path);
			inode = read_one_inode(root, location.objectid);
			if (!inode) {
				kfree(name);
				return -EIO;
			}

			ret = link_to_fixup_dir(trans, root,
						path, location.objectid);
			if (ret) {
				kfree(name);
				iput(inode);
				goto out;
			}

			inc_nlink(inode);
			ret = btrfs_unlink_inode(trans, root, dir, inode,
						 name, name_len);
			if (!ret)
				ret = btrfs_run_delayed_items(trans, root);
			kfree(name);
			iput(inode);
			if (ret)
				goto out;

			/* there might still be more names under this key
			 * check and repeat if required
			 */
			ret = btrfs_search_slot(NULL, root, dir_key, path,
						0, 0);
			if (ret == 0)
				goto again;
			ret = 0;
			goto out;
		} else if (IS_ERR(log_di)) {
			kfree(name);
			return PTR_ERR(log_di);
		}
		btrfs_release_path(log_path);
		kfree(name);

		ptr = (unsigned long)(di + 1);
		ptr += name_len;
	}
	ret = 0;
out:
	btrfs_release_path(path);
	btrfs_release_path(log_path);
	return ret;
}
btrfs_header_nritems(path->nodes[0]); 1987 for (i = path->slots[0]; i < nritems; i++) { 1988 struct btrfs_key key; 1989 struct btrfs_dir_item *di; 1990 struct btrfs_dir_item *log_di; 1991 u32 total_size; 1992 u32 cur; 1993 1994 btrfs_item_key_to_cpu(path->nodes[0], &key, i); 1995 if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY) { 1996 ret = 0; 1997 goto out; 1998 } 1999 2000 di = btrfs_item_ptr(path->nodes[0], i, struct btrfs_dir_item); 2001 total_size = btrfs_item_size_nr(path->nodes[0], i); 2002 cur = 0; 2003 while (cur < total_size) { 2004 u16 name_len = btrfs_dir_name_len(path->nodes[0], di); 2005 u16 data_len = btrfs_dir_data_len(path->nodes[0], di); 2006 u32 this_len = sizeof(*di) + name_len + data_len; 2007 char *name; 2008 2009 name = kmalloc(name_len, GFP_NOFS); 2010 if (!name) { 2011 ret = -ENOMEM; 2012 goto out; 2013 } 2014 read_extent_buffer(path->nodes[0], name, 2015 (unsigned long)(di + 1), name_len); 2016 2017 log_di = btrfs_lookup_xattr(NULL, log, log_path, ino, 2018 name, name_len, 0); 2019 btrfs_release_path(log_path); 2020 if (!log_di) { 2021 /* Doesn't exist in log tree, so delete it. */ 2022 btrfs_release_path(path); 2023 di = btrfs_lookup_xattr(trans, root, path, ino, 2024 name, name_len, -1); 2025 kfree(name); 2026 if (IS_ERR(di)) { 2027 ret = PTR_ERR(di); 2028 goto out; 2029 } 2030 ASSERT(di); 2031 ret = btrfs_delete_one_dir_name(trans, root, 2032 path, di); 2033 if (ret) 2034 goto out; 2035 btrfs_release_path(path); 2036 search_key = key; 2037 goto again; 2038 } 2039 kfree(name); 2040 if (IS_ERR(log_di)) { 2041 ret = PTR_ERR(log_di); 2042 goto out; 2043 } 2044 cur += this_len; 2045 di = (struct btrfs_dir_item *)((char *)di + this_len); 2046 } 2047 } 2048 ret = btrfs_next_leaf(root, path); 2049 if (ret > 0) 2050 ret = 0; 2051 else if (ret == 0) 2052 goto process_leaf; 2053 out: 2054 btrfs_free_path(log_path); 2055 btrfs_release_path(path); 2056 return ret; 2057 } 2058 2059 2060 /* 2061 * deletion replay happens before we copy any new directory items 2062 * out of the log or out of backreferences from inodes. It 2063 * scans the log to find ranges of keys that log is authoritative for, 2064 * and then scans the directory to find items in those ranges that are 2065 * not present in the log. 2066 * 2067 * Anything we don't find in the log is unlinked and removed from the 2068 * directory. 
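 *
 * For example (a hypothetical sequence, in the style of the cases above):
 *
 * touch dir/a
 * touch dir/b
 * normal commit
 * rm dir/a
 * fsync(dir)
 *
 * The log ends up with a range item covering the key space around 'a'
 * and 'b', but only a directory entry for 'b'.  On replay, 'a' is still
 * present in the subvolume copy of the directory, falls inside an
 * authoritative range, is missing from the log itself, and so gets
 * unlinked.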
2069 */ 2070 static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, 2071 struct btrfs_root *root, 2072 struct btrfs_root *log, 2073 struct btrfs_path *path, 2074 u64 dirid, int del_all) 2075 { 2076 u64 range_start; 2077 u64 range_end; 2078 int key_type = BTRFS_DIR_LOG_ITEM_KEY; 2079 int ret = 0; 2080 struct btrfs_key dir_key; 2081 struct btrfs_key found_key; 2082 struct btrfs_path *log_path; 2083 struct inode *dir; 2084 2085 dir_key.objectid = dirid; 2086 dir_key.type = BTRFS_DIR_ITEM_KEY; 2087 log_path = btrfs_alloc_path(); 2088 if (!log_path) 2089 return -ENOMEM; 2090 2091 dir = read_one_inode(root, dirid); 2092 /* it isn't an error if the inode isn't there, that can happen 2093 * because we replay the deletes before we copy in the inode item 2094 * from the log 2095 */ 2096 if (!dir) { 2097 btrfs_free_path(log_path); 2098 return 0; 2099 } 2100 again: 2101 range_start = 0; 2102 range_end = 0; 2103 while (1) { 2104 if (del_all) 2105 range_end = (u64)-1; 2106 else { 2107 ret = find_dir_range(log, path, dirid, key_type, 2108 &range_start, &range_end); 2109 if (ret != 0) 2110 break; 2111 } 2112 2113 dir_key.offset = range_start; 2114 while (1) { 2115 int nritems; 2116 ret = btrfs_search_slot(NULL, root, &dir_key, path, 2117 0, 0); 2118 if (ret < 0) 2119 goto out; 2120 2121 nritems = btrfs_header_nritems(path->nodes[0]); 2122 if (path->slots[0] >= nritems) { 2123 ret = btrfs_next_leaf(root, path); 2124 if (ret) 2125 break; 2126 } 2127 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 2128 path->slots[0]); 2129 if (found_key.objectid != dirid || 2130 found_key.type != dir_key.type) 2131 goto next_type; 2132 2133 if (found_key.offset > range_end) 2134 break; 2135 2136 ret = check_item_in_log(trans, root, log, path, 2137 log_path, dir, 2138 &found_key); 2139 if (ret) 2140 goto out; 2141 if (found_key.offset == (u64)-1) 2142 break; 2143 dir_key.offset = found_key.offset + 1; 2144 } 2145 btrfs_release_path(path); 2146 if (range_end == (u64)-1) 2147 break; 2148 range_start = range_end + 1; 2149 } 2150 2151 next_type: 2152 ret = 0; 2153 if (key_type == BTRFS_DIR_LOG_ITEM_KEY) { 2154 key_type = BTRFS_DIR_LOG_INDEX_KEY; 2155 dir_key.type = BTRFS_DIR_INDEX_KEY; 2156 btrfs_release_path(path); 2157 goto again; 2158 } 2159 out: 2160 btrfs_release_path(path); 2161 btrfs_free_path(log_path); 2162 iput(dir); 2163 return ret; 2164 } 2165 2166 /* 2167 * the process_func used to replay items from the log tree. This 2168 * gets called in two different stages. The first stage just looks 2169 * for inodes and makes sure they are all copied into the subvolume. 2170 * 2171 * The second stage copies all the other item types from the log into 2172 * the subvolume. The two stage approach is slower, but gets rid of 2173 * lots of complexity around inodes referencing other inodes that exist 2174 * only in the log (references come from either directory items or inode 2175 * back refs). 
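 *
 * For example, a log leaf can hold a directory entry pointing at inode
 * 258 (a hypothetical objectid) long before the leaf holding the inode
 * item for 258 is walked.  Because the first stage creates every logged
 * inode up front, the later stages never see a name or back reference
 * they cannot resolve.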
2176 */ 2177 static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, 2178 struct walk_control *wc, u64 gen) 2179 { 2180 int nritems; 2181 struct btrfs_path *path; 2182 struct btrfs_root *root = wc->replay_dest; 2183 struct btrfs_key key; 2184 int level; 2185 int i; 2186 int ret; 2187 2188 ret = btrfs_read_buffer(eb, gen); 2189 if (ret) 2190 return ret; 2191 2192 level = btrfs_header_level(eb); 2193 2194 if (level != 0) 2195 return 0; 2196 2197 path = btrfs_alloc_path(); 2198 if (!path) 2199 return -ENOMEM; 2200 2201 nritems = btrfs_header_nritems(eb); 2202 for (i = 0; i < nritems; i++) { 2203 btrfs_item_key_to_cpu(eb, &key, i); 2204 2205 /* inode keys are done during the first stage */ 2206 if (key.type == BTRFS_INODE_ITEM_KEY && 2207 wc->stage == LOG_WALK_REPLAY_INODES) { 2208 struct btrfs_inode_item *inode_item; 2209 u32 mode; 2210 2211 inode_item = btrfs_item_ptr(eb, i, 2212 struct btrfs_inode_item); 2213 ret = replay_xattr_deletes(wc->trans, root, log, 2214 path, key.objectid); 2215 if (ret) 2216 break; 2217 mode = btrfs_inode_mode(eb, inode_item); 2218 if (S_ISDIR(mode)) { 2219 ret = replay_dir_deletes(wc->trans, 2220 root, log, path, key.objectid, 0); 2221 if (ret) 2222 break; 2223 } 2224 ret = overwrite_item(wc->trans, root, path, 2225 eb, i, &key); 2226 if (ret) 2227 break; 2228 2229 /* for regular files, make sure the corresponding 2230 * orphan item exists. Extents past the new EOF 2231 * will be truncated later by orphan cleanup. 2232 */ 2233 if (S_ISREG(mode)) { 2234 ret = insert_orphan_item(wc->trans, root, 2235 key.objectid); 2236 if (ret) 2237 break; 2238 } 2239 2240 ret = link_to_fixup_dir(wc->trans, root, 2241 path, key.objectid); 2242 if (ret) 2243 break; 2244 } 2245 2246 if (key.type == BTRFS_DIR_INDEX_KEY && 2247 wc->stage == LOG_WALK_REPLAY_DIR_INDEX) { 2248 ret = replay_one_dir_item(wc->trans, root, path, 2249 eb, i, &key); 2250 if (ret) 2251 break; 2252 } 2253 2254 if (wc->stage < LOG_WALK_REPLAY_ALL) 2255 continue; 2256 2257 /* these keys are simply copied */ 2258 if (key.type == BTRFS_XATTR_ITEM_KEY) { 2259 ret = overwrite_item(wc->trans, root, path, 2260 eb, i, &key); 2261 if (ret) 2262 break; 2263 } else if (key.type == BTRFS_INODE_REF_KEY || 2264 key.type == BTRFS_INODE_EXTREF_KEY) { 2265 ret = add_inode_ref(wc->trans, root, log, path, 2266 eb, i, &key); 2267 if (ret && ret != -ENOENT) 2268 break; 2269 ret = 0; 2270 } else if (key.type == BTRFS_EXTENT_DATA_KEY) { 2271 ret = replay_one_extent(wc->trans, root, path, 2272 eb, i, &key); 2273 if (ret) 2274 break; 2275 } else if (key.type == BTRFS_DIR_ITEM_KEY) { 2276 ret = replay_one_dir_item(wc->trans, root, path, 2277 eb, i, &key); 2278 if (ret) 2279 break; 2280 } 2281 } 2282 btrfs_free_path(path); 2283 return ret; 2284 } 2285 2286 static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, 2287 struct btrfs_root *root, 2288 struct btrfs_path *path, int *level, 2289 struct walk_control *wc) 2290 { 2291 u64 root_owner; 2292 u64 bytenr; 2293 u64 ptr_gen; 2294 struct extent_buffer *next; 2295 struct extent_buffer *cur; 2296 struct extent_buffer *parent; 2297 u32 blocksize; 2298 int ret = 0; 2299 2300 WARN_ON(*level < 0); 2301 WARN_ON(*level >= BTRFS_MAX_LEVEL); 2302 2303 while (*level > 0) { 2304 WARN_ON(*level < 0); 2305 WARN_ON(*level >= BTRFS_MAX_LEVEL); 2306 cur = path->nodes[*level]; 2307 2308 WARN_ON(btrfs_header_level(cur) != *level); 2309 2310 if (path->slots[*level] >= 2311 btrfs_header_nritems(cur)) 2312 break; 2313 2314 bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
2315 ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]); 2316 blocksize = root->nodesize; 2317 2318 parent = path->nodes[*level]; 2319 root_owner = btrfs_header_owner(parent); 2320 2321 next = btrfs_find_create_tree_block(root, bytenr); 2322 if (!next) 2323 return -ENOMEM; 2324 2325 if (*level == 1) { 2326 ret = wc->process_func(root, next, wc, ptr_gen); 2327 if (ret) { 2328 free_extent_buffer(next); 2329 return ret; 2330 } 2331 2332 path->slots[*level]++; 2333 if (wc->free) { 2334 ret = btrfs_read_buffer(next, ptr_gen); 2335 if (ret) { 2336 free_extent_buffer(next); 2337 return ret; 2338 } 2339 2340 if (trans) { 2341 btrfs_tree_lock(next); 2342 btrfs_set_lock_blocking(next); 2343 clean_tree_block(trans, root->fs_info, 2344 next); 2345 btrfs_wait_tree_block_writeback(next); 2346 btrfs_tree_unlock(next); 2347 } 2348 2349 WARN_ON(root_owner != 2350 BTRFS_TREE_LOG_OBJECTID); 2351 ret = btrfs_free_and_pin_reserved_extent(root, 2352 bytenr, blocksize); 2353 if (ret) { 2354 free_extent_buffer(next); 2355 return ret; 2356 } 2357 } 2358 free_extent_buffer(next); 2359 continue; 2360 } 2361 ret = btrfs_read_buffer(next, ptr_gen); 2362 if (ret) { 2363 free_extent_buffer(next); 2364 return ret; 2365 } 2366 2367 WARN_ON(*level <= 0); 2368 if (path->nodes[*level-1]) 2369 free_extent_buffer(path->nodes[*level-1]); 2370 path->nodes[*level-1] = next; 2371 *level = btrfs_header_level(next); 2372 path->slots[*level] = 0; 2373 cond_resched(); 2374 } 2375 WARN_ON(*level < 0); 2376 WARN_ON(*level >= BTRFS_MAX_LEVEL); 2377 2378 path->slots[*level] = btrfs_header_nritems(path->nodes[*level]); 2379 2380 cond_resched(); 2381 return 0; 2382 } 2383 2384 static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, 2385 struct btrfs_root *root, 2386 struct btrfs_path *path, int *level, 2387 struct walk_control *wc) 2388 { 2389 u64 root_owner; 2390 int i; 2391 int slot; 2392 int ret; 2393 2394 for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) { 2395 slot = path->slots[i]; 2396 if (slot + 1 < btrfs_header_nritems(path->nodes[i])) { 2397 path->slots[i]++; 2398 *level = i; 2399 WARN_ON(*level == 0); 2400 return 0; 2401 } else { 2402 struct extent_buffer *parent; 2403 if (path->nodes[*level] == root->node) 2404 parent = path->nodes[*level]; 2405 else 2406 parent = path->nodes[*level + 1]; 2407 2408 root_owner = btrfs_header_owner(parent); 2409 ret = wc->process_func(root, path->nodes[*level], wc, 2410 btrfs_header_generation(path->nodes[*level])); 2411 if (ret) 2412 return ret; 2413 2414 if (wc->free) { 2415 struct extent_buffer *next; 2416 2417 next = path->nodes[*level]; 2418 2419 if (trans) { 2420 btrfs_tree_lock(next); 2421 btrfs_set_lock_blocking(next); 2422 clean_tree_block(trans, root->fs_info, 2423 next); 2424 btrfs_wait_tree_block_writeback(next); 2425 btrfs_tree_unlock(next); 2426 } 2427 2428 WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID); 2429 ret = btrfs_free_and_pin_reserved_extent(root, 2430 path->nodes[*level]->start, 2431 path->nodes[*level]->len); 2432 if (ret) 2433 return ret; 2434 } 2435 free_extent_buffer(path->nodes[*level]); 2436 path->nodes[*level] = NULL; 2437 *level = i + 1; 2438 } 2439 } 2440 return 1; 2441 } 2442 2443 /* 2444 * drop the reference count on the tree rooted at 'log'. This traverses 2445 * the tree freeing any blocks that have a ref count of zero after being 2446 * decremented.
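 *
 * (whether blocks are actually freed here is controlled by wc->free;
 * the replay walks run this same code with wc->free clear and only
 * invoke wc->process_func on each block)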
2447 */ 2448 static int walk_log_tree(struct btrfs_trans_handle *trans, 2449 struct btrfs_root *log, struct walk_control *wc) 2450 { 2451 int ret = 0; 2452 int wret; 2453 int level; 2454 struct btrfs_path *path; 2455 int orig_level; 2456 2457 path = btrfs_alloc_path(); 2458 if (!path) 2459 return -ENOMEM; 2460 2461 level = btrfs_header_level(log->node); 2462 orig_level = level; 2463 path->nodes[level] = log->node; 2464 extent_buffer_get(log->node); 2465 path->slots[level] = 0; 2466 2467 while (1) { 2468 wret = walk_down_log_tree(trans, log, path, &level, wc); 2469 if (wret > 0) 2470 break; 2471 if (wret < 0) { 2472 ret = wret; 2473 goto out; 2474 } 2475 2476 wret = walk_up_log_tree(trans, log, path, &level, wc); 2477 if (wret > 0) 2478 break; 2479 if (wret < 0) { 2480 ret = wret; 2481 goto out; 2482 } 2483 } 2484 2485 /* was the root node processed? if not, catch it here */ 2486 if (path->nodes[orig_level]) { 2487 ret = wc->process_func(log, path->nodes[orig_level], wc, 2488 btrfs_header_generation(path->nodes[orig_level])); 2489 if (ret) 2490 goto out; 2491 if (wc->free) { 2492 struct extent_buffer *next; 2493 2494 next = path->nodes[orig_level]; 2495 2496 if (trans) { 2497 btrfs_tree_lock(next); 2498 btrfs_set_lock_blocking(next); 2499 clean_tree_block(trans, log->fs_info, next); 2500 btrfs_wait_tree_block_writeback(next); 2501 btrfs_tree_unlock(next); 2502 } 2503 2504 WARN_ON(log->root_key.objectid != 2505 BTRFS_TREE_LOG_OBJECTID); 2506 ret = btrfs_free_and_pin_reserved_extent(log, next->start, 2507 next->len); 2508 if (ret) 2509 goto out; 2510 } 2511 } 2512 2513 out: 2514 btrfs_free_path(path); 2515 return ret; 2516 } 2517 2518 /* 2519 * helper function to update the item for a given subvolumes log root 2520 * in the tree of log roots 2521 */ 2522 static int update_log_root(struct btrfs_trans_handle *trans, 2523 struct btrfs_root *log) 2524 { 2525 int ret; 2526 2527 if (log->log_transid == 1) { 2528 /* insert root item on the first sync */ 2529 ret = btrfs_insert_root(trans, log->fs_info->log_root_tree, 2530 &log->root_key, &log->root_item); 2531 } else { 2532 ret = btrfs_update_root(trans, log->fs_info->log_root_tree, 2533 &log->root_key, &log->root_item); 2534 } 2535 return ret; 2536 } 2537 2538 static void wait_log_commit(struct btrfs_trans_handle *trans, 2539 struct btrfs_root *root, int transid) 2540 { 2541 DEFINE_WAIT(wait); 2542 int index = transid % 2; 2543 2544 /* 2545 * we only allow two pending log transactions at a time, 2546 * so we know that if ours is more than 2 older than the 2547 * current transaction, we're done 2548 */ 2549 do { 2550 prepare_to_wait(&root->log_commit_wait[index], 2551 &wait, TASK_UNINTERRUPTIBLE); 2552 mutex_unlock(&root->log_mutex); 2553 2554 if (root->log_transid_committed < transid && 2555 atomic_read(&root->log_commit[index])) 2556 schedule(); 2557 2558 finish_wait(&root->log_commit_wait[index], &wait); 2559 mutex_lock(&root->log_mutex); 2560 } while (root->log_transid_committed < transid && 2561 atomic_read(&root->log_commit[index])); 2562 } 2563 2564 static void wait_for_writer(struct btrfs_trans_handle *trans, 2565 struct btrfs_root *root) 2566 { 2567 DEFINE_WAIT(wait); 2568 2569 while (atomic_read(&root->log_writers)) { 2570 prepare_to_wait(&root->log_writer_wait, 2571 &wait, TASK_UNINTERRUPTIBLE); 2572 mutex_unlock(&root->log_mutex); 2573 if (atomic_read(&root->log_writers)) 2574 schedule(); 2575 finish_wait(&root->log_writer_wait, &wait); 2576 mutex_lock(&root->log_mutex); 2577 } 2578 } 2579 2580 static inline void 
btrfs_remove_log_ctx(struct btrfs_root *root, 2581 struct btrfs_log_ctx *ctx) 2582 { 2583 if (!ctx) 2584 return; 2585 2586 mutex_lock(&root->log_mutex); 2587 list_del_init(&ctx->list); 2588 mutex_unlock(&root->log_mutex); 2589 } 2590 2591 /* 2592 * Invoked in log mutex context, or when we are sure there is no other 2593 * task which can access the list. 2594 */ 2595 static inline void btrfs_remove_all_log_ctxs(struct btrfs_root *root, 2596 int index, int error) 2597 { 2598 struct btrfs_log_ctx *ctx; 2599 2600 if (!error) { 2601 INIT_LIST_HEAD(&root->log_ctxs[index]); 2602 return; 2603 } 2604 2605 list_for_each_entry(ctx, &root->log_ctxs[index], list) 2606 ctx->log_ret = error; 2607 2608 INIT_LIST_HEAD(&root->log_ctxs[index]); 2609 } 2610 2611 /* 2612 * btrfs_sync_log sends a given tree log down to the disk and 2613 * updates the super blocks to record it. When this call is done, 2614 * you know that any inodes previously logged are safely on disk only 2615 * if it returns 0. 2616 * 2617 * Any other return value means you need to call btrfs_commit_transaction. 2618 * Some of the edge cases for fsyncing directories that have had unlinks 2619 * or renames done in the past mean that sometimes the only safe 2620 * fsync is to commit the whole FS. When btrfs_sync_log returns -EAGAIN, 2621 * that has happened. 2622 */ 2623 int btrfs_sync_log(struct btrfs_trans_handle *trans, 2624 struct btrfs_root *root, struct btrfs_log_ctx *ctx) 2625 { 2626 int index1; 2627 int index2; 2628 int mark; 2629 int ret; 2630 struct btrfs_root *log = root->log_root; 2631 struct btrfs_root *log_root_tree = root->fs_info->log_root_tree; 2632 int log_transid = 0; 2633 struct btrfs_log_ctx root_log_ctx; 2634 struct blk_plug plug; 2635 2636 mutex_lock(&root->log_mutex); 2637 log_transid = ctx->log_transid; 2638 if (root->log_transid_committed >= log_transid) { 2639 mutex_unlock(&root->log_mutex); 2640 return ctx->log_ret; 2641 } 2642 2643 index1 = log_transid % 2; 2644 if (atomic_read(&root->log_commit[index1])) { 2645 wait_log_commit(trans, root, log_transid); 2646 mutex_unlock(&root->log_mutex); 2647 return ctx->log_ret; 2648 } 2649 ASSERT(log_transid == root->log_transid); 2650 atomic_set(&root->log_commit[index1], 1); 2651 2652 /* wait for previous tree log sync to complete */ 2653 if (atomic_read(&root->log_commit[(index1 + 1) % 2])) 2654 wait_log_commit(trans, root, log_transid - 1); 2655 2656 while (1) { 2657 int batch = atomic_read(&root->log_batch); 2658 /* when we're on an ssd, just kick the log commit out */ 2659 if (!btrfs_test_opt(root, SSD) && 2660 test_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state)) { 2661 mutex_unlock(&root->log_mutex); 2662 schedule_timeout_uninterruptible(1); 2663 mutex_lock(&root->log_mutex); 2664 } 2665 wait_for_writer(trans, root); 2666 if (batch == atomic_read(&root->log_batch)) 2667 break; 2668 } 2669 2670 /* bail out if we need to do a full commit */ 2671 if (btrfs_need_log_full_commit(root->fs_info, trans)) { 2672 ret = -EAGAIN; 2673 btrfs_free_logged_extents(log, log_transid); 2674 mutex_unlock(&root->log_mutex); 2675 goto out; 2676 } 2677 2678 if (log_transid % 2 == 0) 2679 mark = EXTENT_DIRTY; 2680 else 2681 mark = EXTENT_NEW; 2682 2683 /* we start IO on all the marked extents here, but we don't actually 2684 * wait for them until later.
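 * The blk_start_plug()/blk_finish_plug() pair around the writes lets
 * the block layer merge the queued submissions into larger requests.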
2685 */ 2686 blk_start_plug(&plug); 2687 ret = btrfs_write_marked_extents(log, &log->dirty_log_pages, mark); 2688 if (ret) { 2689 blk_finish_plug(&plug); 2690 btrfs_abort_transaction(trans, root, ret); 2691 btrfs_free_logged_extents(log, log_transid); 2692 btrfs_set_log_full_commit(root->fs_info, trans); 2693 mutex_unlock(&root->log_mutex); 2694 goto out; 2695 } 2696 2697 btrfs_set_root_node(&log->root_item, log->node); 2698 2699 root->log_transid++; 2700 log->log_transid = root->log_transid; 2701 root->log_start_pid = 0; 2702 /* 2703 * IO has been started, blocks of the log tree have WRITTEN flag set 2704 * in their headers. new modifications of the log will be written to 2705 * new positions. so it's safe to allow log writers to go in. 2706 */ 2707 mutex_unlock(&root->log_mutex); 2708 2709 btrfs_init_log_ctx(&root_log_ctx); 2710 2711 mutex_lock(&log_root_tree->log_mutex); 2712 atomic_inc(&log_root_tree->log_batch); 2713 atomic_inc(&log_root_tree->log_writers); 2714 2715 index2 = log_root_tree->log_transid % 2; 2716 list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]); 2717 root_log_ctx.log_transid = log_root_tree->log_transid; 2718 2719 mutex_unlock(&log_root_tree->log_mutex); 2720 2721 ret = update_log_root(trans, log); 2722 2723 mutex_lock(&log_root_tree->log_mutex); 2724 if (atomic_dec_and_test(&log_root_tree->log_writers)) { 2725 smp_mb(); 2726 if (waitqueue_active(&log_root_tree->log_writer_wait)) 2727 wake_up(&log_root_tree->log_writer_wait); 2728 } 2729 2730 if (ret) { 2731 if (!list_empty(&root_log_ctx.list)) 2732 list_del_init(&root_log_ctx.list); 2733 2734 blk_finish_plug(&plug); 2735 btrfs_set_log_full_commit(root->fs_info, trans); 2736 2737 if (ret != -ENOSPC) { 2738 btrfs_abort_transaction(trans, root, ret); 2739 mutex_unlock(&log_root_tree->log_mutex); 2740 goto out; 2741 } 2742 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); 2743 btrfs_free_logged_extents(log, log_transid); 2744 mutex_unlock(&log_root_tree->log_mutex); 2745 ret = -EAGAIN; 2746 goto out; 2747 } 2748 2749 if (log_root_tree->log_transid_committed >= root_log_ctx.log_transid) { 2750 blk_finish_plug(&plug); 2751 mutex_unlock(&log_root_tree->log_mutex); 2752 ret = root_log_ctx.log_ret; 2753 goto out; 2754 } 2755 2756 index2 = root_log_ctx.log_transid % 2; 2757 if (atomic_read(&log_root_tree->log_commit[index2])) { 2758 blk_finish_plug(&plug); 2759 ret = btrfs_wait_marked_extents(log, &log->dirty_log_pages, 2760 mark); 2761 btrfs_wait_logged_extents(trans, log, log_transid); 2762 wait_log_commit(trans, log_root_tree, 2763 root_log_ctx.log_transid); 2764 mutex_unlock(&log_root_tree->log_mutex); 2765 if (!ret) 2766 ret = root_log_ctx.log_ret; 2767 goto out; 2768 } 2769 ASSERT(root_log_ctx.log_transid == log_root_tree->log_transid); 2770 atomic_set(&log_root_tree->log_commit[index2], 1); 2771 2772 if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) { 2773 wait_log_commit(trans, log_root_tree, 2774 root_log_ctx.log_transid - 1); 2775 } 2776 2777 wait_for_writer(trans, log_root_tree); 2778 2779 /* 2780 * now that we've moved on to the tree of log tree roots, 2781 * check the full commit flag again 2782 */ 2783 if (btrfs_need_log_full_commit(root->fs_info, trans)) { 2784 blk_finish_plug(&plug); 2785 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); 2786 btrfs_free_logged_extents(log, log_transid); 2787 mutex_unlock(&log_root_tree->log_mutex); 2788 ret = -EAGAIN; 2789 goto out_wake_log_root; 2790 } 2791 2792 ret = btrfs_write_marked_extents(log_root_tree, 2793 
&log_root_tree->dirty_log_pages, 2794 EXTENT_DIRTY | EXTENT_NEW); 2795 blk_finish_plug(&plug); 2796 if (ret) { 2797 btrfs_set_log_full_commit(root->fs_info, trans); 2798 btrfs_abort_transaction(trans, root, ret); 2799 btrfs_free_logged_extents(log, log_transid); 2800 mutex_unlock(&log_root_tree->log_mutex); 2801 goto out_wake_log_root; 2802 } 2803 ret = btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); 2804 if (!ret) 2805 ret = btrfs_wait_marked_extents(log_root_tree, 2806 &log_root_tree->dirty_log_pages, 2807 EXTENT_NEW | EXTENT_DIRTY); 2808 if (ret) { 2809 btrfs_set_log_full_commit(root->fs_info, trans); 2810 btrfs_free_logged_extents(log, log_transid); 2811 mutex_unlock(&log_root_tree->log_mutex); 2812 goto out_wake_log_root; 2813 } 2814 btrfs_wait_logged_extents(trans, log, log_transid); 2815 2816 btrfs_set_super_log_root(root->fs_info->super_for_commit, 2817 log_root_tree->node->start); 2818 btrfs_set_super_log_root_level(root->fs_info->super_for_commit, 2819 btrfs_header_level(log_root_tree->node)); 2820 2821 log_root_tree->log_transid++; 2822 mutex_unlock(&log_root_tree->log_mutex); 2823 2824 /* 2825 * nobody else is going to jump in and write the ctree 2826 * super here because the log_commit atomic below is protecting 2827 * us. We must be called with a transaction handle pinning 2828 * the running transaction open, so a full commit can't hop 2829 * in and cause problems either. 2830 */ 2831 ret = write_ctree_super(trans, root->fs_info->tree_root, 1); 2832 if (ret) { 2833 btrfs_set_log_full_commit(root->fs_info, trans); 2834 btrfs_abort_transaction(trans, root, ret); 2835 goto out_wake_log_root; 2836 } 2837 2838 mutex_lock(&root->log_mutex); 2839 if (root->last_log_commit < log_transid) 2840 root->last_log_commit = log_transid; 2841 mutex_unlock(&root->log_mutex); 2842 2843 out_wake_log_root: 2844 /* 2845 * We needn't get log_mutex here because we are sure all 2846 * the other tasks are blocked. 2847 */ 2848 btrfs_remove_all_log_ctxs(log_root_tree, index2, ret); 2849 2850 mutex_lock(&log_root_tree->log_mutex); 2851 log_root_tree->log_transid_committed++; 2852 atomic_set(&log_root_tree->log_commit[index2], 0); 2853 mutex_unlock(&log_root_tree->log_mutex); 2854 2855 if (waitqueue_active(&log_root_tree->log_commit_wait[index2])) 2856 wake_up(&log_root_tree->log_commit_wait[index2]); 2857 out: 2858 /* See above.
*/ 2859 btrfs_remove_all_log_ctxs(root, index1, ret); 2860 2861 mutex_lock(&root->log_mutex); 2862 root->log_transid_committed++; 2863 atomic_set(&root->log_commit[index1], 0); 2864 mutex_unlock(&root->log_mutex); 2865 2866 if (waitqueue_active(&root->log_commit_wait[index1])) 2867 wake_up(&root->log_commit_wait[index1]); 2868 return ret; 2869 } 2870 2871 static void free_log_tree(struct btrfs_trans_handle *trans, 2872 struct btrfs_root *log) 2873 { 2874 int ret; 2875 u64 start; 2876 u64 end; 2877 struct walk_control wc = { 2878 .free = 1, 2879 .process_func = process_one_buffer 2880 }; 2881 2882 ret = walk_log_tree(trans, log, &wc); 2883 /* I don't think this can happen but just in case */ 2884 if (ret) 2885 btrfs_abort_transaction(trans, log, ret); 2886 2887 while (1) { 2888 ret = find_first_extent_bit(&log->dirty_log_pages, 2889 0, &start, &end, EXTENT_DIRTY | EXTENT_NEW, 2890 NULL); 2891 if (ret) 2892 break; 2893 2894 clear_extent_bits(&log->dirty_log_pages, start, end, 2895 EXTENT_DIRTY | EXTENT_NEW, GFP_NOFS); 2896 } 2897 2898 /* 2899 * We may have short-circuited the log tree with the full commit logic 2900 * and left ordered extents on our list, so clear these out to keep us 2901 * from leaking inodes and memory. 2902 */ 2903 btrfs_free_logged_extents(log, 0); 2904 btrfs_free_logged_extents(log, 1); 2905 2906 free_extent_buffer(log->node); 2907 kfree(log); 2908 } 2909 2910 /* 2911 * free all the extents used by the tree log. This should be called 2912 * at commit time of the full transaction 2913 */ 2914 int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root) 2915 { 2916 if (root->log_root) { 2917 free_log_tree(trans, root->log_root); 2918 root->log_root = NULL; 2919 } 2920 return 0; 2921 } 2922 2923 int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans, 2924 struct btrfs_fs_info *fs_info) 2925 { 2926 if (fs_info->log_root_tree) { 2927 free_log_tree(trans, fs_info->log_root_tree); 2928 fs_info->log_root_tree = NULL; 2929 } 2930 return 0; 2931 } 2932 2933 /* 2934 * If both a file and directory are logged, and unlinks or renames are 2935 * mixed in, we have a few interesting corners: 2936 * 2937 * create file X in dir Y 2938 * link file X to X.link in dir Y 2939 * fsync file X 2940 * unlink file X but leave X.link 2941 * fsync dir Y 2942 * 2943 * After a crash we would expect only X.link to exist. But file X 2944 * didn't get fsync'd again so the log has back refs for X and X.link. 2945 * 2946 * We solve this by removing directory entries and inode backrefs from the 2947 * log when a file that was logged in the current transaction is 2948 * unlinked. Any later fsync will include the updated log entries, and 2949 * we'll be able to reconstruct the proper directory items from backrefs. 2950 * 2951 * This optimization allows us to avoid relogging the entire inode 2952 * or the entire directory.
2953 */ 2954 int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, 2955 struct btrfs_root *root, 2956 const char *name, int name_len, 2957 struct inode *dir, u64 index) 2958 { 2959 struct btrfs_root *log; 2960 struct btrfs_dir_item *di; 2961 struct btrfs_path *path; 2962 int ret; 2963 int err = 0; 2964 int bytes_del = 0; 2965 u64 dir_ino = btrfs_ino(dir); 2966 2967 if (BTRFS_I(dir)->logged_trans < trans->transid) 2968 return 0; 2969 2970 ret = join_running_log_trans(root); 2971 if (ret) 2972 return 0; 2973 2974 mutex_lock(&BTRFS_I(dir)->log_mutex); 2975 2976 log = root->log_root; 2977 path = btrfs_alloc_path(); 2978 if (!path) { 2979 err = -ENOMEM; 2980 goto out_unlock; 2981 } 2982 2983 di = btrfs_lookup_dir_item(trans, log, path, dir_ino, 2984 name, name_len, -1); 2985 if (IS_ERR(di)) { 2986 err = PTR_ERR(di); 2987 goto fail; 2988 } 2989 if (di) { 2990 ret = btrfs_delete_one_dir_name(trans, log, path, di); 2991 bytes_del += name_len; 2992 if (ret) { 2993 err = ret; 2994 goto fail; 2995 } 2996 } 2997 btrfs_release_path(path); 2998 di = btrfs_lookup_dir_index_item(trans, log, path, dir_ino, 2999 index, name, name_len, -1); 3000 if (IS_ERR(di)) { 3001 err = PTR_ERR(di); 3002 goto fail; 3003 } 3004 if (di) { 3005 ret = btrfs_delete_one_dir_name(trans, log, path, di); 3006 bytes_del += name_len; 3007 if (ret) { 3008 err = ret; 3009 goto fail; 3010 } 3011 } 3012 3013 /* update the directory size in the log to reflect the names 3014 * we have removed 3015 */ 3016 if (bytes_del) { 3017 struct btrfs_key key; 3018 3019 key.objectid = dir_ino; 3020 key.offset = 0; 3021 key.type = BTRFS_INODE_ITEM_KEY; 3022 btrfs_release_path(path); 3023 3024 ret = btrfs_search_slot(trans, log, &key, path, 0, 1); 3025 if (ret < 0) { 3026 err = ret; 3027 goto fail; 3028 } 3029 if (ret == 0) { 3030 struct btrfs_inode_item *item; 3031 u64 i_size; 3032 3033 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 3034 struct btrfs_inode_item); 3035 i_size = btrfs_inode_size(path->nodes[0], item); 3036 if (i_size > bytes_del) 3037 i_size -= bytes_del; 3038 else 3039 i_size = 0; 3040 btrfs_set_inode_size(path->nodes[0], item, i_size); 3041 btrfs_mark_buffer_dirty(path->nodes[0]); 3042 } else 3043 ret = 0; 3044 btrfs_release_path(path); 3045 } 3046 fail: 3047 btrfs_free_path(path); 3048 out_unlock: 3049 mutex_unlock(&BTRFS_I(dir)->log_mutex); 3050 if (ret == -ENOSPC) { 3051 btrfs_set_log_full_commit(root->fs_info, trans); 3052 ret = 0; 3053 } else if (ret < 0) 3054 btrfs_abort_transaction(trans, root, ret); 3055 3056 btrfs_end_log_trans(root); 3057 3058 return err; 3059 } 3060 3061 /* see comments for btrfs_del_dir_entries_in_log */ 3062 int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, 3063 struct btrfs_root *root, 3064 const char *name, int name_len, 3065 struct inode *inode, u64 dirid) 3066 { 3067 struct btrfs_root *log; 3068 u64 index; 3069 int ret; 3070 3071 if (BTRFS_I(inode)->logged_trans < trans->transid) 3072 return 0; 3073 3074 ret = join_running_log_trans(root); 3075 if (ret) 3076 return 0; 3077 log = root->log_root; 3078 mutex_lock(&BTRFS_I(inode)->log_mutex); 3079 3080 ret = btrfs_del_inode_ref(trans, log, name, name_len, btrfs_ino(inode), 3081 dirid, &index); 3082 mutex_unlock(&BTRFS_I(inode)->log_mutex); 3083 if (ret == -ENOSPC) { 3084 btrfs_set_log_full_commit(root->fs_info, trans); 3085 ret = 0; 3086 } else if (ret < 0 && ret != -ENOENT) 3087 btrfs_abort_transaction(trans, root, ret); 3088 btrfs_end_log_trans(root); 3089 3090 return ret; 3091 } 3092 3093 /* 3094 * creates a range 
item in the log for 'dirid'. first_offset and 3095 * last_offset tell us which parts of the key space the log should 3096 * be considered authoritative for. 3097 */ 3098 static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans, 3099 struct btrfs_root *log, 3100 struct btrfs_path *path, 3101 int key_type, u64 dirid, 3102 u64 first_offset, u64 last_offset) 3103 { 3104 int ret; 3105 struct btrfs_key key; 3106 struct btrfs_dir_log_item *item; 3107 3108 key.objectid = dirid; 3109 key.offset = first_offset; 3110 if (key_type == BTRFS_DIR_ITEM_KEY) 3111 key.type = BTRFS_DIR_LOG_ITEM_KEY; 3112 else 3113 key.type = BTRFS_DIR_LOG_INDEX_KEY; 3114 ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item)); 3115 if (ret) 3116 return ret; 3117 3118 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 3119 struct btrfs_dir_log_item); 3120 btrfs_set_dir_log_end(path->nodes[0], item, last_offset); 3121 btrfs_mark_buffer_dirty(path->nodes[0]); 3122 btrfs_release_path(path); 3123 return 0; 3124 } 3125 3126 /* 3127 * log all the items included in the current transaction for a given 3128 * directory. This also creates the range items in the log tree required 3129 * to replay anything deleted before the fsync 3130 */ 3131 static noinline int log_dir_items(struct btrfs_trans_handle *trans, 3132 struct btrfs_root *root, struct inode *inode, 3133 struct btrfs_path *path, 3134 struct btrfs_path *dst_path, int key_type, 3135 struct btrfs_log_ctx *ctx, 3136 u64 min_offset, u64 *last_offset_ret) 3137 { 3138 struct btrfs_key min_key; 3139 struct btrfs_root *log = root->log_root; 3140 struct extent_buffer *src; 3141 int err = 0; 3142 int ret; 3143 int i; 3144 int nritems; 3145 u64 first_offset = min_offset; 3146 u64 last_offset = (u64)-1; 3147 u64 ino = btrfs_ino(inode); 3148 3149 log = root->log_root; 3150 3151 min_key.objectid = ino; 3152 min_key.type = key_type; 3153 min_key.offset = min_offset; 3154 3155 ret = btrfs_search_forward(root, &min_key, path, trans->transid); 3156 3157 /* 3158 * we didn't find anything from this transaction, see if there 3159 * is anything at all 3160 */ 3161 if (ret != 0 || min_key.objectid != ino || min_key.type != key_type) { 3162 min_key.objectid = ino; 3163 min_key.type = key_type; 3164 min_key.offset = (u64)-1; 3165 btrfs_release_path(path); 3166 ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); 3167 if (ret < 0) { 3168 btrfs_release_path(path); 3169 return ret; 3170 } 3171 ret = btrfs_previous_item(root, path, ino, key_type); 3172 3173 /* if ret == 0 there are items for this type, 3174 * create a range to tell us the last key of this type. 3175 * otherwise, there are no items in this directory after 3176 * *min_offset, and we create a range to indicate that. 
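 * For example (hypothetical offsets): if the directory only has index
 * keys at offsets 2 and 5 from older transactions and *min_offset is 0,
 * we log a range item covering [6, (u64)-1], so replay never removes
 * the old, still valid entries at 2 and 5.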
3177 */ 3178 if (ret == 0) { 3179 struct btrfs_key tmp; 3180 btrfs_item_key_to_cpu(path->nodes[0], &tmp, 3181 path->slots[0]); 3182 if (key_type == tmp.type) 3183 first_offset = max(min_offset, tmp.offset) + 1; 3184 } 3185 goto done; 3186 } 3187 3188 /* go backward to find any previous key */ 3189 ret = btrfs_previous_item(root, path, ino, key_type); 3190 if (ret == 0) { 3191 struct btrfs_key tmp; 3192 btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]); 3193 if (key_type == tmp.type) { 3194 first_offset = tmp.offset; 3195 ret = overwrite_item(trans, log, dst_path, 3196 path->nodes[0], path->slots[0], 3197 &tmp); 3198 if (ret) { 3199 err = ret; 3200 goto done; 3201 } 3202 } 3203 } 3204 btrfs_release_path(path); 3205 3206 /* find the first key from this transaction again */ 3207 ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); 3208 if (WARN_ON(ret != 0)) 3209 goto done; 3210 3211 /* 3212 * we have a block from this transaction, log every item in it 3213 * from our directory 3214 */ 3215 while (1) { 3216 struct btrfs_key tmp; 3217 src = path->nodes[0]; 3218 nritems = btrfs_header_nritems(src); 3219 for (i = path->slots[0]; i < nritems; i++) { 3220 struct btrfs_dir_item *di; 3221 3222 btrfs_item_key_to_cpu(src, &min_key, i); 3223 3224 if (min_key.objectid != ino || min_key.type != key_type) 3225 goto done; 3226 ret = overwrite_item(trans, log, dst_path, src, i, 3227 &min_key); 3228 if (ret) { 3229 err = ret; 3230 goto done; 3231 } 3232 3233 /* 3234 * We must make sure that when we log a directory entry, 3235 * the corresponding inode, after log replay, has a 3236 * matching link count. For example: 3237 * 3238 * touch foo 3239 * mkdir mydir 3240 * sync 3241 * ln foo mydir/bar 3242 * xfs_io -c "fsync" mydir 3243 * <crash> 3244 * <mount fs and log replay> 3245 * 3246 * Would result in a fsync log that when replayed, our 3247 * file inode would have a link count of 1, but we get 3248 * two directory entries pointing to the same inode. 3249 * After removing one of the names, it would not be 3250 * possible to remove the other name, which resulted 3251 * always in stale file handle errors, and would not 3252 * be possible to rmdir the parent directory, since 3253 * its i_size could never decrement to the value 3254 * BTRFS_EMPTY_DIR_SIZE, resulting in -ENOTEMPTY errors. 
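 * Flagging ctx->log_new_dentries below is what allows the caller to
 * also log the inodes behind any new dentries it finds, keeping the
 * replayed link counts consistent with the directory entries.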
3255 */ 3256 di = btrfs_item_ptr(src, i, struct btrfs_dir_item); 3257 btrfs_dir_item_key_to_cpu(src, di, &tmp); 3258 if (ctx && 3259 (btrfs_dir_transid(src, di) == trans->transid || 3260 btrfs_dir_type(src, di) == BTRFS_FT_DIR) && 3261 tmp.type != BTRFS_ROOT_ITEM_KEY) 3262 ctx->log_new_dentries = true; 3263 } 3264 path->slots[0] = nritems; 3265 3266 /* 3267 * look ahead to the next item and see if it is also 3268 * from this directory and from this transaction 3269 */ 3270 ret = btrfs_next_leaf(root, path); 3271 if (ret == 1) { 3272 last_offset = (u64)-1; 3273 goto done; 3274 } 3275 btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]); 3276 if (tmp.objectid != ino || tmp.type != key_type) { 3277 last_offset = (u64)-1; 3278 goto done; 3279 } 3280 if (btrfs_header_generation(path->nodes[0]) != trans->transid) { 3281 ret = overwrite_item(trans, log, dst_path, 3282 path->nodes[0], path->slots[0], 3283 &tmp); 3284 if (ret) 3285 err = ret; 3286 else 3287 last_offset = tmp.offset; 3288 goto done; 3289 } 3290 } 3291 done: 3292 btrfs_release_path(path); 3293 btrfs_release_path(dst_path); 3294 3295 if (err == 0) { 3296 *last_offset_ret = last_offset; 3297 /* 3298 * insert the log range keys to indicate where the log 3299 * is valid 3300 */ 3301 ret = insert_dir_log_key(trans, log, path, key_type, 3302 ino, first_offset, last_offset); 3303 if (ret) 3304 err = ret; 3305 } 3306 return err; 3307 } 3308 3309 /* 3310 * logging directories is very similar to logging inodes. We find all the items 3311 * from the current transaction and write them to the log. 3312 * 3313 * The recovery code scans the directory in the subvolume, and if it finds a 3314 * key in the range logged that is not present in the log tree, then it means 3315 * that dir entry was unlinked during the transaction. 3316 * 3317 * In order for that scan to work, we must include one key smaller than 3318 * the smallest logged by this transaction and one key larger than the largest 3319 * key logged by this transaction. 3320 */ 3321 static noinline int log_directory_changes(struct btrfs_trans_handle *trans, 3322 struct btrfs_root *root, struct inode *inode, 3323 struct btrfs_path *path, 3324 struct btrfs_path *dst_path, 3325 struct btrfs_log_ctx *ctx) 3326 { 3327 u64 min_key; 3328 u64 max_key; 3329 int ret; 3330 int key_type = BTRFS_DIR_ITEM_KEY; 3331 3332 again: 3333 min_key = 0; 3334 max_key = 0; 3335 while (1) { 3336 ret = log_dir_items(trans, root, inode, path, 3337 dst_path, key_type, ctx, min_key, 3338 &max_key); 3339 if (ret) 3340 return ret; 3341 if (max_key == (u64)-1) 3342 break; 3343 min_key = max_key + 1; 3344 } 3345 3346 if (key_type == BTRFS_DIR_ITEM_KEY) { 3347 key_type = BTRFS_DIR_INDEX_KEY; 3348 goto again; 3349 } 3350 return 0; 3351 } 3352 3353 /* 3354 * a helper function to drop items from the log before we relog an 3355 * inode. max_key_type indicates the highest item type to remove. 3356 * This cannot be run for file data extents because it does not 3357 * free the extents they point to.
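 * (for instance, when an inode is relogged with LOG_INODE_EXISTS, only
 * items up to and including the xattrs are dropped and re-copied)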
3358 */ 3359 static int drop_objectid_items(struct btrfs_trans_handle *trans, 3360 struct btrfs_root *log, 3361 struct btrfs_path *path, 3362 u64 objectid, int max_key_type) 3363 { 3364 int ret; 3365 struct btrfs_key key; 3366 struct btrfs_key found_key; 3367 int start_slot; 3368 3369 key.objectid = objectid; 3370 key.type = max_key_type; 3371 key.offset = (u64)-1; 3372 3373 while (1) { 3374 ret = btrfs_search_slot(trans, log, &key, path, -1, 1); 3375 BUG_ON(ret == 0); /* Logic error */ 3376 if (ret < 0) 3377 break; 3378 3379 if (path->slots[0] == 0) 3380 break; 3381 3382 path->slots[0]--; 3383 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 3384 path->slots[0]); 3385 3386 if (found_key.objectid != objectid) 3387 break; 3388 3389 found_key.offset = 0; 3390 found_key.type = 0; 3391 ret = btrfs_bin_search(path->nodes[0], &found_key, 0, 3392 &start_slot); 3393 3394 ret = btrfs_del_items(trans, log, path, start_slot, 3395 path->slots[0] - start_slot + 1); 3396 /* 3397 * If start slot isn't 0 then we don't need to re-search, we've 3398 * found the last guy with the objectid in this tree. 3399 */ 3400 if (ret || start_slot != 0) 3401 break; 3402 btrfs_release_path(path); 3403 } 3404 btrfs_release_path(path); 3405 if (ret > 0) 3406 ret = 0; 3407 return ret; 3408 } 3409 3410 static void fill_inode_item(struct btrfs_trans_handle *trans, 3411 struct extent_buffer *leaf, 3412 struct btrfs_inode_item *item, 3413 struct inode *inode, int log_inode_only, 3414 u64 logged_isize) 3415 { 3416 struct btrfs_map_token token; 3417 3418 btrfs_init_map_token(&token); 3419 3420 if (log_inode_only) { 3421 /* set the generation to zero so the recovery code 3422 * can tell the difference between logging 3423 * just to say 'this inode exists' and logging 3424 * to say 'update this inode with these values' 3425 */ 3426 btrfs_set_token_inode_generation(leaf, item, 0, &token); 3427 btrfs_set_token_inode_size(leaf, item, logged_isize, &token); 3428 } else { 3429 btrfs_set_token_inode_generation(leaf, item, 3430 BTRFS_I(inode)->generation, 3431 &token); 3432 btrfs_set_token_inode_size(leaf, item, inode->i_size, &token); 3433 } 3434 3435 btrfs_set_token_inode_uid(leaf, item, i_uid_read(inode), &token); 3436 btrfs_set_token_inode_gid(leaf, item, i_gid_read(inode), &token); 3437 btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token); 3438 btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token); 3439 3440 btrfs_set_token_timespec_sec(leaf, &item->atime, 3441 inode->i_atime.tv_sec, &token); 3442 btrfs_set_token_timespec_nsec(leaf, &item->atime, 3443 inode->i_atime.tv_nsec, &token); 3444 3445 btrfs_set_token_timespec_sec(leaf, &item->mtime, 3446 inode->i_mtime.tv_sec, &token); 3447 btrfs_set_token_timespec_nsec(leaf, &item->mtime, 3448 inode->i_mtime.tv_nsec, &token); 3449 3450 btrfs_set_token_timespec_sec(leaf, &item->ctime, 3451 inode->i_ctime.tv_sec, &token); 3452 btrfs_set_token_timespec_nsec(leaf, &item->ctime, 3453 inode->i_ctime.tv_nsec, &token); 3454 3455 btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode), 3456 &token); 3457 3458 btrfs_set_token_inode_sequence(leaf, item, inode->i_version, &token); 3459 btrfs_set_token_inode_transid(leaf, item, trans->transid, &token); 3460 btrfs_set_token_inode_rdev(leaf, item, inode->i_rdev, &token); 3461 btrfs_set_token_inode_flags(leaf, item, BTRFS_I(inode)->flags, &token); 3462 btrfs_set_token_inode_block_group(leaf, item, 0, &token); 3463 } 3464 3465 static int log_inode_item(struct btrfs_trans_handle *trans, 3466 struct btrfs_root *log, struct
btrfs_path *path, 3467 struct inode *inode) 3468 { 3469 struct btrfs_inode_item *inode_item; 3470 int ret; 3471 3472 ret = btrfs_insert_empty_item(trans, log, path, 3473 &BTRFS_I(inode)->location, 3474 sizeof(*inode_item)); 3475 if (ret && ret != -EEXIST) 3476 return ret; 3477 inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0], 3478 struct btrfs_inode_item); 3479 fill_inode_item(trans, path->nodes[0], inode_item, inode, 0, 0); 3480 btrfs_release_path(path); 3481 return 0; 3482 } 3483 3484 static noinline int copy_items(struct btrfs_trans_handle *trans, 3485 struct inode *inode, 3486 struct btrfs_path *dst_path, 3487 struct btrfs_path *src_path, u64 *last_extent, 3488 int start_slot, int nr, int inode_only, 3489 u64 logged_isize) 3490 { 3491 unsigned long src_offset; 3492 unsigned long dst_offset; 3493 struct btrfs_root *log = BTRFS_I(inode)->root->log_root; 3494 struct btrfs_file_extent_item *extent; 3495 struct btrfs_inode_item *inode_item; 3496 struct extent_buffer *src = src_path->nodes[0]; 3497 struct btrfs_key first_key, last_key, key; 3498 int ret; 3499 struct btrfs_key *ins_keys; 3500 u32 *ins_sizes; 3501 char *ins_data; 3502 int i; 3503 struct list_head ordered_sums; 3504 int skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; 3505 bool has_extents = false; 3506 bool need_find_last_extent = true; 3507 bool done = false; 3508 3509 INIT_LIST_HEAD(&ordered_sums); 3510 3511 ins_data = kmalloc(nr * sizeof(struct btrfs_key) + 3512 nr * sizeof(u32), GFP_NOFS); 3513 if (!ins_data) 3514 return -ENOMEM; 3515 3516 first_key.objectid = (u64)-1; 3517 3518 ins_sizes = (u32 *)ins_data; 3519 ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32)); 3520 3521 for (i = 0; i < nr; i++) { 3522 ins_sizes[i] = btrfs_item_size_nr(src, i + start_slot); 3523 btrfs_item_key_to_cpu(src, ins_keys + i, i + start_slot); 3524 } 3525 ret = btrfs_insert_empty_items(trans, log, dst_path, 3526 ins_keys, ins_sizes, nr); 3527 if (ret) { 3528 kfree(ins_data); 3529 return ret; 3530 } 3531 3532 for (i = 0; i < nr; i++, dst_path->slots[0]++) { 3533 dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0], 3534 dst_path->slots[0]); 3535 3536 src_offset = btrfs_item_ptr_offset(src, start_slot + i); 3537 3538 if ((i == (nr - 1))) 3539 last_key = ins_keys[i]; 3540 3541 if (ins_keys[i].type == BTRFS_INODE_ITEM_KEY) { 3542 inode_item = btrfs_item_ptr(dst_path->nodes[0], 3543 dst_path->slots[0], 3544 struct btrfs_inode_item); 3545 fill_inode_item(trans, dst_path->nodes[0], inode_item, 3546 inode, inode_only == LOG_INODE_EXISTS, 3547 logged_isize); 3548 } else { 3549 copy_extent_buffer(dst_path->nodes[0], src, dst_offset, 3550 src_offset, ins_sizes[i]); 3551 } 3552 3553 /* 3554 * We set need_find_last_extent here in case we know we were 3555 * processing other items and then walk into the first extent in 3556 * the inode. If we don't hit an extent then nothing changes, 3557 * we'll do the last search the next time around. 
3558 */ 3559 if (ins_keys[i].type == BTRFS_EXTENT_DATA_KEY) { 3560 has_extents = true; 3561 if (first_key.objectid == (u64)-1) 3562 first_key = ins_keys[i]; 3563 } else { 3564 need_find_last_extent = false; 3565 } 3566 3567 /* take a reference on file data extents so that truncates 3568 * or deletes of this inode don't have to relog the inode 3569 * again 3570 */ 3571 if (ins_keys[i].type == BTRFS_EXTENT_DATA_KEY && 3572 !skip_csum) { 3573 int found_type; 3574 extent = btrfs_item_ptr(src, start_slot + i, 3575 struct btrfs_file_extent_item); 3576 3577 if (btrfs_file_extent_generation(src, extent) < trans->transid) 3578 continue; 3579 3580 found_type = btrfs_file_extent_type(src, extent); 3581 if (found_type == BTRFS_FILE_EXTENT_REG) { 3582 u64 ds, dl, cs, cl; 3583 ds = btrfs_file_extent_disk_bytenr(src, 3584 extent); 3585 /* ds == 0 is a hole */ 3586 if (ds == 0) 3587 continue; 3588 3589 dl = btrfs_file_extent_disk_num_bytes(src, 3590 extent); 3591 cs = btrfs_file_extent_offset(src, extent); 3592 cl = btrfs_file_extent_num_bytes(src, 3593 extent); 3594 if (btrfs_file_extent_compression(src, 3595 extent)) { 3596 cs = 0; 3597 cl = dl; 3598 } 3599 3600 ret = btrfs_lookup_csums_range( 3601 log->fs_info->csum_root, 3602 ds + cs, ds + cs + cl - 1, 3603 &ordered_sums, 0); 3604 if (ret) { 3605 btrfs_release_path(dst_path); 3606 kfree(ins_data); 3607 return ret; 3608 } 3609 } 3610 } 3611 } 3612 3613 btrfs_mark_buffer_dirty(dst_path->nodes[0]); 3614 btrfs_release_path(dst_path); 3615 kfree(ins_data); 3616 3617 /* 3618 * we have to do this after the loop above to avoid changing the 3619 * log tree while trying to change the log tree. 3620 */ 3621 ret = 0; 3622 while (!list_empty(&ordered_sums)) { 3623 struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next, 3624 struct btrfs_ordered_sum, 3625 list); 3626 if (!ret) 3627 ret = btrfs_csum_file_blocks(trans, log, sums); 3628 list_del(&sums->list); 3629 kfree(sums); 3630 } 3631 3632 if (!has_extents) 3633 return ret; 3634 3635 if (need_find_last_extent && *last_extent == first_key.offset) { 3636 /* 3637 * We don't have any leafs between our current one and the one 3638 * we processed before that can have file extent items for our 3639 * inode (and have a generation number smaller than our current 3640 * transaction id). 3641 */ 3642 need_find_last_extent = false; 3643 } 3644 3645 /* 3646 * Because we use btrfs_search_forward we could skip leaves that were 3647 * not modified and then assume *last_extent is valid when it really 3648 * isn't. So back up to the previous leaf and read the end of the last 3649 * extent before we go and fill in holes. 
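 *
 * A sketch of the hole filling done below (hypothetical offsets):
 *
 * extents copied to the log:  [0, 4k)           [16k, 20k)
 * hole extent inserted:                [4k, 16k)
 *
 * Without the explicit hole item, replay could leave a stale extent
 * from an earlier transaction mapped over [4k, 16k).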
3650 */ 3651 if (need_find_last_extent) { 3652 u64 len; 3653 3654 ret = btrfs_prev_leaf(BTRFS_I(inode)->root, src_path); 3655 if (ret < 0) 3656 return ret; 3657 if (ret) 3658 goto fill_holes; 3659 if (src_path->slots[0]) 3660 src_path->slots[0]--; 3661 src = src_path->nodes[0]; 3662 btrfs_item_key_to_cpu(src, &key, src_path->slots[0]); 3663 if (key.objectid != btrfs_ino(inode) || 3664 key.type != BTRFS_EXTENT_DATA_KEY) 3665 goto fill_holes; 3666 extent = btrfs_item_ptr(src, src_path->slots[0], 3667 struct btrfs_file_extent_item); 3668 if (btrfs_file_extent_type(src, extent) == 3669 BTRFS_FILE_EXTENT_INLINE) { 3670 len = btrfs_file_extent_inline_len(src, 3671 src_path->slots[0], 3672 extent); 3673 *last_extent = ALIGN(key.offset + len, 3674 log->sectorsize); 3675 } else { 3676 len = btrfs_file_extent_num_bytes(src, extent); 3677 *last_extent = key.offset + len; 3678 } 3679 } 3680 fill_holes: 3681 /* So we did prev_leaf, now we need to move to the next leaf, but a few 3682 * things could have happened 3683 * 3684 * 1) A merge could have happened, so we could currently be on a leaf 3685 * that holds what we were copying in the first place. 3686 * 2) A split could have happened, and now not all of the items we want 3687 * are on the same leaf. 3688 * 3689 * So we need to adjust how we search for holes, we need to drop the 3690 * path and re-search for the first extent key we found, and then walk 3691 * forward until we hit the last one we copied. 3692 */ 3693 if (need_find_last_extent) { 3694 /* btrfs_prev_leaf could return 1 without releasing the path */ 3695 btrfs_release_path(src_path); 3696 ret = btrfs_search_slot(NULL, BTRFS_I(inode)->root, &first_key, 3697 src_path, 0, 0); 3698 if (ret < 0) 3699 return ret; 3700 ASSERT(ret == 0); 3701 src = src_path->nodes[0]; 3702 i = src_path->slots[0]; 3703 } else { 3704 i = start_slot; 3705 } 3706 3707 /* 3708 * Ok so here we need to go through and fill in any holes we may have 3709 * to make sure that holes are punched for those areas in case they had 3710 * extents previously. 3711 */ 3712 while (!done) { 3713 u64 offset, len; 3714 u64 extent_end; 3715 3716 if (i >= btrfs_header_nritems(src_path->nodes[0])) { 3717 ret = btrfs_next_leaf(BTRFS_I(inode)->root, src_path); 3718 if (ret < 0) 3719 return ret; 3720 ASSERT(ret == 0); 3721 src = src_path->nodes[0]; 3722 i = 0; 3723 } 3724 3725 btrfs_item_key_to_cpu(src, &key, i); 3726 if (!btrfs_comp_cpu_keys(&key, &last_key)) 3727 done = true; 3728 if (key.objectid != btrfs_ino(inode) || 3729 key.type != BTRFS_EXTENT_DATA_KEY) { 3730 i++; 3731 continue; 3732 } 3733 extent = btrfs_item_ptr(src, i, struct btrfs_file_extent_item); 3734 if (btrfs_file_extent_type(src, extent) == 3735 BTRFS_FILE_EXTENT_INLINE) { 3736 len = btrfs_file_extent_inline_len(src, i, extent); 3737 extent_end = ALIGN(key.offset + len, log->sectorsize); 3738 } else { 3739 len = btrfs_file_extent_num_bytes(src, extent); 3740 extent_end = key.offset + len; 3741 } 3742 i++; 3743 3744 if (*last_extent == key.offset) { 3745 *last_extent = extent_end; 3746 continue; 3747 } 3748 offset = *last_extent; 3749 len = key.offset - *last_extent; 3750 ret = btrfs_insert_file_extent(trans, log, btrfs_ino(inode), 3751 offset, 0, 0, len, 0, len, 0, 3752 0, 0); 3753 if (ret) 3754 break; 3755 *last_extent = extent_end; 3756 } 3757 /* 3758 * Need to let the callers know we dropped the path so they should 3759 * re-search. 
3760 */ 3761 if (!ret && need_find_last_extent) 3762 ret = 1; 3763 return ret; 3764 } 3765 3766 static int extent_cmp(void *priv, struct list_head *a, struct list_head *b) 3767 { 3768 struct extent_map *em1, *em2; 3769 3770 em1 = list_entry(a, struct extent_map, list); 3771 em2 = list_entry(b, struct extent_map, list); 3772 3773 if (em1->start < em2->start) 3774 return -1; 3775 else if (em1->start > em2->start) 3776 return 1; 3777 return 0; 3778 } 3779 3780 static int wait_ordered_extents(struct btrfs_trans_handle *trans, 3781 struct inode *inode, 3782 struct btrfs_root *root, 3783 const struct extent_map *em, 3784 const struct list_head *logged_list, 3785 bool *ordered_io_error) 3786 { 3787 struct btrfs_ordered_extent *ordered; 3788 struct btrfs_root *log = root->log_root; 3789 u64 mod_start = em->mod_start; 3790 u64 mod_len = em->mod_len; 3791 const bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; 3792 u64 csum_offset; 3793 u64 csum_len; 3794 LIST_HEAD(ordered_sums); 3795 int ret = 0; 3796 3797 *ordered_io_error = false; 3798 3799 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) || 3800 em->block_start == EXTENT_MAP_HOLE) 3801 return 0; 3802 3803 /* 3804 * Wait for any ordered extent that covers our extent map. If it 3805 * finishes without an error, first check and see if our csums are on 3806 * our outstanding ordered extents. 3807 */ 3808 list_for_each_entry(ordered, logged_list, log_list) { 3809 struct btrfs_ordered_sum *sum; 3810 3811 if (!mod_len) 3812 break; 3813 3814 if (ordered->file_offset + ordered->len <= mod_start || 3815 mod_start + mod_len <= ordered->file_offset) 3816 continue; 3817 3818 if (!test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags) && 3819 !test_bit(BTRFS_ORDERED_IOERR, &ordered->flags) && 3820 !test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags)) { 3821 const u64 start = ordered->file_offset; 3822 const u64 end = ordered->file_offset + ordered->len - 1; 3823 3824 WARN_ON(ordered->inode != inode); 3825 filemap_fdatawrite_range(inode->i_mapping, start, end); 3826 } 3827 3828 wait_event(ordered->wait, 3829 (test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags) || 3830 test_bit(BTRFS_ORDERED_IOERR, &ordered->flags))); 3831 3832 if (test_bit(BTRFS_ORDERED_IOERR, &ordered->flags)) { 3833 /* 3834 * Clear the AS_EIO/AS_ENOSPC flags from the inode's 3835 * i_mapping flags, so that the next fsync won't get 3836 * an outdated io error too. 3837 */ 3838 btrfs_inode_check_errors(inode); 3839 *ordered_io_error = true; 3840 break; 3841 } 3842 /* 3843 * We are going to copy all the csums on this ordered extent, so 3844 * go ahead and adjust mod_start and mod_len in case this 3845 * ordered extent has already been logged. 3846 */ 3847 if (ordered->file_offset > mod_start) { 3848 if (ordered->file_offset + ordered->len >= 3849 mod_start + mod_len) 3850 mod_len = ordered->file_offset - mod_start; 3851 /* 3852 * If we have this case 3853 * 3854 * |--------- logged extent ---------| 3855 * |----- ordered extent ----| 3856 * 3857 * Just don't mess with mod_start and mod_len, we'll 3858 * just end up logging more csums than we need and it 3859 * will be ok.
3860 */ 3861 } else { 3862 if (ordered->file_offset + ordered->len < 3863 mod_start + mod_len) { 3864 mod_len = (mod_start + mod_len) - 3865 (ordered->file_offset + ordered->len); 3866 mod_start = ordered->file_offset + 3867 ordered->len; 3868 } else { 3869 mod_len = 0; 3870 } 3871 } 3872 3873 if (skip_csum) 3874 continue; 3875 3876 /* 3877 * To keep us from looping for the above case of an ordered 3878 * extent that falls inside of the logged extent. 3879 */ 3880 if (test_and_set_bit(BTRFS_ORDERED_LOGGED_CSUM, 3881 &ordered->flags)) 3882 continue; 3883 3884 if (ordered->csum_bytes_left) { 3885 btrfs_start_ordered_extent(inode, ordered, 0); 3886 wait_event(ordered->wait, 3887 ordered->csum_bytes_left == 0); 3888 } 3889 3890 list_for_each_entry(sum, &ordered->list, list) { 3891 ret = btrfs_csum_file_blocks(trans, log, sum); 3892 if (ret) 3893 break; 3894 } 3895 } 3896 3897 if (*ordered_io_error || !mod_len || ret || skip_csum) 3898 return ret; 3899 3900 if (em->compress_type) { 3901 csum_offset = 0; 3902 csum_len = max(em->block_len, em->orig_block_len); 3903 } else { 3904 csum_offset = mod_start - em->start; 3905 csum_len = mod_len; 3906 } 3907 3908 /* block start is already adjusted for the file extent offset. */ 3909 ret = btrfs_lookup_csums_range(log->fs_info->csum_root, 3910 em->block_start + csum_offset, 3911 em->block_start + csum_offset + 3912 csum_len - 1, &ordered_sums, 0); 3913 if (ret) 3914 return ret; 3915 3916 while (!list_empty(&ordered_sums)) { 3917 struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next, 3918 struct btrfs_ordered_sum, 3919 list); 3920 if (!ret) 3921 ret = btrfs_csum_file_blocks(trans, log, sums); 3922 list_del(&sums->list); 3923 kfree(sums); 3924 } 3925 3926 return ret; 3927 } 3928 3929 static int log_one_extent(struct btrfs_trans_handle *trans, 3930 struct inode *inode, struct btrfs_root *root, 3931 const struct extent_map *em, 3932 struct btrfs_path *path, 3933 const struct list_head *logged_list, 3934 struct btrfs_log_ctx *ctx) 3935 { 3936 struct btrfs_root *log = root->log_root; 3937 struct btrfs_file_extent_item *fi; 3938 struct extent_buffer *leaf; 3939 struct btrfs_map_token token; 3940 struct btrfs_key key; 3941 u64 extent_offset = em->start - em->orig_start; 3942 u64 block_len; 3943 int ret; 3944 int extent_inserted = 0; 3945 bool ordered_io_err = false; 3946 3947 ret = wait_ordered_extents(trans, inode, root, em, logged_list, 3948 &ordered_io_err); 3949 if (ret) 3950 return ret; 3951 3952 if (ordered_io_err) { 3953 ctx->io_err = -EIO; 3954 return 0; 3955 } 3956 3957 btrfs_init_map_token(&token); 3958 3959 ret = __btrfs_drop_extents(trans, log, inode, path, em->start, 3960 em->start + em->len, NULL, 0, 1, 3961 sizeof(*fi), &extent_inserted); 3962 if (ret) 3963 return ret; 3964 3965 if (!extent_inserted) { 3966 key.objectid = btrfs_ino(inode); 3967 key.type = BTRFS_EXTENT_DATA_KEY; 3968 key.offset = em->start; 3969 3970 ret = btrfs_insert_empty_item(trans, log, path, &key, 3971 sizeof(*fi)); 3972 if (ret) 3973 return ret; 3974 } 3975 leaf = path->nodes[0]; 3976 fi = btrfs_item_ptr(leaf, path->slots[0], 3977 struct btrfs_file_extent_item); 3978 3979 btrfs_set_token_file_extent_generation(leaf, fi, trans->transid, 3980 &token); 3981 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) 3982 btrfs_set_token_file_extent_type(leaf, fi, 3983 BTRFS_FILE_EXTENT_PREALLOC, 3984 &token); 3985 else 3986 btrfs_set_token_file_extent_type(leaf, fi, 3987 BTRFS_FILE_EXTENT_REG, 3988 &token); 3989 3990 block_len = max(em->block_len, em->orig_block_len); 3991 if 
(em->compress_type != BTRFS_COMPRESS_NONE) { 3992 btrfs_set_token_file_extent_disk_bytenr(leaf, fi, 3993 em->block_start, 3994 &token); 3995 btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len, 3996 &token); 3997 } else if (em->block_start < EXTENT_MAP_LAST_BYTE) { 3998 btrfs_set_token_file_extent_disk_bytenr(leaf, fi, 3999 em->block_start - 4000 extent_offset, &token); 4001 btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len, 4002 &token); 4003 } else { 4004 btrfs_set_token_file_extent_disk_bytenr(leaf, fi, 0, &token); 4005 btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, 0, 4006 &token); 4007 } 4008 4009 btrfs_set_token_file_extent_offset(leaf, fi, extent_offset, &token); 4010 btrfs_set_token_file_extent_num_bytes(leaf, fi, em->len, &token); 4011 btrfs_set_token_file_extent_ram_bytes(leaf, fi, em->ram_bytes, &token); 4012 btrfs_set_token_file_extent_compression(leaf, fi, em->compress_type, 4013 &token); 4014 btrfs_set_token_file_extent_encryption(leaf, fi, 0, &token); 4015 btrfs_set_token_file_extent_other_encoding(leaf, fi, 0, &token); 4016 btrfs_mark_buffer_dirty(leaf); 4017 4018 btrfs_release_path(path); 4019 4020 return ret; 4021 } 4022 4023 static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, 4024 struct btrfs_root *root, 4025 struct inode *inode, 4026 struct btrfs_path *path, 4027 struct list_head *logged_list, 4028 struct btrfs_log_ctx *ctx) 4029 { 4030 struct extent_map *em, *n; 4031 struct list_head extents; 4032 struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree; 4033 u64 test_gen; 4034 int ret = 0; 4035 int num = 0; 4036 4037 INIT_LIST_HEAD(&extents); 4038 4039 write_lock(&tree->lock); 4040 test_gen = root->fs_info->last_trans_committed; 4041 4042 list_for_each_entry_safe(em, n, &tree->modified_extents, list) { 4043 list_del_init(&em->list); 4044 4045 /* 4046 * Just an arbitrary number, this can be really CPU intensive 4047 * once we start getting a lot of extents, and really once we 4048 * have a bunch of extents we just want to commit since it will 4049 * be faster. 4050 */ 4051 if (++num > 32768) { 4052 list_del_init(&tree->modified_extents); 4053 ret = -EFBIG; 4054 goto process; 4055 } 4056 4057 if (em->generation <= test_gen) 4058 continue; 4059 /* Need a ref to keep it from getting evicted from cache */ 4060 atomic_inc(&em->refs); 4061 set_bit(EXTENT_FLAG_LOGGING, &em->flags); 4062 list_add_tail(&em->list, &extents); 4063 num++; 4064 } 4065 4066 list_sort(NULL, &extents, extent_cmp); 4067 4068 process: 4069 while (!list_empty(&extents)) { 4070 em = list_entry(extents.next, struct extent_map, list); 4071 4072 list_del_init(&em->list); 4073 4074 /* 4075 * If we had an error we just need to delete everybody from our 4076 * private list. 
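 * Each em on this list was pinned with an extra reference and had
 * EXTENT_FLAG_LOGGING set when it was collected above, so the cleanup
 * below must clear that flag and drop the reference in both the error
 * and the success paths.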
4077 */ 4078 if (ret) { 4079 clear_em_logging(tree, em); 4080 free_extent_map(em); 4081 continue; 4082 } 4083 4084 write_unlock(&tree->lock); 4085 4086 ret = log_one_extent(trans, inode, root, em, path, logged_list, 4087 ctx); 4088 write_lock(&tree->lock); 4089 clear_em_logging(tree, em); 4090 free_extent_map(em); 4091 } 4092 WARN_ON(!list_empty(&extents)); 4093 write_unlock(&tree->lock); 4094 4095 btrfs_release_path(path); 4096 return ret; 4097 } 4098 4099 static int logged_inode_size(struct btrfs_root *log, struct inode *inode, 4100 struct btrfs_path *path, u64 *size_ret) 4101 { 4102 struct btrfs_key key; 4103 int ret; 4104 4105 key.objectid = btrfs_ino(inode); 4106 key.type = BTRFS_INODE_ITEM_KEY; 4107 key.offset = 0; 4108 4109 ret = btrfs_search_slot(NULL, log, &key, path, 0, 0); 4110 if (ret < 0) { 4111 return ret; 4112 } else if (ret > 0) { 4113 *size_ret = 0; 4114 } else { 4115 struct btrfs_inode_item *item; 4116 4117 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 4118 struct btrfs_inode_item); 4119 *size_ret = btrfs_inode_size(path->nodes[0], item); 4120 } 4121 4122 btrfs_release_path(path); 4123 return 0; 4124 } 4125 4126 /* log a single inode in the tree log. 4127 * At least one parent directory for this inode must exist in the tree 4128 * or be logged already. 4129 * 4130 * Any items from this inode changed by the current transaction are copied 4131 * to the log tree. An extra reference is taken on any extents in this 4132 * file, allowing us to avoid a whole pile of corner cases around logging 4133 * blocks that have been removed from the tree. 4134 * 4135 * See LOG_INODE_ALL and related defines for a description of what inode_only 4136 * does. 4137 * 4138 * This handles both files and directories. 4139 */ 4140 static int btrfs_log_inode(struct btrfs_trans_handle *trans, 4141 struct btrfs_root *root, struct inode *inode, 4142 int inode_only, 4143 const loff_t start, 4144 const loff_t end, 4145 struct btrfs_log_ctx *ctx) 4146 { 4147 struct btrfs_path *path; 4148 struct btrfs_path *dst_path; 4149 struct btrfs_key min_key; 4150 struct btrfs_key max_key; 4151 struct btrfs_root *log = root->log_root; 4152 struct extent_buffer *src = NULL; 4153 LIST_HEAD(logged_list); 4154 u64 last_extent = 0; 4155 int err = 0; 4156 int ret; 4157 int nritems; 4158 int ins_start_slot = 0; 4159 int ins_nr; 4160 bool fast_search = false; 4161 u64 ino = btrfs_ino(inode); 4162 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 4163 u64 logged_isize = 0; 4164 4165 path = btrfs_alloc_path(); 4166 if (!path) 4167 return -ENOMEM; 4168 dst_path = btrfs_alloc_path(); 4169 if (!dst_path) { 4170 btrfs_free_path(path); 4171 return -ENOMEM; 4172 } 4173 4174 min_key.objectid = ino; 4175 min_key.type = BTRFS_INODE_ITEM_KEY; 4176 min_key.offset = 0; 4177 4178 max_key.objectid = ino; 4179 4180 4181 /* today the code can only do partial logging of directories */ 4182 if (S_ISDIR(inode->i_mode) || 4183 (!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 4184 &BTRFS_I(inode)->runtime_flags) && 4185 inode_only == LOG_INODE_EXISTS)) 4186 max_key.type = BTRFS_XATTR_ITEM_KEY; 4187 else 4188 max_key.type = (u8)-1; 4189 max_key.offset = (u64)-1; 4190 4191 /* 4192 * Only run delayed items if we are a dir or a new file. 4193 * Otherwise commit the delayed inode only, which is needed in 4194 * order for the log replay code to mark inodes for link count 4195 * fixup (create temporary BTRFS_TREE_LOG_FIXUP_OBJECTID items). 
4196 */ 4197 if (S_ISDIR(inode->i_mode) || 4198 BTRFS_I(inode)->generation > root->fs_info->last_trans_committed) 4199 ret = btrfs_commit_inode_delayed_items(trans, inode); 4200 else 4201 ret = btrfs_commit_inode_delayed_inode(inode); 4202 4203 if (ret) { 4204 btrfs_free_path(path); 4205 btrfs_free_path(dst_path); 4206 return ret; 4207 } 4208 4209 mutex_lock(&BTRFS_I(inode)->log_mutex); 4210 4211 btrfs_get_logged_extents(inode, &logged_list, start, end); 4212 4213 /* 4214 * a brute force approach to making sure we get the most uptodate 4215 * copies of everything. 4216 */ 4217 if (S_ISDIR(inode->i_mode)) { 4218 int max_key_type = BTRFS_DIR_LOG_INDEX_KEY; 4219 4220 if (inode_only == LOG_INODE_EXISTS) 4221 max_key_type = BTRFS_XATTR_ITEM_KEY; 4222 ret = drop_objectid_items(trans, log, path, ino, max_key_type); 4223 } else { 4224 if (inode_only == LOG_INODE_EXISTS) { 4225 /* 4226 * Make sure the new inode item we write to the log has 4227 * the same isize as the current one (if it exists). 4228 * This is necessary to prevent data loss after log 4229 * replay, and also to prevent doing a wrong expanding 4230 * truncate - e.g. create file, write 4K into offset 4231 * 0, fsync, write 4K into offset 4096, add hard link, 4232 * fsync some other file (to sync log), power fail - if 4233 * we use the inode's current i_size, after log replay 4234 * we get an 8Kb file, with the last 4Kb extent as a hole 4235 * (zeroes), as if an expanding truncate happened, 4236 * instead of getting a file of 4Kb only. 4237 */ 4238 err = logged_inode_size(log, inode, path, 4239 &logged_isize); 4240 if (err) 4241 goto out_unlock; 4242 } 4243 if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 4244 &BTRFS_I(inode)->runtime_flags)) { 4245 if (inode_only == LOG_INODE_EXISTS) { 4246 max_key.type = BTRFS_XATTR_ITEM_KEY; 4247 ret = drop_objectid_items(trans, log, path, ino, 4248 max_key.type); 4249 } else { 4250 clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 4251 &BTRFS_I(inode)->runtime_flags); 4252 clear_bit(BTRFS_INODE_COPY_EVERYTHING, 4253 &BTRFS_I(inode)->runtime_flags); 4254 while (1) { 4255 ret = btrfs_truncate_inode_items(trans, 4256 log, inode, 0, 0); 4257 if (ret != -EAGAIN) 4258 break; 4259 } 4260 } 4261 } else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING, 4262 &BTRFS_I(inode)->runtime_flags) || 4263 inode_only == LOG_INODE_EXISTS) { 4264 if (inode_only == LOG_INODE_ALL) 4265 fast_search = true; 4266 max_key.type = BTRFS_XATTR_ITEM_KEY; 4267 ret = drop_objectid_items(trans, log, path, ino, 4268 max_key.type); 4269 } else { 4270 if (inode_only == LOG_INODE_ALL) 4271 fast_search = true; 4272 ret = log_inode_item(trans, log, dst_path, inode); 4273 if (ret) { 4274 err = ret; 4275 goto out_unlock; 4276 } 4277 goto log_extents; 4278 } 4279 4280 } 4281 if (ret) { 4282 err = ret; 4283 goto out_unlock; 4284 } 4285 4286 while (1) { 4287 ins_nr = 0; 4288 ret = btrfs_search_forward(root, &min_key, 4289 path, trans->transid); 4290 if (ret != 0) 4291 break; 4292 again: 4293 /* note, ins_nr might be > 0 here, cleanup outside the loop */ 4294 if (min_key.objectid != ino) 4295 break; 4296 if (min_key.type > max_key.type) 4297 break; 4298 4299 src = path->nodes[0]; 4300 if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) { 4301 ins_nr++; 4302 goto next_slot; 4303 } else if (!ins_nr) { 4304 ins_start_slot = path->slots[0]; 4305 ins_nr = 1; 4306 goto next_slot; 4307 } 4308 4309 ret = copy_items(trans, inode, dst_path, path, &last_extent, 4310 ins_start_slot, ins_nr, inode_only, 4311 logged_isize); 4312 if (ret < 0) { 4313 err = ret; 4314
goto out_unlock; 4315 } 4316 if (ret) { 4317 ins_nr = 0; 4318 btrfs_release_path(path); 4319 continue; 4320 } 4321 ins_nr = 1; 4322 ins_start_slot = path->slots[0]; 4323 next_slot: 4324 4325 nritems = btrfs_header_nritems(path->nodes[0]); 4326 path->slots[0]++; 4327 if (path->slots[0] < nritems) { 4328 btrfs_item_key_to_cpu(path->nodes[0], &min_key, 4329 path->slots[0]); 4330 goto again; 4331 } 4332 if (ins_nr) { 4333 ret = copy_items(trans, inode, dst_path, path, 4334 &last_extent, ins_start_slot, 4335 ins_nr, inode_only, logged_isize); 4336 if (ret < 0) { 4337 err = ret; 4338 goto out_unlock; 4339 } 4340 ret = 0; 4341 ins_nr = 0; 4342 } 4343 btrfs_release_path(path); 4344 4345 if (min_key.offset < (u64)-1) { 4346 min_key.offset++; 4347 } else if (min_key.type < max_key.type) { 4348 min_key.type++; 4349 min_key.offset = 0; 4350 } else { 4351 break; 4352 } 4353 } 4354 if (ins_nr) { 4355 ret = copy_items(trans, inode, dst_path, path, &last_extent, 4356 ins_start_slot, ins_nr, inode_only, 4357 logged_isize); 4358 if (ret < 0) { 4359 err = ret; 4360 goto out_unlock; 4361 } 4362 ret = 0; 4363 ins_nr = 0; 4364 } 4365 4366 log_extents: 4367 btrfs_release_path(path); 4368 btrfs_release_path(dst_path); 4369 if (fast_search) { 4370 /* 4371 * Some ordered extents started by fsync might have completed 4372 * before we collected the ordered extents in logged_list, which 4373 * means they're gone, not in our logged_list nor in the inode's 4374 * ordered tree. We want the application/user space to know an 4375 * error happened while attempting to persist file data so that 4376 * it can take proper action. If such an error happened, we leave 4377 * without writing to the log tree and the fsync must report the 4378 * file data write error and not commit the current transaction. 4379 */ 4380 err = btrfs_inode_check_errors(inode); 4381 if (err) { 4382 ctx->io_err = err; 4383 goto out_unlock; 4384 } 4385 ret = btrfs_log_changed_extents(trans, root, inode, dst_path, 4386 &logged_list, ctx); 4387 if (ret) { 4388 err = ret; 4389 goto out_unlock; 4390 } 4391 } else if (inode_only == LOG_INODE_ALL) { 4392 struct extent_map *em, *n; 4393 4394 write_lock(&em_tree->lock); 4395 /* 4396 * We can't just remove every em if we're called for a ranged 4397 * fsync - that is, one that doesn't cover the whole possible 4398 * file range (0 to LLONG_MAX). This is because we can have 4399 * em's that fall outside the range we're logging and therefore 4400 * their ordered operations haven't completed yet 4401 * (btrfs_finish_ordered_io() not invoked yet). This means we 4402 * didn't get their respective file extent item in the fs/subvol 4403 * tree yet, and need to let the next fast fsync (one which 4404 * consults the list of modified extent maps) find the em so 4405 * that it logs a matching file extent item and waits for the 4406 * respective ordered operation to complete (if it's still 4407 * running). 4408 * 4409 * Removing every em outside the range we're logging would make 4410 * the next fast fsync not log their matching file extent items, 4411 * therefore making us lose data after a log replay.
4412 */ 4413 list_for_each_entry_safe(em, n, &em_tree->modified_extents, 4414 list) { 4415 const u64 mod_end = em->mod_start + em->mod_len - 1; 4416 4417 if (em->mod_start >= start && mod_end <= end) 4418 list_del_init(&em->list); 4419 } 4420 write_unlock(&em_tree->lock); 4421 } 4422 4423 if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) { 4424 ret = log_directory_changes(trans, root, inode, path, dst_path, 4425 ctx); 4426 if (ret) { 4427 err = ret; 4428 goto out_unlock; 4429 } 4430 } 4431 4432 spin_lock(&BTRFS_I(inode)->lock); 4433 BTRFS_I(inode)->logged_trans = trans->transid; 4434 BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->last_sub_trans; 4435 spin_unlock(&BTRFS_I(inode)->lock); 4436 out_unlock: 4437 if (unlikely(err)) 4438 btrfs_put_logged_extents(&logged_list); 4439 else 4440 btrfs_submit_logged_extents(&logged_list, log); 4441 mutex_unlock(&BTRFS_I(inode)->log_mutex); 4442 4443 btrfs_free_path(path); 4444 btrfs_free_path(dst_path); 4445 return err; 4446 } 4447 4448 /* 4449 * follow the dentry parent pointers up the chain and see if any 4450 * of the directories in it require a full commit before they can 4451 * be logged. Returns zero if nothing special needs to be done or 1 if 4452 * a full commit is required. 4453 */ 4454 static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans, 4455 struct inode *inode, 4456 struct dentry *parent, 4457 struct super_block *sb, 4458 u64 last_committed) 4459 { 4460 int ret = 0; 4461 struct btrfs_root *root; 4462 struct dentry *old_parent = NULL; 4463 struct inode *orig_inode = inode; 4464 4465 /* 4466 * for regular files, if the inode is already on disk, we don't 4467 * have to worry about the parents at all. This is because 4468 * we can use the last_unlink_trans field to record renames 4469 * and other fun in this file. 4470 */ 4471 if (S_ISREG(inode->i_mode) && 4472 BTRFS_I(inode)->generation <= last_committed && 4473 BTRFS_I(inode)->last_unlink_trans <= last_committed) 4474 goto out; 4475 4476 if (!S_ISDIR(inode->i_mode)) { 4477 if (!parent || d_really_is_negative(parent) || sb != d_inode(parent)->i_sb) 4478 goto out; 4479 inode = d_inode(parent); 4480 } 4481 4482 while (1) { 4483 /* 4484 * If we are logging a directory then we start with our inode, 4485 * not our parent's inode, so we need to skip setting the 4486 * logged_trans so that further down in the log code we don't 4487 * think this inode has already been logged. 4488 */ 4489 if (inode != orig_inode) 4490 BTRFS_I(inode)->logged_trans = trans->transid; 4491 smp_mb(); 4492 4493 if (BTRFS_I(inode)->last_unlink_trans > last_committed) { 4494 root = BTRFS_I(inode)->root; 4495 4496 /* 4497 * make sure any commits to the log are forced 4498 * to be full commits 4499 */ 4500 btrfs_set_log_full_commit(root->fs_info, trans); 4501 ret = 1; 4502 break; 4503 } 4504 4505 if (!parent || d_really_is_negative(parent) || sb != d_inode(parent)->i_sb) 4506 break; 4507 4508 if (IS_ROOT(parent)) 4509 break; 4510 4511 parent = dget_parent(parent); 4512 dput(old_parent); 4513 old_parent = parent; 4514 inode = d_inode(parent); 4515 4516 } 4517 dput(old_parent); 4518 out: 4519 return ret; 4520 } 4521 4522 struct btrfs_dir_list { 4523 u64 ino; 4524 struct list_head list; 4525 }; 4526 4527 /* 4528 * Log the inodes of the new dentries of a directory. See log_dir_items() for 4529 * details about why it is needed.
4530 * This is a recursive operation - if an existing dentry corresponds to a 4531 * directory, that directory's new entries are logged too (same behaviour as 4532 * ext3/4, xfs, f2fs, reiserfs, nilfs2). Note that when logging the inodes 4533 * the dentries point to we do not lock their i_mutex, otherwise lockdep 4534 * complains about the following circular lock dependency / possible deadlock: 4535 * 4536 * CPU0 CPU1 4537 * ---- ---- 4538 * lock(&type->i_mutex_dir_key#3/2); 4539 * lock(sb_internal#2); 4540 * lock(&type->i_mutex_dir_key#3/2); 4541 * lock(&sb->s_type->i_mutex_key#14); 4542 * 4543 * Where sb_internal is the lock (a counter that works as a lock) acquired by 4544 * sb_start_intwrite() in btrfs_start_transaction(). 4545 * Not locking i_mutex of the inodes is still safe because: 4546 * 4547 * 1) For regular files we log with a mode of LOG_INODE_EXISTS. It's possible 4548 * that while logging the inode new references (names) are added or removed 4549 * from the inode, leaving the logged inode item with a link count that does 4550 * not match the number of logged inode reference items. This is fine because 4551 * at log replay time we compute the real number of links and correct the 4552 * link count in the inode item (see replay_one_buffer() and 4553 * link_to_fixup_dir()); 4554 * 4555 * 2) For directories we log with a mode of LOG_INODE_ALL. It's possible that 4556 * while logging the inode's items new items with keys BTRFS_DIR_ITEM_KEY and 4557 * BTRFS_DIR_INDEX_KEY are added to fs/subvol tree and the logged inode item 4558 * has a size that doesn't match the sum of the lengths of all the logged 4559 * names. This does not result in a problem because if a dir_item key is 4560 * logged but its matching dir_index key is not logged, at log replay time we 4561 * don't use it to replay the respective name (see replay_one_name()). On the 4562 * other hand if only the dir_index key ends up being logged, the respective 4563 * name is added to the fs/subvol tree with both the dir_item and dir_index 4564 * keys created (see replay_one_name()). 4565 * The directory's inode item with a wrong i_size is not a problem as well, 4566 * since we don't use it at log replay time to set the i_size in the inode 4567 * item of the fs/subvol tree (see overwrite_item()). 
4568 */ 4569 static int log_new_dir_dentries(struct btrfs_trans_handle *trans, 4570 struct btrfs_root *root, 4571 struct inode *start_inode, 4572 struct btrfs_log_ctx *ctx) 4573 { 4574 struct btrfs_root *log = root->log_root; 4575 struct btrfs_path *path; 4576 LIST_HEAD(dir_list); 4577 struct btrfs_dir_list *dir_elem; 4578 int ret = 0; 4579 4580 path = btrfs_alloc_path(); 4581 if (!path) 4582 return -ENOMEM; 4583 4584 dir_elem = kmalloc(sizeof(*dir_elem), GFP_NOFS); 4585 if (!dir_elem) { 4586 btrfs_free_path(path); 4587 return -ENOMEM; 4588 } 4589 dir_elem->ino = btrfs_ino(start_inode); 4590 list_add_tail(&dir_elem->list, &dir_list); 4591 4592 while (!list_empty(&dir_list)) { 4593 struct extent_buffer *leaf; 4594 struct btrfs_key min_key; 4595 int nritems; 4596 int i; 4597 4598 dir_elem = list_first_entry(&dir_list, struct btrfs_dir_list, 4599 list); 4600 if (ret) 4601 goto next_dir_inode; 4602 4603 min_key.objectid = dir_elem->ino; 4604 min_key.type = BTRFS_DIR_ITEM_KEY; 4605 min_key.offset = 0; 4606 again: 4607 btrfs_release_path(path); 4608 ret = btrfs_search_forward(log, &min_key, path, trans->transid); 4609 if (ret < 0) { 4610 goto next_dir_inode; 4611 } else if (ret > 0) { 4612 ret = 0; 4613 goto next_dir_inode; 4614 } 4615 4616 process_leaf: 4617 leaf = path->nodes[0]; 4618 nritems = btrfs_header_nritems(leaf); 4619 for (i = path->slots[0]; i < nritems; i++) { 4620 struct btrfs_dir_item *di; 4621 struct btrfs_key di_key; 4622 struct inode *di_inode; 4623 struct btrfs_dir_list *new_dir_elem; 4624 int log_mode = LOG_INODE_EXISTS; 4625 int type; 4626 4627 btrfs_item_key_to_cpu(leaf, &min_key, i); 4628 if (min_key.objectid != dir_elem->ino || 4629 min_key.type != BTRFS_DIR_ITEM_KEY) 4630 goto next_dir_inode; 4631 4632 di = btrfs_item_ptr(leaf, i, struct btrfs_dir_item); 4633 type = btrfs_dir_type(leaf, di); 4634 if (btrfs_dir_transid(leaf, di) < trans->transid && 4635 type != BTRFS_FT_DIR) 4636 continue; 4637 btrfs_dir_item_key_to_cpu(leaf, di, &di_key); 4638 if (di_key.type == BTRFS_ROOT_ITEM_KEY) 4639 continue; 4640 4641 di_inode = btrfs_iget(root->fs_info->sb, &di_key, 4642 root, NULL); 4643 if (IS_ERR(di_inode)) { 4644 ret = PTR_ERR(di_inode); 4645 goto next_dir_inode; 4646 } 4647 4648 if (btrfs_inode_in_log(di_inode, trans->transid)) { 4649 iput(di_inode); 4650 continue; 4651 } 4652 4653 ctx->log_new_dentries = false; 4654 if (type == BTRFS_FT_DIR) 4655 log_mode = LOG_INODE_ALL; 4656 btrfs_release_path(path); 4657 ret = btrfs_log_inode(trans, root, di_inode, 4658 log_mode, 0, LLONG_MAX, ctx); 4659 iput(di_inode); 4660 if (ret) 4661 goto next_dir_inode; 4662 if (ctx->log_new_dentries) { 4663 new_dir_elem = kmalloc(sizeof(*new_dir_elem), 4664 GFP_NOFS); 4665 if (!new_dir_elem) { 4666 ret = -ENOMEM; 4667 goto next_dir_inode; 4668 } 4669 new_dir_elem->ino = di_key.objectid; 4670 list_add_tail(&new_dir_elem->list, &dir_list); 4671 } 4672 break; 4673 } 4674 if (i == nritems) { 4675 ret = btrfs_next_leaf(log, path); 4676 if (ret < 0) { 4677 goto next_dir_inode; 4678 } else if (ret > 0) { 4679 ret = 0; 4680 goto next_dir_inode; 4681 } 4682 goto process_leaf; 4683 } 4684 if (min_key.offset < (u64)-1) { 4685 min_key.offset++; 4686 goto again; 4687 } 4688 next_dir_inode: 4689 list_del(&dir_elem->list); 4690 kfree(dir_elem); 4691 } 4692 4693 btrfs_free_path(path); 4694 return ret; 4695 } 4696 4697 /* 4698 * helper function around btrfs_log_inode to make sure newly created 4699 * parent directories also end up in the log. 
Only minimal inode and backref 4700 * logging is done for any parent directories that are older than 4701 * the last committed transaction. 4702 */ 4703 static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, 4704 struct btrfs_root *root, struct inode *inode, 4705 struct dentry *parent, 4706 const loff_t start, 4707 const loff_t end, 4708 int exists_only, 4709 struct btrfs_log_ctx *ctx) 4710 { 4711 int inode_only = exists_only ? LOG_INODE_EXISTS : LOG_INODE_ALL; 4712 struct super_block *sb; 4713 struct dentry *old_parent = NULL; 4714 int ret = 0; 4715 u64 last_committed = root->fs_info->last_trans_committed; 4716 const struct dentry * const first_parent = parent; 4717 const bool did_unlink = (BTRFS_I(inode)->last_unlink_trans > 4718 last_committed); 4719 bool log_dentries = false; 4720 struct inode *orig_inode = inode; 4721 4722 sb = inode->i_sb; 4723 4724 if (btrfs_test_opt(root, NOTREELOG)) { 4725 ret = 1; 4726 goto end_no_trans; 4727 } 4728 4729 /* 4730 * If the previous transaction commit didn't complete, we need to do a 4731 * full commit ourselves. 4732 */ 4733 if (root->fs_info->last_trans_log_full_commit > 4734 root->fs_info->last_trans_committed) { 4735 ret = 1; 4736 goto end_no_trans; 4737 } 4738 4739 if (root != BTRFS_I(inode)->root || 4740 btrfs_root_refs(&root->root_item) == 0) { 4741 ret = 1; 4742 goto end_no_trans; 4743 } 4744 4745 ret = check_parent_dirs_for_sync(trans, inode, parent, 4746 sb, last_committed); 4747 if (ret) 4748 goto end_no_trans; 4749 4750 if (btrfs_inode_in_log(inode, trans->transid)) { 4751 ret = BTRFS_NO_LOG_SYNC; 4752 goto end_no_trans; 4753 } 4754 4755 ret = start_log_trans(trans, root, ctx); 4756 if (ret) 4757 goto end_no_trans; 4758 4759 ret = btrfs_log_inode(trans, root, inode, inode_only, start, end, ctx); 4760 if (ret) 4761 goto end_trans; 4762 4763 /* 4764 * for regular files, if the inode is already on disk, we don't 4765 * have to worry about the parents at all. This is because 4766 * we can use the last_unlink_trans field to record renames 4767 * and other fun in this file. 4768 */ 4769 if (S_ISREG(inode->i_mode) && 4770 BTRFS_I(inode)->generation <= last_committed && 4771 BTRFS_I(inode)->last_unlink_trans <= last_committed) { 4772 ret = 0; 4773 goto end_trans; 4774 } 4775 4776 if (S_ISDIR(inode->i_mode) && ctx && ctx->log_new_dentries) 4777 log_dentries = true; 4778 4779 while (1) { 4780 if (!parent || d_really_is_negative(parent) || sb != d_inode(parent)->i_sb) 4781 break; 4782 4783 inode = d_inode(parent); 4784 if (root != BTRFS_I(inode)->root) 4785 break; 4786 4787 /* 4788 * On unlink we must make sure our immediate parent directory 4789 * inode is fully logged. This is to prevent leaving dangling 4790 * directory index entries and a wrong directory inode's i_size. 4791 * Not doing so can result in a directory being impossible to 4792 * delete after log replay (rmdir will always fail with error 4793 * -ENOTEMPTY).
4794 */ 4795 if (did_unlink && parent == first_parent) 4796 inode_only = LOG_INODE_ALL; 4797 else 4798 inode_only = LOG_INODE_EXISTS; 4799 4800 if (BTRFS_I(inode)->generation > 4801 root->fs_info->last_trans_committed || 4802 inode_only == LOG_INODE_ALL) { 4803 ret = btrfs_log_inode(trans, root, inode, inode_only, 4804 0, LLONG_MAX, ctx); 4805 if (ret) 4806 goto end_trans; 4807 } 4808 if (IS_ROOT(parent)) 4809 break; 4810 4811 parent = dget_parent(parent); 4812 dput(old_parent); 4813 old_parent = parent; 4814 } 4815 if (log_dentries) 4816 ret = log_new_dir_dentries(trans, root, orig_inode, ctx); 4817 else 4818 ret = 0; 4819 end_trans: 4820 dput(old_parent); 4821 if (ret < 0) { 4822 btrfs_set_log_full_commit(root->fs_info, trans); 4823 ret = 1; 4824 } 4825 4826 if (ret) 4827 btrfs_remove_log_ctx(root, ctx); 4828 btrfs_end_log_trans(root); 4829 end_no_trans: 4830 return ret; 4831 } 4832 4833 /* 4834 * it is not safe to log a dentry if the chunk root has added new 4835 * chunks. This returns 0 if the dentry was logged, and 1 otherwise. 4836 * If this returns 1, you must commit the transaction to safely get your 4837 * data on disk. 4838 */ 4839 int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, 4840 struct btrfs_root *root, struct dentry *dentry, 4841 const loff_t start, 4842 const loff_t end, 4843 struct btrfs_log_ctx *ctx) 4844 { 4845 struct dentry *parent = dget_parent(dentry); 4846 int ret; 4847 4848 ret = btrfs_log_inode_parent(trans, root, d_inode(dentry), parent, 4849 start, end, 0, ctx); 4850 dput(parent); 4851 4852 return ret; 4853 } 4854 4855 /* 4856 * should be called during mount to replay any log trees 4857 * from the FS 4858 */ 4859 int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) 4860 { 4861 int ret; 4862 struct btrfs_path *path; 4863 struct btrfs_trans_handle *trans; 4864 struct btrfs_key key; 4865 struct btrfs_key found_key; 4866 struct btrfs_key tmp_key; 4867 struct btrfs_root *log; 4868 struct btrfs_fs_info *fs_info = log_root_tree->fs_info; 4869 struct walk_control wc = { 4870 .process_func = process_one_buffer, 4871 .stage = 0, 4872 }; 4873 4874 path = btrfs_alloc_path(); 4875 if (!path) 4876 return -ENOMEM; 4877 4878 fs_info->log_root_recovering = 1; 4879 4880 trans = btrfs_start_transaction(fs_info->tree_root, 0); 4881 if (IS_ERR(trans)) { 4882 ret = PTR_ERR(trans); 4883 goto error; 4884 } 4885 4886 wc.trans = trans; 4887 wc.pin = 1; 4888 4889 ret = walk_log_tree(trans, log_root_tree, &wc); 4890 if (ret) { 4891 btrfs_error(fs_info, ret, "Failed to pin buffers while " 4892 "recovering log root tree."); 4893 goto error; 4894 } 4895 4896 again: 4897 key.objectid = BTRFS_TREE_LOG_OBJECTID; 4898 key.offset = (u64)-1; 4899 key.type = BTRFS_ROOT_ITEM_KEY; 4900 4901 while (1) { 4902 ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0); 4903 4904 if (ret < 0) { 4905 btrfs_error(fs_info, ret, 4906 "Couldn't find tree log root."); 4907 goto error; 4908 } 4909 if (ret > 0) { 4910 if (path->slots[0] == 0) 4911 break; 4912 path->slots[0]--; 4913 } 4914 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 4915 path->slots[0]); 4916 btrfs_release_path(path); 4917 if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID) 4918 break; 4919 4920 log = btrfs_read_fs_root(log_root_tree, &found_key); 4921 if (IS_ERR(log)) { 4922 ret = PTR_ERR(log); 4923 btrfs_error(fs_info, ret, 4924 "Couldn't read tree log root."); 4925 goto error; 4926 } 4927 4928 tmp_key.objectid = found_key.offset; 4929 tmp_key.type = BTRFS_ROOT_ITEM_KEY; 4930 tmp_key.offset = (u64)-1; 4931
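/*
 * found_key.offset holds the objectid of the fs/subvol tree this log
 * tree belongs to, so tmp_key now points at the root we must replay
 * the log into.
 */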
4932 wc.replay_dest = btrfs_read_fs_root_no_name(fs_info, &tmp_key); 4933 if (IS_ERR(wc.replay_dest)) { 4934 ret = PTR_ERR(wc.replay_dest); 4935 free_extent_buffer(log->node); 4936 free_extent_buffer(log->commit_root); 4937 kfree(log); 4938 btrfs_error(fs_info, ret, "Couldn't read target root " 4939 "for tree log recovery."); 4940 goto error; 4941 } 4942 4943 wc.replay_dest->log_root = log; 4944 btrfs_record_root_in_trans(trans, wc.replay_dest); 4945 ret = walk_log_tree(trans, log, &wc); 4946 4947 if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) { 4948 ret = fixup_inode_link_counts(trans, wc.replay_dest, 4949 path); 4950 } 4951 4952 key.offset = found_key.offset - 1; 4953 wc.replay_dest->log_root = NULL; 4954 free_extent_buffer(log->node); 4955 free_extent_buffer(log->commit_root); 4956 kfree(log); 4957 4958 if (ret) 4959 goto error; 4960 4961 if (found_key.offset == 0) 4962 break; 4963 } 4964 btrfs_release_path(path); 4965 4966 /* step one is to pin it all, step two is to replay just inodes */ 4967 if (wc.pin) { 4968 wc.pin = 0; 4969 wc.process_func = replay_one_buffer; 4970 wc.stage = LOG_WALK_REPLAY_INODES; 4971 goto again; 4972 } 4973 /* step three is to replay everything */ 4974 if (wc.stage < LOG_WALK_REPLAY_ALL) { 4975 wc.stage++; 4976 goto again; 4977 } 4978 4979 btrfs_free_path(path); 4980 4981 /* step four: commit the transaction, which also unpins the blocks */ 4982 ret = btrfs_commit_transaction(trans, fs_info->tree_root); 4983 if (ret) 4984 return ret; 4985 4986 free_extent_buffer(log_root_tree->node); 4987 log_root_tree->log_root = NULL; 4988 fs_info->log_root_recovering = 0; 4989 kfree(log_root_tree); 4990 4991 return 0; 4992 error: 4993 if (wc.trans) 4994 btrfs_end_transaction(wc.trans, fs_info->tree_root); 4995 btrfs_free_path(path); 4996 return ret; 4997 } 4998 4999 /* 5000 * there are some corner cases where we want to force a full 5001 * commit instead of allowing a directory to be logged. 5002 * 5003 * They revolve around files that were unlinked from the directory, and 5004 * this function updates the parent directory so that a full commit is 5005 * properly done if it is fsync'd later after the unlinks are done. 5006 */ 5007 void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans, 5008 struct inode *dir, struct inode *inode, 5009 int for_rename) 5010 { 5011 /* 5012 * when we're logging a file, if it hasn't been renamed 5013 * or unlinked, and its inode is fully committed on disk, 5014 * we don't have to worry about walking up the directory chain 5015 * to log its parents. 5016 * 5017 * So, we use the last_unlink_trans field to put this transid 5018 * into the file. When the file is logged we check it and 5019 * don't log the parents if the file is fully on disk. 5020 */ 5021 if (S_ISREG(inode->i_mode)) 5022 BTRFS_I(inode)->last_unlink_trans = trans->transid; 5023 5024 /* 5025 * if this directory was already logged any new 5026 * names for this file/dir will get recorded 5027 */ 5028 smp_mb(); 5029 if (BTRFS_I(dir)->logged_trans == trans->transid) 5030 return; 5031 5032 /* 5033 * if the inode we're about to unlink was logged, 5034 * the log will be properly updated for any new names 5035 */ 5036 if (BTRFS_I(inode)->logged_trans == trans->transid) 5037 return; 5038 5039 /* 5040 * when renaming files across directories, if the directory 5041 * we're unlinking from gets fsync'd later on, there's 5042 * no way to find the destination directory later and fsync it 5043 * properly. So, we have to be conservative and force commits 5044 * so the new name gets discovered.
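 *
 * Example: "mv dir1/foo dir2/foo" followed by an fsync of dir1 -
 * nothing would otherwise pull dir2 (and foo's new name) into the
 * log, so only a full commit makes the new name survive a crash.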
5045 */ 5046 if (for_rename) 5047 goto record; 5048 5049 /* we can safely do the unlink without any special recording */ 5050 return; 5051 5052 record: 5053 BTRFS_I(dir)->last_unlink_trans = trans->transid; 5054 } 5055 5056 /* 5057 * Call this after adding a new name for a file and it will properly 5058 * update the log to reflect the new name. 5059 * 5060 * It will return zero if all goes well, and it will return 1 if a 5061 * full transaction commit is required. 5062 */ 5063 int btrfs_log_new_name(struct btrfs_trans_handle *trans, 5064 struct inode *inode, struct inode *old_dir, 5065 struct dentry *parent) 5066 { 5067 struct btrfs_root *root = BTRFS_I(inode)->root; 5068 5069 /* 5070 * this will force the logging code to walk the dentry chain 5071 * up for the file 5072 */ 5073 if (S_ISREG(inode->i_mode)) 5074 BTRFS_I(inode)->last_unlink_trans = trans->transid; 5075 5076 /* 5077 * if this inode hasn't been logged and the directory we're renaming it 5078 * from hasn't been logged, we don't need to log it 5079 */ 5080 if (BTRFS_I(inode)->logged_trans <= 5081 root->fs_info->last_trans_committed && 5082 (!old_dir || BTRFS_I(old_dir)->logged_trans <= 5083 root->fs_info->last_trans_committed)) 5084 return 0; 5085 5086 return btrfs_log_inode_parent(trans, root, inode, parent, 0, 5087 LLONG_MAX, 1, NULL); 5088 } 5089 } 5090
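/*
 * Illustrative sketch only, not kernel code and not part of this file:
 * a minimal userspace program reproducing the expanding-truncate corner
 * case that logged_inode_size() guards against in btrfs_log_inode()
 * above. The file names, the 4K buffer size and the missing error
 * checking are arbitrary choices for illustration; it assumes a POSIX
 * environment with the current directory on a btrfs mount.
 */
#if 0
#include <fcntl.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	char buf[4096];
	int fd, fd2;

	memset(buf, 0xaa, sizeof(buf));

	fd = open("foo", O_CREAT | O_WRONLY, 0644);
	pwrite(fd, buf, sizeof(buf), 0);	/* 4K at offset 0 */
	fsync(fd);				/* log now holds a 4K i_size for foo */
	pwrite(fd, buf, sizeof(buf), 4096);	/* grow foo to 8K, not fsynced */
	link("foo", "foo-link");		/* the new name puts foo back in the log */

	fd2 = open("bar", O_CREAT | O_WRONLY, 0644);
	pwrite(fd2, buf, sizeof(buf), 0);
	fsync(fd2);				/* syncs the log, writing foo's inode item */

	/*
	 * Power failure here: if foo's logged inode item used the current
	 * in-memory i_size (8K) rather than the previously logged 4K,
	 * replay would produce an 8K file whose last 4K is a hole.
	 */
	close(fd);
	close(fd2);
	return 0;
}
#endif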