// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2008 Oracle.  All rights reserved.
 */

#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/list_sort.h>
#include <linux/iversion.h>
#include "misc.h"
#include "ctree.h"
#include "tree-log.h"
#include "disk-io.h"
#include "locking.h"
#include "print-tree.h"
#include "backref.h"
#include "compression.h"
#include "qgroup.h"
#include "block-group.h"
#include "space-info.h"
#include "zoned.h"
#include "inode-item.h"

/* magic values for the inode_only field in btrfs_log_inode:
 *
 * LOG_INODE_ALL means to log everything
 * LOG_INODE_EXISTS means to log just enough to recreate the inode
 * during log replay
 */
enum {
	LOG_INODE_ALL,
	LOG_INODE_EXISTS,
	LOG_OTHER_INODE,
	LOG_OTHER_INODE_ALL,
};

/*
 * directory trouble cases
 *
 * 1) on rename or unlink, if the inode being unlinked isn't in the fsync
 * log, we must force a full commit before doing an fsync of the directory
 * where the unlink was done.
 * ---> record transid of last unlink/rename per directory
 *
 * mkdir foo/some_dir
 * normal commit
 * rename foo/some_dir foo2/some_dir
 * mkdir foo/some_dir
 * fsync foo/some_dir/some_file
 *
 * The fsync above will unlink the original some_dir without recording
 * it in its new location (foo2).  After a crash, some_dir will be gone
 * unless the fsync of some_file forces a full commit
 *
 * 2) we must log any new names for any file or dir that is in the fsync
 * log. ---> check inode while renaming/linking.
 *
 * 2a) we must log any new names for any file or dir during rename
 * when the directory they are being removed from was logged.
 * ---> check inode and old parent dir during rename
 *
 * 2a is actually the more important variant.  With the extra logging
 * a crash might unlink the old name without recreating the new one
 *
 * 3) after a crash, we must go through any directories with a link count
 * of zero and redo the rm -rf
 *
 * mkdir f1/foo
 * normal commit
 * rm -rf f1/foo
 * fsync(f1)
 *
 * The directory f1 was fully removed from the FS, but fsync was never
 * called on f1, only its parent dir.  After a crash the rm -rf must
 * be replayed.  This must be able to recurse down the entire
 * directory tree.  The inode link count fixup code takes care of the
 * ugly details.
 */

/*
 * stages for the tree walking.  The first
 * stage (0) is to only pin down the blocks we find,
 * the second stage (1) is to make sure that all the inodes
 * we find in the log are created in the subvolume.
 *
 * The last stage is to deal with directories and links and extents
 * and all the other fun semantics
 */
enum {
	LOG_WALK_PIN_ONLY,
	LOG_WALK_REPLAY_INODES,
	LOG_WALK_REPLAY_DIR_INDEX,
	LOG_WALK_REPLAY_ALL,
};

static int btrfs_log_inode(struct btrfs_trans_handle *trans,
			   struct btrfs_inode *inode,
			   int inode_only,
			   struct btrfs_log_ctx *ctx);
static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root,
			     struct btrfs_path *path, u64 objectid);
static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
				       struct btrfs_root *root,
				       struct btrfs_root *log,
				       struct btrfs_path *path,
				       u64 dirid, int del_all);
static void wait_log_commit(struct btrfs_root *root, int transid);

/*
 * tree logging is a special write ahead log used to make sure that
 * fsyncs and O_SYNCs can happen without doing full tree commits.
 *
 * Full tree commits are expensive because they require commonly
 * modified blocks to be recowed, creating many dirty pages in the
 * extent tree and a 4x-6x higher write load than ext3.
 *
 * Instead of doing a tree commit on every fsync, we use the
 * key ranges and transaction ids to find items for a given file or directory
 * that have changed in this transaction.  Those items are copied into
 * a special tree (one per subvolume root), that tree is written to disk
 * and then the fsync is considered complete.
 *
 * After a crash, items are copied out of the log-tree back into the
 * subvolume tree.  Any file data extents found are recorded in the extent
 * allocation tree, and the log-tree freed.
 *
 * The log tree is read three times: once to pin down all the extents it is
 * using in ram, once to create all the inodes logged in the tree, and once
 * to do all the other items.
 */

/*
 * start a sub transaction and setup the log tree
 * this increments the log tree writer count to make the people
 * syncing the tree wait for us to finish
 */
static int start_log_trans(struct btrfs_trans_handle *trans,
			   struct btrfs_root *root,
			   struct btrfs_log_ctx *ctx)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_root *tree_root = fs_info->tree_root;
	const bool zoned = btrfs_is_zoned(fs_info);
	int ret = 0;
	bool created = false;

	/*
	 * First check if the log root tree was already created. If not, create
	 * it before locking the root's log_mutex, just to keep lockdep happy.
	 */
	if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &tree_root->state)) {
		mutex_lock(&tree_root->log_mutex);
		if (!fs_info->log_root_tree) {
			ret = btrfs_init_log_root_tree(trans, fs_info);
			if (!ret) {
				set_bit(BTRFS_ROOT_HAS_LOG_TREE, &tree_root->state);
				created = true;
			}
		}
		mutex_unlock(&tree_root->log_mutex);
		if (ret)
			return ret;
	}

	mutex_lock(&root->log_mutex);

again:
	if (root->log_root) {
		int index = (root->log_transid + 1) % 2;

		if (btrfs_need_log_full_commit(trans)) {
			ret = -EAGAIN;
			goto out;
		}

		if (zoned && atomic_read(&root->log_commit[index])) {
			wait_log_commit(root, root->log_transid - 1);
			goto again;
		}

		if (!root->log_start_pid) {
			clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
			root->log_start_pid = current->pid;
		} else if (root->log_start_pid != current->pid) {
			set_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
		}
	} else {
		/*
		 * This means fs_info->log_root_tree was already created
		 * for some other FS trees. Do the full commit not to mix
		 * nodes from multiple log transactions to do sequential
		 * writing.
		 */
		if (zoned && !created) {
			ret = -EAGAIN;
			goto out;
		}

		ret = btrfs_add_log_tree(trans, root);
		if (ret)
			goto out;

		set_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state);
		clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
		root->log_start_pid = current->pid;
	}

	atomic_inc(&root->log_writers);
	if (!ctx->logging_new_name) {
		int index = root->log_transid % 2;

		list_add_tail(&ctx->list, &root->log_ctxs[index]);
		ctx->log_transid = root->log_transid;
	}

out:
	mutex_unlock(&root->log_mutex);
	return ret;
}

/*
 * returns 0 if there was a log transaction running and we were able
 * to join, or returns -ENOENT if there was no transaction
 * in progress
 */
static int join_running_log_trans(struct btrfs_root *root)
{
	const bool zoned = btrfs_is_zoned(root->fs_info);
	int ret = -ENOENT;

	if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state))
		return ret;

	mutex_lock(&root->log_mutex);
again:
	if (root->log_root) {
		int index = (root->log_transid + 1) % 2;

		ret = 0;
		if (zoned && atomic_read(&root->log_commit[index])) {
			wait_log_commit(root, root->log_transid - 1);
			goto again;
		}
		atomic_inc(&root->log_writers);
	}
	mutex_unlock(&root->log_mutex);
	return ret;
}

/*
 * This either makes the current running log transaction wait
 * until you call btrfs_end_log_trans() or it makes any future
 * log transactions wait until you call btrfs_end_log_trans()
 */
void btrfs_pin_log_trans(struct btrfs_root *root)
{
	atomic_inc(&root->log_writers);
}

/*
 * indicate we're done making changes to the log tree
 * and wake up anyone waiting to do a sync
 */
void btrfs_end_log_trans(struct btrfs_root *root)
{
	if (atomic_dec_and_test(&root->log_writers)) {
		/* atomic_dec_and_test implies a barrier */
		cond_wake_up_nomb(&root->log_writer_wait);
	}
}
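
/*
 * Illustrative sketch (unused): fsync paths later in this file, such as
 * btrfs_log_inode_parent(), bracket their log tree updates with the two
 * helpers above roughly like this. The hypothetical example_log_update()
 * is not part of the file; the body between the calls is elided.
 */
static inline int example_log_update(struct btrfs_trans_handle *trans,
				     struct btrfs_root *root,
				     struct btrfs_log_ctx *ctx)
{
	int ret;

	ret = start_log_trans(trans, root, ctx);
	if (ret)
		return ret;
	/* ... copy the changed items into root->log_root here ... */
	btrfs_end_log_trans(root);
	return 0;
}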

static int btrfs_write_tree_block(struct extent_buffer *buf)
{
	return filemap_fdatawrite_range(buf->pages[0]->mapping, buf->start,
					buf->start + buf->len - 1);
}

static void btrfs_wait_tree_block_writeback(struct extent_buffer *buf)
{
	filemap_fdatawait_range(buf->pages[0]->mapping,
				buf->start, buf->start + buf->len - 1);
}

/*
 * the walk control struct is used to pass state down the chain when
 * processing the log tree.  The stage field tells us which part
 * of the log tree processing we are currently doing.  The others
 * are state fields used for that specific part
 */
struct walk_control {
	/* should we free the extent on disk when done?  This is used
	 * at transaction commit time while freeing a log tree
	 */
	int free;

	/* should we write out the extent buffer?  This is used
	 * while flushing the log tree to disk during a sync
	 */
	int write;

	/* should we wait for the extent buffer io to finish?  Also used
	 * while flushing the log tree to disk for a sync
	 */
	int wait;

	/* pin only walk, we record which extents on disk belong to the
	 * log trees
	 */
	int pin;

	/* what stage of the replay code we're currently in */
	int stage;

	/*
	 * Ignore any items from the inode currently being processed. Needs
	 * to be set every time we find a BTRFS_INODE_ITEM_KEY and we are in
	 * the LOG_WALK_REPLAY_INODES stage.
	 */
	bool ignore_cur_inode;

	/* the root we are currently replaying */
	struct btrfs_root *replay_dest;

	/* the trans handle for the current replay */
	struct btrfs_trans_handle *trans;

	/* the function that gets used to process blocks we find in the
	 * tree.  Note the extent_buffer might not be up to date when it is
	 * passed in, and it must be checked or read if you need the data
	 * inside it
	 */
	int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb,
			    struct walk_control *wc, u64 gen, int level);
};

/*
 * process_func used to pin down extents, write them or wait on them
 */
static int process_one_buffer(struct btrfs_root *log,
			      struct extent_buffer *eb,
			      struct walk_control *wc, u64 gen, int level)
{
	struct btrfs_fs_info *fs_info = log->fs_info;
	int ret = 0;

	/*
	 * If this fs is mixed then we need to be able to process the leaves to
	 * pin down any logged extents, so we have to read the block.
	 */
	if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
		ret = btrfs_read_buffer(eb, gen, level, NULL);
		if (ret)
			return ret;
	}

	if (wc->pin)
		ret = btrfs_pin_extent_for_log_replay(wc->trans, eb->start,
						      eb->len);

	if (!ret && btrfs_buffer_uptodate(eb, gen, 0)) {
		if (wc->pin && btrfs_header_level(eb) == 0)
			ret = btrfs_exclude_logged_extents(eb);
		if (wc->write)
			btrfs_write_tree_block(eb);
		if (wc->wait)
			btrfs_wait_tree_block_writeback(eb);
	}
	return ret;
}
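
/*
 * Illustrative sketch (unused, not the real setup): log replay wires up
 * a walk_control for the pin-only pass roughly like this before walking
 * the log tree; the actual initialization lives in
 * btrfs_recover_log_trees() later in this file.
 */
static inline void example_init_pin_walk(struct walk_control *wc,
					 struct btrfs_trans_handle *trans,
					 struct btrfs_root *replay_dest)
{
	memset(wc, 0, sizeof(*wc));
	wc->pin = 1;			/* record which extents the log uses */
	wc->stage = LOG_WALK_PIN_ONLY;	/* first of the replay stages above */
	wc->trans = trans;
	wc->replay_dest = replay_dest;
	wc->process_func = process_one_buffer;
}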

static int do_overwrite_item(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root,
			     struct btrfs_path *path,
			     struct extent_buffer *eb, int slot,
			     struct btrfs_key *key)
{
	int ret;
	u32 item_size;
	u64 saved_i_size = 0;
	int save_old_i_size = 0;
	unsigned long src_ptr;
	unsigned long dst_ptr;
	int overwrite_root = 0;
	bool inode_item = key->type == BTRFS_INODE_ITEM_KEY;

	if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
		overwrite_root = 1;

	item_size = btrfs_item_size(eb, slot);
	src_ptr = btrfs_item_ptr_offset(eb, slot);

	/* Our caller must have done a search for the key for us. */
	ASSERT(path->nodes[0] != NULL);

	/*
	 * And the slot must point to the exact key or the slot where the key
	 * should be at (the first item with a key greater than 'key')
	 */
	if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) {
		struct btrfs_key found_key;

		btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
		ret = btrfs_comp_cpu_keys(&found_key, key);
		ASSERT(ret >= 0);
	} else {
		ret = 1;
	}

	if (ret == 0) {
		char *src_copy;
		char *dst_copy;
		u32 dst_size = btrfs_item_size(path->nodes[0],
					       path->slots[0]);
		if (dst_size != item_size)
			goto insert;

		if (item_size == 0) {
			btrfs_release_path(path);
			return 0;
		}
		dst_copy = kmalloc(item_size, GFP_NOFS);
		src_copy = kmalloc(item_size, GFP_NOFS);
		if (!dst_copy || !src_copy) {
			btrfs_release_path(path);
			kfree(dst_copy);
			kfree(src_copy);
			return -ENOMEM;
		}

		read_extent_buffer(eb, src_copy, src_ptr, item_size);

		dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
		read_extent_buffer(path->nodes[0], dst_copy, dst_ptr,
				   item_size);
		ret = memcmp(dst_copy, src_copy, item_size);

		kfree(dst_copy);
		kfree(src_copy);
		/*
		 * they have the same contents, just return, this saves
		 * us from cowing blocks in the destination tree and doing
		 * extra writes that may not have been done by a previous
		 * sync
		 */
		if (ret == 0) {
			btrfs_release_path(path);
			return 0;
		}

		/*
		 * We need to load the old nbytes into the inode so when we
		 * replay the extents we've logged we get the right nbytes.
		 */
		if (inode_item) {
			struct btrfs_inode_item *item;
			u64 nbytes;
			u32 mode;

			item = btrfs_item_ptr(path->nodes[0], path->slots[0],
					      struct btrfs_inode_item);
			nbytes = btrfs_inode_nbytes(path->nodes[0], item);
			item = btrfs_item_ptr(eb, slot,
					      struct btrfs_inode_item);
			btrfs_set_inode_nbytes(eb, item, nbytes);

			/*
			 * If this is a directory we need to reset the i_size to
			 * 0 so that we can set it up properly when replaying
			 * the rest of the items in this log.
			 */
			mode = btrfs_inode_mode(eb, item);
			if (S_ISDIR(mode))
				btrfs_set_inode_size(eb, item, 0);
		}
	} else if (inode_item) {
		struct btrfs_inode_item *item;
		u32 mode;

		/*
		 * New inode, set nbytes to 0 so that the nbytes comes out
		 * properly when we replay the extents.
		 */
		item = btrfs_item_ptr(eb, slot, struct btrfs_inode_item);
		btrfs_set_inode_nbytes(eb, item, 0);

		/*
		 * If this is a directory we need to reset the i_size to 0 so
		 * that we can set it up properly when replaying the rest of
		 * the items in this log.
		 */
		mode = btrfs_inode_mode(eb, item);
		if (S_ISDIR(mode))
			btrfs_set_inode_size(eb, item, 0);
	}
insert:
	btrfs_release_path(path);
	/* try to insert the key into the destination tree */
	path->skip_release_on_error = 1;
	ret = btrfs_insert_empty_item(trans, root, path,
				      key, item_size);
	path->skip_release_on_error = 0;

	/* make sure any existing item is the correct size */
	if (ret == -EEXIST || ret == -EOVERFLOW) {
		u32 found_size;

		found_size = btrfs_item_size(path->nodes[0],
					     path->slots[0]);
		if (found_size > item_size)
			btrfs_truncate_item(path, item_size, 1);
		else if (found_size < item_size)
			btrfs_extend_item(path, item_size - found_size);
	} else if (ret) {
		return ret;
	}
	dst_ptr = btrfs_item_ptr_offset(path->nodes[0],
					path->slots[0]);

	/* don't overwrite an existing inode if the generation number
	 * was logged as zero.  This is done when the tree logging code
	 * is just logging an inode to make sure it exists after recovery.
	 *
	 * Also, don't overwrite i_size on directories during replay.
	 * log replay inserts and removes directory items based on the
	 * state of the tree found in the subvolume, and i_size is modified
	 * as it goes
	 */
	if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) {
		struct btrfs_inode_item *src_item;
		struct btrfs_inode_item *dst_item;

		src_item = (struct btrfs_inode_item *)src_ptr;
		dst_item = (struct btrfs_inode_item *)dst_ptr;

		if (btrfs_inode_generation(eb, src_item) == 0) {
			struct extent_buffer *dst_eb = path->nodes[0];
			const u64 ino_size = btrfs_inode_size(eb, src_item);

			/*
			 * For regular files an ino_size == 0 is used only when
			 * logging that an inode exists, as part of a directory
			 * fsync, and the inode wasn't fsynced before. In this
			 * case don't set the size of the inode in the fs/subvol
			 * tree, otherwise we would be throwing valid data away.
			 */
			if (S_ISREG(btrfs_inode_mode(eb, src_item)) &&
			    S_ISREG(btrfs_inode_mode(dst_eb, dst_item)) &&
			    ino_size != 0)
				btrfs_set_inode_size(dst_eb, dst_item, ino_size);
			goto no_copy;
		}

		if (overwrite_root &&
		    S_ISDIR(btrfs_inode_mode(eb, src_item)) &&
		    S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) {
			save_old_i_size = 1;
			saved_i_size = btrfs_inode_size(path->nodes[0],
							dst_item);
		}
	}

	copy_extent_buffer(path->nodes[0], eb, dst_ptr,
			   src_ptr, item_size);

	if (save_old_i_size) {
		struct btrfs_inode_item *dst_item;

		dst_item = (struct btrfs_inode_item *)dst_ptr;
		btrfs_set_inode_size(path->nodes[0], dst_item, saved_i_size);
	}

	/* make sure the generation is filled in */
	if (key->type == BTRFS_INODE_ITEM_KEY) {
		struct btrfs_inode_item *dst_item;

		dst_item = (struct btrfs_inode_item *)dst_ptr;
		if (btrfs_inode_generation(path->nodes[0], dst_item) == 0) {
			btrfs_set_inode_generation(path->nodes[0], dst_item,
						   trans->transid);
		}
	}
no_copy:
	btrfs_mark_buffer_dirty(path->nodes[0]);
	btrfs_release_path(path);
	return 0;
}

/*
 * Item overwrite used by replay and tree logging.  eb, slot and key all refer
 * to the src data we are copying out.
 *
 * root is the tree we are copying into, and path is a scratch
 * path for use in this function (it should be released on entry and
 * will be released on exit).
 *
 * If the key is already in the destination tree the existing item is
 * overwritten.  If the existing item isn't big enough, it is extended.
 * If it is too large, it is truncated.
 *
 * If the key isn't in the destination yet, a new item is inserted.
 */
static int overwrite_item(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root,
			  struct btrfs_path *path,
			  struct extent_buffer *eb, int slot,
			  struct btrfs_key *key)
{
	int ret;

	/* Look for the key in the destination tree. */
	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
	if (ret < 0)
		return ret;

	return do_overwrite_item(trans, root, path, eb, slot, key);
}

/*
 * simple helper to read an inode off the disk from a given root.
 * This can only be called for subvolume roots and not for the log
 */
static noinline struct inode *read_one_inode(struct btrfs_root *root,
					     u64 objectid)
{
	struct inode *inode;

	inode = btrfs_iget(root->fs_info->sb, objectid, root);
	if (IS_ERR(inode))
		inode = NULL;
	return inode;
}

/* replays a single extent in 'eb' at 'slot' with 'key' into the
 * subvolume 'root'.  path is released on entry and should be released
 * on exit.
 *
 * extents in the log tree have not been allocated out of the extent
 * tree yet.  So, this completes the allocation, taking a reference
 * as required if the extent already exists or creating a new extent
 * if it isn't in the extent allocation tree yet.
 *
 * The extent is inserted into the file, dropping any existing extents
 * from the file that overlap the new one.
 */
static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root,
				      struct btrfs_path *path,
				      struct extent_buffer *eb, int slot,
				      struct btrfs_key *key)
{
	struct btrfs_drop_extents_args drop_args = { 0 };
	struct btrfs_fs_info *fs_info = root->fs_info;
	int found_type;
	u64 extent_end;
	u64 start = key->offset;
	u64 nbytes = 0;
	struct btrfs_file_extent_item *item;
	struct inode *inode = NULL;
	unsigned long size;
	int ret = 0;

	item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
	found_type = btrfs_file_extent_type(eb, item);

	if (found_type == BTRFS_FILE_EXTENT_REG ||
	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
		nbytes = btrfs_file_extent_num_bytes(eb, item);
		extent_end = start + nbytes;

		/*
		 * We don't add to the inodes nbytes if we are prealloc or a
		 * hole.
		 */
		if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
			nbytes = 0;
	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
		size = btrfs_file_extent_ram_bytes(eb, item);
		nbytes = btrfs_file_extent_ram_bytes(eb, item);
		extent_end = ALIGN(start + size,
				   fs_info->sectorsize);
	} else {
		ret = 0;
		goto out;
	}

	inode = read_one_inode(root, key->objectid);
	if (!inode) {
		ret = -EIO;
		goto out;
	}

	/*
	 * first check to see if we already have this extent in the
	 * file.  This must be done before the btrfs_drop_extents run
	 * so we don't try to drop this extent.
	 */
	ret = btrfs_lookup_file_extent(trans, root, path,
				       btrfs_ino(BTRFS_I(inode)), start, 0);

	if (ret == 0 &&
	    (found_type == BTRFS_FILE_EXTENT_REG ||
	     found_type == BTRFS_FILE_EXTENT_PREALLOC)) {
		struct btrfs_file_extent_item cmp1;
		struct btrfs_file_extent_item cmp2;
		struct btrfs_file_extent_item *existing;
		struct extent_buffer *leaf;

		leaf = path->nodes[0];
		existing = btrfs_item_ptr(leaf, path->slots[0],
					  struct btrfs_file_extent_item);

		read_extent_buffer(eb, &cmp1, (unsigned long)item,
				   sizeof(cmp1));
		read_extent_buffer(leaf, &cmp2, (unsigned long)existing,
				   sizeof(cmp2));

		/*
		 * we already have a pointer to this exact extent,
		 * we don't have to do anything
		 */
		if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) {
			btrfs_release_path(path);
			goto out;
		}
	}
	btrfs_release_path(path);

	/* drop any overlapping extents */
	drop_args.start = start;
	drop_args.end = extent_end;
	drop_args.drop_cache = true;
	ret = btrfs_drop_extents(trans, root, BTRFS_I(inode), &drop_args);
	if (ret)
		goto out;

	if (found_type == BTRFS_FILE_EXTENT_REG ||
	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
		u64 offset;
		unsigned long dest_offset;
		struct btrfs_key ins;

		if (btrfs_file_extent_disk_bytenr(eb, item) == 0 &&
		    btrfs_fs_incompat(fs_info, NO_HOLES))
			goto update_inode;

		ret = btrfs_insert_empty_item(trans, root, path, key,
					      sizeof(*item));
		if (ret)
			goto out;
		dest_offset = btrfs_item_ptr_offset(path->nodes[0],
						    path->slots[0]);
		copy_extent_buffer(path->nodes[0], eb, dest_offset,
				   (unsigned long)item, sizeof(*item));

		ins.objectid = btrfs_file_extent_disk_bytenr(eb, item);
		ins.offset = btrfs_file_extent_disk_num_bytes(eb, item);
		ins.type = BTRFS_EXTENT_ITEM_KEY;
		offset = key->offset - btrfs_file_extent_offset(eb, item);

		/*
		 * Manually record dirty extent, as here we did a shallow
		 * file extent item copy and skip normal backref update,
		 * but modifying extent tree all by ourselves.
		 * So need to manually record dirty extent for qgroup,
		 * as the owner of the file extent changed from log tree
		 * (doesn't affect qgroup) to fs/file tree (affects qgroup)
		 */
		ret = btrfs_qgroup_trace_extent(trans,
				btrfs_file_extent_disk_bytenr(eb, item),
				btrfs_file_extent_disk_num_bytes(eb, item),
				GFP_NOFS);
		if (ret < 0)
			goto out;

		if (ins.objectid > 0) {
			struct btrfs_ref ref = { 0 };
			u64 csum_start;
			u64 csum_end;
			LIST_HEAD(ordered_sums);

			/*
			 * is this extent already allocated in the extent
			 * allocation tree?  If so, just add a reference
			 */
			ret = btrfs_lookup_data_extent(fs_info, ins.objectid,
						       ins.offset);
			if (ret < 0) {
				goto out;
			} else if (ret == 0) {
				btrfs_init_generic_ref(&ref,
						BTRFS_ADD_DELAYED_REF,
						ins.objectid, ins.offset, 0);
				btrfs_init_data_ref(&ref,
						root->root_key.objectid,
						key->objectid, offset, 0, false);
				ret = btrfs_inc_extent_ref(trans, &ref);
				if (ret)
					goto out;
			} else {
				/*
				 * insert the extent pointer in the extent
				 * allocation tree
				 */
				ret = btrfs_alloc_logged_file_extent(trans,
						root->root_key.objectid,
						key->objectid, offset, &ins);
				if (ret)
					goto out;
			}
			btrfs_release_path(path);

			if (btrfs_file_extent_compression(eb, item)) {
				csum_start = ins.objectid;
				csum_end = csum_start + ins.offset;
			} else {
				csum_start = ins.objectid +
					btrfs_file_extent_offset(eb, item);
				csum_end = csum_start +
					btrfs_file_extent_num_bytes(eb, item);
			}

			ret = btrfs_lookup_csums_range(root->log_root,
						csum_start, csum_end - 1,
						&ordered_sums, 0);
			if (ret)
				goto out;
			/*
			 * Now delete all existing csums in the csum root that
			 * cover our range. We do this because we can have an
			 * extent that is completely referenced by one file
			 * extent item and partially referenced by another
			 * file extent item (like after using the clone or
			 * extent_same ioctls). In this case if we end up doing
			 * the replay of the one that partially references the
			 * extent first, and we do not do the csum deletion
			 * below, we can get 2 csum items in the csum tree that
			 * overlap each other. For example, imagine our log has
			 * the two following file extent items:
			 *
			 * key (257 EXTENT_DATA 409600)
			 *     extent data disk byte 12845056 nr 102400
			 *     extent data offset 20480 nr 20480 ram 102400
			 *
			 * key (257 EXTENT_DATA 819200)
			 *     extent data disk byte 12845056 nr 102400
			 *     extent data offset 0 nr 102400 ram 102400
			 *
			 * Where the second one fully references the 100K extent
			 * that starts at disk byte 12845056, and the log tree
			 * has a single csum item that covers the entire range
			 * of the extent:
			 *
			 * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
			 *
			 * After the first file extent item is replayed, the
			 * csum tree gets the following csum item:
			 *
			 * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
			 *
			 * Which covers the 20K sub-range starting at offset 20K
			 * of our extent. Now when we replay the second file
			 * extent item, if we do not delete existing csum items
			 * that cover any of its blocks, we end up getting two
			 * csum items in our csum tree that overlap each other:
			 *
			 * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
			 * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
			 *
			 * Which is a problem, because after this anyone trying
			 * to look up the checksum of any block of our extent
			 * starting at an offset of 40K or higher will end up
			 * looking at the second csum item only, which does not
			 * contain the checksum for any block starting at
			 * offset 40K or higher of our extent.
			 */
			while (!list_empty(&ordered_sums)) {
				struct btrfs_ordered_sum *sums;
				struct btrfs_root *csum_root;

				sums = list_entry(ordered_sums.next,
						  struct btrfs_ordered_sum,
						  list);
				csum_root = btrfs_csum_root(fs_info,
							    sums->bytenr);
				if (!ret)
					ret = btrfs_del_csums(trans, csum_root,
							      sums->bytenr,
							      sums->len);
				if (!ret)
					ret = btrfs_csum_file_blocks(trans,
								     csum_root,
								     sums);
				list_del(&sums->list);
				kfree(sums);
			}
			if (ret)
				goto out;
		} else {
			btrfs_release_path(path);
		}
	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
		/* inline extents are easy, we just overwrite them */
		ret = overwrite_item(trans, root, path, eb, slot, key);
		if (ret)
			goto out;
	}

	ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode), start,
						extent_end - start);
	if (ret)
		goto out;

update_inode:
	btrfs_update_inode_bytes(BTRFS_I(inode), nbytes, drop_args.bytes_found);
	ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
out:
	if (inode)
		iput(inode);
	return ret;
}

/*
 * when cleaning up conflicts between the directory names in the
 * subvolume, directory names in the log and directory names in the
 * inode back references, we may have to unlink inodes from directories.
 *
 * This is a helper function to do the unlink of a specific directory
 * item
 */
static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
				      struct btrfs_path *path,
				      struct btrfs_inode *dir,
				      struct btrfs_dir_item *di)
{
	struct btrfs_root *root = dir->root;
	struct inode *inode;
	char *name;
	int name_len;
	struct extent_buffer *leaf;
	struct btrfs_key location;
	int ret;

	leaf = path->nodes[0];

	btrfs_dir_item_key_to_cpu(leaf, di, &location);
	name_len = btrfs_dir_name_len(leaf, di);
	name = kmalloc(name_len, GFP_NOFS);
	if (!name)
		return -ENOMEM;

	read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len);
	btrfs_release_path(path);

	inode = read_one_inode(root, location.objectid);
	if (!inode) {
		ret = -EIO;
		goto out;
	}

	ret = link_to_fixup_dir(trans, root, path, location.objectid);
	if (ret)
		goto out;

	ret = btrfs_unlink_inode(trans, dir, BTRFS_I(inode), name,
				 name_len);
	if (ret)
		goto out;
	else
		ret = btrfs_run_delayed_items(trans);
out:
	kfree(name);
	iput(inode);
	return ret;
}

/*
 * See if a given name and sequence number found in an inode back reference are
 * already in a directory and correctly point to this inode.
 *
 * Returns: < 0 on error, 0 if the directory entry does not exist and 1 if it
 * exists.
 */
static noinline int inode_in_dir(struct btrfs_root *root,
				 struct btrfs_path *path,
				 u64 dirid, u64 objectid, u64 index,
				 const char *name, int name_len)
{
	struct btrfs_dir_item *di;
	struct btrfs_key location;
	int ret = 0;

	di = btrfs_lookup_dir_index_item(NULL, root, path, dirid,
					 index, name, name_len, 0);
	if (IS_ERR(di)) {
		ret = PTR_ERR(di);
		goto out;
	} else if (di) {
		btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
		if (location.objectid != objectid)
			goto out;
	} else {
		goto out;
	}

	btrfs_release_path(path);
	di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 0);
	if (IS_ERR(di)) {
		ret = PTR_ERR(di);
		goto out;
	} else if (di) {
		btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
		if (location.objectid == objectid)
			ret = 1;
	}
out:
	btrfs_release_path(path);
	return ret;
}

/*
 * helper function to check a log tree for a named back reference in
 * an inode.  This is used to decide if a back reference that is
 * found in the subvolume conflicts with what we find in the log.
 *
 * inode backreferences may have multiple refs in a single item,
 * during replay we process one reference at a time, and we don't
 * want to delete valid links to a file from the subvolume if that
 * link is also in the log.
 */
static noinline int backref_in_log(struct btrfs_root *log,
				   struct btrfs_key *key,
				   u64 ref_objectid,
				   const char *name, int namelen)
{
	struct btrfs_path *path;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = btrfs_search_slot(NULL, log, key, path, 0, 0);
	if (ret < 0) {
		goto out;
	} else if (ret == 1) {
		ret = 0;
		goto out;
	}

	if (key->type == BTRFS_INODE_EXTREF_KEY)
		ret = !!btrfs_find_name_in_ext_backref(path->nodes[0],
						       path->slots[0],
						       ref_objectid,
						       name, namelen);
	else
		ret = !!btrfs_find_name_in_backref(path->nodes[0],
						   path->slots[0],
						   name, namelen);
out:
	btrfs_free_path(path);
	return ret;
}
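
/*
 * Illustrative sketch (unused): callers such as __add_inode_ref() below
 * consume backref_in_log() as a tri-state. A name that is missing from
 * the log is a conflict and gets unlinked; a name that is in the log is
 * kept. The hypothetical helper below only shows that contract.
 */
static inline bool example_name_survives_replay(struct btrfs_root *log,
						struct btrfs_key *key,
						u64 ref_objectid,
						const char *name, int namelen,
						int *err)
{
	int ret = backref_in_log(log, key, ref_objectid, name, namelen);

	if (ret < 0) {
		*err = ret;		/* real error, caller must abort */
		return false;
	}
	return ret != 0;		/* in the log: keep this link */
}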

static inline int __add_inode_ref(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root,
				  struct btrfs_path *path,
				  struct btrfs_root *log_root,
				  struct btrfs_inode *dir,
				  struct btrfs_inode *inode,
				  u64 inode_objectid, u64 parent_objectid,
				  u64 ref_index, char *name, int namelen,
				  int *search_done)
{
	int ret;
	char *victim_name;
	int victim_name_len;
	struct extent_buffer *leaf;
	struct btrfs_dir_item *di;
	struct btrfs_key search_key;
	struct btrfs_inode_extref *extref;

again:
	/* Search old style refs */
	search_key.objectid = inode_objectid;
	search_key.type = BTRFS_INODE_REF_KEY;
	search_key.offset = parent_objectid;
	ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
	if (ret == 0) {
		struct btrfs_inode_ref *victim_ref;
		unsigned long ptr;
		unsigned long ptr_end;

		leaf = path->nodes[0];

		/* are we trying to overwrite a back ref for the root directory?
		 * if so, just jump out, we're done
		 */
		if (search_key.objectid == search_key.offset)
			return 1;

		/* check all the names in this back reference to see
		 * if they are in the log.  if so, we allow them to stay,
		 * otherwise they must be unlinked as a conflict
		 */
		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
		ptr_end = ptr + btrfs_item_size(leaf, path->slots[0]);
		while (ptr < ptr_end) {
			victim_ref = (struct btrfs_inode_ref *)ptr;
			victim_name_len = btrfs_inode_ref_name_len(leaf,
								   victim_ref);
			victim_name = kmalloc(victim_name_len, GFP_NOFS);
			if (!victim_name)
				return -ENOMEM;

			read_extent_buffer(leaf, victim_name,
					   (unsigned long)(victim_ref + 1),
					   victim_name_len);

			ret = backref_in_log(log_root, &search_key,
					     parent_objectid, victim_name,
					     victim_name_len);
			if (ret < 0) {
				kfree(victim_name);
				return ret;
			} else if (!ret) {
				inc_nlink(&inode->vfs_inode);
				btrfs_release_path(path);

				ret = btrfs_unlink_inode(trans, dir, inode,
						victim_name, victim_name_len);
				kfree(victim_name);
				if (ret)
					return ret;
				ret = btrfs_run_delayed_items(trans);
				if (ret)
					return ret;
				*search_done = 1;
				goto again;
			}
			kfree(victim_name);

			ptr = (unsigned long)(victim_ref + 1) + victim_name_len;
		}

		/*
		 * NOTE: we have searched root tree and checked the
		 * corresponding ref, it does not need to check again.
		 */
		*search_done = 1;
	}
	btrfs_release_path(path);

	/* Same search but for extended refs */
	extref = btrfs_lookup_inode_extref(NULL, root, path, name, namelen,
					   inode_objectid, parent_objectid, 0,
					   0);
	if (!IS_ERR_OR_NULL(extref)) {
		u32 item_size;
		u32 cur_offset = 0;
		unsigned long base;
		struct inode *victim_parent;

		leaf = path->nodes[0];

		item_size = btrfs_item_size(leaf, path->slots[0]);
		base = btrfs_item_ptr_offset(leaf, path->slots[0]);

		while (cur_offset < item_size) {
			extref = (struct btrfs_inode_extref *)(base + cur_offset);

			victim_name_len = btrfs_inode_extref_name_len(leaf, extref);

			if (btrfs_inode_extref_parent(leaf, extref) != parent_objectid)
				goto next;

			victim_name = kmalloc(victim_name_len, GFP_NOFS);
			if (!victim_name)
				return -ENOMEM;
			read_extent_buffer(leaf, victim_name, (unsigned long)&extref->name,
					   victim_name_len);

			search_key.objectid = inode_objectid;
			search_key.type = BTRFS_INODE_EXTREF_KEY;
			search_key.offset = btrfs_extref_hash(parent_objectid,
							      victim_name,
							      victim_name_len);
			ret = backref_in_log(log_root, &search_key,
					     parent_objectid, victim_name,
					     victim_name_len);
			if (ret < 0) {
				kfree(victim_name);
				return ret;
			} else if (!ret) {
				ret = -ENOENT;
				victim_parent = read_one_inode(root,
						parent_objectid);
				if (victim_parent) {
					inc_nlink(&inode->vfs_inode);
					btrfs_release_path(path);

					ret = btrfs_unlink_inode(trans,
							BTRFS_I(victim_parent),
							inode,
							victim_name,
							victim_name_len);
					if (!ret)
						ret = btrfs_run_delayed_items(
								trans);
				}
				iput(victim_parent);
				kfree(victim_name);
				if (ret)
					return ret;
				*search_done = 1;
				goto again;
			}
			kfree(victim_name);
next:
			cur_offset += victim_name_len + sizeof(*extref);
		}
		*search_done = 1;
	}
	btrfs_release_path(path);

	/* look for a conflicting sequence number */
	di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir),
					 ref_index, name, namelen, 0);
	if (IS_ERR(di)) {
		return PTR_ERR(di);
	} else if (di) {
		ret = drop_one_dir_item(trans, path, dir, di);
		if (ret)
			return ret;
	}
	btrfs_release_path(path);

	/* look for a conflicting name */
	di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir),
				   name, namelen, 0);
	if (IS_ERR(di)) {
		return PTR_ERR(di);
	} else if (di) {
		ret = drop_one_dir_item(trans, path, dir, di);
		if (ret)
			return ret;
	}
	btrfs_release_path(path);

	return 0;
}

static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
			     u32 *namelen, char **name, u64 *index,
			     u64 *parent_objectid)
{
	struct btrfs_inode_extref *extref;

	extref = (struct btrfs_inode_extref *)ref_ptr;

	*namelen = btrfs_inode_extref_name_len(eb, extref);
	*name = kmalloc(*namelen, GFP_NOFS);
	if (*name == NULL)
		return -ENOMEM;

	read_extent_buffer(eb, *name, (unsigned long)&extref->name,
			   *namelen);

	if (index)
		*index = btrfs_inode_extref_index(eb, extref);
	if (parent_objectid)
		*parent_objectid = btrfs_inode_extref_parent(eb, extref);

	return 0;
}

static int ref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
			  u32 *namelen, char **name, u64 *index)
{
	struct btrfs_inode_ref *ref;

	ref = (struct btrfs_inode_ref *)ref_ptr;

	*namelen = btrfs_inode_ref_name_len(eb, ref);
	*name = kmalloc(*namelen, GFP_NOFS);
	if (*name == NULL)
		return -ENOMEM;

	read_extent_buffer(eb, *name, (unsigned long)(ref + 1), *namelen);

	if (index)
		*index = btrfs_inode_ref_index(eb, ref);

	return 0;
}
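
/*
 * Illustrative sketch (unused): an inode ref item packs several
 * (name, index) entries back to back, so iterating it advances by the
 * name length plus the fixed struct size of the ref flavour, which is
 * what unlink_old_inode_refs() below does for real.
 */
static inline int example_for_each_ref_name(struct extent_buffer *eb, int slot,
					    struct btrfs_key *key)
{
	unsigned long ptr = btrfs_item_ptr_offset(eb, slot);
	unsigned long ptr_end = ptr + btrfs_item_size(eb, slot);

	while (ptr < ptr_end) {
		char *name = NULL;
		u32 namelen;
		int ret;

		if (key->type == BTRFS_INODE_EXTREF_KEY)
			ret = extref_get_fields(eb, ptr, &namelen, &name,
						NULL, NULL);
		else
			ret = ref_get_fields(eb, ptr, &namelen, &name, NULL);
		if (ret)
			return ret;
		/* ... use (name, namelen) here ... */
		kfree(name);
		ptr += namelen;
		if (key->type == BTRFS_INODE_EXTREF_KEY)
			ptr += sizeof(struct btrfs_inode_extref);
		else
			ptr += sizeof(struct btrfs_inode_ref);
	}
	return 0;
}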

/*
 * Take an inode reference item from the log tree and iterate all names from
 * the inode reference item in the subvolume tree with the same key (if it
 * exists).  For any name that is not in the inode reference item from the
 * log tree, do a proper unlink of that name (that is, remove its entry from
 * the inode reference item and both dir index keys).
 */
static int unlink_old_inode_refs(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root,
				 struct btrfs_path *path,
				 struct btrfs_inode *inode,
				 struct extent_buffer *log_eb,
				 int log_slot,
				 struct btrfs_key *key)
{
	int ret;
	unsigned long ref_ptr;
	unsigned long ref_end;
	struct extent_buffer *eb;

again:
	btrfs_release_path(path);
	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
	if (ret > 0) {
		ret = 0;
		goto out;
	}
	if (ret < 0)
		goto out;

	eb = path->nodes[0];
	ref_ptr = btrfs_item_ptr_offset(eb, path->slots[0]);
	ref_end = ref_ptr + btrfs_item_size(eb, path->slots[0]);
	while (ref_ptr < ref_end) {
		char *name = NULL;
		int namelen;
		u64 parent_id;

		if (key->type == BTRFS_INODE_EXTREF_KEY) {
			ret = extref_get_fields(eb, ref_ptr, &namelen, &name,
						NULL, &parent_id);
		} else {
			parent_id = key->offset;
			ret = ref_get_fields(eb, ref_ptr, &namelen, &name,
					     NULL);
		}
		if (ret)
			goto out;

		if (key->type == BTRFS_INODE_EXTREF_KEY)
			ret = !!btrfs_find_name_in_ext_backref(log_eb, log_slot,
							       parent_id, name,
							       namelen);
		else
			ret = !!btrfs_find_name_in_backref(log_eb, log_slot,
							   name, namelen);

		if (!ret) {
			struct inode *dir;

			btrfs_release_path(path);
			dir = read_one_inode(root, parent_id);
			if (!dir) {
				ret = -ENOENT;
				kfree(name);
				goto out;
			}
			ret = btrfs_unlink_inode(trans, BTRFS_I(dir),
						 inode, name, namelen);
			kfree(name);
			iput(dir);
			/*
			 * Whenever we need to check if a name exists or not, we
			 * check the subvolume tree. So after an unlink we must
			 * run delayed items, so that future checks for a name
			 * during log replay see that the name does not exist
			 * anymore.
			 */
			if (!ret)
				ret = btrfs_run_delayed_items(trans);
			if (ret)
				goto out;
			goto again;
		}

		kfree(name);
		ref_ptr += namelen;
		if (key->type == BTRFS_INODE_EXTREF_KEY)
			ref_ptr += sizeof(struct btrfs_inode_extref);
		else
			ref_ptr += sizeof(struct btrfs_inode_ref);
	}
	ret = 0;
out:
	btrfs_release_path(path);
	return ret;
}

static int btrfs_inode_ref_exists(struct inode *inode, struct inode *dir,
				  const u8 ref_type, const char *name,
				  const int namelen)
{
	struct btrfs_key key;
	struct btrfs_path *path;
	const u64 parent_id = btrfs_ino(BTRFS_I(dir));
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = btrfs_ino(BTRFS_I(inode));
	key.type = ref_type;
	if (key.type == BTRFS_INODE_REF_KEY)
		key.offset = parent_id;
	else
		key.offset = btrfs_extref_hash(parent_id, name, namelen);

	ret = btrfs_search_slot(NULL, BTRFS_I(inode)->root, &key, path, 0, 0);
	if (ret < 0)
		goto out;
	if (ret > 0) {
		ret = 0;
		goto out;
	}
	if (key.type == BTRFS_INODE_EXTREF_KEY)
		ret = !!btrfs_find_name_in_ext_backref(path->nodes[0],
				path->slots[0], parent_id, name, namelen);
	else
		ret = !!btrfs_find_name_in_backref(path->nodes[0], path->slots[0],
						   name, namelen);
out:
	btrfs_free_path(path);
	return ret;
}

static int add_link(struct btrfs_trans_handle *trans,
		    struct inode *dir, struct inode *inode, const char *name,
		    int namelen, u64 ref_index)
{
	struct btrfs_root *root = BTRFS_I(dir)->root;
	struct btrfs_dir_item *dir_item;
	struct btrfs_key key;
	struct btrfs_path *path;
	struct inode *other_inode = NULL;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	dir_item = btrfs_lookup_dir_item(NULL, root, path,
					 btrfs_ino(BTRFS_I(dir)),
					 name, namelen, 0);
	if (!dir_item) {
		btrfs_release_path(path);
		goto add_link;
	} else if (IS_ERR(dir_item)) {
		ret = PTR_ERR(dir_item);
		goto out;
	}

	/*
	 * Our inode's dentry collides with the dentry of another inode which is
	 * in the log but not yet processed since it has a higher inode number.
	 * So delete that other dentry.
	 */
	btrfs_dir_item_key_to_cpu(path->nodes[0], dir_item, &key);
	btrfs_release_path(path);
	other_inode = read_one_inode(root, key.objectid);
	if (!other_inode) {
		ret = -ENOENT;
		goto out;
	}
	ret = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(other_inode),
				 name, namelen);
	if (ret)
		goto out;
	/*
	 * If we dropped the link count to 0, bump it so that later the iput()
	 * on the inode will not free it. We will fixup the link count later.
	 */
	if (other_inode->i_nlink == 0)
		inc_nlink(other_inode);

	ret = btrfs_run_delayed_items(trans);
	if (ret)
		goto out;
add_link:
	ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode),
			     name, namelen, 0, ref_index);
out:
	iput(other_inode);
	btrfs_free_path(path);

	return ret;
}

/*
 * replay one inode back reference item found in the log tree.
 * eb, slot and key refer to the buffer and key found in the log tree.
 * root is the destination we are replaying into, and path is for temp
 * use by this function.  (it should be released on return).
 */
static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root,
				  struct btrfs_root *log,
				  struct btrfs_path *path,
				  struct extent_buffer *eb, int slot,
				  struct btrfs_key *key)
{
	struct inode *dir = NULL;
	struct inode *inode = NULL;
	unsigned long ref_ptr;
	unsigned long ref_end;
	char *name = NULL;
	int namelen;
	int ret;
	int search_done = 0;
	int log_ref_ver = 0;
	u64 parent_objectid;
	u64 inode_objectid;
	u64 ref_index = 0;
	int ref_struct_size;

	ref_ptr = btrfs_item_ptr_offset(eb, slot);
	ref_end = ref_ptr + btrfs_item_size(eb, slot);

	if (key->type == BTRFS_INODE_EXTREF_KEY) {
		struct btrfs_inode_extref *r;

		ref_struct_size = sizeof(struct btrfs_inode_extref);
		log_ref_ver = 1;
		r = (struct btrfs_inode_extref *)ref_ptr;
		parent_objectid = btrfs_inode_extref_parent(eb, r);
	} else {
		ref_struct_size = sizeof(struct btrfs_inode_ref);
		parent_objectid = key->offset;
	}
	inode_objectid = key->objectid;

	/*
	 * it is possible that we didn't log all the parent directories
	 * for a given inode.  If we don't find the dir, just don't
	 * copy the back ref in.  The link count fixup code will take
	 * care of the rest
	 */
	dir = read_one_inode(root, parent_objectid);
	if (!dir) {
		ret = -ENOENT;
		goto out;
	}

	inode = read_one_inode(root, inode_objectid);
	if (!inode) {
		ret = -EIO;
		goto out;
	}

	while (ref_ptr < ref_end) {
		if (log_ref_ver) {
			ret = extref_get_fields(eb, ref_ptr, &namelen, &name,
						&ref_index, &parent_objectid);
			/*
			 * parent object can change from one array
			 * item to another.
			 */
			if (!dir)
				dir = read_one_inode(root, parent_objectid);
			if (!dir) {
				ret = -ENOENT;
				goto out;
			}
		} else {
			ret = ref_get_fields(eb, ref_ptr, &namelen, &name,
					     &ref_index);
		}
		if (ret)
			goto out;

		ret = inode_in_dir(root, path, btrfs_ino(BTRFS_I(dir)),
				   btrfs_ino(BTRFS_I(inode)), ref_index,
				   name, namelen);
		if (ret < 0) {
			goto out;
		} else if (ret == 0) {
			/*
			 * look for a conflicting back reference in the
			 * metadata. if we find one we have to unlink that name
			 * of the file before we add our new link.  Later on, we
			 * overwrite any existing back reference, and we don't
			 * want to create dangling pointers in the directory.
			 */

			if (!search_done) {
				ret = __add_inode_ref(trans, root, path, log,
						      BTRFS_I(dir),
						      BTRFS_I(inode),
						      inode_objectid,
						      parent_objectid,
						      ref_index, name, namelen,
						      &search_done);
				if (ret) {
					if (ret == 1)
						ret = 0;
					goto out;
				}
			}

			/*
			 * If a reference item already exists for this inode
			 * with the same parent and name, but different index,
			 * drop it and the corresponding directory index entries
			 * from the parent before adding the new reference item
			 * and dir index entries, otherwise we would fail with
			 * -EEXIST returned from btrfs_add_link() below.
			 */
			ret = btrfs_inode_ref_exists(inode, dir, key->type,
						     name, namelen);
			if (ret > 0) {
				ret = btrfs_unlink_inode(trans,
							 BTRFS_I(dir),
							 BTRFS_I(inode),
							 name, namelen);
				/*
				 * If we dropped the link count to 0, bump it so
				 * that later the iput() on the inode will not
				 * free it. We will fixup the link count later.
				 */
				if (!ret && inode->i_nlink == 0)
					inc_nlink(inode);
				/*
				 * Whenever we need to check if a name exists or
				 * not, we check the subvolume tree. So after an
				 * unlink we must run delayed items, so that
				 * future checks for a name during log replay
				 * see that the name does not exist anymore.
				 */
				if (!ret)
					ret = btrfs_run_delayed_items(trans);
			}
			if (ret < 0)
				goto out;

			/* insert our name */
			ret = add_link(trans, dir, inode, name, namelen,
				       ref_index);
			if (ret)
				goto out;

			ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
			if (ret)
				goto out;
		}
		/* Else, ret == 1, we already have a perfect match, we're done. */

		ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + namelen;
		kfree(name);
		name = NULL;
		if (log_ref_ver) {
			iput(dir);
			dir = NULL;
		}
	}

	/*
	 * Before we overwrite the inode reference item in the subvolume tree
	 * with the item from the log tree, we must unlink all names from the
	 * parent directory that are in the subvolume's tree inode reference
	 * item, otherwise we end up with an inconsistent subvolume tree where
	 * dir index entries exist for a name but there is no inode reference
	 * item with the same name.
	 */
	ret = unlink_old_inode_refs(trans, root, path, BTRFS_I(inode), eb, slot,
				    key);
	if (ret)
		goto out;

	/* finally write the back reference in the inode */
	ret = overwrite_item(trans, root, path, eb, slot, key);
out:
	btrfs_release_path(path);
	kfree(name);
	iput(dir);
	iput(inode);
	return ret;
}

static int count_inode_extrefs(struct btrfs_root *root,
			       struct btrfs_inode *inode, struct btrfs_path *path)
{
	int ret = 0;
	int name_len;
	unsigned int nlink = 0;
	u32 item_size;
	u32 cur_offset = 0;
	u64 inode_objectid = btrfs_ino(inode);
	u64 offset = 0;
	unsigned long ptr;
	struct btrfs_inode_extref *extref;
	struct extent_buffer *leaf;

	while (1) {
		ret = btrfs_find_one_extref(root, inode_objectid, offset, path,
					    &extref, &offset);
		if (ret)
			break;

		leaf = path->nodes[0];
		item_size = btrfs_item_size(leaf, path->slots[0]);
		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
		cur_offset = 0;

		while (cur_offset < item_size) {
			extref = (struct btrfs_inode_extref *) (ptr + cur_offset);
			name_len = btrfs_inode_extref_name_len(leaf, extref);

			nlink++;

			cur_offset += name_len + sizeof(*extref);
		}

		offset++;
		btrfs_release_path(path);
	}
	btrfs_release_path(path);

	if (ret < 0 && ret != -ENOENT)
		return ret;
	return nlink;
}

static int count_inode_refs(struct btrfs_root *root,
			    struct btrfs_inode *inode, struct btrfs_path *path)
{
	int ret;
	struct btrfs_key key;
	unsigned int nlink = 0;
	unsigned long ptr;
	unsigned long ptr_end;
	int name_len;
	u64 ino = btrfs_ino(inode);

	key.objectid = ino;
	key.type = BTRFS_INODE_REF_KEY;
	key.offset = (u64)-1;

	while (1) {
		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
		if (ret < 0)
			break;
		if (ret > 0) {
			if (path->slots[0] == 0)
				break;
			path->slots[0]--;
		}
process_slot:
		btrfs_item_key_to_cpu(path->nodes[0], &key,
				      path->slots[0]);
		if (key.objectid != ino ||
		    key.type != BTRFS_INODE_REF_KEY)
			break;
		ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
		ptr_end = ptr + btrfs_item_size(path->nodes[0],
						path->slots[0]);
		while (ptr < ptr_end) {
			struct btrfs_inode_ref *ref;

			ref = (struct btrfs_inode_ref *)ptr;
			name_len = btrfs_inode_ref_name_len(path->nodes[0],
							    ref);
			ptr = (unsigned long)(ref + 1) + name_len;
			nlink++;
		}

		if (key.offset == 0)
			break;
		if (path->slots[0] > 0) {
			path->slots[0]--;
			goto process_slot;
		}
		key.offset--;
		btrfs_release_path(path);
	}
	btrfs_release_path(path);

	return nlink;
}

/*
 * There are a few corners where the link count of the file can't
 * be properly maintained during replay.  So, instead of adding
 * lots of complexity to the log code, we just scan the backrefs
 * for any file that has been through replay.
 *
 * The scan will update the link count on the inode to reflect the
 * number of back refs found.  If it goes down to zero, the iput
 * will free the inode.
 */
static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
					   struct btrfs_root *root,
					   struct inode *inode)
{
	struct btrfs_path *path;
	int ret;
	u64 nlink = 0;
	u64 ino = btrfs_ino(BTRFS_I(inode));

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = count_inode_refs(root, BTRFS_I(inode), path);
	if (ret < 0)
		goto out;

	nlink = ret;

	ret = count_inode_extrefs(root, BTRFS_I(inode), path);
	if (ret < 0)
		goto out;

	nlink += ret;

	ret = 0;

	if (nlink != inode->i_nlink) {
		set_nlink(inode, nlink);
		ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
		if (ret)
			goto out;
	}
	BTRFS_I(inode)->index_cnt = (u64)-1;

	if (inode->i_nlink == 0) {
		if (S_ISDIR(inode->i_mode)) {
			ret = replay_dir_deletes(trans, root, NULL, path,
						 ino, 1);
			if (ret)
				goto out;
		}
		ret = btrfs_insert_orphan_item(trans, root, ino);
		if (ret == -EEXIST)
			ret = 0;
	}

out:
	btrfs_free_path(path);
	return ret;
}
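
/*
 * Illustrative sketch (unused): the counting that
 * fixup_inode_link_count() above performs boils down to summing both
 * ref flavours; the real link count is old-style refs plus extended
 * refs. The hypothetical helper below only shows that arithmetic.
 */
static inline int example_count_all_refs(struct btrfs_root *root,
					 struct btrfs_inode *inode,
					 struct btrfs_path *path)
{
	int refs;
	int extrefs;

	refs = count_inode_refs(root, inode, path);
	if (refs < 0)
		return refs;
	extrefs = count_inode_extrefs(root, inode, path);
	if (extrefs < 0)
		return extrefs;
	return refs + extrefs;
}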

static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
					    struct btrfs_root *root,
					    struct btrfs_path *path)
{
	int ret;
	struct btrfs_key key;
	struct inode *inode;

	key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
	key.type = BTRFS_ORPHAN_ITEM_KEY;
	key.offset = (u64)-1;
	while (1) {
		ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
		if (ret < 0)
			break;

		if (ret == 1) {
			ret = 0;
			if (path->slots[0] == 0)
				break;
			path->slots[0]--;
		}

		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
		if (key.objectid != BTRFS_TREE_LOG_FIXUP_OBJECTID ||
		    key.type != BTRFS_ORPHAN_ITEM_KEY)
			break;

		ret = btrfs_del_item(trans, root, path);
		if (ret)
			break;

		btrfs_release_path(path);
		inode = read_one_inode(root, key.offset);
		if (!inode) {
			ret = -EIO;
			break;
		}

		ret = fixup_inode_link_count(trans, root, inode);
		iput(inode);
		if (ret)
			break;

		/*
		 * fixup on a directory may create new entries,
		 * make sure we always look for the highest possible
		 * offset
		 */
		key.offset = (u64)-1;
	}
	btrfs_release_path(path);
	return ret;
}

/*
 * record a given inode in the fixup dir so we can check its link
 * count when replay is done.  The link count is incremented here
 * so the inode won't go away until we check it
 */
static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root,
				      struct btrfs_path *path,
				      u64 objectid)
{
	struct btrfs_key key;
	int ret = 0;
	struct inode *inode;

	inode = read_one_inode(root, objectid);
	if (!inode)
		return -EIO;

	key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
	key.type = BTRFS_ORPHAN_ITEM_KEY;
	key.offset = objectid;

	ret = btrfs_insert_empty_item(trans, root, path, &key, 0);

	btrfs_release_path(path);
	if (ret == 0) {
		if (!inode->i_nlink)
			set_nlink(inode, 1);
		else
			inc_nlink(inode);
		ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
	} else if (ret == -EEXIST) {
		ret = 0;
	}
	iput(inode);

	return ret;
}

/*
 * when replaying the log for a directory, we only insert names
 * for inodes that actually exist.  This means an fsync on a directory
 * does not implicitly fsync all the new files in it
 */
static noinline int insert_one_name(struct btrfs_trans_handle *trans,
				    struct btrfs_root *root,
				    u64 dirid, u64 index,
				    char *name, int name_len,
				    struct btrfs_key *location)
{
	struct inode *inode;
	struct inode *dir;
	int ret;

	inode = read_one_inode(root, location->objectid);
	if (!inode)
		return -ENOENT;

	dir = read_one_inode(root, dirid);
	if (!dir) {
		iput(inode);
		return -EIO;
	}

	ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), name,
			     name_len, 1, index);

	/* FIXME, put inode into FIXUP list */

	iput(inode);
	iput(dir);
	return ret;
}

static int delete_conflicting_dir_entry(struct btrfs_trans_handle *trans,
					struct btrfs_inode *dir,
					struct btrfs_path *path,
					struct btrfs_dir_item *dst_di,
					const struct btrfs_key *log_key,
					u8 log_type,
					bool exists)
{
	struct btrfs_key found_key;

	btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key);
	/* The existing dentry points to the same inode, don't delete it. */
	if (found_key.objectid == log_key->objectid &&
	    found_key.type == log_key->type &&
	    found_key.offset == log_key->offset &&
	    btrfs_dir_type(path->nodes[0], dst_di) == log_type)
		return 1;

	/*
	 * Don't drop the conflicting directory entry if the inode for the new
	 * entry doesn't exist.
	 */
	if (!exists)
		return 0;

	return drop_one_dir_item(trans, path, dir, dst_di);
}

/*
 * take a single entry in a log directory item and replay it into
 * the subvolume.
 *
 * if a conflicting item exists in the subdirectory already,
 * the inode it points to is unlinked and put into the link count
 * fix up tree.
 *
 * If a name from the log points to a file or directory that does
 * not exist in the FS, it is skipped.  fsyncs on directories
 * do not force down inodes inside that directory, just changes to the
 * names or unlinks in a directory.
 *
 * Returns < 0 on error, 0 if the name wasn't replayed (dentry points to a
 * non-existing inode) and 1 if the name was replayed.
 */
2019 */ 2020 static noinline int replay_one_name(struct btrfs_trans_handle *trans, 2021 struct btrfs_root *root, 2022 struct btrfs_path *path, 2023 struct extent_buffer *eb, 2024 struct btrfs_dir_item *di, 2025 struct btrfs_key *key) 2026 { 2027 char *name; 2028 int name_len; 2029 struct btrfs_dir_item *dir_dst_di; 2030 struct btrfs_dir_item *index_dst_di; 2031 bool dir_dst_matches = false; 2032 bool index_dst_matches = false; 2033 struct btrfs_key log_key; 2034 struct btrfs_key search_key; 2035 struct inode *dir; 2036 u8 log_type; 2037 bool exists; 2038 int ret; 2039 bool update_size = true; 2040 bool name_added = false; 2041 2042 dir = read_one_inode(root, key->objectid); 2043 if (!dir) 2044 return -EIO; 2045 2046 name_len = btrfs_dir_name_len(eb, di); 2047 name = kmalloc(name_len, GFP_NOFS); 2048 if (!name) { 2049 ret = -ENOMEM; 2050 goto out; 2051 } 2052 2053 log_type = btrfs_dir_type(eb, di); 2054 read_extent_buffer(eb, name, (unsigned long)(di + 1), 2055 name_len); 2056 2057 btrfs_dir_item_key_to_cpu(eb, di, &log_key); 2058 ret = btrfs_lookup_inode(trans, root, path, &log_key, 0); 2059 btrfs_release_path(path); 2060 if (ret < 0) 2061 goto out; 2062 exists = (ret == 0); 2063 ret = 0; 2064 2065 dir_dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid, 2066 name, name_len, 1); 2067 if (IS_ERR(dir_dst_di)) { 2068 ret = PTR_ERR(dir_dst_di); 2069 goto out; 2070 } else if (dir_dst_di) { 2071 ret = delete_conflicting_dir_entry(trans, BTRFS_I(dir), path, 2072 dir_dst_di, &log_key, log_type, 2073 exists); 2074 if (ret < 0) 2075 goto out; 2076 dir_dst_matches = (ret == 1); 2077 } 2078 2079 btrfs_release_path(path); 2080 2081 index_dst_di = btrfs_lookup_dir_index_item(trans, root, path, 2082 key->objectid, key->offset, 2083 name, name_len, 1); 2084 if (IS_ERR(index_dst_di)) { 2085 ret = PTR_ERR(index_dst_di); 2086 goto out; 2087 } else if (index_dst_di) { 2088 ret = delete_conflicting_dir_entry(trans, BTRFS_I(dir), path, 2089 index_dst_di, &log_key, 2090 log_type, exists); 2091 if (ret < 0) 2092 goto out; 2093 index_dst_matches = (ret == 1); 2094 } 2095 2096 btrfs_release_path(path); 2097 2098 if (dir_dst_matches && index_dst_matches) { 2099 ret = 0; 2100 update_size = false; 2101 goto out; 2102 } 2103 2104 /* 2105 * Check if the inode reference exists in the log for the given name, 2106 * inode and parent inode 2107 */ 2108 search_key.objectid = log_key.objectid; 2109 search_key.type = BTRFS_INODE_REF_KEY; 2110 search_key.offset = key->objectid; 2111 ret = backref_in_log(root->log_root, &search_key, 0, name, name_len); 2112 if (ret < 0) { 2113 goto out; 2114 } else if (ret) { 2115 /* The dentry will be added later. */ 2116 ret = 0; 2117 update_size = false; 2118 goto out; 2119 } 2120 2121 search_key.objectid = log_key.objectid; 2122 search_key.type = BTRFS_INODE_EXTREF_KEY; 2123 search_key.offset = key->objectid; 2124 ret = backref_in_log(root->log_root, &search_key, key->objectid, name, 2125 name_len); 2126 if (ret < 0) { 2127 goto out; 2128 } else if (ret) { 2129 /* The dentry will be added later. 
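* The name is recreated when the matching inode ref item from the
* log is replayed, which adds any dentries still missing for it.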
*/
2130 ret = 0;
2131 update_size = false;
2132 goto out;
2133 }
2134 btrfs_release_path(path);
2135 ret = insert_one_name(trans, root, key->objectid, key->offset,
2136 name, name_len, &log_key);
2137 if (ret && ret != -ENOENT && ret != -EEXIST)
2138 goto out;
2139 if (!ret)
2140 name_added = true;
2141 update_size = false;
2142 ret = 0;
2143
2144 out:
2145 if (!ret && update_size) {
2146 btrfs_i_size_write(BTRFS_I(dir), dir->i_size + name_len * 2);
2147 ret = btrfs_update_inode(trans, root, BTRFS_I(dir));
2148 }
2149 kfree(name);
2150 iput(dir);
2151 if (!ret && name_added)
2152 ret = 1;
2153 return ret;
2154 }
2155
2156 /* Replay one dir item from a BTRFS_DIR_INDEX_KEY key. */
2157 static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
2158 struct btrfs_root *root,
2159 struct btrfs_path *path,
2160 struct extent_buffer *eb, int slot,
2161 struct btrfs_key *key)
2162 {
2163 int ret;
2164 struct btrfs_dir_item *di;
2165
2166 /* We only log dir index keys, which only contain a single dir item. */
2167 ASSERT(key->type == BTRFS_DIR_INDEX_KEY);
2168
2169 di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item);
2170 ret = replay_one_name(trans, root, path, eb, di, key);
2171 if (ret < 0)
2172 return ret;
2173
2174 /*
2175 * If this entry refers to a non-directory (directories can not have a
2176 * link count > 1) and it was added in the transaction that was not
2177 * committed, make sure we fixup the link count of the inode the entry
2178 * points to. Otherwise something like the following would result in a
2179 * directory pointing to an inode with a wrong link count that does not
2180 * account for this dir entry:
2181 *
2182 * mkdir testdir
2183 * touch testdir/foo
2184 * touch testdir/bar
2185 * sync
2186 *
2187 * ln testdir/bar testdir/bar_link
2188 * ln testdir/foo testdir/foo_link
2189 * xfs_io -c "fsync" testdir/bar
2190 *
2191 * <power failure>
2192 *
2193 * mount fs, log replay happens
2194 *
2195 * File foo would remain with a link count of 1 when it has two entries
2196 * pointing to it in the directory testdir. This would make it impossible
2197 * to ever delete the parent directory as it would result in stale
2198 * dentries that can never be deleted.
2199 */
2200 if (ret == 1 && btrfs_dir_type(eb, di) != BTRFS_FT_DIR) {
2201 struct btrfs_path *fixup_path;
2202 struct btrfs_key di_key;
2203
2204 fixup_path = btrfs_alloc_path();
2205 if (!fixup_path)
2206 return -ENOMEM;
2207
2208 btrfs_dir_item_key_to_cpu(eb, di, &di_key);
2209 ret = link_to_fixup_dir(trans, root, fixup_path, di_key.objectid);
2210 btrfs_free_path(fixup_path);
2211 }
2212
2213 return ret;
2214 }
2215
2216 /*
2217 * directory replay has two parts. There are the standard directory
2218 * items in the log copied from the subvolume, and range items
2219 * created in the log while the subvolume was logged.
2220 *
2221 * The range items tell us which parts of the key space the log
2222 * is authoritative for. During replay, if a key in the subvolume
2223 * directory is in a logged range item, but not actually in the log,
2224 * that means it was deleted from the directory before the fsync
2225 * and should be removed.
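*
* For example: if the log has a dir log item covering index offsets
* [0, 100] for this directory, and the subvolume directory has an
* entry at index 50 with no matching entry in the log, replay treats
* that entry as deleted before the fsync and unlinks it.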
2226 */
2227 static noinline int find_dir_range(struct btrfs_root *root,
2228 struct btrfs_path *path,
2229 u64 dirid,
2230 u64 *start_ret, u64 *end_ret)
2231 {
2232 struct btrfs_key key;
2233 u64 found_end;
2234 struct btrfs_dir_log_item *item;
2235 int ret;
2236 int nritems;
2237
2238 if (*start_ret == (u64)-1)
2239 return 1;
2240
2241 key.objectid = dirid;
2242 key.type = BTRFS_DIR_LOG_INDEX_KEY;
2243 key.offset = *start_ret;
2244
2245 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2246 if (ret < 0)
2247 goto out;
2248 if (ret > 0) {
2249 if (path->slots[0] == 0)
2250 goto out;
2251 path->slots[0]--;
2252 }
2253 if (ret != 0)
2254 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
2255
2256 if (key.type != BTRFS_DIR_LOG_INDEX_KEY || key.objectid != dirid) {
2257 ret = 1;
2258 goto next;
2259 }
2260 item = btrfs_item_ptr(path->nodes[0], path->slots[0],
2261 struct btrfs_dir_log_item);
2262 found_end = btrfs_dir_log_end(path->nodes[0], item);
2263
2264 if (*start_ret >= key.offset && *start_ret <= found_end) {
2265 ret = 0;
2266 *start_ret = key.offset;
2267 *end_ret = found_end;
2268 goto out;
2269 }
2270 ret = 1;
2271 next:
2272 /* check the next slot in the tree to see if it is a valid item */
2273 nritems = btrfs_header_nritems(path->nodes[0]);
2274 path->slots[0]++;
2275 if (path->slots[0] >= nritems) {
2276 ret = btrfs_next_leaf(root, path);
2277 if (ret)
2278 goto out;
2279 }
2280
2281 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
2282
2283 if (key.type != BTRFS_DIR_LOG_INDEX_KEY || key.objectid != dirid) {
2284 ret = 1;
2285 goto out;
2286 }
2287 item = btrfs_item_ptr(path->nodes[0], path->slots[0],
2288 struct btrfs_dir_log_item);
2289 found_end = btrfs_dir_log_end(path->nodes[0], item);
2290 *start_ret = key.offset;
2291 *end_ret = found_end;
2292 ret = 0;
2293 out:
2294 btrfs_release_path(path);
2295 return ret;
2296 }
2297
2298 /*
2299 * this looks for a given directory item in the log. If the directory
2300 * item is not in the log, the item is removed and the inode it points
2301 * to is unlinked
2302 */
2303 static noinline int check_item_in_log(struct btrfs_trans_handle *trans,
2304 struct btrfs_root *log,
2305 struct btrfs_path *path,
2306 struct btrfs_path *log_path,
2307 struct inode *dir,
2308 struct btrfs_key *dir_key)
2309 {
2310 struct btrfs_root *root = BTRFS_I(dir)->root;
2311 int ret;
2312 struct extent_buffer *eb;
2313 int slot;
2314 struct btrfs_dir_item *di;
2315 int name_len;
2316 char *name;
2317 struct inode *inode = NULL;
2318 struct btrfs_key location;
2319
2320 /*
2321 * Currently we only log dir index keys. Even if we replay a log created
2322 * by an older kernel that logged both dir index and dir item keys, all
2323 * we need to do is process the dir index keys; we (and our caller) can
2324 * safely ignore dir item keys (key type BTRFS_DIR_ITEM_KEY).
2325 */ 2326 ASSERT(dir_key->type == BTRFS_DIR_INDEX_KEY); 2327 2328 eb = path->nodes[0]; 2329 slot = path->slots[0]; 2330 di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item); 2331 name_len = btrfs_dir_name_len(eb, di); 2332 name = kmalloc(name_len, GFP_NOFS); 2333 if (!name) { 2334 ret = -ENOMEM; 2335 goto out; 2336 } 2337 2338 read_extent_buffer(eb, name, (unsigned long)(di + 1), name_len); 2339 2340 if (log) { 2341 struct btrfs_dir_item *log_di; 2342 2343 log_di = btrfs_lookup_dir_index_item(trans, log, log_path, 2344 dir_key->objectid, 2345 dir_key->offset, 2346 name, name_len, 0); 2347 if (IS_ERR(log_di)) { 2348 ret = PTR_ERR(log_di); 2349 goto out; 2350 } else if (log_di) { 2351 /* The dentry exists in the log, we have nothing to do. */ 2352 ret = 0; 2353 goto out; 2354 } 2355 } 2356 2357 btrfs_dir_item_key_to_cpu(eb, di, &location); 2358 btrfs_release_path(path); 2359 btrfs_release_path(log_path); 2360 inode = read_one_inode(root, location.objectid); 2361 if (!inode) { 2362 ret = -EIO; 2363 goto out; 2364 } 2365 2366 ret = link_to_fixup_dir(trans, root, path, location.objectid); 2367 if (ret) 2368 goto out; 2369 2370 inc_nlink(inode); 2371 ret = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(inode), name, 2372 name_len); 2373 if (ret) 2374 goto out; 2375 2376 ret = btrfs_run_delayed_items(trans); 2377 if (ret) 2378 goto out; 2379 2380 /* 2381 * Unlike dir item keys, dir index keys can only have one name (entry) in 2382 * them, as there are no key collisions since each key has a unique offset 2383 * (an index number), so we're done. 2384 */ 2385 out: 2386 btrfs_release_path(path); 2387 btrfs_release_path(log_path); 2388 kfree(name); 2389 iput(inode); 2390 return ret; 2391 } 2392 2393 static int replay_xattr_deletes(struct btrfs_trans_handle *trans, 2394 struct btrfs_root *root, 2395 struct btrfs_root *log, 2396 struct btrfs_path *path, 2397 const u64 ino) 2398 { 2399 struct btrfs_key search_key; 2400 struct btrfs_path *log_path; 2401 int i; 2402 int nritems; 2403 int ret; 2404 2405 log_path = btrfs_alloc_path(); 2406 if (!log_path) 2407 return -ENOMEM; 2408 2409 search_key.objectid = ino; 2410 search_key.type = BTRFS_XATTR_ITEM_KEY; 2411 search_key.offset = 0; 2412 again: 2413 ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); 2414 if (ret < 0) 2415 goto out; 2416 process_leaf: 2417 nritems = btrfs_header_nritems(path->nodes[0]); 2418 for (i = path->slots[0]; i < nritems; i++) { 2419 struct btrfs_key key; 2420 struct btrfs_dir_item *di; 2421 struct btrfs_dir_item *log_di; 2422 u32 total_size; 2423 u32 cur; 2424 2425 btrfs_item_key_to_cpu(path->nodes[0], &key, i); 2426 if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY) { 2427 ret = 0; 2428 goto out; 2429 } 2430 2431 di = btrfs_item_ptr(path->nodes[0], i, struct btrfs_dir_item); 2432 total_size = btrfs_item_size(path->nodes[0], i); 2433 cur = 0; 2434 while (cur < total_size) { 2435 u16 name_len = btrfs_dir_name_len(path->nodes[0], di); 2436 u16 data_len = btrfs_dir_data_len(path->nodes[0], di); 2437 u32 this_len = sizeof(*di) + name_len + data_len; 2438 char *name; 2439 2440 name = kmalloc(name_len, GFP_NOFS); 2441 if (!name) { 2442 ret = -ENOMEM; 2443 goto out; 2444 } 2445 read_extent_buffer(path->nodes[0], name, 2446 (unsigned long)(di + 1), name_len); 2447 2448 log_di = btrfs_lookup_xattr(NULL, log, log_path, ino, 2449 name, name_len, 0); 2450 btrfs_release_path(log_path); 2451 if (!log_di) { 2452 /* Doesn't exist in log tree, so delete it. 
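* The log is authoritative for this inode's xattrs, so an xattr
* present in the subvolume but absent from the log was removed
* before the fsync and must not survive replay.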
*/ 2453 btrfs_release_path(path); 2454 di = btrfs_lookup_xattr(trans, root, path, ino, 2455 name, name_len, -1); 2456 kfree(name); 2457 if (IS_ERR(di)) { 2458 ret = PTR_ERR(di); 2459 goto out; 2460 } 2461 ASSERT(di); 2462 ret = btrfs_delete_one_dir_name(trans, root, 2463 path, di); 2464 if (ret) 2465 goto out; 2466 btrfs_release_path(path); 2467 search_key = key; 2468 goto again; 2469 } 2470 kfree(name); 2471 if (IS_ERR(log_di)) { 2472 ret = PTR_ERR(log_di); 2473 goto out; 2474 } 2475 cur += this_len; 2476 di = (struct btrfs_dir_item *)((char *)di + this_len); 2477 } 2478 } 2479 ret = btrfs_next_leaf(root, path); 2480 if (ret > 0) 2481 ret = 0; 2482 else if (ret == 0) 2483 goto process_leaf; 2484 out: 2485 btrfs_free_path(log_path); 2486 btrfs_release_path(path); 2487 return ret; 2488 } 2489 2490 2491 /* 2492 * deletion replay happens before we copy any new directory items 2493 * out of the log or out of backreferences from inodes. It 2494 * scans the log to find ranges of keys that log is authoritative for, 2495 * and then scans the directory to find items in those ranges that are 2496 * not present in the log. 2497 * 2498 * Anything we don't find in the log is unlinked and removed from the 2499 * directory. 2500 */ 2501 static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, 2502 struct btrfs_root *root, 2503 struct btrfs_root *log, 2504 struct btrfs_path *path, 2505 u64 dirid, int del_all) 2506 { 2507 u64 range_start; 2508 u64 range_end; 2509 int ret = 0; 2510 struct btrfs_key dir_key; 2511 struct btrfs_key found_key; 2512 struct btrfs_path *log_path; 2513 struct inode *dir; 2514 2515 dir_key.objectid = dirid; 2516 dir_key.type = BTRFS_DIR_INDEX_KEY; 2517 log_path = btrfs_alloc_path(); 2518 if (!log_path) 2519 return -ENOMEM; 2520 2521 dir = read_one_inode(root, dirid); 2522 /* it isn't an error if the inode isn't there, that can happen 2523 * because we replay the deletes before we copy in the inode item 2524 * from the log 2525 */ 2526 if (!dir) { 2527 btrfs_free_path(log_path); 2528 return 0; 2529 } 2530 2531 range_start = 0; 2532 range_end = 0; 2533 while (1) { 2534 if (del_all) 2535 range_end = (u64)-1; 2536 else { 2537 ret = find_dir_range(log, path, dirid, 2538 &range_start, &range_end); 2539 if (ret < 0) 2540 goto out; 2541 else if (ret > 0) 2542 break; 2543 } 2544 2545 dir_key.offset = range_start; 2546 while (1) { 2547 int nritems; 2548 ret = btrfs_search_slot(NULL, root, &dir_key, path, 2549 0, 0); 2550 if (ret < 0) 2551 goto out; 2552 2553 nritems = btrfs_header_nritems(path->nodes[0]); 2554 if (path->slots[0] >= nritems) { 2555 ret = btrfs_next_leaf(root, path); 2556 if (ret == 1) 2557 break; 2558 else if (ret < 0) 2559 goto out; 2560 } 2561 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 2562 path->slots[0]); 2563 if (found_key.objectid != dirid || 2564 found_key.type != dir_key.type) { 2565 ret = 0; 2566 goto out; 2567 } 2568 2569 if (found_key.offset > range_end) 2570 break; 2571 2572 ret = check_item_in_log(trans, log, path, 2573 log_path, dir, 2574 &found_key); 2575 if (ret) 2576 goto out; 2577 if (found_key.offset == (u64)-1) 2578 break; 2579 dir_key.offset = found_key.offset + 1; 2580 } 2581 btrfs_release_path(path); 2582 if (range_end == (u64)-1) 2583 break; 2584 range_start = range_end + 1; 2585 } 2586 ret = 0; 2587 out: 2588 btrfs_release_path(path); 2589 btrfs_free_path(log_path); 2590 iput(dir); 2591 return ret; 2592 } 2593 2594 /* 2595 * the process_func used to replay items from the log tree. This 2596 * gets called in two different stages. 
The first stage just looks 2597 * for inodes and makes sure they are all copied into the subvolume. 2598 * 2599 * The second stage copies all the other item types from the log into 2600 * the subvolume. The two stage approach is slower, but gets rid of 2601 * lots of complexity around inodes referencing other inodes that exist 2602 * only in the log (references come from either directory items or inode 2603 * back refs). 2604 */ 2605 static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, 2606 struct walk_control *wc, u64 gen, int level) 2607 { 2608 int nritems; 2609 struct btrfs_path *path; 2610 struct btrfs_root *root = wc->replay_dest; 2611 struct btrfs_key key; 2612 int i; 2613 int ret; 2614 2615 ret = btrfs_read_buffer(eb, gen, level, NULL); 2616 if (ret) 2617 return ret; 2618 2619 level = btrfs_header_level(eb); 2620 2621 if (level != 0) 2622 return 0; 2623 2624 path = btrfs_alloc_path(); 2625 if (!path) 2626 return -ENOMEM; 2627 2628 nritems = btrfs_header_nritems(eb); 2629 for (i = 0; i < nritems; i++) { 2630 btrfs_item_key_to_cpu(eb, &key, i); 2631 2632 /* inode keys are done during the first stage */ 2633 if (key.type == BTRFS_INODE_ITEM_KEY && 2634 wc->stage == LOG_WALK_REPLAY_INODES) { 2635 struct btrfs_inode_item *inode_item; 2636 u32 mode; 2637 2638 inode_item = btrfs_item_ptr(eb, i, 2639 struct btrfs_inode_item); 2640 /* 2641 * If we have a tmpfile (O_TMPFILE) that got fsync'ed 2642 * and never got linked before the fsync, skip it, as 2643 * replaying it is pointless since it would be deleted 2644 * later. We skip logging tmpfiles, but it's always 2645 * possible we are replaying a log created with a kernel 2646 * that used to log tmpfiles. 2647 */ 2648 if (btrfs_inode_nlink(eb, inode_item) == 0) { 2649 wc->ignore_cur_inode = true; 2650 continue; 2651 } else { 2652 wc->ignore_cur_inode = false; 2653 } 2654 ret = replay_xattr_deletes(wc->trans, root, log, 2655 path, key.objectid); 2656 if (ret) 2657 break; 2658 mode = btrfs_inode_mode(eb, inode_item); 2659 if (S_ISDIR(mode)) { 2660 ret = replay_dir_deletes(wc->trans, 2661 root, log, path, key.objectid, 0); 2662 if (ret) 2663 break; 2664 } 2665 ret = overwrite_item(wc->trans, root, path, 2666 eb, i, &key); 2667 if (ret) 2668 break; 2669 2670 /* 2671 * Before replaying extents, truncate the inode to its 2672 * size. We need to do it now and not after log replay 2673 * because before an fsync we can have prealloc extents 2674 * added beyond the inode's i_size. If we did it after, 2675 * through orphan cleanup for example, we would drop 2676 * those prealloc extents just after replaying them. 2677 */ 2678 if (S_ISREG(mode)) { 2679 struct btrfs_drop_extents_args drop_args = { 0 }; 2680 struct inode *inode; 2681 u64 from; 2682 2683 inode = read_one_inode(root, key.objectid); 2684 if (!inode) { 2685 ret = -EIO; 2686 break; 2687 } 2688 from = ALIGN(i_size_read(inode), 2689 root->fs_info->sectorsize); 2690 drop_args.start = from; 2691 drop_args.end = (u64)-1; 2692 drop_args.drop_cache = true; 2693 ret = btrfs_drop_extents(wc->trans, root, 2694 BTRFS_I(inode), 2695 &drop_args); 2696 if (!ret) { 2697 inode_sub_bytes(inode, 2698 drop_args.bytes_found); 2699 /* Update the inode's nbytes. 
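* inode_sub_bytes() above only adjusted the in-memory value, so
* write the inode item back to make the change stick.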
*/ 2700 ret = btrfs_update_inode(wc->trans, 2701 root, BTRFS_I(inode)); 2702 } 2703 iput(inode); 2704 if (ret) 2705 break; 2706 } 2707 2708 ret = link_to_fixup_dir(wc->trans, root, 2709 path, key.objectid); 2710 if (ret) 2711 break; 2712 } 2713 2714 if (wc->ignore_cur_inode) 2715 continue; 2716 2717 if (key.type == BTRFS_DIR_INDEX_KEY && 2718 wc->stage == LOG_WALK_REPLAY_DIR_INDEX) { 2719 ret = replay_one_dir_item(wc->trans, root, path, 2720 eb, i, &key); 2721 if (ret) 2722 break; 2723 } 2724 2725 if (wc->stage < LOG_WALK_REPLAY_ALL) 2726 continue; 2727 2728 /* these keys are simply copied */ 2729 if (key.type == BTRFS_XATTR_ITEM_KEY) { 2730 ret = overwrite_item(wc->trans, root, path, 2731 eb, i, &key); 2732 if (ret) 2733 break; 2734 } else if (key.type == BTRFS_INODE_REF_KEY || 2735 key.type == BTRFS_INODE_EXTREF_KEY) { 2736 ret = add_inode_ref(wc->trans, root, log, path, 2737 eb, i, &key); 2738 if (ret && ret != -ENOENT) 2739 break; 2740 ret = 0; 2741 } else if (key.type == BTRFS_EXTENT_DATA_KEY) { 2742 ret = replay_one_extent(wc->trans, root, path, 2743 eb, i, &key); 2744 if (ret) 2745 break; 2746 } 2747 /* 2748 * We don't log BTRFS_DIR_ITEM_KEY keys anymore, only the 2749 * BTRFS_DIR_INDEX_KEY items which we use to derive the 2750 * BTRFS_DIR_ITEM_KEY items. If we are replaying a log from an 2751 * older kernel with such keys, ignore them. 2752 */ 2753 } 2754 btrfs_free_path(path); 2755 return ret; 2756 } 2757 2758 /* 2759 * Correctly adjust the reserved bytes occupied by a log tree extent buffer 2760 */ 2761 static void unaccount_log_buffer(struct btrfs_fs_info *fs_info, u64 start) 2762 { 2763 struct btrfs_block_group *cache; 2764 2765 cache = btrfs_lookup_block_group(fs_info, start); 2766 if (!cache) { 2767 btrfs_err(fs_info, "unable to find block group for %llu", start); 2768 return; 2769 } 2770 2771 spin_lock(&cache->space_info->lock); 2772 spin_lock(&cache->lock); 2773 cache->reserved -= fs_info->nodesize; 2774 cache->space_info->bytes_reserved -= fs_info->nodesize; 2775 spin_unlock(&cache->lock); 2776 spin_unlock(&cache->space_info->lock); 2777 2778 btrfs_put_block_group(cache); 2779 } 2780 2781 static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, 2782 struct btrfs_root *root, 2783 struct btrfs_path *path, int *level, 2784 struct walk_control *wc) 2785 { 2786 struct btrfs_fs_info *fs_info = root->fs_info; 2787 u64 bytenr; 2788 u64 ptr_gen; 2789 struct extent_buffer *next; 2790 struct extent_buffer *cur; 2791 u32 blocksize; 2792 int ret = 0; 2793 2794 while (*level > 0) { 2795 struct btrfs_key first_key; 2796 2797 cur = path->nodes[*level]; 2798 2799 WARN_ON(btrfs_header_level(cur) != *level); 2800 2801 if (path->slots[*level] >= 2802 btrfs_header_nritems(cur)) 2803 break; 2804 2805 bytenr = btrfs_node_blockptr(cur, path->slots[*level]); 2806 ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]); 2807 btrfs_node_key_to_cpu(cur, &first_key, path->slots[*level]); 2808 blocksize = fs_info->nodesize; 2809 2810 next = btrfs_find_create_tree_block(fs_info, bytenr, 2811 btrfs_header_owner(cur), 2812 *level - 1); 2813 if (IS_ERR(next)) 2814 return PTR_ERR(next); 2815 2816 if (*level == 1) { 2817 ret = wc->process_func(root, next, wc, ptr_gen, 2818 *level - 1); 2819 if (ret) { 2820 free_extent_buffer(next); 2821 return ret; 2822 } 2823 2824 path->slots[*level]++; 2825 if (wc->free) { 2826 ret = btrfs_read_buffer(next, ptr_gen, 2827 *level - 1, &first_key); 2828 if (ret) { 2829 free_extent_buffer(next); 2830 return ret; 2831 } 2832 2833 if (trans) { 2834 
btrfs_tree_lock(next);
2835 btrfs_clean_tree_block(next);
2836 btrfs_wait_tree_block_writeback(next);
2837 btrfs_tree_unlock(next);
2838 ret = btrfs_pin_reserved_extent(trans,
2839 bytenr, blocksize);
2840 if (ret) {
2841 free_extent_buffer(next);
2842 return ret;
2843 }
2844 btrfs_redirty_list_add(
2845 trans->transaction, next);
2846 } else {
2847 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags))
2848 clear_extent_buffer_dirty(next);
2849 unaccount_log_buffer(fs_info, bytenr);
2850 }
2851 }
2852 free_extent_buffer(next);
2853 continue;
2854 }
2855 ret = btrfs_read_buffer(next, ptr_gen, *level - 1, &first_key);
2856 if (ret) {
2857 free_extent_buffer(next);
2858 return ret;
2859 }
2860
2861 if (path->nodes[*level-1])
2862 free_extent_buffer(path->nodes[*level-1]);
2863 path->nodes[*level-1] = next;
2864 *level = btrfs_header_level(next);
2865 path->slots[*level] = 0;
2866 cond_resched();
2867 }
2868 path->slots[*level] = btrfs_header_nritems(path->nodes[*level]);
2869
2870 cond_resched();
2871 return 0;
2872 }
2873
2874 static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
2875 struct btrfs_root *root,
2876 struct btrfs_path *path, int *level,
2877 struct walk_control *wc)
2878 {
2879 struct btrfs_fs_info *fs_info = root->fs_info;
2880 int i;
2881 int slot;
2882 int ret;
2883
2884 for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) {
2885 slot = path->slots[i];
2886 if (slot + 1 < btrfs_header_nritems(path->nodes[i])) {
2887 path->slots[i]++;
2888 *level = i;
2889 WARN_ON(*level == 0);
2890 return 0;
2891 } else {
2892 ret = wc->process_func(root, path->nodes[*level], wc,
2893 btrfs_header_generation(path->nodes[*level]),
2894 *level);
2895 if (ret)
2896 return ret;
2897
2898 if (wc->free) {
2899 struct extent_buffer *next;
2900
2901 next = path->nodes[*level];
2902
2903 if (trans) {
2904 btrfs_tree_lock(next);
2905 btrfs_clean_tree_block(next);
2906 btrfs_wait_tree_block_writeback(next);
2907 btrfs_tree_unlock(next);
2908 ret = btrfs_pin_reserved_extent(trans,
2909 path->nodes[*level]->start,
2910 path->nodes[*level]->len);
2911 if (ret)
2912 return ret;
2913 btrfs_redirty_list_add(trans->transaction,
2914 next);
2915 } else {
2916 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags))
2917 clear_extent_buffer_dirty(next);
2918
2919 unaccount_log_buffer(fs_info,
2920 path->nodes[*level]->start);
2921 }
2922 }
2923 free_extent_buffer(path->nodes[*level]);
2924 path->nodes[*level] = NULL;
2925 *level = i + 1;
2926 }
2927 }
2928 return 1;
2929 }
2930
2931 /*
2932 * drop the reference count on the tree rooted at 'log'. This traverses
2933 * the tree freeing any blocks that have a ref count of zero after being
2934 * decremented.
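*
* Whether the visited blocks are also reclaimed depends on wc->free:
* when it is set, each block is pinned for the running transaction if
* a transaction handle was given, or unaccounted from its block group
* otherwise.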
2935 */ 2936 static int walk_log_tree(struct btrfs_trans_handle *trans, 2937 struct btrfs_root *log, struct walk_control *wc) 2938 { 2939 struct btrfs_fs_info *fs_info = log->fs_info; 2940 int ret = 0; 2941 int wret; 2942 int level; 2943 struct btrfs_path *path; 2944 int orig_level; 2945 2946 path = btrfs_alloc_path(); 2947 if (!path) 2948 return -ENOMEM; 2949 2950 level = btrfs_header_level(log->node); 2951 orig_level = level; 2952 path->nodes[level] = log->node; 2953 atomic_inc(&log->node->refs); 2954 path->slots[level] = 0; 2955 2956 while (1) { 2957 wret = walk_down_log_tree(trans, log, path, &level, wc); 2958 if (wret > 0) 2959 break; 2960 if (wret < 0) { 2961 ret = wret; 2962 goto out; 2963 } 2964 2965 wret = walk_up_log_tree(trans, log, path, &level, wc); 2966 if (wret > 0) 2967 break; 2968 if (wret < 0) { 2969 ret = wret; 2970 goto out; 2971 } 2972 } 2973 2974 /* was the root node processed? if not, catch it here */ 2975 if (path->nodes[orig_level]) { 2976 ret = wc->process_func(log, path->nodes[orig_level], wc, 2977 btrfs_header_generation(path->nodes[orig_level]), 2978 orig_level); 2979 if (ret) 2980 goto out; 2981 if (wc->free) { 2982 struct extent_buffer *next; 2983 2984 next = path->nodes[orig_level]; 2985 2986 if (trans) { 2987 btrfs_tree_lock(next); 2988 btrfs_clean_tree_block(next); 2989 btrfs_wait_tree_block_writeback(next); 2990 btrfs_tree_unlock(next); 2991 ret = btrfs_pin_reserved_extent(trans, 2992 next->start, next->len); 2993 if (ret) 2994 goto out; 2995 btrfs_redirty_list_add(trans->transaction, next); 2996 } else { 2997 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags)) 2998 clear_extent_buffer_dirty(next); 2999 unaccount_log_buffer(fs_info, next->start); 3000 } 3001 } 3002 } 3003 3004 out: 3005 btrfs_free_path(path); 3006 return ret; 3007 } 3008 3009 /* 3010 * helper function to update the item for a given subvolumes log root 3011 * in the tree of log roots 3012 */ 3013 static int update_log_root(struct btrfs_trans_handle *trans, 3014 struct btrfs_root *log, 3015 struct btrfs_root_item *root_item) 3016 { 3017 struct btrfs_fs_info *fs_info = log->fs_info; 3018 int ret; 3019 3020 if (log->log_transid == 1) { 3021 /* insert root item on the first sync */ 3022 ret = btrfs_insert_root(trans, fs_info->log_root_tree, 3023 &log->root_key, root_item); 3024 } else { 3025 ret = btrfs_update_root(trans, fs_info->log_root_tree, 3026 &log->root_key, root_item); 3027 } 3028 return ret; 3029 } 3030 3031 static void wait_log_commit(struct btrfs_root *root, int transid) 3032 { 3033 DEFINE_WAIT(wait); 3034 int index = transid % 2; 3035 3036 /* 3037 * we only allow two pending log transactions at a time, 3038 * so we know that if ours is more than 2 older than the 3039 * current transaction, we're done 3040 */ 3041 for (;;) { 3042 prepare_to_wait(&root->log_commit_wait[index], 3043 &wait, TASK_UNINTERRUPTIBLE); 3044 3045 if (!(root->log_transid_committed < transid && 3046 atomic_read(&root->log_commit[index]))) 3047 break; 3048 3049 mutex_unlock(&root->log_mutex); 3050 schedule(); 3051 mutex_lock(&root->log_mutex); 3052 } 3053 finish_wait(&root->log_commit_wait[index], &wait); 3054 } 3055 3056 static void wait_for_writer(struct btrfs_root *root) 3057 { 3058 DEFINE_WAIT(wait); 3059 3060 for (;;) { 3061 prepare_to_wait(&root->log_writer_wait, &wait, 3062 TASK_UNINTERRUPTIBLE); 3063 if (!atomic_read(&root->log_writers)) 3064 break; 3065 3066 mutex_unlock(&root->log_mutex); 3067 schedule(); 3068 mutex_lock(&root->log_mutex); 3069 } 3070 finish_wait(&root->log_writer_wait, 
&wait);
3071 }
3072
3073 static inline void btrfs_remove_log_ctx(struct btrfs_root *root,
3074 struct btrfs_log_ctx *ctx)
3075 {
3076 mutex_lock(&root->log_mutex);
3077 list_del_init(&ctx->list);
3078 mutex_unlock(&root->log_mutex);
3079 }
3080
3081 /*
3082 * Invoked in log mutex context, or the caller must otherwise ensure that
3083 * no other task can access the list.
3084 */
3085 static inline void btrfs_remove_all_log_ctxs(struct btrfs_root *root,
3086 int index, int error)
3087 {
3088 struct btrfs_log_ctx *ctx;
3089 struct btrfs_log_ctx *safe;
3090
3091 list_for_each_entry_safe(ctx, safe, &root->log_ctxs[index], list) {
3092 list_del_init(&ctx->list);
3093 ctx->log_ret = error;
3094 }
3095 }
3096
3097 /*
3098 * btrfs_sync_log sends a given tree log down to the disk and
3099 * updates the super blocks to record it. When this call is done,
3100 * you know that any inodes previously logged are safely on disk only
3101 * if it returns 0.
3102 *
3103 * Any other return value means you need to call btrfs_commit_transaction.
3104 * Some of the edge cases for fsyncing directories that have had unlinks
3105 * or renames done in the past mean that sometimes the only safe
3106 * fsync is to commit the whole FS. When btrfs_sync_log returns -EAGAIN,
3107 * that has happened.
3108 */
3109 int btrfs_sync_log(struct btrfs_trans_handle *trans,
3110 struct btrfs_root *root, struct btrfs_log_ctx *ctx)
3111 {
3112 int index1;
3113 int index2;
3114 int mark;
3115 int ret;
3116 struct btrfs_fs_info *fs_info = root->fs_info;
3117 struct btrfs_root *log = root->log_root;
3118 struct btrfs_root *log_root_tree = fs_info->log_root_tree;
3119 struct btrfs_root_item new_root_item;
3120 int log_transid = 0;
3121 struct btrfs_log_ctx root_log_ctx;
3122 struct blk_plug plug;
3123 u64 log_root_start;
3124 u64 log_root_level;
3125
3126 mutex_lock(&root->log_mutex);
3127 log_transid = ctx->log_transid;
3128 if (root->log_transid_committed >= log_transid) {
3129 mutex_unlock(&root->log_mutex);
3130 return ctx->log_ret;
3131 }
3132
3133 index1 = log_transid % 2;
3134 if (atomic_read(&root->log_commit[index1])) {
3135 wait_log_commit(root, log_transid);
3136 mutex_unlock(&root->log_mutex);
3137 return ctx->log_ret;
3138 }
3139 ASSERT(log_transid == root->log_transid);
3140 atomic_set(&root->log_commit[index1], 1);
3141
3142 /* wait for previous tree log sync to complete */
3143 if (atomic_read(&root->log_commit[(index1 + 1) % 2]))
3144 wait_log_commit(root, log_transid - 1);
3145
3146 while (1) {
3147 int batch = atomic_read(&root->log_batch);
3148 /* when we're on an ssd, just kick the log commit out */
3149 if (!btrfs_test_opt(fs_info, SSD) &&
3150 test_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state)) {
3151 mutex_unlock(&root->log_mutex);
3152 schedule_timeout_uninterruptible(1);
3153 mutex_lock(&root->log_mutex);
3154 }
3155 wait_for_writer(root);
3156 if (batch == atomic_read(&root->log_batch))
3157 break;
3158 }
3159
3160 /* bail out if we need to do a full commit */
3161 if (btrfs_need_log_full_commit(trans)) {
3162 ret = -EAGAIN;
3163 mutex_unlock(&root->log_mutex);
3164 goto out;
3165 }
3166
3167 if (log_transid % 2 == 0)
3168 mark = EXTENT_DIRTY;
3169 else
3170 mark = EXTENT_NEW;
3171
3172 /* we start IO on all the marked extents here, but we don't actually
3173 * wait for them until later.
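* The plug below batches these submissions; the waiting happens
* further down, once the log root tree has been updated.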
3174 */
3175 blk_start_plug(&plug);
3176 ret = btrfs_write_marked_extents(fs_info, &log->dirty_log_pages, mark);
3177 /*
3178 * -EAGAIN happens when someone, e.g., a concurrent transaction
3179 * commit, writes a dirty extent in this tree-log commit. This
3180 * concurrent write will create a hole in the extents being written
3181 * out, and we cannot proceed on a zoned filesystem, which requires
3182 * sequential writing. While we could bail out to a full commit
3183 * here, we instead continue, hoping that the concurrent writing
3184 * fills the hole.
3185 */
3186 if (ret == -EAGAIN && btrfs_is_zoned(fs_info))
3187 ret = 0;
3188 if (ret) {
3189 blk_finish_plug(&plug);
3190 btrfs_abort_transaction(trans, ret);
3191 btrfs_set_log_full_commit(trans);
3192 mutex_unlock(&root->log_mutex);
3193 goto out;
3194 }
3195
3196 /*
3197 * We _must_ update under the root->log_mutex in order to make sure we
3198 * have a consistent view of the log root we are trying to commit at
3199 * this moment.
3200 *
3201 * We _must_ copy this into a local copy, because we are not holding the
3202 * log_root_tree->log_mutex yet. This is important because when we
3203 * commit the log_root_tree we must have a consistent view of the
3204 * log_root_tree when we update the super block to point at the
3205 * log_root_tree bytenr. If we update the log_root_tree here we'll race
3206 * with the commit and possibly point at the new block which we may not
3207 * have written out.
3208 */
3209 btrfs_set_root_node(&log->root_item, log->node);
3210 memcpy(&new_root_item, &log->root_item, sizeof(new_root_item));
3211
3212 root->log_transid++;
3213 log->log_transid = root->log_transid;
3214 root->log_start_pid = 0;
3215 /*
3216 * IO has been started, blocks of the log tree have WRITTEN flag set
3217 * in their headers. New modifications of the log will be written to
3218 * new positions, so it's safe to allow log writers to go in.
3219 */
3220 mutex_unlock(&root->log_mutex);
3221
3222 if (btrfs_is_zoned(fs_info)) {
3223 mutex_lock(&fs_info->tree_root->log_mutex);
3224 if (!log_root_tree->node) {
3225 ret = btrfs_alloc_log_tree_node(trans, log_root_tree);
3226 if (ret) {
3227 mutex_unlock(&fs_info->tree_root->log_mutex);
3228 goto out;
3229 }
3230 }
3231 mutex_unlock(&fs_info->tree_root->log_mutex);
3232 }
3233
3234 btrfs_init_log_ctx(&root_log_ctx, NULL);
3235
3236 mutex_lock(&log_root_tree->log_mutex);
3237
3238 index2 = log_root_tree->log_transid % 2;
3239 list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]);
3240 root_log_ctx.log_transid = log_root_tree->log_transid;
3241
3242 /*
3243 * Now we are safe to update the log_root_tree because we're under the
3244 * log_mutex, and we're a current writer so we're holding the commit
3245 * open until we drop the log_mutex.
3246 */
3247 ret = update_log_root(trans, log, &new_root_item);
3248 if (ret) {
3249 if (!list_empty(&root_log_ctx.list))
3250 list_del_init(&root_log_ctx.list);
3251
3252 blk_finish_plug(&plug);
3253 btrfs_set_log_full_commit(trans);
3254
3255 if (ret != -ENOSPC) {
3256 btrfs_abort_transaction(trans, ret);
3257 mutex_unlock(&log_root_tree->log_mutex);
3258 goto out;
3259 }
3260 btrfs_wait_tree_log_extents(log, mark);
3261 mutex_unlock(&log_root_tree->log_mutex);
3262 ret = -EAGAIN;
3263 goto out;
3264 }
3265
3266 if (log_root_tree->log_transid_committed >= root_log_ctx.log_transid) {
3267 blk_finish_plug(&plug);
3268 list_del_init(&root_log_ctx.list);
3269 mutex_unlock(&log_root_tree->log_mutex);
3270 ret = root_log_ctx.log_ret;
3271 goto out;
3272 }
3273
3274 index2 = root_log_ctx.log_transid % 2;
3275 if (atomic_read(&log_root_tree->log_commit[index2])) {
3276 blk_finish_plug(&plug);
3277 ret = btrfs_wait_tree_log_extents(log, mark);
3278 wait_log_commit(log_root_tree,
3279 root_log_ctx.log_transid);
3280 mutex_unlock(&log_root_tree->log_mutex);
3281 if (!ret)
3282 ret = root_log_ctx.log_ret;
3283 goto out;
3284 }
3285 ASSERT(root_log_ctx.log_transid == log_root_tree->log_transid);
3286 atomic_set(&log_root_tree->log_commit[index2], 1);
3287
3288 if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) {
3289 wait_log_commit(log_root_tree,
3290 root_log_ctx.log_transid - 1);
3291 }
3292
3293 /*
3294 * now that we've moved on to the tree of log tree roots,
3295 * check the full commit flag again
3296 */
3297 if (btrfs_need_log_full_commit(trans)) {
3298 blk_finish_plug(&plug);
3299 btrfs_wait_tree_log_extents(log, mark);
3300 mutex_unlock(&log_root_tree->log_mutex);
3301 ret = -EAGAIN;
3302 goto out_wake_log_root;
3303 }
3304
3305 ret = btrfs_write_marked_extents(fs_info,
3306 &log_root_tree->dirty_log_pages,
3307 EXTENT_DIRTY | EXTENT_NEW);
3308 blk_finish_plug(&plug);
3309 /*
3310 * As described above, -EAGAIN indicates a hole in the extents. We
3311 * cannot wait for these write outs since waiting would cause a
3312 * deadlock. Bail out to the full commit instead.
3313 */
3314 if (ret == -EAGAIN && btrfs_is_zoned(fs_info)) {
3315 btrfs_set_log_full_commit(trans);
3316 btrfs_wait_tree_log_extents(log, mark);
3317 mutex_unlock(&log_root_tree->log_mutex);
3318 goto out_wake_log_root;
3319 } else if (ret) {
3320 btrfs_set_log_full_commit(trans);
3321 btrfs_abort_transaction(trans, ret);
3322 mutex_unlock(&log_root_tree->log_mutex);
3323 goto out_wake_log_root;
3324 }
3325 ret = btrfs_wait_tree_log_extents(log, mark);
3326 if (!ret)
3327 ret = btrfs_wait_tree_log_extents(log_root_tree,
3328 EXTENT_NEW | EXTENT_DIRTY);
3329 if (ret) {
3330 btrfs_set_log_full_commit(trans);
3331 mutex_unlock(&log_root_tree->log_mutex);
3332 goto out_wake_log_root;
3333 }
3334
3335 log_root_start = log_root_tree->node->start;
3336 log_root_level = btrfs_header_level(log_root_tree->node);
3337 log_root_tree->log_transid++;
3338 mutex_unlock(&log_root_tree->log_mutex);
3339
3340 /*
3341 * Here we are guaranteed that nobody is going to write the superblock
3342 * for the current transaction before us and that we do not write
3343 * our superblock before the previous transaction finishes its commit
3344 * and writes its superblock, because:
3345 *
3346 * 1) We are holding a handle on the current transaction, so nobody
3347 * can commit it until we release the handle;
3348 *
3349 * 2) Before writing our superblock we acquire the tree_log_mutex, so
3350 * if the previous transaction is still committing, and hasn't yet
3351 * written its superblock, we wait for it to do it, because a
3352 * transaction commit acquires the tree_log_mutex when the commit
3353 * begins and releases it only after writing its superblock.
3354 */
3355 mutex_lock(&fs_info->tree_log_mutex);
3356
3357 /*
3358 * The previous transaction writeout phase could have failed, and thus
3359 * marked the fs in an error state. We must not commit here, as we
3360 * could have updated our generation in the super_for_commit and
3361 * writing the super here would result in transid mismatches. If there
3362 * is an error here just bail.
3363 */
3364 if (BTRFS_FS_ERROR(fs_info)) {
3365 ret = -EIO;
3366 btrfs_set_log_full_commit(trans);
3367 btrfs_abort_transaction(trans, ret);
3368 mutex_unlock(&fs_info->tree_log_mutex);
3369 goto out_wake_log_root;
3370 }
3371
3372 btrfs_set_super_log_root(fs_info->super_for_commit, log_root_start);
3373 btrfs_set_super_log_root_level(fs_info->super_for_commit, log_root_level);
3374 ret = write_all_supers(fs_info, 1);
3375 mutex_unlock(&fs_info->tree_log_mutex);
3376 if (ret) {
3377 btrfs_set_log_full_commit(trans);
3378 btrfs_abort_transaction(trans, ret);
3379 goto out_wake_log_root;
3380 }
3381
3382 /*
3383 * We know there can only be one task here, since we have not yet set
3384 * root->log_commit[index1] to 0 and any task attempting to sync the
3385 * log must wait for the previous log transaction to commit if it's
3386 * still in progress or wait for the current log transaction commit if
3387 * someone else already started it. We use <= and not < because the
3388 * first log transaction has an ID of 0.
3389 */ 3390 ASSERT(root->last_log_commit <= log_transid); 3391 root->last_log_commit = log_transid; 3392 3393 out_wake_log_root: 3394 mutex_lock(&log_root_tree->log_mutex); 3395 btrfs_remove_all_log_ctxs(log_root_tree, index2, ret); 3396 3397 log_root_tree->log_transid_committed++; 3398 atomic_set(&log_root_tree->log_commit[index2], 0); 3399 mutex_unlock(&log_root_tree->log_mutex); 3400 3401 /* 3402 * The barrier before waitqueue_active (in cond_wake_up) is needed so 3403 * all the updates above are seen by the woken threads. It might not be 3404 * necessary, but proving that seems to be hard. 3405 */ 3406 cond_wake_up(&log_root_tree->log_commit_wait[index2]); 3407 out: 3408 mutex_lock(&root->log_mutex); 3409 btrfs_remove_all_log_ctxs(root, index1, ret); 3410 root->log_transid_committed++; 3411 atomic_set(&root->log_commit[index1], 0); 3412 mutex_unlock(&root->log_mutex); 3413 3414 /* 3415 * The barrier before waitqueue_active (in cond_wake_up) is needed so 3416 * all the updates above are seen by the woken threads. It might not be 3417 * necessary, but proving that seems to be hard. 3418 */ 3419 cond_wake_up(&root->log_commit_wait[index1]); 3420 return ret; 3421 } 3422 3423 static void free_log_tree(struct btrfs_trans_handle *trans, 3424 struct btrfs_root *log) 3425 { 3426 int ret; 3427 struct walk_control wc = { 3428 .free = 1, 3429 .process_func = process_one_buffer 3430 }; 3431 3432 if (log->node) { 3433 ret = walk_log_tree(trans, log, &wc); 3434 if (ret) { 3435 /* 3436 * We weren't able to traverse the entire log tree, the 3437 * typical scenario is getting an -EIO when reading an 3438 * extent buffer of the tree, due to a previous writeback 3439 * failure of it. 3440 */ 3441 set_bit(BTRFS_FS_STATE_LOG_CLEANUP_ERROR, 3442 &log->fs_info->fs_state); 3443 3444 /* 3445 * Some extent buffers of the log tree may still be dirty 3446 * and not yet written back to storage, because we may 3447 * have updates to a log tree without syncing a log tree, 3448 * such as during rename and link operations. So flush 3449 * them out and wait for their writeback to complete, so 3450 * that we properly cleanup their state and pages. 3451 */ 3452 btrfs_write_marked_extents(log->fs_info, 3453 &log->dirty_log_pages, 3454 EXTENT_DIRTY | EXTENT_NEW); 3455 btrfs_wait_tree_log_extents(log, 3456 EXTENT_DIRTY | EXTENT_NEW); 3457 3458 if (trans) 3459 btrfs_abort_transaction(trans, ret); 3460 else 3461 btrfs_handle_fs_error(log->fs_info, ret, NULL); 3462 } 3463 } 3464 3465 clear_extent_bits(&log->dirty_log_pages, 0, (u64)-1, 3466 EXTENT_DIRTY | EXTENT_NEW | EXTENT_NEED_WAIT); 3467 extent_io_tree_release(&log->log_csum_range); 3468 3469 btrfs_put_root(log); 3470 } 3471 3472 /* 3473 * free all the extents used by the tree log. This should be called 3474 * at commit time of the full transaction 3475 */ 3476 int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root) 3477 { 3478 if (root->log_root) { 3479 free_log_tree(trans, root->log_root); 3480 root->log_root = NULL; 3481 clear_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state); 3482 } 3483 return 0; 3484 } 3485 3486 int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans, 3487 struct btrfs_fs_info *fs_info) 3488 { 3489 if (fs_info->log_root_tree) { 3490 free_log_tree(trans, fs_info->log_root_tree); 3491 fs_info->log_root_tree = NULL; 3492 clear_bit(BTRFS_ROOT_HAS_LOG_TREE, &fs_info->tree_root->state); 3493 } 3494 return 0; 3495 } 3496 3497 /* 3498 * Check if an inode was logged in the current transaction. 
This may often
3499 * return some false positives, because logged_trans is an in-memory only field,
3500 * not persisted anywhere. This is meant to be used in contexts where a false
3501 * positive has no functional consequences.
3502 */
3503 static bool inode_logged(struct btrfs_trans_handle *trans,
3504 struct btrfs_inode *inode)
3505 {
3506 if (inode->logged_trans == trans->transid)
3507 return true;
3508
3509 if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &inode->root->state))
3510 return false;
3511
3512 /*
3513 * The inode's logged_trans is always 0 when we load it (because it is
3514 * not persisted in the inode item or elsewhere). So if it is 0 and the
3515 * inode was last modified in the current transaction, then the inode may
3516 * have been logged before in the current transaction, then evicted and
3517 * loaded again in the current transaction - or it may have never been
3518 * logged in the current transaction. Since we cannot be sure, we have to
3519 * assume it was, otherwise our callers can leave an inconsistent log.
3520 */
3521 if (inode->logged_trans == 0 &&
3522 inode->last_trans == trans->transid &&
3523 !test_bit(BTRFS_FS_LOG_RECOVERING, &trans->fs_info->flags))
3524 return true;
3525
3526 return false;
3527 }
3528
3529 /*
3530 * If both a file and directory are logged, and unlinks or renames are
3531 * mixed in, we have a few interesting corners:
3532 *
3533 * create file X in dir Y
3534 * link file X to X.link in dir Y
3535 * fsync file X
3536 * unlink file X but leave X.link
3537 * fsync dir Y
3538 *
3539 * After a crash we would expect only X.link to exist. But file X
3540 * didn't get fsync'd again so the log has back refs for X and X.link.
3541 *
3542 * We solve this by removing directory entries and inode backrefs from the
3543 * log when a file that was logged in the current transaction is
3544 * unlinked. Any later fsync will include the updated log entries, and
3545 * we'll be able to reconstruct the proper directory items from backrefs.
3546 *
3547 * This optimization allows us to avoid relogging the entire inode
3548 * or the entire directory.
3549 */
3550 void btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
3551 struct btrfs_root *root,
3552 const char *name, int name_len,
3553 struct btrfs_inode *dir, u64 index)
3554 {
3555 struct btrfs_root *log;
3556 struct btrfs_dir_item *di;
3557 struct btrfs_path *path;
3558 int ret;
3559 int err = 0;
3560 u64 dir_ino = btrfs_ino(dir);
3561
3562 if (!inode_logged(trans, dir))
3563 return;
3564
3565 ret = join_running_log_trans(root);
3566 if (ret)
3567 return;
3568
3569 mutex_lock(&dir->log_mutex);
3570
3571 log = root->log_root;
3572 path = btrfs_alloc_path();
3573 if (!path) {
3574 err = -ENOMEM;
3575 goto out_unlock;
3576 }
3577
3578 /*
3579 * We only log dir index items of a directory, so we don't need to look
3580 * for dir item keys.
3581 */
3582 di = btrfs_lookup_dir_index_item(trans, log, path, dir_ino,
3583 index, name, name_len, -1);
3584 if (IS_ERR(di)) {
3585 err = PTR_ERR(di);
3586 goto fail;
3587 }
3588 if (di) {
3589 ret = btrfs_delete_one_dir_name(trans, log, path, di);
3590 if (ret) {
3591 err = ret;
3592 goto fail;
3593 }
3594 }
3595
3596 /*
3597 * We do not need to update the size field of the directory's inode item
3598 * because on log replay we update the field to reflect all existing
3599 * entries in the directory (see overwrite_item()).
3600 */ 3601 fail: 3602 btrfs_free_path(path); 3603 out_unlock: 3604 mutex_unlock(&dir->log_mutex); 3605 if (err < 0) 3606 btrfs_set_log_full_commit(trans); 3607 btrfs_end_log_trans(root); 3608 } 3609 3610 /* see comments for btrfs_del_dir_entries_in_log */ 3611 void btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, 3612 struct btrfs_root *root, 3613 const char *name, int name_len, 3614 struct btrfs_inode *inode, u64 dirid) 3615 { 3616 struct btrfs_root *log; 3617 u64 index; 3618 int ret; 3619 3620 if (!inode_logged(trans, inode)) 3621 return; 3622 3623 ret = join_running_log_trans(root); 3624 if (ret) 3625 return; 3626 log = root->log_root; 3627 mutex_lock(&inode->log_mutex); 3628 3629 ret = btrfs_del_inode_ref(trans, log, name, name_len, btrfs_ino(inode), 3630 dirid, &index); 3631 mutex_unlock(&inode->log_mutex); 3632 if (ret < 0 && ret != -ENOENT) 3633 btrfs_set_log_full_commit(trans); 3634 btrfs_end_log_trans(root); 3635 } 3636 3637 /* 3638 * creates a range item in the log for 'dirid'. first_offset and 3639 * last_offset tell us which parts of the key space the log should 3640 * be considered authoritative for. 3641 */ 3642 static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans, 3643 struct btrfs_root *log, 3644 struct btrfs_path *path, 3645 u64 dirid, 3646 u64 first_offset, u64 last_offset) 3647 { 3648 int ret; 3649 struct btrfs_key key; 3650 struct btrfs_dir_log_item *item; 3651 3652 key.objectid = dirid; 3653 key.offset = first_offset; 3654 key.type = BTRFS_DIR_LOG_INDEX_KEY; 3655 ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item)); 3656 if (ret) 3657 return ret; 3658 3659 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 3660 struct btrfs_dir_log_item); 3661 btrfs_set_dir_log_end(path->nodes[0], item, last_offset); 3662 btrfs_mark_buffer_dirty(path->nodes[0]); 3663 btrfs_release_path(path); 3664 return 0; 3665 } 3666 3667 static int flush_dir_items_batch(struct btrfs_trans_handle *trans, 3668 struct btrfs_root *log, 3669 struct extent_buffer *src, 3670 struct btrfs_path *dst_path, 3671 int start_slot, 3672 int count) 3673 { 3674 char *ins_data = NULL; 3675 struct btrfs_item_batch batch; 3676 struct extent_buffer *dst; 3677 unsigned long src_offset; 3678 unsigned long dst_offset; 3679 struct btrfs_key key; 3680 u32 item_size; 3681 int ret; 3682 int i; 3683 3684 ASSERT(count > 0); 3685 batch.nr = count; 3686 3687 if (count == 1) { 3688 btrfs_item_key_to_cpu(src, &key, start_slot); 3689 item_size = btrfs_item_size(src, start_slot); 3690 batch.keys = &key; 3691 batch.data_sizes = &item_size; 3692 batch.total_data_size = item_size; 3693 } else { 3694 struct btrfs_key *ins_keys; 3695 u32 *ins_sizes; 3696 3697 ins_data = kmalloc(count * sizeof(u32) + 3698 count * sizeof(struct btrfs_key), GFP_NOFS); 3699 if (!ins_data) 3700 return -ENOMEM; 3701 3702 ins_sizes = (u32 *)ins_data; 3703 ins_keys = (struct btrfs_key *)(ins_data + count * sizeof(u32)); 3704 batch.keys = ins_keys; 3705 batch.data_sizes = ins_sizes; 3706 batch.total_data_size = 0; 3707 3708 for (i = 0; i < count; i++) { 3709 const int slot = start_slot + i; 3710 3711 btrfs_item_key_to_cpu(src, &ins_keys[i], slot); 3712 ins_sizes[i] = btrfs_item_size(src, slot); 3713 batch.total_data_size += ins_sizes[i]; 3714 } 3715 } 3716 3717 ret = btrfs_insert_empty_items(trans, log, dst_path, &batch); 3718 if (ret) 3719 goto out; 3720 3721 dst = dst_path->nodes[0]; 3722 /* 3723 * Copy all the items in bulk, in a single copy operation. 
Item data is
3724 * organized such that it's placed at the end of a leaf and from right
3725 * to left. For example, the data for the second item ends at an offset
3726 * that matches the offset where the data for the first item starts, the
3727 * data for the third item ends at an offset that matches the offset
3728 * where the data of the second item starts, and so on.
3729 * Therefore our source and destination start offsets for copy match the
3730 * offsets of the last items (highest slots).
3731 */
3732 dst_offset = btrfs_item_ptr_offset(dst, dst_path->slots[0] + count - 1);
3733 src_offset = btrfs_item_ptr_offset(src, start_slot + count - 1);
3734 copy_extent_buffer(dst, src, dst_offset, src_offset, batch.total_data_size);
3735 btrfs_release_path(dst_path);
3736 out:
3737 kfree(ins_data);
3738
3739 return ret;
3740 }
3741
3742 static int process_dir_items_leaf(struct btrfs_trans_handle *trans,
3743 struct btrfs_inode *inode,
3744 struct btrfs_path *path,
3745 struct btrfs_path *dst_path,
3746 struct btrfs_log_ctx *ctx)
3747 {
3748 struct btrfs_root *log = inode->root->log_root;
3749 struct extent_buffer *src = path->nodes[0];
3750 const int nritems = btrfs_header_nritems(src);
3751 const u64 ino = btrfs_ino(inode);
3752 const bool inode_logged_before = inode_logged(trans, inode);
3753 bool last_found = false;
3754 int batch_start = 0;
3755 int batch_size = 0;
3756 int i;
3757
3758 for (i = path->slots[0]; i < nritems; i++) {
3759 struct btrfs_key key;
3760 int ret;
3761
3762 btrfs_item_key_to_cpu(src, &key, i);
3763
3764 if (key.objectid != ino || key.type != BTRFS_DIR_INDEX_KEY) {
3765 last_found = true;
3766 break;
3767 }
3768
3769 ctx->last_dir_item_offset = key.offset;
3770 /*
3771 * We must make sure that when we log a directory entry, the
3772 * corresponding inode, after log replay, has a matching link
3773 * count. For example:
3774 *
3775 * touch foo
3776 * mkdir mydir
3777 * sync
3778 * ln foo mydir/bar
3779 * xfs_io -c "fsync" mydir
3780 * <crash>
3781 * <mount fs and log replay>
3782 *
3783 * Would result in an fsync log that, when replayed, leaves our file
3784 * inode with a link count of 1, but with two directory entries
3785 * pointing to the same inode. After removing one of the names,
3786 * it would not be possible to remove the other name, which
3787 * always resulted in stale file handle errors, and it would not be
3788 * possible to rmdir the parent directory, since its i_size could
3789 * never be decremented to the value BTRFS_EMPTY_DIR_SIZE,
3790 * resulting in -ENOTEMPTY errors.
3791 */
3792 if (!ctx->log_new_dentries) {
3793 struct btrfs_dir_item *di;
3794 struct btrfs_key di_key;
3795
3796 di = btrfs_item_ptr(src, i, struct btrfs_dir_item);
3797 btrfs_dir_item_key_to_cpu(src, di, &di_key);
3798 if ((btrfs_dir_transid(src, di) == trans->transid ||
3799 btrfs_dir_type(src, di) == BTRFS_FT_DIR) &&
3800 di_key.type != BTRFS_ROOT_ITEM_KEY)
3801 ctx->log_new_dentries = true;
3802 }
3803
3804 if (!inode_logged_before)
3805 goto add_to_batch;
3806
3807 /*
3808 * If we were logged before and have logged dir items, we can skip
3809 * checking if any item with a key offset larger than the last one
3810 * we logged is in the log tree, saving time and avoiding adding
3811 * contention on the log tree.
3812 */
3813 if (key.offset > inode->last_dir_index_offset)
3814 goto add_to_batch;
3815 /*
3816 * Check if the key was already logged before. If not we can add
3817 * it to a batch for bulk insertion.
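* Batching matters because every separate insertion pays for a full
* btree descent; flushing a run of consecutive new keys with a single
* btrfs_insert_empty_items() call amortizes that cost.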
3818 */ 3819 ret = btrfs_search_slot(NULL, log, &key, dst_path, 0, 0); 3820 if (ret < 0) { 3821 return ret; 3822 } else if (ret > 0) { 3823 btrfs_release_path(dst_path); 3824 goto add_to_batch; 3825 } 3826 3827 /* 3828 * Item exists in the log. Overwrite the item in the log if it 3829 * has different content or do nothing if it has exactly the same 3830 * content. And then flush the current batch if any - do it after 3831 * overwriting the current item, or we would deadlock otherwise, 3832 * since we are holding a path for the existing item. 3833 */ 3834 ret = do_overwrite_item(trans, log, dst_path, src, i, &key); 3835 if (ret < 0) 3836 return ret; 3837 3838 if (batch_size > 0) { 3839 ret = flush_dir_items_batch(trans, log, src, dst_path, 3840 batch_start, batch_size); 3841 if (ret < 0) 3842 return ret; 3843 batch_size = 0; 3844 } 3845 continue; 3846 add_to_batch: 3847 if (batch_size == 0) 3848 batch_start = i; 3849 batch_size++; 3850 } 3851 3852 if (batch_size > 0) { 3853 int ret; 3854 3855 ret = flush_dir_items_batch(trans, log, src, dst_path, 3856 batch_start, batch_size); 3857 if (ret < 0) 3858 return ret; 3859 } 3860 3861 return last_found ? 1 : 0; 3862 } 3863 3864 /* 3865 * log all the items included in the current transaction for a given 3866 * directory. This also creates the range items in the log tree required 3867 * to replay anything deleted before the fsync 3868 */ 3869 static noinline int log_dir_items(struct btrfs_trans_handle *trans, 3870 struct btrfs_inode *inode, 3871 struct btrfs_path *path, 3872 struct btrfs_path *dst_path, 3873 struct btrfs_log_ctx *ctx, 3874 u64 min_offset, u64 *last_offset_ret) 3875 { 3876 struct btrfs_key min_key; 3877 struct btrfs_root *root = inode->root; 3878 struct btrfs_root *log = root->log_root; 3879 int err = 0; 3880 int ret; 3881 u64 first_offset = min_offset; 3882 u64 last_offset = (u64)-1; 3883 u64 ino = btrfs_ino(inode); 3884 3885 min_key.objectid = ino; 3886 min_key.type = BTRFS_DIR_INDEX_KEY; 3887 min_key.offset = min_offset; 3888 3889 ret = btrfs_search_forward(root, &min_key, path, trans->transid); 3890 3891 /* 3892 * we didn't find anything from this transaction, see if there 3893 * is anything at all 3894 */ 3895 if (ret != 0 || min_key.objectid != ino || 3896 min_key.type != BTRFS_DIR_INDEX_KEY) { 3897 min_key.objectid = ino; 3898 min_key.type = BTRFS_DIR_INDEX_KEY; 3899 min_key.offset = (u64)-1; 3900 btrfs_release_path(path); 3901 ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); 3902 if (ret < 0) { 3903 btrfs_release_path(path); 3904 return ret; 3905 } 3906 ret = btrfs_previous_item(root, path, ino, BTRFS_DIR_INDEX_KEY); 3907 3908 /* if ret == 0 there are items for this type, 3909 * create a range to tell us the last key of this type. 3910 * otherwise, there are no items in this directory after 3911 * *min_offset, and we create a range to indicate that. 
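* In that case the inserted range runs from first_offset up to
* (u64)-1, marking the whole tail of the index key space as covered.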
3912 */
3913 if (ret == 0) {
3914 struct btrfs_key tmp;
3915 btrfs_item_key_to_cpu(path->nodes[0], &tmp,
3916 path->slots[0]);
3917 if (tmp.type == BTRFS_DIR_INDEX_KEY)
3918 first_offset = max(min_offset, tmp.offset) + 1;
3919 }
3920 goto done;
3921 }
3922
3923 /* go backward to find any previous key */
3924 ret = btrfs_previous_item(root, path, ino, BTRFS_DIR_INDEX_KEY);
3925 if (ret == 0) {
3926 struct btrfs_key tmp;
3927 btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
3928 if (tmp.type == BTRFS_DIR_INDEX_KEY) {
3929 first_offset = tmp.offset;
3930 ret = overwrite_item(trans, log, dst_path,
3931 path->nodes[0], path->slots[0],
3932 &tmp);
3933 if (ret) {
3934 err = ret;
3935 goto done;
3936 }
3937 }
3938 }
3939 btrfs_release_path(path);
3940
3941 /*
3942 * Find the first key from this transaction again. See the note for
3943 * log_new_dir_dentries, if we're logging a directory recursively we
3944 * won't be holding its i_mutex, which means we can modify the directory
3945 * while we're logging it. If we remove an entry between our first
3946 * search and this search we'll not find the key again and can just
3947 * bail.
3948 */
3949 search:
3950 ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
3951 if (ret != 0)
3952 goto done;
3953
3954 /*
3955 * we have a block from this transaction, log every item in it
3956 * from our directory
3957 */
3958 while (1) {
3959 ret = process_dir_items_leaf(trans, inode, path, dst_path, ctx);
3960 if (ret != 0) {
3961 if (ret < 0)
3962 err = ret;
3963 goto done;
3964 }
3965 path->slots[0] = btrfs_header_nritems(path->nodes[0]);
3966
3967 /*
3968 * look ahead to the next item and see if it is also
3969 * from this directory and from this transaction
3970 */
3971 ret = btrfs_next_leaf(root, path);
3972 if (ret) {
3973 if (ret == 1)
3974 last_offset = (u64)-1;
3975 else
3976 err = ret;
3977 goto done;
3978 }
3979 btrfs_item_key_to_cpu(path->nodes[0], &min_key, path->slots[0]);
3980 if (min_key.objectid != ino || min_key.type != BTRFS_DIR_INDEX_KEY) {
3981 last_offset = (u64)-1;
3982 goto done;
3983 }
3984 if (btrfs_header_generation(path->nodes[0]) != trans->transid) {
3985 ctx->last_dir_item_offset = min_key.offset;
3986 ret = overwrite_item(trans, log, dst_path,
3987 path->nodes[0], path->slots[0],
3988 &min_key);
3989 if (ret)
3990 err = ret;
3991 else
3992 last_offset = min_key.offset;
3993 goto done;
3994 }
3995 if (need_resched()) {
3996 btrfs_release_path(path);
3997 cond_resched();
3998 goto search;
3999 }
4000 }
4001 done:
4002 btrfs_release_path(path);
4003 btrfs_release_path(dst_path);
4004
4005 if (err == 0) {
4006 *last_offset_ret = last_offset;
4007 /*
4008 * insert the log range keys to indicate where the log
4009 * is valid
4010 */
4011 ret = insert_dir_log_key(trans, log, path, ino, first_offset,
4012 last_offset);
4013 if (ret)
4014 err = ret;
4015 }
4016 return err;
4017 }
4018
4019 /*
4020 * Logging directories is very similar to logging inodes. We find all the
4021 * items from the current transaction and write them to the log.
4022 *
4023 * The recovery code scans the directory in the subvolume, and if it finds a
4024 * key in the range logged that is not present in the log tree, then it means
4025 * that dir entry was unlinked during the transaction.
4026 *
4027 * In order for that scan to work, we must include one key smaller than
4028 * the smallest logged by this transaction and one key larger than the largest
4029 * key logged by this transaction.
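 *
 * For example (illustrative): if this transaction touched dir index
 * offsets 10 to 20, the logged range extends past both ends, so when
 * the recovery code finds index offset 15 in the subvolume but not in
 * the log it can safely conclude that entry was unlinked, while
 * entries outside the range are left alone.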
4030 */ 4031 static noinline int log_directory_changes(struct btrfs_trans_handle *trans, 4032 struct btrfs_inode *inode, 4033 struct btrfs_path *path, 4034 struct btrfs_path *dst_path, 4035 struct btrfs_log_ctx *ctx) 4036 { 4037 u64 min_key; 4038 u64 max_key; 4039 int ret; 4040 4041 /* 4042 * If this is the first time we are being logged in the current 4043 * transaction, or we were logged before but the inode was evicted and 4044 * reloaded later, in which case its logged_trans is 0, reset the value 4045 * of the last logged key offset. Note that we don't use the helper 4046 * function inode_logged() here - that is because the function returns 4047 * true after an inode eviction, assuming the worst case as it can not 4048 * know for sure if the inode was logged before. So we can not skip key 4049 * searches in the case the inode was evicted, because it may not have 4050 * been logged in this transaction and may have been logged in a past 4051 * transaction, so we need to reset the last dir index offset to (u64)-1. 4052 */ 4053 if (inode->logged_trans != trans->transid) 4054 inode->last_dir_index_offset = (u64)-1; 4055 4056 min_key = 0; 4057 max_key = 0; 4058 ctx->last_dir_item_offset = inode->last_dir_index_offset; 4059 4060 while (1) { 4061 ret = log_dir_items(trans, inode, path, dst_path, 4062 ctx, min_key, &max_key); 4063 if (ret) 4064 return ret; 4065 if (max_key == (u64)-1) 4066 break; 4067 min_key = max_key + 1; 4068 } 4069 4070 inode->last_dir_index_offset = ctx->last_dir_item_offset; 4071 4072 return 0; 4073 } 4074 4075 /* 4076 * a helper function to drop items from the log before we relog an 4077 * inode. max_key_type indicates the highest item type to remove. 4078 * This cannot be run for file data extents because it does not 4079 * free the extents they point to. 4080 */ 4081 static int drop_inode_items(struct btrfs_trans_handle *trans, 4082 struct btrfs_root *log, 4083 struct btrfs_path *path, 4084 struct btrfs_inode *inode, 4085 int max_key_type) 4086 { 4087 int ret; 4088 struct btrfs_key key; 4089 struct btrfs_key found_key; 4090 int start_slot; 4091 4092 if (!inode_logged(trans, inode)) 4093 return 0; 4094 4095 key.objectid = btrfs_ino(inode); 4096 key.type = max_key_type; 4097 key.offset = (u64)-1; 4098 4099 while (1) { 4100 ret = btrfs_search_slot(trans, log, &key, path, -1, 1); 4101 BUG_ON(ret == 0); /* Logic error */ 4102 if (ret < 0) 4103 break; 4104 4105 if (path->slots[0] == 0) 4106 break; 4107 4108 path->slots[0]--; 4109 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 4110 path->slots[0]); 4111 4112 if (found_key.objectid != key.objectid) 4113 break; 4114 4115 found_key.offset = 0; 4116 found_key.type = 0; 4117 ret = btrfs_bin_search(path->nodes[0], &found_key, &start_slot); 4118 if (ret < 0) 4119 break; 4120 4121 ret = btrfs_del_items(trans, log, path, start_slot, 4122 path->slots[0] - start_slot + 1); 4123 /* 4124 * If start slot isn't 0 then we don't need to re-search, we've 4125 * found the last guy with the objectid in this tree. 
4126 */
4127 if (ret || start_slot != 0)
4128 break;
4129 btrfs_release_path(path);
4130 }
4131 btrfs_release_path(path);
4132 if (ret > 0)
4133 ret = 0;
4134 return ret;
4135 }
4136
4137 static int truncate_inode_items(struct btrfs_trans_handle *trans,
4138 struct btrfs_root *log_root,
4139 struct btrfs_inode *inode,
4140 u64 new_size, u32 min_type)
4141 {
4142 struct btrfs_truncate_control control = {
4143 .new_size = new_size,
4144 .ino = btrfs_ino(inode),
4145 .min_type = min_type,
4146 .skip_ref_updates = true,
4147 };
4148
4149 return btrfs_truncate_inode_items(trans, log_root, &control);
4150 }
4151
4152 static void fill_inode_item(struct btrfs_trans_handle *trans,
4153 struct extent_buffer *leaf,
4154 struct btrfs_inode_item *item,
4155 struct inode *inode, int log_inode_only,
4156 u64 logged_isize)
4157 {
4158 struct btrfs_map_token token;
4159 u64 flags;
4160
4161 btrfs_init_map_token(&token, leaf);
4162
4163 if (log_inode_only) {
4164 /* set the generation to zero so the recovery code
4165 * can tell the difference between logging
4166 * just to say 'this inode exists' and logging
4167 * to say 'update this inode with these values'
4168 */
4169 btrfs_set_token_inode_generation(&token, item, 0);
4170 btrfs_set_token_inode_size(&token, item, logged_isize);
4171 } else {
4172 btrfs_set_token_inode_generation(&token, item,
4173 BTRFS_I(inode)->generation);
4174 btrfs_set_token_inode_size(&token, item, inode->i_size);
4175 }
4176
4177 btrfs_set_token_inode_uid(&token, item, i_uid_read(inode));
4178 btrfs_set_token_inode_gid(&token, item, i_gid_read(inode));
4179 btrfs_set_token_inode_mode(&token, item, inode->i_mode);
4180 btrfs_set_token_inode_nlink(&token, item, inode->i_nlink);
4181
4182 btrfs_set_token_timespec_sec(&token, &item->atime,
4183 inode->i_atime.tv_sec);
4184 btrfs_set_token_timespec_nsec(&token, &item->atime,
4185 inode->i_atime.tv_nsec);
4186
4187 btrfs_set_token_timespec_sec(&token, &item->mtime,
4188 inode->i_mtime.tv_sec);
4189 btrfs_set_token_timespec_nsec(&token, &item->mtime,
4190 inode->i_mtime.tv_nsec);
4191
4192 btrfs_set_token_timespec_sec(&token, &item->ctime,
4193 inode->i_ctime.tv_sec);
4194 btrfs_set_token_timespec_nsec(&token, &item->ctime,
4195 inode->i_ctime.tv_nsec);
4196
4197 /*
4198 * We do not need to set the nbytes field, in fact during a fast fsync
4199 * its value may not even be correct, since a fast fsync does not wait
4200 * for ordered extent completion, which is where we update nbytes; it
4201 * only waits for writeback to complete. During log replay as we find
4202 * file extent items and replay them, we adjust the nbytes field of the
4203 * inode item in the subvolume tree as needed (see overwrite_item()).
4204 */ 4205 4206 btrfs_set_token_inode_sequence(&token, item, inode_peek_iversion(inode)); 4207 btrfs_set_token_inode_transid(&token, item, trans->transid); 4208 btrfs_set_token_inode_rdev(&token, item, inode->i_rdev); 4209 flags = btrfs_inode_combine_flags(BTRFS_I(inode)->flags, 4210 BTRFS_I(inode)->ro_flags); 4211 btrfs_set_token_inode_flags(&token, item, flags); 4212 btrfs_set_token_inode_block_group(&token, item, 0); 4213 } 4214 4215 static int log_inode_item(struct btrfs_trans_handle *trans, 4216 struct btrfs_root *log, struct btrfs_path *path, 4217 struct btrfs_inode *inode, bool inode_item_dropped) 4218 { 4219 struct btrfs_inode_item *inode_item; 4220 int ret; 4221 4222 /* 4223 * If we are doing a fast fsync and the inode was logged before in the 4224 * current transaction, then we know the inode was previously logged and 4225 * it exists in the log tree. For performance reasons, in this case use 4226 * btrfs_search_slot() directly with ins_len set to 0 so that we never 4227 * attempt a write lock on the leaf's parent, which adds unnecessary lock 4228 * contention in case there are concurrent fsyncs for other inodes of the 4229 * same subvolume. Using btrfs_insert_empty_item() when the inode item 4230 * already exists can also result in unnecessarily splitting a leaf. 4231 */ 4232 if (!inode_item_dropped && inode->logged_trans == trans->transid) { 4233 ret = btrfs_search_slot(trans, log, &inode->location, path, 0, 1); 4234 ASSERT(ret <= 0); 4235 if (ret > 0) 4236 ret = -ENOENT; 4237 } else { 4238 /* 4239 * This means it is the first fsync in the current transaction, 4240 * so the inode item is not in the log and we need to insert it. 4241 * We can never get -EEXIST because we are only called for a fast 4242 * fsync and in case an inode eviction happens after the inode was 4243 * logged before in the current transaction, when we load again 4244 * the inode, we set BTRFS_INODE_NEEDS_FULL_SYNC on its runtime 4245 * flags and set ->logged_trans to 0. 4246 */ 4247 ret = btrfs_insert_empty_item(trans, log, path, &inode->location, 4248 sizeof(*inode_item)); 4249 ASSERT(ret != -EEXIST); 4250 } 4251 if (ret) 4252 return ret; 4253 inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0], 4254 struct btrfs_inode_item); 4255 fill_inode_item(trans, path->nodes[0], inode_item, &inode->vfs_inode, 4256 0, 0); 4257 btrfs_release_path(path); 4258 return 0; 4259 } 4260 4261 static int log_csums(struct btrfs_trans_handle *trans, 4262 struct btrfs_inode *inode, 4263 struct btrfs_root *log_root, 4264 struct btrfs_ordered_sum *sums) 4265 { 4266 const u64 lock_end = sums->bytenr + sums->len - 1; 4267 struct extent_state *cached_state = NULL; 4268 int ret; 4269 4270 /* 4271 * If this inode was not used for reflink operations in the current 4272 * transaction with new extents, then do the fast path, no need to 4273 * worry about logging checksum items with overlapping ranges. 4274 */ 4275 if (inode->last_reflink_trans < trans->transid) 4276 return btrfs_csum_file_blocks(trans, log_root, sums); 4277 4278 /* 4279 * Serialize logging for checksums. This is to avoid racing with the 4280 * same checksum being logged by another task that is logging another 4281 * file which happens to refer to the same extent as well. Such races 4282 * can leave checksum items in the log with overlapping ranges. 
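 *
 * An illustrative scenario (not from the original comment): files A
 * and B share an extent through reflink; two tasks fsync A and B
 * concurrently and both try to copy the csums for the shared extent
 * into the log tree. Without the extent range lock taken below, both
 * could insert csum items covering overlapping byte ranges.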
4283 */ 4284 ret = lock_extent_bits(&log_root->log_csum_range, sums->bytenr, 4285 lock_end, &cached_state); 4286 if (ret) 4287 return ret; 4288 /* 4289 * Due to extent cloning, we might have logged a csum item that covers a 4290 * subrange of a cloned extent, and later we can end up logging a csum 4291 * item for a larger subrange of the same extent or the entire range. 4292 * This would leave csum items in the log tree that cover the same range 4293 * and break the searches for checksums in the log tree, resulting in 4294 * some checksums missing in the fs/subvolume tree. So just delete (or 4295 * trim and adjust) any existing csum items in the log for this range. 4296 */ 4297 ret = btrfs_del_csums(trans, log_root, sums->bytenr, sums->len); 4298 if (!ret) 4299 ret = btrfs_csum_file_blocks(trans, log_root, sums); 4300 4301 unlock_extent_cached(&log_root->log_csum_range, sums->bytenr, lock_end, 4302 &cached_state); 4303 4304 return ret; 4305 } 4306 4307 static noinline int copy_items(struct btrfs_trans_handle *trans, 4308 struct btrfs_inode *inode, 4309 struct btrfs_path *dst_path, 4310 struct btrfs_path *src_path, 4311 int start_slot, int nr, int inode_only, 4312 u64 logged_isize) 4313 { 4314 struct btrfs_fs_info *fs_info = trans->fs_info; 4315 unsigned long src_offset; 4316 unsigned long dst_offset; 4317 struct btrfs_root *log = inode->root->log_root; 4318 struct btrfs_file_extent_item *extent; 4319 struct btrfs_inode_item *inode_item; 4320 struct extent_buffer *src = src_path->nodes[0]; 4321 int ret; 4322 struct btrfs_key *ins_keys; 4323 u32 *ins_sizes; 4324 struct btrfs_item_batch batch; 4325 char *ins_data; 4326 int i; 4327 struct list_head ordered_sums; 4328 int skip_csum = inode->flags & BTRFS_INODE_NODATASUM; 4329 4330 INIT_LIST_HEAD(&ordered_sums); 4331 4332 ins_data = kmalloc(nr * sizeof(struct btrfs_key) + 4333 nr * sizeof(u32), GFP_NOFS); 4334 if (!ins_data) 4335 return -ENOMEM; 4336 4337 ins_sizes = (u32 *)ins_data; 4338 ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32)); 4339 batch.keys = ins_keys; 4340 batch.data_sizes = ins_sizes; 4341 batch.total_data_size = 0; 4342 batch.nr = nr; 4343 4344 for (i = 0; i < nr; i++) { 4345 ins_sizes[i] = btrfs_item_size(src, i + start_slot); 4346 batch.total_data_size += ins_sizes[i]; 4347 btrfs_item_key_to_cpu(src, ins_keys + i, i + start_slot); 4348 } 4349 ret = btrfs_insert_empty_items(trans, log, dst_path, &batch); 4350 if (ret) { 4351 kfree(ins_data); 4352 return ret; 4353 } 4354 4355 for (i = 0; i < nr; i++, dst_path->slots[0]++) { 4356 dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0], 4357 dst_path->slots[0]); 4358 4359 src_offset = btrfs_item_ptr_offset(src, start_slot + i); 4360 4361 if (ins_keys[i].type == BTRFS_INODE_ITEM_KEY) { 4362 inode_item = btrfs_item_ptr(dst_path->nodes[0], 4363 dst_path->slots[0], 4364 struct btrfs_inode_item); 4365 fill_inode_item(trans, dst_path->nodes[0], inode_item, 4366 &inode->vfs_inode, 4367 inode_only == LOG_INODE_EXISTS, 4368 logged_isize); 4369 } else { 4370 copy_extent_buffer(dst_path->nodes[0], src, dst_offset, 4371 src_offset, ins_sizes[i]); 4372 } 4373 4374 /* take a reference on file data extents so that truncates 4375 * or deletes of this inode don't have to relog the inode 4376 * again 4377 */ 4378 if (ins_keys[i].type == BTRFS_EXTENT_DATA_KEY && 4379 !skip_csum) { 4380 int found_type; 4381 extent = btrfs_item_ptr(src, start_slot + i, 4382 struct btrfs_file_extent_item); 4383 4384 if (btrfs_file_extent_generation(src, extent) < trans->transid) 4385 continue; 4386 4387 
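/*
 * Descriptive note (added commentary): for a regular extent below,
 * ds/dl are the extent's disk bytenr and disk length, and cs/cl select
 * the referenced slice of it - [ds + cs, ds + cs + cl - 1] - whose
 * checksums must be copied into the log. For compressed extents the
 * whole on-disk extent is referenced, hence cs = 0 and cl = dl.
 */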
found_type = btrfs_file_extent_type(src, extent); 4388 if (found_type == BTRFS_FILE_EXTENT_REG) { 4389 struct btrfs_root *csum_root; 4390 u64 ds, dl, cs, cl; 4391 ds = btrfs_file_extent_disk_bytenr(src, 4392 extent); 4393 /* ds == 0 is a hole */ 4394 if (ds == 0) 4395 continue; 4396 4397 dl = btrfs_file_extent_disk_num_bytes(src, 4398 extent); 4399 cs = btrfs_file_extent_offset(src, extent); 4400 cl = btrfs_file_extent_num_bytes(src, 4401 extent); 4402 if (btrfs_file_extent_compression(src, 4403 extent)) { 4404 cs = 0; 4405 cl = dl; 4406 } 4407 4408 csum_root = btrfs_csum_root(fs_info, ds); 4409 ret = btrfs_lookup_csums_range(csum_root, 4410 ds + cs, ds + cs + cl - 1, 4411 &ordered_sums, 0); 4412 if (ret) 4413 break; 4414 } 4415 } 4416 } 4417 4418 btrfs_mark_buffer_dirty(dst_path->nodes[0]); 4419 btrfs_release_path(dst_path); 4420 kfree(ins_data); 4421 4422 /* 4423 * we have to do this after the loop above to avoid changing the 4424 * log tree while trying to change the log tree. 4425 */ 4426 while (!list_empty(&ordered_sums)) { 4427 struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next, 4428 struct btrfs_ordered_sum, 4429 list); 4430 if (!ret) 4431 ret = log_csums(trans, inode, log, sums); 4432 list_del(&sums->list); 4433 kfree(sums); 4434 } 4435 4436 return ret; 4437 } 4438 4439 static int extent_cmp(void *priv, const struct list_head *a, 4440 const struct list_head *b) 4441 { 4442 const struct extent_map *em1, *em2; 4443 4444 em1 = list_entry(a, struct extent_map, list); 4445 em2 = list_entry(b, struct extent_map, list); 4446 4447 if (em1->start < em2->start) 4448 return -1; 4449 else if (em1->start > em2->start) 4450 return 1; 4451 return 0; 4452 } 4453 4454 static int log_extent_csums(struct btrfs_trans_handle *trans, 4455 struct btrfs_inode *inode, 4456 struct btrfs_root *log_root, 4457 const struct extent_map *em, 4458 struct btrfs_log_ctx *ctx) 4459 { 4460 struct btrfs_ordered_extent *ordered; 4461 struct btrfs_root *csum_root; 4462 u64 csum_offset; 4463 u64 csum_len; 4464 u64 mod_start = em->mod_start; 4465 u64 mod_len = em->mod_len; 4466 LIST_HEAD(ordered_sums); 4467 int ret = 0; 4468 4469 if (inode->flags & BTRFS_INODE_NODATASUM || 4470 test_bit(EXTENT_FLAG_PREALLOC, &em->flags) || 4471 em->block_start == EXTENT_MAP_HOLE) 4472 return 0; 4473 4474 list_for_each_entry(ordered, &ctx->ordered_extents, log_list) { 4475 const u64 ordered_end = ordered->file_offset + ordered->num_bytes; 4476 const u64 mod_end = mod_start + mod_len; 4477 struct btrfs_ordered_sum *sums; 4478 4479 if (mod_len == 0) 4480 break; 4481 4482 if (ordered_end <= mod_start) 4483 continue; 4484 if (mod_end <= ordered->file_offset) 4485 break; 4486 4487 /* 4488 * We are going to copy all the csums on this ordered extent, so 4489 * go ahead and adjust mod_start and mod_len in case this ordered 4490 * extent has already been logged. 4491 */ 4492 if (ordered->file_offset > mod_start) { 4493 if (ordered_end >= mod_end) 4494 mod_len = ordered->file_offset - mod_start; 4495 /* 4496 * If we have this case 4497 * 4498 * |--------- logged extent ---------| 4499 * |----- ordered extent ----| 4500 * 4501 * Just don't mess with mod_start and mod_len, we'll 4502 * just end up logging more csums than we need and it 4503 * will be ok. 
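 *
 * The else branch below handles the mirrored case (illustration):
 *
 *     |--------- logged extent ---------|
 * |----- ordered extent ----|
 *
 * where the ordered extent covers the front of the logged range, so
 * we advance mod_start to the end of the ordered extent and shrink
 * mod_len accordingly (or set mod_len to 0 if the ordered extent
 * covers the whole remaining range).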
4504 */ 4505 } else { 4506 if (ordered_end < mod_end) { 4507 mod_len = mod_end - ordered_end; 4508 mod_start = ordered_end; 4509 } else { 4510 mod_len = 0; 4511 } 4512 } 4513 4514 /* 4515 * To keep us from looping for the above case of an ordered 4516 * extent that falls inside of the logged extent. 4517 */ 4518 if (test_and_set_bit(BTRFS_ORDERED_LOGGED_CSUM, &ordered->flags)) 4519 continue; 4520 4521 list_for_each_entry(sums, &ordered->list, list) { 4522 ret = log_csums(trans, inode, log_root, sums); 4523 if (ret) 4524 return ret; 4525 } 4526 } 4527 4528 /* We're done, found all csums in the ordered extents. */ 4529 if (mod_len == 0) 4530 return 0; 4531 4532 /* If we're compressed we have to save the entire range of csums. */ 4533 if (em->compress_type) { 4534 csum_offset = 0; 4535 csum_len = max(em->block_len, em->orig_block_len); 4536 } else { 4537 csum_offset = mod_start - em->start; 4538 csum_len = mod_len; 4539 } 4540 4541 /* block start is already adjusted for the file extent offset. */ 4542 csum_root = btrfs_csum_root(trans->fs_info, em->block_start); 4543 ret = btrfs_lookup_csums_range(csum_root, 4544 em->block_start + csum_offset, 4545 em->block_start + csum_offset + 4546 csum_len - 1, &ordered_sums, 0); 4547 if (ret) 4548 return ret; 4549 4550 while (!list_empty(&ordered_sums)) { 4551 struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next, 4552 struct btrfs_ordered_sum, 4553 list); 4554 if (!ret) 4555 ret = log_csums(trans, inode, log_root, sums); 4556 list_del(&sums->list); 4557 kfree(sums); 4558 } 4559 4560 return ret; 4561 } 4562 4563 static int log_one_extent(struct btrfs_trans_handle *trans, 4564 struct btrfs_inode *inode, 4565 const struct extent_map *em, 4566 struct btrfs_path *path, 4567 struct btrfs_log_ctx *ctx) 4568 { 4569 struct btrfs_drop_extents_args drop_args = { 0 }; 4570 struct btrfs_root *log = inode->root->log_root; 4571 struct btrfs_file_extent_item *fi; 4572 struct extent_buffer *leaf; 4573 struct btrfs_map_token token; 4574 struct btrfs_key key; 4575 u64 extent_offset = em->start - em->orig_start; 4576 u64 block_len; 4577 int ret; 4578 4579 ret = log_extent_csums(trans, inode, log, em, ctx); 4580 if (ret) 4581 return ret; 4582 4583 /* 4584 * If this is the first time we are logging the inode in the current 4585 * transaction, we can avoid btrfs_drop_extents(), which is expensive 4586 * because it does a deletion search, which always acquires write locks 4587 * for extent buffers at levels 2, 1 and 0. This not only wastes time 4588 * but also adds significant contention in a log tree, since log trees 4589 * are small, with a root at level 2 or 3 at most, due to their short 4590 * life span. 
4591 */ 4592 if (inode_logged(trans, inode)) { 4593 drop_args.path = path; 4594 drop_args.start = em->start; 4595 drop_args.end = em->start + em->len; 4596 drop_args.replace_extent = true; 4597 drop_args.extent_item_size = sizeof(*fi); 4598 ret = btrfs_drop_extents(trans, log, inode, &drop_args); 4599 if (ret) 4600 return ret; 4601 } 4602 4603 if (!drop_args.extent_inserted) { 4604 key.objectid = btrfs_ino(inode); 4605 key.type = BTRFS_EXTENT_DATA_KEY; 4606 key.offset = em->start; 4607 4608 ret = btrfs_insert_empty_item(trans, log, path, &key, 4609 sizeof(*fi)); 4610 if (ret) 4611 return ret; 4612 } 4613 leaf = path->nodes[0]; 4614 btrfs_init_map_token(&token, leaf); 4615 fi = btrfs_item_ptr(leaf, path->slots[0], 4616 struct btrfs_file_extent_item); 4617 4618 btrfs_set_token_file_extent_generation(&token, fi, trans->transid); 4619 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) 4620 btrfs_set_token_file_extent_type(&token, fi, 4621 BTRFS_FILE_EXTENT_PREALLOC); 4622 else 4623 btrfs_set_token_file_extent_type(&token, fi, 4624 BTRFS_FILE_EXTENT_REG); 4625 4626 block_len = max(em->block_len, em->orig_block_len); 4627 if (em->compress_type != BTRFS_COMPRESS_NONE) { 4628 btrfs_set_token_file_extent_disk_bytenr(&token, fi, 4629 em->block_start); 4630 btrfs_set_token_file_extent_disk_num_bytes(&token, fi, block_len); 4631 } else if (em->block_start < EXTENT_MAP_LAST_BYTE) { 4632 btrfs_set_token_file_extent_disk_bytenr(&token, fi, 4633 em->block_start - 4634 extent_offset); 4635 btrfs_set_token_file_extent_disk_num_bytes(&token, fi, block_len); 4636 } else { 4637 btrfs_set_token_file_extent_disk_bytenr(&token, fi, 0); 4638 btrfs_set_token_file_extent_disk_num_bytes(&token, fi, 0); 4639 } 4640 4641 btrfs_set_token_file_extent_offset(&token, fi, extent_offset); 4642 btrfs_set_token_file_extent_num_bytes(&token, fi, em->len); 4643 btrfs_set_token_file_extent_ram_bytes(&token, fi, em->ram_bytes); 4644 btrfs_set_token_file_extent_compression(&token, fi, em->compress_type); 4645 btrfs_set_token_file_extent_encryption(&token, fi, 0); 4646 btrfs_set_token_file_extent_other_encoding(&token, fi, 0); 4647 btrfs_mark_buffer_dirty(leaf); 4648 4649 btrfs_release_path(path); 4650 4651 return ret; 4652 } 4653 4654 /* 4655 * Log all prealloc extents beyond the inode's i_size to make sure we do not 4656 * lose them after doing a full/fast fsync and replaying the log. We scan the 4657 * subvolume's root instead of iterating the inode's extent map tree because 4658 * otherwise we can log incorrect extent items based on extent map conversion. 4659 * That can happen due to the fact that extent maps are merged when they 4660 * are not in the extent map tree's list of modified extents. 
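 *
 * An illustrative reproducer (assumed commands, in the style of the
 * other examples in this file):
 *
 * xfs_io -f -c "falloc -k 0 1M" foo    (prealloc beyond i_size)
 * xfs_io -c "pwrite 0 4K" -c "fsync" foo
 * <power fail and log replay>
 *
 * Without logging the prealloc extents past eof here, replay would
 * leave the file without the preallocated range beyond 4K.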
4661 */ 4662 static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans, 4663 struct btrfs_inode *inode, 4664 struct btrfs_path *path) 4665 { 4666 struct btrfs_root *root = inode->root; 4667 struct btrfs_key key; 4668 const u64 i_size = i_size_read(&inode->vfs_inode); 4669 const u64 ino = btrfs_ino(inode); 4670 struct btrfs_path *dst_path = NULL; 4671 bool dropped_extents = false; 4672 u64 truncate_offset = i_size; 4673 struct extent_buffer *leaf; 4674 int slot; 4675 int ins_nr = 0; 4676 int start_slot; 4677 int ret; 4678 4679 if (!(inode->flags & BTRFS_INODE_PREALLOC)) 4680 return 0; 4681 4682 key.objectid = ino; 4683 key.type = BTRFS_EXTENT_DATA_KEY; 4684 key.offset = i_size; 4685 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 4686 if (ret < 0) 4687 goto out; 4688 4689 /* 4690 * We must check if there is a prealloc extent that starts before the 4691 * i_size and crosses the i_size boundary. This is to ensure later we 4692 * truncate down to the end of that extent and not to the i_size, as 4693 * otherwise we end up losing part of the prealloc extent after a log 4694 * replay and with an implicit hole if there is another prealloc extent 4695 * that starts at an offset beyond i_size. 4696 */ 4697 ret = btrfs_previous_item(root, path, ino, BTRFS_EXTENT_DATA_KEY); 4698 if (ret < 0) 4699 goto out; 4700 4701 if (ret == 0) { 4702 struct btrfs_file_extent_item *ei; 4703 4704 leaf = path->nodes[0]; 4705 slot = path->slots[0]; 4706 ei = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); 4707 4708 if (btrfs_file_extent_type(leaf, ei) == 4709 BTRFS_FILE_EXTENT_PREALLOC) { 4710 u64 extent_end; 4711 4712 btrfs_item_key_to_cpu(leaf, &key, slot); 4713 extent_end = key.offset + 4714 btrfs_file_extent_num_bytes(leaf, ei); 4715 4716 if (extent_end > i_size) 4717 truncate_offset = extent_end; 4718 } 4719 } else { 4720 ret = 0; 4721 } 4722 4723 while (true) { 4724 leaf = path->nodes[0]; 4725 slot = path->slots[0]; 4726 4727 if (slot >= btrfs_header_nritems(leaf)) { 4728 if (ins_nr > 0) { 4729 ret = copy_items(trans, inode, dst_path, path, 4730 start_slot, ins_nr, 1, 0); 4731 if (ret < 0) 4732 goto out; 4733 ins_nr = 0; 4734 } 4735 ret = btrfs_next_leaf(root, path); 4736 if (ret < 0) 4737 goto out; 4738 if (ret > 0) { 4739 ret = 0; 4740 break; 4741 } 4742 continue; 4743 } 4744 4745 btrfs_item_key_to_cpu(leaf, &key, slot); 4746 if (key.objectid > ino) 4747 break; 4748 if (WARN_ON_ONCE(key.objectid < ino) || 4749 key.type < BTRFS_EXTENT_DATA_KEY || 4750 key.offset < i_size) { 4751 path->slots[0]++; 4752 continue; 4753 } 4754 if (!dropped_extents) { 4755 /* 4756 * Avoid logging extent items logged in past fsync calls 4757 * and leading to duplicate keys in the log tree. 
4758 */
4759 ret = truncate_inode_items(trans, root->log_root, inode,
4760 truncate_offset,
4761 BTRFS_EXTENT_DATA_KEY);
4762 if (ret)
4763 goto out;
4764 dropped_extents = true;
4765 }
4766 if (ins_nr == 0)
4767 start_slot = slot;
4768 ins_nr++;
4769 path->slots[0]++;
4770 if (!dst_path) {
4771 dst_path = btrfs_alloc_path();
4772 if (!dst_path) {
4773 ret = -ENOMEM;
4774 goto out;
4775 }
4776 }
4777 }
4778 if (ins_nr > 0)
4779 ret = copy_items(trans, inode, dst_path, path,
4780 start_slot, ins_nr, 1, 0);
4781 out:
4782 btrfs_release_path(path);
4783 btrfs_free_path(dst_path);
4784 return ret;
4785 }
4786
4787 static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
4788 struct btrfs_inode *inode,
4789 struct btrfs_path *path,
4790 struct btrfs_log_ctx *ctx)
4791 {
4792 struct btrfs_ordered_extent *ordered;
4793 struct btrfs_ordered_extent *tmp;
4794 struct extent_map *em, *n;
4795 struct list_head extents;
4796 struct extent_map_tree *tree = &inode->extent_tree;
4797 int ret = 0;
4798 int num = 0;
4799
4800 INIT_LIST_HEAD(&extents);
4801
4802 write_lock(&tree->lock);
4803
4804 list_for_each_entry_safe(em, n, &tree->modified_extents, list) {
4805 list_del_init(&em->list);
4806 /*
4807 * Just an arbitrary number; this can be really CPU intensive
4808 * once we start getting a lot of extents, and really once we
4809 * have a bunch of extents we just want to commit since it will
4810 * be faster.
4811 */
4812 if (++num > 32768) {
4813 list_del_init(&tree->modified_extents);
4814 ret = -EFBIG;
4815 goto process;
4816 }
4817
4818 if (em->generation < trans->transid)
4819 continue;
4820
4821 /* We log prealloc extents beyond eof later. */
4822 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) &&
4823 em->start >= i_size_read(&inode->vfs_inode))
4824 continue;
4825
4826 /* Need a ref to keep it from getting evicted from cache */
4827 refcount_inc(&em->refs);
4828 set_bit(EXTENT_FLAG_LOGGING, &em->flags);
4829 list_add_tail(&em->list, &extents);
4830 num++;
4831 }
4832
4833 list_sort(NULL, &extents, extent_cmp);
4834 process:
4835 while (!list_empty(&extents)) {
4836 em = list_entry(extents.next, struct extent_map, list);
4837
4838 list_del_init(&em->list);
4839
4840 /*
4841 * If we had an error we just need to delete everybody from our
4842 * private list.
4843 */
4844 if (ret) {
4845 clear_em_logging(tree, em);
4846 free_extent_map(em);
4847 continue;
4848 }
4849
4850 write_unlock(&tree->lock);
4851
4852 ret = log_one_extent(trans, inode, em, path, ctx);
4853 write_lock(&tree->lock);
4854 clear_em_logging(tree, em);
4855 free_extent_map(em);
4856 }
4857 WARN_ON(!list_empty(&extents));
4858 write_unlock(&tree->lock);
4859
4860 btrfs_release_path(path);
4861 if (!ret)
4862 ret = btrfs_log_prealloc_extents(trans, inode, path);
4863 if (ret)
4864 return ret;
4865
4866 /*
4867 * We have logged all extents successfully, now make sure the commit of
4868 * the current transaction waits for the ordered extents to complete
4869 * before it commits and wipes out the log trees, otherwise we would
4870 * lose data if an ordered extent completes after the transaction
4871 * commits and a power failure happens after the transaction commit.
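 *
 * Illustrative sequence of the race being closed here: 1) fsync logs
 * the extent and returns; 2) the transaction commits and wipes the log
 * trees while the ordered extent is still pending; 3) power fails
 * before the ordered extent completes. The extent would then be in
 * neither the log nor fully persisted in the subvolume tree. Marking
 * the ordered extent BTRFS_ORDERED_PENDING below makes the commit
 * wait for it.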
4872 */
4873 list_for_each_entry_safe(ordered, tmp, &ctx->ordered_extents, log_list) {
4874 list_del_init(&ordered->log_list);
4875 set_bit(BTRFS_ORDERED_LOGGED, &ordered->flags);
4876
4877 if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) {
4878 spin_lock_irq(&inode->ordered_tree.lock);
4879 if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) {
4880 set_bit(BTRFS_ORDERED_PENDING, &ordered->flags);
4881 atomic_inc(&trans->transaction->pending_ordered);
4882 }
4883 spin_unlock_irq(&inode->ordered_tree.lock);
4884 }
4885 btrfs_put_ordered_extent(ordered);
4886 }
4887
4888 return 0;
4889 }
4890
4891 static int logged_inode_size(struct btrfs_root *log, struct btrfs_inode *inode,
4892 struct btrfs_path *path, u64 *size_ret)
4893 {
4894 struct btrfs_key key;
4895 int ret;
4896
4897 key.objectid = btrfs_ino(inode);
4898 key.type = BTRFS_INODE_ITEM_KEY;
4899 key.offset = 0;
4900
4901 ret = btrfs_search_slot(NULL, log, &key, path, 0, 0);
4902 if (ret < 0) {
4903 return ret;
4904 } else if (ret > 0) {
4905 *size_ret = 0;
4906 } else {
4907 struct btrfs_inode_item *item;
4908
4909 item = btrfs_item_ptr(path->nodes[0], path->slots[0],
4910 struct btrfs_inode_item);
4911 *size_ret = btrfs_inode_size(path->nodes[0], item);
4912 /*
4913 * If the in-memory inode's i_size is smaller than the inode
4914 * size stored in the btree, return the inode's i_size, so
4915 * that we get a correct inode size after replaying the log
4916 * when before a power failure we had a shrinking truncate
4917 * followed by addition of a new name (rename / new hard link).
4918 * Otherwise return the inode size from the btree, to avoid
4919 * data loss when replaying a log due to previously doing a
4920 * write that expands the inode's size and logging a new name
4921 * immediately after.
4922 */
4923 if (*size_ret > inode->vfs_inode.i_size)
4924 *size_ret = inode->vfs_inode.i_size;
4925 }
4926
4927 btrfs_release_path(path);
4928 return 0;
4929 }
4930
4931 /*
4932 * At the moment we always log all xattrs. This is to figure out at log replay
4933 * time which xattrs must have their deletion replayed. If an xattr is missing
4934 * in the log tree and exists in the fs/subvol tree, we delete it. This is
4935 * because if an xattr is deleted, the inode is fsynced, and then a power
4936 * failure happens, causing the log to be replayed the next time the fs is
4937 * mounted, we want the xattr to not exist anymore (same behaviour as other
4938 * filesystems with a journal, ext3/4, xfs, f2fs, etc).
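 *
 * An illustrative example (assumed commands):
 *
 * setfattr -n user.foo -v bar file
 * xfs_io -c "fsync" file
 * setfattr -x user.foo file
 * xfs_io -c "fsync" file
 * <power fail and log replay>
 *
 * The second fsync logged all remaining xattrs; since user.foo is
 * absent from the log but present in the fs/subvol tree, replay
 * deletes it, matching what the user observed before the power
 * failure.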
4939 */ 4940 static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans, 4941 struct btrfs_inode *inode, 4942 struct btrfs_path *path, 4943 struct btrfs_path *dst_path) 4944 { 4945 struct btrfs_root *root = inode->root; 4946 int ret; 4947 struct btrfs_key key; 4948 const u64 ino = btrfs_ino(inode); 4949 int ins_nr = 0; 4950 int start_slot = 0; 4951 bool found_xattrs = false; 4952 4953 if (test_bit(BTRFS_INODE_NO_XATTRS, &inode->runtime_flags)) 4954 return 0; 4955 4956 key.objectid = ino; 4957 key.type = BTRFS_XATTR_ITEM_KEY; 4958 key.offset = 0; 4959 4960 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 4961 if (ret < 0) 4962 return ret; 4963 4964 while (true) { 4965 int slot = path->slots[0]; 4966 struct extent_buffer *leaf = path->nodes[0]; 4967 int nritems = btrfs_header_nritems(leaf); 4968 4969 if (slot >= nritems) { 4970 if (ins_nr > 0) { 4971 ret = copy_items(trans, inode, dst_path, path, 4972 start_slot, ins_nr, 1, 0); 4973 if (ret < 0) 4974 return ret; 4975 ins_nr = 0; 4976 } 4977 ret = btrfs_next_leaf(root, path); 4978 if (ret < 0) 4979 return ret; 4980 else if (ret > 0) 4981 break; 4982 continue; 4983 } 4984 4985 btrfs_item_key_to_cpu(leaf, &key, slot); 4986 if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY) 4987 break; 4988 4989 if (ins_nr == 0) 4990 start_slot = slot; 4991 ins_nr++; 4992 path->slots[0]++; 4993 found_xattrs = true; 4994 cond_resched(); 4995 } 4996 if (ins_nr > 0) { 4997 ret = copy_items(trans, inode, dst_path, path, 4998 start_slot, ins_nr, 1, 0); 4999 if (ret < 0) 5000 return ret; 5001 } 5002 5003 if (!found_xattrs) 5004 set_bit(BTRFS_INODE_NO_XATTRS, &inode->runtime_flags); 5005 5006 return 0; 5007 } 5008 5009 /* 5010 * When using the NO_HOLES feature if we punched a hole that causes the 5011 * deletion of entire leafs or all the extent items of the first leaf (the one 5012 * that contains the inode item and references) we may end up not processing 5013 * any extents, because there are no leafs with a generation matching the 5014 * current transaction that have extent items for our inode. So we need to find 5015 * if any holes exist and then log them. We also need to log holes after any 5016 * truncate operation that changes the inode's size. 5017 */ 5018 static int btrfs_log_holes(struct btrfs_trans_handle *trans, 5019 struct btrfs_inode *inode, 5020 struct btrfs_path *path) 5021 { 5022 struct btrfs_root *root = inode->root; 5023 struct btrfs_fs_info *fs_info = root->fs_info; 5024 struct btrfs_key key; 5025 const u64 ino = btrfs_ino(inode); 5026 const u64 i_size = i_size_read(&inode->vfs_inode); 5027 u64 prev_extent_end = 0; 5028 int ret; 5029 5030 if (!btrfs_fs_incompat(fs_info, NO_HOLES) || i_size == 0) 5031 return 0; 5032 5033 key.objectid = ino; 5034 key.type = BTRFS_EXTENT_DATA_KEY; 5035 key.offset = 0; 5036 5037 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 5038 if (ret < 0) 5039 return ret; 5040 5041 while (true) { 5042 struct extent_buffer *leaf = path->nodes[0]; 5043 5044 if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { 5045 ret = btrfs_next_leaf(root, path); 5046 if (ret < 0) 5047 return ret; 5048 if (ret > 0) { 5049 ret = 0; 5050 break; 5051 } 5052 leaf = path->nodes[0]; 5053 } 5054 5055 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 5056 if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) 5057 break; 5058 5059 /* We have a hole, log it. 
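 *
 * Illustration: with NO_HOLES there is no item describing the gap,
 * e.g.
 *
 * |-- extent: [0, 4K) --| ...implicit hole... |-- extent: [16K, ...) --|
 *
 * here prev_extent_end == 4K and key.offset == 16K, so we insert a
 * 12K hole extent item (disk_bytenr 0) at offset 4K into the log.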
*/ 5060 if (prev_extent_end < key.offset) { 5061 const u64 hole_len = key.offset - prev_extent_end; 5062 5063 /* 5064 * Release the path to avoid deadlocks with other code 5065 * paths that search the root while holding locks on 5066 * leafs from the log root. 5067 */ 5068 btrfs_release_path(path); 5069 ret = btrfs_insert_file_extent(trans, root->log_root, 5070 ino, prev_extent_end, 0, 5071 0, hole_len, 0, hole_len, 5072 0, 0, 0); 5073 if (ret < 0) 5074 return ret; 5075 5076 /* 5077 * Search for the same key again in the root. Since it's 5078 * an extent item and we are holding the inode lock, the 5079 * key must still exist. If it doesn't just emit warning 5080 * and return an error to fall back to a transaction 5081 * commit. 5082 */ 5083 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 5084 if (ret < 0) 5085 return ret; 5086 if (WARN_ON(ret > 0)) 5087 return -ENOENT; 5088 leaf = path->nodes[0]; 5089 } 5090 5091 prev_extent_end = btrfs_file_extent_end(path); 5092 path->slots[0]++; 5093 cond_resched(); 5094 } 5095 5096 if (prev_extent_end < i_size) { 5097 u64 hole_len; 5098 5099 btrfs_release_path(path); 5100 hole_len = ALIGN(i_size - prev_extent_end, fs_info->sectorsize); 5101 ret = btrfs_insert_file_extent(trans, root->log_root, 5102 ino, prev_extent_end, 0, 0, 5103 hole_len, 0, hole_len, 5104 0, 0, 0); 5105 if (ret < 0) 5106 return ret; 5107 } 5108 5109 return 0; 5110 } 5111 5112 /* 5113 * When we are logging a new inode X, check if it doesn't have a reference that 5114 * matches the reference from some other inode Y created in a past transaction 5115 * and that was renamed in the current transaction. If we don't do this, then at 5116 * log replay time we can lose inode Y (and all its files if it's a directory): 5117 * 5118 * mkdir /mnt/x 5119 * echo "hello world" > /mnt/x/foobar 5120 * sync 5121 * mv /mnt/x /mnt/y 5122 * mkdir /mnt/x # or touch /mnt/x 5123 * xfs_io -c fsync /mnt/x 5124 * <power fail> 5125 * mount fs, trigger log replay 5126 * 5127 * After the log replay procedure, we would lose the first directory and all its 5128 * files (file foobar). 5129 * For the case where inode Y is not a directory we simply end up losing it: 5130 * 5131 * echo "123" > /mnt/foo 5132 * sync 5133 * mv /mnt/foo /mnt/bar 5134 * echo "abc" > /mnt/foo 5135 * xfs_io -c fsync /mnt/foo 5136 * <power fail> 5137 * 5138 * We also need this for cases where a snapshot entry is replaced by some other 5139 * entry (file or directory) otherwise we end up with an unreplayable log due to 5140 * attempts to delete the snapshot entry (entry of type BTRFS_ROOT_ITEM_KEY) as 5141 * if it were a regular entry: 5142 * 5143 * mkdir /mnt/x 5144 * btrfs subvolume snapshot /mnt /mnt/x/snap 5145 * btrfs subvolume delete /mnt/x/snap 5146 * rmdir /mnt/x 5147 * mkdir /mnt/x 5148 * fsync /mnt/x or fsync some new file inside it 5149 * <power fail> 5150 * 5151 * The snapshot delete, rmdir of x, mkdir of a new x and the fsync all happen in 5152 * the same transaction. 
5153 */ 5154 static int btrfs_check_ref_name_override(struct extent_buffer *eb, 5155 const int slot, 5156 const struct btrfs_key *key, 5157 struct btrfs_inode *inode, 5158 u64 *other_ino, u64 *other_parent) 5159 { 5160 int ret; 5161 struct btrfs_path *search_path; 5162 char *name = NULL; 5163 u32 name_len = 0; 5164 u32 item_size = btrfs_item_size(eb, slot); 5165 u32 cur_offset = 0; 5166 unsigned long ptr = btrfs_item_ptr_offset(eb, slot); 5167 5168 search_path = btrfs_alloc_path(); 5169 if (!search_path) 5170 return -ENOMEM; 5171 search_path->search_commit_root = 1; 5172 search_path->skip_locking = 1; 5173 5174 while (cur_offset < item_size) { 5175 u64 parent; 5176 u32 this_name_len; 5177 u32 this_len; 5178 unsigned long name_ptr; 5179 struct btrfs_dir_item *di; 5180 5181 if (key->type == BTRFS_INODE_REF_KEY) { 5182 struct btrfs_inode_ref *iref; 5183 5184 iref = (struct btrfs_inode_ref *)(ptr + cur_offset); 5185 parent = key->offset; 5186 this_name_len = btrfs_inode_ref_name_len(eb, iref); 5187 name_ptr = (unsigned long)(iref + 1); 5188 this_len = sizeof(*iref) + this_name_len; 5189 } else { 5190 struct btrfs_inode_extref *extref; 5191 5192 extref = (struct btrfs_inode_extref *)(ptr + 5193 cur_offset); 5194 parent = btrfs_inode_extref_parent(eb, extref); 5195 this_name_len = btrfs_inode_extref_name_len(eb, extref); 5196 name_ptr = (unsigned long)&extref->name; 5197 this_len = sizeof(*extref) + this_name_len; 5198 } 5199 5200 if (this_name_len > name_len) { 5201 char *new_name; 5202 5203 new_name = krealloc(name, this_name_len, GFP_NOFS); 5204 if (!new_name) { 5205 ret = -ENOMEM; 5206 goto out; 5207 } 5208 name_len = this_name_len; 5209 name = new_name; 5210 } 5211 5212 read_extent_buffer(eb, name, name_ptr, this_name_len); 5213 di = btrfs_lookup_dir_item(NULL, inode->root, search_path, 5214 parent, name, this_name_len, 0); 5215 if (di && !IS_ERR(di)) { 5216 struct btrfs_key di_key; 5217 5218 btrfs_dir_item_key_to_cpu(search_path->nodes[0], 5219 di, &di_key); 5220 if (di_key.type == BTRFS_INODE_ITEM_KEY) { 5221 if (di_key.objectid != key->objectid) { 5222 ret = 1; 5223 *other_ino = di_key.objectid; 5224 *other_parent = parent; 5225 } else { 5226 ret = 0; 5227 } 5228 } else { 5229 ret = -EAGAIN; 5230 } 5231 goto out; 5232 } else if (IS_ERR(di)) { 5233 ret = PTR_ERR(di); 5234 goto out; 5235 } 5236 btrfs_release_path(search_path); 5237 5238 cur_offset += this_len; 5239 } 5240 ret = 0; 5241 out: 5242 btrfs_free_path(search_path); 5243 kfree(name); 5244 return ret; 5245 } 5246 5247 struct btrfs_ino_list { 5248 u64 ino; 5249 u64 parent; 5250 struct list_head list; 5251 }; 5252 5253 static int log_conflicting_inodes(struct btrfs_trans_handle *trans, 5254 struct btrfs_root *root, 5255 struct btrfs_path *path, 5256 struct btrfs_log_ctx *ctx, 5257 u64 ino, u64 parent) 5258 { 5259 struct btrfs_ino_list *ino_elem; 5260 LIST_HEAD(inode_list); 5261 int ret = 0; 5262 5263 ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS); 5264 if (!ino_elem) 5265 return -ENOMEM; 5266 ino_elem->ino = ino; 5267 ino_elem->parent = parent; 5268 list_add_tail(&ino_elem->list, &inode_list); 5269 5270 while (!list_empty(&inode_list)) { 5271 struct btrfs_fs_info *fs_info = root->fs_info; 5272 struct btrfs_key key; 5273 struct inode *inode; 5274 5275 ino_elem = list_first_entry(&inode_list, struct btrfs_ino_list, 5276 list); 5277 ino = ino_elem->ino; 5278 parent = ino_elem->parent; 5279 list_del(&ino_elem->list); 5280 kfree(ino_elem); 5281 if (ret) 5282 continue; 5283 5284 btrfs_release_path(path); 5285 5286 inode = 
btrfs_iget(fs_info->sb, ino, root); 5287 /* 5288 * If the other inode that had a conflicting dir entry was 5289 * deleted in the current transaction, we need to log its parent 5290 * directory. 5291 */ 5292 if (IS_ERR(inode)) { 5293 ret = PTR_ERR(inode); 5294 if (ret == -ENOENT) { 5295 inode = btrfs_iget(fs_info->sb, parent, root); 5296 if (IS_ERR(inode)) { 5297 ret = PTR_ERR(inode); 5298 } else { 5299 ret = btrfs_log_inode(trans, 5300 BTRFS_I(inode), 5301 LOG_OTHER_INODE_ALL, 5302 ctx); 5303 btrfs_add_delayed_iput(inode); 5304 } 5305 } 5306 continue; 5307 } 5308 /* 5309 * If the inode was already logged skip it - otherwise we can 5310 * hit an infinite loop. Example: 5311 * 5312 * From the commit root (previous transaction) we have the 5313 * following inodes: 5314 * 5315 * inode 257 a directory 5316 * inode 258 with references "zz" and "zz_link" on inode 257 5317 * inode 259 with reference "a" on inode 257 5318 * 5319 * And in the current (uncommitted) transaction we have: 5320 * 5321 * inode 257 a directory, unchanged 5322 * inode 258 with references "a" and "a2" on inode 257 5323 * inode 259 with reference "zz_link" on inode 257 5324 * inode 261 with reference "zz" on inode 257 5325 * 5326 * When logging inode 261 the following infinite loop could 5327 * happen if we don't skip already logged inodes: 5328 * 5329 * - we detect inode 258 as a conflicting inode, with inode 261 5330 * on reference "zz", and log it; 5331 * 5332 * - we detect inode 259 as a conflicting inode, with inode 258 5333 * on reference "a", and log it; 5334 * 5335 * - we detect inode 258 as a conflicting inode, with inode 259 5336 * on reference "zz_link", and log it - again! After this we 5337 * repeat the above steps forever. 5338 */ 5339 spin_lock(&BTRFS_I(inode)->lock); 5340 /* 5341 * Check the inode's logged_trans only instead of 5342 * btrfs_inode_in_log(). This is because the last_log_commit of 5343 * the inode is not updated when we only log that it exists (see 5344 * btrfs_log_inode()). 5345 */ 5346 if (BTRFS_I(inode)->logged_trans == trans->transid) { 5347 spin_unlock(&BTRFS_I(inode)->lock); 5348 btrfs_add_delayed_iput(inode); 5349 continue; 5350 } 5351 spin_unlock(&BTRFS_I(inode)->lock); 5352 /* 5353 * We are safe logging the other inode without acquiring its 5354 * lock as long as we log with the LOG_INODE_EXISTS mode. We 5355 * are safe against concurrent renames of the other inode as 5356 * well because during a rename we pin the log and update the 5357 * log with the new name before we unpin it. 
5358 */ 5359 ret = btrfs_log_inode(trans, BTRFS_I(inode), LOG_OTHER_INODE, ctx); 5360 if (ret) { 5361 btrfs_add_delayed_iput(inode); 5362 continue; 5363 } 5364 5365 key.objectid = ino; 5366 key.type = BTRFS_INODE_REF_KEY; 5367 key.offset = 0; 5368 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 5369 if (ret < 0) { 5370 btrfs_add_delayed_iput(inode); 5371 continue; 5372 } 5373 5374 while (true) { 5375 struct extent_buffer *leaf = path->nodes[0]; 5376 int slot = path->slots[0]; 5377 u64 other_ino = 0; 5378 u64 other_parent = 0; 5379 5380 if (slot >= btrfs_header_nritems(leaf)) { 5381 ret = btrfs_next_leaf(root, path); 5382 if (ret < 0) { 5383 break; 5384 } else if (ret > 0) { 5385 ret = 0; 5386 break; 5387 } 5388 continue; 5389 } 5390 5391 btrfs_item_key_to_cpu(leaf, &key, slot); 5392 if (key.objectid != ino || 5393 (key.type != BTRFS_INODE_REF_KEY && 5394 key.type != BTRFS_INODE_EXTREF_KEY)) { 5395 ret = 0; 5396 break; 5397 } 5398 5399 ret = btrfs_check_ref_name_override(leaf, slot, &key, 5400 BTRFS_I(inode), &other_ino, 5401 &other_parent); 5402 if (ret < 0) 5403 break; 5404 if (ret > 0) { 5405 ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS); 5406 if (!ino_elem) { 5407 ret = -ENOMEM; 5408 break; 5409 } 5410 ino_elem->ino = other_ino; 5411 ino_elem->parent = other_parent; 5412 list_add_tail(&ino_elem->list, &inode_list); 5413 ret = 0; 5414 } 5415 path->slots[0]++; 5416 } 5417 btrfs_add_delayed_iput(inode); 5418 } 5419 5420 return ret; 5421 } 5422 5423 static int copy_inode_items_to_log(struct btrfs_trans_handle *trans, 5424 struct btrfs_inode *inode, 5425 struct btrfs_key *min_key, 5426 const struct btrfs_key *max_key, 5427 struct btrfs_path *path, 5428 struct btrfs_path *dst_path, 5429 const u64 logged_isize, 5430 const bool recursive_logging, 5431 const int inode_only, 5432 struct btrfs_log_ctx *ctx, 5433 bool *need_log_inode_item) 5434 { 5435 const u64 i_size = i_size_read(&inode->vfs_inode); 5436 struct btrfs_root *root = inode->root; 5437 int ins_start_slot = 0; 5438 int ins_nr = 0; 5439 int ret; 5440 5441 while (1) { 5442 ret = btrfs_search_forward(root, min_key, path, trans->transid); 5443 if (ret < 0) 5444 return ret; 5445 if (ret > 0) { 5446 ret = 0; 5447 break; 5448 } 5449 again: 5450 /* Note, ins_nr might be > 0 here, cleanup outside the loop */ 5451 if (min_key->objectid != max_key->objectid) 5452 break; 5453 if (min_key->type > max_key->type) 5454 break; 5455 5456 if (min_key->type == BTRFS_INODE_ITEM_KEY) { 5457 *need_log_inode_item = false; 5458 } else if (min_key->type == BTRFS_EXTENT_DATA_KEY && 5459 min_key->offset >= i_size) { 5460 /* 5461 * Extents at and beyond eof are logged with 5462 * btrfs_log_prealloc_extents(). 5463 * Only regular files have BTRFS_EXTENT_DATA_KEY keys, 5464 * and no keys greater than that, so bail out. 
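 *
 * (Added note: keys sort by (objectid, type, offset) and
 * BTRFS_EXTENT_DATA_KEY is the highest-valued key type among the
 * items a regular file has, so once we reach extents at or beyond
 * eof nothing else remains to copy.)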
5465 */ 5466 break; 5467 } else if ((min_key->type == BTRFS_INODE_REF_KEY || 5468 min_key->type == BTRFS_INODE_EXTREF_KEY) && 5469 inode->generation == trans->transid && 5470 !recursive_logging) { 5471 u64 other_ino = 0; 5472 u64 other_parent = 0; 5473 5474 ret = btrfs_check_ref_name_override(path->nodes[0], 5475 path->slots[0], min_key, inode, 5476 &other_ino, &other_parent); 5477 if (ret < 0) { 5478 return ret; 5479 } else if (ret > 0 && 5480 other_ino != btrfs_ino(BTRFS_I(ctx->inode))) { 5481 if (ins_nr > 0) { 5482 ins_nr++; 5483 } else { 5484 ins_nr = 1; 5485 ins_start_slot = path->slots[0]; 5486 } 5487 ret = copy_items(trans, inode, dst_path, path, 5488 ins_start_slot, ins_nr, 5489 inode_only, logged_isize); 5490 if (ret < 0) 5491 return ret; 5492 ins_nr = 0; 5493 5494 ret = log_conflicting_inodes(trans, root, path, 5495 ctx, other_ino, other_parent); 5496 if (ret) 5497 return ret; 5498 btrfs_release_path(path); 5499 goto next_key; 5500 } 5501 } else if (min_key->type == BTRFS_XATTR_ITEM_KEY) { 5502 /* Skip xattrs, logged later with btrfs_log_all_xattrs() */ 5503 if (ins_nr == 0) 5504 goto next_slot; 5505 ret = copy_items(trans, inode, dst_path, path, 5506 ins_start_slot, 5507 ins_nr, inode_only, logged_isize); 5508 if (ret < 0) 5509 return ret; 5510 ins_nr = 0; 5511 goto next_slot; 5512 } 5513 5514 if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) { 5515 ins_nr++; 5516 goto next_slot; 5517 } else if (!ins_nr) { 5518 ins_start_slot = path->slots[0]; 5519 ins_nr = 1; 5520 goto next_slot; 5521 } 5522 5523 ret = copy_items(trans, inode, dst_path, path, ins_start_slot, 5524 ins_nr, inode_only, logged_isize); 5525 if (ret < 0) 5526 return ret; 5527 ins_nr = 1; 5528 ins_start_slot = path->slots[0]; 5529 next_slot: 5530 path->slots[0]++; 5531 if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) { 5532 btrfs_item_key_to_cpu(path->nodes[0], min_key, 5533 path->slots[0]); 5534 goto again; 5535 } 5536 if (ins_nr) { 5537 ret = copy_items(trans, inode, dst_path, path, 5538 ins_start_slot, ins_nr, inode_only, 5539 logged_isize); 5540 if (ret < 0) 5541 return ret; 5542 ins_nr = 0; 5543 } 5544 btrfs_release_path(path); 5545 next_key: 5546 if (min_key->offset < (u64)-1) { 5547 min_key->offset++; 5548 } else if (min_key->type < max_key->type) { 5549 min_key->type++; 5550 min_key->offset = 0; 5551 } else { 5552 break; 5553 } 5554 } 5555 if (ins_nr) { 5556 ret = copy_items(trans, inode, dst_path, path, ins_start_slot, 5557 ins_nr, inode_only, logged_isize); 5558 if (ret) 5559 return ret; 5560 } 5561 5562 if (inode_only == LOG_INODE_ALL && S_ISREG(inode->vfs_inode.i_mode)) { 5563 /* 5564 * Release the path because otherwise we might attempt to double 5565 * lock the same leaf with btrfs_log_prealloc_extents() below. 5566 */ 5567 btrfs_release_path(path); 5568 ret = btrfs_log_prealloc_extents(trans, inode, dst_path); 5569 } 5570 5571 return ret; 5572 } 5573 5574 /* log a single inode in the tree log. 5575 * At least one parent directory for this inode must exist in the tree 5576 * or be logged already. 5577 * 5578 * Any items from this inode changed by the current transaction are copied 5579 * to the log tree. An extra reference is taken on any extents in this 5580 * file, allowing us to avoid a whole pile of corner cases around logging 5581 * blocks that have been removed from the tree. 5582 * 5583 * See LOG_INODE_ALL and related defines for a description of what inode_only 5584 * does. 5585 * 5586 * This handles both files and directories. 
5587 */
5588 static int btrfs_log_inode(struct btrfs_trans_handle *trans,
5589 struct btrfs_inode *inode,
5590 int inode_only,
5591 struct btrfs_log_ctx *ctx)
5592 {
5593 struct btrfs_path *path;
5594 struct btrfs_path *dst_path;
5595 struct btrfs_key min_key;
5596 struct btrfs_key max_key;
5597 struct btrfs_root *log = inode->root->log_root;
5598 int err = 0;
5599 int ret = 0;
5600 bool fast_search = false;
5601 u64 ino = btrfs_ino(inode);
5602 struct extent_map_tree *em_tree = &inode->extent_tree;
5603 u64 logged_isize = 0;
5604 bool need_log_inode_item = true;
5605 bool xattrs_logged = false;
5606 bool recursive_logging = false;
5607 bool inode_item_dropped = true;
5608
5609 path = btrfs_alloc_path();
5610 if (!path)
5611 return -ENOMEM;
5612 dst_path = btrfs_alloc_path();
5613 if (!dst_path) {
5614 btrfs_free_path(path);
5615 return -ENOMEM;
5616 }
5617
5618 min_key.objectid = ino;
5619 min_key.type = BTRFS_INODE_ITEM_KEY;
5620 min_key.offset = 0;
5621
5622 max_key.objectid = ino;
5623
5624
5625 /* today the code can only do partial logging of directories */
5626 if (S_ISDIR(inode->vfs_inode.i_mode) ||
5627 (!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
5628 &inode->runtime_flags) &&
5629 inode_only >= LOG_INODE_EXISTS))
5630 max_key.type = BTRFS_XATTR_ITEM_KEY;
5631 else
5632 max_key.type = (u8)-1;
5633 max_key.offset = (u64)-1;
5634
5635 /*
5636 * Only run delayed items if we are a directory. We want to make sure
5637 * all directory indexes hit the fs/subvolume tree so we can find them
5638 * and figure out which index ranges have to be logged.
5639 */
5640 if (S_ISDIR(inode->vfs_inode.i_mode)) {
5641 err = btrfs_commit_inode_delayed_items(trans, inode);
5642 if (err)
5643 goto out;
5644 }
5645
5646 if (inode_only == LOG_OTHER_INODE || inode_only == LOG_OTHER_INODE_ALL) {
5647 recursive_logging = true;
5648 if (inode_only == LOG_OTHER_INODE)
5649 inode_only = LOG_INODE_EXISTS;
5650 else
5651 inode_only = LOG_INODE_ALL;
5652 mutex_lock_nested(&inode->log_mutex, SINGLE_DEPTH_NESTING);
5653 } else {
5654 mutex_lock(&inode->log_mutex);
5655 }
5656
5657 /*
5658 * This is for cases where logging a directory could result in losing
5659 * a file after replaying the log. For example, if we move a file from a
5660 * directory A to a directory B, then fsync directory A, we have no way
5661 * to know the file was moved from A to B, so logging just A would
5662 * result in losing the file after a log replay.
5663 */
5664 if (S_ISDIR(inode->vfs_inode.i_mode) &&
5665 inode_only == LOG_INODE_ALL &&
5666 inode->last_unlink_trans >= trans->transid) {
5667 btrfs_set_log_full_commit(trans);
5668 err = 1;
5669 goto out_unlock;
5670 }
5671
5672 /*
5673 * a brute force approach to making sure we get the most uptodate
5674 * copies of everything.
5675 */
5676 if (S_ISDIR(inode->vfs_inode.i_mode)) {
5677 int max_key_type = BTRFS_DIR_LOG_INDEX_KEY;
5678
5679 clear_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags);
5680 if (inode_only == LOG_INODE_EXISTS)
5681 max_key_type = BTRFS_XATTR_ITEM_KEY;
5682 ret = drop_inode_items(trans, log, path, inode, max_key_type);
5683 } else {
5684 if (inode_only == LOG_INODE_EXISTS && inode_logged(trans, inode)) {
5685 /*
5686 * Make sure the new inode item we write to the log has
5687 * the same isize as the current one (if it exists).
5688 * This is necessary to prevent data loss after log
5689 * replay, and also to prevent doing a wrong expanding
5690 * truncate - e.g. create a file, write 4K into offset
5691 * 0, fsync, write 4K into offset 4096, add hard link,
5692 * fsync some other file (to sync log), power fail - if
5693 * we use the inode's current i_size, after log replay
5694 * we get an 8Kb file, with the last 4Kb extent as a
5695 * hole (zeroes), as if an expanding truncate happened,
5696 * instead of getting a file of 4Kb only.
5697 */
5698 err = logged_inode_size(log, inode, path, &logged_isize);
5699 if (err)
5700 goto out_unlock;
5701 }
5702 if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
5703 &inode->runtime_flags)) {
5704 if (inode_only == LOG_INODE_EXISTS) {
5705 max_key.type = BTRFS_XATTR_ITEM_KEY;
5706 ret = drop_inode_items(trans, log, path, inode,
5707 max_key.type);
5708 } else {
5709 clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
5710 &inode->runtime_flags);
5711 clear_bit(BTRFS_INODE_COPY_EVERYTHING,
5712 &inode->runtime_flags);
5713 if (inode_logged(trans, inode))
5714 ret = truncate_inode_items(trans, log,
5715 inode, 0, 0);
5716 }
5717 } else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING,
5718 &inode->runtime_flags) ||
5719 inode_only == LOG_INODE_EXISTS) {
5720 if (inode_only == LOG_INODE_ALL)
5721 fast_search = true;
5722 max_key.type = BTRFS_XATTR_ITEM_KEY;
5723 ret = drop_inode_items(trans, log, path, inode,
5724 max_key.type);
5725 } else {
5726 if (inode_only == LOG_INODE_ALL)
5727 fast_search = true;
5728 inode_item_dropped = false;
5729 goto log_extents;
5730 }
5731
5732 }
5733 if (ret) {
5734 err = ret;
5735 goto out_unlock;
5736 }
5737
5738 err = copy_inode_items_to_log(trans, inode, &min_key, &max_key,
5739 path, dst_path, logged_isize,
5740 recursive_logging, inode_only, ctx,
5741 &need_log_inode_item);
5742 if (err)
5743 goto out_unlock;
5744
5745 btrfs_release_path(path);
5746 btrfs_release_path(dst_path);
5747 err = btrfs_log_all_xattrs(trans, inode, path, dst_path);
5748 if (err)
5749 goto out_unlock;
5750 xattrs_logged = true;
5751 if (max_key.type >= BTRFS_EXTENT_DATA_KEY && !fast_search) {
5752 btrfs_release_path(path);
5753 btrfs_release_path(dst_path);
5754 err = btrfs_log_holes(trans, inode, path);
5755 if (err)
5756 goto out_unlock;
5757 }
5758 log_extents:
5759 btrfs_release_path(path);
5760 btrfs_release_path(dst_path);
5761 if (need_log_inode_item) {
5762 err = log_inode_item(trans, log, dst_path, inode, inode_item_dropped);
5763 if (err)
5764 goto out_unlock;
5765 /*
5766 * If we are doing a fast fsync and the inode was logged before
5767 * in this transaction, we don't need to log the xattrs because
5768 * they were logged before. If xattrs were added, changed or
5769 * deleted since the last time we logged the inode, then we have
5770 * already logged them because the inode had the runtime flag
5771 * BTRFS_INODE_COPY_EVERYTHING set.
5772 */ 5773 if (!xattrs_logged && inode->logged_trans < trans->transid) { 5774 err = btrfs_log_all_xattrs(trans, inode, path, dst_path); 5775 if (err) 5776 goto out_unlock; 5777 btrfs_release_path(path); 5778 } 5779 } 5780 if (fast_search) { 5781 ret = btrfs_log_changed_extents(trans, inode, dst_path, ctx); 5782 if (ret) { 5783 err = ret; 5784 goto out_unlock; 5785 } 5786 } else if (inode_only == LOG_INODE_ALL) { 5787 struct extent_map *em, *n; 5788 5789 write_lock(&em_tree->lock); 5790 list_for_each_entry_safe(em, n, &em_tree->modified_extents, list) 5791 list_del_init(&em->list); 5792 write_unlock(&em_tree->lock); 5793 } 5794 5795 if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->vfs_inode.i_mode)) { 5796 ret = log_directory_changes(trans, inode, path, dst_path, ctx); 5797 if (ret) { 5798 err = ret; 5799 goto out_unlock; 5800 } 5801 } 5802 5803 spin_lock(&inode->lock); 5804 inode->logged_trans = trans->transid; 5805 /* 5806 * Don't update last_log_commit if we logged that an inode exists. 5807 * We do this for three reasons: 5808 * 5809 * 1) We might have had buffered writes to this inode that were 5810 * flushed and had their ordered extents completed in this 5811 * transaction, but we did not previously log the inode with 5812 * LOG_INODE_ALL. Later the inode was evicted and after that 5813 * it was loaded again and this LOG_INODE_EXISTS log operation 5814 * happened. We must make sure that if an explicit fsync against 5815 * the inode is performed later, it logs the new extents, an 5816 * updated inode item, etc, and syncs the log. The same logic 5817 * applies if direct IO writes are used instead of buffered writes. 5818 * 5819 * 2) When we log the inode with LOG_INODE_EXISTS, its inode item 5820 * is logged with an i_size of 0 or whatever value was logged 5821 * before. If later the i_size of the inode is increased by a 5822 * truncate operation, the log is synced through an fsync of 5823 * some other inode and then finally an explicit fsync against 5824 * this inode is made, we must make sure this fsync logs the 5825 * inode with the new i_size, the hole between old i_size and 5826 * the new i_size, and syncs the log. 5827 * 5828 * 3) If we are logging that an ancestor inode exists as part of 5829 * logging a new name from a link or rename operation, don't update 5830 * its last_log_commit - otherwise if an explicit fsync is made 5831 * against an ancestor, the fsync considers the inode in the log 5832 * and doesn't sync the log, resulting in the ancestor missing after 5833 * a power failure unless the log was synced as part of an fsync 5834 * against any other unrelated inode. 5835 */ 5836 if (inode_only != LOG_INODE_EXISTS) 5837 inode->last_log_commit = inode->last_sub_trans; 5838 spin_unlock(&inode->lock); 5839 out_unlock: 5840 mutex_unlock(&inode->log_mutex); 5841 out: 5842 btrfs_free_path(path); 5843 btrfs_free_path(dst_path); 5844 return err; 5845 } 5846 5847 /* 5848 * Check if we need to log an inode. This is used in contexts where while 5849 * logging an inode we need to log another inode (either that it exists or in 5850 * full mode). This is used instead of btrfs_inode_in_log() because the latter 5851 * requires the inode to be in the log and have the log transaction committed, 5852 * while here we do not care if the log transaction was already committed - our 5853 * caller will commit the log later - and we want to avoid logging an inode 5854 * multiple times when multiple tasks have joined the same log transaction.
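 *
 * A typical caller pattern is roughly the following (sketch; the mode
 * depends on the caller, and the real callers below use the inverted
 * test to skip inodes - see log_new_dir_dentries() and
 * btrfs_log_all_parents()):
 *
 *	if (need_log_inode(trans, other))
 *		ret = btrfs_log_inode(trans, other, LOG_INODE_EXISTS, ctx);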
5855 */ 5856 static bool need_log_inode(struct btrfs_trans_handle *trans, 5857 struct btrfs_inode *inode) 5858 { 5859 /* 5860 * If a directory was not modified, no dentries added or removed, we can 5861 * and should avoid logging it. 5862 */ 5863 if (S_ISDIR(inode->vfs_inode.i_mode) && inode->last_trans < trans->transid) 5864 return false; 5865 5866 /* 5867 * If this inode does not have new/updated/deleted xattrs since the last 5868 * time it was logged and is flagged as logged in the current transaction, 5869 * we can skip logging it. As for new/deleted names, those are updated in 5870 * the log by link/unlink/rename operations. 5871 * In case the inode was logged and then evicted and reloaded, its 5872 * logged_trans will be 0, in which case we have to fully log it since 5873 * logged_trans is a transient field, not persisted. 5874 */ 5875 if (inode->logged_trans == trans->transid && 5876 !test_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags)) 5877 return false; 5878 5879 return true; 5880 } 5881 5882 struct btrfs_dir_list { 5883 u64 ino; 5884 struct list_head list; 5885 }; 5886 5887 /* 5888 * Log the inodes of the new dentries of a directory. See log_dir_items() for 5889 * details about why it is needed. 5890 * This is a recursive operation - if an existing dentry corresponds to a 5891 * directory, that directory's new entries are logged too (same behaviour as 5892 * ext3/4, xfs, f2fs, reiserfs, nilfs2). Note that when logging the inodes 5893 * the dentries point to, we do not lock their i_mutex, otherwise lockdep 5894 * complains about the following circular lock dependency / possible deadlock: 5895 * 5896 * (two tasks, CPU0 and CPU1, interleaved in time:) 5897 * 5898 * CPU0: lock(&type->i_mutex_dir_key#3/2); 5899 * CPU1: lock(sb_internal#2); 5900 * CPU1: lock(&type->i_mutex_dir_key#3/2); 5901 * CPU0: lock(&sb->s_type->i_mutex_key#14); 5902 * 5903 * Where sb_internal is the lock (a counter that works as a lock) acquired by 5904 * sb_start_intwrite() in btrfs_start_transaction(). 5905 * Not locking i_mutex of the inodes is still safe because: 5906 * 5907 * 1) For regular files we log with a mode of LOG_INODE_EXISTS. It's possible 5908 * that while logging the inode new references (names) are added or removed 5909 * from the inode, leaving the logged inode item with a link count that does 5910 * not match the number of logged inode reference items. This is fine because 5911 * at log replay time we compute the real number of links and correct the 5912 * link count in the inode item (see replay_one_buffer() and 5913 * link_to_fixup_dir()); 5914 * 5915 * 2) For directories we log with a mode of LOG_INODE_ALL. It's possible that 5916 * while logging the inode's items new index items (key type 5917 * BTRFS_DIR_INDEX_KEY) are added to fs/subvol tree and the logged inode item 5918 * has a size that doesn't match the sum of the lengths of all the logged 5919 * names - this is not a problem, because at log replay time we set the 5920 * directory's i_size to the correct value (see replay_one_name() and 5921 * do_overwrite_item()).
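 *
 * Illustrative example: in one transaction do mkdir dir/sub, touch
 * dir/sub/file and then fsync dir. Logging dir finds the new dentry for
 * "sub", so sub is logged too (with LOG_INODE_ALL, being a directory),
 * which in turn finds the new dentry for "file" and logs that inode as
 * well (with LOG_INODE_EXISTS) - all without locking their i_mutex.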
5922 */ 5923 static int log_new_dir_dentries(struct btrfs_trans_handle *trans, 5924 struct btrfs_root *root, 5925 struct btrfs_inode *start_inode, 5926 struct btrfs_log_ctx *ctx) 5927 { 5928 struct btrfs_fs_info *fs_info = root->fs_info; 5929 struct btrfs_root *log = root->log_root; 5930 struct btrfs_path *path; 5931 LIST_HEAD(dir_list); 5932 struct btrfs_dir_list *dir_elem; 5933 int ret = 0; 5934 5935 /* 5936 * If we are logging a new name, as part of a link or rename operation, 5937 * don't bother logging new dentries, as we just want to log the new 5938 * name of an inode and make sure any new parent directories exist. 5939 */ 5940 if (ctx->logging_new_name) 5941 return 0; 5942 5943 path = btrfs_alloc_path(); 5944 if (!path) 5945 return -ENOMEM; 5946 5947 dir_elem = kmalloc(sizeof(*dir_elem), GFP_NOFS); 5948 if (!dir_elem) { 5949 btrfs_free_path(path); 5950 return -ENOMEM; 5951 } 5952 dir_elem->ino = btrfs_ino(start_inode); 5953 list_add_tail(&dir_elem->list, &dir_list); 5954 5955 while (!list_empty(&dir_list)) { 5956 struct extent_buffer *leaf; 5957 struct btrfs_key min_key; 5958 int nritems; 5959 int i; 5960 5961 dir_elem = list_first_entry(&dir_list, struct btrfs_dir_list, 5962 list); 5963 if (ret) 5964 goto next_dir_inode; 5965 5966 min_key.objectid = dir_elem->ino; 5967 min_key.type = BTRFS_DIR_INDEX_KEY; 5968 min_key.offset = 0; 5969 again: 5970 btrfs_release_path(path); 5971 ret = btrfs_search_forward(log, &min_key, path, trans->transid); 5972 if (ret < 0) { 5973 goto next_dir_inode; 5974 } else if (ret > 0) { 5975 ret = 0; 5976 goto next_dir_inode; 5977 } 5978 5979 process_leaf: 5980 leaf = path->nodes[0]; 5981 nritems = btrfs_header_nritems(leaf); 5982 for (i = path->slots[0]; i < nritems; i++) { 5983 struct btrfs_dir_item *di; 5984 struct btrfs_key di_key; 5985 struct inode *di_inode; 5986 struct btrfs_dir_list *new_dir_elem; 5987 int log_mode = LOG_INODE_EXISTS; 5988 int type; 5989 5990 btrfs_item_key_to_cpu(leaf, &min_key, i); 5991 if (min_key.objectid != dir_elem->ino || 5992 min_key.type != BTRFS_DIR_INDEX_KEY) 5993 goto next_dir_inode; 5994 5995 di = btrfs_item_ptr(leaf, i, struct btrfs_dir_item); 5996 type = btrfs_dir_type(leaf, di); 5997 if (btrfs_dir_transid(leaf, di) < trans->transid && 5998 type != BTRFS_FT_DIR) 5999 continue; 6000 btrfs_dir_item_key_to_cpu(leaf, di, &di_key); 6001 if (di_key.type == BTRFS_ROOT_ITEM_KEY) 6002 continue; 6003 6004 btrfs_release_path(path); 6005 di_inode = btrfs_iget(fs_info->sb, di_key.objectid, root); 6006 if (IS_ERR(di_inode)) { 6007 ret = PTR_ERR(di_inode); 6008 goto next_dir_inode; 6009 } 6010 6011 if (!need_log_inode(trans, BTRFS_I(di_inode))) { 6012 btrfs_add_delayed_iput(di_inode); 6013 break; 6014 } 6015 6016 ctx->log_new_dentries = false; 6017 if (type == BTRFS_FT_DIR || type == BTRFS_FT_SYMLINK) 6018 log_mode = LOG_INODE_ALL; 6019 ret = btrfs_log_inode(trans, BTRFS_I(di_inode), 6020 log_mode, ctx); 6021 btrfs_add_delayed_iput(di_inode); 6022 if (ret) 6023 goto next_dir_inode; 6024 if (ctx->log_new_dentries) { 6025 new_dir_elem = kmalloc(sizeof(*new_dir_elem), 6026 GFP_NOFS); 6027 if (!new_dir_elem) { 6028 ret = -ENOMEM; 6029 goto next_dir_inode; 6030 } 6031 new_dir_elem->ino = di_key.objectid; 6032 list_add_tail(&new_dir_elem->list, &dir_list); 6033 } 6034 break; 6035 } 6036 if (i == nritems) { 6037 ret = btrfs_next_leaf(log, path); 6038 if (ret < 0) { 6039 goto next_dir_inode; 6040 } else if (ret > 0) { 6041 ret = 0; 6042 goto next_dir_inode; 6043 } 6044 goto process_leaf; 6045 } 6046 if (min_key.offset < (u64)-1) { 6047 min_key.offset++; 6048
goto again; 6049 } 6050 next_dir_inode: 6051 list_del(&dir_elem->list); 6052 kfree(dir_elem); 6053 } 6054 6055 btrfs_free_path(path); 6056 return ret; 6057 } 6058 6059 static int btrfs_log_all_parents(struct btrfs_trans_handle *trans, 6060 struct btrfs_inode *inode, 6061 struct btrfs_log_ctx *ctx) 6062 { 6063 struct btrfs_fs_info *fs_info = trans->fs_info; 6064 int ret; 6065 struct btrfs_path *path; 6066 struct btrfs_key key; 6067 struct btrfs_root *root = inode->root; 6068 const u64 ino = btrfs_ino(inode); 6069 6070 path = btrfs_alloc_path(); 6071 if (!path) 6072 return -ENOMEM; 6073 path->skip_locking = 1; 6074 path->search_commit_root = 1; 6075 6076 key.objectid = ino; 6077 key.type = BTRFS_INODE_REF_KEY; 6078 key.offset = 0; 6079 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 6080 if (ret < 0) 6081 goto out; 6082 6083 while (true) { 6084 struct extent_buffer *leaf = path->nodes[0]; 6085 int slot = path->slots[0]; 6086 u32 cur_offset = 0; 6087 u32 item_size; 6088 unsigned long ptr; 6089 6090 if (slot >= btrfs_header_nritems(leaf)) { 6091 ret = btrfs_next_leaf(root, path); 6092 if (ret < 0) 6093 goto out; 6094 else if (ret > 0) 6095 break; 6096 continue; 6097 } 6098 6099 btrfs_item_key_to_cpu(leaf, &key, slot); 6100 /* BTRFS_INODE_EXTREF_KEY is BTRFS_INODE_REF_KEY + 1 */ 6101 if (key.objectid != ino || key.type > BTRFS_INODE_EXTREF_KEY) 6102 break; 6103 6104 item_size = btrfs_item_size(leaf, slot); 6105 ptr = btrfs_item_ptr_offset(leaf, slot); 6106 while (cur_offset < item_size) { 6107 struct btrfs_key inode_key; 6108 struct inode *dir_inode; 6109 6110 inode_key.type = BTRFS_INODE_ITEM_KEY; 6111 inode_key.offset = 0; 6112 6113 if (key.type == BTRFS_INODE_EXTREF_KEY) { 6114 struct btrfs_inode_extref *extref; 6115 6116 extref = (struct btrfs_inode_extref *) 6117 (ptr + cur_offset); 6118 inode_key.objectid = btrfs_inode_extref_parent( 6119 leaf, extref); 6120 cur_offset += sizeof(*extref); 6121 cur_offset += btrfs_inode_extref_name_len(leaf, 6122 extref); 6123 } else { 6124 inode_key.objectid = key.offset; 6125 cur_offset = item_size; 6126 } 6127 6128 dir_inode = btrfs_iget(fs_info->sb, inode_key.objectid, 6129 root); 6130 /* 6131 * If the parent inode was deleted, return an error to 6132 * fall back to a transaction commit. This is to prevent 6133 * an inode that was moved from one parent B to a parent 6134 * A, and whose former parent B was then deleted, from 6135 * existing at both parents after its fsync is replayed 6136 * (with the old parent still existing). 6137 * Example: 6138 * 6139 * mkdir /mnt/A 6140 * mkdir /mnt/B 6141 * touch /mnt/B/bar 6142 * sync 6143 * mv /mnt/B/bar /mnt/A/bar 6144 * mv -T /mnt/A /mnt/B 6145 * fsync /mnt/B/bar 6146 * <power fail> 6147 * 6148 * If we ignore the old parent B which got deleted, 6149 * after a log replay we would have file bar linked 6150 * at both parents and the old parent B would still 6151 * exist.
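 *
 * (This is also why the btrfs_iget() error is propagated right after
 * this comment instead of the dead parent simply being skipped: the
 * caller has to fall back to a transaction commit.)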
6152 */ 6153 if (IS_ERR(dir_inode)) { 6154 ret = PTR_ERR(dir_inode); 6155 goto out; 6156 } 6157 6158 if (!need_log_inode(trans, BTRFS_I(dir_inode))) { 6159 btrfs_add_delayed_iput(dir_inode); 6160 continue; 6161 } 6162 6163 ctx->log_new_dentries = false; 6164 ret = btrfs_log_inode(trans, BTRFS_I(dir_inode), 6165 LOG_INODE_ALL, ctx); 6166 if (!ret && ctx->log_new_dentries) 6167 ret = log_new_dir_dentries(trans, root, 6168 BTRFS_I(dir_inode), ctx); 6169 btrfs_add_delayed_iput(dir_inode); 6170 if (ret) 6171 goto out; 6172 } 6173 path->slots[0]++; 6174 } 6175 ret = 0; 6176 out: 6177 btrfs_free_path(path); 6178 return ret; 6179 } 6180 6181 static int log_new_ancestors(struct btrfs_trans_handle *trans, 6182 struct btrfs_root *root, 6183 struct btrfs_path *path, 6184 struct btrfs_log_ctx *ctx) 6185 { 6186 struct btrfs_key found_key; 6187 6188 btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]); 6189 6190 while (true) { 6191 struct btrfs_fs_info *fs_info = root->fs_info; 6192 struct extent_buffer *leaf = path->nodes[0]; 6193 int slot = path->slots[0]; 6194 struct btrfs_key search_key; 6195 struct inode *inode; 6196 u64 ino; 6197 int ret = 0; 6198 6199 btrfs_release_path(path); 6200 6201 ino = found_key.offset; 6202 6203 search_key.objectid = found_key.offset; 6204 search_key.type = BTRFS_INODE_ITEM_KEY; 6205 search_key.offset = 0; 6206 inode = btrfs_iget(fs_info->sb, ino, root); 6207 if (IS_ERR(inode)) 6208 return PTR_ERR(inode); 6209 6210 if (BTRFS_I(inode)->generation >= trans->transid && 6211 need_log_inode(trans, BTRFS_I(inode))) 6212 ret = btrfs_log_inode(trans, BTRFS_I(inode), 6213 LOG_INODE_EXISTS, ctx); 6214 btrfs_add_delayed_iput(inode); 6215 if (ret) 6216 return ret; 6217 6218 if (search_key.objectid == BTRFS_FIRST_FREE_OBJECTID) 6219 break; 6220 6221 search_key.type = BTRFS_INODE_REF_KEY; 6222 ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); 6223 if (ret < 0) 6224 return ret; 6225 6226 leaf = path->nodes[0]; 6227 slot = path->slots[0]; 6228 if (slot >= btrfs_header_nritems(leaf)) { 6229 ret = btrfs_next_leaf(root, path); 6230 if (ret < 0) 6231 return ret; 6232 else if (ret > 0) 6233 return -ENOENT; 6234 leaf = path->nodes[0]; 6235 slot = path->slots[0]; 6236 } 6237 6238 btrfs_item_key_to_cpu(leaf, &found_key, slot); 6239 if (found_key.objectid != search_key.objectid || 6240 found_key.type != BTRFS_INODE_REF_KEY) 6241 return -ENOENT; 6242 } 6243 return 0; 6244 } 6245 6246 static int log_new_ancestors_fast(struct btrfs_trans_handle *trans, 6247 struct btrfs_inode *inode, 6248 struct dentry *parent, 6249 struct btrfs_log_ctx *ctx) 6250 { 6251 struct btrfs_root *root = inode->root; 6252 struct dentry *old_parent = NULL; 6253 struct super_block *sb = inode->vfs_inode.i_sb; 6254 int ret = 0; 6255 6256 while (true) { 6257 if (!parent || d_really_is_negative(parent) || 6258 sb != parent->d_sb) 6259 break; 6260 6261 inode = BTRFS_I(d_inode(parent)); 6262 if (root != inode->root) 6263 break; 6264 6265 if (inode->generation >= trans->transid && 6266 need_log_inode(trans, inode)) { 6267 ret = btrfs_log_inode(trans, inode, 6268 LOG_INODE_EXISTS, ctx); 6269 if (ret) 6270 break; 6271 } 6272 if (IS_ROOT(parent)) 6273 break; 6274 6275 parent = dget_parent(parent); 6276 dput(old_parent); 6277 old_parent = parent; 6278 } 6279 dput(old_parent); 6280 6281 return ret; 6282 } 6283 6284 static int log_all_new_ancestors(struct btrfs_trans_handle *trans, 6285 struct btrfs_inode *inode, 6286 struct dentry *parent, 6287 struct btrfs_log_ctx *ctx) 6288 { 6289 struct btrfs_root *root = 
inode->root; 6290 const u64 ino = btrfs_ino(inode); 6291 struct btrfs_path *path; 6292 struct btrfs_key search_key; 6293 int ret; 6294 6295 /* 6296 * For a single hard link case, go through a fast path that does not 6297 * need to iterate the fs/subvolume tree. 6298 */ 6299 if (inode->vfs_inode.i_nlink < 2) 6300 return log_new_ancestors_fast(trans, inode, parent, ctx); 6301 6302 path = btrfs_alloc_path(); 6303 if (!path) 6304 return -ENOMEM; 6305 6306 search_key.objectid = ino; 6307 search_key.type = BTRFS_INODE_REF_KEY; 6308 search_key.offset = 0; 6309 again: 6310 ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); 6311 if (ret < 0) 6312 goto out; 6313 if (ret == 0) 6314 path->slots[0]++; 6315 6316 while (true) { 6317 struct extent_buffer *leaf = path->nodes[0]; 6318 int slot = path->slots[0]; 6319 struct btrfs_key found_key; 6320 6321 if (slot >= btrfs_header_nritems(leaf)) { 6322 ret = btrfs_next_leaf(root, path); 6323 if (ret < 0) 6324 goto out; 6325 else if (ret > 0) 6326 break; 6327 continue; 6328 } 6329 6330 btrfs_item_key_to_cpu(leaf, &found_key, slot); 6331 if (found_key.objectid != ino || 6332 found_key.type > BTRFS_INODE_EXTREF_KEY) 6333 break; 6334 6335 /* 6336 * Don't deal with extended references because they are rare 6337 * cases and too complex to deal with (we would need to keep 6338 * track of which subitem we are processing for each item in 6339 * this loop, etc). So just return some error to fall back to 6340 * a transaction commit. 6341 */ 6342 if (found_key.type == BTRFS_INODE_EXTREF_KEY) { 6343 ret = -EMLINK; 6344 goto out; 6345 } 6346 6347 /* 6348 * Logging ancestors needs to do more searches on the fs/subvol 6349 * tree, so it releases the path as needed to avoid deadlocks. 6350 * Keep track of the last inode ref key and resume from that key 6351 * after logging all new ancestors for the current hard link. 6352 */ 6353 memcpy(&search_key, &found_key, sizeof(search_key)); 6354 6355 ret = log_new_ancestors(trans, root, path, ctx); 6356 if (ret) 6357 goto out; 6358 btrfs_release_path(path); 6359 goto again; 6360 } 6361 ret = 0; 6362 out: 6363 btrfs_free_path(path); 6364 return ret; 6365 } 6366 6367 /* 6368 * helper function around btrfs_log_inode to make sure newly created 6369 * parent directories also end up in the log. Minimal, inode-and-backref 6370 * only logging is done for any parent directory that is older than 6371 * the last committed transaction. 6372 */ 6373 static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, 6374 struct btrfs_inode *inode, 6375 struct dentry *parent, 6376 int inode_only, 6377 struct btrfs_log_ctx *ctx) 6378 { 6379 struct btrfs_root *root = inode->root; 6380 struct btrfs_fs_info *fs_info = root->fs_info; 6381 int ret = 0; 6382 bool log_dentries = false; 6383 6384 if (btrfs_test_opt(fs_info, NOTREELOG)) { 6385 ret = 1; 6386 goto end_no_trans; 6387 } 6388 6389 if (btrfs_root_refs(&root->root_item) == 0) { 6390 ret = 1; 6391 goto end_no_trans; 6392 } 6393 6394 /* 6395 * Skip already logged inodes or inodes corresponding to tmpfiles 6396 * (since logging them is pointless, a link count of 0 means they 6397 * will never be accessible).
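 *
 * A never-linked O_TMPFILE inode is the typical i_nlink == 0 case
 * here: replaying it would only recreate an inode that no directory
 * entry can reach.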
6398 */ 6399 if ((btrfs_inode_in_log(inode, trans->transid) && 6400 list_empty(&ctx->ordered_extents)) || 6401 inode->vfs_inode.i_nlink == 0) { 6402 ret = BTRFS_NO_LOG_SYNC; 6403 goto end_no_trans; 6404 } 6405 6406 ret = start_log_trans(trans, root, ctx); 6407 if (ret) 6408 goto end_no_trans; 6409 6410 ret = btrfs_log_inode(trans, inode, inode_only, ctx); 6411 if (ret) 6412 goto end_trans; 6413 6414 /* 6415 * for a regular file, if its inode is already on disk, we don't 6416 * have to worry about the parents at all. This is because 6417 * we can use the last_unlink_trans field to record renames 6418 * and other fun in this file. 6419 */ 6420 if (S_ISREG(inode->vfs_inode.i_mode) && 6421 inode->generation < trans->transid && 6422 inode->last_unlink_trans < trans->transid) { 6423 ret = 0; 6424 goto end_trans; 6425 } 6426 6427 if (S_ISDIR(inode->vfs_inode.i_mode) && ctx->log_new_dentries) 6428 log_dentries = true; 6429 6430 /* 6431 * On unlink we must make sure all our current and old parent directory 6432 * inodes are fully logged. This is to prevent leaving dangling 6433 * directory index entries in directories that were our parents but are 6434 * not anymore. Not doing this results in the old parent directory being 6435 * impossible to delete after log replay (rmdir will always fail with 6436 * error -ENOTEMPTY). 6437 * 6438 * Example 1: 6439 * 6440 * mkdir testdir 6441 * touch testdir/foo 6442 * ln testdir/foo testdir/bar 6443 * sync 6444 * unlink testdir/bar 6445 * xfs_io -c fsync testdir/foo 6446 * <power failure> 6447 * mount fs, triggers log replay 6448 * 6449 * If we don't log the parent directory (testdir), after log replay the 6450 * directory still has an entry pointing to the file inode using the bar 6451 * name, but a matching BTRFS_INODE_[REF|EXTREF]_KEY does not exist and 6452 * the file inode has a link count of 1. 6453 * 6454 * Example 2: 6455 * 6456 * mkdir testdir 6457 * touch foo 6458 * ln foo testdir/foo2 6459 * ln foo testdir/foo3 6460 * sync 6461 * unlink testdir/foo3 6462 * xfs_io -c fsync foo 6463 * <power failure> 6464 * mount fs, triggers log replay 6465 * 6466 * Similar to the first example, after log replay the parent directory 6467 * testdir still has an entry pointing to the file inode with the name foo3 6468 * but the file inode does not have a matching BTRFS_INODE_REF_KEY item 6469 * and has a link count of 2. 6470 */ 6471 if (inode->last_unlink_trans >= trans->transid) { 6472 ret = btrfs_log_all_parents(trans, inode, ctx); 6473 if (ret) 6474 goto end_trans; 6475 } 6476 6477 ret = log_all_new_ancestors(trans, inode, parent, ctx); 6478 if (ret) 6479 goto end_trans; 6480 6481 if (log_dentries) 6482 ret = log_new_dir_dentries(trans, root, inode, ctx); 6483 else 6484 ret = 0; 6485 end_trans: 6486 if (ret < 0) { 6487 btrfs_set_log_full_commit(trans); 6488 ret = 1; 6489 } 6490 6491 if (ret) 6492 btrfs_remove_log_ctx(root, ctx); 6493 btrfs_end_log_trans(root); 6494 end_no_trans: 6495 return ret; 6496 } 6497 6498 /* 6499 * it is not safe to log a dentry if the chunk root has added new 6500 * chunks. This returns 0 if the dentry was logged, and 1 otherwise. 6501 * If this returns 1, you must commit the transaction to safely get your 6502 * data on disk.
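 *
 * A rough sketch of the expected caller behaviour (modeled on the fsync
 * path; error handling and the BTRFS_NO_LOG_SYNC case omitted):
 *
 *	ret = btrfs_log_dentry_safe(trans, dentry, &ctx);
 *	if (ret == 0)
 *		ret = btrfs_sync_log(trans, root, &ctx);
 *	else
 *		ret = btrfs_commit_transaction(trans);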
6503 */ 6504 int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, 6505 struct dentry *dentry, 6506 struct btrfs_log_ctx *ctx) 6507 { 6508 struct dentry *parent = dget_parent(dentry); 6509 int ret; 6510 6511 ret = btrfs_log_inode_parent(trans, BTRFS_I(d_inode(dentry)), parent, 6512 LOG_INODE_ALL, ctx); 6513 dput(parent); 6514 6515 return ret; 6516 } 6517 6518 /* 6519 * should be called during mount to recover and replay any log trees 6520 * from the FS 6521 */ 6522 int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) 6523 { 6524 int ret; 6525 struct btrfs_path *path; 6526 struct btrfs_trans_handle *trans; 6527 struct btrfs_key key; 6528 struct btrfs_key found_key; 6529 struct btrfs_root *log; 6530 struct btrfs_fs_info *fs_info = log_root_tree->fs_info; 6531 struct walk_control wc = { 6532 .process_func = process_one_buffer, 6533 .stage = LOG_WALK_PIN_ONLY, 6534 }; 6535 6536 path = btrfs_alloc_path(); 6537 if (!path) 6538 return -ENOMEM; 6539 6540 set_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags); 6541 6542 trans = btrfs_start_transaction(fs_info->tree_root, 0); 6543 if (IS_ERR(trans)) { 6544 ret = PTR_ERR(trans); 6545 goto error; 6546 } 6547 6548 wc.trans = trans; 6549 wc.pin = 1; 6550 6551 ret = walk_log_tree(trans, log_root_tree, &wc); 6552 if (ret) { 6553 btrfs_abort_transaction(trans, ret); 6554 goto error; 6555 } 6556 6557 again: 6558 key.objectid = BTRFS_TREE_LOG_OBJECTID; 6559 key.offset = (u64)-1; 6560 key.type = BTRFS_ROOT_ITEM_KEY; 6561 6562 while (1) { 6563 ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0); 6564 6565 if (ret < 0) { 6566 btrfs_abort_transaction(trans, ret); 6567 goto error; 6568 } 6569 if (ret > 0) { 6570 if (path->slots[0] == 0) 6571 break; 6572 path->slots[0]--; 6573 } 6574 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 6575 path->slots[0]); 6576 btrfs_release_path(path); 6577 if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID) 6578 break; 6579 6580 log = btrfs_read_tree_root(log_root_tree, &found_key); 6581 if (IS_ERR(log)) { 6582 ret = PTR_ERR(log); 6583 btrfs_abort_transaction(trans, ret); 6584 goto error; 6585 } 6586 6587 wc.replay_dest = btrfs_get_fs_root(fs_info, found_key.offset, 6588 true); 6589 if (IS_ERR(wc.replay_dest)) { 6590 ret = PTR_ERR(wc.replay_dest); 6591 6592 /* 6593 * We didn't find the subvol, likely because it was 6594 * deleted. This is ok, simply skip this log and go to 6595 * the next one. 6596 * 6597 * We need to exclude the root because we can't have 6598 * other log replays overwriting this log as we'll read 6599 * it back in a few more times. This will keep our 6600 * block from being modified, and we'll just bail for 6601 * each subsequent pass.
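 *
 * ("Each subsequent pass" because the whole loop over the log roots
 * runs once per walk stage: first LOG_WALK_PIN_ONLY, then each of the
 * replay stages, restarted via the "again" label near the end of this
 * function.)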
6602 */ 6603 if (ret == -ENOENT) 6604 ret = btrfs_pin_extent_for_log_replay(trans, 6605 log->node->start, 6606 log->node->len); 6607 btrfs_put_root(log); 6608 6609 if (!ret) 6610 goto next; 6611 btrfs_abort_transaction(trans, ret); 6612 goto error; 6613 } 6614 6615 wc.replay_dest->log_root = log; 6616 ret = btrfs_record_root_in_trans(trans, wc.replay_dest); 6617 if (ret) 6618 /* The loop needs to continue due to the root refs */ 6619 btrfs_abort_transaction(trans, ret); 6620 else 6621 ret = walk_log_tree(trans, log, &wc); 6622 6623 if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) { 6624 ret = fixup_inode_link_counts(trans, wc.replay_dest, 6625 path); 6626 if (ret) 6627 btrfs_abort_transaction(trans, ret); 6628 } 6629 6630 if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) { 6631 struct btrfs_root *root = wc.replay_dest; 6632 6633 btrfs_release_path(path); 6634 6635 /* 6636 * We have just replayed everything, and the highest 6637 * objectid of fs roots probably has changed in case 6638 * some inode items got replayed. 6639 * 6640 * root->objectid_mutex is not acquired as log replay 6641 * could only happen during mount. 6642 */ 6643 ret = btrfs_init_root_free_objectid(root); 6644 if (ret) 6645 btrfs_abort_transaction(trans, ret); 6646 } 6647 6648 wc.replay_dest->log_root = NULL; 6649 btrfs_put_root(wc.replay_dest); 6650 btrfs_put_root(log); 6651 6652 if (ret) 6653 goto error; 6654 next: 6655 if (found_key.offset == 0) 6656 break; 6657 key.offset = found_key.offset - 1; 6658 } 6659 btrfs_release_path(path); 6660 6661 /* step one is to pin it all, step two is to replay just inodes */ 6662 if (wc.pin) { 6663 wc.pin = 0; 6664 wc.process_func = replay_one_buffer; 6665 wc.stage = LOG_WALK_REPLAY_INODES; 6666 goto again; 6667 } 6668 /* step three is to replay everything */ 6669 if (wc.stage < LOG_WALK_REPLAY_ALL) { 6670 wc.stage++; 6671 goto again; 6672 } 6673 6674 btrfs_free_path(path); 6675 6676 /* step 4: commit the transaction, which also unpins the blocks */ 6677 ret = btrfs_commit_transaction(trans); 6678 if (ret) 6679 return ret; 6680 6681 log_root_tree->log_root = NULL; 6682 clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags); 6683 btrfs_put_root(log_root_tree); 6684 6685 return 0; 6686 error: 6687 if (wc.trans) 6688 btrfs_end_transaction(wc.trans); 6689 clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags); 6690 btrfs_free_path(path); 6691 return ret; 6692 } 6693 6694 /* 6695 * there are some corner cases where we want to force a full 6696 * commit instead of allowing a directory to be logged. 6697 * 6698 * They revolve around files that were unlinked from the directory, and 6699 * this function updates the parent directory so that a full commit is 6700 * properly done if it is fsync'd later after the unlinks are done. 6701 * 6702 * Must be called before the unlink operations (updates to the subvolume tree, 6703 * inodes, etc) are done. 6704 */ 6705 void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans, 6706 struct btrfs_inode *dir, struct btrfs_inode *inode, 6707 int for_rename) 6708 { 6709 /* 6710 * when we're logging a file, if it hasn't been renamed 6711 * or unlinked, and its inode is fully committed on disk, 6712 * we don't have to worry about walking up the directory chain 6713 * to log its parents. 6714 * 6715 * So, we use the last_unlink_trans field to put this transid 6716 * into the file. When the file is logged we check it and 6717 * don't log the parents if the file is fully on disk.
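 *
 * Illustrative sequence: touch dir/foo; sync; mv dir/foo other/foo;
 * fsync other/foo. Because last_unlink_trans is stamped right below
 * during the rename, the later fsync of foo cannot take the shortcut
 * in btrfs_log_inode_parent() and must log the parent directories too
 * (via btrfs_log_all_parents()).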
6718 */ 6719 mutex_lock(&inode->log_mutex); 6720 inode->last_unlink_trans = trans->transid; 6721 mutex_unlock(&inode->log_mutex); 6722 6723 /* 6724 * if this directory was already logged, any new 6725 * names for this file/dir will get recorded 6726 */ 6727 if (dir->logged_trans == trans->transid) 6728 return; 6729 6730 /* 6731 * if the inode we're about to unlink was logged, 6732 * the log will be properly updated for any new names 6733 */ 6734 if (inode->logged_trans == trans->transid) 6735 return; 6736 6737 /* 6738 * when renaming files across directories, if the directory 6739 * we're unlinking from gets fsync'd later on, there's 6740 * no way to find the destination directory later and fsync it 6741 * properly. So, we have to be conservative and force commits 6742 * so the new name gets discovered. 6743 */ 6744 if (for_rename) 6745 goto record; 6746 6747 /* we can safely do the unlink without any special recording */ 6748 return; 6749 6750 record: 6751 mutex_lock(&dir->log_mutex); 6752 dir->last_unlink_trans = trans->transid; 6753 mutex_unlock(&dir->log_mutex); 6754 } 6755 6756 /* 6757 * Make sure that if someone attempts to fsync the parent directory of a deleted 6758 * snapshot, it ends up triggering a transaction commit. This is to guarantee 6759 * that after replaying the log tree of the parent directory's root we will not 6760 * see the snapshot anymore and at log replay time we will not see any log tree 6761 * corresponding to the deleted snapshot's root, which could lead to replaying 6762 * it after replaying the log tree of the parent directory (which would replay 6763 * the snapshot delete operation). 6764 * 6765 * Must be called before the actual snapshot destroy operation (updates to the 6766 * parent root and tree of tree roots trees, etc) are done. 6767 */ 6768 void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans, 6769 struct btrfs_inode *dir) 6770 { 6771 mutex_lock(&dir->log_mutex); 6772 dir->last_unlink_trans = trans->transid; 6773 mutex_unlock(&dir->log_mutex); 6774 } 6775 6776 /* 6777 * Call this after adding a new name for a file and it will properly 6778 * update the log to reflect the new name. 6779 */ 6780 void btrfs_log_new_name(struct btrfs_trans_handle *trans, 6781 struct btrfs_inode *inode, struct btrfs_inode *old_dir, 6782 struct dentry *parent) 6783 { 6784 struct btrfs_log_ctx ctx; 6785 6786 /* 6787 * this will force the logging code to walk the dentry chain 6788 * up for the file 6789 */ 6790 if (!S_ISDIR(inode->vfs_inode.i_mode)) 6791 inode->last_unlink_trans = trans->transid; 6792 6793 /* 6794 * if this inode hasn't been logged and the directory we're renaming it 6795 * from hasn't been logged, we don't need to log it 6796 */ 6797 if (!inode_logged(trans, inode) && 6798 (!old_dir || !inode_logged(trans, old_dir))) 6799 return; 6800 6801 /* 6802 * If we are doing a rename (old_dir is not NULL) from a directory that 6803 * was previously logged, make sure the next log attempt on the directory 6804 * is not skipped and logs the inode again. This is because the log may 6805 * not currently be authoritative for a range including the old 6806 * BTRFS_DIR_INDEX_KEY key, so we want to make sure after a log replay we 6807 * do not end up with both the new and old dentries around (in case the 6808 * inode is a directory we would have a directory with two hard links and 6809 * 2 inode references for different parents).
The next log attempt of 6810 * old_dir will happen at btrfs_log_all_parents(), called through 6811 * btrfs_log_inode_parent() below, because we have previously set 6812 * inode->last_unlink_trans to the current transaction ID, either here or 6813 * at btrfs_record_unlink_dir() in case the inode is a directory. 6814 */ 6815 if (old_dir) 6816 old_dir->logged_trans = 0; 6817 6818 btrfs_init_log_ctx(&ctx, &inode->vfs_inode); 6819 ctx.logging_new_name = true; 6820 /* 6821 * We don't care about the return value. If we fail to log the new name 6822 * then we know the next attempt to sync the log will fall back to a full 6823 * transaction commit (due to a call to btrfs_set_log_full_commit()), so 6824 * we don't need to worry about getting a log committed that has an 6825 * inconsistent state after a rename operation. 6826 */ 6827 btrfs_log_inode_parent(trans, inode, parent, LOG_INODE_EXISTS, &ctx); 6828 } 6829 6830
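/*
 * Typical call sites for btrfs_log_new_name() (a rough sketch - the exact
 * guards live in the link and rename paths of fs/btrfs/inode.c):
 * btrfs_link() calls it with a NULL old_dir after adding the new
 * reference, and btrfs_rename() calls it with the source directory as
 * @old_dir once the subvolume tree has been updated for the rename.
 */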