1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright (C) 2008 Oracle. All rights reserved. 4 */ 5 6 #include <linux/sched.h> 7 #include <linux/slab.h> 8 #include <linux/blkdev.h> 9 #include <linux/list_sort.h> 10 #include <linux/iversion.h> 11 #include "misc.h" 12 #include "ctree.h" 13 #include "tree-log.h" 14 #include "disk-io.h" 15 #include "locking.h" 16 #include "print-tree.h" 17 #include "backref.h" 18 #include "compression.h" 19 #include "qgroup.h" 20 #include "block-group.h" 21 #include "space-info.h" 22 #include "zoned.h" 23 #include "inode-item.h" 24 #include "fs.h" 25 #include "accessors.h" 26 #include "extent-tree.h" 27 #include "root-tree.h" 28 #include "dir-item.h" 29 #include "file-item.h" 30 #include "file.h" 31 #include "orphan.h" 32 #include "tree-checker.h" 33 34 #define MAX_CONFLICT_INODES 10 35 36 /* magic values for the inode_only field in btrfs_log_inode: 37 * 38 * LOG_INODE_ALL means to log everything 39 * LOG_INODE_EXISTS means to log just enough to recreate the inode 40 * during log replay 41 */ 42 enum { 43 LOG_INODE_ALL, 44 LOG_INODE_EXISTS, 45 }; 46 47 /* 48 * directory trouble cases 49 * 50 * 1) on rename or unlink, if the inode being unlinked isn't in the fsync 51 * log, we must force a full commit before doing an fsync of the directory 52 * where the unlink was done. 53 * ---> record transid of last unlink/rename per directory 54 * 55 * mkdir foo/some_dir 56 * normal commit 57 * rename foo/some_dir foo2/some_dir 58 * mkdir foo/some_dir 59 * fsync foo/some_dir/some_file 60 * 61 * The fsync above will unlink the original some_dir without recording 62 * it in its new location (foo2). After a crash, some_dir will be gone 63 * unless the fsync of some_file forces a full commit 64 * 65 * 2) we must log any new names for any file or dir that is in the fsync 66 * log. ---> check inode while renaming/linking. 67 * 68 * 2a) we must log any new names for any file or dir during rename 69 * when the directory they are being removed from was logged. 70 * ---> check inode and old parent dir during rename 71 * 72 * 2a is actually the more important variant. With the extra logging 73 * a crash might unlink the old name without recreating the new one 74 * 75 * 3) after a crash, we must go through any directories with a link count 76 * of zero and redo the rm -rf 77 * 78 * mkdir f1/foo 79 * normal commit 80 * rm -rf f1/foo 81 * fsync(f1) 82 * 83 * The directory f1 was fully removed from the FS, but fsync was never 84 * called on f1, only its parent dir. After a crash the rm -rf must 85 * be replayed. This must be able to recurse down the entire 86 * directory tree. The inode link count fixup code takes care of the 87 * ugly details. 88 */ 89 90 /* 91 * stages for the tree walking. The first 92 * stage (0) is to only pin down the blocks we find 93 * the second stage (1) is to make sure that all the inodes 94 * we find in the log are created in the subvolume. 
 *
 * The last stage is to deal with directories and links and extents
 * and all the other fun semantics
 */
enum {
	LOG_WALK_PIN_ONLY,
	LOG_WALK_REPLAY_INODES,
	LOG_WALK_REPLAY_DIR_INDEX,
	LOG_WALK_REPLAY_ALL,
};

static int btrfs_log_inode(struct btrfs_trans_handle *trans,
			   struct btrfs_inode *inode,
			   int inode_only,
			   struct btrfs_log_ctx *ctx);
static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root,
			     struct btrfs_path *path, u64 objectid);
static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
				       struct btrfs_root *root,
				       struct btrfs_root *log,
				       struct btrfs_path *path,
				       u64 dirid, int del_all);
static void wait_log_commit(struct btrfs_root *root, int transid);

/*
 * tree logging is a special write ahead log used to make sure that
 * fsyncs and O_SYNCs can happen without doing full tree commits.
 *
 * Full tree commits are expensive because they require commonly
 * modified blocks to be recowed, creating many dirty pages in the
 * extent tree and a 4x-6x higher write load than ext3.
 *
 * Instead of doing a tree commit on every fsync, we use the
 * key ranges and transaction ids to find items for a given file or directory
 * that have changed in this transaction. Those items are copied into
 * a special tree (one per subvolume root), that tree is written to disk
 * and then the fsync is considered complete.
 *
 * After a crash, items are copied out of the log-tree back into the
 * subvolume tree. Any file data extents found are recorded in the extent
 * allocation tree, and the log-tree freed.
 *
 * The log tree is read three times: once to pin down all the extents it is
 * using in ram, once to create all the inodes logged in the tree
 * and once to do all the other items.
 */

/*
 * start a sub transaction and setup the log tree
 * this increments the log tree writer count to make the people
 * syncing the tree wait for us to finish
 */
static int start_log_trans(struct btrfs_trans_handle *trans,
			   struct btrfs_root *root,
			   struct btrfs_log_ctx *ctx)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_root *tree_root = fs_info->tree_root;
	const bool zoned = btrfs_is_zoned(fs_info);
	int ret = 0;
	bool created = false;

	/*
	 * First check if the log root tree was already created. If not, create
	 * it before locking the root's log_mutex, just to keep lockdep happy.
161 */ 162 if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &tree_root->state)) { 163 mutex_lock(&tree_root->log_mutex); 164 if (!fs_info->log_root_tree) { 165 ret = btrfs_init_log_root_tree(trans, fs_info); 166 if (!ret) { 167 set_bit(BTRFS_ROOT_HAS_LOG_TREE, &tree_root->state); 168 created = true; 169 } 170 } 171 mutex_unlock(&tree_root->log_mutex); 172 if (ret) 173 return ret; 174 } 175 176 mutex_lock(&root->log_mutex); 177 178 again: 179 if (root->log_root) { 180 int index = (root->log_transid + 1) % 2; 181 182 if (btrfs_need_log_full_commit(trans)) { 183 ret = BTRFS_LOG_FORCE_COMMIT; 184 goto out; 185 } 186 187 if (zoned && atomic_read(&root->log_commit[index])) { 188 wait_log_commit(root, root->log_transid - 1); 189 goto again; 190 } 191 192 if (!root->log_start_pid) { 193 clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state); 194 root->log_start_pid = current->pid; 195 } else if (root->log_start_pid != current->pid) { 196 set_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state); 197 } 198 } else { 199 /* 200 * This means fs_info->log_root_tree was already created 201 * for some other FS trees. Do the full commit not to mix 202 * nodes from multiple log transactions to do sequential 203 * writing. 204 */ 205 if (zoned && !created) { 206 ret = BTRFS_LOG_FORCE_COMMIT; 207 goto out; 208 } 209 210 ret = btrfs_add_log_tree(trans, root); 211 if (ret) 212 goto out; 213 214 set_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state); 215 clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state); 216 root->log_start_pid = current->pid; 217 } 218 219 atomic_inc(&root->log_writers); 220 if (!ctx->logging_new_name) { 221 int index = root->log_transid % 2; 222 list_add_tail(&ctx->list, &root->log_ctxs[index]); 223 ctx->log_transid = root->log_transid; 224 } 225 226 out: 227 mutex_unlock(&root->log_mutex); 228 return ret; 229 } 230 231 /* 232 * returns 0 if there was a log transaction running and we were able 233 * to join, or returns -ENOENT if there were not transactions 234 * in progress 235 */ 236 static int join_running_log_trans(struct btrfs_root *root) 237 { 238 const bool zoned = btrfs_is_zoned(root->fs_info); 239 int ret = -ENOENT; 240 241 if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state)) 242 return ret; 243 244 mutex_lock(&root->log_mutex); 245 again: 246 if (root->log_root) { 247 int index = (root->log_transid + 1) % 2; 248 249 ret = 0; 250 if (zoned && atomic_read(&root->log_commit[index])) { 251 wait_log_commit(root, root->log_transid - 1); 252 goto again; 253 } 254 atomic_inc(&root->log_writers); 255 } 256 mutex_unlock(&root->log_mutex); 257 return ret; 258 } 259 260 /* 261 * This either makes the current running log transaction wait 262 * until you call btrfs_end_log_trans() or it makes any future 263 * log transactions wait until you call btrfs_end_log_trans() 264 */ 265 void btrfs_pin_log_trans(struct btrfs_root *root) 266 { 267 atomic_inc(&root->log_writers); 268 } 269 270 /* 271 * indicate we're done making changes to the log tree 272 * and wake up anyone waiting to do a sync 273 */ 274 void btrfs_end_log_trans(struct btrfs_root *root) 275 { 276 if (atomic_dec_and_test(&root->log_writers)) { 277 /* atomic_dec_and_test implies a barrier */ 278 cond_wake_up_nomb(&root->log_writer_wait); 279 } 280 } 281 282 static void btrfs_wait_tree_block_writeback(struct extent_buffer *buf) 283 { 284 filemap_fdatawait_range(buf->pages[0]->mapping, 285 buf->start, buf->start + buf->len - 1); 286 } 287 288 /* 289 * the walk control struct is used to pass state down the chain when 290 * processing the log tree. 
The stage field tells us which part 291 * of the log tree processing we are currently doing. The others 292 * are state fields used for that specific part 293 */ 294 struct walk_control { 295 /* should we free the extent on disk when done? This is used 296 * at transaction commit time while freeing a log tree 297 */ 298 int free; 299 300 /* pin only walk, we record which extents on disk belong to the 301 * log trees 302 */ 303 int pin; 304 305 /* what stage of the replay code we're currently in */ 306 int stage; 307 308 /* 309 * Ignore any items from the inode currently being processed. Needs 310 * to be set every time we find a BTRFS_INODE_ITEM_KEY and we are in 311 * the LOG_WALK_REPLAY_INODES stage. 312 */ 313 bool ignore_cur_inode; 314 315 /* the root we are currently replaying */ 316 struct btrfs_root *replay_dest; 317 318 /* the trans handle for the current replay */ 319 struct btrfs_trans_handle *trans; 320 321 /* the function that gets used to process blocks we find in the 322 * tree. Note the extent_buffer might not be up to date when it is 323 * passed in, and it must be checked or read if you need the data 324 * inside it 325 */ 326 int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb, 327 struct walk_control *wc, u64 gen, int level); 328 }; 329 330 /* 331 * process_func used to pin down extents, write them or wait on them 332 */ 333 static int process_one_buffer(struct btrfs_root *log, 334 struct extent_buffer *eb, 335 struct walk_control *wc, u64 gen, int level) 336 { 337 struct btrfs_fs_info *fs_info = log->fs_info; 338 int ret = 0; 339 340 /* 341 * If this fs is mixed then we need to be able to process the leaves to 342 * pin down any logged extents, so we have to read the block. 343 */ 344 if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) { 345 struct btrfs_tree_parent_check check = { 346 .level = level, 347 .transid = gen 348 }; 349 350 ret = btrfs_read_extent_buffer(eb, &check); 351 if (ret) 352 return ret; 353 } 354 355 if (wc->pin) { 356 ret = btrfs_pin_extent_for_log_replay(wc->trans, eb->start, 357 eb->len); 358 if (ret) 359 return ret; 360 361 if (btrfs_buffer_uptodate(eb, gen, 0) && 362 btrfs_header_level(eb) == 0) 363 ret = btrfs_exclude_logged_extents(eb); 364 } 365 return ret; 366 } 367 368 /* 369 * Item overwrite used by replay and tree logging. eb, slot and key all refer 370 * to the src data we are copying out. 371 * 372 * root is the tree we are copying into, and path is a scratch 373 * path for use in this function (it should be released on entry and 374 * will be released on exit). 375 * 376 * If the key is already in the destination tree the existing item is 377 * overwritten. If the existing item isn't big enough, it is extended. 378 * If it is too large, it is truncated. 379 * 380 * If the key isn't in the destination yet, a new item is inserted. 381 */ 382 static int overwrite_item(struct btrfs_trans_handle *trans, 383 struct btrfs_root *root, 384 struct btrfs_path *path, 385 struct extent_buffer *eb, int slot, 386 struct btrfs_key *key) 387 { 388 int ret; 389 u32 item_size; 390 u64 saved_i_size = 0; 391 int save_old_i_size = 0; 392 unsigned long src_ptr; 393 unsigned long dst_ptr; 394 bool inode_item = key->type == BTRFS_INODE_ITEM_KEY; 395 396 /* 397 * This is only used during log replay, so the root is always from a 398 * fs/subvolume tree. In case we ever need to support a log root, then 399 * we'll have to clone the leaf in the path, release the path and use 400 * the leaf before writing into the log tree. 
See the comments at 401 * copy_items() for more details. 402 */ 403 ASSERT(root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID); 404 405 item_size = btrfs_item_size(eb, slot); 406 src_ptr = btrfs_item_ptr_offset(eb, slot); 407 408 /* Look for the key in the destination tree. */ 409 ret = btrfs_search_slot(NULL, root, key, path, 0, 0); 410 if (ret < 0) 411 return ret; 412 413 if (ret == 0) { 414 char *src_copy; 415 char *dst_copy; 416 u32 dst_size = btrfs_item_size(path->nodes[0], 417 path->slots[0]); 418 if (dst_size != item_size) 419 goto insert; 420 421 if (item_size == 0) { 422 btrfs_release_path(path); 423 return 0; 424 } 425 dst_copy = kmalloc(item_size, GFP_NOFS); 426 src_copy = kmalloc(item_size, GFP_NOFS); 427 if (!dst_copy || !src_copy) { 428 btrfs_release_path(path); 429 kfree(dst_copy); 430 kfree(src_copy); 431 return -ENOMEM; 432 } 433 434 read_extent_buffer(eb, src_copy, src_ptr, item_size); 435 436 dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); 437 read_extent_buffer(path->nodes[0], dst_copy, dst_ptr, 438 item_size); 439 ret = memcmp(dst_copy, src_copy, item_size); 440 441 kfree(dst_copy); 442 kfree(src_copy); 443 /* 444 * they have the same contents, just return, this saves 445 * us from cowing blocks in the destination tree and doing 446 * extra writes that may not have been done by a previous 447 * sync 448 */ 449 if (ret == 0) { 450 btrfs_release_path(path); 451 return 0; 452 } 453 454 /* 455 * We need to load the old nbytes into the inode so when we 456 * replay the extents we've logged we get the right nbytes. 457 */ 458 if (inode_item) { 459 struct btrfs_inode_item *item; 460 u64 nbytes; 461 u32 mode; 462 463 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 464 struct btrfs_inode_item); 465 nbytes = btrfs_inode_nbytes(path->nodes[0], item); 466 item = btrfs_item_ptr(eb, slot, 467 struct btrfs_inode_item); 468 btrfs_set_inode_nbytes(eb, item, nbytes); 469 470 /* 471 * If this is a directory we need to reset the i_size to 472 * 0 so that we can set it up properly when replaying 473 * the rest of the items in this log. 474 */ 475 mode = btrfs_inode_mode(eb, item); 476 if (S_ISDIR(mode)) 477 btrfs_set_inode_size(eb, item, 0); 478 } 479 } else if (inode_item) { 480 struct btrfs_inode_item *item; 481 u32 mode; 482 483 /* 484 * New inode, set nbytes to 0 so that the nbytes comes out 485 * properly when we replay the extents. 486 */ 487 item = btrfs_item_ptr(eb, slot, struct btrfs_inode_item); 488 btrfs_set_inode_nbytes(eb, item, 0); 489 490 /* 491 * If this is a directory we need to reset the i_size to 0 so 492 * that we can set it up properly when replaying the rest of 493 * the items in this log. 
494 */ 495 mode = btrfs_inode_mode(eb, item); 496 if (S_ISDIR(mode)) 497 btrfs_set_inode_size(eb, item, 0); 498 } 499 insert: 500 btrfs_release_path(path); 501 /* try to insert the key into the destination tree */ 502 path->skip_release_on_error = 1; 503 ret = btrfs_insert_empty_item(trans, root, path, 504 key, item_size); 505 path->skip_release_on_error = 0; 506 507 /* make sure any existing item is the correct size */ 508 if (ret == -EEXIST || ret == -EOVERFLOW) { 509 u32 found_size; 510 found_size = btrfs_item_size(path->nodes[0], 511 path->slots[0]); 512 if (found_size > item_size) 513 btrfs_truncate_item(path, item_size, 1); 514 else if (found_size < item_size) 515 btrfs_extend_item(path, item_size - found_size); 516 } else if (ret) { 517 return ret; 518 } 519 dst_ptr = btrfs_item_ptr_offset(path->nodes[0], 520 path->slots[0]); 521 522 /* don't overwrite an existing inode if the generation number 523 * was logged as zero. This is done when the tree logging code 524 * is just logging an inode to make sure it exists after recovery. 525 * 526 * Also, don't overwrite i_size on directories during replay. 527 * log replay inserts and removes directory items based on the 528 * state of the tree found in the subvolume, and i_size is modified 529 * as it goes 530 */ 531 if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) { 532 struct btrfs_inode_item *src_item; 533 struct btrfs_inode_item *dst_item; 534 535 src_item = (struct btrfs_inode_item *)src_ptr; 536 dst_item = (struct btrfs_inode_item *)dst_ptr; 537 538 if (btrfs_inode_generation(eb, src_item) == 0) { 539 struct extent_buffer *dst_eb = path->nodes[0]; 540 const u64 ino_size = btrfs_inode_size(eb, src_item); 541 542 /* 543 * For regular files an ino_size == 0 is used only when 544 * logging that an inode exists, as part of a directory 545 * fsync, and the inode wasn't fsynced before. In this 546 * case don't set the size of the inode in the fs/subvol 547 * tree, otherwise we would be throwing valid data away. 
548 */ 549 if (S_ISREG(btrfs_inode_mode(eb, src_item)) && 550 S_ISREG(btrfs_inode_mode(dst_eb, dst_item)) && 551 ino_size != 0) 552 btrfs_set_inode_size(dst_eb, dst_item, ino_size); 553 goto no_copy; 554 } 555 556 if (S_ISDIR(btrfs_inode_mode(eb, src_item)) && 557 S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) { 558 save_old_i_size = 1; 559 saved_i_size = btrfs_inode_size(path->nodes[0], 560 dst_item); 561 } 562 } 563 564 copy_extent_buffer(path->nodes[0], eb, dst_ptr, 565 src_ptr, item_size); 566 567 if (save_old_i_size) { 568 struct btrfs_inode_item *dst_item; 569 dst_item = (struct btrfs_inode_item *)dst_ptr; 570 btrfs_set_inode_size(path->nodes[0], dst_item, saved_i_size); 571 } 572 573 /* make sure the generation is filled in */ 574 if (key->type == BTRFS_INODE_ITEM_KEY) { 575 struct btrfs_inode_item *dst_item; 576 dst_item = (struct btrfs_inode_item *)dst_ptr; 577 if (btrfs_inode_generation(path->nodes[0], dst_item) == 0) { 578 btrfs_set_inode_generation(path->nodes[0], dst_item, 579 trans->transid); 580 } 581 } 582 no_copy: 583 btrfs_mark_buffer_dirty(path->nodes[0]); 584 btrfs_release_path(path); 585 return 0; 586 } 587 588 static int read_alloc_one_name(struct extent_buffer *eb, void *start, int len, 589 struct fscrypt_str *name) 590 { 591 char *buf; 592 593 buf = kmalloc(len, GFP_NOFS); 594 if (!buf) 595 return -ENOMEM; 596 597 read_extent_buffer(eb, buf, (unsigned long)start, len); 598 name->name = buf; 599 name->len = len; 600 return 0; 601 } 602 603 /* 604 * simple helper to read an inode off the disk from a given root 605 * This can only be called for subvolume roots and not for the log 606 */ 607 static noinline struct inode *read_one_inode(struct btrfs_root *root, 608 u64 objectid) 609 { 610 struct inode *inode; 611 612 inode = btrfs_iget(root->fs_info->sb, objectid, root); 613 if (IS_ERR(inode)) 614 inode = NULL; 615 return inode; 616 } 617 618 /* replays a single extent in 'eb' at 'slot' with 'key' into the 619 * subvolume 'root'. path is released on entry and should be released 620 * on exit. 621 * 622 * extents in the log tree have not been allocated out of the extent 623 * tree yet. So, this completes the allocation, taking a reference 624 * as required if the extent already exists or creating a new extent 625 * if it isn't in the extent allocation tree yet. 626 * 627 * The extent is inserted into the file, dropping any existing extents 628 * from the file that overlap the new one. 629 */ 630 static noinline int replay_one_extent(struct btrfs_trans_handle *trans, 631 struct btrfs_root *root, 632 struct btrfs_path *path, 633 struct extent_buffer *eb, int slot, 634 struct btrfs_key *key) 635 { 636 struct btrfs_drop_extents_args drop_args = { 0 }; 637 struct btrfs_fs_info *fs_info = root->fs_info; 638 int found_type; 639 u64 extent_end; 640 u64 start = key->offset; 641 u64 nbytes = 0; 642 struct btrfs_file_extent_item *item; 643 struct inode *inode = NULL; 644 unsigned long size; 645 int ret = 0; 646 647 item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item); 648 found_type = btrfs_file_extent_type(eb, item); 649 650 if (found_type == BTRFS_FILE_EXTENT_REG || 651 found_type == BTRFS_FILE_EXTENT_PREALLOC) { 652 nbytes = btrfs_file_extent_num_bytes(eb, item); 653 extent_end = start + nbytes; 654 655 /* 656 * We don't add to the inodes nbytes if we are prealloc or a 657 * hole. 
658 */ 659 if (btrfs_file_extent_disk_bytenr(eb, item) == 0) 660 nbytes = 0; 661 } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { 662 size = btrfs_file_extent_ram_bytes(eb, item); 663 nbytes = btrfs_file_extent_ram_bytes(eb, item); 664 extent_end = ALIGN(start + size, 665 fs_info->sectorsize); 666 } else { 667 ret = 0; 668 goto out; 669 } 670 671 inode = read_one_inode(root, key->objectid); 672 if (!inode) { 673 ret = -EIO; 674 goto out; 675 } 676 677 /* 678 * first check to see if we already have this extent in the 679 * file. This must be done before the btrfs_drop_extents run 680 * so we don't try to drop this extent. 681 */ 682 ret = btrfs_lookup_file_extent(trans, root, path, 683 btrfs_ino(BTRFS_I(inode)), start, 0); 684 685 if (ret == 0 && 686 (found_type == BTRFS_FILE_EXTENT_REG || 687 found_type == BTRFS_FILE_EXTENT_PREALLOC)) { 688 struct btrfs_file_extent_item cmp1; 689 struct btrfs_file_extent_item cmp2; 690 struct btrfs_file_extent_item *existing; 691 struct extent_buffer *leaf; 692 693 leaf = path->nodes[0]; 694 existing = btrfs_item_ptr(leaf, path->slots[0], 695 struct btrfs_file_extent_item); 696 697 read_extent_buffer(eb, &cmp1, (unsigned long)item, 698 sizeof(cmp1)); 699 read_extent_buffer(leaf, &cmp2, (unsigned long)existing, 700 sizeof(cmp2)); 701 702 /* 703 * we already have a pointer to this exact extent, 704 * we don't have to do anything 705 */ 706 if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) { 707 btrfs_release_path(path); 708 goto out; 709 } 710 } 711 btrfs_release_path(path); 712 713 /* drop any overlapping extents */ 714 drop_args.start = start; 715 drop_args.end = extent_end; 716 drop_args.drop_cache = true; 717 ret = btrfs_drop_extents(trans, root, BTRFS_I(inode), &drop_args); 718 if (ret) 719 goto out; 720 721 if (found_type == BTRFS_FILE_EXTENT_REG || 722 found_type == BTRFS_FILE_EXTENT_PREALLOC) { 723 u64 offset; 724 unsigned long dest_offset; 725 struct btrfs_key ins; 726 727 if (btrfs_file_extent_disk_bytenr(eb, item) == 0 && 728 btrfs_fs_incompat(fs_info, NO_HOLES)) 729 goto update_inode; 730 731 ret = btrfs_insert_empty_item(trans, root, path, key, 732 sizeof(*item)); 733 if (ret) 734 goto out; 735 dest_offset = btrfs_item_ptr_offset(path->nodes[0], 736 path->slots[0]); 737 copy_extent_buffer(path->nodes[0], eb, dest_offset, 738 (unsigned long)item, sizeof(*item)); 739 740 ins.objectid = btrfs_file_extent_disk_bytenr(eb, item); 741 ins.offset = btrfs_file_extent_disk_num_bytes(eb, item); 742 ins.type = BTRFS_EXTENT_ITEM_KEY; 743 offset = key->offset - btrfs_file_extent_offset(eb, item); 744 745 /* 746 * Manually record dirty extent, as here we did a shallow 747 * file extent item copy and skip normal backref update, 748 * but modifying extent tree all by ourselves. 749 * So need to manually record dirty extent for qgroup, 750 * as the owner of the file extent changed from log tree 751 * (doesn't affect qgroup) to fs/file tree(affects qgroup) 752 */ 753 ret = btrfs_qgroup_trace_extent(trans, 754 btrfs_file_extent_disk_bytenr(eb, item), 755 btrfs_file_extent_disk_num_bytes(eb, item)); 756 if (ret < 0) 757 goto out; 758 759 if (ins.objectid > 0) { 760 struct btrfs_ref ref = { 0 }; 761 u64 csum_start; 762 u64 csum_end; 763 LIST_HEAD(ordered_sums); 764 765 /* 766 * is this extent already allocated in the extent 767 * allocation tree? 
If so, just add a reference
			 */
			ret = btrfs_lookup_data_extent(fs_info, ins.objectid,
						       ins.offset);
			if (ret < 0) {
				goto out;
			} else if (ret == 0) {
				btrfs_init_generic_ref(&ref,
						BTRFS_ADD_DELAYED_REF,
						ins.objectid, ins.offset, 0);
				btrfs_init_data_ref(&ref,
						root->root_key.objectid,
						key->objectid, offset, 0, false);
				ret = btrfs_inc_extent_ref(trans, &ref);
				if (ret)
					goto out;
			} else {
				/*
				 * insert the extent pointer in the extent
				 * allocation tree
				 */
				ret = btrfs_alloc_logged_file_extent(trans,
						root->root_key.objectid,
						key->objectid, offset, &ins);
				if (ret)
					goto out;
			}
			btrfs_release_path(path);

			if (btrfs_file_extent_compression(eb, item)) {
				csum_start = ins.objectid;
				csum_end = csum_start + ins.offset;
			} else {
				csum_start = ins.objectid +
					btrfs_file_extent_offset(eb, item);
				csum_end = csum_start +
					btrfs_file_extent_num_bytes(eb, item);
			}

			ret = btrfs_lookup_csums_list(root->log_root,
						csum_start, csum_end - 1,
						&ordered_sums, 0, false);
			if (ret)
				goto out;
			/*
			 * Now delete all existing csums in the csum root that
			 * cover our range. We do this because we can have an
			 * extent that is completely referenced by one file
			 * extent item and partially referenced by another
			 * file extent item (like after using the clone or
			 * extent_same ioctls). In this case if we end up doing
			 * the replay of the one that partially references the
			 * extent first, and we do not do the csum deletion
			 * below, we can get 2 csum items in the csum tree that
			 * overlap each other. For example, imagine our log has
			 * the two following file extent items:
			 *
			 * key (257 EXTENT_DATA 409600)
			 *	extent data disk byte 12845056 nr 102400
			 *	extent data offset 20480 nr 20480 ram 102400
			 *
			 * key (257 EXTENT_DATA 819200)
			 *	extent data disk byte 12845056 nr 102400
			 *	extent data offset 0 nr 102400 ram 102400
			 *
			 * Where the second one fully references the 100K extent
			 * that starts at disk byte 12845056, and the log tree
			 * has a single csum item that covers the entire range
			 * of the extent:
			 *
			 * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
			 *
			 * After the first file extent item is replayed, the
			 * csum tree gets the following csum item:
			 *
			 * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
			 *
			 * Which covers the 20K sub-range starting at offset 20K
			 * of our extent. Now when we replay the second file
			 * extent item, if we do not delete existing csum items
			 * that cover any of its blocks, we end up getting two
			 * csum items in our csum tree that overlap each other:
			 *
			 * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
			 * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
			 *
			 * Which is a problem, because after this anyone trying
			 * to look up the checksum of any block of our
			 * extent starting at an offset of 40K or higher, will
			 * end up looking at the second csum item only, which
			 * does not contain the checksum for any block starting
			 * at offset 40K or higher of our extent.
			 */
			while (!list_empty(&ordered_sums)) {
				struct btrfs_ordered_sum *sums;
				struct btrfs_root *csum_root;

				sums = list_entry(ordered_sums.next,
						struct btrfs_ordered_sum,
						list);
				csum_root = btrfs_csum_root(fs_info,
							    sums->bytenr);
				if (!ret)
					ret = btrfs_del_csums(trans, csum_root,
							      sums->bytenr,
							      sums->len);
				if (!ret)
					ret = btrfs_csum_file_blocks(trans,
								     csum_root,
								     sums);
				list_del(&sums->list);
				kfree(sums);
			}
			if (ret)
				goto out;
		} else {
			btrfs_release_path(path);
		}
	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
		/* inline extents are easy, we just overwrite them */
		ret = overwrite_item(trans, root, path, eb, slot, key);
		if (ret)
			goto out;
	}

	ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode), start,
						extent_end - start);
	if (ret)
		goto out;

update_inode:
	btrfs_update_inode_bytes(BTRFS_I(inode), nbytes, drop_args.bytes_found);
	ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
out:
	iput(inode);
	return ret;
}

static int unlink_inode_for_log_replay(struct btrfs_trans_handle *trans,
				       struct btrfs_inode *dir,
				       struct btrfs_inode *inode,
				       const struct fscrypt_str *name)
{
	int ret;

	ret = btrfs_unlink_inode(trans, dir, inode, name);
	if (ret)
		return ret;
	/*
	 * Whenever we need to check if a name exists or not, we check the
	 * fs/subvolume tree. So after an unlink we must run delayed items, so
	 * that future checks for a name during log replay see that the name
	 * does not exist anymore.
	 */
	return btrfs_run_delayed_items(trans);
}

/*
 * when cleaning up conflicts between the directory names in the
 * subvolume, directory names in the log and directory names in the
 * inode back references, we may have to unlink inodes from directories.
 *
 * This is a helper function to do the unlink of a specific directory
 * item
 */
static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
				      struct btrfs_path *path,
				      struct btrfs_inode *dir,
				      struct btrfs_dir_item *di)
{
	struct btrfs_root *root = dir->root;
	struct inode *inode;
	struct fscrypt_str name;
	struct extent_buffer *leaf;
	struct btrfs_key location;
	int ret;

	leaf = path->nodes[0];

	btrfs_dir_item_key_to_cpu(leaf, di, &location);
	ret = read_alloc_one_name(leaf, di + 1, btrfs_dir_name_len(leaf, di), &name);
	if (ret)
		return -ENOMEM;

	btrfs_release_path(path);

	inode = read_one_inode(root, location.objectid);
	if (!inode) {
		ret = -EIO;
		goto out;
	}

	ret = link_to_fixup_dir(trans, root, path, location.objectid);
	if (ret)
		goto out;

	ret = unlink_inode_for_log_replay(trans, dir, BTRFS_I(inode), &name);
out:
	kfree(name.name);
	iput(inode);
	return ret;
}

/*
 * See if a given name and sequence number found in an inode back reference are
 * already in a directory and correctly point to this inode.
 *
 * Returns: < 0 on error, 0 if the directory entry does not exist and 1 if it
 * exists.
976 */ 977 static noinline int inode_in_dir(struct btrfs_root *root, 978 struct btrfs_path *path, 979 u64 dirid, u64 objectid, u64 index, 980 struct fscrypt_str *name) 981 { 982 struct btrfs_dir_item *di; 983 struct btrfs_key location; 984 int ret = 0; 985 986 di = btrfs_lookup_dir_index_item(NULL, root, path, dirid, 987 index, name, 0); 988 if (IS_ERR(di)) { 989 ret = PTR_ERR(di); 990 goto out; 991 } else if (di) { 992 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location); 993 if (location.objectid != objectid) 994 goto out; 995 } else { 996 goto out; 997 } 998 999 btrfs_release_path(path); 1000 di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, 0); 1001 if (IS_ERR(di)) { 1002 ret = PTR_ERR(di); 1003 goto out; 1004 } else if (di) { 1005 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location); 1006 if (location.objectid == objectid) 1007 ret = 1; 1008 } 1009 out: 1010 btrfs_release_path(path); 1011 return ret; 1012 } 1013 1014 /* 1015 * helper function to check a log tree for a named back reference in 1016 * an inode. This is used to decide if a back reference that is 1017 * found in the subvolume conflicts with what we find in the log. 1018 * 1019 * inode backreferences may have multiple refs in a single item, 1020 * during replay we process one reference at a time, and we don't 1021 * want to delete valid links to a file from the subvolume if that 1022 * link is also in the log. 1023 */ 1024 static noinline int backref_in_log(struct btrfs_root *log, 1025 struct btrfs_key *key, 1026 u64 ref_objectid, 1027 const struct fscrypt_str *name) 1028 { 1029 struct btrfs_path *path; 1030 int ret; 1031 1032 path = btrfs_alloc_path(); 1033 if (!path) 1034 return -ENOMEM; 1035 1036 ret = btrfs_search_slot(NULL, log, key, path, 0, 0); 1037 if (ret < 0) { 1038 goto out; 1039 } else if (ret == 1) { 1040 ret = 0; 1041 goto out; 1042 } 1043 1044 if (key->type == BTRFS_INODE_EXTREF_KEY) 1045 ret = !!btrfs_find_name_in_ext_backref(path->nodes[0], 1046 path->slots[0], 1047 ref_objectid, name); 1048 else 1049 ret = !!btrfs_find_name_in_backref(path->nodes[0], 1050 path->slots[0], name); 1051 out: 1052 btrfs_free_path(path); 1053 return ret; 1054 } 1055 1056 static inline int __add_inode_ref(struct btrfs_trans_handle *trans, 1057 struct btrfs_root *root, 1058 struct btrfs_path *path, 1059 struct btrfs_root *log_root, 1060 struct btrfs_inode *dir, 1061 struct btrfs_inode *inode, 1062 u64 inode_objectid, u64 parent_objectid, 1063 u64 ref_index, struct fscrypt_str *name) 1064 { 1065 int ret; 1066 struct extent_buffer *leaf; 1067 struct btrfs_dir_item *di; 1068 struct btrfs_key search_key; 1069 struct btrfs_inode_extref *extref; 1070 1071 again: 1072 /* Search old style refs */ 1073 search_key.objectid = inode_objectid; 1074 search_key.type = BTRFS_INODE_REF_KEY; 1075 search_key.offset = parent_objectid; 1076 ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); 1077 if (ret == 0) { 1078 struct btrfs_inode_ref *victim_ref; 1079 unsigned long ptr; 1080 unsigned long ptr_end; 1081 1082 leaf = path->nodes[0]; 1083 1084 /* are we trying to overwrite a back ref for the root directory 1085 * if so, just jump out, we're done 1086 */ 1087 if (search_key.objectid == search_key.offset) 1088 return 1; 1089 1090 /* check all the names in this back reference to see 1091 * if they are in the log. 
if so, we allow them to stay 1092 * otherwise they must be unlinked as a conflict 1093 */ 1094 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); 1095 ptr_end = ptr + btrfs_item_size(leaf, path->slots[0]); 1096 while (ptr < ptr_end) { 1097 struct fscrypt_str victim_name; 1098 1099 victim_ref = (struct btrfs_inode_ref *)ptr; 1100 ret = read_alloc_one_name(leaf, (victim_ref + 1), 1101 btrfs_inode_ref_name_len(leaf, victim_ref), 1102 &victim_name); 1103 if (ret) 1104 return ret; 1105 1106 ret = backref_in_log(log_root, &search_key, 1107 parent_objectid, &victim_name); 1108 if (ret < 0) { 1109 kfree(victim_name.name); 1110 return ret; 1111 } else if (!ret) { 1112 inc_nlink(&inode->vfs_inode); 1113 btrfs_release_path(path); 1114 1115 ret = unlink_inode_for_log_replay(trans, dir, inode, 1116 &victim_name); 1117 kfree(victim_name.name); 1118 if (ret) 1119 return ret; 1120 goto again; 1121 } 1122 kfree(victim_name.name); 1123 1124 ptr = (unsigned long)(victim_ref + 1) + victim_name.len; 1125 } 1126 } 1127 btrfs_release_path(path); 1128 1129 /* Same search but for extended refs */ 1130 extref = btrfs_lookup_inode_extref(NULL, root, path, name, 1131 inode_objectid, parent_objectid, 0, 1132 0); 1133 if (IS_ERR(extref)) { 1134 return PTR_ERR(extref); 1135 } else if (extref) { 1136 u32 item_size; 1137 u32 cur_offset = 0; 1138 unsigned long base; 1139 struct inode *victim_parent; 1140 1141 leaf = path->nodes[0]; 1142 1143 item_size = btrfs_item_size(leaf, path->slots[0]); 1144 base = btrfs_item_ptr_offset(leaf, path->slots[0]); 1145 1146 while (cur_offset < item_size) { 1147 struct fscrypt_str victim_name; 1148 1149 extref = (struct btrfs_inode_extref *)(base + cur_offset); 1150 1151 if (btrfs_inode_extref_parent(leaf, extref) != parent_objectid) 1152 goto next; 1153 1154 ret = read_alloc_one_name(leaf, &extref->name, 1155 btrfs_inode_extref_name_len(leaf, extref), 1156 &victim_name); 1157 if (ret) 1158 return ret; 1159 1160 search_key.objectid = inode_objectid; 1161 search_key.type = BTRFS_INODE_EXTREF_KEY; 1162 search_key.offset = btrfs_extref_hash(parent_objectid, 1163 victim_name.name, 1164 victim_name.len); 1165 ret = backref_in_log(log_root, &search_key, 1166 parent_objectid, &victim_name); 1167 if (ret < 0) { 1168 kfree(victim_name.name); 1169 return ret; 1170 } else if (!ret) { 1171 ret = -ENOENT; 1172 victim_parent = read_one_inode(root, 1173 parent_objectid); 1174 if (victim_parent) { 1175 inc_nlink(&inode->vfs_inode); 1176 btrfs_release_path(path); 1177 1178 ret = unlink_inode_for_log_replay(trans, 1179 BTRFS_I(victim_parent), 1180 inode, &victim_name); 1181 } 1182 iput(victim_parent); 1183 kfree(victim_name.name); 1184 if (ret) 1185 return ret; 1186 goto again; 1187 } 1188 kfree(victim_name.name); 1189 next: 1190 cur_offset += victim_name.len + sizeof(*extref); 1191 } 1192 } 1193 btrfs_release_path(path); 1194 1195 /* look for a conflicting sequence number */ 1196 di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir), 1197 ref_index, name, 0); 1198 if (IS_ERR(di)) { 1199 return PTR_ERR(di); 1200 } else if (di) { 1201 ret = drop_one_dir_item(trans, path, dir, di); 1202 if (ret) 1203 return ret; 1204 } 1205 btrfs_release_path(path); 1206 1207 /* look for a conflicting name */ 1208 di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir), name, 0); 1209 if (IS_ERR(di)) { 1210 return PTR_ERR(di); 1211 } else if (di) { 1212 ret = drop_one_dir_item(trans, path, dir, di); 1213 if (ret) 1214 return ret; 1215 } 1216 btrfs_release_path(path); 1217 1218 return 0; 1219 } 1220 1221 
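/*
 * Parse one extended inode ref (struct btrfs_inode_extref) located at @ref_ptr
 * inside the given leaf: copy its name into @name and, when requested, return
 * the directory index number and the parent directory objectid it refers to.
 * The plain ref variant below (ref_get_fields()) does the same for a
 * struct btrfs_inode_ref, whose parent directory is encoded in the item's key
 * offset instead.
 */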
static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr, 1222 struct fscrypt_str *name, u64 *index, 1223 u64 *parent_objectid) 1224 { 1225 struct btrfs_inode_extref *extref; 1226 int ret; 1227 1228 extref = (struct btrfs_inode_extref *)ref_ptr; 1229 1230 ret = read_alloc_one_name(eb, &extref->name, 1231 btrfs_inode_extref_name_len(eb, extref), name); 1232 if (ret) 1233 return ret; 1234 1235 if (index) 1236 *index = btrfs_inode_extref_index(eb, extref); 1237 if (parent_objectid) 1238 *parent_objectid = btrfs_inode_extref_parent(eb, extref); 1239 1240 return 0; 1241 } 1242 1243 static int ref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr, 1244 struct fscrypt_str *name, u64 *index) 1245 { 1246 struct btrfs_inode_ref *ref; 1247 int ret; 1248 1249 ref = (struct btrfs_inode_ref *)ref_ptr; 1250 1251 ret = read_alloc_one_name(eb, ref + 1, btrfs_inode_ref_name_len(eb, ref), 1252 name); 1253 if (ret) 1254 return ret; 1255 1256 if (index) 1257 *index = btrfs_inode_ref_index(eb, ref); 1258 1259 return 0; 1260 } 1261 1262 /* 1263 * Take an inode reference item from the log tree and iterate all names from the 1264 * inode reference item in the subvolume tree with the same key (if it exists). 1265 * For any name that is not in the inode reference item from the log tree, do a 1266 * proper unlink of that name (that is, remove its entry from the inode 1267 * reference item and both dir index keys). 1268 */ 1269 static int unlink_old_inode_refs(struct btrfs_trans_handle *trans, 1270 struct btrfs_root *root, 1271 struct btrfs_path *path, 1272 struct btrfs_inode *inode, 1273 struct extent_buffer *log_eb, 1274 int log_slot, 1275 struct btrfs_key *key) 1276 { 1277 int ret; 1278 unsigned long ref_ptr; 1279 unsigned long ref_end; 1280 struct extent_buffer *eb; 1281 1282 again: 1283 btrfs_release_path(path); 1284 ret = btrfs_search_slot(NULL, root, key, path, 0, 0); 1285 if (ret > 0) { 1286 ret = 0; 1287 goto out; 1288 } 1289 if (ret < 0) 1290 goto out; 1291 1292 eb = path->nodes[0]; 1293 ref_ptr = btrfs_item_ptr_offset(eb, path->slots[0]); 1294 ref_end = ref_ptr + btrfs_item_size(eb, path->slots[0]); 1295 while (ref_ptr < ref_end) { 1296 struct fscrypt_str name; 1297 u64 parent_id; 1298 1299 if (key->type == BTRFS_INODE_EXTREF_KEY) { 1300 ret = extref_get_fields(eb, ref_ptr, &name, 1301 NULL, &parent_id); 1302 } else { 1303 parent_id = key->offset; 1304 ret = ref_get_fields(eb, ref_ptr, &name, NULL); 1305 } 1306 if (ret) 1307 goto out; 1308 1309 if (key->type == BTRFS_INODE_EXTREF_KEY) 1310 ret = !!btrfs_find_name_in_ext_backref(log_eb, log_slot, 1311 parent_id, &name); 1312 else 1313 ret = !!btrfs_find_name_in_backref(log_eb, log_slot, &name); 1314 1315 if (!ret) { 1316 struct inode *dir; 1317 1318 btrfs_release_path(path); 1319 dir = read_one_inode(root, parent_id); 1320 if (!dir) { 1321 ret = -ENOENT; 1322 kfree(name.name); 1323 goto out; 1324 } 1325 ret = unlink_inode_for_log_replay(trans, BTRFS_I(dir), 1326 inode, &name); 1327 kfree(name.name); 1328 iput(dir); 1329 if (ret) 1330 goto out; 1331 goto again; 1332 } 1333 1334 kfree(name.name); 1335 ref_ptr += name.len; 1336 if (key->type == BTRFS_INODE_EXTREF_KEY) 1337 ref_ptr += sizeof(struct btrfs_inode_extref); 1338 else 1339 ref_ptr += sizeof(struct btrfs_inode_ref); 1340 } 1341 ret = 0; 1342 out: 1343 btrfs_release_path(path); 1344 return ret; 1345 } 1346 1347 /* 1348 * replay one inode back reference item found in the log tree. 1349 * eb, slot and key refer to the buffer and key found in the log tree. 
1350 * root is the destination we are replaying into, and path is for temp 1351 * use by this function. (it should be released on return). 1352 */ 1353 static noinline int add_inode_ref(struct btrfs_trans_handle *trans, 1354 struct btrfs_root *root, 1355 struct btrfs_root *log, 1356 struct btrfs_path *path, 1357 struct extent_buffer *eb, int slot, 1358 struct btrfs_key *key) 1359 { 1360 struct inode *dir = NULL; 1361 struct inode *inode = NULL; 1362 unsigned long ref_ptr; 1363 unsigned long ref_end; 1364 struct fscrypt_str name; 1365 int ret; 1366 int log_ref_ver = 0; 1367 u64 parent_objectid; 1368 u64 inode_objectid; 1369 u64 ref_index = 0; 1370 int ref_struct_size; 1371 1372 ref_ptr = btrfs_item_ptr_offset(eb, slot); 1373 ref_end = ref_ptr + btrfs_item_size(eb, slot); 1374 1375 if (key->type == BTRFS_INODE_EXTREF_KEY) { 1376 struct btrfs_inode_extref *r; 1377 1378 ref_struct_size = sizeof(struct btrfs_inode_extref); 1379 log_ref_ver = 1; 1380 r = (struct btrfs_inode_extref *)ref_ptr; 1381 parent_objectid = btrfs_inode_extref_parent(eb, r); 1382 } else { 1383 ref_struct_size = sizeof(struct btrfs_inode_ref); 1384 parent_objectid = key->offset; 1385 } 1386 inode_objectid = key->objectid; 1387 1388 /* 1389 * it is possible that we didn't log all the parent directories 1390 * for a given inode. If we don't find the dir, just don't 1391 * copy the back ref in. The link count fixup code will take 1392 * care of the rest 1393 */ 1394 dir = read_one_inode(root, parent_objectid); 1395 if (!dir) { 1396 ret = -ENOENT; 1397 goto out; 1398 } 1399 1400 inode = read_one_inode(root, inode_objectid); 1401 if (!inode) { 1402 ret = -EIO; 1403 goto out; 1404 } 1405 1406 while (ref_ptr < ref_end) { 1407 if (log_ref_ver) { 1408 ret = extref_get_fields(eb, ref_ptr, &name, 1409 &ref_index, &parent_objectid); 1410 /* 1411 * parent object can change from one array 1412 * item to another. 1413 */ 1414 if (!dir) 1415 dir = read_one_inode(root, parent_objectid); 1416 if (!dir) { 1417 ret = -ENOENT; 1418 goto out; 1419 } 1420 } else { 1421 ret = ref_get_fields(eb, ref_ptr, &name, &ref_index); 1422 } 1423 if (ret) 1424 goto out; 1425 1426 ret = inode_in_dir(root, path, btrfs_ino(BTRFS_I(dir)), 1427 btrfs_ino(BTRFS_I(inode)), ref_index, &name); 1428 if (ret < 0) { 1429 goto out; 1430 } else if (ret == 0) { 1431 /* 1432 * look for a conflicting back reference in the 1433 * metadata. if we find one we have to unlink that name 1434 * of the file before we add our new link. Later on, we 1435 * overwrite any existing back reference, and we don't 1436 * want to create dangling pointers in the directory. 1437 */ 1438 ret = __add_inode_ref(trans, root, path, log, 1439 BTRFS_I(dir), BTRFS_I(inode), 1440 inode_objectid, parent_objectid, 1441 ref_index, &name); 1442 if (ret) { 1443 if (ret == 1) 1444 ret = 0; 1445 goto out; 1446 } 1447 1448 /* insert our name */ 1449 ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), 1450 &name, 0, ref_index); 1451 if (ret) 1452 goto out; 1453 1454 ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); 1455 if (ret) 1456 goto out; 1457 } 1458 /* Else, ret == 1, we already have a perfect match, we're done. 
*/ 1459 1460 ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + name.len; 1461 kfree(name.name); 1462 name.name = NULL; 1463 if (log_ref_ver) { 1464 iput(dir); 1465 dir = NULL; 1466 } 1467 } 1468 1469 /* 1470 * Before we overwrite the inode reference item in the subvolume tree 1471 * with the item from the log tree, we must unlink all names from the 1472 * parent directory that are in the subvolume's tree inode reference 1473 * item, otherwise we end up with an inconsistent subvolume tree where 1474 * dir index entries exist for a name but there is no inode reference 1475 * item with the same name. 1476 */ 1477 ret = unlink_old_inode_refs(trans, root, path, BTRFS_I(inode), eb, slot, 1478 key); 1479 if (ret) 1480 goto out; 1481 1482 /* finally write the back reference in the inode */ 1483 ret = overwrite_item(trans, root, path, eb, slot, key); 1484 out: 1485 btrfs_release_path(path); 1486 kfree(name.name); 1487 iput(dir); 1488 iput(inode); 1489 return ret; 1490 } 1491 1492 static int count_inode_extrefs(struct btrfs_root *root, 1493 struct btrfs_inode *inode, struct btrfs_path *path) 1494 { 1495 int ret = 0; 1496 int name_len; 1497 unsigned int nlink = 0; 1498 u32 item_size; 1499 u32 cur_offset = 0; 1500 u64 inode_objectid = btrfs_ino(inode); 1501 u64 offset = 0; 1502 unsigned long ptr; 1503 struct btrfs_inode_extref *extref; 1504 struct extent_buffer *leaf; 1505 1506 while (1) { 1507 ret = btrfs_find_one_extref(root, inode_objectid, offset, path, 1508 &extref, &offset); 1509 if (ret) 1510 break; 1511 1512 leaf = path->nodes[0]; 1513 item_size = btrfs_item_size(leaf, path->slots[0]); 1514 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); 1515 cur_offset = 0; 1516 1517 while (cur_offset < item_size) { 1518 extref = (struct btrfs_inode_extref *) (ptr + cur_offset); 1519 name_len = btrfs_inode_extref_name_len(leaf, extref); 1520 1521 nlink++; 1522 1523 cur_offset += name_len + sizeof(*extref); 1524 } 1525 1526 offset++; 1527 btrfs_release_path(path); 1528 } 1529 btrfs_release_path(path); 1530 1531 if (ret < 0 && ret != -ENOENT) 1532 return ret; 1533 return nlink; 1534 } 1535 1536 static int count_inode_refs(struct btrfs_root *root, 1537 struct btrfs_inode *inode, struct btrfs_path *path) 1538 { 1539 int ret; 1540 struct btrfs_key key; 1541 unsigned int nlink = 0; 1542 unsigned long ptr; 1543 unsigned long ptr_end; 1544 int name_len; 1545 u64 ino = btrfs_ino(inode); 1546 1547 key.objectid = ino; 1548 key.type = BTRFS_INODE_REF_KEY; 1549 key.offset = (u64)-1; 1550 1551 while (1) { 1552 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 1553 if (ret < 0) 1554 break; 1555 if (ret > 0) { 1556 if (path->slots[0] == 0) 1557 break; 1558 path->slots[0]--; 1559 } 1560 process_slot: 1561 btrfs_item_key_to_cpu(path->nodes[0], &key, 1562 path->slots[0]); 1563 if (key.objectid != ino || 1564 key.type != BTRFS_INODE_REF_KEY) 1565 break; 1566 ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); 1567 ptr_end = ptr + btrfs_item_size(path->nodes[0], 1568 path->slots[0]); 1569 while (ptr < ptr_end) { 1570 struct btrfs_inode_ref *ref; 1571 1572 ref = (struct btrfs_inode_ref *)ptr; 1573 name_len = btrfs_inode_ref_name_len(path->nodes[0], 1574 ref); 1575 ptr = (unsigned long)(ref + 1) + name_len; 1576 nlink++; 1577 } 1578 1579 if (key.offset == 0) 1580 break; 1581 if (path->slots[0] > 0) { 1582 path->slots[0]--; 1583 goto process_slot; 1584 } 1585 key.offset--; 1586 btrfs_release_path(path); 1587 } 1588 btrfs_release_path(path); 1589 1590 return nlink; 1591 } 1592 1593 /* 1594 * There are a few 
corners where the link count of the file can't
 * be properly maintained during replay. So, instead of adding
 * lots of complexity to the log code, we just scan the backrefs
 * for any file that has been through replay.
 *
 * The scan will update the link count on the inode to reflect the
 * number of back refs found. If it goes down to zero, the iput
 * will free the inode.
 */
static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
					   struct btrfs_root *root,
					   struct inode *inode)
{
	struct btrfs_path *path;
	int ret;
	u64 nlink = 0;
	u64 ino = btrfs_ino(BTRFS_I(inode));

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = count_inode_refs(root, BTRFS_I(inode), path);
	if (ret < 0)
		goto out;

	nlink = ret;

	ret = count_inode_extrefs(root, BTRFS_I(inode), path);
	if (ret < 0)
		goto out;

	nlink += ret;

	ret = 0;

	if (nlink != inode->i_nlink) {
		set_nlink(inode, nlink);
		ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
		if (ret)
			goto out;
	}
	BTRFS_I(inode)->index_cnt = (u64)-1;

	if (inode->i_nlink == 0) {
		if (S_ISDIR(inode->i_mode)) {
			ret = replay_dir_deletes(trans, root, NULL, path,
						 ino, 1);
			if (ret)
				goto out;
		}
		ret = btrfs_insert_orphan_item(trans, root, ino);
		if (ret == -EEXIST)
			ret = 0;
	}

out:
	btrfs_free_path(path);
	return ret;
}

static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
					    struct btrfs_root *root,
					    struct btrfs_path *path)
{
	int ret;
	struct btrfs_key key;
	struct inode *inode;

	key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
	key.type = BTRFS_ORPHAN_ITEM_KEY;
	key.offset = (u64)-1;
	while (1) {
		ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
		if (ret < 0)
			break;

		if (ret == 1) {
			ret = 0;
			if (path->slots[0] == 0)
				break;
			path->slots[0]--;
		}

		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
		if (key.objectid != BTRFS_TREE_LOG_FIXUP_OBJECTID ||
		    key.type != BTRFS_ORPHAN_ITEM_KEY)
			break;

		ret = btrfs_del_item(trans, root, path);
		if (ret)
			break;

		btrfs_release_path(path);
		inode = read_one_inode(root, key.offset);
		if (!inode) {
			ret = -EIO;
			break;
		}

		ret = fixup_inode_link_count(trans, root, inode);
		iput(inode);
		if (ret)
			break;

		/*
		 * fixup on a directory may create new entries,
		 * make sure we always look for the highest possible
		 * offset
		 */
		key.offset = (u64)-1;
	}
	btrfs_release_path(path);
	return ret;
}


/*
 * record a given inode in the fixup dir so we can check its link
 * count when replay is done.
The link count is incremented here 1714 * so the inode won't go away until we check it 1715 */ 1716 static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans, 1717 struct btrfs_root *root, 1718 struct btrfs_path *path, 1719 u64 objectid) 1720 { 1721 struct btrfs_key key; 1722 int ret = 0; 1723 struct inode *inode; 1724 1725 inode = read_one_inode(root, objectid); 1726 if (!inode) 1727 return -EIO; 1728 1729 key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID; 1730 key.type = BTRFS_ORPHAN_ITEM_KEY; 1731 key.offset = objectid; 1732 1733 ret = btrfs_insert_empty_item(trans, root, path, &key, 0); 1734 1735 btrfs_release_path(path); 1736 if (ret == 0) { 1737 if (!inode->i_nlink) 1738 set_nlink(inode, 1); 1739 else 1740 inc_nlink(inode); 1741 ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); 1742 } else if (ret == -EEXIST) { 1743 ret = 0; 1744 } 1745 iput(inode); 1746 1747 return ret; 1748 } 1749 1750 /* 1751 * when replaying the log for a directory, we only insert names 1752 * for inodes that actually exist. This means an fsync on a directory 1753 * does not implicitly fsync all the new files in it 1754 */ 1755 static noinline int insert_one_name(struct btrfs_trans_handle *trans, 1756 struct btrfs_root *root, 1757 u64 dirid, u64 index, 1758 const struct fscrypt_str *name, 1759 struct btrfs_key *location) 1760 { 1761 struct inode *inode; 1762 struct inode *dir; 1763 int ret; 1764 1765 inode = read_one_inode(root, location->objectid); 1766 if (!inode) 1767 return -ENOENT; 1768 1769 dir = read_one_inode(root, dirid); 1770 if (!dir) { 1771 iput(inode); 1772 return -EIO; 1773 } 1774 1775 ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), name, 1776 1, index); 1777 1778 /* FIXME, put inode into FIXUP list */ 1779 1780 iput(inode); 1781 iput(dir); 1782 return ret; 1783 } 1784 1785 static int delete_conflicting_dir_entry(struct btrfs_trans_handle *trans, 1786 struct btrfs_inode *dir, 1787 struct btrfs_path *path, 1788 struct btrfs_dir_item *dst_di, 1789 const struct btrfs_key *log_key, 1790 u8 log_flags, 1791 bool exists) 1792 { 1793 struct btrfs_key found_key; 1794 1795 btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key); 1796 /* The existing dentry points to the same inode, don't delete it. */ 1797 if (found_key.objectid == log_key->objectid && 1798 found_key.type == log_key->type && 1799 found_key.offset == log_key->offset && 1800 btrfs_dir_flags(path->nodes[0], dst_di) == log_flags) 1801 return 1; 1802 1803 /* 1804 * Don't drop the conflicting directory entry if the inode for the new 1805 * entry doesn't exist. 1806 */ 1807 if (!exists) 1808 return 0; 1809 1810 return drop_one_dir_item(trans, path, dir, dst_di); 1811 } 1812 1813 /* 1814 * take a single entry in a log directory item and replay it into 1815 * the subvolume. 1816 * 1817 * if a conflicting item exists in the subdirectory already, 1818 * the inode it points to is unlinked and put into the link count 1819 * fix up tree. 1820 * 1821 * If a name from the log points to a file or directory that does 1822 * not exist in the FS, it is skipped. fsyncs on directories 1823 * do not force down inodes inside that directory, just changes to the 1824 * names or unlinks in a directory. 1825 * 1826 * Returns < 0 on error, 0 if the name wasn't replayed (dentry points to a 1827 * non-existing inode) and 1 if the name was replayed. 
1828 */ 1829 static noinline int replay_one_name(struct btrfs_trans_handle *trans, 1830 struct btrfs_root *root, 1831 struct btrfs_path *path, 1832 struct extent_buffer *eb, 1833 struct btrfs_dir_item *di, 1834 struct btrfs_key *key) 1835 { 1836 struct fscrypt_str name; 1837 struct btrfs_dir_item *dir_dst_di; 1838 struct btrfs_dir_item *index_dst_di; 1839 bool dir_dst_matches = false; 1840 bool index_dst_matches = false; 1841 struct btrfs_key log_key; 1842 struct btrfs_key search_key; 1843 struct inode *dir; 1844 u8 log_flags; 1845 bool exists; 1846 int ret; 1847 bool update_size = true; 1848 bool name_added = false; 1849 1850 dir = read_one_inode(root, key->objectid); 1851 if (!dir) 1852 return -EIO; 1853 1854 ret = read_alloc_one_name(eb, di + 1, btrfs_dir_name_len(eb, di), &name); 1855 if (ret) 1856 goto out; 1857 1858 log_flags = btrfs_dir_flags(eb, di); 1859 btrfs_dir_item_key_to_cpu(eb, di, &log_key); 1860 ret = btrfs_lookup_inode(trans, root, path, &log_key, 0); 1861 btrfs_release_path(path); 1862 if (ret < 0) 1863 goto out; 1864 exists = (ret == 0); 1865 ret = 0; 1866 1867 dir_dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid, 1868 &name, 1); 1869 if (IS_ERR(dir_dst_di)) { 1870 ret = PTR_ERR(dir_dst_di); 1871 goto out; 1872 } else if (dir_dst_di) { 1873 ret = delete_conflicting_dir_entry(trans, BTRFS_I(dir), path, 1874 dir_dst_di, &log_key, 1875 log_flags, exists); 1876 if (ret < 0) 1877 goto out; 1878 dir_dst_matches = (ret == 1); 1879 } 1880 1881 btrfs_release_path(path); 1882 1883 index_dst_di = btrfs_lookup_dir_index_item(trans, root, path, 1884 key->objectid, key->offset, 1885 &name, 1); 1886 if (IS_ERR(index_dst_di)) { 1887 ret = PTR_ERR(index_dst_di); 1888 goto out; 1889 } else if (index_dst_di) { 1890 ret = delete_conflicting_dir_entry(trans, BTRFS_I(dir), path, 1891 index_dst_di, &log_key, 1892 log_flags, exists); 1893 if (ret < 0) 1894 goto out; 1895 index_dst_matches = (ret == 1); 1896 } 1897 1898 btrfs_release_path(path); 1899 1900 if (dir_dst_matches && index_dst_matches) { 1901 ret = 0; 1902 update_size = false; 1903 goto out; 1904 } 1905 1906 /* 1907 * Check if the inode reference exists in the log for the given name, 1908 * inode and parent inode 1909 */ 1910 search_key.objectid = log_key.objectid; 1911 search_key.type = BTRFS_INODE_REF_KEY; 1912 search_key.offset = key->objectid; 1913 ret = backref_in_log(root->log_root, &search_key, 0, &name); 1914 if (ret < 0) { 1915 goto out; 1916 } else if (ret) { 1917 /* The dentry will be added later. */ 1918 ret = 0; 1919 update_size = false; 1920 goto out; 1921 } 1922 1923 search_key.objectid = log_key.objectid; 1924 search_key.type = BTRFS_INODE_EXTREF_KEY; 1925 search_key.offset = key->objectid; 1926 ret = backref_in_log(root->log_root, &search_key, key->objectid, &name); 1927 if (ret < 0) { 1928 goto out; 1929 } else if (ret) { 1930 /* The dentry will be added later. 
 */
		ret = 0;
		update_size = false;
		goto out;
	}
	btrfs_release_path(path);
	ret = insert_one_name(trans, root, key->objectid, key->offset,
			      &name, &log_key);
	if (ret && ret != -ENOENT && ret != -EEXIST)
		goto out;
	if (!ret)
		name_added = true;
	update_size = false;
	ret = 0;

out:
	if (!ret && update_size) {
		btrfs_i_size_write(BTRFS_I(dir), dir->i_size + name.len * 2);
		ret = btrfs_update_inode(trans, root, BTRFS_I(dir));
	}
	kfree(name.name);
	iput(dir);
	if (!ret && name_added)
		ret = 1;
	return ret;
}

/* Replay one dir item from a BTRFS_DIR_INDEX_KEY key. */
static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
					struct btrfs_root *root,
					struct btrfs_path *path,
					struct extent_buffer *eb, int slot,
					struct btrfs_key *key)
{
	int ret;
	struct btrfs_dir_item *di;

	/* We only log dir index keys, which only contain a single dir item. */
	ASSERT(key->type == BTRFS_DIR_INDEX_KEY);

	di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item);
	ret = replay_one_name(trans, root, path, eb, di, key);
	if (ret < 0)
		return ret;

	/*
	 * If this entry refers to a non-directory (directories can not have a
	 * link count > 1) and it was added in the transaction that was not
	 * committed, make sure we fixup the link count of the inode the entry
	 * points to. Otherwise something like the following would result in a
	 * directory pointing to an inode with a wrong link count that does not
	 * account for this dir entry:
	 *
	 * mkdir testdir
	 * touch testdir/foo
	 * touch testdir/bar
	 * sync
	 *
	 * ln testdir/bar testdir/bar_link
	 * ln testdir/foo testdir/foo_link
	 * xfs_io -c "fsync" testdir/bar
	 *
	 * <power failure>
	 *
	 * mount fs, log replay happens
	 *
	 * File foo would remain with a link count of 1 when it has two entries
	 * pointing to it in the directory testdir. This would make it impossible
	 * to ever delete the parent directory as it would result in stale
	 * dentries that can never be deleted.
	 */
	if (ret == 1 && btrfs_dir_ftype(eb, di) != BTRFS_FT_DIR) {
		struct btrfs_path *fixup_path;
		struct btrfs_key di_key;

		fixup_path = btrfs_alloc_path();
		if (!fixup_path)
			return -ENOMEM;

		btrfs_dir_item_key_to_cpu(eb, di, &di_key);
		ret = link_to_fixup_dir(trans, root, fixup_path, di_key.objectid);
		btrfs_free_path(fixup_path);
	}

	return ret;
}

/*
 * directory replay has two parts. There are the standard directory
 * items in the log copied from the subvolume, and range items
 * created in the log while the subvolume was logged.
 *
 * The range items tell us which parts of the key space the log
 * is authoritative for. During replay, if a key in the subvolume
 * directory is in a logged range item, but not actually in the log
 * that means it was deleted from the directory before the fsync
 * and should be removed.
2027 */ 2028 static noinline int find_dir_range(struct btrfs_root *root, 2029 struct btrfs_path *path, 2030 u64 dirid, 2031 u64 *start_ret, u64 *end_ret) 2032 { 2033 struct btrfs_key key; 2034 u64 found_end; 2035 struct btrfs_dir_log_item *item; 2036 int ret; 2037 int nritems; 2038 2039 if (*start_ret == (u64)-1) 2040 return 1; 2041 2042 key.objectid = dirid; 2043 key.type = BTRFS_DIR_LOG_INDEX_KEY; 2044 key.offset = *start_ret; 2045 2046 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 2047 if (ret < 0) 2048 goto out; 2049 if (ret > 0) { 2050 if (path->slots[0] == 0) 2051 goto out; 2052 path->slots[0]--; 2053 } 2054 if (ret != 0) 2055 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 2056 2057 if (key.type != BTRFS_DIR_LOG_INDEX_KEY || key.objectid != dirid) { 2058 ret = 1; 2059 goto next; 2060 } 2061 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 2062 struct btrfs_dir_log_item); 2063 found_end = btrfs_dir_log_end(path->nodes[0], item); 2064 2065 if (*start_ret >= key.offset && *start_ret <= found_end) { 2066 ret = 0; 2067 *start_ret = key.offset; 2068 *end_ret = found_end; 2069 goto out; 2070 } 2071 ret = 1; 2072 next: 2073 /* check the next slot in the tree to see if it is a valid item */ 2074 nritems = btrfs_header_nritems(path->nodes[0]); 2075 path->slots[0]++; 2076 if (path->slots[0] >= nritems) { 2077 ret = btrfs_next_leaf(root, path); 2078 if (ret) 2079 goto out; 2080 } 2081 2082 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 2083 2084 if (key.type != BTRFS_DIR_LOG_INDEX_KEY || key.objectid != dirid) { 2085 ret = 1; 2086 goto out; 2087 } 2088 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 2089 struct btrfs_dir_log_item); 2090 found_end = btrfs_dir_log_end(path->nodes[0], item); 2091 *start_ret = key.offset; 2092 *end_ret = found_end; 2093 ret = 0; 2094 out: 2095 btrfs_release_path(path); 2096 return ret; 2097 } 2098 2099 /* 2100 * this looks for a given directory item in the log. If the directory 2101 * item is not in the log, the item is removed and the inode it points 2102 * to is unlinked 2103 */ 2104 static noinline int check_item_in_log(struct btrfs_trans_handle *trans, 2105 struct btrfs_root *log, 2106 struct btrfs_path *path, 2107 struct btrfs_path *log_path, 2108 struct inode *dir, 2109 struct btrfs_key *dir_key) 2110 { 2111 struct btrfs_root *root = BTRFS_I(dir)->root; 2112 int ret; 2113 struct extent_buffer *eb; 2114 int slot; 2115 struct btrfs_dir_item *di; 2116 struct fscrypt_str name; 2117 struct inode *inode = NULL; 2118 struct btrfs_key location; 2119 2120 /* 2121 * Currently we only log dir index keys. Even if we replay a log created 2122 * by an older kernel that logged both dir index and dir item keys, all 2123 * we need to do is process the dir index keys, we (and our caller) can 2124 * safely ignore dir item keys (key type BTRFS_DIR_ITEM_KEY). 2125 */ 2126 ASSERT(dir_key->type == BTRFS_DIR_INDEX_KEY); 2127 2128 eb = path->nodes[0]; 2129 slot = path->slots[0]; 2130 di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item); 2131 ret = read_alloc_one_name(eb, di + 1, btrfs_dir_name_len(eb, di), &name); 2132 if (ret) 2133 goto out; 2134 2135 if (log) { 2136 struct btrfs_dir_item *log_di; 2137 2138 log_di = btrfs_lookup_dir_index_item(trans, log, log_path, 2139 dir_key->objectid, 2140 dir_key->offset, &name, 0); 2141 if (IS_ERR(log_di)) { 2142 ret = PTR_ERR(log_di); 2143 goto out; 2144 } else if (log_di) { 2145 /* The dentry exists in the log, we have nothing to do. 
*/ 2146 ret = 0; 2147 goto out; 2148 } 2149 } 2150 2151 btrfs_dir_item_key_to_cpu(eb, di, &location); 2152 btrfs_release_path(path); 2153 btrfs_release_path(log_path); 2154 inode = read_one_inode(root, location.objectid); 2155 if (!inode) { 2156 ret = -EIO; 2157 goto out; 2158 } 2159 2160 ret = link_to_fixup_dir(trans, root, path, location.objectid); 2161 if (ret) 2162 goto out; 2163 2164 inc_nlink(inode); 2165 ret = unlink_inode_for_log_replay(trans, BTRFS_I(dir), BTRFS_I(inode), 2166 &name); 2167 /* 2168 * Unlike dir item keys, dir index keys can only have one name (entry) in 2169 * them, as there are no key collisions since each key has a unique offset 2170 * (an index number), so we're done. 2171 */ 2172 out: 2173 btrfs_release_path(path); 2174 btrfs_release_path(log_path); 2175 kfree(name.name); 2176 iput(inode); 2177 return ret; 2178 } 2179 2180 static int replay_xattr_deletes(struct btrfs_trans_handle *trans, 2181 struct btrfs_root *root, 2182 struct btrfs_root *log, 2183 struct btrfs_path *path, 2184 const u64 ino) 2185 { 2186 struct btrfs_key search_key; 2187 struct btrfs_path *log_path; 2188 int i; 2189 int nritems; 2190 int ret; 2191 2192 log_path = btrfs_alloc_path(); 2193 if (!log_path) 2194 return -ENOMEM; 2195 2196 search_key.objectid = ino; 2197 search_key.type = BTRFS_XATTR_ITEM_KEY; 2198 search_key.offset = 0; 2199 again: 2200 ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); 2201 if (ret < 0) 2202 goto out; 2203 process_leaf: 2204 nritems = btrfs_header_nritems(path->nodes[0]); 2205 for (i = path->slots[0]; i < nritems; i++) { 2206 struct btrfs_key key; 2207 struct btrfs_dir_item *di; 2208 struct btrfs_dir_item *log_di; 2209 u32 total_size; 2210 u32 cur; 2211 2212 btrfs_item_key_to_cpu(path->nodes[0], &key, i); 2213 if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY) { 2214 ret = 0; 2215 goto out; 2216 } 2217 2218 di = btrfs_item_ptr(path->nodes[0], i, struct btrfs_dir_item); 2219 total_size = btrfs_item_size(path->nodes[0], i); 2220 cur = 0; 2221 while (cur < total_size) { 2222 u16 name_len = btrfs_dir_name_len(path->nodes[0], di); 2223 u16 data_len = btrfs_dir_data_len(path->nodes[0], di); 2224 u32 this_len = sizeof(*di) + name_len + data_len; 2225 char *name; 2226 2227 name = kmalloc(name_len, GFP_NOFS); 2228 if (!name) { 2229 ret = -ENOMEM; 2230 goto out; 2231 } 2232 read_extent_buffer(path->nodes[0], name, 2233 (unsigned long)(di + 1), name_len); 2234 2235 log_di = btrfs_lookup_xattr(NULL, log, log_path, ino, 2236 name, name_len, 0); 2237 btrfs_release_path(log_path); 2238 if (!log_di) { 2239 /* Doesn't exist in log tree, so delete it. 
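 * The xattr is present in the subvolume but missing from the log, which
 * means it was removed before the last fsync of this inode, so replay the
 * deletion here.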
*/ 2240 btrfs_release_path(path); 2241 di = btrfs_lookup_xattr(trans, root, path, ino, 2242 name, name_len, -1); 2243 kfree(name); 2244 if (IS_ERR(di)) { 2245 ret = PTR_ERR(di); 2246 goto out; 2247 } 2248 ASSERT(di); 2249 ret = btrfs_delete_one_dir_name(trans, root, 2250 path, di); 2251 if (ret) 2252 goto out; 2253 btrfs_release_path(path); 2254 search_key = key; 2255 goto again; 2256 } 2257 kfree(name); 2258 if (IS_ERR(log_di)) { 2259 ret = PTR_ERR(log_di); 2260 goto out; 2261 } 2262 cur += this_len; 2263 di = (struct btrfs_dir_item *)((char *)di + this_len); 2264 } 2265 } 2266 ret = btrfs_next_leaf(root, path); 2267 if (ret > 0) 2268 ret = 0; 2269 else if (ret == 0) 2270 goto process_leaf; 2271 out: 2272 btrfs_free_path(log_path); 2273 btrfs_release_path(path); 2274 return ret; 2275 } 2276 2277 2278 /* 2279 * deletion replay happens before we copy any new directory items 2280 * out of the log or out of backreferences from inodes. It 2281 * scans the log to find ranges of keys that log is authoritative for, 2282 * and then scans the directory to find items in those ranges that are 2283 * not present in the log. 2284 * 2285 * Anything we don't find in the log is unlinked and removed from the 2286 * directory. 2287 */ 2288 static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, 2289 struct btrfs_root *root, 2290 struct btrfs_root *log, 2291 struct btrfs_path *path, 2292 u64 dirid, int del_all) 2293 { 2294 u64 range_start; 2295 u64 range_end; 2296 int ret = 0; 2297 struct btrfs_key dir_key; 2298 struct btrfs_key found_key; 2299 struct btrfs_path *log_path; 2300 struct inode *dir; 2301 2302 dir_key.objectid = dirid; 2303 dir_key.type = BTRFS_DIR_INDEX_KEY; 2304 log_path = btrfs_alloc_path(); 2305 if (!log_path) 2306 return -ENOMEM; 2307 2308 dir = read_one_inode(root, dirid); 2309 /* it isn't an error if the inode isn't there, that can happen 2310 * because we replay the deletes before we copy in the inode item 2311 * from the log 2312 */ 2313 if (!dir) { 2314 btrfs_free_path(log_path); 2315 return 0; 2316 } 2317 2318 range_start = 0; 2319 range_end = 0; 2320 while (1) { 2321 if (del_all) 2322 range_end = (u64)-1; 2323 else { 2324 ret = find_dir_range(log, path, dirid, 2325 &range_start, &range_end); 2326 if (ret < 0) 2327 goto out; 2328 else if (ret > 0) 2329 break; 2330 } 2331 2332 dir_key.offset = range_start; 2333 while (1) { 2334 int nritems; 2335 ret = btrfs_search_slot(NULL, root, &dir_key, path, 2336 0, 0); 2337 if (ret < 0) 2338 goto out; 2339 2340 nritems = btrfs_header_nritems(path->nodes[0]); 2341 if (path->slots[0] >= nritems) { 2342 ret = btrfs_next_leaf(root, path); 2343 if (ret == 1) 2344 break; 2345 else if (ret < 0) 2346 goto out; 2347 } 2348 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 2349 path->slots[0]); 2350 if (found_key.objectid != dirid || 2351 found_key.type != dir_key.type) { 2352 ret = 0; 2353 goto out; 2354 } 2355 2356 if (found_key.offset > range_end) 2357 break; 2358 2359 ret = check_item_in_log(trans, log, path, 2360 log_path, dir, 2361 &found_key); 2362 if (ret) 2363 goto out; 2364 if (found_key.offset == (u64)-1) 2365 break; 2366 dir_key.offset = found_key.offset + 1; 2367 } 2368 btrfs_release_path(path); 2369 if (range_end == (u64)-1) 2370 break; 2371 range_start = range_end + 1; 2372 } 2373 ret = 0; 2374 out: 2375 btrfs_release_path(path); 2376 btrfs_free_path(log_path); 2377 iput(dir); 2378 return ret; 2379 } 2380 2381 /* 2382 * the process_func used to replay items from the log tree. This 2383 * gets called in two different stages. 
The first stage just looks 2384 * for inodes and makes sure they are all copied into the subvolume. 2385 * 2386 * The second stage copies all the other item types from the log into 2387 * the subvolume. The two stage approach is slower, but gets rid of 2388 * lots of complexity around inodes referencing other inodes that exist 2389 * only in the log (references come from either directory items or inode 2390 * back refs). 2391 */ 2392 static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, 2393 struct walk_control *wc, u64 gen, int level) 2394 { 2395 int nritems; 2396 struct btrfs_tree_parent_check check = { 2397 .transid = gen, 2398 .level = level 2399 }; 2400 struct btrfs_path *path; 2401 struct btrfs_root *root = wc->replay_dest; 2402 struct btrfs_key key; 2403 int i; 2404 int ret; 2405 2406 ret = btrfs_read_extent_buffer(eb, &check); 2407 if (ret) 2408 return ret; 2409 2410 level = btrfs_header_level(eb); 2411 2412 if (level != 0) 2413 return 0; 2414 2415 path = btrfs_alloc_path(); 2416 if (!path) 2417 return -ENOMEM; 2418 2419 nritems = btrfs_header_nritems(eb); 2420 for (i = 0; i < nritems; i++) { 2421 btrfs_item_key_to_cpu(eb, &key, i); 2422 2423 /* inode keys are done during the first stage */ 2424 if (key.type == BTRFS_INODE_ITEM_KEY && 2425 wc->stage == LOG_WALK_REPLAY_INODES) { 2426 struct btrfs_inode_item *inode_item; 2427 u32 mode; 2428 2429 inode_item = btrfs_item_ptr(eb, i, 2430 struct btrfs_inode_item); 2431 /* 2432 * If we have a tmpfile (O_TMPFILE) that got fsync'ed 2433 * and never got linked before the fsync, skip it, as 2434 * replaying it is pointless since it would be deleted 2435 * later. We skip logging tmpfiles, but it's always 2436 * possible we are replaying a log created with a kernel 2437 * that used to log tmpfiles. 2438 */ 2439 if (btrfs_inode_nlink(eb, inode_item) == 0) { 2440 wc->ignore_cur_inode = true; 2441 continue; 2442 } else { 2443 wc->ignore_cur_inode = false; 2444 } 2445 ret = replay_xattr_deletes(wc->trans, root, log, 2446 path, key.objectid); 2447 if (ret) 2448 break; 2449 mode = btrfs_inode_mode(eb, inode_item); 2450 if (S_ISDIR(mode)) { 2451 ret = replay_dir_deletes(wc->trans, 2452 root, log, path, key.objectid, 0); 2453 if (ret) 2454 break; 2455 } 2456 ret = overwrite_item(wc->trans, root, path, 2457 eb, i, &key); 2458 if (ret) 2459 break; 2460 2461 /* 2462 * Before replaying extents, truncate the inode to its 2463 * size. We need to do it now and not after log replay 2464 * because before an fsync we can have prealloc extents 2465 * added beyond the inode's i_size. If we did it after, 2466 * through orphan cleanup for example, we would drop 2467 * those prealloc extents just after replaying them. 2468 */ 2469 if (S_ISREG(mode)) { 2470 struct btrfs_drop_extents_args drop_args = { 0 }; 2471 struct inode *inode; 2472 u64 from; 2473 2474 inode = read_one_inode(root, key.objectid); 2475 if (!inode) { 2476 ret = -EIO; 2477 break; 2478 } 2479 from = ALIGN(i_size_read(inode), 2480 root->fs_info->sectorsize); 2481 drop_args.start = from; 2482 drop_args.end = (u64)-1; 2483 drop_args.drop_cache = true; 2484 ret = btrfs_drop_extents(wc->trans, root, 2485 BTRFS_I(inode), 2486 &drop_args); 2487 if (!ret) { 2488 inode_sub_bytes(inode, 2489 drop_args.bytes_found); 2490 /* Update the inode's nbytes. 
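 * inode_sub_bytes() above already subtracted the bytes of the dropped
 * extents, so persist the adjusted value in the subvolume tree.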
*/ 2491 ret = btrfs_update_inode(wc->trans, 2492 root, BTRFS_I(inode)); 2493 } 2494 iput(inode); 2495 if (ret) 2496 break; 2497 } 2498 2499 ret = link_to_fixup_dir(wc->trans, root, 2500 path, key.objectid); 2501 if (ret) 2502 break; 2503 } 2504 2505 if (wc->ignore_cur_inode) 2506 continue; 2507 2508 if (key.type == BTRFS_DIR_INDEX_KEY && 2509 wc->stage == LOG_WALK_REPLAY_DIR_INDEX) { 2510 ret = replay_one_dir_item(wc->trans, root, path, 2511 eb, i, &key); 2512 if (ret) 2513 break; 2514 } 2515 2516 if (wc->stage < LOG_WALK_REPLAY_ALL) 2517 continue; 2518 2519 /* these keys are simply copied */ 2520 if (key.type == BTRFS_XATTR_ITEM_KEY) { 2521 ret = overwrite_item(wc->trans, root, path, 2522 eb, i, &key); 2523 if (ret) 2524 break; 2525 } else if (key.type == BTRFS_INODE_REF_KEY || 2526 key.type == BTRFS_INODE_EXTREF_KEY) { 2527 ret = add_inode_ref(wc->trans, root, log, path, 2528 eb, i, &key); 2529 if (ret && ret != -ENOENT) 2530 break; 2531 ret = 0; 2532 } else if (key.type == BTRFS_EXTENT_DATA_KEY) { 2533 ret = replay_one_extent(wc->trans, root, path, 2534 eb, i, &key); 2535 if (ret) 2536 break; 2537 } 2538 /* 2539 * We don't log BTRFS_DIR_ITEM_KEY keys anymore, only the 2540 * BTRFS_DIR_INDEX_KEY items which we use to derive the 2541 * BTRFS_DIR_ITEM_KEY items. If we are replaying a log from an 2542 * older kernel with such keys, ignore them. 2543 */ 2544 } 2545 btrfs_free_path(path); 2546 return ret; 2547 } 2548 2549 /* 2550 * Correctly adjust the reserved bytes occupied by a log tree extent buffer 2551 */ 2552 static void unaccount_log_buffer(struct btrfs_fs_info *fs_info, u64 start) 2553 { 2554 struct btrfs_block_group *cache; 2555 2556 cache = btrfs_lookup_block_group(fs_info, start); 2557 if (!cache) { 2558 btrfs_err(fs_info, "unable to find block group for %llu", start); 2559 return; 2560 } 2561 2562 spin_lock(&cache->space_info->lock); 2563 spin_lock(&cache->lock); 2564 cache->reserved -= fs_info->nodesize; 2565 cache->space_info->bytes_reserved -= fs_info->nodesize; 2566 spin_unlock(&cache->lock); 2567 spin_unlock(&cache->space_info->lock); 2568 2569 btrfs_put_block_group(cache); 2570 } 2571 2572 static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, 2573 struct btrfs_root *root, 2574 struct btrfs_path *path, int *level, 2575 struct walk_control *wc) 2576 { 2577 struct btrfs_fs_info *fs_info = root->fs_info; 2578 u64 bytenr; 2579 u64 ptr_gen; 2580 struct extent_buffer *next; 2581 struct extent_buffer *cur; 2582 u32 blocksize; 2583 int ret = 0; 2584 2585 while (*level > 0) { 2586 struct btrfs_tree_parent_check check = { 0 }; 2587 2588 cur = path->nodes[*level]; 2589 2590 WARN_ON(btrfs_header_level(cur) != *level); 2591 2592 if (path->slots[*level] >= 2593 btrfs_header_nritems(cur)) 2594 break; 2595 2596 bytenr = btrfs_node_blockptr(cur, path->slots[*level]); 2597 ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]); 2598 check.transid = ptr_gen; 2599 check.level = *level - 1; 2600 check.has_first_key = true; 2601 btrfs_node_key_to_cpu(cur, &check.first_key, path->slots[*level]); 2602 blocksize = fs_info->nodesize; 2603 2604 next = btrfs_find_create_tree_block(fs_info, bytenr, 2605 btrfs_header_owner(cur), 2606 *level - 1); 2607 if (IS_ERR(next)) 2608 return PTR_ERR(next); 2609 2610 if (*level == 1) { 2611 ret = wc->process_func(root, next, wc, ptr_gen, 2612 *level - 1); 2613 if (ret) { 2614 free_extent_buffer(next); 2615 return ret; 2616 } 2617 2618 path->slots[*level]++; 2619 if (wc->free) { 2620 ret = btrfs_read_extent_buffer(next, &check); 2621 if 
(ret) { 2622 free_extent_buffer(next); 2623 return ret; 2624 } 2625 2626 if (trans) { 2627 btrfs_tree_lock(next); 2628 btrfs_clean_tree_block(next); 2629 btrfs_wait_tree_block_writeback(next); 2630 btrfs_tree_unlock(next); 2631 ret = btrfs_pin_reserved_extent(trans, 2632 bytenr, blocksize); 2633 if (ret) { 2634 free_extent_buffer(next); 2635 return ret; 2636 } 2637 btrfs_redirty_list_add( 2638 trans->transaction, next); 2639 } else { 2640 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags)) 2641 clear_extent_buffer_dirty(next); 2642 unaccount_log_buffer(fs_info, bytenr); 2643 } 2644 } 2645 free_extent_buffer(next); 2646 continue; 2647 } 2648 ret = btrfs_read_extent_buffer(next, &check); 2649 if (ret) { 2650 free_extent_buffer(next); 2651 return ret; 2652 } 2653 2654 if (path->nodes[*level-1]) 2655 free_extent_buffer(path->nodes[*level-1]); 2656 path->nodes[*level-1] = next; 2657 *level = btrfs_header_level(next); 2658 path->slots[*level] = 0; 2659 cond_resched(); 2660 } 2661 path->slots[*level] = btrfs_header_nritems(path->nodes[*level]); 2662 2663 cond_resched(); 2664 return 0; 2665 } 2666 2667 static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, 2668 struct btrfs_root *root, 2669 struct btrfs_path *path, int *level, 2670 struct walk_control *wc) 2671 { 2672 struct btrfs_fs_info *fs_info = root->fs_info; 2673 int i; 2674 int slot; 2675 int ret; 2676 2677 for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) { 2678 slot = path->slots[i]; 2679 if (slot + 1 < btrfs_header_nritems(path->nodes[i])) { 2680 path->slots[i]++; 2681 *level = i; 2682 WARN_ON(*level == 0); 2683 return 0; 2684 } else { 2685 ret = wc->process_func(root, path->nodes[*level], wc, 2686 btrfs_header_generation(path->nodes[*level]), 2687 *level); 2688 if (ret) 2689 return ret; 2690 2691 if (wc->free) { 2692 struct extent_buffer *next; 2693 2694 next = path->nodes[*level]; 2695 2696 if (trans) { 2697 btrfs_tree_lock(next); 2698 btrfs_clean_tree_block(next); 2699 btrfs_wait_tree_block_writeback(next); 2700 btrfs_tree_unlock(next); 2701 ret = btrfs_pin_reserved_extent(trans, 2702 path->nodes[*level]->start, 2703 path->nodes[*level]->len); 2704 if (ret) 2705 return ret; 2706 btrfs_redirty_list_add(trans->transaction, 2707 next); 2708 } else { 2709 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags)) 2710 clear_extent_buffer_dirty(next); 2711 2712 unaccount_log_buffer(fs_info, 2713 path->nodes[*level]->start); 2714 } 2715 } 2716 free_extent_buffer(path->nodes[*level]); 2717 path->nodes[*level] = NULL; 2718 *level = i + 1; 2719 } 2720 } 2721 return 1; 2722 } 2723 2724 /* 2725 * drop the reference count on the tree rooted at 'snap'. This traverses 2726 * the tree freeing any blocks that have a ref count of zero after being 2727 * decremented. 
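 * For the log tree this means calling wc->process_func on every buffer
 * and, when wc->free is set, either pinning the buffer's extent (when a
 * transaction handle is given) or unaccounting its reserved space.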
2728 */ 2729 static int walk_log_tree(struct btrfs_trans_handle *trans, 2730 struct btrfs_root *log, struct walk_control *wc) 2731 { 2732 struct btrfs_fs_info *fs_info = log->fs_info; 2733 int ret = 0; 2734 int wret; 2735 int level; 2736 struct btrfs_path *path; 2737 int orig_level; 2738 2739 path = btrfs_alloc_path(); 2740 if (!path) 2741 return -ENOMEM; 2742 2743 level = btrfs_header_level(log->node); 2744 orig_level = level; 2745 path->nodes[level] = log->node; 2746 atomic_inc(&log->node->refs); 2747 path->slots[level] = 0; 2748 2749 while (1) { 2750 wret = walk_down_log_tree(trans, log, path, &level, wc); 2751 if (wret > 0) 2752 break; 2753 if (wret < 0) { 2754 ret = wret; 2755 goto out; 2756 } 2757 2758 wret = walk_up_log_tree(trans, log, path, &level, wc); 2759 if (wret > 0) 2760 break; 2761 if (wret < 0) { 2762 ret = wret; 2763 goto out; 2764 } 2765 } 2766 2767 /* was the root node processed? if not, catch it here */ 2768 if (path->nodes[orig_level]) { 2769 ret = wc->process_func(log, path->nodes[orig_level], wc, 2770 btrfs_header_generation(path->nodes[orig_level]), 2771 orig_level); 2772 if (ret) 2773 goto out; 2774 if (wc->free) { 2775 struct extent_buffer *next; 2776 2777 next = path->nodes[orig_level]; 2778 2779 if (trans) { 2780 btrfs_tree_lock(next); 2781 btrfs_clean_tree_block(next); 2782 btrfs_wait_tree_block_writeback(next); 2783 btrfs_tree_unlock(next); 2784 ret = btrfs_pin_reserved_extent(trans, 2785 next->start, next->len); 2786 if (ret) 2787 goto out; 2788 btrfs_redirty_list_add(trans->transaction, next); 2789 } else { 2790 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags)) 2791 clear_extent_buffer_dirty(next); 2792 unaccount_log_buffer(fs_info, next->start); 2793 } 2794 } 2795 } 2796 2797 out: 2798 btrfs_free_path(path); 2799 return ret; 2800 } 2801 2802 /* 2803 * helper function to update the item for a given subvolumes log root 2804 * in the tree of log roots 2805 */ 2806 static int update_log_root(struct btrfs_trans_handle *trans, 2807 struct btrfs_root *log, 2808 struct btrfs_root_item *root_item) 2809 { 2810 struct btrfs_fs_info *fs_info = log->fs_info; 2811 int ret; 2812 2813 if (log->log_transid == 1) { 2814 /* insert root item on the first sync */ 2815 ret = btrfs_insert_root(trans, fs_info->log_root_tree, 2816 &log->root_key, root_item); 2817 } else { 2818 ret = btrfs_update_root(trans, fs_info->log_root_tree, 2819 &log->root_key, root_item); 2820 } 2821 return ret; 2822 } 2823 2824 static void wait_log_commit(struct btrfs_root *root, int transid) 2825 { 2826 DEFINE_WAIT(wait); 2827 int index = transid % 2; 2828 2829 /* 2830 * we only allow two pending log transactions at a time, 2831 * so we know that if ours is more than 2 older than the 2832 * current transaction, we're done 2833 */ 2834 for (;;) { 2835 prepare_to_wait(&root->log_commit_wait[index], 2836 &wait, TASK_UNINTERRUPTIBLE); 2837 2838 if (!(root->log_transid_committed < transid && 2839 atomic_read(&root->log_commit[index]))) 2840 break; 2841 2842 mutex_unlock(&root->log_mutex); 2843 schedule(); 2844 mutex_lock(&root->log_mutex); 2845 } 2846 finish_wait(&root->log_commit_wait[index], &wait); 2847 } 2848 2849 static void wait_for_writer(struct btrfs_root *root) 2850 { 2851 DEFINE_WAIT(wait); 2852 2853 for (;;) { 2854 prepare_to_wait(&root->log_writer_wait, &wait, 2855 TASK_UNINTERRUPTIBLE); 2856 if (!atomic_read(&root->log_writers)) 2857 break; 2858 2859 mutex_unlock(&root->log_mutex); 2860 schedule(); 2861 mutex_lock(&root->log_mutex); 2862 } 2863 finish_wait(&root->log_writer_wait, 
&wait); 2864 } 2865 2866 static inline void btrfs_remove_log_ctx(struct btrfs_root *root, 2867 struct btrfs_log_ctx *ctx) 2868 { 2869 mutex_lock(&root->log_mutex); 2870 list_del_init(&ctx->list); 2871 mutex_unlock(&root->log_mutex); 2872 } 2873 2874 /* 2875 * Invoked in log mutex context, or be sure there is no other task which 2876 * can access the list. 2877 */ 2878 static inline void btrfs_remove_all_log_ctxs(struct btrfs_root *root, 2879 int index, int error) 2880 { 2881 struct btrfs_log_ctx *ctx; 2882 struct btrfs_log_ctx *safe; 2883 2884 list_for_each_entry_safe(ctx, safe, &root->log_ctxs[index], list) { 2885 list_del_init(&ctx->list); 2886 ctx->log_ret = error; 2887 } 2888 } 2889 2890 /* 2891 * btrfs_sync_log does sends a given tree log down to the disk and 2892 * updates the super blocks to record it. When this call is done, 2893 * you know that any inodes previously logged are safely on disk only 2894 * if it returns 0. 2895 * 2896 * Any other return value means you need to call btrfs_commit_transaction. 2897 * Some of the edge cases for fsyncing directories that have had unlinks 2898 * or renames done in the past mean that sometimes the only safe 2899 * fsync is to commit the whole FS. When btrfs_sync_log returns -EAGAIN, 2900 * that has happened. 2901 */ 2902 int btrfs_sync_log(struct btrfs_trans_handle *trans, 2903 struct btrfs_root *root, struct btrfs_log_ctx *ctx) 2904 { 2905 int index1; 2906 int index2; 2907 int mark; 2908 int ret; 2909 struct btrfs_fs_info *fs_info = root->fs_info; 2910 struct btrfs_root *log = root->log_root; 2911 struct btrfs_root *log_root_tree = fs_info->log_root_tree; 2912 struct btrfs_root_item new_root_item; 2913 int log_transid = 0; 2914 struct btrfs_log_ctx root_log_ctx; 2915 struct blk_plug plug; 2916 u64 log_root_start; 2917 u64 log_root_level; 2918 2919 mutex_lock(&root->log_mutex); 2920 log_transid = ctx->log_transid; 2921 if (root->log_transid_committed >= log_transid) { 2922 mutex_unlock(&root->log_mutex); 2923 return ctx->log_ret; 2924 } 2925 2926 index1 = log_transid % 2; 2927 if (atomic_read(&root->log_commit[index1])) { 2928 wait_log_commit(root, log_transid); 2929 mutex_unlock(&root->log_mutex); 2930 return ctx->log_ret; 2931 } 2932 ASSERT(log_transid == root->log_transid); 2933 atomic_set(&root->log_commit[index1], 1); 2934 2935 /* wait for previous tree log sync to complete */ 2936 if (atomic_read(&root->log_commit[(index1 + 1) % 2])) 2937 wait_log_commit(root, log_transid - 1); 2938 2939 while (1) { 2940 int batch = atomic_read(&root->log_batch); 2941 /* when we're on an ssd, just kick the log commit out */ 2942 if (!btrfs_test_opt(fs_info, SSD) && 2943 test_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state)) { 2944 mutex_unlock(&root->log_mutex); 2945 schedule_timeout_uninterruptible(1); 2946 mutex_lock(&root->log_mutex); 2947 } 2948 wait_for_writer(root); 2949 if (batch == atomic_read(&root->log_batch)) 2950 break; 2951 } 2952 2953 /* bail out if we need to do a full commit */ 2954 if (btrfs_need_log_full_commit(trans)) { 2955 ret = BTRFS_LOG_FORCE_COMMIT; 2956 mutex_unlock(&root->log_mutex); 2957 goto out; 2958 } 2959 2960 if (log_transid % 2 == 0) 2961 mark = EXTENT_DIRTY; 2962 else 2963 mark = EXTENT_NEW; 2964 2965 /* we start IO on all the marked extents here, but we don't actually 2966 * wait for them until later. 
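 * The even/odd log_transid selected EXTENT_DIRTY or EXTENT_NEW above, so
 * the two log transactions that may be in flight can mark and wait on
 * their extents independently.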
2967 */ 2968 blk_start_plug(&plug); 2969 ret = btrfs_write_marked_extents(fs_info, &log->dirty_log_pages, mark); 2970 /* 2971 * -EAGAIN happens when someone, e.g., a concurrent transaction 2972 * commit, writes a dirty extent in this tree-log commit. This 2973 * concurrent write will create a hole writing out the extents, 2974 * and we cannot proceed on a zoned filesystem, requiring 2975 * sequential writing. While we can bail out to a full commit 2976 * here, but we can continue hoping the concurrent writing fills 2977 * the hole. 2978 */ 2979 if (ret == -EAGAIN && btrfs_is_zoned(fs_info)) 2980 ret = 0; 2981 if (ret) { 2982 blk_finish_plug(&plug); 2983 btrfs_set_log_full_commit(trans); 2984 mutex_unlock(&root->log_mutex); 2985 goto out; 2986 } 2987 2988 /* 2989 * We _must_ update under the root->log_mutex in order to make sure we 2990 * have a consistent view of the log root we are trying to commit at 2991 * this moment. 2992 * 2993 * We _must_ copy this into a local copy, because we are not holding the 2994 * log_root_tree->log_mutex yet. This is important because when we 2995 * commit the log_root_tree we must have a consistent view of the 2996 * log_root_tree when we update the super block to point at the 2997 * log_root_tree bytenr. If we update the log_root_tree here we'll race 2998 * with the commit and possibly point at the new block which we may not 2999 * have written out. 3000 */ 3001 btrfs_set_root_node(&log->root_item, log->node); 3002 memcpy(&new_root_item, &log->root_item, sizeof(new_root_item)); 3003 3004 root->log_transid++; 3005 log->log_transid = root->log_transid; 3006 root->log_start_pid = 0; 3007 /* 3008 * IO has been started, blocks of the log tree have WRITTEN flag set 3009 * in their headers. new modifications of the log will be written to 3010 * new positions. so it's safe to allow log writers to go in. 3011 */ 3012 mutex_unlock(&root->log_mutex); 3013 3014 if (btrfs_is_zoned(fs_info)) { 3015 mutex_lock(&fs_info->tree_root->log_mutex); 3016 if (!log_root_tree->node) { 3017 ret = btrfs_alloc_log_tree_node(trans, log_root_tree); 3018 if (ret) { 3019 mutex_unlock(&fs_info->tree_root->log_mutex); 3020 blk_finish_plug(&plug); 3021 goto out; 3022 } 3023 } 3024 mutex_unlock(&fs_info->tree_root->log_mutex); 3025 } 3026 3027 btrfs_init_log_ctx(&root_log_ctx, NULL); 3028 3029 mutex_lock(&log_root_tree->log_mutex); 3030 3031 index2 = log_root_tree->log_transid % 2; 3032 list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]); 3033 root_log_ctx.log_transid = log_root_tree->log_transid; 3034 3035 /* 3036 * Now we are safe to update the log_root_tree because we're under the 3037 * log_mutex, and we're a current writer so we're holding the commit 3038 * open until we drop the log_mutex. 
3039 */ 3040 ret = update_log_root(trans, log, &new_root_item); 3041 if (ret) { 3042 if (!list_empty(&root_log_ctx.list)) 3043 list_del_init(&root_log_ctx.list); 3044 3045 blk_finish_plug(&plug); 3046 btrfs_set_log_full_commit(trans); 3047 if (ret != -ENOSPC) 3048 btrfs_err(fs_info, 3049 "failed to update log for root %llu ret %d", 3050 root->root_key.objectid, ret); 3051 btrfs_wait_tree_log_extents(log, mark); 3052 mutex_unlock(&log_root_tree->log_mutex); 3053 goto out; 3054 } 3055 3056 if (log_root_tree->log_transid_committed >= root_log_ctx.log_transid) { 3057 blk_finish_plug(&plug); 3058 list_del_init(&root_log_ctx.list); 3059 mutex_unlock(&log_root_tree->log_mutex); 3060 ret = root_log_ctx.log_ret; 3061 goto out; 3062 } 3063 3064 index2 = root_log_ctx.log_transid % 2; 3065 if (atomic_read(&log_root_tree->log_commit[index2])) { 3066 blk_finish_plug(&plug); 3067 ret = btrfs_wait_tree_log_extents(log, mark); 3068 wait_log_commit(log_root_tree, 3069 root_log_ctx.log_transid); 3070 mutex_unlock(&log_root_tree->log_mutex); 3071 if (!ret) 3072 ret = root_log_ctx.log_ret; 3073 goto out; 3074 } 3075 ASSERT(root_log_ctx.log_transid == log_root_tree->log_transid); 3076 atomic_set(&log_root_tree->log_commit[index2], 1); 3077 3078 if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) { 3079 wait_log_commit(log_root_tree, 3080 root_log_ctx.log_transid - 1); 3081 } 3082 3083 /* 3084 * now that we've moved on to the tree of log tree roots, 3085 * check the full commit flag again 3086 */ 3087 if (btrfs_need_log_full_commit(trans)) { 3088 blk_finish_plug(&plug); 3089 btrfs_wait_tree_log_extents(log, mark); 3090 mutex_unlock(&log_root_tree->log_mutex); 3091 ret = BTRFS_LOG_FORCE_COMMIT; 3092 goto out_wake_log_root; 3093 } 3094 3095 ret = btrfs_write_marked_extents(fs_info, 3096 &log_root_tree->dirty_log_pages, 3097 EXTENT_DIRTY | EXTENT_NEW); 3098 blk_finish_plug(&plug); 3099 /* 3100 * As described above, -EAGAIN indicates a hole in the extents. We 3101 * cannot wait for these write outs since the waiting cause a 3102 * deadlock. Bail out to the full commit instead. 
3103 */ 3104 if (ret == -EAGAIN && btrfs_is_zoned(fs_info)) { 3105 btrfs_set_log_full_commit(trans); 3106 btrfs_wait_tree_log_extents(log, mark); 3107 mutex_unlock(&log_root_tree->log_mutex); 3108 goto out_wake_log_root; 3109 } else if (ret) { 3110 btrfs_set_log_full_commit(trans); 3111 mutex_unlock(&log_root_tree->log_mutex); 3112 goto out_wake_log_root; 3113 } 3114 ret = btrfs_wait_tree_log_extents(log, mark); 3115 if (!ret) 3116 ret = btrfs_wait_tree_log_extents(log_root_tree, 3117 EXTENT_NEW | EXTENT_DIRTY); 3118 if (ret) { 3119 btrfs_set_log_full_commit(trans); 3120 mutex_unlock(&log_root_tree->log_mutex); 3121 goto out_wake_log_root; 3122 } 3123 3124 log_root_start = log_root_tree->node->start; 3125 log_root_level = btrfs_header_level(log_root_tree->node); 3126 log_root_tree->log_transid++; 3127 mutex_unlock(&log_root_tree->log_mutex); 3128 3129 /* 3130 * Here we are guaranteed that nobody is going to write the superblock 3131 * for the current transaction before us and that neither we do write 3132 * our superblock before the previous transaction finishes its commit 3133 * and writes its superblock, because: 3134 * 3135 * 1) We are holding a handle on the current transaction, so no body 3136 * can commit it until we release the handle; 3137 * 3138 * 2) Before writing our superblock we acquire the tree_log_mutex, so 3139 * if the previous transaction is still committing, and hasn't yet 3140 * written its superblock, we wait for it to do it, because a 3141 * transaction commit acquires the tree_log_mutex when the commit 3142 * begins and releases it only after writing its superblock. 3143 */ 3144 mutex_lock(&fs_info->tree_log_mutex); 3145 3146 /* 3147 * The previous transaction writeout phase could have failed, and thus 3148 * marked the fs in an error state. We must not commit here, as we 3149 * could have updated our generation in the super_for_commit and 3150 * writing the super here would result in transid mismatches. If there 3151 * is an error here just bail. 3152 */ 3153 if (BTRFS_FS_ERROR(fs_info)) { 3154 ret = -EIO; 3155 btrfs_set_log_full_commit(trans); 3156 btrfs_abort_transaction(trans, ret); 3157 mutex_unlock(&fs_info->tree_log_mutex); 3158 goto out_wake_log_root; 3159 } 3160 3161 btrfs_set_super_log_root(fs_info->super_for_commit, log_root_start); 3162 btrfs_set_super_log_root_level(fs_info->super_for_commit, log_root_level); 3163 ret = write_all_supers(fs_info, 1); 3164 mutex_unlock(&fs_info->tree_log_mutex); 3165 if (ret) { 3166 btrfs_set_log_full_commit(trans); 3167 btrfs_abort_transaction(trans, ret); 3168 goto out_wake_log_root; 3169 } 3170 3171 /* 3172 * We know there can only be one task here, since we have not yet set 3173 * root->log_commit[index1] to 0 and any task attempting to sync the 3174 * log must wait for the previous log transaction to commit if it's 3175 * still in progress or wait for the current log transaction commit if 3176 * someone else already started it. We use <= and not < because the 3177 * first log transaction has an ID of 0. 
3178 */ 3179 ASSERT(root->last_log_commit <= log_transid); 3180 root->last_log_commit = log_transid; 3181 3182 out_wake_log_root: 3183 mutex_lock(&log_root_tree->log_mutex); 3184 btrfs_remove_all_log_ctxs(log_root_tree, index2, ret); 3185 3186 log_root_tree->log_transid_committed++; 3187 atomic_set(&log_root_tree->log_commit[index2], 0); 3188 mutex_unlock(&log_root_tree->log_mutex); 3189 3190 /* 3191 * The barrier before waitqueue_active (in cond_wake_up) is needed so 3192 * all the updates above are seen by the woken threads. It might not be 3193 * necessary, but proving that seems to be hard. 3194 */ 3195 cond_wake_up(&log_root_tree->log_commit_wait[index2]); 3196 out: 3197 mutex_lock(&root->log_mutex); 3198 btrfs_remove_all_log_ctxs(root, index1, ret); 3199 root->log_transid_committed++; 3200 atomic_set(&root->log_commit[index1], 0); 3201 mutex_unlock(&root->log_mutex); 3202 3203 /* 3204 * The barrier before waitqueue_active (in cond_wake_up) is needed so 3205 * all the updates above are seen by the woken threads. It might not be 3206 * necessary, but proving that seems to be hard. 3207 */ 3208 cond_wake_up(&root->log_commit_wait[index1]); 3209 return ret; 3210 } 3211 3212 static void free_log_tree(struct btrfs_trans_handle *trans, 3213 struct btrfs_root *log) 3214 { 3215 int ret; 3216 struct walk_control wc = { 3217 .free = 1, 3218 .process_func = process_one_buffer 3219 }; 3220 3221 if (log->node) { 3222 ret = walk_log_tree(trans, log, &wc); 3223 if (ret) { 3224 /* 3225 * We weren't able to traverse the entire log tree, the 3226 * typical scenario is getting an -EIO when reading an 3227 * extent buffer of the tree, due to a previous writeback 3228 * failure of it. 3229 */ 3230 set_bit(BTRFS_FS_STATE_LOG_CLEANUP_ERROR, 3231 &log->fs_info->fs_state); 3232 3233 /* 3234 * Some extent buffers of the log tree may still be dirty 3235 * and not yet written back to storage, because we may 3236 * have updates to a log tree without syncing a log tree, 3237 * such as during rename and link operations. So flush 3238 * them out and wait for their writeback to complete, so 3239 * that we properly cleanup their state and pages. 3240 */ 3241 btrfs_write_marked_extents(log->fs_info, 3242 &log->dirty_log_pages, 3243 EXTENT_DIRTY | EXTENT_NEW); 3244 btrfs_wait_tree_log_extents(log, 3245 EXTENT_DIRTY | EXTENT_NEW); 3246 3247 if (trans) 3248 btrfs_abort_transaction(trans, ret); 3249 else 3250 btrfs_handle_fs_error(log->fs_info, ret, NULL); 3251 } 3252 } 3253 3254 clear_extent_bits(&log->dirty_log_pages, 0, (u64)-1, 3255 EXTENT_DIRTY | EXTENT_NEW | EXTENT_NEED_WAIT); 3256 extent_io_tree_release(&log->log_csum_range); 3257 3258 btrfs_put_root(log); 3259 } 3260 3261 /* 3262 * free all the extents used by the tree log. This should be called 3263 * at commit time of the full transaction 3264 */ 3265 int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root) 3266 { 3267 if (root->log_root) { 3268 free_log_tree(trans, root->log_root); 3269 root->log_root = NULL; 3270 clear_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state); 3271 } 3272 return 0; 3273 } 3274 3275 int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans, 3276 struct btrfs_fs_info *fs_info) 3277 { 3278 if (fs_info->log_root_tree) { 3279 free_log_tree(trans, fs_info->log_root_tree); 3280 fs_info->log_root_tree = NULL; 3281 clear_bit(BTRFS_ROOT_HAS_LOG_TREE, &fs_info->tree_root->state); 3282 } 3283 return 0; 3284 } 3285 3286 /* 3287 * Check if an inode was logged in the current transaction. 
This correctly deals 3288 * with the case where the inode was logged but has a logged_trans of 0, which 3289 * happens if the inode is evicted and loaded again, as logged_trans is an in 3290 * memory only field (not persisted). 3291 * 3292 * Returns 1 if the inode was logged before in the transaction, 0 if it was not, 3293 * and < 0 on error. 3294 */ 3295 static int inode_logged(struct btrfs_trans_handle *trans, 3296 struct btrfs_inode *inode, 3297 struct btrfs_path *path_in) 3298 { 3299 struct btrfs_path *path = path_in; 3300 struct btrfs_key key; 3301 int ret; 3302 3303 if (inode->logged_trans == trans->transid) 3304 return 1; 3305 3306 /* 3307 * If logged_trans is not 0, then we know the inode was not logged 3308 * in this transaction, so we can return false right away. 3309 */ 3310 if (inode->logged_trans > 0) 3311 return 0; 3312 3313 /* 3314 * If no log tree was created for this root in this transaction, then 3315 * the inode can not have been logged in this transaction. In that case 3316 * set logged_trans to anything greater than 0 and less than the current 3317 * transaction's ID, to avoid the search below in a future call in case 3318 * a log tree gets created after this. 3319 */ 3320 if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &inode->root->state)) { 3321 inode->logged_trans = trans->transid - 1; 3322 return 0; 3323 } 3324 3325 /* 3326 * We have a log tree and the inode's logged_trans is 0. We can't tell 3327 * for sure if the inode was logged before in this transaction by looking 3328 * only at logged_trans. We could be pessimistic and assume it was, but 3329 * that can lead to unnecessarily logging an inode during rename and link 3330 * operations, and then further updating the log in followup rename and 3331 * link operations, especially if it's a directory, which adds latency 3332 * visible to applications doing a series of rename or link operations. 3333 * 3334 * A logged_trans of 0 here can mean several things: 3335 * 3336 * 1) The inode was never logged since the filesystem was mounted, and may 3337 * or may not have been evicted and loaded again; 3338 * 3339 * 2) The inode was logged in a previous transaction, then evicted and 3340 * then loaded again; 3341 * 3342 * 3) The inode was logged in the current transaction, then evicted and 3343 * then loaded again. 3344 * 3345 * For cases 1) and 2) we don't want to return true, but we need to detect 3346 * case 3) and return true. So we do a search in the log root for the inode 3347 * item. 3348 */ 3349 key.objectid = btrfs_ino(inode); 3350 key.type = BTRFS_INODE_ITEM_KEY; 3351 key.offset = 0; 3352 3353 if (!path) { 3354 path = btrfs_alloc_path(); 3355 if (!path) 3356 return -ENOMEM; 3357 } 3358 3359 ret = btrfs_search_slot(NULL, inode->root->log_root, &key, path, 0, 0); 3360 3361 if (path_in) 3362 btrfs_release_path(path); 3363 else 3364 btrfs_free_path(path); 3365 3366 /* 3367 * Logging an inode always results in logging its inode item. So if we 3368 * did not find the item we know the inode was not logged for sure. 3369 */ 3370 if (ret < 0) { 3371 return ret; 3372 } else if (ret > 0) { 3373 /* 3374 * Set logged_trans to a value greater than 0 and less than the 3375 * current transaction to avoid doing the search in future calls. 3376 */ 3377 inode->logged_trans = trans->transid - 1; 3378 return 0; 3379 } 3380 3381 /* 3382 * The inode was previously logged and then evicted, set logged_trans to 3383 * the current transaction's ID, to avoid future tree searches as long as 3384 * the inode is not evicted again.
3385 */ 3386 inode->logged_trans = trans->transid; 3387 3388 /* 3389 * If it's a directory, then we must set last_dir_index_offset to the 3390 * maximum possible value, so that the next attempt to log the inode does 3391 * not skip checking if dir index keys found in modified subvolume tree 3392 * leaves have been logged before, otherwise it would result in attempts 3393 * to insert duplicate dir index keys in the log tree. This must be done 3394 * because last_dir_index_offset is an in-memory only field, not persisted 3395 * in the inode item or any other on-disk structure, so its value is lost 3396 * once the inode is evicted. 3397 */ 3398 if (S_ISDIR(inode->vfs_inode.i_mode)) 3399 inode->last_dir_index_offset = (u64)-1; 3400 3401 return 1; 3402 } 3403 3404 /* 3405 * Delete a directory entry from the log if it exists. 3406 * 3407 * Returns < 0 on error 3408 * 1 if the entry does not exist 3409 * 0 if the entry existed and was successfully deleted 3410 */ 3411 static int del_logged_dentry(struct btrfs_trans_handle *trans, 3412 struct btrfs_root *log, 3413 struct btrfs_path *path, 3414 u64 dir_ino, 3415 const struct fscrypt_str *name, 3416 u64 index) 3417 { 3418 struct btrfs_dir_item *di; 3419 3420 /* 3421 * We only log dir index items of a directory, so we don't need to look 3422 * for dir item keys. 3423 */ 3424 di = btrfs_lookup_dir_index_item(trans, log, path, dir_ino, 3425 index, name, -1); 3426 if (IS_ERR(di)) 3427 return PTR_ERR(di); 3428 else if (!di) 3429 return 1; 3430 3431 /* 3432 * We do not need to update the size field of the directory's 3433 * inode item because on log replay we update the field to reflect 3434 * all existing entries in the directory (see overwrite_item()). 3435 */ 3436 return btrfs_delete_one_dir_name(trans, log, path, di); 3437 } 3438 3439 /* 3440 * If both a file and directory are logged, and unlinks or renames are 3441 * mixed in, we have a few interesting corners: 3442 * 3443 * create file X in dir Y 3444 * link file X to X.link in dir Y 3445 * fsync file X 3446 * unlink file X but leave X.link 3447 * fsync dir Y 3448 * 3449 * After a crash we would expect only X.link to exist. But file X 3450 * didn't get fsync'd again so the log has back refs for X and X.link. 3451 * 3452 * We solve this by removing directory entries and inode backrefs from the 3453 * log when a file that was logged in the current transaction is 3454 * unlinked. Any later fsync will include the updated log entries, and 3455 * we'll be able to reconstruct the proper directory items from backrefs. 3456 * 3457 * This optimization allows us to avoid relogging the entire inode 3458 * or the entire directory.
3459 */ 3460 void btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, 3461 struct btrfs_root *root, 3462 const struct fscrypt_str *name, 3463 struct btrfs_inode *dir, u64 index) 3464 { 3465 struct btrfs_path *path; 3466 int ret; 3467 3468 ret = inode_logged(trans, dir, NULL); 3469 if (ret == 0) 3470 return; 3471 else if (ret < 0) { 3472 btrfs_set_log_full_commit(trans); 3473 return; 3474 } 3475 3476 ret = join_running_log_trans(root); 3477 if (ret) 3478 return; 3479 3480 mutex_lock(&dir->log_mutex); 3481 3482 path = btrfs_alloc_path(); 3483 if (!path) { 3484 ret = -ENOMEM; 3485 goto out_unlock; 3486 } 3487 3488 ret = del_logged_dentry(trans, root->log_root, path, btrfs_ino(dir), 3489 name, index); 3490 btrfs_free_path(path); 3491 out_unlock: 3492 mutex_unlock(&dir->log_mutex); 3493 if (ret < 0) 3494 btrfs_set_log_full_commit(trans); 3495 btrfs_end_log_trans(root); 3496 } 3497 3498 /* see comments for btrfs_del_dir_entries_in_log */ 3499 void btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, 3500 struct btrfs_root *root, 3501 const struct fscrypt_str *name, 3502 struct btrfs_inode *inode, u64 dirid) 3503 { 3504 struct btrfs_root *log; 3505 u64 index; 3506 int ret; 3507 3508 ret = inode_logged(trans, inode, NULL); 3509 if (ret == 0) 3510 return; 3511 else if (ret < 0) { 3512 btrfs_set_log_full_commit(trans); 3513 return; 3514 } 3515 3516 ret = join_running_log_trans(root); 3517 if (ret) 3518 return; 3519 log = root->log_root; 3520 mutex_lock(&inode->log_mutex); 3521 3522 ret = btrfs_del_inode_ref(trans, log, name, btrfs_ino(inode), 3523 dirid, &index); 3524 mutex_unlock(&inode->log_mutex); 3525 if (ret < 0 && ret != -ENOENT) 3526 btrfs_set_log_full_commit(trans); 3527 btrfs_end_log_trans(root); 3528 } 3529 3530 /* 3531 * creates a range item in the log for 'dirid'. first_offset and 3532 * last_offset tell us which parts of the key space the log should 3533 * be considered authoritative for. 3534 */ 3535 static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans, 3536 struct btrfs_root *log, 3537 struct btrfs_path *path, 3538 u64 dirid, 3539 u64 first_offset, u64 last_offset) 3540 { 3541 int ret; 3542 struct btrfs_key key; 3543 struct btrfs_dir_log_item *item; 3544 3545 key.objectid = dirid; 3546 key.offset = first_offset; 3547 key.type = BTRFS_DIR_LOG_INDEX_KEY; 3548 ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item)); 3549 /* 3550 * -EEXIST is fine and can happen sporadically when we are logging a 3551 * directory and have concurrent insertions in the subvolume's tree for 3552 * items from other inodes and that result in pushing off some dir items 3553 * from one leaf to another in order to accommodate for the new items. 3554 * This results in logging the same dir index range key. 3555 */ 3556 if (ret && ret != -EEXIST) 3557 return ret; 3558 3559 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 3560 struct btrfs_dir_log_item); 3561 if (ret == -EEXIST) { 3562 const u64 curr_end = btrfs_dir_log_end(path->nodes[0], item); 3563 3564 /* 3565 * btrfs_del_dir_entries_in_log() might have been called during 3566 * an unlink between the initial insertion of this key and the 3567 * current update, or we might be logging a single entry deletion 3568 * during a rename, so set the new last_offset to the max value. 
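 * For example, with made-up offsets: if the existing item already covers
 * offsets up to 25 and this call asks for a range ending at 18, keep the
 * end at 25 so the logged range never shrinks.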
3569 */ 3570 last_offset = max(last_offset, curr_end); 3571 } 3572 btrfs_set_dir_log_end(path->nodes[0], item, last_offset); 3573 btrfs_mark_buffer_dirty(path->nodes[0]); 3574 btrfs_release_path(path); 3575 return 0; 3576 } 3577 3578 static int flush_dir_items_batch(struct btrfs_trans_handle *trans, 3579 struct btrfs_inode *inode, 3580 struct extent_buffer *src, 3581 struct btrfs_path *dst_path, 3582 int start_slot, 3583 int count) 3584 { 3585 struct btrfs_root *log = inode->root->log_root; 3586 char *ins_data = NULL; 3587 struct btrfs_item_batch batch; 3588 struct extent_buffer *dst; 3589 unsigned long src_offset; 3590 unsigned long dst_offset; 3591 u64 last_index; 3592 struct btrfs_key key; 3593 u32 item_size; 3594 int ret; 3595 int i; 3596 3597 ASSERT(count > 0); 3598 batch.nr = count; 3599 3600 if (count == 1) { 3601 btrfs_item_key_to_cpu(src, &key, start_slot); 3602 item_size = btrfs_item_size(src, start_slot); 3603 batch.keys = &key; 3604 batch.data_sizes = &item_size; 3605 batch.total_data_size = item_size; 3606 } else { 3607 struct btrfs_key *ins_keys; 3608 u32 *ins_sizes; 3609 3610 ins_data = kmalloc(count * sizeof(u32) + 3611 count * sizeof(struct btrfs_key), GFP_NOFS); 3612 if (!ins_data) 3613 return -ENOMEM; 3614 3615 ins_sizes = (u32 *)ins_data; 3616 ins_keys = (struct btrfs_key *)(ins_data + count * sizeof(u32)); 3617 batch.keys = ins_keys; 3618 batch.data_sizes = ins_sizes; 3619 batch.total_data_size = 0; 3620 3621 for (i = 0; i < count; i++) { 3622 const int slot = start_slot + i; 3623 3624 btrfs_item_key_to_cpu(src, &ins_keys[i], slot); 3625 ins_sizes[i] = btrfs_item_size(src, slot); 3626 batch.total_data_size += ins_sizes[i]; 3627 } 3628 } 3629 3630 ret = btrfs_insert_empty_items(trans, log, dst_path, &batch); 3631 if (ret) 3632 goto out; 3633 3634 dst = dst_path->nodes[0]; 3635 /* 3636 * Copy all the items in bulk, in a single copy operation. Item data is 3637 * organized such that it's placed at the end of a leaf and from right 3638 * to left. For example, the data for the second item ends at an offset 3639 * that matches the offset where the data for the first item starts, the 3640 * data for the third item ends at an offset that matches the offset 3641 * where the data of the second items starts, and so on. 3642 * Therefore our source and destination start offsets for copy match the 3643 * offsets of the last items (highest slots). 3644 */ 3645 dst_offset = btrfs_item_ptr_offset(dst, dst_path->slots[0] + count - 1); 3646 src_offset = btrfs_item_ptr_offset(src, start_slot + count - 1); 3647 copy_extent_buffer(dst, src, dst_offset, src_offset, batch.total_data_size); 3648 btrfs_release_path(dst_path); 3649 3650 last_index = batch.keys[count - 1].offset; 3651 ASSERT(last_index > inode->last_dir_index_offset); 3652 3653 /* 3654 * If for some unexpected reason the last item's index is not greater 3655 * than the last index we logged, warn and return an error to fallback 3656 * to a transaction commit. 
3657 */ 3658 if (WARN_ON(last_index <= inode->last_dir_index_offset)) 3659 ret = -EUCLEAN; 3660 else 3661 inode->last_dir_index_offset = last_index; 3662 out: 3663 kfree(ins_data); 3664 3665 return ret; 3666 } 3667 3668 static int process_dir_items_leaf(struct btrfs_trans_handle *trans, 3669 struct btrfs_inode *inode, 3670 struct btrfs_path *path, 3671 struct btrfs_path *dst_path, 3672 struct btrfs_log_ctx *ctx, 3673 u64 *last_old_dentry_offset) 3674 { 3675 struct btrfs_root *log = inode->root->log_root; 3676 struct extent_buffer *src; 3677 const int nritems = btrfs_header_nritems(path->nodes[0]); 3678 const u64 ino = btrfs_ino(inode); 3679 bool last_found = false; 3680 int batch_start = 0; 3681 int batch_size = 0; 3682 int i; 3683 3684 /* 3685 * We need to clone the leaf, release the read lock on it, and use the 3686 * clone before modifying the log tree. See the comment at copy_items() 3687 * about why we need to do this. 3688 */ 3689 src = btrfs_clone_extent_buffer(path->nodes[0]); 3690 if (!src) 3691 return -ENOMEM; 3692 3693 i = path->slots[0]; 3694 btrfs_release_path(path); 3695 path->nodes[0] = src; 3696 path->slots[0] = i; 3697 3698 for (; i < nritems; i++) { 3699 struct btrfs_dir_item *di; 3700 struct btrfs_key key; 3701 int ret; 3702 3703 btrfs_item_key_to_cpu(src, &key, i); 3704 3705 if (key.objectid != ino || key.type != BTRFS_DIR_INDEX_KEY) { 3706 last_found = true; 3707 break; 3708 } 3709 3710 di = btrfs_item_ptr(src, i, struct btrfs_dir_item); 3711 3712 /* 3713 * Skip ranges of items that consist only of dir item keys created 3714 * in past transactions. However if we find a gap, we must log a 3715 * dir index range item for that gap, so that index keys in that 3716 * gap are deleted during log replay. 3717 */ 3718 if (btrfs_dir_transid(src, di) < trans->transid) { 3719 if (key.offset > *last_old_dentry_offset + 1) { 3720 ret = insert_dir_log_key(trans, log, dst_path, 3721 ino, *last_old_dentry_offset + 1, 3722 key.offset - 1); 3723 if (ret < 0) 3724 return ret; 3725 } 3726 3727 *last_old_dentry_offset = key.offset; 3728 continue; 3729 } 3730 3731 /* If we logged this dir index item before, we can skip it. */ 3732 if (key.offset <= inode->last_dir_index_offset) 3733 continue; 3734 3735 /* 3736 * We must make sure that when we log a directory entry, the 3737 * corresponding inode, after log replay, has a matching link 3738 * count. For example: 3739 * 3740 * touch foo 3741 * mkdir mydir 3742 * sync 3743 * ln foo mydir/bar 3744 * xfs_io -c "fsync" mydir 3745 * <crash> 3746 * <mount fs and log replay> 3747 * 3748 * Would result in a fsync log that when replayed, our file inode 3749 * would have a link count of 1, but we get two directory entries 3750 * pointing to the same inode. After removing one of the names, 3751 * it would not be possible to remove the other name, which 3752 * resulted always in stale file handle errors, and would not be 3753 * possible to rmdir the parent directory, since its i_size could 3754 * never be decremented to the value BTRFS_EMPTY_DIR_SIZE, 3755 * resulting in -ENOTEMPTY errors. 
3756 */ 3757 if (!ctx->log_new_dentries) { 3758 struct btrfs_key di_key; 3759 3760 btrfs_dir_item_key_to_cpu(src, di, &di_key); 3761 if (di_key.type != BTRFS_ROOT_ITEM_KEY) 3762 ctx->log_new_dentries = true; 3763 } 3764 3765 if (batch_size == 0) 3766 batch_start = i; 3767 batch_size++; 3768 } 3769 3770 if (batch_size > 0) { 3771 int ret; 3772 3773 ret = flush_dir_items_batch(trans, inode, src, dst_path, 3774 batch_start, batch_size); 3775 if (ret < 0) 3776 return ret; 3777 } 3778 3779 return last_found ? 1 : 0; 3780 } 3781 3782 /* 3783 * log all the items included in the current transaction for a given 3784 * directory. This also creates the range items in the log tree required 3785 * to replay anything deleted before the fsync 3786 */ 3787 static noinline int log_dir_items(struct btrfs_trans_handle *trans, 3788 struct btrfs_inode *inode, 3789 struct btrfs_path *path, 3790 struct btrfs_path *dst_path, 3791 struct btrfs_log_ctx *ctx, 3792 u64 min_offset, u64 *last_offset_ret) 3793 { 3794 struct btrfs_key min_key; 3795 struct btrfs_root *root = inode->root; 3796 struct btrfs_root *log = root->log_root; 3797 int err = 0; 3798 int ret; 3799 u64 last_old_dentry_offset = min_offset - 1; 3800 u64 last_offset = (u64)-1; 3801 u64 ino = btrfs_ino(inode); 3802 3803 min_key.objectid = ino; 3804 min_key.type = BTRFS_DIR_INDEX_KEY; 3805 min_key.offset = min_offset; 3806 3807 ret = btrfs_search_forward(root, &min_key, path, trans->transid); 3808 3809 /* 3810 * we didn't find anything from this transaction, see if there 3811 * is anything at all 3812 */ 3813 if (ret != 0 || min_key.objectid != ino || 3814 min_key.type != BTRFS_DIR_INDEX_KEY) { 3815 min_key.objectid = ino; 3816 min_key.type = BTRFS_DIR_INDEX_KEY; 3817 min_key.offset = (u64)-1; 3818 btrfs_release_path(path); 3819 ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); 3820 if (ret < 0) { 3821 btrfs_release_path(path); 3822 return ret; 3823 } 3824 ret = btrfs_previous_item(root, path, ino, BTRFS_DIR_INDEX_KEY); 3825 3826 /* if ret == 0 there are items for this type, 3827 * create a range to tell us the last key of this type. 3828 * otherwise, there are no items in this directory after 3829 * *min_offset, and we create a range to indicate that. 3830 */ 3831 if (ret == 0) { 3832 struct btrfs_key tmp; 3833 3834 btrfs_item_key_to_cpu(path->nodes[0], &tmp, 3835 path->slots[0]); 3836 if (tmp.type == BTRFS_DIR_INDEX_KEY) 3837 last_old_dentry_offset = tmp.offset; 3838 } else if (ret < 0) { 3839 err = ret; 3840 } 3841 3842 goto done; 3843 } 3844 3845 /* go backward to find any previous key */ 3846 ret = btrfs_previous_item(root, path, ino, BTRFS_DIR_INDEX_KEY); 3847 if (ret == 0) { 3848 struct btrfs_key tmp; 3849 3850 btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]); 3851 /* 3852 * The dir index key before the first one we found that needs to 3853 * be logged might be in a previous leaf, and there might be a 3854 * gap between these keys, meaning that we had deletions that 3855 * happened. So the key range item we log (key type 3856 * BTRFS_DIR_LOG_INDEX_KEY) must cover a range that starts at the 3857 * previous key's offset plus 1, so that those deletes are replayed. 3858 */ 3859 if (tmp.type == BTRFS_DIR_INDEX_KEY) 3860 last_old_dentry_offset = tmp.offset; 3861 } else if (ret < 0) { 3862 err = ret; 3863 goto done; 3864 } 3865 3866 btrfs_release_path(path); 3867 3868 /* 3869 * Find the first key from this transaction again or the one we were at 3870 * in the loop below in case we had to reschedule. 
We may be logging the 3871 * directory without holding its VFS lock, which happen when logging new 3872 * dentries (through log_new_dir_dentries()) or in some cases when we 3873 * need to log the parent directory of an inode. This means a dir index 3874 * key might be deleted from the inode's root, and therefore we may not 3875 * find it anymore. If we can't find it, just move to the next key. We 3876 * can not bail out and ignore, because if we do that we will simply 3877 * not log dir index keys that come after the one that was just deleted 3878 * and we can end up logging a dir index range that ends at (u64)-1 3879 * (@last_offset is initialized to that), resulting in removing dir 3880 * entries we should not remove at log replay time. 3881 */ 3882 search: 3883 ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); 3884 if (ret > 0) 3885 ret = btrfs_next_item(root, path); 3886 if (ret < 0) 3887 err = ret; 3888 /* If ret is 1, there are no more keys in the inode's root. */ 3889 if (ret != 0) 3890 goto done; 3891 3892 /* 3893 * we have a block from this transaction, log every item in it 3894 * from our directory 3895 */ 3896 while (1) { 3897 ret = process_dir_items_leaf(trans, inode, path, dst_path, ctx, 3898 &last_old_dentry_offset); 3899 if (ret != 0) { 3900 if (ret < 0) 3901 err = ret; 3902 goto done; 3903 } 3904 path->slots[0] = btrfs_header_nritems(path->nodes[0]); 3905 3906 /* 3907 * look ahead to the next item and see if it is also 3908 * from this directory and from this transaction 3909 */ 3910 ret = btrfs_next_leaf(root, path); 3911 if (ret) { 3912 if (ret == 1) 3913 last_offset = (u64)-1; 3914 else 3915 err = ret; 3916 goto done; 3917 } 3918 btrfs_item_key_to_cpu(path->nodes[0], &min_key, path->slots[0]); 3919 if (min_key.objectid != ino || min_key.type != BTRFS_DIR_INDEX_KEY) { 3920 last_offset = (u64)-1; 3921 goto done; 3922 } 3923 if (btrfs_header_generation(path->nodes[0]) != trans->transid) { 3924 /* 3925 * The next leaf was not changed in the current transaction 3926 * and has at least one dir index key. 3927 * We check for the next key because there might have been 3928 * one or more deletions between the last key we logged and 3929 * that next key. So the key range item we log (key type 3930 * BTRFS_DIR_LOG_INDEX_KEY) must end at the next key's 3931 * offset minus 1, so that those deletes are replayed. 3932 */ 3933 last_offset = min_key.offset - 1; 3934 goto done; 3935 } 3936 if (need_resched()) { 3937 btrfs_release_path(path); 3938 cond_resched(); 3939 goto search; 3940 } 3941 } 3942 done: 3943 btrfs_release_path(path); 3944 btrfs_release_path(dst_path); 3945 3946 if (err == 0) { 3947 *last_offset_ret = last_offset; 3948 /* 3949 * In case the leaf was changed in the current transaction but 3950 * all its dir items are from a past transaction, the last item 3951 * in the leaf is a dir item and there's no gap between that last 3952 * dir item and the first one on the next leaf (which did not 3953 * change in the current transaction), then we don't need to log 3954 * a range, last_old_dentry_offset is == to last_offset. 
3955 */ 3956 ASSERT(last_old_dentry_offset <= last_offset); 3957 if (last_old_dentry_offset < last_offset) { 3958 ret = insert_dir_log_key(trans, log, path, ino, 3959 last_old_dentry_offset + 1, 3960 last_offset); 3961 if (ret) 3962 err = ret; 3963 } 3964 } 3965 return err; 3966 } 3967 3968 /* 3969 * If the inode was logged before and it was evicted, then its 3970 * last_dir_index_offset is (u64)-1, so we don't know the value of the last index 3971 * key offset. If that's the case, search for it and update the inode. This 3972 * is to avoid lookups in the log tree every time we try to insert a dir index 3973 * key from a leaf changed in the current transaction, and to allow us to always 3974 * do batch insertions of dir index keys. 3975 */ 3976 static int update_last_dir_index_offset(struct btrfs_inode *inode, 3977 struct btrfs_path *path, 3978 const struct btrfs_log_ctx *ctx) 3979 { 3980 const u64 ino = btrfs_ino(inode); 3981 struct btrfs_key key; 3982 int ret; 3983 3984 lockdep_assert_held(&inode->log_mutex); 3985 3986 if (inode->last_dir_index_offset != (u64)-1) 3987 return 0; 3988 3989 if (!ctx->logged_before) { 3990 inode->last_dir_index_offset = BTRFS_DIR_START_INDEX - 1; 3991 return 0; 3992 } 3993 3994 key.objectid = ino; 3995 key.type = BTRFS_DIR_INDEX_KEY; 3996 key.offset = (u64)-1; 3997 3998 ret = btrfs_search_slot(NULL, inode->root->log_root, &key, path, 0, 0); 3999 /* 4000 * An error happened or we actually have an index key with an offset 4001 * value of (u64)-1. Bail out, we're done. 4002 */ 4003 if (ret <= 0) 4004 goto out; 4005 4006 ret = 0; 4007 inode->last_dir_index_offset = BTRFS_DIR_START_INDEX - 1; 4008 4009 /* 4010 * No dir index items, bail out and leave last_dir_index_offset with 4011 * the value right before the first valid index value. 4012 */ 4013 if (path->slots[0] == 0) 4014 goto out; 4015 4016 /* 4017 * btrfs_search_slot() left us at one slot beyond the slot with the last 4018 * index key, or beyond the last key of the directory that is not an 4019 * index key. If we have an index key before, set last_dir_index_offset 4020 * to its offset value, otherwise leave it with a value right before the 4021 * first valid index value, as it means we have an empty directory. 4022 */ 4023 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0] - 1); 4024 if (key.objectid == ino && key.type == BTRFS_DIR_INDEX_KEY) 4025 inode->last_dir_index_offset = key.offset; 4026 4027 out: 4028 btrfs_release_path(path); 4029 4030 return ret; 4031 } 4032 4033 /* 4034 * Logging directories is very similar to logging inodes. We find all the items 4035 * from the current transaction and write them to the log. 4036 * 4037 * The recovery code scans the directory in the subvolume, and if it finds a 4038 * key in the range logged that is not present in the log tree, then it means 4039 * that dir entry was unlinked during the transaction. 4040 * 4041 * In order for that scan to work, we must include one key smaller than 4042 * the smallest key logged by this transaction and one key larger than the largest 4043 * key logged by this transaction.
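 *
 * For example (index numbers purely illustrative): if dir index key 6 existed
 * before this transaction and the corresponding entry was unlinked during it,
 * the logged range will cover index 6; at replay time the recovery code finds
 * index 6 in the subvolume directory, sees that it falls inside a logged range
 * but has no matching item in the log tree, and removes that entry.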
4044 */ 4045 static noinline int log_directory_changes(struct btrfs_trans_handle *trans, 4046 struct btrfs_inode *inode, 4047 struct btrfs_path *path, 4048 struct btrfs_path *dst_path, 4049 struct btrfs_log_ctx *ctx) 4050 { 4051 u64 min_key; 4052 u64 max_key; 4053 int ret; 4054 4055 ret = update_last_dir_index_offset(inode, path, ctx); 4056 if (ret) 4057 return ret; 4058 4059 min_key = BTRFS_DIR_START_INDEX; 4060 max_key = 0; 4061 4062 while (1) { 4063 ret = log_dir_items(trans, inode, path, dst_path, 4064 ctx, min_key, &max_key); 4065 if (ret) 4066 return ret; 4067 if (max_key == (u64)-1) 4068 break; 4069 min_key = max_key + 1; 4070 } 4071 4072 return 0; 4073 } 4074 4075 /* 4076 * a helper function to drop items from the log before we relog an 4077 * inode. max_key_type indicates the highest item type to remove. 4078 * This cannot be run for file data extents because it does not 4079 * free the extents they point to. 4080 */ 4081 static int drop_inode_items(struct btrfs_trans_handle *trans, 4082 struct btrfs_root *log, 4083 struct btrfs_path *path, 4084 struct btrfs_inode *inode, 4085 int max_key_type) 4086 { 4087 int ret; 4088 struct btrfs_key key; 4089 struct btrfs_key found_key; 4090 int start_slot; 4091 4092 key.objectid = btrfs_ino(inode); 4093 key.type = max_key_type; 4094 key.offset = (u64)-1; 4095 4096 while (1) { 4097 ret = btrfs_search_slot(trans, log, &key, path, -1, 1); 4098 BUG_ON(ret == 0); /* Logic error */ 4099 if (ret < 0) 4100 break; 4101 4102 if (path->slots[0] == 0) 4103 break; 4104 4105 path->slots[0]--; 4106 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 4107 path->slots[0]); 4108 4109 if (found_key.objectid != key.objectid) 4110 break; 4111 4112 found_key.offset = 0; 4113 found_key.type = 0; 4114 ret = btrfs_bin_search(path->nodes[0], &found_key, &start_slot); 4115 if (ret < 0) 4116 break; 4117 4118 ret = btrfs_del_items(trans, log, path, start_slot, 4119 path->slots[0] - start_slot + 1); 4120 /* 4121 * If start slot isn't 0 then we don't need to re-search, we've 4122 * found the last guy with the objectid in this tree. 
4123 */ 4124 if (ret || start_slot != 0) 4125 break; 4126 btrfs_release_path(path); 4127 } 4128 btrfs_release_path(path); 4129 if (ret > 0) 4130 ret = 0; 4131 return ret; 4132 } 4133 4134 static int truncate_inode_items(struct btrfs_trans_handle *trans, 4135 struct btrfs_root *log_root, 4136 struct btrfs_inode *inode, 4137 u64 new_size, u32 min_type) 4138 { 4139 struct btrfs_truncate_control control = { 4140 .new_size = new_size, 4141 .ino = btrfs_ino(inode), 4142 .min_type = min_type, 4143 .skip_ref_updates = true, 4144 }; 4145 4146 return btrfs_truncate_inode_items(trans, log_root, &control); 4147 } 4148 4149 static void fill_inode_item(struct btrfs_trans_handle *trans, 4150 struct extent_buffer *leaf, 4151 struct btrfs_inode_item *item, 4152 struct inode *inode, int log_inode_only, 4153 u64 logged_isize) 4154 { 4155 struct btrfs_map_token token; 4156 u64 flags; 4157 4158 btrfs_init_map_token(&token, leaf); 4159 4160 if (log_inode_only) { 4161 /* set the generation to zero so the recover code 4162 * can tell the difference between an logging 4163 * just to say 'this inode exists' and a logging 4164 * to say 'update this inode with these values' 4165 */ 4166 btrfs_set_token_inode_generation(&token, item, 0); 4167 btrfs_set_token_inode_size(&token, item, logged_isize); 4168 } else { 4169 btrfs_set_token_inode_generation(&token, item, 4170 BTRFS_I(inode)->generation); 4171 btrfs_set_token_inode_size(&token, item, inode->i_size); 4172 } 4173 4174 btrfs_set_token_inode_uid(&token, item, i_uid_read(inode)); 4175 btrfs_set_token_inode_gid(&token, item, i_gid_read(inode)); 4176 btrfs_set_token_inode_mode(&token, item, inode->i_mode); 4177 btrfs_set_token_inode_nlink(&token, item, inode->i_nlink); 4178 4179 btrfs_set_token_timespec_sec(&token, &item->atime, 4180 inode->i_atime.tv_sec); 4181 btrfs_set_token_timespec_nsec(&token, &item->atime, 4182 inode->i_atime.tv_nsec); 4183 4184 btrfs_set_token_timespec_sec(&token, &item->mtime, 4185 inode->i_mtime.tv_sec); 4186 btrfs_set_token_timespec_nsec(&token, &item->mtime, 4187 inode->i_mtime.tv_nsec); 4188 4189 btrfs_set_token_timespec_sec(&token, &item->ctime, 4190 inode->i_ctime.tv_sec); 4191 btrfs_set_token_timespec_nsec(&token, &item->ctime, 4192 inode->i_ctime.tv_nsec); 4193 4194 /* 4195 * We do not need to set the nbytes field, in fact during a fast fsync 4196 * its value may not even be correct, since a fast fsync does not wait 4197 * for ordered extent completion, which is where we update nbytes, it 4198 * only waits for writeback to complete. During log replay as we find 4199 * file extent items and replay them, we adjust the nbytes field of the 4200 * inode item in subvolume tree as needed (see overwrite_item()). 
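 *
 * For example (purely illustrative): a fast fsync issued right after a series
 * of buffered writes may run before the ordered extents complete, so the
 * in-memory nbytes may still reflect the old on-disk usage at this point.
 * Persisting it here could record a wrong value, while letting log replay
 * recompute it from the replayed file extent items always ends up correct.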
4201 */ 4202 4203 btrfs_set_token_inode_sequence(&token, item, inode_peek_iversion(inode)); 4204 btrfs_set_token_inode_transid(&token, item, trans->transid); 4205 btrfs_set_token_inode_rdev(&token, item, inode->i_rdev); 4206 flags = btrfs_inode_combine_flags(BTRFS_I(inode)->flags, 4207 BTRFS_I(inode)->ro_flags); 4208 btrfs_set_token_inode_flags(&token, item, flags); 4209 btrfs_set_token_inode_block_group(&token, item, 0); 4210 } 4211 4212 static int log_inode_item(struct btrfs_trans_handle *trans, 4213 struct btrfs_root *log, struct btrfs_path *path, 4214 struct btrfs_inode *inode, bool inode_item_dropped) 4215 { 4216 struct btrfs_inode_item *inode_item; 4217 int ret; 4218 4219 /* 4220 * If we are doing a fast fsync and the inode was logged before in the 4221 * current transaction, then we know the inode was previously logged and 4222 * it exists in the log tree. For performance reasons, in this case use 4223 * btrfs_search_slot() directly with ins_len set to 0 so that we never 4224 * attempt a write lock on the leaf's parent, which adds unnecessary lock 4225 * contention in case there are concurrent fsyncs for other inodes of the 4226 * same subvolume. Using btrfs_insert_empty_item() when the inode item 4227 * already exists can also result in unnecessarily splitting a leaf. 4228 */ 4229 if (!inode_item_dropped && inode->logged_trans == trans->transid) { 4230 ret = btrfs_search_slot(trans, log, &inode->location, path, 0, 1); 4231 ASSERT(ret <= 0); 4232 if (ret > 0) 4233 ret = -ENOENT; 4234 } else { 4235 /* 4236 * This means it is the first fsync in the current transaction, 4237 * so the inode item is not in the log and we need to insert it. 4238 * We can never get -EEXIST because we are only called for a fast 4239 * fsync and in case an inode eviction happens after the inode was 4240 * logged before in the current transaction, when we load again 4241 * the inode, we set BTRFS_INODE_NEEDS_FULL_SYNC on its runtime 4242 * flags and set ->logged_trans to 0. 4243 */ 4244 ret = btrfs_insert_empty_item(trans, log, path, &inode->location, 4245 sizeof(*inode_item)); 4246 ASSERT(ret != -EEXIST); 4247 } 4248 if (ret) 4249 return ret; 4250 inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0], 4251 struct btrfs_inode_item); 4252 fill_inode_item(trans, path->nodes[0], inode_item, &inode->vfs_inode, 4253 0, 0); 4254 btrfs_release_path(path); 4255 return 0; 4256 } 4257 4258 static int log_csums(struct btrfs_trans_handle *trans, 4259 struct btrfs_inode *inode, 4260 struct btrfs_root *log_root, 4261 struct btrfs_ordered_sum *sums) 4262 { 4263 const u64 lock_end = sums->bytenr + sums->len - 1; 4264 struct extent_state *cached_state = NULL; 4265 int ret; 4266 4267 /* 4268 * If this inode was not used for reflink operations in the current 4269 * transaction with new extents, then do the fast path, no need to 4270 * worry about logging checksum items with overlapping ranges. 4271 */ 4272 if (inode->last_reflink_trans < trans->transid) 4273 return btrfs_csum_file_blocks(trans, log_root, sums); 4274 4275 /* 4276 * Serialize logging for checksums. This is to avoid racing with the 4277 * same checksum being logged by another task that is logging another 4278 * file which happens to refer to the same extent as well. Such races 4279 * can leave checksum items in the log with overlapping ranges. 
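 *
 * For example (illustrative): if an extent of file A was reflinked into file B
 * during this transaction and both files are fsynced concurrently by different
 * tasks, both tasks may need to log csum items for byte ranges of that shared
 * extent; without this lock they could race and leave overlapping csum items
 * in the log tree.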
4280 */ 4281 ret = lock_extent(&log_root->log_csum_range, sums->bytenr, lock_end, 4282 &cached_state); 4283 if (ret) 4284 return ret; 4285 /* 4286 * Due to extent cloning, we might have logged a csum item that covers a 4287 * subrange of a cloned extent, and later we can end up logging a csum 4288 * item for a larger subrange of the same extent or the entire range. 4289 * This would leave csum items in the log tree that cover the same range 4290 * and break the searches for checksums in the log tree, resulting in 4291 * some checksums missing in the fs/subvolume tree. So just delete (or 4292 * trim and adjust) any existing csum items in the log for this range. 4293 */ 4294 ret = btrfs_del_csums(trans, log_root, sums->bytenr, sums->len); 4295 if (!ret) 4296 ret = btrfs_csum_file_blocks(trans, log_root, sums); 4297 4298 unlock_extent(&log_root->log_csum_range, sums->bytenr, lock_end, 4299 &cached_state); 4300 4301 return ret; 4302 } 4303 4304 static noinline int copy_items(struct btrfs_trans_handle *trans, 4305 struct btrfs_inode *inode, 4306 struct btrfs_path *dst_path, 4307 struct btrfs_path *src_path, 4308 int start_slot, int nr, int inode_only, 4309 u64 logged_isize) 4310 { 4311 struct btrfs_root *log = inode->root->log_root; 4312 struct btrfs_file_extent_item *extent; 4313 struct extent_buffer *src; 4314 int ret = 0; 4315 struct btrfs_key *ins_keys; 4316 u32 *ins_sizes; 4317 struct btrfs_item_batch batch; 4318 char *ins_data; 4319 int i; 4320 int dst_index; 4321 const bool skip_csum = (inode->flags & BTRFS_INODE_NODATASUM); 4322 const u64 i_size = i_size_read(&inode->vfs_inode); 4323 4324 /* 4325 * To keep lockdep happy and avoid deadlocks, clone the source leaf and 4326 * use the clone. This is because otherwise we would be changing the log 4327 * tree, to insert items from the subvolume tree or insert csum items, 4328 * while holding a read lock on a leaf from the subvolume tree, which 4329 * creates a nasty lock dependency when COWing log tree nodes/leaves: 4330 * 4331 * 1) Modifying the log tree triggers an extent buffer allocation while 4332 * holding a write lock on a parent extent buffer from the log tree. 4333 * Allocating the pages for an extent buffer, or the extent buffer 4334 * struct, can trigger inode eviction and finally the inode eviction 4335 * will trigger a release/remove of a delayed node, which requires 4336 * taking the delayed node's mutex; 4337 * 4338 * 2) Allocating a metadata extent for a log tree can trigger the async 4339 * reclaim thread and make us wait for it to release enough space and 4340 * unblock our reservation ticket. The reclaim thread can start 4341 * flushing delayed items, and that in turn results in the need to 4342 * lock delayed node mutexes and in the need to write lock extent 4343 * buffers of a subvolume tree - all this while holding a write lock 4344 * on the parent extent buffer in the log tree. 4345 * 4346 * So one task in scenario 1) running in parallel with another task in 4347 * scenario 2) could lead to a deadlock, one wanting to lock a delayed 4348 * node mutex while having a read lock on a leaf from the subvolume, 4349 * while the other is holding the delayed node's mutex and wants to 4350 * write lock the same subvolume leaf for flushing delayed items. 
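 *
 * Working on a private clone of the source leaf avoids all of this: the clone
 * is not part of the subvolume tree and is not locked, so we never hold a
 * subvolume tree leaf lock while COWing and modifying log tree leaves below.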
4351 */ 4352 src = btrfs_clone_extent_buffer(src_path->nodes[0]); 4353 if (!src) 4354 return -ENOMEM; 4355 4356 i = src_path->slots[0]; 4357 btrfs_release_path(src_path); 4358 src_path->nodes[0] = src; 4359 src_path->slots[0] = i; 4360 4361 ins_data = kmalloc(nr * sizeof(struct btrfs_key) + 4362 nr * sizeof(u32), GFP_NOFS); 4363 if (!ins_data) 4364 return -ENOMEM; 4365 4366 ins_sizes = (u32 *)ins_data; 4367 ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32)); 4368 batch.keys = ins_keys; 4369 batch.data_sizes = ins_sizes; 4370 batch.total_data_size = 0; 4371 batch.nr = 0; 4372 4373 dst_index = 0; 4374 for (i = 0; i < nr; i++) { 4375 const int src_slot = start_slot + i; 4376 struct btrfs_root *csum_root; 4377 struct btrfs_ordered_sum *sums; 4378 struct btrfs_ordered_sum *sums_next; 4379 LIST_HEAD(ordered_sums); 4380 u64 disk_bytenr; 4381 u64 disk_num_bytes; 4382 u64 extent_offset; 4383 u64 extent_num_bytes; 4384 bool is_old_extent; 4385 4386 btrfs_item_key_to_cpu(src, &ins_keys[dst_index], src_slot); 4387 4388 if (ins_keys[dst_index].type != BTRFS_EXTENT_DATA_KEY) 4389 goto add_to_batch; 4390 4391 extent = btrfs_item_ptr(src, src_slot, 4392 struct btrfs_file_extent_item); 4393 4394 is_old_extent = (btrfs_file_extent_generation(src, extent) < 4395 trans->transid); 4396 4397 /* 4398 * Don't copy extents from past generations. That would make us 4399 * log a lot more metadata for common cases like doing only a 4400 * few random writes into a file and then fsync it for the first 4401 * time or after the full sync flag is set on the inode. We can 4402 * get leaves full of extent items, most of which are from past 4403 * generations, so we can skip them - as long as the inode has 4404 * not been the target of a reflink operation in this transaction, 4405 * as in that case it might have had file extent items with old 4406 * generations copied into it. We also must always log prealloc 4407 * extents that start at or beyond eof, otherwise we would lose 4408 * them on log replay. 4409 */ 4410 if (is_old_extent && 4411 ins_keys[dst_index].offset < i_size && 4412 inode->last_reflink_trans < trans->transid) 4413 continue; 4414 4415 if (skip_csum) 4416 goto add_to_batch; 4417 4418 /* Only regular extents have checksums. */ 4419 if (btrfs_file_extent_type(src, extent) != BTRFS_FILE_EXTENT_REG) 4420 goto add_to_batch; 4421 4422 /* 4423 * If it's an extent created in a past transaction, then its 4424 * checksums are already accessible from the committed csum tree, 4425 * no need to log them. 4426 */ 4427 if (is_old_extent) 4428 goto add_to_batch; 4429 4430 disk_bytenr = btrfs_file_extent_disk_bytenr(src, extent); 4431 /* If it's an explicit hole, there are no checksums. 
*/ 4432 if (disk_bytenr == 0) 4433 goto add_to_batch; 4434 4435 disk_num_bytes = btrfs_file_extent_disk_num_bytes(src, extent); 4436 4437 if (btrfs_file_extent_compression(src, extent)) { 4438 extent_offset = 0; 4439 extent_num_bytes = disk_num_bytes; 4440 } else { 4441 extent_offset = btrfs_file_extent_offset(src, extent); 4442 extent_num_bytes = btrfs_file_extent_num_bytes(src, extent); 4443 } 4444 4445 csum_root = btrfs_csum_root(trans->fs_info, disk_bytenr); 4446 disk_bytenr += extent_offset; 4447 ret = btrfs_lookup_csums_list(csum_root, disk_bytenr, 4448 disk_bytenr + extent_num_bytes - 1, 4449 &ordered_sums, 0, false); 4450 if (ret) 4451 goto out; 4452 4453 list_for_each_entry_safe(sums, sums_next, &ordered_sums, list) { 4454 if (!ret) 4455 ret = log_csums(trans, inode, log, sums); 4456 list_del(&sums->list); 4457 kfree(sums); 4458 } 4459 if (ret) 4460 goto out; 4461 4462 add_to_batch: 4463 ins_sizes[dst_index] = btrfs_item_size(src, src_slot); 4464 batch.total_data_size += ins_sizes[dst_index]; 4465 batch.nr++; 4466 dst_index++; 4467 } 4468 4469 /* 4470 * We have a leaf full of old extent items that don't need to be logged, 4471 * so we don't need to do anything. 4472 */ 4473 if (batch.nr == 0) 4474 goto out; 4475 4476 ret = btrfs_insert_empty_items(trans, log, dst_path, &batch); 4477 if (ret) 4478 goto out; 4479 4480 dst_index = 0; 4481 for (i = 0; i < nr; i++) { 4482 const int src_slot = start_slot + i; 4483 const int dst_slot = dst_path->slots[0] + dst_index; 4484 struct btrfs_key key; 4485 unsigned long src_offset; 4486 unsigned long dst_offset; 4487 4488 /* 4489 * We're done, all the remaining items in the source leaf 4490 * correspond to old file extent items. 4491 */ 4492 if (dst_index >= batch.nr) 4493 break; 4494 4495 btrfs_item_key_to_cpu(src, &key, src_slot); 4496 4497 if (key.type != BTRFS_EXTENT_DATA_KEY) 4498 goto copy_item; 4499 4500 extent = btrfs_item_ptr(src, src_slot, 4501 struct btrfs_file_extent_item); 4502 4503 /* See the comment in the previous loop, same logic. 
*/ 4504 if (btrfs_file_extent_generation(src, extent) < trans->transid && 4505 key.offset < i_size && 4506 inode->last_reflink_trans < trans->transid) 4507 continue; 4508 4509 copy_item: 4510 dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0], dst_slot); 4511 src_offset = btrfs_item_ptr_offset(src, src_slot); 4512 4513 if (key.type == BTRFS_INODE_ITEM_KEY) { 4514 struct btrfs_inode_item *inode_item; 4515 4516 inode_item = btrfs_item_ptr(dst_path->nodes[0], dst_slot, 4517 struct btrfs_inode_item); 4518 fill_inode_item(trans, dst_path->nodes[0], inode_item, 4519 &inode->vfs_inode, 4520 inode_only == LOG_INODE_EXISTS, 4521 logged_isize); 4522 } else { 4523 copy_extent_buffer(dst_path->nodes[0], src, dst_offset, 4524 src_offset, ins_sizes[dst_index]); 4525 } 4526 4527 dst_index++; 4528 } 4529 4530 btrfs_mark_buffer_dirty(dst_path->nodes[0]); 4531 btrfs_release_path(dst_path); 4532 out: 4533 kfree(ins_data); 4534 4535 return ret; 4536 } 4537 4538 static int extent_cmp(void *priv, const struct list_head *a, 4539 const struct list_head *b) 4540 { 4541 const struct extent_map *em1, *em2; 4542 4543 em1 = list_entry(a, struct extent_map, list); 4544 em2 = list_entry(b, struct extent_map, list); 4545 4546 if (em1->start < em2->start) 4547 return -1; 4548 else if (em1->start > em2->start) 4549 return 1; 4550 return 0; 4551 } 4552 4553 static int log_extent_csums(struct btrfs_trans_handle *trans, 4554 struct btrfs_inode *inode, 4555 struct btrfs_root *log_root, 4556 const struct extent_map *em, 4557 struct btrfs_log_ctx *ctx) 4558 { 4559 struct btrfs_ordered_extent *ordered; 4560 struct btrfs_root *csum_root; 4561 u64 csum_offset; 4562 u64 csum_len; 4563 u64 mod_start = em->mod_start; 4564 u64 mod_len = em->mod_len; 4565 LIST_HEAD(ordered_sums); 4566 int ret = 0; 4567 4568 if (inode->flags & BTRFS_INODE_NODATASUM || 4569 test_bit(EXTENT_FLAG_PREALLOC, &em->flags) || 4570 em->block_start == EXTENT_MAP_HOLE) 4571 return 0; 4572 4573 list_for_each_entry(ordered, &ctx->ordered_extents, log_list) { 4574 const u64 ordered_end = ordered->file_offset + ordered->num_bytes; 4575 const u64 mod_end = mod_start + mod_len; 4576 struct btrfs_ordered_sum *sums; 4577 4578 if (mod_len == 0) 4579 break; 4580 4581 if (ordered_end <= mod_start) 4582 continue; 4583 if (mod_end <= ordered->file_offset) 4584 break; 4585 4586 /* 4587 * We are going to copy all the csums on this ordered extent, so 4588 * go ahead and adjust mod_start and mod_len in case this ordered 4589 * extent has already been logged. 4590 */ 4591 if (ordered->file_offset > mod_start) { 4592 if (ordered_end >= mod_end) 4593 mod_len = ordered->file_offset - mod_start; 4594 /* 4595 * If we have this case 4596 * 4597 * |--------- logged extent ---------| 4598 * |----- ordered extent ----| 4599 * 4600 * Just don't mess with mod_start and mod_len, we'll 4601 * just end up logging more csums than we need and it 4602 * will be ok. 4603 */ 4604 } else { 4605 if (ordered_end < mod_end) { 4606 mod_len = mod_end - ordered_end; 4607 mod_start = ordered_end; 4608 } else { 4609 mod_len = 0; 4610 } 4611 } 4612 4613 /* 4614 * To keep us from looping for the above case of an ordered 4615 * extent that falls inside of the logged extent. 4616 */ 4617 if (test_and_set_bit(BTRFS_ORDERED_LOGGED_CSUM, &ordered->flags)) 4618 continue; 4619 4620 list_for_each_entry(sums, &ordered->list, list) { 4621 ret = log_csums(trans, inode, log_root, sums); 4622 if (ret) 4623 return ret; 4624 } 4625 } 4626 4627 /* We're done, found all csums in the ordered extents. 
*/ 4628 if (mod_len == 0) 4629 return 0; 4630 4631 /* If we're compressed we have to save the entire range of csums. */ 4632 if (em->compress_type) { 4633 csum_offset = 0; 4634 csum_len = max(em->block_len, em->orig_block_len); 4635 } else { 4636 csum_offset = mod_start - em->start; 4637 csum_len = mod_len; 4638 } 4639 4640 /* block start is already adjusted for the file extent offset. */ 4641 csum_root = btrfs_csum_root(trans->fs_info, em->block_start); 4642 ret = btrfs_lookup_csums_list(csum_root, em->block_start + csum_offset, 4643 em->block_start + csum_offset + 4644 csum_len - 1, &ordered_sums, 0, false); 4645 if (ret) 4646 return ret; 4647 4648 while (!list_empty(&ordered_sums)) { 4649 struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next, 4650 struct btrfs_ordered_sum, 4651 list); 4652 if (!ret) 4653 ret = log_csums(trans, inode, log_root, sums); 4654 list_del(&sums->list); 4655 kfree(sums); 4656 } 4657 4658 return ret; 4659 } 4660 4661 static int log_one_extent(struct btrfs_trans_handle *trans, 4662 struct btrfs_inode *inode, 4663 const struct extent_map *em, 4664 struct btrfs_path *path, 4665 struct btrfs_log_ctx *ctx) 4666 { 4667 struct btrfs_drop_extents_args drop_args = { 0 }; 4668 struct btrfs_root *log = inode->root->log_root; 4669 struct btrfs_file_extent_item fi = { 0 }; 4670 struct extent_buffer *leaf; 4671 struct btrfs_key key; 4672 u64 extent_offset = em->start - em->orig_start; 4673 u64 block_len; 4674 int ret; 4675 4676 btrfs_set_stack_file_extent_generation(&fi, trans->transid); 4677 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) 4678 btrfs_set_stack_file_extent_type(&fi, BTRFS_FILE_EXTENT_PREALLOC); 4679 else 4680 btrfs_set_stack_file_extent_type(&fi, BTRFS_FILE_EXTENT_REG); 4681 4682 block_len = max(em->block_len, em->orig_block_len); 4683 if (em->compress_type != BTRFS_COMPRESS_NONE) { 4684 btrfs_set_stack_file_extent_disk_bytenr(&fi, em->block_start); 4685 btrfs_set_stack_file_extent_disk_num_bytes(&fi, block_len); 4686 } else if (em->block_start < EXTENT_MAP_LAST_BYTE) { 4687 btrfs_set_stack_file_extent_disk_bytenr(&fi, em->block_start - 4688 extent_offset); 4689 btrfs_set_stack_file_extent_disk_num_bytes(&fi, block_len); 4690 } 4691 4692 btrfs_set_stack_file_extent_offset(&fi, extent_offset); 4693 btrfs_set_stack_file_extent_num_bytes(&fi, em->len); 4694 btrfs_set_stack_file_extent_ram_bytes(&fi, em->ram_bytes); 4695 btrfs_set_stack_file_extent_compression(&fi, em->compress_type); 4696 4697 ret = log_extent_csums(trans, inode, log, em, ctx); 4698 if (ret) 4699 return ret; 4700 4701 /* 4702 * If this is the first time we are logging the inode in the current 4703 * transaction, we can avoid btrfs_drop_extents(), which is expensive 4704 * because it does a deletion search, which always acquires write locks 4705 * for extent buffers at levels 2, 1 and 0. This not only wastes time 4706 * but also adds significant contention in a log tree, since log trees 4707 * are small, with a root at level 2 or 3 at most, due to their short 4708 * life span. 
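 *
 * Hence btrfs_drop_extents() is only called below when ctx->logged_before is
 * true, that is, when this is not the first time the inode is logged in the
 * current transaction and the log tree may already contain file extent items
 * overlapping this range from a previous fsync.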
4709 */ 4710 if (ctx->logged_before) { 4711 drop_args.path = path; 4712 drop_args.start = em->start; 4713 drop_args.end = em->start + em->len; 4714 drop_args.replace_extent = true; 4715 drop_args.extent_item_size = sizeof(fi); 4716 ret = btrfs_drop_extents(trans, log, inode, &drop_args); 4717 if (ret) 4718 return ret; 4719 } 4720 4721 if (!drop_args.extent_inserted) { 4722 key.objectid = btrfs_ino(inode); 4723 key.type = BTRFS_EXTENT_DATA_KEY; 4724 key.offset = em->start; 4725 4726 ret = btrfs_insert_empty_item(trans, log, path, &key, 4727 sizeof(fi)); 4728 if (ret) 4729 return ret; 4730 } 4731 leaf = path->nodes[0]; 4732 write_extent_buffer(leaf, &fi, 4733 btrfs_item_ptr_offset(leaf, path->slots[0]), 4734 sizeof(fi)); 4735 btrfs_mark_buffer_dirty(leaf); 4736 4737 btrfs_release_path(path); 4738 4739 return ret; 4740 } 4741 4742 /* 4743 * Log all prealloc extents beyond the inode's i_size to make sure we do not 4744 * lose them after doing a full/fast fsync and replaying the log. We scan the 4745 * subvolume's root instead of iterating the inode's extent map tree because 4746 * otherwise we can log incorrect extent items based on extent map conversion. 4747 * That can happen due to the fact that extent maps are merged when they 4748 * are not in the extent map tree's list of modified extents. 4749 */ 4750 static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans, 4751 struct btrfs_inode *inode, 4752 struct btrfs_path *path) 4753 { 4754 struct btrfs_root *root = inode->root; 4755 struct btrfs_key key; 4756 const u64 i_size = i_size_read(&inode->vfs_inode); 4757 const u64 ino = btrfs_ino(inode); 4758 struct btrfs_path *dst_path = NULL; 4759 bool dropped_extents = false; 4760 u64 truncate_offset = i_size; 4761 struct extent_buffer *leaf; 4762 int slot; 4763 int ins_nr = 0; 4764 int start_slot; 4765 int ret; 4766 4767 if (!(inode->flags & BTRFS_INODE_PREALLOC)) 4768 return 0; 4769 4770 key.objectid = ino; 4771 key.type = BTRFS_EXTENT_DATA_KEY; 4772 key.offset = i_size; 4773 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 4774 if (ret < 0) 4775 goto out; 4776 4777 /* 4778 * We must check if there is a prealloc extent that starts before the 4779 * i_size and crosses the i_size boundary. This is to ensure later we 4780 * truncate down to the end of that extent and not to the i_size, as 4781 * otherwise we end up losing part of the prealloc extent after a log 4782 * replay and with an implicit hole if there is another prealloc extent 4783 * that starts at an offset beyond i_size. 
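 *
 * For example (offsets purely illustrative): with i_size at 100K, a prealloc
 * extent covering [64K, 128K) and another one at [128K, 192K), we must truncate
 * the logged extent items down to 128K (the end of the extent that crosses
 * i_size) instead of 100K. The crossing extent starts below i_size and is not
 * re-copied by the loop below, so truncating at 100K would leave [100K, 128K)
 * as an implicit hole after log replay.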
4784 */ 4785 ret = btrfs_previous_item(root, path, ino, BTRFS_EXTENT_DATA_KEY); 4786 if (ret < 0) 4787 goto out; 4788 4789 if (ret == 0) { 4790 struct btrfs_file_extent_item *ei; 4791 4792 leaf = path->nodes[0]; 4793 slot = path->slots[0]; 4794 ei = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); 4795 4796 if (btrfs_file_extent_type(leaf, ei) == 4797 BTRFS_FILE_EXTENT_PREALLOC) { 4798 u64 extent_end; 4799 4800 btrfs_item_key_to_cpu(leaf, &key, slot); 4801 extent_end = key.offset + 4802 btrfs_file_extent_num_bytes(leaf, ei); 4803 4804 if (extent_end > i_size) 4805 truncate_offset = extent_end; 4806 } 4807 } else { 4808 ret = 0; 4809 } 4810 4811 while (true) { 4812 leaf = path->nodes[0]; 4813 slot = path->slots[0]; 4814 4815 if (slot >= btrfs_header_nritems(leaf)) { 4816 if (ins_nr > 0) { 4817 ret = copy_items(trans, inode, dst_path, path, 4818 start_slot, ins_nr, 1, 0); 4819 if (ret < 0) 4820 goto out; 4821 ins_nr = 0; 4822 } 4823 ret = btrfs_next_leaf(root, path); 4824 if (ret < 0) 4825 goto out; 4826 if (ret > 0) { 4827 ret = 0; 4828 break; 4829 } 4830 continue; 4831 } 4832 4833 btrfs_item_key_to_cpu(leaf, &key, slot); 4834 if (key.objectid > ino) 4835 break; 4836 if (WARN_ON_ONCE(key.objectid < ino) || 4837 key.type < BTRFS_EXTENT_DATA_KEY || 4838 key.offset < i_size) { 4839 path->slots[0]++; 4840 continue; 4841 } 4842 if (!dropped_extents) { 4843 /* 4844 * Avoid logging extent items logged in past fsync calls 4845 * and leading to duplicate keys in the log tree. 4846 */ 4847 ret = truncate_inode_items(trans, root->log_root, inode, 4848 truncate_offset, 4849 BTRFS_EXTENT_DATA_KEY); 4850 if (ret) 4851 goto out; 4852 dropped_extents = true; 4853 } 4854 if (ins_nr == 0) 4855 start_slot = slot; 4856 ins_nr++; 4857 path->slots[0]++; 4858 if (!dst_path) { 4859 dst_path = btrfs_alloc_path(); 4860 if (!dst_path) { 4861 ret = -ENOMEM; 4862 goto out; 4863 } 4864 } 4865 } 4866 if (ins_nr > 0) 4867 ret = copy_items(trans, inode, dst_path, path, 4868 start_slot, ins_nr, 1, 0); 4869 out: 4870 btrfs_release_path(path); 4871 btrfs_free_path(dst_path); 4872 return ret; 4873 } 4874 4875 static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, 4876 struct btrfs_inode *inode, 4877 struct btrfs_path *path, 4878 struct btrfs_log_ctx *ctx) 4879 { 4880 struct btrfs_ordered_extent *ordered; 4881 struct btrfs_ordered_extent *tmp; 4882 struct extent_map *em, *n; 4883 struct list_head extents; 4884 struct extent_map_tree *tree = &inode->extent_tree; 4885 int ret = 0; 4886 int num = 0; 4887 4888 INIT_LIST_HEAD(&extents); 4889 4890 write_lock(&tree->lock); 4891 4892 list_for_each_entry_safe(em, n, &tree->modified_extents, list) { 4893 list_del_init(&em->list); 4894 /* 4895 * Just an arbitrary number, this can be really CPU intensive 4896 * once we start getting a lot of extents, and really once we 4897 * have a bunch of extents we just want to commit since it will 4898 * be faster. 4899 */ 4900 if (++num > 32768) { 4901 list_del_init(&tree->modified_extents); 4902 ret = -EFBIG; 4903 goto process; 4904 } 4905 4906 if (em->generation < trans->transid) 4907 continue; 4908 4909 /* We log prealloc extents beyond eof later. 
*/ 4910 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) && 4911 em->start >= i_size_read(&inode->vfs_inode)) 4912 continue; 4913 4914 /* Need a ref to keep it from getting evicted from cache */ 4915 refcount_inc(&em->refs); 4916 set_bit(EXTENT_FLAG_LOGGING, &em->flags); 4917 list_add_tail(&em->list, &extents); 4918 num++; 4919 } 4920 4921 list_sort(NULL, &extents, extent_cmp); 4922 process: 4923 while (!list_empty(&extents)) { 4924 em = list_entry(extents.next, struct extent_map, list); 4925 4926 list_del_init(&em->list); 4927 4928 /* 4929 * If we had an error we just need to delete everybody from our 4930 * private list. 4931 */ 4932 if (ret) { 4933 clear_em_logging(tree, em); 4934 free_extent_map(em); 4935 continue; 4936 } 4937 4938 write_unlock(&tree->lock); 4939 4940 ret = log_one_extent(trans, inode, em, path, ctx); 4941 write_lock(&tree->lock); 4942 clear_em_logging(tree, em); 4943 free_extent_map(em); 4944 } 4945 WARN_ON(!list_empty(&extents)); 4946 write_unlock(&tree->lock); 4947 4948 if (!ret) 4949 ret = btrfs_log_prealloc_extents(trans, inode, path); 4950 if (ret) 4951 return ret; 4952 4953 /* 4954 * We have logged all extents successfully, now make sure the commit of 4955 * the current transaction waits for the ordered extents to complete 4956 * before it commits and wipes out the log trees, otherwise we would 4957 * lose data if an ordered extents completes after the transaction 4958 * commits and a power failure happens after the transaction commit. 4959 */ 4960 list_for_each_entry_safe(ordered, tmp, &ctx->ordered_extents, log_list) { 4961 list_del_init(&ordered->log_list); 4962 set_bit(BTRFS_ORDERED_LOGGED, &ordered->flags); 4963 4964 if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) { 4965 spin_lock_irq(&inode->ordered_tree.lock); 4966 if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) { 4967 set_bit(BTRFS_ORDERED_PENDING, &ordered->flags); 4968 atomic_inc(&trans->transaction->pending_ordered); 4969 } 4970 spin_unlock_irq(&inode->ordered_tree.lock); 4971 } 4972 btrfs_put_ordered_extent(ordered); 4973 } 4974 4975 return 0; 4976 } 4977 4978 static int logged_inode_size(struct btrfs_root *log, struct btrfs_inode *inode, 4979 struct btrfs_path *path, u64 *size_ret) 4980 { 4981 struct btrfs_key key; 4982 int ret; 4983 4984 key.objectid = btrfs_ino(inode); 4985 key.type = BTRFS_INODE_ITEM_KEY; 4986 key.offset = 0; 4987 4988 ret = btrfs_search_slot(NULL, log, &key, path, 0, 0); 4989 if (ret < 0) { 4990 return ret; 4991 } else if (ret > 0) { 4992 *size_ret = 0; 4993 } else { 4994 struct btrfs_inode_item *item; 4995 4996 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 4997 struct btrfs_inode_item); 4998 *size_ret = btrfs_inode_size(path->nodes[0], item); 4999 /* 5000 * If the in-memory inode's i_size is smaller then the inode 5001 * size stored in the btree, return the inode's i_size, so 5002 * that we get a correct inode size after replaying the log 5003 * when before a power failure we had a shrinking truncate 5004 * followed by addition of a new name (rename / new hard link). 5005 * Otherwise return the inode size from the btree, to avoid 5006 * data loss when replaying a log due to previously doing a 5007 * write that expands the inode's size and logging a new name 5008 * immediately after. 5009 */ 5010 if (*size_ret > inode->vfs_inode.i_size) 5011 *size_ret = inode->vfs_inode.i_size; 5012 } 5013 5014 btrfs_release_path(path); 5015 return 0; 5016 } 5017 5018 /* 5019 * At the moment we always log all xattrs. 
This is to figure out at log replay 5020 * time which xattrs must have their deletion replayed. If a xattr is missing 5021 * in the log tree and exists in the fs/subvol tree, we delete it. This is 5022 * because if a xattr is deleted, the inode is fsynced and a power failure 5023 * happens, causing the log to be replayed the next time the fs is mounted, 5024 * we want the xattr to not exist anymore (same behaviour as other filesystems 5025 * with a journal, ext3/4, xfs, f2fs, etc). 5026 */ 5027 static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans, 5028 struct btrfs_inode *inode, 5029 struct btrfs_path *path, 5030 struct btrfs_path *dst_path) 5031 { 5032 struct btrfs_root *root = inode->root; 5033 int ret; 5034 struct btrfs_key key; 5035 const u64 ino = btrfs_ino(inode); 5036 int ins_nr = 0; 5037 int start_slot = 0; 5038 bool found_xattrs = false; 5039 5040 if (test_bit(BTRFS_INODE_NO_XATTRS, &inode->runtime_flags)) 5041 return 0; 5042 5043 key.objectid = ino; 5044 key.type = BTRFS_XATTR_ITEM_KEY; 5045 key.offset = 0; 5046 5047 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 5048 if (ret < 0) 5049 return ret; 5050 5051 while (true) { 5052 int slot = path->slots[0]; 5053 struct extent_buffer *leaf = path->nodes[0]; 5054 int nritems = btrfs_header_nritems(leaf); 5055 5056 if (slot >= nritems) { 5057 if (ins_nr > 0) { 5058 ret = copy_items(trans, inode, dst_path, path, 5059 start_slot, ins_nr, 1, 0); 5060 if (ret < 0) 5061 return ret; 5062 ins_nr = 0; 5063 } 5064 ret = btrfs_next_leaf(root, path); 5065 if (ret < 0) 5066 return ret; 5067 else if (ret > 0) 5068 break; 5069 continue; 5070 } 5071 5072 btrfs_item_key_to_cpu(leaf, &key, slot); 5073 if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY) 5074 break; 5075 5076 if (ins_nr == 0) 5077 start_slot = slot; 5078 ins_nr++; 5079 path->slots[0]++; 5080 found_xattrs = true; 5081 cond_resched(); 5082 } 5083 if (ins_nr > 0) { 5084 ret = copy_items(trans, inode, dst_path, path, 5085 start_slot, ins_nr, 1, 0); 5086 if (ret < 0) 5087 return ret; 5088 } 5089 5090 if (!found_xattrs) 5091 set_bit(BTRFS_INODE_NO_XATTRS, &inode->runtime_flags); 5092 5093 return 0; 5094 } 5095 5096 /* 5097 * When using the NO_HOLES feature if we punched a hole that causes the 5098 * deletion of entire leafs or all the extent items of the first leaf (the one 5099 * that contains the inode item and references) we may end up not processing 5100 * any extents, because there are no leafs with a generation matching the 5101 * current transaction that have extent items for our inode. So we need to find 5102 * if any holes exist and then log them. We also need to log holes after any 5103 * truncate operation that changes the inode's size. 
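 *
 * For example (sizes purely illustrative): with NO_HOLES enabled, punching a
 * hole over [0, 256K) of a file can delete every extent item in the leaf that
 * also holds the inode item, so a later fsync in the same transaction may find
 * no new extent items to copy. The function below walks the remaining extent
 * items, inserts a hole file extent item in the log for every gap it finds,
 * and finally for the gap between the last extent and i_size, if any.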
5104 */ 5105 static int btrfs_log_holes(struct btrfs_trans_handle *trans, 5106 struct btrfs_inode *inode, 5107 struct btrfs_path *path) 5108 { 5109 struct btrfs_root *root = inode->root; 5110 struct btrfs_fs_info *fs_info = root->fs_info; 5111 struct btrfs_key key; 5112 const u64 ino = btrfs_ino(inode); 5113 const u64 i_size = i_size_read(&inode->vfs_inode); 5114 u64 prev_extent_end = 0; 5115 int ret; 5116 5117 if (!btrfs_fs_incompat(fs_info, NO_HOLES) || i_size == 0) 5118 return 0; 5119 5120 key.objectid = ino; 5121 key.type = BTRFS_EXTENT_DATA_KEY; 5122 key.offset = 0; 5123 5124 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 5125 if (ret < 0) 5126 return ret; 5127 5128 while (true) { 5129 struct extent_buffer *leaf = path->nodes[0]; 5130 5131 if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { 5132 ret = btrfs_next_leaf(root, path); 5133 if (ret < 0) 5134 return ret; 5135 if (ret > 0) { 5136 ret = 0; 5137 break; 5138 } 5139 leaf = path->nodes[0]; 5140 } 5141 5142 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 5143 if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) 5144 break; 5145 5146 /* We have a hole, log it. */ 5147 if (prev_extent_end < key.offset) { 5148 const u64 hole_len = key.offset - prev_extent_end; 5149 5150 /* 5151 * Release the path to avoid deadlocks with other code 5152 * paths that search the root while holding locks on 5153 * leafs from the log root. 5154 */ 5155 btrfs_release_path(path); 5156 ret = btrfs_insert_hole_extent(trans, root->log_root, 5157 ino, prev_extent_end, 5158 hole_len); 5159 if (ret < 0) 5160 return ret; 5161 5162 /* 5163 * Search for the same key again in the root. Since it's 5164 * an extent item and we are holding the inode lock, the 5165 * key must still exist. If it doesn't just emit warning 5166 * and return an error to fall back to a transaction 5167 * commit. 5168 */ 5169 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 5170 if (ret < 0) 5171 return ret; 5172 if (WARN_ON(ret > 0)) 5173 return -ENOENT; 5174 leaf = path->nodes[0]; 5175 } 5176 5177 prev_extent_end = btrfs_file_extent_end(path); 5178 path->slots[0]++; 5179 cond_resched(); 5180 } 5181 5182 if (prev_extent_end < i_size) { 5183 u64 hole_len; 5184 5185 btrfs_release_path(path); 5186 hole_len = ALIGN(i_size - prev_extent_end, fs_info->sectorsize); 5187 ret = btrfs_insert_hole_extent(trans, root->log_root, ino, 5188 prev_extent_end, hole_len); 5189 if (ret < 0) 5190 return ret; 5191 } 5192 5193 return 0; 5194 } 5195 5196 /* 5197 * When we are logging a new inode X, check if it doesn't have a reference that 5198 * matches the reference from some other inode Y created in a past transaction 5199 * and that was renamed in the current transaction. If we don't do this, then at 5200 * log replay time we can lose inode Y (and all its files if it's a directory): 5201 * 5202 * mkdir /mnt/x 5203 * echo "hello world" > /mnt/x/foobar 5204 * sync 5205 * mv /mnt/x /mnt/y 5206 * mkdir /mnt/x # or touch /mnt/x 5207 * xfs_io -c fsync /mnt/x 5208 * <power fail> 5209 * mount fs, trigger log replay 5210 * 5211 * After the log replay procedure, we would lose the first directory and all its 5212 * files (file foobar). 
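 * (at log replay time the logged dentry for the new /mnt/x replaces the old
 * dentry that pointed to the original directory, and since the original
 * directory's new name was neither committed nor logged, that inode is left
 * without any name)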
5213 * For the case where inode Y is not a directory we simply end up losing it: 5214 * 5215 * echo "123" > /mnt/foo 5216 * sync 5217 * mv /mnt/foo /mnt/bar 5218 * echo "abc" > /mnt/foo 5219 * xfs_io -c fsync /mnt/foo 5220 * <power fail> 5221 * 5222 * We also need this for cases where a snapshot entry is replaced by some other 5223 * entry (file or directory) otherwise we end up with an unreplayable log due to 5224 * attempts to delete the snapshot entry (entry of type BTRFS_ROOT_ITEM_KEY) as 5225 * if it were a regular entry: 5226 * 5227 * mkdir /mnt/x 5228 * btrfs subvolume snapshot /mnt /mnt/x/snap 5229 * btrfs subvolume delete /mnt/x/snap 5230 * rmdir /mnt/x 5231 * mkdir /mnt/x 5232 * fsync /mnt/x or fsync some new file inside it 5233 * <power fail> 5234 * 5235 * The snapshot delete, rmdir of x, mkdir of a new x and the fsync all happen in 5236 * the same transaction. 5237 */ 5238 static int btrfs_check_ref_name_override(struct extent_buffer *eb, 5239 const int slot, 5240 const struct btrfs_key *key, 5241 struct btrfs_inode *inode, 5242 u64 *other_ino, u64 *other_parent) 5243 { 5244 int ret; 5245 struct btrfs_path *search_path; 5246 char *name = NULL; 5247 u32 name_len = 0; 5248 u32 item_size = btrfs_item_size(eb, slot); 5249 u32 cur_offset = 0; 5250 unsigned long ptr = btrfs_item_ptr_offset(eb, slot); 5251 5252 search_path = btrfs_alloc_path(); 5253 if (!search_path) 5254 return -ENOMEM; 5255 search_path->search_commit_root = 1; 5256 search_path->skip_locking = 1; 5257 5258 while (cur_offset < item_size) { 5259 u64 parent; 5260 u32 this_name_len; 5261 u32 this_len; 5262 unsigned long name_ptr; 5263 struct btrfs_dir_item *di; 5264 struct fscrypt_str name_str; 5265 5266 if (key->type == BTRFS_INODE_REF_KEY) { 5267 struct btrfs_inode_ref *iref; 5268 5269 iref = (struct btrfs_inode_ref *)(ptr + cur_offset); 5270 parent = key->offset; 5271 this_name_len = btrfs_inode_ref_name_len(eb, iref); 5272 name_ptr = (unsigned long)(iref + 1); 5273 this_len = sizeof(*iref) + this_name_len; 5274 } else { 5275 struct btrfs_inode_extref *extref; 5276 5277 extref = (struct btrfs_inode_extref *)(ptr + 5278 cur_offset); 5279 parent = btrfs_inode_extref_parent(eb, extref); 5280 this_name_len = btrfs_inode_extref_name_len(eb, extref); 5281 name_ptr = (unsigned long)&extref->name; 5282 this_len = sizeof(*extref) + this_name_len; 5283 } 5284 5285 if (this_name_len > name_len) { 5286 char *new_name; 5287 5288 new_name = krealloc(name, this_name_len, GFP_NOFS); 5289 if (!new_name) { 5290 ret = -ENOMEM; 5291 goto out; 5292 } 5293 name_len = this_name_len; 5294 name = new_name; 5295 } 5296 5297 read_extent_buffer(eb, name, name_ptr, this_name_len); 5298 5299 name_str.name = name; 5300 name_str.len = this_name_len; 5301 di = btrfs_lookup_dir_item(NULL, inode->root, search_path, 5302 parent, &name_str, 0); 5303 if (di && !IS_ERR(di)) { 5304 struct btrfs_key di_key; 5305 5306 btrfs_dir_item_key_to_cpu(search_path->nodes[0], 5307 di, &di_key); 5308 if (di_key.type == BTRFS_INODE_ITEM_KEY) { 5309 if (di_key.objectid != key->objectid) { 5310 ret = 1; 5311 *other_ino = di_key.objectid; 5312 *other_parent = parent; 5313 } else { 5314 ret = 0; 5315 } 5316 } else { 5317 ret = -EAGAIN; 5318 } 5319 goto out; 5320 } else if (IS_ERR(di)) { 5321 ret = PTR_ERR(di); 5322 goto out; 5323 } 5324 btrfs_release_path(search_path); 5325 5326 cur_offset += this_len; 5327 } 5328 ret = 0; 5329 out: 5330 btrfs_free_path(search_path); 5331 kfree(name); 5332 return ret; 5333 } 5334 5335 /* 5336 * Check if we need to log an inode. 
This is used in contexts where, while 5337 * logging an inode, we need to log another inode (either that it exists or in 5338 * full mode). This is used instead of btrfs_inode_in_log() because the latter 5339 * requires the inode to be in the log and have the log transaction committed, 5340 * while here we do not care if the log transaction was already committed - our 5341 * caller will commit the log later - and we want to avoid logging an inode 5342 * multiple times when multiple tasks have joined the same log transaction. 5343 */ 5344 static bool need_log_inode(const struct btrfs_trans_handle *trans, 5345 const struct btrfs_inode *inode) 5346 { 5347 /* 5348 * If a directory was not modified, no dentries added or removed, we can 5349 * and should avoid logging it. 5350 */ 5351 if (S_ISDIR(inode->vfs_inode.i_mode) && inode->last_trans < trans->transid) 5352 return false; 5353 5354 /* 5355 * If this inode does not have new/updated/deleted xattrs since the last 5356 * time it was logged and is flagged as logged in the current transaction, 5357 * we can skip logging it. As for new/deleted names, those are updated in 5358 * the log by link/unlink/rename operations. 5359 * In case the inode was logged and then evicted and reloaded, its 5360 * logged_trans will be 0, in which case we have to fully log it since 5361 * logged_trans is a transient field, not persisted. 5362 */ 5363 if (inode->logged_trans == trans->transid && 5364 !test_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags)) 5365 return false; 5366 5367 return true; 5368 } 5369 5370 struct btrfs_dir_list { 5371 u64 ino; 5372 struct list_head list; 5373 }; 5374 5375 /* 5376 * Log the inodes of the new dentries of a directory. 5377 * See process_dir_items_leaf() for details about why it is needed. 5378 * This is a recursive operation - if an existing dentry corresponds to a 5379 * directory, that directory's new entries are logged too (same behaviour as 5380 * ext3/4, xfs, f2fs, reiserfs, nilfs2). Note that when logging the inodes 5381 * the dentries point to, we do not acquire their VFS lock, otherwise lockdep 5382 * complains about the following circular lock dependency / possible deadlock: 5383 * 5384 * CPU0 CPU1 5385 * ---- ---- 5386 * lock(&type->i_mutex_dir_key#3/2); 5387 * lock(sb_internal#2); 5388 * lock(&type->i_mutex_dir_key#3/2); 5389 * lock(&sb->s_type->i_mutex_key#14); 5390 * 5391 * Where sb_internal is the lock (a counter that works as a lock) acquired by 5392 * sb_start_intwrite() in btrfs_start_transaction(). 5393 * Not acquiring the VFS lock of the inodes is still safe because: 5394 * 5395 * 1) For regular files we log with a mode of LOG_INODE_EXISTS. It's possible 5396 * that while logging the inode new references (names) are added or removed 5397 * from the inode, leaving the logged inode item with a link count that does 5398 * not match the number of logged inode reference items. This is fine because 5399 * at log replay time we compute the real number of links and correct the 5400 * link count in the inode item (see replay_one_buffer() and 5401 * link_to_fixup_dir()); 5402 * 5403 * 2) For directories we log with a mode of LOG_INODE_ALL.
It's possible that 5404 * while logging the inode's items new index items (key type 5405 * BTRFS_DIR_INDEX_KEY) are added to fs/subvol tree and the logged inode item 5406 * has a size that doesn't match the sum of the lengths of all the logged 5407 * names - this is ok, not a problem, because at log replay time we set the 5408 * directory's i_size to the correct value (see replay_one_name() and 5409 * overwrite_item()). 5410 */ 5411 static int log_new_dir_dentries(struct btrfs_trans_handle *trans, 5412 struct btrfs_inode *start_inode, 5413 struct btrfs_log_ctx *ctx) 5414 { 5415 struct btrfs_root *root = start_inode->root; 5416 struct btrfs_fs_info *fs_info = root->fs_info; 5417 struct btrfs_path *path; 5418 LIST_HEAD(dir_list); 5419 struct btrfs_dir_list *dir_elem; 5420 u64 ino = btrfs_ino(start_inode); 5421 int ret = 0; 5422 5423 /* 5424 * If we are logging a new name, as part of a link or rename operation, 5425 * don't bother logging new dentries, as we just want to log the names 5426 * of an inode and that any new parents exist. 5427 */ 5428 if (ctx->logging_new_name) 5429 return 0; 5430 5431 path = btrfs_alloc_path(); 5432 if (!path) 5433 return -ENOMEM; 5434 5435 while (true) { 5436 struct extent_buffer *leaf; 5437 struct btrfs_key min_key; 5438 bool continue_curr_inode = true; 5439 int nritems; 5440 int i; 5441 5442 min_key.objectid = ino; 5443 min_key.type = BTRFS_DIR_INDEX_KEY; 5444 min_key.offset = 0; 5445 again: 5446 btrfs_release_path(path); 5447 ret = btrfs_search_forward(root, &min_key, path, trans->transid); 5448 if (ret < 0) { 5449 break; 5450 } else if (ret > 0) { 5451 ret = 0; 5452 goto next; 5453 } 5454 5455 leaf = path->nodes[0]; 5456 nritems = btrfs_header_nritems(leaf); 5457 for (i = path->slots[0]; i < nritems; i++) { 5458 struct btrfs_dir_item *di; 5459 struct btrfs_key di_key; 5460 struct inode *di_inode; 5461 int log_mode = LOG_INODE_EXISTS; 5462 int type; 5463 5464 btrfs_item_key_to_cpu(leaf, &min_key, i); 5465 if (min_key.objectid != ino || 5466 min_key.type != BTRFS_DIR_INDEX_KEY) { 5467 continue_curr_inode = false; 5468 break; 5469 } 5470 5471 di = btrfs_item_ptr(leaf, i, struct btrfs_dir_item); 5472 type = btrfs_dir_ftype(leaf, di); 5473 if (btrfs_dir_transid(leaf, di) < trans->transid) 5474 continue; 5475 btrfs_dir_item_key_to_cpu(leaf, di, &di_key); 5476 if (di_key.type == BTRFS_ROOT_ITEM_KEY) 5477 continue; 5478 5479 btrfs_release_path(path); 5480 di_inode = btrfs_iget(fs_info->sb, di_key.objectid, root); 5481 if (IS_ERR(di_inode)) { 5482 ret = PTR_ERR(di_inode); 5483 goto out; 5484 } 5485 5486 if (!need_log_inode(trans, BTRFS_I(di_inode))) { 5487 btrfs_add_delayed_iput(BTRFS_I(di_inode)); 5488 break; 5489 } 5490 5491 ctx->log_new_dentries = false; 5492 if (type == BTRFS_FT_DIR) 5493 log_mode = LOG_INODE_ALL; 5494 ret = btrfs_log_inode(trans, BTRFS_I(di_inode), 5495 log_mode, ctx); 5496 btrfs_add_delayed_iput(BTRFS_I(di_inode)); 5497 if (ret) 5498 goto out; 5499 if (ctx->log_new_dentries) { 5500 dir_elem = kmalloc(sizeof(*dir_elem), GFP_NOFS); 5501 if (!dir_elem) { 5502 ret = -ENOMEM; 5503 goto out; 5504 } 5505 dir_elem->ino = di_key.objectid; 5506 list_add_tail(&dir_elem->list, &dir_list); 5507 } 5508 break; 5509 } 5510 5511 if (continue_curr_inode && min_key.offset < (u64)-1) { 5512 min_key.offset++; 5513 goto again; 5514 } 5515 5516 next: 5517 if (list_empty(&dir_list)) 5518 break; 5519 5520 dir_elem = list_first_entry(&dir_list, struct btrfs_dir_list, list); 5521 ino = dir_elem->ino; 5522 list_del(&dir_elem->list); 5523 kfree(dir_elem); 5524 } 5525 out: 
5526 btrfs_free_path(path); 5527 if (ret) { 5528 struct btrfs_dir_list *next; 5529 5530 list_for_each_entry_safe(dir_elem, next, &dir_list, list) 5531 kfree(dir_elem); 5532 } 5533 5534 return ret; 5535 } 5536 5537 struct btrfs_ino_list { 5538 u64 ino; 5539 u64 parent; 5540 struct list_head list; 5541 }; 5542 5543 static void free_conflicting_inodes(struct btrfs_log_ctx *ctx) 5544 { 5545 struct btrfs_ino_list *curr; 5546 struct btrfs_ino_list *next; 5547 5548 list_for_each_entry_safe(curr, next, &ctx->conflict_inodes, list) { 5549 list_del(&curr->list); 5550 kfree(curr); 5551 } 5552 } 5553 5554 static int conflicting_inode_is_dir(struct btrfs_root *root, u64 ino, 5555 struct btrfs_path *path) 5556 { 5557 struct btrfs_key key; 5558 int ret; 5559 5560 key.objectid = ino; 5561 key.type = BTRFS_INODE_ITEM_KEY; 5562 key.offset = 0; 5563 5564 path->search_commit_root = 1; 5565 path->skip_locking = 1; 5566 5567 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 5568 if (WARN_ON_ONCE(ret > 0)) { 5569 /* 5570 * We have previously found the inode through the commit root 5571 * so this should not happen. If it does, just error out and 5572 * fallback to a transaction commit. 5573 */ 5574 ret = -ENOENT; 5575 } else if (ret == 0) { 5576 struct btrfs_inode_item *item; 5577 5578 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 5579 struct btrfs_inode_item); 5580 if (S_ISDIR(btrfs_inode_mode(path->nodes[0], item))) 5581 ret = 1; 5582 } 5583 5584 btrfs_release_path(path); 5585 path->search_commit_root = 0; 5586 path->skip_locking = 0; 5587 5588 return ret; 5589 } 5590 5591 static int add_conflicting_inode(struct btrfs_trans_handle *trans, 5592 struct btrfs_root *root, 5593 struct btrfs_path *path, 5594 u64 ino, u64 parent, 5595 struct btrfs_log_ctx *ctx) 5596 { 5597 struct btrfs_ino_list *ino_elem; 5598 struct inode *inode; 5599 5600 /* 5601 * It's rare to have a lot of conflicting inodes, in practice it is not 5602 * common to have more than 1 or 2. We don't want to collect too many, 5603 * as we could end up logging too many inodes (even if only in 5604 * LOG_INODE_EXISTS mode) and slow down other fsyncs or transaction 5605 * commits. 5606 */ 5607 if (ctx->num_conflict_inodes >= MAX_CONFLICT_INODES) { 5608 btrfs_set_log_full_commit(trans); 5609 return BTRFS_LOG_FORCE_COMMIT; 5610 } 5611 5612 inode = btrfs_iget(root->fs_info->sb, ino, root); 5613 /* 5614 * If the other inode that had a conflicting dir entry was deleted in 5615 * the current transaction then we either: 5616 * 5617 * 1) Log the parent directory (later after adding it to the list) if 5618 * the inode is a directory. This is because it may be a deleted 5619 * subvolume/snapshot or it may be a regular directory that had 5620 * deleted subvolumes/snapshots (or subdirectories that had them), 5621 * and at the moment we can't deal with dropping subvolumes/snapshots 5622 * during log replay. So we just log the parent, which will result in 5623 * a fallback to a transaction commit if we are dealing with those 5624 * cases (last_unlink_trans will match the current transaction); 5625 * 5626 * 2) Do nothing if it's not a directory. During log replay we simply 5627 * unlink the conflicting dentry from the parent directory and then 5628 * add the dentry for our inode. Like this we can avoid logging the 5629 * parent directory (and maybe fallback to a transaction commit in 5630 * case it has a last_unlink_trans == trans->transid, due to moving 5631 * some inode from it to some other directory). 
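 *
 * In either case nothing is logged right here - at most we add an entry to
 * ctx->conflict_inodes - and the actual logging of conflicting inodes (or of
 * their parent directories) happens later, from log_conflicting_inodes().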
5632 */ 5633 if (IS_ERR(inode)) { 5634 int ret = PTR_ERR(inode); 5635 5636 if (ret != -ENOENT) 5637 return ret; 5638 5639 ret = conflicting_inode_is_dir(root, ino, path); 5640 /* Not a directory or we got an error. */ 5641 if (ret <= 0) 5642 return ret; 5643 5644 /* Conflicting inode is a directory, so we'll log its parent. */ 5645 ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS); 5646 if (!ino_elem) 5647 return -ENOMEM; 5648 ino_elem->ino = ino; 5649 ino_elem->parent = parent; 5650 list_add_tail(&ino_elem->list, &ctx->conflict_inodes); 5651 ctx->num_conflict_inodes++; 5652 5653 return 0; 5654 } 5655 5656 /* 5657 * If the inode was already logged skip it - otherwise we can hit an 5658 * infinite loop. Example: 5659 * 5660 * From the commit root (previous transaction) we have the following 5661 * inodes: 5662 * 5663 * inode 257 a directory 5664 * inode 258 with references "zz" and "zz_link" on inode 257 5665 * inode 259 with reference "a" on inode 257 5666 * 5667 * And in the current (uncommitted) transaction we have: 5668 * 5669 * inode 257 a directory, unchanged 5670 * inode 258 with references "a" and "a2" on inode 257 5671 * inode 259 with reference "zz_link" on inode 257 5672 * inode 261 with reference "zz" on inode 257 5673 * 5674 * When logging inode 261 the following infinite loop could 5675 * happen if we don't skip already logged inodes: 5676 * 5677 * - we detect inode 258 as a conflicting inode, with inode 261 5678 * on reference "zz", and log it; 5679 * 5680 * - we detect inode 259 as a conflicting inode, with inode 258 5681 * on reference "a", and log it; 5682 * 5683 * - we detect inode 258 as a conflicting inode, with inode 259 5684 * on reference "zz_link", and log it - again! After this we 5685 * repeat the above steps forever. 5686 * 5687 * Here we can use need_log_inode() because we only need to log the 5688 * inode in LOG_INODE_EXISTS mode and rename operations update the log, 5689 * so that the log ends up with the new name and without the old name. 5690 */ 5691 if (!need_log_inode(trans, BTRFS_I(inode))) { 5692 btrfs_add_delayed_iput(BTRFS_I(inode)); 5693 return 0; 5694 } 5695 5696 btrfs_add_delayed_iput(BTRFS_I(inode)); 5697 5698 ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS); 5699 if (!ino_elem) 5700 return -ENOMEM; 5701 ino_elem->ino = ino; 5702 ino_elem->parent = parent; 5703 list_add_tail(&ino_elem->list, &ctx->conflict_inodes); 5704 ctx->num_conflict_inodes++; 5705 5706 return 0; 5707 } 5708 5709 static int log_conflicting_inodes(struct btrfs_trans_handle *trans, 5710 struct btrfs_root *root, 5711 struct btrfs_log_ctx *ctx) 5712 { 5713 struct btrfs_fs_info *fs_info = root->fs_info; 5714 int ret = 0; 5715 5716 /* 5717 * Conflicting inodes are logged by the first call to btrfs_log_inode(), 5718 * otherwise we could have unbounded recursion of btrfs_log_inode() 5719 * calls. This check guarantees we can have only 1 level of recursion. 5720 */ 5721 if (ctx->logging_conflict_inodes) 5722 return 0; 5723 5724 ctx->logging_conflict_inodes = true; 5725 5726 /* 5727 * New conflicting inodes may be found and added to the list while we 5728 * are logging a conflicting inode, so keep iterating while the list is 5729 * not empty. 
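 *
 * Each iteration removes the first entry before logging it, and the nested
 * btrfs_log_inode() call may append new entries through add_conflicting_inode(),
 * which later iterations will then pick up.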
5730 */ 5731 while (!list_empty(&ctx->conflict_inodes)) { 5732 struct btrfs_ino_list *curr; 5733 struct inode *inode; 5734 u64 ino; 5735 u64 parent; 5736 5737 curr = list_first_entry(&ctx->conflict_inodes, 5738 struct btrfs_ino_list, list); 5739 ino = curr->ino; 5740 parent = curr->parent; 5741 list_del(&curr->list); 5742 kfree(curr); 5743 5744 inode = btrfs_iget(fs_info->sb, ino, root); 5745 /* 5746 * If the other inode that had a conflicting dir entry was 5747 * deleted in the current transaction, we need to log its parent 5748 * directory. See the comment at add_conflicting_inode(). 5749 */ 5750 if (IS_ERR(inode)) { 5751 ret = PTR_ERR(inode); 5752 if (ret != -ENOENT) 5753 break; 5754 5755 inode = btrfs_iget(fs_info->sb, parent, root); 5756 if (IS_ERR(inode)) { 5757 ret = PTR_ERR(inode); 5758 break; 5759 } 5760 5761 /* 5762 * Always log the directory, we cannot make this 5763 * conditional on need_log_inode() because the directory 5764 * might have been logged in LOG_INODE_EXISTS mode or 5765 * the dir index of the conflicting inode is not in a 5766 * dir index key range logged for the directory. So we 5767 * must make sure the deletion is recorded. 5768 */ 5769 ret = btrfs_log_inode(trans, BTRFS_I(inode), 5770 LOG_INODE_ALL, ctx); 5771 btrfs_add_delayed_iput(BTRFS_I(inode)); 5772 if (ret) 5773 break; 5774 continue; 5775 } 5776 5777 /* 5778 * Here we can use need_log_inode() because we only need to log 5779 * the inode in LOG_INODE_EXISTS mode and rename operations 5780 * update the log, so that the log ends up with the new name and 5781 * without the old name. 5782 * 5783 * We did this check at add_conflicting_inode(), but here we do 5784 * it again because if some other task logged the inode after 5785 * that, we can avoid doing it again. 5786 */ 5787 if (!need_log_inode(trans, BTRFS_I(inode))) { 5788 btrfs_add_delayed_iput(BTRFS_I(inode)); 5789 continue; 5790 } 5791 5792 /* 5793 * We are safe logging the other inode without acquiring its 5794 * lock as long as we log with the LOG_INODE_EXISTS mode. We 5795 * are safe against concurrent renames of the other inode as 5796 * well because during a rename we pin the log and update the 5797 * log with the new name before we unpin it. 
5798 */ 5799 ret = btrfs_log_inode(trans, BTRFS_I(inode), LOG_INODE_EXISTS, ctx); 5800 btrfs_add_delayed_iput(BTRFS_I(inode)); 5801 if (ret) 5802 break; 5803 } 5804 5805 ctx->logging_conflict_inodes = false; 5806 if (ret) 5807 free_conflicting_inodes(ctx); 5808 5809 return ret; 5810 } 5811 5812 static int copy_inode_items_to_log(struct btrfs_trans_handle *trans, 5813 struct btrfs_inode *inode, 5814 struct btrfs_key *min_key, 5815 const struct btrfs_key *max_key, 5816 struct btrfs_path *path, 5817 struct btrfs_path *dst_path, 5818 const u64 logged_isize, 5819 const int inode_only, 5820 struct btrfs_log_ctx *ctx, 5821 bool *need_log_inode_item) 5822 { 5823 const u64 i_size = i_size_read(&inode->vfs_inode); 5824 struct btrfs_root *root = inode->root; 5825 int ins_start_slot = 0; 5826 int ins_nr = 0; 5827 int ret; 5828 5829 while (1) { 5830 ret = btrfs_search_forward(root, min_key, path, trans->transid); 5831 if (ret < 0) 5832 return ret; 5833 if (ret > 0) { 5834 ret = 0; 5835 break; 5836 } 5837 again: 5838 /* Note, ins_nr might be > 0 here, cleanup outside the loop */ 5839 if (min_key->objectid != max_key->objectid) 5840 break; 5841 if (min_key->type > max_key->type) 5842 break; 5843 5844 if (min_key->type == BTRFS_INODE_ITEM_KEY) { 5845 *need_log_inode_item = false; 5846 } else if (min_key->type == BTRFS_EXTENT_DATA_KEY && 5847 min_key->offset >= i_size) { 5848 /* 5849 * Extents at and beyond eof are logged with 5850 * btrfs_log_prealloc_extents(). 5851 * Only regular files have BTRFS_EXTENT_DATA_KEY keys, 5852 * and no keys greater than that, so bail out. 5853 */ 5854 break; 5855 } else if ((min_key->type == BTRFS_INODE_REF_KEY || 5856 min_key->type == BTRFS_INODE_EXTREF_KEY) && 5857 (inode->generation == trans->transid || 5858 ctx->logging_conflict_inodes)) { 5859 u64 other_ino = 0; 5860 u64 other_parent = 0; 5861 5862 ret = btrfs_check_ref_name_override(path->nodes[0], 5863 path->slots[0], min_key, inode, 5864 &other_ino, &other_parent); 5865 if (ret < 0) { 5866 return ret; 5867 } else if (ret > 0 && 5868 other_ino != btrfs_ino(BTRFS_I(ctx->inode))) { 5869 if (ins_nr > 0) { 5870 ins_nr++; 5871 } else { 5872 ins_nr = 1; 5873 ins_start_slot = path->slots[0]; 5874 } 5875 ret = copy_items(trans, inode, dst_path, path, 5876 ins_start_slot, ins_nr, 5877 inode_only, logged_isize); 5878 if (ret < 0) 5879 return ret; 5880 ins_nr = 0; 5881 5882 btrfs_release_path(path); 5883 ret = add_conflicting_inode(trans, root, path, 5884 other_ino, 5885 other_parent, ctx); 5886 if (ret) 5887 return ret; 5888 goto next_key; 5889 } 5890 } else if (min_key->type == BTRFS_XATTR_ITEM_KEY) { 5891 /* Skip xattrs, logged later with btrfs_log_all_xattrs() */ 5892 if (ins_nr == 0) 5893 goto next_slot; 5894 ret = copy_items(trans, inode, dst_path, path, 5895 ins_start_slot, 5896 ins_nr, inode_only, logged_isize); 5897 if (ret < 0) 5898 return ret; 5899 ins_nr = 0; 5900 goto next_slot; 5901 } 5902 5903 if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) { 5904 ins_nr++; 5905 goto next_slot; 5906 } else if (!ins_nr) { 5907 ins_start_slot = path->slots[0]; 5908 ins_nr = 1; 5909 goto next_slot; 5910 } 5911 5912 ret = copy_items(trans, inode, dst_path, path, ins_start_slot, 5913 ins_nr, inode_only, logged_isize); 5914 if (ret < 0) 5915 return ret; 5916 ins_nr = 1; 5917 ins_start_slot = path->slots[0]; 5918 next_slot: 5919 path->slots[0]++; 5920 if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) { 5921 btrfs_item_key_to_cpu(path->nodes[0], min_key, 5922 path->slots[0]); 5923 goto again; 5924 } 5925 if (ins_nr) { 
5926 ret = copy_items(trans, inode, dst_path, path, 5927 ins_start_slot, ins_nr, inode_only, 5928 logged_isize); 5929 if (ret < 0) 5930 return ret; 5931 ins_nr = 0; 5932 } 5933 btrfs_release_path(path); 5934 next_key: 5935 if (min_key->offset < (u64)-1) { 5936 min_key->offset++; 5937 } else if (min_key->type < max_key->type) { 5938 min_key->type++; 5939 min_key->offset = 0; 5940 } else { 5941 break; 5942 } 5943 5944 /* 5945 * We may process many leaves full of items for our inode, so 5946 * avoid monopolizing a cpu for too long by rescheduling while 5947 * not holding locks on any tree. 5948 */ 5949 cond_resched(); 5950 } 5951 if (ins_nr) { 5952 ret = copy_items(trans, inode, dst_path, path, ins_start_slot, 5953 ins_nr, inode_only, logged_isize); 5954 if (ret) 5955 return ret; 5956 } 5957 5958 if (inode_only == LOG_INODE_ALL && S_ISREG(inode->vfs_inode.i_mode)) { 5959 /* 5960 * Release the path because otherwise we might attempt to double 5961 * lock the same leaf with btrfs_log_prealloc_extents() below. 5962 */ 5963 btrfs_release_path(path); 5964 ret = btrfs_log_prealloc_extents(trans, inode, dst_path); 5965 } 5966 5967 return ret; 5968 } 5969 5970 static int insert_delayed_items_batch(struct btrfs_trans_handle *trans, 5971 struct btrfs_root *log, 5972 struct btrfs_path *path, 5973 const struct btrfs_item_batch *batch, 5974 const struct btrfs_delayed_item *first_item) 5975 { 5976 const struct btrfs_delayed_item *curr = first_item; 5977 int ret; 5978 5979 ret = btrfs_insert_empty_items(trans, log, path, batch); 5980 if (ret) 5981 return ret; 5982 5983 for (int i = 0; i < batch->nr; i++) { 5984 char *data_ptr; 5985 5986 data_ptr = btrfs_item_ptr(path->nodes[0], path->slots[0], char); 5987 write_extent_buffer(path->nodes[0], &curr->data, 5988 (unsigned long)data_ptr, curr->data_len); 5989 curr = list_next_entry(curr, log_list); 5990 path->slots[0]++; 5991 } 5992 5993 btrfs_release_path(path); 5994 5995 return 0; 5996 } 5997 5998 static int log_delayed_insertion_items(struct btrfs_trans_handle *trans, 5999 struct btrfs_inode *inode, 6000 struct btrfs_path *path, 6001 const struct list_head *delayed_ins_list, 6002 struct btrfs_log_ctx *ctx) 6003 { 6004 /* 195 (4095 bytes of keys and sizes) fits in a single 4K page. */ 6005 const int max_batch_size = 195; 6006 const int leaf_data_size = BTRFS_LEAF_DATA_SIZE(trans->fs_info); 6007 const u64 ino = btrfs_ino(inode); 6008 struct btrfs_root *log = inode->root->log_root; 6009 struct btrfs_item_batch batch = { 6010 .nr = 0, 6011 .total_data_size = 0, 6012 }; 6013 const struct btrfs_delayed_item *first = NULL; 6014 const struct btrfs_delayed_item *curr; 6015 char *ins_data; 6016 struct btrfs_key *ins_keys; 6017 u32 *ins_sizes; 6018 u64 curr_batch_size = 0; 6019 int batch_idx = 0; 6020 int ret; 6021 6022 /* We are adding dir index items to the log tree. */ 6023 lockdep_assert_held(&inode->log_mutex); 6024 6025 /* 6026 * We collect delayed items before copying index keys from the subvolume 6027 * to the log tree. However just after we collected them, they may have 6028 * been flushed (all of them or just some of them), and therefore we 6029 * could have copied them from the subvolume tree to the log tree. 6030 * So find the first delayed item that was not yet logged (they are 6031 * sorted by index number). 6032 */ 6033 list_for_each_entry(curr, delayed_ins_list, log_list) { 6034 if (curr->index > inode->last_dir_index_offset) { 6035 first = curr; 6036 break; 6037 } 6038 } 6039 6040 /* Empty list or all delayed items were already logged. 
*/ 6041 if (!first) 6042 return 0; 6043 6044 ins_data = kmalloc(max_batch_size * sizeof(u32) + 6045 max_batch_size * sizeof(struct btrfs_key), GFP_NOFS); 6046 if (!ins_data) 6047 return -ENOMEM; 6048 ins_sizes = (u32 *)ins_data; 6049 batch.data_sizes = ins_sizes; 6050 ins_keys = (struct btrfs_key *)(ins_data + max_batch_size * sizeof(u32)); 6051 batch.keys = ins_keys; 6052 6053 curr = first; 6054 while (!list_entry_is_head(curr, delayed_ins_list, log_list)) { 6055 const u32 curr_size = curr->data_len + sizeof(struct btrfs_item); 6056 6057 if (curr_batch_size + curr_size > leaf_data_size || 6058 batch.nr == max_batch_size) { 6059 ret = insert_delayed_items_batch(trans, log, path, 6060 &batch, first); 6061 if (ret) 6062 goto out; 6063 batch_idx = 0; 6064 batch.nr = 0; 6065 batch.total_data_size = 0; 6066 curr_batch_size = 0; 6067 first = curr; 6068 } 6069 6070 ins_sizes[batch_idx] = curr->data_len; 6071 ins_keys[batch_idx].objectid = ino; 6072 ins_keys[batch_idx].type = BTRFS_DIR_INDEX_KEY; 6073 ins_keys[batch_idx].offset = curr->index; 6074 curr_batch_size += curr_size; 6075 batch.total_data_size += curr->data_len; 6076 batch.nr++; 6077 batch_idx++; 6078 curr = list_next_entry(curr, log_list); 6079 } 6080 6081 ASSERT(batch.nr >= 1); 6082 ret = insert_delayed_items_batch(trans, log, path, &batch, first); 6083 6084 curr = list_last_entry(delayed_ins_list, struct btrfs_delayed_item, 6085 log_list); 6086 inode->last_dir_index_offset = curr->index; 6087 out: 6088 kfree(ins_data); 6089 6090 return ret; 6091 } 6092 6093 static int log_delayed_deletions_full(struct btrfs_trans_handle *trans, 6094 struct btrfs_inode *inode, 6095 struct btrfs_path *path, 6096 const struct list_head *delayed_del_list, 6097 struct btrfs_log_ctx *ctx) 6098 { 6099 const u64 ino = btrfs_ino(inode); 6100 const struct btrfs_delayed_item *curr; 6101 6102 curr = list_first_entry(delayed_del_list, struct btrfs_delayed_item, 6103 log_list); 6104 6105 while (!list_entry_is_head(curr, delayed_del_list, log_list)) { 6106 u64 first_dir_index = curr->index; 6107 u64 last_dir_index; 6108 const struct btrfs_delayed_item *next; 6109 int ret; 6110 6111 /* 6112 * Find a range of consecutive dir index items to delete. Like 6113 * this we log a single dir range item spanning several contiguous 6114 * dir items instead of logging one range item per dir index item. 
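 *
 * For example, for delayed deletions of the dir index numbers 10, 11, 12
 * and 15, we log one range item covering [10, 12] and another one
 * covering [15, 15], instead of one range item per index number.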
6115 */ 6116 next = list_next_entry(curr, log_list); 6117 while (!list_entry_is_head(next, delayed_del_list, log_list)) { 6118 if (next->index != curr->index + 1) 6119 break; 6120 curr = next; 6121 next = list_next_entry(next, log_list); 6122 } 6123 6124 last_dir_index = curr->index; 6125 ASSERT(last_dir_index >= first_dir_index); 6126 6127 ret = insert_dir_log_key(trans, inode->root->log_root, path, 6128 ino, first_dir_index, last_dir_index); 6129 if (ret) 6130 return ret; 6131 curr = list_next_entry(curr, log_list); 6132 } 6133 6134 return 0; 6135 } 6136 6137 static int batch_delete_dir_index_items(struct btrfs_trans_handle *trans, 6138 struct btrfs_inode *inode, 6139 struct btrfs_path *path, 6140 struct btrfs_log_ctx *ctx, 6141 const struct list_head *delayed_del_list, 6142 const struct btrfs_delayed_item *first, 6143 const struct btrfs_delayed_item **last_ret) 6144 { 6145 const struct btrfs_delayed_item *next; 6146 struct extent_buffer *leaf = path->nodes[0]; 6147 const int last_slot = btrfs_header_nritems(leaf) - 1; 6148 int slot = path->slots[0] + 1; 6149 const u64 ino = btrfs_ino(inode); 6150 6151 next = list_next_entry(first, log_list); 6152 6153 while (slot < last_slot && 6154 !list_entry_is_head(next, delayed_del_list, log_list)) { 6155 struct btrfs_key key; 6156 6157 btrfs_item_key_to_cpu(leaf, &key, slot); 6158 if (key.objectid != ino || 6159 key.type != BTRFS_DIR_INDEX_KEY || 6160 key.offset != next->index) 6161 break; 6162 6163 slot++; 6164 *last_ret = next; 6165 next = list_next_entry(next, log_list); 6166 } 6167 6168 return btrfs_del_items(trans, inode->root->log_root, path, 6169 path->slots[0], slot - path->slots[0]); 6170 } 6171 6172 static int log_delayed_deletions_incremental(struct btrfs_trans_handle *trans, 6173 struct btrfs_inode *inode, 6174 struct btrfs_path *path, 6175 const struct list_head *delayed_del_list, 6176 struct btrfs_log_ctx *ctx) 6177 { 6178 struct btrfs_root *log = inode->root->log_root; 6179 const struct btrfs_delayed_item *curr; 6180 u64 last_range_start; 6181 u64 last_range_end = 0; 6182 struct btrfs_key key; 6183 6184 key.objectid = btrfs_ino(inode); 6185 key.type = BTRFS_DIR_INDEX_KEY; 6186 curr = list_first_entry(delayed_del_list, struct btrfs_delayed_item, 6187 log_list); 6188 6189 while (!list_entry_is_head(curr, delayed_del_list, log_list)) { 6190 const struct btrfs_delayed_item *last = curr; 6191 u64 first_dir_index = curr->index; 6192 u64 last_dir_index; 6193 bool deleted_items = false; 6194 int ret; 6195 6196 key.offset = curr->index; 6197 ret = btrfs_search_slot(trans, log, &key, path, -1, 1); 6198 if (ret < 0) { 6199 return ret; 6200 } else if (ret == 0) { 6201 ret = batch_delete_dir_index_items(trans, inode, path, ctx, 6202 delayed_del_list, curr, 6203 &last); 6204 if (ret) 6205 return ret; 6206 deleted_items = true; 6207 } 6208 6209 btrfs_release_path(path); 6210 6211 /* 6212 * If we deleted items from the leaf, it means we have a range 6213 * item logging their range, so no need to add one or update an 6214 * existing one. Otherwise we have to log a dir range item. 6215 */ 6216 if (deleted_items) 6217 goto next_batch; 6218 6219 last_dir_index = last->index; 6220 ASSERT(last_dir_index >= first_dir_index); 6221 /* 6222 * If this range starts right after where the previous one ends, 6223 * then we want to reuse the previous range item and change its 6224 * end offset to the end of this range. This is just to minimize 6225 * leaf space usage, by avoiding adding a new range item. 
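 *
 * For example, if the previous batch resulted in a range item for
 * [10, 15] and the current batch covers dir indexes 16 to 20, we end up
 * with a single range item covering [10, 20] instead of adding a second
 * item for [16, 20].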
6226 */ 6227 if (last_range_end != 0 && first_dir_index == last_range_end + 1) 6228 first_dir_index = last_range_start; 6229 6230 ret = insert_dir_log_key(trans, log, path, key.objectid, 6231 first_dir_index, last_dir_index); 6232 if (ret) 6233 return ret; 6234 6235 last_range_start = first_dir_index; 6236 last_range_end = last_dir_index; 6237 next_batch: 6238 curr = list_next_entry(last, log_list); 6239 } 6240 6241 return 0; 6242 } 6243 6244 static int log_delayed_deletion_items(struct btrfs_trans_handle *trans, 6245 struct btrfs_inode *inode, 6246 struct btrfs_path *path, 6247 const struct list_head *delayed_del_list, 6248 struct btrfs_log_ctx *ctx) 6249 { 6250 /* 6251 * We are deleting dir index items from the log tree or adding range 6252 * items to it. 6253 */ 6254 lockdep_assert_held(&inode->log_mutex); 6255 6256 if (list_empty(delayed_del_list)) 6257 return 0; 6258 6259 if (ctx->logged_before) 6260 return log_delayed_deletions_incremental(trans, inode, path, 6261 delayed_del_list, ctx); 6262 6263 return log_delayed_deletions_full(trans, inode, path, delayed_del_list, 6264 ctx); 6265 } 6266 6267 /* 6268 * Similar logic as for log_new_dir_dentries(), but it iterates over the delayed 6269 * items instead of the subvolume tree. 6270 */ 6271 static int log_new_delayed_dentries(struct btrfs_trans_handle *trans, 6272 struct btrfs_inode *inode, 6273 const struct list_head *delayed_ins_list, 6274 struct btrfs_log_ctx *ctx) 6275 { 6276 const bool orig_log_new_dentries = ctx->log_new_dentries; 6277 struct btrfs_fs_info *fs_info = trans->fs_info; 6278 struct btrfs_delayed_item *item; 6279 int ret = 0; 6280 6281 /* 6282 * No need for the log mutex, plus to avoid potential deadlocks or 6283 * lockdep annotations due to nesting of delayed inode mutexes and log 6284 * mutexes. 6285 */ 6286 lockdep_assert_not_held(&inode->log_mutex); 6287 6288 ASSERT(!ctx->logging_new_delayed_dentries); 6289 ctx->logging_new_delayed_dentries = true; 6290 6291 list_for_each_entry(item, delayed_ins_list, log_list) { 6292 struct btrfs_dir_item *dir_item; 6293 struct inode *di_inode; 6294 struct btrfs_key key; 6295 int log_mode = LOG_INODE_EXISTS; 6296 6297 dir_item = (struct btrfs_dir_item *)item->data; 6298 btrfs_disk_key_to_cpu(&key, &dir_item->location); 6299 6300 if (key.type == BTRFS_ROOT_ITEM_KEY) 6301 continue; 6302 6303 di_inode = btrfs_iget(fs_info->sb, key.objectid, inode->root); 6304 if (IS_ERR(di_inode)) { 6305 ret = PTR_ERR(di_inode); 6306 break; 6307 } 6308 6309 if (!need_log_inode(trans, BTRFS_I(di_inode))) { 6310 btrfs_add_delayed_iput(BTRFS_I(di_inode)); 6311 continue; 6312 } 6313 6314 if (btrfs_stack_dir_ftype(dir_item) == BTRFS_FT_DIR) 6315 log_mode = LOG_INODE_ALL; 6316 6317 ctx->log_new_dentries = false; 6318 ret = btrfs_log_inode(trans, BTRFS_I(di_inode), log_mode, ctx); 6319 6320 if (!ret && ctx->log_new_dentries) 6321 ret = log_new_dir_dentries(trans, BTRFS_I(di_inode), ctx); 6322 6323 btrfs_add_delayed_iput(BTRFS_I(di_inode)); 6324 6325 if (ret) 6326 break; 6327 } 6328 6329 ctx->log_new_dentries = orig_log_new_dentries; 6330 ctx->logging_new_delayed_dentries = false; 6331 6332 return ret; 6333 } 6334 6335 /* log a single inode in the tree log. 6336 * At least one parent directory for this inode must exist in the tree 6337 * or be logged already. 6338 * 6339 * Any items from this inode changed by the current transaction are copied 6340 * to the log tree. 
An extra reference is taken on any extents in this 6341 * file, allowing us to avoid a whole pile of corner cases around logging 6342 * blocks that have been removed from the tree. 6343 * 6344 * See LOG_INODE_ALL and related defines for a description of what inode_only 6345 * does. 6346 * 6347 * This handles both files and directories. 6348 */ 6349 static int btrfs_log_inode(struct btrfs_trans_handle *trans, 6350 struct btrfs_inode *inode, 6351 int inode_only, 6352 struct btrfs_log_ctx *ctx) 6353 { 6354 struct btrfs_path *path; 6355 struct btrfs_path *dst_path; 6356 struct btrfs_key min_key; 6357 struct btrfs_key max_key; 6358 struct btrfs_root *log = inode->root->log_root; 6359 int ret; 6360 bool fast_search = false; 6361 u64 ino = btrfs_ino(inode); 6362 struct extent_map_tree *em_tree = &inode->extent_tree; 6363 u64 logged_isize = 0; 6364 bool need_log_inode_item = true; 6365 bool xattrs_logged = false; 6366 bool inode_item_dropped = true; 6367 bool full_dir_logging = false; 6368 LIST_HEAD(delayed_ins_list); 6369 LIST_HEAD(delayed_del_list); 6370 6371 path = btrfs_alloc_path(); 6372 if (!path) 6373 return -ENOMEM; 6374 dst_path = btrfs_alloc_path(); 6375 if (!dst_path) { 6376 btrfs_free_path(path); 6377 return -ENOMEM; 6378 } 6379 6380 min_key.objectid = ino; 6381 min_key.type = BTRFS_INODE_ITEM_KEY; 6382 min_key.offset = 0; 6383 6384 max_key.objectid = ino; 6385 6386 6387 /* today the code can only do partial logging of directories */ 6388 if (S_ISDIR(inode->vfs_inode.i_mode) || 6389 (!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 6390 &inode->runtime_flags) && 6391 inode_only >= LOG_INODE_EXISTS)) 6392 max_key.type = BTRFS_XATTR_ITEM_KEY; 6393 else 6394 max_key.type = (u8)-1; 6395 max_key.offset = (u64)-1; 6396 6397 if (S_ISDIR(inode->vfs_inode.i_mode) && inode_only == LOG_INODE_ALL) 6398 full_dir_logging = true; 6399 6400 /* 6401 * If we are logging a directory while we are logging dentries of the 6402 * delayed items of some other inode, then we need to flush the delayed 6403 * items of this directory and not log the delayed items directly. This 6404 * is to prevent more than one level of recursion into btrfs_log_inode() 6405 * by having something like this: 6406 * 6407 * $ mkdir -p a/b/c/d/e/f/g/h/... 6408 * $ xfs_io -c "fsync" a 6409 * 6410 * Where all directories in the path did not exist before and are 6411 * created in the current transaction. 6412 * So in such a case we directly log the delayed items of the main 6413 * directory ("a") without flushing them first, while for each of its 6414 * subdirectories we flush their delayed items before logging them. 6415 * This prevents a potential unbounded recursion like this: 6416 * 6417 * btrfs_log_inode() 6418 * log_new_delayed_dentries() 6419 * btrfs_log_inode() 6420 * log_new_delayed_dentries() 6421 * btrfs_log_inode() 6422 * log_new_delayed_dentries() 6423 * (...) 6424 * 6425 * We have thresholds for the maximum number of delayed items to have in 6426 * memory, and once they are hit, the items are flushed asynchronously. 6427 * However the limit is quite high, so lets prevent deep levels of 6428 * recursion to happen by limiting the maximum depth to be 1. 
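 *
 * The depth limit works because log_new_delayed_dentries() sets
 * ctx->logging_new_delayed_dentries before logging the child directories,
 * so when btrfs_log_inode() runs for one of those directories it takes
 * the flush path right below instead of logging its delayed items
 * directly.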
6429  */
6430     if (full_dir_logging && ctx->logging_new_delayed_dentries) {
6431         ret = btrfs_commit_inode_delayed_items(trans, inode);
6432         if (ret)
6433             goto out;
6434     }
6435
6436     mutex_lock(&inode->log_mutex);
6437
6438     /*
6439      * For symlinks, we must always log their content, which is stored in an
6440      * inline extent, otherwise we could end up with an empty symlink after
6441      * log replay, which is invalid on linux (symlink(2) returns -ENOENT if
6442      * one attempts to create an empty symlink).
6443      * We don't need to worry about flushing delalloc, because the inline
6444      * extent is created when the symlink is created (we never have delalloc
6445      * for symlinks).
6446      */
6447     if (S_ISLNK(inode->vfs_inode.i_mode))
6448         inode_only = LOG_INODE_ALL;
6449
6450     /*
6451      * Before logging the inode item, cache the value returned by
6452      * inode_logged(), because later on we need to know whether the inode
6453      * was logged before in this transaction.
6454      */
6455     ret = inode_logged(trans, inode, path);
6456     if (ret < 0)
6457         goto out_unlock;
6458     ctx->logged_before = (ret == 1);
6459     ret = 0;
6460
6461     /*
6462      * This is for cases where logging a directory could result in losing
6463      * a file after replaying the log. For example, if we move a file from a
6464      * directory A to a directory B, then fsync directory A, we have no way
6465      * to know the file was moved from A to B, so logging just A would
6466      * result in losing the file after a log replay.
6467      */
6468     if (full_dir_logging && inode->last_unlink_trans >= trans->transid) {
6469         btrfs_set_log_full_commit(trans);
6470         ret = BTRFS_LOG_FORCE_COMMIT;
6471         goto out_unlock;
6472     }
6473
6474     /*
6475      * A brute force approach to making sure we get the most up to date
6476      * copies of everything.
6477      */
6478     if (S_ISDIR(inode->vfs_inode.i_mode)) {
6479         clear_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags);
6480         if (ctx->logged_before)
6481             ret = drop_inode_items(trans, log, path, inode,
6482                            BTRFS_XATTR_ITEM_KEY);
6483     } else {
6484         if (inode_only == LOG_INODE_EXISTS && ctx->logged_before) {
6485             /*
6486              * Make sure the new inode item we write to the log has
6487              * the same isize as the current one (if it exists).
6488              * This is necessary to prevent data loss after log
6489              * replay, and also to prevent doing a wrong expanding
6490              * truncate - for e.g. create file, write 4K into offset
6491              * 0, fsync, write 4K into offset 4096, add hard link,
6492              * fsync some other file (to sync log), power fail - if
6493              * we use the inode's current i_size, after log replay
6494              * we get an 8K file, with the last 4K extent as a hole
6495              * (zeroes), as if an expanding truncate happened,
6496              * instead of getting a file of 4K only.
6497 */ 6498 ret = logged_inode_size(log, inode, path, &logged_isize); 6499 if (ret) 6500 goto out_unlock; 6501 } 6502 if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 6503 &inode->runtime_flags)) { 6504 if (inode_only == LOG_INODE_EXISTS) { 6505 max_key.type = BTRFS_XATTR_ITEM_KEY; 6506 if (ctx->logged_before) 6507 ret = drop_inode_items(trans, log, path, 6508 inode, max_key.type); 6509 } else { 6510 clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 6511 &inode->runtime_flags); 6512 clear_bit(BTRFS_INODE_COPY_EVERYTHING, 6513 &inode->runtime_flags); 6514 if (ctx->logged_before) 6515 ret = truncate_inode_items(trans, log, 6516 inode, 0, 0); 6517 } 6518 } else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING, 6519 &inode->runtime_flags) || 6520 inode_only == LOG_INODE_EXISTS) { 6521 if (inode_only == LOG_INODE_ALL) 6522 fast_search = true; 6523 max_key.type = BTRFS_XATTR_ITEM_KEY; 6524 if (ctx->logged_before) 6525 ret = drop_inode_items(trans, log, path, inode, 6526 max_key.type); 6527 } else { 6528 if (inode_only == LOG_INODE_ALL) 6529 fast_search = true; 6530 inode_item_dropped = false; 6531 goto log_extents; 6532 } 6533 6534 } 6535 if (ret) 6536 goto out_unlock; 6537 6538 /* 6539 * If we are logging a directory in full mode, collect the delayed items 6540 * before iterating the subvolume tree, so that we don't miss any new 6541 * dir index items in case they get flushed while or right after we are 6542 * iterating the subvolume tree. 6543 */ 6544 if (full_dir_logging && !ctx->logging_new_delayed_dentries) 6545 btrfs_log_get_delayed_items(inode, &delayed_ins_list, 6546 &delayed_del_list); 6547 6548 ret = copy_inode_items_to_log(trans, inode, &min_key, &max_key, 6549 path, dst_path, logged_isize, 6550 inode_only, ctx, 6551 &need_log_inode_item); 6552 if (ret) 6553 goto out_unlock; 6554 6555 btrfs_release_path(path); 6556 btrfs_release_path(dst_path); 6557 ret = btrfs_log_all_xattrs(trans, inode, path, dst_path); 6558 if (ret) 6559 goto out_unlock; 6560 xattrs_logged = true; 6561 if (max_key.type >= BTRFS_EXTENT_DATA_KEY && !fast_search) { 6562 btrfs_release_path(path); 6563 btrfs_release_path(dst_path); 6564 ret = btrfs_log_holes(trans, inode, path); 6565 if (ret) 6566 goto out_unlock; 6567 } 6568 log_extents: 6569 btrfs_release_path(path); 6570 btrfs_release_path(dst_path); 6571 if (need_log_inode_item) { 6572 ret = log_inode_item(trans, log, dst_path, inode, inode_item_dropped); 6573 if (ret) 6574 goto out_unlock; 6575 /* 6576 * If we are doing a fast fsync and the inode was logged before 6577 * in this transaction, we don't need to log the xattrs because 6578 * they were logged before. If xattrs were added, changed or 6579 * deleted since the last time we logged the inode, then we have 6580 * already logged them because the inode had the runtime flag 6581 * BTRFS_INODE_COPY_EVERYTHING set. 
6582 */ 6583 if (!xattrs_logged && inode->logged_trans < trans->transid) { 6584 ret = btrfs_log_all_xattrs(trans, inode, path, dst_path); 6585 if (ret) 6586 goto out_unlock; 6587 btrfs_release_path(path); 6588 } 6589 } 6590 if (fast_search) { 6591 ret = btrfs_log_changed_extents(trans, inode, dst_path, ctx); 6592 if (ret) 6593 goto out_unlock; 6594 } else if (inode_only == LOG_INODE_ALL) { 6595 struct extent_map *em, *n; 6596 6597 write_lock(&em_tree->lock); 6598 list_for_each_entry_safe(em, n, &em_tree->modified_extents, list) 6599 list_del_init(&em->list); 6600 write_unlock(&em_tree->lock); 6601 } 6602 6603 if (full_dir_logging) { 6604 ret = log_directory_changes(trans, inode, path, dst_path, ctx); 6605 if (ret) 6606 goto out_unlock; 6607 ret = log_delayed_insertion_items(trans, inode, path, 6608 &delayed_ins_list, ctx); 6609 if (ret) 6610 goto out_unlock; 6611 ret = log_delayed_deletion_items(trans, inode, path, 6612 &delayed_del_list, ctx); 6613 if (ret) 6614 goto out_unlock; 6615 } 6616 6617 spin_lock(&inode->lock); 6618 inode->logged_trans = trans->transid; 6619 /* 6620 * Don't update last_log_commit if we logged that an inode exists. 6621 * We do this for three reasons: 6622 * 6623 * 1) We might have had buffered writes to this inode that were 6624 * flushed and had their ordered extents completed in this 6625 * transaction, but we did not previously log the inode with 6626 * LOG_INODE_ALL. Later the inode was evicted and after that 6627 * it was loaded again and this LOG_INODE_EXISTS log operation 6628 * happened. We must make sure that if an explicit fsync against 6629 * the inode is performed later, it logs the new extents, an 6630 * updated inode item, etc, and syncs the log. The same logic 6631 * applies to direct IO writes instead of buffered writes. 6632 * 6633 * 2) When we log the inode with LOG_INODE_EXISTS, its inode item 6634 * is logged with an i_size of 0 or whatever value was logged 6635 * before. If later the i_size of the inode is increased by a 6636 * truncate operation, the log is synced through an fsync of 6637 * some other inode and then finally an explicit fsync against 6638 * this inode is made, we must make sure this fsync logs the 6639 * inode with the new i_size, the hole between old i_size and 6640 * the new i_size, and syncs the log. 6641 * 6642 * 3) If we are logging that an ancestor inode exists as part of 6643 * logging a new name from a link or rename operation, don't update 6644 * its last_log_commit - otherwise if an explicit fsync is made 6645 * against an ancestor, the fsync considers the inode in the log 6646 * and doesn't sync the log, resulting in the ancestor missing after 6647 * a power failure unless the log was synced as part of an fsync 6648 * against any other unrelated inode. 6649 */ 6650 if (inode_only != LOG_INODE_EXISTS) 6651 inode->last_log_commit = inode->last_sub_trans; 6652 spin_unlock(&inode->lock); 6653 6654 /* 6655 * Reset the last_reflink_trans so that the next fsync does not need to 6656 * go through the slower path when logging extents and their checksums. 
6657 */ 6658 if (inode_only == LOG_INODE_ALL) 6659 inode->last_reflink_trans = 0; 6660 6661 out_unlock: 6662 mutex_unlock(&inode->log_mutex); 6663 out: 6664 btrfs_free_path(path); 6665 btrfs_free_path(dst_path); 6666 6667 if (ret) 6668 free_conflicting_inodes(ctx); 6669 else 6670 ret = log_conflicting_inodes(trans, inode->root, ctx); 6671 6672 if (full_dir_logging && !ctx->logging_new_delayed_dentries) { 6673 if (!ret) 6674 ret = log_new_delayed_dentries(trans, inode, 6675 &delayed_ins_list, ctx); 6676 6677 btrfs_log_put_delayed_items(inode, &delayed_ins_list, 6678 &delayed_del_list); 6679 } 6680 6681 return ret; 6682 } 6683 6684 static int btrfs_log_all_parents(struct btrfs_trans_handle *trans, 6685 struct btrfs_inode *inode, 6686 struct btrfs_log_ctx *ctx) 6687 { 6688 struct btrfs_fs_info *fs_info = trans->fs_info; 6689 int ret; 6690 struct btrfs_path *path; 6691 struct btrfs_key key; 6692 struct btrfs_root *root = inode->root; 6693 const u64 ino = btrfs_ino(inode); 6694 6695 path = btrfs_alloc_path(); 6696 if (!path) 6697 return -ENOMEM; 6698 path->skip_locking = 1; 6699 path->search_commit_root = 1; 6700 6701 key.objectid = ino; 6702 key.type = BTRFS_INODE_REF_KEY; 6703 key.offset = 0; 6704 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 6705 if (ret < 0) 6706 goto out; 6707 6708 while (true) { 6709 struct extent_buffer *leaf = path->nodes[0]; 6710 int slot = path->slots[0]; 6711 u32 cur_offset = 0; 6712 u32 item_size; 6713 unsigned long ptr; 6714 6715 if (slot >= btrfs_header_nritems(leaf)) { 6716 ret = btrfs_next_leaf(root, path); 6717 if (ret < 0) 6718 goto out; 6719 else if (ret > 0) 6720 break; 6721 continue; 6722 } 6723 6724 btrfs_item_key_to_cpu(leaf, &key, slot); 6725 /* BTRFS_INODE_EXTREF_KEY is BTRFS_INODE_REF_KEY + 1 */ 6726 if (key.objectid != ino || key.type > BTRFS_INODE_EXTREF_KEY) 6727 break; 6728 6729 item_size = btrfs_item_size(leaf, slot); 6730 ptr = btrfs_item_ptr_offset(leaf, slot); 6731 while (cur_offset < item_size) { 6732 struct btrfs_key inode_key; 6733 struct inode *dir_inode; 6734 6735 inode_key.type = BTRFS_INODE_ITEM_KEY; 6736 inode_key.offset = 0; 6737 6738 if (key.type == BTRFS_INODE_EXTREF_KEY) { 6739 struct btrfs_inode_extref *extref; 6740 6741 extref = (struct btrfs_inode_extref *) 6742 (ptr + cur_offset); 6743 inode_key.objectid = btrfs_inode_extref_parent( 6744 leaf, extref); 6745 cur_offset += sizeof(*extref); 6746 cur_offset += btrfs_inode_extref_name_len(leaf, 6747 extref); 6748 } else { 6749 inode_key.objectid = key.offset; 6750 cur_offset = item_size; 6751 } 6752 6753 dir_inode = btrfs_iget(fs_info->sb, inode_key.objectid, 6754 root); 6755 /* 6756 * If the parent inode was deleted, return an error to 6757 * fallback to a transaction commit. This is to prevent 6758 * getting an inode that was moved from one parent A to 6759 * a parent B, got its former parent A deleted and then 6760 * it got fsync'ed, from existing at both parents after 6761 * a log replay (and the old parent still existing). 6762 * Example: 6763 * 6764 * mkdir /mnt/A 6765 * mkdir /mnt/B 6766 * touch /mnt/B/bar 6767 * sync 6768 * mv /mnt/B/bar /mnt/A/bar 6769 * mv -T /mnt/A /mnt/B 6770 * fsync /mnt/B/bar 6771 * <power fail> 6772 * 6773 * If we ignore the old parent B which got deleted, 6774 * after a log replay we would have file bar linked 6775 * at both parents and the old parent B would still 6776 * exist. 
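 *
 * Returning the error from btrfs_iget() makes our caller
 * (btrfs_log_inode_parent()) mark the log for a full commit, so the
 * fsync ends up falling back to a transaction commit.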
6777 */ 6778 if (IS_ERR(dir_inode)) { 6779 ret = PTR_ERR(dir_inode); 6780 goto out; 6781 } 6782 6783 if (!need_log_inode(trans, BTRFS_I(dir_inode))) { 6784 btrfs_add_delayed_iput(BTRFS_I(dir_inode)); 6785 continue; 6786 } 6787 6788 ctx->log_new_dentries = false; 6789 ret = btrfs_log_inode(trans, BTRFS_I(dir_inode), 6790 LOG_INODE_ALL, ctx); 6791 if (!ret && ctx->log_new_dentries) 6792 ret = log_new_dir_dentries(trans, 6793 BTRFS_I(dir_inode), ctx); 6794 btrfs_add_delayed_iput(BTRFS_I(dir_inode)); 6795 if (ret) 6796 goto out; 6797 } 6798 path->slots[0]++; 6799 } 6800 ret = 0; 6801 out: 6802 btrfs_free_path(path); 6803 return ret; 6804 } 6805 6806 static int log_new_ancestors(struct btrfs_trans_handle *trans, 6807 struct btrfs_root *root, 6808 struct btrfs_path *path, 6809 struct btrfs_log_ctx *ctx) 6810 { 6811 struct btrfs_key found_key; 6812 6813 btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]); 6814 6815 while (true) { 6816 struct btrfs_fs_info *fs_info = root->fs_info; 6817 struct extent_buffer *leaf = path->nodes[0]; 6818 int slot = path->slots[0]; 6819 struct btrfs_key search_key; 6820 struct inode *inode; 6821 u64 ino; 6822 int ret = 0; 6823 6824 btrfs_release_path(path); 6825 6826 ino = found_key.offset; 6827 6828 search_key.objectid = found_key.offset; 6829 search_key.type = BTRFS_INODE_ITEM_KEY; 6830 search_key.offset = 0; 6831 inode = btrfs_iget(fs_info->sb, ino, root); 6832 if (IS_ERR(inode)) 6833 return PTR_ERR(inode); 6834 6835 if (BTRFS_I(inode)->generation >= trans->transid && 6836 need_log_inode(trans, BTRFS_I(inode))) 6837 ret = btrfs_log_inode(trans, BTRFS_I(inode), 6838 LOG_INODE_EXISTS, ctx); 6839 btrfs_add_delayed_iput(BTRFS_I(inode)); 6840 if (ret) 6841 return ret; 6842 6843 if (search_key.objectid == BTRFS_FIRST_FREE_OBJECTID) 6844 break; 6845 6846 search_key.type = BTRFS_INODE_REF_KEY; 6847 ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); 6848 if (ret < 0) 6849 return ret; 6850 6851 leaf = path->nodes[0]; 6852 slot = path->slots[0]; 6853 if (slot >= btrfs_header_nritems(leaf)) { 6854 ret = btrfs_next_leaf(root, path); 6855 if (ret < 0) 6856 return ret; 6857 else if (ret > 0) 6858 return -ENOENT; 6859 leaf = path->nodes[0]; 6860 slot = path->slots[0]; 6861 } 6862 6863 btrfs_item_key_to_cpu(leaf, &found_key, slot); 6864 if (found_key.objectid != search_key.objectid || 6865 found_key.type != BTRFS_INODE_REF_KEY) 6866 return -ENOENT; 6867 } 6868 return 0; 6869 } 6870 6871 static int log_new_ancestors_fast(struct btrfs_trans_handle *trans, 6872 struct btrfs_inode *inode, 6873 struct dentry *parent, 6874 struct btrfs_log_ctx *ctx) 6875 { 6876 struct btrfs_root *root = inode->root; 6877 struct dentry *old_parent = NULL; 6878 struct super_block *sb = inode->vfs_inode.i_sb; 6879 int ret = 0; 6880 6881 while (true) { 6882 if (!parent || d_really_is_negative(parent) || 6883 sb != parent->d_sb) 6884 break; 6885 6886 inode = BTRFS_I(d_inode(parent)); 6887 if (root != inode->root) 6888 break; 6889 6890 if (inode->generation >= trans->transid && 6891 need_log_inode(trans, inode)) { 6892 ret = btrfs_log_inode(trans, inode, 6893 LOG_INODE_EXISTS, ctx); 6894 if (ret) 6895 break; 6896 } 6897 if (IS_ROOT(parent)) 6898 break; 6899 6900 parent = dget_parent(parent); 6901 dput(old_parent); 6902 old_parent = parent; 6903 } 6904 dput(old_parent); 6905 6906 return ret; 6907 } 6908 6909 static int log_all_new_ancestors(struct btrfs_trans_handle *trans, 6910 struct btrfs_inode *inode, 6911 struct dentry *parent, 6912 struct btrfs_log_ctx *ctx) 6913 { 6914 struct 
btrfs_root *root = inode->root; 6915 const u64 ino = btrfs_ino(inode); 6916 struct btrfs_path *path; 6917 struct btrfs_key search_key; 6918 int ret; 6919 6920 /* 6921 * For a single hard link case, go through a fast path that does not 6922 * need to iterate the fs/subvolume tree. 6923 */ 6924 if (inode->vfs_inode.i_nlink < 2) 6925 return log_new_ancestors_fast(trans, inode, parent, ctx); 6926 6927 path = btrfs_alloc_path(); 6928 if (!path) 6929 return -ENOMEM; 6930 6931 search_key.objectid = ino; 6932 search_key.type = BTRFS_INODE_REF_KEY; 6933 search_key.offset = 0; 6934 again: 6935 ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); 6936 if (ret < 0) 6937 goto out; 6938 if (ret == 0) 6939 path->slots[0]++; 6940 6941 while (true) { 6942 struct extent_buffer *leaf = path->nodes[0]; 6943 int slot = path->slots[0]; 6944 struct btrfs_key found_key; 6945 6946 if (slot >= btrfs_header_nritems(leaf)) { 6947 ret = btrfs_next_leaf(root, path); 6948 if (ret < 0) 6949 goto out; 6950 else if (ret > 0) 6951 break; 6952 continue; 6953 } 6954 6955 btrfs_item_key_to_cpu(leaf, &found_key, slot); 6956 if (found_key.objectid != ino || 6957 found_key.type > BTRFS_INODE_EXTREF_KEY) 6958 break; 6959 6960 /* 6961 * Don't deal with extended references because they are rare 6962 * cases and too complex to deal with (we would need to keep 6963 * track of which subitem we are processing for each item in 6964 * this loop, etc). So just return some error to fallback to 6965 * a transaction commit. 6966 */ 6967 if (found_key.type == BTRFS_INODE_EXTREF_KEY) { 6968 ret = -EMLINK; 6969 goto out; 6970 } 6971 6972 /* 6973 * Logging ancestors needs to do more searches on the fs/subvol 6974 * tree, so it releases the path as needed to avoid deadlocks. 6975 * Keep track of the last inode ref key and resume from that key 6976 * after logging all new ancestors for the current hard link. 6977 */ 6978 memcpy(&search_key, &found_key, sizeof(search_key)); 6979 6980 ret = log_new_ancestors(trans, root, path, ctx); 6981 if (ret) 6982 goto out; 6983 btrfs_release_path(path); 6984 goto again; 6985 } 6986 ret = 0; 6987 out: 6988 btrfs_free_path(path); 6989 return ret; 6990 } 6991 6992 /* 6993 * helper function around btrfs_log_inode to make sure newly created 6994 * parent directories also end up in the log. A minimal inode and backref 6995 * only logging is done of any parent directories that are older than 6996 * the last committed transaction 6997 */ 6998 static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, 6999 struct btrfs_inode *inode, 7000 struct dentry *parent, 7001 int inode_only, 7002 struct btrfs_log_ctx *ctx) 7003 { 7004 struct btrfs_root *root = inode->root; 7005 struct btrfs_fs_info *fs_info = root->fs_info; 7006 int ret = 0; 7007 bool log_dentries = false; 7008 7009 if (btrfs_test_opt(fs_info, NOTREELOG)) { 7010 ret = BTRFS_LOG_FORCE_COMMIT; 7011 goto end_no_trans; 7012 } 7013 7014 if (btrfs_root_refs(&root->root_item) == 0) { 7015 ret = BTRFS_LOG_FORCE_COMMIT; 7016 goto end_no_trans; 7017 } 7018 7019 /* 7020 * Skip already logged inodes or inodes corresponding to tmpfiles 7021 * (since logging them is pointless, a link count of 0 means they 7022 * will never be accessible). 
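 *
 * In both cases we return BTRFS_NO_LOG_SYNC so the fsync path knows
 * there is nothing in the log that needs to be synced for this inode.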
7023 */ 7024 if ((btrfs_inode_in_log(inode, trans->transid) && 7025 list_empty(&ctx->ordered_extents)) || 7026 inode->vfs_inode.i_nlink == 0) { 7027 ret = BTRFS_NO_LOG_SYNC; 7028 goto end_no_trans; 7029 } 7030 7031 ret = start_log_trans(trans, root, ctx); 7032 if (ret) 7033 goto end_no_trans; 7034 7035 ret = btrfs_log_inode(trans, inode, inode_only, ctx); 7036 if (ret) 7037 goto end_trans; 7038 7039 /* 7040 * for regular files, if its inode is already on disk, we don't 7041 * have to worry about the parents at all. This is because 7042 * we can use the last_unlink_trans field to record renames 7043 * and other fun in this file. 7044 */ 7045 if (S_ISREG(inode->vfs_inode.i_mode) && 7046 inode->generation < trans->transid && 7047 inode->last_unlink_trans < trans->transid) { 7048 ret = 0; 7049 goto end_trans; 7050 } 7051 7052 if (S_ISDIR(inode->vfs_inode.i_mode) && ctx->log_new_dentries) 7053 log_dentries = true; 7054 7055 /* 7056 * On unlink we must make sure all our current and old parent directory 7057 * inodes are fully logged. This is to prevent leaving dangling 7058 * directory index entries in directories that were our parents but are 7059 * not anymore. Not doing this results in old parent directory being 7060 * impossible to delete after log replay (rmdir will always fail with 7061 * error -ENOTEMPTY). 7062 * 7063 * Example 1: 7064 * 7065 * mkdir testdir 7066 * touch testdir/foo 7067 * ln testdir/foo testdir/bar 7068 * sync 7069 * unlink testdir/bar 7070 * xfs_io -c fsync testdir/foo 7071 * <power failure> 7072 * mount fs, triggers log replay 7073 * 7074 * If we don't log the parent directory (testdir), after log replay the 7075 * directory still has an entry pointing to the file inode using the bar 7076 * name, but a matching BTRFS_INODE_[REF|EXTREF]_KEY does not exist and 7077 * the file inode has a link count of 1. 7078 * 7079 * Example 2: 7080 * 7081 * mkdir testdir 7082 * touch foo 7083 * ln foo testdir/foo2 7084 * ln foo testdir/foo3 7085 * sync 7086 * unlink testdir/foo3 7087 * xfs_io -c fsync foo 7088 * <power failure> 7089 * mount fs, triggers log replay 7090 * 7091 * Similar as the first example, after log replay the parent directory 7092 * testdir still has an entry pointing to the inode file with name foo3 7093 * but the file inode does not have a matching BTRFS_INODE_REF_KEY item 7094 * and has a link count of 2. 7095 */ 7096 if (inode->last_unlink_trans >= trans->transid) { 7097 ret = btrfs_log_all_parents(trans, inode, ctx); 7098 if (ret) 7099 goto end_trans; 7100 } 7101 7102 ret = log_all_new_ancestors(trans, inode, parent, ctx); 7103 if (ret) 7104 goto end_trans; 7105 7106 if (log_dentries) 7107 ret = log_new_dir_dentries(trans, inode, ctx); 7108 else 7109 ret = 0; 7110 end_trans: 7111 if (ret < 0) { 7112 btrfs_set_log_full_commit(trans); 7113 ret = BTRFS_LOG_FORCE_COMMIT; 7114 } 7115 7116 if (ret) 7117 btrfs_remove_log_ctx(root, ctx); 7118 btrfs_end_log_trans(root); 7119 end_no_trans: 7120 return ret; 7121 } 7122 7123 /* 7124 * it is not safe to log dentry if the chunk root has added new 7125 * chunks. This returns 0 if the dentry was logged, and 1 otherwise. 7126 * If this returns 1, you must commit the transaction to safely get your 7127 * data on disk. 
7128  */
7129 int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
7130                struct dentry *dentry,
7131                struct btrfs_log_ctx *ctx)
7132 {
7133     struct dentry *parent = dget_parent(dentry);
7134     int ret;
7135
7136     ret = btrfs_log_inode_parent(trans, BTRFS_I(d_inode(dentry)), parent,
7137                      LOG_INODE_ALL, ctx);
7138     dput(parent);
7139
7140     return ret;
7141 }
7142
7143 /*
7144  * Should be called during mount to read and replay any log trees
7145  * from the FS.
7146  */
7147 int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
7148 {
7149     int ret;
7150     struct btrfs_path *path;
7151     struct btrfs_trans_handle *trans;
7152     struct btrfs_key key;
7153     struct btrfs_key found_key;
7154     struct btrfs_root *log;
7155     struct btrfs_fs_info *fs_info = log_root_tree->fs_info;
7156     struct walk_control wc = {
7157         .process_func = process_one_buffer,
7158         .stage = LOG_WALK_PIN_ONLY,
7159     };
7160
7161     path = btrfs_alloc_path();
7162     if (!path)
7163         return -ENOMEM;
7164
7165     set_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags);
7166
7167     trans = btrfs_start_transaction(fs_info->tree_root, 0);
7168     if (IS_ERR(trans)) {
7169         ret = PTR_ERR(trans);
7170         goto error;
7171     }
7172
7173     wc.trans = trans;
7174     wc.pin = 1;
7175
7176     ret = walk_log_tree(trans, log_root_tree, &wc);
7177     if (ret) {
7178         btrfs_abort_transaction(trans, ret);
7179         goto error;
7180     }
7181
7182 again:
7183     key.objectid = BTRFS_TREE_LOG_OBJECTID;
7184     key.offset = (u64)-1;
7185     key.type = BTRFS_ROOT_ITEM_KEY;
7186
7187     while (1) {
7188         ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0);
7189
7190         if (ret < 0) {
7191             btrfs_abort_transaction(trans, ret);
7192             goto error;
7193         }
7194         if (ret > 0) {
7195             if (path->slots[0] == 0)
7196                 break;
7197             path->slots[0]--;
7198         }
7199         btrfs_item_key_to_cpu(path->nodes[0], &found_key,
7200                       path->slots[0]);
7201         btrfs_release_path(path);
7202         if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID)
7203             break;
7204
7205         log = btrfs_read_tree_root(log_root_tree, &found_key);
7206         if (IS_ERR(log)) {
7207             ret = PTR_ERR(log);
7208             btrfs_abort_transaction(trans, ret);
7209             goto error;
7210         }
7211
7212         wc.replay_dest = btrfs_get_fs_root(fs_info, found_key.offset,
7213                            true);
7214         if (IS_ERR(wc.replay_dest)) {
7215             ret = PTR_ERR(wc.replay_dest);
7216
7217             /*
7218              * We didn't find the subvol, likely because it was
7219              * deleted. This is ok, simply skip this log and go to
7220              * the next one.
7221              *
7222              * We need to exclude the root because we can't have
7223              * other log replays overwriting this log as we'll read
7224              * it back in a few more times. This will keep our
7225              * block from being modified, and we'll just bail for
7226              * each subsequent pass.
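 *
 * The same log tree is walked once for each LOG_WALK_* stage (see the
 * 'again' loop below), so its blocks are read back several times and
 * must not be overwritten in between.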
7227 */ 7228 if (ret == -ENOENT) 7229 ret = btrfs_pin_extent_for_log_replay(trans, 7230 log->node->start, 7231 log->node->len); 7232 btrfs_put_root(log); 7233 7234 if (!ret) 7235 goto next; 7236 btrfs_abort_transaction(trans, ret); 7237 goto error; 7238 } 7239 7240 wc.replay_dest->log_root = log; 7241 ret = btrfs_record_root_in_trans(trans, wc.replay_dest); 7242 if (ret) 7243 /* The loop needs to continue due to the root refs */ 7244 btrfs_abort_transaction(trans, ret); 7245 else 7246 ret = walk_log_tree(trans, log, &wc); 7247 7248 if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) { 7249 ret = fixup_inode_link_counts(trans, wc.replay_dest, 7250 path); 7251 if (ret) 7252 btrfs_abort_transaction(trans, ret); 7253 } 7254 7255 if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) { 7256 struct btrfs_root *root = wc.replay_dest; 7257 7258 btrfs_release_path(path); 7259 7260 /* 7261 * We have just replayed everything, and the highest 7262 * objectid of fs roots probably has changed in case 7263 * some inode_item's got replayed. 7264 * 7265 * root->objectid_mutex is not acquired as log replay 7266 * could only happen during mount. 7267 */ 7268 ret = btrfs_init_root_free_objectid(root); 7269 if (ret) 7270 btrfs_abort_transaction(trans, ret); 7271 } 7272 7273 wc.replay_dest->log_root = NULL; 7274 btrfs_put_root(wc.replay_dest); 7275 btrfs_put_root(log); 7276 7277 if (ret) 7278 goto error; 7279 next: 7280 if (found_key.offset == 0) 7281 break; 7282 key.offset = found_key.offset - 1; 7283 } 7284 btrfs_release_path(path); 7285 7286 /* step one is to pin it all, step two is to replay just inodes */ 7287 if (wc.pin) { 7288 wc.pin = 0; 7289 wc.process_func = replay_one_buffer; 7290 wc.stage = LOG_WALK_REPLAY_INODES; 7291 goto again; 7292 } 7293 /* step three is to replay everything */ 7294 if (wc.stage < LOG_WALK_REPLAY_ALL) { 7295 wc.stage++; 7296 goto again; 7297 } 7298 7299 btrfs_free_path(path); 7300 7301 /* step 4: commit the transaction, which also unpins the blocks */ 7302 ret = btrfs_commit_transaction(trans); 7303 if (ret) 7304 return ret; 7305 7306 log_root_tree->log_root = NULL; 7307 clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags); 7308 btrfs_put_root(log_root_tree); 7309 7310 return 0; 7311 error: 7312 if (wc.trans) 7313 btrfs_end_transaction(wc.trans); 7314 clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags); 7315 btrfs_free_path(path); 7316 return ret; 7317 } 7318 7319 /* 7320 * there are some corner cases where we want to force a full 7321 * commit instead of allowing a directory to be logged. 7322 * 7323 * They revolve around files there were unlinked from the directory, and 7324 * this function updates the parent directory so that a full commit is 7325 * properly done if it is fsync'd later after the unlinks are done. 7326 * 7327 * Must be called before the unlink operations (updates to the subvolume tree, 7328 * inodes, etc) are done. 7329 */ 7330 void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans, 7331 struct btrfs_inode *dir, struct btrfs_inode *inode, 7332 int for_rename) 7333 { 7334 /* 7335 * when we're logging a file, if it hasn't been renamed 7336 * or unlinked, and its inode is fully committed on disk, 7337 * we don't have to worry about walking up the directory chain 7338 * to log its parents. 7339 * 7340 * So, we use the last_unlink_trans field to put this transid 7341 * into the file. When the file is logged we check it and 7342 * don't log the parents if the file is fully on disk. 
7343 */ 7344 mutex_lock(&inode->log_mutex); 7345 inode->last_unlink_trans = trans->transid; 7346 mutex_unlock(&inode->log_mutex); 7347 7348 /* 7349 * if this directory was already logged any new 7350 * names for this file/dir will get recorded 7351 */ 7352 if (dir->logged_trans == trans->transid) 7353 return; 7354 7355 /* 7356 * if the inode we're about to unlink was logged, 7357 * the log will be properly updated for any new names 7358 */ 7359 if (inode->logged_trans == trans->transid) 7360 return; 7361 7362 /* 7363 * when renaming files across directories, if the directory 7364 * there we're unlinking from gets fsync'd later on, there's 7365 * no way to find the destination directory later and fsync it 7366 * properly. So, we have to be conservative and force commits 7367 * so the new name gets discovered. 7368 */ 7369 if (for_rename) 7370 goto record; 7371 7372 /* we can safely do the unlink without any special recording */ 7373 return; 7374 7375 record: 7376 mutex_lock(&dir->log_mutex); 7377 dir->last_unlink_trans = trans->transid; 7378 mutex_unlock(&dir->log_mutex); 7379 } 7380 7381 /* 7382 * Make sure that if someone attempts to fsync the parent directory of a deleted 7383 * snapshot, it ends up triggering a transaction commit. This is to guarantee 7384 * that after replaying the log tree of the parent directory's root we will not 7385 * see the snapshot anymore and at log replay time we will not see any log tree 7386 * corresponding to the deleted snapshot's root, which could lead to replaying 7387 * it after replaying the log tree of the parent directory (which would replay 7388 * the snapshot delete operation). 7389 * 7390 * Must be called before the actual snapshot destroy operation (updates to the 7391 * parent root and tree of tree roots trees, etc) are done. 7392 */ 7393 void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans, 7394 struct btrfs_inode *dir) 7395 { 7396 mutex_lock(&dir->log_mutex); 7397 dir->last_unlink_trans = trans->transid; 7398 mutex_unlock(&dir->log_mutex); 7399 } 7400 7401 /* 7402 * Update the log after adding a new name for an inode. 7403 * 7404 * @trans: Transaction handle. 7405 * @old_dentry: The dentry associated with the old name and the old 7406 * parent directory. 7407 * @old_dir: The inode of the previous parent directory for the case 7408 * of a rename. For a link operation, it must be NULL. 7409 * @old_dir_index: The index number associated with the old name, meaningful 7410 * only for rename operations (when @old_dir is not NULL). 7411 * Ignored for link operations. 7412 * @parent: The dentry associated with the directory under which the 7413 * new name is located. 7414 * 7415 * Call this after adding a new name for an inode, as a result of a link or 7416 * rename operation, and it will properly update the log to reflect the new name. 
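 *
 * This function does not return errors to the caller: if something fails,
 * the log is marked for a full transaction commit instead (see the 'out'
 * label below), so a later log sync falls back to committing the
 * transaction.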
7417 */ 7418 void btrfs_log_new_name(struct btrfs_trans_handle *trans, 7419 struct dentry *old_dentry, struct btrfs_inode *old_dir, 7420 u64 old_dir_index, struct dentry *parent) 7421 { 7422 struct btrfs_inode *inode = BTRFS_I(d_inode(old_dentry)); 7423 struct btrfs_root *root = inode->root; 7424 struct btrfs_log_ctx ctx; 7425 bool log_pinned = false; 7426 int ret; 7427 7428 /* 7429 * this will force the logging code to walk the dentry chain 7430 * up for the file 7431 */ 7432 if (!S_ISDIR(inode->vfs_inode.i_mode)) 7433 inode->last_unlink_trans = trans->transid; 7434 7435 /* 7436 * if this inode hasn't been logged and directory we're renaming it 7437 * from hasn't been logged, we don't need to log it 7438 */ 7439 ret = inode_logged(trans, inode, NULL); 7440 if (ret < 0) { 7441 goto out; 7442 } else if (ret == 0) { 7443 if (!old_dir) 7444 return; 7445 /* 7446 * If the inode was not logged and we are doing a rename (old_dir is not 7447 * NULL), check if old_dir was logged - if it was not we can return and 7448 * do nothing. 7449 */ 7450 ret = inode_logged(trans, old_dir, NULL); 7451 if (ret < 0) 7452 goto out; 7453 else if (ret == 0) 7454 return; 7455 } 7456 ret = 0; 7457 7458 /* 7459 * If we are doing a rename (old_dir is not NULL) from a directory that 7460 * was previously logged, make sure that on log replay we get the old 7461 * dir entry deleted. This is needed because we will also log the new 7462 * name of the renamed inode, so we need to make sure that after log 7463 * replay we don't end up with both the new and old dir entries existing. 7464 */ 7465 if (old_dir && old_dir->logged_trans == trans->transid) { 7466 struct btrfs_root *log = old_dir->root->log_root; 7467 struct btrfs_path *path; 7468 struct fscrypt_name fname; 7469 7470 ASSERT(old_dir_index >= BTRFS_DIR_START_INDEX); 7471 7472 ret = fscrypt_setup_filename(&old_dir->vfs_inode, 7473 &old_dentry->d_name, 0, &fname); 7474 if (ret) 7475 goto out; 7476 /* 7477 * We have two inodes to update in the log, the old directory and 7478 * the inode that got renamed, so we must pin the log to prevent 7479 * anyone from syncing the log until we have updated both inodes 7480 * in the log. 7481 */ 7482 ret = join_running_log_trans(root); 7483 /* 7484 * At least one of the inodes was logged before, so this should 7485 * not fail, but if it does, it's not serious, just bail out and 7486 * mark the log for a full commit. 7487 */ 7488 if (WARN_ON_ONCE(ret < 0)) { 7489 fscrypt_free_filename(&fname); 7490 goto out; 7491 } 7492 7493 log_pinned = true; 7494 7495 path = btrfs_alloc_path(); 7496 if (!path) { 7497 ret = -ENOMEM; 7498 fscrypt_free_filename(&fname); 7499 goto out; 7500 } 7501 7502 /* 7503 * Other concurrent task might be logging the old directory, 7504 * as it can be triggered when logging other inode that had or 7505 * still has a dentry in the old directory. We lock the old 7506 * directory's log_mutex to ensure the deletion of the old 7507 * name is persisted, because during directory logging we 7508 * delete all BTRFS_DIR_LOG_INDEX_KEY keys and the deletion of 7509 * the old name's dir index item is in the delayed items, so 7510 * it could be missed by an in progress directory logging. 7511 */ 7512 mutex_lock(&old_dir->log_mutex); 7513 ret = del_logged_dentry(trans, log, path, btrfs_ino(old_dir), 7514 &fname.disk_name, old_dir_index); 7515 if (ret > 0) { 7516 /* 7517 * The dentry does not exist in the log, so record its 7518 * deletion. 
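 *
 * This is done by inserting a dir log range item covering only this index
 * number: on log replay any dentry at this index that is not present in
 * the log gets deleted, which removes the old name.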
7519 */ 7520 btrfs_release_path(path); 7521 ret = insert_dir_log_key(trans, log, path, 7522 btrfs_ino(old_dir), 7523 old_dir_index, old_dir_index); 7524 } 7525 mutex_unlock(&old_dir->log_mutex); 7526 7527 btrfs_free_path(path); 7528 fscrypt_free_filename(&fname); 7529 if (ret < 0) 7530 goto out; 7531 } 7532 7533 btrfs_init_log_ctx(&ctx, &inode->vfs_inode); 7534 ctx.logging_new_name = true; 7535 /* 7536 * We don't care about the return value. If we fail to log the new name 7537 * then we know the next attempt to sync the log will fallback to a full 7538 * transaction commit (due to a call to btrfs_set_log_full_commit()), so 7539 * we don't need to worry about getting a log committed that has an 7540 * inconsistent state after a rename operation. 7541 */ 7542 btrfs_log_inode_parent(trans, inode, parent, LOG_INODE_EXISTS, &ctx); 7543 ASSERT(list_empty(&ctx.conflict_inodes)); 7544 out: 7545 /* 7546 * If an error happened mark the log for a full commit because it's not 7547 * consistent and up to date or we couldn't find out if one of the 7548 * inodes was logged before in this transaction. Do it before unpinning 7549 * the log, to avoid any races with someone else trying to commit it. 7550 */ 7551 if (ret < 0) 7552 btrfs_set_log_full_commit(trans); 7553 if (log_pinned) 7554 btrfs_end_log_trans(root); 7555 } 7556 7557