// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2008 Oracle. All rights reserved.
 */

#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/list_sort.h>
#include <linux/iversion.h>
#include "misc.h"
#include "ctree.h"
#include "tree-log.h"
#include "disk-io.h"
#include "locking.h"
#include "backref.h"
#include "compression.h"
#include "qgroup.h"
#include "block-group.h"
#include "space-info.h"
#include "inode-item.h"
#include "fs.h"
#include "accessors.h"
#include "extent-tree.h"
#include "root-tree.h"
#include "dir-item.h"
#include "file-item.h"
#include "file.h"
#include "orphan.h"
#include "tree-checker.h"

#define MAX_CONFLICT_INODES 10

/* magic values for the inode_only field in btrfs_log_inode:
 *
 * LOG_INODE_ALL means to log everything
 * LOG_INODE_EXISTS means to log just enough to recreate the inode
 * during log replay
 */
enum {
	LOG_INODE_ALL,
	LOG_INODE_EXISTS,
};

/*
 * directory trouble cases
 *
 * 1) on rename or unlink, if the inode being unlinked isn't in the fsync
 * log, we must force a full commit before doing an fsync of the directory
 * where the unlink was done.
 * ---> record transid of last unlink/rename per directory
 *
 * mkdir foo/some_dir
 * normal commit
 * rename foo/some_dir foo2/some_dir
 * mkdir foo/some_dir
 * fsync foo/some_dir/some_file
 *
 * The fsync above will unlink the original some_dir without recording
 * it in its new location (foo2). After a crash, some_dir will be gone
 * unless the fsync of some_file forces a full commit.
 *
 * 2) we must log any new names for any file or dir that is in the fsync
 * log. ---> check inode while renaming/linking.
 *
 * 2a) we must log any new names for any file or dir during rename
 * when the directory they are being removed from was logged.
 * ---> check inode and old parent dir during rename
 *
 * 2a is actually the more important variant. With the extra logging
 * a crash might unlink the old name without recreating the new one.
 *
 * 3) after a crash, we must go through any directories with a link count
 * of zero and redo the rm -rf
 *
 * mkdir f1/foo
 * normal commit
 * rm -rf f1/foo
 * fsync(f1)
 *
 * The directory f1 was fully removed from the FS, but fsync was never
 * called on f1, only its parent dir. After a crash the rm -rf must
 * be replayed. This must be able to recurse down the entire
 * directory tree. The inode link count fixup code takes care of the
 * ugly details.
 */

/*
 * stages for the tree walking. The first
 * stage (0) is to only pin down the blocks we find,
 * the second stage (1) is to make sure that all the inodes
 * we find in the log are created in the subvolume.
 *
 * The last stage is to deal with directories and links and extents
 * and all the other fun semantics.
 */
enum {
	LOG_WALK_PIN_ONLY,
	LOG_WALK_REPLAY_INODES,
	LOG_WALK_REPLAY_DIR_INDEX,
	LOG_WALK_REPLAY_ALL,
};

static int btrfs_log_inode(struct btrfs_trans_handle *trans,
			   struct btrfs_inode *inode,
			   int inode_only,
			   struct btrfs_log_ctx *ctx);
static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root,
			     struct btrfs_path *path, u64 objectid);
static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
				       struct btrfs_root *root,
				       struct btrfs_root *log,
				       struct btrfs_path *path,
				       u64 dirid, int del_all);
static void wait_log_commit(struct btrfs_root *root, int transid);

/*
 * tree logging is a special write ahead log used to make sure that
 * fsyncs and O_SYNCs can happen without doing full tree commits.
 *
 * Full tree commits are expensive because they require commonly
 * modified blocks to be recowed, creating many dirty pages in the
 * extent tree and a 4x-6x higher write load than ext3.
 *
 * Instead of doing a tree commit on every fsync, we use the
 * key ranges and transaction ids to find items for a given file or directory
 * that have changed in this transaction. Those items are copied into
 * a special tree (one per subvolume root); that tree is written to disk
 * and then the fsync is considered complete.
 *
 * After a crash, items are copied out of the log-tree back into the
 * subvolume tree. Any file data extents found are recorded in the extent
 * allocation tree, and the log-tree freed.
 *
 * The log tree is read three times: once to pin down all the extents it is
 * using in RAM, once to create all the inodes logged in the tree
 * and once to do all the other items.
 */

/*
 * start a sub transaction and set up the log tree
 * this increments the log tree writer count to make the people
 * syncing the tree wait for us to finish
 */
static int start_log_trans(struct btrfs_trans_handle *trans,
			   struct btrfs_root *root,
			   struct btrfs_log_ctx *ctx)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_root *tree_root = fs_info->tree_root;
	const bool zoned = btrfs_is_zoned(fs_info);
	int ret = 0;
	bool created = false;

	/*
	 * First check if the log root tree was already created. If not, create
	 * it before locking the root's log_mutex, just to keep lockdep happy.
159 */ 160 if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &tree_root->state)) { 161 mutex_lock(&tree_root->log_mutex); 162 if (!fs_info->log_root_tree) { 163 ret = btrfs_init_log_root_tree(trans, fs_info); 164 if (!ret) { 165 set_bit(BTRFS_ROOT_HAS_LOG_TREE, &tree_root->state); 166 created = true; 167 } 168 } 169 mutex_unlock(&tree_root->log_mutex); 170 if (ret) 171 return ret; 172 } 173 174 mutex_lock(&root->log_mutex); 175 176 again: 177 if (root->log_root) { 178 int index = (root->log_transid + 1) % 2; 179 180 if (btrfs_need_log_full_commit(trans)) { 181 ret = BTRFS_LOG_FORCE_COMMIT; 182 goto out; 183 } 184 185 if (zoned && atomic_read(&root->log_commit[index])) { 186 wait_log_commit(root, root->log_transid - 1); 187 goto again; 188 } 189 190 if (!root->log_start_pid) { 191 clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state); 192 root->log_start_pid = current->pid; 193 } else if (root->log_start_pid != current->pid) { 194 set_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state); 195 } 196 } else { 197 /* 198 * This means fs_info->log_root_tree was already created 199 * for some other FS trees. Do the full commit not to mix 200 * nodes from multiple log transactions to do sequential 201 * writing. 202 */ 203 if (zoned && !created) { 204 ret = BTRFS_LOG_FORCE_COMMIT; 205 goto out; 206 } 207 208 ret = btrfs_add_log_tree(trans, root); 209 if (ret) 210 goto out; 211 212 set_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state); 213 clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state); 214 root->log_start_pid = current->pid; 215 } 216 217 atomic_inc(&root->log_writers); 218 if (!ctx->logging_new_name) { 219 int index = root->log_transid % 2; 220 list_add_tail(&ctx->list, &root->log_ctxs[index]); 221 ctx->log_transid = root->log_transid; 222 } 223 224 out: 225 mutex_unlock(&root->log_mutex); 226 return ret; 227 } 228 229 /* 230 * returns 0 if there was a log transaction running and we were able 231 * to join, or returns -ENOENT if there were not transactions 232 * in progress 233 */ 234 static int join_running_log_trans(struct btrfs_root *root) 235 { 236 const bool zoned = btrfs_is_zoned(root->fs_info); 237 int ret = -ENOENT; 238 239 if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state)) 240 return ret; 241 242 mutex_lock(&root->log_mutex); 243 again: 244 if (root->log_root) { 245 int index = (root->log_transid + 1) % 2; 246 247 ret = 0; 248 if (zoned && atomic_read(&root->log_commit[index])) { 249 wait_log_commit(root, root->log_transid - 1); 250 goto again; 251 } 252 atomic_inc(&root->log_writers); 253 } 254 mutex_unlock(&root->log_mutex); 255 return ret; 256 } 257 258 /* 259 * This either makes the current running log transaction wait 260 * until you call btrfs_end_log_trans() or it makes any future 261 * log transactions wait until you call btrfs_end_log_trans() 262 */ 263 void btrfs_pin_log_trans(struct btrfs_root *root) 264 { 265 atomic_inc(&root->log_writers); 266 } 267 268 /* 269 * indicate we're done making changes to the log tree 270 * and wake up anyone waiting to do a sync 271 */ 272 void btrfs_end_log_trans(struct btrfs_root *root) 273 { 274 if (atomic_dec_and_test(&root->log_writers)) { 275 /* atomic_dec_and_test implies a barrier */ 276 cond_wake_up_nomb(&root->log_writer_wait); 277 } 278 } 279 280 /* 281 * the walk control struct is used to pass state down the chain when 282 * processing the log tree. The stage field tells us which part 283 * of the log tree processing we are currently doing. 
The others 284 * are state fields used for that specific part 285 */ 286 struct walk_control { 287 /* should we free the extent on disk when done? This is used 288 * at transaction commit time while freeing a log tree 289 */ 290 int free; 291 292 /* pin only walk, we record which extents on disk belong to the 293 * log trees 294 */ 295 int pin; 296 297 /* what stage of the replay code we're currently in */ 298 int stage; 299 300 /* 301 * Ignore any items from the inode currently being processed. Needs 302 * to be set every time we find a BTRFS_INODE_ITEM_KEY and we are in 303 * the LOG_WALK_REPLAY_INODES stage. 304 */ 305 bool ignore_cur_inode; 306 307 /* the root we are currently replaying */ 308 struct btrfs_root *replay_dest; 309 310 /* the trans handle for the current replay */ 311 struct btrfs_trans_handle *trans; 312 313 /* the function that gets used to process blocks we find in the 314 * tree. Note the extent_buffer might not be up to date when it is 315 * passed in, and it must be checked or read if you need the data 316 * inside it 317 */ 318 int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb, 319 struct walk_control *wc, u64 gen, int level); 320 }; 321 322 /* 323 * process_func used to pin down extents, write them or wait on them 324 */ 325 static int process_one_buffer(struct btrfs_root *log, 326 struct extent_buffer *eb, 327 struct walk_control *wc, u64 gen, int level) 328 { 329 struct btrfs_fs_info *fs_info = log->fs_info; 330 int ret = 0; 331 332 /* 333 * If this fs is mixed then we need to be able to process the leaves to 334 * pin down any logged extents, so we have to read the block. 335 */ 336 if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) { 337 struct btrfs_tree_parent_check check = { 338 .level = level, 339 .transid = gen 340 }; 341 342 ret = btrfs_read_extent_buffer(eb, &check); 343 if (ret) 344 return ret; 345 } 346 347 if (wc->pin) { 348 ret = btrfs_pin_extent_for_log_replay(wc->trans, eb); 349 if (ret) 350 return ret; 351 352 if (btrfs_buffer_uptodate(eb, gen, 0) && 353 btrfs_header_level(eb) == 0) 354 ret = btrfs_exclude_logged_extents(eb); 355 } 356 return ret; 357 } 358 359 /* 360 * Item overwrite used by replay and tree logging. eb, slot and key all refer 361 * to the src data we are copying out. 362 * 363 * root is the tree we are copying into, and path is a scratch 364 * path for use in this function (it should be released on entry and 365 * will be released on exit). 366 * 367 * If the key is already in the destination tree the existing item is 368 * overwritten. If the existing item isn't big enough, it is extended. 369 * If it is too large, it is truncated. 370 * 371 * If the key isn't in the destination yet, a new item is inserted. 372 */ 373 static int overwrite_item(struct btrfs_trans_handle *trans, 374 struct btrfs_root *root, 375 struct btrfs_path *path, 376 struct extent_buffer *eb, int slot, 377 struct btrfs_key *key) 378 { 379 int ret; 380 u32 item_size; 381 u64 saved_i_size = 0; 382 int save_old_i_size = 0; 383 unsigned long src_ptr; 384 unsigned long dst_ptr; 385 bool inode_item = key->type == BTRFS_INODE_ITEM_KEY; 386 387 /* 388 * This is only used during log replay, so the root is always from a 389 * fs/subvolume tree. In case we ever need to support a log root, then 390 * we'll have to clone the leaf in the path, release the path and use 391 * the leaf before writing into the log tree. See the comments at 392 * copy_items() for more details. 
393 */ 394 ASSERT(root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID); 395 396 item_size = btrfs_item_size(eb, slot); 397 src_ptr = btrfs_item_ptr_offset(eb, slot); 398 399 /* Look for the key in the destination tree. */ 400 ret = btrfs_search_slot(NULL, root, key, path, 0, 0); 401 if (ret < 0) 402 return ret; 403 404 if (ret == 0) { 405 char *src_copy; 406 char *dst_copy; 407 u32 dst_size = btrfs_item_size(path->nodes[0], 408 path->slots[0]); 409 if (dst_size != item_size) 410 goto insert; 411 412 if (item_size == 0) { 413 btrfs_release_path(path); 414 return 0; 415 } 416 dst_copy = kmalloc(item_size, GFP_NOFS); 417 src_copy = kmalloc(item_size, GFP_NOFS); 418 if (!dst_copy || !src_copy) { 419 btrfs_release_path(path); 420 kfree(dst_copy); 421 kfree(src_copy); 422 return -ENOMEM; 423 } 424 425 read_extent_buffer(eb, src_copy, src_ptr, item_size); 426 427 dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); 428 read_extent_buffer(path->nodes[0], dst_copy, dst_ptr, 429 item_size); 430 ret = memcmp(dst_copy, src_copy, item_size); 431 432 kfree(dst_copy); 433 kfree(src_copy); 434 /* 435 * they have the same contents, just return, this saves 436 * us from cowing blocks in the destination tree and doing 437 * extra writes that may not have been done by a previous 438 * sync 439 */ 440 if (ret == 0) { 441 btrfs_release_path(path); 442 return 0; 443 } 444 445 /* 446 * We need to load the old nbytes into the inode so when we 447 * replay the extents we've logged we get the right nbytes. 448 */ 449 if (inode_item) { 450 struct btrfs_inode_item *item; 451 u64 nbytes; 452 u32 mode; 453 454 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 455 struct btrfs_inode_item); 456 nbytes = btrfs_inode_nbytes(path->nodes[0], item); 457 item = btrfs_item_ptr(eb, slot, 458 struct btrfs_inode_item); 459 btrfs_set_inode_nbytes(eb, item, nbytes); 460 461 /* 462 * If this is a directory we need to reset the i_size to 463 * 0 so that we can set it up properly when replaying 464 * the rest of the items in this log. 465 */ 466 mode = btrfs_inode_mode(eb, item); 467 if (S_ISDIR(mode)) 468 btrfs_set_inode_size(eb, item, 0); 469 } 470 } else if (inode_item) { 471 struct btrfs_inode_item *item; 472 u32 mode; 473 474 /* 475 * New inode, set nbytes to 0 so that the nbytes comes out 476 * properly when we replay the extents. 477 */ 478 item = btrfs_item_ptr(eb, slot, struct btrfs_inode_item); 479 btrfs_set_inode_nbytes(eb, item, 0); 480 481 /* 482 * If this is a directory we need to reset the i_size to 0 so 483 * that we can set it up properly when replaying the rest of 484 * the items in this log. 
485 */ 486 mode = btrfs_inode_mode(eb, item); 487 if (S_ISDIR(mode)) 488 btrfs_set_inode_size(eb, item, 0); 489 } 490 insert: 491 btrfs_release_path(path); 492 /* try to insert the key into the destination tree */ 493 path->skip_release_on_error = 1; 494 ret = btrfs_insert_empty_item(trans, root, path, 495 key, item_size); 496 path->skip_release_on_error = 0; 497 498 /* make sure any existing item is the correct size */ 499 if (ret == -EEXIST || ret == -EOVERFLOW) { 500 u32 found_size; 501 found_size = btrfs_item_size(path->nodes[0], 502 path->slots[0]); 503 if (found_size > item_size) 504 btrfs_truncate_item(trans, path, item_size, 1); 505 else if (found_size < item_size) 506 btrfs_extend_item(trans, path, item_size - found_size); 507 } else if (ret) { 508 return ret; 509 } 510 dst_ptr = btrfs_item_ptr_offset(path->nodes[0], 511 path->slots[0]); 512 513 /* don't overwrite an existing inode if the generation number 514 * was logged as zero. This is done when the tree logging code 515 * is just logging an inode to make sure it exists after recovery. 516 * 517 * Also, don't overwrite i_size on directories during replay. 518 * log replay inserts and removes directory items based on the 519 * state of the tree found in the subvolume, and i_size is modified 520 * as it goes 521 */ 522 if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) { 523 struct btrfs_inode_item *src_item; 524 struct btrfs_inode_item *dst_item; 525 526 src_item = (struct btrfs_inode_item *)src_ptr; 527 dst_item = (struct btrfs_inode_item *)dst_ptr; 528 529 if (btrfs_inode_generation(eb, src_item) == 0) { 530 struct extent_buffer *dst_eb = path->nodes[0]; 531 const u64 ino_size = btrfs_inode_size(eb, src_item); 532 533 /* 534 * For regular files an ino_size == 0 is used only when 535 * logging that an inode exists, as part of a directory 536 * fsync, and the inode wasn't fsynced before. In this 537 * case don't set the size of the inode in the fs/subvol 538 * tree, otherwise we would be throwing valid data away. 
539 */ 540 if (S_ISREG(btrfs_inode_mode(eb, src_item)) && 541 S_ISREG(btrfs_inode_mode(dst_eb, dst_item)) && 542 ino_size != 0) 543 btrfs_set_inode_size(dst_eb, dst_item, ino_size); 544 goto no_copy; 545 } 546 547 if (S_ISDIR(btrfs_inode_mode(eb, src_item)) && 548 S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) { 549 save_old_i_size = 1; 550 saved_i_size = btrfs_inode_size(path->nodes[0], 551 dst_item); 552 } 553 } 554 555 copy_extent_buffer(path->nodes[0], eb, dst_ptr, 556 src_ptr, item_size); 557 558 if (save_old_i_size) { 559 struct btrfs_inode_item *dst_item; 560 dst_item = (struct btrfs_inode_item *)dst_ptr; 561 btrfs_set_inode_size(path->nodes[0], dst_item, saved_i_size); 562 } 563 564 /* make sure the generation is filled in */ 565 if (key->type == BTRFS_INODE_ITEM_KEY) { 566 struct btrfs_inode_item *dst_item; 567 dst_item = (struct btrfs_inode_item *)dst_ptr; 568 if (btrfs_inode_generation(path->nodes[0], dst_item) == 0) { 569 btrfs_set_inode_generation(path->nodes[0], dst_item, 570 trans->transid); 571 } 572 } 573 no_copy: 574 btrfs_mark_buffer_dirty(trans, path->nodes[0]); 575 btrfs_release_path(path); 576 return 0; 577 } 578 579 static int read_alloc_one_name(struct extent_buffer *eb, void *start, int len, 580 struct fscrypt_str *name) 581 { 582 char *buf; 583 584 buf = kmalloc(len, GFP_NOFS); 585 if (!buf) 586 return -ENOMEM; 587 588 read_extent_buffer(eb, buf, (unsigned long)start, len); 589 name->name = buf; 590 name->len = len; 591 return 0; 592 } 593 594 /* 595 * simple helper to read an inode off the disk from a given root 596 * This can only be called for subvolume roots and not for the log 597 */ 598 static noinline struct inode *read_one_inode(struct btrfs_root *root, 599 u64 objectid) 600 { 601 struct inode *inode; 602 603 inode = btrfs_iget(root->fs_info->sb, objectid, root); 604 if (IS_ERR(inode)) 605 inode = NULL; 606 return inode; 607 } 608 609 /* replays a single extent in 'eb' at 'slot' with 'key' into the 610 * subvolume 'root'. path is released on entry and should be released 611 * on exit. 612 * 613 * extents in the log tree have not been allocated out of the extent 614 * tree yet. So, this completes the allocation, taking a reference 615 * as required if the extent already exists or creating a new extent 616 * if it isn't in the extent allocation tree yet. 617 * 618 * The extent is inserted into the file, dropping any existing extents 619 * from the file that overlap the new one. 620 */ 621 static noinline int replay_one_extent(struct btrfs_trans_handle *trans, 622 struct btrfs_root *root, 623 struct btrfs_path *path, 624 struct extent_buffer *eb, int slot, 625 struct btrfs_key *key) 626 { 627 struct btrfs_drop_extents_args drop_args = { 0 }; 628 struct btrfs_fs_info *fs_info = root->fs_info; 629 int found_type; 630 u64 extent_end; 631 u64 start = key->offset; 632 u64 nbytes = 0; 633 struct btrfs_file_extent_item *item; 634 struct inode *inode = NULL; 635 unsigned long size; 636 int ret = 0; 637 638 item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item); 639 found_type = btrfs_file_extent_type(eb, item); 640 641 if (found_type == BTRFS_FILE_EXTENT_REG || 642 found_type == BTRFS_FILE_EXTENT_PREALLOC) { 643 nbytes = btrfs_file_extent_num_bytes(eb, item); 644 extent_end = start + nbytes; 645 646 /* 647 * We don't add to the inodes nbytes if we are prealloc or a 648 * hole. 
649 */ 650 if (btrfs_file_extent_disk_bytenr(eb, item) == 0) 651 nbytes = 0; 652 } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { 653 size = btrfs_file_extent_ram_bytes(eb, item); 654 nbytes = btrfs_file_extent_ram_bytes(eb, item); 655 extent_end = ALIGN(start + size, 656 fs_info->sectorsize); 657 } else { 658 ret = 0; 659 goto out; 660 } 661 662 inode = read_one_inode(root, key->objectid); 663 if (!inode) { 664 ret = -EIO; 665 goto out; 666 } 667 668 /* 669 * first check to see if we already have this extent in the 670 * file. This must be done before the btrfs_drop_extents run 671 * so we don't try to drop this extent. 672 */ 673 ret = btrfs_lookup_file_extent(trans, root, path, 674 btrfs_ino(BTRFS_I(inode)), start, 0); 675 676 if (ret == 0 && 677 (found_type == BTRFS_FILE_EXTENT_REG || 678 found_type == BTRFS_FILE_EXTENT_PREALLOC)) { 679 struct btrfs_file_extent_item cmp1; 680 struct btrfs_file_extent_item cmp2; 681 struct btrfs_file_extent_item *existing; 682 struct extent_buffer *leaf; 683 684 leaf = path->nodes[0]; 685 existing = btrfs_item_ptr(leaf, path->slots[0], 686 struct btrfs_file_extent_item); 687 688 read_extent_buffer(eb, &cmp1, (unsigned long)item, 689 sizeof(cmp1)); 690 read_extent_buffer(leaf, &cmp2, (unsigned long)existing, 691 sizeof(cmp2)); 692 693 /* 694 * we already have a pointer to this exact extent, 695 * we don't have to do anything 696 */ 697 if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) { 698 btrfs_release_path(path); 699 goto out; 700 } 701 } 702 btrfs_release_path(path); 703 704 /* drop any overlapping extents */ 705 drop_args.start = start; 706 drop_args.end = extent_end; 707 drop_args.drop_cache = true; 708 ret = btrfs_drop_extents(trans, root, BTRFS_I(inode), &drop_args); 709 if (ret) 710 goto out; 711 712 if (found_type == BTRFS_FILE_EXTENT_REG || 713 found_type == BTRFS_FILE_EXTENT_PREALLOC) { 714 u64 offset; 715 unsigned long dest_offset; 716 struct btrfs_key ins; 717 718 if (btrfs_file_extent_disk_bytenr(eb, item) == 0 && 719 btrfs_fs_incompat(fs_info, NO_HOLES)) 720 goto update_inode; 721 722 ret = btrfs_insert_empty_item(trans, root, path, key, 723 sizeof(*item)); 724 if (ret) 725 goto out; 726 dest_offset = btrfs_item_ptr_offset(path->nodes[0], 727 path->slots[0]); 728 copy_extent_buffer(path->nodes[0], eb, dest_offset, 729 (unsigned long)item, sizeof(*item)); 730 731 ins.objectid = btrfs_file_extent_disk_bytenr(eb, item); 732 ins.offset = btrfs_file_extent_disk_num_bytes(eb, item); 733 ins.type = BTRFS_EXTENT_ITEM_KEY; 734 offset = key->offset - btrfs_file_extent_offset(eb, item); 735 736 /* 737 * Manually record dirty extent, as here we did a shallow 738 * file extent item copy and skip normal backref update, 739 * but modifying extent tree all by ourselves. 740 * So need to manually record dirty extent for qgroup, 741 * as the owner of the file extent changed from log tree 742 * (doesn't affect qgroup) to fs/file tree(affects qgroup) 743 */ 744 ret = btrfs_qgroup_trace_extent(trans, 745 btrfs_file_extent_disk_bytenr(eb, item), 746 btrfs_file_extent_disk_num_bytes(eb, item)); 747 if (ret < 0) 748 goto out; 749 750 if (ins.objectid > 0) { 751 struct btrfs_ref ref = { 0 }; 752 u64 csum_start; 753 u64 csum_end; 754 LIST_HEAD(ordered_sums); 755 756 /* 757 * is this extent already allocated in the extent 758 * allocation tree? 
If so, just add a reference 759 */ 760 ret = btrfs_lookup_data_extent(fs_info, ins.objectid, 761 ins.offset); 762 if (ret < 0) { 763 goto out; 764 } else if (ret == 0) { 765 btrfs_init_generic_ref(&ref, 766 BTRFS_ADD_DELAYED_REF, 767 ins.objectid, ins.offset, 0, 768 root->root_key.objectid); 769 btrfs_init_data_ref(&ref, 770 root->root_key.objectid, 771 key->objectid, offset, 0, false); 772 ret = btrfs_inc_extent_ref(trans, &ref); 773 if (ret) 774 goto out; 775 } else { 776 /* 777 * insert the extent pointer in the extent 778 * allocation tree 779 */ 780 ret = btrfs_alloc_logged_file_extent(trans, 781 root->root_key.objectid, 782 key->objectid, offset, &ins); 783 if (ret) 784 goto out; 785 } 786 btrfs_release_path(path); 787 788 if (btrfs_file_extent_compression(eb, item)) { 789 csum_start = ins.objectid; 790 csum_end = csum_start + ins.offset; 791 } else { 792 csum_start = ins.objectid + 793 btrfs_file_extent_offset(eb, item); 794 csum_end = csum_start + 795 btrfs_file_extent_num_bytes(eb, item); 796 } 797 798 ret = btrfs_lookup_csums_list(root->log_root, 799 csum_start, csum_end - 1, 800 &ordered_sums, 0, false); 801 if (ret) 802 goto out; 803 /* 804 * Now delete all existing cums in the csum root that 805 * cover our range. We do this because we can have an 806 * extent that is completely referenced by one file 807 * extent item and partially referenced by another 808 * file extent item (like after using the clone or 809 * extent_same ioctls). In this case if we end up doing 810 * the replay of the one that partially references the 811 * extent first, and we do not do the csum deletion 812 * below, we can get 2 csum items in the csum tree that 813 * overlap each other. For example, imagine our log has 814 * the two following file extent items: 815 * 816 * key (257 EXTENT_DATA 409600) 817 * extent data disk byte 12845056 nr 102400 818 * extent data offset 20480 nr 20480 ram 102400 819 * 820 * key (257 EXTENT_DATA 819200) 821 * extent data disk byte 12845056 nr 102400 822 * extent data offset 0 nr 102400 ram 102400 823 * 824 * Where the second one fully references the 100K extent 825 * that starts at disk byte 12845056, and the log tree 826 * has a single csum item that covers the entire range 827 * of the extent: 828 * 829 * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100 830 * 831 * After the first file extent item is replayed, the 832 * csum tree gets the following csum item: 833 * 834 * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20 835 * 836 * Which covers the 20K sub-range starting at offset 20K 837 * of our extent. Now when we replay the second file 838 * extent item, if we do not delete existing csum items 839 * that cover any of its blocks, we end up getting two 840 * csum items in our csum tree that overlap each other: 841 * 842 * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100 843 * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20 844 * 845 * Which is a problem, because after this anyone trying 846 * to lookup up for the checksum of any block of our 847 * extent starting at an offset of 40K or higher, will 848 * end up looking at the second csum item only, which 849 * does not contain the checksum for any block starting 850 * at offset 40K or higher of our extent. 
851 */ 852 while (!list_empty(&ordered_sums)) { 853 struct btrfs_ordered_sum *sums; 854 struct btrfs_root *csum_root; 855 856 sums = list_entry(ordered_sums.next, 857 struct btrfs_ordered_sum, 858 list); 859 csum_root = btrfs_csum_root(fs_info, 860 sums->logical); 861 if (!ret) 862 ret = btrfs_del_csums(trans, csum_root, 863 sums->logical, 864 sums->len); 865 if (!ret) 866 ret = btrfs_csum_file_blocks(trans, 867 csum_root, 868 sums); 869 list_del(&sums->list); 870 kfree(sums); 871 } 872 if (ret) 873 goto out; 874 } else { 875 btrfs_release_path(path); 876 } 877 } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { 878 /* inline extents are easy, we just overwrite them */ 879 ret = overwrite_item(trans, root, path, eb, slot, key); 880 if (ret) 881 goto out; 882 } 883 884 ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode), start, 885 extent_end - start); 886 if (ret) 887 goto out; 888 889 update_inode: 890 btrfs_update_inode_bytes(BTRFS_I(inode), nbytes, drop_args.bytes_found); 891 ret = btrfs_update_inode(trans, BTRFS_I(inode)); 892 out: 893 iput(inode); 894 return ret; 895 } 896 897 static int unlink_inode_for_log_replay(struct btrfs_trans_handle *trans, 898 struct btrfs_inode *dir, 899 struct btrfs_inode *inode, 900 const struct fscrypt_str *name) 901 { 902 int ret; 903 904 ret = btrfs_unlink_inode(trans, dir, inode, name); 905 if (ret) 906 return ret; 907 /* 908 * Whenever we need to check if a name exists or not, we check the 909 * fs/subvolume tree. So after an unlink we must run delayed items, so 910 * that future checks for a name during log replay see that the name 911 * does not exists anymore. 912 */ 913 return btrfs_run_delayed_items(trans); 914 } 915 916 /* 917 * when cleaning up conflicts between the directory names in the 918 * subvolume, directory names in the log and directory names in the 919 * inode back references, we may have to unlink inodes from directories. 920 * 921 * This is a helper function to do the unlink of a specific directory 922 * item 923 */ 924 static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans, 925 struct btrfs_path *path, 926 struct btrfs_inode *dir, 927 struct btrfs_dir_item *di) 928 { 929 struct btrfs_root *root = dir->root; 930 struct inode *inode; 931 struct fscrypt_str name; 932 struct extent_buffer *leaf; 933 struct btrfs_key location; 934 int ret; 935 936 leaf = path->nodes[0]; 937 938 btrfs_dir_item_key_to_cpu(leaf, di, &location); 939 ret = read_alloc_one_name(leaf, di + 1, btrfs_dir_name_len(leaf, di), &name); 940 if (ret) 941 return -ENOMEM; 942 943 btrfs_release_path(path); 944 945 inode = read_one_inode(root, location.objectid); 946 if (!inode) { 947 ret = -EIO; 948 goto out; 949 } 950 951 ret = link_to_fixup_dir(trans, root, path, location.objectid); 952 if (ret) 953 goto out; 954 955 ret = unlink_inode_for_log_replay(trans, dir, BTRFS_I(inode), &name); 956 out: 957 kfree(name.name); 958 iput(inode); 959 return ret; 960 } 961 962 /* 963 * See if a given name and sequence number found in an inode back reference are 964 * already in a directory and correctly point to this inode. 965 * 966 * Returns: < 0 on error, 0 if the directory entry does not exists and 1 if it 967 * exists. 
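 *
 * Both the dir index item (looked up by 'index') and the named dir item are
 * checked; the entry only counts as present when both exist and point to
 * 'objectid'.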
968 */ 969 static noinline int inode_in_dir(struct btrfs_root *root, 970 struct btrfs_path *path, 971 u64 dirid, u64 objectid, u64 index, 972 struct fscrypt_str *name) 973 { 974 struct btrfs_dir_item *di; 975 struct btrfs_key location; 976 int ret = 0; 977 978 di = btrfs_lookup_dir_index_item(NULL, root, path, dirid, 979 index, name, 0); 980 if (IS_ERR(di)) { 981 ret = PTR_ERR(di); 982 goto out; 983 } else if (di) { 984 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location); 985 if (location.objectid != objectid) 986 goto out; 987 } else { 988 goto out; 989 } 990 991 btrfs_release_path(path); 992 di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, 0); 993 if (IS_ERR(di)) { 994 ret = PTR_ERR(di); 995 goto out; 996 } else if (di) { 997 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location); 998 if (location.objectid == objectid) 999 ret = 1; 1000 } 1001 out: 1002 btrfs_release_path(path); 1003 return ret; 1004 } 1005 1006 /* 1007 * helper function to check a log tree for a named back reference in 1008 * an inode. This is used to decide if a back reference that is 1009 * found in the subvolume conflicts with what we find in the log. 1010 * 1011 * inode backreferences may have multiple refs in a single item, 1012 * during replay we process one reference at a time, and we don't 1013 * want to delete valid links to a file from the subvolume if that 1014 * link is also in the log. 1015 */ 1016 static noinline int backref_in_log(struct btrfs_root *log, 1017 struct btrfs_key *key, 1018 u64 ref_objectid, 1019 const struct fscrypt_str *name) 1020 { 1021 struct btrfs_path *path; 1022 int ret; 1023 1024 path = btrfs_alloc_path(); 1025 if (!path) 1026 return -ENOMEM; 1027 1028 ret = btrfs_search_slot(NULL, log, key, path, 0, 0); 1029 if (ret < 0) { 1030 goto out; 1031 } else if (ret == 1) { 1032 ret = 0; 1033 goto out; 1034 } 1035 1036 if (key->type == BTRFS_INODE_EXTREF_KEY) 1037 ret = !!btrfs_find_name_in_ext_backref(path->nodes[0], 1038 path->slots[0], 1039 ref_objectid, name); 1040 else 1041 ret = !!btrfs_find_name_in_backref(path->nodes[0], 1042 path->slots[0], name); 1043 out: 1044 btrfs_free_path(path); 1045 return ret; 1046 } 1047 1048 static inline int __add_inode_ref(struct btrfs_trans_handle *trans, 1049 struct btrfs_root *root, 1050 struct btrfs_path *path, 1051 struct btrfs_root *log_root, 1052 struct btrfs_inode *dir, 1053 struct btrfs_inode *inode, 1054 u64 inode_objectid, u64 parent_objectid, 1055 u64 ref_index, struct fscrypt_str *name) 1056 { 1057 int ret; 1058 struct extent_buffer *leaf; 1059 struct btrfs_dir_item *di; 1060 struct btrfs_key search_key; 1061 struct btrfs_inode_extref *extref; 1062 1063 again: 1064 /* Search old style refs */ 1065 search_key.objectid = inode_objectid; 1066 search_key.type = BTRFS_INODE_REF_KEY; 1067 search_key.offset = parent_objectid; 1068 ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); 1069 if (ret == 0) { 1070 struct btrfs_inode_ref *victim_ref; 1071 unsigned long ptr; 1072 unsigned long ptr_end; 1073 1074 leaf = path->nodes[0]; 1075 1076 /* are we trying to overwrite a back ref for the root directory 1077 * if so, just jump out, we're done 1078 */ 1079 if (search_key.objectid == search_key.offset) 1080 return 1; 1081 1082 /* check all the names in this back reference to see 1083 * if they are in the log. 
if so, we allow them to stay 1084 * otherwise they must be unlinked as a conflict 1085 */ 1086 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); 1087 ptr_end = ptr + btrfs_item_size(leaf, path->slots[0]); 1088 while (ptr < ptr_end) { 1089 struct fscrypt_str victim_name; 1090 1091 victim_ref = (struct btrfs_inode_ref *)ptr; 1092 ret = read_alloc_one_name(leaf, (victim_ref + 1), 1093 btrfs_inode_ref_name_len(leaf, victim_ref), 1094 &victim_name); 1095 if (ret) 1096 return ret; 1097 1098 ret = backref_in_log(log_root, &search_key, 1099 parent_objectid, &victim_name); 1100 if (ret < 0) { 1101 kfree(victim_name.name); 1102 return ret; 1103 } else if (!ret) { 1104 inc_nlink(&inode->vfs_inode); 1105 btrfs_release_path(path); 1106 1107 ret = unlink_inode_for_log_replay(trans, dir, inode, 1108 &victim_name); 1109 kfree(victim_name.name); 1110 if (ret) 1111 return ret; 1112 goto again; 1113 } 1114 kfree(victim_name.name); 1115 1116 ptr = (unsigned long)(victim_ref + 1) + victim_name.len; 1117 } 1118 } 1119 btrfs_release_path(path); 1120 1121 /* Same search but for extended refs */ 1122 extref = btrfs_lookup_inode_extref(NULL, root, path, name, 1123 inode_objectid, parent_objectid, 0, 1124 0); 1125 if (IS_ERR(extref)) { 1126 return PTR_ERR(extref); 1127 } else if (extref) { 1128 u32 item_size; 1129 u32 cur_offset = 0; 1130 unsigned long base; 1131 struct inode *victim_parent; 1132 1133 leaf = path->nodes[0]; 1134 1135 item_size = btrfs_item_size(leaf, path->slots[0]); 1136 base = btrfs_item_ptr_offset(leaf, path->slots[0]); 1137 1138 while (cur_offset < item_size) { 1139 struct fscrypt_str victim_name; 1140 1141 extref = (struct btrfs_inode_extref *)(base + cur_offset); 1142 1143 if (btrfs_inode_extref_parent(leaf, extref) != parent_objectid) 1144 goto next; 1145 1146 ret = read_alloc_one_name(leaf, &extref->name, 1147 btrfs_inode_extref_name_len(leaf, extref), 1148 &victim_name); 1149 if (ret) 1150 return ret; 1151 1152 search_key.objectid = inode_objectid; 1153 search_key.type = BTRFS_INODE_EXTREF_KEY; 1154 search_key.offset = btrfs_extref_hash(parent_objectid, 1155 victim_name.name, 1156 victim_name.len); 1157 ret = backref_in_log(log_root, &search_key, 1158 parent_objectid, &victim_name); 1159 if (ret < 0) { 1160 kfree(victim_name.name); 1161 return ret; 1162 } else if (!ret) { 1163 ret = -ENOENT; 1164 victim_parent = read_one_inode(root, 1165 parent_objectid); 1166 if (victim_parent) { 1167 inc_nlink(&inode->vfs_inode); 1168 btrfs_release_path(path); 1169 1170 ret = unlink_inode_for_log_replay(trans, 1171 BTRFS_I(victim_parent), 1172 inode, &victim_name); 1173 } 1174 iput(victim_parent); 1175 kfree(victim_name.name); 1176 if (ret) 1177 return ret; 1178 goto again; 1179 } 1180 kfree(victim_name.name); 1181 next: 1182 cur_offset += victim_name.len + sizeof(*extref); 1183 } 1184 } 1185 btrfs_release_path(path); 1186 1187 /* look for a conflicting sequence number */ 1188 di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir), 1189 ref_index, name, 0); 1190 if (IS_ERR(di)) { 1191 return PTR_ERR(di); 1192 } else if (di) { 1193 ret = drop_one_dir_item(trans, path, dir, di); 1194 if (ret) 1195 return ret; 1196 } 1197 btrfs_release_path(path); 1198 1199 /* look for a conflicting name */ 1200 di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir), name, 0); 1201 if (IS_ERR(di)) { 1202 return PTR_ERR(di); 1203 } else if (di) { 1204 ret = drop_one_dir_item(trans, path, dir, di); 1205 if (ret) 1206 return ret; 1207 } 1208 btrfs_release_path(path); 1209 1210 return 0; 1211 } 1212 1213 
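/*
 * Helpers to pull the name (and, optionally, the index and parent directory
 * objectid) out of an inode ref or inode extref item in a log tree leaf. The
 * name is copied into a freshly allocated buffer that the caller must kfree().
 */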
static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr, 1214 struct fscrypt_str *name, u64 *index, 1215 u64 *parent_objectid) 1216 { 1217 struct btrfs_inode_extref *extref; 1218 int ret; 1219 1220 extref = (struct btrfs_inode_extref *)ref_ptr; 1221 1222 ret = read_alloc_one_name(eb, &extref->name, 1223 btrfs_inode_extref_name_len(eb, extref), name); 1224 if (ret) 1225 return ret; 1226 1227 if (index) 1228 *index = btrfs_inode_extref_index(eb, extref); 1229 if (parent_objectid) 1230 *parent_objectid = btrfs_inode_extref_parent(eb, extref); 1231 1232 return 0; 1233 } 1234 1235 static int ref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr, 1236 struct fscrypt_str *name, u64 *index) 1237 { 1238 struct btrfs_inode_ref *ref; 1239 int ret; 1240 1241 ref = (struct btrfs_inode_ref *)ref_ptr; 1242 1243 ret = read_alloc_one_name(eb, ref + 1, btrfs_inode_ref_name_len(eb, ref), 1244 name); 1245 if (ret) 1246 return ret; 1247 1248 if (index) 1249 *index = btrfs_inode_ref_index(eb, ref); 1250 1251 return 0; 1252 } 1253 1254 /* 1255 * Take an inode reference item from the log tree and iterate all names from the 1256 * inode reference item in the subvolume tree with the same key (if it exists). 1257 * For any name that is not in the inode reference item from the log tree, do a 1258 * proper unlink of that name (that is, remove its entry from the inode 1259 * reference item and both dir index keys). 1260 */ 1261 static int unlink_old_inode_refs(struct btrfs_trans_handle *trans, 1262 struct btrfs_root *root, 1263 struct btrfs_path *path, 1264 struct btrfs_inode *inode, 1265 struct extent_buffer *log_eb, 1266 int log_slot, 1267 struct btrfs_key *key) 1268 { 1269 int ret; 1270 unsigned long ref_ptr; 1271 unsigned long ref_end; 1272 struct extent_buffer *eb; 1273 1274 again: 1275 btrfs_release_path(path); 1276 ret = btrfs_search_slot(NULL, root, key, path, 0, 0); 1277 if (ret > 0) { 1278 ret = 0; 1279 goto out; 1280 } 1281 if (ret < 0) 1282 goto out; 1283 1284 eb = path->nodes[0]; 1285 ref_ptr = btrfs_item_ptr_offset(eb, path->slots[0]); 1286 ref_end = ref_ptr + btrfs_item_size(eb, path->slots[0]); 1287 while (ref_ptr < ref_end) { 1288 struct fscrypt_str name; 1289 u64 parent_id; 1290 1291 if (key->type == BTRFS_INODE_EXTREF_KEY) { 1292 ret = extref_get_fields(eb, ref_ptr, &name, 1293 NULL, &parent_id); 1294 } else { 1295 parent_id = key->offset; 1296 ret = ref_get_fields(eb, ref_ptr, &name, NULL); 1297 } 1298 if (ret) 1299 goto out; 1300 1301 if (key->type == BTRFS_INODE_EXTREF_KEY) 1302 ret = !!btrfs_find_name_in_ext_backref(log_eb, log_slot, 1303 parent_id, &name); 1304 else 1305 ret = !!btrfs_find_name_in_backref(log_eb, log_slot, &name); 1306 1307 if (!ret) { 1308 struct inode *dir; 1309 1310 btrfs_release_path(path); 1311 dir = read_one_inode(root, parent_id); 1312 if (!dir) { 1313 ret = -ENOENT; 1314 kfree(name.name); 1315 goto out; 1316 } 1317 ret = unlink_inode_for_log_replay(trans, BTRFS_I(dir), 1318 inode, &name); 1319 kfree(name.name); 1320 iput(dir); 1321 if (ret) 1322 goto out; 1323 goto again; 1324 } 1325 1326 kfree(name.name); 1327 ref_ptr += name.len; 1328 if (key->type == BTRFS_INODE_EXTREF_KEY) 1329 ref_ptr += sizeof(struct btrfs_inode_extref); 1330 else 1331 ref_ptr += sizeof(struct btrfs_inode_ref); 1332 } 1333 ret = 0; 1334 out: 1335 btrfs_release_path(path); 1336 return ret; 1337 } 1338 1339 /* 1340 * replay one inode back reference item found in the log tree. 1341 * eb, slot and key refer to the buffer and key found in the log tree. 
1342 * root is the destination we are replaying into, and path is for temp 1343 * use by this function. (it should be released on return). 1344 */ 1345 static noinline int add_inode_ref(struct btrfs_trans_handle *trans, 1346 struct btrfs_root *root, 1347 struct btrfs_root *log, 1348 struct btrfs_path *path, 1349 struct extent_buffer *eb, int slot, 1350 struct btrfs_key *key) 1351 { 1352 struct inode *dir = NULL; 1353 struct inode *inode = NULL; 1354 unsigned long ref_ptr; 1355 unsigned long ref_end; 1356 struct fscrypt_str name; 1357 int ret; 1358 int log_ref_ver = 0; 1359 u64 parent_objectid; 1360 u64 inode_objectid; 1361 u64 ref_index = 0; 1362 int ref_struct_size; 1363 1364 ref_ptr = btrfs_item_ptr_offset(eb, slot); 1365 ref_end = ref_ptr + btrfs_item_size(eb, slot); 1366 1367 if (key->type == BTRFS_INODE_EXTREF_KEY) { 1368 struct btrfs_inode_extref *r; 1369 1370 ref_struct_size = sizeof(struct btrfs_inode_extref); 1371 log_ref_ver = 1; 1372 r = (struct btrfs_inode_extref *)ref_ptr; 1373 parent_objectid = btrfs_inode_extref_parent(eb, r); 1374 } else { 1375 ref_struct_size = sizeof(struct btrfs_inode_ref); 1376 parent_objectid = key->offset; 1377 } 1378 inode_objectid = key->objectid; 1379 1380 /* 1381 * it is possible that we didn't log all the parent directories 1382 * for a given inode. If we don't find the dir, just don't 1383 * copy the back ref in. The link count fixup code will take 1384 * care of the rest 1385 */ 1386 dir = read_one_inode(root, parent_objectid); 1387 if (!dir) { 1388 ret = -ENOENT; 1389 goto out; 1390 } 1391 1392 inode = read_one_inode(root, inode_objectid); 1393 if (!inode) { 1394 ret = -EIO; 1395 goto out; 1396 } 1397 1398 while (ref_ptr < ref_end) { 1399 if (log_ref_ver) { 1400 ret = extref_get_fields(eb, ref_ptr, &name, 1401 &ref_index, &parent_objectid); 1402 /* 1403 * parent object can change from one array 1404 * item to another. 1405 */ 1406 if (!dir) 1407 dir = read_one_inode(root, parent_objectid); 1408 if (!dir) { 1409 ret = -ENOENT; 1410 goto out; 1411 } 1412 } else { 1413 ret = ref_get_fields(eb, ref_ptr, &name, &ref_index); 1414 } 1415 if (ret) 1416 goto out; 1417 1418 ret = inode_in_dir(root, path, btrfs_ino(BTRFS_I(dir)), 1419 btrfs_ino(BTRFS_I(inode)), ref_index, &name); 1420 if (ret < 0) { 1421 goto out; 1422 } else if (ret == 0) { 1423 /* 1424 * look for a conflicting back reference in the 1425 * metadata. if we find one we have to unlink that name 1426 * of the file before we add our new link. Later on, we 1427 * overwrite any existing back reference, and we don't 1428 * want to create dangling pointers in the directory. 1429 */ 1430 ret = __add_inode_ref(trans, root, path, log, 1431 BTRFS_I(dir), BTRFS_I(inode), 1432 inode_objectid, parent_objectid, 1433 ref_index, &name); 1434 if (ret) { 1435 if (ret == 1) 1436 ret = 0; 1437 goto out; 1438 } 1439 1440 /* insert our name */ 1441 ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), 1442 &name, 0, ref_index); 1443 if (ret) 1444 goto out; 1445 1446 ret = btrfs_update_inode(trans, BTRFS_I(inode)); 1447 if (ret) 1448 goto out; 1449 } 1450 /* Else, ret == 1, we already have a perfect match, we're done. 
*/ 1451 1452 ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + name.len; 1453 kfree(name.name); 1454 name.name = NULL; 1455 if (log_ref_ver) { 1456 iput(dir); 1457 dir = NULL; 1458 } 1459 } 1460 1461 /* 1462 * Before we overwrite the inode reference item in the subvolume tree 1463 * with the item from the log tree, we must unlink all names from the 1464 * parent directory that are in the subvolume's tree inode reference 1465 * item, otherwise we end up with an inconsistent subvolume tree where 1466 * dir index entries exist for a name but there is no inode reference 1467 * item with the same name. 1468 */ 1469 ret = unlink_old_inode_refs(trans, root, path, BTRFS_I(inode), eb, slot, 1470 key); 1471 if (ret) 1472 goto out; 1473 1474 /* finally write the back reference in the inode */ 1475 ret = overwrite_item(trans, root, path, eb, slot, key); 1476 out: 1477 btrfs_release_path(path); 1478 kfree(name.name); 1479 iput(dir); 1480 iput(inode); 1481 return ret; 1482 } 1483 1484 static int count_inode_extrefs(struct btrfs_inode *inode, struct btrfs_path *path) 1485 { 1486 int ret = 0; 1487 int name_len; 1488 unsigned int nlink = 0; 1489 u32 item_size; 1490 u32 cur_offset = 0; 1491 u64 inode_objectid = btrfs_ino(inode); 1492 u64 offset = 0; 1493 unsigned long ptr; 1494 struct btrfs_inode_extref *extref; 1495 struct extent_buffer *leaf; 1496 1497 while (1) { 1498 ret = btrfs_find_one_extref(inode->root, inode_objectid, offset, 1499 path, &extref, &offset); 1500 if (ret) 1501 break; 1502 1503 leaf = path->nodes[0]; 1504 item_size = btrfs_item_size(leaf, path->slots[0]); 1505 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); 1506 cur_offset = 0; 1507 1508 while (cur_offset < item_size) { 1509 extref = (struct btrfs_inode_extref *) (ptr + cur_offset); 1510 name_len = btrfs_inode_extref_name_len(leaf, extref); 1511 1512 nlink++; 1513 1514 cur_offset += name_len + sizeof(*extref); 1515 } 1516 1517 offset++; 1518 btrfs_release_path(path); 1519 } 1520 btrfs_release_path(path); 1521 1522 if (ret < 0 && ret != -ENOENT) 1523 return ret; 1524 return nlink; 1525 } 1526 1527 static int count_inode_refs(struct btrfs_inode *inode, struct btrfs_path *path) 1528 { 1529 int ret; 1530 struct btrfs_key key; 1531 unsigned int nlink = 0; 1532 unsigned long ptr; 1533 unsigned long ptr_end; 1534 int name_len; 1535 u64 ino = btrfs_ino(inode); 1536 1537 key.objectid = ino; 1538 key.type = BTRFS_INODE_REF_KEY; 1539 key.offset = (u64)-1; 1540 1541 while (1) { 1542 ret = btrfs_search_slot(NULL, inode->root, &key, path, 0, 0); 1543 if (ret < 0) 1544 break; 1545 if (ret > 0) { 1546 if (path->slots[0] == 0) 1547 break; 1548 path->slots[0]--; 1549 } 1550 process_slot: 1551 btrfs_item_key_to_cpu(path->nodes[0], &key, 1552 path->slots[0]); 1553 if (key.objectid != ino || 1554 key.type != BTRFS_INODE_REF_KEY) 1555 break; 1556 ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); 1557 ptr_end = ptr + btrfs_item_size(path->nodes[0], 1558 path->slots[0]); 1559 while (ptr < ptr_end) { 1560 struct btrfs_inode_ref *ref; 1561 1562 ref = (struct btrfs_inode_ref *)ptr; 1563 name_len = btrfs_inode_ref_name_len(path->nodes[0], 1564 ref); 1565 ptr = (unsigned long)(ref + 1) + name_len; 1566 nlink++; 1567 } 1568 1569 if (key.offset == 0) 1570 break; 1571 if (path->slots[0] > 0) { 1572 path->slots[0]--; 1573 goto process_slot; 1574 } 1575 key.offset--; 1576 btrfs_release_path(path); 1577 } 1578 btrfs_release_path(path); 1579 1580 return nlink; 1581 } 1582 1583 /* 1584 * There are a few corners where the link count of the file can't 
1585 * be properly maintained during replay. So, instead of adding 1586 * lots of complexity to the log code, we just scan the backrefs 1587 * for any file that has been through replay. 1588 * 1589 * The scan will update the link count on the inode to reflect the 1590 * number of back refs found. If it goes down to zero, the iput 1591 * will free the inode. 1592 */ 1593 static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans, 1594 struct inode *inode) 1595 { 1596 struct btrfs_root *root = BTRFS_I(inode)->root; 1597 struct btrfs_path *path; 1598 int ret; 1599 u64 nlink = 0; 1600 u64 ino = btrfs_ino(BTRFS_I(inode)); 1601 1602 path = btrfs_alloc_path(); 1603 if (!path) 1604 return -ENOMEM; 1605 1606 ret = count_inode_refs(BTRFS_I(inode), path); 1607 if (ret < 0) 1608 goto out; 1609 1610 nlink = ret; 1611 1612 ret = count_inode_extrefs(BTRFS_I(inode), path); 1613 if (ret < 0) 1614 goto out; 1615 1616 nlink += ret; 1617 1618 ret = 0; 1619 1620 if (nlink != inode->i_nlink) { 1621 set_nlink(inode, nlink); 1622 ret = btrfs_update_inode(trans, BTRFS_I(inode)); 1623 if (ret) 1624 goto out; 1625 } 1626 BTRFS_I(inode)->index_cnt = (u64)-1; 1627 1628 if (inode->i_nlink == 0) { 1629 if (S_ISDIR(inode->i_mode)) { 1630 ret = replay_dir_deletes(trans, root, NULL, path, 1631 ino, 1); 1632 if (ret) 1633 goto out; 1634 } 1635 ret = btrfs_insert_orphan_item(trans, root, ino); 1636 if (ret == -EEXIST) 1637 ret = 0; 1638 } 1639 1640 out: 1641 btrfs_free_path(path); 1642 return ret; 1643 } 1644 1645 static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans, 1646 struct btrfs_root *root, 1647 struct btrfs_path *path) 1648 { 1649 int ret; 1650 struct btrfs_key key; 1651 struct inode *inode; 1652 1653 key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID; 1654 key.type = BTRFS_ORPHAN_ITEM_KEY; 1655 key.offset = (u64)-1; 1656 while (1) { 1657 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 1658 if (ret < 0) 1659 break; 1660 1661 if (ret == 1) { 1662 ret = 0; 1663 if (path->slots[0] == 0) 1664 break; 1665 path->slots[0]--; 1666 } 1667 1668 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 1669 if (key.objectid != BTRFS_TREE_LOG_FIXUP_OBJECTID || 1670 key.type != BTRFS_ORPHAN_ITEM_KEY) 1671 break; 1672 1673 ret = btrfs_del_item(trans, root, path); 1674 if (ret) 1675 break; 1676 1677 btrfs_release_path(path); 1678 inode = read_one_inode(root, key.offset); 1679 if (!inode) { 1680 ret = -EIO; 1681 break; 1682 } 1683 1684 ret = fixup_inode_link_count(trans, inode); 1685 iput(inode); 1686 if (ret) 1687 break; 1688 1689 /* 1690 * fixup on a directory may create new entries, 1691 * make sure we always look for the highset possible 1692 * offset 1693 */ 1694 key.offset = (u64)-1; 1695 } 1696 btrfs_release_path(path); 1697 return ret; 1698 } 1699 1700 1701 /* 1702 * record a given inode in the fixup dir so we can check its link 1703 * count when replay is done. 
The link count is incremented here 1704 * so the inode won't go away until we check it 1705 */ 1706 static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans, 1707 struct btrfs_root *root, 1708 struct btrfs_path *path, 1709 u64 objectid) 1710 { 1711 struct btrfs_key key; 1712 int ret = 0; 1713 struct inode *inode; 1714 1715 inode = read_one_inode(root, objectid); 1716 if (!inode) 1717 return -EIO; 1718 1719 key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID; 1720 key.type = BTRFS_ORPHAN_ITEM_KEY; 1721 key.offset = objectid; 1722 1723 ret = btrfs_insert_empty_item(trans, root, path, &key, 0); 1724 1725 btrfs_release_path(path); 1726 if (ret == 0) { 1727 if (!inode->i_nlink) 1728 set_nlink(inode, 1); 1729 else 1730 inc_nlink(inode); 1731 ret = btrfs_update_inode(trans, BTRFS_I(inode)); 1732 } else if (ret == -EEXIST) { 1733 ret = 0; 1734 } 1735 iput(inode); 1736 1737 return ret; 1738 } 1739 1740 /* 1741 * when replaying the log for a directory, we only insert names 1742 * for inodes that actually exist. This means an fsync on a directory 1743 * does not implicitly fsync all the new files in it 1744 */ 1745 static noinline int insert_one_name(struct btrfs_trans_handle *trans, 1746 struct btrfs_root *root, 1747 u64 dirid, u64 index, 1748 const struct fscrypt_str *name, 1749 struct btrfs_key *location) 1750 { 1751 struct inode *inode; 1752 struct inode *dir; 1753 int ret; 1754 1755 inode = read_one_inode(root, location->objectid); 1756 if (!inode) 1757 return -ENOENT; 1758 1759 dir = read_one_inode(root, dirid); 1760 if (!dir) { 1761 iput(inode); 1762 return -EIO; 1763 } 1764 1765 ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), name, 1766 1, index); 1767 1768 /* FIXME, put inode into FIXUP list */ 1769 1770 iput(inode); 1771 iput(dir); 1772 return ret; 1773 } 1774 1775 static int delete_conflicting_dir_entry(struct btrfs_trans_handle *trans, 1776 struct btrfs_inode *dir, 1777 struct btrfs_path *path, 1778 struct btrfs_dir_item *dst_di, 1779 const struct btrfs_key *log_key, 1780 u8 log_flags, 1781 bool exists) 1782 { 1783 struct btrfs_key found_key; 1784 1785 btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key); 1786 /* The existing dentry points to the same inode, don't delete it. */ 1787 if (found_key.objectid == log_key->objectid && 1788 found_key.type == log_key->type && 1789 found_key.offset == log_key->offset && 1790 btrfs_dir_flags(path->nodes[0], dst_di) == log_flags) 1791 return 1; 1792 1793 /* 1794 * Don't drop the conflicting directory entry if the inode for the new 1795 * entry doesn't exist. 1796 */ 1797 if (!exists) 1798 return 0; 1799 1800 return drop_one_dir_item(trans, path, dir, dst_di); 1801 } 1802 1803 /* 1804 * take a single entry in a log directory item and replay it into 1805 * the subvolume. 1806 * 1807 * if a conflicting item exists in the subdirectory already, 1808 * the inode it points to is unlinked and put into the link count 1809 * fix up tree. 1810 * 1811 * If a name from the log points to a file or directory that does 1812 * not exist in the FS, it is skipped. fsyncs on directories 1813 * do not force down inodes inside that directory, just changes to the 1814 * names or unlinks in a directory. 1815 * 1816 * Returns < 0 on error, 0 if the name wasn't replayed (dentry points to a 1817 * non-existing inode) and 1 if the name was replayed. 
1818 */ 1819 static noinline int replay_one_name(struct btrfs_trans_handle *trans, 1820 struct btrfs_root *root, 1821 struct btrfs_path *path, 1822 struct extent_buffer *eb, 1823 struct btrfs_dir_item *di, 1824 struct btrfs_key *key) 1825 { 1826 struct fscrypt_str name; 1827 struct btrfs_dir_item *dir_dst_di; 1828 struct btrfs_dir_item *index_dst_di; 1829 bool dir_dst_matches = false; 1830 bool index_dst_matches = false; 1831 struct btrfs_key log_key; 1832 struct btrfs_key search_key; 1833 struct inode *dir; 1834 u8 log_flags; 1835 bool exists; 1836 int ret; 1837 bool update_size = true; 1838 bool name_added = false; 1839 1840 dir = read_one_inode(root, key->objectid); 1841 if (!dir) 1842 return -EIO; 1843 1844 ret = read_alloc_one_name(eb, di + 1, btrfs_dir_name_len(eb, di), &name); 1845 if (ret) 1846 goto out; 1847 1848 log_flags = btrfs_dir_flags(eb, di); 1849 btrfs_dir_item_key_to_cpu(eb, di, &log_key); 1850 ret = btrfs_lookup_inode(trans, root, path, &log_key, 0); 1851 btrfs_release_path(path); 1852 if (ret < 0) 1853 goto out; 1854 exists = (ret == 0); 1855 ret = 0; 1856 1857 dir_dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid, 1858 &name, 1); 1859 if (IS_ERR(dir_dst_di)) { 1860 ret = PTR_ERR(dir_dst_di); 1861 goto out; 1862 } else if (dir_dst_di) { 1863 ret = delete_conflicting_dir_entry(trans, BTRFS_I(dir), path, 1864 dir_dst_di, &log_key, 1865 log_flags, exists); 1866 if (ret < 0) 1867 goto out; 1868 dir_dst_matches = (ret == 1); 1869 } 1870 1871 btrfs_release_path(path); 1872 1873 index_dst_di = btrfs_lookup_dir_index_item(trans, root, path, 1874 key->objectid, key->offset, 1875 &name, 1); 1876 if (IS_ERR(index_dst_di)) { 1877 ret = PTR_ERR(index_dst_di); 1878 goto out; 1879 } else if (index_dst_di) { 1880 ret = delete_conflicting_dir_entry(trans, BTRFS_I(dir), path, 1881 index_dst_di, &log_key, 1882 log_flags, exists); 1883 if (ret < 0) 1884 goto out; 1885 index_dst_matches = (ret == 1); 1886 } 1887 1888 btrfs_release_path(path); 1889 1890 if (dir_dst_matches && index_dst_matches) { 1891 ret = 0; 1892 update_size = false; 1893 goto out; 1894 } 1895 1896 /* 1897 * Check if the inode reference exists in the log for the given name, 1898 * inode and parent inode 1899 */ 1900 search_key.objectid = log_key.objectid; 1901 search_key.type = BTRFS_INODE_REF_KEY; 1902 search_key.offset = key->objectid; 1903 ret = backref_in_log(root->log_root, &search_key, 0, &name); 1904 if (ret < 0) { 1905 goto out; 1906 } else if (ret) { 1907 /* The dentry will be added later. */ 1908 ret = 0; 1909 update_size = false; 1910 goto out; 1911 } 1912 1913 search_key.objectid = log_key.objectid; 1914 search_key.type = BTRFS_INODE_EXTREF_KEY; 1915 search_key.offset = key->objectid; 1916 ret = backref_in_log(root->log_root, &search_key, key->objectid, &name); 1917 if (ret < 0) { 1918 goto out; 1919 } else if (ret) { 1920 /* The dentry will be added later. 
		 */
		ret = 0;
		update_size = false;
		goto out;
	}
	btrfs_release_path(path);
	ret = insert_one_name(trans, root, key->objectid, key->offset,
			      &name, &log_key);
	if (ret && ret != -ENOENT && ret != -EEXIST)
		goto out;
	if (!ret)
		name_added = true;
	update_size = false;
	ret = 0;

out:
	if (!ret && update_size) {
		btrfs_i_size_write(BTRFS_I(dir), dir->i_size + name.len * 2);
		ret = btrfs_update_inode(trans, BTRFS_I(dir));
	}
	kfree(name.name);
	iput(dir);
	if (!ret && name_added)
		ret = 1;
	return ret;
}

/* Replay one dir item from a BTRFS_DIR_INDEX_KEY key. */
static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
					struct btrfs_root *root,
					struct btrfs_path *path,
					struct extent_buffer *eb, int slot,
					struct btrfs_key *key)
{
	int ret;
	struct btrfs_dir_item *di;

	/* We only log dir index keys, which only contain a single dir item. */
	ASSERT(key->type == BTRFS_DIR_INDEX_KEY);

	di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item);
	ret = replay_one_name(trans, root, path, eb, di, key);
	if (ret < 0)
		return ret;

	/*
	 * If this entry refers to a non-directory (directories can not have a
	 * link count > 1) and it was added in the transaction that was not
	 * committed, make sure we fix up the link count of the inode the entry
	 * points to. Otherwise something like the following would result in a
	 * directory pointing to an inode with a wrong link count that does not
	 * account for this dir entry:
	 *
	 * mkdir testdir
	 * touch testdir/foo
	 * touch testdir/bar
	 * sync
	 *
	 * ln testdir/bar testdir/bar_link
	 * ln testdir/foo testdir/foo_link
	 * xfs_io -c "fsync" testdir/bar
	 *
	 * <power failure>
	 *
	 * mount fs, log replay happens
	 *
	 * File foo would remain with a link count of 1 when it has two entries
	 * pointing to it in the directory testdir. This would make it impossible
	 * to ever delete the parent directory, as it would result in stale
	 * dentries that can never be deleted.
	 */
	if (ret == 1 && btrfs_dir_ftype(eb, di) != BTRFS_FT_DIR) {
		struct btrfs_path *fixup_path;
		struct btrfs_key di_key;

		fixup_path = btrfs_alloc_path();
		if (!fixup_path)
			return -ENOMEM;

		btrfs_dir_item_key_to_cpu(eb, di, &di_key);
		ret = link_to_fixup_dir(trans, root, fixup_path, di_key.objectid);
		btrfs_free_path(fixup_path);
	}

	return ret;
}

/*
 * directory replay has two parts. There are the standard directory
 * items in the log copied from the subvolume, and range items
 * created in the log while the subvolume was logged.
 *
 * The range items tell us which parts of the key space the log
 * is authoritative for. During replay, if a key in the subvolume
 * directory is in a logged range item, but not actually in the log,
 * that means it was deleted from the directory before the fsync
 * and should be removed.
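 *
 * For example, suppose the log has a dir log item for this directory covering
 * index offsets [2, 10], and the subvolume directory still contains an index
 * key at offset 5 with no matching entry in the log. That entry must have been
 * deleted before the fsync, so replay removes it, after first recording the
 * inode it pointed to in the link count fixup tree.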
2017 */ 2018 static noinline int find_dir_range(struct btrfs_root *root, 2019 struct btrfs_path *path, 2020 u64 dirid, 2021 u64 *start_ret, u64 *end_ret) 2022 { 2023 struct btrfs_key key; 2024 u64 found_end; 2025 struct btrfs_dir_log_item *item; 2026 int ret; 2027 int nritems; 2028 2029 if (*start_ret == (u64)-1) 2030 return 1; 2031 2032 key.objectid = dirid; 2033 key.type = BTRFS_DIR_LOG_INDEX_KEY; 2034 key.offset = *start_ret; 2035 2036 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 2037 if (ret < 0) 2038 goto out; 2039 if (ret > 0) { 2040 if (path->slots[0] == 0) 2041 goto out; 2042 path->slots[0]--; 2043 } 2044 if (ret != 0) 2045 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 2046 2047 if (key.type != BTRFS_DIR_LOG_INDEX_KEY || key.objectid != dirid) { 2048 ret = 1; 2049 goto next; 2050 } 2051 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 2052 struct btrfs_dir_log_item); 2053 found_end = btrfs_dir_log_end(path->nodes[0], item); 2054 2055 if (*start_ret >= key.offset && *start_ret <= found_end) { 2056 ret = 0; 2057 *start_ret = key.offset; 2058 *end_ret = found_end; 2059 goto out; 2060 } 2061 ret = 1; 2062 next: 2063 /* check the next slot in the tree to see if it is a valid item */ 2064 nritems = btrfs_header_nritems(path->nodes[0]); 2065 path->slots[0]++; 2066 if (path->slots[0] >= nritems) { 2067 ret = btrfs_next_leaf(root, path); 2068 if (ret) 2069 goto out; 2070 } 2071 2072 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 2073 2074 if (key.type != BTRFS_DIR_LOG_INDEX_KEY || key.objectid != dirid) { 2075 ret = 1; 2076 goto out; 2077 } 2078 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 2079 struct btrfs_dir_log_item); 2080 found_end = btrfs_dir_log_end(path->nodes[0], item); 2081 *start_ret = key.offset; 2082 *end_ret = found_end; 2083 ret = 0; 2084 out: 2085 btrfs_release_path(path); 2086 return ret; 2087 } 2088 2089 /* 2090 * this looks for a given directory item in the log. If the directory 2091 * item is not in the log, the item is removed and the inode it points 2092 * to is unlinked 2093 */ 2094 static noinline int check_item_in_log(struct btrfs_trans_handle *trans, 2095 struct btrfs_root *log, 2096 struct btrfs_path *path, 2097 struct btrfs_path *log_path, 2098 struct inode *dir, 2099 struct btrfs_key *dir_key) 2100 { 2101 struct btrfs_root *root = BTRFS_I(dir)->root; 2102 int ret; 2103 struct extent_buffer *eb; 2104 int slot; 2105 struct btrfs_dir_item *di; 2106 struct fscrypt_str name; 2107 struct inode *inode = NULL; 2108 struct btrfs_key location; 2109 2110 /* 2111 * Currently we only log dir index keys. Even if we replay a log created 2112 * by an older kernel that logged both dir index and dir item keys, all 2113 * we need to do is process the dir index keys, we (and our caller) can 2114 * safely ignore dir item keys (key type BTRFS_DIR_ITEM_KEY). 2115 */ 2116 ASSERT(dir_key->type == BTRFS_DIR_INDEX_KEY); 2117 2118 eb = path->nodes[0]; 2119 slot = path->slots[0]; 2120 di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item); 2121 ret = read_alloc_one_name(eb, di + 1, btrfs_dir_name_len(eb, di), &name); 2122 if (ret) 2123 goto out; 2124 2125 if (log) { 2126 struct btrfs_dir_item *log_di; 2127 2128 log_di = btrfs_lookup_dir_index_item(trans, log, log_path, 2129 dir_key->objectid, 2130 dir_key->offset, &name, 0); 2131 if (IS_ERR(log_di)) { 2132 ret = PTR_ERR(log_di); 2133 goto out; 2134 } else if (log_di) { 2135 /* The dentry exists in the log, we have nothing to do. 
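Entries are only unlinked below when they are missing from the log, since the log is authoritative for this index range.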
*/ 2136 ret = 0; 2137 goto out; 2138 } 2139 } 2140 2141 btrfs_dir_item_key_to_cpu(eb, di, &location); 2142 btrfs_release_path(path); 2143 btrfs_release_path(log_path); 2144 inode = read_one_inode(root, location.objectid); 2145 if (!inode) { 2146 ret = -EIO; 2147 goto out; 2148 } 2149 2150 ret = link_to_fixup_dir(trans, root, path, location.objectid); 2151 if (ret) 2152 goto out; 2153 2154 inc_nlink(inode); 2155 ret = unlink_inode_for_log_replay(trans, BTRFS_I(dir), BTRFS_I(inode), 2156 &name); 2157 /* 2158 * Unlike dir item keys, dir index keys can only have one name (entry) in 2159 * them, as there are no key collisions since each key has a unique offset 2160 * (an index number), so we're done. 2161 */ 2162 out: 2163 btrfs_release_path(path); 2164 btrfs_release_path(log_path); 2165 kfree(name.name); 2166 iput(inode); 2167 return ret; 2168 } 2169 2170 static int replay_xattr_deletes(struct btrfs_trans_handle *trans, 2171 struct btrfs_root *root, 2172 struct btrfs_root *log, 2173 struct btrfs_path *path, 2174 const u64 ino) 2175 { 2176 struct btrfs_key search_key; 2177 struct btrfs_path *log_path; 2178 int i; 2179 int nritems; 2180 int ret; 2181 2182 log_path = btrfs_alloc_path(); 2183 if (!log_path) 2184 return -ENOMEM; 2185 2186 search_key.objectid = ino; 2187 search_key.type = BTRFS_XATTR_ITEM_KEY; 2188 search_key.offset = 0; 2189 again: 2190 ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); 2191 if (ret < 0) 2192 goto out; 2193 process_leaf: 2194 nritems = btrfs_header_nritems(path->nodes[0]); 2195 for (i = path->slots[0]; i < nritems; i++) { 2196 struct btrfs_key key; 2197 struct btrfs_dir_item *di; 2198 struct btrfs_dir_item *log_di; 2199 u32 total_size; 2200 u32 cur; 2201 2202 btrfs_item_key_to_cpu(path->nodes[0], &key, i); 2203 if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY) { 2204 ret = 0; 2205 goto out; 2206 } 2207 2208 di = btrfs_item_ptr(path->nodes[0], i, struct btrfs_dir_item); 2209 total_size = btrfs_item_size(path->nodes[0], i); 2210 cur = 0; 2211 while (cur < total_size) { 2212 u16 name_len = btrfs_dir_name_len(path->nodes[0], di); 2213 u16 data_len = btrfs_dir_data_len(path->nodes[0], di); 2214 u32 this_len = sizeof(*di) + name_len + data_len; 2215 char *name; 2216 2217 name = kmalloc(name_len, GFP_NOFS); 2218 if (!name) { 2219 ret = -ENOMEM; 2220 goto out; 2221 } 2222 read_extent_buffer(path->nodes[0], name, 2223 (unsigned long)(di + 1), name_len); 2224 2225 log_di = btrfs_lookup_xattr(NULL, log, log_path, ino, 2226 name, name_len, 0); 2227 btrfs_release_path(log_path); 2228 if (!log_di) { 2229 /* Doesn't exist in log tree, so delete it. 
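A NULL return from btrfs_lookup_xattr() means the name is not in the log; errors are returned as ERR_PTR and checked right after the name buffer is freed.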
*/ 2230 btrfs_release_path(path); 2231 di = btrfs_lookup_xattr(trans, root, path, ino, 2232 name, name_len, -1); 2233 kfree(name); 2234 if (IS_ERR(di)) { 2235 ret = PTR_ERR(di); 2236 goto out; 2237 } 2238 ASSERT(di); 2239 ret = btrfs_delete_one_dir_name(trans, root, 2240 path, di); 2241 if (ret) 2242 goto out; 2243 btrfs_release_path(path); 2244 search_key = key; 2245 goto again; 2246 } 2247 kfree(name); 2248 if (IS_ERR(log_di)) { 2249 ret = PTR_ERR(log_di); 2250 goto out; 2251 } 2252 cur += this_len; 2253 di = (struct btrfs_dir_item *)((char *)di + this_len); 2254 } 2255 } 2256 ret = btrfs_next_leaf(root, path); 2257 if (ret > 0) 2258 ret = 0; 2259 else if (ret == 0) 2260 goto process_leaf; 2261 out: 2262 btrfs_free_path(log_path); 2263 btrfs_release_path(path); 2264 return ret; 2265 } 2266 2267 2268 /* 2269 * deletion replay happens before we copy any new directory items 2270 * out of the log or out of backreferences from inodes. It 2271 * scans the log to find ranges of keys that log is authoritative for, 2272 * and then scans the directory to find items in those ranges that are 2273 * not present in the log. 2274 * 2275 * Anything we don't find in the log is unlinked and removed from the 2276 * directory. 2277 */ 2278 static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, 2279 struct btrfs_root *root, 2280 struct btrfs_root *log, 2281 struct btrfs_path *path, 2282 u64 dirid, int del_all) 2283 { 2284 u64 range_start; 2285 u64 range_end; 2286 int ret = 0; 2287 struct btrfs_key dir_key; 2288 struct btrfs_key found_key; 2289 struct btrfs_path *log_path; 2290 struct inode *dir; 2291 2292 dir_key.objectid = dirid; 2293 dir_key.type = BTRFS_DIR_INDEX_KEY; 2294 log_path = btrfs_alloc_path(); 2295 if (!log_path) 2296 return -ENOMEM; 2297 2298 dir = read_one_inode(root, dirid); 2299 /* it isn't an error if the inode isn't there, that can happen 2300 * because we replay the deletes before we copy in the inode item 2301 * from the log 2302 */ 2303 if (!dir) { 2304 btrfs_free_path(log_path); 2305 return 0; 2306 } 2307 2308 range_start = 0; 2309 range_end = 0; 2310 while (1) { 2311 if (del_all) 2312 range_end = (u64)-1; 2313 else { 2314 ret = find_dir_range(log, path, dirid, 2315 &range_start, &range_end); 2316 if (ret < 0) 2317 goto out; 2318 else if (ret > 0) 2319 break; 2320 } 2321 2322 dir_key.offset = range_start; 2323 while (1) { 2324 int nritems; 2325 ret = btrfs_search_slot(NULL, root, &dir_key, path, 2326 0, 0); 2327 if (ret < 0) 2328 goto out; 2329 2330 nritems = btrfs_header_nritems(path->nodes[0]); 2331 if (path->slots[0] >= nritems) { 2332 ret = btrfs_next_leaf(root, path); 2333 if (ret == 1) 2334 break; 2335 else if (ret < 0) 2336 goto out; 2337 } 2338 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 2339 path->slots[0]); 2340 if (found_key.objectid != dirid || 2341 found_key.type != dir_key.type) { 2342 ret = 0; 2343 goto out; 2344 } 2345 2346 if (found_key.offset > range_end) 2347 break; 2348 2349 ret = check_item_in_log(trans, log, path, 2350 log_path, dir, 2351 &found_key); 2352 if (ret) 2353 goto out; 2354 if (found_key.offset == (u64)-1) 2355 break; 2356 dir_key.offset = found_key.offset + 1; 2357 } 2358 btrfs_release_path(path); 2359 if (range_end == (u64)-1) 2360 break; 2361 range_start = range_end + 1; 2362 } 2363 ret = 0; 2364 out: 2365 btrfs_release_path(path); 2366 btrfs_free_path(log_path); 2367 iput(dir); 2368 return ret; 2369 } 2370 2371 /* 2372 * the process_func used to replay items from the log tree. This 2373 * gets called in two different stages. 
The first stage just looks 2374 * for inodes and makes sure they are all copied into the subvolume. 2375 * 2376 * The second stage copies all the other item types from the log into 2377 * the subvolume. The two stage approach is slower, but gets rid of 2378 * lots of complexity around inodes referencing other inodes that exist 2379 * only in the log (references come from either directory items or inode 2380 * back refs). 2381 */ 2382 static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, 2383 struct walk_control *wc, u64 gen, int level) 2384 { 2385 int nritems; 2386 struct btrfs_tree_parent_check check = { 2387 .transid = gen, 2388 .level = level 2389 }; 2390 struct btrfs_path *path; 2391 struct btrfs_root *root = wc->replay_dest; 2392 struct btrfs_key key; 2393 int i; 2394 int ret; 2395 2396 ret = btrfs_read_extent_buffer(eb, &check); 2397 if (ret) 2398 return ret; 2399 2400 level = btrfs_header_level(eb); 2401 2402 if (level != 0) 2403 return 0; 2404 2405 path = btrfs_alloc_path(); 2406 if (!path) 2407 return -ENOMEM; 2408 2409 nritems = btrfs_header_nritems(eb); 2410 for (i = 0; i < nritems; i++) { 2411 btrfs_item_key_to_cpu(eb, &key, i); 2412 2413 /* inode keys are done during the first stage */ 2414 if (key.type == BTRFS_INODE_ITEM_KEY && 2415 wc->stage == LOG_WALK_REPLAY_INODES) { 2416 struct btrfs_inode_item *inode_item; 2417 u32 mode; 2418 2419 inode_item = btrfs_item_ptr(eb, i, 2420 struct btrfs_inode_item); 2421 /* 2422 * If we have a tmpfile (O_TMPFILE) that got fsync'ed 2423 * and never got linked before the fsync, skip it, as 2424 * replaying it is pointless since it would be deleted 2425 * later. We skip logging tmpfiles, but it's always 2426 * possible we are replaying a log created with a kernel 2427 * that used to log tmpfiles. 2428 */ 2429 if (btrfs_inode_nlink(eb, inode_item) == 0) { 2430 wc->ignore_cur_inode = true; 2431 continue; 2432 } else { 2433 wc->ignore_cur_inode = false; 2434 } 2435 ret = replay_xattr_deletes(wc->trans, root, log, 2436 path, key.objectid); 2437 if (ret) 2438 break; 2439 mode = btrfs_inode_mode(eb, inode_item); 2440 if (S_ISDIR(mode)) { 2441 ret = replay_dir_deletes(wc->trans, 2442 root, log, path, key.objectid, 0); 2443 if (ret) 2444 break; 2445 } 2446 ret = overwrite_item(wc->trans, root, path, 2447 eb, i, &key); 2448 if (ret) 2449 break; 2450 2451 /* 2452 * Before replaying extents, truncate the inode to its 2453 * size. We need to do it now and not after log replay 2454 * because before an fsync we can have prealloc extents 2455 * added beyond the inode's i_size. If we did it after, 2456 * through orphan cleanup for example, we would drop 2457 * those prealloc extents just after replaying them. 2458 */ 2459 if (S_ISREG(mode)) { 2460 struct btrfs_drop_extents_args drop_args = { 0 }; 2461 struct inode *inode; 2462 u64 from; 2463 2464 inode = read_one_inode(root, key.objectid); 2465 if (!inode) { 2466 ret = -EIO; 2467 break; 2468 } 2469 from = ALIGN(i_size_read(inode), 2470 root->fs_info->sectorsize); 2471 drop_args.start = from; 2472 drop_args.end = (u64)-1; 2473 drop_args.drop_cache = true; 2474 ret = btrfs_drop_extents(wc->trans, root, 2475 BTRFS_I(inode), 2476 &drop_args); 2477 if (!ret) { 2478 inode_sub_bytes(inode, 2479 drop_args.bytes_found); 2480 /* Update the inode's nbytes. 
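The bytes released by btrfs_drop_extents() were already subtracted through inode_sub_bytes() above; persist the new value.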
*/ 2481 ret = btrfs_update_inode(wc->trans, 2482 BTRFS_I(inode)); 2483 } 2484 iput(inode); 2485 if (ret) 2486 break; 2487 } 2488 2489 ret = link_to_fixup_dir(wc->trans, root, 2490 path, key.objectid); 2491 if (ret) 2492 break; 2493 } 2494 2495 if (wc->ignore_cur_inode) 2496 continue; 2497 2498 if (key.type == BTRFS_DIR_INDEX_KEY && 2499 wc->stage == LOG_WALK_REPLAY_DIR_INDEX) { 2500 ret = replay_one_dir_item(wc->trans, root, path, 2501 eb, i, &key); 2502 if (ret) 2503 break; 2504 } 2505 2506 if (wc->stage < LOG_WALK_REPLAY_ALL) 2507 continue; 2508 2509 /* these keys are simply copied */ 2510 if (key.type == BTRFS_XATTR_ITEM_KEY) { 2511 ret = overwrite_item(wc->trans, root, path, 2512 eb, i, &key); 2513 if (ret) 2514 break; 2515 } else if (key.type == BTRFS_INODE_REF_KEY || 2516 key.type == BTRFS_INODE_EXTREF_KEY) { 2517 ret = add_inode_ref(wc->trans, root, log, path, 2518 eb, i, &key); 2519 if (ret && ret != -ENOENT) 2520 break; 2521 ret = 0; 2522 } else if (key.type == BTRFS_EXTENT_DATA_KEY) { 2523 ret = replay_one_extent(wc->trans, root, path, 2524 eb, i, &key); 2525 if (ret) 2526 break; 2527 } 2528 /* 2529 * We don't log BTRFS_DIR_ITEM_KEY keys anymore, only the 2530 * BTRFS_DIR_INDEX_KEY items which we use to derive the 2531 * BTRFS_DIR_ITEM_KEY items. If we are replaying a log from an 2532 * older kernel with such keys, ignore them. 2533 */ 2534 } 2535 btrfs_free_path(path); 2536 return ret; 2537 } 2538 2539 /* 2540 * Correctly adjust the reserved bytes occupied by a log tree extent buffer 2541 */ 2542 static void unaccount_log_buffer(struct btrfs_fs_info *fs_info, u64 start) 2543 { 2544 struct btrfs_block_group *cache; 2545 2546 cache = btrfs_lookup_block_group(fs_info, start); 2547 if (!cache) { 2548 btrfs_err(fs_info, "unable to find block group for %llu", start); 2549 return; 2550 } 2551 2552 spin_lock(&cache->space_info->lock); 2553 spin_lock(&cache->lock); 2554 cache->reserved -= fs_info->nodesize; 2555 cache->space_info->bytes_reserved -= fs_info->nodesize; 2556 spin_unlock(&cache->lock); 2557 spin_unlock(&cache->space_info->lock); 2558 2559 btrfs_put_block_group(cache); 2560 } 2561 2562 static int clean_log_buffer(struct btrfs_trans_handle *trans, 2563 struct extent_buffer *eb) 2564 { 2565 int ret; 2566 2567 btrfs_tree_lock(eb); 2568 btrfs_clear_buffer_dirty(trans, eb); 2569 wait_on_extent_buffer_writeback(eb); 2570 btrfs_tree_unlock(eb); 2571 2572 if (trans) { 2573 ret = btrfs_pin_reserved_extent(trans, eb); 2574 if (ret) 2575 return ret; 2576 } else { 2577 unaccount_log_buffer(eb->fs_info, eb->start); 2578 } 2579 2580 return 0; 2581 } 2582 2583 static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, 2584 struct btrfs_root *root, 2585 struct btrfs_path *path, int *level, 2586 struct walk_control *wc) 2587 { 2588 struct btrfs_fs_info *fs_info = root->fs_info; 2589 u64 bytenr; 2590 u64 ptr_gen; 2591 struct extent_buffer *next; 2592 struct extent_buffer *cur; 2593 int ret = 0; 2594 2595 while (*level > 0) { 2596 struct btrfs_tree_parent_check check = { 0 }; 2597 2598 cur = path->nodes[*level]; 2599 2600 WARN_ON(btrfs_header_level(cur) != *level); 2601 2602 if (path->slots[*level] >= 2603 btrfs_header_nritems(cur)) 2604 break; 2605 2606 bytenr = btrfs_node_blockptr(cur, path->slots[*level]); 2607 ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]); 2608 check.transid = ptr_gen; 2609 check.level = *level - 1; 2610 check.has_first_key = true; 2611 btrfs_node_key_to_cpu(cur, &check.first_key, path->slots[*level]); 2612 2613 next = 
btrfs_find_create_tree_block(fs_info, bytenr, 2614 btrfs_header_owner(cur), 2615 *level - 1); 2616 if (IS_ERR(next)) 2617 return PTR_ERR(next); 2618 2619 if (*level == 1) { 2620 ret = wc->process_func(root, next, wc, ptr_gen, 2621 *level - 1); 2622 if (ret) { 2623 free_extent_buffer(next); 2624 return ret; 2625 } 2626 2627 path->slots[*level]++; 2628 if (wc->free) { 2629 ret = btrfs_read_extent_buffer(next, &check); 2630 if (ret) { 2631 free_extent_buffer(next); 2632 return ret; 2633 } 2634 2635 ret = clean_log_buffer(trans, next); 2636 if (ret) { 2637 free_extent_buffer(next); 2638 return ret; 2639 } 2640 } 2641 free_extent_buffer(next); 2642 continue; 2643 } 2644 ret = btrfs_read_extent_buffer(next, &check); 2645 if (ret) { 2646 free_extent_buffer(next); 2647 return ret; 2648 } 2649 2650 if (path->nodes[*level-1]) 2651 free_extent_buffer(path->nodes[*level-1]); 2652 path->nodes[*level-1] = next; 2653 *level = btrfs_header_level(next); 2654 path->slots[*level] = 0; 2655 cond_resched(); 2656 } 2657 path->slots[*level] = btrfs_header_nritems(path->nodes[*level]); 2658 2659 cond_resched(); 2660 return 0; 2661 } 2662 2663 static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, 2664 struct btrfs_root *root, 2665 struct btrfs_path *path, int *level, 2666 struct walk_control *wc) 2667 { 2668 int i; 2669 int slot; 2670 int ret; 2671 2672 for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) { 2673 slot = path->slots[i]; 2674 if (slot + 1 < btrfs_header_nritems(path->nodes[i])) { 2675 path->slots[i]++; 2676 *level = i; 2677 WARN_ON(*level == 0); 2678 return 0; 2679 } else { 2680 ret = wc->process_func(root, path->nodes[*level], wc, 2681 btrfs_header_generation(path->nodes[*level]), 2682 *level); 2683 if (ret) 2684 return ret; 2685 2686 if (wc->free) { 2687 ret = clean_log_buffer(trans, path->nodes[*level]); 2688 if (ret) 2689 return ret; 2690 } 2691 free_extent_buffer(path->nodes[*level]); 2692 path->nodes[*level] = NULL; 2693 *level = i + 1; 2694 } 2695 } 2696 return 1; 2697 } 2698 2699 /* 2700 * drop the reference count on the tree rooted at 'snap'. This traverses 2701 * the tree freeing any blocks that have a ref count of zero after being 2702 * decremented. 2703 */ 2704 static int walk_log_tree(struct btrfs_trans_handle *trans, 2705 struct btrfs_root *log, struct walk_control *wc) 2706 { 2707 int ret = 0; 2708 int wret; 2709 int level; 2710 struct btrfs_path *path; 2711 int orig_level; 2712 2713 path = btrfs_alloc_path(); 2714 if (!path) 2715 return -ENOMEM; 2716 2717 level = btrfs_header_level(log->node); 2718 orig_level = level; 2719 path->nodes[level] = log->node; 2720 atomic_inc(&log->node->refs); 2721 path->slots[level] = 0; 2722 2723 while (1) { 2724 wret = walk_down_log_tree(trans, log, path, &level, wc); 2725 if (wret > 0) 2726 break; 2727 if (wret < 0) { 2728 ret = wret; 2729 goto out; 2730 } 2731 2732 wret = walk_up_log_tree(trans, log, path, &level, wc); 2733 if (wret > 0) 2734 break; 2735 if (wret < 0) { 2736 ret = wret; 2737 goto out; 2738 } 2739 } 2740 2741 /* was the root node processed? 
if not, catch it here */ 2742 if (path->nodes[orig_level]) { 2743 ret = wc->process_func(log, path->nodes[orig_level], wc, 2744 btrfs_header_generation(path->nodes[orig_level]), 2745 orig_level); 2746 if (ret) 2747 goto out; 2748 if (wc->free) 2749 ret = clean_log_buffer(trans, path->nodes[orig_level]); 2750 } 2751 2752 out: 2753 btrfs_free_path(path); 2754 return ret; 2755 } 2756 2757 /* 2758 * helper function to update the item for a given subvolumes log root 2759 * in the tree of log roots 2760 */ 2761 static int update_log_root(struct btrfs_trans_handle *trans, 2762 struct btrfs_root *log, 2763 struct btrfs_root_item *root_item) 2764 { 2765 struct btrfs_fs_info *fs_info = log->fs_info; 2766 int ret; 2767 2768 if (log->log_transid == 1) { 2769 /* insert root item on the first sync */ 2770 ret = btrfs_insert_root(trans, fs_info->log_root_tree, 2771 &log->root_key, root_item); 2772 } else { 2773 ret = btrfs_update_root(trans, fs_info->log_root_tree, 2774 &log->root_key, root_item); 2775 } 2776 return ret; 2777 } 2778 2779 static void wait_log_commit(struct btrfs_root *root, int transid) 2780 { 2781 DEFINE_WAIT(wait); 2782 int index = transid % 2; 2783 2784 /* 2785 * we only allow two pending log transactions at a time, 2786 * so we know that if ours is more than 2 older than the 2787 * current transaction, we're done 2788 */ 2789 for (;;) { 2790 prepare_to_wait(&root->log_commit_wait[index], 2791 &wait, TASK_UNINTERRUPTIBLE); 2792 2793 if (!(root->log_transid_committed < transid && 2794 atomic_read(&root->log_commit[index]))) 2795 break; 2796 2797 mutex_unlock(&root->log_mutex); 2798 schedule(); 2799 mutex_lock(&root->log_mutex); 2800 } 2801 finish_wait(&root->log_commit_wait[index], &wait); 2802 } 2803 2804 static void wait_for_writer(struct btrfs_root *root) 2805 { 2806 DEFINE_WAIT(wait); 2807 2808 for (;;) { 2809 prepare_to_wait(&root->log_writer_wait, &wait, 2810 TASK_UNINTERRUPTIBLE); 2811 if (!atomic_read(&root->log_writers)) 2812 break; 2813 2814 mutex_unlock(&root->log_mutex); 2815 schedule(); 2816 mutex_lock(&root->log_mutex); 2817 } 2818 finish_wait(&root->log_writer_wait, &wait); 2819 } 2820 2821 void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx, struct inode *inode) 2822 { 2823 ctx->log_ret = 0; 2824 ctx->log_transid = 0; 2825 ctx->log_new_dentries = false; 2826 ctx->logging_new_name = false; 2827 ctx->logging_new_delayed_dentries = false; 2828 ctx->logged_before = false; 2829 ctx->inode = inode; 2830 INIT_LIST_HEAD(&ctx->list); 2831 INIT_LIST_HEAD(&ctx->ordered_extents); 2832 INIT_LIST_HEAD(&ctx->conflict_inodes); 2833 ctx->num_conflict_inodes = 0; 2834 ctx->logging_conflict_inodes = false; 2835 ctx->scratch_eb = NULL; 2836 } 2837 2838 void btrfs_init_log_ctx_scratch_eb(struct btrfs_log_ctx *ctx) 2839 { 2840 struct btrfs_inode *inode = BTRFS_I(ctx->inode); 2841 2842 if (!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags) && 2843 !test_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags)) 2844 return; 2845 2846 /* 2847 * Don't care about allocation failure. This is just for optimization, 2848 * if we fail to allocate here, we will try again later if needed. 
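* If scratch_eb is still NULL when it is needed, clone_leaf() falls back to allocating a clone with btrfs_clone_extent_buffer().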
2849 */ 2850 ctx->scratch_eb = alloc_dummy_extent_buffer(inode->root->fs_info, 0); 2851 } 2852 2853 void btrfs_release_log_ctx_extents(struct btrfs_log_ctx *ctx) 2854 { 2855 struct btrfs_ordered_extent *ordered; 2856 struct btrfs_ordered_extent *tmp; 2857 2858 ASSERT(inode_is_locked(ctx->inode)); 2859 2860 list_for_each_entry_safe(ordered, tmp, &ctx->ordered_extents, log_list) { 2861 list_del_init(&ordered->log_list); 2862 btrfs_put_ordered_extent(ordered); 2863 } 2864 } 2865 2866 2867 static inline void btrfs_remove_log_ctx(struct btrfs_root *root, 2868 struct btrfs_log_ctx *ctx) 2869 { 2870 mutex_lock(&root->log_mutex); 2871 list_del_init(&ctx->list); 2872 mutex_unlock(&root->log_mutex); 2873 } 2874 2875 /* 2876 * Invoked in log mutex context, or be sure there is no other task which 2877 * can access the list. 2878 */ 2879 static inline void btrfs_remove_all_log_ctxs(struct btrfs_root *root, 2880 int index, int error) 2881 { 2882 struct btrfs_log_ctx *ctx; 2883 struct btrfs_log_ctx *safe; 2884 2885 list_for_each_entry_safe(ctx, safe, &root->log_ctxs[index], list) { 2886 list_del_init(&ctx->list); 2887 ctx->log_ret = error; 2888 } 2889 } 2890 2891 /* 2892 * Sends a given tree log down to the disk and updates the super blocks to 2893 * record it. When this call is done, you know that any inodes previously 2894 * logged are safely on disk only if it returns 0. 2895 * 2896 * Any other return value means you need to call btrfs_commit_transaction. 2897 * Some of the edge cases for fsyncing directories that have had unlinks 2898 * or renames done in the past mean that sometimes the only safe 2899 * fsync is to commit the whole FS. When btrfs_sync_log returns -EAGAIN, 2900 * that has happened. 2901 */ 2902 int btrfs_sync_log(struct btrfs_trans_handle *trans, 2903 struct btrfs_root *root, struct btrfs_log_ctx *ctx) 2904 { 2905 int index1; 2906 int index2; 2907 int mark; 2908 int ret; 2909 struct btrfs_fs_info *fs_info = root->fs_info; 2910 struct btrfs_root *log = root->log_root; 2911 struct btrfs_root *log_root_tree = fs_info->log_root_tree; 2912 struct btrfs_root_item new_root_item; 2913 int log_transid = 0; 2914 struct btrfs_log_ctx root_log_ctx; 2915 struct blk_plug plug; 2916 u64 log_root_start; 2917 u64 log_root_level; 2918 2919 mutex_lock(&root->log_mutex); 2920 log_transid = ctx->log_transid; 2921 if (root->log_transid_committed >= log_transid) { 2922 mutex_unlock(&root->log_mutex); 2923 return ctx->log_ret; 2924 } 2925 2926 index1 = log_transid % 2; 2927 if (atomic_read(&root->log_commit[index1])) { 2928 wait_log_commit(root, log_transid); 2929 mutex_unlock(&root->log_mutex); 2930 return ctx->log_ret; 2931 } 2932 ASSERT(log_transid == root->log_transid); 2933 atomic_set(&root->log_commit[index1], 1); 2934 2935 /* wait for previous tree log sync to complete */ 2936 if (atomic_read(&root->log_commit[(index1 + 1) % 2])) 2937 wait_log_commit(root, log_transid - 1); 2938 2939 while (1) { 2940 int batch = atomic_read(&root->log_batch); 2941 /* when we're on an ssd, just kick the log commit out */ 2942 if (!btrfs_test_opt(fs_info, SSD) && 2943 test_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state)) { 2944 mutex_unlock(&root->log_mutex); 2945 schedule_timeout_uninterruptible(1); 2946 mutex_lock(&root->log_mutex); 2947 } 2948 wait_for_writer(root); 2949 if (batch == atomic_read(&root->log_batch)) 2950 break; 2951 } 2952 2953 /* bail out if we need to do a full commit */ 2954 if (btrfs_need_log_full_commit(trans)) { 2955 ret = BTRFS_LOG_FORCE_COMMIT; 2956 mutex_unlock(&root->log_mutex); 2957 goto out; 
2958 } 2959 2960 if (log_transid % 2 == 0) 2961 mark = EXTENT_DIRTY; 2962 else 2963 mark = EXTENT_NEW; 2964 2965 /* we start IO on all the marked extents here, but we don't actually 2966 * wait for them until later. 2967 */ 2968 blk_start_plug(&plug); 2969 ret = btrfs_write_marked_extents(fs_info, &log->dirty_log_pages, mark); 2970 /* 2971 * -EAGAIN happens when someone, e.g., a concurrent transaction 2972 * commit, writes a dirty extent in this tree-log commit. This 2973 * concurrent write will create a hole writing out the extents, 2974 * and we cannot proceed on a zoned filesystem, requiring 2975 * sequential writing. While we can bail out to a full commit 2976 * here, but we can continue hoping the concurrent writing fills 2977 * the hole. 2978 */ 2979 if (ret == -EAGAIN && btrfs_is_zoned(fs_info)) 2980 ret = 0; 2981 if (ret) { 2982 blk_finish_plug(&plug); 2983 btrfs_set_log_full_commit(trans); 2984 mutex_unlock(&root->log_mutex); 2985 goto out; 2986 } 2987 2988 /* 2989 * We _must_ update under the root->log_mutex in order to make sure we 2990 * have a consistent view of the log root we are trying to commit at 2991 * this moment. 2992 * 2993 * We _must_ copy this into a local copy, because we are not holding the 2994 * log_root_tree->log_mutex yet. This is important because when we 2995 * commit the log_root_tree we must have a consistent view of the 2996 * log_root_tree when we update the super block to point at the 2997 * log_root_tree bytenr. If we update the log_root_tree here we'll race 2998 * with the commit and possibly point at the new block which we may not 2999 * have written out. 3000 */ 3001 btrfs_set_root_node(&log->root_item, log->node); 3002 memcpy(&new_root_item, &log->root_item, sizeof(new_root_item)); 3003 3004 btrfs_set_root_log_transid(root, root->log_transid + 1); 3005 log->log_transid = root->log_transid; 3006 root->log_start_pid = 0; 3007 /* 3008 * IO has been started, blocks of the log tree have WRITTEN flag set 3009 * in their headers. new modifications of the log will be written to 3010 * new positions. so it's safe to allow log writers to go in. 3011 */ 3012 mutex_unlock(&root->log_mutex); 3013 3014 if (btrfs_is_zoned(fs_info)) { 3015 mutex_lock(&fs_info->tree_root->log_mutex); 3016 if (!log_root_tree->node) { 3017 ret = btrfs_alloc_log_tree_node(trans, log_root_tree); 3018 if (ret) { 3019 mutex_unlock(&fs_info->tree_root->log_mutex); 3020 blk_finish_plug(&plug); 3021 goto out; 3022 } 3023 } 3024 mutex_unlock(&fs_info->tree_root->log_mutex); 3025 } 3026 3027 btrfs_init_log_ctx(&root_log_ctx, NULL); 3028 3029 mutex_lock(&log_root_tree->log_mutex); 3030 3031 index2 = log_root_tree->log_transid % 2; 3032 list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]); 3033 root_log_ctx.log_transid = log_root_tree->log_transid; 3034 3035 /* 3036 * Now we are safe to update the log_root_tree because we're under the 3037 * log_mutex, and we're a current writer so we're holding the commit 3038 * open until we drop the log_mutex. 
3039 */ 3040 ret = update_log_root(trans, log, &new_root_item); 3041 if (ret) { 3042 list_del_init(&root_log_ctx.list); 3043 blk_finish_plug(&plug); 3044 btrfs_set_log_full_commit(trans); 3045 if (ret != -ENOSPC) 3046 btrfs_err(fs_info, 3047 "failed to update log for root %llu ret %d", 3048 root->root_key.objectid, ret); 3049 btrfs_wait_tree_log_extents(log, mark); 3050 mutex_unlock(&log_root_tree->log_mutex); 3051 goto out; 3052 } 3053 3054 if (log_root_tree->log_transid_committed >= root_log_ctx.log_transid) { 3055 blk_finish_plug(&plug); 3056 list_del_init(&root_log_ctx.list); 3057 mutex_unlock(&log_root_tree->log_mutex); 3058 ret = root_log_ctx.log_ret; 3059 goto out; 3060 } 3061 3062 if (atomic_read(&log_root_tree->log_commit[index2])) { 3063 blk_finish_plug(&plug); 3064 ret = btrfs_wait_tree_log_extents(log, mark); 3065 wait_log_commit(log_root_tree, 3066 root_log_ctx.log_transid); 3067 mutex_unlock(&log_root_tree->log_mutex); 3068 if (!ret) 3069 ret = root_log_ctx.log_ret; 3070 goto out; 3071 } 3072 ASSERT(root_log_ctx.log_transid == log_root_tree->log_transid); 3073 atomic_set(&log_root_tree->log_commit[index2], 1); 3074 3075 if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) { 3076 wait_log_commit(log_root_tree, 3077 root_log_ctx.log_transid - 1); 3078 } 3079 3080 /* 3081 * now that we've moved on to the tree of log tree roots, 3082 * check the full commit flag again 3083 */ 3084 if (btrfs_need_log_full_commit(trans)) { 3085 blk_finish_plug(&plug); 3086 btrfs_wait_tree_log_extents(log, mark); 3087 mutex_unlock(&log_root_tree->log_mutex); 3088 ret = BTRFS_LOG_FORCE_COMMIT; 3089 goto out_wake_log_root; 3090 } 3091 3092 ret = btrfs_write_marked_extents(fs_info, 3093 &log_root_tree->dirty_log_pages, 3094 EXTENT_DIRTY | EXTENT_NEW); 3095 blk_finish_plug(&plug); 3096 /* 3097 * As described above, -EAGAIN indicates a hole in the extents. We 3098 * cannot wait for these write outs since the waiting cause a 3099 * deadlock. Bail out to the full commit instead. 
3100 */ 3101 if (ret == -EAGAIN && btrfs_is_zoned(fs_info)) { 3102 btrfs_set_log_full_commit(trans); 3103 btrfs_wait_tree_log_extents(log, mark); 3104 mutex_unlock(&log_root_tree->log_mutex); 3105 goto out_wake_log_root; 3106 } else if (ret) { 3107 btrfs_set_log_full_commit(trans); 3108 mutex_unlock(&log_root_tree->log_mutex); 3109 goto out_wake_log_root; 3110 } 3111 ret = btrfs_wait_tree_log_extents(log, mark); 3112 if (!ret) 3113 ret = btrfs_wait_tree_log_extents(log_root_tree, 3114 EXTENT_NEW | EXTENT_DIRTY); 3115 if (ret) { 3116 btrfs_set_log_full_commit(trans); 3117 mutex_unlock(&log_root_tree->log_mutex); 3118 goto out_wake_log_root; 3119 } 3120 3121 log_root_start = log_root_tree->node->start; 3122 log_root_level = btrfs_header_level(log_root_tree->node); 3123 log_root_tree->log_transid++; 3124 mutex_unlock(&log_root_tree->log_mutex); 3125 3126 /* 3127 * Here we are guaranteed that nobody is going to write the superblock 3128 * for the current transaction before us and that neither we do write 3129 * our superblock before the previous transaction finishes its commit 3130 * and writes its superblock, because: 3131 * 3132 * 1) We are holding a handle on the current transaction, so no body 3133 * can commit it until we release the handle; 3134 * 3135 * 2) Before writing our superblock we acquire the tree_log_mutex, so 3136 * if the previous transaction is still committing, and hasn't yet 3137 * written its superblock, we wait for it to do it, because a 3138 * transaction commit acquires the tree_log_mutex when the commit 3139 * begins and releases it only after writing its superblock. 3140 */ 3141 mutex_lock(&fs_info->tree_log_mutex); 3142 3143 /* 3144 * The previous transaction writeout phase could have failed, and thus 3145 * marked the fs in an error state. We must not commit here, as we 3146 * could have updated our generation in the super_for_commit and 3147 * writing the super here would result in transid mismatches. If there 3148 * is an error here just bail. 3149 */ 3150 if (BTRFS_FS_ERROR(fs_info)) { 3151 ret = -EIO; 3152 btrfs_set_log_full_commit(trans); 3153 btrfs_abort_transaction(trans, ret); 3154 mutex_unlock(&fs_info->tree_log_mutex); 3155 goto out_wake_log_root; 3156 } 3157 3158 btrfs_set_super_log_root(fs_info->super_for_commit, log_root_start); 3159 btrfs_set_super_log_root_level(fs_info->super_for_commit, log_root_level); 3160 ret = write_all_supers(fs_info, 1); 3161 mutex_unlock(&fs_info->tree_log_mutex); 3162 if (ret) { 3163 btrfs_set_log_full_commit(trans); 3164 btrfs_abort_transaction(trans, ret); 3165 goto out_wake_log_root; 3166 } 3167 3168 /* 3169 * We know there can only be one task here, since we have not yet set 3170 * root->log_commit[index1] to 0 and any task attempting to sync the 3171 * log must wait for the previous log transaction to commit if it's 3172 * still in progress or wait for the current log transaction commit if 3173 * someone else already started it. We use <= and not < because the 3174 * first log transaction has an ID of 0. 
3175 */ 3176 ASSERT(btrfs_get_root_last_log_commit(root) <= log_transid); 3177 btrfs_set_root_last_log_commit(root, log_transid); 3178 3179 out_wake_log_root: 3180 mutex_lock(&log_root_tree->log_mutex); 3181 btrfs_remove_all_log_ctxs(log_root_tree, index2, ret); 3182 3183 log_root_tree->log_transid_committed++; 3184 atomic_set(&log_root_tree->log_commit[index2], 0); 3185 mutex_unlock(&log_root_tree->log_mutex); 3186 3187 /* 3188 * The barrier before waitqueue_active (in cond_wake_up) is needed so 3189 * all the updates above are seen by the woken threads. It might not be 3190 * necessary, but proving that seems to be hard. 3191 */ 3192 cond_wake_up(&log_root_tree->log_commit_wait[index2]); 3193 out: 3194 mutex_lock(&root->log_mutex); 3195 btrfs_remove_all_log_ctxs(root, index1, ret); 3196 root->log_transid_committed++; 3197 atomic_set(&root->log_commit[index1], 0); 3198 mutex_unlock(&root->log_mutex); 3199 3200 /* 3201 * The barrier before waitqueue_active (in cond_wake_up) is needed so 3202 * all the updates above are seen by the woken threads. It might not be 3203 * necessary, but proving that seems to be hard. 3204 */ 3205 cond_wake_up(&root->log_commit_wait[index1]); 3206 return ret; 3207 } 3208 3209 static void free_log_tree(struct btrfs_trans_handle *trans, 3210 struct btrfs_root *log) 3211 { 3212 int ret; 3213 struct walk_control wc = { 3214 .free = 1, 3215 .process_func = process_one_buffer 3216 }; 3217 3218 if (log->node) { 3219 ret = walk_log_tree(trans, log, &wc); 3220 if (ret) { 3221 /* 3222 * We weren't able to traverse the entire log tree, the 3223 * typical scenario is getting an -EIO when reading an 3224 * extent buffer of the tree, due to a previous writeback 3225 * failure of it. 3226 */ 3227 set_bit(BTRFS_FS_STATE_LOG_CLEANUP_ERROR, 3228 &log->fs_info->fs_state); 3229 3230 /* 3231 * Some extent buffers of the log tree may still be dirty 3232 * and not yet written back to storage, because we may 3233 * have updates to a log tree without syncing a log tree, 3234 * such as during rename and link operations. So flush 3235 * them out and wait for their writeback to complete, so 3236 * that we properly cleanup their state and pages. 3237 */ 3238 btrfs_write_marked_extents(log->fs_info, 3239 &log->dirty_log_pages, 3240 EXTENT_DIRTY | EXTENT_NEW); 3241 btrfs_wait_tree_log_extents(log, 3242 EXTENT_DIRTY | EXTENT_NEW); 3243 3244 if (trans) 3245 btrfs_abort_transaction(trans, ret); 3246 else 3247 btrfs_handle_fs_error(log->fs_info, ret, NULL); 3248 } 3249 } 3250 3251 extent_io_tree_release(&log->dirty_log_pages); 3252 extent_io_tree_release(&log->log_csum_range); 3253 3254 btrfs_put_root(log); 3255 } 3256 3257 /* 3258 * free all the extents used by the tree log. This should be called 3259 * at commit time of the full transaction 3260 */ 3261 int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root) 3262 { 3263 if (root->log_root) { 3264 free_log_tree(trans, root->log_root); 3265 root->log_root = NULL; 3266 clear_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state); 3267 } 3268 return 0; 3269 } 3270 3271 int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans, 3272 struct btrfs_fs_info *fs_info) 3273 { 3274 if (fs_info->log_root_tree) { 3275 free_log_tree(trans, fs_info->log_root_tree); 3276 fs_info->log_root_tree = NULL; 3277 clear_bit(BTRFS_ROOT_HAS_LOG_TREE, &fs_info->tree_root->state); 3278 } 3279 return 0; 3280 } 3281 3282 /* 3283 * Check if an inode was logged in the current transaction. 
This correctly deals 3284 * with the case where the inode was logged but has a logged_trans of 0, which 3285 * happens if the inode is evicted and loaded again, as logged_trans is an in 3286 * memory only field (not persisted). 3287 * 3288 * Returns 1 if the inode was logged before in the transaction, 0 if it was not, 3289 * and < 0 on error. 3290 */ 3291 static int inode_logged(const struct btrfs_trans_handle *trans, 3292 struct btrfs_inode *inode, 3293 struct btrfs_path *path_in) 3294 { 3295 struct btrfs_path *path = path_in; 3296 struct btrfs_key key; 3297 int ret; 3298 3299 if (inode->logged_trans == trans->transid) 3300 return 1; 3301 3302 /* 3303 * If logged_trans is not 0, then we know the inode was not logged 3304 * in this transaction, so we can return false right away. 3305 */ 3306 if (inode->logged_trans > 0) 3307 return 0; 3308 3309 /* 3310 * If no log tree was created for this root in this transaction, then 3311 * the inode can not have been logged in this transaction. In that case 3312 * set logged_trans to anything greater than 0 and less than the current 3313 * transaction's ID, to avoid the search below in a future call in case 3314 * a log tree gets created after this. 3315 */ 3316 if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &inode->root->state)) { 3317 inode->logged_trans = trans->transid - 1; 3318 return 0; 3319 } 3320 3321 /* 3322 * We have a log tree and the inode's logged_trans is 0. We can't tell 3323 * for sure if the inode was logged before in this transaction by looking 3324 * only at logged_trans. We could be pessimistic and assume it was, but 3325 * that can lead to unnecessarily logging an inode during rename and link 3326 * operations, and then further updating the log in followup rename and 3327 * link operations, especially if it's a directory, which adds latency 3328 * visible to applications doing a series of rename or link operations. 3329 * 3330 * A logged_trans of 0 here can mean several things: 3331 * 3332 * 1) The inode was never logged since the filesystem was mounted, and may 3333 * or may not have been evicted and loaded again; 3334 * 3335 * 2) The inode was logged in a previous transaction, then evicted and 3336 * then loaded again; 3337 * 3338 * 3) The inode was logged in the current transaction, then evicted and 3339 * then loaded again. 3340 * 3341 * For cases 1) and 2) we don't want to return true, but we need to detect 3342 * case 3) and return true. So we do a search in the log root for the inode 3343 * item. 3344 */ 3345 key.objectid = btrfs_ino(inode); 3346 key.type = BTRFS_INODE_ITEM_KEY; 3347 key.offset = 0; 3348 3349 if (!path) { 3350 path = btrfs_alloc_path(); 3351 if (!path) 3352 return -ENOMEM; 3353 } 3354 3355 ret = btrfs_search_slot(NULL, inode->root->log_root, &key, path, 0, 0); 3356 3357 if (path_in) 3358 btrfs_release_path(path); 3359 else 3360 btrfs_free_path(path); 3361 3362 /* 3363 * Logging an inode always results in logging its inode item. So if we 3364 * did not find the item we know the inode was not logged for sure. 3365 */ 3366 if (ret < 0) { 3367 return ret; 3368 } else if (ret > 0) { 3369 /* 3370 * Set logged_trans to a value greater than 0 and less than the 3371 * current transaction to avoid doing the search in future calls. 3372 */ 3373 inode->logged_trans = trans->transid - 1; 3374 return 0; 3375 } 3376 3377 /* 3378 * The inode was previously logged and then evicted, set logged_trans to 3379 * the current transaction's ID, to avoid future tree searches as long as 3380 * the inode is not evicted again.
3381 */ 3382 inode->logged_trans = trans->transid; 3383 3384 /* 3385 * If it's a directory, then we must set last_dir_index_offset to the 3386 * maximum possible value, so that the next attempt to log the inode does 3387 * not skip checking if dir index keys found in modified subvolume tree 3388 * leaves have been logged before, otherwise it would result in attempts 3389 * to insert duplicate dir index keys in the log tree. This must be done 3390 * because last_dir_index_offset is an in-memory only field, not persisted 3391 * in the inode item or any other on-disk structure, so its value is lost 3392 * once the inode is evicted. 3393 */ 3394 if (S_ISDIR(inode->vfs_inode.i_mode)) 3395 inode->last_dir_index_offset = (u64)-1; 3396 3397 return 1; 3398 } 3399 3400 /* 3401 * Delete a directory entry from the log if it exists. 3402 * 3403 * Returns < 0 on error 3404 * 1 if the entry does not exist 3405 * 0 if the entry existed and was successfully deleted 3406 */ 3407 static int del_logged_dentry(struct btrfs_trans_handle *trans, 3408 struct btrfs_root *log, 3409 struct btrfs_path *path, 3410 u64 dir_ino, 3411 const struct fscrypt_str *name, 3412 u64 index) 3413 { 3414 struct btrfs_dir_item *di; 3415 3416 /* 3417 * We only log dir index items of a directory, so we don't need to look 3418 * for dir item keys. 3419 */ 3420 di = btrfs_lookup_dir_index_item(trans, log, path, dir_ino, 3421 index, name, -1); 3422 if (IS_ERR(di)) 3423 return PTR_ERR(di); 3424 else if (!di) 3425 return 1; 3426 3427 /* 3428 * We do not need to update the size field of the directory's 3429 * inode item because on log replay we update the field to reflect 3430 * all existing entries in the directory (see overwrite_item()). 3431 */ 3432 return btrfs_delete_one_dir_name(trans, log, path, di); 3433 } 3434 3435 /* 3436 * If both a file and directory are logged, and unlinks or renames are 3437 * mixed in, we have a few interesting corners: 3438 * 3439 * create file X in dir Y 3440 * link file X to X.link in dir Y 3441 * fsync file X 3442 * unlink file X but leave X.link 3443 * fsync dir Y 3444 * 3445 * After a crash we would expect only X.link to exist. But file X 3446 * didn't get fsync'd again so the log has back refs for X and X.link. 3447 * 3448 * We solve this by removing directory entries and inode backrefs from the 3449 * log when a file that was logged in the current transaction is 3450 * unlinked. Any later fsync will include the updated log entries, and 3451 * we'll be able to reconstruct the proper directory items from backrefs. 3452 * 3453 * This optimization allows us to avoid relogging the entire inode 3454 * or the entire directory.
3455 */ 3456 void btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, 3457 struct btrfs_root *root, 3458 const struct fscrypt_str *name, 3459 struct btrfs_inode *dir, u64 index) 3460 { 3461 struct btrfs_path *path; 3462 int ret; 3463 3464 ret = inode_logged(trans, dir, NULL); 3465 if (ret == 0) 3466 return; 3467 else if (ret < 0) { 3468 btrfs_set_log_full_commit(trans); 3469 return; 3470 } 3471 3472 ret = join_running_log_trans(root); 3473 if (ret) 3474 return; 3475 3476 mutex_lock(&dir->log_mutex); 3477 3478 path = btrfs_alloc_path(); 3479 if (!path) { 3480 ret = -ENOMEM; 3481 goto out_unlock; 3482 } 3483 3484 ret = del_logged_dentry(trans, root->log_root, path, btrfs_ino(dir), 3485 name, index); 3486 btrfs_free_path(path); 3487 out_unlock: 3488 mutex_unlock(&dir->log_mutex); 3489 if (ret < 0) 3490 btrfs_set_log_full_commit(trans); 3491 btrfs_end_log_trans(root); 3492 } 3493 3494 /* see comments for btrfs_del_dir_entries_in_log */ 3495 void btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, 3496 struct btrfs_root *root, 3497 const struct fscrypt_str *name, 3498 struct btrfs_inode *inode, u64 dirid) 3499 { 3500 struct btrfs_root *log; 3501 u64 index; 3502 int ret; 3503 3504 ret = inode_logged(trans, inode, NULL); 3505 if (ret == 0) 3506 return; 3507 else if (ret < 0) { 3508 btrfs_set_log_full_commit(trans); 3509 return; 3510 } 3511 3512 ret = join_running_log_trans(root); 3513 if (ret) 3514 return; 3515 log = root->log_root; 3516 mutex_lock(&inode->log_mutex); 3517 3518 ret = btrfs_del_inode_ref(trans, log, name, btrfs_ino(inode), 3519 dirid, &index); 3520 mutex_unlock(&inode->log_mutex); 3521 if (ret < 0 && ret != -ENOENT) 3522 btrfs_set_log_full_commit(trans); 3523 btrfs_end_log_trans(root); 3524 } 3525 3526 /* 3527 * creates a range item in the log for 'dirid'. first_offset and 3528 * last_offset tell us which parts of the key space the log should 3529 * be considered authoritative for. 3530 */ 3531 static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans, 3532 struct btrfs_root *log, 3533 struct btrfs_path *path, 3534 u64 dirid, 3535 u64 first_offset, u64 last_offset) 3536 { 3537 int ret; 3538 struct btrfs_key key; 3539 struct btrfs_dir_log_item *item; 3540 3541 key.objectid = dirid; 3542 key.offset = first_offset; 3543 key.type = BTRFS_DIR_LOG_INDEX_KEY; 3544 ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item)); 3545 /* 3546 * -EEXIST is fine and can happen sporadically when we are logging a 3547 * directory and have concurrent insertions in the subvolume's tree for 3548 * items from other inodes and that result in pushing off some dir items 3549 * from one leaf to another in order to accommodate for the new items. 3550 * This results in logging the same dir index range key. 3551 */ 3552 if (ret && ret != -EEXIST) 3553 return ret; 3554 3555 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 3556 struct btrfs_dir_log_item); 3557 if (ret == -EEXIST) { 3558 const u64 curr_end = btrfs_dir_log_end(path->nodes[0], item); 3559 3560 /* 3561 * btrfs_del_dir_entries_in_log() might have been called during 3562 * an unlink between the initial insertion of this key and the 3563 * current update, or we might be logging a single entry deletion 3564 * during a rename, so set the new last_offset to the max value. 
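* Taking the max below guarantees the range covered by the existing item never shrinks.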
3565 */ 3566 last_offset = max(last_offset, curr_end); 3567 } 3568 btrfs_set_dir_log_end(path->nodes[0], item, last_offset); 3569 btrfs_mark_buffer_dirty(trans, path->nodes[0]); 3570 btrfs_release_path(path); 3571 return 0; 3572 } 3573 3574 static int flush_dir_items_batch(struct btrfs_trans_handle *trans, 3575 struct btrfs_inode *inode, 3576 struct extent_buffer *src, 3577 struct btrfs_path *dst_path, 3578 int start_slot, 3579 int count) 3580 { 3581 struct btrfs_root *log = inode->root->log_root; 3582 char *ins_data = NULL; 3583 struct btrfs_item_batch batch; 3584 struct extent_buffer *dst; 3585 unsigned long src_offset; 3586 unsigned long dst_offset; 3587 u64 last_index; 3588 struct btrfs_key key; 3589 u32 item_size; 3590 int ret; 3591 int i; 3592 3593 ASSERT(count > 0); 3594 batch.nr = count; 3595 3596 if (count == 1) { 3597 btrfs_item_key_to_cpu(src, &key, start_slot); 3598 item_size = btrfs_item_size(src, start_slot); 3599 batch.keys = &key; 3600 batch.data_sizes = &item_size; 3601 batch.total_data_size = item_size; 3602 } else { 3603 struct btrfs_key *ins_keys; 3604 u32 *ins_sizes; 3605 3606 ins_data = kmalloc(count * sizeof(u32) + 3607 count * sizeof(struct btrfs_key), GFP_NOFS); 3608 if (!ins_data) 3609 return -ENOMEM; 3610 3611 ins_sizes = (u32 *)ins_data; 3612 ins_keys = (struct btrfs_key *)(ins_data + count * sizeof(u32)); 3613 batch.keys = ins_keys; 3614 batch.data_sizes = ins_sizes; 3615 batch.total_data_size = 0; 3616 3617 for (i = 0; i < count; i++) { 3618 const int slot = start_slot + i; 3619 3620 btrfs_item_key_to_cpu(src, &ins_keys[i], slot); 3621 ins_sizes[i] = btrfs_item_size(src, slot); 3622 batch.total_data_size += ins_sizes[i]; 3623 } 3624 } 3625 3626 ret = btrfs_insert_empty_items(trans, log, dst_path, &batch); 3627 if (ret) 3628 goto out; 3629 3630 dst = dst_path->nodes[0]; 3631 /* 3632 * Copy all the items in bulk, in a single copy operation. Item data is 3633 * organized such that it's placed at the end of a leaf and from right 3634 * to left. For example, the data for the second item ends at an offset 3635 * that matches the offset where the data for the first item starts, the 3636 * data for the third item ends at an offset that matches the offset 3637 * where the data of the second items starts, and so on. 3638 * Therefore our source and destination start offsets for copy match the 3639 * offsets of the last items (highest slots). 3640 */ 3641 dst_offset = btrfs_item_ptr_offset(dst, dst_path->slots[0] + count - 1); 3642 src_offset = btrfs_item_ptr_offset(src, start_slot + count - 1); 3643 copy_extent_buffer(dst, src, dst_offset, src_offset, batch.total_data_size); 3644 btrfs_release_path(dst_path); 3645 3646 last_index = batch.keys[count - 1].offset; 3647 ASSERT(last_index > inode->last_dir_index_offset); 3648 3649 /* 3650 * If for some unexpected reason the last item's index is not greater 3651 * than the last index we logged, warn and force a transaction commit. 
3652 */ 3653 if (WARN_ON(last_index <= inode->last_dir_index_offset)) 3654 ret = BTRFS_LOG_FORCE_COMMIT; 3655 else 3656 inode->last_dir_index_offset = last_index; 3657 3658 if (btrfs_get_first_dir_index_to_log(inode) == 0) 3659 btrfs_set_first_dir_index_to_log(inode, batch.keys[0].offset); 3660 out: 3661 kfree(ins_data); 3662 3663 return ret; 3664 } 3665 3666 static int clone_leaf(struct btrfs_path *path, struct btrfs_log_ctx *ctx) 3667 { 3668 const int slot = path->slots[0]; 3669 3670 if (ctx->scratch_eb) { 3671 copy_extent_buffer_full(ctx->scratch_eb, path->nodes[0]); 3672 } else { 3673 ctx->scratch_eb = btrfs_clone_extent_buffer(path->nodes[0]); 3674 if (!ctx->scratch_eb) 3675 return -ENOMEM; 3676 } 3677 3678 btrfs_release_path(path); 3679 path->nodes[0] = ctx->scratch_eb; 3680 path->slots[0] = slot; 3681 /* 3682 * Add extra ref to scratch eb so that it is not freed when callers 3683 * release the path, so we can reuse it later if needed. 3684 */ 3685 atomic_inc(&ctx->scratch_eb->refs); 3686 3687 return 0; 3688 } 3689 3690 static int process_dir_items_leaf(struct btrfs_trans_handle *trans, 3691 struct btrfs_inode *inode, 3692 struct btrfs_path *path, 3693 struct btrfs_path *dst_path, 3694 struct btrfs_log_ctx *ctx, 3695 u64 *last_old_dentry_offset) 3696 { 3697 struct btrfs_root *log = inode->root->log_root; 3698 struct extent_buffer *src; 3699 const int nritems = btrfs_header_nritems(path->nodes[0]); 3700 const u64 ino = btrfs_ino(inode); 3701 bool last_found = false; 3702 int batch_start = 0; 3703 int batch_size = 0; 3704 int ret; 3705 3706 /* 3707 * We need to clone the leaf, release the read lock on it, and use the 3708 * clone before modifying the log tree. See the comment at copy_items() 3709 * about why we need to do this. 3710 */ 3711 ret = clone_leaf(path, ctx); 3712 if (ret < 0) 3713 return ret; 3714 3715 src = path->nodes[0]; 3716 3717 for (int i = path->slots[0]; i < nritems; i++) { 3718 struct btrfs_dir_item *di; 3719 struct btrfs_key key; 3720 int ret; 3721 3722 btrfs_item_key_to_cpu(src, &key, i); 3723 3724 if (key.objectid != ino || key.type != BTRFS_DIR_INDEX_KEY) { 3725 last_found = true; 3726 break; 3727 } 3728 3729 di = btrfs_item_ptr(src, i, struct btrfs_dir_item); 3730 3731 /* 3732 * Skip ranges of items that consist only of dir item keys created 3733 * in past transactions. However if we find a gap, we must log a 3734 * dir index range item for that gap, so that index keys in that 3735 * gap are deleted during log replay. 3736 */ 3737 if (btrfs_dir_transid(src, di) < trans->transid) { 3738 if (key.offset > *last_old_dentry_offset + 1) { 3739 ret = insert_dir_log_key(trans, log, dst_path, 3740 ino, *last_old_dentry_offset + 1, 3741 key.offset - 1); 3742 if (ret < 0) 3743 return ret; 3744 } 3745 3746 *last_old_dentry_offset = key.offset; 3747 continue; 3748 } 3749 3750 /* If we logged this dir index item before, we can skip it. */ 3751 if (key.offset <= inode->last_dir_index_offset) 3752 continue; 3753 3754 /* 3755 * We must make sure that when we log a directory entry, the 3756 * corresponding inode, after log replay, has a matching link 3757 * count. For example: 3758 * 3759 * touch foo 3760 * mkdir mydir 3761 * sync 3762 * ln foo mydir/bar 3763 * xfs_io -c "fsync" mydir 3764 * <crash> 3765 * <mount fs and log replay> 3766 * 3767 * Would result in a fsync log that when replayed, our file inode 3768 * would have a link count of 1, but we get two directory entries 3769 * pointing to the same inode. 
After removing one of the names, 3770 * it would not be possible to remove the other name, which 3771 * resulted always in stale file handle errors, and would not be 3772 * possible to rmdir the parent directory, since its i_size could 3773 * never be decremented to the value BTRFS_EMPTY_DIR_SIZE, 3774 * resulting in -ENOTEMPTY errors. 3775 */ 3776 if (!ctx->log_new_dentries) { 3777 struct btrfs_key di_key; 3778 3779 btrfs_dir_item_key_to_cpu(src, di, &di_key); 3780 if (di_key.type != BTRFS_ROOT_ITEM_KEY) 3781 ctx->log_new_dentries = true; 3782 } 3783 3784 if (batch_size == 0) 3785 batch_start = i; 3786 batch_size++; 3787 } 3788 3789 if (batch_size > 0) { 3790 int ret; 3791 3792 ret = flush_dir_items_batch(trans, inode, src, dst_path, 3793 batch_start, batch_size); 3794 if (ret < 0) 3795 return ret; 3796 } 3797 3798 return last_found ? 1 : 0; 3799 } 3800 3801 /* 3802 * log all the items included in the current transaction for a given 3803 * directory. This also creates the range items in the log tree required 3804 * to replay anything deleted before the fsync 3805 */ 3806 static noinline int log_dir_items(struct btrfs_trans_handle *trans, 3807 struct btrfs_inode *inode, 3808 struct btrfs_path *path, 3809 struct btrfs_path *dst_path, 3810 struct btrfs_log_ctx *ctx, 3811 u64 min_offset, u64 *last_offset_ret) 3812 { 3813 struct btrfs_key min_key; 3814 struct btrfs_root *root = inode->root; 3815 struct btrfs_root *log = root->log_root; 3816 int ret; 3817 u64 last_old_dentry_offset = min_offset - 1; 3818 u64 last_offset = (u64)-1; 3819 u64 ino = btrfs_ino(inode); 3820 3821 min_key.objectid = ino; 3822 min_key.type = BTRFS_DIR_INDEX_KEY; 3823 min_key.offset = min_offset; 3824 3825 ret = btrfs_search_forward(root, &min_key, path, trans->transid); 3826 3827 /* 3828 * we didn't find anything from this transaction, see if there 3829 * is anything at all 3830 */ 3831 if (ret != 0 || min_key.objectid != ino || 3832 min_key.type != BTRFS_DIR_INDEX_KEY) { 3833 min_key.objectid = ino; 3834 min_key.type = BTRFS_DIR_INDEX_KEY; 3835 min_key.offset = (u64)-1; 3836 btrfs_release_path(path); 3837 ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); 3838 if (ret < 0) { 3839 btrfs_release_path(path); 3840 return ret; 3841 } 3842 ret = btrfs_previous_item(root, path, ino, BTRFS_DIR_INDEX_KEY); 3843 3844 /* if ret == 0 there are items for this type, 3845 * create a range to tell us the last key of this type. 3846 * otherwise, there are no items in this directory after 3847 * *min_offset, and we create a range to indicate that. 3848 */ 3849 if (ret == 0) { 3850 struct btrfs_key tmp; 3851 3852 btrfs_item_key_to_cpu(path->nodes[0], &tmp, 3853 path->slots[0]); 3854 if (tmp.type == BTRFS_DIR_INDEX_KEY) 3855 last_old_dentry_offset = tmp.offset; 3856 } else if (ret > 0) { 3857 ret = 0; 3858 } 3859 3860 goto done; 3861 } 3862 3863 /* go backward to find any previous key */ 3864 ret = btrfs_previous_item(root, path, ino, BTRFS_DIR_INDEX_KEY); 3865 if (ret == 0) { 3866 struct btrfs_key tmp; 3867 3868 btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]); 3869 /* 3870 * The dir index key before the first one we found that needs to 3871 * be logged might be in a previous leaf, and there might be a 3872 * gap between these keys, meaning that we had deletions that 3873 * happened. So the key range item we log (key type 3874 * BTRFS_DIR_LOG_INDEX_KEY) must cover a range that starts at the 3875 * previous key's offset plus 1, so that those deletes are replayed. 
3876 */ 3877 if (tmp.type == BTRFS_DIR_INDEX_KEY) 3878 last_old_dentry_offset = tmp.offset; 3879 } else if (ret < 0) { 3880 goto done; 3881 } 3882 3883 btrfs_release_path(path); 3884 3885 /* 3886 * Find the first key from this transaction again or the one we were at 3887 * in the loop below in case we had to reschedule. We may be logging the 3888 * directory without holding its VFS lock, which happen when logging new 3889 * dentries (through log_new_dir_dentries()) or in some cases when we 3890 * need to log the parent directory of an inode. This means a dir index 3891 * key might be deleted from the inode's root, and therefore we may not 3892 * find it anymore. If we can't find it, just move to the next key. We 3893 * can not bail out and ignore, because if we do that we will simply 3894 * not log dir index keys that come after the one that was just deleted 3895 * and we can end up logging a dir index range that ends at (u64)-1 3896 * (@last_offset is initialized to that), resulting in removing dir 3897 * entries we should not remove at log replay time. 3898 */ 3899 search: 3900 ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); 3901 if (ret > 0) { 3902 ret = btrfs_next_item(root, path); 3903 if (ret > 0) { 3904 /* There are no more keys in the inode's root. */ 3905 ret = 0; 3906 goto done; 3907 } 3908 } 3909 if (ret < 0) 3910 goto done; 3911 3912 /* 3913 * we have a block from this transaction, log every item in it 3914 * from our directory 3915 */ 3916 while (1) { 3917 ret = process_dir_items_leaf(trans, inode, path, dst_path, ctx, 3918 &last_old_dentry_offset); 3919 if (ret != 0) { 3920 if (ret > 0) 3921 ret = 0; 3922 goto done; 3923 } 3924 path->slots[0] = btrfs_header_nritems(path->nodes[0]); 3925 3926 /* 3927 * look ahead to the next item and see if it is also 3928 * from this directory and from this transaction 3929 */ 3930 ret = btrfs_next_leaf(root, path); 3931 if (ret) { 3932 if (ret == 1) { 3933 last_offset = (u64)-1; 3934 ret = 0; 3935 } 3936 goto done; 3937 } 3938 btrfs_item_key_to_cpu(path->nodes[0], &min_key, path->slots[0]); 3939 if (min_key.objectid != ino || min_key.type != BTRFS_DIR_INDEX_KEY) { 3940 last_offset = (u64)-1; 3941 goto done; 3942 } 3943 if (btrfs_header_generation(path->nodes[0]) != trans->transid) { 3944 /* 3945 * The next leaf was not changed in the current transaction 3946 * and has at least one dir index key. 3947 * We check for the next key because there might have been 3948 * one or more deletions between the last key we logged and 3949 * that next key. So the key range item we log (key type 3950 * BTRFS_DIR_LOG_INDEX_KEY) must end at the next key's 3951 * offset minus 1, so that those deletes are replayed. 3952 */ 3953 last_offset = min_key.offset - 1; 3954 goto done; 3955 } 3956 if (need_resched()) { 3957 btrfs_release_path(path); 3958 cond_resched(); 3959 goto search; 3960 } 3961 } 3962 done: 3963 btrfs_release_path(path); 3964 btrfs_release_path(dst_path); 3965 3966 if (ret == 0) { 3967 *last_offset_ret = last_offset; 3968 /* 3969 * In case the leaf was changed in the current transaction but 3970 * all its dir items are from a past transaction, the last item 3971 * in the leaf is a dir item and there's no gap between that last 3972 * dir item and the first one on the next leaf (which did not 3973 * change in the current transaction), then we don't need to log 3974 * a range, last_old_dentry_offset is == to last_offset. 
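 *
 * For example (hypothetical offsets): if the last dir index item in this
 * leaf has offset 100 and the first key of the next, unchanged leaf has
 * offset 101, then last_offset is set to 100 and equals
 * last_old_dentry_offset, so there is no deleted index key to cover and no
 * range item needs to be inserted.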
3975 */
3976 ASSERT(last_old_dentry_offset <= last_offset);
3977 if (last_old_dentry_offset < last_offset)
3978 ret = insert_dir_log_key(trans, log, path, ino,
3979 last_old_dentry_offset + 1,
3980 last_offset);
3981 }
3982
3983 return ret;
3984 }
3985
3986 /*
3987 * If the inode was logged before and it was evicted, then its
3988 * last_dir_index_offset is (u64)-1, so we don't know the value of the last index
3989 * key offset. If that's the case, search for it and update the inode. This
3990 * is to avoid lookups in the log tree every time we try to insert a dir index
3991 * key from a leaf changed in the current transaction, and to allow us to always
3992 * do batch insertions of dir index keys.
3993 */
3994 static int update_last_dir_index_offset(struct btrfs_inode *inode,
3995 struct btrfs_path *path,
3996 const struct btrfs_log_ctx *ctx)
3997 {
3998 const u64 ino = btrfs_ino(inode);
3999 struct btrfs_key key;
4000 int ret;
4001
4002 lockdep_assert_held(&inode->log_mutex);
4003
4004 if (inode->last_dir_index_offset != (u64)-1)
4005 return 0;
4006
4007 if (!ctx->logged_before) {
4008 inode->last_dir_index_offset = BTRFS_DIR_START_INDEX - 1;
4009 return 0;
4010 }
4011
4012 key.objectid = ino;
4013 key.type = BTRFS_DIR_INDEX_KEY;
4014 key.offset = (u64)-1;
4015
4016 ret = btrfs_search_slot(NULL, inode->root->log_root, &key, path, 0, 0);
4017 /*
4018 * An error happened or we actually have an index key with an offset
4019 * value of (u64)-1. Bail out, we're done.
4020 */
4021 if (ret <= 0)
4022 goto out;
4023
4024 ret = 0;
4025 inode->last_dir_index_offset = BTRFS_DIR_START_INDEX - 1;
4026
4027 /*
4028 * No dir index items, bail out and leave last_dir_index_offset with
4029 * the value right before the first valid index value.
4030 */
4031 if (path->slots[0] == 0)
4032 goto out;
4033
4034 /*
4035 * btrfs_search_slot() left us at one slot beyond the slot with the last
4036 * index key, or beyond the last key of the directory that is not an
4037 * index key. If we have an index key before, set last_dir_index_offset
4038 * to its offset value, otherwise leave it with a value right before the
4039 * first valid index value, as it means we have an empty directory.
4040 */
4041 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0] - 1);
4042 if (key.objectid == ino && key.type == BTRFS_DIR_INDEX_KEY)
4043 inode->last_dir_index_offset = key.offset;
4044
4045 out:
4046 btrfs_release_path(path);
4047
4048 return ret;
4049 }
4050
4051 /*
4052 * logging directories is very similar to logging inodes. We find all the items
4053 * from the current transaction and write them to the log.
4054 *
4055 * The recovery code scans the directory in the subvolume, and if it finds a
4056 * key in the range logged that is not present in the log tree, then it means
4057 * that dir entry was unlinked during the transaction.
4058 *
4059 * In order for that scan to work, we must include one key smaller than
4060 * the smallest logged by this transaction and one key larger than the largest
4061 * key logged by this transaction.
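 *
 * For example (hypothetical name and offset): if the entry "foo" with dir
 * index 23 was unlinked in this transaction, the log will contain a range
 * that covers offset 23 but no dir index item for it, so the recovery scan
 * deletes "foo" from the subvolume tree during replay.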
4062 */ 4063 static noinline int log_directory_changes(struct btrfs_trans_handle *trans, 4064 struct btrfs_inode *inode, 4065 struct btrfs_path *path, 4066 struct btrfs_path *dst_path, 4067 struct btrfs_log_ctx *ctx) 4068 { 4069 u64 min_key; 4070 u64 max_key; 4071 int ret; 4072 4073 ret = update_last_dir_index_offset(inode, path, ctx); 4074 if (ret) 4075 return ret; 4076 4077 min_key = BTRFS_DIR_START_INDEX; 4078 max_key = 0; 4079 4080 while (1) { 4081 ret = log_dir_items(trans, inode, path, dst_path, 4082 ctx, min_key, &max_key); 4083 if (ret) 4084 return ret; 4085 if (max_key == (u64)-1) 4086 break; 4087 min_key = max_key + 1; 4088 } 4089 4090 return 0; 4091 } 4092 4093 /* 4094 * a helper function to drop items from the log before we relog an 4095 * inode. max_key_type indicates the highest item type to remove. 4096 * This cannot be run for file data extents because it does not 4097 * free the extents they point to. 4098 */ 4099 static int drop_inode_items(struct btrfs_trans_handle *trans, 4100 struct btrfs_root *log, 4101 struct btrfs_path *path, 4102 struct btrfs_inode *inode, 4103 int max_key_type) 4104 { 4105 int ret; 4106 struct btrfs_key key; 4107 struct btrfs_key found_key; 4108 int start_slot; 4109 4110 key.objectid = btrfs_ino(inode); 4111 key.type = max_key_type; 4112 key.offset = (u64)-1; 4113 4114 while (1) { 4115 ret = btrfs_search_slot(trans, log, &key, path, -1, 1); 4116 if (ret < 0) { 4117 break; 4118 } else if (ret > 0) { 4119 if (path->slots[0] == 0) 4120 break; 4121 path->slots[0]--; 4122 } 4123 4124 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 4125 path->slots[0]); 4126 4127 if (found_key.objectid != key.objectid) 4128 break; 4129 4130 found_key.offset = 0; 4131 found_key.type = 0; 4132 ret = btrfs_bin_search(path->nodes[0], 0, &found_key, &start_slot); 4133 if (ret < 0) 4134 break; 4135 4136 ret = btrfs_del_items(trans, log, path, start_slot, 4137 path->slots[0] - start_slot + 1); 4138 /* 4139 * If start slot isn't 0 then we don't need to re-search, we've 4140 * found the last guy with the objectid in this tree. 
4141 */
4142 if (ret || start_slot != 0)
4143 break;
4144 btrfs_release_path(path);
4145 }
4146 btrfs_release_path(path);
4147 if (ret > 0)
4148 ret = 0;
4149 return ret;
4150 }
4151
4152 static int truncate_inode_items(struct btrfs_trans_handle *trans,
4153 struct btrfs_root *log_root,
4154 struct btrfs_inode *inode,
4155 u64 new_size, u32 min_type)
4156 {
4157 struct btrfs_truncate_control control = {
4158 .new_size = new_size,
4159 .ino = btrfs_ino(inode),
4160 .min_type = min_type,
4161 .skip_ref_updates = true,
4162 };
4163
4164 return btrfs_truncate_inode_items(trans, log_root, &control);
4165 }
4166
4167 static void fill_inode_item(struct btrfs_trans_handle *trans,
4168 struct extent_buffer *leaf,
4169 struct btrfs_inode_item *item,
4170 struct inode *inode, int log_inode_only,
4171 u64 logged_isize)
4172 {
4173 struct btrfs_map_token token;
4174 u64 flags;
4175
4176 btrfs_init_map_token(&token, leaf);
4177
4178 if (log_inode_only) {
4179 /* set the generation to zero so the recovery code
4180 * can tell the difference between a logging
4181 * just to say 'this inode exists' and a logging
4182 * to say 'update this inode with these values'
4183 */
4184 btrfs_set_token_inode_generation(&token, item, 0);
4185 btrfs_set_token_inode_size(&token, item, logged_isize);
4186 } else {
4187 btrfs_set_token_inode_generation(&token, item,
4188 BTRFS_I(inode)->generation);
4189 btrfs_set_token_inode_size(&token, item, inode->i_size);
4190 }
4191
4192 btrfs_set_token_inode_uid(&token, item, i_uid_read(inode));
4193 btrfs_set_token_inode_gid(&token, item, i_gid_read(inode));
4194 btrfs_set_token_inode_mode(&token, item, inode->i_mode);
4195 btrfs_set_token_inode_nlink(&token, item, inode->i_nlink);
4196
4197 btrfs_set_token_timespec_sec(&token, &item->atime,
4198 inode_get_atime_sec(inode));
4199 btrfs_set_token_timespec_nsec(&token, &item->atime,
4200 inode_get_atime_nsec(inode));
4201
4202 btrfs_set_token_timespec_sec(&token, &item->mtime,
4203 inode_get_mtime_sec(inode));
4204 btrfs_set_token_timespec_nsec(&token, &item->mtime,
4205 inode_get_mtime_nsec(inode));
4206
4207 btrfs_set_token_timespec_sec(&token, &item->ctime,
4208 inode_get_ctime_sec(inode));
4209 btrfs_set_token_timespec_nsec(&token, &item->ctime,
4210 inode_get_ctime_nsec(inode));
4211
4212 /*
4213 * We do not need to set the nbytes field, in fact during a fast fsync
4214 * its value may not even be correct, since a fast fsync does not wait
4215 * for ordered extent completion, which is where we update nbytes; it
4216 * only waits for writeback to complete. During log replay as we find
4217 * file extent items and replay them, we adjust the nbytes field of the
4218 * inode item in subvolume tree as needed (see overwrite_item()).
4219 */ 4220 4221 btrfs_set_token_inode_sequence(&token, item, inode_peek_iversion(inode)); 4222 btrfs_set_token_inode_transid(&token, item, trans->transid); 4223 btrfs_set_token_inode_rdev(&token, item, inode->i_rdev); 4224 flags = btrfs_inode_combine_flags(BTRFS_I(inode)->flags, 4225 BTRFS_I(inode)->ro_flags); 4226 btrfs_set_token_inode_flags(&token, item, flags); 4227 btrfs_set_token_inode_block_group(&token, item, 0); 4228 } 4229 4230 static int log_inode_item(struct btrfs_trans_handle *trans, 4231 struct btrfs_root *log, struct btrfs_path *path, 4232 struct btrfs_inode *inode, bool inode_item_dropped) 4233 { 4234 struct btrfs_inode_item *inode_item; 4235 int ret; 4236 4237 /* 4238 * If we are doing a fast fsync and the inode was logged before in the 4239 * current transaction, then we know the inode was previously logged and 4240 * it exists in the log tree. For performance reasons, in this case use 4241 * btrfs_search_slot() directly with ins_len set to 0 so that we never 4242 * attempt a write lock on the leaf's parent, which adds unnecessary lock 4243 * contention in case there are concurrent fsyncs for other inodes of the 4244 * same subvolume. Using btrfs_insert_empty_item() when the inode item 4245 * already exists can also result in unnecessarily splitting a leaf. 4246 */ 4247 if (!inode_item_dropped && inode->logged_trans == trans->transid) { 4248 ret = btrfs_search_slot(trans, log, &inode->location, path, 0, 1); 4249 ASSERT(ret <= 0); 4250 if (ret > 0) 4251 ret = -ENOENT; 4252 } else { 4253 /* 4254 * This means it is the first fsync in the current transaction, 4255 * so the inode item is not in the log and we need to insert it. 4256 * We can never get -EEXIST because we are only called for a fast 4257 * fsync and in case an inode eviction happens after the inode was 4258 * logged before in the current transaction, when we load again 4259 * the inode, we set BTRFS_INODE_NEEDS_FULL_SYNC on its runtime 4260 * flags and set ->logged_trans to 0. 4261 */ 4262 ret = btrfs_insert_empty_item(trans, log, path, &inode->location, 4263 sizeof(*inode_item)); 4264 ASSERT(ret != -EEXIST); 4265 } 4266 if (ret) 4267 return ret; 4268 inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0], 4269 struct btrfs_inode_item); 4270 fill_inode_item(trans, path->nodes[0], inode_item, &inode->vfs_inode, 4271 0, 0); 4272 btrfs_release_path(path); 4273 return 0; 4274 } 4275 4276 static int log_csums(struct btrfs_trans_handle *trans, 4277 struct btrfs_inode *inode, 4278 struct btrfs_root *log_root, 4279 struct btrfs_ordered_sum *sums) 4280 { 4281 const u64 lock_end = sums->logical + sums->len - 1; 4282 struct extent_state *cached_state = NULL; 4283 int ret; 4284 4285 /* 4286 * If this inode was not used for reflink operations in the current 4287 * transaction with new extents, then do the fast path, no need to 4288 * worry about logging checksum items with overlapping ranges. 4289 */ 4290 if (inode->last_reflink_trans < trans->transid) 4291 return btrfs_csum_file_blocks(trans, log_root, sums); 4292 4293 /* 4294 * Serialize logging for checksums. This is to avoid racing with the 4295 * same checksum being logged by another task that is logging another 4296 * file which happens to refer to the same extent as well. Such races 4297 * can leave checksum items in the log with overlapping ranges. 
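 *
 * For example (hypothetical layout): files A and B share extent X through a
 * clone/reflink operation and two tasks fsync A and B concurrently. Both may
 * log csums for X, one for a subrange of it and the other for the whole
 * extent; without this lock the delete-then-insert done below would not be
 * atomic and the log could end up with overlapping csum items for X.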
4298 */ 4299 ret = lock_extent(&log_root->log_csum_range, sums->logical, lock_end, 4300 &cached_state); 4301 if (ret) 4302 return ret; 4303 /* 4304 * Due to extent cloning, we might have logged a csum item that covers a 4305 * subrange of a cloned extent, and later we can end up logging a csum 4306 * item for a larger subrange of the same extent or the entire range. 4307 * This would leave csum items in the log tree that cover the same range 4308 * and break the searches for checksums in the log tree, resulting in 4309 * some checksums missing in the fs/subvolume tree. So just delete (or 4310 * trim and adjust) any existing csum items in the log for this range. 4311 */ 4312 ret = btrfs_del_csums(trans, log_root, sums->logical, sums->len); 4313 if (!ret) 4314 ret = btrfs_csum_file_blocks(trans, log_root, sums); 4315 4316 unlock_extent(&log_root->log_csum_range, sums->logical, lock_end, 4317 &cached_state); 4318 4319 return ret; 4320 } 4321 4322 static noinline int copy_items(struct btrfs_trans_handle *trans, 4323 struct btrfs_inode *inode, 4324 struct btrfs_path *dst_path, 4325 struct btrfs_path *src_path, 4326 int start_slot, int nr, int inode_only, 4327 u64 logged_isize, struct btrfs_log_ctx *ctx) 4328 { 4329 struct btrfs_root *log = inode->root->log_root; 4330 struct btrfs_file_extent_item *extent; 4331 struct extent_buffer *src; 4332 int ret; 4333 struct btrfs_key *ins_keys; 4334 u32 *ins_sizes; 4335 struct btrfs_item_batch batch; 4336 char *ins_data; 4337 int dst_index; 4338 const bool skip_csum = (inode->flags & BTRFS_INODE_NODATASUM); 4339 const u64 i_size = i_size_read(&inode->vfs_inode); 4340 4341 /* 4342 * To keep lockdep happy and avoid deadlocks, clone the source leaf and 4343 * use the clone. This is because otherwise we would be changing the log 4344 * tree, to insert items from the subvolume tree or insert csum items, 4345 * while holding a read lock on a leaf from the subvolume tree, which 4346 * creates a nasty lock dependency when COWing log tree nodes/leaves: 4347 * 4348 * 1) Modifying the log tree triggers an extent buffer allocation while 4349 * holding a write lock on a parent extent buffer from the log tree. 4350 * Allocating the pages for an extent buffer, or the extent buffer 4351 * struct, can trigger inode eviction and finally the inode eviction 4352 * will trigger a release/remove of a delayed node, which requires 4353 * taking the delayed node's mutex; 4354 * 4355 * 2) Allocating a metadata extent for a log tree can trigger the async 4356 * reclaim thread and make us wait for it to release enough space and 4357 * unblock our reservation ticket. The reclaim thread can start 4358 * flushing delayed items, and that in turn results in the need to 4359 * lock delayed node mutexes and in the need to write lock extent 4360 * buffers of a subvolume tree - all this while holding a write lock 4361 * on the parent extent buffer in the log tree. 4362 * 4363 * So one task in scenario 1) running in parallel with another task in 4364 * scenario 2) could lead to a deadlock, one wanting to lock a delayed 4365 * node mutex while having a read lock on a leaf from the subvolume, 4366 * while the other is holding the delayed node's mutex and wants to 4367 * write lock the same subvolume leaf for flushing delayed items. 
4368 */ 4369 ret = clone_leaf(src_path, ctx); 4370 if (ret < 0) 4371 return ret; 4372 4373 src = src_path->nodes[0]; 4374 4375 ins_data = kmalloc(nr * sizeof(struct btrfs_key) + 4376 nr * sizeof(u32), GFP_NOFS); 4377 if (!ins_data) 4378 return -ENOMEM; 4379 4380 ins_sizes = (u32 *)ins_data; 4381 ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32)); 4382 batch.keys = ins_keys; 4383 batch.data_sizes = ins_sizes; 4384 batch.total_data_size = 0; 4385 batch.nr = 0; 4386 4387 dst_index = 0; 4388 for (int i = 0; i < nr; i++) { 4389 const int src_slot = start_slot + i; 4390 struct btrfs_root *csum_root; 4391 struct btrfs_ordered_sum *sums; 4392 struct btrfs_ordered_sum *sums_next; 4393 LIST_HEAD(ordered_sums); 4394 u64 disk_bytenr; 4395 u64 disk_num_bytes; 4396 u64 extent_offset; 4397 u64 extent_num_bytes; 4398 bool is_old_extent; 4399 4400 btrfs_item_key_to_cpu(src, &ins_keys[dst_index], src_slot); 4401 4402 if (ins_keys[dst_index].type != BTRFS_EXTENT_DATA_KEY) 4403 goto add_to_batch; 4404 4405 extent = btrfs_item_ptr(src, src_slot, 4406 struct btrfs_file_extent_item); 4407 4408 is_old_extent = (btrfs_file_extent_generation(src, extent) < 4409 trans->transid); 4410 4411 /* 4412 * Don't copy extents from past generations. That would make us 4413 * log a lot more metadata for common cases like doing only a 4414 * few random writes into a file and then fsync it for the first 4415 * time or after the full sync flag is set on the inode. We can 4416 * get leaves full of extent items, most of which are from past 4417 * generations, so we can skip them - as long as the inode has 4418 * not been the target of a reflink operation in this transaction, 4419 * as in that case it might have had file extent items with old 4420 * generations copied into it. We also must always log prealloc 4421 * extents that start at or beyond eof, otherwise we would lose 4422 * them on log replay. 4423 */ 4424 if (is_old_extent && 4425 ins_keys[dst_index].offset < i_size && 4426 inode->last_reflink_trans < trans->transid) 4427 continue; 4428 4429 if (skip_csum) 4430 goto add_to_batch; 4431 4432 /* Only regular extents have checksums. */ 4433 if (btrfs_file_extent_type(src, extent) != BTRFS_FILE_EXTENT_REG) 4434 goto add_to_batch; 4435 4436 /* 4437 * If it's an extent created in a past transaction, then its 4438 * checksums are already accessible from the committed csum tree, 4439 * no need to log them. 4440 */ 4441 if (is_old_extent) 4442 goto add_to_batch; 4443 4444 disk_bytenr = btrfs_file_extent_disk_bytenr(src, extent); 4445 /* If it's an explicit hole, there are no checksums. 
*/ 4446 if (disk_bytenr == 0) 4447 goto add_to_batch; 4448 4449 disk_num_bytes = btrfs_file_extent_disk_num_bytes(src, extent); 4450 4451 if (btrfs_file_extent_compression(src, extent)) { 4452 extent_offset = 0; 4453 extent_num_bytes = disk_num_bytes; 4454 } else { 4455 extent_offset = btrfs_file_extent_offset(src, extent); 4456 extent_num_bytes = btrfs_file_extent_num_bytes(src, extent); 4457 } 4458 4459 csum_root = btrfs_csum_root(trans->fs_info, disk_bytenr); 4460 disk_bytenr += extent_offset; 4461 ret = btrfs_lookup_csums_list(csum_root, disk_bytenr, 4462 disk_bytenr + extent_num_bytes - 1, 4463 &ordered_sums, 0, false); 4464 if (ret) 4465 goto out; 4466 4467 list_for_each_entry_safe(sums, sums_next, &ordered_sums, list) { 4468 if (!ret) 4469 ret = log_csums(trans, inode, log, sums); 4470 list_del(&sums->list); 4471 kfree(sums); 4472 } 4473 if (ret) 4474 goto out; 4475 4476 add_to_batch: 4477 ins_sizes[dst_index] = btrfs_item_size(src, src_slot); 4478 batch.total_data_size += ins_sizes[dst_index]; 4479 batch.nr++; 4480 dst_index++; 4481 } 4482 4483 /* 4484 * We have a leaf full of old extent items that don't need to be logged, 4485 * so we don't need to do anything. 4486 */ 4487 if (batch.nr == 0) 4488 goto out; 4489 4490 ret = btrfs_insert_empty_items(trans, log, dst_path, &batch); 4491 if (ret) 4492 goto out; 4493 4494 dst_index = 0; 4495 for (int i = 0; i < nr; i++) { 4496 const int src_slot = start_slot + i; 4497 const int dst_slot = dst_path->slots[0] + dst_index; 4498 struct btrfs_key key; 4499 unsigned long src_offset; 4500 unsigned long dst_offset; 4501 4502 /* 4503 * We're done, all the remaining items in the source leaf 4504 * correspond to old file extent items. 4505 */ 4506 if (dst_index >= batch.nr) 4507 break; 4508 4509 btrfs_item_key_to_cpu(src, &key, src_slot); 4510 4511 if (key.type != BTRFS_EXTENT_DATA_KEY) 4512 goto copy_item; 4513 4514 extent = btrfs_item_ptr(src, src_slot, 4515 struct btrfs_file_extent_item); 4516 4517 /* See the comment in the previous loop, same logic. 
*/ 4518 if (btrfs_file_extent_generation(src, extent) < trans->transid && 4519 key.offset < i_size && 4520 inode->last_reflink_trans < trans->transid) 4521 continue; 4522 4523 copy_item: 4524 dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0], dst_slot); 4525 src_offset = btrfs_item_ptr_offset(src, src_slot); 4526 4527 if (key.type == BTRFS_INODE_ITEM_KEY) { 4528 struct btrfs_inode_item *inode_item; 4529 4530 inode_item = btrfs_item_ptr(dst_path->nodes[0], dst_slot, 4531 struct btrfs_inode_item); 4532 fill_inode_item(trans, dst_path->nodes[0], inode_item, 4533 &inode->vfs_inode, 4534 inode_only == LOG_INODE_EXISTS, 4535 logged_isize); 4536 } else { 4537 copy_extent_buffer(dst_path->nodes[0], src, dst_offset, 4538 src_offset, ins_sizes[dst_index]); 4539 } 4540 4541 dst_index++; 4542 } 4543 4544 btrfs_mark_buffer_dirty(trans, dst_path->nodes[0]); 4545 btrfs_release_path(dst_path); 4546 out: 4547 kfree(ins_data); 4548 4549 return ret; 4550 } 4551 4552 static int extent_cmp(void *priv, const struct list_head *a, 4553 const struct list_head *b) 4554 { 4555 const struct extent_map *em1, *em2; 4556 4557 em1 = list_entry(a, struct extent_map, list); 4558 em2 = list_entry(b, struct extent_map, list); 4559 4560 if (em1->start < em2->start) 4561 return -1; 4562 else if (em1->start > em2->start) 4563 return 1; 4564 return 0; 4565 } 4566 4567 static int log_extent_csums(struct btrfs_trans_handle *trans, 4568 struct btrfs_inode *inode, 4569 struct btrfs_root *log_root, 4570 const struct extent_map *em, 4571 struct btrfs_log_ctx *ctx) 4572 { 4573 struct btrfs_ordered_extent *ordered; 4574 struct btrfs_root *csum_root; 4575 u64 csum_offset; 4576 u64 csum_len; 4577 u64 mod_start = em->mod_start; 4578 u64 mod_len = em->mod_len; 4579 LIST_HEAD(ordered_sums); 4580 int ret = 0; 4581 4582 if (inode->flags & BTRFS_INODE_NODATASUM || 4583 (em->flags & EXTENT_FLAG_PREALLOC) || 4584 em->block_start == EXTENT_MAP_HOLE) 4585 return 0; 4586 4587 list_for_each_entry(ordered, &ctx->ordered_extents, log_list) { 4588 const u64 ordered_end = ordered->file_offset + ordered->num_bytes; 4589 const u64 mod_end = mod_start + mod_len; 4590 struct btrfs_ordered_sum *sums; 4591 4592 if (mod_len == 0) 4593 break; 4594 4595 if (ordered_end <= mod_start) 4596 continue; 4597 if (mod_end <= ordered->file_offset) 4598 break; 4599 4600 /* 4601 * We are going to copy all the csums on this ordered extent, so 4602 * go ahead and adjust mod_start and mod_len in case this ordered 4603 * extent has already been logged. 4604 */ 4605 if (ordered->file_offset > mod_start) { 4606 if (ordered_end >= mod_end) 4607 mod_len = ordered->file_offset - mod_start; 4608 /* 4609 * If we have this case 4610 * 4611 * |--------- logged extent ---------| 4612 * |----- ordered extent ----| 4613 * 4614 * Just don't mess with mod_start and mod_len, we'll 4615 * just end up logging more csums than we need and it 4616 * will be ok. 4617 */ 4618 } else { 4619 if (ordered_end < mod_end) { 4620 mod_len = mod_end - ordered_end; 4621 mod_start = ordered_end; 4622 } else { 4623 mod_len = 0; 4624 } 4625 } 4626 4627 /* 4628 * To keep us from looping for the above case of an ordered 4629 * extent that falls inside of the logged extent. 4630 */ 4631 if (test_and_set_bit(BTRFS_ORDERED_LOGGED_CSUM, &ordered->flags)) 4632 continue; 4633 4634 list_for_each_entry(sums, &ordered->list, list) { 4635 ret = log_csums(trans, inode, log_root, sums); 4636 if (ret) 4637 return ret; 4638 } 4639 } 4640 4641 /* We're done, found all csums in the ordered extents. 
*/ 4642 if (mod_len == 0) 4643 return 0; 4644 4645 /* If we're compressed we have to save the entire range of csums. */ 4646 if (extent_map_is_compressed(em)) { 4647 csum_offset = 0; 4648 csum_len = max(em->block_len, em->orig_block_len); 4649 } else { 4650 csum_offset = mod_start - em->start; 4651 csum_len = mod_len; 4652 } 4653 4654 /* block start is already adjusted for the file extent offset. */ 4655 csum_root = btrfs_csum_root(trans->fs_info, em->block_start); 4656 ret = btrfs_lookup_csums_list(csum_root, em->block_start + csum_offset, 4657 em->block_start + csum_offset + 4658 csum_len - 1, &ordered_sums, 0, false); 4659 if (ret) 4660 return ret; 4661 4662 while (!list_empty(&ordered_sums)) { 4663 struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next, 4664 struct btrfs_ordered_sum, 4665 list); 4666 if (!ret) 4667 ret = log_csums(trans, inode, log_root, sums); 4668 list_del(&sums->list); 4669 kfree(sums); 4670 } 4671 4672 return ret; 4673 } 4674 4675 static int log_one_extent(struct btrfs_trans_handle *trans, 4676 struct btrfs_inode *inode, 4677 const struct extent_map *em, 4678 struct btrfs_path *path, 4679 struct btrfs_log_ctx *ctx) 4680 { 4681 struct btrfs_drop_extents_args drop_args = { 0 }; 4682 struct btrfs_root *log = inode->root->log_root; 4683 struct btrfs_file_extent_item fi = { 0 }; 4684 struct extent_buffer *leaf; 4685 struct btrfs_key key; 4686 enum btrfs_compression_type compress_type; 4687 u64 extent_offset = em->start - em->orig_start; 4688 u64 block_len; 4689 int ret; 4690 4691 btrfs_set_stack_file_extent_generation(&fi, trans->transid); 4692 if (em->flags & EXTENT_FLAG_PREALLOC) 4693 btrfs_set_stack_file_extent_type(&fi, BTRFS_FILE_EXTENT_PREALLOC); 4694 else 4695 btrfs_set_stack_file_extent_type(&fi, BTRFS_FILE_EXTENT_REG); 4696 4697 block_len = max(em->block_len, em->orig_block_len); 4698 compress_type = extent_map_compression(em); 4699 if (compress_type != BTRFS_COMPRESS_NONE) { 4700 btrfs_set_stack_file_extent_disk_bytenr(&fi, em->block_start); 4701 btrfs_set_stack_file_extent_disk_num_bytes(&fi, block_len); 4702 } else if (em->block_start < EXTENT_MAP_LAST_BYTE) { 4703 btrfs_set_stack_file_extent_disk_bytenr(&fi, em->block_start - 4704 extent_offset); 4705 btrfs_set_stack_file_extent_disk_num_bytes(&fi, block_len); 4706 } 4707 4708 btrfs_set_stack_file_extent_offset(&fi, extent_offset); 4709 btrfs_set_stack_file_extent_num_bytes(&fi, em->len); 4710 btrfs_set_stack_file_extent_ram_bytes(&fi, em->ram_bytes); 4711 btrfs_set_stack_file_extent_compression(&fi, compress_type); 4712 4713 ret = log_extent_csums(trans, inode, log, em, ctx); 4714 if (ret) 4715 return ret; 4716 4717 /* 4718 * If this is the first time we are logging the inode in the current 4719 * transaction, we can avoid btrfs_drop_extents(), which is expensive 4720 * because it does a deletion search, which always acquires write locks 4721 * for extent buffers at levels 2, 1 and 0. This not only wastes time 4722 * but also adds significant contention in a log tree, since log trees 4723 * are small, with a root at level 2 or 3 at most, due to their short 4724 * life span. 
4725 */ 4726 if (ctx->logged_before) { 4727 drop_args.path = path; 4728 drop_args.start = em->start; 4729 drop_args.end = em->start + em->len; 4730 drop_args.replace_extent = true; 4731 drop_args.extent_item_size = sizeof(fi); 4732 ret = btrfs_drop_extents(trans, log, inode, &drop_args); 4733 if (ret) 4734 return ret; 4735 } 4736 4737 if (!drop_args.extent_inserted) { 4738 key.objectid = btrfs_ino(inode); 4739 key.type = BTRFS_EXTENT_DATA_KEY; 4740 key.offset = em->start; 4741 4742 ret = btrfs_insert_empty_item(trans, log, path, &key, 4743 sizeof(fi)); 4744 if (ret) 4745 return ret; 4746 } 4747 leaf = path->nodes[0]; 4748 write_extent_buffer(leaf, &fi, 4749 btrfs_item_ptr_offset(leaf, path->slots[0]), 4750 sizeof(fi)); 4751 btrfs_mark_buffer_dirty(trans, leaf); 4752 4753 btrfs_release_path(path); 4754 4755 return ret; 4756 } 4757 4758 /* 4759 * Log all prealloc extents beyond the inode's i_size to make sure we do not 4760 * lose them after doing a full/fast fsync and replaying the log. We scan the 4761 * subvolume's root instead of iterating the inode's extent map tree because 4762 * otherwise we can log incorrect extent items based on extent map conversion. 4763 * That can happen due to the fact that extent maps are merged when they 4764 * are not in the extent map tree's list of modified extents. 4765 */ 4766 static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans, 4767 struct btrfs_inode *inode, 4768 struct btrfs_path *path, 4769 struct btrfs_log_ctx *ctx) 4770 { 4771 struct btrfs_root *root = inode->root; 4772 struct btrfs_key key; 4773 const u64 i_size = i_size_read(&inode->vfs_inode); 4774 const u64 ino = btrfs_ino(inode); 4775 struct btrfs_path *dst_path = NULL; 4776 bool dropped_extents = false; 4777 u64 truncate_offset = i_size; 4778 struct extent_buffer *leaf; 4779 int slot; 4780 int ins_nr = 0; 4781 int start_slot = 0; 4782 int ret; 4783 4784 if (!(inode->flags & BTRFS_INODE_PREALLOC)) 4785 return 0; 4786 4787 key.objectid = ino; 4788 key.type = BTRFS_EXTENT_DATA_KEY; 4789 key.offset = i_size; 4790 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 4791 if (ret < 0) 4792 goto out; 4793 4794 /* 4795 * We must check if there is a prealloc extent that starts before the 4796 * i_size and crosses the i_size boundary. This is to ensure later we 4797 * truncate down to the end of that extent and not to the i_size, as 4798 * otherwise we end up losing part of the prealloc extent after a log 4799 * replay and with an implicit hole if there is another prealloc extent 4800 * that starts at an offset beyond i_size. 
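 *
 * For example (hypothetical sizes): if i_size is 90K and a prealloc extent
 * covers the range [64K, 128K), truncating the log tree items at 90K would
 * drop the [90K, 128K) part of that extent, so truncate_offset is bumped to
 * 128K below to keep the whole extent after a log replay.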
4801 */ 4802 ret = btrfs_previous_item(root, path, ino, BTRFS_EXTENT_DATA_KEY); 4803 if (ret < 0) 4804 goto out; 4805 4806 if (ret == 0) { 4807 struct btrfs_file_extent_item *ei; 4808 4809 leaf = path->nodes[0]; 4810 slot = path->slots[0]; 4811 ei = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); 4812 4813 if (btrfs_file_extent_type(leaf, ei) == 4814 BTRFS_FILE_EXTENT_PREALLOC) { 4815 u64 extent_end; 4816 4817 btrfs_item_key_to_cpu(leaf, &key, slot); 4818 extent_end = key.offset + 4819 btrfs_file_extent_num_bytes(leaf, ei); 4820 4821 if (extent_end > i_size) 4822 truncate_offset = extent_end; 4823 } 4824 } else { 4825 ret = 0; 4826 } 4827 4828 while (true) { 4829 leaf = path->nodes[0]; 4830 slot = path->slots[0]; 4831 4832 if (slot >= btrfs_header_nritems(leaf)) { 4833 if (ins_nr > 0) { 4834 ret = copy_items(trans, inode, dst_path, path, 4835 start_slot, ins_nr, 1, 0, ctx); 4836 if (ret < 0) 4837 goto out; 4838 ins_nr = 0; 4839 } 4840 ret = btrfs_next_leaf(root, path); 4841 if (ret < 0) 4842 goto out; 4843 if (ret > 0) { 4844 ret = 0; 4845 break; 4846 } 4847 continue; 4848 } 4849 4850 btrfs_item_key_to_cpu(leaf, &key, slot); 4851 if (key.objectid > ino) 4852 break; 4853 if (WARN_ON_ONCE(key.objectid < ino) || 4854 key.type < BTRFS_EXTENT_DATA_KEY || 4855 key.offset < i_size) { 4856 path->slots[0]++; 4857 continue; 4858 } 4859 if (!dropped_extents) { 4860 /* 4861 * Avoid logging extent items logged in past fsync calls 4862 * and leading to duplicate keys in the log tree. 4863 */ 4864 ret = truncate_inode_items(trans, root->log_root, inode, 4865 truncate_offset, 4866 BTRFS_EXTENT_DATA_KEY); 4867 if (ret) 4868 goto out; 4869 dropped_extents = true; 4870 } 4871 if (ins_nr == 0) 4872 start_slot = slot; 4873 ins_nr++; 4874 path->slots[0]++; 4875 if (!dst_path) { 4876 dst_path = btrfs_alloc_path(); 4877 if (!dst_path) { 4878 ret = -ENOMEM; 4879 goto out; 4880 } 4881 } 4882 } 4883 if (ins_nr > 0) 4884 ret = copy_items(trans, inode, dst_path, path, 4885 start_slot, ins_nr, 1, 0, ctx); 4886 out: 4887 btrfs_release_path(path); 4888 btrfs_free_path(dst_path); 4889 return ret; 4890 } 4891 4892 static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, 4893 struct btrfs_inode *inode, 4894 struct btrfs_path *path, 4895 struct btrfs_log_ctx *ctx) 4896 { 4897 struct btrfs_ordered_extent *ordered; 4898 struct btrfs_ordered_extent *tmp; 4899 struct extent_map *em, *n; 4900 LIST_HEAD(extents); 4901 struct extent_map_tree *tree = &inode->extent_tree; 4902 int ret = 0; 4903 int num = 0; 4904 4905 write_lock(&tree->lock); 4906 4907 list_for_each_entry_safe(em, n, &tree->modified_extents, list) { 4908 list_del_init(&em->list); 4909 /* 4910 * Just an arbitrary number, this can be really CPU intensive 4911 * once we start getting a lot of extents, and really once we 4912 * have a bunch of extents we just want to commit since it will 4913 * be faster. 4914 */ 4915 if (++num > 32768) { 4916 list_del_init(&tree->modified_extents); 4917 ret = -EFBIG; 4918 goto process; 4919 } 4920 4921 if (em->generation < trans->transid) 4922 continue; 4923 4924 /* We log prealloc extents beyond eof later. 
*/
4925 if ((em->flags & EXTENT_FLAG_PREALLOC) &&
4926 em->start >= i_size_read(&inode->vfs_inode))
4927 continue;
4928
4929 /* Need a ref to keep it from getting evicted from cache */
4930 refcount_inc(&em->refs);
4931 em->flags |= EXTENT_FLAG_LOGGING;
4932 list_add_tail(&em->list, &extents);
4933 num++;
4934 }
4935
4936 list_sort(NULL, &extents, extent_cmp);
4937 process:
4938 while (!list_empty(&extents)) {
4939 em = list_entry(extents.next, struct extent_map, list);
4940
4941 list_del_init(&em->list);
4942
4943 /*
4944 * If we had an error we just need to delete everybody from our
4945 * private list.
4946 */
4947 if (ret) {
4948 clear_em_logging(tree, em);
4949 free_extent_map(em);
4950 continue;
4951 }
4952
4953 write_unlock(&tree->lock);
4954
4955 ret = log_one_extent(trans, inode, em, path, ctx);
4956 write_lock(&tree->lock);
4957 clear_em_logging(tree, em);
4958 free_extent_map(em);
4959 }
4960 WARN_ON(!list_empty(&extents));
4961 write_unlock(&tree->lock);
4962
4963 if (!ret)
4964 ret = btrfs_log_prealloc_extents(trans, inode, path, ctx);
4965 if (ret)
4966 return ret;
4967
4968 /*
4969 * We have logged all extents successfully, now make sure the commit of
4970 * the current transaction waits for the ordered extents to complete
4971 * before it commits and wipes out the log trees, otherwise we would
4972 * lose data if an ordered extent completes after the transaction
4973 * commits and a power failure happens after the transaction commit.
4974 */
4975 list_for_each_entry_safe(ordered, tmp, &ctx->ordered_extents, log_list) {
4976 list_del_init(&ordered->log_list);
4977 set_bit(BTRFS_ORDERED_LOGGED, &ordered->flags);
4978
4979 if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) {
4980 spin_lock_irq(&inode->ordered_tree_lock);
4981 if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) {
4982 set_bit(BTRFS_ORDERED_PENDING, &ordered->flags);
4983 atomic_inc(&trans->transaction->pending_ordered);
4984 }
4985 spin_unlock_irq(&inode->ordered_tree_lock);
4986 }
4987 btrfs_put_ordered_extent(ordered);
4988 }
4989
4990 return 0;
4991 }
4992
4993 static int logged_inode_size(struct btrfs_root *log, struct btrfs_inode *inode,
4994 struct btrfs_path *path, u64 *size_ret)
4995 {
4996 struct btrfs_key key;
4997 int ret;
4998
4999 key.objectid = btrfs_ino(inode);
5000 key.type = BTRFS_INODE_ITEM_KEY;
5001 key.offset = 0;
5002
5003 ret = btrfs_search_slot(NULL, log, &key, path, 0, 0);
5004 if (ret < 0) {
5005 return ret;
5006 } else if (ret > 0) {
5007 *size_ret = 0;
5008 } else {
5009 struct btrfs_inode_item *item;
5010
5011 item = btrfs_item_ptr(path->nodes[0], path->slots[0],
5012 struct btrfs_inode_item);
5013 *size_ret = btrfs_inode_size(path->nodes[0], item);
5014 /*
5015 * If the in-memory inode's i_size is smaller than the inode
5016 * size stored in the btree, return the inode's i_size, so
5017 * that we get a correct inode size after replaying the log
5018 * when before a power failure we had a shrinking truncate
5019 * followed by addition of a new name (rename / new hard link).
5020 * Otherwise return the inode size from the btree, to avoid
5021 * data loss when replaying a log due to previously doing a
5022 * write that expands the inode's size and logging a new name
5023 * immediately after.
5024 */
5025 if (*size_ret > inode->vfs_inode.i_size)
5026 *size_ret = inode->vfs_inode.i_size;
5027 }
5028
5029 btrfs_release_path(path);
5030 return 0;
5031 }
5032
5033 /*
5034 * At the moment we always log all xattrs.
This is to figure out at log replay 5035 * time which xattrs must have their deletion replayed. If a xattr is missing 5036 * in the log tree and exists in the fs/subvol tree, we delete it. This is 5037 * because if a xattr is deleted, the inode is fsynced and a power failure 5038 * happens, causing the log to be replayed the next time the fs is mounted, 5039 * we want the xattr to not exist anymore (same behaviour as other filesystems 5040 * with a journal, ext3/4, xfs, f2fs, etc). 5041 */ 5042 static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans, 5043 struct btrfs_inode *inode, 5044 struct btrfs_path *path, 5045 struct btrfs_path *dst_path, 5046 struct btrfs_log_ctx *ctx) 5047 { 5048 struct btrfs_root *root = inode->root; 5049 int ret; 5050 struct btrfs_key key; 5051 const u64 ino = btrfs_ino(inode); 5052 int ins_nr = 0; 5053 int start_slot = 0; 5054 bool found_xattrs = false; 5055 5056 if (test_bit(BTRFS_INODE_NO_XATTRS, &inode->runtime_flags)) 5057 return 0; 5058 5059 key.objectid = ino; 5060 key.type = BTRFS_XATTR_ITEM_KEY; 5061 key.offset = 0; 5062 5063 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 5064 if (ret < 0) 5065 return ret; 5066 5067 while (true) { 5068 int slot = path->slots[0]; 5069 struct extent_buffer *leaf = path->nodes[0]; 5070 int nritems = btrfs_header_nritems(leaf); 5071 5072 if (slot >= nritems) { 5073 if (ins_nr > 0) { 5074 ret = copy_items(trans, inode, dst_path, path, 5075 start_slot, ins_nr, 1, 0, ctx); 5076 if (ret < 0) 5077 return ret; 5078 ins_nr = 0; 5079 } 5080 ret = btrfs_next_leaf(root, path); 5081 if (ret < 0) 5082 return ret; 5083 else if (ret > 0) 5084 break; 5085 continue; 5086 } 5087 5088 btrfs_item_key_to_cpu(leaf, &key, slot); 5089 if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY) 5090 break; 5091 5092 if (ins_nr == 0) 5093 start_slot = slot; 5094 ins_nr++; 5095 path->slots[0]++; 5096 found_xattrs = true; 5097 cond_resched(); 5098 } 5099 if (ins_nr > 0) { 5100 ret = copy_items(trans, inode, dst_path, path, 5101 start_slot, ins_nr, 1, 0, ctx); 5102 if (ret < 0) 5103 return ret; 5104 } 5105 5106 if (!found_xattrs) 5107 set_bit(BTRFS_INODE_NO_XATTRS, &inode->runtime_flags); 5108 5109 return 0; 5110 } 5111 5112 /* 5113 * When using the NO_HOLES feature if we punched a hole that causes the 5114 * deletion of entire leafs or all the extent items of the first leaf (the one 5115 * that contains the inode item and references) we may end up not processing 5116 * any extents, because there are no leafs with a generation matching the 5117 * current transaction that have extent items for our inode. So we need to find 5118 * if any holes exist and then log them. We also need to log holes after any 5119 * truncate operation that changes the inode's size. 
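 *
 * For example (hypothetical): with NO_HOLES enabled, punching a hole that
 * deletes entire leaves of extent items may leave no leaf with a generation
 * matching the current transaction that still has extent items for the
 * inode, so without this explicit scan the hole would never make it into the
 * log and, after a log replay, reads of that range could return the old data
 * instead of zeroes.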
5120 */ 5121 static int btrfs_log_holes(struct btrfs_trans_handle *trans, 5122 struct btrfs_inode *inode, 5123 struct btrfs_path *path) 5124 { 5125 struct btrfs_root *root = inode->root; 5126 struct btrfs_fs_info *fs_info = root->fs_info; 5127 struct btrfs_key key; 5128 const u64 ino = btrfs_ino(inode); 5129 const u64 i_size = i_size_read(&inode->vfs_inode); 5130 u64 prev_extent_end = 0; 5131 int ret; 5132 5133 if (!btrfs_fs_incompat(fs_info, NO_HOLES) || i_size == 0) 5134 return 0; 5135 5136 key.objectid = ino; 5137 key.type = BTRFS_EXTENT_DATA_KEY; 5138 key.offset = 0; 5139 5140 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 5141 if (ret < 0) 5142 return ret; 5143 5144 while (true) { 5145 struct extent_buffer *leaf = path->nodes[0]; 5146 5147 if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { 5148 ret = btrfs_next_leaf(root, path); 5149 if (ret < 0) 5150 return ret; 5151 if (ret > 0) { 5152 ret = 0; 5153 break; 5154 } 5155 leaf = path->nodes[0]; 5156 } 5157 5158 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 5159 if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) 5160 break; 5161 5162 /* We have a hole, log it. */ 5163 if (prev_extent_end < key.offset) { 5164 const u64 hole_len = key.offset - prev_extent_end; 5165 5166 /* 5167 * Release the path to avoid deadlocks with other code 5168 * paths that search the root while holding locks on 5169 * leafs from the log root. 5170 */ 5171 btrfs_release_path(path); 5172 ret = btrfs_insert_hole_extent(trans, root->log_root, 5173 ino, prev_extent_end, 5174 hole_len); 5175 if (ret < 0) 5176 return ret; 5177 5178 /* 5179 * Search for the same key again in the root. Since it's 5180 * an extent item and we are holding the inode lock, the 5181 * key must still exist. If it doesn't just emit warning 5182 * and return an error to fall back to a transaction 5183 * commit. 5184 */ 5185 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 5186 if (ret < 0) 5187 return ret; 5188 if (WARN_ON(ret > 0)) 5189 return -ENOENT; 5190 leaf = path->nodes[0]; 5191 } 5192 5193 prev_extent_end = btrfs_file_extent_end(path); 5194 path->slots[0]++; 5195 cond_resched(); 5196 } 5197 5198 if (prev_extent_end < i_size) { 5199 u64 hole_len; 5200 5201 btrfs_release_path(path); 5202 hole_len = ALIGN(i_size - prev_extent_end, fs_info->sectorsize); 5203 ret = btrfs_insert_hole_extent(trans, root->log_root, ino, 5204 prev_extent_end, hole_len); 5205 if (ret < 0) 5206 return ret; 5207 } 5208 5209 return 0; 5210 } 5211 5212 /* 5213 * When we are logging a new inode X, check if it doesn't have a reference that 5214 * matches the reference from some other inode Y created in a past transaction 5215 * and that was renamed in the current transaction. If we don't do this, then at 5216 * log replay time we can lose inode Y (and all its files if it's a directory): 5217 * 5218 * mkdir /mnt/x 5219 * echo "hello world" > /mnt/x/foobar 5220 * sync 5221 * mv /mnt/x /mnt/y 5222 * mkdir /mnt/x # or touch /mnt/x 5223 * xfs_io -c fsync /mnt/x 5224 * <power fail> 5225 * mount fs, trigger log replay 5226 * 5227 * After the log replay procedure, we would lose the first directory and all its 5228 * files (file foobar). 
5229 * For the case where inode Y is not a directory we simply end up losing it: 5230 * 5231 * echo "123" > /mnt/foo 5232 * sync 5233 * mv /mnt/foo /mnt/bar 5234 * echo "abc" > /mnt/foo 5235 * xfs_io -c fsync /mnt/foo 5236 * <power fail> 5237 * 5238 * We also need this for cases where a snapshot entry is replaced by some other 5239 * entry (file or directory) otherwise we end up with an unreplayable log due to 5240 * attempts to delete the snapshot entry (entry of type BTRFS_ROOT_ITEM_KEY) as 5241 * if it were a regular entry: 5242 * 5243 * mkdir /mnt/x 5244 * btrfs subvolume snapshot /mnt /mnt/x/snap 5245 * btrfs subvolume delete /mnt/x/snap 5246 * rmdir /mnt/x 5247 * mkdir /mnt/x 5248 * fsync /mnt/x or fsync some new file inside it 5249 * <power fail> 5250 * 5251 * The snapshot delete, rmdir of x, mkdir of a new x and the fsync all happen in 5252 * the same transaction. 5253 */ 5254 static int btrfs_check_ref_name_override(struct extent_buffer *eb, 5255 const int slot, 5256 const struct btrfs_key *key, 5257 struct btrfs_inode *inode, 5258 u64 *other_ino, u64 *other_parent) 5259 { 5260 int ret; 5261 struct btrfs_path *search_path; 5262 char *name = NULL; 5263 u32 name_len = 0; 5264 u32 item_size = btrfs_item_size(eb, slot); 5265 u32 cur_offset = 0; 5266 unsigned long ptr = btrfs_item_ptr_offset(eb, slot); 5267 5268 search_path = btrfs_alloc_path(); 5269 if (!search_path) 5270 return -ENOMEM; 5271 search_path->search_commit_root = 1; 5272 search_path->skip_locking = 1; 5273 5274 while (cur_offset < item_size) { 5275 u64 parent; 5276 u32 this_name_len; 5277 u32 this_len; 5278 unsigned long name_ptr; 5279 struct btrfs_dir_item *di; 5280 struct fscrypt_str name_str; 5281 5282 if (key->type == BTRFS_INODE_REF_KEY) { 5283 struct btrfs_inode_ref *iref; 5284 5285 iref = (struct btrfs_inode_ref *)(ptr + cur_offset); 5286 parent = key->offset; 5287 this_name_len = btrfs_inode_ref_name_len(eb, iref); 5288 name_ptr = (unsigned long)(iref + 1); 5289 this_len = sizeof(*iref) + this_name_len; 5290 } else { 5291 struct btrfs_inode_extref *extref; 5292 5293 extref = (struct btrfs_inode_extref *)(ptr + 5294 cur_offset); 5295 parent = btrfs_inode_extref_parent(eb, extref); 5296 this_name_len = btrfs_inode_extref_name_len(eb, extref); 5297 name_ptr = (unsigned long)&extref->name; 5298 this_len = sizeof(*extref) + this_name_len; 5299 } 5300 5301 if (this_name_len > name_len) { 5302 char *new_name; 5303 5304 new_name = krealloc(name, this_name_len, GFP_NOFS); 5305 if (!new_name) { 5306 ret = -ENOMEM; 5307 goto out; 5308 } 5309 name_len = this_name_len; 5310 name = new_name; 5311 } 5312 5313 read_extent_buffer(eb, name, name_ptr, this_name_len); 5314 5315 name_str.name = name; 5316 name_str.len = this_name_len; 5317 di = btrfs_lookup_dir_item(NULL, inode->root, search_path, 5318 parent, &name_str, 0); 5319 if (di && !IS_ERR(di)) { 5320 struct btrfs_key di_key; 5321 5322 btrfs_dir_item_key_to_cpu(search_path->nodes[0], 5323 di, &di_key); 5324 if (di_key.type == BTRFS_INODE_ITEM_KEY) { 5325 if (di_key.objectid != key->objectid) { 5326 ret = 1; 5327 *other_ino = di_key.objectid; 5328 *other_parent = parent; 5329 } else { 5330 ret = 0; 5331 } 5332 } else { 5333 ret = -EAGAIN; 5334 } 5335 goto out; 5336 } else if (IS_ERR(di)) { 5337 ret = PTR_ERR(di); 5338 goto out; 5339 } 5340 btrfs_release_path(search_path); 5341 5342 cur_offset += this_len; 5343 } 5344 ret = 0; 5345 out: 5346 btrfs_free_path(search_path); 5347 kfree(name); 5348 return ret; 5349 } 5350 5351 /* 5352 * Check if we need to log an inode. 
This is used in contexts where while
5353 * logging an inode we need to log another inode (either that it exists or in
5354 * full mode). This is used instead of btrfs_inode_in_log() because the latter
5355 * requires the inode to be in the log and have the log transaction committed,
5356 * while here we do not care if the log transaction was already committed - our
5357 * caller will commit the log later - and we want to avoid logging an inode
5358 * multiple times when multiple tasks have joined the same log transaction.
5359 */
5360 static bool need_log_inode(const struct btrfs_trans_handle *trans,
5361 struct btrfs_inode *inode)
5362 {
5363 /*
5364 * If a directory was not modified, no dentries added or removed, we can
5365 * and should avoid logging it.
5366 */
5367 if (S_ISDIR(inode->vfs_inode.i_mode) && inode->last_trans < trans->transid)
5368 return false;
5369
5370 /*
5371 * If this inode does not have new/updated/deleted xattrs since the last
5372 * time it was logged and is flagged as logged in the current transaction,
5373 * we can skip logging it. As for new/deleted names, those are updated in
5374 * the log by link/unlink/rename operations.
5375 * In case the inode was logged and then evicted and reloaded, its
5376 * logged_trans will be 0, in which case we have to fully log it since
5377 * logged_trans is a transient field, not persisted.
5378 */
5379 if (inode_logged(trans, inode, NULL) == 1 &&
5380 !test_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags))
5381 return false;
5382
5383 return true;
5384 }
5385
5386 struct btrfs_dir_list {
5387 u64 ino;
5388 struct list_head list;
5389 };
5390
5391 /*
5392 * Log the inodes of the new dentries of a directory.
5393 * See process_dir_items_leaf() for details about why it is needed.
5394 * This is a recursive operation - if an existing dentry corresponds to a
5395 * directory, that directory's new entries are logged too (same behaviour as
5396 * ext3/4, xfs, f2fs, reiserfs, nilfs2). Note that when logging the inodes
5397 * the dentries point to we do not acquire their VFS lock, otherwise lockdep
5398 * complains about the following circular lock dependency / possible deadlock:
5399 *
5400 * CPU0                                          CPU1
5401 * ----                                          ----
5402 * lock(&type->i_mutex_dir_key#3/2);
5403 *                                               lock(sb_internal#2);
5404 *                                               lock(&type->i_mutex_dir_key#3/2);
5405 * lock(&sb->s_type->i_mutex_key#14);
5406 *
5407 * Where sb_internal is the lock (a counter that works as a lock) acquired by
5408 * sb_start_intwrite() in btrfs_start_transaction().
5409 * Not acquiring the VFS lock of the inodes is still safe because:
5410 *
5411 * 1) For regular files we log with a mode of LOG_INODE_EXISTS. It's possible
5412 * that while logging the inode new references (names) are added or removed
5413 * from the inode, leaving the logged inode item with a link count that does
5414 * not match the number of logged inode reference items. This is fine because
5415 * at log replay time we compute the real number of links and correct the
5416 * link count in the inode item (see replay_one_buffer() and
5417 * link_to_fixup_dir());
5418 *
5419 * 2) For directories we log with a mode of LOG_INODE_ALL.
It's possible that 5420 * while logging the inode's items new index items (key type 5421 * BTRFS_DIR_INDEX_KEY) are added to fs/subvol tree and the logged inode item 5422 * has a size that doesn't match the sum of the lengths of all the logged 5423 * names - this is ok, not a problem, because at log replay time we set the 5424 * directory's i_size to the correct value (see replay_one_name() and 5425 * overwrite_item()). 5426 */ 5427 static int log_new_dir_dentries(struct btrfs_trans_handle *trans, 5428 struct btrfs_inode *start_inode, 5429 struct btrfs_log_ctx *ctx) 5430 { 5431 struct btrfs_root *root = start_inode->root; 5432 struct btrfs_fs_info *fs_info = root->fs_info; 5433 struct btrfs_path *path; 5434 LIST_HEAD(dir_list); 5435 struct btrfs_dir_list *dir_elem; 5436 u64 ino = btrfs_ino(start_inode); 5437 struct btrfs_inode *curr_inode = start_inode; 5438 int ret = 0; 5439 5440 /* 5441 * If we are logging a new name, as part of a link or rename operation, 5442 * don't bother logging new dentries, as we just want to log the names 5443 * of an inode and that any new parents exist. 5444 */ 5445 if (ctx->logging_new_name) 5446 return 0; 5447 5448 path = btrfs_alloc_path(); 5449 if (!path) 5450 return -ENOMEM; 5451 5452 /* Pairs with btrfs_add_delayed_iput below. */ 5453 ihold(&curr_inode->vfs_inode); 5454 5455 while (true) { 5456 struct inode *vfs_inode; 5457 struct btrfs_key key; 5458 struct btrfs_key found_key; 5459 u64 next_index; 5460 bool continue_curr_inode = true; 5461 int iter_ret; 5462 5463 key.objectid = ino; 5464 key.type = BTRFS_DIR_INDEX_KEY; 5465 key.offset = btrfs_get_first_dir_index_to_log(curr_inode); 5466 next_index = key.offset; 5467 again: 5468 btrfs_for_each_slot(root->log_root, &key, &found_key, path, iter_ret) { 5469 struct extent_buffer *leaf = path->nodes[0]; 5470 struct btrfs_dir_item *di; 5471 struct btrfs_key di_key; 5472 struct inode *di_inode; 5473 int log_mode = LOG_INODE_EXISTS; 5474 int type; 5475 5476 if (found_key.objectid != ino || 5477 found_key.type != BTRFS_DIR_INDEX_KEY) { 5478 continue_curr_inode = false; 5479 break; 5480 } 5481 5482 next_index = found_key.offset + 1; 5483 5484 di = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dir_item); 5485 type = btrfs_dir_ftype(leaf, di); 5486 if (btrfs_dir_transid(leaf, di) < trans->transid) 5487 continue; 5488 btrfs_dir_item_key_to_cpu(leaf, di, &di_key); 5489 if (di_key.type == BTRFS_ROOT_ITEM_KEY) 5490 continue; 5491 5492 btrfs_release_path(path); 5493 di_inode = btrfs_iget(fs_info->sb, di_key.objectid, root); 5494 if (IS_ERR(di_inode)) { 5495 ret = PTR_ERR(di_inode); 5496 goto out; 5497 } 5498 5499 if (!need_log_inode(trans, BTRFS_I(di_inode))) { 5500 btrfs_add_delayed_iput(BTRFS_I(di_inode)); 5501 break; 5502 } 5503 5504 ctx->log_new_dentries = false; 5505 if (type == BTRFS_FT_DIR) 5506 log_mode = LOG_INODE_ALL; 5507 ret = btrfs_log_inode(trans, BTRFS_I(di_inode), 5508 log_mode, ctx); 5509 btrfs_add_delayed_iput(BTRFS_I(di_inode)); 5510 if (ret) 5511 goto out; 5512 if (ctx->log_new_dentries) { 5513 dir_elem = kmalloc(sizeof(*dir_elem), GFP_NOFS); 5514 if (!dir_elem) { 5515 ret = -ENOMEM; 5516 goto out; 5517 } 5518 dir_elem->ino = di_key.objectid; 5519 list_add_tail(&dir_elem->list, &dir_list); 5520 } 5521 break; 5522 } 5523 5524 btrfs_release_path(path); 5525 5526 if (iter_ret < 0) { 5527 ret = iter_ret; 5528 goto out; 5529 } else if (iter_ret > 0) { 5530 continue_curr_inode = false; 5531 } else { 5532 key = found_key; 5533 } 5534 5535 if (continue_curr_inode && key.offset < (u64)-1) { 5536 
key.offset++; 5537 goto again; 5538 } 5539 5540 btrfs_set_first_dir_index_to_log(curr_inode, next_index); 5541 5542 if (list_empty(&dir_list)) 5543 break; 5544 5545 dir_elem = list_first_entry(&dir_list, struct btrfs_dir_list, list); 5546 ino = dir_elem->ino; 5547 list_del(&dir_elem->list); 5548 kfree(dir_elem); 5549 5550 btrfs_add_delayed_iput(curr_inode); 5551 curr_inode = NULL; 5552 5553 vfs_inode = btrfs_iget(fs_info->sb, ino, root); 5554 if (IS_ERR(vfs_inode)) { 5555 ret = PTR_ERR(vfs_inode); 5556 break; 5557 } 5558 curr_inode = BTRFS_I(vfs_inode); 5559 } 5560 out: 5561 btrfs_free_path(path); 5562 if (curr_inode) 5563 btrfs_add_delayed_iput(curr_inode); 5564 5565 if (ret) { 5566 struct btrfs_dir_list *next; 5567 5568 list_for_each_entry_safe(dir_elem, next, &dir_list, list) 5569 kfree(dir_elem); 5570 } 5571 5572 return ret; 5573 } 5574 5575 struct btrfs_ino_list { 5576 u64 ino; 5577 u64 parent; 5578 struct list_head list; 5579 }; 5580 5581 static void free_conflicting_inodes(struct btrfs_log_ctx *ctx) 5582 { 5583 struct btrfs_ino_list *curr; 5584 struct btrfs_ino_list *next; 5585 5586 list_for_each_entry_safe(curr, next, &ctx->conflict_inodes, list) { 5587 list_del(&curr->list); 5588 kfree(curr); 5589 } 5590 } 5591 5592 static int conflicting_inode_is_dir(struct btrfs_root *root, u64 ino, 5593 struct btrfs_path *path) 5594 { 5595 struct btrfs_key key; 5596 int ret; 5597 5598 key.objectid = ino; 5599 key.type = BTRFS_INODE_ITEM_KEY; 5600 key.offset = 0; 5601 5602 path->search_commit_root = 1; 5603 path->skip_locking = 1; 5604 5605 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 5606 if (WARN_ON_ONCE(ret > 0)) { 5607 /* 5608 * We have previously found the inode through the commit root 5609 * so this should not happen. If it does, just error out and 5610 * fallback to a transaction commit. 5611 */ 5612 ret = -ENOENT; 5613 } else if (ret == 0) { 5614 struct btrfs_inode_item *item; 5615 5616 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 5617 struct btrfs_inode_item); 5618 if (S_ISDIR(btrfs_inode_mode(path->nodes[0], item))) 5619 ret = 1; 5620 } 5621 5622 btrfs_release_path(path); 5623 path->search_commit_root = 0; 5624 path->skip_locking = 0; 5625 5626 return ret; 5627 } 5628 5629 static int add_conflicting_inode(struct btrfs_trans_handle *trans, 5630 struct btrfs_root *root, 5631 struct btrfs_path *path, 5632 u64 ino, u64 parent, 5633 struct btrfs_log_ctx *ctx) 5634 { 5635 struct btrfs_ino_list *ino_elem; 5636 struct inode *inode; 5637 5638 /* 5639 * It's rare to have a lot of conflicting inodes, in practice it is not 5640 * common to have more than 1 or 2. We don't want to collect too many, 5641 * as we could end up logging too many inodes (even if only in 5642 * LOG_INODE_EXISTS mode) and slow down other fsyncs or transaction 5643 * commits. 5644 */ 5645 if (ctx->num_conflict_inodes >= MAX_CONFLICT_INODES) 5646 return BTRFS_LOG_FORCE_COMMIT; 5647 5648 inode = btrfs_iget(root->fs_info->sb, ino, root); 5649 /* 5650 * If the other inode that had a conflicting dir entry was deleted in 5651 * the current transaction then we either: 5652 * 5653 * 1) Log the parent directory (later after adding it to the list) if 5654 * the inode is a directory. This is because it may be a deleted 5655 * subvolume/snapshot or it may be a regular directory that had 5656 * deleted subvolumes/snapshots (or subdirectories that had them), 5657 * and at the moment we can't deal with dropping subvolumes/snapshots 5658 * during log replay. 
So we just log the parent, which will result in 5659 * a fallback to a transaction commit if we are dealing with those 5660 * cases (last_unlink_trans will match the current transaction); 5661 * 5662 * 2) Do nothing if it's not a directory. During log replay we simply 5663 * unlink the conflicting dentry from the parent directory and then 5664 * add the dentry for our inode. Like this we can avoid logging the 5665 * parent directory (and maybe fallback to a transaction commit in 5666 * case it has a last_unlink_trans == trans->transid, due to moving 5667 * some inode from it to some other directory). 5668 */ 5669 if (IS_ERR(inode)) { 5670 int ret = PTR_ERR(inode); 5671 5672 if (ret != -ENOENT) 5673 return ret; 5674 5675 ret = conflicting_inode_is_dir(root, ino, path); 5676 /* Not a directory or we got an error. */ 5677 if (ret <= 0) 5678 return ret; 5679 5680 /* Conflicting inode is a directory, so we'll log its parent. */ 5681 ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS); 5682 if (!ino_elem) 5683 return -ENOMEM; 5684 ino_elem->ino = ino; 5685 ino_elem->parent = parent; 5686 list_add_tail(&ino_elem->list, &ctx->conflict_inodes); 5687 ctx->num_conflict_inodes++; 5688 5689 return 0; 5690 } 5691 5692 /* 5693 * If the inode was already logged skip it - otherwise we can hit an 5694 * infinite loop. Example: 5695 * 5696 * From the commit root (previous transaction) we have the following 5697 * inodes: 5698 * 5699 * inode 257 a directory 5700 * inode 258 with references "zz" and "zz_link" on inode 257 5701 * inode 259 with reference "a" on inode 257 5702 * 5703 * And in the current (uncommitted) transaction we have: 5704 * 5705 * inode 257 a directory, unchanged 5706 * inode 258 with references "a" and "a2" on inode 257 5707 * inode 259 with reference "zz_link" on inode 257 5708 * inode 261 with reference "zz" on inode 257 5709 * 5710 * When logging inode 261 the following infinite loop could 5711 * happen if we don't skip already logged inodes: 5712 * 5713 * - we detect inode 258 as a conflicting inode, with inode 261 5714 * on reference "zz", and log it; 5715 * 5716 * - we detect inode 259 as a conflicting inode, with inode 258 5717 * on reference "a", and log it; 5718 * 5719 * - we detect inode 258 as a conflicting inode, with inode 259 5720 * on reference "zz_link", and log it - again! After this we 5721 * repeat the above steps forever. 5722 * 5723 * Here we can use need_log_inode() because we only need to log the 5724 * inode in LOG_INODE_EXISTS mode and rename operations update the log, 5725 * so that the log ends up with the new name and without the old name. 5726 */ 5727 if (!need_log_inode(trans, BTRFS_I(inode))) { 5728 btrfs_add_delayed_iput(BTRFS_I(inode)); 5729 return 0; 5730 } 5731 5732 btrfs_add_delayed_iput(BTRFS_I(inode)); 5733 5734 ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS); 5735 if (!ino_elem) 5736 return -ENOMEM; 5737 ino_elem->ino = ino; 5738 ino_elem->parent = parent; 5739 list_add_tail(&ino_elem->list, &ctx->conflict_inodes); 5740 ctx->num_conflict_inodes++; 5741 5742 return 0; 5743 } 5744 5745 static int log_conflicting_inodes(struct btrfs_trans_handle *trans, 5746 struct btrfs_root *root, 5747 struct btrfs_log_ctx *ctx) 5748 { 5749 struct btrfs_fs_info *fs_info = root->fs_info; 5750 int ret = 0; 5751 5752 /* 5753 * Conflicting inodes are logged by the first call to btrfs_log_inode(), 5754 * otherwise we could have unbounded recursion of btrfs_log_inode() 5755 * calls. This check guarantees we can have only 1 level of recursion. 
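 * In other words, the deepest call chain we can get is:
 *
 * btrfs_log_inode()
 *    log_conflicting_inodes()
 *       btrfs_log_inode()
 *          log_conflicting_inodes()
 *             (returns immediately, because ctx->logging_conflict_inodes
 *              is already true)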
5756 */ 5757 if (ctx->logging_conflict_inodes) 5758 return 0; 5759 5760 ctx->logging_conflict_inodes = true; 5761 5762 /* 5763 * New conflicting inodes may be found and added to the list while we 5764 * are logging a conflicting inode, so keep iterating while the list is 5765 * not empty. 5766 */ 5767 while (!list_empty(&ctx->conflict_inodes)) { 5768 struct btrfs_ino_list *curr; 5769 struct inode *inode; 5770 u64 ino; 5771 u64 parent; 5772 5773 curr = list_first_entry(&ctx->conflict_inodes, 5774 struct btrfs_ino_list, list); 5775 ino = curr->ino; 5776 parent = curr->parent; 5777 list_del(&curr->list); 5778 kfree(curr); 5779 5780 inode = btrfs_iget(fs_info->sb, ino, root); 5781 /* 5782 * If the other inode that had a conflicting dir entry was 5783 * deleted in the current transaction, we need to log its parent 5784 * directory. See the comment at add_conflicting_inode(). 5785 */ 5786 if (IS_ERR(inode)) { 5787 ret = PTR_ERR(inode); 5788 if (ret != -ENOENT) 5789 break; 5790 5791 inode = btrfs_iget(fs_info->sb, parent, root); 5792 if (IS_ERR(inode)) { 5793 ret = PTR_ERR(inode); 5794 break; 5795 } 5796 5797 /* 5798 * Always log the directory, we cannot make this 5799 * conditional on need_log_inode() because the directory 5800 * might have been logged in LOG_INODE_EXISTS mode or 5801 * the dir index of the conflicting inode is not in a 5802 * dir index key range logged for the directory. So we 5803 * must make sure the deletion is recorded. 5804 */ 5805 ret = btrfs_log_inode(trans, BTRFS_I(inode), 5806 LOG_INODE_ALL, ctx); 5807 btrfs_add_delayed_iput(BTRFS_I(inode)); 5808 if (ret) 5809 break; 5810 continue; 5811 } 5812 5813 /* 5814 * Here we can use need_log_inode() because we only need to log 5815 * the inode in LOG_INODE_EXISTS mode and rename operations 5816 * update the log, so that the log ends up with the new name and 5817 * without the old name. 5818 * 5819 * We did this check at add_conflicting_inode(), but here we do 5820 * it again because if some other task logged the inode after 5821 * that, we can avoid doing it again. 5822 */ 5823 if (!need_log_inode(trans, BTRFS_I(inode))) { 5824 btrfs_add_delayed_iput(BTRFS_I(inode)); 5825 continue; 5826 } 5827 5828 /* 5829 * We are safe logging the other inode without acquiring its 5830 * lock as long as we log with the LOG_INODE_EXISTS mode. We 5831 * are safe against concurrent renames of the other inode as 5832 * well because during a rename we pin the log and update the 5833 * log with the new name before we unpin it. 
5834 */ 5835 ret = btrfs_log_inode(trans, BTRFS_I(inode), LOG_INODE_EXISTS, ctx); 5836 btrfs_add_delayed_iput(BTRFS_I(inode)); 5837 if (ret) 5838 break; 5839 } 5840 5841 ctx->logging_conflict_inodes = false; 5842 if (ret) 5843 free_conflicting_inodes(ctx); 5844 5845 return ret; 5846 } 5847 5848 static int copy_inode_items_to_log(struct btrfs_trans_handle *trans, 5849 struct btrfs_inode *inode, 5850 struct btrfs_key *min_key, 5851 const struct btrfs_key *max_key, 5852 struct btrfs_path *path, 5853 struct btrfs_path *dst_path, 5854 const u64 logged_isize, 5855 const int inode_only, 5856 struct btrfs_log_ctx *ctx, 5857 bool *need_log_inode_item) 5858 { 5859 const u64 i_size = i_size_read(&inode->vfs_inode); 5860 struct btrfs_root *root = inode->root; 5861 int ins_start_slot = 0; 5862 int ins_nr = 0; 5863 int ret; 5864 5865 while (1) { 5866 ret = btrfs_search_forward(root, min_key, path, trans->transid); 5867 if (ret < 0) 5868 return ret; 5869 if (ret > 0) { 5870 ret = 0; 5871 break; 5872 } 5873 again: 5874 /* Note, ins_nr might be > 0 here, cleanup outside the loop */ 5875 if (min_key->objectid != max_key->objectid) 5876 break; 5877 if (min_key->type > max_key->type) 5878 break; 5879 5880 if (min_key->type == BTRFS_INODE_ITEM_KEY) { 5881 *need_log_inode_item = false; 5882 } else if (min_key->type == BTRFS_EXTENT_DATA_KEY && 5883 min_key->offset >= i_size) { 5884 /* 5885 * Extents at and beyond eof are logged with 5886 * btrfs_log_prealloc_extents(). 5887 * Only regular files have BTRFS_EXTENT_DATA_KEY keys, 5888 * and no keys greater than that, so bail out. 5889 */ 5890 break; 5891 } else if ((min_key->type == BTRFS_INODE_REF_KEY || 5892 min_key->type == BTRFS_INODE_EXTREF_KEY) && 5893 (inode->generation == trans->transid || 5894 ctx->logging_conflict_inodes)) { 5895 u64 other_ino = 0; 5896 u64 other_parent = 0; 5897 5898 ret = btrfs_check_ref_name_override(path->nodes[0], 5899 path->slots[0], min_key, inode, 5900 &other_ino, &other_parent); 5901 if (ret < 0) { 5902 return ret; 5903 } else if (ret > 0 && 5904 other_ino != btrfs_ino(BTRFS_I(ctx->inode))) { 5905 if (ins_nr > 0) { 5906 ins_nr++; 5907 } else { 5908 ins_nr = 1; 5909 ins_start_slot = path->slots[0]; 5910 } 5911 ret = copy_items(trans, inode, dst_path, path, 5912 ins_start_slot, ins_nr, 5913 inode_only, logged_isize, ctx); 5914 if (ret < 0) 5915 return ret; 5916 ins_nr = 0; 5917 5918 btrfs_release_path(path); 5919 ret = add_conflicting_inode(trans, root, path, 5920 other_ino, 5921 other_parent, ctx); 5922 if (ret) 5923 return ret; 5924 goto next_key; 5925 } 5926 } else if (min_key->type == BTRFS_XATTR_ITEM_KEY) { 5927 /* Skip xattrs, logged later with btrfs_log_all_xattrs() */ 5928 if (ins_nr == 0) 5929 goto next_slot; 5930 ret = copy_items(trans, inode, dst_path, path, 5931 ins_start_slot, 5932 ins_nr, inode_only, logged_isize, ctx); 5933 if (ret < 0) 5934 return ret; 5935 ins_nr = 0; 5936 goto next_slot; 5937 } 5938 5939 if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) { 5940 ins_nr++; 5941 goto next_slot; 5942 } else if (!ins_nr) { 5943 ins_start_slot = path->slots[0]; 5944 ins_nr = 1; 5945 goto next_slot; 5946 } 5947 5948 ret = copy_items(trans, inode, dst_path, path, ins_start_slot, 5949 ins_nr, inode_only, logged_isize, ctx); 5950 if (ret < 0) 5951 return ret; 5952 ins_nr = 1; 5953 ins_start_slot = path->slots[0]; 5954 next_slot: 5955 path->slots[0]++; 5956 if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) { 5957 btrfs_item_key_to_cpu(path->nodes[0], min_key, 5958 path->slots[0]); 5959 goto again; 5960 } 5961 
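		/*
		 * We ran past the last slot of the current leaf, so copy any
		 * batched items before releasing the path and moving on to the
		 * next key range.
		 */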
if (ins_nr) { 5962 ret = copy_items(trans, inode, dst_path, path, 5963 ins_start_slot, ins_nr, inode_only, 5964 logged_isize, ctx); 5965 if (ret < 0) 5966 return ret; 5967 ins_nr = 0; 5968 } 5969 btrfs_release_path(path); 5970 next_key: 5971 if (min_key->offset < (u64)-1) { 5972 min_key->offset++; 5973 } else if (min_key->type < max_key->type) { 5974 min_key->type++; 5975 min_key->offset = 0; 5976 } else { 5977 break; 5978 } 5979 5980 /* 5981 * We may process many leaves full of items for our inode, so 5982 * avoid monopolizing a cpu for too long by rescheduling while 5983 * not holding locks on any tree. 5984 */ 5985 cond_resched(); 5986 } 5987 if (ins_nr) { 5988 ret = copy_items(trans, inode, dst_path, path, ins_start_slot, 5989 ins_nr, inode_only, logged_isize, ctx); 5990 if (ret) 5991 return ret; 5992 } 5993 5994 if (inode_only == LOG_INODE_ALL && S_ISREG(inode->vfs_inode.i_mode)) { 5995 /* 5996 * Release the path because otherwise we might attempt to double 5997 * lock the same leaf with btrfs_log_prealloc_extents() below. 5998 */ 5999 btrfs_release_path(path); 6000 ret = btrfs_log_prealloc_extents(trans, inode, dst_path, ctx); 6001 } 6002 6003 return ret; 6004 } 6005 6006 static int insert_delayed_items_batch(struct btrfs_trans_handle *trans, 6007 struct btrfs_root *log, 6008 struct btrfs_path *path, 6009 const struct btrfs_item_batch *batch, 6010 const struct btrfs_delayed_item *first_item) 6011 { 6012 const struct btrfs_delayed_item *curr = first_item; 6013 int ret; 6014 6015 ret = btrfs_insert_empty_items(trans, log, path, batch); 6016 if (ret) 6017 return ret; 6018 6019 for (int i = 0; i < batch->nr; i++) { 6020 char *data_ptr; 6021 6022 data_ptr = btrfs_item_ptr(path->nodes[0], path->slots[0], char); 6023 write_extent_buffer(path->nodes[0], &curr->data, 6024 (unsigned long)data_ptr, curr->data_len); 6025 curr = list_next_entry(curr, log_list); 6026 path->slots[0]++; 6027 } 6028 6029 btrfs_release_path(path); 6030 6031 return 0; 6032 } 6033 6034 static int log_delayed_insertion_items(struct btrfs_trans_handle *trans, 6035 struct btrfs_inode *inode, 6036 struct btrfs_path *path, 6037 const struct list_head *delayed_ins_list, 6038 struct btrfs_log_ctx *ctx) 6039 { 6040 /* 195 (4095 bytes of keys and sizes) fits in a single 4K page. */ 6041 const int max_batch_size = 195; 6042 const int leaf_data_size = BTRFS_LEAF_DATA_SIZE(trans->fs_info); 6043 const u64 ino = btrfs_ino(inode); 6044 struct btrfs_root *log = inode->root->log_root; 6045 struct btrfs_item_batch batch = { 6046 .nr = 0, 6047 .total_data_size = 0, 6048 }; 6049 const struct btrfs_delayed_item *first = NULL; 6050 const struct btrfs_delayed_item *curr; 6051 char *ins_data; 6052 struct btrfs_key *ins_keys; 6053 u32 *ins_sizes; 6054 u64 curr_batch_size = 0; 6055 int batch_idx = 0; 6056 int ret; 6057 6058 /* We are adding dir index items to the log tree. */ 6059 lockdep_assert_held(&inode->log_mutex); 6060 6061 /* 6062 * We collect delayed items before copying index keys from the subvolume 6063 * to the log tree. However just after we collected them, they may have 6064 * been flushed (all of them or just some of them), and therefore we 6065 * could have copied them from the subvolume tree to the log tree. 6066 * So find the first delayed item that was not yet logged (they are 6067 * sorted by index number). 
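 * For example, if the inode's last_dir_index_offset is 10 and the list has
 * delayed items with index numbers 8, 9, 11 and 12, we skip the first two and
 * start from the item for index 11.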
6068 */ 6069 list_for_each_entry(curr, delayed_ins_list, log_list) { 6070 if (curr->index > inode->last_dir_index_offset) { 6071 first = curr; 6072 break; 6073 } 6074 } 6075 6076 /* Empty list or all delayed items were already logged. */ 6077 if (!first) 6078 return 0; 6079 6080 ins_data = kmalloc(max_batch_size * sizeof(u32) + 6081 max_batch_size * sizeof(struct btrfs_key), GFP_NOFS); 6082 if (!ins_data) 6083 return -ENOMEM; 6084 ins_sizes = (u32 *)ins_data; 6085 batch.data_sizes = ins_sizes; 6086 ins_keys = (struct btrfs_key *)(ins_data + max_batch_size * sizeof(u32)); 6087 batch.keys = ins_keys; 6088 6089 curr = first; 6090 while (!list_entry_is_head(curr, delayed_ins_list, log_list)) { 6091 const u32 curr_size = curr->data_len + sizeof(struct btrfs_item); 6092 6093 if (curr_batch_size + curr_size > leaf_data_size || 6094 batch.nr == max_batch_size) { 6095 ret = insert_delayed_items_batch(trans, log, path, 6096 &batch, first); 6097 if (ret) 6098 goto out; 6099 batch_idx = 0; 6100 batch.nr = 0; 6101 batch.total_data_size = 0; 6102 curr_batch_size = 0; 6103 first = curr; 6104 } 6105 6106 ins_sizes[batch_idx] = curr->data_len; 6107 ins_keys[batch_idx].objectid = ino; 6108 ins_keys[batch_idx].type = BTRFS_DIR_INDEX_KEY; 6109 ins_keys[batch_idx].offset = curr->index; 6110 curr_batch_size += curr_size; 6111 batch.total_data_size += curr->data_len; 6112 batch.nr++; 6113 batch_idx++; 6114 curr = list_next_entry(curr, log_list); 6115 } 6116 6117 ASSERT(batch.nr >= 1); 6118 ret = insert_delayed_items_batch(trans, log, path, &batch, first); 6119 6120 curr = list_last_entry(delayed_ins_list, struct btrfs_delayed_item, 6121 log_list); 6122 inode->last_dir_index_offset = curr->index; 6123 out: 6124 kfree(ins_data); 6125 6126 return ret; 6127 } 6128 6129 static int log_delayed_deletions_full(struct btrfs_trans_handle *trans, 6130 struct btrfs_inode *inode, 6131 struct btrfs_path *path, 6132 const struct list_head *delayed_del_list, 6133 struct btrfs_log_ctx *ctx) 6134 { 6135 const u64 ino = btrfs_ino(inode); 6136 const struct btrfs_delayed_item *curr; 6137 6138 curr = list_first_entry(delayed_del_list, struct btrfs_delayed_item, 6139 log_list); 6140 6141 while (!list_entry_is_head(curr, delayed_del_list, log_list)) { 6142 u64 first_dir_index = curr->index; 6143 u64 last_dir_index; 6144 const struct btrfs_delayed_item *next; 6145 int ret; 6146 6147 /* 6148 * Find a range of consecutive dir index items to delete. Like 6149 * this we log a single dir range item spanning several contiguous 6150 * dir items instead of logging one range item per dir index item. 
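 * For example, for delayed deletion items with index numbers 5, 6, 7, 10 and
 * 11, we insert only two dir range items in the log: one covering indexes 5 to
 * 7 and another one covering indexes 10 to 11.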
6151 */ 6152 next = list_next_entry(curr, log_list); 6153 while (!list_entry_is_head(next, delayed_del_list, log_list)) { 6154 if (next->index != curr->index + 1) 6155 break; 6156 curr = next; 6157 next = list_next_entry(next, log_list); 6158 } 6159 6160 last_dir_index = curr->index; 6161 ASSERT(last_dir_index >= first_dir_index); 6162 6163 ret = insert_dir_log_key(trans, inode->root->log_root, path, 6164 ino, first_dir_index, last_dir_index); 6165 if (ret) 6166 return ret; 6167 curr = list_next_entry(curr, log_list); 6168 } 6169 6170 return 0; 6171 } 6172 6173 static int batch_delete_dir_index_items(struct btrfs_trans_handle *trans, 6174 struct btrfs_inode *inode, 6175 struct btrfs_path *path, 6176 struct btrfs_log_ctx *ctx, 6177 const struct list_head *delayed_del_list, 6178 const struct btrfs_delayed_item *first, 6179 const struct btrfs_delayed_item **last_ret) 6180 { 6181 const struct btrfs_delayed_item *next; 6182 struct extent_buffer *leaf = path->nodes[0]; 6183 const int last_slot = btrfs_header_nritems(leaf) - 1; 6184 int slot = path->slots[0] + 1; 6185 const u64 ino = btrfs_ino(inode); 6186 6187 next = list_next_entry(first, log_list); 6188 6189 while (slot < last_slot && 6190 !list_entry_is_head(next, delayed_del_list, log_list)) { 6191 struct btrfs_key key; 6192 6193 btrfs_item_key_to_cpu(leaf, &key, slot); 6194 if (key.objectid != ino || 6195 key.type != BTRFS_DIR_INDEX_KEY || 6196 key.offset != next->index) 6197 break; 6198 6199 slot++; 6200 *last_ret = next; 6201 next = list_next_entry(next, log_list); 6202 } 6203 6204 return btrfs_del_items(trans, inode->root->log_root, path, 6205 path->slots[0], slot - path->slots[0]); 6206 } 6207 6208 static int log_delayed_deletions_incremental(struct btrfs_trans_handle *trans, 6209 struct btrfs_inode *inode, 6210 struct btrfs_path *path, 6211 const struct list_head *delayed_del_list, 6212 struct btrfs_log_ctx *ctx) 6213 { 6214 struct btrfs_root *log = inode->root->log_root; 6215 const struct btrfs_delayed_item *curr; 6216 u64 last_range_start = 0; 6217 u64 last_range_end = 0; 6218 struct btrfs_key key; 6219 6220 key.objectid = btrfs_ino(inode); 6221 key.type = BTRFS_DIR_INDEX_KEY; 6222 curr = list_first_entry(delayed_del_list, struct btrfs_delayed_item, 6223 log_list); 6224 6225 while (!list_entry_is_head(curr, delayed_del_list, log_list)) { 6226 const struct btrfs_delayed_item *last = curr; 6227 u64 first_dir_index = curr->index; 6228 u64 last_dir_index; 6229 bool deleted_items = false; 6230 int ret; 6231 6232 key.offset = curr->index; 6233 ret = btrfs_search_slot(trans, log, &key, path, -1, 1); 6234 if (ret < 0) { 6235 return ret; 6236 } else if (ret == 0) { 6237 ret = batch_delete_dir_index_items(trans, inode, path, ctx, 6238 delayed_del_list, curr, 6239 &last); 6240 if (ret) 6241 return ret; 6242 deleted_items = true; 6243 } 6244 6245 btrfs_release_path(path); 6246 6247 /* 6248 * If we deleted items from the leaf, it means we have a range 6249 * item logging their range, so no need to add one or update an 6250 * existing one. Otherwise we have to log a dir range item. 6251 */ 6252 if (deleted_items) 6253 goto next_batch; 6254 6255 last_dir_index = last->index; 6256 ASSERT(last_dir_index >= first_dir_index); 6257 /* 6258 * If this range starts right after where the previous one ends, 6259 * then we want to reuse the previous range item and change its 6260 * end offset to the end of this range. This is just to minimize 6261 * leaf space usage, by avoiding adding a new range item. 
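 * For example, if the previous iteration logged a range for indexes 4 to 7 and
 * the current batch covers indexes 8 to 10, we update that range item to cover
 * indexes 4 to 10 instead of inserting a second range item for 8 to 10.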
6262 */ 6263 if (last_range_end != 0 && first_dir_index == last_range_end + 1) 6264 first_dir_index = last_range_start; 6265 6266 ret = insert_dir_log_key(trans, log, path, key.objectid, 6267 first_dir_index, last_dir_index); 6268 if (ret) 6269 return ret; 6270 6271 last_range_start = first_dir_index; 6272 last_range_end = last_dir_index; 6273 next_batch: 6274 curr = list_next_entry(last, log_list); 6275 } 6276 6277 return 0; 6278 } 6279 6280 static int log_delayed_deletion_items(struct btrfs_trans_handle *trans, 6281 struct btrfs_inode *inode, 6282 struct btrfs_path *path, 6283 const struct list_head *delayed_del_list, 6284 struct btrfs_log_ctx *ctx) 6285 { 6286 /* 6287 * We are deleting dir index items from the log tree or adding range 6288 * items to it. 6289 */ 6290 lockdep_assert_held(&inode->log_mutex); 6291 6292 if (list_empty(delayed_del_list)) 6293 return 0; 6294 6295 if (ctx->logged_before) 6296 return log_delayed_deletions_incremental(trans, inode, path, 6297 delayed_del_list, ctx); 6298 6299 return log_delayed_deletions_full(trans, inode, path, delayed_del_list, 6300 ctx); 6301 } 6302 6303 /* 6304 * Similar logic as for log_new_dir_dentries(), but it iterates over the delayed 6305 * items instead of the subvolume tree. 6306 */ 6307 static int log_new_delayed_dentries(struct btrfs_trans_handle *trans, 6308 struct btrfs_inode *inode, 6309 const struct list_head *delayed_ins_list, 6310 struct btrfs_log_ctx *ctx) 6311 { 6312 const bool orig_log_new_dentries = ctx->log_new_dentries; 6313 struct btrfs_fs_info *fs_info = trans->fs_info; 6314 struct btrfs_delayed_item *item; 6315 int ret = 0; 6316 6317 /* 6318 * No need for the log mutex, plus to avoid potential deadlocks or 6319 * lockdep annotations due to nesting of delayed inode mutexes and log 6320 * mutexes. 6321 */ 6322 lockdep_assert_not_held(&inode->log_mutex); 6323 6324 ASSERT(!ctx->logging_new_delayed_dentries); 6325 ctx->logging_new_delayed_dentries = true; 6326 6327 list_for_each_entry(item, delayed_ins_list, log_list) { 6328 struct btrfs_dir_item *dir_item; 6329 struct inode *di_inode; 6330 struct btrfs_key key; 6331 int log_mode = LOG_INODE_EXISTS; 6332 6333 dir_item = (struct btrfs_dir_item *)item->data; 6334 btrfs_disk_key_to_cpu(&key, &dir_item->location); 6335 6336 if (key.type == BTRFS_ROOT_ITEM_KEY) 6337 continue; 6338 6339 di_inode = btrfs_iget(fs_info->sb, key.objectid, inode->root); 6340 if (IS_ERR(di_inode)) { 6341 ret = PTR_ERR(di_inode); 6342 break; 6343 } 6344 6345 if (!need_log_inode(trans, BTRFS_I(di_inode))) { 6346 btrfs_add_delayed_iput(BTRFS_I(di_inode)); 6347 continue; 6348 } 6349 6350 if (btrfs_stack_dir_ftype(dir_item) == BTRFS_FT_DIR) 6351 log_mode = LOG_INODE_ALL; 6352 6353 ctx->log_new_dentries = false; 6354 ret = btrfs_log_inode(trans, BTRFS_I(di_inode), log_mode, ctx); 6355 6356 if (!ret && ctx->log_new_dentries) 6357 ret = log_new_dir_dentries(trans, BTRFS_I(di_inode), ctx); 6358 6359 btrfs_add_delayed_iput(BTRFS_I(di_inode)); 6360 6361 if (ret) 6362 break; 6363 } 6364 6365 ctx->log_new_dentries = orig_log_new_dentries; 6366 ctx->logging_new_delayed_dentries = false; 6367 6368 return ret; 6369 } 6370 6371 /* log a single inode in the tree log. 6372 * At least one parent directory for this inode must exist in the tree 6373 * or be logged already. 6374 * 6375 * Any items from this inode changed by the current transaction are copied 6376 * to the log tree. 
An extra reference is taken on any extents in this 6377 * file, allowing us to avoid a whole pile of corner cases around logging 6378 * blocks that have been removed from the tree. 6379 * 6380 * See LOG_INODE_ALL and related defines for a description of what inode_only 6381 * does. 6382 * 6383 * This handles both files and directories. 6384 */ 6385 static int btrfs_log_inode(struct btrfs_trans_handle *trans, 6386 struct btrfs_inode *inode, 6387 int inode_only, 6388 struct btrfs_log_ctx *ctx) 6389 { 6390 struct btrfs_path *path; 6391 struct btrfs_path *dst_path; 6392 struct btrfs_key min_key; 6393 struct btrfs_key max_key; 6394 struct btrfs_root *log = inode->root->log_root; 6395 int ret; 6396 bool fast_search = false; 6397 u64 ino = btrfs_ino(inode); 6398 struct extent_map_tree *em_tree = &inode->extent_tree; 6399 u64 logged_isize = 0; 6400 bool need_log_inode_item = true; 6401 bool xattrs_logged = false; 6402 bool inode_item_dropped = true; 6403 bool full_dir_logging = false; 6404 LIST_HEAD(delayed_ins_list); 6405 LIST_HEAD(delayed_del_list); 6406 6407 path = btrfs_alloc_path(); 6408 if (!path) 6409 return -ENOMEM; 6410 dst_path = btrfs_alloc_path(); 6411 if (!dst_path) { 6412 btrfs_free_path(path); 6413 return -ENOMEM; 6414 } 6415 6416 min_key.objectid = ino; 6417 min_key.type = BTRFS_INODE_ITEM_KEY; 6418 min_key.offset = 0; 6419 6420 max_key.objectid = ino; 6421 6422 6423 /* today the code can only do partial logging of directories */ 6424 if (S_ISDIR(inode->vfs_inode.i_mode) || 6425 (!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 6426 &inode->runtime_flags) && 6427 inode_only >= LOG_INODE_EXISTS)) 6428 max_key.type = BTRFS_XATTR_ITEM_KEY; 6429 else 6430 max_key.type = (u8)-1; 6431 max_key.offset = (u64)-1; 6432 6433 if (S_ISDIR(inode->vfs_inode.i_mode) && inode_only == LOG_INODE_ALL) 6434 full_dir_logging = true; 6435 6436 /* 6437 * If we are logging a directory while we are logging dentries of the 6438 * delayed items of some other inode, then we need to flush the delayed 6439 * items of this directory and not log the delayed items directly. This 6440 * is to prevent more than one level of recursion into btrfs_log_inode() 6441 * by having something like this: 6442 * 6443 * $ mkdir -p a/b/c/d/e/f/g/h/... 6444 * $ xfs_io -c "fsync" a 6445 * 6446 * Where all directories in the path did not exist before and are 6447 * created in the current transaction. 6448 * So in such a case we directly log the delayed items of the main 6449 * directory ("a") without flushing them first, while for each of its 6450 * subdirectories we flush their delayed items before logging them. 6451 * This prevents a potential unbounded recursion like this: 6452 * 6453 * btrfs_log_inode() 6454 * log_new_delayed_dentries() 6455 * btrfs_log_inode() 6456 * log_new_delayed_dentries() 6457 * btrfs_log_inode() 6458 * log_new_delayed_dentries() 6459 * (...) 6460 * 6461 * We have thresholds for the maximum number of delayed items to have in 6462 * memory, and once they are hit, the items are flushed asynchronously. 6463 * However the limit is quite high, so lets prevent deep levels of 6464 * recursion to happen by limiting the maximum depth to be 1. 
 */
	if (full_dir_logging && ctx->logging_new_delayed_dentries) {
		ret = btrfs_commit_inode_delayed_items(trans, inode);
		if (ret)
			goto out;
	}

	mutex_lock(&inode->log_mutex);

	/*
	 * For symlinks, we must always log their content, which is stored in an
	 * inline extent, otherwise we could end up with an empty symlink after
	 * log replay, which is invalid on linux (symlink(2) returns -ENOENT if
	 * one attempts to create an empty symlink).
	 * We don't need to worry about flushing delalloc, because we create the
	 * inline extent when the symlink is created (we never have delalloc for
	 * symlinks).
	 */
	if (S_ISLNK(inode->vfs_inode.i_mode))
		inode_only = LOG_INODE_ALL;

	/*
	 * Before logging the inode item, cache the value returned by
	 * inode_logged(), because later we need to know if the inode was
	 * previously logged in this transaction.
	 */
	ret = inode_logged(trans, inode, path);
	if (ret < 0)
		goto out_unlock;
	ctx->logged_before = (ret == 1);
	ret = 0;

	/*
	 * This is for cases where logging a directory could result in losing a
	 * file after replaying the log. For example, if we move a file from a
	 * directory A to a directory B, then fsync directory A, we have no way
	 * to know the file was moved from A to B, so logging just A would
	 * result in losing the file after a log replay.
	 */
	if (full_dir_logging && inode->last_unlink_trans >= trans->transid) {
		ret = BTRFS_LOG_FORCE_COMMIT;
		goto out_unlock;
	}

	/*
	 * A brute force approach to making sure we get the most up to date
	 * copies of everything.
	 */
	if (S_ISDIR(inode->vfs_inode.i_mode)) {
		clear_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags);
		if (ctx->logged_before)
			ret = drop_inode_items(trans, log, path, inode,
					       BTRFS_XATTR_ITEM_KEY);
	} else {
		if (inode_only == LOG_INODE_EXISTS && ctx->logged_before) {
			/*
			 * Make sure the new inode item we write to the log has
			 * the same isize as the current one (if it exists).
			 * This is necessary to prevent data loss after log
			 * replay, and also to prevent doing a wrong expanding
			 * truncate - e.g.: create file, write 4K into offset
			 * 0, fsync, write 4K into offset 4096, add hard link,
			 * fsync some other file (to sync log), power fail - if
			 * we use the inode's current i_size, after log replay
			 * we get an 8Kb file, with the last 4Kb extent as a
			 * hole (zeroes), as if an expanding truncate happened,
			 * instead of getting a file of 4Kb only.
6532 */ 6533 ret = logged_inode_size(log, inode, path, &logged_isize); 6534 if (ret) 6535 goto out_unlock; 6536 } 6537 if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 6538 &inode->runtime_flags)) { 6539 if (inode_only == LOG_INODE_EXISTS) { 6540 max_key.type = BTRFS_XATTR_ITEM_KEY; 6541 if (ctx->logged_before) 6542 ret = drop_inode_items(trans, log, path, 6543 inode, max_key.type); 6544 } else { 6545 clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 6546 &inode->runtime_flags); 6547 clear_bit(BTRFS_INODE_COPY_EVERYTHING, 6548 &inode->runtime_flags); 6549 if (ctx->logged_before) 6550 ret = truncate_inode_items(trans, log, 6551 inode, 0, 0); 6552 } 6553 } else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING, 6554 &inode->runtime_flags) || 6555 inode_only == LOG_INODE_EXISTS) { 6556 if (inode_only == LOG_INODE_ALL) 6557 fast_search = true; 6558 max_key.type = BTRFS_XATTR_ITEM_KEY; 6559 if (ctx->logged_before) 6560 ret = drop_inode_items(trans, log, path, inode, 6561 max_key.type); 6562 } else { 6563 if (inode_only == LOG_INODE_ALL) 6564 fast_search = true; 6565 inode_item_dropped = false; 6566 goto log_extents; 6567 } 6568 6569 } 6570 if (ret) 6571 goto out_unlock; 6572 6573 /* 6574 * If we are logging a directory in full mode, collect the delayed items 6575 * before iterating the subvolume tree, so that we don't miss any new 6576 * dir index items in case they get flushed while or right after we are 6577 * iterating the subvolume tree. 6578 */ 6579 if (full_dir_logging && !ctx->logging_new_delayed_dentries) 6580 btrfs_log_get_delayed_items(inode, &delayed_ins_list, 6581 &delayed_del_list); 6582 6583 ret = copy_inode_items_to_log(trans, inode, &min_key, &max_key, 6584 path, dst_path, logged_isize, 6585 inode_only, ctx, 6586 &need_log_inode_item); 6587 if (ret) 6588 goto out_unlock; 6589 6590 btrfs_release_path(path); 6591 btrfs_release_path(dst_path); 6592 ret = btrfs_log_all_xattrs(trans, inode, path, dst_path, ctx); 6593 if (ret) 6594 goto out_unlock; 6595 xattrs_logged = true; 6596 if (max_key.type >= BTRFS_EXTENT_DATA_KEY && !fast_search) { 6597 btrfs_release_path(path); 6598 btrfs_release_path(dst_path); 6599 ret = btrfs_log_holes(trans, inode, path); 6600 if (ret) 6601 goto out_unlock; 6602 } 6603 log_extents: 6604 btrfs_release_path(path); 6605 btrfs_release_path(dst_path); 6606 if (need_log_inode_item) { 6607 ret = log_inode_item(trans, log, dst_path, inode, inode_item_dropped); 6608 if (ret) 6609 goto out_unlock; 6610 /* 6611 * If we are doing a fast fsync and the inode was logged before 6612 * in this transaction, we don't need to log the xattrs because 6613 * they were logged before. If xattrs were added, changed or 6614 * deleted since the last time we logged the inode, then we have 6615 * already logged them because the inode had the runtime flag 6616 * BTRFS_INODE_COPY_EVERYTHING set. 
6617 */ 6618 if (!xattrs_logged && inode->logged_trans < trans->transid) { 6619 ret = btrfs_log_all_xattrs(trans, inode, path, dst_path, ctx); 6620 if (ret) 6621 goto out_unlock; 6622 btrfs_release_path(path); 6623 } 6624 } 6625 if (fast_search) { 6626 ret = btrfs_log_changed_extents(trans, inode, dst_path, ctx); 6627 if (ret) 6628 goto out_unlock; 6629 } else if (inode_only == LOG_INODE_ALL) { 6630 struct extent_map *em, *n; 6631 6632 write_lock(&em_tree->lock); 6633 list_for_each_entry_safe(em, n, &em_tree->modified_extents, list) 6634 list_del_init(&em->list); 6635 write_unlock(&em_tree->lock); 6636 } 6637 6638 if (full_dir_logging) { 6639 ret = log_directory_changes(trans, inode, path, dst_path, ctx); 6640 if (ret) 6641 goto out_unlock; 6642 ret = log_delayed_insertion_items(trans, inode, path, 6643 &delayed_ins_list, ctx); 6644 if (ret) 6645 goto out_unlock; 6646 ret = log_delayed_deletion_items(trans, inode, path, 6647 &delayed_del_list, ctx); 6648 if (ret) 6649 goto out_unlock; 6650 } 6651 6652 spin_lock(&inode->lock); 6653 inode->logged_trans = trans->transid; 6654 /* 6655 * Don't update last_log_commit if we logged that an inode exists. 6656 * We do this for three reasons: 6657 * 6658 * 1) We might have had buffered writes to this inode that were 6659 * flushed and had their ordered extents completed in this 6660 * transaction, but we did not previously log the inode with 6661 * LOG_INODE_ALL. Later the inode was evicted and after that 6662 * it was loaded again and this LOG_INODE_EXISTS log operation 6663 * happened. We must make sure that if an explicit fsync against 6664 * the inode is performed later, it logs the new extents, an 6665 * updated inode item, etc, and syncs the log. The same logic 6666 * applies to direct IO writes instead of buffered writes. 6667 * 6668 * 2) When we log the inode with LOG_INODE_EXISTS, its inode item 6669 * is logged with an i_size of 0 or whatever value was logged 6670 * before. If later the i_size of the inode is increased by a 6671 * truncate operation, the log is synced through an fsync of 6672 * some other inode and then finally an explicit fsync against 6673 * this inode is made, we must make sure this fsync logs the 6674 * inode with the new i_size, the hole between old i_size and 6675 * the new i_size, and syncs the log. 6676 * 6677 * 3) If we are logging that an ancestor inode exists as part of 6678 * logging a new name from a link or rename operation, don't update 6679 * its last_log_commit - otherwise if an explicit fsync is made 6680 * against an ancestor, the fsync considers the inode in the log 6681 * and doesn't sync the log, resulting in the ancestor missing after 6682 * a power failure unless the log was synced as part of an fsync 6683 * against any other unrelated inode. 6684 */ 6685 if (inode_only != LOG_INODE_EXISTS) 6686 inode->last_log_commit = inode->last_sub_trans; 6687 spin_unlock(&inode->lock); 6688 6689 /* 6690 * Reset the last_reflink_trans so that the next fsync does not need to 6691 * go through the slower path when logging extents and their checksums. 
6692 */ 6693 if (inode_only == LOG_INODE_ALL) 6694 inode->last_reflink_trans = 0; 6695 6696 out_unlock: 6697 mutex_unlock(&inode->log_mutex); 6698 out: 6699 btrfs_free_path(path); 6700 btrfs_free_path(dst_path); 6701 6702 if (ret) 6703 free_conflicting_inodes(ctx); 6704 else 6705 ret = log_conflicting_inodes(trans, inode->root, ctx); 6706 6707 if (full_dir_logging && !ctx->logging_new_delayed_dentries) { 6708 if (!ret) 6709 ret = log_new_delayed_dentries(trans, inode, 6710 &delayed_ins_list, ctx); 6711 6712 btrfs_log_put_delayed_items(inode, &delayed_ins_list, 6713 &delayed_del_list); 6714 } 6715 6716 return ret; 6717 } 6718 6719 static int btrfs_log_all_parents(struct btrfs_trans_handle *trans, 6720 struct btrfs_inode *inode, 6721 struct btrfs_log_ctx *ctx) 6722 { 6723 struct btrfs_fs_info *fs_info = trans->fs_info; 6724 int ret; 6725 struct btrfs_path *path; 6726 struct btrfs_key key; 6727 struct btrfs_root *root = inode->root; 6728 const u64 ino = btrfs_ino(inode); 6729 6730 path = btrfs_alloc_path(); 6731 if (!path) 6732 return -ENOMEM; 6733 path->skip_locking = 1; 6734 path->search_commit_root = 1; 6735 6736 key.objectid = ino; 6737 key.type = BTRFS_INODE_REF_KEY; 6738 key.offset = 0; 6739 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 6740 if (ret < 0) 6741 goto out; 6742 6743 while (true) { 6744 struct extent_buffer *leaf = path->nodes[0]; 6745 int slot = path->slots[0]; 6746 u32 cur_offset = 0; 6747 u32 item_size; 6748 unsigned long ptr; 6749 6750 if (slot >= btrfs_header_nritems(leaf)) { 6751 ret = btrfs_next_leaf(root, path); 6752 if (ret < 0) 6753 goto out; 6754 else if (ret > 0) 6755 break; 6756 continue; 6757 } 6758 6759 btrfs_item_key_to_cpu(leaf, &key, slot); 6760 /* BTRFS_INODE_EXTREF_KEY is BTRFS_INODE_REF_KEY + 1 */ 6761 if (key.objectid != ino || key.type > BTRFS_INODE_EXTREF_KEY) 6762 break; 6763 6764 item_size = btrfs_item_size(leaf, slot); 6765 ptr = btrfs_item_ptr_offset(leaf, slot); 6766 while (cur_offset < item_size) { 6767 struct btrfs_key inode_key; 6768 struct inode *dir_inode; 6769 6770 inode_key.type = BTRFS_INODE_ITEM_KEY; 6771 inode_key.offset = 0; 6772 6773 if (key.type == BTRFS_INODE_EXTREF_KEY) { 6774 struct btrfs_inode_extref *extref; 6775 6776 extref = (struct btrfs_inode_extref *) 6777 (ptr + cur_offset); 6778 inode_key.objectid = btrfs_inode_extref_parent( 6779 leaf, extref); 6780 cur_offset += sizeof(*extref); 6781 cur_offset += btrfs_inode_extref_name_len(leaf, 6782 extref); 6783 } else { 6784 inode_key.objectid = key.offset; 6785 cur_offset = item_size; 6786 } 6787 6788 dir_inode = btrfs_iget(fs_info->sb, inode_key.objectid, 6789 root); 6790 /* 6791 * If the parent inode was deleted, return an error to 6792 * fallback to a transaction commit. This is to prevent 6793 * getting an inode that was moved from one parent A to 6794 * a parent B, got its former parent A deleted and then 6795 * it got fsync'ed, from existing at both parents after 6796 * a log replay (and the old parent still existing). 6797 * Example: 6798 * 6799 * mkdir /mnt/A 6800 * mkdir /mnt/B 6801 * touch /mnt/B/bar 6802 * sync 6803 * mv /mnt/B/bar /mnt/A/bar 6804 * mv -T /mnt/A /mnt/B 6805 * fsync /mnt/B/bar 6806 * <power fail> 6807 * 6808 * If we ignore the old parent B which got deleted, 6809 * after a log replay we would have file bar linked 6810 * at both parents and the old parent B would still 6811 * exist. 
6812 */ 6813 if (IS_ERR(dir_inode)) { 6814 ret = PTR_ERR(dir_inode); 6815 goto out; 6816 } 6817 6818 if (!need_log_inode(trans, BTRFS_I(dir_inode))) { 6819 btrfs_add_delayed_iput(BTRFS_I(dir_inode)); 6820 continue; 6821 } 6822 6823 ctx->log_new_dentries = false; 6824 ret = btrfs_log_inode(trans, BTRFS_I(dir_inode), 6825 LOG_INODE_ALL, ctx); 6826 if (!ret && ctx->log_new_dentries) 6827 ret = log_new_dir_dentries(trans, 6828 BTRFS_I(dir_inode), ctx); 6829 btrfs_add_delayed_iput(BTRFS_I(dir_inode)); 6830 if (ret) 6831 goto out; 6832 } 6833 path->slots[0]++; 6834 } 6835 ret = 0; 6836 out: 6837 btrfs_free_path(path); 6838 return ret; 6839 } 6840 6841 static int log_new_ancestors(struct btrfs_trans_handle *trans, 6842 struct btrfs_root *root, 6843 struct btrfs_path *path, 6844 struct btrfs_log_ctx *ctx) 6845 { 6846 struct btrfs_key found_key; 6847 6848 btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]); 6849 6850 while (true) { 6851 struct btrfs_fs_info *fs_info = root->fs_info; 6852 struct extent_buffer *leaf; 6853 int slot; 6854 struct btrfs_key search_key; 6855 struct inode *inode; 6856 u64 ino; 6857 int ret = 0; 6858 6859 btrfs_release_path(path); 6860 6861 ino = found_key.offset; 6862 6863 search_key.objectid = found_key.offset; 6864 search_key.type = BTRFS_INODE_ITEM_KEY; 6865 search_key.offset = 0; 6866 inode = btrfs_iget(fs_info->sb, ino, root); 6867 if (IS_ERR(inode)) 6868 return PTR_ERR(inode); 6869 6870 if (BTRFS_I(inode)->generation >= trans->transid && 6871 need_log_inode(trans, BTRFS_I(inode))) 6872 ret = btrfs_log_inode(trans, BTRFS_I(inode), 6873 LOG_INODE_EXISTS, ctx); 6874 btrfs_add_delayed_iput(BTRFS_I(inode)); 6875 if (ret) 6876 return ret; 6877 6878 if (search_key.objectid == BTRFS_FIRST_FREE_OBJECTID) 6879 break; 6880 6881 search_key.type = BTRFS_INODE_REF_KEY; 6882 ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); 6883 if (ret < 0) 6884 return ret; 6885 6886 leaf = path->nodes[0]; 6887 slot = path->slots[0]; 6888 if (slot >= btrfs_header_nritems(leaf)) { 6889 ret = btrfs_next_leaf(root, path); 6890 if (ret < 0) 6891 return ret; 6892 else if (ret > 0) 6893 return -ENOENT; 6894 leaf = path->nodes[0]; 6895 slot = path->slots[0]; 6896 } 6897 6898 btrfs_item_key_to_cpu(leaf, &found_key, slot); 6899 if (found_key.objectid != search_key.objectid || 6900 found_key.type != BTRFS_INODE_REF_KEY) 6901 return -ENOENT; 6902 } 6903 return 0; 6904 } 6905 6906 static int log_new_ancestors_fast(struct btrfs_trans_handle *trans, 6907 struct btrfs_inode *inode, 6908 struct dentry *parent, 6909 struct btrfs_log_ctx *ctx) 6910 { 6911 struct btrfs_root *root = inode->root; 6912 struct dentry *old_parent = NULL; 6913 struct super_block *sb = inode->vfs_inode.i_sb; 6914 int ret = 0; 6915 6916 while (true) { 6917 if (!parent || d_really_is_negative(parent) || 6918 sb != parent->d_sb) 6919 break; 6920 6921 inode = BTRFS_I(d_inode(parent)); 6922 if (root != inode->root) 6923 break; 6924 6925 if (inode->generation >= trans->transid && 6926 need_log_inode(trans, inode)) { 6927 ret = btrfs_log_inode(trans, inode, 6928 LOG_INODE_EXISTS, ctx); 6929 if (ret) 6930 break; 6931 } 6932 if (IS_ROOT(parent)) 6933 break; 6934 6935 parent = dget_parent(parent); 6936 dput(old_parent); 6937 old_parent = parent; 6938 } 6939 dput(old_parent); 6940 6941 return ret; 6942 } 6943 6944 static int log_all_new_ancestors(struct btrfs_trans_handle *trans, 6945 struct btrfs_inode *inode, 6946 struct dentry *parent, 6947 struct btrfs_log_ctx *ctx) 6948 { 6949 struct btrfs_root *root = inode->root; 6950 
const u64 ino = btrfs_ino(inode); 6951 struct btrfs_path *path; 6952 struct btrfs_key search_key; 6953 int ret; 6954 6955 /* 6956 * For a single hard link case, go through a fast path that does not 6957 * need to iterate the fs/subvolume tree. 6958 */ 6959 if (inode->vfs_inode.i_nlink < 2) 6960 return log_new_ancestors_fast(trans, inode, parent, ctx); 6961 6962 path = btrfs_alloc_path(); 6963 if (!path) 6964 return -ENOMEM; 6965 6966 search_key.objectid = ino; 6967 search_key.type = BTRFS_INODE_REF_KEY; 6968 search_key.offset = 0; 6969 again: 6970 ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); 6971 if (ret < 0) 6972 goto out; 6973 if (ret == 0) 6974 path->slots[0]++; 6975 6976 while (true) { 6977 struct extent_buffer *leaf = path->nodes[0]; 6978 int slot = path->slots[0]; 6979 struct btrfs_key found_key; 6980 6981 if (slot >= btrfs_header_nritems(leaf)) { 6982 ret = btrfs_next_leaf(root, path); 6983 if (ret < 0) 6984 goto out; 6985 else if (ret > 0) 6986 break; 6987 continue; 6988 } 6989 6990 btrfs_item_key_to_cpu(leaf, &found_key, slot); 6991 if (found_key.objectid != ino || 6992 found_key.type > BTRFS_INODE_EXTREF_KEY) 6993 break; 6994 6995 /* 6996 * Don't deal with extended references because they are rare 6997 * cases and too complex to deal with (we would need to keep 6998 * track of which subitem we are processing for each item in 6999 * this loop, etc). So just return some error to fallback to 7000 * a transaction commit. 7001 */ 7002 if (found_key.type == BTRFS_INODE_EXTREF_KEY) { 7003 ret = -EMLINK; 7004 goto out; 7005 } 7006 7007 /* 7008 * Logging ancestors needs to do more searches on the fs/subvol 7009 * tree, so it releases the path as needed to avoid deadlocks. 7010 * Keep track of the last inode ref key and resume from that key 7011 * after logging all new ancestors for the current hard link. 7012 */ 7013 memcpy(&search_key, &found_key, sizeof(search_key)); 7014 7015 ret = log_new_ancestors(trans, root, path, ctx); 7016 if (ret) 7017 goto out; 7018 btrfs_release_path(path); 7019 goto again; 7020 } 7021 ret = 0; 7022 out: 7023 btrfs_free_path(path); 7024 return ret; 7025 } 7026 7027 /* 7028 * helper function around btrfs_log_inode to make sure newly created 7029 * parent directories also end up in the log. A minimal inode and backref 7030 * only logging is done of any parent directories that are older than 7031 * the last committed transaction 7032 */ 7033 static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, 7034 struct btrfs_inode *inode, 7035 struct dentry *parent, 7036 int inode_only, 7037 struct btrfs_log_ctx *ctx) 7038 { 7039 struct btrfs_root *root = inode->root; 7040 struct btrfs_fs_info *fs_info = root->fs_info; 7041 int ret = 0; 7042 bool log_dentries = false; 7043 7044 if (btrfs_test_opt(fs_info, NOTREELOG)) { 7045 ret = BTRFS_LOG_FORCE_COMMIT; 7046 goto end_no_trans; 7047 } 7048 7049 if (btrfs_root_refs(&root->root_item) == 0) { 7050 ret = BTRFS_LOG_FORCE_COMMIT; 7051 goto end_no_trans; 7052 } 7053 7054 /* 7055 * Skip already logged inodes or inodes corresponding to tmpfiles 7056 * (since logging them is pointless, a link count of 0 means they 7057 * will never be accessible). 
7058 */ 7059 if ((btrfs_inode_in_log(inode, trans->transid) && 7060 list_empty(&ctx->ordered_extents)) || 7061 inode->vfs_inode.i_nlink == 0) { 7062 ret = BTRFS_NO_LOG_SYNC; 7063 goto end_no_trans; 7064 } 7065 7066 ret = start_log_trans(trans, root, ctx); 7067 if (ret) 7068 goto end_no_trans; 7069 7070 ret = btrfs_log_inode(trans, inode, inode_only, ctx); 7071 if (ret) 7072 goto end_trans; 7073 7074 /* 7075 * for regular files, if its inode is already on disk, we don't 7076 * have to worry about the parents at all. This is because 7077 * we can use the last_unlink_trans field to record renames 7078 * and other fun in this file. 7079 */ 7080 if (S_ISREG(inode->vfs_inode.i_mode) && 7081 inode->generation < trans->transid && 7082 inode->last_unlink_trans < trans->transid) { 7083 ret = 0; 7084 goto end_trans; 7085 } 7086 7087 if (S_ISDIR(inode->vfs_inode.i_mode) && ctx->log_new_dentries) 7088 log_dentries = true; 7089 7090 /* 7091 * On unlink we must make sure all our current and old parent directory 7092 * inodes are fully logged. This is to prevent leaving dangling 7093 * directory index entries in directories that were our parents but are 7094 * not anymore. Not doing this results in old parent directory being 7095 * impossible to delete after log replay (rmdir will always fail with 7096 * error -ENOTEMPTY). 7097 * 7098 * Example 1: 7099 * 7100 * mkdir testdir 7101 * touch testdir/foo 7102 * ln testdir/foo testdir/bar 7103 * sync 7104 * unlink testdir/bar 7105 * xfs_io -c fsync testdir/foo 7106 * <power failure> 7107 * mount fs, triggers log replay 7108 * 7109 * If we don't log the parent directory (testdir), after log replay the 7110 * directory still has an entry pointing to the file inode using the bar 7111 * name, but a matching BTRFS_INODE_[REF|EXTREF]_KEY does not exist and 7112 * the file inode has a link count of 1. 7113 * 7114 * Example 2: 7115 * 7116 * mkdir testdir 7117 * touch foo 7118 * ln foo testdir/foo2 7119 * ln foo testdir/foo3 7120 * sync 7121 * unlink testdir/foo3 7122 * xfs_io -c fsync foo 7123 * <power failure> 7124 * mount fs, triggers log replay 7125 * 7126 * Similar as the first example, after log replay the parent directory 7127 * testdir still has an entry pointing to the inode file with name foo3 7128 * but the file inode does not have a matching BTRFS_INODE_REF_KEY item 7129 * and has a link count of 2. 7130 */ 7131 if (inode->last_unlink_trans >= trans->transid) { 7132 ret = btrfs_log_all_parents(trans, inode, ctx); 7133 if (ret) 7134 goto end_trans; 7135 } 7136 7137 ret = log_all_new_ancestors(trans, inode, parent, ctx); 7138 if (ret) 7139 goto end_trans; 7140 7141 if (log_dentries) 7142 ret = log_new_dir_dentries(trans, inode, ctx); 7143 else 7144 ret = 0; 7145 end_trans: 7146 if (ret < 0) { 7147 btrfs_set_log_full_commit(trans); 7148 ret = BTRFS_LOG_FORCE_COMMIT; 7149 } 7150 7151 if (ret) 7152 btrfs_remove_log_ctx(root, ctx); 7153 btrfs_end_log_trans(root); 7154 end_no_trans: 7155 return ret; 7156 } 7157 7158 /* 7159 * it is not safe to log dentry if the chunk root has added new 7160 * chunks. This returns 0 if the dentry was logged, and 1 otherwise. 7161 * If this returns 1, you must commit the transaction to safely get your 7162 * data on disk. 
 */
int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
			  struct dentry *dentry,
			  struct btrfs_log_ctx *ctx)
{
	struct dentry *parent = dget_parent(dentry);
	int ret;

	ret = btrfs_log_inode_parent(trans, BTRFS_I(d_inode(dentry)), parent,
				     LOG_INODE_ALL, ctx);
	dput(parent);

	return ret;
}

/*
 * Should be called during mount to recover and replay any log trees from the
 * FS.
 */
int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
{
	int ret;
	struct btrfs_path *path;
	struct btrfs_trans_handle *trans;
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct btrfs_root *log;
	struct btrfs_fs_info *fs_info = log_root_tree->fs_info;
	struct walk_control wc = {
		.process_func = process_one_buffer,
		.stage = LOG_WALK_PIN_ONLY,
	};

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	set_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags);

	trans = btrfs_start_transaction(fs_info->tree_root, 0);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		goto error;
	}

	wc.trans = trans;
	wc.pin = 1;

	ret = walk_log_tree(trans, log_root_tree, &wc);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto error;
	}

again:
	key.objectid = BTRFS_TREE_LOG_OBJECTID;
	key.offset = (u64)-1;
	key.type = BTRFS_ROOT_ITEM_KEY;

	while (1) {
		ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0);

		if (ret < 0) {
			btrfs_abort_transaction(trans, ret);
			goto error;
		}
		if (ret > 0) {
			if (path->slots[0] == 0)
				break;
			path->slots[0]--;
		}
		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
				      path->slots[0]);
		btrfs_release_path(path);
		if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID)
			break;

		log = btrfs_read_tree_root(log_root_tree, &found_key);
		if (IS_ERR(log)) {
			ret = PTR_ERR(log);
			btrfs_abort_transaction(trans, ret);
			goto error;
		}

		wc.replay_dest = btrfs_get_fs_root(fs_info, found_key.offset,
						   true);
		if (IS_ERR(wc.replay_dest)) {
			ret = PTR_ERR(wc.replay_dest);

			/*
			 * We didn't find the subvol, likely because it was
			 * deleted. This is ok, simply skip this log and go to
			 * the next one.
			 *
			 * We need to exclude the root because we can't have
			 * other log replays overwriting this log as we'll read
			 * it back in a few more times. This will keep our
			 * block from being modified, and we'll just bail for
			 * each subsequent pass.
7262 */ 7263 if (ret == -ENOENT) 7264 ret = btrfs_pin_extent_for_log_replay(trans, log->node); 7265 btrfs_put_root(log); 7266 7267 if (!ret) 7268 goto next; 7269 btrfs_abort_transaction(trans, ret); 7270 goto error; 7271 } 7272 7273 wc.replay_dest->log_root = log; 7274 ret = btrfs_record_root_in_trans(trans, wc.replay_dest); 7275 if (ret) 7276 /* The loop needs to continue due to the root refs */ 7277 btrfs_abort_transaction(trans, ret); 7278 else 7279 ret = walk_log_tree(trans, log, &wc); 7280 7281 if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) { 7282 ret = fixup_inode_link_counts(trans, wc.replay_dest, 7283 path); 7284 if (ret) 7285 btrfs_abort_transaction(trans, ret); 7286 } 7287 7288 if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) { 7289 struct btrfs_root *root = wc.replay_dest; 7290 7291 btrfs_release_path(path); 7292 7293 /* 7294 * We have just replayed everything, and the highest 7295 * objectid of fs roots probably has changed in case 7296 * some inode_item's got replayed. 7297 * 7298 * root->objectid_mutex is not acquired as log replay 7299 * could only happen during mount. 7300 */ 7301 ret = btrfs_init_root_free_objectid(root); 7302 if (ret) 7303 btrfs_abort_transaction(trans, ret); 7304 } 7305 7306 wc.replay_dest->log_root = NULL; 7307 btrfs_put_root(wc.replay_dest); 7308 btrfs_put_root(log); 7309 7310 if (ret) 7311 goto error; 7312 next: 7313 if (found_key.offset == 0) 7314 break; 7315 key.offset = found_key.offset - 1; 7316 } 7317 btrfs_release_path(path); 7318 7319 /* step one is to pin it all, step two is to replay just inodes */ 7320 if (wc.pin) { 7321 wc.pin = 0; 7322 wc.process_func = replay_one_buffer; 7323 wc.stage = LOG_WALK_REPLAY_INODES; 7324 goto again; 7325 } 7326 /* step three is to replay everything */ 7327 if (wc.stage < LOG_WALK_REPLAY_ALL) { 7328 wc.stage++; 7329 goto again; 7330 } 7331 7332 btrfs_free_path(path); 7333 7334 /* step 4: commit the transaction, which also unpins the blocks */ 7335 ret = btrfs_commit_transaction(trans); 7336 if (ret) 7337 return ret; 7338 7339 log_root_tree->log_root = NULL; 7340 clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags); 7341 btrfs_put_root(log_root_tree); 7342 7343 return 0; 7344 error: 7345 if (wc.trans) 7346 btrfs_end_transaction(wc.trans); 7347 clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags); 7348 btrfs_free_path(path); 7349 return ret; 7350 } 7351 7352 /* 7353 * there are some corner cases where we want to force a full 7354 * commit instead of allowing a directory to be logged. 7355 * 7356 * They revolve around files there were unlinked from the directory, and 7357 * this function updates the parent directory so that a full commit is 7358 * properly done if it is fsync'd later after the unlinks are done. 7359 * 7360 * Must be called before the unlink operations (updates to the subvolume tree, 7361 * inodes, etc) are done. 7362 */ 7363 void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans, 7364 struct btrfs_inode *dir, struct btrfs_inode *inode, 7365 bool for_rename) 7366 { 7367 /* 7368 * when we're logging a file, if it hasn't been renamed 7369 * or unlinked, and its inode is fully committed on disk, 7370 * we don't have to worry about walking up the directory chain 7371 * to log its parents. 7372 * 7373 * So, we use the last_unlink_trans field to put this transid 7374 * into the file. When the file is logged we check it and 7375 * don't log the parents if the file is fully on disk. 
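 * For example, if a file that was already persisted is renamed and then
 * fsync'ed in the same transaction, the transid recorded here makes that fsync
 * log the file's parent directories as well, so the new name survives log
 * replay.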
 */
	mutex_lock(&inode->log_mutex);
	inode->last_unlink_trans = trans->transid;
	mutex_unlock(&inode->log_mutex);

	if (!for_rename)
		return;

	/*
	 * If this directory was already logged, any new names will be logged
	 * with btrfs_log_new_name() and old names will be deleted from the log
	 * tree with btrfs_del_dir_entries_in_log() or with
	 * btrfs_del_inode_ref_in_log().
	 */
	if (inode_logged(trans, dir, NULL) == 1)
		return;

	/*
	 * If the inode we're about to unlink was logged before, the log will be
	 * properly updated with the new name with btrfs_log_new_name() and the
	 * old name removed with btrfs_del_dir_entries_in_log() or with
	 * btrfs_del_inode_ref_in_log().
	 */
	if (inode_logged(trans, inode, NULL) == 1)
		return;

	/*
	 * When renaming files across directories, if the directory we're
	 * unlinking from gets fsync'd later on, there's no way to find the
	 * destination directory later and fsync it properly. So, we have to be
	 * conservative and force commits so the new name gets discovered.
	 */
	mutex_lock(&dir->log_mutex);
	dir->last_unlink_trans = trans->transid;
	mutex_unlock(&dir->log_mutex);
}

/*
 * Make sure that if someone attempts to fsync the parent directory of a deleted
 * snapshot, it ends up triggering a transaction commit. This is to guarantee
 * that after replaying the log tree of the parent directory's root we will not
 * see the snapshot anymore and at log replay time we will not see any log tree
 * corresponding to the deleted snapshot's root, which could lead to replaying
 * it after replaying the log tree of the parent directory (which would replay
 * the snapshot delete operation).
 *
 * Must be called before the actual snapshot destroy operation (updates to the
 * parent root and tree of tree roots, etc) are done.
 */
void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans,
				   struct btrfs_inode *dir)
{
	mutex_lock(&dir->log_mutex);
	dir->last_unlink_trans = trans->transid;
	mutex_unlock(&dir->log_mutex);
}

/*
 * Update the log after adding a new name for an inode.
 *
 * @trans:          Transaction handle.
 * @old_dentry:     The dentry associated with the old name and the old
 *                  parent directory.
 * @old_dir:        The inode of the previous parent directory for the case
 *                  of a rename. For a link operation, it must be NULL.
 * @old_dir_index:  The index number associated with the old name, meaningful
 *                  only for rename operations (when @old_dir is not NULL).
 *                  Ignored for link operations.
 * @parent:         The dentry associated with the directory under which the
 *                  new name is located.
 *
 * Call this after adding a new name for an inode, as a result of a link or
 * rename operation, and it will properly update the log to reflect the new name.
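 * For example, for a rename of 'x' from directory A to directory B, @old_dentry
 * is the dentry of 'x' under A, @old_dir is the inode of A, @old_dir_index is
 * the index number the old name had in A, and @parent is the dentry of B.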
7450 */ 7451 void btrfs_log_new_name(struct btrfs_trans_handle *trans, 7452 struct dentry *old_dentry, struct btrfs_inode *old_dir, 7453 u64 old_dir_index, struct dentry *parent) 7454 { 7455 struct btrfs_inode *inode = BTRFS_I(d_inode(old_dentry)); 7456 struct btrfs_root *root = inode->root; 7457 struct btrfs_log_ctx ctx; 7458 bool log_pinned = false; 7459 int ret; 7460 7461 /* 7462 * this will force the logging code to walk the dentry chain 7463 * up for the file 7464 */ 7465 if (!S_ISDIR(inode->vfs_inode.i_mode)) 7466 inode->last_unlink_trans = trans->transid; 7467 7468 /* 7469 * if this inode hasn't been logged and directory we're renaming it 7470 * from hasn't been logged, we don't need to log it 7471 */ 7472 ret = inode_logged(trans, inode, NULL); 7473 if (ret < 0) { 7474 goto out; 7475 } else if (ret == 0) { 7476 if (!old_dir) 7477 return; 7478 /* 7479 * If the inode was not logged and we are doing a rename (old_dir is not 7480 * NULL), check if old_dir was logged - if it was not we can return and 7481 * do nothing. 7482 */ 7483 ret = inode_logged(trans, old_dir, NULL); 7484 if (ret < 0) 7485 goto out; 7486 else if (ret == 0) 7487 return; 7488 } 7489 ret = 0; 7490 7491 /* 7492 * If we are doing a rename (old_dir is not NULL) from a directory that 7493 * was previously logged, make sure that on log replay we get the old 7494 * dir entry deleted. This is needed because we will also log the new 7495 * name of the renamed inode, so we need to make sure that after log 7496 * replay we don't end up with both the new and old dir entries existing. 7497 */ 7498 if (old_dir && old_dir->logged_trans == trans->transid) { 7499 struct btrfs_root *log = old_dir->root->log_root; 7500 struct btrfs_path *path; 7501 struct fscrypt_name fname; 7502 7503 ASSERT(old_dir_index >= BTRFS_DIR_START_INDEX); 7504 7505 ret = fscrypt_setup_filename(&old_dir->vfs_inode, 7506 &old_dentry->d_name, 0, &fname); 7507 if (ret) 7508 goto out; 7509 /* 7510 * We have two inodes to update in the log, the old directory and 7511 * the inode that got renamed, so we must pin the log to prevent 7512 * anyone from syncing the log until we have updated both inodes 7513 * in the log. 7514 */ 7515 ret = join_running_log_trans(root); 7516 /* 7517 * At least one of the inodes was logged before, so this should 7518 * not fail, but if it does, it's not serious, just bail out and 7519 * mark the log for a full commit. 7520 */ 7521 if (WARN_ON_ONCE(ret < 0)) { 7522 fscrypt_free_filename(&fname); 7523 goto out; 7524 } 7525 7526 log_pinned = true; 7527 7528 path = btrfs_alloc_path(); 7529 if (!path) { 7530 ret = -ENOMEM; 7531 fscrypt_free_filename(&fname); 7532 goto out; 7533 } 7534 7535 /* 7536 * Other concurrent task might be logging the old directory, 7537 * as it can be triggered when logging other inode that had or 7538 * still has a dentry in the old directory. We lock the old 7539 * directory's log_mutex to ensure the deletion of the old 7540 * name is persisted, because during directory logging we 7541 * delete all BTRFS_DIR_LOG_INDEX_KEY keys and the deletion of 7542 * the old name's dir index item is in the delayed items, so 7543 * it could be missed by an in progress directory logging. 7544 */ 7545 mutex_lock(&old_dir->log_mutex); 7546 ret = del_logged_dentry(trans, log, path, btrfs_ino(old_dir), 7547 &fname.disk_name, old_dir_index); 7548 if (ret > 0) { 7549 /* 7550 * The dentry does not exist in the log, so record its 7551 * deletion. 
7552 */ 7553 btrfs_release_path(path); 7554 ret = insert_dir_log_key(trans, log, path, 7555 btrfs_ino(old_dir), 7556 old_dir_index, old_dir_index); 7557 } 7558 mutex_unlock(&old_dir->log_mutex); 7559 7560 btrfs_free_path(path); 7561 fscrypt_free_filename(&fname); 7562 if (ret < 0) 7563 goto out; 7564 } 7565 7566 btrfs_init_log_ctx(&ctx, &inode->vfs_inode); 7567 ctx.logging_new_name = true; 7568 btrfs_init_log_ctx_scratch_eb(&ctx); 7569 /* 7570 * We don't care about the return value. If we fail to log the new name 7571 * then we know the next attempt to sync the log will fallback to a full 7572 * transaction commit (due to a call to btrfs_set_log_full_commit()), so 7573 * we don't need to worry about getting a log committed that has an 7574 * inconsistent state after a rename operation. 7575 */ 7576 btrfs_log_inode_parent(trans, inode, parent, LOG_INODE_EXISTS, &ctx); 7577 free_extent_buffer(ctx.scratch_eb); 7578 ASSERT(list_empty(&ctx.conflict_inodes)); 7579 out: 7580 /* 7581 * If an error happened mark the log for a full commit because it's not 7582 * consistent and up to date or we couldn't find out if one of the 7583 * inodes was logged before in this transaction. Do it before unpinning 7584 * the log, to avoid any races with someone else trying to commit it. 7585 */ 7586 if (ret < 0) 7587 btrfs_set_log_full_commit(trans); 7588 if (log_pinned) 7589 btrfs_end_log_trans(root); 7590 } 7591 7592