/*
 * Copyright (C) 2008 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */

#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/list_sort.h>
#include "tree-log.h"
#include "disk-io.h"
#include "locking.h"
#include "print-tree.h"
#include "backref.h"
#include "hash.h"
#include "compression.h"
#include "qgroup.h"

/* magic values for the inode_only field in btrfs_log_inode:
 *
 * LOG_INODE_ALL means to log everything
 * LOG_INODE_EXISTS means to log just enough to recreate the inode
 * during log replay
 */
#define LOG_INODE_ALL 0
#define LOG_INODE_EXISTS 1
#define LOG_OTHER_INODE 2

/*
 * directory trouble cases
 *
 * 1) on rename or unlink, if the inode being unlinked isn't in the fsync
 * log, we must force a full commit before doing an fsync of the directory
 * where the unlink was done.
 * ---> record transid of last unlink/rename per directory
 *
 * mkdir foo/some_dir
 * normal commit
 * rename foo/some_dir foo2/some_dir
 * mkdir foo/some_dir
 * fsync foo/some_dir/some_file
 *
 * The fsync above will unlink the original some_dir without recording
 * it in its new location (foo2).  After a crash, some_dir will be gone
 * unless the fsync of some_file forces a full commit
 *
 * 2) we must log any new names for any file or dir that is in the fsync
 * log. ---> check inode while renaming/linking.
 *
 * 2a) we must log any new names for any file or dir during rename
 * when the directory they are being removed from was logged.
 * ---> check inode and old parent dir during rename
 *
 * 2a is actually the more important variant.  With the extra logging
 * a crash might unlink the old name without recreating the new one
 *
 * 3) after a crash, we must go through any directories with a link count
 * of zero and redo the rm -rf
 *
 * mkdir f1/foo
 * normal commit
 * rm -rf f1/foo
 * fsync(f1)
 *
 * The directory f1 was fully removed from the FS, but fsync was never
 * called on f1, only its parent dir.  After a crash the rm -rf must
 * be replayed.  This must be able to recurse down the entire
 * directory tree.  The inode link count fixup code takes care of the
 * ugly details.
 */

/*
 * stages for the tree walking.  The first
 * stage (0) is to only pin down the blocks we find
 * the second stage (1) is to make sure that all the inodes
 * we find in the log are created in the subvolume.
 *
 * The last stage is to deal with directories and links and extents
 * and all the other fun semantics
 */
#define LOG_WALK_PIN_ONLY	0
#define LOG_WALK_REPLAY_INODES	1
#define LOG_WALK_REPLAY_DIR_INDEX	2
#define LOG_WALK_REPLAY_ALL	3

static int btrfs_log_inode(struct btrfs_trans_handle *trans,
			   struct btrfs_root *root, struct btrfs_inode *inode,
			   int inode_only,
			   const loff_t start,
			   const loff_t end,
			   struct btrfs_log_ctx *ctx);
static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root,
			     struct btrfs_path *path, u64 objectid);
static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
				       struct btrfs_root *root,
				       struct btrfs_root *log,
				       struct btrfs_path *path,
				       u64 dirid, int del_all);

/*
 * tree logging is a special write ahead log used to make sure that
 * fsyncs and O_SYNCs can happen without doing full tree commits.
 *
 * Full tree commits are expensive because they require commonly
 * modified blocks to be recowed, creating many dirty pages in the
 * extent tree and a 4x-6x higher write load than ext3.
 *
 * Instead of doing a tree commit on every fsync, we use the
 * key ranges and transaction ids to find items for a given file or directory
 * that have changed in this transaction.  Those items are copied into
 * a special tree (one per subvolume root), that tree is written to disk
 * and then the fsync is considered complete.
 *
 * After a crash, items are copied out of the log-tree back into the
 * subvolume tree.  Any file data extents found are recorded in the extent
 * allocation tree, and the log-tree freed.
 *
 * The log tree is read three times: once to pin down all the extents it is
 * using in ram, once to create all the inodes logged in the tree
 * and once to do all the other items.
 */
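
/*
 * Illustrative caller-side sketch (simplified from the fsync path in
 * fs/btrfs/file.c; error handling, locking and the BTRFS_NO_LOG_SYNC
 * case omitted): fsync logs the inode and only falls back to a full
 * transaction commit when the log cannot be used:
 *
 *	trans = btrfs_start_transaction(root, 0);
 *	ret = btrfs_log_dentry_safe(trans, root, dentry, start, end, &ctx);
 *	if (ret == 0)
 *		ret = btrfs_sync_log(trans, root, &ctx);
 *	if (ret)
 *		ret = btrfs_commit_transaction(trans);
 */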

/*
 * start a sub transaction and setup the log tree
 * this increments the log tree writer count to make the people
 * syncing the tree wait for us to finish
 */
static int start_log_trans(struct btrfs_trans_handle *trans,
			   struct btrfs_root *root,
			   struct btrfs_log_ctx *ctx)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	int ret = 0;

	mutex_lock(&root->log_mutex);

	if (root->log_root) {
		if (btrfs_need_log_full_commit(fs_info, trans)) {
			ret = -EAGAIN;
			goto out;
		}

		if (!root->log_start_pid) {
			clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
			root->log_start_pid = current->pid;
		} else if (root->log_start_pid != current->pid) {
			set_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
		}
	} else {
		mutex_lock(&fs_info->tree_log_mutex);
		if (!fs_info->log_root_tree)
			ret = btrfs_init_log_root_tree(trans, fs_info);
		mutex_unlock(&fs_info->tree_log_mutex);
		if (ret)
			goto out;

		ret = btrfs_add_log_tree(trans, root);
		if (ret)
			goto out;

		clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
		root->log_start_pid = current->pid;
	}

	atomic_inc(&root->log_batch);
	atomic_inc(&root->log_writers);
	if (ctx) {
		int index = root->log_transid % 2;
		list_add_tail(&ctx->list, &root->log_ctxs[index]);
		ctx->log_transid = root->log_transid;
	}

out:
	mutex_unlock(&root->log_mutex);
	return ret;
}

/*
 * returns 0 if there was a log transaction running and we were able
 * to join, or returns -ENOENT if there was no transaction
 * in progress
 */
static int join_running_log_trans(struct btrfs_root *root)
{
	int ret = -ENOENT;

	smp_mb();
	if (!root->log_root)
		return -ENOENT;

	mutex_lock(&root->log_mutex);
	if (root->log_root) {
		ret = 0;
		atomic_inc(&root->log_writers);
	}
	mutex_unlock(&root->log_mutex);
	return ret;
}

/*
 * This either makes the current running log transaction wait
 * until you call btrfs_end_log_trans() or it makes any future
 * log transactions wait until you call btrfs_end_log_trans()
 */
int btrfs_pin_log_trans(struct btrfs_root *root)
{
	int ret = -ENOENT;

	mutex_lock(&root->log_mutex);
	atomic_inc(&root->log_writers);
	mutex_unlock(&root->log_mutex);
	return ret;
}

/*
 * indicate we're done making changes to the log tree
 * and wake up anyone waiting to do a sync
 */
void btrfs_end_log_trans(struct btrfs_root *root)
{
	if (atomic_dec_and_test(&root->log_writers)) {
		/*
		 * Implicit memory barrier after atomic_dec_and_test
		 */
		if (waitqueue_active(&root->log_writer_wait))
			wake_up(&root->log_writer_wait);
	}
}
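
/*
 * Illustrative pairing (a sketch based on callers such as the rename
 * path, which live outside this file): pin the log across a multi-step
 * operation so a concurrent log sync cannot observe a half-done state:
 *
 *	btrfs_pin_log_trans(root);
 *	... unlink the old name, add the new one ...
 *	btrfs_end_log_trans(root);
 */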

/*
 * the walk control struct is used to pass state down the chain when
 * processing the log tree.  The stage field tells us which part
 * of the log tree processing we are currently doing.  The others
 * are state fields used for that specific part
 */
struct walk_control {
	/* should we free the extent on disk when done?  This is used
	 * at transaction commit time while freeing a log tree
	 */
	int free;

	/* should we write out the extent buffer?  This is used
	 * while flushing the log tree to disk during a sync
	 */
	int write;

	/* should we wait for the extent buffer io to finish?  Also used
	 * while flushing the log tree to disk for a sync
	 */
	int wait;

	/* pin only walk, we record which extents on disk belong to the
	 * log trees
	 */
	int pin;

	/* what stage of the replay code we're currently in */
	int stage;

	/* the root we are currently replaying */
	struct btrfs_root *replay_dest;

	/* the trans handle for the current replay */
	struct btrfs_trans_handle *trans;

	/* the function that gets used to process blocks we find in the
	 * tree.  Note the extent_buffer might not be up to date when it is
	 * passed in, and it must be checked or read if you need the data
	 * inside it
	 */
	int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb,
			    struct walk_control *wc, u64 gen);
};

/*
 * process_func used to pin down extents, write them or wait on them
 */
static int process_one_buffer(struct btrfs_root *log,
			      struct extent_buffer *eb,
			      struct walk_control *wc, u64 gen)
{
	struct btrfs_fs_info *fs_info = log->fs_info;
	int ret = 0;

	/*
	 * If this fs is mixed then we need to be able to process the leaves to
	 * pin down any logged extents, so we have to read the block.
	 */
	if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
		ret = btrfs_read_buffer(eb, gen);
		if (ret)
			return ret;
	}

	if (wc->pin)
		ret = btrfs_pin_extent_for_log_replay(fs_info, eb->start,
						      eb->len);

	if (!ret && btrfs_buffer_uptodate(eb, gen, 0)) {
		if (wc->pin && btrfs_header_level(eb) == 0)
			ret = btrfs_exclude_logged_extents(fs_info, eb);
		if (wc->write)
			btrfs_write_tree_block(eb);
		if (wc->wait)
			btrfs_wait_tree_block_writeback(eb);
	}
	return ret;
}
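
/*
 * Illustrative walk_control setup (sketch; the real initializers live
 * later in this file, in free_log_tree() and btrfs_recover_log_trees()).
 * The pin-only pass of log recovery wires process_one_buffer() in
 * roughly like this:
 *
 *	struct walk_control wc = {
 *		.process_func = process_one_buffer,
 *		.stage = LOG_WALK_PIN_ONLY,
 *		.pin = 1,
 *	};
 */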

/*
 * Item overwrite used by replay and tree logging.  eb, slot and key all refer
 * to the src data we are copying out.
 *
 * root is the tree we are copying into, and path is a scratch
 * path for use in this function (it should be released on entry and
 * will be released on exit).
 *
 * If the key is already in the destination tree the existing item is
 * overwritten.  If the existing item isn't big enough, it is extended.
 * If it is too large, it is truncated.
 *
 * If the key isn't in the destination yet, a new item is inserted.
 */
static noinline int overwrite_item(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root,
				   struct btrfs_path *path,
				   struct extent_buffer *eb, int slot,
				   struct btrfs_key *key)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	int ret;
	u32 item_size;
	u64 saved_i_size = 0;
	int save_old_i_size = 0;
	unsigned long src_ptr;
	unsigned long dst_ptr;
	int overwrite_root = 0;
	bool inode_item = key->type == BTRFS_INODE_ITEM_KEY;

	if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
		overwrite_root = 1;

	item_size = btrfs_item_size_nr(eb, slot);
	src_ptr = btrfs_item_ptr_offset(eb, slot);

	/* look for the key in the destination tree */
	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
	if (ret < 0)
		return ret;

	if (ret == 0) {
		char *src_copy;
		char *dst_copy;
		u32 dst_size = btrfs_item_size_nr(path->nodes[0],
						  path->slots[0]);
		if (dst_size != item_size)
			goto insert;

		if (item_size == 0) {
			btrfs_release_path(path);
			return 0;
		}
		dst_copy = kmalloc(item_size, GFP_NOFS);
		src_copy = kmalloc(item_size, GFP_NOFS);
		if (!dst_copy || !src_copy) {
			btrfs_release_path(path);
			kfree(dst_copy);
			kfree(src_copy);
			return -ENOMEM;
		}

		read_extent_buffer(eb, src_copy, src_ptr, item_size);

		dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
		read_extent_buffer(path->nodes[0], dst_copy, dst_ptr,
				   item_size);
		ret = memcmp(dst_copy, src_copy, item_size);

		kfree(dst_copy);
		kfree(src_copy);
		/*
		 * they have the same contents, just return, this saves
		 * us from cowing blocks in the destination tree and doing
		 * extra writes that may not have been done by a previous
		 * sync
		 */
		if (ret == 0) {
			btrfs_release_path(path);
			return 0;
		}

		/*
		 * We need to load the old nbytes into the inode so when we
		 * replay the extents we've logged we get the right nbytes.
		 */
		if (inode_item) {
			struct btrfs_inode_item *item;
			u64 nbytes;
			u32 mode;

			item = btrfs_item_ptr(path->nodes[0], path->slots[0],
					      struct btrfs_inode_item);
			nbytes = btrfs_inode_nbytes(path->nodes[0], item);
			item = btrfs_item_ptr(eb, slot,
					      struct btrfs_inode_item);
			btrfs_set_inode_nbytes(eb, item, nbytes);

			/*
			 * If this is a directory we need to reset the i_size to
			 * 0 so that we can set it up properly when replaying
			 * the rest of the items in this log.
			 */
			mode = btrfs_inode_mode(eb, item);
			if (S_ISDIR(mode))
				btrfs_set_inode_size(eb, item, 0);
		}
	} else if (inode_item) {
		struct btrfs_inode_item *item;
		u32 mode;

		/*
		 * New inode, set nbytes to 0 so that the nbytes comes out
		 * properly when we replay the extents.
		 */
		item = btrfs_item_ptr(eb, slot, struct btrfs_inode_item);
		btrfs_set_inode_nbytes(eb, item, 0);

		/*
		 * If this is a directory we need to reset the i_size to 0 so
		 * that we can set it up properly when replaying the rest of
		 * the items in this log.
		 */
		mode = btrfs_inode_mode(eb, item);
		if (S_ISDIR(mode))
			btrfs_set_inode_size(eb, item, 0);
	}
insert:
	btrfs_release_path(path);
	/* try to insert the key into the destination tree */
	path->skip_release_on_error = 1;
	ret = btrfs_insert_empty_item(trans, root, path,
				      key, item_size);
	path->skip_release_on_error = 0;

	/* make sure any existing item is the correct size */
	if (ret == -EEXIST || ret == -EOVERFLOW) {
		u32 found_size;
		found_size = btrfs_item_size_nr(path->nodes[0],
						path->slots[0]);
		if (found_size > item_size)
			btrfs_truncate_item(fs_info, path, item_size, 1);
		else if (found_size < item_size)
			btrfs_extend_item(fs_info, path,
					  item_size - found_size);
	} else if (ret) {
		return ret;
	}
	dst_ptr = btrfs_item_ptr_offset(path->nodes[0],
					path->slots[0]);

	/* don't overwrite an existing inode if the generation number
	 * was logged as zero.  This is done when the tree logging code
	 * is just logging an inode to make sure it exists after recovery.
	 *
	 * Also, don't overwrite i_size on directories during replay.
	 * log replay inserts and removes directory items based on the
	 * state of the tree found in the subvolume, and i_size is modified
	 * as it goes
	 */
	if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) {
		struct btrfs_inode_item *src_item;
		struct btrfs_inode_item *dst_item;

		src_item = (struct btrfs_inode_item *)src_ptr;
		dst_item = (struct btrfs_inode_item *)dst_ptr;

		if (btrfs_inode_generation(eb, src_item) == 0) {
			struct extent_buffer *dst_eb = path->nodes[0];
			const u64 ino_size = btrfs_inode_size(eb, src_item);

			/*
			 * For regular files an ino_size == 0 is used only when
			 * logging that an inode exists, as part of a directory
			 * fsync, and the inode wasn't fsynced before.  In this
			 * case don't set the size of the inode in the fs/subvol
			 * tree, otherwise we would be throwing valid data away.
			 */
			if (S_ISREG(btrfs_inode_mode(eb, src_item)) &&
			    S_ISREG(btrfs_inode_mode(dst_eb, dst_item)) &&
			    ino_size != 0) {
				struct btrfs_map_token token;

				btrfs_init_map_token(&token);
				btrfs_set_token_inode_size(dst_eb, dst_item,
							   ino_size, &token);
			}
			goto no_copy;
		}

		if (overwrite_root &&
		    S_ISDIR(btrfs_inode_mode(eb, src_item)) &&
		    S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) {
			save_old_i_size = 1;
			saved_i_size = btrfs_inode_size(path->nodes[0],
							dst_item);
		}
	}

	copy_extent_buffer(path->nodes[0], eb, dst_ptr,
			   src_ptr, item_size);

	if (save_old_i_size) {
		struct btrfs_inode_item *dst_item;
		dst_item = (struct btrfs_inode_item *)dst_ptr;
		btrfs_set_inode_size(path->nodes[0], dst_item, saved_i_size);
	}

	/* make sure the generation is filled in */
	if (key->type == BTRFS_INODE_ITEM_KEY) {
		struct btrfs_inode_item *dst_item;
		dst_item = (struct btrfs_inode_item *)dst_ptr;
		if (btrfs_inode_generation(path->nodes[0], dst_item) == 0) {
			btrfs_set_inode_generation(path->nodes[0], dst_item,
						   trans->transid);
		}
	}
no_copy:
	btrfs_mark_buffer_dirty(path->nodes[0]);
	btrfs_release_path(path);
	return 0;
}

/*
 * simple helper to read an inode off the disk from a given root
 * This can only be called for subvolume roots and not for the log
 */
static noinline struct inode *read_one_inode(struct btrfs_root *root,
					     u64 objectid)
{
	struct btrfs_key key;
	struct inode *inode;

	key.objectid = objectid;
	key.type = BTRFS_INODE_ITEM_KEY;
	key.offset = 0;
	inode = btrfs_iget(root->fs_info->sb, &key, root, NULL);
	if (IS_ERR(inode)) {
		inode = NULL;
	} else if (is_bad_inode(inode)) {
		iput(inode);
		inode = NULL;
	}
	return inode;
}
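
/*
 * Typical replay-side use of read_one_inode() (an illustrative sketch;
 * this exact pattern repeats in the helpers below): look the inode up,
 * fail the replay step if it cannot be read, and drop the reference
 * when done:
 *
 *	inode = read_one_inode(root, objectid);
 *	if (!inode)
 *		return -EIO;
 *	...
 *	iput(inode);
 */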

/* replays a single extent in 'eb' at 'slot' with 'key' into the
 * subvolume 'root'.  path is released on entry and should be released
 * on exit.
 *
 * extents in the log tree have not been allocated out of the extent
 * tree yet.  So, this completes the allocation, taking a reference
 * as required if the extent already exists or creating a new extent
 * if it isn't in the extent allocation tree yet.
 *
 * The extent is inserted into the file, dropping any existing extents
 * from the file that overlap the new one.
 */
static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root,
				      struct btrfs_path *path,
				      struct extent_buffer *eb, int slot,
				      struct btrfs_key *key)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	int found_type;
	u64 extent_end;
	u64 start = key->offset;
	u64 nbytes = 0;
	struct btrfs_file_extent_item *item;
	struct inode *inode = NULL;
	unsigned long size;
	int ret = 0;

	item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
	found_type = btrfs_file_extent_type(eb, item);

	if (found_type == BTRFS_FILE_EXTENT_REG ||
	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
		nbytes = btrfs_file_extent_num_bytes(eb, item);
		extent_end = start + nbytes;

		/*
		 * We don't add to the inodes nbytes if we are prealloc or a
		 * hole.
		 */
		if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
			nbytes = 0;
	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
		size = btrfs_file_extent_inline_len(eb, slot, item);
		nbytes = btrfs_file_extent_ram_bytes(eb, item);
		extent_end = ALIGN(start + size,
				   fs_info->sectorsize);
	} else {
		ret = 0;
		goto out;
	}

	inode = read_one_inode(root, key->objectid);
	if (!inode) {
		ret = -EIO;
		goto out;
	}

	/*
	 * first check to see if we already have this extent in the
	 * file.  This must be done before the btrfs_drop_extents run
	 * so we don't try to drop this extent.
	 */
	ret = btrfs_lookup_file_extent(trans, root, path,
				       btrfs_ino(BTRFS_I(inode)), start, 0);

	if (ret == 0 &&
	    (found_type == BTRFS_FILE_EXTENT_REG ||
	     found_type == BTRFS_FILE_EXTENT_PREALLOC)) {
		struct btrfs_file_extent_item cmp1;
		struct btrfs_file_extent_item cmp2;
		struct btrfs_file_extent_item *existing;
		struct extent_buffer *leaf;

		leaf = path->nodes[0];
		existing = btrfs_item_ptr(leaf, path->slots[0],
					  struct btrfs_file_extent_item);

		read_extent_buffer(eb, &cmp1, (unsigned long)item,
				   sizeof(cmp1));
		read_extent_buffer(leaf, &cmp2, (unsigned long)existing,
				   sizeof(cmp2));

		/*
		 * we already have a pointer to this exact extent,
		 * we don't have to do anything
		 */
		if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) {
			btrfs_release_path(path);
			goto out;
		}
	}
	btrfs_release_path(path);

	/* drop any overlapping extents */
	ret = btrfs_drop_extents(trans, root, inode, start, extent_end, 1);
	if (ret)
		goto out;

	if (found_type == BTRFS_FILE_EXTENT_REG ||
	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
		u64 offset;
		unsigned long dest_offset;
		struct btrfs_key ins;

		if (btrfs_file_extent_disk_bytenr(eb, item) == 0 &&
		    btrfs_fs_incompat(fs_info, NO_HOLES))
			goto update_inode;

		ret = btrfs_insert_empty_item(trans, root, path, key,
					      sizeof(*item));
		if (ret)
			goto out;
		dest_offset = btrfs_item_ptr_offset(path->nodes[0],
						    path->slots[0]);
		copy_extent_buffer(path->nodes[0], eb, dest_offset,
				   (unsigned long)item, sizeof(*item));

		ins.objectid = btrfs_file_extent_disk_bytenr(eb, item);
		ins.offset = btrfs_file_extent_disk_num_bytes(eb, item);
		ins.type = BTRFS_EXTENT_ITEM_KEY;
		offset = key->offset - btrfs_file_extent_offset(eb, item);

		/*
		 * Manually record dirty extent, as here we did a shallow
		 * file extent item copy and skip normal backref update,
		 * but modifying extent tree all by ourselves.
		 * So need to manually record dirty extent for qgroup,
		 * as the owner of the file extent changed from log tree
		 * (doesn't affect qgroup) to fs/file tree (affects qgroup)
		 */
		ret = btrfs_qgroup_trace_extent(trans, fs_info,
				btrfs_file_extent_disk_bytenr(eb, item),
				btrfs_file_extent_disk_num_bytes(eb, item),
				GFP_NOFS);
		if (ret < 0)
			goto out;

		if (ins.objectid > 0) {
			u64 csum_start;
			u64 csum_end;
			LIST_HEAD(ordered_sums);
			/*
			 * is this extent already allocated in the extent
			 * allocation tree?  If so, just add a reference
			 */
			ret = btrfs_lookup_data_extent(fs_info, ins.objectid,
						       ins.offset);
			if (ret == 0) {
				ret = btrfs_inc_extent_ref(trans, fs_info,
						ins.objectid, ins.offset,
						0, root->root_key.objectid,
						key->objectid, offset);
				if (ret)
					goto out;
			} else {
				/*
				 * insert the extent pointer in the extent
				 * allocation tree
				 */
				ret = btrfs_alloc_logged_file_extent(trans,
						fs_info,
						root->root_key.objectid,
						key->objectid, offset, &ins);
				if (ret)
					goto out;
			}
			btrfs_release_path(path);

			if (btrfs_file_extent_compression(eb, item)) {
				csum_start = ins.objectid;
				csum_end = csum_start + ins.offset;
			} else {
				csum_start = ins.objectid +
					btrfs_file_extent_offset(eb, item);
				csum_end = csum_start +
					btrfs_file_extent_num_bytes(eb, item);
			}

			ret = btrfs_lookup_csums_range(root->log_root,
						csum_start, csum_end - 1,
						&ordered_sums, 0);
			if (ret)
				goto out;
			/*
			 * Now delete all existing csums in the csum root that
			 * cover our range. We do this because we can have an
			 * extent that is completely referenced by one file
			 * extent item and partially referenced by another
			 * file extent item (like after using the clone or
			 * extent_same ioctls). In this case if we end up doing
			 * the replay of the one that partially references the
			 * extent first, and we do not do the csum deletion
			 * below, we can get 2 csum items in the csum tree that
			 * overlap each other. For example, imagine our log has
			 * the two following file extent items:
			 *
			 * key (257 EXTENT_DATA 409600)
			 *     extent data disk byte 12845056 nr 102400
			 *     extent data offset 20480 nr 20480 ram 102400
			 *
			 * key (257 EXTENT_DATA 819200)
			 *     extent data disk byte 12845056 nr 102400
			 *     extent data offset 0 nr 102400 ram 102400
			 *
			 * Where the second one fully references the 100K extent
			 * that starts at disk byte 12845056, and the log tree
			 * has a single csum item that covers the entire range
			 * of the extent:
			 *
			 * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
			 *
			 * After the first file extent item is replayed, the
			 * csum tree gets the following csum item:
			 *
			 * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
			 *
			 * Which covers the 20K sub-range starting at offset 20K
			 * of our extent. Now when we replay the second file
			 * extent item, if we do not delete existing csum items
			 * that cover any of its blocks, we end up getting two
			 * csum items in our csum tree that overlap each other:
			 *
			 * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
			 * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
			 *
			 * Which is a problem, because after this anyone trying
			 * to look up the checksum of any block of our
			 * extent starting at an offset of 40K or higher, will
			 * end up looking at the second csum item only, which
			 * does not contain the checksum for any block starting
			 * at offset 40K or higher of our extent.
			 */
			while (!list_empty(&ordered_sums)) {
				struct btrfs_ordered_sum *sums;
				sums = list_entry(ordered_sums.next,
						struct btrfs_ordered_sum,
						list);
				if (!ret)
					ret = btrfs_del_csums(trans, fs_info,
							      sums->bytenr,
							      sums->len);
				if (!ret)
					ret = btrfs_csum_file_blocks(trans,
						fs_info->csum_root, sums);
				list_del(&sums->list);
				kfree(sums);
			}
			if (ret)
				goto out;
		} else {
			btrfs_release_path(path);
		}
	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
		/* inline extents are easy, we just overwrite them */
		ret = overwrite_item(trans, root, path, eb, slot, key);
		if (ret)
			goto out;
	}

	inode_add_bytes(inode, nbytes);
update_inode:
	ret = btrfs_update_inode(trans, root, inode);
out:
	if (inode)
		iput(inode);
	return ret;
}

/*
 * when cleaning up conflicts between the directory names in the
 * subvolume, directory names in the log and directory names in the
 * inode back references, we may have to unlink inodes from directories.
 *
 * This is a helper function to do the unlink of a specific directory
 * item
 */
static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root,
				      struct btrfs_path *path,
				      struct btrfs_inode *dir,
				      struct btrfs_dir_item *di)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct inode *inode;
	char *name;
	int name_len;
	struct extent_buffer *leaf;
	struct btrfs_key location;
	int ret;

	leaf = path->nodes[0];

	btrfs_dir_item_key_to_cpu(leaf, di, &location);
	name_len = btrfs_dir_name_len(leaf, di);
	name = kmalloc(name_len, GFP_NOFS);
	if (!name)
		return -ENOMEM;

	read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len);
	btrfs_release_path(path);

	inode = read_one_inode(root, location.objectid);
	if (!inode) {
		ret = -EIO;
		goto out;
	}

	ret = link_to_fixup_dir(trans, root, path, location.objectid);
	if (ret)
		goto out;

	ret = btrfs_unlink_inode(trans, root, dir, BTRFS_I(inode), name,
				 name_len);
	if (ret)
		goto out;
	else
		ret = btrfs_run_delayed_items(trans, fs_info);
out:
	kfree(name);
	iput(inode);
	return ret;
}

/*
 * helper function to see if a given name and sequence number found
 * in an inode back reference are already in a directory and correctly
 * point to this inode
 */
static noinline int inode_in_dir(struct btrfs_root *root,
				 struct btrfs_path *path,
				 u64 dirid, u64 objectid, u64 index,
				 const char *name, int name_len)
{
	struct btrfs_dir_item *di;
	struct btrfs_key location;
	int match = 0;

	di = btrfs_lookup_dir_index_item(NULL, root, path, dirid,
					 index, name, name_len, 0);
	if (di && !IS_ERR(di)) {
		btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
		if (location.objectid != objectid)
			goto out;
	} else
		goto out;
	btrfs_release_path(path);

	di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 0);
	if (di && !IS_ERR(di)) {
		btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
		if (location.objectid != objectid)
			goto out;
	} else
		goto out;
	match = 1;
out:
	btrfs_release_path(path);
	return match;
}

/*
 * helper function to check a log tree for a named back reference in
 * an inode.  This is used to decide if a back reference that is
 * found in the subvolume conflicts with what we find in the log.
 *
 * inode backreferences may have multiple refs in a single item,
 * during replay we process one reference at a time, and we don't
 * want to delete valid links to a file from the subvolume if that
 * link is also in the log.
 */
static noinline int backref_in_log(struct btrfs_root *log,
				   struct btrfs_key *key,
				   u64 ref_objectid,
				   const char *name, int namelen)
{
	struct btrfs_path *path;
	struct btrfs_inode_ref *ref;
	unsigned long ptr;
	unsigned long ptr_end;
	unsigned long name_ptr;
	int found_name_len;
	int item_size;
	int ret;
	int match = 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = btrfs_search_slot(NULL, log, key, path, 0, 0);
	if (ret != 0)
		goto out;

	ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);

	if (key->type == BTRFS_INODE_EXTREF_KEY) {
		if (btrfs_find_name_in_ext_backref(path, ref_objectid,
						   name, namelen, NULL))
			match = 1;

		goto out;
	}

	item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
	ptr_end = ptr + item_size;
	while (ptr < ptr_end) {
		ref = (struct btrfs_inode_ref *)ptr;
		found_name_len = btrfs_inode_ref_name_len(path->nodes[0], ref);
		if (found_name_len == namelen) {
			name_ptr = (unsigned long)(ref + 1);
			ret = memcmp_extent_buffer(path->nodes[0], name,
						   name_ptr, namelen);
			if (ret == 0) {
				match = 1;
				goto out;
			}
		}
		ptr = (unsigned long)(ref + 1) + found_name_len;
	}
out:
	btrfs_free_path(path);
	return match;
}

static inline int __add_inode_ref(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root,
				  struct btrfs_path *path,
				  struct btrfs_root *log_root,
				  struct btrfs_inode *dir,
				  struct btrfs_inode *inode,
				  u64 inode_objectid, u64 parent_objectid,
				  u64 ref_index, char *name, int namelen,
				  int *search_done)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	int ret;
	char *victim_name;
	int victim_name_len;
	struct extent_buffer *leaf;
	struct btrfs_dir_item *di;
	struct btrfs_key search_key;
	struct btrfs_inode_extref *extref;

again:
	/* Search old style refs */
	search_key.objectid = inode_objectid;
	search_key.type = BTRFS_INODE_REF_KEY;
	search_key.offset = parent_objectid;
	ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
	if (ret == 0) {
		struct btrfs_inode_ref *victim_ref;
		unsigned long ptr;
		unsigned long ptr_end;

		leaf = path->nodes[0];

		/* are we trying to overwrite a back ref for the root directory
		 * if so, just jump out, we're done
		 */
		if (search_key.objectid == search_key.offset)
			return 1;

		/* check all the names in this back reference to see
		 * if they are in the log.  if so, we allow them to stay
		 * otherwise they must be unlinked as a conflict
		 */
		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
		ptr_end = ptr + btrfs_item_size_nr(leaf, path->slots[0]);
		while (ptr < ptr_end) {
			victim_ref = (struct btrfs_inode_ref *)ptr;
			victim_name_len = btrfs_inode_ref_name_len(leaf,
								   victim_ref);
			victim_name = kmalloc(victim_name_len, GFP_NOFS);
			if (!victim_name)
				return -ENOMEM;

			read_extent_buffer(leaf, victim_name,
					   (unsigned long)(victim_ref + 1),
					   victim_name_len);

			if (!backref_in_log(log_root, &search_key,
					    parent_objectid,
					    victim_name,
					    victim_name_len)) {
				inc_nlink(&inode->vfs_inode);
				btrfs_release_path(path);

				ret = btrfs_unlink_inode(trans, root, dir, inode,
						victim_name, victim_name_len);
				kfree(victim_name);
				if (ret)
					return ret;
				ret = btrfs_run_delayed_items(trans, fs_info);
				if (ret)
					return ret;
				*search_done = 1;
				goto again;
			}
			kfree(victim_name);

			ptr = (unsigned long)(victim_ref + 1) + victim_name_len;
		}

		/*
		 * NOTE: we have searched root tree and checked the
		 * corresponding ref, it does not need to check again.
		 */
		*search_done = 1;
	}
	btrfs_release_path(path);

	/* Same search but for extended refs */
	extref = btrfs_lookup_inode_extref(NULL, root, path, name, namelen,
					   inode_objectid, parent_objectid, 0,
					   0);
	if (!IS_ERR_OR_NULL(extref)) {
		u32 item_size;
		u32 cur_offset = 0;
		unsigned long base;
		struct inode *victim_parent;

		leaf = path->nodes[0];

		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
		base = btrfs_item_ptr_offset(leaf, path->slots[0]);

		while (cur_offset < item_size) {
			extref = (struct btrfs_inode_extref *)(base + cur_offset);

			victim_name_len = btrfs_inode_extref_name_len(leaf, extref);

			if (btrfs_inode_extref_parent(leaf, extref) != parent_objectid)
				goto next;

			victim_name = kmalloc(victim_name_len, GFP_NOFS);
			if (!victim_name)
				return -ENOMEM;
			read_extent_buffer(leaf, victim_name, (unsigned long)&extref->name,
					   victim_name_len);

			search_key.objectid = inode_objectid;
			search_key.type = BTRFS_INODE_EXTREF_KEY;
			search_key.offset = btrfs_extref_hash(parent_objectid,
							      victim_name,
							      victim_name_len);
			ret = 0;
			if (!backref_in_log(log_root, &search_key,
					    parent_objectid, victim_name,
					    victim_name_len)) {
				ret = -ENOENT;
				victim_parent = read_one_inode(root,
						parent_objectid);
				if (victim_parent) {
					inc_nlink(&inode->vfs_inode);
					btrfs_release_path(path);

					ret = btrfs_unlink_inode(trans, root,
							BTRFS_I(victim_parent),
							inode,
							victim_name,
							victim_name_len);
					if (!ret)
						ret = btrfs_run_delayed_items(
								trans,
								fs_info);
				}
				iput(victim_parent);
				kfree(victim_name);
				if (ret)
					return ret;
				*search_done = 1;
				goto again;
			}
			kfree(victim_name);
next:
			cur_offset += victim_name_len + sizeof(*extref);
		}
		*search_done = 1;
	}
	btrfs_release_path(path);

	/* look for a conflicting sequence number */
	di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir),
					 ref_index, name, namelen, 0);
	if (di && !IS_ERR(di)) {
		ret = drop_one_dir_item(trans, root, path, dir, di);
		if (ret)
			return ret;
	}
	btrfs_release_path(path);

	/* look for a conflicting name */
	di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir),
				   name, namelen, 0);
	if (di && !IS_ERR(di)) {
		ret = drop_one_dir_item(trans, root, path, dir, di);
		if (ret)
			return ret;
	}
	btrfs_release_path(path);

	return 0;
}

static int extref_get_fields(struct extent_buffer *eb, int slot,
			     unsigned long ref_ptr, u32 *namelen, char **name,
			     u64 *index, u64 *parent_objectid)
{
	struct btrfs_inode_extref *extref;

	extref = (struct btrfs_inode_extref *)ref_ptr;

	*namelen = btrfs_inode_extref_name_len(eb, extref);
	if (!btrfs_is_name_len_valid(eb, slot, (unsigned long)&extref->name,
				     *namelen))
		return -EIO;

	*name = kmalloc(*namelen, GFP_NOFS);
	if (*name == NULL)
		return -ENOMEM;

	read_extent_buffer(eb, *name, (unsigned long)&extref->name,
			   *namelen);

	*index = btrfs_inode_extref_index(eb, extref);
	if (parent_objectid)
		*parent_objectid = btrfs_inode_extref_parent(eb, extref);

	return 0;
}

static int ref_get_fields(struct extent_buffer *eb, int slot,
			  unsigned long ref_ptr, u32 *namelen, char **name,
			  u64 *index)
{
	struct btrfs_inode_ref *ref;

	ref = (struct btrfs_inode_ref *)ref_ptr;

	*namelen = btrfs_inode_ref_name_len(eb, ref);
	if (!btrfs_is_name_len_valid(eb, slot, (unsigned long)(ref + 1),
				     *namelen))
		return -EIO;

	*name = kmalloc(*namelen, GFP_NOFS);
	if (*name == NULL)
		return -ENOMEM;

	read_extent_buffer(eb, *name, (unsigned long)(ref + 1), *namelen);

	*index = btrfs_inode_ref_index(eb, ref);

	return 0;
}

/*
 * replay one inode back reference item found in the log tree.
 * eb, slot and key refer to the buffer and key found in the log tree.
 * root is the destination we are replaying into, and path is for temp
 * use by this function.  (it should be released on return).
 */
static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root,
				  struct btrfs_root *log,
				  struct btrfs_path *path,
				  struct extent_buffer *eb, int slot,
				  struct btrfs_key *key)
{
	struct inode *dir = NULL;
	struct inode *inode = NULL;
	unsigned long ref_ptr;
	unsigned long ref_end;
	char *name = NULL;
	int namelen;
	int ret;
	int search_done = 0;
	int log_ref_ver = 0;
	u64 parent_objectid;
	u64 inode_objectid;
	u64 ref_index = 0;
	int ref_struct_size;

	ref_ptr = btrfs_item_ptr_offset(eb, slot);
	ref_end = ref_ptr + btrfs_item_size_nr(eb, slot);

	if (key->type == BTRFS_INODE_EXTREF_KEY) {
		struct btrfs_inode_extref *r;

		ref_struct_size = sizeof(struct btrfs_inode_extref);
		log_ref_ver = 1;
		r = (struct btrfs_inode_extref *)ref_ptr;
		parent_objectid = btrfs_inode_extref_parent(eb, r);
	} else {
		ref_struct_size = sizeof(struct btrfs_inode_ref);
		parent_objectid = key->offset;
	}
	inode_objectid = key->objectid;

	/*
	 * it is possible that we didn't log all the parent directories
	 * for a given inode.  If we don't find the dir, just don't
	 * copy the back ref in.  The link count fixup code will take
	 * care of the rest
	 */
	dir = read_one_inode(root, parent_objectid);
	if (!dir) {
		ret = -ENOENT;
		goto out;
	}

	inode = read_one_inode(root, inode_objectid);
	if (!inode) {
		ret = -EIO;
		goto out;
	}

	while (ref_ptr < ref_end) {
		if (log_ref_ver) {
			ret = extref_get_fields(eb, slot, ref_ptr, &namelen,
					&name, &ref_index, &parent_objectid);
			/*
			 * parent object can change from one array
			 * item to another.
			 */
			if (!dir)
				dir = read_one_inode(root, parent_objectid);
			if (!dir) {
				ret = -ENOENT;
				goto out;
			}
		} else {
			ret = ref_get_fields(eb, slot, ref_ptr, &namelen,
					     &name, &ref_index);
		}
		if (ret)
			goto out;

		/* if we already have a perfect match, we're done */
		if (!inode_in_dir(root, path, btrfs_ino(BTRFS_I(dir)),
					btrfs_ino(BTRFS_I(inode)), ref_index,
					name, namelen)) {
			/*
			 * look for a conflicting back reference in the
			 * metadata. if we find one we have to unlink that name
			 * of the file before we add our new link.  Later on, we
			 * overwrite any existing back reference, and we don't
			 * want to create dangling pointers in the directory.
			 */

			if (!search_done) {
				ret = __add_inode_ref(trans, root, path, log,
						      BTRFS_I(dir),
						      BTRFS_I(inode),
						      inode_objectid,
						      parent_objectid,
						      ref_index, name, namelen,
						      &search_done);
				if (ret) {
					if (ret == 1)
						ret = 0;
					goto out;
				}
			}

			/* insert our name */
			ret = btrfs_add_link(trans, BTRFS_I(dir),
					BTRFS_I(inode),
					name, namelen, 0, ref_index);
			if (ret)
				goto out;

			btrfs_update_inode(trans, root, inode);
		}

		ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + namelen;
		kfree(name);
		name = NULL;
		if (log_ref_ver) {
			iput(dir);
			dir = NULL;
		}
	}

	/* finally write the back reference in the inode */
	ret = overwrite_item(trans, root, path, eb, slot, key);
out:
	btrfs_release_path(path);
	kfree(name);
	iput(dir);
	iput(inode);
	return ret;
}

static int insert_orphan_item(struct btrfs_trans_handle *trans,
			      struct btrfs_root *root, u64 ino)
{
	int ret;

	ret = btrfs_insert_orphan_item(trans, root, ino);
	if (ret == -EEXIST)
		ret = 0;

	return ret;
}

static int count_inode_extrefs(struct btrfs_root *root,
			       struct btrfs_inode *inode,
			       struct btrfs_path *path)
{
	int ret = 0;
	int name_len;
	unsigned int nlink = 0;
	u32 item_size;
	u32 cur_offset = 0;
	u64 inode_objectid = btrfs_ino(inode);
	u64 offset = 0;
	unsigned long ptr;
	struct btrfs_inode_extref *extref;
	struct extent_buffer *leaf;

	while (1) {
		ret = btrfs_find_one_extref(root, inode_objectid, offset, path,
					    &extref, &offset);
		if (ret)
			break;

		leaf = path->nodes[0];
		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
		cur_offset = 0;

		while (cur_offset < item_size) {
			extref = (struct btrfs_inode_extref *) (ptr + cur_offset);
			name_len = btrfs_inode_extref_name_len(leaf, extref);

			nlink++;

			cur_offset += name_len + sizeof(*extref);
		}

		offset++;
		btrfs_release_path(path);
	}
	btrfs_release_path(path);

	if (ret < 0 && ret != -ENOENT)
		return ret;
	return nlink;
}

static int count_inode_refs(struct btrfs_root *root,
			    struct btrfs_inode *inode, struct btrfs_path *path)
{
	int ret;
	struct btrfs_key key;
	unsigned int nlink = 0;
	unsigned long ptr;
	unsigned long ptr_end;
	int name_len;
	u64 ino = btrfs_ino(inode);

	key.objectid = ino;
	key.type = BTRFS_INODE_REF_KEY;
	key.offset = (u64)-1;

	while (1) {
		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
		if (ret < 0)
			break;
		if (ret > 0) {
			if (path->slots[0] == 0)
				break;
			path->slots[0]--;
		}
process_slot:
		btrfs_item_key_to_cpu(path->nodes[0], &key,
				      path->slots[0]);
		if (key.objectid != ino ||
		    key.type != BTRFS_INODE_REF_KEY)
			break;
		ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
		ptr_end = ptr + btrfs_item_size_nr(path->nodes[0],
						   path->slots[0]);
		while (ptr < ptr_end) {
			struct btrfs_inode_ref *ref;

			ref = (struct btrfs_inode_ref *)ptr;
			name_len = btrfs_inode_ref_name_len(path->nodes[0],
							    ref);
			ptr = (unsigned long)(ref + 1) + name_len;
			nlink++;
		}

		if (key.offset == 0)
			break;
		if (path->slots[0] > 0) {
			path->slots[0]--;
			goto process_slot;
		}
		key.offset--;
		btrfs_release_path(path);
	}
	btrfs_release_path(path);

	return nlink;
}

/*
 * There are a few corners where the link count of the file can't
 * be properly maintained during replay.  So, instead of adding
 * lots of complexity to the log code, we just scan the backrefs
 * for any file that has been through replay.
 *
 * The scan will update the link count on the inode to reflect the
 * number of back refs found.  If it goes down to zero, the iput
 * will free the inode.
 */
static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
					   struct btrfs_root *root,
					   struct inode *inode)
{
	struct btrfs_path *path;
	int ret;
	u64 nlink = 0;
	u64 ino = btrfs_ino(BTRFS_I(inode));

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = count_inode_refs(root, BTRFS_I(inode), path);
	if (ret < 0)
		goto out;

	nlink = ret;

	ret = count_inode_extrefs(root, BTRFS_I(inode), path);
	if (ret < 0)
		goto out;

	nlink += ret;

	ret = 0;

	if (nlink != inode->i_nlink) {
		set_nlink(inode, nlink);
		btrfs_update_inode(trans, root, inode);
	}
	BTRFS_I(inode)->index_cnt = (u64)-1;

	if (inode->i_nlink == 0) {
		if (S_ISDIR(inode->i_mode)) {
			ret = replay_dir_deletes(trans, root, NULL, path,
						 ino, 1);
			if (ret)
				goto out;
		}
		ret = insert_orphan_item(trans, root, ino);
	}

out:
	btrfs_free_path(path);
	return ret;
}

static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
					    struct btrfs_root *root,
					    struct btrfs_path *path)
{
	int ret;
	struct btrfs_key key;
	struct inode *inode;

	key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
	key.type = BTRFS_ORPHAN_ITEM_KEY;
	key.offset = (u64)-1;
	while (1) {
		ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
		if (ret < 0)
			break;

		if (ret == 1) {
			if (path->slots[0] == 0)
				break;
			path->slots[0]--;
		}

		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
		if (key.objectid != BTRFS_TREE_LOG_FIXUP_OBJECTID ||
		    key.type != BTRFS_ORPHAN_ITEM_KEY)
			break;

		ret = btrfs_del_item(trans, root, path);
		if (ret)
			goto out;

		btrfs_release_path(path);
		inode = read_one_inode(root, key.offset);
		if (!inode)
			return -EIO;

		ret = fixup_inode_link_count(trans, root, inode);
		iput(inode);
		if (ret)
			goto out;

		/*
		 * fixup on a directory may create new entries,
		 * make sure we always look for the highest possible
		 * offset
		 */
		key.offset = (u64)-1;
	}
	ret = 0;
out:
	btrfs_release_path(path);
	return ret;
}


/*
 * record a given inode in the fixup dir so we can check its link
 * count when replay is done.  The link count is incremented here
 * so the inode won't go away until we check it
 */
static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root,
				      struct btrfs_path *path,
				      u64 objectid)
{
	struct btrfs_key key;
	int ret = 0;
	struct inode *inode;

	inode = read_one_inode(root, objectid);
	if (!inode)
		return -EIO;

	key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
	key.type = BTRFS_ORPHAN_ITEM_KEY;
	key.offset = objectid;

	ret = btrfs_insert_empty_item(trans, root, path, &key, 0);

	btrfs_release_path(path);
	if (ret == 0) {
		if (!inode->i_nlink)
			set_nlink(inode, 1);
		else
			inc_nlink(inode);
		ret = btrfs_update_inode(trans, root, inode);
	} else if (ret == -EEXIST) {
		ret = 0;
	} else {
		BUG(); /* Logic Error */
	}
	iput(inode);

	return ret;
}
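
/*
 * Illustrative item layout (a sketch drawn from the code above):
 * link_to_fixup_dir() records inode 257 for a post-replay link count
 * check by inserting an empty item with the key
 *
 *	key (BTRFS_TREE_LOG_FIXUP_OBJECTID ORPHAN_ITEM 257)
 *
 * fixup_inode_link_counts() later walks these keys from the highest
 * offset down, deleting each one and fixing up the referenced inode.
 */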

/*
 * when replaying the log for a directory, we only insert names
 * for inodes that actually exist.  This means an fsync on a directory
 * does not implicitly fsync all the new files in it
 */
static noinline int insert_one_name(struct btrfs_trans_handle *trans,
				    struct btrfs_root *root,
				    u64 dirid, u64 index,
				    char *name, int name_len,
				    struct btrfs_key *location)
{
	struct inode *inode;
	struct inode *dir;
	int ret;

	inode = read_one_inode(root, location->objectid);
	if (!inode)
		return -ENOENT;

	dir = read_one_inode(root, dirid);
	if (!dir) {
		iput(inode);
		return -EIO;
	}

	ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), name,
			name_len, 1, index);

	/* FIXME, put inode into FIXUP list */

	iput(inode);
	iput(dir);
	return ret;
}

/*
 * Return true if an inode reference exists in the log for the given name,
 * inode and parent inode.
 */
static bool name_in_log_ref(struct btrfs_root *log_root,
			    const char *name, const int name_len,
			    const u64 dirid, const u64 ino)
{
	struct btrfs_key search_key;

	search_key.objectid = ino;
	search_key.type = BTRFS_INODE_REF_KEY;
	search_key.offset = dirid;
	if (backref_in_log(log_root, &search_key, dirid, name, name_len))
		return true;

	search_key.type = BTRFS_INODE_EXTREF_KEY;
	search_key.offset = btrfs_extref_hash(dirid, name, name_len);
	if (backref_in_log(log_root, &search_key, dirid, name, name_len))
		return true;

	return false;
}
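
/*
 * Illustrative example (sketch): for a name "foo" of length 3 linking
 * inode 257 into directory 256, name_in_log_ref() above probes the log
 * with these two keys:
 *
 *	key (257 INODE_REF 256)
 *	key (257 INODE_EXTREF btrfs_extref_hash(256, "foo", 3))
 */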

/*
 * take a single entry in a log directory item and replay it into
 * the subvolume.
 *
 * if a conflicting item exists in the subdirectory already,
 * the inode it points to is unlinked and put into the link count
 * fix up tree.
 *
 * If a name from the log points to a file or directory that does
 * not exist in the FS, it is skipped.  fsyncs on directories
 * do not force down inodes inside that directory, just changes to the
 * names or unlinks in a directory.
 *
 * Returns < 0 on error, 0 if the name wasn't replayed (dentry points to a
 * non-existing inode) and 1 if the name was replayed.
 */
static noinline int replay_one_name(struct btrfs_trans_handle *trans,
				    struct btrfs_root *root,
				    struct btrfs_path *path,
				    struct extent_buffer *eb,
				    struct btrfs_dir_item *di,
				    struct btrfs_key *key)
{
	char *name;
	int name_len;
	struct btrfs_dir_item *dst_di;
	struct btrfs_key found_key;
	struct btrfs_key log_key;
	struct inode *dir;
	u8 log_type;
	int exists;
	int ret = 0;
	bool update_size = (key->type == BTRFS_DIR_INDEX_KEY);
	bool name_added = false;

	dir = read_one_inode(root, key->objectid);
	if (!dir)
		return -EIO;

	name_len = btrfs_dir_name_len(eb, di);
	name = kmalloc(name_len, GFP_NOFS);
	if (!name) {
		ret = -ENOMEM;
		goto out;
	}

	log_type = btrfs_dir_type(eb, di);
	read_extent_buffer(eb, name, (unsigned long)(di + 1),
			   name_len);

	btrfs_dir_item_key_to_cpu(eb, di, &log_key);
	exists = btrfs_lookup_inode(trans, root, path, &log_key, 0);
	if (exists == 0)
		exists = 1;
	else
		exists = 0;
	btrfs_release_path(path);

	if (key->type == BTRFS_DIR_ITEM_KEY) {
		dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid,
				       name, name_len, 1);
	} else if (key->type == BTRFS_DIR_INDEX_KEY) {
		dst_di = btrfs_lookup_dir_index_item(trans, root, path,
						     key->objectid,
						     key->offset, name,
						     name_len, 1);
	} else {
		/* Corruption */
		ret = -EINVAL;
		goto out;
	}
	if (IS_ERR_OR_NULL(dst_di)) {
		/* we need a sequence number to insert, so we only
		 * do inserts for the BTRFS_DIR_INDEX_KEY types
		 */
		if (key->type != BTRFS_DIR_INDEX_KEY)
			goto out;
		goto insert;
	}

	btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key);
	/* the existing item matches the logged item */
	if (found_key.objectid == log_key.objectid &&
	    found_key.type == log_key.type &&
	    found_key.offset == log_key.offset &&
	    btrfs_dir_type(path->nodes[0], dst_di) == log_type) {
		update_size = false;
		goto out;
	}

	/*
	 * don't drop the conflicting directory entry if the inode
	 * for the new entry doesn't exist
	 */
	if (!exists)
		goto out;

	ret = drop_one_dir_item(trans, root, path, BTRFS_I(dir), dst_di);
	if (ret)
		goto out;

	if (key->type == BTRFS_DIR_INDEX_KEY)
		goto insert;
out:
	btrfs_release_path(path);
	if (!ret && update_size) {
		btrfs_i_size_write(BTRFS_I(dir), dir->i_size + name_len * 2);
		ret = btrfs_update_inode(trans, root, dir);
	}
	kfree(name);
	iput(dir);
	if (!ret && name_added)
		ret = 1;
	return ret;

insert:
	if (name_in_log_ref(root->log_root, name, name_len,
			    key->objectid, log_key.objectid)) {
		/* The dentry will be added later. */
		ret = 0;
		update_size = false;
		goto out;
	}
	btrfs_release_path(path);
	ret = insert_one_name(trans, root, key->objectid, key->offset,
			      name, name_len, &log_key);
	if (ret && ret != -ENOENT && ret != -EEXIST)
		goto out;
	if (!ret)
		name_added = true;
	update_size = false;
	ret = 0;
	goto out;
}

/*
 * find all the names in a directory item and reconcile them into
 * the subvolume.  Only BTRFS_DIR_ITEM_KEY types will have more than
 * one name in a directory item, but the same code gets used for
 * both directory index types
 */
static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
					struct btrfs_root *root,
					struct btrfs_path *path,
					struct extent_buffer *eb, int slot,
					struct btrfs_key *key)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	int ret = 0;
	u32 item_size = btrfs_item_size_nr(eb, slot);
	struct btrfs_dir_item *di;
	int name_len;
	unsigned long ptr;
	unsigned long ptr_end;
	struct btrfs_path *fixup_path = NULL;

	ptr = btrfs_item_ptr_offset(eb, slot);
	ptr_end = ptr + item_size;
	while (ptr < ptr_end) {
		di = (struct btrfs_dir_item *)ptr;
		if (verify_dir_item(fs_info, eb, slot, di))
			return -EIO;
		name_len = btrfs_dir_name_len(eb, di);
		ret = replay_one_name(trans, root, path, eb, di, key);
		if (ret < 0)
			break;
		ptr = (unsigned long)(di + 1);
		ptr += name_len;

		/*
		 * If this entry refers to a non-directory (directories can not
		 * have a link count > 1) and it was added in the transaction
		 * that was not committed, make sure we fixup the link count of
		 * the inode the entry points to.  Otherwise something like
		 * the following would result in a directory pointing to an
		 * inode with a wrong link that does not account for this dir
		 * entry:
		 *
		 * mkdir testdir
		 * touch testdir/foo
		 * touch testdir/bar
		 * sync
		 *
		 * ln testdir/bar testdir/bar_link
		 * ln testdir/foo testdir/foo_link
		 * xfs_io -c "fsync" testdir/bar
		 *
		 * <power failure>
		 *
		 * mount fs, log replay happens
		 *
		 * File foo would remain with a link count of 1 when it has two
		 * entries pointing to it in the directory testdir.  This would
		 * make it impossible to ever delete the parent directory as
		 * it would result in stale dentries that can never be deleted.
		 */
		if (ret == 1 && btrfs_dir_type(eb, di) != BTRFS_FT_DIR) {
			struct btrfs_key di_key;

			if (!fixup_path) {
				fixup_path = btrfs_alloc_path();
				if (!fixup_path) {
					ret = -ENOMEM;
					break;
				}
			}

			btrfs_dir_item_key_to_cpu(eb, di, &di_key);
			ret = link_to_fixup_dir(trans, root, fixup_path,
						di_key.objectid);
			if (ret)
				break;
		}
		ret = 0;
	}
	btrfs_free_path(fixup_path);
	return ret;
}

/*
 * directory replay has two parts.  There are the standard directory
 * items in the log copied from the subvolume, and range items
 * created in the log while the subvolume was logged.
 *
 * The range items tell us which parts of the key space the log
 * is authoritative for.  During replay, if a key in the subvolume
 * directory is in a logged range item, but not actually in the log
 * that means it was deleted from the directory before the fsync
 * and should be removed.
 */
static noinline int find_dir_range(struct btrfs_root *root,
				   struct btrfs_path *path,
				   u64 dirid, int key_type,
				   u64 *start_ret, u64 *end_ret)
{
	struct btrfs_key key;
	u64 found_end;
	struct btrfs_dir_log_item *item;
	int ret;
	int nritems;

	if (*start_ret == (u64)-1)
		return 1;

	key.objectid = dirid;
	key.type = key_type;
	key.offset = *start_ret;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto out;
	if (ret > 0) {
		if (path->slots[0] == 0)
			goto out;
		path->slots[0]--;
	}
	if (ret != 0)
		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);

	if (key.type != key_type || key.objectid != dirid) {
		ret = 1;
		goto next;
	}
	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
			      struct btrfs_dir_log_item);
	found_end = btrfs_dir_log_end(path->nodes[0], item);

	if (*start_ret >= key.offset && *start_ret <= found_end) {
		ret = 0;
		*start_ret = key.offset;
		*end_ret = found_end;
		goto out;
	}
	ret = 1;
next:
	/* check the next slot in the tree to see if it is a valid item */
	nritems = btrfs_header_nritems(path->nodes[0]);
	path->slots[0]++;
	if (path->slots[0] >= nritems) {
		ret = btrfs_next_leaf(root, path);
		if (ret)
			goto out;
	}

	btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);

	if (key.type != key_type || key.objectid != dirid) {
		ret = 1;
		goto out;
	}
	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
			      struct btrfs_dir_log_item);
	found_end = btrfs_dir_log_end(path->nodes[0], item);
	*start_ret = key.offset;
	*end_ret = found_end;
	ret = 0;
out:
	btrfs_release_path(path);
	return ret;
}
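
/*
 * Illustrative example (sketch): if the log contains a dir log item
 *
 *	key (256 DIR_LOG_INDEX 2) with dir_log_end = 10
 *
 * then find_dir_range() reports the log as authoritative for index keys
 * 2..10 of directory 256; an index in that range that exists in the
 * subvolume but not in the log was deleted before the fsync.
 */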
If the directory 1994 * item is not in the log, the item is removed and the inode it points 1995 * to is unlinked 1996 */ 1997 static noinline int check_item_in_log(struct btrfs_trans_handle *trans, 1998 struct btrfs_root *root, 1999 struct btrfs_root *log, 2000 struct btrfs_path *path, 2001 struct btrfs_path *log_path, 2002 struct inode *dir, 2003 struct btrfs_key *dir_key) 2004 { 2005 struct btrfs_fs_info *fs_info = root->fs_info; 2006 int ret; 2007 struct extent_buffer *eb; 2008 int slot; 2009 u32 item_size; 2010 struct btrfs_dir_item *di; 2011 struct btrfs_dir_item *log_di; 2012 int name_len; 2013 unsigned long ptr; 2014 unsigned long ptr_end; 2015 char *name; 2016 struct inode *inode; 2017 struct btrfs_key location; 2018 2019 again: 2020 eb = path->nodes[0]; 2021 slot = path->slots[0]; 2022 item_size = btrfs_item_size_nr(eb, slot); 2023 ptr = btrfs_item_ptr_offset(eb, slot); 2024 ptr_end = ptr + item_size; 2025 while (ptr < ptr_end) { 2026 di = (struct btrfs_dir_item *)ptr; 2027 if (verify_dir_item(fs_info, eb, slot, di)) { 2028 ret = -EIO; 2029 goto out; 2030 } 2031 2032 name_len = btrfs_dir_name_len(eb, di); 2033 name = kmalloc(name_len, GFP_NOFS); 2034 if (!name) { 2035 ret = -ENOMEM; 2036 goto out; 2037 } 2038 read_extent_buffer(eb, name, (unsigned long)(di + 1), 2039 name_len); 2040 log_di = NULL; 2041 if (log && dir_key->type == BTRFS_DIR_ITEM_KEY) { 2042 log_di = btrfs_lookup_dir_item(trans, log, log_path, 2043 dir_key->objectid, 2044 name, name_len, 0); 2045 } else if (log && dir_key->type == BTRFS_DIR_INDEX_KEY) { 2046 log_di = btrfs_lookup_dir_index_item(trans, log, 2047 log_path, 2048 dir_key->objectid, 2049 dir_key->offset, 2050 name, name_len, 0); 2051 } 2052 if (!log_di || (IS_ERR(log_di) && PTR_ERR(log_di) == -ENOENT)) { 2053 btrfs_dir_item_key_to_cpu(eb, di, &location); 2054 btrfs_release_path(path); 2055 btrfs_release_path(log_path); 2056 inode = read_one_inode(root, location.objectid); 2057 if (!inode) { 2058 kfree(name); 2059 return -EIO; 2060 } 2061 2062 ret = link_to_fixup_dir(trans, root, 2063 path, location.objectid); 2064 if (ret) { 2065 kfree(name); 2066 iput(inode); 2067 goto out; 2068 } 2069 2070 inc_nlink(inode); 2071 ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir), 2072 BTRFS_I(inode), name, name_len); 2073 if (!ret) 2074 ret = btrfs_run_delayed_items(trans, fs_info); 2075 kfree(name); 2076 iput(inode); 2077 if (ret) 2078 goto out; 2079 2080 /* there might still be more names under this key 2081 * check and repeat if required 2082 */ 2083 ret = btrfs_search_slot(NULL, root, dir_key, path, 2084 0, 0); 2085 if (ret == 0) 2086 goto again; 2087 ret = 0; 2088 goto out; 2089 } else if (IS_ERR(log_di)) { 2090 kfree(name); 2091 return PTR_ERR(log_di); 2092 } 2093 btrfs_release_path(log_path); 2094 kfree(name); 2095 2096 ptr = (unsigned long)(di + 1); 2097 ptr += name_len; 2098 } 2099 ret = 0; 2100 out: 2101 btrfs_release_path(path); 2102 btrfs_release_path(log_path); 2103 return ret; 2104 } 2105 2106 static int replay_xattr_deletes(struct btrfs_trans_handle *trans, 2107 struct btrfs_root *root, 2108 struct btrfs_root *log, 2109 struct btrfs_path *path, 2110 const u64 ino) 2111 { 2112 struct btrfs_fs_info *fs_info = root->fs_info; 2113 struct btrfs_key search_key; 2114 struct btrfs_path *log_path; 2115 int i; 2116 int nritems; 2117 int ret; 2118 2119 log_path = btrfs_alloc_path(); 2120 if (!log_path) 2121 return -ENOMEM; 2122 2123 search_key.objectid = ino; 2124 search_key.type = BTRFS_XATTR_ITEM_KEY; 2125 search_key.offset = 0; 2126 again: 2127 ret = 
btrfs_search_slot(NULL, root, &search_key, path, 0, 0); 2128 if (ret < 0) 2129 goto out; 2130 process_leaf: 2131 nritems = btrfs_header_nritems(path->nodes[0]); 2132 for (i = path->slots[0]; i < nritems; i++) { 2133 struct btrfs_key key; 2134 struct btrfs_dir_item *di; 2135 struct btrfs_dir_item *log_di; 2136 u32 total_size; 2137 u32 cur; 2138 2139 btrfs_item_key_to_cpu(path->nodes[0], &key, i); 2140 if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY) { 2141 ret = 0; 2142 goto out; 2143 } 2144 2145 di = btrfs_item_ptr(path->nodes[0], i, struct btrfs_dir_item); 2146 total_size = btrfs_item_size_nr(path->nodes[0], i); 2147 cur = 0; 2148 while (cur < total_size) { 2149 u16 name_len = btrfs_dir_name_len(path->nodes[0], di); 2150 u16 data_len = btrfs_dir_data_len(path->nodes[0], di); 2151 u32 this_len = sizeof(*di) + name_len + data_len; 2152 char *name; 2153 2154 ret = verify_dir_item(fs_info, path->nodes[0], i, di); 2155 if (ret) { 2156 ret = -EIO; 2157 goto out; 2158 } 2159 name = kmalloc(name_len, GFP_NOFS); 2160 if (!name) { 2161 ret = -ENOMEM; 2162 goto out; 2163 } 2164 read_extent_buffer(path->nodes[0], name, 2165 (unsigned long)(di + 1), name_len); 2166 2167 log_di = btrfs_lookup_xattr(NULL, log, log_path, ino, 2168 name, name_len, 0); 2169 btrfs_release_path(log_path); 2170 if (!log_di) { 2171 /* Doesn't exist in log tree, so delete it. */ 2172 btrfs_release_path(path); 2173 di = btrfs_lookup_xattr(trans, root, path, ino, 2174 name, name_len, -1); 2175 kfree(name); 2176 if (IS_ERR(di)) { 2177 ret = PTR_ERR(di); 2178 goto out; 2179 } 2180 ASSERT(di); 2181 ret = btrfs_delete_one_dir_name(trans, root, 2182 path, di); 2183 if (ret) 2184 goto out; 2185 btrfs_release_path(path); 2186 search_key = key; 2187 goto again; 2188 } 2189 kfree(name); 2190 if (IS_ERR(log_di)) { 2191 ret = PTR_ERR(log_di); 2192 goto out; 2193 } 2194 cur += this_len; 2195 di = (struct btrfs_dir_item *)((char *)di + this_len); 2196 } 2197 } 2198 ret = btrfs_next_leaf(root, path); 2199 if (ret > 0) 2200 ret = 0; 2201 else if (ret == 0) 2202 goto process_leaf; 2203 out: 2204 btrfs_free_path(log_path); 2205 btrfs_release_path(path); 2206 return ret; 2207 } 2208 2209 2210 /* 2211 * deletion replay happens before we copy any new directory items 2212 * out of the log or out of backreferences from inodes. It 2213 * scans the log to find ranges of keys that log is authoritative for, 2214 * and then scans the directory to find items in those ranges that are 2215 * not present in the log. 2216 * 2217 * Anything we don't find in the log is unlinked and removed from the 2218 * directory. 
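 *
 * The scan below runs twice, once over the BTRFS_DIR_LOG_ITEM_KEY
 * ranges and once over the BTRFS_DIR_LOG_INDEX_KEY ranges, since every
 * directory entry is indexed under both BTRFS_DIR_ITEM_KEY and
 * BTRFS_DIR_INDEX_KEY.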
2219 */ 2220 static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, 2221 struct btrfs_root *root, 2222 struct btrfs_root *log, 2223 struct btrfs_path *path, 2224 u64 dirid, int del_all) 2225 { 2226 u64 range_start; 2227 u64 range_end; 2228 int key_type = BTRFS_DIR_LOG_ITEM_KEY; 2229 int ret = 0; 2230 struct btrfs_key dir_key; 2231 struct btrfs_key found_key; 2232 struct btrfs_path *log_path; 2233 struct inode *dir; 2234 2235 dir_key.objectid = dirid; 2236 dir_key.type = BTRFS_DIR_ITEM_KEY; 2237 log_path = btrfs_alloc_path(); 2238 if (!log_path) 2239 return -ENOMEM; 2240 2241 dir = read_one_inode(root, dirid); 2242 /* it isn't an error if the inode isn't there, that can happen 2243 * because we replay the deletes before we copy in the inode item 2244 * from the log 2245 */ 2246 if (!dir) { 2247 btrfs_free_path(log_path); 2248 return 0; 2249 } 2250 again: 2251 range_start = 0; 2252 range_end = 0; 2253 while (1) { 2254 if (del_all) 2255 range_end = (u64)-1; 2256 else { 2257 ret = find_dir_range(log, path, dirid, key_type, 2258 &range_start, &range_end); 2259 if (ret != 0) 2260 break; 2261 } 2262 2263 dir_key.offset = range_start; 2264 while (1) { 2265 int nritems; 2266 ret = btrfs_search_slot(NULL, root, &dir_key, path, 2267 0, 0); 2268 if (ret < 0) 2269 goto out; 2270 2271 nritems = btrfs_header_nritems(path->nodes[0]); 2272 if (path->slots[0] >= nritems) { 2273 ret = btrfs_next_leaf(root, path); 2274 if (ret) 2275 break; 2276 } 2277 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 2278 path->slots[0]); 2279 if (found_key.objectid != dirid || 2280 found_key.type != dir_key.type) 2281 goto next_type; 2282 2283 if (found_key.offset > range_end) 2284 break; 2285 2286 ret = check_item_in_log(trans, root, log, path, 2287 log_path, dir, 2288 &found_key); 2289 if (ret) 2290 goto out; 2291 if (found_key.offset == (u64)-1) 2292 break; 2293 dir_key.offset = found_key.offset + 1; 2294 } 2295 btrfs_release_path(path); 2296 if (range_end == (u64)-1) 2297 break; 2298 range_start = range_end + 1; 2299 } 2300 2301 next_type: 2302 ret = 0; 2303 if (key_type == BTRFS_DIR_LOG_ITEM_KEY) { 2304 key_type = BTRFS_DIR_LOG_INDEX_KEY; 2305 dir_key.type = BTRFS_DIR_INDEX_KEY; 2306 btrfs_release_path(path); 2307 goto again; 2308 } 2309 out: 2310 btrfs_release_path(path); 2311 btrfs_free_path(log_path); 2312 iput(dir); 2313 return ret; 2314 } 2315 2316 /* 2317 * the process_func used to replay items from the log tree. This 2318 * gets called in two different stages. The first stage just looks 2319 * for inodes and makes sure they are all copied into the subvolume. 2320 * 2321 * The second stage copies all the other item types from the log into 2322 * the subvolume. The two stage approach is slower, but gets rid of 2323 * lots of complexity around inodes referencing other inodes that exist 2324 * only in the log (references come from either directory items or inode 2325 * back refs). 
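 *
 * Per stage, roughly:
 *
 *   LOG_WALK_REPLAY_INODES:    copy inode items, replay xattr and
 *                              directory deletes, add link count fixups
 *   LOG_WALK_REPLAY_DIR_INDEX: replay BTRFS_DIR_INDEX_KEY items
 *   LOG_WALK_REPLAY_ALL:       copy xattrs, inode refs, file extents
 *                              and BTRFS_DIR_ITEM_KEY items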
2326  */
2327 static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
2328 			     struct walk_control *wc, u64 gen)
2329 {
2330 	int nritems;
2331 	struct btrfs_path *path;
2332 	struct btrfs_root *root = wc->replay_dest;
2333 	struct btrfs_key key;
2334 	int level;
2335 	int i;
2336 	int ret;
2337 
2338 	ret = btrfs_read_buffer(eb, gen);
2339 	if (ret)
2340 		return ret;
2341 
2342 	level = btrfs_header_level(eb);
2343 
2344 	if (level != 0)
2345 		return 0;
2346 
2347 	path = btrfs_alloc_path();
2348 	if (!path)
2349 		return -ENOMEM;
2350 
2351 	nritems = btrfs_header_nritems(eb);
2352 	for (i = 0; i < nritems; i++) {
2353 		btrfs_item_key_to_cpu(eb, &key, i);
2354 
2355 		/* inode keys are done during the first stage */
2356 		if (key.type == BTRFS_INODE_ITEM_KEY &&
2357 		    wc->stage == LOG_WALK_REPLAY_INODES) {
2358 			struct btrfs_inode_item *inode_item;
2359 			u32 mode;
2360 
2361 			inode_item = btrfs_item_ptr(eb, i,
2362 					    struct btrfs_inode_item);
2363 			ret = replay_xattr_deletes(wc->trans, root, log,
2364 						   path, key.objectid);
2365 			if (ret)
2366 				break;
2367 			mode = btrfs_inode_mode(eb, inode_item);
2368 			if (S_ISDIR(mode)) {
2369 				ret = replay_dir_deletes(wc->trans,
2370 					 root, log, path, key.objectid, 0);
2371 				if (ret)
2372 					break;
2373 			}
2374 			ret = overwrite_item(wc->trans, root, path,
2375 					     eb, i, &key);
2376 			if (ret)
2377 				break;
2378 
2379 			/* for regular files, make sure the corresponding
2380 			 * orphan item exists. extents past the new EOF
2381 			 * will be truncated later by orphan cleanup.
2382 			 */
2383 			if (S_ISREG(mode)) {
2384 				ret = insert_orphan_item(wc->trans, root,
2385 							 key.objectid);
2386 				if (ret)
2387 					break;
2388 			}
2389 
2390 			ret = link_to_fixup_dir(wc->trans, root,
2391 						path, key.objectid);
2392 			if (ret)
2393 				break;
2394 		}
2395 
2396 		if (key.type == BTRFS_DIR_INDEX_KEY &&
2397 		    wc->stage == LOG_WALK_REPLAY_DIR_INDEX) {
2398 			ret = replay_one_dir_item(wc->trans, root, path,
2399 						  eb, i, &key);
2400 			if (ret)
2401 				break;
2402 		}
2403 
2404 		if (wc->stage < LOG_WALK_REPLAY_ALL)
2405 			continue;
2406 
2407 		/* these keys are simply copied */
2408 		if (key.type == BTRFS_XATTR_ITEM_KEY) {
2409 			ret = overwrite_item(wc->trans, root, path,
2410 					     eb, i, &key);
2411 			if (ret)
2412 				break;
2413 		} else if (key.type == BTRFS_INODE_REF_KEY ||
2414 			   key.type == BTRFS_INODE_EXTREF_KEY) {
2415 			ret = add_inode_ref(wc->trans, root, log, path,
2416 					    eb, i, &key);
2417 			if (ret && ret != -ENOENT)
2418 				break;
2419 			ret = 0;
2420 		} else if (key.type == BTRFS_EXTENT_DATA_KEY) {
2421 			ret = replay_one_extent(wc->trans, root, path,
2422 						eb, i, &key);
2423 			if (ret)
2424 				break;
2425 		} else if (key.type == BTRFS_DIR_ITEM_KEY) {
2426 			ret = replay_one_dir_item(wc->trans, root, path,
2427 						  eb, i, &key);
2428 			if (ret)
2429 				break;
2430 		}
2431 	}
2432 	btrfs_free_path(path);
2433 	return ret;
2434 }
2435 
2436 static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
2437 				   struct btrfs_root *root,
2438 				   struct btrfs_path *path, int *level,
2439 				   struct walk_control *wc)
2440 {
2441 	struct btrfs_fs_info *fs_info = root->fs_info;
2442 	u64 root_owner;
2443 	u64 bytenr;
2444 	u64 ptr_gen;
2445 	struct extent_buffer *next;
2446 	struct extent_buffer *cur;
2447 	struct extent_buffer *parent;
2448 	u32 blocksize;
2449 	int ret = 0;
2450 
2451 	WARN_ON(*level < 0);
2452 	WARN_ON(*level >= BTRFS_MAX_LEVEL);
2453 
2454 	while (*level > 0) {
2455 		WARN_ON(*level < 0);
2456 		WARN_ON(*level >= BTRFS_MAX_LEVEL);
2457 		cur = path->nodes[*level];
2458 
2459 		WARN_ON(btrfs_header_level(cur) != *level);
2460 
2461 		if (path->slots[*level] >=
2462 		    btrfs_header_nritems(cur))
2463 			break;
2464 
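		/*
		 * Read the block pointer and generation of the child at the
		 * current slot, so the child block can be processed (and,
		 * when wc->free is set, cleaned and unpinned) below.
		 */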
2465 		bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
2466 		ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
2467 		blocksize = fs_info->nodesize;
2468 
2469 		parent = path->nodes[*level];
2470 		root_owner = btrfs_header_owner(parent);
2471 
2472 		next = btrfs_find_create_tree_block(fs_info, bytenr);
2473 		if (IS_ERR(next))
2474 			return PTR_ERR(next);
2475 
2476 		if (*level == 1) {
2477 			ret = wc->process_func(root, next, wc, ptr_gen);
2478 			if (ret) {
2479 				free_extent_buffer(next);
2480 				return ret;
2481 			}
2482 
2483 			path->slots[*level]++;
2484 			if (wc->free) {
2485 				ret = btrfs_read_buffer(next, ptr_gen);
2486 				if (ret) {
2487 					free_extent_buffer(next);
2488 					return ret;
2489 				}
2490 
2491 				if (trans) {
2492 					btrfs_tree_lock(next);
2493 					btrfs_set_lock_blocking(next);
2494 					clean_tree_block(fs_info, next);
2495 					btrfs_wait_tree_block_writeback(next);
2496 					btrfs_tree_unlock(next);
2497 				}
2498 
2499 				WARN_ON(root_owner !=
2500 					BTRFS_TREE_LOG_OBJECTID);
2501 				ret = btrfs_free_and_pin_reserved_extent(
2502 							fs_info, bytenr,
2503 							blocksize);
2504 				if (ret) {
2505 					free_extent_buffer(next);
2506 					return ret;
2507 				}
2508 			}
2509 			free_extent_buffer(next);
2510 			continue;
2511 		}
2512 		ret = btrfs_read_buffer(next, ptr_gen);
2513 		if (ret) {
2514 			free_extent_buffer(next);
2515 			return ret;
2516 		}
2517 
2518 		WARN_ON(*level <= 0);
2519 		if (path->nodes[*level-1])
2520 			free_extent_buffer(path->nodes[*level-1]);
2521 		path->nodes[*level-1] = next;
2522 		*level = btrfs_header_level(next);
2523 		path->slots[*level] = 0;
2524 		cond_resched();
2525 	}
2526 	WARN_ON(*level < 0);
2527 	WARN_ON(*level >= BTRFS_MAX_LEVEL);
2528 
2529 	path->slots[*level] = btrfs_header_nritems(path->nodes[*level]);
2530 
2531 	cond_resched();
2532 	return 0;
2533 }
2534 
2535 static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
2536 				 struct btrfs_root *root,
2537 				 struct btrfs_path *path, int *level,
2538 				 struct walk_control *wc)
2539 {
2540 	struct btrfs_fs_info *fs_info = root->fs_info;
2541 	u64 root_owner;
2542 	int i;
2543 	int slot;
2544 	int ret;
2545 
2546 	for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) {
2547 		slot = path->slots[i];
2548 		if (slot + 1 < btrfs_header_nritems(path->nodes[i])) {
2549 			path->slots[i]++;
2550 			*level = i;
2551 			WARN_ON(*level == 0);
2552 			return 0;
2553 		} else {
2554 			struct extent_buffer *parent;
2555 			if (path->nodes[*level] == root->node)
2556 				parent = path->nodes[*level];
2557 			else
2558 				parent = path->nodes[*level + 1];
2559 
2560 			root_owner = btrfs_header_owner(parent);
2561 			ret = wc->process_func(root, path->nodes[*level], wc,
2562 				 btrfs_header_generation(path->nodes[*level]));
2563 			if (ret)
2564 				return ret;
2565 
2566 			if (wc->free) {
2567 				struct extent_buffer *next;
2568 
2569 				next = path->nodes[*level];
2570 
2571 				if (trans) {
2572 					btrfs_tree_lock(next);
2573 					btrfs_set_lock_blocking(next);
2574 					clean_tree_block(fs_info, next);
2575 					btrfs_wait_tree_block_writeback(next);
2576 					btrfs_tree_unlock(next);
2577 				}
2578 
2579 				WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
2580 				ret = btrfs_free_and_pin_reserved_extent(
2581 						fs_info,
2582 						path->nodes[*level]->start,
2583 						path->nodes[*level]->len);
2584 				if (ret)
2585 					return ret;
2586 			}
2587 			free_extent_buffer(path->nodes[*level]);
2588 			path->nodes[*level] = NULL;
2589 			*level = i + 1;
2590 		}
2591 	}
2592 	return 1;
2593 }
2594 
2595 /*
2596  * drop the reference count on the tree rooted at 'log'.  This traverses
2597  * the tree freeing any blocks that have a ref count of zero after being
2598  * decremented.
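 *
 * walk_down_log_tree() descends toward the leaves and walk_up_log_tree()
 * climbs back up, so wc->process_func is called once for every block;
 * the root node itself is handled separately after the loop in
 * walk_log_tree().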
2599 */ 2600 static int walk_log_tree(struct btrfs_trans_handle *trans, 2601 struct btrfs_root *log, struct walk_control *wc) 2602 { 2603 struct btrfs_fs_info *fs_info = log->fs_info; 2604 int ret = 0; 2605 int wret; 2606 int level; 2607 struct btrfs_path *path; 2608 int orig_level; 2609 2610 path = btrfs_alloc_path(); 2611 if (!path) 2612 return -ENOMEM; 2613 2614 level = btrfs_header_level(log->node); 2615 orig_level = level; 2616 path->nodes[level] = log->node; 2617 extent_buffer_get(log->node); 2618 path->slots[level] = 0; 2619 2620 while (1) { 2621 wret = walk_down_log_tree(trans, log, path, &level, wc); 2622 if (wret > 0) 2623 break; 2624 if (wret < 0) { 2625 ret = wret; 2626 goto out; 2627 } 2628 2629 wret = walk_up_log_tree(trans, log, path, &level, wc); 2630 if (wret > 0) 2631 break; 2632 if (wret < 0) { 2633 ret = wret; 2634 goto out; 2635 } 2636 } 2637 2638 /* was the root node processed? if not, catch it here */ 2639 if (path->nodes[orig_level]) { 2640 ret = wc->process_func(log, path->nodes[orig_level], wc, 2641 btrfs_header_generation(path->nodes[orig_level])); 2642 if (ret) 2643 goto out; 2644 if (wc->free) { 2645 struct extent_buffer *next; 2646 2647 next = path->nodes[orig_level]; 2648 2649 if (trans) { 2650 btrfs_tree_lock(next); 2651 btrfs_set_lock_blocking(next); 2652 clean_tree_block(fs_info, next); 2653 btrfs_wait_tree_block_writeback(next); 2654 btrfs_tree_unlock(next); 2655 } 2656 2657 WARN_ON(log->root_key.objectid != 2658 BTRFS_TREE_LOG_OBJECTID); 2659 ret = btrfs_free_and_pin_reserved_extent(fs_info, 2660 next->start, next->len); 2661 if (ret) 2662 goto out; 2663 } 2664 } 2665 2666 out: 2667 btrfs_free_path(path); 2668 return ret; 2669 } 2670 2671 /* 2672 * helper function to update the item for a given subvolumes log root 2673 * in the tree of log roots 2674 */ 2675 static int update_log_root(struct btrfs_trans_handle *trans, 2676 struct btrfs_root *log) 2677 { 2678 struct btrfs_fs_info *fs_info = log->fs_info; 2679 int ret; 2680 2681 if (log->log_transid == 1) { 2682 /* insert root item on the first sync */ 2683 ret = btrfs_insert_root(trans, fs_info->log_root_tree, 2684 &log->root_key, &log->root_item); 2685 } else { 2686 ret = btrfs_update_root(trans, fs_info->log_root_tree, 2687 &log->root_key, &log->root_item); 2688 } 2689 return ret; 2690 } 2691 2692 static void wait_log_commit(struct btrfs_root *root, int transid) 2693 { 2694 DEFINE_WAIT(wait); 2695 int index = transid % 2; 2696 2697 /* 2698 * we only allow two pending log transactions at a time, 2699 * so we know that if ours is more than 2 older than the 2700 * current transaction, we're done 2701 */ 2702 do { 2703 prepare_to_wait(&root->log_commit_wait[index], 2704 &wait, TASK_UNINTERRUPTIBLE); 2705 mutex_unlock(&root->log_mutex); 2706 2707 if (root->log_transid_committed < transid && 2708 atomic_read(&root->log_commit[index])) 2709 schedule(); 2710 2711 finish_wait(&root->log_commit_wait[index], &wait); 2712 mutex_lock(&root->log_mutex); 2713 } while (root->log_transid_committed < transid && 2714 atomic_read(&root->log_commit[index])); 2715 } 2716 2717 static void wait_for_writer(struct btrfs_root *root) 2718 { 2719 DEFINE_WAIT(wait); 2720 2721 while (atomic_read(&root->log_writers)) { 2722 prepare_to_wait(&root->log_writer_wait, 2723 &wait, TASK_UNINTERRUPTIBLE); 2724 mutex_unlock(&root->log_mutex); 2725 if (atomic_read(&root->log_writers)) 2726 schedule(); 2727 finish_wait(&root->log_writer_wait, &wait); 2728 mutex_lock(&root->log_mutex); 2729 } 2730 } 2731 2732 static inline void 
btrfs_remove_log_ctx(struct btrfs_root *root,
2733 		     struct btrfs_log_ctx *ctx)
2734 {
2735 	if (!ctx)
2736 		return;
2737 
2738 	mutex_lock(&root->log_mutex);
2739 	list_del_init(&ctx->list);
2740 	mutex_unlock(&root->log_mutex);
2741 }
2742 
2743 /*
2744  * Invoked in log mutex context, or from a context where no other task
2745  * can access the list.
2746  */
2747 static inline void btrfs_remove_all_log_ctxs(struct btrfs_root *root,
2748 					     int index, int error)
2749 {
2750 	struct btrfs_log_ctx *ctx;
2751 	struct btrfs_log_ctx *safe;
2752 
2753 	list_for_each_entry_safe(ctx, safe, &root->log_ctxs[index], list) {
2754 		list_del_init(&ctx->list);
2755 		ctx->log_ret = error;
2756 	}
2757 
2758 	INIT_LIST_HEAD(&root->log_ctxs[index]);
2759 }
2760 
2761 /*
2762  * btrfs_sync_log sends a given tree log down to the disk and
2763  * updates the super blocks to record it.  When this call is done,
2764  * you know that any inodes previously logged are safely on disk only
2765  * if it returns 0.
2766  *
2767  * Any other return value means you need to call btrfs_commit_transaction.
2768  * Some of the edge cases for fsyncing directories that have had unlinks
2769  * or renames done in the past mean that sometimes the only safe
2770  * fsync is to commit the whole FS.  When btrfs_sync_log returns -EAGAIN,
2771  * that has happened.
2772  */
2773 int btrfs_sync_log(struct btrfs_trans_handle *trans,
2774 		   struct btrfs_root *root, struct btrfs_log_ctx *ctx)
2775 {
2776 	int index1;
2777 	int index2;
2778 	int mark;
2779 	int ret;
2780 	struct btrfs_fs_info *fs_info = root->fs_info;
2781 	struct btrfs_root *log = root->log_root;
2782 	struct btrfs_root *log_root_tree = fs_info->log_root_tree;
2783 	int log_transid = 0;
2784 	struct btrfs_log_ctx root_log_ctx;
2785 	struct blk_plug plug;
2786 
2787 	mutex_lock(&root->log_mutex);
2788 	log_transid = ctx->log_transid;
2789 	if (root->log_transid_committed >= log_transid) {
2790 		mutex_unlock(&root->log_mutex);
2791 		return ctx->log_ret;
2792 	}
2793 
2794 	index1 = log_transid % 2;
2795 	if (atomic_read(&root->log_commit[index1])) {
2796 		wait_log_commit(root, log_transid);
2797 		mutex_unlock(&root->log_mutex);
2798 		return ctx->log_ret;
2799 	}
2800 	ASSERT(log_transid == root->log_transid);
2801 	atomic_set(&root->log_commit[index1], 1);
2802 
2803 	/* wait for previous tree log sync to complete */
2804 	if (atomic_read(&root->log_commit[(index1 + 1) % 2]))
2805 		wait_log_commit(root, log_transid - 1);
2806 
2807 	while (1) {
2808 		int batch = atomic_read(&root->log_batch);
2809 		/* when we're on an ssd, just kick the log commit out */
2810 		if (!btrfs_test_opt(fs_info, SSD) &&
2811 		    test_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state)) {
2812 			mutex_unlock(&root->log_mutex);
2813 			schedule_timeout_uninterruptible(1);
2814 			mutex_lock(&root->log_mutex);
2815 		}
2816 		wait_for_writer(root);
2817 		if (batch == atomic_read(&root->log_batch))
2818 			break;
2819 	}
2820 
2821 	/* bail out if we need to do a full commit */
2822 	if (btrfs_need_log_full_commit(fs_info, trans)) {
2823 		ret = -EAGAIN;
2824 		btrfs_free_logged_extents(log, log_transid);
2825 		mutex_unlock(&root->log_mutex);
2826 		goto out;
2827 	}
2828 
2829 	if (log_transid % 2 == 0)
2830 		mark = EXTENT_DIRTY;
2831 	else
2832 		mark = EXTENT_NEW;
2833 
2834 	/* we start IO on all the marked extents here, but we don't actually
2835 	 * wait for them until later.
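	 * A block plug is held across the writeout below so the requests
	 * are submitted to the device as a batch.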
2836 */ 2837 blk_start_plug(&plug); 2838 ret = btrfs_write_marked_extents(fs_info, &log->dirty_log_pages, mark); 2839 if (ret) { 2840 blk_finish_plug(&plug); 2841 btrfs_abort_transaction(trans, ret); 2842 btrfs_free_logged_extents(log, log_transid); 2843 btrfs_set_log_full_commit(fs_info, trans); 2844 mutex_unlock(&root->log_mutex); 2845 goto out; 2846 } 2847 2848 btrfs_set_root_node(&log->root_item, log->node); 2849 2850 root->log_transid++; 2851 log->log_transid = root->log_transid; 2852 root->log_start_pid = 0; 2853 /* 2854 * IO has been started, blocks of the log tree have WRITTEN flag set 2855 * in their headers. new modifications of the log will be written to 2856 * new positions. so it's safe to allow log writers to go in. 2857 */ 2858 mutex_unlock(&root->log_mutex); 2859 2860 btrfs_init_log_ctx(&root_log_ctx, NULL); 2861 2862 mutex_lock(&log_root_tree->log_mutex); 2863 atomic_inc(&log_root_tree->log_batch); 2864 atomic_inc(&log_root_tree->log_writers); 2865 2866 index2 = log_root_tree->log_transid % 2; 2867 list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]); 2868 root_log_ctx.log_transid = log_root_tree->log_transid; 2869 2870 mutex_unlock(&log_root_tree->log_mutex); 2871 2872 ret = update_log_root(trans, log); 2873 2874 mutex_lock(&log_root_tree->log_mutex); 2875 if (atomic_dec_and_test(&log_root_tree->log_writers)) { 2876 /* 2877 * Implicit memory barrier after atomic_dec_and_test 2878 */ 2879 if (waitqueue_active(&log_root_tree->log_writer_wait)) 2880 wake_up(&log_root_tree->log_writer_wait); 2881 } 2882 2883 if (ret) { 2884 if (!list_empty(&root_log_ctx.list)) 2885 list_del_init(&root_log_ctx.list); 2886 2887 blk_finish_plug(&plug); 2888 btrfs_set_log_full_commit(fs_info, trans); 2889 2890 if (ret != -ENOSPC) { 2891 btrfs_abort_transaction(trans, ret); 2892 mutex_unlock(&log_root_tree->log_mutex); 2893 goto out; 2894 } 2895 btrfs_wait_tree_log_extents(log, mark); 2896 btrfs_free_logged_extents(log, log_transid); 2897 mutex_unlock(&log_root_tree->log_mutex); 2898 ret = -EAGAIN; 2899 goto out; 2900 } 2901 2902 if (log_root_tree->log_transid_committed >= root_log_ctx.log_transid) { 2903 blk_finish_plug(&plug); 2904 list_del_init(&root_log_ctx.list); 2905 mutex_unlock(&log_root_tree->log_mutex); 2906 ret = root_log_ctx.log_ret; 2907 goto out; 2908 } 2909 2910 index2 = root_log_ctx.log_transid % 2; 2911 if (atomic_read(&log_root_tree->log_commit[index2])) { 2912 blk_finish_plug(&plug); 2913 ret = btrfs_wait_tree_log_extents(log, mark); 2914 btrfs_wait_logged_extents(trans, log, log_transid); 2915 wait_log_commit(log_root_tree, 2916 root_log_ctx.log_transid); 2917 mutex_unlock(&log_root_tree->log_mutex); 2918 if (!ret) 2919 ret = root_log_ctx.log_ret; 2920 goto out; 2921 } 2922 ASSERT(root_log_ctx.log_transid == log_root_tree->log_transid); 2923 atomic_set(&log_root_tree->log_commit[index2], 1); 2924 2925 if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) { 2926 wait_log_commit(log_root_tree, 2927 root_log_ctx.log_transid - 1); 2928 } 2929 2930 wait_for_writer(log_root_tree); 2931 2932 /* 2933 * now that we've moved on to the tree of log tree roots, 2934 * check the full commit flag again 2935 */ 2936 if (btrfs_need_log_full_commit(fs_info, trans)) { 2937 blk_finish_plug(&plug); 2938 btrfs_wait_tree_log_extents(log, mark); 2939 btrfs_free_logged_extents(log, log_transid); 2940 mutex_unlock(&log_root_tree->log_mutex); 2941 ret = -EAGAIN; 2942 goto out_wake_log_root; 2943 } 2944 2945 ret = btrfs_write_marked_extents(fs_info, 2946 
&log_root_tree->dirty_log_pages,
2947 					 EXTENT_DIRTY | EXTENT_NEW);
2948 	blk_finish_plug(&plug);
2949 	if (ret) {
2950 		btrfs_set_log_full_commit(fs_info, trans);
2951 		btrfs_abort_transaction(trans, ret);
2952 		btrfs_free_logged_extents(log, log_transid);
2953 		mutex_unlock(&log_root_tree->log_mutex);
2954 		goto out_wake_log_root;
2955 	}
2956 	ret = btrfs_wait_tree_log_extents(log, mark);
2957 	if (!ret)
2958 		ret = btrfs_wait_tree_log_extents(log_root_tree,
2959 						  EXTENT_NEW | EXTENT_DIRTY);
2960 	if (ret) {
2961 		btrfs_set_log_full_commit(fs_info, trans);
2962 		btrfs_free_logged_extents(log, log_transid);
2963 		mutex_unlock(&log_root_tree->log_mutex);
2964 		goto out_wake_log_root;
2965 	}
2966 	btrfs_wait_logged_extents(trans, log, log_transid);
2967 
2968 	btrfs_set_super_log_root(fs_info->super_for_commit,
2969 				 log_root_tree->node->start);
2970 	btrfs_set_super_log_root_level(fs_info->super_for_commit,
2971 				       btrfs_header_level(log_root_tree->node));
2972 
2973 	log_root_tree->log_transid++;
2974 	mutex_unlock(&log_root_tree->log_mutex);
2975 
2976 	/*
2977 	 * nobody else is going to jump in and write the ctree
2978 	 * super here because the log_commit atomic below is protecting
2979 	 * us.  We must be called with a transaction handle pinning
2980 	 * the running transaction open, so a full commit can't hop
2981 	 * in and cause problems either.
2982 	 */
2983 	ret = write_all_supers(fs_info, 1);
2984 	if (ret) {
2985 		btrfs_set_log_full_commit(fs_info, trans);
2986 		btrfs_abort_transaction(trans, ret);
2987 		goto out_wake_log_root;
2988 	}
2989 
2990 	mutex_lock(&root->log_mutex);
2991 	if (root->last_log_commit < log_transid)
2992 		root->last_log_commit = log_transid;
2993 	mutex_unlock(&root->log_mutex);
2994 
2995 out_wake_log_root:
2996 	mutex_lock(&log_root_tree->log_mutex);
2997 	btrfs_remove_all_log_ctxs(log_root_tree, index2, ret);
2998 
2999 	log_root_tree->log_transid_committed++;
3000 	atomic_set(&log_root_tree->log_commit[index2], 0);
3001 	mutex_unlock(&log_root_tree->log_mutex);
3002 
3003 	/*
3004 	 * The barrier before waitqueue_active is implied by mutex_unlock
3005 	 */
3006 	if (waitqueue_active(&log_root_tree->log_commit_wait[index2]))
3007 		wake_up(&log_root_tree->log_commit_wait[index2]);
3008 out:
3009 	mutex_lock(&root->log_mutex);
3010 	btrfs_remove_all_log_ctxs(root, index1, ret);
3011 	root->log_transid_committed++;
3012 	atomic_set(&root->log_commit[index1], 0);
3013 	mutex_unlock(&root->log_mutex);
3014 
3015 	/*
3016 	 * The barrier before waitqueue_active is implied by mutex_unlock
3017 	 */
3018 	if (waitqueue_active(&root->log_commit_wait[index1]))
3019 		wake_up(&root->log_commit_wait[index1]);
3020 	return ret;
3021 }
3022 
3023 static void free_log_tree(struct btrfs_trans_handle *trans,
3024 			  struct btrfs_root *log)
3025 {
3026 	int ret;
3027 	u64 start;
3028 	u64 end;
3029 	struct walk_control wc = {
3030 		.free = 1,
3031 		.process_func = process_one_buffer
3032 	};
3033 
3034 	ret = walk_log_tree(trans, log, &wc);
3035 	/* I don't think this can happen but just in case */
3036 	if (ret)
3037 		btrfs_abort_transaction(trans, ret);
3038 
3039 	while (1) {
3040 		ret = find_first_extent_bit(&log->dirty_log_pages,
3041 				0, &start, &end, EXTENT_DIRTY | EXTENT_NEW,
3042 				NULL);
3043 		if (ret)
3044 			break;
3045 
3046 		clear_extent_bits(&log->dirty_log_pages, start, end,
3047 				  EXTENT_DIRTY | EXTENT_NEW);
3048 	}
3049 
3050 	/*
3051 	 * We may have short-circuited the log tree with the full commit logic
3052 	 * and left ordered extents on our list, so clear these out to keep us
3053 	 * from leaking inodes and memory.
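	 *
	 * Both logged extents lists are freed here (one call per index),
	 * since leftovers may sit on either of them.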
3054 	 */
3055 	btrfs_free_logged_extents(log, 0);
3056 	btrfs_free_logged_extents(log, 1);
3057 
3058 	free_extent_buffer(log->node);
3059 	kfree(log);
3060 }
3061 
3062 /*
3063  * free all the extents used by the tree log.  This should be called
3064  * at commit time of the full transaction
3065  */
3066 int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
3067 {
3068 	if (root->log_root) {
3069 		free_log_tree(trans, root->log_root);
3070 		root->log_root = NULL;
3071 	}
3072 	return 0;
3073 }
3074 
3075 int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
3076 			     struct btrfs_fs_info *fs_info)
3077 {
3078 	if (fs_info->log_root_tree) {
3079 		free_log_tree(trans, fs_info->log_root_tree);
3080 		fs_info->log_root_tree = NULL;
3081 	}
3082 	return 0;
3083 }
3084 
3085 /*
3086  * If both a file and directory are logged, and unlinks or renames are
3087  * mixed in, we have a few interesting corners:
3088  *
3089  * create file X in dir Y
3090  * link file X to X.link in dir Y
3091  * fsync file X
3092  * unlink file X but leave X.link
3093  * fsync dir Y
3094  *
3095  * After a crash we would expect only X.link to exist.  But file X
3096  * didn't get fsync'd again so the log has back refs for X and X.link.
3097  *
3098  * We solve this by removing directory entries and inode backrefs from the
3099  * log when a file that was logged in the current transaction is
3100  * unlinked.  Any later fsync will include the updated log entries, and
3101  * we'll be able to reconstruct the proper directory items from backrefs.
3102  *
3103  * This optimization allows us to avoid relogging the entire inode
3104  * or the entire directory.
3105  */
3106 int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
3107 				 struct btrfs_root *root,
3108 				 const char *name, int name_len,
3109 				 struct btrfs_inode *dir, u64 index)
3110 {
3111 	struct btrfs_root *log;
3112 	struct btrfs_dir_item *di;
3113 	struct btrfs_path *path;
3114 	int ret;
3115 	int err = 0;
3116 	int bytes_del = 0;
3117 	u64 dir_ino = btrfs_ino(dir);
3118 
3119 	if (dir->logged_trans < trans->transid)
3120 		return 0;
3121 
3122 	ret = join_running_log_trans(root);
3123 	if (ret)
3124 		return 0;
3125 
3126 	mutex_lock(&dir->log_mutex);
3127 
3128 	log = root->log_root;
3129 	path = btrfs_alloc_path();
3130 	if (!path) {
3131 		err = -ENOMEM;
3132 		goto out_unlock;
3133 	}
3134 
3135 	di = btrfs_lookup_dir_item(trans, log, path, dir_ino,
3136 				   name, name_len, -1);
3137 	if (IS_ERR(di)) {
3138 		err = PTR_ERR(di);
3139 		goto fail;
3140 	}
3141 	if (di) {
3142 		ret = btrfs_delete_one_dir_name(trans, log, path, di);
3143 		bytes_del += name_len;
3144 		if (ret) {
3145 			err = ret;
3146 			goto fail;
3147 		}
3148 	}
3149 	btrfs_release_path(path);
3150 	di = btrfs_lookup_dir_index_item(trans, log, path, dir_ino,
3151 					 index, name, name_len, -1);
3152 	if (IS_ERR(di)) {
3153 		err = PTR_ERR(di);
3154 		goto fail;
3155 	}
3156 	if (di) {
3157 		ret = btrfs_delete_one_dir_name(trans, log, path, di);
3158 		bytes_del += name_len;
3159 		if (ret) {
3160 			err = ret;
3161 			goto fail;
3162 		}
3163 	}
3164 
3165 	/* update the directory size in the log to reflect the names
3166 	 * we have removed
3167 	 */
3168 	if (bytes_del) {
3169 		struct btrfs_key key;
3170 
3171 		key.objectid = dir_ino;
3172 		key.offset = 0;
3173 		key.type = BTRFS_INODE_ITEM_KEY;
3174 		btrfs_release_path(path);
3175 
3176 		ret = btrfs_search_slot(trans, log, &key, path, 0, 1);
3177 		if (ret < 0) {
3178 			err = ret;
3179 			goto fail;
3180 		}
3181 		if (ret == 0) {
3182 			struct btrfs_inode_item *item;
3183 			u64 i_size;
3184 
3185 			item = btrfs_item_ptr(path->nodes[0], path->slots[0],
3186 struct btrfs_inode_item); 3187 i_size = btrfs_inode_size(path->nodes[0], item); 3188 if (i_size > bytes_del) 3189 i_size -= bytes_del; 3190 else 3191 i_size = 0; 3192 btrfs_set_inode_size(path->nodes[0], item, i_size); 3193 btrfs_mark_buffer_dirty(path->nodes[0]); 3194 } else 3195 ret = 0; 3196 btrfs_release_path(path); 3197 } 3198 fail: 3199 btrfs_free_path(path); 3200 out_unlock: 3201 mutex_unlock(&dir->log_mutex); 3202 if (ret == -ENOSPC) { 3203 btrfs_set_log_full_commit(root->fs_info, trans); 3204 ret = 0; 3205 } else if (ret < 0) 3206 btrfs_abort_transaction(trans, ret); 3207 3208 btrfs_end_log_trans(root); 3209 3210 return err; 3211 } 3212 3213 /* see comments for btrfs_del_dir_entries_in_log */ 3214 int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, 3215 struct btrfs_root *root, 3216 const char *name, int name_len, 3217 struct btrfs_inode *inode, u64 dirid) 3218 { 3219 struct btrfs_fs_info *fs_info = root->fs_info; 3220 struct btrfs_root *log; 3221 u64 index; 3222 int ret; 3223 3224 if (inode->logged_trans < trans->transid) 3225 return 0; 3226 3227 ret = join_running_log_trans(root); 3228 if (ret) 3229 return 0; 3230 log = root->log_root; 3231 mutex_lock(&inode->log_mutex); 3232 3233 ret = btrfs_del_inode_ref(trans, log, name, name_len, btrfs_ino(inode), 3234 dirid, &index); 3235 mutex_unlock(&inode->log_mutex); 3236 if (ret == -ENOSPC) { 3237 btrfs_set_log_full_commit(fs_info, trans); 3238 ret = 0; 3239 } else if (ret < 0 && ret != -ENOENT) 3240 btrfs_abort_transaction(trans, ret); 3241 btrfs_end_log_trans(root); 3242 3243 return ret; 3244 } 3245 3246 /* 3247 * creates a range item in the log for 'dirid'. first_offset and 3248 * last_offset tell us which parts of the key space the log should 3249 * be considered authoritative for. 3250 */ 3251 static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans, 3252 struct btrfs_root *log, 3253 struct btrfs_path *path, 3254 int key_type, u64 dirid, 3255 u64 first_offset, u64 last_offset) 3256 { 3257 int ret; 3258 struct btrfs_key key; 3259 struct btrfs_dir_log_item *item; 3260 3261 key.objectid = dirid; 3262 key.offset = first_offset; 3263 if (key_type == BTRFS_DIR_ITEM_KEY) 3264 key.type = BTRFS_DIR_LOG_ITEM_KEY; 3265 else 3266 key.type = BTRFS_DIR_LOG_INDEX_KEY; 3267 ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item)); 3268 if (ret) 3269 return ret; 3270 3271 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 3272 struct btrfs_dir_log_item); 3273 btrfs_set_dir_log_end(path->nodes[0], item, last_offset); 3274 btrfs_mark_buffer_dirty(path->nodes[0]); 3275 btrfs_release_path(path); 3276 return 0; 3277 } 3278 3279 /* 3280 * log all the items included in the current transaction for a given 3281 * directory. 
This also creates the range items in the log tree required 3282 * to replay anything deleted before the fsync 3283 */ 3284 static noinline int log_dir_items(struct btrfs_trans_handle *trans, 3285 struct btrfs_root *root, struct btrfs_inode *inode, 3286 struct btrfs_path *path, 3287 struct btrfs_path *dst_path, int key_type, 3288 struct btrfs_log_ctx *ctx, 3289 u64 min_offset, u64 *last_offset_ret) 3290 { 3291 struct btrfs_key min_key; 3292 struct btrfs_root *log = root->log_root; 3293 struct extent_buffer *src; 3294 int err = 0; 3295 int ret; 3296 int i; 3297 int nritems; 3298 u64 first_offset = min_offset; 3299 u64 last_offset = (u64)-1; 3300 u64 ino = btrfs_ino(inode); 3301 3302 log = root->log_root; 3303 3304 min_key.objectid = ino; 3305 min_key.type = key_type; 3306 min_key.offset = min_offset; 3307 3308 ret = btrfs_search_forward(root, &min_key, path, trans->transid); 3309 3310 /* 3311 * we didn't find anything from this transaction, see if there 3312 * is anything at all 3313 */ 3314 if (ret != 0 || min_key.objectid != ino || min_key.type != key_type) { 3315 min_key.objectid = ino; 3316 min_key.type = key_type; 3317 min_key.offset = (u64)-1; 3318 btrfs_release_path(path); 3319 ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); 3320 if (ret < 0) { 3321 btrfs_release_path(path); 3322 return ret; 3323 } 3324 ret = btrfs_previous_item(root, path, ino, key_type); 3325 3326 /* if ret == 0 there are items for this type, 3327 * create a range to tell us the last key of this type. 3328 * otherwise, there are no items in this directory after 3329 * *min_offset, and we create a range to indicate that. 3330 */ 3331 if (ret == 0) { 3332 struct btrfs_key tmp; 3333 btrfs_item_key_to_cpu(path->nodes[0], &tmp, 3334 path->slots[0]); 3335 if (key_type == tmp.type) 3336 first_offset = max(min_offset, tmp.offset) + 1; 3337 } 3338 goto done; 3339 } 3340 3341 /* go backward to find any previous key */ 3342 ret = btrfs_previous_item(root, path, ino, key_type); 3343 if (ret == 0) { 3344 struct btrfs_key tmp; 3345 btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]); 3346 if (key_type == tmp.type) { 3347 first_offset = tmp.offset; 3348 ret = overwrite_item(trans, log, dst_path, 3349 path->nodes[0], path->slots[0], 3350 &tmp); 3351 if (ret) { 3352 err = ret; 3353 goto done; 3354 } 3355 } 3356 } 3357 btrfs_release_path(path); 3358 3359 /* find the first key from this transaction again */ 3360 ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); 3361 if (WARN_ON(ret != 0)) 3362 goto done; 3363 3364 /* 3365 * we have a block from this transaction, log every item in it 3366 * from our directory 3367 */ 3368 while (1) { 3369 struct btrfs_key tmp; 3370 src = path->nodes[0]; 3371 nritems = btrfs_header_nritems(src); 3372 for (i = path->slots[0]; i < nritems; i++) { 3373 struct btrfs_dir_item *di; 3374 3375 btrfs_item_key_to_cpu(src, &min_key, i); 3376 3377 if (min_key.objectid != ino || min_key.type != key_type) 3378 goto done; 3379 ret = overwrite_item(trans, log, dst_path, src, i, 3380 &min_key); 3381 if (ret) { 3382 err = ret; 3383 goto done; 3384 } 3385 3386 /* 3387 * We must make sure that when we log a directory entry, 3388 * the corresponding inode, after log replay, has a 3389 * matching link count. 
For example:
3390 			 *
3391 			 * touch foo
3392 			 * mkdir mydir
3393 			 * sync
3394 			 * ln foo mydir/bar
3395 			 * xfs_io -c "fsync" mydir
3396 			 * <crash>
3397 			 * <mount fs and log replay>
3398 			 *
3399 			 * This would result in an fsync log that, when
3400 			 * replayed, leaves our file inode with a link count
3401 			 * of 1 even though two directory entries point to it.
3402 			 * After removing one of the names, it would not be
3403 			 * possible to remove the other name, which always
3404 			 * resulted in stale file handle errors, and it would
3405 			 * not be possible to rmdir the parent directory,
3406 			 * since its i_size could never decrement to the value
3407 			 * BTRFS_EMPTY_DIR_SIZE, resulting in -ENOTEMPTY errors.
3408 			 */
3409 			di = btrfs_item_ptr(src, i, struct btrfs_dir_item);
3410 			btrfs_dir_item_key_to_cpu(src, di, &tmp);
3411 			if (ctx &&
3412 			    (btrfs_dir_transid(src, di) == trans->transid ||
3413 			     btrfs_dir_type(src, di) == BTRFS_FT_DIR) &&
3414 			    tmp.type != BTRFS_ROOT_ITEM_KEY)
3415 				ctx->log_new_dentries = true;
3416 		}
3417 		path->slots[0] = nritems;
3418 
3419 		/*
3420 		 * look ahead to the next item and see if it is also
3421 		 * from this directory and from this transaction
3422 		 */
3423 		ret = btrfs_next_leaf(root, path);
3424 		if (ret == 1) {
3425 			last_offset = (u64)-1;
3426 			goto done;
3427 		}
3428 		btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
3429 		if (tmp.objectid != ino || tmp.type != key_type) {
3430 			last_offset = (u64)-1;
3431 			goto done;
3432 		}
3433 		if (btrfs_header_generation(path->nodes[0]) != trans->transid) {
3434 			ret = overwrite_item(trans, log, dst_path,
3435 					     path->nodes[0], path->slots[0],
3436 					     &tmp);
3437 			if (ret)
3438 				err = ret;
3439 			else
3440 				last_offset = tmp.offset;
3441 			goto done;
3442 		}
3443 	}
3444 done:
3445 	btrfs_release_path(path);
3446 	btrfs_release_path(dst_path);
3447 
3448 	if (err == 0) {
3449 		*last_offset_ret = last_offset;
3450 		/*
3451 		 * insert the log range keys to indicate where the log
3452 		 * is valid
3453 		 */
3454 		ret = insert_dir_log_key(trans, log, path, key_type,
3455 					 ino, first_offset, last_offset);
3456 		if (ret)
3457 			err = ret;
3458 	}
3459 	return err;
3460 }
3461 
3462 /*
3463  * logging directories is very similar to logging inodes.  We find all
3464  * the items from the current transaction and write them to the log.
3465  *
3466  * The recovery code scans the directory in the subvolume, and if it finds a
3467  * key in the range logged that is not present in the log tree, then it means
3468  * that dir entry was unlinked during the transaction.
3469  *
3470  * In order for that scan to work, we must include one key smaller than
3471  * the smallest key logged by this transaction and one key larger than
3472  * the largest key logged by this transaction.
3473  */
3474 static noinline int log_directory_changes(struct btrfs_trans_handle *trans,
3475 			  struct btrfs_root *root, struct btrfs_inode *inode,
3476 			  struct btrfs_path *path,
3477 			  struct btrfs_path *dst_path,
3478 			  struct btrfs_log_ctx *ctx)
3479 {
3480 	u64 min_key;
3481 	u64 max_key;
3482 	int ret;
3483 	int key_type = BTRFS_DIR_ITEM_KEY;
3484 
3485 again:
3486 	min_key = 0;
3487 	max_key = 0;
3488 	while (1) {
3489 		ret = log_dir_items(trans, root, inode, path, dst_path, key_type,
3490 				    ctx, min_key, &max_key);
3491 		if (ret)
3492 			return ret;
3493 		if (max_key == (u64)-1)
3494 			break;
3495 		min_key = max_key + 1;
3496 	}
3497 
3498 	if (key_type == BTRFS_DIR_ITEM_KEY) {
3499 		key_type = BTRFS_DIR_INDEX_KEY;
3500 		goto again;
3501 	}
3502 	return 0;
3503 }
3504 
3505 /*
3506  * a helper function to drop items from the log before we relog an
3507  * inode.  max_key_type indicates the highest item type to remove.
3508  * This cannot be run for file data extents because it does not
3509  * free the extents they point to.
3510  */
3511 static int drop_objectid_items(struct btrfs_trans_handle *trans,
3512 				  struct btrfs_root *log,
3513 				  struct btrfs_path *path,
3514 				  u64 objectid, int max_key_type)
3515 {
3516 	int ret;
3517 	struct btrfs_key key;
3518 	struct btrfs_key found_key;
3519 	int start_slot;
3520 
3521 	key.objectid = objectid;
3522 	key.type = max_key_type;
3523 	key.offset = (u64)-1;
3524 
3525 	while (1) {
3526 		ret = btrfs_search_slot(trans, log, &key, path, -1, 1);
3527 		BUG_ON(ret == 0); /* Logic error */
3528 		if (ret < 0)
3529 			break;
3530 
3531 		if (path->slots[0] == 0)
3532 			break;
3533 
3534 		path->slots[0]--;
3535 		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
3536 				      path->slots[0]);
3537 
3538 		if (found_key.objectid != objectid)
3539 			break;
3540 
3541 		found_key.offset = 0;
3542 		found_key.type = 0;
3543 		ret = btrfs_bin_search(path->nodes[0], &found_key, 0,
3544 				       &start_slot);
3545 
3546 		ret = btrfs_del_items(trans, log, path, start_slot,
3547 				      path->slots[0] - start_slot + 1);
3548 		/*
3549 		 * If start slot isn't 0 then we don't need to re-search, we've
3550 		 * found the last guy with the objectid in this tree.
3551 		 */
3552 		if (ret || start_slot != 0)
3553 			break;
3554 		btrfs_release_path(path);
3555 	}
3556 	btrfs_release_path(path);
3557 	if (ret > 0)
3558 		ret = 0;
3559 	return ret;
3560 }
3561 
3562 static void fill_inode_item(struct btrfs_trans_handle *trans,
3563 			    struct extent_buffer *leaf,
3564 			    struct btrfs_inode_item *item,
3565 			    struct inode *inode, int log_inode_only,
3566 			    u64 logged_isize)
3567 {
3568 	struct btrfs_map_token token;
3569 
3570 	btrfs_init_map_token(&token);
3571 
3572 	if (log_inode_only) {
3573 		/* set the generation to zero so the recovery code
3574 		 * can tell the difference between logging
3575 		 * just to say 'this inode exists' and logging
3576 		 * to say 'update this inode with these values'
3577 		 */
3578 		btrfs_set_token_inode_generation(leaf, item, 0, &token);
3579 		btrfs_set_token_inode_size(leaf, item, logged_isize, &token);
3580 	} else {
3581 		btrfs_set_token_inode_generation(leaf, item,
3582 						 BTRFS_I(inode)->generation,
3583 						 &token);
3584 		btrfs_set_token_inode_size(leaf, item, inode->i_size, &token);
3585 	}
3586 
3587 	btrfs_set_token_inode_uid(leaf, item, i_uid_read(inode), &token);
3588 	btrfs_set_token_inode_gid(leaf, item, i_gid_read(inode), &token);
3589 	btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token);
3590 	btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token);
3591 
3592 	btrfs_set_token_timespec_sec(leaf, &item->atime,
3593 				     inode->i_atime.tv_sec, &token);
3594 	btrfs_set_token_timespec_nsec(leaf, &item->atime,
3595 				      inode->i_atime.tv_nsec, &token);
3596 
3597 	btrfs_set_token_timespec_sec(leaf, &item->mtime,
3598 				     inode->i_mtime.tv_sec, &token);
3599 	btrfs_set_token_timespec_nsec(leaf, &item->mtime,
3600 				      inode->i_mtime.tv_nsec, &token);
3601 
3602 	btrfs_set_token_timespec_sec(leaf, &item->ctime,
3603 				     inode->i_ctime.tv_sec, &token);
3604 	btrfs_set_token_timespec_nsec(leaf, &item->ctime,
3605 				      inode->i_ctime.tv_nsec, &token);
3606 
3607 	btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode),
3608 				     &token);
3609 
3610 	btrfs_set_token_inode_sequence(leaf, item, inode->i_version, &token);
3611 	btrfs_set_token_inode_transid(leaf, item, trans->transid, &token);
3612 	btrfs_set_token_inode_rdev(leaf, item, inode->i_rdev, &token);
3613 	btrfs_set_token_inode_flags(leaf, item, BTRFS_I(inode)->flags, &token);
3614
btrfs_set_token_inode_block_group(leaf, item, 0, &token); 3615 } 3616 3617 static int log_inode_item(struct btrfs_trans_handle *trans, 3618 struct btrfs_root *log, struct btrfs_path *path, 3619 struct btrfs_inode *inode) 3620 { 3621 struct btrfs_inode_item *inode_item; 3622 int ret; 3623 3624 ret = btrfs_insert_empty_item(trans, log, path, 3625 &inode->location, sizeof(*inode_item)); 3626 if (ret && ret != -EEXIST) 3627 return ret; 3628 inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0], 3629 struct btrfs_inode_item); 3630 fill_inode_item(trans, path->nodes[0], inode_item, &inode->vfs_inode, 3631 0, 0); 3632 btrfs_release_path(path); 3633 return 0; 3634 } 3635 3636 static noinline int copy_items(struct btrfs_trans_handle *trans, 3637 struct btrfs_inode *inode, 3638 struct btrfs_path *dst_path, 3639 struct btrfs_path *src_path, u64 *last_extent, 3640 int start_slot, int nr, int inode_only, 3641 u64 logged_isize) 3642 { 3643 struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); 3644 unsigned long src_offset; 3645 unsigned long dst_offset; 3646 struct btrfs_root *log = inode->root->log_root; 3647 struct btrfs_file_extent_item *extent; 3648 struct btrfs_inode_item *inode_item; 3649 struct extent_buffer *src = src_path->nodes[0]; 3650 struct btrfs_key first_key, last_key, key; 3651 int ret; 3652 struct btrfs_key *ins_keys; 3653 u32 *ins_sizes; 3654 char *ins_data; 3655 int i; 3656 struct list_head ordered_sums; 3657 int skip_csum = inode->flags & BTRFS_INODE_NODATASUM; 3658 bool has_extents = false; 3659 bool need_find_last_extent = true; 3660 bool done = false; 3661 3662 INIT_LIST_HEAD(&ordered_sums); 3663 3664 ins_data = kmalloc(nr * sizeof(struct btrfs_key) + 3665 nr * sizeof(u32), GFP_NOFS); 3666 if (!ins_data) 3667 return -ENOMEM; 3668 3669 first_key.objectid = (u64)-1; 3670 3671 ins_sizes = (u32 *)ins_data; 3672 ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32)); 3673 3674 for (i = 0; i < nr; i++) { 3675 ins_sizes[i] = btrfs_item_size_nr(src, i + start_slot); 3676 btrfs_item_key_to_cpu(src, ins_keys + i, i + start_slot); 3677 } 3678 ret = btrfs_insert_empty_items(trans, log, dst_path, 3679 ins_keys, ins_sizes, nr); 3680 if (ret) { 3681 kfree(ins_data); 3682 return ret; 3683 } 3684 3685 for (i = 0; i < nr; i++, dst_path->slots[0]++) { 3686 dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0], 3687 dst_path->slots[0]); 3688 3689 src_offset = btrfs_item_ptr_offset(src, start_slot + i); 3690 3691 if (i == nr - 1) 3692 last_key = ins_keys[i]; 3693 3694 if (ins_keys[i].type == BTRFS_INODE_ITEM_KEY) { 3695 inode_item = btrfs_item_ptr(dst_path->nodes[0], 3696 dst_path->slots[0], 3697 struct btrfs_inode_item); 3698 fill_inode_item(trans, dst_path->nodes[0], inode_item, 3699 &inode->vfs_inode, 3700 inode_only == LOG_INODE_EXISTS, 3701 logged_isize); 3702 } else { 3703 copy_extent_buffer(dst_path->nodes[0], src, dst_offset, 3704 src_offset, ins_sizes[i]); 3705 } 3706 3707 /* 3708 * We set need_find_last_extent here in case we know we were 3709 * processing other items and then walk into the first extent in 3710 * the inode. If we don't hit an extent then nothing changes, 3711 * we'll do the last search the next time around. 
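	 *
	 * Note that need_find_last_extent starts out true and is only
	 * cleared once we copy an item that is not a file extent item.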
3712 */ 3713 if (ins_keys[i].type == BTRFS_EXTENT_DATA_KEY) { 3714 has_extents = true; 3715 if (first_key.objectid == (u64)-1) 3716 first_key = ins_keys[i]; 3717 } else { 3718 need_find_last_extent = false; 3719 } 3720 3721 /* take a reference on file data extents so that truncates 3722 * or deletes of this inode don't have to relog the inode 3723 * again 3724 */ 3725 if (ins_keys[i].type == BTRFS_EXTENT_DATA_KEY && 3726 !skip_csum) { 3727 int found_type; 3728 extent = btrfs_item_ptr(src, start_slot + i, 3729 struct btrfs_file_extent_item); 3730 3731 if (btrfs_file_extent_generation(src, extent) < trans->transid) 3732 continue; 3733 3734 found_type = btrfs_file_extent_type(src, extent); 3735 if (found_type == BTRFS_FILE_EXTENT_REG) { 3736 u64 ds, dl, cs, cl; 3737 ds = btrfs_file_extent_disk_bytenr(src, 3738 extent); 3739 /* ds == 0 is a hole */ 3740 if (ds == 0) 3741 continue; 3742 3743 dl = btrfs_file_extent_disk_num_bytes(src, 3744 extent); 3745 cs = btrfs_file_extent_offset(src, extent); 3746 cl = btrfs_file_extent_num_bytes(src, 3747 extent); 3748 if (btrfs_file_extent_compression(src, 3749 extent)) { 3750 cs = 0; 3751 cl = dl; 3752 } 3753 3754 ret = btrfs_lookup_csums_range( 3755 fs_info->csum_root, 3756 ds + cs, ds + cs + cl - 1, 3757 &ordered_sums, 0); 3758 if (ret) { 3759 btrfs_release_path(dst_path); 3760 kfree(ins_data); 3761 return ret; 3762 } 3763 } 3764 } 3765 } 3766 3767 btrfs_mark_buffer_dirty(dst_path->nodes[0]); 3768 btrfs_release_path(dst_path); 3769 kfree(ins_data); 3770 3771 /* 3772 * we have to do this after the loop above to avoid changing the 3773 * log tree while trying to change the log tree. 3774 */ 3775 ret = 0; 3776 while (!list_empty(&ordered_sums)) { 3777 struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next, 3778 struct btrfs_ordered_sum, 3779 list); 3780 if (!ret) 3781 ret = btrfs_csum_file_blocks(trans, log, sums); 3782 list_del(&sums->list); 3783 kfree(sums); 3784 } 3785 3786 if (!has_extents) 3787 return ret; 3788 3789 if (need_find_last_extent && *last_extent == first_key.offset) { 3790 /* 3791 * We don't have any leafs between our current one and the one 3792 * we processed before that can have file extent items for our 3793 * inode (and have a generation number smaller than our current 3794 * transaction id). 3795 */ 3796 need_find_last_extent = false; 3797 } 3798 3799 /* 3800 * Because we use btrfs_search_forward we could skip leaves that were 3801 * not modified and then assume *last_extent is valid when it really 3802 * isn't. So back up to the previous leaf and read the end of the last 3803 * extent before we go and fill in holes. 
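	 *
	 * The end of that last extent becomes the new *last_extent: for an
	 * inline extent its start offset plus the inline length, rounded up
	 * to the sector size, otherwise its start offset plus num_bytes.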
3804 */ 3805 if (need_find_last_extent) { 3806 u64 len; 3807 3808 ret = btrfs_prev_leaf(inode->root, src_path); 3809 if (ret < 0) 3810 return ret; 3811 if (ret) 3812 goto fill_holes; 3813 if (src_path->slots[0]) 3814 src_path->slots[0]--; 3815 src = src_path->nodes[0]; 3816 btrfs_item_key_to_cpu(src, &key, src_path->slots[0]); 3817 if (key.objectid != btrfs_ino(inode) || 3818 key.type != BTRFS_EXTENT_DATA_KEY) 3819 goto fill_holes; 3820 extent = btrfs_item_ptr(src, src_path->slots[0], 3821 struct btrfs_file_extent_item); 3822 if (btrfs_file_extent_type(src, extent) == 3823 BTRFS_FILE_EXTENT_INLINE) { 3824 len = btrfs_file_extent_inline_len(src, 3825 src_path->slots[0], 3826 extent); 3827 *last_extent = ALIGN(key.offset + len, 3828 fs_info->sectorsize); 3829 } else { 3830 len = btrfs_file_extent_num_bytes(src, extent); 3831 *last_extent = key.offset + len; 3832 } 3833 } 3834 fill_holes: 3835 /* So we did prev_leaf, now we need to move to the next leaf, but a few 3836 * things could have happened 3837 * 3838 * 1) A merge could have happened, so we could currently be on a leaf 3839 * that holds what we were copying in the first place. 3840 * 2) A split could have happened, and now not all of the items we want 3841 * are on the same leaf. 3842 * 3843 * So we need to adjust how we search for holes, we need to drop the 3844 * path and re-search for the first extent key we found, and then walk 3845 * forward until we hit the last one we copied. 3846 */ 3847 if (need_find_last_extent) { 3848 /* btrfs_prev_leaf could return 1 without releasing the path */ 3849 btrfs_release_path(src_path); 3850 ret = btrfs_search_slot(NULL, inode->root, &first_key, 3851 src_path, 0, 0); 3852 if (ret < 0) 3853 return ret; 3854 ASSERT(ret == 0); 3855 src = src_path->nodes[0]; 3856 i = src_path->slots[0]; 3857 } else { 3858 i = start_slot; 3859 } 3860 3861 /* 3862 * Ok so here we need to go through and fill in any holes we may have 3863 * to make sure that holes are punched for those areas in case they had 3864 * extents previously. 3865 */ 3866 while (!done) { 3867 u64 offset, len; 3868 u64 extent_end; 3869 3870 if (i >= btrfs_header_nritems(src_path->nodes[0])) { 3871 ret = btrfs_next_leaf(inode->root, src_path); 3872 if (ret < 0) 3873 return ret; 3874 ASSERT(ret == 0); 3875 src = src_path->nodes[0]; 3876 i = 0; 3877 } 3878 3879 btrfs_item_key_to_cpu(src, &key, i); 3880 if (!btrfs_comp_cpu_keys(&key, &last_key)) 3881 done = true; 3882 if (key.objectid != btrfs_ino(inode) || 3883 key.type != BTRFS_EXTENT_DATA_KEY) { 3884 i++; 3885 continue; 3886 } 3887 extent = btrfs_item_ptr(src, i, struct btrfs_file_extent_item); 3888 if (btrfs_file_extent_type(src, extent) == 3889 BTRFS_FILE_EXTENT_INLINE) { 3890 len = btrfs_file_extent_inline_len(src, i, extent); 3891 extent_end = ALIGN(key.offset + len, 3892 fs_info->sectorsize); 3893 } else { 3894 len = btrfs_file_extent_num_bytes(src, extent); 3895 extent_end = key.offset + len; 3896 } 3897 i++; 3898 3899 if (*last_extent == key.offset) { 3900 *last_extent = extent_end; 3901 continue; 3902 } 3903 offset = *last_extent; 3904 len = key.offset - *last_extent; 3905 ret = btrfs_insert_file_extent(trans, log, btrfs_ino(inode), 3906 offset, 0, 0, len, 0, len, 0, 0, 0); 3907 if (ret) 3908 break; 3909 *last_extent = extent_end; 3910 } 3911 /* 3912 * Need to let the callers know we dropped the path so they should 3913 * re-search. 
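	 * A return value of 1 (rather than 0) signals that.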
3914 	 */
3915 	if (!ret && need_find_last_extent)
3916 		ret = 1;
3917 	return ret;
3918 }
3919 
3920 static int extent_cmp(void *priv, struct list_head *a, struct list_head *b)
3921 {
3922 	struct extent_map *em1, *em2;
3923 
3924 	em1 = list_entry(a, struct extent_map, list);
3925 	em2 = list_entry(b, struct extent_map, list);
3926 
3927 	if (em1->start < em2->start)
3928 		return -1;
3929 	else if (em1->start > em2->start)
3930 		return 1;
3931 	return 0;
3932 }
3933 
3934 static int wait_ordered_extents(struct btrfs_trans_handle *trans,
3935 				struct inode *inode,
3936 				struct btrfs_root *root,
3937 				const struct extent_map *em,
3938 				const struct list_head *logged_list,
3939 				bool *ordered_io_error)
3940 {
3941 	struct btrfs_fs_info *fs_info = root->fs_info;
3942 	struct btrfs_ordered_extent *ordered;
3943 	struct btrfs_root *log = root->log_root;
3944 	u64 mod_start = em->mod_start;
3945 	u64 mod_len = em->mod_len;
3946 	const bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
3947 	u64 csum_offset;
3948 	u64 csum_len;
3949 	LIST_HEAD(ordered_sums);
3950 	int ret = 0;
3951 
3952 	*ordered_io_error = false;
3953 
3954 	if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
3955 	    em->block_start == EXTENT_MAP_HOLE)
3956 		return 0;
3957 
3958 	/*
3959 	 * Wait for any ordered extent that covers our extent map. If it
3960 	 * finishes without an error, first check and see if our csums are on
3961 	 * our outstanding ordered extents.
3962 	 */
3963 	list_for_each_entry(ordered, logged_list, log_list) {
3964 		struct btrfs_ordered_sum *sum;
3965 
3966 		if (!mod_len)
3967 			break;
3968 
3969 		if (ordered->file_offset + ordered->len <= mod_start ||
3970 		    mod_start + mod_len <= ordered->file_offset)
3971 			continue;
3972 
3973 		if (!test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags) &&
3974 		    !test_bit(BTRFS_ORDERED_IOERR, &ordered->flags) &&
3975 		    !test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags)) {
3976 			const u64 start = ordered->file_offset;
3977 			const u64 end = ordered->file_offset + ordered->len - 1;
3978 
3979 			WARN_ON(ordered->inode != inode);
3980 			filemap_fdatawrite_range(inode->i_mapping, start, end);
3981 		}
3982 
3983 		wait_event(ordered->wait,
3984 			   (test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags) ||
3985 			    test_bit(BTRFS_ORDERED_IOERR, &ordered->flags)));
3986 
3987 		if (test_bit(BTRFS_ORDERED_IOERR, &ordered->flags)) {
3988 			/*
3989 			 * Clear the AS_EIO/AS_ENOSPC flags from the inode's
3990 			 * i_mapping flags, so that the next fsync won't get
3991 			 * an outdated io error too.
3992 			 */
3993 			filemap_check_errors(inode->i_mapping);
3994 			*ordered_io_error = true;
3995 			break;
3996 		}
3997 		/*
3998 		 * We are going to copy all the csums on this ordered extent, so
3999 		 * go ahead and adjust mod_start and mod_len in case this
4000 		 * ordered extent has already been logged.
4001 		 */
4002 		if (ordered->file_offset > mod_start) {
4003 			if (ordered->file_offset + ordered->len >=
4004 			    mod_start + mod_len)
4005 				mod_len = ordered->file_offset - mod_start;
4006 			/*
4007 			 * If we have this case
4008 			 *
4009 			 * |--------- logged extent ---------|
4010 			 *       |----- ordered extent ----|
4011 			 *
4012 			 * Just don't mess with mod_start and mod_len, we'll
4013 			 * just end up logging more csums than we need and it
4014 			 * will be ok.
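			 *
			 * The else branch below handles the opposite overlap:
			 *
			 *       |--------- logged extent ---------|
			 * |----- ordered extent ----|
			 *
			 * where mod_start is advanced past the end of the
			 * ordered extent (or mod_len set to 0 if the ordered
			 * extent covers the whole range).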
4015 */ 4016 } else { 4017 if (ordered->file_offset + ordered->len < 4018 mod_start + mod_len) { 4019 mod_len = (mod_start + mod_len) - 4020 (ordered->file_offset + ordered->len); 4021 mod_start = ordered->file_offset + 4022 ordered->len; 4023 } else { 4024 mod_len = 0; 4025 } 4026 } 4027 4028 if (skip_csum) 4029 continue; 4030 4031 /* 4032 * To keep us from looping for the above case of an ordered 4033 * extent that falls inside of the logged extent. 4034 */ 4035 if (test_and_set_bit(BTRFS_ORDERED_LOGGED_CSUM, 4036 &ordered->flags)) 4037 continue; 4038 4039 list_for_each_entry(sum, &ordered->list, list) { 4040 ret = btrfs_csum_file_blocks(trans, log, sum); 4041 if (ret) 4042 break; 4043 } 4044 } 4045 4046 if (*ordered_io_error || !mod_len || ret || skip_csum) 4047 return ret; 4048 4049 if (em->compress_type) { 4050 csum_offset = 0; 4051 csum_len = max(em->block_len, em->orig_block_len); 4052 } else { 4053 csum_offset = mod_start - em->start; 4054 csum_len = mod_len; 4055 } 4056 4057 /* block start is already adjusted for the file extent offset. */ 4058 ret = btrfs_lookup_csums_range(fs_info->csum_root, 4059 em->block_start + csum_offset, 4060 em->block_start + csum_offset + 4061 csum_len - 1, &ordered_sums, 0); 4062 if (ret) 4063 return ret; 4064 4065 while (!list_empty(&ordered_sums)) { 4066 struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next, 4067 struct btrfs_ordered_sum, 4068 list); 4069 if (!ret) 4070 ret = btrfs_csum_file_blocks(trans, log, sums); 4071 list_del(&sums->list); 4072 kfree(sums); 4073 } 4074 4075 return ret; 4076 } 4077 4078 static int log_one_extent(struct btrfs_trans_handle *trans, 4079 struct btrfs_inode *inode, struct btrfs_root *root, 4080 const struct extent_map *em, 4081 struct btrfs_path *path, 4082 const struct list_head *logged_list, 4083 struct btrfs_log_ctx *ctx) 4084 { 4085 struct btrfs_root *log = root->log_root; 4086 struct btrfs_file_extent_item *fi; 4087 struct extent_buffer *leaf; 4088 struct btrfs_map_token token; 4089 struct btrfs_key key; 4090 u64 extent_offset = em->start - em->orig_start; 4091 u64 block_len; 4092 int ret; 4093 int extent_inserted = 0; 4094 bool ordered_io_err = false; 4095 4096 ret = wait_ordered_extents(trans, &inode->vfs_inode, root, em, 4097 logged_list, &ordered_io_err); 4098 if (ret) 4099 return ret; 4100 4101 if (ordered_io_err) { 4102 ctx->io_err = -EIO; 4103 return 0; 4104 } 4105 4106 btrfs_init_map_token(&token); 4107 4108 ret = __btrfs_drop_extents(trans, log, &inode->vfs_inode, path, em->start, 4109 em->start + em->len, NULL, 0, 1, 4110 sizeof(*fi), &extent_inserted); 4111 if (ret) 4112 return ret; 4113 4114 if (!extent_inserted) { 4115 key.objectid = btrfs_ino(inode); 4116 key.type = BTRFS_EXTENT_DATA_KEY; 4117 key.offset = em->start; 4118 4119 ret = btrfs_insert_empty_item(trans, log, path, &key, 4120 sizeof(*fi)); 4121 if (ret) 4122 return ret; 4123 } 4124 leaf = path->nodes[0]; 4125 fi = btrfs_item_ptr(leaf, path->slots[0], 4126 struct btrfs_file_extent_item); 4127 4128 btrfs_set_token_file_extent_generation(leaf, fi, trans->transid, 4129 &token); 4130 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) 4131 btrfs_set_token_file_extent_type(leaf, fi, 4132 BTRFS_FILE_EXTENT_PREALLOC, 4133 &token); 4134 else 4135 btrfs_set_token_file_extent_type(leaf, fi, 4136 BTRFS_FILE_EXTENT_REG, 4137 &token); 4138 4139 block_len = max(em->block_len, em->orig_block_len); 4140 if (em->compress_type != BTRFS_COMPRESS_NONE) { 4141 btrfs_set_token_file_extent_disk_bytenr(leaf, fi, 4142 em->block_start, 4143 &token); 4144 
btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len, 4145 &token); 4146 } else if (em->block_start < EXTENT_MAP_LAST_BYTE) { 4147 btrfs_set_token_file_extent_disk_bytenr(leaf, fi, 4148 em->block_start - 4149 extent_offset, &token); 4150 btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len, 4151 &token); 4152 } else { 4153 btrfs_set_token_file_extent_disk_bytenr(leaf, fi, 0, &token); 4154 btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, 0, 4155 &token); 4156 } 4157 4158 btrfs_set_token_file_extent_offset(leaf, fi, extent_offset, &token); 4159 btrfs_set_token_file_extent_num_bytes(leaf, fi, em->len, &token); 4160 btrfs_set_token_file_extent_ram_bytes(leaf, fi, em->ram_bytes, &token); 4161 btrfs_set_token_file_extent_compression(leaf, fi, em->compress_type, 4162 &token); 4163 btrfs_set_token_file_extent_encryption(leaf, fi, 0, &token); 4164 btrfs_set_token_file_extent_other_encoding(leaf, fi, 0, &token); 4165 btrfs_mark_buffer_dirty(leaf); 4166 4167 btrfs_release_path(path); 4168 4169 return ret; 4170 } 4171 4172 static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, 4173 struct btrfs_root *root, 4174 struct btrfs_inode *inode, 4175 struct btrfs_path *path, 4176 struct list_head *logged_list, 4177 struct btrfs_log_ctx *ctx, 4178 const u64 start, 4179 const u64 end) 4180 { 4181 struct extent_map *em, *n; 4182 struct list_head extents; 4183 struct extent_map_tree *tree = &inode->extent_tree; 4184 u64 logged_start, logged_end; 4185 u64 test_gen; 4186 int ret = 0; 4187 int num = 0; 4188 4189 INIT_LIST_HEAD(&extents); 4190 4191 down_write(&inode->dio_sem); 4192 write_lock(&tree->lock); 4193 test_gen = root->fs_info->last_trans_committed; 4194 logged_start = start; 4195 logged_end = end; 4196 4197 list_for_each_entry_safe(em, n, &tree->modified_extents, list) { 4198 list_del_init(&em->list); 4199 /* 4200 * Just an arbitrary number, this can be really CPU intensive 4201 * once we start getting a lot of extents, and really once we 4202 * have a bunch of extents we just want to commit since it will 4203 * be faster. 4204 */ 4205 if (++num > 32768) { 4206 list_del_init(&tree->modified_extents); 4207 ret = -EFBIG; 4208 goto process; 4209 } 4210 4211 if (em->generation <= test_gen) 4212 continue; 4213 4214 if (em->start < logged_start) 4215 logged_start = em->start; 4216 if ((em->start + em->len - 1) > logged_end) 4217 logged_end = em->start + em->len - 1; 4218 4219 /* Need a ref to keep it from getting evicted from cache */ 4220 refcount_inc(&em->refs); 4221 set_bit(EXTENT_FLAG_LOGGING, &em->flags); 4222 list_add_tail(&em->list, &extents); 4223 num++; 4224 } 4225 4226 list_sort(NULL, &extents, extent_cmp); 4227 btrfs_get_logged_extents(inode, logged_list, logged_start, logged_end); 4228 /* 4229 * Some ordered extents started by fsync might have completed 4230 * before we could collect them into the list logged_list, which 4231 * means they're gone, not in our logged_list nor in the inode's 4232 * ordered tree. We want the application/user space to know an 4233 * error happened while attempting to persist file data so that 4234 * it can take proper action. If such error happened, we leave 4235 * without writing to the log tree and the fsync must report the 4236 * file data write error and not commit the current transaction. 
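 *
 * Illustrative timeline (not part of the original source): an ordered
 * extent's writeback hits an I/O error and btrfs_finish_ordered_io()
 * completes it before this function could collect it into logged_list.
 * The only remaining trace of the failure is the AS_EIO/AS_ENOSPC flag
 * on the inode's mapping, which the filemap_check_errors() call below
 * picks up, so the fsync returns the error instead of succeeding.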
4237 */ 4238 ret = filemap_check_errors(inode->vfs_inode.i_mapping); 4239 if (ret) 4240 ctx->io_err = ret; 4241 process: 4242 while (!list_empty(&extents)) { 4243 em = list_entry(extents.next, struct extent_map, list); 4244 4245 list_del_init(&em->list); 4246 4247 /* 4248 * If we had an error we just need to delete everybody from our 4249 * private list. 4250 */ 4251 if (ret) { 4252 clear_em_logging(tree, em); 4253 free_extent_map(em); 4254 continue; 4255 } 4256 4257 write_unlock(&tree->lock); 4258 4259 ret = log_one_extent(trans, inode, root, em, path, logged_list, 4260 ctx); 4261 write_lock(&tree->lock); 4262 clear_em_logging(tree, em); 4263 free_extent_map(em); 4264 } 4265 WARN_ON(!list_empty(&extents)); 4266 write_unlock(&tree->lock); 4267 up_write(&inode->dio_sem); 4268 4269 btrfs_release_path(path); 4270 return ret; 4271 } 4272 4273 static int logged_inode_size(struct btrfs_root *log, struct btrfs_inode *inode, 4274 struct btrfs_path *path, u64 *size_ret) 4275 { 4276 struct btrfs_key key; 4277 int ret; 4278 4279 key.objectid = btrfs_ino(inode); 4280 key.type = BTRFS_INODE_ITEM_KEY; 4281 key.offset = 0; 4282 4283 ret = btrfs_search_slot(NULL, log, &key, path, 0, 0); 4284 if (ret < 0) { 4285 return ret; 4286 } else if (ret > 0) { 4287 *size_ret = 0; 4288 } else { 4289 struct btrfs_inode_item *item; 4290 4291 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 4292 struct btrfs_inode_item); 4293 *size_ret = btrfs_inode_size(path->nodes[0], item); 4294 } 4295 4296 btrfs_release_path(path); 4297 return 0; 4298 } 4299 4300 /* 4301 * At the moment we always log all xattrs. This is to figure out at log replay 4302 * time which xattrs must have their deletion replayed. If an xattr is missing 4303 * in the log tree and exists in the fs/subvol tree, we delete it. This is 4304 * because if an xattr is deleted, the inode is fsynced, and a power failure 4305 * happens, causing the log to be replayed the next time the fs is mounted, 4306 * then we want the xattr to not exist anymore (same behaviour as other 4307 * filesystems with a journal, ext3/4, xfs, f2fs, etc).
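 *
 * Illustrative sequence (not part of the original source):
 *
 *	setfattr -n user.foo -v bar file
 *	xfs_io -c fsync file
 *	setfattr -x user.foo file
 *	xfs_io -c fsync file
 *	<power failure>
 *	mount fs, trigger log replay
 *
 * Since btrfs_log_all_xattrs() below copied every remaining xattr into
 * the log on the second fsync, the absence of user.foo from the log is
 * what tells replay to delete it from the fs/subvol tree.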
4308 */ 4309 static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans, 4310 struct btrfs_root *root, 4311 struct btrfs_inode *inode, 4312 struct btrfs_path *path, 4313 struct btrfs_path *dst_path) 4314 { 4315 int ret; 4316 struct btrfs_key key; 4317 const u64 ino = btrfs_ino(inode); 4318 int ins_nr = 0; 4319 int start_slot = 0; 4320 4321 key.objectid = ino; 4322 key.type = BTRFS_XATTR_ITEM_KEY; 4323 key.offset = 0; 4324 4325 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 4326 if (ret < 0) 4327 return ret; 4328 4329 while (true) { 4330 int slot = path->slots[0]; 4331 struct extent_buffer *leaf = path->nodes[0]; 4332 int nritems = btrfs_header_nritems(leaf); 4333 4334 if (slot >= nritems) { 4335 if (ins_nr > 0) { 4336 u64 last_extent = 0; 4337 4338 ret = copy_items(trans, inode, dst_path, path, 4339 &last_extent, start_slot, 4340 ins_nr, 1, 0); 4341 /* can't be 1, extent items aren't processed */ 4342 ASSERT(ret <= 0); 4343 if (ret < 0) 4344 return ret; 4345 ins_nr = 0; 4346 } 4347 ret = btrfs_next_leaf(root, path); 4348 if (ret < 0) 4349 return ret; 4350 else if (ret > 0) 4351 break; 4352 continue; 4353 } 4354 4355 btrfs_item_key_to_cpu(leaf, &key, slot); 4356 if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY) 4357 break; 4358 4359 if (ins_nr == 0) 4360 start_slot = slot; 4361 ins_nr++; 4362 path->slots[0]++; 4363 cond_resched(); 4364 } 4365 if (ins_nr > 0) { 4366 u64 last_extent = 0; 4367 4368 ret = copy_items(trans, inode, dst_path, path, 4369 &last_extent, start_slot, 4370 ins_nr, 1, 0); 4371 /* can't be 1, extent items aren't processed */ 4372 ASSERT(ret <= 0); 4373 if (ret < 0) 4374 return ret; 4375 } 4376 4377 return 0; 4378 } 4379 4380 /* 4381 * If the no holes feature is enabled we need to make sure any hole between the 4382 * last extent and the i_size of our inode is explicitly marked in the log. This 4383 * is to make sure that doing something like: 4384 * 4385 * 1) create file with 128Kb of data 4386 * 2) truncate file to 64Kb 4387 * 3) truncate file to 256Kb 4388 * 4) fsync file 4389 * 5) <crash/power failure> 4390 * 6) mount fs and trigger log replay 4391 * 4392 * Will give us a file with a size of 256Kb, the first 64Kb of data match what 4393 * the file had in its first 64Kb of data at step 1 and the last 192Kb of the 4394 * file correspond to a hole. The presence of explicit holes in a log tree is 4395 * what guarantees that log replay will remove/adjust file extent items in the 4396 * fs/subvol tree. 4397 * 4398 * Here we do not need to care about holes between extents, that is already done 4399 * by copy_items(). We also only need to do this in the full sync path, where we 4400 * look up extents from the fs/subvol tree only. In the fast path case, we 4401 * check the list of modified extent maps and if any represents a hole, we 4402 * insert a corresponding extent representing a hole in the log tree.
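 *
 * Illustrative numbers for the steps above (not part of the original
 * source): after step 3 the last extent item in the fs/subvol tree ends
 * at offset 64Kb (key.offset + len == 65536) while i_size is 256Kb, so
 * btrfs_log_trailing_hole() below computes
 *
 *	hole_start = 65536;
 *	hole_size = ALIGN(262144 - 65536, sectorsize);	(== 196608)
 *
 * and logs a single file extent item with disk_bytenr == 0 covering
 * that range, which is what allows replay to punch the hole.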
4403 */ 4404 static int btrfs_log_trailing_hole(struct btrfs_trans_handle *trans, 4405 struct btrfs_root *root, 4406 struct btrfs_inode *inode, 4407 struct btrfs_path *path) 4408 { 4409 struct btrfs_fs_info *fs_info = root->fs_info; 4410 int ret; 4411 struct btrfs_key key; 4412 u64 hole_start; 4413 u64 hole_size; 4414 struct extent_buffer *leaf; 4415 struct btrfs_root *log = root->log_root; 4416 const u64 ino = btrfs_ino(inode); 4417 const u64 i_size = i_size_read(&inode->vfs_inode); 4418 4419 if (!btrfs_fs_incompat(fs_info, NO_HOLES)) 4420 return 0; 4421 4422 key.objectid = ino; 4423 key.type = BTRFS_EXTENT_DATA_KEY; 4424 key.offset = (u64)-1; 4425 4426 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 4427 ASSERT(ret != 0); 4428 if (ret < 0) 4429 return ret; 4430 4431 ASSERT(path->slots[0] > 0); 4432 path->slots[0]--; 4433 leaf = path->nodes[0]; 4434 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 4435 4436 if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) { 4437 /* inode does not have any extents */ 4438 hole_start = 0; 4439 hole_size = i_size; 4440 } else { 4441 struct btrfs_file_extent_item *extent; 4442 u64 len; 4443 4444 /* 4445 * If there's an extent beyond i_size, an explicit hole was 4446 * already inserted by copy_items(). 4447 */ 4448 if (key.offset >= i_size) 4449 return 0; 4450 4451 extent = btrfs_item_ptr(leaf, path->slots[0], 4452 struct btrfs_file_extent_item); 4453 4454 if (btrfs_file_extent_type(leaf, extent) == 4455 BTRFS_FILE_EXTENT_INLINE) { 4456 len = btrfs_file_extent_inline_len(leaf, 4457 path->slots[0], 4458 extent); 4459 ASSERT(len == i_size || 4460 (len == fs_info->sectorsize && 4461 btrfs_file_extent_compression(leaf, extent) != 4462 BTRFS_COMPRESS_NONE)); 4463 return 0; 4464 } 4465 4466 len = btrfs_file_extent_num_bytes(leaf, extent); 4467 /* Last extent goes beyond i_size, no need to log a hole. */ 4468 if (key.offset + len > i_size) 4469 return 0; 4470 hole_start = key.offset + len; 4471 hole_size = i_size - hole_start; 4472 } 4473 btrfs_release_path(path); 4474 4475 /* Last extent ends at i_size. */ 4476 if (hole_size == 0) 4477 return 0; 4478 4479 hole_size = ALIGN(hole_size, fs_info->sectorsize); 4480 ret = btrfs_insert_file_extent(trans, log, ino, hole_start, 0, 0, 4481 hole_size, 0, hole_size, 0, 0, 0); 4482 return ret; 4483 } 4484 4485 /* 4486 * When we are logging a new inode X, check if it doesn't have a reference that 4487 * matches the reference from some other inode Y created in a past transaction 4488 * and that was renamed in the current transaction. If we don't do this, then at 4489 * log replay time we can lose inode Y (and all its files if it's a directory): 4490 * 4491 * mkdir /mnt/x 4492 * echo "hello world" > /mnt/x/foobar 4493 * sync 4494 * mv /mnt/x /mnt/y 4495 * mkdir /mnt/x # or touch /mnt/x 4496 * xfs_io -c fsync /mnt/x 4497 * <power fail> 4498 * mount fs, trigger log replay 4499 * 4500 * After the log replay procedure, we would lose the first directory and all its 4501 * files (file foobar). 
4502 * For the case where inode Y is not a directory we simply end up losing it: 4503 * 4504 * echo "123" > /mnt/foo 4505 * sync 4506 * mv /mnt/foo /mnt/bar 4507 * echo "abc" > /mnt/foo 4508 * xfs_io -c fsync /mnt/foo 4509 * <power fail> 4510 * 4511 * We also need this for cases where a snapshot entry is replaced by some other 4512 * entry (file or directory) otherwise we end up with an unreplayable log due to 4513 * attempts to delete the snapshot entry (entry of type BTRFS_ROOT_ITEM_KEY) as 4514 * if it were a regular entry: 4515 * 4516 * mkdir /mnt/x 4517 * btrfs subvolume snapshot /mnt /mnt/x/snap 4518 * btrfs subvolume delete /mnt/x/snap 4519 * rmdir /mnt/x 4520 * mkdir /mnt/x 4521 * fsync /mnt/x or fsync some new file inside it 4522 * <power fail> 4523 * 4524 * The snapshot delete, rmdir of x, mkdir of a new x and the fsync all happen in 4525 * the same transaction. 4526 */ 4527 static int btrfs_check_ref_name_override(struct extent_buffer *eb, 4528 const int slot, 4529 const struct btrfs_key *key, 4530 struct btrfs_inode *inode, 4531 u64 *other_ino) 4532 { 4533 int ret; 4534 struct btrfs_path *search_path; 4535 char *name = NULL; 4536 u32 name_len = 0; 4537 u32 item_size = btrfs_item_size_nr(eb, slot); 4538 u32 cur_offset = 0; 4539 unsigned long ptr = btrfs_item_ptr_offset(eb, slot); 4540 4541 search_path = btrfs_alloc_path(); 4542 if (!search_path) 4543 return -ENOMEM; 4544 search_path->search_commit_root = 1; 4545 search_path->skip_locking = 1; 4546 4547 while (cur_offset < item_size) { 4548 u64 parent; 4549 u32 this_name_len; 4550 u32 this_len; 4551 unsigned long name_ptr; 4552 struct btrfs_dir_item *di; 4553 4554 if (key->type == BTRFS_INODE_REF_KEY) { 4555 struct btrfs_inode_ref *iref; 4556 4557 iref = (struct btrfs_inode_ref *)(ptr + cur_offset); 4558 parent = key->offset; 4559 this_name_len = btrfs_inode_ref_name_len(eb, iref); 4560 name_ptr = (unsigned long)(iref + 1); 4561 this_len = sizeof(*iref) + this_name_len; 4562 } else { 4563 struct btrfs_inode_extref *extref; 4564 4565 extref = (struct btrfs_inode_extref *)(ptr + 4566 cur_offset); 4567 parent = btrfs_inode_extref_parent(eb, extref); 4568 this_name_len = btrfs_inode_extref_name_len(eb, extref); 4569 name_ptr = (unsigned long)&extref->name; 4570 this_len = sizeof(*extref) + this_name_len; 4571 } 4572 4573 ret = btrfs_is_name_len_valid(eb, slot, name_ptr, 4574 this_name_len); 4575 if (!ret) { 4576 ret = -EIO; 4577 goto out; 4578 } 4579 if (this_name_len > name_len) { 4580 char *new_name; 4581 4582 new_name = krealloc(name, this_name_len, GFP_NOFS); 4583 if (!new_name) { 4584 ret = -ENOMEM; 4585 goto out; 4586 } 4587 name_len = this_name_len; 4588 name = new_name; 4589 } 4590 4591 read_extent_buffer(eb, name, name_ptr, this_name_len); 4592 di = btrfs_lookup_dir_item(NULL, inode->root, search_path, 4593 parent, name, this_name_len, 0); 4594 if (di && !IS_ERR(di)) { 4595 struct btrfs_key di_key; 4596 4597 btrfs_dir_item_key_to_cpu(search_path->nodes[0], 4598 di, &di_key); 4599 if (di_key.type == BTRFS_INODE_ITEM_KEY) { 4600 ret = 1; 4601 *other_ino = di_key.objectid; 4602 } else { 4603 ret = -EAGAIN; 4604 } 4605 goto out; 4606 } else if (IS_ERR(di)) { 4607 ret = PTR_ERR(di); 4608 goto out; 4609 } 4610 btrfs_release_path(search_path); 4611 4612 cur_offset += this_len; 4613 } 4614 ret = 0; 4615 out: 4616 btrfs_free_path(search_path); 4617 kfree(name); 4618 return ret; 4619 } 4620 4621 /* log a single inode in the tree log. 
4622 * At least one parent directory for this inode must exist in the tree 4623 * or be logged already. 4624 * 4625 * Any items from this inode changed by the current transaction are copied 4626 * to the log tree. An extra reference is taken on any extents in this 4627 * file, allowing us to avoid a whole pile of corner cases around logging 4628 * blocks that have been removed from the tree. 4629 * 4630 * See LOG_INODE_ALL and related defines for a description of what inode_only 4631 * does. 4632 * 4633 * This handles both files and directories. 4634 */ 4635 static int btrfs_log_inode(struct btrfs_trans_handle *trans, 4636 struct btrfs_root *root, struct btrfs_inode *inode, 4637 int inode_only, 4638 const loff_t start, 4639 const loff_t end, 4640 struct btrfs_log_ctx *ctx) 4641 { 4642 struct btrfs_fs_info *fs_info = root->fs_info; 4643 struct btrfs_path *path; 4644 struct btrfs_path *dst_path; 4645 struct btrfs_key min_key; 4646 struct btrfs_key max_key; 4647 struct btrfs_root *log = root->log_root; 4648 struct extent_buffer *src = NULL; 4649 LIST_HEAD(logged_list); 4650 u64 last_extent = 0; 4651 int err = 0; 4652 int ret; 4653 int nritems; 4654 int ins_start_slot = 0; 4655 int ins_nr; 4656 bool fast_search = false; 4657 u64 ino = btrfs_ino(inode); 4658 struct extent_map_tree *em_tree = &inode->extent_tree; 4659 u64 logged_isize = 0; 4660 bool need_log_inode_item = true; 4661 4662 path = btrfs_alloc_path(); 4663 if (!path) 4664 return -ENOMEM; 4665 dst_path = btrfs_alloc_path(); 4666 if (!dst_path) { 4667 btrfs_free_path(path); 4668 return -ENOMEM; 4669 } 4670 4671 min_key.objectid = ino; 4672 min_key.type = BTRFS_INODE_ITEM_KEY; 4673 min_key.offset = 0; 4674 4675 max_key.objectid = ino; 4676 4677 4678 /* today the code can only do partial logging of directories */ 4679 if (S_ISDIR(inode->vfs_inode.i_mode) || 4680 (!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 4681 &inode->runtime_flags) && 4682 inode_only >= LOG_INODE_EXISTS)) 4683 max_key.type = BTRFS_XATTR_ITEM_KEY; 4684 else 4685 max_key.type = (u8)-1; 4686 max_key.offset = (u64)-1; 4687 4688 /* 4689 * Only run delayed items if we are a dir or a new file. 4690 * Otherwise commit the delayed inode only, which is needed in 4691 * order for the log replay code to mark inodes for link count 4692 * fixup (create temporary BTRFS_TREE_LOG_FIXUP_OBJECTID items). 4693 */ 4694 if (S_ISDIR(inode->vfs_inode.i_mode) || 4695 inode->generation > fs_info->last_trans_committed) 4696 ret = btrfs_commit_inode_delayed_items(trans, inode); 4697 else 4698 ret = btrfs_commit_inode_delayed_inode(inode); 4699 4700 if (ret) { 4701 btrfs_free_path(path); 4702 btrfs_free_path(dst_path); 4703 return ret; 4704 } 4705 4706 if (inode_only == LOG_OTHER_INODE) { 4707 inode_only = LOG_INODE_EXISTS; 4708 mutex_lock_nested(&inode->log_mutex, SINGLE_DEPTH_NESTING); 4709 } else { 4710 mutex_lock(&inode->log_mutex); 4711 } 4712 4713 /* 4714 * a brute force approach to making sure we get the most uptodate 4715 * copies of everything. 4716 */ 4717 if (S_ISDIR(inode->vfs_inode.i_mode)) { 4718 int max_key_type = BTRFS_DIR_LOG_INDEX_KEY; 4719 4720 if (inode_only == LOG_INODE_EXISTS) 4721 max_key_type = BTRFS_XATTR_ITEM_KEY; 4722 ret = drop_objectid_items(trans, log, path, ino, max_key_type); 4723 } else { 4724 if (inode_only == LOG_INODE_EXISTS) { 4725 /* 4726 * Make sure the new inode item we write to the log has 4727 * the same isize as the current one (if it exists). 
4728 * This is necessary to prevent data loss after log 4729 * replay, and also to prevent doing a wrong expanding 4730 * truncate - e.g. create a file, write 4K into offset 4731 * 0, fsync, write 4K into offset 4096, add hard link, 4732 * fsync some other file (to sync log), power fail - if 4733 * we use the inode's current i_size, after log replay 4734 * we get an 8Kb file, with the last 4Kb extent as a hole 4735 * (zeroes), as if an expanding truncate happened, 4736 * instead of getting a file of 4Kb only. 4737 */ 4738 err = logged_inode_size(log, inode, path, &logged_isize); 4739 if (err) 4740 goto out_unlock; 4741 } 4742 if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 4743 &inode->runtime_flags)) { 4744 if (inode_only == LOG_INODE_EXISTS) { 4745 max_key.type = BTRFS_XATTR_ITEM_KEY; 4746 ret = drop_objectid_items(trans, log, path, ino, 4747 max_key.type); 4748 } else { 4749 clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 4750 &inode->runtime_flags); 4751 clear_bit(BTRFS_INODE_COPY_EVERYTHING, 4752 &inode->runtime_flags); 4753 while (1) { 4754 ret = btrfs_truncate_inode_items(trans, 4755 log, &inode->vfs_inode, 0, 0); 4756 if (ret != -EAGAIN) 4757 break; 4758 } 4759 } 4760 } else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING, 4761 &inode->runtime_flags) || 4762 inode_only == LOG_INODE_EXISTS) { 4763 if (inode_only == LOG_INODE_ALL) 4764 fast_search = true; 4765 max_key.type = BTRFS_XATTR_ITEM_KEY; 4766 ret = drop_objectid_items(trans, log, path, ino, 4767 max_key.type); 4768 } else { 4769 if (inode_only == LOG_INODE_ALL) 4770 fast_search = true; 4771 goto log_extents; 4772 } 4773 4774 } 4775 if (ret) { 4776 err = ret; 4777 goto out_unlock; 4778 } 4779 4780 while (1) { 4781 ins_nr = 0; 4782 ret = btrfs_search_forward(root, &min_key, 4783 path, trans->transid); 4784 if (ret < 0) { 4785 err = ret; 4786 goto out_unlock; 4787 } 4788 if (ret != 0) 4789 break; 4790 again: 4791 /* note, ins_nr might be > 0 here, cleanup outside the loop */ 4792 if (min_key.objectid != ino) 4793 break; 4794 if (min_key.type > max_key.type) 4795 break; 4796 4797 if (min_key.type == BTRFS_INODE_ITEM_KEY) 4798 need_log_inode_item = false; 4799 4800 if ((min_key.type == BTRFS_INODE_REF_KEY || 4801 min_key.type == BTRFS_INODE_EXTREF_KEY) && 4802 inode->generation == trans->transid) { 4803 u64 other_ino = 0; 4804 4805 ret = btrfs_check_ref_name_override(path->nodes[0], 4806 path->slots[0], &min_key, inode, 4807 &other_ino); 4808 if (ret < 0) { 4809 err = ret; 4810 goto out_unlock; 4811 } else if (ret > 0 && ctx && 4812 other_ino != btrfs_ino(BTRFS_I(ctx->inode))) { 4813 struct btrfs_key inode_key; 4814 struct inode *other_inode; 4815 4816 if (ins_nr > 0) { 4817 ins_nr++; 4818 } else { 4819 ins_nr = 1; 4820 ins_start_slot = path->slots[0]; 4821 } 4822 ret = copy_items(trans, inode, dst_path, path, 4823 &last_extent, ins_start_slot, 4824 ins_nr, inode_only, 4825 logged_isize); 4826 if (ret < 0) { 4827 err = ret; 4828 goto out_unlock; 4829 } 4830 ins_nr = 0; 4831 btrfs_release_path(path); 4832 inode_key.objectid = other_ino; 4833 inode_key.type = BTRFS_INODE_ITEM_KEY; 4834 inode_key.offset = 0; 4835 other_inode = btrfs_iget(fs_info->sb, 4836 &inode_key, root, 4837 NULL); 4838 /* 4839 * If the other inode that had a conflicting dir 4840 * entry was deleted in the current transaction, 4841 * we don't need to do more work nor fall back to 4842 * a transaction commit.
4843 */ 4844 if (IS_ERR(other_inode) && 4845 PTR_ERR(other_inode) == -ENOENT) { 4846 goto next_key; 4847 } else if (IS_ERR(other_inode)) { 4848 err = PTR_ERR(other_inode); 4849 goto out_unlock; 4850 } 4851 /* 4852 * We are safe logging the other inode without 4853 * acquiring its i_mutex as long as we log with 4854 * the LOG_INODE_EXISTS mode. We're safe against 4855 * concurrent renames of the other inode as well 4856 * because during a rename we pin the log and 4857 * update the log with the new name before we 4858 * unpin it. 4859 */ 4860 err = btrfs_log_inode(trans, root, 4861 BTRFS_I(other_inode), 4862 LOG_OTHER_INODE, 0, LLONG_MAX, 4863 ctx); 4864 iput(other_inode); 4865 if (err) 4866 goto out_unlock; 4867 else 4868 goto next_key; 4869 } 4870 } 4871 4872 /* Skip xattrs, we log them later with btrfs_log_all_xattrs() */ 4873 if (min_key.type == BTRFS_XATTR_ITEM_KEY) { 4874 if (ins_nr == 0) 4875 goto next_slot; 4876 ret = copy_items(trans, inode, dst_path, path, 4877 &last_extent, ins_start_slot, 4878 ins_nr, inode_only, logged_isize); 4879 if (ret < 0) { 4880 err = ret; 4881 goto out_unlock; 4882 } 4883 ins_nr = 0; 4884 if (ret) { 4885 btrfs_release_path(path); 4886 continue; 4887 } 4888 goto next_slot; 4889 } 4890 4891 src = path->nodes[0]; 4892 if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) { 4893 ins_nr++; 4894 goto next_slot; 4895 } else if (!ins_nr) { 4896 ins_start_slot = path->slots[0]; 4897 ins_nr = 1; 4898 goto next_slot; 4899 } 4900 4901 ret = copy_items(trans, inode, dst_path, path, &last_extent, 4902 ins_start_slot, ins_nr, inode_only, 4903 logged_isize); 4904 if (ret < 0) { 4905 err = ret; 4906 goto out_unlock; 4907 } 4908 if (ret) { 4909 ins_nr = 0; 4910 btrfs_release_path(path); 4911 continue; 4912 } 4913 ins_nr = 1; 4914 ins_start_slot = path->slots[0]; 4915 next_slot: 4916 4917 nritems = btrfs_header_nritems(path->nodes[0]); 4918 path->slots[0]++; 4919 if (path->slots[0] < nritems) { 4920 btrfs_item_key_to_cpu(path->nodes[0], &min_key, 4921 path->slots[0]); 4922 goto again; 4923 } 4924 if (ins_nr) { 4925 ret = copy_items(trans, inode, dst_path, path, 4926 &last_extent, ins_start_slot, 4927 ins_nr, inode_only, logged_isize); 4928 if (ret < 0) { 4929 err = ret; 4930 goto out_unlock; 4931 } 4932 ret = 0; 4933 ins_nr = 0; 4934 } 4935 btrfs_release_path(path); 4936 next_key: 4937 if (min_key.offset < (u64)-1) { 4938 min_key.offset++; 4939 } else if (min_key.type < max_key.type) { 4940 min_key.type++; 4941 min_key.offset = 0; 4942 } else { 4943 break; 4944 } 4945 } 4946 if (ins_nr) { 4947 ret = copy_items(trans, inode, dst_path, path, &last_extent, 4948 ins_start_slot, ins_nr, inode_only, 4949 logged_isize); 4950 if (ret < 0) { 4951 err = ret; 4952 goto out_unlock; 4953 } 4954 ret = 0; 4955 ins_nr = 0; 4956 } 4957 4958 btrfs_release_path(path); 4959 btrfs_release_path(dst_path); 4960 err = btrfs_log_all_xattrs(trans, root, inode, path, dst_path); 4961 if (err) 4962 goto out_unlock; 4963 if (max_key.type >= BTRFS_EXTENT_DATA_KEY && !fast_search) { 4964 btrfs_release_path(path); 4965 btrfs_release_path(dst_path); 4966 err = btrfs_log_trailing_hole(trans, root, inode, path); 4967 if (err) 4968 goto out_unlock; 4969 } 4970 log_extents: 4971 btrfs_release_path(path); 4972 btrfs_release_path(dst_path); 4973 if (need_log_inode_item) { 4974 err = log_inode_item(trans, log, dst_path, inode); 4975 if (err) 4976 goto out_unlock; 4977 } 4978 if (fast_search) { 4979 ret = btrfs_log_changed_extents(trans, root, inode, dst_path, 4980 &logged_list, ctx, start, end); 4981 if (ret) 
{ 4982 err = ret; 4983 goto out_unlock; 4984 } 4985 } else if (inode_only == LOG_INODE_ALL) { 4986 struct extent_map *em, *n; 4987 4988 write_lock(&em_tree->lock); 4989 /* 4990 * We can't just remove every em if we're called for a ranged 4991 * fsync - that is, one that doesn't cover the whole possible 4992 * file range (0 to LLONG_MAX). This is because we can have 4993 * em's that fall outside the range we're logging and therefore 4994 * their ordered operations haven't completed yet 4995 * (btrfs_finish_ordered_io() not invoked yet). This means we 4996 * didn't get their respective file extent item in the fs/subvol 4997 * tree yet, and need to let the next fast fsync (one which 4998 * consults the list of modified extent maps) find the em so 4999 * that it logs a matching file extent item and waits for the 5000 * respective ordered operation to complete (if it's still 5001 * running). 5002 * 5003 * Removing every em outside the range we're logging would make 5004 * the next fast fsync not log their matching file extent items, 5005 * therefore making us lose data after a log replay. 5006 */ 5007 list_for_each_entry_safe(em, n, &em_tree->modified_extents, 5008 list) { 5009 const u64 mod_end = em->mod_start + em->mod_len - 1; 5010 5011 if (em->mod_start >= start && mod_end <= end) 5012 list_del_init(&em->list); 5013 } 5014 write_unlock(&em_tree->lock); 5015 } 5016 5017 if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->vfs_inode.i_mode)) { 5018 ret = log_directory_changes(trans, root, inode, path, dst_path, 5019 ctx); 5020 if (ret) { 5021 err = ret; 5022 goto out_unlock; 5023 } 5024 } 5025 5026 spin_lock(&inode->lock); 5027 inode->logged_trans = trans->transid; 5028 inode->last_log_commit = inode->last_sub_trans; 5029 spin_unlock(&inode->lock); 5030 out_unlock: 5031 if (unlikely(err)) 5032 btrfs_put_logged_extents(&logged_list); 5033 else 5034 btrfs_submit_logged_extents(&logged_list, log); 5035 mutex_unlock(&inode->log_mutex); 5036 5037 btrfs_free_path(path); 5038 btrfs_free_path(dst_path); 5039 return err; 5040 } 5041 5042 /* 5043 * Check if we must fall back to a transaction commit when logging an inode. 5044 * This must be called after logging the inode and is used only in the context 5045 * when fsyncing an inode requires logging some other inode - in which 5046 * case we can't lock the i_mutex of each other inode we need to log as that 5047 * can lead to deadlocks with concurrent fsync against other inodes (as we can 5048 * log inodes up or down in the hierarchy) or rename operations for example. So 5049 * we take the log_mutex of the inode after we have logged it and then check for 5050 * its last_unlink_trans value - this is safe because any task setting 5051 * last_unlink_trans must take the log_mutex and it must do this before it does 5052 * the actual unlink operation, so if we do this check before a concurrent task 5053 * sets last_unlink_trans it means we've logged a consistent version/state of 5054 * all the inode items, otherwise we are not sure and must do a transaction 5055 * commit (the concurrent task might have only updated last_unlink_trans before 5056 * we logged the inode or it might have also done the unlink).
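 *
 * Illustrative interleaving (not part of the original source):
 *
 *	fsync task                      rename/unlink task
 *	----------                      ------------------
 *	logs the inode's items
 *	                                mutex_lock(log_mutex)
 *	                                last_unlink_trans = transid
 *	                                mutex_unlock(log_mutex)
 *	                                does the actual unlink
 *	mutex_lock(log_mutex)
 *	sees the new last_unlink_trans
 *	    -> returns true, full commit
 *	mutex_unlock(log_mutex)
 *
 * Because last_unlink_trans is set under log_mutex and before the unlink
 * itself, the check below either observes the update (and forces a full
 * commit) or the logged state predates the unlink and is consistent.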
5057 */ 5058 static bool btrfs_must_commit_transaction(struct btrfs_trans_handle *trans, 5059 struct btrfs_inode *inode) 5060 { 5061 struct btrfs_fs_info *fs_info = inode->root->fs_info; 5062 bool ret = false; 5063 5064 mutex_lock(&inode->log_mutex); 5065 if (inode->last_unlink_trans > fs_info->last_trans_committed) { 5066 /* 5067 * Make sure any commits to the log are forced to be full 5068 * commits. 5069 */ 5070 btrfs_set_log_full_commit(fs_info, trans); 5071 ret = true; 5072 } 5073 mutex_unlock(&inode->log_mutex); 5074 5075 return ret; 5076 } 5077 5078 /* 5079 * follow the dentry parent pointers up the chain and see if any 5080 * of the directories in it require a full commit before they can 5081 * be logged. Returns zero if nothing special needs to be done or 1 if 5082 * a full commit is required. 5083 */ 5084 static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans, 5085 struct btrfs_inode *inode, 5086 struct dentry *parent, 5087 struct super_block *sb, 5088 u64 last_committed) 5089 { 5090 int ret = 0; 5091 struct dentry *old_parent = NULL; 5092 struct btrfs_inode *orig_inode = inode; 5093 5094 /* 5095 * for regular files, if its inode is already on disk, we don't 5096 * have to worry about the parents at all. This is because 5097 * we can use the last_unlink_trans field to record renames 5098 * and other fun in this file. 5099 */ 5100 if (S_ISREG(inode->vfs_inode.i_mode) && 5101 inode->generation <= last_committed && 5102 inode->last_unlink_trans <= last_committed) 5103 goto out; 5104 5105 if (!S_ISDIR(inode->vfs_inode.i_mode)) { 5106 if (!parent || d_really_is_negative(parent) || sb != parent->d_sb) 5107 goto out; 5108 inode = BTRFS_I(d_inode(parent)); 5109 } 5110 5111 while (1) { 5112 /* 5113 * If we are logging a directory then we start with our inode, 5114 * not our parent's inode, so we need to skip setting the 5115 * logged_trans so that further down in the log code we don't 5116 * think this inode has already been logged. 5117 */ 5118 if (inode != orig_inode) 5119 inode->logged_trans = trans->transid; 5120 smp_mb(); 5121 5122 if (btrfs_must_commit_transaction(trans, inode)) { 5123 ret = 1; 5124 break; 5125 } 5126 5127 if (!parent || d_really_is_negative(parent) || sb != parent->d_sb) 5128 break; 5129 5130 if (IS_ROOT(parent)) { 5131 inode = BTRFS_I(d_inode(parent)); 5132 if (btrfs_must_commit_transaction(trans, inode)) 5133 ret = 1; 5134 break; 5135 } 5136 5137 parent = dget_parent(parent); 5138 dput(old_parent); 5139 old_parent = parent; 5140 inode = BTRFS_I(d_inode(parent)); 5141 5142 } 5143 dput(old_parent); 5144 out: 5145 return ret; 5146 } 5147 5148 struct btrfs_dir_list { 5149 u64 ino; 5150 struct list_head list; 5151 }; 5152 5153 /* 5154 * Log the inodes of the new dentries of a directory. See log_dir_items() for 5155 * details about why it is needed. 5156 * This is a recursive operation - if an existing dentry corresponds to a 5157 * directory, that directory's new entries are logged too (same behaviour as 5158 * ext3/4, xfs, f2fs, reiserfs, nilfs2).
Note that when logging the inodes 5159 * the dentries point to we do not lock their i_mutex, otherwise lockdep 5160 * complains about the following circular lock dependency / possible deadlock: 5161 * 5162 * CPU0 CPU1 5163 * ---- ---- 5164 * lock(&type->i_mutex_dir_key#3/2); 5165 * lock(sb_internal#2); 5166 * lock(&type->i_mutex_dir_key#3/2); 5167 * lock(&sb->s_type->i_mutex_key#14); 5168 * 5169 * Where sb_internal is the lock (a counter that works as a lock) acquired by 5170 * sb_start_intwrite() in btrfs_start_transaction(). 5171 * Not locking i_mutex of the inodes is still safe because: 5172 * 5173 * 1) For regular files we log with a mode of LOG_INODE_EXISTS. It's possible 5174 * that while logging the inode new references (names) are added or removed 5175 * from the inode, leaving the logged inode item with a link count that does 5176 * not match the number of logged inode reference items. This is fine because 5177 * at log replay time we compute the real number of links and correct the 5178 * link count in the inode item (see replay_one_buffer() and 5179 * link_to_fixup_dir()); 5180 * 5181 * 2) For directories we log with a mode of LOG_INODE_ALL. It's possible that 5182 * while logging the inode's items new items with keys BTRFS_DIR_ITEM_KEY and 5183 * BTRFS_DIR_INDEX_KEY are added to fs/subvol tree and the logged inode item 5184 * has a size that doesn't match the sum of the lengths of all the logged 5185 * names. This does not result in a problem because if a dir_item key is 5186 * logged but its matching dir_index key is not logged, at log replay time we 5187 * don't use it to replay the respective name (see replay_one_name()). On the 5188 * other hand if only the dir_index key ends up being logged, the respective 5189 * name is added to the fs/subvol tree with both the dir_item and dir_index 5190 * keys created (see replay_one_name()). 5191 * The directory's inode item with a wrong i_size is not a problem either, 5192 * since we don't use it at log replay time to set the i_size in the inode 5193 * item of the fs/subvol tree (see overwrite_item()).
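 *
 * Structurally (an illustrative sketch, not part of the original
 * source), log_new_dir_dentries() below is a breadth-first walk over
 * the new dentries, using dir_list as the queue:
 *
 *	queue = { start_inode }
 *	while queue is not empty:
 *		dir = dequeue()
 *		for each BTRFS_DIR_ITEM_KEY item of dir in the LOG tree:
 *			log the inode it points to (LOG_INODE_ALL for
 *			directories and symlinks, LOG_INODE_EXISTS
 *			otherwise)
 *			if that inode is a directory with new dentries,
 *			enqueue it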
5194 */ 5195 static int log_new_dir_dentries(struct btrfs_trans_handle *trans, 5196 struct btrfs_root *root, 5197 struct btrfs_inode *start_inode, 5198 struct btrfs_log_ctx *ctx) 5199 { 5200 struct btrfs_fs_info *fs_info = root->fs_info; 5201 struct btrfs_root *log = root->log_root; 5202 struct btrfs_path *path; 5203 LIST_HEAD(dir_list); 5204 struct btrfs_dir_list *dir_elem; 5205 int ret = 0; 5206 5207 path = btrfs_alloc_path(); 5208 if (!path) 5209 return -ENOMEM; 5210 5211 dir_elem = kmalloc(sizeof(*dir_elem), GFP_NOFS); 5212 if (!dir_elem) { 5213 btrfs_free_path(path); 5214 return -ENOMEM; 5215 } 5216 dir_elem->ino = btrfs_ino(start_inode); 5217 list_add_tail(&dir_elem->list, &dir_list); 5218 5219 while (!list_empty(&dir_list)) { 5220 struct extent_buffer *leaf; 5221 struct btrfs_key min_key; 5222 int nritems; 5223 int i; 5224 5225 dir_elem = list_first_entry(&dir_list, struct btrfs_dir_list, 5226 list); 5227 if (ret) 5228 goto next_dir_inode; 5229 5230 min_key.objectid = dir_elem->ino; 5231 min_key.type = BTRFS_DIR_ITEM_KEY; 5232 min_key.offset = 0; 5233 again: 5234 btrfs_release_path(path); 5235 ret = btrfs_search_forward(log, &min_key, path, trans->transid); 5236 if (ret < 0) { 5237 goto next_dir_inode; 5238 } else if (ret > 0) { 5239 ret = 0; 5240 goto next_dir_inode; 5241 } 5242 5243 process_leaf: 5244 leaf = path->nodes[0]; 5245 nritems = btrfs_header_nritems(leaf); 5246 for (i = path->slots[0]; i < nritems; i++) { 5247 struct btrfs_dir_item *di; 5248 struct btrfs_key di_key; 5249 struct inode *di_inode; 5250 struct btrfs_dir_list *new_dir_elem; 5251 int log_mode = LOG_INODE_EXISTS; 5252 int type; 5253 5254 btrfs_item_key_to_cpu(leaf, &min_key, i); 5255 if (min_key.objectid != dir_elem->ino || 5256 min_key.type != BTRFS_DIR_ITEM_KEY) 5257 goto next_dir_inode; 5258 5259 di = btrfs_item_ptr(leaf, i, struct btrfs_dir_item); 5260 type = btrfs_dir_type(leaf, di); 5261 if (btrfs_dir_transid(leaf, di) < trans->transid && 5262 type != BTRFS_FT_DIR) 5263 continue; 5264 btrfs_dir_item_key_to_cpu(leaf, di, &di_key); 5265 if (di_key.type == BTRFS_ROOT_ITEM_KEY) 5266 continue; 5267 5268 btrfs_release_path(path); 5269 di_inode = btrfs_iget(fs_info->sb, &di_key, root, NULL); 5270 if (IS_ERR(di_inode)) { 5271 ret = PTR_ERR(di_inode); 5272 goto next_dir_inode; 5273 } 5274 5275 if (btrfs_inode_in_log(BTRFS_I(di_inode), trans->transid)) { 5276 iput(di_inode); 5277 break; 5278 } 5279 5280 ctx->log_new_dentries = false; 5281 if (type == BTRFS_FT_DIR || type == BTRFS_FT_SYMLINK) 5282 log_mode = LOG_INODE_ALL; 5283 ret = btrfs_log_inode(trans, root, BTRFS_I(di_inode), 5284 log_mode, 0, LLONG_MAX, ctx); 5285 if (!ret && 5286 btrfs_must_commit_transaction(trans, BTRFS_I(di_inode))) 5287 ret = 1; 5288 iput(di_inode); 5289 if (ret) 5290 goto next_dir_inode; 5291 if (ctx->log_new_dentries) { 5292 new_dir_elem = kmalloc(sizeof(*new_dir_elem), 5293 GFP_NOFS); 5294 if (!new_dir_elem) { 5295 ret = -ENOMEM; 5296 goto next_dir_inode; 5297 } 5298 new_dir_elem->ino = di_key.objectid; 5299 list_add_tail(&new_dir_elem->list, &dir_list); 5300 } 5301 break; 5302 } 5303 if (i == nritems) { 5304 ret = btrfs_next_leaf(log, path); 5305 if (ret < 0) { 5306 goto next_dir_inode; 5307 } else if (ret > 0) { 5308 ret = 0; 5309 goto next_dir_inode; 5310 } 5311 goto process_leaf; 5312 } 5313 if (min_key.offset < (u64)-1) { 5314 min_key.offset++; 5315 goto again; 5316 } 5317 next_dir_inode: 5318 list_del(&dir_elem->list); 5319 kfree(dir_elem); 5320 } 5321 5322 btrfs_free_path(path); 5323 return ret; 5324 } 5325 5326 static int 
btrfs_log_all_parents(struct btrfs_trans_handle *trans, 5327 struct btrfs_inode *inode, 5328 struct btrfs_log_ctx *ctx) 5329 { 5330 struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); 5331 int ret; 5332 struct btrfs_path *path; 5333 struct btrfs_key key; 5334 struct btrfs_root *root = inode->root; 5335 const u64 ino = btrfs_ino(inode); 5336 5337 path = btrfs_alloc_path(); 5338 if (!path) 5339 return -ENOMEM; 5340 path->skip_locking = 1; 5341 path->search_commit_root = 1; 5342 5343 key.objectid = ino; 5344 key.type = BTRFS_INODE_REF_KEY; 5345 key.offset = 0; 5346 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 5347 if (ret < 0) 5348 goto out; 5349 5350 while (true) { 5351 struct extent_buffer *leaf = path->nodes[0]; 5352 int slot = path->slots[0]; 5353 u32 cur_offset = 0; 5354 u32 item_size; 5355 unsigned long ptr; 5356 5357 if (slot >= btrfs_header_nritems(leaf)) { 5358 ret = btrfs_next_leaf(root, path); 5359 if (ret < 0) 5360 goto out; 5361 else if (ret > 0) 5362 break; 5363 continue; 5364 } 5365 5366 btrfs_item_key_to_cpu(leaf, &key, slot); 5367 /* BTRFS_INODE_EXTREF_KEY is BTRFS_INODE_REF_KEY + 1 */ 5368 if (key.objectid != ino || key.type > BTRFS_INODE_EXTREF_KEY) 5369 break; 5370 5371 item_size = btrfs_item_size_nr(leaf, slot); 5372 ptr = btrfs_item_ptr_offset(leaf, slot); 5373 while (cur_offset < item_size) { 5374 struct btrfs_key inode_key; 5375 struct inode *dir_inode; 5376 5377 inode_key.type = BTRFS_INODE_ITEM_KEY; 5378 inode_key.offset = 0; 5379 5380 if (key.type == BTRFS_INODE_EXTREF_KEY) { 5381 struct btrfs_inode_extref *extref; 5382 5383 extref = (struct btrfs_inode_extref *) 5384 (ptr + cur_offset); 5385 inode_key.objectid = btrfs_inode_extref_parent( 5386 leaf, extref); 5387 cur_offset += sizeof(*extref); 5388 cur_offset += btrfs_inode_extref_name_len(leaf, 5389 extref); 5390 } else { 5391 inode_key.objectid = key.offset; 5392 cur_offset = item_size; 5393 } 5394 5395 dir_inode = btrfs_iget(fs_info->sb, &inode_key, 5396 root, NULL); 5397 /* If parent inode was deleted, skip it. */ 5398 if (IS_ERR(dir_inode)) 5399 continue; 5400 5401 if (ctx) 5402 ctx->log_new_dentries = false; 5403 ret = btrfs_log_inode(trans, root, BTRFS_I(dir_inode), 5404 LOG_INODE_ALL, 0, LLONG_MAX, ctx); 5405 if (!ret && 5406 btrfs_must_commit_transaction(trans, BTRFS_I(dir_inode))) 5407 ret = 1; 5408 if (!ret && ctx && ctx->log_new_dentries) 5409 ret = log_new_dir_dentries(trans, root, 5410 BTRFS_I(dir_inode), ctx); 5411 iput(dir_inode); 5412 if (ret) 5413 goto out; 5414 } 5415 path->slots[0]++; 5416 } 5417 ret = 0; 5418 out: 5419 btrfs_free_path(path); 5420 return ret; 5421 } 5422 5423 /* 5424 * helper function around btrfs_log_inode to make sure newly created 5425 * parent directories also end up in the log. Minimal logging (inode item 5426 * and backrefs only) is done for any parent directories that are older 5427 * than the last committed transaction. 5428 */ 5429 static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, 5430 struct btrfs_root *root, 5431 struct btrfs_inode *inode, 5432 struct dentry *parent, 5433 const loff_t start, 5434 const loff_t end, 5435 int exists_only, 5436 struct btrfs_log_ctx *ctx) 5437 { 5438 struct btrfs_fs_info *fs_info = root->fs_info; 5439 int inode_only = exists_only ?
LOG_INODE_EXISTS : LOG_INODE_ALL; 5440 struct super_block *sb; 5441 struct dentry *old_parent = NULL; 5442 int ret = 0; 5443 u64 last_committed = fs_info->last_trans_committed; 5444 bool log_dentries = false; 5445 struct btrfs_inode *orig_inode = inode; 5446 5447 sb = inode->vfs_inode.i_sb; 5448 5449 if (btrfs_test_opt(fs_info, NOTREELOG)) { 5450 ret = 1; 5451 goto end_no_trans; 5452 } 5453 5454 /* 5455 * If the previous transaction commit didn't complete, we must do a 5456 * full commit ourselves. 5457 */ 5458 if (fs_info->last_trans_log_full_commit > 5459 fs_info->last_trans_committed) { 5460 ret = 1; 5461 goto end_no_trans; 5462 } 5463 5464 if (root != inode->root || btrfs_root_refs(&root->root_item) == 0) { 5465 ret = 1; 5466 goto end_no_trans; 5467 } 5468 5469 ret = check_parent_dirs_for_sync(trans, inode, parent, sb, 5470 last_committed); 5471 if (ret) 5472 goto end_no_trans; 5473 5474 if (btrfs_inode_in_log(inode, trans->transid)) { 5475 ret = BTRFS_NO_LOG_SYNC; 5476 goto end_no_trans; 5477 } 5478 5479 ret = start_log_trans(trans, root, ctx); 5480 if (ret) 5481 goto end_no_trans; 5482 5483 ret = btrfs_log_inode(trans, root, inode, inode_only, start, end, ctx); 5484 if (ret) 5485 goto end_trans; 5486 5487 /* 5488 * for regular files, if its inode is already on disk, we don't 5489 * have to worry about the parents at all. This is because 5490 * we can use the last_unlink_trans field to record renames 5491 * and other fun in this file. 5492 */ 5493 if (S_ISREG(inode->vfs_inode.i_mode) && 5494 inode->generation <= last_committed && 5495 inode->last_unlink_trans <= last_committed) { 5496 ret = 0; 5497 goto end_trans; 5498 } 5499 5500 if (S_ISDIR(inode->vfs_inode.i_mode) && ctx && ctx->log_new_dentries) 5501 log_dentries = true; 5502 5503 /* 5504 * On unlink we must make sure all our current and old parent directory 5505 * inodes are fully logged. This is to prevent leaving dangling 5506 * directory index entries in directories that were our parents but are 5507 * not anymore. Not doing this results in the old parent directory being 5508 * impossible to delete after log replay (rmdir will always fail with 5509 * error -ENOTEMPTY). 5510 * 5511 * Example 1: 5512 * 5513 * mkdir testdir 5514 * touch testdir/foo 5515 * ln testdir/foo testdir/bar 5516 * sync 5517 * unlink testdir/bar 5518 * xfs_io -c fsync testdir/foo 5519 * <power failure> 5520 * mount fs, triggers log replay 5521 * 5522 * If we don't log the parent directory (testdir), after log replay the 5523 * directory still has an entry pointing to the file inode using the bar 5524 * name, but a matching BTRFS_INODE_[REF|EXTREF]_KEY does not exist and 5525 * the file inode has a link count of 1. 5526 * 5527 * Example 2: 5528 * 5529 * mkdir testdir 5530 * touch foo 5531 * ln foo testdir/foo2 5532 * ln foo testdir/foo3 5533 * sync 5534 * unlink testdir/foo3 5535 * xfs_io -c fsync foo 5536 * <power failure> 5537 * mount fs, triggers log replay 5538 * 5539 * Similar to the first example, after log replay the parent directory 5540 * testdir still has an entry pointing to the file inode with name foo3 5541 * but the file inode does not have a matching BTRFS_INODE_REF_KEY item 5542 * and has a link count of 2.
5543 */ 5544 if (inode->last_unlink_trans > last_committed) { 5545 ret = btrfs_log_all_parents(trans, orig_inode, ctx); 5546 if (ret) 5547 goto end_trans; 5548 } 5549 5550 while (1) { 5551 if (!parent || d_really_is_negative(parent) || sb != parent->d_sb) 5552 break; 5553 5554 inode = BTRFS_I(d_inode(parent)); 5555 if (root != inode->root) 5556 break; 5557 5558 if (inode->generation > last_committed) { 5559 ret = btrfs_log_inode(trans, root, inode, 5560 LOG_INODE_EXISTS, 0, LLONG_MAX, ctx); 5561 if (ret) 5562 goto end_trans; 5563 } 5564 if (IS_ROOT(parent)) 5565 break; 5566 5567 parent = dget_parent(parent); 5568 dput(old_parent); 5569 old_parent = parent; 5570 } 5571 if (log_dentries) 5572 ret = log_new_dir_dentries(trans, root, orig_inode, ctx); 5573 else 5574 ret = 0; 5575 end_trans: 5576 dput(old_parent); 5577 if (ret < 0) { 5578 btrfs_set_log_full_commit(fs_info, trans); 5579 ret = 1; 5580 } 5581 5582 if (ret) 5583 btrfs_remove_log_ctx(root, ctx); 5584 btrfs_end_log_trans(root); 5585 end_no_trans: 5586 return ret; 5587 } 5588 5589 /* 5590 * it is not safe to log a dentry if the chunk root has added new 5591 * chunks. This returns 0 if the dentry was logged, and 1 otherwise. 5592 * If this returns 1, you must commit the transaction to safely get your 5593 * data on disk. 5594 */ 5595 int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, 5596 struct btrfs_root *root, struct dentry *dentry, 5597 const loff_t start, 5598 const loff_t end, 5599 struct btrfs_log_ctx *ctx) 5600 { 5601 struct dentry *parent = dget_parent(dentry); 5602 int ret; 5603 5604 ret = btrfs_log_inode_parent(trans, root, BTRFS_I(d_inode(dentry)), 5605 parent, start, end, 0, ctx); 5606 dput(parent); 5607 5608 return ret; 5609 } 5610 5611 /* 5612 * should be called during mount to replay any log trees 5613 * from the FS 5614 */ 5615 int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) 5616 { 5617 int ret; 5618 struct btrfs_path *path; 5619 struct btrfs_trans_handle *trans; 5620 struct btrfs_key key; 5621 struct btrfs_key found_key; 5622 struct btrfs_key tmp_key; 5623 struct btrfs_root *log; 5624 struct btrfs_fs_info *fs_info = log_root_tree->fs_info; 5625 struct walk_control wc = { 5626 .process_func = process_one_buffer, 5627 .stage = 0, 5628 }; 5629 5630 path = btrfs_alloc_path(); 5631 if (!path) 5632 return -ENOMEM; 5633 5634 set_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags); 5635 5636 trans = btrfs_start_transaction(fs_info->tree_root, 0); 5637 if (IS_ERR(trans)) { 5638 ret = PTR_ERR(trans); 5639 goto error; 5640 } 5641 5642 wc.trans = trans; 5643 wc.pin = 1; 5644 5645 ret = walk_log_tree(trans, log_root_tree, &wc); 5646 if (ret) { 5647 btrfs_handle_fs_error(fs_info, ret, 5648 "Failed to pin buffers while recovering log root tree."); 5649 goto error; 5650 } 5651 5652 again: 5653 key.objectid = BTRFS_TREE_LOG_OBJECTID; 5654 key.offset = (u64)-1; 5655 key.type = BTRFS_ROOT_ITEM_KEY; 5656 5657 while (1) { 5658 ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0); 5659 5660 if (ret < 0) { 5661 btrfs_handle_fs_error(fs_info, ret, 5662 "Couldn't find tree log root."); 5663 goto error; 5664 } 5665 if (ret > 0) { 5666 if (path->slots[0] == 0) 5667 break; 5668 path->slots[0]--; 5669 } 5670 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 5671 path->slots[0]); 5672 btrfs_release_path(path); 5673 if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID) 5674 break; 5675 5676 log = btrfs_read_fs_root(log_root_tree, &found_key); 5677 if (IS_ERR(log)) { 5678 ret = PTR_ERR(log); 5679
btrfs_handle_fs_error(fs_info, ret, 5680 "Couldn't read tree log root."); 5681 goto error; 5682 } 5683 5684 tmp_key.objectid = found_key.offset; 5685 tmp_key.type = BTRFS_ROOT_ITEM_KEY; 5686 tmp_key.offset = (u64)-1; 5687 5688 wc.replay_dest = btrfs_read_fs_root_no_name(fs_info, &tmp_key); 5689 if (IS_ERR(wc.replay_dest)) { 5690 ret = PTR_ERR(wc.replay_dest); 5691 free_extent_buffer(log->node); 5692 free_extent_buffer(log->commit_root); 5693 kfree(log); 5694 btrfs_handle_fs_error(fs_info, ret, 5695 "Couldn't read target root for tree log recovery."); 5696 goto error; 5697 } 5698 5699 wc.replay_dest->log_root = log; 5700 btrfs_record_root_in_trans(trans, wc.replay_dest); 5701 ret = walk_log_tree(trans, log, &wc); 5702 5703 if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) { 5704 ret = fixup_inode_link_counts(trans, wc.replay_dest, 5705 path); 5706 } 5707 5708 key.offset = found_key.offset - 1; 5709 wc.replay_dest->log_root = NULL; 5710 free_extent_buffer(log->node); 5711 free_extent_buffer(log->commit_root); 5712 kfree(log); 5713 5714 if (ret) 5715 goto error; 5716 5717 if (found_key.offset == 0) 5718 break; 5719 } 5720 btrfs_release_path(path); 5721 5722 /* step one is to pin it all, step two is to replay just inodes */ 5723 if (wc.pin) { 5724 wc.pin = 0; 5725 wc.process_func = replay_one_buffer; 5726 wc.stage = LOG_WALK_REPLAY_INODES; 5727 goto again; 5728 } 5729 /* step three is to replay everything */ 5730 if (wc.stage < LOG_WALK_REPLAY_ALL) { 5731 wc.stage++; 5732 goto again; 5733 } 5734 5735 btrfs_free_path(path); 5736 5737 /* step 4: commit the transaction, which also unpins the blocks */ 5738 ret = btrfs_commit_transaction(trans); 5739 if (ret) 5740 return ret; 5741 5742 free_extent_buffer(log_root_tree->node); 5743 log_root_tree->log_root = NULL; 5744 clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags); 5745 kfree(log_root_tree); 5746 5747 return 0; 5748 error: 5749 if (wc.trans) 5750 btrfs_end_transaction(wc.trans); 5751 btrfs_free_path(path); 5752 return ret; 5753 } 5754 5755 /* 5756 * there are some corner cases where we want to force a full 5757 * commit instead of allowing a directory to be logged. 5758 * 5759 * They revolve around files that were unlinked from the directory, and 5760 * this function updates the parent directory so that a full commit is 5761 * properly done if it is fsync'd later after the unlinks are done. 5762 * 5763 * Must be called before the unlink operations (updates to the subvolume tree, 5764 * inodes, etc) are done. 5765 */ 5766 void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans, 5767 struct btrfs_inode *dir, struct btrfs_inode *inode, 5768 int for_rename) 5769 { 5770 /* 5771 * when we're logging a file, if it hasn't been renamed 5772 * or unlinked, and its inode is fully committed on disk, 5773 * we don't have to worry about walking up the directory chain 5774 * to log its parents. 5775 * 5776 * So, we use the last_unlink_trans field to put this transid 5777 * into the file. When the file is logged we check it and 5778 * don't log the parents if the file is fully on disk.
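 *
 * Illustrative sequence (not part of the original source):
 *
 *	touch dir/file
 *	sync                        (file is fully on disk)
 *	mv dir/file dir/file2       (sets last_unlink_trans)
 *	xfs_io -c fsync dir/file2
 *
 * Since last_unlink_trans is now newer than the last committed
 * transaction, the fsync cannot take the shortcut above and ends up
 * logging the parent directories as well, so replay does not leave a
 * stale "file" entry behind in dir.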
5779 */ 5780 mutex_lock(&inode->log_mutex); 5781 inode->last_unlink_trans = trans->transid; 5782 mutex_unlock(&inode->log_mutex); 5783 5784 /* 5785 * if this directory was already logged any new 5786 * names for this file/dir will get recorded 5787 */ 5788 smp_mb(); 5789 if (dir->logged_trans == trans->transid) 5790 return; 5791 5792 /* 5793 * if the inode we're about to unlink was logged, 5794 * the log will be properly updated for any new names 5795 */ 5796 if (inode->logged_trans == trans->transid) 5797 return; 5798 5799 /* 5800 * when renaming files across directories, if the directory 5801 * we're unlinking from gets fsync'd later on, there's 5802 * no way to find the destination directory later and fsync it 5803 * properly. So, we have to be conservative and force commits 5804 * so the new name gets discovered. 5805 */ 5806 if (for_rename) 5807 goto record; 5808 5809 /* we can safely do the unlink without any special recording */ 5810 return; 5811 5812 record: 5813 mutex_lock(&dir->log_mutex); 5814 dir->last_unlink_trans = trans->transid; 5815 mutex_unlock(&dir->log_mutex); 5816 } 5817 5818 /* 5819 * Make sure that if someone attempts to fsync the parent directory of a deleted 5820 * snapshot, it ends up triggering a transaction commit. This is to guarantee 5821 * that after replaying the log tree of the parent directory's root we will not 5822 * see the snapshot anymore and at log replay time we will not see any log tree 5823 * corresponding to the deleted snapshot's root, which could lead to replaying 5824 * it after replaying the log tree of the parent directory (which would replay 5825 * the snapshot delete operation). 5826 * 5827 * Must be called before the actual snapshot destroy operation (updates to the 5828 * parent root and tree of tree roots trees, etc) are done. 5829 */ 5830 void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans, 5831 struct btrfs_inode *dir) 5832 { 5833 mutex_lock(&dir->log_mutex); 5834 dir->last_unlink_trans = trans->transid; 5835 mutex_unlock(&dir->log_mutex); 5836 } 5837 5838 /* 5839 * Call this after adding a new name for a file and it will properly 5840 * update the log to reflect the new name. 5841 * 5842 * It will return zero if all goes well, and it will return 1 if a 5843 * full transaction commit is required. 5844 */ 5845 int btrfs_log_new_name(struct btrfs_trans_handle *trans, 5846 struct btrfs_inode *inode, struct btrfs_inode *old_dir, 5847 struct dentry *parent) 5848 { 5849 struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); 5850 struct btrfs_root *root = inode->root; 5851 5852 /* 5853 * this will force the logging code to walk the dentry chain 5854 * up for the file 5855 */ 5856 if (S_ISREG(inode->vfs_inode.i_mode)) 5857 inode->last_unlink_trans = trans->transid; 5858 5859 /* 5860 * if this inode hasn't been logged and the directory we're renaming it 5861 * from hasn't been logged, we don't need to log it 5862 */ 5863 if (inode->logged_trans <= fs_info->last_trans_committed && 5864 (!old_dir || old_dir->logged_trans <= fs_info->last_trans_committed)) 5865 return 0; 5866 5867 return btrfs_log_inode_parent(trans, root, inode, parent, 0, 5868 LLONG_MAX, 1, NULL); 5869 } 5870 5871