1 /* 2 * Copyright (C) 2008 Oracle. All rights reserved. 3 * 4 * This program is free software; you can redistribute it and/or 5 * modify it under the terms of the GNU General Public 6 * License v2 as published by the Free Software Foundation. 7 * 8 * This program is distributed in the hope that it will be useful, 9 * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 11 * General Public License for more details. 12 * 13 * You should have received a copy of the GNU General Public 14 * License along with this program; if not, write to the 15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330, 16 * Boston, MA 021110-1307, USA. 17 */ 18 19 #include <linux/sched.h> 20 #include <linux/slab.h> 21 #include <linux/blkdev.h> 22 #include <linux/list_sort.h> 23 #include "tree-log.h" 24 #include "disk-io.h" 25 #include "locking.h" 26 #include "print-tree.h" 27 #include "backref.h" 28 #include "hash.h" 29 30 /* magic values for the inode_only field in btrfs_log_inode: 31 * 32 * LOG_INODE_ALL means to log everything 33 * LOG_INODE_EXISTS means to log just enough to recreate the inode 34 * during log replay 35 */ 36 #define LOG_INODE_ALL 0 37 #define LOG_INODE_EXISTS 1 38 39 /* 40 * directory trouble cases 41 * 42 * 1) on rename or unlink, if the inode being unlinked isn't in the fsync 43 * log, we must force a full commit before doing an fsync of the directory 44 * where the unlink was done. 45 * ---> record transid of last unlink/rename per directory 46 * 47 * mkdir foo/some_dir 48 * normal commit 49 * rename foo/some_dir foo2/some_dir 50 * mkdir foo/some_dir 51 * fsync foo/some_dir/some_file 52 * 53 * The fsync above will unlink the original some_dir without recording 54 * it in its new location (foo2). After a crash, some_dir will be gone 55 * unless the fsync of some_file forces a full commit 56 * 57 * 2) we must log any new names for any file or dir that is in the fsync 58 * log. ---> check inode while renaming/linking. 59 * 60 * 2a) we must log any new names for any file or dir during rename 61 * when the directory they are being removed from was logged. 62 * ---> check inode and old parent dir during rename 63 * 64 * 2a is actually the more important variant. With the extra logging 65 * a crash might unlink the old name without recreating the new one 66 * 67 * 3) after a crash, we must go through any directories with a link count 68 * of zero and redo the rm -rf 69 * 70 * mkdir f1/foo 71 * normal commit 72 * rm -rf f1/foo 73 * fsync(f1) 74 * 75 * The directory f1 was fully removed from the FS, but fsync was never 76 * called on f1, only its parent dir. After a crash the rm -rf must 77 * be replayed. This must be able to recurse down the entire 78 * directory tree. The inode link count fixup code takes care of the 79 * ugly details. 80 */ 81 82 /* 83 * stages for the tree walking. The first 84 * stage (0) is to only pin down the blocks we find 85 * the second stage (1) is to make sure that all the inodes 86 * we find in the log are created in the subvolume. 87 * 88 * The last stage is to deal with directories and links and extents 89 * and all the other fun semantics 90 */ 91 #define LOG_WALK_PIN_ONLY 0 92 #define LOG_WALK_REPLAY_INODES 1 93 #define LOG_WALK_REPLAY_DIR_INDEX 2 94 #define LOG_WALK_REPLAY_ALL 3 95 96 static int btrfs_log_inode(struct btrfs_trans_handle *trans, 97 struct btrfs_root *root, struct inode *inode, 98 int inode_only, 99 const loff_t start, 100 const loff_t end, 101 struct btrfs_log_ctx *ctx); 102 static int link_to_fixup_dir(struct btrfs_trans_handle *trans, 103 struct btrfs_root *root, 104 struct btrfs_path *path, u64 objectid); 105 static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, 106 struct btrfs_root *root, 107 struct btrfs_root *log, 108 struct btrfs_path *path, 109 u64 dirid, int del_all); 110 111 /* 112 * tree logging is a special write ahead log used to make sure that 113 * fsyncs and O_SYNCs can happen without doing full tree commits. 114 * 115 * Full tree commits are expensive because they require commonly 116 * modified blocks to be recowed, creating many dirty pages in the 117 * extent tree an 4x-6x higher write load than ext3. 118 * 119 * Instead of doing a tree commit on every fsync, we use the 120 * key ranges and transaction ids to find items for a given file or directory 121 * that have changed in this transaction. Those items are copied into 122 * a special tree (one per subvolume root), that tree is written to disk 123 * and then the fsync is considered complete. 124 * 125 * After a crash, items are copied out of the log-tree back into the 126 * subvolume tree. Any file data extents found are recorded in the extent 127 * allocation tree, and the log-tree freed. 128 * 129 * The log tree is read three times, once to pin down all the extents it is 130 * using in ram and once, once to create all the inodes logged in the tree 131 * and once to do all the other items. 132 */ 133 134 /* 135 * start a sub transaction and setup the log tree 136 * this increments the log tree writer count to make the people 137 * syncing the tree wait for us to finish 138 */ 139 static int start_log_trans(struct btrfs_trans_handle *trans, 140 struct btrfs_root *root, 141 struct btrfs_log_ctx *ctx) 142 { 143 int index; 144 int ret; 145 146 mutex_lock(&root->log_mutex); 147 if (root->log_root) { 148 if (btrfs_need_log_full_commit(root->fs_info, trans)) { 149 ret = -EAGAIN; 150 goto out; 151 } 152 if (!root->log_start_pid) { 153 root->log_start_pid = current->pid; 154 clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state); 155 } else if (root->log_start_pid != current->pid) { 156 set_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state); 157 } 158 159 atomic_inc(&root->log_batch); 160 atomic_inc(&root->log_writers); 161 if (ctx) { 162 index = root->log_transid % 2; 163 list_add_tail(&ctx->list, &root->log_ctxs[index]); 164 ctx->log_transid = root->log_transid; 165 } 166 mutex_unlock(&root->log_mutex); 167 return 0; 168 } 169 170 ret = 0; 171 mutex_lock(&root->fs_info->tree_log_mutex); 172 if (!root->fs_info->log_root_tree) 173 ret = btrfs_init_log_root_tree(trans, root->fs_info); 174 mutex_unlock(&root->fs_info->tree_log_mutex); 175 if (ret) 176 goto out; 177 178 if (!root->log_root) { 179 ret = btrfs_add_log_tree(trans, root); 180 if (ret) 181 goto out; 182 } 183 clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state); 184 root->log_start_pid = current->pid; 185 atomic_inc(&root->log_batch); 186 atomic_inc(&root->log_writers); 187 if (ctx) { 188 index = root->log_transid % 2; 189 list_add_tail(&ctx->list, &root->log_ctxs[index]); 190 ctx->log_transid = root->log_transid; 191 } 192 out: 193 mutex_unlock(&root->log_mutex); 194 return ret; 195 } 196 197 /* 198 * returns 0 if there was a log transaction running and we were able 199 * to join, or returns -ENOENT if there were not transactions 200 * in progress 201 */ 202 static int join_running_log_trans(struct btrfs_root *root) 203 { 204 int ret = -ENOENT; 205 206 smp_mb(); 207 if (!root->log_root) 208 return -ENOENT; 209 210 mutex_lock(&root->log_mutex); 211 if (root->log_root) { 212 ret = 0; 213 atomic_inc(&root->log_writers); 214 } 215 mutex_unlock(&root->log_mutex); 216 return ret; 217 } 218 219 /* 220 * This either makes the current running log transaction wait 221 * until you call btrfs_end_log_trans() or it makes any future 222 * log transactions wait until you call btrfs_end_log_trans() 223 */ 224 int btrfs_pin_log_trans(struct btrfs_root *root) 225 { 226 int ret = -ENOENT; 227 228 mutex_lock(&root->log_mutex); 229 atomic_inc(&root->log_writers); 230 mutex_unlock(&root->log_mutex); 231 return ret; 232 } 233 234 /* 235 * indicate we're done making changes to the log tree 236 * and wake up anyone waiting to do a sync 237 */ 238 void btrfs_end_log_trans(struct btrfs_root *root) 239 { 240 if (atomic_dec_and_test(&root->log_writers)) { 241 smp_mb(); 242 if (waitqueue_active(&root->log_writer_wait)) 243 wake_up(&root->log_writer_wait); 244 } 245 } 246 247 248 /* 249 * the walk control struct is used to pass state down the chain when 250 * processing the log tree. The stage field tells us which part 251 * of the log tree processing we are currently doing. The others 252 * are state fields used for that specific part 253 */ 254 struct walk_control { 255 /* should we free the extent on disk when done? This is used 256 * at transaction commit time while freeing a log tree 257 */ 258 int free; 259 260 /* should we write out the extent buffer? This is used 261 * while flushing the log tree to disk during a sync 262 */ 263 int write; 264 265 /* should we wait for the extent buffer io to finish? Also used 266 * while flushing the log tree to disk for a sync 267 */ 268 int wait; 269 270 /* pin only walk, we record which extents on disk belong to the 271 * log trees 272 */ 273 int pin; 274 275 /* what stage of the replay code we're currently in */ 276 int stage; 277 278 /* the root we are currently replaying */ 279 struct btrfs_root *replay_dest; 280 281 /* the trans handle for the current replay */ 282 struct btrfs_trans_handle *trans; 283 284 /* the function that gets used to process blocks we find in the 285 * tree. Note the extent_buffer might not be up to date when it is 286 * passed in, and it must be checked or read if you need the data 287 * inside it 288 */ 289 int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb, 290 struct walk_control *wc, u64 gen); 291 }; 292 293 /* 294 * process_func used to pin down extents, write them or wait on them 295 */ 296 static int process_one_buffer(struct btrfs_root *log, 297 struct extent_buffer *eb, 298 struct walk_control *wc, u64 gen) 299 { 300 int ret = 0; 301 302 /* 303 * If this fs is mixed then we need to be able to process the leaves to 304 * pin down any logged extents, so we have to read the block. 305 */ 306 if (btrfs_fs_incompat(log->fs_info, MIXED_GROUPS)) { 307 ret = btrfs_read_buffer(eb, gen); 308 if (ret) 309 return ret; 310 } 311 312 if (wc->pin) 313 ret = btrfs_pin_extent_for_log_replay(log->fs_info->extent_root, 314 eb->start, eb->len); 315 316 if (!ret && btrfs_buffer_uptodate(eb, gen, 0)) { 317 if (wc->pin && btrfs_header_level(eb) == 0) 318 ret = btrfs_exclude_logged_extents(log, eb); 319 if (wc->write) 320 btrfs_write_tree_block(eb); 321 if (wc->wait) 322 btrfs_wait_tree_block_writeback(eb); 323 } 324 return ret; 325 } 326 327 /* 328 * Item overwrite used by replay and tree logging. eb, slot and key all refer 329 * to the src data we are copying out. 330 * 331 * root is the tree we are copying into, and path is a scratch 332 * path for use in this function (it should be released on entry and 333 * will be released on exit). 334 * 335 * If the key is already in the destination tree the existing item is 336 * overwritten. If the existing item isn't big enough, it is extended. 337 * If it is too large, it is truncated. 338 * 339 * If the key isn't in the destination yet, a new item is inserted. 340 */ 341 static noinline int overwrite_item(struct btrfs_trans_handle *trans, 342 struct btrfs_root *root, 343 struct btrfs_path *path, 344 struct extent_buffer *eb, int slot, 345 struct btrfs_key *key) 346 { 347 int ret; 348 u32 item_size; 349 u64 saved_i_size = 0; 350 int save_old_i_size = 0; 351 unsigned long src_ptr; 352 unsigned long dst_ptr; 353 int overwrite_root = 0; 354 bool inode_item = key->type == BTRFS_INODE_ITEM_KEY; 355 356 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) 357 overwrite_root = 1; 358 359 item_size = btrfs_item_size_nr(eb, slot); 360 src_ptr = btrfs_item_ptr_offset(eb, slot); 361 362 /* look for the key in the destination tree */ 363 ret = btrfs_search_slot(NULL, root, key, path, 0, 0); 364 if (ret < 0) 365 return ret; 366 367 if (ret == 0) { 368 char *src_copy; 369 char *dst_copy; 370 u32 dst_size = btrfs_item_size_nr(path->nodes[0], 371 path->slots[0]); 372 if (dst_size != item_size) 373 goto insert; 374 375 if (item_size == 0) { 376 btrfs_release_path(path); 377 return 0; 378 } 379 dst_copy = kmalloc(item_size, GFP_NOFS); 380 src_copy = kmalloc(item_size, GFP_NOFS); 381 if (!dst_copy || !src_copy) { 382 btrfs_release_path(path); 383 kfree(dst_copy); 384 kfree(src_copy); 385 return -ENOMEM; 386 } 387 388 read_extent_buffer(eb, src_copy, src_ptr, item_size); 389 390 dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); 391 read_extent_buffer(path->nodes[0], dst_copy, dst_ptr, 392 item_size); 393 ret = memcmp(dst_copy, src_copy, item_size); 394 395 kfree(dst_copy); 396 kfree(src_copy); 397 /* 398 * they have the same contents, just return, this saves 399 * us from cowing blocks in the destination tree and doing 400 * extra writes that may not have been done by a previous 401 * sync 402 */ 403 if (ret == 0) { 404 btrfs_release_path(path); 405 return 0; 406 } 407 408 /* 409 * We need to load the old nbytes into the inode so when we 410 * replay the extents we've logged we get the right nbytes. 411 */ 412 if (inode_item) { 413 struct btrfs_inode_item *item; 414 u64 nbytes; 415 u32 mode; 416 417 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 418 struct btrfs_inode_item); 419 nbytes = btrfs_inode_nbytes(path->nodes[0], item); 420 item = btrfs_item_ptr(eb, slot, 421 struct btrfs_inode_item); 422 btrfs_set_inode_nbytes(eb, item, nbytes); 423 424 /* 425 * If this is a directory we need to reset the i_size to 426 * 0 so that we can set it up properly when replaying 427 * the rest of the items in this log. 428 */ 429 mode = btrfs_inode_mode(eb, item); 430 if (S_ISDIR(mode)) 431 btrfs_set_inode_size(eb, item, 0); 432 } 433 } else if (inode_item) { 434 struct btrfs_inode_item *item; 435 u32 mode; 436 437 /* 438 * New inode, set nbytes to 0 so that the nbytes comes out 439 * properly when we replay the extents. 440 */ 441 item = btrfs_item_ptr(eb, slot, struct btrfs_inode_item); 442 btrfs_set_inode_nbytes(eb, item, 0); 443 444 /* 445 * If this is a directory we need to reset the i_size to 0 so 446 * that we can set it up properly when replaying the rest of 447 * the items in this log. 448 */ 449 mode = btrfs_inode_mode(eb, item); 450 if (S_ISDIR(mode)) 451 btrfs_set_inode_size(eb, item, 0); 452 } 453 insert: 454 btrfs_release_path(path); 455 /* try to insert the key into the destination tree */ 456 path->skip_release_on_error = 1; 457 ret = btrfs_insert_empty_item(trans, root, path, 458 key, item_size); 459 path->skip_release_on_error = 0; 460 461 /* make sure any existing item is the correct size */ 462 if (ret == -EEXIST || ret == -EOVERFLOW) { 463 u32 found_size; 464 found_size = btrfs_item_size_nr(path->nodes[0], 465 path->slots[0]); 466 if (found_size > item_size) 467 btrfs_truncate_item(root, path, item_size, 1); 468 else if (found_size < item_size) 469 btrfs_extend_item(root, path, 470 item_size - found_size); 471 } else if (ret) { 472 return ret; 473 } 474 dst_ptr = btrfs_item_ptr_offset(path->nodes[0], 475 path->slots[0]); 476 477 /* don't overwrite an existing inode if the generation number 478 * was logged as zero. This is done when the tree logging code 479 * is just logging an inode to make sure it exists after recovery. 480 * 481 * Also, don't overwrite i_size on directories during replay. 482 * log replay inserts and removes directory items based on the 483 * state of the tree found in the subvolume, and i_size is modified 484 * as it goes 485 */ 486 if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) { 487 struct btrfs_inode_item *src_item; 488 struct btrfs_inode_item *dst_item; 489 490 src_item = (struct btrfs_inode_item *)src_ptr; 491 dst_item = (struct btrfs_inode_item *)dst_ptr; 492 493 if (btrfs_inode_generation(eb, src_item) == 0) { 494 struct extent_buffer *dst_eb = path->nodes[0]; 495 const u64 ino_size = btrfs_inode_size(eb, src_item); 496 497 /* 498 * For regular files an ino_size == 0 is used only when 499 * logging that an inode exists, as part of a directory 500 * fsync, and the inode wasn't fsynced before. In this 501 * case don't set the size of the inode in the fs/subvol 502 * tree, otherwise we would be throwing valid data away. 503 */ 504 if (S_ISREG(btrfs_inode_mode(eb, src_item)) && 505 S_ISREG(btrfs_inode_mode(dst_eb, dst_item)) && 506 ino_size != 0) { 507 struct btrfs_map_token token; 508 509 btrfs_init_map_token(&token); 510 btrfs_set_token_inode_size(dst_eb, dst_item, 511 ino_size, &token); 512 } 513 goto no_copy; 514 } 515 516 if (overwrite_root && 517 S_ISDIR(btrfs_inode_mode(eb, src_item)) && 518 S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) { 519 save_old_i_size = 1; 520 saved_i_size = btrfs_inode_size(path->nodes[0], 521 dst_item); 522 } 523 } 524 525 copy_extent_buffer(path->nodes[0], eb, dst_ptr, 526 src_ptr, item_size); 527 528 if (save_old_i_size) { 529 struct btrfs_inode_item *dst_item; 530 dst_item = (struct btrfs_inode_item *)dst_ptr; 531 btrfs_set_inode_size(path->nodes[0], dst_item, saved_i_size); 532 } 533 534 /* make sure the generation is filled in */ 535 if (key->type == BTRFS_INODE_ITEM_KEY) { 536 struct btrfs_inode_item *dst_item; 537 dst_item = (struct btrfs_inode_item *)dst_ptr; 538 if (btrfs_inode_generation(path->nodes[0], dst_item) == 0) { 539 btrfs_set_inode_generation(path->nodes[0], dst_item, 540 trans->transid); 541 } 542 } 543 no_copy: 544 btrfs_mark_buffer_dirty(path->nodes[0]); 545 btrfs_release_path(path); 546 return 0; 547 } 548 549 /* 550 * simple helper to read an inode off the disk from a given root 551 * This can only be called for subvolume roots and not for the log 552 */ 553 static noinline struct inode *read_one_inode(struct btrfs_root *root, 554 u64 objectid) 555 { 556 struct btrfs_key key; 557 struct inode *inode; 558 559 key.objectid = objectid; 560 key.type = BTRFS_INODE_ITEM_KEY; 561 key.offset = 0; 562 inode = btrfs_iget(root->fs_info->sb, &key, root, NULL); 563 if (IS_ERR(inode)) { 564 inode = NULL; 565 } else if (is_bad_inode(inode)) { 566 iput(inode); 567 inode = NULL; 568 } 569 return inode; 570 } 571 572 /* replays a single extent in 'eb' at 'slot' with 'key' into the 573 * subvolume 'root'. path is released on entry and should be released 574 * on exit. 575 * 576 * extents in the log tree have not been allocated out of the extent 577 * tree yet. So, this completes the allocation, taking a reference 578 * as required if the extent already exists or creating a new extent 579 * if it isn't in the extent allocation tree yet. 580 * 581 * The extent is inserted into the file, dropping any existing extents 582 * from the file that overlap the new one. 583 */ 584 static noinline int replay_one_extent(struct btrfs_trans_handle *trans, 585 struct btrfs_root *root, 586 struct btrfs_path *path, 587 struct extent_buffer *eb, int slot, 588 struct btrfs_key *key) 589 { 590 int found_type; 591 u64 extent_end; 592 u64 start = key->offset; 593 u64 nbytes = 0; 594 struct btrfs_file_extent_item *item; 595 struct inode *inode = NULL; 596 unsigned long size; 597 int ret = 0; 598 599 item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item); 600 found_type = btrfs_file_extent_type(eb, item); 601 602 if (found_type == BTRFS_FILE_EXTENT_REG || 603 found_type == BTRFS_FILE_EXTENT_PREALLOC) { 604 nbytes = btrfs_file_extent_num_bytes(eb, item); 605 extent_end = start + nbytes; 606 607 /* 608 * We don't add to the inodes nbytes if we are prealloc or a 609 * hole. 610 */ 611 if (btrfs_file_extent_disk_bytenr(eb, item) == 0) 612 nbytes = 0; 613 } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { 614 size = btrfs_file_extent_inline_len(eb, slot, item); 615 nbytes = btrfs_file_extent_ram_bytes(eb, item); 616 extent_end = ALIGN(start + size, root->sectorsize); 617 } else { 618 ret = 0; 619 goto out; 620 } 621 622 inode = read_one_inode(root, key->objectid); 623 if (!inode) { 624 ret = -EIO; 625 goto out; 626 } 627 628 /* 629 * first check to see if we already have this extent in the 630 * file. This must be done before the btrfs_drop_extents run 631 * so we don't try to drop this extent. 632 */ 633 ret = btrfs_lookup_file_extent(trans, root, path, btrfs_ino(inode), 634 start, 0); 635 636 if (ret == 0 && 637 (found_type == BTRFS_FILE_EXTENT_REG || 638 found_type == BTRFS_FILE_EXTENT_PREALLOC)) { 639 struct btrfs_file_extent_item cmp1; 640 struct btrfs_file_extent_item cmp2; 641 struct btrfs_file_extent_item *existing; 642 struct extent_buffer *leaf; 643 644 leaf = path->nodes[0]; 645 existing = btrfs_item_ptr(leaf, path->slots[0], 646 struct btrfs_file_extent_item); 647 648 read_extent_buffer(eb, &cmp1, (unsigned long)item, 649 sizeof(cmp1)); 650 read_extent_buffer(leaf, &cmp2, (unsigned long)existing, 651 sizeof(cmp2)); 652 653 /* 654 * we already have a pointer to this exact extent, 655 * we don't have to do anything 656 */ 657 if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) { 658 btrfs_release_path(path); 659 goto out; 660 } 661 } 662 btrfs_release_path(path); 663 664 /* drop any overlapping extents */ 665 ret = btrfs_drop_extents(trans, root, inode, start, extent_end, 1); 666 if (ret) 667 goto out; 668 669 if (found_type == BTRFS_FILE_EXTENT_REG || 670 found_type == BTRFS_FILE_EXTENT_PREALLOC) { 671 u64 offset; 672 unsigned long dest_offset; 673 struct btrfs_key ins; 674 675 ret = btrfs_insert_empty_item(trans, root, path, key, 676 sizeof(*item)); 677 if (ret) 678 goto out; 679 dest_offset = btrfs_item_ptr_offset(path->nodes[0], 680 path->slots[0]); 681 copy_extent_buffer(path->nodes[0], eb, dest_offset, 682 (unsigned long)item, sizeof(*item)); 683 684 ins.objectid = btrfs_file_extent_disk_bytenr(eb, item); 685 ins.offset = btrfs_file_extent_disk_num_bytes(eb, item); 686 ins.type = BTRFS_EXTENT_ITEM_KEY; 687 offset = key->offset - btrfs_file_extent_offset(eb, item); 688 689 if (ins.objectid > 0) { 690 u64 csum_start; 691 u64 csum_end; 692 LIST_HEAD(ordered_sums); 693 /* 694 * is this extent already allocated in the extent 695 * allocation tree? If so, just add a reference 696 */ 697 ret = btrfs_lookup_data_extent(root, ins.objectid, 698 ins.offset); 699 if (ret == 0) { 700 ret = btrfs_inc_extent_ref(trans, root, 701 ins.objectid, ins.offset, 702 0, root->root_key.objectid, 703 key->objectid, offset, 0); 704 if (ret) 705 goto out; 706 } else { 707 /* 708 * insert the extent pointer in the extent 709 * allocation tree 710 */ 711 ret = btrfs_alloc_logged_file_extent(trans, 712 root, root->root_key.objectid, 713 key->objectid, offset, &ins); 714 if (ret) 715 goto out; 716 } 717 btrfs_release_path(path); 718 719 if (btrfs_file_extent_compression(eb, item)) { 720 csum_start = ins.objectid; 721 csum_end = csum_start + ins.offset; 722 } else { 723 csum_start = ins.objectid + 724 btrfs_file_extent_offset(eb, item); 725 csum_end = csum_start + 726 btrfs_file_extent_num_bytes(eb, item); 727 } 728 729 ret = btrfs_lookup_csums_range(root->log_root, 730 csum_start, csum_end - 1, 731 &ordered_sums, 0); 732 if (ret) 733 goto out; 734 while (!list_empty(&ordered_sums)) { 735 struct btrfs_ordered_sum *sums; 736 sums = list_entry(ordered_sums.next, 737 struct btrfs_ordered_sum, 738 list); 739 if (!ret) 740 ret = btrfs_csum_file_blocks(trans, 741 root->fs_info->csum_root, 742 sums); 743 list_del(&sums->list); 744 kfree(sums); 745 } 746 if (ret) 747 goto out; 748 } else { 749 btrfs_release_path(path); 750 } 751 } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { 752 /* inline extents are easy, we just overwrite them */ 753 ret = overwrite_item(trans, root, path, eb, slot, key); 754 if (ret) 755 goto out; 756 } 757 758 inode_add_bytes(inode, nbytes); 759 ret = btrfs_update_inode(trans, root, inode); 760 out: 761 if (inode) 762 iput(inode); 763 return ret; 764 } 765 766 /* 767 * when cleaning up conflicts between the directory names in the 768 * subvolume, directory names in the log and directory names in the 769 * inode back references, we may have to unlink inodes from directories. 770 * 771 * This is a helper function to do the unlink of a specific directory 772 * item 773 */ 774 static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans, 775 struct btrfs_root *root, 776 struct btrfs_path *path, 777 struct inode *dir, 778 struct btrfs_dir_item *di) 779 { 780 struct inode *inode; 781 char *name; 782 int name_len; 783 struct extent_buffer *leaf; 784 struct btrfs_key location; 785 int ret; 786 787 leaf = path->nodes[0]; 788 789 btrfs_dir_item_key_to_cpu(leaf, di, &location); 790 name_len = btrfs_dir_name_len(leaf, di); 791 name = kmalloc(name_len, GFP_NOFS); 792 if (!name) 793 return -ENOMEM; 794 795 read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len); 796 btrfs_release_path(path); 797 798 inode = read_one_inode(root, location.objectid); 799 if (!inode) { 800 ret = -EIO; 801 goto out; 802 } 803 804 ret = link_to_fixup_dir(trans, root, path, location.objectid); 805 if (ret) 806 goto out; 807 808 ret = btrfs_unlink_inode(trans, root, dir, inode, name, name_len); 809 if (ret) 810 goto out; 811 else 812 ret = btrfs_run_delayed_items(trans, root); 813 out: 814 kfree(name); 815 iput(inode); 816 return ret; 817 } 818 819 /* 820 * helper function to see if a given name and sequence number found 821 * in an inode back reference are already in a directory and correctly 822 * point to this inode 823 */ 824 static noinline int inode_in_dir(struct btrfs_root *root, 825 struct btrfs_path *path, 826 u64 dirid, u64 objectid, u64 index, 827 const char *name, int name_len) 828 { 829 struct btrfs_dir_item *di; 830 struct btrfs_key location; 831 int match = 0; 832 833 di = btrfs_lookup_dir_index_item(NULL, root, path, dirid, 834 index, name, name_len, 0); 835 if (di && !IS_ERR(di)) { 836 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location); 837 if (location.objectid != objectid) 838 goto out; 839 } else 840 goto out; 841 btrfs_release_path(path); 842 843 di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 0); 844 if (di && !IS_ERR(di)) { 845 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location); 846 if (location.objectid != objectid) 847 goto out; 848 } else 849 goto out; 850 match = 1; 851 out: 852 btrfs_release_path(path); 853 return match; 854 } 855 856 /* 857 * helper function to check a log tree for a named back reference in 858 * an inode. This is used to decide if a back reference that is 859 * found in the subvolume conflicts with what we find in the log. 860 * 861 * inode backreferences may have multiple refs in a single item, 862 * during replay we process one reference at a time, and we don't 863 * want to delete valid links to a file from the subvolume if that 864 * link is also in the log. 865 */ 866 static noinline int backref_in_log(struct btrfs_root *log, 867 struct btrfs_key *key, 868 u64 ref_objectid, 869 const char *name, int namelen) 870 { 871 struct btrfs_path *path; 872 struct btrfs_inode_ref *ref; 873 unsigned long ptr; 874 unsigned long ptr_end; 875 unsigned long name_ptr; 876 int found_name_len; 877 int item_size; 878 int ret; 879 int match = 0; 880 881 path = btrfs_alloc_path(); 882 if (!path) 883 return -ENOMEM; 884 885 ret = btrfs_search_slot(NULL, log, key, path, 0, 0); 886 if (ret != 0) 887 goto out; 888 889 ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); 890 891 if (key->type == BTRFS_INODE_EXTREF_KEY) { 892 if (btrfs_find_name_in_ext_backref(path, ref_objectid, 893 name, namelen, NULL)) 894 match = 1; 895 896 goto out; 897 } 898 899 item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]); 900 ptr_end = ptr + item_size; 901 while (ptr < ptr_end) { 902 ref = (struct btrfs_inode_ref *)ptr; 903 found_name_len = btrfs_inode_ref_name_len(path->nodes[0], ref); 904 if (found_name_len == namelen) { 905 name_ptr = (unsigned long)(ref + 1); 906 ret = memcmp_extent_buffer(path->nodes[0], name, 907 name_ptr, namelen); 908 if (ret == 0) { 909 match = 1; 910 goto out; 911 } 912 } 913 ptr = (unsigned long)(ref + 1) + found_name_len; 914 } 915 out: 916 btrfs_free_path(path); 917 return match; 918 } 919 920 static inline int __add_inode_ref(struct btrfs_trans_handle *trans, 921 struct btrfs_root *root, 922 struct btrfs_path *path, 923 struct btrfs_root *log_root, 924 struct inode *dir, struct inode *inode, 925 struct extent_buffer *eb, 926 u64 inode_objectid, u64 parent_objectid, 927 u64 ref_index, char *name, int namelen, 928 int *search_done) 929 { 930 int ret; 931 char *victim_name; 932 int victim_name_len; 933 struct extent_buffer *leaf; 934 struct btrfs_dir_item *di; 935 struct btrfs_key search_key; 936 struct btrfs_inode_extref *extref; 937 938 again: 939 /* Search old style refs */ 940 search_key.objectid = inode_objectid; 941 search_key.type = BTRFS_INODE_REF_KEY; 942 search_key.offset = parent_objectid; 943 ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); 944 if (ret == 0) { 945 struct btrfs_inode_ref *victim_ref; 946 unsigned long ptr; 947 unsigned long ptr_end; 948 949 leaf = path->nodes[0]; 950 951 /* are we trying to overwrite a back ref for the root directory 952 * if so, just jump out, we're done 953 */ 954 if (search_key.objectid == search_key.offset) 955 return 1; 956 957 /* check all the names in this back reference to see 958 * if they are in the log. if so, we allow them to stay 959 * otherwise they must be unlinked as a conflict 960 */ 961 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); 962 ptr_end = ptr + btrfs_item_size_nr(leaf, path->slots[0]); 963 while (ptr < ptr_end) { 964 victim_ref = (struct btrfs_inode_ref *)ptr; 965 victim_name_len = btrfs_inode_ref_name_len(leaf, 966 victim_ref); 967 victim_name = kmalloc(victim_name_len, GFP_NOFS); 968 if (!victim_name) 969 return -ENOMEM; 970 971 read_extent_buffer(leaf, victim_name, 972 (unsigned long)(victim_ref + 1), 973 victim_name_len); 974 975 if (!backref_in_log(log_root, &search_key, 976 parent_objectid, 977 victim_name, 978 victim_name_len)) { 979 inc_nlink(inode); 980 btrfs_release_path(path); 981 982 ret = btrfs_unlink_inode(trans, root, dir, 983 inode, victim_name, 984 victim_name_len); 985 kfree(victim_name); 986 if (ret) 987 return ret; 988 ret = btrfs_run_delayed_items(trans, root); 989 if (ret) 990 return ret; 991 *search_done = 1; 992 goto again; 993 } 994 kfree(victim_name); 995 996 ptr = (unsigned long)(victim_ref + 1) + victim_name_len; 997 } 998 999 /* 1000 * NOTE: we have searched root tree and checked the 1001 * coresponding ref, it does not need to check again. 1002 */ 1003 *search_done = 1; 1004 } 1005 btrfs_release_path(path); 1006 1007 /* Same search but for extended refs */ 1008 extref = btrfs_lookup_inode_extref(NULL, root, path, name, namelen, 1009 inode_objectid, parent_objectid, 0, 1010 0); 1011 if (!IS_ERR_OR_NULL(extref)) { 1012 u32 item_size; 1013 u32 cur_offset = 0; 1014 unsigned long base; 1015 struct inode *victim_parent; 1016 1017 leaf = path->nodes[0]; 1018 1019 item_size = btrfs_item_size_nr(leaf, path->slots[0]); 1020 base = btrfs_item_ptr_offset(leaf, path->slots[0]); 1021 1022 while (cur_offset < item_size) { 1023 extref = (struct btrfs_inode_extref *)(base + cur_offset); 1024 1025 victim_name_len = btrfs_inode_extref_name_len(leaf, extref); 1026 1027 if (btrfs_inode_extref_parent(leaf, extref) != parent_objectid) 1028 goto next; 1029 1030 victim_name = kmalloc(victim_name_len, GFP_NOFS); 1031 if (!victim_name) 1032 return -ENOMEM; 1033 read_extent_buffer(leaf, victim_name, (unsigned long)&extref->name, 1034 victim_name_len); 1035 1036 search_key.objectid = inode_objectid; 1037 search_key.type = BTRFS_INODE_EXTREF_KEY; 1038 search_key.offset = btrfs_extref_hash(parent_objectid, 1039 victim_name, 1040 victim_name_len); 1041 ret = 0; 1042 if (!backref_in_log(log_root, &search_key, 1043 parent_objectid, victim_name, 1044 victim_name_len)) { 1045 ret = -ENOENT; 1046 victim_parent = read_one_inode(root, 1047 parent_objectid); 1048 if (victim_parent) { 1049 inc_nlink(inode); 1050 btrfs_release_path(path); 1051 1052 ret = btrfs_unlink_inode(trans, root, 1053 victim_parent, 1054 inode, 1055 victim_name, 1056 victim_name_len); 1057 if (!ret) 1058 ret = btrfs_run_delayed_items( 1059 trans, root); 1060 } 1061 iput(victim_parent); 1062 kfree(victim_name); 1063 if (ret) 1064 return ret; 1065 *search_done = 1; 1066 goto again; 1067 } 1068 kfree(victim_name); 1069 if (ret) 1070 return ret; 1071 next: 1072 cur_offset += victim_name_len + sizeof(*extref); 1073 } 1074 *search_done = 1; 1075 } 1076 btrfs_release_path(path); 1077 1078 /* look for a conflicting sequence number */ 1079 di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir), 1080 ref_index, name, namelen, 0); 1081 if (di && !IS_ERR(di)) { 1082 ret = drop_one_dir_item(trans, root, path, dir, di); 1083 if (ret) 1084 return ret; 1085 } 1086 btrfs_release_path(path); 1087 1088 /* look for a conflicing name */ 1089 di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir), 1090 name, namelen, 0); 1091 if (di && !IS_ERR(di)) { 1092 ret = drop_one_dir_item(trans, root, path, dir, di); 1093 if (ret) 1094 return ret; 1095 } 1096 btrfs_release_path(path); 1097 1098 return 0; 1099 } 1100 1101 static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr, 1102 u32 *namelen, char **name, u64 *index, 1103 u64 *parent_objectid) 1104 { 1105 struct btrfs_inode_extref *extref; 1106 1107 extref = (struct btrfs_inode_extref *)ref_ptr; 1108 1109 *namelen = btrfs_inode_extref_name_len(eb, extref); 1110 *name = kmalloc(*namelen, GFP_NOFS); 1111 if (*name == NULL) 1112 return -ENOMEM; 1113 1114 read_extent_buffer(eb, *name, (unsigned long)&extref->name, 1115 *namelen); 1116 1117 *index = btrfs_inode_extref_index(eb, extref); 1118 if (parent_objectid) 1119 *parent_objectid = btrfs_inode_extref_parent(eb, extref); 1120 1121 return 0; 1122 } 1123 1124 static int ref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr, 1125 u32 *namelen, char **name, u64 *index) 1126 { 1127 struct btrfs_inode_ref *ref; 1128 1129 ref = (struct btrfs_inode_ref *)ref_ptr; 1130 1131 *namelen = btrfs_inode_ref_name_len(eb, ref); 1132 *name = kmalloc(*namelen, GFP_NOFS); 1133 if (*name == NULL) 1134 return -ENOMEM; 1135 1136 read_extent_buffer(eb, *name, (unsigned long)(ref + 1), *namelen); 1137 1138 *index = btrfs_inode_ref_index(eb, ref); 1139 1140 return 0; 1141 } 1142 1143 /* 1144 * replay one inode back reference item found in the log tree. 1145 * eb, slot and key refer to the buffer and key found in the log tree. 1146 * root is the destination we are replaying into, and path is for temp 1147 * use by this function. (it should be released on return). 1148 */ 1149 static noinline int add_inode_ref(struct btrfs_trans_handle *trans, 1150 struct btrfs_root *root, 1151 struct btrfs_root *log, 1152 struct btrfs_path *path, 1153 struct extent_buffer *eb, int slot, 1154 struct btrfs_key *key) 1155 { 1156 struct inode *dir = NULL; 1157 struct inode *inode = NULL; 1158 unsigned long ref_ptr; 1159 unsigned long ref_end; 1160 char *name = NULL; 1161 int namelen; 1162 int ret; 1163 int search_done = 0; 1164 int log_ref_ver = 0; 1165 u64 parent_objectid; 1166 u64 inode_objectid; 1167 u64 ref_index = 0; 1168 int ref_struct_size; 1169 1170 ref_ptr = btrfs_item_ptr_offset(eb, slot); 1171 ref_end = ref_ptr + btrfs_item_size_nr(eb, slot); 1172 1173 if (key->type == BTRFS_INODE_EXTREF_KEY) { 1174 struct btrfs_inode_extref *r; 1175 1176 ref_struct_size = sizeof(struct btrfs_inode_extref); 1177 log_ref_ver = 1; 1178 r = (struct btrfs_inode_extref *)ref_ptr; 1179 parent_objectid = btrfs_inode_extref_parent(eb, r); 1180 } else { 1181 ref_struct_size = sizeof(struct btrfs_inode_ref); 1182 parent_objectid = key->offset; 1183 } 1184 inode_objectid = key->objectid; 1185 1186 /* 1187 * it is possible that we didn't log all the parent directories 1188 * for a given inode. If we don't find the dir, just don't 1189 * copy the back ref in. The link count fixup code will take 1190 * care of the rest 1191 */ 1192 dir = read_one_inode(root, parent_objectid); 1193 if (!dir) { 1194 ret = -ENOENT; 1195 goto out; 1196 } 1197 1198 inode = read_one_inode(root, inode_objectid); 1199 if (!inode) { 1200 ret = -EIO; 1201 goto out; 1202 } 1203 1204 while (ref_ptr < ref_end) { 1205 if (log_ref_ver) { 1206 ret = extref_get_fields(eb, ref_ptr, &namelen, &name, 1207 &ref_index, &parent_objectid); 1208 /* 1209 * parent object can change from one array 1210 * item to another. 1211 */ 1212 if (!dir) 1213 dir = read_one_inode(root, parent_objectid); 1214 if (!dir) { 1215 ret = -ENOENT; 1216 goto out; 1217 } 1218 } else { 1219 ret = ref_get_fields(eb, ref_ptr, &namelen, &name, 1220 &ref_index); 1221 } 1222 if (ret) 1223 goto out; 1224 1225 /* if we already have a perfect match, we're done */ 1226 if (!inode_in_dir(root, path, btrfs_ino(dir), btrfs_ino(inode), 1227 ref_index, name, namelen)) { 1228 /* 1229 * look for a conflicting back reference in the 1230 * metadata. if we find one we have to unlink that name 1231 * of the file before we add our new link. Later on, we 1232 * overwrite any existing back reference, and we don't 1233 * want to create dangling pointers in the directory. 1234 */ 1235 1236 if (!search_done) { 1237 ret = __add_inode_ref(trans, root, path, log, 1238 dir, inode, eb, 1239 inode_objectid, 1240 parent_objectid, 1241 ref_index, name, namelen, 1242 &search_done); 1243 if (ret) { 1244 if (ret == 1) 1245 ret = 0; 1246 goto out; 1247 } 1248 } 1249 1250 /* insert our name */ 1251 ret = btrfs_add_link(trans, dir, inode, name, namelen, 1252 0, ref_index); 1253 if (ret) 1254 goto out; 1255 1256 btrfs_update_inode(trans, root, inode); 1257 } 1258 1259 ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + namelen; 1260 kfree(name); 1261 name = NULL; 1262 if (log_ref_ver) { 1263 iput(dir); 1264 dir = NULL; 1265 } 1266 } 1267 1268 /* finally write the back reference in the inode */ 1269 ret = overwrite_item(trans, root, path, eb, slot, key); 1270 out: 1271 btrfs_release_path(path); 1272 kfree(name); 1273 iput(dir); 1274 iput(inode); 1275 return ret; 1276 } 1277 1278 static int insert_orphan_item(struct btrfs_trans_handle *trans, 1279 struct btrfs_root *root, u64 ino) 1280 { 1281 int ret; 1282 1283 ret = btrfs_insert_orphan_item(trans, root, ino); 1284 if (ret == -EEXIST) 1285 ret = 0; 1286 1287 return ret; 1288 } 1289 1290 static int count_inode_extrefs(struct btrfs_root *root, 1291 struct inode *inode, struct btrfs_path *path) 1292 { 1293 int ret = 0; 1294 int name_len; 1295 unsigned int nlink = 0; 1296 u32 item_size; 1297 u32 cur_offset = 0; 1298 u64 inode_objectid = btrfs_ino(inode); 1299 u64 offset = 0; 1300 unsigned long ptr; 1301 struct btrfs_inode_extref *extref; 1302 struct extent_buffer *leaf; 1303 1304 while (1) { 1305 ret = btrfs_find_one_extref(root, inode_objectid, offset, path, 1306 &extref, &offset); 1307 if (ret) 1308 break; 1309 1310 leaf = path->nodes[0]; 1311 item_size = btrfs_item_size_nr(leaf, path->slots[0]); 1312 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); 1313 cur_offset = 0; 1314 1315 while (cur_offset < item_size) { 1316 extref = (struct btrfs_inode_extref *) (ptr + cur_offset); 1317 name_len = btrfs_inode_extref_name_len(leaf, extref); 1318 1319 nlink++; 1320 1321 cur_offset += name_len + sizeof(*extref); 1322 } 1323 1324 offset++; 1325 btrfs_release_path(path); 1326 } 1327 btrfs_release_path(path); 1328 1329 if (ret < 0 && ret != -ENOENT) 1330 return ret; 1331 return nlink; 1332 } 1333 1334 static int count_inode_refs(struct btrfs_root *root, 1335 struct inode *inode, struct btrfs_path *path) 1336 { 1337 int ret; 1338 struct btrfs_key key; 1339 unsigned int nlink = 0; 1340 unsigned long ptr; 1341 unsigned long ptr_end; 1342 int name_len; 1343 u64 ino = btrfs_ino(inode); 1344 1345 key.objectid = ino; 1346 key.type = BTRFS_INODE_REF_KEY; 1347 key.offset = (u64)-1; 1348 1349 while (1) { 1350 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 1351 if (ret < 0) 1352 break; 1353 if (ret > 0) { 1354 if (path->slots[0] == 0) 1355 break; 1356 path->slots[0]--; 1357 } 1358 process_slot: 1359 btrfs_item_key_to_cpu(path->nodes[0], &key, 1360 path->slots[0]); 1361 if (key.objectid != ino || 1362 key.type != BTRFS_INODE_REF_KEY) 1363 break; 1364 ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); 1365 ptr_end = ptr + btrfs_item_size_nr(path->nodes[0], 1366 path->slots[0]); 1367 while (ptr < ptr_end) { 1368 struct btrfs_inode_ref *ref; 1369 1370 ref = (struct btrfs_inode_ref *)ptr; 1371 name_len = btrfs_inode_ref_name_len(path->nodes[0], 1372 ref); 1373 ptr = (unsigned long)(ref + 1) + name_len; 1374 nlink++; 1375 } 1376 1377 if (key.offset == 0) 1378 break; 1379 if (path->slots[0] > 0) { 1380 path->slots[0]--; 1381 goto process_slot; 1382 } 1383 key.offset--; 1384 btrfs_release_path(path); 1385 } 1386 btrfs_release_path(path); 1387 1388 return nlink; 1389 } 1390 1391 /* 1392 * There are a few corners where the link count of the file can't 1393 * be properly maintained during replay. So, instead of adding 1394 * lots of complexity to the log code, we just scan the backrefs 1395 * for any file that has been through replay. 1396 * 1397 * The scan will update the link count on the inode to reflect the 1398 * number of back refs found. If it goes down to zero, the iput 1399 * will free the inode. 1400 */ 1401 static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans, 1402 struct btrfs_root *root, 1403 struct inode *inode) 1404 { 1405 struct btrfs_path *path; 1406 int ret; 1407 u64 nlink = 0; 1408 u64 ino = btrfs_ino(inode); 1409 1410 path = btrfs_alloc_path(); 1411 if (!path) 1412 return -ENOMEM; 1413 1414 ret = count_inode_refs(root, inode, path); 1415 if (ret < 0) 1416 goto out; 1417 1418 nlink = ret; 1419 1420 ret = count_inode_extrefs(root, inode, path); 1421 if (ret < 0) 1422 goto out; 1423 1424 nlink += ret; 1425 1426 ret = 0; 1427 1428 if (nlink != inode->i_nlink) { 1429 set_nlink(inode, nlink); 1430 btrfs_update_inode(trans, root, inode); 1431 } 1432 BTRFS_I(inode)->index_cnt = (u64)-1; 1433 1434 if (inode->i_nlink == 0) { 1435 if (S_ISDIR(inode->i_mode)) { 1436 ret = replay_dir_deletes(trans, root, NULL, path, 1437 ino, 1); 1438 if (ret) 1439 goto out; 1440 } 1441 ret = insert_orphan_item(trans, root, ino); 1442 } 1443 1444 out: 1445 btrfs_free_path(path); 1446 return ret; 1447 } 1448 1449 static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans, 1450 struct btrfs_root *root, 1451 struct btrfs_path *path) 1452 { 1453 int ret; 1454 struct btrfs_key key; 1455 struct inode *inode; 1456 1457 key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID; 1458 key.type = BTRFS_ORPHAN_ITEM_KEY; 1459 key.offset = (u64)-1; 1460 while (1) { 1461 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 1462 if (ret < 0) 1463 break; 1464 1465 if (ret == 1) { 1466 if (path->slots[0] == 0) 1467 break; 1468 path->slots[0]--; 1469 } 1470 1471 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 1472 if (key.objectid != BTRFS_TREE_LOG_FIXUP_OBJECTID || 1473 key.type != BTRFS_ORPHAN_ITEM_KEY) 1474 break; 1475 1476 ret = btrfs_del_item(trans, root, path); 1477 if (ret) 1478 goto out; 1479 1480 btrfs_release_path(path); 1481 inode = read_one_inode(root, key.offset); 1482 if (!inode) 1483 return -EIO; 1484 1485 ret = fixup_inode_link_count(trans, root, inode); 1486 iput(inode); 1487 if (ret) 1488 goto out; 1489 1490 /* 1491 * fixup on a directory may create new entries, 1492 * make sure we always look for the highset possible 1493 * offset 1494 */ 1495 key.offset = (u64)-1; 1496 } 1497 ret = 0; 1498 out: 1499 btrfs_release_path(path); 1500 return ret; 1501 } 1502 1503 1504 /* 1505 * record a given inode in the fixup dir so we can check its link 1506 * count when replay is done. The link count is incremented here 1507 * so the inode won't go away until we check it 1508 */ 1509 static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans, 1510 struct btrfs_root *root, 1511 struct btrfs_path *path, 1512 u64 objectid) 1513 { 1514 struct btrfs_key key; 1515 int ret = 0; 1516 struct inode *inode; 1517 1518 inode = read_one_inode(root, objectid); 1519 if (!inode) 1520 return -EIO; 1521 1522 key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID; 1523 key.type = BTRFS_ORPHAN_ITEM_KEY; 1524 key.offset = objectid; 1525 1526 ret = btrfs_insert_empty_item(trans, root, path, &key, 0); 1527 1528 btrfs_release_path(path); 1529 if (ret == 0) { 1530 if (!inode->i_nlink) 1531 set_nlink(inode, 1); 1532 else 1533 inc_nlink(inode); 1534 ret = btrfs_update_inode(trans, root, inode); 1535 } else if (ret == -EEXIST) { 1536 ret = 0; 1537 } else { 1538 BUG(); /* Logic Error */ 1539 } 1540 iput(inode); 1541 1542 return ret; 1543 } 1544 1545 /* 1546 * when replaying the log for a directory, we only insert names 1547 * for inodes that actually exist. This means an fsync on a directory 1548 * does not implicitly fsync all the new files in it 1549 */ 1550 static noinline int insert_one_name(struct btrfs_trans_handle *trans, 1551 struct btrfs_root *root, 1552 struct btrfs_path *path, 1553 u64 dirid, u64 index, 1554 char *name, int name_len, u8 type, 1555 struct btrfs_key *location) 1556 { 1557 struct inode *inode; 1558 struct inode *dir; 1559 int ret; 1560 1561 inode = read_one_inode(root, location->objectid); 1562 if (!inode) 1563 return -ENOENT; 1564 1565 dir = read_one_inode(root, dirid); 1566 if (!dir) { 1567 iput(inode); 1568 return -EIO; 1569 } 1570 1571 ret = btrfs_add_link(trans, dir, inode, name, name_len, 1, index); 1572 1573 /* FIXME, put inode into FIXUP list */ 1574 1575 iput(inode); 1576 iput(dir); 1577 return ret; 1578 } 1579 1580 /* 1581 * Return true if an inode reference exists in the log for the given name, 1582 * inode and parent inode. 1583 */ 1584 static bool name_in_log_ref(struct btrfs_root *log_root, 1585 const char *name, const int name_len, 1586 const u64 dirid, const u64 ino) 1587 { 1588 struct btrfs_key search_key; 1589 1590 search_key.objectid = ino; 1591 search_key.type = BTRFS_INODE_REF_KEY; 1592 search_key.offset = dirid; 1593 if (backref_in_log(log_root, &search_key, dirid, name, name_len)) 1594 return true; 1595 1596 search_key.type = BTRFS_INODE_EXTREF_KEY; 1597 search_key.offset = btrfs_extref_hash(dirid, name, name_len); 1598 if (backref_in_log(log_root, &search_key, dirid, name, name_len)) 1599 return true; 1600 1601 return false; 1602 } 1603 1604 /* 1605 * take a single entry in a log directory item and replay it into 1606 * the subvolume. 1607 * 1608 * if a conflicting item exists in the subdirectory already, 1609 * the inode it points to is unlinked and put into the link count 1610 * fix up tree. 1611 * 1612 * If a name from the log points to a file or directory that does 1613 * not exist in the FS, it is skipped. fsyncs on directories 1614 * do not force down inodes inside that directory, just changes to the 1615 * names or unlinks in a directory. 1616 */ 1617 static noinline int replay_one_name(struct btrfs_trans_handle *trans, 1618 struct btrfs_root *root, 1619 struct btrfs_path *path, 1620 struct extent_buffer *eb, 1621 struct btrfs_dir_item *di, 1622 struct btrfs_key *key) 1623 { 1624 char *name; 1625 int name_len; 1626 struct btrfs_dir_item *dst_di; 1627 struct btrfs_key found_key; 1628 struct btrfs_key log_key; 1629 struct inode *dir; 1630 u8 log_type; 1631 int exists; 1632 int ret = 0; 1633 bool update_size = (key->type == BTRFS_DIR_INDEX_KEY); 1634 1635 dir = read_one_inode(root, key->objectid); 1636 if (!dir) 1637 return -EIO; 1638 1639 name_len = btrfs_dir_name_len(eb, di); 1640 name = kmalloc(name_len, GFP_NOFS); 1641 if (!name) { 1642 ret = -ENOMEM; 1643 goto out; 1644 } 1645 1646 log_type = btrfs_dir_type(eb, di); 1647 read_extent_buffer(eb, name, (unsigned long)(di + 1), 1648 name_len); 1649 1650 btrfs_dir_item_key_to_cpu(eb, di, &log_key); 1651 exists = btrfs_lookup_inode(trans, root, path, &log_key, 0); 1652 if (exists == 0) 1653 exists = 1; 1654 else 1655 exists = 0; 1656 btrfs_release_path(path); 1657 1658 if (key->type == BTRFS_DIR_ITEM_KEY) { 1659 dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid, 1660 name, name_len, 1); 1661 } else if (key->type == BTRFS_DIR_INDEX_KEY) { 1662 dst_di = btrfs_lookup_dir_index_item(trans, root, path, 1663 key->objectid, 1664 key->offset, name, 1665 name_len, 1); 1666 } else { 1667 /* Corruption */ 1668 ret = -EINVAL; 1669 goto out; 1670 } 1671 if (IS_ERR_OR_NULL(dst_di)) { 1672 /* we need a sequence number to insert, so we only 1673 * do inserts for the BTRFS_DIR_INDEX_KEY types 1674 */ 1675 if (key->type != BTRFS_DIR_INDEX_KEY) 1676 goto out; 1677 goto insert; 1678 } 1679 1680 btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key); 1681 /* the existing item matches the logged item */ 1682 if (found_key.objectid == log_key.objectid && 1683 found_key.type == log_key.type && 1684 found_key.offset == log_key.offset && 1685 btrfs_dir_type(path->nodes[0], dst_di) == log_type) { 1686 update_size = false; 1687 goto out; 1688 } 1689 1690 /* 1691 * don't drop the conflicting directory entry if the inode 1692 * for the new entry doesn't exist 1693 */ 1694 if (!exists) 1695 goto out; 1696 1697 ret = drop_one_dir_item(trans, root, path, dir, dst_di); 1698 if (ret) 1699 goto out; 1700 1701 if (key->type == BTRFS_DIR_INDEX_KEY) 1702 goto insert; 1703 out: 1704 btrfs_release_path(path); 1705 if (!ret && update_size) { 1706 btrfs_i_size_write(dir, dir->i_size + name_len * 2); 1707 ret = btrfs_update_inode(trans, root, dir); 1708 } 1709 kfree(name); 1710 iput(dir); 1711 return ret; 1712 1713 insert: 1714 if (name_in_log_ref(root->log_root, name, name_len, 1715 key->objectid, log_key.objectid)) { 1716 /* The dentry will be added later. */ 1717 ret = 0; 1718 update_size = false; 1719 goto out; 1720 } 1721 btrfs_release_path(path); 1722 ret = insert_one_name(trans, root, path, key->objectid, key->offset, 1723 name, name_len, log_type, &log_key); 1724 if (ret && ret != -ENOENT && ret != -EEXIST) 1725 goto out; 1726 update_size = false; 1727 ret = 0; 1728 goto out; 1729 } 1730 1731 /* 1732 * find all the names in a directory item and reconcile them into 1733 * the subvolume. Only BTRFS_DIR_ITEM_KEY types will have more than 1734 * one name in a directory item, but the same code gets used for 1735 * both directory index types 1736 */ 1737 static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans, 1738 struct btrfs_root *root, 1739 struct btrfs_path *path, 1740 struct extent_buffer *eb, int slot, 1741 struct btrfs_key *key) 1742 { 1743 int ret; 1744 u32 item_size = btrfs_item_size_nr(eb, slot); 1745 struct btrfs_dir_item *di; 1746 int name_len; 1747 unsigned long ptr; 1748 unsigned long ptr_end; 1749 1750 ptr = btrfs_item_ptr_offset(eb, slot); 1751 ptr_end = ptr + item_size; 1752 while (ptr < ptr_end) { 1753 di = (struct btrfs_dir_item *)ptr; 1754 if (verify_dir_item(root, eb, di)) 1755 return -EIO; 1756 name_len = btrfs_dir_name_len(eb, di); 1757 ret = replay_one_name(trans, root, path, eb, di, key); 1758 if (ret) 1759 return ret; 1760 ptr = (unsigned long)(di + 1); 1761 ptr += name_len; 1762 } 1763 return 0; 1764 } 1765 1766 /* 1767 * directory replay has two parts. There are the standard directory 1768 * items in the log copied from the subvolume, and range items 1769 * created in the log while the subvolume was logged. 1770 * 1771 * The range items tell us which parts of the key space the log 1772 * is authoritative for. During replay, if a key in the subvolume 1773 * directory is in a logged range item, but not actually in the log 1774 * that means it was deleted from the directory before the fsync 1775 * and should be removed. 1776 */ 1777 static noinline int find_dir_range(struct btrfs_root *root, 1778 struct btrfs_path *path, 1779 u64 dirid, int key_type, 1780 u64 *start_ret, u64 *end_ret) 1781 { 1782 struct btrfs_key key; 1783 u64 found_end; 1784 struct btrfs_dir_log_item *item; 1785 int ret; 1786 int nritems; 1787 1788 if (*start_ret == (u64)-1) 1789 return 1; 1790 1791 key.objectid = dirid; 1792 key.type = key_type; 1793 key.offset = *start_ret; 1794 1795 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 1796 if (ret < 0) 1797 goto out; 1798 if (ret > 0) { 1799 if (path->slots[0] == 0) 1800 goto out; 1801 path->slots[0]--; 1802 } 1803 if (ret != 0) 1804 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 1805 1806 if (key.type != key_type || key.objectid != dirid) { 1807 ret = 1; 1808 goto next; 1809 } 1810 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 1811 struct btrfs_dir_log_item); 1812 found_end = btrfs_dir_log_end(path->nodes[0], item); 1813 1814 if (*start_ret >= key.offset && *start_ret <= found_end) { 1815 ret = 0; 1816 *start_ret = key.offset; 1817 *end_ret = found_end; 1818 goto out; 1819 } 1820 ret = 1; 1821 next: 1822 /* check the next slot in the tree to see if it is a valid item */ 1823 nritems = btrfs_header_nritems(path->nodes[0]); 1824 if (path->slots[0] >= nritems) { 1825 ret = btrfs_next_leaf(root, path); 1826 if (ret) 1827 goto out; 1828 } else { 1829 path->slots[0]++; 1830 } 1831 1832 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 1833 1834 if (key.type != key_type || key.objectid != dirid) { 1835 ret = 1; 1836 goto out; 1837 } 1838 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 1839 struct btrfs_dir_log_item); 1840 found_end = btrfs_dir_log_end(path->nodes[0], item); 1841 *start_ret = key.offset; 1842 *end_ret = found_end; 1843 ret = 0; 1844 out: 1845 btrfs_release_path(path); 1846 return ret; 1847 } 1848 1849 /* 1850 * this looks for a given directory item in the log. If the directory 1851 * item is not in the log, the item is removed and the inode it points 1852 * to is unlinked 1853 */ 1854 static noinline int check_item_in_log(struct btrfs_trans_handle *trans, 1855 struct btrfs_root *root, 1856 struct btrfs_root *log, 1857 struct btrfs_path *path, 1858 struct btrfs_path *log_path, 1859 struct inode *dir, 1860 struct btrfs_key *dir_key) 1861 { 1862 int ret; 1863 struct extent_buffer *eb; 1864 int slot; 1865 u32 item_size; 1866 struct btrfs_dir_item *di; 1867 struct btrfs_dir_item *log_di; 1868 int name_len; 1869 unsigned long ptr; 1870 unsigned long ptr_end; 1871 char *name; 1872 struct inode *inode; 1873 struct btrfs_key location; 1874 1875 again: 1876 eb = path->nodes[0]; 1877 slot = path->slots[0]; 1878 item_size = btrfs_item_size_nr(eb, slot); 1879 ptr = btrfs_item_ptr_offset(eb, slot); 1880 ptr_end = ptr + item_size; 1881 while (ptr < ptr_end) { 1882 di = (struct btrfs_dir_item *)ptr; 1883 if (verify_dir_item(root, eb, di)) { 1884 ret = -EIO; 1885 goto out; 1886 } 1887 1888 name_len = btrfs_dir_name_len(eb, di); 1889 name = kmalloc(name_len, GFP_NOFS); 1890 if (!name) { 1891 ret = -ENOMEM; 1892 goto out; 1893 } 1894 read_extent_buffer(eb, name, (unsigned long)(di + 1), 1895 name_len); 1896 log_di = NULL; 1897 if (log && dir_key->type == BTRFS_DIR_ITEM_KEY) { 1898 log_di = btrfs_lookup_dir_item(trans, log, log_path, 1899 dir_key->objectid, 1900 name, name_len, 0); 1901 } else if (log && dir_key->type == BTRFS_DIR_INDEX_KEY) { 1902 log_di = btrfs_lookup_dir_index_item(trans, log, 1903 log_path, 1904 dir_key->objectid, 1905 dir_key->offset, 1906 name, name_len, 0); 1907 } 1908 if (!log_di || (IS_ERR(log_di) && PTR_ERR(log_di) == -ENOENT)) { 1909 btrfs_dir_item_key_to_cpu(eb, di, &location); 1910 btrfs_release_path(path); 1911 btrfs_release_path(log_path); 1912 inode = read_one_inode(root, location.objectid); 1913 if (!inode) { 1914 kfree(name); 1915 return -EIO; 1916 } 1917 1918 ret = link_to_fixup_dir(trans, root, 1919 path, location.objectid); 1920 if (ret) { 1921 kfree(name); 1922 iput(inode); 1923 goto out; 1924 } 1925 1926 inc_nlink(inode); 1927 ret = btrfs_unlink_inode(trans, root, dir, inode, 1928 name, name_len); 1929 if (!ret) 1930 ret = btrfs_run_delayed_items(trans, root); 1931 kfree(name); 1932 iput(inode); 1933 if (ret) 1934 goto out; 1935 1936 /* there might still be more names under this key 1937 * check and repeat if required 1938 */ 1939 ret = btrfs_search_slot(NULL, root, dir_key, path, 1940 0, 0); 1941 if (ret == 0) 1942 goto again; 1943 ret = 0; 1944 goto out; 1945 } else if (IS_ERR(log_di)) { 1946 kfree(name); 1947 return PTR_ERR(log_di); 1948 } 1949 btrfs_release_path(log_path); 1950 kfree(name); 1951 1952 ptr = (unsigned long)(di + 1); 1953 ptr += name_len; 1954 } 1955 ret = 0; 1956 out: 1957 btrfs_release_path(path); 1958 btrfs_release_path(log_path); 1959 return ret; 1960 } 1961 1962 static int replay_xattr_deletes(struct btrfs_trans_handle *trans, 1963 struct btrfs_root *root, 1964 struct btrfs_root *log, 1965 struct btrfs_path *path, 1966 const u64 ino) 1967 { 1968 struct btrfs_key search_key; 1969 struct btrfs_path *log_path; 1970 int i; 1971 int nritems; 1972 int ret; 1973 1974 log_path = btrfs_alloc_path(); 1975 if (!log_path) 1976 return -ENOMEM; 1977 1978 search_key.objectid = ino; 1979 search_key.type = BTRFS_XATTR_ITEM_KEY; 1980 search_key.offset = 0; 1981 again: 1982 ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); 1983 if (ret < 0) 1984 goto out; 1985 process_leaf: 1986 nritems = btrfs_header_nritems(path->nodes[0]); 1987 for (i = path->slots[0]; i < nritems; i++) { 1988 struct btrfs_key key; 1989 struct btrfs_dir_item *di; 1990 struct btrfs_dir_item *log_di; 1991 u32 total_size; 1992 u32 cur; 1993 1994 btrfs_item_key_to_cpu(path->nodes[0], &key, i); 1995 if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY) { 1996 ret = 0; 1997 goto out; 1998 } 1999 2000 di = btrfs_item_ptr(path->nodes[0], i, struct btrfs_dir_item); 2001 total_size = btrfs_item_size_nr(path->nodes[0], i); 2002 cur = 0; 2003 while (cur < total_size) { 2004 u16 name_len = btrfs_dir_name_len(path->nodes[0], di); 2005 u16 data_len = btrfs_dir_data_len(path->nodes[0], di); 2006 u32 this_len = sizeof(*di) + name_len + data_len; 2007 char *name; 2008 2009 name = kmalloc(name_len, GFP_NOFS); 2010 if (!name) { 2011 ret = -ENOMEM; 2012 goto out; 2013 } 2014 read_extent_buffer(path->nodes[0], name, 2015 (unsigned long)(di + 1), name_len); 2016 2017 log_di = btrfs_lookup_xattr(NULL, log, log_path, ino, 2018 name, name_len, 0); 2019 btrfs_release_path(log_path); 2020 if (!log_di) { 2021 /* Doesn't exist in log tree, so delete it. */ 2022 btrfs_release_path(path); 2023 di = btrfs_lookup_xattr(trans, root, path, ino, 2024 name, name_len, -1); 2025 kfree(name); 2026 if (IS_ERR(di)) { 2027 ret = PTR_ERR(di); 2028 goto out; 2029 } 2030 ASSERT(di); 2031 ret = btrfs_delete_one_dir_name(trans, root, 2032 path, di); 2033 if (ret) 2034 goto out; 2035 btrfs_release_path(path); 2036 search_key = key; 2037 goto again; 2038 } 2039 kfree(name); 2040 if (IS_ERR(log_di)) { 2041 ret = PTR_ERR(log_di); 2042 goto out; 2043 } 2044 cur += this_len; 2045 di = (struct btrfs_dir_item *)((char *)di + this_len); 2046 } 2047 } 2048 ret = btrfs_next_leaf(root, path); 2049 if (ret > 0) 2050 ret = 0; 2051 else if (ret == 0) 2052 goto process_leaf; 2053 out: 2054 btrfs_free_path(log_path); 2055 btrfs_release_path(path); 2056 return ret; 2057 } 2058 2059 2060 /* 2061 * deletion replay happens before we copy any new directory items 2062 * out of the log or out of backreferences from inodes. It 2063 * scans the log to find ranges of keys that log is authoritative for, 2064 * and then scans the directory to find items in those ranges that are 2065 * not present in the log. 2066 * 2067 * Anything we don't find in the log is unlinked and removed from the 2068 * directory. 2069 */ 2070 static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, 2071 struct btrfs_root *root, 2072 struct btrfs_root *log, 2073 struct btrfs_path *path, 2074 u64 dirid, int del_all) 2075 { 2076 u64 range_start; 2077 u64 range_end; 2078 int key_type = BTRFS_DIR_LOG_ITEM_KEY; 2079 int ret = 0; 2080 struct btrfs_key dir_key; 2081 struct btrfs_key found_key; 2082 struct btrfs_path *log_path; 2083 struct inode *dir; 2084 2085 dir_key.objectid = dirid; 2086 dir_key.type = BTRFS_DIR_ITEM_KEY; 2087 log_path = btrfs_alloc_path(); 2088 if (!log_path) 2089 return -ENOMEM; 2090 2091 dir = read_one_inode(root, dirid); 2092 /* it isn't an error if the inode isn't there, that can happen 2093 * because we replay the deletes before we copy in the inode item 2094 * from the log 2095 */ 2096 if (!dir) { 2097 btrfs_free_path(log_path); 2098 return 0; 2099 } 2100 again: 2101 range_start = 0; 2102 range_end = 0; 2103 while (1) { 2104 if (del_all) 2105 range_end = (u64)-1; 2106 else { 2107 ret = find_dir_range(log, path, dirid, key_type, 2108 &range_start, &range_end); 2109 if (ret != 0) 2110 break; 2111 } 2112 2113 dir_key.offset = range_start; 2114 while (1) { 2115 int nritems; 2116 ret = btrfs_search_slot(NULL, root, &dir_key, path, 2117 0, 0); 2118 if (ret < 0) 2119 goto out; 2120 2121 nritems = btrfs_header_nritems(path->nodes[0]); 2122 if (path->slots[0] >= nritems) { 2123 ret = btrfs_next_leaf(root, path); 2124 if (ret) 2125 break; 2126 } 2127 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 2128 path->slots[0]); 2129 if (found_key.objectid != dirid || 2130 found_key.type != dir_key.type) 2131 goto next_type; 2132 2133 if (found_key.offset > range_end) 2134 break; 2135 2136 ret = check_item_in_log(trans, root, log, path, 2137 log_path, dir, 2138 &found_key); 2139 if (ret) 2140 goto out; 2141 if (found_key.offset == (u64)-1) 2142 break; 2143 dir_key.offset = found_key.offset + 1; 2144 } 2145 btrfs_release_path(path); 2146 if (range_end == (u64)-1) 2147 break; 2148 range_start = range_end + 1; 2149 } 2150 2151 next_type: 2152 ret = 0; 2153 if (key_type == BTRFS_DIR_LOG_ITEM_KEY) { 2154 key_type = BTRFS_DIR_LOG_INDEX_KEY; 2155 dir_key.type = BTRFS_DIR_INDEX_KEY; 2156 btrfs_release_path(path); 2157 goto again; 2158 } 2159 out: 2160 btrfs_release_path(path); 2161 btrfs_free_path(log_path); 2162 iput(dir); 2163 return ret; 2164 } 2165 2166 /* 2167 * the process_func used to replay items from the log tree. This 2168 * gets called in two different stages. The first stage just looks 2169 * for inodes and makes sure they are all copied into the subvolume. 2170 * 2171 * The second stage copies all the other item types from the log into 2172 * the subvolume. The two stage approach is slower, but gets rid of 2173 * lots of complexity around inodes referencing other inodes that exist 2174 * only in the log (references come from either directory items or inode 2175 * back refs). 2176 */ 2177 static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, 2178 struct walk_control *wc, u64 gen) 2179 { 2180 int nritems; 2181 struct btrfs_path *path; 2182 struct btrfs_root *root = wc->replay_dest; 2183 struct btrfs_key key; 2184 int level; 2185 int i; 2186 int ret; 2187 2188 ret = btrfs_read_buffer(eb, gen); 2189 if (ret) 2190 return ret; 2191 2192 level = btrfs_header_level(eb); 2193 2194 if (level != 0) 2195 return 0; 2196 2197 path = btrfs_alloc_path(); 2198 if (!path) 2199 return -ENOMEM; 2200 2201 nritems = btrfs_header_nritems(eb); 2202 for (i = 0; i < nritems; i++) { 2203 btrfs_item_key_to_cpu(eb, &key, i); 2204 2205 /* inode keys are done during the first stage */ 2206 if (key.type == BTRFS_INODE_ITEM_KEY && 2207 wc->stage == LOG_WALK_REPLAY_INODES) { 2208 struct btrfs_inode_item *inode_item; 2209 u32 mode; 2210 2211 inode_item = btrfs_item_ptr(eb, i, 2212 struct btrfs_inode_item); 2213 ret = replay_xattr_deletes(wc->trans, root, log, 2214 path, key.objectid); 2215 if (ret) 2216 break; 2217 mode = btrfs_inode_mode(eb, inode_item); 2218 if (S_ISDIR(mode)) { 2219 ret = replay_dir_deletes(wc->trans, 2220 root, log, path, key.objectid, 0); 2221 if (ret) 2222 break; 2223 } 2224 ret = overwrite_item(wc->trans, root, path, 2225 eb, i, &key); 2226 if (ret) 2227 break; 2228 2229 /* for regular files, make sure corresponding 2230 * orhpan item exist. extents past the new EOF 2231 * will be truncated later by orphan cleanup. 2232 */ 2233 if (S_ISREG(mode)) { 2234 ret = insert_orphan_item(wc->trans, root, 2235 key.objectid); 2236 if (ret) 2237 break; 2238 } 2239 2240 ret = link_to_fixup_dir(wc->trans, root, 2241 path, key.objectid); 2242 if (ret) 2243 break; 2244 } 2245 2246 if (key.type == BTRFS_DIR_INDEX_KEY && 2247 wc->stage == LOG_WALK_REPLAY_DIR_INDEX) { 2248 ret = replay_one_dir_item(wc->trans, root, path, 2249 eb, i, &key); 2250 if (ret) 2251 break; 2252 } 2253 2254 if (wc->stage < LOG_WALK_REPLAY_ALL) 2255 continue; 2256 2257 /* these keys are simply copied */ 2258 if (key.type == BTRFS_XATTR_ITEM_KEY) { 2259 ret = overwrite_item(wc->trans, root, path, 2260 eb, i, &key); 2261 if (ret) 2262 break; 2263 } else if (key.type == BTRFS_INODE_REF_KEY || 2264 key.type == BTRFS_INODE_EXTREF_KEY) { 2265 ret = add_inode_ref(wc->trans, root, log, path, 2266 eb, i, &key); 2267 if (ret && ret != -ENOENT) 2268 break; 2269 ret = 0; 2270 } else if (key.type == BTRFS_EXTENT_DATA_KEY) { 2271 ret = replay_one_extent(wc->trans, root, path, 2272 eb, i, &key); 2273 if (ret) 2274 break; 2275 } else if (key.type == BTRFS_DIR_ITEM_KEY) { 2276 ret = replay_one_dir_item(wc->trans, root, path, 2277 eb, i, &key); 2278 if (ret) 2279 break; 2280 } 2281 } 2282 btrfs_free_path(path); 2283 return ret; 2284 } 2285 2286 static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, 2287 struct btrfs_root *root, 2288 struct btrfs_path *path, int *level, 2289 struct walk_control *wc) 2290 { 2291 u64 root_owner; 2292 u64 bytenr; 2293 u64 ptr_gen; 2294 struct extent_buffer *next; 2295 struct extent_buffer *cur; 2296 struct extent_buffer *parent; 2297 u32 blocksize; 2298 int ret = 0; 2299 2300 WARN_ON(*level < 0); 2301 WARN_ON(*level >= BTRFS_MAX_LEVEL); 2302 2303 while (*level > 0) { 2304 WARN_ON(*level < 0); 2305 WARN_ON(*level >= BTRFS_MAX_LEVEL); 2306 cur = path->nodes[*level]; 2307 2308 WARN_ON(btrfs_header_level(cur) != *level); 2309 2310 if (path->slots[*level] >= 2311 btrfs_header_nritems(cur)) 2312 break; 2313 2314 bytenr = btrfs_node_blockptr(cur, path->slots[*level]); 2315 ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]); 2316 blocksize = root->nodesize; 2317 2318 parent = path->nodes[*level]; 2319 root_owner = btrfs_header_owner(parent); 2320 2321 next = btrfs_find_create_tree_block(root, bytenr); 2322 if (!next) 2323 return -ENOMEM; 2324 2325 if (*level == 1) { 2326 ret = wc->process_func(root, next, wc, ptr_gen); 2327 if (ret) { 2328 free_extent_buffer(next); 2329 return ret; 2330 } 2331 2332 path->slots[*level]++; 2333 if (wc->free) { 2334 ret = btrfs_read_buffer(next, ptr_gen); 2335 if (ret) { 2336 free_extent_buffer(next); 2337 return ret; 2338 } 2339 2340 if (trans) { 2341 btrfs_tree_lock(next); 2342 btrfs_set_lock_blocking(next); 2343 clean_tree_block(trans, root->fs_info, 2344 next); 2345 btrfs_wait_tree_block_writeback(next); 2346 btrfs_tree_unlock(next); 2347 } 2348 2349 WARN_ON(root_owner != 2350 BTRFS_TREE_LOG_OBJECTID); 2351 ret = btrfs_free_and_pin_reserved_extent(root, 2352 bytenr, blocksize); 2353 if (ret) { 2354 free_extent_buffer(next); 2355 return ret; 2356 } 2357 } 2358 free_extent_buffer(next); 2359 continue; 2360 } 2361 ret = btrfs_read_buffer(next, ptr_gen); 2362 if (ret) { 2363 free_extent_buffer(next); 2364 return ret; 2365 } 2366 2367 WARN_ON(*level <= 0); 2368 if (path->nodes[*level-1]) 2369 free_extent_buffer(path->nodes[*level-1]); 2370 path->nodes[*level-1] = next; 2371 *level = btrfs_header_level(next); 2372 path->slots[*level] = 0; 2373 cond_resched(); 2374 } 2375 WARN_ON(*level < 0); 2376 WARN_ON(*level >= BTRFS_MAX_LEVEL); 2377 2378 path->slots[*level] = btrfs_header_nritems(path->nodes[*level]); 2379 2380 cond_resched(); 2381 return 0; 2382 } 2383 2384 static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, 2385 struct btrfs_root *root, 2386 struct btrfs_path *path, int *level, 2387 struct walk_control *wc) 2388 { 2389 u64 root_owner; 2390 int i; 2391 int slot; 2392 int ret; 2393 2394 for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) { 2395 slot = path->slots[i]; 2396 if (slot + 1 < btrfs_header_nritems(path->nodes[i])) { 2397 path->slots[i]++; 2398 *level = i; 2399 WARN_ON(*level == 0); 2400 return 0; 2401 } else { 2402 struct extent_buffer *parent; 2403 if (path->nodes[*level] == root->node) 2404 parent = path->nodes[*level]; 2405 else 2406 parent = path->nodes[*level + 1]; 2407 2408 root_owner = btrfs_header_owner(parent); 2409 ret = wc->process_func(root, path->nodes[*level], wc, 2410 btrfs_header_generation(path->nodes[*level])); 2411 if (ret) 2412 return ret; 2413 2414 if (wc->free) { 2415 struct extent_buffer *next; 2416 2417 next = path->nodes[*level]; 2418 2419 if (trans) { 2420 btrfs_tree_lock(next); 2421 btrfs_set_lock_blocking(next); 2422 clean_tree_block(trans, root->fs_info, 2423 next); 2424 btrfs_wait_tree_block_writeback(next); 2425 btrfs_tree_unlock(next); 2426 } 2427 2428 WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID); 2429 ret = btrfs_free_and_pin_reserved_extent(root, 2430 path->nodes[*level]->start, 2431 path->nodes[*level]->len); 2432 if (ret) 2433 return ret; 2434 } 2435 free_extent_buffer(path->nodes[*level]); 2436 path->nodes[*level] = NULL; 2437 *level = i + 1; 2438 } 2439 } 2440 return 1; 2441 } 2442 2443 /* 2444 * drop the reference count on the tree rooted at 'snap'. This traverses 2445 * the tree freeing any blocks that have a ref count of zero after being 2446 * decremented. 2447 */ 2448 static int walk_log_tree(struct btrfs_trans_handle *trans, 2449 struct btrfs_root *log, struct walk_control *wc) 2450 { 2451 int ret = 0; 2452 int wret; 2453 int level; 2454 struct btrfs_path *path; 2455 int orig_level; 2456 2457 path = btrfs_alloc_path(); 2458 if (!path) 2459 return -ENOMEM; 2460 2461 level = btrfs_header_level(log->node); 2462 orig_level = level; 2463 path->nodes[level] = log->node; 2464 extent_buffer_get(log->node); 2465 path->slots[level] = 0; 2466 2467 while (1) { 2468 wret = walk_down_log_tree(trans, log, path, &level, wc); 2469 if (wret > 0) 2470 break; 2471 if (wret < 0) { 2472 ret = wret; 2473 goto out; 2474 } 2475 2476 wret = walk_up_log_tree(trans, log, path, &level, wc); 2477 if (wret > 0) 2478 break; 2479 if (wret < 0) { 2480 ret = wret; 2481 goto out; 2482 } 2483 } 2484 2485 /* was the root node processed? if not, catch it here */ 2486 if (path->nodes[orig_level]) { 2487 ret = wc->process_func(log, path->nodes[orig_level], wc, 2488 btrfs_header_generation(path->nodes[orig_level])); 2489 if (ret) 2490 goto out; 2491 if (wc->free) { 2492 struct extent_buffer *next; 2493 2494 next = path->nodes[orig_level]; 2495 2496 if (trans) { 2497 btrfs_tree_lock(next); 2498 btrfs_set_lock_blocking(next); 2499 clean_tree_block(trans, log->fs_info, next); 2500 btrfs_wait_tree_block_writeback(next); 2501 btrfs_tree_unlock(next); 2502 } 2503 2504 WARN_ON(log->root_key.objectid != 2505 BTRFS_TREE_LOG_OBJECTID); 2506 ret = btrfs_free_and_pin_reserved_extent(log, next->start, 2507 next->len); 2508 if (ret) 2509 goto out; 2510 } 2511 } 2512 2513 out: 2514 btrfs_free_path(path); 2515 return ret; 2516 } 2517 2518 /* 2519 * helper function to update the item for a given subvolumes log root 2520 * in the tree of log roots 2521 */ 2522 static int update_log_root(struct btrfs_trans_handle *trans, 2523 struct btrfs_root *log) 2524 { 2525 int ret; 2526 2527 if (log->log_transid == 1) { 2528 /* insert root item on the first sync */ 2529 ret = btrfs_insert_root(trans, log->fs_info->log_root_tree, 2530 &log->root_key, &log->root_item); 2531 } else { 2532 ret = btrfs_update_root(trans, log->fs_info->log_root_tree, 2533 &log->root_key, &log->root_item); 2534 } 2535 return ret; 2536 } 2537 2538 static void wait_log_commit(struct btrfs_trans_handle *trans, 2539 struct btrfs_root *root, int transid) 2540 { 2541 DEFINE_WAIT(wait); 2542 int index = transid % 2; 2543 2544 /* 2545 * we only allow two pending log transactions at a time, 2546 * so we know that if ours is more than 2 older than the 2547 * current transaction, we're done 2548 */ 2549 do { 2550 prepare_to_wait(&root->log_commit_wait[index], 2551 &wait, TASK_UNINTERRUPTIBLE); 2552 mutex_unlock(&root->log_mutex); 2553 2554 if (root->log_transid_committed < transid && 2555 atomic_read(&root->log_commit[index])) 2556 schedule(); 2557 2558 finish_wait(&root->log_commit_wait[index], &wait); 2559 mutex_lock(&root->log_mutex); 2560 } while (root->log_transid_committed < transid && 2561 atomic_read(&root->log_commit[index])); 2562 } 2563 2564 static void wait_for_writer(struct btrfs_trans_handle *trans, 2565 struct btrfs_root *root) 2566 { 2567 DEFINE_WAIT(wait); 2568 2569 while (atomic_read(&root->log_writers)) { 2570 prepare_to_wait(&root->log_writer_wait, 2571 &wait, TASK_UNINTERRUPTIBLE); 2572 mutex_unlock(&root->log_mutex); 2573 if (atomic_read(&root->log_writers)) 2574 schedule(); 2575 finish_wait(&root->log_writer_wait, &wait); 2576 mutex_lock(&root->log_mutex); 2577 } 2578 } 2579 2580 static inline void btrfs_remove_log_ctx(struct btrfs_root *root, 2581 struct btrfs_log_ctx *ctx) 2582 { 2583 if (!ctx) 2584 return; 2585 2586 mutex_lock(&root->log_mutex); 2587 list_del_init(&ctx->list); 2588 mutex_unlock(&root->log_mutex); 2589 } 2590 2591 /* 2592 * Invoked in log mutex context, or be sure there is no other task which 2593 * can access the list. 2594 */ 2595 static inline void btrfs_remove_all_log_ctxs(struct btrfs_root *root, 2596 int index, int error) 2597 { 2598 struct btrfs_log_ctx *ctx; 2599 2600 if (!error) { 2601 INIT_LIST_HEAD(&root->log_ctxs[index]); 2602 return; 2603 } 2604 2605 list_for_each_entry(ctx, &root->log_ctxs[index], list) 2606 ctx->log_ret = error; 2607 2608 INIT_LIST_HEAD(&root->log_ctxs[index]); 2609 } 2610 2611 /* 2612 * btrfs_sync_log does sends a given tree log down to the disk and 2613 * updates the super blocks to record it. When this call is done, 2614 * you know that any inodes previously logged are safely on disk only 2615 * if it returns 0. 2616 * 2617 * Any other return value means you need to call btrfs_commit_transaction. 2618 * Some of the edge cases for fsyncing directories that have had unlinks 2619 * or renames done in the past mean that sometimes the only safe 2620 * fsync is to commit the whole FS. When btrfs_sync_log returns -EAGAIN, 2621 * that has happened. 2622 */ 2623 int btrfs_sync_log(struct btrfs_trans_handle *trans, 2624 struct btrfs_root *root, struct btrfs_log_ctx *ctx) 2625 { 2626 int index1; 2627 int index2; 2628 int mark; 2629 int ret; 2630 struct btrfs_root *log = root->log_root; 2631 struct btrfs_root *log_root_tree = root->fs_info->log_root_tree; 2632 int log_transid = 0; 2633 struct btrfs_log_ctx root_log_ctx; 2634 struct blk_plug plug; 2635 2636 mutex_lock(&root->log_mutex); 2637 log_transid = ctx->log_transid; 2638 if (root->log_transid_committed >= log_transid) { 2639 mutex_unlock(&root->log_mutex); 2640 return ctx->log_ret; 2641 } 2642 2643 index1 = log_transid % 2; 2644 if (atomic_read(&root->log_commit[index1])) { 2645 wait_log_commit(trans, root, log_transid); 2646 mutex_unlock(&root->log_mutex); 2647 return ctx->log_ret; 2648 } 2649 ASSERT(log_transid == root->log_transid); 2650 atomic_set(&root->log_commit[index1], 1); 2651 2652 /* wait for previous tree log sync to complete */ 2653 if (atomic_read(&root->log_commit[(index1 + 1) % 2])) 2654 wait_log_commit(trans, root, log_transid - 1); 2655 2656 while (1) { 2657 int batch = atomic_read(&root->log_batch); 2658 /* when we're on an ssd, just kick the log commit out */ 2659 if (!btrfs_test_opt(root, SSD) && 2660 test_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state)) { 2661 mutex_unlock(&root->log_mutex); 2662 schedule_timeout_uninterruptible(1); 2663 mutex_lock(&root->log_mutex); 2664 } 2665 wait_for_writer(trans, root); 2666 if (batch == atomic_read(&root->log_batch)) 2667 break; 2668 } 2669 2670 /* bail out if we need to do a full commit */ 2671 if (btrfs_need_log_full_commit(root->fs_info, trans)) { 2672 ret = -EAGAIN; 2673 btrfs_free_logged_extents(log, log_transid); 2674 mutex_unlock(&root->log_mutex); 2675 goto out; 2676 } 2677 2678 if (log_transid % 2 == 0) 2679 mark = EXTENT_DIRTY; 2680 else 2681 mark = EXTENT_NEW; 2682 2683 /* we start IO on all the marked extents here, but we don't actually 2684 * wait for them until later. 2685 */ 2686 blk_start_plug(&plug); 2687 ret = btrfs_write_marked_extents(log, &log->dirty_log_pages, mark); 2688 if (ret) { 2689 blk_finish_plug(&plug); 2690 btrfs_abort_transaction(trans, root, ret); 2691 btrfs_free_logged_extents(log, log_transid); 2692 btrfs_set_log_full_commit(root->fs_info, trans); 2693 mutex_unlock(&root->log_mutex); 2694 goto out; 2695 } 2696 2697 btrfs_set_root_node(&log->root_item, log->node); 2698 2699 root->log_transid++; 2700 log->log_transid = root->log_transid; 2701 root->log_start_pid = 0; 2702 /* 2703 * IO has been started, blocks of the log tree have WRITTEN flag set 2704 * in their headers. new modifications of the log will be written to 2705 * new positions. so it's safe to allow log writers to go in. 2706 */ 2707 mutex_unlock(&root->log_mutex); 2708 2709 btrfs_init_log_ctx(&root_log_ctx); 2710 2711 mutex_lock(&log_root_tree->log_mutex); 2712 atomic_inc(&log_root_tree->log_batch); 2713 atomic_inc(&log_root_tree->log_writers); 2714 2715 index2 = log_root_tree->log_transid % 2; 2716 list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]); 2717 root_log_ctx.log_transid = log_root_tree->log_transid; 2718 2719 mutex_unlock(&log_root_tree->log_mutex); 2720 2721 ret = update_log_root(trans, log); 2722 2723 mutex_lock(&log_root_tree->log_mutex); 2724 if (atomic_dec_and_test(&log_root_tree->log_writers)) { 2725 smp_mb(); 2726 if (waitqueue_active(&log_root_tree->log_writer_wait)) 2727 wake_up(&log_root_tree->log_writer_wait); 2728 } 2729 2730 if (ret) { 2731 if (!list_empty(&root_log_ctx.list)) 2732 list_del_init(&root_log_ctx.list); 2733 2734 blk_finish_plug(&plug); 2735 btrfs_set_log_full_commit(root->fs_info, trans); 2736 2737 if (ret != -ENOSPC) { 2738 btrfs_abort_transaction(trans, root, ret); 2739 mutex_unlock(&log_root_tree->log_mutex); 2740 goto out; 2741 } 2742 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); 2743 btrfs_free_logged_extents(log, log_transid); 2744 mutex_unlock(&log_root_tree->log_mutex); 2745 ret = -EAGAIN; 2746 goto out; 2747 } 2748 2749 if (log_root_tree->log_transid_committed >= root_log_ctx.log_transid) { 2750 blk_finish_plug(&plug); 2751 mutex_unlock(&log_root_tree->log_mutex); 2752 ret = root_log_ctx.log_ret; 2753 goto out; 2754 } 2755 2756 index2 = root_log_ctx.log_transid % 2; 2757 if (atomic_read(&log_root_tree->log_commit[index2])) { 2758 blk_finish_plug(&plug); 2759 ret = btrfs_wait_marked_extents(log, &log->dirty_log_pages, 2760 mark); 2761 btrfs_wait_logged_extents(trans, log, log_transid); 2762 wait_log_commit(trans, log_root_tree, 2763 root_log_ctx.log_transid); 2764 mutex_unlock(&log_root_tree->log_mutex); 2765 if (!ret) 2766 ret = root_log_ctx.log_ret; 2767 goto out; 2768 } 2769 ASSERT(root_log_ctx.log_transid == log_root_tree->log_transid); 2770 atomic_set(&log_root_tree->log_commit[index2], 1); 2771 2772 if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) { 2773 wait_log_commit(trans, log_root_tree, 2774 root_log_ctx.log_transid - 1); 2775 } 2776 2777 wait_for_writer(trans, log_root_tree); 2778 2779 /* 2780 * now that we've moved on to the tree of log tree roots, 2781 * check the full commit flag again 2782 */ 2783 if (btrfs_need_log_full_commit(root->fs_info, trans)) { 2784 blk_finish_plug(&plug); 2785 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); 2786 btrfs_free_logged_extents(log, log_transid); 2787 mutex_unlock(&log_root_tree->log_mutex); 2788 ret = -EAGAIN; 2789 goto out_wake_log_root; 2790 } 2791 2792 ret = btrfs_write_marked_extents(log_root_tree, 2793 &log_root_tree->dirty_log_pages, 2794 EXTENT_DIRTY | EXTENT_NEW); 2795 blk_finish_plug(&plug); 2796 if (ret) { 2797 btrfs_set_log_full_commit(root->fs_info, trans); 2798 btrfs_abort_transaction(trans, root, ret); 2799 btrfs_free_logged_extents(log, log_transid); 2800 mutex_unlock(&log_root_tree->log_mutex); 2801 goto out_wake_log_root; 2802 } 2803 ret = btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); 2804 if (!ret) 2805 ret = btrfs_wait_marked_extents(log_root_tree, 2806 &log_root_tree->dirty_log_pages, 2807 EXTENT_NEW | EXTENT_DIRTY); 2808 if (ret) { 2809 btrfs_set_log_full_commit(root->fs_info, trans); 2810 btrfs_free_logged_extents(log, log_transid); 2811 mutex_unlock(&log_root_tree->log_mutex); 2812 goto out_wake_log_root; 2813 } 2814 btrfs_wait_logged_extents(trans, log, log_transid); 2815 2816 btrfs_set_super_log_root(root->fs_info->super_for_commit, 2817 log_root_tree->node->start); 2818 btrfs_set_super_log_root_level(root->fs_info->super_for_commit, 2819 btrfs_header_level(log_root_tree->node)); 2820 2821 log_root_tree->log_transid++; 2822 mutex_unlock(&log_root_tree->log_mutex); 2823 2824 /* 2825 * nobody else is going to jump in and write the the ctree 2826 * super here because the log_commit atomic below is protecting 2827 * us. We must be called with a transaction handle pinning 2828 * the running transaction open, so a full commit can't hop 2829 * in and cause problems either. 2830 */ 2831 ret = write_ctree_super(trans, root->fs_info->tree_root, 1); 2832 if (ret) { 2833 btrfs_set_log_full_commit(root->fs_info, trans); 2834 btrfs_abort_transaction(trans, root, ret); 2835 goto out_wake_log_root; 2836 } 2837 2838 mutex_lock(&root->log_mutex); 2839 if (root->last_log_commit < log_transid) 2840 root->last_log_commit = log_transid; 2841 mutex_unlock(&root->log_mutex); 2842 2843 out_wake_log_root: 2844 /* 2845 * We needn't get log_mutex here because we are sure all 2846 * the other tasks are blocked. 2847 */ 2848 btrfs_remove_all_log_ctxs(log_root_tree, index2, ret); 2849 2850 mutex_lock(&log_root_tree->log_mutex); 2851 log_root_tree->log_transid_committed++; 2852 atomic_set(&log_root_tree->log_commit[index2], 0); 2853 mutex_unlock(&log_root_tree->log_mutex); 2854 2855 if (waitqueue_active(&log_root_tree->log_commit_wait[index2])) 2856 wake_up(&log_root_tree->log_commit_wait[index2]); 2857 out: 2858 /* See above. */ 2859 btrfs_remove_all_log_ctxs(root, index1, ret); 2860 2861 mutex_lock(&root->log_mutex); 2862 root->log_transid_committed++; 2863 atomic_set(&root->log_commit[index1], 0); 2864 mutex_unlock(&root->log_mutex); 2865 2866 if (waitqueue_active(&root->log_commit_wait[index1])) 2867 wake_up(&root->log_commit_wait[index1]); 2868 return ret; 2869 } 2870 2871 static void free_log_tree(struct btrfs_trans_handle *trans, 2872 struct btrfs_root *log) 2873 { 2874 int ret; 2875 u64 start; 2876 u64 end; 2877 struct walk_control wc = { 2878 .free = 1, 2879 .process_func = process_one_buffer 2880 }; 2881 2882 ret = walk_log_tree(trans, log, &wc); 2883 /* I don't think this can happen but just in case */ 2884 if (ret) 2885 btrfs_abort_transaction(trans, log, ret); 2886 2887 while (1) { 2888 ret = find_first_extent_bit(&log->dirty_log_pages, 2889 0, &start, &end, EXTENT_DIRTY | EXTENT_NEW, 2890 NULL); 2891 if (ret) 2892 break; 2893 2894 clear_extent_bits(&log->dirty_log_pages, start, end, 2895 EXTENT_DIRTY | EXTENT_NEW, GFP_NOFS); 2896 } 2897 2898 /* 2899 * We may have short-circuited the log tree with the full commit logic 2900 * and left ordered extents on our list, so clear these out to keep us 2901 * from leaking inodes and memory. 2902 */ 2903 btrfs_free_logged_extents(log, 0); 2904 btrfs_free_logged_extents(log, 1); 2905 2906 free_extent_buffer(log->node); 2907 kfree(log); 2908 } 2909 2910 /* 2911 * free all the extents used by the tree log. This should be called 2912 * at commit time of the full transaction 2913 */ 2914 int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root) 2915 { 2916 if (root->log_root) { 2917 free_log_tree(trans, root->log_root); 2918 root->log_root = NULL; 2919 } 2920 return 0; 2921 } 2922 2923 int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans, 2924 struct btrfs_fs_info *fs_info) 2925 { 2926 if (fs_info->log_root_tree) { 2927 free_log_tree(trans, fs_info->log_root_tree); 2928 fs_info->log_root_tree = NULL; 2929 } 2930 return 0; 2931 } 2932 2933 /* 2934 * If both a file and directory are logged, and unlinks or renames are 2935 * mixed in, we have a few interesting corners: 2936 * 2937 * create file X in dir Y 2938 * link file X to X.link in dir Y 2939 * fsync file X 2940 * unlink file X but leave X.link 2941 * fsync dir Y 2942 * 2943 * After a crash we would expect only X.link to exist. But file X 2944 * didn't get fsync'd again so the log has back refs for X and X.link. 2945 * 2946 * We solve this by removing directory entries and inode backrefs from the 2947 * log when a file that was logged in the current transaction is 2948 * unlinked. Any later fsync will include the updated log entries, and 2949 * we'll be able to reconstruct the proper directory items from backrefs. 2950 * 2951 * This optimizations allows us to avoid relogging the entire inode 2952 * or the entire directory. 2953 */ 2954 int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, 2955 struct btrfs_root *root, 2956 const char *name, int name_len, 2957 struct inode *dir, u64 index) 2958 { 2959 struct btrfs_root *log; 2960 struct btrfs_dir_item *di; 2961 struct btrfs_path *path; 2962 int ret; 2963 int err = 0; 2964 int bytes_del = 0; 2965 u64 dir_ino = btrfs_ino(dir); 2966 2967 if (BTRFS_I(dir)->logged_trans < trans->transid) 2968 return 0; 2969 2970 ret = join_running_log_trans(root); 2971 if (ret) 2972 return 0; 2973 2974 mutex_lock(&BTRFS_I(dir)->log_mutex); 2975 2976 log = root->log_root; 2977 path = btrfs_alloc_path(); 2978 if (!path) { 2979 err = -ENOMEM; 2980 goto out_unlock; 2981 } 2982 2983 di = btrfs_lookup_dir_item(trans, log, path, dir_ino, 2984 name, name_len, -1); 2985 if (IS_ERR(di)) { 2986 err = PTR_ERR(di); 2987 goto fail; 2988 } 2989 if (di) { 2990 ret = btrfs_delete_one_dir_name(trans, log, path, di); 2991 bytes_del += name_len; 2992 if (ret) { 2993 err = ret; 2994 goto fail; 2995 } 2996 } 2997 btrfs_release_path(path); 2998 di = btrfs_lookup_dir_index_item(trans, log, path, dir_ino, 2999 index, name, name_len, -1); 3000 if (IS_ERR(di)) { 3001 err = PTR_ERR(di); 3002 goto fail; 3003 } 3004 if (di) { 3005 ret = btrfs_delete_one_dir_name(trans, log, path, di); 3006 bytes_del += name_len; 3007 if (ret) { 3008 err = ret; 3009 goto fail; 3010 } 3011 } 3012 3013 /* update the directory size in the log to reflect the names 3014 * we have removed 3015 */ 3016 if (bytes_del) { 3017 struct btrfs_key key; 3018 3019 key.objectid = dir_ino; 3020 key.offset = 0; 3021 key.type = BTRFS_INODE_ITEM_KEY; 3022 btrfs_release_path(path); 3023 3024 ret = btrfs_search_slot(trans, log, &key, path, 0, 1); 3025 if (ret < 0) { 3026 err = ret; 3027 goto fail; 3028 } 3029 if (ret == 0) { 3030 struct btrfs_inode_item *item; 3031 u64 i_size; 3032 3033 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 3034 struct btrfs_inode_item); 3035 i_size = btrfs_inode_size(path->nodes[0], item); 3036 if (i_size > bytes_del) 3037 i_size -= bytes_del; 3038 else 3039 i_size = 0; 3040 btrfs_set_inode_size(path->nodes[0], item, i_size); 3041 btrfs_mark_buffer_dirty(path->nodes[0]); 3042 } else 3043 ret = 0; 3044 btrfs_release_path(path); 3045 } 3046 fail: 3047 btrfs_free_path(path); 3048 out_unlock: 3049 mutex_unlock(&BTRFS_I(dir)->log_mutex); 3050 if (ret == -ENOSPC) { 3051 btrfs_set_log_full_commit(root->fs_info, trans); 3052 ret = 0; 3053 } else if (ret < 0) 3054 btrfs_abort_transaction(trans, root, ret); 3055 3056 btrfs_end_log_trans(root); 3057 3058 return err; 3059 } 3060 3061 /* see comments for btrfs_del_dir_entries_in_log */ 3062 int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, 3063 struct btrfs_root *root, 3064 const char *name, int name_len, 3065 struct inode *inode, u64 dirid) 3066 { 3067 struct btrfs_root *log; 3068 u64 index; 3069 int ret; 3070 3071 if (BTRFS_I(inode)->logged_trans < trans->transid) 3072 return 0; 3073 3074 ret = join_running_log_trans(root); 3075 if (ret) 3076 return 0; 3077 log = root->log_root; 3078 mutex_lock(&BTRFS_I(inode)->log_mutex); 3079 3080 ret = btrfs_del_inode_ref(trans, log, name, name_len, btrfs_ino(inode), 3081 dirid, &index); 3082 mutex_unlock(&BTRFS_I(inode)->log_mutex); 3083 if (ret == -ENOSPC) { 3084 btrfs_set_log_full_commit(root->fs_info, trans); 3085 ret = 0; 3086 } else if (ret < 0 && ret != -ENOENT) 3087 btrfs_abort_transaction(trans, root, ret); 3088 btrfs_end_log_trans(root); 3089 3090 return ret; 3091 } 3092 3093 /* 3094 * creates a range item in the log for 'dirid'. first_offset and 3095 * last_offset tell us which parts of the key space the log should 3096 * be considered authoritative for. 3097 */ 3098 static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans, 3099 struct btrfs_root *log, 3100 struct btrfs_path *path, 3101 int key_type, u64 dirid, 3102 u64 first_offset, u64 last_offset) 3103 { 3104 int ret; 3105 struct btrfs_key key; 3106 struct btrfs_dir_log_item *item; 3107 3108 key.objectid = dirid; 3109 key.offset = first_offset; 3110 if (key_type == BTRFS_DIR_ITEM_KEY) 3111 key.type = BTRFS_DIR_LOG_ITEM_KEY; 3112 else 3113 key.type = BTRFS_DIR_LOG_INDEX_KEY; 3114 ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item)); 3115 if (ret) 3116 return ret; 3117 3118 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 3119 struct btrfs_dir_log_item); 3120 btrfs_set_dir_log_end(path->nodes[0], item, last_offset); 3121 btrfs_mark_buffer_dirty(path->nodes[0]); 3122 btrfs_release_path(path); 3123 return 0; 3124 } 3125 3126 /* 3127 * log all the items included in the current transaction for a given 3128 * directory. This also creates the range items in the log tree required 3129 * to replay anything deleted before the fsync 3130 */ 3131 static noinline int log_dir_items(struct btrfs_trans_handle *trans, 3132 struct btrfs_root *root, struct inode *inode, 3133 struct btrfs_path *path, 3134 struct btrfs_path *dst_path, int key_type, 3135 struct btrfs_log_ctx *ctx, 3136 u64 min_offset, u64 *last_offset_ret) 3137 { 3138 struct btrfs_key min_key; 3139 struct btrfs_root *log = root->log_root; 3140 struct extent_buffer *src; 3141 int err = 0; 3142 int ret; 3143 int i; 3144 int nritems; 3145 u64 first_offset = min_offset; 3146 u64 last_offset = (u64)-1; 3147 u64 ino = btrfs_ino(inode); 3148 3149 log = root->log_root; 3150 3151 min_key.objectid = ino; 3152 min_key.type = key_type; 3153 min_key.offset = min_offset; 3154 3155 ret = btrfs_search_forward(root, &min_key, path, trans->transid); 3156 3157 /* 3158 * we didn't find anything from this transaction, see if there 3159 * is anything at all 3160 */ 3161 if (ret != 0 || min_key.objectid != ino || min_key.type != key_type) { 3162 min_key.objectid = ino; 3163 min_key.type = key_type; 3164 min_key.offset = (u64)-1; 3165 btrfs_release_path(path); 3166 ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); 3167 if (ret < 0) { 3168 btrfs_release_path(path); 3169 return ret; 3170 } 3171 ret = btrfs_previous_item(root, path, ino, key_type); 3172 3173 /* if ret == 0 there are items for this type, 3174 * create a range to tell us the last key of this type. 3175 * otherwise, there are no items in this directory after 3176 * *min_offset, and we create a range to indicate that. 3177 */ 3178 if (ret == 0) { 3179 struct btrfs_key tmp; 3180 btrfs_item_key_to_cpu(path->nodes[0], &tmp, 3181 path->slots[0]); 3182 if (key_type == tmp.type) 3183 first_offset = max(min_offset, tmp.offset) + 1; 3184 } 3185 goto done; 3186 } 3187 3188 /* go backward to find any previous key */ 3189 ret = btrfs_previous_item(root, path, ino, key_type); 3190 if (ret == 0) { 3191 struct btrfs_key tmp; 3192 btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]); 3193 if (key_type == tmp.type) { 3194 first_offset = tmp.offset; 3195 ret = overwrite_item(trans, log, dst_path, 3196 path->nodes[0], path->slots[0], 3197 &tmp); 3198 if (ret) { 3199 err = ret; 3200 goto done; 3201 } 3202 } 3203 } 3204 btrfs_release_path(path); 3205 3206 /* find the first key from this transaction again */ 3207 ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); 3208 if (WARN_ON(ret != 0)) 3209 goto done; 3210 3211 /* 3212 * we have a block from this transaction, log every item in it 3213 * from our directory 3214 */ 3215 while (1) { 3216 struct btrfs_key tmp; 3217 src = path->nodes[0]; 3218 nritems = btrfs_header_nritems(src); 3219 for (i = path->slots[0]; i < nritems; i++) { 3220 struct btrfs_dir_item *di; 3221 3222 btrfs_item_key_to_cpu(src, &min_key, i); 3223 3224 if (min_key.objectid != ino || min_key.type != key_type) 3225 goto done; 3226 ret = overwrite_item(trans, log, dst_path, src, i, 3227 &min_key); 3228 if (ret) { 3229 err = ret; 3230 goto done; 3231 } 3232 3233 /* 3234 * We must make sure that when we log a directory entry, 3235 * the corresponding inode, after log replay, has a 3236 * matching link count. For example: 3237 * 3238 * touch foo 3239 * mkdir mydir 3240 * sync 3241 * ln foo mydir/bar 3242 * xfs_io -c "fsync" mydir 3243 * <crash> 3244 * <mount fs and log replay> 3245 * 3246 * Would result in a fsync log that when replayed, our 3247 * file inode would have a link count of 1, but we get 3248 * two directory entries pointing to the same inode. 3249 * After removing one of the names, it would not be 3250 * possible to remove the other name, which resulted 3251 * always in stale file handle errors, and would not 3252 * be possible to rmdir the parent directory, since 3253 * its i_size could never decrement to the value 3254 * BTRFS_EMPTY_DIR_SIZE, resulting in -ENOTEMPTY errors. 3255 */ 3256 di = btrfs_item_ptr(src, i, struct btrfs_dir_item); 3257 btrfs_dir_item_key_to_cpu(src, di, &tmp); 3258 if (ctx && 3259 (btrfs_dir_transid(src, di) == trans->transid || 3260 btrfs_dir_type(src, di) == BTRFS_FT_DIR) && 3261 tmp.type != BTRFS_ROOT_ITEM_KEY) 3262 ctx->log_new_dentries = true; 3263 } 3264 path->slots[0] = nritems; 3265 3266 /* 3267 * look ahead to the next item and see if it is also 3268 * from this directory and from this transaction 3269 */ 3270 ret = btrfs_next_leaf(root, path); 3271 if (ret == 1) { 3272 last_offset = (u64)-1; 3273 goto done; 3274 } 3275 btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]); 3276 if (tmp.objectid != ino || tmp.type != key_type) { 3277 last_offset = (u64)-1; 3278 goto done; 3279 } 3280 if (btrfs_header_generation(path->nodes[0]) != trans->transid) { 3281 ret = overwrite_item(trans, log, dst_path, 3282 path->nodes[0], path->slots[0], 3283 &tmp); 3284 if (ret) 3285 err = ret; 3286 else 3287 last_offset = tmp.offset; 3288 goto done; 3289 } 3290 } 3291 done: 3292 btrfs_release_path(path); 3293 btrfs_release_path(dst_path); 3294 3295 if (err == 0) { 3296 *last_offset_ret = last_offset; 3297 /* 3298 * insert the log range keys to indicate where the log 3299 * is valid 3300 */ 3301 ret = insert_dir_log_key(trans, log, path, key_type, 3302 ino, first_offset, last_offset); 3303 if (ret) 3304 err = ret; 3305 } 3306 return err; 3307 } 3308 3309 /* 3310 * logging directories is very similar to logging inodes, We find all the items 3311 * from the current transaction and write them to the log. 3312 * 3313 * The recovery code scans the directory in the subvolume, and if it finds a 3314 * key in the range logged that is not present in the log tree, then it means 3315 * that dir entry was unlinked during the transaction. 3316 * 3317 * In order for that scan to work, we must include one key smaller than 3318 * the smallest logged by this transaction and one key larger than the largest 3319 * key logged by this transaction. 3320 */ 3321 static noinline int log_directory_changes(struct btrfs_trans_handle *trans, 3322 struct btrfs_root *root, struct inode *inode, 3323 struct btrfs_path *path, 3324 struct btrfs_path *dst_path, 3325 struct btrfs_log_ctx *ctx) 3326 { 3327 u64 min_key; 3328 u64 max_key; 3329 int ret; 3330 int key_type = BTRFS_DIR_ITEM_KEY; 3331 3332 again: 3333 min_key = 0; 3334 max_key = 0; 3335 while (1) { 3336 ret = log_dir_items(trans, root, inode, path, 3337 dst_path, key_type, ctx, min_key, 3338 &max_key); 3339 if (ret) 3340 return ret; 3341 if (max_key == (u64)-1) 3342 break; 3343 min_key = max_key + 1; 3344 } 3345 3346 if (key_type == BTRFS_DIR_ITEM_KEY) { 3347 key_type = BTRFS_DIR_INDEX_KEY; 3348 goto again; 3349 } 3350 return 0; 3351 } 3352 3353 /* 3354 * a helper function to drop items from the log before we relog an 3355 * inode. max_key_type indicates the highest item type to remove. 3356 * This cannot be run for file data extents because it does not 3357 * free the extents they point to. 3358 */ 3359 static int drop_objectid_items(struct btrfs_trans_handle *trans, 3360 struct btrfs_root *log, 3361 struct btrfs_path *path, 3362 u64 objectid, int max_key_type) 3363 { 3364 int ret; 3365 struct btrfs_key key; 3366 struct btrfs_key found_key; 3367 int start_slot; 3368 3369 key.objectid = objectid; 3370 key.type = max_key_type; 3371 key.offset = (u64)-1; 3372 3373 while (1) { 3374 ret = btrfs_search_slot(trans, log, &key, path, -1, 1); 3375 BUG_ON(ret == 0); /* Logic error */ 3376 if (ret < 0) 3377 break; 3378 3379 if (path->slots[0] == 0) 3380 break; 3381 3382 path->slots[0]--; 3383 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 3384 path->slots[0]); 3385 3386 if (found_key.objectid != objectid) 3387 break; 3388 3389 found_key.offset = 0; 3390 found_key.type = 0; 3391 ret = btrfs_bin_search(path->nodes[0], &found_key, 0, 3392 &start_slot); 3393 3394 ret = btrfs_del_items(trans, log, path, start_slot, 3395 path->slots[0] - start_slot + 1); 3396 /* 3397 * If start slot isn't 0 then we don't need to re-search, we've 3398 * found the last guy with the objectid in this tree. 3399 */ 3400 if (ret || start_slot != 0) 3401 break; 3402 btrfs_release_path(path); 3403 } 3404 btrfs_release_path(path); 3405 if (ret > 0) 3406 ret = 0; 3407 return ret; 3408 } 3409 3410 static void fill_inode_item(struct btrfs_trans_handle *trans, 3411 struct extent_buffer *leaf, 3412 struct btrfs_inode_item *item, 3413 struct inode *inode, int log_inode_only, 3414 u64 logged_isize) 3415 { 3416 struct btrfs_map_token token; 3417 3418 btrfs_init_map_token(&token); 3419 3420 if (log_inode_only) { 3421 /* set the generation to zero so the recover code 3422 * can tell the difference between an logging 3423 * just to say 'this inode exists' and a logging 3424 * to say 'update this inode with these values' 3425 */ 3426 btrfs_set_token_inode_generation(leaf, item, 0, &token); 3427 btrfs_set_token_inode_size(leaf, item, logged_isize, &token); 3428 } else { 3429 btrfs_set_token_inode_generation(leaf, item, 3430 BTRFS_I(inode)->generation, 3431 &token); 3432 btrfs_set_token_inode_size(leaf, item, inode->i_size, &token); 3433 } 3434 3435 btrfs_set_token_inode_uid(leaf, item, i_uid_read(inode), &token); 3436 btrfs_set_token_inode_gid(leaf, item, i_gid_read(inode), &token); 3437 btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token); 3438 btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token); 3439 3440 btrfs_set_token_timespec_sec(leaf, &item->atime, 3441 inode->i_atime.tv_sec, &token); 3442 btrfs_set_token_timespec_nsec(leaf, &item->atime, 3443 inode->i_atime.tv_nsec, &token); 3444 3445 btrfs_set_token_timespec_sec(leaf, &item->mtime, 3446 inode->i_mtime.tv_sec, &token); 3447 btrfs_set_token_timespec_nsec(leaf, &item->mtime, 3448 inode->i_mtime.tv_nsec, &token); 3449 3450 btrfs_set_token_timespec_sec(leaf, &item->ctime, 3451 inode->i_ctime.tv_sec, &token); 3452 btrfs_set_token_timespec_nsec(leaf, &item->ctime, 3453 inode->i_ctime.tv_nsec, &token); 3454 3455 btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode), 3456 &token); 3457 3458 btrfs_set_token_inode_sequence(leaf, item, inode->i_version, &token); 3459 btrfs_set_token_inode_transid(leaf, item, trans->transid, &token); 3460 btrfs_set_token_inode_rdev(leaf, item, inode->i_rdev, &token); 3461 btrfs_set_token_inode_flags(leaf, item, BTRFS_I(inode)->flags, &token); 3462 btrfs_set_token_inode_block_group(leaf, item, 0, &token); 3463 } 3464 3465 static int log_inode_item(struct btrfs_trans_handle *trans, 3466 struct btrfs_root *log, struct btrfs_path *path, 3467 struct inode *inode) 3468 { 3469 struct btrfs_inode_item *inode_item; 3470 int ret; 3471 3472 ret = btrfs_insert_empty_item(trans, log, path, 3473 &BTRFS_I(inode)->location, 3474 sizeof(*inode_item)); 3475 if (ret && ret != -EEXIST) 3476 return ret; 3477 inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0], 3478 struct btrfs_inode_item); 3479 fill_inode_item(trans, path->nodes[0], inode_item, inode, 0, 0); 3480 btrfs_release_path(path); 3481 return 0; 3482 } 3483 3484 static noinline int copy_items(struct btrfs_trans_handle *trans, 3485 struct inode *inode, 3486 struct btrfs_path *dst_path, 3487 struct btrfs_path *src_path, u64 *last_extent, 3488 int start_slot, int nr, int inode_only, 3489 u64 logged_isize) 3490 { 3491 unsigned long src_offset; 3492 unsigned long dst_offset; 3493 struct btrfs_root *log = BTRFS_I(inode)->root->log_root; 3494 struct btrfs_file_extent_item *extent; 3495 struct btrfs_inode_item *inode_item; 3496 struct extent_buffer *src = src_path->nodes[0]; 3497 struct btrfs_key first_key, last_key, key; 3498 int ret; 3499 struct btrfs_key *ins_keys; 3500 u32 *ins_sizes; 3501 char *ins_data; 3502 int i; 3503 struct list_head ordered_sums; 3504 int skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; 3505 bool has_extents = false; 3506 bool need_find_last_extent = true; 3507 bool done = false; 3508 3509 INIT_LIST_HEAD(&ordered_sums); 3510 3511 ins_data = kmalloc(nr * sizeof(struct btrfs_key) + 3512 nr * sizeof(u32), GFP_NOFS); 3513 if (!ins_data) 3514 return -ENOMEM; 3515 3516 first_key.objectid = (u64)-1; 3517 3518 ins_sizes = (u32 *)ins_data; 3519 ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32)); 3520 3521 for (i = 0; i < nr; i++) { 3522 ins_sizes[i] = btrfs_item_size_nr(src, i + start_slot); 3523 btrfs_item_key_to_cpu(src, ins_keys + i, i + start_slot); 3524 } 3525 ret = btrfs_insert_empty_items(trans, log, dst_path, 3526 ins_keys, ins_sizes, nr); 3527 if (ret) { 3528 kfree(ins_data); 3529 return ret; 3530 } 3531 3532 for (i = 0; i < nr; i++, dst_path->slots[0]++) { 3533 dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0], 3534 dst_path->slots[0]); 3535 3536 src_offset = btrfs_item_ptr_offset(src, start_slot + i); 3537 3538 if ((i == (nr - 1))) 3539 last_key = ins_keys[i]; 3540 3541 if (ins_keys[i].type == BTRFS_INODE_ITEM_KEY) { 3542 inode_item = btrfs_item_ptr(dst_path->nodes[0], 3543 dst_path->slots[0], 3544 struct btrfs_inode_item); 3545 fill_inode_item(trans, dst_path->nodes[0], inode_item, 3546 inode, inode_only == LOG_INODE_EXISTS, 3547 logged_isize); 3548 } else { 3549 copy_extent_buffer(dst_path->nodes[0], src, dst_offset, 3550 src_offset, ins_sizes[i]); 3551 } 3552 3553 /* 3554 * We set need_find_last_extent here in case we know we were 3555 * processing other items and then walk into the first extent in 3556 * the inode. If we don't hit an extent then nothing changes, 3557 * we'll do the last search the next time around. 3558 */ 3559 if (ins_keys[i].type == BTRFS_EXTENT_DATA_KEY) { 3560 has_extents = true; 3561 if (first_key.objectid == (u64)-1) 3562 first_key = ins_keys[i]; 3563 } else { 3564 need_find_last_extent = false; 3565 } 3566 3567 /* take a reference on file data extents so that truncates 3568 * or deletes of this inode don't have to relog the inode 3569 * again 3570 */ 3571 if (ins_keys[i].type == BTRFS_EXTENT_DATA_KEY && 3572 !skip_csum) { 3573 int found_type; 3574 extent = btrfs_item_ptr(src, start_slot + i, 3575 struct btrfs_file_extent_item); 3576 3577 if (btrfs_file_extent_generation(src, extent) < trans->transid) 3578 continue; 3579 3580 found_type = btrfs_file_extent_type(src, extent); 3581 if (found_type == BTRFS_FILE_EXTENT_REG) { 3582 u64 ds, dl, cs, cl; 3583 ds = btrfs_file_extent_disk_bytenr(src, 3584 extent); 3585 /* ds == 0 is a hole */ 3586 if (ds == 0) 3587 continue; 3588 3589 dl = btrfs_file_extent_disk_num_bytes(src, 3590 extent); 3591 cs = btrfs_file_extent_offset(src, extent); 3592 cl = btrfs_file_extent_num_bytes(src, 3593 extent); 3594 if (btrfs_file_extent_compression(src, 3595 extent)) { 3596 cs = 0; 3597 cl = dl; 3598 } 3599 3600 ret = btrfs_lookup_csums_range( 3601 log->fs_info->csum_root, 3602 ds + cs, ds + cs + cl - 1, 3603 &ordered_sums, 0); 3604 if (ret) { 3605 btrfs_release_path(dst_path); 3606 kfree(ins_data); 3607 return ret; 3608 } 3609 } 3610 } 3611 } 3612 3613 btrfs_mark_buffer_dirty(dst_path->nodes[0]); 3614 btrfs_release_path(dst_path); 3615 kfree(ins_data); 3616 3617 /* 3618 * we have to do this after the loop above to avoid changing the 3619 * log tree while trying to change the log tree. 3620 */ 3621 ret = 0; 3622 while (!list_empty(&ordered_sums)) { 3623 struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next, 3624 struct btrfs_ordered_sum, 3625 list); 3626 if (!ret) 3627 ret = btrfs_csum_file_blocks(trans, log, sums); 3628 list_del(&sums->list); 3629 kfree(sums); 3630 } 3631 3632 if (!has_extents) 3633 return ret; 3634 3635 if (need_find_last_extent && *last_extent == first_key.offset) { 3636 /* 3637 * We don't have any leafs between our current one and the one 3638 * we processed before that can have file extent items for our 3639 * inode (and have a generation number smaller than our current 3640 * transaction id). 3641 */ 3642 need_find_last_extent = false; 3643 } 3644 3645 /* 3646 * Because we use btrfs_search_forward we could skip leaves that were 3647 * not modified and then assume *last_extent is valid when it really 3648 * isn't. So back up to the previous leaf and read the end of the last 3649 * extent before we go and fill in holes. 3650 */ 3651 if (need_find_last_extent) { 3652 u64 len; 3653 3654 ret = btrfs_prev_leaf(BTRFS_I(inode)->root, src_path); 3655 if (ret < 0) 3656 return ret; 3657 if (ret) 3658 goto fill_holes; 3659 if (src_path->slots[0]) 3660 src_path->slots[0]--; 3661 src = src_path->nodes[0]; 3662 btrfs_item_key_to_cpu(src, &key, src_path->slots[0]); 3663 if (key.objectid != btrfs_ino(inode) || 3664 key.type != BTRFS_EXTENT_DATA_KEY) 3665 goto fill_holes; 3666 extent = btrfs_item_ptr(src, src_path->slots[0], 3667 struct btrfs_file_extent_item); 3668 if (btrfs_file_extent_type(src, extent) == 3669 BTRFS_FILE_EXTENT_INLINE) { 3670 len = btrfs_file_extent_inline_len(src, 3671 src_path->slots[0], 3672 extent); 3673 *last_extent = ALIGN(key.offset + len, 3674 log->sectorsize); 3675 } else { 3676 len = btrfs_file_extent_num_bytes(src, extent); 3677 *last_extent = key.offset + len; 3678 } 3679 } 3680 fill_holes: 3681 /* So we did prev_leaf, now we need to move to the next leaf, but a few 3682 * things could have happened 3683 * 3684 * 1) A merge could have happened, so we could currently be on a leaf 3685 * that holds what we were copying in the first place. 3686 * 2) A split could have happened, and now not all of the items we want 3687 * are on the same leaf. 3688 * 3689 * So we need to adjust how we search for holes, we need to drop the 3690 * path and re-search for the first extent key we found, and then walk 3691 * forward until we hit the last one we copied. 3692 */ 3693 if (need_find_last_extent) { 3694 /* btrfs_prev_leaf could return 1 without releasing the path */ 3695 btrfs_release_path(src_path); 3696 ret = btrfs_search_slot(NULL, BTRFS_I(inode)->root, &first_key, 3697 src_path, 0, 0); 3698 if (ret < 0) 3699 return ret; 3700 ASSERT(ret == 0); 3701 src = src_path->nodes[0]; 3702 i = src_path->slots[0]; 3703 } else { 3704 i = start_slot; 3705 } 3706 3707 /* 3708 * Ok so here we need to go through and fill in any holes we may have 3709 * to make sure that holes are punched for those areas in case they had 3710 * extents previously. 3711 */ 3712 while (!done) { 3713 u64 offset, len; 3714 u64 extent_end; 3715 3716 if (i >= btrfs_header_nritems(src_path->nodes[0])) { 3717 ret = btrfs_next_leaf(BTRFS_I(inode)->root, src_path); 3718 if (ret < 0) 3719 return ret; 3720 ASSERT(ret == 0); 3721 src = src_path->nodes[0]; 3722 i = 0; 3723 } 3724 3725 btrfs_item_key_to_cpu(src, &key, i); 3726 if (!btrfs_comp_cpu_keys(&key, &last_key)) 3727 done = true; 3728 if (key.objectid != btrfs_ino(inode) || 3729 key.type != BTRFS_EXTENT_DATA_KEY) { 3730 i++; 3731 continue; 3732 } 3733 extent = btrfs_item_ptr(src, i, struct btrfs_file_extent_item); 3734 if (btrfs_file_extent_type(src, extent) == 3735 BTRFS_FILE_EXTENT_INLINE) { 3736 len = btrfs_file_extent_inline_len(src, i, extent); 3737 extent_end = ALIGN(key.offset + len, log->sectorsize); 3738 } else { 3739 len = btrfs_file_extent_num_bytes(src, extent); 3740 extent_end = key.offset + len; 3741 } 3742 i++; 3743 3744 if (*last_extent == key.offset) { 3745 *last_extent = extent_end; 3746 continue; 3747 } 3748 offset = *last_extent; 3749 len = key.offset - *last_extent; 3750 ret = btrfs_insert_file_extent(trans, log, btrfs_ino(inode), 3751 offset, 0, 0, len, 0, len, 0, 3752 0, 0); 3753 if (ret) 3754 break; 3755 *last_extent = extent_end; 3756 } 3757 /* 3758 * Need to let the callers know we dropped the path so they should 3759 * re-search. 3760 */ 3761 if (!ret && need_find_last_extent) 3762 ret = 1; 3763 return ret; 3764 } 3765 3766 static int extent_cmp(void *priv, struct list_head *a, struct list_head *b) 3767 { 3768 struct extent_map *em1, *em2; 3769 3770 em1 = list_entry(a, struct extent_map, list); 3771 em2 = list_entry(b, struct extent_map, list); 3772 3773 if (em1->start < em2->start) 3774 return -1; 3775 else if (em1->start > em2->start) 3776 return 1; 3777 return 0; 3778 } 3779 3780 static int wait_ordered_extents(struct btrfs_trans_handle *trans, 3781 struct inode *inode, 3782 struct btrfs_root *root, 3783 const struct extent_map *em, 3784 const struct list_head *logged_list, 3785 bool *ordered_io_error) 3786 { 3787 struct btrfs_ordered_extent *ordered; 3788 struct btrfs_root *log = root->log_root; 3789 u64 mod_start = em->mod_start; 3790 u64 mod_len = em->mod_len; 3791 const bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; 3792 u64 csum_offset; 3793 u64 csum_len; 3794 LIST_HEAD(ordered_sums); 3795 int ret = 0; 3796 3797 *ordered_io_error = false; 3798 3799 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) || 3800 em->block_start == EXTENT_MAP_HOLE) 3801 return 0; 3802 3803 /* 3804 * Wait far any ordered extent that covers our extent map. If it 3805 * finishes without an error, first check and see if our csums are on 3806 * our outstanding ordered extents. 3807 */ 3808 list_for_each_entry(ordered, logged_list, log_list) { 3809 struct btrfs_ordered_sum *sum; 3810 3811 if (!mod_len) 3812 break; 3813 3814 if (ordered->file_offset + ordered->len <= mod_start || 3815 mod_start + mod_len <= ordered->file_offset) 3816 continue; 3817 3818 if (!test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags) && 3819 !test_bit(BTRFS_ORDERED_IOERR, &ordered->flags) && 3820 !test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags)) { 3821 const u64 start = ordered->file_offset; 3822 const u64 end = ordered->file_offset + ordered->len - 1; 3823 3824 WARN_ON(ordered->inode != inode); 3825 filemap_fdatawrite_range(inode->i_mapping, start, end); 3826 } 3827 3828 wait_event(ordered->wait, 3829 (test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags) || 3830 test_bit(BTRFS_ORDERED_IOERR, &ordered->flags))); 3831 3832 if (test_bit(BTRFS_ORDERED_IOERR, &ordered->flags)) { 3833 /* 3834 * Clear the AS_EIO/AS_ENOSPC flags from the inode's 3835 * i_mapping flags, so that the next fsync won't get 3836 * an outdated io error too. 3837 */ 3838 btrfs_inode_check_errors(inode); 3839 *ordered_io_error = true; 3840 break; 3841 } 3842 /* 3843 * We are going to copy all the csums on this ordered extent, so 3844 * go ahead and adjust mod_start and mod_len in case this 3845 * ordered extent has already been logged. 3846 */ 3847 if (ordered->file_offset > mod_start) { 3848 if (ordered->file_offset + ordered->len >= 3849 mod_start + mod_len) 3850 mod_len = ordered->file_offset - mod_start; 3851 /* 3852 * If we have this case 3853 * 3854 * |--------- logged extent ---------| 3855 * |----- ordered extent ----| 3856 * 3857 * Just don't mess with mod_start and mod_len, we'll 3858 * just end up logging more csums than we need and it 3859 * will be ok. 3860 */ 3861 } else { 3862 if (ordered->file_offset + ordered->len < 3863 mod_start + mod_len) { 3864 mod_len = (mod_start + mod_len) - 3865 (ordered->file_offset + ordered->len); 3866 mod_start = ordered->file_offset + 3867 ordered->len; 3868 } else { 3869 mod_len = 0; 3870 } 3871 } 3872 3873 if (skip_csum) 3874 continue; 3875 3876 /* 3877 * To keep us from looping for the above case of an ordered 3878 * extent that falls inside of the logged extent. 3879 */ 3880 if (test_and_set_bit(BTRFS_ORDERED_LOGGED_CSUM, 3881 &ordered->flags)) 3882 continue; 3883 3884 list_for_each_entry(sum, &ordered->list, list) { 3885 ret = btrfs_csum_file_blocks(trans, log, sum); 3886 if (ret) 3887 break; 3888 } 3889 } 3890 3891 if (*ordered_io_error || !mod_len || ret || skip_csum) 3892 return ret; 3893 3894 if (em->compress_type) { 3895 csum_offset = 0; 3896 csum_len = max(em->block_len, em->orig_block_len); 3897 } else { 3898 csum_offset = mod_start - em->start; 3899 csum_len = mod_len; 3900 } 3901 3902 /* block start is already adjusted for the file extent offset. */ 3903 ret = btrfs_lookup_csums_range(log->fs_info->csum_root, 3904 em->block_start + csum_offset, 3905 em->block_start + csum_offset + 3906 csum_len - 1, &ordered_sums, 0); 3907 if (ret) 3908 return ret; 3909 3910 while (!list_empty(&ordered_sums)) { 3911 struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next, 3912 struct btrfs_ordered_sum, 3913 list); 3914 if (!ret) 3915 ret = btrfs_csum_file_blocks(trans, log, sums); 3916 list_del(&sums->list); 3917 kfree(sums); 3918 } 3919 3920 return ret; 3921 } 3922 3923 static int log_one_extent(struct btrfs_trans_handle *trans, 3924 struct inode *inode, struct btrfs_root *root, 3925 const struct extent_map *em, 3926 struct btrfs_path *path, 3927 const struct list_head *logged_list, 3928 struct btrfs_log_ctx *ctx) 3929 { 3930 struct btrfs_root *log = root->log_root; 3931 struct btrfs_file_extent_item *fi; 3932 struct extent_buffer *leaf; 3933 struct btrfs_map_token token; 3934 struct btrfs_key key; 3935 u64 extent_offset = em->start - em->orig_start; 3936 u64 block_len; 3937 int ret; 3938 int extent_inserted = 0; 3939 bool ordered_io_err = false; 3940 3941 ret = wait_ordered_extents(trans, inode, root, em, logged_list, 3942 &ordered_io_err); 3943 if (ret) 3944 return ret; 3945 3946 if (ordered_io_err) { 3947 ctx->io_err = -EIO; 3948 return 0; 3949 } 3950 3951 btrfs_init_map_token(&token); 3952 3953 ret = __btrfs_drop_extents(trans, log, inode, path, em->start, 3954 em->start + em->len, NULL, 0, 1, 3955 sizeof(*fi), &extent_inserted); 3956 if (ret) 3957 return ret; 3958 3959 if (!extent_inserted) { 3960 key.objectid = btrfs_ino(inode); 3961 key.type = BTRFS_EXTENT_DATA_KEY; 3962 key.offset = em->start; 3963 3964 ret = btrfs_insert_empty_item(trans, log, path, &key, 3965 sizeof(*fi)); 3966 if (ret) 3967 return ret; 3968 } 3969 leaf = path->nodes[0]; 3970 fi = btrfs_item_ptr(leaf, path->slots[0], 3971 struct btrfs_file_extent_item); 3972 3973 btrfs_set_token_file_extent_generation(leaf, fi, trans->transid, 3974 &token); 3975 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) 3976 btrfs_set_token_file_extent_type(leaf, fi, 3977 BTRFS_FILE_EXTENT_PREALLOC, 3978 &token); 3979 else 3980 btrfs_set_token_file_extent_type(leaf, fi, 3981 BTRFS_FILE_EXTENT_REG, 3982 &token); 3983 3984 block_len = max(em->block_len, em->orig_block_len); 3985 if (em->compress_type != BTRFS_COMPRESS_NONE) { 3986 btrfs_set_token_file_extent_disk_bytenr(leaf, fi, 3987 em->block_start, 3988 &token); 3989 btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len, 3990 &token); 3991 } else if (em->block_start < EXTENT_MAP_LAST_BYTE) { 3992 btrfs_set_token_file_extent_disk_bytenr(leaf, fi, 3993 em->block_start - 3994 extent_offset, &token); 3995 btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len, 3996 &token); 3997 } else { 3998 btrfs_set_token_file_extent_disk_bytenr(leaf, fi, 0, &token); 3999 btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, 0, 4000 &token); 4001 } 4002 4003 btrfs_set_token_file_extent_offset(leaf, fi, extent_offset, &token); 4004 btrfs_set_token_file_extent_num_bytes(leaf, fi, em->len, &token); 4005 btrfs_set_token_file_extent_ram_bytes(leaf, fi, em->ram_bytes, &token); 4006 btrfs_set_token_file_extent_compression(leaf, fi, em->compress_type, 4007 &token); 4008 btrfs_set_token_file_extent_encryption(leaf, fi, 0, &token); 4009 btrfs_set_token_file_extent_other_encoding(leaf, fi, 0, &token); 4010 btrfs_mark_buffer_dirty(leaf); 4011 4012 btrfs_release_path(path); 4013 4014 return ret; 4015 } 4016 4017 static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, 4018 struct btrfs_root *root, 4019 struct inode *inode, 4020 struct btrfs_path *path, 4021 struct list_head *logged_list, 4022 struct btrfs_log_ctx *ctx) 4023 { 4024 struct extent_map *em, *n; 4025 struct list_head extents; 4026 struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree; 4027 u64 test_gen; 4028 int ret = 0; 4029 int num = 0; 4030 4031 INIT_LIST_HEAD(&extents); 4032 4033 write_lock(&tree->lock); 4034 test_gen = root->fs_info->last_trans_committed; 4035 4036 list_for_each_entry_safe(em, n, &tree->modified_extents, list) { 4037 list_del_init(&em->list); 4038 4039 /* 4040 * Just an arbitrary number, this can be really CPU intensive 4041 * once we start getting a lot of extents, and really once we 4042 * have a bunch of extents we just want to commit since it will 4043 * be faster. 4044 */ 4045 if (++num > 32768) { 4046 list_del_init(&tree->modified_extents); 4047 ret = -EFBIG; 4048 goto process; 4049 } 4050 4051 if (em->generation <= test_gen) 4052 continue; 4053 /* Need a ref to keep it from getting evicted from cache */ 4054 atomic_inc(&em->refs); 4055 set_bit(EXTENT_FLAG_LOGGING, &em->flags); 4056 list_add_tail(&em->list, &extents); 4057 num++; 4058 } 4059 4060 list_sort(NULL, &extents, extent_cmp); 4061 4062 process: 4063 while (!list_empty(&extents)) { 4064 em = list_entry(extents.next, struct extent_map, list); 4065 4066 list_del_init(&em->list); 4067 4068 /* 4069 * If we had an error we just need to delete everybody from our 4070 * private list. 4071 */ 4072 if (ret) { 4073 clear_em_logging(tree, em); 4074 free_extent_map(em); 4075 continue; 4076 } 4077 4078 write_unlock(&tree->lock); 4079 4080 ret = log_one_extent(trans, inode, root, em, path, logged_list, 4081 ctx); 4082 write_lock(&tree->lock); 4083 clear_em_logging(tree, em); 4084 free_extent_map(em); 4085 } 4086 WARN_ON(!list_empty(&extents)); 4087 write_unlock(&tree->lock); 4088 4089 btrfs_release_path(path); 4090 return ret; 4091 } 4092 4093 static int logged_inode_size(struct btrfs_root *log, struct inode *inode, 4094 struct btrfs_path *path, u64 *size_ret) 4095 { 4096 struct btrfs_key key; 4097 int ret; 4098 4099 key.objectid = btrfs_ino(inode); 4100 key.type = BTRFS_INODE_ITEM_KEY; 4101 key.offset = 0; 4102 4103 ret = btrfs_search_slot(NULL, log, &key, path, 0, 0); 4104 if (ret < 0) { 4105 return ret; 4106 } else if (ret > 0) { 4107 *size_ret = 0; 4108 } else { 4109 struct btrfs_inode_item *item; 4110 4111 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 4112 struct btrfs_inode_item); 4113 *size_ret = btrfs_inode_size(path->nodes[0], item); 4114 } 4115 4116 btrfs_release_path(path); 4117 return 0; 4118 } 4119 4120 /* 4121 * At the moment we always log all xattrs. This is to figure out at log replay 4122 * time which xattrs must have their deletion replayed. If a xattr is missing 4123 * in the log tree and exists in the fs/subvol tree, we delete it. This is 4124 * because if a xattr is deleted, the inode is fsynced and a power failure 4125 * happens, causing the log to be replayed the next time the fs is mounted, 4126 * we want the xattr to not exist anymore (same behaviour as other filesystems 4127 * with a journal, ext3/4, xfs, f2fs, etc). 4128 */ 4129 static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans, 4130 struct btrfs_root *root, 4131 struct inode *inode, 4132 struct btrfs_path *path, 4133 struct btrfs_path *dst_path) 4134 { 4135 int ret; 4136 struct btrfs_key key; 4137 const u64 ino = btrfs_ino(inode); 4138 int ins_nr = 0; 4139 int start_slot = 0; 4140 4141 key.objectid = ino; 4142 key.type = BTRFS_XATTR_ITEM_KEY; 4143 key.offset = 0; 4144 4145 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 4146 if (ret < 0) 4147 return ret; 4148 4149 while (true) { 4150 int slot = path->slots[0]; 4151 struct extent_buffer *leaf = path->nodes[0]; 4152 int nritems = btrfs_header_nritems(leaf); 4153 4154 if (slot >= nritems) { 4155 if (ins_nr > 0) { 4156 u64 last_extent = 0; 4157 4158 ret = copy_items(trans, inode, dst_path, path, 4159 &last_extent, start_slot, 4160 ins_nr, 1, 0); 4161 /* can't be 1, extent items aren't processed */ 4162 ASSERT(ret <= 0); 4163 if (ret < 0) 4164 return ret; 4165 ins_nr = 0; 4166 } 4167 ret = btrfs_next_leaf(root, path); 4168 if (ret < 0) 4169 return ret; 4170 else if (ret > 0) 4171 break; 4172 continue; 4173 } 4174 4175 btrfs_item_key_to_cpu(leaf, &key, slot); 4176 if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY) 4177 break; 4178 4179 if (ins_nr == 0) 4180 start_slot = slot; 4181 ins_nr++; 4182 path->slots[0]++; 4183 cond_resched(); 4184 } 4185 if (ins_nr > 0) { 4186 u64 last_extent = 0; 4187 4188 ret = copy_items(trans, inode, dst_path, path, 4189 &last_extent, start_slot, 4190 ins_nr, 1, 0); 4191 /* can't be 1, extent items aren't processed */ 4192 ASSERT(ret <= 0); 4193 if (ret < 0) 4194 return ret; 4195 } 4196 4197 return 0; 4198 } 4199 4200 /* 4201 * If the no holes feature is enabled we need to make sure any hole between the 4202 * last extent and the i_size of our inode is explicitly marked in the log. This 4203 * is to make sure that doing something like: 4204 * 4205 * 1) create file with 128Kb of data 4206 * 2) truncate file to 64Kb 4207 * 3) truncate file to 256Kb 4208 * 4) fsync file 4209 * 5) <crash/power failure> 4210 * 6) mount fs and trigger log replay 4211 * 4212 * Will give us a file with a size of 256Kb, the first 64Kb of data match what 4213 * the file had in its first 64Kb of data at step 1 and the last 192Kb of the 4214 * file correspond to a hole. The presence of explicit holes in a log tree is 4215 * what guarantees that log replay will remove/adjust file extent items in the 4216 * fs/subvol tree. 4217 * 4218 * Here we do not need to care about holes between extents, that is already done 4219 * by copy_items(). We also only need to do this in the full sync path, where we 4220 * lookup for extents from the fs/subvol tree only. In the fast path case, we 4221 * lookup the list of modified extent maps and if any represents a hole, we 4222 * insert a corresponding extent representing a hole in the log tree. 4223 */ 4224 static int btrfs_log_trailing_hole(struct btrfs_trans_handle *trans, 4225 struct btrfs_root *root, 4226 struct inode *inode, 4227 struct btrfs_path *path) 4228 { 4229 int ret; 4230 struct btrfs_key key; 4231 u64 hole_start; 4232 u64 hole_size; 4233 struct extent_buffer *leaf; 4234 struct btrfs_root *log = root->log_root; 4235 const u64 ino = btrfs_ino(inode); 4236 const u64 i_size = i_size_read(inode); 4237 4238 if (!btrfs_fs_incompat(root->fs_info, NO_HOLES)) 4239 return 0; 4240 4241 key.objectid = ino; 4242 key.type = BTRFS_EXTENT_DATA_KEY; 4243 key.offset = (u64)-1; 4244 4245 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 4246 ASSERT(ret != 0); 4247 if (ret < 0) 4248 return ret; 4249 4250 ASSERT(path->slots[0] > 0); 4251 path->slots[0]--; 4252 leaf = path->nodes[0]; 4253 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 4254 4255 if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) { 4256 /* inode does not have any extents */ 4257 hole_start = 0; 4258 hole_size = i_size; 4259 } else { 4260 struct btrfs_file_extent_item *extent; 4261 u64 len; 4262 4263 /* 4264 * If there's an extent beyond i_size, an explicit hole was 4265 * already inserted by copy_items(). 4266 */ 4267 if (key.offset >= i_size) 4268 return 0; 4269 4270 extent = btrfs_item_ptr(leaf, path->slots[0], 4271 struct btrfs_file_extent_item); 4272 4273 if (btrfs_file_extent_type(leaf, extent) == 4274 BTRFS_FILE_EXTENT_INLINE) { 4275 len = btrfs_file_extent_inline_len(leaf, 4276 path->slots[0], 4277 extent); 4278 ASSERT(len == i_size); 4279 return 0; 4280 } 4281 4282 len = btrfs_file_extent_num_bytes(leaf, extent); 4283 /* Last extent goes beyond i_size, no need to log a hole. */ 4284 if (key.offset + len > i_size) 4285 return 0; 4286 hole_start = key.offset + len; 4287 hole_size = i_size - hole_start; 4288 } 4289 btrfs_release_path(path); 4290 4291 /* Last extent ends at i_size. */ 4292 if (hole_size == 0) 4293 return 0; 4294 4295 hole_size = ALIGN(hole_size, root->sectorsize); 4296 ret = btrfs_insert_file_extent(trans, log, ino, hole_start, 0, 0, 4297 hole_size, 0, hole_size, 0, 0, 0); 4298 return ret; 4299 } 4300 4301 /* log a single inode in the tree log. 4302 * At least one parent directory for this inode must exist in the tree 4303 * or be logged already. 4304 * 4305 * Any items from this inode changed by the current transaction are copied 4306 * to the log tree. An extra reference is taken on any extents in this 4307 * file, allowing us to avoid a whole pile of corner cases around logging 4308 * blocks that have been removed from the tree. 4309 * 4310 * See LOG_INODE_ALL and related defines for a description of what inode_only 4311 * does. 4312 * 4313 * This handles both files and directories. 4314 */ 4315 static int btrfs_log_inode(struct btrfs_trans_handle *trans, 4316 struct btrfs_root *root, struct inode *inode, 4317 int inode_only, 4318 const loff_t start, 4319 const loff_t end, 4320 struct btrfs_log_ctx *ctx) 4321 { 4322 struct btrfs_path *path; 4323 struct btrfs_path *dst_path; 4324 struct btrfs_key min_key; 4325 struct btrfs_key max_key; 4326 struct btrfs_root *log = root->log_root; 4327 struct extent_buffer *src = NULL; 4328 LIST_HEAD(logged_list); 4329 u64 last_extent = 0; 4330 int err = 0; 4331 int ret; 4332 int nritems; 4333 int ins_start_slot = 0; 4334 int ins_nr; 4335 bool fast_search = false; 4336 u64 ino = btrfs_ino(inode); 4337 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 4338 u64 logged_isize = 0; 4339 bool need_log_inode_item = true; 4340 4341 path = btrfs_alloc_path(); 4342 if (!path) 4343 return -ENOMEM; 4344 dst_path = btrfs_alloc_path(); 4345 if (!dst_path) { 4346 btrfs_free_path(path); 4347 return -ENOMEM; 4348 } 4349 4350 min_key.objectid = ino; 4351 min_key.type = BTRFS_INODE_ITEM_KEY; 4352 min_key.offset = 0; 4353 4354 max_key.objectid = ino; 4355 4356 4357 /* today the code can only do partial logging of directories */ 4358 if (S_ISDIR(inode->i_mode) || 4359 (!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 4360 &BTRFS_I(inode)->runtime_flags) && 4361 inode_only == LOG_INODE_EXISTS)) 4362 max_key.type = BTRFS_XATTR_ITEM_KEY; 4363 else 4364 max_key.type = (u8)-1; 4365 max_key.offset = (u64)-1; 4366 4367 /* 4368 * Only run delayed items if we are a dir or a new file. 4369 * Otherwise commit the delayed inode only, which is needed in 4370 * order for the log replay code to mark inodes for link count 4371 * fixup (create temporary BTRFS_TREE_LOG_FIXUP_OBJECTID items). 4372 */ 4373 if (S_ISDIR(inode->i_mode) || 4374 BTRFS_I(inode)->generation > root->fs_info->last_trans_committed) 4375 ret = btrfs_commit_inode_delayed_items(trans, inode); 4376 else 4377 ret = btrfs_commit_inode_delayed_inode(inode); 4378 4379 if (ret) { 4380 btrfs_free_path(path); 4381 btrfs_free_path(dst_path); 4382 return ret; 4383 } 4384 4385 mutex_lock(&BTRFS_I(inode)->log_mutex); 4386 4387 btrfs_get_logged_extents(inode, &logged_list, start, end); 4388 4389 /* 4390 * a brute force approach to making sure we get the most uptodate 4391 * copies of everything. 4392 */ 4393 if (S_ISDIR(inode->i_mode)) { 4394 int max_key_type = BTRFS_DIR_LOG_INDEX_KEY; 4395 4396 if (inode_only == LOG_INODE_EXISTS) 4397 max_key_type = BTRFS_XATTR_ITEM_KEY; 4398 ret = drop_objectid_items(trans, log, path, ino, max_key_type); 4399 } else { 4400 if (inode_only == LOG_INODE_EXISTS) { 4401 /* 4402 * Make sure the new inode item we write to the log has 4403 * the same isize as the current one (if it exists). 4404 * This is necessary to prevent data loss after log 4405 * replay, and also to prevent doing a wrong expanding 4406 * truncate - for e.g. create file, write 4K into offset 4407 * 0, fsync, write 4K into offset 4096, add hard link, 4408 * fsync some other file (to sync log), power fail - if 4409 * we use the inode's current i_size, after log replay 4410 * we get a 8Kb file, with the last 4Kb extent as a hole 4411 * (zeroes), as if an expanding truncate happened, 4412 * instead of getting a file of 4Kb only. 4413 */ 4414 err = logged_inode_size(log, inode, path, 4415 &logged_isize); 4416 if (err) 4417 goto out_unlock; 4418 } 4419 if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 4420 &BTRFS_I(inode)->runtime_flags)) { 4421 if (inode_only == LOG_INODE_EXISTS) { 4422 max_key.type = BTRFS_XATTR_ITEM_KEY; 4423 ret = drop_objectid_items(trans, log, path, ino, 4424 max_key.type); 4425 } else { 4426 clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 4427 &BTRFS_I(inode)->runtime_flags); 4428 clear_bit(BTRFS_INODE_COPY_EVERYTHING, 4429 &BTRFS_I(inode)->runtime_flags); 4430 while(1) { 4431 ret = btrfs_truncate_inode_items(trans, 4432 log, inode, 0, 0); 4433 if (ret != -EAGAIN) 4434 break; 4435 } 4436 } 4437 } else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING, 4438 &BTRFS_I(inode)->runtime_flags) || 4439 inode_only == LOG_INODE_EXISTS) { 4440 if (inode_only == LOG_INODE_ALL) 4441 fast_search = true; 4442 max_key.type = BTRFS_XATTR_ITEM_KEY; 4443 ret = drop_objectid_items(trans, log, path, ino, 4444 max_key.type); 4445 } else { 4446 if (inode_only == LOG_INODE_ALL) 4447 fast_search = true; 4448 goto log_extents; 4449 } 4450 4451 } 4452 if (ret) { 4453 err = ret; 4454 goto out_unlock; 4455 } 4456 4457 while (1) { 4458 ins_nr = 0; 4459 ret = btrfs_search_forward(root, &min_key, 4460 path, trans->transid); 4461 if (ret != 0) 4462 break; 4463 again: 4464 /* note, ins_nr might be > 0 here, cleanup outside the loop */ 4465 if (min_key.objectid != ino) 4466 break; 4467 if (min_key.type > max_key.type) 4468 break; 4469 4470 if (min_key.type == BTRFS_INODE_ITEM_KEY) 4471 need_log_inode_item = false; 4472 4473 /* Skip xattrs, we log them later with btrfs_log_all_xattrs() */ 4474 if (min_key.type == BTRFS_XATTR_ITEM_KEY) { 4475 if (ins_nr == 0) 4476 goto next_slot; 4477 ret = copy_items(trans, inode, dst_path, path, 4478 &last_extent, ins_start_slot, 4479 ins_nr, inode_only, logged_isize); 4480 if (ret < 0) { 4481 err = ret; 4482 goto out_unlock; 4483 } 4484 ins_nr = 0; 4485 if (ret) { 4486 btrfs_release_path(path); 4487 continue; 4488 } 4489 goto next_slot; 4490 } 4491 4492 src = path->nodes[0]; 4493 if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) { 4494 ins_nr++; 4495 goto next_slot; 4496 } else if (!ins_nr) { 4497 ins_start_slot = path->slots[0]; 4498 ins_nr = 1; 4499 goto next_slot; 4500 } 4501 4502 ret = copy_items(trans, inode, dst_path, path, &last_extent, 4503 ins_start_slot, ins_nr, inode_only, 4504 logged_isize); 4505 if (ret < 0) { 4506 err = ret; 4507 goto out_unlock; 4508 } 4509 if (ret) { 4510 ins_nr = 0; 4511 btrfs_release_path(path); 4512 continue; 4513 } 4514 ins_nr = 1; 4515 ins_start_slot = path->slots[0]; 4516 next_slot: 4517 4518 nritems = btrfs_header_nritems(path->nodes[0]); 4519 path->slots[0]++; 4520 if (path->slots[0] < nritems) { 4521 btrfs_item_key_to_cpu(path->nodes[0], &min_key, 4522 path->slots[0]); 4523 goto again; 4524 } 4525 if (ins_nr) { 4526 ret = copy_items(trans, inode, dst_path, path, 4527 &last_extent, ins_start_slot, 4528 ins_nr, inode_only, logged_isize); 4529 if (ret < 0) { 4530 err = ret; 4531 goto out_unlock; 4532 } 4533 ret = 0; 4534 ins_nr = 0; 4535 } 4536 btrfs_release_path(path); 4537 4538 if (min_key.offset < (u64)-1) { 4539 min_key.offset++; 4540 } else if (min_key.type < max_key.type) { 4541 min_key.type++; 4542 min_key.offset = 0; 4543 } else { 4544 break; 4545 } 4546 } 4547 if (ins_nr) { 4548 ret = copy_items(trans, inode, dst_path, path, &last_extent, 4549 ins_start_slot, ins_nr, inode_only, 4550 logged_isize); 4551 if (ret < 0) { 4552 err = ret; 4553 goto out_unlock; 4554 } 4555 ret = 0; 4556 ins_nr = 0; 4557 } 4558 4559 btrfs_release_path(path); 4560 btrfs_release_path(dst_path); 4561 err = btrfs_log_all_xattrs(trans, root, inode, path, dst_path); 4562 if (err) 4563 goto out_unlock; 4564 if (max_key.type >= BTRFS_EXTENT_DATA_KEY && !fast_search) { 4565 btrfs_release_path(path); 4566 btrfs_release_path(dst_path); 4567 err = btrfs_log_trailing_hole(trans, root, inode, path); 4568 if (err) 4569 goto out_unlock; 4570 } 4571 log_extents: 4572 btrfs_release_path(path); 4573 btrfs_release_path(dst_path); 4574 if (need_log_inode_item) { 4575 err = log_inode_item(trans, log, dst_path, inode); 4576 if (err) 4577 goto out_unlock; 4578 } 4579 if (fast_search) { 4580 /* 4581 * Some ordered extents started by fsync might have completed 4582 * before we collected the ordered extents in logged_list, which 4583 * means they're gone, not in our logged_list nor in the inode's 4584 * ordered tree. We want the application/user space to know an 4585 * error happened while attempting to persist file data so that 4586 * it can take proper action. If such error happened, we leave 4587 * without writing to the log tree and the fsync must report the 4588 * file data write error and not commit the current transaction. 4589 */ 4590 err = btrfs_inode_check_errors(inode); 4591 if (err) { 4592 ctx->io_err = err; 4593 goto out_unlock; 4594 } 4595 ret = btrfs_log_changed_extents(trans, root, inode, dst_path, 4596 &logged_list, ctx); 4597 if (ret) { 4598 err = ret; 4599 goto out_unlock; 4600 } 4601 } else if (inode_only == LOG_INODE_ALL) { 4602 struct extent_map *em, *n; 4603 4604 write_lock(&em_tree->lock); 4605 /* 4606 * We can't just remove every em if we're called for a ranged 4607 * fsync - that is, one that doesn't cover the whole possible 4608 * file range (0 to LLONG_MAX). This is because we can have 4609 * em's that fall outside the range we're logging and therefore 4610 * their ordered operations haven't completed yet 4611 * (btrfs_finish_ordered_io() not invoked yet). This means we 4612 * didn't get their respective file extent item in the fs/subvol 4613 * tree yet, and need to let the next fast fsync (one which 4614 * consults the list of modified extent maps) find the em so 4615 * that it logs a matching file extent item and waits for the 4616 * respective ordered operation to complete (if it's still 4617 * running). 4618 * 4619 * Removing every em outside the range we're logging would make 4620 * the next fast fsync not log their matching file extent items, 4621 * therefore making us lose data after a log replay. 4622 */ 4623 list_for_each_entry_safe(em, n, &em_tree->modified_extents, 4624 list) { 4625 const u64 mod_end = em->mod_start + em->mod_len - 1; 4626 4627 if (em->mod_start >= start && mod_end <= end) 4628 list_del_init(&em->list); 4629 } 4630 write_unlock(&em_tree->lock); 4631 } 4632 4633 if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) { 4634 ret = log_directory_changes(trans, root, inode, path, dst_path, 4635 ctx); 4636 if (ret) { 4637 err = ret; 4638 goto out_unlock; 4639 } 4640 } 4641 4642 spin_lock(&BTRFS_I(inode)->lock); 4643 BTRFS_I(inode)->logged_trans = trans->transid; 4644 BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->last_sub_trans; 4645 spin_unlock(&BTRFS_I(inode)->lock); 4646 out_unlock: 4647 if (unlikely(err)) 4648 btrfs_put_logged_extents(&logged_list); 4649 else 4650 btrfs_submit_logged_extents(&logged_list, log); 4651 mutex_unlock(&BTRFS_I(inode)->log_mutex); 4652 4653 btrfs_free_path(path); 4654 btrfs_free_path(dst_path); 4655 return err; 4656 } 4657 4658 /* 4659 * follow the dentry parent pointers up the chain and see if any 4660 * of the directories in it require a full commit before they can 4661 * be logged. Returns zero if nothing special needs to be done or 1 if 4662 * a full commit is required. 4663 */ 4664 static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans, 4665 struct inode *inode, 4666 struct dentry *parent, 4667 struct super_block *sb, 4668 u64 last_committed) 4669 { 4670 int ret = 0; 4671 struct btrfs_root *root; 4672 struct dentry *old_parent = NULL; 4673 struct inode *orig_inode = inode; 4674 4675 /* 4676 * for regular files, if its inode is already on disk, we don't 4677 * have to worry about the parents at all. This is because 4678 * we can use the last_unlink_trans field to record renames 4679 * and other fun in this file. 4680 */ 4681 if (S_ISREG(inode->i_mode) && 4682 BTRFS_I(inode)->generation <= last_committed && 4683 BTRFS_I(inode)->last_unlink_trans <= last_committed) 4684 goto out; 4685 4686 if (!S_ISDIR(inode->i_mode)) { 4687 if (!parent || d_really_is_negative(parent) || sb != d_inode(parent)->i_sb) 4688 goto out; 4689 inode = d_inode(parent); 4690 } 4691 4692 while (1) { 4693 /* 4694 * If we are logging a directory then we start with our inode, 4695 * not our parents inode, so we need to skipp setting the 4696 * logged_trans so that further down in the log code we don't 4697 * think this inode has already been logged. 4698 */ 4699 if (inode != orig_inode) 4700 BTRFS_I(inode)->logged_trans = trans->transid; 4701 smp_mb(); 4702 4703 if (BTRFS_I(inode)->last_unlink_trans > last_committed) { 4704 root = BTRFS_I(inode)->root; 4705 4706 /* 4707 * make sure any commits to the log are forced 4708 * to be full commits 4709 */ 4710 btrfs_set_log_full_commit(root->fs_info, trans); 4711 ret = 1; 4712 break; 4713 } 4714 4715 if (!parent || d_really_is_negative(parent) || sb != d_inode(parent)->i_sb) 4716 break; 4717 4718 if (IS_ROOT(parent)) 4719 break; 4720 4721 parent = dget_parent(parent); 4722 dput(old_parent); 4723 old_parent = parent; 4724 inode = d_inode(parent); 4725 4726 } 4727 dput(old_parent); 4728 out: 4729 return ret; 4730 } 4731 4732 struct btrfs_dir_list { 4733 u64 ino; 4734 struct list_head list; 4735 }; 4736 4737 /* 4738 * Log the inodes of the new dentries of a directory. See log_dir_items() for 4739 * details about the why it is needed. 4740 * This is a recursive operation - if an existing dentry corresponds to a 4741 * directory, that directory's new entries are logged too (same behaviour as 4742 * ext3/4, xfs, f2fs, reiserfs, nilfs2). Note that when logging the inodes 4743 * the dentries point to we do not lock their i_mutex, otherwise lockdep 4744 * complains about the following circular lock dependency / possible deadlock: 4745 * 4746 * CPU0 CPU1 4747 * ---- ---- 4748 * lock(&type->i_mutex_dir_key#3/2); 4749 * lock(sb_internal#2); 4750 * lock(&type->i_mutex_dir_key#3/2); 4751 * lock(&sb->s_type->i_mutex_key#14); 4752 * 4753 * Where sb_internal is the lock (a counter that works as a lock) acquired by 4754 * sb_start_intwrite() in btrfs_start_transaction(). 4755 * Not locking i_mutex of the inodes is still safe because: 4756 * 4757 * 1) For regular files we log with a mode of LOG_INODE_EXISTS. It's possible 4758 * that while logging the inode new references (names) are added or removed 4759 * from the inode, leaving the logged inode item with a link count that does 4760 * not match the number of logged inode reference items. This is fine because 4761 * at log replay time we compute the real number of links and correct the 4762 * link count in the inode item (see replay_one_buffer() and 4763 * link_to_fixup_dir()); 4764 * 4765 * 2) For directories we log with a mode of LOG_INODE_ALL. It's possible that 4766 * while logging the inode's items new items with keys BTRFS_DIR_ITEM_KEY and 4767 * BTRFS_DIR_INDEX_KEY are added to fs/subvol tree and the logged inode item 4768 * has a size that doesn't match the sum of the lengths of all the logged 4769 * names. This does not result in a problem because if a dir_item key is 4770 * logged but its matching dir_index key is not logged, at log replay time we 4771 * don't use it to replay the respective name (see replay_one_name()). On the 4772 * other hand if only the dir_index key ends up being logged, the respective 4773 * name is added to the fs/subvol tree with both the dir_item and dir_index 4774 * keys created (see replay_one_name()). 4775 * The directory's inode item with a wrong i_size is not a problem as well, 4776 * since we don't use it at log replay time to set the i_size in the inode 4777 * item of the fs/subvol tree (see overwrite_item()). 4778 */ 4779 static int log_new_dir_dentries(struct btrfs_trans_handle *trans, 4780 struct btrfs_root *root, 4781 struct inode *start_inode, 4782 struct btrfs_log_ctx *ctx) 4783 { 4784 struct btrfs_root *log = root->log_root; 4785 struct btrfs_path *path; 4786 LIST_HEAD(dir_list); 4787 struct btrfs_dir_list *dir_elem; 4788 int ret = 0; 4789 4790 path = btrfs_alloc_path(); 4791 if (!path) 4792 return -ENOMEM; 4793 4794 dir_elem = kmalloc(sizeof(*dir_elem), GFP_NOFS); 4795 if (!dir_elem) { 4796 btrfs_free_path(path); 4797 return -ENOMEM; 4798 } 4799 dir_elem->ino = btrfs_ino(start_inode); 4800 list_add_tail(&dir_elem->list, &dir_list); 4801 4802 while (!list_empty(&dir_list)) { 4803 struct extent_buffer *leaf; 4804 struct btrfs_key min_key; 4805 int nritems; 4806 int i; 4807 4808 dir_elem = list_first_entry(&dir_list, struct btrfs_dir_list, 4809 list); 4810 if (ret) 4811 goto next_dir_inode; 4812 4813 min_key.objectid = dir_elem->ino; 4814 min_key.type = BTRFS_DIR_ITEM_KEY; 4815 min_key.offset = 0; 4816 again: 4817 btrfs_release_path(path); 4818 ret = btrfs_search_forward(log, &min_key, path, trans->transid); 4819 if (ret < 0) { 4820 goto next_dir_inode; 4821 } else if (ret > 0) { 4822 ret = 0; 4823 goto next_dir_inode; 4824 } 4825 4826 process_leaf: 4827 leaf = path->nodes[0]; 4828 nritems = btrfs_header_nritems(leaf); 4829 for (i = path->slots[0]; i < nritems; i++) { 4830 struct btrfs_dir_item *di; 4831 struct btrfs_key di_key; 4832 struct inode *di_inode; 4833 struct btrfs_dir_list *new_dir_elem; 4834 int log_mode = LOG_INODE_EXISTS; 4835 int type; 4836 4837 btrfs_item_key_to_cpu(leaf, &min_key, i); 4838 if (min_key.objectid != dir_elem->ino || 4839 min_key.type != BTRFS_DIR_ITEM_KEY) 4840 goto next_dir_inode; 4841 4842 di = btrfs_item_ptr(leaf, i, struct btrfs_dir_item); 4843 type = btrfs_dir_type(leaf, di); 4844 if (btrfs_dir_transid(leaf, di) < trans->transid && 4845 type != BTRFS_FT_DIR) 4846 continue; 4847 btrfs_dir_item_key_to_cpu(leaf, di, &di_key); 4848 if (di_key.type == BTRFS_ROOT_ITEM_KEY) 4849 continue; 4850 4851 di_inode = btrfs_iget(root->fs_info->sb, &di_key, 4852 root, NULL); 4853 if (IS_ERR(di_inode)) { 4854 ret = PTR_ERR(di_inode); 4855 goto next_dir_inode; 4856 } 4857 4858 if (btrfs_inode_in_log(di_inode, trans->transid)) { 4859 iput(di_inode); 4860 continue; 4861 } 4862 4863 ctx->log_new_dentries = false; 4864 if (type == BTRFS_FT_DIR) 4865 log_mode = LOG_INODE_ALL; 4866 btrfs_release_path(path); 4867 ret = btrfs_log_inode(trans, root, di_inode, 4868 log_mode, 0, LLONG_MAX, ctx); 4869 iput(di_inode); 4870 if (ret) 4871 goto next_dir_inode; 4872 if (ctx->log_new_dentries) { 4873 new_dir_elem = kmalloc(sizeof(*new_dir_elem), 4874 GFP_NOFS); 4875 if (!new_dir_elem) { 4876 ret = -ENOMEM; 4877 goto next_dir_inode; 4878 } 4879 new_dir_elem->ino = di_key.objectid; 4880 list_add_tail(&new_dir_elem->list, &dir_list); 4881 } 4882 break; 4883 } 4884 if (i == nritems) { 4885 ret = btrfs_next_leaf(log, path); 4886 if (ret < 0) { 4887 goto next_dir_inode; 4888 } else if (ret > 0) { 4889 ret = 0; 4890 goto next_dir_inode; 4891 } 4892 goto process_leaf; 4893 } 4894 if (min_key.offset < (u64)-1) { 4895 min_key.offset++; 4896 goto again; 4897 } 4898 next_dir_inode: 4899 list_del(&dir_elem->list); 4900 kfree(dir_elem); 4901 } 4902 4903 btrfs_free_path(path); 4904 return ret; 4905 } 4906 4907 /* 4908 * helper function around btrfs_log_inode to make sure newly created 4909 * parent directories also end up in the log. A minimal inode and backref 4910 * only logging is done of any parent directories that are older than 4911 * the last committed transaction 4912 */ 4913 static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, 4914 struct btrfs_root *root, struct inode *inode, 4915 struct dentry *parent, 4916 const loff_t start, 4917 const loff_t end, 4918 int exists_only, 4919 struct btrfs_log_ctx *ctx) 4920 { 4921 int inode_only = exists_only ? LOG_INODE_EXISTS : LOG_INODE_ALL; 4922 struct super_block *sb; 4923 struct dentry *old_parent = NULL; 4924 int ret = 0; 4925 u64 last_committed = root->fs_info->last_trans_committed; 4926 const struct dentry * const first_parent = parent; 4927 const bool did_unlink = (BTRFS_I(inode)->last_unlink_trans > 4928 last_committed); 4929 bool log_dentries = false; 4930 struct inode *orig_inode = inode; 4931 4932 sb = inode->i_sb; 4933 4934 if (btrfs_test_opt(root, NOTREELOG)) { 4935 ret = 1; 4936 goto end_no_trans; 4937 } 4938 4939 /* 4940 * The prev transaction commit doesn't complete, we need do 4941 * full commit by ourselves. 4942 */ 4943 if (root->fs_info->last_trans_log_full_commit > 4944 root->fs_info->last_trans_committed) { 4945 ret = 1; 4946 goto end_no_trans; 4947 } 4948 4949 if (root != BTRFS_I(inode)->root || 4950 btrfs_root_refs(&root->root_item) == 0) { 4951 ret = 1; 4952 goto end_no_trans; 4953 } 4954 4955 ret = check_parent_dirs_for_sync(trans, inode, parent, 4956 sb, last_committed); 4957 if (ret) 4958 goto end_no_trans; 4959 4960 if (btrfs_inode_in_log(inode, trans->transid)) { 4961 ret = BTRFS_NO_LOG_SYNC; 4962 goto end_no_trans; 4963 } 4964 4965 ret = start_log_trans(trans, root, ctx); 4966 if (ret) 4967 goto end_no_trans; 4968 4969 ret = btrfs_log_inode(trans, root, inode, inode_only, start, end, ctx); 4970 if (ret) 4971 goto end_trans; 4972 4973 /* 4974 * for regular files, if its inode is already on disk, we don't 4975 * have to worry about the parents at all. This is because 4976 * we can use the last_unlink_trans field to record renames 4977 * and other fun in this file. 4978 */ 4979 if (S_ISREG(inode->i_mode) && 4980 BTRFS_I(inode)->generation <= last_committed && 4981 BTRFS_I(inode)->last_unlink_trans <= last_committed) { 4982 ret = 0; 4983 goto end_trans; 4984 } 4985 4986 if (S_ISDIR(inode->i_mode) && ctx && ctx->log_new_dentries) 4987 log_dentries = true; 4988 4989 while (1) { 4990 if (!parent || d_really_is_negative(parent) || sb != d_inode(parent)->i_sb) 4991 break; 4992 4993 inode = d_inode(parent); 4994 if (root != BTRFS_I(inode)->root) 4995 break; 4996 4997 /* 4998 * On unlink we must make sure our immediate parent directory 4999 * inode is fully logged. This is to prevent leaving dangling 5000 * directory index entries and a wrong directory inode's i_size. 5001 * Not doing so can result in a directory being impossible to 5002 * delete after log replay (rmdir will always fail with error 5003 * -ENOTEMPTY). 5004 */ 5005 if (did_unlink && parent == first_parent) 5006 inode_only = LOG_INODE_ALL; 5007 else 5008 inode_only = LOG_INODE_EXISTS; 5009 5010 if (BTRFS_I(inode)->generation > 5011 root->fs_info->last_trans_committed || 5012 inode_only == LOG_INODE_ALL) { 5013 ret = btrfs_log_inode(trans, root, inode, inode_only, 5014 0, LLONG_MAX, ctx); 5015 if (ret) 5016 goto end_trans; 5017 } 5018 if (IS_ROOT(parent)) 5019 break; 5020 5021 parent = dget_parent(parent); 5022 dput(old_parent); 5023 old_parent = parent; 5024 } 5025 if (log_dentries) 5026 ret = log_new_dir_dentries(trans, root, orig_inode, ctx); 5027 else 5028 ret = 0; 5029 end_trans: 5030 dput(old_parent); 5031 if (ret < 0) { 5032 btrfs_set_log_full_commit(root->fs_info, trans); 5033 ret = 1; 5034 } 5035 5036 if (ret) 5037 btrfs_remove_log_ctx(root, ctx); 5038 btrfs_end_log_trans(root); 5039 end_no_trans: 5040 return ret; 5041 } 5042 5043 /* 5044 * it is not safe to log dentry if the chunk root has added new 5045 * chunks. This returns 0 if the dentry was logged, and 1 otherwise. 5046 * If this returns 1, you must commit the transaction to safely get your 5047 * data on disk. 5048 */ 5049 int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, 5050 struct btrfs_root *root, struct dentry *dentry, 5051 const loff_t start, 5052 const loff_t end, 5053 struct btrfs_log_ctx *ctx) 5054 { 5055 struct dentry *parent = dget_parent(dentry); 5056 int ret; 5057 5058 ret = btrfs_log_inode_parent(trans, root, d_inode(dentry), parent, 5059 start, end, 0, ctx); 5060 dput(parent); 5061 5062 return ret; 5063 } 5064 5065 /* 5066 * should be called during mount to recover any replay any log trees 5067 * from the FS 5068 */ 5069 int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) 5070 { 5071 int ret; 5072 struct btrfs_path *path; 5073 struct btrfs_trans_handle *trans; 5074 struct btrfs_key key; 5075 struct btrfs_key found_key; 5076 struct btrfs_key tmp_key; 5077 struct btrfs_root *log; 5078 struct btrfs_fs_info *fs_info = log_root_tree->fs_info; 5079 struct walk_control wc = { 5080 .process_func = process_one_buffer, 5081 .stage = 0, 5082 }; 5083 5084 path = btrfs_alloc_path(); 5085 if (!path) 5086 return -ENOMEM; 5087 5088 fs_info->log_root_recovering = 1; 5089 5090 trans = btrfs_start_transaction(fs_info->tree_root, 0); 5091 if (IS_ERR(trans)) { 5092 ret = PTR_ERR(trans); 5093 goto error; 5094 } 5095 5096 wc.trans = trans; 5097 wc.pin = 1; 5098 5099 ret = walk_log_tree(trans, log_root_tree, &wc); 5100 if (ret) { 5101 btrfs_error(fs_info, ret, "Failed to pin buffers while " 5102 "recovering log root tree."); 5103 goto error; 5104 } 5105 5106 again: 5107 key.objectid = BTRFS_TREE_LOG_OBJECTID; 5108 key.offset = (u64)-1; 5109 key.type = BTRFS_ROOT_ITEM_KEY; 5110 5111 while (1) { 5112 ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0); 5113 5114 if (ret < 0) { 5115 btrfs_error(fs_info, ret, 5116 "Couldn't find tree log root."); 5117 goto error; 5118 } 5119 if (ret > 0) { 5120 if (path->slots[0] == 0) 5121 break; 5122 path->slots[0]--; 5123 } 5124 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 5125 path->slots[0]); 5126 btrfs_release_path(path); 5127 if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID) 5128 break; 5129 5130 log = btrfs_read_fs_root(log_root_tree, &found_key); 5131 if (IS_ERR(log)) { 5132 ret = PTR_ERR(log); 5133 btrfs_error(fs_info, ret, 5134 "Couldn't read tree log root."); 5135 goto error; 5136 } 5137 5138 tmp_key.objectid = found_key.offset; 5139 tmp_key.type = BTRFS_ROOT_ITEM_KEY; 5140 tmp_key.offset = (u64)-1; 5141 5142 wc.replay_dest = btrfs_read_fs_root_no_name(fs_info, &tmp_key); 5143 if (IS_ERR(wc.replay_dest)) { 5144 ret = PTR_ERR(wc.replay_dest); 5145 free_extent_buffer(log->node); 5146 free_extent_buffer(log->commit_root); 5147 kfree(log); 5148 btrfs_error(fs_info, ret, "Couldn't read target root " 5149 "for tree log recovery."); 5150 goto error; 5151 } 5152 5153 wc.replay_dest->log_root = log; 5154 btrfs_record_root_in_trans(trans, wc.replay_dest); 5155 ret = walk_log_tree(trans, log, &wc); 5156 5157 if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) { 5158 ret = fixup_inode_link_counts(trans, wc.replay_dest, 5159 path); 5160 } 5161 5162 key.offset = found_key.offset - 1; 5163 wc.replay_dest->log_root = NULL; 5164 free_extent_buffer(log->node); 5165 free_extent_buffer(log->commit_root); 5166 kfree(log); 5167 5168 if (ret) 5169 goto error; 5170 5171 if (found_key.offset == 0) 5172 break; 5173 } 5174 btrfs_release_path(path); 5175 5176 /* step one is to pin it all, step two is to replay just inodes */ 5177 if (wc.pin) { 5178 wc.pin = 0; 5179 wc.process_func = replay_one_buffer; 5180 wc.stage = LOG_WALK_REPLAY_INODES; 5181 goto again; 5182 } 5183 /* step three is to replay everything */ 5184 if (wc.stage < LOG_WALK_REPLAY_ALL) { 5185 wc.stage++; 5186 goto again; 5187 } 5188 5189 btrfs_free_path(path); 5190 5191 /* step 4: commit the transaction, which also unpins the blocks */ 5192 ret = btrfs_commit_transaction(trans, fs_info->tree_root); 5193 if (ret) 5194 return ret; 5195 5196 free_extent_buffer(log_root_tree->node); 5197 log_root_tree->log_root = NULL; 5198 fs_info->log_root_recovering = 0; 5199 kfree(log_root_tree); 5200 5201 return 0; 5202 error: 5203 if (wc.trans) 5204 btrfs_end_transaction(wc.trans, fs_info->tree_root); 5205 btrfs_free_path(path); 5206 return ret; 5207 } 5208 5209 /* 5210 * there are some corner cases where we want to force a full 5211 * commit instead of allowing a directory to be logged. 5212 * 5213 * They revolve around files there were unlinked from the directory, and 5214 * this function updates the parent directory so that a full commit is 5215 * properly done if it is fsync'd later after the unlinks are done. 5216 */ 5217 void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans, 5218 struct inode *dir, struct inode *inode, 5219 int for_rename) 5220 { 5221 /* 5222 * when we're logging a file, if it hasn't been renamed 5223 * or unlinked, and its inode is fully committed on disk, 5224 * we don't have to worry about walking up the directory chain 5225 * to log its parents. 5226 * 5227 * So, we use the last_unlink_trans field to put this transid 5228 * into the file. When the file is logged we check it and 5229 * don't log the parents if the file is fully on disk. 5230 */ 5231 if (S_ISREG(inode->i_mode)) 5232 BTRFS_I(inode)->last_unlink_trans = trans->transid; 5233 5234 /* 5235 * if this directory was already logged any new 5236 * names for this file/dir will get recorded 5237 */ 5238 smp_mb(); 5239 if (BTRFS_I(dir)->logged_trans == trans->transid) 5240 return; 5241 5242 /* 5243 * if the inode we're about to unlink was logged, 5244 * the log will be properly updated for any new names 5245 */ 5246 if (BTRFS_I(inode)->logged_trans == trans->transid) 5247 return; 5248 5249 /* 5250 * when renaming files across directories, if the directory 5251 * there we're unlinking from gets fsync'd later on, there's 5252 * no way to find the destination directory later and fsync it 5253 * properly. So, we have to be conservative and force commits 5254 * so the new name gets discovered. 5255 */ 5256 if (for_rename) 5257 goto record; 5258 5259 /* we can safely do the unlink without any special recording */ 5260 return; 5261 5262 record: 5263 BTRFS_I(dir)->last_unlink_trans = trans->transid; 5264 } 5265 5266 /* 5267 * Call this after adding a new name for a file and it will properly 5268 * update the log to reflect the new name. 5269 * 5270 * It will return zero if all goes well, and it will return 1 if a 5271 * full transaction commit is required. 5272 */ 5273 int btrfs_log_new_name(struct btrfs_trans_handle *trans, 5274 struct inode *inode, struct inode *old_dir, 5275 struct dentry *parent) 5276 { 5277 struct btrfs_root * root = BTRFS_I(inode)->root; 5278 5279 /* 5280 * this will force the logging code to walk the dentry chain 5281 * up for the file 5282 */ 5283 if (S_ISREG(inode->i_mode)) 5284 BTRFS_I(inode)->last_unlink_trans = trans->transid; 5285 5286 /* 5287 * if this inode hasn't been logged and directory we're renaming it 5288 * from hasn't been logged, we don't need to log it 5289 */ 5290 if (BTRFS_I(inode)->logged_trans <= 5291 root->fs_info->last_trans_committed && 5292 (!old_dir || BTRFS_I(old_dir)->logged_trans <= 5293 root->fs_info->last_trans_committed)) 5294 return 0; 5295 5296 return btrfs_log_inode_parent(trans, root, inode, parent, 0, 5297 LLONG_MAX, 1, NULL); 5298 } 5299 5300