1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright (C) 2007,2008 Oracle. All rights reserved. 4 */ 5 6 #include <linux/sched.h> 7 #include <linux/slab.h> 8 #include <linux/rbtree.h> 9 #include <linux/mm.h> 10 #include <linux/error-injection.h> 11 #include "messages.h" 12 #include "ctree.h" 13 #include "disk-io.h" 14 #include "transaction.h" 15 #include "print-tree.h" 16 #include "locking.h" 17 #include "volumes.h" 18 #include "qgroup.h" 19 #include "tree-mod-log.h" 20 #include "tree-checker.h" 21 #include "fs.h" 22 #include "accessors.h" 23 #include "extent-tree.h" 24 #include "relocation.h" 25 #include "file-item.h" 26 27 static struct kmem_cache *btrfs_path_cachep; 28 29 static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root 30 *root, struct btrfs_path *path, int level); 31 static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root *root, 32 const struct btrfs_key *ins_key, struct btrfs_path *path, 33 int data_size, bool extend); 34 static int push_node_left(struct btrfs_trans_handle *trans, 35 struct extent_buffer *dst, 36 struct extent_buffer *src, bool empty); 37 static int balance_node_right(struct btrfs_trans_handle *trans, 38 struct extent_buffer *dst_buf, 39 struct extent_buffer *src_buf); 40 /* 41 * The leaf data grows from end-to-front in the node. this returns the address 42 * of the start of the last item, which is the stop of the leaf data stack. 43 */ 44 static unsigned int leaf_data_end(const struct extent_buffer *leaf) 45 { 46 u32 nr = btrfs_header_nritems(leaf); 47 48 if (nr == 0) 49 return BTRFS_LEAF_DATA_SIZE(leaf->fs_info); 50 return btrfs_item_offset(leaf, nr - 1); 51 } 52 53 /* 54 * Move data in a @leaf (using memmove, safe for overlapping ranges). 55 * 56 * @leaf: leaf that we're doing a memmove on 57 * @dst_offset: item data offset we're moving to 58 * @src_offset: item data offset were' moving from 59 * @len: length of the data we're moving 60 * 61 * Wrapper around memmove_extent_buffer() that takes into account the header on 62 * the leaf. The btrfs_item offset's start directly after the header, so we 63 * have to adjust any offsets to account for the header in the leaf. This 64 * handles that math to simplify the callers. 65 */ 66 static inline void memmove_leaf_data(const struct extent_buffer *leaf, 67 unsigned long dst_offset, 68 unsigned long src_offset, 69 unsigned long len) 70 { 71 memmove_extent_buffer(leaf, btrfs_item_nr_offset(leaf, 0) + dst_offset, 72 btrfs_item_nr_offset(leaf, 0) + src_offset, len); 73 } 74 75 /* 76 * Copy item data from @src into @dst at the given @offset. 77 * 78 * @dst: destination leaf that we're copying into 79 * @src: source leaf that we're copying from 80 * @dst_offset: item data offset we're copying to 81 * @src_offset: item data offset were' copying from 82 * @len: length of the data we're copying 83 * 84 * Wrapper around copy_extent_buffer() that takes into account the header on 85 * the leaf. The btrfs_item offset's start directly after the header, so we 86 * have to adjust any offsets to account for the header in the leaf. This 87 * handles that math to simplify the callers. 88 */ 89 static inline void copy_leaf_data(const struct extent_buffer *dst, 90 const struct extent_buffer *src, 91 unsigned long dst_offset, 92 unsigned long src_offset, unsigned long len) 93 { 94 copy_extent_buffer(dst, src, btrfs_item_nr_offset(dst, 0) + dst_offset, 95 btrfs_item_nr_offset(src, 0) + src_offset, len); 96 } 97 98 /* 99 * Move items in a @leaf (using memmove). 100 * 101 * @dst: destination leaf for the items 102 * @dst_item: the item nr we're copying into 103 * @src_item: the item nr we're copying from 104 * @nr_items: the number of items to copy 105 * 106 * Wrapper around memmove_extent_buffer() that does the math to get the 107 * appropriate offsets into the leaf from the item numbers. 108 */ 109 static inline void memmove_leaf_items(const struct extent_buffer *leaf, 110 int dst_item, int src_item, int nr_items) 111 { 112 memmove_extent_buffer(leaf, btrfs_item_nr_offset(leaf, dst_item), 113 btrfs_item_nr_offset(leaf, src_item), 114 nr_items * sizeof(struct btrfs_item)); 115 } 116 117 /* 118 * Copy items from @src into @dst at the given @offset. 119 * 120 * @dst: destination leaf for the items 121 * @src: source leaf for the items 122 * @dst_item: the item nr we're copying into 123 * @src_item: the item nr we're copying from 124 * @nr_items: the number of items to copy 125 * 126 * Wrapper around copy_extent_buffer() that does the math to get the 127 * appropriate offsets into the leaf from the item numbers. 128 */ 129 static inline void copy_leaf_items(const struct extent_buffer *dst, 130 const struct extent_buffer *src, 131 int dst_item, int src_item, int nr_items) 132 { 133 copy_extent_buffer(dst, src, btrfs_item_nr_offset(dst, dst_item), 134 btrfs_item_nr_offset(src, src_item), 135 nr_items * sizeof(struct btrfs_item)); 136 } 137 138 struct btrfs_path *btrfs_alloc_path(void) 139 { 140 might_sleep(); 141 142 return kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS); 143 } 144 145 /* this also releases the path */ 146 void btrfs_free_path(struct btrfs_path *p) 147 { 148 if (!p) 149 return; 150 btrfs_release_path(p); 151 kmem_cache_free(btrfs_path_cachep, p); 152 } 153 154 /* 155 * path release drops references on the extent buffers in the path 156 * and it drops any locks held by this path 157 * 158 * It is safe to call this on paths that no locks or extent buffers held. 159 */ 160 noinline void btrfs_release_path(struct btrfs_path *p) 161 { 162 int i; 163 164 for (i = 0; i < BTRFS_MAX_LEVEL; i++) { 165 p->slots[i] = 0; 166 if (!p->nodes[i]) 167 continue; 168 if (p->locks[i]) { 169 btrfs_tree_unlock_rw(p->nodes[i], p->locks[i]); 170 p->locks[i] = 0; 171 } 172 free_extent_buffer(p->nodes[i]); 173 p->nodes[i] = NULL; 174 } 175 } 176 177 /* 178 * safely gets a reference on the root node of a tree. A lock 179 * is not taken, so a concurrent writer may put a different node 180 * at the root of the tree. See btrfs_lock_root_node for the 181 * looping required. 182 * 183 * The extent buffer returned by this has a reference taken, so 184 * it won't disappear. It may stop being the root of the tree 185 * at any time because there are no locks held. 186 */ 187 struct extent_buffer *btrfs_root_node(struct btrfs_root *root) 188 { 189 struct extent_buffer *eb; 190 191 while (1) { 192 rcu_read_lock(); 193 eb = rcu_dereference(root->node); 194 195 /* 196 * RCU really hurts here, we could free up the root node because 197 * it was COWed but we may not get the new root node yet so do 198 * the inc_not_zero dance and if it doesn't work then 199 * synchronize_rcu and try again. 200 */ 201 if (refcount_inc_not_zero(&eb->refs)) { 202 rcu_read_unlock(); 203 break; 204 } 205 rcu_read_unlock(); 206 synchronize_rcu(); 207 } 208 return eb; 209 } 210 211 /* 212 * Cowonly root (not-shareable trees, everything not subvolume or reloc roots), 213 * just get put onto a simple dirty list. Transaction walks this list to make 214 * sure they get properly updated on disk. 215 */ 216 static void add_root_to_dirty_list(struct btrfs_root *root) 217 { 218 struct btrfs_fs_info *fs_info = root->fs_info; 219 220 if (test_bit(BTRFS_ROOT_DIRTY, &root->state) || 221 !test_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state)) 222 return; 223 224 spin_lock(&fs_info->trans_lock); 225 if (!test_and_set_bit(BTRFS_ROOT_DIRTY, &root->state)) { 226 /* Want the extent tree to be the last on the list */ 227 if (btrfs_root_id(root) == BTRFS_EXTENT_TREE_OBJECTID) 228 list_move_tail(&root->dirty_list, 229 &fs_info->dirty_cowonly_roots); 230 else 231 list_move(&root->dirty_list, 232 &fs_info->dirty_cowonly_roots); 233 } 234 spin_unlock(&fs_info->trans_lock); 235 } 236 237 /* 238 * used by snapshot creation to make a copy of a root for a tree with 239 * a given objectid. The buffer with the new root node is returned in 240 * cow_ret, and this func returns zero on success or a negative error code. 241 */ 242 int btrfs_copy_root(struct btrfs_trans_handle *trans, 243 struct btrfs_root *root, 244 struct extent_buffer *buf, 245 struct extent_buffer **cow_ret, u64 new_root_objectid) 246 { 247 struct btrfs_fs_info *fs_info = root->fs_info; 248 struct extent_buffer *cow; 249 int ret = 0; 250 int level; 251 struct btrfs_disk_key disk_key; 252 const bool is_reloc_root = (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID); 253 u64 reloc_src_root = 0; 254 255 WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) && 256 trans->transid != fs_info->running_transaction->transid); 257 WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) && 258 trans->transid != btrfs_get_root_last_trans(root)); 259 260 level = btrfs_header_level(buf); 261 if (level == 0) 262 btrfs_item_key(buf, &disk_key, 0); 263 else 264 btrfs_node_key(buf, &disk_key, 0); 265 266 if (is_reloc_root) 267 reloc_src_root = btrfs_header_owner(buf); 268 cow = btrfs_alloc_tree_block(trans, root, 0, new_root_objectid, 269 &disk_key, level, buf->start, 0, 270 reloc_src_root, BTRFS_NESTING_NEW_ROOT); 271 if (IS_ERR(cow)) 272 return PTR_ERR(cow); 273 274 copy_extent_buffer_full(cow, buf); 275 btrfs_set_header_bytenr(cow, cow->start); 276 btrfs_set_header_generation(cow, trans->transid); 277 btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV); 278 btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN | 279 BTRFS_HEADER_FLAG_RELOC); 280 if (is_reloc_root) 281 btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC); 282 else 283 btrfs_set_header_owner(cow, new_root_objectid); 284 285 write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid); 286 287 if (unlikely(btrfs_header_generation(buf) > trans->transid)) { 288 btrfs_tree_unlock(cow); 289 free_extent_buffer(cow); 290 ret = -EUCLEAN; 291 btrfs_abort_transaction(trans, ret); 292 return ret; 293 } 294 295 ret = btrfs_inc_ref(trans, root, cow, is_reloc_root); 296 if (unlikely(ret)) { 297 btrfs_abort_transaction(trans, ret); 298 btrfs_tree_unlock(cow); 299 free_extent_buffer(cow); 300 return ret; 301 } 302 303 btrfs_mark_buffer_dirty(trans, cow); 304 *cow_ret = cow; 305 return 0; 306 } 307 308 /* 309 * check if the tree block can be shared by multiple trees 310 */ 311 bool btrfs_block_can_be_shared(const struct btrfs_trans_handle *trans, 312 const struct btrfs_root *root, 313 const struct extent_buffer *buf) 314 { 315 const u64 buf_gen = btrfs_header_generation(buf); 316 317 /* 318 * Tree blocks not in shareable trees and tree roots are never shared. 319 * If a block was allocated after the last snapshot and the block was 320 * not allocated by tree relocation, we know the block is not shared. 321 */ 322 323 if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) 324 return false; 325 326 if (buf == root->node) 327 return false; 328 329 if (buf_gen > btrfs_root_last_snapshot(&root->root_item) && 330 !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) 331 return false; 332 333 if (buf != root->commit_root) 334 return true; 335 336 /* 337 * An extent buffer that used to be the commit root may still be shared 338 * because the tree height may have increased and it became a child of a 339 * higher level root. This can happen when snapshotting a subvolume 340 * created in the current transaction. 341 */ 342 if (buf_gen == trans->transid) 343 return true; 344 345 return false; 346 } 347 348 static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, 349 struct btrfs_root *root, 350 struct extent_buffer *buf, 351 struct extent_buffer *cow, 352 int *last_ref) 353 { 354 struct btrfs_fs_info *fs_info = root->fs_info; 355 u64 refs; 356 u64 owner; 357 u64 flags; 358 int ret; 359 const bool is_reloc_root = (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID); 360 361 /* 362 * Backrefs update rules: 363 * 364 * Always use full backrefs for extent pointers in tree block 365 * allocated by tree relocation. 366 * 367 * If a shared tree block is no longer referenced by its owner 368 * tree (btrfs_header_owner(buf) == root->root_key.objectid), 369 * use full backrefs for extent pointers in tree block. 370 * 371 * If a tree block is been relocating 372 * (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID), 373 * use full backrefs for extent pointers in tree block. 374 * The reason for this is some operations (such as drop tree) 375 * are only allowed for blocks use full backrefs. 376 */ 377 378 if (btrfs_block_can_be_shared(trans, root, buf)) { 379 ret = btrfs_lookup_extent_info(trans, fs_info, buf->start, 380 btrfs_header_level(buf), 1, 381 &refs, &flags, NULL); 382 if (ret) 383 return ret; 384 if (unlikely(refs == 0)) { 385 btrfs_crit(fs_info, 386 "found 0 references for tree block at bytenr %llu level %d root %llu", 387 buf->start, btrfs_header_level(buf), 388 btrfs_root_id(root)); 389 ret = -EUCLEAN; 390 btrfs_abort_transaction(trans, ret); 391 return ret; 392 } 393 } else { 394 refs = 1; 395 if (is_reloc_root || btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV) 396 flags = BTRFS_BLOCK_FLAG_FULL_BACKREF; 397 else 398 flags = 0; 399 } 400 401 owner = btrfs_header_owner(buf); 402 if (unlikely(owner == BTRFS_TREE_RELOC_OBJECTID && 403 !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))) { 404 btrfs_crit(fs_info, 405 "found tree block at bytenr %llu level %d root %llu refs %llu flags %llx without full backref flag set", 406 buf->start, btrfs_header_level(buf), 407 btrfs_root_id(root), refs, flags); 408 ret = -EUCLEAN; 409 btrfs_abort_transaction(trans, ret); 410 return ret; 411 } 412 413 if (refs > 1) { 414 if ((owner == btrfs_root_id(root) || is_reloc_root) && 415 !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) { 416 ret = btrfs_inc_ref(trans, root, buf, true); 417 if (ret) 418 return ret; 419 420 if (is_reloc_root) { 421 ret = btrfs_dec_ref(trans, root, buf, false); 422 if (ret) 423 return ret; 424 ret = btrfs_inc_ref(trans, root, cow, true); 425 if (ret) 426 return ret; 427 } 428 ret = btrfs_set_disk_extent_flags(trans, buf, 429 BTRFS_BLOCK_FLAG_FULL_BACKREF); 430 if (ret) 431 return ret; 432 } else { 433 ret = btrfs_inc_ref(trans, root, cow, is_reloc_root); 434 if (ret) 435 return ret; 436 } 437 } else { 438 if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) { 439 ret = btrfs_inc_ref(trans, root, cow, is_reloc_root); 440 if (ret) 441 return ret; 442 ret = btrfs_dec_ref(trans, root, buf, true); 443 if (ret) 444 return ret; 445 } 446 btrfs_clear_buffer_dirty(trans, buf); 447 *last_ref = 1; 448 } 449 return 0; 450 } 451 452 /* 453 * does the dirty work in cow of a single block. The parent block (if 454 * supplied) is updated to point to the new cow copy. The new buffer is marked 455 * dirty and returned locked. If you modify the block it needs to be marked 456 * dirty again. 457 * 458 * search_start -- an allocation hint for the new block 459 * 460 * empty_size -- a hint that you plan on doing more cow. This is the size in 461 * bytes the allocator should try to find free next to the block it returns. 462 * This is just a hint and may be ignored by the allocator. 463 */ 464 int btrfs_force_cow_block(struct btrfs_trans_handle *trans, 465 struct btrfs_root *root, 466 struct extent_buffer *buf, 467 struct extent_buffer *parent, int parent_slot, 468 struct extent_buffer **cow_ret, 469 u64 search_start, u64 empty_size, 470 enum btrfs_lock_nesting nest) 471 { 472 struct btrfs_fs_info *fs_info = root->fs_info; 473 struct btrfs_disk_key disk_key; 474 struct extent_buffer *cow; 475 int level, ret; 476 int last_ref = 0; 477 int unlock_orig = 0; 478 u64 parent_start = 0; 479 u64 reloc_src_root = 0; 480 481 if (*cow_ret == buf) 482 unlock_orig = 1; 483 484 btrfs_assert_tree_write_locked(buf); 485 486 WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) && 487 trans->transid != fs_info->running_transaction->transid); 488 WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) && 489 trans->transid != btrfs_get_root_last_trans(root)); 490 491 level = btrfs_header_level(buf); 492 493 if (level == 0) 494 btrfs_item_key(buf, &disk_key, 0); 495 else 496 btrfs_node_key(buf, &disk_key, 0); 497 498 if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID) { 499 if (parent) 500 parent_start = parent->start; 501 reloc_src_root = btrfs_header_owner(buf); 502 } 503 cow = btrfs_alloc_tree_block(trans, root, parent_start, 504 btrfs_root_id(root), &disk_key, level, 505 search_start, empty_size, reloc_src_root, nest); 506 if (IS_ERR(cow)) 507 return PTR_ERR(cow); 508 509 /* cow is set to blocking by btrfs_init_new_buffer */ 510 511 copy_extent_buffer_full(cow, buf); 512 btrfs_set_header_bytenr(cow, cow->start); 513 btrfs_set_header_generation(cow, trans->transid); 514 btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV); 515 btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN | 516 BTRFS_HEADER_FLAG_RELOC); 517 if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID) 518 btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC); 519 else 520 btrfs_set_header_owner(cow, btrfs_root_id(root)); 521 522 write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid); 523 524 ret = update_ref_for_cow(trans, root, buf, cow, &last_ref); 525 if (unlikely(ret)) { 526 btrfs_abort_transaction(trans, ret); 527 goto error_unlock_cow; 528 } 529 530 if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) { 531 ret = btrfs_reloc_cow_block(trans, root, buf, cow); 532 if (unlikely(ret)) { 533 btrfs_abort_transaction(trans, ret); 534 goto error_unlock_cow; 535 } 536 } 537 538 if (buf == root->node) { 539 WARN_ON(parent && parent != buf); 540 if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID || 541 btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV) 542 parent_start = buf->start; 543 544 ret = btrfs_tree_mod_log_insert_root(root->node, cow, true); 545 if (unlikely(ret < 0)) { 546 btrfs_abort_transaction(trans, ret); 547 goto error_unlock_cow; 548 } 549 refcount_inc(&cow->refs); 550 rcu_assign_pointer(root->node, cow); 551 552 ret = btrfs_free_tree_block(trans, btrfs_root_id(root), buf, 553 parent_start, last_ref); 554 free_extent_buffer(buf); 555 add_root_to_dirty_list(root); 556 if (unlikely(ret < 0)) { 557 btrfs_abort_transaction(trans, ret); 558 goto error_unlock_cow; 559 } 560 } else { 561 WARN_ON(trans->transid != btrfs_header_generation(parent)); 562 ret = btrfs_tree_mod_log_insert_key(parent, parent_slot, 563 BTRFS_MOD_LOG_KEY_REPLACE); 564 if (unlikely(ret)) { 565 btrfs_abort_transaction(trans, ret); 566 goto error_unlock_cow; 567 } 568 btrfs_set_node_blockptr(parent, parent_slot, 569 cow->start); 570 btrfs_set_node_ptr_generation(parent, parent_slot, 571 trans->transid); 572 btrfs_mark_buffer_dirty(trans, parent); 573 if (last_ref) { 574 ret = btrfs_tree_mod_log_free_eb(buf); 575 if (unlikely(ret)) { 576 btrfs_abort_transaction(trans, ret); 577 goto error_unlock_cow; 578 } 579 } 580 ret = btrfs_free_tree_block(trans, btrfs_root_id(root), buf, 581 parent_start, last_ref); 582 if (unlikely(ret < 0)) { 583 btrfs_abort_transaction(trans, ret); 584 goto error_unlock_cow; 585 } 586 } 587 588 trace_btrfs_cow_block(root, buf, cow); 589 if (unlock_orig) 590 btrfs_tree_unlock(buf); 591 free_extent_buffer_stale(buf); 592 btrfs_mark_buffer_dirty(trans, cow); 593 *cow_ret = cow; 594 return 0; 595 596 error_unlock_cow: 597 btrfs_tree_unlock(cow); 598 free_extent_buffer(cow); 599 return ret; 600 } 601 602 static inline bool should_cow_block(const struct btrfs_trans_handle *trans, 603 const struct btrfs_root *root, 604 const struct extent_buffer *buf) 605 { 606 if (btrfs_is_testing(root->fs_info)) 607 return false; 608 609 /* 610 * We do not need to cow a block if 611 * 1) this block is not created or changed in this transaction; 612 * 2) this block does not belong to TREE_RELOC tree; 613 * 3) the root is not forced COW. 614 * 615 * What is forced COW: 616 * when we create snapshot during committing the transaction, 617 * after we've finished copying src root, we must COW the shared 618 * block to ensure the metadata consistency. 619 */ 620 621 if (btrfs_header_generation(buf) != trans->transid) 622 return true; 623 624 if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) 625 return true; 626 627 /* Ensure we can see the FORCE_COW bit. */ 628 smp_mb__before_atomic(); 629 if (test_bit(BTRFS_ROOT_FORCE_COW, &root->state)) 630 return true; 631 632 if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID) 633 return false; 634 635 if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) 636 return true; 637 638 return false; 639 } 640 641 /* 642 * COWs a single block, see btrfs_force_cow_block() for the real work. 643 * This version of it has extra checks so that a block isn't COWed more than 644 * once per transaction, as long as it hasn't been written yet 645 */ 646 int btrfs_cow_block(struct btrfs_trans_handle *trans, 647 struct btrfs_root *root, struct extent_buffer *buf, 648 struct extent_buffer *parent, int parent_slot, 649 struct extent_buffer **cow_ret, 650 enum btrfs_lock_nesting nest) 651 { 652 struct btrfs_fs_info *fs_info = root->fs_info; 653 u64 search_start; 654 655 if (unlikely(test_bit(BTRFS_ROOT_DELETING, &root->state))) { 656 btrfs_abort_transaction(trans, -EUCLEAN); 657 btrfs_crit(fs_info, 658 "attempt to COW block %llu on root %llu that is being deleted", 659 buf->start, btrfs_root_id(root)); 660 return -EUCLEAN; 661 } 662 663 /* 664 * COWing must happen through a running transaction, which always 665 * matches the current fs generation (it's a transaction with a state 666 * less than TRANS_STATE_UNBLOCKED). If it doesn't, then turn the fs 667 * into error state to prevent the commit of any transaction. 668 */ 669 if (unlikely(trans->transaction != fs_info->running_transaction || 670 trans->transid != fs_info->generation)) { 671 btrfs_abort_transaction(trans, -EUCLEAN); 672 btrfs_crit(fs_info, 673 "unexpected transaction when attempting to COW block %llu on root %llu, transaction %llu running transaction %llu fs generation %llu", 674 buf->start, btrfs_root_id(root), trans->transid, 675 fs_info->running_transaction->transid, 676 fs_info->generation); 677 return -EUCLEAN; 678 } 679 680 if (!should_cow_block(trans, root, buf)) { 681 *cow_ret = buf; 682 return 0; 683 } 684 685 search_start = round_down(buf->start, SZ_1G); 686 687 /* 688 * Before CoWing this block for later modification, check if it's 689 * the subtree root and do the delayed subtree trace if needed. 690 * 691 * Also We don't care about the error, as it's handled internally. 692 */ 693 btrfs_qgroup_trace_subtree_after_cow(trans, root, buf); 694 return btrfs_force_cow_block(trans, root, buf, parent, parent_slot, 695 cow_ret, search_start, 0, nest); 696 } 697 ALLOW_ERROR_INJECTION(btrfs_cow_block, ERRNO); 698 699 /* 700 * same as comp_keys only with two btrfs_key's 701 */ 702 int __pure btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2) 703 { 704 if (k1->objectid > k2->objectid) 705 return 1; 706 if (k1->objectid < k2->objectid) 707 return -1; 708 if (k1->type > k2->type) 709 return 1; 710 if (k1->type < k2->type) 711 return -1; 712 if (k1->offset > k2->offset) 713 return 1; 714 if (k1->offset < k2->offset) 715 return -1; 716 return 0; 717 } 718 719 /* 720 * Search for a key in the given extent_buffer. 721 * 722 * The lower boundary for the search is specified by the slot number @first_slot. 723 * Use a value of 0 to search over the whole extent buffer. Works for both 724 * leaves and nodes. 725 * 726 * The slot in the extent buffer is returned via @slot. If the key exists in the 727 * extent buffer, then @slot will point to the slot where the key is, otherwise 728 * it points to the slot where you would insert the key. 729 * 730 * Slot may point to the total number of items (i.e. one position beyond the last 731 * key) if the key is bigger than the last key in the extent buffer. 732 */ 733 int btrfs_bin_search(const struct extent_buffer *eb, int first_slot, 734 const struct btrfs_key *key, int *slot) 735 { 736 unsigned long p; 737 int item_size; 738 /* 739 * Use unsigned types for the low and high slots, so that we get a more 740 * efficient division in the search loop below. 741 */ 742 u32 low = first_slot; 743 u32 high = btrfs_header_nritems(eb); 744 int ret; 745 const int key_size = sizeof(struct btrfs_disk_key); 746 747 if (unlikely(low > high)) { 748 btrfs_err(eb->fs_info, 749 "%s: low (%u) > high (%u) eb %llu owner %llu level %d", 750 __func__, low, high, eb->start, 751 btrfs_header_owner(eb), btrfs_header_level(eb)); 752 return -EINVAL; 753 } 754 755 if (btrfs_header_level(eb) == 0) { 756 p = offsetof(struct btrfs_leaf, items); 757 item_size = sizeof(struct btrfs_item); 758 } else { 759 p = offsetof(struct btrfs_node, ptrs); 760 item_size = sizeof(struct btrfs_key_ptr); 761 } 762 763 while (low < high) { 764 const int unit_size = eb->folio_size; 765 unsigned long oil; 766 unsigned long offset; 767 struct btrfs_disk_key *tmp; 768 struct btrfs_disk_key unaligned; 769 int mid; 770 771 mid = (low + high) / 2; 772 offset = p + mid * item_size; 773 oil = get_eb_offset_in_folio(eb, offset); 774 775 if (oil + key_size <= unit_size) { 776 const unsigned long idx = get_eb_folio_index(eb, offset); 777 char *kaddr = folio_address(eb->folios[idx]); 778 779 oil = get_eb_offset_in_folio(eb, offset); 780 tmp = (struct btrfs_disk_key *)(kaddr + oil); 781 } else { 782 read_extent_buffer(eb, &unaligned, offset, key_size); 783 tmp = &unaligned; 784 } 785 786 ret = btrfs_comp_keys(tmp, key); 787 788 if (ret < 0) 789 low = mid + 1; 790 else if (ret > 0) 791 high = mid; 792 else { 793 *slot = mid; 794 return 0; 795 } 796 } 797 *slot = low; 798 return 1; 799 } 800 801 static void root_add_used_bytes(struct btrfs_root *root) 802 { 803 spin_lock(&root->accounting_lock); 804 btrfs_set_root_used(&root->root_item, 805 btrfs_root_used(&root->root_item) + root->fs_info->nodesize); 806 spin_unlock(&root->accounting_lock); 807 } 808 809 static void root_sub_used_bytes(struct btrfs_root *root) 810 { 811 spin_lock(&root->accounting_lock); 812 btrfs_set_root_used(&root->root_item, 813 btrfs_root_used(&root->root_item) - root->fs_info->nodesize); 814 spin_unlock(&root->accounting_lock); 815 } 816 817 /* given a node and slot number, this reads the blocks it points to. The 818 * extent buffer is returned with a reference taken (but unlocked). 819 */ 820 struct extent_buffer *btrfs_read_node_slot(struct extent_buffer *parent, 821 int slot) 822 { 823 int level = btrfs_header_level(parent); 824 struct btrfs_tree_parent_check check = { 0 }; 825 struct extent_buffer *eb; 826 827 if (slot < 0 || slot >= btrfs_header_nritems(parent)) 828 return ERR_PTR(-ENOENT); 829 830 ASSERT(level); 831 832 check.level = level - 1; 833 check.transid = btrfs_node_ptr_generation(parent, slot); 834 check.owner_root = btrfs_header_owner(parent); 835 check.has_first_key = true; 836 btrfs_node_key_to_cpu(parent, &check.first_key, slot); 837 838 eb = read_tree_block(parent->fs_info, btrfs_node_blockptr(parent, slot), 839 &check); 840 if (IS_ERR(eb)) 841 return eb; 842 if (unlikely(!extent_buffer_uptodate(eb))) { 843 free_extent_buffer(eb); 844 return ERR_PTR(-EIO); 845 } 846 847 return eb; 848 } 849 850 /* 851 * Promote a child node to become the new tree root. 852 * 853 * @trans: Transaction handle 854 * @root: Tree root structure to update 855 * @path: Path holding nodes and locks 856 * @level: Level of the parent (old root) 857 * @parent: The parent (old root) with exactly one item 858 * 859 * This helper is called during rebalancing when the root node contains only 860 * a single item (nritems == 1). We can reduce the tree height by promoting 861 * that child to become the new root and freeing the old root node. The path 862 * locks and references are updated accordingly. 863 * 864 * Return: 0 on success, negative errno on failure. The transaction is aborted 865 * on critical errors. 866 */ 867 static int promote_child_to_root(struct btrfs_trans_handle *trans, 868 struct btrfs_root *root, struct btrfs_path *path, 869 int level, struct extent_buffer *parent) 870 { 871 struct extent_buffer *child; 872 int ret; 873 874 ASSERT(btrfs_header_nritems(parent) == 1); 875 876 child = btrfs_read_node_slot(parent, 0); 877 if (IS_ERR(child)) 878 return PTR_ERR(child); 879 880 btrfs_tree_lock(child); 881 ret = btrfs_cow_block(trans, root, child, parent, 0, &child, BTRFS_NESTING_COW); 882 if (ret) { 883 btrfs_tree_unlock(child); 884 free_extent_buffer(child); 885 return ret; 886 } 887 888 ret = btrfs_tree_mod_log_insert_root(root->node, child, true); 889 if (unlikely(ret < 0)) { 890 btrfs_tree_unlock(child); 891 free_extent_buffer(child); 892 btrfs_abort_transaction(trans, ret); 893 return ret; 894 } 895 rcu_assign_pointer(root->node, child); 896 897 add_root_to_dirty_list(root); 898 btrfs_tree_unlock(child); 899 900 path->locks[level] = 0; 901 path->nodes[level] = NULL; 902 btrfs_clear_buffer_dirty(trans, parent); 903 btrfs_tree_unlock(parent); 904 /* Once for the path. */ 905 free_extent_buffer(parent); 906 907 root_sub_used_bytes(root); 908 ret = btrfs_free_tree_block(trans, btrfs_root_id(root), parent, 0, 1); 909 /* Once for the root ptr. */ 910 free_extent_buffer_stale(parent); 911 if (unlikely(ret < 0)) { 912 btrfs_abort_transaction(trans, ret); 913 return ret; 914 } 915 916 return 0; 917 } 918 919 /* 920 * node level balancing, used to make sure nodes are in proper order for 921 * item deletion. We balance from the top down, so we have to make sure 922 * that a deletion won't leave an node completely empty later on. 923 */ 924 static noinline int balance_level(struct btrfs_trans_handle *trans, 925 struct btrfs_root *root, 926 struct btrfs_path *path, int level) 927 { 928 struct btrfs_fs_info *fs_info = root->fs_info; 929 struct extent_buffer *right = NULL; 930 struct extent_buffer *mid; 931 struct extent_buffer *left = NULL; 932 struct extent_buffer *parent = NULL; 933 int ret = 0; 934 int wret; 935 int pslot; 936 int orig_slot = path->slots[level]; 937 u64 orig_ptr; 938 939 ASSERT(level > 0); 940 941 mid = path->nodes[level]; 942 943 WARN_ON(path->locks[level] != BTRFS_WRITE_LOCK); 944 WARN_ON(btrfs_header_generation(mid) != trans->transid); 945 946 orig_ptr = btrfs_node_blockptr(mid, orig_slot); 947 948 if (level < BTRFS_MAX_LEVEL - 1) { 949 parent = path->nodes[level + 1]; 950 pslot = path->slots[level + 1]; 951 } 952 953 /* 954 * deal with the case where there is only one pointer in the root 955 * by promoting the node below to a root 956 */ 957 if (!parent) { 958 if (btrfs_header_nritems(mid) != 1) 959 return 0; 960 961 return promote_child_to_root(trans, root, path, level, mid); 962 } 963 if (btrfs_header_nritems(mid) > 964 BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 4) 965 return 0; 966 967 if (pslot) { 968 left = btrfs_read_node_slot(parent, pslot - 1); 969 if (IS_ERR(left)) { 970 ret = PTR_ERR(left); 971 left = NULL; 972 goto out; 973 } 974 975 btrfs_tree_lock_nested(left, BTRFS_NESTING_LEFT); 976 wret = btrfs_cow_block(trans, root, left, 977 parent, pslot - 1, &left, 978 BTRFS_NESTING_LEFT_COW); 979 if (wret) { 980 ret = wret; 981 goto out; 982 } 983 } 984 985 if (pslot + 1 < btrfs_header_nritems(parent)) { 986 right = btrfs_read_node_slot(parent, pslot + 1); 987 if (IS_ERR(right)) { 988 ret = PTR_ERR(right); 989 right = NULL; 990 goto out; 991 } 992 993 btrfs_tree_lock_nested(right, BTRFS_NESTING_RIGHT); 994 wret = btrfs_cow_block(trans, root, right, 995 parent, pslot + 1, &right, 996 BTRFS_NESTING_RIGHT_COW); 997 if (wret) { 998 ret = wret; 999 goto out; 1000 } 1001 } 1002 1003 /* first, try to make some room in the middle buffer */ 1004 if (left) { 1005 orig_slot += btrfs_header_nritems(left); 1006 wret = push_node_left(trans, left, mid, 1); 1007 if (wret < 0) 1008 ret = wret; 1009 } 1010 1011 /* 1012 * then try to empty the right most buffer into the middle 1013 */ 1014 if (right) { 1015 wret = push_node_left(trans, mid, right, 1); 1016 if (wret < 0 && wret != -ENOSPC) 1017 ret = wret; 1018 if (btrfs_header_nritems(right) == 0) { 1019 btrfs_clear_buffer_dirty(trans, right); 1020 btrfs_tree_unlock(right); 1021 ret = btrfs_del_ptr(trans, root, path, level + 1, pslot + 1); 1022 if (ret < 0) { 1023 free_extent_buffer_stale(right); 1024 right = NULL; 1025 goto out; 1026 } 1027 root_sub_used_bytes(root); 1028 ret = btrfs_free_tree_block(trans, btrfs_root_id(root), 1029 right, 0, 1); 1030 free_extent_buffer_stale(right); 1031 right = NULL; 1032 if (unlikely(ret < 0)) { 1033 btrfs_abort_transaction(trans, ret); 1034 goto out; 1035 } 1036 } else { 1037 struct btrfs_disk_key right_key; 1038 btrfs_node_key(right, &right_key, 0); 1039 ret = btrfs_tree_mod_log_insert_key(parent, pslot + 1, 1040 BTRFS_MOD_LOG_KEY_REPLACE); 1041 if (unlikely(ret < 0)) { 1042 btrfs_abort_transaction(trans, ret); 1043 goto out; 1044 } 1045 btrfs_set_node_key(parent, &right_key, pslot + 1); 1046 btrfs_mark_buffer_dirty(trans, parent); 1047 } 1048 } 1049 if (btrfs_header_nritems(mid) == 1) { 1050 /* 1051 * we're not allowed to leave a node with one item in the 1052 * tree during a delete. A deletion from lower in the tree 1053 * could try to delete the only pointer in this node. 1054 * So, pull some keys from the left. 1055 * There has to be a left pointer at this point because 1056 * otherwise we would have pulled some pointers from the 1057 * right 1058 */ 1059 if (unlikely(!left)) { 1060 btrfs_crit(fs_info, 1061 "missing left child when middle child only has 1 item, parent bytenr %llu level %d mid bytenr %llu root %llu", 1062 parent->start, btrfs_header_level(parent), 1063 mid->start, btrfs_root_id(root)); 1064 ret = -EUCLEAN; 1065 btrfs_abort_transaction(trans, ret); 1066 goto out; 1067 } 1068 wret = balance_node_right(trans, mid, left); 1069 if (wret < 0) { 1070 ret = wret; 1071 goto out; 1072 } 1073 if (wret == 1) { 1074 wret = push_node_left(trans, left, mid, 1); 1075 if (wret < 0) 1076 ret = wret; 1077 } 1078 BUG_ON(wret == 1); 1079 } 1080 if (btrfs_header_nritems(mid) == 0) { 1081 btrfs_clear_buffer_dirty(trans, mid); 1082 btrfs_tree_unlock(mid); 1083 ret = btrfs_del_ptr(trans, root, path, level + 1, pslot); 1084 if (ret < 0) { 1085 free_extent_buffer_stale(mid); 1086 mid = NULL; 1087 goto out; 1088 } 1089 root_sub_used_bytes(root); 1090 ret = btrfs_free_tree_block(trans, btrfs_root_id(root), mid, 0, 1); 1091 free_extent_buffer_stale(mid); 1092 mid = NULL; 1093 if (unlikely(ret < 0)) { 1094 btrfs_abort_transaction(trans, ret); 1095 goto out; 1096 } 1097 } else { 1098 /* update the parent key to reflect our changes */ 1099 struct btrfs_disk_key mid_key; 1100 btrfs_node_key(mid, &mid_key, 0); 1101 ret = btrfs_tree_mod_log_insert_key(parent, pslot, 1102 BTRFS_MOD_LOG_KEY_REPLACE); 1103 if (unlikely(ret < 0)) { 1104 btrfs_abort_transaction(trans, ret); 1105 goto out; 1106 } 1107 btrfs_set_node_key(parent, &mid_key, pslot); 1108 btrfs_mark_buffer_dirty(trans, parent); 1109 } 1110 1111 /* update the path */ 1112 if (left) { 1113 if (btrfs_header_nritems(left) > orig_slot) { 1114 /* left was locked after cow */ 1115 path->nodes[level] = left; 1116 path->slots[level + 1] -= 1; 1117 path->slots[level] = orig_slot; 1118 /* Left is now owned by path. */ 1119 left = NULL; 1120 if (mid) { 1121 btrfs_tree_unlock(mid); 1122 free_extent_buffer(mid); 1123 } 1124 } else { 1125 orig_slot -= btrfs_header_nritems(left); 1126 path->slots[level] = orig_slot; 1127 } 1128 } 1129 /* double check we haven't messed things up */ 1130 if (orig_ptr != 1131 btrfs_node_blockptr(path->nodes[level], path->slots[level])) 1132 BUG(); 1133 out: 1134 if (right) { 1135 btrfs_tree_unlock(right); 1136 free_extent_buffer(right); 1137 } 1138 if (left) { 1139 btrfs_tree_unlock(left); 1140 free_extent_buffer(left); 1141 } 1142 return ret; 1143 } 1144 1145 /* Node balancing for insertion. Here we only split or push nodes around 1146 * when they are completely full. This is also done top down, so we 1147 * have to be pessimistic. 1148 */ 1149 static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, 1150 struct btrfs_root *root, 1151 struct btrfs_path *path, int level) 1152 { 1153 struct btrfs_fs_info *fs_info = root->fs_info; 1154 struct extent_buffer *right = NULL; 1155 struct extent_buffer *mid; 1156 struct extent_buffer *left = NULL; 1157 struct extent_buffer *parent = NULL; 1158 int ret = 0; 1159 int wret; 1160 int pslot; 1161 int orig_slot = path->slots[level]; 1162 1163 if (level == 0) 1164 return 1; 1165 1166 mid = path->nodes[level]; 1167 WARN_ON(btrfs_header_generation(mid) != trans->transid); 1168 1169 if (level < BTRFS_MAX_LEVEL - 1) { 1170 parent = path->nodes[level + 1]; 1171 pslot = path->slots[level + 1]; 1172 } 1173 1174 if (!parent) 1175 return 1; 1176 1177 /* first, try to make some room in the middle buffer */ 1178 if (pslot) { 1179 u32 left_nr; 1180 1181 left = btrfs_read_node_slot(parent, pslot - 1); 1182 if (IS_ERR(left)) 1183 return PTR_ERR(left); 1184 1185 btrfs_tree_lock_nested(left, BTRFS_NESTING_LEFT); 1186 1187 left_nr = btrfs_header_nritems(left); 1188 if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) { 1189 wret = 1; 1190 } else { 1191 ret = btrfs_cow_block(trans, root, left, parent, 1192 pslot - 1, &left, 1193 BTRFS_NESTING_LEFT_COW); 1194 if (ret) 1195 wret = 1; 1196 else { 1197 wret = push_node_left(trans, left, mid, 0); 1198 } 1199 } 1200 if (wret < 0) 1201 ret = wret; 1202 if (wret == 0) { 1203 struct btrfs_disk_key disk_key; 1204 orig_slot += left_nr; 1205 btrfs_node_key(mid, &disk_key, 0); 1206 ret = btrfs_tree_mod_log_insert_key(parent, pslot, 1207 BTRFS_MOD_LOG_KEY_REPLACE); 1208 if (unlikely(ret < 0)) { 1209 btrfs_tree_unlock(left); 1210 free_extent_buffer(left); 1211 btrfs_abort_transaction(trans, ret); 1212 return ret; 1213 } 1214 btrfs_set_node_key(parent, &disk_key, pslot); 1215 btrfs_mark_buffer_dirty(trans, parent); 1216 if (btrfs_header_nritems(left) > orig_slot) { 1217 path->nodes[level] = left; 1218 path->slots[level + 1] -= 1; 1219 path->slots[level] = orig_slot; 1220 btrfs_tree_unlock(mid); 1221 free_extent_buffer(mid); 1222 } else { 1223 orig_slot -= 1224 btrfs_header_nritems(left); 1225 path->slots[level] = orig_slot; 1226 btrfs_tree_unlock(left); 1227 free_extent_buffer(left); 1228 } 1229 return 0; 1230 } 1231 btrfs_tree_unlock(left); 1232 free_extent_buffer(left); 1233 } 1234 1235 /* 1236 * then try to empty the right most buffer into the middle 1237 */ 1238 if (pslot + 1 < btrfs_header_nritems(parent)) { 1239 u32 right_nr; 1240 1241 right = btrfs_read_node_slot(parent, pslot + 1); 1242 if (IS_ERR(right)) 1243 return PTR_ERR(right); 1244 1245 btrfs_tree_lock_nested(right, BTRFS_NESTING_RIGHT); 1246 1247 right_nr = btrfs_header_nritems(right); 1248 if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) { 1249 wret = 1; 1250 } else { 1251 ret = btrfs_cow_block(trans, root, right, 1252 parent, pslot + 1, 1253 &right, BTRFS_NESTING_RIGHT_COW); 1254 if (ret) 1255 wret = 1; 1256 else { 1257 wret = balance_node_right(trans, right, mid); 1258 } 1259 } 1260 if (wret < 0) 1261 ret = wret; 1262 if (wret == 0) { 1263 struct btrfs_disk_key disk_key; 1264 1265 btrfs_node_key(right, &disk_key, 0); 1266 ret = btrfs_tree_mod_log_insert_key(parent, pslot + 1, 1267 BTRFS_MOD_LOG_KEY_REPLACE); 1268 if (unlikely(ret < 0)) { 1269 btrfs_tree_unlock(right); 1270 free_extent_buffer(right); 1271 btrfs_abort_transaction(trans, ret); 1272 return ret; 1273 } 1274 btrfs_set_node_key(parent, &disk_key, pslot + 1); 1275 btrfs_mark_buffer_dirty(trans, parent); 1276 1277 if (btrfs_header_nritems(mid) <= orig_slot) { 1278 path->nodes[level] = right; 1279 path->slots[level + 1] += 1; 1280 path->slots[level] = orig_slot - 1281 btrfs_header_nritems(mid); 1282 btrfs_tree_unlock(mid); 1283 free_extent_buffer(mid); 1284 } else { 1285 btrfs_tree_unlock(right); 1286 free_extent_buffer(right); 1287 } 1288 return 0; 1289 } 1290 btrfs_tree_unlock(right); 1291 free_extent_buffer(right); 1292 } 1293 return 1; 1294 } 1295 1296 /* 1297 * readahead one full node of leaves, finding things that are close 1298 * to the block in 'slot', and triggering ra on them. 1299 */ 1300 static void reada_for_search(struct btrfs_fs_info *fs_info, 1301 const struct btrfs_path *path, 1302 int level, int slot, u64 objectid) 1303 { 1304 struct extent_buffer *node; 1305 struct btrfs_disk_key disk_key; 1306 u32 nritems; 1307 u64 search; 1308 u64 target; 1309 u64 nread = 0; 1310 u64 nread_max; 1311 u32 nr; 1312 u32 blocksize; 1313 u32 nscan = 0; 1314 1315 if (level != 1 && path->reada != READA_FORWARD_ALWAYS) 1316 return; 1317 1318 if (!path->nodes[level]) 1319 return; 1320 1321 node = path->nodes[level]; 1322 1323 /* 1324 * Since the time between visiting leaves is much shorter than the time 1325 * between visiting nodes, limit read ahead of nodes to 1, to avoid too 1326 * much IO at once (possibly random). 1327 */ 1328 if (path->reada == READA_FORWARD_ALWAYS) { 1329 if (level > 1) 1330 nread_max = node->fs_info->nodesize; 1331 else 1332 nread_max = SZ_128K; 1333 } else { 1334 nread_max = SZ_64K; 1335 } 1336 1337 search = btrfs_node_blockptr(node, slot); 1338 blocksize = fs_info->nodesize; 1339 if (path->reada != READA_FORWARD_ALWAYS) { 1340 struct extent_buffer *eb; 1341 1342 eb = find_extent_buffer(fs_info, search); 1343 if (eb) { 1344 free_extent_buffer(eb); 1345 return; 1346 } 1347 } 1348 1349 target = search; 1350 1351 nritems = btrfs_header_nritems(node); 1352 nr = slot; 1353 1354 while (1) { 1355 if (path->reada == READA_BACK) { 1356 if (nr == 0) 1357 break; 1358 nr--; 1359 } else if (path->reada == READA_FORWARD || 1360 path->reada == READA_FORWARD_ALWAYS) { 1361 nr++; 1362 if (nr >= nritems) 1363 break; 1364 } 1365 if (path->reada == READA_BACK && objectid) { 1366 btrfs_node_key(node, &disk_key, nr); 1367 if (btrfs_disk_key_objectid(&disk_key) != objectid) 1368 break; 1369 } 1370 search = btrfs_node_blockptr(node, nr); 1371 if (path->reada == READA_FORWARD_ALWAYS || 1372 (search <= target && target - search <= 65536) || 1373 (search > target && search - target <= 65536)) { 1374 btrfs_readahead_node_child(node, nr); 1375 nread += blocksize; 1376 } 1377 nscan++; 1378 if (nread > nread_max || nscan > 32) 1379 break; 1380 } 1381 } 1382 1383 static noinline void reada_for_balance(const struct btrfs_path *path, int level) 1384 { 1385 struct extent_buffer *parent; 1386 int slot; 1387 int nritems; 1388 1389 parent = path->nodes[level + 1]; 1390 if (!parent) 1391 return; 1392 1393 nritems = btrfs_header_nritems(parent); 1394 slot = path->slots[level + 1]; 1395 1396 if (slot > 0) 1397 btrfs_readahead_node_child(parent, slot - 1); 1398 if (slot + 1 < nritems) 1399 btrfs_readahead_node_child(parent, slot + 1); 1400 } 1401 1402 1403 /* 1404 * when we walk down the tree, it is usually safe to unlock the higher layers 1405 * in the tree. The exceptions are when our path goes through slot 0, because 1406 * operations on the tree might require changing key pointers higher up in the 1407 * tree. 1408 * 1409 * callers might also have set path->keep_locks, which tells this code to keep 1410 * the lock if the path points to the last slot in the block. This is part of 1411 * walking through the tree, and selecting the next slot in the higher block. 1412 * 1413 * lowest_unlock sets the lowest level in the tree we're allowed to unlock. so 1414 * if lowest_unlock is 1, level 0 won't be unlocked 1415 */ 1416 static noinline void unlock_up(struct btrfs_path *path, int level, 1417 int lowest_unlock, int min_write_lock_level, 1418 int *write_lock_level) 1419 { 1420 int i; 1421 int skip_level = level; 1422 bool check_skip = true; 1423 1424 for (i = level; i < BTRFS_MAX_LEVEL; i++) { 1425 if (!path->nodes[i]) 1426 break; 1427 if (!path->locks[i]) 1428 break; 1429 1430 if (check_skip) { 1431 if (path->slots[i] == 0) { 1432 skip_level = i + 1; 1433 continue; 1434 } 1435 1436 if (path->keep_locks) { 1437 u32 nritems; 1438 1439 nritems = btrfs_header_nritems(path->nodes[i]); 1440 if (nritems < 1 || path->slots[i] >= nritems - 1) { 1441 skip_level = i + 1; 1442 continue; 1443 } 1444 } 1445 } 1446 1447 if (i >= lowest_unlock && i > skip_level) { 1448 btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]); 1449 check_skip = false; 1450 path->locks[i] = 0; 1451 if (write_lock_level && 1452 i > min_write_lock_level && 1453 i <= *write_lock_level) { 1454 *write_lock_level = i - 1; 1455 } 1456 } 1457 } 1458 } 1459 1460 /* 1461 * Helper function for btrfs_search_slot() and other functions that do a search 1462 * on a btree. The goal is to find a tree block in the cache (the radix tree at 1463 * fs_info->buffer_radix), but if we can't find it, or it's not up to date, read 1464 * its pages from disk. 1465 * 1466 * Returns -EAGAIN, with the path unlocked, if the caller needs to repeat the 1467 * whole btree search, starting again from the current root node. 1468 */ 1469 static int 1470 read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, 1471 struct extent_buffer **eb_ret, int slot, 1472 const struct btrfs_key *key) 1473 { 1474 struct btrfs_fs_info *fs_info = root->fs_info; 1475 struct btrfs_tree_parent_check check = { 0 }; 1476 u64 blocknr; 1477 struct extent_buffer *tmp = NULL; 1478 int ret = 0; 1479 int ret2; 1480 int parent_level; 1481 bool read_tmp = false; 1482 bool tmp_locked = false; 1483 bool path_released = false; 1484 1485 blocknr = btrfs_node_blockptr(*eb_ret, slot); 1486 parent_level = btrfs_header_level(*eb_ret); 1487 btrfs_node_key_to_cpu(*eb_ret, &check.first_key, slot); 1488 check.has_first_key = true; 1489 check.level = parent_level - 1; 1490 check.transid = btrfs_node_ptr_generation(*eb_ret, slot); 1491 check.owner_root = btrfs_root_id(root); 1492 1493 /* 1494 * If we need to read an extent buffer from disk and we are holding locks 1495 * on upper level nodes, we unlock all the upper nodes before reading the 1496 * extent buffer, and then return -EAGAIN to the caller as it needs to 1497 * restart the search. We don't release the lock on the current level 1498 * because we need to walk this node to figure out which blocks to read. 1499 */ 1500 tmp = find_extent_buffer(fs_info, blocknr); 1501 if (tmp) { 1502 if (p->reada == READA_FORWARD_ALWAYS) 1503 reada_for_search(fs_info, p, parent_level, slot, key->objectid); 1504 1505 /* first we do an atomic uptodate check */ 1506 if (btrfs_buffer_uptodate(tmp, check.transid, true) > 0) { 1507 /* 1508 * Do extra check for first_key, eb can be stale due to 1509 * being cached, read from scrub, or have multiple 1510 * parents (shared tree blocks). 1511 */ 1512 if (unlikely(btrfs_verify_level_key(tmp, &check))) { 1513 ret = -EUCLEAN; 1514 goto out; 1515 } 1516 *eb_ret = tmp; 1517 tmp = NULL; 1518 ret = 0; 1519 goto out; 1520 } 1521 1522 if (p->nowait) { 1523 ret = -EAGAIN; 1524 goto out; 1525 } 1526 1527 if (!p->skip_locking) { 1528 btrfs_unlock_up_safe(p, parent_level + 1); 1529 btrfs_maybe_reset_lockdep_class(root, tmp); 1530 tmp_locked = true; 1531 btrfs_tree_read_lock(tmp); 1532 btrfs_release_path(p); 1533 ret = -EAGAIN; 1534 path_released = true; 1535 } 1536 1537 /* Now we're allowed to do a blocking uptodate check. */ 1538 ret2 = btrfs_read_extent_buffer(tmp, &check); 1539 if (ret2) { 1540 ret = ret2; 1541 goto out; 1542 } 1543 1544 if (ret == 0) { 1545 ASSERT(!tmp_locked); 1546 *eb_ret = tmp; 1547 tmp = NULL; 1548 } 1549 goto out; 1550 } else if (p->nowait) { 1551 ret = -EAGAIN; 1552 goto out; 1553 } 1554 1555 if (!p->skip_locking) { 1556 btrfs_unlock_up_safe(p, parent_level + 1); 1557 ret = -EAGAIN; 1558 } 1559 1560 if (p->reada != READA_NONE) 1561 reada_for_search(fs_info, p, parent_level, slot, key->objectid); 1562 1563 tmp = btrfs_find_create_tree_block(fs_info, blocknr, check.owner_root, check.level); 1564 if (IS_ERR(tmp)) { 1565 ret = PTR_ERR(tmp); 1566 tmp = NULL; 1567 goto out; 1568 } 1569 read_tmp = true; 1570 1571 if (!p->skip_locking) { 1572 ASSERT(ret == -EAGAIN); 1573 btrfs_maybe_reset_lockdep_class(root, tmp); 1574 tmp_locked = true; 1575 btrfs_tree_read_lock(tmp); 1576 btrfs_release_path(p); 1577 path_released = true; 1578 } 1579 1580 /* Now we're allowed to do a blocking uptodate check. */ 1581 ret2 = btrfs_read_extent_buffer(tmp, &check); 1582 if (ret2) { 1583 ret = ret2; 1584 goto out; 1585 } 1586 1587 /* 1588 * If the read above didn't mark this buffer up to date, 1589 * it will never end up being up to date. Set ret to EIO now 1590 * and give up so that our caller doesn't loop forever 1591 * on our EAGAINs. 1592 */ 1593 if (unlikely(!extent_buffer_uptodate(tmp))) { 1594 ret = -EIO; 1595 goto out; 1596 } 1597 1598 if (ret == 0) { 1599 ASSERT(!tmp_locked); 1600 *eb_ret = tmp; 1601 tmp = NULL; 1602 } 1603 out: 1604 if (tmp) { 1605 if (tmp_locked) 1606 btrfs_tree_read_unlock(tmp); 1607 if (read_tmp && ret && ret != -EAGAIN) 1608 free_extent_buffer_stale(tmp); 1609 else 1610 free_extent_buffer(tmp); 1611 } 1612 if (ret && !path_released) 1613 btrfs_release_path(p); 1614 1615 return ret; 1616 } 1617 1618 /* 1619 * helper function for btrfs_search_slot. This does all of the checks 1620 * for node-level blocks and does any balancing required based on 1621 * the ins_len. 1622 * 1623 * If no extra work was required, zero is returned. If we had to 1624 * drop the path, -EAGAIN is returned and btrfs_search_slot must 1625 * start over 1626 */ 1627 static int 1628 setup_nodes_for_search(struct btrfs_trans_handle *trans, 1629 struct btrfs_root *root, struct btrfs_path *p, 1630 struct extent_buffer *b, int level, int ins_len, 1631 int *write_lock_level) 1632 { 1633 struct btrfs_fs_info *fs_info = root->fs_info; 1634 int ret = 0; 1635 1636 if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >= 1637 BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3) { 1638 1639 if (*write_lock_level < level + 1) { 1640 *write_lock_level = level + 1; 1641 btrfs_release_path(p); 1642 return -EAGAIN; 1643 } 1644 1645 reada_for_balance(p, level); 1646 ret = split_node(trans, root, p, level); 1647 1648 b = p->nodes[level]; 1649 } else if (ins_len < 0 && btrfs_header_nritems(b) < 1650 BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 2) { 1651 1652 if (*write_lock_level < level + 1) { 1653 *write_lock_level = level + 1; 1654 btrfs_release_path(p); 1655 return -EAGAIN; 1656 } 1657 1658 reada_for_balance(p, level); 1659 ret = balance_level(trans, root, p, level); 1660 if (ret) 1661 return ret; 1662 1663 b = p->nodes[level]; 1664 if (!b) { 1665 btrfs_release_path(p); 1666 return -EAGAIN; 1667 } 1668 BUG_ON(btrfs_header_nritems(b) == 1); 1669 } 1670 return ret; 1671 } 1672 1673 int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *path, 1674 u64 iobjectid, u64 ioff, u8 key_type, 1675 struct btrfs_key *found_key) 1676 { 1677 int ret; 1678 struct btrfs_key key; 1679 struct extent_buffer *eb; 1680 1681 ASSERT(path); 1682 ASSERT(found_key); 1683 1684 key.type = key_type; 1685 key.objectid = iobjectid; 1686 key.offset = ioff; 1687 1688 ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0); 1689 if (ret < 0) 1690 return ret; 1691 1692 eb = path->nodes[0]; 1693 if (ret && path->slots[0] >= btrfs_header_nritems(eb)) { 1694 ret = btrfs_next_leaf(fs_root, path); 1695 if (ret) 1696 return ret; 1697 eb = path->nodes[0]; 1698 } 1699 1700 btrfs_item_key_to_cpu(eb, found_key, path->slots[0]); 1701 if (found_key->type != key.type || 1702 found_key->objectid != key.objectid) 1703 return 1; 1704 1705 return 0; 1706 } 1707 1708 static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root, 1709 struct btrfs_path *p, 1710 int write_lock_level) 1711 { 1712 struct extent_buffer *b; 1713 int root_lock = 0; 1714 int level = 0; 1715 1716 if (p->search_commit_root) { 1717 b = root->commit_root; 1718 refcount_inc(&b->refs); 1719 level = btrfs_header_level(b); 1720 /* 1721 * Ensure that all callers have set skip_locking when 1722 * p->search_commit_root is true. 1723 */ 1724 ASSERT(p->skip_locking); 1725 1726 goto out; 1727 } 1728 1729 if (p->skip_locking) { 1730 b = btrfs_root_node(root); 1731 level = btrfs_header_level(b); 1732 goto out; 1733 } 1734 1735 /* We try very hard to do read locks on the root */ 1736 root_lock = BTRFS_READ_LOCK; 1737 1738 /* 1739 * If the level is set to maximum, we can skip trying to get the read 1740 * lock. 1741 */ 1742 if (write_lock_level < BTRFS_MAX_LEVEL) { 1743 /* 1744 * We don't know the level of the root node until we actually 1745 * have it read locked 1746 */ 1747 if (p->nowait) { 1748 b = btrfs_try_read_lock_root_node(root); 1749 if (IS_ERR(b)) 1750 return b; 1751 } else { 1752 b = btrfs_read_lock_root_node(root); 1753 } 1754 level = btrfs_header_level(b); 1755 if (level > write_lock_level) 1756 goto out; 1757 1758 /* Whoops, must trade for write lock */ 1759 btrfs_tree_read_unlock(b); 1760 free_extent_buffer(b); 1761 } 1762 1763 b = btrfs_lock_root_node(root); 1764 root_lock = BTRFS_WRITE_LOCK; 1765 1766 /* The level might have changed, check again */ 1767 level = btrfs_header_level(b); 1768 1769 out: 1770 /* 1771 * The root may have failed to write out at some point, and thus is no 1772 * longer valid, return an error in this case. 1773 */ 1774 if (unlikely(!extent_buffer_uptodate(b))) { 1775 if (root_lock) 1776 btrfs_tree_unlock_rw(b, root_lock); 1777 free_extent_buffer(b); 1778 return ERR_PTR(-EIO); 1779 } 1780 1781 p->nodes[level] = b; 1782 if (!p->skip_locking) 1783 p->locks[level] = root_lock; 1784 /* 1785 * Callers are responsible for dropping b's references. 1786 */ 1787 return b; 1788 } 1789 1790 /* 1791 * Replace the extent buffer at the lowest level of the path with a cloned 1792 * version. The purpose is to be able to use it safely, after releasing the 1793 * commit root semaphore, even if relocation is happening in parallel, the 1794 * transaction used for relocation is committed and the extent buffer is 1795 * reallocated in the next transaction. 1796 * 1797 * This is used in a context where the caller does not prevent transaction 1798 * commits from happening, either by holding a transaction handle or holding 1799 * some lock, while it's doing searches through a commit root. 1800 * At the moment it's only used for send operations. 1801 */ 1802 static int finish_need_commit_sem_search(struct btrfs_path *path) 1803 { 1804 const int i = path->lowest_level; 1805 const int slot = path->slots[i]; 1806 struct extent_buffer *lowest = path->nodes[i]; 1807 struct extent_buffer *clone; 1808 1809 ASSERT(path->need_commit_sem); 1810 1811 if (!lowest) 1812 return 0; 1813 1814 lockdep_assert_held_read(&lowest->fs_info->commit_root_sem); 1815 1816 clone = btrfs_clone_extent_buffer(lowest); 1817 if (!clone) 1818 return -ENOMEM; 1819 1820 btrfs_release_path(path); 1821 path->nodes[i] = clone; 1822 path->slots[i] = slot; 1823 1824 return 0; 1825 } 1826 1827 static inline int search_for_key_slot(const struct extent_buffer *eb, 1828 int search_low_slot, 1829 const struct btrfs_key *key, 1830 int prev_cmp, 1831 int *slot) 1832 { 1833 /* 1834 * If a previous call to btrfs_bin_search() on a parent node returned an 1835 * exact match (prev_cmp == 0), we can safely assume the target key will 1836 * always be at slot 0 on lower levels, since each key pointer 1837 * (struct btrfs_key_ptr) refers to the lowest key accessible from the 1838 * subtree it points to. Thus we can skip searching lower levels. 1839 */ 1840 if (prev_cmp == 0) { 1841 *slot = 0; 1842 return 0; 1843 } 1844 1845 return btrfs_bin_search(eb, search_low_slot, key, slot); 1846 } 1847 1848 static int search_leaf(struct btrfs_trans_handle *trans, 1849 struct btrfs_root *root, 1850 const struct btrfs_key *key, 1851 struct btrfs_path *path, 1852 int ins_len, 1853 int prev_cmp) 1854 { 1855 struct extent_buffer *leaf = path->nodes[0]; 1856 int leaf_free_space = -1; 1857 int search_low_slot = 0; 1858 int ret; 1859 bool do_bin_search = true; 1860 1861 /* 1862 * If we are doing an insertion, the leaf has enough free space and the 1863 * destination slot for the key is not slot 0, then we can unlock our 1864 * write lock on the parent, and any other upper nodes, before doing the 1865 * binary search on the leaf (with search_for_key_slot()), allowing other 1866 * tasks to lock the parent and any other upper nodes. 1867 */ 1868 if (ins_len > 0) { 1869 /* 1870 * Cache the leaf free space, since we will need it later and it 1871 * will not change until then. 1872 */ 1873 leaf_free_space = btrfs_leaf_free_space(leaf); 1874 1875 /* 1876 * !path->locks[1] means we have a single node tree, the leaf is 1877 * the root of the tree. 1878 */ 1879 if (path->locks[1] && leaf_free_space >= ins_len) { 1880 struct btrfs_disk_key first_key; 1881 1882 ASSERT(btrfs_header_nritems(leaf) > 0); 1883 btrfs_item_key(leaf, &first_key, 0); 1884 1885 /* 1886 * Doing the extra comparison with the first key is cheap, 1887 * taking into account that the first key is very likely 1888 * already in a cache line because it immediately follows 1889 * the extent buffer's header and we have recently accessed 1890 * the header's level field. 1891 */ 1892 ret = btrfs_comp_keys(&first_key, key); 1893 if (ret < 0) { 1894 /* 1895 * The first key is smaller than the key we want 1896 * to insert, so we are safe to unlock all upper 1897 * nodes and we have to do the binary search. 1898 * 1899 * We do use btrfs_unlock_up_safe() and not 1900 * unlock_up() because the later does not unlock 1901 * nodes with a slot of 0 - we can safely unlock 1902 * any node even if its slot is 0 since in this 1903 * case the key does not end up at slot 0 of the 1904 * leaf and there's no need to split the leaf. 1905 */ 1906 btrfs_unlock_up_safe(path, 1); 1907 search_low_slot = 1; 1908 } else { 1909 /* 1910 * The first key is >= then the key we want to 1911 * insert, so we can skip the binary search as 1912 * the target key will be at slot 0. 1913 * 1914 * We can not unlock upper nodes when the key is 1915 * less than the first key, because we will need 1916 * to update the key at slot 0 of the parent node 1917 * and possibly of other upper nodes too. 1918 * If the key matches the first key, then we can 1919 * unlock all the upper nodes, using 1920 * btrfs_unlock_up_safe() instead of unlock_up() 1921 * as stated above. 1922 */ 1923 if (ret == 0) 1924 btrfs_unlock_up_safe(path, 1); 1925 /* 1926 * ret is already 0 or 1, matching the result of 1927 * a btrfs_bin_search() call, so there is no need 1928 * to adjust it. 1929 */ 1930 do_bin_search = false; 1931 path->slots[0] = 0; 1932 } 1933 } 1934 } 1935 1936 if (do_bin_search) { 1937 ret = search_for_key_slot(leaf, search_low_slot, key, 1938 prev_cmp, &path->slots[0]); 1939 if (ret < 0) 1940 return ret; 1941 } 1942 1943 if (ins_len > 0) { 1944 /* 1945 * Item key already exists. In this case, if we are allowed to 1946 * insert the item (for example, in dir_item case, item key 1947 * collision is allowed), it will be merged with the original 1948 * item. Only the item size grows, no new btrfs item will be 1949 * added. If search_for_extension is not set, ins_len already 1950 * accounts the size btrfs_item, deduct it here so leaf space 1951 * check will be correct. 1952 */ 1953 if (ret == 0 && !path->search_for_extension) { 1954 ASSERT(ins_len >= sizeof(struct btrfs_item)); 1955 ins_len -= sizeof(struct btrfs_item); 1956 } 1957 1958 ASSERT(leaf_free_space >= 0); 1959 1960 if (leaf_free_space < ins_len) { 1961 int ret2; 1962 1963 ret2 = split_leaf(trans, root, key, path, ins_len, (ret == 0)); 1964 ASSERT(ret2 <= 0); 1965 if (WARN_ON(ret2 > 0)) 1966 ret2 = -EUCLEAN; 1967 if (ret2) 1968 ret = ret2; 1969 } 1970 } 1971 1972 return ret; 1973 } 1974 1975 /* 1976 * Look for a key in a tree and perform necessary modifications to preserve 1977 * tree invariants. 1978 * 1979 * @trans: Handle of transaction, used when modifying the tree 1980 * @p: Holds all btree nodes along the search path 1981 * @root: The root node of the tree 1982 * @key: The key we are looking for 1983 * @ins_len: Indicates purpose of search: 1984 * >0 for inserts it's size of item inserted (*) 1985 * <0 for deletions 1986 * 0 for plain searches, not modifying the tree 1987 * 1988 * (*) If size of item inserted doesn't include 1989 * sizeof(struct btrfs_item), then p->search_for_extension must 1990 * be set. 1991 * @cow: boolean should CoW operations be performed. Must always be 1 1992 * when modifying the tree. 1993 * 1994 * If @ins_len > 0, nodes and leaves will be split as we walk down the tree. 1995 * If @ins_len < 0, nodes will be merged as we walk down the tree (if possible) 1996 * 1997 * If @key is found, 0 is returned and you can find the item in the leaf level 1998 * of the path (level 0) 1999 * 2000 * If @key isn't found, 1 is returned and the leaf level of the path (level 0) 2001 * points to the slot where it should be inserted 2002 * 2003 * If an error is encountered while searching the tree a negative error number 2004 * is returned 2005 */ 2006 int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root, 2007 const struct btrfs_key *key, struct btrfs_path *p, 2008 int ins_len, int cow) 2009 { 2010 struct btrfs_fs_info *fs_info; 2011 struct extent_buffer *b; 2012 int slot; 2013 int ret; 2014 int level; 2015 int lowest_unlock = 1; 2016 /* everything at write_lock_level or lower must be write locked */ 2017 int write_lock_level = 0; 2018 u8 lowest_level = 0; 2019 int min_write_lock_level; 2020 int prev_cmp; 2021 2022 if (!root) 2023 return -EINVAL; 2024 2025 fs_info = root->fs_info; 2026 might_sleep(); 2027 2028 lowest_level = p->lowest_level; 2029 WARN_ON(lowest_level && ins_len > 0); 2030 WARN_ON(p->nodes[0] != NULL); 2031 BUG_ON(!cow && ins_len); 2032 2033 /* 2034 * For now only allow nowait for read only operations. There's no 2035 * strict reason why we can't, we just only need it for reads so it's 2036 * only implemented for reads. 2037 */ 2038 ASSERT(!p->nowait || !cow); 2039 2040 if (ins_len < 0) { 2041 lowest_unlock = 2; 2042 2043 /* when we are removing items, we might have to go up to level 2044 * two as we update tree pointers Make sure we keep write 2045 * for those levels as well 2046 */ 2047 write_lock_level = 2; 2048 } else if (ins_len > 0) { 2049 /* 2050 * for inserting items, make sure we have a write lock on 2051 * level 1 so we can update keys 2052 */ 2053 write_lock_level = 1; 2054 } 2055 2056 if (!cow) 2057 write_lock_level = -1; 2058 2059 if (cow && (p->keep_locks || p->lowest_level)) 2060 write_lock_level = BTRFS_MAX_LEVEL; 2061 2062 min_write_lock_level = write_lock_level; 2063 2064 if (p->need_commit_sem) { 2065 ASSERT(p->search_commit_root); 2066 if (p->nowait) { 2067 if (!down_read_trylock(&fs_info->commit_root_sem)) 2068 return -EAGAIN; 2069 } else { 2070 down_read(&fs_info->commit_root_sem); 2071 } 2072 } 2073 2074 again: 2075 prev_cmp = -1; 2076 b = btrfs_search_slot_get_root(root, p, write_lock_level); 2077 if (IS_ERR(b)) { 2078 ret = PTR_ERR(b); 2079 goto done; 2080 } 2081 2082 while (b) { 2083 int dec = 0; 2084 int ret2; 2085 2086 level = btrfs_header_level(b); 2087 2088 if (cow) { 2089 bool last_level = (level == (BTRFS_MAX_LEVEL - 1)); 2090 2091 /* 2092 * if we don't really need to cow this block 2093 * then we don't want to set the path blocking, 2094 * so we test it here 2095 */ 2096 if (!should_cow_block(trans, root, b)) 2097 goto cow_done; 2098 2099 /* 2100 * must have write locks on this node and the 2101 * parent 2102 */ 2103 if (level > write_lock_level || 2104 (level + 1 > write_lock_level && 2105 level + 1 < BTRFS_MAX_LEVEL && 2106 p->nodes[level + 1])) { 2107 write_lock_level = level + 1; 2108 btrfs_release_path(p); 2109 goto again; 2110 } 2111 2112 if (last_level) 2113 ret2 = btrfs_cow_block(trans, root, b, NULL, 0, 2114 &b, BTRFS_NESTING_COW); 2115 else 2116 ret2 = btrfs_cow_block(trans, root, b, 2117 p->nodes[level + 1], 2118 p->slots[level + 1], &b, 2119 BTRFS_NESTING_COW); 2120 if (ret2) { 2121 ret = ret2; 2122 goto done; 2123 } 2124 } 2125 cow_done: 2126 p->nodes[level] = b; 2127 2128 /* 2129 * we have a lock on b and as long as we aren't changing 2130 * the tree, there is no way to for the items in b to change. 2131 * It is safe to drop the lock on our parent before we 2132 * go through the expensive btree search on b. 2133 * 2134 * If we're inserting or deleting (ins_len != 0), then we might 2135 * be changing slot zero, which may require changing the parent. 2136 * So, we can't drop the lock until after we know which slot 2137 * we're operating on. 2138 */ 2139 if (!ins_len && !p->keep_locks) { 2140 int u = level + 1; 2141 2142 if (u < BTRFS_MAX_LEVEL && p->locks[u]) { 2143 btrfs_tree_unlock_rw(p->nodes[u], p->locks[u]); 2144 p->locks[u] = 0; 2145 } 2146 } 2147 2148 if (level == 0) { 2149 if (ins_len > 0) 2150 ASSERT(write_lock_level >= 1); 2151 2152 ret = search_leaf(trans, root, key, p, ins_len, prev_cmp); 2153 if (!p->search_for_split) 2154 unlock_up(p, level, lowest_unlock, 2155 min_write_lock_level, NULL); 2156 goto done; 2157 } 2158 2159 ret = search_for_key_slot(b, 0, key, prev_cmp, &slot); 2160 if (ret < 0) 2161 goto done; 2162 prev_cmp = ret; 2163 2164 if (ret && slot > 0) { 2165 dec = 1; 2166 slot--; 2167 } 2168 p->slots[level] = slot; 2169 ret2 = setup_nodes_for_search(trans, root, p, b, level, ins_len, 2170 &write_lock_level); 2171 if (ret2 == -EAGAIN) 2172 goto again; 2173 if (ret2) { 2174 ret = ret2; 2175 goto done; 2176 } 2177 b = p->nodes[level]; 2178 slot = p->slots[level]; 2179 2180 /* 2181 * Slot 0 is special, if we change the key we have to update 2182 * the parent pointer which means we must have a write lock on 2183 * the parent 2184 */ 2185 if (slot == 0 && ins_len && write_lock_level < level + 1) { 2186 write_lock_level = level + 1; 2187 btrfs_release_path(p); 2188 goto again; 2189 } 2190 2191 unlock_up(p, level, lowest_unlock, min_write_lock_level, 2192 &write_lock_level); 2193 2194 if (level == lowest_level) { 2195 if (dec) 2196 p->slots[level]++; 2197 goto done; 2198 } 2199 2200 ret2 = read_block_for_search(root, p, &b, slot, key); 2201 if (ret2 == -EAGAIN && !p->nowait) 2202 goto again; 2203 if (ret2) { 2204 ret = ret2; 2205 goto done; 2206 } 2207 2208 if (!p->skip_locking) { 2209 level = btrfs_header_level(b); 2210 2211 btrfs_maybe_reset_lockdep_class(root, b); 2212 2213 if (level <= write_lock_level) { 2214 btrfs_tree_lock(b); 2215 p->locks[level] = BTRFS_WRITE_LOCK; 2216 } else { 2217 if (p->nowait) { 2218 if (!btrfs_try_tree_read_lock(b)) { 2219 free_extent_buffer(b); 2220 ret = -EAGAIN; 2221 goto done; 2222 } 2223 } else { 2224 btrfs_tree_read_lock(b); 2225 } 2226 p->locks[level] = BTRFS_READ_LOCK; 2227 } 2228 p->nodes[level] = b; 2229 } 2230 } 2231 ret = 1; 2232 done: 2233 if (ret < 0 && !p->skip_release_on_error) 2234 btrfs_release_path(p); 2235 2236 if (p->need_commit_sem) { 2237 int ret2; 2238 2239 ret2 = finish_need_commit_sem_search(p); 2240 up_read(&fs_info->commit_root_sem); 2241 if (ret2) 2242 ret = ret2; 2243 } 2244 2245 return ret; 2246 } 2247 ALLOW_ERROR_INJECTION(btrfs_search_slot, ERRNO); 2248 2249 /* 2250 * Like btrfs_search_slot, this looks for a key in the given tree. It uses the 2251 * current state of the tree together with the operations recorded in the tree 2252 * modification log to search for the key in a previous version of this tree, as 2253 * denoted by the time_seq parameter. 2254 * 2255 * Naturally, there is no support for insert, delete or cow operations. 2256 * 2257 * The resulting path and return value will be set up as if we called 2258 * btrfs_search_slot at that point in time with ins_len and cow both set to 0. 2259 */ 2260 int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key, 2261 struct btrfs_path *p, u64 time_seq) 2262 { 2263 struct btrfs_fs_info *fs_info = root->fs_info; 2264 struct extent_buffer *b; 2265 int slot; 2266 int ret; 2267 int level; 2268 int lowest_unlock = 1; 2269 u8 lowest_level = 0; 2270 2271 lowest_level = p->lowest_level; 2272 WARN_ON(p->nodes[0] != NULL); 2273 ASSERT(!p->nowait); 2274 2275 if (p->search_commit_root) { 2276 BUG_ON(time_seq); 2277 return btrfs_search_slot(NULL, root, key, p, 0, 0); 2278 } 2279 2280 again: 2281 b = btrfs_get_old_root(root, time_seq); 2282 if (unlikely(!b)) { 2283 ret = -EIO; 2284 goto done; 2285 } 2286 level = btrfs_header_level(b); 2287 p->locks[level] = BTRFS_READ_LOCK; 2288 2289 while (b) { 2290 int dec = 0; 2291 int ret2; 2292 2293 level = btrfs_header_level(b); 2294 p->nodes[level] = b; 2295 2296 /* 2297 * we have a lock on b and as long as we aren't changing 2298 * the tree, there is no way to for the items in b to change. 2299 * It is safe to drop the lock on our parent before we 2300 * go through the expensive btree search on b. 2301 */ 2302 btrfs_unlock_up_safe(p, level + 1); 2303 2304 ret = btrfs_bin_search(b, 0, key, &slot); 2305 if (ret < 0) 2306 goto done; 2307 2308 if (level == 0) { 2309 p->slots[level] = slot; 2310 unlock_up(p, level, lowest_unlock, 0, NULL); 2311 goto done; 2312 } 2313 2314 if (ret && slot > 0) { 2315 dec = 1; 2316 slot--; 2317 } 2318 p->slots[level] = slot; 2319 unlock_up(p, level, lowest_unlock, 0, NULL); 2320 2321 if (level == lowest_level) { 2322 if (dec) 2323 p->slots[level]++; 2324 goto done; 2325 } 2326 2327 ret2 = read_block_for_search(root, p, &b, slot, key); 2328 if (ret2 == -EAGAIN && !p->nowait) 2329 goto again; 2330 if (ret2) { 2331 ret = ret2; 2332 goto done; 2333 } 2334 2335 level = btrfs_header_level(b); 2336 btrfs_tree_read_lock(b); 2337 b = btrfs_tree_mod_log_rewind(fs_info, b, time_seq); 2338 if (!b) { 2339 ret = -ENOMEM; 2340 goto done; 2341 } 2342 p->locks[level] = BTRFS_READ_LOCK; 2343 p->nodes[level] = b; 2344 } 2345 ret = 1; 2346 done: 2347 if (ret < 0) 2348 btrfs_release_path(p); 2349 2350 return ret; 2351 } 2352 2353 /* 2354 * Search the tree again to find a leaf with smaller keys. 2355 * Returns 0 if it found something. 2356 * Returns 1 if there are no smaller keys. 2357 * Returns < 0 on error. 2358 * 2359 * This may release the path, and so you may lose any locks held at the 2360 * time you call it. 2361 */ 2362 static int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path) 2363 { 2364 struct btrfs_key key; 2365 struct btrfs_key orig_key; 2366 struct btrfs_disk_key found_key; 2367 int ret; 2368 2369 btrfs_item_key_to_cpu(path->nodes[0], &key, 0); 2370 orig_key = key; 2371 2372 if (key.offset > 0) { 2373 key.offset--; 2374 } else if (key.type > 0) { 2375 key.type--; 2376 key.offset = (u64)-1; 2377 } else if (key.objectid > 0) { 2378 key.objectid--; 2379 key.type = (u8)-1; 2380 key.offset = (u64)-1; 2381 } else { 2382 return 1; 2383 } 2384 2385 btrfs_release_path(path); 2386 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 2387 if (ret <= 0) 2388 return ret; 2389 2390 /* 2391 * Previous key not found. Even if we were at slot 0 of the leaf we had 2392 * before releasing the path and calling btrfs_search_slot(), we now may 2393 * be in a slot pointing to the same original key - this can happen if 2394 * after we released the path, one of more items were moved from a 2395 * sibling leaf into the front of the leaf we had due to an insertion 2396 * (see push_leaf_right()). 2397 * If we hit this case and our slot is > 0 and just decrement the slot 2398 * so that the caller does not process the same key again, which may or 2399 * may not break the caller, depending on its logic. 2400 */ 2401 if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) { 2402 btrfs_item_key(path->nodes[0], &found_key, path->slots[0]); 2403 ret = btrfs_comp_keys(&found_key, &orig_key); 2404 if (ret == 0) { 2405 if (path->slots[0] > 0) { 2406 path->slots[0]--; 2407 return 0; 2408 } 2409 /* 2410 * At slot 0, same key as before, it means orig_key is 2411 * the lowest, leftmost, key in the tree. We're done. 2412 */ 2413 return 1; 2414 } 2415 } 2416 2417 btrfs_item_key(path->nodes[0], &found_key, 0); 2418 ret = btrfs_comp_keys(&found_key, &key); 2419 /* 2420 * We might have had an item with the previous key in the tree right 2421 * before we released our path. And after we released our path, that 2422 * item might have been pushed to the first slot (0) of the leaf we 2423 * were holding due to a tree balance. Alternatively, an item with the 2424 * previous key can exist as the only element of a leaf (big fat item). 2425 * Therefore account for these 2 cases, so that our callers (like 2426 * btrfs_previous_item) don't miss an existing item with a key matching 2427 * the previous key we computed above. 2428 */ 2429 if (ret <= 0) 2430 return 0; 2431 return 1; 2432 } 2433 2434 /* 2435 * helper to use instead of search slot if no exact match is needed but 2436 * instead the next or previous item should be returned. 2437 * When find_higher is true, the next higher item is returned, the next lower 2438 * otherwise. 2439 * When return_any and find_higher are both true, and no higher item is found, 2440 * return the next lower instead. 2441 * When return_any is true and find_higher is false, and no lower item is found, 2442 * return the next higher instead. 2443 * It returns 0 if any item is found, 1 if none is found (tree empty), and 2444 * < 0 on error 2445 */ 2446 int btrfs_search_slot_for_read(struct btrfs_root *root, 2447 const struct btrfs_key *key, 2448 struct btrfs_path *p, int find_higher, 2449 int return_any) 2450 { 2451 int ret; 2452 struct extent_buffer *leaf; 2453 2454 again: 2455 ret = btrfs_search_slot(NULL, root, key, p, 0, 0); 2456 if (ret <= 0) 2457 return ret; 2458 /* 2459 * a return value of 1 means the path is at the position where the 2460 * item should be inserted. Normally this is the next bigger item, 2461 * but in case the previous item is the last in a leaf, path points 2462 * to the first free slot in the previous leaf, i.e. at an invalid 2463 * item. 2464 */ 2465 leaf = p->nodes[0]; 2466 2467 if (find_higher) { 2468 if (p->slots[0] >= btrfs_header_nritems(leaf)) { 2469 ret = btrfs_next_leaf(root, p); 2470 if (ret <= 0) 2471 return ret; 2472 if (!return_any) 2473 return 1; 2474 /* 2475 * no higher item found, return the next 2476 * lower instead 2477 */ 2478 return_any = 0; 2479 find_higher = 0; 2480 btrfs_release_path(p); 2481 goto again; 2482 } 2483 } else { 2484 if (p->slots[0] == 0) { 2485 ret = btrfs_prev_leaf(root, p); 2486 if (ret < 0) 2487 return ret; 2488 if (!ret) { 2489 leaf = p->nodes[0]; 2490 if (p->slots[0] == btrfs_header_nritems(leaf)) 2491 p->slots[0]--; 2492 return 0; 2493 } 2494 if (!return_any) 2495 return 1; 2496 /* 2497 * no lower item found, return the next 2498 * higher instead 2499 */ 2500 return_any = 0; 2501 find_higher = 1; 2502 btrfs_release_path(p); 2503 goto again; 2504 } else { 2505 --p->slots[0]; 2506 } 2507 } 2508 return 0; 2509 } 2510 2511 /* 2512 * Execute search and call btrfs_previous_item to traverse backwards if the item 2513 * was not found. 2514 * 2515 * Return 0 if found, 1 if not found and < 0 if error. 2516 */ 2517 int btrfs_search_backwards(struct btrfs_root *root, struct btrfs_key *key, 2518 struct btrfs_path *path) 2519 { 2520 int ret; 2521 2522 ret = btrfs_search_slot(NULL, root, key, path, 0, 0); 2523 if (ret > 0) 2524 ret = btrfs_previous_item(root, path, key->objectid, key->type); 2525 2526 if (ret == 0) 2527 btrfs_item_key_to_cpu(path->nodes[0], key, path->slots[0]); 2528 2529 return ret; 2530 } 2531 2532 /* 2533 * Search for a valid slot for the given path. 2534 * 2535 * @root: The root node of the tree. 2536 * @key: Will contain a valid item if found. 2537 * @path: The starting point to validate the slot. 2538 * 2539 * Return: 0 if the item is valid 2540 * 1 if not found 2541 * <0 if error. 2542 */ 2543 int btrfs_get_next_valid_item(struct btrfs_root *root, struct btrfs_key *key, 2544 struct btrfs_path *path) 2545 { 2546 if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { 2547 int ret; 2548 2549 ret = btrfs_next_leaf(root, path); 2550 if (ret) 2551 return ret; 2552 } 2553 2554 btrfs_item_key_to_cpu(path->nodes[0], key, path->slots[0]); 2555 return 0; 2556 } 2557 2558 /* 2559 * adjust the pointers going up the tree, starting at level 2560 * making sure the right key of each node is points to 'key'. 2561 * This is used after shifting pointers to the left, so it stops 2562 * fixing up pointers when a given leaf/node is not in slot 0 of the 2563 * higher levels 2564 * 2565 */ 2566 static void fixup_low_keys(struct btrfs_trans_handle *trans, 2567 const struct btrfs_path *path, 2568 const struct btrfs_disk_key *key, int level) 2569 { 2570 int i; 2571 struct extent_buffer *t; 2572 int ret; 2573 2574 for (i = level; i < BTRFS_MAX_LEVEL; i++) { 2575 int tslot = path->slots[i]; 2576 2577 if (!path->nodes[i]) 2578 break; 2579 t = path->nodes[i]; 2580 ret = btrfs_tree_mod_log_insert_key(t, tslot, 2581 BTRFS_MOD_LOG_KEY_REPLACE); 2582 BUG_ON(ret < 0); 2583 btrfs_set_node_key(t, key, tslot); 2584 btrfs_mark_buffer_dirty(trans, path->nodes[i]); 2585 if (tslot != 0) 2586 break; 2587 } 2588 } 2589 2590 /* 2591 * update item key. 2592 * 2593 * This function isn't completely safe. It's the caller's responsibility 2594 * that the new key won't break the order 2595 */ 2596 void btrfs_set_item_key_safe(struct btrfs_trans_handle *trans, 2597 const struct btrfs_path *path, 2598 const struct btrfs_key *new_key) 2599 { 2600 struct btrfs_fs_info *fs_info = trans->fs_info; 2601 struct btrfs_disk_key disk_key; 2602 struct extent_buffer *eb; 2603 int slot; 2604 2605 eb = path->nodes[0]; 2606 slot = path->slots[0]; 2607 if (slot > 0) { 2608 btrfs_item_key(eb, &disk_key, slot - 1); 2609 if (unlikely(btrfs_comp_keys(&disk_key, new_key) >= 0)) { 2610 btrfs_print_leaf(eb); 2611 btrfs_crit(fs_info, 2612 "slot %u key " BTRFS_KEY_FMT " new key " BTRFS_KEY_FMT, 2613 slot, btrfs_disk_key_objectid(&disk_key), 2614 btrfs_disk_key_type(&disk_key), 2615 btrfs_disk_key_offset(&disk_key), 2616 BTRFS_KEY_FMT_VALUE(new_key)); 2617 BUG(); 2618 } 2619 } 2620 if (slot < btrfs_header_nritems(eb) - 1) { 2621 btrfs_item_key(eb, &disk_key, slot + 1); 2622 if (unlikely(btrfs_comp_keys(&disk_key, new_key) <= 0)) { 2623 btrfs_print_leaf(eb); 2624 btrfs_crit(fs_info, 2625 "slot %u key " BTRFS_KEY_FMT " new key " BTRFS_KEY_FMT, 2626 slot, btrfs_disk_key_objectid(&disk_key), 2627 btrfs_disk_key_type(&disk_key), 2628 btrfs_disk_key_offset(&disk_key), 2629 BTRFS_KEY_FMT_VALUE(new_key)); 2630 BUG(); 2631 } 2632 } 2633 2634 btrfs_cpu_key_to_disk(&disk_key, new_key); 2635 btrfs_set_item_key(eb, &disk_key, slot); 2636 btrfs_mark_buffer_dirty(trans, eb); 2637 if (slot == 0) 2638 fixup_low_keys(trans, path, &disk_key, 1); 2639 } 2640 2641 /* 2642 * Check key order of two sibling extent buffers. 2643 * 2644 * Return true if something is wrong. 2645 * Return false if everything is fine. 2646 * 2647 * Tree-checker only works inside one tree block, thus the following 2648 * corruption can not be detected by tree-checker: 2649 * 2650 * Leaf @left | Leaf @right 2651 * -------------------------------------------------------------- 2652 * | 1 | 2 | 3 | 4 | 5 | f6 | | 7 | 8 | 2653 * 2654 * Key f6 in leaf @left itself is valid, but not valid when the next 2655 * key in leaf @right is 7. 2656 * This can only be checked at tree block merge time. 2657 * And since tree checker has ensured all key order in each tree block 2658 * is correct, we only need to bother the last key of @left and the first 2659 * key of @right. 2660 */ 2661 static bool check_sibling_keys(const struct extent_buffer *left, 2662 const struct extent_buffer *right) 2663 { 2664 struct btrfs_key left_last; 2665 struct btrfs_key right_first; 2666 int level = btrfs_header_level(left); 2667 int nr_left = btrfs_header_nritems(left); 2668 int nr_right = btrfs_header_nritems(right); 2669 2670 /* No key to check in one of the tree blocks */ 2671 if (!nr_left || !nr_right) 2672 return false; 2673 2674 if (level) { 2675 btrfs_node_key_to_cpu(left, &left_last, nr_left - 1); 2676 btrfs_node_key_to_cpu(right, &right_first, 0); 2677 } else { 2678 btrfs_item_key_to_cpu(left, &left_last, nr_left - 1); 2679 btrfs_item_key_to_cpu(right, &right_first, 0); 2680 } 2681 2682 if (unlikely(btrfs_comp_cpu_keys(&left_last, &right_first) >= 0)) { 2683 btrfs_crit(left->fs_info, "left extent buffer:"); 2684 btrfs_print_tree(left, false); 2685 btrfs_crit(left->fs_info, "right extent buffer:"); 2686 btrfs_print_tree(right, false); 2687 btrfs_crit(left->fs_info, 2688 "bad key order, sibling blocks, left last " BTRFS_KEY_FMT " right first " BTRFS_KEY_FMT, 2689 BTRFS_KEY_FMT_VALUE(&left_last), 2690 BTRFS_KEY_FMT_VALUE(&right_first)); 2691 return true; 2692 } 2693 return false; 2694 } 2695 2696 /* 2697 * try to push data from one node into the next node left in the 2698 * tree. 2699 * 2700 * returns 0 if some ptrs were pushed left, < 0 if there was some horrible 2701 * error, and > 0 if there was no room in the left hand block. 2702 */ 2703 static int push_node_left(struct btrfs_trans_handle *trans, 2704 struct extent_buffer *dst, 2705 struct extent_buffer *src, bool empty) 2706 { 2707 struct btrfs_fs_info *fs_info = trans->fs_info; 2708 int push_items = 0; 2709 int src_nritems; 2710 int dst_nritems; 2711 int ret = 0; 2712 2713 src_nritems = btrfs_header_nritems(src); 2714 dst_nritems = btrfs_header_nritems(dst); 2715 push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems; 2716 WARN_ON(btrfs_header_generation(src) != trans->transid); 2717 WARN_ON(btrfs_header_generation(dst) != trans->transid); 2718 2719 if (!empty && src_nritems <= 8) 2720 return 1; 2721 2722 if (push_items <= 0) 2723 return 1; 2724 2725 if (empty) { 2726 push_items = min(src_nritems, push_items); 2727 if (push_items < src_nritems) { 2728 /* leave at least 8 pointers in the node if 2729 * we aren't going to empty it 2730 */ 2731 if (src_nritems - push_items < 8) { 2732 if (push_items <= 8) 2733 return 1; 2734 push_items -= 8; 2735 } 2736 } 2737 } else 2738 push_items = min(src_nritems - 8, push_items); 2739 2740 /* dst is the left eb, src is the middle eb */ 2741 if (unlikely(check_sibling_keys(dst, src))) { 2742 ret = -EUCLEAN; 2743 btrfs_abort_transaction(trans, ret); 2744 return ret; 2745 } 2746 ret = btrfs_tree_mod_log_eb_copy(dst, src, dst_nritems, 0, push_items); 2747 if (unlikely(ret)) { 2748 btrfs_abort_transaction(trans, ret); 2749 return ret; 2750 } 2751 copy_extent_buffer(dst, src, 2752 btrfs_node_key_ptr_offset(dst, dst_nritems), 2753 btrfs_node_key_ptr_offset(src, 0), 2754 push_items * sizeof(struct btrfs_key_ptr)); 2755 2756 if (push_items < src_nritems) { 2757 /* 2758 * btrfs_tree_mod_log_eb_copy handles logging the move, so we 2759 * don't need to do an explicit tree mod log operation for it. 2760 */ 2761 memmove_extent_buffer(src, btrfs_node_key_ptr_offset(src, 0), 2762 btrfs_node_key_ptr_offset(src, push_items), 2763 (src_nritems - push_items) * 2764 sizeof(struct btrfs_key_ptr)); 2765 } 2766 btrfs_set_header_nritems(src, src_nritems - push_items); 2767 btrfs_set_header_nritems(dst, dst_nritems + push_items); 2768 btrfs_mark_buffer_dirty(trans, src); 2769 btrfs_mark_buffer_dirty(trans, dst); 2770 2771 return ret; 2772 } 2773 2774 /* 2775 * try to push data from one node into the next node right in the 2776 * tree. 2777 * 2778 * returns 0 if some ptrs were pushed, < 0 if there was some horrible 2779 * error, and > 0 if there was no room in the right hand block. 2780 * 2781 * this will only push up to 1/2 the contents of the left node over 2782 */ 2783 static int balance_node_right(struct btrfs_trans_handle *trans, 2784 struct extent_buffer *dst, 2785 struct extent_buffer *src) 2786 { 2787 struct btrfs_fs_info *fs_info = trans->fs_info; 2788 int push_items = 0; 2789 int max_push; 2790 int src_nritems; 2791 int dst_nritems; 2792 int ret = 0; 2793 2794 WARN_ON(btrfs_header_generation(src) != trans->transid); 2795 WARN_ON(btrfs_header_generation(dst) != trans->transid); 2796 2797 src_nritems = btrfs_header_nritems(src); 2798 dst_nritems = btrfs_header_nritems(dst); 2799 push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems; 2800 if (push_items <= 0) 2801 return 1; 2802 2803 if (src_nritems < 4) 2804 return 1; 2805 2806 max_push = src_nritems / 2 + 1; 2807 /* don't try to empty the node */ 2808 if (max_push >= src_nritems) 2809 return 1; 2810 2811 if (max_push < push_items) 2812 push_items = max_push; 2813 2814 /* dst is the right eb, src is the middle eb */ 2815 if (unlikely(check_sibling_keys(src, dst))) { 2816 ret = -EUCLEAN; 2817 btrfs_abort_transaction(trans, ret); 2818 return ret; 2819 } 2820 2821 /* 2822 * btrfs_tree_mod_log_eb_copy handles logging the move, so we don't 2823 * need to do an explicit tree mod log operation for it. 2824 */ 2825 memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(dst, push_items), 2826 btrfs_node_key_ptr_offset(dst, 0), 2827 (dst_nritems) * 2828 sizeof(struct btrfs_key_ptr)); 2829 2830 ret = btrfs_tree_mod_log_eb_copy(dst, src, 0, src_nritems - push_items, 2831 push_items); 2832 if (unlikely(ret)) { 2833 btrfs_abort_transaction(trans, ret); 2834 return ret; 2835 } 2836 copy_extent_buffer(dst, src, 2837 btrfs_node_key_ptr_offset(dst, 0), 2838 btrfs_node_key_ptr_offset(src, src_nritems - push_items), 2839 push_items * sizeof(struct btrfs_key_ptr)); 2840 2841 btrfs_set_header_nritems(src, src_nritems - push_items); 2842 btrfs_set_header_nritems(dst, dst_nritems + push_items); 2843 2844 btrfs_mark_buffer_dirty(trans, src); 2845 btrfs_mark_buffer_dirty(trans, dst); 2846 2847 return ret; 2848 } 2849 2850 /* 2851 * helper function to insert a new root level in the tree. 2852 * A new node is allocated, and a single item is inserted to 2853 * point to the existing root 2854 * 2855 * returns zero on success or < 0 on failure. 2856 */ 2857 static noinline int insert_new_root(struct btrfs_trans_handle *trans, 2858 struct btrfs_root *root, 2859 struct btrfs_path *path, int level) 2860 { 2861 u64 lower_gen; 2862 struct extent_buffer *lower; 2863 struct extent_buffer *c; 2864 struct extent_buffer *old; 2865 struct btrfs_disk_key lower_key; 2866 int ret; 2867 2868 BUG_ON(path->nodes[level]); 2869 BUG_ON(path->nodes[level-1] != root->node); 2870 2871 lower = path->nodes[level-1]; 2872 if (level == 1) 2873 btrfs_item_key(lower, &lower_key, 0); 2874 else 2875 btrfs_node_key(lower, &lower_key, 0); 2876 2877 c = btrfs_alloc_tree_block(trans, root, 0, btrfs_root_id(root), 2878 &lower_key, level, root->node->start, 0, 2879 0, BTRFS_NESTING_NEW_ROOT); 2880 if (IS_ERR(c)) 2881 return PTR_ERR(c); 2882 2883 root_add_used_bytes(root); 2884 2885 btrfs_set_header_nritems(c, 1); 2886 btrfs_set_node_key(c, &lower_key, 0); 2887 btrfs_set_node_blockptr(c, 0, lower->start); 2888 lower_gen = btrfs_header_generation(lower); 2889 WARN_ON(lower_gen != trans->transid); 2890 2891 btrfs_set_node_ptr_generation(c, 0, lower_gen); 2892 2893 btrfs_mark_buffer_dirty(trans, c); 2894 2895 old = root->node; 2896 ret = btrfs_tree_mod_log_insert_root(root->node, c, false); 2897 if (ret < 0) { 2898 int ret2; 2899 2900 btrfs_clear_buffer_dirty(trans, c); 2901 ret2 = btrfs_free_tree_block(trans, btrfs_root_id(root), c, 0, 1); 2902 if (unlikely(ret2 < 0)) 2903 btrfs_abort_transaction(trans, ret2); 2904 btrfs_tree_unlock(c); 2905 free_extent_buffer(c); 2906 return ret; 2907 } 2908 rcu_assign_pointer(root->node, c); 2909 2910 /* the super has an extra ref to root->node */ 2911 free_extent_buffer(old); 2912 2913 add_root_to_dirty_list(root); 2914 refcount_inc(&c->refs); 2915 path->nodes[level] = c; 2916 path->locks[level] = BTRFS_WRITE_LOCK; 2917 path->slots[level] = 0; 2918 return 0; 2919 } 2920 2921 /* 2922 * worker function to insert a single pointer in a node. 2923 * the node should have enough room for the pointer already 2924 * 2925 * slot and level indicate where you want the key to go, and 2926 * blocknr is the block the key points to. 2927 */ 2928 static int insert_ptr(struct btrfs_trans_handle *trans, 2929 const struct btrfs_path *path, 2930 const struct btrfs_disk_key *key, u64 bytenr, 2931 int slot, int level) 2932 { 2933 struct extent_buffer *lower; 2934 int nritems; 2935 int ret; 2936 2937 BUG_ON(!path->nodes[level]); 2938 btrfs_assert_tree_write_locked(path->nodes[level]); 2939 lower = path->nodes[level]; 2940 nritems = btrfs_header_nritems(lower); 2941 BUG_ON(slot > nritems); 2942 BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(trans->fs_info)); 2943 if (slot != nritems) { 2944 if (level) { 2945 ret = btrfs_tree_mod_log_insert_move(lower, slot + 1, 2946 slot, nritems - slot); 2947 if (unlikely(ret < 0)) { 2948 btrfs_abort_transaction(trans, ret); 2949 return ret; 2950 } 2951 } 2952 memmove_extent_buffer(lower, 2953 btrfs_node_key_ptr_offset(lower, slot + 1), 2954 btrfs_node_key_ptr_offset(lower, slot), 2955 (nritems - slot) * sizeof(struct btrfs_key_ptr)); 2956 } 2957 if (level) { 2958 ret = btrfs_tree_mod_log_insert_key(lower, slot, 2959 BTRFS_MOD_LOG_KEY_ADD); 2960 if (unlikely(ret < 0)) { 2961 btrfs_abort_transaction(trans, ret); 2962 return ret; 2963 } 2964 } 2965 btrfs_set_node_key(lower, key, slot); 2966 btrfs_set_node_blockptr(lower, slot, bytenr); 2967 WARN_ON(trans->transid == 0); 2968 btrfs_set_node_ptr_generation(lower, slot, trans->transid); 2969 btrfs_set_header_nritems(lower, nritems + 1); 2970 btrfs_mark_buffer_dirty(trans, lower); 2971 2972 return 0; 2973 } 2974 2975 /* 2976 * split the node at the specified level in path in two. 2977 * The path is corrected to point to the appropriate node after the split 2978 * 2979 * Before splitting this tries to make some room in the node by pushing 2980 * left and right, if either one works, it returns right away. 2981 * 2982 * returns 0 on success and < 0 on failure 2983 */ 2984 static noinline int split_node(struct btrfs_trans_handle *trans, 2985 struct btrfs_root *root, 2986 struct btrfs_path *path, int level) 2987 { 2988 struct btrfs_fs_info *fs_info = root->fs_info; 2989 struct extent_buffer *c; 2990 struct extent_buffer *split; 2991 struct btrfs_disk_key disk_key; 2992 int mid; 2993 int ret; 2994 u32 c_nritems; 2995 2996 c = path->nodes[level]; 2997 WARN_ON(btrfs_header_generation(c) != trans->transid); 2998 if (c == root->node) { 2999 /* 3000 * trying to split the root, lets make a new one 3001 * 3002 * tree mod log: We don't log_removal old root in 3003 * insert_new_root, because that root buffer will be kept as a 3004 * normal node. We are going to log removal of half of the 3005 * elements below with btrfs_tree_mod_log_eb_copy(). We're 3006 * holding a tree lock on the buffer, which is why we cannot 3007 * race with other tree_mod_log users. 3008 */ 3009 ret = insert_new_root(trans, root, path, level + 1); 3010 if (ret) 3011 return ret; 3012 } else { 3013 ret = push_nodes_for_insert(trans, root, path, level); 3014 c = path->nodes[level]; 3015 if (!ret && btrfs_header_nritems(c) < 3016 BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3) 3017 return 0; 3018 if (ret < 0) 3019 return ret; 3020 } 3021 3022 c_nritems = btrfs_header_nritems(c); 3023 mid = (c_nritems + 1) / 2; 3024 btrfs_node_key(c, &disk_key, mid); 3025 3026 split = btrfs_alloc_tree_block(trans, root, 0, btrfs_root_id(root), 3027 &disk_key, level, c->start, 0, 3028 0, BTRFS_NESTING_SPLIT); 3029 if (IS_ERR(split)) 3030 return PTR_ERR(split); 3031 3032 root_add_used_bytes(root); 3033 ASSERT(btrfs_header_level(c) == level); 3034 3035 ret = btrfs_tree_mod_log_eb_copy(split, c, 0, mid, c_nritems - mid); 3036 if (unlikely(ret)) { 3037 btrfs_tree_unlock(split); 3038 free_extent_buffer(split); 3039 btrfs_abort_transaction(trans, ret); 3040 return ret; 3041 } 3042 copy_extent_buffer(split, c, 3043 btrfs_node_key_ptr_offset(split, 0), 3044 btrfs_node_key_ptr_offset(c, mid), 3045 (c_nritems - mid) * sizeof(struct btrfs_key_ptr)); 3046 btrfs_set_header_nritems(split, c_nritems - mid); 3047 btrfs_set_header_nritems(c, mid); 3048 3049 btrfs_mark_buffer_dirty(trans, c); 3050 btrfs_mark_buffer_dirty(trans, split); 3051 3052 ret = insert_ptr(trans, path, &disk_key, split->start, 3053 path->slots[level + 1] + 1, level + 1); 3054 if (ret < 0) { 3055 btrfs_tree_unlock(split); 3056 free_extent_buffer(split); 3057 return ret; 3058 } 3059 3060 if (path->slots[level] >= mid) { 3061 path->slots[level] -= mid; 3062 btrfs_tree_unlock(c); 3063 free_extent_buffer(c); 3064 path->nodes[level] = split; 3065 path->slots[level + 1] += 1; 3066 } else { 3067 btrfs_tree_unlock(split); 3068 free_extent_buffer(split); 3069 } 3070 return 0; 3071 } 3072 3073 /* 3074 * how many bytes are required to store the items in a leaf. start 3075 * and nr indicate which items in the leaf to check. This totals up the 3076 * space used both by the item structs and the item data 3077 */ 3078 static int leaf_space_used(const struct extent_buffer *l, int start, int nr) 3079 { 3080 int data_len; 3081 int nritems = btrfs_header_nritems(l); 3082 int end = min(nritems, start + nr) - 1; 3083 3084 if (!nr) 3085 return 0; 3086 data_len = btrfs_item_offset(l, start) + btrfs_item_size(l, start); 3087 data_len = data_len - btrfs_item_offset(l, end); 3088 data_len += sizeof(struct btrfs_item) * nr; 3089 WARN_ON(data_len < 0); 3090 return data_len; 3091 } 3092 3093 /* 3094 * The space between the end of the leaf items and 3095 * the start of the leaf data. IOW, how much room 3096 * the leaf has left for both items and data 3097 */ 3098 int btrfs_leaf_free_space(const struct extent_buffer *leaf) 3099 { 3100 struct btrfs_fs_info *fs_info = leaf->fs_info; 3101 int nritems = btrfs_header_nritems(leaf); 3102 int ret; 3103 3104 ret = BTRFS_LEAF_DATA_SIZE(fs_info) - leaf_space_used(leaf, 0, nritems); 3105 if (unlikely(ret < 0)) { 3106 btrfs_crit(fs_info, 3107 "leaf free space ret %d, leaf data size %lu, used %d nritems %d", 3108 ret, 3109 (unsigned long) BTRFS_LEAF_DATA_SIZE(fs_info), 3110 leaf_space_used(leaf, 0, nritems), nritems); 3111 } 3112 return ret; 3113 } 3114 3115 /* 3116 * min slot controls the lowest index we're willing to push to the 3117 * right. We'll push up to and including min_slot, but no lower 3118 */ 3119 static noinline int __push_leaf_right(struct btrfs_trans_handle *trans, 3120 struct btrfs_path *path, 3121 int data_size, bool empty, 3122 struct extent_buffer *right, 3123 int free_space, u32 left_nritems, 3124 u32 min_slot) 3125 { 3126 struct btrfs_fs_info *fs_info = right->fs_info; 3127 struct extent_buffer *left = path->nodes[0]; 3128 struct extent_buffer *upper = path->nodes[1]; 3129 struct btrfs_disk_key disk_key; 3130 int slot; 3131 u32 i; 3132 int push_space = 0; 3133 int push_items = 0; 3134 u32 nr; 3135 u32 right_nritems; 3136 u32 data_end; 3137 u32 this_item_size; 3138 3139 if (empty) 3140 nr = 0; 3141 else 3142 nr = max_t(u32, 1, min_slot); 3143 3144 if (path->slots[0] >= left_nritems) 3145 push_space += data_size; 3146 3147 slot = path->slots[1]; 3148 i = left_nritems - 1; 3149 while (i >= nr) { 3150 if (!empty && push_items > 0) { 3151 if (path->slots[0] > i) 3152 break; 3153 if (path->slots[0] == i) { 3154 int space = btrfs_leaf_free_space(left); 3155 3156 if (space + push_space * 2 > free_space) 3157 break; 3158 } 3159 } 3160 3161 if (path->slots[0] == i) 3162 push_space += data_size; 3163 3164 this_item_size = btrfs_item_size(left, i); 3165 if (this_item_size + sizeof(struct btrfs_item) + 3166 push_space > free_space) 3167 break; 3168 3169 push_items++; 3170 push_space += this_item_size + sizeof(struct btrfs_item); 3171 if (i == 0) 3172 break; 3173 i--; 3174 } 3175 3176 if (push_items == 0) 3177 goto out_unlock; 3178 3179 WARN_ON(!empty && push_items == left_nritems); 3180 3181 /* push left to right */ 3182 right_nritems = btrfs_header_nritems(right); 3183 3184 push_space = btrfs_item_data_end(left, left_nritems - push_items); 3185 push_space -= leaf_data_end(left); 3186 3187 /* make room in the right data area */ 3188 data_end = leaf_data_end(right); 3189 memmove_leaf_data(right, data_end - push_space, data_end, 3190 BTRFS_LEAF_DATA_SIZE(fs_info) - data_end); 3191 3192 /* copy from the left data area */ 3193 copy_leaf_data(right, left, BTRFS_LEAF_DATA_SIZE(fs_info) - push_space, 3194 leaf_data_end(left), push_space); 3195 3196 memmove_leaf_items(right, push_items, 0, right_nritems); 3197 3198 /* copy the items from left to right */ 3199 copy_leaf_items(right, left, 0, left_nritems - push_items, push_items); 3200 3201 /* update the item pointers */ 3202 right_nritems += push_items; 3203 btrfs_set_header_nritems(right, right_nritems); 3204 push_space = BTRFS_LEAF_DATA_SIZE(fs_info); 3205 for (i = 0; i < right_nritems; i++) { 3206 push_space -= btrfs_item_size(right, i); 3207 btrfs_set_item_offset(right, i, push_space); 3208 } 3209 3210 left_nritems -= push_items; 3211 btrfs_set_header_nritems(left, left_nritems); 3212 3213 if (left_nritems) 3214 btrfs_mark_buffer_dirty(trans, left); 3215 else 3216 btrfs_clear_buffer_dirty(trans, left); 3217 3218 btrfs_mark_buffer_dirty(trans, right); 3219 3220 btrfs_item_key(right, &disk_key, 0); 3221 btrfs_set_node_key(upper, &disk_key, slot + 1); 3222 btrfs_mark_buffer_dirty(trans, upper); 3223 3224 /* then fixup the leaf pointer in the path */ 3225 if (path->slots[0] >= left_nritems) { 3226 path->slots[0] -= left_nritems; 3227 btrfs_tree_unlock(left); 3228 free_extent_buffer(left); 3229 path->nodes[0] = right; 3230 path->slots[1] += 1; 3231 } else { 3232 btrfs_tree_unlock(right); 3233 free_extent_buffer(right); 3234 } 3235 return 0; 3236 3237 out_unlock: 3238 btrfs_tree_unlock(right); 3239 free_extent_buffer(right); 3240 return 1; 3241 } 3242 3243 /* 3244 * push some data in the path leaf to the right, trying to free up at 3245 * least data_size bytes. returns zero if the push worked, nonzero otherwise 3246 * 3247 * returns 1 if the push failed because the other node didn't have enough 3248 * room, 0 if everything worked out and < 0 if there were major errors. 3249 * 3250 * this will push starting from min_slot to the end of the leaf. It won't 3251 * push any slot lower than min_slot 3252 */ 3253 static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root 3254 *root, struct btrfs_path *path, 3255 int min_data_size, int data_size, 3256 bool empty, u32 min_slot) 3257 { 3258 struct extent_buffer *left = path->nodes[0]; 3259 struct extent_buffer *right; 3260 struct extent_buffer *upper; 3261 int slot; 3262 int free_space; 3263 u32 left_nritems; 3264 int ret; 3265 3266 if (!path->nodes[1]) 3267 return 1; 3268 3269 slot = path->slots[1]; 3270 upper = path->nodes[1]; 3271 if (slot >= btrfs_header_nritems(upper) - 1) 3272 return 1; 3273 3274 btrfs_assert_tree_write_locked(path->nodes[1]); 3275 3276 right = btrfs_read_node_slot(upper, slot + 1); 3277 if (IS_ERR(right)) 3278 return PTR_ERR(right); 3279 3280 btrfs_tree_lock_nested(right, BTRFS_NESTING_RIGHT); 3281 3282 free_space = btrfs_leaf_free_space(right); 3283 if (free_space < data_size) 3284 goto out_unlock; 3285 3286 ret = btrfs_cow_block(trans, root, right, upper, 3287 slot + 1, &right, BTRFS_NESTING_RIGHT_COW); 3288 if (ret) 3289 goto out_unlock; 3290 3291 left_nritems = btrfs_header_nritems(left); 3292 if (left_nritems == 0) 3293 goto out_unlock; 3294 3295 if (unlikely(check_sibling_keys(left, right))) { 3296 ret = -EUCLEAN; 3297 btrfs_abort_transaction(trans, ret); 3298 btrfs_tree_unlock(right); 3299 free_extent_buffer(right); 3300 return ret; 3301 } 3302 if (path->slots[0] == left_nritems && !empty) { 3303 /* Key greater than all keys in the leaf, right neighbor has 3304 * enough room for it and we're not emptying our leaf to delete 3305 * it, therefore use right neighbor to insert the new item and 3306 * no need to touch/dirty our left leaf. */ 3307 btrfs_tree_unlock(left); 3308 free_extent_buffer(left); 3309 path->nodes[0] = right; 3310 path->slots[0] = 0; 3311 path->slots[1]++; 3312 return 0; 3313 } 3314 3315 return __push_leaf_right(trans, path, min_data_size, empty, right, 3316 free_space, left_nritems, min_slot); 3317 out_unlock: 3318 btrfs_tree_unlock(right); 3319 free_extent_buffer(right); 3320 return 1; 3321 } 3322 3323 /* 3324 * push some data in the path leaf to the left, trying to free up at 3325 * least data_size bytes. returns zero if the push worked, nonzero otherwise 3326 * 3327 * max_slot can put a limit on how far into the leaf we'll push items. The 3328 * item at 'max_slot' won't be touched. Use (u32)-1 to make us do all the 3329 * items 3330 */ 3331 static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, 3332 struct btrfs_path *path, int data_size, 3333 bool empty, struct extent_buffer *left, 3334 int free_space, u32 right_nritems, 3335 u32 max_slot) 3336 { 3337 struct btrfs_fs_info *fs_info = left->fs_info; 3338 struct btrfs_disk_key disk_key; 3339 struct extent_buffer *right = path->nodes[0]; 3340 int i; 3341 int push_space = 0; 3342 int push_items = 0; 3343 u32 old_left_nritems; 3344 u32 nr; 3345 int ret = 0; 3346 u32 this_item_size; 3347 u32 old_left_item_size; 3348 3349 if (empty) 3350 nr = min(right_nritems, max_slot); 3351 else 3352 nr = min(right_nritems - 1, max_slot); 3353 3354 for (i = 0; i < nr; i++) { 3355 if (!empty && push_items > 0) { 3356 if (path->slots[0] < i) 3357 break; 3358 if (path->slots[0] == i) { 3359 int space = btrfs_leaf_free_space(right); 3360 3361 if (space + push_space * 2 > free_space) 3362 break; 3363 } 3364 } 3365 3366 if (path->slots[0] == i) 3367 push_space += data_size; 3368 3369 this_item_size = btrfs_item_size(right, i); 3370 if (this_item_size + sizeof(struct btrfs_item) + push_space > 3371 free_space) 3372 break; 3373 3374 push_items++; 3375 push_space += this_item_size + sizeof(struct btrfs_item); 3376 } 3377 3378 if (push_items == 0) { 3379 ret = 1; 3380 goto out; 3381 } 3382 WARN_ON(!empty && push_items == btrfs_header_nritems(right)); 3383 3384 /* push data from right to left */ 3385 copy_leaf_items(left, right, btrfs_header_nritems(left), 0, push_items); 3386 3387 push_space = BTRFS_LEAF_DATA_SIZE(fs_info) - 3388 btrfs_item_offset(right, push_items - 1); 3389 3390 copy_leaf_data(left, right, leaf_data_end(left) - push_space, 3391 btrfs_item_offset(right, push_items - 1), push_space); 3392 old_left_nritems = btrfs_header_nritems(left); 3393 BUG_ON(old_left_nritems <= 0); 3394 3395 old_left_item_size = btrfs_item_offset(left, old_left_nritems - 1); 3396 for (i = old_left_nritems; i < old_left_nritems + push_items; i++) { 3397 u32 ioff; 3398 3399 ioff = btrfs_item_offset(left, i); 3400 btrfs_set_item_offset(left, i, 3401 ioff - (BTRFS_LEAF_DATA_SIZE(fs_info) - old_left_item_size)); 3402 } 3403 btrfs_set_header_nritems(left, old_left_nritems + push_items); 3404 3405 /* fixup right node */ 3406 if (unlikely(push_items > right_nritems)) { 3407 ret = -EUCLEAN; 3408 btrfs_abort_transaction(trans, ret); 3409 btrfs_crit(fs_info, "push items (%d) > right leaf items (%u)", 3410 push_items, right_nritems); 3411 goto out; 3412 } 3413 3414 if (push_items < right_nritems) { 3415 push_space = btrfs_item_offset(right, push_items - 1) - 3416 leaf_data_end(right); 3417 memmove_leaf_data(right, 3418 BTRFS_LEAF_DATA_SIZE(fs_info) - push_space, 3419 leaf_data_end(right), push_space); 3420 3421 memmove_leaf_items(right, 0, push_items, 3422 btrfs_header_nritems(right) - push_items); 3423 } 3424 3425 right_nritems -= push_items; 3426 btrfs_set_header_nritems(right, right_nritems); 3427 push_space = BTRFS_LEAF_DATA_SIZE(fs_info); 3428 for (i = 0; i < right_nritems; i++) { 3429 push_space = push_space - btrfs_item_size(right, i); 3430 btrfs_set_item_offset(right, i, push_space); 3431 } 3432 3433 btrfs_mark_buffer_dirty(trans, left); 3434 if (right_nritems) 3435 btrfs_mark_buffer_dirty(trans, right); 3436 else 3437 btrfs_clear_buffer_dirty(trans, right); 3438 3439 btrfs_item_key(right, &disk_key, 0); 3440 fixup_low_keys(trans, path, &disk_key, 1); 3441 3442 /* then fixup the leaf pointer in the path */ 3443 if (path->slots[0] < push_items) { 3444 path->slots[0] += old_left_nritems; 3445 btrfs_tree_unlock(right); 3446 free_extent_buffer(right); 3447 path->nodes[0] = left; 3448 path->slots[1] -= 1; 3449 } else { 3450 btrfs_tree_unlock(left); 3451 free_extent_buffer(left); 3452 path->slots[0] -= push_items; 3453 } 3454 BUG_ON(path->slots[0] < 0); 3455 return ret; 3456 out: 3457 btrfs_tree_unlock(left); 3458 free_extent_buffer(left); 3459 return ret; 3460 } 3461 3462 /* 3463 * push some data in the path leaf to the left, trying to free up at 3464 * least data_size bytes. returns zero if the push worked, nonzero otherwise 3465 * 3466 * max_slot can put a limit on how far into the leaf we'll push items. The 3467 * item at 'max_slot' won't be touched. Use (u32)-1 to make us push all the 3468 * items 3469 */ 3470 static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root 3471 *root, struct btrfs_path *path, int min_data_size, 3472 int data_size, int empty, u32 max_slot) 3473 { 3474 struct extent_buffer *right = path->nodes[0]; 3475 struct extent_buffer *left; 3476 int slot; 3477 int free_space; 3478 u32 right_nritems; 3479 int ret = 0; 3480 3481 slot = path->slots[1]; 3482 if (slot == 0) 3483 return 1; 3484 if (!path->nodes[1]) 3485 return 1; 3486 3487 right_nritems = btrfs_header_nritems(right); 3488 if (right_nritems == 0) 3489 return 1; 3490 3491 btrfs_assert_tree_write_locked(path->nodes[1]); 3492 3493 left = btrfs_read_node_slot(path->nodes[1], slot - 1); 3494 if (IS_ERR(left)) 3495 return PTR_ERR(left); 3496 3497 btrfs_tree_lock_nested(left, BTRFS_NESTING_LEFT); 3498 3499 free_space = btrfs_leaf_free_space(left); 3500 if (free_space < data_size) { 3501 ret = 1; 3502 goto out; 3503 } 3504 3505 ret = btrfs_cow_block(trans, root, left, 3506 path->nodes[1], slot - 1, &left, 3507 BTRFS_NESTING_LEFT_COW); 3508 if (ret) { 3509 /* we hit -ENOSPC, but it isn't fatal here */ 3510 if (ret == -ENOSPC) 3511 ret = 1; 3512 goto out; 3513 } 3514 3515 if (unlikely(check_sibling_keys(left, right))) { 3516 ret = -EUCLEAN; 3517 btrfs_abort_transaction(trans, ret); 3518 goto out; 3519 } 3520 return __push_leaf_left(trans, path, min_data_size, empty, left, 3521 free_space, right_nritems, max_slot); 3522 out: 3523 btrfs_tree_unlock(left); 3524 free_extent_buffer(left); 3525 return ret; 3526 } 3527 3528 /* 3529 * split the path's leaf in two, making sure there is at least data_size 3530 * available for the resulting leaf level of the path. 3531 */ 3532 static noinline int copy_for_split(struct btrfs_trans_handle *trans, 3533 struct btrfs_path *path, 3534 struct extent_buffer *l, 3535 struct extent_buffer *right, 3536 int slot, int mid, int nritems) 3537 { 3538 struct btrfs_fs_info *fs_info = trans->fs_info; 3539 int data_copy_size; 3540 int rt_data_off; 3541 int i; 3542 int ret; 3543 struct btrfs_disk_key disk_key; 3544 3545 nritems = nritems - mid; 3546 btrfs_set_header_nritems(right, nritems); 3547 data_copy_size = btrfs_item_data_end(l, mid) - leaf_data_end(l); 3548 3549 copy_leaf_items(right, l, 0, mid, nritems); 3550 3551 copy_leaf_data(right, l, BTRFS_LEAF_DATA_SIZE(fs_info) - data_copy_size, 3552 leaf_data_end(l), data_copy_size); 3553 3554 rt_data_off = BTRFS_LEAF_DATA_SIZE(fs_info) - btrfs_item_data_end(l, mid); 3555 3556 for (i = 0; i < nritems; i++) { 3557 u32 ioff; 3558 3559 ioff = btrfs_item_offset(right, i); 3560 btrfs_set_item_offset(right, i, ioff + rt_data_off); 3561 } 3562 3563 btrfs_set_header_nritems(l, mid); 3564 btrfs_item_key(right, &disk_key, 0); 3565 ret = insert_ptr(trans, path, &disk_key, right->start, path->slots[1] + 1, 1); 3566 if (ret < 0) 3567 return ret; 3568 3569 btrfs_mark_buffer_dirty(trans, right); 3570 btrfs_mark_buffer_dirty(trans, l); 3571 BUG_ON(path->slots[0] != slot); 3572 3573 if (mid <= slot) { 3574 btrfs_tree_unlock(path->nodes[0]); 3575 free_extent_buffer(path->nodes[0]); 3576 path->nodes[0] = right; 3577 path->slots[0] -= mid; 3578 path->slots[1] += 1; 3579 } else { 3580 btrfs_tree_unlock(right); 3581 free_extent_buffer(right); 3582 } 3583 3584 BUG_ON(path->slots[0] < 0); 3585 3586 return 0; 3587 } 3588 3589 /* 3590 * double splits happen when we need to insert a big item in the middle 3591 * of a leaf. A double split can leave us with 3 mostly empty leaves: 3592 * leaf: [ slots 0 - N] [ our target ] [ N + 1 - total in leaf ] 3593 * A B C 3594 * 3595 * We avoid this by trying to push the items on either side of our target 3596 * into the adjacent leaves. If all goes well we can avoid the double split 3597 * completely. 3598 */ 3599 static noinline int push_for_double_split(struct btrfs_trans_handle *trans, 3600 struct btrfs_root *root, 3601 struct btrfs_path *path, 3602 int data_size) 3603 { 3604 int ret; 3605 int progress = 0; 3606 int slot; 3607 u32 nritems; 3608 int space_needed = data_size; 3609 3610 slot = path->slots[0]; 3611 if (slot < btrfs_header_nritems(path->nodes[0])) 3612 space_needed -= btrfs_leaf_free_space(path->nodes[0]); 3613 3614 /* 3615 * try to push all the items after our slot into the 3616 * right leaf 3617 */ 3618 ret = push_leaf_right(trans, root, path, 1, space_needed, 0, slot); 3619 if (ret < 0) 3620 return ret; 3621 3622 if (ret == 0) 3623 progress++; 3624 3625 nritems = btrfs_header_nritems(path->nodes[0]); 3626 /* 3627 * our goal is to get our slot at the start or end of a leaf. If 3628 * we've done so we're done 3629 */ 3630 if (path->slots[0] == 0 || path->slots[0] == nritems) 3631 return 0; 3632 3633 if (btrfs_leaf_free_space(path->nodes[0]) >= data_size) 3634 return 0; 3635 3636 /* try to push all the items before our slot into the next leaf */ 3637 slot = path->slots[0]; 3638 space_needed = data_size; 3639 if (slot > 0) 3640 space_needed -= btrfs_leaf_free_space(path->nodes[0]); 3641 ret = push_leaf_left(trans, root, path, 1, space_needed, 0, slot); 3642 if (ret < 0) 3643 return ret; 3644 3645 if (ret == 0) 3646 progress++; 3647 3648 if (progress) 3649 return 0; 3650 return 1; 3651 } 3652 3653 /* 3654 * split the path's leaf in two, making sure there is at least data_size 3655 * available for the resulting leaf level of the path. 3656 * 3657 * returns 0 if all went well and < 0 on failure. 3658 */ 3659 static noinline int split_leaf(struct btrfs_trans_handle *trans, 3660 struct btrfs_root *root, 3661 const struct btrfs_key *ins_key, 3662 struct btrfs_path *path, int data_size, 3663 bool extend) 3664 { 3665 struct btrfs_disk_key disk_key; 3666 struct extent_buffer *l; 3667 u32 nritems; 3668 int mid; 3669 int slot; 3670 struct extent_buffer *right; 3671 struct btrfs_fs_info *fs_info = root->fs_info; 3672 int ret = 0; 3673 int wret; 3674 int split; 3675 int num_doubles = 0; 3676 int tried_avoid_double = 0; 3677 3678 l = path->nodes[0]; 3679 slot = path->slots[0]; 3680 if (extend && data_size + btrfs_item_size(l, slot) + 3681 sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(fs_info)) 3682 return -EOVERFLOW; 3683 3684 /* first try to make some room by pushing left and right */ 3685 if (data_size && path->nodes[1]) { 3686 int space_needed = data_size; 3687 3688 if (slot < btrfs_header_nritems(l)) 3689 space_needed -= btrfs_leaf_free_space(l); 3690 3691 wret = push_leaf_right(trans, root, path, space_needed, 3692 space_needed, 0, 0); 3693 if (wret < 0) 3694 return wret; 3695 if (wret) { 3696 space_needed = data_size; 3697 if (slot > 0) 3698 space_needed -= btrfs_leaf_free_space(l); 3699 wret = push_leaf_left(trans, root, path, space_needed, 3700 space_needed, 0, (u32)-1); 3701 if (wret < 0) 3702 return wret; 3703 } 3704 l = path->nodes[0]; 3705 3706 /* did the pushes work? */ 3707 if (btrfs_leaf_free_space(l) >= data_size) 3708 return 0; 3709 } 3710 3711 if (!path->nodes[1]) { 3712 ret = insert_new_root(trans, root, path, 1); 3713 if (ret) 3714 return ret; 3715 } 3716 again: 3717 split = 1; 3718 l = path->nodes[0]; 3719 slot = path->slots[0]; 3720 nritems = btrfs_header_nritems(l); 3721 mid = (nritems + 1) / 2; 3722 3723 if (mid <= slot) { 3724 if (nritems == 1 || 3725 leaf_space_used(l, mid, nritems - mid) + data_size > 3726 BTRFS_LEAF_DATA_SIZE(fs_info)) { 3727 if (slot >= nritems) { 3728 split = 0; 3729 } else { 3730 mid = slot; 3731 if (mid != nritems && 3732 leaf_space_used(l, mid, nritems - mid) + 3733 data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) { 3734 if (data_size && !tried_avoid_double) 3735 goto push_for_double; 3736 split = 2; 3737 } 3738 } 3739 } 3740 } else { 3741 if (leaf_space_used(l, 0, mid) + data_size > 3742 BTRFS_LEAF_DATA_SIZE(fs_info)) { 3743 if (!extend && data_size && slot == 0) { 3744 split = 0; 3745 } else if ((extend || !data_size) && slot == 0) { 3746 mid = 1; 3747 } else { 3748 mid = slot; 3749 if (mid != nritems && 3750 leaf_space_used(l, mid, nritems - mid) + 3751 data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) { 3752 if (data_size && !tried_avoid_double) 3753 goto push_for_double; 3754 split = 2; 3755 } 3756 } 3757 } 3758 } 3759 3760 if (split == 0) 3761 btrfs_cpu_key_to_disk(&disk_key, ins_key); 3762 else 3763 btrfs_item_key(l, &disk_key, mid); 3764 3765 /* 3766 * We have to about BTRFS_NESTING_NEW_ROOT here if we've done a double 3767 * split, because we're only allowed to have MAX_LOCKDEP_SUBCLASSES 3768 * subclasses, which is 8 at the time of this patch, and we've maxed it 3769 * out. In the future we could add a 3770 * BTRFS_NESTING_SPLIT_THE_SPLITTENING if we need to, but for now just 3771 * use BTRFS_NESTING_NEW_ROOT. 3772 */ 3773 right = btrfs_alloc_tree_block(trans, root, 0, btrfs_root_id(root), 3774 &disk_key, 0, l->start, 0, 0, 3775 num_doubles ? BTRFS_NESTING_NEW_ROOT : 3776 BTRFS_NESTING_SPLIT); 3777 if (IS_ERR(right)) 3778 return PTR_ERR(right); 3779 3780 root_add_used_bytes(root); 3781 3782 if (split == 0) { 3783 if (mid <= slot) { 3784 btrfs_set_header_nritems(right, 0); 3785 ret = insert_ptr(trans, path, &disk_key, 3786 right->start, path->slots[1] + 1, 1); 3787 if (ret < 0) { 3788 btrfs_tree_unlock(right); 3789 free_extent_buffer(right); 3790 return ret; 3791 } 3792 btrfs_tree_unlock(path->nodes[0]); 3793 free_extent_buffer(path->nodes[0]); 3794 path->nodes[0] = right; 3795 path->slots[0] = 0; 3796 path->slots[1] += 1; 3797 } else { 3798 btrfs_set_header_nritems(right, 0); 3799 ret = insert_ptr(trans, path, &disk_key, 3800 right->start, path->slots[1], 1); 3801 if (ret < 0) { 3802 btrfs_tree_unlock(right); 3803 free_extent_buffer(right); 3804 return ret; 3805 } 3806 btrfs_tree_unlock(path->nodes[0]); 3807 free_extent_buffer(path->nodes[0]); 3808 path->nodes[0] = right; 3809 path->slots[0] = 0; 3810 if (path->slots[1] == 0) 3811 fixup_low_keys(trans, path, &disk_key, 1); 3812 } 3813 /* 3814 * We create a new leaf 'right' for the required ins_len and 3815 * we'll do btrfs_mark_buffer_dirty() on this leaf after copying 3816 * the content of ins_len to 'right'. 3817 */ 3818 return ret; 3819 } 3820 3821 ret = copy_for_split(trans, path, l, right, slot, mid, nritems); 3822 if (ret < 0) { 3823 btrfs_tree_unlock(right); 3824 free_extent_buffer(right); 3825 return ret; 3826 } 3827 3828 if (split == 2) { 3829 BUG_ON(num_doubles != 0); 3830 num_doubles++; 3831 goto again; 3832 } 3833 3834 return 0; 3835 3836 push_for_double: 3837 push_for_double_split(trans, root, path, data_size); 3838 tried_avoid_double = 1; 3839 if (btrfs_leaf_free_space(path->nodes[0]) >= data_size) 3840 return 0; 3841 goto again; 3842 } 3843 3844 static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans, 3845 struct btrfs_root *root, 3846 struct btrfs_path *path, int ins_len) 3847 { 3848 struct btrfs_key key; 3849 struct extent_buffer *leaf; 3850 struct btrfs_file_extent_item *fi; 3851 u64 extent_len = 0; 3852 u32 item_size; 3853 int ret; 3854 3855 leaf = path->nodes[0]; 3856 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 3857 3858 BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY && 3859 key.type != BTRFS_RAID_STRIPE_KEY && 3860 key.type != BTRFS_EXTENT_CSUM_KEY); 3861 3862 if (btrfs_leaf_free_space(leaf) >= ins_len) 3863 return 0; 3864 3865 item_size = btrfs_item_size(leaf, path->slots[0]); 3866 if (key.type == BTRFS_EXTENT_DATA_KEY) { 3867 fi = btrfs_item_ptr(leaf, path->slots[0], 3868 struct btrfs_file_extent_item); 3869 extent_len = btrfs_file_extent_num_bytes(leaf, fi); 3870 } 3871 btrfs_release_path(path); 3872 3873 path->keep_locks = true; 3874 path->search_for_split = true; 3875 ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 3876 path->search_for_split = false; 3877 if (ret > 0) 3878 ret = -EAGAIN; 3879 if (ret < 0) 3880 goto err; 3881 3882 ret = -EAGAIN; 3883 leaf = path->nodes[0]; 3884 /* if our item isn't there, return now */ 3885 if (item_size != btrfs_item_size(leaf, path->slots[0])) 3886 goto err; 3887 3888 /* the leaf has changed, it now has room. return now */ 3889 if (btrfs_leaf_free_space(path->nodes[0]) >= ins_len) 3890 goto err; 3891 3892 if (key.type == BTRFS_EXTENT_DATA_KEY) { 3893 fi = btrfs_item_ptr(leaf, path->slots[0], 3894 struct btrfs_file_extent_item); 3895 if (extent_len != btrfs_file_extent_num_bytes(leaf, fi)) 3896 goto err; 3897 } 3898 3899 ret = split_leaf(trans, root, &key, path, ins_len, 1); 3900 if (ret) 3901 goto err; 3902 3903 path->keep_locks = false; 3904 btrfs_unlock_up_safe(path, 1); 3905 return 0; 3906 err: 3907 path->keep_locks = false; 3908 return ret; 3909 } 3910 3911 static noinline int split_item(struct btrfs_trans_handle *trans, 3912 struct btrfs_path *path, 3913 const struct btrfs_key *new_key, 3914 unsigned long split_offset) 3915 { 3916 struct extent_buffer *leaf; 3917 int orig_slot, slot; 3918 char *buf; 3919 u32 nritems; 3920 u32 item_size; 3921 u32 orig_offset; 3922 struct btrfs_disk_key disk_key; 3923 3924 leaf = path->nodes[0]; 3925 /* 3926 * Shouldn't happen because the caller must have previously called 3927 * setup_leaf_for_split() to make room for the new item in the leaf. 3928 */ 3929 if (WARN_ON(btrfs_leaf_free_space(leaf) < sizeof(struct btrfs_item))) 3930 return -ENOSPC; 3931 3932 orig_slot = path->slots[0]; 3933 orig_offset = btrfs_item_offset(leaf, path->slots[0]); 3934 item_size = btrfs_item_size(leaf, path->slots[0]); 3935 3936 buf = kmalloc(item_size, GFP_NOFS); 3937 if (!buf) 3938 return -ENOMEM; 3939 3940 read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf, 3941 path->slots[0]), item_size); 3942 3943 slot = path->slots[0] + 1; 3944 nritems = btrfs_header_nritems(leaf); 3945 if (slot != nritems) { 3946 /* shift the items */ 3947 memmove_leaf_items(leaf, slot + 1, slot, nritems - slot); 3948 } 3949 3950 btrfs_cpu_key_to_disk(&disk_key, new_key); 3951 btrfs_set_item_key(leaf, &disk_key, slot); 3952 3953 btrfs_set_item_offset(leaf, slot, orig_offset); 3954 btrfs_set_item_size(leaf, slot, item_size - split_offset); 3955 3956 btrfs_set_item_offset(leaf, orig_slot, 3957 orig_offset + item_size - split_offset); 3958 btrfs_set_item_size(leaf, orig_slot, split_offset); 3959 3960 btrfs_set_header_nritems(leaf, nritems + 1); 3961 3962 /* write the data for the start of the original item */ 3963 write_extent_buffer(leaf, buf, 3964 btrfs_item_ptr_offset(leaf, path->slots[0]), 3965 split_offset); 3966 3967 /* write the data for the new item */ 3968 write_extent_buffer(leaf, buf + split_offset, 3969 btrfs_item_ptr_offset(leaf, slot), 3970 item_size - split_offset); 3971 btrfs_mark_buffer_dirty(trans, leaf); 3972 3973 BUG_ON(btrfs_leaf_free_space(leaf) < 0); 3974 kfree(buf); 3975 return 0; 3976 } 3977 3978 /* 3979 * This function splits a single item into two items, 3980 * giving 'new_key' to the new item and splitting the 3981 * old one at split_offset (from the start of the item). 3982 * 3983 * The path may be released by this operation. After 3984 * the split, the path is pointing to the old item. The 3985 * new item is going to be in the same node as the old one. 3986 * 3987 * Note, the item being split must be smaller enough to live alone on 3988 * a tree block with room for one extra struct btrfs_item 3989 * 3990 * This allows us to split the item in place, keeping a lock on the 3991 * leaf the entire time. 3992 */ 3993 int btrfs_split_item(struct btrfs_trans_handle *trans, 3994 struct btrfs_root *root, 3995 struct btrfs_path *path, 3996 const struct btrfs_key *new_key, 3997 unsigned long split_offset) 3998 { 3999 int ret; 4000 ret = setup_leaf_for_split(trans, root, path, 4001 sizeof(struct btrfs_item)); 4002 if (ret) 4003 return ret; 4004 4005 return split_item(trans, path, new_key, split_offset); 4006 } 4007 4008 /* 4009 * make the item pointed to by the path smaller. new_size indicates 4010 * how small to make it, and from_end tells us if we just chop bytes 4011 * off the end of the item or if we shift the item to chop bytes off 4012 * the front. 4013 */ 4014 void btrfs_truncate_item(struct btrfs_trans_handle *trans, 4015 const struct btrfs_path *path, u32 new_size, int from_end) 4016 { 4017 int slot; 4018 struct extent_buffer *leaf; 4019 u32 nritems; 4020 unsigned int data_end; 4021 unsigned int old_data_start; 4022 unsigned int old_size; 4023 unsigned int size_diff; 4024 int i; 4025 4026 leaf = path->nodes[0]; 4027 slot = path->slots[0]; 4028 4029 old_size = btrfs_item_size(leaf, slot); 4030 if (old_size == new_size) 4031 return; 4032 4033 nritems = btrfs_header_nritems(leaf); 4034 data_end = leaf_data_end(leaf); 4035 4036 old_data_start = btrfs_item_offset(leaf, slot); 4037 4038 size_diff = old_size - new_size; 4039 4040 BUG_ON(slot < 0); 4041 BUG_ON(slot >= nritems); 4042 4043 /* 4044 * item0..itemN ... dataN.offset..dataN.size .. data0.size 4045 */ 4046 /* first correct the data pointers */ 4047 for (i = slot; i < nritems; i++) { 4048 u32 ioff; 4049 4050 ioff = btrfs_item_offset(leaf, i); 4051 btrfs_set_item_offset(leaf, i, ioff + size_diff); 4052 } 4053 4054 /* shift the data */ 4055 if (from_end) { 4056 memmove_leaf_data(leaf, data_end + size_diff, data_end, 4057 old_data_start + new_size - data_end); 4058 } else { 4059 struct btrfs_disk_key disk_key; 4060 u64 offset; 4061 4062 btrfs_item_key(leaf, &disk_key, slot); 4063 4064 if (btrfs_disk_key_type(&disk_key) == BTRFS_EXTENT_DATA_KEY) { 4065 unsigned long ptr; 4066 struct btrfs_file_extent_item *fi; 4067 4068 fi = btrfs_item_ptr(leaf, slot, 4069 struct btrfs_file_extent_item); 4070 fi = (struct btrfs_file_extent_item *)( 4071 (unsigned long)fi - size_diff); 4072 4073 if (btrfs_file_extent_type(leaf, fi) == 4074 BTRFS_FILE_EXTENT_INLINE) { 4075 ptr = btrfs_item_ptr_offset(leaf, slot); 4076 memmove_extent_buffer(leaf, ptr, 4077 (unsigned long)fi, 4078 BTRFS_FILE_EXTENT_INLINE_DATA_START); 4079 } 4080 } 4081 4082 memmove_leaf_data(leaf, data_end + size_diff, data_end, 4083 old_data_start - data_end); 4084 4085 offset = btrfs_disk_key_offset(&disk_key); 4086 btrfs_set_disk_key_offset(&disk_key, offset + size_diff); 4087 btrfs_set_item_key(leaf, &disk_key, slot); 4088 if (slot == 0) 4089 fixup_low_keys(trans, path, &disk_key, 1); 4090 } 4091 4092 btrfs_set_item_size(leaf, slot, new_size); 4093 btrfs_mark_buffer_dirty(trans, leaf); 4094 4095 if (unlikely(btrfs_leaf_free_space(leaf) < 0)) { 4096 btrfs_print_leaf(leaf); 4097 BUG(); 4098 } 4099 } 4100 4101 /* 4102 * make the item pointed to by the path bigger, data_size is the added size. 4103 */ 4104 void btrfs_extend_item(struct btrfs_trans_handle *trans, 4105 const struct btrfs_path *path, u32 data_size) 4106 { 4107 int slot; 4108 struct extent_buffer *leaf; 4109 u32 nritems; 4110 unsigned int data_end; 4111 unsigned int old_data; 4112 unsigned int old_size; 4113 int i; 4114 4115 leaf = path->nodes[0]; 4116 4117 nritems = btrfs_header_nritems(leaf); 4118 data_end = leaf_data_end(leaf); 4119 4120 if (unlikely(btrfs_leaf_free_space(leaf) < data_size)) { 4121 btrfs_print_leaf(leaf); 4122 BUG(); 4123 } 4124 slot = path->slots[0]; 4125 old_data = btrfs_item_data_end(leaf, slot); 4126 4127 BUG_ON(slot < 0); 4128 if (unlikely(slot >= nritems)) { 4129 btrfs_print_leaf(leaf); 4130 btrfs_crit(leaf->fs_info, "slot %d too large, nritems %d", 4131 slot, nritems); 4132 BUG(); 4133 } 4134 4135 /* 4136 * item0..itemN ... dataN.offset..dataN.size .. data0.size 4137 */ 4138 /* first correct the data pointers */ 4139 for (i = slot; i < nritems; i++) { 4140 u32 ioff; 4141 4142 ioff = btrfs_item_offset(leaf, i); 4143 btrfs_set_item_offset(leaf, i, ioff - data_size); 4144 } 4145 4146 /* shift the data */ 4147 memmove_leaf_data(leaf, data_end - data_size, data_end, 4148 old_data - data_end); 4149 4150 old_size = btrfs_item_size(leaf, slot); 4151 btrfs_set_item_size(leaf, slot, old_size + data_size); 4152 btrfs_mark_buffer_dirty(trans, leaf); 4153 4154 if (unlikely(btrfs_leaf_free_space(leaf) < 0)) { 4155 btrfs_print_leaf(leaf); 4156 BUG(); 4157 } 4158 } 4159 4160 /* 4161 * Make space in the node before inserting one or more items. 4162 * 4163 * @trans: transaction handle 4164 * @root: root we are inserting items to 4165 * @path: points to the leaf/slot where we are going to insert new items 4166 * @batch: information about the batch of items to insert 4167 * 4168 * Main purpose is to save stack depth by doing the bulk of the work in a 4169 * function that doesn't call btrfs_search_slot 4170 */ 4171 static void setup_items_for_insert(struct btrfs_trans_handle *trans, 4172 struct btrfs_root *root, struct btrfs_path *path, 4173 const struct btrfs_item_batch *batch) 4174 { 4175 struct btrfs_fs_info *fs_info = root->fs_info; 4176 int i; 4177 u32 nritems; 4178 unsigned int data_end; 4179 struct btrfs_disk_key disk_key; 4180 struct extent_buffer *leaf; 4181 int slot; 4182 u32 total_size; 4183 4184 /* 4185 * Before anything else, update keys in the parent and other ancestors 4186 * if needed, then release the write locks on them, so that other tasks 4187 * can use them while we modify the leaf. 4188 */ 4189 if (path->slots[0] == 0) { 4190 btrfs_cpu_key_to_disk(&disk_key, &batch->keys[0]); 4191 fixup_low_keys(trans, path, &disk_key, 1); 4192 } 4193 btrfs_unlock_up_safe(path, 1); 4194 4195 leaf = path->nodes[0]; 4196 slot = path->slots[0]; 4197 4198 nritems = btrfs_header_nritems(leaf); 4199 data_end = leaf_data_end(leaf); 4200 total_size = batch->total_data_size + (batch->nr * sizeof(struct btrfs_item)); 4201 4202 if (unlikely(btrfs_leaf_free_space(leaf) < total_size)) { 4203 btrfs_print_leaf(leaf); 4204 btrfs_crit(fs_info, "not enough freespace need %u have %d", 4205 total_size, btrfs_leaf_free_space(leaf)); 4206 BUG(); 4207 } 4208 4209 if (slot != nritems) { 4210 unsigned int old_data = btrfs_item_data_end(leaf, slot); 4211 4212 if (unlikely(old_data < data_end)) { 4213 btrfs_print_leaf(leaf); 4214 btrfs_crit(fs_info, 4215 "item at slot %d with data offset %u beyond data end of leaf %u", 4216 slot, old_data, data_end); 4217 BUG(); 4218 } 4219 /* 4220 * item0..itemN ... dataN.offset..dataN.size .. data0.size 4221 */ 4222 /* first correct the data pointers */ 4223 for (i = slot; i < nritems; i++) { 4224 u32 ioff; 4225 4226 ioff = btrfs_item_offset(leaf, i); 4227 btrfs_set_item_offset(leaf, i, 4228 ioff - batch->total_data_size); 4229 } 4230 /* shift the items */ 4231 memmove_leaf_items(leaf, slot + batch->nr, slot, nritems - slot); 4232 4233 /* shift the data */ 4234 memmove_leaf_data(leaf, data_end - batch->total_data_size, 4235 data_end, old_data - data_end); 4236 data_end = old_data; 4237 } 4238 4239 /* setup the item for the new data */ 4240 for (i = 0; i < batch->nr; i++) { 4241 btrfs_cpu_key_to_disk(&disk_key, &batch->keys[i]); 4242 btrfs_set_item_key(leaf, &disk_key, slot + i); 4243 data_end -= batch->data_sizes[i]; 4244 btrfs_set_item_offset(leaf, slot + i, data_end); 4245 btrfs_set_item_size(leaf, slot + i, batch->data_sizes[i]); 4246 } 4247 4248 btrfs_set_header_nritems(leaf, nritems + batch->nr); 4249 btrfs_mark_buffer_dirty(trans, leaf); 4250 4251 if (unlikely(btrfs_leaf_free_space(leaf) < 0)) { 4252 btrfs_print_leaf(leaf); 4253 BUG(); 4254 } 4255 } 4256 4257 /* 4258 * Insert a new item into a leaf. 4259 * 4260 * @trans: Transaction handle. 4261 * @root: The root of the btree. 4262 * @path: A path pointing to the target leaf and slot. 4263 * @key: The key of the new item. 4264 * @data_size: The size of the data associated with the new key. 4265 */ 4266 void btrfs_setup_item_for_insert(struct btrfs_trans_handle *trans, 4267 struct btrfs_root *root, 4268 struct btrfs_path *path, 4269 const struct btrfs_key *key, 4270 u32 data_size) 4271 { 4272 struct btrfs_item_batch batch; 4273 4274 batch.keys = key; 4275 batch.data_sizes = &data_size; 4276 batch.total_data_size = data_size; 4277 batch.nr = 1; 4278 4279 setup_items_for_insert(trans, root, path, &batch); 4280 } 4281 4282 /* 4283 * Given a key and some data, insert items into the tree. 4284 * This does all the path init required, making room in the tree if needed. 4285 * 4286 * Returns: 0 on success 4287 * -EEXIST if the first key already exists 4288 * < 0 on other errors 4289 */ 4290 int btrfs_insert_empty_items(struct btrfs_trans_handle *trans, 4291 struct btrfs_root *root, 4292 struct btrfs_path *path, 4293 const struct btrfs_item_batch *batch) 4294 { 4295 int ret = 0; 4296 int slot; 4297 u32 total_size; 4298 4299 total_size = batch->total_data_size + (batch->nr * sizeof(struct btrfs_item)); 4300 ret = btrfs_search_slot(trans, root, &batch->keys[0], path, total_size, 1); 4301 if (ret == 0) 4302 return -EEXIST; 4303 if (ret < 0) 4304 return ret; 4305 4306 slot = path->slots[0]; 4307 BUG_ON(slot < 0); 4308 4309 setup_items_for_insert(trans, root, path, batch); 4310 return 0; 4311 } 4312 4313 /* 4314 * Given a key and some data, insert an item into the tree. 4315 * This does all the path init required, making room in the tree if needed. 4316 */ 4317 int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, 4318 const struct btrfs_key *cpu_key, void *data, 4319 u32 data_size) 4320 { 4321 int ret = 0; 4322 BTRFS_PATH_AUTO_FREE(path); 4323 struct extent_buffer *leaf; 4324 unsigned long ptr; 4325 4326 path = btrfs_alloc_path(); 4327 if (!path) 4328 return -ENOMEM; 4329 ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size); 4330 if (!ret) { 4331 leaf = path->nodes[0]; 4332 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); 4333 write_extent_buffer(leaf, data, ptr, data_size); 4334 btrfs_mark_buffer_dirty(trans, leaf); 4335 } 4336 return ret; 4337 } 4338 4339 /* 4340 * This function duplicates an item, giving 'new_key' to the new item. 4341 * It guarantees both items live in the same tree leaf and the new item is 4342 * contiguous with the original item. 4343 * 4344 * This allows us to split a file extent in place, keeping a lock on the leaf 4345 * the entire time. 4346 */ 4347 int btrfs_duplicate_item(struct btrfs_trans_handle *trans, 4348 struct btrfs_root *root, 4349 struct btrfs_path *path, 4350 const struct btrfs_key *new_key) 4351 { 4352 struct extent_buffer *leaf; 4353 int ret; 4354 u32 item_size; 4355 4356 leaf = path->nodes[0]; 4357 item_size = btrfs_item_size(leaf, path->slots[0]); 4358 ret = setup_leaf_for_split(trans, root, path, 4359 item_size + sizeof(struct btrfs_item)); 4360 if (ret) 4361 return ret; 4362 4363 path->slots[0]++; 4364 btrfs_setup_item_for_insert(trans, root, path, new_key, item_size); 4365 leaf = path->nodes[0]; 4366 memcpy_extent_buffer(leaf, 4367 btrfs_item_ptr_offset(leaf, path->slots[0]), 4368 btrfs_item_ptr_offset(leaf, path->slots[0] - 1), 4369 item_size); 4370 return 0; 4371 } 4372 4373 /* 4374 * delete the pointer from a given node. 4375 * 4376 * the tree should have been previously balanced so the deletion does not 4377 * empty a node. 4378 * 4379 * This is exported for use inside btrfs-progs, don't un-export it. 4380 */ 4381 int btrfs_del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, 4382 struct btrfs_path *path, int level, int slot) 4383 { 4384 struct extent_buffer *parent = path->nodes[level]; 4385 u32 nritems; 4386 int ret; 4387 4388 nritems = btrfs_header_nritems(parent); 4389 if (slot != nritems - 1) { 4390 if (level) { 4391 ret = btrfs_tree_mod_log_insert_move(parent, slot, 4392 slot + 1, nritems - slot - 1); 4393 if (unlikely(ret < 0)) { 4394 btrfs_abort_transaction(trans, ret); 4395 return ret; 4396 } 4397 } 4398 memmove_extent_buffer(parent, 4399 btrfs_node_key_ptr_offset(parent, slot), 4400 btrfs_node_key_ptr_offset(parent, slot + 1), 4401 sizeof(struct btrfs_key_ptr) * 4402 (nritems - slot - 1)); 4403 } else if (level) { 4404 ret = btrfs_tree_mod_log_insert_key(parent, slot, 4405 BTRFS_MOD_LOG_KEY_REMOVE); 4406 if (unlikely(ret < 0)) { 4407 btrfs_abort_transaction(trans, ret); 4408 return ret; 4409 } 4410 } 4411 4412 nritems--; 4413 btrfs_set_header_nritems(parent, nritems); 4414 if (nritems == 0 && parent == root->node) { 4415 BUG_ON(btrfs_header_level(root->node) != 1); 4416 /* just turn the root into a leaf and break */ 4417 btrfs_set_header_level(root->node, 0); 4418 } else if (slot == 0) { 4419 struct btrfs_disk_key disk_key; 4420 4421 btrfs_node_key(parent, &disk_key, 0); 4422 fixup_low_keys(trans, path, &disk_key, level + 1); 4423 } 4424 btrfs_mark_buffer_dirty(trans, parent); 4425 return 0; 4426 } 4427 4428 /* 4429 * a helper function to delete the leaf pointed to by path->slots[1] and 4430 * path->nodes[1]. 4431 * 4432 * This deletes the pointer in path->nodes[1] and frees the leaf 4433 * block extent. zero is returned if it all worked out, < 0 otherwise. 4434 * 4435 * The path must have already been setup for deleting the leaf, including 4436 * all the proper balancing. path->nodes[1] must be locked. 4437 */ 4438 static noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans, 4439 struct btrfs_root *root, 4440 struct btrfs_path *path, 4441 struct extent_buffer *leaf) 4442 { 4443 int ret; 4444 4445 WARN_ON(btrfs_header_generation(leaf) != trans->transid); 4446 ret = btrfs_del_ptr(trans, root, path, 1, path->slots[1]); 4447 if (ret < 0) 4448 return ret; 4449 4450 /* 4451 * btrfs_free_extent is expensive, we want to make sure we 4452 * aren't holding any locks when we call it 4453 */ 4454 btrfs_unlock_up_safe(path, 0); 4455 4456 root_sub_used_bytes(root); 4457 4458 refcount_inc(&leaf->refs); 4459 ret = btrfs_free_tree_block(trans, btrfs_root_id(root), leaf, 0, 1); 4460 free_extent_buffer_stale(leaf); 4461 if (ret < 0) 4462 btrfs_abort_transaction(trans, ret); 4463 4464 return ret; 4465 } 4466 /* 4467 * delete the item at the leaf level in path. If that empties 4468 * the leaf, remove it from the tree 4469 */ 4470 int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, 4471 struct btrfs_path *path, int slot, int nr) 4472 { 4473 struct btrfs_fs_info *fs_info = root->fs_info; 4474 struct extent_buffer *leaf; 4475 int ret = 0; 4476 int wret; 4477 u32 nritems; 4478 4479 leaf = path->nodes[0]; 4480 nritems = btrfs_header_nritems(leaf); 4481 4482 if (slot + nr != nritems) { 4483 const u32 last_off = btrfs_item_offset(leaf, slot + nr - 1); 4484 const int data_end = leaf_data_end(leaf); 4485 u32 dsize = 0; 4486 int i; 4487 4488 for (i = 0; i < nr; i++) 4489 dsize += btrfs_item_size(leaf, slot + i); 4490 4491 memmove_leaf_data(leaf, data_end + dsize, data_end, 4492 last_off - data_end); 4493 4494 for (i = slot + nr; i < nritems; i++) { 4495 u32 ioff; 4496 4497 ioff = btrfs_item_offset(leaf, i); 4498 btrfs_set_item_offset(leaf, i, ioff + dsize); 4499 } 4500 4501 memmove_leaf_items(leaf, slot, slot + nr, nritems - slot - nr); 4502 } 4503 btrfs_set_header_nritems(leaf, nritems - nr); 4504 nritems -= nr; 4505 4506 /* delete the leaf if we've emptied it */ 4507 if (nritems == 0) { 4508 if (leaf != root->node) { 4509 btrfs_clear_buffer_dirty(trans, leaf); 4510 ret = btrfs_del_leaf(trans, root, path, leaf); 4511 if (ret < 0) 4512 return ret; 4513 } 4514 } else { 4515 int used = leaf_space_used(leaf, 0, nritems); 4516 if (slot == 0) { 4517 struct btrfs_disk_key disk_key; 4518 4519 btrfs_item_key(leaf, &disk_key, 0); 4520 fixup_low_keys(trans, path, &disk_key, 1); 4521 } 4522 4523 /* 4524 * Try to delete the leaf if it is mostly empty. We do this by 4525 * trying to move all its items into its left and right neighbours. 4526 * If we can't move all the items, then we don't delete it - it's 4527 * not ideal, but future insertions might fill the leaf with more 4528 * items, or items from other leaves might be moved later into our 4529 * leaf due to deletions on those leaves. 4530 */ 4531 if (used < BTRFS_LEAF_DATA_SIZE(fs_info) / 3) { 4532 u32 min_push_space; 4533 4534 /* push_leaf_left fixes the path. 4535 * make sure the path still points to our leaf 4536 * for possible call to btrfs_del_ptr below 4537 */ 4538 slot = path->slots[1]; 4539 refcount_inc(&leaf->refs); 4540 /* 4541 * We want to be able to at least push one item to the 4542 * left neighbour leaf, and that's the first item. 4543 */ 4544 min_push_space = sizeof(struct btrfs_item) + 4545 btrfs_item_size(leaf, 0); 4546 wret = push_leaf_left(trans, root, path, 0, 4547 min_push_space, 1, (u32)-1); 4548 if (wret < 0 && wret != -ENOSPC) 4549 ret = wret; 4550 4551 if (path->nodes[0] == leaf && 4552 btrfs_header_nritems(leaf)) { 4553 /* 4554 * If we were not able to push all items from our 4555 * leaf to its left neighbour, then attempt to 4556 * either push all the remaining items to the 4557 * right neighbour or none. There's no advantage 4558 * in pushing only some items, instead of all, as 4559 * it's pointless to end up with a leaf having 4560 * too few items while the neighbours can be full 4561 * or nearly full. 4562 */ 4563 nritems = btrfs_header_nritems(leaf); 4564 min_push_space = leaf_space_used(leaf, 0, nritems); 4565 wret = push_leaf_right(trans, root, path, 0, 4566 min_push_space, 1, 0); 4567 if (wret < 0 && wret != -ENOSPC) 4568 ret = wret; 4569 } 4570 4571 if (btrfs_header_nritems(leaf) == 0) { 4572 path->slots[1] = slot; 4573 ret = btrfs_del_leaf(trans, root, path, leaf); 4574 free_extent_buffer(leaf); 4575 if (ret < 0) 4576 return ret; 4577 } else { 4578 /* if we're still in the path, make sure 4579 * we're dirty. Otherwise, one of the 4580 * push_leaf functions must have already 4581 * dirtied this buffer 4582 */ 4583 if (path->nodes[0] == leaf) 4584 btrfs_mark_buffer_dirty(trans, leaf); 4585 free_extent_buffer(leaf); 4586 } 4587 } else { 4588 btrfs_mark_buffer_dirty(trans, leaf); 4589 } 4590 } 4591 return ret; 4592 } 4593 4594 /* 4595 * A helper function to walk down the tree starting at min_key, and looking 4596 * for leaves that have a minimum transaction id. 4597 * This is used by the btree defrag code, and tree logging 4598 * 4599 * This does not cow, but it does stuff the starting key it finds back 4600 * into min_key, so you can call btrfs_search_slot with cow=1 on the 4601 * key and get a writable path. 4602 * 4603 * min_trans indicates the oldest transaction that you are interested 4604 * in walking through. Any nodes or leaves older than min_trans are 4605 * skipped over (without reading them). 4606 * 4607 * returns zero if something useful was found, < 0 on error and 1 if there 4608 * was nothing in the tree that matched the search criteria. 4609 */ 4610 int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key, 4611 struct btrfs_path *path, 4612 u64 min_trans) 4613 { 4614 struct extent_buffer *cur; 4615 int slot; 4616 int sret; 4617 u32 nritems; 4618 int level; 4619 int ret = 1; 4620 const bool keep_locks = path->keep_locks; 4621 4622 ASSERT(!path->nowait); 4623 ASSERT(path->lowest_level == 0); 4624 path->keep_locks = true; 4625 again: 4626 cur = btrfs_read_lock_root_node(root); 4627 level = btrfs_header_level(cur); 4628 WARN_ON(path->nodes[level]); 4629 path->nodes[level] = cur; 4630 path->locks[level] = BTRFS_READ_LOCK; 4631 4632 if (btrfs_header_generation(cur) < min_trans) { 4633 ret = 1; 4634 goto out; 4635 } 4636 while (1) { 4637 nritems = btrfs_header_nritems(cur); 4638 level = btrfs_header_level(cur); 4639 sret = btrfs_bin_search(cur, 0, min_key, &slot); 4640 if (sret < 0) { 4641 ret = sret; 4642 goto out; 4643 } 4644 4645 /* At level 0 we're done, setup the path and exit. */ 4646 if (level == 0) { 4647 if (slot >= nritems) 4648 goto find_next_key; 4649 ret = 0; 4650 path->slots[level] = slot; 4651 /* Save our key for returning back. */ 4652 btrfs_item_key_to_cpu(cur, min_key, slot); 4653 goto out; 4654 } 4655 if (sret && slot > 0) 4656 slot--; 4657 /* 4658 * check this node pointer against the min_trans parameters. 4659 * If it is too old, skip to the next one. 4660 */ 4661 while (slot < nritems) { 4662 u64 gen; 4663 4664 gen = btrfs_node_ptr_generation(cur, slot); 4665 if (gen < min_trans) { 4666 slot++; 4667 continue; 4668 } 4669 break; 4670 } 4671 find_next_key: 4672 /* 4673 * we didn't find a candidate key in this node, walk forward 4674 * and find another one 4675 */ 4676 path->slots[level] = slot; 4677 if (slot >= nritems) { 4678 sret = btrfs_find_next_key(root, path, min_key, level, 4679 min_trans); 4680 if (sret == 0) { 4681 btrfs_release_path(path); 4682 goto again; 4683 } else { 4684 goto out; 4685 } 4686 } 4687 cur = btrfs_read_node_slot(cur, slot); 4688 if (IS_ERR(cur)) { 4689 ret = PTR_ERR(cur); 4690 goto out; 4691 } 4692 4693 btrfs_tree_read_lock(cur); 4694 4695 path->locks[level - 1] = BTRFS_READ_LOCK; 4696 path->nodes[level - 1] = cur; 4697 unlock_up(path, level, 1, 0, NULL); 4698 } 4699 out: 4700 path->keep_locks = keep_locks; 4701 if (ret == 0) 4702 btrfs_unlock_up_safe(path, 1); 4703 return ret; 4704 } 4705 4706 /* 4707 * this is similar to btrfs_next_leaf, but does not try to preserve 4708 * and fixup the path. It looks for and returns the next key in the 4709 * tree based on the current path and the min_trans parameters. 4710 * 4711 * 0 is returned if another key is found, < 0 if there are any errors 4712 * and 1 is returned if there are no higher keys in the tree 4713 * 4714 * path->keep_locks should be set to true on the search made before 4715 * calling this function. 4716 */ 4717 int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path, 4718 struct btrfs_key *key, int level, u64 min_trans) 4719 { 4720 int slot; 4721 struct extent_buffer *c; 4722 4723 WARN_ON(!path->keep_locks && !path->skip_locking); 4724 while (level < BTRFS_MAX_LEVEL) { 4725 if (!path->nodes[level]) 4726 return 1; 4727 4728 slot = path->slots[level] + 1; 4729 c = path->nodes[level]; 4730 next: 4731 if (slot >= btrfs_header_nritems(c)) { 4732 int ret; 4733 int orig_lowest; 4734 struct btrfs_key cur_key; 4735 if (level + 1 >= BTRFS_MAX_LEVEL || 4736 !path->nodes[level + 1]) 4737 return 1; 4738 4739 if (path->locks[level + 1] || path->skip_locking) { 4740 level++; 4741 continue; 4742 } 4743 4744 slot = btrfs_header_nritems(c) - 1; 4745 if (level == 0) 4746 btrfs_item_key_to_cpu(c, &cur_key, slot); 4747 else 4748 btrfs_node_key_to_cpu(c, &cur_key, slot); 4749 4750 orig_lowest = path->lowest_level; 4751 btrfs_release_path(path); 4752 path->lowest_level = level; 4753 ret = btrfs_search_slot(NULL, root, &cur_key, path, 4754 0, 0); 4755 path->lowest_level = orig_lowest; 4756 if (ret < 0) 4757 return ret; 4758 4759 c = path->nodes[level]; 4760 slot = path->slots[level]; 4761 if (ret == 0) 4762 slot++; 4763 goto next; 4764 } 4765 4766 if (level == 0) 4767 btrfs_item_key_to_cpu(c, key, slot); 4768 else { 4769 u64 gen = btrfs_node_ptr_generation(c, slot); 4770 4771 if (gen < min_trans) { 4772 slot++; 4773 goto next; 4774 } 4775 btrfs_node_key_to_cpu(c, key, slot); 4776 } 4777 return 0; 4778 } 4779 return 1; 4780 } 4781 4782 int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path, 4783 u64 time_seq) 4784 { 4785 int slot; 4786 int level; 4787 struct extent_buffer *c; 4788 struct extent_buffer *next; 4789 struct btrfs_fs_info *fs_info = root->fs_info; 4790 struct btrfs_key key; 4791 bool need_commit_sem = false; 4792 u32 nritems; 4793 int ret; 4794 int i; 4795 4796 /* 4797 * The nowait semantics are used only for write paths, where we don't 4798 * use the tree mod log and sequence numbers. 4799 */ 4800 if (time_seq) 4801 ASSERT(!path->nowait); 4802 4803 nritems = btrfs_header_nritems(path->nodes[0]); 4804 if (nritems == 0) 4805 return 1; 4806 4807 btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1); 4808 again: 4809 level = 1; 4810 next = NULL; 4811 btrfs_release_path(path); 4812 4813 path->keep_locks = true; 4814 4815 if (time_seq) { 4816 ret = btrfs_search_old_slot(root, &key, path, time_seq); 4817 } else { 4818 if (path->need_commit_sem) { 4819 path->need_commit_sem = false; 4820 need_commit_sem = true; 4821 if (path->nowait) { 4822 if (!down_read_trylock(&fs_info->commit_root_sem)) { 4823 ret = -EAGAIN; 4824 goto done; 4825 } 4826 } else { 4827 down_read(&fs_info->commit_root_sem); 4828 } 4829 } 4830 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 4831 } 4832 path->keep_locks = false; 4833 4834 if (ret < 0) 4835 goto done; 4836 4837 nritems = btrfs_header_nritems(path->nodes[0]); 4838 /* 4839 * By releasing the path above we dropped all our locks. A balance 4840 * could have happened and 4841 * 4842 * 1. added more items after the previous last item 4843 * 2. deleted the previous last item 4844 * 4845 * So, check again here and advance the path if there are now more 4846 * items available. 4847 */ 4848 if (nritems > 0 && path->slots[0] <= nritems - 1) { 4849 if (ret == 0 && path->slots[0] != nritems - 1) { 4850 path->slots[0]++; 4851 goto done; 4852 } else if (ret > 0) { 4853 ret = 0; 4854 goto done; 4855 } 4856 } 4857 4858 while (level < BTRFS_MAX_LEVEL) { 4859 if (!path->nodes[level]) { 4860 ret = 1; 4861 goto done; 4862 } 4863 4864 slot = path->slots[level] + 1; 4865 c = path->nodes[level]; 4866 if (slot >= btrfs_header_nritems(c)) { 4867 level++; 4868 if (level == BTRFS_MAX_LEVEL) { 4869 ret = 1; 4870 goto done; 4871 } 4872 continue; 4873 } 4874 4875 4876 /* 4877 * Our current level is where we're going to start from, and to 4878 * make sure lockdep doesn't complain we need to drop our locks 4879 * and nodes from 0 to our current level. 4880 */ 4881 for (i = 0; i < level; i++) { 4882 if (path->locks[level]) { 4883 btrfs_tree_read_unlock(path->nodes[i]); 4884 path->locks[i] = 0; 4885 } 4886 free_extent_buffer(path->nodes[i]); 4887 path->nodes[i] = NULL; 4888 } 4889 4890 next = c; 4891 ret = read_block_for_search(root, path, &next, slot, &key); 4892 if (ret == -EAGAIN && !path->nowait) 4893 goto again; 4894 4895 if (ret < 0) { 4896 btrfs_release_path(path); 4897 goto done; 4898 } 4899 4900 if (!path->skip_locking) { 4901 ret = btrfs_try_tree_read_lock(next); 4902 if (!ret && path->nowait) { 4903 ret = -EAGAIN; 4904 goto done; 4905 } 4906 if (!ret && time_seq) { 4907 /* 4908 * If we don't get the lock, we may be racing 4909 * with push_leaf_left, holding that lock while 4910 * itself waiting for the leaf we've currently 4911 * locked. To solve this situation, we give up 4912 * on our lock and cycle. 4913 */ 4914 free_extent_buffer(next); 4915 btrfs_release_path(path); 4916 cond_resched(); 4917 goto again; 4918 } 4919 if (!ret) 4920 btrfs_tree_read_lock(next); 4921 } 4922 break; 4923 } 4924 path->slots[level] = slot; 4925 while (1) { 4926 level--; 4927 path->nodes[level] = next; 4928 path->slots[level] = 0; 4929 if (!path->skip_locking) 4930 path->locks[level] = BTRFS_READ_LOCK; 4931 if (!level) 4932 break; 4933 4934 ret = read_block_for_search(root, path, &next, 0, &key); 4935 if (ret == -EAGAIN && !path->nowait) 4936 goto again; 4937 4938 if (ret < 0) { 4939 btrfs_release_path(path); 4940 goto done; 4941 } 4942 4943 if (!path->skip_locking) { 4944 if (path->nowait) { 4945 if (!btrfs_try_tree_read_lock(next)) { 4946 ret = -EAGAIN; 4947 goto done; 4948 } 4949 } else { 4950 btrfs_tree_read_lock(next); 4951 } 4952 } 4953 } 4954 ret = 0; 4955 done: 4956 unlock_up(path, 0, 1, 0, NULL); 4957 if (need_commit_sem) { 4958 int ret2; 4959 4960 path->need_commit_sem = true; 4961 ret2 = finish_need_commit_sem_search(path); 4962 up_read(&fs_info->commit_root_sem); 4963 if (ret2) 4964 ret = ret2; 4965 } 4966 4967 return ret; 4968 } 4969 4970 int btrfs_next_old_item(struct btrfs_root *root, struct btrfs_path *path, u64 time_seq) 4971 { 4972 path->slots[0]++; 4973 if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) 4974 return btrfs_next_old_leaf(root, path, time_seq); 4975 return 0; 4976 } 4977 4978 /* 4979 * this uses btrfs_prev_leaf to walk backwards in the tree, and keeps 4980 * searching until it gets past min_objectid or finds an item of 'type' 4981 * 4982 * returns 0 if something is found, 1 if nothing was found and < 0 on error 4983 */ 4984 int btrfs_previous_item(struct btrfs_root *root, 4985 struct btrfs_path *path, u64 min_objectid, 4986 int type) 4987 { 4988 struct btrfs_key found_key; 4989 struct extent_buffer *leaf; 4990 u32 nritems; 4991 int ret; 4992 4993 while (1) { 4994 if (path->slots[0] == 0) { 4995 ret = btrfs_prev_leaf(root, path); 4996 if (ret != 0) 4997 return ret; 4998 } else { 4999 path->slots[0]--; 5000 } 5001 leaf = path->nodes[0]; 5002 nritems = btrfs_header_nritems(leaf); 5003 if (nritems == 0) 5004 return 1; 5005 if (path->slots[0] == nritems) 5006 path->slots[0]--; 5007 5008 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 5009 if (found_key.objectid < min_objectid) 5010 break; 5011 if (found_key.type == type) 5012 return 0; 5013 if (found_key.objectid == min_objectid && 5014 found_key.type < type) 5015 break; 5016 } 5017 return 1; 5018 } 5019 5020 /* 5021 * search in extent tree to find a previous Metadata/Data extent item with 5022 * min objecitd. 5023 * 5024 * returns 0 if something is found, 1 if nothing was found and < 0 on error 5025 */ 5026 int btrfs_previous_extent_item(struct btrfs_root *root, 5027 struct btrfs_path *path, u64 min_objectid) 5028 { 5029 struct btrfs_key found_key; 5030 struct extent_buffer *leaf; 5031 u32 nritems; 5032 int ret; 5033 5034 while (1) { 5035 if (path->slots[0] == 0) { 5036 ret = btrfs_prev_leaf(root, path); 5037 if (ret != 0) 5038 return ret; 5039 } else { 5040 path->slots[0]--; 5041 } 5042 leaf = path->nodes[0]; 5043 nritems = btrfs_header_nritems(leaf); 5044 if (nritems == 0) 5045 return 1; 5046 if (path->slots[0] == nritems) 5047 path->slots[0]--; 5048 5049 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 5050 if (found_key.objectid < min_objectid) 5051 break; 5052 if (found_key.type == BTRFS_EXTENT_ITEM_KEY || 5053 found_key.type == BTRFS_METADATA_ITEM_KEY) 5054 return 0; 5055 if (found_key.objectid == min_objectid && 5056 found_key.type < BTRFS_EXTENT_ITEM_KEY) 5057 break; 5058 } 5059 return 1; 5060 } 5061 5062 int __init btrfs_ctree_init(void) 5063 { 5064 btrfs_path_cachep = KMEM_CACHE(btrfs_path, 0); 5065 if (!btrfs_path_cachep) 5066 return -ENOMEM; 5067 return 0; 5068 } 5069 5070 void __cold btrfs_ctree_exit(void) 5071 { 5072 kmem_cache_destroy(btrfs_path_cachep); 5073 } 5074