// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007,2008 Oracle. All rights reserved.
 */

#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/rbtree.h>
#include <linux/mm.h>
#include <linux/error-injection.h>
#include "messages.h"
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "print-tree.h"
#include "locking.h"
#include "volumes.h"
#include "qgroup.h"
#include "tree-mod-log.h"
#include "tree-checker.h"
#include "fs.h"
#include "accessors.h"
#include "extent-tree.h"
#include "extent_io.h"
#include "relocation.h"
#include "file-item.h"

static struct kmem_cache *btrfs_path_cachep;

static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
		      *root, struct btrfs_path *path, int level);
static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		      const struct btrfs_key *ins_key, struct btrfs_path *path,
		      int data_size, bool extend);
static int push_node_left(struct btrfs_trans_handle *trans,
			  struct extent_buffer *dst,
			  struct extent_buffer *src, bool empty);
static int balance_node_right(struct btrfs_trans_handle *trans,
			      struct extent_buffer *dst_buf,
			      struct extent_buffer *src_buf);

/*
 * The leaf data grows from end-to-front in the node. This returns the address
 * of the start of the last item, which is the stop of the leaf data stack.
 */
static unsigned int leaf_data_end(const struct extent_buffer *leaf)
{
	u32 nr = btrfs_header_nritems(leaf);

	if (nr == 0)
		return BTRFS_LEAF_DATA_SIZE(leaf->fs_info);
	return btrfs_item_offset(leaf, nr - 1);
}

/*
 * Move data in a @leaf (using memmove, safe for overlapping ranges).
 *
 * @leaf: leaf that we're doing a memmove on
 * @dst_offset: item data offset we're moving to
 * @src_offset: item data offset we're moving from
 * @len: length of the data we're moving
 *
 * Wrapper around memmove_extent_buffer() that takes into account the header on
 * the leaf. The btrfs_item offsets start directly after the header, so we
 * have to adjust any offsets to account for the header in the leaf. This
 * handles that math to simplify the callers.
 */
static inline void memmove_leaf_data(const struct extent_buffer *leaf,
				     unsigned long dst_offset,
				     unsigned long src_offset,
				     unsigned long len)
{
	memmove_extent_buffer(leaf, btrfs_item_nr_offset(leaf, 0) + dst_offset,
			      btrfs_item_nr_offset(leaf, 0) + src_offset, len);
}

/*
 * Copy item data from @src into @dst at the given offsets.
 *
 * @dst: destination leaf that we're copying into
 * @src: source leaf that we're copying from
 * @dst_offset: item data offset we're copying to
 * @src_offset: item data offset we're copying from
 * @len: length of the data we're copying
 *
 * Wrapper around copy_extent_buffer() that takes into account the header on
 * the leaf. The btrfs_item offsets start directly after the header, so we
 * have to adjust any offsets to account for the header in the leaf. This
 * handles that math to simplify the callers.
 */
static inline void copy_leaf_data(const struct extent_buffer *dst,
				  const struct extent_buffer *src,
				  unsigned long dst_offset,
				  unsigned long src_offset, unsigned long len)
{
	copy_extent_buffer(dst, src, btrfs_item_nr_offset(dst, 0) + dst_offset,
			   btrfs_item_nr_offset(src, 0) + src_offset, len);
}
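/*
 * Illustration (not part of the original source): the layout these helpers
 * assume. Item headers grow forward from the end of the leaf header while
 * item data grows backward from the end of the block:
 *
 *	[header][item 0][item 1] ... free space ... [data 1][data 0]
 *
 * btrfs_item_offset() values are relative to the start of the item area
 * (right after the header), which is why memmove_leaf_data() and
 * copy_leaf_data() add btrfs_item_nr_offset(leaf, 0) to every offset, and
 * why leaf_data_end() of an empty leaf is the full BTRFS_LEAF_DATA_SIZE().
 */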
/*
 * Move items in a @leaf (using memmove).
 *
 * @leaf: leaf that we're moving the items in
 * @dst_item: the item nr we're copying into
 * @src_item: the item nr we're copying from
 * @nr_items: the number of items to copy
 *
 * Wrapper around memmove_extent_buffer() that does the math to get the
 * appropriate offsets into the leaf from the item numbers.
 */
static inline void memmove_leaf_items(const struct extent_buffer *leaf,
				      int dst_item, int src_item, int nr_items)
{
	memmove_extent_buffer(leaf, btrfs_item_nr_offset(leaf, dst_item),
			      btrfs_item_nr_offset(leaf, src_item),
			      nr_items * sizeof(struct btrfs_item));
}

/*
 * Copy items from @src into @dst at the given item numbers.
 *
 * @dst: destination leaf for the items
 * @src: source leaf for the items
 * @dst_item: the item nr we're copying into
 * @src_item: the item nr we're copying from
 * @nr_items: the number of items to copy
 *
 * Wrapper around copy_extent_buffer() that does the math to get the
 * appropriate offsets into the leaf from the item numbers.
 */
static inline void copy_leaf_items(const struct extent_buffer *dst,
				   const struct extent_buffer *src,
				   int dst_item, int src_item, int nr_items)
{
	copy_extent_buffer(dst, src, btrfs_item_nr_offset(dst, dst_item),
			   btrfs_item_nr_offset(src, src_item),
			   nr_items * sizeof(struct btrfs_item));
}

struct btrfs_path *btrfs_alloc_path(void)
{
	might_sleep();

	return kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS);
}

/* this also releases the path */
void btrfs_free_path(struct btrfs_path *p)
{
	if (!p)
		return;
	btrfs_release_path(p);
	kmem_cache_free(btrfs_path_cachep, p);
}

/*
 * path release drops references on the extent buffers in the path
 * and it drops any locks held by this path
 *
 * It is safe to call this on paths that have no locks or extent buffers held.
 */
noinline void btrfs_release_path(struct btrfs_path *p)
{
	int i;

	for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
		p->slots[i] = 0;
		if (!p->nodes[i])
			continue;
		if (p->locks[i]) {
			btrfs_tree_unlock_rw(p->nodes[i], p->locks[i]);
			p->locks[i] = 0;
		}
		free_extent_buffer(p->nodes[i]);
		p->nodes[i] = NULL;
	}
}

/*
 * Safely gets a reference on the root node of a tree. A lock
 * is not taken, so a concurrent writer may put a different node
 * at the root of the tree. See btrfs_lock_root_node for the
 * looping required.
 *
 * The extent buffer returned by this has a reference taken, so
 * it won't disappear. It may stop being the root of the tree
 * at any time because there are no locks held.
 */
struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
{
	struct extent_buffer *eb;

	while (1) {
		rcu_read_lock();
		eb = rcu_dereference(root->node);

		/*
		 * RCU really hurts here, we could free up the root node because
		 * it was COWed but we may not get the new root node yet so do
		 * the inc_not_zero dance and if it doesn't work then
		 * synchronize_rcu and try again.
		 */
		if (refcount_inc_not_zero(&eb->refs)) {
			rcu_read_unlock();
			break;
		}
		rcu_read_unlock();
		synchronize_rcu();
	}
	return eb;
}
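/*
 * Example scenario (illustrative, not from the original source): a writer
 * COWs the root and drops the last reference to the old node between our
 * rcu_dereference() and refcount_inc_not_zero(). The refcount bump then
 * fails, we wait out the grace period with synchronize_rcu() and retry,
 * picking up the new root pointer on the next iteration.
 */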
/*
 * Cowonly roots (not-shareable trees, everything not subvolume or reloc
 * roots) just get put onto a simple dirty list. The transaction walks this
 * list to make sure they get properly updated on disk.
 */
static void add_root_to_dirty_list(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;

	if (test_bit(BTRFS_ROOT_DIRTY, &root->state) ||
	    !test_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state))
		return;

	spin_lock(&fs_info->trans_lock);
	if (!test_and_set_bit(BTRFS_ROOT_DIRTY, &root->state)) {
		/* Want the extent tree to be the last on the list */
		if (btrfs_root_id(root) == BTRFS_EXTENT_TREE_OBJECTID)
			list_move_tail(&root->dirty_list,
				       &fs_info->dirty_cowonly_roots);
		else
			list_move(&root->dirty_list,
				  &fs_info->dirty_cowonly_roots);
	}
	spin_unlock(&fs_info->trans_lock);
}

/*
 * Used by snapshot creation to make a copy of a root for a tree with
 * a given objectid. The buffer with the new root node is returned in
 * cow_ret, and this function returns zero on success or a negative error
 * code.
 */
int btrfs_copy_root(struct btrfs_trans_handle *trans,
		    struct btrfs_root *root,
		    struct extent_buffer *buf,
		    struct extent_buffer **cow_ret, u64 new_root_objectid)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *cow;
	int ret = 0;
	int level;
	struct btrfs_disk_key disk_key;
	const bool is_reloc_root = (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID);
	u64 reloc_src_root = 0;

	WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
		trans->transid != fs_info->running_transaction->transid);
	WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
		trans->transid != btrfs_get_root_last_trans(root));

	level = btrfs_header_level(buf);
	if (level == 0)
		btrfs_item_key(buf, &disk_key, 0);
	else
		btrfs_node_key(buf, &disk_key, 0);

	if (is_reloc_root)
		reloc_src_root = btrfs_header_owner(buf);
	cow = btrfs_alloc_tree_block(trans, root, 0, new_root_objectid,
				     &disk_key, level, buf->start, 0,
				     reloc_src_root, BTRFS_NESTING_NEW_ROOT);
	if (IS_ERR(cow))
		return PTR_ERR(cow);

	copy_extent_buffer_full(cow, buf);
	btrfs_set_header_bytenr(cow, cow->start);
	btrfs_set_header_generation(cow, trans->transid);
	btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
				     BTRFS_HEADER_FLAG_RELOC);
	if (is_reloc_root)
		btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
	else
		btrfs_set_header_owner(cow, new_root_objectid);

	write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid);

	if (unlikely(btrfs_header_generation(buf) > trans->transid)) {
		btrfs_tree_unlock(cow);
		free_extent_buffer(cow);
		ret = -EUCLEAN;
		btrfs_abort_transaction(trans, ret);
		return ret;
	}

	ret = btrfs_inc_ref(trans, root, cow, is_reloc_root);
	if (unlikely(ret)) {
		btrfs_abort_transaction(trans, ret);
		btrfs_tree_unlock(cow);
		free_extent_buffer(cow);
		return ret;
	}

	btrfs_mark_buffer_dirty(trans, cow);
	*cow_ret = cow;
	return 0;
}
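/*
 * Sketch of a caller (hypothetical, error handling and root item setup
 * omitted): snapshot creation copies the locked source root into a block
 * owned by the new tree:
 *
 *	struct extent_buffer *new_root;
 *	int ret;
 *
 *	ret = btrfs_copy_root(trans, root, root->node, &new_root, objectid);
 *	if (ret == 0) {
 *		// new_root comes back locked and marked dirty
 *		btrfs_tree_unlock(new_root);
 *		free_extent_buffer(new_root);
 *	}
 */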
/*
 * Check if the tree block can be shared by multiple trees.
 */
bool btrfs_block_can_be_shared(const struct btrfs_trans_handle *trans,
			       const struct btrfs_root *root,
			       const struct extent_buffer *buf)
{
	const u64 buf_gen = btrfs_header_generation(buf);

	/*
	 * Tree blocks not in shareable trees and tree roots are never shared.
	 * If a block was allocated after the last snapshot and the block was
	 * not allocated by tree relocation, we know the block is not shared.
	 */

	if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
		return false;

	if (buf == root->node)
		return false;

	if (buf_gen > btrfs_root_last_snapshot(&root->root_item) &&
	    !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC))
		return false;

	if (buf != root->commit_root)
		return true;

	/*
	 * An extent buffer that used to be the commit root may still be shared
	 * because the tree height may have increased and it became a child of a
	 * higher level root. This can happen when snapshotting a subvolume
	 * created in the current transaction.
	 */
	if (buf_gen == trans->transid)
		return true;

	return false;
}
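/*
 * Example (illustrative): in a subvolume last snapshotted at generation
 * 100, a non-root block written at generation 90 may still be referenced
 * by the snapshot, so this returns true for it; a block COWed at
 * generation 101 (after the last snapshot, without the RELOC flag) can
 * only belong to this tree, so it returns false.
 */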
static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
				       struct btrfs_root *root,
				       struct extent_buffer *buf,
				       struct extent_buffer *cow,
				       int *last_ref)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 refs;
	u64 owner;
	u64 flags;
	int ret;
	const bool is_reloc_root = (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID);

	/*
	 * Backrefs update rules:
	 *
	 * Always use full backrefs for extent pointers in tree blocks
	 * allocated by tree relocation.
	 *
	 * If a shared tree block is no longer referenced by its owner
	 * tree (btrfs_header_owner(buf) == root->root_key.objectid),
	 * use full backrefs for extent pointers in tree block.
	 *
	 * If a tree block is being relocated
	 * (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID),
	 * use full backrefs for extent pointers in tree block.
	 * The reason for this is some operations (such as drop tree)
	 * are only allowed for blocks that use full backrefs.
	 */

	if (btrfs_block_can_be_shared(trans, root, buf)) {
		ret = btrfs_lookup_extent_info(trans, fs_info, buf->start,
					       btrfs_header_level(buf), 1,
					       &refs, &flags, NULL);
		if (ret)
			return ret;
		if (unlikely(refs == 0)) {
			btrfs_crit(fs_info,
		"found 0 references for tree block at bytenr %llu level %d root %llu",
				   buf->start, btrfs_header_level(buf),
				   btrfs_root_id(root));
			ret = -EUCLEAN;
			btrfs_abort_transaction(trans, ret);
			return ret;
		}
	} else {
		refs = 1;
		if (is_reloc_root || btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
			flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
		else
			flags = 0;
	}

	owner = btrfs_header_owner(buf);
	if (unlikely(owner == BTRFS_TREE_RELOC_OBJECTID &&
		     !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))) {
		btrfs_crit(fs_info,
"found tree block at bytenr %llu level %d root %llu refs %llu flags %llx without full backref flag set",
			   buf->start, btrfs_header_level(buf),
			   btrfs_root_id(root), refs, flags);
		ret = -EUCLEAN;
		btrfs_abort_transaction(trans, ret);
		return ret;
	}

	if (refs > 1) {
		if ((owner == btrfs_root_id(root) || is_reloc_root) &&
		    !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
			ret = btrfs_inc_ref(trans, root, buf, true);
			if (ret)
				return ret;

			if (is_reloc_root) {
				ret = btrfs_dec_ref(trans, root, buf, false);
				if (ret)
					return ret;
				ret = btrfs_inc_ref(trans, root, cow, true);
				if (ret)
					return ret;
			}
			ret = btrfs_set_disk_extent_flags(trans, buf,
							  BTRFS_BLOCK_FLAG_FULL_BACKREF);
			if (ret)
				return ret;
		} else {
			ret = btrfs_inc_ref(trans, root, cow, is_reloc_root);
			if (ret)
				return ret;
		}
	} else {
		if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
			ret = btrfs_inc_ref(trans, root, cow, is_reloc_root);
			if (ret)
				return ret;
			ret = btrfs_dec_ref(trans, root, buf, true);
			if (ret)
				return ret;
		}
		btrfs_clear_buffer_dirty(trans, buf);
		*last_ref = 1;
	}
	return 0;
}

/*
 * Does the dirty work in cow of a single block. The parent block (if
 * supplied) is updated to point to the new cow copy. The new buffer is
 * marked dirty and returned locked. If you modify the block it needs to be
 * marked dirty again.
 *
 * search_start -- an allocation hint for the new block
 *
 * empty_size -- a hint that you plan on doing more cow. This is the size in
 * bytes the allocator should try to find free next to the block it returns.
 * This is just a hint and may be ignored by the allocator.
 */
int btrfs_force_cow_block(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root,
			  struct extent_buffer *buf,
			  struct extent_buffer *parent, int parent_slot,
			  struct extent_buffer **cow_ret,
			  u64 search_start, u64 empty_size,
			  enum btrfs_lock_nesting nest)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_disk_key disk_key;
	struct extent_buffer *cow;
	int level, ret;
	int last_ref = 0;
	int unlock_orig = 0;
	u64 parent_start = 0;
	u64 reloc_src_root = 0;

	if (*cow_ret == buf)
		unlock_orig = 1;

	btrfs_assert_tree_write_locked(buf);

	WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
		trans->transid != fs_info->running_transaction->transid);
	WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
		trans->transid != btrfs_get_root_last_trans(root));

	level = btrfs_header_level(buf);

	if (level == 0)
		btrfs_item_key(buf, &disk_key, 0);
	else
		btrfs_node_key(buf, &disk_key, 0);

	if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID) {
		if (parent)
			parent_start = parent->start;
		reloc_src_root = btrfs_header_owner(buf);
	}
	cow = btrfs_alloc_tree_block(trans, root, parent_start,
				     btrfs_root_id(root), &disk_key, level,
				     search_start, empty_size, reloc_src_root, nest);
	if (IS_ERR(cow))
		return PTR_ERR(cow);

	/* cow is set to blocking by btrfs_init_new_buffer */

	copy_extent_buffer_full(cow, buf);
	btrfs_set_header_bytenr(cow, cow->start);
	btrfs_set_header_generation(cow, trans->transid);
	btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
				     BTRFS_HEADER_FLAG_RELOC);
	if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID)
		btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
	else
		btrfs_set_header_owner(cow, btrfs_root_id(root));

	write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid);

	ret = update_ref_for_cow(trans, root, buf, cow, &last_ref);
	if (unlikely(ret)) {
		btrfs_abort_transaction(trans, ret);
		goto error_unlock_cow;
	}

	if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) {
		ret = btrfs_reloc_cow_block(trans, root, buf, cow);
		if (unlikely(ret)) {
			btrfs_abort_transaction(trans, ret);
			goto error_unlock_cow;
		}
	}

	if (buf == root->node) {
		WARN_ON(parent && parent != buf);
		if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID ||
		    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
			parent_start = buf->start;

		ret = btrfs_tree_mod_log_insert_root(root->node, cow, true);
		if (unlikely(ret < 0)) {
			btrfs_abort_transaction(trans, ret);
			goto error_unlock_cow;
		}
		refcount_inc(&cow->refs);
		rcu_assign_pointer(root->node, cow);

		ret = btrfs_free_tree_block(trans, btrfs_root_id(root), buf,
					    parent_start, last_ref);
		free_extent_buffer(buf);
		add_root_to_dirty_list(root);
		if (unlikely(ret < 0)) {
			btrfs_abort_transaction(trans, ret);
			goto error_unlock_cow;
		}
	} else {
		WARN_ON(trans->transid != btrfs_header_generation(parent));
		ret = btrfs_tree_mod_log_insert_key(parent, parent_slot,
						    BTRFS_MOD_LOG_KEY_REPLACE);
		if (unlikely(ret)) {
			btrfs_abort_transaction(trans, ret);
			goto error_unlock_cow;
		}
		btrfs_set_node_blockptr(parent, parent_slot, cow->start);
		btrfs_set_node_ptr_generation(parent, parent_slot,
					      trans->transid);
		btrfs_mark_buffer_dirty(trans, parent);
		if (last_ref) {
			ret = btrfs_tree_mod_log_free_eb(buf);
			if (unlikely(ret)) {
				btrfs_abort_transaction(trans, ret);
				goto error_unlock_cow;
			}
		}
		ret = btrfs_free_tree_block(trans, btrfs_root_id(root), buf,
					    parent_start, last_ref);
		if (unlikely(ret < 0)) {
			btrfs_abort_transaction(trans, ret);
			goto error_unlock_cow;
		}
	}

	trace_btrfs_cow_block(root, buf, cow);
	if (unlock_orig)
		btrfs_tree_unlock(buf);
	free_extent_buffer_stale(buf);
	btrfs_mark_buffer_dirty(trans, cow);

	btrfs_inhibit_eb_writeback(trans, cow);

	*cow_ret = cow;
	return 0;

error_unlock_cow:
	btrfs_tree_unlock(cow);
	free_extent_buffer(cow);
	return ret;
}

static inline bool should_cow_block(struct btrfs_trans_handle *trans,
				    const struct btrfs_root *root,
				    struct extent_buffer *buf)
{
	if (btrfs_is_testing(root->fs_info))
		return false;

	/*
	 * We do not need to cow a block if
	 * 1) this block is not created or changed in this transaction;
	 * 2) this block does not belong to TREE_RELOC tree;
	 * 3) the root is not forced COW.
	 *
	 * What is forced COW:
	 *    when we create snapshot during committing the transaction,
	 *    after we've finished copying src root, we must COW the shared
	 *    block to ensure the metadata consistency.
	 */

	if (btrfs_header_generation(buf) != trans->transid)
		return true;

	if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN))
		return true;

	/* Ensure we can see the FORCE_COW bit. */
	smp_mb__before_atomic();
	if (test_bit(BTRFS_ROOT_FORCE_COW, &root->state))
		return true;

	if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID)
		return false;

	if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC))
		return true;

	btrfs_inhibit_eb_writeback(trans, buf);
	return false;
}
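/*
 * Example (illustrative): a block created earlier in this same transaction
 * (header generation == trans->transid) that has not yet been written out,
 * carries no RELOC flag and whose root is not forced to COW can be modified
 * in place, so should_cow_block() returns false and btrfs_cow_block() below
 * hands back @buf unchanged.
 */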
/*
 * COWs a single block, see btrfs_force_cow_block() for the real work.
 * This version of it has extra checks so that a block isn't COWed more than
 * once per transaction, as long as it hasn't been written yet.
 */
int btrfs_cow_block(struct btrfs_trans_handle *trans,
		    struct btrfs_root *root, struct extent_buffer *buf,
		    struct extent_buffer *parent, int parent_slot,
		    struct extent_buffer **cow_ret,
		    enum btrfs_lock_nesting nest)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 search_start;

	if (unlikely(test_bit(BTRFS_ROOT_DELETING, &root->state))) {
		btrfs_abort_transaction(trans, -EUCLEAN);
		btrfs_crit(fs_info,
			   "attempt to COW block %llu on root %llu that is being deleted",
			   buf->start, btrfs_root_id(root));
		return -EUCLEAN;
	}

	/*
	 * COWing must happen through a running transaction, which always
	 * matches the current fs generation (it's a transaction with a state
	 * less than TRANS_STATE_UNBLOCKED). If it doesn't, then turn the fs
	 * into error state to prevent the commit of any transaction.
	 */
	if (unlikely(trans->transaction != fs_info->running_transaction ||
		     trans->transid != fs_info->generation)) {
		btrfs_abort_transaction(trans, -EUCLEAN);
		btrfs_crit(fs_info,
"unexpected transaction when attempting to COW block %llu on root %llu, transaction %llu running transaction %llu fs generation %llu",
			   buf->start, btrfs_root_id(root), trans->transid,
			   fs_info->running_transaction->transid,
			   fs_info->generation);
		return -EUCLEAN;
	}

	if (!should_cow_block(trans, root, buf)) {
		*cow_ret = buf;
		return 0;
	}

	search_start = round_down(buf->start, SZ_1G);

	/*
	 * Before CoWing this block for later modification, check if it's
	 * the subtree root and do the delayed subtree trace if needed.
	 *
	 * Also we don't care about the error, as it's handled internally.
	 */
	btrfs_qgroup_trace_subtree_after_cow(trans, root, buf);
	return btrfs_force_cow_block(trans, root, buf, parent, parent_slot,
				     cow_ret, search_start, 0, nest);
}
ALLOW_ERROR_INJECTION(btrfs_cow_block, ERRNO);

/*
 * Same as comp_keys, only with two btrfs_keys.
 */
int __pure btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2)
{
	if (k1->objectid > k2->objectid)
		return 1;
	if (k1->objectid < k2->objectid)
		return -1;
	if (k1->type > k2->type)
		return 1;
	if (k1->type < k2->type)
		return -1;
	if (k1->offset > k2->offset)
		return 1;
	if (k1->offset < k2->offset)
		return -1;
	return 0;
}
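/*
 * Example (illustrative): keys order first by objectid, then type, then
 * offset, so (256, BTRFS_EXTENT_DATA_KEY, 0) sorts before
 * (256, BTRFS_EXTENT_DATA_KEY, 4096), and both sort before any key with
 * objectid 257.
 */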
/*
 * Search for a key in the given extent_buffer.
 *
 * The lower boundary for the search is specified by the slot number @first_slot.
 * Use a value of 0 to search over the whole extent buffer. Works for both
 * leaves and nodes.
 *
 * The slot in the extent buffer is returned via @slot. If the key exists in the
 * extent buffer, then @slot will point to the slot where the key is, otherwise
 * it points to the slot where you would insert the key.
 *
 * Slot may point to the total number of items (i.e. one position beyond the last
 * key) if the key is bigger than the last key in the extent buffer.
 */
int btrfs_bin_search(const struct extent_buffer *eb, int first_slot,
		     const struct btrfs_key *key, int *slot)
{
	unsigned long p;
	int item_size;
	/*
	 * Use unsigned types for the low and high slots, so that we get a more
	 * efficient division in the search loop below.
	 */
	u32 low = first_slot;
	u32 high = btrfs_header_nritems(eb);
	int ret;
	const int key_size = sizeof(struct btrfs_disk_key);

	if (unlikely(low > high)) {
		btrfs_err(eb->fs_info,
			  "%s: low (%u) > high (%u) eb %llu owner %llu level %d",
			  __func__, low, high, eb->start,
			  btrfs_header_owner(eb), btrfs_header_level(eb));
		return -EINVAL;
	}

	if (btrfs_header_level(eb) == 0) {
		p = offsetof(struct btrfs_leaf, items);
		item_size = sizeof(struct btrfs_item);
	} else {
		p = offsetof(struct btrfs_node, ptrs);
		item_size = sizeof(struct btrfs_key_ptr);
	}

	while (low < high) {
		const int unit_size = eb->folio_size;
		unsigned long oif;
		unsigned long offset;
		struct btrfs_disk_key *tmp;
		struct btrfs_disk_key unaligned;
		u32 mid;

		mid = (low + high) / 2;
		offset = p + mid * item_size;
		oif = get_eb_offset_in_folio(eb, offset);

		if (oif + key_size <= unit_size) {
			const unsigned long idx = get_eb_folio_index(eb, offset);
			char *kaddr = folio_address(eb->folios[idx]);

			tmp = (struct btrfs_disk_key *)(kaddr + oif);
		} else {
			read_extent_buffer(eb, &unaligned, offset, key_size);
			tmp = &unaligned;
		}

		ret = btrfs_comp_keys(tmp, key);

		if (ret < 0)
			low = mid + 1;
		else if (ret > 0)
			high = mid;
		else {
			*slot = mid;
			return 0;
		}
	}
	*slot = low;
	return 1;
}

static void root_add_used_bytes(struct btrfs_root *root)
{
	spin_lock(&root->accounting_lock);
	btrfs_set_root_used(&root->root_item,
			    btrfs_root_used(&root->root_item) + root->fs_info->nodesize);
	spin_unlock(&root->accounting_lock);
}

static void root_sub_used_bytes(struct btrfs_root *root)
{
	spin_lock(&root->accounting_lock);
	btrfs_set_root_used(&root->root_item,
			    btrfs_root_used(&root->root_item) - root->fs_info->nodesize);
	spin_unlock(&root->accounting_lock);
}

/*
 * Given a node and slot number, this reads the block it points to. The
 * extent buffer is returned with a reference taken (but unlocked).
 */
struct extent_buffer *btrfs_read_node_slot(struct extent_buffer *parent,
					   int slot)
{
	int level = btrfs_header_level(parent);
	struct btrfs_tree_parent_check check = { 0 };

	if (slot < 0 || slot >= btrfs_header_nritems(parent))
		return ERR_PTR(-ENOENT);

	ASSERT(level);

	check.level = level - 1;
	check.transid = btrfs_node_ptr_generation(parent, slot);
	check.owner_root = btrfs_header_owner(parent);
	check.has_first_key = true;
	btrfs_node_key_to_cpu(parent, &check.first_key, slot);

	return read_tree_block(parent->fs_info, btrfs_node_blockptr(parent, slot),
			       &check);
}
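/*
 * Illustration (not in the original source): the btrfs_tree_parent_check
 * filled in above lets the read path verify the child block against what
 * the parent expects - level, pointer generation, owner root and first
 * key - so a stale or misplaced block is rejected before it is used.
 */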
/*
 * Promote a child node to become the new tree root.
 *
 * @trans: Transaction handle
 * @root: Tree root structure to update
 * @path: Path holding nodes and locks
 * @level: Level of the parent (old root)
 * @parent: The parent (old root) with exactly one item
 *
 * This helper is called during rebalancing when the root node contains only
 * a single item (nritems == 1). We can reduce the tree height by promoting
 * that child to become the new root and freeing the old root node. The path
 * locks and references are updated accordingly.
 *
 * Return: 0 on success, negative errno on failure. The transaction is
 * aborted on critical errors.
 */
static int promote_child_to_root(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root, struct btrfs_path *path,
				 int level, struct extent_buffer *parent)
{
	struct extent_buffer *child;
	int ret;

	ASSERT(btrfs_header_nritems(parent) == 1);

	child = btrfs_read_node_slot(parent, 0);
	if (IS_ERR(child))
		return PTR_ERR(child);

	btrfs_tree_lock(child);
	ret = btrfs_cow_block(trans, root, child, parent, 0, &child, BTRFS_NESTING_COW);
	if (ret) {
		btrfs_tree_unlock(child);
		free_extent_buffer(child);
		return ret;
	}

	ret = btrfs_tree_mod_log_insert_root(root->node, child, true);
	if (unlikely(ret < 0)) {
		btrfs_tree_unlock(child);
		free_extent_buffer(child);
		btrfs_abort_transaction(trans, ret);
		return ret;
	}
	rcu_assign_pointer(root->node, child);

	add_root_to_dirty_list(root);
	btrfs_tree_unlock(child);

	path->locks[level] = 0;
	path->nodes[level] = NULL;
	btrfs_clear_buffer_dirty(trans, parent);
	btrfs_tree_unlock(parent);
	/* Once for the path. */
	free_extent_buffer(parent);

	root_sub_used_bytes(root);
	ret = btrfs_free_tree_block(trans, btrfs_root_id(root), parent, 0, 1);
	/* Once for the root ptr. */
	free_extent_buffer_stale(parent);
	if (unlikely(ret < 0)) {
		btrfs_abort_transaction(trans, ret);
		return ret;
	}

	return 0;
}
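/*
 * Example (illustrative): for a two-level tree whose root holds a single
 * key pointer, promote_child_to_root() COWs that only child, points
 * root->node at it and frees the old root, shrinking the tree height by
 * one level.
 */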
/*
 * Node level balancing, used to make sure nodes are in proper order for
 * item deletion. We balance from the top down, so we have to make sure
 * that a deletion won't leave a node completely empty later on.
 */
static noinline int balance_level(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root,
				  struct btrfs_path *path, int level)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *right = NULL;
	struct extent_buffer *mid;
	struct extent_buffer *left = NULL;
	struct extent_buffer *parent = NULL;
	int ret = 0;
	int wret;
	int pslot;
	int orig_slot = path->slots[level];
	u64 orig_ptr;

	ASSERT(level > 0);

	mid = path->nodes[level];

	WARN_ON(path->locks[level] != BTRFS_WRITE_LOCK);
	WARN_ON(btrfs_header_generation(mid) != trans->transid);

	orig_ptr = btrfs_node_blockptr(mid, orig_slot);

	if (level < BTRFS_MAX_LEVEL - 1) {
		parent = path->nodes[level + 1];
		pslot = path->slots[level + 1];
	}

	/*
	 * Deal with the case where there is only one pointer in the root
	 * by promoting the node below to a root.
	 */
	if (!parent) {
		if (btrfs_header_nritems(mid) != 1)
			return 0;

		return promote_child_to_root(trans, root, path, level, mid);
	}
	if (btrfs_header_nritems(mid) >
	    BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 4)
		return 0;

	if (pslot) {
		left = btrfs_read_node_slot(parent, pslot - 1);
		if (IS_ERR(left)) {
			ret = PTR_ERR(left);
			left = NULL;
			goto out;
		}

		btrfs_tree_lock_nested(left, BTRFS_NESTING_LEFT);
		wret = btrfs_cow_block(trans, root, left,
				       parent, pslot - 1, &left,
				       BTRFS_NESTING_LEFT_COW);
		if (wret) {
			ret = wret;
			goto out;
		}
	}

	if (pslot + 1 < btrfs_header_nritems(parent)) {
		right = btrfs_read_node_slot(parent, pslot + 1);
		if (IS_ERR(right)) {
			ret = PTR_ERR(right);
			right = NULL;
			goto out;
		}

		btrfs_tree_lock_nested(right, BTRFS_NESTING_RIGHT);
		wret = btrfs_cow_block(trans, root, right,
				       parent, pslot + 1, &right,
				       BTRFS_NESTING_RIGHT_COW);
		if (wret) {
			ret = wret;
			goto out;
		}
	}

	/* first, try to make some room in the middle buffer */
	if (left) {
		orig_slot += btrfs_header_nritems(left);
		wret = push_node_left(trans, left, mid, 1);
		if (wret < 0)
			ret = wret;
	}

	/*
	 * then try to empty the right most buffer into the middle
	 */
	if (right) {
		wret = push_node_left(trans, mid, right, 1);
		if (wret < 0 && wret != -ENOSPC)
			ret = wret;
		if (btrfs_header_nritems(right) == 0) {
			btrfs_clear_buffer_dirty(trans, right);
			btrfs_tree_unlock(right);
			ret = btrfs_del_ptr(trans, root, path, level + 1, pslot + 1);
			if (ret < 0) {
				free_extent_buffer_stale(right);
				right = NULL;
				goto out;
			}
			root_sub_used_bytes(root);
			ret = btrfs_free_tree_block(trans, btrfs_root_id(root),
						    right, 0, 1);
			free_extent_buffer_stale(right);
			right = NULL;
			if (unlikely(ret < 0)) {
				btrfs_abort_transaction(trans, ret);
				goto out;
			}
		} else {
			struct btrfs_disk_key right_key;

			btrfs_node_key(right, &right_key, 0);
			ret = btrfs_tree_mod_log_insert_key(parent, pslot + 1,
							    BTRFS_MOD_LOG_KEY_REPLACE);
			if (unlikely(ret < 0)) {
				btrfs_abort_transaction(trans, ret);
				goto out;
			}
			btrfs_set_node_key(parent, &right_key, pslot + 1);
			btrfs_mark_buffer_dirty(trans, parent);
		}
	}
	if (btrfs_header_nritems(mid) == 1) {
		/*
		 * We're not allowed to leave a node with one item in the
		 * tree during a delete. A deletion from lower in the tree
		 * could try to delete the only pointer in this node.
		 * So, pull some keys from the left.
		 * There has to be a left pointer at this point because
		 * otherwise we would have pulled some pointers from the
		 * right.
		 */
		if (unlikely(!left)) {
			btrfs_crit(fs_info,
"missing left child when middle child only has 1 item, parent bytenr %llu level %d mid bytenr %llu root %llu",
				   parent->start, btrfs_header_level(parent),
				   mid->start, btrfs_root_id(root));
			ret = -EUCLEAN;
			btrfs_abort_transaction(trans, ret);
			goto out;
		}
		wret = balance_node_right(trans, mid, left);
		if (wret < 0) {
			ret = wret;
			goto out;
		}
		if (wret == 1) {
			wret = push_node_left(trans, left, mid, 1);
			if (wret < 0)
				ret = wret;
		}
		BUG_ON(wret == 1);
	}
	if (btrfs_header_nritems(mid) == 0) {
		btrfs_clear_buffer_dirty(trans, mid);
		btrfs_tree_unlock(mid);
		ret = btrfs_del_ptr(trans, root, path, level + 1, pslot);
		if (ret < 0) {
			free_extent_buffer_stale(mid);
			mid = NULL;
			goto out;
		}
		root_sub_used_bytes(root);
		ret = btrfs_free_tree_block(trans, btrfs_root_id(root), mid, 0, 1);
		free_extent_buffer_stale(mid);
		mid = NULL;
		if (unlikely(ret < 0)) {
			btrfs_abort_transaction(trans, ret);
			goto out;
		}
	} else {
		/* update the parent key to reflect our changes */
		struct btrfs_disk_key mid_key;

		btrfs_node_key(mid, &mid_key, 0);
		ret = btrfs_tree_mod_log_insert_key(parent, pslot,
						    BTRFS_MOD_LOG_KEY_REPLACE);
		if (unlikely(ret < 0)) {
			btrfs_abort_transaction(trans, ret);
			goto out;
		}
		btrfs_set_node_key(parent, &mid_key, pslot);
		btrfs_mark_buffer_dirty(trans, parent);
	}
	/* update the path */
	if (left) {
		if (btrfs_header_nritems(left) > orig_slot) {
			/* left was locked after cow */
			path->nodes[level] = left;
			path->slots[level + 1] -= 1;
			path->slots[level] = orig_slot;
			/* Left is now owned by path. */
			left = NULL;
			if (mid) {
				btrfs_tree_unlock(mid);
				free_extent_buffer(mid);
			}
		} else {
			orig_slot -= btrfs_header_nritems(left);
			path->slots[level] = orig_slot;
		}
	}
	/* double check we haven't messed things up */
	if (orig_ptr !=
	    btrfs_node_blockptr(path->nodes[level], path->slots[level]))
		BUG();
out:
	if (right) {
		btrfs_tree_unlock(right);
		free_extent_buffer(right);
	}
	if (left) {
		btrfs_tree_unlock(left);
		free_extent_buffer(left);
	}
	return ret;
}
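/*
 * Illustration (not in the original source): after balance_level() the node
 * at @level has either kept more than a quarter of its capacity, been
 * emptied into a sibling and deleted, or had its new first key propagated
 * to the parent; the path is fixed up so that path->nodes[level] and
 * path->slots[level] still point at the original block pointer, which the
 * orig_ptr check above verifies.
 */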
/*
 * Node balancing for insertion. Here we only split or push nodes around
 * when they are completely full. This is also done top down, so we
 * have to be pessimistic.
 */
static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
					  struct btrfs_root *root,
					  struct btrfs_path *path, int level)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *right = NULL;
	struct extent_buffer *mid;
	struct extent_buffer *left = NULL;
	struct extent_buffer *parent = NULL;
	int ret = 0;
	int wret;
	int pslot;
	int orig_slot = path->slots[level];

	if (level == 0)
		return 1;

	mid = path->nodes[level];
	WARN_ON(btrfs_header_generation(mid) != trans->transid);

	if (level < BTRFS_MAX_LEVEL - 1) {
		parent = path->nodes[level + 1];
		pslot = path->slots[level + 1];
	}

	if (!parent)
		return 1;

	/* first, try to make some room in the middle buffer */
	if (pslot) {
		u32 left_nr;

		left = btrfs_read_node_slot(parent, pslot - 1);
		if (IS_ERR(left))
			return PTR_ERR(left);

		btrfs_tree_lock_nested(left, BTRFS_NESTING_LEFT);

		left_nr = btrfs_header_nritems(left);
		if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
			wret = 1;
		} else {
			ret = btrfs_cow_block(trans, root, left, parent,
					      pslot - 1, &left,
					      BTRFS_NESTING_LEFT_COW);
			if (ret)
				wret = 1;
			else
				wret = push_node_left(trans, left, mid, 0);
		}
		if (wret < 0)
			ret = wret;
		if (wret == 0) {
			struct btrfs_disk_key disk_key;

			orig_slot += left_nr;
			btrfs_node_key(mid, &disk_key, 0);
			ret = btrfs_tree_mod_log_insert_key(parent, pslot,
							    BTRFS_MOD_LOG_KEY_REPLACE);
			if (unlikely(ret < 0)) {
				btrfs_tree_unlock(left);
				free_extent_buffer(left);
				btrfs_abort_transaction(trans, ret);
				return ret;
			}
			btrfs_set_node_key(parent, &disk_key, pslot);
			btrfs_mark_buffer_dirty(trans, parent);
			if (btrfs_header_nritems(left) > orig_slot) {
				path->nodes[level] = left;
				path->slots[level + 1] -= 1;
				path->slots[level] = orig_slot;
				btrfs_tree_unlock(mid);
				free_extent_buffer(mid);
			} else {
				orig_slot -= btrfs_header_nritems(left);
				path->slots[level] = orig_slot;
				btrfs_tree_unlock(left);
				free_extent_buffer(left);
			}
			return 0;
		}
		btrfs_tree_unlock(left);
		free_extent_buffer(left);
	}

	/*
	 * then try to empty the right most buffer into the middle
	 */
	if (pslot + 1 < btrfs_header_nritems(parent)) {
		u32 right_nr;

		right = btrfs_read_node_slot(parent, pslot + 1);
		if (IS_ERR(right))
			return PTR_ERR(right);

		btrfs_tree_lock_nested(right, BTRFS_NESTING_RIGHT);

		right_nr = btrfs_header_nritems(right);
		if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
			wret = 1;
		} else {
			ret = btrfs_cow_block(trans, root, right,
					      parent, pslot + 1,
					      &right, BTRFS_NESTING_RIGHT_COW);
			if (ret)
				wret = 1;
			else
				wret = balance_node_right(trans, right, mid);
		}
		if (wret < 0)
			ret = wret;
		if (wret == 0) {
			struct btrfs_disk_key disk_key;

			btrfs_node_key(right, &disk_key, 0);
			ret = btrfs_tree_mod_log_insert_key(parent, pslot + 1,
							    BTRFS_MOD_LOG_KEY_REPLACE);
			if (unlikely(ret < 0)) {
				btrfs_tree_unlock(right);
				free_extent_buffer(right);
				btrfs_abort_transaction(trans, ret);
				return ret;
			}
			btrfs_set_node_key(parent, &disk_key, pslot + 1);
			btrfs_mark_buffer_dirty(trans, parent);

			if (btrfs_header_nritems(mid) <= orig_slot) {
				path->nodes[level] = right;
				path->slots[level + 1] += 1;
				path->slots[level] = orig_slot -
						     btrfs_header_nritems(mid);
				btrfs_tree_unlock(mid);
				free_extent_buffer(mid);
			} else {
				btrfs_tree_unlock(right);
				free_extent_buffer(right);
			}
			return 0;
		}
		btrfs_tree_unlock(right);
		free_extent_buffer(right);
	}
	return 1;
}
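/*
 * Example (illustrative): push_nodes_for_insert() returns 0 only when the
 * full node at @level gave items away to a sibling and the path was updated
 * to match; a return of 1 tells the caller that both siblings were full (or
 * absent), so the node has to be split instead.
 */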
/*
 * readahead one full node of leaves, finding things that are close
 * to the block in 'slot', and triggering ra on them.
 */
static void reada_for_search(struct btrfs_fs_info *fs_info,
			     const struct btrfs_path *path,
			     int level, int slot, u64 objectid)
{
	struct extent_buffer *node;
	struct btrfs_disk_key disk_key;
	u32 nritems;
	u64 search;
	u64 target;
	u64 nread = 0;
	u64 nread_max;
	u32 nr;
	u32 blocksize;
	u32 nscan = 0;

	if (level != 1 && path->reada != READA_FORWARD_ALWAYS)
		return;

	if (!path->nodes[level])
		return;

	node = path->nodes[level];

	/*
	 * Since the time between visiting leaves is much shorter than the time
	 * between visiting nodes, limit read ahead of nodes to 1, to avoid too
	 * much IO at once (possibly random).
	 */
	if (path->reada == READA_FORWARD_ALWAYS) {
		if (level > 1)
			nread_max = node->fs_info->nodesize;
		else
			nread_max = SZ_128K;
	} else {
		nread_max = SZ_64K;
	}

	search = btrfs_node_blockptr(node, slot);
	blocksize = fs_info->nodesize;
	if (path->reada != READA_FORWARD_ALWAYS) {
		struct extent_buffer *eb;

		eb = find_extent_buffer(fs_info, search);
		if (eb) {
			free_extent_buffer(eb);
			return;
		}
	}

	target = search;

	nritems = btrfs_header_nritems(node);
	nr = slot;

	while (1) {
		if (path->reada == READA_BACK) {
			if (nr == 0)
				break;
			nr--;
		} else if (path->reada == READA_FORWARD ||
			   path->reada == READA_FORWARD_ALWAYS) {
			nr++;
			if (nr >= nritems)
				break;
		}
		if (path->reada == READA_BACK && objectid) {
			btrfs_node_key(node, &disk_key, nr);
			if (btrfs_disk_key_objectid(&disk_key) != objectid)
				break;
		}
		search = btrfs_node_blockptr(node, nr);
		if (path->reada == READA_FORWARD_ALWAYS ||
		    (search <= target && target - search <= 65536) ||
		    (search > target && search - target <= 65536)) {
			btrfs_readahead_node_child(node, nr);
			nread += blocksize;
		}
		nscan++;
		if (nread > nread_max || nscan > 32)
			break;
	}
}

static noinline void reada_for_balance(const struct btrfs_path *path, int level)
{
	struct extent_buffer *parent;
	int slot;
	int nritems;

	parent = path->nodes[level + 1];
	if (!parent)
		return;

	nritems = btrfs_header_nritems(parent);
	slot = path->slots[level + 1];

	if (slot > 0)
		btrfs_readahead_node_child(parent, slot - 1);
	if (slot + 1 < nritems)
		btrfs_readahead_node_child(parent, slot + 1);
}
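/*
 * Illustration (not in the original source): reada_for_search() only issues
 * readahead for blocks within 64KiB of the target block pointer (the
 * "<= 65536" checks) and gives up after 32 scanned slots or nread_max bytes
 * of readahead, bounding the extra I/O a single search can trigger.
 */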
/*
 * when we walk down the tree, it is usually safe to unlock the higher layers
 * in the tree. The exceptions are when our path goes through slot 0, because
 * operations on the tree might require changing key pointers higher up in the
 * tree.
 *
 * callers might also have set path->keep_locks, which tells this code to keep
 * the lock if the path points to the last slot in the block. This is part of
 * walking through the tree, and selecting the next slot in the higher block.
 *
 * lowest_unlock sets the lowest level in the tree we're allowed to unlock.
 * So if lowest_unlock is 1, level 0 won't be unlocked.
 */
static noinline void unlock_up(struct btrfs_path *path, int level,
			       int lowest_unlock, int min_write_lock_level,
			       int *write_lock_level)
{
	int i;
	int skip_level = level;
	bool check_skip = true;

	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
		if (!path->nodes[i])
			break;
		if (!path->locks[i])
			break;

		if (check_skip) {
			if (path->slots[i] == 0) {
				skip_level = i + 1;
				continue;
			}

			if (path->keep_locks) {
				u32 nritems;

				nritems = btrfs_header_nritems(path->nodes[i]);
				if (nritems < 1 || path->slots[i] >= nritems - 1) {
					skip_level = i + 1;
					continue;
				}
			}
		}

		if (i >= lowest_unlock && i > skip_level) {
			btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]);
			check_skip = false;
			path->locks[i] = 0;
			if (write_lock_level &&
			    i > min_write_lock_level &&
			    i <= *write_lock_level) {
				*write_lock_level = i - 1;
			}
		}
	}
}
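/*
 * Example (illustrative): with lowest_unlock == 1 and no slot 0 anywhere in
 * the path, the loop drops the locks on every level above the leaf and
 * keeps the leaf locked. A slot of 0 at level i raises skip_level to
 * i + 1, keeping levels i and i + 1 locked, because inserting or removing
 * at key 0 may require updating the key pointer in the parent.
 */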
/*
 * Helper function for btrfs_search_slot() and other functions that do a search
 * on a btree. The goal is to find a tree block in the cache (the radix tree at
 * fs_info->buffer_radix), but if we can't find it, or it's not up to date, read
 * its pages from disk.
 *
 * Returns -EAGAIN, with the path unlocked, if the caller needs to repeat the
 * whole btree search, starting again from the current root node.
 */
static int
read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
		      struct extent_buffer **eb_ret, int slot,
		      const struct btrfs_key *key)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_tree_parent_check check = { 0 };
	u64 blocknr;
	struct extent_buffer *tmp = NULL;
	int ret = 0;
	int ret2;
	int parent_level;
	bool read_tmp = false;
	bool tmp_locked = false;
	bool path_released = false;

	blocknr = btrfs_node_blockptr(*eb_ret, slot);
	parent_level = btrfs_header_level(*eb_ret);
	btrfs_node_key_to_cpu(*eb_ret, &check.first_key, slot);
	check.has_first_key = true;
	check.level = parent_level - 1;
	check.transid = btrfs_node_ptr_generation(*eb_ret, slot);
	check.owner_root = btrfs_root_id(root);

	/*
	 * If we need to read an extent buffer from disk and we are holding locks
	 * on upper level nodes, we unlock all the upper nodes before reading the
	 * extent buffer, and then return -EAGAIN to the caller as it needs to
	 * restart the search. We don't release the lock on the current level
	 * because we need to walk this node to figure out which blocks to read.
	 */
	tmp = find_extent_buffer(fs_info, blocknr);
	if (tmp) {
		if (p->reada == READA_FORWARD_ALWAYS)
			reada_for_search(fs_info, p, parent_level, slot, key->objectid);

		/* first we do an atomic uptodate check */
		if (btrfs_buffer_uptodate(tmp, check.transid, NULL) > 0) {
			/*
			 * Do extra check for first_key, eb can be stale due to
			 * being cached, read from scrub, or have multiple
			 * parents (shared tree blocks).
			 */
			if (unlikely(btrfs_verify_level_key(tmp, &check))) {
				ret = -EUCLEAN;
				goto out;
			}
			*eb_ret = tmp;
			tmp = NULL;
			ret = 0;
			goto out;
		}

		if (p->nowait) {
			ret = -EAGAIN;
			goto out;
		}

		if (!p->skip_locking) {
			btrfs_unlock_up_safe(p, parent_level + 1);
			btrfs_maybe_reset_lockdep_class(root, tmp);
			tmp_locked = true;
			btrfs_tree_read_lock(tmp);
			btrfs_release_path(p);
			ret = -EAGAIN;
			path_released = true;
		}

		/* Now we're allowed to do a blocking uptodate check. */
		ret2 = btrfs_read_extent_buffer(tmp, &check);
		if (ret2) {
			ret = ret2;
			goto out;
		}

		if (ret == 0) {
			ASSERT(!tmp_locked);
			*eb_ret = tmp;
			tmp = NULL;
		}
		goto out;
	} else if (p->nowait) {
		ret = -EAGAIN;
		goto out;
	}

	if (!p->skip_locking) {
		btrfs_unlock_up_safe(p, parent_level + 1);
		ret = -EAGAIN;
	}

	if (p->reada != READA_NONE)
		reada_for_search(fs_info, p, parent_level, slot, key->objectid);

	tmp = btrfs_find_create_tree_block(fs_info, blocknr, check.owner_root, check.level);
	if (IS_ERR(tmp)) {
		ret = PTR_ERR(tmp);
		tmp = NULL;
		goto out;
	}
	read_tmp = true;

	if (!p->skip_locking) {
		ASSERT(ret == -EAGAIN);
		btrfs_maybe_reset_lockdep_class(root, tmp);
		tmp_locked = true;
		btrfs_tree_read_lock(tmp);
		btrfs_release_path(p);
		path_released = true;
	}

	/* Now we're allowed to do a blocking uptodate check. */
	ret2 = btrfs_read_extent_buffer(tmp, &check);
	if (ret2) {
		ret = ret2;
		goto out;
	}

	/*
	 * If the read above didn't mark this buffer up to date,
	 * it will never end up being up to date. Set ret to EIO now
	 * and give up so that our caller doesn't loop forever
	 * on our EAGAINs.
	 */
	if (unlikely(!extent_buffer_uptodate(tmp))) {
		ret = -EIO;
		goto out;
	}

	if (ret == 0) {
		ASSERT(!tmp_locked);
		*eb_ret = tmp;
		tmp = NULL;
	}
out:
	if (tmp) {
		if (tmp_locked)
			btrfs_tree_read_unlock(tmp);
		if (read_tmp && ret && ret != -EAGAIN)
			free_extent_buffer_stale(tmp);
		else
			free_extent_buffer(tmp);
	}
	if (ret && !path_released)
		btrfs_release_path(p);

	return ret;
}
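/*
 * Illustration (not in the original source): callers treat -EAGAIN from
 * read_block_for_search() as "the path was unlocked while blocking on I/O";
 * btrfs_search_slot(), for instance, jumps back to its 'again' label and
 * restarts from the root rather than trusting the released upper levels.
 */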
/*
 * helper function for btrfs_search_slot. This does all of the checks
 * for node-level blocks and does any balancing required based on
 * the ins_len.
 *
 * If no extra work was required, zero is returned. If we had to
 * drop the path, -EAGAIN is returned and btrfs_search_slot must
 * start over.
 */
static int
setup_nodes_for_search(struct btrfs_trans_handle *trans,
		       struct btrfs_root *root, struct btrfs_path *p,
		       struct extent_buffer *b, int level, int ins_len,
		       int *write_lock_level)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	int ret = 0;

	if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >=
	    BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3) {

		if (*write_lock_level < level + 1) {
			*write_lock_level = level + 1;
			btrfs_release_path(p);
			return -EAGAIN;
		}

		reada_for_balance(p, level);
		ret = split_node(trans, root, p, level);

		b = p->nodes[level];
	} else if (ins_len < 0 && btrfs_header_nritems(b) <
		   BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 2) {

		if (*write_lock_level < level + 1) {
			*write_lock_level = level + 1;
			btrfs_release_path(p);
			return -EAGAIN;
		}

		reada_for_balance(p, level);
		ret = balance_level(trans, root, p, level);
		if (ret)
			return ret;

		b = p->nodes[level];
		if (!b) {
			btrfs_release_path(p);
			return -EAGAIN;
		}
		BUG_ON(btrfs_header_nritems(b) == 1);
	}
	return ret;
}

int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *path,
		    u64 iobjectid, u64 ioff, u8 key_type,
		    struct btrfs_key *found_key)
{
	int ret;
	struct btrfs_key key;
	struct extent_buffer *eb;

	ASSERT(path);
	ASSERT(found_key);

	key.type = key_type;
	key.objectid = iobjectid;
	key.offset = ioff;

	ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0);
	if (ret < 0)
		return ret;

	eb = path->nodes[0];
	if (ret && path->slots[0] >= btrfs_header_nritems(eb)) {
		ret = btrfs_next_leaf(fs_root, path);
		if (ret)
			return ret;
		eb = path->nodes[0];
	}

	btrfs_item_key_to_cpu(eb, found_key, path->slots[0]);
	if (found_key->type != key.type ||
	    found_key->objectid != key.objectid)
		return 1;

	return 0;
}
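/*
 * Example call (hypothetical values): look up the INODE_ITEM of inode 257
 * in a subvolume tree, accepting the first key with matching objectid and
 * type at or after offset 0:
 *
 *	struct btrfs_key found;
 *	int ret;
 *
 *	ret = btrfs_find_item(fs_root, path, 257, 0,
 *			      BTRFS_INODE_ITEM_KEY, &found);
 *	// ret == 0: @found holds the matching key, path points at the item
 */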
static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root,
							struct btrfs_path *p,
							int write_lock_level)
{
	struct extent_buffer *b;
	int root_lock = 0;
	int level = 0;

	if (p->search_commit_root) {
		b = root->commit_root;
		refcount_inc(&b->refs);
		level = btrfs_header_level(b);
		/*
		 * Ensure that all callers have set skip_locking when
		 * p->search_commit_root is true.
		 */
		ASSERT(p->skip_locking);

		goto out;
	}

	if (p->skip_locking) {
		b = btrfs_root_node(root);
		level = btrfs_header_level(b);
		goto out;
	}

	/* We try very hard to do read locks on the root */
	root_lock = BTRFS_READ_LOCK;

	/*
	 * If the level is set to maximum, we can skip trying to get the read
	 * lock.
	 */
	if (write_lock_level < BTRFS_MAX_LEVEL) {
		/*
		 * We don't know the level of the root node until we actually
		 * have it read locked.
		 */
		if (p->nowait) {
			b = btrfs_try_read_lock_root_node(root);
			if (IS_ERR(b))
				return b;
		} else {
			b = btrfs_read_lock_root_node(root);
		}
		level = btrfs_header_level(b);
		if (level > write_lock_level)
			goto out;

		/* Whoops, must trade for write lock */
		btrfs_tree_read_unlock(b);
		free_extent_buffer(b);
	}

	b = btrfs_lock_root_node(root);
	root_lock = BTRFS_WRITE_LOCK;

	/* The level might have changed, check again */
	level = btrfs_header_level(b);

out:
	/*
	 * The root may have failed to write out at some point, and thus is no
	 * longer valid, return an error in this case.
	 */
	if (unlikely(!extent_buffer_uptodate(b))) {
		if (root_lock)
			btrfs_tree_unlock_rw(b, root_lock);
		free_extent_buffer(b);
		return ERR_PTR(-EIO);
	}

	p->nodes[level] = b;
	if (!p->skip_locking)
		p->locks[level] = root_lock;
	/*
	 * Callers are responsible for dropping b's references.
	 */
	return b;
}

/*
 * Replace the extent buffer at the lowest level of the path with a cloned
 * version. The purpose is to be able to use it safely, after releasing the
 * commit root semaphore, even if relocation is happening in parallel, the
 * transaction used for relocation is committed and the extent buffer is
 * reallocated in the next transaction.
 *
 * This is used in a context where the caller does not prevent transaction
 * commits from happening, either by holding a transaction handle or holding
 * some lock, while it's doing searches through a commit root.
 * At the moment it's only used for send operations.
 */
static int finish_need_commit_sem_search(struct btrfs_path *path)
{
	const int i = path->lowest_level;
	const int slot = path->slots[i];
	struct extent_buffer *lowest = path->nodes[i];
	struct extent_buffer *clone;

	ASSERT(path->need_commit_sem);

	if (!lowest)
		return 0;

	lockdep_assert_held_read(&lowest->fs_info->commit_root_sem);

	clone = btrfs_clone_extent_buffer(lowest);
	if (!clone)
		return -ENOMEM;

	btrfs_release_path(path);
	path->nodes[i] = clone;
	path->slots[i] = slot;

	return 0;
}
static inline int search_for_key_slot(const struct extent_buffer *eb,
				      int search_low_slot,
				      const struct btrfs_key *key,
				      int prev_cmp,
				      int *slot)
{
	/*
	 * If a previous call to btrfs_bin_search() on a parent node returned an
	 * exact match (prev_cmp == 0), we can safely assume the target key will
	 * always be at slot 0 on lower levels, since each key pointer
	 * (struct btrfs_key_ptr) refers to the lowest key accessible from the
	 * subtree it points to. Thus we can skip searching lower levels.
	 */
	if (prev_cmp == 0) {
		*slot = 0;
		return 0;
	}

	return btrfs_bin_search(eb, search_low_slot, key, slot);
}
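/*
 * Example (illustrative): if the binary search in a level-1 node matched
 * @key exactly (prev_cmp == 0), the child leaf's first key is that same
 * key, so the leaf lookup can return slot 0 without another binary search.
 */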
In this case, if we are allowed to 1941 * insert the item (for example, in dir_item case, item key 1942 * collision is allowed), it will be merged with the original 1943 * item. Only the item size grows, no new btrfs item will be 1944 * added. If search_for_extension is not set, ins_len already 1945 * accounts for the size of struct btrfs_item, so deduct it here 1946 * to keep the leaf space check correct. 1947 */ 1948 if (ret == 0 && !path->search_for_extension) { 1949 ASSERT(ins_len >= sizeof(struct btrfs_item)); 1950 ins_len -= sizeof(struct btrfs_item); 1951 } 1952 1953 ASSERT(leaf_free_space >= 0); 1954 1955 if (leaf_free_space < ins_len) { 1956 int ret2; 1957 1958 ret2 = split_leaf(trans, root, key, path, ins_len, (ret == 0)); 1959 ASSERT(ret2 <= 0); 1960 if (WARN_ON(ret2 > 0)) 1961 ret2 = -EUCLEAN; 1962 if (ret2) 1963 ret = ret2; 1964 } 1965 } 1966 1967 return ret; 1968 } 1969 1970 /* 1971 * Look for a key in a tree and perform necessary modifications to preserve 1972 * tree invariants. 1973 * 1974 * @trans: Handle of transaction, used when modifying the tree 1975 * @p: Holds all btree nodes along the search path 1976 * @root: The root node of the tree 1977 * @key: The key we are looking for 1978 * @ins_len: Indicates purpose of search: 1979 * >0 for inserts, it's the size of the item being inserted (*) 1980 * <0 for deletions 1981 * 0 for plain searches, not modifying the tree 1982 * 1983 * (*) If the size of the item being inserted doesn't include 1984 * sizeof(struct btrfs_item), then p->search_for_extension must 1985 * be set. 1986 * @cow: whether CoW operations should be performed. Must always be 1 1987 * when modifying the tree. 1988 * 1989 * If @ins_len > 0, nodes and leaves will be split as we walk down the tree. 1990 * If @ins_len < 0, nodes will be merged as we walk down the tree (if possible) 1991 * 1992 * If @key is found, 0 is returned and you can find the item in the leaf level 1993 * of the path (level 0) 1994 * 1995 * If @key isn't found, 1 is returned and the leaf level of the path (level 0) 1996 * points to the slot where it should be inserted 1997 * 1998 * If an error is encountered while searching the tree, a negative error number 1999 * is returned 2000 */ 2001 int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root, 2002 const struct btrfs_key *key, struct btrfs_path *p, 2003 int ins_len, int cow) 2004 { 2005 struct btrfs_fs_info *fs_info; 2006 struct extent_buffer *b; 2007 int slot; 2008 int ret; 2009 int level; 2010 int lowest_unlock = 1; 2011 /* everything at write_lock_level or lower must be write locked */ 2012 int write_lock_level = 0; 2013 u8 lowest_level = 0; 2014 int min_write_lock_level; 2015 int prev_cmp; 2016 2017 if (!root) 2018 return -EINVAL; 2019 2020 fs_info = root->fs_info; 2021 might_sleep(); 2022 2023 lowest_level = p->lowest_level; 2024 WARN_ON(lowest_level && ins_len > 0); 2025 WARN_ON(p->nodes[0] != NULL); 2026 BUG_ON(!cow && ins_len); 2027 2028 /* 2029 * For now only allow nowait for read only operations. There's no 2030 * strict reason why we can't, we just only need it for reads, so it's 2031 * only implemented for reads.
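 *
 * A minimal sketch of a nowait lookup (illustrative only, error handling
 * elided):
 *
 *	path->nowait = 1;
 *	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 *	if (ret == -EAGAIN)
 *		(a lock or block read would have blocked, retry later)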
2032 */ 2033 ASSERT(!p->nowait || !cow); 2034 2035 if (ins_len < 0) { 2036 lowest_unlock = 2; 2037 2038 /* when we are removing items, we might have to go up to level 2039 * two as we update tree pointers. Make sure we keep a write 2040 * lock on those levels as well 2041 */ 2042 write_lock_level = 2; 2043 } else if (ins_len > 0) { 2044 /* 2045 * for inserting items, make sure we have a write lock on 2046 * level 1 so we can update keys 2047 */ 2048 write_lock_level = 1; 2049 } 2050 2051 if (!cow) 2052 write_lock_level = -1; 2053 2054 if (cow && (p->keep_locks || p->lowest_level)) 2055 write_lock_level = BTRFS_MAX_LEVEL; 2056 2057 min_write_lock_level = write_lock_level; 2058 2059 if (p->need_commit_sem) { 2060 ASSERT(p->search_commit_root); 2061 if (p->nowait) { 2062 if (!down_read_trylock(&fs_info->commit_root_sem)) 2063 return -EAGAIN; 2064 } else { 2065 down_read(&fs_info->commit_root_sem); 2066 } 2067 } 2068 2069 again: 2070 prev_cmp = -1; 2071 b = btrfs_search_slot_get_root(root, p, write_lock_level); 2072 if (IS_ERR(b)) { 2073 ret = PTR_ERR(b); 2074 goto done; 2075 } 2076 2077 while (b) { 2078 int dec = 0; 2079 int ret2; 2080 2081 level = btrfs_header_level(b); 2082 2083 if (cow) { 2084 bool last_level = (level == (BTRFS_MAX_LEVEL - 1)); 2085 2086 /* 2087 * if we don't really need to cow this block 2088 * then we don't want to set the path blocking, 2089 * so we test it here 2090 */ 2091 if (!should_cow_block(trans, root, b)) 2092 goto cow_done; 2093 2094 /* 2095 * must have write locks on this node and the 2096 * parent 2097 */ 2098 if (level > write_lock_level || 2099 (level + 1 > write_lock_level && 2100 level + 1 < BTRFS_MAX_LEVEL && 2101 p->nodes[level + 1])) { 2102 write_lock_level = level + 1; 2103 btrfs_release_path(p); 2104 trace_btrfs_search_slot_restart(root, level, "write_lock"); 2105 goto again; 2106 } 2107 2108 if (last_level) 2109 ret2 = btrfs_cow_block(trans, root, b, NULL, 0, 2110 &b, BTRFS_NESTING_COW); 2111 else 2112 ret2 = btrfs_cow_block(trans, root, b, 2113 p->nodes[level + 1], 2114 p->slots[level + 1], &b, 2115 BTRFS_NESTING_COW); 2116 if (ret2) { 2117 ret = ret2; 2118 goto done; 2119 } 2120 } 2121 cow_done: 2122 p->nodes[level] = b; 2123 2124 /* 2125 * we have a lock on b and as long as we aren't changing 2126 * the tree, there is no way for the items in b to change. 2127 * It is safe to drop the lock on our parent before we 2128 * go through the expensive btree search on b. 2129 * 2130 * If we're inserting or deleting (ins_len != 0), then we might 2131 * be changing slot zero, which may require changing the parent. 2132 * So, we can't drop the lock until after we know which slot 2133 * we're operating on.
2134 */ 2135 if (!ins_len && !p->keep_locks) { 2136 int u = level + 1; 2137 2138 if (u < BTRFS_MAX_LEVEL && p->locks[u]) { 2139 btrfs_tree_unlock_rw(p->nodes[u], p->locks[u]); 2140 p->locks[u] = 0; 2141 } 2142 } 2143 2144 if (level == 0) { 2145 if (ins_len > 0) 2146 ASSERT(write_lock_level >= 1); 2147 2148 ret = search_leaf(trans, root, key, p, ins_len, prev_cmp); 2149 if (!p->search_for_split) 2150 unlock_up(p, level, lowest_unlock, 2151 min_write_lock_level, NULL); 2152 goto done; 2153 } 2154 2155 ret = search_for_key_slot(b, 0, key, prev_cmp, &slot); 2156 if (ret < 0) 2157 goto done; 2158 prev_cmp = ret; 2159 2160 if (ret && slot > 0) { 2161 dec = 1; 2162 slot--; 2163 } 2164 p->slots[level] = slot; 2165 ret2 = setup_nodes_for_search(trans, root, p, b, level, ins_len, 2166 &write_lock_level); 2167 if (ret2 == -EAGAIN) { 2168 trace_btrfs_search_slot_restart(root, level, "setup_nodes"); 2169 goto again; 2170 } 2171 if (ret2) { 2172 ret = ret2; 2173 goto done; 2174 } 2175 b = p->nodes[level]; 2176 slot = p->slots[level]; 2177 2178 /* 2179 * Slot 0 is special, if we change the key we have to update 2180 * the parent pointer which means we must have a write lock on 2181 * the parent 2182 */ 2183 if (slot == 0 && ins_len && write_lock_level < level + 1) { 2184 write_lock_level = level + 1; 2185 btrfs_release_path(p); 2186 trace_btrfs_search_slot_restart(root, level, "slot_zero"); 2187 goto again; 2188 } 2189 2190 unlock_up(p, level, lowest_unlock, min_write_lock_level, 2191 &write_lock_level); 2192 2193 if (level == lowest_level) { 2194 if (dec) 2195 p->slots[level]++; 2196 goto done; 2197 } 2198 2199 ret2 = read_block_for_search(root, p, &b, slot, key); 2200 if (ret2 == -EAGAIN && !p->nowait) { 2201 trace_btrfs_search_slot_restart(root, level, "read_block"); 2202 goto again; 2203 } 2204 if (ret2) { 2205 ret = ret2; 2206 goto done; 2207 } 2208 2209 if (!p->skip_locking) { 2210 level = btrfs_header_level(b); 2211 2212 btrfs_maybe_reset_lockdep_class(root, b); 2213 2214 if (level <= write_lock_level) { 2215 btrfs_tree_lock(b); 2216 p->locks[level] = BTRFS_WRITE_LOCK; 2217 } else { 2218 if (p->nowait) { 2219 if (!btrfs_try_tree_read_lock(b)) { 2220 free_extent_buffer(b); 2221 ret = -EAGAIN; 2222 goto done; 2223 } 2224 } else { 2225 btrfs_tree_read_lock(b); 2226 } 2227 p->locks[level] = BTRFS_READ_LOCK; 2228 } 2229 p->nodes[level] = b; 2230 } 2231 } 2232 ret = 1; 2233 done: 2234 if (ret < 0 && !p->skip_release_on_error) 2235 btrfs_release_path(p); 2236 2237 if (p->need_commit_sem) { 2238 int ret2; 2239 2240 ret2 = finish_need_commit_sem_search(p); 2241 up_read(&fs_info->commit_root_sem); 2242 if (ret2) 2243 ret = ret2; 2244 } 2245 2246 return ret; 2247 } 2248 ALLOW_ERROR_INJECTION(btrfs_search_slot, ERRNO); 2249 2250 /* 2251 * Like btrfs_search_slot, this looks for a key in the given tree. It uses the 2252 * current state of the tree together with the operations recorded in the tree 2253 * modification log to search for the key in a previous version of this tree, as 2254 * denoted by the time_seq parameter. 2255 * 2256 * Naturally, there is no support for insert, delete or cow operations. 2257 * 2258 * The resulting path and return value will be set up as if we called 2259 * btrfs_search_slot at that point in time with ins_len and cow both set to 0. 
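 *
 * Usage sketch (illustrative; assumes the caller already holds a valid
 * tree mod log sequence number in time_seq):
 *
 *	ret = btrfs_search_old_slot(root, &key, path, time_seq);
 *	if (ret == 0)
 *		(the key exists in the old version of the tree)
 *	else if (ret == 1)
 *		(not found, the slot points at the insert position)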
2260 */ 2261 int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key, 2262 struct btrfs_path *p, u64 time_seq) 2263 { 2264 struct btrfs_fs_info *fs_info = root->fs_info; 2265 struct extent_buffer *b; 2266 int slot; 2267 int ret; 2268 int level; 2269 int lowest_unlock = 1; 2270 u8 lowest_level = 0; 2271 2272 lowest_level = p->lowest_level; 2273 WARN_ON(p->nodes[0] != NULL); 2274 ASSERT(!p->nowait); 2275 2276 if (p->search_commit_root) { 2277 BUG_ON(time_seq); 2278 return btrfs_search_slot(NULL, root, key, p, 0, 0); 2279 } 2280 2281 again: 2282 b = btrfs_get_old_root(root, time_seq); 2283 if (unlikely(!b)) { 2284 ret = -EIO; 2285 goto done; 2286 } 2287 level = btrfs_header_level(b); 2288 p->locks[level] = BTRFS_READ_LOCK; 2289 2290 while (b) { 2291 int dec = 0; 2292 int ret2; 2293 2294 level = btrfs_header_level(b); 2295 p->nodes[level] = b; 2296 2297 /* 2298 * we have a lock on b and as long as we aren't changing 2299 * the tree, there is no way for the items in b to change. 2300 * It is safe to drop the lock on our parent before we 2301 * go through the expensive btree search on b. 2302 */ 2303 btrfs_unlock_up_safe(p, level + 1); 2304 2305 ret = btrfs_bin_search(b, 0, key, &slot); 2306 if (ret < 0) 2307 goto done; 2308 2309 if (level == 0) { 2310 p->slots[level] = slot; 2311 unlock_up(p, level, lowest_unlock, 0, NULL); 2312 goto done; 2313 } 2314 2315 if (ret && slot > 0) { 2316 dec = 1; 2317 slot--; 2318 } 2319 p->slots[level] = slot; 2320 unlock_up(p, level, lowest_unlock, 0, NULL); 2321 2322 if (level == lowest_level) { 2323 if (dec) 2324 p->slots[level]++; 2325 goto done; 2326 } 2327 2328 ret2 = read_block_for_search(root, p, &b, slot, key); 2329 if (ret2 == -EAGAIN && !p->nowait) 2330 goto again; 2331 if (ret2) { 2332 ret = ret2; 2333 goto done; 2334 } 2335 2336 level = btrfs_header_level(b); 2337 btrfs_tree_read_lock(b); 2338 b = btrfs_tree_mod_log_rewind(fs_info, b, time_seq); 2339 if (!b) { 2340 ret = -ENOMEM; 2341 goto done; 2342 } 2343 p->locks[level] = BTRFS_READ_LOCK; 2344 p->nodes[level] = b; 2345 } 2346 ret = 1; 2347 done: 2348 if (ret < 0) 2349 btrfs_release_path(p); 2350 2351 return ret; 2352 } 2353 2354 /* 2355 * Search the tree again to find a leaf with smaller keys. 2356 * Returns 0 if it found something. 2357 * Returns 1 if there are no smaller keys. 2358 * Returns < 0 on error. 2359 * 2360 * This may release the path, and so you may lose any locks held at the 2361 * time you call it. 2362 */ 2363 static int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path) 2364 { 2365 struct btrfs_key key; 2366 struct btrfs_key orig_key; 2367 struct btrfs_disk_key found_key; 2368 int ret; 2369 2370 btrfs_item_key_to_cpu(path->nodes[0], &key, 0); 2371 orig_key = key; 2372 2373 if (key.offset > 0) { 2374 key.offset--; 2375 } else if (key.type > 0) { 2376 key.type--; 2377 key.offset = (u64)-1; 2378 } else if (key.objectid > 0) { 2379 key.objectid--; 2380 key.type = (u8)-1; 2381 key.offset = (u64)-1; 2382 } else { 2383 return 1; 2384 } 2385 2386 btrfs_release_path(path); 2387 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 2388 if (ret <= 0) 2389 return ret; 2390 2391 /* 2392 * Previous key not found.
Even if we were at slot 0 of the leaf we had 2393 * before releasing the path and calling btrfs_search_slot(), we now may 2394 * be in a slot pointing to the same original key - this can happen if 2395 * after we released the path, one or more items were moved from a 2396 * sibling leaf into the front of the leaf we had due to an insertion 2397 * (see push_leaf_right()). 2398 * If we hit this case and our slot is > 0, just decrement the slot 2399 * so that the caller does not process the same key again, which may or 2400 * may not break the caller, depending on its logic. 2401 */ 2402 if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) { 2403 btrfs_item_key(path->nodes[0], &found_key, path->slots[0]); 2404 ret = btrfs_comp_keys(&found_key, &orig_key); 2405 if (ret == 0) { 2406 if (path->slots[0] > 0) { 2407 path->slots[0]--; 2408 return 0; 2409 } 2410 /* 2411 * At slot 0, same key as before, it means orig_key is 2412 * the lowest, leftmost, key in the tree. We're done. 2413 */ 2414 return 1; 2415 } 2416 } 2417 2418 btrfs_item_key(path->nodes[0], &found_key, 0); 2419 ret = btrfs_comp_keys(&found_key, &key); 2420 /* 2421 * We might have had an item with the previous key in the tree right 2422 * before we released our path. And after we released our path, that 2423 * item might have been pushed to the first slot (0) of the leaf we 2424 * were holding due to a tree balance. Alternatively, an item with the 2425 * previous key can exist as the only element of a leaf (big fat item). 2426 * Therefore account for these 2 cases, so that our callers (like 2427 * btrfs_previous_item) don't miss an existing item with a key matching 2428 * the previous key we computed above. 2429 */ 2430 if (ret <= 0) 2431 return 0; 2432 return 1; 2433 } 2434 2435 /* 2436 * helper to use instead of btrfs_search_slot() if no exact match is needed but 2437 * instead the next or previous item should be returned. 2438 * When find_higher is true, the next higher item is returned, the next lower 2439 * otherwise. 2440 * When return_any and find_higher are both true, and no higher item is found, 2441 * return the next lower instead. 2442 * When return_any is true and find_higher is false, and no lower item is found, 2443 * return the next higher instead. 2444 * It returns 0 if any item is found, 1 if none is found (tree empty), and 2445 * < 0 on error 2446 */ 2447 int btrfs_search_slot_for_read(struct btrfs_root *root, 2448 const struct btrfs_key *key, 2449 struct btrfs_path *p, int find_higher, 2450 int return_any) 2451 { 2452 int ret; 2453 struct extent_buffer *leaf; 2454 2455 again: 2456 ret = btrfs_search_slot(NULL, root, key, p, 0, 0); 2457 if (ret <= 0) 2458 return ret; 2459 /* 2460 * a return value of 1 means the path is at the position where the 2461 * item should be inserted. Normally this is the next bigger item, 2462 * but in case the previous item is the last in a leaf, path points 2463 * to the first free slot in the previous leaf, i.e. at an invalid 2464 * item.
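 *
 * For example (illustrative): if the search key is greater than every key
 * in the leaf, the slot equals btrfs_header_nritems(leaf), one past the
 * last valid item, which is why the find_higher case below moves to the
 * next leaf before using the slot.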
2465 */ 2466 leaf = p->nodes[0]; 2467 2468 if (find_higher) { 2469 if (p->slots[0] >= btrfs_header_nritems(leaf)) { 2470 ret = btrfs_next_leaf(root, p); 2471 if (ret <= 0) 2472 return ret; 2473 if (!return_any) 2474 return 1; 2475 /* 2476 * no higher item found, return the next 2477 * lower instead 2478 */ 2479 return_any = 0; 2480 find_higher = 0; 2481 btrfs_release_path(p); 2482 goto again; 2483 } 2484 } else { 2485 if (p->slots[0] == 0) { 2486 ret = btrfs_prev_leaf(root, p); 2487 if (ret < 0) 2488 return ret; 2489 if (!ret) { 2490 leaf = p->nodes[0]; 2491 if (p->slots[0] == btrfs_header_nritems(leaf)) 2492 p->slots[0]--; 2493 return 0; 2494 } 2495 if (!return_any) 2496 return 1; 2497 /* 2498 * no lower item found, return the next 2499 * higher instead 2500 */ 2501 return_any = 0; 2502 find_higher = 1; 2503 btrfs_release_path(p); 2504 goto again; 2505 } else { 2506 --p->slots[0]; 2507 } 2508 } 2509 return 0; 2510 } 2511 2512 /* 2513 * Execute search and call btrfs_previous_item to traverse backwards if the item 2514 * was not found. 2515 * 2516 * Return 0 if found, 1 if not found and < 0 if error. 2517 */ 2518 int btrfs_search_backwards(struct btrfs_root *root, struct btrfs_key *key, 2519 struct btrfs_path *path) 2520 { 2521 int ret; 2522 2523 ret = btrfs_search_slot(NULL, root, key, path, 0, 0); 2524 if (ret > 0) 2525 ret = btrfs_previous_item(root, path, key->objectid, key->type); 2526 2527 if (ret == 0) 2528 btrfs_item_key_to_cpu(path->nodes[0], key, path->slots[0]); 2529 2530 return ret; 2531 } 2532 2533 /* 2534 * Search for a valid slot for the given path. 2535 * 2536 * @root: The root node of the tree. 2537 * @key: Will contain a valid item if found. 2538 * @path: The starting point to validate the slot. 2539 * 2540 * Return: 0 if the item is valid 2541 * 1 if not found 2542 * <0 if error. 2543 */ 2544 int btrfs_get_next_valid_item(struct btrfs_root *root, struct btrfs_key *key, 2545 struct btrfs_path *path) 2546 { 2547 if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { 2548 int ret; 2549 2550 ret = btrfs_next_leaf(root, path); 2551 if (ret) 2552 return ret; 2553 } 2554 2555 btrfs_item_key_to_cpu(path->nodes[0], key, path->slots[0]); 2556 return 0; 2557 } 2558 2559 /* 2560 * adjust the pointers going up the tree, starting at 'level', 2561 * making sure the right key of each node points to 'key'. 2562 * This is used after shifting pointers to the left, so it stops 2563 * fixing up pointers when a given leaf/node is not in slot 0 of the 2564 * higher levels 2565 * 2566 */ 2567 static void fixup_low_keys(struct btrfs_trans_handle *trans, 2568 const struct btrfs_path *path, 2569 const struct btrfs_disk_key *key, int level) 2570 { 2571 int i; 2572 struct extent_buffer *t; 2573 int ret; 2574 2575 for (i = level; i < BTRFS_MAX_LEVEL; i++) { 2576 int tslot = path->slots[i]; 2577 2578 if (!path->nodes[i]) 2579 break; 2580 t = path->nodes[i]; 2581 ret = btrfs_tree_mod_log_insert_key(t, tslot, 2582 BTRFS_MOD_LOG_KEY_REPLACE); 2583 BUG_ON(ret < 0); 2584 btrfs_set_node_key(t, key, tslot); 2585 btrfs_mark_buffer_dirty(trans, path->nodes[i]); 2586 if (tslot != 0) 2587 break; 2588 } 2589 } 2590 2591 /* 2592 * update item key. 2593 * 2594 * This function isn't completely safe.
It's the caller's responsibility 2595 * that the new key won't break the order 2596 */ 2597 void btrfs_set_item_key_safe(struct btrfs_trans_handle *trans, 2598 const struct btrfs_path *path, 2599 const struct btrfs_key *new_key) 2600 { 2601 struct btrfs_fs_info *fs_info = trans->fs_info; 2602 struct btrfs_disk_key disk_key; 2603 struct extent_buffer *eb; 2604 int slot; 2605 2606 eb = path->nodes[0]; 2607 slot = path->slots[0]; 2608 if (slot > 0) { 2609 btrfs_item_key(eb, &disk_key, slot - 1); 2610 if (unlikely(btrfs_comp_keys(&disk_key, new_key) >= 0)) { 2611 btrfs_print_leaf(eb); 2612 btrfs_crit(fs_info, 2613 "slot %u key " BTRFS_KEY_FMT " new key " BTRFS_KEY_FMT, 2614 slot, btrfs_disk_key_objectid(&disk_key), 2615 btrfs_disk_key_type(&disk_key), 2616 btrfs_disk_key_offset(&disk_key), 2617 BTRFS_KEY_FMT_VALUE(new_key)); 2618 BUG(); 2619 } 2620 } 2621 if (slot < btrfs_header_nritems(eb) - 1) { 2622 btrfs_item_key(eb, &disk_key, slot + 1); 2623 if (unlikely(btrfs_comp_keys(&disk_key, new_key) <= 0)) { 2624 btrfs_print_leaf(eb); 2625 btrfs_crit(fs_info, 2626 "slot %u key " BTRFS_KEY_FMT " new key " BTRFS_KEY_FMT, 2627 slot, btrfs_disk_key_objectid(&disk_key), 2628 btrfs_disk_key_type(&disk_key), 2629 btrfs_disk_key_offset(&disk_key), 2630 BTRFS_KEY_FMT_VALUE(new_key)); 2631 BUG(); 2632 } 2633 } 2634 2635 btrfs_cpu_key_to_disk(&disk_key, new_key); 2636 btrfs_set_item_key(eb, &disk_key, slot); 2637 btrfs_mark_buffer_dirty(trans, eb); 2638 if (slot == 0) 2639 fixup_low_keys(trans, path, &disk_key, 1); 2640 } 2641 2642 /* 2643 * Check key order of two sibling extent buffers. 2644 * 2645 * Return true if something is wrong. 2646 * Return false if everything is fine. 2647 * 2648 * Tree-checker only works inside one tree block, thus the following 2649 * corruption can not be detected by tree-checker: 2650 * 2651 * Leaf @left | Leaf @right 2652 * -------------------------------------------------------------- 2653 * | 1 | 2 | 3 | 4 | 5 | f6 | | 7 | 8 | 2654 * 2655 * Key f6 in leaf @left itself is valid, but not valid when the next 2656 * key in leaf @right is 7. 2657 * This can only be checked at tree block merge time. 2658 * And since tree checker has ensured all key order in each tree block 2659 * is correct, we only need to bother the last key of @left and the first 2660 * key of @right. 
2661 */ 2662 static bool check_sibling_keys(const struct extent_buffer *left, 2663 const struct extent_buffer *right) 2664 { 2665 struct btrfs_key left_last; 2666 struct btrfs_key right_first; 2667 int level = btrfs_header_level(left); 2668 int nr_left = btrfs_header_nritems(left); 2669 int nr_right = btrfs_header_nritems(right); 2670 2671 /* No key to check in one of the tree blocks */ 2672 if (!nr_left || !nr_right) 2673 return false; 2674 2675 if (level) { 2676 btrfs_node_key_to_cpu(left, &left_last, nr_left - 1); 2677 btrfs_node_key_to_cpu(right, &right_first, 0); 2678 } else { 2679 btrfs_item_key_to_cpu(left, &left_last, nr_left - 1); 2680 btrfs_item_key_to_cpu(right, &right_first, 0); 2681 } 2682 2683 if (unlikely(btrfs_comp_cpu_keys(&left_last, &right_first) >= 0)) { 2684 btrfs_crit(left->fs_info, "left extent buffer:"); 2685 btrfs_print_tree(left, false); 2686 btrfs_crit(left->fs_info, "right extent buffer:"); 2687 btrfs_print_tree(right, false); 2688 btrfs_crit(left->fs_info, 2689 "bad key order, sibling blocks, left last " BTRFS_KEY_FMT " right first " BTRFS_KEY_FMT, 2690 BTRFS_KEY_FMT_VALUE(&left_last), 2691 BTRFS_KEY_FMT_VALUE(&right_first)); 2692 return true; 2693 } 2694 return false; 2695 } 2696 2697 /* 2698 * try to push data from one node into the next node left in the 2699 * tree. 2700 * 2701 * returns 0 if some ptrs were pushed left, < 0 if there was some horrible 2702 * error, and > 0 if there was no room in the left hand block. 2703 */ 2704 static int push_node_left(struct btrfs_trans_handle *trans, 2705 struct extent_buffer *dst, 2706 struct extent_buffer *src, bool empty) 2707 { 2708 struct btrfs_fs_info *fs_info = trans->fs_info; 2709 int push_items = 0; 2710 int src_nritems; 2711 int dst_nritems; 2712 int ret = 0; 2713 2714 src_nritems = btrfs_header_nritems(src); 2715 dst_nritems = btrfs_header_nritems(dst); 2716 push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems; 2717 WARN_ON(btrfs_header_generation(src) != trans->transid); 2718 WARN_ON(btrfs_header_generation(dst) != trans->transid); 2719 2720 if (!empty && src_nritems <= 8) 2721 return 1; 2722 2723 if (push_items <= 0) 2724 return 1; 2725 2726 if (empty) { 2727 push_items = min(src_nritems, push_items); 2728 if (push_items < src_nritems) { 2729 /* leave at least 8 pointers in the node if 2730 * we aren't going to empty it 2731 */ 2732 if (src_nritems - push_items < 8) { 2733 if (push_items <= 8) 2734 return 1; 2735 push_items -= 8; 2736 } 2737 } 2738 } else 2739 push_items = min(src_nritems - 8, push_items); 2740 2741 /* dst is the left eb, src is the middle eb */ 2742 if (unlikely(check_sibling_keys(dst, src))) { 2743 ret = -EUCLEAN; 2744 btrfs_abort_transaction(trans, ret); 2745 return ret; 2746 } 2747 ret = btrfs_tree_mod_log_eb_copy(dst, src, dst_nritems, 0, push_items); 2748 if (unlikely(ret)) { 2749 btrfs_abort_transaction(trans, ret); 2750 return ret; 2751 } 2752 copy_extent_buffer(dst, src, 2753 btrfs_node_key_ptr_offset(dst, dst_nritems), 2754 btrfs_node_key_ptr_offset(src, 0), 2755 push_items * sizeof(struct btrfs_key_ptr)); 2756 2757 if (push_items < src_nritems) { 2758 /* 2759 * btrfs_tree_mod_log_eb_copy handles logging the move, so we 2760 * don't need to do an explicit tree mod log operation for it. 
2761 */ 2762 memmove_extent_buffer(src, btrfs_node_key_ptr_offset(src, 0), 2763 btrfs_node_key_ptr_offset(src, push_items), 2764 (src_nritems - push_items) * 2765 sizeof(struct btrfs_key_ptr)); 2766 } 2767 btrfs_set_header_nritems(src, src_nritems - push_items); 2768 btrfs_set_header_nritems(dst, dst_nritems + push_items); 2769 btrfs_mark_buffer_dirty(trans, src); 2770 btrfs_mark_buffer_dirty(trans, dst); 2771 2772 return ret; 2773 } 2774 2775 /* 2776 * try to push data from one node into the next node right in the 2777 * tree. 2778 * 2779 * returns 0 if some ptrs were pushed, < 0 if there was some horrible 2780 * error, and > 0 if there was no room in the right hand block. 2781 * 2782 * this will only push up to 1/2 the contents of the left node over 2783 */ 2784 static int balance_node_right(struct btrfs_trans_handle *trans, 2785 struct extent_buffer *dst, 2786 struct extent_buffer *src) 2787 { 2788 struct btrfs_fs_info *fs_info = trans->fs_info; 2789 int push_items = 0; 2790 int max_push; 2791 int src_nritems; 2792 int dst_nritems; 2793 int ret = 0; 2794 2795 WARN_ON(btrfs_header_generation(src) != trans->transid); 2796 WARN_ON(btrfs_header_generation(dst) != trans->transid); 2797 2798 src_nritems = btrfs_header_nritems(src); 2799 dst_nritems = btrfs_header_nritems(dst); 2800 push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems; 2801 if (push_items <= 0) 2802 return 1; 2803 2804 if (src_nritems < 4) 2805 return 1; 2806 2807 max_push = src_nritems / 2 + 1; 2808 /* don't try to empty the node */ 2809 if (max_push >= src_nritems) 2810 return 1; 2811 2812 if (max_push < push_items) 2813 push_items = max_push; 2814 2815 /* dst is the right eb, src is the middle eb */ 2816 if (unlikely(check_sibling_keys(src, dst))) { 2817 ret = -EUCLEAN; 2818 btrfs_abort_transaction(trans, ret); 2819 return ret; 2820 } 2821 2822 /* 2823 * btrfs_tree_mod_log_eb_copy handles logging the move, so we don't 2824 * need to do an explicit tree mod log operation for it. 2825 */ 2826 memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(dst, push_items), 2827 btrfs_node_key_ptr_offset(dst, 0), 2828 (dst_nritems) * 2829 sizeof(struct btrfs_key_ptr)); 2830 2831 ret = btrfs_tree_mod_log_eb_copy(dst, src, 0, src_nritems - push_items, 2832 push_items); 2833 if (unlikely(ret)) { 2834 btrfs_abort_transaction(trans, ret); 2835 return ret; 2836 } 2837 copy_extent_buffer(dst, src, 2838 btrfs_node_key_ptr_offset(dst, 0), 2839 btrfs_node_key_ptr_offset(src, src_nritems - push_items), 2840 push_items * sizeof(struct btrfs_key_ptr)); 2841 2842 btrfs_set_header_nritems(src, src_nritems - push_items); 2843 btrfs_set_header_nritems(dst, dst_nritems + push_items); 2844 2845 btrfs_mark_buffer_dirty(trans, src); 2846 btrfs_mark_buffer_dirty(trans, dst); 2847 2848 return ret; 2849 } 2850 2851 /* 2852 * helper function to insert a new root level in the tree. 2853 * A new node is allocated, and a single item is inserted to 2854 * point to the existing root 2855 * 2856 * returns zero on success or < 0 on failure. 
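 *
 * For example (illustrative): if the current root is a level 1 node, the
 * new root is a level 2 node with nritems == 1, whose single key pointer
 * carries the old root's first key and block pointer; the tree becomes one
 * level taller and path->nodes[level] is set to the new root.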
2857 */ 2858 static noinline int insert_new_root(struct btrfs_trans_handle *trans, 2859 struct btrfs_root *root, 2860 struct btrfs_path *path, int level) 2861 { 2862 u64 lower_gen; 2863 struct extent_buffer *lower; 2864 struct extent_buffer *c; 2865 struct extent_buffer *old; 2866 struct btrfs_disk_key lower_key; 2867 int ret; 2868 2869 BUG_ON(path->nodes[level]); 2870 BUG_ON(path->nodes[level-1] != root->node); 2871 2872 lower = path->nodes[level-1]; 2873 if (level == 1) 2874 btrfs_item_key(lower, &lower_key, 0); 2875 else 2876 btrfs_node_key(lower, &lower_key, 0); 2877 2878 c = btrfs_alloc_tree_block(trans, root, 0, btrfs_root_id(root), 2879 &lower_key, level, root->node->start, 0, 2880 0, BTRFS_NESTING_NEW_ROOT); 2881 if (IS_ERR(c)) 2882 return PTR_ERR(c); 2883 2884 root_add_used_bytes(root); 2885 2886 btrfs_set_header_nritems(c, 1); 2887 btrfs_set_node_key(c, &lower_key, 0); 2888 btrfs_set_node_blockptr(c, 0, lower->start); 2889 lower_gen = btrfs_header_generation(lower); 2890 WARN_ON(lower_gen != trans->transid); 2891 2892 btrfs_set_node_ptr_generation(c, 0, lower_gen); 2893 2894 btrfs_mark_buffer_dirty(trans, c); 2895 2896 old = root->node; 2897 ret = btrfs_tree_mod_log_insert_root(root->node, c, false); 2898 if (ret < 0) { 2899 int ret2; 2900 2901 btrfs_clear_buffer_dirty(trans, c); 2902 ret2 = btrfs_free_tree_block(trans, btrfs_root_id(root), c, 0, 1); 2903 if (unlikely(ret2 < 0)) 2904 btrfs_abort_transaction(trans, ret2); 2905 btrfs_tree_unlock(c); 2906 free_extent_buffer(c); 2907 return ret; 2908 } 2909 rcu_assign_pointer(root->node, c); 2910 2911 /* the super has an extra ref to root->node */ 2912 free_extent_buffer(old); 2913 2914 add_root_to_dirty_list(root); 2915 refcount_inc(&c->refs); 2916 path->nodes[level] = c; 2917 path->locks[level] = BTRFS_WRITE_LOCK; 2918 path->slots[level] = 0; 2919 return 0; 2920 } 2921 2922 /* 2923 * worker function to insert a single pointer in a node. 2924 * the node should have enough room for the pointer already 2925 * 2926 * slot and level indicate where you want the key to go, and 2927 * blocknr is the block the key points to. 
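 *
 * For example (illustrative): with nritems == 5 and slot == 2, the key
 * pointers in slots [2, 5) are shifted one position to the right, the new
 * key and block pointer are written at slot 2, and nritems becomes 6.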
2928 */ 2929 static int insert_ptr(struct btrfs_trans_handle *trans, 2930 const struct btrfs_path *path, 2931 const struct btrfs_disk_key *key, u64 bytenr, 2932 int slot, int level) 2933 { 2934 struct extent_buffer *lower; 2935 int nritems; 2936 int ret; 2937 2938 BUG_ON(!path->nodes[level]); 2939 btrfs_assert_tree_write_locked(path->nodes[level]); 2940 lower = path->nodes[level]; 2941 nritems = btrfs_header_nritems(lower); 2942 BUG_ON(slot > nritems); 2943 BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(trans->fs_info)); 2944 if (slot != nritems) { 2945 if (level) { 2946 ret = btrfs_tree_mod_log_insert_move(lower, slot + 1, 2947 slot, nritems - slot); 2948 if (unlikely(ret < 0)) { 2949 btrfs_abort_transaction(trans, ret); 2950 return ret; 2951 } 2952 } 2953 memmove_extent_buffer(lower, 2954 btrfs_node_key_ptr_offset(lower, slot + 1), 2955 btrfs_node_key_ptr_offset(lower, slot), 2956 (nritems - slot) * sizeof(struct btrfs_key_ptr)); 2957 } 2958 if (level) { 2959 ret = btrfs_tree_mod_log_insert_key(lower, slot, 2960 BTRFS_MOD_LOG_KEY_ADD); 2961 if (unlikely(ret < 0)) { 2962 btrfs_abort_transaction(trans, ret); 2963 return ret; 2964 } 2965 } 2966 btrfs_set_node_key(lower, key, slot); 2967 btrfs_set_node_blockptr(lower, slot, bytenr); 2968 WARN_ON(trans->transid == 0); 2969 btrfs_set_node_ptr_generation(lower, slot, trans->transid); 2970 btrfs_set_header_nritems(lower, nritems + 1); 2971 btrfs_mark_buffer_dirty(trans, lower); 2972 2973 return 0; 2974 } 2975 2976 /* 2977 * split the node at the specified level in path in two. 2978 * The path is corrected to point to the appropriate node after the split 2979 * 2980 * Before splitting this tries to make some room in the node by pushing 2981 * left and right, if either one works, it returns right away. 2982 * 2983 * returns 0 on success and < 0 on failure 2984 */ 2985 static noinline int split_node(struct btrfs_trans_handle *trans, 2986 struct btrfs_root *root, 2987 struct btrfs_path *path, int level) 2988 { 2989 struct btrfs_fs_info *fs_info = root->fs_info; 2990 struct extent_buffer *c; 2991 struct extent_buffer *split; 2992 struct btrfs_disk_key disk_key; 2993 int mid; 2994 int ret; 2995 u32 c_nritems; 2996 2997 c = path->nodes[level]; 2998 WARN_ON(btrfs_header_generation(c) != trans->transid); 2999 if (c == root->node) { 3000 /* 3001 * trying to split the root, let's make a new one 3002 * 3003 * tree mod log: We don't log the removal of the old root in 3004 * insert_new_root, because that root buffer will be kept as a 3005 * normal node. We are going to log removal of half of the 3006 * elements below with btrfs_tree_mod_log_eb_copy(). We're 3007 * holding a tree lock on the buffer, which is why we cannot 3008 * race with other tree_mod_log users.
3009 */ 3010 ret = insert_new_root(trans, root, path, level + 1); 3011 if (ret) 3012 return ret; 3013 } else { 3014 ret = push_nodes_for_insert(trans, root, path, level); 3015 c = path->nodes[level]; 3016 if (!ret && btrfs_header_nritems(c) < 3017 BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3) 3018 return 0; 3019 if (ret < 0) 3020 return ret; 3021 } 3022 3023 c_nritems = btrfs_header_nritems(c); 3024 mid = (c_nritems + 1) / 2; 3025 btrfs_node_key(c, &disk_key, mid); 3026 3027 split = btrfs_alloc_tree_block(trans, root, 0, btrfs_root_id(root), 3028 &disk_key, level, c->start, 0, 3029 0, BTRFS_NESTING_SPLIT); 3030 if (IS_ERR(split)) 3031 return PTR_ERR(split); 3032 3033 root_add_used_bytes(root); 3034 ASSERT(btrfs_header_level(c) == level); 3035 3036 ret = btrfs_tree_mod_log_eb_copy(split, c, 0, mid, c_nritems - mid); 3037 if (unlikely(ret)) { 3038 btrfs_tree_unlock(split); 3039 free_extent_buffer(split); 3040 btrfs_abort_transaction(trans, ret); 3041 return ret; 3042 } 3043 copy_extent_buffer(split, c, 3044 btrfs_node_key_ptr_offset(split, 0), 3045 btrfs_node_key_ptr_offset(c, mid), 3046 (c_nritems - mid) * sizeof(struct btrfs_key_ptr)); 3047 btrfs_set_header_nritems(split, c_nritems - mid); 3048 btrfs_set_header_nritems(c, mid); 3049 3050 btrfs_mark_buffer_dirty(trans, c); 3051 btrfs_mark_buffer_dirty(trans, split); 3052 3053 ret = insert_ptr(trans, path, &disk_key, split->start, 3054 path->slots[level + 1] + 1, level + 1); 3055 if (ret < 0) { 3056 btrfs_tree_unlock(split); 3057 free_extent_buffer(split); 3058 return ret; 3059 } 3060 3061 if (path->slots[level] >= mid) { 3062 path->slots[level] -= mid; 3063 btrfs_tree_unlock(c); 3064 free_extent_buffer(c); 3065 path->nodes[level] = split; 3066 path->slots[level + 1] += 1; 3067 } else { 3068 btrfs_tree_unlock(split); 3069 free_extent_buffer(split); 3070 } 3071 return 0; 3072 } 3073 3074 /* 3075 * how many bytes are required to store the items in a leaf. start 3076 * and nr indicate which items in the leaf to check. This totals up the 3077 * space used both by the item structs and the item data 3078 */ 3079 static int leaf_space_used(const struct extent_buffer *l, int start, int nr) 3080 { 3081 int data_len; 3082 int nritems = btrfs_header_nritems(l); 3083 int end = min(nritems, start + nr) - 1; 3084 3085 if (!nr) 3086 return 0; 3087 data_len = btrfs_item_offset(l, start) + btrfs_item_size(l, start); 3088 data_len = data_len - btrfs_item_offset(l, end); 3089 data_len += sizeof(struct btrfs_item) * nr; 3090 WARN_ON(data_len < 0); 3091 return data_len; 3092 } 3093 3094 /* 3095 * The space between the end of the leaf items and 3096 * the start of the leaf data. IOW, how much room 3097 * the leaf has left for both items and data 3098 */ 3099 int btrfs_leaf_free_space(const struct extent_buffer *leaf) 3100 { 3101 struct btrfs_fs_info *fs_info = leaf->fs_info; 3102 int nritems = btrfs_header_nritems(leaf); 3103 int ret; 3104 3105 ret = BTRFS_LEAF_DATA_SIZE(fs_info) - leaf_space_used(leaf, 0, nritems); 3106 if (unlikely(ret < 0)) { 3107 btrfs_crit(fs_info, 3108 "leaf free space ret %d, leaf data size %lu, used %d nritems %d", 3109 ret, 3110 (unsigned long) BTRFS_LEAF_DATA_SIZE(fs_info), 3111 leaf_space_used(leaf, 0, nritems), nritems); 3112 } 3113 return ret; 3114 } 3115 3116 /* 3117 * min slot controls the lowest index we're willing to push to the 3118 * right. 
We'll push up to and including min_slot, but no lower 3119 */ 3120 static noinline int __push_leaf_right(struct btrfs_trans_handle *trans, 3121 struct btrfs_path *path, 3122 int data_size, bool empty, 3123 struct extent_buffer *right, 3124 int free_space, u32 left_nritems, 3125 u32 min_slot) 3126 { 3127 struct btrfs_fs_info *fs_info = right->fs_info; 3128 struct extent_buffer *left = path->nodes[0]; 3129 struct extent_buffer *upper = path->nodes[1]; 3130 struct btrfs_disk_key disk_key; 3131 int slot; 3132 u32 i; 3133 int push_space = 0; 3134 int push_items = 0; 3135 u32 nr; 3136 u32 right_nritems; 3137 u32 data_end; 3138 u32 this_item_size; 3139 3140 if (empty) 3141 nr = 0; 3142 else 3143 nr = max_t(u32, 1, min_slot); 3144 3145 if (path->slots[0] >= left_nritems) 3146 push_space += data_size; 3147 3148 slot = path->slots[1]; 3149 i = left_nritems - 1; 3150 while (i >= nr) { 3151 if (!empty && push_items > 0) { 3152 if (path->slots[0] > i) 3153 break; 3154 if (path->slots[0] == i) { 3155 int space = btrfs_leaf_free_space(left); 3156 3157 if (space + push_space * 2 > free_space) 3158 break; 3159 } 3160 } 3161 3162 if (path->slots[0] == i) 3163 push_space += data_size; 3164 3165 this_item_size = btrfs_item_size(left, i); 3166 if (this_item_size + sizeof(struct btrfs_item) + 3167 push_space > free_space) 3168 break; 3169 3170 push_items++; 3171 push_space += this_item_size + sizeof(struct btrfs_item); 3172 if (i == 0) 3173 break; 3174 i--; 3175 } 3176 3177 if (push_items == 0) 3178 goto out_unlock; 3179 3180 WARN_ON(!empty && push_items == left_nritems); 3181 3182 /* push left to right */ 3183 right_nritems = btrfs_header_nritems(right); 3184 3185 push_space = btrfs_item_data_end(left, left_nritems - push_items); 3186 push_space -= leaf_data_end(left); 3187 3188 /* make room in the right data area */ 3189 data_end = leaf_data_end(right); 3190 memmove_leaf_data(right, data_end - push_space, data_end, 3191 BTRFS_LEAF_DATA_SIZE(fs_info) - data_end); 3192 3193 /* copy from the left data area */ 3194 copy_leaf_data(right, left, BTRFS_LEAF_DATA_SIZE(fs_info) - push_space, 3195 leaf_data_end(left), push_space); 3196 3197 memmove_leaf_items(right, push_items, 0, right_nritems); 3198 3199 /* copy the items from left to right */ 3200 copy_leaf_items(right, left, 0, left_nritems - push_items, push_items); 3201 3202 /* update the item pointers */ 3203 right_nritems += push_items; 3204 btrfs_set_header_nritems(right, right_nritems); 3205 push_space = BTRFS_LEAF_DATA_SIZE(fs_info); 3206 for (i = 0; i < right_nritems; i++) { 3207 push_space -= btrfs_item_size(right, i); 3208 btrfs_set_item_offset(right, i, push_space); 3209 } 3210 3211 left_nritems -= push_items; 3212 btrfs_set_header_nritems(left, left_nritems); 3213 3214 if (left_nritems) 3215 btrfs_mark_buffer_dirty(trans, left); 3216 else 3217 btrfs_clear_buffer_dirty(trans, left); 3218 3219 btrfs_mark_buffer_dirty(trans, right); 3220 3221 btrfs_item_key(right, &disk_key, 0); 3222 btrfs_set_node_key(upper, &disk_key, slot + 1); 3223 btrfs_mark_buffer_dirty(trans, upper); 3224 3225 /* then fixup the leaf pointer in the path */ 3226 if (path->slots[0] >= left_nritems) { 3227 path->slots[0] -= left_nritems; 3228 btrfs_tree_unlock(left); 3229 free_extent_buffer(left); 3230 path->nodes[0] = right; 3231 path->slots[1] += 1; 3232 } else { 3233 btrfs_tree_unlock(right); 3234 free_extent_buffer(right); 3235 } 3236 return 0; 3237 3238 out_unlock: 3239 btrfs_tree_unlock(right); 3240 free_extent_buffer(right); 3241 return 1; 3242 } 3243 3244 /* 3245 * push some data 
in the path leaf to the right, trying to free up at 3246 * least data_size bytes. returns zero if the push worked, nonzero otherwise 3247 * 3248 * returns 1 if the push failed because the other node didn't have enough 3249 * room, 0 if everything worked out and < 0 if there were major errors. 3250 * 3251 * this will push starting from min_slot to the end of the leaf. It won't 3252 * push any slot lower than min_slot 3253 */ 3254 static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root 3255 *root, struct btrfs_path *path, 3256 int min_data_size, int data_size, 3257 bool empty, u32 min_slot) 3258 { 3259 struct extent_buffer *left = path->nodes[0]; 3260 struct extent_buffer *right; 3261 struct extent_buffer *upper; 3262 int slot; 3263 int free_space; 3264 u32 left_nritems; 3265 int ret; 3266 3267 if (!path->nodes[1]) 3268 return 1; 3269 3270 slot = path->slots[1]; 3271 upper = path->nodes[1]; 3272 if (slot >= btrfs_header_nritems(upper) - 1) 3273 return 1; 3274 3275 btrfs_assert_tree_write_locked(path->nodes[1]); 3276 3277 right = btrfs_read_node_slot(upper, slot + 1); 3278 if (IS_ERR(right)) 3279 return PTR_ERR(right); 3280 3281 btrfs_tree_lock_nested(right, BTRFS_NESTING_RIGHT); 3282 3283 free_space = btrfs_leaf_free_space(right); 3284 if (free_space < data_size) 3285 goto out_unlock; 3286 3287 ret = btrfs_cow_block(trans, root, right, upper, 3288 slot + 1, &right, BTRFS_NESTING_RIGHT_COW); 3289 if (ret) 3290 goto out_unlock; 3291 3292 left_nritems = btrfs_header_nritems(left); 3293 if (left_nritems == 0) 3294 goto out_unlock; 3295 3296 if (unlikely(check_sibling_keys(left, right))) { 3297 ret = -EUCLEAN; 3298 btrfs_abort_transaction(trans, ret); 3299 btrfs_tree_unlock(right); 3300 free_extent_buffer(right); 3301 return ret; 3302 } 3303 if (path->slots[0] == left_nritems && !empty) { 3304 /* Key greater than all keys in the leaf, right neighbor has 3305 * enough room for it and we're not emptying our leaf to delete 3306 * it, therefore use right neighbor to insert the new item and 3307 * no need to touch/dirty our left leaf. */ 3308 btrfs_tree_unlock(left); 3309 free_extent_buffer(left); 3310 path->nodes[0] = right; 3311 path->slots[0] = 0; 3312 path->slots[1]++; 3313 return 0; 3314 } 3315 3316 return __push_leaf_right(trans, path, min_data_size, empty, right, 3317 free_space, left_nritems, min_slot); 3318 out_unlock: 3319 btrfs_tree_unlock(right); 3320 free_extent_buffer(right); 3321 return 1; 3322 } 3323 3324 /* 3325 * push some data in the path leaf to the left, trying to free up at 3326 * least data_size bytes. returns zero if the push worked, nonzero otherwise 3327 * 3328 * max_slot can put a limit on how far into the leaf we'll push items. The 3329 * item at 'max_slot' won't be touched. 
Use (u32)-1 to make us do all the 3330 * items 3331 */ 3332 static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, 3333 struct btrfs_path *path, int data_size, 3334 bool empty, struct extent_buffer *left, 3335 int free_space, u32 right_nritems, 3336 u32 max_slot) 3337 { 3338 struct btrfs_fs_info *fs_info = left->fs_info; 3339 struct btrfs_disk_key disk_key; 3340 struct extent_buffer *right = path->nodes[0]; 3341 int i; 3342 int push_space = 0; 3343 int push_items = 0; 3344 u32 old_left_nritems; 3345 u32 nr; 3346 int ret = 0; 3347 u32 this_item_size; 3348 u32 old_left_item_size; 3349 3350 if (empty) 3351 nr = min(right_nritems, max_slot); 3352 else 3353 nr = min(right_nritems - 1, max_slot); 3354 3355 for (i = 0; i < nr; i++) { 3356 if (!empty && push_items > 0) { 3357 if (path->slots[0] < i) 3358 break; 3359 if (path->slots[0] == i) { 3360 int space = btrfs_leaf_free_space(right); 3361 3362 if (space + push_space * 2 > free_space) 3363 break; 3364 } 3365 } 3366 3367 if (path->slots[0] == i) 3368 push_space += data_size; 3369 3370 this_item_size = btrfs_item_size(right, i); 3371 if (this_item_size + sizeof(struct btrfs_item) + push_space > 3372 free_space) 3373 break; 3374 3375 push_items++; 3376 push_space += this_item_size + sizeof(struct btrfs_item); 3377 } 3378 3379 if (push_items == 0) { 3380 ret = 1; 3381 goto out; 3382 } 3383 WARN_ON(!empty && push_items == btrfs_header_nritems(right)); 3384 3385 /* push data from right to left */ 3386 copy_leaf_items(left, right, btrfs_header_nritems(left), 0, push_items); 3387 3388 push_space = BTRFS_LEAF_DATA_SIZE(fs_info) - 3389 btrfs_item_offset(right, push_items - 1); 3390 3391 copy_leaf_data(left, right, leaf_data_end(left) - push_space, 3392 btrfs_item_offset(right, push_items - 1), push_space); 3393 old_left_nritems = btrfs_header_nritems(left); 3394 BUG_ON(old_left_nritems <= 0); 3395 3396 old_left_item_size = btrfs_item_offset(left, old_left_nritems - 1); 3397 for (i = old_left_nritems; i < old_left_nritems + push_items; i++) { 3398 u32 ioff; 3399 3400 ioff = btrfs_item_offset(left, i); 3401 btrfs_set_item_offset(left, i, 3402 ioff - (BTRFS_LEAF_DATA_SIZE(fs_info) - old_left_item_size)); 3403 } 3404 btrfs_set_header_nritems(left, old_left_nritems + push_items); 3405 3406 /* fixup right node */ 3407 if (unlikely(push_items > right_nritems)) { 3408 ret = -EUCLEAN; 3409 btrfs_abort_transaction(trans, ret); 3410 btrfs_crit(fs_info, "push items (%d) > right leaf items (%u)", 3411 push_items, right_nritems); 3412 goto out; 3413 } 3414 3415 if (push_items < right_nritems) { 3416 push_space = btrfs_item_offset(right, push_items - 1) - 3417 leaf_data_end(right); 3418 memmove_leaf_data(right, 3419 BTRFS_LEAF_DATA_SIZE(fs_info) - push_space, 3420 leaf_data_end(right), push_space); 3421 3422 memmove_leaf_items(right, 0, push_items, 3423 btrfs_header_nritems(right) - push_items); 3424 } 3425 3426 right_nritems -= push_items; 3427 btrfs_set_header_nritems(right, right_nritems); 3428 push_space = BTRFS_LEAF_DATA_SIZE(fs_info); 3429 for (i = 0; i < right_nritems; i++) { 3430 push_space = push_space - btrfs_item_size(right, i); 3431 btrfs_set_item_offset(right, i, push_space); 3432 } 3433 3434 btrfs_mark_buffer_dirty(trans, left); 3435 if (right_nritems) 3436 btrfs_mark_buffer_dirty(trans, right); 3437 else 3438 btrfs_clear_buffer_dirty(trans, right); 3439 3440 btrfs_item_key(right, &disk_key, 0); 3441 fixup_low_keys(trans, path, &disk_key, 1); 3442 3443 /* then fixup the leaf pointer in the path */ 3444 if (path->slots[0] < push_items) { 
3445 path->slots[0] += old_left_nritems; 3446 btrfs_tree_unlock(right); 3447 free_extent_buffer(right); 3448 path->nodes[0] = left; 3449 path->slots[1] -= 1; 3450 } else { 3451 btrfs_tree_unlock(left); 3452 free_extent_buffer(left); 3453 path->slots[0] -= push_items; 3454 } 3455 BUG_ON(path->slots[0] < 0); 3456 return ret; 3457 out: 3458 btrfs_tree_unlock(left); 3459 free_extent_buffer(left); 3460 return ret; 3461 } 3462 3463 /* 3464 * push some data in the path leaf to the left, trying to free up at 3465 * least data_size bytes. returns zero if the push worked, nonzero otherwise 3466 * 3467 * max_slot can put a limit on how far into the leaf we'll push items. The 3468 * item at 'max_slot' won't be touched. Use (u32)-1 to make us push all the 3469 * items 3470 */ 3471 static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root 3472 *root, struct btrfs_path *path, int min_data_size, 3473 int data_size, int empty, u32 max_slot) 3474 { 3475 struct extent_buffer *right = path->nodes[0]; 3476 struct extent_buffer *left; 3477 int slot; 3478 int free_space; 3479 u32 right_nritems; 3480 int ret = 0; 3481 3482 slot = path->slots[1]; 3483 if (slot == 0) 3484 return 1; 3485 if (!path->nodes[1]) 3486 return 1; 3487 3488 right_nritems = btrfs_header_nritems(right); 3489 if (right_nritems == 0) 3490 return 1; 3491 3492 btrfs_assert_tree_write_locked(path->nodes[1]); 3493 3494 left = btrfs_read_node_slot(path->nodes[1], slot - 1); 3495 if (IS_ERR(left)) 3496 return PTR_ERR(left); 3497 3498 btrfs_tree_lock_nested(left, BTRFS_NESTING_LEFT); 3499 3500 free_space = btrfs_leaf_free_space(left); 3501 if (free_space < data_size) { 3502 ret = 1; 3503 goto out; 3504 } 3505 3506 ret = btrfs_cow_block(trans, root, left, 3507 path->nodes[1], slot - 1, &left, 3508 BTRFS_NESTING_LEFT_COW); 3509 if (ret) { 3510 /* we hit -ENOSPC, but it isn't fatal here */ 3511 if (ret == -ENOSPC) 3512 ret = 1; 3513 goto out; 3514 } 3515 3516 if (unlikely(check_sibling_keys(left, right))) { 3517 ret = -EUCLEAN; 3518 btrfs_abort_transaction(trans, ret); 3519 goto out; 3520 } 3521 return __push_leaf_left(trans, path, min_data_size, empty, left, 3522 free_space, right_nritems, max_slot); 3523 out: 3524 btrfs_tree_unlock(left); 3525 free_extent_buffer(left); 3526 return ret; 3527 } 3528 3529 /* 3530 * split the path's leaf in two, making sure there is at least data_size 3531 * available for the resulting leaf level of the path. 
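 *
 * A sketch of the copy (illustrative): items [mid, nritems) of leaf 'l'
 * are copied into the new leaf 'right', their data offsets are shifted by
 * rt_data_off so the data is repacked against the end of the right leaf's
 * data area, and 'l' is trimmed to 'mid' items before 'right' is linked
 * into the parent with insert_ptr().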
3532 */ 3533 static noinline int copy_for_split(struct btrfs_trans_handle *trans, 3534 struct btrfs_path *path, 3535 struct extent_buffer *l, 3536 struct extent_buffer *right, 3537 int slot, int mid, int nritems) 3538 { 3539 struct btrfs_fs_info *fs_info = trans->fs_info; 3540 int data_copy_size; 3541 int rt_data_off; 3542 int i; 3543 int ret; 3544 struct btrfs_disk_key disk_key; 3545 3546 nritems = nritems - mid; 3547 btrfs_set_header_nritems(right, nritems); 3548 data_copy_size = btrfs_item_data_end(l, mid) - leaf_data_end(l); 3549 3550 copy_leaf_items(right, l, 0, mid, nritems); 3551 3552 copy_leaf_data(right, l, BTRFS_LEAF_DATA_SIZE(fs_info) - data_copy_size, 3553 leaf_data_end(l), data_copy_size); 3554 3555 rt_data_off = BTRFS_LEAF_DATA_SIZE(fs_info) - btrfs_item_data_end(l, mid); 3556 3557 for (i = 0; i < nritems; i++) { 3558 u32 ioff; 3559 3560 ioff = btrfs_item_offset(right, i); 3561 btrfs_set_item_offset(right, i, ioff + rt_data_off); 3562 } 3563 3564 btrfs_set_header_nritems(l, mid); 3565 btrfs_item_key(right, &disk_key, 0); 3566 ret = insert_ptr(trans, path, &disk_key, right->start, path->slots[1] + 1, 1); 3567 if (ret < 0) 3568 return ret; 3569 3570 btrfs_mark_buffer_dirty(trans, right); 3571 btrfs_mark_buffer_dirty(trans, l); 3572 BUG_ON(path->slots[0] != slot); 3573 3574 if (mid <= slot) { 3575 btrfs_tree_unlock(path->nodes[0]); 3576 free_extent_buffer(path->nodes[0]); 3577 path->nodes[0] = right; 3578 path->slots[0] -= mid; 3579 path->slots[1] += 1; 3580 } else { 3581 btrfs_tree_unlock(right); 3582 free_extent_buffer(right); 3583 } 3584 3585 BUG_ON(path->slots[0] < 0); 3586 3587 return 0; 3588 } 3589 3590 /* 3591 * double splits happen when we need to insert a big item in the middle 3592 * of a leaf. A double split can leave us with 3 mostly empty leaves: 3593 * leaf: [ slots 0 - N] [ our target ] [ N + 1 - total in leaf ] 3594 * A B C 3595 * 3596 * We avoid this by trying to push the items on either side of our target 3597 * into the adjacent leaves. If all goes well we can avoid the double split 3598 * completely. 3599 */ 3600 static noinline int push_for_double_split(struct btrfs_trans_handle *trans, 3601 struct btrfs_root *root, 3602 struct btrfs_path *path, 3603 int data_size) 3604 { 3605 int ret; 3606 int progress = 0; 3607 int slot; 3608 u32 nritems; 3609 int space_needed = data_size; 3610 3611 slot = path->slots[0]; 3612 if (slot < btrfs_header_nritems(path->nodes[0])) 3613 space_needed -= btrfs_leaf_free_space(path->nodes[0]); 3614 3615 /* 3616 * try to push all the items after our slot into the 3617 * right leaf 3618 */ 3619 ret = push_leaf_right(trans, root, path, 1, space_needed, 0, slot); 3620 if (ret < 0) 3621 return ret; 3622 3623 if (ret == 0) 3624 progress++; 3625 3626 nritems = btrfs_header_nritems(path->nodes[0]); 3627 /* 3628 * our goal is to get our slot at the start or end of a leaf. 
If 3629 * we've done so we're done 3630 */ 3631 if (path->slots[0] == 0 || path->slots[0] == nritems) 3632 return 0; 3633 3634 if (btrfs_leaf_free_space(path->nodes[0]) >= data_size) 3635 return 0; 3636 3637 /* try to push all the items before our slot into the next leaf */ 3638 slot = path->slots[0]; 3639 space_needed = data_size; 3640 if (slot > 0) 3641 space_needed -= btrfs_leaf_free_space(path->nodes[0]); 3642 ret = push_leaf_left(trans, root, path, 1, space_needed, 0, slot); 3643 if (ret < 0) 3644 return ret; 3645 3646 if (ret == 0) 3647 progress++; 3648 3649 if (progress) 3650 return 0; 3651 return 1; 3652 } 3653 3654 /* 3655 * split the path's leaf in two, making sure there is at least data_size 3656 * available for the resulting leaf level of the path. 3657 * 3658 * returns 0 if all went well and < 0 on failure. 3659 */ 3660 static noinline int split_leaf(struct btrfs_trans_handle *trans, 3661 struct btrfs_root *root, 3662 const struct btrfs_key *ins_key, 3663 struct btrfs_path *path, int data_size, 3664 bool extend) 3665 { 3666 struct btrfs_disk_key disk_key; 3667 struct extent_buffer *l; 3668 u32 nritems; 3669 int mid; 3670 int slot; 3671 struct extent_buffer *right; 3672 struct btrfs_fs_info *fs_info = root->fs_info; 3673 int ret = 0; 3674 int wret; 3675 int split; 3676 int num_doubles = 0; 3677 int tried_avoid_double = 0; 3678 3679 l = path->nodes[0]; 3680 slot = path->slots[0]; 3681 if (extend && data_size + btrfs_item_size(l, slot) + 3682 sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(fs_info)) 3683 return -EOVERFLOW; 3684 3685 /* first try to make some room by pushing left and right */ 3686 if (data_size && path->nodes[1]) { 3687 int space_needed = data_size; 3688 3689 if (slot < btrfs_header_nritems(l)) 3690 space_needed -= btrfs_leaf_free_space(l); 3691 3692 wret = push_leaf_right(trans, root, path, space_needed, 3693 space_needed, 0, 0); 3694 if (wret < 0) 3695 return wret; 3696 if (wret) { 3697 space_needed = data_size; 3698 if (slot > 0) 3699 space_needed -= btrfs_leaf_free_space(l); 3700 wret = push_leaf_left(trans, root, path, space_needed, 3701 space_needed, 0, (u32)-1); 3702 if (wret < 0) 3703 return wret; 3704 } 3705 l = path->nodes[0]; 3706 3707 /* did the pushes work? 
*/ 3708 if (btrfs_leaf_free_space(l) >= data_size) 3709 return 0; 3710 } 3711 3712 if (!path->nodes[1]) { 3713 ret = insert_new_root(trans, root, path, 1); 3714 if (ret) 3715 return ret; 3716 } 3717 again: 3718 split = 1; 3719 l = path->nodes[0]; 3720 slot = path->slots[0]; 3721 nritems = btrfs_header_nritems(l); 3722 mid = (nritems + 1) / 2; 3723 3724 if (mid <= slot) { 3725 if (nritems == 1 || 3726 leaf_space_used(l, mid, nritems - mid) + data_size > 3727 BTRFS_LEAF_DATA_SIZE(fs_info)) { 3728 if (slot >= nritems) { 3729 split = 0; 3730 } else { 3731 mid = slot; 3732 if (mid != nritems && 3733 leaf_space_used(l, mid, nritems - mid) + 3734 data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) { 3735 if (data_size && !tried_avoid_double) 3736 goto push_for_double; 3737 split = 2; 3738 } 3739 } 3740 } 3741 } else { 3742 if (leaf_space_used(l, 0, mid) + data_size > 3743 BTRFS_LEAF_DATA_SIZE(fs_info)) { 3744 if (!extend && data_size && slot == 0) { 3745 split = 0; 3746 } else if ((extend || !data_size) && slot == 0) { 3747 mid = 1; 3748 } else { 3749 mid = slot; 3750 if (mid != nritems && 3751 leaf_space_used(l, mid, nritems - mid) + 3752 data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) { 3753 if (data_size && !tried_avoid_double) 3754 goto push_for_double; 3755 split = 2; 3756 } 3757 } 3758 } 3759 } 3760 3761 if (split == 0) 3762 btrfs_cpu_key_to_disk(&disk_key, ins_key); 3763 else 3764 btrfs_item_key(l, &disk_key, mid); 3765 3766 /* 3767 * We have to use BTRFS_NESTING_NEW_ROOT here if we've done a double 3768 * split, because we're only allowed to have MAX_LOCKDEP_SUBCLASSES 3769 * subclasses, which is 8 at the time of this patch, and we've maxed it 3770 * out. In the future we could add a 3771 * BTRFS_NESTING_SPLIT_THE_SPLITTENING if we need to, but for now just 3772 * use BTRFS_NESTING_NEW_ROOT. 3773 */ 3774 right = btrfs_alloc_tree_block(trans, root, 0, btrfs_root_id(root), 3775 &disk_key, 0, l->start, 0, 0, 3776 num_doubles ? BTRFS_NESTING_NEW_ROOT : 3777 BTRFS_NESTING_SPLIT); 3778 if (IS_ERR(right)) 3779 return PTR_ERR(right); 3780 3781 root_add_used_bytes(root); 3782 3783 if (split == 0) { 3784 if (mid <= slot) { 3785 btrfs_set_header_nritems(right, 0); 3786 ret = insert_ptr(trans, path, &disk_key, 3787 right->start, path->slots[1] + 1, 1); 3788 if (ret < 0) { 3789 btrfs_tree_unlock(right); 3790 free_extent_buffer(right); 3791 return ret; 3792 } 3793 btrfs_tree_unlock(path->nodes[0]); 3794 free_extent_buffer(path->nodes[0]); 3795 path->nodes[0] = right; 3796 path->slots[0] = 0; 3797 path->slots[1] += 1; 3798 } else { 3799 btrfs_set_header_nritems(right, 0); 3800 ret = insert_ptr(trans, path, &disk_key, 3801 right->start, path->slots[1], 1); 3802 if (ret < 0) { 3803 btrfs_tree_unlock(right); 3804 free_extent_buffer(right); 3805 return ret; 3806 } 3807 btrfs_tree_unlock(path->nodes[0]); 3808 free_extent_buffer(path->nodes[0]); 3809 path->nodes[0] = right; 3810 path->slots[0] = 0; 3811 if (path->slots[1] == 0) 3812 fixup_low_keys(trans, path, &disk_key, 1); 3813 } 3814 /* 3815 * We create a new leaf 'right' for the required ins_len and 3816 * we'll do btrfs_mark_buffer_dirty() on this leaf after copying 3817 * the new item's content into 'right'.
3818 */ 3819 return ret; 3820 } 3821 3822 ret = copy_for_split(trans, path, l, right, slot, mid, nritems); 3823 if (ret < 0) { 3824 btrfs_tree_unlock(right); 3825 free_extent_buffer(right); 3826 return ret; 3827 } 3828 3829 if (split == 2) { 3830 BUG_ON(num_doubles != 0); 3831 num_doubles++; 3832 goto again; 3833 } 3834 3835 return 0; 3836 3837 push_for_double: 3838 push_for_double_split(trans, root, path, data_size); 3839 tried_avoid_double = 1; 3840 if (btrfs_leaf_free_space(path->nodes[0]) >= data_size) 3841 return 0; 3842 goto again; 3843 } 3844 3845 static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans, 3846 struct btrfs_root *root, 3847 struct btrfs_path *path, int ins_len) 3848 { 3849 struct btrfs_key key; 3850 struct extent_buffer *leaf; 3851 struct btrfs_file_extent_item *fi; 3852 u64 extent_len = 0; 3853 u32 item_size; 3854 int ret; 3855 3856 leaf = path->nodes[0]; 3857 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 3858 3859 BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY && 3860 key.type != BTRFS_RAID_STRIPE_KEY && 3861 key.type != BTRFS_EXTENT_CSUM_KEY); 3862 3863 if (btrfs_leaf_free_space(leaf) >= ins_len) 3864 return 0; 3865 3866 item_size = btrfs_item_size(leaf, path->slots[0]); 3867 if (key.type == BTRFS_EXTENT_DATA_KEY) { 3868 fi = btrfs_item_ptr(leaf, path->slots[0], 3869 struct btrfs_file_extent_item); 3870 extent_len = btrfs_file_extent_num_bytes(leaf, fi); 3871 } 3872 btrfs_release_path(path); 3873 3874 path->keep_locks = true; 3875 path->search_for_split = true; 3876 ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 3877 path->search_for_split = false; 3878 if (ret > 0) 3879 ret = -EAGAIN; 3880 if (ret < 0) 3881 goto err; 3882 3883 ret = -EAGAIN; 3884 leaf = path->nodes[0]; 3885 /* if our item isn't there, return now */ 3886 if (item_size != btrfs_item_size(leaf, path->slots[0])) 3887 goto err; 3888 3889 /* the leaf has changed, it now has room. return now */ 3890 if (btrfs_leaf_free_space(path->nodes[0]) >= ins_len) 3891 goto err; 3892 3893 if (key.type == BTRFS_EXTENT_DATA_KEY) { 3894 fi = btrfs_item_ptr(leaf, path->slots[0], 3895 struct btrfs_file_extent_item); 3896 if (extent_len != btrfs_file_extent_num_bytes(leaf, fi)) 3897 goto err; 3898 } 3899 3900 ret = split_leaf(trans, root, &key, path, ins_len, true); 3901 if (ret) 3902 goto err; 3903 3904 path->keep_locks = false; 3905 btrfs_unlock_up_safe(path, 1); 3906 return 0; 3907 err: 3908 path->keep_locks = false; 3909 return ret; 3910 } 3911 3912 static noinline int split_item(struct btrfs_trans_handle *trans, 3913 struct btrfs_path *path, 3914 const struct btrfs_key *new_key, 3915 unsigned long split_offset) 3916 { 3917 struct extent_buffer *leaf; 3918 int orig_slot, slot; 3919 char *buf; 3920 u32 nritems; 3921 u32 item_size; 3922 u32 orig_offset; 3923 struct btrfs_disk_key disk_key; 3924 3925 leaf = path->nodes[0]; 3926 /* 3927 * Shouldn't happen because the caller must have previously called 3928 * setup_leaf_for_split() to make room for the new item in the leaf. 
3929 */ 3930 if (WARN_ON(btrfs_leaf_free_space(leaf) < sizeof(struct btrfs_item))) 3931 return -ENOSPC; 3932 3933 orig_slot = path->slots[0]; 3934 orig_offset = btrfs_item_offset(leaf, path->slots[0]); 3935 item_size = btrfs_item_size(leaf, path->slots[0]); 3936 3937 buf = kmalloc(item_size, GFP_NOFS); 3938 if (!buf) 3939 return -ENOMEM; 3940 3941 read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf, 3942 path->slots[0]), item_size); 3943 3944 slot = path->slots[0] + 1; 3945 nritems = btrfs_header_nritems(leaf); 3946 if (slot != nritems) { 3947 /* shift the items */ 3948 memmove_leaf_items(leaf, slot + 1, slot, nritems - slot); 3949 } 3950 3951 btrfs_cpu_key_to_disk(&disk_key, new_key); 3952 btrfs_set_item_key(leaf, &disk_key, slot); 3953 3954 btrfs_set_item_offset(leaf, slot, orig_offset); 3955 btrfs_set_item_size(leaf, slot, item_size - split_offset); 3956 3957 btrfs_set_item_offset(leaf, orig_slot, 3958 orig_offset + item_size - split_offset); 3959 btrfs_set_item_size(leaf, orig_slot, split_offset); 3960 3961 btrfs_set_header_nritems(leaf, nritems + 1); 3962 3963 /* write the data for the start of the original item */ 3964 write_extent_buffer(leaf, buf, 3965 btrfs_item_ptr_offset(leaf, path->slots[0]), 3966 split_offset); 3967 3968 /* write the data for the new item */ 3969 write_extent_buffer(leaf, buf + split_offset, 3970 btrfs_item_ptr_offset(leaf, slot), 3971 item_size - split_offset); 3972 btrfs_mark_buffer_dirty(trans, leaf); 3973 3974 BUG_ON(btrfs_leaf_free_space(leaf) < 0); 3975 kfree(buf); 3976 return 0; 3977 } 3978 3979 /* 3980 * This function splits a single item into two items, 3981 * giving 'new_key' to the new item and splitting the 3982 * old one at split_offset (from the start of the item). 3983 * 3984 * The path may be released by this operation. After 3985 * the split, the path is pointing to the old item. The 3986 * new item is going to be in the same node as the old one. 3987 * 3988 * Note, the item being split must be smaller enough to live alone on 3989 * a tree block with room for one extra struct btrfs_item 3990 * 3991 * This allows us to split the item in place, keeping a lock on the 3992 * leaf the entire time. 3993 */ 3994 int btrfs_split_item(struct btrfs_trans_handle *trans, 3995 struct btrfs_root *root, 3996 struct btrfs_path *path, 3997 const struct btrfs_key *new_key, 3998 unsigned long split_offset) 3999 { 4000 int ret; 4001 ret = setup_leaf_for_split(trans, root, path, 4002 sizeof(struct btrfs_item)); 4003 if (ret) 4004 return ret; 4005 4006 return split_item(trans, path, new_key, split_offset); 4007 } 4008 4009 /* 4010 * make the item pointed to by the path smaller. new_size indicates 4011 * how small to make it, and from_end tells us if we just chop bytes 4012 * off the end of the item or if we shift the item to chop bytes off 4013 * the front. 
4014 */ 4015 void btrfs_truncate_item(struct btrfs_trans_handle *trans, 4016 const struct btrfs_path *path, u32 new_size, int from_end) 4017 { 4018 int slot; 4019 struct extent_buffer *leaf; 4020 u32 nritems; 4021 unsigned int data_end; 4022 unsigned int old_data_start; 4023 unsigned int old_size; 4024 unsigned int size_diff; 4025 int i; 4026 4027 leaf = path->nodes[0]; 4028 slot = path->slots[0]; 4029 4030 old_size = btrfs_item_size(leaf, slot); 4031 if (old_size == new_size) 4032 return; 4033 4034 nritems = btrfs_header_nritems(leaf); 4035 data_end = leaf_data_end(leaf); 4036 4037 old_data_start = btrfs_item_offset(leaf, slot); 4038 4039 size_diff = old_size - new_size; 4040 4041 BUG_ON(slot < 0); 4042 BUG_ON(slot >= nritems); 4043 4044 /* 4045 * item0..itemN ... dataN.offset..dataN.size .. data0.size 4046 */ 4047 /* first correct the data pointers */ 4048 for (i = slot; i < nritems; i++) { 4049 u32 ioff; 4050 4051 ioff = btrfs_item_offset(leaf, i); 4052 btrfs_set_item_offset(leaf, i, ioff + size_diff); 4053 } 4054 4055 /* shift the data */ 4056 if (from_end) { 4057 memmove_leaf_data(leaf, data_end + size_diff, data_end, 4058 old_data_start + new_size - data_end); 4059 } else { 4060 struct btrfs_disk_key disk_key; 4061 u64 offset; 4062 4063 btrfs_item_key(leaf, &disk_key, slot); 4064 4065 if (btrfs_disk_key_type(&disk_key) == BTRFS_EXTENT_DATA_KEY) { 4066 unsigned long ptr; 4067 struct btrfs_file_extent_item *fi; 4068 4069 fi = btrfs_item_ptr(leaf, slot, 4070 struct btrfs_file_extent_item); 4071 fi = (struct btrfs_file_extent_item *)( 4072 (unsigned long)fi - size_diff); 4073 4074 if (btrfs_file_extent_type(leaf, fi) == 4075 BTRFS_FILE_EXTENT_INLINE) { 4076 ptr = btrfs_item_ptr_offset(leaf, slot); 4077 memmove_extent_buffer(leaf, ptr, 4078 (unsigned long)fi, 4079 BTRFS_FILE_EXTENT_INLINE_DATA_START); 4080 } 4081 } 4082 4083 memmove_leaf_data(leaf, data_end + size_diff, data_end, 4084 old_data_start - data_end); 4085 4086 offset = btrfs_disk_key_offset(&disk_key); 4087 btrfs_set_disk_key_offset(&disk_key, offset + size_diff); 4088 btrfs_set_item_key(leaf, &disk_key, slot); 4089 if (slot == 0) 4090 fixup_low_keys(trans, path, &disk_key, 1); 4091 } 4092 4093 btrfs_set_item_size(leaf, slot, new_size); 4094 btrfs_mark_buffer_dirty(trans, leaf); 4095 4096 if (unlikely(btrfs_leaf_free_space(leaf) < 0)) { 4097 btrfs_print_leaf(leaf); 4098 BUG(); 4099 } 4100 } 4101 4102 /* 4103 * make the item pointed to by the path bigger, data_size is the added size. 4104 */ 4105 void btrfs_extend_item(struct btrfs_trans_handle *trans, 4106 const struct btrfs_path *path, u32 data_size) 4107 { 4108 int slot; 4109 struct extent_buffer *leaf; 4110 u32 nritems; 4111 unsigned int data_end; 4112 unsigned int old_data; 4113 unsigned int old_size; 4114 int i; 4115 4116 leaf = path->nodes[0]; 4117 4118 nritems = btrfs_header_nritems(leaf); 4119 data_end = leaf_data_end(leaf); 4120 4121 if (unlikely(btrfs_leaf_free_space(leaf) < data_size)) { 4122 btrfs_print_leaf(leaf); 4123 BUG(); 4124 } 4125 slot = path->slots[0]; 4126 old_data = btrfs_item_data_end(leaf, slot); 4127 4128 BUG_ON(slot < 0); 4129 if (unlikely(slot >= nritems)) { 4130 btrfs_print_leaf(leaf); 4131 btrfs_crit(leaf->fs_info, "slot %d too large, nritems %d", 4132 slot, nritems); 4133 BUG(); 4134 } 4135 4136 /* 4137 * item0..itemN ... dataN.offset..dataN.size .. 
data0.size 4138 */ 4139 /* first correct the data pointers */ 4140 for (i = slot; i < nritems; i++) { 4141 u32 ioff; 4142 4143 ioff = btrfs_item_offset(leaf, i); 4144 btrfs_set_item_offset(leaf, i, ioff - data_size); 4145 } 4146 4147 /* shift the data */ 4148 memmove_leaf_data(leaf, data_end - data_size, data_end, 4149 old_data - data_end); 4150 4151 old_size = btrfs_item_size(leaf, slot); 4152 btrfs_set_item_size(leaf, slot, old_size + data_size); 4153 btrfs_mark_buffer_dirty(trans, leaf); 4154 4155 if (unlikely(btrfs_leaf_free_space(leaf) < 0)) { 4156 btrfs_print_leaf(leaf); 4157 BUG(); 4158 } 4159 } 4160 4161 /* 4162 * Make space in the node before inserting one or more items. 4163 * 4164 * @trans: transaction handle 4165 * @root: root we are inserting items to 4166 * @path: points to the leaf/slot where we are going to insert new items 4167 * @batch: information about the batch of items to insert 4168 * 4169 * Main purpose is to save stack depth by doing the bulk of the work in a 4170 * function that doesn't call btrfs_search_slot 4171 */ 4172 static void setup_items_for_insert(struct btrfs_trans_handle *trans, 4173 struct btrfs_root *root, struct btrfs_path *path, 4174 const struct btrfs_item_batch *batch) 4175 { 4176 struct btrfs_fs_info *fs_info = root->fs_info; 4177 int i; 4178 u32 nritems; 4179 unsigned int data_end; 4180 struct btrfs_disk_key disk_key; 4181 struct extent_buffer *leaf; 4182 int slot; 4183 u32 total_size; 4184 4185 /* 4186 * Before anything else, update keys in the parent and other ancestors 4187 * if needed, then release the write locks on them, so that other tasks 4188 * can use them while we modify the leaf. 4189 */ 4190 if (path->slots[0] == 0) { 4191 btrfs_cpu_key_to_disk(&disk_key, &batch->keys[0]); 4192 fixup_low_keys(trans, path, &disk_key, 1); 4193 } 4194 btrfs_unlock_up_safe(path, 1); 4195 4196 leaf = path->nodes[0]; 4197 slot = path->slots[0]; 4198 4199 nritems = btrfs_header_nritems(leaf); 4200 data_end = leaf_data_end(leaf); 4201 total_size = batch->total_data_size + (batch->nr * sizeof(struct btrfs_item)); 4202 4203 if (unlikely(btrfs_leaf_free_space(leaf) < total_size)) { 4204 btrfs_print_leaf(leaf); 4205 btrfs_crit(fs_info, "not enough freespace need %u have %d", 4206 total_size, btrfs_leaf_free_space(leaf)); 4207 BUG(); 4208 } 4209 4210 if (slot != nritems) { 4211 unsigned int old_data = btrfs_item_data_end(leaf, slot); 4212 4213 if (unlikely(old_data < data_end)) { 4214 btrfs_print_leaf(leaf); 4215 btrfs_crit(fs_info, 4216 "item at slot %d with data offset %u beyond data end of leaf %u", 4217 slot, old_data, data_end); 4218 BUG(); 4219 } 4220 /* 4221 * item0..itemN ... dataN.offset..dataN.size .. 
data0.size 4222 */ 4223 /* first correct the data pointers */ 4224 for (i = slot; i < nritems; i++) { 4225 u32 ioff; 4226 4227 ioff = btrfs_item_offset(leaf, i); 4228 btrfs_set_item_offset(leaf, i, 4229 ioff - batch->total_data_size); 4230 } 4231 /* shift the items */ 4232 memmove_leaf_items(leaf, slot + batch->nr, slot, nritems - slot); 4233 4234 /* shift the data */ 4235 memmove_leaf_data(leaf, data_end - batch->total_data_size, 4236 data_end, old_data - data_end); 4237 data_end = old_data; 4238 } 4239 4240 /* setup the item for the new data */ 4241 for (i = 0; i < batch->nr; i++) { 4242 btrfs_cpu_key_to_disk(&disk_key, &batch->keys[i]); 4243 btrfs_set_item_key(leaf, &disk_key, slot + i); 4244 data_end -= batch->data_sizes[i]; 4245 btrfs_set_item_offset(leaf, slot + i, data_end); 4246 btrfs_set_item_size(leaf, slot + i, batch->data_sizes[i]); 4247 } 4248 4249 btrfs_set_header_nritems(leaf, nritems + batch->nr); 4250 btrfs_mark_buffer_dirty(trans, leaf); 4251 4252 if (unlikely(btrfs_leaf_free_space(leaf) < 0)) { 4253 btrfs_print_leaf(leaf); 4254 BUG(); 4255 } 4256 } 4257 4258 /* 4259 * Insert a new item into a leaf. 4260 * 4261 * @trans: Transaction handle. 4262 * @root: The root of the btree. 4263 * @path: A path pointing to the target leaf and slot. 4264 * @key: The key of the new item. 4265 * @data_size: The size of the data associated with the new key. 4266 */ 4267 void btrfs_setup_item_for_insert(struct btrfs_trans_handle *trans, 4268 struct btrfs_root *root, 4269 struct btrfs_path *path, 4270 const struct btrfs_key *key, 4271 u32 data_size) 4272 { 4273 struct btrfs_item_batch batch; 4274 4275 batch.keys = key; 4276 batch.data_sizes = &data_size; 4277 batch.total_data_size = data_size; 4278 batch.nr = 1; 4279 4280 setup_items_for_insert(trans, root, path, &batch); 4281 } 4282 4283 /* 4284 * Given a key and some data, insert items into the tree. 4285 * This does all the path init required, making room in the tree if needed. 4286 * 4287 * Returns: 0 on success 4288 * -EEXIST if the first key already exists 4289 * < 0 on other errors 4290 */ 4291 int btrfs_insert_empty_items(struct btrfs_trans_handle *trans, 4292 struct btrfs_root *root, 4293 struct btrfs_path *path, 4294 const struct btrfs_item_batch *batch) 4295 { 4296 int ret = 0; 4297 int slot; 4298 u32 total_size; 4299 4300 total_size = batch->total_data_size + (batch->nr * sizeof(struct btrfs_item)); 4301 ret = btrfs_search_slot(trans, root, &batch->keys[0], path, total_size, 1); 4302 if (ret == 0) 4303 return -EEXIST; 4304 if (ret < 0) 4305 return ret; 4306 4307 slot = path->slots[0]; 4308 BUG_ON(slot < 0); 4309 4310 setup_items_for_insert(trans, root, path, batch); 4311 return 0; 4312 } 4313 4314 /* 4315 * Given a key and some data, insert an item into the tree. 4316 * This does all the path init required, making room in the tree if needed. 
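 *
 * A minimal caller-side sketch (illustrative only; 'objectid', 'item'
 * and 'item_len' are made-up names):
 *
 *	struct btrfs_key key = { .objectid = objectid,
 *				 .type = BTRFS_XATTR_ITEM_KEY,
 *				 .offset = 0 };
 *
 *	ret = btrfs_insert_item(trans, root, &key, item, item_len);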
4317 */ 4318 int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, 4319 const struct btrfs_key *cpu_key, void *data, 4320 u32 data_size) 4321 { 4322 int ret = 0; 4323 BTRFS_PATH_AUTO_FREE(path); 4324 struct extent_buffer *leaf; 4325 unsigned long ptr; 4326 4327 path = btrfs_alloc_path(); 4328 if (!path) 4329 return -ENOMEM; 4330 ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size); 4331 if (!ret) { 4332 leaf = path->nodes[0]; 4333 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); 4334 write_extent_buffer(leaf, data, ptr, data_size); 4335 btrfs_mark_buffer_dirty(trans, leaf); 4336 } 4337 return ret; 4338 } 4339 4340 /* 4341 * This function duplicates an item, giving 'new_key' to the new item. 4342 * It guarantees both items live in the same tree leaf and the new item is 4343 * contiguous with the original item. 4344 * 4345 * This allows us to split a file extent in place, keeping a lock on the leaf 4346 * the entire time. 4347 */ 4348 int btrfs_duplicate_item(struct btrfs_trans_handle *trans, 4349 struct btrfs_root *root, 4350 struct btrfs_path *path, 4351 const struct btrfs_key *new_key) 4352 { 4353 struct extent_buffer *leaf; 4354 int ret; 4355 u32 item_size; 4356 4357 leaf = path->nodes[0]; 4358 item_size = btrfs_item_size(leaf, path->slots[0]); 4359 ret = setup_leaf_for_split(trans, root, path, 4360 item_size + sizeof(struct btrfs_item)); 4361 if (ret) 4362 return ret; 4363 4364 path->slots[0]++; 4365 btrfs_setup_item_for_insert(trans, root, path, new_key, item_size); 4366 leaf = path->nodes[0]; 4367 memcpy_extent_buffer(leaf, 4368 btrfs_item_ptr_offset(leaf, path->slots[0]), 4369 btrfs_item_ptr_offset(leaf, path->slots[0] - 1), 4370 item_size); 4371 return 0; 4372 } 4373 4374 /* 4375 * delete the pointer from a given node. 4376 * 4377 * the tree should have been previously balanced so the deletion does not 4378 * empty a node. 4379 * 4380 * This is exported for use inside btrfs-progs, don't un-export it. 4381 */ 4382 int btrfs_del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, 4383 struct btrfs_path *path, int level, int slot) 4384 { 4385 struct extent_buffer *parent = path->nodes[level]; 4386 u32 nritems; 4387 int ret; 4388 4389 nritems = btrfs_header_nritems(parent); 4390 if (slot != nritems - 1) { 4391 if (level) { 4392 ret = btrfs_tree_mod_log_insert_move(parent, slot, 4393 slot + 1, nritems - slot - 1); 4394 if (unlikely(ret < 0)) { 4395 btrfs_abort_transaction(trans, ret); 4396 return ret; 4397 } 4398 } 4399 memmove_extent_buffer(parent, 4400 btrfs_node_key_ptr_offset(parent, slot), 4401 btrfs_node_key_ptr_offset(parent, slot + 1), 4402 sizeof(struct btrfs_key_ptr) * 4403 (nritems - slot - 1)); 4404 } else if (level) { 4405 ret = btrfs_tree_mod_log_insert_key(parent, slot, 4406 BTRFS_MOD_LOG_KEY_REMOVE); 4407 if (unlikely(ret < 0)) { 4408 btrfs_abort_transaction(trans, ret); 4409 return ret; 4410 } 4411 } 4412 4413 nritems--; 4414 btrfs_set_header_nritems(parent, nritems); 4415 if (nritems == 0 && parent == root->node) { 4416 BUG_ON(btrfs_header_level(root->node) != 1); 4417 /* just turn the root into a leaf and break */ 4418 btrfs_set_header_level(root->node, 0); 4419 } else if (slot == 0) { 4420 struct btrfs_disk_key disk_key; 4421 4422 btrfs_node_key(parent, &disk_key, 0); 4423 fixup_low_keys(trans, path, &disk_key, level + 1); 4424 } 4425 btrfs_mark_buffer_dirty(trans, parent); 4426 return 0; 4427 } 4428 4429 /* 4430 * a helper function to delete the leaf pointed to by path->slots[1] and 4431 * path->nodes[1]. 
4432 * 4433 * This deletes the pointer in path->nodes[1] and frees the leaf 4434 * block extent. zero is returned if it all worked out, < 0 otherwise. 4435 * 4436 * The path must have already been setup for deleting the leaf, including 4437 * all the proper balancing. path->nodes[1] must be locked. 4438 */ 4439 static noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans, 4440 struct btrfs_root *root, 4441 struct btrfs_path *path, 4442 struct extent_buffer *leaf) 4443 { 4444 int ret; 4445 4446 WARN_ON(btrfs_header_generation(leaf) != trans->transid); 4447 ret = btrfs_del_ptr(trans, root, path, 1, path->slots[1]); 4448 if (ret < 0) 4449 return ret; 4450 4451 /* 4452 * btrfs_free_extent is expensive, we want to make sure we 4453 * aren't holding any locks when we call it 4454 */ 4455 btrfs_unlock_up_safe(path, 0); 4456 4457 root_sub_used_bytes(root); 4458 4459 refcount_inc(&leaf->refs); 4460 ret = btrfs_free_tree_block(trans, btrfs_root_id(root), leaf, 0, 1); 4461 free_extent_buffer_stale(leaf); 4462 if (ret < 0) 4463 btrfs_abort_transaction(trans, ret); 4464 4465 return ret; 4466 } 4467 /* 4468 * delete the item at the leaf level in path. If that empties 4469 * the leaf, remove it from the tree 4470 */ 4471 int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, 4472 struct btrfs_path *path, int slot, int nr) 4473 { 4474 struct btrfs_fs_info *fs_info = root->fs_info; 4475 struct extent_buffer *leaf; 4476 int ret = 0; 4477 int wret; 4478 u32 nritems; 4479 4480 leaf = path->nodes[0]; 4481 nritems = btrfs_header_nritems(leaf); 4482 4483 if (slot + nr != nritems) { 4484 const u32 last_off = btrfs_item_offset(leaf, slot + nr - 1); 4485 const int data_end = leaf_data_end(leaf); 4486 u32 dsize = 0; 4487 int i; 4488 4489 for (i = 0; i < nr; i++) 4490 dsize += btrfs_item_size(leaf, slot + i); 4491 4492 memmove_leaf_data(leaf, data_end + dsize, data_end, 4493 last_off - data_end); 4494 4495 for (i = slot + nr; i < nritems; i++) { 4496 u32 ioff; 4497 4498 ioff = btrfs_item_offset(leaf, i); 4499 btrfs_set_item_offset(leaf, i, ioff + dsize); 4500 } 4501 4502 memmove_leaf_items(leaf, slot, slot + nr, nritems - slot - nr); 4503 } 4504 btrfs_set_header_nritems(leaf, nritems - nr); 4505 nritems -= nr; 4506 4507 /* delete the leaf if we've emptied it */ 4508 if (nritems == 0) { 4509 if (leaf != root->node) { 4510 btrfs_clear_buffer_dirty(trans, leaf); 4511 ret = btrfs_del_leaf(trans, root, path, leaf); 4512 if (ret < 0) 4513 return ret; 4514 } 4515 } else { 4516 int used = leaf_space_used(leaf, 0, nritems); 4517 if (slot == 0) { 4518 struct btrfs_disk_key disk_key; 4519 4520 btrfs_item_key(leaf, &disk_key, 0); 4521 fixup_low_keys(trans, path, &disk_key, 1); 4522 } 4523 4524 /* 4525 * Try to delete the leaf if it is mostly empty. We do this by 4526 * trying to move all its items into its left and right neighbours. 4527 * If we can't move all the items, then we don't delete it - it's 4528 * not ideal, but future insertions might fill the leaf with more 4529 * items, or items from other leaves might be moved later into our 4530 * leaf due to deletions on those leaves. 4531 */ 4532 if (used < BTRFS_LEAF_DATA_SIZE(fs_info) / 3) { 4533 u32 min_push_space; 4534 4535 /* push_leaf_left fixes the path. 4536 * make sure the path still points to our leaf 4537 * for possible call to btrfs_del_ptr below 4538 */ 4539 slot = path->slots[1]; 4540 refcount_inc(&leaf->refs); 4541 /* 4542 * We want to be able to at least push one item to the 4543 * left neighbour leaf, and that's the first item. 
4544 */ 4545 min_push_space = sizeof(struct btrfs_item) + 4546 btrfs_item_size(leaf, 0); 4547 wret = push_leaf_left(trans, root, path, 0, 4548 min_push_space, 1, (u32)-1); 4549 if (wret < 0 && wret != -ENOSPC) 4550 ret = wret; 4551 4552 if (path->nodes[0] == leaf && 4553 btrfs_header_nritems(leaf)) { 4554 /* 4555 * If we were not able to push all items from our 4556 * leaf to its left neighbour, then attempt to 4557 * either push all the remaining items to the 4558 * right neighbour or none. There's no advantage 4559 * in pushing only some items, instead of all, as 4560 * it's pointless to end up with a leaf having 4561 * too few items while the neighbours can be full 4562 * or nearly full. 4563 */ 4564 nritems = btrfs_header_nritems(leaf); 4565 min_push_space = leaf_space_used(leaf, 0, nritems); 4566 wret = push_leaf_right(trans, root, path, 0, 4567 min_push_space, 1, 0); 4568 if (wret < 0 && wret != -ENOSPC) 4569 ret = wret; 4570 } 4571 4572 if (btrfs_header_nritems(leaf) == 0) { 4573 path->slots[1] = slot; 4574 ret = btrfs_del_leaf(trans, root, path, leaf); 4575 free_extent_buffer(leaf); 4576 if (ret < 0) 4577 return ret; 4578 } else { 4579 /* if we're still in the path, make sure 4580 * we're dirty. Otherwise, one of the 4581 * push_leaf functions must have already 4582 * dirtied this buffer 4583 */ 4584 if (path->nodes[0] == leaf) 4585 btrfs_mark_buffer_dirty(trans, leaf); 4586 free_extent_buffer(leaf); 4587 } 4588 } else { 4589 btrfs_mark_buffer_dirty(trans, leaf); 4590 } 4591 } 4592 return ret; 4593 } 4594 4595 /* 4596 * A helper function to walk down the tree starting at min_key, and looking 4597 * for leaves that have a minimum transaction id. 4598 * This is used by the btree defrag code, and tree logging 4599 * 4600 * This does not cow, but it does stuff the starting key it finds back 4601 * into min_key, so you can call btrfs_search_slot with cow=1 on the 4602 * key and get a writable path. 4603 * 4604 * min_trans indicates the oldest transaction that you are interested 4605 * in walking through. Any nodes or leaves older than min_trans are 4606 * skipped over (without reading them). 4607 * 4608 * returns zero if something useful was found, < 0 on error and 1 if there 4609 * was nothing in the tree that matched the search criteria. 4610 */ 4611 int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key, 4612 struct btrfs_path *path, 4613 u64 min_trans) 4614 { 4615 struct extent_buffer *cur; 4616 int slot; 4617 int sret; 4618 u32 nritems; 4619 int level; 4620 int ret = 1; 4621 const bool keep_locks = path->keep_locks; 4622 4623 ASSERT(!path->nowait); 4624 ASSERT(path->lowest_level == 0); 4625 path->keep_locks = true; 4626 again: 4627 cur = btrfs_read_lock_root_node(root); 4628 level = btrfs_header_level(cur); 4629 WARN_ON(path->nodes[level]); 4630 path->nodes[level] = cur; 4631 path->locks[level] = BTRFS_READ_LOCK; 4632 4633 if (btrfs_header_generation(cur) < min_trans) { 4634 ret = 1; 4635 goto out; 4636 } 4637 while (1) { 4638 nritems = btrfs_header_nritems(cur); 4639 level = btrfs_header_level(cur); 4640 sret = btrfs_bin_search(cur, 0, min_key, &slot); 4641 if (sret < 0) { 4642 ret = sret; 4643 goto out; 4644 } 4645 4646 /* At level 0 we're done, setup the path and exit. */ 4647 if (level == 0) { 4648 if (slot >= nritems) 4649 goto find_next_key; 4650 ret = 0; 4651 path->slots[level] = slot; 4652 /* Save our key for returning back. 
*/ 4653 btrfs_item_key_to_cpu(cur, min_key, slot); 4654 goto out; 4655 } 4656 if (sret && slot > 0) 4657 slot--; 4658 /* 4659 * check this node pointer against the min_trans parameters. 4660 * If it is too old, skip to the next one. 4661 */ 4662 while (slot < nritems) { 4663 u64 gen; 4664 4665 gen = btrfs_node_ptr_generation(cur, slot); 4666 if (gen < min_trans) { 4667 slot++; 4668 continue; 4669 } 4670 break; 4671 } 4672 find_next_key: 4673 /* 4674 * we didn't find a candidate key in this node, walk forward 4675 * and find another one 4676 */ 4677 path->slots[level] = slot; 4678 if (slot >= nritems) { 4679 sret = btrfs_find_next_key(root, path, min_key, level, 4680 min_trans); 4681 if (sret == 0) { 4682 btrfs_release_path(path); 4683 goto again; 4684 } else { 4685 goto out; 4686 } 4687 } 4688 cur = btrfs_read_node_slot(cur, slot); 4689 if (IS_ERR(cur)) { 4690 ret = PTR_ERR(cur); 4691 goto out; 4692 } 4693 4694 btrfs_tree_read_lock(cur); 4695 4696 path->locks[level - 1] = BTRFS_READ_LOCK; 4697 path->nodes[level - 1] = cur; 4698 unlock_up(path, level, 1, 0, NULL); 4699 } 4700 out: 4701 path->keep_locks = keep_locks; 4702 if (ret == 0) 4703 btrfs_unlock_up_safe(path, 1); 4704 return ret; 4705 } 4706 4707 /* 4708 * this is similar to btrfs_next_leaf, but does not try to preserve 4709 * and fixup the path. It looks for and returns the next key in the 4710 * tree based on the current path and the min_trans parameters. 4711 * 4712 * 0 is returned if another key is found, < 0 if there are any errors 4713 * and 1 is returned if there are no higher keys in the tree 4714 * 4715 * path->keep_locks should be set to true on the search made before 4716 * calling this function. 4717 */ 4718 int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path, 4719 struct btrfs_key *key, int level, u64 min_trans) 4720 { 4721 int slot; 4722 struct extent_buffer *c; 4723 4724 WARN_ON(!path->keep_locks && !path->skip_locking); 4725 while (level < BTRFS_MAX_LEVEL) { 4726 if (!path->nodes[level]) 4727 return 1; 4728 4729 slot = path->slots[level] + 1; 4730 c = path->nodes[level]; 4731 next: 4732 if (slot >= btrfs_header_nritems(c)) { 4733 int ret; 4734 int orig_lowest; 4735 struct btrfs_key cur_key; 4736 if (level + 1 >= BTRFS_MAX_LEVEL || 4737 !path->nodes[level + 1]) 4738 return 1; 4739 4740 if (path->locks[level + 1] || path->skip_locking) { 4741 level++; 4742 continue; 4743 } 4744 4745 slot = btrfs_header_nritems(c) - 1; 4746 if (level == 0) 4747 btrfs_item_key_to_cpu(c, &cur_key, slot); 4748 else 4749 btrfs_node_key_to_cpu(c, &cur_key, slot); 4750 4751 orig_lowest = path->lowest_level; 4752 btrfs_release_path(path); 4753 path->lowest_level = level; 4754 ret = btrfs_search_slot(NULL, root, &cur_key, path, 4755 0, 0); 4756 path->lowest_level = orig_lowest; 4757 if (ret < 0) 4758 return ret; 4759 4760 c = path->nodes[level]; 4761 slot = path->slots[level]; 4762 if (ret == 0) 4763 slot++; 4764 goto next; 4765 } 4766 4767 if (level == 0) 4768 btrfs_item_key_to_cpu(c, key, slot); 4769 else { 4770 u64 gen = btrfs_node_ptr_generation(c, slot); 4771 4772 if (gen < min_trans) { 4773 slot++; 4774 goto next; 4775 } 4776 btrfs_node_key_to_cpu(c, key, slot); 4777 } 4778 return 0; 4779 } 4780 return 1; 4781 } 4782 4783 int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path, 4784 u64 time_seq) 4785 { 4786 int slot; 4787 int level; 4788 struct extent_buffer *c; 4789 struct extent_buffer *next; 4790 struct btrfs_fs_info *fs_info = root->fs_info; 4791 struct btrfs_key key; 4792 bool need_commit_sem = 
false;
4793	u32 nritems;
4794	int ret;
4795	int i;
4796
4797	/*
4798	 * The nowait semantics are used only for write paths, where we don't
4799	 * use the tree mod log and sequence numbers.
4800	 */
4801	if (time_seq)
4802		ASSERT(!path->nowait);
4803
4804	nritems = btrfs_header_nritems(path->nodes[0]);
4805	if (nritems == 0)
4806		return 1;
4807
4808	btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
4809 again:
4810	level = 1;
4811	next = NULL;
4812	btrfs_release_path(path);
4813
4814	path->keep_locks = true;
4815
4816	if (time_seq) {
4817		ret = btrfs_search_old_slot(root, &key, path, time_seq);
4818	} else {
4819		if (path->need_commit_sem) {
4820			path->need_commit_sem = false;
4821			need_commit_sem = true;
4822			if (path->nowait) {
4823				if (!down_read_trylock(&fs_info->commit_root_sem)) {
4824					ret = -EAGAIN;
4825					goto done;
4826				}
4827			} else {
4828				down_read(&fs_info->commit_root_sem);
4829			}
4830		}
4831		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4832	}
4833	path->keep_locks = false;
4834
4835	if (ret < 0)
4836		goto done;
4837
4838	nritems = btrfs_header_nritems(path->nodes[0]);
4839	/*
4840	 * By releasing the path above we dropped all our locks. A balance
4841	 * could have happened and
4842	 *
4843	 * 1. added more items after the previous last item
4844	 * 2. deleted the previous last item
4845	 *
4846	 * So, check again here and advance the path if there are now more
4847	 * items available.
4848	 */
4849	if (nritems > 0 && path->slots[0] <= nritems - 1) {
4850		if (ret == 0 && path->slots[0] != nritems - 1) {
4851			path->slots[0]++;
4852			goto done;
4853		} else if (ret > 0) {
4854			ret = 0;
4855			goto done;
4856		}
4857	}
4858
4859	while (level < BTRFS_MAX_LEVEL) {
4860		if (!path->nodes[level]) {
4861			ret = 1;
4862			goto done;
4863		}
4864
4865		slot = path->slots[level] + 1;
4866		c = path->nodes[level];
4867		if (slot >= btrfs_header_nritems(c)) {
4868			level++;
4869			if (level == BTRFS_MAX_LEVEL) {
4870				ret = 1;
4871				goto done;
4872			}
4873			continue;
4874		}
4875
4876
4877		/*
4878		 * Our current level is where we're going to start from, and to
4879		 * make sure lockdep doesn't complain we need to drop our locks
4880		 * and nodes from 0 to our current level.
4881		 */
4882		for (i = 0; i < level; i++) {
4883			if (path->locks[i]) {
4884				btrfs_tree_read_unlock(path->nodes[i]);
4885				path->locks[i] = 0;
4886			}
4887			free_extent_buffer(path->nodes[i]);
4888			path->nodes[i] = NULL;
4889		}
4890
4891		next = c;
4892		ret = read_block_for_search(root, path, &next, slot, &key);
4893		if (ret == -EAGAIN && !path->nowait)
4894			goto again;
4895
4896		if (ret < 0) {
4897			btrfs_release_path(path);
4898			goto done;
4899		}
4900
4901		if (!path->skip_locking) {
4902			ret = btrfs_try_tree_read_lock(next);
4903			if (!ret && path->nowait) {
4904				ret = -EAGAIN;
4905				goto done;
4906			}
4907			if (!ret && time_seq) {
4908				/*
4909				 * If we don't get the lock, we may be racing
4910				 * with push_leaf_left, which holds that lock
4911				 * while itself waiting for the leaf we've
4912				 * currently locked. To solve this, we give up
4913				 * on our lock and cycle.
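				 * The cond_resched() below also gives the lock
				 * holder a chance to make progress before we
				 * retry.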
4914				 */
4915				free_extent_buffer(next);
4916				btrfs_release_path(path);
4917				cond_resched();
4918				goto again;
4919			}
4920			if (!ret)
4921				btrfs_tree_read_lock(next);
4922		}
4923		break;
4924	}
4925	path->slots[level] = slot;
4926	while (1) {
4927		level--;
4928		path->nodes[level] = next;
4929		path->slots[level] = 0;
4930		if (!path->skip_locking)
4931			path->locks[level] = BTRFS_READ_LOCK;
4932		if (!level)
4933			break;
4934
4935		ret = read_block_for_search(root, path, &next, 0, &key);
4936		if (ret == -EAGAIN && !path->nowait)
4937			goto again;
4938
4939		if (ret < 0) {
4940			btrfs_release_path(path);
4941			goto done;
4942		}
4943
4944		if (!path->skip_locking) {
4945			if (path->nowait) {
4946				if (!btrfs_try_tree_read_lock(next)) {
4947					ret = -EAGAIN;
4948					goto done;
4949				}
4950			} else {
4951				btrfs_tree_read_lock(next);
4952			}
4953		}
4954	}
4955	ret = 0;
4956 done:
4957	unlock_up(path, 0, 1, 0, NULL);
4958	if (need_commit_sem) {
4959		int ret2;
4960
4961		path->need_commit_sem = true;
4962		ret2 = finish_need_commit_sem_search(path);
4963		up_read(&fs_info->commit_root_sem);
4964		if (ret2)
4965			ret = ret2;
4966	}
4967
4968	return ret;
4969 }
4970
4971 int btrfs_next_old_item(struct btrfs_root *root, struct btrfs_path *path, u64 time_seq)
4972 {
4973	path->slots[0]++;
4974	if (path->slots[0] >= btrfs_header_nritems(path->nodes[0]))
4975		return btrfs_next_old_leaf(root, path, time_seq);
4976	return 0;
4977 }
4978
4979 /*
4980  * this uses btrfs_prev_leaf to walk backwards in the tree, and keeps
4981  * searching until it gets past min_objectid or finds an item of 'type'.
4982  *
4983  * returns 0 if something is found, 1 if nothing was found and < 0 on error
4984  */
4985 int btrfs_previous_item(struct btrfs_root *root,
4986			struct btrfs_path *path, u64 min_objectid,
4987			int type)
4988 {
4989	struct btrfs_key found_key;
4990	struct extent_buffer *leaf;
4991	u32 nritems;
4992	int ret;
4993
4994	while (1) {
4995		if (path->slots[0] == 0) {
4996			ret = btrfs_prev_leaf(root, path);
4997			if (ret != 0)
4998				return ret;
4999		} else {
5000			path->slots[0]--;
5001		}
5002		leaf = path->nodes[0];
5003		nritems = btrfs_header_nritems(leaf);
5004		if (nritems == 0)
5005			return 1;
5006		if (path->slots[0] == nritems)
5007			path->slots[0]--;
5008
5009		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
5010		if (found_key.objectid < min_objectid)
5011			break;
5012		if (found_key.type == type)
5013			return 0;
5014		if (found_key.objectid == min_objectid &&
5015		    found_key.type < type)
5016			break;
5017	}
5018	return 1;
5019 }
5020
5021 /*
5022  * search in the extent tree to find a previous Metadata/Data extent item
5023  * with min objectid.
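 *
 * Metadata extents may be keyed as either BTRFS_EXTENT_ITEM_KEY or
 * BTRFS_METADATA_ITEM_KEY (skinny metadata), so both types are accepted
 * below.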
5024 * 5025 * returns 0 if something is found, 1 if nothing was found and < 0 on error 5026 */ 5027 int btrfs_previous_extent_item(struct btrfs_root *root, 5028 struct btrfs_path *path, u64 min_objectid) 5029 { 5030 struct btrfs_key found_key; 5031 struct extent_buffer *leaf; 5032 u32 nritems; 5033 int ret; 5034 5035 while (1) { 5036 if (path->slots[0] == 0) { 5037 ret = btrfs_prev_leaf(root, path); 5038 if (ret != 0) 5039 return ret; 5040 } else { 5041 path->slots[0]--; 5042 } 5043 leaf = path->nodes[0]; 5044 nritems = btrfs_header_nritems(leaf); 5045 if (nritems == 0) 5046 return 1; 5047 if (path->slots[0] == nritems) 5048 path->slots[0]--; 5049 5050 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 5051 if (found_key.objectid < min_objectid) 5052 break; 5053 if (found_key.type == BTRFS_EXTENT_ITEM_KEY || 5054 found_key.type == BTRFS_METADATA_ITEM_KEY) 5055 return 0; 5056 if (found_key.objectid == min_objectid && 5057 found_key.type < BTRFS_EXTENT_ITEM_KEY) 5058 break; 5059 } 5060 return 1; 5061 } 5062 5063 int __init btrfs_ctree_init(void) 5064 { 5065 btrfs_path_cachep = KMEM_CACHE(btrfs_path, 0); 5066 if (!btrfs_path_cachep) 5067 return -ENOMEM; 5068 return 0; 5069 } 5070 5071 void __cold btrfs_ctree_exit(void) 5072 { 5073 kmem_cache_destroy(btrfs_path_cachep); 5074 } 5075