1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright (C) 2007,2008 Oracle. All rights reserved. 4 */ 5 6 #include <linux/sched.h> 7 #include <linux/slab.h> 8 #include <linux/rbtree.h> 9 #include <linux/mm.h> 10 #include <linux/error-injection.h> 11 #include "messages.h" 12 #include "ctree.h" 13 #include "disk-io.h" 14 #include "transaction.h" 15 #include "print-tree.h" 16 #include "locking.h" 17 #include "volumes.h" 18 #include "qgroup.h" 19 #include "tree-mod-log.h" 20 #include "tree-checker.h" 21 #include "fs.h" 22 #include "accessors.h" 23 #include "extent-tree.h" 24 #include "extent_io.h" 25 #include "relocation.h" 26 #include "file-item.h" 27 28 static struct kmem_cache *btrfs_path_cachep; 29 30 static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root 31 *root, struct btrfs_path *path, int level); 32 static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root *root, 33 const struct btrfs_key *ins_key, struct btrfs_path *path, 34 int data_size, bool extend); 35 static int push_node_left(struct btrfs_trans_handle *trans, 36 struct extent_buffer *dst, 37 struct extent_buffer *src, bool empty); 38 static int balance_node_right(struct btrfs_trans_handle *trans, 39 struct extent_buffer *dst_buf, 40 struct extent_buffer *src_buf); 41 /* 42 * The leaf data grows from end-to-front in the node. this returns the address 43 * of the start of the last item, which is the stop of the leaf data stack. 44 */ 45 static unsigned int leaf_data_end(const struct extent_buffer *leaf) 46 { 47 u32 nr = btrfs_header_nritems(leaf); 48 49 if (nr == 0) 50 return BTRFS_LEAF_DATA_SIZE(leaf->fs_info); 51 return btrfs_item_offset(leaf, nr - 1); 52 } 53 54 /* 55 * Move data in a @leaf (using memmove, safe for overlapping ranges). 56 * 57 * @leaf: leaf that we're doing a memmove on 58 * @dst_offset: item data offset we're moving to 59 * @src_offset: item data offset were' moving from 60 * @len: length of the data we're moving 61 * 62 * Wrapper around memmove_extent_buffer() that takes into account the header on 63 * the leaf. The btrfs_item offset's start directly after the header, so we 64 * have to adjust any offsets to account for the header in the leaf. This 65 * handles that math to simplify the callers. 66 */ 67 static inline void memmove_leaf_data(const struct extent_buffer *leaf, 68 unsigned long dst_offset, 69 unsigned long src_offset, 70 unsigned long len) 71 { 72 memmove_extent_buffer(leaf, btrfs_item_nr_offset(leaf, 0) + dst_offset, 73 btrfs_item_nr_offset(leaf, 0) + src_offset, len); 74 } 75 76 /* 77 * Copy item data from @src into @dst at the given @offset. 78 * 79 * @dst: destination leaf that we're copying into 80 * @src: source leaf that we're copying from 81 * @dst_offset: item data offset we're copying to 82 * @src_offset: item data offset were' copying from 83 * @len: length of the data we're copying 84 * 85 * Wrapper around copy_extent_buffer() that takes into account the header on 86 * the leaf. The btrfs_item offset's start directly after the header, so we 87 * have to adjust any offsets to account for the header in the leaf. This 88 * handles that math to simplify the callers. 89 */ 90 static inline void copy_leaf_data(const struct extent_buffer *dst, 91 const struct extent_buffer *src, 92 unsigned long dst_offset, 93 unsigned long src_offset, unsigned long len) 94 { 95 copy_extent_buffer(dst, src, btrfs_item_nr_offset(dst, 0) + dst_offset, 96 btrfs_item_nr_offset(src, 0) + src_offset, len); 97 } 98 99 /* 100 * Move items in a @leaf (using memmove). 101 * 102 * @dst: destination leaf for the items 103 * @dst_item: the item nr we're copying into 104 * @src_item: the item nr we're copying from 105 * @nr_items: the number of items to copy 106 * 107 * Wrapper around memmove_extent_buffer() that does the math to get the 108 * appropriate offsets into the leaf from the item numbers. 109 */ 110 static inline void memmove_leaf_items(const struct extent_buffer *leaf, 111 int dst_item, int src_item, int nr_items) 112 { 113 memmove_extent_buffer(leaf, btrfs_item_nr_offset(leaf, dst_item), 114 btrfs_item_nr_offset(leaf, src_item), 115 nr_items * sizeof(struct btrfs_item)); 116 } 117 118 /* 119 * Copy items from @src into @dst at the given @offset. 120 * 121 * @dst: destination leaf for the items 122 * @src: source leaf for the items 123 * @dst_item: the item nr we're copying into 124 * @src_item: the item nr we're copying from 125 * @nr_items: the number of items to copy 126 * 127 * Wrapper around copy_extent_buffer() that does the math to get the 128 * appropriate offsets into the leaf from the item numbers. 129 */ 130 static inline void copy_leaf_items(const struct extent_buffer *dst, 131 const struct extent_buffer *src, 132 int dst_item, int src_item, int nr_items) 133 { 134 copy_extent_buffer(dst, src, btrfs_item_nr_offset(dst, dst_item), 135 btrfs_item_nr_offset(src, src_item), 136 nr_items * sizeof(struct btrfs_item)); 137 } 138 139 struct btrfs_path *btrfs_alloc_path(void) 140 { 141 might_sleep(); 142 143 return kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS); 144 } 145 146 /* this also releases the path */ 147 void btrfs_free_path(struct btrfs_path *p) 148 { 149 if (!p) 150 return; 151 btrfs_release_path(p); 152 kmem_cache_free(btrfs_path_cachep, p); 153 } 154 155 /* 156 * path release drops references on the extent buffers in the path 157 * and it drops any locks held by this path 158 * 159 * It is safe to call this on paths that no locks or extent buffers held. 160 */ 161 noinline void btrfs_release_path(struct btrfs_path *p) 162 { 163 int i; 164 165 for (i = 0; i < BTRFS_MAX_LEVEL; i++) { 166 p->slots[i] = 0; 167 if (!p->nodes[i]) 168 continue; 169 if (p->locks[i]) { 170 btrfs_tree_unlock_rw(p->nodes[i], p->locks[i]); 171 p->locks[i] = 0; 172 } 173 free_extent_buffer(p->nodes[i]); 174 p->nodes[i] = NULL; 175 } 176 } 177 178 /* 179 * safely gets a reference on the root node of a tree. A lock 180 * is not taken, so a concurrent writer may put a different node 181 * at the root of the tree. See btrfs_lock_root_node for the 182 * looping required. 183 * 184 * The extent buffer returned by this has a reference taken, so 185 * it won't disappear. It may stop being the root of the tree 186 * at any time because there are no locks held. 187 */ 188 struct extent_buffer *btrfs_root_node(struct btrfs_root *root) 189 { 190 struct extent_buffer *eb; 191 192 while (1) { 193 rcu_read_lock(); 194 eb = rcu_dereference(root->node); 195 196 /* 197 * RCU really hurts here, we could free up the root node because 198 * it was COWed but we may not get the new root node yet so do 199 * the inc_not_zero dance and if it doesn't work then 200 * synchronize_rcu and try again. 201 */ 202 if (refcount_inc_not_zero(&eb->refs)) { 203 rcu_read_unlock(); 204 break; 205 } 206 rcu_read_unlock(); 207 synchronize_rcu(); 208 } 209 return eb; 210 } 211 212 /* 213 * Cowonly root (not-shareable trees, everything not subvolume or reloc roots), 214 * just get put onto a simple dirty list. Transaction walks this list to make 215 * sure they get properly updated on disk. 216 */ 217 static void add_root_to_dirty_list(struct btrfs_root *root) 218 { 219 struct btrfs_fs_info *fs_info = root->fs_info; 220 221 if (test_bit(BTRFS_ROOT_DIRTY, &root->state) || 222 !test_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state)) 223 return; 224 225 spin_lock(&fs_info->trans_lock); 226 if (!test_and_set_bit(BTRFS_ROOT_DIRTY, &root->state)) { 227 /* Want the extent tree to be the last on the list */ 228 if (btrfs_root_id(root) == BTRFS_EXTENT_TREE_OBJECTID) 229 list_move_tail(&root->dirty_list, 230 &fs_info->dirty_cowonly_roots); 231 else 232 list_move(&root->dirty_list, 233 &fs_info->dirty_cowonly_roots); 234 } 235 spin_unlock(&fs_info->trans_lock); 236 } 237 238 /* 239 * used by snapshot creation to make a copy of a root for a tree with 240 * a given objectid. The buffer with the new root node is returned in 241 * cow_ret, and this func returns zero on success or a negative error code. 242 */ 243 int btrfs_copy_root(struct btrfs_trans_handle *trans, 244 struct btrfs_root *root, 245 struct extent_buffer *buf, 246 struct extent_buffer **cow_ret, u64 new_root_objectid) 247 { 248 struct btrfs_fs_info *fs_info = root->fs_info; 249 struct extent_buffer *cow; 250 int ret = 0; 251 int level; 252 struct btrfs_disk_key disk_key; 253 const bool is_reloc_root = (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID); 254 u64 reloc_src_root = 0; 255 256 WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) && 257 trans->transid != fs_info->running_transaction->transid); 258 WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) && 259 trans->transid != btrfs_get_root_last_trans(root)); 260 261 level = btrfs_header_level(buf); 262 if (level == 0) 263 btrfs_item_key(buf, &disk_key, 0); 264 else 265 btrfs_node_key(buf, &disk_key, 0); 266 267 if (is_reloc_root) 268 reloc_src_root = btrfs_header_owner(buf); 269 cow = btrfs_alloc_tree_block(trans, root, 0, new_root_objectid, 270 &disk_key, level, buf->start, 0, 271 reloc_src_root, BTRFS_NESTING_NEW_ROOT); 272 if (IS_ERR(cow)) 273 return PTR_ERR(cow); 274 275 copy_extent_buffer_full(cow, buf); 276 btrfs_set_header_bytenr(cow, cow->start); 277 btrfs_set_header_generation(cow, trans->transid); 278 btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV); 279 btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN | 280 BTRFS_HEADER_FLAG_RELOC); 281 if (is_reloc_root) 282 btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC); 283 else 284 btrfs_set_header_owner(cow, new_root_objectid); 285 286 write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid); 287 288 if (unlikely(btrfs_header_generation(buf) > trans->transid)) { 289 btrfs_tree_unlock(cow); 290 free_extent_buffer(cow); 291 ret = -EUCLEAN; 292 btrfs_abort_transaction(trans, ret); 293 return ret; 294 } 295 296 ret = btrfs_inc_ref(trans, root, cow, is_reloc_root); 297 if (unlikely(ret)) { 298 btrfs_abort_transaction(trans, ret); 299 btrfs_tree_unlock(cow); 300 free_extent_buffer(cow); 301 return ret; 302 } 303 304 btrfs_mark_buffer_dirty(trans, cow); 305 *cow_ret = cow; 306 return 0; 307 } 308 309 /* 310 * check if the tree block can be shared by multiple trees 311 */ 312 bool btrfs_block_can_be_shared(const struct btrfs_trans_handle *trans, 313 const struct btrfs_root *root, 314 const struct extent_buffer *buf) 315 { 316 const u64 buf_gen = btrfs_header_generation(buf); 317 318 /* 319 * Tree blocks not in shareable trees and tree roots are never shared. 320 * If a block was allocated after the last snapshot and the block was 321 * not allocated by tree relocation, we know the block is not shared. 322 */ 323 324 if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) 325 return false; 326 327 if (buf == root->node) 328 return false; 329 330 if (buf_gen > btrfs_root_last_snapshot(&root->root_item) && 331 !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) 332 return false; 333 334 if (buf != root->commit_root) 335 return true; 336 337 /* 338 * An extent buffer that used to be the commit root may still be shared 339 * because the tree height may have increased and it became a child of a 340 * higher level root. This can happen when snapshotting a subvolume 341 * created in the current transaction. 342 */ 343 if (buf_gen == trans->transid) 344 return true; 345 346 return false; 347 } 348 349 static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, 350 struct btrfs_root *root, 351 struct extent_buffer *buf, 352 struct extent_buffer *cow, 353 int *last_ref) 354 { 355 struct btrfs_fs_info *fs_info = root->fs_info; 356 u64 refs; 357 u64 owner; 358 u64 flags; 359 int ret; 360 const bool is_reloc_root = (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID); 361 362 /* 363 * Backrefs update rules: 364 * 365 * Always use full backrefs for extent pointers in tree block 366 * allocated by tree relocation. 367 * 368 * If a shared tree block is no longer referenced by its owner 369 * tree (btrfs_header_owner(buf) == root->root_key.objectid), 370 * use full backrefs for extent pointers in tree block. 371 * 372 * If a tree block is been relocating 373 * (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID), 374 * use full backrefs for extent pointers in tree block. 375 * The reason for this is some operations (such as drop tree) 376 * are only allowed for blocks use full backrefs. 377 */ 378 379 if (btrfs_block_can_be_shared(trans, root, buf)) { 380 ret = btrfs_lookup_extent_info(trans, fs_info, buf->start, 381 btrfs_header_level(buf), 1, 382 &refs, &flags, NULL); 383 if (ret) 384 return ret; 385 if (unlikely(refs == 0)) { 386 btrfs_crit(fs_info, 387 "found 0 references for tree block at bytenr %llu level %d root %llu", 388 buf->start, btrfs_header_level(buf), 389 btrfs_root_id(root)); 390 ret = -EUCLEAN; 391 btrfs_abort_transaction(trans, ret); 392 return ret; 393 } 394 } else { 395 refs = 1; 396 if (is_reloc_root || btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV) 397 flags = BTRFS_BLOCK_FLAG_FULL_BACKREF; 398 else 399 flags = 0; 400 } 401 402 owner = btrfs_header_owner(buf); 403 if (unlikely(owner == BTRFS_TREE_RELOC_OBJECTID && 404 !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))) { 405 btrfs_crit(fs_info, 406 "found tree block at bytenr %llu level %d root %llu refs %llu flags %llx without full backref flag set", 407 buf->start, btrfs_header_level(buf), 408 btrfs_root_id(root), refs, flags); 409 ret = -EUCLEAN; 410 btrfs_abort_transaction(trans, ret); 411 return ret; 412 } 413 414 if (refs > 1) { 415 if ((owner == btrfs_root_id(root) || is_reloc_root) && 416 !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) { 417 ret = btrfs_inc_ref(trans, root, buf, true); 418 if (ret) 419 return ret; 420 421 if (is_reloc_root) { 422 ret = btrfs_dec_ref(trans, root, buf, false); 423 if (ret) 424 return ret; 425 ret = btrfs_inc_ref(trans, root, cow, true); 426 if (ret) 427 return ret; 428 } 429 ret = btrfs_set_disk_extent_flags(trans, buf, 430 BTRFS_BLOCK_FLAG_FULL_BACKREF); 431 if (ret) 432 return ret; 433 } else { 434 ret = btrfs_inc_ref(trans, root, cow, is_reloc_root); 435 if (ret) 436 return ret; 437 } 438 } else { 439 if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) { 440 ret = btrfs_inc_ref(trans, root, cow, is_reloc_root); 441 if (ret) 442 return ret; 443 ret = btrfs_dec_ref(trans, root, buf, true); 444 if (ret) 445 return ret; 446 } 447 btrfs_clear_buffer_dirty(trans, buf); 448 *last_ref = 1; 449 } 450 return 0; 451 } 452 453 /* 454 * does the dirty work in cow of a single block. The parent block (if 455 * supplied) is updated to point to the new cow copy. The new buffer is marked 456 * dirty and returned locked. If you modify the block it needs to be marked 457 * dirty again. 458 * 459 * search_start -- an allocation hint for the new block 460 * 461 * empty_size -- a hint that you plan on doing more cow. This is the size in 462 * bytes the allocator should try to find free next to the block it returns. 463 * This is just a hint and may be ignored by the allocator. 464 */ 465 int btrfs_force_cow_block(struct btrfs_trans_handle *trans, 466 struct btrfs_root *root, 467 struct extent_buffer *buf, 468 struct extent_buffer *parent, int parent_slot, 469 struct extent_buffer **cow_ret, 470 u64 search_start, u64 empty_size, 471 enum btrfs_lock_nesting nest) 472 { 473 struct btrfs_fs_info *fs_info = root->fs_info; 474 struct btrfs_disk_key disk_key; 475 struct extent_buffer *cow; 476 int level, ret; 477 int last_ref = 0; 478 const bool unlock_orig = (*cow_ret == buf); 479 u64 parent_start = 0; 480 u64 reloc_src_root = 0; 481 482 btrfs_assert_tree_write_locked(buf); 483 484 WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) && 485 trans->transid != fs_info->running_transaction->transid); 486 WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) && 487 trans->transid != btrfs_get_root_last_trans(root)); 488 489 level = btrfs_header_level(buf); 490 491 if (level == 0) 492 btrfs_item_key(buf, &disk_key, 0); 493 else 494 btrfs_node_key(buf, &disk_key, 0); 495 496 if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID) { 497 if (parent) 498 parent_start = parent->start; 499 reloc_src_root = btrfs_header_owner(buf); 500 } 501 cow = btrfs_alloc_tree_block(trans, root, parent_start, 502 btrfs_root_id(root), &disk_key, level, 503 search_start, empty_size, reloc_src_root, nest); 504 if (IS_ERR(cow)) 505 return PTR_ERR(cow); 506 507 /* cow is set to blocking by btrfs_init_new_buffer */ 508 509 copy_extent_buffer_full(cow, buf); 510 btrfs_set_header_bytenr(cow, cow->start); 511 btrfs_set_header_generation(cow, trans->transid); 512 btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV); 513 btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN | 514 BTRFS_HEADER_FLAG_RELOC); 515 if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID) 516 btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC); 517 else 518 btrfs_set_header_owner(cow, btrfs_root_id(root)); 519 520 write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid); 521 522 ret = update_ref_for_cow(trans, root, buf, cow, &last_ref); 523 if (unlikely(ret)) { 524 btrfs_abort_transaction(trans, ret); 525 goto error_unlock_cow; 526 } 527 528 if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) { 529 ret = btrfs_reloc_cow_block(trans, root, buf, cow); 530 if (unlikely(ret)) { 531 btrfs_abort_transaction(trans, ret); 532 goto error_unlock_cow; 533 } 534 } 535 536 if (buf == root->node) { 537 WARN_ON(parent && parent != buf); 538 if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID || 539 btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV) 540 parent_start = buf->start; 541 542 ret = btrfs_tree_mod_log_insert_root(root->node, cow, true); 543 if (unlikely(ret < 0)) { 544 btrfs_abort_transaction(trans, ret); 545 goto error_unlock_cow; 546 } 547 refcount_inc(&cow->refs); 548 rcu_assign_pointer(root->node, cow); 549 550 ret = btrfs_free_tree_block(trans, btrfs_root_id(root), buf, 551 parent_start, last_ref); 552 free_extent_buffer(buf); 553 add_root_to_dirty_list(root); 554 if (unlikely(ret < 0)) { 555 btrfs_abort_transaction(trans, ret); 556 goto error_unlock_cow; 557 } 558 } else { 559 WARN_ON(trans->transid != btrfs_header_generation(parent)); 560 ret = btrfs_tree_mod_log_insert_key(parent, parent_slot, 561 BTRFS_MOD_LOG_KEY_REPLACE); 562 if (unlikely(ret)) { 563 btrfs_abort_transaction(trans, ret); 564 goto error_unlock_cow; 565 } 566 btrfs_set_node_blockptr(parent, parent_slot, 567 cow->start); 568 btrfs_set_node_ptr_generation(parent, parent_slot, 569 trans->transid); 570 btrfs_mark_buffer_dirty(trans, parent); 571 if (last_ref) { 572 ret = btrfs_tree_mod_log_free_eb(buf); 573 if (unlikely(ret)) { 574 btrfs_abort_transaction(trans, ret); 575 goto error_unlock_cow; 576 } 577 } 578 ret = btrfs_free_tree_block(trans, btrfs_root_id(root), buf, 579 parent_start, last_ref); 580 if (unlikely(ret < 0)) { 581 btrfs_abort_transaction(trans, ret); 582 goto error_unlock_cow; 583 } 584 } 585 586 trace_btrfs_cow_block(root, buf, cow); 587 if (unlock_orig) 588 btrfs_tree_unlock(buf); 589 free_extent_buffer_stale(buf); 590 btrfs_mark_buffer_dirty(trans, cow); 591 592 btrfs_inhibit_eb_writeback(trans, cow); 593 594 *cow_ret = cow; 595 return 0; 596 597 error_unlock_cow: 598 btrfs_tree_unlock(cow); 599 free_extent_buffer(cow); 600 return ret; 601 } 602 603 static inline bool should_cow_block(struct btrfs_trans_handle *trans, 604 const struct btrfs_root *root, 605 struct extent_buffer *buf) 606 { 607 if (btrfs_is_testing(root->fs_info)) 608 return false; 609 610 /* 611 * We do not need to cow a block if 612 * 1) this block is not created or changed in this transaction; 613 * 2) this block does not belong to TREE_RELOC tree; 614 * 3) the root is not forced COW. 615 * 616 * What is forced COW: 617 * when we create snapshot during committing the transaction, 618 * after we've finished copying src root, we must COW the shared 619 * block to ensure the metadata consistency. 620 */ 621 622 if (btrfs_header_generation(buf) != trans->transid) 623 return true; 624 625 if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) 626 return true; 627 628 /* Ensure we can see the FORCE_COW bit. */ 629 smp_mb__before_atomic(); 630 if (test_bit(BTRFS_ROOT_FORCE_COW, &root->state)) 631 return true; 632 633 if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID) 634 return false; 635 636 if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) 637 return true; 638 639 btrfs_inhibit_eb_writeback(trans, buf); 640 return false; 641 } 642 643 /* 644 * COWs a single block, see btrfs_force_cow_block() for the real work. 645 * This version of it has extra checks so that a block isn't COWed more than 646 * once per transaction, as long as it hasn't been written yet 647 */ 648 int btrfs_cow_block(struct btrfs_trans_handle *trans, 649 struct btrfs_root *root, struct extent_buffer *buf, 650 struct extent_buffer *parent, int parent_slot, 651 struct extent_buffer **cow_ret, 652 enum btrfs_lock_nesting nest) 653 { 654 struct btrfs_fs_info *fs_info = root->fs_info; 655 u64 search_start; 656 657 if (unlikely(test_bit(BTRFS_ROOT_DELETING, &root->state))) { 658 btrfs_abort_transaction(trans, -EUCLEAN); 659 btrfs_crit(fs_info, 660 "attempt to COW block %llu on root %llu that is being deleted", 661 buf->start, btrfs_root_id(root)); 662 return -EUCLEAN; 663 } 664 665 /* 666 * COWing must happen through a running transaction, which always 667 * matches the current fs generation (it's a transaction with a state 668 * less than TRANS_STATE_UNBLOCKED). If it doesn't, then turn the fs 669 * into error state to prevent the commit of any transaction. 670 */ 671 if (unlikely(trans->transaction != fs_info->running_transaction || 672 trans->transid != fs_info->generation)) { 673 btrfs_abort_transaction(trans, -EUCLEAN); 674 btrfs_crit(fs_info, 675 "unexpected transaction when attempting to COW block %llu on root %llu, transaction %llu running transaction %llu fs generation %llu", 676 buf->start, btrfs_root_id(root), trans->transid, 677 fs_info->running_transaction->transid, 678 fs_info->generation); 679 return -EUCLEAN; 680 } 681 682 if (!should_cow_block(trans, root, buf)) { 683 *cow_ret = buf; 684 return 0; 685 } 686 687 search_start = round_down(buf->start, SZ_1G); 688 689 /* 690 * Before CoWing this block for later modification, check if it's 691 * the subtree root and do the delayed subtree trace if needed. 692 * 693 * Also We don't care about the error, as it's handled internally. 694 */ 695 btrfs_qgroup_trace_subtree_after_cow(trans, root, buf); 696 return btrfs_force_cow_block(trans, root, buf, parent, parent_slot, 697 cow_ret, search_start, 0, nest); 698 } 699 ALLOW_ERROR_INJECTION(btrfs_cow_block, ERRNO); 700 701 /* 702 * same as comp_keys only with two btrfs_key's 703 */ 704 int __pure btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2) 705 { 706 if (k1->objectid > k2->objectid) 707 return 1; 708 if (k1->objectid < k2->objectid) 709 return -1; 710 if (k1->type > k2->type) 711 return 1; 712 if (k1->type < k2->type) 713 return -1; 714 if (k1->offset > k2->offset) 715 return 1; 716 if (k1->offset < k2->offset) 717 return -1; 718 return 0; 719 } 720 721 /* 722 * Search for a key in the given extent_buffer. 723 * 724 * The lower boundary for the search is specified by the slot number @first_slot. 725 * Use a value of 0 to search over the whole extent buffer. Works for both 726 * leaves and nodes. 727 * 728 * The slot in the extent buffer is returned via @slot. If the key exists in the 729 * extent buffer, then @slot will point to the slot where the key is, otherwise 730 * it points to the slot where you would insert the key. 731 * 732 * Slot may point to the total number of items (i.e. one position beyond the last 733 * key) if the key is bigger than the last key in the extent buffer. 734 */ 735 int btrfs_bin_search(const struct extent_buffer *eb, int first_slot, 736 const struct btrfs_key *key, int *slot) 737 { 738 unsigned long p; 739 int item_size; 740 /* 741 * Use unsigned types for the low and high slots, so that we get a more 742 * efficient division in the search loop below. 743 */ 744 u32 low = first_slot; 745 u32 high = btrfs_header_nritems(eb); 746 int ret; 747 const int key_size = sizeof(struct btrfs_disk_key); 748 749 if (unlikely(low > high)) { 750 btrfs_err(eb->fs_info, 751 "%s: low (%u) > high (%u) eb %llu owner %llu level %d", 752 __func__, low, high, eb->start, 753 btrfs_header_owner(eb), btrfs_header_level(eb)); 754 return -EINVAL; 755 } 756 757 if (btrfs_header_level(eb) == 0) { 758 p = offsetof(struct btrfs_leaf, items); 759 item_size = sizeof(struct btrfs_item); 760 } else { 761 p = offsetof(struct btrfs_node, ptrs); 762 item_size = sizeof(struct btrfs_key_ptr); 763 } 764 765 while (low < high) { 766 const int unit_size = eb->folio_size; 767 unsigned long oif; 768 unsigned long offset; 769 struct btrfs_disk_key *tmp; 770 struct btrfs_disk_key unaligned; 771 u32 mid; 772 773 mid = (low + high) / 2; 774 offset = p + mid * item_size; 775 oif = get_eb_offset_in_folio(eb, offset); 776 777 if (oif + key_size <= unit_size) { 778 const unsigned long idx = get_eb_folio_index(eb, offset); 779 char *kaddr = folio_address(eb->folios[idx]); 780 781 tmp = (struct btrfs_disk_key *)(kaddr + oif); 782 } else { 783 read_extent_buffer(eb, &unaligned, offset, key_size); 784 tmp = &unaligned; 785 } 786 787 ret = btrfs_comp_keys(tmp, key); 788 789 if (ret < 0) 790 low = mid + 1; 791 else if (ret > 0) 792 high = mid; 793 else { 794 *slot = mid; 795 return 0; 796 } 797 } 798 *slot = low; 799 return 1; 800 } 801 802 static void root_add_used_bytes(struct btrfs_root *root) 803 { 804 spin_lock(&root->accounting_lock); 805 btrfs_set_root_used(&root->root_item, 806 btrfs_root_used(&root->root_item) + root->fs_info->nodesize); 807 spin_unlock(&root->accounting_lock); 808 } 809 810 static void root_sub_used_bytes(struct btrfs_root *root) 811 { 812 spin_lock(&root->accounting_lock); 813 btrfs_set_root_used(&root->root_item, 814 btrfs_root_used(&root->root_item) - root->fs_info->nodesize); 815 spin_unlock(&root->accounting_lock); 816 } 817 818 /* given a node and slot number, this reads the blocks it points to. The 819 * extent buffer is returned with a reference taken (but unlocked). 820 */ 821 struct extent_buffer *btrfs_read_node_slot(struct extent_buffer *parent, 822 int slot) 823 { 824 int level = btrfs_header_level(parent); 825 struct btrfs_tree_parent_check check = { 0 }; 826 827 if (slot < 0 || slot >= btrfs_header_nritems(parent)) 828 return ERR_PTR(-ENOENT); 829 830 ASSERT(level); 831 832 check.level = level - 1; 833 check.transid = btrfs_node_ptr_generation(parent, slot); 834 check.owner_root = btrfs_header_owner(parent); 835 check.has_first_key = true; 836 btrfs_node_key_to_cpu(parent, &check.first_key, slot); 837 838 return read_tree_block(parent->fs_info, btrfs_node_blockptr(parent, slot), 839 &check); 840 } 841 842 /* 843 * Promote a child node to become the new tree root. 844 * 845 * @trans: Transaction handle 846 * @root: Tree root structure to update 847 * @path: Path holding nodes and locks 848 * @level: Level of the parent (old root) 849 * @parent: The parent (old root) with exactly one item 850 * 851 * This helper is called during rebalancing when the root node contains only 852 * a single item (nritems == 1). We can reduce the tree height by promoting 853 * that child to become the new root and freeing the old root node. The path 854 * locks and references are updated accordingly. 855 * 856 * Return: 0 on success, negative errno on failure. The transaction is aborted 857 * on critical errors. 858 */ 859 static int promote_child_to_root(struct btrfs_trans_handle *trans, 860 struct btrfs_root *root, struct btrfs_path *path, 861 int level, struct extent_buffer *parent) 862 { 863 struct extent_buffer *child; 864 int ret; 865 866 ASSERT(btrfs_header_nritems(parent) == 1); 867 868 child = btrfs_read_node_slot(parent, 0); 869 if (IS_ERR(child)) 870 return PTR_ERR(child); 871 872 btrfs_tree_lock(child); 873 ret = btrfs_cow_block(trans, root, child, parent, 0, &child, BTRFS_NESTING_COW); 874 if (ret) { 875 btrfs_tree_unlock(child); 876 free_extent_buffer(child); 877 return ret; 878 } 879 880 ret = btrfs_tree_mod_log_insert_root(root->node, child, true); 881 if (unlikely(ret < 0)) { 882 btrfs_tree_unlock(child); 883 free_extent_buffer(child); 884 btrfs_abort_transaction(trans, ret); 885 return ret; 886 } 887 rcu_assign_pointer(root->node, child); 888 889 add_root_to_dirty_list(root); 890 btrfs_tree_unlock(child); 891 892 path->locks[level] = 0; 893 path->nodes[level] = NULL; 894 btrfs_clear_buffer_dirty(trans, parent); 895 btrfs_tree_unlock(parent); 896 /* Once for the path. */ 897 free_extent_buffer(parent); 898 899 root_sub_used_bytes(root); 900 ret = btrfs_free_tree_block(trans, btrfs_root_id(root), parent, 0, 1); 901 /* Once for the root ptr. */ 902 free_extent_buffer_stale(parent); 903 if (unlikely(ret < 0)) { 904 btrfs_abort_transaction(trans, ret); 905 return ret; 906 } 907 908 return 0; 909 } 910 911 /* 912 * node level balancing, used to make sure nodes are in proper order for 913 * item deletion. We balance from the top down, so we have to make sure 914 * that a deletion won't leave an node completely empty later on. 915 */ 916 static noinline int balance_level(struct btrfs_trans_handle *trans, 917 struct btrfs_root *root, 918 struct btrfs_path *path, int level) 919 { 920 struct btrfs_fs_info *fs_info = root->fs_info; 921 struct extent_buffer *right = NULL; 922 struct extent_buffer *mid; 923 struct extent_buffer *left = NULL; 924 struct extent_buffer *parent = NULL; 925 int ret = 0; 926 int wret; 927 int pslot; 928 int orig_slot = path->slots[level]; 929 u64 orig_ptr; 930 931 ASSERT(level > 0); 932 933 mid = path->nodes[level]; 934 935 WARN_ON(path->locks[level] != BTRFS_WRITE_LOCK); 936 WARN_ON(btrfs_header_generation(mid) != trans->transid); 937 938 orig_ptr = btrfs_node_blockptr(mid, orig_slot); 939 940 if (level < BTRFS_MAX_LEVEL - 1) { 941 parent = path->nodes[level + 1]; 942 pslot = path->slots[level + 1]; 943 } 944 945 /* 946 * deal with the case where there is only one pointer in the root 947 * by promoting the node below to a root 948 */ 949 if (!parent) { 950 if (btrfs_header_nritems(mid) != 1) 951 return 0; 952 953 return promote_child_to_root(trans, root, path, level, mid); 954 } 955 if (btrfs_header_nritems(mid) > 956 BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 4) 957 return 0; 958 959 if (pslot) { 960 left = btrfs_read_node_slot(parent, pslot - 1); 961 if (IS_ERR(left)) { 962 ret = PTR_ERR(left); 963 left = NULL; 964 goto out; 965 } 966 967 btrfs_tree_lock_nested(left, BTRFS_NESTING_LEFT); 968 wret = btrfs_cow_block(trans, root, left, 969 parent, pslot - 1, &left, 970 BTRFS_NESTING_LEFT_COW); 971 if (wret) { 972 ret = wret; 973 goto out; 974 } 975 } 976 977 if (pslot + 1 < btrfs_header_nritems(parent)) { 978 right = btrfs_read_node_slot(parent, pslot + 1); 979 if (IS_ERR(right)) { 980 ret = PTR_ERR(right); 981 right = NULL; 982 goto out; 983 } 984 985 btrfs_tree_lock_nested(right, BTRFS_NESTING_RIGHT); 986 wret = btrfs_cow_block(trans, root, right, 987 parent, pslot + 1, &right, 988 BTRFS_NESTING_RIGHT_COW); 989 if (wret) { 990 ret = wret; 991 goto out; 992 } 993 } 994 995 /* first, try to make some room in the middle buffer */ 996 if (left) { 997 orig_slot += btrfs_header_nritems(left); 998 wret = push_node_left(trans, left, mid, 1); 999 if (wret < 0) 1000 ret = wret; 1001 } 1002 1003 /* 1004 * then try to empty the right most buffer into the middle 1005 */ 1006 if (right) { 1007 wret = push_node_left(trans, mid, right, 1); 1008 if (wret < 0 && wret != -ENOSPC) 1009 ret = wret; 1010 if (btrfs_header_nritems(right) == 0) { 1011 btrfs_clear_buffer_dirty(trans, right); 1012 btrfs_tree_unlock(right); 1013 ret = btrfs_del_ptr(trans, root, path, level + 1, pslot + 1); 1014 if (ret < 0) { 1015 free_extent_buffer_stale(right); 1016 right = NULL; 1017 goto out; 1018 } 1019 root_sub_used_bytes(root); 1020 ret = btrfs_free_tree_block(trans, btrfs_root_id(root), 1021 right, 0, 1); 1022 free_extent_buffer_stale(right); 1023 right = NULL; 1024 if (unlikely(ret < 0)) { 1025 btrfs_abort_transaction(trans, ret); 1026 goto out; 1027 } 1028 } else { 1029 struct btrfs_disk_key right_key; 1030 btrfs_node_key(right, &right_key, 0); 1031 ret = btrfs_tree_mod_log_insert_key(parent, pslot + 1, 1032 BTRFS_MOD_LOG_KEY_REPLACE); 1033 if (unlikely(ret < 0)) { 1034 btrfs_abort_transaction(trans, ret); 1035 goto out; 1036 } 1037 btrfs_set_node_key(parent, &right_key, pslot + 1); 1038 btrfs_mark_buffer_dirty(trans, parent); 1039 } 1040 } 1041 if (btrfs_header_nritems(mid) == 1) { 1042 /* 1043 * we're not allowed to leave a node with one item in the 1044 * tree during a delete. A deletion from lower in the tree 1045 * could try to delete the only pointer in this node. 1046 * So, pull some keys from the left. 1047 * There has to be a left pointer at this point because 1048 * otherwise we would have pulled some pointers from the 1049 * right 1050 */ 1051 if (unlikely(!left)) { 1052 btrfs_crit(fs_info, 1053 "missing left child when middle child only has 1 item, parent bytenr %llu level %d mid bytenr %llu root %llu", 1054 parent->start, btrfs_header_level(parent), 1055 mid->start, btrfs_root_id(root)); 1056 ret = -EUCLEAN; 1057 btrfs_abort_transaction(trans, ret); 1058 goto out; 1059 } 1060 wret = balance_node_right(trans, mid, left); 1061 if (wret < 0) { 1062 ret = wret; 1063 goto out; 1064 } 1065 if (wret == 1) { 1066 wret = push_node_left(trans, left, mid, 1); 1067 if (wret < 0) 1068 ret = wret; 1069 } 1070 BUG_ON(wret == 1); 1071 } 1072 if (btrfs_header_nritems(mid) == 0) { 1073 btrfs_clear_buffer_dirty(trans, mid); 1074 btrfs_tree_unlock(mid); 1075 ret = btrfs_del_ptr(trans, root, path, level + 1, pslot); 1076 if (ret < 0) { 1077 free_extent_buffer_stale(mid); 1078 mid = NULL; 1079 goto out; 1080 } 1081 root_sub_used_bytes(root); 1082 ret = btrfs_free_tree_block(trans, btrfs_root_id(root), mid, 0, 1); 1083 free_extent_buffer_stale(mid); 1084 mid = NULL; 1085 if (unlikely(ret < 0)) { 1086 btrfs_abort_transaction(trans, ret); 1087 goto out; 1088 } 1089 } else { 1090 /* update the parent key to reflect our changes */ 1091 struct btrfs_disk_key mid_key; 1092 btrfs_node_key(mid, &mid_key, 0); 1093 ret = btrfs_tree_mod_log_insert_key(parent, pslot, 1094 BTRFS_MOD_LOG_KEY_REPLACE); 1095 if (unlikely(ret < 0)) { 1096 btrfs_abort_transaction(trans, ret); 1097 goto out; 1098 } 1099 btrfs_set_node_key(parent, &mid_key, pslot); 1100 btrfs_mark_buffer_dirty(trans, parent); 1101 } 1102 1103 /* update the path */ 1104 if (left) { 1105 if (btrfs_header_nritems(left) > orig_slot) { 1106 /* left was locked after cow */ 1107 path->nodes[level] = left; 1108 path->slots[level + 1] -= 1; 1109 path->slots[level] = orig_slot; 1110 /* Left is now owned by path. */ 1111 left = NULL; 1112 if (mid) { 1113 btrfs_tree_unlock(mid); 1114 free_extent_buffer(mid); 1115 } 1116 } else { 1117 orig_slot -= btrfs_header_nritems(left); 1118 path->slots[level] = orig_slot; 1119 } 1120 } 1121 /* double check we haven't messed things up */ 1122 if (orig_ptr != 1123 btrfs_node_blockptr(path->nodes[level], path->slots[level])) 1124 BUG(); 1125 out: 1126 if (right) { 1127 btrfs_tree_unlock(right); 1128 free_extent_buffer(right); 1129 } 1130 if (left) { 1131 btrfs_tree_unlock(left); 1132 free_extent_buffer(left); 1133 } 1134 return ret; 1135 } 1136 1137 /* Node balancing for insertion. Here we only split or push nodes around 1138 * when they are completely full. This is also done top down, so we 1139 * have to be pessimistic. 1140 */ 1141 static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, 1142 struct btrfs_root *root, 1143 struct btrfs_path *path, int level) 1144 { 1145 struct btrfs_fs_info *fs_info = root->fs_info; 1146 struct extent_buffer *right = NULL; 1147 struct extent_buffer *mid; 1148 struct extent_buffer *left = NULL; 1149 struct extent_buffer *parent = NULL; 1150 int ret = 0; 1151 int wret; 1152 int pslot; 1153 int orig_slot = path->slots[level]; 1154 1155 if (level == 0) 1156 return 1; 1157 1158 mid = path->nodes[level]; 1159 WARN_ON(btrfs_header_generation(mid) != trans->transid); 1160 1161 if (level < BTRFS_MAX_LEVEL - 1) { 1162 parent = path->nodes[level + 1]; 1163 pslot = path->slots[level + 1]; 1164 } 1165 1166 if (!parent) 1167 return 1; 1168 1169 /* first, try to make some room in the middle buffer */ 1170 if (pslot) { 1171 u32 left_nr; 1172 1173 left = btrfs_read_node_slot(parent, pslot - 1); 1174 if (IS_ERR(left)) 1175 return PTR_ERR(left); 1176 1177 btrfs_tree_lock_nested(left, BTRFS_NESTING_LEFT); 1178 1179 left_nr = btrfs_header_nritems(left); 1180 if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) { 1181 wret = 1; 1182 } else { 1183 ret = btrfs_cow_block(trans, root, left, parent, 1184 pslot - 1, &left, 1185 BTRFS_NESTING_LEFT_COW); 1186 if (ret) 1187 wret = 1; 1188 else { 1189 wret = push_node_left(trans, left, mid, 0); 1190 } 1191 } 1192 if (wret < 0) 1193 ret = wret; 1194 if (wret == 0) { 1195 struct btrfs_disk_key disk_key; 1196 orig_slot += left_nr; 1197 btrfs_node_key(mid, &disk_key, 0); 1198 ret = btrfs_tree_mod_log_insert_key(parent, pslot, 1199 BTRFS_MOD_LOG_KEY_REPLACE); 1200 if (unlikely(ret < 0)) { 1201 btrfs_tree_unlock(left); 1202 free_extent_buffer(left); 1203 btrfs_abort_transaction(trans, ret); 1204 return ret; 1205 } 1206 btrfs_set_node_key(parent, &disk_key, pslot); 1207 btrfs_mark_buffer_dirty(trans, parent); 1208 if (btrfs_header_nritems(left) > orig_slot) { 1209 path->nodes[level] = left; 1210 path->slots[level + 1] -= 1; 1211 path->slots[level] = orig_slot; 1212 btrfs_tree_unlock(mid); 1213 free_extent_buffer(mid); 1214 } else { 1215 orig_slot -= 1216 btrfs_header_nritems(left); 1217 path->slots[level] = orig_slot; 1218 btrfs_tree_unlock(left); 1219 free_extent_buffer(left); 1220 } 1221 return 0; 1222 } 1223 btrfs_tree_unlock(left); 1224 free_extent_buffer(left); 1225 } 1226 1227 /* 1228 * then try to empty the right most buffer into the middle 1229 */ 1230 if (pslot + 1 < btrfs_header_nritems(parent)) { 1231 u32 right_nr; 1232 1233 right = btrfs_read_node_slot(parent, pslot + 1); 1234 if (IS_ERR(right)) 1235 return PTR_ERR(right); 1236 1237 btrfs_tree_lock_nested(right, BTRFS_NESTING_RIGHT); 1238 1239 right_nr = btrfs_header_nritems(right); 1240 if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) { 1241 wret = 1; 1242 } else { 1243 ret = btrfs_cow_block(trans, root, right, 1244 parent, pslot + 1, 1245 &right, BTRFS_NESTING_RIGHT_COW); 1246 if (ret) 1247 wret = 1; 1248 else { 1249 wret = balance_node_right(trans, right, mid); 1250 } 1251 } 1252 if (wret < 0) 1253 ret = wret; 1254 if (wret == 0) { 1255 struct btrfs_disk_key disk_key; 1256 1257 btrfs_node_key(right, &disk_key, 0); 1258 ret = btrfs_tree_mod_log_insert_key(parent, pslot + 1, 1259 BTRFS_MOD_LOG_KEY_REPLACE); 1260 if (unlikely(ret < 0)) { 1261 btrfs_tree_unlock(right); 1262 free_extent_buffer(right); 1263 btrfs_abort_transaction(trans, ret); 1264 return ret; 1265 } 1266 btrfs_set_node_key(parent, &disk_key, pslot + 1); 1267 btrfs_mark_buffer_dirty(trans, parent); 1268 1269 if (btrfs_header_nritems(mid) <= orig_slot) { 1270 path->nodes[level] = right; 1271 path->slots[level + 1] += 1; 1272 path->slots[level] = orig_slot - 1273 btrfs_header_nritems(mid); 1274 btrfs_tree_unlock(mid); 1275 free_extent_buffer(mid); 1276 } else { 1277 btrfs_tree_unlock(right); 1278 free_extent_buffer(right); 1279 } 1280 return 0; 1281 } 1282 btrfs_tree_unlock(right); 1283 free_extent_buffer(right); 1284 } 1285 return 1; 1286 } 1287 1288 /* 1289 * readahead one full node of leaves, finding things that are close 1290 * to the block in 'slot', and triggering ra on them. 1291 */ 1292 static void reada_for_search(struct btrfs_fs_info *fs_info, 1293 const struct btrfs_path *path, 1294 int level, int slot, u64 objectid) 1295 { 1296 struct extent_buffer *node; 1297 struct btrfs_disk_key disk_key; 1298 u32 nritems; 1299 u64 search; 1300 u64 target; 1301 u64 nread = 0; 1302 u64 nread_max; 1303 u32 nr; 1304 u32 blocksize; 1305 u32 nscan = 0; 1306 1307 if (level != 1 && path->reada != READA_FORWARD_ALWAYS) 1308 return; 1309 1310 if (!path->nodes[level]) 1311 return; 1312 1313 node = path->nodes[level]; 1314 1315 /* 1316 * Since the time between visiting leaves is much shorter than the time 1317 * between visiting nodes, limit read ahead of nodes to 1, to avoid too 1318 * much IO at once (possibly random). 1319 */ 1320 if (path->reada == READA_FORWARD_ALWAYS) { 1321 if (level > 1) 1322 nread_max = node->fs_info->nodesize; 1323 else 1324 nread_max = SZ_128K; 1325 } else { 1326 nread_max = SZ_64K; 1327 } 1328 1329 search = btrfs_node_blockptr(node, slot); 1330 blocksize = fs_info->nodesize; 1331 if (path->reada != READA_FORWARD_ALWAYS) { 1332 struct extent_buffer *eb; 1333 1334 eb = find_extent_buffer(fs_info, search); 1335 if (eb) { 1336 free_extent_buffer(eb); 1337 return; 1338 } 1339 } 1340 1341 target = search; 1342 1343 nritems = btrfs_header_nritems(node); 1344 nr = slot; 1345 1346 while (1) { 1347 if (path->reada == READA_BACK) { 1348 if (nr == 0) 1349 break; 1350 nr--; 1351 } else if (path->reada == READA_FORWARD || 1352 path->reada == READA_FORWARD_ALWAYS) { 1353 nr++; 1354 if (nr >= nritems) 1355 break; 1356 } 1357 if (path->reada == READA_BACK && objectid) { 1358 btrfs_node_key(node, &disk_key, nr); 1359 if (btrfs_disk_key_objectid(&disk_key) != objectid) 1360 break; 1361 } 1362 search = btrfs_node_blockptr(node, nr); 1363 if (path->reada == READA_FORWARD_ALWAYS || 1364 (search <= target && target - search <= 65536) || 1365 (search > target && search - target <= 65536)) { 1366 btrfs_readahead_node_child(node, nr); 1367 nread += blocksize; 1368 } 1369 nscan++; 1370 if (nread > nread_max || nscan > 32) 1371 break; 1372 } 1373 } 1374 1375 static noinline void reada_for_balance(const struct btrfs_path *path, int level) 1376 { 1377 struct extent_buffer *parent; 1378 int slot; 1379 int nritems; 1380 1381 parent = path->nodes[level + 1]; 1382 if (!parent) 1383 return; 1384 1385 nritems = btrfs_header_nritems(parent); 1386 slot = path->slots[level + 1]; 1387 1388 if (slot > 0) 1389 btrfs_readahead_node_child(parent, slot - 1); 1390 if (slot + 1 < nritems) 1391 btrfs_readahead_node_child(parent, slot + 1); 1392 } 1393 1394 1395 /* 1396 * when we walk down the tree, it is usually safe to unlock the higher layers 1397 * in the tree. The exceptions are when our path goes through slot 0, because 1398 * operations on the tree might require changing key pointers higher up in the 1399 * tree. 1400 * 1401 * callers might also have set path->keep_locks, which tells this code to keep 1402 * the lock if the path points to the last slot in the block. This is part of 1403 * walking through the tree, and selecting the next slot in the higher block. 1404 * 1405 * lowest_unlock sets the lowest level in the tree we're allowed to unlock. so 1406 * if lowest_unlock is 1, level 0 won't be unlocked 1407 */ 1408 static noinline void unlock_up(struct btrfs_path *path, int level, 1409 int lowest_unlock, int min_write_lock_level, 1410 int *write_lock_level) 1411 { 1412 int i; 1413 int skip_level = level; 1414 bool check_skip = true; 1415 1416 for (i = level; i < BTRFS_MAX_LEVEL; i++) { 1417 if (!path->nodes[i]) 1418 break; 1419 if (!path->locks[i]) 1420 break; 1421 1422 if (check_skip) { 1423 if (path->slots[i] == 0) { 1424 skip_level = i + 1; 1425 continue; 1426 } 1427 1428 if (path->keep_locks) { 1429 u32 nritems; 1430 1431 nritems = btrfs_header_nritems(path->nodes[i]); 1432 if (nritems < 1 || path->slots[i] >= nritems - 1) { 1433 skip_level = i + 1; 1434 continue; 1435 } 1436 } 1437 } 1438 1439 if (i >= lowest_unlock && i > skip_level) { 1440 btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]); 1441 check_skip = false; 1442 path->locks[i] = 0; 1443 if (write_lock_level && 1444 i > min_write_lock_level && 1445 i <= *write_lock_level) { 1446 *write_lock_level = i - 1; 1447 } 1448 } 1449 } 1450 } 1451 1452 /* 1453 * Helper function for btrfs_search_slot() and other functions that do a search 1454 * on a btree. The goal is to find a tree block in the cache (the radix tree at 1455 * fs_info->buffer_radix), but if we can't find it, or it's not up to date, read 1456 * its pages from disk. 1457 * 1458 * Returns -EAGAIN, with the path unlocked, if the caller needs to repeat the 1459 * whole btree search, starting again from the current root node. 1460 */ 1461 static int 1462 read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, 1463 struct extent_buffer **eb_ret, int slot, 1464 const struct btrfs_key *key) 1465 { 1466 struct btrfs_fs_info *fs_info = root->fs_info; 1467 struct btrfs_tree_parent_check check = { 0 }; 1468 u64 blocknr; 1469 struct extent_buffer *tmp = NULL; 1470 int ret = 0; 1471 int ret2; 1472 int parent_level; 1473 bool read_tmp = false; 1474 bool tmp_locked = false; 1475 bool path_released = false; 1476 1477 blocknr = btrfs_node_blockptr(*eb_ret, slot); 1478 parent_level = btrfs_header_level(*eb_ret); 1479 btrfs_node_key_to_cpu(*eb_ret, &check.first_key, slot); 1480 check.has_first_key = true; 1481 check.level = parent_level - 1; 1482 check.transid = btrfs_node_ptr_generation(*eb_ret, slot); 1483 check.owner_root = btrfs_root_id(root); 1484 1485 /* 1486 * If we need to read an extent buffer from disk and we are holding locks 1487 * on upper level nodes, we unlock all the upper nodes before reading the 1488 * extent buffer, and then return -EAGAIN to the caller as it needs to 1489 * restart the search. We don't release the lock on the current level 1490 * because we need to walk this node to figure out which blocks to read. 1491 */ 1492 tmp = find_extent_buffer(fs_info, blocknr); 1493 if (tmp) { 1494 if (p->reada == READA_FORWARD_ALWAYS) 1495 reada_for_search(fs_info, p, parent_level, slot, key->objectid); 1496 1497 /* Check if the cached eb is uptodate. */ 1498 ret = btrfs_buffer_uptodate(tmp, check.transid, &check); 1499 if (unlikely(ret < 0)) 1500 goto out; 1501 if (ret > 0) { 1502 *eb_ret = tmp; 1503 tmp = NULL; 1504 ret = 0; 1505 goto out; 1506 } 1507 1508 if (p->nowait) { 1509 ret = -EAGAIN; 1510 goto out; 1511 } 1512 1513 if (!p->skip_locking) { 1514 btrfs_unlock_up_safe(p, parent_level + 1); 1515 btrfs_maybe_reset_lockdep_class(root, tmp); 1516 tmp_locked = true; 1517 btrfs_tree_read_lock(tmp); 1518 btrfs_release_path(p); 1519 ret = -EAGAIN; 1520 path_released = true; 1521 } 1522 1523 /* Now we're allowed to do a blocking uptodate check. */ 1524 ret2 = btrfs_read_extent_buffer(tmp, &check); 1525 if (ret2) { 1526 ret = ret2; 1527 goto out; 1528 } 1529 1530 if (ret == 0) { 1531 ASSERT(!tmp_locked); 1532 *eb_ret = tmp; 1533 tmp = NULL; 1534 } 1535 goto out; 1536 } else if (p->nowait) { 1537 ret = -EAGAIN; 1538 goto out; 1539 } 1540 1541 if (!p->skip_locking) { 1542 btrfs_unlock_up_safe(p, parent_level + 1); 1543 ret = -EAGAIN; 1544 } 1545 1546 if (p->reada != READA_NONE) 1547 reada_for_search(fs_info, p, parent_level, slot, key->objectid); 1548 1549 tmp = btrfs_find_create_tree_block(fs_info, blocknr, check.owner_root, check.level); 1550 if (IS_ERR(tmp)) { 1551 ret = PTR_ERR(tmp); 1552 tmp = NULL; 1553 goto out; 1554 } 1555 read_tmp = true; 1556 1557 if (!p->skip_locking) { 1558 ASSERT(ret == -EAGAIN); 1559 btrfs_maybe_reset_lockdep_class(root, tmp); 1560 tmp_locked = true; 1561 btrfs_tree_read_lock(tmp); 1562 btrfs_release_path(p); 1563 path_released = true; 1564 } 1565 1566 /* Now we're allowed to do a blocking uptodate check. */ 1567 ret2 = btrfs_read_extent_buffer(tmp, &check); 1568 if (ret2) { 1569 ret = ret2; 1570 goto out; 1571 } 1572 1573 /* 1574 * If the read above didn't mark this buffer up to date, 1575 * it will never end up being up to date. Set ret to EIO now 1576 * and give up so that our caller doesn't loop forever 1577 * on our EAGAINs. 1578 */ 1579 if (unlikely(!extent_buffer_uptodate(tmp))) { 1580 ret = -EIO; 1581 goto out; 1582 } 1583 1584 if (ret == 0) { 1585 ASSERT(!tmp_locked); 1586 *eb_ret = tmp; 1587 tmp = NULL; 1588 } 1589 out: 1590 if (tmp) { 1591 if (tmp_locked) 1592 btrfs_tree_read_unlock(tmp); 1593 if (read_tmp && ret && ret != -EAGAIN) 1594 free_extent_buffer_stale(tmp); 1595 else 1596 free_extent_buffer(tmp); 1597 } 1598 if (ret && !path_released) 1599 btrfs_release_path(p); 1600 1601 return ret; 1602 } 1603 1604 /* 1605 * helper function for btrfs_search_slot. This does all of the checks 1606 * for node-level blocks and does any balancing required based on 1607 * the ins_len. 1608 * 1609 * If no extra work was required, zero is returned. If we had to 1610 * drop the path, -EAGAIN is returned and btrfs_search_slot must 1611 * start over 1612 */ 1613 static int 1614 setup_nodes_for_search(struct btrfs_trans_handle *trans, 1615 struct btrfs_root *root, struct btrfs_path *p, 1616 struct extent_buffer *b, int level, int ins_len, 1617 int *write_lock_level) 1618 { 1619 struct btrfs_fs_info *fs_info = root->fs_info; 1620 int ret = 0; 1621 1622 if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >= 1623 BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3) { 1624 1625 if (*write_lock_level < level + 1) { 1626 *write_lock_level = level + 1; 1627 btrfs_release_path(p); 1628 return -EAGAIN; 1629 } 1630 1631 reada_for_balance(p, level); 1632 ret = split_node(trans, root, p, level); 1633 1634 b = p->nodes[level]; 1635 } else if (ins_len < 0 && btrfs_header_nritems(b) < 1636 BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 2) { 1637 1638 if (*write_lock_level < level + 1) { 1639 *write_lock_level = level + 1; 1640 btrfs_release_path(p); 1641 return -EAGAIN; 1642 } 1643 1644 reada_for_balance(p, level); 1645 ret = balance_level(trans, root, p, level); 1646 if (ret) 1647 return ret; 1648 1649 b = p->nodes[level]; 1650 if (!b) { 1651 btrfs_release_path(p); 1652 return -EAGAIN; 1653 } 1654 BUG_ON(btrfs_header_nritems(b) == 1); 1655 } 1656 return ret; 1657 } 1658 1659 int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *path, 1660 u64 iobjectid, u64 ioff, u8 key_type, 1661 struct btrfs_key *found_key) 1662 { 1663 int ret; 1664 struct btrfs_key key; 1665 struct extent_buffer *eb; 1666 1667 ASSERT(path); 1668 ASSERT(found_key); 1669 1670 key.type = key_type; 1671 key.objectid = iobjectid; 1672 key.offset = ioff; 1673 1674 ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0); 1675 if (ret < 0) 1676 return ret; 1677 1678 eb = path->nodes[0]; 1679 if (ret && path->slots[0] >= btrfs_header_nritems(eb)) { 1680 ret = btrfs_next_leaf(fs_root, path); 1681 if (ret) 1682 return ret; 1683 eb = path->nodes[0]; 1684 } 1685 1686 btrfs_item_key_to_cpu(eb, found_key, path->slots[0]); 1687 if (found_key->type != key.type || 1688 found_key->objectid != key.objectid) 1689 return 1; 1690 1691 return 0; 1692 } 1693 1694 static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root, 1695 struct btrfs_path *p, 1696 int write_lock_level) 1697 { 1698 struct extent_buffer *b; 1699 int root_lock = 0; 1700 int level = 0; 1701 1702 if (p->search_commit_root) { 1703 b = root->commit_root; 1704 refcount_inc(&b->refs); 1705 level = btrfs_header_level(b); 1706 /* 1707 * Ensure that all callers have set skip_locking when 1708 * p->search_commit_root is true. 1709 */ 1710 ASSERT(p->skip_locking); 1711 1712 goto out; 1713 } 1714 1715 if (p->skip_locking) { 1716 b = btrfs_root_node(root); 1717 level = btrfs_header_level(b); 1718 goto out; 1719 } 1720 1721 /* We try very hard to do read locks on the root */ 1722 root_lock = BTRFS_READ_LOCK; 1723 1724 /* 1725 * If the level is set to maximum, we can skip trying to get the read 1726 * lock. 1727 */ 1728 if (write_lock_level < BTRFS_MAX_LEVEL) { 1729 /* 1730 * We don't know the level of the root node until we actually 1731 * have it read locked 1732 */ 1733 if (p->nowait) { 1734 b = btrfs_try_read_lock_root_node(root); 1735 if (IS_ERR(b)) 1736 return b; 1737 } else { 1738 b = btrfs_read_lock_root_node(root); 1739 } 1740 level = btrfs_header_level(b); 1741 if (level > write_lock_level) 1742 goto out; 1743 1744 /* Whoops, must trade for write lock */ 1745 btrfs_tree_read_unlock(b); 1746 free_extent_buffer(b); 1747 } 1748 1749 b = btrfs_lock_root_node(root); 1750 root_lock = BTRFS_WRITE_LOCK; 1751 1752 /* The level might have changed, check again */ 1753 level = btrfs_header_level(b); 1754 1755 out: 1756 /* 1757 * The root may have failed to write out at some point, and thus is no 1758 * longer valid, return an error in this case. 1759 */ 1760 if (unlikely(!extent_buffer_uptodate(b))) { 1761 if (root_lock) 1762 btrfs_tree_unlock_rw(b, root_lock); 1763 free_extent_buffer(b); 1764 return ERR_PTR(-EIO); 1765 } 1766 1767 p->nodes[level] = b; 1768 if (!p->skip_locking) 1769 p->locks[level] = root_lock; 1770 /* 1771 * Callers are responsible for dropping b's references. 1772 */ 1773 return b; 1774 } 1775 1776 /* 1777 * Replace the extent buffer at the lowest level of the path with a cloned 1778 * version. The purpose is to be able to use it safely, after releasing the 1779 * commit root semaphore, even if relocation is happening in parallel, the 1780 * transaction used for relocation is committed and the extent buffer is 1781 * reallocated in the next transaction. 1782 * 1783 * This is used in a context where the caller does not prevent transaction 1784 * commits from happening, either by holding a transaction handle or holding 1785 * some lock, while it's doing searches through a commit root. 1786 * At the moment it's only used for send operations. 1787 */ 1788 static int finish_need_commit_sem_search(struct btrfs_path *path) 1789 { 1790 const int i = path->lowest_level; 1791 const int slot = path->slots[i]; 1792 struct extent_buffer *lowest = path->nodes[i]; 1793 struct extent_buffer *clone; 1794 1795 ASSERT(path->need_commit_sem); 1796 1797 if (!lowest) 1798 return 0; 1799 1800 lockdep_assert_held_read(&lowest->fs_info->commit_root_sem); 1801 1802 clone = btrfs_clone_extent_buffer(lowest); 1803 if (!clone) 1804 return -ENOMEM; 1805 1806 btrfs_release_path(path); 1807 path->nodes[i] = clone; 1808 path->slots[i] = slot; 1809 1810 return 0; 1811 } 1812 1813 static inline int search_for_key_slot(const struct extent_buffer *eb, 1814 int search_low_slot, 1815 const struct btrfs_key *key, 1816 int prev_cmp, 1817 int *slot) 1818 { 1819 /* 1820 * If a previous call to btrfs_bin_search() on a parent node returned an 1821 * exact match (prev_cmp == 0), we can safely assume the target key will 1822 * always be at slot 0 on lower levels, since each key pointer 1823 * (struct btrfs_key_ptr) refers to the lowest key accessible from the 1824 * subtree it points to. Thus we can skip searching lower levels. 1825 */ 1826 if (prev_cmp == 0) { 1827 *slot = 0; 1828 return 0; 1829 } 1830 1831 return btrfs_bin_search(eb, search_low_slot, key, slot); 1832 } 1833 1834 static int search_leaf(struct btrfs_trans_handle *trans, 1835 struct btrfs_root *root, 1836 const struct btrfs_key *key, 1837 struct btrfs_path *path, 1838 int ins_len, 1839 int prev_cmp) 1840 { 1841 struct extent_buffer *leaf = path->nodes[0]; 1842 int leaf_free_space = -1; 1843 int search_low_slot = 0; 1844 int ret; 1845 bool do_bin_search = true; 1846 1847 /* 1848 * If we are doing an insertion, the leaf has enough free space and the 1849 * destination slot for the key is not slot 0, then we can unlock our 1850 * write lock on the parent, and any other upper nodes, before doing the 1851 * binary search on the leaf (with search_for_key_slot()), allowing other 1852 * tasks to lock the parent and any other upper nodes. 1853 */ 1854 if (ins_len > 0) { 1855 /* 1856 * Cache the leaf free space, since we will need it later and it 1857 * will not change until then. 1858 */ 1859 leaf_free_space = btrfs_leaf_free_space(leaf); 1860 1861 /* 1862 * !path->locks[1] means we have a single node tree, the leaf is 1863 * the root of the tree. 1864 */ 1865 if (path->locks[1] && leaf_free_space >= ins_len) { 1866 struct btrfs_disk_key first_key; 1867 1868 ASSERT(btrfs_header_nritems(leaf) > 0); 1869 btrfs_item_key(leaf, &first_key, 0); 1870 1871 /* 1872 * Doing the extra comparison with the first key is cheap, 1873 * taking into account that the first key is very likely 1874 * already in a cache line because it immediately follows 1875 * the extent buffer's header and we have recently accessed 1876 * the header's level field. 1877 */ 1878 ret = btrfs_comp_keys(&first_key, key); 1879 if (ret < 0) { 1880 /* 1881 * The first key is smaller than the key we want 1882 * to insert, so we are safe to unlock all upper 1883 * nodes and we have to do the binary search. 1884 * 1885 * We do use btrfs_unlock_up_safe() and not 1886 * unlock_up() because the later does not unlock 1887 * nodes with a slot of 0 - we can safely unlock 1888 * any node even if its slot is 0 since in this 1889 * case the key does not end up at slot 0 of the 1890 * leaf and there's no need to split the leaf. 1891 */ 1892 btrfs_unlock_up_safe(path, 1); 1893 search_low_slot = 1; 1894 } else { 1895 /* 1896 * The first key is >= then the key we want to 1897 * insert, so we can skip the binary search as 1898 * the target key will be at slot 0. 1899 * 1900 * We can not unlock upper nodes when the key is 1901 * less than the first key, because we will need 1902 * to update the key at slot 0 of the parent node 1903 * and possibly of other upper nodes too. 1904 * If the key matches the first key, then we can 1905 * unlock all the upper nodes, using 1906 * btrfs_unlock_up_safe() instead of unlock_up() 1907 * as stated above. 1908 */ 1909 if (ret == 0) 1910 btrfs_unlock_up_safe(path, 1); 1911 /* 1912 * ret is already 0 or 1, matching the result of 1913 * a btrfs_bin_search() call, so there is no need 1914 * to adjust it. 1915 */ 1916 do_bin_search = false; 1917 path->slots[0] = 0; 1918 } 1919 } 1920 } 1921 1922 if (do_bin_search) { 1923 ret = search_for_key_slot(leaf, search_low_slot, key, 1924 prev_cmp, &path->slots[0]); 1925 if (ret < 0) 1926 return ret; 1927 } 1928 1929 if (ins_len > 0) { 1930 /* 1931 * Item key already exists. In this case, if we are allowed to 1932 * insert the item (for example, in dir_item case, item key 1933 * collision is allowed), it will be merged with the original 1934 * item. Only the item size grows, no new btrfs item will be 1935 * added. If search_for_extension is not set, ins_len already 1936 * accounts the size btrfs_item, deduct it here so leaf space 1937 * check will be correct. 1938 */ 1939 if (ret == 0 && !path->search_for_extension) { 1940 ASSERT(ins_len >= sizeof(struct btrfs_item)); 1941 ins_len -= sizeof(struct btrfs_item); 1942 } 1943 1944 ASSERT(leaf_free_space >= 0); 1945 1946 if (leaf_free_space < ins_len) { 1947 int ret2; 1948 1949 ret2 = split_leaf(trans, root, key, path, ins_len, (ret == 0)); 1950 ASSERT(ret2 <= 0); 1951 if (WARN_ON(ret2 > 0)) 1952 ret2 = -EUCLEAN; 1953 if (ret2) 1954 ret = ret2; 1955 } 1956 } 1957 1958 return ret; 1959 } 1960 1961 /* 1962 * Look for a key in a tree and perform necessary modifications to preserve 1963 * tree invariants. 1964 * 1965 * @trans: Handle of transaction, used when modifying the tree 1966 * @p: Holds all btree nodes along the search path 1967 * @root: The root node of the tree 1968 * @key: The key we are looking for 1969 * @ins_len: Indicates purpose of search: 1970 * >0 for inserts it's size of item inserted (*) 1971 * <0 for deletions 1972 * 0 for plain searches, not modifying the tree 1973 * 1974 * (*) If size of item inserted doesn't include 1975 * sizeof(struct btrfs_item), then p->search_for_extension must 1976 * be set. 1977 * @cow: boolean should CoW operations be performed. Must always be 1 1978 * when modifying the tree. 1979 * 1980 * If @ins_len > 0, nodes and leaves will be split as we walk down the tree. 1981 * If @ins_len < 0, nodes will be merged as we walk down the tree (if possible) 1982 * 1983 * If @key is found, 0 is returned and you can find the item in the leaf level 1984 * of the path (level 0) 1985 * 1986 * If @key isn't found, 1 is returned and the leaf level of the path (level 0) 1987 * points to the slot where it should be inserted 1988 * 1989 * If an error is encountered while searching the tree a negative error number 1990 * is returned 1991 */ 1992 int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root, 1993 const struct btrfs_key *key, struct btrfs_path *p, 1994 int ins_len, int cow) 1995 { 1996 struct btrfs_fs_info *fs_info; 1997 struct extent_buffer *b; 1998 int slot; 1999 int ret; 2000 int level; 2001 int lowest_unlock = 1; 2002 /* everything at write_lock_level or lower must be write locked */ 2003 int write_lock_level = 0; 2004 u8 lowest_level = 0; 2005 int min_write_lock_level; 2006 int prev_cmp; 2007 2008 if (!root) 2009 return -EINVAL; 2010 2011 fs_info = root->fs_info; 2012 might_sleep(); 2013 2014 lowest_level = p->lowest_level; 2015 WARN_ON(lowest_level && ins_len > 0); 2016 WARN_ON(p->nodes[0] != NULL); 2017 BUG_ON(!cow && ins_len); 2018 2019 /* 2020 * For now only allow nowait for read only operations. There's no 2021 * strict reason why we can't, we just only need it for reads so it's 2022 * only implemented for reads. 2023 */ 2024 ASSERT(!p->nowait || !cow); 2025 2026 if (ins_len < 0) { 2027 lowest_unlock = 2; 2028 2029 /* when we are removing items, we might have to go up to level 2030 * two as we update tree pointers Make sure we keep write 2031 * for those levels as well 2032 */ 2033 write_lock_level = 2; 2034 } else if (ins_len > 0) { 2035 /* 2036 * for inserting items, make sure we have a write lock on 2037 * level 1 so we can update keys 2038 */ 2039 write_lock_level = 1; 2040 } 2041 2042 if (!cow) 2043 write_lock_level = -1; 2044 2045 if (cow && (p->keep_locks || p->lowest_level)) 2046 write_lock_level = BTRFS_MAX_LEVEL; 2047 2048 min_write_lock_level = write_lock_level; 2049 2050 if (p->need_commit_sem) { 2051 ASSERT(p->search_commit_root); 2052 if (p->nowait) { 2053 if (!down_read_trylock(&fs_info->commit_root_sem)) 2054 return -EAGAIN; 2055 } else { 2056 down_read(&fs_info->commit_root_sem); 2057 } 2058 } 2059 2060 again: 2061 prev_cmp = -1; 2062 b = btrfs_search_slot_get_root(root, p, write_lock_level); 2063 if (IS_ERR(b)) { 2064 ret = PTR_ERR(b); 2065 goto done; 2066 } 2067 2068 while (b) { 2069 bool dec = false; 2070 int ret2; 2071 2072 level = btrfs_header_level(b); 2073 2074 if (cow) { 2075 bool last_level = (level == (BTRFS_MAX_LEVEL - 1)); 2076 2077 /* 2078 * if we don't really need to cow this block 2079 * then we don't want to set the path blocking, 2080 * so we test it here 2081 */ 2082 if (!should_cow_block(trans, root, b)) 2083 goto cow_done; 2084 2085 /* 2086 * must have write locks on this node and the 2087 * parent 2088 */ 2089 if (level > write_lock_level || 2090 (level + 1 > write_lock_level && 2091 level + 1 < BTRFS_MAX_LEVEL && 2092 p->nodes[level + 1])) { 2093 write_lock_level = level + 1; 2094 btrfs_release_path(p); 2095 trace_btrfs_search_slot_restart(root, level, "write_lock"); 2096 goto again; 2097 } 2098 2099 if (last_level) 2100 ret2 = btrfs_cow_block(trans, root, b, NULL, 0, 2101 &b, BTRFS_NESTING_COW); 2102 else 2103 ret2 = btrfs_cow_block(trans, root, b, 2104 p->nodes[level + 1], 2105 p->slots[level + 1], &b, 2106 BTRFS_NESTING_COW); 2107 if (ret2) { 2108 ret = ret2; 2109 goto done; 2110 } 2111 } 2112 cow_done: 2113 p->nodes[level] = b; 2114 2115 /* 2116 * we have a lock on b and as long as we aren't changing 2117 * the tree, there is no way to for the items in b to change. 2118 * It is safe to drop the lock on our parent before we 2119 * go through the expensive btree search on b. 2120 * 2121 * If we're inserting or deleting (ins_len != 0), then we might 2122 * be changing slot zero, which may require changing the parent. 2123 * So, we can't drop the lock until after we know which slot 2124 * we're operating on. 2125 */ 2126 if (!ins_len && !p->keep_locks) { 2127 int u = level + 1; 2128 2129 if (u < BTRFS_MAX_LEVEL && p->locks[u]) { 2130 btrfs_tree_unlock_rw(p->nodes[u], p->locks[u]); 2131 p->locks[u] = 0; 2132 } 2133 } 2134 2135 if (level == 0) { 2136 if (ins_len > 0) 2137 ASSERT(write_lock_level >= 1); 2138 2139 ret = search_leaf(trans, root, key, p, ins_len, prev_cmp); 2140 if (!p->search_for_split) 2141 unlock_up(p, level, lowest_unlock, 2142 min_write_lock_level, NULL); 2143 goto done; 2144 } 2145 2146 ret = search_for_key_slot(b, 0, key, prev_cmp, &slot); 2147 if (ret < 0) 2148 goto done; 2149 prev_cmp = ret; 2150 2151 if (ret && slot > 0) { 2152 dec = true; 2153 slot--; 2154 } 2155 p->slots[level] = slot; 2156 ret2 = setup_nodes_for_search(trans, root, p, b, level, ins_len, 2157 &write_lock_level); 2158 if (ret2 == -EAGAIN) { 2159 trace_btrfs_search_slot_restart(root, level, "setup_nodes"); 2160 goto again; 2161 } 2162 if (ret2) { 2163 ret = ret2; 2164 goto done; 2165 } 2166 b = p->nodes[level]; 2167 slot = p->slots[level]; 2168 2169 /* 2170 * Slot 0 is special, if we change the key we have to update 2171 * the parent pointer which means we must have a write lock on 2172 * the parent 2173 */ 2174 if (slot == 0 && ins_len && write_lock_level < level + 1) { 2175 write_lock_level = level + 1; 2176 btrfs_release_path(p); 2177 trace_btrfs_search_slot_restart(root, level, "slot_zero"); 2178 goto again; 2179 } 2180 2181 unlock_up(p, level, lowest_unlock, min_write_lock_level, 2182 &write_lock_level); 2183 2184 if (level == lowest_level) { 2185 if (dec) 2186 p->slots[level]++; 2187 goto done; 2188 } 2189 2190 ret2 = read_block_for_search(root, p, &b, slot, key); 2191 if (ret2 == -EAGAIN && !p->nowait) { 2192 trace_btrfs_search_slot_restart(root, level, "read_block"); 2193 goto again; 2194 } 2195 if (ret2) { 2196 ret = ret2; 2197 goto done; 2198 } 2199 2200 if (!p->skip_locking) { 2201 level = btrfs_header_level(b); 2202 2203 btrfs_maybe_reset_lockdep_class(root, b); 2204 2205 if (level <= write_lock_level) { 2206 btrfs_tree_lock(b); 2207 p->locks[level] = BTRFS_WRITE_LOCK; 2208 } else { 2209 if (p->nowait) { 2210 if (!btrfs_try_tree_read_lock(b)) { 2211 free_extent_buffer(b); 2212 ret = -EAGAIN; 2213 goto done; 2214 } 2215 } else { 2216 btrfs_tree_read_lock(b); 2217 } 2218 p->locks[level] = BTRFS_READ_LOCK; 2219 } 2220 p->nodes[level] = b; 2221 } 2222 } 2223 ret = 1; 2224 done: 2225 if (ret < 0 && !p->skip_release_on_error) 2226 btrfs_release_path(p); 2227 2228 if (p->need_commit_sem) { 2229 int ret2; 2230 2231 ret2 = finish_need_commit_sem_search(p); 2232 up_read(&fs_info->commit_root_sem); 2233 if (ret2) 2234 ret = ret2; 2235 } 2236 2237 return ret; 2238 } 2239 ALLOW_ERROR_INJECTION(btrfs_search_slot, ERRNO); 2240 2241 /* 2242 * Like btrfs_search_slot, this looks for a key in the given tree. It uses the 2243 * current state of the tree together with the operations recorded in the tree 2244 * modification log to search for the key in a previous version of this tree, as 2245 * denoted by the time_seq parameter. 2246 * 2247 * Naturally, there is no support for insert, delete or cow operations. 2248 * 2249 * The resulting path and return value will be set up as if we called 2250 * btrfs_search_slot at that point in time with ins_len and cow both set to 0. 2251 */ 2252 int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key, 2253 struct btrfs_path *p, u64 time_seq) 2254 { 2255 struct btrfs_fs_info *fs_info = root->fs_info; 2256 struct extent_buffer *b; 2257 int slot; 2258 int ret; 2259 int level; 2260 int lowest_unlock = 1; 2261 u8 lowest_level = 0; 2262 2263 lowest_level = p->lowest_level; 2264 WARN_ON(p->nodes[0] != NULL); 2265 ASSERT(!p->nowait); 2266 2267 if (p->search_commit_root) { 2268 BUG_ON(time_seq); 2269 return btrfs_search_slot(NULL, root, key, p, 0, 0); 2270 } 2271 2272 again: 2273 b = btrfs_get_old_root(root, time_seq); 2274 if (unlikely(!b)) { 2275 ret = -EIO; 2276 goto done; 2277 } 2278 level = btrfs_header_level(b); 2279 p->locks[level] = BTRFS_READ_LOCK; 2280 2281 while (b) { 2282 bool dec = false; 2283 int ret2; 2284 2285 level = btrfs_header_level(b); 2286 p->nodes[level] = b; 2287 2288 /* 2289 * we have a lock on b and as long as we aren't changing 2290 * the tree, there is no way to for the items in b to change. 2291 * It is safe to drop the lock on our parent before we 2292 * go through the expensive btree search on b. 2293 */ 2294 btrfs_unlock_up_safe(p, level + 1); 2295 2296 ret = btrfs_bin_search(b, 0, key, &slot); 2297 if (ret < 0) 2298 goto done; 2299 2300 if (level == 0) { 2301 p->slots[level] = slot; 2302 unlock_up(p, level, lowest_unlock, 0, NULL); 2303 goto done; 2304 } 2305 2306 if (ret && slot > 0) { 2307 dec = true; 2308 slot--; 2309 } 2310 p->slots[level] = slot; 2311 unlock_up(p, level, lowest_unlock, 0, NULL); 2312 2313 if (level == lowest_level) { 2314 if (dec) 2315 p->slots[level]++; 2316 goto done; 2317 } 2318 2319 ret2 = read_block_for_search(root, p, &b, slot, key); 2320 if (ret2 == -EAGAIN && !p->nowait) 2321 goto again; 2322 if (ret2) { 2323 ret = ret2; 2324 goto done; 2325 } 2326 2327 level = btrfs_header_level(b); 2328 btrfs_tree_read_lock(b); 2329 b = btrfs_tree_mod_log_rewind(fs_info, b, time_seq); 2330 if (!b) { 2331 ret = -ENOMEM; 2332 goto done; 2333 } 2334 p->locks[level] = BTRFS_READ_LOCK; 2335 p->nodes[level] = b; 2336 } 2337 ret = 1; 2338 done: 2339 if (ret < 0) 2340 btrfs_release_path(p); 2341 2342 return ret; 2343 } 2344 2345 /* 2346 * Search the tree again to find a leaf with smaller keys. 2347 * Returns 0 if it found something. 2348 * Returns 1 if there are no smaller keys. 2349 * Returns < 0 on error. 2350 * 2351 * This may release the path, and so you may lose any locks held at the 2352 * time you call it. 2353 */ 2354 static int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path) 2355 { 2356 struct btrfs_key key; 2357 struct btrfs_key orig_key; 2358 struct btrfs_disk_key found_key; 2359 int ret; 2360 2361 btrfs_item_key_to_cpu(path->nodes[0], &key, 0); 2362 orig_key = key; 2363 2364 if (key.offset > 0) { 2365 key.offset--; 2366 } else if (key.type > 0) { 2367 key.type--; 2368 key.offset = (u64)-1; 2369 } else if (key.objectid > 0) { 2370 key.objectid--; 2371 key.type = (u8)-1; 2372 key.offset = (u64)-1; 2373 } else { 2374 return 1; 2375 } 2376 2377 btrfs_release_path(path); 2378 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 2379 if (ret <= 0) 2380 return ret; 2381 2382 /* 2383 * Previous key not found. Even if we were at slot 0 of the leaf we had 2384 * before releasing the path and calling btrfs_search_slot(), we now may 2385 * be in a slot pointing to the same original key - this can happen if 2386 * after we released the path, one of more items were moved from a 2387 * sibling leaf into the front of the leaf we had due to an insertion 2388 * (see push_leaf_right()). 2389 * If we hit this case and our slot is > 0 and just decrement the slot 2390 * so that the caller does not process the same key again, which may or 2391 * may not break the caller, depending on its logic. 2392 */ 2393 if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) { 2394 btrfs_item_key(path->nodes[0], &found_key, path->slots[0]); 2395 ret = btrfs_comp_keys(&found_key, &orig_key); 2396 if (ret == 0) { 2397 if (path->slots[0] > 0) { 2398 path->slots[0]--; 2399 return 0; 2400 } 2401 /* 2402 * At slot 0, same key as before, it means orig_key is 2403 * the lowest, leftmost, key in the tree. We're done. 2404 */ 2405 return 1; 2406 } 2407 } 2408 2409 btrfs_item_key(path->nodes[0], &found_key, 0); 2410 ret = btrfs_comp_keys(&found_key, &key); 2411 /* 2412 * We might have had an item with the previous key in the tree right 2413 * before we released our path. And after we released our path, that 2414 * item might have been pushed to the first slot (0) of the leaf we 2415 * were holding due to a tree balance. Alternatively, an item with the 2416 * previous key can exist as the only element of a leaf (big fat item). 2417 * Therefore account for these 2 cases, so that our callers (like 2418 * btrfs_previous_item) don't miss an existing item with a key matching 2419 * the previous key we computed above. 2420 */ 2421 if (ret <= 0) 2422 return 0; 2423 return 1; 2424 } 2425 2426 /* 2427 * helper to use instead of search slot if no exact match is needed but 2428 * instead the next or previous item should be returned. 2429 * When find_higher is true, the next higher item is returned, the next lower 2430 * otherwise. 2431 * When return_any and find_higher are both true, and no higher item is found, 2432 * return the next lower instead. 2433 * When return_any is true and find_higher is false, and no lower item is found, 2434 * return the next higher instead. 2435 * It returns 0 if any item is found, 1 if none is found (tree empty), and 2436 * < 0 on error 2437 */ 2438 int btrfs_search_slot_for_read(struct btrfs_root *root, 2439 const struct btrfs_key *key, 2440 struct btrfs_path *p, int find_higher, 2441 int return_any) 2442 { 2443 int ret; 2444 struct extent_buffer *leaf; 2445 2446 again: 2447 ret = btrfs_search_slot(NULL, root, key, p, 0, 0); 2448 if (ret <= 0) 2449 return ret; 2450 /* 2451 * a return value of 1 means the path is at the position where the 2452 * item should be inserted. Normally this is the next bigger item, 2453 * but in case the previous item is the last in a leaf, path points 2454 * to the first free slot in the previous leaf, i.e. at an invalid 2455 * item. 2456 */ 2457 leaf = p->nodes[0]; 2458 2459 if (find_higher) { 2460 if (p->slots[0] >= btrfs_header_nritems(leaf)) { 2461 ret = btrfs_next_leaf(root, p); 2462 if (ret <= 0) 2463 return ret; 2464 if (!return_any) 2465 return 1; 2466 /* 2467 * no higher item found, return the next 2468 * lower instead 2469 */ 2470 return_any = 0; 2471 find_higher = 0; 2472 btrfs_release_path(p); 2473 goto again; 2474 } 2475 } else { 2476 if (p->slots[0] == 0) { 2477 ret = btrfs_prev_leaf(root, p); 2478 if (ret < 0) 2479 return ret; 2480 if (!ret) { 2481 leaf = p->nodes[0]; 2482 if (p->slots[0] == btrfs_header_nritems(leaf)) 2483 p->slots[0]--; 2484 return 0; 2485 } 2486 if (!return_any) 2487 return 1; 2488 /* 2489 * no lower item found, return the next 2490 * higher instead 2491 */ 2492 return_any = 0; 2493 find_higher = 1; 2494 btrfs_release_path(p); 2495 goto again; 2496 } else { 2497 --p->slots[0]; 2498 } 2499 } 2500 return 0; 2501 } 2502 2503 /* 2504 * Execute search and call btrfs_previous_item to traverse backwards if the item 2505 * was not found. 2506 * 2507 * Return 0 if found, 1 if not found and < 0 if error. 2508 */ 2509 int btrfs_search_backwards(struct btrfs_root *root, struct btrfs_key *key, 2510 struct btrfs_path *path) 2511 { 2512 int ret; 2513 2514 ret = btrfs_search_slot(NULL, root, key, path, 0, 0); 2515 if (ret > 0) 2516 ret = btrfs_previous_item(root, path, key->objectid, key->type); 2517 2518 if (ret == 0) 2519 btrfs_item_key_to_cpu(path->nodes[0], key, path->slots[0]); 2520 2521 return ret; 2522 } 2523 2524 /* 2525 * Search for a valid slot for the given path. 2526 * 2527 * @root: The root node of the tree. 2528 * @key: Will contain a valid item if found. 2529 * @path: The starting point to validate the slot. 2530 * 2531 * Return: 0 if the item is valid 2532 * 1 if not found 2533 * <0 if error. 2534 */ 2535 int btrfs_get_next_valid_item(struct btrfs_root *root, struct btrfs_key *key, 2536 struct btrfs_path *path) 2537 { 2538 if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { 2539 int ret; 2540 2541 ret = btrfs_next_leaf(root, path); 2542 if (ret) 2543 return ret; 2544 } 2545 2546 btrfs_item_key_to_cpu(path->nodes[0], key, path->slots[0]); 2547 return 0; 2548 } 2549 2550 /* 2551 * adjust the pointers going up the tree, starting at level 2552 * making sure the right key of each node is points to 'key'. 2553 * This is used after shifting pointers to the left, so it stops 2554 * fixing up pointers when a given leaf/node is not in slot 0 of the 2555 * higher levels 2556 * 2557 */ 2558 static void fixup_low_keys(struct btrfs_trans_handle *trans, 2559 const struct btrfs_path *path, 2560 const struct btrfs_disk_key *key, int level) 2561 { 2562 int i; 2563 struct extent_buffer *t; 2564 int ret; 2565 2566 for (i = level; i < BTRFS_MAX_LEVEL; i++) { 2567 int tslot = path->slots[i]; 2568 2569 if (!path->nodes[i]) 2570 break; 2571 t = path->nodes[i]; 2572 ret = btrfs_tree_mod_log_insert_key(t, tslot, 2573 BTRFS_MOD_LOG_KEY_REPLACE); 2574 BUG_ON(ret < 0); 2575 btrfs_set_node_key(t, key, tslot); 2576 btrfs_mark_buffer_dirty(trans, path->nodes[i]); 2577 if (tslot != 0) 2578 break; 2579 } 2580 } 2581 2582 /* 2583 * update item key. 2584 * 2585 * This function isn't completely safe. It's the caller's responsibility 2586 * that the new key won't break the order 2587 */ 2588 void btrfs_set_item_key_safe(struct btrfs_trans_handle *trans, 2589 const struct btrfs_path *path, 2590 const struct btrfs_key *new_key) 2591 { 2592 struct btrfs_fs_info *fs_info = trans->fs_info; 2593 struct btrfs_disk_key disk_key; 2594 struct extent_buffer *eb; 2595 int slot; 2596 2597 eb = path->nodes[0]; 2598 slot = path->slots[0]; 2599 if (slot > 0) { 2600 btrfs_item_key(eb, &disk_key, slot - 1); 2601 if (unlikely(btrfs_comp_keys(&disk_key, new_key) >= 0)) { 2602 btrfs_print_leaf(eb); 2603 btrfs_crit(fs_info, 2604 "slot %u key " BTRFS_KEY_FMT " new key " BTRFS_KEY_FMT, 2605 slot, btrfs_disk_key_objectid(&disk_key), 2606 btrfs_disk_key_type(&disk_key), 2607 btrfs_disk_key_offset(&disk_key), 2608 BTRFS_KEY_FMT_VALUE(new_key)); 2609 BUG(); 2610 } 2611 } 2612 if (slot < btrfs_header_nritems(eb) - 1) { 2613 btrfs_item_key(eb, &disk_key, slot + 1); 2614 if (unlikely(btrfs_comp_keys(&disk_key, new_key) <= 0)) { 2615 btrfs_print_leaf(eb); 2616 btrfs_crit(fs_info, 2617 "slot %u key " BTRFS_KEY_FMT " new key " BTRFS_KEY_FMT, 2618 slot, btrfs_disk_key_objectid(&disk_key), 2619 btrfs_disk_key_type(&disk_key), 2620 btrfs_disk_key_offset(&disk_key), 2621 BTRFS_KEY_FMT_VALUE(new_key)); 2622 BUG(); 2623 } 2624 } 2625 2626 btrfs_cpu_key_to_disk(&disk_key, new_key); 2627 btrfs_set_item_key(eb, &disk_key, slot); 2628 btrfs_mark_buffer_dirty(trans, eb); 2629 if (slot == 0) 2630 fixup_low_keys(trans, path, &disk_key, 1); 2631 } 2632 2633 /* 2634 * Check key order of two sibling extent buffers. 2635 * 2636 * Return true if something is wrong. 2637 * Return false if everything is fine. 2638 * 2639 * Tree-checker only works inside one tree block, thus the following 2640 * corruption can not be detected by tree-checker: 2641 * 2642 * Leaf @left | Leaf @right 2643 * -------------------------------------------------------------- 2644 * | 1 | 2 | 3 | 4 | 5 | f6 | | 7 | 8 | 2645 * 2646 * Key f6 in leaf @left itself is valid, but not valid when the next 2647 * key in leaf @right is 7. 2648 * This can only be checked at tree block merge time. 2649 * And since tree checker has ensured all key order in each tree block 2650 * is correct, we only need to bother the last key of @left and the first 2651 * key of @right. 2652 */ 2653 static bool check_sibling_keys(const struct extent_buffer *left, 2654 const struct extent_buffer *right) 2655 { 2656 struct btrfs_key left_last; 2657 struct btrfs_key right_first; 2658 int level = btrfs_header_level(left); 2659 int nr_left = btrfs_header_nritems(left); 2660 int nr_right = btrfs_header_nritems(right); 2661 2662 /* No key to check in one of the tree blocks */ 2663 if (!nr_left || !nr_right) 2664 return false; 2665 2666 if (level) { 2667 btrfs_node_key_to_cpu(left, &left_last, nr_left - 1); 2668 btrfs_node_key_to_cpu(right, &right_first, 0); 2669 } else { 2670 btrfs_item_key_to_cpu(left, &left_last, nr_left - 1); 2671 btrfs_item_key_to_cpu(right, &right_first, 0); 2672 } 2673 2674 if (unlikely(btrfs_comp_cpu_keys(&left_last, &right_first) >= 0)) { 2675 btrfs_crit(left->fs_info, "left extent buffer:"); 2676 btrfs_print_tree(left, false); 2677 btrfs_crit(left->fs_info, "right extent buffer:"); 2678 btrfs_print_tree(right, false); 2679 btrfs_crit(left->fs_info, 2680 "bad key order, sibling blocks, left last " BTRFS_KEY_FMT " right first " BTRFS_KEY_FMT, 2681 BTRFS_KEY_FMT_VALUE(&left_last), 2682 BTRFS_KEY_FMT_VALUE(&right_first)); 2683 return true; 2684 } 2685 return false; 2686 } 2687 2688 /* 2689 * try to push data from one node into the next node left in the 2690 * tree. 2691 * 2692 * returns 0 if some ptrs were pushed left, < 0 if there was some horrible 2693 * error, and > 0 if there was no room in the left hand block. 2694 */ 2695 static int push_node_left(struct btrfs_trans_handle *trans, 2696 struct extent_buffer *dst, 2697 struct extent_buffer *src, bool empty) 2698 { 2699 struct btrfs_fs_info *fs_info = trans->fs_info; 2700 int push_items = 0; 2701 int src_nritems; 2702 int dst_nritems; 2703 int ret = 0; 2704 2705 src_nritems = btrfs_header_nritems(src); 2706 dst_nritems = btrfs_header_nritems(dst); 2707 push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems; 2708 WARN_ON(btrfs_header_generation(src) != trans->transid); 2709 WARN_ON(btrfs_header_generation(dst) != trans->transid); 2710 2711 if (!empty && src_nritems <= 8) 2712 return 1; 2713 2714 if (push_items <= 0) 2715 return 1; 2716 2717 if (empty) { 2718 push_items = min(src_nritems, push_items); 2719 if (push_items < src_nritems) { 2720 /* leave at least 8 pointers in the node if 2721 * we aren't going to empty it 2722 */ 2723 if (src_nritems - push_items < 8) { 2724 if (push_items <= 8) 2725 return 1; 2726 push_items -= 8; 2727 } 2728 } 2729 } else 2730 push_items = min(src_nritems - 8, push_items); 2731 2732 /* dst is the left eb, src is the middle eb */ 2733 if (unlikely(check_sibling_keys(dst, src))) { 2734 ret = -EUCLEAN; 2735 btrfs_abort_transaction(trans, ret); 2736 return ret; 2737 } 2738 ret = btrfs_tree_mod_log_eb_copy(dst, src, dst_nritems, 0, push_items); 2739 if (unlikely(ret)) { 2740 btrfs_abort_transaction(trans, ret); 2741 return ret; 2742 } 2743 copy_extent_buffer(dst, src, 2744 btrfs_node_key_ptr_offset(dst, dst_nritems), 2745 btrfs_node_key_ptr_offset(src, 0), 2746 push_items * sizeof(struct btrfs_key_ptr)); 2747 2748 if (push_items < src_nritems) { 2749 /* 2750 * btrfs_tree_mod_log_eb_copy handles logging the move, so we 2751 * don't need to do an explicit tree mod log operation for it. 2752 */ 2753 memmove_extent_buffer(src, btrfs_node_key_ptr_offset(src, 0), 2754 btrfs_node_key_ptr_offset(src, push_items), 2755 (src_nritems - push_items) * 2756 sizeof(struct btrfs_key_ptr)); 2757 } 2758 btrfs_set_header_nritems(src, src_nritems - push_items); 2759 btrfs_set_header_nritems(dst, dst_nritems + push_items); 2760 btrfs_mark_buffer_dirty(trans, src); 2761 btrfs_mark_buffer_dirty(trans, dst); 2762 2763 return ret; 2764 } 2765 2766 /* 2767 * try to push data from one node into the next node right in the 2768 * tree. 2769 * 2770 * returns 0 if some ptrs were pushed, < 0 if there was some horrible 2771 * error, and > 0 if there was no room in the right hand block. 2772 * 2773 * this will only push up to 1/2 the contents of the left node over 2774 */ 2775 static int balance_node_right(struct btrfs_trans_handle *trans, 2776 struct extent_buffer *dst, 2777 struct extent_buffer *src) 2778 { 2779 struct btrfs_fs_info *fs_info = trans->fs_info; 2780 int push_items = 0; 2781 int max_push; 2782 int src_nritems; 2783 int dst_nritems; 2784 int ret = 0; 2785 2786 WARN_ON(btrfs_header_generation(src) != trans->transid); 2787 WARN_ON(btrfs_header_generation(dst) != trans->transid); 2788 2789 src_nritems = btrfs_header_nritems(src); 2790 dst_nritems = btrfs_header_nritems(dst); 2791 push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems; 2792 if (push_items <= 0) 2793 return 1; 2794 2795 if (src_nritems < 4) 2796 return 1; 2797 2798 max_push = src_nritems / 2 + 1; 2799 /* don't try to empty the node */ 2800 if (max_push >= src_nritems) 2801 return 1; 2802 2803 if (max_push < push_items) 2804 push_items = max_push; 2805 2806 /* dst is the right eb, src is the middle eb */ 2807 if (unlikely(check_sibling_keys(src, dst))) { 2808 ret = -EUCLEAN; 2809 btrfs_abort_transaction(trans, ret); 2810 return ret; 2811 } 2812 2813 /* 2814 * btrfs_tree_mod_log_eb_copy handles logging the move, so we don't 2815 * need to do an explicit tree mod log operation for it. 2816 */ 2817 memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(dst, push_items), 2818 btrfs_node_key_ptr_offset(dst, 0), 2819 (dst_nritems) * 2820 sizeof(struct btrfs_key_ptr)); 2821 2822 ret = btrfs_tree_mod_log_eb_copy(dst, src, 0, src_nritems - push_items, 2823 push_items); 2824 if (unlikely(ret)) { 2825 btrfs_abort_transaction(trans, ret); 2826 return ret; 2827 } 2828 copy_extent_buffer(dst, src, 2829 btrfs_node_key_ptr_offset(dst, 0), 2830 btrfs_node_key_ptr_offset(src, src_nritems - push_items), 2831 push_items * sizeof(struct btrfs_key_ptr)); 2832 2833 btrfs_set_header_nritems(src, src_nritems - push_items); 2834 btrfs_set_header_nritems(dst, dst_nritems + push_items); 2835 2836 btrfs_mark_buffer_dirty(trans, src); 2837 btrfs_mark_buffer_dirty(trans, dst); 2838 2839 return ret; 2840 } 2841 2842 /* 2843 * helper function to insert a new root level in the tree. 2844 * A new node is allocated, and a single item is inserted to 2845 * point to the existing root 2846 * 2847 * returns zero on success or < 0 on failure. 2848 */ 2849 static noinline int insert_new_root(struct btrfs_trans_handle *trans, 2850 struct btrfs_root *root, 2851 struct btrfs_path *path, int level) 2852 { 2853 u64 lower_gen; 2854 struct extent_buffer *lower; 2855 struct extent_buffer *c; 2856 struct extent_buffer *old; 2857 struct btrfs_disk_key lower_key; 2858 int ret; 2859 2860 BUG_ON(path->nodes[level]); 2861 BUG_ON(path->nodes[level-1] != root->node); 2862 2863 lower = path->nodes[level-1]; 2864 if (level == 1) 2865 btrfs_item_key(lower, &lower_key, 0); 2866 else 2867 btrfs_node_key(lower, &lower_key, 0); 2868 2869 c = btrfs_alloc_tree_block(trans, root, 0, btrfs_root_id(root), 2870 &lower_key, level, root->node->start, 0, 2871 0, BTRFS_NESTING_NEW_ROOT); 2872 if (IS_ERR(c)) 2873 return PTR_ERR(c); 2874 2875 root_add_used_bytes(root); 2876 2877 btrfs_set_header_nritems(c, 1); 2878 btrfs_set_node_key(c, &lower_key, 0); 2879 btrfs_set_node_blockptr(c, 0, lower->start); 2880 lower_gen = btrfs_header_generation(lower); 2881 WARN_ON(lower_gen != trans->transid); 2882 2883 btrfs_set_node_ptr_generation(c, 0, lower_gen); 2884 2885 btrfs_mark_buffer_dirty(trans, c); 2886 2887 old = root->node; 2888 ret = btrfs_tree_mod_log_insert_root(root->node, c, false); 2889 if (ret < 0) { 2890 int ret2; 2891 2892 btrfs_clear_buffer_dirty(trans, c); 2893 ret2 = btrfs_free_tree_block(trans, btrfs_root_id(root), c, 0, 1); 2894 if (unlikely(ret2 < 0)) 2895 btrfs_abort_transaction(trans, ret2); 2896 btrfs_tree_unlock(c); 2897 free_extent_buffer(c); 2898 return ret; 2899 } 2900 rcu_assign_pointer(root->node, c); 2901 2902 /* the super has an extra ref to root->node */ 2903 free_extent_buffer(old); 2904 2905 add_root_to_dirty_list(root); 2906 refcount_inc(&c->refs); 2907 path->nodes[level] = c; 2908 path->locks[level] = BTRFS_WRITE_LOCK; 2909 path->slots[level] = 0; 2910 return 0; 2911 } 2912 2913 /* 2914 * worker function to insert a single pointer in a node. 2915 * the node should have enough room for the pointer already 2916 * 2917 * slot and level indicate where you want the key to go, and 2918 * blocknr is the block the key points to. 2919 */ 2920 static int insert_ptr(struct btrfs_trans_handle *trans, 2921 const struct btrfs_path *path, 2922 const struct btrfs_disk_key *key, u64 bytenr, 2923 int slot, int level) 2924 { 2925 struct extent_buffer *lower; 2926 int nritems; 2927 int ret; 2928 2929 BUG_ON(!path->nodes[level]); 2930 btrfs_assert_tree_write_locked(path->nodes[level]); 2931 lower = path->nodes[level]; 2932 nritems = btrfs_header_nritems(lower); 2933 BUG_ON(slot > nritems); 2934 BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(trans->fs_info)); 2935 if (slot != nritems) { 2936 if (level) { 2937 ret = btrfs_tree_mod_log_insert_move(lower, slot + 1, 2938 slot, nritems - slot); 2939 if (unlikely(ret < 0)) { 2940 btrfs_abort_transaction(trans, ret); 2941 return ret; 2942 } 2943 } 2944 memmove_extent_buffer(lower, 2945 btrfs_node_key_ptr_offset(lower, slot + 1), 2946 btrfs_node_key_ptr_offset(lower, slot), 2947 (nritems - slot) * sizeof(struct btrfs_key_ptr)); 2948 } 2949 if (level) { 2950 ret = btrfs_tree_mod_log_insert_key(lower, slot, 2951 BTRFS_MOD_LOG_KEY_ADD); 2952 if (unlikely(ret < 0)) { 2953 btrfs_abort_transaction(trans, ret); 2954 return ret; 2955 } 2956 } 2957 btrfs_set_node_key(lower, key, slot); 2958 btrfs_set_node_blockptr(lower, slot, bytenr); 2959 WARN_ON(trans->transid == 0); 2960 btrfs_set_node_ptr_generation(lower, slot, trans->transid); 2961 btrfs_set_header_nritems(lower, nritems + 1); 2962 btrfs_mark_buffer_dirty(trans, lower); 2963 2964 return 0; 2965 } 2966 2967 /* 2968 * split the node at the specified level in path in two. 2969 * The path is corrected to point to the appropriate node after the split 2970 * 2971 * Before splitting this tries to make some room in the node by pushing 2972 * left and right, if either one works, it returns right away. 2973 * 2974 * returns 0 on success and < 0 on failure 2975 */ 2976 static noinline int split_node(struct btrfs_trans_handle *trans, 2977 struct btrfs_root *root, 2978 struct btrfs_path *path, int level) 2979 { 2980 struct btrfs_fs_info *fs_info = root->fs_info; 2981 struct extent_buffer *c; 2982 struct extent_buffer *split; 2983 struct btrfs_disk_key disk_key; 2984 int mid; 2985 int ret; 2986 u32 c_nritems; 2987 2988 c = path->nodes[level]; 2989 WARN_ON(btrfs_header_generation(c) != trans->transid); 2990 if (c == root->node) { 2991 /* 2992 * trying to split the root, lets make a new one 2993 * 2994 * tree mod log: We don't log_removal old root in 2995 * insert_new_root, because that root buffer will be kept as a 2996 * normal node. We are going to log removal of half of the 2997 * elements below with btrfs_tree_mod_log_eb_copy(). We're 2998 * holding a tree lock on the buffer, which is why we cannot 2999 * race with other tree_mod_log users. 3000 */ 3001 ret = insert_new_root(trans, root, path, level + 1); 3002 if (ret) 3003 return ret; 3004 } else { 3005 ret = push_nodes_for_insert(trans, root, path, level); 3006 c = path->nodes[level]; 3007 if (!ret && btrfs_header_nritems(c) < 3008 BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3) 3009 return 0; 3010 if (ret < 0) 3011 return ret; 3012 } 3013 3014 c_nritems = btrfs_header_nritems(c); 3015 mid = (c_nritems + 1) / 2; 3016 btrfs_node_key(c, &disk_key, mid); 3017 3018 split = btrfs_alloc_tree_block(trans, root, 0, btrfs_root_id(root), 3019 &disk_key, level, c->start, 0, 3020 0, BTRFS_NESTING_SPLIT); 3021 if (IS_ERR(split)) 3022 return PTR_ERR(split); 3023 3024 root_add_used_bytes(root); 3025 ASSERT(btrfs_header_level(c) == level); 3026 3027 ret = btrfs_tree_mod_log_eb_copy(split, c, 0, mid, c_nritems - mid); 3028 if (unlikely(ret)) { 3029 btrfs_tree_unlock(split); 3030 free_extent_buffer(split); 3031 btrfs_abort_transaction(trans, ret); 3032 return ret; 3033 } 3034 copy_extent_buffer(split, c, 3035 btrfs_node_key_ptr_offset(split, 0), 3036 btrfs_node_key_ptr_offset(c, mid), 3037 (c_nritems - mid) * sizeof(struct btrfs_key_ptr)); 3038 btrfs_set_header_nritems(split, c_nritems - mid); 3039 btrfs_set_header_nritems(c, mid); 3040 3041 btrfs_mark_buffer_dirty(trans, c); 3042 btrfs_mark_buffer_dirty(trans, split); 3043 3044 ret = insert_ptr(trans, path, &disk_key, split->start, 3045 path->slots[level + 1] + 1, level + 1); 3046 if (ret < 0) { 3047 btrfs_tree_unlock(split); 3048 free_extent_buffer(split); 3049 return ret; 3050 } 3051 3052 if (path->slots[level] >= mid) { 3053 path->slots[level] -= mid; 3054 btrfs_tree_unlock(c); 3055 free_extent_buffer(c); 3056 path->nodes[level] = split; 3057 path->slots[level + 1] += 1; 3058 } else { 3059 btrfs_tree_unlock(split); 3060 free_extent_buffer(split); 3061 } 3062 return 0; 3063 } 3064 3065 /* 3066 * how many bytes are required to store the items in a leaf. start 3067 * and nr indicate which items in the leaf to check. This totals up the 3068 * space used both by the item structs and the item data 3069 */ 3070 static int leaf_space_used(const struct extent_buffer *l, int start, int nr) 3071 { 3072 int data_len; 3073 int nritems = btrfs_header_nritems(l); 3074 int end = min(nritems, start + nr) - 1; 3075 3076 if (!nr) 3077 return 0; 3078 data_len = btrfs_item_offset(l, start) + btrfs_item_size(l, start); 3079 data_len = data_len - btrfs_item_offset(l, end); 3080 data_len += sizeof(struct btrfs_item) * nr; 3081 WARN_ON(data_len < 0); 3082 return data_len; 3083 } 3084 3085 /* 3086 * The space between the end of the leaf items and 3087 * the start of the leaf data. IOW, how much room 3088 * the leaf has left for both items and data 3089 */ 3090 int btrfs_leaf_free_space(const struct extent_buffer *leaf) 3091 { 3092 struct btrfs_fs_info *fs_info = leaf->fs_info; 3093 int nritems = btrfs_header_nritems(leaf); 3094 int ret; 3095 3096 ret = BTRFS_LEAF_DATA_SIZE(fs_info) - leaf_space_used(leaf, 0, nritems); 3097 if (unlikely(ret < 0)) { 3098 btrfs_crit(fs_info, 3099 "leaf free space ret %d, leaf data size %lu, used %d nritems %d", 3100 ret, 3101 (unsigned long) BTRFS_LEAF_DATA_SIZE(fs_info), 3102 leaf_space_used(leaf, 0, nritems), nritems); 3103 } 3104 return ret; 3105 } 3106 3107 /* 3108 * min slot controls the lowest index we're willing to push to the 3109 * right. We'll push up to and including min_slot, but no lower 3110 */ 3111 static noinline int __push_leaf_right(struct btrfs_trans_handle *trans, 3112 struct btrfs_path *path, 3113 int data_size, bool empty, 3114 struct extent_buffer *right, 3115 int free_space, u32 left_nritems, 3116 u32 min_slot) 3117 { 3118 struct btrfs_fs_info *fs_info = right->fs_info; 3119 struct extent_buffer *left = path->nodes[0]; 3120 struct extent_buffer *upper = path->nodes[1]; 3121 struct btrfs_disk_key disk_key; 3122 int slot; 3123 u32 i; 3124 int push_space = 0; 3125 int push_items = 0; 3126 u32 nr; 3127 u32 right_nritems; 3128 u32 data_end; 3129 u32 this_item_size; 3130 3131 if (empty) 3132 nr = 0; 3133 else 3134 nr = max_t(u32, 1, min_slot); 3135 3136 if (path->slots[0] >= left_nritems) 3137 push_space += data_size; 3138 3139 slot = path->slots[1]; 3140 i = left_nritems - 1; 3141 while (i >= nr) { 3142 if (!empty && push_items > 0) { 3143 if (path->slots[0] > i) 3144 break; 3145 if (path->slots[0] == i) { 3146 int space = btrfs_leaf_free_space(left); 3147 3148 if (space + push_space * 2 > free_space) 3149 break; 3150 } 3151 } 3152 3153 if (path->slots[0] == i) 3154 push_space += data_size; 3155 3156 this_item_size = btrfs_item_size(left, i); 3157 if (this_item_size + sizeof(struct btrfs_item) + 3158 push_space > free_space) 3159 break; 3160 3161 push_items++; 3162 push_space += this_item_size + sizeof(struct btrfs_item); 3163 if (i == 0) 3164 break; 3165 i--; 3166 } 3167 3168 if (push_items == 0) 3169 goto out_unlock; 3170 3171 WARN_ON(!empty && push_items == left_nritems); 3172 3173 /* push left to right */ 3174 right_nritems = btrfs_header_nritems(right); 3175 3176 push_space = btrfs_item_data_end(left, left_nritems - push_items); 3177 push_space -= leaf_data_end(left); 3178 3179 /* make room in the right data area */ 3180 data_end = leaf_data_end(right); 3181 memmove_leaf_data(right, data_end - push_space, data_end, 3182 BTRFS_LEAF_DATA_SIZE(fs_info) - data_end); 3183 3184 /* copy from the left data area */ 3185 copy_leaf_data(right, left, BTRFS_LEAF_DATA_SIZE(fs_info) - push_space, 3186 leaf_data_end(left), push_space); 3187 3188 memmove_leaf_items(right, push_items, 0, right_nritems); 3189 3190 /* copy the items from left to right */ 3191 copy_leaf_items(right, left, 0, left_nritems - push_items, push_items); 3192 3193 /* update the item pointers */ 3194 right_nritems += push_items; 3195 btrfs_set_header_nritems(right, right_nritems); 3196 push_space = BTRFS_LEAF_DATA_SIZE(fs_info); 3197 for (i = 0; i < right_nritems; i++) { 3198 push_space -= btrfs_item_size(right, i); 3199 btrfs_set_item_offset(right, i, push_space); 3200 } 3201 3202 left_nritems -= push_items; 3203 btrfs_set_header_nritems(left, left_nritems); 3204 3205 if (left_nritems) 3206 btrfs_mark_buffer_dirty(trans, left); 3207 else 3208 btrfs_clear_buffer_dirty(trans, left); 3209 3210 btrfs_mark_buffer_dirty(trans, right); 3211 3212 btrfs_item_key(right, &disk_key, 0); 3213 btrfs_set_node_key(upper, &disk_key, slot + 1); 3214 btrfs_mark_buffer_dirty(trans, upper); 3215 3216 /* then fixup the leaf pointer in the path */ 3217 if (path->slots[0] >= left_nritems) { 3218 path->slots[0] -= left_nritems; 3219 btrfs_tree_unlock(left); 3220 free_extent_buffer(left); 3221 path->nodes[0] = right; 3222 path->slots[1] += 1; 3223 } else { 3224 btrfs_tree_unlock(right); 3225 free_extent_buffer(right); 3226 } 3227 return 0; 3228 3229 out_unlock: 3230 btrfs_tree_unlock(right); 3231 free_extent_buffer(right); 3232 return 1; 3233 } 3234 3235 /* 3236 * push some data in the path leaf to the right, trying to free up at 3237 * least data_size bytes. returns zero if the push worked, nonzero otherwise 3238 * 3239 * returns 1 if the push failed because the other node didn't have enough 3240 * room, 0 if everything worked out and < 0 if there were major errors. 3241 * 3242 * this will push starting from min_slot to the end of the leaf. It won't 3243 * push any slot lower than min_slot 3244 */ 3245 static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root 3246 *root, struct btrfs_path *path, 3247 int min_data_size, int data_size, 3248 bool empty, u32 min_slot) 3249 { 3250 struct extent_buffer *left = path->nodes[0]; 3251 struct extent_buffer *right; 3252 struct extent_buffer *upper; 3253 int slot; 3254 int free_space; 3255 u32 left_nritems; 3256 int ret; 3257 3258 if (!path->nodes[1]) 3259 return 1; 3260 3261 slot = path->slots[1]; 3262 upper = path->nodes[1]; 3263 if (slot >= btrfs_header_nritems(upper) - 1) 3264 return 1; 3265 3266 btrfs_assert_tree_write_locked(path->nodes[1]); 3267 3268 right = btrfs_read_node_slot(upper, slot + 1); 3269 if (IS_ERR(right)) 3270 return PTR_ERR(right); 3271 3272 btrfs_tree_lock_nested(right, BTRFS_NESTING_RIGHT); 3273 3274 free_space = btrfs_leaf_free_space(right); 3275 if (free_space < data_size) 3276 goto out_unlock; 3277 3278 ret = btrfs_cow_block(trans, root, right, upper, 3279 slot + 1, &right, BTRFS_NESTING_RIGHT_COW); 3280 if (ret) 3281 goto out_unlock; 3282 3283 left_nritems = btrfs_header_nritems(left); 3284 if (left_nritems == 0) 3285 goto out_unlock; 3286 3287 if (unlikely(check_sibling_keys(left, right))) { 3288 ret = -EUCLEAN; 3289 btrfs_abort_transaction(trans, ret); 3290 btrfs_tree_unlock(right); 3291 free_extent_buffer(right); 3292 return ret; 3293 } 3294 if (path->slots[0] == left_nritems && !empty) { 3295 /* Key greater than all keys in the leaf, right neighbor has 3296 * enough room for it and we're not emptying our leaf to delete 3297 * it, therefore use right neighbor to insert the new item and 3298 * no need to touch/dirty our left leaf. */ 3299 btrfs_tree_unlock(left); 3300 free_extent_buffer(left); 3301 path->nodes[0] = right; 3302 path->slots[0] = 0; 3303 path->slots[1]++; 3304 return 0; 3305 } 3306 3307 return __push_leaf_right(trans, path, min_data_size, empty, right, 3308 free_space, left_nritems, min_slot); 3309 out_unlock: 3310 btrfs_tree_unlock(right); 3311 free_extent_buffer(right); 3312 return 1; 3313 } 3314 3315 /* 3316 * push some data in the path leaf to the left, trying to free up at 3317 * least data_size bytes. returns zero if the push worked, nonzero otherwise 3318 * 3319 * max_slot can put a limit on how far into the leaf we'll push items. The 3320 * item at 'max_slot' won't be touched. Use (u32)-1 to make us do all the 3321 * items 3322 */ 3323 static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, 3324 struct btrfs_path *path, int data_size, 3325 bool empty, struct extent_buffer *left, 3326 int free_space, u32 right_nritems, 3327 u32 max_slot) 3328 { 3329 struct btrfs_fs_info *fs_info = left->fs_info; 3330 struct btrfs_disk_key disk_key; 3331 struct extent_buffer *right = path->nodes[0]; 3332 int i; 3333 int push_space = 0; 3334 int push_items = 0; 3335 u32 old_left_nritems; 3336 u32 nr; 3337 int ret = 0; 3338 u32 this_item_size; 3339 u32 old_left_item_size; 3340 3341 if (empty) 3342 nr = min(right_nritems, max_slot); 3343 else 3344 nr = min(right_nritems - 1, max_slot); 3345 3346 for (i = 0; i < nr; i++) { 3347 if (!empty && push_items > 0) { 3348 if (path->slots[0] < i) 3349 break; 3350 if (path->slots[0] == i) { 3351 int space = btrfs_leaf_free_space(right); 3352 3353 if (space + push_space * 2 > free_space) 3354 break; 3355 } 3356 } 3357 3358 if (path->slots[0] == i) 3359 push_space += data_size; 3360 3361 this_item_size = btrfs_item_size(right, i); 3362 if (this_item_size + sizeof(struct btrfs_item) + push_space > 3363 free_space) 3364 break; 3365 3366 push_items++; 3367 push_space += this_item_size + sizeof(struct btrfs_item); 3368 } 3369 3370 if (push_items == 0) { 3371 ret = 1; 3372 goto out; 3373 } 3374 WARN_ON(!empty && push_items == btrfs_header_nritems(right)); 3375 3376 /* push data from right to left */ 3377 copy_leaf_items(left, right, btrfs_header_nritems(left), 0, push_items); 3378 3379 push_space = BTRFS_LEAF_DATA_SIZE(fs_info) - 3380 btrfs_item_offset(right, push_items - 1); 3381 3382 copy_leaf_data(left, right, leaf_data_end(left) - push_space, 3383 btrfs_item_offset(right, push_items - 1), push_space); 3384 old_left_nritems = btrfs_header_nritems(left); 3385 BUG_ON(old_left_nritems <= 0); 3386 3387 old_left_item_size = btrfs_item_offset(left, old_left_nritems - 1); 3388 for (i = old_left_nritems; i < old_left_nritems + push_items; i++) { 3389 u32 ioff; 3390 3391 ioff = btrfs_item_offset(left, i); 3392 btrfs_set_item_offset(left, i, 3393 ioff - (BTRFS_LEAF_DATA_SIZE(fs_info) - old_left_item_size)); 3394 } 3395 btrfs_set_header_nritems(left, old_left_nritems + push_items); 3396 3397 /* fixup right node */ 3398 if (unlikely(push_items > right_nritems)) { 3399 ret = -EUCLEAN; 3400 btrfs_abort_transaction(trans, ret); 3401 btrfs_crit(fs_info, "push items (%d) > right leaf items (%u)", 3402 push_items, right_nritems); 3403 goto out; 3404 } 3405 3406 if (push_items < right_nritems) { 3407 push_space = btrfs_item_offset(right, push_items - 1) - 3408 leaf_data_end(right); 3409 memmove_leaf_data(right, 3410 BTRFS_LEAF_DATA_SIZE(fs_info) - push_space, 3411 leaf_data_end(right), push_space); 3412 3413 memmove_leaf_items(right, 0, push_items, 3414 btrfs_header_nritems(right) - push_items); 3415 } 3416 3417 right_nritems -= push_items; 3418 btrfs_set_header_nritems(right, right_nritems); 3419 push_space = BTRFS_LEAF_DATA_SIZE(fs_info); 3420 for (i = 0; i < right_nritems; i++) { 3421 push_space = push_space - btrfs_item_size(right, i); 3422 btrfs_set_item_offset(right, i, push_space); 3423 } 3424 3425 btrfs_mark_buffer_dirty(trans, left); 3426 if (right_nritems) 3427 btrfs_mark_buffer_dirty(trans, right); 3428 else 3429 btrfs_clear_buffer_dirty(trans, right); 3430 3431 btrfs_item_key(right, &disk_key, 0); 3432 fixup_low_keys(trans, path, &disk_key, 1); 3433 3434 /* then fixup the leaf pointer in the path */ 3435 if (path->slots[0] < push_items) { 3436 path->slots[0] += old_left_nritems; 3437 btrfs_tree_unlock(right); 3438 free_extent_buffer(right); 3439 path->nodes[0] = left; 3440 path->slots[1] -= 1; 3441 } else { 3442 btrfs_tree_unlock(left); 3443 free_extent_buffer(left); 3444 path->slots[0] -= push_items; 3445 } 3446 BUG_ON(path->slots[0] < 0); 3447 return ret; 3448 out: 3449 btrfs_tree_unlock(left); 3450 free_extent_buffer(left); 3451 return ret; 3452 } 3453 3454 /* 3455 * push some data in the path leaf to the left, trying to free up at 3456 * least data_size bytes. returns zero if the push worked, nonzero otherwise 3457 * 3458 * max_slot can put a limit on how far into the leaf we'll push items. The 3459 * item at 'max_slot' won't be touched. Use (u32)-1 to make us push all the 3460 * items 3461 */ 3462 static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root 3463 *root, struct btrfs_path *path, int min_data_size, 3464 int data_size, int empty, u32 max_slot) 3465 { 3466 struct extent_buffer *right = path->nodes[0]; 3467 struct extent_buffer *left; 3468 int slot; 3469 int free_space; 3470 u32 right_nritems; 3471 int ret = 0; 3472 3473 slot = path->slots[1]; 3474 if (slot == 0) 3475 return 1; 3476 if (!path->nodes[1]) 3477 return 1; 3478 3479 right_nritems = btrfs_header_nritems(right); 3480 if (right_nritems == 0) 3481 return 1; 3482 3483 btrfs_assert_tree_write_locked(path->nodes[1]); 3484 3485 left = btrfs_read_node_slot(path->nodes[1], slot - 1); 3486 if (IS_ERR(left)) 3487 return PTR_ERR(left); 3488 3489 btrfs_tree_lock_nested(left, BTRFS_NESTING_LEFT); 3490 3491 free_space = btrfs_leaf_free_space(left); 3492 if (free_space < data_size) { 3493 ret = 1; 3494 goto out; 3495 } 3496 3497 ret = btrfs_cow_block(trans, root, left, 3498 path->nodes[1], slot - 1, &left, 3499 BTRFS_NESTING_LEFT_COW); 3500 if (ret) { 3501 /* we hit -ENOSPC, but it isn't fatal here */ 3502 if (ret == -ENOSPC) 3503 ret = 1; 3504 goto out; 3505 } 3506 3507 if (unlikely(check_sibling_keys(left, right))) { 3508 ret = -EUCLEAN; 3509 btrfs_abort_transaction(trans, ret); 3510 goto out; 3511 } 3512 return __push_leaf_left(trans, path, min_data_size, empty, left, 3513 free_space, right_nritems, max_slot); 3514 out: 3515 btrfs_tree_unlock(left); 3516 free_extent_buffer(left); 3517 return ret; 3518 } 3519 3520 /* 3521 * split the path's leaf in two, making sure there is at least data_size 3522 * available for the resulting leaf level of the path. 3523 */ 3524 static noinline int copy_for_split(struct btrfs_trans_handle *trans, 3525 struct btrfs_path *path, 3526 struct extent_buffer *l, 3527 struct extent_buffer *right, 3528 int slot, int mid, int nritems) 3529 { 3530 struct btrfs_fs_info *fs_info = trans->fs_info; 3531 int data_copy_size; 3532 int rt_data_off; 3533 int i; 3534 int ret; 3535 struct btrfs_disk_key disk_key; 3536 3537 nritems = nritems - mid; 3538 btrfs_set_header_nritems(right, nritems); 3539 data_copy_size = btrfs_item_data_end(l, mid) - leaf_data_end(l); 3540 3541 copy_leaf_items(right, l, 0, mid, nritems); 3542 3543 copy_leaf_data(right, l, BTRFS_LEAF_DATA_SIZE(fs_info) - data_copy_size, 3544 leaf_data_end(l), data_copy_size); 3545 3546 rt_data_off = BTRFS_LEAF_DATA_SIZE(fs_info) - btrfs_item_data_end(l, mid); 3547 3548 for (i = 0; i < nritems; i++) { 3549 u32 ioff; 3550 3551 ioff = btrfs_item_offset(right, i); 3552 btrfs_set_item_offset(right, i, ioff + rt_data_off); 3553 } 3554 3555 btrfs_set_header_nritems(l, mid); 3556 btrfs_item_key(right, &disk_key, 0); 3557 ret = insert_ptr(trans, path, &disk_key, right->start, path->slots[1] + 1, 1); 3558 if (ret < 0) 3559 return ret; 3560 3561 btrfs_mark_buffer_dirty(trans, right); 3562 btrfs_mark_buffer_dirty(trans, l); 3563 BUG_ON(path->slots[0] != slot); 3564 3565 if (mid <= slot) { 3566 btrfs_tree_unlock(path->nodes[0]); 3567 free_extent_buffer(path->nodes[0]); 3568 path->nodes[0] = right; 3569 path->slots[0] -= mid; 3570 path->slots[1] += 1; 3571 } else { 3572 btrfs_tree_unlock(right); 3573 free_extent_buffer(right); 3574 } 3575 3576 BUG_ON(path->slots[0] < 0); 3577 3578 return 0; 3579 } 3580 3581 /* 3582 * double splits happen when we need to insert a big item in the middle 3583 * of a leaf. A double split can leave us with 3 mostly empty leaves: 3584 * leaf: [ slots 0 - N] [ our target ] [ N + 1 - total in leaf ] 3585 * A B C 3586 * 3587 * We avoid this by trying to push the items on either side of our target 3588 * into the adjacent leaves. If all goes well we can avoid the double split 3589 * completely. 3590 */ 3591 static noinline int push_for_double_split(struct btrfs_trans_handle *trans, 3592 struct btrfs_root *root, 3593 struct btrfs_path *path, 3594 int data_size) 3595 { 3596 int ret; 3597 int progress = 0; 3598 int slot; 3599 u32 nritems; 3600 int space_needed = data_size; 3601 3602 slot = path->slots[0]; 3603 if (slot < btrfs_header_nritems(path->nodes[0])) 3604 space_needed -= btrfs_leaf_free_space(path->nodes[0]); 3605 3606 /* 3607 * try to push all the items after our slot into the 3608 * right leaf 3609 */ 3610 ret = push_leaf_right(trans, root, path, 1, space_needed, 0, slot); 3611 if (ret < 0) 3612 return ret; 3613 3614 if (ret == 0) 3615 progress++; 3616 3617 nritems = btrfs_header_nritems(path->nodes[0]); 3618 /* 3619 * our goal is to get our slot at the start or end of a leaf. If 3620 * we've done so we're done 3621 */ 3622 if (path->slots[0] == 0 || path->slots[0] == nritems) 3623 return 0; 3624 3625 if (btrfs_leaf_free_space(path->nodes[0]) >= data_size) 3626 return 0; 3627 3628 /* try to push all the items before our slot into the next leaf */ 3629 slot = path->slots[0]; 3630 space_needed = data_size; 3631 if (slot > 0) 3632 space_needed -= btrfs_leaf_free_space(path->nodes[0]); 3633 ret = push_leaf_left(trans, root, path, 1, space_needed, 0, slot); 3634 if (ret < 0) 3635 return ret; 3636 3637 if (ret == 0) 3638 progress++; 3639 3640 if (progress) 3641 return 0; 3642 return 1; 3643 } 3644 3645 /* 3646 * split the path's leaf in two, making sure there is at least data_size 3647 * available for the resulting leaf level of the path. 3648 * 3649 * returns 0 if all went well and < 0 on failure. 3650 */ 3651 static noinline int split_leaf(struct btrfs_trans_handle *trans, 3652 struct btrfs_root *root, 3653 const struct btrfs_key *ins_key, 3654 struct btrfs_path *path, int data_size, 3655 bool extend) 3656 { 3657 struct btrfs_disk_key disk_key; 3658 struct extent_buffer *l; 3659 u32 nritems; 3660 int mid; 3661 int slot; 3662 struct extent_buffer *right; 3663 struct btrfs_fs_info *fs_info = root->fs_info; 3664 int ret = 0; 3665 int wret; 3666 int split; 3667 int num_doubles = 0; 3668 bool tried_avoid_double = false; 3669 3670 l = path->nodes[0]; 3671 slot = path->slots[0]; 3672 if (extend && data_size + btrfs_item_size(l, slot) + 3673 sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(fs_info)) 3674 return -EOVERFLOW; 3675 3676 /* first try to make some room by pushing left and right */ 3677 if (data_size && path->nodes[1]) { 3678 int space_needed = data_size; 3679 3680 if (slot < btrfs_header_nritems(l)) 3681 space_needed -= btrfs_leaf_free_space(l); 3682 3683 wret = push_leaf_right(trans, root, path, space_needed, 3684 space_needed, 0, 0); 3685 if (wret < 0) 3686 return wret; 3687 if (wret) { 3688 space_needed = data_size; 3689 if (slot > 0) 3690 space_needed -= btrfs_leaf_free_space(l); 3691 wret = push_leaf_left(trans, root, path, space_needed, 3692 space_needed, 0, (u32)-1); 3693 if (wret < 0) 3694 return wret; 3695 } 3696 l = path->nodes[0]; 3697 3698 /* did the pushes work? */ 3699 if (btrfs_leaf_free_space(l) >= data_size) 3700 return 0; 3701 } 3702 3703 if (!path->nodes[1]) { 3704 ret = insert_new_root(trans, root, path, 1); 3705 if (ret) 3706 return ret; 3707 } 3708 again: 3709 split = 1; 3710 l = path->nodes[0]; 3711 slot = path->slots[0]; 3712 nritems = btrfs_header_nritems(l); 3713 mid = (nritems + 1) / 2; 3714 3715 if (mid <= slot) { 3716 if (nritems == 1 || 3717 leaf_space_used(l, mid, nritems - mid) + data_size > 3718 BTRFS_LEAF_DATA_SIZE(fs_info)) { 3719 if (slot >= nritems) { 3720 split = 0; 3721 } else { 3722 mid = slot; 3723 if (mid != nritems && 3724 leaf_space_used(l, mid, nritems - mid) + 3725 data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) { 3726 if (data_size && !tried_avoid_double) 3727 goto push_for_double; 3728 split = 2; 3729 } 3730 } 3731 } 3732 } else { 3733 if (leaf_space_used(l, 0, mid) + data_size > 3734 BTRFS_LEAF_DATA_SIZE(fs_info)) { 3735 if (!extend && data_size && slot == 0) { 3736 split = 0; 3737 } else if ((extend || !data_size) && slot == 0) { 3738 mid = 1; 3739 } else { 3740 mid = slot; 3741 if (mid != nritems && 3742 leaf_space_used(l, mid, nritems - mid) + 3743 data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) { 3744 if (data_size && !tried_avoid_double) 3745 goto push_for_double; 3746 split = 2; 3747 } 3748 } 3749 } 3750 } 3751 3752 if (split == 0) 3753 btrfs_cpu_key_to_disk(&disk_key, ins_key); 3754 else 3755 btrfs_item_key(l, &disk_key, mid); 3756 3757 /* 3758 * We have to about BTRFS_NESTING_NEW_ROOT here if we've done a double 3759 * split, because we're only allowed to have MAX_LOCKDEP_SUBCLASSES 3760 * subclasses, which is 8 at the time of this patch, and we've maxed it 3761 * out. In the future we could add a 3762 * BTRFS_NESTING_SPLIT_THE_SPLITTENING if we need to, but for now just 3763 * use BTRFS_NESTING_NEW_ROOT. 3764 */ 3765 right = btrfs_alloc_tree_block(trans, root, 0, btrfs_root_id(root), 3766 &disk_key, 0, l->start, 0, 0, 3767 num_doubles ? BTRFS_NESTING_NEW_ROOT : 3768 BTRFS_NESTING_SPLIT); 3769 if (IS_ERR(right)) 3770 return PTR_ERR(right); 3771 3772 root_add_used_bytes(root); 3773 3774 if (split == 0) { 3775 if (mid <= slot) { 3776 btrfs_set_header_nritems(right, 0); 3777 ret = insert_ptr(trans, path, &disk_key, 3778 right->start, path->slots[1] + 1, 1); 3779 if (ret < 0) { 3780 btrfs_tree_unlock(right); 3781 free_extent_buffer(right); 3782 return ret; 3783 } 3784 btrfs_tree_unlock(path->nodes[0]); 3785 free_extent_buffer(path->nodes[0]); 3786 path->nodes[0] = right; 3787 path->slots[0] = 0; 3788 path->slots[1] += 1; 3789 } else { 3790 btrfs_set_header_nritems(right, 0); 3791 ret = insert_ptr(trans, path, &disk_key, 3792 right->start, path->slots[1], 1); 3793 if (ret < 0) { 3794 btrfs_tree_unlock(right); 3795 free_extent_buffer(right); 3796 return ret; 3797 } 3798 btrfs_tree_unlock(path->nodes[0]); 3799 free_extent_buffer(path->nodes[0]); 3800 path->nodes[0] = right; 3801 path->slots[0] = 0; 3802 if (path->slots[1] == 0) 3803 fixup_low_keys(trans, path, &disk_key, 1); 3804 } 3805 /* 3806 * We create a new leaf 'right' for the required ins_len and 3807 * we'll do btrfs_mark_buffer_dirty() on this leaf after copying 3808 * the content of ins_len to 'right'. 3809 */ 3810 return ret; 3811 } 3812 3813 ret = copy_for_split(trans, path, l, right, slot, mid, nritems); 3814 if (ret < 0) { 3815 btrfs_tree_unlock(right); 3816 free_extent_buffer(right); 3817 return ret; 3818 } 3819 3820 if (split == 2) { 3821 BUG_ON(num_doubles != 0); 3822 num_doubles++; 3823 goto again; 3824 } 3825 3826 return 0; 3827 3828 push_for_double: 3829 push_for_double_split(trans, root, path, data_size); 3830 tried_avoid_double = true; 3831 if (btrfs_leaf_free_space(path->nodes[0]) >= data_size) 3832 return 0; 3833 goto again; 3834 } 3835 3836 static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans, 3837 struct btrfs_root *root, 3838 struct btrfs_path *path, int ins_len) 3839 { 3840 struct btrfs_key key; 3841 struct extent_buffer *leaf; 3842 struct btrfs_file_extent_item *fi; 3843 u64 extent_len = 0; 3844 u32 item_size; 3845 int ret; 3846 3847 leaf = path->nodes[0]; 3848 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 3849 3850 BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY && 3851 key.type != BTRFS_RAID_STRIPE_KEY && 3852 key.type != BTRFS_EXTENT_CSUM_KEY); 3853 3854 if (btrfs_leaf_free_space(leaf) >= ins_len) 3855 return 0; 3856 3857 item_size = btrfs_item_size(leaf, path->slots[0]); 3858 if (key.type == BTRFS_EXTENT_DATA_KEY) { 3859 fi = btrfs_item_ptr(leaf, path->slots[0], 3860 struct btrfs_file_extent_item); 3861 extent_len = btrfs_file_extent_num_bytes(leaf, fi); 3862 } 3863 btrfs_release_path(path); 3864 3865 path->keep_locks = true; 3866 path->search_for_split = true; 3867 ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 3868 path->search_for_split = false; 3869 if (ret > 0) 3870 ret = -EAGAIN; 3871 if (ret < 0) 3872 goto err; 3873 3874 ret = -EAGAIN; 3875 leaf = path->nodes[0]; 3876 /* if our item isn't there, return now */ 3877 if (item_size != btrfs_item_size(leaf, path->slots[0])) 3878 goto err; 3879 3880 /* the leaf has changed, it now has room. return now */ 3881 if (btrfs_leaf_free_space(path->nodes[0]) >= ins_len) 3882 goto err; 3883 3884 if (key.type == BTRFS_EXTENT_DATA_KEY) { 3885 fi = btrfs_item_ptr(leaf, path->slots[0], 3886 struct btrfs_file_extent_item); 3887 if (extent_len != btrfs_file_extent_num_bytes(leaf, fi)) 3888 goto err; 3889 } 3890 3891 ret = split_leaf(trans, root, &key, path, ins_len, true); 3892 if (ret) 3893 goto err; 3894 3895 path->keep_locks = false; 3896 btrfs_unlock_up_safe(path, 1); 3897 return 0; 3898 err: 3899 path->keep_locks = false; 3900 return ret; 3901 } 3902 3903 static noinline int split_item(struct btrfs_trans_handle *trans, 3904 struct btrfs_path *path, 3905 const struct btrfs_key *new_key, 3906 unsigned long split_offset) 3907 { 3908 struct extent_buffer *leaf; 3909 int orig_slot, slot; 3910 char *buf; 3911 u32 nritems; 3912 u32 item_size; 3913 u32 orig_offset; 3914 struct btrfs_disk_key disk_key; 3915 3916 leaf = path->nodes[0]; 3917 /* 3918 * Shouldn't happen because the caller must have previously called 3919 * setup_leaf_for_split() to make room for the new item in the leaf. 3920 */ 3921 if (WARN_ON(btrfs_leaf_free_space(leaf) < sizeof(struct btrfs_item))) 3922 return -ENOSPC; 3923 3924 orig_slot = path->slots[0]; 3925 orig_offset = btrfs_item_offset(leaf, path->slots[0]); 3926 item_size = btrfs_item_size(leaf, path->slots[0]); 3927 3928 buf = kmalloc(item_size, GFP_NOFS); 3929 if (!buf) 3930 return -ENOMEM; 3931 3932 read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf, 3933 path->slots[0]), item_size); 3934 3935 slot = path->slots[0] + 1; 3936 nritems = btrfs_header_nritems(leaf); 3937 if (slot != nritems) { 3938 /* shift the items */ 3939 memmove_leaf_items(leaf, slot + 1, slot, nritems - slot); 3940 } 3941 3942 btrfs_cpu_key_to_disk(&disk_key, new_key); 3943 btrfs_set_item_key(leaf, &disk_key, slot); 3944 3945 btrfs_set_item_offset(leaf, slot, orig_offset); 3946 btrfs_set_item_size(leaf, slot, item_size - split_offset); 3947 3948 btrfs_set_item_offset(leaf, orig_slot, 3949 orig_offset + item_size - split_offset); 3950 btrfs_set_item_size(leaf, orig_slot, split_offset); 3951 3952 btrfs_set_header_nritems(leaf, nritems + 1); 3953 3954 /* write the data for the start of the original item */ 3955 write_extent_buffer(leaf, buf, 3956 btrfs_item_ptr_offset(leaf, path->slots[0]), 3957 split_offset); 3958 3959 /* write the data for the new item */ 3960 write_extent_buffer(leaf, buf + split_offset, 3961 btrfs_item_ptr_offset(leaf, slot), 3962 item_size - split_offset); 3963 btrfs_mark_buffer_dirty(trans, leaf); 3964 3965 BUG_ON(btrfs_leaf_free_space(leaf) < 0); 3966 kfree(buf); 3967 return 0; 3968 } 3969 3970 /* 3971 * This function splits a single item into two items, 3972 * giving 'new_key' to the new item and splitting the 3973 * old one at split_offset (from the start of the item). 3974 * 3975 * The path may be released by this operation. After 3976 * the split, the path is pointing to the old item. The 3977 * new item is going to be in the same node as the old one. 3978 * 3979 * Note, the item being split must be smaller enough to live alone on 3980 * a tree block with room for one extra struct btrfs_item 3981 * 3982 * This allows us to split the item in place, keeping a lock on the 3983 * leaf the entire time. 3984 */ 3985 int btrfs_split_item(struct btrfs_trans_handle *trans, 3986 struct btrfs_root *root, 3987 struct btrfs_path *path, 3988 const struct btrfs_key *new_key, 3989 unsigned long split_offset) 3990 { 3991 int ret; 3992 ret = setup_leaf_for_split(trans, root, path, 3993 sizeof(struct btrfs_item)); 3994 if (ret) 3995 return ret; 3996 3997 return split_item(trans, path, new_key, split_offset); 3998 } 3999 4000 /* 4001 * make the item pointed to by the path smaller. new_size indicates 4002 * how small to make it, and from_end tells us if we just chop bytes 4003 * off the end of the item or if we shift the item to chop bytes off 4004 * the front. 4005 */ 4006 void btrfs_truncate_item(struct btrfs_trans_handle *trans, 4007 const struct btrfs_path *path, u32 new_size, int from_end) 4008 { 4009 int slot; 4010 struct extent_buffer *leaf; 4011 u32 nritems; 4012 unsigned int data_end; 4013 unsigned int old_data_start; 4014 unsigned int old_size; 4015 unsigned int size_diff; 4016 int i; 4017 4018 leaf = path->nodes[0]; 4019 slot = path->slots[0]; 4020 4021 old_size = btrfs_item_size(leaf, slot); 4022 if (old_size == new_size) 4023 return; 4024 4025 nritems = btrfs_header_nritems(leaf); 4026 data_end = leaf_data_end(leaf); 4027 4028 old_data_start = btrfs_item_offset(leaf, slot); 4029 4030 size_diff = old_size - new_size; 4031 4032 BUG_ON(slot < 0); 4033 BUG_ON(slot >= nritems); 4034 4035 /* 4036 * item0..itemN ... dataN.offset..dataN.size .. data0.size 4037 */ 4038 /* first correct the data pointers */ 4039 for (i = slot; i < nritems; i++) { 4040 u32 ioff; 4041 4042 ioff = btrfs_item_offset(leaf, i); 4043 btrfs_set_item_offset(leaf, i, ioff + size_diff); 4044 } 4045 4046 /* shift the data */ 4047 if (from_end) { 4048 memmove_leaf_data(leaf, data_end + size_diff, data_end, 4049 old_data_start + new_size - data_end); 4050 } else { 4051 struct btrfs_disk_key disk_key; 4052 u64 offset; 4053 4054 btrfs_item_key(leaf, &disk_key, slot); 4055 4056 if (btrfs_disk_key_type(&disk_key) == BTRFS_EXTENT_DATA_KEY) { 4057 unsigned long ptr; 4058 struct btrfs_file_extent_item *fi; 4059 4060 fi = btrfs_item_ptr(leaf, slot, 4061 struct btrfs_file_extent_item); 4062 fi = (struct btrfs_file_extent_item *)( 4063 (unsigned long)fi - size_diff); 4064 4065 if (btrfs_file_extent_type(leaf, fi) == 4066 BTRFS_FILE_EXTENT_INLINE) { 4067 ptr = btrfs_item_ptr_offset(leaf, slot); 4068 memmove_extent_buffer(leaf, ptr, 4069 (unsigned long)fi, 4070 BTRFS_FILE_EXTENT_INLINE_DATA_START); 4071 } 4072 } 4073 4074 memmove_leaf_data(leaf, data_end + size_diff, data_end, 4075 old_data_start - data_end); 4076 4077 offset = btrfs_disk_key_offset(&disk_key); 4078 btrfs_set_disk_key_offset(&disk_key, offset + size_diff); 4079 btrfs_set_item_key(leaf, &disk_key, slot); 4080 if (slot == 0) 4081 fixup_low_keys(trans, path, &disk_key, 1); 4082 } 4083 4084 btrfs_set_item_size(leaf, slot, new_size); 4085 btrfs_mark_buffer_dirty(trans, leaf); 4086 4087 if (unlikely(btrfs_leaf_free_space(leaf) < 0)) { 4088 btrfs_print_leaf(leaf); 4089 BUG(); 4090 } 4091 } 4092 4093 /* 4094 * make the item pointed to by the path bigger, data_size is the added size. 4095 */ 4096 void btrfs_extend_item(struct btrfs_trans_handle *trans, 4097 const struct btrfs_path *path, u32 data_size) 4098 { 4099 int slot; 4100 struct extent_buffer *leaf; 4101 u32 nritems; 4102 unsigned int data_end; 4103 unsigned int old_data; 4104 unsigned int old_size; 4105 int i; 4106 4107 leaf = path->nodes[0]; 4108 4109 nritems = btrfs_header_nritems(leaf); 4110 data_end = leaf_data_end(leaf); 4111 4112 if (unlikely(btrfs_leaf_free_space(leaf) < data_size)) { 4113 btrfs_print_leaf(leaf); 4114 BUG(); 4115 } 4116 slot = path->slots[0]; 4117 old_data = btrfs_item_data_end(leaf, slot); 4118 4119 BUG_ON(slot < 0); 4120 if (unlikely(slot >= nritems)) { 4121 btrfs_print_leaf(leaf); 4122 btrfs_crit(leaf->fs_info, "slot %d too large, nritems %d", 4123 slot, nritems); 4124 BUG(); 4125 } 4126 4127 /* 4128 * item0..itemN ... dataN.offset..dataN.size .. data0.size 4129 */ 4130 /* first correct the data pointers */ 4131 for (i = slot; i < nritems; i++) { 4132 u32 ioff; 4133 4134 ioff = btrfs_item_offset(leaf, i); 4135 btrfs_set_item_offset(leaf, i, ioff - data_size); 4136 } 4137 4138 /* shift the data */ 4139 memmove_leaf_data(leaf, data_end - data_size, data_end, 4140 old_data - data_end); 4141 4142 old_size = btrfs_item_size(leaf, slot); 4143 btrfs_set_item_size(leaf, slot, old_size + data_size); 4144 btrfs_mark_buffer_dirty(trans, leaf); 4145 4146 if (unlikely(btrfs_leaf_free_space(leaf) < 0)) { 4147 btrfs_print_leaf(leaf); 4148 BUG(); 4149 } 4150 } 4151 4152 /* 4153 * Make space in the node before inserting one or more items. 4154 * 4155 * @trans: transaction handle 4156 * @root: root we are inserting items to 4157 * @path: points to the leaf/slot where we are going to insert new items 4158 * @batch: information about the batch of items to insert 4159 * 4160 * Main purpose is to save stack depth by doing the bulk of the work in a 4161 * function that doesn't call btrfs_search_slot 4162 */ 4163 static void setup_items_for_insert(struct btrfs_trans_handle *trans, 4164 struct btrfs_root *root, struct btrfs_path *path, 4165 const struct btrfs_item_batch *batch) 4166 { 4167 struct btrfs_fs_info *fs_info = root->fs_info; 4168 int i; 4169 u32 nritems; 4170 unsigned int data_end; 4171 struct btrfs_disk_key disk_key; 4172 struct extent_buffer *leaf; 4173 int slot; 4174 u32 total_size; 4175 4176 /* 4177 * Before anything else, update keys in the parent and other ancestors 4178 * if needed, then release the write locks on them, so that other tasks 4179 * can use them while we modify the leaf. 4180 */ 4181 if (path->slots[0] == 0) { 4182 btrfs_cpu_key_to_disk(&disk_key, &batch->keys[0]); 4183 fixup_low_keys(trans, path, &disk_key, 1); 4184 } 4185 btrfs_unlock_up_safe(path, 1); 4186 4187 leaf = path->nodes[0]; 4188 slot = path->slots[0]; 4189 4190 nritems = btrfs_header_nritems(leaf); 4191 data_end = leaf_data_end(leaf); 4192 total_size = batch->total_data_size + (batch->nr * sizeof(struct btrfs_item)); 4193 4194 if (unlikely(btrfs_leaf_free_space(leaf) < total_size)) { 4195 btrfs_print_leaf(leaf); 4196 btrfs_crit(fs_info, "not enough freespace need %u have %d", 4197 total_size, btrfs_leaf_free_space(leaf)); 4198 BUG(); 4199 } 4200 4201 if (slot != nritems) { 4202 unsigned int old_data = btrfs_item_data_end(leaf, slot); 4203 4204 if (unlikely(old_data < data_end)) { 4205 btrfs_print_leaf(leaf); 4206 btrfs_crit(fs_info, 4207 "item at slot %d with data offset %u beyond data end of leaf %u", 4208 slot, old_data, data_end); 4209 BUG(); 4210 } 4211 /* 4212 * item0..itemN ... dataN.offset..dataN.size .. data0.size 4213 */ 4214 /* first correct the data pointers */ 4215 for (i = slot; i < nritems; i++) { 4216 u32 ioff; 4217 4218 ioff = btrfs_item_offset(leaf, i); 4219 btrfs_set_item_offset(leaf, i, 4220 ioff - batch->total_data_size); 4221 } 4222 /* shift the items */ 4223 memmove_leaf_items(leaf, slot + batch->nr, slot, nritems - slot); 4224 4225 /* shift the data */ 4226 memmove_leaf_data(leaf, data_end - batch->total_data_size, 4227 data_end, old_data - data_end); 4228 data_end = old_data; 4229 } 4230 4231 /* setup the item for the new data */ 4232 for (i = 0; i < batch->nr; i++) { 4233 btrfs_cpu_key_to_disk(&disk_key, &batch->keys[i]); 4234 btrfs_set_item_key(leaf, &disk_key, slot + i); 4235 data_end -= batch->data_sizes[i]; 4236 btrfs_set_item_offset(leaf, slot + i, data_end); 4237 btrfs_set_item_size(leaf, slot + i, batch->data_sizes[i]); 4238 } 4239 4240 btrfs_set_header_nritems(leaf, nritems + batch->nr); 4241 btrfs_mark_buffer_dirty(trans, leaf); 4242 4243 if (unlikely(btrfs_leaf_free_space(leaf) < 0)) { 4244 btrfs_print_leaf(leaf); 4245 BUG(); 4246 } 4247 } 4248 4249 /* 4250 * Insert a new item into a leaf. 4251 * 4252 * @trans: Transaction handle. 4253 * @root: The root of the btree. 4254 * @path: A path pointing to the target leaf and slot. 4255 * @key: The key of the new item. 4256 * @data_size: The size of the data associated with the new key. 4257 */ 4258 void btrfs_setup_item_for_insert(struct btrfs_trans_handle *trans, 4259 struct btrfs_root *root, 4260 struct btrfs_path *path, 4261 const struct btrfs_key *key, 4262 u32 data_size) 4263 { 4264 struct btrfs_item_batch batch; 4265 4266 batch.keys = key; 4267 batch.data_sizes = &data_size; 4268 batch.total_data_size = data_size; 4269 batch.nr = 1; 4270 4271 setup_items_for_insert(trans, root, path, &batch); 4272 } 4273 4274 /* 4275 * Given a key and some data, insert items into the tree. 4276 * This does all the path init required, making room in the tree if needed. 4277 * 4278 * Returns: 0 on success 4279 * -EEXIST if the first key already exists 4280 * < 0 on other errors 4281 */ 4282 int btrfs_insert_empty_items(struct btrfs_trans_handle *trans, 4283 struct btrfs_root *root, 4284 struct btrfs_path *path, 4285 const struct btrfs_item_batch *batch) 4286 { 4287 int ret = 0; 4288 int slot; 4289 u32 total_size; 4290 4291 total_size = batch->total_data_size + (batch->nr * sizeof(struct btrfs_item)); 4292 ret = btrfs_search_slot(trans, root, &batch->keys[0], path, total_size, 1); 4293 if (ret == 0) 4294 return -EEXIST; 4295 if (ret < 0) 4296 return ret; 4297 4298 slot = path->slots[0]; 4299 BUG_ON(slot < 0); 4300 4301 setup_items_for_insert(trans, root, path, batch); 4302 return 0; 4303 } 4304 4305 /* 4306 * Given a key and some data, insert an item into the tree. 4307 * This does all the path init required, making room in the tree if needed. 4308 */ 4309 int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, 4310 const struct btrfs_key *cpu_key, void *data, 4311 u32 data_size) 4312 { 4313 int ret = 0; 4314 BTRFS_PATH_AUTO_FREE(path); 4315 struct extent_buffer *leaf; 4316 unsigned long ptr; 4317 4318 path = btrfs_alloc_path(); 4319 if (!path) 4320 return -ENOMEM; 4321 ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size); 4322 if (!ret) { 4323 leaf = path->nodes[0]; 4324 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); 4325 write_extent_buffer(leaf, data, ptr, data_size); 4326 btrfs_mark_buffer_dirty(trans, leaf); 4327 } 4328 return ret; 4329 } 4330 4331 /* 4332 * This function duplicates an item, giving 'new_key' to the new item. 4333 * It guarantees both items live in the same tree leaf and the new item is 4334 * contiguous with the original item. 4335 * 4336 * This allows us to split a file extent in place, keeping a lock on the leaf 4337 * the entire time. 4338 */ 4339 int btrfs_duplicate_item(struct btrfs_trans_handle *trans, 4340 struct btrfs_root *root, 4341 struct btrfs_path *path, 4342 const struct btrfs_key *new_key) 4343 { 4344 struct extent_buffer *leaf; 4345 int ret; 4346 u32 item_size; 4347 4348 leaf = path->nodes[0]; 4349 item_size = btrfs_item_size(leaf, path->slots[0]); 4350 ret = setup_leaf_for_split(trans, root, path, 4351 item_size + sizeof(struct btrfs_item)); 4352 if (ret) 4353 return ret; 4354 4355 path->slots[0]++; 4356 btrfs_setup_item_for_insert(trans, root, path, new_key, item_size); 4357 leaf = path->nodes[0]; 4358 memcpy_extent_buffer(leaf, 4359 btrfs_item_ptr_offset(leaf, path->slots[0]), 4360 btrfs_item_ptr_offset(leaf, path->slots[0] - 1), 4361 item_size); 4362 return 0; 4363 } 4364 4365 /* 4366 * delete the pointer from a given node. 4367 * 4368 * the tree should have been previously balanced so the deletion does not 4369 * empty a node. 4370 * 4371 * This is exported for use inside btrfs-progs, don't un-export it. 4372 */ 4373 int btrfs_del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, 4374 struct btrfs_path *path, int level, int slot) 4375 { 4376 struct extent_buffer *parent = path->nodes[level]; 4377 u32 nritems; 4378 int ret; 4379 4380 nritems = btrfs_header_nritems(parent); 4381 if (slot != nritems - 1) { 4382 if (level) { 4383 ret = btrfs_tree_mod_log_insert_move(parent, slot, 4384 slot + 1, nritems - slot - 1); 4385 if (unlikely(ret < 0)) { 4386 btrfs_abort_transaction(trans, ret); 4387 return ret; 4388 } 4389 } 4390 memmove_extent_buffer(parent, 4391 btrfs_node_key_ptr_offset(parent, slot), 4392 btrfs_node_key_ptr_offset(parent, slot + 1), 4393 sizeof(struct btrfs_key_ptr) * 4394 (nritems - slot - 1)); 4395 } else if (level) { 4396 ret = btrfs_tree_mod_log_insert_key(parent, slot, 4397 BTRFS_MOD_LOG_KEY_REMOVE); 4398 if (unlikely(ret < 0)) { 4399 btrfs_abort_transaction(trans, ret); 4400 return ret; 4401 } 4402 } 4403 4404 nritems--; 4405 btrfs_set_header_nritems(parent, nritems); 4406 if (nritems == 0 && parent == root->node) { 4407 BUG_ON(btrfs_header_level(root->node) != 1); 4408 /* just turn the root into a leaf and break */ 4409 btrfs_set_header_level(root->node, 0); 4410 } else if (slot == 0) { 4411 struct btrfs_disk_key disk_key; 4412 4413 btrfs_node_key(parent, &disk_key, 0); 4414 fixup_low_keys(trans, path, &disk_key, level + 1); 4415 } 4416 btrfs_mark_buffer_dirty(trans, parent); 4417 return 0; 4418 } 4419 4420 /* 4421 * a helper function to delete the leaf pointed to by path->slots[1] and 4422 * path->nodes[1]. 4423 * 4424 * This deletes the pointer in path->nodes[1] and frees the leaf 4425 * block extent. zero is returned if it all worked out, < 0 otherwise. 4426 * 4427 * The path must have already been setup for deleting the leaf, including 4428 * all the proper balancing. path->nodes[1] must be locked. 4429 */ 4430 static noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans, 4431 struct btrfs_root *root, 4432 struct btrfs_path *path, 4433 struct extent_buffer *leaf) 4434 { 4435 int ret; 4436 4437 WARN_ON(btrfs_header_generation(leaf) != trans->transid); 4438 ret = btrfs_del_ptr(trans, root, path, 1, path->slots[1]); 4439 if (ret < 0) 4440 return ret; 4441 4442 /* 4443 * btrfs_free_extent is expensive, we want to make sure we 4444 * aren't holding any locks when we call it 4445 */ 4446 btrfs_unlock_up_safe(path, 0); 4447 4448 root_sub_used_bytes(root); 4449 4450 refcount_inc(&leaf->refs); 4451 ret = btrfs_free_tree_block(trans, btrfs_root_id(root), leaf, 0, 1); 4452 free_extent_buffer_stale(leaf); 4453 if (ret < 0) 4454 btrfs_abort_transaction(trans, ret); 4455 4456 return ret; 4457 } 4458 /* 4459 * delete the item at the leaf level in path. If that empties 4460 * the leaf, remove it from the tree 4461 */ 4462 int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, 4463 struct btrfs_path *path, int slot, int nr) 4464 { 4465 struct btrfs_fs_info *fs_info = root->fs_info; 4466 struct extent_buffer *leaf; 4467 int ret = 0; 4468 int wret; 4469 u32 nritems; 4470 4471 leaf = path->nodes[0]; 4472 nritems = btrfs_header_nritems(leaf); 4473 4474 if (slot + nr != nritems) { 4475 const u32 last_off = btrfs_item_offset(leaf, slot + nr - 1); 4476 const int data_end = leaf_data_end(leaf); 4477 u32 dsize = 0; 4478 int i; 4479 4480 for (i = 0; i < nr; i++) 4481 dsize += btrfs_item_size(leaf, slot + i); 4482 4483 memmove_leaf_data(leaf, data_end + dsize, data_end, 4484 last_off - data_end); 4485 4486 for (i = slot + nr; i < nritems; i++) { 4487 u32 ioff; 4488 4489 ioff = btrfs_item_offset(leaf, i); 4490 btrfs_set_item_offset(leaf, i, ioff + dsize); 4491 } 4492 4493 memmove_leaf_items(leaf, slot, slot + nr, nritems - slot - nr); 4494 } 4495 btrfs_set_header_nritems(leaf, nritems - nr); 4496 nritems -= nr; 4497 4498 /* delete the leaf if we've emptied it */ 4499 if (nritems == 0) { 4500 if (leaf != root->node) { 4501 btrfs_clear_buffer_dirty(trans, leaf); 4502 ret = btrfs_del_leaf(trans, root, path, leaf); 4503 if (ret < 0) 4504 return ret; 4505 } 4506 } else { 4507 int used = leaf_space_used(leaf, 0, nritems); 4508 if (slot == 0) { 4509 struct btrfs_disk_key disk_key; 4510 4511 btrfs_item_key(leaf, &disk_key, 0); 4512 fixup_low_keys(trans, path, &disk_key, 1); 4513 } 4514 4515 /* 4516 * Try to delete the leaf if it is mostly empty. We do this by 4517 * trying to move all its items into its left and right neighbours. 4518 * If we can't move all the items, then we don't delete it - it's 4519 * not ideal, but future insertions might fill the leaf with more 4520 * items, or items from other leaves might be moved later into our 4521 * leaf due to deletions on those leaves. 4522 */ 4523 if (used < BTRFS_LEAF_DATA_SIZE(fs_info) / 3) { 4524 u32 min_push_space; 4525 4526 /* push_leaf_left fixes the path. 4527 * make sure the path still points to our leaf 4528 * for possible call to btrfs_del_ptr below 4529 */ 4530 slot = path->slots[1]; 4531 refcount_inc(&leaf->refs); 4532 /* 4533 * We want to be able to at least push one item to the 4534 * left neighbour leaf, and that's the first item. 4535 */ 4536 min_push_space = sizeof(struct btrfs_item) + 4537 btrfs_item_size(leaf, 0); 4538 wret = push_leaf_left(trans, root, path, 0, 4539 min_push_space, 1, (u32)-1); 4540 if (wret < 0 && wret != -ENOSPC) 4541 ret = wret; 4542 4543 if (path->nodes[0] == leaf && 4544 btrfs_header_nritems(leaf)) { 4545 /* 4546 * If we were not able to push all items from our 4547 * leaf to its left neighbour, then attempt to 4548 * either push all the remaining items to the 4549 * right neighbour or none. There's no advantage 4550 * in pushing only some items, instead of all, as 4551 * it's pointless to end up with a leaf having 4552 * too few items while the neighbours can be full 4553 * or nearly full. 4554 */ 4555 nritems = btrfs_header_nritems(leaf); 4556 min_push_space = leaf_space_used(leaf, 0, nritems); 4557 wret = push_leaf_right(trans, root, path, 0, 4558 min_push_space, 1, 0); 4559 if (wret < 0 && wret != -ENOSPC) 4560 ret = wret; 4561 } 4562 4563 if (btrfs_header_nritems(leaf) == 0) { 4564 path->slots[1] = slot; 4565 ret = btrfs_del_leaf(trans, root, path, leaf); 4566 free_extent_buffer(leaf); 4567 if (ret < 0) 4568 return ret; 4569 } else { 4570 /* if we're still in the path, make sure 4571 * we're dirty. Otherwise, one of the 4572 * push_leaf functions must have already 4573 * dirtied this buffer 4574 */ 4575 if (path->nodes[0] == leaf) 4576 btrfs_mark_buffer_dirty(trans, leaf); 4577 free_extent_buffer(leaf); 4578 } 4579 } else { 4580 btrfs_mark_buffer_dirty(trans, leaf); 4581 } 4582 } 4583 return ret; 4584 } 4585 4586 /* 4587 * A helper function to walk down the tree starting at min_key, and looking 4588 * for leaves that have a minimum transaction id. 4589 * This is used by the btree defrag code, and tree logging 4590 * 4591 * This does not cow, but it does stuff the starting key it finds back 4592 * into min_key, so you can call btrfs_search_slot with cow=1 on the 4593 * key and get a writable path. 4594 * 4595 * min_trans indicates the oldest transaction that you are interested 4596 * in walking through. Any nodes or leaves older than min_trans are 4597 * skipped over (without reading them). 4598 * 4599 * returns zero if something useful was found, < 0 on error and 1 if there 4600 * was nothing in the tree that matched the search criteria. 4601 */ 4602 int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key, 4603 struct btrfs_path *path, 4604 u64 min_trans) 4605 { 4606 struct extent_buffer *cur; 4607 int slot; 4608 int sret; 4609 u32 nritems; 4610 int level; 4611 int ret = 1; 4612 const bool keep_locks = path->keep_locks; 4613 4614 ASSERT(!path->nowait); 4615 ASSERT(path->lowest_level == 0); 4616 path->keep_locks = true; 4617 again: 4618 cur = btrfs_read_lock_root_node(root); 4619 level = btrfs_header_level(cur); 4620 WARN_ON(path->nodes[level]); 4621 path->nodes[level] = cur; 4622 path->locks[level] = BTRFS_READ_LOCK; 4623 4624 if (btrfs_header_generation(cur) < min_trans) { 4625 ret = 1; 4626 goto out; 4627 } 4628 while (1) { 4629 nritems = btrfs_header_nritems(cur); 4630 level = btrfs_header_level(cur); 4631 sret = btrfs_bin_search(cur, 0, min_key, &slot); 4632 if (sret < 0) { 4633 ret = sret; 4634 goto out; 4635 } 4636 4637 /* At level 0 we're done, setup the path and exit. */ 4638 if (level == 0) { 4639 if (slot >= nritems) 4640 goto find_next_key; 4641 ret = 0; 4642 path->slots[level] = slot; 4643 /* Save our key for returning back. */ 4644 btrfs_item_key_to_cpu(cur, min_key, slot); 4645 goto out; 4646 } 4647 if (sret && slot > 0) 4648 slot--; 4649 /* 4650 * check this node pointer against the min_trans parameters. 4651 * If it is too old, skip to the next one. 4652 */ 4653 while (slot < nritems) { 4654 u64 gen; 4655 4656 gen = btrfs_node_ptr_generation(cur, slot); 4657 if (gen < min_trans) { 4658 slot++; 4659 continue; 4660 } 4661 break; 4662 } 4663 find_next_key: 4664 /* 4665 * we didn't find a candidate key in this node, walk forward 4666 * and find another one 4667 */ 4668 path->slots[level] = slot; 4669 if (slot >= nritems) { 4670 sret = btrfs_find_next_key(root, path, min_key, level, 4671 min_trans); 4672 if (sret == 0) { 4673 btrfs_release_path(path); 4674 goto again; 4675 } else { 4676 goto out; 4677 } 4678 } 4679 cur = btrfs_read_node_slot(cur, slot); 4680 if (IS_ERR(cur)) { 4681 ret = PTR_ERR(cur); 4682 goto out; 4683 } 4684 4685 btrfs_tree_read_lock(cur); 4686 4687 path->locks[level - 1] = BTRFS_READ_LOCK; 4688 path->nodes[level - 1] = cur; 4689 unlock_up(path, level, 1, 0, NULL); 4690 } 4691 out: 4692 path->keep_locks = keep_locks; 4693 if (ret == 0) 4694 btrfs_unlock_up_safe(path, 1); 4695 return ret; 4696 } 4697 4698 /* 4699 * this is similar to btrfs_next_leaf, but does not try to preserve 4700 * and fixup the path. It looks for and returns the next key in the 4701 * tree based on the current path and the min_trans parameters. 4702 * 4703 * 0 is returned if another key is found, < 0 if there are any errors 4704 * and 1 is returned if there are no higher keys in the tree 4705 * 4706 * path->keep_locks should be set to true on the search made before 4707 * calling this function. 4708 */ 4709 int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path, 4710 struct btrfs_key *key, int level, u64 min_trans) 4711 { 4712 int slot; 4713 struct extent_buffer *c; 4714 4715 WARN_ON(!path->keep_locks && !path->skip_locking); 4716 while (level < BTRFS_MAX_LEVEL) { 4717 if (!path->nodes[level]) 4718 return 1; 4719 4720 slot = path->slots[level] + 1; 4721 c = path->nodes[level]; 4722 next: 4723 if (slot >= btrfs_header_nritems(c)) { 4724 int ret; 4725 int orig_lowest; 4726 struct btrfs_key cur_key; 4727 if (level + 1 >= BTRFS_MAX_LEVEL || 4728 !path->nodes[level + 1]) 4729 return 1; 4730 4731 if (path->locks[level + 1] || path->skip_locking) { 4732 level++; 4733 continue; 4734 } 4735 4736 slot = btrfs_header_nritems(c) - 1; 4737 if (level == 0) 4738 btrfs_item_key_to_cpu(c, &cur_key, slot); 4739 else 4740 btrfs_node_key_to_cpu(c, &cur_key, slot); 4741 4742 orig_lowest = path->lowest_level; 4743 btrfs_release_path(path); 4744 path->lowest_level = level; 4745 ret = btrfs_search_slot(NULL, root, &cur_key, path, 4746 0, 0); 4747 path->lowest_level = orig_lowest; 4748 if (ret < 0) 4749 return ret; 4750 4751 c = path->nodes[level]; 4752 slot = path->slots[level]; 4753 if (ret == 0) 4754 slot++; 4755 goto next; 4756 } 4757 4758 if (level == 0) 4759 btrfs_item_key_to_cpu(c, key, slot); 4760 else { 4761 u64 gen = btrfs_node_ptr_generation(c, slot); 4762 4763 if (gen < min_trans) { 4764 slot++; 4765 goto next; 4766 } 4767 btrfs_node_key_to_cpu(c, key, slot); 4768 } 4769 return 0; 4770 } 4771 return 1; 4772 } 4773 4774 int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path, 4775 u64 time_seq) 4776 { 4777 int slot; 4778 int level; 4779 struct extent_buffer *c; 4780 struct extent_buffer *next; 4781 struct btrfs_fs_info *fs_info = root->fs_info; 4782 struct btrfs_key key; 4783 bool need_commit_sem = false; 4784 u32 nritems; 4785 int ret; 4786 int i; 4787 4788 /* 4789 * The nowait semantics are used only for write paths, where we don't 4790 * use the tree mod log and sequence numbers. 4791 */ 4792 if (time_seq) 4793 ASSERT(!path->nowait); 4794 4795 nritems = btrfs_header_nritems(path->nodes[0]); 4796 if (nritems == 0) 4797 return 1; 4798 4799 btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1); 4800 again: 4801 level = 1; 4802 next = NULL; 4803 btrfs_release_path(path); 4804 4805 path->keep_locks = true; 4806 4807 if (time_seq) { 4808 ret = btrfs_search_old_slot(root, &key, path, time_seq); 4809 } else { 4810 if (path->need_commit_sem) { 4811 path->need_commit_sem = false; 4812 need_commit_sem = true; 4813 if (path->nowait) { 4814 if (!down_read_trylock(&fs_info->commit_root_sem)) { 4815 ret = -EAGAIN; 4816 goto done; 4817 } 4818 } else { 4819 down_read(&fs_info->commit_root_sem); 4820 } 4821 } 4822 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 4823 } 4824 path->keep_locks = false; 4825 4826 if (ret < 0) 4827 goto done; 4828 4829 nritems = btrfs_header_nritems(path->nodes[0]); 4830 /* 4831 * By releasing the path above we dropped all our locks. A balance 4832 * could have happened and 4833 * 4834 * 1. added more items after the previous last item 4835 * 2. deleted the previous last item 4836 * 4837 * So, check again here and advance the path if there are now more 4838 * items available. 4839 */ 4840 if (nritems > 0 && path->slots[0] <= nritems - 1) { 4841 if (ret == 0 && path->slots[0] != nritems - 1) { 4842 path->slots[0]++; 4843 goto done; 4844 } else if (ret > 0) { 4845 ret = 0; 4846 goto done; 4847 } 4848 } 4849 4850 while (level < BTRFS_MAX_LEVEL) { 4851 if (!path->nodes[level]) { 4852 ret = 1; 4853 goto done; 4854 } 4855 4856 slot = path->slots[level] + 1; 4857 c = path->nodes[level]; 4858 if (slot >= btrfs_header_nritems(c)) { 4859 level++; 4860 if (level == BTRFS_MAX_LEVEL) { 4861 ret = 1; 4862 goto done; 4863 } 4864 continue; 4865 } 4866 4867 4868 /* 4869 * Our current level is where we're going to start from, and to 4870 * make sure lockdep doesn't complain we need to drop our locks 4871 * and nodes from 0 to our current level. 4872 */ 4873 for (i = 0; i < level; i++) { 4874 if (path->locks[level]) { 4875 btrfs_tree_read_unlock(path->nodes[i]); 4876 path->locks[i] = 0; 4877 } 4878 free_extent_buffer(path->nodes[i]); 4879 path->nodes[i] = NULL; 4880 } 4881 4882 next = c; 4883 ret = read_block_for_search(root, path, &next, slot, &key); 4884 if (ret == -EAGAIN && !path->nowait) 4885 goto again; 4886 4887 if (ret < 0) { 4888 btrfs_release_path(path); 4889 goto done; 4890 } 4891 4892 if (!path->skip_locking) { 4893 ret = btrfs_try_tree_read_lock(next); 4894 if (!ret && path->nowait) { 4895 ret = -EAGAIN; 4896 goto done; 4897 } 4898 if (!ret && time_seq) { 4899 /* 4900 * If we don't get the lock, we may be racing 4901 * with push_leaf_left, holding that lock while 4902 * itself waiting for the leaf we've currently 4903 * locked. To solve this situation, we give up 4904 * on our lock and cycle. 4905 */ 4906 free_extent_buffer(next); 4907 btrfs_release_path(path); 4908 cond_resched(); 4909 goto again; 4910 } 4911 if (!ret) 4912 btrfs_tree_read_lock(next); 4913 } 4914 break; 4915 } 4916 path->slots[level] = slot; 4917 while (1) { 4918 level--; 4919 path->nodes[level] = next; 4920 path->slots[level] = 0; 4921 if (!path->skip_locking) 4922 path->locks[level] = BTRFS_READ_LOCK; 4923 if (!level) 4924 break; 4925 4926 ret = read_block_for_search(root, path, &next, 0, &key); 4927 if (ret == -EAGAIN && !path->nowait) 4928 goto again; 4929 4930 if (ret < 0) { 4931 btrfs_release_path(path); 4932 goto done; 4933 } 4934 4935 if (!path->skip_locking) { 4936 if (path->nowait) { 4937 if (!btrfs_try_tree_read_lock(next)) { 4938 ret = -EAGAIN; 4939 goto done; 4940 } 4941 } else { 4942 btrfs_tree_read_lock(next); 4943 } 4944 } 4945 } 4946 ret = 0; 4947 done: 4948 unlock_up(path, 0, 1, 0, NULL); 4949 if (need_commit_sem) { 4950 int ret2; 4951 4952 path->need_commit_sem = true; 4953 ret2 = finish_need_commit_sem_search(path); 4954 up_read(&fs_info->commit_root_sem); 4955 if (ret2) 4956 ret = ret2; 4957 } 4958 4959 return ret; 4960 } 4961 4962 int btrfs_next_old_item(struct btrfs_root *root, struct btrfs_path *path, u64 time_seq) 4963 { 4964 path->slots[0]++; 4965 if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) 4966 return btrfs_next_old_leaf(root, path, time_seq); 4967 return 0; 4968 } 4969 4970 /* 4971 * this uses btrfs_prev_leaf to walk backwards in the tree, and keeps 4972 * searching until it gets past min_objectid or finds an item of 'type' 4973 * 4974 * returns 0 if something is found, 1 if nothing was found and < 0 on error 4975 */ 4976 int btrfs_previous_item(struct btrfs_root *root, 4977 struct btrfs_path *path, u64 min_objectid, 4978 int type) 4979 { 4980 struct btrfs_key found_key; 4981 struct extent_buffer *leaf; 4982 u32 nritems; 4983 int ret; 4984 4985 while (1) { 4986 if (path->slots[0] == 0) { 4987 ret = btrfs_prev_leaf(root, path); 4988 if (ret != 0) 4989 return ret; 4990 } else { 4991 path->slots[0]--; 4992 } 4993 leaf = path->nodes[0]; 4994 nritems = btrfs_header_nritems(leaf); 4995 if (nritems == 0) 4996 return 1; 4997 if (path->slots[0] == nritems) 4998 path->slots[0]--; 4999 5000 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 5001 if (found_key.objectid < min_objectid) 5002 break; 5003 if (found_key.type == type) 5004 return 0; 5005 if (found_key.objectid == min_objectid && 5006 found_key.type < type) 5007 break; 5008 } 5009 return 1; 5010 } 5011 5012 /* 5013 * search in extent tree to find a previous Metadata/Data extent item with 5014 * min objecitd. 5015 * 5016 * returns 0 if something is found, 1 if nothing was found and < 0 on error 5017 */ 5018 int btrfs_previous_extent_item(struct btrfs_root *root, 5019 struct btrfs_path *path, u64 min_objectid) 5020 { 5021 struct btrfs_key found_key; 5022 struct extent_buffer *leaf; 5023 u32 nritems; 5024 int ret; 5025 5026 while (1) { 5027 if (path->slots[0] == 0) { 5028 ret = btrfs_prev_leaf(root, path); 5029 if (ret != 0) 5030 return ret; 5031 } else { 5032 path->slots[0]--; 5033 } 5034 leaf = path->nodes[0]; 5035 nritems = btrfs_header_nritems(leaf); 5036 if (nritems == 0) 5037 return 1; 5038 if (path->slots[0] == nritems) 5039 path->slots[0]--; 5040 5041 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 5042 if (found_key.objectid < min_objectid) 5043 break; 5044 if (found_key.type == BTRFS_EXTENT_ITEM_KEY || 5045 found_key.type == BTRFS_METADATA_ITEM_KEY) 5046 return 0; 5047 if (found_key.objectid == min_objectid && 5048 found_key.type < BTRFS_EXTENT_ITEM_KEY) 5049 break; 5050 } 5051 return 1; 5052 } 5053 5054 int __init btrfs_ctree_init(void) 5055 { 5056 btrfs_path_cachep = KMEM_CACHE(btrfs_path, 0); 5057 if (!btrfs_path_cachep) 5058 return -ENOMEM; 5059 return 0; 5060 } 5061 5062 void __cold btrfs_ctree_exit(void) 5063 { 5064 kmem_cache_destroy(btrfs_path_cachep); 5065 } 5066