// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007,2008 Oracle.  All rights reserved.
 */

#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/rbtree.h>
#include <linux/mm.h>
#include <linux/error-injection.h>
#include "messages.h"
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "print-tree.h"
#include "locking.h"
#include "volumes.h"
#include "qgroup.h"
#include "tree-mod-log.h"
#include "tree-checker.h"
#include "fs.h"
#include "accessors.h"
#include "extent-tree.h"
#include "relocation.h"
#include "file-item.h"

static struct kmem_cache *btrfs_path_cachep;

static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
		      *root, struct btrfs_path *path, int level);
static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		      const struct btrfs_key *ins_key, struct btrfs_path *path,
		      int data_size, int extend);
static int push_node_left(struct btrfs_trans_handle *trans,
			  struct extent_buffer *dst,
			  struct extent_buffer *src, int empty);
static int balance_node_right(struct btrfs_trans_handle *trans,
			      struct extent_buffer *dst_buf,
			      struct extent_buffer *src_buf);

static const struct btrfs_csums {
	u16		size;
	const char	name[10];
	const char	driver[12];
} btrfs_csums[] = {
	[BTRFS_CSUM_TYPE_CRC32] = { .size = 4, .name = "crc32c" },
	[BTRFS_CSUM_TYPE_XXHASH] = { .size = 8, .name = "xxhash64" },
	[BTRFS_CSUM_TYPE_SHA256] = { .size = 32, .name = "sha256" },
	[BTRFS_CSUM_TYPE_BLAKE2] = { .size = 32, .name = "blake2b",
				     .driver = "blake2b-256" },
};

/*
 * The leaf data grows from end-to-front in the node.  This returns the address
 * of the start of the last item, which is the stop of the leaf data stack.
 */
static unsigned int leaf_data_end(const struct extent_buffer *leaf)
{
	u32 nr = btrfs_header_nritems(leaf);

	if (nr == 0)
		return BTRFS_LEAF_DATA_SIZE(leaf->fs_info);
	return btrfs_item_offset(leaf, nr - 1);
}
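/*
 * A rough sketch of the leaf layout assumed by the move/copy helpers below:
 *
 *	[header][item 0][item 1]...[item N-1] ...free space... [data N-1]...[data 0]
 *
 * Item data offsets stored in struct btrfs_item are relative to the end of
 * the leaf header, so converting them to offsets within the extent buffer
 * means adding btrfs_item_nr_offset(leaf, 0), the offset of the first item.
 */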
/*
 * Move data in a @leaf (using memmove, safe for overlapping ranges).
 *
 * @leaf:	leaf that we're doing a memmove on
 * @dst_offset:	item data offset we're moving to
 * @src_offset:	item data offset we're moving from
 * @len:	length of the data we're moving
 *
 * Wrapper around memmove_extent_buffer() that takes into account the header on
 * the leaf.  The btrfs_item offsets start directly after the header, so we
 * have to adjust any offsets to account for the header in the leaf.  This
 * handles that math to simplify the callers.
 */
static inline void memmove_leaf_data(const struct extent_buffer *leaf,
				     unsigned long dst_offset,
				     unsigned long src_offset,
				     unsigned long len)
{
	memmove_extent_buffer(leaf, btrfs_item_nr_offset(leaf, 0) + dst_offset,
			      btrfs_item_nr_offset(leaf, 0) + src_offset, len);
}

/*
 * Copy item data from @src into @dst at the given offsets.
 *
 * @dst:	destination leaf that we're copying into
 * @src:	source leaf that we're copying from
 * @dst_offset:	item data offset we're copying to
 * @src_offset:	item data offset we're copying from
 * @len:	length of the data we're copying
 *
 * Wrapper around copy_extent_buffer() that takes into account the header on
 * the leaf.  The btrfs_item offsets start directly after the header, so we
 * have to adjust any offsets to account for the header in the leaf.  This
 * handles that math to simplify the callers.
 */
static inline void copy_leaf_data(const struct extent_buffer *dst,
				  const struct extent_buffer *src,
				  unsigned long dst_offset,
				  unsigned long src_offset, unsigned long len)
{
	copy_extent_buffer(dst, src, btrfs_item_nr_offset(dst, 0) + dst_offset,
			   btrfs_item_nr_offset(src, 0) + src_offset, len);
}

/*
 * Move items in a @leaf (using memmove).
 *
 * @dst:	destination leaf for the items
 * @dst_item:	the item nr we're copying into
 * @src_item:	the item nr we're copying from
 * @nr_items:	the number of items to copy
 *
 * Wrapper around memmove_extent_buffer() that does the math to get the
 * appropriate offsets into the leaf from the item numbers.
 */
static inline void memmove_leaf_items(const struct extent_buffer *leaf,
				      int dst_item, int src_item, int nr_items)
{
	memmove_extent_buffer(leaf, btrfs_item_nr_offset(leaf, dst_item),
			      btrfs_item_nr_offset(leaf, src_item),
			      nr_items * sizeof(struct btrfs_item));
}

/*
 * Copy items from @src into @dst at the given item positions.
 *
 * @dst:	destination leaf for the items
 * @src:	source leaf for the items
 * @dst_item:	the item nr we're copying into
 * @src_item:	the item nr we're copying from
 * @nr_items:	the number of items to copy
 *
 * Wrapper around copy_extent_buffer() that does the math to get the
 * appropriate offsets into the leaf from the item numbers.
 */
static inline void copy_leaf_items(const struct extent_buffer *dst,
				   const struct extent_buffer *src,
				   int dst_item, int src_item, int nr_items)
{
	copy_extent_buffer(dst, src, btrfs_item_nr_offset(dst, dst_item),
			   btrfs_item_nr_offset(src, src_item),
			   nr_items * sizeof(struct btrfs_item));
}

/* This exists for btrfs-progs usages. */
u16 btrfs_csum_type_size(u16 type)
{
	return btrfs_csums[type].size;
}

int btrfs_super_csum_size(const struct btrfs_super_block *s)
{
	u16 t = btrfs_super_csum_type(s);

	/* csum type is validated at mount time */
	return btrfs_csum_type_size(t);
}

const char *btrfs_super_csum_name(u16 csum_type)
{
	/* csum type is validated at mount time */
	return btrfs_csums[csum_type].name;
}

/*
 * Return driver name if defined, otherwise the name that's also a valid driver
 * name.
 */
const char *btrfs_super_csum_driver(u16 csum_type)
{
	/* csum type is validated at mount time */
	return btrfs_csums[csum_type].driver[0] ?
		btrfs_csums[csum_type].driver :
		btrfs_csums[csum_type].name;
}

size_t __attribute_const__ btrfs_get_num_csums(void)
{
	return ARRAY_SIZE(btrfs_csums);
}

struct btrfs_path *btrfs_alloc_path(void)
{
	might_sleep();

	return kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS);
}

/* This also releases the path. */
void btrfs_free_path(struct btrfs_path *p)
{
	if (!p)
		return;
	btrfs_release_path(p);
	kmem_cache_free(btrfs_path_cachep, p);
}
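/*
 * Typical path lifetime (sketch):
 *
 *	struct btrfs_path *path;
 *
 *	path = btrfs_alloc_path();
 *	if (!path)
 *		return -ENOMEM;
 *	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 *	...
 *	btrfs_free_path(path);
 */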
/*
 * Path release drops references on the extent buffers in the path
 * and it drops any locks held by this path.
 *
 * It is safe to call this on paths that have no locks or extent buffers held.
 */
noinline void btrfs_release_path(struct btrfs_path *p)
{
	int i;

	for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
		p->slots[i] = 0;
		if (!p->nodes[i])
			continue;
		if (p->locks[i]) {
			btrfs_tree_unlock_rw(p->nodes[i], p->locks[i]);
			p->locks[i] = 0;
		}
		free_extent_buffer(p->nodes[i]);
		p->nodes[i] = NULL;
	}
}

/*
 * We want the transaction abort to print stack trace only for errors where the
 * cause could be a bug, e.g. due to ENOSPC, and not for common errors that are
 * caused by external factors.
 */
bool __cold abort_should_print_stack(int errno)
{
	switch (errno) {
	case -EIO:
	case -EROFS:
	case -ENOMEM:
		return false;
	}
	return true;
}

/*
 * Safely gets a reference on the root node of a tree.  A lock
 * is not taken, so a concurrent writer may put a different node
 * at the root of the tree.  See btrfs_lock_root_node() for the
 * looping required.
 *
 * The extent buffer returned by this has a reference taken, so
 * it won't disappear.  It may stop being the root of the tree
 * at any time because there are no locks held.
 */
struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
{
	struct extent_buffer *eb;

	while (1) {
		rcu_read_lock();
		eb = rcu_dereference(root->node);

		/*
		 * RCU really hurts here, we could free up the root node because
		 * it was COWed but we may not get the new root node yet so do
		 * the inc_not_zero dance and if it doesn't work then
		 * synchronize_rcu and try again.
		 */
		if (atomic_inc_not_zero(&eb->refs)) {
			rcu_read_unlock();
			break;
		}
		rcu_read_unlock();
		synchronize_rcu();
	}
	return eb;
}
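/*
 * Usage sketch: the buffer returned by btrfs_root_node() is pinned by the
 * extra reference but not locked, so callers that need a stable root have
 * to lock it and then check that it is still the root, e.g.:
 *
 *	eb = btrfs_root_node(root);
 *	btrfs_tree_lock(eb);
 *	if (eb != root->node) {
 *		// lost the race: unlock, drop the reference and retry
 *	}
 */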
/*
 * Cow-only roots (not-shareable trees, everything not subvolume or reloc
 * roots) just get put onto a simple dirty list.  The transaction walks this
 * list to make sure they get properly updated on disk.
 */
static void add_root_to_dirty_list(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;

	if (test_bit(BTRFS_ROOT_DIRTY, &root->state) ||
	    !test_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state))
		return;

	spin_lock(&fs_info->trans_lock);
	if (!test_and_set_bit(BTRFS_ROOT_DIRTY, &root->state)) {
		/* Want the extent tree to be the last on the list */
		if (root->root_key.objectid == BTRFS_EXTENT_TREE_OBJECTID)
			list_move_tail(&root->dirty_list,
				       &fs_info->dirty_cowonly_roots);
		else
			list_move(&root->dirty_list,
				  &fs_info->dirty_cowonly_roots);
	}
	spin_unlock(&fs_info->trans_lock);
}

/*
 * Used by snapshot creation to make a copy of a root for a tree with
 * a given objectid.  The buffer with the new root node is returned in
 * cow_ret, and this function returns zero on success or a negative error code.
 */
int btrfs_copy_root(struct btrfs_trans_handle *trans,
		    struct btrfs_root *root,
		    struct extent_buffer *buf,
		    struct extent_buffer **cow_ret, u64 new_root_objectid)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *cow;
	int ret = 0;
	int level;
	struct btrfs_disk_key disk_key;

	WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
		trans->transid != fs_info->running_transaction->transid);
	WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
		trans->transid != root->last_trans);

	level = btrfs_header_level(buf);
	if (level == 0)
		btrfs_item_key(buf, &disk_key, 0);
	else
		btrfs_node_key(buf, &disk_key, 0);

	cow = btrfs_alloc_tree_block(trans, root, 0, new_root_objectid,
				     &disk_key, level, buf->start, 0,
				     BTRFS_NESTING_NEW_ROOT);
	if (IS_ERR(cow))
		return PTR_ERR(cow);

	copy_extent_buffer_full(cow, buf);
	btrfs_set_header_bytenr(cow, cow->start);
	btrfs_set_header_generation(cow, trans->transid);
	btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
				     BTRFS_HEADER_FLAG_RELOC);
	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
		btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
	else
		btrfs_set_header_owner(cow, new_root_objectid);

	write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid);

	WARN_ON(btrfs_header_generation(buf) > trans->transid);
	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
		ret = btrfs_inc_ref(trans, root, cow, 1);
	else
		ret = btrfs_inc_ref(trans, root, cow, 0);
	if (ret) {
		btrfs_tree_unlock(cow);
		free_extent_buffer(cow);
		btrfs_abort_transaction(trans, ret);
		return ret;
	}

	btrfs_mark_buffer_dirty(cow);
	*cow_ret = cow;
	return 0;
}

/*
 * Check if the tree block can be shared by multiple trees.
 */
int btrfs_block_can_be_shared(struct btrfs_root *root,
			      struct extent_buffer *buf)
{
	/*
	 * Tree blocks not in shareable trees and tree roots are never shared.
	 * If a block was allocated after the last snapshot and the block was
	 * not allocated by tree relocation, we know the block is not shared.
	 */
	if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
	    buf != root->node && buf != root->commit_root &&
	    (btrfs_header_generation(buf) <=
	     btrfs_root_last_snapshot(&root->root_item) ||
	     btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)))
		return 1;

	return 0;
}
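/*
 * Example (sketch): if a subvolume was last snapshotted in transaction 100
 * and a block in it carries header generation 90, the block predates the
 * snapshot and is referenced from both trees, so btrfs_block_can_be_shared()
 * returns 1 and the COW path has to look up the real reference count in the
 * extent tree.
 */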
static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
				       struct btrfs_root *root,
				       struct extent_buffer *buf,
				       struct extent_buffer *cow,
				       int *last_ref)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 refs;
	u64 owner;
	u64 flags;
	u64 new_flags = 0;
	int ret;

	/*
	 * Backrefs update rules:
	 *
	 * Always use full backrefs for extent pointers in tree blocks
	 * allocated by tree relocation.
	 *
	 * If a shared tree block is no longer referenced by its owner
	 * tree (btrfs_header_owner(buf) == root->root_key.objectid),
	 * use full backrefs for extent pointers in the tree block.
	 *
	 * If a tree block is being relocated
	 * (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID),
	 * use full backrefs for extent pointers in the tree block.
	 * The reason for this is that some operations (such as drop tree)
	 * are only allowed for blocks that use full backrefs.
	 */

	if (btrfs_block_can_be_shared(root, buf)) {
		ret = btrfs_lookup_extent_info(trans, fs_info, buf->start,
					       btrfs_header_level(buf), 1,
					       &refs, &flags);
		if (ret)
			return ret;
		if (unlikely(refs == 0)) {
			btrfs_crit(fs_info,
		"found 0 references for tree block at bytenr %llu level %d root %llu",
				   buf->start, btrfs_header_level(buf),
				   btrfs_root_id(root));
			ret = -EUCLEAN;
			btrfs_abort_transaction(trans, ret);
			return ret;
		}
	} else {
		refs = 1;
		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
		    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
			flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
		else
			flags = 0;
	}

	owner = btrfs_header_owner(buf);
	BUG_ON(owner == BTRFS_TREE_RELOC_OBJECTID &&
	       !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));

	if (refs > 1) {
		if ((owner == root->root_key.objectid ||
		     root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) &&
		    !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
			ret = btrfs_inc_ref(trans, root, buf, 1);
			if (ret)
				return ret;

			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID) {
				ret = btrfs_dec_ref(trans, root, buf, 0);
				if (ret)
					return ret;
				ret = btrfs_inc_ref(trans, root, cow, 1);
				if (ret)
					return ret;
			}
			new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
		} else {

			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID)
				ret = btrfs_inc_ref(trans, root, cow, 1);
			else
				ret = btrfs_inc_ref(trans, root, cow, 0);
			if (ret)
				return ret;
		}
		if (new_flags != 0) {
			ret = btrfs_set_disk_extent_flags(trans, buf, new_flags);
			if (ret)
				return ret;
		}
	} else {
		if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID)
				ret = btrfs_inc_ref(trans, root, cow, 1);
			else
				ret = btrfs_inc_ref(trans, root, cow, 0);
			if (ret)
				return ret;
			ret = btrfs_dec_ref(trans, root, buf, 1);
			if (ret)
				return ret;
		}
		btrfs_clear_buffer_dirty(trans, buf);
		*last_ref = 1;
	}
	return 0;
}
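/*
 * In short (a sketch of the cases handled above): while the old block is
 * still shared (refs > 1) the new copy gets its own references and, when
 * needed, the old block is converted to full backrefs; once we hold the
 * last reference (refs == 1) the references are moved from @buf to @cow
 * and *last_ref tells the caller that @buf can be freed.
 */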
/*
 * Does the dirty work of COWing a single block.  The parent block (if
 * supplied) is updated to point to the new cow copy.  The new buffer is marked
 * dirty and returned locked.  If you modify the block it needs to be marked
 * dirty again.
 *
 * search_start -- an allocation hint for the new block
 *
 * empty_size -- a hint that you plan on doing more cow.  This is the size in
 * bytes the allocator should try to find free next to the block it returns.
 * This is just a hint and may be ignored by the allocator.
 */
static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root,
				      struct extent_buffer *buf,
				      struct extent_buffer *parent, int parent_slot,
				      struct extent_buffer **cow_ret,
				      u64 search_start, u64 empty_size,
				      enum btrfs_lock_nesting nest)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_disk_key disk_key;
	struct extent_buffer *cow;
	int level, ret;
	int last_ref = 0;
	int unlock_orig = 0;
	u64 parent_start = 0;

	if (*cow_ret == buf)
		unlock_orig = 1;

	btrfs_assert_tree_write_locked(buf);

	WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
		trans->transid != fs_info->running_transaction->transid);
	WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
		trans->transid != root->last_trans);

	level = btrfs_header_level(buf);

	if (level == 0)
		btrfs_item_key(buf, &disk_key, 0);
	else
		btrfs_node_key(buf, &disk_key, 0);

	if ((root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && parent)
		parent_start = parent->start;

	cow = btrfs_alloc_tree_block(trans, root, parent_start,
				     root->root_key.objectid, &disk_key, level,
				     search_start, empty_size, nest);
	if (IS_ERR(cow))
		return PTR_ERR(cow);

	/* cow is set to blocking by btrfs_init_new_buffer */

	copy_extent_buffer_full(cow, buf);
	btrfs_set_header_bytenr(cow, cow->start);
	btrfs_set_header_generation(cow, trans->transid);
	btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
				     BTRFS_HEADER_FLAG_RELOC);
	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
		btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
	else
		btrfs_set_header_owner(cow, root->root_key.objectid);

	write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid);

	ret = update_ref_for_cow(trans, root, buf, cow, &last_ref);
	if (ret) {
		btrfs_tree_unlock(cow);
		free_extent_buffer(cow);
		btrfs_abort_transaction(trans, ret);
		return ret;
	}

	if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) {
		ret = btrfs_reloc_cow_block(trans, root, buf, cow);
		if (ret) {
			btrfs_tree_unlock(cow);
			free_extent_buffer(cow);
			btrfs_abort_transaction(trans, ret);
			return ret;
		}
	}

	if (buf == root->node) {
		WARN_ON(parent && parent != buf);
		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
		    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
			parent_start = buf->start;

		ret = btrfs_tree_mod_log_insert_root(root->node, cow, true);
		if (ret < 0) {
			btrfs_tree_unlock(cow);
			free_extent_buffer(cow);
			btrfs_abort_transaction(trans, ret);
			return ret;
		}
		atomic_inc(&cow->refs);
		rcu_assign_pointer(root->node, cow);

		btrfs_free_tree_block(trans, btrfs_root_id(root), buf,
				      parent_start, last_ref);
		free_extent_buffer(buf);
		add_root_to_dirty_list(root);
	} else {
		WARN_ON(trans->transid != btrfs_header_generation(parent));
		ret = btrfs_tree_mod_log_insert_key(parent, parent_slot,
						    BTRFS_MOD_LOG_KEY_REPLACE);
		if (ret) {
			btrfs_tree_unlock(cow);
			free_extent_buffer(cow);
			btrfs_abort_transaction(trans, ret);
			return ret;
		}
		btrfs_set_node_blockptr(parent, parent_slot,
					cow->start);
		btrfs_set_node_ptr_generation(parent, parent_slot,
					      trans->transid);
		btrfs_mark_buffer_dirty(parent);
		if (last_ref) {
			ret = btrfs_tree_mod_log_free_eb(buf);
			if (ret) {
				btrfs_tree_unlock(cow);
				free_extent_buffer(cow);
				btrfs_abort_transaction(trans, ret);
				return ret;
			}
		}
		btrfs_free_tree_block(trans, btrfs_root_id(root), buf,
				      parent_start, last_ref);
	}
	if (unlock_orig)
		btrfs_tree_unlock(buf);
	free_extent_buffer_stale(buf);
	btrfs_mark_buffer_dirty(cow);
	*cow_ret = cow;
	return 0;
}

static inline int should_cow_block(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root,
				   struct extent_buffer *buf)
{
	if (btrfs_is_testing(root->fs_info))
		return 0;

	/* Ensure we can see the FORCE_COW bit */
	smp_mb__before_atomic();

	/*
	 * We do not need to cow a block if
	 * 1) this block is not created or changed in this transaction;
	 * 2) this block does not belong to TREE_RELOC tree;
	 * 3) the root is not forced COW.
	 *
	 * What is forced COW:
	 *    when we create a snapshot during committing the transaction,
	 *    after we've finished copying the src root, we must COW the shared
	 *    block to ensure the metadata consistency.
	 */
	if (btrfs_header_generation(buf) == trans->transid &&
	    !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) &&
	    !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
	      btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) &&
	    !test_bit(BTRFS_ROOT_FORCE_COW, &root->state))
		return 0;
	return 1;
}
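/*
 * Example (sketch): a block that was already COWed earlier in the current
 * transaction (header generation == trans->transid) and has not been
 * written back yet can simply be modified in place, so should_cow_block()
 * returns 0 and btrfs_cow_block() below just hands back the same buffer.
 */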
/*
 * COWs a single block, see __btrfs_cow_block() for the real work.
 * This version of it has extra checks so that a block isn't COWed more than
 * once per transaction, as long as it hasn't been written yet.
 */
noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
		    struct btrfs_root *root, struct extent_buffer *buf,
		    struct extent_buffer *parent, int parent_slot,
		    struct extent_buffer **cow_ret,
		    enum btrfs_lock_nesting nest)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 search_start;
	int ret;

	if (test_bit(BTRFS_ROOT_DELETING, &root->state))
		btrfs_err(fs_info,
			  "COW'ing blocks on a fs root that's being dropped");

	if (trans->transaction != fs_info->running_transaction)
		WARN(1, KERN_CRIT "trans %llu running %llu\n",
		     trans->transid,
		     fs_info->running_transaction->transid);

	if (trans->transid != fs_info->generation)
		WARN(1, KERN_CRIT "trans %llu running %llu\n",
		     trans->transid, fs_info->generation);

	if (!should_cow_block(trans, root, buf)) {
		*cow_ret = buf;
		return 0;
	}

	search_start = buf->start & ~((u64)SZ_1G - 1);

	/*
	 * Before CoWing this block for later modification, check if it's
	 * the subtree root and do the delayed subtree trace if needed.
	 *
	 * Also, we don't care about the error, as it's handled internally.
	 */
	btrfs_qgroup_trace_subtree_after_cow(trans, root, buf);
	ret = __btrfs_cow_block(trans, root, buf, parent,
				parent_slot, cow_ret, search_start, 0, nest);

	trace_btrfs_cow_block(root, buf, *cow_ret);

	return ret;
}
ALLOW_ERROR_INJECTION(btrfs_cow_block, ERRNO);

/*
 * Helper function for defrag to decide if two blocks pointed to by a
 * node are actually close by.
 */
static int close_blocks(u64 blocknr, u64 other, u32 blocksize)
{
	if (blocknr < other && other - (blocknr + blocksize) < 32768)
		return 1;
	if (blocknr > other && blocknr - (other + blocksize) < 32768)
		return 1;
	return 0;
}

#ifdef __LITTLE_ENDIAN

/*
 * Compare two keys, on little-endian the disk order is same as CPU order and
 * we can avoid the conversion.
 */
static int comp_keys(const struct btrfs_disk_key *disk_key,
		     const struct btrfs_key *k2)
{
	const struct btrfs_key *k1 = (const struct btrfs_key *)disk_key;

	return btrfs_comp_cpu_keys(k1, k2);
}

#else

/*
 * Compare two keys in a memcmp fashion.
 */
static int comp_keys(const struct btrfs_disk_key *disk,
		     const struct btrfs_key *k2)
{
	struct btrfs_key k1;

	btrfs_disk_key_to_cpu(&k1, disk);

	return btrfs_comp_cpu_keys(&k1, k2);
}
#endif

/*
 * Same as comp_keys() only with two btrfs_key's.
 */
int __pure btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2)
{
	if (k1->objectid > k2->objectid)
		return 1;
	if (k1->objectid < k2->objectid)
		return -1;
	if (k1->type > k2->type)
		return 1;
	if (k1->type < k2->type)
		return -1;
	if (k1->offset > k2->offset)
		return 1;
	if (k1->offset < k2->offset)
		return -1;
	return 0;
}
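/*
 * Example (sketch): keys sort by (objectid, type, offset), so
 * { .objectid = 257, .type = BTRFS_EXTENT_DATA_KEY, .offset = 0 } sorts
 * before { .objectid = 257, .type = BTRFS_EXTENT_DATA_KEY, .offset = 4096 },
 * and both sort before any key with objectid 258.
 */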
/*
 * this is used by the defrag code to go through all the
 * leaves pointed to by a node and reallocate them so that
 * disk order is close to key order
 */
int btrfs_realloc_node(struct btrfs_trans_handle *trans,
		       struct btrfs_root *root, struct extent_buffer *parent,
		       int start_slot, u64 *last_ret,
		       struct btrfs_key *progress)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *cur;
	u64 blocknr;
	u64 search_start = *last_ret;
	u64 last_block = 0;
	u64 other;
	u32 parent_nritems;
	int end_slot;
	int i;
	int err = 0;
	u32 blocksize;
	int progress_passed = 0;
	struct btrfs_disk_key disk_key;

	WARN_ON(trans->transaction != fs_info->running_transaction);
	WARN_ON(trans->transid != fs_info->generation);

	parent_nritems = btrfs_header_nritems(parent);
	blocksize = fs_info->nodesize;
	end_slot = parent_nritems - 1;

	if (parent_nritems <= 1)
		return 0;

	for (i = start_slot; i <= end_slot; i++) {
		int close = 1;

		btrfs_node_key(parent, &disk_key, i);
		if (!progress_passed && comp_keys(&disk_key, progress) < 0)
			continue;

		progress_passed = 1;
		blocknr = btrfs_node_blockptr(parent, i);
		if (last_block == 0)
			last_block = blocknr;

		if (i > 0) {
			other = btrfs_node_blockptr(parent, i - 1);
			close = close_blocks(blocknr, other, blocksize);
		}
		if (!close && i < end_slot) {
			other = btrfs_node_blockptr(parent, i + 1);
			close = close_blocks(blocknr, other, blocksize);
		}
		if (close) {
			last_block = blocknr;
			continue;
		}

		cur = btrfs_read_node_slot(parent, i);
		if (IS_ERR(cur))
			return PTR_ERR(cur);
		if (search_start == 0)
			search_start = last_block;

		btrfs_tree_lock(cur);
		err = __btrfs_cow_block(trans, root, cur, parent, i,
					&cur, search_start,
					min(16 * blocksize,
					    (end_slot - i) * blocksize),
					BTRFS_NESTING_COW);
		if (err) {
			btrfs_tree_unlock(cur);
			free_extent_buffer(cur);
			break;
		}
		search_start = cur->start;
		last_block = cur->start;
		*last_ret = search_start;
		btrfs_tree_unlock(cur);
		free_extent_buffer(cur);
	}
	return err;
}

/*
 * Search for a key in the given extent_buffer.
 *
 * The lower boundary for the search is specified by the slot number @first_slot.
 * Use a value of 0 to search over the whole extent buffer.  Works for both
 * leaves and nodes.
 *
 * The slot in the extent buffer is returned via @slot.  If the key exists in the
 * extent buffer, then @slot will point to the slot where the key is, otherwise
 * it points to the slot where you would insert the key.
 *
 * Slot may point to the total number of items (i.e. one position beyond the last
 * key) if the key is bigger than the last key in the extent buffer.
 */
int btrfs_bin_search(struct extent_buffer *eb, int first_slot,
		     const struct btrfs_key *key, int *slot)
{
	unsigned long p;
	int item_size;
	/*
	 * Use unsigned types for the low and high slots, so that we get a more
	 * efficient division in the search loop below.
	 */
	u32 low = first_slot;
	u32 high = btrfs_header_nritems(eb);
	int ret;
	const int key_size = sizeof(struct btrfs_disk_key);

	if (unlikely(low > high)) {
		btrfs_err(eb->fs_info,
		 "%s: low (%u) > high (%u) eb %llu owner %llu level %d",
			  __func__, low, high, eb->start,
			  btrfs_header_owner(eb), btrfs_header_level(eb));
		return -EINVAL;
	}

	if (btrfs_header_level(eb) == 0) {
		p = offsetof(struct btrfs_leaf, items);
		item_size = sizeof(struct btrfs_item);
	} else {
		p = offsetof(struct btrfs_node, ptrs);
		item_size = sizeof(struct btrfs_key_ptr);
	}

	while (low < high) {
		unsigned long oip;
		unsigned long offset;
		struct btrfs_disk_key *tmp;
		struct btrfs_disk_key unaligned;
		int mid;

		mid = (low + high) / 2;
		offset = p + mid * item_size;
		oip = offset_in_page(offset);

		if (oip + key_size <= PAGE_SIZE) {
			const unsigned long idx = get_eb_page_index(offset);
			char *kaddr = page_address(eb->pages[idx]);

			oip = get_eb_offset_in_page(eb, offset);
			tmp = (struct btrfs_disk_key *)(kaddr + oip);
		} else {
			read_extent_buffer(eb, &unaligned, offset, key_size);
			tmp = &unaligned;
		}

		ret = comp_keys(tmp, key);

		if (ret < 0)
			low = mid + 1;
		else if (ret > 0)
			high = mid;
		else {
			*slot = mid;
			return 0;
		}
	}
	*slot = low;
	return 1;
}
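/*
 * Contract example for btrfs_bin_search() above (sketch): searching a leaf
 * with keys { A, C, D } for C returns 0 with *slot at C's position, while
 * searching for B returns 1 with *slot at C's position, i.e. the slot where
 * B would have to be inserted.
 */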
static void root_add_used(struct btrfs_root *root, u32 size)
{
	spin_lock(&root->accounting_lock);
	btrfs_set_root_used(&root->root_item,
			    btrfs_root_used(&root->root_item) + size);
	spin_unlock(&root->accounting_lock);
}

static void root_sub_used(struct btrfs_root *root, u32 size)
{
	spin_lock(&root->accounting_lock);
	btrfs_set_root_used(&root->root_item,
			    btrfs_root_used(&root->root_item) - size);
	spin_unlock(&root->accounting_lock);
}

/*
 * Given a node and slot number, this reads the block it points to.  The
 * extent buffer is returned with a reference taken (but unlocked).
 */
struct extent_buffer *btrfs_read_node_slot(struct extent_buffer *parent,
					   int slot)
{
	int level = btrfs_header_level(parent);
	struct btrfs_tree_parent_check check = { 0 };
	struct extent_buffer *eb;

	if (slot < 0 || slot >= btrfs_header_nritems(parent))
		return ERR_PTR(-ENOENT);

	ASSERT(level);

	check.level = level - 1;
	check.transid = btrfs_node_ptr_generation(parent, slot);
	check.owner_root = btrfs_header_owner(parent);
	check.has_first_key = true;
	btrfs_node_key_to_cpu(parent, &check.first_key, slot);

	eb = read_tree_block(parent->fs_info, btrfs_node_blockptr(parent, slot),
			     &check);
	if (IS_ERR(eb))
		return eb;
	if (!extent_buffer_uptodate(eb)) {
		free_extent_buffer(eb);
		return ERR_PTR(-EIO);
	}

	return eb;
}
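/*
 * Usage sketch for btrfs_read_node_slot():
 *
 *	eb = btrfs_read_node_slot(parent, slot);
 *	if (IS_ERR(eb))
 *		return PTR_ERR(eb);
 *	// use eb, typically after taking a tree lock on it
 *	free_extent_buffer(eb);
 */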
/*
 * Node level balancing, used to make sure nodes are in proper order for
 * item deletion.  We balance from the top down, so we have to make sure
 * that a deletion won't leave a node completely empty later on.
 */
static noinline int balance_level(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root,
				  struct btrfs_path *path, int level)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *right = NULL;
	struct extent_buffer *mid;
	struct extent_buffer *left = NULL;
	struct extent_buffer *parent = NULL;
	int ret = 0;
	int wret;
	int pslot;
	int orig_slot = path->slots[level];
	u64 orig_ptr;

	ASSERT(level > 0);

	mid = path->nodes[level];

	WARN_ON(path->locks[level] != BTRFS_WRITE_LOCK);
	WARN_ON(btrfs_header_generation(mid) != trans->transid);

	orig_ptr = btrfs_node_blockptr(mid, orig_slot);

	if (level < BTRFS_MAX_LEVEL - 1) {
		parent = path->nodes[level + 1];
		pslot = path->slots[level + 1];
	}

	/*
	 * deal with the case where there is only one pointer in the root
	 * by promoting the node below to a root
	 */
	if (!parent) {
		struct extent_buffer *child;

		if (btrfs_header_nritems(mid) != 1)
			return 0;

		/* promote the child to a root */
		child = btrfs_read_node_slot(mid, 0);
		if (IS_ERR(child)) {
			ret = PTR_ERR(child);
			goto out;
		}

		btrfs_tree_lock(child);
		ret = btrfs_cow_block(trans, root, child, mid, 0, &child,
				      BTRFS_NESTING_COW);
		if (ret) {
			btrfs_tree_unlock(child);
			free_extent_buffer(child);
			goto out;
		}

		ret = btrfs_tree_mod_log_insert_root(root->node, child, true);
		if (ret < 0) {
			btrfs_tree_unlock(child);
			free_extent_buffer(child);
			btrfs_abort_transaction(trans, ret);
			goto out;
		}
		rcu_assign_pointer(root->node, child);

		add_root_to_dirty_list(root);
		btrfs_tree_unlock(child);

		path->locks[level] = 0;
		path->nodes[level] = NULL;
		btrfs_clear_buffer_dirty(trans, mid);
		btrfs_tree_unlock(mid);
		/* once for the path */
		free_extent_buffer(mid);

		root_sub_used(root, mid->len);
		btrfs_free_tree_block(trans, btrfs_root_id(root), mid, 0, 1);
		/* once for the root ptr */
		free_extent_buffer_stale(mid);
		return 0;
	}
	if (btrfs_header_nritems(mid) >
	    BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 4)
		return 0;

	if (pslot) {
		left = btrfs_read_node_slot(parent, pslot - 1);
		if (IS_ERR(left)) {
			ret = PTR_ERR(left);
			left = NULL;
			goto out;
		}

		__btrfs_tree_lock(left, BTRFS_NESTING_LEFT);
		wret = btrfs_cow_block(trans, root, left,
				       parent, pslot - 1, &left,
				       BTRFS_NESTING_LEFT_COW);
		if (wret) {
			ret = wret;
			goto out;
		}
	}

	if (pslot + 1 < btrfs_header_nritems(parent)) {
		right = btrfs_read_node_slot(parent, pslot + 1);
		if (IS_ERR(right)) {
			ret = PTR_ERR(right);
			right = NULL;
			goto out;
		}

		__btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);
		wret = btrfs_cow_block(trans, root, right,
				       parent, pslot + 1, &right,
				       BTRFS_NESTING_RIGHT_COW);
		if (wret) {
			ret = wret;
			goto out;
		}
	}

	/* first, try to make some room in the middle buffer */
	if (left) {
		orig_slot += btrfs_header_nritems(left);
		wret = push_node_left(trans, left, mid, 1);
		if (wret < 0)
			ret = wret;
	}

	/*
	 * then try to empty the right most buffer into the middle
	 */
	if (right) {
		wret = push_node_left(trans, mid, right, 1);
		if (wret < 0 && wret != -ENOSPC)
			ret = wret;
		if (btrfs_header_nritems(right) == 0) {
			btrfs_clear_buffer_dirty(trans, right);
			btrfs_tree_unlock(right);
			ret = btrfs_del_ptr(trans, root, path, level + 1, pslot + 1);
			if (ret < 0) {
				free_extent_buffer_stale(right);
				right = NULL;
				goto out;
			}
			root_sub_used(root, right->len);
			btrfs_free_tree_block(trans, btrfs_root_id(root), right,
					      0, 1);
			free_extent_buffer_stale(right);
			right = NULL;
		} else {
			struct btrfs_disk_key right_key;

			btrfs_node_key(right, &right_key, 0);
			ret = btrfs_tree_mod_log_insert_key(parent, pslot + 1,
					BTRFS_MOD_LOG_KEY_REPLACE);
			if (ret < 0) {
				btrfs_abort_transaction(trans, ret);
				goto out;
			}
			btrfs_set_node_key(parent, &right_key, pslot + 1);
			btrfs_mark_buffer_dirty(parent);
		}
	}
	if (btrfs_header_nritems(mid) == 1) {
		/*
		 * We're not allowed to leave a node with one item in the
		 * tree during a delete.  A deletion from lower in the tree
		 * could try to delete the only pointer in this node.
		 * So, pull some keys from the left.
		 * There has to be a left pointer at this point because
		 * otherwise we would have pulled some pointers from the
		 * right.
		 */
		if (unlikely(!left)) {
			btrfs_crit(fs_info,
"missing left child when middle child only has 1 item, parent bytenr %llu level %d mid bytenr %llu root %llu",
				   parent->start, btrfs_header_level(parent),
				   mid->start, btrfs_root_id(root));
			ret = -EUCLEAN;
			btrfs_abort_transaction(trans, ret);
			goto out;
		}
		wret = balance_node_right(trans, mid, left);
		if (wret < 0) {
			ret = wret;
			goto out;
		}
		if (wret == 1) {
			wret = push_node_left(trans, left, mid, 1);
			if (wret < 0)
				ret = wret;
		}
		BUG_ON(wret == 1);
	}
	if (btrfs_header_nritems(mid) == 0) {
		btrfs_clear_buffer_dirty(trans, mid);
		btrfs_tree_unlock(mid);
		ret = btrfs_del_ptr(trans, root, path, level + 1, pslot);
		if (ret < 0) {
			free_extent_buffer_stale(mid);
			mid = NULL;
			goto out;
		}
		root_sub_used(root, mid->len);
		btrfs_free_tree_block(trans, btrfs_root_id(root), mid, 0, 1);
		free_extent_buffer_stale(mid);
		mid = NULL;
	} else {
		/* update the parent key to reflect our changes */
		struct btrfs_disk_key mid_key;

		btrfs_node_key(mid, &mid_key, 0);
		ret = btrfs_tree_mod_log_insert_key(parent, pslot,
						    BTRFS_MOD_LOG_KEY_REPLACE);
		if (ret < 0) {
			btrfs_abort_transaction(trans, ret);
			goto out;
		}
		btrfs_set_node_key(parent, &mid_key, pslot);
		btrfs_mark_buffer_dirty(parent);
	}

	/* update the path */
	if (left) {
		if (btrfs_header_nritems(left) > orig_slot) {
			atomic_inc(&left->refs);
			/* left was locked after cow */
			path->nodes[level] = left;
			path->slots[level + 1] -= 1;
			path->slots[level] = orig_slot;
			if (mid) {
				btrfs_tree_unlock(mid);
				free_extent_buffer(mid);
			}
		} else {
			orig_slot -= btrfs_header_nritems(left);
			path->slots[level] = orig_slot;
		}
	}
	/* double check we haven't messed things up */
	if (orig_ptr !=
	    btrfs_node_blockptr(path->nodes[level], path->slots[level]))
		BUG();
out:
	if (right) {
		btrfs_tree_unlock(right);
		free_extent_buffer(right);
	}
	if (left) {
		if (path->nodes[level] != left)
			btrfs_tree_unlock(left);
		free_extent_buffer(left);
	}
	return ret;
}
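/*
 * Note (sketch): balance_level() may replace path->nodes[level] with the
 * left sibling, or set it to NULL when the tree height shrinks, which is
 * why setup_nodes_for_search() below reloads the buffer from the path and
 * restarts the search if it went away.
 */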
/*
 * Node balancing for insertion.  Here we only split or push nodes around
 * when they are completely full.  This is also done top down, so we
 * have to be pessimistic.
 */
static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
					  struct btrfs_root *root,
					  struct btrfs_path *path, int level)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *right = NULL;
	struct extent_buffer *mid;
	struct extent_buffer *left = NULL;
	struct extent_buffer *parent = NULL;
	int ret = 0;
	int wret;
	int pslot;
	int orig_slot = path->slots[level];

	if (level == 0)
		return 1;

	mid = path->nodes[level];
	WARN_ON(btrfs_header_generation(mid) != trans->transid);

	if (level < BTRFS_MAX_LEVEL - 1) {
		parent = path->nodes[level + 1];
		pslot = path->slots[level + 1];
	}

	if (!parent)
		return 1;

	/* first, try to make some room in the middle buffer */
	if (pslot) {
		u32 left_nr;

		left = btrfs_read_node_slot(parent, pslot - 1);
		if (IS_ERR(left))
			return PTR_ERR(left);

		__btrfs_tree_lock(left, BTRFS_NESTING_LEFT);

		left_nr = btrfs_header_nritems(left);
		if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
			wret = 1;
		} else {
			ret = btrfs_cow_block(trans, root, left, parent,
					      pslot - 1, &left,
					      BTRFS_NESTING_LEFT_COW);
			if (ret)
				wret = 1;
			else {
				wret = push_node_left(trans, left, mid, 0);
			}
		}
		if (wret < 0)
			ret = wret;
		if (wret == 0) {
			struct btrfs_disk_key disk_key;

			orig_slot += left_nr;
			btrfs_node_key(mid, &disk_key, 0);
			ret = btrfs_tree_mod_log_insert_key(parent, pslot,
					BTRFS_MOD_LOG_KEY_REPLACE);
			if (ret < 0) {
				btrfs_tree_unlock(left);
				free_extent_buffer(left);
				btrfs_abort_transaction(trans, ret);
				return ret;
			}
			btrfs_set_node_key(parent, &disk_key, pslot);
			btrfs_mark_buffer_dirty(parent);
			if (btrfs_header_nritems(left) > orig_slot) {
				path->nodes[level] = left;
				path->slots[level + 1] -= 1;
				path->slots[level] = orig_slot;
				btrfs_tree_unlock(mid);
				free_extent_buffer(mid);
			} else {
				orig_slot -=
					btrfs_header_nritems(left);
				path->slots[level] = orig_slot;
				btrfs_tree_unlock(left);
				free_extent_buffer(left);
			}
			return 0;
		}
		btrfs_tree_unlock(left);
		free_extent_buffer(left);
	}

	/*
	 * then try to empty the right most buffer into the middle
	 */
	if (pslot + 1 < btrfs_header_nritems(parent)) {
		u32 right_nr;

		right = btrfs_read_node_slot(parent, pslot + 1);
		if (IS_ERR(right))
			return PTR_ERR(right);

		__btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);

		right_nr = btrfs_header_nritems(right);
		if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
			wret = 1;
		} else {
			ret = btrfs_cow_block(trans, root, right,
					      parent, pslot + 1,
					      &right, BTRFS_NESTING_RIGHT_COW);
			if (ret)
				wret = 1;
			else {
				wret = balance_node_right(trans, right, mid);
			}
		}
		if (wret < 0)
			ret = wret;
		if (wret == 0) {
			struct btrfs_disk_key disk_key;

			btrfs_node_key(right, &disk_key, 0);
			ret = btrfs_tree_mod_log_insert_key(parent, pslot + 1,
					BTRFS_MOD_LOG_KEY_REPLACE);
			if (ret < 0) {
				btrfs_tree_unlock(right);
				free_extent_buffer(right);
				btrfs_abort_transaction(trans, ret);
				return ret;
			}
			btrfs_set_node_key(parent, &disk_key, pslot + 1);
			btrfs_mark_buffer_dirty(parent);

			if (btrfs_header_nritems(mid) <= orig_slot) {
				path->nodes[level] = right;
				path->slots[level + 1] += 1;
				path->slots[level] = orig_slot -
					btrfs_header_nritems(mid);
				btrfs_tree_unlock(mid);
				free_extent_buffer(mid);
			} else {
				btrfs_tree_unlock(right);
				free_extent_buffer(right);
			}
			return 0;
		}
		btrfs_tree_unlock(right);
		free_extent_buffer(right);
	}
	return 1;
}

/*
 * readahead one full node of leaves, finding things that are close
 * to the block in 'slot', and triggering ra on them.
 */
static void reada_for_search(struct btrfs_fs_info *fs_info,
			     struct btrfs_path *path,
			     int level, int slot, u64 objectid)
{
	struct extent_buffer *node;
	struct btrfs_disk_key disk_key;
	u32 nritems;
	u64 search;
	u64 target;
	u64 nread = 0;
	u64 nread_max;
	u32 nr;
	u32 blocksize;
	u32 nscan = 0;

	if (level != 1 && path->reada != READA_FORWARD_ALWAYS)
		return;

	if (!path->nodes[level])
		return;

	node = path->nodes[level];

	/*
	 * Since the time between visiting leaves is much shorter than the time
	 * between visiting nodes, limit read ahead of nodes to 1, to avoid too
	 * much IO at once (possibly random).
	 */
	if (path->reada == READA_FORWARD_ALWAYS) {
		if (level > 1)
			nread_max = node->fs_info->nodesize;
		else
			nread_max = SZ_128K;
	} else {
		nread_max = SZ_64K;
	}

	search = btrfs_node_blockptr(node, slot);
	blocksize = fs_info->nodesize;
	if (path->reada != READA_FORWARD_ALWAYS) {
		struct extent_buffer *eb;

		eb = find_extent_buffer(fs_info, search);
		if (eb) {
			free_extent_buffer(eb);
			return;
		}
	}

	target = search;

	nritems = btrfs_header_nritems(node);
	nr = slot;

	while (1) {
		if (path->reada == READA_BACK) {
			if (nr == 0)
				break;
			nr--;
		} else if (path->reada == READA_FORWARD ||
			   path->reada == READA_FORWARD_ALWAYS) {
			nr++;
			if (nr >= nritems)
				break;
		}
		if (path->reada == READA_BACK && objectid) {
			btrfs_node_key(node, &disk_key, nr);
			if (btrfs_disk_key_objectid(&disk_key) != objectid)
				break;
		}
		search = btrfs_node_blockptr(node, nr);
		if (path->reada == READA_FORWARD_ALWAYS ||
		    (search <= target && target - search <= 65536) ||
		    (search > target && search - target <= 65536)) {
			btrfs_readahead_node_child(node, nr);
			nread += blocksize;
		}
		nscan++;
		if (nread > nread_max || nscan > 32)
			break;
	}
}

static noinline void reada_for_balance(struct btrfs_path *path, int level)
{
	struct extent_buffer *parent;
	int slot;
	int nritems;

	parent = path->nodes[level + 1];
	if (!parent)
		return;

	nritems = btrfs_header_nritems(parent);
	slot = path->slots[level + 1];

	if (slot > 0)
		btrfs_readahead_node_child(parent, slot - 1);
	if (slot + 1 < nritems)
		btrfs_readahead_node_child(parent, slot + 1);
}
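/*
 * Example (sketch): a caller that intends to walk leaves in key order can
 * opt in to readahead before searching:
 *
 *	path->reada = READA_FORWARD;
 *	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 */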
/*
 * When we walk down the tree, it is usually safe to unlock the higher layers
 * in the tree.  The exceptions are when our path goes through slot 0, because
 * operations on the tree might require changing key pointers higher up in the
 * tree.
 *
 * Callers might also have set path->keep_locks, which tells this code to keep
 * the lock if the path points to the last slot in the block.  This is part of
 * walking through the tree, and selecting the next slot in the higher block.
 *
 * lowest_unlock sets the lowest level in the tree we're allowed to unlock, so
 * if lowest_unlock is 1, level 0 won't be unlocked.
 */
static noinline void unlock_up(struct btrfs_path *path, int level,
			       int lowest_unlock, int min_write_lock_level,
			       int *write_lock_level)
{
	int i;
	int skip_level = level;
	bool check_skip = true;

	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
		if (!path->nodes[i])
			break;
		if (!path->locks[i])
			break;

		if (check_skip) {
			if (path->slots[i] == 0) {
				skip_level = i + 1;
				continue;
			}

			if (path->keep_locks) {
				u32 nritems;

				nritems = btrfs_header_nritems(path->nodes[i]);
				if (nritems < 1 || path->slots[i] >= nritems - 1) {
					skip_level = i + 1;
					continue;
				}
			}
		}

		if (i >= lowest_unlock && i > skip_level) {
			check_skip = false;
			btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]);
			path->locks[i] = 0;
			if (write_lock_level &&
			    i > min_write_lock_level &&
			    i <= *write_lock_level) {
				*write_lock_level = i - 1;
			}
		}
	}
}
/*
 * Helper function for btrfs_search_slot() and other functions that do a search
 * on a btree.  The goal is to find a tree block in the cache (the radix tree at
 * fs_info->buffer_radix), but if we can't find it, or it's not up to date, read
 * its pages from disk.
 *
 * Returns -EAGAIN, with the path unlocked, if the caller needs to repeat the
 * whole btree search, starting again from the current root node.
 */
static int
read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
		      struct extent_buffer **eb_ret, int level, int slot,
		      const struct btrfs_key *key)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_tree_parent_check check = { 0 };
	u64 blocknr;
	u64 gen;
	struct extent_buffer *tmp;
	int ret;
	int parent_level;
	bool unlock_up;

	unlock_up = ((level + 1 < BTRFS_MAX_LEVEL) && p->locks[level + 1]);
	blocknr = btrfs_node_blockptr(*eb_ret, slot);
	gen = btrfs_node_ptr_generation(*eb_ret, slot);
	parent_level = btrfs_header_level(*eb_ret);
	btrfs_node_key_to_cpu(*eb_ret, &check.first_key, slot);
	check.has_first_key = true;
	check.level = parent_level - 1;
	check.transid = gen;
	check.owner_root = root->root_key.objectid;

	/*
	 * If we need to read an extent buffer from disk and we are holding locks
	 * on upper level nodes, we unlock all the upper nodes before reading the
	 * extent buffer, and then return -EAGAIN to the caller as it needs to
	 * restart the search.  We don't release the lock on the current level
	 * because we need to walk this node to figure out which blocks to read.
	 */
	tmp = find_extent_buffer(fs_info, blocknr);
	if (tmp) {
		if (p->reada == READA_FORWARD_ALWAYS)
			reada_for_search(fs_info, p, level, slot, key->objectid);

		/* first we do an atomic uptodate check */
		if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
			/*
			 * Do extra check for first_key, eb can be stale due to
			 * being cached, read from scrub, or have multiple
			 * parents (shared tree blocks).
			 */
			if (btrfs_verify_level_key(tmp,
					parent_level - 1, &check.first_key, gen)) {
				free_extent_buffer(tmp);
				return -EUCLEAN;
			}
			*eb_ret = tmp;
			return 0;
		}

		if (p->nowait) {
			free_extent_buffer(tmp);
			return -EAGAIN;
		}

		if (unlock_up)
			btrfs_unlock_up_safe(p, level + 1);

		/* now we're allowed to do a blocking uptodate check */
		ret = btrfs_read_extent_buffer(tmp, &check);
		if (ret) {
			free_extent_buffer(tmp);
			btrfs_release_path(p);
			return -EIO;
		}
		if (btrfs_check_eb_owner(tmp, root->root_key.objectid)) {
			free_extent_buffer(tmp);
			btrfs_release_path(p);
			return -EUCLEAN;
		}

		if (unlock_up)
			ret = -EAGAIN;

		goto out;
	} else if (p->nowait) {
		return -EAGAIN;
	}

	if (unlock_up) {
		btrfs_unlock_up_safe(p, level + 1);
		ret = -EAGAIN;
	} else {
		ret = 0;
	}

	if (p->reada != READA_NONE)
		reada_for_search(fs_info, p, level, slot, key->objectid);

	tmp = read_tree_block(fs_info, blocknr, &check);
	if (IS_ERR(tmp)) {
		btrfs_release_path(p);
		return PTR_ERR(tmp);
	}
	/*
	 * If the read above didn't mark this buffer up to date,
	 * it will never end up being up to date.  Set ret to -EIO now
	 * and give up so that our caller doesn't loop forever
	 * on our EAGAINs.
	 */
	if (!extent_buffer_uptodate(tmp))
		ret = -EIO;

out:
	if (ret == 0) {
		*eb_ret = tmp;
	} else {
		free_extent_buffer(tmp);
		btrfs_release_path(p);
	}

	return ret;
}
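/*
 * Note (sketch): callers treat -EAGAIN from read_block_for_search() as "the
 * path was released, restart from the root", which is what the main loop in
 * btrfs_search_slot() does via its 'again:' label.
 */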
/*
 * Helper function for btrfs_search_slot().  This does all of the checks
 * for node-level blocks and does any balancing required based on
 * the ins_len.
 *
 * If no extra work was required, zero is returned.  If we had to
 * drop the path, -EAGAIN is returned and btrfs_search_slot() must
 * start over.
 */
static int
setup_nodes_for_search(struct btrfs_trans_handle *trans,
		       struct btrfs_root *root, struct btrfs_path *p,
		       struct extent_buffer *b, int level, int ins_len,
		       int *write_lock_level)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	int ret = 0;

	if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >=
	    BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3) {

		if (*write_lock_level < level + 1) {
			*write_lock_level = level + 1;
			btrfs_release_path(p);
			return -EAGAIN;
		}

		reada_for_balance(p, level);
		ret = split_node(trans, root, p, level);

		b = p->nodes[level];
	} else if (ins_len < 0 && btrfs_header_nritems(b) <
		   BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 2) {

		if (*write_lock_level < level + 1) {
			*write_lock_level = level + 1;
			btrfs_release_path(p);
			return -EAGAIN;
		}

		reada_for_balance(p, level);
		ret = balance_level(trans, root, p, level);
		if (ret)
			return ret;

		b = p->nodes[level];
		if (!b) {
			btrfs_release_path(p);
			return -EAGAIN;
		}
		BUG_ON(btrfs_header_nritems(b) == 1);
	}
	return ret;
}

int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *path,
		    u64 iobjectid, u64 ioff, u8 key_type,
		    struct btrfs_key *found_key)
{
	int ret;
	struct btrfs_key key;
	struct extent_buffer *eb;

	ASSERT(path);
	ASSERT(found_key);

	key.type = key_type;
	key.objectid = iobjectid;
	key.offset = ioff;

	ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0);
	if (ret < 0)
		return ret;

	eb = path->nodes[0];
	if (ret && path->slots[0] >= btrfs_header_nritems(eb)) {
		ret = btrfs_next_leaf(fs_root, path);
		if (ret)
			return ret;
		eb = path->nodes[0];
	}

	btrfs_item_key_to_cpu(eb, found_key, path->slots[0]);
	if (found_key->type != key.type ||
	    found_key->objectid != key.objectid)
		return 1;

	return 0;
}
static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root,
							struct btrfs_path *p,
							int write_lock_level)
{
	struct extent_buffer *b;
	int root_lock = 0;
	int level = 0;

	if (p->search_commit_root) {
		b = root->commit_root;
		atomic_inc(&b->refs);
		level = btrfs_header_level(b);
		/*
		 * Ensure that all callers have set skip_locking when
		 * p->search_commit_root = 1.
		 */
		ASSERT(p->skip_locking == 1);

		goto out;
	}

	if (p->skip_locking) {
		b = btrfs_root_node(root);
		level = btrfs_header_level(b);
		goto out;
	}

	/* We try very hard to do read locks on the root */
	root_lock = BTRFS_READ_LOCK;

	/*
	 * If the level is set to maximum, we can skip trying to get the read
	 * lock.
	 */
	if (write_lock_level < BTRFS_MAX_LEVEL) {
		/*
		 * We don't know the level of the root node until we actually
		 * have it read locked.
		 */
		if (p->nowait) {
			b = btrfs_try_read_lock_root_node(root);
			if (IS_ERR(b))
				return b;
		} else {
			b = btrfs_read_lock_root_node(root);
		}
		level = btrfs_header_level(b);
		if (level > write_lock_level)
			goto out;

		/* Whoops, must trade for write lock */
		btrfs_tree_read_unlock(b);
		free_extent_buffer(b);
	}

	b = btrfs_lock_root_node(root);
	root_lock = BTRFS_WRITE_LOCK;

	/* The level might have changed, check again */
	level = btrfs_header_level(b);

out:
	/*
	 * The root may have failed to write out at some point, and thus is no
	 * longer valid, return an error in this case.
	 */
	if (!extent_buffer_uptodate(b)) {
		if (root_lock)
			btrfs_tree_unlock_rw(b, root_lock);
		free_extent_buffer(b);
		return ERR_PTR(-EIO);
	}

	p->nodes[level] = b;
	if (!p->skip_locking)
		p->locks[level] = root_lock;
	/*
	 * Callers are responsible for dropping b's references.
	 */
	return b;
}

/*
 * Replace the extent buffer at the lowest level of the path with a cloned
 * version.  The purpose is to be able to use it safely, after releasing the
 * commit root semaphore, even if relocation is happening in parallel, the
 * transaction used for relocation is committed and the extent buffer is
 * reallocated in the next transaction.
 *
 * This is used in a context where the caller does not prevent transaction
 * commits from happening, either by holding a transaction handle or holding
 * some lock, while it's doing searches through a commit root.
 * At the moment it's only used for send operations.
 */
static int finish_need_commit_sem_search(struct btrfs_path *path)
{
	const int i = path->lowest_level;
	const int slot = path->slots[i];
	struct extent_buffer *lowest = path->nodes[i];
	struct extent_buffer *clone;

	ASSERT(path->need_commit_sem);

	if (!lowest)
		return 0;

	lockdep_assert_held_read(&lowest->fs_info->commit_root_sem);

	clone = btrfs_clone_extent_buffer(lowest);
	if (!clone)
		return -ENOMEM;

	btrfs_release_path(path);
	path->nodes[i] = clone;
	path->slots[i] = slot;

	return 0;
}
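/*
 * Note (sketch): btrfs_search_slot_get_root() above prefers a read lock on
 * the root and only trades it for a write lock once the root's level turns
 * out to be at or below write_lock_level, which keeps contention on the
 * root node low for modifications that only touch the lower levels.
 */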
static inline int search_for_key_slot(struct extent_buffer *eb,
				      int search_low_slot,
				      const struct btrfs_key *key,
				      int prev_cmp,
				      int *slot)
{
	/*
	 * If a previous call to btrfs_bin_search() on a parent node returned an
	 * exact match (prev_cmp == 0), we can safely assume the target key will
	 * always be at slot 0 on lower levels, since each key pointer
	 * (struct btrfs_key_ptr) refers to the lowest key accessible from the
	 * subtree it points to.  Thus we can skip searching lower levels.
	 */
	if (prev_cmp == 0) {
		*slot = 0;
		return 0;
	}

	return btrfs_bin_search(eb, search_low_slot, key, slot);
}
2028 		 * item. Only the item size grows, no new btrfs item will be
2029 		 * added. If search_for_extension is not set, ins_len already
2030 		 * accounts for the size of struct btrfs_item; deduct it here
2031 		 * so the leaf space check will be correct.
2032 		 */
2033 		if (ret == 0 && !path->search_for_extension) {
2034 			ASSERT(ins_len >= sizeof(struct btrfs_item));
2035 			ins_len -= sizeof(struct btrfs_item);
2036 		}
2037 
2038 		ASSERT(leaf_free_space >= 0);
2039 
2040 		if (leaf_free_space < ins_len) {
2041 			int err;
2042 
2043 			err = split_leaf(trans, root, key, path, ins_len,
2044 					 (ret == 0));
2045 			ASSERT(err <= 0);
2046 			if (WARN_ON(err > 0))
2047 				err = -EUCLEAN;
2048 			if (err)
2049 				ret = err;
2050 		}
2051 	}
2052 
2053 	return ret;
2054 }
2055 
2056 /*
2057  * btrfs_search_slot - look for a key in a tree and perform necessary
2058  * modifications to preserve tree invariants.
2059  *
2060  * @trans:	Handle of transaction, used when modifying the tree
2061  * @p:		Holds all btree nodes along the search path
2062  * @root:	The root node of the tree
2063  * @key:	The key we are looking for
2064  * @ins_len:	Indicates purpose of search:
2065  *              >0 for inserts, it's the size of the item inserted (*)
2066  *              <0 for deletions
2067  *              0 for plain searches, not modifying the tree
2068  *
2069  *              (*) If the size of the item inserted doesn't include
2070  *              sizeof(struct btrfs_item), then p->search_for_extension must
2071  *              be set.
2072  * @cow:	whether CoW operations should be performed. Must always be 1
2073  *		when modifying the tree.
2074  *
2075  * If @ins_len > 0, nodes and leaves will be split as we walk down the tree.
2076  * If @ins_len < 0, nodes will be merged as we walk down the tree (if possible)
2077  *
2078  * If @key is found, 0 is returned and you can find the item in the leaf level
2079  * of the path (level 0)
2080  *
2081  * If @key isn't found, 1 is returned and the leaf level of the path (level 0)
2082  * points to the slot where it should be inserted
2083  *
2084  * If an error is encountered while searching the tree a negative error number
2085  * is returned
2086  */
2087 int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2088 		      const struct btrfs_key *key, struct btrfs_path *p,
2089 		      int ins_len, int cow)
2090 {
2091 	struct btrfs_fs_info *fs_info = root->fs_info;
2092 	struct extent_buffer *b;
2093 	int slot;
2094 	int ret;
2095 	int err;
2096 	int level;
2097 	int lowest_unlock = 1;
2098 	/* everything at write_lock_level or lower must be write locked */
2099 	int write_lock_level = 0;
2100 	u8 lowest_level = 0;
2101 	int min_write_lock_level;
2102 	int prev_cmp;
2103 
2104 	might_sleep();
2105 
2106 	lowest_level = p->lowest_level;
2107 	WARN_ON(lowest_level && ins_len > 0);
2108 	WARN_ON(p->nodes[0] != NULL);
2109 	BUG_ON(!cow && ins_len);
2110 
2111 	/*
2112 	 * For now only allow nowait for read-only operations. There's no
2113 	 * strict reason why we couldn't support it for writes, we just only
2114 	 * need it for reads, so it's only implemented for reads.
2115 	 */
2116 	ASSERT(!p->nowait || !cow);
2117 
2118 	if (ins_len < 0) {
2119 		lowest_unlock = 2;
2120 
2121 		/* When we are removing items, we might have to go up to level
2122 		 * two as we update tree pointers. Make sure we keep write
2123 		 * locks for those levels as well.
2124 		 */
2125 		write_lock_level = 2;
2126 	} else if (ins_len > 0) {
2127 		/*
2128 		 * For inserting items, make sure we have a write lock on
2129 		 * level 1 so we can update keys.
2130 		 */
2131 		write_lock_level = 1;
2132 	}
2133 
2134 	if (!cow)
2135 		write_lock_level = -1;
2136 
2137 	if (cow && (p->keep_locks || p->lowest_level))
2138 		write_lock_level = BTRFS_MAX_LEVEL;
2139 
2140 	min_write_lock_level = write_lock_level;
2141 
2142 	if (p->need_commit_sem) {
2143 		ASSERT(p->search_commit_root);
2144 		if (p->nowait) {
2145 			if (!down_read_trylock(&fs_info->commit_root_sem))
2146 				return -EAGAIN;
2147 		} else {
2148 			down_read(&fs_info->commit_root_sem);
2149 		}
2150 	}
2151 
2152 again:
2153 	prev_cmp = -1;
2154 	b = btrfs_search_slot_get_root(root, p, write_lock_level);
2155 	if (IS_ERR(b)) {
2156 		ret = PTR_ERR(b);
2157 		goto done;
2158 	}
2159 
2160 	while (b) {
2161 		int dec = 0;
2162 
2163 		level = btrfs_header_level(b);
2164 
2165 		if (cow) {
2166 			bool last_level = (level == (BTRFS_MAX_LEVEL - 1));
2167 
2168 			/*
2169 			 * If we don't really need to cow this block
2170 			 * then we don't want to set the path blocking,
2171 			 * so we test it here.
2172 			 */
2173 			if (!should_cow_block(trans, root, b))
2174 				goto cow_done;
2175 
2176 			/*
2177 			 * We must have write locks on this node and the
2178 			 * parent.
2179 			 */
2180 			if (level > write_lock_level ||
2181 			    (level + 1 > write_lock_level &&
2182 			     level + 1 < BTRFS_MAX_LEVEL &&
2183 			     p->nodes[level + 1])) {
2184 				write_lock_level = level + 1;
2185 				btrfs_release_path(p);
2186 				goto again;
2187 			}
2188 
2189 			if (last_level)
2190 				err = btrfs_cow_block(trans, root, b, NULL, 0,
2191 						      &b,
2192 						      BTRFS_NESTING_COW);
2193 			else
2194 				err = btrfs_cow_block(trans, root, b,
2195 						      p->nodes[level + 1],
2196 						      p->slots[level + 1], &b,
2197 						      BTRFS_NESTING_COW);
2198 			if (err) {
2199 				ret = err;
2200 				goto done;
2201 			}
2202 		}
2203 cow_done:
2204 		p->nodes[level] = b;
2205 
2206 		/*
2207 		 * We have a lock on b and as long as we aren't changing
2208 		 * the tree, there is no way for the items in b to change.
2209 		 * It is safe to drop the lock on our parent before we
2210 		 * go through the expensive btree search on b.
2211 		 *
2212 		 * If we're inserting or deleting (ins_len != 0), then we might
2213 		 * be changing slot zero, which may require changing the parent.
2214 		 * So, we can't drop the lock until after we know which slot
2215 		 * we're operating on.
2216 		 */
2217 		if (!ins_len && !p->keep_locks) {
2218 			int u = level + 1;
2219 
2220 			if (u < BTRFS_MAX_LEVEL && p->locks[u]) {
2221 				btrfs_tree_unlock_rw(p->nodes[u], p->locks[u]);
2222 				p->locks[u] = 0;
2223 			}
2224 		}
2225 
2226 		if (level == 0) {
2227 			if (ins_len > 0)
2228 				ASSERT(write_lock_level >= 1);
2229 
2230 			ret = search_leaf(trans, root, key, p, ins_len, prev_cmp);
2231 			if (!p->search_for_split)
2232 				unlock_up(p, level, lowest_unlock,
2233 					  min_write_lock_level, NULL);
2234 			goto done;
2235 		}
2236 
2237 		ret = search_for_key_slot(b, 0, key, prev_cmp, &slot);
2238 		if (ret < 0)
2239 			goto done;
2240 		prev_cmp = ret;
2241 
2242 		if (ret && slot > 0) {
2243 			dec = 1;
2244 			slot--;
2245 		}
2246 		p->slots[level] = slot;
2247 		err = setup_nodes_for_search(trans, root, p, b, level, ins_len,
2248 					     &write_lock_level);
2249 		if (err == -EAGAIN)
2250 			goto again;
2251 		if (err) {
2252 			ret = err;
2253 			goto done;
2254 		}
2255 		b = p->nodes[level];
2256 		slot = p->slots[level];
2257 
2258 		/*
2259 		 * Slot 0 is special: if we change the key we have to update
2260 		 * the parent pointer, which means we must have a write lock
2261 		 * on the parent.
2262 		 */
2263 		if (slot == 0 && ins_len && write_lock_level < level + 1) {
2264 			write_lock_level = level + 1;
2265 			btrfs_release_path(p);
2266 			goto again;
2267 		}
2268 
2269 		unlock_up(p, level, lowest_unlock, min_write_lock_level,
2270 			  &write_lock_level);
2271 
2272 		if (level == lowest_level) {
2273 			if (dec)
2274 				p->slots[level]++;
2275 			goto done;
2276 		}
2277 
2278 		err = read_block_for_search(root, p, &b, level, slot, key);
2279 		if (err == -EAGAIN)
2280 			goto again;
2281 		if (err) {
2282 			ret = err;
2283 			goto done;
2284 		}
2285 
2286 		if (!p->skip_locking) {
2287 			level = btrfs_header_level(b);
2288 
2289 			btrfs_maybe_reset_lockdep_class(root, b);
2290 
2291 			if (level <= write_lock_level) {
2292 				btrfs_tree_lock(b);
2293 				p->locks[level] = BTRFS_WRITE_LOCK;
2294 			} else {
2295 				if (p->nowait) {
2296 					if (!btrfs_try_tree_read_lock(b)) {
2297 						free_extent_buffer(b);
2298 						ret = -EAGAIN;
2299 						goto done;
2300 					}
2301 				} else {
2302 					btrfs_tree_read_lock(b);
2303 				}
2304 				p->locks[level] = BTRFS_READ_LOCK;
2305 			}
2306 			p->nodes[level] = b;
2307 		}
2308 	}
2309 	ret = 1;
2310 done:
2311 	if (ret < 0 && !p->skip_release_on_error)
2312 		btrfs_release_path(p);
2313 
2314 	if (p->need_commit_sem) {
2315 		int ret2;
2316 
2317 		ret2 = finish_need_commit_sem_search(p);
2318 		up_read(&fs_info->commit_root_sem);
2319 		if (ret2)
2320 			ret = ret2;
2321 	}
2322 
2323 	return ret;
2324 }
2325 ALLOW_ERROR_INJECTION(btrfs_search_slot, ERRNO);
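2326 
/*
 * Illustrative usage sketch (not part of the original file): a plain
 * read-only lookup with btrfs_search_slot(). The helper name is
 * hypothetical; a real caller picks the tree and the key it needs.
 */
static int example_lookup_item(struct btrfs_root *root,
			       const struct btrfs_key *key,
			       struct btrfs_key *found_key)
{
	struct btrfs_path *path;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	/* trans == NULL, ins_len == 0 and cow == 0: search, don't modify. */
	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
	if (ret == 0) {
		/* Exact match, path->nodes[0]/path->slots[0] point at it. */
		btrfs_item_key_to_cpu(path->nodes[0], found_key,
				      path->slots[0]);
	}
	/* ret == 1: not found, the slot is where the key would be inserted. */

	btrfs_free_path(path);
	return ret;
}

2327 /*
2328  * Like btrfs_search_slot, this looks for a key in the given tree. It uses the
2329  * current state of the tree together with the operations recorded in the tree
2330  * modification log to search for the key in a previous version of this tree, as
2331  * denoted by the time_seq parameter.
2332  *
2333  * Naturally, there is no support for insert, delete or cow operations.
2334  *
2335  * The resulting path and return value will be set up as if we called
2336  * btrfs_search_slot at that point in time with ins_len and cow both set to 0.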
2337  */
2338 int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key,
2339 			  struct btrfs_path *p, u64 time_seq)
2340 {
2341 	struct btrfs_fs_info *fs_info = root->fs_info;
2342 	struct extent_buffer *b;
2343 	int slot;
2344 	int ret;
2345 	int err;
2346 	int level;
2347 	int lowest_unlock = 1;
2348 	u8 lowest_level = 0;
2349 
2350 	lowest_level = p->lowest_level;
2351 	WARN_ON(p->nodes[0] != NULL);
2352 	ASSERT(!p->nowait);
2353 
2354 	if (p->search_commit_root) {
2355 		BUG_ON(time_seq);
2356 		return btrfs_search_slot(NULL, root, key, p, 0, 0);
2357 	}
2358 
2359 again:
2360 	b = btrfs_get_old_root(root, time_seq);
2361 	if (!b) {
2362 		ret = -EIO;
2363 		goto done;
2364 	}
2365 	level = btrfs_header_level(b);
2366 	p->locks[level] = BTRFS_READ_LOCK;
2367 
2368 	while (b) {
2369 		int dec = 0;
2370 
2371 		level = btrfs_header_level(b);
2372 		p->nodes[level] = b;
2373 
2374 		/*
2375 		 * We have a lock on b and as long as we aren't changing
2376 		 * the tree, there is no way for the items in b to change.
2377 		 * It is safe to drop the lock on our parent before we
2378 		 * go through the expensive btree search on b.
2379 		 */
2380 		btrfs_unlock_up_safe(p, level + 1);
2381 
2382 		ret = btrfs_bin_search(b, 0, key, &slot);
2383 		if (ret < 0)
2384 			goto done;
2385 
2386 		if (level == 0) {
2387 			p->slots[level] = slot;
2388 			unlock_up(p, level, lowest_unlock, 0, NULL);
2389 			goto done;
2390 		}
2391 
2392 		if (ret && slot > 0) {
2393 			dec = 1;
2394 			slot--;
2395 		}
2396 		p->slots[level] = slot;
2397 		unlock_up(p, level, lowest_unlock, 0, NULL);
2398 
2399 		if (level == lowest_level) {
2400 			if (dec)
2401 				p->slots[level]++;
2402 			goto done;
2403 		}
2404 
2405 		err = read_block_for_search(root, p, &b, level, slot, key);
2406 		if (err == -EAGAIN)
2407 			goto again;
2408 		if (err) {
2409 			ret = err;
2410 			goto done;
2411 		}
2412 
2413 		level = btrfs_header_level(b);
2414 		btrfs_tree_read_lock(b);
2415 		b = btrfs_tree_mod_log_rewind(fs_info, p, b, time_seq);
2416 		if (!b) {
2417 			ret = -ENOMEM;
2418 			goto done;
2419 		}
2420 		p->locks[level] = BTRFS_READ_LOCK;
2421 		p->nodes[level] = b;
2422 	}
2423 	ret = 1;
2424 done:
2425 	if (ret < 0)
2426 		btrfs_release_path(p);
2427 
2428 	return ret;
2429 }
2430 
2431 /*
2432  * Search the tree again to find a leaf with smaller keys.
2433  * Returns 0 if it found something.
2434  * Returns 1 if there are no smaller keys.
2435  * Returns < 0 on error.
2436  *
2437  * This may release the path, and so you may lose any locks held at the
2438  * time you call it.
2439  */
2440 static int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
2441 {
2442 	struct btrfs_key key;
2443 	struct btrfs_key orig_key;
2444 	struct btrfs_disk_key found_key;
2445 	int ret;
2446 
2447 	btrfs_item_key_to_cpu(path->nodes[0], &key, 0);
2448 	orig_key = key;
2449 
2450 	if (key.offset > 0) {
2451 		key.offset--;
2452 	} else if (key.type > 0) {
2453 		key.type--;
2454 		key.offset = (u64)-1;
2455 	} else if (key.objectid > 0) {
2456 		key.objectid--;
2457 		key.type = (u8)-1;
2458 		key.offset = (u64)-1;
2459 	} else {
2460 		return 1;
2461 	}
2462 
2463 	btrfs_release_path(path);
2464 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2465 	if (ret <= 0)
2466 		return ret;
2467 
2468 	/*
2469 	 * Previous key not found. Even if we were at slot 0 of the leaf we had
2470 	 * before releasing the path and calling btrfs_search_slot(), we now may
2471 	 * be in a slot pointing to the same original key - this can happen if
2472 	 * after we released the path, one or more items were moved from a
2473 	 * sibling leaf into the front of the leaf we had due to an insertion
2474 	 * (see push_leaf_right()).
2475 	 * If we hit this case and our slot is > 0, just decrement the slot
2476 	 * so that the caller does not process the same key again, which may or
2477 	 * may not break the caller, depending on its logic.
2478 	 */
2479 	if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) {
2480 		btrfs_item_key(path->nodes[0], &found_key, path->slots[0]);
2481 		ret = comp_keys(&found_key, &orig_key);
2482 		if (ret == 0) {
2483 			if (path->slots[0] > 0) {
2484 				path->slots[0]--;
2485 				return 0;
2486 			}
2487 			/*
2488 			 * At slot 0, same key as before, it means orig_key is
2489 			 * the lowest, leftmost, key in the tree. We're done.
2490 			 */
2491 			return 1;
2492 		}
2493 	}
2494 
2495 	btrfs_item_key(path->nodes[0], &found_key, 0);
2496 	ret = comp_keys(&found_key, &key);
2497 	/*
2498 	 * We might have had an item with the previous key in the tree right
2499 	 * before we released our path. And after we released our path, that
2500 	 * item might have been pushed to the first slot (0) of the leaf we
2501 	 * were holding due to a tree balance. Alternatively, an item with the
2502 	 * previous key can exist as the only element of a leaf (big fat item).
2503 	 * Therefore account for these 2 cases, so that our callers (like
2504 	 * btrfs_previous_item) don't miss an existing item with a key matching
2505 	 * the previous key we computed above.
2506 	 */
2507 	if (ret <= 0)
2508 		return 0;
2509 	return 1;
2510 }
2511 
2512 /*
2513  * Helper to use instead of search slot if no exact match is needed but
2514  * instead the next or previous item should be returned.
2515  * When find_higher is true, the next higher item is returned, the next lower
2516  * otherwise.
2517  * When return_any and find_higher are both true, and no higher item is found,
2518  * return the next lower instead.
2519  * When return_any is true and find_higher is false, and no lower item is found,
2520  * return the next higher instead.
2521  * It returns 0 if any item is found, 1 if none is found (tree empty), and
2522  * < 0 on error
2523  */
2524 int btrfs_search_slot_for_read(struct btrfs_root *root,
2525 			       const struct btrfs_key *key,
2526 			       struct btrfs_path *p, int find_higher,
2527 			       int return_any)
2528 {
2529 	int ret;
2530 	struct extent_buffer *leaf;
2531 
2532 again:
2533 	ret = btrfs_search_slot(NULL, root, key, p, 0, 0);
2534 	if (ret <= 0)
2535 		return ret;
2536 	/*
2537 	 * A return value of 1 means the path is at the position where the
2538 	 * item should be inserted. Normally this is the next bigger item,
2539 	 * but in case the previous item is the last in a leaf, path points
2540 	 * to the first free slot in the previous leaf, i.e. at an invalid
2541 	 * item.
2542 	 */
2543 	leaf = p->nodes[0];
2544 
2545 	if (find_higher) {
2546 		if (p->slots[0] >= btrfs_header_nritems(leaf)) {
2547 			ret = btrfs_next_leaf(root, p);
2548 			if (ret <= 0)
2549 				return ret;
2550 			if (!return_any)
2551 				return 1;
2552 			/*
2553 			 * No higher item found, return the next
2554 			 * lower instead.
2555 			 */
2556 			return_any = 0;
2557 			find_higher = 0;
2558 			btrfs_release_path(p);
2559 			goto again;
2560 		}
2561 	} else {
2562 		if (p->slots[0] == 0) {
2563 			ret = btrfs_prev_leaf(root, p);
2564 			if (ret < 0)
2565 				return ret;
2566 			if (!ret) {
2567 				leaf = p->nodes[0];
2568 				if (p->slots[0] == btrfs_header_nritems(leaf))
2569 					p->slots[0]--;
2570 				return 0;
2571 			}
2572 			if (!return_any)
2573 				return 1;
2574 			/*
2575 			 * No lower item found, return the next
2576 			 * higher instead.
2577 			 */
2578 			return_any = 0;
2579 			find_higher = 1;
2580 			btrfs_release_path(p);
2581 			goto again;
2582 		} else {
2583 			--p->slots[0];
2584 		}
2585 	}
2586 	return 0;
2587 }
2588 
2589 /*
2590  * Execute search and call btrfs_previous_item to traverse backwards if the item
2591  * was not found.
2592  *
2593  * Return 0 if found, 1 if not found and < 0 if error.
2594  */
2595 int btrfs_search_backwards(struct btrfs_root *root, struct btrfs_key *key,
2596 			   struct btrfs_path *path)
2597 {
2598 	int ret;
2599 
2600 	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
2601 	if (ret > 0)
2602 		ret = btrfs_previous_item(root, path, key->objectid, key->type);
2603 
2604 	if (ret == 0)
2605 		btrfs_item_key_to_cpu(path->nodes[0], key, path->slots[0]);
2606 
2607 	return ret;
2608 }
2609 
2610 /*
2611  * Search for a valid slot for the given path.
2612  *
2613  * @root:	The root node of the tree.
2614  * @key:	Will contain a valid item if found.
2615  * @path:	The starting point to validate the slot.
2616  *
2617  * Return: 0 if the item is valid
2618  *         1 if not found
2619  *         <0 if error.
2620  */
2621 int btrfs_get_next_valid_item(struct btrfs_root *root, struct btrfs_key *key,
2622 			      struct btrfs_path *path)
2623 {
2624 	if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
2625 		int ret;
2626 
2627 		ret = btrfs_next_leaf(root, path);
2628 		if (ret)
2629 			return ret;
2630 	}
2631 
2632 	btrfs_item_key_to_cpu(path->nodes[0], key, path->slots[0]);
2633 	return 0;
2634 }
2635 
2636 /*
2637  * Adjust the pointers going up the tree, starting at level,
2638  * making sure the right key of each node points to 'key'.
2639  * This is used after shifting pointers to the left, so it stops
2640  * fixing up pointers when a given leaf/node is not in slot 0 of the
2641  * higher levels.
2642  *
2643  */
2644 static void fixup_low_keys(struct btrfs_path *path,
2645 			   struct btrfs_disk_key *key, int level)
2646 {
2647 	int i;
2648 	struct extent_buffer *t;
2649 	int ret;
2650 
2651 	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
2652 		int tslot = path->slots[i];
2653 
2654 		if (!path->nodes[i])
2655 			break;
2656 		t = path->nodes[i];
2657 		ret = btrfs_tree_mod_log_insert_key(t, tslot,
2658 						    BTRFS_MOD_LOG_KEY_REPLACE);
2659 		BUG_ON(ret < 0);
2660 		btrfs_set_node_key(t, key, tslot);
2661 		btrfs_mark_buffer_dirty(path->nodes[i]);
2662 		if (tslot != 0)
2663 			break;
2664 	}
2665 }
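2666 
/*
 * Illustrative sketch (not part of the original file): iterating items with
 * btrfs_get_next_valid_item(). The helper below and its starting key are
 * hypothetical; a real caller would process each item inside the loop.
 */
static int example_walk_items(struct btrfs_root *root, struct btrfs_key *key)
{
	struct btrfs_path *path;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
	if (ret < 0)
		goto out;

	while (1) {
		ret = btrfs_get_next_valid_item(root, key, path);
		if (ret)
			break;	/* 1 means no more items, < 0 means error. */
		/* *key now describes the item at path->slots[0]. */
		path->slots[0]++;
	}
out:
	btrfs_free_path(path);
	return ret < 0 ? ret : 0;
}

2667 /*
2668  * Update item key.
2669  *
2670  * This function isn't completely safe.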
2671  * It's the caller's responsibility that the new key won't break the order.
2672  */
2673 void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info,
2674 			     struct btrfs_path *path,
2675 			     const struct btrfs_key *new_key)
2676 {
2677 	struct btrfs_disk_key disk_key;
2678 	struct extent_buffer *eb;
2679 	int slot;
2680 
2681 	eb = path->nodes[0];
2682 	slot = path->slots[0];
2683 	if (slot > 0) {
2684 		btrfs_item_key(eb, &disk_key, slot - 1);
2685 		if (unlikely(comp_keys(&disk_key, new_key) >= 0)) {
2686 			btrfs_print_leaf(eb);
2687 			btrfs_crit(fs_info,
2688 				   "slot %u key (%llu %u %llu) new key (%llu %u %llu)",
2689 				   slot, btrfs_disk_key_objectid(&disk_key),
2690 				   btrfs_disk_key_type(&disk_key),
2691 				   btrfs_disk_key_offset(&disk_key),
2692 				   new_key->objectid, new_key->type,
2693 				   new_key->offset);
2694 			BUG();
2695 		}
2696 	}
2697 	if (slot < btrfs_header_nritems(eb) - 1) {
2698 		btrfs_item_key(eb, &disk_key, slot + 1);
2699 		if (unlikely(comp_keys(&disk_key, new_key) <= 0)) {
2700 			btrfs_print_leaf(eb);
2701 			btrfs_crit(fs_info,
2702 				   "slot %u key (%llu %u %llu) new key (%llu %u %llu)",
2703 				   slot, btrfs_disk_key_objectid(&disk_key),
2704 				   btrfs_disk_key_type(&disk_key),
2705 				   btrfs_disk_key_offset(&disk_key),
2706 				   new_key->objectid, new_key->type,
2707 				   new_key->offset);
2708 			BUG();
2709 		}
2710 	}
2711 
2712 	btrfs_cpu_key_to_disk(&disk_key, new_key);
2713 	btrfs_set_item_key(eb, &disk_key, slot);
2714 	btrfs_mark_buffer_dirty(eb);
2715 	if (slot == 0)
2716 		fixup_low_keys(path, &disk_key, 1);
2717 }
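2718 
/*
 * Illustrative sketch (not part of the original file): the usual caller
 * pattern for btrfs_set_item_key_safe(). The helper below is hypothetical;
 * it re-keys the item at path->slots[0] to start 'bytes' further on, after
 * the caller has already adjusted the item's contents to match.
 */
static void example_bump_item_key(struct btrfs_fs_info *fs_info,
				  struct btrfs_path *path, u64 bytes)
{
	struct btrfs_key key;

	btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
	/* Must stay above the previous key and below the next one. */
	key.offset += bytes;
	btrfs_set_item_key_safe(fs_info, path, &key);
}

2719 /*
2720  * Check key order of two sibling extent buffers.
2721  *
2722  * Return true if something is wrong.
2723  * Return false if everything is fine.
2724  *
2725  * Tree-checker only works inside one tree block, thus the following
2726  * corruption cannot be detected by tree-checker:
2727  *
2728  * Leaf @left			| Leaf @right
2729  * --------------------------------------------------------------
2730  * | 1 | 2 | 3 | 4 | 5 | f6 |   | 7 | 8 |
2731  *
2732  * Key f6 in leaf @left itself is valid, but not valid when the next
2733  * key in leaf @right is 7.
2734  * This can only be checked at tree block merge time.
2735  * And since tree checker has ensured all key order in each tree block
2736  * is correct, we only need to bother the last key of @left and the first
2737  * key of @right.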
2738 */ 2739 static bool check_sibling_keys(struct extent_buffer *left, 2740 struct extent_buffer *right) 2741 { 2742 struct btrfs_key left_last; 2743 struct btrfs_key right_first; 2744 int level = btrfs_header_level(left); 2745 int nr_left = btrfs_header_nritems(left); 2746 int nr_right = btrfs_header_nritems(right); 2747 2748 /* No key to check in one of the tree blocks */ 2749 if (!nr_left || !nr_right) 2750 return false; 2751 2752 if (level) { 2753 btrfs_node_key_to_cpu(left, &left_last, nr_left - 1); 2754 btrfs_node_key_to_cpu(right, &right_first, 0); 2755 } else { 2756 btrfs_item_key_to_cpu(left, &left_last, nr_left - 1); 2757 btrfs_item_key_to_cpu(right, &right_first, 0); 2758 } 2759 2760 if (unlikely(btrfs_comp_cpu_keys(&left_last, &right_first) >= 0)) { 2761 btrfs_crit(left->fs_info, "left extent buffer:"); 2762 btrfs_print_tree(left, false); 2763 btrfs_crit(left->fs_info, "right extent buffer:"); 2764 btrfs_print_tree(right, false); 2765 btrfs_crit(left->fs_info, 2766 "bad key order, sibling blocks, left last (%llu %u %llu) right first (%llu %u %llu)", 2767 left_last.objectid, left_last.type, 2768 left_last.offset, right_first.objectid, 2769 right_first.type, right_first.offset); 2770 return true; 2771 } 2772 return false; 2773 } 2774 2775 /* 2776 * try to push data from one node into the next node left in the 2777 * tree. 2778 * 2779 * returns 0 if some ptrs were pushed left, < 0 if there was some horrible 2780 * error, and > 0 if there was no room in the left hand block. 2781 */ 2782 static int push_node_left(struct btrfs_trans_handle *trans, 2783 struct extent_buffer *dst, 2784 struct extent_buffer *src, int empty) 2785 { 2786 struct btrfs_fs_info *fs_info = trans->fs_info; 2787 int push_items = 0; 2788 int src_nritems; 2789 int dst_nritems; 2790 int ret = 0; 2791 2792 src_nritems = btrfs_header_nritems(src); 2793 dst_nritems = btrfs_header_nritems(dst); 2794 push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems; 2795 WARN_ON(btrfs_header_generation(src) != trans->transid); 2796 WARN_ON(btrfs_header_generation(dst) != trans->transid); 2797 2798 if (!empty && src_nritems <= 8) 2799 return 1; 2800 2801 if (push_items <= 0) 2802 return 1; 2803 2804 if (empty) { 2805 push_items = min(src_nritems, push_items); 2806 if (push_items < src_nritems) { 2807 /* leave at least 8 pointers in the node if 2808 * we aren't going to empty it 2809 */ 2810 if (src_nritems - push_items < 8) { 2811 if (push_items <= 8) 2812 return 1; 2813 push_items -= 8; 2814 } 2815 } 2816 } else 2817 push_items = min(src_nritems - 8, push_items); 2818 2819 /* dst is the left eb, src is the middle eb */ 2820 if (check_sibling_keys(dst, src)) { 2821 ret = -EUCLEAN; 2822 btrfs_abort_transaction(trans, ret); 2823 return ret; 2824 } 2825 ret = btrfs_tree_mod_log_eb_copy(dst, src, dst_nritems, 0, push_items); 2826 if (ret) { 2827 btrfs_abort_transaction(trans, ret); 2828 return ret; 2829 } 2830 copy_extent_buffer(dst, src, 2831 btrfs_node_key_ptr_offset(dst, dst_nritems), 2832 btrfs_node_key_ptr_offset(src, 0), 2833 push_items * sizeof(struct btrfs_key_ptr)); 2834 2835 if (push_items < src_nritems) { 2836 /* 2837 * btrfs_tree_mod_log_eb_copy handles logging the move, so we 2838 * don't need to do an explicit tree mod log operation for it. 
2839 */ 2840 memmove_extent_buffer(src, btrfs_node_key_ptr_offset(src, 0), 2841 btrfs_node_key_ptr_offset(src, push_items), 2842 (src_nritems - push_items) * 2843 sizeof(struct btrfs_key_ptr)); 2844 } 2845 btrfs_set_header_nritems(src, src_nritems - push_items); 2846 btrfs_set_header_nritems(dst, dst_nritems + push_items); 2847 btrfs_mark_buffer_dirty(src); 2848 btrfs_mark_buffer_dirty(dst); 2849 2850 return ret; 2851 } 2852 2853 /* 2854 * try to push data from one node into the next node right in the 2855 * tree. 2856 * 2857 * returns 0 if some ptrs were pushed, < 0 if there was some horrible 2858 * error, and > 0 if there was no room in the right hand block. 2859 * 2860 * this will only push up to 1/2 the contents of the left node over 2861 */ 2862 static int balance_node_right(struct btrfs_trans_handle *trans, 2863 struct extent_buffer *dst, 2864 struct extent_buffer *src) 2865 { 2866 struct btrfs_fs_info *fs_info = trans->fs_info; 2867 int push_items = 0; 2868 int max_push; 2869 int src_nritems; 2870 int dst_nritems; 2871 int ret = 0; 2872 2873 WARN_ON(btrfs_header_generation(src) != trans->transid); 2874 WARN_ON(btrfs_header_generation(dst) != trans->transid); 2875 2876 src_nritems = btrfs_header_nritems(src); 2877 dst_nritems = btrfs_header_nritems(dst); 2878 push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems; 2879 if (push_items <= 0) 2880 return 1; 2881 2882 if (src_nritems < 4) 2883 return 1; 2884 2885 max_push = src_nritems / 2 + 1; 2886 /* don't try to empty the node */ 2887 if (max_push >= src_nritems) 2888 return 1; 2889 2890 if (max_push < push_items) 2891 push_items = max_push; 2892 2893 /* dst is the right eb, src is the middle eb */ 2894 if (check_sibling_keys(src, dst)) { 2895 ret = -EUCLEAN; 2896 btrfs_abort_transaction(trans, ret); 2897 return ret; 2898 } 2899 2900 /* 2901 * btrfs_tree_mod_log_eb_copy handles logging the move, so we don't 2902 * need to do an explicit tree mod log operation for it. 2903 */ 2904 memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(dst, push_items), 2905 btrfs_node_key_ptr_offset(dst, 0), 2906 (dst_nritems) * 2907 sizeof(struct btrfs_key_ptr)); 2908 2909 ret = btrfs_tree_mod_log_eb_copy(dst, src, 0, src_nritems - push_items, 2910 push_items); 2911 if (ret) { 2912 btrfs_abort_transaction(trans, ret); 2913 return ret; 2914 } 2915 copy_extent_buffer(dst, src, 2916 btrfs_node_key_ptr_offset(dst, 0), 2917 btrfs_node_key_ptr_offset(src, src_nritems - push_items), 2918 push_items * sizeof(struct btrfs_key_ptr)); 2919 2920 btrfs_set_header_nritems(src, src_nritems - push_items); 2921 btrfs_set_header_nritems(dst, dst_nritems + push_items); 2922 2923 btrfs_mark_buffer_dirty(src); 2924 btrfs_mark_buffer_dirty(dst); 2925 2926 return ret; 2927 } 2928 2929 /* 2930 * helper function to insert a new root level in the tree. 2931 * A new node is allocated, and a single item is inserted to 2932 * point to the existing root 2933 * 2934 * returns zero on success or < 0 on failure. 
2935 */ 2936 static noinline int insert_new_root(struct btrfs_trans_handle *trans, 2937 struct btrfs_root *root, 2938 struct btrfs_path *path, int level) 2939 { 2940 struct btrfs_fs_info *fs_info = root->fs_info; 2941 u64 lower_gen; 2942 struct extent_buffer *lower; 2943 struct extent_buffer *c; 2944 struct extent_buffer *old; 2945 struct btrfs_disk_key lower_key; 2946 int ret; 2947 2948 BUG_ON(path->nodes[level]); 2949 BUG_ON(path->nodes[level-1] != root->node); 2950 2951 lower = path->nodes[level-1]; 2952 if (level == 1) 2953 btrfs_item_key(lower, &lower_key, 0); 2954 else 2955 btrfs_node_key(lower, &lower_key, 0); 2956 2957 c = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid, 2958 &lower_key, level, root->node->start, 0, 2959 BTRFS_NESTING_NEW_ROOT); 2960 if (IS_ERR(c)) 2961 return PTR_ERR(c); 2962 2963 root_add_used(root, fs_info->nodesize); 2964 2965 btrfs_set_header_nritems(c, 1); 2966 btrfs_set_node_key(c, &lower_key, 0); 2967 btrfs_set_node_blockptr(c, 0, lower->start); 2968 lower_gen = btrfs_header_generation(lower); 2969 WARN_ON(lower_gen != trans->transid); 2970 2971 btrfs_set_node_ptr_generation(c, 0, lower_gen); 2972 2973 btrfs_mark_buffer_dirty(c); 2974 2975 old = root->node; 2976 ret = btrfs_tree_mod_log_insert_root(root->node, c, false); 2977 if (ret < 0) { 2978 btrfs_free_tree_block(trans, btrfs_root_id(root), c, 0, 1); 2979 btrfs_tree_unlock(c); 2980 free_extent_buffer(c); 2981 return ret; 2982 } 2983 rcu_assign_pointer(root->node, c); 2984 2985 /* the super has an extra ref to root->node */ 2986 free_extent_buffer(old); 2987 2988 add_root_to_dirty_list(root); 2989 atomic_inc(&c->refs); 2990 path->nodes[level] = c; 2991 path->locks[level] = BTRFS_WRITE_LOCK; 2992 path->slots[level] = 0; 2993 return 0; 2994 } 2995 2996 /* 2997 * worker function to insert a single pointer in a node. 2998 * the node should have enough room for the pointer already 2999 * 3000 * slot and level indicate where you want the key to go, and 3001 * blocknr is the block the key points to. 3002 */ 3003 static int insert_ptr(struct btrfs_trans_handle *trans, 3004 struct btrfs_path *path, 3005 struct btrfs_disk_key *key, u64 bytenr, 3006 int slot, int level) 3007 { 3008 struct extent_buffer *lower; 3009 int nritems; 3010 int ret; 3011 3012 BUG_ON(!path->nodes[level]); 3013 btrfs_assert_tree_write_locked(path->nodes[level]); 3014 lower = path->nodes[level]; 3015 nritems = btrfs_header_nritems(lower); 3016 BUG_ON(slot > nritems); 3017 BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(trans->fs_info)); 3018 if (slot != nritems) { 3019 if (level) { 3020 ret = btrfs_tree_mod_log_insert_move(lower, slot + 1, 3021 slot, nritems - slot); 3022 if (ret < 0) { 3023 btrfs_abort_transaction(trans, ret); 3024 return ret; 3025 } 3026 } 3027 memmove_extent_buffer(lower, 3028 btrfs_node_key_ptr_offset(lower, slot + 1), 3029 btrfs_node_key_ptr_offset(lower, slot), 3030 (nritems - slot) * sizeof(struct btrfs_key_ptr)); 3031 } 3032 if (level) { 3033 ret = btrfs_tree_mod_log_insert_key(lower, slot, 3034 BTRFS_MOD_LOG_KEY_ADD); 3035 if (ret < 0) { 3036 btrfs_abort_transaction(trans, ret); 3037 return ret; 3038 } 3039 } 3040 btrfs_set_node_key(lower, key, slot); 3041 btrfs_set_node_blockptr(lower, slot, bytenr); 3042 WARN_ON(trans->transid == 0); 3043 btrfs_set_node_ptr_generation(lower, slot, trans->transid); 3044 btrfs_set_header_nritems(lower, nritems + 1); 3045 btrfs_mark_buffer_dirty(lower); 3046 3047 return 0; 3048 } 3049 3050 /* 3051 * split the node at the specified level in path in two. 
3052  * The path is corrected to point to the appropriate node after the split.
3053  *
3054  * Before splitting this tries to make some room in the node by pushing
3055  * left and right, if either one works, it returns right away.
3056  *
3057  * returns 0 on success and < 0 on failure
3058  */
3059 static noinline int split_node(struct btrfs_trans_handle *trans,
3060 			       struct btrfs_root *root,
3061 			       struct btrfs_path *path, int level)
3062 {
3063 	struct btrfs_fs_info *fs_info = root->fs_info;
3064 	struct extent_buffer *c;
3065 	struct extent_buffer *split;
3066 	struct btrfs_disk_key disk_key;
3067 	int mid;
3068 	int ret;
3069 	u32 c_nritems;
3070 
3071 	c = path->nodes[level];
3072 	WARN_ON(btrfs_header_generation(c) != trans->transid);
3073 	if (c == root->node) {
3074 		/*
3075 		 * Trying to split the root, let's make a new one.
3076 		 *
3077 		 * tree mod log: We don't log the removal of the old root in
3078 		 * insert_new_root, because that root buffer will be kept as a
3079 		 * normal node. We are going to log removal of half of the
3080 		 * elements below with btrfs_tree_mod_log_eb_copy(). We're
3081 		 * holding a tree lock on the buffer, which is why we cannot
3082 		 * race with other tree_mod_log users.
3083 		 */
3084 		ret = insert_new_root(trans, root, path, level + 1);
3085 		if (ret)
3086 			return ret;
3087 	} else {
3088 		ret = push_nodes_for_insert(trans, root, path, level);
3089 		c = path->nodes[level];
3090 		if (!ret && btrfs_header_nritems(c) <
3091 		    BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3)
3092 			return 0;
3093 		if (ret < 0)
3094 			return ret;
3095 	}
3096 
3097 	c_nritems = btrfs_header_nritems(c);
3098 	mid = (c_nritems + 1) / 2;
3099 	btrfs_node_key(c, &disk_key, mid);
3100 
3101 	split = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
3102 				       &disk_key, level, c->start, 0,
3103 				       BTRFS_NESTING_SPLIT);
3104 	if (IS_ERR(split))
3105 		return PTR_ERR(split);
3106 
3107 	root_add_used(root, fs_info->nodesize);
3108 	ASSERT(btrfs_header_level(c) == level);
3109 
3110 	ret = btrfs_tree_mod_log_eb_copy(split, c, 0, mid, c_nritems - mid);
3111 	if (ret) {
3112 		btrfs_tree_unlock(split);
3113 		free_extent_buffer(split);
3114 		btrfs_abort_transaction(trans, ret);
3115 		return ret;
3116 	}
3117 	copy_extent_buffer(split, c,
3118 			   btrfs_node_key_ptr_offset(split, 0),
3119 			   btrfs_node_key_ptr_offset(c, mid),
3120 			   (c_nritems - mid) * sizeof(struct btrfs_key_ptr));
3121 	btrfs_set_header_nritems(split, c_nritems - mid);
3122 	btrfs_set_header_nritems(c, mid);
3123 
3124 	btrfs_mark_buffer_dirty(c);
3125 	btrfs_mark_buffer_dirty(split);
3126 
3127 	ret = insert_ptr(trans, path, &disk_key, split->start,
3128 			 path->slots[level + 1] + 1, level + 1);
3129 	if (ret < 0) {
3130 		btrfs_tree_unlock(split);
3131 		free_extent_buffer(split);
3132 		return ret;
3133 	}
3134 
3135 	if (path->slots[level] >= mid) {
3136 		path->slots[level] -= mid;
3137 		btrfs_tree_unlock(c);
3138 		free_extent_buffer(c);
3139 		path->nodes[level] = split;
3140 		path->slots[level + 1] += 1;
3141 	} else {
3142 		btrfs_tree_unlock(split);
3143 		free_extent_buffer(split);
3144 	}
3145 	return 0;
3146 }
3147 
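/*
 * Illustrative sketch (not part of the original file) of what split_node()
 * does to a node 'c' holding N key pointers, with mid = (N + 1) / 2:
 *
 *         parent                          parent
 *      [ ... c ... ]        ---->     [ ... c | split ... ]
 *      c: ptrs [0, N)                 c: [0, mid)   split: [mid, N)
 *
 * The path is then updated to point into whichever of the two nodes now
 * contains the slot it was tracking.
 */

3148 /*
3149  * how many bytes are required to store the items in a leaf. start
3150  * and nr indicate which items in the leaf to check.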
This totals up the 3151 * space used both by the item structs and the item data 3152 */ 3153 static int leaf_space_used(const struct extent_buffer *l, int start, int nr) 3154 { 3155 int data_len; 3156 int nritems = btrfs_header_nritems(l); 3157 int end = min(nritems, start + nr) - 1; 3158 3159 if (!nr) 3160 return 0; 3161 data_len = btrfs_item_offset(l, start) + btrfs_item_size(l, start); 3162 data_len = data_len - btrfs_item_offset(l, end); 3163 data_len += sizeof(struct btrfs_item) * nr; 3164 WARN_ON(data_len < 0); 3165 return data_len; 3166 } 3167 3168 /* 3169 * The space between the end of the leaf items and 3170 * the start of the leaf data. IOW, how much room 3171 * the leaf has left for both items and data 3172 */ 3173 int btrfs_leaf_free_space(const struct extent_buffer *leaf) 3174 { 3175 struct btrfs_fs_info *fs_info = leaf->fs_info; 3176 int nritems = btrfs_header_nritems(leaf); 3177 int ret; 3178 3179 ret = BTRFS_LEAF_DATA_SIZE(fs_info) - leaf_space_used(leaf, 0, nritems); 3180 if (ret < 0) { 3181 btrfs_crit(fs_info, 3182 "leaf free space ret %d, leaf data size %lu, used %d nritems %d", 3183 ret, 3184 (unsigned long) BTRFS_LEAF_DATA_SIZE(fs_info), 3185 leaf_space_used(leaf, 0, nritems), nritems); 3186 } 3187 return ret; 3188 } 3189 3190 /* 3191 * min slot controls the lowest index we're willing to push to the 3192 * right. We'll push up to and including min_slot, but no lower 3193 */ 3194 static noinline int __push_leaf_right(struct btrfs_trans_handle *trans, 3195 struct btrfs_path *path, 3196 int data_size, int empty, 3197 struct extent_buffer *right, 3198 int free_space, u32 left_nritems, 3199 u32 min_slot) 3200 { 3201 struct btrfs_fs_info *fs_info = right->fs_info; 3202 struct extent_buffer *left = path->nodes[0]; 3203 struct extent_buffer *upper = path->nodes[1]; 3204 struct btrfs_map_token token; 3205 struct btrfs_disk_key disk_key; 3206 int slot; 3207 u32 i; 3208 int push_space = 0; 3209 int push_items = 0; 3210 u32 nr; 3211 u32 right_nritems; 3212 u32 data_end; 3213 u32 this_item_size; 3214 3215 if (empty) 3216 nr = 0; 3217 else 3218 nr = max_t(u32, 1, min_slot); 3219 3220 if (path->slots[0] >= left_nritems) 3221 push_space += data_size; 3222 3223 slot = path->slots[1]; 3224 i = left_nritems - 1; 3225 while (i >= nr) { 3226 if (!empty && push_items > 0) { 3227 if (path->slots[0] > i) 3228 break; 3229 if (path->slots[0] == i) { 3230 int space = btrfs_leaf_free_space(left); 3231 3232 if (space + push_space * 2 > free_space) 3233 break; 3234 } 3235 } 3236 3237 if (path->slots[0] == i) 3238 push_space += data_size; 3239 3240 this_item_size = btrfs_item_size(left, i); 3241 if (this_item_size + sizeof(struct btrfs_item) + 3242 push_space > free_space) 3243 break; 3244 3245 push_items++; 3246 push_space += this_item_size + sizeof(struct btrfs_item); 3247 if (i == 0) 3248 break; 3249 i--; 3250 } 3251 3252 if (push_items == 0) 3253 goto out_unlock; 3254 3255 WARN_ON(!empty && push_items == left_nritems); 3256 3257 /* push left to right */ 3258 right_nritems = btrfs_header_nritems(right); 3259 3260 push_space = btrfs_item_data_end(left, left_nritems - push_items); 3261 push_space -= leaf_data_end(left); 3262 3263 /* make room in the right data area */ 3264 data_end = leaf_data_end(right); 3265 memmove_leaf_data(right, data_end - push_space, data_end, 3266 BTRFS_LEAF_DATA_SIZE(fs_info) - data_end); 3267 3268 /* copy from the left data area */ 3269 copy_leaf_data(right, left, BTRFS_LEAF_DATA_SIZE(fs_info) - push_space, 3270 leaf_data_end(left), push_space); 3271 3272 
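	/*
	 * Shift the right leaf's existing items to higher slots, making room
	 * at the front for the items being pushed in from the left leaf.
	 */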
memmove_leaf_items(right, push_items, 0, right_nritems); 3273 3274 /* copy the items from left to right */ 3275 copy_leaf_items(right, left, 0, left_nritems - push_items, push_items); 3276 3277 /* update the item pointers */ 3278 btrfs_init_map_token(&token, right); 3279 right_nritems += push_items; 3280 btrfs_set_header_nritems(right, right_nritems); 3281 push_space = BTRFS_LEAF_DATA_SIZE(fs_info); 3282 for (i = 0; i < right_nritems; i++) { 3283 push_space -= btrfs_token_item_size(&token, i); 3284 btrfs_set_token_item_offset(&token, i, push_space); 3285 } 3286 3287 left_nritems -= push_items; 3288 btrfs_set_header_nritems(left, left_nritems); 3289 3290 if (left_nritems) 3291 btrfs_mark_buffer_dirty(left); 3292 else 3293 btrfs_clear_buffer_dirty(trans, left); 3294 3295 btrfs_mark_buffer_dirty(right); 3296 3297 btrfs_item_key(right, &disk_key, 0); 3298 btrfs_set_node_key(upper, &disk_key, slot + 1); 3299 btrfs_mark_buffer_dirty(upper); 3300 3301 /* then fixup the leaf pointer in the path */ 3302 if (path->slots[0] >= left_nritems) { 3303 path->slots[0] -= left_nritems; 3304 if (btrfs_header_nritems(path->nodes[0]) == 0) 3305 btrfs_clear_buffer_dirty(trans, path->nodes[0]); 3306 btrfs_tree_unlock(path->nodes[0]); 3307 free_extent_buffer(path->nodes[0]); 3308 path->nodes[0] = right; 3309 path->slots[1] += 1; 3310 } else { 3311 btrfs_tree_unlock(right); 3312 free_extent_buffer(right); 3313 } 3314 return 0; 3315 3316 out_unlock: 3317 btrfs_tree_unlock(right); 3318 free_extent_buffer(right); 3319 return 1; 3320 } 3321 3322 /* 3323 * push some data in the path leaf to the right, trying to free up at 3324 * least data_size bytes. returns zero if the push worked, nonzero otherwise 3325 * 3326 * returns 1 if the push failed because the other node didn't have enough 3327 * room, 0 if everything worked out and < 0 if there were major errors. 3328 * 3329 * this will push starting from min_slot to the end of the leaf. 
It won't 3330 * push any slot lower than min_slot 3331 */ 3332 static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root 3333 *root, struct btrfs_path *path, 3334 int min_data_size, int data_size, 3335 int empty, u32 min_slot) 3336 { 3337 struct extent_buffer *left = path->nodes[0]; 3338 struct extent_buffer *right; 3339 struct extent_buffer *upper; 3340 int slot; 3341 int free_space; 3342 u32 left_nritems; 3343 int ret; 3344 3345 if (!path->nodes[1]) 3346 return 1; 3347 3348 slot = path->slots[1]; 3349 upper = path->nodes[1]; 3350 if (slot >= btrfs_header_nritems(upper) - 1) 3351 return 1; 3352 3353 btrfs_assert_tree_write_locked(path->nodes[1]); 3354 3355 right = btrfs_read_node_slot(upper, slot + 1); 3356 if (IS_ERR(right)) 3357 return PTR_ERR(right); 3358 3359 __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT); 3360 3361 free_space = btrfs_leaf_free_space(right); 3362 if (free_space < data_size) 3363 goto out_unlock; 3364 3365 ret = btrfs_cow_block(trans, root, right, upper, 3366 slot + 1, &right, BTRFS_NESTING_RIGHT_COW); 3367 if (ret) 3368 goto out_unlock; 3369 3370 left_nritems = btrfs_header_nritems(left); 3371 if (left_nritems == 0) 3372 goto out_unlock; 3373 3374 if (check_sibling_keys(left, right)) { 3375 ret = -EUCLEAN; 3376 btrfs_abort_transaction(trans, ret); 3377 btrfs_tree_unlock(right); 3378 free_extent_buffer(right); 3379 return ret; 3380 } 3381 if (path->slots[0] == left_nritems && !empty) { 3382 /* Key greater than all keys in the leaf, right neighbor has 3383 * enough room for it and we're not emptying our leaf to delete 3384 * it, therefore use right neighbor to insert the new item and 3385 * no need to touch/dirty our left leaf. */ 3386 btrfs_tree_unlock(left); 3387 free_extent_buffer(left); 3388 path->nodes[0] = right; 3389 path->slots[0] = 0; 3390 path->slots[1]++; 3391 return 0; 3392 } 3393 3394 return __push_leaf_right(trans, path, min_data_size, empty, right, 3395 free_space, left_nritems, min_slot); 3396 out_unlock: 3397 btrfs_tree_unlock(right); 3398 free_extent_buffer(right); 3399 return 1; 3400 } 3401 3402 /* 3403 * push some data in the path leaf to the left, trying to free up at 3404 * least data_size bytes. returns zero if the push worked, nonzero otherwise 3405 * 3406 * max_slot can put a limit on how far into the leaf we'll push items. The 3407 * item at 'max_slot' won't be touched. 
Use (u32)-1 to make us do all the 3408 * items 3409 */ 3410 static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, 3411 struct btrfs_path *path, int data_size, 3412 int empty, struct extent_buffer *left, 3413 int free_space, u32 right_nritems, 3414 u32 max_slot) 3415 { 3416 struct btrfs_fs_info *fs_info = left->fs_info; 3417 struct btrfs_disk_key disk_key; 3418 struct extent_buffer *right = path->nodes[0]; 3419 int i; 3420 int push_space = 0; 3421 int push_items = 0; 3422 u32 old_left_nritems; 3423 u32 nr; 3424 int ret = 0; 3425 u32 this_item_size; 3426 u32 old_left_item_size; 3427 struct btrfs_map_token token; 3428 3429 if (empty) 3430 nr = min(right_nritems, max_slot); 3431 else 3432 nr = min(right_nritems - 1, max_slot); 3433 3434 for (i = 0; i < nr; i++) { 3435 if (!empty && push_items > 0) { 3436 if (path->slots[0] < i) 3437 break; 3438 if (path->slots[0] == i) { 3439 int space = btrfs_leaf_free_space(right); 3440 3441 if (space + push_space * 2 > free_space) 3442 break; 3443 } 3444 } 3445 3446 if (path->slots[0] == i) 3447 push_space += data_size; 3448 3449 this_item_size = btrfs_item_size(right, i); 3450 if (this_item_size + sizeof(struct btrfs_item) + push_space > 3451 free_space) 3452 break; 3453 3454 push_items++; 3455 push_space += this_item_size + sizeof(struct btrfs_item); 3456 } 3457 3458 if (push_items == 0) { 3459 ret = 1; 3460 goto out; 3461 } 3462 WARN_ON(!empty && push_items == btrfs_header_nritems(right)); 3463 3464 /* push data from right to left */ 3465 copy_leaf_items(left, right, btrfs_header_nritems(left), 0, push_items); 3466 3467 push_space = BTRFS_LEAF_DATA_SIZE(fs_info) - 3468 btrfs_item_offset(right, push_items - 1); 3469 3470 copy_leaf_data(left, right, leaf_data_end(left) - push_space, 3471 btrfs_item_offset(right, push_items - 1), push_space); 3472 old_left_nritems = btrfs_header_nritems(left); 3473 BUG_ON(old_left_nritems <= 0); 3474 3475 btrfs_init_map_token(&token, left); 3476 old_left_item_size = btrfs_item_offset(left, old_left_nritems - 1); 3477 for (i = old_left_nritems; i < old_left_nritems + push_items; i++) { 3478 u32 ioff; 3479 3480 ioff = btrfs_token_item_offset(&token, i); 3481 btrfs_set_token_item_offset(&token, i, 3482 ioff - (BTRFS_LEAF_DATA_SIZE(fs_info) - old_left_item_size)); 3483 } 3484 btrfs_set_header_nritems(left, old_left_nritems + push_items); 3485 3486 /* fixup right node */ 3487 if (push_items > right_nritems) 3488 WARN(1, KERN_CRIT "push items %d nr %u\n", push_items, 3489 right_nritems); 3490 3491 if (push_items < right_nritems) { 3492 push_space = btrfs_item_offset(right, push_items - 1) - 3493 leaf_data_end(right); 3494 memmove_leaf_data(right, 3495 BTRFS_LEAF_DATA_SIZE(fs_info) - push_space, 3496 leaf_data_end(right), push_space); 3497 3498 memmove_leaf_items(right, 0, push_items, 3499 btrfs_header_nritems(right) - push_items); 3500 } 3501 3502 btrfs_init_map_token(&token, right); 3503 right_nritems -= push_items; 3504 btrfs_set_header_nritems(right, right_nritems); 3505 push_space = BTRFS_LEAF_DATA_SIZE(fs_info); 3506 for (i = 0; i < right_nritems; i++) { 3507 push_space = push_space - btrfs_token_item_size(&token, i); 3508 btrfs_set_token_item_offset(&token, i, push_space); 3509 } 3510 3511 btrfs_mark_buffer_dirty(left); 3512 if (right_nritems) 3513 btrfs_mark_buffer_dirty(right); 3514 else 3515 btrfs_clear_buffer_dirty(trans, right); 3516 3517 btrfs_item_key(right, &disk_key, 0); 3518 fixup_low_keys(path, &disk_key, 1); 3519 3520 /* then fixup the leaf pointer in the path */ 3521 if (path->slots[0] < 
push_items) { 3522 path->slots[0] += old_left_nritems; 3523 btrfs_tree_unlock(path->nodes[0]); 3524 free_extent_buffer(path->nodes[0]); 3525 path->nodes[0] = left; 3526 path->slots[1] -= 1; 3527 } else { 3528 btrfs_tree_unlock(left); 3529 free_extent_buffer(left); 3530 path->slots[0] -= push_items; 3531 } 3532 BUG_ON(path->slots[0] < 0); 3533 return ret; 3534 out: 3535 btrfs_tree_unlock(left); 3536 free_extent_buffer(left); 3537 return ret; 3538 } 3539 3540 /* 3541 * push some data in the path leaf to the left, trying to free up at 3542 * least data_size bytes. returns zero if the push worked, nonzero otherwise 3543 * 3544 * max_slot can put a limit on how far into the leaf we'll push items. The 3545 * item at 'max_slot' won't be touched. Use (u32)-1 to make us push all the 3546 * items 3547 */ 3548 static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root 3549 *root, struct btrfs_path *path, int min_data_size, 3550 int data_size, int empty, u32 max_slot) 3551 { 3552 struct extent_buffer *right = path->nodes[0]; 3553 struct extent_buffer *left; 3554 int slot; 3555 int free_space; 3556 u32 right_nritems; 3557 int ret = 0; 3558 3559 slot = path->slots[1]; 3560 if (slot == 0) 3561 return 1; 3562 if (!path->nodes[1]) 3563 return 1; 3564 3565 right_nritems = btrfs_header_nritems(right); 3566 if (right_nritems == 0) 3567 return 1; 3568 3569 btrfs_assert_tree_write_locked(path->nodes[1]); 3570 3571 left = btrfs_read_node_slot(path->nodes[1], slot - 1); 3572 if (IS_ERR(left)) 3573 return PTR_ERR(left); 3574 3575 __btrfs_tree_lock(left, BTRFS_NESTING_LEFT); 3576 3577 free_space = btrfs_leaf_free_space(left); 3578 if (free_space < data_size) { 3579 ret = 1; 3580 goto out; 3581 } 3582 3583 ret = btrfs_cow_block(trans, root, left, 3584 path->nodes[1], slot - 1, &left, 3585 BTRFS_NESTING_LEFT_COW); 3586 if (ret) { 3587 /* we hit -ENOSPC, but it isn't fatal here */ 3588 if (ret == -ENOSPC) 3589 ret = 1; 3590 goto out; 3591 } 3592 3593 if (check_sibling_keys(left, right)) { 3594 ret = -EUCLEAN; 3595 btrfs_abort_transaction(trans, ret); 3596 goto out; 3597 } 3598 return __push_leaf_left(trans, path, min_data_size, empty, left, 3599 free_space, right_nritems, max_slot); 3600 out: 3601 btrfs_tree_unlock(left); 3602 free_extent_buffer(left); 3603 return ret; 3604 } 3605 3606 /* 3607 * split the path's leaf in two, making sure there is at least data_size 3608 * available for the resulting leaf level of the path. 
3609 */ 3610 static noinline int copy_for_split(struct btrfs_trans_handle *trans, 3611 struct btrfs_path *path, 3612 struct extent_buffer *l, 3613 struct extent_buffer *right, 3614 int slot, int mid, int nritems) 3615 { 3616 struct btrfs_fs_info *fs_info = trans->fs_info; 3617 int data_copy_size; 3618 int rt_data_off; 3619 int i; 3620 int ret; 3621 struct btrfs_disk_key disk_key; 3622 struct btrfs_map_token token; 3623 3624 nritems = nritems - mid; 3625 btrfs_set_header_nritems(right, nritems); 3626 data_copy_size = btrfs_item_data_end(l, mid) - leaf_data_end(l); 3627 3628 copy_leaf_items(right, l, 0, mid, nritems); 3629 3630 copy_leaf_data(right, l, BTRFS_LEAF_DATA_SIZE(fs_info) - data_copy_size, 3631 leaf_data_end(l), data_copy_size); 3632 3633 rt_data_off = BTRFS_LEAF_DATA_SIZE(fs_info) - btrfs_item_data_end(l, mid); 3634 3635 btrfs_init_map_token(&token, right); 3636 for (i = 0; i < nritems; i++) { 3637 u32 ioff; 3638 3639 ioff = btrfs_token_item_offset(&token, i); 3640 btrfs_set_token_item_offset(&token, i, ioff + rt_data_off); 3641 } 3642 3643 btrfs_set_header_nritems(l, mid); 3644 btrfs_item_key(right, &disk_key, 0); 3645 ret = insert_ptr(trans, path, &disk_key, right->start, path->slots[1] + 1, 1); 3646 if (ret < 0) 3647 return ret; 3648 3649 btrfs_mark_buffer_dirty(right); 3650 btrfs_mark_buffer_dirty(l); 3651 BUG_ON(path->slots[0] != slot); 3652 3653 if (mid <= slot) { 3654 btrfs_tree_unlock(path->nodes[0]); 3655 free_extent_buffer(path->nodes[0]); 3656 path->nodes[0] = right; 3657 path->slots[0] -= mid; 3658 path->slots[1] += 1; 3659 } else { 3660 btrfs_tree_unlock(right); 3661 free_extent_buffer(right); 3662 } 3663 3664 BUG_ON(path->slots[0] < 0); 3665 3666 return 0; 3667 } 3668 3669 /* 3670 * double splits happen when we need to insert a big item in the middle 3671 * of a leaf. A double split can leave us with 3 mostly empty leaves: 3672 * leaf: [ slots 0 - N] [ our target ] [ N + 1 - total in leaf ] 3673 * A B C 3674 * 3675 * We avoid this by trying to push the items on either side of our target 3676 * into the adjacent leaves. If all goes well we can avoid the double split 3677 * completely. 3678 */ 3679 static noinline int push_for_double_split(struct btrfs_trans_handle *trans, 3680 struct btrfs_root *root, 3681 struct btrfs_path *path, 3682 int data_size) 3683 { 3684 int ret; 3685 int progress = 0; 3686 int slot; 3687 u32 nritems; 3688 int space_needed = data_size; 3689 3690 slot = path->slots[0]; 3691 if (slot < btrfs_header_nritems(path->nodes[0])) 3692 space_needed -= btrfs_leaf_free_space(path->nodes[0]); 3693 3694 /* 3695 * try to push all the items after our slot into the 3696 * right leaf 3697 */ 3698 ret = push_leaf_right(trans, root, path, 1, space_needed, 0, slot); 3699 if (ret < 0) 3700 return ret; 3701 3702 if (ret == 0) 3703 progress++; 3704 3705 nritems = btrfs_header_nritems(path->nodes[0]); 3706 /* 3707 * our goal is to get our slot at the start or end of a leaf. 
3708 	 * If we've done so, we're done.
3709 	 */
3710 	if (path->slots[0] == 0 || path->slots[0] == nritems)
3711 		return 0;
3712 
3713 	if (btrfs_leaf_free_space(path->nodes[0]) >= data_size)
3714 		return 0;
3715 
3716 	/* try to push all the items before our slot into the left leaf */
3717 	slot = path->slots[0];
3718 	space_needed = data_size;
3719 	if (slot > 0)
3720 		space_needed -= btrfs_leaf_free_space(path->nodes[0]);
3721 	ret = push_leaf_left(trans, root, path, 1, space_needed, 0, slot);
3722 	if (ret < 0)
3723 		return ret;
3724 
3725 	if (ret == 0)
3726 		progress++;
3727 
3728 	if (progress)
3729 		return 0;
3730 	return 1;
3731 }
3732 
3733 /*
3734  * split the path's leaf in two, making sure there is at least data_size
3735  * available for the resulting leaf level of the path.
3736  *
3737  * returns 0 if all went well and < 0 on failure.
3738  */
3739 static noinline int split_leaf(struct btrfs_trans_handle *trans,
3740 			       struct btrfs_root *root,
3741 			       const struct btrfs_key *ins_key,
3742 			       struct btrfs_path *path, int data_size,
3743 			       int extend)
3744 {
3745 	struct btrfs_disk_key disk_key;
3746 	struct extent_buffer *l;
3747 	u32 nritems;
3748 	int mid;
3749 	int slot;
3750 	struct extent_buffer *right;
3751 	struct btrfs_fs_info *fs_info = root->fs_info;
3752 	int ret = 0;
3753 	int wret;
3754 	int split;
3755 	int num_doubles = 0;
3756 	int tried_avoid_double = 0;
3757 
3758 	l = path->nodes[0];
3759 	slot = path->slots[0];
3760 	if (extend && data_size + btrfs_item_size(l, slot) +
3761 	    sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(fs_info))
3762 		return -EOVERFLOW;
3763 
3764 	/* first try to make some room by pushing left and right */
3765 	if (data_size && path->nodes[1]) {
3766 		int space_needed = data_size;
3767 
3768 		if (slot < btrfs_header_nritems(l))
3769 			space_needed -= btrfs_leaf_free_space(l);
3770 
3771 		wret = push_leaf_right(trans, root, path, space_needed,
3772 				       space_needed, 0, 0);
3773 		if (wret < 0)
3774 			return wret;
3775 		if (wret) {
3776 			space_needed = data_size;
3777 			if (slot > 0)
3778 				space_needed -= btrfs_leaf_free_space(l);
3779 			wret = push_leaf_left(trans, root, path, space_needed,
3780 					      space_needed, 0, (u32)-1);
3781 			if (wret < 0)
3782 				return wret;
3783 		}
3784 		l = path->nodes[0];
3785 
3786 		/* did the pushes work? */
3787 		if (btrfs_leaf_free_space(l) >= data_size)
3788 			return 0;
3789 	}
3790 
3791 	if (!path->nodes[1]) {
3792 		ret = insert_new_root(trans, root, path, 1);
3793 		if (ret)
3794 			return ret;
3795 	}
3796 again:
3797 	split = 1;
3798 	l = path->nodes[0];
3799 	slot = path->slots[0];
3800 	nritems = btrfs_header_nritems(l);
3801 	mid = (nritems + 1) / 2;
3802 
3803 	if (mid <= slot) {
3804 		if (nritems == 1 ||
3805 		    leaf_space_used(l, mid, nritems - mid) + data_size >
3806 		    BTRFS_LEAF_DATA_SIZE(fs_info)) {
3807 			if (slot >= nritems) {
3808 				split = 0;
3809 			} else {
3810 				mid = slot;
3811 				if (mid != nritems &&
3812 				    leaf_space_used(l, mid, nritems - mid) +
3813 				    data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) {
3814 					if (data_size && !tried_avoid_double)
3815 						goto push_for_double;
3816 					split = 2;
3817 				}
3818 			}
3819 		}
3820 	} else {
3821 		if (leaf_space_used(l, 0, mid) + data_size >
3822 		    BTRFS_LEAF_DATA_SIZE(fs_info)) {
3823 			if (!extend && data_size && slot == 0) {
3824 				split = 0;
3825 			} else if ((extend || !data_size) && slot == 0) {
3826 				mid = 1;
3827 			} else {
3828 				mid = slot;
3829 				if (mid != nritems &&
3830 				    leaf_space_used(l, mid, nritems - mid) +
3831 				    data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) {
3832 					if (data_size && !tried_avoid_double)
3833 						goto push_for_double;
3834 					split = 2;
3835 				}
3836 			}
3837 		}
3838 	}
3839 
3840 	if (split == 0)
3841 		btrfs_cpu_key_to_disk(&disk_key, ins_key);
3842 	else
3843 		btrfs_item_key(l, &disk_key, mid);
3844 
3845 	/*
3846 	 * We have to use BTRFS_NESTING_NEW_ROOT here if we've done a double
3847 	 * split, because we're only allowed to have MAX_LOCKDEP_SUBCLASSES
3848 	 * subclasses, which is 8 at the time of this patch, and we've maxed it
3849 	 * out. In the future we could add a
3850 	 * BTRFS_NESTING_SPLIT_THE_SPLITTENING if we need to, but for now just
3851 	 * use BTRFS_NESTING_NEW_ROOT.
3852 	 */
3853 	right = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
3854 				       &disk_key, 0, l->start, 0,
3855 				       num_doubles ? BTRFS_NESTING_NEW_ROOT :
3856 				       BTRFS_NESTING_SPLIT);
3857 	if (IS_ERR(right))
3858 		return PTR_ERR(right);
3859 
3860 	root_add_used(root, fs_info->nodesize);
3861 
3862 	if (split == 0) {
3863 		if (mid <= slot) {
3864 			btrfs_set_header_nritems(right, 0);
3865 			ret = insert_ptr(trans, path, &disk_key,
3866 					 right->start, path->slots[1] + 1, 1);
3867 			if (ret < 0) {
3868 				btrfs_tree_unlock(right);
3869 				free_extent_buffer(right);
3870 				return ret;
3871 			}
3872 			btrfs_tree_unlock(path->nodes[0]);
3873 			free_extent_buffer(path->nodes[0]);
3874 			path->nodes[0] = right;
3875 			path->slots[0] = 0;
3876 			path->slots[1] += 1;
3877 		} else {
3878 			btrfs_set_header_nritems(right, 0);
3879 			ret = insert_ptr(trans, path, &disk_key,
3880 					 right->start, path->slots[1], 1);
3881 			if (ret < 0) {
3882 				btrfs_tree_unlock(right);
3883 				free_extent_buffer(right);
3884 				return ret;
3885 			}
3886 			btrfs_tree_unlock(path->nodes[0]);
3887 			free_extent_buffer(path->nodes[0]);
3888 			path->nodes[0] = right;
3889 			path->slots[0] = 0;
3890 			if (path->slots[1] == 0)
3891 				fixup_low_keys(path, &disk_key, 1);
3892 		}
3893 		/*
3894 		 * We create a new leaf 'right' for the required ins_len and
3895 		 * we'll do btrfs_mark_buffer_dirty() on this leaf after copying
3896 		 * the new item's content into 'right'.
3897 */ 3898 return ret; 3899 } 3900 3901 ret = copy_for_split(trans, path, l, right, slot, mid, nritems); 3902 if (ret < 0) { 3903 btrfs_tree_unlock(right); 3904 free_extent_buffer(right); 3905 return ret; 3906 } 3907 3908 if (split == 2) { 3909 BUG_ON(num_doubles != 0); 3910 num_doubles++; 3911 goto again; 3912 } 3913 3914 return 0; 3915 3916 push_for_double: 3917 push_for_double_split(trans, root, path, data_size); 3918 tried_avoid_double = 1; 3919 if (btrfs_leaf_free_space(path->nodes[0]) >= data_size) 3920 return 0; 3921 goto again; 3922 } 3923 3924 static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans, 3925 struct btrfs_root *root, 3926 struct btrfs_path *path, int ins_len) 3927 { 3928 struct btrfs_key key; 3929 struct extent_buffer *leaf; 3930 struct btrfs_file_extent_item *fi; 3931 u64 extent_len = 0; 3932 u32 item_size; 3933 int ret; 3934 3935 leaf = path->nodes[0]; 3936 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 3937 3938 BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY && 3939 key.type != BTRFS_EXTENT_CSUM_KEY); 3940 3941 if (btrfs_leaf_free_space(leaf) >= ins_len) 3942 return 0; 3943 3944 item_size = btrfs_item_size(leaf, path->slots[0]); 3945 if (key.type == BTRFS_EXTENT_DATA_KEY) { 3946 fi = btrfs_item_ptr(leaf, path->slots[0], 3947 struct btrfs_file_extent_item); 3948 extent_len = btrfs_file_extent_num_bytes(leaf, fi); 3949 } 3950 btrfs_release_path(path); 3951 3952 path->keep_locks = 1; 3953 path->search_for_split = 1; 3954 ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 3955 path->search_for_split = 0; 3956 if (ret > 0) 3957 ret = -EAGAIN; 3958 if (ret < 0) 3959 goto err; 3960 3961 ret = -EAGAIN; 3962 leaf = path->nodes[0]; 3963 /* if our item isn't there, return now */ 3964 if (item_size != btrfs_item_size(leaf, path->slots[0])) 3965 goto err; 3966 3967 /* the leaf has changed, it now has room. return now */ 3968 if (btrfs_leaf_free_space(path->nodes[0]) >= ins_len) 3969 goto err; 3970 3971 if (key.type == BTRFS_EXTENT_DATA_KEY) { 3972 fi = btrfs_item_ptr(leaf, path->slots[0], 3973 struct btrfs_file_extent_item); 3974 if (extent_len != btrfs_file_extent_num_bytes(leaf, fi)) 3975 goto err; 3976 } 3977 3978 ret = split_leaf(trans, root, &key, path, ins_len, 1); 3979 if (ret) 3980 goto err; 3981 3982 path->keep_locks = 0; 3983 btrfs_unlock_up_safe(path, 1); 3984 return 0; 3985 err: 3986 path->keep_locks = 0; 3987 return ret; 3988 } 3989 3990 static noinline int split_item(struct btrfs_path *path, 3991 const struct btrfs_key *new_key, 3992 unsigned long split_offset) 3993 { 3994 struct extent_buffer *leaf; 3995 int orig_slot, slot; 3996 char *buf; 3997 u32 nritems; 3998 u32 item_size; 3999 u32 orig_offset; 4000 struct btrfs_disk_key disk_key; 4001 4002 leaf = path->nodes[0]; 4003 /* 4004 * Shouldn't happen because the caller must have previously called 4005 * setup_leaf_for_split() to make room for the new item in the leaf. 
4006 	 */
4007 	if (WARN_ON(btrfs_leaf_free_space(leaf) < sizeof(struct btrfs_item)))
4008 		return -ENOSPC;
4009
4010 	orig_slot = path->slots[0];
4011 	orig_offset = btrfs_item_offset(leaf, path->slots[0]);
4012 	item_size = btrfs_item_size(leaf, path->slots[0]);
4013
4014 	buf = kmalloc(item_size, GFP_NOFS);
4015 	if (!buf)
4016 		return -ENOMEM;
4017
4018 	read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf,
4019 			    path->slots[0]), item_size);
4020
4021 	slot = path->slots[0] + 1;
4022 	nritems = btrfs_header_nritems(leaf);
4023 	if (slot != nritems) {
4024 		/* shift the items */
4025 		memmove_leaf_items(leaf, slot + 1, slot, nritems - slot);
4026 	}
4027
4028 	btrfs_cpu_key_to_disk(&disk_key, new_key);
4029 	btrfs_set_item_key(leaf, &disk_key, slot);
4030
4031 	btrfs_set_item_offset(leaf, slot, orig_offset);
4032 	btrfs_set_item_size(leaf, slot, item_size - split_offset);
4033
4034 	btrfs_set_item_offset(leaf, orig_slot,
4035 				 orig_offset + item_size - split_offset);
4036 	btrfs_set_item_size(leaf, orig_slot, split_offset);
4037
4038 	btrfs_set_header_nritems(leaf, nritems + 1);
4039
4040 	/* write the data for the start of the original item */
4041 	write_extent_buffer(leaf, buf,
4042 			    btrfs_item_ptr_offset(leaf, path->slots[0]),
4043 			    split_offset);
4044
4045 	/* write the data for the new item */
4046 	write_extent_buffer(leaf, buf + split_offset,
4047 			    btrfs_item_ptr_offset(leaf, slot),
4048 			    item_size - split_offset);
4049 	btrfs_mark_buffer_dirty(leaf);
4050
4051 	BUG_ON(btrfs_leaf_free_space(leaf) < 0);
4052 	kfree(buf);
4053 	return 0;
4054 }
4055
4056 /*
4057  * This function splits a single item into two items,
4058  * giving 'new_key' to the new item and splitting the
4059  * old one at split_offset (from the start of the item).
4060  *
4061  * The path may be released by this operation. After
4062  * the split, the path is pointing to the old item. The
4063  * new item is going to be in the same node as the old one.
4064  *
4065  * Note: the item being split must be small enough to live alone on
4066  * a tree block with room for one extra struct btrfs_item.
4067  *
4068  * This allows us to split the item in place, keeping a lock on the
4069  * leaf the entire time.
4070  */
4071 int btrfs_split_item(struct btrfs_trans_handle *trans,
4072 		     struct btrfs_root *root,
4073 		     struct btrfs_path *path,
4074 		     const struct btrfs_key *new_key,
4075 		     unsigned long split_offset)
4076 {
4077 	int ret;
4078 	ret = setup_leaf_for_split(trans, root, path,
4079 				   sizeof(struct btrfs_item));
4080 	if (ret)
4081 		return ret;
4082
4083 	ret = split_item(path, new_key, split_offset);
4084 	return ret;
4085 }
4086
4087 /*
4088  * make the item pointed to by the path smaller. new_size indicates
4089  * how small to make it, and from_end tells us if we just chop bytes
4090  * off the end of the item or if we shift the item to chop bytes off
4091  * the front.
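 *
 * A minimal usage sketch (assumed caller context, not code from this
 * file), keeping the first 16 bytes of an item and dropping its tail:
 *
 *	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
 *	if (ret == 0)
 *		btrfs_truncate_item(path, 16, 1);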
4092 */ 4093 void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end) 4094 { 4095 int slot; 4096 struct extent_buffer *leaf; 4097 u32 nritems; 4098 unsigned int data_end; 4099 unsigned int old_data_start; 4100 unsigned int old_size; 4101 unsigned int size_diff; 4102 int i; 4103 struct btrfs_map_token token; 4104 4105 leaf = path->nodes[0]; 4106 slot = path->slots[0]; 4107 4108 old_size = btrfs_item_size(leaf, slot); 4109 if (old_size == new_size) 4110 return; 4111 4112 nritems = btrfs_header_nritems(leaf); 4113 data_end = leaf_data_end(leaf); 4114 4115 old_data_start = btrfs_item_offset(leaf, slot); 4116 4117 size_diff = old_size - new_size; 4118 4119 BUG_ON(slot < 0); 4120 BUG_ON(slot >= nritems); 4121 4122 /* 4123 * item0..itemN ... dataN.offset..dataN.size .. data0.size 4124 */ 4125 /* first correct the data pointers */ 4126 btrfs_init_map_token(&token, leaf); 4127 for (i = slot; i < nritems; i++) { 4128 u32 ioff; 4129 4130 ioff = btrfs_token_item_offset(&token, i); 4131 btrfs_set_token_item_offset(&token, i, ioff + size_diff); 4132 } 4133 4134 /* shift the data */ 4135 if (from_end) { 4136 memmove_leaf_data(leaf, data_end + size_diff, data_end, 4137 old_data_start + new_size - data_end); 4138 } else { 4139 struct btrfs_disk_key disk_key; 4140 u64 offset; 4141 4142 btrfs_item_key(leaf, &disk_key, slot); 4143 4144 if (btrfs_disk_key_type(&disk_key) == BTRFS_EXTENT_DATA_KEY) { 4145 unsigned long ptr; 4146 struct btrfs_file_extent_item *fi; 4147 4148 fi = btrfs_item_ptr(leaf, slot, 4149 struct btrfs_file_extent_item); 4150 fi = (struct btrfs_file_extent_item *)( 4151 (unsigned long)fi - size_diff); 4152 4153 if (btrfs_file_extent_type(leaf, fi) == 4154 BTRFS_FILE_EXTENT_INLINE) { 4155 ptr = btrfs_item_ptr_offset(leaf, slot); 4156 memmove_extent_buffer(leaf, ptr, 4157 (unsigned long)fi, 4158 BTRFS_FILE_EXTENT_INLINE_DATA_START); 4159 } 4160 } 4161 4162 memmove_leaf_data(leaf, data_end + size_diff, data_end, 4163 old_data_start - data_end); 4164 4165 offset = btrfs_disk_key_offset(&disk_key); 4166 btrfs_set_disk_key_offset(&disk_key, offset + size_diff); 4167 btrfs_set_item_key(leaf, &disk_key, slot); 4168 if (slot == 0) 4169 fixup_low_keys(path, &disk_key, 1); 4170 } 4171 4172 btrfs_set_item_size(leaf, slot, new_size); 4173 btrfs_mark_buffer_dirty(leaf); 4174 4175 if (btrfs_leaf_free_space(leaf) < 0) { 4176 btrfs_print_leaf(leaf); 4177 BUG(); 4178 } 4179 } 4180 4181 /* 4182 * make the item pointed to by the path bigger, data_size is the added size. 4183 */ 4184 void btrfs_extend_item(struct btrfs_path *path, u32 data_size) 4185 { 4186 int slot; 4187 struct extent_buffer *leaf; 4188 u32 nritems; 4189 unsigned int data_end; 4190 unsigned int old_data; 4191 unsigned int old_size; 4192 int i; 4193 struct btrfs_map_token token; 4194 4195 leaf = path->nodes[0]; 4196 4197 nritems = btrfs_header_nritems(leaf); 4198 data_end = leaf_data_end(leaf); 4199 4200 if (btrfs_leaf_free_space(leaf) < data_size) { 4201 btrfs_print_leaf(leaf); 4202 BUG(); 4203 } 4204 slot = path->slots[0]; 4205 old_data = btrfs_item_data_end(leaf, slot); 4206 4207 BUG_ON(slot < 0); 4208 if (slot >= nritems) { 4209 btrfs_print_leaf(leaf); 4210 btrfs_crit(leaf->fs_info, "slot %d too large, nritems %d", 4211 slot, nritems); 4212 BUG(); 4213 } 4214 4215 /* 4216 * item0..itemN ... dataN.offset..dataN.size .. 
data0.size 4217 */ 4218 /* first correct the data pointers */ 4219 btrfs_init_map_token(&token, leaf); 4220 for (i = slot; i < nritems; i++) { 4221 u32 ioff; 4222 4223 ioff = btrfs_token_item_offset(&token, i); 4224 btrfs_set_token_item_offset(&token, i, ioff - data_size); 4225 } 4226 4227 /* shift the data */ 4228 memmove_leaf_data(leaf, data_end - data_size, data_end, 4229 old_data - data_end); 4230 4231 data_end = old_data; 4232 old_size = btrfs_item_size(leaf, slot); 4233 btrfs_set_item_size(leaf, slot, old_size + data_size); 4234 btrfs_mark_buffer_dirty(leaf); 4235 4236 if (btrfs_leaf_free_space(leaf) < 0) { 4237 btrfs_print_leaf(leaf); 4238 BUG(); 4239 } 4240 } 4241 4242 /* 4243 * Make space in the node before inserting one or more items. 4244 * 4245 * @root: root we are inserting items to 4246 * @path: points to the leaf/slot where we are going to insert new items 4247 * @batch: information about the batch of items to insert 4248 * 4249 * Main purpose is to save stack depth by doing the bulk of the work in a 4250 * function that doesn't call btrfs_search_slot 4251 */ 4252 static void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, 4253 const struct btrfs_item_batch *batch) 4254 { 4255 struct btrfs_fs_info *fs_info = root->fs_info; 4256 int i; 4257 u32 nritems; 4258 unsigned int data_end; 4259 struct btrfs_disk_key disk_key; 4260 struct extent_buffer *leaf; 4261 int slot; 4262 struct btrfs_map_token token; 4263 u32 total_size; 4264 4265 /* 4266 * Before anything else, update keys in the parent and other ancestors 4267 * if needed, then release the write locks on them, so that other tasks 4268 * can use them while we modify the leaf. 4269 */ 4270 if (path->slots[0] == 0) { 4271 btrfs_cpu_key_to_disk(&disk_key, &batch->keys[0]); 4272 fixup_low_keys(path, &disk_key, 1); 4273 } 4274 btrfs_unlock_up_safe(path, 1); 4275 4276 leaf = path->nodes[0]; 4277 slot = path->slots[0]; 4278 4279 nritems = btrfs_header_nritems(leaf); 4280 data_end = leaf_data_end(leaf); 4281 total_size = batch->total_data_size + (batch->nr * sizeof(struct btrfs_item)); 4282 4283 if (btrfs_leaf_free_space(leaf) < total_size) { 4284 btrfs_print_leaf(leaf); 4285 btrfs_crit(fs_info, "not enough freespace need %u have %d", 4286 total_size, btrfs_leaf_free_space(leaf)); 4287 BUG(); 4288 } 4289 4290 btrfs_init_map_token(&token, leaf); 4291 if (slot != nritems) { 4292 unsigned int old_data = btrfs_item_data_end(leaf, slot); 4293 4294 if (old_data < data_end) { 4295 btrfs_print_leaf(leaf); 4296 btrfs_crit(fs_info, 4297 "item at slot %d with data offset %u beyond data end of leaf %u", 4298 slot, old_data, data_end); 4299 BUG(); 4300 } 4301 /* 4302 * item0..itemN ... dataN.offset..dataN.size .. 
data0.size 4303 */ 4304 /* first correct the data pointers */ 4305 for (i = slot; i < nritems; i++) { 4306 u32 ioff; 4307 4308 ioff = btrfs_token_item_offset(&token, i); 4309 btrfs_set_token_item_offset(&token, i, 4310 ioff - batch->total_data_size); 4311 } 4312 /* shift the items */ 4313 memmove_leaf_items(leaf, slot + batch->nr, slot, nritems - slot); 4314 4315 /* shift the data */ 4316 memmove_leaf_data(leaf, data_end - batch->total_data_size, 4317 data_end, old_data - data_end); 4318 data_end = old_data; 4319 } 4320 4321 /* setup the item for the new data */ 4322 for (i = 0; i < batch->nr; i++) { 4323 btrfs_cpu_key_to_disk(&disk_key, &batch->keys[i]); 4324 btrfs_set_item_key(leaf, &disk_key, slot + i); 4325 data_end -= batch->data_sizes[i]; 4326 btrfs_set_token_item_offset(&token, slot + i, data_end); 4327 btrfs_set_token_item_size(&token, slot + i, batch->data_sizes[i]); 4328 } 4329 4330 btrfs_set_header_nritems(leaf, nritems + batch->nr); 4331 btrfs_mark_buffer_dirty(leaf); 4332 4333 if (btrfs_leaf_free_space(leaf) < 0) { 4334 btrfs_print_leaf(leaf); 4335 BUG(); 4336 } 4337 } 4338 4339 /* 4340 * Insert a new item into a leaf. 4341 * 4342 * @root: The root of the btree. 4343 * @path: A path pointing to the target leaf and slot. 4344 * @key: The key of the new item. 4345 * @data_size: The size of the data associated with the new key. 4346 */ 4347 void btrfs_setup_item_for_insert(struct btrfs_root *root, 4348 struct btrfs_path *path, 4349 const struct btrfs_key *key, 4350 u32 data_size) 4351 { 4352 struct btrfs_item_batch batch; 4353 4354 batch.keys = key; 4355 batch.data_sizes = &data_size; 4356 batch.total_data_size = data_size; 4357 batch.nr = 1; 4358 4359 setup_items_for_insert(root, path, &batch); 4360 } 4361 4362 /* 4363 * Given a key and some data, insert items into the tree. 4364 * This does all the path init required, making room in the tree if needed. 4365 */ 4366 int btrfs_insert_empty_items(struct btrfs_trans_handle *trans, 4367 struct btrfs_root *root, 4368 struct btrfs_path *path, 4369 const struct btrfs_item_batch *batch) 4370 { 4371 int ret = 0; 4372 int slot; 4373 u32 total_size; 4374 4375 total_size = batch->total_data_size + (batch->nr * sizeof(struct btrfs_item)); 4376 ret = btrfs_search_slot(trans, root, &batch->keys[0], path, total_size, 1); 4377 if (ret == 0) 4378 return -EEXIST; 4379 if (ret < 0) 4380 return ret; 4381 4382 slot = path->slots[0]; 4383 BUG_ON(slot < 0); 4384 4385 setup_items_for_insert(root, path, batch); 4386 return 0; 4387 } 4388 4389 /* 4390 * Given a key and some data, insert an item into the tree. 4391 * This does all the path init required, making room in the tree if needed. 4392 */ 4393 int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, 4394 const struct btrfs_key *cpu_key, void *data, 4395 u32 data_size) 4396 { 4397 int ret = 0; 4398 struct btrfs_path *path; 4399 struct extent_buffer *leaf; 4400 unsigned long ptr; 4401 4402 path = btrfs_alloc_path(); 4403 if (!path) 4404 return -ENOMEM; 4405 ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size); 4406 if (!ret) { 4407 leaf = path->nodes[0]; 4408 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); 4409 write_extent_buffer(leaf, data, ptr, data_size); 4410 btrfs_mark_buffer_dirty(leaf); 4411 } 4412 btrfs_free_path(path); 4413 return ret; 4414 } 4415 4416 /* 4417 * This function duplicates an item, giving 'new_key' to the new item. 4418 * It guarantees both items live in the same tree leaf and the new item is 4419 * contiguous with the original item. 
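 *
 * A hypothetical call (names assumed for illustration), duplicating a
 * file extent item so the copy can later be trimmed to the split range:
 *
 *	new_key.objectid = key.objectid;
 *	new_key.type = BTRFS_EXTENT_DATA_KEY;
 *	new_key.offset = split_start;
 *	ret = btrfs_duplicate_item(trans, root, path, &new_key);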
4420 * 4421 * This allows us to split a file extent in place, keeping a lock on the leaf 4422 * the entire time. 4423 */ 4424 int btrfs_duplicate_item(struct btrfs_trans_handle *trans, 4425 struct btrfs_root *root, 4426 struct btrfs_path *path, 4427 const struct btrfs_key *new_key) 4428 { 4429 struct extent_buffer *leaf; 4430 int ret; 4431 u32 item_size; 4432 4433 leaf = path->nodes[0]; 4434 item_size = btrfs_item_size(leaf, path->slots[0]); 4435 ret = setup_leaf_for_split(trans, root, path, 4436 item_size + sizeof(struct btrfs_item)); 4437 if (ret) 4438 return ret; 4439 4440 path->slots[0]++; 4441 btrfs_setup_item_for_insert(root, path, new_key, item_size); 4442 leaf = path->nodes[0]; 4443 memcpy_extent_buffer(leaf, 4444 btrfs_item_ptr_offset(leaf, path->slots[0]), 4445 btrfs_item_ptr_offset(leaf, path->slots[0] - 1), 4446 item_size); 4447 return 0; 4448 } 4449 4450 /* 4451 * delete the pointer from a given node. 4452 * 4453 * the tree should have been previously balanced so the deletion does not 4454 * empty a node. 4455 * 4456 * This is exported for use inside btrfs-progs, don't un-export it. 4457 */ 4458 int btrfs_del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, 4459 struct btrfs_path *path, int level, int slot) 4460 { 4461 struct extent_buffer *parent = path->nodes[level]; 4462 u32 nritems; 4463 int ret; 4464 4465 nritems = btrfs_header_nritems(parent); 4466 if (slot != nritems - 1) { 4467 if (level) { 4468 ret = btrfs_tree_mod_log_insert_move(parent, slot, 4469 slot + 1, nritems - slot - 1); 4470 if (ret < 0) { 4471 btrfs_abort_transaction(trans, ret); 4472 return ret; 4473 } 4474 } 4475 memmove_extent_buffer(parent, 4476 btrfs_node_key_ptr_offset(parent, slot), 4477 btrfs_node_key_ptr_offset(parent, slot + 1), 4478 sizeof(struct btrfs_key_ptr) * 4479 (nritems - slot - 1)); 4480 } else if (level) { 4481 ret = btrfs_tree_mod_log_insert_key(parent, slot, 4482 BTRFS_MOD_LOG_KEY_REMOVE); 4483 if (ret < 0) { 4484 btrfs_abort_transaction(trans, ret); 4485 return ret; 4486 } 4487 } 4488 4489 nritems--; 4490 btrfs_set_header_nritems(parent, nritems); 4491 if (nritems == 0 && parent == root->node) { 4492 BUG_ON(btrfs_header_level(root->node) != 1); 4493 /* just turn the root into a leaf and break */ 4494 btrfs_set_header_level(root->node, 0); 4495 } else if (slot == 0) { 4496 struct btrfs_disk_key disk_key; 4497 4498 btrfs_node_key(parent, &disk_key, 0); 4499 fixup_low_keys(path, &disk_key, level + 1); 4500 } 4501 btrfs_mark_buffer_dirty(parent); 4502 return 0; 4503 } 4504 4505 /* 4506 * a helper function to delete the leaf pointed to by path->slots[1] and 4507 * path->nodes[1]. 4508 * 4509 * This deletes the pointer in path->nodes[1] and frees the leaf 4510 * block extent. zero is returned if it all worked out, < 0 otherwise. 4511 * 4512 * The path must have already been setup for deleting the leaf, including 4513 * all the proper balancing. path->nodes[1] must be locked. 
4514 */ 4515 static noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans, 4516 struct btrfs_root *root, 4517 struct btrfs_path *path, 4518 struct extent_buffer *leaf) 4519 { 4520 int ret; 4521 4522 WARN_ON(btrfs_header_generation(leaf) != trans->transid); 4523 ret = btrfs_del_ptr(trans, root, path, 1, path->slots[1]); 4524 if (ret < 0) 4525 return ret; 4526 4527 /* 4528 * btrfs_free_extent is expensive, we want to make sure we 4529 * aren't holding any locks when we call it 4530 */ 4531 btrfs_unlock_up_safe(path, 0); 4532 4533 root_sub_used(root, leaf->len); 4534 4535 atomic_inc(&leaf->refs); 4536 btrfs_free_tree_block(trans, btrfs_root_id(root), leaf, 0, 1); 4537 free_extent_buffer_stale(leaf); 4538 return 0; 4539 } 4540 /* 4541 * delete the item at the leaf level in path. If that empties 4542 * the leaf, remove it from the tree 4543 */ 4544 int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, 4545 struct btrfs_path *path, int slot, int nr) 4546 { 4547 struct btrfs_fs_info *fs_info = root->fs_info; 4548 struct extent_buffer *leaf; 4549 int ret = 0; 4550 int wret; 4551 u32 nritems; 4552 4553 leaf = path->nodes[0]; 4554 nritems = btrfs_header_nritems(leaf); 4555 4556 if (slot + nr != nritems) { 4557 const u32 last_off = btrfs_item_offset(leaf, slot + nr - 1); 4558 const int data_end = leaf_data_end(leaf); 4559 struct btrfs_map_token token; 4560 u32 dsize = 0; 4561 int i; 4562 4563 for (i = 0; i < nr; i++) 4564 dsize += btrfs_item_size(leaf, slot + i); 4565 4566 memmove_leaf_data(leaf, data_end + dsize, data_end, 4567 last_off - data_end); 4568 4569 btrfs_init_map_token(&token, leaf); 4570 for (i = slot + nr; i < nritems; i++) { 4571 u32 ioff; 4572 4573 ioff = btrfs_token_item_offset(&token, i); 4574 btrfs_set_token_item_offset(&token, i, ioff + dsize); 4575 } 4576 4577 memmove_leaf_items(leaf, slot, slot + nr, nritems - slot - nr); 4578 } 4579 btrfs_set_header_nritems(leaf, nritems - nr); 4580 nritems -= nr; 4581 4582 /* delete the leaf if we've emptied it */ 4583 if (nritems == 0) { 4584 if (leaf == root->node) { 4585 btrfs_set_header_level(leaf, 0); 4586 } else { 4587 btrfs_clear_buffer_dirty(trans, leaf); 4588 ret = btrfs_del_leaf(trans, root, path, leaf); 4589 if (ret < 0) 4590 return ret; 4591 } 4592 } else { 4593 int used = leaf_space_used(leaf, 0, nritems); 4594 if (slot == 0) { 4595 struct btrfs_disk_key disk_key; 4596 4597 btrfs_item_key(leaf, &disk_key, 0); 4598 fixup_low_keys(path, &disk_key, 1); 4599 } 4600 4601 /* 4602 * Try to delete the leaf if it is mostly empty. We do this by 4603 * trying to move all its items into its left and right neighbours. 4604 * If we can't move all the items, then we don't delete it - it's 4605 * not ideal, but future insertions might fill the leaf with more 4606 * items, or items from other leaves might be moved later into our 4607 * leaf due to deletions on those leaves. 4608 */ 4609 if (used < BTRFS_LEAF_DATA_SIZE(fs_info) / 3) { 4610 u32 min_push_space; 4611 4612 /* push_leaf_left fixes the path. 4613 * make sure the path still points to our leaf 4614 * for possible call to btrfs_del_ptr below 4615 */ 4616 slot = path->slots[1]; 4617 atomic_inc(&leaf->refs); 4618 /* 4619 * We want to be able to at least push one item to the 4620 * left neighbour leaf, and that's the first item. 
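			 *
			 * Worked example (illustrative numbers): if item 0
			 * carries 100 bytes of data, min_push_space below is
			 * sizeof(struct btrfs_item) + 100, i.e. the item
			 * header plus its payload must both fit in the left
			 * neighbour.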
4621 			 */
4622 			min_push_space = sizeof(struct btrfs_item) +
4623 				btrfs_item_size(leaf, 0);
4624 			wret = push_leaf_left(trans, root, path, 0,
4625 					      min_push_space, 1, (u32)-1);
4626 			if (wret < 0 && wret != -ENOSPC)
4627 				ret = wret;
4628
4629 			if (path->nodes[0] == leaf &&
4630 			    btrfs_header_nritems(leaf)) {
4631 				/*
4632 				 * If we were not able to push all items from our
4633 				 * leaf to its left neighbour, then attempt to
4634 				 * either push all the remaining items to the
4635 				 * right neighbour or none. There's no advantage
4636 				 * in pushing only some items, instead of all, as
4637 				 * it's pointless to end up with a leaf having
4638 				 * too few items while the neighbours can be full
4639 				 * or nearly full.
4640 				 */
4641 				nritems = btrfs_header_nritems(leaf);
4642 				min_push_space = leaf_space_used(leaf, 0, nritems);
4643 				wret = push_leaf_right(trans, root, path, 0,
4644 						       min_push_space, 1, 0);
4645 				if (wret < 0 && wret != -ENOSPC)
4646 					ret = wret;
4647 			}
4648
4649 			if (btrfs_header_nritems(leaf) == 0) {
4650 				path->slots[1] = slot;
4651 				ret = btrfs_del_leaf(trans, root, path, leaf);
4652 				if (ret < 0)
4653 					return ret;
4654 				free_extent_buffer(leaf);
4655 				ret = 0;
4656 			} else {
4657 				/* If we're still in the path, make sure
4658 				 * the buffer is dirty. Otherwise, one of the
4659 				 * push_leaf functions must have already
4660 				 * dirtied this buffer.
4661 				 */
4662 				if (path->nodes[0] == leaf)
4663 					btrfs_mark_buffer_dirty(leaf);
4664 				free_extent_buffer(leaf);
4665 			}
4666 		} else {
4667 			btrfs_mark_buffer_dirty(leaf);
4668 		}
4669 	}
4670 	return ret;
4671 }
4672
4673 /*
4674  * A helper function to walk down the tree starting at min_key, looking
4675  * for nodes or leaves that have a minimum transaction id.
4676  * This is used by the btree defrag code and by tree logging.
4677  *
4678  * This does not cow, but it does stuff the starting key it finds back
4679  * into min_key, so you can call btrfs_search_slot with cow=1 on the
4680  * key and get a writable path.
4681  *
4682  * This honors path->lowest_level to prevent descent past a given level
4683  * of the tree.
4684  *
4685  * min_trans indicates the oldest transaction that you are interested
4686  * in walking through. Any nodes or leaves older than min_trans are
4687  * skipped over (without reading them).
4688  *
4689  * returns zero if something useful was found, < 0 on error and 1 if there
4690  * was nothing in the tree that matched the search criteria.
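 *
 * A minimal iteration sketch (assumed caller context; real callers are
 * more careful about u64 overflow when advancing the key):
 *
 *	min_key.objectid = 0;
 *	min_key.type = 0;
 *	min_key.offset = 0;
 *	while (btrfs_search_forward(root, &min_key, path, min_trans) == 0) {
 *		... process path->nodes[0] / path->slots[0] ...
 *		btrfs_release_path(path);
 *		min_key.offset++;
 *	}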
4691 */ 4692 int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key, 4693 struct btrfs_path *path, 4694 u64 min_trans) 4695 { 4696 struct extent_buffer *cur; 4697 struct btrfs_key found_key; 4698 int slot; 4699 int sret; 4700 u32 nritems; 4701 int level; 4702 int ret = 1; 4703 int keep_locks = path->keep_locks; 4704 4705 ASSERT(!path->nowait); 4706 path->keep_locks = 1; 4707 again: 4708 cur = btrfs_read_lock_root_node(root); 4709 level = btrfs_header_level(cur); 4710 WARN_ON(path->nodes[level]); 4711 path->nodes[level] = cur; 4712 path->locks[level] = BTRFS_READ_LOCK; 4713 4714 if (btrfs_header_generation(cur) < min_trans) { 4715 ret = 1; 4716 goto out; 4717 } 4718 while (1) { 4719 nritems = btrfs_header_nritems(cur); 4720 level = btrfs_header_level(cur); 4721 sret = btrfs_bin_search(cur, 0, min_key, &slot); 4722 if (sret < 0) { 4723 ret = sret; 4724 goto out; 4725 } 4726 4727 /* at the lowest level, we're done, setup the path and exit */ 4728 if (level == path->lowest_level) { 4729 if (slot >= nritems) 4730 goto find_next_key; 4731 ret = 0; 4732 path->slots[level] = slot; 4733 btrfs_item_key_to_cpu(cur, &found_key, slot); 4734 goto out; 4735 } 4736 if (sret && slot > 0) 4737 slot--; 4738 /* 4739 * check this node pointer against the min_trans parameters. 4740 * If it is too old, skip to the next one. 4741 */ 4742 while (slot < nritems) { 4743 u64 gen; 4744 4745 gen = btrfs_node_ptr_generation(cur, slot); 4746 if (gen < min_trans) { 4747 slot++; 4748 continue; 4749 } 4750 break; 4751 } 4752 find_next_key: 4753 /* 4754 * we didn't find a candidate key in this node, walk forward 4755 * and find another one 4756 */ 4757 if (slot >= nritems) { 4758 path->slots[level] = slot; 4759 sret = btrfs_find_next_key(root, path, min_key, level, 4760 min_trans); 4761 if (sret == 0) { 4762 btrfs_release_path(path); 4763 goto again; 4764 } else { 4765 goto out; 4766 } 4767 } 4768 /* save our key for returning back */ 4769 btrfs_node_key_to_cpu(cur, &found_key, slot); 4770 path->slots[level] = slot; 4771 if (level == path->lowest_level) { 4772 ret = 0; 4773 goto out; 4774 } 4775 cur = btrfs_read_node_slot(cur, slot); 4776 if (IS_ERR(cur)) { 4777 ret = PTR_ERR(cur); 4778 goto out; 4779 } 4780 4781 btrfs_tree_read_lock(cur); 4782 4783 path->locks[level - 1] = BTRFS_READ_LOCK; 4784 path->nodes[level - 1] = cur; 4785 unlock_up(path, level, 1, 0, NULL); 4786 } 4787 out: 4788 path->keep_locks = keep_locks; 4789 if (ret == 0) { 4790 btrfs_unlock_up_safe(path, path->lowest_level + 1); 4791 memcpy(min_key, &found_key, sizeof(found_key)); 4792 } 4793 return ret; 4794 } 4795 4796 /* 4797 * this is similar to btrfs_next_leaf, but does not try to preserve 4798 * and fixup the path. It looks for and returns the next key in the 4799 * tree based on the current path and the min_trans parameters. 4800 * 4801 * 0 is returned if another key is found, < 0 if there are any errors 4802 * and 1 is returned if there are no higher keys in the tree 4803 * 4804 * path->keep_locks should be set to 1 on the search made before 4805 * calling this function. 
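 *
 * Call shape (assumed context), peeking at the key after the current
 * position without moving the path itself:
 *
 *	path->keep_locks = 1;
 *	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 *	...
 *	ret = btrfs_find_next_key(root, path, &next_key, level, min_trans);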
4806 */ 4807 int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path, 4808 struct btrfs_key *key, int level, u64 min_trans) 4809 { 4810 int slot; 4811 struct extent_buffer *c; 4812 4813 WARN_ON(!path->keep_locks && !path->skip_locking); 4814 while (level < BTRFS_MAX_LEVEL) { 4815 if (!path->nodes[level]) 4816 return 1; 4817 4818 slot = path->slots[level] + 1; 4819 c = path->nodes[level]; 4820 next: 4821 if (slot >= btrfs_header_nritems(c)) { 4822 int ret; 4823 int orig_lowest; 4824 struct btrfs_key cur_key; 4825 if (level + 1 >= BTRFS_MAX_LEVEL || 4826 !path->nodes[level + 1]) 4827 return 1; 4828 4829 if (path->locks[level + 1] || path->skip_locking) { 4830 level++; 4831 continue; 4832 } 4833 4834 slot = btrfs_header_nritems(c) - 1; 4835 if (level == 0) 4836 btrfs_item_key_to_cpu(c, &cur_key, slot); 4837 else 4838 btrfs_node_key_to_cpu(c, &cur_key, slot); 4839 4840 orig_lowest = path->lowest_level; 4841 btrfs_release_path(path); 4842 path->lowest_level = level; 4843 ret = btrfs_search_slot(NULL, root, &cur_key, path, 4844 0, 0); 4845 path->lowest_level = orig_lowest; 4846 if (ret < 0) 4847 return ret; 4848 4849 c = path->nodes[level]; 4850 slot = path->slots[level]; 4851 if (ret == 0) 4852 slot++; 4853 goto next; 4854 } 4855 4856 if (level == 0) 4857 btrfs_item_key_to_cpu(c, key, slot); 4858 else { 4859 u64 gen = btrfs_node_ptr_generation(c, slot); 4860 4861 if (gen < min_trans) { 4862 slot++; 4863 goto next; 4864 } 4865 btrfs_node_key_to_cpu(c, key, slot); 4866 } 4867 return 0; 4868 } 4869 return 1; 4870 } 4871 4872 int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path, 4873 u64 time_seq) 4874 { 4875 int slot; 4876 int level; 4877 struct extent_buffer *c; 4878 struct extent_buffer *next; 4879 struct btrfs_fs_info *fs_info = root->fs_info; 4880 struct btrfs_key key; 4881 bool need_commit_sem = false; 4882 u32 nritems; 4883 int ret; 4884 int i; 4885 4886 /* 4887 * The nowait semantics are used only for write paths, where we don't 4888 * use the tree mod log and sequence numbers. 4889 */ 4890 if (time_seq) 4891 ASSERT(!path->nowait); 4892 4893 nritems = btrfs_header_nritems(path->nodes[0]); 4894 if (nritems == 0) 4895 return 1; 4896 4897 btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1); 4898 again: 4899 level = 1; 4900 next = NULL; 4901 btrfs_release_path(path); 4902 4903 path->keep_locks = 1; 4904 4905 if (time_seq) { 4906 ret = btrfs_search_old_slot(root, &key, path, time_seq); 4907 } else { 4908 if (path->need_commit_sem) { 4909 path->need_commit_sem = 0; 4910 need_commit_sem = true; 4911 if (path->nowait) { 4912 if (!down_read_trylock(&fs_info->commit_root_sem)) { 4913 ret = -EAGAIN; 4914 goto done; 4915 } 4916 } else { 4917 down_read(&fs_info->commit_root_sem); 4918 } 4919 } 4920 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 4921 } 4922 path->keep_locks = 0; 4923 4924 if (ret < 0) 4925 goto done; 4926 4927 nritems = btrfs_header_nritems(path->nodes[0]); 4928 /* 4929 * by releasing the path above we dropped all our locks. A balance 4930 * could have added more items next to the key that used to be 4931 * at the very end of the block. So, check again here and 4932 * advance the path if there are now more items available. 
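	 *
	 * Example (illustrative): we saved the key of the last slot of a
	 * 5 item leaf; after re-searching, a balance may have pulled more
	 * items in, so the same key now sits at slot 4 of an 8 item leaf
	 * and slot 5 is simply the next item, with no need to walk up the
	 * tree.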
4933 	 */
4934 	if (nritems > 0 && path->slots[0] < nritems - 1) {
4935 		if (ret == 0)
4936 			path->slots[0]++;
4937 		ret = 0;
4938 		goto done;
4939 	}
4940 	/*
4941 	 * So the above check misses one case:
4942 	 * - after releasing the path above, someone has removed the item that
4943 	 *   used to be at the very end of the block, and balance between leaves
4944 	 *   gets another one with a bigger key.offset to replace it.
4945 	 *
4946 	 * This one should be returned as well, or we can get leaf corruption
4947 	 * later (especially in __btrfs_drop_extents()).
4948 	 *
4949 	 * A bit more explanation about this check:
4950 	 * with ret > 0, the key wasn't found, so the path points to the slot
4951 	 * where it would be inserted, which means the path->slots[0] item must
4952 	 * be the bigger one.
4953 	 */
4954 	if (nritems > 0 && ret > 0 && path->slots[0] == nritems - 1) {
4955 		ret = 0;
4956 		goto done;
4957 	}
4958
4959 	while (level < BTRFS_MAX_LEVEL) {
4960 		if (!path->nodes[level]) {
4961 			ret = 1;
4962 			goto done;
4963 		}
4964
4965 		slot = path->slots[level] + 1;
4966 		c = path->nodes[level];
4967 		if (slot >= btrfs_header_nritems(c)) {
4968 			level++;
4969 			if (level == BTRFS_MAX_LEVEL) {
4970 				ret = 1;
4971 				goto done;
4972 			}
4973 			continue;
4974 		}
4975
4976
4977 		/*
4978 		 * Our current level is where we're going to start from, and to
4979 		 * make sure lockdep doesn't complain we need to drop our locks
4980 		 * and nodes from 0 to our current level.
4981 		 */
4982 		for (i = 0; i < level; i++) {
4983 			if (path->locks[i]) {
4984 				btrfs_tree_read_unlock(path->nodes[i]);
4985 				path->locks[i] = 0;
4986 			}
4987 			free_extent_buffer(path->nodes[i]);
4988 			path->nodes[i] = NULL;
4989 		}
4990
4991 		next = c;
4992 		ret = read_block_for_search(root, path, &next, level,
4993 					    slot, &key);
4994 		if (ret == -EAGAIN && !path->nowait)
4995 			goto again;
4996
4997 		if (ret < 0) {
4998 			btrfs_release_path(path);
4999 			goto done;
5000 		}
5001
5002 		if (!path->skip_locking) {
5003 			ret = btrfs_try_tree_read_lock(next);
5004 			if (!ret && path->nowait) {
5005 				ret = -EAGAIN;
5006 				goto done;
5007 			}
5008 			if (!ret && time_seq) {
5009 				/*
5010 				 * If we don't get the lock, we may be racing
5011 				 * with push_leaf_left, which holds that lock
5012 				 * while waiting for the leaf we have currently
5013 				 * locked. To break this deadlock, we give up
5014 				 * on our lock and cycle.
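				 *
				 * Concretely (illustrative): the other task
				 * holds 'next' and blocks on the leaf we have
				 * locked, while we hold our leaf and try-lock
				 * 'next'; backing off and retrying breaks the
				 * AB-BA cycle.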
5015 				 */
5016 				free_extent_buffer(next);
5017 				btrfs_release_path(path);
5018 				cond_resched();
5019 				goto again;
5020 			}
5021 			if (!ret)
5022 				btrfs_tree_read_lock(next);
5023 		}
5024 		break;
5025 	}
5026 	path->slots[level] = slot;
5027 	while (1) {
5028 		level--;
5029 		path->nodes[level] = next;
5030 		path->slots[level] = 0;
5031 		if (!path->skip_locking)
5032 			path->locks[level] = BTRFS_READ_LOCK;
5033 		if (!level)
5034 			break;
5035
5036 		ret = read_block_for_search(root, path, &next, level,
5037 					    0, &key);
5038 		if (ret == -EAGAIN && !path->nowait)
5039 			goto again;
5040
5041 		if (ret < 0) {
5042 			btrfs_release_path(path);
5043 			goto done;
5044 		}
5045
5046 		if (!path->skip_locking) {
5047 			if (path->nowait) {
5048 				if (!btrfs_try_tree_read_lock(next)) {
5049 					ret = -EAGAIN;
5050 					goto done;
5051 				}
5052 			} else {
5053 				btrfs_tree_read_lock(next);
5054 			}
5055 		}
5056 	}
5057 	ret = 0;
5058 done:
5059 	unlock_up(path, 0, 1, 0, NULL);
5060 	if (need_commit_sem) {
5061 		int ret2;
5062
5063 		path->need_commit_sem = 1;
5064 		ret2 = finish_need_commit_sem_search(path);
5065 		up_read(&fs_info->commit_root_sem);
5066 		if (ret2)
5067 			ret = ret2;
5068 	}
5069
5070 	return ret;
5071 }
5072
5073 int btrfs_next_old_item(struct btrfs_root *root, struct btrfs_path *path, u64 time_seq)
5074 {
5075 	path->slots[0]++;
5076 	if (path->slots[0] >= btrfs_header_nritems(path->nodes[0]))
5077 		return btrfs_next_old_leaf(root, path, time_seq);
5078 	return 0;
5079 }
5080
5081 /*
5082  * this uses btrfs_prev_leaf to walk backwards in the tree, and keeps
5083  * searching until it gets past min_objectid or finds an item of 'type'
5084  *
5085  * returns 0 if something is found, 1 if nothing was found and < 0 on error
5086  */
5087 int btrfs_previous_item(struct btrfs_root *root,
5088 			struct btrfs_path *path, u64 min_objectid,
5089 			int type)
5090 {
5091 	struct btrfs_key found_key;
5092 	struct extent_buffer *leaf;
5093 	u32 nritems;
5094 	int ret;
5095
5096 	while (1) {
5097 		if (path->slots[0] == 0) {
5098 			ret = btrfs_prev_leaf(root, path);
5099 			if (ret != 0)
5100 				return ret;
5101 		} else {
5102 			path->slots[0]--;
5103 		}
5104 		leaf = path->nodes[0];
5105 		nritems = btrfs_header_nritems(leaf);
5106 		if (nritems == 0)
5107 			return 1;
5108 		if (path->slots[0] == nritems)
5109 			path->slots[0]--;
5110
5111 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
5112 		if (found_key.objectid < min_objectid)
5113 			break;
5114 		if (found_key.type == type)
5115 			return 0;
5116 		if (found_key.objectid == min_objectid &&
5117 		    found_key.type < type)
5118 			break;
5119 	}
5120 	return 1;
5121 }
5122
5123 /*
5124  * search in the extent tree to find a previous Metadata/Data extent item with
5125  * min objectid.
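 *
 * Usage sketch (assumed caller context): position the path just past
 * 'bytenr' in the extent tree, then walk back to the covering item,
 *
 *	key.objectid = bytenr;
 *	key.type = (u8)-1;
 *	key.offset = (u64)-1;
 *	ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
 *	if (ret > 0)
 *		ret = btrfs_previous_extent_item(extent_root, path, bytenr);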
5126 * 5127 * returns 0 if something is found, 1 if nothing was found and < 0 on error 5128 */ 5129 int btrfs_previous_extent_item(struct btrfs_root *root, 5130 struct btrfs_path *path, u64 min_objectid) 5131 { 5132 struct btrfs_key found_key; 5133 struct extent_buffer *leaf; 5134 u32 nritems; 5135 int ret; 5136 5137 while (1) { 5138 if (path->slots[0] == 0) { 5139 ret = btrfs_prev_leaf(root, path); 5140 if (ret != 0) 5141 return ret; 5142 } else { 5143 path->slots[0]--; 5144 } 5145 leaf = path->nodes[0]; 5146 nritems = btrfs_header_nritems(leaf); 5147 if (nritems == 0) 5148 return 1; 5149 if (path->slots[0] == nritems) 5150 path->slots[0]--; 5151 5152 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 5153 if (found_key.objectid < min_objectid) 5154 break; 5155 if (found_key.type == BTRFS_EXTENT_ITEM_KEY || 5156 found_key.type == BTRFS_METADATA_ITEM_KEY) 5157 return 0; 5158 if (found_key.objectid == min_objectid && 5159 found_key.type < BTRFS_EXTENT_ITEM_KEY) 5160 break; 5161 } 5162 return 1; 5163 } 5164 5165 int __init btrfs_ctree_init(void) 5166 { 5167 btrfs_path_cachep = kmem_cache_create("btrfs_path", 5168 sizeof(struct btrfs_path), 0, 5169 SLAB_MEM_SPREAD, NULL); 5170 if (!btrfs_path_cachep) 5171 return -ENOMEM; 5172 return 0; 5173 } 5174 5175 void __cold btrfs_ctree_exit(void) 5176 { 5177 kmem_cache_destroy(btrfs_path_cachep); 5178 } 5179