// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007,2008 Oracle.  All rights reserved.
 */

#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/rbtree.h>
#include <linux/mm.h>
#include <linux/error-injection.h>
#include "messages.h"
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "print-tree.h"
#include "locking.h"
#include "volumes.h"
#include "qgroup.h"
#include "tree-mod-log.h"
#include "tree-checker.h"
#include "fs.h"
#include "accessors.h"
#include "extent-tree.h"
#include "relocation.h"
#include "file-item.h"

static struct kmem_cache *btrfs_path_cachep;

static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
		      *root, struct btrfs_path *path, int level);
static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		      const struct btrfs_key *ins_key, struct btrfs_path *path,
		      int data_size, int extend);
static int push_node_left(struct btrfs_trans_handle *trans,
			  struct extent_buffer *dst,
			  struct extent_buffer *src, int empty);
static int balance_node_right(struct btrfs_trans_handle *trans,
			      struct extent_buffer *dst_buf,
			      struct extent_buffer *src_buf);

static const struct btrfs_csums {
	u16		size;
	const char	name[10];
	const char	driver[12];
} btrfs_csums[] = {
	[BTRFS_CSUM_TYPE_CRC32] = { .size = 4, .name = "crc32c" },
	[BTRFS_CSUM_TYPE_XXHASH] = { .size = 8, .name = "xxhash64" },
	[BTRFS_CSUM_TYPE_SHA256] = { .size = 32, .name = "sha256" },
	[BTRFS_CSUM_TYPE_BLAKE2] = { .size = 32, .name = "blake2b",
				     .driver = "blake2b-256" },
};

/*
 * The leaf data grows from end-to-front in the node. This returns the address
 * of the start of the last item, which is the stop of the leaf data stack.
 */
static unsigned int leaf_data_end(const struct extent_buffer *leaf)
{
	u32 nr = btrfs_header_nritems(leaf);

	if (nr == 0)
		return BTRFS_LEAF_DATA_SIZE(leaf->fs_info);
	return btrfs_item_offset(leaf, nr - 1);
}

/*
 * Move data in a @leaf (using memmove, safe for overlapping ranges).
 *
 * @leaf:	leaf that we're doing a memmove on
 * @dst_offset:	item data offset we're moving to
 * @src_offset:	item data offset we're moving from
 * @len:	length of the data we're moving
 *
 * Wrapper around memmove_extent_buffer() that takes into account the header on
 * the leaf. The btrfs_item offsets start directly after the header, so we
 * have to adjust any offsets to account for the header in the leaf. This
 * handles that math to simplify the callers.
 */
static inline void memmove_leaf_data(const struct extent_buffer *leaf,
				     unsigned long dst_offset,
				     unsigned long src_offset,
				     unsigned long len)
{
	memmove_extent_buffer(leaf, btrfs_item_nr_offset(leaf, 0) + dst_offset,
			      btrfs_item_nr_offset(leaf, 0) + src_offset, len);
}

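/*
 * Note on the math done by the wrapper above and the ones below: item data
 * offsets stored in a btrfs_item are relative to the end of the leaf header,
 * so byte 0 of item data corresponds to byte offset
 * btrfs_item_nr_offset(leaf, 0) within the extent buffer.
 */
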
/*
 * Copy item data from @src into @dst at the given @offset.
 *
 * @dst:	destination leaf that we're copying into
 * @src:	source leaf that we're copying from
 * @dst_offset:	item data offset we're copying to
 * @src_offset:	item data offset we're copying from
 * @len:	length of the data we're copying
 *
 * Wrapper around copy_extent_buffer() that takes into account the header on
 * the leaf. The btrfs_item offsets start directly after the header, so we
 * have to adjust any offsets to account for the header in the leaf. This
 * handles that math to simplify the callers.
 */
static inline void copy_leaf_data(const struct extent_buffer *dst,
				  const struct extent_buffer *src,
				  unsigned long dst_offset,
				  unsigned long src_offset, unsigned long len)
{
	copy_extent_buffer(dst, src, btrfs_item_nr_offset(dst, 0) + dst_offset,
			   btrfs_item_nr_offset(src, 0) + src_offset, len);
}

/*
 * Move items in a @leaf (using memmove).
 *
 * @dst:	destination leaf for the items
 * @dst_item:	the item nr we're copying into
 * @src_item:	the item nr we're copying from
 * @nr_items:	the number of items to copy
 *
 * Wrapper around memmove_extent_buffer() that does the math to get the
 * appropriate offsets into the leaf from the item numbers.
 */
static inline void memmove_leaf_items(const struct extent_buffer *leaf,
				      int dst_item, int src_item, int nr_items)
{
	memmove_extent_buffer(leaf, btrfs_item_nr_offset(leaf, dst_item),
			      btrfs_item_nr_offset(leaf, src_item),
			      nr_items * sizeof(struct btrfs_item));
}

/*
 * Copy items from @src into @dst at the given @offset.
 *
 * @dst:	destination leaf for the items
 * @src:	source leaf for the items
 * @dst_item:	the item nr we're copying into
 * @src_item:	the item nr we're copying from
 * @nr_items:	the number of items to copy
 *
 * Wrapper around copy_extent_buffer() that does the math to get the
 * appropriate offsets into the leaf from the item numbers.
 */
static inline void copy_leaf_items(const struct extent_buffer *dst,
				   const struct extent_buffer *src,
				   int dst_item, int src_item, int nr_items)
{
	copy_extent_buffer(dst, src, btrfs_item_nr_offset(dst, dst_item),
			   btrfs_item_nr_offset(src, src_item),
			   nr_items * sizeof(struct btrfs_item));
}

/* This exists for btrfs-progs usages. */
u16 btrfs_csum_type_size(u16 type)
{
	return btrfs_csums[type].size;
}

int btrfs_super_csum_size(const struct btrfs_super_block *s)
{
	u16 t = btrfs_super_csum_type(s);
	/*
	 * csum type is validated at mount time
	 */
	return btrfs_csum_type_size(t);
}

const char *btrfs_super_csum_name(u16 csum_type)
{
	/* csum type is validated at mount time */
	return btrfs_csums[csum_type].name;
}

/*
 * Return driver name if defined, otherwise the name that's also a valid driver
 * name
 */
const char *btrfs_super_csum_driver(u16 csum_type)
{
	/* csum type is validated at mount time */
	return btrfs_csums[csum_type].driver[0] ?
		btrfs_csums[csum_type].driver :
		btrfs_csums[csum_type].name;
}

size_t __attribute_const__ btrfs_get_num_csums(void)
{
	return ARRAY_SIZE(btrfs_csums);
}

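/* Allocate a zeroed path from the slab cache. May sleep (GFP_NOFS). */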
struct btrfs_path *btrfs_alloc_path(void)
{
	might_sleep();

	return kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS);
}

/* this also releases the path */
void btrfs_free_path(struct btrfs_path *p)
{
	if (!p)
		return;
	btrfs_release_path(p);
	kmem_cache_free(btrfs_path_cachep, p);
}

/*
 * path release drops references on the extent buffers in the path
 * and it drops any locks held by this path
 *
 * It is safe to call this on paths that have no locks or extent buffers held.
 */
noinline void btrfs_release_path(struct btrfs_path *p)
{
	int i;

	for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
		p->slots[i] = 0;
		if (!p->nodes[i])
			continue;
		if (p->locks[i]) {
			btrfs_tree_unlock_rw(p->nodes[i], p->locks[i]);
			p->locks[i] = 0;
		}
		free_extent_buffer(p->nodes[i]);
		p->nodes[i] = NULL;
	}
}

/*
 * We want the transaction abort to print stack trace only for errors where the
 * cause could be a bug, eg. due to ENOSPC, and not for common errors that are
 * caused by external factors.
 */
bool __cold abort_should_print_stack(int error)
{
	switch (error) {
	case -EIO:
	case -EROFS:
	case -ENOMEM:
		return false;
	}
	return true;
}

/*
 * Safely gets a reference on the root node of a tree. A lock
 * is not taken, so a concurrent writer may put a different node
 * at the root of the tree. See btrfs_lock_root_node for the
 * looping required.
 *
 * The extent buffer returned by this has a reference taken, so
 * it won't disappear. It may stop being the root of the tree
 * at any time because there are no locks held.
 */
struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
{
	struct extent_buffer *eb;

	while (1) {
		rcu_read_lock();
		eb = rcu_dereference(root->node);

		/*
		 * RCU really hurts here, we could free up the root node because
		 * it was COWed but we may not get the new root node yet so do
		 * the inc_not_zero dance and if it doesn't work then
		 * synchronize_rcu and try again.
		 */
		if (atomic_inc_not_zero(&eb->refs)) {
			rcu_read_unlock();
			break;
		}
		rcu_read_unlock();
		synchronize_rcu();
	}
	return eb;
}

/*
 * Cowonly roots (not-shareable trees, everything not subvolume or reloc
 * roots) just get put onto a simple dirty list. The transaction walks this
 * list to make sure they get properly updated on disk.
 */
static void add_root_to_dirty_list(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;

	if (test_bit(BTRFS_ROOT_DIRTY, &root->state) ||
	    !test_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state))
		return;

	spin_lock(&fs_info->trans_lock);
	if (!test_and_set_bit(BTRFS_ROOT_DIRTY, &root->state)) {
		/* Want the extent tree to be the last on the list */
		if (root->root_key.objectid == BTRFS_EXTENT_TREE_OBJECTID)
			list_move_tail(&root->dirty_list,
				       &fs_info->dirty_cowonly_roots);
		else
			list_move(&root->dirty_list,
				  &fs_info->dirty_cowonly_roots);
	}
	spin_unlock(&fs_info->trans_lock);
}

/*
 * Used by snapshot creation to make a copy of a root for a tree with
 * a given objectid. The buffer with the new root node is returned in
 * cow_ret, and this function returns zero on success or a negative error code.
 */
int btrfs_copy_root(struct btrfs_trans_handle *trans,
		    struct btrfs_root *root,
		    struct extent_buffer *buf,
		    struct extent_buffer **cow_ret, u64 new_root_objectid)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *cow;
	int ret = 0;
	int level;
	struct btrfs_disk_key disk_key;
	u64 reloc_src_root = 0;

	WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
		trans->transid != fs_info->running_transaction->transid);
	WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
		trans->transid != root->last_trans);

	level = btrfs_header_level(buf);
	if (level == 0)
		btrfs_item_key(buf, &disk_key, 0);
	else
		btrfs_node_key(buf, &disk_key, 0);

	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
		reloc_src_root = btrfs_header_owner(buf);
	cow = btrfs_alloc_tree_block(trans, root, 0, new_root_objectid,
				     &disk_key, level, buf->start, 0,
				     reloc_src_root, BTRFS_NESTING_NEW_ROOT);
	if (IS_ERR(cow))
		return PTR_ERR(cow);

	copy_extent_buffer_full(cow, buf);
	btrfs_set_header_bytenr(cow, cow->start);
	btrfs_set_header_generation(cow, trans->transid);
	btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
				     BTRFS_HEADER_FLAG_RELOC);
	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
		btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
	else
		btrfs_set_header_owner(cow, new_root_objectid);

	write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid);

	WARN_ON(btrfs_header_generation(buf) > trans->transid);
	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
		ret = btrfs_inc_ref(trans, root, cow, 1);
	else
		ret = btrfs_inc_ref(trans, root, cow, 0);
	if (ret) {
		btrfs_tree_unlock(cow);
		free_extent_buffer(cow);
		btrfs_abort_transaction(trans, ret);
		return ret;
	}

	btrfs_mark_buffer_dirty(trans, cow);
	*cow_ret = cow;
	return 0;
}

/*
 * check if the tree block can be shared by multiple trees
 */
int btrfs_block_can_be_shared(struct btrfs_trans_handle *trans,
			      struct btrfs_root *root,
			      struct extent_buffer *buf)
{
	/*
	 * Tree blocks not in shareable trees and tree roots are never shared.
	 * If a block was allocated after the last snapshot and the block was
	 * not allocated by tree relocation, we know the block is not shared.
	 */
	if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
	    buf != root->node &&
	    (btrfs_header_generation(buf) <=
	     btrfs_root_last_snapshot(&root->root_item) ||
	     btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC))) {
		if (buf != root->commit_root)
			return 1;
		/*
		 * An extent buffer that used to be the commit root may still be
		 * shared because the tree height may have increased and it
		 * became a child of a higher level root. This can happen when
		 * snapshotting a subvolume created in the current transaction.
		 */
		if (btrfs_header_generation(buf) == trans->transid)
			return 1;
	}

	return 0;
}

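/*
 * Update the reference counts and backref flags needed when COWing @buf into
 * @cow, following the backref update rules described in the comment in the
 * function body. Sets *last_ref to 1 if this COW drops the last reference to
 * @buf, so the caller knows the old block can be freed.
 */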
static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
				       struct btrfs_root *root,
				       struct extent_buffer *buf,
				       struct extent_buffer *cow,
				       int *last_ref)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 refs;
	u64 owner;
	u64 flags;
	u64 new_flags = 0;
	int ret;

	/*
	 * Backrefs update rules:
	 *
	 * Always use full backrefs for extent pointers in tree block
	 * allocated by tree relocation.
	 *
	 * If a shared tree block is no longer referenced by its owner
	 * tree (btrfs_header_owner(buf) == root->root_key.objectid),
	 * use full backrefs for extent pointers in tree block.
	 *
	 * If a tree block is being relocated
	 * (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID),
	 * use full backrefs for extent pointers in tree block.
	 * The reason for this is some operations (such as drop tree)
	 * are only allowed for blocks that use full backrefs.
	 */

	if (btrfs_block_can_be_shared(trans, root, buf)) {
		ret = btrfs_lookup_extent_info(trans, fs_info, buf->start,
					       btrfs_header_level(buf), 1,
					       &refs, &flags);
		if (ret)
			return ret;
		if (unlikely(refs == 0)) {
			btrfs_crit(fs_info,
		"found 0 references for tree block at bytenr %llu level %d root %llu",
				   buf->start, btrfs_header_level(buf),
				   btrfs_root_id(root));
			ret = -EUCLEAN;
			btrfs_abort_transaction(trans, ret);
			return ret;
		}
	} else {
		refs = 1;
		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
		    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
			flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
		else
			flags = 0;
	}

	owner = btrfs_header_owner(buf);
	BUG_ON(owner == BTRFS_TREE_RELOC_OBJECTID &&
	       !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));

	if (refs > 1) {
		if ((owner == root->root_key.objectid ||
		     root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) &&
		    !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
			ret = btrfs_inc_ref(trans, root, buf, 1);
			if (ret)
				return ret;

			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID) {
				ret = btrfs_dec_ref(trans, root, buf, 0);
				if (ret)
					return ret;
				ret = btrfs_inc_ref(trans, root, cow, 1);
				if (ret)
					return ret;
			}
			new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
		} else {

			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID)
				ret = btrfs_inc_ref(trans, root, cow, 1);
			else
				ret = btrfs_inc_ref(trans, root, cow, 0);
			if (ret)
				return ret;
		}
		if (new_flags != 0) {
			ret = btrfs_set_disk_extent_flags(trans, buf, new_flags);
			if (ret)
				return ret;
		}
	} else {
		if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID)
				ret = btrfs_inc_ref(trans, root, cow, 1);
			else
				ret = btrfs_inc_ref(trans, root, cow, 0);
			if (ret)
				return ret;
			ret = btrfs_dec_ref(trans, root, buf, 1);
			if (ret)
				return ret;
		}
		btrfs_clear_buffer_dirty(trans, buf);
		*last_ref = 1;
	}
	return 0;
}

/*
 * Does the dirty work of COWing a single block. The parent block (if
 * supplied) is updated to point to the new cow copy. The new buffer is marked
 * dirty and returned locked. If you modify the block it needs to be marked
 * dirty again.
 *
 * search_start -- an allocation hint for the new block
 *
 * empty_size -- a hint that you plan on doing more cow. This is the size in
 * bytes the allocator should try to find free next to the block it returns.
 * This is just a hint and may be ignored by the allocator.
 */
int btrfs_force_cow_block(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root,
			  struct extent_buffer *buf,
			  struct extent_buffer *parent, int parent_slot,
			  struct extent_buffer **cow_ret,
			  u64 search_start, u64 empty_size,
			  enum btrfs_lock_nesting nest)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_disk_key disk_key;
	struct extent_buffer *cow;
	int level, ret;
	int last_ref = 0;
	int unlock_orig = 0;
	u64 parent_start = 0;
	u64 reloc_src_root = 0;

	if (*cow_ret == buf)
		unlock_orig = 1;

	btrfs_assert_tree_write_locked(buf);

	WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
		trans->transid != fs_info->running_transaction->transid);
	WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
		trans->transid != root->last_trans);

	level = btrfs_header_level(buf);

	if (level == 0)
		btrfs_item_key(buf, &disk_key, 0);
	else
		btrfs_node_key(buf, &disk_key, 0);

	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
		if (parent)
			parent_start = parent->start;
		reloc_src_root = btrfs_header_owner(buf);
	}
	cow = btrfs_alloc_tree_block(trans, root, parent_start,
				     root->root_key.objectid, &disk_key, level,
				     search_start, empty_size, reloc_src_root, nest);
	if (IS_ERR(cow))
		return PTR_ERR(cow);

	/* cow is set to blocking by btrfs_init_new_buffer */

	copy_extent_buffer_full(cow, buf);
	btrfs_set_header_bytenr(cow, cow->start);
	btrfs_set_header_generation(cow, trans->transid);
	btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
				     BTRFS_HEADER_FLAG_RELOC);
	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
		btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
	else
		btrfs_set_header_owner(cow, root->root_key.objectid);

	write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid);

	ret = update_ref_for_cow(trans, root, buf, cow, &last_ref);
	if (ret) {
		btrfs_tree_unlock(cow);
		free_extent_buffer(cow);
		btrfs_abort_transaction(trans, ret);
		return ret;
	}

	if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) {
		ret = btrfs_reloc_cow_block(trans, root, buf, cow);
		if (ret) {
			btrfs_tree_unlock(cow);
			free_extent_buffer(cow);
			btrfs_abort_transaction(trans, ret);
			return ret;
		}
	}

	if (buf == root->node) {
		WARN_ON(parent && parent != buf);
		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
		    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
			parent_start = buf->start;

		ret = btrfs_tree_mod_log_insert_root(root->node, cow, true);
		if (ret < 0) {
			btrfs_tree_unlock(cow);
			free_extent_buffer(cow);
			btrfs_abort_transaction(trans, ret);
			return ret;
		}
		atomic_inc(&cow->refs);
		rcu_assign_pointer(root->node, cow);

		btrfs_free_tree_block(trans, btrfs_root_id(root), buf,
				      parent_start, last_ref);
		free_extent_buffer(buf);
		add_root_to_dirty_list(root);
	} else {
		WARN_ON(trans->transid != btrfs_header_generation(parent));
		ret = btrfs_tree_mod_log_insert_key(parent, parent_slot,
						    BTRFS_MOD_LOG_KEY_REPLACE);
		if (ret) {
			btrfs_tree_unlock(cow);
			free_extent_buffer(cow);
			btrfs_abort_transaction(trans, ret);
			return ret;
		}
		btrfs_set_node_blockptr(parent, parent_slot,
					cow->start);
		btrfs_set_node_ptr_generation(parent, parent_slot,
					      trans->transid);
		btrfs_mark_buffer_dirty(trans, parent);
		if (last_ref) {
			ret = btrfs_tree_mod_log_free_eb(buf);
			if (ret) {
				btrfs_tree_unlock(cow);
				free_extent_buffer(cow);
				btrfs_abort_transaction(trans, ret);
				return ret;
			}
		}
		btrfs_free_tree_block(trans, btrfs_root_id(root), buf,
				      parent_start, last_ref);
	}
	if (unlock_orig)
		btrfs_tree_unlock(buf);
	free_extent_buffer_stale(buf);
	btrfs_mark_buffer_dirty(trans, cow);
	*cow_ret = cow;
	return 0;
}

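/*
 * Return 1 if @buf must be COWed before it can be modified in the current
 * transaction, 0 if it can be modified in place (see the rules in the
 * comments below).
 */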
static inline int should_cow_block(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root,
				   struct extent_buffer *buf)
{
	if (btrfs_is_testing(root->fs_info))
		return 0;

	/* Ensure we can see the FORCE_COW bit */
	smp_mb__before_atomic();

	/*
	 * We do not need to cow a block if
	 * 1) this block is not created or changed in this transaction;
	 * 2) this block does not belong to TREE_RELOC tree;
	 * 3) the root is not forced COW.
	 *
	 * What is forced COW:
	 *    when we create snapshot during committing the transaction,
	 *    after we've finished copying src root, we must COW the shared
	 *    block to ensure the metadata consistency.
	 */
	if (btrfs_header_generation(buf) == trans->transid &&
	    !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) &&
	    !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
	      btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) &&
	    !test_bit(BTRFS_ROOT_FORCE_COW, &root->state))
		return 0;
	return 1;
}

/*
 * COWs a single block, see btrfs_force_cow_block() for the real work.
 * This version of it has extra checks so that a block isn't COWed more than
 * once per transaction, as long as it hasn't been written yet.
 */
int btrfs_cow_block(struct btrfs_trans_handle *trans,
		    struct btrfs_root *root, struct extent_buffer *buf,
		    struct extent_buffer *parent, int parent_slot,
		    struct extent_buffer **cow_ret,
		    enum btrfs_lock_nesting nest)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 search_start;
	int ret;

	if (unlikely(test_bit(BTRFS_ROOT_DELETING, &root->state))) {
		btrfs_abort_transaction(trans, -EUCLEAN);
		btrfs_crit(fs_info,
		   "attempt to COW block %llu on root %llu that is being deleted",
			   buf->start, btrfs_root_id(root));
		return -EUCLEAN;
	}

	/*
	 * COWing must happen through a running transaction, which always
	 * matches the current fs generation (it's a transaction with a state
	 * less than TRANS_STATE_UNBLOCKED). If it doesn't, then turn the fs
	 * into error state to prevent the commit of any transaction.
	 */
	if (unlikely(trans->transaction != fs_info->running_transaction ||
		     trans->transid != fs_info->generation)) {
		btrfs_abort_transaction(trans, -EUCLEAN);
		btrfs_crit(fs_info,
"unexpected transaction when attempting to COW block %llu on root %llu, transaction %llu running transaction %llu fs generation %llu",
			   buf->start, btrfs_root_id(root), trans->transid,
			   fs_info->running_transaction->transid,
			   fs_info->generation);
		return -EUCLEAN;
	}

	if (!should_cow_block(trans, root, buf)) {
		*cow_ret = buf;
		return 0;
	}

	search_start = round_down(buf->start, SZ_1G);

	/*
	 * Before CoWing this block for later modification, check if it's
	 * the subtree root and do the delayed subtree trace if needed.
	 *
	 * Also we don't care about the error, as it's handled internally.
	 */
	btrfs_qgroup_trace_subtree_after_cow(trans, root, buf);
	ret = btrfs_force_cow_block(trans, root, buf, parent, parent_slot,
				    cow_ret, search_start, 0, nest);

	trace_btrfs_cow_block(root, buf, *cow_ret);

	return ret;
}
ALLOW_ERROR_INJECTION(btrfs_cow_block, ERRNO);

/*
 * Same as comp_keys, only with two btrfs_key's.
 */
int __pure btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2)
{
	if (k1->objectid > k2->objectid)
		return 1;
	if (k1->objectid < k2->objectid)
		return -1;
	if (k1->type > k2->type)
		return 1;
	if (k1->type < k2->type)
		return -1;
	if (k1->offset > k2->offset)
		return 1;
	if (k1->offset < k2->offset)
		return -1;
	return 0;
}

/*
 * Search for a key in the given extent_buffer.
 *
 * The lower boundary for the search is specified by the slot number @first_slot.
 * Use a value of 0 to search over the whole extent buffer. Works for both
 * leaves and nodes.
 *
 * The slot in the extent buffer is returned via @slot. If the key exists in the
 * extent buffer, then @slot will point to the slot where the key is, otherwise
 * it points to the slot where you would insert the key.
 *
 * Slot may point to the total number of items (i.e. one position beyond the last
 * key) if the key is bigger than the last key in the extent buffer.
 */
int btrfs_bin_search(struct extent_buffer *eb, int first_slot,
		     const struct btrfs_key *key, int *slot)
{
	unsigned long p;
	int item_size;
	/*
	 * Use unsigned types for the low and high slots, so that we get a more
	 * efficient division in the search loop below.
	 */
	u32 low = first_slot;
	u32 high = btrfs_header_nritems(eb);
	int ret;
	const int key_size = sizeof(struct btrfs_disk_key);

	if (unlikely(low > high)) {
		btrfs_err(eb->fs_info,
		 "%s: low (%u) > high (%u) eb %llu owner %llu level %d",
			  __func__, low, high, eb->start,
			  btrfs_header_owner(eb), btrfs_header_level(eb));
		return -EINVAL;
	}

	if (btrfs_header_level(eb) == 0) {
		p = offsetof(struct btrfs_leaf, items);
		item_size = sizeof(struct btrfs_item);
	} else {
		p = offsetof(struct btrfs_node, ptrs);
		item_size = sizeof(struct btrfs_key_ptr);
	}

	while (low < high) {
		unsigned long oip;
		unsigned long offset;
		struct btrfs_disk_key *tmp;
		struct btrfs_disk_key unaligned;
		int mid;

		mid = (low + high) / 2;
		offset = p + mid * item_size;
		oip = offset_in_page(offset);

		if (oip + key_size <= PAGE_SIZE) {
			const unsigned long idx = get_eb_page_index(offset);
			char *kaddr = page_address(eb->pages[idx]);

			oip = get_eb_offset_in_page(eb, offset);
			tmp = (struct btrfs_disk_key *)(kaddr + oip);
		} else {
			read_extent_buffer(eb, &unaligned, offset, key_size);
			tmp = &unaligned;
		}

		ret = btrfs_comp_keys(tmp, key);

		if (ret < 0)
			low = mid + 1;
		else if (ret > 0)
			high = mid;
		else {
			*slot = mid;
			return 0;
		}
	}
	*slot = low;
	return 1;
}

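/*
 * The two helpers below adjust the bytes-used accounting in the root item by
 * one tree block (nodesize), under the root's accounting lock.
 */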
static void root_add_used_bytes(struct btrfs_root *root)
{
	spin_lock(&root->accounting_lock);
	btrfs_set_root_used(&root->root_item,
		btrfs_root_used(&root->root_item) + root->fs_info->nodesize);
	spin_unlock(&root->accounting_lock);
}

static void root_sub_used_bytes(struct btrfs_root *root)
{
	spin_lock(&root->accounting_lock);
	btrfs_set_root_used(&root->root_item,
		btrfs_root_used(&root->root_item) - root->fs_info->nodesize);
	spin_unlock(&root->accounting_lock);
}

/* given a node and slot number, this reads the block it points to. The
 * extent buffer is returned with a reference taken (but unlocked).
 */
struct extent_buffer *btrfs_read_node_slot(struct extent_buffer *parent,
					   int slot)
{
	int level = btrfs_header_level(parent);
	struct btrfs_tree_parent_check check = { 0 };
	struct extent_buffer *eb;

	if (slot < 0 || slot >= btrfs_header_nritems(parent))
		return ERR_PTR(-ENOENT);

	ASSERT(level);

	check.level = level - 1;
	check.transid = btrfs_node_ptr_generation(parent, slot);
	check.owner_root = btrfs_header_owner(parent);
	check.has_first_key = true;
	btrfs_node_key_to_cpu(parent, &check.first_key, slot);

	eb = read_tree_block(parent->fs_info, btrfs_node_blockptr(parent, slot),
			     &check);
	if (IS_ERR(eb))
		return eb;
	if (!extent_buffer_uptodate(eb)) {
		free_extent_buffer(eb);
		return ERR_PTR(-EIO);
	}

	return eb;
}

/*
 * Node level balancing, used to make sure nodes are in proper order for
 * item deletion. We balance from the top down, so we have to make sure
 * that a deletion won't leave a node completely empty later on.
 */
static noinline int balance_level(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root,
				  struct btrfs_path *path, int level)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *right = NULL;
	struct extent_buffer *mid;
	struct extent_buffer *left = NULL;
	struct extent_buffer *parent = NULL;
	int ret = 0;
	int wret;
	int pslot;
	int orig_slot = path->slots[level];
	u64 orig_ptr;

	ASSERT(level > 0);

	mid = path->nodes[level];

	WARN_ON(path->locks[level] != BTRFS_WRITE_LOCK);
	WARN_ON(btrfs_header_generation(mid) != trans->transid);

	orig_ptr = btrfs_node_blockptr(mid, orig_slot);

	if (level < BTRFS_MAX_LEVEL - 1) {
		parent = path->nodes[level + 1];
		pslot = path->slots[level + 1];
	}

	/*
	 * deal with the case where there is only one pointer in the root
	 * by promoting the node below to a root
	 */
	if (!parent) {
		struct extent_buffer *child;

		if (btrfs_header_nritems(mid) != 1)
			return 0;

		/* promote the child to a root */
		child = btrfs_read_node_slot(mid, 0);
		if (IS_ERR(child)) {
			ret = PTR_ERR(child);
			goto out;
		}

		btrfs_tree_lock(child);
		ret = btrfs_cow_block(trans, root, child, mid, 0, &child,
				      BTRFS_NESTING_COW);
		if (ret) {
			btrfs_tree_unlock(child);
			free_extent_buffer(child);
			goto out;
		}

		ret = btrfs_tree_mod_log_insert_root(root->node, child, true);
		if (ret < 0) {
			btrfs_tree_unlock(child);
			free_extent_buffer(child);
			btrfs_abort_transaction(trans, ret);
			goto out;
		}
		rcu_assign_pointer(root->node, child);

		add_root_to_dirty_list(root);
		btrfs_tree_unlock(child);

		path->locks[level] = 0;
		path->nodes[level] = NULL;
		btrfs_clear_buffer_dirty(trans, mid);
		btrfs_tree_unlock(mid);
		/* once for the path */
		free_extent_buffer(mid);

		root_sub_used_bytes(root);
		btrfs_free_tree_block(trans, btrfs_root_id(root), mid, 0, 1);
		/* once for the root ptr */
		free_extent_buffer_stale(mid);
		return 0;
	}
	if (btrfs_header_nritems(mid) >
	    BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 4)
		return 0;

	if (pslot) {
		left = btrfs_read_node_slot(parent, pslot - 1);
		if (IS_ERR(left)) {
			ret = PTR_ERR(left);
			left = NULL;
			goto out;
		}

		__btrfs_tree_lock(left, BTRFS_NESTING_LEFT);
		wret = btrfs_cow_block(trans, root, left,
				       parent, pslot - 1, &left,
				       BTRFS_NESTING_LEFT_COW);
		if (wret) {
			ret = wret;
			goto out;
		}
	}

	if (pslot + 1 < btrfs_header_nritems(parent)) {
		right = btrfs_read_node_slot(parent, pslot + 1);
		if (IS_ERR(right)) {
			ret = PTR_ERR(right);
			right = NULL;
			goto out;
		}

		__btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);
		wret = btrfs_cow_block(trans, root, right,
				       parent, pslot + 1, &right,
				       BTRFS_NESTING_RIGHT_COW);
		if (wret) {
			ret = wret;
			goto out;
		}
	}

	/* first, try to make some room in the middle buffer */
	if (left) {
		orig_slot += btrfs_header_nritems(left);
		wret = push_node_left(trans, left, mid, 1);
		if (wret < 0)
			ret = wret;
	}

	/*
	 * then try to empty the right most buffer into the middle
	 */
	if (right) {
		wret = push_node_left(trans, mid, right, 1);
		if (wret < 0 && wret != -ENOSPC)
			ret = wret;
		if (btrfs_header_nritems(right) == 0) {
			btrfs_clear_buffer_dirty(trans, right);
			btrfs_tree_unlock(right);
			ret = btrfs_del_ptr(trans, root, path, level + 1, pslot + 1);
			if (ret < 0) {
				free_extent_buffer_stale(right);
				right = NULL;
				goto out;
			}
			root_sub_used_bytes(root);
			btrfs_free_tree_block(trans, btrfs_root_id(root), right,
					      0, 1);
			free_extent_buffer_stale(right);
			right = NULL;
		} else {
			struct btrfs_disk_key right_key;

			btrfs_node_key(right, &right_key, 0);
			ret = btrfs_tree_mod_log_insert_key(parent, pslot + 1,
					BTRFS_MOD_LOG_KEY_REPLACE);
			if (ret < 0) {
				btrfs_abort_transaction(trans, ret);
				goto out;
			}
			btrfs_set_node_key(parent, &right_key, pslot + 1);
			btrfs_mark_buffer_dirty(trans, parent);
		}
	}
	if (btrfs_header_nritems(mid) == 1) {
		/*
		 * we're not allowed to leave a node with one item in the
		 * tree during a delete. A deletion from lower in the tree
		 * could try to delete the only pointer in this node.
		 * So, pull some keys from the left.
		 * There has to be a left pointer at this point because
		 * otherwise we would have pulled some pointers from the
		 * right
		 */
		if (unlikely(!left)) {
			btrfs_crit(fs_info,
"missing left child when middle child only has 1 item, parent bytenr %llu level %d mid bytenr %llu root %llu",
				   parent->start, btrfs_header_level(parent),
				   mid->start, btrfs_root_id(root));
			ret = -EUCLEAN;
			btrfs_abort_transaction(trans, ret);
			goto out;
		}
		wret = balance_node_right(trans, mid, left);
		if (wret < 0) {
			ret = wret;
			goto out;
		}
		if (wret == 1) {
			wret = push_node_left(trans, left, mid, 1);
			if (wret < 0)
				ret = wret;
		}
		BUG_ON(wret == 1);
	}
	if (btrfs_header_nritems(mid) == 0) {
		btrfs_clear_buffer_dirty(trans, mid);
		btrfs_tree_unlock(mid);
		ret = btrfs_del_ptr(trans, root, path, level + 1, pslot);
		if (ret < 0) {
			free_extent_buffer_stale(mid);
			mid = NULL;
			goto out;
		}
		root_sub_used_bytes(root);
		btrfs_free_tree_block(trans, btrfs_root_id(root), mid, 0, 1);
		free_extent_buffer_stale(mid);
		mid = NULL;
	} else {
		/* update the parent key to reflect our changes */
		struct btrfs_disk_key mid_key;

		btrfs_node_key(mid, &mid_key, 0);
		ret = btrfs_tree_mod_log_insert_key(parent, pslot,
						    BTRFS_MOD_LOG_KEY_REPLACE);
		if (ret < 0) {
			btrfs_abort_transaction(trans, ret);
			goto out;
		}
		btrfs_set_node_key(parent, &mid_key, pslot);
		btrfs_mark_buffer_dirty(trans, parent);
	}

	/* update the path */
	if (left) {
		if (btrfs_header_nritems(left) > orig_slot) {
			atomic_inc(&left->refs);
			/* left was locked after cow */
			path->nodes[level] = left;
			path->slots[level + 1] -= 1;
			path->slots[level] = orig_slot;
			if (mid) {
				btrfs_tree_unlock(mid);
				free_extent_buffer(mid);
			}
		} else {
			orig_slot -= btrfs_header_nritems(left);
			path->slots[level] = orig_slot;
		}
	}
	/* double check we haven't messed things up */
	if (orig_ptr !=
	    btrfs_node_blockptr(path->nodes[level], path->slots[level]))
		BUG();
out:
	if (right) {
		btrfs_tree_unlock(right);
		free_extent_buffer(right);
	}
	if (left) {
		if (path->nodes[level] != left)
			btrfs_tree_unlock(left);
		free_extent_buffer(left);
	}
	return ret;
}

/* Node balancing for insertion.  Here we only split or push nodes around
 * when they are completely full.  This is also done top down, so we
 * have to be pessimistic.
 */
static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
					  struct btrfs_root *root,
					  struct btrfs_path *path, int level)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *right = NULL;
	struct extent_buffer *mid;
	struct extent_buffer *left = NULL;
	struct extent_buffer *parent = NULL;
	int ret = 0;
	int wret;
	int pslot;
	int orig_slot = path->slots[level];

	if (level == 0)
		return 1;

	mid = path->nodes[level];
	WARN_ON(btrfs_header_generation(mid) != trans->transid);

	if (level < BTRFS_MAX_LEVEL - 1) {
		parent = path->nodes[level + 1];
		pslot = path->slots[level + 1];
	}

	if (!parent)
		return 1;

	/* first, try to make some room in the middle buffer */
	if (pslot) {
		u32 left_nr;

		left = btrfs_read_node_slot(parent, pslot - 1);
		if (IS_ERR(left))
			return PTR_ERR(left);

		__btrfs_tree_lock(left, BTRFS_NESTING_LEFT);

		left_nr = btrfs_header_nritems(left);
		if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
			wret = 1;
		} else {
			ret = btrfs_cow_block(trans, root, left, parent,
					      pslot - 1, &left,
					      BTRFS_NESTING_LEFT_COW);
			if (ret)
				wret = 1;
			else {
				wret = push_node_left(trans, left, mid, 0);
			}
		}
		if (wret < 0)
			ret = wret;
		if (wret == 0) {
			struct btrfs_disk_key disk_key;

			orig_slot += left_nr;
			btrfs_node_key(mid, &disk_key, 0);
			ret = btrfs_tree_mod_log_insert_key(parent, pslot,
					BTRFS_MOD_LOG_KEY_REPLACE);
			if (ret < 0) {
				btrfs_tree_unlock(left);
				free_extent_buffer(left);
				btrfs_abort_transaction(trans, ret);
				return ret;
			}
			btrfs_set_node_key(parent, &disk_key, pslot);
			btrfs_mark_buffer_dirty(trans, parent);
			if (btrfs_header_nritems(left) > orig_slot) {
				path->nodes[level] = left;
				path->slots[level + 1] -= 1;
				path->slots[level] = orig_slot;
				btrfs_tree_unlock(mid);
				free_extent_buffer(mid);
			} else {
				orig_slot -= btrfs_header_nritems(left);
				path->slots[level] = orig_slot;
				btrfs_tree_unlock(left);
				free_extent_buffer(left);
			}
			return 0;
		}
		btrfs_tree_unlock(left);
		free_extent_buffer(left);
	}

	/*
	 * then try to empty the right most buffer into the middle
	 */
	if (pslot + 1 < btrfs_header_nritems(parent)) {
		u32 right_nr;

		right = btrfs_read_node_slot(parent, pslot + 1);
		if (IS_ERR(right))
			return PTR_ERR(right);

		__btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);

		right_nr = btrfs_header_nritems(right);
		if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
			wret = 1;
		} else {
			ret = btrfs_cow_block(trans, root, right,
					      parent, pslot + 1,
					      &right, BTRFS_NESTING_RIGHT_COW);
			if (ret)
				wret = 1;
			else {
				wret = balance_node_right(trans, right, mid);
			}
		}
		if (wret < 0)
			ret = wret;
		if (wret == 0) {
			struct btrfs_disk_key disk_key;

			btrfs_node_key(right, &disk_key, 0);
			ret = btrfs_tree_mod_log_insert_key(parent, pslot + 1,
					BTRFS_MOD_LOG_KEY_REPLACE);
			if (ret < 0) {
				btrfs_tree_unlock(right);
				free_extent_buffer(right);
				btrfs_abort_transaction(trans, ret);
				return ret;
			}
			btrfs_set_node_key(parent, &disk_key, pslot + 1);
			btrfs_mark_buffer_dirty(trans, parent);

			if (btrfs_header_nritems(mid) <= orig_slot) {
				path->nodes[level] = right;
				path->slots[level + 1] += 1;
				path->slots[level] = orig_slot -
					btrfs_header_nritems(mid);
				btrfs_tree_unlock(mid);
				free_extent_buffer(mid);
			} else {
				btrfs_tree_unlock(right);
				free_extent_buffer(right);
			}
			return 0;
		}
		btrfs_tree_unlock(right);
		free_extent_buffer(right);
	}
	return 1;
}

/*
 * readahead one full node of leaves, finding things that are close
 * to the block in 'slot', and triggering ra on them.
 */
static void reada_for_search(struct btrfs_fs_info *fs_info,
			     struct btrfs_path *path,
			     int level, int slot, u64 objectid)
{
	struct extent_buffer *node;
	struct btrfs_disk_key disk_key;
	u32 nritems;
	u64 search;
	u64 target;
	u64 nread = 0;
	u64 nread_max;
	u32 nr;
	u32 blocksize;
	u32 nscan = 0;

	if (level != 1 && path->reada != READA_FORWARD_ALWAYS)
		return;

	if (!path->nodes[level])
		return;

	node = path->nodes[level];

	/*
	 * Since the time between visiting leaves is much shorter than the time
	 * between visiting nodes, limit read ahead of nodes to 1, to avoid too
	 * much IO at once (possibly random).
	 */
	if (path->reada == READA_FORWARD_ALWAYS) {
		if (level > 1)
			nread_max = node->fs_info->nodesize;
		else
			nread_max = SZ_128K;
	} else {
		nread_max = SZ_64K;
	}

	search = btrfs_node_blockptr(node, slot);
	blocksize = fs_info->nodesize;
	if (path->reada != READA_FORWARD_ALWAYS) {
		struct extent_buffer *eb;

		eb = find_extent_buffer(fs_info, search);
		if (eb) {
			free_extent_buffer(eb);
			return;
		}
	}

	target = search;

	nritems = btrfs_header_nritems(node);
	nr = slot;

	while (1) {
		if (path->reada == READA_BACK) {
			if (nr == 0)
				break;
			nr--;
		} else if (path->reada == READA_FORWARD ||
			   path->reada == READA_FORWARD_ALWAYS) {
			nr++;
			if (nr >= nritems)
				break;
		}
		if (path->reada == READA_BACK && objectid) {
			btrfs_node_key(node, &disk_key, nr);
			if (btrfs_disk_key_objectid(&disk_key) != objectid)
				break;
		}
		search = btrfs_node_blockptr(node, nr);
		if (path->reada == READA_FORWARD_ALWAYS ||
		    (search <= target && target - search <= 65536) ||
		    (search > target && search - target <= 65536)) {
			btrfs_readahead_node_child(node, nr);
			nread += blocksize;
		}
		nscan++;
		if (nread > nread_max || nscan > 32)
			break;
	}
}

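/*
 * Readahead the immediate siblings of the node at the given level of the
 * path, since a following balance may need to read and merge with them.
 */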
static noinline void reada_for_balance(struct btrfs_path *path, int level)
{
	struct extent_buffer *parent;
	int slot;
	int nritems;

	parent = path->nodes[level + 1];
	if (!parent)
		return;

	nritems = btrfs_header_nritems(parent);
	slot = path->slots[level + 1];

	if (slot > 0)
		btrfs_readahead_node_child(parent, slot - 1);
	if (slot + 1 < nritems)
		btrfs_readahead_node_child(parent, slot + 1);
}

/*
 * when we walk down the tree, it is usually safe to unlock the higher layers
 * in the tree. The exceptions are when our path goes through slot 0, because
 * operations on the tree might require changing key pointers higher up in the
 * tree.
 *
 * callers might also have set path->keep_locks, which tells this code to keep
 * the lock if the path points to the last slot in the block. This is part of
 * walking through the tree, and selecting the next slot in the higher block.
 *
 * lowest_unlock sets the lowest level in the tree we're allowed to unlock. So
 * if lowest_unlock is 1, level 0 won't be unlocked
 */
static noinline void unlock_up(struct btrfs_path *path, int level,
			       int lowest_unlock, int min_write_lock_level,
			       int *write_lock_level)
{
	int i;
	int skip_level = level;
	bool check_skip = true;

	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
		if (!path->nodes[i])
			break;
		if (!path->locks[i])
			break;

		if (check_skip) {
			if (path->slots[i] == 0) {
				skip_level = i + 1;
				continue;
			}

			if (path->keep_locks) {
				u32 nritems;

				nritems = btrfs_header_nritems(path->nodes[i]);
				if (nritems < 1 || path->slots[i] >= nritems - 1) {
					skip_level = i + 1;
					continue;
				}
			}
		}

		if (i >= lowest_unlock && i > skip_level) {
			check_skip = false;
			btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]);
			path->locks[i] = 0;
			if (write_lock_level &&
			    i > min_write_lock_level &&
			    i <= *write_lock_level) {
				*write_lock_level = i - 1;
			}
		}
	}
}

/*
 * Helper function for btrfs_search_slot() and other functions that do a search
 * on a btree. The goal is to find a tree block in the cache (the radix tree at
 * fs_info->buffer_radix), but if we can't find it, or it's not up to date, read
 * its pages from disk.
 *
 * Returns -EAGAIN, with the path unlocked, if the caller needs to repeat the
 * whole btree search, starting again from the current root node.
 */
static int
read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
		      struct extent_buffer **eb_ret, int level, int slot,
		      const struct btrfs_key *key)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_tree_parent_check check = { 0 };
	u64 blocknr;
	u64 gen;
	struct extent_buffer *tmp;
	int ret;
	int parent_level;
	bool unlock_up;

	unlock_up = ((level + 1 < BTRFS_MAX_LEVEL) && p->locks[level + 1]);
	blocknr = btrfs_node_blockptr(*eb_ret, slot);
	gen = btrfs_node_ptr_generation(*eb_ret, slot);
	parent_level = btrfs_header_level(*eb_ret);
	btrfs_node_key_to_cpu(*eb_ret, &check.first_key, slot);
	check.has_first_key = true;
	check.level = parent_level - 1;
	check.transid = gen;
	check.owner_root = root->root_key.objectid;

	/*
	 * If we need to read an extent buffer from disk and we are holding locks
	 * on upper level nodes, we unlock all the upper nodes before reading the
	 * extent buffer, and then return -EAGAIN to the caller as it needs to
	 * restart the search. We don't release the lock on the current level
	 * because we need to walk this node to figure out which blocks to read.
	 */
	tmp = find_extent_buffer(fs_info, blocknr);
	if (tmp) {
		if (p->reada == READA_FORWARD_ALWAYS)
			reada_for_search(fs_info, p, level, slot, key->objectid);

		/* first we do an atomic uptodate check */
		if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
			/*
			 * Do extra check for first_key, eb can be stale due to
			 * being cached, read from scrub, or have multiple
			 * parents (shared tree blocks).
			 */
			if (btrfs_verify_level_key(tmp,
					parent_level - 1, &check.first_key, gen)) {
				free_extent_buffer(tmp);
				return -EUCLEAN;
			}
			*eb_ret = tmp;
			return 0;
		}

		if (p->nowait) {
			free_extent_buffer(tmp);
			return -EAGAIN;
		}

		if (unlock_up)
			btrfs_unlock_up_safe(p, level + 1);

		/* now we're allowed to do a blocking uptodate check */
		ret = btrfs_read_extent_buffer(tmp, &check);
		if (ret) {
			free_extent_buffer(tmp);
			btrfs_release_path(p);
			return -EIO;
		}
		if (btrfs_check_eb_owner(tmp, root->root_key.objectid)) {
			free_extent_buffer(tmp);
			btrfs_release_path(p);
			return -EUCLEAN;
		}

		if (unlock_up)
			ret = -EAGAIN;

		goto out;
	} else if (p->nowait) {
		return -EAGAIN;
	}

	if (unlock_up) {
		btrfs_unlock_up_safe(p, level + 1);
		ret = -EAGAIN;
	} else {
		ret = 0;
	}

	if (p->reada != READA_NONE)
		reada_for_search(fs_info, p, level, slot, key->objectid);

	tmp = read_tree_block(fs_info, blocknr, &check);
	if (IS_ERR(tmp)) {
		btrfs_release_path(p);
		return PTR_ERR(tmp);
	}
	/*
	 * If the read above didn't mark this buffer up to date,
	 * it will never end up being up to date. Set ret to EIO now
	 * and give up so that our caller doesn't loop forever
	 * on our EAGAINs.
	 */
	if (!extent_buffer_uptodate(tmp))
		ret = -EIO;

out:
	if (ret == 0) {
		*eb_ret = tmp;
	} else {
		free_extent_buffer(tmp);
		btrfs_release_path(p);
	}

	return ret;
}

/*
 * helper function for btrfs_search_slot. This does all of the checks
 * for node-level blocks and does any balancing required based on
 * the ins_len.
 *
 * If no extra work was required, zero is returned. If we had to
 * drop the path, -EAGAIN is returned and btrfs_search_slot must
 * start over
 */
static int
setup_nodes_for_search(struct btrfs_trans_handle *trans,
		       struct btrfs_root *root, struct btrfs_path *p,
		       struct extent_buffer *b, int level, int ins_len,
		       int *write_lock_level)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	int ret = 0;

	if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >=
	    BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3) {

		if (*write_lock_level < level + 1) {
			*write_lock_level = level + 1;
			btrfs_release_path(p);
			return -EAGAIN;
		}

		reada_for_balance(p, level);
		ret = split_node(trans, root, p, level);

		b = p->nodes[level];
	} else if (ins_len < 0 && btrfs_header_nritems(b) <
		   BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 2) {

		if (*write_lock_level < level + 1) {
			*write_lock_level = level + 1;
			btrfs_release_path(p);
			return -EAGAIN;
		}

		reada_for_balance(p, level);
		ret = balance_level(trans, root, p, level);
		if (ret)
			return ret;

		b = p->nodes[level];
		if (!b) {
			btrfs_release_path(p);
			return -EAGAIN;
		}
		BUG_ON(btrfs_header_nritems(b) == 1);
	}
	return ret;
}

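/*
 * Search @fs_root for a key with the given objectid, type and offset. If no
 * exact match is found, the key of the next item in the tree (walking to the
 * next leaf if needed) is returned in @found_key, when there is one. Returns
 * 0 if an item with the same objectid and type was found (its offset may
 * differ), 1 if not, and a negative errno on failure.
 */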
int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *path,
		    u64 iobjectid, u64 ioff, u8 key_type,
		    struct btrfs_key *found_key)
{
	int ret;
	struct btrfs_key key;
	struct extent_buffer *eb;

	ASSERT(path);
	ASSERT(found_key);

	key.type = key_type;
	key.objectid = iobjectid;
	key.offset = ioff;

	ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0);
	if (ret < 0)
		return ret;

	eb = path->nodes[0];
	if (ret && path->slots[0] >= btrfs_header_nritems(eb)) {
		ret = btrfs_next_leaf(fs_root, path);
		if (ret)
			return ret;
		eb = path->nodes[0];
	}

	btrfs_item_key_to_cpu(eb, found_key, path->slots[0]);
	if (found_key->type != key.type ||
	    found_key->objectid != key.objectid)
		return 1;

	return 0;
}

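/*
 * Get the root node of @root for a search, honoring p->search_commit_root,
 * p->skip_locking and p->nowait. Unless a write lock is required by
 * @write_lock_level, the root is read locked. On success the node is stored
 * in the path together with the lock taken (if any) and returned, otherwise
 * an ERR_PTR is returned.
 */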
static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root,
							struct btrfs_path *p,
							int write_lock_level)
{
	struct extent_buffer *b;
	int root_lock = 0;
	int level = 0;

	if (p->search_commit_root) {
		b = root->commit_root;
		atomic_inc(&b->refs);
		level = btrfs_header_level(b);
		/*
		 * Ensure that all callers have set skip_locking when
		 * p->search_commit_root = 1.
		 */
		ASSERT(p->skip_locking == 1);

		goto out;
	}

	if (p->skip_locking) {
		b = btrfs_root_node(root);
		level = btrfs_header_level(b);
		goto out;
	}

	/* We try very hard to do read locks on the root */
	root_lock = BTRFS_READ_LOCK;

	/*
	 * If the level is set to maximum, we can skip trying to get the read
	 * lock.
	 */
	if (write_lock_level < BTRFS_MAX_LEVEL) {
		/*
		 * We don't know the level of the root node until we actually
		 * have it read locked
		 */
		if (p->nowait) {
			b = btrfs_try_read_lock_root_node(root);
			if (IS_ERR(b))
				return b;
		} else {
			b = btrfs_read_lock_root_node(root);
		}
		level = btrfs_header_level(b);
		if (level > write_lock_level)
			goto out;

		/* Whoops, must trade for write lock */
		btrfs_tree_read_unlock(b);
		free_extent_buffer(b);
	}

	b = btrfs_lock_root_node(root);
	root_lock = BTRFS_WRITE_LOCK;

	/* The level might have changed, check again */
	level = btrfs_header_level(b);

out:
	/*
	 * The root may have failed to write out at some point, and thus is no
	 * longer valid, return an error in this case.
	 */
	if (!extent_buffer_uptodate(b)) {
		if (root_lock)
			btrfs_tree_unlock_rw(b, root_lock);
		free_extent_buffer(b);
		return ERR_PTR(-EIO);
	}

	p->nodes[level] = b;
	if (!p->skip_locking)
		p->locks[level] = root_lock;
	/*
	 * Callers are responsible for dropping b's references.
	 */
	return b;
}

/*
 * Replace the extent buffer at the lowest level of the path with a cloned
 * version. The purpose is to be able to use it safely, after releasing the
 * commit root semaphore, even if relocation is happening in parallel, the
 * transaction used for relocation is committed and the extent buffer is
 * reallocated in the next transaction.
 *
 * This is used in a context where the caller does not prevent transaction
 * commits from happening, either by holding a transaction handle or holding
 * some lock, while it's doing searches through a commit root.
 * At the moment it's only used for send operations.
 */
static int finish_need_commit_sem_search(struct btrfs_path *path)
{
	const int i = path->lowest_level;
	const int slot = path->slots[i];
	struct extent_buffer *lowest = path->nodes[i];
	struct extent_buffer *clone;

	ASSERT(path->need_commit_sem);

	if (!lowest)
		return 0;

	lockdep_assert_held_read(&lowest->fs_info->commit_root_sem);

	clone = btrfs_clone_extent_buffer(lowest);
	if (!clone)
		return -ENOMEM;

	btrfs_release_path(path);
	path->nodes[i] = clone;
	path->slots[i] = slot;

	return 0;
}

static inline int search_for_key_slot(struct extent_buffer *eb,
				      int search_low_slot,
				      const struct btrfs_key *key,
				      int prev_cmp,
				      int *slot)
{
	/*
	 * If a previous call to btrfs_bin_search() on a parent node returned an
	 * exact match (prev_cmp == 0), we can safely assume the target key will
	 * always be at slot 0 on lower levels, since each key pointer
	 * (struct btrfs_key_ptr) refers to the lowest key accessible from the
	 * subtree it points to. Thus we can skip searching lower levels.
	 */
	if (prev_cmp == 0) {
		*slot = 0;
		return 0;
	}

	return btrfs_bin_search(eb, search_low_slot, key, slot);
}

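/*
 * Do the final leaf step of btrfs_search_slot(): find the slot for @key in
 * the leaf, unlocking upper nodes as early as it is safe to do so, and split
 * the leaf if an insertion of @ins_len bytes does not fit. Returns the binary
 * search result (0 if the key was found, 1 otherwise) or a negative errno on
 * failure.
 */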
static int search_leaf(struct btrfs_trans_handle *trans,
		       struct btrfs_root *root,
		       const struct btrfs_key *key,
		       struct btrfs_path *path,
		       int ins_len,
		       int prev_cmp)
{
	struct extent_buffer *leaf = path->nodes[0];
	int leaf_free_space = -1;
	int search_low_slot = 0;
	int ret;
	bool do_bin_search = true;

	/*
	 * If we are doing an insertion, the leaf has enough free space and the
	 * destination slot for the key is not slot 0, then we can unlock our
	 * write lock on the parent, and any other upper nodes, before doing the
	 * binary search on the leaf (with search_for_key_slot()), allowing other
	 * tasks to lock the parent and any other upper nodes.
	 */
	if (ins_len > 0) {
		/*
		 * Cache the leaf free space, since we will need it later and it
		 * will not change until then.
		 */
		leaf_free_space = btrfs_leaf_free_space(leaf);

		/*
		 * !path->locks[1] means we have a single node tree, the leaf is
		 * the root of the tree.
		 */
		if (path->locks[1] && leaf_free_space >= ins_len) {
			struct btrfs_disk_key first_key;

			ASSERT(btrfs_header_nritems(leaf) > 0);
			btrfs_item_key(leaf, &first_key, 0);

			/*
			 * Doing the extra comparison with the first key is cheap,
			 * taking into account that the first key is very likely
			 * already in a cache line because it immediately follows
			 * the extent buffer's header and we have recently accessed
			 * the header's level field.
			 */
			ret = btrfs_comp_keys(&first_key, key);
			if (ret < 0) {
				/*
				 * The first key is smaller than the key we want
				 * to insert, so we are safe to unlock all upper
				 * nodes and we have to do the binary search.
				 *
				 * We do use btrfs_unlock_up_safe() and not
				 * unlock_up() because the latter does not unlock
				 * nodes with a slot of 0 - we can safely unlock
				 * any node even if its slot is 0 since in this
				 * case the key does not end up at slot 0 of the
				 * leaf and there's no need to split the leaf.
				 */
				btrfs_unlock_up_safe(path, 1);
				search_low_slot = 1;
			} else {
				/*
				 * The first key is >= the key we want to
				 * insert, so we can skip the binary search as
				 * the target key will be at slot 0.
				 *
				 * We cannot unlock upper nodes when the key is
				 * less than the first key, because we will need
				 * to update the key at slot 0 of the parent node
				 * and possibly of other upper nodes too.
				 * If the key matches the first key, then we can
				 * unlock all the upper nodes, using
				 * btrfs_unlock_up_safe() instead of unlock_up()
				 * as stated above.
				 */
				if (ret == 0)
					btrfs_unlock_up_safe(path, 1);
				/*
				 * ret is already 0 or 1, matching the result of
				 * a btrfs_bin_search() call, so there is no need
				 * to adjust it.
				 */
				do_bin_search = false;
				path->slots[0] = 0;
			}
		}
	}

	if (do_bin_search) {
		ret = search_for_key_slot(leaf, search_low_slot, key,
					  prev_cmp, &path->slots[0]);
		if (ret < 0)
			return ret;
	}

	if (ins_len > 0) {
		/*
		 * Item key already exists. In this case, if we are allowed to
		 * insert the item (for example, in dir_item case, item key
		 * collision is allowed), it will be merged with the original
		 * item. Only the item size grows, no new btrfs item will be
		 * added. If search_for_extension is not set, ins_len already
		 * accounts for the size of a btrfs_item, deduct it here so the
		 * leaf space check will be correct.
		 */
		if (ret == 0 && !path->search_for_extension) {
			ASSERT(ins_len >= sizeof(struct btrfs_item));
			ins_len -= sizeof(struct btrfs_item);
		}

		ASSERT(leaf_free_space >= 0);

		if (leaf_free_space < ins_len) {
			int err;

			err = split_leaf(trans, root, key, path, ins_len,
					 (ret == 0));
			ASSERT(err <= 0);
			if (WARN_ON(err > 0))
				err = -EUCLEAN;
			if (err)
				ret = err;
		}
	}

	return ret;
}

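/*
 * Illustrative sketch (not a real in-tree caller) of the common read-only use
 * of btrfs_search_slot(), which is documented below. The names 'root' and
 * 'ino' stand for whatever root and objectid the caller is interested in. A
 * NULL transaction handle with cow == 0 performs a plain search; 0 means the
 * key was found at path->slots[0], 1 means it was not and the slot is where
 * it would be inserted:
 *
 *	struct btrfs_path *path = btrfs_alloc_path();
 *	struct btrfs_key key = { .objectid = ino,
 *				 .type = BTRFS_INODE_ITEM_KEY,
 *				 .offset = 0 };
 *	int ret;
 *
 *	if (!path)
 *		return -ENOMEM;
 *	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 *	if (ret == 0)
 *		... read the item at path->nodes[0], path->slots[0] ...
 *	btrfs_free_path(path);
 */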
In this case, if we are allowed to 1927 * insert the item (for example, in dir_item case, item key 1928 * collision is allowed), it will be merged with the original 1929 * item. Only the item size grows, no new btrfs item will be 1930 * added. If search_for_extension is not set, ins_len already 1931 * accounts for the size of a struct btrfs_item; deduct it here 1932 * so the leaf space check will be correct. 1933 */ 1934 if (ret == 0 && !path->search_for_extension) { 1935 ASSERT(ins_len >= sizeof(struct btrfs_item)); 1936 ins_len -= sizeof(struct btrfs_item); 1937 } 1938 1939 ASSERT(leaf_free_space >= 0); 1940 1941 if (leaf_free_space < ins_len) { 1942 int err; 1943 1944 err = split_leaf(trans, root, key, path, ins_len, 1945 (ret == 0)); 1946 ASSERT(err <= 0); 1947 if (WARN_ON(err > 0)) 1948 err = -EUCLEAN; 1949 if (err) 1950 ret = err; 1951 } 1952 } 1953 1954 return ret; 1955 } 1956 1957 /* 1958 * Look for a key in a tree and perform necessary modifications to preserve 1959 * tree invariants. 1960 * 1961 * @trans: Handle of transaction, used when modifying the tree 1962 * @p: Holds all btree nodes along the search path 1963 * @root: The root node of the tree 1964 * @key: The key we are looking for 1965 * @ins_len: Indicates purpose of search: 1966 * >0 for inserts, it's the size of the item inserted (*) 1967 * <0 for deletions 1968 * 0 for plain searches, not modifying the tree 1969 * 1970 * (*) If the size of the item inserted doesn't include 1971 * sizeof(struct btrfs_item), then p->search_for_extension must 1972 * be set. 1973 * @cow: whether CoW operations should be performed. Must always be 1 1974 * when modifying the tree. 1975 * 1976 * If @ins_len > 0, nodes and leaves will be split as we walk down the tree. 1977 * If @ins_len < 0, nodes will be merged as we walk down the tree (if possible) 1978 * 1979 * If @key is found, 0 is returned and you can find the item in the leaf level 1980 * of the path (level 0) 1981 * 1982 * If @key isn't found, 1 is returned and the leaf level of the path (level 0) 1983 * points to the slot where it should be inserted 1984 * 1985 * If an error is encountered while searching the tree a negative error number 1986 * is returned 1987 */ 1988 int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root, 1989 const struct btrfs_key *key, struct btrfs_path *p, 1990 int ins_len, int cow) 1991 { 1992 struct btrfs_fs_info *fs_info = root->fs_info; 1993 struct extent_buffer *b; 1994 int slot; 1995 int ret; 1996 int err; 1997 int level; 1998 int lowest_unlock = 1; 1999 /* everything at write_lock_level or lower must be write locked */ 2000 int write_lock_level = 0; 2001 u8 lowest_level = 0; 2002 int min_write_lock_level; 2003 int prev_cmp; 2004 2005 might_sleep(); 2006 2007 lowest_level = p->lowest_level; 2008 WARN_ON(lowest_level && ins_len > 0); 2009 WARN_ON(p->nodes[0] != NULL); 2010 BUG_ON(!cow && ins_len); 2011 2012 /* 2013 * For now only allow nowait for read only operations. There's no 2014 * strict reason why we can't support writes too, we just only need 2015 * it for reads, so it's only implemented for reads.
2016 */ 2017 ASSERT(!p->nowait || !cow); 2018 2019 if (ins_len < 0) { 2020 lowest_unlock = 2; 2021 2022 /* When we are removing items, we might have to go up to level 2023 * two as we update tree pointers. Make sure we keep write 2024 * locks for those levels as well. 2025 */ 2026 write_lock_level = 2; 2027 } else if (ins_len > 0) { 2028 /* 2029 * for inserting items, make sure we have a write lock on 2030 * level 1 so we can update keys 2031 */ 2032 write_lock_level = 1; 2033 } 2034 2035 if (!cow) 2036 write_lock_level = -1; 2037 2038 if (cow && (p->keep_locks || p->lowest_level)) 2039 write_lock_level = BTRFS_MAX_LEVEL; 2040 2041 min_write_lock_level = write_lock_level; 2042 2043 if (p->need_commit_sem) { 2044 ASSERT(p->search_commit_root); 2045 if (p->nowait) { 2046 if (!down_read_trylock(&fs_info->commit_root_sem)) 2047 return -EAGAIN; 2048 } else { 2049 down_read(&fs_info->commit_root_sem); 2050 } 2051 } 2052 2053 again: 2054 prev_cmp = -1; 2055 b = btrfs_search_slot_get_root(root, p, write_lock_level); 2056 if (IS_ERR(b)) { 2057 ret = PTR_ERR(b); 2058 goto done; 2059 } 2060 2061 while (b) { 2062 int dec = 0; 2063 2064 level = btrfs_header_level(b); 2065 2066 if (cow) { 2067 bool last_level = (level == (BTRFS_MAX_LEVEL - 1)); 2068 2069 /* 2070 * if we don't really need to cow this block 2071 * then we don't want to set the path blocking, 2072 * so we test it here 2073 */ 2074 if (!should_cow_block(trans, root, b)) 2075 goto cow_done; 2076 2077 /* 2078 * must have write locks on this node and the 2079 * parent 2080 */ 2081 if (level > write_lock_level || 2082 (level + 1 > write_lock_level && 2083 level + 1 < BTRFS_MAX_LEVEL && 2084 p->nodes[level + 1])) { 2085 write_lock_level = level + 1; 2086 btrfs_release_path(p); 2087 goto again; 2088 } 2089 2090 if (last_level) 2091 err = btrfs_cow_block(trans, root, b, NULL, 0, 2092 &b, 2093 BTRFS_NESTING_COW); 2094 else 2095 err = btrfs_cow_block(trans, root, b, 2096 p->nodes[level + 1], 2097 p->slots[level + 1], &b, 2098 BTRFS_NESTING_COW); 2099 if (err) { 2100 ret = err; 2101 goto done; 2102 } 2103 } 2104 cow_done: 2105 p->nodes[level] = b; 2106 2107 /* 2108 * we have a lock on b and as long as we aren't changing 2109 * the tree, there is no way for the items in b to change. 2110 * It is safe to drop the lock on our parent before we 2111 * go through the expensive btree search on b. 2112 * 2113 * If we're inserting or deleting (ins_len != 0), then we might 2114 * be changing slot zero, which may require changing the parent. 2115 * So, we can't drop the lock until after we know which slot 2116 * we're operating on.
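 * For example, a plain lookup (ins_len == 0 and keep_locks not set) that has just locked b no longer needs the lock on the parent at level + 1, and the block right below releases it.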
2117 */ 2118 if (!ins_len && !p->keep_locks) { 2119 int u = level + 1; 2120 2121 if (u < BTRFS_MAX_LEVEL && p->locks[u]) { 2122 btrfs_tree_unlock_rw(p->nodes[u], p->locks[u]); 2123 p->locks[u] = 0; 2124 } 2125 } 2126 2127 if (level == 0) { 2128 if (ins_len > 0) 2129 ASSERT(write_lock_level >= 1); 2130 2131 ret = search_leaf(trans, root, key, p, ins_len, prev_cmp); 2132 if (!p->search_for_split) 2133 unlock_up(p, level, lowest_unlock, 2134 min_write_lock_level, NULL); 2135 goto done; 2136 } 2137 2138 ret = search_for_key_slot(b, 0, key, prev_cmp, &slot); 2139 if (ret < 0) 2140 goto done; 2141 prev_cmp = ret; 2142 2143 if (ret && slot > 0) { 2144 dec = 1; 2145 slot--; 2146 } 2147 p->slots[level] = slot; 2148 err = setup_nodes_for_search(trans, root, p, b, level, ins_len, 2149 &write_lock_level); 2150 if (err == -EAGAIN) 2151 goto again; 2152 if (err) { 2153 ret = err; 2154 goto done; 2155 } 2156 b = p->nodes[level]; 2157 slot = p->slots[level]; 2158 2159 /* 2160 * Slot 0 is special, if we change the key we have to update 2161 * the parent pointer which means we must have a write lock on 2162 * the parent 2163 */ 2164 if (slot == 0 && ins_len && write_lock_level < level + 1) { 2165 write_lock_level = level + 1; 2166 btrfs_release_path(p); 2167 goto again; 2168 } 2169 2170 unlock_up(p, level, lowest_unlock, min_write_lock_level, 2171 &write_lock_level); 2172 2173 if (level == lowest_level) { 2174 if (dec) 2175 p->slots[level]++; 2176 goto done; 2177 } 2178 2179 err = read_block_for_search(root, p, &b, level, slot, key); 2180 if (err == -EAGAIN) 2181 goto again; 2182 if (err) { 2183 ret = err; 2184 goto done; 2185 } 2186 2187 if (!p->skip_locking) { 2188 level = btrfs_header_level(b); 2189 2190 btrfs_maybe_reset_lockdep_class(root, b); 2191 2192 if (level <= write_lock_level) { 2193 btrfs_tree_lock(b); 2194 p->locks[level] = BTRFS_WRITE_LOCK; 2195 } else { 2196 if (p->nowait) { 2197 if (!btrfs_try_tree_read_lock(b)) { 2198 free_extent_buffer(b); 2199 ret = -EAGAIN; 2200 goto done; 2201 } 2202 } else { 2203 btrfs_tree_read_lock(b); 2204 } 2205 p->locks[level] = BTRFS_READ_LOCK; 2206 } 2207 p->nodes[level] = b; 2208 } 2209 } 2210 ret = 1; 2211 done: 2212 if (ret < 0 && !p->skip_release_on_error) 2213 btrfs_release_path(p); 2214 2215 if (p->need_commit_sem) { 2216 int ret2; 2217 2218 ret2 = finish_need_commit_sem_search(p); 2219 up_read(&fs_info->commit_root_sem); 2220 if (ret2) 2221 ret = ret2; 2222 } 2223 2224 return ret; 2225 } 2226 ALLOW_ERROR_INJECTION(btrfs_search_slot, ERRNO); 2227 2228 /* 2229 * Like btrfs_search_slot, this looks for a key in the given tree. It uses the 2230 * current state of the tree together with the operations recorded in the tree 2231 * modification log to search for the key in a previous version of this tree, as 2232 * denoted by the time_seq parameter. 2233 * 2234 * Naturally, there is no support for insert, delete or cow operations. 2235 * 2236 * The resulting path and return value will be set up as if we called 2237 * btrfs_search_slot at that point in time with ins_len and cow both set to 0. 
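 *
 * A minimal usage sketch (the variable names here are illustrative only):
 *
 *	path = btrfs_alloc_path();
 *	ret = btrfs_search_old_slot(root, &key, path, time_seq);
 *	if (ret == 0)
 *		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
 *				      path->slots[0]);
 *	btrfs_free_path(path);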
2238 */ 2239 int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key, 2240 struct btrfs_path *p, u64 time_seq) 2241 { 2242 struct btrfs_fs_info *fs_info = root->fs_info; 2243 struct extent_buffer *b; 2244 int slot; 2245 int ret; 2246 int err; 2247 int level; 2248 int lowest_unlock = 1; 2249 u8 lowest_level = 0; 2250 2251 lowest_level = p->lowest_level; 2252 WARN_ON(p->nodes[0] != NULL); 2253 ASSERT(!p->nowait); 2254 2255 if (p->search_commit_root) { 2256 BUG_ON(time_seq); 2257 return btrfs_search_slot(NULL, root, key, p, 0, 0); 2258 } 2259 2260 again: 2261 b = btrfs_get_old_root(root, time_seq); 2262 if (!b) { 2263 ret = -EIO; 2264 goto done; 2265 } 2266 level = btrfs_header_level(b); 2267 p->locks[level] = BTRFS_READ_LOCK; 2268 2269 while (b) { 2270 int dec = 0; 2271 2272 level = btrfs_header_level(b); 2273 p->nodes[level] = b; 2274 2275 /* 2276 * we have a lock on b and as long as we aren't changing 2277 * the tree, there is no way for the items in b to change. 2278 * It is safe to drop the lock on our parent before we 2279 * go through the expensive btree search on b. 2280 */ 2281 btrfs_unlock_up_safe(p, level + 1); 2282 2283 ret = btrfs_bin_search(b, 0, key, &slot); 2284 if (ret < 0) 2285 goto done; 2286 2287 if (level == 0) { 2288 p->slots[level] = slot; 2289 unlock_up(p, level, lowest_unlock, 0, NULL); 2290 goto done; 2291 } 2292 2293 if (ret && slot > 0) { 2294 dec = 1; 2295 slot--; 2296 } 2297 p->slots[level] = slot; 2298 unlock_up(p, level, lowest_unlock, 0, NULL); 2299 2300 if (level == lowest_level) { 2301 if (dec) 2302 p->slots[level]++; 2303 goto done; 2304 } 2305 2306 err = read_block_for_search(root, p, &b, level, slot, key); 2307 if (err == -EAGAIN) 2308 goto again; 2309 if (err) { 2310 ret = err; 2311 goto done; 2312 } 2313 2314 level = btrfs_header_level(b); 2315 btrfs_tree_read_lock(b); 2316 b = btrfs_tree_mod_log_rewind(fs_info, p, b, time_seq); 2317 if (!b) { 2318 ret = -ENOMEM; 2319 goto done; 2320 } 2321 p->locks[level] = BTRFS_READ_LOCK; 2322 p->nodes[level] = b; 2323 } 2324 ret = 1; 2325 done: 2326 if (ret < 0) 2327 btrfs_release_path(p); 2328 2329 return ret; 2330 } 2331 2332 /* 2333 * Search the tree again to find a leaf with smaller keys. 2334 * Returns 0 if it found something. 2335 * Returns 1 if there are no smaller keys. 2336 * Returns < 0 on error. 2337 * 2338 * This may release the path, and so you may lose any locks held at the 2339 * time you call it. 2340 */ 2341 static int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path) 2342 { 2343 struct btrfs_key key; 2344 struct btrfs_key orig_key; 2345 struct btrfs_disk_key found_key; 2346 int ret; 2347 2348 btrfs_item_key_to_cpu(path->nodes[0], &key, 0); 2349 orig_key = key; 2350 2351 if (key.offset > 0) { 2352 key.offset--; 2353 } else if (key.type > 0) { 2354 key.type--; 2355 key.offset = (u64)-1; 2356 } else if (key.objectid > 0) { 2357 key.objectid--; 2358 key.type = (u8)-1; 2359 key.offset = (u64)-1; 2360 } else { 2361 return 1; 2362 } 2363 2364 btrfs_release_path(path); 2365 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 2366 if (ret <= 0) 2367 return ret; 2368 2369 /* 2370 * Previous key not found.
Even if we were at slot 0 of the leaf we had 2371 * before releasing the path and calling btrfs_search_slot(), we now may 2372 * be in a slot pointing to the same original key - this can happen if 2373 * after we released the path, one or more items were moved from a 2374 * sibling leaf into the front of the leaf we had due to an insertion 2375 * (see push_leaf_right()). 2376 * If we hit this case and our slot is > 0, just decrement the slot 2377 * so that the caller does not process the same key again, which may or 2378 * may not break the caller, depending on its logic. 2379 */ 2380 if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) { 2381 btrfs_item_key(path->nodes[0], &found_key, path->slots[0]); 2382 ret = btrfs_comp_keys(&found_key, &orig_key); 2383 if (ret == 0) { 2384 if (path->slots[0] > 0) { 2385 path->slots[0]--; 2386 return 0; 2387 } 2388 /* 2389 * At slot 0, same key as before, it means orig_key is 2390 * the lowest, leftmost, key in the tree. We're done. 2391 */ 2392 return 1; 2393 } 2394 } 2395 2396 btrfs_item_key(path->nodes[0], &found_key, 0); 2397 ret = btrfs_comp_keys(&found_key, &key); 2398 /* 2399 * We might have had an item with the previous key in the tree right 2400 * before we released our path. And after we released our path, that 2401 * item might have been pushed to the first slot (0) of the leaf we 2402 * were holding due to a tree balance. Alternatively, an item with the 2403 * previous key can exist as the only element of a leaf (big fat item). 2404 * Therefore account for these 2 cases, so that our callers (like 2405 * btrfs_previous_item) don't miss an existing item with a key matching 2406 * the previous key we computed above. 2407 */ 2408 if (ret <= 0) 2409 return 0; 2410 return 1; 2411 } 2412 2413 /* 2414 * helper to use instead of btrfs_search_slot() if no exact match is needed but 2415 * instead the next or previous item should be returned. 2416 * When find_higher is true, the next higher item is returned, the next lower 2417 * otherwise. 2418 * When return_any and find_higher are both true, and no higher item is found, 2419 * return the next lower instead. 2420 * When return_any is true and find_higher is false, and no lower item is found, 2421 * return the next higher instead. 2422 * It returns 0 if any item is found, 1 if none is found (tree empty), and 2423 * < 0 on error 2424 */ 2425 int btrfs_search_slot_for_read(struct btrfs_root *root, 2426 const struct btrfs_key *key, 2427 struct btrfs_path *p, int find_higher, 2428 int return_any) 2429 { 2430 int ret; 2431 struct extent_buffer *leaf; 2432 2433 again: 2434 ret = btrfs_search_slot(NULL, root, key, p, 0, 0); 2435 if (ret <= 0) 2436 return ret; 2437 /* 2438 * a return value of 1 means the path is at the position where the 2439 * item should be inserted. Normally this is the next bigger item, 2440 * but in case the previous item is the last in a leaf, path points 2441 * to the first free slot in the previous leaf, i.e. at an invalid 2442 * item.
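 * For example, a search for a key greater than every key in the tree ends with path->slots[0] == btrfs_header_nritems(leaf), one past the last valid item; the find_higher case below detects that and moves on with btrfs_next_leaf().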
2443 */ 2444 leaf = p->nodes[0]; 2445 2446 if (find_higher) { 2447 if (p->slots[0] >= btrfs_header_nritems(leaf)) { 2448 ret = btrfs_next_leaf(root, p); 2449 if (ret <= 0) 2450 return ret; 2451 if (!return_any) 2452 return 1; 2453 /* 2454 * no higher item found, return the next 2455 * lower instead 2456 */ 2457 return_any = 0; 2458 find_higher = 0; 2459 btrfs_release_path(p); 2460 goto again; 2461 } 2462 } else { 2463 if (p->slots[0] == 0) { 2464 ret = btrfs_prev_leaf(root, p); 2465 if (ret < 0) 2466 return ret; 2467 if (!ret) { 2468 leaf = p->nodes[0]; 2469 if (p->slots[0] == btrfs_header_nritems(leaf)) 2470 p->slots[0]--; 2471 return 0; 2472 } 2473 if (!return_any) 2474 return 1; 2475 /* 2476 * no lower item found, return the next 2477 * higher instead 2478 */ 2479 return_any = 0; 2480 find_higher = 1; 2481 btrfs_release_path(p); 2482 goto again; 2483 } else { 2484 --p->slots[0]; 2485 } 2486 } 2487 return 0; 2488 } 2489 2490 /* 2491 * Execute search and call btrfs_previous_item to traverse backwards if the item 2492 * was not found. 2493 * 2494 * Return 0 if found, 1 if not found and < 0 if error. 2495 */ 2496 int btrfs_search_backwards(struct btrfs_root *root, struct btrfs_key *key, 2497 struct btrfs_path *path) 2498 { 2499 int ret; 2500 2501 ret = btrfs_search_slot(NULL, root, key, path, 0, 0); 2502 if (ret > 0) 2503 ret = btrfs_previous_item(root, path, key->objectid, key->type); 2504 2505 if (ret == 0) 2506 btrfs_item_key_to_cpu(path->nodes[0], key, path->slots[0]); 2507 2508 return ret; 2509 } 2510 2511 /* 2512 * Search for a valid slot for the given path. 2513 * 2514 * @root: The root node of the tree. 2515 * @key: Will contain a valid item if found. 2516 * @path: The starting point to validate the slot. 2517 * 2518 * Return: 0 if the item is valid 2519 * 1 if not found 2520 * <0 if error. 2521 */ 2522 int btrfs_get_next_valid_item(struct btrfs_root *root, struct btrfs_key *key, 2523 struct btrfs_path *path) 2524 { 2525 if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { 2526 int ret; 2527 2528 ret = btrfs_next_leaf(root, path); 2529 if (ret) 2530 return ret; 2531 } 2532 2533 btrfs_item_key_to_cpu(path->nodes[0], key, path->slots[0]); 2534 return 0; 2535 } 2536 2537 /* 2538 * adjust the pointers going up the tree, starting at the given level, 2539 * making sure the right key of each node points to 'key'. 2540 * This is used after shifting pointers to the left, so it stops 2541 * fixing up pointers when a given leaf/node is not in slot 0 of the 2542 * higher levels 2543 * 2544 */ 2545 static void fixup_low_keys(struct btrfs_trans_handle *trans, 2546 struct btrfs_path *path, 2547 struct btrfs_disk_key *key, int level) 2548 { 2549 int i; 2550 struct extent_buffer *t; 2551 int ret; 2552 2553 for (i = level; i < BTRFS_MAX_LEVEL; i++) { 2554 int tslot = path->slots[i]; 2555 2556 if (!path->nodes[i]) 2557 break; 2558 t = path->nodes[i]; 2559 ret = btrfs_tree_mod_log_insert_key(t, tslot, 2560 BTRFS_MOD_LOG_KEY_REPLACE); 2561 BUG_ON(ret < 0); 2562 btrfs_set_node_key(t, key, tslot); 2563 btrfs_mark_buffer_dirty(trans, path->nodes[i]); 2564 if (tslot != 0) 2565 break; 2566 } 2567 } 2568 2569 /* 2570 * update item key. 2571 * 2572 * This function isn't completely safe.
It's the caller's responsibility 2573 * to ensure that the new key won't break the key ordering. 2574 */ 2575 void btrfs_set_item_key_safe(struct btrfs_trans_handle *trans, 2576 struct btrfs_path *path, 2577 const struct btrfs_key *new_key) 2578 { 2579 struct btrfs_fs_info *fs_info = trans->fs_info; 2580 struct btrfs_disk_key disk_key; 2581 struct extent_buffer *eb; 2582 int slot; 2583 2584 eb = path->nodes[0]; 2585 slot = path->slots[0]; 2586 if (slot > 0) { 2587 btrfs_item_key(eb, &disk_key, slot - 1); 2588 if (unlikely(btrfs_comp_keys(&disk_key, new_key) >= 0)) { 2589 btrfs_print_leaf(eb); 2590 btrfs_crit(fs_info, 2591 "slot %u key (%llu %u %llu) new key (%llu %u %llu)", 2592 slot, btrfs_disk_key_objectid(&disk_key), 2593 btrfs_disk_key_type(&disk_key), 2594 btrfs_disk_key_offset(&disk_key), 2595 new_key->objectid, new_key->type, 2596 new_key->offset); 2597 BUG(); 2598 } 2599 } 2600 if (slot < btrfs_header_nritems(eb) - 1) { 2601 btrfs_item_key(eb, &disk_key, slot + 1); 2602 if (unlikely(btrfs_comp_keys(&disk_key, new_key) <= 0)) { 2603 btrfs_print_leaf(eb); 2604 btrfs_crit(fs_info, 2605 "slot %u key (%llu %u %llu) new key (%llu %u %llu)", 2606 slot, btrfs_disk_key_objectid(&disk_key), 2607 btrfs_disk_key_type(&disk_key), 2608 btrfs_disk_key_offset(&disk_key), 2609 new_key->objectid, new_key->type, 2610 new_key->offset); 2611 BUG(); 2612 } 2613 } 2614 2615 btrfs_cpu_key_to_disk(&disk_key, new_key); 2616 btrfs_set_item_key(eb, &disk_key, slot); 2617 btrfs_mark_buffer_dirty(trans, eb); 2618 if (slot == 0) 2619 fixup_low_keys(trans, path, &disk_key, 1); 2620 } 2621 2622 /* 2623 * Check key order of two sibling extent buffers. 2624 * 2625 * Return true if something is wrong. 2626 * Return false if everything is fine. 2627 * 2628 * Tree-checker only works inside one tree block, thus the following 2629 * corruption can not be detected by tree-checker: 2630 * 2631 * Leaf @left | Leaf @right 2632 * -------------------------------------------------------------- 2633 * | 1 | 2 | 3 | 4 | 5 | f6 | | 7 | 8 | 2634 * 2635 * Key f6 in leaf @left itself is valid, but not valid when the next 2636 * key in leaf @right is 7. 2637 * This can only be checked at tree block merge time. 2638 * And since tree checker has ensured all key order in each tree block 2639 * is correct, we only need to check the last key of @left and the first 2640 * key of @right.
2641 */ 2642 static bool check_sibling_keys(struct extent_buffer *left, 2643 struct extent_buffer *right) 2644 { 2645 struct btrfs_key left_last; 2646 struct btrfs_key right_first; 2647 int level = btrfs_header_level(left); 2648 int nr_left = btrfs_header_nritems(left); 2649 int nr_right = btrfs_header_nritems(right); 2650 2651 /* No key to check in one of the tree blocks */ 2652 if (!nr_left || !nr_right) 2653 return false; 2654 2655 if (level) { 2656 btrfs_node_key_to_cpu(left, &left_last, nr_left - 1); 2657 btrfs_node_key_to_cpu(right, &right_first, 0); 2658 } else { 2659 btrfs_item_key_to_cpu(left, &left_last, nr_left - 1); 2660 btrfs_item_key_to_cpu(right, &right_first, 0); 2661 } 2662 2663 if (unlikely(btrfs_comp_cpu_keys(&left_last, &right_first) >= 0)) { 2664 btrfs_crit(left->fs_info, "left extent buffer:"); 2665 btrfs_print_tree(left, false); 2666 btrfs_crit(left->fs_info, "right extent buffer:"); 2667 btrfs_print_tree(right, false); 2668 btrfs_crit(left->fs_info, 2669 "bad key order, sibling blocks, left last (%llu %u %llu) right first (%llu %u %llu)", 2670 left_last.objectid, left_last.type, 2671 left_last.offset, right_first.objectid, 2672 right_first.type, right_first.offset); 2673 return true; 2674 } 2675 return false; 2676 } 2677 2678 /* 2679 * try to push data from one node into the next node left in the 2680 * tree. 2681 * 2682 * returns 0 if some ptrs were pushed left, < 0 if there was some horrible 2683 * error, and > 0 if there was no room in the left hand block. 2684 */ 2685 static int push_node_left(struct btrfs_trans_handle *trans, 2686 struct extent_buffer *dst, 2687 struct extent_buffer *src, int empty) 2688 { 2689 struct btrfs_fs_info *fs_info = trans->fs_info; 2690 int push_items = 0; 2691 int src_nritems; 2692 int dst_nritems; 2693 int ret = 0; 2694 2695 src_nritems = btrfs_header_nritems(src); 2696 dst_nritems = btrfs_header_nritems(dst); 2697 push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems; 2698 WARN_ON(btrfs_header_generation(src) != trans->transid); 2699 WARN_ON(btrfs_header_generation(dst) != trans->transid); 2700 2701 if (!empty && src_nritems <= 8) 2702 return 1; 2703 2704 if (push_items <= 0) 2705 return 1; 2706 2707 if (empty) { 2708 push_items = min(src_nritems, push_items); 2709 if (push_items < src_nritems) { 2710 /* leave at least 8 pointers in the node if 2711 * we aren't going to empty it 2712 */ 2713 if (src_nritems - push_items < 8) { 2714 if (push_items <= 8) 2715 return 1; 2716 push_items -= 8; 2717 } 2718 } 2719 } else 2720 push_items = min(src_nritems - 8, push_items); 2721 2722 /* dst is the left eb, src is the middle eb */ 2723 if (check_sibling_keys(dst, src)) { 2724 ret = -EUCLEAN; 2725 btrfs_abort_transaction(trans, ret); 2726 return ret; 2727 } 2728 ret = btrfs_tree_mod_log_eb_copy(dst, src, dst_nritems, 0, push_items); 2729 if (ret) { 2730 btrfs_abort_transaction(trans, ret); 2731 return ret; 2732 } 2733 copy_extent_buffer(dst, src, 2734 btrfs_node_key_ptr_offset(dst, dst_nritems), 2735 btrfs_node_key_ptr_offset(src, 0), 2736 push_items * sizeof(struct btrfs_key_ptr)); 2737 2738 if (push_items < src_nritems) { 2739 /* 2740 * btrfs_tree_mod_log_eb_copy handles logging the move, so we 2741 * don't need to do an explicit tree mod log operation for it. 
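 * The memmove below simply compacts the remaining key pointers to the front of src, now that the first push_items of them live in dst.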
2742 */ 2743 memmove_extent_buffer(src, btrfs_node_key_ptr_offset(src, 0), 2744 btrfs_node_key_ptr_offset(src, push_items), 2745 (src_nritems - push_items) * 2746 sizeof(struct btrfs_key_ptr)); 2747 } 2748 btrfs_set_header_nritems(src, src_nritems - push_items); 2749 btrfs_set_header_nritems(dst, dst_nritems + push_items); 2750 btrfs_mark_buffer_dirty(trans, src); 2751 btrfs_mark_buffer_dirty(trans, dst); 2752 2753 return ret; 2754 } 2755 2756 /* 2757 * try to push data from one node into the next node right in the 2758 * tree. 2759 * 2760 * returns 0 if some ptrs were pushed, < 0 if there was some horrible 2761 * error, and > 0 if there was no room in the right hand block. 2762 * 2763 * this will only push up to 1/2 the contents of the left node over 2764 */ 2765 static int balance_node_right(struct btrfs_trans_handle *trans, 2766 struct extent_buffer *dst, 2767 struct extent_buffer *src) 2768 { 2769 struct btrfs_fs_info *fs_info = trans->fs_info; 2770 int push_items = 0; 2771 int max_push; 2772 int src_nritems; 2773 int dst_nritems; 2774 int ret = 0; 2775 2776 WARN_ON(btrfs_header_generation(src) != trans->transid); 2777 WARN_ON(btrfs_header_generation(dst) != trans->transid); 2778 2779 src_nritems = btrfs_header_nritems(src); 2780 dst_nritems = btrfs_header_nritems(dst); 2781 push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems; 2782 if (push_items <= 0) 2783 return 1; 2784 2785 if (src_nritems < 4) 2786 return 1; 2787 2788 max_push = src_nritems / 2 + 1; 2789 /* don't try to empty the node */ 2790 if (max_push >= src_nritems) 2791 return 1; 2792 2793 if (max_push < push_items) 2794 push_items = max_push; 2795 2796 /* dst is the right eb, src is the middle eb */ 2797 if (check_sibling_keys(src, dst)) { 2798 ret = -EUCLEAN; 2799 btrfs_abort_transaction(trans, ret); 2800 return ret; 2801 } 2802 2803 /* 2804 * btrfs_tree_mod_log_eb_copy handles logging the move, so we don't 2805 * need to do an explicit tree mod log operation for it. 2806 */ 2807 memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(dst, push_items), 2808 btrfs_node_key_ptr_offset(dst, 0), 2809 (dst_nritems) * 2810 sizeof(struct btrfs_key_ptr)); 2811 2812 ret = btrfs_tree_mod_log_eb_copy(dst, src, 0, src_nritems - push_items, 2813 push_items); 2814 if (ret) { 2815 btrfs_abort_transaction(trans, ret); 2816 return ret; 2817 } 2818 copy_extent_buffer(dst, src, 2819 btrfs_node_key_ptr_offset(dst, 0), 2820 btrfs_node_key_ptr_offset(src, src_nritems - push_items), 2821 push_items * sizeof(struct btrfs_key_ptr)); 2822 2823 btrfs_set_header_nritems(src, src_nritems - push_items); 2824 btrfs_set_header_nritems(dst, dst_nritems + push_items); 2825 2826 btrfs_mark_buffer_dirty(trans, src); 2827 btrfs_mark_buffer_dirty(trans, dst); 2828 2829 return ret; 2830 } 2831 2832 /* 2833 * helper function to insert a new root level in the tree. 2834 * A new node is allocated, and a single item is inserted to 2835 * point to the existing root 2836 * 2837 * returns zero on success or < 0 on failure. 
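 *
 * For illustration, if the root was the block B, after this runs the tree looks like:
 *
 *	before:  root = B		after:  root = C
 *						 C[0] -> B
 *
 * where the new root C holds a single key pointer carrying B's first key.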
2838 */ 2839 static noinline int insert_new_root(struct btrfs_trans_handle *trans, 2840 struct btrfs_root *root, 2841 struct btrfs_path *path, int level) 2842 { 2843 u64 lower_gen; 2844 struct extent_buffer *lower; 2845 struct extent_buffer *c; 2846 struct extent_buffer *old; 2847 struct btrfs_disk_key lower_key; 2848 int ret; 2849 2850 BUG_ON(path->nodes[level]); 2851 BUG_ON(path->nodes[level-1] != root->node); 2852 2853 lower = path->nodes[level-1]; 2854 if (level == 1) 2855 btrfs_item_key(lower, &lower_key, 0); 2856 else 2857 btrfs_node_key(lower, &lower_key, 0); 2858 2859 c = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid, 2860 &lower_key, level, root->node->start, 0, 2861 0, BTRFS_NESTING_NEW_ROOT); 2862 if (IS_ERR(c)) 2863 return PTR_ERR(c); 2864 2865 root_add_used_bytes(root); 2866 2867 btrfs_set_header_nritems(c, 1); 2868 btrfs_set_node_key(c, &lower_key, 0); 2869 btrfs_set_node_blockptr(c, 0, lower->start); 2870 lower_gen = btrfs_header_generation(lower); 2871 WARN_ON(lower_gen != trans->transid); 2872 2873 btrfs_set_node_ptr_generation(c, 0, lower_gen); 2874 2875 btrfs_mark_buffer_dirty(trans, c); 2876 2877 old = root->node; 2878 ret = btrfs_tree_mod_log_insert_root(root->node, c, false); 2879 if (ret < 0) { 2880 btrfs_free_tree_block(trans, btrfs_root_id(root), c, 0, 1); 2881 btrfs_tree_unlock(c); 2882 free_extent_buffer(c); 2883 return ret; 2884 } 2885 rcu_assign_pointer(root->node, c); 2886 2887 /* the super has an extra ref to root->node */ 2888 free_extent_buffer(old); 2889 2890 add_root_to_dirty_list(root); 2891 atomic_inc(&c->refs); 2892 path->nodes[level] = c; 2893 path->locks[level] = BTRFS_WRITE_LOCK; 2894 path->slots[level] = 0; 2895 return 0; 2896 } 2897 2898 /* 2899 * worker function to insert a single pointer in a node. 2900 * the node should have enough room for the pointer already 2901 * 2902 * slot and level indicate where you want the key to go, and 2903 * blocknr is the block the key points to. 2904 */ 2905 static int insert_ptr(struct btrfs_trans_handle *trans, 2906 struct btrfs_path *path, 2907 struct btrfs_disk_key *key, u64 bytenr, 2908 int slot, int level) 2909 { 2910 struct extent_buffer *lower; 2911 int nritems; 2912 int ret; 2913 2914 BUG_ON(!path->nodes[level]); 2915 btrfs_assert_tree_write_locked(path->nodes[level]); 2916 lower = path->nodes[level]; 2917 nritems = btrfs_header_nritems(lower); 2918 BUG_ON(slot > nritems); 2919 BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(trans->fs_info)); 2920 if (slot != nritems) { 2921 if (level) { 2922 ret = btrfs_tree_mod_log_insert_move(lower, slot + 1, 2923 slot, nritems - slot); 2924 if (ret < 0) { 2925 btrfs_abort_transaction(trans, ret); 2926 return ret; 2927 } 2928 } 2929 memmove_extent_buffer(lower, 2930 btrfs_node_key_ptr_offset(lower, slot + 1), 2931 btrfs_node_key_ptr_offset(lower, slot), 2932 (nritems - slot) * sizeof(struct btrfs_key_ptr)); 2933 } 2934 if (level) { 2935 ret = btrfs_tree_mod_log_insert_key(lower, slot, 2936 BTRFS_MOD_LOG_KEY_ADD); 2937 if (ret < 0) { 2938 btrfs_abort_transaction(trans, ret); 2939 return ret; 2940 } 2941 } 2942 btrfs_set_node_key(lower, key, slot); 2943 btrfs_set_node_blockptr(lower, slot, bytenr); 2944 WARN_ON(trans->transid == 0); 2945 btrfs_set_node_ptr_generation(lower, slot, trans->transid); 2946 btrfs_set_header_nritems(lower, nritems + 1); 2947 btrfs_mark_buffer_dirty(trans, lower); 2948 2949 return 0; 2950 } 2951 2952 /* 2953 * split the node at the specified level in path in two. 
2954 * The path is corrected to point to the appropriate node after the split 2955 * 2956 * Before splitting this tries to make some room in the node by pushing 2957 * left and right, if either one works, it returns right away. 2958 * 2959 * returns 0 on success and < 0 on failure 2960 */ 2961 static noinline int split_node(struct btrfs_trans_handle *trans, 2962 struct btrfs_root *root, 2963 struct btrfs_path *path, int level) 2964 { 2965 struct btrfs_fs_info *fs_info = root->fs_info; 2966 struct extent_buffer *c; 2967 struct extent_buffer *split; 2968 struct btrfs_disk_key disk_key; 2969 int mid; 2970 int ret; 2971 u32 c_nritems; 2972 2973 c = path->nodes[level]; 2974 WARN_ON(btrfs_header_generation(c) != trans->transid); 2975 if (c == root->node) { 2976 /* 2977 * trying to split the root, let's make a new one 2978 * 2979 * tree mod log: We don't log the removal of the old root in 2980 * insert_new_root(), because that root buffer will be kept as a 2981 * normal node. We are going to log removal of half of the 2982 * elements below with btrfs_tree_mod_log_eb_copy(). We're 2983 * holding a tree lock on the buffer, which is why we cannot 2984 * race with other tree_mod_log users. 2985 */ 2986 ret = insert_new_root(trans, root, path, level + 1); 2987 if (ret) 2988 return ret; 2989 } else { 2990 ret = push_nodes_for_insert(trans, root, path, level); 2991 c = path->nodes[level]; 2992 if (!ret && btrfs_header_nritems(c) < 2993 BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3) 2994 return 0; 2995 if (ret < 0) 2996 return ret; 2997 } 2998 2999 c_nritems = btrfs_header_nritems(c); 3000 mid = (c_nritems + 1) / 2; 3001 btrfs_node_key(c, &disk_key, mid); 3002 3003 split = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid, 3004 &disk_key, level, c->start, 0, 3005 0, BTRFS_NESTING_SPLIT); 3006 if (IS_ERR(split)) 3007 return PTR_ERR(split); 3008 3009 root_add_used_bytes(root); 3010 ASSERT(btrfs_header_level(c) == level); 3011 3012 ret = btrfs_tree_mod_log_eb_copy(split, c, 0, mid, c_nritems - mid); 3013 if (ret) { 3014 btrfs_tree_unlock(split); 3015 free_extent_buffer(split); 3016 btrfs_abort_transaction(trans, ret); 3017 return ret; 3018 } 3019 copy_extent_buffer(split, c, 3020 btrfs_node_key_ptr_offset(split, 0), 3021 btrfs_node_key_ptr_offset(c, mid), 3022 (c_nritems - mid) * sizeof(struct btrfs_key_ptr)); 3023 btrfs_set_header_nritems(split, c_nritems - mid); 3024 btrfs_set_header_nritems(c, mid); 3025 3026 btrfs_mark_buffer_dirty(trans, c); 3027 btrfs_mark_buffer_dirty(trans, split); 3028 3029 ret = insert_ptr(trans, path, &disk_key, split->start, 3030 path->slots[level + 1] + 1, level + 1); 3031 if (ret < 0) { 3032 btrfs_tree_unlock(split); 3033 free_extent_buffer(split); 3034 return ret; 3035 } 3036 3037 if (path->slots[level] >= mid) { 3038 path->slots[level] -= mid; 3039 btrfs_tree_unlock(c); 3040 free_extent_buffer(c); 3041 path->nodes[level] = split; 3042 path->slots[level + 1] += 1; 3043 } else { 3044 btrfs_tree_unlock(split); 3045 free_extent_buffer(split); 3046 } 3047 return 0; 3048 } 3049 3050 /* 3051 * how many bytes are required to store the items in a leaf. start 3052 * and nr indicate which items in the leaf to check.
This totals up the 3053 * space used both by the item structs and the item data 3054 */ 3055 static int leaf_space_used(const struct extent_buffer *l, int start, int nr) 3056 { 3057 int data_len; 3058 int nritems = btrfs_header_nritems(l); 3059 int end = min(nritems, start + nr) - 1; 3060 3061 if (!nr) 3062 return 0; 3063 data_len = btrfs_item_offset(l, start) + btrfs_item_size(l, start); 3064 data_len = data_len - btrfs_item_offset(l, end); 3065 data_len += sizeof(struct btrfs_item) * nr; 3066 WARN_ON(data_len < 0); 3067 return data_len; 3068 } 3069 3070 /* 3071 * The space between the end of the leaf items and 3072 * the start of the leaf data. IOW, how much room 3073 * the leaf has left for both items and data 3074 */ 3075 int btrfs_leaf_free_space(const struct extent_buffer *leaf) 3076 { 3077 struct btrfs_fs_info *fs_info = leaf->fs_info; 3078 int nritems = btrfs_header_nritems(leaf); 3079 int ret; 3080 3081 ret = BTRFS_LEAF_DATA_SIZE(fs_info) - leaf_space_used(leaf, 0, nritems); 3082 if (ret < 0) { 3083 btrfs_crit(fs_info, 3084 "leaf free space ret %d, leaf data size %lu, used %d nritems %d", 3085 ret, 3086 (unsigned long) BTRFS_LEAF_DATA_SIZE(fs_info), 3087 leaf_space_used(leaf, 0, nritems), nritems); 3088 } 3089 return ret; 3090 } 3091 3092 /* 3093 * min slot controls the lowest index we're willing to push to the 3094 * right. We'll push up to and including min_slot, but no lower 3095 */ 3096 static noinline int __push_leaf_right(struct btrfs_trans_handle *trans, 3097 struct btrfs_path *path, 3098 int data_size, int empty, 3099 struct extent_buffer *right, 3100 int free_space, u32 left_nritems, 3101 u32 min_slot) 3102 { 3103 struct btrfs_fs_info *fs_info = right->fs_info; 3104 struct extent_buffer *left = path->nodes[0]; 3105 struct extent_buffer *upper = path->nodes[1]; 3106 struct btrfs_map_token token; 3107 struct btrfs_disk_key disk_key; 3108 int slot; 3109 u32 i; 3110 int push_space = 0; 3111 int push_items = 0; 3112 u32 nr; 3113 u32 right_nritems; 3114 u32 data_end; 3115 u32 this_item_size; 3116 3117 if (empty) 3118 nr = 0; 3119 else 3120 nr = max_t(u32, 1, min_slot); 3121 3122 if (path->slots[0] >= left_nritems) 3123 push_space += data_size; 3124 3125 slot = path->slots[1]; 3126 i = left_nritems - 1; 3127 while (i >= nr) { 3128 if (!empty && push_items > 0) { 3129 if (path->slots[0] > i) 3130 break; 3131 if (path->slots[0] == i) { 3132 int space = btrfs_leaf_free_space(left); 3133 3134 if (space + push_space * 2 > free_space) 3135 break; 3136 } 3137 } 3138 3139 if (path->slots[0] == i) 3140 push_space += data_size; 3141 3142 this_item_size = btrfs_item_size(left, i); 3143 if (this_item_size + sizeof(struct btrfs_item) + 3144 push_space > free_space) 3145 break; 3146 3147 push_items++; 3148 push_space += this_item_size + sizeof(struct btrfs_item); 3149 if (i == 0) 3150 break; 3151 i--; 3152 } 3153 3154 if (push_items == 0) 3155 goto out_unlock; 3156 3157 WARN_ON(!empty && push_items == left_nritems); 3158 3159 /* push left to right */ 3160 right_nritems = btrfs_header_nritems(right); 3161 3162 push_space = btrfs_item_data_end(left, left_nritems - push_items); 3163 push_space -= leaf_data_end(left); 3164 3165 /* make room in the right data area */ 3166 data_end = leaf_data_end(right); 3167 memmove_leaf_data(right, data_end - push_space, data_end, 3168 BTRFS_LEAF_DATA_SIZE(fs_info) - data_end); 3169 3170 /* copy from the left data area */ 3171 copy_leaf_data(right, left, BTRFS_LEAF_DATA_SIZE(fs_info) - push_space, 3172 leaf_data_end(left), push_space); 3173 3174 
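/* shift the existing items in the right leaf to make room for the pushed items */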
memmove_leaf_items(right, push_items, 0, right_nritems); 3175 3176 /* copy the items from left to right */ 3177 copy_leaf_items(right, left, 0, left_nritems - push_items, push_items); 3178 3179 /* update the item pointers */ 3180 btrfs_init_map_token(&token, right); 3181 right_nritems += push_items; 3182 btrfs_set_header_nritems(right, right_nritems); 3183 push_space = BTRFS_LEAF_DATA_SIZE(fs_info); 3184 for (i = 0; i < right_nritems; i++) { 3185 push_space -= btrfs_token_item_size(&token, i); 3186 btrfs_set_token_item_offset(&token, i, push_space); 3187 } 3188 3189 left_nritems -= push_items; 3190 btrfs_set_header_nritems(left, left_nritems); 3191 3192 if (left_nritems) 3193 btrfs_mark_buffer_dirty(trans, left); 3194 else 3195 btrfs_clear_buffer_dirty(trans, left); 3196 3197 btrfs_mark_buffer_dirty(trans, right); 3198 3199 btrfs_item_key(right, &disk_key, 0); 3200 btrfs_set_node_key(upper, &disk_key, slot + 1); 3201 btrfs_mark_buffer_dirty(trans, upper); 3202 3203 /* then fixup the leaf pointer in the path */ 3204 if (path->slots[0] >= left_nritems) { 3205 path->slots[0] -= left_nritems; 3206 if (btrfs_header_nritems(path->nodes[0]) == 0) 3207 btrfs_clear_buffer_dirty(trans, path->nodes[0]); 3208 btrfs_tree_unlock(path->nodes[0]); 3209 free_extent_buffer(path->nodes[0]); 3210 path->nodes[0] = right; 3211 path->slots[1] += 1; 3212 } else { 3213 btrfs_tree_unlock(right); 3214 free_extent_buffer(right); 3215 } 3216 return 0; 3217 3218 out_unlock: 3219 btrfs_tree_unlock(right); 3220 free_extent_buffer(right); 3221 return 1; 3222 } 3223 3224 /* 3225 * push some data in the path leaf to the right, trying to free up at 3226 * least data_size bytes. 3227 * 3228 * returns 1 if the push failed because the other node didn't have enough 3229 * room, 0 if everything worked out and < 0 if there were major errors. 3230 * 3231 * this will push starting from min_slot to the end of the leaf.
It won't 3232 * push any slot lower than min_slot 3233 */ 3234 static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root 3235 *root, struct btrfs_path *path, 3236 int min_data_size, int data_size, 3237 int empty, u32 min_slot) 3238 { 3239 struct extent_buffer *left = path->nodes[0]; 3240 struct extent_buffer *right; 3241 struct extent_buffer *upper; 3242 int slot; 3243 int free_space; 3244 u32 left_nritems; 3245 int ret; 3246 3247 if (!path->nodes[1]) 3248 return 1; 3249 3250 slot = path->slots[1]; 3251 upper = path->nodes[1]; 3252 if (slot >= btrfs_header_nritems(upper) - 1) 3253 return 1; 3254 3255 btrfs_assert_tree_write_locked(path->nodes[1]); 3256 3257 right = btrfs_read_node_slot(upper, slot + 1); 3258 if (IS_ERR(right)) 3259 return PTR_ERR(right); 3260 3261 __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT); 3262 3263 free_space = btrfs_leaf_free_space(right); 3264 if (free_space < data_size) 3265 goto out_unlock; 3266 3267 ret = btrfs_cow_block(trans, root, right, upper, 3268 slot + 1, &right, BTRFS_NESTING_RIGHT_COW); 3269 if (ret) 3270 goto out_unlock; 3271 3272 left_nritems = btrfs_header_nritems(left); 3273 if (left_nritems == 0) 3274 goto out_unlock; 3275 3276 if (check_sibling_keys(left, right)) { 3277 ret = -EUCLEAN; 3278 btrfs_abort_transaction(trans, ret); 3279 btrfs_tree_unlock(right); 3280 free_extent_buffer(right); 3281 return ret; 3282 } 3283 if (path->slots[0] == left_nritems && !empty) { 3284 /* Key greater than all keys in the leaf, right neighbor has 3285 * enough room for it and we're not emptying our leaf to delete 3286 * it, therefore use right neighbor to insert the new item and 3287 * no need to touch/dirty our left leaf. */ 3288 btrfs_tree_unlock(left); 3289 free_extent_buffer(left); 3290 path->nodes[0] = right; 3291 path->slots[0] = 0; 3292 path->slots[1]++; 3293 return 0; 3294 } 3295 3296 return __push_leaf_right(trans, path, min_data_size, empty, right, 3297 free_space, left_nritems, min_slot); 3298 out_unlock: 3299 btrfs_tree_unlock(right); 3300 free_extent_buffer(right); 3301 return 1; 3302 } 3303 3304 /* 3305 * push some data in the path leaf to the left, trying to free up at 3306 * least data_size bytes. returns zero if the push worked, nonzero otherwise 3307 * 3308 * max_slot can put a limit on how far into the leaf we'll push items. The 3309 * item at 'max_slot' won't be touched. 
Use (u32)-1 to make us push all the 3310 * items 3311 */ 3312 static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, 3313 struct btrfs_path *path, int data_size, 3314 int empty, struct extent_buffer *left, 3315 int free_space, u32 right_nritems, 3316 u32 max_slot) 3317 { 3318 struct btrfs_fs_info *fs_info = left->fs_info; 3319 struct btrfs_disk_key disk_key; 3320 struct extent_buffer *right = path->nodes[0]; 3321 int i; 3322 int push_space = 0; 3323 int push_items = 0; 3324 u32 old_left_nritems; 3325 u32 nr; 3326 int ret = 0; 3327 u32 this_item_size; 3328 u32 old_left_item_size; 3329 struct btrfs_map_token token; 3330 3331 if (empty) 3332 nr = min(right_nritems, max_slot); 3333 else 3334 nr = min(right_nritems - 1, max_slot); 3335 3336 for (i = 0; i < nr; i++) { 3337 if (!empty && push_items > 0) { 3338 if (path->slots[0] < i) 3339 break; 3340 if (path->slots[0] == i) { 3341 int space = btrfs_leaf_free_space(right); 3342 3343 if (space + push_space * 2 > free_space) 3344 break; 3345 } 3346 } 3347 3348 if (path->slots[0] == i) 3349 push_space += data_size; 3350 3351 this_item_size = btrfs_item_size(right, i); 3352 if (this_item_size + sizeof(struct btrfs_item) + push_space > 3353 free_space) 3354 break; 3355 3356 push_items++; 3357 push_space += this_item_size + sizeof(struct btrfs_item); 3358 } 3359 3360 if (push_items == 0) { 3361 ret = 1; 3362 goto out; 3363 } 3364 WARN_ON(!empty && push_items == btrfs_header_nritems(right)); 3365 3366 /* push data from right to left */ 3367 copy_leaf_items(left, right, btrfs_header_nritems(left), 0, push_items); 3368 3369 push_space = BTRFS_LEAF_DATA_SIZE(fs_info) - 3370 btrfs_item_offset(right, push_items - 1); 3371 3372 copy_leaf_data(left, right, leaf_data_end(left) - push_space, 3373 btrfs_item_offset(right, push_items - 1), push_space); 3374 old_left_nritems = btrfs_header_nritems(left); 3375 BUG_ON(old_left_nritems <= 0); 3376 3377 btrfs_init_map_token(&token, left); 3378 old_left_item_size = btrfs_item_offset(left, old_left_nritems - 1); 3379 for (i = old_left_nritems; i < old_left_nritems + push_items; i++) { 3380 u32 ioff; 3381 3382 ioff = btrfs_token_item_offset(&token, i); 3383 btrfs_set_token_item_offset(&token, i, 3384 ioff - (BTRFS_LEAF_DATA_SIZE(fs_info) - old_left_item_size)); 3385 } 3386 btrfs_set_header_nritems(left, old_left_nritems + push_items); 3387 3388 /* fixup right node */ 3389 if (push_items > right_nritems) 3390 WARN(1, KERN_CRIT "push items %d nr %u\n", push_items, 3391 right_nritems); 3392 3393 if (push_items < right_nritems) { 3394 push_space = btrfs_item_offset(right, push_items - 1) - 3395 leaf_data_end(right); 3396 memmove_leaf_data(right, 3397 BTRFS_LEAF_DATA_SIZE(fs_info) - push_space, 3398 leaf_data_end(right), push_space); 3399 3400 memmove_leaf_items(right, 0, push_items, 3401 btrfs_header_nritems(right) - push_items); 3402 } 3403 3404 btrfs_init_map_token(&token, right); 3405 right_nritems -= push_items; 3406 btrfs_set_header_nritems(right, right_nritems); 3407 push_space = BTRFS_LEAF_DATA_SIZE(fs_info); 3408 for (i = 0; i < right_nritems; i++) { 3409 push_space = push_space - btrfs_token_item_size(&token, i); 3410 btrfs_set_token_item_offset(&token, i, push_space); 3411 } 3412 3413 btrfs_mark_buffer_dirty(trans, left); 3414 if (right_nritems) 3415 btrfs_mark_buffer_dirty(trans, right); 3416 else 3417 btrfs_clear_buffer_dirty(trans, right); 3418 3419 btrfs_item_key(right, &disk_key, 0); 3420 fixup_low_keys(trans, path, &disk_key, 1); 3421 3422 /* then fixup the leaf pointer in the path */ 3423 if
(path->slots[0] < push_items) { 3424 path->slots[0] += old_left_nritems; 3425 btrfs_tree_unlock(path->nodes[0]); 3426 free_extent_buffer(path->nodes[0]); 3427 path->nodes[0] = left; 3428 path->slots[1] -= 1; 3429 } else { 3430 btrfs_tree_unlock(left); 3431 free_extent_buffer(left); 3432 path->slots[0] -= push_items; 3433 } 3434 BUG_ON(path->slots[0] < 0); 3435 return ret; 3436 out: 3437 btrfs_tree_unlock(left); 3438 free_extent_buffer(left); 3439 return ret; 3440 } 3441 3442 /* 3443 * push some data in the path leaf to the left, trying to free up at 3444 * least data_size bytes. returns zero if the push worked, nonzero otherwise 3445 * 3446 * max_slot can put a limit on how far into the leaf we'll push items. The 3447 * item at 'max_slot' won't be touched. Use (u32)-1 to make us push all the 3448 * items 3449 */ 3450 static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root 3451 *root, struct btrfs_path *path, int min_data_size, 3452 int data_size, int empty, u32 max_slot) 3453 { 3454 struct extent_buffer *right = path->nodes[0]; 3455 struct extent_buffer *left; 3456 int slot; 3457 int free_space; 3458 u32 right_nritems; 3459 int ret = 0; 3460 3461 slot = path->slots[1]; 3462 if (slot == 0) 3463 return 1; 3464 if (!path->nodes[1]) 3465 return 1; 3466 3467 right_nritems = btrfs_header_nritems(right); 3468 if (right_nritems == 0) 3469 return 1; 3470 3471 btrfs_assert_tree_write_locked(path->nodes[1]); 3472 3473 left = btrfs_read_node_slot(path->nodes[1], slot - 1); 3474 if (IS_ERR(left)) 3475 return PTR_ERR(left); 3476 3477 __btrfs_tree_lock(left, BTRFS_NESTING_LEFT); 3478 3479 free_space = btrfs_leaf_free_space(left); 3480 if (free_space < data_size) { 3481 ret = 1; 3482 goto out; 3483 } 3484 3485 ret = btrfs_cow_block(trans, root, left, 3486 path->nodes[1], slot - 1, &left, 3487 BTRFS_NESTING_LEFT_COW); 3488 if (ret) { 3489 /* we hit -ENOSPC, but it isn't fatal here */ 3490 if (ret == -ENOSPC) 3491 ret = 1; 3492 goto out; 3493 } 3494 3495 if (check_sibling_keys(left, right)) { 3496 ret = -EUCLEAN; 3497 btrfs_abort_transaction(trans, ret); 3498 goto out; 3499 } 3500 return __push_leaf_left(trans, path, min_data_size, empty, left, 3501 free_space, right_nritems, max_slot); 3502 out: 3503 btrfs_tree_unlock(left); 3504 free_extent_buffer(left); 3505 return ret; 3506 } 3507 3508 /* 3509 * split the path's leaf in two, making sure there is at least data_size 3510 * available for the resulting leaf level of the path. 
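 *
 * copy_for_split() below does the mechanical part of that split: it moves
 * items [mid, nritems) from leaf l into the new leaf 'right', rewrites
 * their data offsets, and inserts a pointer to 'right' into the parent node.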
3511 */ 3512 static noinline int copy_for_split(struct btrfs_trans_handle *trans, 3513 struct btrfs_path *path, 3514 struct extent_buffer *l, 3515 struct extent_buffer *right, 3516 int slot, int mid, int nritems) 3517 { 3518 struct btrfs_fs_info *fs_info = trans->fs_info; 3519 int data_copy_size; 3520 int rt_data_off; 3521 int i; 3522 int ret; 3523 struct btrfs_disk_key disk_key; 3524 struct btrfs_map_token token; 3525 3526 nritems = nritems - mid; 3527 btrfs_set_header_nritems(right, nritems); 3528 data_copy_size = btrfs_item_data_end(l, mid) - leaf_data_end(l); 3529 3530 copy_leaf_items(right, l, 0, mid, nritems); 3531 3532 copy_leaf_data(right, l, BTRFS_LEAF_DATA_SIZE(fs_info) - data_copy_size, 3533 leaf_data_end(l), data_copy_size); 3534 3535 rt_data_off = BTRFS_LEAF_DATA_SIZE(fs_info) - btrfs_item_data_end(l, mid); 3536 3537 btrfs_init_map_token(&token, right); 3538 for (i = 0; i < nritems; i++) { 3539 u32 ioff; 3540 3541 ioff = btrfs_token_item_offset(&token, i); 3542 btrfs_set_token_item_offset(&token, i, ioff + rt_data_off); 3543 } 3544 3545 btrfs_set_header_nritems(l, mid); 3546 btrfs_item_key(right, &disk_key, 0); 3547 ret = insert_ptr(trans, path, &disk_key, right->start, path->slots[1] + 1, 1); 3548 if (ret < 0) 3549 return ret; 3550 3551 btrfs_mark_buffer_dirty(trans, right); 3552 btrfs_mark_buffer_dirty(trans, l); 3553 BUG_ON(path->slots[0] != slot); 3554 3555 if (mid <= slot) { 3556 btrfs_tree_unlock(path->nodes[0]); 3557 free_extent_buffer(path->nodes[0]); 3558 path->nodes[0] = right; 3559 path->slots[0] -= mid; 3560 path->slots[1] += 1; 3561 } else { 3562 btrfs_tree_unlock(right); 3563 free_extent_buffer(right); 3564 } 3565 3566 BUG_ON(path->slots[0] < 0); 3567 3568 return 0; 3569 } 3570 3571 /* 3572 * double splits happen when we need to insert a big item in the middle 3573 * of a leaf. A double split can leave us with 3 mostly empty leaves: 3574 * leaf: [ slots 0 - N] [ our target ] [ N + 1 - total in leaf ] 3575 * A B C 3576 * 3577 * We avoid this by trying to push the items on either side of our target 3578 * into the adjacent leaves. If all goes well we can avoid the double split 3579 * completely. 3580 */ 3581 static noinline int push_for_double_split(struct btrfs_trans_handle *trans, 3582 struct btrfs_root *root, 3583 struct btrfs_path *path, 3584 int data_size) 3585 { 3586 int ret; 3587 int progress = 0; 3588 int slot; 3589 u32 nritems; 3590 int space_needed = data_size; 3591 3592 slot = path->slots[0]; 3593 if (slot < btrfs_header_nritems(path->nodes[0])) 3594 space_needed -= btrfs_leaf_free_space(path->nodes[0]); 3595 3596 /* 3597 * try to push all the items after our slot into the 3598 * right leaf 3599 */ 3600 ret = push_leaf_right(trans, root, path, 1, space_needed, 0, slot); 3601 if (ret < 0) 3602 return ret; 3603 3604 if (ret == 0) 3605 progress++; 3606 3607 nritems = btrfs_header_nritems(path->nodes[0]); 3608 /* 3609 * our goal is to get our slot at the start or end of a leaf. 
If 3610 * we've done so we're done 3611 */ 3612 if (path->slots[0] == 0 || path->slots[0] == nritems) 3613 return 0; 3614 3615 if (btrfs_leaf_free_space(path->nodes[0]) >= data_size) 3616 return 0; 3617 3618 /* try to push all the items before our slot into the next leaf */ 3619 slot = path->slots[0]; 3620 space_needed = data_size; 3621 if (slot > 0) 3622 space_needed -= btrfs_leaf_free_space(path->nodes[0]); 3623 ret = push_leaf_left(trans, root, path, 1, space_needed, 0, slot); 3624 if (ret < 0) 3625 return ret; 3626 3627 if (ret == 0) 3628 progress++; 3629 3630 if (progress) 3631 return 0; 3632 return 1; 3633 } 3634 3635 /* 3636 * split the path's leaf in two, making sure there is at least data_size 3637 * available for the resulting leaf level of the path. 3638 * 3639 * returns 0 if all went well and < 0 on failure. 3640 */ 3641 static noinline int split_leaf(struct btrfs_trans_handle *trans, 3642 struct btrfs_root *root, 3643 const struct btrfs_key *ins_key, 3644 struct btrfs_path *path, int data_size, 3645 int extend) 3646 { 3647 struct btrfs_disk_key disk_key; 3648 struct extent_buffer *l; 3649 u32 nritems; 3650 int mid; 3651 int slot; 3652 struct extent_buffer *right; 3653 struct btrfs_fs_info *fs_info = root->fs_info; 3654 int ret = 0; 3655 int wret; 3656 int split; 3657 int num_doubles = 0; 3658 int tried_avoid_double = 0; 3659 3660 l = path->nodes[0]; 3661 slot = path->slots[0]; 3662 if (extend && data_size + btrfs_item_size(l, slot) + 3663 sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(fs_info)) 3664 return -EOVERFLOW; 3665 3666 /* first try to make some room by pushing left and right */ 3667 if (data_size && path->nodes[1]) { 3668 int space_needed = data_size; 3669 3670 if (slot < btrfs_header_nritems(l)) 3671 space_needed -= btrfs_leaf_free_space(l); 3672 3673 wret = push_leaf_right(trans, root, path, space_needed, 3674 space_needed, 0, 0); 3675 if (wret < 0) 3676 return wret; 3677 if (wret) { 3678 space_needed = data_size; 3679 if (slot > 0) 3680 space_needed -= btrfs_leaf_free_space(l); 3681 wret = push_leaf_left(trans, root, path, space_needed, 3682 space_needed, 0, (u32)-1); 3683 if (wret < 0) 3684 return wret; 3685 } 3686 l = path->nodes[0]; 3687 3688 /* did the pushes work? 
*/ 3689 if (btrfs_leaf_free_space(l) >= data_size) 3690 return 0; 3691 } 3692 3693 if (!path->nodes[1]) { 3694 ret = insert_new_root(trans, root, path, 1); 3695 if (ret) 3696 return ret; 3697 } 3698 again: 3699 split = 1; 3700 l = path->nodes[0]; 3701 slot = path->slots[0]; 3702 nritems = btrfs_header_nritems(l); 3703 mid = (nritems + 1) / 2; 3704 3705 if (mid <= slot) { 3706 if (nritems == 1 || 3707 leaf_space_used(l, mid, nritems - mid) + data_size > 3708 BTRFS_LEAF_DATA_SIZE(fs_info)) { 3709 if (slot >= nritems) { 3710 split = 0; 3711 } else { 3712 mid = slot; 3713 if (mid != nritems && 3714 leaf_space_used(l, mid, nritems - mid) + 3715 data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) { 3716 if (data_size && !tried_avoid_double) 3717 goto push_for_double; 3718 split = 2; 3719 } 3720 } 3721 } 3722 } else { 3723 if (leaf_space_used(l, 0, mid) + data_size > 3724 BTRFS_LEAF_DATA_SIZE(fs_info)) { 3725 if (!extend && data_size && slot == 0) { 3726 split = 0; 3727 } else if ((extend || !data_size) && slot == 0) { 3728 mid = 1; 3729 } else { 3730 mid = slot; 3731 if (mid != nritems && 3732 leaf_space_used(l, mid, nritems - mid) + 3733 data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) { 3734 if (data_size && !tried_avoid_double) 3735 goto push_for_double; 3736 split = 2; 3737 } 3738 } 3739 } 3740 } 3741 3742 if (split == 0) 3743 btrfs_cpu_key_to_disk(&disk_key, ins_key); 3744 else 3745 btrfs_item_key(l, &disk_key, mid); 3746 3747 /* 3748 * We have to use BTRFS_NESTING_NEW_ROOT here if we've done a double 3749 * split, because we're only allowed to have MAX_LOCKDEP_SUBCLASSES 3750 * subclasses, which is 8 at the time of this patch, and we've maxed it 3751 * out. In the future we could add a 3752 * BTRFS_NESTING_SPLIT_THE_SPLITTENING if we need to, but for now just 3753 * use BTRFS_NESTING_NEW_ROOT. 3754 */ 3755 right = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid, 3756 &disk_key, 0, l->start, 0, 0, 3757 num_doubles ? BTRFS_NESTING_NEW_ROOT : 3758 BTRFS_NESTING_SPLIT); 3759 if (IS_ERR(right)) 3760 return PTR_ERR(right); 3761 3762 root_add_used_bytes(root); 3763 3764 if (split == 0) { 3765 if (mid <= slot) { 3766 btrfs_set_header_nritems(right, 0); 3767 ret = insert_ptr(trans, path, &disk_key, 3768 right->start, path->slots[1] + 1, 1); 3769 if (ret < 0) { 3770 btrfs_tree_unlock(right); 3771 free_extent_buffer(right); 3772 return ret; 3773 } 3774 btrfs_tree_unlock(path->nodes[0]); 3775 free_extent_buffer(path->nodes[0]); 3776 path->nodes[0] = right; 3777 path->slots[0] = 0; 3778 path->slots[1] += 1; 3779 } else { 3780 btrfs_set_header_nritems(right, 0); 3781 ret = insert_ptr(trans, path, &disk_key, 3782 right->start, path->slots[1], 1); 3783 if (ret < 0) { 3784 btrfs_tree_unlock(right); 3785 free_extent_buffer(right); 3786 return ret; 3787 } 3788 btrfs_tree_unlock(path->nodes[0]); 3789 free_extent_buffer(path->nodes[0]); 3790 path->nodes[0] = right; 3791 path->slots[0] = 0; 3792 if (path->slots[1] == 0) 3793 fixup_low_keys(trans, path, &disk_key, 1); 3794 } 3795 /* 3796 * We create a new leaf 'right' for the required ins_len and 3797 * we'll do btrfs_mark_buffer_dirty() on this leaf after copying 3798 * the new item's content into 'right'.
3799 */ 3800 return ret; 3801 } 3802 3803 ret = copy_for_split(trans, path, l, right, slot, mid, nritems); 3804 if (ret < 0) { 3805 btrfs_tree_unlock(right); 3806 free_extent_buffer(right); 3807 return ret; 3808 } 3809 3810 if (split == 2) { 3811 BUG_ON(num_doubles != 0); 3812 num_doubles++; 3813 goto again; 3814 } 3815 3816 return 0; 3817 3818 push_for_double: 3819 push_for_double_split(trans, root, path, data_size); 3820 tried_avoid_double = 1; 3821 if (btrfs_leaf_free_space(path->nodes[0]) >= data_size) 3822 return 0; 3823 goto again; 3824 } 3825 3826 static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans, 3827 struct btrfs_root *root, 3828 struct btrfs_path *path, int ins_len) 3829 { 3830 struct btrfs_key key; 3831 struct extent_buffer *leaf; 3832 struct btrfs_file_extent_item *fi; 3833 u64 extent_len = 0; 3834 u32 item_size; 3835 int ret; 3836 3837 leaf = path->nodes[0]; 3838 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 3839 3840 BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY && 3841 key.type != BTRFS_EXTENT_CSUM_KEY); 3842 3843 if (btrfs_leaf_free_space(leaf) >= ins_len) 3844 return 0; 3845 3846 item_size = btrfs_item_size(leaf, path->slots[0]); 3847 if (key.type == BTRFS_EXTENT_DATA_KEY) { 3848 fi = btrfs_item_ptr(leaf, path->slots[0], 3849 struct btrfs_file_extent_item); 3850 extent_len = btrfs_file_extent_num_bytes(leaf, fi); 3851 } 3852 btrfs_release_path(path); 3853 3854 path->keep_locks = 1; 3855 path->search_for_split = 1; 3856 ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 3857 path->search_for_split = 0; 3858 if (ret > 0) 3859 ret = -EAGAIN; 3860 if (ret < 0) 3861 goto err; 3862 3863 ret = -EAGAIN; 3864 leaf = path->nodes[0]; 3865 /* if our item isn't there, return now */ 3866 if (item_size != btrfs_item_size(leaf, path->slots[0])) 3867 goto err; 3868 3869 /* the leaf has changed, it now has room. return now */ 3870 if (btrfs_leaf_free_space(path->nodes[0]) >= ins_len) 3871 goto err; 3872 3873 if (key.type == BTRFS_EXTENT_DATA_KEY) { 3874 fi = btrfs_item_ptr(leaf, path->slots[0], 3875 struct btrfs_file_extent_item); 3876 if (extent_len != btrfs_file_extent_num_bytes(leaf, fi)) 3877 goto err; 3878 } 3879 3880 ret = split_leaf(trans, root, &key, path, ins_len, 1); 3881 if (ret) 3882 goto err; 3883 3884 path->keep_locks = 0; 3885 btrfs_unlock_up_safe(path, 1); 3886 return 0; 3887 err: 3888 path->keep_locks = 0; 3889 return ret; 3890 } 3891 3892 static noinline int split_item(struct btrfs_trans_handle *trans, 3893 struct btrfs_path *path, 3894 const struct btrfs_key *new_key, 3895 unsigned long split_offset) 3896 { 3897 struct extent_buffer *leaf; 3898 int orig_slot, slot; 3899 char *buf; 3900 u32 nritems; 3901 u32 item_size; 3902 u32 orig_offset; 3903 struct btrfs_disk_key disk_key; 3904 3905 leaf = path->nodes[0]; 3906 /* 3907 * Shouldn't happen because the caller must have previously called 3908 * setup_leaf_for_split() to make room for the new item in the leaf. 
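 *
 * The in-place split below keeps the first split_offset bytes of the
 * item data in the original slot and moves the remaining
 * item_size - split_offset bytes into a new item at the next slot,
 * keyed by new_key.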

/*
 * This function splits a single item into two items,
 * giving 'new_key' to the new item and splitting the
 * old one at split_offset (from the start of the item).
 *
 * The path may be released by this operation.  After
 * the split, the path is pointing to the old item.  The
 * new item is going to be in the same node as the old one.
 *
 * Note, the item being split must be small enough to live alone on
 * a tree block with room for one extra struct btrfs_item.
 *
 * This allows us to split the item in place, keeping a lock on the
 * leaf the entire time.
 */
int btrfs_split_item(struct btrfs_trans_handle *trans,
		     struct btrfs_root *root,
		     struct btrfs_path *path,
		     const struct btrfs_key *new_key,
		     unsigned long split_offset)
{
	int ret;

	ret = setup_leaf_for_split(trans, root, path,
				   sizeof(struct btrfs_item));
	if (ret)
		return ret;

	ret = split_item(trans, path, new_key, split_offset);
	return ret;
}
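
/*
 * Example (editor's sketch, not part of the original file): splitting a file
 * extent item at byte 'split_pos' of the file.  'ino' and 'split_pos' are
 * placeholder names, and 'path' must already point at the item to split:
 *
 *	struct btrfs_key new_key;
 *
 *	new_key.objectid = ino;
 *	new_key.type = BTRFS_EXTENT_DATA_KEY;
 *	new_key.offset = split_pos;
 *	ret = btrfs_split_item(trans, root, path, &new_key, split_offset);
 */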

/*
 * make the item pointed to by the path smaller.  new_size indicates
 * how small to make it, and from_end tells us if we just chop bytes
 * off the end of the item or if we shift the item to chop bytes off
 * the front.
 */
void btrfs_truncate_item(struct btrfs_trans_handle *trans,
			 struct btrfs_path *path, u32 new_size, int from_end)
{
	int slot;
	struct extent_buffer *leaf;
	u32 nritems;
	unsigned int data_end;
	unsigned int old_data_start;
	unsigned int old_size;
	unsigned int size_diff;
	int i;
	struct btrfs_map_token token;

	leaf = path->nodes[0];
	slot = path->slots[0];

	old_size = btrfs_item_size(leaf, slot);
	if (old_size == new_size)
		return;

	nritems = btrfs_header_nritems(leaf);
	data_end = leaf_data_end(leaf);

	old_data_start = btrfs_item_offset(leaf, slot);

	size_diff = old_size - new_size;

	BUG_ON(slot < 0);
	BUG_ON(slot >= nritems);

	/*
	 * item0..itemN ... dataN.offset..dataN.size .. data0.size
	 */
	/* first correct the data pointers */
	btrfs_init_map_token(&token, leaf);
	for (i = slot; i < nritems; i++) {
		u32 ioff;

		ioff = btrfs_token_item_offset(&token, i);
		btrfs_set_token_item_offset(&token, i, ioff + size_diff);
	}

	/* shift the data */
	if (from_end) {
		memmove_leaf_data(leaf, data_end + size_diff, data_end,
				  old_data_start + new_size - data_end);
	} else {
		struct btrfs_disk_key disk_key;
		u64 offset;

		btrfs_item_key(leaf, &disk_key, slot);

		if (btrfs_disk_key_type(&disk_key) == BTRFS_EXTENT_DATA_KEY) {
			unsigned long ptr;
			struct btrfs_file_extent_item *fi;

			fi = btrfs_item_ptr(leaf, slot,
					    struct btrfs_file_extent_item);
			fi = (struct btrfs_file_extent_item *)(
			     (unsigned long)fi - size_diff);

			if (btrfs_file_extent_type(leaf, fi) ==
			    BTRFS_FILE_EXTENT_INLINE) {
				ptr = btrfs_item_ptr_offset(leaf, slot);
				memmove_extent_buffer(leaf, ptr,
				      (unsigned long)fi,
				      BTRFS_FILE_EXTENT_INLINE_DATA_START);
			}
		}

		memmove_leaf_data(leaf, data_end + size_diff, data_end,
				  old_data_start - data_end);

		offset = btrfs_disk_key_offset(&disk_key);
		btrfs_set_disk_key_offset(&disk_key, offset + size_diff);
		btrfs_set_item_key(leaf, &disk_key, slot);
		if (slot == 0)
			fixup_low_keys(trans, path, &disk_key, 1);
	}

	btrfs_set_item_size(leaf, slot, new_size);
	btrfs_mark_buffer_dirty(trans, leaf);

	if (btrfs_leaf_free_space(leaf) < 0) {
		btrfs_print_leaf(leaf);
		BUG();
	}
}
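
/*
 * Example (editor's sketch, not part of the original file): dropping the
 * trailing checksums from a csum item so that it only covers 'new_csums'
 * checksums.  Truncating from the end keeps the item's key; truncating from
 * the front (from_end == 0) also bumps the key offset by the bytes removed:
 *
 *	btrfs_truncate_item(trans, path, new_csums * csum_size, 1);
 */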

/*
 * make the item pointed to by the path bigger, data_size is the added size.
 */
void btrfs_extend_item(struct btrfs_trans_handle *trans,
		       struct btrfs_path *path, u32 data_size)
{
	int slot;
	struct extent_buffer *leaf;
	u32 nritems;
	unsigned int data_end;
	unsigned int old_data;
	unsigned int old_size;
	int i;
	struct btrfs_map_token token;

	leaf = path->nodes[0];

	nritems = btrfs_header_nritems(leaf);
	data_end = leaf_data_end(leaf);

	if (btrfs_leaf_free_space(leaf) < data_size) {
		btrfs_print_leaf(leaf);
		BUG();
	}
	slot = path->slots[0];
	old_data = btrfs_item_data_end(leaf, slot);

	BUG_ON(slot < 0);
	if (slot >= nritems) {
		btrfs_print_leaf(leaf);
		btrfs_crit(leaf->fs_info, "slot %d too large, nritems %d",
			   slot, nritems);
		BUG();
	}

	/*
	 * item0..itemN ... dataN.offset..dataN.size .. data0.size
	 */
	/* first correct the data pointers */
	btrfs_init_map_token(&token, leaf);
	for (i = slot; i < nritems; i++) {
		u32 ioff;

		ioff = btrfs_token_item_offset(&token, i);
		btrfs_set_token_item_offset(&token, i, ioff - data_size);
	}

	/* shift the data */
	memmove_leaf_data(leaf, data_end - data_size, data_end,
			  old_data - data_end);

	data_end = old_data;
	old_size = btrfs_item_size(leaf, slot);
	btrfs_set_item_size(leaf, slot, old_size + data_size);
	btrfs_mark_buffer_dirty(trans, leaf);

	if (btrfs_leaf_free_space(leaf) < 0) {
		btrfs_print_leaf(leaf);
		BUG();
	}
}
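
/*
 * Example (editor's sketch, not part of the original file): growing an
 * existing item by 'extra' bytes, e.g. to append checksums to a csum item.
 * The search must reserve the extra room via ins_len, otherwise the free
 * space check above will BUG:
 *
 *	ret = btrfs_search_slot(trans, root, &key, path, extra, 1);
 *	if (ret < 0)
 *		return ret;
 *	btrfs_extend_item(trans, path, extra);
 */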

/*
 * Make space in the node before inserting one or more items.
 *
 * @trans:	transaction handle
 * @root:	root we are inserting items to
 * @path:	points to the leaf/slot where we are going to insert new items
 * @batch:	information about the batch of items to insert
 *
 * Main purpose is to save stack depth by doing the bulk of the work in a
 * function that doesn't call btrfs_search_slot
 */
static void setup_items_for_insert(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root, struct btrfs_path *path,
				   const struct btrfs_item_batch *batch)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	int i;
	u32 nritems;
	unsigned int data_end;
	struct btrfs_disk_key disk_key;
	struct extent_buffer *leaf;
	int slot;
	struct btrfs_map_token token;
	u32 total_size;

	/*
	 * Before anything else, update keys in the parent and other ancestors
	 * if needed, then release the write locks on them, so that other tasks
	 * can use them while we modify the leaf.
	 */
	if (path->slots[0] == 0) {
		btrfs_cpu_key_to_disk(&disk_key, &batch->keys[0]);
		fixup_low_keys(trans, path, &disk_key, 1);
	}
	btrfs_unlock_up_safe(path, 1);

	leaf = path->nodes[0];
	slot = path->slots[0];

	nritems = btrfs_header_nritems(leaf);
	data_end = leaf_data_end(leaf);
	total_size = batch->total_data_size + (batch->nr * sizeof(struct btrfs_item));

	if (btrfs_leaf_free_space(leaf) < total_size) {
		btrfs_print_leaf(leaf);
		btrfs_crit(fs_info, "not enough freespace need %u have %d",
			   total_size, btrfs_leaf_free_space(leaf));
		BUG();
	}

	btrfs_init_map_token(&token, leaf);
	if (slot != nritems) {
		unsigned int old_data = btrfs_item_data_end(leaf, slot);

		if (old_data < data_end) {
			btrfs_print_leaf(leaf);
			btrfs_crit(fs_info,
		"item at slot %d with data offset %u beyond data end of leaf %u",
				   slot, old_data, data_end);
			BUG();
		}
		/*
		 * item0..itemN ... dataN.offset..dataN.size .. data0.size
		 */
		/* first correct the data pointers */
		for (i = slot; i < nritems; i++) {
			u32 ioff;

			ioff = btrfs_token_item_offset(&token, i);
			btrfs_set_token_item_offset(&token, i,
						    ioff - batch->total_data_size);
		}
		/* shift the items */
		memmove_leaf_items(leaf, slot + batch->nr, slot, nritems - slot);

		/* shift the data */
		memmove_leaf_data(leaf, data_end - batch->total_data_size,
				  data_end, old_data - data_end);
		data_end = old_data;
	}

	/* setup the item for the new data */
	for (i = 0; i < batch->nr; i++) {
		btrfs_cpu_key_to_disk(&disk_key, &batch->keys[i]);
		btrfs_set_item_key(leaf, &disk_key, slot + i);
		data_end -= batch->data_sizes[i];
		btrfs_set_token_item_offset(&token, slot + i, data_end);
		btrfs_set_token_item_size(&token, slot + i, batch->data_sizes[i]);
	}

	btrfs_set_header_nritems(leaf, nritems + batch->nr);
	btrfs_mark_buffer_dirty(trans, leaf);

	if (btrfs_leaf_free_space(leaf) < 0) {
		btrfs_print_leaf(leaf);
		BUG();
	}
}

/*
 * Insert a new item into a leaf.
 *
 * @trans:	Transaction handle.
 * @root:	The root of the btree.
 * @path:	A path pointing to the target leaf and slot.
 * @key:	The key of the new item.
 * @data_size:	The size of the data associated with the new key.
 */
void btrfs_setup_item_for_insert(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root,
				 struct btrfs_path *path,
				 const struct btrfs_key *key,
				 u32 data_size)
{
	struct btrfs_item_batch batch;

	batch.keys = key;
	batch.data_sizes = &data_size;
	batch.total_data_size = data_size;
	batch.nr = 1;

	setup_items_for_insert(trans, root, path, &batch);
}

/*
 * Given a key and some data, insert items into the tree.
 * This does all the path init required, making room in the tree if needed.
 */
int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root,
			     struct btrfs_path *path,
			     const struct btrfs_item_batch *batch)
{
	int ret = 0;
	int slot;
	u32 total_size;

	total_size = batch->total_data_size + (batch->nr * sizeof(struct btrfs_item));
	ret = btrfs_search_slot(trans, root, &batch->keys[0], path, total_size, 1);
	if (ret == 0)
		return -EEXIST;
	if (ret < 0)
		return ret;

	slot = path->slots[0];
	BUG_ON(slot < 0);

	setup_items_for_insert(trans, root, path, batch);
	return 0;
}

/*
 * Given a key and some data, insert an item into the tree.
 * This does all the path init required, making room in the tree if needed.
 */
int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		      const struct btrfs_key *cpu_key, void *data,
		      u32 data_size)
{
	int ret = 0;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	unsigned long ptr;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
	if (!ret) {
		leaf = path->nodes[0];
		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
		write_extent_buffer(leaf, data, ptr, data_size);
		btrfs_mark_buffer_dirty(trans, leaf);
	}
	btrfs_free_path(path);
	return ret;
}
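
/*
 * Example (editor's sketch, not part of the original file): inserting a
 * small fixed-size item under 'key'.  The value is copied into the leaf,
 * so on-stack data is fine:
 *
 *	u64 value = 0;
 *
 *	ret = btrfs_insert_item(trans, root, &key, &value, sizeof(value));
 *	if (ret == -EEXIST)
 *		the key was already present in the tree
 */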

/*
 * This function duplicates an item, giving 'new_key' to the new item.
 * It guarantees both items live in the same tree leaf and the new item is
 * contiguous with the original item.
 *
 * This allows us to split a file extent in place, keeping a lock on the leaf
 * the entire time.
 */
int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
			 struct btrfs_root *root,
			 struct btrfs_path *path,
			 const struct btrfs_key *new_key)
{
	struct extent_buffer *leaf;
	int ret;
	u32 item_size;

	leaf = path->nodes[0];
	item_size = btrfs_item_size(leaf, path->slots[0]);
	ret = setup_leaf_for_split(trans, root, path,
				   item_size + sizeof(struct btrfs_item));
	if (ret)
		return ret;

	path->slots[0]++;
	btrfs_setup_item_for_insert(trans, root, path, new_key, item_size);
	leaf = path->nodes[0];
	memcpy_extent_buffer(leaf,
			     btrfs_item_ptr_offset(leaf, path->slots[0]),
			     btrfs_item_ptr_offset(leaf, path->slots[0] - 1),
			     item_size);
	return 0;
}
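
/*
 * Example (editor's sketch, not part of the original file): duplicating a
 * file extent item under a new key offset, as done when an extent has to be
 * split in two; both copies are then adjusted by the caller.  An -EAGAIN
 * from setup_leaf_for_split() means the leaf changed underneath us and the
 * caller must redo its search:
 *
 *	new_key.objectid = ino;
 *	new_key.type = BTRFS_EXTENT_DATA_KEY;
 *	new_key.offset = split_start;
 *	ret = btrfs_duplicate_item(trans, root, path, &new_key);
 */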

/*
 * delete the pointer from a given node.
 *
 * the tree should have been previously balanced so the deletion does not
 * empty a node.
 *
 * This is exported for use inside btrfs-progs, don't un-export it.
 */
int btrfs_del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		  struct btrfs_path *path, int level, int slot)
{
	struct extent_buffer *parent = path->nodes[level];
	u32 nritems;
	int ret;

	nritems = btrfs_header_nritems(parent);
	if (slot != nritems - 1) {
		if (level) {
			ret = btrfs_tree_mod_log_insert_move(parent, slot,
					slot + 1, nritems - slot - 1);
			if (ret < 0) {
				btrfs_abort_transaction(trans, ret);
				return ret;
			}
		}
		memmove_extent_buffer(parent,
			      btrfs_node_key_ptr_offset(parent, slot),
			      btrfs_node_key_ptr_offset(parent, slot + 1),
			      sizeof(struct btrfs_key_ptr) *
			      (nritems - slot - 1));
	} else if (level) {
		ret = btrfs_tree_mod_log_insert_key(parent, slot,
						    BTRFS_MOD_LOG_KEY_REMOVE);
		if (ret < 0) {
			btrfs_abort_transaction(trans, ret);
			return ret;
		}
	}

	nritems--;
	btrfs_set_header_nritems(parent, nritems);
	if (nritems == 0 && parent == root->node) {
		BUG_ON(btrfs_header_level(root->node) != 1);
		/* just turn the root into a leaf and break */
		btrfs_set_header_level(root->node, 0);
	} else if (slot == 0) {
		struct btrfs_disk_key disk_key;

		btrfs_node_key(parent, &disk_key, 0);
		fixup_low_keys(trans, path, &disk_key, level + 1);
	}
	btrfs_mark_buffer_dirty(trans, parent);
	return 0;
}

/*
 * a helper function to delete the leaf pointed to by path->slots[1] and
 * path->nodes[1].
 *
 * This deletes the pointer in path->nodes[1] and frees the leaf
 * block extent.  zero is returned if it all worked out, < 0 otherwise.
 *
 * The path must have already been setup for deleting the leaf, including
 * all the proper balancing.  path->nodes[1] must be locked.
 */
static noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root,
				   struct btrfs_path *path,
				   struct extent_buffer *leaf)
{
	int ret;

	WARN_ON(btrfs_header_generation(leaf) != trans->transid);
	ret = btrfs_del_ptr(trans, root, path, 1, path->slots[1]);
	if (ret < 0)
		return ret;

	/*
	 * btrfs_free_extent is expensive, we want to make sure we
	 * aren't holding any locks when we call it
	 */
	btrfs_unlock_up_safe(path, 0);

	root_sub_used_bytes(root);

	atomic_inc(&leaf->refs);
	btrfs_free_tree_block(trans, btrfs_root_id(root), leaf, 0, 1);
	free_extent_buffer_stale(leaf);
	return 0;
}

/*
 * delete the item at the leaf level in path.  If that empties
 * the leaf, remove it from the tree
 */
int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		    struct btrfs_path *path, int slot, int nr)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *leaf;
	int ret = 0;
	int wret;
	u32 nritems;

	leaf = path->nodes[0];
	nritems = btrfs_header_nritems(leaf);

	if (slot + nr != nritems) {
		const u32 last_off = btrfs_item_offset(leaf, slot + nr - 1);
		const int data_end = leaf_data_end(leaf);
		struct btrfs_map_token token;
		u32 dsize = 0;
		int i;

		for (i = 0; i < nr; i++)
			dsize += btrfs_item_size(leaf, slot + i);

		memmove_leaf_data(leaf, data_end + dsize, data_end,
				  last_off - data_end);

		btrfs_init_map_token(&token, leaf);
		for (i = slot + nr; i < nritems; i++) {
			u32 ioff;

			ioff = btrfs_token_item_offset(&token, i);
			btrfs_set_token_item_offset(&token, i, ioff + dsize);
		}

		memmove_leaf_items(leaf, slot, slot + nr, nritems - slot - nr);
	}
	btrfs_set_header_nritems(leaf, nritems - nr);
	nritems -= nr;

	/* delete the leaf if we've emptied it */
	if (nritems == 0) {
		if (leaf == root->node) {
			btrfs_set_header_level(leaf, 0);
		} else {
			btrfs_clear_buffer_dirty(trans, leaf);
			ret = btrfs_del_leaf(trans, root, path, leaf);
			if (ret < 0)
				return ret;
		}
	} else {
		int used = leaf_space_used(leaf, 0, nritems);

		if (slot == 0) {
			struct btrfs_disk_key disk_key;

			btrfs_item_key(leaf, &disk_key, 0);
			fixup_low_keys(trans, path, &disk_key, 1);
		}

		/*
		 * Try to delete the leaf if it is mostly empty.  We do this by
		 * trying to move all its items into its left and right neighbours.
		 * If we can't move all the items, then we don't delete it - it's
		 * not ideal, but future insertions might fill the leaf with more
		 * items, or items from other leaves might be moved later into our
		 * leaf due to deletions on those leaves.
		 */
		if (used < BTRFS_LEAF_DATA_SIZE(fs_info) / 3) {
			u32 min_push_space;

			/*
			 * push_leaf_left fixes the path.  Make sure the path
			 * still points to our leaf for a possible call to
			 * btrfs_del_ptr below.
			 */
			slot = path->slots[1];
			atomic_inc(&leaf->refs);
			/*
			 * We want to be able to at least push one item to the
			 * left neighbour leaf, and that's the first item.
			 */
			min_push_space = sizeof(struct btrfs_item) +
					 btrfs_item_size(leaf, 0);
			wret = push_leaf_left(trans, root, path, 0,
					      min_push_space, 1, (u32)-1);
			if (wret < 0 && wret != -ENOSPC)
				ret = wret;

			if (path->nodes[0] == leaf &&
			    btrfs_header_nritems(leaf)) {
				/*
				 * If we were not able to push all items from our
				 * leaf to its left neighbour, then attempt to
				 * either push all the remaining items to the
				 * right neighbour or none.  There's no advantage
				 * in pushing only some items, instead of all, as
				 * it's pointless to end up with a leaf having
				 * too few items while the neighbours can be full
				 * or nearly full.
				 */
				nritems = btrfs_header_nritems(leaf);
				min_push_space = leaf_space_used(leaf, 0, nritems);
				wret = push_leaf_right(trans, root, path, 0,
						       min_push_space, 1, 0);
				if (wret < 0 && wret != -ENOSPC)
					ret = wret;
			}

			if (btrfs_header_nritems(leaf) == 0) {
				path->slots[1] = slot;
				ret = btrfs_del_leaf(trans, root, path, leaf);
				if (ret < 0)
					return ret;
				free_extent_buffer(leaf);
				ret = 0;
			} else {
				/*
				 * If we're still in the path, make sure we're
				 * dirty.  Otherwise, one of the push_leaf
				 * functions must have already dirtied this
				 * buffer.
				 */
				if (path->nodes[0] == leaf)
					btrfs_mark_buffer_dirty(trans, leaf);
				free_extent_buffer(leaf);
			}
		} else {
			btrfs_mark_buffer_dirty(trans, leaf);
		}
	}
	return ret;
}
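
/*
 * Example (editor's sketch, not part of the original file): deleting a
 * single item found by a previous search.  Passing a negative ins_len to
 * btrfs_search_slot() indicates an intended deletion, so leaves can be
 * rebalanced on the way down:
 *
 *	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
 *	if (ret < 0)
 *		return ret;
 *	ret = btrfs_del_items(trans, root, path, path->slots[0], 1);
 */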

/*
 * A helper function to walk down the tree starting at min_key, and looking
 * for nodes or leaves that have a minimum transaction id.
 * This is used by the btree defrag code, and tree logging.
 *
 * This does not cow, but it does stuff the starting key it finds back
 * into min_key, so you can call btrfs_search_slot with cow=1 on the
 * key and get a writable path.
 *
 * This honors path->lowest_level to prevent descent past a given level
 * of the tree.
 *
 * min_trans indicates the oldest transaction that you are interested
 * in walking through.  Any nodes or leaves older than min_trans are
 * skipped over (without reading them).
 *
 * returns zero if something useful was found, < 0 on error and 1 if there
 * was nothing in the tree that matched the search criteria.
 */
int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
			 struct btrfs_path *path,
			 u64 min_trans)
{
	struct extent_buffer *cur;
	struct btrfs_key found_key;
	int slot;
	int sret;
	u32 nritems;
	int level;
	int ret = 1;
	int keep_locks = path->keep_locks;

	ASSERT(!path->nowait);
	path->keep_locks = 1;
again:
	cur = btrfs_read_lock_root_node(root);
	level = btrfs_header_level(cur);
	WARN_ON(path->nodes[level]);
	path->nodes[level] = cur;
	path->locks[level] = BTRFS_READ_LOCK;

	if (btrfs_header_generation(cur) < min_trans) {
		ret = 1;
		goto out;
	}
	while (1) {
		nritems = btrfs_header_nritems(cur);
		level = btrfs_header_level(cur);
		sret = btrfs_bin_search(cur, 0, min_key, &slot);
		if (sret < 0) {
			ret = sret;
			goto out;
		}

		/* at the lowest level, we're done, setup the path and exit */
		if (level == path->lowest_level) {
			if (slot >= nritems)
				goto find_next_key;
			ret = 0;
			path->slots[level] = slot;
			btrfs_item_key_to_cpu(cur, &found_key, slot);
			goto out;
		}
		if (sret && slot > 0)
			slot--;
		/*
		 * check this node pointer against the min_trans parameters.
		 * If it is too old, skip to the next one.
		 */
		while (slot < nritems) {
			u64 gen;

			gen = btrfs_node_ptr_generation(cur, slot);
			if (gen < min_trans) {
				slot++;
				continue;
			}
			break;
		}
find_next_key:
		/*
		 * we didn't find a candidate key in this node, walk forward
		 * and find another one
		 */
		if (slot >= nritems) {
			path->slots[level] = slot;
			sret = btrfs_find_next_key(root, path, min_key, level,
						   min_trans);
			if (sret == 0) {
				btrfs_release_path(path);
				goto again;
			} else {
				goto out;
			}
		}
		/* save our key for returning back */
		btrfs_node_key_to_cpu(cur, &found_key, slot);
		path->slots[level] = slot;
		if (level == path->lowest_level) {
			ret = 0;
			goto out;
		}
		cur = btrfs_read_node_slot(cur, slot);
		if (IS_ERR(cur)) {
			ret = PTR_ERR(cur);
			goto out;
		}

		btrfs_tree_read_lock(cur);

		path->locks[level - 1] = BTRFS_READ_LOCK;
		path->nodes[level - 1] = cur;
		unlock_up(path, level, 1, 0, NULL);
	}
out:
	path->keep_locks = keep_locks;
	if (ret == 0) {
		btrfs_unlock_up_safe(path, path->lowest_level + 1);
		memcpy(min_key, &found_key, sizeof(found_key));
	}
	return ret;
}
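
/*
 * Example (editor's sketch, not part of the original file): scanning every
 * item modified in or after transaction 'min_trans', in the style of the
 * defrag code.  min_key is updated in place to the found key, so the caller
 * advances it before searching again:
 *
 *	min_key.objectid = 0;
 *	min_key.type = 0;
 *	min_key.offset = 0;
 *	while (btrfs_search_forward(root, &min_key, path, min_trans) == 0) {
 *		process_item(path, &min_key);
 *		btrfs_release_path(path);
 *		if (min_key.offset == (u64)-1)
 *			break;
 *		min_key.offset++;
 *	}
 *
 * 'process_item' is a placeholder, and the key increment is simplified;
 * real callers also step objectid/type when offset wraps.
 */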

/*
 * this is similar to btrfs_next_leaf, but does not try to preserve
 * and fixup the path.  It looks for and returns the next key in the
 * tree based on the current path and the min_trans parameters.
 *
 * 0 is returned if another key is found, < 0 if there are any errors
 * and 1 is returned if there are no higher keys in the tree
 *
 * path->keep_locks should be set to 1 on the search made before
 * calling this function.
 */
int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
			struct btrfs_key *key, int level, u64 min_trans)
{
	int slot;
	struct extent_buffer *c;

	WARN_ON(!path->keep_locks && !path->skip_locking);
	while (level < BTRFS_MAX_LEVEL) {
		if (!path->nodes[level])
			return 1;

		slot = path->slots[level] + 1;
		c = path->nodes[level];
next:
		if (slot >= btrfs_header_nritems(c)) {
			int ret;
			int orig_lowest;
			struct btrfs_key cur_key;

			if (level + 1 >= BTRFS_MAX_LEVEL ||
			    !path->nodes[level + 1])
				return 1;

			if (path->locks[level + 1] || path->skip_locking) {
				level++;
				continue;
			}

			slot = btrfs_header_nritems(c) - 1;
			if (level == 0)
				btrfs_item_key_to_cpu(c, &cur_key, slot);
			else
				btrfs_node_key_to_cpu(c, &cur_key, slot);

			orig_lowest = path->lowest_level;
			btrfs_release_path(path);
			path->lowest_level = level;
			ret = btrfs_search_slot(NULL, root, &cur_key, path,
						0, 0);
			path->lowest_level = orig_lowest;
			if (ret < 0)
				return ret;

			c = path->nodes[level];
			slot = path->slots[level];
			if (ret == 0)
				slot++;
			goto next;
		}

		if (level == 0)
			btrfs_item_key_to_cpu(c, key, slot);
		else {
			u64 gen = btrfs_node_ptr_generation(c, slot);

			if (gen < min_trans) {
				slot++;
				goto next;
			}
			btrfs_node_key_to_cpu(c, key, slot);
		}
		return 0;
	}
	return 1;
}

int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
			u64 time_seq)
{
	int slot;
	int level;
	struct extent_buffer *c;
	struct extent_buffer *next;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_key key;
	bool need_commit_sem = false;
	u32 nritems;
	int ret;
	int i;

	/*
	 * The nowait semantics are used only for write paths, where we don't
	 * use the tree mod log and sequence numbers.
	 */
	if (time_seq)
		ASSERT(!path->nowait);

	nritems = btrfs_header_nritems(path->nodes[0]);
	if (nritems == 0)
		return 1;

	btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
again:
	level = 1;
	next = NULL;
	btrfs_release_path(path);

	path->keep_locks = 1;

	if (time_seq) {
		ret = btrfs_search_old_slot(root, &key, path, time_seq);
	} else {
		if (path->need_commit_sem) {
			path->need_commit_sem = 0;
			need_commit_sem = true;
			if (path->nowait) {
				if (!down_read_trylock(&fs_info->commit_root_sem)) {
					ret = -EAGAIN;
					goto done;
				}
			} else {
				down_read(&fs_info->commit_root_sem);
			}
		}
		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	}
	path->keep_locks = 0;

	if (ret < 0)
		goto done;

	nritems = btrfs_header_nritems(path->nodes[0]);
	/*
	 * by releasing the path above we dropped all our locks.  A balance
	 * could have added more items next to the key that used to be
	 * at the very end of the block.  So, check again here and
	 * advance the path if there are now more items available.
	 */
	if (nritems > 0 && path->slots[0] < nritems - 1) {
		if (ret == 0)
			path->slots[0]++;
		ret = 0;
		goto done;
	}
	/*
	 * So the above check misses one case:
	 * - after releasing the path above, someone has removed the item that
	 *   used to be at the very end of the block, and balance between leafs
	 *   gets another one with bigger key.offset to replace it.
	 *
	 * This one should be returned as well, or we can get leaf corruption
	 * later (especially in __btrfs_drop_extents()).
	 *
	 * And a bit more explanation about this check:
	 * with ret > 0, the key isn't found, the path points to the slot
	 * where it should be inserted, so the path->slots[0] item must be the
	 * bigger one.
	 */
	if (nritems > 0 && ret > 0 && path->slots[0] == nritems - 1) {
		ret = 0;
		goto done;
	}

	while (level < BTRFS_MAX_LEVEL) {
		if (!path->nodes[level]) {
			ret = 1;
			goto done;
		}

		slot = path->slots[level] + 1;
		c = path->nodes[level];
		if (slot >= btrfs_header_nritems(c)) {
			level++;
			if (level == BTRFS_MAX_LEVEL) {
				ret = 1;
				goto done;
			}
			continue;
		}

		/*
		 * Our current level is where we're going to start from, and to
		 * make sure lockdep doesn't complain we need to drop our locks
		 * and nodes from 0 to our current level.
		 */
		for (i = 0; i < level; i++) {
			if (path->locks[i]) {
				btrfs_tree_read_unlock(path->nodes[i]);
				path->locks[i] = 0;
			}
			free_extent_buffer(path->nodes[i]);
			path->nodes[i] = NULL;
		}

		next = c;
		ret = read_block_for_search(root, path, &next, level,
					    slot, &key);
		if (ret == -EAGAIN && !path->nowait)
			goto again;

		if (ret < 0) {
			btrfs_release_path(path);
			goto done;
		}

		if (!path->skip_locking) {
			ret = btrfs_try_tree_read_lock(next);
			if (!ret && path->nowait) {
				ret = -EAGAIN;
				goto done;
			}
			if (!ret && time_seq) {
				/*
				 * If we don't get the lock, we may be racing
				 * with push_leaf_left, holding that lock while
				 * itself waiting for the leaf we've currently
				 * locked.  To solve this situation, we give up
				 * on our lock and cycle.
				 */
				free_extent_buffer(next);
				btrfs_release_path(path);
				cond_resched();
				goto again;
			}
			if (!ret)
				btrfs_tree_read_lock(next);
		}
		break;
	}
	path->slots[level] = slot;
	while (1) {
		level--;
		path->nodes[level] = next;
		path->slots[level] = 0;
		if (!path->skip_locking)
			path->locks[level] = BTRFS_READ_LOCK;
		if (!level)
			break;

		ret = read_block_for_search(root, path, &next, level,
					    0, &key);
		if (ret == -EAGAIN && !path->nowait)
			goto again;

		if (ret < 0) {
			btrfs_release_path(path);
			goto done;
		}

		if (!path->skip_locking) {
			if (path->nowait) {
				if (!btrfs_try_tree_read_lock(next)) {
					ret = -EAGAIN;
					goto done;
				}
			} else {
				btrfs_tree_read_lock(next);
			}
		}
	}
	ret = 0;
done:
	unlock_up(path, 0, 1, 0, NULL);
	if (need_commit_sem) {
		int ret2;

		path->need_commit_sem = 1;
		ret2 = finish_need_commit_sem_search(path);
		up_read(&fs_info->commit_root_sem);
		if (ret2)
			ret = ret2;
	}

	return ret;
}

int btrfs_next_old_item(struct btrfs_root *root, struct btrfs_path *path, u64 time_seq)
{
	path->slots[0]++;
	if (path->slots[0] >= btrfs_header_nritems(path->nodes[0]))
		return btrfs_next_old_leaf(root, path, time_seq);
	return 0;
}

/*
 * this uses btrfs_prev_leaf to walk backwards in the tree, and keeps
 * searching until it gets past min_objectid or finds an item of 'type'
 *
 * returns 0 if something is found, 1 if nothing was found and < 0 on error
 */
int btrfs_previous_item(struct btrfs_root *root,
			struct btrfs_path *path, u64 min_objectid,
			int type)
{
	struct btrfs_key found_key;
	struct extent_buffer *leaf;
	u32 nritems;
	int ret;

	while (1) {
		if (path->slots[0] == 0) {
			ret = btrfs_prev_leaf(root, path);
			if (ret != 0)
				return ret;
		} else {
			path->slots[0]--;
		}
		leaf = path->nodes[0];
		nritems = btrfs_header_nritems(leaf);
		if (nritems == 0)
			return 1;
		if (path->slots[0] == nritems)
			path->slots[0]--;

		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
		if (found_key.objectid < min_objectid)
			break;
		if (found_key.type == type)
			return 0;
		if (found_key.objectid == min_objectid &&
		    found_key.type < type)
			break;
	}
	return 1;
}
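
/*
 * Example (editor's sketch, not part of the original file): stepping back
 * from the current position to the closest preceding inode item of 'ino':
 *
 *	ret = btrfs_previous_item(root, path, ino, BTRFS_INODE_ITEM_KEY);
 *	if (ret == 0)
 *		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
 *				      path->slots[0]);
 */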

/*
 * search in extent tree to find a previous Metadata/Data extent item with
 * min objectid.
 *
 * returns 0 if something is found, 1 if nothing was found and < 0 on error
 */
int btrfs_previous_extent_item(struct btrfs_root *root,
			       struct btrfs_path *path, u64 min_objectid)
{
	struct btrfs_key found_key;
	struct extent_buffer *leaf;
	u32 nritems;
	int ret;

	while (1) {
		if (path->slots[0] == 0) {
			ret = btrfs_prev_leaf(root, path);
			if (ret != 0)
				return ret;
		} else {
			path->slots[0]--;
		}
		leaf = path->nodes[0];
		nritems = btrfs_header_nritems(leaf);
		if (nritems == 0)
			return 1;
		if (path->slots[0] == nritems)
			path->slots[0]--;

		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
		if (found_key.objectid < min_objectid)
			break;
		if (found_key.type == BTRFS_EXTENT_ITEM_KEY ||
		    found_key.type == BTRFS_METADATA_ITEM_KEY)
			return 0;
		if (found_key.objectid == min_objectid &&
		    found_key.type < BTRFS_EXTENT_ITEM_KEY)
			break;
	}
	return 1;
}

int __init btrfs_ctree_init(void)
{
	btrfs_path_cachep = kmem_cache_create("btrfs_path",
					      sizeof(struct btrfs_path), 0,
					      SLAB_MEM_SPREAD, NULL);
	if (!btrfs_path_cachep)
		return -ENOMEM;
	return 0;
}

void __cold btrfs_ctree_exit(void)
{
	kmem_cache_destroy(btrfs_path_cachep);
}