// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007,2008 Oracle.  All rights reserved.
 */

#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/rbtree.h>
#include <linux/mm.h>
#include <linux/error-injection.h>
#include "messages.h"
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "print-tree.h"
#include "locking.h"
#include "volumes.h"
#include "qgroup.h"
#include "tree-mod-log.h"
#include "tree-checker.h"
#include "fs.h"
#include "accessors.h"
#include "extent-tree.h"
#include "relocation.h"
#include "file-item.h"

static struct kmem_cache *btrfs_path_cachep;

static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
		      *root, struct btrfs_path *path, int level);
static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		      const struct btrfs_key *ins_key, struct btrfs_path *path,
		      int data_size, int extend);
static int push_node_left(struct btrfs_trans_handle *trans,
			  struct extent_buffer *dst,
			  struct extent_buffer *src, int empty);
static int balance_node_right(struct btrfs_trans_handle *trans,
			      struct extent_buffer *dst_buf,
			      struct extent_buffer *src_buf);

static const struct btrfs_csums {
	u16		size;
	const char	name[10];
	const char	driver[12];
} btrfs_csums[] = {
	[BTRFS_CSUM_TYPE_CRC32] = { .size = 4, .name = "crc32c" },
	[BTRFS_CSUM_TYPE_XXHASH] = { .size = 8, .name = "xxhash64" },
	[BTRFS_CSUM_TYPE_SHA256] = { .size = 32, .name = "sha256" },
	[BTRFS_CSUM_TYPE_BLAKE2] = { .size = 32, .name = "blake2b",
				     .driver = "blake2b-256" },
};

/*
 * The leaf data grows from end-to-front in the node.  This returns the address
 * of the start of the last item, which is where the leaf data stack stops.
 */
static unsigned int leaf_data_end(const struct extent_buffer *leaf)
{
	u32 nr = btrfs_header_nritems(leaf);

	if (nr == 0)
		return BTRFS_LEAF_DATA_SIZE(leaf->fs_info);
	return btrfs_item_offset(leaf, nr - 1);
}
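
/*
 * Illustrative sketch (ours, not part of the original file): the free space
 * in a leaf is the gap between the item array, which grows forward from the
 * start of the data area, and leaf_data_end(), since item data grows backward
 * from the end of the block.  This mirrors what btrfs_leaf_free_space()
 * computes.
 */
static u32 __maybe_unused leaf_free_space_sketch(const struct extent_buffer *leaf)
{
	u32 nritems = btrfs_header_nritems(leaf);

	/* Both values are offsets relative to the start of the leaf data area. */
	return leaf_data_end(leaf) - nritems * sizeof(struct btrfs_item);
}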

/*
 * Move data in a @leaf (using memmove, safe for overlapping ranges).
 *
 * @leaf:	leaf that we're doing a memmove on
 * @dst_offset:	item data offset we're moving to
 * @src_offset:	item data offset we're moving from
 * @len:	length of the data we're moving
 *
 * Wrapper around memmove_extent_buffer() that takes into account the header on
 * the leaf.  The btrfs_item offsets start directly after the header, so we
 * have to adjust any offsets to account for the header in the leaf.  This
 * handles that math to simplify the callers.
 */
static inline void memmove_leaf_data(const struct extent_buffer *leaf,
				     unsigned long dst_offset,
				     unsigned long src_offset,
				     unsigned long len)
{
	memmove_extent_buffer(leaf, btrfs_item_nr_offset(leaf, 0) + dst_offset,
			      btrfs_item_nr_offset(leaf, 0) + src_offset, len);
}

/*
 * Copy item data from @src into @dst at the given offsets.
 *
 * @dst:	destination leaf that we're copying into
 * @src:	source leaf that we're copying from
 * @dst_offset:	item data offset we're copying to
 * @src_offset:	item data offset we're copying from
 * @len:	length of the data we're copying
 *
 * Wrapper around copy_extent_buffer() that takes into account the header on
 * the leaf.  The btrfs_item offsets start directly after the header, so we
 * have to adjust any offsets to account for the header in the leaf.  This
 * handles that math to simplify the callers.
 */
static inline void copy_leaf_data(const struct extent_buffer *dst,
				  const struct extent_buffer *src,
				  unsigned long dst_offset,
				  unsigned long src_offset, unsigned long len)
{
	copy_extent_buffer(dst, src, btrfs_item_nr_offset(dst, 0) + dst_offset,
			   btrfs_item_nr_offset(src, 0) + src_offset, len);
}

/*
 * Move items in a @leaf (using memmove).
 *
 * @leaf:	leaf that we're moving items within
 * @dst_item:	the item nr we're copying into
 * @src_item:	the item nr we're copying from
 * @nr_items:	the number of items to copy
 *
 * Wrapper around memmove_extent_buffer() that does the math to get the
 * appropriate offsets into the leaf from the item numbers.
 */
static inline void memmove_leaf_items(const struct extent_buffer *leaf,
				      int dst_item, int src_item, int nr_items)
{
	memmove_extent_buffer(leaf, btrfs_item_nr_offset(leaf, dst_item),
			      btrfs_item_nr_offset(leaf, src_item),
			      nr_items * sizeof(struct btrfs_item));
}

/*
 * Copy items from @src into @dst at the given item positions.
 *
 * @dst:	destination leaf for the items
 * @src:	source leaf for the items
 * @dst_item:	the item nr we're copying into
 * @src_item:	the item nr we're copying from
 * @nr_items:	the number of items to copy
 *
 * Wrapper around copy_extent_buffer() that does the math to get the
 * appropriate offsets into the leaf from the item numbers.
 */
static inline void copy_leaf_items(const struct extent_buffer *dst,
				   const struct extent_buffer *src,
				   int dst_item, int src_item, int nr_items)
{
	copy_extent_buffer(dst, src, btrfs_item_nr_offset(dst, dst_item),
			   btrfs_item_nr_offset(src, src_item),
			   nr_items * sizeof(struct btrfs_item));
}

/* This exists for btrfs-progs usages. */
u16 btrfs_csum_type_size(u16 type)
{
	return btrfs_csums[type].size;
}

int btrfs_super_csum_size(const struct btrfs_super_block *s)
{
	u16 t = btrfs_super_csum_type(s);

	/*
	 * csum type is validated at mount time
	 */
	return btrfs_csum_type_size(t);
}

const char *btrfs_super_csum_name(u16 csum_type)
{
	/* csum type is validated at mount time */
	return btrfs_csums[csum_type].name;
}

/*
 * Return driver name if defined, otherwise the name that's also a valid driver
 * name.
 */
const char *btrfs_super_csum_driver(u16 csum_type)
{
	/* csum type is validated at mount time */
	return btrfs_csums[csum_type].driver[0] ?
		btrfs_csums[csum_type].driver :
		btrfs_csums[csum_type].name;
}

size_t __attribute_const__ btrfs_get_num_csums(void)
{
	return ARRAY_SIZE(btrfs_csums);
}
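
/*
 * Illustrative sketch (ours): resolving the checksum type from the super
 * block into a crypto API driver name, as the mount path does when it
 * allocates the checksum shash.  Assumes <crypto/hash.h> is available, as
 * it is in disk-io.c.
 */
static struct crypto_shash * __maybe_unused csum_shash_sketch(const struct btrfs_super_block *sb)
{
	const u16 csum_type = btrfs_super_csum_type(sb);

	/* e.g. BTRFS_CSUM_TYPE_BLAKE2 maps to the "blake2b-256" driver */
	return crypto_alloc_shash(btrfs_super_csum_driver(csum_type), 0, 0);
}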

struct btrfs_path *btrfs_alloc_path(void)
{
	might_sleep();

	return kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS);
}

/* this also releases the path */
void btrfs_free_path(struct btrfs_path *p)
{
	if (!p)
		return;
	btrfs_release_path(p);
	kmem_cache_free(btrfs_path_cachep, p);
}
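
/*
 * Illustrative usage sketch (ours): the common allocate/search/release
 * pattern for paths.  btrfs_free_path() accepts NULL and releases the path
 * first, so no separate btrfs_release_path() call is needed on exit.
 */
static int __maybe_unused path_usage_sketch(struct btrfs_root *root,
					    const struct btrfs_key *key)
{
	struct btrfs_path *path;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
	/* ... use path->nodes[0] and path->slots[0] on success ... */
	btrfs_free_path(path);
	return ret;
}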

/*
 * path release drops references on the extent buffers in the path
 * and it drops any locks held by this path
 *
 * It is safe to call this on paths that have no locks or extent buffers held.
 */
noinline void btrfs_release_path(struct btrfs_path *p)
{
	int i;

	for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
		p->slots[i] = 0;
		if (!p->nodes[i])
			continue;
		if (p->locks[i]) {
			btrfs_tree_unlock_rw(p->nodes[i], p->locks[i]);
			p->locks[i] = 0;
		}
		free_extent_buffer(p->nodes[i]);
		p->nodes[i] = NULL;
	}
}

/*
 * We want the transaction abort to print stack trace only for errors where the
 * cause could be a bug, eg. due to ENOSPC, and not for common errors that are
 * caused by external factors.
 */
bool __cold abort_should_print_stack(int error)
{
	switch (error) {
	case -EIO:
	case -EROFS:
	case -ENOMEM:
		return false;
	}
	return true;
}

/*
 * safely gets a reference on the root node of a tree.  A lock
 * is not taken, so a concurrent writer may put a different node
 * at the root of the tree.  See btrfs_lock_root_node for the
 * looping required.
 *
 * The extent buffer returned by this has a reference taken, so
 * it won't disappear.  It may stop being the root of the tree
 * at any time because there are no locks held.
 */
struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
{
	struct extent_buffer *eb;

	while (1) {
		rcu_read_lock();
		eb = rcu_dereference(root->node);

		/*
		 * RCU really hurts here, we could free up the root node because
		 * it was COWed but we may not get the new root node yet so do
		 * the inc_not_zero dance and if it doesn't work then
		 * synchronize_rcu and try again.
		 */
		if (atomic_inc_not_zero(&eb->refs)) {
			rcu_read_unlock();
			break;
		}
		rcu_read_unlock();
		synchronize_rcu();
	}
	return eb;
}
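
/*
 * Sketch (ours) of the looping the comment above refers to, mirroring
 * btrfs_lock_root_node() in locking.c: grab a reference, lock the buffer,
 * then verify it is still the root, since a concurrent COW may have
 * installed a new root node in between.
 */
static struct extent_buffer * __maybe_unused lock_root_sketch(struct btrfs_root *root)
{
	struct extent_buffer *eb;

	while (1) {
		eb = btrfs_root_node(root);
		btrfs_tree_lock(eb);
		if (eb == root->node)
			break;
		/* Lost the race, drop the lock and the reference and retry. */
		btrfs_tree_unlock(eb);
		free_extent_buffer(eb);
	}
	return eb;
}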

/*
 * Cowonly roots (not-shareable trees, everything not subvolume or reloc
 * roots) just get put onto a simple dirty list.  Transaction commit walks
 * this list to make sure they get properly updated on disk.
 */
static void add_root_to_dirty_list(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;

	if (test_bit(BTRFS_ROOT_DIRTY, &root->state) ||
	    !test_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state))
		return;

	spin_lock(&fs_info->trans_lock);
	if (!test_and_set_bit(BTRFS_ROOT_DIRTY, &root->state)) {
		/* Want the extent tree to be the last on the list */
		if (btrfs_root_id(root) == BTRFS_EXTENT_TREE_OBJECTID)
			list_move_tail(&root->dirty_list,
				       &fs_info->dirty_cowonly_roots);
		else
			list_move(&root->dirty_list,
				  &fs_info->dirty_cowonly_roots);
	}
	spin_unlock(&fs_info->trans_lock);
}

/*
 * used by snapshot creation to make a copy of a root for a tree with
 * a given objectid.  The buffer with the new root node is returned in
 * cow_ret, and this function returns zero on success or a negative error code.
 */
int btrfs_copy_root(struct btrfs_trans_handle *trans,
		    struct btrfs_root *root,
		    struct extent_buffer *buf,
		    struct extent_buffer **cow_ret, u64 new_root_objectid)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *cow;
	int ret = 0;
	int level;
	struct btrfs_disk_key disk_key;
	u64 reloc_src_root = 0;

	WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
		trans->transid != fs_info->running_transaction->transid);
	WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
		trans->transid != btrfs_get_root_last_trans(root));

	level = btrfs_header_level(buf);
	if (level == 0)
		btrfs_item_key(buf, &disk_key, 0);
	else
		btrfs_node_key(buf, &disk_key, 0);

	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
		reloc_src_root = btrfs_header_owner(buf);
	cow = btrfs_alloc_tree_block(trans, root, 0, new_root_objectid,
				     &disk_key, level, buf->start, 0,
				     reloc_src_root, BTRFS_NESTING_NEW_ROOT);
	if (IS_ERR(cow))
		return PTR_ERR(cow);

	copy_extent_buffer_full(cow, buf);
	btrfs_set_header_bytenr(cow, cow->start);
	btrfs_set_header_generation(cow, trans->transid);
	btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
				     BTRFS_HEADER_FLAG_RELOC);
	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
		btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
	else
		btrfs_set_header_owner(cow, new_root_objectid);

	write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid);

	WARN_ON(btrfs_header_generation(buf) > trans->transid);
	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
		ret = btrfs_inc_ref(trans, root, cow, 1);
	else
		ret = btrfs_inc_ref(trans, root, cow, 0);
	if (ret) {
		btrfs_tree_unlock(cow);
		free_extent_buffer(cow);
		btrfs_abort_transaction(trans, ret);
		return ret;
	}

	btrfs_mark_buffer_dirty(trans, cow);
	*cow_ret = cow;
	return 0;
}

/*
 * check if the tree block can be shared by multiple trees
 */
bool btrfs_block_can_be_shared(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root,
			       struct extent_buffer *buf)
{
	const u64 buf_gen = btrfs_header_generation(buf);

	/*
	 * Tree blocks not in shareable trees and tree roots are never shared.
	 * If a block was allocated after the last snapshot and the block was
	 * not allocated by tree relocation, we know the block is not shared.
	 */

	if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
		return false;

	if (buf == root->node)
		return false;

	if (buf_gen > btrfs_root_last_snapshot(&root->root_item) &&
	    !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC))
		return false;

	if (buf != root->commit_root)
		return true;

	/*
	 * An extent buffer that used to be the commit root may still be shared
	 * because the tree height may have increased and it became a child of a
	 * higher level root.  This can happen when snapshotting a subvolume
	 * created in the current transaction.
	 */
	if (buf_gen == trans->transid)
		return true;

	return false;
}

static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
				       struct btrfs_root *root,
				       struct extent_buffer *buf,
				       struct extent_buffer *cow,
				       int *last_ref)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 refs;
	u64 owner;
	u64 flags;
	int ret;

	/*
	 * Backrefs update rules:
	 *
	 * Always use full backrefs for extent pointers in tree blocks
	 * allocated by tree relocation.
	 *
	 * If a shared tree block is no longer referenced by its owner
	 * tree (btrfs_header_owner(buf) == root->root_key.objectid),
	 * use full backrefs for extent pointers in the tree block.
	 *
	 * If a tree block is being relocated
	 * (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID),
	 * use full backrefs for extent pointers in the tree block.
	 * The reason for this is that some operations (such as drop
	 * tree) are only allowed for blocks that use full backrefs.
	 */

	if (btrfs_block_can_be_shared(trans, root, buf)) {
		ret = btrfs_lookup_extent_info(trans, fs_info, buf->start,
					       btrfs_header_level(buf), 1,
					       &refs, &flags, NULL);
		if (ret)
			return ret;
		if (unlikely(refs == 0)) {
			btrfs_crit(fs_info,
		"found 0 references for tree block at bytenr %llu level %d root %llu",
				   buf->start, btrfs_header_level(buf),
				   btrfs_root_id(root));
			ret = -EUCLEAN;
			btrfs_abort_transaction(trans, ret);
			return ret;
		}
	} else {
		refs = 1;
		if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID ||
		    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
			flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
		else
			flags = 0;
	}

	owner = btrfs_header_owner(buf);
	if (unlikely(owner == BTRFS_TREE_RELOC_OBJECTID &&
		     !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))) {
		btrfs_crit(fs_info,
"found tree block at bytenr %llu level %d root %llu refs %llu flags %llx without full backref flag set",
			   buf->start, btrfs_header_level(buf),
			   btrfs_root_id(root), refs, flags);
		ret = -EUCLEAN;
		btrfs_abort_transaction(trans, ret);
		return ret;
	}

	if (refs > 1) {
		if ((owner == btrfs_root_id(root) ||
		     btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID) &&
		    !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
			ret = btrfs_inc_ref(trans, root, buf, 1);
			if (ret)
				return ret;

			if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID) {
				ret = btrfs_dec_ref(trans, root, buf, 0);
				if (ret)
					return ret;
				ret = btrfs_inc_ref(trans, root, cow, 1);
				if (ret)
					return ret;
			}
			ret = btrfs_set_disk_extent_flags(trans, buf,
						BTRFS_BLOCK_FLAG_FULL_BACKREF);
			if (ret)
				return ret;
		} else {
			if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID)
				ret = btrfs_inc_ref(trans, root, cow, 1);
			else
				ret = btrfs_inc_ref(trans, root, cow, 0);
			if (ret)
				return ret;
		}
	} else {
		if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
			if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID)
				ret = btrfs_inc_ref(trans, root, cow, 1);
			else
				ret = btrfs_inc_ref(trans, root, cow, 0);
			if (ret)
				return ret;
			ret = btrfs_dec_ref(trans, root, buf, 1);
			if (ret)
				return ret;
		}
		btrfs_clear_buffer_dirty(trans, buf);
		*last_ref = 1;
	}
	return 0;
}

/*
 * does the dirty work in cow of a single block.  The parent block (if
 * supplied) is updated to point to the new cow copy.  The new buffer is marked
 * dirty and returned locked.  If you modify the block it needs to be marked
 * dirty again.
 *
 * search_start -- an allocation hint for the new block
 *
 * empty_size -- a hint that you plan on doing more cow.  This is the size in
 * bytes the allocator should try to find free next to the block it returns.
 * This is just a hint and may be ignored by the allocator.
 */
int btrfs_force_cow_block(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root,
			  struct extent_buffer *buf,
			  struct extent_buffer *parent, int parent_slot,
			  struct extent_buffer **cow_ret,
			  u64 search_start, u64 empty_size,
			  enum btrfs_lock_nesting nest)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_disk_key disk_key;
	struct extent_buffer *cow;
	int level, ret;
	int last_ref = 0;
	int unlock_orig = 0;
	u64 parent_start = 0;
	u64 reloc_src_root = 0;

	if (*cow_ret == buf)
		unlock_orig = 1;

	btrfs_assert_tree_write_locked(buf);

	WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
		trans->transid != fs_info->running_transaction->transid);
	WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
		trans->transid != btrfs_get_root_last_trans(root));

	level = btrfs_header_level(buf);

	if (level == 0)
		btrfs_item_key(buf, &disk_key, 0);
	else
		btrfs_node_key(buf, &disk_key, 0);

	if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID) {
		if (parent)
			parent_start = parent->start;
		reloc_src_root = btrfs_header_owner(buf);
	}
	cow = btrfs_alloc_tree_block(trans, root, parent_start,
				     btrfs_root_id(root), &disk_key, level,
				     search_start, empty_size, reloc_src_root, nest);
	if (IS_ERR(cow))
		return PTR_ERR(cow);

	/* cow is set to blocking by btrfs_init_new_buffer */

	copy_extent_buffer_full(cow, buf);
	btrfs_set_header_bytenr(cow, cow->start);
	btrfs_set_header_generation(cow, trans->transid);
	btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
				     BTRFS_HEADER_FLAG_RELOC);
	if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID)
		btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
	else
		btrfs_set_header_owner(cow, btrfs_root_id(root));

	write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid);

	ret = update_ref_for_cow(trans, root, buf, cow, &last_ref);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto error_unlock_cow;
	}

	if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) {
		ret = btrfs_reloc_cow_block(trans, root, buf, cow);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			goto error_unlock_cow;
		}
	}

	if (buf == root->node) {
		WARN_ON(parent && parent != buf);
		if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID ||
		    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
			parent_start = buf->start;

		ret = btrfs_tree_mod_log_insert_root(root->node, cow, true);
		if (ret < 0) {
			btrfs_abort_transaction(trans, ret);
			goto error_unlock_cow;
		}
		atomic_inc(&cow->refs);
		rcu_assign_pointer(root->node, cow);

		ret = btrfs_free_tree_block(trans, btrfs_root_id(root), buf,
					    parent_start, last_ref);
		free_extent_buffer(buf);
		add_root_to_dirty_list(root);
		if (ret < 0) {
			btrfs_abort_transaction(trans, ret);
			goto error_unlock_cow;
		}
	} else {
		WARN_ON(trans->transid != btrfs_header_generation(parent));
		ret = btrfs_tree_mod_log_insert_key(parent, parent_slot,
						    BTRFS_MOD_LOG_KEY_REPLACE);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			goto error_unlock_cow;
		}
		btrfs_set_node_blockptr(parent, parent_slot, cow->start);
		btrfs_set_node_ptr_generation(parent, parent_slot,
					      trans->transid);
		btrfs_mark_buffer_dirty(trans, parent);
		if (last_ref) {
			ret = btrfs_tree_mod_log_free_eb(buf);
			if (ret) {
				btrfs_abort_transaction(trans, ret);
				goto error_unlock_cow;
			}
		}
		ret = btrfs_free_tree_block(trans, btrfs_root_id(root), buf,
					    parent_start, last_ref);
		if (ret < 0) {
			btrfs_abort_transaction(trans, ret);
			goto error_unlock_cow;
		}
	}

	trace_btrfs_cow_block(root, buf, cow);
	if (unlock_orig)
		btrfs_tree_unlock(buf);
	free_extent_buffer_stale(buf);
	btrfs_mark_buffer_dirty(trans, cow);
	*cow_ret = cow;
	return 0;

error_unlock_cow:
	btrfs_tree_unlock(cow);
	free_extent_buffer(cow);
	return ret;
}

static inline int should_cow_block(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root,
				   struct extent_buffer *buf)
{
	if (btrfs_is_testing(root->fs_info))
		return 0;

	/* Ensure we can see the FORCE_COW bit */
	smp_mb__before_atomic();

	/*
	 * We do not need to cow a block if
	 * 1) this block is not created or changed in this transaction;
	 * 2) this block does not belong to TREE_RELOC tree;
	 * 3) the root is not forced COW.
	 *
	 * What is forced COW:
	 *    when we create a snapshot during committing the transaction,
	 *    after we've finished copying the source root, we must COW the
	 *    shared block to ensure metadata consistency.
	 */
	if (btrfs_header_generation(buf) == trans->transid &&
	    !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) &&
	    !(btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID &&
	      btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) &&
	    !test_bit(BTRFS_ROOT_FORCE_COW, &root->state))
		return 0;
	return 1;
}

/*
 * COWs a single block, see btrfs_force_cow_block() for the real work.
 * This version of it has extra checks so that a block isn't COWed more than
 * once per transaction, as long as it hasn't been written yet.
 */
int btrfs_cow_block(struct btrfs_trans_handle *trans,
		    struct btrfs_root *root, struct extent_buffer *buf,
		    struct extent_buffer *parent, int parent_slot,
		    struct extent_buffer **cow_ret,
		    enum btrfs_lock_nesting nest)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 search_start;

	if (unlikely(test_bit(BTRFS_ROOT_DELETING, &root->state))) {
		btrfs_abort_transaction(trans, -EUCLEAN);
		btrfs_crit(fs_info,
		"attempt to COW block %llu on root %llu that is being deleted",
			   buf->start, btrfs_root_id(root));
		return -EUCLEAN;
	}

	/*
	 * COWing must happen through a running transaction, which always
	 * matches the current fs generation (it's a transaction with a state
	 * less than TRANS_STATE_UNBLOCKED).  If it doesn't, then turn the fs
	 * into error state to prevent the commit of any transaction.
	 */
	if (unlikely(trans->transaction != fs_info->running_transaction ||
		     trans->transid != fs_info->generation)) {
		btrfs_abort_transaction(trans, -EUCLEAN);
		btrfs_crit(fs_info,
"unexpected transaction when attempting to COW block %llu on root %llu, transaction %llu running transaction %llu fs generation %llu",
			   buf->start, btrfs_root_id(root), trans->transid,
			   fs_info->running_transaction->transid,
			   fs_info->generation);
		return -EUCLEAN;
	}

	if (!should_cow_block(trans, root, buf)) {
		*cow_ret = buf;
		return 0;
	}

	search_start = round_down(buf->start, SZ_1G);

	/*
	 * Before CoWing this block for later modification, check if it's
	 * the subtree root and do the delayed subtree trace if needed.
	 *
	 * Also, we don't care about the error, as it's handled internally.
	 */
	btrfs_qgroup_trace_subtree_after_cow(trans, root, buf);
	return btrfs_force_cow_block(trans, root, buf, parent, parent_slot,
				     cow_ret, search_start, 0, nest);
}
ALLOW_ERROR_INJECTION(btrfs_cow_block, ERRNO);
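
/*
 * Illustrative sketch (ours): a typical btrfs_cow_block() call site for the
 * root node (no parent, slot 0).  If should_cow_block() decides no COW is
 * needed, the original buffer is handed back in 'eb'.  Error handling and
 * the actual modifications are elided; this is a simplified outline, not a
 * definitive call sequence.
 */
static int __maybe_unused cow_root_sketch(struct btrfs_trans_handle *trans,
					  struct btrfs_root *root)
{
	struct extent_buffer *eb;
	int ret;

	eb = btrfs_lock_root_node(root);
	ret = btrfs_cow_block(trans, root, eb, NULL, 0, &eb, BTRFS_NESTING_NORMAL);
	/* On success 'eb' is the (possibly new) COWed root, locked and dirty. */
	btrfs_tree_unlock(eb);
	free_extent_buffer(eb);
	return ret;
}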

/*
 * Same as btrfs_comp_keys(), but comparing two CPU keys (struct btrfs_key)
 * instead of a disk key and a CPU key.
 */
int __pure btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2)
{
	if (k1->objectid > k2->objectid)
		return 1;
	if (k1->objectid < k2->objectid)
		return -1;
	if (k1->type > k2->type)
		return 1;
	if (k1->type < k2->type)
		return -1;
	if (k1->offset > k2->offset)
		return 1;
	if (k1->offset < k2->offset)
		return -1;
	return 0;
}
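
/*
 * Illustrative check (ours): keys order by objectid, then type, then offset.
 * For the same objectid, BTRFS_INODE_ITEM_KEY (1) sorts before
 * BTRFS_INODE_REF_KEY (12).
 */
static void __maybe_unused key_order_sketch(void)
{
	const struct btrfs_key a = { .objectid = 256, .type = BTRFS_INODE_ITEM_KEY, .offset = 0 };
	const struct btrfs_key b = { .objectid = 256, .type = BTRFS_INODE_REF_KEY, .offset = 0 };

	ASSERT(btrfs_comp_cpu_keys(&a, &b) < 0);
}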

/*
 * Search for a key in the given extent_buffer.
 *
 * The lower boundary for the search is specified by the slot number @first_slot.
 * Use a value of 0 to search over the whole extent buffer.  Works for both
 * leaves and nodes.
 *
 * The slot in the extent buffer is returned via @slot.  If the key exists in the
 * extent buffer, then @slot will point to the slot where the key is, otherwise
 * it points to the slot where you would insert the key.
 *
 * Slot may point to the total number of items (i.e. one position beyond the last
 * key) if the key is bigger than the last key in the extent buffer.
 */
int btrfs_bin_search(struct extent_buffer *eb, int first_slot,
		     const struct btrfs_key *key, int *slot)
{
	unsigned long p;
	int item_size;
	/*
	 * Use unsigned types for the low and high slots, so that we get a more
	 * efficient division in the search loop below.
	 */
	u32 low = first_slot;
	u32 high = btrfs_header_nritems(eb);
	int ret;
	const int key_size = sizeof(struct btrfs_disk_key);

	if (unlikely(low > high)) {
		btrfs_err(eb->fs_info,
		 "%s: low (%u) > high (%u) eb %llu owner %llu level %d",
			  __func__, low, high, eb->start,
			  btrfs_header_owner(eb), btrfs_header_level(eb));
		return -EINVAL;
	}

	if (btrfs_header_level(eb) == 0) {
		p = offsetof(struct btrfs_leaf, items);
		item_size = sizeof(struct btrfs_item);
	} else {
		p = offsetof(struct btrfs_node, ptrs);
		item_size = sizeof(struct btrfs_key_ptr);
	}

	while (low < high) {
		const int unit_size = eb->folio_size;
		unsigned long oil;
		unsigned long offset;
		struct btrfs_disk_key *tmp;
		struct btrfs_disk_key unaligned;
		int mid;

		mid = (low + high) / 2;
		offset = p + mid * item_size;
		oil = get_eb_offset_in_folio(eb, offset);

		if (oil + key_size <= unit_size) {
			const unsigned long idx = get_eb_folio_index(eb, offset);
			char *kaddr = folio_address(eb->folios[idx]);

			oil = get_eb_offset_in_folio(eb, offset);
			tmp = (struct btrfs_disk_key *)(kaddr + oil);
		} else {
			read_extent_buffer(eb, &unaligned, offset, key_size);
			tmp = &unaligned;
		}

		ret = btrfs_comp_keys(tmp, key);

		if (ret < 0)
			low = mid + 1;
		else if (ret > 0)
			high = mid;
		else {
			*slot = mid;
			return 0;
		}
	}
	*slot = low;
	return 1;
}
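
/*
 * Illustrative sketch (ours) of the return convention: 0 means an exact
 * match at *slot, 1 means *slot is the insertion position (which may equal
 * nritems), negative means an error.
 */
static int __maybe_unused bin_search_sketch(struct extent_buffer *eb,
					    const struct btrfs_key *key)
{
	int slot;
	int ret;

	ret = btrfs_bin_search(eb, 0, key, &slot);
	if (ret < 0)
		return ret;
	if (ret == 0) {
		/* exact match, the key lives at 'slot' */
	} else {
		/* not found, 'slot' is where the key would be inserted */
	}
	return slot;
}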

static void root_add_used_bytes(struct btrfs_root *root)
{
	spin_lock(&root->accounting_lock);
	btrfs_set_root_used(&root->root_item,
		btrfs_root_used(&root->root_item) + root->fs_info->nodesize);
	spin_unlock(&root->accounting_lock);
}

static void root_sub_used_bytes(struct btrfs_root *root)
{
	spin_lock(&root->accounting_lock);
	btrfs_set_root_used(&root->root_item,
		btrfs_root_used(&root->root_item) - root->fs_info->nodesize);
	spin_unlock(&root->accounting_lock);
}

/*
 * Given a node and slot number, this reads the block the slot points to.
 * The extent buffer is returned with a reference taken (but unlocked).
 */
struct extent_buffer *btrfs_read_node_slot(struct extent_buffer *parent,
					   int slot)
{
	int level = btrfs_header_level(parent);
	struct btrfs_tree_parent_check check = { 0 };
	struct extent_buffer *eb;

	if (slot < 0 || slot >= btrfs_header_nritems(parent))
		return ERR_PTR(-ENOENT);

	ASSERT(level);

	check.level = level - 1;
	check.transid = btrfs_node_ptr_generation(parent, slot);
	check.owner_root = btrfs_header_owner(parent);
	check.has_first_key = true;
	btrfs_node_key_to_cpu(parent, &check.first_key, slot);

	eb = read_tree_block(parent->fs_info, btrfs_node_blockptr(parent, slot),
			     &check);
	if (IS_ERR(eb))
		return eb;
	if (!extent_buffer_uptodate(eb)) {
		free_extent_buffer(eb);
		return ERR_PTR(-EIO);
	}

	return eb;
}
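
/*
 * Illustrative sketch (ours): reading a child node and dropping the
 * reference, the same pattern balance_level() below uses.  The returned
 * buffer is referenced but not locked.
 */
static int __maybe_unused read_child_sketch(struct extent_buffer *parent, int slot)
{
	struct extent_buffer *child;

	child = btrfs_read_node_slot(parent, slot);
	if (IS_ERR(child))
		return PTR_ERR(child);
	/* ... inspect the child, locking it first if needed ... */
	free_extent_buffer(child);
	return 0;
}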

/*
 * node level balancing, used to make sure nodes are in proper order for
 * item deletion.  We balance from the top down, so we have to make sure
 * that a deletion won't leave a node completely empty later on.
 */
static noinline int balance_level(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root,
				  struct btrfs_path *path, int level)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *right = NULL;
	struct extent_buffer *mid;
	struct extent_buffer *left = NULL;
	struct extent_buffer *parent = NULL;
	int ret = 0;
	int wret;
	int pslot;
	int orig_slot = path->slots[level];
	u64 orig_ptr;

	ASSERT(level > 0);

	mid = path->nodes[level];

	WARN_ON(path->locks[level] != BTRFS_WRITE_LOCK);
	WARN_ON(btrfs_header_generation(mid) != trans->transid);

	orig_ptr = btrfs_node_blockptr(mid, orig_slot);

	if (level < BTRFS_MAX_LEVEL - 1) {
		parent = path->nodes[level + 1];
		pslot = path->slots[level + 1];
	}

	/*
	 * deal with the case where there is only one pointer in the root
	 * by promoting the node below to a root
	 */
	if (!parent) {
		struct extent_buffer *child;

		if (btrfs_header_nritems(mid) != 1)
			return 0;

		/* promote the child to a root */
		child = btrfs_read_node_slot(mid, 0);
		if (IS_ERR(child)) {
			ret = PTR_ERR(child);
			goto out;
		}

		btrfs_tree_lock(child);
		ret = btrfs_cow_block(trans, root, child, mid, 0, &child,
				      BTRFS_NESTING_COW);
		if (ret) {
			btrfs_tree_unlock(child);
			free_extent_buffer(child);
			goto out;
		}

		ret = btrfs_tree_mod_log_insert_root(root->node, child, true);
		if (ret < 0) {
			btrfs_tree_unlock(child);
			free_extent_buffer(child);
			btrfs_abort_transaction(trans, ret);
			goto out;
		}
		rcu_assign_pointer(root->node, child);

		add_root_to_dirty_list(root);
		btrfs_tree_unlock(child);

		path->locks[level] = 0;
		path->nodes[level] = NULL;
		btrfs_clear_buffer_dirty(trans, mid);
		btrfs_tree_unlock(mid);
		/* once for the path */
		free_extent_buffer(mid);

		root_sub_used_bytes(root);
		ret = btrfs_free_tree_block(trans, btrfs_root_id(root), mid, 0, 1);
		/* once for the root ptr */
		free_extent_buffer_stale(mid);
		if (ret < 0) {
			btrfs_abort_transaction(trans, ret);
			goto out;
		}
		return 0;
	}
	if (btrfs_header_nritems(mid) >
	    BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 4)
		return 0;

	if (pslot) {
		left = btrfs_read_node_slot(parent, pslot - 1);
		if (IS_ERR(left)) {
			ret = PTR_ERR(left);
			left = NULL;
			goto out;
		}

		btrfs_tree_lock_nested(left, BTRFS_NESTING_LEFT);
		wret = btrfs_cow_block(trans, root, left,
				       parent, pslot - 1, &left,
				       BTRFS_NESTING_LEFT_COW);
		if (wret) {
			ret = wret;
			goto out;
		}
	}

	if (pslot + 1 < btrfs_header_nritems(parent)) {
		right = btrfs_read_node_slot(parent, pslot + 1);
		if (IS_ERR(right)) {
			ret = PTR_ERR(right);
			right = NULL;
			goto out;
		}

		btrfs_tree_lock_nested(right, BTRFS_NESTING_RIGHT);
		wret = btrfs_cow_block(trans, root, right,
				       parent, pslot + 1, &right,
				       BTRFS_NESTING_RIGHT_COW);
		if (wret) {
			ret = wret;
			goto out;
		}
	}

	/* first, try to make some room in the middle buffer */
	if (left) {
		orig_slot += btrfs_header_nritems(left);
		wret = push_node_left(trans, left, mid, 1);
		if (wret < 0)
			ret = wret;
	}

	/*
	 * then try to empty the right most buffer into the middle
	 */
	if (right) {
		wret = push_node_left(trans, mid, right, 1);
		if (wret < 0 && wret != -ENOSPC)
			ret = wret;
		if (btrfs_header_nritems(right) == 0) {
			btrfs_clear_buffer_dirty(trans, right);
			btrfs_tree_unlock(right);
			ret = btrfs_del_ptr(trans, root, path, level + 1, pslot + 1);
			if (ret < 0) {
				free_extent_buffer_stale(right);
				right = NULL;
				goto out;
			}
			root_sub_used_bytes(root);
			ret = btrfs_free_tree_block(trans, btrfs_root_id(root),
						    right, 0, 1);
			free_extent_buffer_stale(right);
			right = NULL;
			if (ret < 0) {
				btrfs_abort_transaction(trans, ret);
				goto out;
			}
		} else {
			struct btrfs_disk_key right_key;

			btrfs_node_key(right, &right_key, 0);
			ret = btrfs_tree_mod_log_insert_key(parent, pslot + 1,
					BTRFS_MOD_LOG_KEY_REPLACE);
			if (ret < 0) {
				btrfs_abort_transaction(trans, ret);
				goto out;
			}
			btrfs_set_node_key(parent, &right_key, pslot + 1);
			btrfs_mark_buffer_dirty(trans, parent);
		}
	}
	if (btrfs_header_nritems(mid) == 1) {
		/*
		 * we're not allowed to leave a node with one item in the
		 * tree during a delete.  A deletion from lower in the tree
		 * could try to delete the only pointer in this node.
		 * So, pull some keys from the left.
		 * There has to be a left pointer at this point because
		 * otherwise we would have pulled some pointers from the
		 * right.
		 */
		if (unlikely(!left)) {
			btrfs_crit(fs_info,
"missing left child when middle child only has 1 item, parent bytenr %llu level %d mid bytenr %llu root %llu",
				   parent->start, btrfs_header_level(parent),
				   mid->start, btrfs_root_id(root));
			ret = -EUCLEAN;
			btrfs_abort_transaction(trans, ret);
			goto out;
		}
		wret = balance_node_right(trans, mid, left);
		if (wret < 0) {
			ret = wret;
			goto out;
		}
		if (wret == 1) {
			wret = push_node_left(trans, left, mid, 1);
			if (wret < 0)
				ret = wret;
		}
		BUG_ON(wret == 1);
	}
	if (btrfs_header_nritems(mid) == 0) {
		btrfs_clear_buffer_dirty(trans, mid);
		btrfs_tree_unlock(mid);
		ret = btrfs_del_ptr(trans, root, path, level + 1, pslot);
		if (ret < 0) {
			free_extent_buffer_stale(mid);
			mid = NULL;
			goto out;
		}
		root_sub_used_bytes(root);
		ret = btrfs_free_tree_block(trans, btrfs_root_id(root), mid, 0, 1);
		free_extent_buffer_stale(mid);
		mid = NULL;
		if (ret < 0) {
			btrfs_abort_transaction(trans, ret);
			goto out;
		}
	} else {
		/* update the parent key to reflect our changes */
		struct btrfs_disk_key mid_key;

		btrfs_node_key(mid, &mid_key, 0);
		ret = btrfs_tree_mod_log_insert_key(parent, pslot,
						    BTRFS_MOD_LOG_KEY_REPLACE);
		if (ret < 0) {
			btrfs_abort_transaction(trans, ret);
			goto out;
		}
		btrfs_set_node_key(parent, &mid_key, pslot);
		btrfs_mark_buffer_dirty(trans, parent);
	}

	/* update the path */
	if (left) {
		if (btrfs_header_nritems(left) > orig_slot) {
			atomic_inc(&left->refs);
			/* left was locked after cow */
			path->nodes[level] = left;
			path->slots[level + 1] -= 1;
			path->slots[level] = orig_slot;
			if (mid) {
				btrfs_tree_unlock(mid);
				free_extent_buffer(mid);
			}
		} else {
			orig_slot -= btrfs_header_nritems(left);
			path->slots[level] = orig_slot;
		}
	}
	/* double check we haven't messed things up */
	if (orig_ptr !=
	    btrfs_node_blockptr(path->nodes[level], path->slots[level]))
		BUG();
out:
	if (right) {
		btrfs_tree_unlock(right);
		free_extent_buffer(right);
	}
	if (left) {
		if (path->nodes[level] != left)
			btrfs_tree_unlock(left);
		free_extent_buffer(left);
	}
	return ret;
}

/*
 * Node balancing for insertion.  Here we only split or push nodes around
 * when they are completely full.  This is also done top down, so we
 * have to be pessimistic.
 */
static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
					  struct btrfs_root *root,
					  struct btrfs_path *path, int level)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *right = NULL;
	struct extent_buffer *mid;
	struct extent_buffer *left = NULL;
	struct extent_buffer *parent = NULL;
	int ret = 0;
	int wret;
	int pslot;
	int orig_slot = path->slots[level];

	if (level == 0)
		return 1;

	mid = path->nodes[level];
	WARN_ON(btrfs_header_generation(mid) != trans->transid);

	if (level < BTRFS_MAX_LEVEL - 1) {
		parent = path->nodes[level + 1];
		pslot = path->slots[level + 1];
	}

	if (!parent)
		return 1;

	/* first, try to make some room in the middle buffer */
	if (pslot) {
		u32 left_nr;

		left = btrfs_read_node_slot(parent, pslot - 1);
		if (IS_ERR(left))
			return PTR_ERR(left);

		btrfs_tree_lock_nested(left, BTRFS_NESTING_LEFT);

		left_nr = btrfs_header_nritems(left);
		if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
			wret = 1;
		} else {
			ret = btrfs_cow_block(trans, root, left, parent,
					      pslot - 1, &left,
					      BTRFS_NESTING_LEFT_COW);
			if (ret)
				wret = 1;
			else {
				wret = push_node_left(trans, left, mid, 0);
			}
		}
		if (wret < 0)
			ret = wret;
		if (wret == 0) {
			struct btrfs_disk_key disk_key;

			orig_slot += left_nr;
			btrfs_node_key(mid, &disk_key, 0);
			ret = btrfs_tree_mod_log_insert_key(parent, pslot,
					BTRFS_MOD_LOG_KEY_REPLACE);
			if (ret < 0) {
				btrfs_tree_unlock(left);
				free_extent_buffer(left);
				btrfs_abort_transaction(trans, ret);
				return ret;
			}
			btrfs_set_node_key(parent, &disk_key, pslot);
			btrfs_mark_buffer_dirty(trans, parent);
			if (btrfs_header_nritems(left) > orig_slot) {
				path->nodes[level] = left;
				path->slots[level + 1] -= 1;
				path->slots[level] = orig_slot;
				btrfs_tree_unlock(mid);
				free_extent_buffer(mid);
			} else {
				orig_slot -=
					btrfs_header_nritems(left);
				path->slots[level] = orig_slot;
				btrfs_tree_unlock(left);
				free_extent_buffer(left);
			}
			return 0;
		}
		btrfs_tree_unlock(left);
		free_extent_buffer(left);
	}

	/*
	 * then try to empty the right most buffer into the middle
	 */
	if (pslot + 1 < btrfs_header_nritems(parent)) {
		u32 right_nr;

		right = btrfs_read_node_slot(parent, pslot + 1);
		if (IS_ERR(right))
			return PTR_ERR(right);

		btrfs_tree_lock_nested(right, BTRFS_NESTING_RIGHT);

		right_nr = btrfs_header_nritems(right);
		if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
			wret = 1;
		} else {
			ret = btrfs_cow_block(trans, root, right,
					      parent, pslot + 1,
					      &right, BTRFS_NESTING_RIGHT_COW);
			if (ret)
				wret = 1;
			else {
				wret = balance_node_right(trans, right, mid);
			}
		}
		if (wret < 0)
			ret = wret;
		if (wret == 0) {
			struct btrfs_disk_key disk_key;

			btrfs_node_key(right, &disk_key, 0);
			ret = btrfs_tree_mod_log_insert_key(parent, pslot + 1,
					BTRFS_MOD_LOG_KEY_REPLACE);
			if (ret < 0) {
				btrfs_tree_unlock(right);
				free_extent_buffer(right);
				btrfs_abort_transaction(trans, ret);
				return ret;
			}
			btrfs_set_node_key(parent, &disk_key, pslot + 1);
			btrfs_mark_buffer_dirty(trans, parent);

			if (btrfs_header_nritems(mid) <= orig_slot) {
				path->nodes[level] = right;
				path->slots[level + 1] += 1;
				path->slots[level] = orig_slot -
					btrfs_header_nritems(mid);
				btrfs_tree_unlock(mid);
				free_extent_buffer(mid);
			} else {
				btrfs_tree_unlock(right);
				free_extent_buffer(right);
			}
			return 0;
		}
		btrfs_tree_unlock(right);
		free_extent_buffer(right);
	}
	return 1;
}

/*
 * readahead one full node of leaves, finding things that are close
 * to the block in 'slot', and triggering readahead on them.
 */
static void reada_for_search(struct btrfs_fs_info *fs_info,
			     struct btrfs_path *path,
			     int level, int slot, u64 objectid)
{
	struct extent_buffer *node;
	struct btrfs_disk_key disk_key;
	u32 nritems;
	u64 search;
	u64 target;
	u64 nread = 0;
	u64 nread_max;
	u32 nr;
	u32 blocksize;
	u32 nscan = 0;

	if (level != 1 && path->reada != READA_FORWARD_ALWAYS)
		return;

	if (!path->nodes[level])
		return;

	node = path->nodes[level];

	/*
	 * Since the time between visiting leaves is much shorter than the time
	 * between visiting nodes, limit read ahead of nodes to 1, to avoid too
	 * much IO at once (possibly random).
	 */
	if (path->reada == READA_FORWARD_ALWAYS) {
		if (level > 1)
			nread_max = node->fs_info->nodesize;
		else
			nread_max = SZ_128K;
	} else {
		nread_max = SZ_64K;
	}

	search = btrfs_node_blockptr(node, slot);
	blocksize = fs_info->nodesize;
	if (path->reada != READA_FORWARD_ALWAYS) {
		struct extent_buffer *eb;

		eb = find_extent_buffer(fs_info, search);
		if (eb) {
			free_extent_buffer(eb);
			return;
		}
	}

	target = search;

	nritems = btrfs_header_nritems(node);
	nr = slot;

	while (1) {
		if (path->reada == READA_BACK) {
			if (nr == 0)
				break;
			nr--;
		} else if (path->reada == READA_FORWARD ||
			   path->reada == READA_FORWARD_ALWAYS) {
			nr++;
			if (nr >= nritems)
				break;
		}
		if (path->reada == READA_BACK && objectid) {
			btrfs_node_key(node, &disk_key, nr);
			if (btrfs_disk_key_objectid(&disk_key) != objectid)
				break;
		}
		search = btrfs_node_blockptr(node, nr);
		if (path->reada == READA_FORWARD_ALWAYS ||
		    (search <= target && target - search <= 65536) ||
		    (search > target && search - target <= 65536)) {
			btrfs_readahead_node_child(node, nr);
			nread += blocksize;
		}
		nscan++;
		if (nread > nread_max || nscan > 32)
			break;
	}
}

static noinline void reada_for_balance(struct btrfs_path *path, int level)
{
	struct extent_buffer *parent;
	int slot;
	int nritems;

	parent = path->nodes[level + 1];
	if (!parent)
		return;

	nritems = btrfs_header_nritems(parent);
	slot = path->slots[level + 1];

	if (slot > 0)
		btrfs_readahead_node_child(parent, slot - 1);
	if (slot + 1 < nritems)
		btrfs_readahead_node_child(parent, slot + 1);
}
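
/*
 * Illustrative sketch (ours): callers opt in to readahead by setting
 * path->reada before searching; reada_for_search() then kicks in as the
 * search walks down to level 1.
 */
static int __maybe_unused reada_usage_sketch(struct btrfs_root *root,
					     const struct btrfs_key *key)
{
	struct btrfs_path *path;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	path->reada = READA_FORWARD;
	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
	/* ... iterate forward over items, e.g. via btrfs_next_leaf() ... */
	btrfs_free_path(path);
	return ret;
}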

/*
 * when we walk down the tree, it is usually safe to unlock the higher layers
 * in the tree.  The exceptions are when our path goes through slot 0, because
 * operations on the tree might require changing key pointers higher up in the
 * tree.
 *
 * callers might also have set path->keep_locks, which tells this code to keep
 * the lock if the path points to the last slot in the block.  This is part of
 * walking through the tree, and selecting the next slot in the higher block.
 *
 * lowest_unlock sets the lowest level in the tree we're allowed to unlock.  So
 * if lowest_unlock is 1, level 0 won't be unlocked.
 */
static noinline void unlock_up(struct btrfs_path *path, int level,
			       int lowest_unlock, int min_write_lock_level,
			       int *write_lock_level)
{
	int i;
	int skip_level = level;
	bool check_skip = true;

	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
		if (!path->nodes[i])
			break;
		if (!path->locks[i])
			break;

		if (check_skip) {
			if (path->slots[i] == 0) {
				skip_level = i + 1;
				continue;
			}

			if (path->keep_locks) {
				u32 nritems;

				nritems = btrfs_header_nritems(path->nodes[i]);
				if (nritems < 1 || path->slots[i] >= nritems - 1) {
					skip_level = i + 1;
					continue;
				}
			}
		}

		if (i >= lowest_unlock && i > skip_level) {
			check_skip = false;
			btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]);
			path->locks[i] = 0;
			if (write_lock_level &&
			    i > min_write_lock_level &&
			    i <= *write_lock_level) {
				*write_lock_level = i - 1;
			}
		}
	}
}

/*
 * Helper function for btrfs_search_slot() and other functions that do a search
 * on a btree.  The goal is to find a tree block in the cache (the radix tree at
 * fs_info->buffer_radix), but if we can't find it, or it's not up to date, read
 * its pages from disk.
 *
 * Returns -EAGAIN, with the path unlocked, if the caller needs to repeat the
 * whole btree search, starting again from the current root node.
 */
static int
read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
		      struct extent_buffer **eb_ret, int slot,
		      const struct btrfs_key *key)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_tree_parent_check check = { 0 };
	u64 blocknr;
	struct extent_buffer *tmp = NULL;
	int ret = 0;
	int parent_level;
	int err;
	bool read_tmp = false;
	bool tmp_locked = false;
	bool path_released = false;

	blocknr = btrfs_node_blockptr(*eb_ret, slot);
	parent_level = btrfs_header_level(*eb_ret);
	btrfs_node_key_to_cpu(*eb_ret, &check.first_key, slot);
	check.has_first_key = true;
	check.level = parent_level - 1;
	check.transid = btrfs_node_ptr_generation(*eb_ret, slot);
	check.owner_root = btrfs_root_id(root);

	/*
	 * If we need to read an extent buffer from disk and we are holding locks
	 * on upper level nodes, we unlock all the upper nodes before reading the
	 * extent buffer, and then return -EAGAIN to the caller as it needs to
	 * restart the search.  We don't release the lock on the current level
	 * because we need to walk this node to figure out which blocks to read.
	 */
	tmp = find_extent_buffer(fs_info, blocknr);
	if (tmp) {
		if (p->reada == READA_FORWARD_ALWAYS)
			reada_for_search(fs_info, p, parent_level, slot, key->objectid);

		/* first we do an atomic uptodate check */
		if (btrfs_buffer_uptodate(tmp, check.transid, 1) > 0) {
			/*
			 * Do extra check for first_key, eb can be stale due to
			 * being cached, read from scrub, or have multiple
			 * parents (shared tree blocks).
			 */
			if (btrfs_verify_level_key(tmp, &check)) {
				ret = -EUCLEAN;
				goto out;
			}
			*eb_ret = tmp;
			tmp = NULL;
			ret = 0;
			goto out;
		}

		if (p->nowait) {
			ret = -EAGAIN;
			goto out;
		}

		if (!p->skip_locking) {
			btrfs_unlock_up_safe(p, parent_level + 1);
			tmp_locked = true;
			btrfs_tree_read_lock(tmp);
			btrfs_release_path(p);
			ret = -EAGAIN;
			path_released = true;
		}

		/* Now we're allowed to do a blocking uptodate check. */
		err = btrfs_read_extent_buffer(tmp, &check);
		if (err) {
			ret = err;
			goto out;
		}

		if (ret == 0) {
			ASSERT(!tmp_locked);
			*eb_ret = tmp;
			tmp = NULL;
		}
		goto out;
	} else if (p->nowait) {
		ret = -EAGAIN;
		goto out;
	}

	if (!p->skip_locking) {
		btrfs_unlock_up_safe(p, parent_level + 1);
		ret = -EAGAIN;
	}

	if (p->reada != READA_NONE)
		reada_for_search(fs_info, p, parent_level, slot, key->objectid);

	tmp = btrfs_find_create_tree_block(fs_info, blocknr, check.owner_root, check.level);
	if (IS_ERR(tmp)) {
		ret = PTR_ERR(tmp);
		tmp = NULL;
		goto out;
	}
	read_tmp = true;

	if (!p->skip_locking) {
		ASSERT(ret == -EAGAIN);
		tmp_locked = true;
		btrfs_tree_read_lock(tmp);
		btrfs_release_path(p);
		path_released = true;
	}

	/* Now we're allowed to do a blocking uptodate check. */
	err = btrfs_read_extent_buffer(tmp, &check);
	if (err) {
		ret = err;
		goto out;
	}

	/*
	 * If the read above didn't mark this buffer up to date,
	 * it will never end up being up to date.  Set ret to EIO now
	 * and give up so that our caller doesn't loop forever
	 * on our EAGAINs.
	 */
	if (!extent_buffer_uptodate(tmp)) {
		ret = -EIO;
		goto out;
	}

	if (ret == 0) {
		ASSERT(!tmp_locked);
		*eb_ret = tmp;
		tmp = NULL;
	}
out:
	if (tmp) {
		if (tmp_locked)
			btrfs_tree_read_unlock(tmp);
		if (read_tmp && ret && ret != -EAGAIN)
			free_extent_buffer_stale(tmp);
		else
			free_extent_buffer(tmp);
	}
	if (ret && !path_released)
		btrfs_release_path(p);

	return ret;
}

/*
 * helper function for btrfs_search_slot.  This does all of the checks
 * for node-level blocks and does any balancing required based on
 * the ins_len.
 *
 * If no extra work was required, zero is returned.  If we had to
 * drop the path, -EAGAIN is returned and btrfs_search_slot must
 * start over.
 */
static int
setup_nodes_for_search(struct btrfs_trans_handle *trans,
		       struct btrfs_root *root, struct btrfs_path *p,
		       struct extent_buffer *b, int level, int ins_len,
		       int *write_lock_level)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	int ret = 0;

	if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >=
	    BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3) {

		if (*write_lock_level < level + 1) {
			*write_lock_level = level + 1;
			btrfs_release_path(p);
			return -EAGAIN;
		}

		reada_for_balance(p, level);
		ret = split_node(trans, root, p, level);

		b = p->nodes[level];
	} else if (ins_len < 0 && btrfs_header_nritems(b) <
		   BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 2) {

		if (*write_lock_level < level + 1) {
			*write_lock_level = level + 1;
			btrfs_release_path(p);
			return -EAGAIN;
		}

		reada_for_balance(p, level);
		ret = balance_level(trans, root, p, level);
		if (ret)
			return ret;

		b = p->nodes[level];
		if (!b) {
			btrfs_release_path(p);
			return -EAGAIN;
		}
		BUG_ON(btrfs_header_nritems(b) == 1);
	}
	return ret;
}

int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *path,
		    u64 iobjectid, u64 ioff, u8 key_type,
		    struct btrfs_key *found_key)
{
	int ret;
	struct btrfs_key key;
	struct extent_buffer *eb;

	ASSERT(path);
	ASSERT(found_key);

	key.type = key_type;
	key.objectid = iobjectid;
	key.offset = ioff;

	ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0);
	if (ret < 0)
		return ret;

	eb = path->nodes[0];
	if (ret && path->slots[0] >= btrfs_header_nritems(eb)) {
		ret = btrfs_next_leaf(fs_root, path);
		if (ret)
			return ret;
		eb = path->nodes[0];
	}

	btrfs_item_key_to_cpu(eb, found_key, path->slots[0]);
	if (found_key->type != key.type ||
	    found_key->objectid != key.objectid)
		return 1;

	return 0;
}
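
/*
 * Illustrative sketch (ours): btrfs_find_item() wraps the search above;
 * 0 means 'found_key' matches the requested objectid and type, 1 means no
 * such item exists, negative is an error.
 */
static int __maybe_unused find_inode_item_sketch(struct btrfs_root *root,
						 struct btrfs_path *path, u64 ino)
{
	struct btrfs_key found_key;

	return btrfs_find_item(root, path, ino, 0, BTRFS_INODE_ITEM_KEY, &found_key);
}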

static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root,
							struct btrfs_path *p,
							int write_lock_level)
{
	struct extent_buffer *b;
	int root_lock = 0;
	int level = 0;

	if (p->search_commit_root) {
		b = root->commit_root;
		atomic_inc(&b->refs);
		level = btrfs_header_level(b);
		/*
		 * Ensure that all callers have set skip_locking when
		 * p->search_commit_root = 1.
		 */
		ASSERT(p->skip_locking == 1);

		goto out;
	}

	if (p->skip_locking) {
		b = btrfs_root_node(root);
		level = btrfs_header_level(b);
		goto out;
	}

	/* We try very hard to do read locks on the root */
	root_lock = BTRFS_READ_LOCK;

	/*
	 * If the level is set to maximum, we can skip trying to get the read
	 * lock.
	 */
	if (write_lock_level < BTRFS_MAX_LEVEL) {
		/*
		 * We don't know the level of the root node until we actually
		 * have it read locked
		 */
		if (p->nowait) {
			b = btrfs_try_read_lock_root_node(root);
			if (IS_ERR(b))
				return b;
		} else {
			b = btrfs_read_lock_root_node(root);
		}
		level = btrfs_header_level(b);
		if (level > write_lock_level)
			goto out;

		/* Whoops, must trade for write lock */
		btrfs_tree_read_unlock(b);
		free_extent_buffer(b);
	}

	b = btrfs_lock_root_node(root);
	root_lock = BTRFS_WRITE_LOCK;

	/* The level might have changed, check again */
	level = btrfs_header_level(b);

out:
	/*
	 * The root may have failed to write out at some point, and thus is no
	 * longer valid, return an error in this case.
	 */
	if (!extent_buffer_uptodate(b)) {
		if (root_lock)
			btrfs_tree_unlock_rw(b, root_lock);
		free_extent_buffer(b);
		return ERR_PTR(-EIO);
	}

	p->nodes[level] = b;
	if (!p->skip_locking)
		p->locks[level] = root_lock;
	/*
	 * Callers are responsible for dropping b's references.
	 */
	return b;
}

/*
 * Replace the extent buffer at the lowest level of the path with a cloned
 * version.  The purpose is to be able to use it safely, after releasing the
 * commit root semaphore, even if relocation is happening in parallel, the
 * transaction used for relocation is committed and the extent buffer is
 * reallocated in the next transaction.
 *
 * This is used in a context where the caller does not prevent transaction
 * commits from happening, either by holding a transaction handle or holding
 * some lock, while it's doing searches through a commit root.
 * At the moment it's only used for send operations.
 */
static int finish_need_commit_sem_search(struct btrfs_path *path)
{
	const int i = path->lowest_level;
	const int slot = path->slots[i];
	struct extent_buffer *lowest = path->nodes[i];
	struct extent_buffer *clone;

	ASSERT(path->need_commit_sem);

	if (!lowest)
		return 0;

	lockdep_assert_held_read(&lowest->fs_info->commit_root_sem);

	clone = btrfs_clone_extent_buffer(lowest);
	if (!clone)
		return -ENOMEM;

	btrfs_release_path(path);
	path->nodes[i] = clone;
	path->slots[i] = slot;

	return 0;
}

static inline int search_for_key_slot(struct extent_buffer *eb,
				      int search_low_slot,
				      const struct btrfs_key *key,
				      int prev_cmp,
				      int *slot)
{
	/*
	 * If a previous call to btrfs_bin_search() on a parent node returned an
	 * exact match (prev_cmp == 0), we can safely assume the target key will
	 * always be at slot 0 on lower levels, since each key pointer
	 * (struct btrfs_key_ptr) refers to the lowest key accessible from the
	 * subtree it points to.  Thus we can skip searching lower levels.
	 */
	if (prev_cmp == 0) {
		*slot = 0;
		return 0;
	}

	return btrfs_bin_search(eb, search_low_slot, key, slot);
}

static int search_leaf(struct btrfs_trans_handle *trans,
		       struct btrfs_root *root,
		       const struct btrfs_key *key,
		       struct btrfs_path *path,
		       int ins_len,
		       int prev_cmp)
{
	struct extent_buffer *leaf = path->nodes[0];
	int leaf_free_space = -1;
	int search_low_slot = 0;
	int ret;
	bool do_bin_search = true;

	/*
	 * If we are doing an insertion, the leaf has enough free space and the
	 * destination slot for the key is not slot 0, then we can unlock our
	 * write lock on the parent, and any other upper nodes, before doing the
	 * binary search on the leaf (with search_for_key_slot()), allowing other
	 * tasks to lock the parent and any other upper nodes.
	 */
	if (ins_len > 0) {
		/*
		 * Cache the leaf free space, since we will need it later and it
		 * will not change until then.
		 */
		leaf_free_space = btrfs_leaf_free_space(leaf);

		/*
		 * !path->locks[1] means we have a single node tree, the leaf is
		 * the root of the tree.
		 */
		if (path->locks[1] && leaf_free_space >= ins_len) {
			struct btrfs_disk_key first_key;

			ASSERT(btrfs_header_nritems(leaf) > 0);
			btrfs_item_key(leaf, &first_key, 0);

			/*
			 * Doing the extra comparison with the first key is cheap,
			 * taking into account that the first key is very likely
			 * already in a cache line because it immediately follows
			 * the extent buffer's header and we have recently accessed
			 * the header's level field.
			 */
			ret = btrfs_comp_keys(&first_key, key);
			if (ret < 0) {
				/*
				 * The first key is smaller than the key we want
				 * to insert, so we are safe to unlock all upper
				 * nodes and we have to do the binary search.
				 *
				 * We do use btrfs_unlock_up_safe() and not
				 * unlock_up() because the latter does not unlock
				 * nodes with a slot of 0 - we can safely unlock
				 * any node even if its slot is 0 since in this
				 * case the key does not end up at slot 0 of the
				 * leaf and there's no need to split the leaf.
				 */
				btrfs_unlock_up_safe(path, 1);
				search_low_slot = 1;
			} else {
				/*
				 * The first key is >= the key we want to
				 * insert, so we can skip the binary search as
				 * the target key will be at slot 0.
				 *
				 * We cannot unlock upper nodes when the key is
				 * less than the first key, because we will need
				 * to update the key at slot 0 of the parent node
				 * and possibly of other upper nodes too.
				 * If the key matches the first key, then we can
				 * unlock all the upper nodes, using
				 * btrfs_unlock_up_safe() instead of unlock_up()
				 * as stated above.
				 */
				if (ret == 0)
					btrfs_unlock_up_safe(path, 1);
				/*
				 * ret is already 0 or 1, matching the result of
				 * a btrfs_bin_search() call, so there is no need
				 * to adjust it.
				 */
				do_bin_search = false;
				path->slots[0] = 0;
			}
		}
	}

	if (do_bin_search) {
		ret = search_for_key_slot(leaf, search_low_slot, key,
					  prev_cmp, &path->slots[0]);
		if (ret < 0)
			return ret;
	}

	if (ins_len > 0) {
		/*
		 * Item key already exists.  In this case, if we are allowed to
		 * insert the item (for example, in dir_item case, item key
		 * collision is allowed), it will be merged with the original
		 * item.  Only the item size grows, no new btrfs item will be
		 * added.  If search_for_extension is not set, ins_len already
		 * accounts for the size of btrfs_item, deduct it here so leaf
		 * space check will be correct.
		 */
		if (ret == 0 && !path->search_for_extension) {
			ASSERT(ins_len >= sizeof(struct btrfs_item));
			ins_len -= sizeof(struct btrfs_item);
		}

		ASSERT(leaf_free_space >= 0);

		if (leaf_free_space < ins_len) {
			int err;

			err = split_leaf(trans, root, key, path, ins_len,
					 (ret == 0));
			ASSERT(err <= 0);
			if (WARN_ON(err > 0))
				err = -EUCLEAN;
			if (err)
				ret = err;
		}
	}

	return ret;
}
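
/*
 * Illustrative sketch (ours) of the btrfs_search_slot() return convention
 * documented below: 0 means the key is at path->slots[0] of the leaf, 1
 * means that slot is the insertion position.
 */
static int __maybe_unused lookup_key_sketch(struct btrfs_root *root,
					    struct btrfs_path *path,
					    const struct btrfs_key *key)
{
	int ret;

	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
	if (ret < 0)
		return ret;
	if (ret == 0) {
		struct btrfs_key found;

		btrfs_item_key_to_cpu(path->nodes[0], &found, path->slots[0]);
		/* 'found' equals '*key' here */
	}
	btrfs_release_path(path);
	return ret;
}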
In this case, if we are allowed to 1981 * insert the item (for example, in the dir_item case, item key 1982 * collision is allowed), it will be merged with the original 1983 * item. Only the item size grows, no new btrfs item will be 1984 * added. If search_for_extension is not set, ins_len already 1985 * accounts for the size of a struct btrfs_item; deduct it here so the leaf space 1986 * check will be correct. 1987 */ 1988 if (ret == 0 && !path->search_for_extension) { 1989 ASSERT(ins_len >= sizeof(struct btrfs_item)); 1990 ins_len -= sizeof(struct btrfs_item); 1991 } 1992 1993 ASSERT(leaf_free_space >= 0); 1994 1995 if (leaf_free_space < ins_len) { 1996 int err; 1997 1998 err = split_leaf(trans, root, key, path, ins_len, 1999 (ret == 0)); 2000 ASSERT(err <= 0); 2001 if (WARN_ON(err > 0)) 2002 err = -EUCLEAN; 2003 if (err) 2004 ret = err; 2005 } 2006 } 2007 2008 return ret; 2009 } 2010 2011 /* 2012 * Look for a key in a tree and perform necessary modifications to preserve 2013 * tree invariants. 2014 * 2015 * @trans: Handle of transaction, used when modifying the tree 2016 * @p: Holds all btree nodes along the search path 2017 * @root: The root node of the tree 2018 * @key: The key we are looking for 2019 * @ins_len: Indicates purpose of search: 2020 * >0 for inserts, it's the size of the item inserted (*) 2021 * <0 for deletions 2022 * 0 for plain searches, not modifying the tree 2023 * 2024 * (*) If the size of the item inserted doesn't include 2025 * sizeof(struct btrfs_item), then p->search_for_extension must 2026 * be set. 2027 * @cow: boolean indicating whether CoW operations should be performed. Must 2028 * always be 1 when modifying the tree. 2029 * 2030 * If @ins_len > 0, nodes and leaves will be split as we walk down the tree. 2031 * If @ins_len < 0, nodes will be merged as we walk down the tree (if possible). 2032 * 2033 * If @key is found, 0 is returned and you can find the item in the leaf level 2034 * of the path (level 0). 2035 * 2036 * If @key isn't found, 1 is returned and the leaf level of the path (level 0) 2037 * points to the slot where it should be inserted. 2038 * 2039 * If an error is encountered while searching the tree, a negative error number 2040 * is returned. 2041 */ 2042 int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root, 2043 const struct btrfs_key *key, struct btrfs_path *p, 2044 int ins_len, int cow) 2045 { 2046 struct btrfs_fs_info *fs_info; 2047 struct extent_buffer *b; 2048 int slot; 2049 int ret; 2050 int err; 2051 int level; 2052 int lowest_unlock = 1; 2053 /* everything at write_lock_level or lower must be write locked */ 2054 int write_lock_level = 0; 2055 u8 lowest_level = 0; 2056 int min_write_lock_level; 2057 int prev_cmp; 2058 2059 if (!root) 2060 return -EINVAL; 2061 2062 fs_info = root->fs_info; 2063 might_sleep(); 2064 2065 lowest_level = p->lowest_level; 2066 WARN_ON(lowest_level && ins_len > 0); 2067 WARN_ON(p->nodes[0] != NULL); 2068 BUG_ON(!cow && ins_len); 2069 2070 /* 2071 * For now only allow nowait for read only operations. There's no 2072 * strict reason why we can't; we just only need it for reads, so it's 2073 * only implemented for reads.
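*
* A minimal read-only lookup, sketched for illustration (the root and the
* inode number "ino" are assumed to come from the caller):
*
*     struct btrfs_path *path = btrfs_alloc_path();
*     struct btrfs_key key;
*     int ret;
*
*     if (!path)
*             return -ENOMEM;
*     key.objectid = ino;
*     key.type = BTRFS_INODE_ITEM_KEY;
*     key.offset = 0;
*     /* NULL trans and cow == 0: plain search, the tree is not modified. */
*     ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
*     /* ret == 0: item found at path->slots[0]; ret == 1: not found. */
*     btrfs_free_path(path);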
2074 */ 2075 ASSERT(!p->nowait || !cow); 2076 2077 if (ins_len < 0) { 2078 lowest_unlock = 2; 2079 2080 /* When we are removing items, we might have to go up to level 2081 * two as we update tree pointers. Make sure we keep a write 2082 * lock for those levels as well. 2083 */ 2084 write_lock_level = 2; 2085 } else if (ins_len > 0) { 2086 /* 2087 * for inserting items, make sure we have a write lock on 2088 * level 1 so we can update keys 2089 */ 2090 write_lock_level = 1; 2091 } 2092 2093 if (!cow) 2094 write_lock_level = -1; 2095 2096 if (cow && (p->keep_locks || p->lowest_level)) 2097 write_lock_level = BTRFS_MAX_LEVEL; 2098 2099 min_write_lock_level = write_lock_level; 2100 2101 if (p->need_commit_sem) { 2102 ASSERT(p->search_commit_root); 2103 if (p->nowait) { 2104 if (!down_read_trylock(&fs_info->commit_root_sem)) 2105 return -EAGAIN; 2106 } else { 2107 down_read(&fs_info->commit_root_sem); 2108 } 2109 } 2110 2111 again: 2112 prev_cmp = -1; 2113 b = btrfs_search_slot_get_root(root, p, write_lock_level); 2114 if (IS_ERR(b)) { 2115 ret = PTR_ERR(b); 2116 goto done; 2117 } 2118 2119 while (b) { 2120 int dec = 0; 2121 2122 level = btrfs_header_level(b); 2123 2124 if (cow) { 2125 bool last_level = (level == (BTRFS_MAX_LEVEL - 1)); 2126 2127 /* 2128 * if we don't really need to cow this block 2129 * then we don't want to set the path blocking, 2130 * so we test it here 2131 */ 2132 if (!should_cow_block(trans, root, b)) 2133 goto cow_done; 2134 2135 /* 2136 * must have write locks on this node and the 2137 * parent 2138 */ 2139 if (level > write_lock_level || 2140 (level + 1 > write_lock_level && 2141 level + 1 < BTRFS_MAX_LEVEL && 2142 p->nodes[level + 1])) { 2143 write_lock_level = level + 1; 2144 btrfs_release_path(p); 2145 goto again; 2146 } 2147 2148 if (last_level) 2149 err = btrfs_cow_block(trans, root, b, NULL, 0, 2150 &b, 2151 BTRFS_NESTING_COW); 2152 else 2153 err = btrfs_cow_block(trans, root, b, 2154 p->nodes[level + 1], 2155 p->slots[level + 1], &b, 2156 BTRFS_NESTING_COW); 2157 if (err) { 2158 ret = err; 2159 goto done; 2160 } 2161 } 2162 cow_done: 2163 p->nodes[level] = b; 2164 2165 /* 2166 * we have a lock on b and as long as we aren't changing 2167 * the tree, there is no way for the items in b to change. 2168 * It is safe to drop the lock on our parent before we 2169 * go through the expensive btree search on b. 2170 * 2171 * If we're inserting or deleting (ins_len != 0), then we might 2172 * be changing slot zero, which may require changing the parent. 2173 * So, we can't drop the lock until after we know which slot 2174 * we're operating on.
2175 */ 2176 if (!ins_len && !p->keep_locks) { 2177 int u = level + 1; 2178 2179 if (u < BTRFS_MAX_LEVEL && p->locks[u]) { 2180 btrfs_tree_unlock_rw(p->nodes[u], p->locks[u]); 2181 p->locks[u] = 0; 2182 } 2183 } 2184 2185 if (level == 0) { 2186 if (ins_len > 0) 2187 ASSERT(write_lock_level >= 1); 2188 2189 ret = search_leaf(trans, root, key, p, ins_len, prev_cmp); 2190 if (!p->search_for_split) 2191 unlock_up(p, level, lowest_unlock, 2192 min_write_lock_level, NULL); 2193 goto done; 2194 } 2195 2196 ret = search_for_key_slot(b, 0, key, prev_cmp, &slot); 2197 if (ret < 0) 2198 goto done; 2199 prev_cmp = ret; 2200 2201 if (ret && slot > 0) { 2202 dec = 1; 2203 slot--; 2204 } 2205 p->slots[level] = slot; 2206 err = setup_nodes_for_search(trans, root, p, b, level, ins_len, 2207 &write_lock_level); 2208 if (err == -EAGAIN) 2209 goto again; 2210 if (err) { 2211 ret = err; 2212 goto done; 2213 } 2214 b = p->nodes[level]; 2215 slot = p->slots[level]; 2216 2217 /* 2218 * Slot 0 is special, if we change the key we have to update 2219 * the parent pointer which means we must have a write lock on 2220 * the parent 2221 */ 2222 if (slot == 0 && ins_len && write_lock_level < level + 1) { 2223 write_lock_level = level + 1; 2224 btrfs_release_path(p); 2225 goto again; 2226 } 2227 2228 unlock_up(p, level, lowest_unlock, min_write_lock_level, 2229 &write_lock_level); 2230 2231 if (level == lowest_level) { 2232 if (dec) 2233 p->slots[level]++; 2234 goto done; 2235 } 2236 2237 err = read_block_for_search(root, p, &b, slot, key); 2238 if (err == -EAGAIN && !p->nowait) 2239 goto again; 2240 if (err) { 2241 ret = err; 2242 goto done; 2243 } 2244 2245 if (!p->skip_locking) { 2246 level = btrfs_header_level(b); 2247 2248 btrfs_maybe_reset_lockdep_class(root, b); 2249 2250 if (level <= write_lock_level) { 2251 btrfs_tree_lock(b); 2252 p->locks[level] = BTRFS_WRITE_LOCK; 2253 } else { 2254 if (p->nowait) { 2255 if (!btrfs_try_tree_read_lock(b)) { 2256 free_extent_buffer(b); 2257 ret = -EAGAIN; 2258 goto done; 2259 } 2260 } else { 2261 btrfs_tree_read_lock(b); 2262 } 2263 p->locks[level] = BTRFS_READ_LOCK; 2264 } 2265 p->nodes[level] = b; 2266 } 2267 } 2268 ret = 1; 2269 done: 2270 if (ret < 0 && !p->skip_release_on_error) 2271 btrfs_release_path(p); 2272 2273 if (p->need_commit_sem) { 2274 int ret2; 2275 2276 ret2 = finish_need_commit_sem_search(p); 2277 up_read(&fs_info->commit_root_sem); 2278 if (ret2) 2279 ret = ret2; 2280 } 2281 2282 return ret; 2283 } 2284 ALLOW_ERROR_INJECTION(btrfs_search_slot, ERRNO); 2285 2286 /* 2287 * Like btrfs_search_slot, this looks for a key in the given tree. It uses the 2288 * current state of the tree together with the operations recorded in the tree 2289 * modification log to search for the key in a previous version of this tree, as 2290 * denoted by the time_seq parameter. 2291 * 2292 * Naturally, there is no support for insert, delete or cow operations. 2293 * 2294 * The resulting path and return value will be set up as if we called 2295 * btrfs_search_slot at that point in time with ins_len and cow both set to 0. 
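*
* Sketch of a typical caller (names assumed): pin a tree mod log sequence
* number, search the old version of the tree, then release the sequence
* number so the log can be pruned:
*
*     struct btrfs_seq_list elem = BTRFS_SEQ_LIST_INIT(elem);
*
*     btrfs_get_tree_mod_seq(fs_info, &elem);
*     ret = btrfs_search_old_slot(root, &key, path, elem.seq);
*     ...
*     btrfs_put_tree_mod_seq(fs_info, &elem);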
2296 */ 2297 int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key, 2298 struct btrfs_path *p, u64 time_seq) 2299 { 2300 struct btrfs_fs_info *fs_info = root->fs_info; 2301 struct extent_buffer *b; 2302 int slot; 2303 int ret; 2304 int err; 2305 int level; 2306 int lowest_unlock = 1; 2307 u8 lowest_level = 0; 2308 2309 lowest_level = p->lowest_level; 2310 WARN_ON(p->nodes[0] != NULL); 2311 ASSERT(!p->nowait); 2312 2313 if (p->search_commit_root) { 2314 BUG_ON(time_seq); 2315 return btrfs_search_slot(NULL, root, key, p, 0, 0); 2316 } 2317 2318 again: 2319 b = btrfs_get_old_root(root, time_seq); 2320 if (!b) { 2321 ret = -EIO; 2322 goto done; 2323 } 2324 level = btrfs_header_level(b); 2325 p->locks[level] = BTRFS_READ_LOCK; 2326 2327 while (b) { 2328 int dec = 0; 2329 2330 level = btrfs_header_level(b); 2331 p->nodes[level] = b; 2332 2333 /* 2334 * we have a lock on b and as long as we aren't changing 2335 * the tree, there is no way for the items in b to change. 2336 * It is safe to drop the lock on our parent before we 2337 * go through the expensive btree search on b. 2338 */ 2339 btrfs_unlock_up_safe(p, level + 1); 2340 2341 ret = btrfs_bin_search(b, 0, key, &slot); 2342 if (ret < 0) 2343 goto done; 2344 2345 if (level == 0) { 2346 p->slots[level] = slot; 2347 unlock_up(p, level, lowest_unlock, 0, NULL); 2348 goto done; 2349 } 2350 2351 if (ret && slot > 0) { 2352 dec = 1; 2353 slot--; 2354 } 2355 p->slots[level] = slot; 2356 unlock_up(p, level, lowest_unlock, 0, NULL); 2357 2358 if (level == lowest_level) { 2359 if (dec) 2360 p->slots[level]++; 2361 goto done; 2362 } 2363 2364 err = read_block_for_search(root, p, &b, slot, key); 2365 if (err == -EAGAIN && !p->nowait) 2366 goto again; 2367 if (err) { 2368 ret = err; 2369 goto done; 2370 } 2371 2372 level = btrfs_header_level(b); 2373 btrfs_tree_read_lock(b); 2374 b = btrfs_tree_mod_log_rewind(fs_info, b, time_seq); 2375 if (!b) { 2376 ret = -ENOMEM; 2377 goto done; 2378 } 2379 p->locks[level] = BTRFS_READ_LOCK; 2380 p->nodes[level] = b; 2381 } 2382 ret = 1; 2383 done: 2384 if (ret < 0) 2385 btrfs_release_path(p); 2386 2387 return ret; 2388 } 2389 2390 /* 2391 * Search the tree again to find a leaf with smaller keys. 2392 * Returns 0 if it found something. 2393 * Returns 1 if there are no smaller keys. 2394 * Returns < 0 on error. 2395 * 2396 * This may release the path, and so you may lose any locks held at the 2397 * time you call it. 2398 */ 2399 static int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path) 2400 { 2401 struct btrfs_key key; 2402 struct btrfs_key orig_key; 2403 struct btrfs_disk_key found_key; 2404 int ret; 2405 2406 btrfs_item_key_to_cpu(path->nodes[0], &key, 0); 2407 orig_key = key; 2408 2409 if (key.offset > 0) { 2410 key.offset--; 2411 } else if (key.type > 0) { 2412 key.type--; 2413 key.offset = (u64)-1; 2414 } else if (key.objectid > 0) { 2415 key.objectid--; 2416 key.type = (u8)-1; 2417 key.offset = (u64)-1; 2418 } else { 2419 return 1; 2420 } 2421 2422 btrfs_release_path(path); 2423 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 2424 if (ret <= 0) 2425 return ret; 2426 2427 /* 2428 * Previous key not found.
Even if we were at slot 0 of the leaf we had 2429 * before releasing the path and calling btrfs_search_slot(), we now may 2430 * be in a slot pointing to the same original key - this can happen if 2431 * after we released the path, one or more items were moved from a 2432 * sibling leaf into the front of the leaf we had due to an insertion 2433 * (see push_leaf_right()). 2434 * If we hit this case and our slot is > 0, just decrement the slot 2435 * so that the caller does not process the same key again, which may or 2436 * may not break the caller, depending on its logic. 2437 */ 2438 if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) { 2439 btrfs_item_key(path->nodes[0], &found_key, path->slots[0]); 2440 ret = btrfs_comp_keys(&found_key, &orig_key); 2441 if (ret == 0) { 2442 if (path->slots[0] > 0) { 2443 path->slots[0]--; 2444 return 0; 2445 } 2446 /* 2447 * At slot 0, same key as before, it means orig_key is 2448 * the lowest, leftmost, key in the tree. We're done. 2449 */ 2450 return 1; 2451 } 2452 } 2453 2454 btrfs_item_key(path->nodes[0], &found_key, 0); 2455 ret = btrfs_comp_keys(&found_key, &key); 2456 /* 2457 * We might have had an item with the previous key in the tree right 2458 * before we released our path. And after we released our path, that 2459 * item might have been pushed to the first slot (0) of the leaf we 2460 * were holding due to a tree balance. Alternatively, an item with the 2461 * previous key can exist as the only element of a leaf (big fat item). 2462 * Therefore account for these 2 cases, so that our callers (like 2463 * btrfs_previous_item) don't miss an existing item with a key matching 2464 * the previous key we computed above. 2465 */ 2466 if (ret <= 0) 2467 return 0; 2468 return 1; 2469 } 2470 2471 /* 2472 * helper to use instead of search slot if no exact match is needed but 2473 * instead the next or previous item should be returned. 2474 * When find_higher is true, the next higher item is returned, the next lower 2475 * otherwise. 2476 * When return_any and find_higher are both true, and no higher item is found, 2477 * return the next lower instead. 2478 * When return_any is true and find_higher is false, and no lower item is found, 2479 * return the next higher instead. 2480 * It returns 0 if any item is found, 1 if none is found (tree empty), and 2481 * < 0 on error 2482 */ 2483 int btrfs_search_slot_for_read(struct btrfs_root *root, 2484 const struct btrfs_key *key, 2485 struct btrfs_path *p, int find_higher, 2486 int return_any) 2487 { 2488 int ret; 2489 struct extent_buffer *leaf; 2490 2491 again: 2492 ret = btrfs_search_slot(NULL, root, key, p, 0, 0); 2493 if (ret <= 0) 2494 return ret; 2495 /* 2496 * a return value of 1 means the path is at the position where the 2497 * item should be inserted. Normally this is the next bigger item, 2498 * but in case the previous item is the last in a leaf, path points 2499 * to the first free slot in the previous leaf, i.e. at an invalid 2500 * item.
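*
* Typical use, sketched: ask for the first item at or after "key", and
* accept the closest lower item if nothing higher exists:
*
*     ret = btrfs_search_slot_for_read(root, &key, path, 1, 1);
*     if (ret == 0)
*             btrfs_item_key_to_cpu(path->nodes[0], &found_key,
*                                   path->slots[0]);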
2501 */ 2502 leaf = p->nodes[0]; 2503 2504 if (find_higher) { 2505 if (p->slots[0] >= btrfs_header_nritems(leaf)) { 2506 ret = btrfs_next_leaf(root, p); 2507 if (ret <= 0) 2508 return ret; 2509 if (!return_any) 2510 return 1; 2511 /* 2512 * no higher item found, return the next 2513 * lower instead 2514 */ 2515 return_any = 0; 2516 find_higher = 0; 2517 btrfs_release_path(p); 2518 goto again; 2519 } 2520 } else { 2521 if (p->slots[0] == 0) { 2522 ret = btrfs_prev_leaf(root, p); 2523 if (ret < 0) 2524 return ret; 2525 if (!ret) { 2526 leaf = p->nodes[0]; 2527 if (p->slots[0] == btrfs_header_nritems(leaf)) 2528 p->slots[0]--; 2529 return 0; 2530 } 2531 if (!return_any) 2532 return 1; 2533 /* 2534 * no lower item found, return the next 2535 * higher instead 2536 */ 2537 return_any = 0; 2538 find_higher = 1; 2539 btrfs_release_path(p); 2540 goto again; 2541 } else { 2542 --p->slots[0]; 2543 } 2544 } 2545 return 0; 2546 } 2547 2548 /* 2549 * Execute search and call btrfs_previous_item to traverse backwards if the item 2550 * was not found. 2551 * 2552 * Return 0 if found, 1 if not found and < 0 if error. 2553 */ 2554 int btrfs_search_backwards(struct btrfs_root *root, struct btrfs_key *key, 2555 struct btrfs_path *path) 2556 { 2557 int ret; 2558 2559 ret = btrfs_search_slot(NULL, root, key, path, 0, 0); 2560 if (ret > 0) 2561 ret = btrfs_previous_item(root, path, key->objectid, key->type); 2562 2563 if (ret == 0) 2564 btrfs_item_key_to_cpu(path->nodes[0], key, path->slots[0]); 2565 2566 return ret; 2567 } 2568 2569 /* 2570 * Search for a valid slot for the given path. 2571 * 2572 * @root: The root node of the tree. 2573 * @key: Will contain a valid item if found. 2574 * @path: The starting point to validate the slot. 2575 * 2576 * Return: 0 if the item is valid 2577 * 1 if not found 2578 * <0 if error. 2579 */ 2580 int btrfs_get_next_valid_item(struct btrfs_root *root, struct btrfs_key *key, 2581 struct btrfs_path *path) 2582 { 2583 if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { 2584 int ret; 2585 2586 ret = btrfs_next_leaf(root, path); 2587 if (ret) 2588 return ret; 2589 } 2590 2591 btrfs_item_key_to_cpu(path->nodes[0], key, path->slots[0]); 2592 return 0; 2593 } 2594 2595 /* 2596 * adjust the pointers going up the tree, starting at level, 2597 * making sure the right key of each node points to 'key'. 2598 * This is used after shifting pointers to the left, so it stops 2599 * fixing up pointers when a given leaf/node is not in slot 0 of the 2600 * higher levels. 2601 * 2602 */ 2603 static void fixup_low_keys(struct btrfs_trans_handle *trans, 2604 const struct btrfs_path *path, 2605 const struct btrfs_disk_key *key, int level) 2606 { 2607 int i; 2608 struct extent_buffer *t; 2609 int ret; 2610 2611 for (i = level; i < BTRFS_MAX_LEVEL; i++) { 2612 int tslot = path->slots[i]; 2613 2614 if (!path->nodes[i]) 2615 break; 2616 t = path->nodes[i]; 2617 ret = btrfs_tree_mod_log_insert_key(t, tslot, 2618 BTRFS_MOD_LOG_KEY_REPLACE); 2619 BUG_ON(ret < 0); 2620 btrfs_set_node_key(t, key, tslot); 2621 btrfs_mark_buffer_dirty(trans, path->nodes[i]); 2622 if (tslot != 0) 2623 break; 2624 } 2625 } 2626 2627 /* 2628 * update item key. 2629 * 2630 * This function isn't completely safe.
It's the caller's responsibility 2631 * that the new key won't break the order 2632 */ 2633 void btrfs_set_item_key_safe(struct btrfs_trans_handle *trans, 2634 const struct btrfs_path *path, 2635 const struct btrfs_key *new_key) 2636 { 2637 struct btrfs_fs_info *fs_info = trans->fs_info; 2638 struct btrfs_disk_key disk_key; 2639 struct extent_buffer *eb; 2640 int slot; 2641 2642 eb = path->nodes[0]; 2643 slot = path->slots[0]; 2644 if (slot > 0) { 2645 btrfs_item_key(eb, &disk_key, slot - 1); 2646 if (unlikely(btrfs_comp_keys(&disk_key, new_key) >= 0)) { 2647 btrfs_print_leaf(eb); 2648 btrfs_crit(fs_info, 2649 "slot %u key (%llu %u %llu) new key (%llu %u %llu)", 2650 slot, btrfs_disk_key_objectid(&disk_key), 2651 btrfs_disk_key_type(&disk_key), 2652 btrfs_disk_key_offset(&disk_key), 2653 new_key->objectid, new_key->type, 2654 new_key->offset); 2655 BUG(); 2656 } 2657 } 2658 if (slot < btrfs_header_nritems(eb) - 1) { 2659 btrfs_item_key(eb, &disk_key, slot + 1); 2660 if (unlikely(btrfs_comp_keys(&disk_key, new_key) <= 0)) { 2661 btrfs_print_leaf(eb); 2662 btrfs_crit(fs_info, 2663 "slot %u key (%llu %u %llu) new key (%llu %u %llu)", 2664 slot, btrfs_disk_key_objectid(&disk_key), 2665 btrfs_disk_key_type(&disk_key), 2666 btrfs_disk_key_offset(&disk_key), 2667 new_key->objectid, new_key->type, 2668 new_key->offset); 2669 BUG(); 2670 } 2671 } 2672 2673 btrfs_cpu_key_to_disk(&disk_key, new_key); 2674 btrfs_set_item_key(eb, &disk_key, slot); 2675 btrfs_mark_buffer_dirty(trans, eb); 2676 if (slot == 0) 2677 fixup_low_keys(trans, path, &disk_key, 1); 2678 } 2679 2680 /* 2681 * Check key order of two sibling extent buffers. 2682 * 2683 * Return true if something is wrong. 2684 * Return false if everything is fine. 2685 * 2686 * Tree-checker only works inside one tree block, thus the following 2687 * corruption can not be detected by tree-checker: 2688 * 2689 * Leaf @left | Leaf @right 2690 * -------------------------------------------------------------- 2691 * | 1 | 2 | 3 | 4 | 5 | f6 | | 7 | 8 | 2692 * 2693 * Key f6 in leaf @left itself is valid, but not valid when the next 2694 * key in leaf @right is 7. 2695 * This can only be checked at tree block merge time. 2696 * And since tree checker has ensured all key order in each tree block 2697 * is correct, we only need to bother the last key of @left and the first 2698 * key of @right. 
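*
* A made-up instance of such corruption: the last key of @left is
* (257 EXTENT_DATA 8192) while the first key of @right is
* (257 EXTENT_DATA 4096). Both leaves are internally ordered, yet
* left_last >= right_first, so the comparison below trips and the
* push/merge is refused (callers then abort with -EUCLEAN).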
2699 */ 2700 static bool check_sibling_keys(const struct extent_buffer *left, 2701 const struct extent_buffer *right) 2702 { 2703 struct btrfs_key left_last; 2704 struct btrfs_key right_first; 2705 int level = btrfs_header_level(left); 2706 int nr_left = btrfs_header_nritems(left); 2707 int nr_right = btrfs_header_nritems(right); 2708 2709 /* No key to check in one of the tree blocks */ 2710 if (!nr_left || !nr_right) 2711 return false; 2712 2713 if (level) { 2714 btrfs_node_key_to_cpu(left, &left_last, nr_left - 1); 2715 btrfs_node_key_to_cpu(right, &right_first, 0); 2716 } else { 2717 btrfs_item_key_to_cpu(left, &left_last, nr_left - 1); 2718 btrfs_item_key_to_cpu(right, &right_first, 0); 2719 } 2720 2721 if (unlikely(btrfs_comp_cpu_keys(&left_last, &right_first) >= 0)) { 2722 btrfs_crit(left->fs_info, "left extent buffer:"); 2723 btrfs_print_tree(left, false); 2724 btrfs_crit(left->fs_info, "right extent buffer:"); 2725 btrfs_print_tree(right, false); 2726 btrfs_crit(left->fs_info, 2727 "bad key order, sibling blocks, left last (%llu %u %llu) right first (%llu %u %llu)", 2728 left_last.objectid, left_last.type, 2729 left_last.offset, right_first.objectid, 2730 right_first.type, right_first.offset); 2731 return true; 2732 } 2733 return false; 2734 } 2735 2736 /* 2737 * try to push data from one node into the next node left in the 2738 * tree. 2739 * 2740 * returns 0 if some ptrs were pushed left, < 0 if there was some horrible 2741 * error, and > 0 if there was no room in the left hand block. 2742 */ 2743 static int push_node_left(struct btrfs_trans_handle *trans, 2744 struct extent_buffer *dst, 2745 struct extent_buffer *src, int empty) 2746 { 2747 struct btrfs_fs_info *fs_info = trans->fs_info; 2748 int push_items = 0; 2749 int src_nritems; 2750 int dst_nritems; 2751 int ret = 0; 2752 2753 src_nritems = btrfs_header_nritems(src); 2754 dst_nritems = btrfs_header_nritems(dst); 2755 push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems; 2756 WARN_ON(btrfs_header_generation(src) != trans->transid); 2757 WARN_ON(btrfs_header_generation(dst) != trans->transid); 2758 2759 if (!empty && src_nritems <= 8) 2760 return 1; 2761 2762 if (push_items <= 0) 2763 return 1; 2764 2765 if (empty) { 2766 push_items = min(src_nritems, push_items); 2767 if (push_items < src_nritems) { 2768 /* leave at least 8 pointers in the node if 2769 * we aren't going to empty it 2770 */ 2771 if (src_nritems - push_items < 8) { 2772 if (push_items <= 8) 2773 return 1; 2774 push_items -= 8; 2775 } 2776 } 2777 } else 2778 push_items = min(src_nritems - 8, push_items); 2779 2780 /* dst is the left eb, src is the middle eb */ 2781 if (check_sibling_keys(dst, src)) { 2782 ret = -EUCLEAN; 2783 btrfs_abort_transaction(trans, ret); 2784 return ret; 2785 } 2786 ret = btrfs_tree_mod_log_eb_copy(dst, src, dst_nritems, 0, push_items); 2787 if (ret) { 2788 btrfs_abort_transaction(trans, ret); 2789 return ret; 2790 } 2791 copy_extent_buffer(dst, src, 2792 btrfs_node_key_ptr_offset(dst, dst_nritems), 2793 btrfs_node_key_ptr_offset(src, 0), 2794 push_items * sizeof(struct btrfs_key_ptr)); 2795 2796 if (push_items < src_nritems) { 2797 /* 2798 * btrfs_tree_mod_log_eb_copy handles logging the move, so we 2799 * don't need to do an explicit tree mod log operation for it. 
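*
* Worked example of the accounting above (numbers illustrative): with
* src_nritems == 20, empty == 0 and room for 30 more pointers in @dst,
* push_items becomes min(20 - 8, 30) == 12, so twelve key pointers were
* copied into @dst and the remaining eight are now shifted to the front
* of @src by the memmove below.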
2800 */ 2801 memmove_extent_buffer(src, btrfs_node_key_ptr_offset(src, 0), 2802 btrfs_node_key_ptr_offset(src, push_items), 2803 (src_nritems - push_items) * 2804 sizeof(struct btrfs_key_ptr)); 2805 } 2806 btrfs_set_header_nritems(src, src_nritems - push_items); 2807 btrfs_set_header_nritems(dst, dst_nritems + push_items); 2808 btrfs_mark_buffer_dirty(trans, src); 2809 btrfs_mark_buffer_dirty(trans, dst); 2810 2811 return ret; 2812 } 2813 2814 /* 2815 * try to push data from one node into the next node right in the 2816 * tree. 2817 * 2818 * returns 0 if some ptrs were pushed, < 0 if there was some horrible 2819 * error, and > 0 if there was no room in the right hand block. 2820 * 2821 * this will only push up to 1/2 the contents of the left node over 2822 */ 2823 static int balance_node_right(struct btrfs_trans_handle *trans, 2824 struct extent_buffer *dst, 2825 struct extent_buffer *src) 2826 { 2827 struct btrfs_fs_info *fs_info = trans->fs_info; 2828 int push_items = 0; 2829 int max_push; 2830 int src_nritems; 2831 int dst_nritems; 2832 int ret = 0; 2833 2834 WARN_ON(btrfs_header_generation(src) != trans->transid); 2835 WARN_ON(btrfs_header_generation(dst) != trans->transid); 2836 2837 src_nritems = btrfs_header_nritems(src); 2838 dst_nritems = btrfs_header_nritems(dst); 2839 push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems; 2840 if (push_items <= 0) 2841 return 1; 2842 2843 if (src_nritems < 4) 2844 return 1; 2845 2846 max_push = src_nritems / 2 + 1; 2847 /* don't try to empty the node */ 2848 if (max_push >= src_nritems) 2849 return 1; 2850 2851 if (max_push < push_items) 2852 push_items = max_push; 2853 2854 /* dst is the right eb, src is the middle eb */ 2855 if (check_sibling_keys(src, dst)) { 2856 ret = -EUCLEAN; 2857 btrfs_abort_transaction(trans, ret); 2858 return ret; 2859 } 2860 2861 /* 2862 * btrfs_tree_mod_log_eb_copy handles logging the move, so we don't 2863 * need to do an explicit tree mod log operation for it. 2864 */ 2865 memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(dst, push_items), 2866 btrfs_node_key_ptr_offset(dst, 0), 2867 (dst_nritems) * 2868 sizeof(struct btrfs_key_ptr)); 2869 2870 ret = btrfs_tree_mod_log_eb_copy(dst, src, 0, src_nritems - push_items, 2871 push_items); 2872 if (ret) { 2873 btrfs_abort_transaction(trans, ret); 2874 return ret; 2875 } 2876 copy_extent_buffer(dst, src, 2877 btrfs_node_key_ptr_offset(dst, 0), 2878 btrfs_node_key_ptr_offset(src, src_nritems - push_items), 2879 push_items * sizeof(struct btrfs_key_ptr)); 2880 2881 btrfs_set_header_nritems(src, src_nritems - push_items); 2882 btrfs_set_header_nritems(dst, dst_nritems + push_items); 2883 2884 btrfs_mark_buffer_dirty(trans, src); 2885 btrfs_mark_buffer_dirty(trans, dst); 2886 2887 return ret; 2888 } 2889 2890 /* 2891 * helper function to insert a new root level in the tree. 2892 * A new node is allocated, and a single item is inserted to 2893 * point to the existing root 2894 * 2895 * returns zero on success or < 0 on failure. 
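*
* After this succeeds the tree is one level taller. E.g. a tree whose
* root was a full leaf ends up with a new level 1 root holding a single
* key pointer:
*
*     before:  root->node -> [ leaf ]
*     after:   root->node -> [ node, nritems == 1 ] -> [ leaf ]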
2896 */ 2897 static noinline int insert_new_root(struct btrfs_trans_handle *trans, 2898 struct btrfs_root *root, 2899 struct btrfs_path *path, int level) 2900 { 2901 u64 lower_gen; 2902 struct extent_buffer *lower; 2903 struct extent_buffer *c; 2904 struct extent_buffer *old; 2905 struct btrfs_disk_key lower_key; 2906 int ret; 2907 2908 BUG_ON(path->nodes[level]); 2909 BUG_ON(path->nodes[level-1] != root->node); 2910 2911 lower = path->nodes[level-1]; 2912 if (level == 1) 2913 btrfs_item_key(lower, &lower_key, 0); 2914 else 2915 btrfs_node_key(lower, &lower_key, 0); 2916 2917 c = btrfs_alloc_tree_block(trans, root, 0, btrfs_root_id(root), 2918 &lower_key, level, root->node->start, 0, 2919 0, BTRFS_NESTING_NEW_ROOT); 2920 if (IS_ERR(c)) 2921 return PTR_ERR(c); 2922 2923 root_add_used_bytes(root); 2924 2925 btrfs_set_header_nritems(c, 1); 2926 btrfs_set_node_key(c, &lower_key, 0); 2927 btrfs_set_node_blockptr(c, 0, lower->start); 2928 lower_gen = btrfs_header_generation(lower); 2929 WARN_ON(lower_gen != trans->transid); 2930 2931 btrfs_set_node_ptr_generation(c, 0, lower_gen); 2932 2933 btrfs_mark_buffer_dirty(trans, c); 2934 2935 old = root->node; 2936 ret = btrfs_tree_mod_log_insert_root(root->node, c, false); 2937 if (ret < 0) { 2938 int ret2; 2939 2940 ret2 = btrfs_free_tree_block(trans, btrfs_root_id(root), c, 0, 1); 2941 if (ret2 < 0) 2942 btrfs_abort_transaction(trans, ret2); 2943 btrfs_tree_unlock(c); 2944 free_extent_buffer(c); 2945 return ret; 2946 } 2947 rcu_assign_pointer(root->node, c); 2948 2949 /* the super has an extra ref to root->node */ 2950 free_extent_buffer(old); 2951 2952 add_root_to_dirty_list(root); 2953 atomic_inc(&c->refs); 2954 path->nodes[level] = c; 2955 path->locks[level] = BTRFS_WRITE_LOCK; 2956 path->slots[level] = 0; 2957 return 0; 2958 } 2959 2960 /* 2961 * worker function to insert a single pointer in a node. 2962 * the node should have enough room for the pointer already 2963 * 2964 * slot and level indicate where you want the key to go, and 2965 * blocknr is the block the key points to. 
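*
* For example (illustrative): inserting at slot 2 of a node holding the
* pointers [A B C D] first memmoves [C D] up by one position and then
* writes the new key/blockptr, leaving [A B new C D] with nritems bumped
* from 4 to 5.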
2966 */ 2967 static int insert_ptr(struct btrfs_trans_handle *trans, 2968 const struct btrfs_path *path, 2969 const struct btrfs_disk_key *key, u64 bytenr, 2970 int slot, int level) 2971 { 2972 struct extent_buffer *lower; 2973 int nritems; 2974 int ret; 2975 2976 BUG_ON(!path->nodes[level]); 2977 btrfs_assert_tree_write_locked(path->nodes[level]); 2978 lower = path->nodes[level]; 2979 nritems = btrfs_header_nritems(lower); 2980 BUG_ON(slot > nritems); 2981 BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(trans->fs_info)); 2982 if (slot != nritems) { 2983 if (level) { 2984 ret = btrfs_tree_mod_log_insert_move(lower, slot + 1, 2985 slot, nritems - slot); 2986 if (ret < 0) { 2987 btrfs_abort_transaction(trans, ret); 2988 return ret; 2989 } 2990 } 2991 memmove_extent_buffer(lower, 2992 btrfs_node_key_ptr_offset(lower, slot + 1), 2993 btrfs_node_key_ptr_offset(lower, slot), 2994 (nritems - slot) * sizeof(struct btrfs_key_ptr)); 2995 } 2996 if (level) { 2997 ret = btrfs_tree_mod_log_insert_key(lower, slot, 2998 BTRFS_MOD_LOG_KEY_ADD); 2999 if (ret < 0) { 3000 btrfs_abort_transaction(trans, ret); 3001 return ret; 3002 } 3003 } 3004 btrfs_set_node_key(lower, key, slot); 3005 btrfs_set_node_blockptr(lower, slot, bytenr); 3006 WARN_ON(trans->transid == 0); 3007 btrfs_set_node_ptr_generation(lower, slot, trans->transid); 3008 btrfs_set_header_nritems(lower, nritems + 1); 3009 btrfs_mark_buffer_dirty(trans, lower); 3010 3011 return 0; 3012 } 3013 3014 /* 3015 * split the node at the specified level in path in two. 3016 * The path is corrected to point to the appropriate node after the split. 3017 * 3018 * Before splitting, this tries to make some room in the node by pushing 3019 * left and right; if either one works, it returns right away. 3020 * 3021 * returns 0 on success and < 0 on failure 3022 */ 3023 static noinline int split_node(struct btrfs_trans_handle *trans, 3024 struct btrfs_root *root, 3025 struct btrfs_path *path, int level) 3026 { 3027 struct btrfs_fs_info *fs_info = root->fs_info; 3028 struct extent_buffer *c; 3029 struct extent_buffer *split; 3030 struct btrfs_disk_key disk_key; 3031 int mid; 3032 int ret; 3033 u32 c_nritems; 3034 3035 c = path->nodes[level]; 3036 WARN_ON(btrfs_header_generation(c) != trans->transid); 3037 if (c == root->node) { 3038 /* 3039 * trying to split the root, let's make a new one 3040 * 3041 * tree mod log: We don't log removal of the old root in 3042 * insert_new_root, because that root buffer will be kept as a 3043 * normal node. We are going to log removal of half of the 3044 * elements below with btrfs_tree_mod_log_eb_copy(). We're 3045 * holding a tree lock on the buffer, which is why we cannot 3046 * race with other tree_mod_log users.
3047 */ 3048 ret = insert_new_root(trans, root, path, level + 1); 3049 if (ret) 3050 return ret; 3051 } else { 3052 ret = push_nodes_for_insert(trans, root, path, level); 3053 c = path->nodes[level]; 3054 if (!ret && btrfs_header_nritems(c) < 3055 BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3) 3056 return 0; 3057 if (ret < 0) 3058 return ret; 3059 } 3060 3061 c_nritems = btrfs_header_nritems(c); 3062 mid = (c_nritems + 1) / 2; 3063 btrfs_node_key(c, &disk_key, mid); 3064 3065 split = btrfs_alloc_tree_block(trans, root, 0, btrfs_root_id(root), 3066 &disk_key, level, c->start, 0, 3067 0, BTRFS_NESTING_SPLIT); 3068 if (IS_ERR(split)) 3069 return PTR_ERR(split); 3070 3071 root_add_used_bytes(root); 3072 ASSERT(btrfs_header_level(c) == level); 3073 3074 ret = btrfs_tree_mod_log_eb_copy(split, c, 0, mid, c_nritems - mid); 3075 if (ret) { 3076 btrfs_tree_unlock(split); 3077 free_extent_buffer(split); 3078 btrfs_abort_transaction(trans, ret); 3079 return ret; 3080 } 3081 copy_extent_buffer(split, c, 3082 btrfs_node_key_ptr_offset(split, 0), 3083 btrfs_node_key_ptr_offset(c, mid), 3084 (c_nritems - mid) * sizeof(struct btrfs_key_ptr)); 3085 btrfs_set_header_nritems(split, c_nritems - mid); 3086 btrfs_set_header_nritems(c, mid); 3087 3088 btrfs_mark_buffer_dirty(trans, c); 3089 btrfs_mark_buffer_dirty(trans, split); 3090 3091 ret = insert_ptr(trans, path, &disk_key, split->start, 3092 path->slots[level + 1] + 1, level + 1); 3093 if (ret < 0) { 3094 btrfs_tree_unlock(split); 3095 free_extent_buffer(split); 3096 return ret; 3097 } 3098 3099 if (path->slots[level] >= mid) { 3100 path->slots[level] -= mid; 3101 btrfs_tree_unlock(c); 3102 free_extent_buffer(c); 3103 path->nodes[level] = split; 3104 path->slots[level + 1] += 1; 3105 } else { 3106 btrfs_tree_unlock(split); 3107 free_extent_buffer(split); 3108 } 3109 return 0; 3110 } 3111 3112 /* 3113 * how many bytes are required to store the items in a leaf. start 3114 * and nr indicate which items in the leaf to check. This totals up the 3115 * space used both by the item structs and the item data 3116 */ 3117 static int leaf_space_used(const struct extent_buffer *l, int start, int nr) 3118 { 3119 int data_len; 3120 int nritems = btrfs_header_nritems(l); 3121 int end = min(nritems, start + nr) - 1; 3122 3123 if (!nr) 3124 return 0; 3125 data_len = btrfs_item_offset(l, start) + btrfs_item_size(l, start); 3126 data_len = data_len - btrfs_item_offset(l, end); 3127 data_len += sizeof(struct btrfs_item) * nr; 3128 WARN_ON(data_len < 0); 3129 return data_len; 3130 } 3131 3132 /* 3133 * The space between the end of the leaf items and 3134 * the start of the leaf data. IOW, how much room 3135 * the leaf has left for both items and data 3136 */ 3137 int btrfs_leaf_free_space(const struct extent_buffer *leaf) 3138 { 3139 struct btrfs_fs_info *fs_info = leaf->fs_info; 3140 int nritems = btrfs_header_nritems(leaf); 3141 int ret; 3142 3143 ret = BTRFS_LEAF_DATA_SIZE(fs_info) - leaf_space_used(leaf, 0, nritems); 3144 if (ret < 0) { 3145 btrfs_crit(fs_info, 3146 "leaf free space ret %d, leaf data size %lu, used %d nritems %d", 3147 ret, 3148 (unsigned long) BTRFS_LEAF_DATA_SIZE(fs_info), 3149 leaf_space_used(leaf, 0, nritems), nritems); 3150 } 3151 return ret; 3152 } 3153 3154 /* 3155 * min slot controls the lowest index we're willing to push to the 3156 * right. 
We'll push up to and including min_slot, but no lower 3157 */ 3158 static noinline int __push_leaf_right(struct btrfs_trans_handle *trans, 3159 struct btrfs_path *path, 3160 int data_size, int empty, 3161 struct extent_buffer *right, 3162 int free_space, u32 left_nritems, 3163 u32 min_slot) 3164 { 3165 struct btrfs_fs_info *fs_info = right->fs_info; 3166 struct extent_buffer *left = path->nodes[0]; 3167 struct extent_buffer *upper = path->nodes[1]; 3168 struct btrfs_map_token token; 3169 struct btrfs_disk_key disk_key; 3170 int slot; 3171 u32 i; 3172 int push_space = 0; 3173 int push_items = 0; 3174 u32 nr; 3175 u32 right_nritems; 3176 u32 data_end; 3177 u32 this_item_size; 3178 3179 if (empty) 3180 nr = 0; 3181 else 3182 nr = max_t(u32, 1, min_slot); 3183 3184 if (path->slots[0] >= left_nritems) 3185 push_space += data_size; 3186 3187 slot = path->slots[1]; 3188 i = left_nritems - 1; 3189 while (i >= nr) { 3190 if (!empty && push_items > 0) { 3191 if (path->slots[0] > i) 3192 break; 3193 if (path->slots[0] == i) { 3194 int space = btrfs_leaf_free_space(left); 3195 3196 if (space + push_space * 2 > free_space) 3197 break; 3198 } 3199 } 3200 3201 if (path->slots[0] == i) 3202 push_space += data_size; 3203 3204 this_item_size = btrfs_item_size(left, i); 3205 if (this_item_size + sizeof(struct btrfs_item) + 3206 push_space > free_space) 3207 break; 3208 3209 push_items++; 3210 push_space += this_item_size + sizeof(struct btrfs_item); 3211 if (i == 0) 3212 break; 3213 i--; 3214 } 3215 3216 if (push_items == 0) 3217 goto out_unlock; 3218 3219 WARN_ON(!empty && push_items == left_nritems); 3220 3221 /* push left to right */ 3222 right_nritems = btrfs_header_nritems(right); 3223 3224 push_space = btrfs_item_data_end(left, left_nritems - push_items); 3225 push_space -= leaf_data_end(left); 3226 3227 /* make room in the right data area */ 3228 data_end = leaf_data_end(right); 3229 memmove_leaf_data(right, data_end - push_space, data_end, 3230 BTRFS_LEAF_DATA_SIZE(fs_info) - data_end); 3231 3232 /* copy from the left data area */ 3233 copy_leaf_data(right, left, BTRFS_LEAF_DATA_SIZE(fs_info) - push_space, 3234 leaf_data_end(left), push_space); 3235 3236 memmove_leaf_items(right, push_items, 0, right_nritems); 3237 3238 /* copy the items from left to right */ 3239 copy_leaf_items(right, left, 0, left_nritems - push_items, push_items); 3240 3241 /* update the item pointers */ 3242 btrfs_init_map_token(&token, right); 3243 right_nritems += push_items; 3244 btrfs_set_header_nritems(right, right_nritems); 3245 push_space = BTRFS_LEAF_DATA_SIZE(fs_info); 3246 for (i = 0; i < right_nritems; i++) { 3247 push_space -= btrfs_token_item_size(&token, i); 3248 btrfs_set_token_item_offset(&token, i, push_space); 3249 } 3250 3251 left_nritems -= push_items; 3252 btrfs_set_header_nritems(left, left_nritems); 3253 3254 if (left_nritems) 3255 btrfs_mark_buffer_dirty(trans, left); 3256 else 3257 btrfs_clear_buffer_dirty(trans, left); 3258 3259 btrfs_mark_buffer_dirty(trans, right); 3260 3261 btrfs_item_key(right, &disk_key, 0); 3262 btrfs_set_node_key(upper, &disk_key, slot + 1); 3263 btrfs_mark_buffer_dirty(trans, upper); 3264 3265 /* then fixup the leaf pointer in the path */ 3266 if (path->slots[0] >= left_nritems) { 3267 path->slots[0] -= left_nritems; 3268 if (btrfs_header_nritems(path->nodes[0]) == 0) 3269 btrfs_clear_buffer_dirty(trans, path->nodes[0]); 3270 btrfs_tree_unlock(path->nodes[0]); 3271 free_extent_buffer(path->nodes[0]); 3272 path->nodes[0] = right; 3273 path->slots[1] += 1; 3274 } else { 3275 
btrfs_tree_unlock(right); 3276 free_extent_buffer(right); 3277 } 3278 return 0; 3279 3280 out_unlock: 3281 btrfs_tree_unlock(right); 3282 free_extent_buffer(right); 3283 return 1; 3284 } 3285 3286 /* 3287 * push some data in the path leaf to the right, trying to free up at 3288 * least data_size bytes. returns zero if the push worked, nonzero otherwise 3289 * 3290 * returns 1 if the push failed because the other node didn't have enough 3291 * room, 0 if everything worked out and < 0 if there were major errors. 3292 * 3293 * this will push starting from min_slot to the end of the leaf. It won't 3294 * push any slot lower than min_slot 3295 */ 3296 static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root 3297 *root, struct btrfs_path *path, 3298 int min_data_size, int data_size, 3299 int empty, u32 min_slot) 3300 { 3301 struct extent_buffer *left = path->nodes[0]; 3302 struct extent_buffer *right; 3303 struct extent_buffer *upper; 3304 int slot; 3305 int free_space; 3306 u32 left_nritems; 3307 int ret; 3308 3309 if (!path->nodes[1]) 3310 return 1; 3311 3312 slot = path->slots[1]; 3313 upper = path->nodes[1]; 3314 if (slot >= btrfs_header_nritems(upper) - 1) 3315 return 1; 3316 3317 btrfs_assert_tree_write_locked(path->nodes[1]); 3318 3319 right = btrfs_read_node_slot(upper, slot + 1); 3320 if (IS_ERR(right)) 3321 return PTR_ERR(right); 3322 3323 btrfs_tree_lock_nested(right, BTRFS_NESTING_RIGHT); 3324 3325 free_space = btrfs_leaf_free_space(right); 3326 if (free_space < data_size) 3327 goto out_unlock; 3328 3329 ret = btrfs_cow_block(trans, root, right, upper, 3330 slot + 1, &right, BTRFS_NESTING_RIGHT_COW); 3331 if (ret) 3332 goto out_unlock; 3333 3334 left_nritems = btrfs_header_nritems(left); 3335 if (left_nritems == 0) 3336 goto out_unlock; 3337 3338 if (check_sibling_keys(left, right)) { 3339 ret = -EUCLEAN; 3340 btrfs_abort_transaction(trans, ret); 3341 btrfs_tree_unlock(right); 3342 free_extent_buffer(right); 3343 return ret; 3344 } 3345 if (path->slots[0] == left_nritems && !empty) { 3346 /* Key greater than all keys in the leaf, right neighbor has 3347 * enough room for it and we're not emptying our leaf to delete 3348 * it, therefore use right neighbor to insert the new item and 3349 * no need to touch/dirty our left leaf. */ 3350 btrfs_tree_unlock(left); 3351 free_extent_buffer(left); 3352 path->nodes[0] = right; 3353 path->slots[0] = 0; 3354 path->slots[1]++; 3355 return 0; 3356 } 3357 3358 return __push_leaf_right(trans, path, min_data_size, empty, right, 3359 free_space, left_nritems, min_slot); 3360 out_unlock: 3361 btrfs_tree_unlock(right); 3362 free_extent_buffer(right); 3363 return 1; 3364 } 3365 3366 /* 3367 * push some data in the path leaf to the left, trying to free up at 3368 * least data_size bytes. returns zero if the push worked, nonzero otherwise 3369 * 3370 * max_slot can put a limit on how far into the leaf we'll push items. The 3371 * item at 'max_slot' won't be touched. 
Use (u32)-1 to make us do all the 3372 * items 3373 */ 3374 static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, 3375 struct btrfs_path *path, int data_size, 3376 int empty, struct extent_buffer *left, 3377 int free_space, u32 right_nritems, 3378 u32 max_slot) 3379 { 3380 struct btrfs_fs_info *fs_info = left->fs_info; 3381 struct btrfs_disk_key disk_key; 3382 struct extent_buffer *right = path->nodes[0]; 3383 int i; 3384 int push_space = 0; 3385 int push_items = 0; 3386 u32 old_left_nritems; 3387 u32 nr; 3388 int ret = 0; 3389 u32 this_item_size; 3390 u32 old_left_item_size; 3391 struct btrfs_map_token token; 3392 3393 if (empty) 3394 nr = min(right_nritems, max_slot); 3395 else 3396 nr = min(right_nritems - 1, max_slot); 3397 3398 for (i = 0; i < nr; i++) { 3399 if (!empty && push_items > 0) { 3400 if (path->slots[0] < i) 3401 break; 3402 if (path->slots[0] == i) { 3403 int space = btrfs_leaf_free_space(right); 3404 3405 if (space + push_space * 2 > free_space) 3406 break; 3407 } 3408 } 3409 3410 if (path->slots[0] == i) 3411 push_space += data_size; 3412 3413 this_item_size = btrfs_item_size(right, i); 3414 if (this_item_size + sizeof(struct btrfs_item) + push_space > 3415 free_space) 3416 break; 3417 3418 push_items++; 3419 push_space += this_item_size + sizeof(struct btrfs_item); 3420 } 3421 3422 if (push_items == 0) { 3423 ret = 1; 3424 goto out; 3425 } 3426 WARN_ON(!empty && push_items == btrfs_header_nritems(right)); 3427 3428 /* push data from right to left */ 3429 copy_leaf_items(left, right, btrfs_header_nritems(left), 0, push_items); 3430 3431 push_space = BTRFS_LEAF_DATA_SIZE(fs_info) - 3432 btrfs_item_offset(right, push_items - 1); 3433 3434 copy_leaf_data(left, right, leaf_data_end(left) - push_space, 3435 btrfs_item_offset(right, push_items - 1), push_space); 3436 old_left_nritems = btrfs_header_nritems(left); 3437 BUG_ON(old_left_nritems <= 0); 3438 3439 btrfs_init_map_token(&token, left); 3440 old_left_item_size = btrfs_item_offset(left, old_left_nritems - 1); 3441 for (i = old_left_nritems; i < old_left_nritems + push_items; i++) { 3442 u32 ioff; 3443 3444 ioff = btrfs_token_item_offset(&token, i); 3445 btrfs_set_token_item_offset(&token, i, 3446 ioff - (BTRFS_LEAF_DATA_SIZE(fs_info) - old_left_item_size)); 3447 } 3448 btrfs_set_header_nritems(left, old_left_nritems + push_items); 3449 3450 /* fixup right node */ 3451 if (push_items > right_nritems) 3452 WARN(1, KERN_CRIT "push items %d nr %u\n", push_items, 3453 right_nritems); 3454 3455 if (push_items < right_nritems) { 3456 push_space = btrfs_item_offset(right, push_items - 1) - 3457 leaf_data_end(right); 3458 memmove_leaf_data(right, 3459 BTRFS_LEAF_DATA_SIZE(fs_info) - push_space, 3460 leaf_data_end(right), push_space); 3461 3462 memmove_leaf_items(right, 0, push_items, 3463 btrfs_header_nritems(right) - push_items); 3464 } 3465 3466 btrfs_init_map_token(&token, right); 3467 right_nritems -= push_items; 3468 btrfs_set_header_nritems(right, right_nritems); 3469 push_space = BTRFS_LEAF_DATA_SIZE(fs_info); 3470 for (i = 0; i < right_nritems; i++) { 3471 push_space = push_space - btrfs_token_item_size(&token, i); 3472 btrfs_set_token_item_offset(&token, i, push_space); 3473 } 3474 3475 btrfs_mark_buffer_dirty(trans, left); 3476 if (right_nritems) 3477 btrfs_mark_buffer_dirty(trans, right); 3478 else 3479 btrfs_clear_buffer_dirty(trans, right); 3480 3481 btrfs_item_key(right, &disk_key, 0); 3482 fixup_low_keys(trans, path, &disk_key, 1); 3483 3484 /* then fixup the leaf pointer in the path */ 3485 if 
(path->slots[0] < push_items) { 3486 path->slots[0] += old_left_nritems; 3487 btrfs_tree_unlock(path->nodes[0]); 3488 free_extent_buffer(path->nodes[0]); 3489 path->nodes[0] = left; 3490 path->slots[1] -= 1; 3491 } else { 3492 btrfs_tree_unlock(left); 3493 free_extent_buffer(left); 3494 path->slots[0] -= push_items; 3495 } 3496 BUG_ON(path->slots[0] < 0); 3497 return ret; 3498 out: 3499 btrfs_tree_unlock(left); 3500 free_extent_buffer(left); 3501 return ret; 3502 } 3503 3504 /* 3505 * push some data in the path leaf to the left, trying to free up at 3506 * least data_size bytes. returns zero if the push worked, nonzero otherwise 3507 * 3508 * max_slot can put a limit on how far into the leaf we'll push items. The 3509 * item at 'max_slot' won't be touched. Use (u32)-1 to make us push all the 3510 * items 3511 */ 3512 static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root 3513 *root, struct btrfs_path *path, int min_data_size, 3514 int data_size, int empty, u32 max_slot) 3515 { 3516 struct extent_buffer *right = path->nodes[0]; 3517 struct extent_buffer *left; 3518 int slot; 3519 int free_space; 3520 u32 right_nritems; 3521 int ret = 0; 3522 3523 slot = path->slots[1]; 3524 if (slot == 0) 3525 return 1; 3526 if (!path->nodes[1]) 3527 return 1; 3528 3529 right_nritems = btrfs_header_nritems(right); 3530 if (right_nritems == 0) 3531 return 1; 3532 3533 btrfs_assert_tree_write_locked(path->nodes[1]); 3534 3535 left = btrfs_read_node_slot(path->nodes[1], slot - 1); 3536 if (IS_ERR(left)) 3537 return PTR_ERR(left); 3538 3539 btrfs_tree_lock_nested(left, BTRFS_NESTING_LEFT); 3540 3541 free_space = btrfs_leaf_free_space(left); 3542 if (free_space < data_size) { 3543 ret = 1; 3544 goto out; 3545 } 3546 3547 ret = btrfs_cow_block(trans, root, left, 3548 path->nodes[1], slot - 1, &left, 3549 BTRFS_NESTING_LEFT_COW); 3550 if (ret) { 3551 /* we hit -ENOSPC, but it isn't fatal here */ 3552 if (ret == -ENOSPC) 3553 ret = 1; 3554 goto out; 3555 } 3556 3557 if (check_sibling_keys(left, right)) { 3558 ret = -EUCLEAN; 3559 btrfs_abort_transaction(trans, ret); 3560 goto out; 3561 } 3562 return __push_leaf_left(trans, path, min_data_size, empty, left, 3563 free_space, right_nritems, max_slot); 3564 out: 3565 btrfs_tree_unlock(left); 3566 free_extent_buffer(left); 3567 return ret; 3568 } 3569 3570 /* 3571 * split the path's leaf in two, making sure there is at least data_size 3572 * available for the resulting leaf level of the path. 
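*
* Sketch of the copy done below: items [mid, nritems) of leaf @l move to
* the new right leaf together with their data, and each moved item's data
* offset is then shifted by a constant (rt_data_off in the code) so the
* data is again packed against the end of the right leaf's data area.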
3573 */ 3574 static noinline int copy_for_split(struct btrfs_trans_handle *trans, 3575 struct btrfs_path *path, 3576 struct extent_buffer *l, 3577 struct extent_buffer *right, 3578 int slot, int mid, int nritems) 3579 { 3580 struct btrfs_fs_info *fs_info = trans->fs_info; 3581 int data_copy_size; 3582 int rt_data_off; 3583 int i; 3584 int ret; 3585 struct btrfs_disk_key disk_key; 3586 struct btrfs_map_token token; 3587 3588 nritems = nritems - mid; 3589 btrfs_set_header_nritems(right, nritems); 3590 data_copy_size = btrfs_item_data_end(l, mid) - leaf_data_end(l); 3591 3592 copy_leaf_items(right, l, 0, mid, nritems); 3593 3594 copy_leaf_data(right, l, BTRFS_LEAF_DATA_SIZE(fs_info) - data_copy_size, 3595 leaf_data_end(l), data_copy_size); 3596 3597 rt_data_off = BTRFS_LEAF_DATA_SIZE(fs_info) - btrfs_item_data_end(l, mid); 3598 3599 btrfs_init_map_token(&token, right); 3600 for (i = 0; i < nritems; i++) { 3601 u32 ioff; 3602 3603 ioff = btrfs_token_item_offset(&token, i); 3604 btrfs_set_token_item_offset(&token, i, ioff + rt_data_off); 3605 } 3606 3607 btrfs_set_header_nritems(l, mid); 3608 btrfs_item_key(right, &disk_key, 0); 3609 ret = insert_ptr(trans, path, &disk_key, right->start, path->slots[1] + 1, 1); 3610 if (ret < 0) 3611 return ret; 3612 3613 btrfs_mark_buffer_dirty(trans, right); 3614 btrfs_mark_buffer_dirty(trans, l); 3615 BUG_ON(path->slots[0] != slot); 3616 3617 if (mid <= slot) { 3618 btrfs_tree_unlock(path->nodes[0]); 3619 free_extent_buffer(path->nodes[0]); 3620 path->nodes[0] = right; 3621 path->slots[0] -= mid; 3622 path->slots[1] += 1; 3623 } else { 3624 btrfs_tree_unlock(right); 3625 free_extent_buffer(right); 3626 } 3627 3628 BUG_ON(path->slots[0] < 0); 3629 3630 return 0; 3631 } 3632 3633 /* 3634 * double splits happen when we need to insert a big item in the middle 3635 * of a leaf. A double split can leave us with 3 mostly empty leaves: 3636 * leaf: [ slots 0 - N] [ our target ] [ N + 1 - total in leaf ] 3637 * A B C 3638 * 3639 * We avoid this by trying to push the items on either side of our target 3640 * into the adjacent leaves. If all goes well we can avoid the double split 3641 * completely. 3642 */ 3643 static noinline int push_for_double_split(struct btrfs_trans_handle *trans, 3644 struct btrfs_root *root, 3645 struct btrfs_path *path, 3646 int data_size) 3647 { 3648 int ret; 3649 int progress = 0; 3650 int slot; 3651 u32 nritems; 3652 int space_needed = data_size; 3653 3654 slot = path->slots[0]; 3655 if (slot < btrfs_header_nritems(path->nodes[0])) 3656 space_needed -= btrfs_leaf_free_space(path->nodes[0]); 3657 3658 /* 3659 * try to push all the items after our slot into the 3660 * right leaf 3661 */ 3662 ret = push_leaf_right(trans, root, path, 1, space_needed, 0, slot); 3663 if (ret < 0) 3664 return ret; 3665 3666 if (ret == 0) 3667 progress++; 3668 3669 nritems = btrfs_header_nritems(path->nodes[0]); 3670 /* 3671 * our goal is to get our slot at the start or end of a leaf. 
If 3672 * we've done so we're done 3673 */ 3674 if (path->slots[0] == 0 || path->slots[0] == nritems) 3675 return 0; 3676 3677 if (btrfs_leaf_free_space(path->nodes[0]) >= data_size) 3678 return 0; 3679 3680 /* try to push all the items before our slot into the next leaf */ 3681 slot = path->slots[0]; 3682 space_needed = data_size; 3683 if (slot > 0) 3684 space_needed -= btrfs_leaf_free_space(path->nodes[0]); 3685 ret = push_leaf_left(trans, root, path, 1, space_needed, 0, slot); 3686 if (ret < 0) 3687 return ret; 3688 3689 if (ret == 0) 3690 progress++; 3691 3692 if (progress) 3693 return 0; 3694 return 1; 3695 } 3696 3697 /* 3698 * split the path's leaf in two, making sure there is at least data_size 3699 * available for the resulting leaf level of the path. 3700 * 3701 * returns 0 if all went well and < 0 on failure. 3702 */ 3703 static noinline int split_leaf(struct btrfs_trans_handle *trans, 3704 struct btrfs_root *root, 3705 const struct btrfs_key *ins_key, 3706 struct btrfs_path *path, int data_size, 3707 int extend) 3708 { 3709 struct btrfs_disk_key disk_key; 3710 struct extent_buffer *l; 3711 u32 nritems; 3712 int mid; 3713 int slot; 3714 struct extent_buffer *right; 3715 struct btrfs_fs_info *fs_info = root->fs_info; 3716 int ret = 0; 3717 int wret; 3718 int split; 3719 int num_doubles = 0; 3720 int tried_avoid_double = 0; 3721 3722 l = path->nodes[0]; 3723 slot = path->slots[0]; 3724 if (extend && data_size + btrfs_item_size(l, slot) + 3725 sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(fs_info)) 3726 return -EOVERFLOW; 3727 3728 /* first try to make some room by pushing left and right */ 3729 if (data_size && path->nodes[1]) { 3730 int space_needed = data_size; 3731 3732 if (slot < btrfs_header_nritems(l)) 3733 space_needed -= btrfs_leaf_free_space(l); 3734 3735 wret = push_leaf_right(trans, root, path, space_needed, 3736 space_needed, 0, 0); 3737 if (wret < 0) 3738 return wret; 3739 if (wret) { 3740 space_needed = data_size; 3741 if (slot > 0) 3742 space_needed -= btrfs_leaf_free_space(l); 3743 wret = push_leaf_left(trans, root, path, space_needed, 3744 space_needed, 0, (u32)-1); 3745 if (wret < 0) 3746 return wret; 3747 } 3748 l = path->nodes[0]; 3749 3750 /* did the pushes work? 
*/ 3751 if (btrfs_leaf_free_space(l) >= data_size) 3752 return 0; 3753 } 3754 3755 if (!path->nodes[1]) { 3756 ret = insert_new_root(trans, root, path, 1); 3757 if (ret) 3758 return ret; 3759 } 3760 again: 3761 split = 1; 3762 l = path->nodes[0]; 3763 slot = path->slots[0]; 3764 nritems = btrfs_header_nritems(l); 3765 mid = (nritems + 1) / 2; 3766 3767 if (mid <= slot) { 3768 if (nritems == 1 || 3769 leaf_space_used(l, mid, nritems - mid) + data_size > 3770 BTRFS_LEAF_DATA_SIZE(fs_info)) { 3771 if (slot >= nritems) { 3772 split = 0; 3773 } else { 3774 mid = slot; 3775 if (mid != nritems && 3776 leaf_space_used(l, mid, nritems - mid) + 3777 data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) { 3778 if (data_size && !tried_avoid_double) 3779 goto push_for_double; 3780 split = 2; 3781 } 3782 } 3783 } 3784 } else { 3785 if (leaf_space_used(l, 0, mid) + data_size > 3786 BTRFS_LEAF_DATA_SIZE(fs_info)) { 3787 if (!extend && data_size && slot == 0) { 3788 split = 0; 3789 } else if ((extend || !data_size) && slot == 0) { 3790 mid = 1; 3791 } else { 3792 mid = slot; 3793 if (mid != nritems && 3794 leaf_space_used(l, mid, nritems - mid) + 3795 data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) { 3796 if (data_size && !tried_avoid_double) 3797 goto push_for_double; 3798 split = 2; 3799 } 3800 } 3801 } 3802 } 3803 3804 if (split == 0) 3805 btrfs_cpu_key_to_disk(&disk_key, ins_key); 3806 else 3807 btrfs_item_key(l, &disk_key, mid); 3808 3809 /* 3810 * We have to use BTRFS_NESTING_NEW_ROOT here if we've done a double 3811 * split, because we're only allowed to have MAX_LOCKDEP_SUBCLASSES 3812 * subclasses, which is 8 at the time of this patch, and we've maxed it 3813 * out. In the future we could add a 3814 * BTRFS_NESTING_SPLIT_THE_SPLITTENING if we need to, but for now just 3815 * use BTRFS_NESTING_NEW_ROOT. 3816 */ 3817 right = btrfs_alloc_tree_block(trans, root, 0, btrfs_root_id(root), 3818 &disk_key, 0, l->start, 0, 0, 3819 num_doubles ? BTRFS_NESTING_NEW_ROOT : 3820 BTRFS_NESTING_SPLIT); 3821 if (IS_ERR(right)) 3822 return PTR_ERR(right); 3823 3824 root_add_used_bytes(root); 3825 3826 if (split == 0) { 3827 if (mid <= slot) { 3828 btrfs_set_header_nritems(right, 0); 3829 ret = insert_ptr(trans, path, &disk_key, 3830 right->start, path->slots[1] + 1, 1); 3831 if (ret < 0) { 3832 btrfs_tree_unlock(right); 3833 free_extent_buffer(right); 3834 return ret; 3835 } 3836 btrfs_tree_unlock(path->nodes[0]); 3837 free_extent_buffer(path->nodes[0]); 3838 path->nodes[0] = right; 3839 path->slots[0] = 0; 3840 path->slots[1] += 1; 3841 } else { 3842 btrfs_set_header_nritems(right, 0); 3843 ret = insert_ptr(trans, path, &disk_key, 3844 right->start, path->slots[1], 1); 3845 if (ret < 0) { 3846 btrfs_tree_unlock(right); 3847 free_extent_buffer(right); 3848 return ret; 3849 } 3850 btrfs_tree_unlock(path->nodes[0]); 3851 free_extent_buffer(path->nodes[0]); 3852 path->nodes[0] = right; 3853 path->slots[0] = 0; 3854 if (path->slots[1] == 0) 3855 fixup_low_keys(trans, path, &disk_key, 1); 3856 } 3857 /* 3858 * We created a new leaf 'right' with enough room for the required 3859 * ins_len; btrfs_mark_buffer_dirty() will be called on this leaf 3860 * after the caller copies the new item's content into 'right'.
3861 */ 3862 return ret; 3863 } 3864 3865 ret = copy_for_split(trans, path, l, right, slot, mid, nritems); 3866 if (ret < 0) { 3867 btrfs_tree_unlock(right); 3868 free_extent_buffer(right); 3869 return ret; 3870 } 3871 3872 if (split == 2) { 3873 BUG_ON(num_doubles != 0); 3874 num_doubles++; 3875 goto again; 3876 } 3877 3878 return 0; 3879 3880 push_for_double: 3881 push_for_double_split(trans, root, path, data_size); 3882 tried_avoid_double = 1; 3883 if (btrfs_leaf_free_space(path->nodes[0]) >= data_size) 3884 return 0; 3885 goto again; 3886 } 3887 3888 static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans, 3889 struct btrfs_root *root, 3890 struct btrfs_path *path, int ins_len) 3891 { 3892 struct btrfs_key key; 3893 struct extent_buffer *leaf; 3894 struct btrfs_file_extent_item *fi; 3895 u64 extent_len = 0; 3896 u32 item_size; 3897 int ret; 3898 3899 leaf = path->nodes[0]; 3900 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 3901 3902 BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY && 3903 key.type != BTRFS_EXTENT_CSUM_KEY); 3904 3905 if (btrfs_leaf_free_space(leaf) >= ins_len) 3906 return 0; 3907 3908 item_size = btrfs_item_size(leaf, path->slots[0]); 3909 if (key.type == BTRFS_EXTENT_DATA_KEY) { 3910 fi = btrfs_item_ptr(leaf, path->slots[0], 3911 struct btrfs_file_extent_item); 3912 extent_len = btrfs_file_extent_num_bytes(leaf, fi); 3913 } 3914 btrfs_release_path(path); 3915 3916 path->keep_locks = 1; 3917 path->search_for_split = 1; 3918 ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 3919 path->search_for_split = 0; 3920 if (ret > 0) 3921 ret = -EAGAIN; 3922 if (ret < 0) 3923 goto err; 3924 3925 ret = -EAGAIN; 3926 leaf = path->nodes[0]; 3927 /* if our item isn't there, return now */ 3928 if (item_size != btrfs_item_size(leaf, path->slots[0])) 3929 goto err; 3930 3931 /* the leaf has changed, it now has room. return now */ 3932 if (btrfs_leaf_free_space(path->nodes[0]) >= ins_len) 3933 goto err; 3934 3935 if (key.type == BTRFS_EXTENT_DATA_KEY) { 3936 fi = btrfs_item_ptr(leaf, path->slots[0], 3937 struct btrfs_file_extent_item); 3938 if (extent_len != btrfs_file_extent_num_bytes(leaf, fi)) 3939 goto err; 3940 } 3941 3942 ret = split_leaf(trans, root, &key, path, ins_len, 1); 3943 if (ret) 3944 goto err; 3945 3946 path->keep_locks = 0; 3947 btrfs_unlock_up_safe(path, 1); 3948 return 0; 3949 err: 3950 path->keep_locks = 0; 3951 return ret; 3952 } 3953 3954 static noinline int split_item(struct btrfs_trans_handle *trans, 3955 struct btrfs_path *path, 3956 const struct btrfs_key *new_key, 3957 unsigned long split_offset) 3958 { 3959 struct extent_buffer *leaf; 3960 int orig_slot, slot; 3961 char *buf; 3962 u32 nritems; 3963 u32 item_size; 3964 u32 orig_offset; 3965 struct btrfs_disk_key disk_key; 3966 3967 leaf = path->nodes[0]; 3968 /* 3969 * Shouldn't happen because the caller must have previously called 3970 * setup_leaf_for_split() to make room for the new item in the leaf. 
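*
* Worked example of the split performed below (illustrative numbers):
* splitting a 100 byte item at split_offset == 60 shrinks the original
* item to 60 bytes and creates a new item, under new_key, covering the
* trailing 40 bytes at the old item's original data offset.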

static noinline int split_item(struct btrfs_trans_handle *trans,
			       struct btrfs_path *path,
			       const struct btrfs_key *new_key,
			       unsigned long split_offset)
{
	struct extent_buffer *leaf;
	int orig_slot, slot;
	char *buf;
	u32 nritems;
	u32 item_size;
	u32 orig_offset;
	struct btrfs_disk_key disk_key;

	leaf = path->nodes[0];
	/*
	 * Shouldn't happen because the caller must have previously called
	 * setup_leaf_for_split() to make room for the new item in the leaf.
	 */
	if (WARN_ON(btrfs_leaf_free_space(leaf) < sizeof(struct btrfs_item)))
		return -ENOSPC;

	orig_slot = path->slots[0];
	orig_offset = btrfs_item_offset(leaf, path->slots[0]);
	item_size = btrfs_item_size(leaf, path->slots[0]);

	buf = kmalloc(item_size, GFP_NOFS);
	if (!buf)
		return -ENOMEM;

	read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf,
			   path->slots[0]), item_size);

	slot = path->slots[0] + 1;
	nritems = btrfs_header_nritems(leaf);
	if (slot != nritems) {
		/* shift the items */
		memmove_leaf_items(leaf, slot + 1, slot, nritems - slot);
	}

	btrfs_cpu_key_to_disk(&disk_key, new_key);
	btrfs_set_item_key(leaf, &disk_key, slot);

	btrfs_set_item_offset(leaf, slot, orig_offset);
	btrfs_set_item_size(leaf, slot, item_size - split_offset);

	btrfs_set_item_offset(leaf, orig_slot,
			      orig_offset + item_size - split_offset);
	btrfs_set_item_size(leaf, orig_slot, split_offset);

	btrfs_set_header_nritems(leaf, nritems + 1);

	/* write the data for the start of the original item */
	write_extent_buffer(leaf, buf,
			    btrfs_item_ptr_offset(leaf, path->slots[0]),
			    split_offset);

	/* write the data for the new item */
	write_extent_buffer(leaf, buf + split_offset,
			    btrfs_item_ptr_offset(leaf, slot),
			    item_size - split_offset);
	btrfs_mark_buffer_dirty(trans, leaf);

	BUG_ON(btrfs_leaf_free_space(leaf) < 0);
	kfree(buf);
	return 0;
}

/*
 * This function splits a single item into two items,
 * giving 'new_key' to the new item and splitting the
 * old one at split_offset (from the start of the item).
 *
 * The path may be released by this operation. After
 * the split, the path is pointing to the old item. The
 * new item is going to be in the same node as the old one.
 *
 * Note: the item being split must be small enough to live alone on
 * a tree block, with room for one extra struct btrfs_item.
 *
 * This allows us to split the item in place, keeping a lock on the
 * leaf the entire time.
 */
int btrfs_split_item(struct btrfs_trans_handle *trans,
		     struct btrfs_root *root,
		     struct btrfs_path *path,
		     const struct btrfs_key *new_key,
		     unsigned long split_offset)
{
	int ret;
	ret = setup_leaf_for_split(trans, root, path,
				   sizeof(struct btrfs_item));
	if (ret)
		return ret;

	ret = split_item(trans, path, new_key, split_offset);
	return ret;
}
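
/*
 * Illustrative sketch (editor's addition, not part of the original file): a
 * minimal caller of btrfs_split_item(). The helper name and key layout are
 * hypothetical; it assumes @trans and @root are valid and that @key refers to
 * an existing EXTENT_DATA or EXTENT_CSUM item, the only key types
 * setup_leaf_for_split() accepts.
 */
static int __maybe_unused example_split_item(struct btrfs_trans_handle *trans,
					     struct btrfs_root *root,
					     const struct btrfs_key *key,
					     u64 second_half_offset,
					     unsigned long split_offset)
{
	struct btrfs_path *path;
	struct btrfs_key new_key;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	/* COW and lock the leaf holding the item we want to split. */
	ret = btrfs_search_slot(trans, root, key, path, 0, 1);
	if (ret > 0)
		ret = -ENOENT;
	if (ret < 0)
		goto out;

	/* The second half keeps the same objectid/type, at a new offset. */
	new_key.objectid = key->objectid;
	new_key.type = key->type;
	new_key.offset = second_half_offset;

	/* Split in place; the path still points at the original item. */
	ret = btrfs_split_item(trans, root, path, &new_key, split_offset);
out:
	btrfs_free_path(path);
	return ret;
}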

/*
 * make the item pointed to by the path smaller. new_size indicates
 * how small to make it, and from_end tells us if we just chop bytes
 * off the end of the item or if we shift the item to chop bytes off
 * the front.
 */
void btrfs_truncate_item(struct btrfs_trans_handle *trans,
			 const struct btrfs_path *path, u32 new_size, int from_end)
{
	int slot;
	struct extent_buffer *leaf;
	u32 nritems;
	unsigned int data_end;
	unsigned int old_data_start;
	unsigned int old_size;
	unsigned int size_diff;
	int i;
	struct btrfs_map_token token;

	leaf = path->nodes[0];
	slot = path->slots[0];

	old_size = btrfs_item_size(leaf, slot);
	if (old_size == new_size)
		return;

	nritems = btrfs_header_nritems(leaf);
	data_end = leaf_data_end(leaf);

	old_data_start = btrfs_item_offset(leaf, slot);

	size_diff = old_size - new_size;

	BUG_ON(slot < 0);
	BUG_ON(slot >= nritems);

	/*
	 * item0..itemN ... dataN.offset..dataN.size .. data0.size
	 */
	/* first correct the data pointers */
	btrfs_init_map_token(&token, leaf);
	for (i = slot; i < nritems; i++) {
		u32 ioff;

		ioff = btrfs_token_item_offset(&token, i);
		btrfs_set_token_item_offset(&token, i, ioff + size_diff);
	}

	/* shift the data */
	if (from_end) {
		memmove_leaf_data(leaf, data_end + size_diff, data_end,
				  old_data_start + new_size - data_end);
	} else {
		struct btrfs_disk_key disk_key;
		u64 offset;

		btrfs_item_key(leaf, &disk_key, slot);

		if (btrfs_disk_key_type(&disk_key) == BTRFS_EXTENT_DATA_KEY) {
			unsigned long ptr;
			struct btrfs_file_extent_item *fi;

			fi = btrfs_item_ptr(leaf, slot,
					    struct btrfs_file_extent_item);
			fi = (struct btrfs_file_extent_item *)(
			     (unsigned long)fi - size_diff);

			if (btrfs_file_extent_type(leaf, fi) ==
			    BTRFS_FILE_EXTENT_INLINE) {
				ptr = btrfs_item_ptr_offset(leaf, slot);
				memmove_extent_buffer(leaf, ptr,
				      (unsigned long)fi,
				      BTRFS_FILE_EXTENT_INLINE_DATA_START);
			}
		}

		memmove_leaf_data(leaf, data_end + size_diff, data_end,
				  old_data_start - data_end);

		offset = btrfs_disk_key_offset(&disk_key);
		btrfs_set_disk_key_offset(&disk_key, offset + size_diff);
		btrfs_set_item_key(leaf, &disk_key, slot);
		if (slot == 0)
			fixup_low_keys(trans, path, &disk_key, 1);
	}

	btrfs_set_item_size(leaf, slot, new_size);
	btrfs_mark_buffer_dirty(trans, leaf);

	if (btrfs_leaf_free_space(leaf) < 0) {
		btrfs_print_leaf(leaf);
		BUG();
	}
}
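
/*
 * Illustrative sketch (editor's addition): shrinking an existing item with
 * btrfs_truncate_item(). The helper and its key are hypothetical; it assumes
 * the caller holds a transaction and that the found item is at least
 * @new_size bytes. Passing from_end == 1 chops bytes off the item's tail.
 */
static int __maybe_unused example_truncate_item(struct btrfs_trans_handle *trans,
						struct btrfs_root *root,
						const struct btrfs_key *key,
						u32 new_size)
{
	struct btrfs_path *path;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = btrfs_search_slot(trans, root, key, path, 0, 1);
	if (ret > 0)
		ret = -ENOENT;
	if (!ret)
		/* Keep the first @new_size bytes, discard the rest. */
		btrfs_truncate_item(trans, path, new_size, 1);

	btrfs_free_path(path);
	return ret;
}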

/*
 * make the item pointed to by the path bigger, data_size is the added size.
 */
void btrfs_extend_item(struct btrfs_trans_handle *trans,
		       const struct btrfs_path *path, u32 data_size)
{
	int slot;
	struct extent_buffer *leaf;
	u32 nritems;
	unsigned int data_end;
	unsigned int old_data;
	unsigned int old_size;
	int i;
	struct btrfs_map_token token;

	leaf = path->nodes[0];

	nritems = btrfs_header_nritems(leaf);
	data_end = leaf_data_end(leaf);

	if (btrfs_leaf_free_space(leaf) < data_size) {
		btrfs_print_leaf(leaf);
		BUG();
	}
	slot = path->slots[0];
	old_data = btrfs_item_data_end(leaf, slot);

	BUG_ON(slot < 0);
	if (slot >= nritems) {
		btrfs_print_leaf(leaf);
		btrfs_crit(leaf->fs_info, "slot %d too large, nritems %d",
			   slot, nritems);
		BUG();
	}

	/*
	 * item0..itemN ... dataN.offset..dataN.size .. data0.size
	 */
	/* first correct the data pointers */
	btrfs_init_map_token(&token, leaf);
	for (i = slot; i < nritems; i++) {
		u32 ioff;

		ioff = btrfs_token_item_offset(&token, i);
		btrfs_set_token_item_offset(&token, i, ioff - data_size);
	}

	/* shift the data */
	memmove_leaf_data(leaf, data_end - data_size, data_end,
			  old_data - data_end);

	data_end = old_data;
	old_size = btrfs_item_size(leaf, slot);
	btrfs_set_item_size(leaf, slot, old_size + data_size);
	btrfs_mark_buffer_dirty(trans, leaf);

	if (btrfs_leaf_free_space(leaf) < 0) {
		btrfs_print_leaf(leaf);
		BUG();
	}
}
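
/*
 * Illustrative sketch (editor's addition): growing an item in place with
 * btrfs_extend_item(). The helper is hypothetical. Room for the extra bytes
 * is reserved by searching with a non-zero ins_len, which splits the leaf
 * beforehand if needed; btrfs_extend_item() itself BUGs on lack of space.
 */
static int __maybe_unused example_extend_item(struct btrfs_trans_handle *trans,
					      struct btrfs_root *root,
					      const struct btrfs_key *key,
					      u32 extra_bytes)
{
	struct btrfs_path *path;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	/* ins_len == extra_bytes guarantees the leaf can absorb the growth. */
	ret = btrfs_search_slot(trans, root, key, path, extra_bytes, 1);
	if (ret > 0)
		ret = -ENOENT;
	if (!ret)
		btrfs_extend_item(trans, path, extra_bytes);

	btrfs_free_path(path);
	return ret;
}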

/*
 * Make space in the leaf before inserting one or more items.
 *
 * @trans:	transaction handle
 * @root:	root we are inserting items to
 * @path:	points to the leaf/slot where we are going to insert new items
 * @batch:      information about the batch of items to insert
 *
 * Main purpose is to save stack depth by doing the bulk of the work in a
 * function that doesn't call btrfs_search_slot
 */
static void setup_items_for_insert(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root, struct btrfs_path *path,
				   const struct btrfs_item_batch *batch)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	int i;
	u32 nritems;
	unsigned int data_end;
	struct btrfs_disk_key disk_key;
	struct extent_buffer *leaf;
	int slot;
	struct btrfs_map_token token;
	u32 total_size;

	/*
	 * Before anything else, update keys in the parent and other ancestors
	 * if needed, then release the write locks on them, so that other tasks
	 * can use them while we modify the leaf.
	 */
	if (path->slots[0] == 0) {
		btrfs_cpu_key_to_disk(&disk_key, &batch->keys[0]);
		fixup_low_keys(trans, path, &disk_key, 1);
	}
	btrfs_unlock_up_safe(path, 1);

	leaf = path->nodes[0];
	slot = path->slots[0];

	nritems = btrfs_header_nritems(leaf);
	data_end = leaf_data_end(leaf);
	total_size = batch->total_data_size + (batch->nr * sizeof(struct btrfs_item));

	if (btrfs_leaf_free_space(leaf) < total_size) {
		btrfs_print_leaf(leaf);
		btrfs_crit(fs_info, "not enough freespace need %u have %d",
			   total_size, btrfs_leaf_free_space(leaf));
		BUG();
	}

	btrfs_init_map_token(&token, leaf);
	if (slot != nritems) {
		unsigned int old_data = btrfs_item_data_end(leaf, slot);

		if (old_data < data_end) {
			btrfs_print_leaf(leaf);
			btrfs_crit(fs_info,
		"item at slot %d with data offset %u beyond data end of leaf %u",
				   slot, old_data, data_end);
			BUG();
		}
		/*
		 * item0..itemN ... dataN.offset..dataN.size .. data0.size
		 */
		/* first correct the data pointers */
		for (i = slot; i < nritems; i++) {
			u32 ioff;

			ioff = btrfs_token_item_offset(&token, i);
			btrfs_set_token_item_offset(&token, i,
						    ioff - batch->total_data_size);
		}
		/* shift the items */
		memmove_leaf_items(leaf, slot + batch->nr, slot, nritems - slot);

		/* shift the data */
		memmove_leaf_data(leaf, data_end - batch->total_data_size,
				  data_end, old_data - data_end);
		data_end = old_data;
	}

	/* setup the item for the new data */
	for (i = 0; i < batch->nr; i++) {
		btrfs_cpu_key_to_disk(&disk_key, &batch->keys[i]);
		btrfs_set_item_key(leaf, &disk_key, slot + i);
		data_end -= batch->data_sizes[i];
		btrfs_set_token_item_offset(&token, slot + i, data_end);
		btrfs_set_token_item_size(&token, slot + i, batch->data_sizes[i]);
	}

	btrfs_set_header_nritems(leaf, nritems + batch->nr);
	btrfs_mark_buffer_dirty(trans, leaf);

	if (btrfs_leaf_free_space(leaf) < 0) {
		btrfs_print_leaf(leaf);
		BUG();
	}
}

/*
 * Insert a new item into a leaf.
 *
 * @trans:     Transaction handle.
 * @root:      The root of the btree.
 * @path:      A path pointing to the target leaf and slot.
 * @key:       The key of the new item.
 * @data_size: The size of the data associated with the new key.
 */
void btrfs_setup_item_for_insert(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root,
				 struct btrfs_path *path,
				 const struct btrfs_key *key,
				 u32 data_size)
{
	struct btrfs_item_batch batch;

	batch.keys = key;
	batch.data_sizes = &data_size;
	batch.total_data_size = data_size;
	batch.nr = 1;

	setup_items_for_insert(trans, root, path, &batch);
}

/*
 * Given a key and some data, insert items into the tree.
 * This does all the path init required, making room in the tree if needed.
 *
 * Returns: 0 on success
 *          -EEXIST if the first key already exists
 *          < 0 on other errors
 */
int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root,
			     struct btrfs_path *path,
			     const struct btrfs_item_batch *batch)
{
	int ret = 0;
	int slot;
	u32 total_size;

	total_size = batch->total_data_size + (batch->nr * sizeof(struct btrfs_item));
	ret = btrfs_search_slot(trans, root, &batch->keys[0], path, total_size, 1);
	if (ret == 0)
		return -EEXIST;
	if (ret < 0)
		return ret;

	slot = path->slots[0];
	BUG_ON(slot < 0);

	setup_items_for_insert(trans, root, path, batch);
	return 0;
}
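
/*
 * Illustrative sketch (editor's addition): inserting two adjacent items in
 * one batch with btrfs_insert_empty_items(), then filling in their data.
 * The helper, keys and payloads are hypothetical; real callers fill in real
 * item structures.
 */
static int __maybe_unused example_insert_two_items(struct btrfs_trans_handle *trans,
						   struct btrfs_root *root,
						   const struct btrfs_key keys[2],
						   void *data[2], u32 sizes[2])
{
	struct btrfs_item_batch batch;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	int ret;
	int i;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	batch.keys = keys;
	batch.data_sizes = sizes;
	batch.total_data_size = sizes[0] + sizes[1];
	batch.nr = 2;

	/* Reserves room for both items and leaves the leaf write locked. */
	ret = btrfs_insert_empty_items(trans, root, path, &batch);
	if (ret)
		goto out;

	leaf = path->nodes[0];
	for (i = 0; i < 2; i++)
		write_extent_buffer(leaf, data[i],
				    btrfs_item_ptr_offset(leaf, path->slots[0] + i),
				    sizes[i]);
	btrfs_mark_buffer_dirty(trans, leaf);
out:
	btrfs_free_path(path);
	return ret;
}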

/*
 * Given a key and some data, insert an item into the tree.
 * This does all the path init required, making room in the tree if needed.
 */
int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		      const struct btrfs_key *cpu_key, void *data,
		      u32 data_size)
{
	int ret = 0;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	unsigned long ptr;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
	if (!ret) {
		leaf = path->nodes[0];
		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
		write_extent_buffer(leaf, data, ptr, data_size);
		btrfs_mark_buffer_dirty(trans, leaf);
	}
	btrfs_free_path(path);
	return ret;
}

/*
 * This function duplicates an item, giving 'new_key' to the new item.
 * It guarantees both items live in the same tree leaf and the new item is
 * contiguous with the original item.
 *
 * This allows us to split a file extent in place, keeping a lock on the leaf
 * the entire time.
 */
int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
			 struct btrfs_root *root,
			 struct btrfs_path *path,
			 const struct btrfs_key *new_key)
{
	struct extent_buffer *leaf;
	int ret;
	u32 item_size;

	leaf = path->nodes[0];
	item_size = btrfs_item_size(leaf, path->slots[0]);
	ret = setup_leaf_for_split(trans, root, path,
				   item_size + sizeof(struct btrfs_item));
	if (ret)
		return ret;

	path->slots[0]++;
	btrfs_setup_item_for_insert(trans, root, path, new_key, item_size);
	leaf = path->nodes[0];
	memcpy_extent_buffer(leaf,
			     btrfs_item_ptr_offset(leaf, path->slots[0]),
			     btrfs_item_ptr_offset(leaf, path->slots[0] - 1),
			     item_size);
	return 0;
}
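
/*
 * Illustrative sketch (editor's addition): duplicating a file extent item
 * with btrfs_duplicate_item() so the copy can then be trimmed to describe
 * the second half of a split extent. The helper and key layout are
 * hypothetical; note that the path points at the *new* item on return.
 */
static int __maybe_unused example_duplicate_extent_item(struct btrfs_trans_handle *trans,
							struct btrfs_root *root,
							const struct btrfs_key *key,
							u64 split_at)
{
	struct btrfs_path *path;
	struct btrfs_key new_key;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = btrfs_search_slot(trans, root, key, path, 0, 1);
	if (ret > 0)
		ret = -ENOENT;
	if (ret < 0)
		goto out;

	/* The duplicate covers the file range starting at @split_at. */
	new_key.objectid = key->objectid;
	new_key.type = BTRFS_EXTENT_DATA_KEY;
	new_key.offset = split_at;

	ret = btrfs_duplicate_item(trans, root, path, &new_key);
out:
	btrfs_free_path(path);
	return ret;
}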

/*
 * delete the pointer from a given node.
 *
 * the tree should have been previously balanced so the deletion does not
 * empty a node.
 *
 * This is exported for use inside btrfs-progs, don't un-export it.
 */
int btrfs_del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		  struct btrfs_path *path, int level, int slot)
{
	struct extent_buffer *parent = path->nodes[level];
	u32 nritems;
	int ret;

	nritems = btrfs_header_nritems(parent);
	if (slot != nritems - 1) {
		if (level) {
			ret = btrfs_tree_mod_log_insert_move(parent, slot,
					slot + 1, nritems - slot - 1);
			if (ret < 0) {
				btrfs_abort_transaction(trans, ret);
				return ret;
			}
		}
		memmove_extent_buffer(parent,
			      btrfs_node_key_ptr_offset(parent, slot),
			      btrfs_node_key_ptr_offset(parent, slot + 1),
			      sizeof(struct btrfs_key_ptr) *
			      (nritems - slot - 1));
	} else if (level) {
		ret = btrfs_tree_mod_log_insert_key(parent, slot,
						    BTRFS_MOD_LOG_KEY_REMOVE);
		if (ret < 0) {
			btrfs_abort_transaction(trans, ret);
			return ret;
		}
	}

	nritems--;
	btrfs_set_header_nritems(parent, nritems);
	if (nritems == 0 && parent == root->node) {
		BUG_ON(btrfs_header_level(root->node) != 1);
		/* just turn the root into a leaf and break */
		btrfs_set_header_level(root->node, 0);
	} else if (slot == 0) {
		struct btrfs_disk_key disk_key;

		btrfs_node_key(parent, &disk_key, 0);
		fixup_low_keys(trans, path, &disk_key, level + 1);
	}
	btrfs_mark_buffer_dirty(trans, parent);
	return 0;
}

/*
 * a helper function to delete the leaf pointed to by path->slots[1] and
 * path->nodes[1].
 *
 * This deletes the pointer in path->nodes[1] and frees the leaf
 * block extent. zero is returned if it all worked out, < 0 otherwise.
 *
 * The path must have already been setup for deleting the leaf, including
 * all the proper balancing. path->nodes[1] must be locked.
 */
static noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root,
				   struct btrfs_path *path,
				   struct extent_buffer *leaf)
{
	int ret;

	WARN_ON(btrfs_header_generation(leaf) != trans->transid);
	ret = btrfs_del_ptr(trans, root, path, 1, path->slots[1]);
	if (ret < 0)
		return ret;

	/*
	 * btrfs_free_extent is expensive, we want to make sure we
	 * aren't holding any locks when we call it
	 */
	btrfs_unlock_up_safe(path, 0);

	root_sub_used_bytes(root);

	atomic_inc(&leaf->refs);
	ret = btrfs_free_tree_block(trans, btrfs_root_id(root), leaf, 0, 1);
	free_extent_buffer_stale(leaf);
	if (ret < 0)
		btrfs_abort_transaction(trans, ret);

	return ret;
}

/*
 * delete the item at the leaf level in path. If that empties
 * the leaf, remove it from the tree
 */
int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		    struct btrfs_path *path, int slot, int nr)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *leaf;
	int ret = 0;
	int wret;
	u32 nritems;

	leaf = path->nodes[0];
	nritems = btrfs_header_nritems(leaf);

	if (slot + nr != nritems) {
		const u32 last_off = btrfs_item_offset(leaf, slot + nr - 1);
		const int data_end = leaf_data_end(leaf);
		struct btrfs_map_token token;
		u32 dsize = 0;
		int i;

		for (i = 0; i < nr; i++)
			dsize += btrfs_item_size(leaf, slot + i);

		memmove_leaf_data(leaf, data_end + dsize, data_end,
				  last_off - data_end);

		btrfs_init_map_token(&token, leaf);
		for (i = slot + nr; i < nritems; i++) {
			u32 ioff;

			ioff = btrfs_token_item_offset(&token, i);
			btrfs_set_token_item_offset(&token, i, ioff + dsize);
		}

		memmove_leaf_items(leaf, slot, slot + nr, nritems - slot - nr);
	}
	btrfs_set_header_nritems(leaf, nritems - nr);
	nritems -= nr;

	/* delete the leaf if we've emptied it */
	if (nritems == 0) {
		if (leaf == root->node) {
			btrfs_set_header_level(leaf, 0);
		} else {
			btrfs_clear_buffer_dirty(trans, leaf);
			ret = btrfs_del_leaf(trans, root, path, leaf);
			if (ret < 0)
				return ret;
		}
	} else {
		int used = leaf_space_used(leaf, 0, nritems);

		if (slot == 0) {
			struct btrfs_disk_key disk_key;

			btrfs_item_key(leaf, &disk_key, 0);
			fixup_low_keys(trans, path, &disk_key, 1);
		}

		/*
		 * Try to delete the leaf if it is mostly empty. We do this by
		 * trying to move all its items into its left and right neighbours.
		 * If we can't move all the items, then we don't delete it - it's
		 * not ideal, but future insertions might fill the leaf with more
		 * items, or items from other leaves might be moved later into our
		 * leaf due to deletions on those leaves.
		 */
		if (used < BTRFS_LEAF_DATA_SIZE(fs_info) / 3) {
			u32 min_push_space;

			/*
			 * push_leaf_left() fixes the path. Make sure the path
			 * still points to our leaf for a possible call to
			 * btrfs_del_ptr() below.
			 */
			slot = path->slots[1];
			atomic_inc(&leaf->refs);
			/*
			 * We want to be able to at least push one item to the
			 * left neighbour leaf, and that's the first item.
			 */
			min_push_space = sizeof(struct btrfs_item) +
					 btrfs_item_size(leaf, 0);
			wret = push_leaf_left(trans, root, path, 0,
					      min_push_space, 1, (u32)-1);
			if (wret < 0 && wret != -ENOSPC)
				ret = wret;

			if (path->nodes[0] == leaf &&
			    btrfs_header_nritems(leaf)) {
				/*
				 * If we were not able to push all items from our
				 * leaf to its left neighbour, then attempt to
				 * either push all the remaining items to the
				 * right neighbour or none. There's no advantage
				 * in pushing only some items, instead of all, as
				 * it's pointless to end up with a leaf having
				 * too few items while the neighbours can be full
				 * or nearly full.
				 */
				nritems = btrfs_header_nritems(leaf);
				min_push_space = leaf_space_used(leaf, 0, nritems);
				wret = push_leaf_right(trans, root, path, 0,
						       min_push_space, 1, 0);
				if (wret < 0 && wret != -ENOSPC)
					ret = wret;
			}

			if (btrfs_header_nritems(leaf) == 0) {
				path->slots[1] = slot;
				ret = btrfs_del_leaf(trans, root, path, leaf);
				if (ret < 0)
					return ret;
				free_extent_buffer(leaf);
				ret = 0;
			} else {
				/*
				 * If we're still in the path, make sure we're
				 * dirty. Otherwise, one of the push_leaf
				 * functions must have already dirtied this
				 * buffer.
				 */
				if (path->nodes[0] == leaf)
					btrfs_mark_buffer_dirty(trans, leaf);
				free_extent_buffer(leaf);
			}
		} else {
			btrfs_mark_buffer_dirty(trans, leaf);
		}
	}
	return ret;
}
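
/*
 * Illustrative sketch (editor's addition): locating an item and deleting it
 * with btrfs_del_items(). The helper is hypothetical. Searching with an
 * ins_len of -1 tells btrfs_search_slot() to pre-balance the path for a
 * deletion; btrfs_del_items() then reclaims the leaf if it empties out.
 */
static int __maybe_unused example_delete_one_item(struct btrfs_trans_handle *trans,
						  struct btrfs_root *root,
						  const struct btrfs_key *key)
{
	struct btrfs_path *path;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = btrfs_search_slot(trans, root, key, path, -1, 1);
	if (ret > 0)
		ret = -ENOENT;
	if (!ret)
		ret = btrfs_del_items(trans, root, path, path->slots[0], 1);

	btrfs_free_path(path);
	return ret;
}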

/*
 * A helper function to walk down the tree starting at min_key, and looking
 * for nodes or leaves that have a minimum transaction id.
 * This is used by the btree defrag code, and tree logging.
 *
 * This does not cow, but it does stuff the starting key it finds back
 * into min_key, so you can call btrfs_search_slot with cow=1 on the
 * key and get a writable path.
 *
 * This honors path->lowest_level to prevent descent past a given level
 * of the tree.
 *
 * min_trans indicates the oldest transaction that you are interested
 * in walking through. Any nodes or leaves older than min_trans are
 * skipped over (without reading them).
 *
 * returns zero if something useful was found, < 0 on error and 1 if there
 * was nothing in the tree that matched the search criteria.
 */
int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
			 struct btrfs_path *path,
			 u64 min_trans)
{
	struct extent_buffer *cur;
	struct btrfs_key found_key;
	int slot;
	int sret;
	u32 nritems;
	int level;
	int ret = 1;
	int keep_locks = path->keep_locks;

	ASSERT(!path->nowait);
	path->keep_locks = 1;
again:
	cur = btrfs_read_lock_root_node(root);
	level = btrfs_header_level(cur);
	WARN_ON(path->nodes[level]);
	path->nodes[level] = cur;
	path->locks[level] = BTRFS_READ_LOCK;

	if (btrfs_header_generation(cur) < min_trans) {
		ret = 1;
		goto out;
	}
	while (1) {
		nritems = btrfs_header_nritems(cur);
		level = btrfs_header_level(cur);
		sret = btrfs_bin_search(cur, 0, min_key, &slot);
		if (sret < 0) {
			ret = sret;
			goto out;
		}

		/* at the lowest level, we're done, setup the path and exit */
		if (level == path->lowest_level) {
			if (slot >= nritems)
				goto find_next_key;
			ret = 0;
			path->slots[level] = slot;
			btrfs_item_key_to_cpu(cur, &found_key, slot);
			goto out;
		}
		if (sret && slot > 0)
			slot--;
		/*
		 * check this node pointer against the min_trans parameter.
		 * If it is too old, skip to the next one.
		 */
		while (slot < nritems) {
			u64 gen;

			gen = btrfs_node_ptr_generation(cur, slot);
			if (gen < min_trans) {
				slot++;
				continue;
			}
			break;
		}
find_next_key:
		/*
		 * we didn't find a candidate key in this node, walk forward
		 * and find another one
		 */
		if (slot >= nritems) {
			path->slots[level] = slot;
			sret = btrfs_find_next_key(root, path, min_key, level,
						   min_trans);
			if (sret == 0) {
				btrfs_release_path(path);
				goto again;
			} else {
				goto out;
			}
		}
		/* save our key for returning back */
		btrfs_node_key_to_cpu(cur, &found_key, slot);
		path->slots[level] = slot;
		if (level == path->lowest_level) {
			ret = 0;
			goto out;
		}
		cur = btrfs_read_node_slot(cur, slot);
		if (IS_ERR(cur)) {
			ret = PTR_ERR(cur);
			goto out;
		}

		btrfs_tree_read_lock(cur);

		path->locks[level - 1] = BTRFS_READ_LOCK;
		path->nodes[level - 1] = cur;
		unlock_up(path, level, 1, 0, NULL);
	}
out:
	path->keep_locks = keep_locks;
	if (ret == 0) {
		btrfs_unlock_up_safe(path, path->lowest_level + 1);
		memcpy(min_key, &found_key, sizeof(found_key));
	}
	return ret;
}
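
/*
 * Illustrative sketch (editor's addition): scanning every leaf item changed
 * since @min_trans with btrfs_search_forward(). The helper and the per-item
 * callback are hypothetical. After each hit the search key is advanced to
 * the next possible key before searching again.
 */
static int __maybe_unused example_scan_newer_than(struct btrfs_root *root,
						  u64 min_trans,
						  void (*process)(const struct btrfs_key *key))
{
	struct btrfs_path *path;
	struct btrfs_key key = { 0 };
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	while (1) {
		ret = btrfs_search_forward(root, &key, path, min_trans);
		if (ret) {
			if (ret == 1)
				ret = 0;	/* ran off the end of the tree */
			break;
		}
		process(&key);
		btrfs_release_path(path);

		/* step to the smallest key strictly greater than @key */
		if (key.offset < (u64)-1) {
			key.offset++;
		} else if (key.type < (u8)-1) {
			key.offset = 0;
			key.type++;
		} else if (key.objectid < (u64)-1) {
			key.offset = 0;
			key.type = 0;
			key.objectid++;
		} else {
			break;	/* key space exhausted */
		}
	}

	btrfs_free_path(path);
	return ret;
}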

/*
 * this is similar to btrfs_next_leaf, but does not try to preserve
 * and fixup the path. It looks for and returns the next key in the
 * tree based on the current path and the min_trans parameter.
 *
 * 0 is returned if another key is found, < 0 if there are any errors
 * and 1 is returned if there are no higher keys in the tree
 *
 * path->keep_locks should be set to 1 on the search made before
 * calling this function.
 */
int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
			struct btrfs_key *key, int level, u64 min_trans)
{
	int slot;
	struct extent_buffer *c;

	WARN_ON(!path->keep_locks && !path->skip_locking);
	while (level < BTRFS_MAX_LEVEL) {
		if (!path->nodes[level])
			return 1;

		slot = path->slots[level] + 1;
		c = path->nodes[level];
next:
		if (slot >= btrfs_header_nritems(c)) {
			int ret;
			int orig_lowest;
			struct btrfs_key cur_key;

			if (level + 1 >= BTRFS_MAX_LEVEL ||
			    !path->nodes[level + 1])
				return 1;

			if (path->locks[level + 1] || path->skip_locking) {
				level++;
				continue;
			}

			slot = btrfs_header_nritems(c) - 1;
			if (level == 0)
				btrfs_item_key_to_cpu(c, &cur_key, slot);
			else
				btrfs_node_key_to_cpu(c, &cur_key, slot);

			orig_lowest = path->lowest_level;
			btrfs_release_path(path);
			path->lowest_level = level;
			ret = btrfs_search_slot(NULL, root, &cur_key, path,
						0, 0);
			path->lowest_level = orig_lowest;
			if (ret < 0)
				return ret;

			c = path->nodes[level];
			slot = path->slots[level];
			if (ret == 0)
				slot++;
			goto next;
		}

		if (level == 0)
			btrfs_item_key_to_cpu(c, key, slot);
		else {
			u64 gen = btrfs_node_ptr_generation(c, slot);

			if (gen < min_trans) {
				slot++;
				goto next;
			}
			btrfs_node_key_to_cpu(c, key, slot);
		}
		return 0;
	}
	return 1;
}
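
/*
 * Illustrative sketch (editor's addition): peeking at the key that follows
 * the current path position. The helper is hypothetical; keep_locks must be
 * set on the search that built the path, as the comment above requires.
 */
static int __maybe_unused example_peek_next_key(struct btrfs_root *root,
						const struct btrfs_key *pos,
						struct btrfs_key *next_key,
						u64 min_trans)
{
	struct btrfs_path *path;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	path->keep_locks = 1;
	ret = btrfs_search_slot(NULL, root, pos, path, 0, 0);
	if (ret < 0)
		goto out;

	/* level 0: report the next leaf key newer than @min_trans */
	ret = btrfs_find_next_key(root, path, next_key, 0, min_trans);
out:
	btrfs_free_path(path);
	return ret;
}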

int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
			u64 time_seq)
{
	int slot;
	int level;
	struct extent_buffer *c;
	struct extent_buffer *next;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_key key;
	bool need_commit_sem = false;
	u32 nritems;
	int ret;
	int i;

	/*
	 * The nowait semantics are used only for write paths, where we don't
	 * use the tree mod log and sequence numbers.
	 */
	if (time_seq)
		ASSERT(!path->nowait);

	nritems = btrfs_header_nritems(path->nodes[0]);
	if (nritems == 0)
		return 1;

	btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
again:
	level = 1;
	next = NULL;
	btrfs_release_path(path);

	path->keep_locks = 1;

	if (time_seq) {
		ret = btrfs_search_old_slot(root, &key, path, time_seq);
	} else {
		if (path->need_commit_sem) {
			path->need_commit_sem = 0;
			need_commit_sem = true;
			if (path->nowait) {
				if (!down_read_trylock(&fs_info->commit_root_sem)) {
					ret = -EAGAIN;
					goto done;
				}
			} else {
				down_read(&fs_info->commit_root_sem);
			}
		}
		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	}
	path->keep_locks = 0;

	if (ret < 0)
		goto done;

	nritems = btrfs_header_nritems(path->nodes[0]);
	/*
	 * by releasing the path above we dropped all our locks. A balance
	 * could have added more items next to the key that used to be
	 * at the very end of the block. So, check again here and
	 * advance the path if there are now more items available.
	 */
	if (nritems > 0 && path->slots[0] < nritems - 1) {
		if (ret == 0)
			path->slots[0]++;
		ret = 0;
		goto done;
	}
	/*
	 * So the above check misses one case:
	 * - after releasing the path above, someone has removed the item that
	 *   used to be at the very end of the block, and balance between leaves
	 *   gets another one with bigger key.offset to replace it.
	 *
	 * This one should be returned as well, or we can get leaf corruption
	 * later (esp. in __btrfs_drop_extents()).
	 *
	 * And a bit more explanation about this check:
	 * with ret > 0, the key isn't found, the path points to the slot
	 * where it should be inserted, so the path->slots[0] item must be the
	 * bigger one.
	 */
	if (nritems > 0 && ret > 0 && path->slots[0] == nritems - 1) {
		ret = 0;
		goto done;
	}

	while (level < BTRFS_MAX_LEVEL) {
		if (!path->nodes[level]) {
			ret = 1;
			goto done;
		}

		slot = path->slots[level] + 1;
		c = path->nodes[level];
		if (slot >= btrfs_header_nritems(c)) {
			level++;
			if (level == BTRFS_MAX_LEVEL) {
				ret = 1;
				goto done;
			}
			continue;
		}

		/*
		 * Our current level is where we're going to start from, and to
		 * make sure lockdep doesn't complain we need to drop our locks
		 * and nodes from 0 to our current level.
		 */
		for (i = 0; i < level; i++) {
			if (path->locks[i]) {
				btrfs_tree_read_unlock(path->nodes[i]);
				path->locks[i] = 0;
			}
			free_extent_buffer(path->nodes[i]);
			path->nodes[i] = NULL;
		}

		next = c;
		ret = read_block_for_search(root, path, &next, slot, &key);
		if (ret == -EAGAIN && !path->nowait)
			goto again;

		if (ret < 0) {
			btrfs_release_path(path);
			goto done;
		}

		if (!path->skip_locking) {
			ret = btrfs_try_tree_read_lock(next);
			if (!ret && path->nowait) {
				ret = -EAGAIN;
				goto done;
			}
			if (!ret && time_seq) {
				/*
				 * If we don't get the lock, we may be racing
				 * with push_leaf_left, holding that lock while
				 * itself waiting for the leaf we've currently
				 * locked. To solve this situation, we give up
				 * on our lock and cycle.
				 */
				free_extent_buffer(next);
				btrfs_release_path(path);
				cond_resched();
				goto again;
			}
			if (!ret)
				btrfs_tree_read_lock(next);
		}
		break;
	}
	path->slots[level] = slot;
	while (1) {
		level--;
		path->nodes[level] = next;
		path->slots[level] = 0;
		if (!path->skip_locking)
			path->locks[level] = BTRFS_READ_LOCK;
		if (!level)
			break;

		ret = read_block_for_search(root, path, &next, 0, &key);
		if (ret == -EAGAIN && !path->nowait)
			goto again;

		if (ret < 0) {
			btrfs_release_path(path);
			goto done;
		}

		if (!path->skip_locking) {
			if (path->nowait) {
				if (!btrfs_try_tree_read_lock(next)) {
					ret = -EAGAIN;
					goto done;
				}
			} else {
				btrfs_tree_read_lock(next);
			}
		}
	}
	ret = 0;
done:
	unlock_up(path, 0, 1, 0, NULL);
	if (need_commit_sem) {
		int ret2;

		path->need_commit_sem = 1;
		ret2 = finish_need_commit_sem_search(path);
		up_read(&fs_info->commit_root_sem);
		if (ret2)
			ret = ret2;
	}

	return ret;
}
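
/*
 * Illustrative sketch (editor's addition): iterating every item of a tree in
 * key order, moving across leaf boundaries with btrfs_next_old_leaf(). A
 * time_seq of 0 means "the current tree". The helper and callback are
 * hypothetical.
 */
static int __maybe_unused example_walk_all_items(struct btrfs_root *root,
						 void (*process)(struct extent_buffer *leaf, int slot))
{
	struct btrfs_path *path;
	struct btrfs_key key = { 0 };	/* start from the smallest key */
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto out;

	while (1) {
		struct extent_buffer *leaf = path->nodes[0];

		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
			ret = btrfs_next_old_leaf(root, path, 0);
			if (ret < 0)
				goto out;
			if (ret > 0)
				break;	/* no more leaves */
			continue;
		}
		process(leaf, path->slots[0]);
		path->slots[0]++;
	}
	ret = 0;
out:
	btrfs_free_path(path);
	return ret;
}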

int btrfs_next_old_item(struct btrfs_root *root, struct btrfs_path *path, u64 time_seq)
{
	path->slots[0]++;
	if (path->slots[0] >= btrfs_header_nritems(path->nodes[0]))
		return btrfs_next_old_leaf(root, path, time_seq);
	return 0;
}

/*
 * this uses btrfs_prev_leaf to walk backwards in the tree, and keeps
 * searching until it gets past min_objectid or finds an item of 'type'
 *
 * returns 0 if something is found, 1 if nothing was found and < 0 on error
 */
int btrfs_previous_item(struct btrfs_root *root,
			struct btrfs_path *path, u64 min_objectid,
			int type)
{
	struct btrfs_key found_key;
	struct extent_buffer *leaf;
	u32 nritems;
	int ret;

	while (1) {
		if (path->slots[0] == 0) {
			ret = btrfs_prev_leaf(root, path);
			if (ret != 0)
				return ret;
		} else {
			path->slots[0]--;
		}
		leaf = path->nodes[0];
		nritems = btrfs_header_nritems(leaf);
		if (nritems == 0)
			return 1;
		if (path->slots[0] == nritems)
			path->slots[0]--;

		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
		if (found_key.objectid < min_objectid)
			break;
		if (found_key.type == type)
			return 0;
		if (found_key.objectid == min_objectid &&
		    found_key.type < type)
			break;
	}
	return 1;
}

/*
 * search in extent tree to find a previous Metadata/Data extent item with
 * min objectid.
 *
 * returns 0 if something is found, 1 if nothing was found and < 0 on error
 */
int btrfs_previous_extent_item(struct btrfs_root *root,
			       struct btrfs_path *path, u64 min_objectid)
{
	struct btrfs_key found_key;
	struct extent_buffer *leaf;
	u32 nritems;
	int ret;

	while (1) {
		if (path->slots[0] == 0) {
			ret = btrfs_prev_leaf(root, path);
			if (ret != 0)
				return ret;
		} else {
			path->slots[0]--;
		}
		leaf = path->nodes[0];
		nritems = btrfs_header_nritems(leaf);
		if (nritems == 0)
			return 1;
		if (path->slots[0] == nritems)
			path->slots[0]--;

		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
		if (found_key.objectid < min_objectid)
			break;
		if (found_key.type == BTRFS_EXTENT_ITEM_KEY ||
		    found_key.type == BTRFS_METADATA_ITEM_KEY)
			return 0;
		if (found_key.objectid == min_objectid &&
		    found_key.type < BTRFS_EXTENT_ITEM_KEY)
			break;
	}
	return 1;
}
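
/*
 * Illustrative sketch (editor's addition): stepping backwards from a search
 * position to the closest preceding item of a given type with
 * btrfs_previous_item(). The helper is hypothetical; it searches for
 * (objectid, type, (u64)-1) first and then walks back.
 */
static int __maybe_unused example_find_previous_of_type(struct btrfs_root *root,
							u64 objectid, int type,
							struct btrfs_key *found)
{
	struct btrfs_path *path;
	struct btrfs_key key;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = objectid;
	key.type = type;
	key.offset = (u64)-1;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto out;

	ret = btrfs_previous_item(root, path, objectid, type);
	if (ret == 0)
		btrfs_item_key_to_cpu(path->nodes[0], found, path->slots[0]);
out:
	btrfs_free_path(path);
	return ret;
}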

int __init btrfs_ctree_init(void)
{
	btrfs_path_cachep = KMEM_CACHE(btrfs_path, 0);
	if (!btrfs_path_cachep)
		return -ENOMEM;
	return 0;
}

void __cold btrfs_ctree_exit(void)
{
	kmem_cache_destroy(btrfs_path_cachep);
}