1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright (C) 2007,2008 Oracle. All rights reserved. 4 */ 5 6 #include <linux/sched.h> 7 #include <linux/slab.h> 8 #include <linux/rbtree.h> 9 #include <linux/mm.h> 10 #include "ctree.h" 11 #include "disk-io.h" 12 #include "transaction.h" 13 #include "print-tree.h" 14 #include "locking.h" 15 #include "volumes.h" 16 #include "qgroup.h" 17 #include "tree-mod-log.h" 18 19 static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root 20 *root, struct btrfs_path *path, int level); 21 static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root *root, 22 const struct btrfs_key *ins_key, struct btrfs_path *path, 23 int data_size, int extend); 24 static int push_node_left(struct btrfs_trans_handle *trans, 25 struct extent_buffer *dst, 26 struct extent_buffer *src, int empty); 27 static int balance_node_right(struct btrfs_trans_handle *trans, 28 struct extent_buffer *dst_buf, 29 struct extent_buffer *src_buf); 30 static void del_ptr(struct btrfs_root *root, struct btrfs_path *path, 31 int level, int slot); 32 33 static const struct btrfs_csums { 34 u16 size; 35 const char name[10]; 36 const char driver[12]; 37 } btrfs_csums[] = { 38 [BTRFS_CSUM_TYPE_CRC32] = { .size = 4, .name = "crc32c" }, 39 [BTRFS_CSUM_TYPE_XXHASH] = { .size = 8, .name = "xxhash64" }, 40 [BTRFS_CSUM_TYPE_SHA256] = { .size = 32, .name = "sha256" }, 41 [BTRFS_CSUM_TYPE_BLAKE2] = { .size = 32, .name = "blake2b", 42 .driver = "blake2b-256" }, 43 }; 44 45 int btrfs_super_csum_size(const struct btrfs_super_block *s) 46 { 47 u16 t = btrfs_super_csum_type(s); 48 /* 49 * csum type is validated at mount time 50 */ 51 return btrfs_csums[t].size; 52 } 53 54 const char *btrfs_super_csum_name(u16 csum_type) 55 { 56 /* csum type is validated at mount time */ 57 return btrfs_csums[csum_type].name; 58 } 59 60 /* 61 * Return driver name if defined, otherwise the name that's also a valid driver 62 * name 63 */ 64 const char *btrfs_super_csum_driver(u16 csum_type) 65 { 66 /* csum type is validated at mount time */ 67 return btrfs_csums[csum_type].driver[0] ? 68 btrfs_csums[csum_type].driver : 69 btrfs_csums[csum_type].name; 70 } 71 72 size_t __attribute_const__ btrfs_get_num_csums(void) 73 { 74 return ARRAY_SIZE(btrfs_csums); 75 } 76 77 struct btrfs_path *btrfs_alloc_path(void) 78 { 79 return kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS); 80 } 81 82 /* this also releases the path */ 83 void btrfs_free_path(struct btrfs_path *p) 84 { 85 if (!p) 86 return; 87 btrfs_release_path(p); 88 kmem_cache_free(btrfs_path_cachep, p); 89 } 90 91 /* 92 * path release drops references on the extent buffers in the path 93 * and it drops any locks held by this path 94 * 95 * It is safe to call this on paths that no locks or extent buffers held. 96 */ 97 noinline void btrfs_release_path(struct btrfs_path *p) 98 { 99 int i; 100 101 for (i = 0; i < BTRFS_MAX_LEVEL; i++) { 102 p->slots[i] = 0; 103 if (!p->nodes[i]) 104 continue; 105 if (p->locks[i]) { 106 btrfs_tree_unlock_rw(p->nodes[i], p->locks[i]); 107 p->locks[i] = 0; 108 } 109 free_extent_buffer(p->nodes[i]); 110 p->nodes[i] = NULL; 111 } 112 } 113 114 /* 115 * safely gets a reference on the root node of a tree. A lock 116 * is not taken, so a concurrent writer may put a different node 117 * at the root of the tree. See btrfs_lock_root_node for the 118 * looping required. 119 * 120 * The extent buffer returned by this has a reference taken, so 121 * it won't disappear. It may stop being the root of the tree 122 * at any time because there are no locks held. 123 */ 124 struct extent_buffer *btrfs_root_node(struct btrfs_root *root) 125 { 126 struct extent_buffer *eb; 127 128 while (1) { 129 rcu_read_lock(); 130 eb = rcu_dereference(root->node); 131 132 /* 133 * RCU really hurts here, we could free up the root node because 134 * it was COWed but we may not get the new root node yet so do 135 * the inc_not_zero dance and if it doesn't work then 136 * synchronize_rcu and try again. 137 */ 138 if (atomic_inc_not_zero(&eb->refs)) { 139 rcu_read_unlock(); 140 break; 141 } 142 rcu_read_unlock(); 143 synchronize_rcu(); 144 } 145 return eb; 146 } 147 148 /* 149 * Cowonly root (not-shareable trees, everything not subvolume or reloc roots), 150 * just get put onto a simple dirty list. Transaction walks this list to make 151 * sure they get properly updated on disk. 152 */ 153 static void add_root_to_dirty_list(struct btrfs_root *root) 154 { 155 struct btrfs_fs_info *fs_info = root->fs_info; 156 157 if (test_bit(BTRFS_ROOT_DIRTY, &root->state) || 158 !test_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state)) 159 return; 160 161 spin_lock(&fs_info->trans_lock); 162 if (!test_and_set_bit(BTRFS_ROOT_DIRTY, &root->state)) { 163 /* Want the extent tree to be the last on the list */ 164 if (root->root_key.objectid == BTRFS_EXTENT_TREE_OBJECTID) 165 list_move_tail(&root->dirty_list, 166 &fs_info->dirty_cowonly_roots); 167 else 168 list_move(&root->dirty_list, 169 &fs_info->dirty_cowonly_roots); 170 } 171 spin_unlock(&fs_info->trans_lock); 172 } 173 174 /* 175 * used by snapshot creation to make a copy of a root for a tree with 176 * a given objectid. The buffer with the new root node is returned in 177 * cow_ret, and this func returns zero on success or a negative error code. 178 */ 179 int btrfs_copy_root(struct btrfs_trans_handle *trans, 180 struct btrfs_root *root, 181 struct extent_buffer *buf, 182 struct extent_buffer **cow_ret, u64 new_root_objectid) 183 { 184 struct btrfs_fs_info *fs_info = root->fs_info; 185 struct extent_buffer *cow; 186 int ret = 0; 187 int level; 188 struct btrfs_disk_key disk_key; 189 190 WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) && 191 trans->transid != fs_info->running_transaction->transid); 192 WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) && 193 trans->transid != root->last_trans); 194 195 level = btrfs_header_level(buf); 196 if (level == 0) 197 btrfs_item_key(buf, &disk_key, 0); 198 else 199 btrfs_node_key(buf, &disk_key, 0); 200 201 cow = btrfs_alloc_tree_block(trans, root, 0, new_root_objectid, 202 &disk_key, level, buf->start, 0, 203 BTRFS_NESTING_NEW_ROOT); 204 if (IS_ERR(cow)) 205 return PTR_ERR(cow); 206 207 copy_extent_buffer_full(cow, buf); 208 btrfs_set_header_bytenr(cow, cow->start); 209 btrfs_set_header_generation(cow, trans->transid); 210 btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV); 211 btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN | 212 BTRFS_HEADER_FLAG_RELOC); 213 if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID) 214 btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC); 215 else 216 btrfs_set_header_owner(cow, new_root_objectid); 217 218 write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid); 219 220 WARN_ON(btrfs_header_generation(buf) > trans->transid); 221 if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID) 222 ret = btrfs_inc_ref(trans, root, cow, 1); 223 else 224 ret = btrfs_inc_ref(trans, root, cow, 0); 225 if (ret) { 226 btrfs_tree_unlock(cow); 227 free_extent_buffer(cow); 228 btrfs_abort_transaction(trans, ret); 229 return ret; 230 } 231 232 btrfs_mark_buffer_dirty(cow); 233 *cow_ret = cow; 234 return 0; 235 } 236 237 /* 238 * check if the tree block can be shared by multiple trees 239 */ 240 int btrfs_block_can_be_shared(struct btrfs_root *root, 241 struct extent_buffer *buf) 242 { 243 /* 244 * Tree blocks not in shareable trees and tree roots are never shared. 245 * If a block was allocated after the last snapshot and the block was 246 * not allocated by tree relocation, we know the block is not shared. 247 */ 248 if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state) && 249 buf != root->node && buf != root->commit_root && 250 (btrfs_header_generation(buf) <= 251 btrfs_root_last_snapshot(&root->root_item) || 252 btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC))) 253 return 1; 254 255 return 0; 256 } 257 258 static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, 259 struct btrfs_root *root, 260 struct extent_buffer *buf, 261 struct extent_buffer *cow, 262 int *last_ref) 263 { 264 struct btrfs_fs_info *fs_info = root->fs_info; 265 u64 refs; 266 u64 owner; 267 u64 flags; 268 u64 new_flags = 0; 269 int ret; 270 271 /* 272 * Backrefs update rules: 273 * 274 * Always use full backrefs for extent pointers in tree block 275 * allocated by tree relocation. 276 * 277 * If a shared tree block is no longer referenced by its owner 278 * tree (btrfs_header_owner(buf) == root->root_key.objectid), 279 * use full backrefs for extent pointers in tree block. 280 * 281 * If a tree block is been relocating 282 * (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID), 283 * use full backrefs for extent pointers in tree block. 284 * The reason for this is some operations (such as drop tree) 285 * are only allowed for blocks use full backrefs. 286 */ 287 288 if (btrfs_block_can_be_shared(root, buf)) { 289 ret = btrfs_lookup_extent_info(trans, fs_info, buf->start, 290 btrfs_header_level(buf), 1, 291 &refs, &flags); 292 if (ret) 293 return ret; 294 if (refs == 0) { 295 ret = -EROFS; 296 btrfs_handle_fs_error(fs_info, ret, NULL); 297 return ret; 298 } 299 } else { 300 refs = 1; 301 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID || 302 btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV) 303 flags = BTRFS_BLOCK_FLAG_FULL_BACKREF; 304 else 305 flags = 0; 306 } 307 308 owner = btrfs_header_owner(buf); 309 BUG_ON(owner == BTRFS_TREE_RELOC_OBJECTID && 310 !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)); 311 312 if (refs > 1) { 313 if ((owner == root->root_key.objectid || 314 root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && 315 !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) { 316 ret = btrfs_inc_ref(trans, root, buf, 1); 317 if (ret) 318 return ret; 319 320 if (root->root_key.objectid == 321 BTRFS_TREE_RELOC_OBJECTID) { 322 ret = btrfs_dec_ref(trans, root, buf, 0); 323 if (ret) 324 return ret; 325 ret = btrfs_inc_ref(trans, root, cow, 1); 326 if (ret) 327 return ret; 328 } 329 new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF; 330 } else { 331 332 if (root->root_key.objectid == 333 BTRFS_TREE_RELOC_OBJECTID) 334 ret = btrfs_inc_ref(trans, root, cow, 1); 335 else 336 ret = btrfs_inc_ref(trans, root, cow, 0); 337 if (ret) 338 return ret; 339 } 340 if (new_flags != 0) { 341 int level = btrfs_header_level(buf); 342 343 ret = btrfs_set_disk_extent_flags(trans, buf, 344 new_flags, level, 0); 345 if (ret) 346 return ret; 347 } 348 } else { 349 if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) { 350 if (root->root_key.objectid == 351 BTRFS_TREE_RELOC_OBJECTID) 352 ret = btrfs_inc_ref(trans, root, cow, 1); 353 else 354 ret = btrfs_inc_ref(trans, root, cow, 0); 355 if (ret) 356 return ret; 357 ret = btrfs_dec_ref(trans, root, buf, 1); 358 if (ret) 359 return ret; 360 } 361 btrfs_clean_tree_block(buf); 362 *last_ref = 1; 363 } 364 return 0; 365 } 366 367 static struct extent_buffer *alloc_tree_block_no_bg_flush( 368 struct btrfs_trans_handle *trans, 369 struct btrfs_root *root, 370 u64 parent_start, 371 const struct btrfs_disk_key *disk_key, 372 int level, 373 u64 hint, 374 u64 empty_size, 375 enum btrfs_lock_nesting nest) 376 { 377 struct btrfs_fs_info *fs_info = root->fs_info; 378 struct extent_buffer *ret; 379 380 /* 381 * If we are COWing a node/leaf from the extent, chunk, device or free 382 * space trees, make sure that we do not finish block group creation of 383 * pending block groups. We do this to avoid a deadlock. 384 * COWing can result in allocation of a new chunk, and flushing pending 385 * block groups (btrfs_create_pending_block_groups()) can be triggered 386 * when finishing allocation of a new chunk. Creation of a pending block 387 * group modifies the extent, chunk, device and free space trees, 388 * therefore we could deadlock with ourselves since we are holding a 389 * lock on an extent buffer that btrfs_create_pending_block_groups() may 390 * try to COW later. 391 * For similar reasons, we also need to delay flushing pending block 392 * groups when splitting a leaf or node, from one of those trees, since 393 * we are holding a write lock on it and its parent or when inserting a 394 * new root node for one of those trees. 395 */ 396 if (root == fs_info->extent_root || 397 root == fs_info->chunk_root || 398 root == fs_info->dev_root || 399 root == fs_info->free_space_root) 400 trans->can_flush_pending_bgs = false; 401 402 ret = btrfs_alloc_tree_block(trans, root, parent_start, 403 root->root_key.objectid, disk_key, level, 404 hint, empty_size, nest); 405 trans->can_flush_pending_bgs = true; 406 407 return ret; 408 } 409 410 /* 411 * does the dirty work in cow of a single block. The parent block (if 412 * supplied) is updated to point to the new cow copy. The new buffer is marked 413 * dirty and returned locked. If you modify the block it needs to be marked 414 * dirty again. 415 * 416 * search_start -- an allocation hint for the new block 417 * 418 * empty_size -- a hint that you plan on doing more cow. This is the size in 419 * bytes the allocator should try to find free next to the block it returns. 420 * This is just a hint and may be ignored by the allocator. 421 */ 422 static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, 423 struct btrfs_root *root, 424 struct extent_buffer *buf, 425 struct extent_buffer *parent, int parent_slot, 426 struct extent_buffer **cow_ret, 427 u64 search_start, u64 empty_size, 428 enum btrfs_lock_nesting nest) 429 { 430 struct btrfs_fs_info *fs_info = root->fs_info; 431 struct btrfs_disk_key disk_key; 432 struct extent_buffer *cow; 433 int level, ret; 434 int last_ref = 0; 435 int unlock_orig = 0; 436 u64 parent_start = 0; 437 438 if (*cow_ret == buf) 439 unlock_orig = 1; 440 441 btrfs_assert_tree_locked(buf); 442 443 WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) && 444 trans->transid != fs_info->running_transaction->transid); 445 WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) && 446 trans->transid != root->last_trans); 447 448 level = btrfs_header_level(buf); 449 450 if (level == 0) 451 btrfs_item_key(buf, &disk_key, 0); 452 else 453 btrfs_node_key(buf, &disk_key, 0); 454 455 if ((root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && parent) 456 parent_start = parent->start; 457 458 cow = alloc_tree_block_no_bg_flush(trans, root, parent_start, &disk_key, 459 level, search_start, empty_size, nest); 460 if (IS_ERR(cow)) 461 return PTR_ERR(cow); 462 463 /* cow is set to blocking by btrfs_init_new_buffer */ 464 465 copy_extent_buffer_full(cow, buf); 466 btrfs_set_header_bytenr(cow, cow->start); 467 btrfs_set_header_generation(cow, trans->transid); 468 btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV); 469 btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN | 470 BTRFS_HEADER_FLAG_RELOC); 471 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) 472 btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC); 473 else 474 btrfs_set_header_owner(cow, root->root_key.objectid); 475 476 write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid); 477 478 ret = update_ref_for_cow(trans, root, buf, cow, &last_ref); 479 if (ret) { 480 btrfs_tree_unlock(cow); 481 free_extent_buffer(cow); 482 btrfs_abort_transaction(trans, ret); 483 return ret; 484 } 485 486 if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) { 487 ret = btrfs_reloc_cow_block(trans, root, buf, cow); 488 if (ret) { 489 btrfs_tree_unlock(cow); 490 free_extent_buffer(cow); 491 btrfs_abort_transaction(trans, ret); 492 return ret; 493 } 494 } 495 496 if (buf == root->node) { 497 WARN_ON(parent && parent != buf); 498 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID || 499 btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV) 500 parent_start = buf->start; 501 502 atomic_inc(&cow->refs); 503 ret = btrfs_tree_mod_log_insert_root(root->node, cow, true); 504 BUG_ON(ret < 0); 505 rcu_assign_pointer(root->node, cow); 506 507 btrfs_free_tree_block(trans, root, buf, parent_start, 508 last_ref); 509 free_extent_buffer(buf); 510 add_root_to_dirty_list(root); 511 } else { 512 WARN_ON(trans->transid != btrfs_header_generation(parent)); 513 btrfs_tree_mod_log_insert_key(parent, parent_slot, 514 BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS); 515 btrfs_set_node_blockptr(parent, parent_slot, 516 cow->start); 517 btrfs_set_node_ptr_generation(parent, parent_slot, 518 trans->transid); 519 btrfs_mark_buffer_dirty(parent); 520 if (last_ref) { 521 ret = btrfs_tree_mod_log_free_eb(buf); 522 if (ret) { 523 btrfs_tree_unlock(cow); 524 free_extent_buffer(cow); 525 btrfs_abort_transaction(trans, ret); 526 return ret; 527 } 528 } 529 btrfs_free_tree_block(trans, root, buf, parent_start, 530 last_ref); 531 } 532 if (unlock_orig) 533 btrfs_tree_unlock(buf); 534 free_extent_buffer_stale(buf); 535 btrfs_mark_buffer_dirty(cow); 536 *cow_ret = cow; 537 return 0; 538 } 539 540 static inline int should_cow_block(struct btrfs_trans_handle *trans, 541 struct btrfs_root *root, 542 struct extent_buffer *buf) 543 { 544 if (btrfs_is_testing(root->fs_info)) 545 return 0; 546 547 /* Ensure we can see the FORCE_COW bit */ 548 smp_mb__before_atomic(); 549 550 /* 551 * We do not need to cow a block if 552 * 1) this block is not created or changed in this transaction; 553 * 2) this block does not belong to TREE_RELOC tree; 554 * 3) the root is not forced COW. 555 * 556 * What is forced COW: 557 * when we create snapshot during committing the transaction, 558 * after we've finished copying src root, we must COW the shared 559 * block to ensure the metadata consistency. 560 */ 561 if (btrfs_header_generation(buf) == trans->transid && 562 !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) && 563 !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID && 564 btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) && 565 !test_bit(BTRFS_ROOT_FORCE_COW, &root->state)) 566 return 0; 567 return 1; 568 } 569 570 /* 571 * cows a single block, see __btrfs_cow_block for the real work. 572 * This version of it has extra checks so that a block isn't COWed more than 573 * once per transaction, as long as it hasn't been written yet 574 */ 575 noinline int btrfs_cow_block(struct btrfs_trans_handle *trans, 576 struct btrfs_root *root, struct extent_buffer *buf, 577 struct extent_buffer *parent, int parent_slot, 578 struct extent_buffer **cow_ret, 579 enum btrfs_lock_nesting nest) 580 { 581 struct btrfs_fs_info *fs_info = root->fs_info; 582 u64 search_start; 583 int ret; 584 585 if (test_bit(BTRFS_ROOT_DELETING, &root->state)) 586 btrfs_err(fs_info, 587 "COW'ing blocks on a fs root that's being dropped"); 588 589 if (trans->transaction != fs_info->running_transaction) 590 WARN(1, KERN_CRIT "trans %llu running %llu\n", 591 trans->transid, 592 fs_info->running_transaction->transid); 593 594 if (trans->transid != fs_info->generation) 595 WARN(1, KERN_CRIT "trans %llu running %llu\n", 596 trans->transid, fs_info->generation); 597 598 if (!should_cow_block(trans, root, buf)) { 599 trans->dirty = true; 600 *cow_ret = buf; 601 return 0; 602 } 603 604 search_start = buf->start & ~((u64)SZ_1G - 1); 605 606 /* 607 * Before CoWing this block for later modification, check if it's 608 * the subtree root and do the delayed subtree trace if needed. 609 * 610 * Also We don't care about the error, as it's handled internally. 611 */ 612 btrfs_qgroup_trace_subtree_after_cow(trans, root, buf); 613 ret = __btrfs_cow_block(trans, root, buf, parent, 614 parent_slot, cow_ret, search_start, 0, nest); 615 616 trace_btrfs_cow_block(root, buf, *cow_ret); 617 618 return ret; 619 } 620 ALLOW_ERROR_INJECTION(btrfs_cow_block, ERRNO); 621 622 /* 623 * helper function for defrag to decide if two blocks pointed to by a 624 * node are actually close by 625 */ 626 static int close_blocks(u64 blocknr, u64 other, u32 blocksize) 627 { 628 if (blocknr < other && other - (blocknr + blocksize) < 32768) 629 return 1; 630 if (blocknr > other && blocknr - (other + blocksize) < 32768) 631 return 1; 632 return 0; 633 } 634 635 #ifdef __LITTLE_ENDIAN 636 637 /* 638 * Compare two keys, on little-endian the disk order is same as CPU order and 639 * we can avoid the conversion. 640 */ 641 static int comp_keys(const struct btrfs_disk_key *disk_key, 642 const struct btrfs_key *k2) 643 { 644 const struct btrfs_key *k1 = (const struct btrfs_key *)disk_key; 645 646 return btrfs_comp_cpu_keys(k1, k2); 647 } 648 649 #else 650 651 /* 652 * compare two keys in a memcmp fashion 653 */ 654 static int comp_keys(const struct btrfs_disk_key *disk, 655 const struct btrfs_key *k2) 656 { 657 struct btrfs_key k1; 658 659 btrfs_disk_key_to_cpu(&k1, disk); 660 661 return btrfs_comp_cpu_keys(&k1, k2); 662 } 663 #endif 664 665 /* 666 * same as comp_keys only with two btrfs_key's 667 */ 668 int __pure btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2) 669 { 670 if (k1->objectid > k2->objectid) 671 return 1; 672 if (k1->objectid < k2->objectid) 673 return -1; 674 if (k1->type > k2->type) 675 return 1; 676 if (k1->type < k2->type) 677 return -1; 678 if (k1->offset > k2->offset) 679 return 1; 680 if (k1->offset < k2->offset) 681 return -1; 682 return 0; 683 } 684 685 /* 686 * this is used by the defrag code to go through all the 687 * leaves pointed to by a node and reallocate them so that 688 * disk order is close to key order 689 */ 690 int btrfs_realloc_node(struct btrfs_trans_handle *trans, 691 struct btrfs_root *root, struct extent_buffer *parent, 692 int start_slot, u64 *last_ret, 693 struct btrfs_key *progress) 694 { 695 struct btrfs_fs_info *fs_info = root->fs_info; 696 struct extent_buffer *cur; 697 u64 blocknr; 698 u64 search_start = *last_ret; 699 u64 last_block = 0; 700 u64 other; 701 u32 parent_nritems; 702 int end_slot; 703 int i; 704 int err = 0; 705 u32 blocksize; 706 int progress_passed = 0; 707 struct btrfs_disk_key disk_key; 708 709 WARN_ON(trans->transaction != fs_info->running_transaction); 710 WARN_ON(trans->transid != fs_info->generation); 711 712 parent_nritems = btrfs_header_nritems(parent); 713 blocksize = fs_info->nodesize; 714 end_slot = parent_nritems - 1; 715 716 if (parent_nritems <= 1) 717 return 0; 718 719 for (i = start_slot; i <= end_slot; i++) { 720 int close = 1; 721 722 btrfs_node_key(parent, &disk_key, i); 723 if (!progress_passed && comp_keys(&disk_key, progress) < 0) 724 continue; 725 726 progress_passed = 1; 727 blocknr = btrfs_node_blockptr(parent, i); 728 if (last_block == 0) 729 last_block = blocknr; 730 731 if (i > 0) { 732 other = btrfs_node_blockptr(parent, i - 1); 733 close = close_blocks(blocknr, other, blocksize); 734 } 735 if (!close && i < end_slot) { 736 other = btrfs_node_blockptr(parent, i + 1); 737 close = close_blocks(blocknr, other, blocksize); 738 } 739 if (close) { 740 last_block = blocknr; 741 continue; 742 } 743 744 cur = btrfs_read_node_slot(parent, i); 745 if (IS_ERR(cur)) 746 return PTR_ERR(cur); 747 if (search_start == 0) 748 search_start = last_block; 749 750 btrfs_tree_lock(cur); 751 err = __btrfs_cow_block(trans, root, cur, parent, i, 752 &cur, search_start, 753 min(16 * blocksize, 754 (end_slot - i) * blocksize), 755 BTRFS_NESTING_COW); 756 if (err) { 757 btrfs_tree_unlock(cur); 758 free_extent_buffer(cur); 759 break; 760 } 761 search_start = cur->start; 762 last_block = cur->start; 763 *last_ret = search_start; 764 btrfs_tree_unlock(cur); 765 free_extent_buffer(cur); 766 } 767 return err; 768 } 769 770 /* 771 * search for key in the extent_buffer. The items start at offset p, 772 * and they are item_size apart. There are 'max' items in p. 773 * 774 * the slot in the array is returned via slot, and it points to 775 * the place where you would insert key if it is not found in 776 * the array. 777 * 778 * slot may point to max if the key is bigger than all of the keys 779 */ 780 static noinline int generic_bin_search(struct extent_buffer *eb, 781 unsigned long p, int item_size, 782 const struct btrfs_key *key, 783 int max, int *slot) 784 { 785 int low = 0; 786 int high = max; 787 int ret; 788 const int key_size = sizeof(struct btrfs_disk_key); 789 790 if (low > high) { 791 btrfs_err(eb->fs_info, 792 "%s: low (%d) > high (%d) eb %llu owner %llu level %d", 793 __func__, low, high, eb->start, 794 btrfs_header_owner(eb), btrfs_header_level(eb)); 795 return -EINVAL; 796 } 797 798 while (low < high) { 799 unsigned long oip; 800 unsigned long offset; 801 struct btrfs_disk_key *tmp; 802 struct btrfs_disk_key unaligned; 803 int mid; 804 805 mid = (low + high) / 2; 806 offset = p + mid * item_size; 807 oip = offset_in_page(offset); 808 809 if (oip + key_size <= PAGE_SIZE) { 810 const unsigned long idx = get_eb_page_index(offset); 811 char *kaddr = page_address(eb->pages[idx]); 812 813 oip = get_eb_offset_in_page(eb, offset); 814 tmp = (struct btrfs_disk_key *)(kaddr + oip); 815 } else { 816 read_extent_buffer(eb, &unaligned, offset, key_size); 817 tmp = &unaligned; 818 } 819 820 ret = comp_keys(tmp, key); 821 822 if (ret < 0) 823 low = mid + 1; 824 else if (ret > 0) 825 high = mid; 826 else { 827 *slot = mid; 828 return 0; 829 } 830 } 831 *slot = low; 832 return 1; 833 } 834 835 /* 836 * simple bin_search frontend that does the right thing for 837 * leaves vs nodes 838 */ 839 int btrfs_bin_search(struct extent_buffer *eb, const struct btrfs_key *key, 840 int *slot) 841 { 842 if (btrfs_header_level(eb) == 0) 843 return generic_bin_search(eb, 844 offsetof(struct btrfs_leaf, items), 845 sizeof(struct btrfs_item), 846 key, btrfs_header_nritems(eb), 847 slot); 848 else 849 return generic_bin_search(eb, 850 offsetof(struct btrfs_node, ptrs), 851 sizeof(struct btrfs_key_ptr), 852 key, btrfs_header_nritems(eb), 853 slot); 854 } 855 856 static void root_add_used(struct btrfs_root *root, u32 size) 857 { 858 spin_lock(&root->accounting_lock); 859 btrfs_set_root_used(&root->root_item, 860 btrfs_root_used(&root->root_item) + size); 861 spin_unlock(&root->accounting_lock); 862 } 863 864 static void root_sub_used(struct btrfs_root *root, u32 size) 865 { 866 spin_lock(&root->accounting_lock); 867 btrfs_set_root_used(&root->root_item, 868 btrfs_root_used(&root->root_item) - size); 869 spin_unlock(&root->accounting_lock); 870 } 871 872 /* given a node and slot number, this reads the blocks it points to. The 873 * extent buffer is returned with a reference taken (but unlocked). 874 */ 875 struct extent_buffer *btrfs_read_node_slot(struct extent_buffer *parent, 876 int slot) 877 { 878 int level = btrfs_header_level(parent); 879 struct extent_buffer *eb; 880 struct btrfs_key first_key; 881 882 if (slot < 0 || slot >= btrfs_header_nritems(parent)) 883 return ERR_PTR(-ENOENT); 884 885 BUG_ON(level == 0); 886 887 btrfs_node_key_to_cpu(parent, &first_key, slot); 888 eb = read_tree_block(parent->fs_info, btrfs_node_blockptr(parent, slot), 889 btrfs_header_owner(parent), 890 btrfs_node_ptr_generation(parent, slot), 891 level - 1, &first_key); 892 if (!IS_ERR(eb) && !extent_buffer_uptodate(eb)) { 893 free_extent_buffer(eb); 894 eb = ERR_PTR(-EIO); 895 } 896 897 return eb; 898 } 899 900 /* 901 * node level balancing, used to make sure nodes are in proper order for 902 * item deletion. We balance from the top down, so we have to make sure 903 * that a deletion won't leave an node completely empty later on. 904 */ 905 static noinline int balance_level(struct btrfs_trans_handle *trans, 906 struct btrfs_root *root, 907 struct btrfs_path *path, int level) 908 { 909 struct btrfs_fs_info *fs_info = root->fs_info; 910 struct extent_buffer *right = NULL; 911 struct extent_buffer *mid; 912 struct extent_buffer *left = NULL; 913 struct extent_buffer *parent = NULL; 914 int ret = 0; 915 int wret; 916 int pslot; 917 int orig_slot = path->slots[level]; 918 u64 orig_ptr; 919 920 ASSERT(level > 0); 921 922 mid = path->nodes[level]; 923 924 WARN_ON(path->locks[level] != BTRFS_WRITE_LOCK); 925 WARN_ON(btrfs_header_generation(mid) != trans->transid); 926 927 orig_ptr = btrfs_node_blockptr(mid, orig_slot); 928 929 if (level < BTRFS_MAX_LEVEL - 1) { 930 parent = path->nodes[level + 1]; 931 pslot = path->slots[level + 1]; 932 } 933 934 /* 935 * deal with the case where there is only one pointer in the root 936 * by promoting the node below to a root 937 */ 938 if (!parent) { 939 struct extent_buffer *child; 940 941 if (btrfs_header_nritems(mid) != 1) 942 return 0; 943 944 /* promote the child to a root */ 945 child = btrfs_read_node_slot(mid, 0); 946 if (IS_ERR(child)) { 947 ret = PTR_ERR(child); 948 btrfs_handle_fs_error(fs_info, ret, NULL); 949 goto enospc; 950 } 951 952 btrfs_tree_lock(child); 953 ret = btrfs_cow_block(trans, root, child, mid, 0, &child, 954 BTRFS_NESTING_COW); 955 if (ret) { 956 btrfs_tree_unlock(child); 957 free_extent_buffer(child); 958 goto enospc; 959 } 960 961 ret = btrfs_tree_mod_log_insert_root(root->node, child, true); 962 BUG_ON(ret < 0); 963 rcu_assign_pointer(root->node, child); 964 965 add_root_to_dirty_list(root); 966 btrfs_tree_unlock(child); 967 968 path->locks[level] = 0; 969 path->nodes[level] = NULL; 970 btrfs_clean_tree_block(mid); 971 btrfs_tree_unlock(mid); 972 /* once for the path */ 973 free_extent_buffer(mid); 974 975 root_sub_used(root, mid->len); 976 btrfs_free_tree_block(trans, root, mid, 0, 1); 977 /* once for the root ptr */ 978 free_extent_buffer_stale(mid); 979 return 0; 980 } 981 if (btrfs_header_nritems(mid) > 982 BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 4) 983 return 0; 984 985 left = btrfs_read_node_slot(parent, pslot - 1); 986 if (IS_ERR(left)) 987 left = NULL; 988 989 if (left) { 990 __btrfs_tree_lock(left, BTRFS_NESTING_LEFT); 991 wret = btrfs_cow_block(trans, root, left, 992 parent, pslot - 1, &left, 993 BTRFS_NESTING_LEFT_COW); 994 if (wret) { 995 ret = wret; 996 goto enospc; 997 } 998 } 999 1000 right = btrfs_read_node_slot(parent, pslot + 1); 1001 if (IS_ERR(right)) 1002 right = NULL; 1003 1004 if (right) { 1005 __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT); 1006 wret = btrfs_cow_block(trans, root, right, 1007 parent, pslot + 1, &right, 1008 BTRFS_NESTING_RIGHT_COW); 1009 if (wret) { 1010 ret = wret; 1011 goto enospc; 1012 } 1013 } 1014 1015 /* first, try to make some room in the middle buffer */ 1016 if (left) { 1017 orig_slot += btrfs_header_nritems(left); 1018 wret = push_node_left(trans, left, mid, 1); 1019 if (wret < 0) 1020 ret = wret; 1021 } 1022 1023 /* 1024 * then try to empty the right most buffer into the middle 1025 */ 1026 if (right) { 1027 wret = push_node_left(trans, mid, right, 1); 1028 if (wret < 0 && wret != -ENOSPC) 1029 ret = wret; 1030 if (btrfs_header_nritems(right) == 0) { 1031 btrfs_clean_tree_block(right); 1032 btrfs_tree_unlock(right); 1033 del_ptr(root, path, level + 1, pslot + 1); 1034 root_sub_used(root, right->len); 1035 btrfs_free_tree_block(trans, root, right, 0, 1); 1036 free_extent_buffer_stale(right); 1037 right = NULL; 1038 } else { 1039 struct btrfs_disk_key right_key; 1040 btrfs_node_key(right, &right_key, 0); 1041 ret = btrfs_tree_mod_log_insert_key(parent, pslot + 1, 1042 BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS); 1043 BUG_ON(ret < 0); 1044 btrfs_set_node_key(parent, &right_key, pslot + 1); 1045 btrfs_mark_buffer_dirty(parent); 1046 } 1047 } 1048 if (btrfs_header_nritems(mid) == 1) { 1049 /* 1050 * we're not allowed to leave a node with one item in the 1051 * tree during a delete. A deletion from lower in the tree 1052 * could try to delete the only pointer in this node. 1053 * So, pull some keys from the left. 1054 * There has to be a left pointer at this point because 1055 * otherwise we would have pulled some pointers from the 1056 * right 1057 */ 1058 if (!left) { 1059 ret = -EROFS; 1060 btrfs_handle_fs_error(fs_info, ret, NULL); 1061 goto enospc; 1062 } 1063 wret = balance_node_right(trans, mid, left); 1064 if (wret < 0) { 1065 ret = wret; 1066 goto enospc; 1067 } 1068 if (wret == 1) { 1069 wret = push_node_left(trans, left, mid, 1); 1070 if (wret < 0) 1071 ret = wret; 1072 } 1073 BUG_ON(wret == 1); 1074 } 1075 if (btrfs_header_nritems(mid) == 0) { 1076 btrfs_clean_tree_block(mid); 1077 btrfs_tree_unlock(mid); 1078 del_ptr(root, path, level + 1, pslot); 1079 root_sub_used(root, mid->len); 1080 btrfs_free_tree_block(trans, root, mid, 0, 1); 1081 free_extent_buffer_stale(mid); 1082 mid = NULL; 1083 } else { 1084 /* update the parent key to reflect our changes */ 1085 struct btrfs_disk_key mid_key; 1086 btrfs_node_key(mid, &mid_key, 0); 1087 ret = btrfs_tree_mod_log_insert_key(parent, pslot, 1088 BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS); 1089 BUG_ON(ret < 0); 1090 btrfs_set_node_key(parent, &mid_key, pslot); 1091 btrfs_mark_buffer_dirty(parent); 1092 } 1093 1094 /* update the path */ 1095 if (left) { 1096 if (btrfs_header_nritems(left) > orig_slot) { 1097 atomic_inc(&left->refs); 1098 /* left was locked after cow */ 1099 path->nodes[level] = left; 1100 path->slots[level + 1] -= 1; 1101 path->slots[level] = orig_slot; 1102 if (mid) { 1103 btrfs_tree_unlock(mid); 1104 free_extent_buffer(mid); 1105 } 1106 } else { 1107 orig_slot -= btrfs_header_nritems(left); 1108 path->slots[level] = orig_slot; 1109 } 1110 } 1111 /* double check we haven't messed things up */ 1112 if (orig_ptr != 1113 btrfs_node_blockptr(path->nodes[level], path->slots[level])) 1114 BUG(); 1115 enospc: 1116 if (right) { 1117 btrfs_tree_unlock(right); 1118 free_extent_buffer(right); 1119 } 1120 if (left) { 1121 if (path->nodes[level] != left) 1122 btrfs_tree_unlock(left); 1123 free_extent_buffer(left); 1124 } 1125 return ret; 1126 } 1127 1128 /* Node balancing for insertion. Here we only split or push nodes around 1129 * when they are completely full. This is also done top down, so we 1130 * have to be pessimistic. 1131 */ 1132 static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, 1133 struct btrfs_root *root, 1134 struct btrfs_path *path, int level) 1135 { 1136 struct btrfs_fs_info *fs_info = root->fs_info; 1137 struct extent_buffer *right = NULL; 1138 struct extent_buffer *mid; 1139 struct extent_buffer *left = NULL; 1140 struct extent_buffer *parent = NULL; 1141 int ret = 0; 1142 int wret; 1143 int pslot; 1144 int orig_slot = path->slots[level]; 1145 1146 if (level == 0) 1147 return 1; 1148 1149 mid = path->nodes[level]; 1150 WARN_ON(btrfs_header_generation(mid) != trans->transid); 1151 1152 if (level < BTRFS_MAX_LEVEL - 1) { 1153 parent = path->nodes[level + 1]; 1154 pslot = path->slots[level + 1]; 1155 } 1156 1157 if (!parent) 1158 return 1; 1159 1160 left = btrfs_read_node_slot(parent, pslot - 1); 1161 if (IS_ERR(left)) 1162 left = NULL; 1163 1164 /* first, try to make some room in the middle buffer */ 1165 if (left) { 1166 u32 left_nr; 1167 1168 __btrfs_tree_lock(left, BTRFS_NESTING_LEFT); 1169 1170 left_nr = btrfs_header_nritems(left); 1171 if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) { 1172 wret = 1; 1173 } else { 1174 ret = btrfs_cow_block(trans, root, left, parent, 1175 pslot - 1, &left, 1176 BTRFS_NESTING_LEFT_COW); 1177 if (ret) 1178 wret = 1; 1179 else { 1180 wret = push_node_left(trans, left, mid, 0); 1181 } 1182 } 1183 if (wret < 0) 1184 ret = wret; 1185 if (wret == 0) { 1186 struct btrfs_disk_key disk_key; 1187 orig_slot += left_nr; 1188 btrfs_node_key(mid, &disk_key, 0); 1189 ret = btrfs_tree_mod_log_insert_key(parent, pslot, 1190 BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS); 1191 BUG_ON(ret < 0); 1192 btrfs_set_node_key(parent, &disk_key, pslot); 1193 btrfs_mark_buffer_dirty(parent); 1194 if (btrfs_header_nritems(left) > orig_slot) { 1195 path->nodes[level] = left; 1196 path->slots[level + 1] -= 1; 1197 path->slots[level] = orig_slot; 1198 btrfs_tree_unlock(mid); 1199 free_extent_buffer(mid); 1200 } else { 1201 orig_slot -= 1202 btrfs_header_nritems(left); 1203 path->slots[level] = orig_slot; 1204 btrfs_tree_unlock(left); 1205 free_extent_buffer(left); 1206 } 1207 return 0; 1208 } 1209 btrfs_tree_unlock(left); 1210 free_extent_buffer(left); 1211 } 1212 right = btrfs_read_node_slot(parent, pslot + 1); 1213 if (IS_ERR(right)) 1214 right = NULL; 1215 1216 /* 1217 * then try to empty the right most buffer into the middle 1218 */ 1219 if (right) { 1220 u32 right_nr; 1221 1222 __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT); 1223 1224 right_nr = btrfs_header_nritems(right); 1225 if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) { 1226 wret = 1; 1227 } else { 1228 ret = btrfs_cow_block(trans, root, right, 1229 parent, pslot + 1, 1230 &right, BTRFS_NESTING_RIGHT_COW); 1231 if (ret) 1232 wret = 1; 1233 else { 1234 wret = balance_node_right(trans, right, mid); 1235 } 1236 } 1237 if (wret < 0) 1238 ret = wret; 1239 if (wret == 0) { 1240 struct btrfs_disk_key disk_key; 1241 1242 btrfs_node_key(right, &disk_key, 0); 1243 ret = btrfs_tree_mod_log_insert_key(parent, pslot + 1, 1244 BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS); 1245 BUG_ON(ret < 0); 1246 btrfs_set_node_key(parent, &disk_key, pslot + 1); 1247 btrfs_mark_buffer_dirty(parent); 1248 1249 if (btrfs_header_nritems(mid) <= orig_slot) { 1250 path->nodes[level] = right; 1251 path->slots[level + 1] += 1; 1252 path->slots[level] = orig_slot - 1253 btrfs_header_nritems(mid); 1254 btrfs_tree_unlock(mid); 1255 free_extent_buffer(mid); 1256 } else { 1257 btrfs_tree_unlock(right); 1258 free_extent_buffer(right); 1259 } 1260 return 0; 1261 } 1262 btrfs_tree_unlock(right); 1263 free_extent_buffer(right); 1264 } 1265 return 1; 1266 } 1267 1268 /* 1269 * readahead one full node of leaves, finding things that are close 1270 * to the block in 'slot', and triggering ra on them. 1271 */ 1272 static void reada_for_search(struct btrfs_fs_info *fs_info, 1273 struct btrfs_path *path, 1274 int level, int slot, u64 objectid) 1275 { 1276 struct extent_buffer *node; 1277 struct btrfs_disk_key disk_key; 1278 u32 nritems; 1279 u64 search; 1280 u64 target; 1281 u64 nread = 0; 1282 u64 nread_max; 1283 struct extent_buffer *eb; 1284 u32 nr; 1285 u32 blocksize; 1286 u32 nscan = 0; 1287 1288 if (level != 1 && path->reada != READA_FORWARD_ALWAYS) 1289 return; 1290 1291 if (!path->nodes[level]) 1292 return; 1293 1294 node = path->nodes[level]; 1295 1296 /* 1297 * Since the time between visiting leaves is much shorter than the time 1298 * between visiting nodes, limit read ahead of nodes to 1, to avoid too 1299 * much IO at once (possibly random). 1300 */ 1301 if (path->reada == READA_FORWARD_ALWAYS) { 1302 if (level > 1) 1303 nread_max = node->fs_info->nodesize; 1304 else 1305 nread_max = SZ_128K; 1306 } else { 1307 nread_max = SZ_64K; 1308 } 1309 1310 search = btrfs_node_blockptr(node, slot); 1311 blocksize = fs_info->nodesize; 1312 eb = find_extent_buffer(fs_info, search); 1313 if (eb) { 1314 free_extent_buffer(eb); 1315 return; 1316 } 1317 1318 target = search; 1319 1320 nritems = btrfs_header_nritems(node); 1321 nr = slot; 1322 1323 while (1) { 1324 if (path->reada == READA_BACK) { 1325 if (nr == 0) 1326 break; 1327 nr--; 1328 } else if (path->reada == READA_FORWARD || 1329 path->reada == READA_FORWARD_ALWAYS) { 1330 nr++; 1331 if (nr >= nritems) 1332 break; 1333 } 1334 if (path->reada == READA_BACK && objectid) { 1335 btrfs_node_key(node, &disk_key, nr); 1336 if (btrfs_disk_key_objectid(&disk_key) != objectid) 1337 break; 1338 } 1339 search = btrfs_node_blockptr(node, nr); 1340 if (path->reada == READA_FORWARD_ALWAYS || 1341 (search <= target && target - search <= 65536) || 1342 (search > target && search - target <= 65536)) { 1343 btrfs_readahead_node_child(node, nr); 1344 nread += blocksize; 1345 } 1346 nscan++; 1347 if (nread > nread_max || nscan > 32) 1348 break; 1349 } 1350 } 1351 1352 static noinline void reada_for_balance(struct btrfs_path *path, int level) 1353 { 1354 struct extent_buffer *parent; 1355 int slot; 1356 int nritems; 1357 1358 parent = path->nodes[level + 1]; 1359 if (!parent) 1360 return; 1361 1362 nritems = btrfs_header_nritems(parent); 1363 slot = path->slots[level + 1]; 1364 1365 if (slot > 0) 1366 btrfs_readahead_node_child(parent, slot - 1); 1367 if (slot + 1 < nritems) 1368 btrfs_readahead_node_child(parent, slot + 1); 1369 } 1370 1371 1372 /* 1373 * when we walk down the tree, it is usually safe to unlock the higher layers 1374 * in the tree. The exceptions are when our path goes through slot 0, because 1375 * operations on the tree might require changing key pointers higher up in the 1376 * tree. 1377 * 1378 * callers might also have set path->keep_locks, which tells this code to keep 1379 * the lock if the path points to the last slot in the block. This is part of 1380 * walking through the tree, and selecting the next slot in the higher block. 1381 * 1382 * lowest_unlock sets the lowest level in the tree we're allowed to unlock. so 1383 * if lowest_unlock is 1, level 0 won't be unlocked 1384 */ 1385 static noinline void unlock_up(struct btrfs_path *path, int level, 1386 int lowest_unlock, int min_write_lock_level, 1387 int *write_lock_level) 1388 { 1389 int i; 1390 int skip_level = level; 1391 int no_skips = 0; 1392 struct extent_buffer *t; 1393 1394 for (i = level; i < BTRFS_MAX_LEVEL; i++) { 1395 if (!path->nodes[i]) 1396 break; 1397 if (!path->locks[i]) 1398 break; 1399 if (!no_skips && path->slots[i] == 0) { 1400 skip_level = i + 1; 1401 continue; 1402 } 1403 if (!no_skips && path->keep_locks) { 1404 u32 nritems; 1405 t = path->nodes[i]; 1406 nritems = btrfs_header_nritems(t); 1407 if (nritems < 1 || path->slots[i] >= nritems - 1) { 1408 skip_level = i + 1; 1409 continue; 1410 } 1411 } 1412 if (skip_level < i && i >= lowest_unlock) 1413 no_skips = 1; 1414 1415 t = path->nodes[i]; 1416 if (i >= lowest_unlock && i > skip_level) { 1417 btrfs_tree_unlock_rw(t, path->locks[i]); 1418 path->locks[i] = 0; 1419 if (write_lock_level && 1420 i > min_write_lock_level && 1421 i <= *write_lock_level) { 1422 *write_lock_level = i - 1; 1423 } 1424 } 1425 } 1426 } 1427 1428 /* 1429 * helper function for btrfs_search_slot. The goal is to find a block 1430 * in cache without setting the path to blocking. If we find the block 1431 * we return zero and the path is unchanged. 1432 * 1433 * If we can't find the block, we set the path blocking and do some 1434 * reada. -EAGAIN is returned and the search must be repeated. 1435 */ 1436 static int 1437 read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, 1438 struct extent_buffer **eb_ret, int level, int slot, 1439 const struct btrfs_key *key) 1440 { 1441 struct btrfs_fs_info *fs_info = root->fs_info; 1442 u64 blocknr; 1443 u64 gen; 1444 struct extent_buffer *tmp; 1445 struct btrfs_key first_key; 1446 int ret; 1447 int parent_level; 1448 1449 blocknr = btrfs_node_blockptr(*eb_ret, slot); 1450 gen = btrfs_node_ptr_generation(*eb_ret, slot); 1451 parent_level = btrfs_header_level(*eb_ret); 1452 btrfs_node_key_to_cpu(*eb_ret, &first_key, slot); 1453 1454 tmp = find_extent_buffer(fs_info, blocknr); 1455 if (tmp) { 1456 if (p->reada == READA_FORWARD_ALWAYS) 1457 reada_for_search(fs_info, p, level, slot, key->objectid); 1458 1459 /* first we do an atomic uptodate check */ 1460 if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) { 1461 /* 1462 * Do extra check for first_key, eb can be stale due to 1463 * being cached, read from scrub, or have multiple 1464 * parents (shared tree blocks). 1465 */ 1466 if (btrfs_verify_level_key(tmp, 1467 parent_level - 1, &first_key, gen)) { 1468 free_extent_buffer(tmp); 1469 return -EUCLEAN; 1470 } 1471 *eb_ret = tmp; 1472 return 0; 1473 } 1474 1475 /* now we're allowed to do a blocking uptodate check */ 1476 ret = btrfs_read_buffer(tmp, gen, parent_level - 1, &first_key); 1477 if (!ret) { 1478 *eb_ret = tmp; 1479 return 0; 1480 } 1481 free_extent_buffer(tmp); 1482 btrfs_release_path(p); 1483 return -EIO; 1484 } 1485 1486 /* 1487 * reduce lock contention at high levels 1488 * of the btree by dropping locks before 1489 * we read. Don't release the lock on the current 1490 * level because we need to walk this node to figure 1491 * out which blocks to read. 1492 */ 1493 btrfs_unlock_up_safe(p, level + 1); 1494 1495 if (p->reada != READA_NONE) 1496 reada_for_search(fs_info, p, level, slot, key->objectid); 1497 1498 ret = -EAGAIN; 1499 tmp = read_tree_block(fs_info, blocknr, root->root_key.objectid, 1500 gen, parent_level - 1, &first_key); 1501 if (!IS_ERR(tmp)) { 1502 /* 1503 * If the read above didn't mark this buffer up to date, 1504 * it will never end up being up to date. Set ret to EIO now 1505 * and give up so that our caller doesn't loop forever 1506 * on our EAGAINs. 1507 */ 1508 if (!extent_buffer_uptodate(tmp)) 1509 ret = -EIO; 1510 free_extent_buffer(tmp); 1511 } else { 1512 ret = PTR_ERR(tmp); 1513 } 1514 1515 btrfs_release_path(p); 1516 return ret; 1517 } 1518 1519 /* 1520 * helper function for btrfs_search_slot. This does all of the checks 1521 * for node-level blocks and does any balancing required based on 1522 * the ins_len. 1523 * 1524 * If no extra work was required, zero is returned. If we had to 1525 * drop the path, -EAGAIN is returned and btrfs_search_slot must 1526 * start over 1527 */ 1528 static int 1529 setup_nodes_for_search(struct btrfs_trans_handle *trans, 1530 struct btrfs_root *root, struct btrfs_path *p, 1531 struct extent_buffer *b, int level, int ins_len, 1532 int *write_lock_level) 1533 { 1534 struct btrfs_fs_info *fs_info = root->fs_info; 1535 int ret = 0; 1536 1537 if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >= 1538 BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3) { 1539 1540 if (*write_lock_level < level + 1) { 1541 *write_lock_level = level + 1; 1542 btrfs_release_path(p); 1543 return -EAGAIN; 1544 } 1545 1546 reada_for_balance(p, level); 1547 ret = split_node(trans, root, p, level); 1548 1549 b = p->nodes[level]; 1550 } else if (ins_len < 0 && btrfs_header_nritems(b) < 1551 BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 2) { 1552 1553 if (*write_lock_level < level + 1) { 1554 *write_lock_level = level + 1; 1555 btrfs_release_path(p); 1556 return -EAGAIN; 1557 } 1558 1559 reada_for_balance(p, level); 1560 ret = balance_level(trans, root, p, level); 1561 if (ret) 1562 return ret; 1563 1564 b = p->nodes[level]; 1565 if (!b) { 1566 btrfs_release_path(p); 1567 return -EAGAIN; 1568 } 1569 BUG_ON(btrfs_header_nritems(b) == 1); 1570 } 1571 return ret; 1572 } 1573 1574 int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *path, 1575 u64 iobjectid, u64 ioff, u8 key_type, 1576 struct btrfs_key *found_key) 1577 { 1578 int ret; 1579 struct btrfs_key key; 1580 struct extent_buffer *eb; 1581 1582 ASSERT(path); 1583 ASSERT(found_key); 1584 1585 key.type = key_type; 1586 key.objectid = iobjectid; 1587 key.offset = ioff; 1588 1589 ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0); 1590 if (ret < 0) 1591 return ret; 1592 1593 eb = path->nodes[0]; 1594 if (ret && path->slots[0] >= btrfs_header_nritems(eb)) { 1595 ret = btrfs_next_leaf(fs_root, path); 1596 if (ret) 1597 return ret; 1598 eb = path->nodes[0]; 1599 } 1600 1601 btrfs_item_key_to_cpu(eb, found_key, path->slots[0]); 1602 if (found_key->type != key.type || 1603 found_key->objectid != key.objectid) 1604 return 1; 1605 1606 return 0; 1607 } 1608 1609 static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root, 1610 struct btrfs_path *p, 1611 int write_lock_level) 1612 { 1613 struct btrfs_fs_info *fs_info = root->fs_info; 1614 struct extent_buffer *b; 1615 int root_lock; 1616 int level = 0; 1617 1618 /* We try very hard to do read locks on the root */ 1619 root_lock = BTRFS_READ_LOCK; 1620 1621 if (p->search_commit_root) { 1622 /* 1623 * The commit roots are read only so we always do read locks, 1624 * and we always must hold the commit_root_sem when doing 1625 * searches on them, the only exception is send where we don't 1626 * want to block transaction commits for a long time, so 1627 * we need to clone the commit root in order to avoid races 1628 * with transaction commits that create a snapshot of one of 1629 * the roots used by a send operation. 1630 */ 1631 if (p->need_commit_sem) { 1632 down_read(&fs_info->commit_root_sem); 1633 b = btrfs_clone_extent_buffer(root->commit_root); 1634 up_read(&fs_info->commit_root_sem); 1635 if (!b) 1636 return ERR_PTR(-ENOMEM); 1637 1638 } else { 1639 b = root->commit_root; 1640 atomic_inc(&b->refs); 1641 } 1642 level = btrfs_header_level(b); 1643 /* 1644 * Ensure that all callers have set skip_locking when 1645 * p->search_commit_root = 1. 1646 */ 1647 ASSERT(p->skip_locking == 1); 1648 1649 goto out; 1650 } 1651 1652 if (p->skip_locking) { 1653 b = btrfs_root_node(root); 1654 level = btrfs_header_level(b); 1655 goto out; 1656 } 1657 1658 /* 1659 * If the level is set to maximum, we can skip trying to get the read 1660 * lock. 1661 */ 1662 if (write_lock_level < BTRFS_MAX_LEVEL) { 1663 /* 1664 * We don't know the level of the root node until we actually 1665 * have it read locked 1666 */ 1667 b = btrfs_read_lock_root_node(root); 1668 level = btrfs_header_level(b); 1669 if (level > write_lock_level) 1670 goto out; 1671 1672 /* Whoops, must trade for write lock */ 1673 btrfs_tree_read_unlock(b); 1674 free_extent_buffer(b); 1675 } 1676 1677 b = btrfs_lock_root_node(root); 1678 root_lock = BTRFS_WRITE_LOCK; 1679 1680 /* The level might have changed, check again */ 1681 level = btrfs_header_level(b); 1682 1683 out: 1684 p->nodes[level] = b; 1685 if (!p->skip_locking) 1686 p->locks[level] = root_lock; 1687 /* 1688 * Callers are responsible for dropping b's references. 1689 */ 1690 return b; 1691 } 1692 1693 1694 /* 1695 * btrfs_search_slot - look for a key in a tree and perform necessary 1696 * modifications to preserve tree invariants. 1697 * 1698 * @trans: Handle of transaction, used when modifying the tree 1699 * @p: Holds all btree nodes along the search path 1700 * @root: The root node of the tree 1701 * @key: The key we are looking for 1702 * @ins_len: Indicates purpose of search: 1703 * >0 for inserts it's size of item inserted (*) 1704 * <0 for deletions 1705 * 0 for plain searches, not modifying the tree 1706 * 1707 * (*) If size of item inserted doesn't include 1708 * sizeof(struct btrfs_item), then p->search_for_extension must 1709 * be set. 1710 * @cow: boolean should CoW operations be performed. Must always be 1 1711 * when modifying the tree. 1712 * 1713 * If @ins_len > 0, nodes and leaves will be split as we walk down the tree. 1714 * If @ins_len < 0, nodes will be merged as we walk down the tree (if possible) 1715 * 1716 * If @key is found, 0 is returned and you can find the item in the leaf level 1717 * of the path (level 0) 1718 * 1719 * If @key isn't found, 1 is returned and the leaf level of the path (level 0) 1720 * points to the slot where it should be inserted 1721 * 1722 * If an error is encountered while searching the tree a negative error number 1723 * is returned 1724 */ 1725 int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root, 1726 const struct btrfs_key *key, struct btrfs_path *p, 1727 int ins_len, int cow) 1728 { 1729 struct extent_buffer *b; 1730 int slot; 1731 int ret; 1732 int err; 1733 int level; 1734 int lowest_unlock = 1; 1735 /* everything at write_lock_level or lower must be write locked */ 1736 int write_lock_level = 0; 1737 u8 lowest_level = 0; 1738 int min_write_lock_level; 1739 int prev_cmp; 1740 1741 lowest_level = p->lowest_level; 1742 WARN_ON(lowest_level && ins_len > 0); 1743 WARN_ON(p->nodes[0] != NULL); 1744 BUG_ON(!cow && ins_len); 1745 1746 if (ins_len < 0) { 1747 lowest_unlock = 2; 1748 1749 /* when we are removing items, we might have to go up to level 1750 * two as we update tree pointers Make sure we keep write 1751 * for those levels as well 1752 */ 1753 write_lock_level = 2; 1754 } else if (ins_len > 0) { 1755 /* 1756 * for inserting items, make sure we have a write lock on 1757 * level 1 so we can update keys 1758 */ 1759 write_lock_level = 1; 1760 } 1761 1762 if (!cow) 1763 write_lock_level = -1; 1764 1765 if (cow && (p->keep_locks || p->lowest_level)) 1766 write_lock_level = BTRFS_MAX_LEVEL; 1767 1768 min_write_lock_level = write_lock_level; 1769 1770 again: 1771 prev_cmp = -1; 1772 b = btrfs_search_slot_get_root(root, p, write_lock_level); 1773 if (IS_ERR(b)) { 1774 ret = PTR_ERR(b); 1775 goto done; 1776 } 1777 1778 while (b) { 1779 int dec = 0; 1780 1781 level = btrfs_header_level(b); 1782 1783 if (cow) { 1784 bool last_level = (level == (BTRFS_MAX_LEVEL - 1)); 1785 1786 /* 1787 * if we don't really need to cow this block 1788 * then we don't want to set the path blocking, 1789 * so we test it here 1790 */ 1791 if (!should_cow_block(trans, root, b)) { 1792 trans->dirty = true; 1793 goto cow_done; 1794 } 1795 1796 /* 1797 * must have write locks on this node and the 1798 * parent 1799 */ 1800 if (level > write_lock_level || 1801 (level + 1 > write_lock_level && 1802 level + 1 < BTRFS_MAX_LEVEL && 1803 p->nodes[level + 1])) { 1804 write_lock_level = level + 1; 1805 btrfs_release_path(p); 1806 goto again; 1807 } 1808 1809 if (last_level) 1810 err = btrfs_cow_block(trans, root, b, NULL, 0, 1811 &b, 1812 BTRFS_NESTING_COW); 1813 else 1814 err = btrfs_cow_block(trans, root, b, 1815 p->nodes[level + 1], 1816 p->slots[level + 1], &b, 1817 BTRFS_NESTING_COW); 1818 if (err) { 1819 ret = err; 1820 goto done; 1821 } 1822 } 1823 cow_done: 1824 p->nodes[level] = b; 1825 /* 1826 * Leave path with blocking locks to avoid massive 1827 * lock context switch, this is made on purpose. 1828 */ 1829 1830 /* 1831 * we have a lock on b and as long as we aren't changing 1832 * the tree, there is no way to for the items in b to change. 1833 * It is safe to drop the lock on our parent before we 1834 * go through the expensive btree search on b. 1835 * 1836 * If we're inserting or deleting (ins_len != 0), then we might 1837 * be changing slot zero, which may require changing the parent. 1838 * So, we can't drop the lock until after we know which slot 1839 * we're operating on. 1840 */ 1841 if (!ins_len && !p->keep_locks) { 1842 int u = level + 1; 1843 1844 if (u < BTRFS_MAX_LEVEL && p->locks[u]) { 1845 btrfs_tree_unlock_rw(p->nodes[u], p->locks[u]); 1846 p->locks[u] = 0; 1847 } 1848 } 1849 1850 /* 1851 * If btrfs_bin_search returns an exact match (prev_cmp == 0) 1852 * we can safely assume the target key will always be in slot 0 1853 * on lower levels due to the invariants BTRFS' btree provides, 1854 * namely that a btrfs_key_ptr entry always points to the 1855 * lowest key in the child node, thus we can skip searching 1856 * lower levels 1857 */ 1858 if (prev_cmp == 0) { 1859 slot = 0; 1860 ret = 0; 1861 } else { 1862 ret = btrfs_bin_search(b, key, &slot); 1863 prev_cmp = ret; 1864 if (ret < 0) 1865 goto done; 1866 } 1867 1868 if (level == 0) { 1869 p->slots[level] = slot; 1870 /* 1871 * Item key already exists. In this case, if we are 1872 * allowed to insert the item (for example, in dir_item 1873 * case, item key collision is allowed), it will be 1874 * merged with the original item. Only the item size 1875 * grows, no new btrfs item will be added. If 1876 * search_for_extension is not set, ins_len already 1877 * accounts the size btrfs_item, deduct it here so leaf 1878 * space check will be correct. 1879 */ 1880 if (ret == 0 && ins_len > 0 && !p->search_for_extension) { 1881 ASSERT(ins_len >= sizeof(struct btrfs_item)); 1882 ins_len -= sizeof(struct btrfs_item); 1883 } 1884 if (ins_len > 0 && 1885 btrfs_leaf_free_space(b) < ins_len) { 1886 if (write_lock_level < 1) { 1887 write_lock_level = 1; 1888 btrfs_release_path(p); 1889 goto again; 1890 } 1891 1892 err = split_leaf(trans, root, key, 1893 p, ins_len, ret == 0); 1894 1895 BUG_ON(err > 0); 1896 if (err) { 1897 ret = err; 1898 goto done; 1899 } 1900 } 1901 if (!p->search_for_split) 1902 unlock_up(p, level, lowest_unlock, 1903 min_write_lock_level, NULL); 1904 goto done; 1905 } 1906 if (ret && slot > 0) { 1907 dec = 1; 1908 slot--; 1909 } 1910 p->slots[level] = slot; 1911 err = setup_nodes_for_search(trans, root, p, b, level, ins_len, 1912 &write_lock_level); 1913 if (err == -EAGAIN) 1914 goto again; 1915 if (err) { 1916 ret = err; 1917 goto done; 1918 } 1919 b = p->nodes[level]; 1920 slot = p->slots[level]; 1921 1922 /* 1923 * Slot 0 is special, if we change the key we have to update 1924 * the parent pointer which means we must have a write lock on 1925 * the parent 1926 */ 1927 if (slot == 0 && ins_len && write_lock_level < level + 1) { 1928 write_lock_level = level + 1; 1929 btrfs_release_path(p); 1930 goto again; 1931 } 1932 1933 unlock_up(p, level, lowest_unlock, min_write_lock_level, 1934 &write_lock_level); 1935 1936 if (level == lowest_level) { 1937 if (dec) 1938 p->slots[level]++; 1939 goto done; 1940 } 1941 1942 err = read_block_for_search(root, p, &b, level, slot, key); 1943 if (err == -EAGAIN) 1944 goto again; 1945 if (err) { 1946 ret = err; 1947 goto done; 1948 } 1949 1950 if (!p->skip_locking) { 1951 level = btrfs_header_level(b); 1952 if (level <= write_lock_level) { 1953 btrfs_tree_lock(b); 1954 p->locks[level] = BTRFS_WRITE_LOCK; 1955 } else { 1956 btrfs_tree_read_lock(b); 1957 p->locks[level] = BTRFS_READ_LOCK; 1958 } 1959 p->nodes[level] = b; 1960 } 1961 } 1962 ret = 1; 1963 done: 1964 if (ret < 0 && !p->skip_release_on_error) 1965 btrfs_release_path(p); 1966 return ret; 1967 } 1968 ALLOW_ERROR_INJECTION(btrfs_search_slot, ERRNO); 1969 1970 /* 1971 * Like btrfs_search_slot, this looks for a key in the given tree. It uses the 1972 * current state of the tree together with the operations recorded in the tree 1973 * modification log to search for the key in a previous version of this tree, as 1974 * denoted by the time_seq parameter. 1975 * 1976 * Naturally, there is no support for insert, delete or cow operations. 1977 * 1978 * The resulting path and return value will be set up as if we called 1979 * btrfs_search_slot at that point in time with ins_len and cow both set to 0. 1980 */ 1981 int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key, 1982 struct btrfs_path *p, u64 time_seq) 1983 { 1984 struct btrfs_fs_info *fs_info = root->fs_info; 1985 struct extent_buffer *b; 1986 int slot; 1987 int ret; 1988 int err; 1989 int level; 1990 int lowest_unlock = 1; 1991 u8 lowest_level = 0; 1992 1993 lowest_level = p->lowest_level; 1994 WARN_ON(p->nodes[0] != NULL); 1995 1996 if (p->search_commit_root) { 1997 BUG_ON(time_seq); 1998 return btrfs_search_slot(NULL, root, key, p, 0, 0); 1999 } 2000 2001 again: 2002 b = btrfs_get_old_root(root, time_seq); 2003 if (!b) { 2004 ret = -EIO; 2005 goto done; 2006 } 2007 level = btrfs_header_level(b); 2008 p->locks[level] = BTRFS_READ_LOCK; 2009 2010 while (b) { 2011 int dec = 0; 2012 2013 level = btrfs_header_level(b); 2014 p->nodes[level] = b; 2015 2016 /* 2017 * we have a lock on b and as long as we aren't changing 2018 * the tree, there is no way to for the items in b to change. 2019 * It is safe to drop the lock on our parent before we 2020 * go through the expensive btree search on b. 2021 */ 2022 btrfs_unlock_up_safe(p, level + 1); 2023 2024 ret = btrfs_bin_search(b, key, &slot); 2025 if (ret < 0) 2026 goto done; 2027 2028 if (level == 0) { 2029 p->slots[level] = slot; 2030 unlock_up(p, level, lowest_unlock, 0, NULL); 2031 goto done; 2032 } 2033 2034 if (ret && slot > 0) { 2035 dec = 1; 2036 slot--; 2037 } 2038 p->slots[level] = slot; 2039 unlock_up(p, level, lowest_unlock, 0, NULL); 2040 2041 if (level == lowest_level) { 2042 if (dec) 2043 p->slots[level]++; 2044 goto done; 2045 } 2046 2047 err = read_block_for_search(root, p, &b, level, slot, key); 2048 if (err == -EAGAIN) 2049 goto again; 2050 if (err) { 2051 ret = err; 2052 goto done; 2053 } 2054 2055 level = btrfs_header_level(b); 2056 btrfs_tree_read_lock(b); 2057 b = btrfs_tree_mod_log_rewind(fs_info, p, b, time_seq); 2058 if (!b) { 2059 ret = -ENOMEM; 2060 goto done; 2061 } 2062 p->locks[level] = BTRFS_READ_LOCK; 2063 p->nodes[level] = b; 2064 } 2065 ret = 1; 2066 done: 2067 if (ret < 0) 2068 btrfs_release_path(p); 2069 2070 return ret; 2071 } 2072 2073 /* 2074 * helper to use instead of search slot if no exact match is needed but 2075 * instead the next or previous item should be returned. 2076 * When find_higher is true, the next higher item is returned, the next lower 2077 * otherwise. 2078 * When return_any and find_higher are both true, and no higher item is found, 2079 * return the next lower instead. 2080 * When return_any is true and find_higher is false, and no lower item is found, 2081 * return the next higher instead. 2082 * It returns 0 if any item is found, 1 if none is found (tree empty), and 2083 * < 0 on error 2084 */ 2085 int btrfs_search_slot_for_read(struct btrfs_root *root, 2086 const struct btrfs_key *key, 2087 struct btrfs_path *p, int find_higher, 2088 int return_any) 2089 { 2090 int ret; 2091 struct extent_buffer *leaf; 2092 2093 again: 2094 ret = btrfs_search_slot(NULL, root, key, p, 0, 0); 2095 if (ret <= 0) 2096 return ret; 2097 /* 2098 * a return value of 1 means the path is at the position where the 2099 * item should be inserted. Normally this is the next bigger item, 2100 * but in case the previous item is the last in a leaf, path points 2101 * to the first free slot in the previous leaf, i.e. at an invalid 2102 * item. 2103 */ 2104 leaf = p->nodes[0]; 2105 2106 if (find_higher) { 2107 if (p->slots[0] >= btrfs_header_nritems(leaf)) { 2108 ret = btrfs_next_leaf(root, p); 2109 if (ret <= 0) 2110 return ret; 2111 if (!return_any) 2112 return 1; 2113 /* 2114 * no higher item found, return the next 2115 * lower instead 2116 */ 2117 return_any = 0; 2118 find_higher = 0; 2119 btrfs_release_path(p); 2120 goto again; 2121 } 2122 } else { 2123 if (p->slots[0] == 0) { 2124 ret = btrfs_prev_leaf(root, p); 2125 if (ret < 0) 2126 return ret; 2127 if (!ret) { 2128 leaf = p->nodes[0]; 2129 if (p->slots[0] == btrfs_header_nritems(leaf)) 2130 p->slots[0]--; 2131 return 0; 2132 } 2133 if (!return_any) 2134 return 1; 2135 /* 2136 * no lower item found, return the next 2137 * higher instead 2138 */ 2139 return_any = 0; 2140 find_higher = 1; 2141 btrfs_release_path(p); 2142 goto again; 2143 } else { 2144 --p->slots[0]; 2145 } 2146 } 2147 return 0; 2148 } 2149 2150 /* 2151 * adjust the pointers going up the tree, starting at level 2152 * making sure the right key of each node is points to 'key'. 2153 * This is used after shifting pointers to the left, so it stops 2154 * fixing up pointers when a given leaf/node is not in slot 0 of the 2155 * higher levels 2156 * 2157 */ 2158 static void fixup_low_keys(struct btrfs_path *path, 2159 struct btrfs_disk_key *key, int level) 2160 { 2161 int i; 2162 struct extent_buffer *t; 2163 int ret; 2164 2165 for (i = level; i < BTRFS_MAX_LEVEL; i++) { 2166 int tslot = path->slots[i]; 2167 2168 if (!path->nodes[i]) 2169 break; 2170 t = path->nodes[i]; 2171 ret = btrfs_tree_mod_log_insert_key(t, tslot, 2172 BTRFS_MOD_LOG_KEY_REPLACE, GFP_ATOMIC); 2173 BUG_ON(ret < 0); 2174 btrfs_set_node_key(t, key, tslot); 2175 btrfs_mark_buffer_dirty(path->nodes[i]); 2176 if (tslot != 0) 2177 break; 2178 } 2179 } 2180 2181 /* 2182 * update item key. 2183 * 2184 * This function isn't completely safe. It's the caller's responsibility 2185 * that the new key won't break the order 2186 */ 2187 void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info, 2188 struct btrfs_path *path, 2189 const struct btrfs_key *new_key) 2190 { 2191 struct btrfs_disk_key disk_key; 2192 struct extent_buffer *eb; 2193 int slot; 2194 2195 eb = path->nodes[0]; 2196 slot = path->slots[0]; 2197 if (slot > 0) { 2198 btrfs_item_key(eb, &disk_key, slot - 1); 2199 if (unlikely(comp_keys(&disk_key, new_key) >= 0)) { 2200 btrfs_crit(fs_info, 2201 "slot %u key (%llu %u %llu) new key (%llu %u %llu)", 2202 slot, btrfs_disk_key_objectid(&disk_key), 2203 btrfs_disk_key_type(&disk_key), 2204 btrfs_disk_key_offset(&disk_key), 2205 new_key->objectid, new_key->type, 2206 new_key->offset); 2207 btrfs_print_leaf(eb); 2208 BUG(); 2209 } 2210 } 2211 if (slot < btrfs_header_nritems(eb) - 1) { 2212 btrfs_item_key(eb, &disk_key, slot + 1); 2213 if (unlikely(comp_keys(&disk_key, new_key) <= 0)) { 2214 btrfs_crit(fs_info, 2215 "slot %u key (%llu %u %llu) new key (%llu %u %llu)", 2216 slot, btrfs_disk_key_objectid(&disk_key), 2217 btrfs_disk_key_type(&disk_key), 2218 btrfs_disk_key_offset(&disk_key), 2219 new_key->objectid, new_key->type, 2220 new_key->offset); 2221 btrfs_print_leaf(eb); 2222 BUG(); 2223 } 2224 } 2225 2226 btrfs_cpu_key_to_disk(&disk_key, new_key); 2227 btrfs_set_item_key(eb, &disk_key, slot); 2228 btrfs_mark_buffer_dirty(eb); 2229 if (slot == 0) 2230 fixup_low_keys(path, &disk_key, 1); 2231 } 2232 2233 /* 2234 * Check key order of two sibling extent buffers. 2235 * 2236 * Return true if something is wrong. 2237 * Return false if everything is fine. 2238 * 2239 * Tree-checker only works inside one tree block, thus the following 2240 * corruption can not be detected by tree-checker: 2241 * 2242 * Leaf @left | Leaf @right 2243 * -------------------------------------------------------------- 2244 * | 1 | 2 | 3 | 4 | 5 | f6 | | 7 | 8 | 2245 * 2246 * Key f6 in leaf @left itself is valid, but not valid when the next 2247 * key in leaf @right is 7. 2248 * This can only be checked at tree block merge time. 2249 * And since tree checker has ensured all key order in each tree block 2250 * is correct, we only need to bother the last key of @left and the first 2251 * key of @right. 2252 */ 2253 static bool check_sibling_keys(struct extent_buffer *left, 2254 struct extent_buffer *right) 2255 { 2256 struct btrfs_key left_last; 2257 struct btrfs_key right_first; 2258 int level = btrfs_header_level(left); 2259 int nr_left = btrfs_header_nritems(left); 2260 int nr_right = btrfs_header_nritems(right); 2261 2262 /* No key to check in one of the tree blocks */ 2263 if (!nr_left || !nr_right) 2264 return false; 2265 2266 if (level) { 2267 btrfs_node_key_to_cpu(left, &left_last, nr_left - 1); 2268 btrfs_node_key_to_cpu(right, &right_first, 0); 2269 } else { 2270 btrfs_item_key_to_cpu(left, &left_last, nr_left - 1); 2271 btrfs_item_key_to_cpu(right, &right_first, 0); 2272 } 2273 2274 if (btrfs_comp_cpu_keys(&left_last, &right_first) >= 0) { 2275 btrfs_crit(left->fs_info, 2276 "bad key order, sibling blocks, left last (%llu %u %llu) right first (%llu %u %llu)", 2277 left_last.objectid, left_last.type, 2278 left_last.offset, right_first.objectid, 2279 right_first.type, right_first.offset); 2280 return true; 2281 } 2282 return false; 2283 } 2284 2285 /* 2286 * try to push data from one node into the next node left in the 2287 * tree. 2288 * 2289 * returns 0 if some ptrs were pushed left, < 0 if there was some horrible 2290 * error, and > 0 if there was no room in the left hand block. 2291 */ 2292 static int push_node_left(struct btrfs_trans_handle *trans, 2293 struct extent_buffer *dst, 2294 struct extent_buffer *src, int empty) 2295 { 2296 struct btrfs_fs_info *fs_info = trans->fs_info; 2297 int push_items = 0; 2298 int src_nritems; 2299 int dst_nritems; 2300 int ret = 0; 2301 2302 src_nritems = btrfs_header_nritems(src); 2303 dst_nritems = btrfs_header_nritems(dst); 2304 push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems; 2305 WARN_ON(btrfs_header_generation(src) != trans->transid); 2306 WARN_ON(btrfs_header_generation(dst) != trans->transid); 2307 2308 if (!empty && src_nritems <= 8) 2309 return 1; 2310 2311 if (push_items <= 0) 2312 return 1; 2313 2314 if (empty) { 2315 push_items = min(src_nritems, push_items); 2316 if (push_items < src_nritems) { 2317 /* leave at least 8 pointers in the node if 2318 * we aren't going to empty it 2319 */ 2320 if (src_nritems - push_items < 8) { 2321 if (push_items <= 8) 2322 return 1; 2323 push_items -= 8; 2324 } 2325 } 2326 } else 2327 push_items = min(src_nritems - 8, push_items); 2328 2329 /* dst is the left eb, src is the middle eb */ 2330 if (check_sibling_keys(dst, src)) { 2331 ret = -EUCLEAN; 2332 btrfs_abort_transaction(trans, ret); 2333 return ret; 2334 } 2335 ret = btrfs_tree_mod_log_eb_copy(dst, src, dst_nritems, 0, push_items); 2336 if (ret) { 2337 btrfs_abort_transaction(trans, ret); 2338 return ret; 2339 } 2340 copy_extent_buffer(dst, src, 2341 btrfs_node_key_ptr_offset(dst_nritems), 2342 btrfs_node_key_ptr_offset(0), 2343 push_items * sizeof(struct btrfs_key_ptr)); 2344 2345 if (push_items < src_nritems) { 2346 /* 2347 * Don't call btrfs_tree_mod_log_insert_move() here, key removal 2348 * was already fully logged by btrfs_tree_mod_log_eb_copy() above. 2349 */ 2350 memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0), 2351 btrfs_node_key_ptr_offset(push_items), 2352 (src_nritems - push_items) * 2353 sizeof(struct btrfs_key_ptr)); 2354 } 2355 btrfs_set_header_nritems(src, src_nritems - push_items); 2356 btrfs_set_header_nritems(dst, dst_nritems + push_items); 2357 btrfs_mark_buffer_dirty(src); 2358 btrfs_mark_buffer_dirty(dst); 2359 2360 return ret; 2361 } 2362 2363 /* 2364 * try to push data from one node into the next node right in the 2365 * tree. 2366 * 2367 * returns 0 if some ptrs were pushed, < 0 if there was some horrible 2368 * error, and > 0 if there was no room in the right hand block. 2369 * 2370 * this will only push up to 1/2 the contents of the left node over 2371 */ 2372 static int balance_node_right(struct btrfs_trans_handle *trans, 2373 struct extent_buffer *dst, 2374 struct extent_buffer *src) 2375 { 2376 struct btrfs_fs_info *fs_info = trans->fs_info; 2377 int push_items = 0; 2378 int max_push; 2379 int src_nritems; 2380 int dst_nritems; 2381 int ret = 0; 2382 2383 WARN_ON(btrfs_header_generation(src) != trans->transid); 2384 WARN_ON(btrfs_header_generation(dst) != trans->transid); 2385 2386 src_nritems = btrfs_header_nritems(src); 2387 dst_nritems = btrfs_header_nritems(dst); 2388 push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems; 2389 if (push_items <= 0) 2390 return 1; 2391 2392 if (src_nritems < 4) 2393 return 1; 2394 2395 max_push = src_nritems / 2 + 1; 2396 /* don't try to empty the node */ 2397 if (max_push >= src_nritems) 2398 return 1; 2399 2400 if (max_push < push_items) 2401 push_items = max_push; 2402 2403 /* dst is the right eb, src is the middle eb */ 2404 if (check_sibling_keys(src, dst)) { 2405 ret = -EUCLEAN; 2406 btrfs_abort_transaction(trans, ret); 2407 return ret; 2408 } 2409 ret = btrfs_tree_mod_log_insert_move(dst, push_items, 0, dst_nritems); 2410 BUG_ON(ret < 0); 2411 memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items), 2412 btrfs_node_key_ptr_offset(0), 2413 (dst_nritems) * 2414 sizeof(struct btrfs_key_ptr)); 2415 2416 ret = btrfs_tree_mod_log_eb_copy(dst, src, 0, src_nritems - push_items, 2417 push_items); 2418 if (ret) { 2419 btrfs_abort_transaction(trans, ret); 2420 return ret; 2421 } 2422 copy_extent_buffer(dst, src, 2423 btrfs_node_key_ptr_offset(0), 2424 btrfs_node_key_ptr_offset(src_nritems - push_items), 2425 push_items * sizeof(struct btrfs_key_ptr)); 2426 2427 btrfs_set_header_nritems(src, src_nritems - push_items); 2428 btrfs_set_header_nritems(dst, dst_nritems + push_items); 2429 2430 btrfs_mark_buffer_dirty(src); 2431 btrfs_mark_buffer_dirty(dst); 2432 2433 return ret; 2434 } 2435 2436 /* 2437 * helper function to insert a new root level in the tree. 2438 * A new node is allocated, and a single item is inserted to 2439 * point to the existing root 2440 * 2441 * returns zero on success or < 0 on failure. 2442 */ 2443 static noinline int insert_new_root(struct btrfs_trans_handle *trans, 2444 struct btrfs_root *root, 2445 struct btrfs_path *path, int level) 2446 { 2447 struct btrfs_fs_info *fs_info = root->fs_info; 2448 u64 lower_gen; 2449 struct extent_buffer *lower; 2450 struct extent_buffer *c; 2451 struct extent_buffer *old; 2452 struct btrfs_disk_key lower_key; 2453 int ret; 2454 2455 BUG_ON(path->nodes[level]); 2456 BUG_ON(path->nodes[level-1] != root->node); 2457 2458 lower = path->nodes[level-1]; 2459 if (level == 1) 2460 btrfs_item_key(lower, &lower_key, 0); 2461 else 2462 btrfs_node_key(lower, &lower_key, 0); 2463 2464 c = alloc_tree_block_no_bg_flush(trans, root, 0, &lower_key, level, 2465 root->node->start, 0, 2466 BTRFS_NESTING_NEW_ROOT); 2467 if (IS_ERR(c)) 2468 return PTR_ERR(c); 2469 2470 root_add_used(root, fs_info->nodesize); 2471 2472 btrfs_set_header_nritems(c, 1); 2473 btrfs_set_node_key(c, &lower_key, 0); 2474 btrfs_set_node_blockptr(c, 0, lower->start); 2475 lower_gen = btrfs_header_generation(lower); 2476 WARN_ON(lower_gen != trans->transid); 2477 2478 btrfs_set_node_ptr_generation(c, 0, lower_gen); 2479 2480 btrfs_mark_buffer_dirty(c); 2481 2482 old = root->node; 2483 ret = btrfs_tree_mod_log_insert_root(root->node, c, false); 2484 BUG_ON(ret < 0); 2485 rcu_assign_pointer(root->node, c); 2486 2487 /* the super has an extra ref to root->node */ 2488 free_extent_buffer(old); 2489 2490 add_root_to_dirty_list(root); 2491 atomic_inc(&c->refs); 2492 path->nodes[level] = c; 2493 path->locks[level] = BTRFS_WRITE_LOCK; 2494 path->slots[level] = 0; 2495 return 0; 2496 } 2497 2498 /* 2499 * worker function to insert a single pointer in a node. 2500 * the node should have enough room for the pointer already 2501 * 2502 * slot and level indicate where you want the key to go, and 2503 * blocknr is the block the key points to. 2504 */ 2505 static void insert_ptr(struct btrfs_trans_handle *trans, 2506 struct btrfs_path *path, 2507 struct btrfs_disk_key *key, u64 bytenr, 2508 int slot, int level) 2509 { 2510 struct extent_buffer *lower; 2511 int nritems; 2512 int ret; 2513 2514 BUG_ON(!path->nodes[level]); 2515 btrfs_assert_tree_locked(path->nodes[level]); 2516 lower = path->nodes[level]; 2517 nritems = btrfs_header_nritems(lower); 2518 BUG_ON(slot > nritems); 2519 BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(trans->fs_info)); 2520 if (slot != nritems) { 2521 if (level) { 2522 ret = btrfs_tree_mod_log_insert_move(lower, slot + 1, 2523 slot, nritems - slot); 2524 BUG_ON(ret < 0); 2525 } 2526 memmove_extent_buffer(lower, 2527 btrfs_node_key_ptr_offset(slot + 1), 2528 btrfs_node_key_ptr_offset(slot), 2529 (nritems - slot) * sizeof(struct btrfs_key_ptr)); 2530 } 2531 if (level) { 2532 ret = btrfs_tree_mod_log_insert_key(lower, slot, 2533 BTRFS_MOD_LOG_KEY_ADD, GFP_NOFS); 2534 BUG_ON(ret < 0); 2535 } 2536 btrfs_set_node_key(lower, key, slot); 2537 btrfs_set_node_blockptr(lower, slot, bytenr); 2538 WARN_ON(trans->transid == 0); 2539 btrfs_set_node_ptr_generation(lower, slot, trans->transid); 2540 btrfs_set_header_nritems(lower, nritems + 1); 2541 btrfs_mark_buffer_dirty(lower); 2542 } 2543 2544 /* 2545 * split the node at the specified level in path in two. 2546 * The path is corrected to point to the appropriate node after the split 2547 * 2548 * Before splitting this tries to make some room in the node by pushing 2549 * left and right, if either one works, it returns right away. 2550 * 2551 * returns 0 on success and < 0 on failure 2552 */ 2553 static noinline int split_node(struct btrfs_trans_handle *trans, 2554 struct btrfs_root *root, 2555 struct btrfs_path *path, int level) 2556 { 2557 struct btrfs_fs_info *fs_info = root->fs_info; 2558 struct extent_buffer *c; 2559 struct extent_buffer *split; 2560 struct btrfs_disk_key disk_key; 2561 int mid; 2562 int ret; 2563 u32 c_nritems; 2564 2565 c = path->nodes[level]; 2566 WARN_ON(btrfs_header_generation(c) != trans->transid); 2567 if (c == root->node) { 2568 /* 2569 * trying to split the root, lets make a new one 2570 * 2571 * tree mod log: We don't log_removal old root in 2572 * insert_new_root, because that root buffer will be kept as a 2573 * normal node. We are going to log removal of half of the 2574 * elements below with btrfs_tree_mod_log_eb_copy(). We're 2575 * holding a tree lock on the buffer, which is why we cannot 2576 * race with other tree_mod_log users. 2577 */ 2578 ret = insert_new_root(trans, root, path, level + 1); 2579 if (ret) 2580 return ret; 2581 } else { 2582 ret = push_nodes_for_insert(trans, root, path, level); 2583 c = path->nodes[level]; 2584 if (!ret && btrfs_header_nritems(c) < 2585 BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3) 2586 return 0; 2587 if (ret < 0) 2588 return ret; 2589 } 2590 2591 c_nritems = btrfs_header_nritems(c); 2592 mid = (c_nritems + 1) / 2; 2593 btrfs_node_key(c, &disk_key, mid); 2594 2595 split = alloc_tree_block_no_bg_flush(trans, root, 0, &disk_key, level, 2596 c->start, 0, BTRFS_NESTING_SPLIT); 2597 if (IS_ERR(split)) 2598 return PTR_ERR(split); 2599 2600 root_add_used(root, fs_info->nodesize); 2601 ASSERT(btrfs_header_level(c) == level); 2602 2603 ret = btrfs_tree_mod_log_eb_copy(split, c, 0, mid, c_nritems - mid); 2604 if (ret) { 2605 btrfs_abort_transaction(trans, ret); 2606 return ret; 2607 } 2608 copy_extent_buffer(split, c, 2609 btrfs_node_key_ptr_offset(0), 2610 btrfs_node_key_ptr_offset(mid), 2611 (c_nritems - mid) * sizeof(struct btrfs_key_ptr)); 2612 btrfs_set_header_nritems(split, c_nritems - mid); 2613 btrfs_set_header_nritems(c, mid); 2614 2615 btrfs_mark_buffer_dirty(c); 2616 btrfs_mark_buffer_dirty(split); 2617 2618 insert_ptr(trans, path, &disk_key, split->start, 2619 path->slots[level + 1] + 1, level + 1); 2620 2621 if (path->slots[level] >= mid) { 2622 path->slots[level] -= mid; 2623 btrfs_tree_unlock(c); 2624 free_extent_buffer(c); 2625 path->nodes[level] = split; 2626 path->slots[level + 1] += 1; 2627 } else { 2628 btrfs_tree_unlock(split); 2629 free_extent_buffer(split); 2630 } 2631 return 0; 2632 } 2633 2634 /* 2635 * how many bytes are required to store the items in a leaf. start 2636 * and nr indicate which items in the leaf to check. This totals up the 2637 * space used both by the item structs and the item data 2638 */ 2639 static int leaf_space_used(struct extent_buffer *l, int start, int nr) 2640 { 2641 struct btrfs_item *start_item; 2642 struct btrfs_item *end_item; 2643 int data_len; 2644 int nritems = btrfs_header_nritems(l); 2645 int end = min(nritems, start + nr) - 1; 2646 2647 if (!nr) 2648 return 0; 2649 start_item = btrfs_item_nr(start); 2650 end_item = btrfs_item_nr(end); 2651 data_len = btrfs_item_offset(l, start_item) + 2652 btrfs_item_size(l, start_item); 2653 data_len = data_len - btrfs_item_offset(l, end_item); 2654 data_len += sizeof(struct btrfs_item) * nr; 2655 WARN_ON(data_len < 0); 2656 return data_len; 2657 } 2658 2659 /* 2660 * The space between the end of the leaf items and 2661 * the start of the leaf data. IOW, how much room 2662 * the leaf has left for both items and data 2663 */ 2664 noinline int btrfs_leaf_free_space(struct extent_buffer *leaf) 2665 { 2666 struct btrfs_fs_info *fs_info = leaf->fs_info; 2667 int nritems = btrfs_header_nritems(leaf); 2668 int ret; 2669 2670 ret = BTRFS_LEAF_DATA_SIZE(fs_info) - leaf_space_used(leaf, 0, nritems); 2671 if (ret < 0) { 2672 btrfs_crit(fs_info, 2673 "leaf free space ret %d, leaf data size %lu, used %d nritems %d", 2674 ret, 2675 (unsigned long) BTRFS_LEAF_DATA_SIZE(fs_info), 2676 leaf_space_used(leaf, 0, nritems), nritems); 2677 } 2678 return ret; 2679 } 2680 2681 /* 2682 * min slot controls the lowest index we're willing to push to the 2683 * right. We'll push up to and including min_slot, but no lower 2684 */ 2685 static noinline int __push_leaf_right(struct btrfs_path *path, 2686 int data_size, int empty, 2687 struct extent_buffer *right, 2688 int free_space, u32 left_nritems, 2689 u32 min_slot) 2690 { 2691 struct btrfs_fs_info *fs_info = right->fs_info; 2692 struct extent_buffer *left = path->nodes[0]; 2693 struct extent_buffer *upper = path->nodes[1]; 2694 struct btrfs_map_token token; 2695 struct btrfs_disk_key disk_key; 2696 int slot; 2697 u32 i; 2698 int push_space = 0; 2699 int push_items = 0; 2700 struct btrfs_item *item; 2701 u32 nr; 2702 u32 right_nritems; 2703 u32 data_end; 2704 u32 this_item_size; 2705 2706 if (empty) 2707 nr = 0; 2708 else 2709 nr = max_t(u32, 1, min_slot); 2710 2711 if (path->slots[0] >= left_nritems) 2712 push_space += data_size; 2713 2714 slot = path->slots[1]; 2715 i = left_nritems - 1; 2716 while (i >= nr) { 2717 item = btrfs_item_nr(i); 2718 2719 if (!empty && push_items > 0) { 2720 if (path->slots[0] > i) 2721 break; 2722 if (path->slots[0] == i) { 2723 int space = btrfs_leaf_free_space(left); 2724 2725 if (space + push_space * 2 > free_space) 2726 break; 2727 } 2728 } 2729 2730 if (path->slots[0] == i) 2731 push_space += data_size; 2732 2733 this_item_size = btrfs_item_size(left, item); 2734 if (this_item_size + sizeof(*item) + push_space > free_space) 2735 break; 2736 2737 push_items++; 2738 push_space += this_item_size + sizeof(*item); 2739 if (i == 0) 2740 break; 2741 i--; 2742 } 2743 2744 if (push_items == 0) 2745 goto out_unlock; 2746 2747 WARN_ON(!empty && push_items == left_nritems); 2748 2749 /* push left to right */ 2750 right_nritems = btrfs_header_nritems(right); 2751 2752 push_space = btrfs_item_end_nr(left, left_nritems - push_items); 2753 push_space -= leaf_data_end(left); 2754 2755 /* make room in the right data area */ 2756 data_end = leaf_data_end(right); 2757 memmove_extent_buffer(right, 2758 BTRFS_LEAF_DATA_OFFSET + data_end - push_space, 2759 BTRFS_LEAF_DATA_OFFSET + data_end, 2760 BTRFS_LEAF_DATA_SIZE(fs_info) - data_end); 2761 2762 /* copy from the left data area */ 2763 copy_extent_buffer(right, left, BTRFS_LEAF_DATA_OFFSET + 2764 BTRFS_LEAF_DATA_SIZE(fs_info) - push_space, 2765 BTRFS_LEAF_DATA_OFFSET + leaf_data_end(left), 2766 push_space); 2767 2768 memmove_extent_buffer(right, btrfs_item_nr_offset(push_items), 2769 btrfs_item_nr_offset(0), 2770 right_nritems * sizeof(struct btrfs_item)); 2771 2772 /* copy the items from left to right */ 2773 copy_extent_buffer(right, left, btrfs_item_nr_offset(0), 2774 btrfs_item_nr_offset(left_nritems - push_items), 2775 push_items * sizeof(struct btrfs_item)); 2776 2777 /* update the item pointers */ 2778 btrfs_init_map_token(&token, right); 2779 right_nritems += push_items; 2780 btrfs_set_header_nritems(right, right_nritems); 2781 push_space = BTRFS_LEAF_DATA_SIZE(fs_info); 2782 for (i = 0; i < right_nritems; i++) { 2783 item = btrfs_item_nr(i); 2784 push_space -= btrfs_token_item_size(&token, item); 2785 btrfs_set_token_item_offset(&token, item, push_space); 2786 } 2787 2788 left_nritems -= push_items; 2789 btrfs_set_header_nritems(left, left_nritems); 2790 2791 if (left_nritems) 2792 btrfs_mark_buffer_dirty(left); 2793 else 2794 btrfs_clean_tree_block(left); 2795 2796 btrfs_mark_buffer_dirty(right); 2797 2798 btrfs_item_key(right, &disk_key, 0); 2799 btrfs_set_node_key(upper, &disk_key, slot + 1); 2800 btrfs_mark_buffer_dirty(upper); 2801 2802 /* then fixup the leaf pointer in the path */ 2803 if (path->slots[0] >= left_nritems) { 2804 path->slots[0] -= left_nritems; 2805 if (btrfs_header_nritems(path->nodes[0]) == 0) 2806 btrfs_clean_tree_block(path->nodes[0]); 2807 btrfs_tree_unlock(path->nodes[0]); 2808 free_extent_buffer(path->nodes[0]); 2809 path->nodes[0] = right; 2810 path->slots[1] += 1; 2811 } else { 2812 btrfs_tree_unlock(right); 2813 free_extent_buffer(right); 2814 } 2815 return 0; 2816 2817 out_unlock: 2818 btrfs_tree_unlock(right); 2819 free_extent_buffer(right); 2820 return 1; 2821 } 2822 2823 /* 2824 * push some data in the path leaf to the right, trying to free up at 2825 * least data_size bytes. returns zero if the push worked, nonzero otherwise 2826 * 2827 * returns 1 if the push failed because the other node didn't have enough 2828 * room, 0 if everything worked out and < 0 if there were major errors. 2829 * 2830 * this will push starting from min_slot to the end of the leaf. It won't 2831 * push any slot lower than min_slot 2832 */ 2833 static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root 2834 *root, struct btrfs_path *path, 2835 int min_data_size, int data_size, 2836 int empty, u32 min_slot) 2837 { 2838 struct extent_buffer *left = path->nodes[0]; 2839 struct extent_buffer *right; 2840 struct extent_buffer *upper; 2841 int slot; 2842 int free_space; 2843 u32 left_nritems; 2844 int ret; 2845 2846 if (!path->nodes[1]) 2847 return 1; 2848 2849 slot = path->slots[1]; 2850 upper = path->nodes[1]; 2851 if (slot >= btrfs_header_nritems(upper) - 1) 2852 return 1; 2853 2854 btrfs_assert_tree_locked(path->nodes[1]); 2855 2856 right = btrfs_read_node_slot(upper, slot + 1); 2857 /* 2858 * slot + 1 is not valid or we fail to read the right node, 2859 * no big deal, just return. 2860 */ 2861 if (IS_ERR(right)) 2862 return 1; 2863 2864 __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT); 2865 2866 free_space = btrfs_leaf_free_space(right); 2867 if (free_space < data_size) 2868 goto out_unlock; 2869 2870 /* cow and double check */ 2871 ret = btrfs_cow_block(trans, root, right, upper, 2872 slot + 1, &right, BTRFS_NESTING_RIGHT_COW); 2873 if (ret) 2874 goto out_unlock; 2875 2876 free_space = btrfs_leaf_free_space(right); 2877 if (free_space < data_size) 2878 goto out_unlock; 2879 2880 left_nritems = btrfs_header_nritems(left); 2881 if (left_nritems == 0) 2882 goto out_unlock; 2883 2884 if (check_sibling_keys(left, right)) { 2885 ret = -EUCLEAN; 2886 btrfs_tree_unlock(right); 2887 free_extent_buffer(right); 2888 return ret; 2889 } 2890 if (path->slots[0] == left_nritems && !empty) { 2891 /* Key greater than all keys in the leaf, right neighbor has 2892 * enough room for it and we're not emptying our leaf to delete 2893 * it, therefore use right neighbor to insert the new item and 2894 * no need to touch/dirty our left leaf. */ 2895 btrfs_tree_unlock(left); 2896 free_extent_buffer(left); 2897 path->nodes[0] = right; 2898 path->slots[0] = 0; 2899 path->slots[1]++; 2900 return 0; 2901 } 2902 2903 return __push_leaf_right(path, min_data_size, empty, 2904 right, free_space, left_nritems, min_slot); 2905 out_unlock: 2906 btrfs_tree_unlock(right); 2907 free_extent_buffer(right); 2908 return 1; 2909 } 2910 2911 /* 2912 * push some data in the path leaf to the left, trying to free up at 2913 * least data_size bytes. returns zero if the push worked, nonzero otherwise 2914 * 2915 * max_slot can put a limit on how far into the leaf we'll push items. The 2916 * item at 'max_slot' won't be touched. Use (u32)-1 to make us do all the 2917 * items 2918 */ 2919 static noinline int __push_leaf_left(struct btrfs_path *path, int data_size, 2920 int empty, struct extent_buffer *left, 2921 int free_space, u32 right_nritems, 2922 u32 max_slot) 2923 { 2924 struct btrfs_fs_info *fs_info = left->fs_info; 2925 struct btrfs_disk_key disk_key; 2926 struct extent_buffer *right = path->nodes[0]; 2927 int i; 2928 int push_space = 0; 2929 int push_items = 0; 2930 struct btrfs_item *item; 2931 u32 old_left_nritems; 2932 u32 nr; 2933 int ret = 0; 2934 u32 this_item_size; 2935 u32 old_left_item_size; 2936 struct btrfs_map_token token; 2937 2938 if (empty) 2939 nr = min(right_nritems, max_slot); 2940 else 2941 nr = min(right_nritems - 1, max_slot); 2942 2943 for (i = 0; i < nr; i++) { 2944 item = btrfs_item_nr(i); 2945 2946 if (!empty && push_items > 0) { 2947 if (path->slots[0] < i) 2948 break; 2949 if (path->slots[0] == i) { 2950 int space = btrfs_leaf_free_space(right); 2951 2952 if (space + push_space * 2 > free_space) 2953 break; 2954 } 2955 } 2956 2957 if (path->slots[0] == i) 2958 push_space += data_size; 2959 2960 this_item_size = btrfs_item_size(right, item); 2961 if (this_item_size + sizeof(*item) + push_space > free_space) 2962 break; 2963 2964 push_items++; 2965 push_space += this_item_size + sizeof(*item); 2966 } 2967 2968 if (push_items == 0) { 2969 ret = 1; 2970 goto out; 2971 } 2972 WARN_ON(!empty && push_items == btrfs_header_nritems(right)); 2973 2974 /* push data from right to left */ 2975 copy_extent_buffer(left, right, 2976 btrfs_item_nr_offset(btrfs_header_nritems(left)), 2977 btrfs_item_nr_offset(0), 2978 push_items * sizeof(struct btrfs_item)); 2979 2980 push_space = BTRFS_LEAF_DATA_SIZE(fs_info) - 2981 btrfs_item_offset_nr(right, push_items - 1); 2982 2983 copy_extent_buffer(left, right, BTRFS_LEAF_DATA_OFFSET + 2984 leaf_data_end(left) - push_space, 2985 BTRFS_LEAF_DATA_OFFSET + 2986 btrfs_item_offset_nr(right, push_items - 1), 2987 push_space); 2988 old_left_nritems = btrfs_header_nritems(left); 2989 BUG_ON(old_left_nritems <= 0); 2990 2991 btrfs_init_map_token(&token, left); 2992 old_left_item_size = btrfs_item_offset_nr(left, old_left_nritems - 1); 2993 for (i = old_left_nritems; i < old_left_nritems + push_items; i++) { 2994 u32 ioff; 2995 2996 item = btrfs_item_nr(i); 2997 2998 ioff = btrfs_token_item_offset(&token, item); 2999 btrfs_set_token_item_offset(&token, item, 3000 ioff - (BTRFS_LEAF_DATA_SIZE(fs_info) - old_left_item_size)); 3001 } 3002 btrfs_set_header_nritems(left, old_left_nritems + push_items); 3003 3004 /* fixup right node */ 3005 if (push_items > right_nritems) 3006 WARN(1, KERN_CRIT "push items %d nr %u\n", push_items, 3007 right_nritems); 3008 3009 if (push_items < right_nritems) { 3010 push_space = btrfs_item_offset_nr(right, push_items - 1) - 3011 leaf_data_end(right); 3012 memmove_extent_buffer(right, BTRFS_LEAF_DATA_OFFSET + 3013 BTRFS_LEAF_DATA_SIZE(fs_info) - push_space, 3014 BTRFS_LEAF_DATA_OFFSET + 3015 leaf_data_end(right), push_space); 3016 3017 memmove_extent_buffer(right, btrfs_item_nr_offset(0), 3018 btrfs_item_nr_offset(push_items), 3019 (btrfs_header_nritems(right) - push_items) * 3020 sizeof(struct btrfs_item)); 3021 } 3022 3023 btrfs_init_map_token(&token, right); 3024 right_nritems -= push_items; 3025 btrfs_set_header_nritems(right, right_nritems); 3026 push_space = BTRFS_LEAF_DATA_SIZE(fs_info); 3027 for (i = 0; i < right_nritems; i++) { 3028 item = btrfs_item_nr(i); 3029 3030 push_space = push_space - btrfs_token_item_size(&token, item); 3031 btrfs_set_token_item_offset(&token, item, push_space); 3032 } 3033 3034 btrfs_mark_buffer_dirty(left); 3035 if (right_nritems) 3036 btrfs_mark_buffer_dirty(right); 3037 else 3038 btrfs_clean_tree_block(right); 3039 3040 btrfs_item_key(right, &disk_key, 0); 3041 fixup_low_keys(path, &disk_key, 1); 3042 3043 /* then fixup the leaf pointer in the path */ 3044 if (path->slots[0] < push_items) { 3045 path->slots[0] += old_left_nritems; 3046 btrfs_tree_unlock(path->nodes[0]); 3047 free_extent_buffer(path->nodes[0]); 3048 path->nodes[0] = left; 3049 path->slots[1] -= 1; 3050 } else { 3051 btrfs_tree_unlock(left); 3052 free_extent_buffer(left); 3053 path->slots[0] -= push_items; 3054 } 3055 BUG_ON(path->slots[0] < 0); 3056 return ret; 3057 out: 3058 btrfs_tree_unlock(left); 3059 free_extent_buffer(left); 3060 return ret; 3061 } 3062 3063 /* 3064 * push some data in the path leaf to the left, trying to free up at 3065 * least data_size bytes. returns zero if the push worked, nonzero otherwise 3066 * 3067 * max_slot can put a limit on how far into the leaf we'll push items. The 3068 * item at 'max_slot' won't be touched. Use (u32)-1 to make us push all the 3069 * items 3070 */ 3071 static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root 3072 *root, struct btrfs_path *path, int min_data_size, 3073 int data_size, int empty, u32 max_slot) 3074 { 3075 struct extent_buffer *right = path->nodes[0]; 3076 struct extent_buffer *left; 3077 int slot; 3078 int free_space; 3079 u32 right_nritems; 3080 int ret = 0; 3081 3082 slot = path->slots[1]; 3083 if (slot == 0) 3084 return 1; 3085 if (!path->nodes[1]) 3086 return 1; 3087 3088 right_nritems = btrfs_header_nritems(right); 3089 if (right_nritems == 0) 3090 return 1; 3091 3092 btrfs_assert_tree_locked(path->nodes[1]); 3093 3094 left = btrfs_read_node_slot(path->nodes[1], slot - 1); 3095 /* 3096 * slot - 1 is not valid or we fail to read the left node, 3097 * no big deal, just return. 3098 */ 3099 if (IS_ERR(left)) 3100 return 1; 3101 3102 __btrfs_tree_lock(left, BTRFS_NESTING_LEFT); 3103 3104 free_space = btrfs_leaf_free_space(left); 3105 if (free_space < data_size) { 3106 ret = 1; 3107 goto out; 3108 } 3109 3110 /* cow and double check */ 3111 ret = btrfs_cow_block(trans, root, left, 3112 path->nodes[1], slot - 1, &left, 3113 BTRFS_NESTING_LEFT_COW); 3114 if (ret) { 3115 /* we hit -ENOSPC, but it isn't fatal here */ 3116 if (ret == -ENOSPC) 3117 ret = 1; 3118 goto out; 3119 } 3120 3121 free_space = btrfs_leaf_free_space(left); 3122 if (free_space < data_size) { 3123 ret = 1; 3124 goto out; 3125 } 3126 3127 if (check_sibling_keys(left, right)) { 3128 ret = -EUCLEAN; 3129 goto out; 3130 } 3131 return __push_leaf_left(path, min_data_size, 3132 empty, left, free_space, right_nritems, 3133 max_slot); 3134 out: 3135 btrfs_tree_unlock(left); 3136 free_extent_buffer(left); 3137 return ret; 3138 } 3139 3140 /* 3141 * split the path's leaf in two, making sure there is at least data_size 3142 * available for the resulting leaf level of the path. 3143 */ 3144 static noinline void copy_for_split(struct btrfs_trans_handle *trans, 3145 struct btrfs_path *path, 3146 struct extent_buffer *l, 3147 struct extent_buffer *right, 3148 int slot, int mid, int nritems) 3149 { 3150 struct btrfs_fs_info *fs_info = trans->fs_info; 3151 int data_copy_size; 3152 int rt_data_off; 3153 int i; 3154 struct btrfs_disk_key disk_key; 3155 struct btrfs_map_token token; 3156 3157 nritems = nritems - mid; 3158 btrfs_set_header_nritems(right, nritems); 3159 data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(l); 3160 3161 copy_extent_buffer(right, l, btrfs_item_nr_offset(0), 3162 btrfs_item_nr_offset(mid), 3163 nritems * sizeof(struct btrfs_item)); 3164 3165 copy_extent_buffer(right, l, 3166 BTRFS_LEAF_DATA_OFFSET + BTRFS_LEAF_DATA_SIZE(fs_info) - 3167 data_copy_size, BTRFS_LEAF_DATA_OFFSET + 3168 leaf_data_end(l), data_copy_size); 3169 3170 rt_data_off = BTRFS_LEAF_DATA_SIZE(fs_info) - btrfs_item_end_nr(l, mid); 3171 3172 btrfs_init_map_token(&token, right); 3173 for (i = 0; i < nritems; i++) { 3174 struct btrfs_item *item = btrfs_item_nr(i); 3175 u32 ioff; 3176 3177 ioff = btrfs_token_item_offset(&token, item); 3178 btrfs_set_token_item_offset(&token, item, ioff + rt_data_off); 3179 } 3180 3181 btrfs_set_header_nritems(l, mid); 3182 btrfs_item_key(right, &disk_key, 0); 3183 insert_ptr(trans, path, &disk_key, right->start, path->slots[1] + 1, 1); 3184 3185 btrfs_mark_buffer_dirty(right); 3186 btrfs_mark_buffer_dirty(l); 3187 BUG_ON(path->slots[0] != slot); 3188 3189 if (mid <= slot) { 3190 btrfs_tree_unlock(path->nodes[0]); 3191 free_extent_buffer(path->nodes[0]); 3192 path->nodes[0] = right; 3193 path->slots[0] -= mid; 3194 path->slots[1] += 1; 3195 } else { 3196 btrfs_tree_unlock(right); 3197 free_extent_buffer(right); 3198 } 3199 3200 BUG_ON(path->slots[0] < 0); 3201 } 3202 3203 /* 3204 * double splits happen when we need to insert a big item in the middle 3205 * of a leaf. A double split can leave us with 3 mostly empty leaves: 3206 * leaf: [ slots 0 - N] [ our target ] [ N + 1 - total in leaf ] 3207 * A B C 3208 * 3209 * We avoid this by trying to push the items on either side of our target 3210 * into the adjacent leaves. If all goes well we can avoid the double split 3211 * completely. 3212 */ 3213 static noinline int push_for_double_split(struct btrfs_trans_handle *trans, 3214 struct btrfs_root *root, 3215 struct btrfs_path *path, 3216 int data_size) 3217 { 3218 int ret; 3219 int progress = 0; 3220 int slot; 3221 u32 nritems; 3222 int space_needed = data_size; 3223 3224 slot = path->slots[0]; 3225 if (slot < btrfs_header_nritems(path->nodes[0])) 3226 space_needed -= btrfs_leaf_free_space(path->nodes[0]); 3227 3228 /* 3229 * try to push all the items after our slot into the 3230 * right leaf 3231 */ 3232 ret = push_leaf_right(trans, root, path, 1, space_needed, 0, slot); 3233 if (ret < 0) 3234 return ret; 3235 3236 if (ret == 0) 3237 progress++; 3238 3239 nritems = btrfs_header_nritems(path->nodes[0]); 3240 /* 3241 * our goal is to get our slot at the start or end of a leaf. If 3242 * we've done so we're done 3243 */ 3244 if (path->slots[0] == 0 || path->slots[0] == nritems) 3245 return 0; 3246 3247 if (btrfs_leaf_free_space(path->nodes[0]) >= data_size) 3248 return 0; 3249 3250 /* try to push all the items before our slot into the next leaf */ 3251 slot = path->slots[0]; 3252 space_needed = data_size; 3253 if (slot > 0) 3254 space_needed -= btrfs_leaf_free_space(path->nodes[0]); 3255 ret = push_leaf_left(trans, root, path, 1, space_needed, 0, slot); 3256 if (ret < 0) 3257 return ret; 3258 3259 if (ret == 0) 3260 progress++; 3261 3262 if (progress) 3263 return 0; 3264 return 1; 3265 } 3266 3267 /* 3268 * split the path's leaf in two, making sure there is at least data_size 3269 * available for the resulting leaf level of the path. 3270 * 3271 * returns 0 if all went well and < 0 on failure. 3272 */ 3273 static noinline int split_leaf(struct btrfs_trans_handle *trans, 3274 struct btrfs_root *root, 3275 const struct btrfs_key *ins_key, 3276 struct btrfs_path *path, int data_size, 3277 int extend) 3278 { 3279 struct btrfs_disk_key disk_key; 3280 struct extent_buffer *l; 3281 u32 nritems; 3282 int mid; 3283 int slot; 3284 struct extent_buffer *right; 3285 struct btrfs_fs_info *fs_info = root->fs_info; 3286 int ret = 0; 3287 int wret; 3288 int split; 3289 int num_doubles = 0; 3290 int tried_avoid_double = 0; 3291 3292 l = path->nodes[0]; 3293 slot = path->slots[0]; 3294 if (extend && data_size + btrfs_item_size_nr(l, slot) + 3295 sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(fs_info)) 3296 return -EOVERFLOW; 3297 3298 /* first try to make some room by pushing left and right */ 3299 if (data_size && path->nodes[1]) { 3300 int space_needed = data_size; 3301 3302 if (slot < btrfs_header_nritems(l)) 3303 space_needed -= btrfs_leaf_free_space(l); 3304 3305 wret = push_leaf_right(trans, root, path, space_needed, 3306 space_needed, 0, 0); 3307 if (wret < 0) 3308 return wret; 3309 if (wret) { 3310 space_needed = data_size; 3311 if (slot > 0) 3312 space_needed -= btrfs_leaf_free_space(l); 3313 wret = push_leaf_left(trans, root, path, space_needed, 3314 space_needed, 0, (u32)-1); 3315 if (wret < 0) 3316 return wret; 3317 } 3318 l = path->nodes[0]; 3319 3320 /* did the pushes work? */ 3321 if (btrfs_leaf_free_space(l) >= data_size) 3322 return 0; 3323 } 3324 3325 if (!path->nodes[1]) { 3326 ret = insert_new_root(trans, root, path, 1); 3327 if (ret) 3328 return ret; 3329 } 3330 again: 3331 split = 1; 3332 l = path->nodes[0]; 3333 slot = path->slots[0]; 3334 nritems = btrfs_header_nritems(l); 3335 mid = (nritems + 1) / 2; 3336 3337 if (mid <= slot) { 3338 if (nritems == 1 || 3339 leaf_space_used(l, mid, nritems - mid) + data_size > 3340 BTRFS_LEAF_DATA_SIZE(fs_info)) { 3341 if (slot >= nritems) { 3342 split = 0; 3343 } else { 3344 mid = slot; 3345 if (mid != nritems && 3346 leaf_space_used(l, mid, nritems - mid) + 3347 data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) { 3348 if (data_size && !tried_avoid_double) 3349 goto push_for_double; 3350 split = 2; 3351 } 3352 } 3353 } 3354 } else { 3355 if (leaf_space_used(l, 0, mid) + data_size > 3356 BTRFS_LEAF_DATA_SIZE(fs_info)) { 3357 if (!extend && data_size && slot == 0) { 3358 split = 0; 3359 } else if ((extend || !data_size) && slot == 0) { 3360 mid = 1; 3361 } else { 3362 mid = slot; 3363 if (mid != nritems && 3364 leaf_space_used(l, mid, nritems - mid) + 3365 data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) { 3366 if (data_size && !tried_avoid_double) 3367 goto push_for_double; 3368 split = 2; 3369 } 3370 } 3371 } 3372 } 3373 3374 if (split == 0) 3375 btrfs_cpu_key_to_disk(&disk_key, ins_key); 3376 else 3377 btrfs_item_key(l, &disk_key, mid); 3378 3379 /* 3380 * We have to about BTRFS_NESTING_NEW_ROOT here if we've done a double 3381 * split, because we're only allowed to have MAX_LOCKDEP_SUBCLASSES 3382 * subclasses, which is 8 at the time of this patch, and we've maxed it 3383 * out. In the future we could add a 3384 * BTRFS_NESTING_SPLIT_THE_SPLITTENING if we need to, but for now just 3385 * use BTRFS_NESTING_NEW_ROOT. 3386 */ 3387 right = alloc_tree_block_no_bg_flush(trans, root, 0, &disk_key, 0, 3388 l->start, 0, num_doubles ? 3389 BTRFS_NESTING_NEW_ROOT : 3390 BTRFS_NESTING_SPLIT); 3391 if (IS_ERR(right)) 3392 return PTR_ERR(right); 3393 3394 root_add_used(root, fs_info->nodesize); 3395 3396 if (split == 0) { 3397 if (mid <= slot) { 3398 btrfs_set_header_nritems(right, 0); 3399 insert_ptr(trans, path, &disk_key, 3400 right->start, path->slots[1] + 1, 1); 3401 btrfs_tree_unlock(path->nodes[0]); 3402 free_extent_buffer(path->nodes[0]); 3403 path->nodes[0] = right; 3404 path->slots[0] = 0; 3405 path->slots[1] += 1; 3406 } else { 3407 btrfs_set_header_nritems(right, 0); 3408 insert_ptr(trans, path, &disk_key, 3409 right->start, path->slots[1], 1); 3410 btrfs_tree_unlock(path->nodes[0]); 3411 free_extent_buffer(path->nodes[0]); 3412 path->nodes[0] = right; 3413 path->slots[0] = 0; 3414 if (path->slots[1] == 0) 3415 fixup_low_keys(path, &disk_key, 1); 3416 } 3417 /* 3418 * We create a new leaf 'right' for the required ins_len and 3419 * we'll do btrfs_mark_buffer_dirty() on this leaf after copying 3420 * the content of ins_len to 'right'. 3421 */ 3422 return ret; 3423 } 3424 3425 copy_for_split(trans, path, l, right, slot, mid, nritems); 3426 3427 if (split == 2) { 3428 BUG_ON(num_doubles != 0); 3429 num_doubles++; 3430 goto again; 3431 } 3432 3433 return 0; 3434 3435 push_for_double: 3436 push_for_double_split(trans, root, path, data_size); 3437 tried_avoid_double = 1; 3438 if (btrfs_leaf_free_space(path->nodes[0]) >= data_size) 3439 return 0; 3440 goto again; 3441 } 3442 3443 static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans, 3444 struct btrfs_root *root, 3445 struct btrfs_path *path, int ins_len) 3446 { 3447 struct btrfs_key key; 3448 struct extent_buffer *leaf; 3449 struct btrfs_file_extent_item *fi; 3450 u64 extent_len = 0; 3451 u32 item_size; 3452 int ret; 3453 3454 leaf = path->nodes[0]; 3455 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 3456 3457 BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY && 3458 key.type != BTRFS_EXTENT_CSUM_KEY); 3459 3460 if (btrfs_leaf_free_space(leaf) >= ins_len) 3461 return 0; 3462 3463 item_size = btrfs_item_size_nr(leaf, path->slots[0]); 3464 if (key.type == BTRFS_EXTENT_DATA_KEY) { 3465 fi = btrfs_item_ptr(leaf, path->slots[0], 3466 struct btrfs_file_extent_item); 3467 extent_len = btrfs_file_extent_num_bytes(leaf, fi); 3468 } 3469 btrfs_release_path(path); 3470 3471 path->keep_locks = 1; 3472 path->search_for_split = 1; 3473 ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 3474 path->search_for_split = 0; 3475 if (ret > 0) 3476 ret = -EAGAIN; 3477 if (ret < 0) 3478 goto err; 3479 3480 ret = -EAGAIN; 3481 leaf = path->nodes[0]; 3482 /* if our item isn't there, return now */ 3483 if (item_size != btrfs_item_size_nr(leaf, path->slots[0])) 3484 goto err; 3485 3486 /* the leaf has changed, it now has room. return now */ 3487 if (btrfs_leaf_free_space(path->nodes[0]) >= ins_len) 3488 goto err; 3489 3490 if (key.type == BTRFS_EXTENT_DATA_KEY) { 3491 fi = btrfs_item_ptr(leaf, path->slots[0], 3492 struct btrfs_file_extent_item); 3493 if (extent_len != btrfs_file_extent_num_bytes(leaf, fi)) 3494 goto err; 3495 } 3496 3497 ret = split_leaf(trans, root, &key, path, ins_len, 1); 3498 if (ret) 3499 goto err; 3500 3501 path->keep_locks = 0; 3502 btrfs_unlock_up_safe(path, 1); 3503 return 0; 3504 err: 3505 path->keep_locks = 0; 3506 return ret; 3507 } 3508 3509 static noinline int split_item(struct btrfs_path *path, 3510 const struct btrfs_key *new_key, 3511 unsigned long split_offset) 3512 { 3513 struct extent_buffer *leaf; 3514 struct btrfs_item *item; 3515 struct btrfs_item *new_item; 3516 int slot; 3517 char *buf; 3518 u32 nritems; 3519 u32 item_size; 3520 u32 orig_offset; 3521 struct btrfs_disk_key disk_key; 3522 3523 leaf = path->nodes[0]; 3524 BUG_ON(btrfs_leaf_free_space(leaf) < sizeof(struct btrfs_item)); 3525 3526 item = btrfs_item_nr(path->slots[0]); 3527 orig_offset = btrfs_item_offset(leaf, item); 3528 item_size = btrfs_item_size(leaf, item); 3529 3530 buf = kmalloc(item_size, GFP_NOFS); 3531 if (!buf) 3532 return -ENOMEM; 3533 3534 read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf, 3535 path->slots[0]), item_size); 3536 3537 slot = path->slots[0] + 1; 3538 nritems = btrfs_header_nritems(leaf); 3539 if (slot != nritems) { 3540 /* shift the items */ 3541 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + 1), 3542 btrfs_item_nr_offset(slot), 3543 (nritems - slot) * sizeof(struct btrfs_item)); 3544 } 3545 3546 btrfs_cpu_key_to_disk(&disk_key, new_key); 3547 btrfs_set_item_key(leaf, &disk_key, slot); 3548 3549 new_item = btrfs_item_nr(slot); 3550 3551 btrfs_set_item_offset(leaf, new_item, orig_offset); 3552 btrfs_set_item_size(leaf, new_item, item_size - split_offset); 3553 3554 btrfs_set_item_offset(leaf, item, 3555 orig_offset + item_size - split_offset); 3556 btrfs_set_item_size(leaf, item, split_offset); 3557 3558 btrfs_set_header_nritems(leaf, nritems + 1); 3559 3560 /* write the data for the start of the original item */ 3561 write_extent_buffer(leaf, buf, 3562 btrfs_item_ptr_offset(leaf, path->slots[0]), 3563 split_offset); 3564 3565 /* write the data for the new item */ 3566 write_extent_buffer(leaf, buf + split_offset, 3567 btrfs_item_ptr_offset(leaf, slot), 3568 item_size - split_offset); 3569 btrfs_mark_buffer_dirty(leaf); 3570 3571 BUG_ON(btrfs_leaf_free_space(leaf) < 0); 3572 kfree(buf); 3573 return 0; 3574 } 3575 3576 /* 3577 * This function splits a single item into two items, 3578 * giving 'new_key' to the new item and splitting the 3579 * old one at split_offset (from the start of the item). 3580 * 3581 * The path may be released by this operation. After 3582 * the split, the path is pointing to the old item. The 3583 * new item is going to be in the same node as the old one. 3584 * 3585 * Note, the item being split must be smaller enough to live alone on 3586 * a tree block with room for one extra struct btrfs_item 3587 * 3588 * This allows us to split the item in place, keeping a lock on the 3589 * leaf the entire time. 3590 */ 3591 int btrfs_split_item(struct btrfs_trans_handle *trans, 3592 struct btrfs_root *root, 3593 struct btrfs_path *path, 3594 const struct btrfs_key *new_key, 3595 unsigned long split_offset) 3596 { 3597 int ret; 3598 ret = setup_leaf_for_split(trans, root, path, 3599 sizeof(struct btrfs_item)); 3600 if (ret) 3601 return ret; 3602 3603 ret = split_item(path, new_key, split_offset); 3604 return ret; 3605 } 3606 3607 /* 3608 * This function duplicate a item, giving 'new_key' to the new item. 3609 * It guarantees both items live in the same tree leaf and the new item 3610 * is contiguous with the original item. 3611 * 3612 * This allows us to split file extent in place, keeping a lock on the 3613 * leaf the entire time. 3614 */ 3615 int btrfs_duplicate_item(struct btrfs_trans_handle *trans, 3616 struct btrfs_root *root, 3617 struct btrfs_path *path, 3618 const struct btrfs_key *new_key) 3619 { 3620 struct extent_buffer *leaf; 3621 int ret; 3622 u32 item_size; 3623 3624 leaf = path->nodes[0]; 3625 item_size = btrfs_item_size_nr(leaf, path->slots[0]); 3626 ret = setup_leaf_for_split(trans, root, path, 3627 item_size + sizeof(struct btrfs_item)); 3628 if (ret) 3629 return ret; 3630 3631 path->slots[0]++; 3632 setup_items_for_insert(root, path, new_key, &item_size, 1); 3633 leaf = path->nodes[0]; 3634 memcpy_extent_buffer(leaf, 3635 btrfs_item_ptr_offset(leaf, path->slots[0]), 3636 btrfs_item_ptr_offset(leaf, path->slots[0] - 1), 3637 item_size); 3638 return 0; 3639 } 3640 3641 /* 3642 * make the item pointed to by the path smaller. new_size indicates 3643 * how small to make it, and from_end tells us if we just chop bytes 3644 * off the end of the item or if we shift the item to chop bytes off 3645 * the front. 3646 */ 3647 void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end) 3648 { 3649 int slot; 3650 struct extent_buffer *leaf; 3651 struct btrfs_item *item; 3652 u32 nritems; 3653 unsigned int data_end; 3654 unsigned int old_data_start; 3655 unsigned int old_size; 3656 unsigned int size_diff; 3657 int i; 3658 struct btrfs_map_token token; 3659 3660 leaf = path->nodes[0]; 3661 slot = path->slots[0]; 3662 3663 old_size = btrfs_item_size_nr(leaf, slot); 3664 if (old_size == new_size) 3665 return; 3666 3667 nritems = btrfs_header_nritems(leaf); 3668 data_end = leaf_data_end(leaf); 3669 3670 old_data_start = btrfs_item_offset_nr(leaf, slot); 3671 3672 size_diff = old_size - new_size; 3673 3674 BUG_ON(slot < 0); 3675 BUG_ON(slot >= nritems); 3676 3677 /* 3678 * item0..itemN ... dataN.offset..dataN.size .. data0.size 3679 */ 3680 /* first correct the data pointers */ 3681 btrfs_init_map_token(&token, leaf); 3682 for (i = slot; i < nritems; i++) { 3683 u32 ioff; 3684 item = btrfs_item_nr(i); 3685 3686 ioff = btrfs_token_item_offset(&token, item); 3687 btrfs_set_token_item_offset(&token, item, ioff + size_diff); 3688 } 3689 3690 /* shift the data */ 3691 if (from_end) { 3692 memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET + 3693 data_end + size_diff, BTRFS_LEAF_DATA_OFFSET + 3694 data_end, old_data_start + new_size - data_end); 3695 } else { 3696 struct btrfs_disk_key disk_key; 3697 u64 offset; 3698 3699 btrfs_item_key(leaf, &disk_key, slot); 3700 3701 if (btrfs_disk_key_type(&disk_key) == BTRFS_EXTENT_DATA_KEY) { 3702 unsigned long ptr; 3703 struct btrfs_file_extent_item *fi; 3704 3705 fi = btrfs_item_ptr(leaf, slot, 3706 struct btrfs_file_extent_item); 3707 fi = (struct btrfs_file_extent_item *)( 3708 (unsigned long)fi - size_diff); 3709 3710 if (btrfs_file_extent_type(leaf, fi) == 3711 BTRFS_FILE_EXTENT_INLINE) { 3712 ptr = btrfs_item_ptr_offset(leaf, slot); 3713 memmove_extent_buffer(leaf, ptr, 3714 (unsigned long)fi, 3715 BTRFS_FILE_EXTENT_INLINE_DATA_START); 3716 } 3717 } 3718 3719 memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET + 3720 data_end + size_diff, BTRFS_LEAF_DATA_OFFSET + 3721 data_end, old_data_start - data_end); 3722 3723 offset = btrfs_disk_key_offset(&disk_key); 3724 btrfs_set_disk_key_offset(&disk_key, offset + size_diff); 3725 btrfs_set_item_key(leaf, &disk_key, slot); 3726 if (slot == 0) 3727 fixup_low_keys(path, &disk_key, 1); 3728 } 3729 3730 item = btrfs_item_nr(slot); 3731 btrfs_set_item_size(leaf, item, new_size); 3732 btrfs_mark_buffer_dirty(leaf); 3733 3734 if (btrfs_leaf_free_space(leaf) < 0) { 3735 btrfs_print_leaf(leaf); 3736 BUG(); 3737 } 3738 } 3739 3740 /* 3741 * make the item pointed to by the path bigger, data_size is the added size. 3742 */ 3743 void btrfs_extend_item(struct btrfs_path *path, u32 data_size) 3744 { 3745 int slot; 3746 struct extent_buffer *leaf; 3747 struct btrfs_item *item; 3748 u32 nritems; 3749 unsigned int data_end; 3750 unsigned int old_data; 3751 unsigned int old_size; 3752 int i; 3753 struct btrfs_map_token token; 3754 3755 leaf = path->nodes[0]; 3756 3757 nritems = btrfs_header_nritems(leaf); 3758 data_end = leaf_data_end(leaf); 3759 3760 if (btrfs_leaf_free_space(leaf) < data_size) { 3761 btrfs_print_leaf(leaf); 3762 BUG(); 3763 } 3764 slot = path->slots[0]; 3765 old_data = btrfs_item_end_nr(leaf, slot); 3766 3767 BUG_ON(slot < 0); 3768 if (slot >= nritems) { 3769 btrfs_print_leaf(leaf); 3770 btrfs_crit(leaf->fs_info, "slot %d too large, nritems %d", 3771 slot, nritems); 3772 BUG(); 3773 } 3774 3775 /* 3776 * item0..itemN ... dataN.offset..dataN.size .. data0.size 3777 */ 3778 /* first correct the data pointers */ 3779 btrfs_init_map_token(&token, leaf); 3780 for (i = slot; i < nritems; i++) { 3781 u32 ioff; 3782 item = btrfs_item_nr(i); 3783 3784 ioff = btrfs_token_item_offset(&token, item); 3785 btrfs_set_token_item_offset(&token, item, ioff - data_size); 3786 } 3787 3788 /* shift the data */ 3789 memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET + 3790 data_end - data_size, BTRFS_LEAF_DATA_OFFSET + 3791 data_end, old_data - data_end); 3792 3793 data_end = old_data; 3794 old_size = btrfs_item_size_nr(leaf, slot); 3795 item = btrfs_item_nr(slot); 3796 btrfs_set_item_size(leaf, item, old_size + data_size); 3797 btrfs_mark_buffer_dirty(leaf); 3798 3799 if (btrfs_leaf_free_space(leaf) < 0) { 3800 btrfs_print_leaf(leaf); 3801 BUG(); 3802 } 3803 } 3804 3805 /** 3806 * setup_items_for_insert - Helper called before inserting one or more items 3807 * to a leaf. Main purpose is to save stack depth by doing the bulk of the work 3808 * in a function that doesn't call btrfs_search_slot 3809 * 3810 * @root: root we are inserting items to 3811 * @path: points to the leaf/slot where we are going to insert new items 3812 * @cpu_key: array of keys for items to be inserted 3813 * @data_size: size of the body of each item we are going to insert 3814 * @nr: size of @cpu_key/@data_size arrays 3815 */ 3816 void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, 3817 const struct btrfs_key *cpu_key, u32 *data_size, 3818 int nr) 3819 { 3820 struct btrfs_fs_info *fs_info = root->fs_info; 3821 struct btrfs_item *item; 3822 int i; 3823 u32 nritems; 3824 unsigned int data_end; 3825 struct btrfs_disk_key disk_key; 3826 struct extent_buffer *leaf; 3827 int slot; 3828 struct btrfs_map_token token; 3829 u32 total_size; 3830 u32 total_data = 0; 3831 3832 for (i = 0; i < nr; i++) 3833 total_data += data_size[i]; 3834 total_size = total_data + (nr * sizeof(struct btrfs_item)); 3835 3836 if (path->slots[0] == 0) { 3837 btrfs_cpu_key_to_disk(&disk_key, cpu_key); 3838 fixup_low_keys(path, &disk_key, 1); 3839 } 3840 btrfs_unlock_up_safe(path, 1); 3841 3842 leaf = path->nodes[0]; 3843 slot = path->slots[0]; 3844 3845 nritems = btrfs_header_nritems(leaf); 3846 data_end = leaf_data_end(leaf); 3847 3848 if (btrfs_leaf_free_space(leaf) < total_size) { 3849 btrfs_print_leaf(leaf); 3850 btrfs_crit(fs_info, "not enough freespace need %u have %d", 3851 total_size, btrfs_leaf_free_space(leaf)); 3852 BUG(); 3853 } 3854 3855 btrfs_init_map_token(&token, leaf); 3856 if (slot != nritems) { 3857 unsigned int old_data = btrfs_item_end_nr(leaf, slot); 3858 3859 if (old_data < data_end) { 3860 btrfs_print_leaf(leaf); 3861 btrfs_crit(fs_info, 3862 "item at slot %d with data offset %u beyond data end of leaf %u", 3863 slot, old_data, data_end); 3864 BUG(); 3865 } 3866 /* 3867 * item0..itemN ... dataN.offset..dataN.size .. data0.size 3868 */ 3869 /* first correct the data pointers */ 3870 for (i = slot; i < nritems; i++) { 3871 u32 ioff; 3872 3873 item = btrfs_item_nr(i); 3874 ioff = btrfs_token_item_offset(&token, item); 3875 btrfs_set_token_item_offset(&token, item, 3876 ioff - total_data); 3877 } 3878 /* shift the items */ 3879 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr), 3880 btrfs_item_nr_offset(slot), 3881 (nritems - slot) * sizeof(struct btrfs_item)); 3882 3883 /* shift the data */ 3884 memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET + 3885 data_end - total_data, BTRFS_LEAF_DATA_OFFSET + 3886 data_end, old_data - data_end); 3887 data_end = old_data; 3888 } 3889 3890 /* setup the item for the new data */ 3891 for (i = 0; i < nr; i++) { 3892 btrfs_cpu_key_to_disk(&disk_key, cpu_key + i); 3893 btrfs_set_item_key(leaf, &disk_key, slot + i); 3894 item = btrfs_item_nr(slot + i); 3895 data_end -= data_size[i]; 3896 btrfs_set_token_item_offset(&token, item, data_end); 3897 btrfs_set_token_item_size(&token, item, data_size[i]); 3898 } 3899 3900 btrfs_set_header_nritems(leaf, nritems + nr); 3901 btrfs_mark_buffer_dirty(leaf); 3902 3903 if (btrfs_leaf_free_space(leaf) < 0) { 3904 btrfs_print_leaf(leaf); 3905 BUG(); 3906 } 3907 } 3908 3909 /* 3910 * Given a key and some data, insert items into the tree. 3911 * This does all the path init required, making room in the tree if needed. 3912 */ 3913 int btrfs_insert_empty_items(struct btrfs_trans_handle *trans, 3914 struct btrfs_root *root, 3915 struct btrfs_path *path, 3916 const struct btrfs_key *cpu_key, u32 *data_size, 3917 int nr) 3918 { 3919 int ret = 0; 3920 int slot; 3921 int i; 3922 u32 total_size = 0; 3923 u32 total_data = 0; 3924 3925 for (i = 0; i < nr; i++) 3926 total_data += data_size[i]; 3927 3928 total_size = total_data + (nr * sizeof(struct btrfs_item)); 3929 ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1); 3930 if (ret == 0) 3931 return -EEXIST; 3932 if (ret < 0) 3933 return ret; 3934 3935 slot = path->slots[0]; 3936 BUG_ON(slot < 0); 3937 3938 setup_items_for_insert(root, path, cpu_key, data_size, nr); 3939 return 0; 3940 } 3941 3942 /* 3943 * Given a key and some data, insert an item into the tree. 3944 * This does all the path init required, making room in the tree if needed. 3945 */ 3946 int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, 3947 const struct btrfs_key *cpu_key, void *data, 3948 u32 data_size) 3949 { 3950 int ret = 0; 3951 struct btrfs_path *path; 3952 struct extent_buffer *leaf; 3953 unsigned long ptr; 3954 3955 path = btrfs_alloc_path(); 3956 if (!path) 3957 return -ENOMEM; 3958 ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size); 3959 if (!ret) { 3960 leaf = path->nodes[0]; 3961 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); 3962 write_extent_buffer(leaf, data, ptr, data_size); 3963 btrfs_mark_buffer_dirty(leaf); 3964 } 3965 btrfs_free_path(path); 3966 return ret; 3967 } 3968 3969 /* 3970 * delete the pointer from a given node. 3971 * 3972 * the tree should have been previously balanced so the deletion does not 3973 * empty a node. 3974 */ 3975 static void del_ptr(struct btrfs_root *root, struct btrfs_path *path, 3976 int level, int slot) 3977 { 3978 struct extent_buffer *parent = path->nodes[level]; 3979 u32 nritems; 3980 int ret; 3981 3982 nritems = btrfs_header_nritems(parent); 3983 if (slot != nritems - 1) { 3984 if (level) { 3985 ret = btrfs_tree_mod_log_insert_move(parent, slot, 3986 slot + 1, nritems - slot - 1); 3987 BUG_ON(ret < 0); 3988 } 3989 memmove_extent_buffer(parent, 3990 btrfs_node_key_ptr_offset(slot), 3991 btrfs_node_key_ptr_offset(slot + 1), 3992 sizeof(struct btrfs_key_ptr) * 3993 (nritems - slot - 1)); 3994 } else if (level) { 3995 ret = btrfs_tree_mod_log_insert_key(parent, slot, 3996 BTRFS_MOD_LOG_KEY_REMOVE, GFP_NOFS); 3997 BUG_ON(ret < 0); 3998 } 3999 4000 nritems--; 4001 btrfs_set_header_nritems(parent, nritems); 4002 if (nritems == 0 && parent == root->node) { 4003 BUG_ON(btrfs_header_level(root->node) != 1); 4004 /* just turn the root into a leaf and break */ 4005 btrfs_set_header_level(root->node, 0); 4006 } else if (slot == 0) { 4007 struct btrfs_disk_key disk_key; 4008 4009 btrfs_node_key(parent, &disk_key, 0); 4010 fixup_low_keys(path, &disk_key, level + 1); 4011 } 4012 btrfs_mark_buffer_dirty(parent); 4013 } 4014 4015 /* 4016 * a helper function to delete the leaf pointed to by path->slots[1] and 4017 * path->nodes[1]. 4018 * 4019 * This deletes the pointer in path->nodes[1] and frees the leaf 4020 * block extent. zero is returned if it all worked out, < 0 otherwise. 4021 * 4022 * The path must have already been setup for deleting the leaf, including 4023 * all the proper balancing. path->nodes[1] must be locked. 4024 */ 4025 static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans, 4026 struct btrfs_root *root, 4027 struct btrfs_path *path, 4028 struct extent_buffer *leaf) 4029 { 4030 WARN_ON(btrfs_header_generation(leaf) != trans->transid); 4031 del_ptr(root, path, 1, path->slots[1]); 4032 4033 /* 4034 * btrfs_free_extent is expensive, we want to make sure we 4035 * aren't holding any locks when we call it 4036 */ 4037 btrfs_unlock_up_safe(path, 0); 4038 4039 root_sub_used(root, leaf->len); 4040 4041 atomic_inc(&leaf->refs); 4042 btrfs_free_tree_block(trans, root, leaf, 0, 1); 4043 free_extent_buffer_stale(leaf); 4044 } 4045 /* 4046 * delete the item at the leaf level in path. If that empties 4047 * the leaf, remove it from the tree 4048 */ 4049 int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, 4050 struct btrfs_path *path, int slot, int nr) 4051 { 4052 struct btrfs_fs_info *fs_info = root->fs_info; 4053 struct extent_buffer *leaf; 4054 struct btrfs_item *item; 4055 u32 last_off; 4056 u32 dsize = 0; 4057 int ret = 0; 4058 int wret; 4059 int i; 4060 u32 nritems; 4061 4062 leaf = path->nodes[0]; 4063 last_off = btrfs_item_offset_nr(leaf, slot + nr - 1); 4064 4065 for (i = 0; i < nr; i++) 4066 dsize += btrfs_item_size_nr(leaf, slot + i); 4067 4068 nritems = btrfs_header_nritems(leaf); 4069 4070 if (slot + nr != nritems) { 4071 int data_end = leaf_data_end(leaf); 4072 struct btrfs_map_token token; 4073 4074 memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET + 4075 data_end + dsize, 4076 BTRFS_LEAF_DATA_OFFSET + data_end, 4077 last_off - data_end); 4078 4079 btrfs_init_map_token(&token, leaf); 4080 for (i = slot + nr; i < nritems; i++) { 4081 u32 ioff; 4082 4083 item = btrfs_item_nr(i); 4084 ioff = btrfs_token_item_offset(&token, item); 4085 btrfs_set_token_item_offset(&token, item, ioff + dsize); 4086 } 4087 4088 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot), 4089 btrfs_item_nr_offset(slot + nr), 4090 sizeof(struct btrfs_item) * 4091 (nritems - slot - nr)); 4092 } 4093 btrfs_set_header_nritems(leaf, nritems - nr); 4094 nritems -= nr; 4095 4096 /* delete the leaf if we've emptied it */ 4097 if (nritems == 0) { 4098 if (leaf == root->node) { 4099 btrfs_set_header_level(leaf, 0); 4100 } else { 4101 btrfs_clean_tree_block(leaf); 4102 btrfs_del_leaf(trans, root, path, leaf); 4103 } 4104 } else { 4105 int used = leaf_space_used(leaf, 0, nritems); 4106 if (slot == 0) { 4107 struct btrfs_disk_key disk_key; 4108 4109 btrfs_item_key(leaf, &disk_key, 0); 4110 fixup_low_keys(path, &disk_key, 1); 4111 } 4112 4113 /* delete the leaf if it is mostly empty */ 4114 if (used < BTRFS_LEAF_DATA_SIZE(fs_info) / 3) { 4115 /* push_leaf_left fixes the path. 4116 * make sure the path still points to our leaf 4117 * for possible call to del_ptr below 4118 */ 4119 slot = path->slots[1]; 4120 atomic_inc(&leaf->refs); 4121 4122 wret = push_leaf_left(trans, root, path, 1, 1, 4123 1, (u32)-1); 4124 if (wret < 0 && wret != -ENOSPC) 4125 ret = wret; 4126 4127 if (path->nodes[0] == leaf && 4128 btrfs_header_nritems(leaf)) { 4129 wret = push_leaf_right(trans, root, path, 1, 4130 1, 1, 0); 4131 if (wret < 0 && wret != -ENOSPC) 4132 ret = wret; 4133 } 4134 4135 if (btrfs_header_nritems(leaf) == 0) { 4136 path->slots[1] = slot; 4137 btrfs_del_leaf(trans, root, path, leaf); 4138 free_extent_buffer(leaf); 4139 ret = 0; 4140 } else { 4141 /* if we're still in the path, make sure 4142 * we're dirty. Otherwise, one of the 4143 * push_leaf functions must have already 4144 * dirtied this buffer 4145 */ 4146 if (path->nodes[0] == leaf) 4147 btrfs_mark_buffer_dirty(leaf); 4148 free_extent_buffer(leaf); 4149 } 4150 } else { 4151 btrfs_mark_buffer_dirty(leaf); 4152 } 4153 } 4154 return ret; 4155 } 4156 4157 /* 4158 * search the tree again to find a leaf with lesser keys 4159 * returns 0 if it found something or 1 if there are no lesser leaves. 4160 * returns < 0 on io errors. 4161 * 4162 * This may release the path, and so you may lose any locks held at the 4163 * time you call it. 4164 */ 4165 int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path) 4166 { 4167 struct btrfs_key key; 4168 struct btrfs_disk_key found_key; 4169 int ret; 4170 4171 btrfs_item_key_to_cpu(path->nodes[0], &key, 0); 4172 4173 if (key.offset > 0) { 4174 key.offset--; 4175 } else if (key.type > 0) { 4176 key.type--; 4177 key.offset = (u64)-1; 4178 } else if (key.objectid > 0) { 4179 key.objectid--; 4180 key.type = (u8)-1; 4181 key.offset = (u64)-1; 4182 } else { 4183 return 1; 4184 } 4185 4186 btrfs_release_path(path); 4187 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 4188 if (ret < 0) 4189 return ret; 4190 btrfs_item_key(path->nodes[0], &found_key, 0); 4191 ret = comp_keys(&found_key, &key); 4192 /* 4193 * We might have had an item with the previous key in the tree right 4194 * before we released our path. And after we released our path, that 4195 * item might have been pushed to the first slot (0) of the leaf we 4196 * were holding due to a tree balance. Alternatively, an item with the 4197 * previous key can exist as the only element of a leaf (big fat item). 4198 * Therefore account for these 2 cases, so that our callers (like 4199 * btrfs_previous_item) don't miss an existing item with a key matching 4200 * the previous key we computed above. 4201 */ 4202 if (ret <= 0) 4203 return 0; 4204 return 1; 4205 } 4206 4207 /* 4208 * A helper function to walk down the tree starting at min_key, and looking 4209 * for nodes or leaves that are have a minimum transaction id. 4210 * This is used by the btree defrag code, and tree logging 4211 * 4212 * This does not cow, but it does stuff the starting key it finds back 4213 * into min_key, so you can call btrfs_search_slot with cow=1 on the 4214 * key and get a writable path. 4215 * 4216 * This honors path->lowest_level to prevent descent past a given level 4217 * of the tree. 4218 * 4219 * min_trans indicates the oldest transaction that you are interested 4220 * in walking through. Any nodes or leaves older than min_trans are 4221 * skipped over (without reading them). 4222 * 4223 * returns zero if something useful was found, < 0 on error and 1 if there 4224 * was nothing in the tree that matched the search criteria. 4225 */ 4226 int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key, 4227 struct btrfs_path *path, 4228 u64 min_trans) 4229 { 4230 struct extent_buffer *cur; 4231 struct btrfs_key found_key; 4232 int slot; 4233 int sret; 4234 u32 nritems; 4235 int level; 4236 int ret = 1; 4237 int keep_locks = path->keep_locks; 4238 4239 path->keep_locks = 1; 4240 again: 4241 cur = btrfs_read_lock_root_node(root); 4242 level = btrfs_header_level(cur); 4243 WARN_ON(path->nodes[level]); 4244 path->nodes[level] = cur; 4245 path->locks[level] = BTRFS_READ_LOCK; 4246 4247 if (btrfs_header_generation(cur) < min_trans) { 4248 ret = 1; 4249 goto out; 4250 } 4251 while (1) { 4252 nritems = btrfs_header_nritems(cur); 4253 level = btrfs_header_level(cur); 4254 sret = btrfs_bin_search(cur, min_key, &slot); 4255 if (sret < 0) { 4256 ret = sret; 4257 goto out; 4258 } 4259 4260 /* at the lowest level, we're done, setup the path and exit */ 4261 if (level == path->lowest_level) { 4262 if (slot >= nritems) 4263 goto find_next_key; 4264 ret = 0; 4265 path->slots[level] = slot; 4266 btrfs_item_key_to_cpu(cur, &found_key, slot); 4267 goto out; 4268 } 4269 if (sret && slot > 0) 4270 slot--; 4271 /* 4272 * check this node pointer against the min_trans parameters. 4273 * If it is too old, skip to the next one. 4274 */ 4275 while (slot < nritems) { 4276 u64 gen; 4277 4278 gen = btrfs_node_ptr_generation(cur, slot); 4279 if (gen < min_trans) { 4280 slot++; 4281 continue; 4282 } 4283 break; 4284 } 4285 find_next_key: 4286 /* 4287 * we didn't find a candidate key in this node, walk forward 4288 * and find another one 4289 */ 4290 if (slot >= nritems) { 4291 path->slots[level] = slot; 4292 sret = btrfs_find_next_key(root, path, min_key, level, 4293 min_trans); 4294 if (sret == 0) { 4295 btrfs_release_path(path); 4296 goto again; 4297 } else { 4298 goto out; 4299 } 4300 } 4301 /* save our key for returning back */ 4302 btrfs_node_key_to_cpu(cur, &found_key, slot); 4303 path->slots[level] = slot; 4304 if (level == path->lowest_level) { 4305 ret = 0; 4306 goto out; 4307 } 4308 cur = btrfs_read_node_slot(cur, slot); 4309 if (IS_ERR(cur)) { 4310 ret = PTR_ERR(cur); 4311 goto out; 4312 } 4313 4314 btrfs_tree_read_lock(cur); 4315 4316 path->locks[level - 1] = BTRFS_READ_LOCK; 4317 path->nodes[level - 1] = cur; 4318 unlock_up(path, level, 1, 0, NULL); 4319 } 4320 out: 4321 path->keep_locks = keep_locks; 4322 if (ret == 0) { 4323 btrfs_unlock_up_safe(path, path->lowest_level + 1); 4324 memcpy(min_key, &found_key, sizeof(found_key)); 4325 } 4326 return ret; 4327 } 4328 4329 /* 4330 * this is similar to btrfs_next_leaf, but does not try to preserve 4331 * and fixup the path. It looks for and returns the next key in the 4332 * tree based on the current path and the min_trans parameters. 4333 * 4334 * 0 is returned if another key is found, < 0 if there are any errors 4335 * and 1 is returned if there are no higher keys in the tree 4336 * 4337 * path->keep_locks should be set to 1 on the search made before 4338 * calling this function. 4339 */ 4340 int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path, 4341 struct btrfs_key *key, int level, u64 min_trans) 4342 { 4343 int slot; 4344 struct extent_buffer *c; 4345 4346 WARN_ON(!path->keep_locks && !path->skip_locking); 4347 while (level < BTRFS_MAX_LEVEL) { 4348 if (!path->nodes[level]) 4349 return 1; 4350 4351 slot = path->slots[level] + 1; 4352 c = path->nodes[level]; 4353 next: 4354 if (slot >= btrfs_header_nritems(c)) { 4355 int ret; 4356 int orig_lowest; 4357 struct btrfs_key cur_key; 4358 if (level + 1 >= BTRFS_MAX_LEVEL || 4359 !path->nodes[level + 1]) 4360 return 1; 4361 4362 if (path->locks[level + 1] || path->skip_locking) { 4363 level++; 4364 continue; 4365 } 4366 4367 slot = btrfs_header_nritems(c) - 1; 4368 if (level == 0) 4369 btrfs_item_key_to_cpu(c, &cur_key, slot); 4370 else 4371 btrfs_node_key_to_cpu(c, &cur_key, slot); 4372 4373 orig_lowest = path->lowest_level; 4374 btrfs_release_path(path); 4375 path->lowest_level = level; 4376 ret = btrfs_search_slot(NULL, root, &cur_key, path, 4377 0, 0); 4378 path->lowest_level = orig_lowest; 4379 if (ret < 0) 4380 return ret; 4381 4382 c = path->nodes[level]; 4383 slot = path->slots[level]; 4384 if (ret == 0) 4385 slot++; 4386 goto next; 4387 } 4388 4389 if (level == 0) 4390 btrfs_item_key_to_cpu(c, key, slot); 4391 else { 4392 u64 gen = btrfs_node_ptr_generation(c, slot); 4393 4394 if (gen < min_trans) { 4395 slot++; 4396 goto next; 4397 } 4398 btrfs_node_key_to_cpu(c, key, slot); 4399 } 4400 return 0; 4401 } 4402 return 1; 4403 } 4404 4405 /* 4406 * search the tree again to find a leaf with greater keys 4407 * returns 0 if it found something or 1 if there are no greater leaves. 4408 * returns < 0 on io errors. 4409 */ 4410 int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path) 4411 { 4412 return btrfs_next_old_leaf(root, path, 0); 4413 } 4414 4415 int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path, 4416 u64 time_seq) 4417 { 4418 int slot; 4419 int level; 4420 struct extent_buffer *c; 4421 struct extent_buffer *next; 4422 struct btrfs_key key; 4423 u32 nritems; 4424 int ret; 4425 int i; 4426 4427 nritems = btrfs_header_nritems(path->nodes[0]); 4428 if (nritems == 0) 4429 return 1; 4430 4431 btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1); 4432 again: 4433 level = 1; 4434 next = NULL; 4435 btrfs_release_path(path); 4436 4437 path->keep_locks = 1; 4438 4439 if (time_seq) 4440 ret = btrfs_search_old_slot(root, &key, path, time_seq); 4441 else 4442 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 4443 path->keep_locks = 0; 4444 4445 if (ret < 0) 4446 return ret; 4447 4448 nritems = btrfs_header_nritems(path->nodes[0]); 4449 /* 4450 * by releasing the path above we dropped all our locks. A balance 4451 * could have added more items next to the key that used to be 4452 * at the very end of the block. So, check again here and 4453 * advance the path if there are now more items available. 4454 */ 4455 if (nritems > 0 && path->slots[0] < nritems - 1) { 4456 if (ret == 0) 4457 path->slots[0]++; 4458 ret = 0; 4459 goto done; 4460 } 4461 /* 4462 * So the above check misses one case: 4463 * - after releasing the path above, someone has removed the item that 4464 * used to be at the very end of the block, and balance between leafs 4465 * gets another one with bigger key.offset to replace it. 4466 * 4467 * This one should be returned as well, or we can get leaf corruption 4468 * later(esp. in __btrfs_drop_extents()). 4469 * 4470 * And a bit more explanation about this check, 4471 * with ret > 0, the key isn't found, the path points to the slot 4472 * where it should be inserted, so the path->slots[0] item must be the 4473 * bigger one. 4474 */ 4475 if (nritems > 0 && ret > 0 && path->slots[0] == nritems - 1) { 4476 ret = 0; 4477 goto done; 4478 } 4479 4480 while (level < BTRFS_MAX_LEVEL) { 4481 if (!path->nodes[level]) { 4482 ret = 1; 4483 goto done; 4484 } 4485 4486 slot = path->slots[level] + 1; 4487 c = path->nodes[level]; 4488 if (slot >= btrfs_header_nritems(c)) { 4489 level++; 4490 if (level == BTRFS_MAX_LEVEL) { 4491 ret = 1; 4492 goto done; 4493 } 4494 continue; 4495 } 4496 4497 4498 /* 4499 * Our current level is where we're going to start from, and to 4500 * make sure lockdep doesn't complain we need to drop our locks 4501 * and nodes from 0 to our current level. 4502 */ 4503 for (i = 0; i < level; i++) { 4504 if (path->locks[level]) { 4505 btrfs_tree_read_unlock(path->nodes[i]); 4506 path->locks[i] = 0; 4507 } 4508 free_extent_buffer(path->nodes[i]); 4509 path->nodes[i] = NULL; 4510 } 4511 4512 next = c; 4513 ret = read_block_for_search(root, path, &next, level, 4514 slot, &key); 4515 if (ret == -EAGAIN) 4516 goto again; 4517 4518 if (ret < 0) { 4519 btrfs_release_path(path); 4520 goto done; 4521 } 4522 4523 if (!path->skip_locking) { 4524 ret = btrfs_try_tree_read_lock(next); 4525 if (!ret && time_seq) { 4526 /* 4527 * If we don't get the lock, we may be racing 4528 * with push_leaf_left, holding that lock while 4529 * itself waiting for the leaf we've currently 4530 * locked. To solve this situation, we give up 4531 * on our lock and cycle. 4532 */ 4533 free_extent_buffer(next); 4534 btrfs_release_path(path); 4535 cond_resched(); 4536 goto again; 4537 } 4538 if (!ret) 4539 btrfs_tree_read_lock(next); 4540 } 4541 break; 4542 } 4543 path->slots[level] = slot; 4544 while (1) { 4545 level--; 4546 path->nodes[level] = next; 4547 path->slots[level] = 0; 4548 if (!path->skip_locking) 4549 path->locks[level] = BTRFS_READ_LOCK; 4550 if (!level) 4551 break; 4552 4553 ret = read_block_for_search(root, path, &next, level, 4554 0, &key); 4555 if (ret == -EAGAIN) 4556 goto again; 4557 4558 if (ret < 0) { 4559 btrfs_release_path(path); 4560 goto done; 4561 } 4562 4563 if (!path->skip_locking) 4564 btrfs_tree_read_lock(next); 4565 } 4566 ret = 0; 4567 done: 4568 unlock_up(path, 0, 1, 0, NULL); 4569 4570 return ret; 4571 } 4572 4573 /* 4574 * this uses btrfs_prev_leaf to walk backwards in the tree, and keeps 4575 * searching until it gets past min_objectid or finds an item of 'type' 4576 * 4577 * returns 0 if something is found, 1 if nothing was found and < 0 on error 4578 */ 4579 int btrfs_previous_item(struct btrfs_root *root, 4580 struct btrfs_path *path, u64 min_objectid, 4581 int type) 4582 { 4583 struct btrfs_key found_key; 4584 struct extent_buffer *leaf; 4585 u32 nritems; 4586 int ret; 4587 4588 while (1) { 4589 if (path->slots[0] == 0) { 4590 ret = btrfs_prev_leaf(root, path); 4591 if (ret != 0) 4592 return ret; 4593 } else { 4594 path->slots[0]--; 4595 } 4596 leaf = path->nodes[0]; 4597 nritems = btrfs_header_nritems(leaf); 4598 if (nritems == 0) 4599 return 1; 4600 if (path->slots[0] == nritems) 4601 path->slots[0]--; 4602 4603 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 4604 if (found_key.objectid < min_objectid) 4605 break; 4606 if (found_key.type == type) 4607 return 0; 4608 if (found_key.objectid == min_objectid && 4609 found_key.type < type) 4610 break; 4611 } 4612 return 1; 4613 } 4614 4615 /* 4616 * search in extent tree to find a previous Metadata/Data extent item with 4617 * min objecitd. 4618 * 4619 * returns 0 if something is found, 1 if nothing was found and < 0 on error 4620 */ 4621 int btrfs_previous_extent_item(struct btrfs_root *root, 4622 struct btrfs_path *path, u64 min_objectid) 4623 { 4624 struct btrfs_key found_key; 4625 struct extent_buffer *leaf; 4626 u32 nritems; 4627 int ret; 4628 4629 while (1) { 4630 if (path->slots[0] == 0) { 4631 ret = btrfs_prev_leaf(root, path); 4632 if (ret != 0) 4633 return ret; 4634 } else { 4635 path->slots[0]--; 4636 } 4637 leaf = path->nodes[0]; 4638 nritems = btrfs_header_nritems(leaf); 4639 if (nritems == 0) 4640 return 1; 4641 if (path->slots[0] == nritems) 4642 path->slots[0]--; 4643 4644 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 4645 if (found_key.objectid < min_objectid) 4646 break; 4647 if (found_key.type == BTRFS_EXTENT_ITEM_KEY || 4648 found_key.type == BTRFS_METADATA_ITEM_KEY) 4649 return 0; 4650 if (found_key.objectid == min_objectid && 4651 found_key.type < BTRFS_EXTENT_ITEM_KEY) 4652 break; 4653 } 4654 return 1; 4655 } 4656