/*
 * Copyright (C) 2007,2008 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 021110-1307, USA.
 */

#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/rbtree.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "print-tree.h"
#include "locking.h"

static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
		      *root, struct btrfs_path *path, int level);
static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root
		      *root, struct btrfs_key *ins_key,
		      struct btrfs_path *path, int data_size, int extend);
static int push_node_left(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root, struct extent_buffer *dst,
			  struct extent_buffer *src, int empty);
static int balance_node_right(struct btrfs_trans_handle *trans,
			      struct btrfs_root *root,
			      struct extent_buffer *dst_buf,
			      struct extent_buffer *src_buf);
static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		    struct btrfs_path *path, int level, int slot);
static void tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
				 struct extent_buffer *eb);
struct extent_buffer *read_old_tree_block(struct btrfs_root *root, u64 bytenr,
					  u32 blocksize, u64 parent_transid,
					  u64 time_seq);
struct extent_buffer *btrfs_find_old_tree_block(struct btrfs_root *root,
						u64 bytenr, u32 blocksize,
						u64 time_seq);

struct btrfs_path *btrfs_alloc_path(void)
{
	struct btrfs_path *path;
	path = kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS);
	return path;
}

/*
 * set all locked nodes in the path to blocking locks.  This should
 * be done before scheduling
 */
noinline void btrfs_set_path_blocking(struct btrfs_path *p)
{
	int i;
	for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
		if (!p->nodes[i] || !p->locks[i])
			continue;
		btrfs_set_lock_blocking_rw(p->nodes[i], p->locks[i]);
		if (p->locks[i] == BTRFS_READ_LOCK)
			p->locks[i] = BTRFS_READ_LOCK_BLOCKING;
		else if (p->locks[i] == BTRFS_WRITE_LOCK)
			p->locks[i] = BTRFS_WRITE_LOCK_BLOCKING;
	}
}

/*
 * reset all the locked nodes in the path to spinning locks.
 *
 * held is used to keep lockdep happy, when lockdep is enabled
 * we set held to a blocking lock before we go around and
 * retake all the spinlocks in the path.  You can safely use NULL
 * for held
 */
noinline void btrfs_clear_path_blocking(struct btrfs_path *p,
					struct extent_buffer *held, int held_rw)
{
	int i;

#ifdef CONFIG_DEBUG_LOCK_ALLOC
	/* lockdep really cares that we take all of these spinlocks
	 * in the right order.  If any of the locks in the path are not
	 * currently blocking, it is going to complain.  So, make really
	 * really sure by forcing the path to blocking before we clear
	 * the path blocking.
	 */
	if (held) {
		btrfs_set_lock_blocking_rw(held, held_rw);
		if (held_rw == BTRFS_WRITE_LOCK)
			held_rw = BTRFS_WRITE_LOCK_BLOCKING;
		else if (held_rw == BTRFS_READ_LOCK)
			held_rw = BTRFS_READ_LOCK_BLOCKING;
	}
	btrfs_set_path_blocking(p);
#endif

	for (i = BTRFS_MAX_LEVEL - 1; i >= 0; i--) {
		if (p->nodes[i] && p->locks[i]) {
			btrfs_clear_lock_blocking_rw(p->nodes[i], p->locks[i]);
			if (p->locks[i] == BTRFS_WRITE_LOCK_BLOCKING)
				p->locks[i] = BTRFS_WRITE_LOCK;
			else if (p->locks[i] == BTRFS_READ_LOCK_BLOCKING)
				p->locks[i] = BTRFS_READ_LOCK;
		}
	}

#ifdef CONFIG_DEBUG_LOCK_ALLOC
	if (held)
		btrfs_clear_lock_blocking_rw(held, held_rw);
#endif
}
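/*
 * Illustrative sketch (not from the original file): how the two helpers above
 * are meant to be paired.  A caller that is about to schedule while holding
 * path locks converts them to blocking locks first, then back to spinning
 * locks afterwards; do_blocking_work() is a hypothetical stand-in for any
 * operation that may sleep.
 */
#if 0
static int example_path_blocking(struct btrfs_path *p)
{
	int ret;

	btrfs_set_path_blocking(p);		/* before scheduling */
	ret = do_blocking_work();		/* may sleep */
	btrfs_clear_path_blocking(p, NULL, 0);	/* NULL held is safe */
	return ret;
}
#endif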
/* this also releases the path */
void btrfs_free_path(struct btrfs_path *p)
{
	if (!p)
		return;
	btrfs_release_path(p);
	kmem_cache_free(btrfs_path_cachep, p);
}

/*
 * path release drops references on the extent buffers in the path
 * and it drops any locks held by this path
 *
 * It is safe to call this on paths for which no locks or extent buffers
 * are held.
 */
noinline void btrfs_release_path(struct btrfs_path *p)
{
	int i;

	for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
		p->slots[i] = 0;
		if (!p->nodes[i])
			continue;
		if (p->locks[i]) {
			btrfs_tree_unlock_rw(p->nodes[i], p->locks[i]);
			p->locks[i] = 0;
		}
		free_extent_buffer(p->nodes[i]);
		p->nodes[i] = NULL;
	}
}

/*
 * safely gets a reference on the root node of a tree.  A lock
 * is not taken, so a concurrent writer may put a different node
 * at the root of the tree.  See btrfs_lock_root_node for the
 * looping required.
 *
 * The extent buffer returned by this has a reference taken, so
 * it won't disappear.  It may stop being the root of the tree
 * at any time because there are no locks held.
 */
struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
{
	struct extent_buffer *eb;

	while (1) {
		rcu_read_lock();
		eb = rcu_dereference(root->node);

		/*
		 * RCU really hurts here, we could free up the root node because
		 * it was cow'ed but we may not get the new root node yet so do
		 * the inc_not_zero dance and if it doesn't work then
		 * synchronize_rcu and try again.
		 */
		if (atomic_inc_not_zero(&eb->refs)) {
			rcu_read_unlock();
			break;
		}
		rcu_read_unlock();
		synchronize_rcu();
	}
	return eb;
}

/* loop around taking references on and locking the root node of the
 * tree until you end up with a lock on the root.  A locked buffer
 * is returned, with a reference held.
 */
struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)
{
	struct extent_buffer *eb;

	while (1) {
		eb = btrfs_root_node(root);
		btrfs_tree_lock(eb);
		if (eb == root->node)
			break;
		btrfs_tree_unlock(eb);
		free_extent_buffer(eb);
	}
	return eb;
}
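/*
 * Illustrative sketch (not from the original file): the usual lifecycle of a
 * btrfs_path.  btrfs_search_slot() is the tree search entry point referenced
 * later in this file; the root and key here are hypothetical.
 */
#if 0
static int example_path_lifecycle(struct btrfs_root *root,
				  struct btrfs_key *key)
{
	struct btrfs_path *path;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
	/* ... inspect path->nodes[0] / path->slots[0] ... */
	btrfs_release_path(path);	/* drops locks and buffer refs */
	btrfs_free_path(path);		/* releases again, then frees */
	return ret;
}
#endif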
/* loop around taking references on and locking the root node of the
 * tree until you end up with a lock on the root.  A locked buffer
 * is returned, with a reference held.
 */
struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root)
{
	struct extent_buffer *eb;

	while (1) {
		eb = btrfs_root_node(root);
		btrfs_tree_read_lock(eb);
		if (eb == root->node)
			break;
		btrfs_tree_read_unlock(eb);
		free_extent_buffer(eb);
	}
	return eb;
}

/* cowonly roots (everything not a reference counted cow subvolume) just get
 * put onto a simple dirty list.  transaction.c walks this to make sure they
 * get properly updated on disk.
 */
static void add_root_to_dirty_list(struct btrfs_root *root)
{
	spin_lock(&root->fs_info->trans_lock);
	if (root->track_dirty && list_empty(&root->dirty_list)) {
		list_add(&root->dirty_list,
			 &root->fs_info->dirty_cowonly_roots);
	}
	spin_unlock(&root->fs_info->trans_lock);
}

/*
 * used by snapshot creation to make a copy of a root for a tree with
 * a given objectid.  The buffer with the new root node is returned in
 * cow_ret, and this func returns zero on success or a negative error code.
 */
int btrfs_copy_root(struct btrfs_trans_handle *trans,
		    struct btrfs_root *root,
		    struct extent_buffer *buf,
		    struct extent_buffer **cow_ret, u64 new_root_objectid)
{
	struct extent_buffer *cow;
	int ret = 0;
	int level;
	struct btrfs_disk_key disk_key;

	WARN_ON(root->ref_cows && trans->transid !=
		root->fs_info->running_transaction->transid);
	WARN_ON(root->ref_cows && trans->transid != root->last_trans);

	level = btrfs_header_level(buf);
	if (level == 0)
		btrfs_item_key(buf, &disk_key, 0);
	else
		btrfs_node_key(buf, &disk_key, 0);

	cow = btrfs_alloc_free_block(trans, root, buf->len, 0,
				     new_root_objectid, &disk_key, level,
				     buf->start, 0);
	if (IS_ERR(cow))
		return PTR_ERR(cow);

	copy_extent_buffer(cow, buf, 0, 0, cow->len);
	btrfs_set_header_bytenr(cow, cow->start);
	btrfs_set_header_generation(cow, trans->transid);
	btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
				     BTRFS_HEADER_FLAG_RELOC);
	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
		btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
	else
		btrfs_set_header_owner(cow, new_root_objectid);

	write_extent_buffer(cow, root->fs_info->fsid,
			    (unsigned long)btrfs_header_fsid(cow),
			    BTRFS_FSID_SIZE);

	WARN_ON(btrfs_header_generation(buf) > trans->transid);
	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
		ret = btrfs_inc_ref(trans, root, cow, 1, 1);
	else
		ret = btrfs_inc_ref(trans, root, cow, 0, 1);

	if (ret)
		return ret;

	btrfs_mark_buffer_dirty(cow);
	*cow_ret = cow;
	return 0;
}

enum mod_log_op {
	MOD_LOG_KEY_REPLACE,
	MOD_LOG_KEY_ADD,
	MOD_LOG_KEY_REMOVE,
	MOD_LOG_KEY_REMOVE_WHILE_FREEING,
	MOD_LOG_KEY_REMOVE_WHILE_MOVING,
	MOD_LOG_MOVE_KEYS,
	MOD_LOG_ROOT_REPLACE,
};

struct tree_mod_move {
	int dst_slot;
	int nr_items;
};

struct tree_mod_root {
	u64 logical;
	u8 level;
};

struct tree_mod_elem {
	struct rb_node node;
	u64 index;		/* shifted logical */
	u64 seq;
	enum mod_log_op op;

	/* this is used for MOD_LOG_KEY_* and MOD_LOG_MOVE_KEYS operations */
	int slot;

	/* this is used for MOD_LOG_KEY* and MOD_LOG_ROOT_REPLACE */
	u64 generation;

	/* those are used for op == MOD_LOG_KEY_{REPLACE,REMOVE} */
	struct btrfs_disk_key key;
	u64 blockptr;

	/* this is used for op == MOD_LOG_MOVE_KEYS */
	struct tree_mod_move move;

	/* this is used for op == MOD_LOG_ROOT_REPLACE */
	struct tree_mod_root old_root;
};
static inline void tree_mod_log_read_lock(struct btrfs_fs_info *fs_info)
{
	read_lock(&fs_info->tree_mod_log_lock);
}

static inline void tree_mod_log_read_unlock(struct btrfs_fs_info *fs_info)
{
	read_unlock(&fs_info->tree_mod_log_lock);
}

static inline void tree_mod_log_write_lock(struct btrfs_fs_info *fs_info)
{
	write_lock(&fs_info->tree_mod_log_lock);
}

static inline void tree_mod_log_write_unlock(struct btrfs_fs_info *fs_info)
{
	write_unlock(&fs_info->tree_mod_log_lock);
}

/*
 * This adds a new blocker to the tree mod log's blocker list if the @elem
 * passed does not already have a sequence number set.  So when a caller
 * expects to record tree modifications, it should ensure that elem->seq is
 * zero before calling btrfs_get_tree_mod_seq.
 * Returns a fresh, unused tree log modification sequence number, even if no
 * new blocker was added.
 */
u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
			   struct seq_list *elem)
{
	u64 seq;

	tree_mod_log_write_lock(fs_info);
	spin_lock(&fs_info->tree_mod_seq_lock);
	if (!elem->seq) {
		elem->seq = btrfs_inc_tree_mod_seq(fs_info);
		list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
	}
	seq = btrfs_inc_tree_mod_seq(fs_info);
	spin_unlock(&fs_info->tree_mod_seq_lock);
	tree_mod_log_write_unlock(fs_info);

	return seq;
}

void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
			    struct seq_list *elem)
{
	struct rb_root *tm_root;
	struct rb_node *node;
	struct rb_node *next;
	struct seq_list *cur_elem;
	struct tree_mod_elem *tm;
	u64 min_seq = (u64)-1;
	u64 seq_putting = elem->seq;

	if (!seq_putting)
		return;

	spin_lock(&fs_info->tree_mod_seq_lock);
	list_del(&elem->list);
	elem->seq = 0;

	list_for_each_entry(cur_elem, &fs_info->tree_mod_seq_list, list) {
		if (cur_elem->seq < min_seq) {
			if (seq_putting > cur_elem->seq) {
				/*
				 * blocker with lower sequence number exists, we
				 * cannot remove anything from the log
				 */
				spin_unlock(&fs_info->tree_mod_seq_lock);
				return;
			}
			min_seq = cur_elem->seq;
		}
	}
	spin_unlock(&fs_info->tree_mod_seq_lock);

	/*
	 * anything that's lower than the lowest existing (read: blocked)
	 * sequence number can be removed from the tree.
	 */
	tree_mod_log_write_lock(fs_info);
	tm_root = &fs_info->tree_mod_log;
	for (node = rb_first(tm_root); node; node = next) {
		next = rb_next(node);
		tm = container_of(node, struct tree_mod_elem, node);
		if (tm->seq > min_seq)
			continue;
		rb_erase(node, tm_root);
		kfree(tm);
	}
	tree_mod_log_write_unlock(fs_info);
}
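/*
 * Illustrative sketch (not from the original file): pinning a point-in-time
 * view of the tree mod log.  As noted above btrfs_get_tree_mod_seq(), the
 * caller must make sure elem->seq starts out zero.
 */
#if 0
static void example_tree_mod_seq(struct btrfs_fs_info *fs_info)
{
	struct seq_list elem = {};	/* seq == 0, so a blocker is added */
	u64 time_seq;

	time_seq = btrfs_get_tree_mod_seq(fs_info, &elem);
	/* ... rewind/inspect tree state as of time_seq ... */
	btrfs_put_tree_mod_seq(fs_info, &elem);	/* may prune old log entries */
}
#endif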
/*
 * key order of the log:
 *       index -> sequence
 *
 * the index is the shifted logical of the *new* root node for root replace
 * operations, or the shifted logical of the affected block for all other
 * operations.
 */
static noinline int
__tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm)
{
	struct rb_root *tm_root;
	struct rb_node **new;
	struct rb_node *parent = NULL;
	struct tree_mod_elem *cur;

	BUG_ON(!tm || !tm->seq);

	tm_root = &fs_info->tree_mod_log;
	new = &tm_root->rb_node;
	while (*new) {
		cur = container_of(*new, struct tree_mod_elem, node);
		parent = *new;
		if (cur->index < tm->index)
			new = &((*new)->rb_left);
		else if (cur->index > tm->index)
			new = &((*new)->rb_right);
		else if (cur->seq < tm->seq)
			new = &((*new)->rb_left);
		else if (cur->seq > tm->seq)
			new = &((*new)->rb_right);
		else {
			kfree(tm);
			return -EEXIST;
		}
	}

	rb_link_node(&tm->node, parent, new);
	rb_insert_color(&tm->node, tm_root);
	return 0;
}

/*
 * Determines if logging can be omitted.  Returns 1 if it can.  Otherwise, it
 * returns zero with the tree_mod_log_lock acquired.  The caller must hold
 * this until all tree mod log insertions are recorded in the rb tree and then
 * call tree_mod_log_write_unlock() to release.
 */
static inline int tree_mod_dont_log(struct btrfs_fs_info *fs_info,
				    struct extent_buffer *eb)
{
	smp_mb();
	if (list_empty(&(fs_info)->tree_mod_seq_list))
		return 1;
	if (eb && btrfs_header_level(eb) == 0)
		return 1;

	tree_mod_log_write_lock(fs_info);
	if (list_empty(&fs_info->tree_mod_seq_list)) {
		/*
		 * someone emptied the list while we were waiting for the lock.
		 * we must not add to the list when no blocker exists.
		 */
		tree_mod_log_write_unlock(fs_info);
		return 1;
	}

	return 0;
}
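/*
 * Illustrative sketch (not from the original file): the locking contract of
 * tree_mod_dont_log().  A return of 1 means nothing needs logging and no lock
 * is held; a return of 0 means the write lock is held and the caller must
 * drop it once its entries are queued, which is exactly the shape used by the
 * tree_mod_log_insert_* helpers below.
 */
#if 0
	if (tree_mod_dont_log(fs_info, eb))
		return 0;			/* no blockers, nothing to log */
	/* ... queue one or more entries via __tree_mod_log_insert() ... */
	tree_mod_log_write_unlock(fs_info);	/* pairs with the 0 return */
#endif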
/*
 * This allocates memory and gets a tree modification sequence number.
 *
 * Returns <0 on error.
 * Returns >0 (the added sequence number) on success.
 */
static inline int tree_mod_alloc(struct btrfs_fs_info *fs_info, gfp_t flags,
				 struct tree_mod_elem **tm_ret)
{
	struct tree_mod_elem *tm;

	/*
	 * once we switch from spin locks to something different, we should
	 * honor the flags parameter here.
	 */
	tm = *tm_ret = kzalloc(sizeof(*tm), GFP_ATOMIC);
	if (!tm)
		return -ENOMEM;

	tm->seq = btrfs_inc_tree_mod_seq(fs_info);
	return tm->seq;
}

static inline int
__tree_mod_log_insert_key(struct btrfs_fs_info *fs_info,
			  struct extent_buffer *eb, int slot,
			  enum mod_log_op op, gfp_t flags)
{
	int ret;
	struct tree_mod_elem *tm;

	ret = tree_mod_alloc(fs_info, flags, &tm);
	if (ret < 0)
		return ret;

	tm->index = eb->start >> PAGE_CACHE_SHIFT;
	if (op != MOD_LOG_KEY_ADD) {
		btrfs_node_key(eb, &tm->key, slot);
		tm->blockptr = btrfs_node_blockptr(eb, slot);
	}
	tm->op = op;
	tm->slot = slot;
	tm->generation = btrfs_node_ptr_generation(eb, slot);

	return __tree_mod_log_insert(fs_info, tm);
}

static noinline int
tree_mod_log_insert_key_mask(struct btrfs_fs_info *fs_info,
			     struct extent_buffer *eb, int slot,
			     enum mod_log_op op, gfp_t flags)
{
	int ret;

	if (tree_mod_dont_log(fs_info, eb))
		return 0;

	ret = __tree_mod_log_insert_key(fs_info, eb, slot, op, flags);

	tree_mod_log_write_unlock(fs_info);
	return ret;
}

static noinline int
tree_mod_log_insert_key(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
			int slot, enum mod_log_op op)
{
	return tree_mod_log_insert_key_mask(fs_info, eb, slot, op, GFP_NOFS);
}

static noinline int
tree_mod_log_insert_key_locked(struct btrfs_fs_info *fs_info,
			       struct extent_buffer *eb, int slot,
			       enum mod_log_op op)
{
	return __tree_mod_log_insert_key(fs_info, eb, slot, op, GFP_NOFS);
}
static noinline int
tree_mod_log_insert_move(struct btrfs_fs_info *fs_info,
			 struct extent_buffer *eb, int dst_slot, int src_slot,
			 int nr_items, gfp_t flags)
{
	struct tree_mod_elem *tm;
	int ret;
	int i;

	if (tree_mod_dont_log(fs_info, eb))
		return 0;

	/*
	 * When we overwrite something during the move, we log these removals.
	 * This can only happen when we move towards the beginning of the
	 * buffer, i.e. dst_slot < src_slot.
	 */
	for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
		ret = tree_mod_log_insert_key_locked(fs_info, eb, i + dst_slot,
					      MOD_LOG_KEY_REMOVE_WHILE_MOVING);
		BUG_ON(ret < 0);
	}

	ret = tree_mod_alloc(fs_info, flags, &tm);
	if (ret < 0)
		goto out;

	tm->index = eb->start >> PAGE_CACHE_SHIFT;
	tm->slot = src_slot;
	tm->move.dst_slot = dst_slot;
	tm->move.nr_items = nr_items;
	tm->op = MOD_LOG_MOVE_KEYS;

	ret = __tree_mod_log_insert(fs_info, tm);
out:
	tree_mod_log_write_unlock(fs_info);
	return ret;
}

static inline void
__tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb)
{
	int i;
	u32 nritems;
	int ret;

	if (btrfs_header_level(eb) == 0)
		return;

	nritems = btrfs_header_nritems(eb);
	for (i = nritems - 1; i >= 0; i--) {
		ret = tree_mod_log_insert_key_locked(fs_info, eb, i,
					     MOD_LOG_KEY_REMOVE_WHILE_FREEING);
		BUG_ON(ret < 0);
	}
}

static noinline int
tree_mod_log_insert_root(struct btrfs_fs_info *fs_info,
			 struct extent_buffer *old_root,
			 struct extent_buffer *new_root, gfp_t flags)
{
	struct tree_mod_elem *tm;
	int ret;

	if (tree_mod_dont_log(fs_info, NULL))
		return 0;

	__tree_mod_log_free_eb(fs_info, old_root);

	ret = tree_mod_alloc(fs_info, flags, &tm);
	if (ret < 0)
		goto out;

	tm->index = new_root->start >> PAGE_CACHE_SHIFT;
	tm->old_root.logical = old_root->start;
	tm->old_root.level = btrfs_header_level(old_root);
	tm->generation = btrfs_header_generation(old_root);
	tm->op = MOD_LOG_ROOT_REPLACE;

	ret = __tree_mod_log_insert(fs_info, tm);
out:
	tree_mod_log_write_unlock(fs_info);
	return ret;
}

static struct tree_mod_elem *
__tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq,
		      int smallest)
{
	struct rb_root *tm_root;
	struct rb_node *node;
	struct tree_mod_elem *cur = NULL;
	struct tree_mod_elem *found = NULL;
	u64 index = start >> PAGE_CACHE_SHIFT;

	tree_mod_log_read_lock(fs_info);
	tm_root = &fs_info->tree_mod_log;
	node = tm_root->rb_node;
	while (node) {
		cur = container_of(node, struct tree_mod_elem, node);
		if (cur->index < index) {
			node = node->rb_left;
		} else if (cur->index > index) {
			node = node->rb_right;
		} else if (cur->seq < min_seq) {
			node = node->rb_left;
		} else if (!smallest) {
			/* we want the node with the highest seq */
			if (found)
				BUG_ON(found->seq > cur->seq);
			found = cur;
			node = node->rb_left;
		} else if (cur->seq > min_seq) {
			/* we want the node with the smallest seq */
			if (found)
				BUG_ON(found->seq < cur->seq);
			found = cur;
			node = node->rb_right;
		} else {
			found = cur;
			break;
		}
	}
	tree_mod_log_read_unlock(fs_info);

	return found;
}

/*
 * this returns the element from the log with the smallest time sequence
 * value that's in the log (the oldest log item).  any element with a time
 * sequence lower than min_seq will be ignored.
 */
static struct tree_mod_elem *
tree_mod_log_search_oldest(struct btrfs_fs_info *fs_info, u64 start,
			   u64 min_seq)
{
	return __tree_mod_log_search(fs_info, start, min_seq, 1);
}
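/*
 * Illustrative worked example (not from the original file): suppose the log
 * holds entries with seq 3, 5 and 9 for one block and min_seq is 4.  Entries
 * with seq < 4 are ignored, so the "oldest" search above returns the seq 5
 * entry, while the "newest" search that follows returns the seq 9 entry.
 */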
/*
 * this returns the element from the log with the largest time sequence
 * value that's in the log (the most recent log item).  any element with
 * a time sequence lower than min_seq will be ignored.
 */
static struct tree_mod_elem *
tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq)
{
	return __tree_mod_log_search(fs_info, start, min_seq, 0);
}

static noinline void
tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
		     struct extent_buffer *src, unsigned long dst_offset,
		     unsigned long src_offset, int nr_items, int log_removal)
{
	int ret;
	int i;

	if (tree_mod_dont_log(fs_info, NULL))
		return;

	if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0) {
		tree_mod_log_write_unlock(fs_info);
		return;
	}

	for (i = 0; i < nr_items; i++) {
		if (log_removal) {
			ret = tree_mod_log_insert_key_locked(fs_info, src,
							i + src_offset,
							MOD_LOG_KEY_REMOVE);
			BUG_ON(ret < 0);
		}
		ret = tree_mod_log_insert_key_locked(fs_info, dst,
						     i + dst_offset,
						     MOD_LOG_KEY_ADD);
		BUG_ON(ret < 0);
	}

	tree_mod_log_write_unlock(fs_info);
}

static inline void
tree_mod_log_eb_move(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
		     int dst_offset, int src_offset, int nr_items)
{
	int ret;
	ret = tree_mod_log_insert_move(fs_info, dst, dst_offset, src_offset,
				       nr_items, GFP_NOFS);
	BUG_ON(ret < 0);
}

static noinline void
tree_mod_log_set_node_key(struct btrfs_fs_info *fs_info,
			  struct extent_buffer *eb, int slot, int atomic)
{
	int ret;

	ret = tree_mod_log_insert_key_mask(fs_info, eb, slot,
					   MOD_LOG_KEY_REPLACE,
					   atomic ? GFP_ATOMIC : GFP_NOFS);
	BUG_ON(ret < 0);
}

static noinline void
tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb)
{
	if (tree_mod_dont_log(fs_info, eb))
		return;

	__tree_mod_log_free_eb(fs_info, eb);

	tree_mod_log_write_unlock(fs_info);
}

static noinline void
tree_mod_log_set_root_pointer(struct btrfs_root *root,
			      struct extent_buffer *new_root_node)
{
	int ret;
	ret = tree_mod_log_insert_root(root->fs_info, root->node,
				       new_root_node, GFP_NOFS);
	BUG_ON(ret < 0);
}

/*
 * check if the tree block can be shared by multiple trees
 */
int btrfs_block_can_be_shared(struct btrfs_root *root,
			      struct extent_buffer *buf)
{
	/*
	 * Tree blocks not in reference counted trees and tree roots
	 * are never shared.  If a block was allocated after the last
	 * snapshot and the block was not allocated by tree relocation,
	 * we know the block is not shared.
	 */
	if (root->ref_cows &&
	    buf != root->node && buf != root->commit_root &&
	    (btrfs_header_generation(buf) <=
	     btrfs_root_last_snapshot(&root->root_item) ||
	     btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)))
		return 1;
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
	if (root->ref_cows &&
	    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
		return 1;
#endif
	return 0;
}
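/*
 * Illustrative worked example (not from the original file): in a reference
 * counted tree whose last snapshot was taken at generation 100, a non-root
 * block with generation 90 may still be shared with that snapshot, so the
 * check above returns 1; a block with generation 120 that was not allocated
 * by relocation cannot be shared and the check returns 0.
 */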
static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
				       struct btrfs_root *root,
				       struct extent_buffer *buf,
				       struct extent_buffer *cow,
				       int *last_ref)
{
	u64 refs;
	u64 owner;
	u64 flags;
	u64 new_flags = 0;
	int ret;

	/*
	 * Backrefs update rules:
	 *
	 * Always use full backrefs for extent pointers in tree blocks
	 * allocated by tree relocation.
	 *
	 * If a shared tree block is no longer referenced by its owner
	 * tree (btrfs_header_owner(buf) == root->root_key.objectid),
	 * use full backrefs for extent pointers in tree block.
	 *
	 * If a tree block is being relocated
	 * (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID),
	 * use full backrefs for extent pointers in tree block.
	 * The reason for this is some operations (such as drop tree)
	 * are only allowed for blocks that use full backrefs.
	 */

	if (btrfs_block_can_be_shared(root, buf)) {
		ret = btrfs_lookup_extent_info(trans, root, buf->start,
					       buf->len, &refs, &flags);
		if (ret)
			return ret;
		if (refs == 0) {
			ret = -EROFS;
			btrfs_std_error(root->fs_info, ret);
			return ret;
		}
	} else {
		refs = 1;
		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
		    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
			flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
		else
			flags = 0;
	}

	owner = btrfs_header_owner(buf);
	BUG_ON(owner == BTRFS_TREE_RELOC_OBJECTID &&
	       !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));

	if (refs > 1) {
		if ((owner == root->root_key.objectid ||
		     root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) &&
		    !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
			ret = btrfs_inc_ref(trans, root, buf, 1, 1);
			BUG_ON(ret); /* -ENOMEM */

			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID) {
				ret = btrfs_dec_ref(trans, root, buf, 0, 1);
				BUG_ON(ret); /* -ENOMEM */
				ret = btrfs_inc_ref(trans, root, cow, 1, 1);
				BUG_ON(ret); /* -ENOMEM */
			}
			new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
		} else {

			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID)
				ret = btrfs_inc_ref(trans, root, cow, 1, 1);
			else
				ret = btrfs_inc_ref(trans, root, cow, 0, 1);
			BUG_ON(ret); /* -ENOMEM */
		}
		if (new_flags != 0) {
			ret = btrfs_set_disk_extent_flags(trans, root,
							  buf->start,
							  buf->len,
							  new_flags, 0);
			if (ret)
				return ret;
		}
	} else {
		if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID)
				ret = btrfs_inc_ref(trans, root, cow, 1, 1);
			else
				ret = btrfs_inc_ref(trans, root, cow, 0, 1);
			BUG_ON(ret); /* -ENOMEM */
			ret = btrfs_dec_ref(trans, root, buf, 1, 1);
			BUG_ON(ret); /* -ENOMEM */
		}
		clean_tree_block(trans, root, buf);
		*last_ref = 1;
	}
	return 0;
}

/*
 * does the dirty work in cow of a single block.  The parent block (if
 * supplied) is updated to point to the new cow copy.  The new buffer is marked
 * dirty and returned locked.  If you modify the block it needs to be marked
 * dirty again.
 *
 * search_start -- an allocation hint for the new block
 *
 * empty_size -- a hint that you plan on doing more cow.  This is the size in
 * bytes the allocator should try to find free next to the block it returns.
 * This is just a hint and may be ignored by the allocator.
 */
static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root,
			     struct extent_buffer *buf,
			     struct extent_buffer *parent, int parent_slot,
			     struct extent_buffer **cow_ret,
			     u64 search_start, u64 empty_size)
{
	struct btrfs_disk_key disk_key;
	struct extent_buffer *cow;
	int level, ret;
	int last_ref = 0;
	int unlock_orig = 0;
	u64 parent_start;

	if (*cow_ret == buf)
		unlock_orig = 1;

	btrfs_assert_tree_locked(buf);

	WARN_ON(root->ref_cows && trans->transid !=
		root->fs_info->running_transaction->transid);
	WARN_ON(root->ref_cows && trans->transid != root->last_trans);

	level = btrfs_header_level(buf);

	if (level == 0)
		btrfs_item_key(buf, &disk_key, 0);
	else
		btrfs_node_key(buf, &disk_key, 0);

	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
		if (parent)
			parent_start = parent->start;
		else
			parent_start = 0;
	} else
		parent_start = 0;

	cow = btrfs_alloc_free_block(trans, root, buf->len, parent_start,
				     root->root_key.objectid, &disk_key,
				     level, search_start, empty_size);
	if (IS_ERR(cow))
		return PTR_ERR(cow);

	/* cow is set to blocking by btrfs_init_new_buffer */

	copy_extent_buffer(cow, buf, 0, 0, cow->len);
	btrfs_set_header_bytenr(cow, cow->start);
	btrfs_set_header_generation(cow, trans->transid);
	btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
				     BTRFS_HEADER_FLAG_RELOC);
	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
		btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
	else
		btrfs_set_header_owner(cow, root->root_key.objectid);

	write_extent_buffer(cow, root->fs_info->fsid,
			    (unsigned long)btrfs_header_fsid(cow),
			    BTRFS_FSID_SIZE);

	ret = update_ref_for_cow(trans, root, buf, cow, &last_ref);
	if (ret) {
		btrfs_abort_transaction(trans, root, ret);
		return ret;
	}

	if (root->ref_cows)
		btrfs_reloc_cow_block(trans, root, buf, cow);

	if (buf == root->node) {
		WARN_ON(parent && parent != buf);
		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
		    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
			parent_start = buf->start;
		else
			parent_start = 0;

		extent_buffer_get(cow);
		tree_mod_log_set_root_pointer(root, cow);
		rcu_assign_pointer(root->node, cow);

		btrfs_free_tree_block(trans, root, buf, parent_start,
				      last_ref);
		free_extent_buffer(buf);
		add_root_to_dirty_list(root);
	} else {
		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
			parent_start = parent->start;
		else
			parent_start = 0;

		WARN_ON(trans->transid != btrfs_header_generation(parent));
		tree_mod_log_insert_key(root->fs_info, parent, parent_slot,
					MOD_LOG_KEY_REPLACE);
		btrfs_set_node_blockptr(parent, parent_slot,
					cow->start);
		btrfs_set_node_ptr_generation(parent, parent_slot,
					      trans->transid);
		btrfs_mark_buffer_dirty(parent);
		tree_mod_log_free_eb(root->fs_info, buf);
		btrfs_free_tree_block(trans, root, buf, parent_start,
				      last_ref);
	}
	if (unlock_orig)
		btrfs_tree_unlock(buf);
	free_extent_buffer_stale(buf);
	btrfs_mark_buffer_dirty(cow);
	*cow_ret = cow;
	return 0;
}
/*
 * returns the logical address of the oldest predecessor of the given root.
 * entries older than time_seq are ignored.
 */
static struct tree_mod_elem *
__tree_mod_log_oldest_root(struct btrfs_fs_info *fs_info,
			   struct btrfs_root *root, u64 time_seq)
{
	struct tree_mod_elem *tm;
	struct tree_mod_elem *found = NULL;
	u64 root_logical = root->node->start;
	int looped = 0;

	if (!time_seq)
		return 0;

	/*
	 * the very last operation that's logged for a root is the replacement
	 * operation (if it is replaced at all).  this has the index of the
	 * *new* root, making it the very first operation that's logged for
	 * this root.
	 */
	while (1) {
		tm = tree_mod_log_search_oldest(fs_info, root_logical,
						time_seq);
		if (!looped && !tm)
			return 0;
		/*
		 * if there are no tree operations for the oldest root, we
		 * simply return it.  this should only happen if that (old)
		 * root is at level 0.
		 */
		if (!tm)
			break;

		/*
		 * if there's an operation that's not a root replacement, we
		 * found the oldest version of our root.  normally, we'll find a
		 * MOD_LOG_KEY_REMOVE_WHILE_FREEING operation here.
		 */
		if (tm->op != MOD_LOG_ROOT_REPLACE)
			break;

		found = tm;
		root_logical = tm->old_root.logical;
		BUG_ON(root_logical == root->node->start);
		looped = 1;
	}

	/* if there's no old root to return, return what we found instead */
	if (!found)
		found = tm;

	return found;
}

/*
 * tm is a pointer to the first operation to rewind within eb.  then, all
 * previous operations will be rewound (until we reach something older than
 * time_seq).
 */
static void
__tree_mod_log_rewind(struct extent_buffer *eb, u64 time_seq,
		      struct tree_mod_elem *first_tm)
{
	u32 n;
	struct rb_node *next;
	struct tree_mod_elem *tm = first_tm;
	unsigned long o_dst;
	unsigned long o_src;
	unsigned long p_size = sizeof(struct btrfs_key_ptr);

	n = btrfs_header_nritems(eb);
	while (tm && tm->seq >= time_seq) {
		/*
		 * all the operations are recorded with the operator used for
		 * the modification.  as we're going backwards, we do the
		 * opposite of each operation here.
		 */
		switch (tm->op) {
		case MOD_LOG_KEY_REMOVE_WHILE_FREEING:
			BUG_ON(tm->slot < n);
			/* Fallthrough */
		case MOD_LOG_KEY_REMOVE_WHILE_MOVING:
		case MOD_LOG_KEY_REMOVE:
			btrfs_set_node_key(eb, &tm->key, tm->slot);
			btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
			btrfs_set_node_ptr_generation(eb, tm->slot,
						      tm->generation);
			n++;
			break;
		case MOD_LOG_KEY_REPLACE:
			BUG_ON(tm->slot >= n);
			btrfs_set_node_key(eb, &tm->key, tm->slot);
			btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
			btrfs_set_node_ptr_generation(eb, tm->slot,
						      tm->generation);
			break;
		case MOD_LOG_KEY_ADD:
			/* if a move operation is needed it's in the log */
			n--;
			break;
		case MOD_LOG_MOVE_KEYS:
			o_dst = btrfs_node_key_ptr_offset(tm->slot);
			o_src = btrfs_node_key_ptr_offset(tm->move.dst_slot);
			memmove_extent_buffer(eb, o_dst, o_src,
					      tm->move.nr_items * p_size);
			break;
		case MOD_LOG_ROOT_REPLACE:
			/*
			 * this operation is special.  for roots, this must be
			 * handled explicitly before rewinding.
			 * for non-roots, this operation may exist if the node
			 * was a root: root A -> child B; then A gets empty and
			 * B is promoted to the new root.  in the mod log, we'll
			 * have a root-replace operation for B, a tree block
			 * that is not a root.  we simply ignore that operation.
			 */
			break;
		}
		next = rb_next(&tm->node);
		if (!next)
			break;
		tm = container_of(next, struct tree_mod_elem, node);
		if (tm->index != first_tm->index)
			break;
	}
	btrfs_set_header_nritems(eb, n);
}

static struct extent_buffer *
tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
		    u64 time_seq)
{
	struct extent_buffer *eb_rewin;
	struct tree_mod_elem *tm;

	if (!time_seq)
		return eb;

	if (btrfs_header_level(eb) == 0)
		return eb;

	tm = tree_mod_log_search(fs_info, eb->start, time_seq);
	if (!tm)
		return eb;

	if (tm->op == MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
		BUG_ON(tm->slot != 0);
		eb_rewin = alloc_dummy_extent_buffer(eb->start,
						fs_info->tree_root->nodesize);
		BUG_ON(!eb_rewin);
		btrfs_set_header_bytenr(eb_rewin, eb->start);
		btrfs_set_header_backref_rev(eb_rewin,
					     btrfs_header_backref_rev(eb));
		btrfs_set_header_owner(eb_rewin, btrfs_header_owner(eb));
		btrfs_set_header_level(eb_rewin, btrfs_header_level(eb));
	} else {
		eb_rewin = btrfs_clone_extent_buffer(eb);
		BUG_ON(!eb_rewin);
	}

	extent_buffer_get(eb_rewin);
	free_extent_buffer(eb);

	__tree_mod_log_rewind(eb_rewin, time_seq, tm);
	WARN_ON(btrfs_header_nritems(eb_rewin) >
		BTRFS_NODEPTRS_PER_BLOCK(fs_info->tree_root));

	return eb_rewin;
}
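/*
 * Illustrative worked example (not from the original file): the rewind loop
 * above walks the log from the newest matching record towards older ones and
 * applies the inverse of each.  If the most recent record is a
 * MOD_LOG_KEY_ADD and an older record is a MOD_LOG_KEY_REMOVE, the rewind
 * first undoes the add by decrementing nritems, then undoes the remove by
 * restoring the saved key, blockptr and generation (and incrementing nritems).
 */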
/*
 * get_old_root() rewinds the state of @root's root node to the given @time_seq
 * value.  If there are no changes, the current root->root_node is returned.  If
 * anything changed in between, there's a fresh buffer allocated on which the
 * rewind operations are done.  In any case, the returned buffer is read locked.
 * Returns NULL on error (with no locks held).
 */
static inline struct extent_buffer *
get_old_root(struct btrfs_root *root, u64 time_seq)
{
	struct tree_mod_elem *tm;
	struct extent_buffer *eb;
	struct extent_buffer *old;
	struct tree_mod_root *old_root = NULL;
	u64 old_generation = 0;
	u64 logical;
	u32 blocksize;

	eb = btrfs_read_lock_root_node(root);
	tm = __tree_mod_log_oldest_root(root->fs_info, root, time_seq);
	if (!tm)
		return root->node;

	if (tm->op == MOD_LOG_ROOT_REPLACE) {
		old_root = &tm->old_root;
		old_generation = tm->generation;
		logical = old_root->logical;
	} else {
		logical = root->node->start;
	}

	tm = tree_mod_log_search(root->fs_info, logical, time_seq);
	if (old_root && tm && tm->op != MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
		btrfs_tree_read_unlock(root->node);
		free_extent_buffer(root->node);
		blocksize = btrfs_level_size(root, old_root->level);
		old = read_tree_block(root, logical, blocksize, 0);
		if (!old) {
			pr_warn("btrfs: failed to read tree block %llu from get_old_root\n",
				logical);
			WARN_ON(1);
		} else {
			eb = btrfs_clone_extent_buffer(old);
			free_extent_buffer(old);
		}
	} else if (old_root) {
		btrfs_tree_read_unlock(root->node);
		free_extent_buffer(root->node);
		eb = alloc_dummy_extent_buffer(logical, root->nodesize);
	} else {
		eb = btrfs_clone_extent_buffer(root->node);
		btrfs_tree_read_unlock(root->node);
		free_extent_buffer(root->node);
	}

	if (!eb)
		return NULL;
	extent_buffer_get(eb);
	btrfs_tree_read_lock(eb);
	if (old_root) {
		btrfs_set_header_bytenr(eb, eb->start);
		btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV);
		btrfs_set_header_owner(eb, root->root_key.objectid);
		btrfs_set_header_level(eb, old_root->level);
		btrfs_set_header_generation(eb, old_generation);
	}
	if (tm)
		__tree_mod_log_rewind(eb, time_seq, tm);
	else
		WARN_ON(btrfs_header_level(eb) != 0);
	WARN_ON(btrfs_header_nritems(eb) > BTRFS_NODEPTRS_PER_BLOCK(root));

	return eb;
}

int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq)
{
	struct tree_mod_elem *tm;
	int level;

	tm = __tree_mod_log_oldest_root(root->fs_info, root, time_seq);
	if (tm && tm->op == MOD_LOG_ROOT_REPLACE) {
		level = tm->old_root.level;
	} else {
		rcu_read_lock();
		level = btrfs_header_level(root->node);
		rcu_read_unlock();
	}

	return level;
}
static inline int should_cow_block(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root,
				   struct extent_buffer *buf)
{
	/* ensure we can see the force_cow */
	smp_rmb();

	/*
	 * We do not need to cow a block if
	 * 1) this block is not created or changed in this transaction;
	 * 2) this block does not belong to TREE_RELOC tree;
	 * 3) the root is not forced COW.
	 *
	 * What is forced COW:
	 *    when we create snapshot during committing the transaction,
	 *    after we've finished copying the src root, we must COW the shared
	 *    block to ensure the metadata consistency.
	 */
	if (btrfs_header_generation(buf) == trans->transid &&
	    !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) &&
	    !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
	      btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) &&
	    !root->force_cow)
		return 0;
	return 1;
}

/*
 * cows a single block, see __btrfs_cow_block for the real work.
 * This version of it has extra checks so that a block isn't cow'd more than
 * once per transaction, as long as it hasn't been written yet
 */
noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
		    struct btrfs_root *root, struct extent_buffer *buf,
		    struct extent_buffer *parent, int parent_slot,
		    struct extent_buffer **cow_ret)
{
	u64 search_start;
	int ret;

	if (trans->transaction != root->fs_info->running_transaction)
		WARN(1, KERN_CRIT "trans %llu running %llu\n",
		     (unsigned long long)trans->transid,
		     (unsigned long long)
		     root->fs_info->running_transaction->transid);

	if (trans->transid != root->fs_info->generation)
		WARN(1, KERN_CRIT "trans %llu running %llu\n",
		     (unsigned long long)trans->transid,
		     (unsigned long long)root->fs_info->generation);

	if (!should_cow_block(trans, root, buf)) {
		*cow_ret = buf;
		return 0;
	}

	search_start = buf->start & ~((u64)(1024 * 1024 * 1024) - 1);

	if (parent)
		btrfs_set_lock_blocking(parent);
	btrfs_set_lock_blocking(buf);

	ret = __btrfs_cow_block(trans, root, buf, parent,
				 parent_slot, cow_ret, search_start, 0);

	trace_btrfs_cow_block(root, buf, *cow_ret);

	return ret;
}

/*
 * helper function for defrag to decide if two blocks pointed to by a
 * node are actually close by
 */
static int close_blocks(u64 blocknr, u64 other, u32 blocksize)
{
	if (blocknr < other && other - (blocknr + blocksize) < 32768)
		return 1;
	if (blocknr > other && blocknr - (other + blocksize) < 32768)
		return 1;
	return 0;
}

/*
 * compare two keys in a memcmp fashion
 */
static int comp_keys(struct btrfs_disk_key *disk, struct btrfs_key *k2)
{
	struct btrfs_key k1;

	btrfs_disk_key_to_cpu(&k1, disk);

	return btrfs_comp_cpu_keys(&k1, k2);
}

/*
 * same as comp_keys only with two btrfs_key's
 */
int btrfs_comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2)
{
	if (k1->objectid > k2->objectid)
		return 1;
	if (k1->objectid < k2->objectid)
		return -1;
	if (k1->type > k2->type)
		return 1;
	if (k1->type < k2->type)
		return -1;
	if (k1->offset > k2->offset)
		return 1;
	if (k1->offset < k2->offset)
		return -1;
	return 0;
}
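/*
 * Illustrative worked example (not from the original file): keys compare as
 * the tuple (objectid, type, offset).  So (256, BTRFS_INODE_ITEM_KEY, 0)
 * sorts before (256, BTRFS_INODE_REF_KEY, 0) because the types differ, and
 * only when objectid and type are equal does the offset decide the order.
 */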
/*
 * this is used by the defrag code to go through all the
 * leaves pointed to by a node and reallocate them so that
 * disk order is close to key order
 */
int btrfs_realloc_node(struct btrfs_trans_handle *trans,
		       struct btrfs_root *root, struct extent_buffer *parent,
		       int start_slot, u64 *last_ret,
		       struct btrfs_key *progress)
{
	struct extent_buffer *cur;
	u64 blocknr;
	u64 gen;
	u64 search_start = *last_ret;
	u64 last_block = 0;
	u64 other;
	u32 parent_nritems;
	int end_slot;
	int i;
	int err = 0;
	int parent_level;
	int uptodate;
	u32 blocksize;
	int progress_passed = 0;
	struct btrfs_disk_key disk_key;

	parent_level = btrfs_header_level(parent);

	WARN_ON(trans->transaction != root->fs_info->running_transaction);
	WARN_ON(trans->transid != root->fs_info->generation);

	parent_nritems = btrfs_header_nritems(parent);
	blocksize = btrfs_level_size(root, parent_level - 1);
	end_slot = parent_nritems;

	if (parent_nritems == 1)
		return 0;

	btrfs_set_lock_blocking(parent);

	for (i = start_slot; i < end_slot; i++) {
		int close = 1;

		btrfs_node_key(parent, &disk_key, i);
		if (!progress_passed && comp_keys(&disk_key, progress) < 0)
			continue;

		progress_passed = 1;
		blocknr = btrfs_node_blockptr(parent, i);
		gen = btrfs_node_ptr_generation(parent, i);
		if (last_block == 0)
			last_block = blocknr;

		if (i > 0) {
			other = btrfs_node_blockptr(parent, i - 1);
			close = close_blocks(blocknr, other, blocksize);
		}
		if (!close && i < end_slot - 2) {
			other = btrfs_node_blockptr(parent, i + 1);
			close = close_blocks(blocknr, other, blocksize);
		}
		if (close) {
			last_block = blocknr;
			continue;
		}

		cur = btrfs_find_tree_block(root, blocknr, blocksize);
		if (cur)
			uptodate = btrfs_buffer_uptodate(cur, gen, 0);
		else
			uptodate = 0;
		if (!cur || !uptodate) {
			if (!cur) {
				cur = read_tree_block(root, blocknr,
							 blocksize, gen);
				if (!cur)
					return -EIO;
			} else if (!uptodate) {
				err = btrfs_read_buffer(cur, gen);
				if (err) {
					free_extent_buffer(cur);
					return err;
				}
			}
		}
		if (search_start == 0)
			search_start = last_block;

		btrfs_tree_lock(cur);
		btrfs_set_lock_blocking(cur);
		err = __btrfs_cow_block(trans, root, cur, parent, i,
					&cur, search_start,
					min(16 * blocksize,
					    (end_slot - i) * blocksize));
		if (err) {
			btrfs_tree_unlock(cur);
			free_extent_buffer(cur);
			break;
		}
		search_start = cur->start;
		last_block = cur->start;
		*last_ret = search_start;
		btrfs_tree_unlock(cur);
		free_extent_buffer(cur);
	}
	return err;
}

/*
 * The leaf data grows from end-to-front in the node.
 * this returns the address of the start of the last item,
 * which is the stop of the leaf data stack
 */
static inline unsigned int leaf_data_end(struct btrfs_root *root,
					 struct extent_buffer *leaf)
{
	u32 nr = btrfs_header_nritems(leaf);
	if (nr == 0)
		return BTRFS_LEAF_DATA_SIZE(root);
	return btrfs_item_offset_nr(leaf, nr - 1);
}
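/*
 * Illustrative worked example (not from the original file): item headers grow
 * from the front of the leaf while item data grows down from the end.  If a
 * leaf holds two items whose data takes 100 and 50 bytes, leaf_data_end()
 * returns the offset of the second item's data, BTRFS_LEAF_DATA_SIZE(root)
 * minus 150; with no items at all it returns BTRFS_LEAF_DATA_SIZE(root).
 */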
/*
 * search for key in the extent_buffer.  The items start at offset p,
 * and they are item_size apart.  There are 'max' items in p.
 *
 * the slot in the array is returned via slot, and it points to
 * the place where you would insert key if it is not found in
 * the array.
 *
 * slot may point to max if the key is bigger than all of the keys
 */
static noinline int generic_bin_search(struct extent_buffer *eb,
				       unsigned long p,
				       int item_size, struct btrfs_key *key,
				       int max, int *slot)
{
	int low = 0;
	int high = max;
	int mid;
	int ret;
	struct btrfs_disk_key *tmp = NULL;
	struct btrfs_disk_key unaligned;
	unsigned long offset;
	char *kaddr = NULL;
	unsigned long map_start = 0;
	unsigned long map_len = 0;
	int err;

	while (low < high) {
		mid = (low + high) / 2;
		offset = p + mid * item_size;

		if (!kaddr || offset < map_start ||
		    (offset + sizeof(struct btrfs_disk_key)) >
		    map_start + map_len) {

			err = map_private_extent_buffer(eb, offset,
						sizeof(struct btrfs_disk_key),
						&kaddr, &map_start, &map_len);

			if (!err) {
				tmp = (struct btrfs_disk_key *)(kaddr + offset -
							map_start);
			} else {
				read_extent_buffer(eb, &unaligned,
						   offset, sizeof(unaligned));
				tmp = &unaligned;
			}

		} else {
			tmp = (struct btrfs_disk_key *)(kaddr + offset -
							map_start);
		}
		ret = comp_keys(tmp, key);

		if (ret < 0)
			low = mid + 1;
		else if (ret > 0)
			high = mid;
		else {
			*slot = mid;
			return 0;
		}
	}
	*slot = low;
	return 1;
}

/*
 * simple bin_search frontend that does the right thing for
 * leaves vs nodes
 */
static int bin_search(struct extent_buffer *eb, struct btrfs_key *key,
		      int level, int *slot)
{
	if (level == 0)
		return generic_bin_search(eb,
					  offsetof(struct btrfs_leaf, items),
					  sizeof(struct btrfs_item),
					  key, btrfs_header_nritems(eb),
					  slot);
	else
		return generic_bin_search(eb,
					  offsetof(struct btrfs_node, ptrs),
					  sizeof(struct btrfs_key_ptr),
					  key, btrfs_header_nritems(eb),
					  slot);
}

int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
		     int level, int *slot)
{
	return bin_search(eb, key, level, slot);
}

static void root_add_used(struct btrfs_root *root, u32 size)
{
	spin_lock(&root->accounting_lock);
	btrfs_set_root_used(&root->root_item,
			    btrfs_root_used(&root->root_item) + size);
	spin_unlock(&root->accounting_lock);
}

static void root_sub_used(struct btrfs_root *root, u32 size)
{
	spin_lock(&root->accounting_lock);
	btrfs_set_root_used(&root->root_item,
			    btrfs_root_used(&root->root_item) - size);
	spin_unlock(&root->accounting_lock);
}

/* given a node and slot number, this reads the block it points to.  The
 * extent buffer is returned with a reference taken (but unlocked).
 * NULL is returned on error.
 */
static noinline struct extent_buffer *read_node_slot(struct btrfs_root *root,
				   struct extent_buffer *parent, int slot)
{
	int level = btrfs_header_level(parent);
	if (slot < 0)
		return NULL;
	if (slot >= btrfs_header_nritems(parent))
		return NULL;

	BUG_ON(level == 0);

	return read_tree_block(root, btrfs_node_blockptr(parent, slot),
		       btrfs_level_size(root, level - 1),
		       btrfs_node_ptr_generation(parent, slot));
}
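/*
 * Illustrative worked example (not from the original file): the return
 * convention of generic_bin_search()/bin_search() above.  Searching a node
 * whose keys are {10, 20, 30}: looking up 20 returns 0 and sets *slot to 1;
 * looking up 25 returns 1 and sets *slot to 2, the position where 25 would
 * be inserted; looking up 40 returns 1 with *slot == 3 (i.e. max).
 */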
/*
 * node level balancing, used to make sure nodes are in proper order for
 * item deletion.  We balance from the top down, so we have to make sure
 * that a deletion won't leave a node completely empty later on.
 */
static noinline int balance_level(struct btrfs_trans_handle *trans,
			 struct btrfs_root *root,
			 struct btrfs_path *path, int level)
{
	struct extent_buffer *right = NULL;
	struct extent_buffer *mid;
	struct extent_buffer *left = NULL;
	struct extent_buffer *parent = NULL;
	int ret = 0;
	int wret;
	int pslot;
	int orig_slot = path->slots[level];
	u64 orig_ptr;

	if (level == 0)
		return 0;

	mid = path->nodes[level];

	WARN_ON(path->locks[level] != BTRFS_WRITE_LOCK &&
		path->locks[level] != BTRFS_WRITE_LOCK_BLOCKING);
	WARN_ON(btrfs_header_generation(mid) != trans->transid);

	orig_ptr = btrfs_node_blockptr(mid, orig_slot);

	if (level < BTRFS_MAX_LEVEL - 1) {
		parent = path->nodes[level + 1];
		pslot = path->slots[level + 1];
	}

	/*
	 * deal with the case where there is only one pointer in the root
	 * by promoting the node below to a root
	 */
	if (!parent) {
		struct extent_buffer *child;

		if (btrfs_header_nritems(mid) != 1)
			return 0;

		/* promote the child to a root */
		child = read_node_slot(root, mid, 0);
		if (!child) {
			ret = -EROFS;
			btrfs_std_error(root->fs_info, ret);
			goto enospc;
		}

		btrfs_tree_lock(child);
		btrfs_set_lock_blocking(child);
		ret = btrfs_cow_block(trans, root, child, mid, 0, &child);
		if (ret) {
			btrfs_tree_unlock(child);
			free_extent_buffer(child);
			goto enospc;
		}

		tree_mod_log_set_root_pointer(root, child);
		rcu_assign_pointer(root->node, child);

		add_root_to_dirty_list(root);
		btrfs_tree_unlock(child);

		path->locks[level] = 0;
		path->nodes[level] = NULL;
		clean_tree_block(trans, root, mid);
		btrfs_tree_unlock(mid);
		/* once for the path */
		free_extent_buffer(mid);

		root_sub_used(root, mid->len);
		btrfs_free_tree_block(trans, root, mid, 0, 1);
		/* once for the root ptr */
		free_extent_buffer_stale(mid);
		return 0;
	}
	if (btrfs_header_nritems(mid) >
	    BTRFS_NODEPTRS_PER_BLOCK(root) / 4)
		return 0;

	left = read_node_slot(root, parent, pslot - 1);
	if (left) {
		btrfs_tree_lock(left);
		btrfs_set_lock_blocking(left);
		wret = btrfs_cow_block(trans, root, left,
				       parent, pslot - 1, &left);
		if (wret) {
			ret = wret;
			goto enospc;
		}
	}
	right = read_node_slot(root, parent, pslot + 1);
	if (right) {
		btrfs_tree_lock(right);
		btrfs_set_lock_blocking(right);
		wret = btrfs_cow_block(trans, root, right,
				       parent, pslot + 1, &right);
		if (wret) {
			ret = wret;
			goto enospc;
		}
	}

	/* first, try to make some room in the middle buffer */
	if (left) {
		orig_slot += btrfs_header_nritems(left);
		wret = push_node_left(trans, root, left, mid, 1);
		if (wret < 0)
			ret = wret;
	}

	/*
	 * then try to empty the right most buffer into the middle
	 */
	if (right) {
		wret = push_node_left(trans, root, mid, right, 1);
		if (wret < 0 && wret != -ENOSPC)
			ret = wret;
		if (btrfs_header_nritems(right) == 0) {
			clean_tree_block(trans, root, right);
			btrfs_tree_unlock(right);
			del_ptr(trans, root, path, level + 1, pslot + 1);
			root_sub_used(root, right->len);
			btrfs_free_tree_block(trans, root, right, 0, 1);
			free_extent_buffer_stale(right);
			right = NULL;
		} else {
			struct btrfs_disk_key right_key;
			btrfs_node_key(right, &right_key, 0);
			tree_mod_log_set_node_key(root->fs_info, parent,
						  pslot + 1, 0);
			btrfs_set_node_key(parent, &right_key, pslot + 1);
			btrfs_mark_buffer_dirty(parent);
		}
	}
	if (btrfs_header_nritems(mid) == 1) {
		/*
		 * we're not allowed to leave a node with one item in the
		 * tree during a delete.  A deletion from lower in the tree
		 * could try to delete the only pointer in this node.
		 * So, pull some keys from the left.
		 * There has to be a left pointer at this point because
		 * otherwise we would have pulled some pointers from the
		 * right
		 */
		if (!left) {
			ret = -EROFS;
			btrfs_std_error(root->fs_info, ret);
			goto enospc;
		}
		wret = balance_node_right(trans, root, mid, left);
		if (wret < 0) {
			ret = wret;
			goto enospc;
		}
		if (wret == 1) {
			wret = push_node_left(trans, root, left, mid, 1);
			if (wret < 0)
				ret = wret;
		}
		BUG_ON(wret == 1);
	}
	if (btrfs_header_nritems(mid) == 0) {
		clean_tree_block(trans, root, mid);
		btrfs_tree_unlock(mid);
		del_ptr(trans, root, path, level + 1, pslot);
		root_sub_used(root, mid->len);
		btrfs_free_tree_block(trans, root, mid, 0, 1);
		free_extent_buffer_stale(mid);
		mid = NULL;
	} else {
		/* update the parent key to reflect our changes */
		struct btrfs_disk_key mid_key;
		btrfs_node_key(mid, &mid_key, 0);
		tree_mod_log_set_node_key(root->fs_info, parent,
					  pslot, 0);
		btrfs_set_node_key(parent, &mid_key, pslot);
		btrfs_mark_buffer_dirty(parent);
	}

	/* update the path */
	if (left) {
		if (btrfs_header_nritems(left) > orig_slot) {
			extent_buffer_get(left);
			/* left was locked after cow */
			path->nodes[level] = left;
			path->slots[level + 1] -= 1;
			path->slots[level] = orig_slot;
			if (mid) {
				btrfs_tree_unlock(mid);
				free_extent_buffer(mid);
			}
		} else {
			orig_slot -= btrfs_header_nritems(left);
			path->slots[level] = orig_slot;
		}
	}
	/* double check we haven't messed things up */
	if (orig_ptr !=
	    btrfs_node_blockptr(path->nodes[level], path->slots[level]))
		BUG();
enospc:
	if (right) {
		btrfs_tree_unlock(right);
		free_extent_buffer(right);
	}
	if (left) {
		if (path->nodes[level] != left)
			btrfs_tree_unlock(left);
		free_extent_buffer(left);
	}
	return ret;
}
/* Node balancing for insertion.  Here we only split or push nodes around
 * when they are completely full.  This is also done top down, so we
 * have to be pessimistic.
 */
static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
					  struct btrfs_root *root,
					  struct btrfs_path *path, int level)
{
	struct extent_buffer *right = NULL;
	struct extent_buffer *mid;
	struct extent_buffer *left = NULL;
	struct extent_buffer *parent = NULL;
	int ret = 0;
	int wret;
	int pslot;
	int orig_slot = path->slots[level];

	if (level == 0)
		return 1;

	mid = path->nodes[level];
	WARN_ON(btrfs_header_generation(mid) != trans->transid);

	if (level < BTRFS_MAX_LEVEL - 1) {
		parent = path->nodes[level + 1];
		pslot = path->slots[level + 1];
	}

	if (!parent)
		return 1;

	left = read_node_slot(root, parent, pslot - 1);

	/* first, try to make some room in the middle buffer */
	if (left) {
		u32 left_nr;

		btrfs_tree_lock(left);
		btrfs_set_lock_blocking(left);

		left_nr = btrfs_header_nritems(left);
		if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
			wret = 1;
		} else {
			ret = btrfs_cow_block(trans, root, left, parent,
					      pslot - 1, &left);
			if (ret)
				wret = 1;
			else {
				wret = push_node_left(trans, root,
						      left, mid, 0);
			}
		}
		if (wret < 0)
			ret = wret;
		if (wret == 0) {
			struct btrfs_disk_key disk_key;
			orig_slot += left_nr;
			btrfs_node_key(mid, &disk_key, 0);
			tree_mod_log_set_node_key(root->fs_info, parent,
						  pslot, 0);
			btrfs_set_node_key(parent, &disk_key, pslot);
			btrfs_mark_buffer_dirty(parent);
			if (btrfs_header_nritems(left) > orig_slot) {
				path->nodes[level] = left;
				path->slots[level + 1] -= 1;
				path->slots[level] = orig_slot;
				btrfs_tree_unlock(mid);
				free_extent_buffer(mid);
			} else {
				orig_slot -=
					btrfs_header_nritems(left);
				path->slots[level] = orig_slot;
				btrfs_tree_unlock(left);
				free_extent_buffer(left);
			}
			return 0;
		}
		btrfs_tree_unlock(left);
		free_extent_buffer(left);
	}
	right = read_node_slot(root, parent, pslot + 1);

	/*
	 * then try to empty the right most buffer into the middle
	 */
	if (right) {
		u32 right_nr;

		btrfs_tree_lock(right);
		btrfs_set_lock_blocking(right);

		right_nr = btrfs_header_nritems(right);
		if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
			wret = 1;
		} else {
			ret = btrfs_cow_block(trans, root, right,
					      parent, pslot + 1,
					      &right);
			if (ret)
				wret = 1;
			else {
				wret = balance_node_right(trans, root,
							  right, mid);
			}
		}
		if (wret < 0)
			ret = wret;
		if (wret == 0) {
			struct btrfs_disk_key disk_key;

			btrfs_node_key(right, &disk_key, 0);
			tree_mod_log_set_node_key(root->fs_info, parent,
						  pslot + 1, 0);
			btrfs_set_node_key(parent, &disk_key, pslot + 1);
			btrfs_mark_buffer_dirty(parent);

			if (btrfs_header_nritems(mid) <= orig_slot) {
				path->nodes[level] = right;
				path->slots[level + 1] += 1;
				path->slots[level] = orig_slot -
					btrfs_header_nritems(mid);
				btrfs_tree_unlock(mid);
				free_extent_buffer(mid);
			} else {
				btrfs_tree_unlock(right);
				free_extent_buffer(right);
			}
			return 0;
		}
		btrfs_tree_unlock(right);
		free_extent_buffer(right);
	}
	return 1;
}
2053 */ 2054 static void reada_for_search(struct btrfs_root *root, 2055 struct btrfs_path *path, 2056 int level, int slot, u64 objectid) 2057 { 2058 struct extent_buffer *node; 2059 struct btrfs_disk_key disk_key; 2060 u32 nritems; 2061 u64 search; 2062 u64 target; 2063 u64 nread = 0; 2064 u64 gen; 2065 int direction = path->reada; 2066 struct extent_buffer *eb; 2067 u32 nr; 2068 u32 blocksize; 2069 u32 nscan = 0; 2070 2071 if (level != 1) 2072 return; 2073 2074 if (!path->nodes[level]) 2075 return; 2076 2077 node = path->nodes[level]; 2078 2079 search = btrfs_node_blockptr(node, slot); 2080 blocksize = btrfs_level_size(root, level - 1); 2081 eb = btrfs_find_tree_block(root, search, blocksize); 2082 if (eb) { 2083 free_extent_buffer(eb); 2084 return; 2085 } 2086 2087 target = search; 2088 2089 nritems = btrfs_header_nritems(node); 2090 nr = slot; 2091 2092 while (1) { 2093 if (direction < 0) { 2094 if (nr == 0) 2095 break; 2096 nr--; 2097 } else if (direction > 0) { 2098 nr++; 2099 if (nr >= nritems) 2100 break; 2101 } 2102 if (path->reada < 0 && objectid) { 2103 btrfs_node_key(node, &disk_key, nr); 2104 if (btrfs_disk_key_objectid(&disk_key) != objectid) 2105 break; 2106 } 2107 search = btrfs_node_blockptr(node, nr); 2108 if ((search <= target && target - search <= 65536) || 2109 (search > target && search - target <= 65536)) { 2110 gen = btrfs_node_ptr_generation(node, nr); 2111 readahead_tree_block(root, search, blocksize, gen); 2112 nread += blocksize; 2113 } 2114 nscan++; 2115 if ((nread > 65536 || nscan > 32)) 2116 break; 2117 } 2118 } 2119 2120 /* 2121 * returns -EAGAIN if it had to drop the path, or zero if everything was in 2122 * cache 2123 */ 2124 static noinline int reada_for_balance(struct btrfs_root *root, 2125 struct btrfs_path *path, int level) 2126 { 2127 int slot; 2128 int nritems; 2129 struct extent_buffer *parent; 2130 struct extent_buffer *eb; 2131 u64 gen; 2132 u64 block1 = 0; 2133 u64 block2 = 0; 2134 int ret = 0; 2135 int blocksize; 2136 2137 parent = path->nodes[level + 1]; 2138 if (!parent) 2139 return 0; 2140 2141 nritems = btrfs_header_nritems(parent); 2142 slot = path->slots[level + 1]; 2143 blocksize = btrfs_level_size(root, level); 2144 2145 if (slot > 0) { 2146 block1 = btrfs_node_blockptr(parent, slot - 1); 2147 gen = btrfs_node_ptr_generation(parent, slot - 1); 2148 eb = btrfs_find_tree_block(root, block1, blocksize); 2149 /* 2150 * if we get -eagain from btrfs_buffer_uptodate, we 2151 * don't want to return eagain here. 
That will loop 2152 * forever 2153 */ 2154 if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0) 2155 block1 = 0; 2156 free_extent_buffer(eb); 2157 } 2158 if (slot + 1 < nritems) { 2159 block2 = btrfs_node_blockptr(parent, slot + 1); 2160 gen = btrfs_node_ptr_generation(parent, slot + 1); 2161 eb = btrfs_find_tree_block(root, block2, blocksize); 2162 if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0) 2163 block2 = 0; 2164 free_extent_buffer(eb); 2165 } 2166 if (block1 || block2) { 2167 ret = -EAGAIN; 2168 2169 /* release the whole path */ 2170 btrfs_release_path(path); 2171 2172 /* read the blocks */ 2173 if (block1) 2174 readahead_tree_block(root, block1, blocksize, 0); 2175 if (block2) 2176 readahead_tree_block(root, block2, blocksize, 0); 2177 2178 if (block1) { 2179 eb = read_tree_block(root, block1, blocksize, 0); 2180 free_extent_buffer(eb); 2181 } 2182 if (block2) { 2183 eb = read_tree_block(root, block2, blocksize, 0); 2184 free_extent_buffer(eb); 2185 } 2186 } 2187 return ret; 2188 } 2189 2190 2191 /* 2192 * when we walk down the tree, it is usually safe to unlock the higher layers 2193 * in the tree. The exceptions are when our path goes through slot 0, because 2194 * operations on the tree might require changing key pointers higher up in the 2195 * tree. 2196 * 2197 * callers might also have set path->keep_locks, which tells this code to keep 2198 * the lock if the path points to the last slot in the block. This is part of 2199 * walking through the tree, and selecting the next slot in the higher block. 2200 * 2201 * lowest_unlock sets the lowest level in the tree we're allowed to unlock. so 2202 * if lowest_unlock is 1, level 0 won't be unlocked 2203 */ 2204 static noinline void unlock_up(struct btrfs_path *path, int level, 2205 int lowest_unlock, int min_write_lock_level, 2206 int *write_lock_level) 2207 { 2208 int i; 2209 int skip_level = level; 2210 int no_skips = 0; 2211 struct extent_buffer *t; 2212 2213 if (path->really_keep_locks) 2214 return; 2215 2216 for (i = level; i < BTRFS_MAX_LEVEL; i++) { 2217 if (!path->nodes[i]) 2218 break; 2219 if (!path->locks[i]) 2220 break; 2221 if (!no_skips && path->slots[i] == 0) { 2222 skip_level = i + 1; 2223 continue; 2224 } 2225 if (!no_skips && path->keep_locks) { 2226 u32 nritems; 2227 t = path->nodes[i]; 2228 nritems = btrfs_header_nritems(t); 2229 if (nritems < 1 || path->slots[i] >= nritems - 1) { 2230 skip_level = i + 1; 2231 continue; 2232 } 2233 } 2234 if (skip_level < i && i >= lowest_unlock) 2235 no_skips = 1; 2236 2237 t = path->nodes[i]; 2238 if (i >= lowest_unlock && i > skip_level && path->locks[i]) { 2239 btrfs_tree_unlock_rw(t, path->locks[i]); 2240 path->locks[i] = 0; 2241 if (write_lock_level && 2242 i > min_write_lock_level && 2243 i <= *write_lock_level) { 2244 *write_lock_level = i - 1; 2245 } 2246 } 2247 } 2248 } 2249 2250 /* 2251 * This releases any locks held in the path starting at level and 2252 * going all the way up to the root. 2253 * 2254 * btrfs_search_slot will keep the lock held on higher nodes in a few 2255 * corner cases, such as COW of the block at slot zero in the node. This 2256 * ignores those rules, and it should only be called when there are no 2257 * more updates to be done higher up in the tree. 
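 *
 * (A hedged usage sketch precedes the function below.)
 */

/*
 * Illustrative sketch, not part of the original file: once a caller has
 * finished all upward key and pointer updates, only the leaf lock still
 * matters. The function name is hypothetical.
 */
static __maybe_unused void example_keep_only_leaf_lock(struct btrfs_path *path)
{
	/* drop every lock from level 1 up to the root, keep level 0 */
	btrfs_unlock_up_safe(path, 1);
}

/*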
2258 */ 2259 noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level) 2260 { 2261 int i; 2262 2263 if (path->keep_locks || path->really_keep_locks) 2264 return; 2265 2266 for (i = level; i < BTRFS_MAX_LEVEL; i++) { 2267 if (!path->nodes[i]) 2268 continue; 2269 if (!path->locks[i]) 2270 continue; 2271 btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]); 2272 path->locks[i] = 0; 2273 } 2274 } 2275 2276 /* 2277 * helper function for btrfs_search_slot. The goal is to find a block 2278 * in cache without setting the path to blocking. If we find the block 2279 * we return zero and the path is unchanged. 2280 * 2281 * If we can't find the block, we set the path blocking and do some 2282 * reada. -EAGAIN is returned and the search must be repeated. 2283 */ 2284 static int 2285 read_block_for_search(struct btrfs_trans_handle *trans, 2286 struct btrfs_root *root, struct btrfs_path *p, 2287 struct extent_buffer **eb_ret, int level, int slot, 2288 struct btrfs_key *key, u64 time_seq) 2289 { 2290 u64 blocknr; 2291 u64 gen; 2292 u32 blocksize; 2293 struct extent_buffer *b = *eb_ret; 2294 struct extent_buffer *tmp; 2295 int ret; 2296 2297 blocknr = btrfs_node_blockptr(b, slot); 2298 gen = btrfs_node_ptr_generation(b, slot); 2299 blocksize = btrfs_level_size(root, level - 1); 2300 2301 tmp = btrfs_find_tree_block(root, blocknr, blocksize); 2302 if (tmp) { 2303 /* first we do an atomic uptodate check */ 2304 if (btrfs_buffer_uptodate(tmp, 0, 1) > 0) { 2305 if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) { 2306 /* 2307 * we found an up to date block without 2308 * sleeping, return 2309 * right away 2310 */ 2311 *eb_ret = tmp; 2312 return 0; 2313 } 2314 /* the pages were up to date, but we failed 2315 * the generation number check. Do a full 2316 * read for the generation number that is correct. 2317 * We must do this without dropping locks so 2318 * we can trust our generation number 2319 */ 2320 free_extent_buffer(tmp); 2321 btrfs_set_path_blocking(p); 2322 2323 /* now we're allowed to do a blocking uptodate check */ 2324 tmp = read_tree_block(root, blocknr, blocksize, gen); 2325 if (tmp && btrfs_buffer_uptodate(tmp, gen, 0) > 0) { 2326 *eb_ret = tmp; 2327 return 0; 2328 } 2329 free_extent_buffer(tmp); 2330 btrfs_release_path(p); 2331 return -EIO; 2332 } 2333 } 2334 2335 /* 2336 * reduce lock contention at high levels 2337 * of the btree by dropping locks before 2338 * we read. Don't release the lock on the current 2339 * level because we need to walk this node to figure 2340 * out which blocks to read. 2341 */ 2342 btrfs_unlock_up_safe(p, level + 1); 2343 btrfs_set_path_blocking(p); 2344 2345 free_extent_buffer(tmp); 2346 if (p->reada) 2347 reada_for_search(root, p, level, slot, key->objectid); 2348 2349 btrfs_release_path(p); 2350 2351 ret = -EAGAIN; 2352 tmp = read_tree_block(root, blocknr, blocksize, 0); 2353 if (tmp) { 2354 /* 2355 * If the read above didn't mark this buffer up to date, 2356 * it will never end up being up to date. Set ret to EIO now 2357 * and give up so that our caller doesn't loop forever 2358 * on our EAGAINs. 2359 */ 2360 if (!btrfs_buffer_uptodate(tmp, 0, 0)) 2361 ret = -EIO; 2362 free_extent_buffer(tmp); 2363 } 2364 return ret; 2365 } 2366 2367 /* 2368 * helper function for btrfs_search_slot. This does all of the checks 2369 * for node-level blocks and does any balancing required based on 2370 * the ins_len. 2371 * 2372 * If no extra work was required, zero is returned. 
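 *
 * (A hedged sketch of the -EAGAIN retry contract follows; the rest of
 * this comment continues after it.)
 */

/*
 * Illustrative sketch, not part of the original file: how a descent step
 * consumes the -EAGAIN contract of read_block_for_search() above. The
 * function name is hypothetical; a real caller jumps back to its retry
 * label instead of returning.
 */
static __maybe_unused int example_descend_one_level(struct btrfs_root *root,
						    struct btrfs_path *p,
						    int level, int slot,
						    struct btrfs_key *key)
{
	struct extent_buffer *b = p->nodes[level];
	int err;

	err = read_block_for_search(NULL, root, p, &b, level, slot, key, 0);
	if (err == -EAGAIN) {
		/* the path was dropped and readahead issued: restart */
		return err;
	}
	if (err)
		return err;
	/* on success, b now points to the up-to-date child block */
	return 0;
}

/*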
If we had to 2373 * drop the path, -EAGAIN is returned and btrfs_search_slot must 2374 * start over 2375 */ 2376 static int 2377 setup_nodes_for_search(struct btrfs_trans_handle *trans, 2378 struct btrfs_root *root, struct btrfs_path *p, 2379 struct extent_buffer *b, int level, int ins_len, 2380 int *write_lock_level) 2381 { 2382 int ret; 2383 if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >= 2384 BTRFS_NODEPTRS_PER_BLOCK(root) - 3) { 2385 int sret; 2386 2387 if (*write_lock_level < level + 1) { 2388 *write_lock_level = level + 1; 2389 btrfs_release_path(p); 2390 goto again; 2391 } 2392 2393 sret = reada_for_balance(root, p, level); 2394 if (sret) 2395 goto again; 2396 2397 btrfs_set_path_blocking(p); 2398 sret = split_node(trans, root, p, level); 2399 btrfs_clear_path_blocking(p, NULL, 0); 2400 2401 BUG_ON(sret > 0); 2402 if (sret) { 2403 ret = sret; 2404 goto done; 2405 } 2406 b = p->nodes[level]; 2407 } else if (ins_len < 0 && btrfs_header_nritems(b) < 2408 BTRFS_NODEPTRS_PER_BLOCK(root) / 2) { 2409 int sret; 2410 2411 if (*write_lock_level < level + 1) { 2412 *write_lock_level = level + 1; 2413 btrfs_release_path(p); 2414 goto again; 2415 } 2416 2417 sret = reada_for_balance(root, p, level); 2418 if (sret) 2419 goto again; 2420 2421 btrfs_set_path_blocking(p); 2422 sret = balance_level(trans, root, p, level); 2423 btrfs_clear_path_blocking(p, NULL, 0); 2424 2425 if (sret) { 2426 ret = sret; 2427 goto done; 2428 } 2429 b = p->nodes[level]; 2430 if (!b) { 2431 btrfs_release_path(p); 2432 goto again; 2433 } 2434 BUG_ON(btrfs_header_nritems(b) == 1); 2435 } 2436 return 0; 2437 2438 again: 2439 ret = -EAGAIN; 2440 done: 2441 return ret; 2442 } 2443 2444 /* 2445 * look for key in the tree. path is filled in with nodes along the way 2446 * if key is found, we return zero and you can find the item in the leaf 2447 * level of the path (level 0) 2448 * 2449 * If the key isn't found, the path points to the slot where it should 2450 * be inserted, and 1 is returned. If there are other errors during the 2451 * search a negative error number is returned. 2452 * 2453 * if ins_len > 0, nodes and leaves will be split as we walk down the 2454 * tree. 
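 *
 * (A hedged caller sketch follows; the rest of this comment, covering
 * the ins_len < 0 case, continues after it.)
 */

/*
 * Illustrative sketch, not part of the original file: the three-way
 * return convention of btrfs_search_slot() from a read-only caller's
 * side. The function name and key values are hypothetical.
 */
static __maybe_unused int example_lookup(struct btrfs_root *root,
					 u64 objectid, u8 type, u64 offset)
{
	struct btrfs_path *path;
	struct btrfs_key key;
	int ret;

	key.objectid = objectid;
	key.type = type;
	key.offset = offset;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	/* ins_len == 0 and cow == 0: pure lookup, read locks only */
	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret == 0) {
		/* exact match: path->nodes[0] / path->slots[0] hold the item */
	} else if (ret == 1) {
		/* not found: the slot is where the key would be inserted */
	}
	/* ret < 0 is a real error, e.g. -EIO */
	btrfs_free_path(path);
	return ret;
}

/*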
if ins_len < 0, nodes will be merged as we walk down the tree (if 2455 * possible) 2456 */ 2457 int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root 2458 *root, struct btrfs_key *key, struct btrfs_path *p, int 2459 ins_len, int cow) 2460 { 2461 struct extent_buffer *b; 2462 int slot; 2463 int ret; 2464 int err; 2465 int level; 2466 int lowest_unlock = 1; 2467 int root_lock; 2468 /* everything at write_lock_level or lower must be write locked */ 2469 int write_lock_level = 0; 2470 u8 lowest_level = 0; 2471 int min_write_lock_level; 2472 2473 lowest_level = p->lowest_level; 2474 WARN_ON(lowest_level && ins_len > 0); 2475 WARN_ON(p->nodes[0] != NULL); 2476 2477 if (ins_len < 0) { 2478 lowest_unlock = 2; 2479 2480 /* when we are removing items, we might have to go up to level 2481 * two as we update tree pointers. Make sure we keep write locks 2482 * for those levels as well 2483 */ 2484 write_lock_level = 2; 2485 } else if (ins_len > 0) { 2486 /* 2487 * for inserting items, make sure we have a write lock on 2488 * level 1 so we can update keys 2489 */ 2490 write_lock_level = 1; 2491 } 2492 2493 if (!cow) 2494 write_lock_level = -1; 2495 2496 if (cow && (p->really_keep_locks || p->keep_locks || p->lowest_level)) 2497 write_lock_level = BTRFS_MAX_LEVEL; 2498 2499 min_write_lock_level = write_lock_level; 2500 2501 again: 2502 /* 2503 * we try very hard to do read locks on the root 2504 */ 2505 root_lock = BTRFS_READ_LOCK; 2506 level = 0; 2507 if (p->search_commit_root) { 2508 /* 2509 * the commit roots are read only 2510 * so we always do read locks 2511 */ 2512 b = root->commit_root; 2513 extent_buffer_get(b); 2514 level = btrfs_header_level(b); 2515 if (!p->skip_locking) 2516 btrfs_tree_read_lock(b); 2517 } else { 2518 if (p->skip_locking) { 2519 b = btrfs_root_node(root); 2520 level = btrfs_header_level(b); 2521 } else { 2522 /* we don't know the level of the root node 2523 * until we actually have it read locked 2524 */ 2525 b = btrfs_read_lock_root_node(root); 2526 level = btrfs_header_level(b); 2527 if (level <= write_lock_level) { 2528 /* whoops, must trade for write lock */ 2529 btrfs_tree_read_unlock(b); 2530 free_extent_buffer(b); 2531 b = btrfs_lock_root_node(root); 2532 root_lock = BTRFS_WRITE_LOCK; 2533 2534 /* the level might have changed, check again */ 2535 level = btrfs_header_level(b); 2536 } 2537 } 2538 } 2539 p->nodes[level] = b; 2540 if (!p->skip_locking) 2541 p->locks[level] = root_lock; 2542 2543 while (b) { 2544 level = btrfs_header_level(b); 2545 2546 /* 2547 * setup the path here so we can release it under lock 2548 * contention with the cow code 2549 */ 2550 if (cow) { 2551 /* 2552 * if we don't really need to cow this block 2553 * then we don't want to set the path blocking, 2554 * so we test it here 2555 */ 2556 if (!should_cow_block(trans, root, b)) 2557 goto cow_done; 2558 2559 btrfs_set_path_blocking(p); 2560 2561 /* 2562 * must have write locks on this node and the 2563 * parent 2564 */ 2565 if (level > write_lock_level || 2566 (level + 1 > write_lock_level && 2567 level + 1 < BTRFS_MAX_LEVEL && 2568 p->nodes[level + 1])) { 2569 write_lock_level = level + 1; 2570 btrfs_release_path(p); 2571 goto again; 2572 } 2573 2574 err = btrfs_cow_block(trans, root, b, 2575 p->nodes[level + 1], 2576 p->slots[level + 1], &b); 2577 if (err) { 2578 ret = err; 2579 goto done; 2580 } 2581 } 2582 cow_done: 2583 BUG_ON(!cow && ins_len); 2584 2585 p->nodes[level] = b; 2586 btrfs_clear_path_blocking(p, NULL, 0); 2587 2588 /* 2589 * we have a lock on b and as long as we
aren't changing 2590 * the tree, there is no way for the items in b to change. 2591 * It is safe to drop the lock on our parent before we 2592 * go through the expensive btree search on b. 2593 * 2594 * If cow is true, then we might be changing slot zero, 2595 * which may require changing the parent. So, we can't 2596 * drop the lock until after we know which slot we're 2597 * operating on. 2598 */ 2599 if (!cow) 2600 btrfs_unlock_up_safe(p, level + 1); 2601 2602 ret = bin_search(b, key, level, &slot); 2603 2604 if (level != 0) { 2605 int dec = 0; 2606 if (ret && slot > 0) { 2607 dec = 1; 2608 slot -= 1; 2609 } 2610 p->slots[level] = slot; 2611 err = setup_nodes_for_search(trans, root, p, b, level, 2612 ins_len, &write_lock_level); 2613 if (err == -EAGAIN) 2614 goto again; 2615 if (err) { 2616 ret = err; 2617 goto done; 2618 } 2619 b = p->nodes[level]; 2620 slot = p->slots[level]; 2621 2622 /* 2623 * slot 0 is special, if we change the key 2624 * we have to update the parent pointer 2625 * which means we must have a write lock 2626 * on the parent 2627 */ 2628 if (slot == 0 && cow && 2629 write_lock_level < level + 1) { 2630 write_lock_level = level + 1; 2631 btrfs_release_path(p); 2632 goto again; 2633 } 2634 2635 unlock_up(p, level, lowest_unlock, 2636 min_write_lock_level, &write_lock_level); 2637 2638 if (level == lowest_level) { 2639 if (dec) 2640 p->slots[level]++; 2641 goto done; 2642 } 2643 2644 err = read_block_for_search(trans, root, p, 2645 &b, level, slot, key, 0); 2646 if (err == -EAGAIN) 2647 goto again; 2648 if (err) { 2649 ret = err; 2650 goto done; 2651 } 2652 2653 if (!p->skip_locking) { 2654 level = btrfs_header_level(b); 2655 if (level <= write_lock_level) { 2656 err = btrfs_try_tree_write_lock(b); 2657 if (!err) { 2658 btrfs_set_path_blocking(p); 2659 btrfs_tree_lock(b); 2660 btrfs_clear_path_blocking(p, b, 2661 BTRFS_WRITE_LOCK); 2662 } 2663 p->locks[level] = BTRFS_WRITE_LOCK; 2664 } else { 2665 err = btrfs_try_tree_read_lock(b); 2666 if (!err) { 2667 btrfs_set_path_blocking(p); 2668 btrfs_tree_read_lock(b); 2669 btrfs_clear_path_blocking(p, b, 2670 BTRFS_READ_LOCK); 2671 } 2672 p->locks[level] = BTRFS_READ_LOCK; 2673 } 2674 p->nodes[level] = b; 2675 } 2676 } else { 2677 p->slots[level] = slot; 2678 if (ins_len > 0 && 2679 btrfs_leaf_free_space(root, b) < ins_len) { 2680 if (write_lock_level < 1) { 2681 write_lock_level = 1; 2682 btrfs_release_path(p); 2683 goto again; 2684 } 2685 2686 btrfs_set_path_blocking(p); 2687 err = split_leaf(trans, root, key, 2688 p, ins_len, ret == 0); 2689 btrfs_clear_path_blocking(p, NULL, 0); 2690 2691 BUG_ON(err > 0); 2692 if (err) { 2693 ret = err; 2694 goto done; 2695 } 2696 } 2697 if (!p->search_for_split) 2698 unlock_up(p, level, lowest_unlock, 2699 min_write_lock_level, &write_lock_level); 2700 goto done; 2701 } 2702 } 2703 ret = 1; 2704 done: 2705 /* 2706 * we don't really know what they plan on doing with the path 2707 * from here on, so for now just mark it as blocking 2708 */ 2709 if (!p->leave_spinning) 2710 btrfs_set_path_blocking(p); 2711 if (ret < 0) 2712 btrfs_release_path(p); 2713 return ret; 2714 } 2715 2716 /* 2717 * Like btrfs_search_slot, this looks for a key in the given tree. It uses the 2718 * current state of the tree together with the operations recorded in the tree 2719 * modification log to search for the key in a previous version of this tree, as 2720 * denoted by the time_seq parameter. 2721 * 2722 * Naturally, there is no support for insert, delete or cow operations.
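 *
 * (A hedged caller sketch follows; the rest of this comment continues
 * after it.)
 */

/*
 * Illustrative sketch, not part of the original file: replaying a lookup
 * against an older version of the tree. The time_seq value would come
 * from btrfs_get_tree_mod_seq(); the function name is hypothetical.
 */
static __maybe_unused int example_lookup_old(struct btrfs_root *root,
					     struct btrfs_key *key,
					     u64 time_seq)
{
	struct btrfs_path *path;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	/* read-only by construction: behaves like ins_len == 0, cow == 0 */
	ret = btrfs_search_old_slot(root, key, path, time_seq);
	btrfs_free_path(path);
	return ret;
}

/*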
2723 * 2724 * The resulting path and return value will be set up as if we called 2725 * btrfs_search_slot at that point in time with ins_len and cow both set to 0. 2726 */ 2727 int btrfs_search_old_slot(struct btrfs_root *root, struct btrfs_key *key, 2728 struct btrfs_path *p, u64 time_seq) 2729 { 2730 struct extent_buffer *b; 2731 int slot; 2732 int ret; 2733 int err; 2734 int level; 2735 int lowest_unlock = 1; 2736 u8 lowest_level = 0; 2737 2738 lowest_level = p->lowest_level; 2739 WARN_ON(p->nodes[0] != NULL); 2740 2741 if (p->search_commit_root) { 2742 BUG_ON(time_seq); 2743 return btrfs_search_slot(NULL, root, key, p, 0, 0); 2744 } 2745 2746 again: 2747 b = get_old_root(root, time_seq); 2748 level = btrfs_header_level(b); 2749 p->locks[level] = BTRFS_READ_LOCK; 2750 2751 while (b) { 2752 level = btrfs_header_level(b); 2753 p->nodes[level] = b; 2754 btrfs_clear_path_blocking(p, NULL, 0); 2755 2756 /* 2757 * we have a lock on b and as long as we aren't changing 2758 * the tree, there is no way for the items in b to change. 2759 * It is safe to drop the lock on our parent before we 2760 * go through the expensive btree search on b. 2761 */ 2762 btrfs_unlock_up_safe(p, level + 1); 2763 2764 ret = bin_search(b, key, level, &slot); 2765 2766 if (level != 0) { 2767 int dec = 0; 2768 if (ret && slot > 0) { 2769 dec = 1; 2770 slot -= 1; 2771 } 2772 p->slots[level] = slot; 2773 unlock_up(p, level, lowest_unlock, 0, NULL); 2774 2775 if (level == lowest_level) { 2776 if (dec) 2777 p->slots[level]++; 2778 goto done; 2779 } 2780 2781 err = read_block_for_search(NULL, root, p, &b, level, 2782 slot, key, time_seq); 2783 if (err == -EAGAIN) 2784 goto again; 2785 if (err) { 2786 ret = err; 2787 goto done; 2788 } 2789 2790 level = btrfs_header_level(b); 2791 err = btrfs_try_tree_read_lock(b); 2792 if (!err) { 2793 btrfs_set_path_blocking(p); 2794 btrfs_tree_read_lock(b); 2795 btrfs_clear_path_blocking(p, b, 2796 BTRFS_READ_LOCK); 2797 } 2798 p->locks[level] = BTRFS_READ_LOCK; 2799 p->nodes[level] = b; 2800 b = tree_mod_log_rewind(root->fs_info, b, time_seq); 2801 if (b != p->nodes[level]) { 2802 btrfs_tree_unlock_rw(p->nodes[level], 2803 p->locks[level]); 2804 p->locks[level] = 0; 2805 p->nodes[level] = b; 2806 } 2807 } else { 2808 p->slots[level] = slot; 2809 unlock_up(p, level, lowest_unlock, 0, NULL); 2810 goto done; 2811 } 2812 } 2813 ret = 1; 2814 done: 2815 if (!p->leave_spinning) 2816 btrfs_set_path_blocking(p); 2817 if (ret < 0) 2818 btrfs_release_path(p); 2819 2820 return ret; 2821 } 2822 2823 /* 2824 * helper to use instead of search slot if no exact match is needed but 2825 * instead the next or previous item should be returned. 2826 * When find_higher is true, the next higher item is returned, the next lower 2827 * otherwise. 2828 * When return_any and find_higher are both true, and no higher item is found, 2829 * return the next lower instead. 2830 * When return_any is true and find_higher is false, and no lower item is found, 2831 * return the next higher instead. 2832 * It returns 0 if any item is found, 1 if none is found (tree empty), and 2833 * < 0 on error 2834 */ 2835 int btrfs_search_slot_for_read(struct btrfs_root *root, 2836 struct btrfs_key *key, struct btrfs_path *p, 2837 int find_higher, int return_any) 2838 { 2839 int ret; 2840 struct extent_buffer *leaf; 2841 2842 again: 2843 ret = btrfs_search_slot(NULL, root, key, p, 0, 0); 2844 if (ret <= 0) 2845 return ret; 2846 /* 2847 * a return value of 1 means the path is at the position where the 2848 * item should be inserted.
Normally this is the next bigger item, 2849 * but in case the previous item is the last in a leaf, path points 2850 * to the first free slot in the previous leaf, i.e. at an invalid 2851 * item. 2852 */ 2853 leaf = p->nodes[0]; 2854 2855 if (find_higher) { 2856 if (p->slots[0] >= btrfs_header_nritems(leaf)) { 2857 ret = btrfs_next_leaf(root, p); 2858 if (ret <= 0) 2859 return ret; 2860 if (!return_any) 2861 return 1; 2862 /* 2863 * no higher item found, return the next 2864 * lower instead 2865 */ 2866 return_any = 0; 2867 find_higher = 0; 2868 btrfs_release_path(p); 2869 goto again; 2870 } 2871 } else { 2872 if (p->slots[0] == 0) { 2873 ret = btrfs_prev_leaf(root, p); 2874 if (ret < 0) 2875 return ret; 2876 if (!ret) { 2877 p->slots[0] = btrfs_header_nritems(leaf) - 1; 2878 return 0; 2879 } 2880 if (!return_any) 2881 return 1; 2882 /* 2883 * no lower item found, return the next 2884 * higher instead 2885 */ 2886 return_any = 0; 2887 find_higher = 1; 2888 btrfs_release_path(p); 2889 goto again; 2890 } else { 2891 --p->slots[0]; 2892 } 2893 } 2894 return 0; 2895 } 2896 2897 /* 2898 * adjust the pointers going up the tree, starting at level 2899 * making sure the right key of each node points to 'key'. 2900 * This is used after shifting pointers to the left, so it stops 2901 * fixing up pointers when a given leaf/node is not in slot 0 of the 2902 * higher levels 2903 * 2904 */ 2905 static void fixup_low_keys(struct btrfs_trans_handle *trans, 2906 struct btrfs_root *root, struct btrfs_path *path, 2907 struct btrfs_disk_key *key, int level) 2908 { 2909 int i; 2910 struct extent_buffer *t; 2911 2912 for (i = level; i < BTRFS_MAX_LEVEL; i++) { 2913 int tslot = path->slots[i]; 2914 if (!path->nodes[i]) 2915 break; 2916 t = path->nodes[i]; 2917 tree_mod_log_set_node_key(root->fs_info, t, tslot, 1); 2918 btrfs_set_node_key(t, key, tslot); 2919 btrfs_mark_buffer_dirty(path->nodes[i]); 2920 if (tslot != 0) 2921 break; 2922 } 2923 } 2924 2925 /* 2926 * update item key. 2927 * 2928 * This function isn't completely safe. It's the caller's responsibility 2929 * that the new key won't break the order 2930 */ 2931 void btrfs_set_item_key_safe(struct btrfs_trans_handle *trans, 2932 struct btrfs_root *root, struct btrfs_path *path, 2933 struct btrfs_key *new_key) 2934 { 2935 struct btrfs_disk_key disk_key; 2936 struct extent_buffer *eb; 2937 int slot; 2938 2939 eb = path->nodes[0]; 2940 slot = path->slots[0]; 2941 if (slot > 0) { 2942 btrfs_item_key(eb, &disk_key, slot - 1); 2943 BUG_ON(comp_keys(&disk_key, new_key) >= 0); 2944 } 2945 if (slot < btrfs_header_nritems(eb) - 1) { 2946 btrfs_item_key(eb, &disk_key, slot + 1); 2947 BUG_ON(comp_keys(&disk_key, new_key) <= 0); 2948 } 2949 2950 btrfs_cpu_key_to_disk(&disk_key, new_key); 2951 btrfs_set_item_key(eb, &disk_key, slot); 2952 btrfs_mark_buffer_dirty(eb); 2953 if (slot == 0) 2954 fixup_low_keys(trans, root, path, &disk_key, 1); 2955 } 2956 2957 /* 2958 * try to push data from one node into the next node left in the 2959 * tree. 2960 * 2961 * returns 0 if some ptrs were pushed left, < 0 if there was some horrible 2962 * error, and > 0 if there was no room in the left hand block.
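 *
 * (A hedged capacity sketch precedes the function below.)
 */

/*
 * Illustrative sketch, not part of the original file: the room check the
 * node-push helpers start from. A value <= 0 means the destination is
 * already full and the push is refused with 1. The function name is
 * hypothetical.
 */
static __maybe_unused int example_node_push_room(struct btrfs_root *root,
						 struct extent_buffer *dst)
{
	/* free key-pointer slots left in dst */
	return BTRFS_NODEPTRS_PER_BLOCK(root) -
	       (int)btrfs_header_nritems(dst);
}

/*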
2963 */ 2964 static int push_node_left(struct btrfs_trans_handle *trans, 2965 struct btrfs_root *root, struct extent_buffer *dst, 2966 struct extent_buffer *src, int empty) 2967 { 2968 int push_items = 0; 2969 int src_nritems; 2970 int dst_nritems; 2971 int ret = 0; 2972 2973 src_nritems = btrfs_header_nritems(src); 2974 dst_nritems = btrfs_header_nritems(dst); 2975 push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems; 2976 WARN_ON(btrfs_header_generation(src) != trans->transid); 2977 WARN_ON(btrfs_header_generation(dst) != trans->transid); 2978 2979 if (!empty && src_nritems <= 8) 2980 return 1; 2981 2982 if (push_items <= 0) 2983 return 1; 2984 2985 if (empty) { 2986 push_items = min(src_nritems, push_items); 2987 if (push_items < src_nritems) { 2988 /* leave at least 8 pointers in the node if 2989 * we aren't going to empty it 2990 */ 2991 if (src_nritems - push_items < 8) { 2992 if (push_items <= 8) 2993 return 1; 2994 push_items -= 8; 2995 } 2996 } 2997 } else 2998 push_items = min(src_nritems - 8, push_items); 2999 3000 tree_mod_log_eb_copy(root->fs_info, dst, src, dst_nritems, 0, 3001 push_items, 1); 3002 copy_extent_buffer(dst, src, 3003 btrfs_node_key_ptr_offset(dst_nritems), 3004 btrfs_node_key_ptr_offset(0), 3005 push_items * sizeof(struct btrfs_key_ptr)); 3006 3007 if (push_items < src_nritems) { 3008 /* 3009 * don't call tree_mod_log_eb_move here, key removal was already 3010 * fully logged by tree_mod_log_eb_copy above. 3011 */ 3012 memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0), 3013 btrfs_node_key_ptr_offset(push_items), 3014 (src_nritems - push_items) * 3015 sizeof(struct btrfs_key_ptr)); 3016 } 3017 btrfs_set_header_nritems(src, src_nritems - push_items); 3018 btrfs_set_header_nritems(dst, dst_nritems + push_items); 3019 btrfs_mark_buffer_dirty(src); 3020 btrfs_mark_buffer_dirty(dst); 3021 3022 return ret; 3023 } 3024 3025 /* 3026 * try to push data from one node into the next node right in the 3027 * tree. 3028 * 3029 * returns 0 if some ptrs were pushed, < 0 if there was some horrible 3030 * error, and > 0 if there was no room in the right hand block. 
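 *
 * (A hedged sketch of the half-push cap follows; the rest of this
 * comment continues after it.)
 */

/*
 * Illustrative sketch, not part of the original file: the cap used below
 * so a right-push never empties the source node. The function name is
 * hypothetical.
 */
static __maybe_unused int example_right_push_cap(struct extent_buffer *src)
{
	int src_nritems = btrfs_header_nritems(src);

	/* at most half, rounded up; the caller also refuses a full drain */
	return src_nritems / 2 + 1;
}

/*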
3031 * 3032 * this will only push up to 1/2 the contents of the left node over 3033 */ 3034 static int balance_node_right(struct btrfs_trans_handle *trans, 3035 struct btrfs_root *root, 3036 struct extent_buffer *dst, 3037 struct extent_buffer *src) 3038 { 3039 int push_items = 0; 3040 int max_push; 3041 int src_nritems; 3042 int dst_nritems; 3043 int ret = 0; 3044 3045 WARN_ON(btrfs_header_generation(src) != trans->transid); 3046 WARN_ON(btrfs_header_generation(dst) != trans->transid); 3047 3048 src_nritems = btrfs_header_nritems(src); 3049 dst_nritems = btrfs_header_nritems(dst); 3050 push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems; 3051 if (push_items <= 0) 3052 return 1; 3053 3054 if (src_nritems < 4) 3055 return 1; 3056 3057 max_push = src_nritems / 2 + 1; 3058 /* don't try to empty the node */ 3059 if (max_push >= src_nritems) 3060 return 1; 3061 3062 if (max_push < push_items) 3063 push_items = max_push; 3064 3065 tree_mod_log_eb_move(root->fs_info, dst, push_items, 0, dst_nritems); 3066 memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items), 3067 btrfs_node_key_ptr_offset(0), 3068 (dst_nritems) * 3069 sizeof(struct btrfs_key_ptr)); 3070 3071 tree_mod_log_eb_copy(root->fs_info, dst, src, 0, 3072 src_nritems - push_items, push_items, 1); 3073 copy_extent_buffer(dst, src, 3074 btrfs_node_key_ptr_offset(0), 3075 btrfs_node_key_ptr_offset(src_nritems - push_items), 3076 push_items * sizeof(struct btrfs_key_ptr)); 3077 3078 btrfs_set_header_nritems(src, src_nritems - push_items); 3079 btrfs_set_header_nritems(dst, dst_nritems + push_items); 3080 3081 btrfs_mark_buffer_dirty(src); 3082 btrfs_mark_buffer_dirty(dst); 3083 3084 return ret; 3085 } 3086 3087 /* 3088 * helper function to insert a new root level in the tree. 3089 * A new node is allocated, and a single item is inserted to 3090 * point to the existing root 3091 * 3092 * returns zero on success or < 0 on failure. 
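 *
 * (A hedged sketch precedes the function below.)
 */

/*
 * Illustrative sketch, not part of the original file: growing the root
 * adds exactly one level, so tree height stays bounded by
 * BTRFS_MAX_LEVEL. The function name is hypothetical; a real caller
 * would hold the root locked (see btrfs_lock_root_node).
 */
static __maybe_unused int example_height_after_root_growth(struct btrfs_root *root)
{
	/* current root level, plus the one level a new root would add */
	return btrfs_header_level(root->node) + 1;
}

/*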
3093 */ 3094 static noinline int insert_new_root(struct btrfs_trans_handle *trans, 3095 struct btrfs_root *root, 3096 struct btrfs_path *path, int level) 3097 { 3098 u64 lower_gen; 3099 struct extent_buffer *lower; 3100 struct extent_buffer *c; 3101 struct extent_buffer *old; 3102 struct btrfs_disk_key lower_key; 3103 3104 BUG_ON(path->nodes[level]); 3105 BUG_ON(path->nodes[level-1] != root->node); 3106 3107 lower = path->nodes[level-1]; 3108 if (level == 1) 3109 btrfs_item_key(lower, &lower_key, 0); 3110 else 3111 btrfs_node_key(lower, &lower_key, 0); 3112 3113 c = btrfs_alloc_free_block(trans, root, root->nodesize, 0, 3114 root->root_key.objectid, &lower_key, 3115 level, root->node->start, 0); 3116 if (IS_ERR(c)) 3117 return PTR_ERR(c); 3118 3119 root_add_used(root, root->nodesize); 3120 3121 memset_extent_buffer(c, 0, 0, sizeof(struct btrfs_header)); 3122 btrfs_set_header_nritems(c, 1); 3123 btrfs_set_header_level(c, level); 3124 btrfs_set_header_bytenr(c, c->start); 3125 btrfs_set_header_generation(c, trans->transid); 3126 btrfs_set_header_backref_rev(c, BTRFS_MIXED_BACKREF_REV); 3127 btrfs_set_header_owner(c, root->root_key.objectid); 3128 3129 write_extent_buffer(c, root->fs_info->fsid, 3130 (unsigned long)btrfs_header_fsid(c), 3131 BTRFS_FSID_SIZE); 3132 3133 write_extent_buffer(c, root->fs_info->chunk_tree_uuid, 3134 (unsigned long)btrfs_header_chunk_tree_uuid(c), 3135 BTRFS_UUID_SIZE); 3136 3137 btrfs_set_node_key(c, &lower_key, 0); 3138 btrfs_set_node_blockptr(c, 0, lower->start); 3139 lower_gen = btrfs_header_generation(lower); 3140 WARN_ON(lower_gen != trans->transid); 3141 3142 btrfs_set_node_ptr_generation(c, 0, lower_gen); 3143 3144 btrfs_mark_buffer_dirty(c); 3145 3146 old = root->node; 3147 tree_mod_log_set_root_pointer(root, c); 3148 rcu_assign_pointer(root->node, c); 3149 3150 /* the super has an extra ref to root->node */ 3151 free_extent_buffer(old); 3152 3153 add_root_to_dirty_list(root); 3154 extent_buffer_get(c); 3155 path->nodes[level] = c; 3156 path->locks[level] = BTRFS_WRITE_LOCK; 3157 path->slots[level] = 0; 3158 return 0; 3159 } 3160 3161 /* 3162 * worker function to insert a single pointer in a node. 3163 * the node should have enough room for the pointer already 3164 * 3165 * slot and level indicate where you want the key to go, and 3166 * blocknr is the block the key points to. 
3167 */ 3168 static void insert_ptr(struct btrfs_trans_handle *trans, 3169 struct btrfs_root *root, struct btrfs_path *path, 3170 struct btrfs_disk_key *key, u64 bytenr, 3171 int slot, int level) 3172 { 3173 struct extent_buffer *lower; 3174 int nritems; 3175 int ret; 3176 3177 BUG_ON(!path->nodes[level]); 3178 btrfs_assert_tree_locked(path->nodes[level]); 3179 lower = path->nodes[level]; 3180 nritems = btrfs_header_nritems(lower); 3181 BUG_ON(slot > nritems); 3182 BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(root)); 3183 if (slot != nritems) { 3184 if (level) 3185 tree_mod_log_eb_move(root->fs_info, lower, slot + 1, 3186 slot, nritems - slot); 3187 memmove_extent_buffer(lower, 3188 btrfs_node_key_ptr_offset(slot + 1), 3189 btrfs_node_key_ptr_offset(slot), 3190 (nritems - slot) * sizeof(struct btrfs_key_ptr)); 3191 } 3192 if (level) { 3193 ret = tree_mod_log_insert_key(root->fs_info, lower, slot, 3194 MOD_LOG_KEY_ADD); 3195 BUG_ON(ret < 0); 3196 } 3197 btrfs_set_node_key(lower, key, slot); 3198 btrfs_set_node_blockptr(lower, slot, bytenr); 3199 WARN_ON(trans->transid == 0); 3200 btrfs_set_node_ptr_generation(lower, slot, trans->transid); 3201 btrfs_set_header_nritems(lower, nritems + 1); 3202 btrfs_mark_buffer_dirty(lower); 3203 } 3204 3205 /* 3206 * split the node at the specified level in path in two. 3207 * The path is corrected to point to the appropriate node after the split 3208 * 3209 * Before splitting this tries to make some room in the node by pushing 3210 * left and right, if either one works, it returns right away. 3211 * 3212 * returns 0 on success and < 0 on failure 3213 */ 3214 static noinline int split_node(struct btrfs_trans_handle *trans, 3215 struct btrfs_root *root, 3216 struct btrfs_path *path, int level) 3217 { 3218 struct extent_buffer *c; 3219 struct extent_buffer *split; 3220 struct btrfs_disk_key disk_key; 3221 int mid; 3222 int ret; 3223 u32 c_nritems; 3224 int tree_mod_log_removal = 1; 3225 3226 c = path->nodes[level]; 3227 WARN_ON(btrfs_header_generation(c) != trans->transid); 3228 if (c == root->node) { 3229 /* trying to split the root, lets make a new one */ 3230 ret = insert_new_root(trans, root, path, level + 1); 3231 /* 3232 * removal of root nodes has been logged by 3233 * tree_mod_log_set_root_pointer due to locking 3234 */ 3235 tree_mod_log_removal = 0; 3236 if (ret) 3237 return ret; 3238 } else { 3239 ret = push_nodes_for_insert(trans, root, path, level); 3240 c = path->nodes[level]; 3241 if (!ret && btrfs_header_nritems(c) < 3242 BTRFS_NODEPTRS_PER_BLOCK(root) - 3) 3243 return 0; 3244 if (ret < 0) 3245 return ret; 3246 } 3247 3248 c_nritems = btrfs_header_nritems(c); 3249 mid = (c_nritems + 1) / 2; 3250 btrfs_node_key(c, &disk_key, mid); 3251 3252 split = btrfs_alloc_free_block(trans, root, root->nodesize, 0, 3253 root->root_key.objectid, 3254 &disk_key, level, c->start, 0); 3255 if (IS_ERR(split)) 3256 return PTR_ERR(split); 3257 3258 root_add_used(root, root->nodesize); 3259 3260 memset_extent_buffer(split, 0, 0, sizeof(struct btrfs_header)); 3261 btrfs_set_header_level(split, btrfs_header_level(c)); 3262 btrfs_set_header_bytenr(split, split->start); 3263 btrfs_set_header_generation(split, trans->transid); 3264 btrfs_set_header_backref_rev(split, BTRFS_MIXED_BACKREF_REV); 3265 btrfs_set_header_owner(split, root->root_key.objectid); 3266 write_extent_buffer(split, root->fs_info->fsid, 3267 (unsigned long)btrfs_header_fsid(split), 3268 BTRFS_FSID_SIZE); 3269 write_extent_buffer(split, root->fs_info->chunk_tree_uuid, 3270 (unsigned 
long)btrfs_header_chunk_tree_uuid(split), 3271 BTRFS_UUID_SIZE); 3272 3273 tree_mod_log_eb_copy(root->fs_info, split, c, 0, mid, c_nritems - mid, 3274 tree_mod_log_removal); 3275 copy_extent_buffer(split, c, 3276 btrfs_node_key_ptr_offset(0), 3277 btrfs_node_key_ptr_offset(mid), 3278 (c_nritems - mid) * sizeof(struct btrfs_key_ptr)); 3279 btrfs_set_header_nritems(split, c_nritems - mid); 3280 btrfs_set_header_nritems(c, mid); 3281 ret = 0; 3282 3283 btrfs_mark_buffer_dirty(c); 3284 btrfs_mark_buffer_dirty(split); 3285 3286 insert_ptr(trans, root, path, &disk_key, split->start, 3287 path->slots[level + 1] + 1, level + 1); 3288 3289 if (path->slots[level] >= mid) { 3290 path->slots[level] -= mid; 3291 btrfs_tree_unlock(c); 3292 free_extent_buffer(c); 3293 path->nodes[level] = split; 3294 path->slots[level + 1] += 1; 3295 } else { 3296 btrfs_tree_unlock(split); 3297 free_extent_buffer(split); 3298 } 3299 return ret; 3300 } 3301 3302 /* 3303 * how many bytes are required to store the items in a leaf. start 3304 * and nr indicate which items in the leaf to check. This totals up the 3305 * space used both by the item structs and the item data 3306 */ 3307 static int leaf_space_used(struct extent_buffer *l, int start, int nr) 3308 { 3309 struct btrfs_item *start_item; 3310 struct btrfs_item *end_item; 3311 struct btrfs_map_token token; 3312 int data_len; 3313 int nritems = btrfs_header_nritems(l); 3314 int end = min(nritems, start + nr) - 1; 3315 3316 if (!nr) 3317 return 0; 3318 btrfs_init_map_token(&token); 3319 start_item = btrfs_item_nr(l, start); 3320 end_item = btrfs_item_nr(l, end); 3321 data_len = btrfs_token_item_offset(l, start_item, &token) + 3322 btrfs_token_item_size(l, start_item, &token); 3323 data_len = data_len - btrfs_token_item_offset(l, end_item, &token); 3324 data_len += sizeof(struct btrfs_item) * nr; 3325 WARN_ON(data_len < 0); 3326 return data_len; 3327 } 3328 3329 /* 3330 * The space between the end of the leaf items and 3331 * the start of the leaf data. IOW, how much room 3332 * the leaf has left for both items and data 3333 */ 3334 noinline int btrfs_leaf_free_space(struct btrfs_root *root, 3335 struct extent_buffer *leaf) 3336 { 3337 int nritems = btrfs_header_nritems(leaf); 3338 int ret; 3339 ret = BTRFS_LEAF_DATA_SIZE(root) - leaf_space_used(leaf, 0, nritems); 3340 if (ret < 0) { 3341 printk(KERN_CRIT "leaf free space ret %d, leaf data size %lu, " 3342 "used %d nritems %d\n", 3343 ret, (unsigned long) BTRFS_LEAF_DATA_SIZE(root), 3344 leaf_space_used(leaf, 0, nritems), nritems); 3345 } 3346 return ret; 3347 } 3348 3349 /* 3350 * min slot controls the lowest index we're willing to push to the 3351 * right. 
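 *
 * (A hedged leaf-space sketch follows; the rest of this comment
 * continues after it.)
 */

/*
 * Illustrative sketch, not part of the original file: the space test the
 * leaf-push paths repeat, built on btrfs_leaf_free_space() defined
 * above. An item costs its data bytes plus one struct btrfs_item
 * header. The function name is hypothetical.
 */
static __maybe_unused int example_leaf_has_room(struct btrfs_root *root,
						struct extent_buffer *leaf,
						u32 data_size)
{
	return btrfs_leaf_free_space(root, leaf) >=
	       (int)(data_size + sizeof(struct btrfs_item));
}

/*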
We'll push up to and including min_slot, but no lower 3352 */ 3353 static noinline int __push_leaf_right(struct btrfs_trans_handle *trans, 3354 struct btrfs_root *root, 3355 struct btrfs_path *path, 3356 int data_size, int empty, 3357 struct extent_buffer *right, 3358 int free_space, u32 left_nritems, 3359 u32 min_slot) 3360 { 3361 struct extent_buffer *left = path->nodes[0]; 3362 struct extent_buffer *upper = path->nodes[1]; 3363 struct btrfs_map_token token; 3364 struct btrfs_disk_key disk_key; 3365 int slot; 3366 u32 i; 3367 int push_space = 0; 3368 int push_items = 0; 3369 struct btrfs_item *item; 3370 u32 nr; 3371 u32 right_nritems; 3372 u32 data_end; 3373 u32 this_item_size; 3374 3375 btrfs_init_map_token(&token); 3376 3377 if (empty) 3378 nr = 0; 3379 else 3380 nr = max_t(u32, 1, min_slot); 3381 3382 if (path->slots[0] >= left_nritems) 3383 push_space += data_size; 3384 3385 slot = path->slots[1]; 3386 i = left_nritems - 1; 3387 while (i >= nr) { 3388 item = btrfs_item_nr(left, i); 3389 3390 if (!empty && push_items > 0) { 3391 if (path->slots[0] > i) 3392 break; 3393 if (path->slots[0] == i) { 3394 int space = btrfs_leaf_free_space(root, left); 3395 if (space + push_space * 2 > free_space) 3396 break; 3397 } 3398 } 3399 3400 if (path->slots[0] == i) 3401 push_space += data_size; 3402 3403 this_item_size = btrfs_item_size(left, item); 3404 if (this_item_size + sizeof(*item) + push_space > free_space) 3405 break; 3406 3407 push_items++; 3408 push_space += this_item_size + sizeof(*item); 3409 if (i == 0) 3410 break; 3411 i--; 3412 } 3413 3414 if (push_items == 0) 3415 goto out_unlock; 3416 3417 WARN_ON(!empty && push_items == left_nritems); 3418 3419 /* push left to right */ 3420 right_nritems = btrfs_header_nritems(right); 3421 3422 push_space = btrfs_item_end_nr(left, left_nritems - push_items); 3423 push_space -= leaf_data_end(root, left); 3424 3425 /* make room in the right data area */ 3426 data_end = leaf_data_end(root, right); 3427 memmove_extent_buffer(right, 3428 btrfs_leaf_data(right) + data_end - push_space, 3429 btrfs_leaf_data(right) + data_end, 3430 BTRFS_LEAF_DATA_SIZE(root) - data_end); 3431 3432 /* copy from the left data area */ 3433 copy_extent_buffer(right, left, btrfs_leaf_data(right) + 3434 BTRFS_LEAF_DATA_SIZE(root) - push_space, 3435 btrfs_leaf_data(left) + leaf_data_end(root, left), 3436 push_space); 3437 3438 memmove_extent_buffer(right, btrfs_item_nr_offset(push_items), 3439 btrfs_item_nr_offset(0), 3440 right_nritems * sizeof(struct btrfs_item)); 3441 3442 /* copy the items from left to right */ 3443 copy_extent_buffer(right, left, btrfs_item_nr_offset(0), 3444 btrfs_item_nr_offset(left_nritems - push_items), 3445 push_items * sizeof(struct btrfs_item)); 3446 3447 /* update the item pointers */ 3448 right_nritems += push_items; 3449 btrfs_set_header_nritems(right, right_nritems); 3450 push_space = BTRFS_LEAF_DATA_SIZE(root); 3451 for (i = 0; i < right_nritems; i++) { 3452 item = btrfs_item_nr(right, i); 3453 push_space -= btrfs_token_item_size(right, item, &token); 3454 btrfs_set_token_item_offset(right, item, push_space, &token); 3455 } 3456 3457 left_nritems -= push_items; 3458 btrfs_set_header_nritems(left, left_nritems); 3459 3460 if (left_nritems) 3461 btrfs_mark_buffer_dirty(left); 3462 else 3463 clean_tree_block(trans, root, left); 3464 3465 btrfs_mark_buffer_dirty(right); 3466 3467 btrfs_item_key(right, &disk_key, 0); 3468 btrfs_set_node_key(upper, &disk_key, slot + 1); 3469 btrfs_mark_buffer_dirty(upper); 3470 3471 /* then fixup the leaf pointer in 
the path */ 3472 if (path->slots[0] >= left_nritems) { 3473 path->slots[0] -= left_nritems; 3474 if (btrfs_header_nritems(path->nodes[0]) == 0) 3475 clean_tree_block(trans, root, path->nodes[0]); 3476 btrfs_tree_unlock(path->nodes[0]); 3477 free_extent_buffer(path->nodes[0]); 3478 path->nodes[0] = right; 3479 path->slots[1] += 1; 3480 } else { 3481 btrfs_tree_unlock(right); 3482 free_extent_buffer(right); 3483 } 3484 return 0; 3485 3486 out_unlock: 3487 btrfs_tree_unlock(right); 3488 free_extent_buffer(right); 3489 return 1; 3490 } 3491 3492 /* 3493 * push some data in the path leaf to the right, trying to free up at 3494 * least data_size bytes. returns zero if the push worked, nonzero otherwise 3495 * 3496 * returns 1 if the push failed because the other node didn't have enough 3497 * room, 0 if everything worked out and < 0 if there were major errors. 3498 * 3499 * this will push starting from min_slot to the end of the leaf. It won't 3500 * push any slot lower than min_slot 3501 */ 3502 static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root 3503 *root, struct btrfs_path *path, 3504 int min_data_size, int data_size, 3505 int empty, u32 min_slot) 3506 { 3507 struct extent_buffer *left = path->nodes[0]; 3508 struct extent_buffer *right; 3509 struct extent_buffer *upper; 3510 int slot; 3511 int free_space; 3512 u32 left_nritems; 3513 int ret; 3514 3515 if (!path->nodes[1]) 3516 return 1; 3517 3518 slot = path->slots[1]; 3519 upper = path->nodes[1]; 3520 if (slot >= btrfs_header_nritems(upper) - 1) 3521 return 1; 3522 3523 btrfs_assert_tree_locked(path->nodes[1]); 3524 3525 right = read_node_slot(root, upper, slot + 1); 3526 if (right == NULL) 3527 return 1; 3528 3529 btrfs_tree_lock(right); 3530 btrfs_set_lock_blocking(right); 3531 3532 free_space = btrfs_leaf_free_space(root, right); 3533 if (free_space < data_size) 3534 goto out_unlock; 3535 3536 /* cow and double check */ 3537 ret = btrfs_cow_block(trans, root, right, upper, 3538 slot + 1, &right); 3539 if (ret) 3540 goto out_unlock; 3541 3542 free_space = btrfs_leaf_free_space(root, right); 3543 if (free_space < data_size) 3544 goto out_unlock; 3545 3546 left_nritems = btrfs_header_nritems(left); 3547 if (left_nritems == 0) 3548 goto out_unlock; 3549 3550 return __push_leaf_right(trans, root, path, min_data_size, empty, 3551 right, free_space, left_nritems, min_slot); 3552 out_unlock: 3553 btrfs_tree_unlock(right); 3554 free_extent_buffer(right); 3555 return 1; 3556 } 3557 3558 /* 3559 * push some data in the path leaf to the left, trying to free up at 3560 * least data_size bytes. returns zero if the push worked, nonzero otherwise 3561 * 3562 * max_slot can put a limit on how far into the leaf we'll push items. The 3563 * item at 'max_slot' won't be touched. 
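 *
 * (A hedged note on the max_slot convention follows; the rest of this
 * comment continues after it.)
 */

/*
 * Illustrative sketch, not part of the original file: the two max_slot
 * conventions callers of the left-push helpers below rely on. The
 * function name is hypothetical; only the two constants matter.
 */
static __maybe_unused u32 example_left_push_limit(struct btrfs_path *path,
						  int whole_leaf)
{
	/* (u32)-1 means no limit: every slot may move; passing the
	 * caller's own slot keeps the item it is working on untouched */
	return whole_leaf ? (u32)-1 : (u32)path->slots[0];
}

/*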
Use (u32)-1 to make us do all the 3564 * items 3565 */ 3566 static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, 3567 struct btrfs_root *root, 3568 struct btrfs_path *path, int data_size, 3569 int empty, struct extent_buffer *left, 3570 int free_space, u32 right_nritems, 3571 u32 max_slot) 3572 { 3573 struct btrfs_disk_key disk_key; 3574 struct extent_buffer *right = path->nodes[0]; 3575 int i; 3576 int push_space = 0; 3577 int push_items = 0; 3578 struct btrfs_item *item; 3579 u32 old_left_nritems; 3580 u32 nr; 3581 int ret = 0; 3582 u32 this_item_size; 3583 u32 old_left_item_size; 3584 struct btrfs_map_token token; 3585 3586 btrfs_init_map_token(&token); 3587 3588 if (empty) 3589 nr = min(right_nritems, max_slot); 3590 else 3591 nr = min(right_nritems - 1, max_slot); 3592 3593 for (i = 0; i < nr; i++) { 3594 item = btrfs_item_nr(right, i); 3595 3596 if (!empty && push_items > 0) { 3597 if (path->slots[0] < i) 3598 break; 3599 if (path->slots[0] == i) { 3600 int space = btrfs_leaf_free_space(root, right); 3601 if (space + push_space * 2 > free_space) 3602 break; 3603 } 3604 } 3605 3606 if (path->slots[0] == i) 3607 push_space += data_size; 3608 3609 this_item_size = btrfs_item_size(right, item); 3610 if (this_item_size + sizeof(*item) + push_space > free_space) 3611 break; 3612 3613 push_items++; 3614 push_space += this_item_size + sizeof(*item); 3615 } 3616 3617 if (push_items == 0) { 3618 ret = 1; 3619 goto out; 3620 } 3621 if (!empty && push_items == btrfs_header_nritems(right)) 3622 WARN_ON(1); 3623 3624 /* push data from right to left */ 3625 copy_extent_buffer(left, right, 3626 btrfs_item_nr_offset(btrfs_header_nritems(left)), 3627 btrfs_item_nr_offset(0), 3628 push_items * sizeof(struct btrfs_item)); 3629 3630 push_space = BTRFS_LEAF_DATA_SIZE(root) - 3631 btrfs_item_offset_nr(right, push_items - 1); 3632 3633 copy_extent_buffer(left, right, btrfs_leaf_data(left) + 3634 leaf_data_end(root, left) - push_space, 3635 btrfs_leaf_data(right) + 3636 btrfs_item_offset_nr(right, push_items - 1), 3637 push_space); 3638 old_left_nritems = btrfs_header_nritems(left); 3639 BUG_ON(old_left_nritems <= 0); 3640 3641 old_left_item_size = btrfs_item_offset_nr(left, old_left_nritems - 1); 3642 for (i = old_left_nritems; i < old_left_nritems + push_items; i++) { 3643 u32 ioff; 3644 3645 item = btrfs_item_nr(left, i); 3646 3647 ioff = btrfs_token_item_offset(left, item, &token); 3648 btrfs_set_token_item_offset(left, item, 3649 ioff - (BTRFS_LEAF_DATA_SIZE(root) - old_left_item_size), 3650 &token); 3651 } 3652 btrfs_set_header_nritems(left, old_left_nritems + push_items); 3653 3654 /* fixup right node */ 3655 if (push_items > right_nritems) 3656 WARN(1, KERN_CRIT "push items %d nr %u\n", push_items, 3657 right_nritems); 3658 3659 if (push_items < right_nritems) { 3660 push_space = btrfs_item_offset_nr(right, push_items - 1) - 3661 leaf_data_end(root, right); 3662 memmove_extent_buffer(right, btrfs_leaf_data(right) + 3663 BTRFS_LEAF_DATA_SIZE(root) - push_space, 3664 btrfs_leaf_data(right) + 3665 leaf_data_end(root, right), push_space); 3666 3667 memmove_extent_buffer(right, btrfs_item_nr_offset(0), 3668 btrfs_item_nr_offset(push_items), 3669 (btrfs_header_nritems(right) - push_items) * 3670 sizeof(struct btrfs_item)); 3671 } 3672 right_nritems -= push_items; 3673 btrfs_set_header_nritems(right, right_nritems); 3674 push_space = BTRFS_LEAF_DATA_SIZE(root); 3675 for (i = 0; i < right_nritems; i++) { 3676 item = btrfs_item_nr(right, i); 3677 3678 push_space = push_space - 
btrfs_token_item_size(right, 3679 item, &token); 3680 btrfs_set_token_item_offset(right, item, push_space, &token); 3681 } 3682 3683 btrfs_mark_buffer_dirty(left); 3684 if (right_nritems) 3685 btrfs_mark_buffer_dirty(right); 3686 else 3687 clean_tree_block(trans, root, right); 3688 3689 btrfs_item_key(right, &disk_key, 0); 3690 fixup_low_keys(trans, root, path, &disk_key, 1); 3691 3692 /* then fixup the leaf pointer in the path */ 3693 if (path->slots[0] < push_items) { 3694 path->slots[0] += old_left_nritems; 3695 btrfs_tree_unlock(path->nodes[0]); 3696 free_extent_buffer(path->nodes[0]); 3697 path->nodes[0] = left; 3698 path->slots[1] -= 1; 3699 } else { 3700 btrfs_tree_unlock(left); 3701 free_extent_buffer(left); 3702 path->slots[0] -= push_items; 3703 } 3704 BUG_ON(path->slots[0] < 0); 3705 return ret; 3706 out: 3707 btrfs_tree_unlock(left); 3708 free_extent_buffer(left); 3709 return ret; 3710 } 3711 3712 /* 3713 * push some data in the path leaf to the left, trying to free up at 3714 * least data_size bytes. returns zero if the push worked, nonzero otherwise 3715 * 3716 * max_slot can put a limit on how far into the leaf we'll push items. The 3717 * item at 'max_slot' won't be touched. Use (u32)-1 to make us push all the 3718 * items 3719 */ 3720 static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root 3721 *root, struct btrfs_path *path, int min_data_size, 3722 int data_size, int empty, u32 max_slot) 3723 { 3724 struct extent_buffer *right = path->nodes[0]; 3725 struct extent_buffer *left; 3726 int slot; 3727 int free_space; 3728 u32 right_nritems; 3729 int ret = 0; 3730 3731 slot = path->slots[1]; 3732 if (slot == 0) 3733 return 1; 3734 if (!path->nodes[1]) 3735 return 1; 3736 3737 right_nritems = btrfs_header_nritems(right); 3738 if (right_nritems == 0) 3739 return 1; 3740 3741 btrfs_assert_tree_locked(path->nodes[1]); 3742 3743 left = read_node_slot(root, path->nodes[1], slot - 1); 3744 if (left == NULL) 3745 return 1; 3746 3747 btrfs_tree_lock(left); 3748 btrfs_set_lock_blocking(left); 3749 3750 free_space = btrfs_leaf_free_space(root, left); 3751 if (free_space < data_size) { 3752 ret = 1; 3753 goto out; 3754 } 3755 3756 /* cow and double check */ 3757 ret = btrfs_cow_block(trans, root, left, 3758 path->nodes[1], slot - 1, &left); 3759 if (ret) { 3760 /* we hit -ENOSPC, but it isn't fatal here */ 3761 if (ret == -ENOSPC) 3762 ret = 1; 3763 goto out; 3764 } 3765 3766 free_space = btrfs_leaf_free_space(root, left); 3767 if (free_space < data_size) { 3768 ret = 1; 3769 goto out; 3770 } 3771 3772 return __push_leaf_left(trans, root, path, min_data_size, 3773 empty, left, free_space, right_nritems, 3774 max_slot); 3775 out: 3776 btrfs_tree_unlock(left); 3777 free_extent_buffer(left); 3778 return ret; 3779 } 3780 3781 /* 3782 * split the path's leaf in two, making sure there is at least data_size 3783 * available for the resulting leaf level of the path. 
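 *
 * (A hedged midpoint sketch precedes the function below.)
 */

/*
 * Illustrative sketch, not part of the original file: the default split
 * point split_leaf() starts from; items [mid, nritems) move to the new
 * right leaf. The function name is hypothetical.
 */
static __maybe_unused int example_split_mid(struct extent_buffer *l)
{
	return ((int)btrfs_header_nritems(l) + 1) / 2;
}

/*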
3784 */ 3785 static noinline void copy_for_split(struct btrfs_trans_handle *trans, 3786 struct btrfs_root *root, 3787 struct btrfs_path *path, 3788 struct extent_buffer *l, 3789 struct extent_buffer *right, 3790 int slot, int mid, int nritems) 3791 { 3792 int data_copy_size; 3793 int rt_data_off; 3794 int i; 3795 struct btrfs_disk_key disk_key; 3796 struct btrfs_map_token token; 3797 3798 btrfs_init_map_token(&token); 3799 3800 nritems = nritems - mid; 3801 btrfs_set_header_nritems(right, nritems); 3802 data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(root, l); 3803 3804 copy_extent_buffer(right, l, btrfs_item_nr_offset(0), 3805 btrfs_item_nr_offset(mid), 3806 nritems * sizeof(struct btrfs_item)); 3807 3808 copy_extent_buffer(right, l, 3809 btrfs_leaf_data(right) + BTRFS_LEAF_DATA_SIZE(root) - 3810 data_copy_size, btrfs_leaf_data(l) + 3811 leaf_data_end(root, l), data_copy_size); 3812 3813 rt_data_off = BTRFS_LEAF_DATA_SIZE(root) - 3814 btrfs_item_end_nr(l, mid); 3815 3816 for (i = 0; i < nritems; i++) { 3817 struct btrfs_item *item = btrfs_item_nr(right, i); 3818 u32 ioff; 3819 3820 ioff = btrfs_token_item_offset(right, item, &token); 3821 btrfs_set_token_item_offset(right, item, 3822 ioff + rt_data_off, &token); 3823 } 3824 3825 btrfs_set_header_nritems(l, mid); 3826 btrfs_item_key(right, &disk_key, 0); 3827 insert_ptr(trans, root, path, &disk_key, right->start, 3828 path->slots[1] + 1, 1); 3829 3830 btrfs_mark_buffer_dirty(right); 3831 btrfs_mark_buffer_dirty(l); 3832 BUG_ON(path->slots[0] != slot); 3833 3834 if (mid <= slot) { 3835 btrfs_tree_unlock(path->nodes[0]); 3836 free_extent_buffer(path->nodes[0]); 3837 path->nodes[0] = right; 3838 path->slots[0] -= mid; 3839 path->slots[1] += 1; 3840 } else { 3841 btrfs_tree_unlock(right); 3842 free_extent_buffer(right); 3843 } 3844 3845 BUG_ON(path->slots[0] < 0); 3846 } 3847 3848 /* 3849 * double splits happen when we need to insert a big item in the middle 3850 * of a leaf. A double split can leave us with 3 mostly empty leaves: 3851 * leaf: [ slots 0 - N] [ our target ] [ N + 1 - total in leaf ] 3852 * A B C 3853 * 3854 * We avoid this by trying to push the items on either side of our target 3855 * into the adjacent leaves. If all goes well we can avoid the double split 3856 * completely. 3857 */ 3858 static noinline int push_for_double_split(struct btrfs_trans_handle *trans, 3859 struct btrfs_root *root, 3860 struct btrfs_path *path, 3861 int data_size) 3862 { 3863 int ret; 3864 int progress = 0; 3865 int slot; 3866 u32 nritems; 3867 3868 slot = path->slots[0]; 3869 3870 /* 3871 * try to push all the items after our slot into the 3872 * right leaf 3873 */ 3874 ret = push_leaf_right(trans, root, path, 1, data_size, 0, slot); 3875 if (ret < 0) 3876 return ret; 3877 3878 if (ret == 0) 3879 progress++; 3880 3881 nritems = btrfs_header_nritems(path->nodes[0]); 3882 /* 3883 * our goal is to get our slot at the start or end of a leaf. 
If 3884 * we've done so we're done 3885 */ 3886 if (path->slots[0] == 0 || path->slots[0] == nritems) 3887 return 0; 3888 3889 if (btrfs_leaf_free_space(root, path->nodes[0]) >= data_size) 3890 return 0; 3891 3892 /* try to push all the items before our slot into the next leaf */ 3893 slot = path->slots[0]; 3894 ret = push_leaf_left(trans, root, path, 1, data_size, 0, slot); 3895 if (ret < 0) 3896 return ret; 3897 3898 if (ret == 0) 3899 progress++; 3900 3901 if (progress) 3902 return 0; 3903 return 1; 3904 } 3905 3906 /* 3907 * split the path's leaf in two, making sure there is at least data_size 3908 * available for the resulting leaf level of the path. 3909 * 3910 * returns 0 if all went well and < 0 on failure. 3911 */ 3912 static noinline int split_leaf(struct btrfs_trans_handle *trans, 3913 struct btrfs_root *root, 3914 struct btrfs_key *ins_key, 3915 struct btrfs_path *path, int data_size, 3916 int extend) 3917 { 3918 struct btrfs_disk_key disk_key; 3919 struct extent_buffer *l; 3920 u32 nritems; 3921 int mid; 3922 int slot; 3923 struct extent_buffer *right; 3924 int ret = 0; 3925 int wret; 3926 int split; 3927 int num_doubles = 0; 3928 int tried_avoid_double = 0; 3929 3930 l = path->nodes[0]; 3931 slot = path->slots[0]; 3932 if (extend && data_size + btrfs_item_size_nr(l, slot) + 3933 sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(root)) 3934 return -EOVERFLOW; 3935 3936 /* first try to make some room by pushing left and right */ 3937 if (data_size) { 3938 wret = push_leaf_right(trans, root, path, data_size, 3939 data_size, 0, 0); 3940 if (wret < 0) 3941 return wret; 3942 if (wret) { 3943 wret = push_leaf_left(trans, root, path, data_size, 3944 data_size, 0, (u32)-1); 3945 if (wret < 0) 3946 return wret; 3947 } 3948 l = path->nodes[0]; 3949 3950 /* did the pushes work? 
*/ 3951 if (btrfs_leaf_free_space(root, l) >= data_size) 3952 return 0; 3953 } 3954 3955 if (!path->nodes[1]) { 3956 ret = insert_new_root(trans, root, path, 1); 3957 if (ret) 3958 return ret; 3959 } 3960 again: 3961 split = 1; 3962 l = path->nodes[0]; 3963 slot = path->slots[0]; 3964 nritems = btrfs_header_nritems(l); 3965 mid = (nritems + 1) / 2; 3966 3967 if (mid <= slot) { 3968 if (nritems == 1 || 3969 leaf_space_used(l, mid, nritems - mid) + data_size > 3970 BTRFS_LEAF_DATA_SIZE(root)) { 3971 if (slot >= nritems) { 3972 split = 0; 3973 } else { 3974 mid = slot; 3975 if (mid != nritems && 3976 leaf_space_used(l, mid, nritems - mid) + 3977 data_size > BTRFS_LEAF_DATA_SIZE(root)) { 3978 if (data_size && !tried_avoid_double) 3979 goto push_for_double; 3980 split = 2; 3981 } 3982 } 3983 } 3984 } else { 3985 if (leaf_space_used(l, 0, mid) + data_size > 3986 BTRFS_LEAF_DATA_SIZE(root)) { 3987 if (!extend && data_size && slot == 0) { 3988 split = 0; 3989 } else if ((extend || !data_size) && slot == 0) { 3990 mid = 1; 3991 } else { 3992 mid = slot; 3993 if (mid != nritems && 3994 leaf_space_used(l, mid, nritems - mid) + 3995 data_size > BTRFS_LEAF_DATA_SIZE(root)) { 3996 if (data_size && !tried_avoid_double) 3997 goto push_for_double; 3998 split = 2 ; 3999 } 4000 } 4001 } 4002 } 4003 4004 if (split == 0) 4005 btrfs_cpu_key_to_disk(&disk_key, ins_key); 4006 else 4007 btrfs_item_key(l, &disk_key, mid); 4008 4009 right = btrfs_alloc_free_block(trans, root, root->leafsize, 0, 4010 root->root_key.objectid, 4011 &disk_key, 0, l->start, 0); 4012 if (IS_ERR(right)) 4013 return PTR_ERR(right); 4014 4015 root_add_used(root, root->leafsize); 4016 4017 memset_extent_buffer(right, 0, 0, sizeof(struct btrfs_header)); 4018 btrfs_set_header_bytenr(right, right->start); 4019 btrfs_set_header_generation(right, trans->transid); 4020 btrfs_set_header_backref_rev(right, BTRFS_MIXED_BACKREF_REV); 4021 btrfs_set_header_owner(right, root->root_key.objectid); 4022 btrfs_set_header_level(right, 0); 4023 write_extent_buffer(right, root->fs_info->fsid, 4024 (unsigned long)btrfs_header_fsid(right), 4025 BTRFS_FSID_SIZE); 4026 4027 write_extent_buffer(right, root->fs_info->chunk_tree_uuid, 4028 (unsigned long)btrfs_header_chunk_tree_uuid(right), 4029 BTRFS_UUID_SIZE); 4030 4031 if (split == 0) { 4032 if (mid <= slot) { 4033 btrfs_set_header_nritems(right, 0); 4034 insert_ptr(trans, root, path, &disk_key, right->start, 4035 path->slots[1] + 1, 1); 4036 btrfs_tree_unlock(path->nodes[0]); 4037 free_extent_buffer(path->nodes[0]); 4038 path->nodes[0] = right; 4039 path->slots[0] = 0; 4040 path->slots[1] += 1; 4041 } else { 4042 btrfs_set_header_nritems(right, 0); 4043 insert_ptr(trans, root, path, &disk_key, right->start, 4044 path->slots[1], 1); 4045 btrfs_tree_unlock(path->nodes[0]); 4046 free_extent_buffer(path->nodes[0]); 4047 path->nodes[0] = right; 4048 path->slots[0] = 0; 4049 if (path->slots[1] == 0) 4050 fixup_low_keys(trans, root, path, 4051 &disk_key, 1); 4052 } 4053 btrfs_mark_buffer_dirty(right); 4054 return ret; 4055 } 4056 4057 copy_for_split(trans, root, path, l, right, slot, mid, nritems); 4058 4059 if (split == 2) { 4060 BUG_ON(num_doubles != 0); 4061 num_doubles++; 4062 goto again; 4063 } 4064 4065 return 0; 4066 4067 push_for_double: 4068 push_for_double_split(trans, root, path, data_size); 4069 tried_avoid_double = 1; 4070 if (btrfs_leaf_free_space(root, path->nodes[0]) >= data_size) 4071 return 0; 4072 goto again; 4073 } 4074 4075 static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans, 
4076 struct btrfs_root *root, 4077 struct btrfs_path *path, int ins_len) 4078 { 4079 struct btrfs_key key; 4080 struct extent_buffer *leaf; 4081 struct btrfs_file_extent_item *fi; 4082 u64 extent_len = 0; 4083 u32 item_size; 4084 int ret; 4085 4086 leaf = path->nodes[0]; 4087 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 4088 4089 BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY && 4090 key.type != BTRFS_EXTENT_CSUM_KEY); 4091 4092 if (btrfs_leaf_free_space(root, leaf) >= ins_len) 4093 return 0; 4094 4095 item_size = btrfs_item_size_nr(leaf, path->slots[0]); 4096 if (key.type == BTRFS_EXTENT_DATA_KEY) { 4097 fi = btrfs_item_ptr(leaf, path->slots[0], 4098 struct btrfs_file_extent_item); 4099 extent_len = btrfs_file_extent_num_bytes(leaf, fi); 4100 } 4101 btrfs_release_path(path); 4102 4103 path->keep_locks = 1; 4104 path->search_for_split = 1; 4105 ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 4106 path->search_for_split = 0; 4107 if (ret < 0) 4108 goto err; 4109 4110 ret = -EAGAIN; 4111 leaf = path->nodes[0]; 4112 /* if our item isn't there or got smaller, return now */ 4113 if (ret > 0 || item_size != btrfs_item_size_nr(leaf, path->slots[0])) 4114 goto err; 4115 4116 /* the leaf has changed, it now has room. return now */ 4117 if (btrfs_leaf_free_space(root, path->nodes[0]) >= ins_len) 4118 goto err; 4119 4120 if (key.type == BTRFS_EXTENT_DATA_KEY) { 4121 fi = btrfs_item_ptr(leaf, path->slots[0], 4122 struct btrfs_file_extent_item); 4123 if (extent_len != btrfs_file_extent_num_bytes(leaf, fi)) 4124 goto err; 4125 } 4126 4127 btrfs_set_path_blocking(path); 4128 ret = split_leaf(trans, root, &key, path, ins_len, 1); 4129 if (ret) 4130 goto err; 4131 4132 path->keep_locks = 0; 4133 btrfs_unlock_up_safe(path, 1); 4134 return 0; 4135 err: 4136 path->keep_locks = 0; 4137 return ret; 4138 } 4139 4140 static noinline int split_item(struct btrfs_trans_handle *trans, 4141 struct btrfs_root *root, 4142 struct btrfs_path *path, 4143 struct btrfs_key *new_key, 4144 unsigned long split_offset) 4145 { 4146 struct extent_buffer *leaf; 4147 struct btrfs_item *item; 4148 struct btrfs_item *new_item; 4149 int slot; 4150 char *buf; 4151 u32 nritems; 4152 u32 item_size; 4153 u32 orig_offset; 4154 struct btrfs_disk_key disk_key; 4155 4156 leaf = path->nodes[0]; 4157 BUG_ON(btrfs_leaf_free_space(root, leaf) < sizeof(struct btrfs_item)); 4158 4159 btrfs_set_path_blocking(path); 4160 4161 item = btrfs_item_nr(leaf, path->slots[0]); 4162 orig_offset = btrfs_item_offset(leaf, item); 4163 item_size = btrfs_item_size(leaf, item); 4164 4165 buf = kmalloc(item_size, GFP_NOFS); 4166 if (!buf) 4167 return -ENOMEM; 4168 4169 read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf, 4170 path->slots[0]), item_size); 4171 4172 slot = path->slots[0] + 1; 4173 nritems = btrfs_header_nritems(leaf); 4174 if (slot != nritems) { 4175 /* shift the items */ 4176 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + 1), 4177 btrfs_item_nr_offset(slot), 4178 (nritems - slot) * sizeof(struct btrfs_item)); 4179 } 4180 4181 btrfs_cpu_key_to_disk(&disk_key, new_key); 4182 btrfs_set_item_key(leaf, &disk_key, slot); 4183 4184 new_item = btrfs_item_nr(leaf, slot); 4185 4186 btrfs_set_item_offset(leaf, new_item, orig_offset); 4187 btrfs_set_item_size(leaf, new_item, item_size - split_offset); 4188 4189 btrfs_set_item_offset(leaf, item, 4190 orig_offset + item_size - split_offset); 4191 btrfs_set_item_size(leaf, item, split_offset); 4192 4193 btrfs_set_header_nritems(leaf, nritems + 1); 4194 4195 /* write the data for the start of 
the original item */ 4196 write_extent_buffer(leaf, buf, 4197 btrfs_item_ptr_offset(leaf, path->slots[0]), 4198 split_offset); 4199 4200 /* write the data for the new item */ 4201 write_extent_buffer(leaf, buf + split_offset, 4202 btrfs_item_ptr_offset(leaf, slot), 4203 item_size - split_offset); 4204 btrfs_mark_buffer_dirty(leaf); 4205 4206 BUG_ON(btrfs_leaf_free_space(root, leaf) < 0); 4207 kfree(buf); 4208 return 0; 4209 } 4210 4211 /* 4212 * This function splits a single item into two items, 4213 * giving 'new_key' to the new item and splitting the 4214 * old one at split_offset (from the start of the item). 4215 * 4216 * The path may be released by this operation. After 4217 * the split, the path is pointing to the old item. The 4218 * new item is going to be in the same node as the old one. 4219 * 4220 * Note, the item being split must be small enough to live alone on 4221 * a tree block with room for one extra struct btrfs_item 4222 * 4223 * This allows us to split the item in place, keeping a lock on the 4224 * leaf the entire time. 4225 */ 4226 int btrfs_split_item(struct btrfs_trans_handle *trans, 4227 struct btrfs_root *root, 4228 struct btrfs_path *path, 4229 struct btrfs_key *new_key, 4230 unsigned long split_offset) 4231 { 4232 int ret; 4233 ret = setup_leaf_for_split(trans, root, path, 4234 sizeof(struct btrfs_item)); 4235 if (ret) 4236 return ret; 4237 4238 ret = split_item(trans, root, path, new_key, split_offset); 4239 return ret; 4240 } 4241 4242 /* 4243 * This function duplicates an item, giving 'new_key' to the new item. 4244 * It guarantees both items live in the same tree leaf and the new item 4245 * is contiguous with the original item. 4246 * 4247 * This allows us to split a file extent in place, keeping a lock on the 4248 * leaf the entire time. 4249 */ 4250 int btrfs_duplicate_item(struct btrfs_trans_handle *trans, 4251 struct btrfs_root *root, 4252 struct btrfs_path *path, 4253 struct btrfs_key *new_key) 4254 { 4255 struct extent_buffer *leaf; 4256 int ret; 4257 u32 item_size; 4258 4259 leaf = path->nodes[0]; 4260 item_size = btrfs_item_size_nr(leaf, path->slots[0]); 4261 ret = setup_leaf_for_split(trans, root, path, 4262 item_size + sizeof(struct btrfs_item)); 4263 if (ret) 4264 return ret; 4265 4266 path->slots[0]++; 4267 setup_items_for_insert(trans, root, path, new_key, &item_size, 4268 item_size, item_size + 4269 sizeof(struct btrfs_item), 1); 4270 leaf = path->nodes[0]; 4271 memcpy_extent_buffer(leaf, 4272 btrfs_item_ptr_offset(leaf, path->slots[0]), 4273 btrfs_item_ptr_offset(leaf, path->slots[0] - 1), 4274 item_size); 4275 return 0; 4276 } 4277 4278 /* 4279 * make the item pointed to by the path smaller. new_size indicates 4280 * how small to make it, and from_end tells us if we just chop bytes 4281 * off the end of the item or if we shift the item to chop bytes off 4282 * the front.
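 *
 * An illustrative sketch (not a call site in this file): a caller holding
 * a path to an item and wanting to drop its trailing 'drop' bytes, where
 * 'drop' is a hypothetical local, could do
 *
 *	struct extent_buffer *leaf = path->nodes[0];
 *	u32 size = btrfs_item_size_nr(leaf, path->slots[0]);
 *
 *	btrfs_truncate_item(trans, root, path, size - drop, 1);
 *
 * Passing from_end == 0 instead shifts the item data so the same number
 * of bytes is chopped off the front (and the item's key offset is bumped
 * accordingly).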
4283 */ 4284 void btrfs_truncate_item(struct btrfs_trans_handle *trans, 4285 struct btrfs_root *root, 4286 struct btrfs_path *path, 4287 u32 new_size, int from_end) 4288 { 4289 int slot; 4290 struct extent_buffer *leaf; 4291 struct btrfs_item *item; 4292 u32 nritems; 4293 unsigned int data_end; 4294 unsigned int old_data_start; 4295 unsigned int old_size; 4296 unsigned int size_diff; 4297 int i; 4298 struct btrfs_map_token token; 4299 4300 btrfs_init_map_token(&token); 4301 4302 leaf = path->nodes[0]; 4303 slot = path->slots[0]; 4304 4305 old_size = btrfs_item_size_nr(leaf, slot); 4306 if (old_size == new_size) 4307 return; 4308 4309 nritems = btrfs_header_nritems(leaf); 4310 data_end = leaf_data_end(root, leaf); 4311 4312 old_data_start = btrfs_item_offset_nr(leaf, slot); 4313 4314 size_diff = old_size - new_size; 4315 4316 BUG_ON(slot < 0); 4317 BUG_ON(slot >= nritems); 4318 4319 /* 4320 * item0..itemN ... dataN.offset..dataN.size .. data0.size 4321 */ 4322 /* first correct the data pointers */ 4323 for (i = slot; i < nritems; i++) { 4324 u32 ioff; 4325 item = btrfs_item_nr(leaf, i); 4326 4327 ioff = btrfs_token_item_offset(leaf, item, &token); 4328 btrfs_set_token_item_offset(leaf, item, 4329 ioff + size_diff, &token); 4330 } 4331 4332 /* shift the data */ 4333 if (from_end) { 4334 memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) + 4335 data_end + size_diff, btrfs_leaf_data(leaf) + 4336 data_end, old_data_start + new_size - data_end); 4337 } else { 4338 struct btrfs_disk_key disk_key; 4339 u64 offset; 4340 4341 btrfs_item_key(leaf, &disk_key, slot); 4342 4343 if (btrfs_disk_key_type(&disk_key) == BTRFS_EXTENT_DATA_KEY) { 4344 unsigned long ptr; 4345 struct btrfs_file_extent_item *fi; 4346 4347 fi = btrfs_item_ptr(leaf, slot, 4348 struct btrfs_file_extent_item); 4349 fi = (struct btrfs_file_extent_item *)( 4350 (unsigned long)fi - size_diff); 4351 4352 if (btrfs_file_extent_type(leaf, fi) == 4353 BTRFS_FILE_EXTENT_INLINE) { 4354 ptr = btrfs_item_ptr_offset(leaf, slot); 4355 memmove_extent_buffer(leaf, ptr, 4356 (unsigned long)fi, 4357 offsetof(struct btrfs_file_extent_item, 4358 disk_bytenr)); 4359 } 4360 } 4361 4362 memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) + 4363 data_end + size_diff, btrfs_leaf_data(leaf) + 4364 data_end, old_data_start - data_end); 4365 4366 offset = btrfs_disk_key_offset(&disk_key); 4367 btrfs_set_disk_key_offset(&disk_key, offset + size_diff); 4368 btrfs_set_item_key(leaf, &disk_key, slot); 4369 if (slot == 0) 4370 fixup_low_keys(trans, root, path, &disk_key, 1); 4371 } 4372 4373 item = btrfs_item_nr(leaf, slot); 4374 btrfs_set_item_size(leaf, item, new_size); 4375 btrfs_mark_buffer_dirty(leaf); 4376 4377 if (btrfs_leaf_free_space(root, leaf) < 0) { 4378 btrfs_print_leaf(root, leaf); 4379 BUG(); 4380 } 4381 } 4382 4383 /* 4384 * make the item pointed to by the path bigger; data_size is the number of bytes to add.
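 *
 * A minimal usage sketch (hypothetical caller, not from this file): grow
 * the current item by 'len' bytes and fill in the new tail, assuming the
 * leaf is already known to have at least 'len' bytes of free space:
 *
 *	struct extent_buffer *leaf = path->nodes[0];
 *	u32 old_size = btrfs_item_size_nr(leaf, path->slots[0]);
 *
 *	btrfs_extend_item(trans, root, path, len);
 *	write_extent_buffer(leaf, data,
 *			    btrfs_item_ptr_offset(leaf, path->slots[0]) +
 *			    old_size, len);
 *
 * The function BUGs if the leaf lacks the space, so callers reserve it up
 * front (typically by searching with a matching ins_len).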
4385 */ 4386 void btrfs_extend_item(struct btrfs_trans_handle *trans, 4387 struct btrfs_root *root, struct btrfs_path *path, 4388 u32 data_size) 4389 { 4390 int slot; 4391 struct extent_buffer *leaf; 4392 struct btrfs_item *item; 4393 u32 nritems; 4394 unsigned int data_end; 4395 unsigned int old_data; 4396 unsigned int old_size; 4397 int i; 4398 struct btrfs_map_token token; 4399 4400 btrfs_init_map_token(&token); 4401 4402 leaf = path->nodes[0]; 4403 4404 nritems = btrfs_header_nritems(leaf); 4405 data_end = leaf_data_end(root, leaf); 4406 4407 if (btrfs_leaf_free_space(root, leaf) < data_size) { 4408 btrfs_print_leaf(root, leaf); 4409 BUG(); 4410 } 4411 slot = path->slots[0]; 4412 old_data = btrfs_item_end_nr(leaf, slot); 4413 4414 BUG_ON(slot < 0); 4415 if (slot >= nritems) { 4416 btrfs_print_leaf(root, leaf); 4417 printk(KERN_CRIT "slot %d too large, nritems %d\n", 4418 slot, nritems); 4419 BUG_ON(1); 4420 } 4421 4422 /* 4423 * item0..itemN ... dataN.offset..dataN.size .. data0.size 4424 */ 4425 /* first correct the data pointers */ 4426 for (i = slot; i < nritems; i++) { 4427 u32 ioff; 4428 item = btrfs_item_nr(leaf, i); 4429 4430 ioff = btrfs_token_item_offset(leaf, item, &token); 4431 btrfs_set_token_item_offset(leaf, item, 4432 ioff - data_size, &token); 4433 } 4434 4435 /* shift the data */ 4436 memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) + 4437 data_end - data_size, btrfs_leaf_data(leaf) + 4438 data_end, old_data - data_end); 4439 4440 data_end = old_data; 4441 old_size = btrfs_item_size_nr(leaf, slot); 4442 item = btrfs_item_nr(leaf, slot); 4443 btrfs_set_item_size(leaf, item, old_size + data_size); 4444 btrfs_mark_buffer_dirty(leaf); 4445 4446 if (btrfs_leaf_free_space(root, leaf) < 0) { 4447 btrfs_print_leaf(root, leaf); 4448 BUG(); 4449 } 4450 } 4451 4452 /* 4453 * this is a helper for btrfs_insert_empty_items, the main goal here is 4454 * to save stack depth by doing the bulk of the work in a function 4455 * that doesn't call btrfs_search_slot 4456 */ 4457 void setup_items_for_insert(struct btrfs_trans_handle *trans, 4458 struct btrfs_root *root, struct btrfs_path *path, 4459 struct btrfs_key *cpu_key, u32 *data_size, 4460 u32 total_data, u32 total_size, int nr) 4461 { 4462 struct btrfs_item *item; 4463 int i; 4464 u32 nritems; 4465 unsigned int data_end; 4466 struct btrfs_disk_key disk_key; 4467 struct extent_buffer *leaf; 4468 int slot; 4469 struct btrfs_map_token token; 4470 4471 btrfs_init_map_token(&token); 4472 4473 leaf = path->nodes[0]; 4474 slot = path->slots[0]; 4475 4476 nritems = btrfs_header_nritems(leaf); 4477 data_end = leaf_data_end(root, leaf); 4478 4479 if (btrfs_leaf_free_space(root, leaf) < total_size) { 4480 btrfs_print_leaf(root, leaf); 4481 printk(KERN_CRIT "not enough freespace need %u have %d\n", 4482 total_size, btrfs_leaf_free_space(root, leaf)); 4483 BUG(); 4484 } 4485 4486 if (slot != nritems) { 4487 unsigned int old_data = btrfs_item_end_nr(leaf, slot); 4488 4489 if (old_data < data_end) { 4490 btrfs_print_leaf(root, leaf); 4491 printk(KERN_CRIT "slot %d old_data %d data_end %d\n", 4492 slot, old_data, data_end); 4493 BUG_ON(1); 4494 } 4495 /* 4496 * item0..itemN ... dataN.offset..dataN.size .. 
data0.size 4497 */ 4498 /* first correct the data pointers */ 4499 for (i = slot; i < nritems; i++) { 4500 u32 ioff; 4501 4502 item = btrfs_item_nr(leaf, i); 4503 ioff = btrfs_token_item_offset(leaf, item, &token); 4504 btrfs_set_token_item_offset(leaf, item, 4505 ioff - total_data, &token); 4506 } 4507 /* shift the items */ 4508 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr), 4509 btrfs_item_nr_offset(slot), 4510 (nritems - slot) * sizeof(struct btrfs_item)); 4511 4512 /* shift the data */ 4513 memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) + 4514 data_end - total_data, btrfs_leaf_data(leaf) + 4515 data_end, old_data - data_end); 4516 data_end = old_data; 4517 } 4518 4519 /* setup the item for the new data */ 4520 for (i = 0; i < nr; i++) { 4521 btrfs_cpu_key_to_disk(&disk_key, cpu_key + i); 4522 btrfs_set_item_key(leaf, &disk_key, slot + i); 4523 item = btrfs_item_nr(leaf, slot + i); 4524 btrfs_set_token_item_offset(leaf, item, 4525 data_end - data_size[i], &token); 4526 data_end -= data_size[i]; 4527 btrfs_set_token_item_size(leaf, item, data_size[i], &token); 4528 } 4529 4530 btrfs_set_header_nritems(leaf, nritems + nr); 4531 4532 if (slot == 0) { 4533 btrfs_cpu_key_to_disk(&disk_key, cpu_key); 4534 fixup_low_keys(trans, root, path, &disk_key, 1); 4535 } 4536 btrfs_unlock_up_safe(path, 1); 4537 btrfs_mark_buffer_dirty(leaf); 4538 4539 if (btrfs_leaf_free_space(root, leaf) < 0) { 4540 btrfs_print_leaf(root, leaf); 4541 BUG(); 4542 } 4543 } 4544 4545 /* 4546 * Given a key and some data, insert items into the tree. 4547 * This does all the path init required, making room in the tree if needed. 4548 */ 4549 int btrfs_insert_empty_items(struct btrfs_trans_handle *trans, 4550 struct btrfs_root *root, 4551 struct btrfs_path *path, 4552 struct btrfs_key *cpu_key, u32 *data_size, 4553 int nr) 4554 { 4555 int ret = 0; 4556 int slot; 4557 int i; 4558 u32 total_size = 0; 4559 u32 total_data = 0; 4560 4561 for (i = 0; i < nr; i++) 4562 total_data += data_size[i]; 4563 4564 total_size = total_data + (nr * sizeof(struct btrfs_item)); 4565 ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1); 4566 if (ret == 0) 4567 return -EEXIST; 4568 if (ret < 0) 4569 return ret; 4570 4571 slot = path->slots[0]; 4572 BUG_ON(slot < 0); 4573 4574 setup_items_for_insert(trans, root, path, cpu_key, data_size, 4575 total_data, total_size, nr); 4576 return 0; 4577 } 4578 4579 /* 4580 * Given a key and some data, insert an item into the tree. 4581 * This does all the path init required, making room in the tree if needed. 4582 */ 4583 int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root 4584 *root, struct btrfs_key *cpu_key, void *data, u32 4585 data_size) 4586 { 4587 int ret = 0; 4588 struct btrfs_path *path; 4589 struct extent_buffer *leaf; 4590 unsigned long ptr; 4591 4592 path = btrfs_alloc_path(); 4593 if (!path) 4594 return -ENOMEM; 4595 ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size); 4596 if (!ret) { 4597 leaf = path->nodes[0]; 4598 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); 4599 write_extent_buffer(leaf, data, ptr, data_size); 4600 btrfs_mark_buffer_dirty(leaf); 4601 } 4602 btrfs_free_path(path); 4603 return ret; 4604 } 4605 4606 /* 4607 * delete the pointer from a given node. 4608 * 4609 * the tree should have been previously balanced so the deletion does not 4610 * empty a node. 
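 *
 * For example, deleting slot 1 from a node holding nritems == 4 moves the
 * key/blockptr pairs in slots 2..3 down one position and leaves the node
 * with nritems == 3; deleting the last slot only decrements nritems.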
4611 */ 4612 static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, 4613 struct btrfs_path *path, int level, int slot) 4614 { 4615 struct extent_buffer *parent = path->nodes[level]; 4616 u32 nritems; 4617 int ret; 4618 4619 nritems = btrfs_header_nritems(parent); 4620 if (slot != nritems - 1) { 4621 if (level) 4622 tree_mod_log_eb_move(root->fs_info, parent, slot, 4623 slot + 1, nritems - slot - 1); 4624 memmove_extent_buffer(parent, 4625 btrfs_node_key_ptr_offset(slot), 4626 btrfs_node_key_ptr_offset(slot + 1), 4627 sizeof(struct btrfs_key_ptr) * 4628 (nritems - slot - 1)); 4629 } else if (level) { 4630 ret = tree_mod_log_insert_key(root->fs_info, parent, slot, 4631 MOD_LOG_KEY_REMOVE); 4632 BUG_ON(ret < 0); 4633 } 4634 4635 nritems--; 4636 btrfs_set_header_nritems(parent, nritems); 4637 if (nritems == 0 && parent == root->node) { 4638 BUG_ON(btrfs_header_level(root->node) != 1); 4639 /* just turn the root into a leaf and break */ 4640 btrfs_set_header_level(root->node, 0); 4641 } else if (slot == 0) { 4642 struct btrfs_disk_key disk_key; 4643 4644 btrfs_node_key(parent, &disk_key, 0); 4645 fixup_low_keys(trans, root, path, &disk_key, level + 1); 4646 } 4647 btrfs_mark_buffer_dirty(parent); 4648 } 4649 4650 /* 4651 * a helper function to delete the leaf pointed to by path->slots[1] and 4652 * path->nodes[1]. 4653 * 4654 * This deletes the pointer in path->nodes[1] and frees the leaf 4655 * block extent. 4656 * 4657 * The path must have already been setup for deleting the leaf, including 4658 * all the proper balancing. path->nodes[1] must be locked. 4659 */ 4660 static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans, 4661 struct btrfs_root *root, 4662 struct btrfs_path *path, 4663 struct extent_buffer *leaf) 4664 { 4665 WARN_ON(btrfs_header_generation(leaf) != trans->transid); 4666 del_ptr(trans, root, path, 1, path->slots[1]); 4667 4668 /* 4669 * btrfs_free_extent is expensive, we want to make sure we 4670 * aren't holding any locks when we call it 4671 */ 4672 btrfs_unlock_up_safe(path, 0); 4673 4674 root_sub_used(root, leaf->len); 4675 4676 extent_buffer_get(leaf); 4677 btrfs_free_tree_block(trans, root, leaf, 0, 1); 4678 free_extent_buffer_stale(leaf); 4679 } 4680 /* 4681 * delete the item at the leaf level in path.
If that empties 4682 * the leaf, remove it from the tree 4683 */ 4684 int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, 4685 struct btrfs_path *path, int slot, int nr) 4686 { 4687 struct extent_buffer *leaf; 4688 struct btrfs_item *item; 4689 int last_off; 4690 int dsize = 0; 4691 int ret = 0; 4692 int wret; 4693 int i; 4694 u32 nritems; 4695 struct btrfs_map_token token; 4696 4697 btrfs_init_map_token(&token); 4698 4699 leaf = path->nodes[0]; 4700 last_off = btrfs_item_offset_nr(leaf, slot + nr - 1); 4701 4702 for (i = 0; i < nr; i++) 4703 dsize += btrfs_item_size_nr(leaf, slot + i); 4704 4705 nritems = btrfs_header_nritems(leaf); 4706 4707 if (slot + nr != nritems) { 4708 int data_end = leaf_data_end(root, leaf); 4709 4710 memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) + 4711 data_end + dsize, 4712 btrfs_leaf_data(leaf) + data_end, 4713 last_off - data_end); 4714 4715 for (i = slot + nr; i < nritems; i++) { 4716 u32 ioff; 4717 4718 item = btrfs_item_nr(leaf, i); 4719 ioff = btrfs_token_item_offset(leaf, item, &token); 4720 btrfs_set_token_item_offset(leaf, item, 4721 ioff + dsize, &token); 4722 } 4723 4724 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot), 4725 btrfs_item_nr_offset(slot + nr), 4726 sizeof(struct btrfs_item) * 4727 (nritems - slot - nr)); 4728 } 4729 btrfs_set_header_nritems(leaf, nritems - nr); 4730 nritems -= nr; 4731 4732 /* delete the leaf if we've emptied it */ 4733 if (nritems == 0) { 4734 if (leaf == root->node) { 4735 btrfs_set_header_level(leaf, 0); 4736 } else { 4737 btrfs_set_path_blocking(path); 4738 clean_tree_block(trans, root, leaf); 4739 btrfs_del_leaf(trans, root, path, leaf); 4740 } 4741 } else { 4742 int used = leaf_space_used(leaf, 0, nritems); 4743 if (slot == 0) { 4744 struct btrfs_disk_key disk_key; 4745 4746 btrfs_item_key(leaf, &disk_key, 0); 4747 fixup_low_keys(trans, root, path, &disk_key, 1); 4748 } 4749 4750 /* delete the leaf if it is mostly empty */ 4751 if (used < BTRFS_LEAF_DATA_SIZE(root) / 3) { 4752 /* push_leaf_left fixes the path. 4753 * make sure the path still points to our leaf 4754 * for possible call to del_ptr below 4755 */ 4756 slot = path->slots[1]; 4757 extent_buffer_get(leaf); 4758 4759 btrfs_set_path_blocking(path); 4760 wret = push_leaf_left(trans, root, path, 1, 1, 4761 1, (u32)-1); 4762 if (wret < 0 && wret != -ENOSPC) 4763 ret = wret; 4764 4765 if (path->nodes[0] == leaf && 4766 btrfs_header_nritems(leaf)) { 4767 wret = push_leaf_right(trans, root, path, 1, 4768 1, 1, 0); 4769 if (wret < 0 && wret != -ENOSPC) 4770 ret = wret; 4771 } 4772 4773 if (btrfs_header_nritems(leaf) == 0) { 4774 path->slots[1] = slot; 4775 btrfs_del_leaf(trans, root, path, leaf); 4776 free_extent_buffer(leaf); 4777 ret = 0; 4778 } else { 4779 /* if we're still in the path, make sure 4780 * we're dirty. Otherwise, one of the 4781 * push_leaf functions must have already 4782 * dirtied this buffer 4783 */ 4784 if (path->nodes[0] == leaf) 4785 btrfs_mark_buffer_dirty(leaf); 4786 free_extent_buffer(leaf); 4787 } 4788 } else { 4789 btrfs_mark_buffer_dirty(leaf); 4790 } 4791 } 4792 return ret; 4793 } 4794 4795 /* 4796 * search the tree again to find a leaf with lesser keys 4797 * returns 0 if it found something or 1 if there are no lesser leaves. 4798 * returns < 0 on io errors. 4799 * 4800 * This may release the path, and so you may lose any locks held at the 4801 * time you call it. 
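 *
 * A typical backwards scan (a sketch of the pattern btrfs_previous_item
 * below uses) steps within the current leaf while it can, and only drops
 * back to the previous leaf once slot zero is reached:
 *
 *	if (path->slots[0] == 0) {
 *		ret = btrfs_prev_leaf(root, path);
 *		if (ret)
 *			return ret;
 *	} else {
 *		path->slots[0]--;
 *	}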
4802 */ 4803 int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path) 4804 { 4805 struct btrfs_key key; 4806 struct btrfs_disk_key found_key; 4807 int ret; 4808 4809 btrfs_item_key_to_cpu(path->nodes[0], &key, 0); 4810 4811 if (key.offset > 0) 4812 key.offset--; 4813 else if (key.type > 0) 4814 key.type--; 4815 else if (key.objectid > 0) 4816 key.objectid--; 4817 else 4818 return 1; 4819 4820 btrfs_release_path(path); 4821 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 4822 if (ret < 0) 4823 return ret; 4824 btrfs_item_key(path->nodes[0], &found_key, 0); 4825 ret = comp_keys(&found_key, &key); 4826 if (ret < 0) 4827 return 0; 4828 return 1; 4829 } 4830 4831 /* 4832 * A helper function to walk down the tree starting at min_key, and looking 4833 * for nodes or leaves that have a minimum transaction id. 4834 * This is used by the btree defrag code, and tree logging 4835 * 4836 * This does not cow, but it does stuff the starting key it finds back 4837 * into min_key, so you can call btrfs_search_slot with cow=1 on the 4838 * key and get a writable path. 4839 * 4840 * This does lock as it descends, and path->keep_locks should be set 4841 * to 1 by the caller. 4842 * 4843 * This honors path->lowest_level to prevent descent past a given level 4844 * of the tree. 4845 * 4846 * min_trans indicates the oldest transaction that you are interested 4847 * in walking through. Any nodes or leaves older than min_trans are 4848 * skipped over (without reading them). 4849 * 4850 * returns zero if something useful was found, < 0 on error and 1 if there 4851 * was nothing in the tree that matched the search criteria. 4852 */ 4853 int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key, 4854 struct btrfs_key *max_key, 4855 struct btrfs_path *path, 4856 u64 min_trans) 4857 { 4858 struct extent_buffer *cur; 4859 struct btrfs_key found_key; 4860 int slot; 4861 int sret; 4862 u32 nritems; 4863 int level; 4864 int ret = 1; 4865 4866 WARN_ON(!path->keep_locks); 4867 again: 4868 cur = btrfs_read_lock_root_node(root); 4869 level = btrfs_header_level(cur); 4870 WARN_ON(path->nodes[level]); 4871 path->nodes[level] = cur; 4872 path->locks[level] = BTRFS_READ_LOCK; 4873 4874 if (btrfs_header_generation(cur) < min_trans) { 4875 ret = 1; 4876 goto out; 4877 } 4878 while (1) { 4879 nritems = btrfs_header_nritems(cur); 4880 level = btrfs_header_level(cur); 4881 sret = bin_search(cur, min_key, level, &slot); 4882 4883 /* at the lowest level, we're done, setup the path and exit */ 4884 if (level == path->lowest_level) { 4885 if (slot >= nritems) 4886 goto find_next_key; 4887 ret = 0; 4888 path->slots[level] = slot; 4889 btrfs_item_key_to_cpu(cur, &found_key, slot); 4890 goto out; 4891 } 4892 if (sret && slot > 0) 4893 slot--; 4894 /* 4895 * check this node pointer against the min_trans parameter. 4896 * If it is too old, skip to the next one.
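 * Skipping is safe because COW bumps the generation of every node on
 * the path from a modified block up to the root, so a pointer whose
 * generation is below min_trans cannot lead to anything newer than
 * min_trans.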
4897 */ 4898 while (slot < nritems) { 4899 u64 blockptr; 4900 u64 gen; 4901 4902 blockptr = btrfs_node_blockptr(cur, slot); 4903 gen = btrfs_node_ptr_generation(cur, slot); 4904 if (gen < min_trans) { 4905 slot++; 4906 continue; 4907 } 4908 break; 4909 } 4910 find_next_key: 4911 /* 4912 * we didn't find a candidate key in this node, walk forward 4913 * and find another one 4914 */ 4915 if (slot >= nritems) { 4916 path->slots[level] = slot; 4917 btrfs_set_path_blocking(path); 4918 sret = btrfs_find_next_key(root, path, min_key, level, 4919 min_trans); 4920 if (sret == 0) { 4921 btrfs_release_path(path); 4922 goto again; 4923 } else { 4924 goto out; 4925 } 4926 } 4927 /* save our key for returning back */ 4928 btrfs_node_key_to_cpu(cur, &found_key, slot); 4929 path->slots[level] = slot; 4930 if (level == path->lowest_level) { 4931 ret = 0; 4932 unlock_up(path, level, 1, 0, NULL); 4933 goto out; 4934 } 4935 btrfs_set_path_blocking(path); 4936 cur = read_node_slot(root, cur, slot); 4937 BUG_ON(!cur); /* -ENOMEM */ 4938 4939 btrfs_tree_read_lock(cur); 4940 4941 path->locks[level - 1] = BTRFS_READ_LOCK; 4942 path->nodes[level - 1] = cur; 4943 unlock_up(path, level, 1, 0, NULL); 4944 btrfs_clear_path_blocking(path, NULL, 0); 4945 } 4946 out: 4947 if (ret == 0) 4948 memcpy(min_key, &found_key, sizeof(found_key)); 4949 btrfs_set_path_blocking(path); 4950 return ret; 4951 } 4952 4953 static void tree_move_down(struct btrfs_root *root, 4954 struct btrfs_path *path, 4955 int *level, int root_level) 4956 { 4957 BUG_ON(*level == 0); 4958 path->nodes[*level - 1] = read_node_slot(root, path->nodes[*level], 4959 path->slots[*level]); 4960 path->slots[*level - 1] = 0; 4961 (*level)--; 4962 } 4963 4964 static int tree_move_next_or_upnext(struct btrfs_root *root, 4965 struct btrfs_path *path, 4966 int *level, int root_level) 4967 { 4968 int ret = 0; 4969 int nritems; 4970 nritems = btrfs_header_nritems(path->nodes[*level]); 4971 4972 path->slots[*level]++; 4973 4974 while (path->slots[*level] >= nritems) { 4975 if (*level == root_level) 4976 return -1; 4977 4978 /* move upnext */ 4979 path->slots[*level] = 0; 4980 free_extent_buffer(path->nodes[*level]); 4981 path->nodes[*level] = NULL; 4982 (*level)++; 4983 path->slots[*level]++; 4984 4985 nritems = btrfs_header_nritems(path->nodes[*level]); 4986 ret = 1; 4987 } 4988 return ret; 4989 } 4990 4991 /* 4992 * Returns 1 if it had to move up and next. 0 is returned if it moved only next 4993 * or down. 
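 * A return of < 0 (propagated from tree_move_next_or_upnext once the
 * root level is exhausted) means the tree has no next position;
 * btrfs_compare_trees below uses that to set left_end_reached or
 * right_end_reached.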
4994 */ 4995 static int tree_advance(struct btrfs_root *root, 4996 struct btrfs_path *path, 4997 int *level, int root_level, 4998 int allow_down, 4999 struct btrfs_key *key) 5000 { 5001 int ret; 5002 5003 if (*level == 0 || !allow_down) { 5004 ret = tree_move_next_or_upnext(root, path, level, root_level); 5005 } else { 5006 tree_move_down(root, path, level, root_level); 5007 ret = 0; 5008 } 5009 if (ret >= 0) { 5010 if (*level == 0) 5011 btrfs_item_key_to_cpu(path->nodes[*level], key, 5012 path->slots[*level]); 5013 else 5014 btrfs_node_key_to_cpu(path->nodes[*level], key, 5015 path->slots[*level]); 5016 } 5017 return ret; 5018 } 5019 5020 static int tree_compare_item(struct btrfs_root *left_root, 5021 struct btrfs_path *left_path, 5022 struct btrfs_path *right_path, 5023 char *tmp_buf) 5024 { 5025 int cmp; 5026 int len1, len2; 5027 unsigned long off1, off2; 5028 5029 len1 = btrfs_item_size_nr(left_path->nodes[0], left_path->slots[0]); 5030 len2 = btrfs_item_size_nr(right_path->nodes[0], right_path->slots[0]); 5031 if (len1 != len2) 5032 return 1; 5033 5034 off1 = btrfs_item_ptr_offset(left_path->nodes[0], left_path->slots[0]); 5035 off2 = btrfs_item_ptr_offset(right_path->nodes[0], 5036 right_path->slots[0]); 5037 5038 read_extent_buffer(left_path->nodes[0], tmp_buf, off1, len1); 5039 5040 cmp = memcmp_extent_buffer(right_path->nodes[0], tmp_buf, off2, len1); 5041 if (cmp) 5042 return 1; 5043 return 0; 5044 } 5045 5046 #define ADVANCE 1 5047 #define ADVANCE_ONLY_NEXT -1 5048 5049 /* 5050 * This function compares two trees and calls the provided callback for 5051 * every changed/new/deleted item it finds. 5052 * If shared tree blocks are encountered, whole subtrees are skipped, making 5053 * the compare pretty fast on snapshotted subvolumes. 5054 * 5055 * This currently works on commit roots only. As commit roots are read only, 5056 * we don't do any locking. The commit roots are protected with transactions. 5057 * Transactions are ended and rejoined when a commit is tried in between. 5058 * 5059 * This function checks for modifications done to the trees while comparing. 5060 * If it detects a change, it aborts immediately. 
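 *
 * A minimal caller sketch (hypothetical callback and roots; the in-tree
 * user of this interface is the send code). 'result' tells the callback
 * whether the item at 'key' is new, deleted or changed:
 *
 *	static int my_changed_cb(struct btrfs_root *left_root,
 *				 struct btrfs_root *right_root,
 *				 struct btrfs_path *left_path,
 *				 struct btrfs_path *right_path,
 *				 struct btrfs_key *key,
 *				 enum btrfs_compare_tree_result result,
 *				 void *ctx)
 *	{
 *		return 0;
 *	}
 *
 *	ret = btrfs_compare_trees(parent_root, send_root, my_changed_cb,
 *				  my_ctx);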
5061 */ 5062 int btrfs_compare_trees(struct btrfs_root *left_root, 5063 struct btrfs_root *right_root, 5064 btrfs_changed_cb_t changed_cb, void *ctx) 5065 { 5066 int ret; 5067 int cmp; 5068 struct btrfs_trans_handle *trans = NULL; 5069 struct btrfs_path *left_path = NULL; 5070 struct btrfs_path *right_path = NULL; 5071 struct btrfs_key left_key; 5072 struct btrfs_key right_key; 5073 char *tmp_buf = NULL; 5074 int left_root_level; 5075 int right_root_level; 5076 int left_level; 5077 int right_level; 5078 int left_end_reached; 5079 int right_end_reached; 5080 int advance_left; 5081 int advance_right; 5082 u64 left_blockptr; 5083 u64 right_blockptr; 5084 u64 left_start_ctransid; 5085 u64 right_start_ctransid; 5086 u64 ctransid; 5087 5088 left_path = btrfs_alloc_path(); 5089 if (!left_path) { 5090 ret = -ENOMEM; 5091 goto out; 5092 } 5093 right_path = btrfs_alloc_path(); 5094 if (!right_path) { 5095 ret = -ENOMEM; 5096 goto out; 5097 } 5098 5099 tmp_buf = kmalloc(left_root->leafsize, GFP_NOFS); 5100 if (!tmp_buf) { 5101 ret = -ENOMEM; 5102 goto out; 5103 } 5104 5105 left_path->search_commit_root = 1; 5106 left_path->skip_locking = 1; 5107 right_path->search_commit_root = 1; 5108 right_path->skip_locking = 1; 5109 5110 spin_lock(&left_root->root_item_lock); 5111 left_start_ctransid = btrfs_root_ctransid(&left_root->root_item); 5112 spin_unlock(&left_root->root_item_lock); 5113 5114 spin_lock(&right_root->root_item_lock); 5115 right_start_ctransid = btrfs_root_ctransid(&right_root->root_item); 5116 spin_unlock(&right_root->root_item_lock); 5117 5118 trans = btrfs_join_transaction(left_root); 5119 if (IS_ERR(trans)) { 5120 ret = PTR_ERR(trans); 5121 trans = NULL; 5122 goto out; 5123 } 5124 5125 /* 5126 * Strategy: Go to the first items of both trees. Then do 5127 * 5128 * If both trees are at level 0 5129 * Compare keys of current items 5130 * If left < right treat left item as new, advance left tree 5131 * and repeat 5132 * If left > right treat right item as deleted, advance right tree 5133 * and repeat 5134 * If left == right do deep compare of items, treat as changed if 5135 * needed, advance both trees and repeat 5136 * If both trees are at the same level but not at level 0 5137 * Compare keys of current nodes/leaves 5138 * If left < right advance left tree and repeat 5139 * If left > right advance right tree and repeat 5140 * If left == right compare blockptrs of the next nodes/leaves 5141 * If they match advance both trees but stay at the same level 5142 * and repeat 5143 * If they don't match advance both trees while allowing them to go 5144 * deeper and repeat 5145 * If tree levels are different 5146 * Advance the tree that needs it and repeat 5147 * 5148 * Advancing a tree means: 5149 * If we are at level 0, try to go to the next slot. If that's not 5150 * possible, go one level up and repeat. Stop when we find a level 5151 * where we can go to the next slot. We may at this point be on a 5152 * node or a leaf. 5153 * 5154 * If we are not at level 0 and not on shared tree blocks, go one 5155 * level deeper. 5156 * 5157 * If we are not at level 0 and on shared tree blocks, go one slot to 5158 * the right if possible or go up and right.
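 *
 * For example, with both trees at level 0, a left key of
 * (257 INODE_ITEM 0) and a right key of (256 INODE_ITEM 0) compare as
 * left > right, so the right item is reported as
 * BTRFS_COMPARE_TREE_DELETED and only the right tree is advanced.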
5159 */ 5160 5161 left_level = btrfs_header_level(left_root->commit_root); 5162 left_root_level = left_level; 5163 left_path->nodes[left_level] = left_root->commit_root; 5164 extent_buffer_get(left_path->nodes[left_level]); 5165 5166 right_level = btrfs_header_level(right_root->commit_root); 5167 right_root_level = right_level; 5168 right_path->nodes[right_level] = right_root->commit_root; 5169 extent_buffer_get(right_path->nodes[right_level]); 5170 5171 if (left_level == 0) 5172 btrfs_item_key_to_cpu(left_path->nodes[left_level], 5173 &left_key, left_path->slots[left_level]); 5174 else 5175 btrfs_node_key_to_cpu(left_path->nodes[left_level], 5176 &left_key, left_path->slots[left_level]); 5177 if (right_level == 0) 5178 btrfs_item_key_to_cpu(right_path->nodes[right_level], 5179 &right_key, right_path->slots[right_level]); 5180 else 5181 btrfs_node_key_to_cpu(right_path->nodes[right_level], 5182 &right_key, right_path->slots[right_level]); 5183 5184 left_end_reached = right_end_reached = 0; 5185 advance_left = advance_right = 0; 5186 5187 while (1) { 5188 /* 5189 * We need to make sure the transaction does not get committed 5190 * while we do anything on commit roots. This means, we need to 5191 * join and leave transactions for every item that we process. 5192 */ 5193 if (trans && btrfs_should_end_transaction(trans, left_root)) { 5194 btrfs_release_path(left_path); 5195 btrfs_release_path(right_path); 5196 5197 ret = btrfs_end_transaction(trans, left_root); 5198 trans = NULL; 5199 if (ret < 0) 5200 goto out; 5201 } 5202 /* now rejoin the transaction */ 5203 if (!trans) { 5204 trans = btrfs_join_transaction(left_root); 5205 if (IS_ERR(trans)) { 5206 ret = PTR_ERR(trans); 5207 trans = NULL; 5208 goto out; 5209 } 5210 5211 spin_lock(&left_root->root_item_lock); 5212 ctransid = btrfs_root_ctransid(&left_root->root_item); 5213 spin_unlock(&left_root->root_item_lock); 5214 if (ctransid != left_start_ctransid) 5215 left_start_ctransid = 0; 5216 5217 spin_lock(&right_root->root_item_lock); 5218 ctransid = btrfs_root_ctransid(&right_root->root_item); 5219 spin_unlock(&right_root->root_item_lock); 5220 if (ctransid != right_start_ctransid) 5221 right_start_ctransid = 0; 5222 5223 if (!left_start_ctransid || !right_start_ctransid) { 5224 WARN(1, KERN_WARNING 5225 "btrfs: btrfs_compare_tree detected " 5226 "a change in one of the trees while " 5227 "iterating. 
This is probably a " 5228 "bug.\n"); 5229 ret = -EIO; 5230 goto out; 5231 } 5232 5233 /* 5234 * the commit root may have changed, so start again 5235 * where we stopped 5236 */ 5237 left_path->lowest_level = left_level; 5238 right_path->lowest_level = right_level; 5239 ret = btrfs_search_slot(NULL, left_root, 5240 &left_key, left_path, 0, 0); 5241 if (ret < 0) 5242 goto out; 5243 ret = btrfs_search_slot(NULL, right_root, 5244 &right_key, right_path, 0, 0); 5245 if (ret < 0) 5246 goto out; 5247 } 5248 5249 if (advance_left && !left_end_reached) { 5250 ret = tree_advance(left_root, left_path, &left_level, 5251 left_root_level, 5252 advance_left != ADVANCE_ONLY_NEXT, 5253 &left_key); 5254 if (ret < 0) 5255 left_end_reached = ADVANCE; 5256 advance_left = 0; 5257 } 5258 if (advance_right && !right_end_reached) { 5259 ret = tree_advance(right_root, right_path, &right_level, 5260 right_root_level, 5261 advance_right != ADVANCE_ONLY_NEXT, 5262 &right_key); 5263 if (ret < 0) 5264 right_end_reached = ADVANCE; 5265 advance_right = 0; 5266 } 5267 5268 if (left_end_reached && right_end_reached) { 5269 ret = 0; 5270 goto out; 5271 } else if (left_end_reached) { 5272 if (right_level == 0) { 5273 ret = changed_cb(left_root, right_root, 5274 left_path, right_path, 5275 &right_key, 5276 BTRFS_COMPARE_TREE_DELETED, 5277 ctx); 5278 if (ret < 0) 5279 goto out; 5280 } 5281 advance_right = ADVANCE; 5282 continue; 5283 } else if (right_end_reached) { 5284 if (left_level == 0) { 5285 ret = changed_cb(left_root, right_root, 5286 left_path, right_path, 5287 &left_key, 5288 BTRFS_COMPARE_TREE_NEW, 5289 ctx); 5290 if (ret < 0) 5291 goto out; 5292 } 5293 advance_left = ADVANCE; 5294 continue; 5295 } 5296 5297 if (left_level == 0 && right_level == 0) { 5298 cmp = btrfs_comp_cpu_keys(&left_key, &right_key); 5299 if (cmp < 0) { 5300 ret = changed_cb(left_root, right_root, 5301 left_path, right_path, 5302 &left_key, 5303 BTRFS_COMPARE_TREE_NEW, 5304 ctx); 5305 if (ret < 0) 5306 goto out; 5307 advance_left = ADVANCE; 5308 } else if (cmp > 0) { 5309 ret = changed_cb(left_root, right_root, 5310 left_path, right_path, 5311 &right_key, 5312 BTRFS_COMPARE_TREE_DELETED, 5313 ctx); 5314 if (ret < 0) 5315 goto out; 5316 advance_right = ADVANCE; 5317 } else { 5318 WARN_ON(!extent_buffer_uptodate(left_path->nodes[0])); 5319 ret = tree_compare_item(left_root, left_path, 5320 right_path, tmp_buf); 5321 if (ret) { 5322 WARN_ON(!extent_buffer_uptodate(left_path->nodes[0])); 5323 ret = changed_cb(left_root, right_root, 5324 left_path, right_path, 5325 &left_key, 5326 BTRFS_COMPARE_TREE_CHANGED, 5327 ctx); 5328 if (ret < 0) 5329 goto out; 5330 } 5331 advance_left = ADVANCE; 5332 advance_right = ADVANCE; 5333 } 5334 } else if (left_level == right_level) { 5335 cmp = btrfs_comp_cpu_keys(&left_key, &right_key); 5336 if (cmp < 0) { 5337 advance_left = ADVANCE; 5338 } else if (cmp > 0) { 5339 advance_right = ADVANCE; 5340 } else { 5341 left_blockptr = btrfs_node_blockptr( 5342 left_path->nodes[left_level], 5343 left_path->slots[left_level]); 5344 right_blockptr = btrfs_node_blockptr( 5345 right_path->nodes[right_level], 5346 right_path->slots[right_level]); 5347 if (left_blockptr == right_blockptr) { 5348 /* 5349 * As we're on a shared block, don't 5350 * allow to go deeper. 
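 * Equal block pointers mean the two read-only commit roots share
 * the entire subtree below this slot, so nothing inside it can
 * differ and both sides may simply step over it.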
5351 */ 5352 advance_left = ADVANCE_ONLY_NEXT; 5353 advance_right = ADVANCE_ONLY_NEXT; 5354 } else { 5355 advance_left = ADVANCE; 5356 advance_right = ADVANCE; 5357 } 5358 } 5359 } else if (left_level < right_level) { 5360 advance_right = ADVANCE; 5361 } else { 5362 advance_left = ADVANCE; 5363 } 5364 } 5365 5366 out: 5367 btrfs_free_path(left_path); 5368 btrfs_free_path(right_path); 5369 kfree(tmp_buf); 5370 5371 if (trans) { 5372 if (!ret) 5373 ret = btrfs_end_transaction(trans, left_root); 5374 else 5375 btrfs_end_transaction(trans, left_root); 5376 } 5377 5378 return ret; 5379 } 5380 5381 /* 5382 * this is similar to btrfs_next_leaf, but does not try to preserve 5383 * and fixup the path. It looks for and returns the next key in the 5384 * tree based on the current path and the min_trans parameters. 5385 * 5386 * 0 is returned if another key is found, < 0 if there are any errors 5387 * and 1 is returned if there are no higher keys in the tree 5388 * 5389 * path->keep_locks should be set to 1 on the search made before 5390 * calling this function. 5391 */ 5392 int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path, 5393 struct btrfs_key *key, int level, u64 min_trans) 5394 { 5395 int slot; 5396 struct extent_buffer *c; 5397 5398 WARN_ON(!path->keep_locks); 5399 while (level < BTRFS_MAX_LEVEL) { 5400 if (!path->nodes[level]) 5401 return 1; 5402 5403 slot = path->slots[level] + 1; 5404 c = path->nodes[level]; 5405 next: 5406 if (slot >= btrfs_header_nritems(c)) { 5407 int ret; 5408 int orig_lowest; 5409 struct btrfs_key cur_key; 5410 if (level + 1 >= BTRFS_MAX_LEVEL || 5411 !path->nodes[level + 1]) 5412 return 1; 5413 5414 if (path->locks[level + 1]) { 5415 level++; 5416 continue; 5417 } 5418 5419 slot = btrfs_header_nritems(c) - 1; 5420 if (level == 0) 5421 btrfs_item_key_to_cpu(c, &cur_key, slot); 5422 else 5423 btrfs_node_key_to_cpu(c, &cur_key, slot); 5424 5425 orig_lowest = path->lowest_level; 5426 btrfs_release_path(path); 5427 path->lowest_level = level; 5428 ret = btrfs_search_slot(NULL, root, &cur_key, path, 5429 0, 0); 5430 path->lowest_level = orig_lowest; 5431 if (ret < 0) 5432 return ret; 5433 5434 c = path->nodes[level]; 5435 slot = path->slots[level]; 5436 if (ret == 0) 5437 slot++; 5438 goto next; 5439 } 5440 5441 if (level == 0) 5442 btrfs_item_key_to_cpu(c, key, slot); 5443 else { 5444 u64 gen = btrfs_node_ptr_generation(c, slot); 5445 5446 if (gen < min_trans) { 5447 slot++; 5448 goto next; 5449 } 5450 btrfs_node_key_to_cpu(c, key, slot); 5451 } 5452 return 0; 5453 } 5454 return 1; 5455 } 5456 5457 /* 5458 * search the tree again to find a leaf with greater keys 5459 * returns 0 if it found something or 1 if there are no greater leaves. 5460 * returns < 0 on io errors. 5461 */ 5462 int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path) 5463 { 5464 return btrfs_next_old_leaf(root, path, 0); 5465 } 5466 5467 /* Release the path up to but not including the given level */ 5468 static void btrfs_release_level(struct btrfs_path *path, int level) 5469 { 5470 int i; 5471 5472 for (i = 0; i < level; i++) { 5473 path->slots[i] = 0; 5474 if (!path->nodes[i]) 5475 continue; 5476 if (path->locks[i]) { 5477 btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]); 5478 path->locks[i] = 0; 5479 } 5480 free_extent_buffer(path->nodes[i]); 5481 path->nodes[i] = NULL; 5482 } 5483 } 5484 5485 /* 5486 * This function assumes 2 things 5487 * 5488 * 1) You are using path->keep_locks 5489 * 2) You are not inserting items. 
5490 * 5491 * If either of these is not true, do not use this function. If you need a next 5492 * leaf with either of these not being true then this function can be easily 5493 * adapted to do that, but at the moment these are the limitations. 5494 */ 5495 int btrfs_next_leaf_write(struct btrfs_trans_handle *trans, 5496 struct btrfs_root *root, struct btrfs_path *path, 5497 int del) 5498 { 5499 struct extent_buffer *b; 5500 struct btrfs_key key; 5501 u32 nritems; 5502 int level = 1; 5503 int slot; 5504 int ret = 1; 5505 int write_lock_level = BTRFS_MAX_LEVEL; 5506 int ins_len = del ? -1 : 0; 5507 5508 WARN_ON(!(path->keep_locks || path->really_keep_locks)); 5509 5510 nritems = btrfs_header_nritems(path->nodes[0]); 5511 btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1); 5512 5513 while (path->nodes[level]) { 5514 nritems = btrfs_header_nritems(path->nodes[level]); 5515 if (!(path->locks[level] & BTRFS_WRITE_LOCK)) { 5516 search: 5517 btrfs_release_path(path); 5518 ret = btrfs_search_slot(trans, root, &key, path, 5519 ins_len, 1); 5520 if (ret < 0) 5521 goto out; 5522 level = 1; 5523 continue; 5524 } 5525 5526 if (path->slots[level] >= nritems - 1) { 5527 level++; 5528 continue; 5529 } 5530 5531 btrfs_release_level(path, level); 5532 break; 5533 } 5534 5535 if (!path->nodes[level]) { 5536 ret = 1; 5537 goto out; 5538 } 5539 5540 path->slots[level]++; 5541 b = path->nodes[level]; 5542 5543 while (b) { 5544 level = btrfs_header_level(b); 5545 5546 if (!should_cow_block(trans, root, b)) 5547 goto cow_done; 5548 5549 btrfs_set_path_blocking(path); 5550 ret = btrfs_cow_block(trans, root, b, 5551 path->nodes[level + 1], 5552 path->slots[level + 1], &b); 5553 if (ret) 5554 goto out; 5555 cow_done: 5556 path->nodes[level] = b; 5557 btrfs_clear_path_blocking(path, NULL, 0); 5558 if (level != 0) { 5559 ret = setup_nodes_for_search(trans, root, path, b, 5560 level, ins_len, 5561 &write_lock_level); 5562 if (ret == -EAGAIN) 5563 goto search; 5564 if (ret) 5565 goto out; 5566 5567 b = path->nodes[level]; 5568 slot = path->slots[level]; 5569 5570 ret = read_block_for_search(trans, root, path, 5571 &b, level, slot, &key, 0); 5572 if (ret == -EAGAIN) 5573 goto search; 5574 if (ret) 5575 goto out; 5576 level = btrfs_header_level(b); 5577 if (!btrfs_try_tree_write_lock(b)) { 5578 btrfs_set_path_blocking(path); 5579 btrfs_tree_lock(b); 5580 btrfs_clear_path_blocking(path, b, 5581 BTRFS_WRITE_LOCK); 5582 } 5583 path->locks[level] = BTRFS_WRITE_LOCK; 5584 path->nodes[level] = b; 5585 path->slots[level] = 0; 5586 } else { 5587 path->slots[level] = 0; 5588 ret = 0; 5589 break; 5590 } 5591 } 5592 5593 out: 5594 if (ret) 5595 btrfs_release_path(path); 5596 5597 return ret; 5598 } 5599 5600 int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path, 5601 u64 time_seq) 5602 { 5603 int slot; 5604 int level; 5605 struct extent_buffer *c; 5606 struct extent_buffer *next; 5607 struct btrfs_key key; 5608 u32 nritems; 5609 int ret; 5610 int old_spinning = path->leave_spinning; 5611 int next_rw_lock = 0; 5612 5613 nritems = btrfs_header_nritems(path->nodes[0]); 5614 if (nritems == 0) 5615 return 1; 5616 5617 btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1); 5618 again: 5619 level = 1; 5620 next = NULL; 5621 next_rw_lock = 0; 5622 btrfs_release_path(path); 5623 5624 path->keep_locks = 1; 5625 path->leave_spinning = 1; 5626 5627 if (time_seq) 5628 ret = btrfs_search_old_slot(root, &key, path, time_seq); 5629 else 5630 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 5631 path->keep_locks = 0;
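	/*
	 * keep_locks made the search above leave the upper levels of the
	 * path locked, which is what lets us step right to a sibling leaf
	 * below; from here on locks are taken and dropped by hand.
	 */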
5632 5633 if (ret < 0) 5634 return ret; 5635 5636 nritems = btrfs_header_nritems(path->nodes[0]); 5637 /* 5638 * by releasing the path above we dropped all our locks. A balance 5639 * could have added more items next to the key that used to be 5640 * at the very end of the block. So, check again here and 5641 * advance the path if there are now more items available. 5642 */ 5643 if (nritems > 0 && path->slots[0] < nritems - 1) { 5644 if (ret == 0) 5645 path->slots[0]++; 5646 ret = 0; 5647 goto done; 5648 } 5649 5650 while (level < BTRFS_MAX_LEVEL) { 5651 if (!path->nodes[level]) { 5652 ret = 1; 5653 goto done; 5654 } 5655 5656 slot = path->slots[level] + 1; 5657 c = path->nodes[level]; 5658 if (slot >= btrfs_header_nritems(c)) { 5659 level++; 5660 if (level == BTRFS_MAX_LEVEL) { 5661 ret = 1; 5662 goto done; 5663 } 5664 continue; 5665 } 5666 5667 if (next) { 5668 btrfs_tree_unlock_rw(next, next_rw_lock); 5669 free_extent_buffer(next); 5670 } 5671 5672 next = c; 5673 next_rw_lock = path->locks[level]; 5674 ret = read_block_for_search(NULL, root, path, &next, level, 5675 slot, &key, 0); 5676 if (ret == -EAGAIN) 5677 goto again; 5678 5679 if (ret < 0) { 5680 btrfs_release_path(path); 5681 goto done; 5682 } 5683 5684 if (!path->skip_locking) { 5685 ret = btrfs_try_tree_read_lock(next); 5686 if (!ret && time_seq) { 5687 /* 5688 * If we don't get the lock, we may be racing 5689 * with push_leaf_left, holding that lock while 5690 * itself waiting for the leaf we've currently 5691 * locked. To solve this situation, we give up 5692 * on our lock and cycle. 5693 */ 5694 free_extent_buffer(next); 5695 btrfs_release_path(path); 5696 cond_resched(); 5697 goto again; 5698 } 5699 if (!ret) { 5700 btrfs_set_path_blocking(path); 5701 btrfs_tree_read_lock(next); 5702 btrfs_clear_path_blocking(path, next, 5703 BTRFS_READ_LOCK); 5704 } 5705 next_rw_lock = BTRFS_READ_LOCK; 5706 } 5707 break; 5708 } 5709 path->slots[level] = slot; 5710 while (1) { 5711 level--; 5712 c = path->nodes[level]; 5713 if (path->locks[level]) 5714 btrfs_tree_unlock_rw(c, path->locks[level]); 5715 5716 free_extent_buffer(c); 5717 path->nodes[level] = next; 5718 path->slots[level] = 0; 5719 if (!path->skip_locking) 5720 path->locks[level] = next_rw_lock; 5721 if (!level) 5722 break; 5723 5724 ret = read_block_for_search(NULL, root, path, &next, level, 5725 0, &key, 0); 5726 if (ret == -EAGAIN) 5727 goto again; 5728 5729 if (ret < 0) { 5730 btrfs_release_path(path); 5731 goto done; 5732 } 5733 5734 if (!path->skip_locking) { 5735 ret = btrfs_try_tree_read_lock(next); 5736 if (!ret) { 5737 btrfs_set_path_blocking(path); 5738 btrfs_tree_read_lock(next); 5739 btrfs_clear_path_blocking(path, next, 5740 BTRFS_READ_LOCK); 5741 } 5742 next_rw_lock = BTRFS_READ_LOCK; 5743 } 5744 } 5745 ret = 0; 5746 done: 5747 unlock_up(path, 0, 1, 0, NULL); 5748 path->leave_spinning = old_spinning; 5749 if (!old_spinning) 5750 btrfs_set_path_blocking(path); 5751 5752 return ret; 5753 } 5754 5755 /* 5756 * this uses btrfs_prev_leaf to walk backwards in the tree, and keeps 5757 * searching until it gets past min_objectid or finds an item of 'type' 5758 * 5759 * returns 0 if something is found, 1 if nothing was found and < 0 on error 5760 */ 5761 int btrfs_previous_item(struct btrfs_root *root, 5762 struct btrfs_path *path, u64 min_objectid, 5763 int type) 5764 { 5765 struct btrfs_key found_key; 5766 struct extent_buffer *leaf; 5767 u32 nritems; 5768 int ret; 5769 5770 while (1) { 5771 if (path->slots[0] == 0) { 5772 btrfs_set_path_blocking(path); 5773 ret = 
btrfs_prev_leaf(root, path); 5774 if (ret != 0) 5775 return ret; 5776 } else { 5777 path->slots[0]--; 5778 } 5779 leaf = path->nodes[0]; 5780 nritems = btrfs_header_nritems(leaf); 5781 if (nritems == 0) 5782 return 1; 5783 if (path->slots[0] == nritems) 5784 path->slots[0]--; 5785 5786 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 5787 if (found_key.objectid < min_objectid) 5788 break; 5789 if (found_key.type == type) 5790 return 0; 5791 if (found_key.objectid == min_objectid && 5792 found_key.type < type) 5793 break; 5794 } 5795 return 1; 5796 } 5797
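/*
 * Usage sketch for btrfs_previous_item (hypothetical caller, not part of
 * this file): position 'path' somewhere in 'root' with a search, then walk
 * back to the closest preceding inode item whose objectid is still >= ino:
 *
 *	ret = btrfs_previous_item(root, path, ino, BTRFS_INODE_ITEM_KEY);
 *	if (ret == 0)
 *		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
 *				      path->slots[0]);
 */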