1 /* 2 * Copyright (C) 2007 Oracle. All rights reserved. 3 * 4 * This program is free software; you can redistribute it and/or 5 * modify it under the terms of the GNU General Public 6 * License v2 as published by the Free Software Foundation. 7 * 8 * This program is distributed in the hope that it will be useful, 9 * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 11 * General Public License for more details. 12 * 13 * You should have received a copy of the GNU General Public 14 * License along with this program; if not, write to the 15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330, 16 * Boston, MA 021110-1307, USA. 17 */ 18 #include <linux/sched.h> 19 #include <linux/pagemap.h> 20 #include <linux/writeback.h> 21 #include <linux/blkdev.h> 22 #include <linux/sort.h> 23 #include <linux/rcupdate.h> 24 #include <linux/kthread.h> 25 #include "compat.h" 26 #include "hash.h" 27 #include "ctree.h" 28 #include "disk-io.h" 29 #include "print-tree.h" 30 #include "transaction.h" 31 #include "volumes.h" 32 #include "locking.h" 33 #include "free-space-cache.h" 34 35 static int update_block_group(struct btrfs_trans_handle *trans, 36 struct btrfs_root *root, 37 u64 bytenr, u64 num_bytes, int alloc, 38 int mark_free); 39 static int update_reserved_extents(struct btrfs_block_group_cache *cache, 40 u64 num_bytes, int reserve); 41 static int __btrfs_free_extent(struct btrfs_trans_handle *trans, 42 struct btrfs_root *root, 43 u64 bytenr, u64 num_bytes, u64 parent, 44 u64 root_objectid, u64 owner_objectid, 45 u64 owner_offset, int refs_to_drop, 46 struct btrfs_delayed_extent_op *extra_op); 47 static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op, 48 struct extent_buffer *leaf, 49 struct btrfs_extent_item *ei); 50 static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, 51 struct btrfs_root *root, 52 u64 parent, u64 root_objectid, 53 u64 flags, u64 owner, u64 offset, 54 struct btrfs_key *ins, int ref_mod); 55 static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, 56 struct btrfs_root *root, 57 u64 parent, u64 root_objectid, 58 u64 flags, struct btrfs_disk_key *key, 59 int level, struct btrfs_key *ins); 60 static int do_chunk_alloc(struct btrfs_trans_handle *trans, 61 struct btrfs_root *extent_root, u64 alloc_bytes, 62 u64 flags, int force); 63 static int pin_down_bytes(struct btrfs_trans_handle *trans, 64 struct btrfs_root *root, 65 struct btrfs_path *path, 66 u64 bytenr, u64 num_bytes, 67 int is_data, int reserved, 68 struct extent_buffer **must_clean); 69 static int find_next_key(struct btrfs_path *path, int level, 70 struct btrfs_key *key); 71 static void dump_space_info(struct btrfs_space_info *info, u64 bytes, 72 int dump_block_groups); 73 74 static noinline int 75 block_group_cache_done(struct btrfs_block_group_cache *cache) 76 { 77 smp_mb(); 78 return cache->cached == BTRFS_CACHE_FINISHED; 79 } 80 81 static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits) 82 { 83 return (cache->flags & bits) == bits; 84 } 85 86 /* 87 * this adds the block group to the fs_info rb tree for the block group 88 * cache 89 */ 90 static int btrfs_add_block_group_cache(struct btrfs_fs_info *info, 91 struct btrfs_block_group_cache *block_group) 92 { 93 struct rb_node **p; 94 struct rb_node *parent = NULL; 95 struct btrfs_block_group_cache *cache; 96 97 spin_lock(&info->block_group_cache_lock); 98 p = &info->block_group_cache_tree.rb_node; 99 100 while (*p) 
{ 101 parent = *p; 102 cache = rb_entry(parent, struct btrfs_block_group_cache, 103 cache_node); 104 if (block_group->key.objectid < cache->key.objectid) { 105 p = &(*p)->rb_left; 106 } else if (block_group->key.objectid > cache->key.objectid) { 107 p = &(*p)->rb_right; 108 } else { 109 spin_unlock(&info->block_group_cache_lock); 110 return -EEXIST; 111 } 112 } 113 114 rb_link_node(&block_group->cache_node, parent, p); 115 rb_insert_color(&block_group->cache_node, 116 &info->block_group_cache_tree); 117 spin_unlock(&info->block_group_cache_lock); 118 119 return 0; 120 } 121 122 /* 123 * This will return the block group at or after bytenr if contains is 0, else 124 * it will return the block group that contains the bytenr 125 */ 126 static struct btrfs_block_group_cache * 127 block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr, 128 int contains) 129 { 130 struct btrfs_block_group_cache *cache, *ret = NULL; 131 struct rb_node *n; 132 u64 end, start; 133 134 spin_lock(&info->block_group_cache_lock); 135 n = info->block_group_cache_tree.rb_node; 136 137 while (n) { 138 cache = rb_entry(n, struct btrfs_block_group_cache, 139 cache_node); 140 end = cache->key.objectid + cache->key.offset - 1; 141 start = cache->key.objectid; 142 143 if (bytenr < start) { 144 if (!contains && (!ret || start < ret->key.objectid)) 145 ret = cache; 146 n = n->rb_left; 147 } else if (bytenr > start) { 148 if (contains && bytenr <= end) { 149 ret = cache; 150 break; 151 } 152 n = n->rb_right; 153 } else { 154 ret = cache; 155 break; 156 } 157 } 158 if (ret) 159 atomic_inc(&ret->count); 160 spin_unlock(&info->block_group_cache_lock); 161 162 return ret; 163 } 164 165 static int add_excluded_extent(struct btrfs_root *root, 166 u64 start, u64 num_bytes) 167 { 168 u64 end = start + num_bytes - 1; 169 set_extent_bits(&root->fs_info->freed_extents[0], 170 start, end, EXTENT_UPTODATE, GFP_NOFS); 171 set_extent_bits(&root->fs_info->freed_extents[1], 172 start, end, EXTENT_UPTODATE, GFP_NOFS); 173 return 0; 174 } 175 176 static void free_excluded_extents(struct btrfs_root *root, 177 struct btrfs_block_group_cache *cache) 178 { 179 u64 start, end; 180 181 start = cache->key.objectid; 182 end = start + cache->key.offset - 1; 183 184 clear_extent_bits(&root->fs_info->freed_extents[0], 185 start, end, EXTENT_UPTODATE, GFP_NOFS); 186 clear_extent_bits(&root->fs_info->freed_extents[1], 187 start, end, EXTENT_UPTODATE, GFP_NOFS); 188 } 189 190 static int exclude_super_stripes(struct btrfs_root *root, 191 struct btrfs_block_group_cache *cache) 192 { 193 u64 bytenr; 194 u64 *logical; 195 int stripe_len; 196 int i, nr, ret; 197 198 for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { 199 bytenr = btrfs_sb_offset(i); 200 ret = btrfs_rmap_block(&root->fs_info->mapping_tree, 201 cache->key.objectid, bytenr, 202 0, &logical, &nr, &stripe_len); 203 BUG_ON(ret); 204 205 while (nr--) { 206 cache->bytes_super += stripe_len; 207 ret = add_excluded_extent(root, logical[nr], 208 stripe_len); 209 BUG_ON(ret); 210 } 211 212 kfree(logical); 213 } 214 return 0; 215 } 216 217 static struct btrfs_caching_control * 218 get_caching_control(struct btrfs_block_group_cache *cache) 219 { 220 struct btrfs_caching_control *ctl; 221 222 spin_lock(&cache->lock); 223 if (cache->cached != BTRFS_CACHE_STARTED) { 224 spin_unlock(&cache->lock); 225 return NULL; 226 } 227 228 ctl = cache->caching_ctl; 229 atomic_inc(&ctl->count); 230 spin_unlock(&cache->lock); 231 return ctl; 232 } 233 234 static void put_caching_control(struct btrfs_caching_control *ctl) 
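/* drop a reference on a caching control; it is freed once the last holder lets go */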
235 { 236 if (atomic_dec_and_test(&ctl->count)) 237 kfree(ctl); 238 } 239 240 /* 241 * this is only called by cache_block_group, since we could have freed extents 242 * we need to check the pinned_extents for any extents that can't be used yet 243 * since their free space will be released as soon as the transaction commits. 244 */ 245 static u64 add_new_free_space(struct btrfs_block_group_cache *block_group, 246 struct btrfs_fs_info *info, u64 start, u64 end) 247 { 248 u64 extent_start, extent_end, size, total_added = 0; 249 int ret; 250 251 while (start < end) { 252 ret = find_first_extent_bit(info->pinned_extents, start, 253 &extent_start, &extent_end, 254 EXTENT_DIRTY | EXTENT_UPTODATE); 255 if (ret) 256 break; 257 258 if (extent_start == start) { 259 start = extent_end + 1; 260 } else if (extent_start > start && extent_start < end) { 261 size = extent_start - start; 262 total_added += size; 263 ret = btrfs_add_free_space(block_group, start, 264 size); 265 BUG_ON(ret); 266 start = extent_end + 1; 267 } else { 268 break; 269 } 270 } 271 272 if (start < end) { 273 size = end - start; 274 total_added += size; 275 ret = btrfs_add_free_space(block_group, start, size); 276 BUG_ON(ret); 277 } 278 279 return total_added; 280 } 281 282 static int caching_kthread(void *data) 283 { 284 struct btrfs_block_group_cache *block_group = data; 285 struct btrfs_fs_info *fs_info = block_group->fs_info; 286 struct btrfs_caching_control *caching_ctl = block_group->caching_ctl; 287 struct btrfs_root *extent_root = fs_info->extent_root; 288 struct btrfs_path *path; 289 struct extent_buffer *leaf; 290 struct btrfs_key key; 291 u64 total_found = 0; 292 u64 last = 0; 293 u32 nritems; 294 int ret = 0; 295 296 path = btrfs_alloc_path(); 297 if (!path) 298 return -ENOMEM; 299 300 exclude_super_stripes(extent_root, block_group); 301 spin_lock(&block_group->space_info->lock); 302 block_group->space_info->bytes_super += block_group->bytes_super; 303 spin_unlock(&block_group->space_info->lock); 304 305 last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET); 306 307 /* 308 * We don't want to deadlock with somebody trying to allocate a new 309 * extent for the extent root while also trying to search the extent 310 * root to add free space. 
So we skip locking and search the commit 311 * root, since its read-only 312 */ 313 path->skip_locking = 1; 314 path->search_commit_root = 1; 315 path->reada = 2; 316 317 key.objectid = last; 318 key.offset = 0; 319 key.type = BTRFS_EXTENT_ITEM_KEY; 320 again: 321 mutex_lock(&caching_ctl->mutex); 322 /* need to make sure the commit_root doesn't disappear */ 323 down_read(&fs_info->extent_commit_sem); 324 325 ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); 326 if (ret < 0) 327 goto err; 328 329 leaf = path->nodes[0]; 330 nritems = btrfs_header_nritems(leaf); 331 332 while (1) { 333 smp_mb(); 334 if (fs_info->closing > 1) { 335 last = (u64)-1; 336 break; 337 } 338 339 if (path->slots[0] < nritems) { 340 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 341 } else { 342 ret = find_next_key(path, 0, &key); 343 if (ret) 344 break; 345 346 caching_ctl->progress = last; 347 btrfs_release_path(extent_root, path); 348 up_read(&fs_info->extent_commit_sem); 349 mutex_unlock(&caching_ctl->mutex); 350 if (btrfs_transaction_in_commit(fs_info)) 351 schedule_timeout(1); 352 else 353 cond_resched(); 354 goto again; 355 } 356 357 if (key.objectid < block_group->key.objectid) { 358 path->slots[0]++; 359 continue; 360 } 361 362 if (key.objectid >= block_group->key.objectid + 363 block_group->key.offset) 364 break; 365 366 if (key.type == BTRFS_EXTENT_ITEM_KEY) { 367 total_found += add_new_free_space(block_group, 368 fs_info, last, 369 key.objectid); 370 last = key.objectid + key.offset; 371 372 if (total_found > (1024 * 1024 * 2)) { 373 total_found = 0; 374 wake_up(&caching_ctl->wait); 375 } 376 } 377 path->slots[0]++; 378 } 379 ret = 0; 380 381 total_found += add_new_free_space(block_group, fs_info, last, 382 block_group->key.objectid + 383 block_group->key.offset); 384 caching_ctl->progress = (u64)-1; 385 386 spin_lock(&block_group->lock); 387 block_group->caching_ctl = NULL; 388 block_group->cached = BTRFS_CACHE_FINISHED; 389 spin_unlock(&block_group->lock); 390 391 err: 392 btrfs_free_path(path); 393 up_read(&fs_info->extent_commit_sem); 394 395 free_excluded_extents(extent_root, block_group); 396 397 mutex_unlock(&caching_ctl->mutex); 398 wake_up(&caching_ctl->wait); 399 400 put_caching_control(caching_ctl); 401 atomic_dec(&block_group->space_info->caching_threads); 402 return 0; 403 } 404 405 static int cache_block_group(struct btrfs_block_group_cache *cache) 406 { 407 struct btrfs_fs_info *fs_info = cache->fs_info; 408 struct btrfs_caching_control *caching_ctl; 409 struct task_struct *tsk; 410 int ret = 0; 411 412 smp_mb(); 413 if (cache->cached != BTRFS_CACHE_NO) 414 return 0; 415 416 caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_KERNEL); 417 BUG_ON(!caching_ctl); 418 419 INIT_LIST_HEAD(&caching_ctl->list); 420 mutex_init(&caching_ctl->mutex); 421 init_waitqueue_head(&caching_ctl->wait); 422 caching_ctl->block_group = cache; 423 caching_ctl->progress = cache->key.objectid; 424 /* one for caching kthread, one for caching block group list */ 425 atomic_set(&caching_ctl->count, 2); 426 427 spin_lock(&cache->lock); 428 if (cache->cached != BTRFS_CACHE_NO) { 429 spin_unlock(&cache->lock); 430 kfree(caching_ctl); 431 return 0; 432 } 433 cache->caching_ctl = caching_ctl; 434 cache->cached = BTRFS_CACHE_STARTED; 435 spin_unlock(&cache->lock); 436 437 down_write(&fs_info->extent_commit_sem); 438 list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups); 439 up_write(&fs_info->extent_commit_sem); 440 441 atomic_inc(&cache->space_info->caching_threads); 442 443 tsk = 
kthread_run(caching_kthread, cache, "btrfs-cache-%llu\n", 444 cache->key.objectid); 445 if (IS_ERR(tsk)) { 446 ret = PTR_ERR(tsk); 447 printk(KERN_ERR "error running thread %d\n", ret); 448 BUG(); 449 } 450 451 return ret; 452 } 453 454 /* 455 * return the block group that starts at or after bytenr 456 */ 457 static struct btrfs_block_group_cache * 458 btrfs_lookup_first_block_group(struct btrfs_fs_info *info, u64 bytenr) 459 { 460 struct btrfs_block_group_cache *cache; 461 462 cache = block_group_cache_tree_search(info, bytenr, 0); 463 464 return cache; 465 } 466 467 /* 468 * return the block group that contains the given bytenr 469 */ 470 struct btrfs_block_group_cache *btrfs_lookup_block_group( 471 struct btrfs_fs_info *info, 472 u64 bytenr) 473 { 474 struct btrfs_block_group_cache *cache; 475 476 cache = block_group_cache_tree_search(info, bytenr, 1); 477 478 return cache; 479 } 480 481 void btrfs_put_block_group(struct btrfs_block_group_cache *cache) 482 { 483 if (atomic_dec_and_test(&cache->count)) 484 kfree(cache); 485 } 486 487 static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info, 488 u64 flags) 489 { 490 struct list_head *head = &info->space_info; 491 struct btrfs_space_info *found; 492 493 rcu_read_lock(); 494 list_for_each_entry_rcu(found, head, list) { 495 if (found->flags == flags) { 496 rcu_read_unlock(); 497 return found; 498 } 499 } 500 rcu_read_unlock(); 501 return NULL; 502 } 503 504 /* 505 * after adding space to the filesystem, we need to clear the full flags 506 * on all the space infos. 507 */ 508 void btrfs_clear_space_info_full(struct btrfs_fs_info *info) 509 { 510 struct list_head *head = &info->space_info; 511 struct btrfs_space_info *found; 512 513 rcu_read_lock(); 514 list_for_each_entry_rcu(found, head, list) 515 found->full = 0; 516 rcu_read_unlock(); 517 } 518 519 static u64 div_factor(u64 num, int factor) 520 { 521 if (factor == 10) 522 return num; 523 num *= factor; 524 do_div(num, 10); 525 return num; 526 } 527 528 u64 btrfs_find_block_group(struct btrfs_root *root, 529 u64 search_start, u64 search_hint, int owner) 530 { 531 struct btrfs_block_group_cache *cache; 532 u64 used; 533 u64 last = max(search_hint, search_start); 534 u64 group_start = 0; 535 int full_search = 0; 536 int factor = 9; 537 int wrapped = 0; 538 again: 539 while (1) { 540 cache = btrfs_lookup_first_block_group(root->fs_info, last); 541 if (!cache) 542 break; 543 544 spin_lock(&cache->lock); 545 last = cache->key.objectid + cache->key.offset; 546 used = btrfs_block_group_used(&cache->item); 547 548 if ((full_search || !cache->ro) && 549 block_group_bits(cache, BTRFS_BLOCK_GROUP_METADATA)) { 550 if (used + cache->pinned + cache->reserved < 551 div_factor(cache->key.offset, factor)) { 552 group_start = cache->key.objectid; 553 spin_unlock(&cache->lock); 554 btrfs_put_block_group(cache); 555 goto found; 556 } 557 } 558 spin_unlock(&cache->lock); 559 btrfs_put_block_group(cache); 560 cond_resched(); 561 } 562 if (!wrapped) { 563 last = search_start; 564 wrapped = 1; 565 goto again; 566 } 567 if (!full_search && factor < 10) { 568 last = search_start; 569 full_search = 1; 570 factor = 10; 571 goto again; 572 } 573 found: 574 return group_start; 575 } 576 577 /* simple helper to search for an existing extent at a given offset */ 578 int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len) 579 { 580 int ret; 581 struct btrfs_key key; 582 struct btrfs_path *path; 583 584 path = btrfs_alloc_path(); 585 BUG_ON(!path); 586 key.objectid = start; 587 key.offset = 
len;
	btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
	ret = btrfs_search_slot(NULL, root->fs_info->extent_root, &key, path,
				0, 0);
	btrfs_free_path(path);
	return ret;
}

/*
 * Back reference rules.  Back refs have three main goals:
 *
 * 1) differentiate between all holders of references to an extent so that
 *    when a reference is dropped we can make sure it was a valid reference
 *    before freeing the extent.
 *
 * 2) Provide enough information to quickly find the holders of an extent
 *    if we notice a given block is corrupted or bad.
 *
 * 3) Make it easy to migrate blocks for FS shrinking or storage pool
 *    maintenance.  This is actually the same as #2, but with a slightly
 *    different use case.
 *
 * There are two kinds of back refs.  Implicit back refs are optimized
 * for pointers in non-shared tree blocks.  For a given pointer in a block,
 * back refs of this kind provide information about the block's owner tree
 * and the pointer's key.  This information allows us to find the block by
 * a b-tree search.  Full back refs are for pointers in tree blocks not
 * referenced by their owner trees.  The location of the tree block is
 * recorded in the back ref.  The full back ref is actually generic, and
 * can be used in all cases where the implicit back ref is used.  The major
 * shortcoming of the full back ref is its overhead: every time a tree
 * block gets COWed, we have to update the back ref entries for all
 * pointers in it.
 *
 * For a newly allocated tree block, we use implicit back refs for
 * pointers in it.  This means most tree related operations only involve
 * implicit back refs.  For a tree block created in an old transaction, the
 * only way to drop a reference to it is to COW it.  So we can detect the
 * event that a tree block loses its owner tree's reference and do the
 * back refs conversion.
 *
 * When a tree block is COWed through a tree, there are four cases:
 *
 * The reference count of the block is one and the tree is the block's
 * owner tree.  Nothing to do in this case.
 *
 * The reference count of the block is one and the tree is not the
 * block's owner tree.  In this case, full back refs are used for pointers
 * in the block.  Remove these full back refs, add implicit back refs for
 * every pointer in the new block.
 *
 * The reference count of the block is greater than one and the tree is
 * the block's owner tree.  In this case, implicit back refs are used for
 * pointers in the block.  Add full back refs for every pointer in the
 * block, increase lower level extents' reference counts.  The original
 * implicit back refs are carried over to the new block.
 *
 * The reference count of the block is greater than one and the tree is
 * not the block's owner tree.  Add implicit back refs for every pointer in
 * the new block, increase lower level extents' reference count.
 *
 * Back Reference Key composing:
 *
 * The key objectid corresponds to the first byte in the extent,
 * the key type is used to differentiate between types of back refs.
 * There are different meanings of the key offset for different types
 * of back refs.
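 *
 * As a purely illustrative example (the numbers are made up, not taken
 * from a real filesystem): a data extent at bytenr X referenced by inode
 * 257 at file offset 0 in subvolume 5 gets an implicit back ref keyed at
 *
 *     (X, BTRFS_EXTENT_DATA_REF_KEY, hash(5, 257, 0))
 *
 * while the same extent referenced through a shared (e.g. relocated) leaf
 * at bytenr P gets a full back ref keyed at
 *
 *     (X, BTRFS_SHARED_DATA_REF_KEY, P)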
 *
 * File extents can be referenced by:
 *
 * - multiple snapshots, subvolumes, or different generations in one subvol
 * - different files inside a single subvolume
 * - different offsets inside a file (bookend extents in file.c)
 *
 * The extent ref structure for the implicit back refs has fields for:
 *
 * - objectid of the subvolume root
 * - objectid of the file holding the reference
 * - original offset in the file
 * - how many bookend extents
 *
 * The key offset for the implicit back refs is the hash of the first
 * three fields.
 *
 * The extent ref structure for the full back refs has a field for:
 *
 * - number of pointers in the tree leaf
 *
 * The key offset for the full back refs is the first byte of
 * the tree leaf.
 *
 * When a file extent is allocated, the implicit back refs are used.
 * The fields are filled in:
 *
 *     (root_key.objectid, inode objectid, offset in file, 1)
 *
 * When a file extent is removed during file truncation, we find the
 * corresponding implicit back refs and check the following fields:
 *
 *     (btrfs_header_owner(leaf), inode objectid, offset in file)
 *
 * Btree extents can be referenced by:
 *
 * - different subvolumes
 *
 * Both the implicit back refs and the full back refs for tree blocks
 * only consist of a key.  The key offset for the implicit back refs is
 * the objectid of the block's owner tree.  The key offset for the full
 * back refs is the first byte of the parent block.
 *
 * When implicit back refs are used, information about the lowest key and
 * level of the tree block are required.  This information is stored in
 * the tree block info structure.
 */

#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
static int convert_extent_item_v0(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root,
				  struct btrfs_path *path,
				  u64 owner, u32 extra_size)
{
	struct btrfs_extent_item *item;
	struct btrfs_extent_item_v0 *ei0;
	struct btrfs_extent_ref_v0 *ref0;
	struct btrfs_tree_block_info *bi;
	struct extent_buffer *leaf;
	struct btrfs_key key;
	struct btrfs_key found_key;
	u32 new_size = sizeof(*item);
	u64 refs;
	int ret;

	leaf = path->nodes[0];
	BUG_ON(btrfs_item_size_nr(leaf, path->slots[0]) != sizeof(*ei0));

	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
	ei0 = btrfs_item_ptr(leaf, path->slots[0],
			     struct btrfs_extent_item_v0);
	refs = btrfs_extent_refs_v0(leaf, ei0);

	if (owner == (u64)-1) {
		while (1) {
			if (path->slots[0] >= btrfs_header_nritems(leaf)) {
				ret = btrfs_next_leaf(root, path);
				if (ret < 0)
					return ret;
				BUG_ON(ret > 0);
				leaf = path->nodes[0];
			}
			btrfs_item_key_to_cpu(leaf, &found_key,
					      path->slots[0]);
			BUG_ON(key.objectid != found_key.objectid);
			if (found_key.type != BTRFS_EXTENT_REF_V0_KEY) {
				path->slots[0]++;
				continue;
			}
			ref0 = btrfs_item_ptr(leaf, path->slots[0],
					      struct btrfs_extent_ref_v0);
			owner = btrfs_ref_objectid_v0(leaf, ref0);
			break;
		}
	}
	btrfs_release_path(root, path);

	if (owner < BTRFS_FIRST_FREE_OBJECTID)
		new_size += sizeof(*bi);

	new_size -= sizeof(*ei0);
	ret = btrfs_search_slot(trans, root, &key, path,
				new_size + extra_size, 1);
	if (ret < 0)
		return ret;
	BUG_ON(ret);

	ret = btrfs_extend_item(trans, root, path, new_size);
	BUG_ON(ret);

	leaf = path->nodes[0];
	item = btrfs_item_ptr(leaf,
path->slots[0], struct btrfs_extent_item); 765 btrfs_set_extent_refs(leaf, item, refs); 766 /* FIXME: get real generation */ 767 btrfs_set_extent_generation(leaf, item, 0); 768 if (owner < BTRFS_FIRST_FREE_OBJECTID) { 769 btrfs_set_extent_flags(leaf, item, 770 BTRFS_EXTENT_FLAG_TREE_BLOCK | 771 BTRFS_BLOCK_FLAG_FULL_BACKREF); 772 bi = (struct btrfs_tree_block_info *)(item + 1); 773 /* FIXME: get first key of the block */ 774 memset_extent_buffer(leaf, 0, (unsigned long)bi, sizeof(*bi)); 775 btrfs_set_tree_block_level(leaf, bi, (int)owner); 776 } else { 777 btrfs_set_extent_flags(leaf, item, BTRFS_EXTENT_FLAG_DATA); 778 } 779 btrfs_mark_buffer_dirty(leaf); 780 return 0; 781 } 782 #endif 783 784 static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset) 785 { 786 u32 high_crc = ~(u32)0; 787 u32 low_crc = ~(u32)0; 788 __le64 lenum; 789 790 lenum = cpu_to_le64(root_objectid); 791 high_crc = crc32c(high_crc, &lenum, sizeof(lenum)); 792 lenum = cpu_to_le64(owner); 793 low_crc = crc32c(low_crc, &lenum, sizeof(lenum)); 794 lenum = cpu_to_le64(offset); 795 low_crc = crc32c(low_crc, &lenum, sizeof(lenum)); 796 797 return ((u64)high_crc << 31) ^ (u64)low_crc; 798 } 799 800 static u64 hash_extent_data_ref_item(struct extent_buffer *leaf, 801 struct btrfs_extent_data_ref *ref) 802 { 803 return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref), 804 btrfs_extent_data_ref_objectid(leaf, ref), 805 btrfs_extent_data_ref_offset(leaf, ref)); 806 } 807 808 static int match_extent_data_ref(struct extent_buffer *leaf, 809 struct btrfs_extent_data_ref *ref, 810 u64 root_objectid, u64 owner, u64 offset) 811 { 812 if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid || 813 btrfs_extent_data_ref_objectid(leaf, ref) != owner || 814 btrfs_extent_data_ref_offset(leaf, ref) != offset) 815 return 0; 816 return 1; 817 } 818 819 static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans, 820 struct btrfs_root *root, 821 struct btrfs_path *path, 822 u64 bytenr, u64 parent, 823 u64 root_objectid, 824 u64 owner, u64 offset) 825 { 826 struct btrfs_key key; 827 struct btrfs_extent_data_ref *ref; 828 struct extent_buffer *leaf; 829 u32 nritems; 830 int ret; 831 int recow; 832 int err = -ENOENT; 833 834 key.objectid = bytenr; 835 if (parent) { 836 key.type = BTRFS_SHARED_DATA_REF_KEY; 837 key.offset = parent; 838 } else { 839 key.type = BTRFS_EXTENT_DATA_REF_KEY; 840 key.offset = hash_extent_data_ref(root_objectid, 841 owner, offset); 842 } 843 again: 844 recow = 0; 845 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 846 if (ret < 0) { 847 err = ret; 848 goto fail; 849 } 850 851 if (parent) { 852 if (!ret) 853 return 0; 854 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 855 key.type = BTRFS_EXTENT_REF_V0_KEY; 856 btrfs_release_path(root, path); 857 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 858 if (ret < 0) { 859 err = ret; 860 goto fail; 861 } 862 if (!ret) 863 return 0; 864 #endif 865 goto fail; 866 } 867 868 leaf = path->nodes[0]; 869 nritems = btrfs_header_nritems(leaf); 870 while (1) { 871 if (path->slots[0] >= nritems) { 872 ret = btrfs_next_leaf(root, path); 873 if (ret < 0) 874 err = ret; 875 if (ret) 876 goto fail; 877 878 leaf = path->nodes[0]; 879 nritems = btrfs_header_nritems(leaf); 880 recow = 1; 881 } 882 883 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 884 if (key.objectid != bytenr || 885 key.type != BTRFS_EXTENT_DATA_REF_KEY) 886 goto fail; 887 888 ref = btrfs_item_ptr(leaf, path->slots[0], 889 struct btrfs_extent_data_ref); 890 891 if 
(match_extent_data_ref(leaf, ref, root_objectid, 892 owner, offset)) { 893 if (recow) { 894 btrfs_release_path(root, path); 895 goto again; 896 } 897 err = 0; 898 break; 899 } 900 path->slots[0]++; 901 } 902 fail: 903 return err; 904 } 905 906 static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans, 907 struct btrfs_root *root, 908 struct btrfs_path *path, 909 u64 bytenr, u64 parent, 910 u64 root_objectid, u64 owner, 911 u64 offset, int refs_to_add) 912 { 913 struct btrfs_key key; 914 struct extent_buffer *leaf; 915 u32 size; 916 u32 num_refs; 917 int ret; 918 919 key.objectid = bytenr; 920 if (parent) { 921 key.type = BTRFS_SHARED_DATA_REF_KEY; 922 key.offset = parent; 923 size = sizeof(struct btrfs_shared_data_ref); 924 } else { 925 key.type = BTRFS_EXTENT_DATA_REF_KEY; 926 key.offset = hash_extent_data_ref(root_objectid, 927 owner, offset); 928 size = sizeof(struct btrfs_extent_data_ref); 929 } 930 931 ret = btrfs_insert_empty_item(trans, root, path, &key, size); 932 if (ret && ret != -EEXIST) 933 goto fail; 934 935 leaf = path->nodes[0]; 936 if (parent) { 937 struct btrfs_shared_data_ref *ref; 938 ref = btrfs_item_ptr(leaf, path->slots[0], 939 struct btrfs_shared_data_ref); 940 if (ret == 0) { 941 btrfs_set_shared_data_ref_count(leaf, ref, refs_to_add); 942 } else { 943 num_refs = btrfs_shared_data_ref_count(leaf, ref); 944 num_refs += refs_to_add; 945 btrfs_set_shared_data_ref_count(leaf, ref, num_refs); 946 } 947 } else { 948 struct btrfs_extent_data_ref *ref; 949 while (ret == -EEXIST) { 950 ref = btrfs_item_ptr(leaf, path->slots[0], 951 struct btrfs_extent_data_ref); 952 if (match_extent_data_ref(leaf, ref, root_objectid, 953 owner, offset)) 954 break; 955 btrfs_release_path(root, path); 956 key.offset++; 957 ret = btrfs_insert_empty_item(trans, root, path, &key, 958 size); 959 if (ret && ret != -EEXIST) 960 goto fail; 961 962 leaf = path->nodes[0]; 963 } 964 ref = btrfs_item_ptr(leaf, path->slots[0], 965 struct btrfs_extent_data_ref); 966 if (ret == 0) { 967 btrfs_set_extent_data_ref_root(leaf, ref, 968 root_objectid); 969 btrfs_set_extent_data_ref_objectid(leaf, ref, owner); 970 btrfs_set_extent_data_ref_offset(leaf, ref, offset); 971 btrfs_set_extent_data_ref_count(leaf, ref, refs_to_add); 972 } else { 973 num_refs = btrfs_extent_data_ref_count(leaf, ref); 974 num_refs += refs_to_add; 975 btrfs_set_extent_data_ref_count(leaf, ref, num_refs); 976 } 977 } 978 btrfs_mark_buffer_dirty(leaf); 979 ret = 0; 980 fail: 981 btrfs_release_path(root, path); 982 return ret; 983 } 984 985 static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans, 986 struct btrfs_root *root, 987 struct btrfs_path *path, 988 int refs_to_drop) 989 { 990 struct btrfs_key key; 991 struct btrfs_extent_data_ref *ref1 = NULL; 992 struct btrfs_shared_data_ref *ref2 = NULL; 993 struct extent_buffer *leaf; 994 u32 num_refs = 0; 995 int ret = 0; 996 997 leaf = path->nodes[0]; 998 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 999 1000 if (key.type == BTRFS_EXTENT_DATA_REF_KEY) { 1001 ref1 = btrfs_item_ptr(leaf, path->slots[0], 1002 struct btrfs_extent_data_ref); 1003 num_refs = btrfs_extent_data_ref_count(leaf, ref1); 1004 } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) { 1005 ref2 = btrfs_item_ptr(leaf, path->slots[0], 1006 struct btrfs_shared_data_ref); 1007 num_refs = btrfs_shared_data_ref_count(leaf, ref2); 1008 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 1009 } else if (key.type == BTRFS_EXTENT_REF_V0_KEY) { 1010 struct btrfs_extent_ref_v0 *ref0; 1011 ref0 = btrfs_item_ptr(leaf, 
path->slots[0], 1012 struct btrfs_extent_ref_v0); 1013 num_refs = btrfs_ref_count_v0(leaf, ref0); 1014 #endif 1015 } else { 1016 BUG(); 1017 } 1018 1019 BUG_ON(num_refs < refs_to_drop); 1020 num_refs -= refs_to_drop; 1021 1022 if (num_refs == 0) { 1023 ret = btrfs_del_item(trans, root, path); 1024 } else { 1025 if (key.type == BTRFS_EXTENT_DATA_REF_KEY) 1026 btrfs_set_extent_data_ref_count(leaf, ref1, num_refs); 1027 else if (key.type == BTRFS_SHARED_DATA_REF_KEY) 1028 btrfs_set_shared_data_ref_count(leaf, ref2, num_refs); 1029 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 1030 else { 1031 struct btrfs_extent_ref_v0 *ref0; 1032 ref0 = btrfs_item_ptr(leaf, path->slots[0], 1033 struct btrfs_extent_ref_v0); 1034 btrfs_set_ref_count_v0(leaf, ref0, num_refs); 1035 } 1036 #endif 1037 btrfs_mark_buffer_dirty(leaf); 1038 } 1039 return ret; 1040 } 1041 1042 static noinline u32 extent_data_ref_count(struct btrfs_root *root, 1043 struct btrfs_path *path, 1044 struct btrfs_extent_inline_ref *iref) 1045 { 1046 struct btrfs_key key; 1047 struct extent_buffer *leaf; 1048 struct btrfs_extent_data_ref *ref1; 1049 struct btrfs_shared_data_ref *ref2; 1050 u32 num_refs = 0; 1051 1052 leaf = path->nodes[0]; 1053 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 1054 if (iref) { 1055 if (btrfs_extent_inline_ref_type(leaf, iref) == 1056 BTRFS_EXTENT_DATA_REF_KEY) { 1057 ref1 = (struct btrfs_extent_data_ref *)(&iref->offset); 1058 num_refs = btrfs_extent_data_ref_count(leaf, ref1); 1059 } else { 1060 ref2 = (struct btrfs_shared_data_ref *)(iref + 1); 1061 num_refs = btrfs_shared_data_ref_count(leaf, ref2); 1062 } 1063 } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) { 1064 ref1 = btrfs_item_ptr(leaf, path->slots[0], 1065 struct btrfs_extent_data_ref); 1066 num_refs = btrfs_extent_data_ref_count(leaf, ref1); 1067 } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) { 1068 ref2 = btrfs_item_ptr(leaf, path->slots[0], 1069 struct btrfs_shared_data_ref); 1070 num_refs = btrfs_shared_data_ref_count(leaf, ref2); 1071 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 1072 } else if (key.type == BTRFS_EXTENT_REF_V0_KEY) { 1073 struct btrfs_extent_ref_v0 *ref0; 1074 ref0 = btrfs_item_ptr(leaf, path->slots[0], 1075 struct btrfs_extent_ref_v0); 1076 num_refs = btrfs_ref_count_v0(leaf, ref0); 1077 #endif 1078 } else { 1079 WARN_ON(1); 1080 } 1081 return num_refs; 1082 } 1083 1084 static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans, 1085 struct btrfs_root *root, 1086 struct btrfs_path *path, 1087 u64 bytenr, u64 parent, 1088 u64 root_objectid) 1089 { 1090 struct btrfs_key key; 1091 int ret; 1092 1093 key.objectid = bytenr; 1094 if (parent) { 1095 key.type = BTRFS_SHARED_BLOCK_REF_KEY; 1096 key.offset = parent; 1097 } else { 1098 key.type = BTRFS_TREE_BLOCK_REF_KEY; 1099 key.offset = root_objectid; 1100 } 1101 1102 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 1103 if (ret > 0) 1104 ret = -ENOENT; 1105 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 1106 if (ret == -ENOENT && parent) { 1107 btrfs_release_path(root, path); 1108 key.type = BTRFS_EXTENT_REF_V0_KEY; 1109 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 1110 if (ret > 0) 1111 ret = -ENOENT; 1112 } 1113 #endif 1114 return ret; 1115 } 1116 1117 static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans, 1118 struct btrfs_root *root, 1119 struct btrfs_path *path, 1120 u64 bytenr, u64 parent, 1121 u64 root_objectid) 1122 { 1123 struct btrfs_key key; 1124 int ret; 1125 1126 key.objectid = bytenr; 1127 if (parent) { 1128 key.type = 
BTRFS_SHARED_BLOCK_REF_KEY; 1129 key.offset = parent; 1130 } else { 1131 key.type = BTRFS_TREE_BLOCK_REF_KEY; 1132 key.offset = root_objectid; 1133 } 1134 1135 ret = btrfs_insert_empty_item(trans, root, path, &key, 0); 1136 btrfs_release_path(root, path); 1137 return ret; 1138 } 1139 1140 static inline int extent_ref_type(u64 parent, u64 owner) 1141 { 1142 int type; 1143 if (owner < BTRFS_FIRST_FREE_OBJECTID) { 1144 if (parent > 0) 1145 type = BTRFS_SHARED_BLOCK_REF_KEY; 1146 else 1147 type = BTRFS_TREE_BLOCK_REF_KEY; 1148 } else { 1149 if (parent > 0) 1150 type = BTRFS_SHARED_DATA_REF_KEY; 1151 else 1152 type = BTRFS_EXTENT_DATA_REF_KEY; 1153 } 1154 return type; 1155 } 1156 1157 static int find_next_key(struct btrfs_path *path, int level, 1158 struct btrfs_key *key) 1159 1160 { 1161 for (; level < BTRFS_MAX_LEVEL; level++) { 1162 if (!path->nodes[level]) 1163 break; 1164 if (path->slots[level] + 1 >= 1165 btrfs_header_nritems(path->nodes[level])) 1166 continue; 1167 if (level == 0) 1168 btrfs_item_key_to_cpu(path->nodes[level], key, 1169 path->slots[level] + 1); 1170 else 1171 btrfs_node_key_to_cpu(path->nodes[level], key, 1172 path->slots[level] + 1); 1173 return 0; 1174 } 1175 return 1; 1176 } 1177 1178 /* 1179 * look for inline back ref. if back ref is found, *ref_ret is set 1180 * to the address of inline back ref, and 0 is returned. 1181 * 1182 * if back ref isn't found, *ref_ret is set to the address where it 1183 * should be inserted, and -ENOENT is returned. 1184 * 1185 * if insert is true and there are too many inline back refs, the path 1186 * points to the extent item, and -EAGAIN is returned. 1187 * 1188 * NOTE: inline back refs are ordered in the same way that back ref 1189 * items in the tree are ordered. 1190 */ 1191 static noinline_for_stack 1192 int lookup_inline_extent_backref(struct btrfs_trans_handle *trans, 1193 struct btrfs_root *root, 1194 struct btrfs_path *path, 1195 struct btrfs_extent_inline_ref **ref_ret, 1196 u64 bytenr, u64 num_bytes, 1197 u64 parent, u64 root_objectid, 1198 u64 owner, u64 offset, int insert) 1199 { 1200 struct btrfs_key key; 1201 struct extent_buffer *leaf; 1202 struct btrfs_extent_item *ei; 1203 struct btrfs_extent_inline_ref *iref; 1204 u64 flags; 1205 u64 item_size; 1206 unsigned long ptr; 1207 unsigned long end; 1208 int extra_size; 1209 int type; 1210 int want; 1211 int ret; 1212 int err = 0; 1213 1214 key.objectid = bytenr; 1215 key.type = BTRFS_EXTENT_ITEM_KEY; 1216 key.offset = num_bytes; 1217 1218 want = extent_ref_type(parent, owner); 1219 if (insert) { 1220 extra_size = btrfs_extent_inline_ref_size(want); 1221 path->keep_locks = 1; 1222 } else 1223 extra_size = -1; 1224 ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1); 1225 if (ret < 0) { 1226 err = ret; 1227 goto out; 1228 } 1229 BUG_ON(ret); 1230 1231 leaf = path->nodes[0]; 1232 item_size = btrfs_item_size_nr(leaf, path->slots[0]); 1233 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 1234 if (item_size < sizeof(*ei)) { 1235 if (!insert) { 1236 err = -ENOENT; 1237 goto out; 1238 } 1239 ret = convert_extent_item_v0(trans, root, path, owner, 1240 extra_size); 1241 if (ret < 0) { 1242 err = ret; 1243 goto out; 1244 } 1245 leaf = path->nodes[0]; 1246 item_size = btrfs_item_size_nr(leaf, path->slots[0]); 1247 } 1248 #endif 1249 BUG_ON(item_size < sizeof(*ei)); 1250 1251 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 1252 flags = btrfs_extent_flags(leaf, ei); 1253 1254 ptr = (unsigned long)(ei + 1); 1255 end = (unsigned long)ei + item_size; 1256 1257 if (flags & 
BTRFS_EXTENT_FLAG_TREE_BLOCK) { 1258 ptr += sizeof(struct btrfs_tree_block_info); 1259 BUG_ON(ptr > end); 1260 } else { 1261 BUG_ON(!(flags & BTRFS_EXTENT_FLAG_DATA)); 1262 } 1263 1264 err = -ENOENT; 1265 while (1) { 1266 if (ptr >= end) { 1267 WARN_ON(ptr > end); 1268 break; 1269 } 1270 iref = (struct btrfs_extent_inline_ref *)ptr; 1271 type = btrfs_extent_inline_ref_type(leaf, iref); 1272 if (want < type) 1273 break; 1274 if (want > type) { 1275 ptr += btrfs_extent_inline_ref_size(type); 1276 continue; 1277 } 1278 1279 if (type == BTRFS_EXTENT_DATA_REF_KEY) { 1280 struct btrfs_extent_data_ref *dref; 1281 dref = (struct btrfs_extent_data_ref *)(&iref->offset); 1282 if (match_extent_data_ref(leaf, dref, root_objectid, 1283 owner, offset)) { 1284 err = 0; 1285 break; 1286 } 1287 if (hash_extent_data_ref_item(leaf, dref) < 1288 hash_extent_data_ref(root_objectid, owner, offset)) 1289 break; 1290 } else { 1291 u64 ref_offset; 1292 ref_offset = btrfs_extent_inline_ref_offset(leaf, iref); 1293 if (parent > 0) { 1294 if (parent == ref_offset) { 1295 err = 0; 1296 break; 1297 } 1298 if (ref_offset < parent) 1299 break; 1300 } else { 1301 if (root_objectid == ref_offset) { 1302 err = 0; 1303 break; 1304 } 1305 if (ref_offset < root_objectid) 1306 break; 1307 } 1308 } 1309 ptr += btrfs_extent_inline_ref_size(type); 1310 } 1311 if (err == -ENOENT && insert) { 1312 if (item_size + extra_size >= 1313 BTRFS_MAX_EXTENT_ITEM_SIZE(root)) { 1314 err = -EAGAIN; 1315 goto out; 1316 } 1317 /* 1318 * To add new inline back ref, we have to make sure 1319 * there is no corresponding back ref item. 1320 * For simplicity, we just do not add new inline back 1321 * ref if there is any kind of item for this block 1322 */ 1323 if (find_next_key(path, 0, &key) == 0 && 1324 key.objectid == bytenr && 1325 key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) { 1326 err = -EAGAIN; 1327 goto out; 1328 } 1329 } 1330 *ref_ret = (struct btrfs_extent_inline_ref *)ptr; 1331 out: 1332 if (insert) { 1333 path->keep_locks = 0; 1334 btrfs_unlock_up_safe(path, 1); 1335 } 1336 return err; 1337 } 1338 1339 /* 1340 * helper to add new inline back ref 1341 */ 1342 static noinline_for_stack 1343 int setup_inline_extent_backref(struct btrfs_trans_handle *trans, 1344 struct btrfs_root *root, 1345 struct btrfs_path *path, 1346 struct btrfs_extent_inline_ref *iref, 1347 u64 parent, u64 root_objectid, 1348 u64 owner, u64 offset, int refs_to_add, 1349 struct btrfs_delayed_extent_op *extent_op) 1350 { 1351 struct extent_buffer *leaf; 1352 struct btrfs_extent_item *ei; 1353 unsigned long ptr; 1354 unsigned long end; 1355 unsigned long item_offset; 1356 u64 refs; 1357 int size; 1358 int type; 1359 int ret; 1360 1361 leaf = path->nodes[0]; 1362 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 1363 item_offset = (unsigned long)iref - (unsigned long)ei; 1364 1365 type = extent_ref_type(parent, owner); 1366 size = btrfs_extent_inline_ref_size(type); 1367 1368 ret = btrfs_extend_item(trans, root, path, size); 1369 BUG_ON(ret); 1370 1371 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 1372 refs = btrfs_extent_refs(leaf, ei); 1373 refs += refs_to_add; 1374 btrfs_set_extent_refs(leaf, ei, refs); 1375 if (extent_op) 1376 __run_delayed_extent_op(extent_op, leaf, ei); 1377 1378 ptr = (unsigned long)ei + item_offset; 1379 end = (unsigned long)ei + btrfs_item_size_nr(leaf, path->slots[0]); 1380 if (ptr < end - size) 1381 memmove_extent_buffer(leaf, ptr + size, ptr, 1382 end - size - ptr); 1383 1384 iref = (struct 
btrfs_extent_inline_ref *)ptr; 1385 btrfs_set_extent_inline_ref_type(leaf, iref, type); 1386 if (type == BTRFS_EXTENT_DATA_REF_KEY) { 1387 struct btrfs_extent_data_ref *dref; 1388 dref = (struct btrfs_extent_data_ref *)(&iref->offset); 1389 btrfs_set_extent_data_ref_root(leaf, dref, root_objectid); 1390 btrfs_set_extent_data_ref_objectid(leaf, dref, owner); 1391 btrfs_set_extent_data_ref_offset(leaf, dref, offset); 1392 btrfs_set_extent_data_ref_count(leaf, dref, refs_to_add); 1393 } else if (type == BTRFS_SHARED_DATA_REF_KEY) { 1394 struct btrfs_shared_data_ref *sref; 1395 sref = (struct btrfs_shared_data_ref *)(iref + 1); 1396 btrfs_set_shared_data_ref_count(leaf, sref, refs_to_add); 1397 btrfs_set_extent_inline_ref_offset(leaf, iref, parent); 1398 } else if (type == BTRFS_SHARED_BLOCK_REF_KEY) { 1399 btrfs_set_extent_inline_ref_offset(leaf, iref, parent); 1400 } else { 1401 btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid); 1402 } 1403 btrfs_mark_buffer_dirty(leaf); 1404 return 0; 1405 } 1406 1407 static int lookup_extent_backref(struct btrfs_trans_handle *trans, 1408 struct btrfs_root *root, 1409 struct btrfs_path *path, 1410 struct btrfs_extent_inline_ref **ref_ret, 1411 u64 bytenr, u64 num_bytes, u64 parent, 1412 u64 root_objectid, u64 owner, u64 offset) 1413 { 1414 int ret; 1415 1416 ret = lookup_inline_extent_backref(trans, root, path, ref_ret, 1417 bytenr, num_bytes, parent, 1418 root_objectid, owner, offset, 0); 1419 if (ret != -ENOENT) 1420 return ret; 1421 1422 btrfs_release_path(root, path); 1423 *ref_ret = NULL; 1424 1425 if (owner < BTRFS_FIRST_FREE_OBJECTID) { 1426 ret = lookup_tree_block_ref(trans, root, path, bytenr, parent, 1427 root_objectid); 1428 } else { 1429 ret = lookup_extent_data_ref(trans, root, path, bytenr, parent, 1430 root_objectid, owner, offset); 1431 } 1432 return ret; 1433 } 1434 1435 /* 1436 * helper to update/remove inline back ref 1437 */ 1438 static noinline_for_stack 1439 int update_inline_extent_backref(struct btrfs_trans_handle *trans, 1440 struct btrfs_root *root, 1441 struct btrfs_path *path, 1442 struct btrfs_extent_inline_ref *iref, 1443 int refs_to_mod, 1444 struct btrfs_delayed_extent_op *extent_op) 1445 { 1446 struct extent_buffer *leaf; 1447 struct btrfs_extent_item *ei; 1448 struct btrfs_extent_data_ref *dref = NULL; 1449 struct btrfs_shared_data_ref *sref = NULL; 1450 unsigned long ptr; 1451 unsigned long end; 1452 u32 item_size; 1453 int size; 1454 int type; 1455 int ret; 1456 u64 refs; 1457 1458 leaf = path->nodes[0]; 1459 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 1460 refs = btrfs_extent_refs(leaf, ei); 1461 WARN_ON(refs_to_mod < 0 && refs + refs_to_mod <= 0); 1462 refs += refs_to_mod; 1463 btrfs_set_extent_refs(leaf, ei, refs); 1464 if (extent_op) 1465 __run_delayed_extent_op(extent_op, leaf, ei); 1466 1467 type = btrfs_extent_inline_ref_type(leaf, iref); 1468 1469 if (type == BTRFS_EXTENT_DATA_REF_KEY) { 1470 dref = (struct btrfs_extent_data_ref *)(&iref->offset); 1471 refs = btrfs_extent_data_ref_count(leaf, dref); 1472 } else if (type == BTRFS_SHARED_DATA_REF_KEY) { 1473 sref = (struct btrfs_shared_data_ref *)(iref + 1); 1474 refs = btrfs_shared_data_ref_count(leaf, sref); 1475 } else { 1476 refs = 1; 1477 BUG_ON(refs_to_mod != -1); 1478 } 1479 1480 BUG_ON(refs_to_mod < 0 && refs < -refs_to_mod); 1481 refs += refs_to_mod; 1482 1483 if (refs > 0) { 1484 if (type == BTRFS_EXTENT_DATA_REF_KEY) 1485 btrfs_set_extent_data_ref_count(leaf, dref, refs); 1486 else 1487 
btrfs_set_shared_data_ref_count(leaf, sref, refs); 1488 } else { 1489 size = btrfs_extent_inline_ref_size(type); 1490 item_size = btrfs_item_size_nr(leaf, path->slots[0]); 1491 ptr = (unsigned long)iref; 1492 end = (unsigned long)ei + item_size; 1493 if (ptr + size < end) 1494 memmove_extent_buffer(leaf, ptr, ptr + size, 1495 end - ptr - size); 1496 item_size -= size; 1497 ret = btrfs_truncate_item(trans, root, path, item_size, 1); 1498 BUG_ON(ret); 1499 } 1500 btrfs_mark_buffer_dirty(leaf); 1501 return 0; 1502 } 1503 1504 static noinline_for_stack 1505 int insert_inline_extent_backref(struct btrfs_trans_handle *trans, 1506 struct btrfs_root *root, 1507 struct btrfs_path *path, 1508 u64 bytenr, u64 num_bytes, u64 parent, 1509 u64 root_objectid, u64 owner, 1510 u64 offset, int refs_to_add, 1511 struct btrfs_delayed_extent_op *extent_op) 1512 { 1513 struct btrfs_extent_inline_ref *iref; 1514 int ret; 1515 1516 ret = lookup_inline_extent_backref(trans, root, path, &iref, 1517 bytenr, num_bytes, parent, 1518 root_objectid, owner, offset, 1); 1519 if (ret == 0) { 1520 BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID); 1521 ret = update_inline_extent_backref(trans, root, path, iref, 1522 refs_to_add, extent_op); 1523 } else if (ret == -ENOENT) { 1524 ret = setup_inline_extent_backref(trans, root, path, iref, 1525 parent, root_objectid, 1526 owner, offset, refs_to_add, 1527 extent_op); 1528 } 1529 return ret; 1530 } 1531 1532 static int insert_extent_backref(struct btrfs_trans_handle *trans, 1533 struct btrfs_root *root, 1534 struct btrfs_path *path, 1535 u64 bytenr, u64 parent, u64 root_objectid, 1536 u64 owner, u64 offset, int refs_to_add) 1537 { 1538 int ret; 1539 if (owner < BTRFS_FIRST_FREE_OBJECTID) { 1540 BUG_ON(refs_to_add != 1); 1541 ret = insert_tree_block_ref(trans, root, path, bytenr, 1542 parent, root_objectid); 1543 } else { 1544 ret = insert_extent_data_ref(trans, root, path, bytenr, 1545 parent, root_objectid, 1546 owner, offset, refs_to_add); 1547 } 1548 return ret; 1549 } 1550 1551 static int remove_extent_backref(struct btrfs_trans_handle *trans, 1552 struct btrfs_root *root, 1553 struct btrfs_path *path, 1554 struct btrfs_extent_inline_ref *iref, 1555 int refs_to_drop, int is_data) 1556 { 1557 int ret; 1558 1559 BUG_ON(!is_data && refs_to_drop != 1); 1560 if (iref) { 1561 ret = update_inline_extent_backref(trans, root, path, iref, 1562 -refs_to_drop, NULL); 1563 } else if (is_data) { 1564 ret = remove_extent_data_ref(trans, root, path, refs_to_drop); 1565 } else { 1566 ret = btrfs_del_item(trans, root, path); 1567 } 1568 return ret; 1569 } 1570 1571 static void btrfs_issue_discard(struct block_device *bdev, 1572 u64 start, u64 len) 1573 { 1574 blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL, 1575 DISCARD_FL_BARRIER); 1576 } 1577 1578 static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, 1579 u64 num_bytes) 1580 { 1581 int ret; 1582 u64 map_length = num_bytes; 1583 struct btrfs_multi_bio *multi = NULL; 1584 1585 if (!btrfs_test_opt(root, DISCARD)) 1586 return 0; 1587 1588 /* Tell the block device(s) that the sectors can be discarded */ 1589 ret = btrfs_map_block(&root->fs_info->mapping_tree, READ, 1590 bytenr, &map_length, &multi, 0); 1591 if (!ret) { 1592 struct btrfs_bio_stripe *stripe = multi->stripes; 1593 int i; 1594 1595 if (map_length > num_bytes) 1596 map_length = num_bytes; 1597 1598 for (i = 0; i < multi->num_stripes; i++, stripe++) { 1599 btrfs_issue_discard(stripe->dev->bdev, 1600 stripe->physical, 1601 map_length); 1602 } 1603 kfree(multi); 1604 
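		/* each stripe was handed to its device in 512-byte sectors (see btrfs_issue_discard) */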
} 1605 1606 return ret; 1607 } 1608 1609 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, 1610 struct btrfs_root *root, 1611 u64 bytenr, u64 num_bytes, u64 parent, 1612 u64 root_objectid, u64 owner, u64 offset) 1613 { 1614 int ret; 1615 BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID && 1616 root_objectid == BTRFS_TREE_LOG_OBJECTID); 1617 1618 if (owner < BTRFS_FIRST_FREE_OBJECTID) { 1619 ret = btrfs_add_delayed_tree_ref(trans, bytenr, num_bytes, 1620 parent, root_objectid, (int)owner, 1621 BTRFS_ADD_DELAYED_REF, NULL); 1622 } else { 1623 ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes, 1624 parent, root_objectid, owner, offset, 1625 BTRFS_ADD_DELAYED_REF, NULL); 1626 } 1627 return ret; 1628 } 1629 1630 static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, 1631 struct btrfs_root *root, 1632 u64 bytenr, u64 num_bytes, 1633 u64 parent, u64 root_objectid, 1634 u64 owner, u64 offset, int refs_to_add, 1635 struct btrfs_delayed_extent_op *extent_op) 1636 { 1637 struct btrfs_path *path; 1638 struct extent_buffer *leaf; 1639 struct btrfs_extent_item *item; 1640 u64 refs; 1641 int ret; 1642 int err = 0; 1643 1644 path = btrfs_alloc_path(); 1645 if (!path) 1646 return -ENOMEM; 1647 1648 path->reada = 1; 1649 path->leave_spinning = 1; 1650 /* this will setup the path even if it fails to insert the back ref */ 1651 ret = insert_inline_extent_backref(trans, root->fs_info->extent_root, 1652 path, bytenr, num_bytes, parent, 1653 root_objectid, owner, offset, 1654 refs_to_add, extent_op); 1655 if (ret == 0) 1656 goto out; 1657 1658 if (ret != -EAGAIN) { 1659 err = ret; 1660 goto out; 1661 } 1662 1663 leaf = path->nodes[0]; 1664 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 1665 refs = btrfs_extent_refs(leaf, item); 1666 btrfs_set_extent_refs(leaf, item, refs + refs_to_add); 1667 if (extent_op) 1668 __run_delayed_extent_op(extent_op, leaf, item); 1669 1670 btrfs_mark_buffer_dirty(leaf); 1671 btrfs_release_path(root->fs_info->extent_root, path); 1672 1673 path->reada = 1; 1674 path->leave_spinning = 1; 1675 1676 /* now insert the actual backref */ 1677 ret = insert_extent_backref(trans, root->fs_info->extent_root, 1678 path, bytenr, parent, root_objectid, 1679 owner, offset, refs_to_add); 1680 BUG_ON(ret); 1681 out: 1682 btrfs_free_path(path); 1683 return err; 1684 } 1685 1686 static int run_delayed_data_ref(struct btrfs_trans_handle *trans, 1687 struct btrfs_root *root, 1688 struct btrfs_delayed_ref_node *node, 1689 struct btrfs_delayed_extent_op *extent_op, 1690 int insert_reserved) 1691 { 1692 int ret = 0; 1693 struct btrfs_delayed_data_ref *ref; 1694 struct btrfs_key ins; 1695 u64 parent = 0; 1696 u64 ref_root = 0; 1697 u64 flags = 0; 1698 1699 ins.objectid = node->bytenr; 1700 ins.offset = node->num_bytes; 1701 ins.type = BTRFS_EXTENT_ITEM_KEY; 1702 1703 ref = btrfs_delayed_node_to_data_ref(node); 1704 if (node->type == BTRFS_SHARED_DATA_REF_KEY) 1705 parent = ref->parent; 1706 else 1707 ref_root = ref->root; 1708 1709 if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) { 1710 if (extent_op) { 1711 BUG_ON(extent_op->update_key); 1712 flags |= extent_op->flags_to_set; 1713 } 1714 ret = alloc_reserved_file_extent(trans, root, 1715 parent, ref_root, flags, 1716 ref->objectid, ref->offset, 1717 &ins, node->ref_mod); 1718 } else if (node->action == BTRFS_ADD_DELAYED_REF) { 1719 ret = __btrfs_inc_extent_ref(trans, root, node->bytenr, 1720 node->num_bytes, parent, 1721 ref_root, ref->objectid, 1722 ref->offset, node->ref_mod, 1723 extent_op); 1724 
} else if (node->action == BTRFS_DROP_DELAYED_REF) { 1725 ret = __btrfs_free_extent(trans, root, node->bytenr, 1726 node->num_bytes, parent, 1727 ref_root, ref->objectid, 1728 ref->offset, node->ref_mod, 1729 extent_op); 1730 } else { 1731 BUG(); 1732 } 1733 return ret; 1734 } 1735 1736 static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op, 1737 struct extent_buffer *leaf, 1738 struct btrfs_extent_item *ei) 1739 { 1740 u64 flags = btrfs_extent_flags(leaf, ei); 1741 if (extent_op->update_flags) { 1742 flags |= extent_op->flags_to_set; 1743 btrfs_set_extent_flags(leaf, ei, flags); 1744 } 1745 1746 if (extent_op->update_key) { 1747 struct btrfs_tree_block_info *bi; 1748 BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)); 1749 bi = (struct btrfs_tree_block_info *)(ei + 1); 1750 btrfs_set_tree_block_key(leaf, bi, &extent_op->key); 1751 } 1752 } 1753 1754 static int run_delayed_extent_op(struct btrfs_trans_handle *trans, 1755 struct btrfs_root *root, 1756 struct btrfs_delayed_ref_node *node, 1757 struct btrfs_delayed_extent_op *extent_op) 1758 { 1759 struct btrfs_key key; 1760 struct btrfs_path *path; 1761 struct btrfs_extent_item *ei; 1762 struct extent_buffer *leaf; 1763 u32 item_size; 1764 int ret; 1765 int err = 0; 1766 1767 path = btrfs_alloc_path(); 1768 if (!path) 1769 return -ENOMEM; 1770 1771 key.objectid = node->bytenr; 1772 key.type = BTRFS_EXTENT_ITEM_KEY; 1773 key.offset = node->num_bytes; 1774 1775 path->reada = 1; 1776 path->leave_spinning = 1; 1777 ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, 1778 path, 0, 1); 1779 if (ret < 0) { 1780 err = ret; 1781 goto out; 1782 } 1783 if (ret > 0) { 1784 err = -EIO; 1785 goto out; 1786 } 1787 1788 leaf = path->nodes[0]; 1789 item_size = btrfs_item_size_nr(leaf, path->slots[0]); 1790 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 1791 if (item_size < sizeof(*ei)) { 1792 ret = convert_extent_item_v0(trans, root->fs_info->extent_root, 1793 path, (u64)-1, 0); 1794 if (ret < 0) { 1795 err = ret; 1796 goto out; 1797 } 1798 leaf = path->nodes[0]; 1799 item_size = btrfs_item_size_nr(leaf, path->slots[0]); 1800 } 1801 #endif 1802 BUG_ON(item_size < sizeof(*ei)); 1803 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 1804 __run_delayed_extent_op(extent_op, leaf, ei); 1805 1806 btrfs_mark_buffer_dirty(leaf); 1807 out: 1808 btrfs_free_path(path); 1809 return err; 1810 } 1811 1812 static int run_delayed_tree_ref(struct btrfs_trans_handle *trans, 1813 struct btrfs_root *root, 1814 struct btrfs_delayed_ref_node *node, 1815 struct btrfs_delayed_extent_op *extent_op, 1816 int insert_reserved) 1817 { 1818 int ret = 0; 1819 struct btrfs_delayed_tree_ref *ref; 1820 struct btrfs_key ins; 1821 u64 parent = 0; 1822 u64 ref_root = 0; 1823 1824 ins.objectid = node->bytenr; 1825 ins.offset = node->num_bytes; 1826 ins.type = BTRFS_EXTENT_ITEM_KEY; 1827 1828 ref = btrfs_delayed_node_to_tree_ref(node); 1829 if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) 1830 parent = ref->parent; 1831 else 1832 ref_root = ref->root; 1833 1834 BUG_ON(node->ref_mod != 1); 1835 if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) { 1836 BUG_ON(!extent_op || !extent_op->update_flags || 1837 !extent_op->update_key); 1838 ret = alloc_reserved_tree_block(trans, root, 1839 parent, ref_root, 1840 extent_op->flags_to_set, 1841 &extent_op->key, 1842 ref->level, &ins); 1843 } else if (node->action == BTRFS_ADD_DELAYED_REF) { 1844 ret = __btrfs_inc_extent_ref(trans, root, node->bytenr, 1845 node->num_bytes, parent, ref_root, 1846 ref->level, 0, 
1, extent_op); 1847 } else if (node->action == BTRFS_DROP_DELAYED_REF) { 1848 ret = __btrfs_free_extent(trans, root, node->bytenr, 1849 node->num_bytes, parent, ref_root, 1850 ref->level, 0, 1, extent_op); 1851 } else { 1852 BUG(); 1853 } 1854 return ret; 1855 } 1856 1857 1858 /* helper function to actually process a single delayed ref entry */ 1859 static int run_one_delayed_ref(struct btrfs_trans_handle *trans, 1860 struct btrfs_root *root, 1861 struct btrfs_delayed_ref_node *node, 1862 struct btrfs_delayed_extent_op *extent_op, 1863 int insert_reserved) 1864 { 1865 int ret; 1866 if (btrfs_delayed_ref_is_head(node)) { 1867 struct btrfs_delayed_ref_head *head; 1868 /* 1869 * we've hit the end of the chain and we were supposed 1870 * to insert this extent into the tree. But, it got 1871 * deleted before we ever needed to insert it, so all 1872 * we have to do is clean up the accounting 1873 */ 1874 BUG_ON(extent_op); 1875 head = btrfs_delayed_node_to_head(node); 1876 if (insert_reserved) { 1877 int mark_free = 0; 1878 struct extent_buffer *must_clean = NULL; 1879 1880 ret = pin_down_bytes(trans, root, NULL, 1881 node->bytenr, node->num_bytes, 1882 head->is_data, 1, &must_clean); 1883 if (ret > 0) 1884 mark_free = 1; 1885 1886 if (must_clean) { 1887 clean_tree_block(NULL, root, must_clean); 1888 btrfs_tree_unlock(must_clean); 1889 free_extent_buffer(must_clean); 1890 } 1891 if (head->is_data) { 1892 ret = btrfs_del_csums(trans, root, 1893 node->bytenr, 1894 node->num_bytes); 1895 BUG_ON(ret); 1896 } 1897 if (mark_free) { 1898 ret = btrfs_free_reserved_extent(root, 1899 node->bytenr, 1900 node->num_bytes); 1901 BUG_ON(ret); 1902 } 1903 } 1904 mutex_unlock(&head->mutex); 1905 return 0; 1906 } 1907 1908 if (node->type == BTRFS_TREE_BLOCK_REF_KEY || 1909 node->type == BTRFS_SHARED_BLOCK_REF_KEY) 1910 ret = run_delayed_tree_ref(trans, root, node, extent_op, 1911 insert_reserved); 1912 else if (node->type == BTRFS_EXTENT_DATA_REF_KEY || 1913 node->type == BTRFS_SHARED_DATA_REF_KEY) 1914 ret = run_delayed_data_ref(trans, root, node, extent_op, 1915 insert_reserved); 1916 else 1917 BUG(); 1918 return ret; 1919 } 1920 1921 static noinline struct btrfs_delayed_ref_node * 1922 select_delayed_ref(struct btrfs_delayed_ref_head *head) 1923 { 1924 struct rb_node *node; 1925 struct btrfs_delayed_ref_node *ref; 1926 int action = BTRFS_ADD_DELAYED_REF; 1927 again: 1928 /* 1929 * select delayed ref of type BTRFS_ADD_DELAYED_REF first. 1930 * this prevents ref count from going down to zero when 1931 * there still are pending delayed ref. 
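 * If no ADD ref remains for this head, the search below is retried with
 * the action switched to BTRFS_DROP_DELAYED_REF.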
1932 */ 1933 node = rb_prev(&head->node.rb_node); 1934 while (1) { 1935 if (!node) 1936 break; 1937 ref = rb_entry(node, struct btrfs_delayed_ref_node, 1938 rb_node); 1939 if (ref->bytenr != head->node.bytenr) 1940 break; 1941 if (ref->action == action) 1942 return ref; 1943 node = rb_prev(node); 1944 } 1945 if (action == BTRFS_ADD_DELAYED_REF) { 1946 action = BTRFS_DROP_DELAYED_REF; 1947 goto again; 1948 } 1949 return NULL; 1950 } 1951 1952 static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, 1953 struct btrfs_root *root, 1954 struct list_head *cluster) 1955 { 1956 struct btrfs_delayed_ref_root *delayed_refs; 1957 struct btrfs_delayed_ref_node *ref; 1958 struct btrfs_delayed_ref_head *locked_ref = NULL; 1959 struct btrfs_delayed_extent_op *extent_op; 1960 int ret; 1961 int count = 0; 1962 int must_insert_reserved = 0; 1963 1964 delayed_refs = &trans->transaction->delayed_refs; 1965 while (1) { 1966 if (!locked_ref) { 1967 /* pick a new head ref from the cluster list */ 1968 if (list_empty(cluster)) 1969 break; 1970 1971 locked_ref = list_entry(cluster->next, 1972 struct btrfs_delayed_ref_head, cluster); 1973 1974 /* grab the lock that says we are going to process 1975 * all the refs for this head */ 1976 ret = btrfs_delayed_ref_lock(trans, locked_ref); 1977 1978 /* 1979 * we may have dropped the spin lock to get the head 1980 * mutex lock, and that might have given someone else 1981 * time to free the head. If that's true, it has been 1982 * removed from our list and we can move on. 1983 */ 1984 if (ret == -EAGAIN) { 1985 locked_ref = NULL; 1986 count++; 1987 continue; 1988 } 1989 } 1990 1991 /* 1992 * record the must insert reserved flag before we 1993 * drop the spin lock. 1994 */ 1995 must_insert_reserved = locked_ref->must_insert_reserved; 1996 locked_ref->must_insert_reserved = 0; 1997 1998 extent_op = locked_ref->extent_op; 1999 locked_ref->extent_op = NULL; 2000 2001 /* 2002 * locked_ref is the head node, so we have to go one 2003 * node back for any delayed ref updates 2004 */ 2005 ref = select_delayed_ref(locked_ref); 2006 if (!ref) { 2007 /* All delayed refs have been processed, Go ahead 2008 * and send the head node to run_one_delayed_ref, 2009 * so that any accounting fixes can happen 2010 */ 2011 ref = &locked_ref->node; 2012 2013 if (extent_op && must_insert_reserved) { 2014 kfree(extent_op); 2015 extent_op = NULL; 2016 } 2017 2018 if (extent_op) { 2019 spin_unlock(&delayed_refs->lock); 2020 2021 ret = run_delayed_extent_op(trans, root, 2022 ref, extent_op); 2023 BUG_ON(ret); 2024 kfree(extent_op); 2025 2026 cond_resched(); 2027 spin_lock(&delayed_refs->lock); 2028 continue; 2029 } 2030 2031 list_del_init(&locked_ref->cluster); 2032 locked_ref = NULL; 2033 } 2034 2035 ref->in_tree = 0; 2036 rb_erase(&ref->rb_node, &delayed_refs->root); 2037 delayed_refs->num_entries--; 2038 2039 spin_unlock(&delayed_refs->lock); 2040 2041 ret = run_one_delayed_ref(trans, root, ref, extent_op, 2042 must_insert_reserved); 2043 BUG_ON(ret); 2044 2045 btrfs_put_delayed_ref(ref); 2046 kfree(extent_op); 2047 count++; 2048 2049 cond_resched(); 2050 spin_lock(&delayed_refs->lock); 2051 } 2052 return count; 2053 } 2054 2055 /* 2056 * this starts processing the delayed reference count updates and 2057 * extent insertions we have queued up so far. count can be 2058 * 0, which means to process everything in the tree at the start 2059 * of the run (but not newly added entries), or it can be some target 2060 * number you'd like to process. 
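 *
 * for example, callers that need to drain everything that is queued
 * (such as the transaction commit path) pass (unsigned long)-1, while a
 * count of 0 caps the work at roughly twice the number of entries that
 * were pending when the run started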
2061 */ 2062 int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, 2063 struct btrfs_root *root, unsigned long count) 2064 { 2065 struct rb_node *node; 2066 struct btrfs_delayed_ref_root *delayed_refs; 2067 struct btrfs_delayed_ref_node *ref; 2068 struct list_head cluster; 2069 int ret; 2070 int run_all = count == (unsigned long)-1; 2071 int run_most = 0; 2072 2073 if (root == root->fs_info->extent_root) 2074 root = root->fs_info->tree_root; 2075 2076 delayed_refs = &trans->transaction->delayed_refs; 2077 INIT_LIST_HEAD(&cluster); 2078 again: 2079 spin_lock(&delayed_refs->lock); 2080 if (count == 0) { 2081 count = delayed_refs->num_entries * 2; 2082 run_most = 1; 2083 } 2084 while (1) { 2085 if (!(run_all || run_most) && 2086 delayed_refs->num_heads_ready < 64) 2087 break; 2088 2089 /* 2090 * go find something we can process in the rbtree. We start at 2091 * the beginning of the tree, and then build a cluster 2092 * of refs to process starting at the first one we are able to 2093 * lock 2094 */ 2095 ret = btrfs_find_ref_cluster(trans, &cluster, 2096 delayed_refs->run_delayed_start); 2097 if (ret) 2098 break; 2099 2100 ret = run_clustered_refs(trans, root, &cluster); 2101 BUG_ON(ret < 0); 2102 2103 count -= min_t(unsigned long, ret, count); 2104 2105 if (count == 0) 2106 break; 2107 } 2108 2109 if (run_all) { 2110 node = rb_first(&delayed_refs->root); 2111 if (!node) 2112 goto out; 2113 count = (unsigned long)-1; 2114 2115 while (node) { 2116 ref = rb_entry(node, struct btrfs_delayed_ref_node, 2117 rb_node); 2118 if (btrfs_delayed_ref_is_head(ref)) { 2119 struct btrfs_delayed_ref_head *head; 2120 2121 head = btrfs_delayed_node_to_head(ref); 2122 atomic_inc(&ref->refs); 2123 2124 spin_unlock(&delayed_refs->lock); 2125 mutex_lock(&head->mutex); 2126 mutex_unlock(&head->mutex); 2127 2128 btrfs_put_delayed_ref(ref); 2129 cond_resched(); 2130 goto again; 2131 } 2132 node = rb_next(node); 2133 } 2134 spin_unlock(&delayed_refs->lock); 2135 schedule_timeout(1); 2136 goto again; 2137 } 2138 out: 2139 spin_unlock(&delayed_refs->lock); 2140 return 0; 2141 } 2142 2143 int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, 2144 struct btrfs_root *root, 2145 u64 bytenr, u64 num_bytes, u64 flags, 2146 int is_data) 2147 { 2148 struct btrfs_delayed_extent_op *extent_op; 2149 int ret; 2150 2151 extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS); 2152 if (!extent_op) 2153 return -ENOMEM; 2154 2155 extent_op->flags_to_set = flags; 2156 extent_op->update_flags = 1; 2157 extent_op->update_key = 0; 2158 extent_op->is_data = is_data ? 
1 : 0; 2159 2160 ret = btrfs_add_delayed_extent_op(trans, bytenr, num_bytes, extent_op); 2161 if (ret) 2162 kfree(extent_op); 2163 return ret; 2164 } 2165 2166 static noinline int check_delayed_ref(struct btrfs_trans_handle *trans, 2167 struct btrfs_root *root, 2168 struct btrfs_path *path, 2169 u64 objectid, u64 offset, u64 bytenr) 2170 { 2171 struct btrfs_delayed_ref_head *head; 2172 struct btrfs_delayed_ref_node *ref; 2173 struct btrfs_delayed_data_ref *data_ref; 2174 struct btrfs_delayed_ref_root *delayed_refs; 2175 struct rb_node *node; 2176 int ret = 0; 2177 2178 ret = -ENOENT; 2179 delayed_refs = &trans->transaction->delayed_refs; 2180 spin_lock(&delayed_refs->lock); 2181 head = btrfs_find_delayed_ref_head(trans, bytenr); 2182 if (!head) 2183 goto out; 2184 2185 if (!mutex_trylock(&head->mutex)) { 2186 atomic_inc(&head->node.refs); 2187 spin_unlock(&delayed_refs->lock); 2188 2189 btrfs_release_path(root->fs_info->extent_root, path); 2190 2191 mutex_lock(&head->mutex); 2192 mutex_unlock(&head->mutex); 2193 btrfs_put_delayed_ref(&head->node); 2194 return -EAGAIN; 2195 } 2196 2197 node = rb_prev(&head->node.rb_node); 2198 if (!node) 2199 goto out_unlock; 2200 2201 ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); 2202 2203 if (ref->bytenr != bytenr) 2204 goto out_unlock; 2205 2206 ret = 1; 2207 if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) 2208 goto out_unlock; 2209 2210 data_ref = btrfs_delayed_node_to_data_ref(ref); 2211 2212 node = rb_prev(node); 2213 if (node) { 2214 ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); 2215 if (ref->bytenr == bytenr) 2216 goto out_unlock; 2217 } 2218 2219 if (data_ref->root != root->root_key.objectid || 2220 data_ref->objectid != objectid || data_ref->offset != offset) 2221 goto out_unlock; 2222 2223 ret = 0; 2224 out_unlock: 2225 mutex_unlock(&head->mutex); 2226 out: 2227 spin_unlock(&delayed_refs->lock); 2228 return ret; 2229 } 2230 2231 static noinline int check_committed_ref(struct btrfs_trans_handle *trans, 2232 struct btrfs_root *root, 2233 struct btrfs_path *path, 2234 u64 objectid, u64 offset, u64 bytenr) 2235 { 2236 struct btrfs_root *extent_root = root->fs_info->extent_root; 2237 struct extent_buffer *leaf; 2238 struct btrfs_extent_data_ref *ref; 2239 struct btrfs_extent_inline_ref *iref; 2240 struct btrfs_extent_item *ei; 2241 struct btrfs_key key; 2242 u32 item_size; 2243 int ret; 2244 2245 key.objectid = bytenr; 2246 key.offset = (u64)-1; 2247 key.type = BTRFS_EXTENT_ITEM_KEY; 2248 2249 ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); 2250 if (ret < 0) 2251 goto out; 2252 BUG_ON(ret == 0); 2253 2254 ret = -ENOENT; 2255 if (path->slots[0] == 0) 2256 goto out; 2257 2258 path->slots[0]--; 2259 leaf = path->nodes[0]; 2260 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 2261 2262 if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY) 2263 goto out; 2264 2265 ret = 1; 2266 item_size = btrfs_item_size_nr(leaf, path->slots[0]); 2267 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 2268 if (item_size < sizeof(*ei)) { 2269 WARN_ON(item_size != sizeof(struct btrfs_extent_item_v0)); 2270 goto out; 2271 } 2272 #endif 2273 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 2274 2275 if (item_size != sizeof(*ei) + 2276 btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY)) 2277 goto out; 2278 2279 if (btrfs_extent_generation(leaf, ei) <= 2280 btrfs_root_last_snapshot(&root->root_item)) 2281 goto out; 2282 2283 iref = (struct btrfs_extent_inline_ref *)(ei + 1); 2284 if 
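	/*
	 * the only layout we accept here is a single inline EXTENT_DATA_REF
	 * that points back at this root/objectid/offset and accounts for
	 * every reference on the extent; anything else is reported as a
	 * possible cross reference
	 */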
(btrfs_extent_inline_ref_type(leaf, iref) != 2285 BTRFS_EXTENT_DATA_REF_KEY) 2286 goto out; 2287 2288 ref = (struct btrfs_extent_data_ref *)(&iref->offset); 2289 if (btrfs_extent_refs(leaf, ei) != 2290 btrfs_extent_data_ref_count(leaf, ref) || 2291 btrfs_extent_data_ref_root(leaf, ref) != 2292 root->root_key.objectid || 2293 btrfs_extent_data_ref_objectid(leaf, ref) != objectid || 2294 btrfs_extent_data_ref_offset(leaf, ref) != offset) 2295 goto out; 2296 2297 ret = 0; 2298 out: 2299 return ret; 2300 } 2301 2302 int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans, 2303 struct btrfs_root *root, 2304 u64 objectid, u64 offset, u64 bytenr) 2305 { 2306 struct btrfs_path *path; 2307 int ret; 2308 int ret2; 2309 2310 path = btrfs_alloc_path(); 2311 if (!path) 2312 return -ENOENT; 2313 2314 do { 2315 ret = check_committed_ref(trans, root, path, objectid, 2316 offset, bytenr); 2317 if (ret && ret != -ENOENT) 2318 goto out; 2319 2320 ret2 = check_delayed_ref(trans, root, path, objectid, 2321 offset, bytenr); 2322 } while (ret2 == -EAGAIN); 2323 2324 if (ret2 && ret2 != -ENOENT) { 2325 ret = ret2; 2326 goto out; 2327 } 2328 2329 if (ret != -ENOENT || ret2 != -ENOENT) 2330 ret = 0; 2331 out: 2332 btrfs_free_path(path); 2333 return ret; 2334 } 2335 2336 #if 0 2337 int btrfs_cache_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, 2338 struct extent_buffer *buf, u32 nr_extents) 2339 { 2340 struct btrfs_key key; 2341 struct btrfs_file_extent_item *fi; 2342 u64 root_gen; 2343 u32 nritems; 2344 int i; 2345 int level; 2346 int ret = 0; 2347 int shared = 0; 2348 2349 if (!root->ref_cows) 2350 return 0; 2351 2352 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) { 2353 shared = 0; 2354 root_gen = root->root_key.offset; 2355 } else { 2356 shared = 1; 2357 root_gen = trans->transid - 1; 2358 } 2359 2360 level = btrfs_header_level(buf); 2361 nritems = btrfs_header_nritems(buf); 2362 2363 if (level == 0) { 2364 struct btrfs_leaf_ref *ref; 2365 struct btrfs_extent_info *info; 2366 2367 ref = btrfs_alloc_leaf_ref(root, nr_extents); 2368 if (!ref) { 2369 ret = -ENOMEM; 2370 goto out; 2371 } 2372 2373 ref->root_gen = root_gen; 2374 ref->bytenr = buf->start; 2375 ref->owner = btrfs_header_owner(buf); 2376 ref->generation = btrfs_header_generation(buf); 2377 ref->nritems = nr_extents; 2378 info = ref->extents; 2379 2380 for (i = 0; nr_extents > 0 && i < nritems; i++) { 2381 u64 disk_bytenr; 2382 btrfs_item_key_to_cpu(buf, &key, i); 2383 if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY) 2384 continue; 2385 fi = btrfs_item_ptr(buf, i, 2386 struct btrfs_file_extent_item); 2387 if (btrfs_file_extent_type(buf, fi) == 2388 BTRFS_FILE_EXTENT_INLINE) 2389 continue; 2390 disk_bytenr = btrfs_file_extent_disk_bytenr(buf, fi); 2391 if (disk_bytenr == 0) 2392 continue; 2393 2394 info->bytenr = disk_bytenr; 2395 info->num_bytes = 2396 btrfs_file_extent_disk_num_bytes(buf, fi); 2397 info->objectid = key.objectid; 2398 info->offset = key.offset; 2399 info++; 2400 } 2401 2402 ret = btrfs_add_leaf_ref(root, ref, shared); 2403 if (ret == -EEXIST && shared) { 2404 struct btrfs_leaf_ref *old; 2405 old = btrfs_lookup_leaf_ref(root, ref->bytenr); 2406 BUG_ON(!old); 2407 btrfs_remove_leaf_ref(root, old); 2408 btrfs_free_leaf_ref(root, old); 2409 ret = btrfs_add_leaf_ref(root, ref, shared); 2410 } 2411 WARN_ON(ret); 2412 btrfs_free_leaf_ref(root, ref); 2413 } 2414 out: 2415 return ret; 2416 } 2417 2418 /* when a block goes through cow, we update the reference counts of 2419 * everything that block points to. 
The internal pointers of the block 2420 * can be in just about any order, and it is likely to have clusters of 2421 * things that are close together and clusters of things that are not. 2422 * 2423 * To help reduce the seeks that come with updating all of these reference 2424 * counts, sort them by byte number before actual updates are done. 2425 * 2426 * struct refsort is used to match byte number to slot in the btree block. 2427 * we sort based on the byte number and then use the slot to actually 2428 * find the item. 2429 * 2430 * struct refsort is smaller than struct btrfs_item and smaller than 2431 * struct btrfs_key_ptr. Since we're currently limited to the page size 2432 * for a btree block, there's no way for a kmalloc of refsorts for a 2433 * single node to be bigger than a page. 2434 */ 2435 struct refsort { 2436 u64 bytenr; 2437 u32 slot; 2438 }; 2439 2440 /* 2441 * for passing into sort() 2442 */ 2443 static int refsort_cmp(const void *a_void, const void *b_void) 2444 { 2445 const struct refsort *a = a_void; 2446 const struct refsort *b = b_void; 2447 2448 if (a->bytenr < b->bytenr) 2449 return -1; 2450 if (a->bytenr > b->bytenr) 2451 return 1; 2452 return 0; 2453 } 2454 #endif 2455 2456 static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, 2457 struct btrfs_root *root, 2458 struct extent_buffer *buf, 2459 int full_backref, int inc) 2460 { 2461 u64 bytenr; 2462 u64 num_bytes; 2463 u64 parent; 2464 u64 ref_root; 2465 u32 nritems; 2466 struct btrfs_key key; 2467 struct btrfs_file_extent_item *fi; 2468 int i; 2469 int level; 2470 int ret = 0; 2471 int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *, 2472 u64, u64, u64, u64, u64, u64); 2473 2474 ref_root = btrfs_header_owner(buf); 2475 nritems = btrfs_header_nritems(buf); 2476 level = btrfs_header_level(buf); 2477 2478 if (!root->ref_cows && level == 0) 2479 return 0; 2480 2481 if (inc) 2482 process_func = btrfs_inc_extent_ref; 2483 else 2484 process_func = btrfs_free_extent; 2485 2486 if (full_backref) 2487 parent = buf->start; 2488 else 2489 parent = 0; 2490 2491 for (i = 0; i < nritems; i++) { 2492 if (level == 0) { 2493 btrfs_item_key_to_cpu(buf, &key, i); 2494 if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY) 2495 continue; 2496 fi = btrfs_item_ptr(buf, i, 2497 struct btrfs_file_extent_item); 2498 if (btrfs_file_extent_type(buf, fi) == 2499 BTRFS_FILE_EXTENT_INLINE) 2500 continue; 2501 bytenr = btrfs_file_extent_disk_bytenr(buf, fi); 2502 if (bytenr == 0) 2503 continue; 2504 2505 num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi); 2506 key.offset -= btrfs_file_extent_offset(buf, fi); 2507 ret = process_func(trans, root, bytenr, num_bytes, 2508 parent, ref_root, key.objectid, 2509 key.offset); 2510 if (ret) 2511 goto fail; 2512 } else { 2513 bytenr = btrfs_node_blockptr(buf, i); 2514 num_bytes = btrfs_level_size(root, level - 1); 2515 ret = process_func(trans, root, bytenr, num_bytes, 2516 parent, ref_root, level - 1, 0); 2517 if (ret) 2518 goto fail; 2519 } 2520 } 2521 return 0; 2522 fail: 2523 BUG(); 2524 return ret; 2525 } 2526 2527 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, 2528 struct extent_buffer *buf, int full_backref) 2529 { 2530 return __btrfs_mod_ref(trans, root, buf, full_backref, 1); 2531 } 2532 2533 int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, 2534 struct extent_buffer *buf, int full_backref) 2535 { 2536 return __btrfs_mod_ref(trans, root, buf, full_backref, 0); 2537 } 2538 2539 static int write_one_cache_group(struct
btrfs_trans_handle *trans, 2540 struct btrfs_root *root, 2541 struct btrfs_path *path, 2542 struct btrfs_block_group_cache *cache) 2543 { 2544 int ret; 2545 struct btrfs_root *extent_root = root->fs_info->extent_root; 2546 unsigned long bi; 2547 struct extent_buffer *leaf; 2548 2549 ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1); 2550 if (ret < 0) 2551 goto fail; 2552 BUG_ON(ret); 2553 2554 leaf = path->nodes[0]; 2555 bi = btrfs_item_ptr_offset(leaf, path->slots[0]); 2556 write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item)); 2557 btrfs_mark_buffer_dirty(leaf); 2558 btrfs_release_path(extent_root, path); 2559 fail: 2560 if (ret) 2561 return ret; 2562 return 0; 2563 2564 } 2565 2566 static struct btrfs_block_group_cache * 2567 next_block_group(struct btrfs_root *root, 2568 struct btrfs_block_group_cache *cache) 2569 { 2570 struct rb_node *node; 2571 spin_lock(&root->fs_info->block_group_cache_lock); 2572 node = rb_next(&cache->cache_node); 2573 btrfs_put_block_group(cache); 2574 if (node) { 2575 cache = rb_entry(node, struct btrfs_block_group_cache, 2576 cache_node); 2577 atomic_inc(&cache->count); 2578 } else 2579 cache = NULL; 2580 spin_unlock(&root->fs_info->block_group_cache_lock); 2581 return cache; 2582 } 2583 2584 int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, 2585 struct btrfs_root *root) 2586 { 2587 struct btrfs_block_group_cache *cache; 2588 int err = 0; 2589 struct btrfs_path *path; 2590 u64 last = 0; 2591 2592 path = btrfs_alloc_path(); 2593 if (!path) 2594 return -ENOMEM; 2595 2596 while (1) { 2597 if (last == 0) { 2598 err = btrfs_run_delayed_refs(trans, root, 2599 (unsigned long)-1); 2600 BUG_ON(err); 2601 } 2602 2603 cache = btrfs_lookup_first_block_group(root->fs_info, last); 2604 while (cache) { 2605 if (cache->dirty) 2606 break; 2607 cache = next_block_group(root, cache); 2608 } 2609 if (!cache) { 2610 if (last == 0) 2611 break; 2612 last = 0; 2613 continue; 2614 } 2615 2616 cache->dirty = 0; 2617 last = cache->key.objectid + cache->key.offset; 2618 2619 err = write_one_cache_group(trans, root, path, cache); 2620 BUG_ON(err); 2621 btrfs_put_block_group(cache); 2622 } 2623 2624 btrfs_free_path(path); 2625 return 0; 2626 } 2627 2628 int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr) 2629 { 2630 struct btrfs_block_group_cache *block_group; 2631 int readonly = 0; 2632 2633 block_group = btrfs_lookup_block_group(root->fs_info, bytenr); 2634 if (!block_group || block_group->ro) 2635 readonly = 1; 2636 if (block_group) 2637 btrfs_put_block_group(block_group); 2638 return readonly; 2639 } 2640 2641 static int update_space_info(struct btrfs_fs_info *info, u64 flags, 2642 u64 total_bytes, u64 bytes_used, 2643 struct btrfs_space_info **space_info) 2644 { 2645 struct btrfs_space_info *found; 2646 2647 found = __find_space_info(info, flags); 2648 if (found) { 2649 spin_lock(&found->lock); 2650 found->total_bytes += total_bytes; 2651 found->bytes_used += bytes_used; 2652 found->full = 0; 2653 spin_unlock(&found->lock); 2654 *space_info = found; 2655 return 0; 2656 } 2657 found = kzalloc(sizeof(*found), GFP_NOFS); 2658 if (!found) 2659 return -ENOMEM; 2660 2661 INIT_LIST_HEAD(&found->block_groups); 2662 init_rwsem(&found->groups_sem); 2663 spin_lock_init(&found->lock); 2664 found->flags = flags; 2665 found->total_bytes = total_bytes; 2666 found->bytes_used = bytes_used; 2667 found->bytes_pinned = 0; 2668 found->bytes_reserved = 0; 2669 found->bytes_readonly = 0; 2670 found->bytes_delalloc = 0; 2671 found->full = 0; 2672 
found->force_alloc = 0; 2673 *space_info = found; 2674 list_add_rcu(&found->list, &info->space_info); 2675 atomic_set(&found->caching_threads, 0); 2676 return 0; 2677 } 2678 2679 static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) 2680 { 2681 u64 extra_flags = flags & (BTRFS_BLOCK_GROUP_RAID0 | 2682 BTRFS_BLOCK_GROUP_RAID1 | 2683 BTRFS_BLOCK_GROUP_RAID10 | 2684 BTRFS_BLOCK_GROUP_DUP); 2685 if (extra_flags) { 2686 if (flags & BTRFS_BLOCK_GROUP_DATA) 2687 fs_info->avail_data_alloc_bits |= extra_flags; 2688 if (flags & BTRFS_BLOCK_GROUP_METADATA) 2689 fs_info->avail_metadata_alloc_bits |= extra_flags; 2690 if (flags & BTRFS_BLOCK_GROUP_SYSTEM) 2691 fs_info->avail_system_alloc_bits |= extra_flags; 2692 } 2693 } 2694 2695 static void set_block_group_readonly(struct btrfs_block_group_cache *cache) 2696 { 2697 spin_lock(&cache->space_info->lock); 2698 spin_lock(&cache->lock); 2699 if (!cache->ro) { 2700 cache->space_info->bytes_readonly += cache->key.offset - 2701 btrfs_block_group_used(&cache->item); 2702 cache->ro = 1; 2703 } 2704 spin_unlock(&cache->lock); 2705 spin_unlock(&cache->space_info->lock); 2706 } 2707 2708 u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags) 2709 { 2710 u64 num_devices = root->fs_info->fs_devices->rw_devices; 2711 2712 if (num_devices == 1) 2713 flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0); 2714 if (num_devices < 4) 2715 flags &= ~BTRFS_BLOCK_GROUP_RAID10; 2716 2717 if ((flags & BTRFS_BLOCK_GROUP_DUP) && 2718 (flags & (BTRFS_BLOCK_GROUP_RAID1 | 2719 BTRFS_BLOCK_GROUP_RAID10))) { 2720 flags &= ~BTRFS_BLOCK_GROUP_DUP; 2721 } 2722 2723 if ((flags & BTRFS_BLOCK_GROUP_RAID1) && 2724 (flags & BTRFS_BLOCK_GROUP_RAID10)) { 2725 flags &= ~BTRFS_BLOCK_GROUP_RAID1; 2726 } 2727 2728 if ((flags & BTRFS_BLOCK_GROUP_RAID0) && 2729 ((flags & BTRFS_BLOCK_GROUP_RAID1) | 2730 (flags & BTRFS_BLOCK_GROUP_RAID10) | 2731 (flags & BTRFS_BLOCK_GROUP_DUP))) 2732 flags &= ~BTRFS_BLOCK_GROUP_RAID0; 2733 return flags; 2734 } 2735 2736 static u64 btrfs_get_alloc_profile(struct btrfs_root *root, u64 data) 2737 { 2738 struct btrfs_fs_info *info = root->fs_info; 2739 u64 alloc_profile; 2740 2741 if (data) { 2742 alloc_profile = info->avail_data_alloc_bits & 2743 info->data_alloc_profile; 2744 data = BTRFS_BLOCK_GROUP_DATA | alloc_profile; 2745 } else if (root == root->fs_info->chunk_root) { 2746 alloc_profile = info->avail_system_alloc_bits & 2747 info->system_alloc_profile; 2748 data = BTRFS_BLOCK_GROUP_SYSTEM | alloc_profile; 2749 } else { 2750 alloc_profile = info->avail_metadata_alloc_bits & 2751 info->metadata_alloc_profile; 2752 data = BTRFS_BLOCK_GROUP_METADATA | alloc_profile; 2753 } 2754 2755 return btrfs_reduce_alloc_profile(root, data); 2756 } 2757 2758 void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode) 2759 { 2760 u64 alloc_target; 2761 2762 alloc_target = btrfs_get_alloc_profile(root, 1); 2763 BTRFS_I(inode)->space_info = __find_space_info(root->fs_info, 2764 alloc_target); 2765 } 2766 2767 static u64 calculate_bytes_needed(struct btrfs_root *root, int num_items) 2768 { 2769 u64 num_bytes; 2770 int level; 2771 2772 level = BTRFS_MAX_LEVEL - 2; 2773 /* 2774 * NOTE: these calculations are absolutely the worst possible case. 2775 * This assumes that _every_ item we insert will require a new leaf, and 2776 * that the tree has grown to its maximum level size. 2777 */ 2778 2779 /* 2780 * for every item we insert we could insert both an extent item and a 2781 * extent ref item. 
Then for ever item we insert, we will need to cow 2782 * both the original leaf, plus the leaf to the left and right of it. 2783 * 2784 * Unless we are talking about the extent root, then we just want the 2785 * number of items * 2, since we just need the extent item plus its ref. 2786 */ 2787 if (root == root->fs_info->extent_root) 2788 num_bytes = num_items * 2; 2789 else 2790 num_bytes = (num_items + (2 * num_items)) * 3; 2791 2792 /* 2793 * num_bytes is total number of leaves we could need times the leaf 2794 * size, and then for every leaf we could end up cow'ing 2 nodes per 2795 * level, down to the leaf level. 2796 */ 2797 num_bytes = (num_bytes * root->leafsize) + 2798 (num_bytes * (level * 2)) * root->nodesize; 2799 2800 return num_bytes; 2801 } 2802 2803 /* 2804 * Unreserve metadata space for delalloc. If we have less reserved credits than 2805 * we have extents, this function does nothing. 2806 */ 2807 int btrfs_unreserve_metadata_for_delalloc(struct btrfs_root *root, 2808 struct inode *inode, int num_items) 2809 { 2810 struct btrfs_fs_info *info = root->fs_info; 2811 struct btrfs_space_info *meta_sinfo; 2812 u64 num_bytes; 2813 u64 alloc_target; 2814 bool bug = false; 2815 2816 /* get the space info for where the metadata will live */ 2817 alloc_target = btrfs_get_alloc_profile(root, 0); 2818 meta_sinfo = __find_space_info(info, alloc_target); 2819 2820 num_bytes = calculate_bytes_needed(root->fs_info->extent_root, 2821 num_items); 2822 2823 spin_lock(&meta_sinfo->lock); 2824 spin_lock(&BTRFS_I(inode)->accounting_lock); 2825 if (BTRFS_I(inode)->reserved_extents <= 2826 BTRFS_I(inode)->outstanding_extents) { 2827 spin_unlock(&BTRFS_I(inode)->accounting_lock); 2828 spin_unlock(&meta_sinfo->lock); 2829 return 0; 2830 } 2831 spin_unlock(&BTRFS_I(inode)->accounting_lock); 2832 2833 BTRFS_I(inode)->reserved_extents--; 2834 BUG_ON(BTRFS_I(inode)->reserved_extents < 0); 2835 2836 if (meta_sinfo->bytes_delalloc < num_bytes) { 2837 bug = true; 2838 meta_sinfo->bytes_delalloc = 0; 2839 } else { 2840 meta_sinfo->bytes_delalloc -= num_bytes; 2841 } 2842 spin_unlock(&meta_sinfo->lock); 2843 2844 BUG_ON(bug); 2845 2846 return 0; 2847 } 2848 2849 static void check_force_delalloc(struct btrfs_space_info *meta_sinfo) 2850 { 2851 u64 thresh; 2852 2853 thresh = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved + 2854 meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly + 2855 meta_sinfo->bytes_super + meta_sinfo->bytes_root + 2856 meta_sinfo->bytes_may_use; 2857 2858 thresh = meta_sinfo->total_bytes - thresh; 2859 thresh *= 80; 2860 do_div(thresh, 100); 2861 if (thresh <= meta_sinfo->bytes_delalloc) 2862 meta_sinfo->force_delalloc = 1; 2863 else 2864 meta_sinfo->force_delalloc = 0; 2865 } 2866 2867 struct async_flush { 2868 struct btrfs_root *root; 2869 struct btrfs_space_info *info; 2870 struct btrfs_work work; 2871 }; 2872 2873 static noinline void flush_delalloc_async(struct btrfs_work *work) 2874 { 2875 struct async_flush *async; 2876 struct btrfs_root *root; 2877 struct btrfs_space_info *info; 2878 2879 async = container_of(work, struct async_flush, work); 2880 root = async->root; 2881 info = async->info; 2882 2883 btrfs_start_delalloc_inodes(root); 2884 wake_up(&info->flush_wait); 2885 btrfs_wait_ordered_extents(root, 0); 2886 2887 spin_lock(&info->lock); 2888 info->flushing = 0; 2889 spin_unlock(&info->lock); 2890 wake_up(&info->flush_wait); 2891 2892 kfree(async); 2893 } 2894 2895 static void wait_on_flush(struct btrfs_space_info *info) 2896 { 2897 DEFINE_WAIT(wait); 2898 u64 used; 2899 
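	/*
	 * sleep until the flusher clears ->flushing, or until overall usage
	 * drops back below the size of this space_info; the waitqueue is
	 * woken from the delalloc flush paths
	 */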
2900 while (1) { 2901 prepare_to_wait(&info->flush_wait, &wait, 2902 TASK_UNINTERRUPTIBLE); 2903 spin_lock(&info->lock); 2904 if (!info->flushing) { 2905 spin_unlock(&info->lock); 2906 break; 2907 } 2908 2909 used = info->bytes_used + info->bytes_reserved + 2910 info->bytes_pinned + info->bytes_readonly + 2911 info->bytes_super + info->bytes_root + 2912 info->bytes_may_use + info->bytes_delalloc; 2913 if (used < info->total_bytes) { 2914 spin_unlock(&info->lock); 2915 break; 2916 } 2917 spin_unlock(&info->lock); 2918 schedule(); 2919 } 2920 finish_wait(&info->flush_wait, &wait); 2921 } 2922 2923 static void flush_delalloc(struct btrfs_root *root, 2924 struct btrfs_space_info *info) 2925 { 2926 struct async_flush *async; 2927 bool wait = false; 2928 2929 spin_lock(&info->lock); 2930 2931 if (!info->flushing) { 2932 info->flushing = 1; 2933 init_waitqueue_head(&info->flush_wait); 2934 } else { 2935 wait = true; 2936 } 2937 2938 spin_unlock(&info->lock); 2939 2940 if (wait) { 2941 wait_on_flush(info); 2942 return; 2943 } 2944 2945 async = kzalloc(sizeof(*async), GFP_NOFS); 2946 if (!async) 2947 goto flush; 2948 2949 async->root = root; 2950 async->info = info; 2951 async->work.func = flush_delalloc_async; 2952 2953 btrfs_queue_worker(&root->fs_info->enospc_workers, 2954 &async->work); 2955 wait_on_flush(info); 2956 return; 2957 2958 flush: 2959 btrfs_start_delalloc_inodes(root); 2960 btrfs_wait_ordered_extents(root, 0); 2961 2962 spin_lock(&info->lock); 2963 info->flushing = 0; 2964 spin_unlock(&info->lock); 2965 wake_up(&info->flush_wait); 2966 } 2967 2968 static int maybe_allocate_chunk(struct btrfs_root *root, 2969 struct btrfs_space_info *info) 2970 { 2971 struct btrfs_super_block *disk_super = &root->fs_info->super_copy; 2972 struct btrfs_trans_handle *trans; 2973 bool wait = false; 2974 int ret = 0; 2975 u64 min_metadata; 2976 u64 free_space; 2977 2978 free_space = btrfs_super_total_bytes(disk_super); 2979 /* 2980 * we allow the metadata to grow to a max of either 10gb or 5% of the 2981 * space in the volume. 2982 */ 2983 min_metadata = min((u64)10 * 1024 * 1024 * 1024, 2984 div64_u64(free_space * 5, 100)); 2985 if (info->total_bytes >= min_metadata) { 2986 spin_unlock(&info->lock); 2987 return 0; 2988 } 2989 2990 if (info->full) { 2991 spin_unlock(&info->lock); 2992 return 0; 2993 } 2994 2995 if (!info->allocating_chunk) { 2996 info->force_alloc = 1; 2997 info->allocating_chunk = 1; 2998 init_waitqueue_head(&info->allocate_wait); 2999 } else { 3000 wait = true; 3001 } 3002 3003 spin_unlock(&info->lock); 3004 3005 if (wait) { 3006 wait_event(info->allocate_wait, 3007 !info->allocating_chunk); 3008 return 1; 3009 } 3010 3011 trans = btrfs_start_transaction(root, 1); 3012 if (!trans) { 3013 ret = -ENOMEM; 3014 goto out; 3015 } 3016 3017 ret = do_chunk_alloc(trans, root->fs_info->extent_root, 3018 4096 + 2 * 1024 * 1024, 3019 info->flags, 0); 3020 btrfs_end_transaction(trans, root); 3021 if (ret) 3022 goto out; 3023 out: 3024 spin_lock(&info->lock); 3025 info->allocating_chunk = 0; 3026 spin_unlock(&info->lock); 3027 wake_up(&info->allocate_wait); 3028 3029 if (ret) 3030 return 0; 3031 return 1; 3032 } 3033 3034 /* 3035 * Reserve metadata space for delalloc. 
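 *
 * the size of the reservation comes from calculate_bytes_needed() using
 * the extent root rules: each item is charged 2 leaves plus 2 * (level * 2)
 * nodes, so with level == BTRFS_MAX_LEVEL - 2 (6) that is 26 metadata
 * blocks per item, roughly 104k with 4k leaves and nodes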
3036 */ 3037 int btrfs_reserve_metadata_for_delalloc(struct btrfs_root *root, 3038 struct inode *inode, int num_items) 3039 { 3040 struct btrfs_fs_info *info = root->fs_info; 3041 struct btrfs_space_info *meta_sinfo; 3042 u64 num_bytes; 3043 u64 used; 3044 u64 alloc_target; 3045 int flushed = 0; 3046 int force_delalloc; 3047 3048 /* get the space info for where the metadata will live */ 3049 alloc_target = btrfs_get_alloc_profile(root, 0); 3050 meta_sinfo = __find_space_info(info, alloc_target); 3051 3052 num_bytes = calculate_bytes_needed(root->fs_info->extent_root, 3053 num_items); 3054 again: 3055 spin_lock(&meta_sinfo->lock); 3056 3057 force_delalloc = meta_sinfo->force_delalloc; 3058 3059 if (unlikely(!meta_sinfo->bytes_root)) 3060 meta_sinfo->bytes_root = calculate_bytes_needed(root, 6); 3061 3062 if (!flushed) 3063 meta_sinfo->bytes_delalloc += num_bytes; 3064 3065 used = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved + 3066 meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly + 3067 meta_sinfo->bytes_super + meta_sinfo->bytes_root + 3068 meta_sinfo->bytes_may_use + meta_sinfo->bytes_delalloc; 3069 3070 if (used > meta_sinfo->total_bytes) { 3071 flushed++; 3072 3073 if (flushed == 1) { 3074 if (maybe_allocate_chunk(root, meta_sinfo)) 3075 goto again; 3076 flushed++; 3077 } else { 3078 spin_unlock(&meta_sinfo->lock); 3079 } 3080 3081 if (flushed == 2) { 3082 filemap_flush(inode->i_mapping); 3083 goto again; 3084 } else if (flushed == 3) { 3085 flush_delalloc(root, meta_sinfo); 3086 goto again; 3087 } 3088 spin_lock(&meta_sinfo->lock); 3089 meta_sinfo->bytes_delalloc -= num_bytes; 3090 spin_unlock(&meta_sinfo->lock); 3091 printk(KERN_ERR "enospc, has %d, reserved %d\n", 3092 BTRFS_I(inode)->outstanding_extents, 3093 BTRFS_I(inode)->reserved_extents); 3094 dump_space_info(meta_sinfo, 0, 0); 3095 return -ENOSPC; 3096 } 3097 3098 BTRFS_I(inode)->reserved_extents++; 3099 check_force_delalloc(meta_sinfo); 3100 spin_unlock(&meta_sinfo->lock); 3101 3102 if (!flushed && force_delalloc) 3103 filemap_flush(inode->i_mapping); 3104 3105 return 0; 3106 } 3107 3108 /* 3109 * unreserve num_items number of items worth of metadata space. This needs to 3110 * be paired with btrfs_reserve_metadata_space. 3111 * 3112 * NOTE: if you have the option, run this _AFTER_ you do a 3113 * btrfs_end_transaction, since btrfs_end_transaction will run delayed ref 3114 * operations which will result in more used metadata, so we want to make sure we 3115 * can do that without issue. 3116 */ 3117 int btrfs_unreserve_metadata_space(struct btrfs_root *root, int num_items) 3118 { 3119 struct btrfs_fs_info *info = root->fs_info; 3120 struct btrfs_space_info *meta_sinfo; 3121 u64 num_bytes; 3122 u64 alloc_target; 3123 bool bug = false; 3124 3125 /* get the space info for where the metadata will live */ 3126 alloc_target = btrfs_get_alloc_profile(root, 0); 3127 meta_sinfo = __find_space_info(info, alloc_target); 3128 3129 num_bytes = calculate_bytes_needed(root, num_items); 3130 3131 spin_lock(&meta_sinfo->lock); 3132 if (meta_sinfo->bytes_may_use < num_bytes) { 3133 bug = true; 3134 meta_sinfo->bytes_may_use = 0; 3135 } else { 3136 meta_sinfo->bytes_may_use -= num_bytes; 3137 } 3138 spin_unlock(&meta_sinfo->lock); 3139 3140 BUG_ON(bug); 3141 3142 return 0; 3143 } 3144 3145 /* 3146 * Reserve some metadata space for use. We'll calculate the worst case number 3147 * of bytes that would be needed to modify num_items number of items. If we 3148 * have space, fantastic, if not, you get -ENOSPC.
Please call 3149 * btrfs_unreserve_metadata_space when you are done for the _SAME_ number of 3150 * items you reserved, since whatever metadata you needed should have already 3151 * been allocated. 3152 * 3153 * This will commit the transaction to make more space if we don't have enough 3154 * metadata space. The only time we don't do this is if we're reserving space 3155 * inside of a transaction, then we will just return -ENOSPC and it is the 3156 * caller's responsibility to handle it properly. 3157 */ 3158 int btrfs_reserve_metadata_space(struct btrfs_root *root, int num_items) 3159 { 3160 struct btrfs_fs_info *info = root->fs_info; 3161 struct btrfs_space_info *meta_sinfo; 3162 u64 num_bytes; 3163 u64 used; 3164 u64 alloc_target; 3165 int retries = 0; 3166 3167 /* get the space info for where the metadata will live */ 3168 alloc_target = btrfs_get_alloc_profile(root, 0); 3169 meta_sinfo = __find_space_info(info, alloc_target); 3170 3171 num_bytes = calculate_bytes_needed(root, num_items); 3172 again: 3173 spin_lock(&meta_sinfo->lock); 3174 3175 if (unlikely(!meta_sinfo->bytes_root)) 3176 meta_sinfo->bytes_root = calculate_bytes_needed(root, 6); 3177 3178 if (!retries) 3179 meta_sinfo->bytes_may_use += num_bytes; 3180 3181 used = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved + 3182 meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly + 3183 meta_sinfo->bytes_super + meta_sinfo->bytes_root + 3184 meta_sinfo->bytes_may_use + meta_sinfo->bytes_delalloc; 3185 3186 if (used > meta_sinfo->total_bytes) { 3187 retries++; 3188 if (retries == 1) { 3189 if (maybe_allocate_chunk(root, meta_sinfo)) 3190 goto again; 3191 retries++; 3192 } else { 3193 spin_unlock(&meta_sinfo->lock); 3194 } 3195 3196 if (retries == 2) { 3197 flush_delalloc(root, meta_sinfo); 3198 goto again; 3199 } 3200 spin_lock(&meta_sinfo->lock); 3201 meta_sinfo->bytes_may_use -= num_bytes; 3202 spin_unlock(&meta_sinfo->lock); 3203 3204 dump_space_info(meta_sinfo, 0, 0); 3205 return -ENOSPC; 3206 } 3207 3208 check_force_delalloc(meta_sinfo); 3209 spin_unlock(&meta_sinfo->lock); 3210 3211 return 0; 3212 } 3213 3214 /* 3215 * This will check the space that the inode allocates from to make sure we have 3216 * enough space for bytes. 3217 */ 3218 int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode, 3219 u64 bytes) 3220 { 3221 struct btrfs_space_info *data_sinfo; 3222 int ret = 0, committed = 0; 3223 3224 /* make sure bytes are sectorsize aligned */ 3225 bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1); 3226 3227 data_sinfo = BTRFS_I(inode)->space_info; 3228 if (!data_sinfo) 3229 goto alloc; 3230 3231 again: 3232 /* make sure we have enough space to handle the data first */ 3233 spin_lock(&data_sinfo->lock); 3234 if (data_sinfo->total_bytes - data_sinfo->bytes_used - 3235 data_sinfo->bytes_delalloc - data_sinfo->bytes_reserved - 3236 data_sinfo->bytes_pinned - data_sinfo->bytes_readonly - 3237 data_sinfo->bytes_may_use - data_sinfo->bytes_super < bytes) { 3238 struct btrfs_trans_handle *trans; 3239 3240 /* 3241 * if we don't have enough free bytes in this space then we need 3242 * to alloc a new chunk.
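 * force_alloc is set first so that do_chunk_alloc() won't skip the
 * allocation just because the usage threshold hasn't been crossed yet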
3243 */ 3244 if (!data_sinfo->full) { 3245 u64 alloc_target; 3246 3247 data_sinfo->force_alloc = 1; 3248 spin_unlock(&data_sinfo->lock); 3249 alloc: 3250 alloc_target = btrfs_get_alloc_profile(root, 1); 3251 trans = btrfs_start_transaction(root, 1); 3252 if (!trans) 3253 return -ENOMEM; 3254 3255 ret = do_chunk_alloc(trans, root->fs_info->extent_root, 3256 bytes + 2 * 1024 * 1024, 3257 alloc_target, 0); 3258 btrfs_end_transaction(trans, root); 3259 if (ret) 3260 return ret; 3261 3262 if (!data_sinfo) { 3263 btrfs_set_inode_space_info(root, inode); 3264 data_sinfo = BTRFS_I(inode)->space_info; 3265 } 3266 goto again; 3267 } 3268 spin_unlock(&data_sinfo->lock); 3269 3270 /* commit the current transaction and try again */ 3271 if (!committed && !root->fs_info->open_ioctl_trans) { 3272 committed = 1; 3273 trans = btrfs_join_transaction(root, 1); 3274 if (!trans) 3275 return -ENOMEM; 3276 ret = btrfs_commit_transaction(trans, root); 3277 if (ret) 3278 return ret; 3279 goto again; 3280 } 3281 3282 printk(KERN_ERR "no space left, need %llu, %llu delalloc bytes" 3283 ", %llu bytes_used, %llu bytes_reserved, " 3284 "%llu bytes_pinned, %llu bytes_readonly, %llu may use " 3285 "%llu total\n", (unsigned long long)bytes, 3286 (unsigned long long)data_sinfo->bytes_delalloc, 3287 (unsigned long long)data_sinfo->bytes_used, 3288 (unsigned long long)data_sinfo->bytes_reserved, 3289 (unsigned long long)data_sinfo->bytes_pinned, 3290 (unsigned long long)data_sinfo->bytes_readonly, 3291 (unsigned long long)data_sinfo->bytes_may_use, 3292 (unsigned long long)data_sinfo->total_bytes); 3293 return -ENOSPC; 3294 } 3295 data_sinfo->bytes_may_use += bytes; 3296 BTRFS_I(inode)->reserved_bytes += bytes; 3297 spin_unlock(&data_sinfo->lock); 3298 3299 return 0; 3300 } 3301 3302 /* 3303 * if there was an error for whatever reason after calling 3304 * btrfs_check_data_free_space, call this so we can cleanup the counters. 3305 */ 3306 void btrfs_free_reserved_data_space(struct btrfs_root *root, 3307 struct inode *inode, u64 bytes) 3308 { 3309 struct btrfs_space_info *data_sinfo; 3310 3311 /* make sure bytes are sectorsize aligned */ 3312 bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1); 3313 3314 data_sinfo = BTRFS_I(inode)->space_info; 3315 spin_lock(&data_sinfo->lock); 3316 data_sinfo->bytes_may_use -= bytes; 3317 BTRFS_I(inode)->reserved_bytes -= bytes; 3318 spin_unlock(&data_sinfo->lock); 3319 } 3320 3321 /* called when we are adding a delalloc extent to the inode's io_tree */ 3322 void btrfs_delalloc_reserve_space(struct btrfs_root *root, struct inode *inode, 3323 u64 bytes) 3324 { 3325 struct btrfs_space_info *data_sinfo; 3326 3327 /* get the space info for where this inode will be storing its data */ 3328 data_sinfo = BTRFS_I(inode)->space_info; 3329 3330 /* make sure we have enough space to handle the data first */ 3331 spin_lock(&data_sinfo->lock); 3332 data_sinfo->bytes_delalloc += bytes; 3333 3334 /* 3335 * we are adding a delalloc extent without calling 3336 * btrfs_check_data_free_space first. 
This happens on a weird 3337 * writepage condition, but shouldn't hurt our accounting 3338 */ 3339 if (unlikely(bytes > BTRFS_I(inode)->reserved_bytes)) { 3340 data_sinfo->bytes_may_use -= BTRFS_I(inode)->reserved_bytes; 3341 BTRFS_I(inode)->reserved_bytes = 0; 3342 } else { 3343 data_sinfo->bytes_may_use -= bytes; 3344 BTRFS_I(inode)->reserved_bytes -= bytes; 3345 } 3346 3347 spin_unlock(&data_sinfo->lock); 3348 } 3349 3350 /* called when we are clearing an delalloc extent from the inode's io_tree */ 3351 void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode, 3352 u64 bytes) 3353 { 3354 struct btrfs_space_info *info; 3355 3356 info = BTRFS_I(inode)->space_info; 3357 3358 spin_lock(&info->lock); 3359 info->bytes_delalloc -= bytes; 3360 spin_unlock(&info->lock); 3361 } 3362 3363 static void force_metadata_allocation(struct btrfs_fs_info *info) 3364 { 3365 struct list_head *head = &info->space_info; 3366 struct btrfs_space_info *found; 3367 3368 rcu_read_lock(); 3369 list_for_each_entry_rcu(found, head, list) { 3370 if (found->flags & BTRFS_BLOCK_GROUP_METADATA) 3371 found->force_alloc = 1; 3372 } 3373 rcu_read_unlock(); 3374 } 3375 3376 static int do_chunk_alloc(struct btrfs_trans_handle *trans, 3377 struct btrfs_root *extent_root, u64 alloc_bytes, 3378 u64 flags, int force) 3379 { 3380 struct btrfs_space_info *space_info; 3381 struct btrfs_fs_info *fs_info = extent_root->fs_info; 3382 u64 thresh; 3383 int ret = 0; 3384 3385 mutex_lock(&fs_info->chunk_mutex); 3386 3387 flags = btrfs_reduce_alloc_profile(extent_root, flags); 3388 3389 space_info = __find_space_info(extent_root->fs_info, flags); 3390 if (!space_info) { 3391 ret = update_space_info(extent_root->fs_info, flags, 3392 0, 0, &space_info); 3393 BUG_ON(ret); 3394 } 3395 BUG_ON(!space_info); 3396 3397 spin_lock(&space_info->lock); 3398 if (space_info->force_alloc) 3399 force = 1; 3400 if (space_info->full) { 3401 spin_unlock(&space_info->lock); 3402 goto out; 3403 } 3404 3405 thresh = space_info->total_bytes - space_info->bytes_readonly; 3406 thresh = div_factor(thresh, 8); 3407 if (!force && 3408 (space_info->bytes_used + space_info->bytes_pinned + 3409 space_info->bytes_reserved + alloc_bytes) < thresh) { 3410 spin_unlock(&space_info->lock); 3411 goto out; 3412 } 3413 spin_unlock(&space_info->lock); 3414 3415 /* 3416 * if we're doing a data chunk, go ahead and make sure that 3417 * we keep a reasonable number of metadata chunks allocated in the 3418 * FS as well. 
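 * (metadata_ratio is the -o metadata_ratio=N mount option: every Nth
 * data chunk allocation flags the metadata space infos with force_alloc
 * so a metadata chunk gets allocated soon after)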
3419 */ 3420 if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) { 3421 fs_info->data_chunk_allocations++; 3422 if (!(fs_info->data_chunk_allocations % 3423 fs_info->metadata_ratio)) 3424 force_metadata_allocation(fs_info); 3425 } 3426 3427 ret = btrfs_alloc_chunk(trans, extent_root, flags); 3428 spin_lock(&space_info->lock); 3429 if (ret) 3430 space_info->full = 1; 3431 space_info->force_alloc = 0; 3432 spin_unlock(&space_info->lock); 3433 out: 3434 mutex_unlock(&extent_root->fs_info->chunk_mutex); 3435 return ret; 3436 } 3437 3438 static int update_block_group(struct btrfs_trans_handle *trans, 3439 struct btrfs_root *root, 3440 u64 bytenr, u64 num_bytes, int alloc, 3441 int mark_free) 3442 { 3443 struct btrfs_block_group_cache *cache; 3444 struct btrfs_fs_info *info = root->fs_info; 3445 u64 total = num_bytes; 3446 u64 old_val; 3447 u64 byte_in_group; 3448 3449 /* block accounting for super block */ 3450 spin_lock(&info->delalloc_lock); 3451 old_val = btrfs_super_bytes_used(&info->super_copy); 3452 if (alloc) 3453 old_val += num_bytes; 3454 else 3455 old_val -= num_bytes; 3456 btrfs_set_super_bytes_used(&info->super_copy, old_val); 3457 3458 /* block accounting for root item */ 3459 old_val = btrfs_root_used(&root->root_item); 3460 if (alloc) 3461 old_val += num_bytes; 3462 else 3463 old_val -= num_bytes; 3464 btrfs_set_root_used(&root->root_item, old_val); 3465 spin_unlock(&info->delalloc_lock); 3466 3467 while (total) { 3468 cache = btrfs_lookup_block_group(info, bytenr); 3469 if (!cache) 3470 return -1; 3471 byte_in_group = bytenr - cache->key.objectid; 3472 WARN_ON(byte_in_group > cache->key.offset); 3473 3474 spin_lock(&cache->space_info->lock); 3475 spin_lock(&cache->lock); 3476 cache->dirty = 1; 3477 old_val = btrfs_block_group_used(&cache->item); 3478 num_bytes = min(total, cache->key.offset - byte_in_group); 3479 if (alloc) { 3480 old_val += num_bytes; 3481 btrfs_set_block_group_used(&cache->item, old_val); 3482 cache->reserved -= num_bytes; 3483 cache->space_info->bytes_used += num_bytes; 3484 cache->space_info->bytes_reserved -= num_bytes; 3485 if (cache->ro) 3486 cache->space_info->bytes_readonly -= num_bytes; 3487 spin_unlock(&cache->lock); 3488 spin_unlock(&cache->space_info->lock); 3489 } else { 3490 old_val -= num_bytes; 3491 cache->space_info->bytes_used -= num_bytes; 3492 if (cache->ro) 3493 cache->space_info->bytes_readonly += num_bytes; 3494 btrfs_set_block_group_used(&cache->item, old_val); 3495 spin_unlock(&cache->lock); 3496 spin_unlock(&cache->space_info->lock); 3497 if (mark_free) { 3498 int ret; 3499 3500 ret = btrfs_discard_extent(root, bytenr, 3501 num_bytes); 3502 WARN_ON(ret); 3503 3504 ret = btrfs_add_free_space(cache, bytenr, 3505 num_bytes); 3506 WARN_ON(ret); 3507 } 3508 } 3509 btrfs_put_block_group(cache); 3510 total -= num_bytes; 3511 bytenr += num_bytes; 3512 } 3513 return 0; 3514 } 3515 3516 static u64 first_logical_byte(struct btrfs_root *root, u64 search_start) 3517 { 3518 struct btrfs_block_group_cache *cache; 3519 u64 bytenr; 3520 3521 cache = btrfs_lookup_first_block_group(root->fs_info, search_start); 3522 if (!cache) 3523 return 0; 3524 3525 bytenr = cache->key.objectid; 3526 btrfs_put_block_group(cache); 3527 3528 return bytenr; 3529 } 3530 3531 /* 3532 * this function must be called within transaction 3533 */ 3534 int btrfs_pin_extent(struct btrfs_root *root, 3535 u64 bytenr, u64 num_bytes, int reserved) 3536 { 3537 struct btrfs_fs_info *fs_info = root->fs_info; 3538 struct btrfs_block_group_cache *cache; 3539 3540 cache = 
btrfs_lookup_block_group(fs_info, bytenr); 3541 BUG_ON(!cache); 3542 3543 spin_lock(&cache->space_info->lock); 3544 spin_lock(&cache->lock); 3545 cache->pinned += num_bytes; 3546 cache->space_info->bytes_pinned += num_bytes; 3547 if (reserved) { 3548 cache->reserved -= num_bytes; 3549 cache->space_info->bytes_reserved -= num_bytes; 3550 } 3551 spin_unlock(&cache->lock); 3552 spin_unlock(&cache->space_info->lock); 3553 3554 btrfs_put_block_group(cache); 3555 3556 set_extent_dirty(fs_info->pinned_extents, 3557 bytenr, bytenr + num_bytes - 1, GFP_NOFS); 3558 return 0; 3559 } 3560 3561 static int update_reserved_extents(struct btrfs_block_group_cache *cache, 3562 u64 num_bytes, int reserve) 3563 { 3564 spin_lock(&cache->space_info->lock); 3565 spin_lock(&cache->lock); 3566 if (reserve) { 3567 cache->reserved += num_bytes; 3568 cache->space_info->bytes_reserved += num_bytes; 3569 } else { 3570 cache->reserved -= num_bytes; 3571 cache->space_info->bytes_reserved -= num_bytes; 3572 } 3573 spin_unlock(&cache->lock); 3574 spin_unlock(&cache->space_info->lock); 3575 return 0; 3576 } 3577 3578 int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, 3579 struct btrfs_root *root) 3580 { 3581 struct btrfs_fs_info *fs_info = root->fs_info; 3582 struct btrfs_caching_control *next; 3583 struct btrfs_caching_control *caching_ctl; 3584 struct btrfs_block_group_cache *cache; 3585 3586 down_write(&fs_info->extent_commit_sem); 3587 3588 list_for_each_entry_safe(caching_ctl, next, 3589 &fs_info->caching_block_groups, list) { 3590 cache = caching_ctl->block_group; 3591 if (block_group_cache_done(cache)) { 3592 cache->last_byte_to_unpin = (u64)-1; 3593 list_del_init(&caching_ctl->list); 3594 put_caching_control(caching_ctl); 3595 } else { 3596 cache->last_byte_to_unpin = caching_ctl->progress; 3597 } 3598 } 3599 3600 if (fs_info->pinned_extents == &fs_info->freed_extents[0]) 3601 fs_info->pinned_extents = &fs_info->freed_extents[1]; 3602 else 3603 fs_info->pinned_extents = &fs_info->freed_extents[0]; 3604 3605 up_write(&fs_info->extent_commit_sem); 3606 return 0; 3607 } 3608 3609 static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end) 3610 { 3611 struct btrfs_fs_info *fs_info = root->fs_info; 3612 struct btrfs_block_group_cache *cache = NULL; 3613 u64 len; 3614 3615 while (start <= end) { 3616 if (!cache || 3617 start >= cache->key.objectid + cache->key.offset) { 3618 if (cache) 3619 btrfs_put_block_group(cache); 3620 cache = btrfs_lookup_block_group(fs_info, start); 3621 BUG_ON(!cache); 3622 } 3623 3624 len = cache->key.objectid + cache->key.offset - start; 3625 len = min(len, end + 1 - start); 3626 3627 if (start < cache->last_byte_to_unpin) { 3628 len = min(len, cache->last_byte_to_unpin - start); 3629 btrfs_add_free_space(cache, start, len); 3630 } 3631 3632 spin_lock(&cache->space_info->lock); 3633 spin_lock(&cache->lock); 3634 cache->pinned -= len; 3635 cache->space_info->bytes_pinned -= len; 3636 spin_unlock(&cache->lock); 3637 spin_unlock(&cache->space_info->lock); 3638 3639 start += len; 3640 } 3641 3642 if (cache) 3643 btrfs_put_block_group(cache); 3644 return 0; 3645 } 3646 3647 int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, 3648 struct btrfs_root *root) 3649 { 3650 struct btrfs_fs_info *fs_info = root->fs_info; 3651 struct extent_io_tree *unpin; 3652 u64 start; 3653 u64 end; 3654 int ret; 3655 3656 if (fs_info->pinned_extents == &fs_info->freed_extents[0]) 3657 unpin = &fs_info->freed_extents[1]; 3658 else 3659 unpin = &fs_info->freed_extents[0]; 3660 3661 
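	/*
	 * walk every range that was marked EXTENT_DIRTY while it was pinned
	 * during this transaction, discard it, clear the bits and hand the
	 * space back to the free space cache via unpin_extent_range()
	 */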
while (1) { 3662 ret = find_first_extent_bit(unpin, 0, &start, &end, 3663 EXTENT_DIRTY); 3664 if (ret) 3665 break; 3666 3667 ret = btrfs_discard_extent(root, start, end + 1 - start); 3668 3669 clear_extent_dirty(unpin, start, end, GFP_NOFS); 3670 unpin_extent_range(root, start, end); 3671 cond_resched(); 3672 } 3673 3674 return ret; 3675 } 3676 3677 static int pin_down_bytes(struct btrfs_trans_handle *trans, 3678 struct btrfs_root *root, 3679 struct btrfs_path *path, 3680 u64 bytenr, u64 num_bytes, 3681 int is_data, int reserved, 3682 struct extent_buffer **must_clean) 3683 { 3684 int err = 0; 3685 struct extent_buffer *buf; 3686 3687 if (is_data) 3688 goto pinit; 3689 3690 /* 3691 * discard is sloooow, and so triggering discards on 3692 * individual btree blocks isn't a good plan. Just 3693 * pin everything in discard mode. 3694 */ 3695 if (btrfs_test_opt(root, DISCARD)) 3696 goto pinit; 3697 3698 buf = btrfs_find_tree_block(root, bytenr, num_bytes); 3699 if (!buf) 3700 goto pinit; 3701 3702 /* we can reuse a block if it hasn't been written 3703 * and it is from this transaction. We can't 3704 * reuse anything from the tree log root because 3705 * it has tiny sub-transactions. 3706 */ 3707 if (btrfs_buffer_uptodate(buf, 0) && 3708 btrfs_try_tree_lock(buf)) { 3709 u64 header_owner = btrfs_header_owner(buf); 3710 u64 header_transid = btrfs_header_generation(buf); 3711 if (header_owner != BTRFS_TREE_LOG_OBJECTID && 3712 header_transid == trans->transid && 3713 !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { 3714 *must_clean = buf; 3715 return 1; 3716 } 3717 btrfs_tree_unlock(buf); 3718 } 3719 free_extent_buffer(buf); 3720 pinit: 3721 if (path) 3722 btrfs_set_path_blocking(path); 3723 /* unlocks the pinned mutex */ 3724 btrfs_pin_extent(root, bytenr, num_bytes, reserved); 3725 3726 BUG_ON(err < 0); 3727 return 0; 3728 } 3729 3730 static int __btrfs_free_extent(struct btrfs_trans_handle *trans, 3731 struct btrfs_root *root, 3732 u64 bytenr, u64 num_bytes, u64 parent, 3733 u64 root_objectid, u64 owner_objectid, 3734 u64 owner_offset, int refs_to_drop, 3735 struct btrfs_delayed_extent_op *extent_op) 3736 { 3737 struct btrfs_key key; 3738 struct btrfs_path *path; 3739 struct btrfs_fs_info *info = root->fs_info; 3740 struct btrfs_root *extent_root = info->extent_root; 3741 struct extent_buffer *leaf; 3742 struct btrfs_extent_item *ei; 3743 struct btrfs_extent_inline_ref *iref; 3744 int ret; 3745 int is_data; 3746 int extent_slot = 0; 3747 int found_extent = 0; 3748 int num_to_del = 1; 3749 u32 item_size; 3750 u64 refs; 3751 3752 path = btrfs_alloc_path(); 3753 if (!path) 3754 return -ENOMEM; 3755 3756 path->reada = 1; 3757 path->leave_spinning = 1; 3758 3759 is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID; 3760 BUG_ON(!is_data && refs_to_drop != 1); 3761 3762 ret = lookup_extent_backref(trans, extent_root, path, &iref, 3763 bytenr, num_bytes, parent, 3764 root_objectid, owner_objectid, 3765 owner_offset); 3766 if (ret == 0) { 3767 extent_slot = path->slots[0]; 3768 while (extent_slot >= 0) { 3769 btrfs_item_key_to_cpu(path->nodes[0], &key, 3770 extent_slot); 3771 if (key.objectid != bytenr) 3772 break; 3773 if (key.type == BTRFS_EXTENT_ITEM_KEY && 3774 key.offset == num_bytes) { 3775 found_extent = 1; 3776 break; 3777 } 3778 if (path->slots[0] - extent_slot > 5) 3779 break; 3780 extent_slot--; 3781 } 3782 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 3783 item_size = btrfs_item_size_nr(path->nodes[0], extent_slot); 3784 if (found_extent && item_size < sizeof(*ei)) 3785 found_extent = 0; 3786 
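		/*
		 * an old format (v0) extent item can't be updated in place
		 * here, so treat it as not found and let the path below
		 * re-search for (and convert) the item before changing it
		 */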
#endif 3787 if (!found_extent) { 3788 BUG_ON(iref); 3789 ret = remove_extent_backref(trans, extent_root, path, 3790 NULL, refs_to_drop, 3791 is_data); 3792 BUG_ON(ret); 3793 btrfs_release_path(extent_root, path); 3794 path->leave_spinning = 1; 3795 3796 key.objectid = bytenr; 3797 key.type = BTRFS_EXTENT_ITEM_KEY; 3798 key.offset = num_bytes; 3799 3800 ret = btrfs_search_slot(trans, extent_root, 3801 &key, path, -1, 1); 3802 if (ret) { 3803 printk(KERN_ERR "umm, got %d back from search" 3804 ", was looking for %llu\n", ret, 3805 (unsigned long long)bytenr); 3806 btrfs_print_leaf(extent_root, path->nodes[0]); 3807 } 3808 BUG_ON(ret); 3809 extent_slot = path->slots[0]; 3810 } 3811 } else { 3812 btrfs_print_leaf(extent_root, path->nodes[0]); 3813 WARN_ON(1); 3814 printk(KERN_ERR "btrfs unable to find ref byte nr %llu " 3815 "parent %llu root %llu owner %llu offset %llu\n", 3816 (unsigned long long)bytenr, 3817 (unsigned long long)parent, 3818 (unsigned long long)root_objectid, 3819 (unsigned long long)owner_objectid, 3820 (unsigned long long)owner_offset); 3821 } 3822 3823 leaf = path->nodes[0]; 3824 item_size = btrfs_item_size_nr(leaf, extent_slot); 3825 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 3826 if (item_size < sizeof(*ei)) { 3827 BUG_ON(found_extent || extent_slot != path->slots[0]); 3828 ret = convert_extent_item_v0(trans, extent_root, path, 3829 owner_objectid, 0); 3830 BUG_ON(ret < 0); 3831 3832 btrfs_release_path(extent_root, path); 3833 path->leave_spinning = 1; 3834 3835 key.objectid = bytenr; 3836 key.type = BTRFS_EXTENT_ITEM_KEY; 3837 key.offset = num_bytes; 3838 3839 ret = btrfs_search_slot(trans, extent_root, &key, path, 3840 -1, 1); 3841 if (ret) { 3842 printk(KERN_ERR "umm, got %d back from search" 3843 ", was looking for %llu\n", ret, 3844 (unsigned long long)bytenr); 3845 btrfs_print_leaf(extent_root, path->nodes[0]); 3846 } 3847 BUG_ON(ret); 3848 extent_slot = path->slots[0]; 3849 leaf = path->nodes[0]; 3850 item_size = btrfs_item_size_nr(leaf, extent_slot); 3851 } 3852 #endif 3853 BUG_ON(item_size < sizeof(*ei)); 3854 ei = btrfs_item_ptr(leaf, extent_slot, 3855 struct btrfs_extent_item); 3856 if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID) { 3857 struct btrfs_tree_block_info *bi; 3858 BUG_ON(item_size < sizeof(*ei) + sizeof(*bi)); 3859 bi = (struct btrfs_tree_block_info *)(ei + 1); 3860 WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi)); 3861 } 3862 3863 refs = btrfs_extent_refs(leaf, ei); 3864 BUG_ON(refs < refs_to_drop); 3865 refs -= refs_to_drop; 3866 3867 if (refs > 0) { 3868 if (extent_op) 3869 __run_delayed_extent_op(extent_op, leaf, ei); 3870 /* 3871 * In the case of inline back ref, reference count will 3872 * be updated by remove_extent_backref 3873 */ 3874 if (iref) { 3875 BUG_ON(!found_extent); 3876 } else { 3877 btrfs_set_extent_refs(leaf, ei, refs); 3878 btrfs_mark_buffer_dirty(leaf); 3879 } 3880 if (found_extent) { 3881 ret = remove_extent_backref(trans, extent_root, path, 3882 iref, refs_to_drop, 3883 is_data); 3884 BUG_ON(ret); 3885 } 3886 } else { 3887 int mark_free = 0; 3888 struct extent_buffer *must_clean = NULL; 3889 3890 if (found_extent) { 3891 BUG_ON(is_data && refs_to_drop != 3892 extent_data_ref_count(root, path, iref)); 3893 if (iref) { 3894 BUG_ON(path->slots[0] != extent_slot); 3895 } else { 3896 BUG_ON(path->slots[0] != extent_slot + 1); 3897 path->slots[0] = extent_slot; 3898 num_to_del = 2; 3899 } 3900 } 3901 3902 ret = pin_down_bytes(trans, root, path, bytenr, 3903 num_bytes, is_data, 0, &must_clean); 3904 if (ret > 0) 3905 mark_free = 
1; 3906 BUG_ON(ret < 0); 3907 /* 3908 * it is going to be very rare for someone to be waiting 3909 * on the block we're freeing. del_items might need to 3910 * schedule, so rather than get fancy, just force it 3911 * to blocking here 3912 */ 3913 if (must_clean) 3914 btrfs_set_lock_blocking(must_clean); 3915 3916 ret = btrfs_del_items(trans, extent_root, path, path->slots[0], 3917 num_to_del); 3918 BUG_ON(ret); 3919 btrfs_release_path(extent_root, path); 3920 3921 if (must_clean) { 3922 clean_tree_block(NULL, root, must_clean); 3923 btrfs_tree_unlock(must_clean); 3924 free_extent_buffer(must_clean); 3925 } 3926 3927 if (is_data) { 3928 ret = btrfs_del_csums(trans, root, bytenr, num_bytes); 3929 BUG_ON(ret); 3930 } else { 3931 invalidate_mapping_pages(info->btree_inode->i_mapping, 3932 bytenr >> PAGE_CACHE_SHIFT, 3933 (bytenr + num_bytes - 1) >> PAGE_CACHE_SHIFT); 3934 } 3935 3936 ret = update_block_group(trans, root, bytenr, num_bytes, 0, 3937 mark_free); 3938 BUG_ON(ret); 3939 } 3940 btrfs_free_path(path); 3941 return ret; 3942 } 3943 3944 /* 3945 * when we free an extent, it is possible (and likely) that we free the last 3946 * delayed ref for that extent as well. This searches the delayed ref tree for 3947 * a given extent, and if there are no other delayed refs to be processed, it 3948 * removes it from the tree. 3949 */ 3950 static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, 3951 struct btrfs_root *root, u64 bytenr) 3952 { 3953 struct btrfs_delayed_ref_head *head; 3954 struct btrfs_delayed_ref_root *delayed_refs; 3955 struct btrfs_delayed_ref_node *ref; 3956 struct rb_node *node; 3957 int ret; 3958 3959 delayed_refs = &trans->transaction->delayed_refs; 3960 spin_lock(&delayed_refs->lock); 3961 head = btrfs_find_delayed_ref_head(trans, bytenr); 3962 if (!head) 3963 goto out; 3964 3965 node = rb_prev(&head->node.rb_node); 3966 if (!node) 3967 goto out; 3968 3969 ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); 3970 3971 /* there are still entries for this ref, we can't drop it */ 3972 if (ref->bytenr == bytenr) 3973 goto out; 3974 3975 if (head->extent_op) { 3976 if (!head->must_insert_reserved) 3977 goto out; 3978 kfree(head->extent_op); 3979 head->extent_op = NULL; 3980 } 3981 3982 /* 3983 * waiting for the lock here would deadlock. If someone else has it 3984 * locked they are already in the process of dropping it anyway 3985 */ 3986 if (!mutex_trylock(&head->mutex)) 3987 goto out; 3988 3989 /* 3990 * at this point we have a head with no other entries. Go 3991 * ahead and process it. 3992 */ 3993 head->node.in_tree = 0; 3994 rb_erase(&head->node.rb_node, &delayed_refs->root); 3995 3996 delayed_refs->num_entries--; 3997 3998 /* 3999 * we don't take a ref on the node because we're removing it from the 4000 * tree, so we just steal the ref the tree was holding. 
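 * That stolen reference is what the btrfs_put_delayed_ref() call
 * below releases, once run_one_delayed_ref() has processed the head.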
4001 */ 4002 delayed_refs->num_heads--; 4003 if (list_empty(&head->cluster)) 4004 delayed_refs->num_heads_ready--; 4005 4006 list_del_init(&head->cluster); 4007 spin_unlock(&delayed_refs->lock); 4008 4009 ret = run_one_delayed_ref(trans, root->fs_info->tree_root, 4010 &head->node, head->extent_op, 4011 head->must_insert_reserved); 4012 BUG_ON(ret); 4013 btrfs_put_delayed_ref(&head->node); 4014 return 0; 4015 out: 4016 spin_unlock(&delayed_refs->lock); 4017 return 0; 4018 } 4019 4020 int btrfs_free_extent(struct btrfs_trans_handle *trans, 4021 struct btrfs_root *root, 4022 u64 bytenr, u64 num_bytes, u64 parent, 4023 u64 root_objectid, u64 owner, u64 offset) 4024 { 4025 int ret; 4026 4027 /* 4028 * tree log blocks never actually go into the extent allocation 4029 * tree, just update pinning info and exit early. 4030 */ 4031 if (root_objectid == BTRFS_TREE_LOG_OBJECTID) { 4032 WARN_ON(owner >= BTRFS_FIRST_FREE_OBJECTID); 4033 /* unlocks the pinned mutex */ 4034 btrfs_pin_extent(root, bytenr, num_bytes, 1); 4035 ret = 0; 4036 } else if (owner < BTRFS_FIRST_FREE_OBJECTID) { 4037 ret = btrfs_add_delayed_tree_ref(trans, bytenr, num_bytes, 4038 parent, root_objectid, (int)owner, 4039 BTRFS_DROP_DELAYED_REF, NULL); 4040 BUG_ON(ret); 4041 ret = check_ref_cleanup(trans, root, bytenr); 4042 BUG_ON(ret); 4043 } else { 4044 ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes, 4045 parent, root_objectid, owner, 4046 offset, BTRFS_DROP_DELAYED_REF, NULL); 4047 BUG_ON(ret); 4048 } 4049 return ret; 4050 } 4051 4052 static u64 stripe_align(struct btrfs_root *root, u64 val) 4053 { 4054 u64 mask = ((u64)root->stripesize - 1); 4055 u64 ret = (val + mask) & ~mask; 4056 return ret; 4057 } 4058 4059 /* 4060 * when we wait for progress in the block group caching, its because 4061 * our allocation attempt failed at least once. So, we must sleep 4062 * and let some progress happen before we try again. 4063 * 4064 * This function will sleep at least once waiting for new free space to 4065 * show up, and then it will check the block group free space numbers 4066 * for our min num_bytes. Another option is to have it go ahead 4067 * and look in the rbtree for a free extent of a given size, but this 4068 * is a good start. 4069 */ 4070 static noinline int 4071 wait_block_group_cache_progress(struct btrfs_block_group_cache *cache, 4072 u64 num_bytes) 4073 { 4074 struct btrfs_caching_control *caching_ctl; 4075 DEFINE_WAIT(wait); 4076 4077 caching_ctl = get_caching_control(cache); 4078 if (!caching_ctl) 4079 return 0; 4080 4081 wait_event(caching_ctl->wait, block_group_cache_done(cache) || 4082 (cache->free_space >= num_bytes)); 4083 4084 put_caching_control(caching_ctl); 4085 return 0; 4086 } 4087 4088 static noinline int 4089 wait_block_group_cache_done(struct btrfs_block_group_cache *cache) 4090 { 4091 struct btrfs_caching_control *caching_ctl; 4092 DEFINE_WAIT(wait); 4093 4094 caching_ctl = get_caching_control(cache); 4095 if (!caching_ctl) 4096 return 0; 4097 4098 wait_event(caching_ctl->wait, block_group_cache_done(cache)); 4099 4100 put_caching_control(caching_ctl); 4101 return 0; 4102 } 4103 4104 enum btrfs_loop_type { 4105 LOOP_FIND_IDEAL = 0, 4106 LOOP_CACHING_NOWAIT = 1, 4107 LOOP_CACHING_WAIT = 2, 4108 LOOP_ALLOC_CHUNK = 3, 4109 LOOP_NO_EMPTY_SIZE = 4, 4110 }; 4111 4112 /* 4113 * walks the btree of allocated extents and find a hole of a given size. 
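 * (the free space itself is found through the per-block-group free
 * space cache and the cluster allocator rather than by scanning
 * extent items one by one)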
4114 * The key ins is changed to record the hole: 4115 * ins->objectid == block start 4116 * ins->flags = BTRFS_EXTENT_ITEM_KEY 4117 * ins->offset == number of blocks 4118 * Any available blocks before search_start are skipped. 4119 */ 4120 static noinline int find_free_extent(struct btrfs_trans_handle *trans, 4121 struct btrfs_root *orig_root, 4122 u64 num_bytes, u64 empty_size, 4123 u64 search_start, u64 search_end, 4124 u64 hint_byte, struct btrfs_key *ins, 4125 u64 exclude_start, u64 exclude_nr, 4126 int data) 4127 { 4128 int ret = 0; 4129 struct btrfs_root *root = orig_root->fs_info->extent_root; 4130 struct btrfs_free_cluster *last_ptr = NULL; 4131 struct btrfs_block_group_cache *block_group = NULL; 4132 int empty_cluster = 2 * 1024 * 1024; 4133 int allowed_chunk_alloc = 0; 4134 int done_chunk_alloc = 0; 4135 struct btrfs_space_info *space_info; 4136 int last_ptr_loop = 0; 4137 int loop = 0; 4138 bool found_uncached_bg = false; 4139 bool failed_cluster_refill = false; 4140 bool failed_alloc = false; 4141 u64 ideal_cache_percent = 0; 4142 u64 ideal_cache_offset = 0; 4143 4144 WARN_ON(num_bytes < root->sectorsize); 4145 btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY); 4146 ins->objectid = 0; 4147 ins->offset = 0; 4148 4149 space_info = __find_space_info(root->fs_info, data); 4150 4151 if (orig_root->ref_cows || empty_size) 4152 allowed_chunk_alloc = 1; 4153 4154 if (data & BTRFS_BLOCK_GROUP_METADATA) { 4155 last_ptr = &root->fs_info->meta_alloc_cluster; 4156 if (!btrfs_test_opt(root, SSD)) 4157 empty_cluster = 64 * 1024; 4158 } 4159 4160 if ((data & BTRFS_BLOCK_GROUP_DATA) && btrfs_test_opt(root, SSD)) { 4161 last_ptr = &root->fs_info->data_alloc_cluster; 4162 } 4163 4164 if (last_ptr) { 4165 spin_lock(&last_ptr->lock); 4166 if (last_ptr->block_group) 4167 hint_byte = last_ptr->window_start; 4168 spin_unlock(&last_ptr->lock); 4169 } 4170 4171 search_start = max(search_start, first_logical_byte(root, 0)); 4172 search_start = max(search_start, hint_byte); 4173 4174 if (!last_ptr) 4175 empty_cluster = 0; 4176 4177 if (search_start == hint_byte) { 4178 ideal_cache: 4179 block_group = btrfs_lookup_block_group(root->fs_info, 4180 search_start); 4181 /* 4182 * we don't want to use the block group if it doesn't match our 4183 * allocation bits, or if its not cached. 4184 * 4185 * However if we are re-searching with an ideal block group 4186 * picked out then we don't care that the block group is cached. 
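 * In that re-search case search_start was set to ideal_cache_offset
 * right before jumping back to the ideal_cache label, which is what
 * the check below relies on.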
4187 */ 4188 if (block_group && block_group_bits(block_group, data) && 4189 (block_group->cached != BTRFS_CACHE_NO || 4190 search_start == ideal_cache_offset)) { 4191 down_read(&space_info->groups_sem); 4192 if (list_empty(&block_group->list) || 4193 block_group->ro) { 4194 /* 4195 * someone is removing this block group, 4196 * we can't jump into the have_block_group 4197 * target because our list pointers are not 4198 * valid 4199 */ 4200 btrfs_put_block_group(block_group); 4201 up_read(&space_info->groups_sem); 4202 } else { 4203 goto have_block_group; 4204 } 4205 } else if (block_group) { 4206 btrfs_put_block_group(block_group); 4207 } 4208 } 4209 search: 4210 down_read(&space_info->groups_sem); 4211 list_for_each_entry(block_group, &space_info->block_groups, list) { 4212 u64 offset; 4213 int cached; 4214 4215 atomic_inc(&block_group->count); 4216 search_start = block_group->key.objectid; 4217 4218 have_block_group: 4219 if (unlikely(block_group->cached == BTRFS_CACHE_NO)) { 4220 u64 free_percent; 4221 4222 free_percent = btrfs_block_group_used(&block_group->item); 4223 free_percent *= 100; 4224 free_percent = div64_u64(free_percent, 4225 block_group->key.offset); 4226 free_percent = 100 - free_percent; 4227 if (free_percent > ideal_cache_percent && 4228 likely(!block_group->ro)) { 4229 ideal_cache_offset = block_group->key.objectid; 4230 ideal_cache_percent = free_percent; 4231 } 4232 4233 /* 4234 * We only want to start kthread caching if we are at 4235 * the point where we will wait for caching to make 4236 * progress, or if our ideal search is over and we've 4237 * found somebody to start caching. 4238 */ 4239 if (loop > LOOP_CACHING_NOWAIT || 4240 (loop > LOOP_FIND_IDEAL && 4241 atomic_read(&space_info->caching_threads) < 2)) { 4242 ret = cache_block_group(block_group); 4243 BUG_ON(ret); 4244 } 4245 found_uncached_bg = true; 4246 4247 /* 4248 * If loop is set for cached only, try the next block 4249 * group. 4250 */ 4251 if (loop == LOOP_FIND_IDEAL) 4252 goto loop; 4253 } 4254 4255 cached = block_group_cache_done(block_group); 4256 if (unlikely(!cached)) 4257 found_uncached_bg = true; 4258 4259 if (unlikely(block_group->ro)) 4260 goto loop; 4261 4262 /* 4263 * Ok we want to try and use the cluster allocator, so lets look 4264 * there, unless we are on LOOP_NO_EMPTY_SIZE, since we will 4265 * have tried the cluster allocator plenty of times at this 4266 * point and not have found anything, so we are likely way too 4267 * fragmented for the clustering stuff to find anything, so lets 4268 * just skip it and let the allocator find whatever block it can 4269 * find 4270 */ 4271 if (last_ptr && loop < LOOP_NO_EMPTY_SIZE) { 4272 /* 4273 * the refill lock keeps out other 4274 * people trying to start a new cluster 4275 */ 4276 spin_lock(&last_ptr->refill_lock); 4277 if (last_ptr->block_group && 4278 (last_ptr->block_group->ro || 4279 !block_group_bits(last_ptr->block_group, data))) { 4280 offset = 0; 4281 goto refill_cluster; 4282 } 4283 4284 offset = btrfs_alloc_from_cluster(block_group, last_ptr, 4285 num_bytes, search_start); 4286 if (offset) { 4287 /* we have a block, we're done */ 4288 spin_unlock(&last_ptr->refill_lock); 4289 goto checks; 4290 } 4291 4292 spin_lock(&last_ptr->lock); 4293 /* 4294 * whoops, this cluster doesn't actually point to 4295 * this block group. 
Get a ref on the block 4296 * group is does point to and try again 4297 */ 4298 if (!last_ptr_loop && last_ptr->block_group && 4299 last_ptr->block_group != block_group) { 4300 4301 btrfs_put_block_group(block_group); 4302 block_group = last_ptr->block_group; 4303 atomic_inc(&block_group->count); 4304 spin_unlock(&last_ptr->lock); 4305 spin_unlock(&last_ptr->refill_lock); 4306 4307 last_ptr_loop = 1; 4308 search_start = block_group->key.objectid; 4309 /* 4310 * we know this block group is properly 4311 * in the list because 4312 * btrfs_remove_block_group, drops the 4313 * cluster before it removes the block 4314 * group from the list 4315 */ 4316 goto have_block_group; 4317 } 4318 spin_unlock(&last_ptr->lock); 4319 refill_cluster: 4320 /* 4321 * this cluster didn't work out, free it and 4322 * start over 4323 */ 4324 btrfs_return_cluster_to_free_space(NULL, last_ptr); 4325 4326 last_ptr_loop = 0; 4327 4328 /* allocate a cluster in this block group */ 4329 ret = btrfs_find_space_cluster(trans, root, 4330 block_group, last_ptr, 4331 offset, num_bytes, 4332 empty_cluster + empty_size); 4333 if (ret == 0) { 4334 /* 4335 * now pull our allocation out of this 4336 * cluster 4337 */ 4338 offset = btrfs_alloc_from_cluster(block_group, 4339 last_ptr, num_bytes, 4340 search_start); 4341 if (offset) { 4342 /* we found one, proceed */ 4343 spin_unlock(&last_ptr->refill_lock); 4344 goto checks; 4345 } 4346 } else if (!cached && loop > LOOP_CACHING_NOWAIT 4347 && !failed_cluster_refill) { 4348 spin_unlock(&last_ptr->refill_lock); 4349 4350 failed_cluster_refill = true; 4351 wait_block_group_cache_progress(block_group, 4352 num_bytes + empty_cluster + empty_size); 4353 goto have_block_group; 4354 } 4355 4356 /* 4357 * at this point we either didn't find a cluster 4358 * or we weren't able to allocate a block from our 4359 * cluster. Free the cluster we've been trying 4360 * to use, and go to the next block group 4361 */ 4362 btrfs_return_cluster_to_free_space(NULL, last_ptr); 4363 spin_unlock(&last_ptr->refill_lock); 4364 goto loop; 4365 } 4366 4367 offset = btrfs_find_space_for_alloc(block_group, search_start, 4368 num_bytes, empty_size); 4369 /* 4370 * If we didn't find a chunk, and we haven't failed on this 4371 * block group before, and this block group is in the middle of 4372 * caching and we are ok with waiting, then go ahead and wait 4373 * for progress to be made, and set failed_alloc to true. 4374 * 4375 * If failed_alloc is true then we've already waited on this 4376 * block group once and should move on to the next block group. 
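 * failed_alloc is reset at the loop: label below, so each block group
 * gets at most one progress wait per pass through the list.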
4377 */ 4378 if (!offset && !failed_alloc && !cached && 4379 loop > LOOP_CACHING_NOWAIT) { 4380 wait_block_group_cache_progress(block_group, 4381 num_bytes + empty_size); 4382 failed_alloc = true; 4383 goto have_block_group; 4384 } else if (!offset) { 4385 goto loop; 4386 } 4387 checks: 4388 search_start = stripe_align(root, offset); 4389 /* move on to the next group */ 4390 if (search_start + num_bytes >= search_end) { 4391 btrfs_add_free_space(block_group, offset, num_bytes); 4392 goto loop; 4393 } 4394 4395 /* move on to the next group */ 4396 if (search_start + num_bytes > 4397 block_group->key.objectid + block_group->key.offset) { 4398 btrfs_add_free_space(block_group, offset, num_bytes); 4399 goto loop; 4400 } 4401 4402 if (exclude_nr > 0 && 4403 (search_start + num_bytes > exclude_start && 4404 search_start < exclude_start + exclude_nr)) { 4405 search_start = exclude_start + exclude_nr; 4406 4407 btrfs_add_free_space(block_group, offset, num_bytes); 4408 /* 4409 * if search_start is still in this block group 4410 * then we just re-search this block group 4411 */ 4412 if (search_start >= block_group->key.objectid && 4413 search_start < (block_group->key.objectid + 4414 block_group->key.offset)) 4415 goto have_block_group; 4416 goto loop; 4417 } 4418 4419 ins->objectid = search_start; 4420 ins->offset = num_bytes; 4421 4422 if (offset < search_start) 4423 btrfs_add_free_space(block_group, offset, 4424 search_start - offset); 4425 BUG_ON(offset > search_start); 4426 4427 update_reserved_extents(block_group, num_bytes, 1); 4428 4429 /* we are all good, lets return */ 4430 break; 4431 loop: 4432 failed_cluster_refill = false; 4433 failed_alloc = false; 4434 btrfs_put_block_group(block_group); 4435 } 4436 up_read(&space_info->groups_sem); 4437 4438 /* LOOP_FIND_IDEAL, only search caching/cached bg's, and don't wait for 4439 * for them to make caching progress. Also 4440 * determine the best possible bg to cache 4441 * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking 4442 * caching kthreads as we move along 4443 * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching 4444 * LOOP_ALLOC_CHUNK, force a chunk allocation and try again 4445 * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try 4446 * again 4447 */ 4448 if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE && 4449 (found_uncached_bg || empty_size || empty_cluster || 4450 allowed_chunk_alloc)) { 4451 if (loop == LOOP_FIND_IDEAL && found_uncached_bg) { 4452 found_uncached_bg = false; 4453 loop++; 4454 if (!ideal_cache_percent && 4455 atomic_read(&space_info->caching_threads)) 4456 goto search; 4457 4458 /* 4459 * 1 of the following 2 things have happened so far 4460 * 4461 * 1) We found an ideal block group for caching that 4462 * is mostly full and will cache quickly, so we might 4463 * as well wait for it. 4464 * 4465 * 2) We searched for cached only and we didn't find 4466 * anything, and we didn't start any caching kthreads 4467 * either, so chances are we will loop through and 4468 * start a couple caching kthreads, and then come back 4469 * around and just wait for them. This will be slower 4470 * because we will have 2 caching kthreads reading at 4471 * the same time when we could have just started one 4472 * and waited for it to get far enough to give us an 4473 * allocation, so go ahead and go to the wait caching 4474 * loop. 
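 * Either way we switch to LOOP_CACHING_WAIT below and restart the
 * search from ideal_cache_offset.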
4475 */ 4476 loop = LOOP_CACHING_WAIT; 4477 search_start = ideal_cache_offset; 4478 ideal_cache_percent = 0; 4479 goto ideal_cache; 4480 } else if (loop == LOOP_FIND_IDEAL) { 4481 /* 4482 * Didn't find a uncached bg, wait on anything we find 4483 * next. 4484 */ 4485 loop = LOOP_CACHING_WAIT; 4486 goto search; 4487 } 4488 4489 if (loop < LOOP_CACHING_WAIT) { 4490 loop++; 4491 goto search; 4492 } 4493 4494 if (loop == LOOP_ALLOC_CHUNK) { 4495 empty_size = 0; 4496 empty_cluster = 0; 4497 } 4498 4499 if (allowed_chunk_alloc) { 4500 ret = do_chunk_alloc(trans, root, num_bytes + 4501 2 * 1024 * 1024, data, 1); 4502 allowed_chunk_alloc = 0; 4503 done_chunk_alloc = 1; 4504 } else if (!done_chunk_alloc) { 4505 space_info->force_alloc = 1; 4506 } 4507 4508 if (loop < LOOP_NO_EMPTY_SIZE) { 4509 loop++; 4510 goto search; 4511 } 4512 ret = -ENOSPC; 4513 } else if (!ins->objectid) { 4514 ret = -ENOSPC; 4515 } 4516 4517 /* we found what we needed */ 4518 if (ins->objectid) { 4519 if (!(data & BTRFS_BLOCK_GROUP_DATA)) 4520 trans->block_group = block_group->key.objectid; 4521 4522 btrfs_put_block_group(block_group); 4523 ret = 0; 4524 } 4525 4526 return ret; 4527 } 4528 4529 static void dump_space_info(struct btrfs_space_info *info, u64 bytes, 4530 int dump_block_groups) 4531 { 4532 struct btrfs_block_group_cache *cache; 4533 4534 spin_lock(&info->lock); 4535 printk(KERN_INFO "space_info has %llu free, is %sfull\n", 4536 (unsigned long long)(info->total_bytes - info->bytes_used - 4537 info->bytes_pinned - info->bytes_reserved - 4538 info->bytes_super), 4539 (info->full) ? "" : "not "); 4540 printk(KERN_INFO "space_info total=%llu, pinned=%llu, delalloc=%llu," 4541 " may_use=%llu, used=%llu, root=%llu, super=%llu, reserved=%llu" 4542 "\n", 4543 (unsigned long long)info->total_bytes, 4544 (unsigned long long)info->bytes_pinned, 4545 (unsigned long long)info->bytes_delalloc, 4546 (unsigned long long)info->bytes_may_use, 4547 (unsigned long long)info->bytes_used, 4548 (unsigned long long)info->bytes_root, 4549 (unsigned long long)info->bytes_super, 4550 (unsigned long long)info->bytes_reserved); 4551 spin_unlock(&info->lock); 4552 4553 if (!dump_block_groups) 4554 return; 4555 4556 down_read(&info->groups_sem); 4557 list_for_each_entry(cache, &info->block_groups, list) { 4558 spin_lock(&cache->lock); 4559 printk(KERN_INFO "block group %llu has %llu bytes, %llu used " 4560 "%llu pinned %llu reserved\n", 4561 (unsigned long long)cache->key.objectid, 4562 (unsigned long long)cache->key.offset, 4563 (unsigned long long)btrfs_block_group_used(&cache->item), 4564 (unsigned long long)cache->pinned, 4565 (unsigned long long)cache->reserved); 4566 btrfs_dump_free_space(cache, bytes); 4567 spin_unlock(&cache->lock); 4568 } 4569 up_read(&info->groups_sem); 4570 } 4571 4572 int btrfs_reserve_extent(struct btrfs_trans_handle *trans, 4573 struct btrfs_root *root, 4574 u64 num_bytes, u64 min_alloc_size, 4575 u64 empty_size, u64 hint_byte, 4576 u64 search_end, struct btrfs_key *ins, 4577 u64 data) 4578 { 4579 int ret; 4580 u64 search_start = 0; 4581 struct btrfs_fs_info *info = root->fs_info; 4582 4583 data = btrfs_get_alloc_profile(root, data); 4584 again: 4585 /* 4586 * the only place that sets empty_size is btrfs_realloc_node, which 4587 * is not called recursively on allocations 4588 */ 4589 if (empty_size || root->ref_cows) { 4590 if (!(data & BTRFS_BLOCK_GROUP_METADATA)) { 4591 ret = do_chunk_alloc(trans, root->fs_info->extent_root, 4592 2 * 1024 * 1024, 4593 BTRFS_BLOCK_GROUP_METADATA | 4594 
(info->metadata_alloc_profile & 4595 info->avail_metadata_alloc_bits), 0); 4596 } 4597 ret = do_chunk_alloc(trans, root->fs_info->extent_root, 4598 num_bytes + 2 * 1024 * 1024, data, 0); 4599 } 4600 4601 WARN_ON(num_bytes < root->sectorsize); 4602 ret = find_free_extent(trans, root, num_bytes, empty_size, 4603 search_start, search_end, hint_byte, ins, 4604 trans->alloc_exclude_start, 4605 trans->alloc_exclude_nr, data); 4606 4607 if (ret == -ENOSPC && num_bytes > min_alloc_size) { 4608 num_bytes = num_bytes >> 1; 4609 num_bytes = num_bytes & ~(root->sectorsize - 1); 4610 num_bytes = max(num_bytes, min_alloc_size); 4611 do_chunk_alloc(trans, root->fs_info->extent_root, 4612 num_bytes, data, 1); 4613 goto again; 4614 } 4615 if (ret == -ENOSPC) { 4616 struct btrfs_space_info *sinfo; 4617 4618 sinfo = __find_space_info(root->fs_info, data); 4619 printk(KERN_ERR "btrfs allocation failed flags %llu, " 4620 "wanted %llu\n", (unsigned long long)data, 4621 (unsigned long long)num_bytes); 4622 dump_space_info(sinfo, num_bytes, 1); 4623 } 4624 4625 return ret; 4626 } 4627 4628 int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len) 4629 { 4630 struct btrfs_block_group_cache *cache; 4631 int ret = 0; 4632 4633 cache = btrfs_lookup_block_group(root->fs_info, start); 4634 if (!cache) { 4635 printk(KERN_ERR "Unable to find block group for %llu\n", 4636 (unsigned long long)start); 4637 return -ENOSPC; 4638 } 4639 4640 ret = btrfs_discard_extent(root, start, len); 4641 4642 btrfs_add_free_space(cache, start, len); 4643 update_reserved_extents(cache, len, 0); 4644 btrfs_put_block_group(cache); 4645 4646 return ret; 4647 } 4648 4649 static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, 4650 struct btrfs_root *root, 4651 u64 parent, u64 root_objectid, 4652 u64 flags, u64 owner, u64 offset, 4653 struct btrfs_key *ins, int ref_mod) 4654 { 4655 int ret; 4656 struct btrfs_fs_info *fs_info = root->fs_info; 4657 struct btrfs_extent_item *extent_item; 4658 struct btrfs_extent_inline_ref *iref; 4659 struct btrfs_path *path; 4660 struct extent_buffer *leaf; 4661 int type; 4662 u32 size; 4663 4664 if (parent > 0) 4665 type = BTRFS_SHARED_DATA_REF_KEY; 4666 else 4667 type = BTRFS_EXTENT_DATA_REF_KEY; 4668 4669 size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type); 4670 4671 path = btrfs_alloc_path(); 4672 BUG_ON(!path); 4673 4674 path->leave_spinning = 1; 4675 ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path, 4676 ins, size); 4677 BUG_ON(ret); 4678 4679 leaf = path->nodes[0]; 4680 extent_item = btrfs_item_ptr(leaf, path->slots[0], 4681 struct btrfs_extent_item); 4682 btrfs_set_extent_refs(leaf, extent_item, ref_mod); 4683 btrfs_set_extent_generation(leaf, extent_item, trans->transid); 4684 btrfs_set_extent_flags(leaf, extent_item, 4685 flags | BTRFS_EXTENT_FLAG_DATA); 4686 4687 iref = (struct btrfs_extent_inline_ref *)(extent_item + 1); 4688 btrfs_set_extent_inline_ref_type(leaf, iref, type); 4689 if (parent > 0) { 4690 struct btrfs_shared_data_ref *ref; 4691 ref = (struct btrfs_shared_data_ref *)(iref + 1); 4692 btrfs_set_extent_inline_ref_offset(leaf, iref, parent); 4693 btrfs_set_shared_data_ref_count(leaf, ref, ref_mod); 4694 } else { 4695 struct btrfs_extent_data_ref *ref; 4696 ref = (struct btrfs_extent_data_ref *)(&iref->offset); 4697 btrfs_set_extent_data_ref_root(leaf, ref, root_objectid); 4698 btrfs_set_extent_data_ref_objectid(leaf, ref, owner); 4699 btrfs_set_extent_data_ref_offset(leaf, ref, offset); 4700 btrfs_set_extent_data_ref_count(leaf, 
ref, ref_mod); 4701 } 4702 4703 btrfs_mark_buffer_dirty(path->nodes[0]); 4704 btrfs_free_path(path); 4705 4706 ret = update_block_group(trans, root, ins->objectid, ins->offset, 4707 1, 0); 4708 if (ret) { 4709 printk(KERN_ERR "btrfs update block group failed for %llu " 4710 "%llu\n", (unsigned long long)ins->objectid, 4711 (unsigned long long)ins->offset); 4712 BUG(); 4713 } 4714 return ret; 4715 } 4716 4717 static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, 4718 struct btrfs_root *root, 4719 u64 parent, u64 root_objectid, 4720 u64 flags, struct btrfs_disk_key *key, 4721 int level, struct btrfs_key *ins) 4722 { 4723 int ret; 4724 struct btrfs_fs_info *fs_info = root->fs_info; 4725 struct btrfs_extent_item *extent_item; 4726 struct btrfs_tree_block_info *block_info; 4727 struct btrfs_extent_inline_ref *iref; 4728 struct btrfs_path *path; 4729 struct extent_buffer *leaf; 4730 u32 size = sizeof(*extent_item) + sizeof(*block_info) + sizeof(*iref); 4731 4732 path = btrfs_alloc_path(); 4733 BUG_ON(!path); 4734 4735 path->leave_spinning = 1; 4736 ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path, 4737 ins, size); 4738 BUG_ON(ret); 4739 4740 leaf = path->nodes[0]; 4741 extent_item = btrfs_item_ptr(leaf, path->slots[0], 4742 struct btrfs_extent_item); 4743 btrfs_set_extent_refs(leaf, extent_item, 1); 4744 btrfs_set_extent_generation(leaf, extent_item, trans->transid); 4745 btrfs_set_extent_flags(leaf, extent_item, 4746 flags | BTRFS_EXTENT_FLAG_TREE_BLOCK); 4747 block_info = (struct btrfs_tree_block_info *)(extent_item + 1); 4748 4749 btrfs_set_tree_block_key(leaf, block_info, key); 4750 btrfs_set_tree_block_level(leaf, block_info, level); 4751 4752 iref = (struct btrfs_extent_inline_ref *)(block_info + 1); 4753 if (parent > 0) { 4754 BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)); 4755 btrfs_set_extent_inline_ref_type(leaf, iref, 4756 BTRFS_SHARED_BLOCK_REF_KEY); 4757 btrfs_set_extent_inline_ref_offset(leaf, iref, parent); 4758 } else { 4759 btrfs_set_extent_inline_ref_type(leaf, iref, 4760 BTRFS_TREE_BLOCK_REF_KEY); 4761 btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid); 4762 } 4763 4764 btrfs_mark_buffer_dirty(leaf); 4765 btrfs_free_path(path); 4766 4767 ret = update_block_group(trans, root, ins->objectid, ins->offset, 4768 1, 0); 4769 if (ret) { 4770 printk(KERN_ERR "btrfs update block group failed for %llu " 4771 "%llu\n", (unsigned long long)ins->objectid, 4772 (unsigned long long)ins->offset); 4773 BUG(); 4774 } 4775 return ret; 4776 } 4777 4778 int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, 4779 struct btrfs_root *root, 4780 u64 root_objectid, u64 owner, 4781 u64 offset, struct btrfs_key *ins) 4782 { 4783 int ret; 4784 4785 BUG_ON(root_objectid == BTRFS_TREE_LOG_OBJECTID); 4786 4787 ret = btrfs_add_delayed_data_ref(trans, ins->objectid, ins->offset, 4788 0, root_objectid, owner, offset, 4789 BTRFS_ADD_DELAYED_EXTENT, NULL); 4790 return ret; 4791 } 4792 4793 /* 4794 * this is used by the tree logging recovery code. 
It records that 4795 * an extent has been allocated and makes sure to clear the free 4796 * space cache bits as well 4797 */ 4798 int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, 4799 struct btrfs_root *root, 4800 u64 root_objectid, u64 owner, u64 offset, 4801 struct btrfs_key *ins) 4802 { 4803 int ret; 4804 struct btrfs_block_group_cache *block_group; 4805 struct btrfs_caching_control *caching_ctl; 4806 u64 start = ins->objectid; 4807 u64 num_bytes = ins->offset; 4808 4809 block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid); 4810 cache_block_group(block_group); 4811 caching_ctl = get_caching_control(block_group); 4812 4813 if (!caching_ctl) { 4814 BUG_ON(!block_group_cache_done(block_group)); 4815 ret = btrfs_remove_free_space(block_group, start, num_bytes); 4816 BUG_ON(ret); 4817 } else { 4818 mutex_lock(&caching_ctl->mutex); 4819 4820 if (start >= caching_ctl->progress) { 4821 ret = add_excluded_extent(root, start, num_bytes); 4822 BUG_ON(ret); 4823 } else if (start + num_bytes <= caching_ctl->progress) { 4824 ret = btrfs_remove_free_space(block_group, 4825 start, num_bytes); 4826 BUG_ON(ret); 4827 } else { 4828 num_bytes = caching_ctl->progress - start; 4829 ret = btrfs_remove_free_space(block_group, 4830 start, num_bytes); 4831 BUG_ON(ret); 4832 4833 start = caching_ctl->progress; 4834 num_bytes = ins->objectid + ins->offset - 4835 caching_ctl->progress; 4836 ret = add_excluded_extent(root, start, num_bytes); 4837 BUG_ON(ret); 4838 } 4839 4840 mutex_unlock(&caching_ctl->mutex); 4841 put_caching_control(caching_ctl); 4842 } 4843 4844 update_reserved_extents(block_group, ins->offset, 1); 4845 btrfs_put_block_group(block_group); 4846 ret = alloc_reserved_file_extent(trans, root, 0, root_objectid, 4847 0, owner, offset, ins, 1); 4848 return ret; 4849 } 4850 4851 /* 4852 * finds a free extent and does all the dirty work required for allocation 4853 * returns the key for the extent through ins, and a tree buffer for 4854 * the first block of the extent through buf. 4855 * 4856 * returns 0 if everything worked, non-zero otherwise. 
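 * (the extent_buffer itself is set up afterwards by the caller via
 * btrfs_init_new_buffer(); this helper only reserves the extent and,
 * for non-log trees, queues a delayed ref for it)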
4857 */ 4858 static int alloc_tree_block(struct btrfs_trans_handle *trans, 4859 struct btrfs_root *root, 4860 u64 num_bytes, u64 parent, u64 root_objectid, 4861 struct btrfs_disk_key *key, int level, 4862 u64 empty_size, u64 hint_byte, u64 search_end, 4863 struct btrfs_key *ins) 4864 { 4865 int ret; 4866 u64 flags = 0; 4867 4868 ret = btrfs_reserve_extent(trans, root, num_bytes, num_bytes, 4869 empty_size, hint_byte, search_end, 4870 ins, 0); 4871 if (ret) 4872 return ret; 4873 4874 if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) { 4875 if (parent == 0) 4876 parent = ins->objectid; 4877 flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF; 4878 } else 4879 BUG_ON(parent > 0); 4880 4881 if (root_objectid != BTRFS_TREE_LOG_OBJECTID) { 4882 struct btrfs_delayed_extent_op *extent_op; 4883 extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS); 4884 BUG_ON(!extent_op); 4885 if (key) 4886 memcpy(&extent_op->key, key, sizeof(extent_op->key)); 4887 else 4888 memset(&extent_op->key, 0, sizeof(extent_op->key)); 4889 extent_op->flags_to_set = flags; 4890 extent_op->update_key = 1; 4891 extent_op->update_flags = 1; 4892 extent_op->is_data = 0; 4893 4894 ret = btrfs_add_delayed_tree_ref(trans, ins->objectid, 4895 ins->offset, parent, root_objectid, 4896 level, BTRFS_ADD_DELAYED_EXTENT, 4897 extent_op); 4898 BUG_ON(ret); 4899 } 4900 return ret; 4901 } 4902 4903 struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, 4904 struct btrfs_root *root, 4905 u64 bytenr, u32 blocksize, 4906 int level) 4907 { 4908 struct extent_buffer *buf; 4909 4910 buf = btrfs_find_create_tree_block(root, bytenr, blocksize); 4911 if (!buf) 4912 return ERR_PTR(-ENOMEM); 4913 btrfs_set_header_generation(buf, trans->transid); 4914 btrfs_set_buffer_lockdep_class(buf, level); 4915 btrfs_tree_lock(buf); 4916 clean_tree_block(trans, root, buf); 4917 4918 btrfs_set_lock_blocking(buf); 4919 btrfs_set_buffer_uptodate(buf); 4920 4921 if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) { 4922 set_extent_dirty(&root->dirty_log_pages, buf->start, 4923 buf->start + buf->len - 1, GFP_NOFS); 4924 } else { 4925 set_extent_dirty(&trans->transaction->dirty_pages, buf->start, 4926 buf->start + buf->len - 1, GFP_NOFS); 4927 } 4928 trans->blocks_used++; 4929 /* this returns a buffer locked for blocking */ 4930 return buf; 4931 } 4932 4933 /* 4934 * helper function to allocate a block for a given tree 4935 * returns the tree buffer or NULL. 
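 * Note that failures are reported as an ERR_PTR() value, so callers
 * check the result with IS_ERR().  Illustrative sketch only, the
 * variable names are placeholders:
 *
 *	buf = btrfs_alloc_free_block(trans, root, blocksize, 0,
 *				     root->root_key.objectid, &disk_key,
 *				     level, hint, 0);
 *	if (IS_ERR(buf))
 *		return PTR_ERR(buf);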
4936 */ 4937 struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, 4938 struct btrfs_root *root, u32 blocksize, 4939 u64 parent, u64 root_objectid, 4940 struct btrfs_disk_key *key, int level, 4941 u64 hint, u64 empty_size) 4942 { 4943 struct btrfs_key ins; 4944 int ret; 4945 struct extent_buffer *buf; 4946 4947 ret = alloc_tree_block(trans, root, blocksize, parent, root_objectid, 4948 key, level, empty_size, hint, (u64)-1, &ins); 4949 if (ret) { 4950 BUG_ON(ret > 0); 4951 return ERR_PTR(ret); 4952 } 4953 4954 buf = btrfs_init_new_buffer(trans, root, ins.objectid, 4955 blocksize, level); 4956 return buf; 4957 } 4958 4959 struct walk_control { 4960 u64 refs[BTRFS_MAX_LEVEL]; 4961 u64 flags[BTRFS_MAX_LEVEL]; 4962 struct btrfs_key update_progress; 4963 int stage; 4964 int level; 4965 int shared_level; 4966 int update_ref; 4967 int keep_locks; 4968 int reada_slot; 4969 int reada_count; 4970 }; 4971 4972 #define DROP_REFERENCE 1 4973 #define UPDATE_BACKREF 2 4974 4975 static noinline void reada_walk_down(struct btrfs_trans_handle *trans, 4976 struct btrfs_root *root, 4977 struct walk_control *wc, 4978 struct btrfs_path *path) 4979 { 4980 u64 bytenr; 4981 u64 generation; 4982 u64 refs; 4983 u64 flags; 4984 u64 last = 0; 4985 u32 nritems; 4986 u32 blocksize; 4987 struct btrfs_key key; 4988 struct extent_buffer *eb; 4989 int ret; 4990 int slot; 4991 int nread = 0; 4992 4993 if (path->slots[wc->level] < wc->reada_slot) { 4994 wc->reada_count = wc->reada_count * 2 / 3; 4995 wc->reada_count = max(wc->reada_count, 2); 4996 } else { 4997 wc->reada_count = wc->reada_count * 3 / 2; 4998 wc->reada_count = min_t(int, wc->reada_count, 4999 BTRFS_NODEPTRS_PER_BLOCK(root)); 5000 } 5001 5002 eb = path->nodes[wc->level]; 5003 nritems = btrfs_header_nritems(eb); 5004 blocksize = btrfs_level_size(root, wc->level - 1); 5005 5006 for (slot = path->slots[wc->level]; slot < nritems; slot++) { 5007 if (nread >= wc->reada_count) 5008 break; 5009 5010 cond_resched(); 5011 bytenr = btrfs_node_blockptr(eb, slot); 5012 generation = btrfs_node_ptr_generation(eb, slot); 5013 5014 if (slot == path->slots[wc->level]) 5015 goto reada; 5016 5017 if (wc->stage == UPDATE_BACKREF && 5018 generation <= root->root_key.offset) 5019 continue; 5020 5021 /* We don't lock the tree block, it's OK to be racy here */ 5022 ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize, 5023 &refs, &flags); 5024 BUG_ON(ret); 5025 BUG_ON(refs == 0); 5026 5027 if (wc->stage == DROP_REFERENCE) { 5028 if (refs == 1) 5029 goto reada; 5030 5031 if (wc->level == 1 && 5032 (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) 5033 continue; 5034 if (!wc->update_ref || 5035 generation <= root->root_key.offset) 5036 continue; 5037 btrfs_node_key_to_cpu(eb, &key, slot); 5038 ret = btrfs_comp_cpu_keys(&key, 5039 &wc->update_progress); 5040 if (ret < 0) 5041 continue; 5042 } else { 5043 if (wc->level == 1 && 5044 (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) 5045 continue; 5046 } 5047 reada: 5048 ret = readahead_tree_block(root, bytenr, blocksize, 5049 generation); 5050 if (ret) 5051 break; 5052 last = bytenr + blocksize; 5053 nread++; 5054 } 5055 wc->reada_slot = slot; 5056 } 5057 5058 /* 5059 * hepler to process tree block while walking down the tree. 5060 * 5061 * when wc->stage == UPDATE_BACKREF, this function updates 5062 * back refs for pointers in the block. 5063 * 5064 * NOTE: return value 1 means we should stop walking down. 
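 * In the DROP_REFERENCE stage that happens when the block is still
 * shared (reference count greater than one); the caller then skips
 * the subtree below it instead of descending into it.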
5065 */ 5066 static noinline int walk_down_proc(struct btrfs_trans_handle *trans, 5067 struct btrfs_root *root, 5068 struct btrfs_path *path, 5069 struct walk_control *wc, int lookup_info) 5070 { 5071 int level = wc->level; 5072 struct extent_buffer *eb = path->nodes[level]; 5073 u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF; 5074 int ret; 5075 5076 if (wc->stage == UPDATE_BACKREF && 5077 btrfs_header_owner(eb) != root->root_key.objectid) 5078 return 1; 5079 5080 /* 5081 * when reference count of tree block is 1, it won't increase 5082 * again. once full backref flag is set, we never clear it. 5083 */ 5084 if (lookup_info && 5085 ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) || 5086 (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) { 5087 BUG_ON(!path->locks[level]); 5088 ret = btrfs_lookup_extent_info(trans, root, 5089 eb->start, eb->len, 5090 &wc->refs[level], 5091 &wc->flags[level]); 5092 BUG_ON(ret); 5093 BUG_ON(wc->refs[level] == 0); 5094 } 5095 5096 if (wc->stage == DROP_REFERENCE) { 5097 if (wc->refs[level] > 1) 5098 return 1; 5099 5100 if (path->locks[level] && !wc->keep_locks) { 5101 btrfs_tree_unlock(eb); 5102 path->locks[level] = 0; 5103 } 5104 return 0; 5105 } 5106 5107 /* wc->stage == UPDATE_BACKREF */ 5108 if (!(wc->flags[level] & flag)) { 5109 BUG_ON(!path->locks[level]); 5110 ret = btrfs_inc_ref(trans, root, eb, 1); 5111 BUG_ON(ret); 5112 ret = btrfs_dec_ref(trans, root, eb, 0); 5113 BUG_ON(ret); 5114 ret = btrfs_set_disk_extent_flags(trans, root, eb->start, 5115 eb->len, flag, 0); 5116 BUG_ON(ret); 5117 wc->flags[level] |= flag; 5118 } 5119 5120 /* 5121 * the block is shared by multiple trees, so it's not good to 5122 * keep the tree lock 5123 */ 5124 if (path->locks[level] && level > 0) { 5125 btrfs_tree_unlock(eb); 5126 path->locks[level] = 0; 5127 } 5128 return 0; 5129 } 5130 5131 /* 5132 * hepler to process tree block pointer. 5133 * 5134 * when wc->stage == DROP_REFERENCE, this function checks 5135 * reference count of the block pointed to. if the block 5136 * is shared and we need update back refs for the subtree 5137 * rooted at the block, this function changes wc->stage to 5138 * UPDATE_BACKREF. if the block is shared and there is no 5139 * need to update back, this function drops the reference 5140 * to the block. 5141 * 5142 * NOTE: return value 1 means we should stop walking down. 
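 * That covers both the case where we decide not to descend into the
 * child block and the case where its reference is dropped directly
 * at the skip: label further down.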
5143 */ 5144 static noinline int do_walk_down(struct btrfs_trans_handle *trans, 5145 struct btrfs_root *root, 5146 struct btrfs_path *path, 5147 struct walk_control *wc, int *lookup_info) 5148 { 5149 u64 bytenr; 5150 u64 generation; 5151 u64 parent; 5152 u32 blocksize; 5153 struct btrfs_key key; 5154 struct extent_buffer *next; 5155 int level = wc->level; 5156 int reada = 0; 5157 int ret = 0; 5158 5159 generation = btrfs_node_ptr_generation(path->nodes[level], 5160 path->slots[level]); 5161 /* 5162 * if the lower level block was created before the snapshot 5163 * was created, we know there is no need to update back refs 5164 * for the subtree 5165 */ 5166 if (wc->stage == UPDATE_BACKREF && 5167 generation <= root->root_key.offset) { 5168 *lookup_info = 1; 5169 return 1; 5170 } 5171 5172 bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]); 5173 blocksize = btrfs_level_size(root, level - 1); 5174 5175 next = btrfs_find_tree_block(root, bytenr, blocksize); 5176 if (!next) { 5177 next = btrfs_find_create_tree_block(root, bytenr, blocksize); 5178 reada = 1; 5179 } 5180 btrfs_tree_lock(next); 5181 btrfs_set_lock_blocking(next); 5182 5183 ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize, 5184 &wc->refs[level - 1], 5185 &wc->flags[level - 1]); 5186 BUG_ON(ret); 5187 BUG_ON(wc->refs[level - 1] == 0); 5188 *lookup_info = 0; 5189 5190 if (wc->stage == DROP_REFERENCE) { 5191 if (wc->refs[level - 1] > 1) { 5192 if (level == 1 && 5193 (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF)) 5194 goto skip; 5195 5196 if (!wc->update_ref || 5197 generation <= root->root_key.offset) 5198 goto skip; 5199 5200 btrfs_node_key_to_cpu(path->nodes[level], &key, 5201 path->slots[level]); 5202 ret = btrfs_comp_cpu_keys(&key, &wc->update_progress); 5203 if (ret < 0) 5204 goto skip; 5205 5206 wc->stage = UPDATE_BACKREF; 5207 wc->shared_level = level - 1; 5208 } 5209 } else { 5210 if (level == 1 && 5211 (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF)) 5212 goto skip; 5213 } 5214 5215 if (!btrfs_buffer_uptodate(next, generation)) { 5216 btrfs_tree_unlock(next); 5217 free_extent_buffer(next); 5218 next = NULL; 5219 *lookup_info = 1; 5220 } 5221 5222 if (!next) { 5223 if (reada && level == 1) 5224 reada_walk_down(trans, root, wc, path); 5225 next = read_tree_block(root, bytenr, blocksize, generation); 5226 btrfs_tree_lock(next); 5227 btrfs_set_lock_blocking(next); 5228 } 5229 5230 level--; 5231 BUG_ON(level != btrfs_header_level(next)); 5232 path->nodes[level] = next; 5233 path->slots[level] = 0; 5234 path->locks[level] = 1; 5235 wc->level = level; 5236 if (wc->level == 1) 5237 wc->reada_slot = 0; 5238 return 0; 5239 skip: 5240 wc->refs[level - 1] = 0; 5241 wc->flags[level - 1] = 0; 5242 if (wc->stage == DROP_REFERENCE) { 5243 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) { 5244 parent = path->nodes[level]->start; 5245 } else { 5246 BUG_ON(root->root_key.objectid != 5247 btrfs_header_owner(path->nodes[level])); 5248 parent = 0; 5249 } 5250 5251 ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent, 5252 root->root_key.objectid, level - 1, 0); 5253 BUG_ON(ret); 5254 } 5255 btrfs_tree_unlock(next); 5256 free_extent_buffer(next); 5257 *lookup_info = 1; 5258 return 1; 5259 } 5260 5261 /* 5262 * hepler to process tree block while walking up the tree. 5263 * 5264 * when wc->stage == DROP_REFERENCE, this function drops 5265 * reference count on the block. 
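 * The reference itself is dropped via btrfs_free_extent() at the end
 * of this function; if it was the last reference the block is also
 * cleaned with clean_tree_block() first.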
5266 * 5267 * when wc->stage == UPDATE_BACKREF, this function changes 5268 * wc->stage back to DROP_REFERENCE if we changed wc->stage 5269 * to UPDATE_BACKREF previously while processing the block. 5270 * 5271 * NOTE: return value 1 means we should stop walking up. 5272 */ 5273 static noinline int walk_up_proc(struct btrfs_trans_handle *trans, 5274 struct btrfs_root *root, 5275 struct btrfs_path *path, 5276 struct walk_control *wc) 5277 { 5278 int ret = 0; 5279 int level = wc->level; 5280 struct extent_buffer *eb = path->nodes[level]; 5281 u64 parent = 0; 5282 5283 if (wc->stage == UPDATE_BACKREF) { 5284 BUG_ON(wc->shared_level < level); 5285 if (level < wc->shared_level) 5286 goto out; 5287 5288 ret = find_next_key(path, level + 1, &wc->update_progress); 5289 if (ret > 0) 5290 wc->update_ref = 0; 5291 5292 wc->stage = DROP_REFERENCE; 5293 wc->shared_level = -1; 5294 path->slots[level] = 0; 5295 5296 /* 5297 * check reference count again if the block isn't locked. 5298 * we should start walking down the tree again if reference 5299 * count is one. 5300 */ 5301 if (!path->locks[level]) { 5302 BUG_ON(level == 0); 5303 btrfs_tree_lock(eb); 5304 btrfs_set_lock_blocking(eb); 5305 path->locks[level] = 1; 5306 5307 ret = btrfs_lookup_extent_info(trans, root, 5308 eb->start, eb->len, 5309 &wc->refs[level], 5310 &wc->flags[level]); 5311 BUG_ON(ret); 5312 BUG_ON(wc->refs[level] == 0); 5313 if (wc->refs[level] == 1) { 5314 btrfs_tree_unlock(eb); 5315 path->locks[level] = 0; 5316 return 1; 5317 } 5318 } 5319 } 5320 5321 /* wc->stage == DROP_REFERENCE */ 5322 BUG_ON(wc->refs[level] > 1 && !path->locks[level]); 5323 5324 if (wc->refs[level] == 1) { 5325 if (level == 0) { 5326 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) 5327 ret = btrfs_dec_ref(trans, root, eb, 1); 5328 else 5329 ret = btrfs_dec_ref(trans, root, eb, 0); 5330 BUG_ON(ret); 5331 } 5332 /* make block locked assertion in clean_tree_block happy */ 5333 if (!path->locks[level] && 5334 btrfs_header_generation(eb) == trans->transid) { 5335 btrfs_tree_lock(eb); 5336 btrfs_set_lock_blocking(eb); 5337 path->locks[level] = 1; 5338 } 5339 clean_tree_block(trans, root, eb); 5340 } 5341 5342 if (eb == root->node) { 5343 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) 5344 parent = eb->start; 5345 else 5346 BUG_ON(root->root_key.objectid != 5347 btrfs_header_owner(eb)); 5348 } else { 5349 if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF) 5350 parent = path->nodes[level + 1]->start; 5351 else 5352 BUG_ON(root->root_key.objectid != 5353 btrfs_header_owner(path->nodes[level + 1])); 5354 } 5355 5356 ret = btrfs_free_extent(trans, root, eb->start, eb->len, parent, 5357 root->root_key.objectid, level, 0); 5358 BUG_ON(ret); 5359 out: 5360 wc->refs[level] = 0; 5361 wc->flags[level] = 0; 5362 return ret; 5363 } 5364 5365 static noinline int walk_down_tree(struct btrfs_trans_handle *trans, 5366 struct btrfs_root *root, 5367 struct btrfs_path *path, 5368 struct walk_control *wc) 5369 { 5370 int level = wc->level; 5371 int lookup_info = 1; 5372 int ret; 5373 5374 while (level >= 0) { 5375 if (path->slots[level] >= 5376 btrfs_header_nritems(path->nodes[level])) 5377 break; 5378 5379 ret = walk_down_proc(trans, root, path, wc, lookup_info); 5380 if (ret > 0) 5381 break; 5382 5383 if (level == 0) 5384 break; 5385 5386 ret = do_walk_down(trans, root, path, wc, &lookup_info); 5387 if (ret > 0) { 5388 path->slots[level]++; 5389 continue; 5390 } 5391 level = wc->level; 5392 } 5393 return 0; 5394 } 5395 5396 static noinline int walk_up_tree(struct 
btrfs_trans_handle *trans, 5397 struct btrfs_root *root, 5398 struct btrfs_path *path, 5399 struct walk_control *wc, int max_level) 5400 { 5401 int level = wc->level; 5402 int ret; 5403 5404 path->slots[level] = btrfs_header_nritems(path->nodes[level]); 5405 while (level < max_level && path->nodes[level]) { 5406 wc->level = level; 5407 if (path->slots[level] + 1 < 5408 btrfs_header_nritems(path->nodes[level])) { 5409 path->slots[level]++; 5410 return 0; 5411 } else { 5412 ret = walk_up_proc(trans, root, path, wc); 5413 if (ret > 0) 5414 return 0; 5415 5416 if (path->locks[level]) { 5417 btrfs_tree_unlock(path->nodes[level]); 5418 path->locks[level] = 0; 5419 } 5420 free_extent_buffer(path->nodes[level]); 5421 path->nodes[level] = NULL; 5422 level++; 5423 } 5424 } 5425 return 1; 5426 } 5427 5428 /* 5429 * drop a subvolume tree. 5430 * 5431 * this function traverses the tree freeing any blocks that only 5432 * referenced by the tree. 5433 * 5434 * when a shared tree block is found. this function decreases its 5435 * reference count by one. if update_ref is true, this function 5436 * also make sure backrefs for the shared block and all lower level 5437 * blocks are properly updated. 5438 */ 5439 int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref) 5440 { 5441 struct btrfs_path *path; 5442 struct btrfs_trans_handle *trans; 5443 struct btrfs_root *tree_root = root->fs_info->tree_root; 5444 struct btrfs_root_item *root_item = &root->root_item; 5445 struct walk_control *wc; 5446 struct btrfs_key key; 5447 int err = 0; 5448 int ret; 5449 int level; 5450 5451 path = btrfs_alloc_path(); 5452 BUG_ON(!path); 5453 5454 wc = kzalloc(sizeof(*wc), GFP_NOFS); 5455 BUG_ON(!wc); 5456 5457 trans = btrfs_start_transaction(tree_root, 1); 5458 5459 if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) { 5460 level = btrfs_header_level(root->node); 5461 path->nodes[level] = btrfs_lock_root_node(root); 5462 btrfs_set_lock_blocking(path->nodes[level]); 5463 path->slots[level] = 0; 5464 path->locks[level] = 1; 5465 memset(&wc->update_progress, 0, 5466 sizeof(wc->update_progress)); 5467 } else { 5468 btrfs_disk_key_to_cpu(&key, &root_item->drop_progress); 5469 memcpy(&wc->update_progress, &key, 5470 sizeof(wc->update_progress)); 5471 5472 level = root_item->drop_level; 5473 BUG_ON(level == 0); 5474 path->lowest_level = level; 5475 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 5476 path->lowest_level = 0; 5477 if (ret < 0) { 5478 err = ret; 5479 goto out; 5480 } 5481 WARN_ON(ret > 0); 5482 5483 /* 5484 * unlock our path, this is safe because only this 5485 * function is allowed to delete this snapshot 5486 */ 5487 btrfs_unlock_up_safe(path, 0); 5488 5489 level = btrfs_header_level(root->node); 5490 while (1) { 5491 btrfs_tree_lock(path->nodes[level]); 5492 btrfs_set_lock_blocking(path->nodes[level]); 5493 5494 ret = btrfs_lookup_extent_info(trans, root, 5495 path->nodes[level]->start, 5496 path->nodes[level]->len, 5497 &wc->refs[level], 5498 &wc->flags[level]); 5499 BUG_ON(ret); 5500 BUG_ON(wc->refs[level] == 0); 5501 5502 if (level == root_item->drop_level) 5503 break; 5504 5505 btrfs_tree_unlock(path->nodes[level]); 5506 WARN_ON(wc->refs[level] != 1); 5507 level--; 5508 } 5509 } 5510 5511 wc->level = level; 5512 wc->shared_level = -1; 5513 wc->stage = DROP_REFERENCE; 5514 wc->update_ref = update_ref; 5515 wc->keep_locks = 0; 5516 wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root); 5517 5518 while (1) { 5519 ret = walk_down_tree(trans, root, path, wc); 5520 if (ret < 0) { 5521 err = ret; 
5522 break; 5523 } 5524 5525 ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL); 5526 if (ret < 0) { 5527 err = ret; 5528 break; 5529 } 5530 5531 if (ret > 0) { 5532 BUG_ON(wc->stage != DROP_REFERENCE); 5533 break; 5534 } 5535 5536 if (wc->stage == DROP_REFERENCE) { 5537 level = wc->level; 5538 btrfs_node_key(path->nodes[level], 5539 &root_item->drop_progress, 5540 path->slots[level]); 5541 root_item->drop_level = level; 5542 } 5543 5544 BUG_ON(wc->level == 0); 5545 if (trans->transaction->in_commit || 5546 trans->transaction->delayed_refs.flushing) { 5547 ret = btrfs_update_root(trans, tree_root, 5548 &root->root_key, 5549 root_item); 5550 BUG_ON(ret); 5551 5552 btrfs_end_transaction(trans, tree_root); 5553 trans = btrfs_start_transaction(tree_root, 1); 5554 } else { 5555 unsigned long update; 5556 update = trans->delayed_ref_updates; 5557 trans->delayed_ref_updates = 0; 5558 if (update) 5559 btrfs_run_delayed_refs(trans, tree_root, 5560 update); 5561 } 5562 } 5563 btrfs_release_path(root, path); 5564 BUG_ON(err); 5565 5566 ret = btrfs_del_root(trans, tree_root, &root->root_key); 5567 BUG_ON(ret); 5568 5569 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) { 5570 ret = btrfs_find_last_root(tree_root, root->root_key.objectid, 5571 NULL, NULL); 5572 BUG_ON(ret < 0); 5573 if (ret > 0) { 5574 ret = btrfs_del_orphan_item(trans, tree_root, 5575 root->root_key.objectid); 5576 BUG_ON(ret); 5577 } 5578 } 5579 5580 if (root->in_radix) { 5581 btrfs_free_fs_root(tree_root->fs_info, root); 5582 } else { 5583 free_extent_buffer(root->node); 5584 free_extent_buffer(root->commit_root); 5585 kfree(root); 5586 } 5587 out: 5588 btrfs_end_transaction(trans, tree_root); 5589 kfree(wc); 5590 btrfs_free_path(path); 5591 return err; 5592 } 5593 5594 /* 5595 * drop subtree rooted at tree block 'node'. 
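 * Both 'node' and 'parent' must be tree-locked by the caller; the
 * walk below simply assumes 'parent' carries the full backref flag.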
5596 * 5597 * NOTE: this function will unlock and release tree block 'node' 5598 */ 5599 int btrfs_drop_subtree(struct btrfs_trans_handle *trans, 5600 struct btrfs_root *root, 5601 struct extent_buffer *node, 5602 struct extent_buffer *parent) 5603 { 5604 struct btrfs_path *path; 5605 struct walk_control *wc; 5606 int level; 5607 int parent_level; 5608 int ret = 0; 5609 int wret; 5610 5611 BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID); 5612 5613 path = btrfs_alloc_path(); 5614 BUG_ON(!path); 5615 5616 wc = kzalloc(sizeof(*wc), GFP_NOFS); 5617 BUG_ON(!wc); 5618 5619 btrfs_assert_tree_locked(parent); 5620 parent_level = btrfs_header_level(parent); 5621 extent_buffer_get(parent); 5622 path->nodes[parent_level] = parent; 5623 path->slots[parent_level] = btrfs_header_nritems(parent); 5624 5625 btrfs_assert_tree_locked(node); 5626 level = btrfs_header_level(node); 5627 path->nodes[level] = node; 5628 path->slots[level] = 0; 5629 path->locks[level] = 1; 5630 5631 wc->refs[parent_level] = 1; 5632 wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF; 5633 wc->level = level; 5634 wc->shared_level = -1; 5635 wc->stage = DROP_REFERENCE; 5636 wc->update_ref = 0; 5637 wc->keep_locks = 1; 5638 wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root); 5639 5640 while (1) { 5641 wret = walk_down_tree(trans, root, path, wc); 5642 if (wret < 0) { 5643 ret = wret; 5644 break; 5645 } 5646 5647 wret = walk_up_tree(trans, root, path, wc, parent_level); 5648 if (wret < 0) 5649 ret = wret; 5650 if (wret != 0) 5651 break; 5652 } 5653 5654 kfree(wc); 5655 btrfs_free_path(path); 5656 return ret; 5657 } 5658 5659 #if 0 5660 static unsigned long calc_ra(unsigned long start, unsigned long last, 5661 unsigned long nr) 5662 { 5663 return min(last, start + nr - 1); 5664 } 5665 5666 static noinline int relocate_inode_pages(struct inode *inode, u64 start, 5667 u64 len) 5668 { 5669 u64 page_start; 5670 u64 page_end; 5671 unsigned long first_index; 5672 unsigned long last_index; 5673 unsigned long i; 5674 struct page *page; 5675 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 5676 struct file_ra_state *ra; 5677 struct btrfs_ordered_extent *ordered; 5678 unsigned int total_read = 0; 5679 unsigned int total_dirty = 0; 5680 int ret = 0; 5681 5682 ra = kzalloc(sizeof(*ra), GFP_NOFS); 5683 5684 mutex_lock(&inode->i_mutex); 5685 first_index = start >> PAGE_CACHE_SHIFT; 5686 last_index = (start + len - 1) >> PAGE_CACHE_SHIFT; 5687 5688 /* make sure the dirty trick played by the caller work */ 5689 ret = invalidate_inode_pages2_range(inode->i_mapping, 5690 first_index, last_index); 5691 if (ret) 5692 goto out_unlock; 5693 5694 file_ra_state_init(ra, inode->i_mapping); 5695 5696 for (i = first_index ; i <= last_index; i++) { 5697 if (total_read % ra->ra_pages == 0) { 5698 btrfs_force_ra(inode->i_mapping, ra, NULL, i, 5699 calc_ra(i, last_index, ra->ra_pages)); 5700 } 5701 total_read++; 5702 again: 5703 if (((u64)i << PAGE_CACHE_SHIFT) > i_size_read(inode)) 5704 BUG_ON(1); 5705 page = grab_cache_page(inode->i_mapping, i); 5706 if (!page) { 5707 ret = -ENOMEM; 5708 goto out_unlock; 5709 } 5710 if (!PageUptodate(page)) { 5711 btrfs_readpage(NULL, page); 5712 lock_page(page); 5713 if (!PageUptodate(page)) { 5714 unlock_page(page); 5715 page_cache_release(page); 5716 ret = -EIO; 5717 goto out_unlock; 5718 } 5719 } 5720 wait_on_page_writeback(page); 5721 5722 page_start = (u64)page->index << PAGE_CACHE_SHIFT; 5723 page_end = page_start + PAGE_CACHE_SIZE - 1; 5724 lock_extent(io_tree, page_start, page_end, GFP_NOFS); 5725 
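/*
 * if an ordered extent is still pending for this page, drop our
 * locks, wait for the ordered IO to finish and then retry the page
 */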
5726 ordered = btrfs_lookup_ordered_extent(inode, page_start); 5727 if (ordered) { 5728 unlock_extent(io_tree, page_start, page_end, GFP_NOFS); 5729 unlock_page(page); 5730 page_cache_release(page); 5731 btrfs_start_ordered_extent(inode, ordered, 1); 5732 btrfs_put_ordered_extent(ordered); 5733 goto again; 5734 } 5735 set_page_extent_mapped(page); 5736 5737 if (i == first_index) 5738 set_extent_bits(io_tree, page_start, page_end, 5739 EXTENT_BOUNDARY, GFP_NOFS); 5740 btrfs_set_extent_delalloc(inode, page_start, page_end); 5741 5742 set_page_dirty(page); 5743 total_dirty++; 5744 5745 unlock_extent(io_tree, page_start, page_end, GFP_NOFS); 5746 unlock_page(page); 5747 page_cache_release(page); 5748 } 5749 5750 out_unlock: 5751 kfree(ra); 5752 mutex_unlock(&inode->i_mutex); 5753 balance_dirty_pages_ratelimited_nr(inode->i_mapping, total_dirty); 5754 return ret; 5755 } 5756 5757 static noinline int relocate_data_extent(struct inode *reloc_inode, 5758 struct btrfs_key *extent_key, 5759 u64 offset) 5760 { 5761 struct btrfs_root *root = BTRFS_I(reloc_inode)->root; 5762 struct extent_map_tree *em_tree = &BTRFS_I(reloc_inode)->extent_tree; 5763 struct extent_map *em; 5764 u64 start = extent_key->objectid - offset; 5765 u64 end = start + extent_key->offset - 1; 5766 5767 em = alloc_extent_map(GFP_NOFS); 5768 BUG_ON(!em || IS_ERR(em)); 5769 5770 em->start = start; 5771 em->len = extent_key->offset; 5772 em->block_len = extent_key->offset; 5773 em->block_start = extent_key->objectid; 5774 em->bdev = root->fs_info->fs_devices->latest_bdev; 5775 set_bit(EXTENT_FLAG_PINNED, &em->flags); 5776 5777 /* setup extent map to cheat btrfs_readpage */ 5778 lock_extent(&BTRFS_I(reloc_inode)->io_tree, start, end, GFP_NOFS); 5779 while (1) { 5780 int ret; 5781 write_lock(&em_tree->lock); 5782 ret = add_extent_mapping(em_tree, em); 5783 write_unlock(&em_tree->lock); 5784 if (ret != -EEXIST) { 5785 free_extent_map(em); 5786 break; 5787 } 5788 btrfs_drop_extent_cache(reloc_inode, start, end, 0); 5789 } 5790 unlock_extent(&BTRFS_I(reloc_inode)->io_tree, start, end, GFP_NOFS); 5791 5792 return relocate_inode_pages(reloc_inode, start, extent_key->offset); 5793 } 5794 5795 struct btrfs_ref_path { 5796 u64 extent_start; 5797 u64 nodes[BTRFS_MAX_LEVEL]; 5798 u64 root_objectid; 5799 u64 root_generation; 5800 u64 owner_objectid; 5801 u32 num_refs; 5802 int lowest_level; 5803 int current_level; 5804 int shared_level; 5805 5806 struct btrfs_key node_keys[BTRFS_MAX_LEVEL]; 5807 u64 new_nodes[BTRFS_MAX_LEVEL]; 5808 }; 5809 5810 struct disk_extent { 5811 u64 ram_bytes; 5812 u64 disk_bytenr; 5813 u64 disk_num_bytes; 5814 u64 offset; 5815 u64 num_bytes; 5816 u8 compression; 5817 u8 encryption; 5818 u16 other_encoding; 5819 }; 5820 5821 static int is_cowonly_root(u64 root_objectid) 5822 { 5823 if (root_objectid == BTRFS_ROOT_TREE_OBJECTID || 5824 root_objectid == BTRFS_EXTENT_TREE_OBJECTID || 5825 root_objectid == BTRFS_CHUNK_TREE_OBJECTID || 5826 root_objectid == BTRFS_DEV_TREE_OBJECTID || 5827 root_objectid == BTRFS_TREE_LOG_OBJECTID || 5828 root_objectid == BTRFS_CSUM_TREE_OBJECTID) 5829 return 1; 5830 return 0; 5831 } 5832 5833 static noinline int __next_ref_path(struct btrfs_trans_handle *trans, 5834 struct btrfs_root *extent_root, 5835 struct btrfs_ref_path *ref_path, 5836 int first_time) 5837 { 5838 struct extent_buffer *leaf; 5839 struct btrfs_path *path; 5840 struct btrfs_extent_ref *ref; 5841 struct btrfs_key key; 5842 struct btrfs_key found_key; 5843 u64 bytenr; 5844 u32 nritems; 5845 int level; 5846 int ret = 1; 5847 5848 
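/*
 * The loop below alternates between two phases: walk_up follows
 * backrefs from the current block up toward a tree root, while
 * walk_down backtracks to the deepest level that still has an
 * unvisited backref so that the next path can be explored.
 */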
path = btrfs_alloc_path(); 5849 if (!path) 5850 return -ENOMEM; 5851 5852 if (first_time) { 5853 ref_path->lowest_level = -1; 5854 ref_path->current_level = -1; 5855 ref_path->shared_level = -1; 5856 goto walk_up; 5857 } 5858 walk_down: 5859 level = ref_path->current_level - 1; 5860 while (level >= -1) { 5861 u64 parent; 5862 if (level < ref_path->lowest_level) 5863 break; 5864 5865 if (level >= 0) 5866 bytenr = ref_path->nodes[level]; 5867 else 5868 bytenr = ref_path->extent_start; 5869 BUG_ON(bytenr == 0); 5870 5871 parent = ref_path->nodes[level + 1]; 5872 ref_path->nodes[level + 1] = 0; 5873 ref_path->current_level = level; 5874 BUG_ON(parent == 0); 5875 5876 key.objectid = bytenr; 5877 key.offset = parent + 1; 5878 key.type = BTRFS_EXTENT_REF_KEY; 5879 5880 ret = btrfs_search_slot(trans, extent_root, &key, path, 0, 0); 5881 if (ret < 0) 5882 goto out; 5883 BUG_ON(ret == 0); 5884 5885 leaf = path->nodes[0]; 5886 nritems = btrfs_header_nritems(leaf); 5887 if (path->slots[0] >= nritems) { 5888 ret = btrfs_next_leaf(extent_root, path); 5889 if (ret < 0) 5890 goto out; 5891 if (ret > 0) 5892 goto next; 5893 leaf = path->nodes[0]; 5894 } 5895 5896 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 5897 if (found_key.objectid == bytenr && 5898 found_key.type == BTRFS_EXTENT_REF_KEY) { 5899 if (level < ref_path->shared_level) 5900 ref_path->shared_level = level; 5901 goto found; 5902 } 5903 next: 5904 level--; 5905 btrfs_release_path(extent_root, path); 5906 cond_resched(); 5907 } 5908 /* reached lowest level */ 5909 ret = 1; 5910 goto out; 5911 walk_up: 5912 level = ref_path->current_level; 5913 while (level < BTRFS_MAX_LEVEL - 1) { 5914 u64 ref_objectid; 5915 5916 if (level >= 0) 5917 bytenr = ref_path->nodes[level]; 5918 else 5919 bytenr = ref_path->extent_start; 5920 5921 BUG_ON(bytenr == 0); 5922 5923 key.objectid = bytenr; 5924 key.offset = 0; 5925 key.type = BTRFS_EXTENT_REF_KEY; 5926 5927 ret = btrfs_search_slot(trans, extent_root, &key, path, 0, 0); 5928 if (ret < 0) 5929 goto out; 5930 5931 leaf = path->nodes[0]; 5932 nritems = btrfs_header_nritems(leaf); 5933 if (path->slots[0] >= nritems) { 5934 ret = btrfs_next_leaf(extent_root, path); 5935 if (ret < 0) 5936 goto out; 5937 if (ret > 0) { 5938 /* the extent was freed by someone */ 5939 if (ref_path->lowest_level == level) 5940 goto out; 5941 btrfs_release_path(extent_root, path); 5942 goto walk_down; 5943 } 5944 leaf = path->nodes[0]; 5945 } 5946 5947 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 5948 if (found_key.objectid != bytenr || 5949 found_key.type != BTRFS_EXTENT_REF_KEY) { 5950 /* the extent was freed by someone */ 5951 if (ref_path->lowest_level == level) { 5952 ret = 1; 5953 goto out; 5954 } 5955 btrfs_release_path(extent_root, path); 5956 goto walk_down; 5957 } 5958 found: 5959 ref = btrfs_item_ptr(leaf, path->slots[0], 5960 struct btrfs_extent_ref); 5961 ref_objectid = btrfs_ref_objectid(leaf, ref); 5962 if (ref_objectid < BTRFS_FIRST_FREE_OBJECTID) { 5963 if (first_time) { 5964 level = (int)ref_objectid; 5965 BUG_ON(level >= BTRFS_MAX_LEVEL); 5966 ref_path->lowest_level = level; 5967 ref_path->current_level = level; 5968 ref_path->nodes[level] = bytenr; 5969 } else { 5970 WARN_ON(ref_objectid != level); 5971 } 5972 } else { 5973 WARN_ON(level != -1); 5974 } 5975 first_time = 0; 5976 5977 if (ref_path->lowest_level == level) { 5978 ref_path->owner_objectid = ref_objectid; 5979 ref_path->num_refs = btrfs_ref_num_refs(leaf, ref); 5980 } 5981 5982 /* 5983 * the block is tree root or the block isn't in 
reference 5984 * counted tree. 5985 */ 5986 if (found_key.objectid == found_key.offset || 5987 is_cowonly_root(btrfs_ref_root(leaf, ref))) { 5988 ref_path->root_objectid = btrfs_ref_root(leaf, ref); 5989 ref_path->root_generation = 5990 btrfs_ref_generation(leaf, ref); 5991 if (level < 0) { 5992 /* special reference from the tree log */ 5993 ref_path->nodes[0] = found_key.offset; 5994 ref_path->current_level = 0; 5995 } 5996 ret = 0; 5997 goto out; 5998 } 5999 6000 level++; 6001 BUG_ON(ref_path->nodes[level] != 0); 6002 ref_path->nodes[level] = found_key.offset; 6003 ref_path->current_level = level; 6004 6005 /* 6006 * the reference was created in the running transaction, 6007 * no need to continue walking up. 6008 */ 6009 if (btrfs_ref_generation(leaf, ref) == trans->transid) { 6010 ref_path->root_objectid = btrfs_ref_root(leaf, ref); 6011 ref_path->root_generation = 6012 btrfs_ref_generation(leaf, ref); 6013 ret = 0; 6014 goto out; 6015 } 6016 6017 btrfs_release_path(extent_root, path); 6018 cond_resched(); 6019 } 6020 /* reached max tree level, but no tree root found. */ 6021 BUG(); 6022 out: 6023 btrfs_free_path(path); 6024 return ret; 6025 } 6026 6027 static int btrfs_first_ref_path(struct btrfs_trans_handle *trans, 6028 struct btrfs_root *extent_root, 6029 struct btrfs_ref_path *ref_path, 6030 u64 extent_start) 6031 { 6032 memset(ref_path, 0, sizeof(*ref_path)); 6033 ref_path->extent_start = extent_start; 6034 6035 return __next_ref_path(trans, extent_root, ref_path, 1); 6036 } 6037 6038 static int btrfs_next_ref_path(struct btrfs_trans_handle *trans, 6039 struct btrfs_root *extent_root, 6040 struct btrfs_ref_path *ref_path) 6041 { 6042 return __next_ref_path(trans, extent_root, ref_path, 0); 6043 } 6044 6045 static noinline int get_new_locations(struct inode *reloc_inode, 6046 struct btrfs_key *extent_key, 6047 u64 offset, int no_fragment, 6048 struct disk_extent **extents, 6049 int *nr_extents) 6050 { 6051 struct btrfs_root *root = BTRFS_I(reloc_inode)->root; 6052 struct btrfs_path *path; 6053 struct btrfs_file_extent_item *fi; 6054 struct extent_buffer *leaf; 6055 struct disk_extent *exts = *extents; 6056 struct btrfs_key found_key; 6057 u64 cur_pos; 6058 u64 last_byte; 6059 u32 nritems; 6060 int nr = 0; 6061 int max = *nr_extents; 6062 int ret; 6063 6064 WARN_ON(!no_fragment && *extents); 6065 if (!exts) { 6066 max = 1; 6067 exts = kmalloc(sizeof(*exts) * max, GFP_NOFS); 6068 if (!exts) 6069 return -ENOMEM; 6070 } 6071 6072 path = btrfs_alloc_path(); 6073 BUG_ON(!path); 6074 6075 cur_pos = extent_key->objectid - offset; 6076 last_byte = extent_key->objectid + extent_key->offset; 6077 ret = btrfs_lookup_file_extent(NULL, root, path, reloc_inode->i_ino, 6078 cur_pos, 0); 6079 if (ret < 0) 6080 goto out; 6081 if (ret > 0) { 6082 ret = -ENOENT; 6083 goto out; 6084 } 6085 6086 while (1) { 6087 leaf = path->nodes[0]; 6088 nritems = btrfs_header_nritems(leaf); 6089 if (path->slots[0] >= nritems) { 6090 ret = btrfs_next_leaf(root, path); 6091 if (ret < 0) 6092 goto out; 6093 if (ret > 0) 6094 break; 6095 leaf = path->nodes[0]; 6096 } 6097 6098 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 6099 if (found_key.offset != cur_pos || 6100 found_key.type != BTRFS_EXTENT_DATA_KEY || 6101 found_key.objectid != reloc_inode->i_ino) 6102 break; 6103 6104 fi = btrfs_item_ptr(leaf, path->slots[0], 6105 struct btrfs_file_extent_item); 6106 if (btrfs_file_extent_type(leaf, fi) != 6107 BTRFS_FILE_EXTENT_REG || 6108 btrfs_file_extent_disk_bytenr(leaf, fi) == 0) 6109 break; 6110 6111 if (nr == 
max) { 6112 struct disk_extent *old = exts; 6113 max *= 2; 6114 exts = kzalloc(sizeof(*exts) * max, GFP_NOFS); 6115 memcpy(exts, old, sizeof(*exts) * nr); 6116 if (old != *extents) 6117 kfree(old); 6118 } 6119 6120 exts[nr].disk_bytenr = 6121 btrfs_file_extent_disk_bytenr(leaf, fi); 6122 exts[nr].disk_num_bytes = 6123 btrfs_file_extent_disk_num_bytes(leaf, fi); 6124 exts[nr].offset = btrfs_file_extent_offset(leaf, fi); 6125 exts[nr].num_bytes = btrfs_file_extent_num_bytes(leaf, fi); 6126 exts[nr].ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi); 6127 exts[nr].compression = btrfs_file_extent_compression(leaf, fi); 6128 exts[nr].encryption = btrfs_file_extent_encryption(leaf, fi); 6129 exts[nr].other_encoding = btrfs_file_extent_other_encoding(leaf, 6130 fi); 6131 BUG_ON(exts[nr].offset > 0); 6132 BUG_ON(exts[nr].compression || exts[nr].encryption); 6133 BUG_ON(exts[nr].num_bytes != exts[nr].disk_num_bytes); 6134 6135 cur_pos += exts[nr].num_bytes; 6136 nr++; 6137 6138 if (cur_pos + offset >= last_byte) 6139 break; 6140 6141 if (no_fragment) { 6142 ret = 1; 6143 goto out; 6144 } 6145 path->slots[0]++; 6146 } 6147 6148 BUG_ON(cur_pos + offset > last_byte); 6149 if (cur_pos + offset < last_byte) { 6150 ret = -ENOENT; 6151 goto out; 6152 } 6153 ret = 0; 6154 out: 6155 btrfs_free_path(path); 6156 if (ret) { 6157 if (exts != *extents) 6158 kfree(exts); 6159 } else { 6160 *extents = exts; 6161 *nr_extents = nr; 6162 } 6163 return ret; 6164 } 6165 6166 static noinline int replace_one_extent(struct btrfs_trans_handle *trans, 6167 struct btrfs_root *root, 6168 struct btrfs_path *path, 6169 struct btrfs_key *extent_key, 6170 struct btrfs_key *leaf_key, 6171 struct btrfs_ref_path *ref_path, 6172 struct disk_extent *new_extents, 6173 int nr_extents) 6174 { 6175 struct extent_buffer *leaf; 6176 struct btrfs_file_extent_item *fi; 6177 struct inode *inode = NULL; 6178 struct btrfs_key key; 6179 u64 lock_start = 0; 6180 u64 lock_end = 0; 6181 u64 num_bytes; 6182 u64 ext_offset; 6183 u64 search_end = (u64)-1; 6184 u32 nritems; 6185 int nr_scaned = 0; 6186 int extent_locked = 0; 6187 int extent_type; 6188 int ret; 6189 6190 memcpy(&key, leaf_key, sizeof(key)); 6191 if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS) { 6192 if (key.objectid < ref_path->owner_objectid || 6193 (key.objectid == ref_path->owner_objectid && 6194 key.type < BTRFS_EXTENT_DATA_KEY)) { 6195 key.objectid = ref_path->owner_objectid; 6196 key.type = BTRFS_EXTENT_DATA_KEY; 6197 key.offset = 0; 6198 } 6199 } 6200 6201 while (1) { 6202 ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 6203 if (ret < 0) 6204 goto out; 6205 6206 leaf = path->nodes[0]; 6207 nritems = btrfs_header_nritems(leaf); 6208 next: 6209 if (extent_locked && ret > 0) { 6210 /* 6211 * the file extent item was modified by someone 6212 * before the extent got locked. 
6213 */ 6214 unlock_extent(&BTRFS_I(inode)->io_tree, lock_start, 6215 lock_end, GFP_NOFS); 6216 extent_locked = 0; 6217 } 6218 6219 if (path->slots[0] >= nritems) { 6220 if (++nr_scaned > 2) 6221 break; 6222 6223 BUG_ON(extent_locked); 6224 ret = btrfs_next_leaf(root, path); 6225 if (ret < 0) 6226 goto out; 6227 if (ret > 0) 6228 break; 6229 leaf = path->nodes[0]; 6230 nritems = btrfs_header_nritems(leaf); 6231 } 6232 6233 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 6234 6235 if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS) { 6236 if ((key.objectid > ref_path->owner_objectid) || 6237 (key.objectid == ref_path->owner_objectid && 6238 key.type > BTRFS_EXTENT_DATA_KEY) || 6239 key.offset >= search_end) 6240 break; 6241 } 6242 6243 if (inode && key.objectid != inode->i_ino) { 6244 BUG_ON(extent_locked); 6245 btrfs_release_path(root, path); 6246 mutex_unlock(&inode->i_mutex); 6247 iput(inode); 6248 inode = NULL; 6249 continue; 6250 } 6251 6252 if (key.type != BTRFS_EXTENT_DATA_KEY) { 6253 path->slots[0]++; 6254 ret = 1; 6255 goto next; 6256 } 6257 fi = btrfs_item_ptr(leaf, path->slots[0], 6258 struct btrfs_file_extent_item); 6259 extent_type = btrfs_file_extent_type(leaf, fi); 6260 if ((extent_type != BTRFS_FILE_EXTENT_REG && 6261 extent_type != BTRFS_FILE_EXTENT_PREALLOC) || 6262 (btrfs_file_extent_disk_bytenr(leaf, fi) != 6263 extent_key->objectid)) { 6264 path->slots[0]++; 6265 ret = 1; 6266 goto next; 6267 } 6268 6269 num_bytes = btrfs_file_extent_num_bytes(leaf, fi); 6270 ext_offset = btrfs_file_extent_offset(leaf, fi); 6271 6272 if (search_end == (u64)-1) { 6273 search_end = key.offset - ext_offset + 6274 btrfs_file_extent_ram_bytes(leaf, fi); 6275 } 6276 6277 if (!extent_locked) { 6278 lock_start = key.offset; 6279 lock_end = lock_start + num_bytes - 1; 6280 } else { 6281 if (lock_start > key.offset || 6282 lock_end + 1 < key.offset + num_bytes) { 6283 unlock_extent(&BTRFS_I(inode)->io_tree, 6284 lock_start, lock_end, GFP_NOFS); 6285 extent_locked = 0; 6286 } 6287 } 6288 6289 if (!inode) { 6290 btrfs_release_path(root, path); 6291 6292 inode = btrfs_iget_locked(root->fs_info->sb, 6293 key.objectid, root); 6294 if (inode->i_state & I_NEW) { 6295 BTRFS_I(inode)->root = root; 6296 BTRFS_I(inode)->location.objectid = 6297 key.objectid; 6298 BTRFS_I(inode)->location.type = 6299 BTRFS_INODE_ITEM_KEY; 6300 BTRFS_I(inode)->location.offset = 0; 6301 btrfs_read_locked_inode(inode); 6302 unlock_new_inode(inode); 6303 } 6304 /* 6305 * some code call btrfs_commit_transaction while 6306 * holding the i_mutex, so we can't use mutex_lock 6307 * here. 
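			 * Use mutex_trylock instead and simply skip this
			 * inode if the lock cannot be taken right away.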
6308 */ 6309 if (is_bad_inode(inode) || 6310 !mutex_trylock(&inode->i_mutex)) { 6311 iput(inode); 6312 inode = NULL; 6313 key.offset = (u64)-1; 6314 goto skip; 6315 } 6316 } 6317 6318 if (!extent_locked) { 6319 struct btrfs_ordered_extent *ordered; 6320 6321 btrfs_release_path(root, path); 6322 6323 lock_extent(&BTRFS_I(inode)->io_tree, lock_start, 6324 lock_end, GFP_NOFS); 6325 ordered = btrfs_lookup_first_ordered_extent(inode, 6326 lock_end); 6327 if (ordered && 6328 ordered->file_offset <= lock_end && 6329 ordered->file_offset + ordered->len > lock_start) { 6330 unlock_extent(&BTRFS_I(inode)->io_tree, 6331 lock_start, lock_end, GFP_NOFS); 6332 btrfs_start_ordered_extent(inode, ordered, 1); 6333 btrfs_put_ordered_extent(ordered); 6334 key.offset += num_bytes; 6335 goto skip; 6336 } 6337 if (ordered) 6338 btrfs_put_ordered_extent(ordered); 6339 6340 extent_locked = 1; 6341 continue; 6342 } 6343 6344 if (nr_extents == 1) { 6345 /* update extent pointer in place */ 6346 btrfs_set_file_extent_disk_bytenr(leaf, fi, 6347 new_extents[0].disk_bytenr); 6348 btrfs_set_file_extent_disk_num_bytes(leaf, fi, 6349 new_extents[0].disk_num_bytes); 6350 btrfs_mark_buffer_dirty(leaf); 6351 6352 btrfs_drop_extent_cache(inode, key.offset, 6353 key.offset + num_bytes - 1, 0); 6354 6355 ret = btrfs_inc_extent_ref(trans, root, 6356 new_extents[0].disk_bytenr, 6357 new_extents[0].disk_num_bytes, 6358 leaf->start, 6359 root->root_key.objectid, 6360 trans->transid, 6361 key.objectid); 6362 BUG_ON(ret); 6363 6364 ret = btrfs_free_extent(trans, root, 6365 extent_key->objectid, 6366 extent_key->offset, 6367 leaf->start, 6368 btrfs_header_owner(leaf), 6369 btrfs_header_generation(leaf), 6370 key.objectid, 0); 6371 BUG_ON(ret); 6372 6373 btrfs_release_path(root, path); 6374 key.offset += num_bytes; 6375 } else { 6376 BUG_ON(1); 6377 #if 0 6378 u64 alloc_hint; 6379 u64 extent_len; 6380 int i; 6381 /* 6382 * drop old extent pointer at first, then insert the 6383 * new pointers one bye one 6384 */ 6385 btrfs_release_path(root, path); 6386 ret = btrfs_drop_extents(trans, root, inode, key.offset, 6387 key.offset + num_bytes, 6388 key.offset, &alloc_hint); 6389 BUG_ON(ret); 6390 6391 for (i = 0; i < nr_extents; i++) { 6392 if (ext_offset >= new_extents[i].num_bytes) { 6393 ext_offset -= new_extents[i].num_bytes; 6394 continue; 6395 } 6396 extent_len = min(new_extents[i].num_bytes - 6397 ext_offset, num_bytes); 6398 6399 ret = btrfs_insert_empty_item(trans, root, 6400 path, &key, 6401 sizeof(*fi)); 6402 BUG_ON(ret); 6403 6404 leaf = path->nodes[0]; 6405 fi = btrfs_item_ptr(leaf, path->slots[0], 6406 struct btrfs_file_extent_item); 6407 btrfs_set_file_extent_generation(leaf, fi, 6408 trans->transid); 6409 btrfs_set_file_extent_type(leaf, fi, 6410 BTRFS_FILE_EXTENT_REG); 6411 btrfs_set_file_extent_disk_bytenr(leaf, fi, 6412 new_extents[i].disk_bytenr); 6413 btrfs_set_file_extent_disk_num_bytes(leaf, fi, 6414 new_extents[i].disk_num_bytes); 6415 btrfs_set_file_extent_ram_bytes(leaf, fi, 6416 new_extents[i].ram_bytes); 6417 6418 btrfs_set_file_extent_compression(leaf, fi, 6419 new_extents[i].compression); 6420 btrfs_set_file_extent_encryption(leaf, fi, 6421 new_extents[i].encryption); 6422 btrfs_set_file_extent_other_encoding(leaf, fi, 6423 new_extents[i].other_encoding); 6424 6425 btrfs_set_file_extent_num_bytes(leaf, fi, 6426 extent_len); 6427 ext_offset += new_extents[i].offset; 6428 btrfs_set_file_extent_offset(leaf, fi, 6429 ext_offset); 6430 btrfs_mark_buffer_dirty(leaf); 6431 6432 btrfs_drop_extent_cache(inode, key.offset, 
6433 key.offset + extent_len - 1, 0); 6434 6435 ret = btrfs_inc_extent_ref(trans, root, 6436 new_extents[i].disk_bytenr, 6437 new_extents[i].disk_num_bytes, 6438 leaf->start, 6439 root->root_key.objectid, 6440 trans->transid, key.objectid); 6441 BUG_ON(ret); 6442 btrfs_release_path(root, path); 6443 6444 inode_add_bytes(inode, extent_len); 6445 6446 ext_offset = 0; 6447 num_bytes -= extent_len; 6448 key.offset += extent_len; 6449 6450 if (num_bytes == 0) 6451 break; 6452 } 6453 BUG_ON(i >= nr_extents); 6454 #endif 6455 } 6456 6457 if (extent_locked) { 6458 unlock_extent(&BTRFS_I(inode)->io_tree, lock_start, 6459 lock_end, GFP_NOFS); 6460 extent_locked = 0; 6461 } 6462 skip: 6463 if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS && 6464 key.offset >= search_end) 6465 break; 6466 6467 cond_resched(); 6468 } 6469 ret = 0; 6470 out: 6471 btrfs_release_path(root, path); 6472 if (inode) { 6473 mutex_unlock(&inode->i_mutex); 6474 if (extent_locked) { 6475 unlock_extent(&BTRFS_I(inode)->io_tree, lock_start, 6476 lock_end, GFP_NOFS); 6477 } 6478 iput(inode); 6479 } 6480 return ret; 6481 } 6482 6483 int btrfs_reloc_tree_cache_ref(struct btrfs_trans_handle *trans, 6484 struct btrfs_root *root, 6485 struct extent_buffer *buf, u64 orig_start) 6486 { 6487 int level; 6488 int ret; 6489 6490 BUG_ON(btrfs_header_generation(buf) != trans->transid); 6491 BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID); 6492 6493 level = btrfs_header_level(buf); 6494 if (level == 0) { 6495 struct btrfs_leaf_ref *ref; 6496 struct btrfs_leaf_ref *orig_ref; 6497 6498 orig_ref = btrfs_lookup_leaf_ref(root, orig_start); 6499 if (!orig_ref) 6500 return -ENOENT; 6501 6502 ref = btrfs_alloc_leaf_ref(root, orig_ref->nritems); 6503 if (!ref) { 6504 btrfs_free_leaf_ref(root, orig_ref); 6505 return -ENOMEM; 6506 } 6507 6508 ref->nritems = orig_ref->nritems; 6509 memcpy(ref->extents, orig_ref->extents, 6510 sizeof(ref->extents[0]) * ref->nritems); 6511 6512 btrfs_free_leaf_ref(root, orig_ref); 6513 6514 ref->root_gen = trans->transid; 6515 ref->bytenr = buf->start; 6516 ref->owner = btrfs_header_owner(buf); 6517 ref->generation = btrfs_header_generation(buf); 6518 6519 ret = btrfs_add_leaf_ref(root, ref, 0); 6520 WARN_ON(ret); 6521 btrfs_free_leaf_ref(root, ref); 6522 } 6523 return 0; 6524 } 6525 6526 static noinline int invalidate_extent_cache(struct btrfs_root *root, 6527 struct extent_buffer *leaf, 6528 struct btrfs_block_group_cache *group, 6529 struct btrfs_root *target_root) 6530 { 6531 struct btrfs_key key; 6532 struct inode *inode = NULL; 6533 struct btrfs_file_extent_item *fi; 6534 u64 num_bytes; 6535 u64 skip_objectid = 0; 6536 u32 nritems; 6537 u32 i; 6538 6539 nritems = btrfs_header_nritems(leaf); 6540 for (i = 0; i < nritems; i++) { 6541 btrfs_item_key_to_cpu(leaf, &key, i); 6542 if (key.objectid == skip_objectid || 6543 key.type != BTRFS_EXTENT_DATA_KEY) 6544 continue; 6545 fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item); 6546 if (btrfs_file_extent_type(leaf, fi) == 6547 BTRFS_FILE_EXTENT_INLINE) 6548 continue; 6549 if (btrfs_file_extent_disk_bytenr(leaf, fi) == 0) 6550 continue; 6551 if (!inode || inode->i_ino != key.objectid) { 6552 iput(inode); 6553 inode = btrfs_ilookup(target_root->fs_info->sb, 6554 key.objectid, target_root, 1); 6555 } 6556 if (!inode) { 6557 skip_objectid = key.objectid; 6558 continue; 6559 } 6560 num_bytes = btrfs_file_extent_num_bytes(leaf, fi); 6561 6562 lock_extent(&BTRFS_I(inode)->io_tree, key.offset, 6563 key.offset + num_bytes - 1, GFP_NOFS); 6564 
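		/*
		 * drop the cached extent mapping for this file range so
		 * it is rebuilt from the updated file extent items
		 */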
btrfs_drop_extent_cache(inode, key.offset, 6565 key.offset + num_bytes - 1, 1); 6566 unlock_extent(&BTRFS_I(inode)->io_tree, key.offset, 6567 key.offset + num_bytes - 1, GFP_NOFS); 6568 cond_resched(); 6569 } 6570 iput(inode); 6571 return 0; 6572 } 6573 6574 static noinline int replace_extents_in_leaf(struct btrfs_trans_handle *trans, 6575 struct btrfs_root *root, 6576 struct extent_buffer *leaf, 6577 struct btrfs_block_group_cache *group, 6578 struct inode *reloc_inode) 6579 { 6580 struct btrfs_key key; 6581 struct btrfs_key extent_key; 6582 struct btrfs_file_extent_item *fi; 6583 struct btrfs_leaf_ref *ref; 6584 struct disk_extent *new_extent; 6585 u64 bytenr; 6586 u64 num_bytes; 6587 u32 nritems; 6588 u32 i; 6589 int ext_index; 6590 int nr_extent; 6591 int ret; 6592 6593 new_extent = kmalloc(sizeof(*new_extent), GFP_NOFS); 6594 BUG_ON(!new_extent); 6595 6596 ref = btrfs_lookup_leaf_ref(root, leaf->start); 6597 BUG_ON(!ref); 6598 6599 ext_index = -1; 6600 nritems = btrfs_header_nritems(leaf); 6601 for (i = 0; i < nritems; i++) { 6602 btrfs_item_key_to_cpu(leaf, &key, i); 6603 if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY) 6604 continue; 6605 fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item); 6606 if (btrfs_file_extent_type(leaf, fi) == 6607 BTRFS_FILE_EXTENT_INLINE) 6608 continue; 6609 bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); 6610 num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); 6611 if (bytenr == 0) 6612 continue; 6613 6614 ext_index++; 6615 if (bytenr >= group->key.objectid + group->key.offset || 6616 bytenr + num_bytes <= group->key.objectid) 6617 continue; 6618 6619 extent_key.objectid = bytenr; 6620 extent_key.offset = num_bytes; 6621 extent_key.type = BTRFS_EXTENT_ITEM_KEY; 6622 nr_extent = 1; 6623 ret = get_new_locations(reloc_inode, &extent_key, 6624 group->key.objectid, 1, 6625 &new_extent, &nr_extent); 6626 if (ret > 0) 6627 continue; 6628 BUG_ON(ret < 0); 6629 6630 BUG_ON(ref->extents[ext_index].bytenr != bytenr); 6631 BUG_ON(ref->extents[ext_index].num_bytes != num_bytes); 6632 ref->extents[ext_index].bytenr = new_extent->disk_bytenr; 6633 ref->extents[ext_index].num_bytes = new_extent->disk_num_bytes; 6634 6635 btrfs_set_file_extent_disk_bytenr(leaf, fi, 6636 new_extent->disk_bytenr); 6637 btrfs_set_file_extent_disk_num_bytes(leaf, fi, 6638 new_extent->disk_num_bytes); 6639 btrfs_mark_buffer_dirty(leaf); 6640 6641 ret = btrfs_inc_extent_ref(trans, root, 6642 new_extent->disk_bytenr, 6643 new_extent->disk_num_bytes, 6644 leaf->start, 6645 root->root_key.objectid, 6646 trans->transid, key.objectid); 6647 BUG_ON(ret); 6648 6649 ret = btrfs_free_extent(trans, root, 6650 bytenr, num_bytes, leaf->start, 6651 btrfs_header_owner(leaf), 6652 btrfs_header_generation(leaf), 6653 key.objectid, 0); 6654 BUG_ON(ret); 6655 cond_resched(); 6656 } 6657 kfree(new_extent); 6658 BUG_ON(ext_index + 1 != ref->nritems); 6659 btrfs_free_leaf_ref(root, ref); 6660 return 0; 6661 } 6662 6663 int btrfs_free_reloc_root(struct btrfs_trans_handle *trans, 6664 struct btrfs_root *root) 6665 { 6666 struct btrfs_root *reloc_root; 6667 int ret; 6668 6669 if (root->reloc_root) { 6670 reloc_root = root->reloc_root; 6671 root->reloc_root = NULL; 6672 list_add(&reloc_root->dead_list, 6673 &root->fs_info->dead_reloc_roots); 6674 6675 btrfs_set_root_bytenr(&reloc_root->root_item, 6676 reloc_root->node->start); 6677 btrfs_set_root_level(&root->root_item, 6678 btrfs_header_level(reloc_root->node)); 6679 memset(&reloc_root->root_item.drop_progress, 0, 6680 sizeof(struct btrfs_disk_key)); 
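		/* reset the drop progress so the dead reloc root is dropped from the beginning */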
6681 reloc_root->root_item.drop_level = 0; 6682 6683 ret = btrfs_update_root(trans, root->fs_info->tree_root, 6684 &reloc_root->root_key, 6685 &reloc_root->root_item); 6686 BUG_ON(ret); 6687 } 6688 return 0; 6689 } 6690 6691 int btrfs_drop_dead_reloc_roots(struct btrfs_root *root) 6692 { 6693 struct btrfs_trans_handle *trans; 6694 struct btrfs_root *reloc_root; 6695 struct btrfs_root *prev_root = NULL; 6696 struct list_head dead_roots; 6697 int ret; 6698 unsigned long nr; 6699 6700 INIT_LIST_HEAD(&dead_roots); 6701 list_splice_init(&root->fs_info->dead_reloc_roots, &dead_roots); 6702 6703 while (!list_empty(&dead_roots)) { 6704 reloc_root = list_entry(dead_roots.prev, 6705 struct btrfs_root, dead_list); 6706 list_del_init(&reloc_root->dead_list); 6707 6708 BUG_ON(reloc_root->commit_root != NULL); 6709 while (1) { 6710 trans = btrfs_join_transaction(root, 1); 6711 BUG_ON(!trans); 6712 6713 mutex_lock(&root->fs_info->drop_mutex); 6714 ret = btrfs_drop_snapshot(trans, reloc_root); 6715 if (ret != -EAGAIN) 6716 break; 6717 mutex_unlock(&root->fs_info->drop_mutex); 6718 6719 nr = trans->blocks_used; 6720 ret = btrfs_end_transaction(trans, root); 6721 BUG_ON(ret); 6722 btrfs_btree_balance_dirty(root, nr); 6723 } 6724 6725 free_extent_buffer(reloc_root->node); 6726 6727 ret = btrfs_del_root(trans, root->fs_info->tree_root, 6728 &reloc_root->root_key); 6729 BUG_ON(ret); 6730 mutex_unlock(&root->fs_info->drop_mutex); 6731 6732 nr = trans->blocks_used; 6733 ret = btrfs_end_transaction(trans, root); 6734 BUG_ON(ret); 6735 btrfs_btree_balance_dirty(root, nr); 6736 6737 kfree(prev_root); 6738 prev_root = reloc_root; 6739 } 6740 if (prev_root) { 6741 btrfs_remove_leaf_refs(prev_root, (u64)-1, 0); 6742 kfree(prev_root); 6743 } 6744 return 0; 6745 } 6746 6747 int btrfs_add_dead_reloc_root(struct btrfs_root *root) 6748 { 6749 list_add(&root->dead_list, &root->fs_info->dead_reloc_roots); 6750 return 0; 6751 } 6752 6753 int btrfs_cleanup_reloc_trees(struct btrfs_root *root) 6754 { 6755 struct btrfs_root *reloc_root; 6756 struct btrfs_trans_handle *trans; 6757 struct btrfs_key location; 6758 int found; 6759 int ret; 6760 6761 mutex_lock(&root->fs_info->tree_reloc_mutex); 6762 ret = btrfs_find_dead_roots(root, BTRFS_TREE_RELOC_OBJECTID, NULL); 6763 BUG_ON(ret); 6764 found = !list_empty(&root->fs_info->dead_reloc_roots); 6765 mutex_unlock(&root->fs_info->tree_reloc_mutex); 6766 6767 if (found) { 6768 trans = btrfs_start_transaction(root, 1); 6769 BUG_ON(!trans); 6770 ret = btrfs_commit_transaction(trans, root); 6771 BUG_ON(ret); 6772 } 6773 6774 location.objectid = BTRFS_DATA_RELOC_TREE_OBJECTID; 6775 location.offset = (u64)-1; 6776 location.type = BTRFS_ROOT_ITEM_KEY; 6777 6778 reloc_root = btrfs_read_fs_root_no_name(root->fs_info, &location); 6779 BUG_ON(!reloc_root); 6780 btrfs_orphan_cleanup(reloc_root); 6781 return 0; 6782 } 6783 6784 static noinline int init_reloc_tree(struct btrfs_trans_handle *trans, 6785 struct btrfs_root *root) 6786 { 6787 struct btrfs_root *reloc_root; 6788 struct extent_buffer *eb; 6789 struct btrfs_root_item *root_item; 6790 struct btrfs_key root_key; 6791 int ret; 6792 6793 BUG_ON(!root->ref_cows); 6794 if (root->reloc_root) 6795 return 0; 6796 6797 root_item = kmalloc(sizeof(*root_item), GFP_NOFS); 6798 BUG_ON(!root_item); 6799 6800 ret = btrfs_copy_root(trans, root, root->commit_root, 6801 &eb, BTRFS_TREE_RELOC_OBJECTID); 6802 BUG_ON(ret); 6803 6804 root_key.objectid = BTRFS_TREE_RELOC_OBJECTID; 6805 root_key.offset = root->root_key.objectid; 6806 root_key.type = 
BTRFS_ROOT_ITEM_KEY;

	memcpy(root_item, &root->root_item, sizeof(*root_item));
	btrfs_set_root_refs(root_item, 0);
	btrfs_set_root_bytenr(root_item, eb->start);
	btrfs_set_root_level(root_item, btrfs_header_level(eb));
	btrfs_set_root_generation(root_item, trans->transid);

	btrfs_tree_unlock(eb);
	free_extent_buffer(eb);

	ret = btrfs_insert_root(trans, root->fs_info->tree_root,
				&root_key, root_item);
	BUG_ON(ret);
	kfree(root_item);

	reloc_root = btrfs_read_fs_root_no_radix(root->fs_info->tree_root,
						 &root_key);
	BUG_ON(!reloc_root);
	reloc_root->last_trans = trans->transid;
	reloc_root->commit_root = NULL;
	reloc_root->ref_tree = &root->fs_info->reloc_ref_tree;

	root->reloc_root = reloc_root;
	return 0;
}

/*
 * Core function of space balance.
 *
 * The idea is to use reloc trees to relocate tree blocks in reference
 * counted roots.  There is one reloc tree for each subvol, and all
 * reloc trees share the same root key objectid.  Reloc trees are
 * snapshots of the latest committed roots of subvols (root->commit_root).
 *
 * To relocate a tree block referenced by a subvol, there are two steps:
 * COW the block through the subvol's reloc tree, then update the block
 * pointer in the subvol to point to the new block.  Since all reloc
 * trees share the same root key objectid, special handling for tree
 * blocks owned by them is easy.  Once a tree block has been COWed in
 * one reloc tree, we can use the resulting new block directly when the
 * same block is required to COW again through other reloc trees.  In
 * this way, relocated tree blocks are shared between reloc trees, so
 * they are also shared between subvols.
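 *
 * For example, if subvols A and B both reference tree block X and X is
 * first COWed through A's reloc tree, producing X', then when the same
 * block must be COWed through B's reloc tree we reuse X' instead of
 * allocating another copy, so A and B end up sharing X'.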
6850 */ 6851 static noinline int relocate_one_path(struct btrfs_trans_handle *trans, 6852 struct btrfs_root *root, 6853 struct btrfs_path *path, 6854 struct btrfs_key *first_key, 6855 struct btrfs_ref_path *ref_path, 6856 struct btrfs_block_group_cache *group, 6857 struct inode *reloc_inode) 6858 { 6859 struct btrfs_root *reloc_root; 6860 struct extent_buffer *eb = NULL; 6861 struct btrfs_key *keys; 6862 u64 *nodes; 6863 int level; 6864 int shared_level; 6865 int lowest_level = 0; 6866 int ret; 6867 6868 if (ref_path->owner_objectid < BTRFS_FIRST_FREE_OBJECTID) 6869 lowest_level = ref_path->owner_objectid; 6870 6871 if (!root->ref_cows) { 6872 path->lowest_level = lowest_level; 6873 ret = btrfs_search_slot(trans, root, first_key, path, 0, 1); 6874 BUG_ON(ret < 0); 6875 path->lowest_level = 0; 6876 btrfs_release_path(root, path); 6877 return 0; 6878 } 6879 6880 mutex_lock(&root->fs_info->tree_reloc_mutex); 6881 ret = init_reloc_tree(trans, root); 6882 BUG_ON(ret); 6883 reloc_root = root->reloc_root; 6884 6885 shared_level = ref_path->shared_level; 6886 ref_path->shared_level = BTRFS_MAX_LEVEL - 1; 6887 6888 keys = ref_path->node_keys; 6889 nodes = ref_path->new_nodes; 6890 memset(&keys[shared_level + 1], 0, 6891 sizeof(*keys) * (BTRFS_MAX_LEVEL - shared_level - 1)); 6892 memset(&nodes[shared_level + 1], 0, 6893 sizeof(*nodes) * (BTRFS_MAX_LEVEL - shared_level - 1)); 6894 6895 if (nodes[lowest_level] == 0) { 6896 path->lowest_level = lowest_level; 6897 ret = btrfs_search_slot(trans, reloc_root, first_key, path, 6898 0, 1); 6899 BUG_ON(ret); 6900 for (level = lowest_level; level < BTRFS_MAX_LEVEL; level++) { 6901 eb = path->nodes[level]; 6902 if (!eb || eb == reloc_root->node) 6903 break; 6904 nodes[level] = eb->start; 6905 if (level == 0) 6906 btrfs_item_key_to_cpu(eb, &keys[level], 0); 6907 else 6908 btrfs_node_key_to_cpu(eb, &keys[level], 0); 6909 } 6910 if (nodes[0] && 6911 ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) { 6912 eb = path->nodes[0]; 6913 ret = replace_extents_in_leaf(trans, reloc_root, eb, 6914 group, reloc_inode); 6915 BUG_ON(ret); 6916 } 6917 btrfs_release_path(reloc_root, path); 6918 } else { 6919 ret = btrfs_merge_path(trans, reloc_root, keys, nodes, 6920 lowest_level); 6921 BUG_ON(ret); 6922 } 6923 6924 /* 6925 * replace tree blocks in the fs tree with tree blocks in 6926 * the reloc tree. 
6927 */ 6928 ret = btrfs_merge_path(trans, root, keys, nodes, lowest_level); 6929 BUG_ON(ret < 0); 6930 6931 if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) { 6932 ret = btrfs_search_slot(trans, reloc_root, first_key, path, 6933 0, 0); 6934 BUG_ON(ret); 6935 extent_buffer_get(path->nodes[0]); 6936 eb = path->nodes[0]; 6937 btrfs_release_path(reloc_root, path); 6938 ret = invalidate_extent_cache(reloc_root, eb, group, root); 6939 BUG_ON(ret); 6940 free_extent_buffer(eb); 6941 } 6942 6943 mutex_unlock(&root->fs_info->tree_reloc_mutex); 6944 path->lowest_level = 0; 6945 return 0; 6946 } 6947 6948 static noinline int relocate_tree_block(struct btrfs_trans_handle *trans, 6949 struct btrfs_root *root, 6950 struct btrfs_path *path, 6951 struct btrfs_key *first_key, 6952 struct btrfs_ref_path *ref_path) 6953 { 6954 int ret; 6955 6956 ret = relocate_one_path(trans, root, path, first_key, 6957 ref_path, NULL, NULL); 6958 BUG_ON(ret); 6959 6960 return 0; 6961 } 6962 6963 static noinline int del_extent_zero(struct btrfs_trans_handle *trans, 6964 struct btrfs_root *extent_root, 6965 struct btrfs_path *path, 6966 struct btrfs_key *extent_key) 6967 { 6968 int ret; 6969 6970 ret = btrfs_search_slot(trans, extent_root, extent_key, path, -1, 1); 6971 if (ret) 6972 goto out; 6973 ret = btrfs_del_item(trans, extent_root, path); 6974 out: 6975 btrfs_release_path(extent_root, path); 6976 return ret; 6977 } 6978 6979 static noinline struct btrfs_root *read_ref_root(struct btrfs_fs_info *fs_info, 6980 struct btrfs_ref_path *ref_path) 6981 { 6982 struct btrfs_key root_key; 6983 6984 root_key.objectid = ref_path->root_objectid; 6985 root_key.type = BTRFS_ROOT_ITEM_KEY; 6986 if (is_cowonly_root(ref_path->root_objectid)) 6987 root_key.offset = 0; 6988 else 6989 root_key.offset = (u64)-1; 6990 6991 return btrfs_read_fs_root_no_name(fs_info, &root_key); 6992 } 6993 6994 static noinline int relocate_one_extent(struct btrfs_root *extent_root, 6995 struct btrfs_path *path, 6996 struct btrfs_key *extent_key, 6997 struct btrfs_block_group_cache *group, 6998 struct inode *reloc_inode, int pass) 6999 { 7000 struct btrfs_trans_handle *trans; 7001 struct btrfs_root *found_root; 7002 struct btrfs_ref_path *ref_path = NULL; 7003 struct disk_extent *new_extents = NULL; 7004 int nr_extents = 0; 7005 int loops; 7006 int ret; 7007 int level; 7008 struct btrfs_key first_key; 7009 u64 prev_block = 0; 7010 7011 7012 trans = btrfs_start_transaction(extent_root, 1); 7013 BUG_ON(!trans); 7014 7015 if (extent_key->objectid == 0) { 7016 ret = del_extent_zero(trans, extent_root, path, extent_key); 7017 goto out; 7018 } 7019 7020 ref_path = kmalloc(sizeof(*ref_path), GFP_NOFS); 7021 if (!ref_path) { 7022 ret = -ENOMEM; 7023 goto out; 7024 } 7025 7026 for (loops = 0; ; loops++) { 7027 if (loops == 0) { 7028 ret = btrfs_first_ref_path(trans, extent_root, ref_path, 7029 extent_key->objectid); 7030 } else { 7031 ret = btrfs_next_ref_path(trans, extent_root, ref_path); 7032 } 7033 if (ret < 0) 7034 goto out; 7035 if (ret > 0) 7036 break; 7037 7038 if (ref_path->root_objectid == BTRFS_TREE_LOG_OBJECTID || 7039 ref_path->root_objectid == BTRFS_TREE_RELOC_OBJECTID) 7040 continue; 7041 7042 found_root = read_ref_root(extent_root->fs_info, ref_path); 7043 BUG_ON(!found_root); 7044 /* 7045 * for reference counted tree, only process reference paths 7046 * rooted at the latest committed root. 
7047 */ 7048 if (found_root->ref_cows && 7049 ref_path->root_generation != found_root->root_key.offset) 7050 continue; 7051 7052 if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) { 7053 if (pass == 0) { 7054 /* 7055 * copy data extents to new locations 7056 */ 7057 u64 group_start = group->key.objectid; 7058 ret = relocate_data_extent(reloc_inode, 7059 extent_key, 7060 group_start); 7061 if (ret < 0) 7062 goto out; 7063 break; 7064 } 7065 level = 0; 7066 } else { 7067 level = ref_path->owner_objectid; 7068 } 7069 7070 if (prev_block != ref_path->nodes[level]) { 7071 struct extent_buffer *eb; 7072 u64 block_start = ref_path->nodes[level]; 7073 u64 block_size = btrfs_level_size(found_root, level); 7074 7075 eb = read_tree_block(found_root, block_start, 7076 block_size, 0); 7077 btrfs_tree_lock(eb); 7078 BUG_ON(level != btrfs_header_level(eb)); 7079 7080 if (level == 0) 7081 btrfs_item_key_to_cpu(eb, &first_key, 0); 7082 else 7083 btrfs_node_key_to_cpu(eb, &first_key, 0); 7084 7085 btrfs_tree_unlock(eb); 7086 free_extent_buffer(eb); 7087 prev_block = block_start; 7088 } 7089 7090 mutex_lock(&extent_root->fs_info->trans_mutex); 7091 btrfs_record_root_in_trans(found_root); 7092 mutex_unlock(&extent_root->fs_info->trans_mutex); 7093 if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) { 7094 /* 7095 * try to update data extent references while 7096 * keeping metadata shared between snapshots. 7097 */ 7098 if (pass == 1) { 7099 ret = relocate_one_path(trans, found_root, 7100 path, &first_key, ref_path, 7101 group, reloc_inode); 7102 if (ret < 0) 7103 goto out; 7104 continue; 7105 } 7106 /* 7107 * use fallback method to process the remaining 7108 * references. 7109 */ 7110 if (!new_extents) { 7111 u64 group_start = group->key.objectid; 7112 new_extents = kmalloc(sizeof(*new_extents), 7113 GFP_NOFS); 7114 nr_extents = 1; 7115 ret = get_new_locations(reloc_inode, 7116 extent_key, 7117 group_start, 1, 7118 &new_extents, 7119 &nr_extents); 7120 if (ret) 7121 goto out; 7122 } 7123 ret = replace_one_extent(trans, found_root, 7124 path, extent_key, 7125 &first_key, ref_path, 7126 new_extents, nr_extents); 7127 } else { 7128 ret = relocate_tree_block(trans, found_root, path, 7129 &first_key, ref_path); 7130 } 7131 if (ret < 0) 7132 goto out; 7133 } 7134 ret = 0; 7135 out: 7136 btrfs_end_transaction(trans, extent_root); 7137 kfree(new_extents); 7138 kfree(ref_path); 7139 return ret; 7140 } 7141 #endif 7142 7143 static u64 update_block_group_flags(struct btrfs_root *root, u64 flags) 7144 { 7145 u64 num_devices; 7146 u64 stripped = BTRFS_BLOCK_GROUP_RAID0 | 7147 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10; 7148 7149 num_devices = root->fs_info->fs_devices->rw_devices; 7150 if (num_devices == 1) { 7151 stripped |= BTRFS_BLOCK_GROUP_DUP; 7152 stripped = flags & ~stripped; 7153 7154 /* turn raid0 into single device chunks */ 7155 if (flags & BTRFS_BLOCK_GROUP_RAID0) 7156 return stripped; 7157 7158 /* turn mirroring into duplication */ 7159 if (flags & (BTRFS_BLOCK_GROUP_RAID1 | 7160 BTRFS_BLOCK_GROUP_RAID10)) 7161 return stripped | BTRFS_BLOCK_GROUP_DUP; 7162 return flags; 7163 } else { 7164 /* they already had raid on here, just return */ 7165 if (flags & stripped) 7166 return flags; 7167 7168 stripped |= BTRFS_BLOCK_GROUP_DUP; 7169 stripped = flags & ~stripped; 7170 7171 /* switch duplicated blocks with raid1 */ 7172 if (flags & BTRFS_BLOCK_GROUP_DUP) 7173 return stripped | BTRFS_BLOCK_GROUP_RAID1; 7174 7175 /* turn single device chunks into raid0 */ 7176 return stripped | 
BTRFS_BLOCK_GROUP_RAID0; 7177 } 7178 return flags; 7179 } 7180 7181 static int __alloc_chunk_for_shrink(struct btrfs_root *root, 7182 struct btrfs_block_group_cache *shrink_block_group, 7183 int force) 7184 { 7185 struct btrfs_trans_handle *trans; 7186 u64 new_alloc_flags; 7187 u64 calc; 7188 7189 spin_lock(&shrink_block_group->lock); 7190 if (btrfs_block_group_used(&shrink_block_group->item) + 7191 shrink_block_group->reserved > 0) { 7192 spin_unlock(&shrink_block_group->lock); 7193 7194 trans = btrfs_start_transaction(root, 1); 7195 spin_lock(&shrink_block_group->lock); 7196 7197 new_alloc_flags = update_block_group_flags(root, 7198 shrink_block_group->flags); 7199 if (new_alloc_flags != shrink_block_group->flags) { 7200 calc = 7201 btrfs_block_group_used(&shrink_block_group->item); 7202 } else { 7203 calc = shrink_block_group->key.offset; 7204 } 7205 spin_unlock(&shrink_block_group->lock); 7206 7207 do_chunk_alloc(trans, root->fs_info->extent_root, 7208 calc + 2 * 1024 * 1024, new_alloc_flags, force); 7209 7210 btrfs_end_transaction(trans, root); 7211 } else 7212 spin_unlock(&shrink_block_group->lock); 7213 return 0; 7214 } 7215 7216 7217 int btrfs_prepare_block_group_relocation(struct btrfs_root *root, 7218 struct btrfs_block_group_cache *group) 7219 7220 { 7221 __alloc_chunk_for_shrink(root, group, 1); 7222 set_block_group_readonly(group); 7223 return 0; 7224 } 7225 7226 /* 7227 * checks to see if its even possible to relocate this block group. 7228 * 7229 * @return - -1 if it's not a good idea to relocate this block group, 0 if its 7230 * ok to go ahead and try. 7231 */ 7232 int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr) 7233 { 7234 struct btrfs_block_group_cache *block_group; 7235 struct btrfs_space_info *space_info; 7236 struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; 7237 struct btrfs_device *device; 7238 int full = 0; 7239 int ret = 0; 7240 7241 block_group = btrfs_lookup_block_group(root->fs_info, bytenr); 7242 7243 /* odd, couldn't find the block group, leave it alone */ 7244 if (!block_group) 7245 return -1; 7246 7247 /* no bytes used, we're good */ 7248 if (!btrfs_block_group_used(&block_group->item)) 7249 goto out; 7250 7251 space_info = block_group->space_info; 7252 spin_lock(&space_info->lock); 7253 7254 full = space_info->full; 7255 7256 /* 7257 * if this is the last block group we have in this space, we can't 7258 * relocate it unless we're able to allocate a new chunk below. 7259 * 7260 * Otherwise, we need to make sure we have room in the space to handle 7261 * all of the extents from this block group. If we can, we're good 7262 */ 7263 if ((space_info->total_bytes != block_group->key.offset) && 7264 (space_info->bytes_used + space_info->bytes_reserved + 7265 space_info->bytes_pinned + space_info->bytes_readonly + 7266 btrfs_block_group_used(&block_group->item) < 7267 space_info->total_bytes)) { 7268 spin_unlock(&space_info->lock); 7269 goto out; 7270 } 7271 spin_unlock(&space_info->lock); 7272 7273 /* 7274 * ok we don't have enough space, but maybe we have free space on our 7275 * devices to allocate new chunks for relocation, so loop through our 7276 * alloc devices and guess if we have enough space. However, if we 7277 * were marked as full, then we know there aren't enough chunks, and we 7278 * can just return. 
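 * (find_free_dev_extent below checks whether a device still has a
 * contiguous free area of at least min_free bytes, the amount of space
 * currently used by this block group.)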
7279 */ 7280 ret = -1; 7281 if (full) 7282 goto out; 7283 7284 mutex_lock(&root->fs_info->chunk_mutex); 7285 list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) { 7286 u64 min_free = btrfs_block_group_used(&block_group->item); 7287 u64 dev_offset, max_avail; 7288 7289 /* 7290 * check to make sure we can actually find a chunk with enough 7291 * space to fit our block group in. 7292 */ 7293 if (device->total_bytes > device->bytes_used + min_free) { 7294 ret = find_free_dev_extent(NULL, device, min_free, 7295 &dev_offset, &max_avail); 7296 if (!ret) 7297 break; 7298 ret = -1; 7299 } 7300 } 7301 mutex_unlock(&root->fs_info->chunk_mutex); 7302 out: 7303 btrfs_put_block_group(block_group); 7304 return ret; 7305 } 7306 7307 static int find_first_block_group(struct btrfs_root *root, 7308 struct btrfs_path *path, struct btrfs_key *key) 7309 { 7310 int ret = 0; 7311 struct btrfs_key found_key; 7312 struct extent_buffer *leaf; 7313 int slot; 7314 7315 ret = btrfs_search_slot(NULL, root, key, path, 0, 0); 7316 if (ret < 0) 7317 goto out; 7318 7319 while (1) { 7320 slot = path->slots[0]; 7321 leaf = path->nodes[0]; 7322 if (slot >= btrfs_header_nritems(leaf)) { 7323 ret = btrfs_next_leaf(root, path); 7324 if (ret == 0) 7325 continue; 7326 if (ret < 0) 7327 goto out; 7328 break; 7329 } 7330 btrfs_item_key_to_cpu(leaf, &found_key, slot); 7331 7332 if (found_key.objectid >= key->objectid && 7333 found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) { 7334 ret = 0; 7335 goto out; 7336 } 7337 path->slots[0]++; 7338 } 7339 ret = -ENOENT; 7340 out: 7341 return ret; 7342 } 7343 7344 int btrfs_free_block_groups(struct btrfs_fs_info *info) 7345 { 7346 struct btrfs_block_group_cache *block_group; 7347 struct btrfs_space_info *space_info; 7348 struct btrfs_caching_control *caching_ctl; 7349 struct rb_node *n; 7350 7351 down_write(&info->extent_commit_sem); 7352 while (!list_empty(&info->caching_block_groups)) { 7353 caching_ctl = list_entry(info->caching_block_groups.next, 7354 struct btrfs_caching_control, list); 7355 list_del(&caching_ctl->list); 7356 put_caching_control(caching_ctl); 7357 } 7358 up_write(&info->extent_commit_sem); 7359 7360 spin_lock(&info->block_group_cache_lock); 7361 while ((n = rb_last(&info->block_group_cache_tree)) != NULL) { 7362 block_group = rb_entry(n, struct btrfs_block_group_cache, 7363 cache_node); 7364 rb_erase(&block_group->cache_node, 7365 &info->block_group_cache_tree); 7366 spin_unlock(&info->block_group_cache_lock); 7367 7368 down_write(&block_group->space_info->groups_sem); 7369 list_del(&block_group->list); 7370 up_write(&block_group->space_info->groups_sem); 7371 7372 if (block_group->cached == BTRFS_CACHE_STARTED) 7373 wait_block_group_cache_done(block_group); 7374 7375 btrfs_remove_free_space_cache(block_group); 7376 7377 WARN_ON(atomic_read(&block_group->count) != 1); 7378 kfree(block_group); 7379 7380 spin_lock(&info->block_group_cache_lock); 7381 } 7382 spin_unlock(&info->block_group_cache_lock); 7383 7384 /* now that all the block groups are freed, go through and 7385 * free all the space_info structs. This is only called during 7386 * the final stages of unmount, and so we know nobody is 7387 * using them. We call synchronize_rcu() once before we start, 7388 * just to be on the safe side. 
7389 */ 7390 synchronize_rcu(); 7391 7392 while(!list_empty(&info->space_info)) { 7393 space_info = list_entry(info->space_info.next, 7394 struct btrfs_space_info, 7395 list); 7396 7397 list_del(&space_info->list); 7398 kfree(space_info); 7399 } 7400 return 0; 7401 } 7402 7403 int btrfs_read_block_groups(struct btrfs_root *root) 7404 { 7405 struct btrfs_path *path; 7406 int ret; 7407 struct btrfs_block_group_cache *cache; 7408 struct btrfs_fs_info *info = root->fs_info; 7409 struct btrfs_space_info *space_info; 7410 struct btrfs_key key; 7411 struct btrfs_key found_key; 7412 struct extent_buffer *leaf; 7413 7414 root = info->extent_root; 7415 key.objectid = 0; 7416 key.offset = 0; 7417 btrfs_set_key_type(&key, BTRFS_BLOCK_GROUP_ITEM_KEY); 7418 path = btrfs_alloc_path(); 7419 if (!path) 7420 return -ENOMEM; 7421 7422 while (1) { 7423 ret = find_first_block_group(root, path, &key); 7424 if (ret > 0) { 7425 ret = 0; 7426 goto error; 7427 } 7428 if (ret != 0) 7429 goto error; 7430 7431 leaf = path->nodes[0]; 7432 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 7433 cache = kzalloc(sizeof(*cache), GFP_NOFS); 7434 if (!cache) { 7435 ret = -ENOMEM; 7436 break; 7437 } 7438 7439 atomic_set(&cache->count, 1); 7440 spin_lock_init(&cache->lock); 7441 spin_lock_init(&cache->tree_lock); 7442 cache->fs_info = info; 7443 INIT_LIST_HEAD(&cache->list); 7444 INIT_LIST_HEAD(&cache->cluster_list); 7445 7446 /* 7447 * we only want to have 32k of ram per block group for keeping 7448 * track of free space, and if we pass 1/2 of that we want to 7449 * start converting things over to using bitmaps 7450 */ 7451 cache->extents_thresh = ((1024 * 32) / 2) / 7452 sizeof(struct btrfs_free_space); 7453 7454 read_extent_buffer(leaf, &cache->item, 7455 btrfs_item_ptr_offset(leaf, path->slots[0]), 7456 sizeof(cache->item)); 7457 memcpy(&cache->key, &found_key, sizeof(found_key)); 7458 7459 key.objectid = found_key.objectid + found_key.offset; 7460 btrfs_release_path(root, path); 7461 cache->flags = btrfs_block_group_flags(&cache->item); 7462 cache->sectorsize = root->sectorsize; 7463 7464 /* 7465 * check for two cases, either we are full, and therefore 7466 * don't need to bother with the caching work since we won't 7467 * find any space, or we are empty, and we can just add all 7468 * the space in and be done with it. This saves us _alot_ of 7469 * time, particularly in the full case. 
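		 * A fully used block group has no free space to index, and a
		 * completely empty one is just a single free extent spanning
		 * the whole group, so both can be marked cached immediately.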
7470 */ 7471 if (found_key.offset == btrfs_block_group_used(&cache->item)) { 7472 exclude_super_stripes(root, cache); 7473 cache->last_byte_to_unpin = (u64)-1; 7474 cache->cached = BTRFS_CACHE_FINISHED; 7475 free_excluded_extents(root, cache); 7476 } else if (btrfs_block_group_used(&cache->item) == 0) { 7477 exclude_super_stripes(root, cache); 7478 cache->last_byte_to_unpin = (u64)-1; 7479 cache->cached = BTRFS_CACHE_FINISHED; 7480 add_new_free_space(cache, root->fs_info, 7481 found_key.objectid, 7482 found_key.objectid + 7483 found_key.offset); 7484 free_excluded_extents(root, cache); 7485 } 7486 7487 ret = update_space_info(info, cache->flags, found_key.offset, 7488 btrfs_block_group_used(&cache->item), 7489 &space_info); 7490 BUG_ON(ret); 7491 cache->space_info = space_info; 7492 spin_lock(&cache->space_info->lock); 7493 cache->space_info->bytes_super += cache->bytes_super; 7494 spin_unlock(&cache->space_info->lock); 7495 7496 down_write(&space_info->groups_sem); 7497 list_add_tail(&cache->list, &space_info->block_groups); 7498 up_write(&space_info->groups_sem); 7499 7500 ret = btrfs_add_block_group_cache(root->fs_info, cache); 7501 BUG_ON(ret); 7502 7503 set_avail_alloc_bits(root->fs_info, cache->flags); 7504 if (btrfs_chunk_readonly(root, cache->key.objectid)) 7505 set_block_group_readonly(cache); 7506 } 7507 ret = 0; 7508 error: 7509 btrfs_free_path(path); 7510 return ret; 7511 } 7512 7513 int btrfs_make_block_group(struct btrfs_trans_handle *trans, 7514 struct btrfs_root *root, u64 bytes_used, 7515 u64 type, u64 chunk_objectid, u64 chunk_offset, 7516 u64 size) 7517 { 7518 int ret; 7519 struct btrfs_root *extent_root; 7520 struct btrfs_block_group_cache *cache; 7521 7522 extent_root = root->fs_info->extent_root; 7523 7524 root->fs_info->last_trans_log_full_commit = trans->transid; 7525 7526 cache = kzalloc(sizeof(*cache), GFP_NOFS); 7527 if (!cache) 7528 return -ENOMEM; 7529 7530 cache->key.objectid = chunk_offset; 7531 cache->key.offset = size; 7532 cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; 7533 cache->sectorsize = root->sectorsize; 7534 7535 /* 7536 * we only want to have 32k of ram per block group for keeping track 7537 * of free space, and if we pass 1/2 of that we want to start 7538 * converting things over to using bitmaps 7539 */ 7540 cache->extents_thresh = ((1024 * 32) / 2) / 7541 sizeof(struct btrfs_free_space); 7542 atomic_set(&cache->count, 1); 7543 spin_lock_init(&cache->lock); 7544 spin_lock_init(&cache->tree_lock); 7545 INIT_LIST_HEAD(&cache->list); 7546 INIT_LIST_HEAD(&cache->cluster_list); 7547 7548 btrfs_set_block_group_used(&cache->item, bytes_used); 7549 btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid); 7550 cache->flags = type; 7551 btrfs_set_block_group_flags(&cache->item, type); 7552 7553 cache->last_byte_to_unpin = (u64)-1; 7554 cache->cached = BTRFS_CACHE_FINISHED; 7555 exclude_super_stripes(root, cache); 7556 7557 add_new_free_space(cache, root->fs_info, chunk_offset, 7558 chunk_offset + size); 7559 7560 free_excluded_extents(root, cache); 7561 7562 ret = update_space_info(root->fs_info, cache->flags, size, bytes_used, 7563 &cache->space_info); 7564 BUG_ON(ret); 7565 7566 spin_lock(&cache->space_info->lock); 7567 cache->space_info->bytes_super += cache->bytes_super; 7568 spin_unlock(&cache->space_info->lock); 7569 7570 down_write(&cache->space_info->groups_sem); 7571 list_add_tail(&cache->list, &cache->space_info->block_groups); 7572 up_write(&cache->space_info->groups_sem); 7573 7574 ret = btrfs_add_block_group_cache(root->fs_info, 
cache); 7575 BUG_ON(ret); 7576 7577 ret = btrfs_insert_item(trans, extent_root, &cache->key, &cache->item, 7578 sizeof(cache->item)); 7579 BUG_ON(ret); 7580 7581 set_avail_alloc_bits(extent_root->fs_info, type); 7582 7583 return 0; 7584 } 7585 7586 int btrfs_remove_block_group(struct btrfs_trans_handle *trans, 7587 struct btrfs_root *root, u64 group_start) 7588 { 7589 struct btrfs_path *path; 7590 struct btrfs_block_group_cache *block_group; 7591 struct btrfs_free_cluster *cluster; 7592 struct btrfs_key key; 7593 int ret; 7594 7595 root = root->fs_info->extent_root; 7596 7597 block_group = btrfs_lookup_block_group(root->fs_info, group_start); 7598 BUG_ON(!block_group); 7599 BUG_ON(!block_group->ro); 7600 7601 memcpy(&key, &block_group->key, sizeof(key)); 7602 7603 /* make sure this block group isn't part of an allocation cluster */ 7604 cluster = &root->fs_info->data_alloc_cluster; 7605 spin_lock(&cluster->refill_lock); 7606 btrfs_return_cluster_to_free_space(block_group, cluster); 7607 spin_unlock(&cluster->refill_lock); 7608 7609 /* 7610 * make sure this block group isn't part of a metadata 7611 * allocation cluster 7612 */ 7613 cluster = &root->fs_info->meta_alloc_cluster; 7614 spin_lock(&cluster->refill_lock); 7615 btrfs_return_cluster_to_free_space(block_group, cluster); 7616 spin_unlock(&cluster->refill_lock); 7617 7618 path = btrfs_alloc_path(); 7619 BUG_ON(!path); 7620 7621 spin_lock(&root->fs_info->block_group_cache_lock); 7622 rb_erase(&block_group->cache_node, 7623 &root->fs_info->block_group_cache_tree); 7624 spin_unlock(&root->fs_info->block_group_cache_lock); 7625 7626 down_write(&block_group->space_info->groups_sem); 7627 /* 7628 * we must use list_del_init so people can check to see if they 7629 * are still on the list after taking the semaphore 7630 */ 7631 list_del_init(&block_group->list); 7632 up_write(&block_group->space_info->groups_sem); 7633 7634 if (block_group->cached == BTRFS_CACHE_STARTED) 7635 wait_block_group_cache_done(block_group); 7636 7637 btrfs_remove_free_space_cache(block_group); 7638 7639 spin_lock(&block_group->space_info->lock); 7640 block_group->space_info->total_bytes -= block_group->key.offset; 7641 block_group->space_info->bytes_readonly -= block_group->key.offset; 7642 spin_unlock(&block_group->space_info->lock); 7643 7644 btrfs_clear_space_info_full(root->fs_info); 7645 7646 btrfs_put_block_group(block_group); 7647 btrfs_put_block_group(block_group); 7648 7649 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 7650 if (ret > 0) 7651 ret = -EIO; 7652 if (ret < 0) 7653 goto out; 7654 7655 ret = btrfs_del_item(trans, root, path); 7656 out: 7657 btrfs_free_path(path); 7658 return ret; 7659 } 7660