/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 021110-1307, USA.
 */
#include <linux/sched.h>
#include <linux/pagemap.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/sort.h>
#include <linux/rcupdate.h>
#include <linux/kthread.h>
#include <linux/slab.h>
#include "compat.h"
#include "hash.h"
#include "ctree.h"
#include "disk-io.h"
#include "print-tree.h"
#include "transaction.h"
#include "volumes.h"
#include "locking.h"
#include "free-space-cache.h"

/* control flags for do_chunk_alloc's force field
 * CHUNK_ALLOC_NO_FORCE means to only allocate a chunk
 * if we really need one.
 *
 * CHUNK_ALLOC_FORCE means it must try to allocate one
 *
 * CHUNK_ALLOC_LIMITED means to only try and allocate one
 * if we have very few chunks already allocated.  This is
 * used as part of the clustering code to help make sure
 * we have a good pool of storage to cluster in, without
 * filling the FS with empty chunks
 *
 */
enum {
	CHUNK_ALLOC_NO_FORCE = 0,
	CHUNK_ALLOC_FORCE = 1,
	CHUNK_ALLOC_LIMITED = 2,
};

static int update_block_group(struct btrfs_trans_handle *trans,
			      struct btrfs_root *root,
			      u64 bytenr, u64 num_bytes, int alloc);
static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root,
			       u64 bytenr, u64 num_bytes, u64 parent,
			       u64 root_objectid, u64 owner_objectid,
			       u64 owner_offset, int refs_to_drop,
			       struct btrfs_delayed_extent_op *extra_op);
static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
				    struct extent_buffer *leaf,
				    struct btrfs_extent_item *ei);
static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root,
				      u64 parent, u64 root_objectid,
				      u64 flags, u64 owner, u64 offset,
				      struct btrfs_key *ins, int ref_mod);
static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
				     struct btrfs_root *root,
				     u64 parent, u64 root_objectid,
				     u64 flags, struct btrfs_disk_key *key,
				     int level, struct btrfs_key *ins);
static int do_chunk_alloc(struct btrfs_trans_handle *trans,
			  struct btrfs_root *extent_root, u64 alloc_bytes,
			  u64 flags, int force);
static int find_next_key(struct btrfs_path *path, int level,
			 struct btrfs_key *key);
static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
			    int dump_block_groups);

static noinline int
block_group_cache_done(struct btrfs_block_group_cache *cache)
{
	smp_mb();
	return cache->cached == BTRFS_CACHE_FINISHED;
}

static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
{
	return (cache->flags & bits) == bits;
}

void btrfs_get_block_group(struct btrfs_block_group_cache *cache)
{
	atomic_inc(&cache->count);
}

void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
{
	if (atomic_dec_and_test(&cache->count)) {
		WARN_ON(cache->pinned > 0);
		WARN_ON(cache->reserved > 0);
		WARN_ON(cache->reserved_pinned > 0);
		kfree(cache);
	}
}

/*
 * this adds the block group to the fs_info rb tree for the block group
 * cache
 */
static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
				struct btrfs_block_group_cache *block_group)
{
	struct rb_node **p;
	struct rb_node *parent = NULL;
	struct btrfs_block_group_cache *cache;

	spin_lock(&info->block_group_cache_lock);
	p = &info->block_group_cache_tree.rb_node;

	while (*p) {
		parent = *p;
		cache = rb_entry(parent, struct btrfs_block_group_cache,
				 cache_node);
		if (block_group->key.objectid < cache->key.objectid) {
			p = &(*p)->rb_left;
		} else if (block_group->key.objectid > cache->key.objectid) {
			p = &(*p)->rb_right;
		} else {
			spin_unlock(&info->block_group_cache_lock);
			return -EEXIST;
		}
	}

	rb_link_node(&block_group->cache_node, parent, p);
	rb_insert_color(&block_group->cache_node,
			&info->block_group_cache_tree);
	spin_unlock(&info->block_group_cache_lock);

	return 0;
}

/*
 * This will return the block group at or after bytenr if contains is 0, else
 * it will return the block group that contains the bytenr
 */
static struct btrfs_block_group_cache *
block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,
			      int contains)
{
	struct btrfs_block_group_cache *cache, *ret = NULL;
	struct rb_node *n;
	u64 end, start;

	spin_lock(&info->block_group_cache_lock);
	n = info->block_group_cache_tree.rb_node;

	while (n) {
		cache = rb_entry(n, struct btrfs_block_group_cache,
				 cache_node);
		end = cache->key.objectid + cache->key.offset - 1;
		start = cache->key.objectid;

		if (bytenr < start) {
			if (!contains && (!ret || start < ret->key.objectid))
				ret = cache;
			n = n->rb_left;
		} else if (bytenr > start) {
			if (contains && bytenr <= end) {
				ret = cache;
				break;
			}
			n = n->rb_right;
		} else {
			ret = cache;
			break;
		}
	}
	if (ret)
		btrfs_get_block_group(ret);
	spin_unlock(&info->block_group_cache_lock);

	return ret;
}

static int add_excluded_extent(struct btrfs_root *root,
			       u64 start, u64 num_bytes)
{
	u64 end = start + num_bytes - 1;
	set_extent_bits(&root->fs_info->freed_extents[0],
			start, end, EXTENT_UPTODATE, GFP_NOFS);
	set_extent_bits(&root->fs_info->freed_extents[1],
			start, end, EXTENT_UPTODATE, GFP_NOFS);
	return 0;
}

static void free_excluded_extents(struct btrfs_root *root,
				  struct btrfs_block_group_cache *cache)
{
	u64 start, end;

	start = cache->key.objectid;
	end = start + cache->key.offset - 1;

	clear_extent_bits(&root->fs_info->freed_extents[0],
			  start, end, EXTENT_UPTODATE, GFP_NOFS);
	clear_extent_bits(&root->fs_info->freed_extents[1],
			  start, end, EXTENT_UPTODATE, GFP_NOFS);
}

static int exclude_super_stripes(struct btrfs_root *root,
				 struct btrfs_block_group_cache *cache)
{
	u64 bytenr;
	u64 *logical;
	int stripe_len;
	int i, nr, ret;

	if (cache->key.objectid < BTRFS_SUPER_INFO_OFFSET) {
		stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->key.objectid;
		cache->bytes_super += stripe_len;
		ret = add_excluded_extent(root, cache->key.objectid,
					  stripe_len);
		BUG_ON(ret);
	}

	for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
		bytenr = btrfs_sb_offset(i);
		ret = btrfs_rmap_block(&root->fs_info->mapping_tree,
				       cache->key.objectid, bytenr,
				       0, &logical, &nr, &stripe_len);
		BUG_ON(ret);

		while (nr--) {
			cache->bytes_super += stripe_len;
			ret = add_excluded_extent(root, logical[nr],
						  stripe_len);
			BUG_ON(ret);
		}

		kfree(logical);
	}
	return 0;
}

static struct btrfs_caching_control *
get_caching_control(struct btrfs_block_group_cache *cache)
{
	struct btrfs_caching_control *ctl;

	spin_lock(&cache->lock);
	if (cache->cached != BTRFS_CACHE_STARTED) {
		spin_unlock(&cache->lock);
		return NULL;
	}

	/* We're loading it the fast way, so we don't have a caching_ctl. */
	if (!cache->caching_ctl) {
		spin_unlock(&cache->lock);
		return NULL;
	}

	ctl = cache->caching_ctl;
	atomic_inc(&ctl->count);
	spin_unlock(&cache->lock);
	return ctl;
}

static void put_caching_control(struct btrfs_caching_control *ctl)
{
	if (atomic_dec_and_test(&ctl->count))
		kfree(ctl);
}

/*
 * this is only called by cache_block_group, since we could have freed extents
 * we need to check the pinned_extents for any extents that can't be used yet
 * since their free space will be released as soon as the transaction commits.
 */
static u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
			      struct btrfs_fs_info *info, u64 start, u64 end)
{
	u64 extent_start, extent_end, size, total_added = 0;
	int ret;

	while (start < end) {
		ret = find_first_extent_bit(info->pinned_extents, start,
					    &extent_start, &extent_end,
					    EXTENT_DIRTY | EXTENT_UPTODATE);
		if (ret)
			break;

		if (extent_start <= start) {
			start = extent_end + 1;
		} else if (extent_start > start && extent_start < end) {
			size = extent_start - start;
			total_added += size;
			ret = btrfs_add_free_space(block_group, start,
						   size);
			BUG_ON(ret);
			start = extent_end + 1;
		} else {
			break;
		}
	}

	if (start < end) {
		size = end - start;
		total_added += size;
		ret = btrfs_add_free_space(block_group, start, size);
		BUG_ON(ret);
	}

	return total_added;
}

static int caching_kthread(void *data)
{
	struct btrfs_block_group_cache *block_group = data;
	struct btrfs_fs_info *fs_info = block_group->fs_info;
	struct btrfs_caching_control *caching_ctl = block_group->caching_ctl;
	struct btrfs_root *extent_root = fs_info->extent_root;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_key key;
	u64 total_found = 0;
	u64 last = 0;
	u32 nritems;
	int ret = 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);

	/*
	 * We don't want to deadlock with somebody trying to allocate a new
	 * extent for the extent root while also trying to search the extent
	 * root to add free space.  So we skip locking and search the commit
	 * root, since it's read-only
	 */
	path->skip_locking = 1;
	path->search_commit_root = 1;
	path->reada = 2;

	key.objectid = last;
	key.offset = 0;
	key.type = BTRFS_EXTENT_ITEM_KEY;
again:
	mutex_lock(&caching_ctl->mutex);
	/* need to make sure the commit_root doesn't disappear */
	down_read(&fs_info->extent_commit_sem);

	ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
	if (ret < 0)
		goto err;

	leaf = path->nodes[0];
	nritems = btrfs_header_nritems(leaf);

	while (1) {
		smp_mb();
		if (fs_info->closing > 1) {
			last = (u64)-1;
			break;
		}

		if (path->slots[0] < nritems) {
			btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
		} else {
			ret = find_next_key(path, 0, &key);
			if (ret)
				break;

			caching_ctl->progress = last;
			btrfs_release_path(extent_root, path);
			up_read(&fs_info->extent_commit_sem);
			mutex_unlock(&caching_ctl->mutex);
			if (btrfs_transaction_in_commit(fs_info))
				schedule_timeout(1);
			else
				cond_resched();
			goto again;
		}

		if (key.objectid < block_group->key.objectid) {
			path->slots[0]++;
			continue;
		}

		if (key.objectid >= block_group->key.objectid +
		    block_group->key.offset)
			break;

		if (key.type == BTRFS_EXTENT_ITEM_KEY) {
			total_found += add_new_free_space(block_group,
							  fs_info, last,
							  key.objectid);
			last = key.objectid + key.offset;

			if (total_found > (1024 * 1024 * 2)) {
				total_found = 0;
				wake_up(&caching_ctl->wait);
			}
		}
		path->slots[0]++;
	}
	ret = 0;

	total_found += add_new_free_space(block_group, fs_info, last,
					  block_group->key.objectid +
					  block_group->key.offset);
	caching_ctl->progress = (u64)-1;

	spin_lock(&block_group->lock);
	block_group->caching_ctl = NULL;
	block_group->cached = BTRFS_CACHE_FINISHED;
	spin_unlock(&block_group->lock);

err:
	btrfs_free_path(path);
	up_read(&fs_info->extent_commit_sem);

	free_excluded_extents(extent_root, block_group);

	mutex_unlock(&caching_ctl->mutex);
	wake_up(&caching_ctl->wait);

	put_caching_control(caching_ctl);
	atomic_dec(&block_group->space_info->caching_threads);
	btrfs_put_block_group(block_group);

	return 0;
}

static int cache_block_group(struct btrfs_block_group_cache *cache,
			     struct btrfs_trans_handle *trans,
			     struct btrfs_root *root,
			     int load_cache_only)
{
	struct btrfs_fs_info *fs_info = cache->fs_info;
	struct btrfs_caching_control *caching_ctl;
	struct task_struct *tsk;
	int ret = 0;

	smp_mb();
	if (cache->cached != BTRFS_CACHE_NO)
		return 0;

	/*
	 * We can't do the read from on-disk cache during a commit since we need
	 * to have the normal tree locking.  Also if we are currently trying to
	 * allocate blocks for the tree root we can't do the fast caching since
	 * we likely hold important locks.
	 */
	if (trans && (!trans->transaction->in_commit) &&
	    (root && root != root->fs_info->tree_root)) {
		spin_lock(&cache->lock);
		if (cache->cached != BTRFS_CACHE_NO) {
			spin_unlock(&cache->lock);
			return 0;
		}
		cache->cached = BTRFS_CACHE_STARTED;
		spin_unlock(&cache->lock);

		ret = load_free_space_cache(fs_info, cache);

		spin_lock(&cache->lock);
		if (ret == 1) {
			cache->cached = BTRFS_CACHE_FINISHED;
			cache->last_byte_to_unpin = (u64)-1;
		} else {
			cache->cached = BTRFS_CACHE_NO;
		}
		spin_unlock(&cache->lock);
		if (ret == 1) {
			free_excluded_extents(fs_info->extent_root, cache);
			return 0;
		}
	}

	if (load_cache_only)
		return 0;

	caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
	BUG_ON(!caching_ctl);

	INIT_LIST_HEAD(&caching_ctl->list);
	mutex_init(&caching_ctl->mutex);
	init_waitqueue_head(&caching_ctl->wait);
	caching_ctl->block_group = cache;
	caching_ctl->progress = cache->key.objectid;
	/* one for caching kthread, one for caching block group list */
	atomic_set(&caching_ctl->count, 2);

	spin_lock(&cache->lock);
	if (cache->cached != BTRFS_CACHE_NO) {
		spin_unlock(&cache->lock);
		kfree(caching_ctl);
		return 0;
	}
	cache->caching_ctl = caching_ctl;
	cache->cached = BTRFS_CACHE_STARTED;
	spin_unlock(&cache->lock);

	down_write(&fs_info->extent_commit_sem);
	list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
	up_write(&fs_info->extent_commit_sem);

	atomic_inc(&cache->space_info->caching_threads);
	btrfs_get_block_group(cache);

	tsk = kthread_run(caching_kthread, cache, "btrfs-cache-%llu\n",
			  cache->key.objectid);
	if (IS_ERR(tsk)) {
		ret = PTR_ERR(tsk);
		printk(KERN_ERR "error running thread %d\n", ret);
		BUG();
	}

	return ret;
}

/*
 * return the block group that starts at or after bytenr
 */
static struct btrfs_block_group_cache *
btrfs_lookup_first_block_group(struct btrfs_fs_info *info, u64 bytenr)
{
	struct btrfs_block_group_cache *cache;

	cache = block_group_cache_tree_search(info, bytenr, 0);

	return cache;
}

/*
 * return the block group that contains the given bytenr
 */
struct btrfs_block_group_cache *btrfs_lookup_block_group(
						 struct btrfs_fs_info *info,
						 u64 bytenr)
{
	struct btrfs_block_group_cache *cache;

	cache = block_group_cache_tree_search(info, bytenr, 1);

	return cache;
}

static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
						  u64 flags)
{
	struct list_head *head = &info->space_info;
	struct btrfs_space_info *found;

	flags &= BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_SYSTEM |
		 BTRFS_BLOCK_GROUP_METADATA;

	rcu_read_lock();
	list_for_each_entry_rcu(found, head, list) {
		if (found->flags & flags) {
			rcu_read_unlock();
			return found;
		}
	}
	rcu_read_unlock();
	return NULL;
}

/*
 * after adding space to the filesystem, we need to clear the full flags
 * on all the space infos.
 */
void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
{
	struct list_head *head = &info->space_info;
	struct btrfs_space_info *found;

	rcu_read_lock();
	list_for_each_entry_rcu(found, head, list)
		found->full = 0;
	rcu_read_unlock();
}

static u64 div_factor(u64 num, int factor)
{
	if (factor == 10)
		return num;
	num *= factor;
	do_div(num, 10);
	return num;
}

static u64 div_factor_fine(u64 num, int factor)
{
	if (factor == 100)
		return num;
	num *= factor;
	do_div(num, 100);
	return num;
}

u64 btrfs_find_block_group(struct btrfs_root *root,
			   u64 search_start, u64 search_hint, int owner)
{
	struct btrfs_block_group_cache *cache;
	u64 used;
	u64 last = max(search_hint, search_start);
	u64 group_start = 0;
	int full_search = 0;
	int factor = 9;
	int wrapped = 0;
again:
	while (1) {
		cache = btrfs_lookup_first_block_group(root->fs_info, last);
		if (!cache)
			break;

		spin_lock(&cache->lock);
		last = cache->key.objectid + cache->key.offset;
		used = btrfs_block_group_used(&cache->item);

		if ((full_search || !cache->ro) &&
		    block_group_bits(cache, BTRFS_BLOCK_GROUP_METADATA)) {
			if (used + cache->pinned + cache->reserved <
			    div_factor(cache->key.offset, factor)) {
				group_start = cache->key.objectid;
				spin_unlock(&cache->lock);
				btrfs_put_block_group(cache);
				goto found;
			}
		}
		spin_unlock(&cache->lock);
		btrfs_put_block_group(cache);
		cond_resched();
	}
	if (!wrapped) {
		last = search_start;
		wrapped = 1;
		goto again;
	}
	if (!full_search && factor < 10) {
		last = search_start;
		full_search = 1;
		factor = 10;
		goto again;
	}
found:
	return group_start;
}

/* simple helper to search for an existing extent at a given offset */
int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len)
{
	int ret;
	struct btrfs_key key;
	struct btrfs_path *path;

	path = btrfs_alloc_path();
	BUG_ON(!path);
	key.objectid = start;
	key.offset = len;
	btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
	ret = btrfs_search_slot(NULL, root->fs_info->extent_root, &key, path,
				0, 0);
	btrfs_free_path(path);
	return ret;
}

/*
 * helper function to lookup reference count and flags of extent.
 *
 * the head node for delayed ref is used to store the sum of all the
 * reference count modifications queued up in the rbtree. the head
 * node may also store the extent flags to set. This way you can check
 * to see what the reference count and extent flags would be if all of
 * the delayed refs are not processed.
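 *
 * For example, if the extent item on disk records N references and the
 * delayed ref head still carries a pending ref_mod of M, the count
 * reported here is N + M.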
 */
int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root, u64 bytenr,
			     u64 num_bytes, u64 *refs, u64 *flags)
{
	struct btrfs_delayed_ref_head *head;
	struct btrfs_delayed_ref_root *delayed_refs;
	struct btrfs_path *path;
	struct btrfs_extent_item *ei;
	struct extent_buffer *leaf;
	struct btrfs_key key;
	u32 item_size;
	u64 num_refs;
	u64 extent_flags;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = bytenr;
	key.type = BTRFS_EXTENT_ITEM_KEY;
	key.offset = num_bytes;
	if (!trans) {
		path->skip_locking = 1;
		path->search_commit_root = 1;
	}
again:
	ret = btrfs_search_slot(trans, root->fs_info->extent_root,
				&key, path, 0, 0);
	if (ret < 0)
		goto out_free;

	if (ret == 0) {
		leaf = path->nodes[0];
		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
		if (item_size >= sizeof(*ei)) {
			ei = btrfs_item_ptr(leaf, path->slots[0],
					    struct btrfs_extent_item);
			num_refs = btrfs_extent_refs(leaf, ei);
			extent_flags = btrfs_extent_flags(leaf, ei);
		} else {
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
			struct btrfs_extent_item_v0 *ei0;
			BUG_ON(item_size != sizeof(*ei0));
			ei0 = btrfs_item_ptr(leaf, path->slots[0],
					     struct btrfs_extent_item_v0);
			num_refs = btrfs_extent_refs_v0(leaf, ei0);
			/* FIXME: this isn't correct for data */
			extent_flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
#else
			BUG();
#endif
		}
		BUG_ON(num_refs == 0);
	} else {
		num_refs = 0;
		extent_flags = 0;
		ret = 0;
	}

	if (!trans)
		goto out;

	delayed_refs = &trans->transaction->delayed_refs;
	spin_lock(&delayed_refs->lock);
	head = btrfs_find_delayed_ref_head(trans, bytenr);
	if (head) {
		if (!mutex_trylock(&head->mutex)) {
			atomic_inc(&head->node.refs);
			spin_unlock(&delayed_refs->lock);

			btrfs_release_path(root->fs_info->extent_root, path);

			mutex_lock(&head->mutex);
			mutex_unlock(&head->mutex);
			btrfs_put_delayed_ref(&head->node);
			goto again;
		}
		if (head->extent_op && head->extent_op->update_flags)
			extent_flags |= head->extent_op->flags_to_set;
		else
			BUG_ON(num_refs == 0);

		num_refs += head->node.ref_mod;
		mutex_unlock(&head->mutex);
	}
	spin_unlock(&delayed_refs->lock);
out:
	WARN_ON(num_refs == 0);
	if (refs)
		*refs = num_refs;
	if (flags)
		*flags = extent_flags;
out_free:
	btrfs_free_path(path);
	return ret;
}

/*
 * Back reference rules.  Back refs have three main goals:
 *
 * 1) differentiate between all holders of references to an extent so that
 *    when a reference is dropped we can make sure it was a valid reference
 *    before freeing the extent.
 *
 * 2) Provide enough information to quickly find the holders of an extent
 *    if we notice a given block is corrupted or bad.
 *
 * 3) Make it easy to migrate blocks for FS shrinking or storage pool
 *    maintenance.  This is actually the same as #2, but with a slightly
 *    different use case.
 *
 * There are two kinds of back refs. The implicit back refs is optimized
 * for pointers in non-shared tree blocks. For a given pointer in a block,
 * back refs of this kind provide information about the block's owner tree
 * and the pointer's key. This information allows us to find the block by
 * b-tree searching.  The full back refs is for pointers in tree blocks not
 * referenced by their owner trees. The location of tree block is recorded
 * in the back refs. Actually the full back refs is generic, and can be
 * used in all cases the implicit back refs is used. The major shortcoming
 * of the full back refs is its overhead. Every time a tree block gets
 * COWed, we have to update back refs entry for all pointers in it.
 *
 * For a newly allocated tree block, we use implicit back refs for
 * pointers in it. This means most tree related operations only involve
 * implicit back refs. For a tree block created in an old transaction, the
 * only way to drop a reference to it is to COW it. So we can detect the
 * event that tree block loses its owner tree's reference and do the
 * back refs conversion.
 *
 * When a tree block is COW'd through a tree, there are four cases:
 *
 * The reference count of the block is one and the tree is the block's
 * owner tree. Nothing to do in this case.
 *
 * The reference count of the block is one and the tree is not the
 * block's owner tree. In this case, full back refs is used for pointers
 * in the block. Remove these full back refs, add implicit back refs for
 * every pointer in the new block.
 *
 * The reference count of the block is greater than one and the tree is
 * the block's owner tree. In this case, implicit back refs is used for
 * pointers in the block. Add full back refs for every pointer in the
 * block, increase lower level extents' reference counts. The original
 * implicit back refs are carried over to the new block.
 *
 * The reference count of the block is greater than one and the tree is
 * not the block's owner tree. Add implicit back refs for every pointer in
 * the new block, increase lower level extents' reference count.
 *
 * Back Reference Key composing:
 *
 * The key objectid corresponds to the first byte in the extent,
 * The key type is used to differentiate between types of back refs.
 * There are different meanings of the key offset for different types
 * of back refs.
 *
 * File extents can be referenced by:
 *
 * - multiple snapshots, subvolumes, or different generations in one subvol
 * - different files inside a single subvolume
 * - different offsets inside a file (bookend extents in file.c)
 *
 * The extent ref structure for the implicit back refs has fields for:
 *
 * - Objectid of the subvolume root
 * - objectid of the file holding the reference
 * - original offset in the file
 * - how many bookend extents
 *
 * The key offset for the implicit back refs is the hash of the first
 * three fields.
 *
 * The extent ref structure for the full back refs has a field for:
 *
 * - number of pointers in the tree leaf
 *
 * The key offset for the full back refs is the first byte of
 * the tree leaf
 *
 * When a file extent is allocated, the implicit back refs is used.
 * The fields are filled in:
 *
 *     (root_key.objectid, inode objectid, offset in file, 1)
 *
 * When a file extent is removed during file truncation, we find the
 * corresponding implicit back refs and check the following fields:
 *
 *     (btrfs_header_owner(leaf), inode objectid, offset in file)
 *
 * Btree extents can be referenced by:
 *
 * - Different subvolumes
 *
 * Both the implicit back refs and the full back refs for tree blocks
 * only consist of key.  The key offset for the implicit back refs is
 * objectid of block's owner tree. The key offset for the full back refs
 * is the first byte of parent block.
 *
 * When implicit back refs is used, information about the lowest key and
 * level of the tree block are required. This information is stored in
 * the tree block info structure.
 */

#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
static int convert_extent_item_v0(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root,
				  struct btrfs_path *path,
				  u64 owner, u32 extra_size)
{
	struct btrfs_extent_item *item;
	struct btrfs_extent_item_v0 *ei0;
	struct btrfs_extent_ref_v0 *ref0;
	struct btrfs_tree_block_info *bi;
	struct extent_buffer *leaf;
	struct btrfs_key key;
	struct btrfs_key found_key;
	u32 new_size = sizeof(*item);
	u64 refs;
	int ret;

	leaf = path->nodes[0];
	BUG_ON(btrfs_item_size_nr(leaf, path->slots[0]) != sizeof(*ei0));

	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
	ei0 = btrfs_item_ptr(leaf, path->slots[0],
			     struct btrfs_extent_item_v0);
	refs = btrfs_extent_refs_v0(leaf, ei0);

	if (owner == (u64)-1) {
		while (1) {
			if (path->slots[0] >= btrfs_header_nritems(leaf)) {
				ret = btrfs_next_leaf(root, path);
				if (ret < 0)
					return ret;
				BUG_ON(ret > 0);
				leaf = path->nodes[0];
			}
			btrfs_item_key_to_cpu(leaf, &found_key,
					      path->slots[0]);
			BUG_ON(key.objectid != found_key.objectid);
			if (found_key.type != BTRFS_EXTENT_REF_V0_KEY) {
				path->slots[0]++;
				continue;
			}
			ref0 = btrfs_item_ptr(leaf, path->slots[0],
					      struct btrfs_extent_ref_v0);
			owner = btrfs_ref_objectid_v0(leaf, ref0);
			break;
		}
	}
	btrfs_release_path(root, path);

	if (owner < BTRFS_FIRST_FREE_OBJECTID)
		new_size += sizeof(*bi);

	new_size -= sizeof(*ei0);
	ret = btrfs_search_slot(trans, root, &key, path,
				new_size + extra_size, 1);
	if (ret < 0)
		return ret;
	BUG_ON(ret);

	ret = btrfs_extend_item(trans, root, path, new_size);
	BUG_ON(ret);

	leaf = path->nodes[0];
	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
	btrfs_set_extent_refs(leaf, item, refs);
	/* FIXME: get real generation */
	btrfs_set_extent_generation(leaf, item, 0);
	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
		btrfs_set_extent_flags(leaf, item,
				       BTRFS_EXTENT_FLAG_TREE_BLOCK |
				       BTRFS_BLOCK_FLAG_FULL_BACKREF);
		bi = (struct btrfs_tree_block_info *)(item + 1);
		/* FIXME: get first key of the block */
		memset_extent_buffer(leaf, 0, (unsigned long)bi, sizeof(*bi));
		btrfs_set_tree_block_level(leaf, bi, (int)owner);
	} else {
		btrfs_set_extent_flags(leaf, item, BTRFS_EXTENT_FLAG_DATA);
	}
	btrfs_mark_buffer_dirty(leaf);
	return 0;
}
#endif

static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)
{
	u32 high_crc = ~(u32)0;
	u32 low_crc = ~(u32)0;
	__le64 lenum;

	lenum = cpu_to_le64(root_objectid);
	high_crc = crc32c(high_crc, &lenum, sizeof(lenum));
	lenum = cpu_to_le64(owner);
	low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
	lenum = cpu_to_le64(offset);
	low_crc = crc32c(low_crc, &lenum, sizeof(lenum));

	return ((u64)high_crc << 31) ^ (u64)low_crc;
}

static u64 hash_extent_data_ref_item(struct extent_buffer *leaf,
				     struct btrfs_extent_data_ref *ref)
{
	return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref),
				    btrfs_extent_data_ref_objectid(leaf, ref),
				    btrfs_extent_data_ref_offset(leaf, ref));
}

static int match_extent_data_ref(struct extent_buffer *leaf,
				 struct btrfs_extent_data_ref *ref,
				 u64 root_objectid, u64 owner, u64 offset)
{
	if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid ||
	    btrfs_extent_data_ref_objectid(leaf, ref) != owner ||
	    btrfs_extent_data_ref_offset(leaf, ref) != offset)
		return 0;
	return 1;
}

static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans,
					   struct btrfs_root *root,
					   struct btrfs_path *path,
					   u64 bytenr, u64 parent,
					   u64 root_objectid,
					   u64 owner, u64 offset)
{
	struct btrfs_key key;
	struct btrfs_extent_data_ref *ref;
	struct extent_buffer *leaf;
	u32 nritems;
	int ret;
	int recow;
	int err = -ENOENT;

	key.objectid = bytenr;
	if (parent) {
		key.type = BTRFS_SHARED_DATA_REF_KEY;
		key.offset = parent;
	} else {
		key.type = BTRFS_EXTENT_DATA_REF_KEY;
		key.offset = hash_extent_data_ref(root_objectid,
						  owner, offset);
	}
again:
	recow = 0;
	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret < 0) {
		err = ret;
		goto fail;
	}

	if (parent) {
		if (!ret)
			return 0;
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
		key.type = BTRFS_EXTENT_REF_V0_KEY;
		btrfs_release_path(root, path);
		ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
		if (ret < 0) {
			err = ret;
			goto fail;
		}
		if (!ret)
			return 0;
#endif
		goto fail;
	}

	leaf = path->nodes[0];
	nritems = btrfs_header_nritems(leaf);
	while (1) {
		if (path->slots[0] >= nritems) {
			ret = btrfs_next_leaf(root, path);
			if (ret < 0)
				err = ret;
			if (ret)
				goto fail;

			leaf = path->nodes[0];
			nritems = btrfs_header_nritems(leaf);
			recow = 1;
		}

		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
		if (key.objectid != bytenr ||
		    key.type != BTRFS_EXTENT_DATA_REF_KEY)
			goto fail;

		ref = btrfs_item_ptr(leaf, path->slots[0],
				     struct btrfs_extent_data_ref);

		if (match_extent_data_ref(leaf, ref, root_objectid,
					  owner, offset)) {
			if (recow) {
				btrfs_release_path(root, path);
				goto again;
			}
			err = 0;
			break;
		}
		path->slots[0]++;
	}
fail:
	return err;
}

static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,
					   struct btrfs_root *root,
					   struct btrfs_path *path,
					   u64 bytenr, u64 parent,
					   u64 root_objectid, u64 owner,
					   u64 offset, int refs_to_add)
{
	struct btrfs_key key;
	struct extent_buffer *leaf;
	u32 size;
	u32 num_refs;
	int ret;

	key.objectid = bytenr;
	if (parent) {
		key.type = BTRFS_SHARED_DATA_REF_KEY;
		key.offset = parent;
		size = sizeof(struct btrfs_shared_data_ref);
	} else {
		key.type = BTRFS_EXTENT_DATA_REF_KEY;
		key.offset = hash_extent_data_ref(root_objectid,
						  owner, offset);
		size = sizeof(struct btrfs_extent_data_ref);
	}

	ret = btrfs_insert_empty_item(trans, root, path, &key, size);
	if (ret && ret != -EEXIST)
		goto fail;

	leaf = path->nodes[0];
	if (parent) {
		struct btrfs_shared_data_ref *ref;
		ref = btrfs_item_ptr(leaf, path->slots[0],
				     struct btrfs_shared_data_ref);
		if (ret == 0) {
			btrfs_set_shared_data_ref_count(leaf, ref, refs_to_add);
		} else {
			num_refs = btrfs_shared_data_ref_count(leaf, ref);
			num_refs += refs_to_add;
			btrfs_set_shared_data_ref_count(leaf, ref, num_refs);
		}
	} else {
		struct btrfs_extent_data_ref *ref;
		while (ret == -EEXIST) {
			ref = btrfs_item_ptr(leaf, path->slots[0],
					     struct btrfs_extent_data_ref);
			if (match_extent_data_ref(leaf, ref, root_objectid,
						  owner, offset))
				break;
			btrfs_release_path(root, path);
			key.offset++;
			ret = btrfs_insert_empty_item(trans, root, path, &key,
						      size);
			if (ret && ret != -EEXIST)
				goto fail;

			leaf = path->nodes[0];
		}
		ref = btrfs_item_ptr(leaf, path->slots[0],
				     struct btrfs_extent_data_ref);
		if (ret == 0) {
			btrfs_set_extent_data_ref_root(leaf, ref,
						       root_objectid);
			btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
			btrfs_set_extent_data_ref_offset(leaf, ref, offset);
			btrfs_set_extent_data_ref_count(leaf, ref, refs_to_add);
		} else {
			num_refs = btrfs_extent_data_ref_count(leaf, ref);
			num_refs += refs_to_add;
			btrfs_set_extent_data_ref_count(leaf, ref, num_refs);
		}
	}
	btrfs_mark_buffer_dirty(leaf);
	ret = 0;
fail:
	btrfs_release_path(root, path);
	return ret;
}

static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
					   struct btrfs_root *root,
					   struct btrfs_path *path,
					   int refs_to_drop)
{
	struct btrfs_key key;
	struct btrfs_extent_data_ref *ref1 = NULL;
	struct btrfs_shared_data_ref *ref2 = NULL;
	struct extent_buffer *leaf;
	u32 num_refs = 0;
	int ret = 0;

	leaf = path->nodes[0];
	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);

	if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
		ref1 = btrfs_item_ptr(leaf, path->slots[0],
				      struct btrfs_extent_data_ref);
		num_refs = btrfs_extent_data_ref_count(leaf, ref1);
	} else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
		ref2 = btrfs_item_ptr(leaf, path->slots[0],
				      struct btrfs_shared_data_ref);
		num_refs = btrfs_shared_data_ref_count(leaf, ref2);
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
	} else if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
		struct btrfs_extent_ref_v0 *ref0;
		ref0 = btrfs_item_ptr(leaf, path->slots[0],
				      struct btrfs_extent_ref_v0);
		num_refs = btrfs_ref_count_v0(leaf, ref0);
#endif
	} else {
		BUG();
	}

	BUG_ON(num_refs < refs_to_drop);
	num_refs -= refs_to_drop;

	if (num_refs == 0) {
		ret = btrfs_del_item(trans, root, path);
	} else {
		if (key.type == BTRFS_EXTENT_DATA_REF_KEY)
			btrfs_set_extent_data_ref_count(leaf, ref1, num_refs);
		else if (key.type == BTRFS_SHARED_DATA_REF_KEY)
			btrfs_set_shared_data_ref_count(leaf, ref2, num_refs);
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
		else {
			struct btrfs_extent_ref_v0 *ref0;
			ref0 = btrfs_item_ptr(leaf, path->slots[0],
					      struct btrfs_extent_ref_v0);
			btrfs_set_ref_count_v0(leaf, ref0, num_refs);
		}
#endif
		btrfs_mark_buffer_dirty(leaf);
	}
	return ret;
}

static noinline u32 extent_data_ref_count(struct btrfs_root *root,
					  struct btrfs_path *path,
					  struct btrfs_extent_inline_ref *iref)
{
	struct btrfs_key key;
	struct extent_buffer *leaf;
	struct btrfs_extent_data_ref *ref1;
	struct btrfs_shared_data_ref *ref2;
	u32 num_refs = 0;

	leaf = path->nodes[0];
	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
	if (iref) {
		if (btrfs_extent_inline_ref_type(leaf, iref) ==
		    BTRFS_EXTENT_DATA_REF_KEY) {
			ref1 = (struct btrfs_extent_data_ref *)(&iref->offset);
			num_refs = btrfs_extent_data_ref_count(leaf, ref1);
		} else {
			ref2 = (struct btrfs_shared_data_ref *)(iref + 1);
			num_refs = btrfs_shared_data_ref_count(leaf, ref2);
		}
	} else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
		ref1 = btrfs_item_ptr(leaf, path->slots[0],
				      struct btrfs_extent_data_ref);
		num_refs = btrfs_extent_data_ref_count(leaf, ref1);
	} else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
		ref2 = btrfs_item_ptr(leaf, path->slots[0],
				      struct btrfs_shared_data_ref);
		num_refs = btrfs_shared_data_ref_count(leaf, ref2);
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
	} else if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
		struct btrfs_extent_ref_v0 *ref0;
		ref0 = btrfs_item_ptr(leaf, path->slots[0],
				      struct btrfs_extent_ref_v0);
		num_refs = btrfs_ref_count_v0(leaf, ref0);
#endif
	} else {
		WARN_ON(1);
	}
	return num_refs;
}

static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans,
					  struct btrfs_root *root,
					  struct btrfs_path *path,
					  u64 bytenr, u64 parent,
					  u64 root_objectid)
{
	struct btrfs_key key;
	int ret;

	key.objectid = bytenr;
	if (parent) {
		key.type = BTRFS_SHARED_BLOCK_REF_KEY;
		key.offset = parent;
	} else {
		key.type = BTRFS_TREE_BLOCK_REF_KEY;
		key.offset = root_objectid;
	}

	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret > 0)
		ret = -ENOENT;
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
	if (ret == -ENOENT && parent) {
		btrfs_release_path(root, path);
		key.type = BTRFS_EXTENT_REF_V0_KEY;
		ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
		if (ret > 0)
			ret = -ENOENT;
	}
#endif
	return ret;
}

static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans,
					  struct btrfs_root *root,
					  struct btrfs_path *path,
					  u64 bytenr, u64 parent,
					  u64 root_objectid)
{
	struct btrfs_key key;
	int ret;

	key.objectid = bytenr;
	if (parent) {
		key.type = BTRFS_SHARED_BLOCK_REF_KEY;
		key.offset = parent;
	} else {
		key.type = BTRFS_TREE_BLOCK_REF_KEY;
		key.offset = root_objectid;
	}

	ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
	btrfs_release_path(root, path);
	return ret;
}

static inline int extent_ref_type(u64 parent, u64 owner)
{
	int type;
	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
		if (parent > 0)
			type = BTRFS_SHARED_BLOCK_REF_KEY;
		else
			type = BTRFS_TREE_BLOCK_REF_KEY;
	} else {
		if (parent > 0)
			type = BTRFS_SHARED_DATA_REF_KEY;
		else
			type = BTRFS_EXTENT_DATA_REF_KEY;
	}
	return type;
}

static int find_next_key(struct btrfs_path *path, int level,
			 struct btrfs_key *key)
{
	for (; level < BTRFS_MAX_LEVEL; level++) {
		if (!path->nodes[level])
			break;
		if (path->slots[level] + 1 >=
		    btrfs_header_nritems(path->nodes[level]))
			continue;
		if (level == 0)
			btrfs_item_key_to_cpu(path->nodes[level], key,
					      path->slots[level] + 1);
		else
			btrfs_node_key_to_cpu(path->nodes[level], key,
					      path->slots[level] + 1);
		return 0;
	}
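	/* walked every level without finding a next slot: no successor key */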
	return 1;
}

/*
 * look for inline back ref. if back ref is found, *ref_ret is set
 * to the address of inline back ref, and 0 is returned.
 *
 * if back ref isn't found, *ref_ret is set to the address where it
 * should be inserted, and -ENOENT is returned.
 *
 * if insert is true and there are too many inline back refs, the path
 * points to the extent item, and -EAGAIN is returned.
 *
 * NOTE: inline back refs are ordered in the same way that back ref
 *	 items in the tree are ordered.
 */
static noinline_for_stack
int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root,
				 struct btrfs_path *path,
				 struct btrfs_extent_inline_ref **ref_ret,
				 u64 bytenr, u64 num_bytes,
				 u64 parent, u64 root_objectid,
				 u64 owner, u64 offset, int insert)
{
	struct btrfs_key key;
	struct extent_buffer *leaf;
	struct btrfs_extent_item *ei;
	struct btrfs_extent_inline_ref *iref;
	u64 flags;
	u64 item_size;
	unsigned long ptr;
	unsigned long end;
	int extra_size;
	int type;
	int want;
	int ret;
	int err = 0;

	key.objectid = bytenr;
	key.type = BTRFS_EXTENT_ITEM_KEY;
	key.offset = num_bytes;

	want = extent_ref_type(parent, owner);
	if (insert) {
		extra_size = btrfs_extent_inline_ref_size(want);
		path->keep_locks = 1;
	} else
		extra_size = -1;
	ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1);
	if (ret < 0) {
		err = ret;
		goto out;
	}
	BUG_ON(ret);

	leaf = path->nodes[0];
	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
	if (item_size < sizeof(*ei)) {
		if (!insert) {
			err = -ENOENT;
			goto out;
		}
		ret = convert_extent_item_v0(trans, root, path, owner,
					     extra_size);
		if (ret < 0) {
			err = ret;
			goto out;
		}
		leaf = path->nodes[0];
		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
	}
#endif
	BUG_ON(item_size < sizeof(*ei));

	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
	flags = btrfs_extent_flags(leaf, ei);

	ptr = (unsigned long)(ei + 1);
	end = (unsigned long)ei + item_size;

	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
		ptr += sizeof(struct btrfs_tree_block_info);
		BUG_ON(ptr > end);
	} else {
		BUG_ON(!(flags & BTRFS_EXTENT_FLAG_DATA));
	}

	err = -ENOENT;
	while (1) {
		if (ptr >= end) {
			WARN_ON(ptr > end);
			break;
		}
		iref = (struct btrfs_extent_inline_ref *)ptr;
		type = btrfs_extent_inline_ref_type(leaf, iref);
		if (want < type)
			break;
		if (want > type) {
			ptr += btrfs_extent_inline_ref_size(type);
			continue;
		}

		if (type == BTRFS_EXTENT_DATA_REF_KEY) {
			struct btrfs_extent_data_ref *dref;
			dref = (struct btrfs_extent_data_ref *)(&iref->offset);
			if (match_extent_data_ref(leaf, dref, root_objectid,
						  owner, offset)) {
				err = 0;
				break;
			}
			if (hash_extent_data_ref_item(leaf, dref) <
			    hash_extent_data_ref(root_objectid, owner, offset))
				break;
		} else {
			u64 ref_offset;
			ref_offset = btrfs_extent_inline_ref_offset(leaf, iref);
			if (parent > 0) {
				if (parent == ref_offset) {
					err = 0;
					break;
				}
				if (ref_offset < parent)
					break;
			} else {
				if (root_objectid == ref_offset) {
					err = 0;
					break;
				}
				if (ref_offset < root_objectid)
					break;
			}
		}
		ptr += btrfs_extent_inline_ref_size(type);
	}
	if (err == -ENOENT && insert) {
		if (item_size + extra_size >=
		    BTRFS_MAX_EXTENT_ITEM_SIZE(root)) {
			err = -EAGAIN;
			goto out;
		}
		/*
		 * To add new inline back ref, we have to make sure
		 * there is no corresponding back ref item.
		 * For simplicity, we just do not add new inline back
		 * ref if there is any kind of item for this block
		 */
		if (find_next_key(path, 0, &key) == 0 &&
		    key.objectid == bytenr &&
		    key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) {
			err = -EAGAIN;
			goto out;
		}
	}
	*ref_ret = (struct btrfs_extent_inline_ref *)ptr;
out:
	if (insert) {
		path->keep_locks = 0;
		btrfs_unlock_up_safe(path, 1);
	}
	return err;
}

/*
 * helper to add new inline back ref
 */
static noinline_for_stack
int setup_inline_extent_backref(struct btrfs_trans_handle *trans,
				struct btrfs_root *root,
				struct btrfs_path *path,
				struct btrfs_extent_inline_ref *iref,
				u64 parent, u64 root_objectid,
				u64 owner, u64 offset, int refs_to_add,
				struct btrfs_delayed_extent_op *extent_op)
{
	struct extent_buffer *leaf;
	struct btrfs_extent_item *ei;
	unsigned long ptr;
	unsigned long end;
	unsigned long item_offset;
	u64 refs;
	int size;
	int type;
	int ret;

	leaf = path->nodes[0];
	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
	item_offset = (unsigned long)iref - (unsigned long)ei;

	type = extent_ref_type(parent, owner);
	size = btrfs_extent_inline_ref_size(type);

	ret = btrfs_extend_item(trans, root, path, size);
	BUG_ON(ret);

	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
	refs = btrfs_extent_refs(leaf, ei);
	refs += refs_to_add;
	btrfs_set_extent_refs(leaf, ei, refs);
	if (extent_op)
		__run_delayed_extent_op(extent_op, leaf, ei);

	ptr = (unsigned long)ei + item_offset;
	end = (unsigned long)ei + btrfs_item_size_nr(leaf, path->slots[0]);
	if (ptr < end - size)
		memmove_extent_buffer(leaf, ptr + size, ptr,
				      end - size - ptr);

	iref = (struct btrfs_extent_inline_ref *)ptr;
	btrfs_set_extent_inline_ref_type(leaf, iref, type);
	if (type == BTRFS_EXTENT_DATA_REF_KEY) {
		struct btrfs_extent_data_ref *dref;
		dref = (struct btrfs_extent_data_ref *)(&iref->offset);
		btrfs_set_extent_data_ref_root(leaf, dref, root_objectid);
		btrfs_set_extent_data_ref_objectid(leaf, dref, owner);
		btrfs_set_extent_data_ref_offset(leaf, dref, offset);
		btrfs_set_extent_data_ref_count(leaf, dref, refs_to_add);
	} else if (type == BTRFS_SHARED_DATA_REF_KEY) {
		struct btrfs_shared_data_ref *sref;
		sref = (struct btrfs_shared_data_ref *)(iref + 1);
		btrfs_set_shared_data_ref_count(leaf, sref, refs_to_add);
		btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
	} else if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
		btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
	} else {
		btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
	}
	btrfs_mark_buffer_dirty(leaf);
	return 0;
}

static int lookup_extent_backref(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root,
				 struct btrfs_path *path,
				 struct btrfs_extent_inline_ref **ref_ret,
				 u64 bytenr, u64 num_bytes, u64 parent,
				 u64 root_objectid, u64 owner, u64 offset)
{
	int ret;

	ret = lookup_inline_extent_backref(trans, root, path, ref_ret,
					   bytenr, num_bytes, parent,
					   root_objectid, owner, offset, 0);
	if (ret != -ENOENT)
		return ret;

	btrfs_release_path(root, path);
	*ref_ret = NULL;

	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
		ret = lookup_tree_block_ref(trans, root, path, bytenr, parent,
					    root_objectid);
	} else {
		ret = lookup_extent_data_ref(trans, root, path, bytenr, parent,
					     root_objectid, owner, offset);
	}
	return ret;
}

/*
 * helper to update/remove inline back ref
 */
static noinline_for_stack
int update_inline_extent_backref(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root,
				 struct btrfs_path *path,
				 struct btrfs_extent_inline_ref *iref,
				 int refs_to_mod,
				 struct btrfs_delayed_extent_op *extent_op)
{
	struct extent_buffer *leaf;
	struct btrfs_extent_item *ei;
	struct btrfs_extent_data_ref *dref = NULL;
	struct btrfs_shared_data_ref *sref = NULL;
	unsigned long ptr;
	unsigned long end;
	u32 item_size;
	int size;
	int type;
	int ret;
	u64 refs;

	leaf = path->nodes[0];
	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
	refs = btrfs_extent_refs(leaf, ei);
	WARN_ON(refs_to_mod < 0 && refs + refs_to_mod <= 0);
	refs += refs_to_mod;
	btrfs_set_extent_refs(leaf, ei, refs);
	if (extent_op)
		__run_delayed_extent_op(extent_op, leaf, ei);

	type = btrfs_extent_inline_ref_type(leaf, iref);

	if (type == BTRFS_EXTENT_DATA_REF_KEY) {
		dref = (struct btrfs_extent_data_ref *)(&iref->offset);
		refs = btrfs_extent_data_ref_count(leaf, dref);
	} else if (type == BTRFS_SHARED_DATA_REF_KEY) {
		sref = (struct btrfs_shared_data_ref *)(iref + 1);
		refs = btrfs_shared_data_ref_count(leaf, sref);
	} else {
		refs = 1;
		BUG_ON(refs_to_mod != -1);
	}

	BUG_ON(refs_to_mod < 0 && refs < -refs_to_mod);
	refs += refs_to_mod;

	if (refs > 0) {
		if (type == BTRFS_EXTENT_DATA_REF_KEY)
			btrfs_set_extent_data_ref_count(leaf, dref, refs);
		else
			btrfs_set_shared_data_ref_count(leaf, sref, refs);
	} else {
		size = btrfs_extent_inline_ref_size(type);
		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
		ptr = (unsigned long)iref;
		end = (unsigned long)ei + item_size;
		if (ptr + size < end)
			memmove_extent_buffer(leaf, ptr, ptr + size,
					      end - ptr - size);
		item_size -= size;
		ret = btrfs_truncate_item(trans, root, path, item_size, 1);
		BUG_ON(ret);
	}
	btrfs_mark_buffer_dirty(leaf);
	return 0;
}

static noinline_for_stack
int insert_inline_extent_backref(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root,
				 struct btrfs_path *path,
				 u64 bytenr, u64 num_bytes, u64 parent,
				 u64 root_objectid, u64 owner,
				 u64 offset, int refs_to_add,
				 struct btrfs_delayed_extent_op *extent_op)
{
	struct btrfs_extent_inline_ref *iref;
	int ret;

	ret = lookup_inline_extent_backref(trans, root, path, &iref,
					   bytenr, num_bytes, parent,
					   root_objectid, owner, offset, 1);
	if (ret == 0) {
		BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID);
		ret = update_inline_extent_backref(trans, root, path, iref,
						   refs_to_add, extent_op);
	} else if (ret == -ENOENT) {
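		/* no matching inline ref exists; iref points at the insert position */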
1713 ret = setup_inline_extent_backref(trans, root, path, iref, 1714 parent, root_objectid, 1715 owner, offset, refs_to_add, 1716 extent_op); 1717 } 1718 return ret; 1719 } 1720 1721 static int insert_extent_backref(struct btrfs_trans_handle *trans, 1722 struct btrfs_root *root, 1723 struct btrfs_path *path, 1724 u64 bytenr, u64 parent, u64 root_objectid, 1725 u64 owner, u64 offset, int refs_to_add) 1726 { 1727 int ret; 1728 if (owner < BTRFS_FIRST_FREE_OBJECTID) { 1729 BUG_ON(refs_to_add != 1); 1730 ret = insert_tree_block_ref(trans, root, path, bytenr, 1731 parent, root_objectid); 1732 } else { 1733 ret = insert_extent_data_ref(trans, root, path, bytenr, 1734 parent, root_objectid, 1735 owner, offset, refs_to_add); 1736 } 1737 return ret; 1738 } 1739 1740 static int remove_extent_backref(struct btrfs_trans_handle *trans, 1741 struct btrfs_root *root, 1742 struct btrfs_path *path, 1743 struct btrfs_extent_inline_ref *iref, 1744 int refs_to_drop, int is_data) 1745 { 1746 int ret; 1747 1748 BUG_ON(!is_data && refs_to_drop != 1); 1749 if (iref) { 1750 ret = update_inline_extent_backref(trans, root, path, iref, 1751 -refs_to_drop, NULL); 1752 } else if (is_data) { 1753 ret = remove_extent_data_ref(trans, root, path, refs_to_drop); 1754 } else { 1755 ret = btrfs_del_item(trans, root, path); 1756 } 1757 return ret; 1758 } 1759 1760 static int btrfs_issue_discard(struct block_device *bdev, 1761 u64 start, u64 len) 1762 { 1763 return blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_NOFS, 0); 1764 } 1765 1766 static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, 1767 u64 num_bytes, u64 *actual_bytes) 1768 { 1769 int ret; 1770 u64 discarded_bytes = 0; 1771 struct btrfs_multi_bio *multi = NULL; 1772 1773 1774 /* Tell the block device(s) that the sectors can be discarded */ 1775 ret = btrfs_map_block(&root->fs_info->mapping_tree, REQ_DISCARD, 1776 bytenr, &num_bytes, &multi, 0); 1777 if (!ret) { 1778 struct btrfs_bio_stripe *stripe = multi->stripes; 1779 int i; 1780 1781 1782 for (i = 0; i < multi->num_stripes; i++, stripe++) { 1783 ret = btrfs_issue_discard(stripe->dev->bdev, 1784 stripe->physical, 1785 stripe->length); 1786 if (!ret) 1787 discarded_bytes += stripe->length; 1788 else if (ret != -EOPNOTSUPP) 1789 break; 1790 } 1791 kfree(multi); 1792 } 1793 if (discarded_bytes && ret == -EOPNOTSUPP) 1794 ret = 0; 1795 1796 if (actual_bytes) 1797 *actual_bytes = discarded_bytes; 1798 1799 1800 return ret; 1801 } 1802 1803 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, 1804 struct btrfs_root *root, 1805 u64 bytenr, u64 num_bytes, u64 parent, 1806 u64 root_objectid, u64 owner, u64 offset) 1807 { 1808 int ret; 1809 BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID && 1810 root_objectid == BTRFS_TREE_LOG_OBJECTID); 1811 1812 if (owner < BTRFS_FIRST_FREE_OBJECTID) { 1813 ret = btrfs_add_delayed_tree_ref(trans, bytenr, num_bytes, 1814 parent, root_objectid, (int)owner, 1815 BTRFS_ADD_DELAYED_REF, NULL); 1816 } else { 1817 ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes, 1818 parent, root_objectid, owner, offset, 1819 BTRFS_ADD_DELAYED_REF, NULL); 1820 } 1821 return ret; 1822 } 1823 1824 static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, 1825 struct btrfs_root *root, 1826 u64 bytenr, u64 num_bytes, 1827 u64 parent, u64 root_objectid, 1828 u64 owner, u64 offset, int refs_to_add, 1829 struct btrfs_delayed_extent_op *extent_op) 1830 { 1831 struct btrfs_path *path; 1832 struct extent_buffer *leaf; 1833 struct btrfs_extent_item *item; 1834 u64 refs; 1835 int ret; 
1836 int err = 0; 1837 1838 path = btrfs_alloc_path(); 1839 if (!path) 1840 return -ENOMEM; 1841 1842 path->reada = 1; 1843 path->leave_spinning = 1; 1844 /* this will setup the path even if it fails to insert the back ref */ 1845 ret = insert_inline_extent_backref(trans, root->fs_info->extent_root, 1846 path, bytenr, num_bytes, parent, 1847 root_objectid, owner, offset, 1848 refs_to_add, extent_op); 1849 if (ret == 0) 1850 goto out; 1851 1852 if (ret != -EAGAIN) { 1853 err = ret; 1854 goto out; 1855 } 1856 1857 leaf = path->nodes[0]; 1858 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 1859 refs = btrfs_extent_refs(leaf, item); 1860 btrfs_set_extent_refs(leaf, item, refs + refs_to_add); 1861 if (extent_op) 1862 __run_delayed_extent_op(extent_op, leaf, item); 1863 1864 btrfs_mark_buffer_dirty(leaf); 1865 btrfs_release_path(root->fs_info->extent_root, path); 1866 1867 path->reada = 1; 1868 path->leave_spinning = 1; 1869 1870 /* now insert the actual backref */ 1871 ret = insert_extent_backref(trans, root->fs_info->extent_root, 1872 path, bytenr, parent, root_objectid, 1873 owner, offset, refs_to_add); 1874 BUG_ON(ret); 1875 out: 1876 btrfs_free_path(path); 1877 return err; 1878 } 1879 1880 static int run_delayed_data_ref(struct btrfs_trans_handle *trans, 1881 struct btrfs_root *root, 1882 struct btrfs_delayed_ref_node *node, 1883 struct btrfs_delayed_extent_op *extent_op, 1884 int insert_reserved) 1885 { 1886 int ret = 0; 1887 struct btrfs_delayed_data_ref *ref; 1888 struct btrfs_key ins; 1889 u64 parent = 0; 1890 u64 ref_root = 0; 1891 u64 flags = 0; 1892 1893 ins.objectid = node->bytenr; 1894 ins.offset = node->num_bytes; 1895 ins.type = BTRFS_EXTENT_ITEM_KEY; 1896 1897 ref = btrfs_delayed_node_to_data_ref(node); 1898 if (node->type == BTRFS_SHARED_DATA_REF_KEY) 1899 parent = ref->parent; 1900 else 1901 ref_root = ref->root; 1902 1903 if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) { 1904 if (extent_op) { 1905 BUG_ON(extent_op->update_key); 1906 flags |= extent_op->flags_to_set; 1907 } 1908 ret = alloc_reserved_file_extent(trans, root, 1909 parent, ref_root, flags, 1910 ref->objectid, ref->offset, 1911 &ins, node->ref_mod); 1912 } else if (node->action == BTRFS_ADD_DELAYED_REF) { 1913 ret = __btrfs_inc_extent_ref(trans, root, node->bytenr, 1914 node->num_bytes, parent, 1915 ref_root, ref->objectid, 1916 ref->offset, node->ref_mod, 1917 extent_op); 1918 } else if (node->action == BTRFS_DROP_DELAYED_REF) { 1919 ret = __btrfs_free_extent(trans, root, node->bytenr, 1920 node->num_bytes, parent, 1921 ref_root, ref->objectid, 1922 ref->offset, node->ref_mod, 1923 extent_op); 1924 } else { 1925 BUG(); 1926 } 1927 return ret; 1928 } 1929 1930 static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op, 1931 struct extent_buffer *leaf, 1932 struct btrfs_extent_item *ei) 1933 { 1934 u64 flags = btrfs_extent_flags(leaf, ei); 1935 if (extent_op->update_flags) { 1936 flags |= extent_op->flags_to_set; 1937 btrfs_set_extent_flags(leaf, ei, flags); 1938 } 1939 1940 if (extent_op->update_key) { 1941 struct btrfs_tree_block_info *bi; 1942 BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)); 1943 bi = (struct btrfs_tree_block_info *)(ei + 1); 1944 btrfs_set_tree_block_key(leaf, bi, &extent_op->key); 1945 } 1946 } 1947 1948 static int run_delayed_extent_op(struct btrfs_trans_handle *trans, 1949 struct btrfs_root *root, 1950 struct btrfs_delayed_ref_node *node, 1951 struct btrfs_delayed_extent_op *extent_op) 1952 { 1953 struct btrfs_key key; 1954 struct 
btrfs_path *path; 1955 struct btrfs_extent_item *ei; 1956 struct extent_buffer *leaf; 1957 u32 item_size; 1958 int ret; 1959 int err = 0; 1960 1961 path = btrfs_alloc_path(); 1962 if (!path) 1963 return -ENOMEM; 1964 1965 key.objectid = node->bytenr; 1966 key.type = BTRFS_EXTENT_ITEM_KEY; 1967 key.offset = node->num_bytes; 1968 1969 path->reada = 1; 1970 path->leave_spinning = 1; 1971 ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, 1972 path, 0, 1); 1973 if (ret < 0) { 1974 err = ret; 1975 goto out; 1976 } 1977 if (ret > 0) { 1978 err = -EIO; 1979 goto out; 1980 } 1981 1982 leaf = path->nodes[0]; 1983 item_size = btrfs_item_size_nr(leaf, path->slots[0]); 1984 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 1985 if (item_size < sizeof(*ei)) { 1986 ret = convert_extent_item_v0(trans, root->fs_info->extent_root, 1987 path, (u64)-1, 0); 1988 if (ret < 0) { 1989 err = ret; 1990 goto out; 1991 } 1992 leaf = path->nodes[0]; 1993 item_size = btrfs_item_size_nr(leaf, path->slots[0]); 1994 } 1995 #endif 1996 BUG_ON(item_size < sizeof(*ei)); 1997 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 1998 __run_delayed_extent_op(extent_op, leaf, ei); 1999 2000 btrfs_mark_buffer_dirty(leaf); 2001 out: 2002 btrfs_free_path(path); 2003 return err; 2004 } 2005 2006 static int run_delayed_tree_ref(struct btrfs_trans_handle *trans, 2007 struct btrfs_root *root, 2008 struct btrfs_delayed_ref_node *node, 2009 struct btrfs_delayed_extent_op *extent_op, 2010 int insert_reserved) 2011 { 2012 int ret = 0; 2013 struct btrfs_delayed_tree_ref *ref; 2014 struct btrfs_key ins; 2015 u64 parent = 0; 2016 u64 ref_root = 0; 2017 2018 ins.objectid = node->bytenr; 2019 ins.offset = node->num_bytes; 2020 ins.type = BTRFS_EXTENT_ITEM_KEY; 2021 2022 ref = btrfs_delayed_node_to_tree_ref(node); 2023 if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) 2024 parent = ref->parent; 2025 else 2026 ref_root = ref->root; 2027 2028 BUG_ON(node->ref_mod != 1); 2029 if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) { 2030 BUG_ON(!extent_op || !extent_op->update_flags || 2031 !extent_op->update_key); 2032 ret = alloc_reserved_tree_block(trans, root, 2033 parent, ref_root, 2034 extent_op->flags_to_set, 2035 &extent_op->key, 2036 ref->level, &ins); 2037 } else if (node->action == BTRFS_ADD_DELAYED_REF) { 2038 ret = __btrfs_inc_extent_ref(trans, root, node->bytenr, 2039 node->num_bytes, parent, ref_root, 2040 ref->level, 0, 1, extent_op); 2041 } else if (node->action == BTRFS_DROP_DELAYED_REF) { 2042 ret = __btrfs_free_extent(trans, root, node->bytenr, 2043 node->num_bytes, parent, ref_root, 2044 ref->level, 0, 1, extent_op); 2045 } else { 2046 BUG(); 2047 } 2048 return ret; 2049 } 2050 2051 /* helper function to actually process a single delayed ref entry */ 2052 static int run_one_delayed_ref(struct btrfs_trans_handle *trans, 2053 struct btrfs_root *root, 2054 struct btrfs_delayed_ref_node *node, 2055 struct btrfs_delayed_extent_op *extent_op, 2056 int insert_reserved) 2057 { 2058 int ret; 2059 if (btrfs_delayed_ref_is_head(node)) { 2060 struct btrfs_delayed_ref_head *head; 2061 /* 2062 * we've hit the end of the chain and we were supposed 2063 * to insert this extent into the tree. 
But, it got 2064 * deleted before we ever needed to insert it, so all 2065 * we have to do is clean up the accounting 2066 */ 2067 BUG_ON(extent_op); 2068 head = btrfs_delayed_node_to_head(node); 2069 if (insert_reserved) { 2070 btrfs_pin_extent(root, node->bytenr, 2071 node->num_bytes, 1); 2072 if (head->is_data) { 2073 ret = btrfs_del_csums(trans, root, 2074 node->bytenr, 2075 node->num_bytes); 2076 BUG_ON(ret); 2077 } 2078 } 2079 mutex_unlock(&head->mutex); 2080 return 0; 2081 } 2082 2083 if (node->type == BTRFS_TREE_BLOCK_REF_KEY || 2084 node->type == BTRFS_SHARED_BLOCK_REF_KEY) 2085 ret = run_delayed_tree_ref(trans, root, node, extent_op, 2086 insert_reserved); 2087 else if (node->type == BTRFS_EXTENT_DATA_REF_KEY || 2088 node->type == BTRFS_SHARED_DATA_REF_KEY) 2089 ret = run_delayed_data_ref(trans, root, node, extent_op, 2090 insert_reserved); 2091 else 2092 BUG(); 2093 return ret; 2094 } 2095 2096 static noinline struct btrfs_delayed_ref_node * 2097 select_delayed_ref(struct btrfs_delayed_ref_head *head) 2098 { 2099 struct rb_node *node; 2100 struct btrfs_delayed_ref_node *ref; 2101 int action = BTRFS_ADD_DELAYED_REF; 2102 again: 2103 /* 2104 * select delayed ref of type BTRFS_ADD_DELAYED_REF first. 2105 * this prevents ref count from going down to zero when 2106 * there still are pending delayed ref. 2107 */ 2108 node = rb_prev(&head->node.rb_node); 2109 while (1) { 2110 if (!node) 2111 break; 2112 ref = rb_entry(node, struct btrfs_delayed_ref_node, 2113 rb_node); 2114 if (ref->bytenr != head->node.bytenr) 2115 break; 2116 if (ref->action == action) 2117 return ref; 2118 node = rb_prev(node); 2119 } 2120 if (action == BTRFS_ADD_DELAYED_REF) { 2121 action = BTRFS_DROP_DELAYED_REF; 2122 goto again; 2123 } 2124 return NULL; 2125 } 2126 2127 static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, 2128 struct btrfs_root *root, 2129 struct list_head *cluster) 2130 { 2131 struct btrfs_delayed_ref_root *delayed_refs; 2132 struct btrfs_delayed_ref_node *ref; 2133 struct btrfs_delayed_ref_head *locked_ref = NULL; 2134 struct btrfs_delayed_extent_op *extent_op; 2135 int ret; 2136 int count = 0; 2137 int must_insert_reserved = 0; 2138 2139 delayed_refs = &trans->transaction->delayed_refs; 2140 while (1) { 2141 if (!locked_ref) { 2142 /* pick a new head ref from the cluster list */ 2143 if (list_empty(cluster)) 2144 break; 2145 2146 locked_ref = list_entry(cluster->next, 2147 struct btrfs_delayed_ref_head, cluster); 2148 2149 /* grab the lock that says we are going to process 2150 * all the refs for this head */ 2151 ret = btrfs_delayed_ref_lock(trans, locked_ref); 2152 2153 /* 2154 * we may have dropped the spin lock to get the head 2155 * mutex lock, and that might have given someone else 2156 * time to free the head. If that's true, it has been 2157 * removed from our list and we can move on. 2158 */ 2159 if (ret == -EAGAIN) { 2160 locked_ref = NULL; 2161 count++; 2162 continue; 2163 } 2164 } 2165 2166 /* 2167 * record the must insert reserved flag before we 2168 * drop the spin lock. 
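 * (must_insert_reserved means the extent was reserved at allocation time but its extent item has not been inserted yet; whoever runs the head ref either inserts the item via alloc_reserved_file_extent / alloc_reserved_tree_block, or, if every ref was dropped again, pins the range (and drops csums for data extents) as run_one_delayed_ref above does.)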
2169 */ 2170 must_insert_reserved = locked_ref->must_insert_reserved; 2171 locked_ref->must_insert_reserved = 0; 2172 2173 extent_op = locked_ref->extent_op; 2174 locked_ref->extent_op = NULL; 2175 2176 /* 2177 * locked_ref is the head node, so we have to go one 2178 * node back for any delayed ref updates 2179 */ 2180 ref = select_delayed_ref(locked_ref); 2181 if (!ref) { 2182 /* All delayed refs have been processed, Go ahead 2183 * and send the head node to run_one_delayed_ref, 2184 * so that any accounting fixes can happen 2185 */ 2186 ref = &locked_ref->node; 2187 2188 if (extent_op && must_insert_reserved) { 2189 kfree(extent_op); 2190 extent_op = NULL; 2191 } 2192 2193 if (extent_op) { 2194 spin_unlock(&delayed_refs->lock); 2195 2196 ret = run_delayed_extent_op(trans, root, 2197 ref, extent_op); 2198 BUG_ON(ret); 2199 kfree(extent_op); 2200 2201 cond_resched(); 2202 spin_lock(&delayed_refs->lock); 2203 continue; 2204 } 2205 2206 list_del_init(&locked_ref->cluster); 2207 locked_ref = NULL; 2208 } 2209 2210 ref->in_tree = 0; 2211 rb_erase(&ref->rb_node, &delayed_refs->root); 2212 delayed_refs->num_entries--; 2213 2214 spin_unlock(&delayed_refs->lock); 2215 2216 ret = run_one_delayed_ref(trans, root, ref, extent_op, 2217 must_insert_reserved); 2218 BUG_ON(ret); 2219 2220 btrfs_put_delayed_ref(ref); 2221 kfree(extent_op); 2222 count++; 2223 2224 cond_resched(); 2225 spin_lock(&delayed_refs->lock); 2226 } 2227 return count; 2228 } 2229 2230 /* 2231 * this starts processing the delayed reference count updates and 2232 * extent insertions we have queued up so far. count can be 2233 * 0, which means to process everything in the tree at the start 2234 * of the run (but not newly added entries), or it can be some target 2235 * number you'd like to process. 2236 */ 2237 int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, 2238 struct btrfs_root *root, unsigned long count) 2239 { 2240 struct rb_node *node; 2241 struct btrfs_delayed_ref_root *delayed_refs; 2242 struct btrfs_delayed_ref_node *ref; 2243 struct list_head cluster; 2244 int ret; 2245 int run_all = count == (unsigned long)-1; 2246 int run_most = 0; 2247 2248 if (root == root->fs_info->extent_root) 2249 root = root->fs_info->tree_root; 2250 2251 delayed_refs = &trans->transaction->delayed_refs; 2252 INIT_LIST_HEAD(&cluster); 2253 again: 2254 spin_lock(&delayed_refs->lock); 2255 if (count == 0) { 2256 count = delayed_refs->num_entries * 2; 2257 run_most = 1; 2258 } 2259 while (1) { 2260 if (!(run_all || run_most) && 2261 delayed_refs->num_heads_ready < 64) 2262 break; 2263 2264 /* 2265 * go find something we can process in the rbtree. 
We start at 2266 * the beginning of the tree, and then build a cluster 2267 * of refs to process starting at the first one we are able to 2268 * lock 2269 */ 2270 ret = btrfs_find_ref_cluster(trans, &cluster, 2271 delayed_refs->run_delayed_start); 2272 if (ret) 2273 break; 2274 2275 ret = run_clustered_refs(trans, root, &cluster); 2276 BUG_ON(ret < 0); 2277 2278 count -= min_t(unsigned long, ret, count); 2279 2280 if (count == 0) 2281 break; 2282 } 2283 2284 if (run_all) { 2285 node = rb_first(&delayed_refs->root); 2286 if (!node) 2287 goto out; 2288 count = (unsigned long)-1; 2289 2290 while (node) { 2291 ref = rb_entry(node, struct btrfs_delayed_ref_node, 2292 rb_node); 2293 if (btrfs_delayed_ref_is_head(ref)) { 2294 struct btrfs_delayed_ref_head *head; 2295 2296 head = btrfs_delayed_node_to_head(ref); 2297 atomic_inc(&ref->refs); 2298 2299 spin_unlock(&delayed_refs->lock); 2300 mutex_lock(&head->mutex); 2301 mutex_unlock(&head->mutex); 2302 2303 btrfs_put_delayed_ref(ref); 2304 cond_resched(); 2305 goto again; 2306 } 2307 node = rb_next(node); 2308 } 2309 spin_unlock(&delayed_refs->lock); 2310 schedule_timeout(1); 2311 goto again; 2312 } 2313 out: 2314 spin_unlock(&delayed_refs->lock); 2315 return 0; 2316 } 2317 2318 int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, 2319 struct btrfs_root *root, 2320 u64 bytenr, u64 num_bytes, u64 flags, 2321 int is_data) 2322 { 2323 struct btrfs_delayed_extent_op *extent_op; 2324 int ret; 2325 2326 extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS); 2327 if (!extent_op) 2328 return -ENOMEM; 2329 2330 extent_op->flags_to_set = flags; 2331 extent_op->update_flags = 1; 2332 extent_op->update_key = 0; 2333 extent_op->is_data = is_data ? 1 : 0; 2334 2335 ret = btrfs_add_delayed_extent_op(trans, bytenr, num_bytes, extent_op); 2336 if (ret) 2337 kfree(extent_op); 2338 return ret; 2339 } 2340 2341 static noinline int check_delayed_ref(struct btrfs_trans_handle *trans, 2342 struct btrfs_root *root, 2343 struct btrfs_path *path, 2344 u64 objectid, u64 offset, u64 bytenr) 2345 { 2346 struct btrfs_delayed_ref_head *head; 2347 struct btrfs_delayed_ref_node *ref; 2348 struct btrfs_delayed_data_ref *data_ref; 2349 struct btrfs_delayed_ref_root *delayed_refs; 2350 struct rb_node *node; 2351 int ret = 0; 2352 2353 ret = -ENOENT; 2354 delayed_refs = &trans->transaction->delayed_refs; 2355 spin_lock(&delayed_refs->lock); 2356 head = btrfs_find_delayed_ref_head(trans, bytenr); 2357 if (!head) 2358 goto out; 2359 2360 if (!mutex_trylock(&head->mutex)) { 2361 atomic_inc(&head->node.refs); 2362 spin_unlock(&delayed_refs->lock); 2363 2364 btrfs_release_path(root->fs_info->extent_root, path); 2365 2366 mutex_lock(&head->mutex); 2367 mutex_unlock(&head->mutex); 2368 btrfs_put_delayed_ref(&head->node); 2369 return -EAGAIN; 2370 } 2371 2372 node = rb_prev(&head->node.rb_node); 2373 if (!node) 2374 goto out_unlock; 2375 2376 ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); 2377 2378 if (ref->bytenr != bytenr) 2379 goto out_unlock; 2380 2381 ret = 1; 2382 if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) 2383 goto out_unlock; 2384 2385 data_ref = btrfs_delayed_node_to_data_ref(ref); 2386 2387 node = rb_prev(node); 2388 if (node) { 2389 ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); 2390 if (ref->bytenr == bytenr) 2391 goto out_unlock; 2392 } 2393 2394 if (data_ref->root != root->root_key.objectid || 2395 data_ref->objectid != objectid || data_ref->offset != offset) 2396 goto out_unlock; 2397 2398 ret = 0; 2399 out_unlock: 2400 
mutex_unlock(&head->mutex); 2401 out: 2402 spin_unlock(&delayed_refs->lock); 2403 return ret; 2404 } 2405 2406 static noinline int check_committed_ref(struct btrfs_trans_handle *trans, 2407 struct btrfs_root *root, 2408 struct btrfs_path *path, 2409 u64 objectid, u64 offset, u64 bytenr) 2410 { 2411 struct btrfs_root *extent_root = root->fs_info->extent_root; 2412 struct extent_buffer *leaf; 2413 struct btrfs_extent_data_ref *ref; 2414 struct btrfs_extent_inline_ref *iref; 2415 struct btrfs_extent_item *ei; 2416 struct btrfs_key key; 2417 u32 item_size; 2418 int ret; 2419 2420 key.objectid = bytenr; 2421 key.offset = (u64)-1; 2422 key.type = BTRFS_EXTENT_ITEM_KEY; 2423 2424 ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); 2425 if (ret < 0) 2426 goto out; 2427 BUG_ON(ret == 0); 2428 2429 ret = -ENOENT; 2430 if (path->slots[0] == 0) 2431 goto out; 2432 2433 path->slots[0]--; 2434 leaf = path->nodes[0]; 2435 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 2436 2437 if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY) 2438 goto out; 2439 2440 ret = 1; 2441 item_size = btrfs_item_size_nr(leaf, path->slots[0]); 2442 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 2443 if (item_size < sizeof(*ei)) { 2444 WARN_ON(item_size != sizeof(struct btrfs_extent_item_v0)); 2445 goto out; 2446 } 2447 #endif 2448 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 2449 2450 if (item_size != sizeof(*ei) + 2451 btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY)) 2452 goto out; 2453 2454 if (btrfs_extent_generation(leaf, ei) <= 2455 btrfs_root_last_snapshot(&root->root_item)) 2456 goto out; 2457 2458 iref = (struct btrfs_extent_inline_ref *)(ei + 1); 2459 if (btrfs_extent_inline_ref_type(leaf, iref) != 2460 BTRFS_EXTENT_DATA_REF_KEY) 2461 goto out; 2462 2463 ref = (struct btrfs_extent_data_ref *)(&iref->offset); 2464 if (btrfs_extent_refs(leaf, ei) != 2465 btrfs_extent_data_ref_count(leaf, ref) || 2466 btrfs_extent_data_ref_root(leaf, ref) != 2467 root->root_key.objectid || 2468 btrfs_extent_data_ref_objectid(leaf, ref) != objectid || 2469 btrfs_extent_data_ref_offset(leaf, ref) != offset) 2470 goto out; 2471 2472 ret = 0; 2473 out: 2474 return ret; 2475 } 2476 2477 int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans, 2478 struct btrfs_root *root, 2479 u64 objectid, u64 offset, u64 bytenr) 2480 { 2481 struct btrfs_path *path; 2482 int ret; 2483 int ret2; 2484 2485 path = btrfs_alloc_path(); 2486 if (!path) 2487 return -ENOENT; 2488 2489 do { 2490 ret = check_committed_ref(trans, root, path, objectid, 2491 offset, bytenr); 2492 if (ret && ret != -ENOENT) 2493 goto out; 2494 2495 ret2 = check_delayed_ref(trans, root, path, objectid, 2496 offset, bytenr); 2497 } while (ret2 == -EAGAIN); 2498 2499 if (ret2 && ret2 != -ENOENT) { 2500 ret = ret2; 2501 goto out; 2502 } 2503 2504 if (ret != -ENOENT || ret2 != -ENOENT) 2505 ret = 0; 2506 out: 2507 btrfs_free_path(path); 2508 if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID) 2509 WARN_ON(ret > 0); 2510 return ret; 2511 } 2512 2513 #if 0 2514 int btrfs_cache_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, 2515 struct extent_buffer *buf, u32 nr_extents) 2516 { 2517 struct btrfs_key key; 2518 struct btrfs_file_extent_item *fi; 2519 u64 root_gen; 2520 u32 nritems; 2521 int i; 2522 int level; 2523 int ret = 0; 2524 int shared = 0; 2525 2526 if (!root->ref_cows) 2527 return 0; 2528 2529 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) { 2530 shared = 0; 2531 root_gen = root->root_key.offset; 2532 } 
else { 2533 shared = 1; 2534 root_gen = trans->transid - 1; 2535 } 2536 2537 level = btrfs_header_level(buf); 2538 nritems = btrfs_header_nritems(buf); 2539 2540 if (level == 0) { 2541 struct btrfs_leaf_ref *ref; 2542 struct btrfs_extent_info *info; 2543 2544 ref = btrfs_alloc_leaf_ref(root, nr_extents); 2545 if (!ref) { 2546 ret = -ENOMEM; 2547 goto out; 2548 } 2549 2550 ref->root_gen = root_gen; 2551 ref->bytenr = buf->start; 2552 ref->owner = btrfs_header_owner(buf); 2553 ref->generation = btrfs_header_generation(buf); 2554 ref->nritems = nr_extents; 2555 info = ref->extents; 2556 2557 for (i = 0; nr_extents > 0 && i < nritems; i++) { 2558 u64 disk_bytenr; 2559 btrfs_item_key_to_cpu(buf, &key, i); 2560 if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY) 2561 continue; 2562 fi = btrfs_item_ptr(buf, i, 2563 struct btrfs_file_extent_item); 2564 if (btrfs_file_extent_type(buf, fi) == 2565 BTRFS_FILE_EXTENT_INLINE) 2566 continue; 2567 disk_bytenr = btrfs_file_extent_disk_bytenr(buf, fi); 2568 if (disk_bytenr == 0) 2569 continue; 2570 2571 info->bytenr = disk_bytenr; 2572 info->num_bytes = 2573 btrfs_file_extent_disk_num_bytes(buf, fi); 2574 info->objectid = key.objectid; 2575 info->offset = key.offset; 2576 info++; 2577 } 2578 2579 ret = btrfs_add_leaf_ref(root, ref, shared); 2580 if (ret == -EEXIST && shared) { 2581 struct btrfs_leaf_ref *old; 2582 old = btrfs_lookup_leaf_ref(root, ref->bytenr); 2583 BUG_ON(!old); 2584 btrfs_remove_leaf_ref(root, old); 2585 btrfs_free_leaf_ref(root, old); 2586 ret = btrfs_add_leaf_ref(root, ref, shared); 2587 } 2588 WARN_ON(ret); 2589 btrfs_free_leaf_ref(root, ref); 2590 } 2591 out: 2592 return ret; 2593 } 2594 2595 /* when a block goes through cow, we update the reference counts of 2596 * everything that block points to. The internal pointers of the block 2597 * can be in just about any order, and it is likely to have clusters of 2598 * things that are close together and clusters of things that are not. 2599 * 2600 * To help reduce the seeks that come with updating all of these reference 2601 * counts, sort them by byte number before actual updates are done. 2602 * 2603 * struct refsort is used to match byte number to slot in the btree block. 2604 * we sort based on the byte number and then use the slot to actually 2605 * find the item. 2606 * 2607 * struct refsort is smaller than struct btrfs_item and smaller than 2608 * struct btrfs_key_ptr. Since we're currently limited to the page size 2609 * for a btree block, there's no way for a kmalloc of refsorts for a 2610 * single node to be bigger than a page.
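 * (for reference, the sort itself would just be sort(sorted, nritems, sizeof(struct refsort), refsort_cmp, NULL) on a kmalloc'ed array; the name 'sorted' is illustrative only, and this whole block is compiled out.)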
2611 */ 2612 struct refsort { 2613 u64 bytenr; 2614 u32 slot; 2615 }; 2616 2617 /* 2618 * for passing into sort() 2619 */ 2620 static int refsort_cmp(const void *a_void, const void *b_void) 2621 { 2622 const struct refsort *a = a_void; 2623 const struct refsort *b = b_void; 2624 2625 if (a->bytenr < b->bytenr) 2626 return -1; 2627 if (a->bytenr > b->bytenr) 2628 return 1; 2629 return 0; 2630 } 2631 #endif 2632 2633 static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, 2634 struct btrfs_root *root, 2635 struct extent_buffer *buf, 2636 int full_backref, int inc) 2637 { 2638 u64 bytenr; 2639 u64 num_bytes; 2640 u64 parent; 2641 u64 ref_root; 2642 u32 nritems; 2643 struct btrfs_key key; 2644 struct btrfs_file_extent_item *fi; 2645 int i; 2646 int level; 2647 int ret = 0; 2648 int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *, 2649 u64, u64, u64, u64, u64, u64); 2650 2651 ref_root = btrfs_header_owner(buf); 2652 nritems = btrfs_header_nritems(buf); 2653 level = btrfs_header_level(buf); 2654 2655 if (!root->ref_cows && level == 0) 2656 return 0; 2657 2658 if (inc) 2659 process_func = btrfs_inc_extent_ref; 2660 else 2661 process_func = btrfs_free_extent; 2662 2663 if (full_backref) 2664 parent = buf->start; 2665 else 2666 parent = 0; 2667 2668 for (i = 0; i < nritems; i++) { 2669 if (level == 0) { 2670 btrfs_item_key_to_cpu(buf, &key, i); 2671 if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY) 2672 continue; 2673 fi = btrfs_item_ptr(buf, i, 2674 struct btrfs_file_extent_item); 2675 if (btrfs_file_extent_type(buf, fi) == 2676 BTRFS_FILE_EXTENT_INLINE) 2677 continue; 2678 bytenr = btrfs_file_extent_disk_bytenr(buf, fi); 2679 if (bytenr == 0) 2680 continue; 2681 2682 num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi); 2683 key.offset -= btrfs_file_extent_offset(buf, fi); 2684 ret = process_func(trans, root, bytenr, num_bytes, 2685 parent, ref_root, key.objectid, 2686 key.offset); 2687 if (ret) 2688 goto fail; 2689 } else { 2690 bytenr = btrfs_node_blockptr(buf, i); 2691 num_bytes = btrfs_level_size(root, level - 1); 2692 ret = process_func(trans, root, bytenr, num_bytes, 2693 parent, ref_root, level - 1, 0); 2694 if (ret) 2695 goto fail; 2696 } 2697 } 2698 return 0; 2699 fail: 2700 BUG(); 2701 return ret; 2702 } 2703 2704 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, 2705 struct extent_buffer *buf, int full_backref) 2706 { 2707 return __btrfs_mod_ref(trans, root, buf, full_backref, 1); 2708 } 2709 2710 int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, 2711 struct extent_buffer *buf, int full_backref) 2712 { 2713 return __btrfs_mod_ref(trans, root, buf, full_backref, 0); 2714 } 2715 2716 static int write_one_cache_group(struct btrfs_trans_handle *trans, 2717 struct btrfs_root *root, 2718 struct btrfs_path *path, 2719 struct btrfs_block_group_cache *cache) 2720 { 2721 int ret; 2722 struct btrfs_root *extent_root = root->fs_info->extent_root; 2723 unsigned long bi; 2724 struct extent_buffer *leaf; 2725 2726 ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1); 2727 if (ret < 0) 2728 goto fail; 2729 BUG_ON(ret); 2730 2731 leaf = path->nodes[0]; 2732 bi = btrfs_item_ptr_offset(leaf, path->slots[0]); 2733 write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item)); 2734 btrfs_mark_buffer_dirty(leaf); 2735 btrfs_release_path(extent_root, path); 2736 fail: 2737 if (ret) 2738 return ret; 2739 return 0; 2740 2741 } 2742 2743 static struct btrfs_block_group_cache * 2744 next_block_group(struct btrfs_root 
*root, 2745 struct btrfs_block_group_cache *cache) 2746 { 2747 struct rb_node *node; 2748 spin_lock(&root->fs_info->block_group_cache_lock); 2749 node = rb_next(&cache->cache_node); 2750 btrfs_put_block_group(cache); 2751 if (node) { 2752 cache = rb_entry(node, struct btrfs_block_group_cache, 2753 cache_node); 2754 btrfs_get_block_group(cache); 2755 } else 2756 cache = NULL; 2757 spin_unlock(&root->fs_info->block_group_cache_lock); 2758 return cache; 2759 } 2760 2761 static int cache_save_setup(struct btrfs_block_group_cache *block_group, 2762 struct btrfs_trans_handle *trans, 2763 struct btrfs_path *path) 2764 { 2765 struct btrfs_root *root = block_group->fs_info->tree_root; 2766 struct inode *inode = NULL; 2767 u64 alloc_hint = 0; 2768 int dcs = BTRFS_DC_ERROR; 2769 int num_pages = 0; 2770 int retries = 0; 2771 int ret = 0; 2772 2773 /* 2774 * If this block group is smaller than 100 megs don't bother caching the 2775 * block group. 2776 */ 2777 if (block_group->key.offset < (100 * 1024 * 1024)) { 2778 spin_lock(&block_group->lock); 2779 block_group->disk_cache_state = BTRFS_DC_WRITTEN; 2780 spin_unlock(&block_group->lock); 2781 return 0; 2782 } 2783 2784 again: 2785 inode = lookup_free_space_inode(root, block_group, path); 2786 if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) { 2787 ret = PTR_ERR(inode); 2788 btrfs_release_path(root, path); 2789 goto out; 2790 } 2791 2792 if (IS_ERR(inode)) { 2793 BUG_ON(retries); 2794 retries++; 2795 2796 if (block_group->ro) 2797 goto out_free; 2798 2799 ret = create_free_space_inode(root, trans, block_group, path); 2800 if (ret) 2801 goto out_free; 2802 goto again; 2803 } 2804 2805 /* 2806 * We want to set the generation to 0, that way if anything goes wrong 2807 * from here on out we know not to trust this cache when we load up next 2808 * time. 2809 */ 2810 BTRFS_I(inode)->generation = 0; 2811 ret = btrfs_update_inode(trans, root, inode); 2812 WARN_ON(ret); 2813 2814 if (i_size_read(inode) > 0) { 2815 ret = btrfs_truncate_free_space_cache(root, trans, path, 2816 inode); 2817 if (ret) 2818 goto out_put; 2819 } 2820 2821 spin_lock(&block_group->lock); 2822 if (block_group->cached != BTRFS_CACHE_FINISHED) { 2823 /* We're not cached, don't bother trying to write stuff out */ 2824 dcs = BTRFS_DC_WRITTEN; 2825 spin_unlock(&block_group->lock); 2826 goto out_put; 2827 } 2828 spin_unlock(&block_group->lock); 2829 2830 num_pages = (int)div64_u64(block_group->key.offset, 1024 * 1024 * 1024); 2831 if (!num_pages) 2832 num_pages = 1; 2833 2834 /* 2835 * Just to make absolutely sure we have enough space, we're going to 2836 * preallocate 12 pages worth of space for each block group. In 2837 * practice we ought to use at most 8, but we need extra space so we can 2838 * add our header and have a terminator between the extents and the 2839 * bitmaps. 
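 * (the code below actually reserves 16 pages per gigabyte of block group: e.g. a 1GB block group with 4K pages preallocates 64K for the cache file.)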
2840 */ 2841 num_pages *= 16; 2842 num_pages *= PAGE_CACHE_SIZE; 2843 2844 ret = btrfs_check_data_free_space(inode, num_pages); 2845 if (ret) 2846 goto out_put; 2847 2848 ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages, 2849 num_pages, num_pages, 2850 &alloc_hint); 2851 if (!ret) 2852 dcs = BTRFS_DC_SETUP; 2853 btrfs_free_reserved_data_space(inode, num_pages); 2854 out_put: 2855 iput(inode); 2856 out_free: 2857 btrfs_release_path(root, path); 2858 out: 2859 spin_lock(&block_group->lock); 2860 block_group->disk_cache_state = dcs; 2861 spin_unlock(&block_group->lock); 2862 2863 return ret; 2864 } 2865 2866 int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, 2867 struct btrfs_root *root) 2868 { 2869 struct btrfs_block_group_cache *cache; 2870 int err = 0; 2871 struct btrfs_path *path; 2872 u64 last = 0; 2873 2874 path = btrfs_alloc_path(); 2875 if (!path) 2876 return -ENOMEM; 2877 2878 again: 2879 while (1) { 2880 cache = btrfs_lookup_first_block_group(root->fs_info, last); 2881 while (cache) { 2882 if (cache->disk_cache_state == BTRFS_DC_CLEAR) 2883 break; 2884 cache = next_block_group(root, cache); 2885 } 2886 if (!cache) { 2887 if (last == 0) 2888 break; 2889 last = 0; 2890 continue; 2891 } 2892 err = cache_save_setup(cache, trans, path); 2893 last = cache->key.objectid + cache->key.offset; 2894 btrfs_put_block_group(cache); 2895 } 2896 2897 while (1) { 2898 if (last == 0) { 2899 err = btrfs_run_delayed_refs(trans, root, 2900 (unsigned long)-1); 2901 BUG_ON(err); 2902 } 2903 2904 cache = btrfs_lookup_first_block_group(root->fs_info, last); 2905 while (cache) { 2906 if (cache->disk_cache_state == BTRFS_DC_CLEAR) { 2907 btrfs_put_block_group(cache); 2908 goto again; 2909 } 2910 2911 if (cache->dirty) 2912 break; 2913 cache = next_block_group(root, cache); 2914 } 2915 if (!cache) { 2916 if (last == 0) 2917 break; 2918 last = 0; 2919 continue; 2920 } 2921 2922 if (cache->disk_cache_state == BTRFS_DC_SETUP) 2923 cache->disk_cache_state = BTRFS_DC_NEED_WRITE; 2924 cache->dirty = 0; 2925 last = cache->key.objectid + cache->key.offset; 2926 2927 err = write_one_cache_group(trans, root, path, cache); 2928 BUG_ON(err); 2929 btrfs_put_block_group(cache); 2930 } 2931 2932 while (1) { 2933 /* 2934 * I don't think this is needed since we're just marking our 2935 * preallocated extent as written, but just in case it can't 2936 * hurt. 2937 */ 2938 if (last == 0) { 2939 err = btrfs_run_delayed_refs(trans, root, 2940 (unsigned long)-1); 2941 BUG_ON(err); 2942 } 2943 2944 cache = btrfs_lookup_first_block_group(root->fs_info, last); 2945 while (cache) { 2946 /* 2947 * Really this shouldn't happen, but it could if we 2948 * couldn't write the entire preallocated extent and 2949 * splitting the extent resulted in a new block. 2950 */ 2951 if (cache->dirty) { 2952 btrfs_put_block_group(cache); 2953 goto again; 2954 } 2955 if (cache->disk_cache_state == BTRFS_DC_NEED_WRITE) 2956 break; 2957 cache = next_block_group(root, cache); 2958 } 2959 if (!cache) { 2960 if (last == 0) 2961 break; 2962 last = 0; 2963 continue; 2964 } 2965 2966 btrfs_write_out_cache(root, trans, cache, path); 2967 2968 /* 2969 * If we didn't have an error then the cache state is still 2970 * NEED_WRITE, so we can set it to WRITTEN. 
2971 */ 2972 if (cache->disk_cache_state == BTRFS_DC_NEED_WRITE) 2973 cache->disk_cache_state = BTRFS_DC_WRITTEN; 2974 last = cache->key.objectid + cache->key.offset; 2975 btrfs_put_block_group(cache); 2976 } 2977 2978 btrfs_free_path(path); 2979 return 0; 2980 } 2981 2982 int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr) 2983 { 2984 struct btrfs_block_group_cache *block_group; 2985 int readonly = 0; 2986 2987 block_group = btrfs_lookup_block_group(root->fs_info, bytenr); 2988 if (!block_group || block_group->ro) 2989 readonly = 1; 2990 if (block_group) 2991 btrfs_put_block_group(block_group); 2992 return readonly; 2993 } 2994 2995 static int update_space_info(struct btrfs_fs_info *info, u64 flags, 2996 u64 total_bytes, u64 bytes_used, 2997 struct btrfs_space_info **space_info) 2998 { 2999 struct btrfs_space_info *found; 3000 int i; 3001 int factor; 3002 3003 if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | 3004 BTRFS_BLOCK_GROUP_RAID10)) 3005 factor = 2; 3006 else 3007 factor = 1; 3008 3009 found = __find_space_info(info, flags); 3010 if (found) { 3011 spin_lock(&found->lock); 3012 found->total_bytes += total_bytes; 3013 found->disk_total += total_bytes * factor; 3014 found->bytes_used += bytes_used; 3015 found->disk_used += bytes_used * factor; 3016 found->full = 0; 3017 spin_unlock(&found->lock); 3018 *space_info = found; 3019 return 0; 3020 } 3021 found = kzalloc(sizeof(*found), GFP_NOFS); 3022 if (!found) 3023 return -ENOMEM; 3024 3025 for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) 3026 INIT_LIST_HEAD(&found->block_groups[i]); 3027 init_rwsem(&found->groups_sem); 3028 spin_lock_init(&found->lock); 3029 found->flags = flags & (BTRFS_BLOCK_GROUP_DATA | 3030 BTRFS_BLOCK_GROUP_SYSTEM | 3031 BTRFS_BLOCK_GROUP_METADATA); 3032 found->total_bytes = total_bytes; 3033 found->disk_total = total_bytes * factor; 3034 found->bytes_used = bytes_used; 3035 found->disk_used = bytes_used * factor; 3036 found->bytes_pinned = 0; 3037 found->bytes_reserved = 0; 3038 found->bytes_readonly = 0; 3039 found->bytes_may_use = 0; 3040 found->full = 0; 3041 found->force_alloc = CHUNK_ALLOC_NO_FORCE; 3042 found->chunk_alloc = 0; 3043 *space_info = found; 3044 list_add_rcu(&found->list, &info->space_info); 3045 atomic_set(&found->caching_threads, 0); 3046 return 0; 3047 } 3048 3049 static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) 3050 { 3051 u64 extra_flags = flags & (BTRFS_BLOCK_GROUP_RAID0 | 3052 BTRFS_BLOCK_GROUP_RAID1 | 3053 BTRFS_BLOCK_GROUP_RAID10 | 3054 BTRFS_BLOCK_GROUP_DUP); 3055 if (extra_flags) { 3056 if (flags & BTRFS_BLOCK_GROUP_DATA) 3057 fs_info->avail_data_alloc_bits |= extra_flags; 3058 if (flags & BTRFS_BLOCK_GROUP_METADATA) 3059 fs_info->avail_metadata_alloc_bits |= extra_flags; 3060 if (flags & BTRFS_BLOCK_GROUP_SYSTEM) 3061 fs_info->avail_system_alloc_bits |= extra_flags; 3062 } 3063 } 3064 3065 u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags) 3066 { 3067 /* 3068 * we add in the count of missing devices because we want 3069 * to make sure that any RAID levels on a degraded FS 3070 * continue to be honored. 
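 * (e.g. once a filesystem drops below four rw+missing devices the RAID10 bit is cleared here, and a single-device fs loses RAID1/RAID0, so the profile falls back to DUP or single.)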
3071 */ 3072 u64 num_devices = root->fs_info->fs_devices->rw_devices + 3073 root->fs_info->fs_devices->missing_devices; 3074 3075 if (num_devices == 1) 3076 flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0); 3077 if (num_devices < 4) 3078 flags &= ~BTRFS_BLOCK_GROUP_RAID10; 3079 3080 if ((flags & BTRFS_BLOCK_GROUP_DUP) && 3081 (flags & (BTRFS_BLOCK_GROUP_RAID1 | 3082 BTRFS_BLOCK_GROUP_RAID10))) { 3083 flags &= ~BTRFS_BLOCK_GROUP_DUP; 3084 } 3085 3086 if ((flags & BTRFS_BLOCK_GROUP_RAID1) && 3087 (flags & BTRFS_BLOCK_GROUP_RAID10)) { 3088 flags &= ~BTRFS_BLOCK_GROUP_RAID1; 3089 } 3090 3091 if ((flags & BTRFS_BLOCK_GROUP_RAID0) && 3092 ((flags & BTRFS_BLOCK_GROUP_RAID1) | 3093 (flags & BTRFS_BLOCK_GROUP_RAID10) | 3094 (flags & BTRFS_BLOCK_GROUP_DUP))) 3095 flags &= ~BTRFS_BLOCK_GROUP_RAID0; 3096 return flags; 3097 } 3098 3099 static u64 get_alloc_profile(struct btrfs_root *root, u64 flags) 3100 { 3101 if (flags & BTRFS_BLOCK_GROUP_DATA) 3102 flags |= root->fs_info->avail_data_alloc_bits & 3103 root->fs_info->data_alloc_profile; 3104 else if (flags & BTRFS_BLOCK_GROUP_SYSTEM) 3105 flags |= root->fs_info->avail_system_alloc_bits & 3106 root->fs_info->system_alloc_profile; 3107 else if (flags & BTRFS_BLOCK_GROUP_METADATA) 3108 flags |= root->fs_info->avail_metadata_alloc_bits & 3109 root->fs_info->metadata_alloc_profile; 3110 return btrfs_reduce_alloc_profile(root, flags); 3111 } 3112 3113 u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data) 3114 { 3115 u64 flags; 3116 3117 if (data) 3118 flags = BTRFS_BLOCK_GROUP_DATA; 3119 else if (root == root->fs_info->chunk_root) 3120 flags = BTRFS_BLOCK_GROUP_SYSTEM; 3121 else 3122 flags = BTRFS_BLOCK_GROUP_METADATA; 3123 3124 return get_alloc_profile(root, flags); 3125 } 3126 3127 void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode) 3128 { 3129 BTRFS_I(inode)->space_info = __find_space_info(root->fs_info, 3130 BTRFS_BLOCK_GROUP_DATA); 3131 } 3132 3133 /* 3134 * This will check the space that the inode allocates from to make sure we have 3135 * enough space for bytes. 3136 */ 3137 int btrfs_check_data_free_space(struct inode *inode, u64 bytes) 3138 { 3139 struct btrfs_space_info *data_sinfo; 3140 struct btrfs_root *root = BTRFS_I(inode)->root; 3141 u64 used; 3142 int ret = 0, committed = 0, alloc_chunk = 1; 3143 3144 /* make sure bytes are sectorsize aligned */ 3145 bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1); 3146 3147 if (root == root->fs_info->tree_root) { 3148 alloc_chunk = 0; 3149 committed = 1; 3150 } 3151 3152 data_sinfo = BTRFS_I(inode)->space_info; 3153 if (!data_sinfo) 3154 goto alloc; 3155 3156 again: 3157 /* make sure we have enough space to handle the data first */ 3158 spin_lock(&data_sinfo->lock); 3159 used = data_sinfo->bytes_used + data_sinfo->bytes_reserved + 3160 data_sinfo->bytes_pinned + data_sinfo->bytes_readonly + 3161 data_sinfo->bytes_may_use; 3162 3163 if (used + bytes > data_sinfo->total_bytes) { 3164 struct btrfs_trans_handle *trans; 3165 3166 /* 3167 * if we don't have enough free bytes in this space then we need 3168 * to alloc a new chunk. 
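 * (the code below sets force_alloc to CHUNK_ALLOC_FORCE before dropping the lock, so even though do_chunk_alloc is passed CHUNK_ALLOC_NO_FORCE it picks the forced mode back up from the space_info; the extra 2MB added to 'bytes' is just slack for the current write.)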
3169 */ 3170 if (!data_sinfo->full && alloc_chunk) { 3171 u64 alloc_target; 3172 3173 data_sinfo->force_alloc = CHUNK_ALLOC_FORCE; 3174 spin_unlock(&data_sinfo->lock); 3175 alloc: 3176 alloc_target = btrfs_get_alloc_profile(root, 1); 3177 trans = btrfs_join_transaction(root, 1); 3178 if (IS_ERR(trans)) 3179 return PTR_ERR(trans); 3180 3181 ret = do_chunk_alloc(trans, root->fs_info->extent_root, 3182 bytes + 2 * 1024 * 1024, 3183 alloc_target, 3184 CHUNK_ALLOC_NO_FORCE); 3185 btrfs_end_transaction(trans, root); 3186 if (ret < 0) { 3187 if (ret != -ENOSPC) 3188 return ret; 3189 else 3190 goto commit_trans; 3191 } 3192 3193 if (!data_sinfo) { 3194 btrfs_set_inode_space_info(root, inode); 3195 data_sinfo = BTRFS_I(inode)->space_info; 3196 } 3197 goto again; 3198 } 3199 spin_unlock(&data_sinfo->lock); 3200 3201 /* commit the current transaction and try again */ 3202 commit_trans: 3203 if (!committed && !root->fs_info->open_ioctl_trans) { 3204 committed = 1; 3205 trans = btrfs_join_transaction(root, 1); 3206 if (IS_ERR(trans)) 3207 return PTR_ERR(trans); 3208 ret = btrfs_commit_transaction(trans, root); 3209 if (ret) 3210 return ret; 3211 goto again; 3212 } 3213 3214 #if 0 /* I hope we never need this code again, just in case */ 3215 printk(KERN_ERR "no space left, need %llu, %llu bytes_used, " 3216 "%llu bytes_reserved, " "%llu bytes_pinned, " 3217 "%llu bytes_readonly, %llu may use %llu total\n", 3218 (unsigned long long)bytes, 3219 (unsigned long long)data_sinfo->bytes_used, 3220 (unsigned long long)data_sinfo->bytes_reserved, 3221 (unsigned long long)data_sinfo->bytes_pinned, 3222 (unsigned long long)data_sinfo->bytes_readonly, 3223 (unsigned long long)data_sinfo->bytes_may_use, 3224 (unsigned long long)data_sinfo->total_bytes); 3225 #endif 3226 return -ENOSPC; 3227 } 3228 data_sinfo->bytes_may_use += bytes; 3229 BTRFS_I(inode)->reserved_bytes += bytes; 3230 spin_unlock(&data_sinfo->lock); 3231 3232 return 0; 3233 } 3234 3235 /* 3236 * called when we are clearing an delalloc extent from the 3237 * inode's io_tree or there was an error for whatever reason 3238 * after calling btrfs_check_data_free_space 3239 */ 3240 void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes) 3241 { 3242 struct btrfs_root *root = BTRFS_I(inode)->root; 3243 struct btrfs_space_info *data_sinfo; 3244 3245 /* make sure bytes are sectorsize aligned */ 3246 bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1); 3247 3248 data_sinfo = BTRFS_I(inode)->space_info; 3249 spin_lock(&data_sinfo->lock); 3250 data_sinfo->bytes_may_use -= bytes; 3251 BTRFS_I(inode)->reserved_bytes -= bytes; 3252 spin_unlock(&data_sinfo->lock); 3253 } 3254 3255 static void force_metadata_allocation(struct btrfs_fs_info *info) 3256 { 3257 struct list_head *head = &info->space_info; 3258 struct btrfs_space_info *found; 3259 3260 rcu_read_lock(); 3261 list_for_each_entry_rcu(found, head, list) { 3262 if (found->flags & BTRFS_BLOCK_GROUP_METADATA) 3263 found->force_alloc = CHUNK_ALLOC_FORCE; 3264 } 3265 rcu_read_unlock(); 3266 } 3267 3268 static int should_alloc_chunk(struct btrfs_root *root, 3269 struct btrfs_space_info *sinfo, u64 alloc_bytes, 3270 int force) 3271 { 3272 u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly; 3273 u64 num_allocated = sinfo->bytes_used + sinfo->bytes_reserved; 3274 u64 thresh; 3275 3276 if (force == CHUNK_ALLOC_FORCE) 3277 return 1; 3278 3279 /* 3280 * in limited mode, we want to have some free space up to 3281 * about 1% of the FS size. 
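 * (div_factor_fine(thresh, 1) is 1% of the total fs bytes, clamped to at least 64MB; e.g. on a 1TB filesystem limited mode allocates another chunk whenever less than roughly 10GB is free inside the chunks this space_info already owns.)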
3282 */ 3283 if (force == CHUNK_ALLOC_LIMITED) { 3284 thresh = btrfs_super_total_bytes(&root->fs_info->super_copy); 3285 thresh = max_t(u64, 64 * 1024 * 1024, 3286 div_factor_fine(thresh, 1)); 3287 3288 if (num_bytes - num_allocated < thresh) 3289 return 1; 3290 } 3291 3292 /* 3293 * we have two similar checks here, one based on percentage 3294 * and one based on a hard number of 256MB. The idea 3295 * is that if we have a good amount of free 3296 * room, don't allocate a chunk. A good amount means 3297 * the chunks we have allocated are less than 80% utilized, 3298 * or more than 256MB are free 3299 */ 3300 if (num_allocated + alloc_bytes + 256 * 1024 * 1024 < num_bytes) 3301 return 0; 3302 3303 if (num_allocated + alloc_bytes < div_factor(num_bytes, 8)) 3304 return 0; 3305 3306 thresh = btrfs_super_total_bytes(&root->fs_info->super_copy); 3307 3308 /* 256MB or 5% of the FS */ 3309 thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 5)); 3310 3311 if (num_bytes > thresh && sinfo->bytes_used < div_factor(num_bytes, 3)) 3312 return 0; 3313 return 1; 3314 } 3315 3316 static int do_chunk_alloc(struct btrfs_trans_handle *trans, 3317 struct btrfs_root *extent_root, u64 alloc_bytes, 3318 u64 flags, int force) 3319 { 3320 struct btrfs_space_info *space_info; 3321 struct btrfs_fs_info *fs_info = extent_root->fs_info; 3322 int wait_for_alloc = 0; 3323 int ret = 0; 3324 3325 flags = btrfs_reduce_alloc_profile(extent_root, flags); 3326 3327 space_info = __find_space_info(extent_root->fs_info, flags); 3328 if (!space_info) { 3329 ret = update_space_info(extent_root->fs_info, flags, 3330 0, 0, &space_info); 3331 BUG_ON(ret); 3332 } 3333 BUG_ON(!space_info); 3334 3335 again: 3336 spin_lock(&space_info->lock); 3337 if (space_info->force_alloc) 3338 force = space_info->force_alloc; 3339 if (space_info->full) { 3340 spin_unlock(&space_info->lock); 3341 return 0; 3342 } 3343 3344 if (!should_alloc_chunk(extent_root, space_info, alloc_bytes, force)) { 3345 spin_unlock(&space_info->lock); 3346 return 0; 3347 } else if (space_info->chunk_alloc) { 3348 wait_for_alloc = 1; 3349 } else { 3350 space_info->chunk_alloc = 1; 3351 } 3352 3353 spin_unlock(&space_info->lock); 3354 3355 mutex_lock(&fs_info->chunk_mutex); 3356 3357 /* 3358 * The chunk_mutex is held throughout the entirety of a chunk 3359 * allocation, so once we've acquired the chunk_mutex we know that the 3360 * other guy is done and we need to recheck and see if we should 3361 * allocate. 3362 */ 3363 if (wait_for_alloc) { 3364 mutex_unlock(&fs_info->chunk_mutex); 3365 wait_for_alloc = 0; 3366 goto again; 3367 } 3368 3369 /* 3370 * If we have mixed data/metadata chunks we want to make sure we keep 3371 * allocating mixed chunks instead of individual chunks. 3372 */ 3373 if (btrfs_mixed_space_info(space_info)) 3374 flags |= (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA); 3375 3376 /* 3377 * if we're doing a data chunk, go ahead and make sure that 3378 * we keep a reasonable number of metadata chunks allocated in the 3379 * FS as well.
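 * (metadata_ratio comes from the metadata_ratio=N mount option; with N=8, for instance, every 8th data chunk allocation also forces a metadata chunk allocation.)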
3380 */ 3381 if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) { 3382 fs_info->data_chunk_allocations++; 3383 if (!(fs_info->data_chunk_allocations % 3384 fs_info->metadata_ratio)) 3385 force_metadata_allocation(fs_info); 3386 } 3387 3388 ret = btrfs_alloc_chunk(trans, extent_root, flags); 3389 spin_lock(&space_info->lock); 3390 if (ret) 3391 space_info->full = 1; 3392 else 3393 ret = 1; 3394 3395 space_info->force_alloc = CHUNK_ALLOC_NO_FORCE; 3396 space_info->chunk_alloc = 0; 3397 spin_unlock(&space_info->lock); 3398 mutex_unlock(&extent_root->fs_info->chunk_mutex); 3399 return ret; 3400 } 3401 3402 /* 3403 * shrink metadata reservation for delalloc 3404 */ 3405 static int shrink_delalloc(struct btrfs_trans_handle *trans, 3406 struct btrfs_root *root, u64 to_reclaim, int sync) 3407 { 3408 struct btrfs_block_rsv *block_rsv; 3409 struct btrfs_space_info *space_info; 3410 u64 reserved; 3411 u64 max_reclaim; 3412 u64 reclaimed = 0; 3413 long time_left; 3414 int nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT; 3415 int loops = 0; 3416 unsigned long progress; 3417 3418 block_rsv = &root->fs_info->delalloc_block_rsv; 3419 space_info = block_rsv->space_info; 3420 3421 smp_mb(); 3422 reserved = space_info->bytes_reserved; 3423 progress = space_info->reservation_progress; 3424 3425 if (reserved == 0) 3426 return 0; 3427 3428 max_reclaim = min(reserved, to_reclaim); 3429 3430 while (loops < 1024) { 3431 /* have the flusher threads jump in and do some IO */ 3432 smp_mb(); 3433 nr_pages = min_t(unsigned long, nr_pages, 3434 root->fs_info->delalloc_bytes >> PAGE_CACHE_SHIFT); 3435 writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages); 3436 3437 spin_lock(&space_info->lock); 3438 if (reserved > space_info->bytes_reserved) 3439 reclaimed += reserved - space_info->bytes_reserved; 3440 reserved = space_info->bytes_reserved; 3441 spin_unlock(&space_info->lock); 3442 3443 loops++; 3444 3445 if (reserved == 0 || reclaimed >= max_reclaim) 3446 break; 3447 3448 if (trans && trans->transaction->blocked) 3449 return -EAGAIN; 3450 3451 time_left = schedule_timeout_interruptible(1); 3452 3453 /* We were interrupted, exit */ 3454 if (time_left) 3455 break; 3456 3457 /* we've kicked the IO a few times, if anything has been freed, 3458 * exit. There is no sense in looping here for a long time 3459 * when we really need to commit the transaction, or there are 3460 * just too many writers without enough free space 3461 */ 3462 3463 if (loops > 3) { 3464 smp_mb(); 3465 if (progress != space_info->reservation_progress) 3466 break; 3467 } 3468 3469 } 3470 return reclaimed >= to_reclaim; 3471 } 3472 3473 /* 3474 * Retries tells us how many times we've called reserve_metadata_bytes. The 3475 * idea is if this is the first call (retries == 0) then we will add to our 3476 * reserved count if we can't make the allocation in order to hold our place 3477 * while we go and try and free up space. That way for retries > 1 we don't try 3478 * and add space, we just check to see if the amount of unused space is >= the 3479 * total space, meaning that our reservation is valid. 3480 * 3481 * However if we don't intend to retry this reservation, pass -1 as retries so 3482 * that it short circuits this logic. 
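 * (note: in the code below 'retries' is only a local counter, callers can no longer pass it in; whether we flush and retry at all is driven by the 'flush' argument instead.)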
3483 */ 3484 static int reserve_metadata_bytes(struct btrfs_trans_handle *trans, 3485 struct btrfs_root *root, 3486 struct btrfs_block_rsv *block_rsv, 3487 u64 orig_bytes, int flush) 3488 { 3489 struct btrfs_space_info *space_info = block_rsv->space_info; 3490 u64 unused; 3491 u64 num_bytes = orig_bytes; 3492 int retries = 0; 3493 int ret = 0; 3494 bool reserved = false; 3495 bool committed = false; 3496 3497 again: 3498 ret = -ENOSPC; 3499 if (reserved) 3500 num_bytes = 0; 3501 3502 spin_lock(&space_info->lock); 3503 unused = space_info->bytes_used + space_info->bytes_reserved + 3504 space_info->bytes_pinned + space_info->bytes_readonly + 3505 space_info->bytes_may_use; 3506 3507 /* 3508 * The idea here is that if we've not already over-reserved the block group 3509 * then we can go ahead and save our reservation first and then start 3510 * flushing if we need to. Otherwise if we've already overcommitted 3511 * let's start flushing stuff first and then come back and try to make 3512 * our reservation. 3513 */ 3514 if (unused <= space_info->total_bytes) { 3515 unused = space_info->total_bytes - unused; 3516 if (unused >= num_bytes) { 3517 if (!reserved) 3518 space_info->bytes_reserved += orig_bytes; 3519 ret = 0; 3520 } else { 3521 /* 3522 * Ok set num_bytes to orig_bytes since we aren't 3523 * overcommitted, this way we only try and reclaim what 3524 * we need. 3525 */ 3526 num_bytes = orig_bytes; 3527 } 3528 } else { 3529 /* 3530 * Ok we're over committed, set num_bytes to the overcommitted 3531 * amount plus the amount of bytes that we need for this 3532 * reservation. 3533 */ 3534 num_bytes = unused - space_info->total_bytes + 3535 (orig_bytes * (retries + 1)); 3536 } 3537 3538 /* 3539 * Couldn't make our reservation, save our place so while we're trying 3540 * to reclaim space we can actually use it instead of somebody else 3541 * stealing it from us. 3542 */ 3543 if (ret && !reserved) { 3544 space_info->bytes_reserved += orig_bytes; 3545 reserved = true; 3546 } 3547 3548 spin_unlock(&space_info->lock); 3549 3550 if (!ret) 3551 return 0; 3552 3553 if (!flush) 3554 goto out; 3555 3556 /* 3557 * We do synchronous shrinking since we don't actually unreserve 3558 * metadata until after the IO is completed. 3559 */ 3560 ret = shrink_delalloc(trans, root, num_bytes, 1); 3561 if (ret > 0) 3562 return 0; 3563 else if (ret < 0) 3564 goto out; 3565 3566 /* 3567 * So if we were overcommitted it's possible that somebody else flushed 3568 * out enough space and we simply didn't have enough space to reclaim, 3569 * so go back around and try again. 3570 */ 3571 if (retries < 2) { 3572 retries++; 3573 goto again; 3574 } 3575 3576 spin_lock(&space_info->lock); 3577 /* 3578 * Not enough space to be reclaimed, don't bother committing the 3579 * transaction.
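 * (committing is only worth it if enough pinned bytes would be returned to this space_info at commit time, hence the bytes_pinned check below.)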
3580 */ 3581 if (space_info->bytes_pinned < orig_bytes) 3582 ret = -ENOSPC; 3583 spin_unlock(&space_info->lock); 3584 if (ret) 3585 goto out; 3586 3587 ret = -EAGAIN; 3588 if (trans || committed) 3589 goto out; 3590 3591 ret = -ENOSPC; 3592 trans = btrfs_join_transaction(root, 1); 3593 if (IS_ERR(trans)) 3594 goto out; 3595 ret = btrfs_commit_transaction(trans, root); 3596 if (!ret) { 3597 trans = NULL; 3598 committed = true; 3599 goto again; 3600 } 3601 3602 out: 3603 if (reserved) { 3604 spin_lock(&space_info->lock); 3605 space_info->bytes_reserved -= orig_bytes; 3606 spin_unlock(&space_info->lock); 3607 } 3608 3609 return ret; 3610 } 3611 3612 static struct btrfs_block_rsv *get_block_rsv(struct btrfs_trans_handle *trans, 3613 struct btrfs_root *root) 3614 { 3615 struct btrfs_block_rsv *block_rsv; 3616 if (root->ref_cows) 3617 block_rsv = trans->block_rsv; 3618 else 3619 block_rsv = root->block_rsv; 3620 3621 if (!block_rsv) 3622 block_rsv = &root->fs_info->empty_block_rsv; 3623 3624 return block_rsv; 3625 } 3626 3627 static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv, 3628 u64 num_bytes) 3629 { 3630 int ret = -ENOSPC; 3631 spin_lock(&block_rsv->lock); 3632 if (block_rsv->reserved >= num_bytes) { 3633 block_rsv->reserved -= num_bytes; 3634 if (block_rsv->reserved < block_rsv->size) 3635 block_rsv->full = 0; 3636 ret = 0; 3637 } 3638 spin_unlock(&block_rsv->lock); 3639 return ret; 3640 } 3641 3642 static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv, 3643 u64 num_bytes, int update_size) 3644 { 3645 spin_lock(&block_rsv->lock); 3646 block_rsv->reserved += num_bytes; 3647 if (update_size) 3648 block_rsv->size += num_bytes; 3649 else if (block_rsv->reserved >= block_rsv->size) 3650 block_rsv->full = 1; 3651 spin_unlock(&block_rsv->lock); 3652 } 3653 3654 void block_rsv_release_bytes(struct btrfs_block_rsv *block_rsv, 3655 struct btrfs_block_rsv *dest, u64 num_bytes) 3656 { 3657 struct btrfs_space_info *space_info = block_rsv->space_info; 3658 3659 spin_lock(&block_rsv->lock); 3660 if (num_bytes == (u64)-1) 3661 num_bytes = block_rsv->size; 3662 block_rsv->size -= num_bytes; 3663 if (block_rsv->reserved >= block_rsv->size) { 3664 num_bytes = block_rsv->reserved - block_rsv->size; 3665 block_rsv->reserved = block_rsv->size; 3666 block_rsv->full = 1; 3667 } else { 3668 num_bytes = 0; 3669 } 3670 spin_unlock(&block_rsv->lock); 3671 3672 if (num_bytes > 0) { 3673 if (dest) { 3674 spin_lock(&dest->lock); 3675 if (!dest->full) { 3676 u64 bytes_to_add; 3677 3678 bytes_to_add = dest->size - dest->reserved; 3679 bytes_to_add = min(num_bytes, bytes_to_add); 3680 dest->reserved += bytes_to_add; 3681 if (dest->reserved >= dest->size) 3682 dest->full = 1; 3683 num_bytes -= bytes_to_add; 3684 } 3685 spin_unlock(&dest->lock); 3686 } 3687 if (num_bytes) { 3688 spin_lock(&space_info->lock); 3689 space_info->bytes_reserved -= num_bytes; 3690 space_info->reservation_progress++; 3691 spin_unlock(&space_info->lock); 3692 } 3693 } 3694 } 3695 3696 static int block_rsv_migrate_bytes(struct btrfs_block_rsv *src, 3697 struct btrfs_block_rsv *dst, u64 num_bytes) 3698 { 3699 int ret; 3700 3701 ret = block_rsv_use_bytes(src, num_bytes); 3702 if (ret) 3703 return ret; 3704 3705 block_rsv_add_bytes(dst, num_bytes, 1); 3706 return 0; 3707 } 3708 3709 void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv) 3710 { 3711 memset(rsv, 0, sizeof(*rsv)); 3712 spin_lock_init(&rsv->lock); 3713 atomic_set(&rsv->usage, 1); 3714 rsv->priority = 6; 3715 INIT_LIST_HEAD(&rsv->list); 3716 } 3717 3718 struct 
btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root) 3719 { 3720 struct btrfs_block_rsv *block_rsv; 3721 struct btrfs_fs_info *fs_info = root->fs_info; 3722 3723 block_rsv = kmalloc(sizeof(*block_rsv), GFP_NOFS); 3724 if (!block_rsv) 3725 return NULL; 3726 3727 btrfs_init_block_rsv(block_rsv); 3728 block_rsv->space_info = __find_space_info(fs_info, 3729 BTRFS_BLOCK_GROUP_METADATA); 3730 return block_rsv; 3731 } 3732 3733 void btrfs_free_block_rsv(struct btrfs_root *root, 3734 struct btrfs_block_rsv *rsv) 3735 { 3736 if (rsv && atomic_dec_and_test(&rsv->usage)) { 3737 btrfs_block_rsv_release(root, rsv, (u64)-1); 3738 if (!rsv->durable) 3739 kfree(rsv); 3740 } 3741 } 3742 3743 /* 3744 * make the block_rsv struct able to capture freed space. 3745 * the captured space will be re-added to the block_rsv struct 3746 * after transaction commit 3747 */ 3748 void btrfs_add_durable_block_rsv(struct btrfs_fs_info *fs_info, 3749 struct btrfs_block_rsv *block_rsv) 3750 { 3751 block_rsv->durable = 1; 3752 mutex_lock(&fs_info->durable_block_rsv_mutex); 3753 list_add_tail(&block_rsv->list, &fs_info->durable_block_rsv_list); 3754 mutex_unlock(&fs_info->durable_block_rsv_mutex); 3755 } 3756 3757 int btrfs_block_rsv_add(struct btrfs_trans_handle *trans, 3758 struct btrfs_root *root, 3759 struct btrfs_block_rsv *block_rsv, 3760 u64 num_bytes) 3761 { 3762 int ret; 3763 3764 if (num_bytes == 0) 3765 return 0; 3766 3767 ret = reserve_metadata_bytes(trans, root, block_rsv, num_bytes, 1); 3768 if (!ret) { 3769 block_rsv_add_bytes(block_rsv, num_bytes, 1); 3770 return 0; 3771 } 3772 3773 return ret; 3774 } 3775 3776 int btrfs_block_rsv_check(struct btrfs_trans_handle *trans, 3777 struct btrfs_root *root, 3778 struct btrfs_block_rsv *block_rsv, 3779 u64 min_reserved, int min_factor) 3780 { 3781 u64 num_bytes = 0; 3782 int commit_trans = 0; 3783 int ret = -ENOSPC; 3784 3785 if (!block_rsv) 3786 return 0; 3787 3788 spin_lock(&block_rsv->lock); 3789 if (min_factor > 0) 3790 num_bytes = div_factor(block_rsv->size, min_factor); 3791 if (min_reserved > num_bytes) 3792 num_bytes = min_reserved; 3793 3794 if (block_rsv->reserved >= num_bytes) { 3795 ret = 0; 3796 } else { 3797 num_bytes -= block_rsv->reserved; 3798 if (block_rsv->durable && 3799 block_rsv->freed[0] + block_rsv->freed[1] >= num_bytes) 3800 commit_trans = 1; 3801 } 3802 spin_unlock(&block_rsv->lock); 3803 if (!ret) 3804 return 0; 3805 3806 if (block_rsv->refill_used) { 3807 ret = reserve_metadata_bytes(trans, root, block_rsv, 3808 num_bytes, 0); 3809 if (!ret) { 3810 block_rsv_add_bytes(block_rsv, num_bytes, 0); 3811 return 0; 3812 } 3813 } 3814 3815 if (commit_trans) { 3816 if (trans) 3817 return -EAGAIN; 3818 3819 trans = btrfs_join_transaction(root, 1); 3820 BUG_ON(IS_ERR(trans)); 3821 ret = btrfs_commit_transaction(trans, root); 3822 return 0; 3823 } 3824 3825 return -ENOSPC; 3826 } 3827 3828 int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, 3829 struct btrfs_block_rsv *dst_rsv, 3830 u64 num_bytes) 3831 { 3832 return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes); 3833 } 3834 3835 void btrfs_block_rsv_release(struct btrfs_root *root, 3836 struct btrfs_block_rsv *block_rsv, 3837 u64 num_bytes) 3838 { 3839 struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv; 3840 if (global_rsv->full || global_rsv == block_rsv || 3841 block_rsv->space_info != global_rsv->space_info) 3842 global_rsv = NULL; 3843 block_rsv_release_bytes(block_rsv, global_rsv, num_bytes); 3844 } 3845 3846 /* 3847 * helper to calculate size of
global block reservation. 3848 * the desired value is sum of space used by extent tree, 3849 * checksum tree and root tree 3850 */ 3851 static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info) 3852 { 3853 struct btrfs_space_info *sinfo; 3854 u64 num_bytes; 3855 u64 meta_used; 3856 u64 data_used; 3857 int csum_size = btrfs_super_csum_size(&fs_info->super_copy); 3858 #if 0 3859 /* 3860 * per tree used space accounting can be inaccuracy, so we 3861 * can't rely on it. 3862 */ 3863 spin_lock(&fs_info->extent_root->accounting_lock); 3864 num_bytes = btrfs_root_used(&fs_info->extent_root->root_item); 3865 spin_unlock(&fs_info->extent_root->accounting_lock); 3866 3867 spin_lock(&fs_info->csum_root->accounting_lock); 3868 num_bytes += btrfs_root_used(&fs_info->csum_root->root_item); 3869 spin_unlock(&fs_info->csum_root->accounting_lock); 3870 3871 spin_lock(&fs_info->tree_root->accounting_lock); 3872 num_bytes += btrfs_root_used(&fs_info->tree_root->root_item); 3873 spin_unlock(&fs_info->tree_root->accounting_lock); 3874 #endif 3875 sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA); 3876 spin_lock(&sinfo->lock); 3877 data_used = sinfo->bytes_used; 3878 spin_unlock(&sinfo->lock); 3879 3880 sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); 3881 spin_lock(&sinfo->lock); 3882 if (sinfo->flags & BTRFS_BLOCK_GROUP_DATA) 3883 data_used = 0; 3884 meta_used = sinfo->bytes_used; 3885 spin_unlock(&sinfo->lock); 3886 3887 num_bytes = (data_used >> fs_info->sb->s_blocksize_bits) * 3888 csum_size * 2; 3889 num_bytes += div64_u64(data_used + meta_used, 50); 3890 3891 if (num_bytes * 3 > meta_used) 3892 num_bytes = div64_u64(meta_used, 3); 3893 3894 return ALIGN(num_bytes, fs_info->extent_root->leafsize << 10); 3895 } 3896 3897 static void update_global_block_rsv(struct btrfs_fs_info *fs_info) 3898 { 3899 struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv; 3900 struct btrfs_space_info *sinfo = block_rsv->space_info; 3901 u64 num_bytes; 3902 3903 num_bytes = calc_global_metadata_size(fs_info); 3904 3905 spin_lock(&block_rsv->lock); 3906 spin_lock(&sinfo->lock); 3907 3908 block_rsv->size = num_bytes; 3909 3910 num_bytes = sinfo->bytes_used + sinfo->bytes_pinned + 3911 sinfo->bytes_reserved + sinfo->bytes_readonly + 3912 sinfo->bytes_may_use; 3913 3914 if (sinfo->total_bytes > num_bytes) { 3915 num_bytes = sinfo->total_bytes - num_bytes; 3916 block_rsv->reserved += num_bytes; 3917 sinfo->bytes_reserved += num_bytes; 3918 } 3919 3920 if (block_rsv->reserved >= block_rsv->size) { 3921 num_bytes = block_rsv->reserved - block_rsv->size; 3922 sinfo->bytes_reserved -= num_bytes; 3923 sinfo->reservation_progress++; 3924 block_rsv->reserved = block_rsv->size; 3925 block_rsv->full = 1; 3926 } 3927 #if 0 3928 printk(KERN_INFO"global block rsv size %llu reserved %llu\n", 3929 block_rsv->size, block_rsv->reserved); 3930 #endif 3931 spin_unlock(&sinfo->lock); 3932 spin_unlock(&block_rsv->lock); 3933 } 3934 3935 static void init_global_block_rsv(struct btrfs_fs_info *fs_info) 3936 { 3937 struct btrfs_space_info *space_info; 3938 3939 space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM); 3940 fs_info->chunk_block_rsv.space_info = space_info; 3941 fs_info->chunk_block_rsv.priority = 10; 3942 3943 space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); 3944 fs_info->global_block_rsv.space_info = space_info; 3945 fs_info->global_block_rsv.priority = 10; 3946 fs_info->global_block_rsv.refill_used = 1; 3947 fs_info->delalloc_block_rsv.space_info = space_info; 3948 
fs_info->trans_block_rsv.space_info = space_info; 3949 fs_info->empty_block_rsv.space_info = space_info; 3950 fs_info->empty_block_rsv.priority = 10; 3951 3952 fs_info->extent_root->block_rsv = &fs_info->global_block_rsv; 3953 fs_info->csum_root->block_rsv = &fs_info->global_block_rsv; 3954 fs_info->dev_root->block_rsv = &fs_info->global_block_rsv; 3955 fs_info->tree_root->block_rsv = &fs_info->global_block_rsv; 3956 fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv; 3957 3958 btrfs_add_durable_block_rsv(fs_info, &fs_info->global_block_rsv); 3959 3960 btrfs_add_durable_block_rsv(fs_info, &fs_info->delalloc_block_rsv); 3961 3962 update_global_block_rsv(fs_info); 3963 } 3964 3965 static void release_global_block_rsv(struct btrfs_fs_info *fs_info) 3966 { 3967 block_rsv_release_bytes(&fs_info->global_block_rsv, NULL, (u64)-1); 3968 WARN_ON(fs_info->delalloc_block_rsv.size > 0); 3969 WARN_ON(fs_info->delalloc_block_rsv.reserved > 0); 3970 WARN_ON(fs_info->trans_block_rsv.size > 0); 3971 WARN_ON(fs_info->trans_block_rsv.reserved > 0); 3972 WARN_ON(fs_info->chunk_block_rsv.size > 0); 3973 WARN_ON(fs_info->chunk_block_rsv.reserved > 0); 3974 } 3975 3976 static u64 calc_trans_metadata_size(struct btrfs_root *root, int num_items) 3977 { 3978 return (root->leafsize + root->nodesize * (BTRFS_MAX_LEVEL - 1)) * 3979 3 * num_items; 3980 } 3981 3982 int btrfs_trans_reserve_metadata(struct btrfs_trans_handle *trans, 3983 struct btrfs_root *root, 3984 int num_items) 3985 { 3986 u64 num_bytes; 3987 int ret; 3988 3989 if (num_items == 0 || root->fs_info->chunk_root == root) 3990 return 0; 3991 3992 num_bytes = calc_trans_metadata_size(root, num_items); 3993 ret = btrfs_block_rsv_add(trans, root, &root->fs_info->trans_block_rsv, 3994 num_bytes); 3995 if (!ret) { 3996 trans->bytes_reserved += num_bytes; 3997 trans->block_rsv = &root->fs_info->trans_block_rsv; 3998 } 3999 return ret; 4000 } 4001 4002 void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, 4003 struct btrfs_root *root) 4004 { 4005 if (!trans->bytes_reserved) 4006 return; 4007 4008 BUG_ON(trans->block_rsv != &root->fs_info->trans_block_rsv); 4009 btrfs_block_rsv_release(root, trans->block_rsv, 4010 trans->bytes_reserved); 4011 trans->bytes_reserved = 0; 4012 } 4013 4014 int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans, 4015 struct inode *inode) 4016 { 4017 struct btrfs_root *root = BTRFS_I(inode)->root; 4018 struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root); 4019 struct btrfs_block_rsv *dst_rsv = root->orphan_block_rsv; 4020 4021 /* 4022 * one for deleting orphan item, one for updating inode and 4023 * two for calling btrfs_truncate_inode_items. 4024 * 4025 * btrfs_truncate_inode_items is a delete operation, it frees 4026 * more space than it uses in most cases. So two units of 4027 * metadata space should be enough for calling it many times. 4028 * If all of the metadata space is used, we can commit 4029 * transaction and use space it freed. 
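	 * As a rough worked example (assuming the default 4K leaf/node size
	 * and BTRFS_MAX_LEVEL == 8), the calc_trans_metadata_size(root, 4)
	 * call below reserves (4K + 7 * 4K) * 3 * 4 = 384K of metadata
	 * space for the orphan cleanup.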
4030 */ 4031 u64 num_bytes = calc_trans_metadata_size(root, 4); 4032 return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes); 4033 } 4034 4035 void btrfs_orphan_release_metadata(struct inode *inode) 4036 { 4037 struct btrfs_root *root = BTRFS_I(inode)->root; 4038 u64 num_bytes = calc_trans_metadata_size(root, 4); 4039 btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes); 4040 } 4041 4042 int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans, 4043 struct btrfs_pending_snapshot *pending) 4044 { 4045 struct btrfs_root *root = pending->root; 4046 struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root); 4047 struct btrfs_block_rsv *dst_rsv = &pending->block_rsv; 4048 /* 4049 * two for root back/forward refs, two for directory entries 4050 * and one for root of the snapshot. 4051 */ 4052 u64 num_bytes = calc_trans_metadata_size(root, 5); 4053 dst_rsv->space_info = src_rsv->space_info; 4054 return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes); 4055 } 4056 4057 static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes) 4058 { 4059 return num_bytes >>= 3; 4060 } 4061 4062 int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) 4063 { 4064 struct btrfs_root *root = BTRFS_I(inode)->root; 4065 struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv; 4066 u64 to_reserve; 4067 int nr_extents; 4068 int reserved_extents; 4069 int ret; 4070 4071 if (btrfs_transaction_in_commit(root->fs_info)) 4072 schedule_timeout(1); 4073 4074 num_bytes = ALIGN(num_bytes, root->sectorsize); 4075 4076 nr_extents = atomic_read(&BTRFS_I(inode)->outstanding_extents) + 1; 4077 reserved_extents = atomic_read(&BTRFS_I(inode)->reserved_extents); 4078 4079 if (nr_extents > reserved_extents) { 4080 nr_extents -= reserved_extents; 4081 to_reserve = calc_trans_metadata_size(root, nr_extents); 4082 } else { 4083 nr_extents = 0; 4084 to_reserve = 0; 4085 } 4086 4087 to_reserve += calc_csum_metadata_size(inode, num_bytes); 4088 ret = reserve_metadata_bytes(NULL, root, block_rsv, to_reserve, 1); 4089 if (ret) 4090 return ret; 4091 4092 atomic_add(nr_extents, &BTRFS_I(inode)->reserved_extents); 4093 atomic_inc(&BTRFS_I(inode)->outstanding_extents); 4094 4095 block_rsv_add_bytes(block_rsv, to_reserve, 1); 4096 4097 if (block_rsv->size > 512 * 1024 * 1024) 4098 shrink_delalloc(NULL, root, to_reserve, 0); 4099 4100 return 0; 4101 } 4102 4103 void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes) 4104 { 4105 struct btrfs_root *root = BTRFS_I(inode)->root; 4106 u64 to_free; 4107 int nr_extents; 4108 int reserved_extents; 4109 4110 num_bytes = ALIGN(num_bytes, root->sectorsize); 4111 atomic_dec(&BTRFS_I(inode)->outstanding_extents); 4112 WARN_ON(atomic_read(&BTRFS_I(inode)->outstanding_extents) < 0); 4113 4114 reserved_extents = atomic_read(&BTRFS_I(inode)->reserved_extents); 4115 do { 4116 int old, new; 4117 4118 nr_extents = atomic_read(&BTRFS_I(inode)->outstanding_extents); 4119 if (nr_extents >= reserved_extents) { 4120 nr_extents = 0; 4121 break; 4122 } 4123 old = reserved_extents; 4124 nr_extents = reserved_extents - nr_extents; 4125 new = reserved_extents - nr_extents; 4126 old = atomic_cmpxchg(&BTRFS_I(inode)->reserved_extents, 4127 reserved_extents, new); 4128 if (likely(old == reserved_extents)) 4129 break; 4130 reserved_extents = old; 4131 } while (1); 4132 4133 to_free = calc_csum_metadata_size(inode, num_bytes); 4134 if (nr_extents > 0) 4135 to_free += calc_trans_metadata_size(root, nr_extents); 4136 4137 
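	/*
	 * hand the unused csum reservation, plus the reservation for any
	 * extents we no longer track, back to the delalloc block reservation
	 */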
btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv, 4138 to_free); 4139 } 4140 4141 int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes) 4142 { 4143 int ret; 4144 4145 ret = btrfs_check_data_free_space(inode, num_bytes); 4146 if (ret) 4147 return ret; 4148 4149 ret = btrfs_delalloc_reserve_metadata(inode, num_bytes); 4150 if (ret) { 4151 btrfs_free_reserved_data_space(inode, num_bytes); 4152 return ret; 4153 } 4154 4155 return 0; 4156 } 4157 4158 void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes) 4159 { 4160 btrfs_delalloc_release_metadata(inode, num_bytes); 4161 btrfs_free_reserved_data_space(inode, num_bytes); 4162 } 4163 4164 static int update_block_group(struct btrfs_trans_handle *trans, 4165 struct btrfs_root *root, 4166 u64 bytenr, u64 num_bytes, int alloc) 4167 { 4168 struct btrfs_block_group_cache *cache = NULL; 4169 struct btrfs_fs_info *info = root->fs_info; 4170 u64 total = num_bytes; 4171 u64 old_val; 4172 u64 byte_in_group; 4173 int factor; 4174 4175 /* block accounting for super block */ 4176 spin_lock(&info->delalloc_lock); 4177 old_val = btrfs_super_bytes_used(&info->super_copy); 4178 if (alloc) 4179 old_val += num_bytes; 4180 else 4181 old_val -= num_bytes; 4182 btrfs_set_super_bytes_used(&info->super_copy, old_val); 4183 spin_unlock(&info->delalloc_lock); 4184 4185 while (total) { 4186 cache = btrfs_lookup_block_group(info, bytenr); 4187 if (!cache) 4188 return -1; 4189 if (cache->flags & (BTRFS_BLOCK_GROUP_DUP | 4190 BTRFS_BLOCK_GROUP_RAID1 | 4191 BTRFS_BLOCK_GROUP_RAID10)) 4192 factor = 2; 4193 else 4194 factor = 1; 4195 /* 4196 * If this block group has free space cache written out, we 4197 * need to make sure to load it if we are removing space. This 4198 * is because we need the unpinning stage to actually add the 4199 * space back to the block group, otherwise we will leak space. 
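	 * The allocation path never needs this, since nothing has to be
	 * unpinned when space is being consumed.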
4200 */ 4201 if (!alloc && cache->cached == BTRFS_CACHE_NO) 4202 cache_block_group(cache, trans, NULL, 1); 4203 4204 byte_in_group = bytenr - cache->key.objectid; 4205 WARN_ON(byte_in_group > cache->key.offset); 4206 4207 spin_lock(&cache->space_info->lock); 4208 spin_lock(&cache->lock); 4209 4210 if (btrfs_super_cache_generation(&info->super_copy) != 0 && 4211 cache->disk_cache_state < BTRFS_DC_CLEAR) 4212 cache->disk_cache_state = BTRFS_DC_CLEAR; 4213 4214 cache->dirty = 1; 4215 old_val = btrfs_block_group_used(&cache->item); 4216 num_bytes = min(total, cache->key.offset - byte_in_group); 4217 if (alloc) { 4218 old_val += num_bytes; 4219 btrfs_set_block_group_used(&cache->item, old_val); 4220 cache->reserved -= num_bytes; 4221 cache->space_info->bytes_reserved -= num_bytes; 4222 cache->space_info->reservation_progress++; 4223 cache->space_info->bytes_used += num_bytes; 4224 cache->space_info->disk_used += num_bytes * factor; 4225 spin_unlock(&cache->lock); 4226 spin_unlock(&cache->space_info->lock); 4227 } else { 4228 old_val -= num_bytes; 4229 btrfs_set_block_group_used(&cache->item, old_val); 4230 cache->pinned += num_bytes; 4231 cache->space_info->bytes_pinned += num_bytes; 4232 cache->space_info->bytes_used -= num_bytes; 4233 cache->space_info->disk_used -= num_bytes * factor; 4234 spin_unlock(&cache->lock); 4235 spin_unlock(&cache->space_info->lock); 4236 4237 set_extent_dirty(info->pinned_extents, 4238 bytenr, bytenr + num_bytes - 1, 4239 GFP_NOFS | __GFP_NOFAIL); 4240 } 4241 btrfs_put_block_group(cache); 4242 total -= num_bytes; 4243 bytenr += num_bytes; 4244 } 4245 return 0; 4246 } 4247 4248 static u64 first_logical_byte(struct btrfs_root *root, u64 search_start) 4249 { 4250 struct btrfs_block_group_cache *cache; 4251 u64 bytenr; 4252 4253 cache = btrfs_lookup_first_block_group(root->fs_info, search_start); 4254 if (!cache) 4255 return 0; 4256 4257 bytenr = cache->key.objectid; 4258 btrfs_put_block_group(cache); 4259 4260 return bytenr; 4261 } 4262 4263 static int pin_down_extent(struct btrfs_root *root, 4264 struct btrfs_block_group_cache *cache, 4265 u64 bytenr, u64 num_bytes, int reserved) 4266 { 4267 spin_lock(&cache->space_info->lock); 4268 spin_lock(&cache->lock); 4269 cache->pinned += num_bytes; 4270 cache->space_info->bytes_pinned += num_bytes; 4271 if (reserved) { 4272 cache->reserved -= num_bytes; 4273 cache->space_info->bytes_reserved -= num_bytes; 4274 cache->space_info->reservation_progress++; 4275 } 4276 spin_unlock(&cache->lock); 4277 spin_unlock(&cache->space_info->lock); 4278 4279 set_extent_dirty(root->fs_info->pinned_extents, bytenr, 4280 bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL); 4281 return 0; 4282 } 4283 4284 /* 4285 * this function must be called within transaction 4286 */ 4287 int btrfs_pin_extent(struct btrfs_root *root, 4288 u64 bytenr, u64 num_bytes, int reserved) 4289 { 4290 struct btrfs_block_group_cache *cache; 4291 4292 cache = btrfs_lookup_block_group(root->fs_info, bytenr); 4293 BUG_ON(!cache); 4294 4295 pin_down_extent(root, cache, bytenr, num_bytes, reserved); 4296 4297 btrfs_put_block_group(cache); 4298 return 0; 4299 } 4300 4301 /* 4302 * update size of reserved extents. this function may return -EAGAIN 4303 * if 'reserve' is true or 'sinfo' is false. 
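 * In either case -EAGAIN means the block group has gone read-only, so the
 * reservation cannot be charged against it.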
4304 */ 4305 int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, 4306 u64 num_bytes, int reserve, int sinfo) 4307 { 4308 int ret = 0; 4309 if (sinfo) { 4310 struct btrfs_space_info *space_info = cache->space_info; 4311 spin_lock(&space_info->lock); 4312 spin_lock(&cache->lock); 4313 if (reserve) { 4314 if (cache->ro) { 4315 ret = -EAGAIN; 4316 } else { 4317 cache->reserved += num_bytes; 4318 space_info->bytes_reserved += num_bytes; 4319 } 4320 } else { 4321 if (cache->ro) 4322 space_info->bytes_readonly += num_bytes; 4323 cache->reserved -= num_bytes; 4324 space_info->bytes_reserved -= num_bytes; 4325 space_info->reservation_progress++; 4326 } 4327 spin_unlock(&cache->lock); 4328 spin_unlock(&space_info->lock); 4329 } else { 4330 spin_lock(&cache->lock); 4331 if (cache->ro) { 4332 ret = -EAGAIN; 4333 } else { 4334 if (reserve) 4335 cache->reserved += num_bytes; 4336 else 4337 cache->reserved -= num_bytes; 4338 } 4339 spin_unlock(&cache->lock); 4340 } 4341 return ret; 4342 } 4343 4344 int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, 4345 struct btrfs_root *root) 4346 { 4347 struct btrfs_fs_info *fs_info = root->fs_info; 4348 struct btrfs_caching_control *next; 4349 struct btrfs_caching_control *caching_ctl; 4350 struct btrfs_block_group_cache *cache; 4351 4352 down_write(&fs_info->extent_commit_sem); 4353 4354 list_for_each_entry_safe(caching_ctl, next, 4355 &fs_info->caching_block_groups, list) { 4356 cache = caching_ctl->block_group; 4357 if (block_group_cache_done(cache)) { 4358 cache->last_byte_to_unpin = (u64)-1; 4359 list_del_init(&caching_ctl->list); 4360 put_caching_control(caching_ctl); 4361 } else { 4362 cache->last_byte_to_unpin = caching_ctl->progress; 4363 } 4364 } 4365 4366 if (fs_info->pinned_extents == &fs_info->freed_extents[0]) 4367 fs_info->pinned_extents = &fs_info->freed_extents[1]; 4368 else 4369 fs_info->pinned_extents = &fs_info->freed_extents[0]; 4370 4371 up_write(&fs_info->extent_commit_sem); 4372 4373 update_global_block_rsv(fs_info); 4374 return 0; 4375 } 4376 4377 static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end) 4378 { 4379 struct btrfs_fs_info *fs_info = root->fs_info; 4380 struct btrfs_block_group_cache *cache = NULL; 4381 u64 len; 4382 4383 while (start <= end) { 4384 if (!cache || 4385 start >= cache->key.objectid + cache->key.offset) { 4386 if (cache) 4387 btrfs_put_block_group(cache); 4388 cache = btrfs_lookup_block_group(fs_info, start); 4389 BUG_ON(!cache); 4390 } 4391 4392 len = cache->key.objectid + cache->key.offset - start; 4393 len = min(len, end + 1 - start); 4394 4395 if (start < cache->last_byte_to_unpin) { 4396 len = min(len, cache->last_byte_to_unpin - start); 4397 btrfs_add_free_space(cache, start, len); 4398 } 4399 4400 start += len; 4401 4402 spin_lock(&cache->space_info->lock); 4403 spin_lock(&cache->lock); 4404 cache->pinned -= len; 4405 cache->space_info->bytes_pinned -= len; 4406 if (cache->ro) { 4407 cache->space_info->bytes_readonly += len; 4408 } else if (cache->reserved_pinned > 0) { 4409 len = min(len, cache->reserved_pinned); 4410 cache->reserved_pinned -= len; 4411 cache->space_info->bytes_reserved += len; 4412 } 4413 spin_unlock(&cache->lock); 4414 spin_unlock(&cache->space_info->lock); 4415 } 4416 4417 if (cache) 4418 btrfs_put_block_group(cache); 4419 return 0; 4420 } 4421 4422 int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, 4423 struct btrfs_root *root) 4424 { 4425 struct btrfs_fs_info *fs_info = root->fs_info; 4426 struct extent_io_tree *unpin; 
4427 struct btrfs_block_rsv *block_rsv; 4428 struct btrfs_block_rsv *next_rsv; 4429 u64 start; 4430 u64 end; 4431 int idx; 4432 int ret; 4433 4434 if (fs_info->pinned_extents == &fs_info->freed_extents[0]) 4435 unpin = &fs_info->freed_extents[1]; 4436 else 4437 unpin = &fs_info->freed_extents[0]; 4438 4439 while (1) { 4440 ret = find_first_extent_bit(unpin, 0, &start, &end, 4441 EXTENT_DIRTY); 4442 if (ret) 4443 break; 4444 4445 if (btrfs_test_opt(root, DISCARD)) 4446 ret = btrfs_discard_extent(root, start, 4447 end + 1 - start, NULL); 4448 4449 clear_extent_dirty(unpin, start, end, GFP_NOFS); 4450 unpin_extent_range(root, start, end); 4451 cond_resched(); 4452 } 4453 4454 mutex_lock(&fs_info->durable_block_rsv_mutex); 4455 list_for_each_entry_safe(block_rsv, next_rsv, 4456 &fs_info->durable_block_rsv_list, list) { 4457 4458 idx = trans->transid & 0x1; 4459 if (block_rsv->freed[idx] > 0) { 4460 block_rsv_add_bytes(block_rsv, 4461 block_rsv->freed[idx], 0); 4462 block_rsv->freed[idx] = 0; 4463 } 4464 if (atomic_read(&block_rsv->usage) == 0) { 4465 btrfs_block_rsv_release(root, block_rsv, (u64)-1); 4466 4467 if (block_rsv->freed[0] == 0 && 4468 block_rsv->freed[1] == 0) { 4469 list_del_init(&block_rsv->list); 4470 kfree(block_rsv); 4471 } 4472 } else { 4473 btrfs_block_rsv_release(root, block_rsv, 0); 4474 } 4475 } 4476 mutex_unlock(&fs_info->durable_block_rsv_mutex); 4477 4478 return 0; 4479 } 4480 4481 static int __btrfs_free_extent(struct btrfs_trans_handle *trans, 4482 struct btrfs_root *root, 4483 u64 bytenr, u64 num_bytes, u64 parent, 4484 u64 root_objectid, u64 owner_objectid, 4485 u64 owner_offset, int refs_to_drop, 4486 struct btrfs_delayed_extent_op *extent_op) 4487 { 4488 struct btrfs_key key; 4489 struct btrfs_path *path; 4490 struct btrfs_fs_info *info = root->fs_info; 4491 struct btrfs_root *extent_root = info->extent_root; 4492 struct extent_buffer *leaf; 4493 struct btrfs_extent_item *ei; 4494 struct btrfs_extent_inline_ref *iref; 4495 int ret; 4496 int is_data; 4497 int extent_slot = 0; 4498 int found_extent = 0; 4499 int num_to_del = 1; 4500 u32 item_size; 4501 u64 refs; 4502 4503 path = btrfs_alloc_path(); 4504 if (!path) 4505 return -ENOMEM; 4506 4507 path->reada = 1; 4508 path->leave_spinning = 1; 4509 4510 is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID; 4511 BUG_ON(!is_data && refs_to_drop != 1); 4512 4513 ret = lookup_extent_backref(trans, extent_root, path, &iref, 4514 bytenr, num_bytes, parent, 4515 root_objectid, owner_objectid, 4516 owner_offset); 4517 if (ret == 0) { 4518 extent_slot = path->slots[0]; 4519 while (extent_slot >= 0) { 4520 btrfs_item_key_to_cpu(path->nodes[0], &key, 4521 extent_slot); 4522 if (key.objectid != bytenr) 4523 break; 4524 if (key.type == BTRFS_EXTENT_ITEM_KEY && 4525 key.offset == num_bytes) { 4526 found_extent = 1; 4527 break; 4528 } 4529 if (path->slots[0] - extent_slot > 5) 4530 break; 4531 extent_slot--; 4532 } 4533 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 4534 item_size = btrfs_item_size_nr(path->nodes[0], extent_slot); 4535 if (found_extent && item_size < sizeof(*ei)) 4536 found_extent = 0; 4537 #endif 4538 if (!found_extent) { 4539 BUG_ON(iref); 4540 ret = remove_extent_backref(trans, extent_root, path, 4541 NULL, refs_to_drop, 4542 is_data); 4543 BUG_ON(ret); 4544 btrfs_release_path(extent_root, path); 4545 path->leave_spinning = 1; 4546 4547 key.objectid = bytenr; 4548 key.type = BTRFS_EXTENT_ITEM_KEY; 4549 key.offset = num_bytes; 4550 4551 ret = btrfs_search_slot(trans, extent_root, 4552 &key, path, -1, 1); 4553 if (ret) { 
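				/*
				 * we just removed the last backref for this
				 * extent, so the extent item itself should
				 * still be here; a failed search points at an
				 * inconsistent extent tree
				 */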
4554 printk(KERN_ERR "umm, got %d back from search" 4555 ", was looking for %llu\n", ret, 4556 (unsigned long long)bytenr); 4557 btrfs_print_leaf(extent_root, path->nodes[0]); 4558 } 4559 BUG_ON(ret); 4560 extent_slot = path->slots[0]; 4561 } 4562 } else { 4563 btrfs_print_leaf(extent_root, path->nodes[0]); 4564 WARN_ON(1); 4565 printk(KERN_ERR "btrfs unable to find ref byte nr %llu " 4566 "parent %llu root %llu owner %llu offset %llu\n", 4567 (unsigned long long)bytenr, 4568 (unsigned long long)parent, 4569 (unsigned long long)root_objectid, 4570 (unsigned long long)owner_objectid, 4571 (unsigned long long)owner_offset); 4572 } 4573 4574 leaf = path->nodes[0]; 4575 item_size = btrfs_item_size_nr(leaf, extent_slot); 4576 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 4577 if (item_size < sizeof(*ei)) { 4578 BUG_ON(found_extent || extent_slot != path->slots[0]); 4579 ret = convert_extent_item_v0(trans, extent_root, path, 4580 owner_objectid, 0); 4581 BUG_ON(ret < 0); 4582 4583 btrfs_release_path(extent_root, path); 4584 path->leave_spinning = 1; 4585 4586 key.objectid = bytenr; 4587 key.type = BTRFS_EXTENT_ITEM_KEY; 4588 key.offset = num_bytes; 4589 4590 ret = btrfs_search_slot(trans, extent_root, &key, path, 4591 -1, 1); 4592 if (ret) { 4593 printk(KERN_ERR "umm, got %d back from search" 4594 ", was looking for %llu\n", ret, 4595 (unsigned long long)bytenr); 4596 btrfs_print_leaf(extent_root, path->nodes[0]); 4597 } 4598 BUG_ON(ret); 4599 extent_slot = path->slots[0]; 4600 leaf = path->nodes[0]; 4601 item_size = btrfs_item_size_nr(leaf, extent_slot); 4602 } 4603 #endif 4604 BUG_ON(item_size < sizeof(*ei)); 4605 ei = btrfs_item_ptr(leaf, extent_slot, 4606 struct btrfs_extent_item); 4607 if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID) { 4608 struct btrfs_tree_block_info *bi; 4609 BUG_ON(item_size < sizeof(*ei) + sizeof(*bi)); 4610 bi = (struct btrfs_tree_block_info *)(ei + 1); 4611 WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi)); 4612 } 4613 4614 refs = btrfs_extent_refs(leaf, ei); 4615 BUG_ON(refs < refs_to_drop); 4616 refs -= refs_to_drop; 4617 4618 if (refs > 0) { 4619 if (extent_op) 4620 __run_delayed_extent_op(extent_op, leaf, ei); 4621 /* 4622 * In the case of inline back ref, reference count will 4623 * be updated by remove_extent_backref 4624 */ 4625 if (iref) { 4626 BUG_ON(!found_extent); 4627 } else { 4628 btrfs_set_extent_refs(leaf, ei, refs); 4629 btrfs_mark_buffer_dirty(leaf); 4630 } 4631 if (found_extent) { 4632 ret = remove_extent_backref(trans, extent_root, path, 4633 iref, refs_to_drop, 4634 is_data); 4635 BUG_ON(ret); 4636 } 4637 } else { 4638 if (found_extent) { 4639 BUG_ON(is_data && refs_to_drop != 4640 extent_data_ref_count(root, path, iref)); 4641 if (iref) { 4642 BUG_ON(path->slots[0] != extent_slot); 4643 } else { 4644 BUG_ON(path->slots[0] != extent_slot + 1); 4645 path->slots[0] = extent_slot; 4646 num_to_del = 2; 4647 } 4648 } 4649 4650 ret = btrfs_del_items(trans, extent_root, path, path->slots[0], 4651 num_to_del); 4652 BUG_ON(ret); 4653 btrfs_release_path(extent_root, path); 4654 4655 if (is_data) { 4656 ret = btrfs_del_csums(trans, root, bytenr, num_bytes); 4657 BUG_ON(ret); 4658 } else { 4659 invalidate_mapping_pages(info->btree_inode->i_mapping, 4660 bytenr >> PAGE_CACHE_SHIFT, 4661 (bytenr + num_bytes - 1) >> PAGE_CACHE_SHIFT); 4662 } 4663 4664 ret = update_block_group(trans, root, bytenr, num_bytes, 0); 4665 BUG_ON(ret); 4666 } 4667 btrfs_free_path(path); 4668 return ret; 4669 } 4670 4671 /* 4672 * when we free an block, it is possible (and likely) that we 
free the last 4673 * delayed ref for that extent as well. This searches the delayed ref tree for 4674 * a given extent, and if there are no other delayed refs to be processed, it 4675 * removes it from the tree. 4676 */ 4677 static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, 4678 struct btrfs_root *root, u64 bytenr) 4679 { 4680 struct btrfs_delayed_ref_head *head; 4681 struct btrfs_delayed_ref_root *delayed_refs; 4682 struct btrfs_delayed_ref_node *ref; 4683 struct rb_node *node; 4684 int ret = 0; 4685 4686 delayed_refs = &trans->transaction->delayed_refs; 4687 spin_lock(&delayed_refs->lock); 4688 head = btrfs_find_delayed_ref_head(trans, bytenr); 4689 if (!head) 4690 goto out; 4691 4692 node = rb_prev(&head->node.rb_node); 4693 if (!node) 4694 goto out; 4695 4696 ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); 4697 4698 /* there are still entries for this ref, we can't drop it */ 4699 if (ref->bytenr == bytenr) 4700 goto out; 4701 4702 if (head->extent_op) { 4703 if (!head->must_insert_reserved) 4704 goto out; 4705 kfree(head->extent_op); 4706 head->extent_op = NULL; 4707 } 4708 4709 /* 4710 * waiting for the lock here would deadlock. If someone else has it 4711 * locked they are already in the process of dropping it anyway 4712 */ 4713 if (!mutex_trylock(&head->mutex)) 4714 goto out; 4715 4716 /* 4717 * at this point we have a head with no other entries. Go 4718 * ahead and process it. 4719 */ 4720 head->node.in_tree = 0; 4721 rb_erase(&head->node.rb_node, &delayed_refs->root); 4722 4723 delayed_refs->num_entries--; 4724 4725 /* 4726 * we don't take a ref on the node because we're removing it from the 4727 * tree, so we just steal the ref the tree was holding. 4728 */ 4729 delayed_refs->num_heads--; 4730 if (list_empty(&head->cluster)) 4731 delayed_refs->num_heads_ready--; 4732 4733 list_del_init(&head->cluster); 4734 spin_unlock(&delayed_refs->lock); 4735 4736 BUG_ON(head->extent_op); 4737 if (head->must_insert_reserved) 4738 ret = 1; 4739 4740 mutex_unlock(&head->mutex); 4741 btrfs_put_delayed_ref(&head->node); 4742 return ret; 4743 out: 4744 spin_unlock(&delayed_refs->lock); 4745 return 0; 4746 } 4747 4748 void btrfs_free_tree_block(struct btrfs_trans_handle *trans, 4749 struct btrfs_root *root, 4750 struct extent_buffer *buf, 4751 u64 parent, int last_ref) 4752 { 4753 struct btrfs_block_rsv *block_rsv; 4754 struct btrfs_block_group_cache *cache = NULL; 4755 int ret; 4756 4757 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { 4758 ret = btrfs_add_delayed_tree_ref(trans, buf->start, buf->len, 4759 parent, root->root_key.objectid, 4760 btrfs_header_level(buf), 4761 BTRFS_DROP_DELAYED_REF, NULL); 4762 BUG_ON(ret); 4763 } 4764 4765 if (!last_ref) 4766 return; 4767 4768 block_rsv = get_block_rsv(trans, root); 4769 cache = btrfs_lookup_block_group(root->fs_info, buf->start); 4770 if (block_rsv->space_info != cache->space_info) 4771 goto out; 4772 4773 if (btrfs_header_generation(buf) == trans->transid) { 4774 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { 4775 ret = check_ref_cleanup(trans, root, buf->start); 4776 if (!ret) 4777 goto pin; 4778 } 4779 4780 if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { 4781 pin_down_extent(root, cache, buf->start, buf->len, 1); 4782 goto pin; 4783 } 4784 4785 WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)); 4786 4787 btrfs_add_free_space(cache, buf->start, buf->len); 4788 ret = btrfs_update_reserved_bytes(cache, buf->len, 0, 0); 4789 if (ret == -EAGAIN) { 4790 /* block group became 
read-only */ 4791 btrfs_update_reserved_bytes(cache, buf->len, 0, 1); 4792 goto out; 4793 } 4794 4795 ret = 1; 4796 spin_lock(&block_rsv->lock); 4797 if (block_rsv->reserved < block_rsv->size) { 4798 block_rsv->reserved += buf->len; 4799 ret = 0; 4800 } 4801 spin_unlock(&block_rsv->lock); 4802 4803 if (ret) { 4804 spin_lock(&cache->space_info->lock); 4805 cache->space_info->bytes_reserved -= buf->len; 4806 cache->space_info->reservation_progress++; 4807 spin_unlock(&cache->space_info->lock); 4808 } 4809 goto out; 4810 } 4811 pin: 4812 if (block_rsv->durable && !cache->ro) { 4813 ret = 0; 4814 spin_lock(&cache->lock); 4815 if (!cache->ro) { 4816 cache->reserved_pinned += buf->len; 4817 ret = 1; 4818 } 4819 spin_unlock(&cache->lock); 4820 4821 if (ret) { 4822 spin_lock(&block_rsv->lock); 4823 block_rsv->freed[trans->transid & 0x1] += buf->len; 4824 spin_unlock(&block_rsv->lock); 4825 } 4826 } 4827 out: 4828 /* 4829 * Deleting the buffer, clear the corrupt flag since it doesn't matter 4830 * anymore. 4831 */ 4832 clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags); 4833 btrfs_put_block_group(cache); 4834 } 4835 4836 int btrfs_free_extent(struct btrfs_trans_handle *trans, 4837 struct btrfs_root *root, 4838 u64 bytenr, u64 num_bytes, u64 parent, 4839 u64 root_objectid, u64 owner, u64 offset) 4840 { 4841 int ret; 4842 4843 /* 4844 * tree log blocks never actually go into the extent allocation 4845 * tree, just update pinning info and exit early. 4846 */ 4847 if (root_objectid == BTRFS_TREE_LOG_OBJECTID) { 4848 WARN_ON(owner >= BTRFS_FIRST_FREE_OBJECTID); 4849 /* unlocks the pinned mutex */ 4850 btrfs_pin_extent(root, bytenr, num_bytes, 1); 4851 ret = 0; 4852 } else if (owner < BTRFS_FIRST_FREE_OBJECTID) { 4853 ret = btrfs_add_delayed_tree_ref(trans, bytenr, num_bytes, 4854 parent, root_objectid, (int)owner, 4855 BTRFS_DROP_DELAYED_REF, NULL); 4856 BUG_ON(ret); 4857 } else { 4858 ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes, 4859 parent, root_objectid, owner, 4860 offset, BTRFS_DROP_DELAYED_REF, NULL); 4861 BUG_ON(ret); 4862 } 4863 return ret; 4864 } 4865 4866 static u64 stripe_align(struct btrfs_root *root, u64 val) 4867 { 4868 u64 mask = ((u64)root->stripesize - 1); 4869 u64 ret = (val + mask) & ~mask; 4870 return ret; 4871 } 4872 4873 /* 4874 * when we wait for progress in the block group caching, its because 4875 * our allocation attempt failed at least once. So, we must sleep 4876 * and let some progress happen before we try again. 4877 * 4878 * This function will sleep at least once waiting for new free space to 4879 * show up, and then it will check the block group free space numbers 4880 * for our min num_bytes. Another option is to have it go ahead 4881 * and look in the rbtree for a free extent of a given size, but this 4882 * is a good start. 
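 * The wait below is satisfied either when caching completes or once the
 * block group's free_space climbs past the requested num_bytes.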
4883 */ 4884 static noinline int 4885 wait_block_group_cache_progress(struct btrfs_block_group_cache *cache, 4886 u64 num_bytes) 4887 { 4888 struct btrfs_caching_control *caching_ctl; 4889 DEFINE_WAIT(wait); 4890 4891 caching_ctl = get_caching_control(cache); 4892 if (!caching_ctl) 4893 return 0; 4894 4895 wait_event(caching_ctl->wait, block_group_cache_done(cache) || 4896 (cache->free_space >= num_bytes)); 4897 4898 put_caching_control(caching_ctl); 4899 return 0; 4900 } 4901 4902 static noinline int 4903 wait_block_group_cache_done(struct btrfs_block_group_cache *cache) 4904 { 4905 struct btrfs_caching_control *caching_ctl; 4906 DEFINE_WAIT(wait); 4907 4908 caching_ctl = get_caching_control(cache); 4909 if (!caching_ctl) 4910 return 0; 4911 4912 wait_event(caching_ctl->wait, block_group_cache_done(cache)); 4913 4914 put_caching_control(caching_ctl); 4915 return 0; 4916 } 4917 4918 static int get_block_group_index(struct btrfs_block_group_cache *cache) 4919 { 4920 int index; 4921 if (cache->flags & BTRFS_BLOCK_GROUP_RAID10) 4922 index = 0; 4923 else if (cache->flags & BTRFS_BLOCK_GROUP_RAID1) 4924 index = 1; 4925 else if (cache->flags & BTRFS_BLOCK_GROUP_DUP) 4926 index = 2; 4927 else if (cache->flags & BTRFS_BLOCK_GROUP_RAID0) 4928 index = 3; 4929 else 4930 index = 4; 4931 return index; 4932 } 4933 4934 enum btrfs_loop_type { 4935 LOOP_FIND_IDEAL = 0, 4936 LOOP_CACHING_NOWAIT = 1, 4937 LOOP_CACHING_WAIT = 2, 4938 LOOP_ALLOC_CHUNK = 3, 4939 LOOP_NO_EMPTY_SIZE = 4, 4940 }; 4941 4942 /* 4943 * walks the btree of allocated extents and find a hole of a given size. 4944 * The key ins is changed to record the hole: 4945 * ins->objectid == block start 4946 * ins->flags = BTRFS_EXTENT_ITEM_KEY 4947 * ins->offset == number of blocks 4948 * Any available blocks before search_start are skipped. 4949 */ 4950 static noinline int find_free_extent(struct btrfs_trans_handle *trans, 4951 struct btrfs_root *orig_root, 4952 u64 num_bytes, u64 empty_size, 4953 u64 search_start, u64 search_end, 4954 u64 hint_byte, struct btrfs_key *ins, 4955 int data) 4956 { 4957 int ret = 0; 4958 struct btrfs_root *root = orig_root->fs_info->extent_root; 4959 struct btrfs_free_cluster *last_ptr = NULL; 4960 struct btrfs_block_group_cache *block_group = NULL; 4961 int empty_cluster = 2 * 1024 * 1024; 4962 int allowed_chunk_alloc = 0; 4963 int done_chunk_alloc = 0; 4964 struct btrfs_space_info *space_info; 4965 int last_ptr_loop = 0; 4966 int loop = 0; 4967 int index = 0; 4968 bool found_uncached_bg = false; 4969 bool failed_cluster_refill = false; 4970 bool failed_alloc = false; 4971 bool use_cluster = true; 4972 u64 ideal_cache_percent = 0; 4973 u64 ideal_cache_offset = 0; 4974 4975 WARN_ON(num_bytes < root->sectorsize); 4976 btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY); 4977 ins->objectid = 0; 4978 ins->offset = 0; 4979 4980 space_info = __find_space_info(root->fs_info, data); 4981 if (!space_info) { 4982 printk(KERN_ERR "No space info for %d\n", data); 4983 return -ENOSPC; 4984 } 4985 4986 /* 4987 * If the space info is for both data and metadata it means we have a 4988 * small filesystem and we can't use the clustering stuff. 
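	 * Mixed block groups carry both data and metadata, so the separate
	 * data/metadata allocation clusters chosen below don't apply.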
4989 */ 4990 if (btrfs_mixed_space_info(space_info)) 4991 use_cluster = false; 4992 4993 if (orig_root->ref_cows || empty_size) 4994 allowed_chunk_alloc = 1; 4995 4996 if (data & BTRFS_BLOCK_GROUP_METADATA && use_cluster) { 4997 last_ptr = &root->fs_info->meta_alloc_cluster; 4998 if (!btrfs_test_opt(root, SSD)) 4999 empty_cluster = 64 * 1024; 5000 } 5001 5002 if ((data & BTRFS_BLOCK_GROUP_DATA) && use_cluster && 5003 btrfs_test_opt(root, SSD)) { 5004 last_ptr = &root->fs_info->data_alloc_cluster; 5005 } 5006 5007 if (last_ptr) { 5008 spin_lock(&last_ptr->lock); 5009 if (last_ptr->block_group) 5010 hint_byte = last_ptr->window_start; 5011 spin_unlock(&last_ptr->lock); 5012 } 5013 5014 search_start = max(search_start, first_logical_byte(root, 0)); 5015 search_start = max(search_start, hint_byte); 5016 5017 if (!last_ptr) 5018 empty_cluster = 0; 5019 5020 if (search_start == hint_byte) { 5021 ideal_cache: 5022 block_group = btrfs_lookup_block_group(root->fs_info, 5023 search_start); 5024 /* 5025 * we don't want to use the block group if it doesn't match our 5026 * allocation bits, or if its not cached. 5027 * 5028 * However if we are re-searching with an ideal block group 5029 * picked out then we don't care that the block group is cached. 5030 */ 5031 if (block_group && block_group_bits(block_group, data) && 5032 (block_group->cached != BTRFS_CACHE_NO || 5033 search_start == ideal_cache_offset)) { 5034 down_read(&space_info->groups_sem); 5035 if (list_empty(&block_group->list) || 5036 block_group->ro) { 5037 /* 5038 * someone is removing this block group, 5039 * we can't jump into the have_block_group 5040 * target because our list pointers are not 5041 * valid 5042 */ 5043 btrfs_put_block_group(block_group); 5044 up_read(&space_info->groups_sem); 5045 } else { 5046 index = get_block_group_index(block_group); 5047 goto have_block_group; 5048 } 5049 } else if (block_group) { 5050 btrfs_put_block_group(block_group); 5051 } 5052 } 5053 search: 5054 down_read(&space_info->groups_sem); 5055 list_for_each_entry(block_group, &space_info->block_groups[index], 5056 list) { 5057 u64 offset; 5058 int cached; 5059 5060 btrfs_get_block_group(block_group); 5061 search_start = block_group->key.objectid; 5062 5063 /* 5064 * this can happen if we end up cycling through all the 5065 * raid types, but we want to make sure we only allocate 5066 * for the proper type. 5067 */ 5068 if (!block_group_bits(block_group, data)) { 5069 u64 extra = BTRFS_BLOCK_GROUP_DUP | 5070 BTRFS_BLOCK_GROUP_RAID1 | 5071 BTRFS_BLOCK_GROUP_RAID10; 5072 5073 /* 5074 * if they asked for extra copies and this block group 5075 * doesn't provide them, bail. This does allow us to 5076 * fill raid0 from raid1. 
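			 * In other words, a request without any redundancy
			 * bits may land in a redundant block group, but never
			 * the other way around.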
5077 */ 5078 if ((data & extra) && !(block_group->flags & extra)) 5079 goto loop; 5080 } 5081 5082 have_block_group: 5083 if (unlikely(block_group->cached == BTRFS_CACHE_NO)) { 5084 u64 free_percent; 5085 5086 ret = cache_block_group(block_group, trans, 5087 orig_root, 1); 5088 if (block_group->cached == BTRFS_CACHE_FINISHED) 5089 goto have_block_group; 5090 5091 free_percent = btrfs_block_group_used(&block_group->item); 5092 free_percent *= 100; 5093 free_percent = div64_u64(free_percent, 5094 block_group->key.offset); 5095 free_percent = 100 - free_percent; 5096 if (free_percent > ideal_cache_percent && 5097 likely(!block_group->ro)) { 5098 ideal_cache_offset = block_group->key.objectid; 5099 ideal_cache_percent = free_percent; 5100 } 5101 5102 /* 5103 * We only want to start kthread caching if we are at 5104 * the point where we will wait for caching to make 5105 * progress, or if our ideal search is over and we've 5106 * found somebody to start caching. 5107 */ 5108 if (loop > LOOP_CACHING_NOWAIT || 5109 (loop > LOOP_FIND_IDEAL && 5110 atomic_read(&space_info->caching_threads) < 2)) { 5111 ret = cache_block_group(block_group, trans, 5112 orig_root, 0); 5113 BUG_ON(ret); 5114 } 5115 found_uncached_bg = true; 5116 5117 /* 5118 * If loop is set for cached only, try the next block 5119 * group. 5120 */ 5121 if (loop == LOOP_FIND_IDEAL) 5122 goto loop; 5123 } 5124 5125 cached = block_group_cache_done(block_group); 5126 if (unlikely(!cached)) 5127 found_uncached_bg = true; 5128 5129 if (unlikely(block_group->ro)) 5130 goto loop; 5131 5132 /* 5133 * Ok we want to try and use the cluster allocator, so lets look 5134 * there, unless we are on LOOP_NO_EMPTY_SIZE, since we will 5135 * have tried the cluster allocator plenty of times at this 5136 * point and not have found anything, so we are likely way too 5137 * fragmented for the clustering stuff to find anything, so lets 5138 * just skip it and let the allocator find whatever block it can 5139 * find 5140 */ 5141 if (last_ptr && loop < LOOP_NO_EMPTY_SIZE) { 5142 /* 5143 * the refill lock keeps out other 5144 * people trying to start a new cluster 5145 */ 5146 spin_lock(&last_ptr->refill_lock); 5147 if (last_ptr->block_group && 5148 (last_ptr->block_group->ro || 5149 !block_group_bits(last_ptr->block_group, data))) { 5150 offset = 0; 5151 goto refill_cluster; 5152 } 5153 5154 offset = btrfs_alloc_from_cluster(block_group, last_ptr, 5155 num_bytes, search_start); 5156 if (offset) { 5157 /* we have a block, we're done */ 5158 spin_unlock(&last_ptr->refill_lock); 5159 goto checks; 5160 } 5161 5162 spin_lock(&last_ptr->lock); 5163 /* 5164 * whoops, this cluster doesn't actually point to 5165 * this block group. 
Get a ref on the block 5166 * group is does point to and try again 5167 */ 5168 if (!last_ptr_loop && last_ptr->block_group && 5169 last_ptr->block_group != block_group) { 5170 5171 btrfs_put_block_group(block_group); 5172 block_group = last_ptr->block_group; 5173 btrfs_get_block_group(block_group); 5174 spin_unlock(&last_ptr->lock); 5175 spin_unlock(&last_ptr->refill_lock); 5176 5177 last_ptr_loop = 1; 5178 search_start = block_group->key.objectid; 5179 /* 5180 * we know this block group is properly 5181 * in the list because 5182 * btrfs_remove_block_group, drops the 5183 * cluster before it removes the block 5184 * group from the list 5185 */ 5186 goto have_block_group; 5187 } 5188 spin_unlock(&last_ptr->lock); 5189 refill_cluster: 5190 /* 5191 * this cluster didn't work out, free it and 5192 * start over 5193 */ 5194 btrfs_return_cluster_to_free_space(NULL, last_ptr); 5195 5196 last_ptr_loop = 0; 5197 5198 /* allocate a cluster in this block group */ 5199 ret = btrfs_find_space_cluster(trans, root, 5200 block_group, last_ptr, 5201 offset, num_bytes, 5202 empty_cluster + empty_size); 5203 if (ret == 0) { 5204 /* 5205 * now pull our allocation out of this 5206 * cluster 5207 */ 5208 offset = btrfs_alloc_from_cluster(block_group, 5209 last_ptr, num_bytes, 5210 search_start); 5211 if (offset) { 5212 /* we found one, proceed */ 5213 spin_unlock(&last_ptr->refill_lock); 5214 goto checks; 5215 } 5216 } else if (!cached && loop > LOOP_CACHING_NOWAIT 5217 && !failed_cluster_refill) { 5218 spin_unlock(&last_ptr->refill_lock); 5219 5220 failed_cluster_refill = true; 5221 wait_block_group_cache_progress(block_group, 5222 num_bytes + empty_cluster + empty_size); 5223 goto have_block_group; 5224 } 5225 5226 /* 5227 * at this point we either didn't find a cluster 5228 * or we weren't able to allocate a block from our 5229 * cluster. Free the cluster we've been trying 5230 * to use, and go to the next block group 5231 */ 5232 btrfs_return_cluster_to_free_space(NULL, last_ptr); 5233 spin_unlock(&last_ptr->refill_lock); 5234 goto loop; 5235 } 5236 5237 offset = btrfs_find_space_for_alloc(block_group, search_start, 5238 num_bytes, empty_size); 5239 /* 5240 * If we didn't find a chunk, and we haven't failed on this 5241 * block group before, and this block group is in the middle of 5242 * caching and we are ok with waiting, then go ahead and wait 5243 * for progress to be made, and set failed_alloc to true. 5244 * 5245 * If failed_alloc is true then we've already waited on this 5246 * block group once and should move on to the next block group. 
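		 * failed_cluster_refill plays the same role for the cluster
		 * path above.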
5247 */ 5248 if (!offset && !failed_alloc && !cached && 5249 loop > LOOP_CACHING_NOWAIT) { 5250 wait_block_group_cache_progress(block_group, 5251 num_bytes + empty_size); 5252 failed_alloc = true; 5253 goto have_block_group; 5254 } else if (!offset) { 5255 goto loop; 5256 } 5257 checks: 5258 search_start = stripe_align(root, offset); 5259 /* move on to the next group */ 5260 if (search_start + num_bytes >= search_end) { 5261 btrfs_add_free_space(block_group, offset, num_bytes); 5262 goto loop; 5263 } 5264 5265 /* move on to the next group */ 5266 if (search_start + num_bytes > 5267 block_group->key.objectid + block_group->key.offset) { 5268 btrfs_add_free_space(block_group, offset, num_bytes); 5269 goto loop; 5270 } 5271 5272 ins->objectid = search_start; 5273 ins->offset = num_bytes; 5274 5275 if (offset < search_start) 5276 btrfs_add_free_space(block_group, offset, 5277 search_start - offset); 5278 BUG_ON(offset > search_start); 5279 5280 ret = btrfs_update_reserved_bytes(block_group, num_bytes, 1, 5281 (data & BTRFS_BLOCK_GROUP_DATA)); 5282 if (ret == -EAGAIN) { 5283 btrfs_add_free_space(block_group, offset, num_bytes); 5284 goto loop; 5285 } 5286 5287 /* we are all good, lets return */ 5288 ins->objectid = search_start; 5289 ins->offset = num_bytes; 5290 5291 if (offset < search_start) 5292 btrfs_add_free_space(block_group, offset, 5293 search_start - offset); 5294 BUG_ON(offset > search_start); 5295 break; 5296 loop: 5297 failed_cluster_refill = false; 5298 failed_alloc = false; 5299 BUG_ON(index != get_block_group_index(block_group)); 5300 btrfs_put_block_group(block_group); 5301 } 5302 up_read(&space_info->groups_sem); 5303 5304 if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES) 5305 goto search; 5306 5307 /* LOOP_FIND_IDEAL, only search caching/cached bg's, and don't wait for 5308 * for them to make caching progress. Also 5309 * determine the best possible bg to cache 5310 * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking 5311 * caching kthreads as we move along 5312 * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching 5313 * LOOP_ALLOC_CHUNK, force a chunk allocation and try again 5314 * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try 5315 * again 5316 */ 5317 if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE && 5318 (found_uncached_bg || empty_size || empty_cluster || 5319 allowed_chunk_alloc)) { 5320 index = 0; 5321 if (loop == LOOP_FIND_IDEAL && found_uncached_bg) { 5322 found_uncached_bg = false; 5323 loop++; 5324 if (!ideal_cache_percent && 5325 atomic_read(&space_info->caching_threads)) 5326 goto search; 5327 5328 /* 5329 * 1 of the following 2 things have happened so far 5330 * 5331 * 1) We found an ideal block group for caching that 5332 * is mostly full and will cache quickly, so we might 5333 * as well wait for it. 5334 * 5335 * 2) We searched for cached only and we didn't find 5336 * anything, and we didn't start any caching kthreads 5337 * either, so chances are we will loop through and 5338 * start a couple caching kthreads, and then come back 5339 * around and just wait for them. This will be slower 5340 * because we will have 2 caching kthreads reading at 5341 * the same time when we could have just started one 5342 * and waited for it to get far enough to give us an 5343 * allocation, so go ahead and go to the wait caching 5344 * loop. 
5345 */ 5346 loop = LOOP_CACHING_WAIT; 5347 search_start = ideal_cache_offset; 5348 ideal_cache_percent = 0; 5349 goto ideal_cache; 5350 } else if (loop == LOOP_FIND_IDEAL) { 5351 /* 5352 * Didn't find a uncached bg, wait on anything we find 5353 * next. 5354 */ 5355 loop = LOOP_CACHING_WAIT; 5356 goto search; 5357 } 5358 5359 if (loop < LOOP_CACHING_WAIT) { 5360 loop++; 5361 goto search; 5362 } 5363 5364 if (loop == LOOP_ALLOC_CHUNK) { 5365 empty_size = 0; 5366 empty_cluster = 0; 5367 } 5368 5369 if (allowed_chunk_alloc) { 5370 ret = do_chunk_alloc(trans, root, num_bytes + 5371 2 * 1024 * 1024, data, 5372 CHUNK_ALLOC_LIMITED); 5373 allowed_chunk_alloc = 0; 5374 done_chunk_alloc = 1; 5375 } else if (!done_chunk_alloc && 5376 space_info->force_alloc == CHUNK_ALLOC_NO_FORCE) { 5377 space_info->force_alloc = CHUNK_ALLOC_LIMITED; 5378 } 5379 5380 if (loop < LOOP_NO_EMPTY_SIZE) { 5381 loop++; 5382 goto search; 5383 } 5384 ret = -ENOSPC; 5385 } else if (!ins->objectid) { 5386 ret = -ENOSPC; 5387 } 5388 5389 /* we found what we needed */ 5390 if (ins->objectid) { 5391 if (!(data & BTRFS_BLOCK_GROUP_DATA)) 5392 trans->block_group = block_group->key.objectid; 5393 5394 btrfs_put_block_group(block_group); 5395 ret = 0; 5396 } 5397 5398 return ret; 5399 } 5400 5401 static void dump_space_info(struct btrfs_space_info *info, u64 bytes, 5402 int dump_block_groups) 5403 { 5404 struct btrfs_block_group_cache *cache; 5405 int index = 0; 5406 5407 spin_lock(&info->lock); 5408 printk(KERN_INFO "space_info has %llu free, is %sfull\n", 5409 (unsigned long long)(info->total_bytes - info->bytes_used - 5410 info->bytes_pinned - info->bytes_reserved - 5411 info->bytes_readonly), 5412 (info->full) ? "" : "not "); 5413 printk(KERN_INFO "space_info total=%llu, used=%llu, pinned=%llu, " 5414 "reserved=%llu, may_use=%llu, readonly=%llu\n", 5415 (unsigned long long)info->total_bytes, 5416 (unsigned long long)info->bytes_used, 5417 (unsigned long long)info->bytes_pinned, 5418 (unsigned long long)info->bytes_reserved, 5419 (unsigned long long)info->bytes_may_use, 5420 (unsigned long long)info->bytes_readonly); 5421 spin_unlock(&info->lock); 5422 5423 if (!dump_block_groups) 5424 return; 5425 5426 down_read(&info->groups_sem); 5427 again: 5428 list_for_each_entry(cache, &info->block_groups[index], list) { 5429 spin_lock(&cache->lock); 5430 printk(KERN_INFO "block group %llu has %llu bytes, %llu used " 5431 "%llu pinned %llu reserved\n", 5432 (unsigned long long)cache->key.objectid, 5433 (unsigned long long)cache->key.offset, 5434 (unsigned long long)btrfs_block_group_used(&cache->item), 5435 (unsigned long long)cache->pinned, 5436 (unsigned long long)cache->reserved); 5437 btrfs_dump_free_space(cache, bytes); 5438 spin_unlock(&cache->lock); 5439 } 5440 if (++index < BTRFS_NR_RAID_TYPES) 5441 goto again; 5442 up_read(&info->groups_sem); 5443 } 5444 5445 int btrfs_reserve_extent(struct btrfs_trans_handle *trans, 5446 struct btrfs_root *root, 5447 u64 num_bytes, u64 min_alloc_size, 5448 u64 empty_size, u64 hint_byte, 5449 u64 search_end, struct btrfs_key *ins, 5450 u64 data) 5451 { 5452 int ret; 5453 u64 search_start = 0; 5454 5455 data = btrfs_get_alloc_profile(root, data); 5456 again: 5457 /* 5458 * the only place that sets empty_size is btrfs_realloc_node, which 5459 * is not called recursively on allocations 5460 */ 5461 if (empty_size || root->ref_cows) 5462 ret = do_chunk_alloc(trans, root->fs_info->extent_root, 5463 num_bytes + 2 * 1024 * 1024, data, 5464 CHUNK_ALLOC_NO_FORCE); 5465 5466 WARN_ON(num_bytes < 
root->sectorsize); 5467 ret = find_free_extent(trans, root, num_bytes, empty_size, 5468 search_start, search_end, hint_byte, 5469 ins, data); 5470 5471 if (ret == -ENOSPC && num_bytes > min_alloc_size) { 5472 num_bytes = num_bytes >> 1; 5473 num_bytes = num_bytes & ~(root->sectorsize - 1); 5474 num_bytes = max(num_bytes, min_alloc_size); 5475 do_chunk_alloc(trans, root->fs_info->extent_root, 5476 num_bytes, data, CHUNK_ALLOC_FORCE); 5477 goto again; 5478 } 5479 if (ret == -ENOSPC && btrfs_test_opt(root, ENOSPC_DEBUG)) { 5480 struct btrfs_space_info *sinfo; 5481 5482 sinfo = __find_space_info(root->fs_info, data); 5483 printk(KERN_ERR "btrfs allocation failed flags %llu, " 5484 "wanted %llu\n", (unsigned long long)data, 5485 (unsigned long long)num_bytes); 5486 dump_space_info(sinfo, num_bytes, 1); 5487 } 5488 5489 trace_btrfs_reserved_extent_alloc(root, ins->objectid, ins->offset); 5490 5491 return ret; 5492 } 5493 5494 int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len) 5495 { 5496 struct btrfs_block_group_cache *cache; 5497 int ret = 0; 5498 5499 cache = btrfs_lookup_block_group(root->fs_info, start); 5500 if (!cache) { 5501 printk(KERN_ERR "Unable to find block group for %llu\n", 5502 (unsigned long long)start); 5503 return -ENOSPC; 5504 } 5505 5506 if (btrfs_test_opt(root, DISCARD)) 5507 ret = btrfs_discard_extent(root, start, len, NULL); 5508 5509 btrfs_add_free_space(cache, start, len); 5510 btrfs_update_reserved_bytes(cache, len, 0, 1); 5511 btrfs_put_block_group(cache); 5512 5513 trace_btrfs_reserved_extent_free(root, start, len); 5514 5515 return ret; 5516 } 5517 5518 static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, 5519 struct btrfs_root *root, 5520 u64 parent, u64 root_objectid, 5521 u64 flags, u64 owner, u64 offset, 5522 struct btrfs_key *ins, int ref_mod) 5523 { 5524 int ret; 5525 struct btrfs_fs_info *fs_info = root->fs_info; 5526 struct btrfs_extent_item *extent_item; 5527 struct btrfs_extent_inline_ref *iref; 5528 struct btrfs_path *path; 5529 struct extent_buffer *leaf; 5530 int type; 5531 u32 size; 5532 5533 if (parent > 0) 5534 type = BTRFS_SHARED_DATA_REF_KEY; 5535 else 5536 type = BTRFS_EXTENT_DATA_REF_KEY; 5537 5538 size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type); 5539 5540 path = btrfs_alloc_path(); 5541 if (!path) 5542 return -ENOMEM; 5543 5544 path->leave_spinning = 1; 5545 ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path, 5546 ins, size); 5547 BUG_ON(ret); 5548 5549 leaf = path->nodes[0]; 5550 extent_item = btrfs_item_ptr(leaf, path->slots[0], 5551 struct btrfs_extent_item); 5552 btrfs_set_extent_refs(leaf, extent_item, ref_mod); 5553 btrfs_set_extent_generation(leaf, extent_item, trans->transid); 5554 btrfs_set_extent_flags(leaf, extent_item, 5555 flags | BTRFS_EXTENT_FLAG_DATA); 5556 5557 iref = (struct btrfs_extent_inline_ref *)(extent_item + 1); 5558 btrfs_set_extent_inline_ref_type(leaf, iref, type); 5559 if (parent > 0) { 5560 struct btrfs_shared_data_ref *ref; 5561 ref = (struct btrfs_shared_data_ref *)(iref + 1); 5562 btrfs_set_extent_inline_ref_offset(leaf, iref, parent); 5563 btrfs_set_shared_data_ref_count(leaf, ref, ref_mod); 5564 } else { 5565 struct btrfs_extent_data_ref *ref; 5566 ref = (struct btrfs_extent_data_ref *)(&iref->offset); 5567 btrfs_set_extent_data_ref_root(leaf, ref, root_objectid); 5568 btrfs_set_extent_data_ref_objectid(leaf, ref, owner); 5569 btrfs_set_extent_data_ref_offset(leaf, ref, offset); 5570 btrfs_set_extent_data_ref_count(leaf, ref, ref_mod); 
5571 } 5572 5573 btrfs_mark_buffer_dirty(path->nodes[0]); 5574 btrfs_free_path(path); 5575 5576 ret = update_block_group(trans, root, ins->objectid, ins->offset, 1); 5577 if (ret) { 5578 printk(KERN_ERR "btrfs update block group failed for %llu " 5579 "%llu\n", (unsigned long long)ins->objectid, 5580 (unsigned long long)ins->offset); 5581 BUG(); 5582 } 5583 return ret; 5584 } 5585 5586 static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, 5587 struct btrfs_root *root, 5588 u64 parent, u64 root_objectid, 5589 u64 flags, struct btrfs_disk_key *key, 5590 int level, struct btrfs_key *ins) 5591 { 5592 int ret; 5593 struct btrfs_fs_info *fs_info = root->fs_info; 5594 struct btrfs_extent_item *extent_item; 5595 struct btrfs_tree_block_info *block_info; 5596 struct btrfs_extent_inline_ref *iref; 5597 struct btrfs_path *path; 5598 struct extent_buffer *leaf; 5599 u32 size = sizeof(*extent_item) + sizeof(*block_info) + sizeof(*iref); 5600 5601 path = btrfs_alloc_path(); 5602 BUG_ON(!path); 5603 5604 path->leave_spinning = 1; 5605 ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path, 5606 ins, size); 5607 BUG_ON(ret); 5608 5609 leaf = path->nodes[0]; 5610 extent_item = btrfs_item_ptr(leaf, path->slots[0], 5611 struct btrfs_extent_item); 5612 btrfs_set_extent_refs(leaf, extent_item, 1); 5613 btrfs_set_extent_generation(leaf, extent_item, trans->transid); 5614 btrfs_set_extent_flags(leaf, extent_item, 5615 flags | BTRFS_EXTENT_FLAG_TREE_BLOCK); 5616 block_info = (struct btrfs_tree_block_info *)(extent_item + 1); 5617 5618 btrfs_set_tree_block_key(leaf, block_info, key); 5619 btrfs_set_tree_block_level(leaf, block_info, level); 5620 5621 iref = (struct btrfs_extent_inline_ref *)(block_info + 1); 5622 if (parent > 0) { 5623 BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)); 5624 btrfs_set_extent_inline_ref_type(leaf, iref, 5625 BTRFS_SHARED_BLOCK_REF_KEY); 5626 btrfs_set_extent_inline_ref_offset(leaf, iref, parent); 5627 } else { 5628 btrfs_set_extent_inline_ref_type(leaf, iref, 5629 BTRFS_TREE_BLOCK_REF_KEY); 5630 btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid); 5631 } 5632 5633 btrfs_mark_buffer_dirty(leaf); 5634 btrfs_free_path(path); 5635 5636 ret = update_block_group(trans, root, ins->objectid, ins->offset, 1); 5637 if (ret) { 5638 printk(KERN_ERR "btrfs update block group failed for %llu " 5639 "%llu\n", (unsigned long long)ins->objectid, 5640 (unsigned long long)ins->offset); 5641 BUG(); 5642 } 5643 return ret; 5644 } 5645 5646 int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, 5647 struct btrfs_root *root, 5648 u64 root_objectid, u64 owner, 5649 u64 offset, struct btrfs_key *ins) 5650 { 5651 int ret; 5652 5653 BUG_ON(root_objectid == BTRFS_TREE_LOG_OBJECTID); 5654 5655 ret = btrfs_add_delayed_data_ref(trans, ins->objectid, ins->offset, 5656 0, root_objectid, owner, offset, 5657 BTRFS_ADD_DELAYED_EXTENT, NULL); 5658 return ret; 5659 } 5660 5661 /* 5662 * this is used by the tree logging recovery code. 
It records that 5663 * an extent has been allocated and makes sure to clear the free 5664 * space cache bits as well 5665 */ 5666 int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, 5667 struct btrfs_root *root, 5668 u64 root_objectid, u64 owner, u64 offset, 5669 struct btrfs_key *ins) 5670 { 5671 int ret; 5672 struct btrfs_block_group_cache *block_group; 5673 struct btrfs_caching_control *caching_ctl; 5674 u64 start = ins->objectid; 5675 u64 num_bytes = ins->offset; 5676 5677 block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid); 5678 cache_block_group(block_group, trans, NULL, 0); 5679 caching_ctl = get_caching_control(block_group); 5680 5681 if (!caching_ctl) { 5682 BUG_ON(!block_group_cache_done(block_group)); 5683 ret = btrfs_remove_free_space(block_group, start, num_bytes); 5684 BUG_ON(ret); 5685 } else { 5686 mutex_lock(&caching_ctl->mutex); 5687 5688 if (start >= caching_ctl->progress) { 5689 ret = add_excluded_extent(root, start, num_bytes); 5690 BUG_ON(ret); 5691 } else if (start + num_bytes <= caching_ctl->progress) { 5692 ret = btrfs_remove_free_space(block_group, 5693 start, num_bytes); 5694 BUG_ON(ret); 5695 } else { 5696 num_bytes = caching_ctl->progress - start; 5697 ret = btrfs_remove_free_space(block_group, 5698 start, num_bytes); 5699 BUG_ON(ret); 5700 5701 start = caching_ctl->progress; 5702 num_bytes = ins->objectid + ins->offset - 5703 caching_ctl->progress; 5704 ret = add_excluded_extent(root, start, num_bytes); 5705 BUG_ON(ret); 5706 } 5707 5708 mutex_unlock(&caching_ctl->mutex); 5709 put_caching_control(caching_ctl); 5710 } 5711 5712 ret = btrfs_update_reserved_bytes(block_group, ins->offset, 1, 1); 5713 BUG_ON(ret); 5714 btrfs_put_block_group(block_group); 5715 ret = alloc_reserved_file_extent(trans, root, 0, root_objectid, 5716 0, owner, offset, ins, 1); 5717 return ret; 5718 } 5719 5720 struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, 5721 struct btrfs_root *root, 5722 u64 bytenr, u32 blocksize, 5723 int level) 5724 { 5725 struct extent_buffer *buf; 5726 5727 buf = btrfs_find_create_tree_block(root, bytenr, blocksize); 5728 if (!buf) 5729 return ERR_PTR(-ENOMEM); 5730 btrfs_set_header_generation(buf, trans->transid); 5731 btrfs_set_buffer_lockdep_class(buf, level); 5732 btrfs_tree_lock(buf); 5733 clean_tree_block(trans, root, buf); 5734 5735 btrfs_set_lock_blocking(buf); 5736 btrfs_set_buffer_uptodate(buf); 5737 5738 if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) { 5739 /* 5740 * we allow two log transactions at a time, use different 5741 * EXENT bit to differentiate dirty pages. 
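		 * (EXTENT_DIRTY is used for even log transids, EXTENT_NEW
		 * for odd ones, as selected right below)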
5742 */ 5743 if (root->log_transid % 2 == 0) 5744 set_extent_dirty(&root->dirty_log_pages, buf->start, 5745 buf->start + buf->len - 1, GFP_NOFS); 5746 else 5747 set_extent_new(&root->dirty_log_pages, buf->start, 5748 buf->start + buf->len - 1, GFP_NOFS); 5749 } else { 5750 set_extent_dirty(&trans->transaction->dirty_pages, buf->start, 5751 buf->start + buf->len - 1, GFP_NOFS); 5752 } 5753 trans->blocks_used++; 5754 /* this returns a buffer locked for blocking */ 5755 return buf; 5756 } 5757 5758 static struct btrfs_block_rsv * 5759 use_block_rsv(struct btrfs_trans_handle *trans, 5760 struct btrfs_root *root, u32 blocksize) 5761 { 5762 struct btrfs_block_rsv *block_rsv; 5763 struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv; 5764 int ret; 5765 5766 block_rsv = get_block_rsv(trans, root); 5767 5768 if (block_rsv->size == 0) { 5769 ret = reserve_metadata_bytes(trans, root, block_rsv, 5770 blocksize, 0); 5771 /* 5772 * If we couldn't reserve metadata bytes try and use some from 5773 * the global reserve. 5774 */ 5775 if (ret && block_rsv != global_rsv) { 5776 ret = block_rsv_use_bytes(global_rsv, blocksize); 5777 if (!ret) 5778 return global_rsv; 5779 return ERR_PTR(ret); 5780 } else if (ret) { 5781 return ERR_PTR(ret); 5782 } 5783 return block_rsv; 5784 } 5785 5786 ret = block_rsv_use_bytes(block_rsv, blocksize); 5787 if (!ret) 5788 return block_rsv; 5789 if (ret) { 5790 WARN_ON(1); 5791 ret = reserve_metadata_bytes(trans, root, block_rsv, blocksize, 5792 0); 5793 if (!ret) { 5794 spin_lock(&block_rsv->lock); 5795 block_rsv->size += blocksize; 5796 spin_unlock(&block_rsv->lock); 5797 return block_rsv; 5798 } else if (ret && block_rsv != global_rsv) { 5799 ret = block_rsv_use_bytes(global_rsv, blocksize); 5800 if (!ret) 5801 return global_rsv; 5802 } 5803 } 5804 5805 return ERR_PTR(-ENOSPC); 5806 } 5807 5808 static void unuse_block_rsv(struct btrfs_block_rsv *block_rsv, u32 blocksize) 5809 { 5810 block_rsv_add_bytes(block_rsv, blocksize, 0); 5811 block_rsv_release_bytes(block_rsv, NULL, 0); 5812 } 5813 5814 /* 5815 * finds a free extent and does all the dirty work required for allocation 5816 * returns the key for the extent through ins, and a tree buffer for 5817 * the first block of the extent through buf. 5818 * 5819 * returns the tree buffer or NULL. 
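 * In practice errors come back as ERR_PTR() values rather than NULL, so
 * callers should test the result with IS_ERR().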
5820 */ 5821 struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, 5822 struct btrfs_root *root, u32 blocksize, 5823 u64 parent, u64 root_objectid, 5824 struct btrfs_disk_key *key, int level, 5825 u64 hint, u64 empty_size) 5826 { 5827 struct btrfs_key ins; 5828 struct btrfs_block_rsv *block_rsv; 5829 struct extent_buffer *buf; 5830 u64 flags = 0; 5831 int ret; 5832 5833 5834 block_rsv = use_block_rsv(trans, root, blocksize); 5835 if (IS_ERR(block_rsv)) 5836 return ERR_CAST(block_rsv); 5837 5838 ret = btrfs_reserve_extent(trans, root, blocksize, blocksize, 5839 empty_size, hint, (u64)-1, &ins, 0); 5840 if (ret) { 5841 unuse_block_rsv(block_rsv, blocksize); 5842 return ERR_PTR(ret); 5843 } 5844 5845 buf = btrfs_init_new_buffer(trans, root, ins.objectid, 5846 blocksize, level); 5847 BUG_ON(IS_ERR(buf)); 5848 5849 if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) { 5850 if (parent == 0) 5851 parent = ins.objectid; 5852 flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF; 5853 } else 5854 BUG_ON(parent > 0); 5855 5856 if (root_objectid != BTRFS_TREE_LOG_OBJECTID) { 5857 struct btrfs_delayed_extent_op *extent_op; 5858 extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS); 5859 BUG_ON(!extent_op); 5860 if (key) 5861 memcpy(&extent_op->key, key, sizeof(extent_op->key)); 5862 else 5863 memset(&extent_op->key, 0, sizeof(extent_op->key)); 5864 extent_op->flags_to_set = flags; 5865 extent_op->update_key = 1; 5866 extent_op->update_flags = 1; 5867 extent_op->is_data = 0; 5868 5869 ret = btrfs_add_delayed_tree_ref(trans, ins.objectid, 5870 ins.offset, parent, root_objectid, 5871 level, BTRFS_ADD_DELAYED_EXTENT, 5872 extent_op); 5873 BUG_ON(ret); 5874 } 5875 return buf; 5876 } 5877 5878 struct walk_control { 5879 u64 refs[BTRFS_MAX_LEVEL]; 5880 u64 flags[BTRFS_MAX_LEVEL]; 5881 struct btrfs_key update_progress; 5882 int stage; 5883 int level; 5884 int shared_level; 5885 int update_ref; 5886 int keep_locks; 5887 int reada_slot; 5888 int reada_count; 5889 }; 5890 5891 #define DROP_REFERENCE 1 5892 #define UPDATE_BACKREF 2 5893 5894 static noinline void reada_walk_down(struct btrfs_trans_handle *trans, 5895 struct btrfs_root *root, 5896 struct walk_control *wc, 5897 struct btrfs_path *path) 5898 { 5899 u64 bytenr; 5900 u64 generation; 5901 u64 refs; 5902 u64 flags; 5903 u32 nritems; 5904 u32 blocksize; 5905 struct btrfs_key key; 5906 struct extent_buffer *eb; 5907 int ret; 5908 int slot; 5909 int nread = 0; 5910 5911 if (path->slots[wc->level] < wc->reada_slot) { 5912 wc->reada_count = wc->reada_count * 2 / 3; 5913 wc->reada_count = max(wc->reada_count, 2); 5914 } else { 5915 wc->reada_count = wc->reada_count * 3 / 2; 5916 wc->reada_count = min_t(int, wc->reada_count, 5917 BTRFS_NODEPTRS_PER_BLOCK(root)); 5918 } 5919 5920 eb = path->nodes[wc->level]; 5921 nritems = btrfs_header_nritems(eb); 5922 blocksize = btrfs_level_size(root, wc->level - 1); 5923 5924 for (slot = path->slots[wc->level]; slot < nritems; slot++) { 5925 if (nread >= wc->reada_count) 5926 break; 5927 5928 cond_resched(); 5929 bytenr = btrfs_node_blockptr(eb, slot); 5930 generation = btrfs_node_ptr_generation(eb, slot); 5931 5932 if (slot == path->slots[wc->level]) 5933 goto reada; 5934 5935 if (wc->stage == UPDATE_BACKREF && 5936 generation <= root->root_key.offset) 5937 continue; 5938 5939 /* We don't lock the tree block, it's OK to be racy here */ 5940 ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize, 5941 &refs, &flags); 5942 BUG_ON(ret); 5943 BUG_ON(refs == 0); 5944 5945 if (wc->stage == DROP_REFERENCE) { 5946 if (refs 
== 1) 5947 goto reada; 5948 5949 if (wc->level == 1 && 5950 (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) 5951 continue; 5952 if (!wc->update_ref || 5953 generation <= root->root_key.offset) 5954 continue; 5955 btrfs_node_key_to_cpu(eb, &key, slot); 5956 ret = btrfs_comp_cpu_keys(&key, 5957 &wc->update_progress); 5958 if (ret < 0) 5959 continue; 5960 } else { 5961 if (wc->level == 1 && 5962 (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) 5963 continue; 5964 } 5965 reada: 5966 ret = readahead_tree_block(root, bytenr, blocksize, 5967 generation); 5968 if (ret) 5969 break; 5970 nread++; 5971 } 5972 wc->reada_slot = slot; 5973 } 5974 5975 /* 5976 * hepler to process tree block while walking down the tree. 5977 * 5978 * when wc->stage == UPDATE_BACKREF, this function updates 5979 * back refs for pointers in the block. 5980 * 5981 * NOTE: return value 1 means we should stop walking down. 5982 */ 5983 static noinline int walk_down_proc(struct btrfs_trans_handle *trans, 5984 struct btrfs_root *root, 5985 struct btrfs_path *path, 5986 struct walk_control *wc, int lookup_info) 5987 { 5988 int level = wc->level; 5989 struct extent_buffer *eb = path->nodes[level]; 5990 u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF; 5991 int ret; 5992 5993 if (wc->stage == UPDATE_BACKREF && 5994 btrfs_header_owner(eb) != root->root_key.objectid) 5995 return 1; 5996 5997 /* 5998 * when reference count of tree block is 1, it won't increase 5999 * again. once full backref flag is set, we never clear it. 6000 */ 6001 if (lookup_info && 6002 ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) || 6003 (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) { 6004 BUG_ON(!path->locks[level]); 6005 ret = btrfs_lookup_extent_info(trans, root, 6006 eb->start, eb->len, 6007 &wc->refs[level], 6008 &wc->flags[level]); 6009 BUG_ON(ret); 6010 BUG_ON(wc->refs[level] == 0); 6011 } 6012 6013 if (wc->stage == DROP_REFERENCE) { 6014 if (wc->refs[level] > 1) 6015 return 1; 6016 6017 if (path->locks[level] && !wc->keep_locks) { 6018 btrfs_tree_unlock(eb); 6019 path->locks[level] = 0; 6020 } 6021 return 0; 6022 } 6023 6024 /* wc->stage == UPDATE_BACKREF */ 6025 if (!(wc->flags[level] & flag)) { 6026 BUG_ON(!path->locks[level]); 6027 ret = btrfs_inc_ref(trans, root, eb, 1); 6028 BUG_ON(ret); 6029 ret = btrfs_dec_ref(trans, root, eb, 0); 6030 BUG_ON(ret); 6031 ret = btrfs_set_disk_extent_flags(trans, root, eb->start, 6032 eb->len, flag, 0); 6033 BUG_ON(ret); 6034 wc->flags[level] |= flag; 6035 } 6036 6037 /* 6038 * the block is shared by multiple trees, so it's not good to 6039 * keep the tree lock 6040 */ 6041 if (path->locks[level] && level > 0) { 6042 btrfs_tree_unlock(eb); 6043 path->locks[level] = 0; 6044 } 6045 return 0; 6046 } 6047 6048 /* 6049 * hepler to process tree block pointer. 6050 * 6051 * when wc->stage == DROP_REFERENCE, this function checks 6052 * reference count of the block pointed to. if the block 6053 * is shared and we need update back refs for the subtree 6054 * rooted at the block, this function changes wc->stage to 6055 * UPDATE_BACKREF. if the block is shared and there is no 6056 * need to update back, this function drops the reference 6057 * to the block. 6058 * 6059 * NOTE: return value 1 means we should stop walking down. 
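 * (walk_down_tree() interprets the result as follows: 0 means the child
 * block was read and locked and wc->level now points one level lower,
 * 1 means the child was skipped and the caller advances path->slots[level],
 * and a negative value aborts the walk with an error.)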
6060 */ 6061 static noinline int do_walk_down(struct btrfs_trans_handle *trans, 6062 struct btrfs_root *root, 6063 struct btrfs_path *path, 6064 struct walk_control *wc, int *lookup_info) 6065 { 6066 u64 bytenr; 6067 u64 generation; 6068 u64 parent; 6069 u32 blocksize; 6070 struct btrfs_key key; 6071 struct extent_buffer *next; 6072 int level = wc->level; 6073 int reada = 0; 6074 int ret = 0; 6075 6076 generation = btrfs_node_ptr_generation(path->nodes[level], 6077 path->slots[level]); 6078 /* 6079 * if the lower level block was created before the snapshot 6080 * was created, we know there is no need to update back refs 6081 * for the subtree 6082 */ 6083 if (wc->stage == UPDATE_BACKREF && 6084 generation <= root->root_key.offset) { 6085 *lookup_info = 1; 6086 return 1; 6087 } 6088 6089 bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]); 6090 blocksize = btrfs_level_size(root, level - 1); 6091 6092 next = btrfs_find_tree_block(root, bytenr, blocksize); 6093 if (!next) { 6094 next = btrfs_find_create_tree_block(root, bytenr, blocksize); 6095 if (!next) 6096 return -ENOMEM; 6097 reada = 1; 6098 } 6099 btrfs_tree_lock(next); 6100 btrfs_set_lock_blocking(next); 6101 6102 ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize, 6103 &wc->refs[level - 1], 6104 &wc->flags[level - 1]); 6105 BUG_ON(ret); 6106 BUG_ON(wc->refs[level - 1] == 0); 6107 *lookup_info = 0; 6108 6109 if (wc->stage == DROP_REFERENCE) { 6110 if (wc->refs[level - 1] > 1) { 6111 if (level == 1 && 6112 (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF)) 6113 goto skip; 6114 6115 if (!wc->update_ref || 6116 generation <= root->root_key.offset) 6117 goto skip; 6118 6119 btrfs_node_key_to_cpu(path->nodes[level], &key, 6120 path->slots[level]); 6121 ret = btrfs_comp_cpu_keys(&key, &wc->update_progress); 6122 if (ret < 0) 6123 goto skip; 6124 6125 wc->stage = UPDATE_BACKREF; 6126 wc->shared_level = level - 1; 6127 } 6128 } else { 6129 if (level == 1 && 6130 (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF)) 6131 goto skip; 6132 } 6133 6134 if (!btrfs_buffer_uptodate(next, generation)) { 6135 btrfs_tree_unlock(next); 6136 free_extent_buffer(next); 6137 next = NULL; 6138 *lookup_info = 1; 6139 } 6140 6141 if (!next) { 6142 if (reada && level == 1) 6143 reada_walk_down(trans, root, wc, path); 6144 next = read_tree_block(root, bytenr, blocksize, generation); 6145 if (!next) 6146 return -EIO; 6147 btrfs_tree_lock(next); 6148 btrfs_set_lock_blocking(next); 6149 } 6150 6151 level--; 6152 BUG_ON(level != btrfs_header_level(next)); 6153 path->nodes[level] = next; 6154 path->slots[level] = 0; 6155 path->locks[level] = 1; 6156 wc->level = level; 6157 if (wc->level == 1) 6158 wc->reada_slot = 0; 6159 return 0; 6160 skip: 6161 wc->refs[level - 1] = 0; 6162 wc->flags[level - 1] = 0; 6163 if (wc->stage == DROP_REFERENCE) { 6164 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) { 6165 parent = path->nodes[level]->start; 6166 } else { 6167 BUG_ON(root->root_key.objectid != 6168 btrfs_header_owner(path->nodes[level])); 6169 parent = 0; 6170 } 6171 6172 ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent, 6173 root->root_key.objectid, level - 1, 0); 6174 BUG_ON(ret); 6175 } 6176 btrfs_tree_unlock(next); 6177 free_extent_buffer(next); 6178 *lookup_info = 1; 6179 return 1; 6180 } 6181 6182 /* 6183 * hepler to process tree block while walking up the tree. 6184 * 6185 * when wc->stage == DROP_REFERENCE, this function drops 6186 * reference count on the block. 
6187 * 6188 * when wc->stage == UPDATE_BACKREF, this function changes 6189 * wc->stage back to DROP_REFERENCE if we changed wc->stage 6190 * to UPDATE_BACKREF previously while processing the block. 6191 * 6192 * NOTE: return value 1 means we should stop walking up. 6193 */ 6194 static noinline int walk_up_proc(struct btrfs_trans_handle *trans, 6195 struct btrfs_root *root, 6196 struct btrfs_path *path, 6197 struct walk_control *wc) 6198 { 6199 int ret; 6200 int level = wc->level; 6201 struct extent_buffer *eb = path->nodes[level]; 6202 u64 parent = 0; 6203 6204 if (wc->stage == UPDATE_BACKREF) { 6205 BUG_ON(wc->shared_level < level); 6206 if (level < wc->shared_level) 6207 goto out; 6208 6209 ret = find_next_key(path, level + 1, &wc->update_progress); 6210 if (ret > 0) 6211 wc->update_ref = 0; 6212 6213 wc->stage = DROP_REFERENCE; 6214 wc->shared_level = -1; 6215 path->slots[level] = 0; 6216 6217 /* 6218 * check reference count again if the block isn't locked. 6219 * we should start walking down the tree again if reference 6220 * count is one. 6221 */ 6222 if (!path->locks[level]) { 6223 BUG_ON(level == 0); 6224 btrfs_tree_lock(eb); 6225 btrfs_set_lock_blocking(eb); 6226 path->locks[level] = 1; 6227 6228 ret = btrfs_lookup_extent_info(trans, root, 6229 eb->start, eb->len, 6230 &wc->refs[level], 6231 &wc->flags[level]); 6232 BUG_ON(ret); 6233 BUG_ON(wc->refs[level] == 0); 6234 if (wc->refs[level] == 1) { 6235 btrfs_tree_unlock(eb); 6236 path->locks[level] = 0; 6237 return 1; 6238 } 6239 } 6240 } 6241 6242 /* wc->stage == DROP_REFERENCE */ 6243 BUG_ON(wc->refs[level] > 1 && !path->locks[level]); 6244 6245 if (wc->refs[level] == 1) { 6246 if (level == 0) { 6247 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) 6248 ret = btrfs_dec_ref(trans, root, eb, 1); 6249 else 6250 ret = btrfs_dec_ref(trans, root, eb, 0); 6251 BUG_ON(ret); 6252 } 6253 /* make block locked assertion in clean_tree_block happy */ 6254 if (!path->locks[level] && 6255 btrfs_header_generation(eb) == trans->transid) { 6256 btrfs_tree_lock(eb); 6257 btrfs_set_lock_blocking(eb); 6258 path->locks[level] = 1; 6259 } 6260 clean_tree_block(trans, root, eb); 6261 } 6262 6263 if (eb == root->node) { 6264 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) 6265 parent = eb->start; 6266 else 6267 BUG_ON(root->root_key.objectid != 6268 btrfs_header_owner(eb)); 6269 } else { 6270 if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF) 6271 parent = path->nodes[level + 1]->start; 6272 else 6273 BUG_ON(root->root_key.objectid != 6274 btrfs_header_owner(path->nodes[level + 1])); 6275 } 6276 6277 btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1); 6278 out: 6279 wc->refs[level] = 0; 6280 wc->flags[level] = 0; 6281 return 0; 6282 } 6283 6284 static noinline int walk_down_tree(struct btrfs_trans_handle *trans, 6285 struct btrfs_root *root, 6286 struct btrfs_path *path, 6287 struct walk_control *wc) 6288 { 6289 int level = wc->level; 6290 int lookup_info = 1; 6291 int ret; 6292 6293 while (level >= 0) { 6294 ret = walk_down_proc(trans, root, path, wc, lookup_info); 6295 if (ret > 0) 6296 break; 6297 6298 if (level == 0) 6299 break; 6300 6301 if (path->slots[level] >= 6302 btrfs_header_nritems(path->nodes[level])) 6303 break; 6304 6305 ret = do_walk_down(trans, root, path, wc, &lookup_info); 6306 if (ret > 0) { 6307 path->slots[level]++; 6308 continue; 6309 } else if (ret < 0) 6310 return ret; 6311 level = wc->level; 6312 } 6313 return 0; 6314 } 6315 6316 static noinline int walk_up_tree(struct btrfs_trans_handle *trans, 
6317 struct btrfs_root *root, 6318 struct btrfs_path *path, 6319 struct walk_control *wc, int max_level) 6320 { 6321 int level = wc->level; 6322 int ret; 6323 6324 path->slots[level] = btrfs_header_nritems(path->nodes[level]); 6325 while (level < max_level && path->nodes[level]) { 6326 wc->level = level; 6327 if (path->slots[level] + 1 < 6328 btrfs_header_nritems(path->nodes[level])) { 6329 path->slots[level]++; 6330 return 0; 6331 } else { 6332 ret = walk_up_proc(trans, root, path, wc); 6333 if (ret > 0) 6334 return 0; 6335 6336 if (path->locks[level]) { 6337 btrfs_tree_unlock(path->nodes[level]); 6338 path->locks[level] = 0; 6339 } 6340 free_extent_buffer(path->nodes[level]); 6341 path->nodes[level] = NULL; 6342 level++; 6343 } 6344 } 6345 return 1; 6346 } 6347 6348 /* 6349 * drop a subvolume tree. 6350 * 6351 * this function traverses the tree freeing any blocks that only 6352 * referenced by the tree. 6353 * 6354 * when a shared tree block is found. this function decreases its 6355 * reference count by one. if update_ref is true, this function 6356 * also make sure backrefs for the shared block and all lower level 6357 * blocks are properly updated. 6358 */ 6359 int btrfs_drop_snapshot(struct btrfs_root *root, 6360 struct btrfs_block_rsv *block_rsv, int update_ref) 6361 { 6362 struct btrfs_path *path; 6363 struct btrfs_trans_handle *trans; 6364 struct btrfs_root *tree_root = root->fs_info->tree_root; 6365 struct btrfs_root_item *root_item = &root->root_item; 6366 struct walk_control *wc; 6367 struct btrfs_key key; 6368 int err = 0; 6369 int ret; 6370 int level; 6371 6372 path = btrfs_alloc_path(); 6373 BUG_ON(!path); 6374 6375 wc = kzalloc(sizeof(*wc), GFP_NOFS); 6376 BUG_ON(!wc); 6377 6378 trans = btrfs_start_transaction(tree_root, 0); 6379 BUG_ON(IS_ERR(trans)); 6380 6381 if (block_rsv) 6382 trans->block_rsv = block_rsv; 6383 6384 if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) { 6385 level = btrfs_header_level(root->node); 6386 path->nodes[level] = btrfs_lock_root_node(root); 6387 btrfs_set_lock_blocking(path->nodes[level]); 6388 path->slots[level] = 0; 6389 path->locks[level] = 1; 6390 memset(&wc->update_progress, 0, 6391 sizeof(wc->update_progress)); 6392 } else { 6393 btrfs_disk_key_to_cpu(&key, &root_item->drop_progress); 6394 memcpy(&wc->update_progress, &key, 6395 sizeof(wc->update_progress)); 6396 6397 level = root_item->drop_level; 6398 BUG_ON(level == 0); 6399 path->lowest_level = level; 6400 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 6401 path->lowest_level = 0; 6402 if (ret < 0) { 6403 err = ret; 6404 goto out; 6405 } 6406 WARN_ON(ret > 0); 6407 6408 /* 6409 * unlock our path, this is safe because only this 6410 * function is allowed to delete this snapshot 6411 */ 6412 btrfs_unlock_up_safe(path, 0); 6413 6414 level = btrfs_header_level(root->node); 6415 while (1) { 6416 btrfs_tree_lock(path->nodes[level]); 6417 btrfs_set_lock_blocking(path->nodes[level]); 6418 6419 ret = btrfs_lookup_extent_info(trans, root, 6420 path->nodes[level]->start, 6421 path->nodes[level]->len, 6422 &wc->refs[level], 6423 &wc->flags[level]); 6424 BUG_ON(ret); 6425 BUG_ON(wc->refs[level] == 0); 6426 6427 if (level == root_item->drop_level) 6428 break; 6429 6430 btrfs_tree_unlock(path->nodes[level]); 6431 WARN_ON(wc->refs[level] != 1); 6432 level--; 6433 } 6434 } 6435 6436 wc->level = level; 6437 wc->shared_level = -1; 6438 wc->stage = DROP_REFERENCE; 6439 wc->update_ref = update_ref; 6440 wc->keep_locks = 0; 6441 wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root); 6442 6443 
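	/*
	 * The loop below alternates walk_down_tree() and walk_up_tree().
	 * Each time the transaction gets too big, the current position is
	 * saved in root_item->drop_progress / drop_level and the root item
	 * is written back, which is what the resume path above relies on
	 * when a partially dropped snapshot is picked up again.
	 */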
while (1) { 6444 ret = walk_down_tree(trans, root, path, wc); 6445 if (ret < 0) { 6446 err = ret; 6447 break; 6448 } 6449 6450 ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL); 6451 if (ret < 0) { 6452 err = ret; 6453 break; 6454 } 6455 6456 if (ret > 0) { 6457 BUG_ON(wc->stage != DROP_REFERENCE); 6458 break; 6459 } 6460 6461 if (wc->stage == DROP_REFERENCE) { 6462 level = wc->level; 6463 btrfs_node_key(path->nodes[level], 6464 &root_item->drop_progress, 6465 path->slots[level]); 6466 root_item->drop_level = level; 6467 } 6468 6469 BUG_ON(wc->level == 0); 6470 if (btrfs_should_end_transaction(trans, tree_root)) { 6471 ret = btrfs_update_root(trans, tree_root, 6472 &root->root_key, 6473 root_item); 6474 BUG_ON(ret); 6475 6476 btrfs_end_transaction_throttle(trans, tree_root); 6477 trans = btrfs_start_transaction(tree_root, 0); 6478 BUG_ON(IS_ERR(trans)); 6479 if (block_rsv) 6480 trans->block_rsv = block_rsv; 6481 } 6482 } 6483 btrfs_release_path(root, path); 6484 BUG_ON(err); 6485 6486 ret = btrfs_del_root(trans, tree_root, &root->root_key); 6487 BUG_ON(ret); 6488 6489 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) { 6490 ret = btrfs_find_last_root(tree_root, root->root_key.objectid, 6491 NULL, NULL); 6492 BUG_ON(ret < 0); 6493 if (ret > 0) { 6494 /* if we fail to delete the orphan item this time 6495 * around, it'll get picked up the next time. 6496 * 6497 * The most common failure here is just -ENOENT. 6498 */ 6499 btrfs_del_orphan_item(trans, tree_root, 6500 root->root_key.objectid); 6501 } 6502 } 6503 6504 if (root->in_radix) { 6505 btrfs_free_fs_root(tree_root->fs_info, root); 6506 } else { 6507 free_extent_buffer(root->node); 6508 free_extent_buffer(root->commit_root); 6509 kfree(root); 6510 } 6511 out: 6512 btrfs_end_transaction_throttle(trans, tree_root); 6513 kfree(wc); 6514 btrfs_free_path(path); 6515 return err; 6516 } 6517 6518 /* 6519 * drop subtree rooted at tree block 'node'. 
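 * this is only used by the relocation code: the root passed in must be the
 * tree reloc root (the BUG_ON below enforces that), and both 'node' and
 * 'parent' must already be locked by the caller.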
6520 * 6521 * NOTE: this function will unlock and release tree block 'node' 6522 */ 6523 int btrfs_drop_subtree(struct btrfs_trans_handle *trans, 6524 struct btrfs_root *root, 6525 struct extent_buffer *node, 6526 struct extent_buffer *parent) 6527 { 6528 struct btrfs_path *path; 6529 struct walk_control *wc; 6530 int level; 6531 int parent_level; 6532 int ret = 0; 6533 int wret; 6534 6535 BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID); 6536 6537 path = btrfs_alloc_path(); 6538 if (!path) 6539 return -ENOMEM; 6540 6541 wc = kzalloc(sizeof(*wc), GFP_NOFS); 6542 if (!wc) { 6543 btrfs_free_path(path); 6544 return -ENOMEM; 6545 } 6546 6547 btrfs_assert_tree_locked(parent); 6548 parent_level = btrfs_header_level(parent); 6549 extent_buffer_get(parent); 6550 path->nodes[parent_level] = parent; 6551 path->slots[parent_level] = btrfs_header_nritems(parent); 6552 6553 btrfs_assert_tree_locked(node); 6554 level = btrfs_header_level(node); 6555 path->nodes[level] = node; 6556 path->slots[level] = 0; 6557 path->locks[level] = 1; 6558 6559 wc->refs[parent_level] = 1; 6560 wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF; 6561 wc->level = level; 6562 wc->shared_level = -1; 6563 wc->stage = DROP_REFERENCE; 6564 wc->update_ref = 0; 6565 wc->keep_locks = 1; 6566 wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root); 6567 6568 while (1) { 6569 wret = walk_down_tree(trans, root, path, wc); 6570 if (wret < 0) { 6571 ret = wret; 6572 break; 6573 } 6574 6575 wret = walk_up_tree(trans, root, path, wc, parent_level); 6576 if (wret < 0) 6577 ret = wret; 6578 if (wret != 0) 6579 break; 6580 } 6581 6582 kfree(wc); 6583 btrfs_free_path(path); 6584 return ret; 6585 } 6586 6587 #if 0 6588 static unsigned long calc_ra(unsigned long start, unsigned long last, 6589 unsigned long nr) 6590 { 6591 return min(last, start + nr - 1); 6592 } 6593 6594 static noinline int relocate_inode_pages(struct inode *inode, u64 start, 6595 u64 len) 6596 { 6597 u64 page_start; 6598 u64 page_end; 6599 unsigned long first_index; 6600 unsigned long last_index; 6601 unsigned long i; 6602 struct page *page; 6603 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 6604 struct file_ra_state *ra; 6605 struct btrfs_ordered_extent *ordered; 6606 unsigned int total_read = 0; 6607 unsigned int total_dirty = 0; 6608 int ret = 0; 6609 6610 ra = kzalloc(sizeof(*ra), GFP_NOFS); 6611 if (!ra) 6612 return -ENOMEM; 6613 6614 mutex_lock(&inode->i_mutex); 6615 first_index = start >> PAGE_CACHE_SHIFT; 6616 last_index = (start + len - 1) >> PAGE_CACHE_SHIFT; 6617 6618 /* make sure the dirty trick played by the caller work */ 6619 ret = invalidate_inode_pages2_range(inode->i_mapping, 6620 first_index, last_index); 6621 if (ret) 6622 goto out_unlock; 6623 6624 file_ra_state_init(ra, inode->i_mapping); 6625 6626 for (i = first_index ; i <= last_index; i++) { 6627 if (total_read % ra->ra_pages == 0) { 6628 btrfs_force_ra(inode->i_mapping, ra, NULL, i, 6629 calc_ra(i, last_index, ra->ra_pages)); 6630 } 6631 total_read++; 6632 again: 6633 if (((u64)i << PAGE_CACHE_SHIFT) > i_size_read(inode)) 6634 BUG_ON(1); 6635 page = grab_cache_page(inode->i_mapping, i); 6636 if (!page) { 6637 ret = -ENOMEM; 6638 goto out_unlock; 6639 } 6640 if (!PageUptodate(page)) { 6641 btrfs_readpage(NULL, page); 6642 lock_page(page); 6643 if (!PageUptodate(page)) { 6644 unlock_page(page); 6645 page_cache_release(page); 6646 ret = -EIO; 6647 goto out_unlock; 6648 } 6649 } 6650 wait_on_page_writeback(page); 6651 6652 page_start = (u64)page->index << PAGE_CACHE_SHIFT; 6653 
page_end = page_start + PAGE_CACHE_SIZE - 1; 6654 lock_extent(io_tree, page_start, page_end, GFP_NOFS); 6655 6656 ordered = btrfs_lookup_ordered_extent(inode, page_start); 6657 if (ordered) { 6658 unlock_extent(io_tree, page_start, page_end, GFP_NOFS); 6659 unlock_page(page); 6660 page_cache_release(page); 6661 btrfs_start_ordered_extent(inode, ordered, 1); 6662 btrfs_put_ordered_extent(ordered); 6663 goto again; 6664 } 6665 set_page_extent_mapped(page); 6666 6667 if (i == first_index) 6668 set_extent_bits(io_tree, page_start, page_end, 6669 EXTENT_BOUNDARY, GFP_NOFS); 6670 btrfs_set_extent_delalloc(inode, page_start, page_end); 6671 6672 set_page_dirty(page); 6673 total_dirty++; 6674 6675 unlock_extent(io_tree, page_start, page_end, GFP_NOFS); 6676 unlock_page(page); 6677 page_cache_release(page); 6678 } 6679 6680 out_unlock: 6681 kfree(ra); 6682 mutex_unlock(&inode->i_mutex); 6683 balance_dirty_pages_ratelimited_nr(inode->i_mapping, total_dirty); 6684 return ret; 6685 } 6686 6687 static noinline int relocate_data_extent(struct inode *reloc_inode, 6688 struct btrfs_key *extent_key, 6689 u64 offset) 6690 { 6691 struct btrfs_root *root = BTRFS_I(reloc_inode)->root; 6692 struct extent_map_tree *em_tree = &BTRFS_I(reloc_inode)->extent_tree; 6693 struct extent_map *em; 6694 u64 start = extent_key->objectid - offset; 6695 u64 end = start + extent_key->offset - 1; 6696 6697 em = alloc_extent_map(GFP_NOFS); 6698 BUG_ON(!em); 6699 6700 em->start = start; 6701 em->len = extent_key->offset; 6702 em->block_len = extent_key->offset; 6703 em->block_start = extent_key->objectid; 6704 em->bdev = root->fs_info->fs_devices->latest_bdev; 6705 set_bit(EXTENT_FLAG_PINNED, &em->flags); 6706 6707 /* setup extent map to cheat btrfs_readpage */ 6708 lock_extent(&BTRFS_I(reloc_inode)->io_tree, start, end, GFP_NOFS); 6709 while (1) { 6710 int ret; 6711 write_lock(&em_tree->lock); 6712 ret = add_extent_mapping(em_tree, em); 6713 write_unlock(&em_tree->lock); 6714 if (ret != -EEXIST) { 6715 free_extent_map(em); 6716 break; 6717 } 6718 btrfs_drop_extent_cache(reloc_inode, start, end, 0); 6719 } 6720 unlock_extent(&BTRFS_I(reloc_inode)->io_tree, start, end, GFP_NOFS); 6721 6722 return relocate_inode_pages(reloc_inode, start, extent_key->offset); 6723 } 6724 6725 struct btrfs_ref_path { 6726 u64 extent_start; 6727 u64 nodes[BTRFS_MAX_LEVEL]; 6728 u64 root_objectid; 6729 u64 root_generation; 6730 u64 owner_objectid; 6731 u32 num_refs; 6732 int lowest_level; 6733 int current_level; 6734 int shared_level; 6735 6736 struct btrfs_key node_keys[BTRFS_MAX_LEVEL]; 6737 u64 new_nodes[BTRFS_MAX_LEVEL]; 6738 }; 6739 6740 struct disk_extent { 6741 u64 ram_bytes; 6742 u64 disk_bytenr; 6743 u64 disk_num_bytes; 6744 u64 offset; 6745 u64 num_bytes; 6746 u8 compression; 6747 u8 encryption; 6748 u16 other_encoding; 6749 }; 6750 6751 static int is_cowonly_root(u64 root_objectid) 6752 { 6753 if (root_objectid == BTRFS_ROOT_TREE_OBJECTID || 6754 root_objectid == BTRFS_EXTENT_TREE_OBJECTID || 6755 root_objectid == BTRFS_CHUNK_TREE_OBJECTID || 6756 root_objectid == BTRFS_DEV_TREE_OBJECTID || 6757 root_objectid == BTRFS_TREE_LOG_OBJECTID || 6758 root_objectid == BTRFS_CSUM_TREE_OBJECTID) 6759 return 1; 6760 return 0; 6761 } 6762 6763 static noinline int __next_ref_path(struct btrfs_trans_handle *trans, 6764 struct btrfs_root *extent_root, 6765 struct btrfs_ref_path *ref_path, 6766 int first_time) 6767 { 6768 struct extent_buffer *leaf; 6769 struct btrfs_path *path; 6770 struct btrfs_extent_ref *ref; 6771 struct btrfs_key key; 6772 struct 
btrfs_key found_key; 6773 u64 bytenr; 6774 u32 nritems; 6775 int level; 6776 int ret = 1; 6777 6778 path = btrfs_alloc_path(); 6779 if (!path) 6780 return -ENOMEM; 6781 6782 if (first_time) { 6783 ref_path->lowest_level = -1; 6784 ref_path->current_level = -1; 6785 ref_path->shared_level = -1; 6786 goto walk_up; 6787 } 6788 walk_down: 6789 level = ref_path->current_level - 1; 6790 while (level >= -1) { 6791 u64 parent; 6792 if (level < ref_path->lowest_level) 6793 break; 6794 6795 if (level >= 0) 6796 bytenr = ref_path->nodes[level]; 6797 else 6798 bytenr = ref_path->extent_start; 6799 BUG_ON(bytenr == 0); 6800 6801 parent = ref_path->nodes[level + 1]; 6802 ref_path->nodes[level + 1] = 0; 6803 ref_path->current_level = level; 6804 BUG_ON(parent == 0); 6805 6806 key.objectid = bytenr; 6807 key.offset = parent + 1; 6808 key.type = BTRFS_EXTENT_REF_KEY; 6809 6810 ret = btrfs_search_slot(trans, extent_root, &key, path, 0, 0); 6811 if (ret < 0) 6812 goto out; 6813 BUG_ON(ret == 0); 6814 6815 leaf = path->nodes[0]; 6816 nritems = btrfs_header_nritems(leaf); 6817 if (path->slots[0] >= nritems) { 6818 ret = btrfs_next_leaf(extent_root, path); 6819 if (ret < 0) 6820 goto out; 6821 if (ret > 0) 6822 goto next; 6823 leaf = path->nodes[0]; 6824 } 6825 6826 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 6827 if (found_key.objectid == bytenr && 6828 found_key.type == BTRFS_EXTENT_REF_KEY) { 6829 if (level < ref_path->shared_level) 6830 ref_path->shared_level = level; 6831 goto found; 6832 } 6833 next: 6834 level--; 6835 btrfs_release_path(extent_root, path); 6836 cond_resched(); 6837 } 6838 /* reached lowest level */ 6839 ret = 1; 6840 goto out; 6841 walk_up: 6842 level = ref_path->current_level; 6843 while (level < BTRFS_MAX_LEVEL - 1) { 6844 u64 ref_objectid; 6845 6846 if (level >= 0) 6847 bytenr = ref_path->nodes[level]; 6848 else 6849 bytenr = ref_path->extent_start; 6850 6851 BUG_ON(bytenr == 0); 6852 6853 key.objectid = bytenr; 6854 key.offset = 0; 6855 key.type = BTRFS_EXTENT_REF_KEY; 6856 6857 ret = btrfs_search_slot(trans, extent_root, &key, path, 0, 0); 6858 if (ret < 0) 6859 goto out; 6860 6861 leaf = path->nodes[0]; 6862 nritems = btrfs_header_nritems(leaf); 6863 if (path->slots[0] >= nritems) { 6864 ret = btrfs_next_leaf(extent_root, path); 6865 if (ret < 0) 6866 goto out; 6867 if (ret > 0) { 6868 /* the extent was freed by someone */ 6869 if (ref_path->lowest_level == level) 6870 goto out; 6871 btrfs_release_path(extent_root, path); 6872 goto walk_down; 6873 } 6874 leaf = path->nodes[0]; 6875 } 6876 6877 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 6878 if (found_key.objectid != bytenr || 6879 found_key.type != BTRFS_EXTENT_REF_KEY) { 6880 /* the extent was freed by someone */ 6881 if (ref_path->lowest_level == level) { 6882 ret = 1; 6883 goto out; 6884 } 6885 btrfs_release_path(extent_root, path); 6886 goto walk_down; 6887 } 6888 found: 6889 ref = btrfs_item_ptr(leaf, path->slots[0], 6890 struct btrfs_extent_ref); 6891 ref_objectid = btrfs_ref_objectid(leaf, ref); 6892 if (ref_objectid < BTRFS_FIRST_FREE_OBJECTID) { 6893 if (first_time) { 6894 level = (int)ref_objectid; 6895 BUG_ON(level >= BTRFS_MAX_LEVEL); 6896 ref_path->lowest_level = level; 6897 ref_path->current_level = level; 6898 ref_path->nodes[level] = bytenr; 6899 } else { 6900 WARN_ON(ref_objectid != level); 6901 } 6902 } else { 6903 WARN_ON(level != -1); 6904 } 6905 first_time = 0; 6906 6907 if (ref_path->lowest_level == level) { 6908 ref_path->owner_objectid = ref_objectid; 6909 ref_path->num_refs = 
btrfs_ref_num_refs(leaf, ref); 6910 } 6911 6912 /* 6913 * the block is tree root or the block isn't in reference 6914 * counted tree. 6915 */ 6916 if (found_key.objectid == found_key.offset || 6917 is_cowonly_root(btrfs_ref_root(leaf, ref))) { 6918 ref_path->root_objectid = btrfs_ref_root(leaf, ref); 6919 ref_path->root_generation = 6920 btrfs_ref_generation(leaf, ref); 6921 if (level < 0) { 6922 /* special reference from the tree log */ 6923 ref_path->nodes[0] = found_key.offset; 6924 ref_path->current_level = 0; 6925 } 6926 ret = 0; 6927 goto out; 6928 } 6929 6930 level++; 6931 BUG_ON(ref_path->nodes[level] != 0); 6932 ref_path->nodes[level] = found_key.offset; 6933 ref_path->current_level = level; 6934 6935 /* 6936 * the reference was created in the running transaction, 6937 * no need to continue walking up. 6938 */ 6939 if (btrfs_ref_generation(leaf, ref) == trans->transid) { 6940 ref_path->root_objectid = btrfs_ref_root(leaf, ref); 6941 ref_path->root_generation = 6942 btrfs_ref_generation(leaf, ref); 6943 ret = 0; 6944 goto out; 6945 } 6946 6947 btrfs_release_path(extent_root, path); 6948 cond_resched(); 6949 } 6950 /* reached max tree level, but no tree root found. */ 6951 BUG(); 6952 out: 6953 btrfs_free_path(path); 6954 return ret; 6955 } 6956 6957 static int btrfs_first_ref_path(struct btrfs_trans_handle *trans, 6958 struct btrfs_root *extent_root, 6959 struct btrfs_ref_path *ref_path, 6960 u64 extent_start) 6961 { 6962 memset(ref_path, 0, sizeof(*ref_path)); 6963 ref_path->extent_start = extent_start; 6964 6965 return __next_ref_path(trans, extent_root, ref_path, 1); 6966 } 6967 6968 static int btrfs_next_ref_path(struct btrfs_trans_handle *trans, 6969 struct btrfs_root *extent_root, 6970 struct btrfs_ref_path *ref_path) 6971 { 6972 return __next_ref_path(trans, extent_root, ref_path, 0); 6973 } 6974 6975 static noinline int get_new_locations(struct inode *reloc_inode, 6976 struct btrfs_key *extent_key, 6977 u64 offset, int no_fragment, 6978 struct disk_extent **extents, 6979 int *nr_extents) 6980 { 6981 struct btrfs_root *root = BTRFS_I(reloc_inode)->root; 6982 struct btrfs_path *path; 6983 struct btrfs_file_extent_item *fi; 6984 struct extent_buffer *leaf; 6985 struct disk_extent *exts = *extents; 6986 struct btrfs_key found_key; 6987 u64 cur_pos; 6988 u64 last_byte; 6989 u32 nritems; 6990 int nr = 0; 6991 int max = *nr_extents; 6992 int ret; 6993 6994 WARN_ON(!no_fragment && *extents); 6995 if (!exts) { 6996 max = 1; 6997 exts = kmalloc(sizeof(*exts) * max, GFP_NOFS); 6998 if (!exts) 6999 return -ENOMEM; 7000 } 7001 7002 path = btrfs_alloc_path(); 7003 if (!path) { 7004 if (exts != *extents) 7005 kfree(exts); 7006 return -ENOMEM; 7007 } 7008 7009 cur_pos = extent_key->objectid - offset; 7010 last_byte = extent_key->objectid + extent_key->offset; 7011 ret = btrfs_lookup_file_extent(NULL, root, path, reloc_inode->i_ino, 7012 cur_pos, 0); 7013 if (ret < 0) 7014 goto out; 7015 if (ret > 0) { 7016 ret = -ENOENT; 7017 goto out; 7018 } 7019 7020 while (1) { 7021 leaf = path->nodes[0]; 7022 nritems = btrfs_header_nritems(leaf); 7023 if (path->slots[0] >= nritems) { 7024 ret = btrfs_next_leaf(root, path); 7025 if (ret < 0) 7026 goto out; 7027 if (ret > 0) 7028 break; 7029 leaf = path->nodes[0]; 7030 } 7031 7032 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 7033 if (found_key.offset != cur_pos || 7034 found_key.type != BTRFS_EXTENT_DATA_KEY || 7035 found_key.objectid != reloc_inode->i_ino) 7036 break; 7037 7038 fi = btrfs_item_ptr(leaf, path->slots[0], 7039 struct 
btrfs_file_extent_item); 7040 if (btrfs_file_extent_type(leaf, fi) != 7041 BTRFS_FILE_EXTENT_REG || 7042 btrfs_file_extent_disk_bytenr(leaf, fi) == 0) 7043 break; 7044 7045 if (nr == max) { 7046 struct disk_extent *old = exts; 7047 max *= 2; 7048 exts = kzalloc(sizeof(*exts) * max, GFP_NOFS); 7049 if (!exts) { 7050 ret = -ENOMEM; 7051 goto out; 7052 } 7053 memcpy(exts, old, sizeof(*exts) * nr); 7054 if (old != *extents) 7055 kfree(old); 7056 } 7057 7058 exts[nr].disk_bytenr = 7059 btrfs_file_extent_disk_bytenr(leaf, fi); 7060 exts[nr].disk_num_bytes = 7061 btrfs_file_extent_disk_num_bytes(leaf, fi); 7062 exts[nr].offset = btrfs_file_extent_offset(leaf, fi); 7063 exts[nr].num_bytes = btrfs_file_extent_num_bytes(leaf, fi); 7064 exts[nr].ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi); 7065 exts[nr].compression = btrfs_file_extent_compression(leaf, fi); 7066 exts[nr].encryption = btrfs_file_extent_encryption(leaf, fi); 7067 exts[nr].other_encoding = btrfs_file_extent_other_encoding(leaf, 7068 fi); 7069 BUG_ON(exts[nr].offset > 0); 7070 BUG_ON(exts[nr].compression || exts[nr].encryption); 7071 BUG_ON(exts[nr].num_bytes != exts[nr].disk_num_bytes); 7072 7073 cur_pos += exts[nr].num_bytes; 7074 nr++; 7075 7076 if (cur_pos + offset >= last_byte) 7077 break; 7078 7079 if (no_fragment) { 7080 ret = 1; 7081 goto out; 7082 } 7083 path->slots[0]++; 7084 } 7085 7086 BUG_ON(cur_pos + offset > last_byte); 7087 if (cur_pos + offset < last_byte) { 7088 ret = -ENOENT; 7089 goto out; 7090 } 7091 ret = 0; 7092 out: 7093 btrfs_free_path(path); 7094 if (ret) { 7095 if (exts != *extents) 7096 kfree(exts); 7097 } else { 7098 *extents = exts; 7099 *nr_extents = nr; 7100 } 7101 return ret; 7102 } 7103 7104 static noinline int replace_one_extent(struct btrfs_trans_handle *trans, 7105 struct btrfs_root *root, 7106 struct btrfs_path *path, 7107 struct btrfs_key *extent_key, 7108 struct btrfs_key *leaf_key, 7109 struct btrfs_ref_path *ref_path, 7110 struct disk_extent *new_extents, 7111 int nr_extents) 7112 { 7113 struct extent_buffer *leaf; 7114 struct btrfs_file_extent_item *fi; 7115 struct inode *inode = NULL; 7116 struct btrfs_key key; 7117 u64 lock_start = 0; 7118 u64 lock_end = 0; 7119 u64 num_bytes; 7120 u64 ext_offset; 7121 u64 search_end = (u64)-1; 7122 u32 nritems; 7123 int nr_scaned = 0; 7124 int extent_locked = 0; 7125 int extent_type; 7126 int ret; 7127 7128 memcpy(&key, leaf_key, sizeof(key)); 7129 if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS) { 7130 if (key.objectid < ref_path->owner_objectid || 7131 (key.objectid == ref_path->owner_objectid && 7132 key.type < BTRFS_EXTENT_DATA_KEY)) { 7133 key.objectid = ref_path->owner_objectid; 7134 key.type = BTRFS_EXTENT_DATA_KEY; 7135 key.offset = 0; 7136 } 7137 } 7138 7139 while (1) { 7140 ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 7141 if (ret < 0) 7142 goto out; 7143 7144 leaf = path->nodes[0]; 7145 nritems = btrfs_header_nritems(leaf); 7146 next: 7147 if (extent_locked && ret > 0) { 7148 /* 7149 * the file extent item was modified by someone 7150 * before the extent got locked. 
7151 */ 7152 unlock_extent(&BTRFS_I(inode)->io_tree, lock_start, 7153 lock_end, GFP_NOFS); 7154 extent_locked = 0; 7155 } 7156 7157 if (path->slots[0] >= nritems) { 7158 if (++nr_scaned > 2) 7159 break; 7160 7161 BUG_ON(extent_locked); 7162 ret = btrfs_next_leaf(root, path); 7163 if (ret < 0) 7164 goto out; 7165 if (ret > 0) 7166 break; 7167 leaf = path->nodes[0]; 7168 nritems = btrfs_header_nritems(leaf); 7169 } 7170 7171 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 7172 7173 if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS) { 7174 if ((key.objectid > ref_path->owner_objectid) || 7175 (key.objectid == ref_path->owner_objectid && 7176 key.type > BTRFS_EXTENT_DATA_KEY) || 7177 key.offset >= search_end) 7178 break; 7179 } 7180 7181 if (inode && key.objectid != inode->i_ino) { 7182 BUG_ON(extent_locked); 7183 btrfs_release_path(root, path); 7184 mutex_unlock(&inode->i_mutex); 7185 iput(inode); 7186 inode = NULL; 7187 continue; 7188 } 7189 7190 if (key.type != BTRFS_EXTENT_DATA_KEY) { 7191 path->slots[0]++; 7192 ret = 1; 7193 goto next; 7194 } 7195 fi = btrfs_item_ptr(leaf, path->slots[0], 7196 struct btrfs_file_extent_item); 7197 extent_type = btrfs_file_extent_type(leaf, fi); 7198 if ((extent_type != BTRFS_FILE_EXTENT_REG && 7199 extent_type != BTRFS_FILE_EXTENT_PREALLOC) || 7200 (btrfs_file_extent_disk_bytenr(leaf, fi) != 7201 extent_key->objectid)) { 7202 path->slots[0]++; 7203 ret = 1; 7204 goto next; 7205 } 7206 7207 num_bytes = btrfs_file_extent_num_bytes(leaf, fi); 7208 ext_offset = btrfs_file_extent_offset(leaf, fi); 7209 7210 if (search_end == (u64)-1) { 7211 search_end = key.offset - ext_offset + 7212 btrfs_file_extent_ram_bytes(leaf, fi); 7213 } 7214 7215 if (!extent_locked) { 7216 lock_start = key.offset; 7217 lock_end = lock_start + num_bytes - 1; 7218 } else { 7219 if (lock_start > key.offset || 7220 lock_end + 1 < key.offset + num_bytes) { 7221 unlock_extent(&BTRFS_I(inode)->io_tree, 7222 lock_start, lock_end, GFP_NOFS); 7223 extent_locked = 0; 7224 } 7225 } 7226 7227 if (!inode) { 7228 btrfs_release_path(root, path); 7229 7230 inode = btrfs_iget_locked(root->fs_info->sb, 7231 key.objectid, root); 7232 if (inode->i_state & I_NEW) { 7233 BTRFS_I(inode)->root = root; 7234 BTRFS_I(inode)->location.objectid = 7235 key.objectid; 7236 BTRFS_I(inode)->location.type = 7237 BTRFS_INODE_ITEM_KEY; 7238 BTRFS_I(inode)->location.offset = 0; 7239 btrfs_read_locked_inode(inode); 7240 unlock_new_inode(inode); 7241 } 7242 /* 7243 * some code call btrfs_commit_transaction while 7244 * holding the i_mutex, so we can't use mutex_lock 7245 * here. 
7246 */ 7247 if (is_bad_inode(inode) || 7248 !mutex_trylock(&inode->i_mutex)) { 7249 iput(inode); 7250 inode = NULL; 7251 key.offset = (u64)-1; 7252 goto skip; 7253 } 7254 } 7255 7256 if (!extent_locked) { 7257 struct btrfs_ordered_extent *ordered; 7258 7259 btrfs_release_path(root, path); 7260 7261 lock_extent(&BTRFS_I(inode)->io_tree, lock_start, 7262 lock_end, GFP_NOFS); 7263 ordered = btrfs_lookup_first_ordered_extent(inode, 7264 lock_end); 7265 if (ordered && 7266 ordered->file_offset <= lock_end && 7267 ordered->file_offset + ordered->len > lock_start) { 7268 unlock_extent(&BTRFS_I(inode)->io_tree, 7269 lock_start, lock_end, GFP_NOFS); 7270 btrfs_start_ordered_extent(inode, ordered, 1); 7271 btrfs_put_ordered_extent(ordered); 7272 key.offset += num_bytes; 7273 goto skip; 7274 } 7275 if (ordered) 7276 btrfs_put_ordered_extent(ordered); 7277 7278 extent_locked = 1; 7279 continue; 7280 } 7281 7282 if (nr_extents == 1) { 7283 /* update extent pointer in place */ 7284 btrfs_set_file_extent_disk_bytenr(leaf, fi, 7285 new_extents[0].disk_bytenr); 7286 btrfs_set_file_extent_disk_num_bytes(leaf, fi, 7287 new_extents[0].disk_num_bytes); 7288 btrfs_mark_buffer_dirty(leaf); 7289 7290 btrfs_drop_extent_cache(inode, key.offset, 7291 key.offset + num_bytes - 1, 0); 7292 7293 ret = btrfs_inc_extent_ref(trans, root, 7294 new_extents[0].disk_bytenr, 7295 new_extents[0].disk_num_bytes, 7296 leaf->start, 7297 root->root_key.objectid, 7298 trans->transid, 7299 key.objectid); 7300 BUG_ON(ret); 7301 7302 ret = btrfs_free_extent(trans, root, 7303 extent_key->objectid, 7304 extent_key->offset, 7305 leaf->start, 7306 btrfs_header_owner(leaf), 7307 btrfs_header_generation(leaf), 7308 key.objectid, 0); 7309 BUG_ON(ret); 7310 7311 btrfs_release_path(root, path); 7312 key.offset += num_bytes; 7313 } else { 7314 BUG_ON(1); 7315 #if 0 7316 u64 alloc_hint; 7317 u64 extent_len; 7318 int i; 7319 /* 7320 * drop old extent pointer at first, then insert the 7321 * new pointers one bye one 7322 */ 7323 btrfs_release_path(root, path); 7324 ret = btrfs_drop_extents(trans, root, inode, key.offset, 7325 key.offset + num_bytes, 7326 key.offset, &alloc_hint); 7327 BUG_ON(ret); 7328 7329 for (i = 0; i < nr_extents; i++) { 7330 if (ext_offset >= new_extents[i].num_bytes) { 7331 ext_offset -= new_extents[i].num_bytes; 7332 continue; 7333 } 7334 extent_len = min(new_extents[i].num_bytes - 7335 ext_offset, num_bytes); 7336 7337 ret = btrfs_insert_empty_item(trans, root, 7338 path, &key, 7339 sizeof(*fi)); 7340 BUG_ON(ret); 7341 7342 leaf = path->nodes[0]; 7343 fi = btrfs_item_ptr(leaf, path->slots[0], 7344 struct btrfs_file_extent_item); 7345 btrfs_set_file_extent_generation(leaf, fi, 7346 trans->transid); 7347 btrfs_set_file_extent_type(leaf, fi, 7348 BTRFS_FILE_EXTENT_REG); 7349 btrfs_set_file_extent_disk_bytenr(leaf, fi, 7350 new_extents[i].disk_bytenr); 7351 btrfs_set_file_extent_disk_num_bytes(leaf, fi, 7352 new_extents[i].disk_num_bytes); 7353 btrfs_set_file_extent_ram_bytes(leaf, fi, 7354 new_extents[i].ram_bytes); 7355 7356 btrfs_set_file_extent_compression(leaf, fi, 7357 new_extents[i].compression); 7358 btrfs_set_file_extent_encryption(leaf, fi, 7359 new_extents[i].encryption); 7360 btrfs_set_file_extent_other_encoding(leaf, fi, 7361 new_extents[i].other_encoding); 7362 7363 btrfs_set_file_extent_num_bytes(leaf, fi, 7364 extent_len); 7365 ext_offset += new_extents[i].offset; 7366 btrfs_set_file_extent_offset(leaf, fi, 7367 ext_offset); 7368 btrfs_mark_buffer_dirty(leaf); 7369 7370 btrfs_drop_extent_cache(inode, key.offset, 
7371 key.offset + extent_len - 1, 0); 7372 7373 ret = btrfs_inc_extent_ref(trans, root, 7374 new_extents[i].disk_bytenr, 7375 new_extents[i].disk_num_bytes, 7376 leaf->start, 7377 root->root_key.objectid, 7378 trans->transid, key.objectid); 7379 BUG_ON(ret); 7380 btrfs_release_path(root, path); 7381 7382 inode_add_bytes(inode, extent_len); 7383 7384 ext_offset = 0; 7385 num_bytes -= extent_len; 7386 key.offset += extent_len; 7387 7388 if (num_bytes == 0) 7389 break; 7390 } 7391 BUG_ON(i >= nr_extents); 7392 #endif 7393 } 7394 7395 if (extent_locked) { 7396 unlock_extent(&BTRFS_I(inode)->io_tree, lock_start, 7397 lock_end, GFP_NOFS); 7398 extent_locked = 0; 7399 } 7400 skip: 7401 if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS && 7402 key.offset >= search_end) 7403 break; 7404 7405 cond_resched(); 7406 } 7407 ret = 0; 7408 out: 7409 btrfs_release_path(root, path); 7410 if (inode) { 7411 mutex_unlock(&inode->i_mutex); 7412 if (extent_locked) { 7413 unlock_extent(&BTRFS_I(inode)->io_tree, lock_start, 7414 lock_end, GFP_NOFS); 7415 } 7416 iput(inode); 7417 } 7418 return ret; 7419 } 7420 7421 int btrfs_reloc_tree_cache_ref(struct btrfs_trans_handle *trans, 7422 struct btrfs_root *root, 7423 struct extent_buffer *buf, u64 orig_start) 7424 { 7425 int level; 7426 int ret; 7427 7428 BUG_ON(btrfs_header_generation(buf) != trans->transid); 7429 BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID); 7430 7431 level = btrfs_header_level(buf); 7432 if (level == 0) { 7433 struct btrfs_leaf_ref *ref; 7434 struct btrfs_leaf_ref *orig_ref; 7435 7436 orig_ref = btrfs_lookup_leaf_ref(root, orig_start); 7437 if (!orig_ref) 7438 return -ENOENT; 7439 7440 ref = btrfs_alloc_leaf_ref(root, orig_ref->nritems); 7441 if (!ref) { 7442 btrfs_free_leaf_ref(root, orig_ref); 7443 return -ENOMEM; 7444 } 7445 7446 ref->nritems = orig_ref->nritems; 7447 memcpy(ref->extents, orig_ref->extents, 7448 sizeof(ref->extents[0]) * ref->nritems); 7449 7450 btrfs_free_leaf_ref(root, orig_ref); 7451 7452 ref->root_gen = trans->transid; 7453 ref->bytenr = buf->start; 7454 ref->owner = btrfs_header_owner(buf); 7455 ref->generation = btrfs_header_generation(buf); 7456 7457 ret = btrfs_add_leaf_ref(root, ref, 0); 7458 WARN_ON(ret); 7459 btrfs_free_leaf_ref(root, ref); 7460 } 7461 return 0; 7462 } 7463 7464 static noinline int invalidate_extent_cache(struct btrfs_root *root, 7465 struct extent_buffer *leaf, 7466 struct btrfs_block_group_cache *group, 7467 struct btrfs_root *target_root) 7468 { 7469 struct btrfs_key key; 7470 struct inode *inode = NULL; 7471 struct btrfs_file_extent_item *fi; 7472 struct extent_state *cached_state = NULL; 7473 u64 num_bytes; 7474 u64 skip_objectid = 0; 7475 u32 nritems; 7476 u32 i; 7477 7478 nritems = btrfs_header_nritems(leaf); 7479 for (i = 0; i < nritems; i++) { 7480 btrfs_item_key_to_cpu(leaf, &key, i); 7481 if (key.objectid == skip_objectid || 7482 key.type != BTRFS_EXTENT_DATA_KEY) 7483 continue; 7484 fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item); 7485 if (btrfs_file_extent_type(leaf, fi) == 7486 BTRFS_FILE_EXTENT_INLINE) 7487 continue; 7488 if (btrfs_file_extent_disk_bytenr(leaf, fi) == 0) 7489 continue; 7490 if (!inode || inode->i_ino != key.objectid) { 7491 iput(inode); 7492 inode = btrfs_ilookup(target_root->fs_info->sb, 7493 key.objectid, target_root, 1); 7494 } 7495 if (!inode) { 7496 skip_objectid = key.objectid; 7497 continue; 7498 } 7499 num_bytes = btrfs_file_extent_num_bytes(leaf, fi); 7500 7501 lock_extent_bits(&BTRFS_I(inode)->io_tree, key.offset, 7502 
key.offset + num_bytes - 1, 0, &cached_state, 7503 GFP_NOFS); 7504 btrfs_drop_extent_cache(inode, key.offset, 7505 key.offset + num_bytes - 1, 1); 7506 unlock_extent_cached(&BTRFS_I(inode)->io_tree, key.offset, 7507 key.offset + num_bytes - 1, &cached_state, 7508 GFP_NOFS); 7509 cond_resched(); 7510 } 7511 iput(inode); 7512 return 0; 7513 } 7514 7515 static noinline int replace_extents_in_leaf(struct btrfs_trans_handle *trans, 7516 struct btrfs_root *root, 7517 struct extent_buffer *leaf, 7518 struct btrfs_block_group_cache *group, 7519 struct inode *reloc_inode) 7520 { 7521 struct btrfs_key key; 7522 struct btrfs_key extent_key; 7523 struct btrfs_file_extent_item *fi; 7524 struct btrfs_leaf_ref *ref; 7525 struct disk_extent *new_extent; 7526 u64 bytenr; 7527 u64 num_bytes; 7528 u32 nritems; 7529 u32 i; 7530 int ext_index; 7531 int nr_extent; 7532 int ret; 7533 7534 new_extent = kmalloc(sizeof(*new_extent), GFP_NOFS); 7535 if (!new_extent) 7536 return -ENOMEM; 7537 7538 ref = btrfs_lookup_leaf_ref(root, leaf->start); 7539 BUG_ON(!ref); 7540 7541 ext_index = -1; 7542 nritems = btrfs_header_nritems(leaf); 7543 for (i = 0; i < nritems; i++) { 7544 btrfs_item_key_to_cpu(leaf, &key, i); 7545 if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY) 7546 continue; 7547 fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item); 7548 if (btrfs_file_extent_type(leaf, fi) == 7549 BTRFS_FILE_EXTENT_INLINE) 7550 continue; 7551 bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); 7552 num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); 7553 if (bytenr == 0) 7554 continue; 7555 7556 ext_index++; 7557 if (bytenr >= group->key.objectid + group->key.offset || 7558 bytenr + num_bytes <= group->key.objectid) 7559 continue; 7560 7561 extent_key.objectid = bytenr; 7562 extent_key.offset = num_bytes; 7563 extent_key.type = BTRFS_EXTENT_ITEM_KEY; 7564 nr_extent = 1; 7565 ret = get_new_locations(reloc_inode, &extent_key, 7566 group->key.objectid, 1, 7567 &new_extent, &nr_extent); 7568 if (ret > 0) 7569 continue; 7570 BUG_ON(ret < 0); 7571 7572 BUG_ON(ref->extents[ext_index].bytenr != bytenr); 7573 BUG_ON(ref->extents[ext_index].num_bytes != num_bytes); 7574 ref->extents[ext_index].bytenr = new_extent->disk_bytenr; 7575 ref->extents[ext_index].num_bytes = new_extent->disk_num_bytes; 7576 7577 btrfs_set_file_extent_disk_bytenr(leaf, fi, 7578 new_extent->disk_bytenr); 7579 btrfs_set_file_extent_disk_num_bytes(leaf, fi, 7580 new_extent->disk_num_bytes); 7581 btrfs_mark_buffer_dirty(leaf); 7582 7583 ret = btrfs_inc_extent_ref(trans, root, 7584 new_extent->disk_bytenr, 7585 new_extent->disk_num_bytes, 7586 leaf->start, 7587 root->root_key.objectid, 7588 trans->transid, key.objectid); 7589 BUG_ON(ret); 7590 7591 ret = btrfs_free_extent(trans, root, 7592 bytenr, num_bytes, leaf->start, 7593 btrfs_header_owner(leaf), 7594 btrfs_header_generation(leaf), 7595 key.objectid, 0); 7596 BUG_ON(ret); 7597 cond_resched(); 7598 } 7599 kfree(new_extent); 7600 BUG_ON(ext_index + 1 != ref->nritems); 7601 btrfs_free_leaf_ref(root, ref); 7602 return 0; 7603 } 7604 7605 int btrfs_free_reloc_root(struct btrfs_trans_handle *trans, 7606 struct btrfs_root *root) 7607 { 7608 struct btrfs_root *reloc_root; 7609 int ret; 7610 7611 if (root->reloc_root) { 7612 reloc_root = root->reloc_root; 7613 root->reloc_root = NULL; 7614 list_add(&reloc_root->dead_list, 7615 &root->fs_info->dead_reloc_roots); 7616 7617 btrfs_set_root_bytenr(&reloc_root->root_item, 7618 reloc_root->node->start); 7619 btrfs_set_root_level(&root->root_item, 7620 
btrfs_header_level(reloc_root->node)); 7621 memset(&reloc_root->root_item.drop_progress, 0, 7622 sizeof(struct btrfs_disk_key)); 7623 reloc_root->root_item.drop_level = 0; 7624 7625 ret = btrfs_update_root(trans, root->fs_info->tree_root, 7626 &reloc_root->root_key, 7627 &reloc_root->root_item); 7628 BUG_ON(ret); 7629 } 7630 return 0; 7631 } 7632 7633 int btrfs_drop_dead_reloc_roots(struct btrfs_root *root) 7634 { 7635 struct btrfs_trans_handle *trans; 7636 struct btrfs_root *reloc_root; 7637 struct btrfs_root *prev_root = NULL; 7638 struct list_head dead_roots; 7639 int ret; 7640 unsigned long nr; 7641 7642 INIT_LIST_HEAD(&dead_roots); 7643 list_splice_init(&root->fs_info->dead_reloc_roots, &dead_roots); 7644 7645 while (!list_empty(&dead_roots)) { 7646 reloc_root = list_entry(dead_roots.prev, 7647 struct btrfs_root, dead_list); 7648 list_del_init(&reloc_root->dead_list); 7649 7650 BUG_ON(reloc_root->commit_root != NULL); 7651 while (1) { 7652 trans = btrfs_join_transaction(root, 1); 7653 BUG_ON(IS_ERR(trans)); 7654 7655 mutex_lock(&root->fs_info->drop_mutex); 7656 ret = btrfs_drop_snapshot(trans, reloc_root); 7657 if (ret != -EAGAIN) 7658 break; 7659 mutex_unlock(&root->fs_info->drop_mutex); 7660 7661 nr = trans->blocks_used; 7662 ret = btrfs_end_transaction(trans, root); 7663 BUG_ON(ret); 7664 btrfs_btree_balance_dirty(root, nr); 7665 } 7666 7667 free_extent_buffer(reloc_root->node); 7668 7669 ret = btrfs_del_root(trans, root->fs_info->tree_root, 7670 &reloc_root->root_key); 7671 BUG_ON(ret); 7672 mutex_unlock(&root->fs_info->drop_mutex); 7673 7674 nr = trans->blocks_used; 7675 ret = btrfs_end_transaction(trans, root); 7676 BUG_ON(ret); 7677 btrfs_btree_balance_dirty(root, nr); 7678 7679 kfree(prev_root); 7680 prev_root = reloc_root; 7681 } 7682 if (prev_root) { 7683 btrfs_remove_leaf_refs(prev_root, (u64)-1, 0); 7684 kfree(prev_root); 7685 } 7686 return 0; 7687 } 7688 7689 int btrfs_add_dead_reloc_root(struct btrfs_root *root) 7690 { 7691 list_add(&root->dead_list, &root->fs_info->dead_reloc_roots); 7692 return 0; 7693 } 7694 7695 int btrfs_cleanup_reloc_trees(struct btrfs_root *root) 7696 { 7697 struct btrfs_root *reloc_root; 7698 struct btrfs_trans_handle *trans; 7699 struct btrfs_key location; 7700 int found; 7701 int ret; 7702 7703 mutex_lock(&root->fs_info->tree_reloc_mutex); 7704 ret = btrfs_find_dead_roots(root, BTRFS_TREE_RELOC_OBJECTID, NULL); 7705 BUG_ON(ret); 7706 found = !list_empty(&root->fs_info->dead_reloc_roots); 7707 mutex_unlock(&root->fs_info->tree_reloc_mutex); 7708 7709 if (found) { 7710 trans = btrfs_start_transaction(root, 1); 7711 BUG_ON(IS_ERR(trans)); 7712 ret = btrfs_commit_transaction(trans, root); 7713 BUG_ON(ret); 7714 } 7715 7716 location.objectid = BTRFS_DATA_RELOC_TREE_OBJECTID; 7717 location.offset = (u64)-1; 7718 location.type = BTRFS_ROOT_ITEM_KEY; 7719 7720 reloc_root = btrfs_read_fs_root_no_name(root->fs_info, &location); 7721 BUG_ON(!reloc_root); 7722 ret = btrfs_orphan_cleanup(reloc_root); 7723 BUG_ON(ret); 7724 return 0; 7725 } 7726 7727 static noinline int init_reloc_tree(struct btrfs_trans_handle *trans, 7728 struct btrfs_root *root) 7729 { 7730 struct btrfs_root *reloc_root; 7731 struct extent_buffer *eb; 7732 struct btrfs_root_item *root_item; 7733 struct btrfs_key root_key; 7734 int ret; 7735 7736 BUG_ON(!root->ref_cows); 7737 if (root->reloc_root) 7738 return 0; 7739 7740 root_item = kmalloc(sizeof(*root_item), GFP_NOFS); 7741 if (!root_item) 7742 return -ENOMEM; 7743 7744 ret = btrfs_copy_root(trans, root, root->commit_root, 7745 &eb, 
BTRFS_TREE_RELOC_OBJECTID); 7746 BUG_ON(ret); 7747 7748 root_key.objectid = BTRFS_TREE_RELOC_OBJECTID; 7749 root_key.offset = root->root_key.objectid; 7750 root_key.type = BTRFS_ROOT_ITEM_KEY; 7751 7752 memcpy(root_item, &root->root_item, sizeof(root_item)); 7753 btrfs_set_root_refs(root_item, 0); 7754 btrfs_set_root_bytenr(root_item, eb->start); 7755 btrfs_set_root_level(root_item, btrfs_header_level(eb)); 7756 btrfs_set_root_generation(root_item, trans->transid); 7757 7758 btrfs_tree_unlock(eb); 7759 free_extent_buffer(eb); 7760 7761 ret = btrfs_insert_root(trans, root->fs_info->tree_root, 7762 &root_key, root_item); 7763 BUG_ON(ret); 7764 kfree(root_item); 7765 7766 reloc_root = btrfs_read_fs_root_no_radix(root->fs_info->tree_root, 7767 &root_key); 7768 BUG_ON(IS_ERR(reloc_root)); 7769 reloc_root->last_trans = trans->transid; 7770 reloc_root->commit_root = NULL; 7771 reloc_root->ref_tree = &root->fs_info->reloc_ref_tree; 7772 7773 root->reloc_root = reloc_root; 7774 return 0; 7775 } 7776 7777 /* 7778 * Core function of space balance. 7779 * 7780 * The idea is using reloc trees to relocate tree blocks in reference 7781 * counted roots. There is one reloc tree for each subvol, and all 7782 * reloc trees share same root key objectid. Reloc trees are snapshots 7783 * of the latest committed roots of subvols (root->commit_root). 7784 * 7785 * To relocate a tree block referenced by a subvol, there are two steps. 7786 * COW the block through subvol's reloc tree, then update block pointer 7787 * in the subvol to point to the new block. Since all reloc trees share 7788 * same root key objectid, doing special handing for tree blocks owned 7789 * by them is easy. Once a tree block has been COWed in one reloc tree, 7790 * we can use the resulting new block directly when the same block is 7791 * required to COW again through other reloc trees. By this way, relocated 7792 * tree blocks are shared between reloc trees, so they are also shared 7793 * between subvols. 
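 * In the code below, init_reloc_tree() creates the per-subvol reloc tree as
 * a snapshot of root->commit_root, and btrfs_merge_path() is what swaps the
 * blocks COWed in the reloc tree back into the fs tree.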
7794 */ 7795 static noinline int relocate_one_path(struct btrfs_trans_handle *trans, 7796 struct btrfs_root *root, 7797 struct btrfs_path *path, 7798 struct btrfs_key *first_key, 7799 struct btrfs_ref_path *ref_path, 7800 struct btrfs_block_group_cache *group, 7801 struct inode *reloc_inode) 7802 { 7803 struct btrfs_root *reloc_root; 7804 struct extent_buffer *eb = NULL; 7805 struct btrfs_key *keys; 7806 u64 *nodes; 7807 int level; 7808 int shared_level; 7809 int lowest_level = 0; 7810 int ret; 7811 7812 if (ref_path->owner_objectid < BTRFS_FIRST_FREE_OBJECTID) 7813 lowest_level = ref_path->owner_objectid; 7814 7815 if (!root->ref_cows) { 7816 path->lowest_level = lowest_level; 7817 ret = btrfs_search_slot(trans, root, first_key, path, 0, 1); 7818 BUG_ON(ret < 0); 7819 path->lowest_level = 0; 7820 btrfs_release_path(root, path); 7821 return 0; 7822 } 7823 7824 mutex_lock(&root->fs_info->tree_reloc_mutex); 7825 ret = init_reloc_tree(trans, root); 7826 BUG_ON(ret); 7827 reloc_root = root->reloc_root; 7828 7829 shared_level = ref_path->shared_level; 7830 ref_path->shared_level = BTRFS_MAX_LEVEL - 1; 7831 7832 keys = ref_path->node_keys; 7833 nodes = ref_path->new_nodes; 7834 memset(&keys[shared_level + 1], 0, 7835 sizeof(*keys) * (BTRFS_MAX_LEVEL - shared_level - 1)); 7836 memset(&nodes[shared_level + 1], 0, 7837 sizeof(*nodes) * (BTRFS_MAX_LEVEL - shared_level - 1)); 7838 7839 if (nodes[lowest_level] == 0) { 7840 path->lowest_level = lowest_level; 7841 ret = btrfs_search_slot(trans, reloc_root, first_key, path, 7842 0, 1); 7843 BUG_ON(ret); 7844 for (level = lowest_level; level < BTRFS_MAX_LEVEL; level++) { 7845 eb = path->nodes[level]; 7846 if (!eb || eb == reloc_root->node) 7847 break; 7848 nodes[level] = eb->start; 7849 if (level == 0) 7850 btrfs_item_key_to_cpu(eb, &keys[level], 0); 7851 else 7852 btrfs_node_key_to_cpu(eb, &keys[level], 0); 7853 } 7854 if (nodes[0] && 7855 ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) { 7856 eb = path->nodes[0]; 7857 ret = replace_extents_in_leaf(trans, reloc_root, eb, 7858 group, reloc_inode); 7859 BUG_ON(ret); 7860 } 7861 btrfs_release_path(reloc_root, path); 7862 } else { 7863 ret = btrfs_merge_path(trans, reloc_root, keys, nodes, 7864 lowest_level); 7865 BUG_ON(ret); 7866 } 7867 7868 /* 7869 * replace tree blocks in the fs tree with tree blocks in 7870 * the reloc tree. 
7871 */ 7872 ret = btrfs_merge_path(trans, root, keys, nodes, lowest_level); 7873 BUG_ON(ret < 0); 7874 7875 if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) { 7876 ret = btrfs_search_slot(trans, reloc_root, first_key, path, 7877 0, 0); 7878 BUG_ON(ret); 7879 extent_buffer_get(path->nodes[0]); 7880 eb = path->nodes[0]; 7881 btrfs_release_path(reloc_root, path); 7882 ret = invalidate_extent_cache(reloc_root, eb, group, root); 7883 BUG_ON(ret); 7884 free_extent_buffer(eb); 7885 } 7886 7887 mutex_unlock(&root->fs_info->tree_reloc_mutex); 7888 path->lowest_level = 0; 7889 return 0; 7890 } 7891 7892 static noinline int relocate_tree_block(struct btrfs_trans_handle *trans, 7893 struct btrfs_root *root, 7894 struct btrfs_path *path, 7895 struct btrfs_key *first_key, 7896 struct btrfs_ref_path *ref_path) 7897 { 7898 int ret; 7899 7900 ret = relocate_one_path(trans, root, path, first_key, 7901 ref_path, NULL, NULL); 7902 BUG_ON(ret); 7903 7904 return 0; 7905 } 7906 7907 static noinline int del_extent_zero(struct btrfs_trans_handle *trans, 7908 struct btrfs_root *extent_root, 7909 struct btrfs_path *path, 7910 struct btrfs_key *extent_key) 7911 { 7912 int ret; 7913 7914 ret = btrfs_search_slot(trans, extent_root, extent_key, path, -1, 1); 7915 if (ret) 7916 goto out; 7917 ret = btrfs_del_item(trans, extent_root, path); 7918 out: 7919 btrfs_release_path(extent_root, path); 7920 return ret; 7921 } 7922 7923 static noinline struct btrfs_root *read_ref_root(struct btrfs_fs_info *fs_info, 7924 struct btrfs_ref_path *ref_path) 7925 { 7926 struct btrfs_key root_key; 7927 7928 root_key.objectid = ref_path->root_objectid; 7929 root_key.type = BTRFS_ROOT_ITEM_KEY; 7930 if (is_cowonly_root(ref_path->root_objectid)) 7931 root_key.offset = 0; 7932 else 7933 root_key.offset = (u64)-1; 7934 7935 return btrfs_read_fs_root_no_name(fs_info, &root_key); 7936 } 7937 7938 static noinline int relocate_one_extent(struct btrfs_root *extent_root, 7939 struct btrfs_path *path, 7940 struct btrfs_key *extent_key, 7941 struct btrfs_block_group_cache *group, 7942 struct inode *reloc_inode, int pass) 7943 { 7944 struct btrfs_trans_handle *trans; 7945 struct btrfs_root *found_root; 7946 struct btrfs_ref_path *ref_path = NULL; 7947 struct disk_extent *new_extents = NULL; 7948 int nr_extents = 0; 7949 int loops; 7950 int ret; 7951 int level; 7952 struct btrfs_key first_key; 7953 u64 prev_block = 0; 7954 7955 7956 trans = btrfs_start_transaction(extent_root, 1); 7957 BUG_ON(IS_ERR(trans)); 7958 7959 if (extent_key->objectid == 0) { 7960 ret = del_extent_zero(trans, extent_root, path, extent_key); 7961 goto out; 7962 } 7963 7964 ref_path = kmalloc(sizeof(*ref_path), GFP_NOFS); 7965 if (!ref_path) { 7966 ret = -ENOMEM; 7967 goto out; 7968 } 7969 7970 for (loops = 0; ; loops++) { 7971 if (loops == 0) { 7972 ret = btrfs_first_ref_path(trans, extent_root, ref_path, 7973 extent_key->objectid); 7974 } else { 7975 ret = btrfs_next_ref_path(trans, extent_root, ref_path); 7976 } 7977 if (ret < 0) 7978 goto out; 7979 if (ret > 0) 7980 break; 7981 7982 if (ref_path->root_objectid == BTRFS_TREE_LOG_OBJECTID || 7983 ref_path->root_objectid == BTRFS_TREE_RELOC_OBJECTID) 7984 continue; 7985 7986 found_root = read_ref_root(extent_root->fs_info, ref_path); 7987 BUG_ON(!found_root); 7988 /* 7989 * for reference counted tree, only process reference paths 7990 * rooted at the latest committed root. 
7991 */ 7992 if (found_root->ref_cows && 7993 ref_path->root_generation != found_root->root_key.offset) 7994 continue; 7995 7996 if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) { 7997 if (pass == 0) { 7998 /* 7999 * copy data extents to new locations 8000 */ 8001 u64 group_start = group->key.objectid; 8002 ret = relocate_data_extent(reloc_inode, 8003 extent_key, 8004 group_start); 8005 if (ret < 0) 8006 goto out; 8007 break; 8008 } 8009 level = 0; 8010 } else { 8011 level = ref_path->owner_objectid; 8012 } 8013 8014 if (prev_block != ref_path->nodes[level]) { 8015 struct extent_buffer *eb; 8016 u64 block_start = ref_path->nodes[level]; 8017 u64 block_size = btrfs_level_size(found_root, level); 8018 8019 eb = read_tree_block(found_root, block_start, 8020 block_size, 0); 8021 if (!eb) { 8022 ret = -EIO; 8023 goto out; 8024 } 8025 btrfs_tree_lock(eb); 8026 BUG_ON(level != btrfs_header_level(eb)); 8027 8028 if (level == 0) 8029 btrfs_item_key_to_cpu(eb, &first_key, 0); 8030 else 8031 btrfs_node_key_to_cpu(eb, &first_key, 0); 8032 8033 btrfs_tree_unlock(eb); 8034 free_extent_buffer(eb); 8035 prev_block = block_start; 8036 } 8037 8038 mutex_lock(&extent_root->fs_info->trans_mutex); 8039 btrfs_record_root_in_trans(found_root); 8040 mutex_unlock(&extent_root->fs_info->trans_mutex); 8041 if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) { 8042 /* 8043 * try to update data extent references while 8044 * keeping metadata shared between snapshots. 8045 */ 8046 if (pass == 1) { 8047 ret = relocate_one_path(trans, found_root, 8048 path, &first_key, ref_path, 8049 group, reloc_inode); 8050 if (ret < 0) 8051 goto out; 8052 continue; 8053 } 8054 /* 8055 * use fallback method to process the remaining 8056 * references. 8057 */ 8058 if (!new_extents) { 8059 u64 group_start = group->key.objectid; 8060 new_extents = kmalloc(sizeof(*new_extents), 8061 GFP_NOFS); 8062 if (!new_extents) { 8063 ret = -ENOMEM; 8064 goto out; 8065 } 8066 nr_extents = 1; 8067 ret = get_new_locations(reloc_inode, 8068 extent_key, 8069 group_start, 1, 8070 &new_extents, 8071 &nr_extents); 8072 if (ret) 8073 goto out; 8074 } 8075 ret = replace_one_extent(trans, found_root, 8076 path, extent_key, 8077 &first_key, ref_path, 8078 new_extents, nr_extents); 8079 } else { 8080 ret = relocate_tree_block(trans, found_root, path, 8081 &first_key, ref_path); 8082 } 8083 if (ret < 0) 8084 goto out; 8085 } 8086 ret = 0; 8087 out: 8088 btrfs_end_transaction(trans, extent_root); 8089 kfree(new_extents); 8090 kfree(ref_path); 8091 return ret; 8092 } 8093 #endif 8094 8095 static u64 update_block_group_flags(struct btrfs_root *root, u64 flags) 8096 { 8097 u64 num_devices; 8098 u64 stripped = BTRFS_BLOCK_GROUP_RAID0 | 8099 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10; 8100 8101 /* 8102 * we add in the count of missing devices because we want 8103 * to make sure that any RAID levels on a degraded FS 8104 * continue to be honored. 
8105 */ 8106 num_devices = root->fs_info->fs_devices->rw_devices + 8107 root->fs_info->fs_devices->missing_devices; 8108 8109 if (num_devices == 1) { 8110 stripped |= BTRFS_BLOCK_GROUP_DUP; 8111 stripped = flags & ~stripped; 8112 8113 /* turn raid0 into single device chunks */ 8114 if (flags & BTRFS_BLOCK_GROUP_RAID0) 8115 return stripped; 8116 8117 /* turn mirroring into duplication */ 8118 if (flags & (BTRFS_BLOCK_GROUP_RAID1 | 8119 BTRFS_BLOCK_GROUP_RAID10)) 8120 return stripped | BTRFS_BLOCK_GROUP_DUP; 8121 return flags; 8122 } else { 8123 /* they already had raid on here, just return */ 8124 if (flags & stripped) 8125 return flags; 8126 8127 stripped |= BTRFS_BLOCK_GROUP_DUP; 8128 stripped = flags & ~stripped; 8129 8130 /* switch duplicated blocks with raid1 */ 8131 if (flags & BTRFS_BLOCK_GROUP_DUP) 8132 return stripped | BTRFS_BLOCK_GROUP_RAID1; 8133 8134 /* turn single device chunks into raid0 */ 8135 return stripped | BTRFS_BLOCK_GROUP_RAID0; 8136 } 8137 return flags; 8138 } 8139 8140 static int set_block_group_ro(struct btrfs_block_group_cache *cache) 8141 { 8142 struct btrfs_space_info *sinfo = cache->space_info; 8143 u64 num_bytes; 8144 int ret = -ENOSPC; 8145 8146 if (cache->ro) 8147 return 0; 8148 8149 spin_lock(&sinfo->lock); 8150 spin_lock(&cache->lock); 8151 num_bytes = cache->key.offset - cache->reserved - cache->pinned - 8152 cache->bytes_super - btrfs_block_group_used(&cache->item); 8153 8154 if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned + 8155 sinfo->bytes_may_use + sinfo->bytes_readonly + 8156 cache->reserved_pinned + num_bytes <= sinfo->total_bytes) { 8157 sinfo->bytes_readonly += num_bytes; 8158 sinfo->bytes_reserved += cache->reserved_pinned; 8159 cache->reserved_pinned = 0; 8160 cache->ro = 1; 8161 ret = 0; 8162 } 8163 8164 spin_unlock(&cache->lock); 8165 spin_unlock(&sinfo->lock); 8166 return ret; 8167 } 8168 8169 int btrfs_set_block_group_ro(struct btrfs_root *root, 8170 struct btrfs_block_group_cache *cache) 8171 8172 { 8173 struct btrfs_trans_handle *trans; 8174 u64 alloc_flags; 8175 int ret; 8176 8177 BUG_ON(cache->ro); 8178 8179 trans = btrfs_join_transaction(root, 1); 8180 BUG_ON(IS_ERR(trans)); 8181 8182 alloc_flags = update_block_group_flags(root, cache->flags); 8183 if (alloc_flags != cache->flags) 8184 do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, 8185 CHUNK_ALLOC_FORCE); 8186 8187 ret = set_block_group_ro(cache); 8188 if (!ret) 8189 goto out; 8190 alloc_flags = get_alloc_profile(root, cache->space_info->flags); 8191 ret = do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, 8192 CHUNK_ALLOC_FORCE); 8193 if (ret < 0) 8194 goto out; 8195 ret = set_block_group_ro(cache); 8196 out: 8197 btrfs_end_transaction(trans, root); 8198 return ret; 8199 } 8200 8201 int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, 8202 struct btrfs_root *root, u64 type) 8203 { 8204 u64 alloc_flags = get_alloc_profile(root, type); 8205 return do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, 8206 CHUNK_ALLOC_FORCE); 8207 } 8208 8209 /* 8210 * helper to account the unused space of all the readonly block group in the 8211 * list. takes mirrors into account. 
8212 */ 8213 static u64 __btrfs_get_ro_block_group_free_space(struct list_head *groups_list) 8214 { 8215 struct btrfs_block_group_cache *block_group; 8216 u64 free_bytes = 0; 8217 int factor; 8218 8219 list_for_each_entry(block_group, groups_list, list) { 8220 spin_lock(&block_group->lock); 8221 8222 if (!block_group->ro) { 8223 spin_unlock(&block_group->lock); 8224 continue; 8225 } 8226 8227 if (block_group->flags & (BTRFS_BLOCK_GROUP_RAID1 | 8228 BTRFS_BLOCK_GROUP_RAID10 | 8229 BTRFS_BLOCK_GROUP_DUP)) 8230 factor = 2; 8231 else 8232 factor = 1; 8233 8234 free_bytes += (block_group->key.offset - 8235 btrfs_block_group_used(&block_group->item)) * 8236 factor; 8237 8238 spin_unlock(&block_group->lock); 8239 } 8240 8241 return free_bytes; 8242 } 8243 8244 /* 8245 * helper to account the unused space of all the readonly block group in the 8246 * space_info. takes mirrors into account. 8247 */ 8248 u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo) 8249 { 8250 int i; 8251 u64 free_bytes = 0; 8252 8253 spin_lock(&sinfo->lock); 8254 8255 for(i = 0; i < BTRFS_NR_RAID_TYPES; i++) 8256 if (!list_empty(&sinfo->block_groups[i])) 8257 free_bytes += __btrfs_get_ro_block_group_free_space( 8258 &sinfo->block_groups[i]); 8259 8260 spin_unlock(&sinfo->lock); 8261 8262 return free_bytes; 8263 } 8264 8265 int btrfs_set_block_group_rw(struct btrfs_root *root, 8266 struct btrfs_block_group_cache *cache) 8267 { 8268 struct btrfs_space_info *sinfo = cache->space_info; 8269 u64 num_bytes; 8270 8271 BUG_ON(!cache->ro); 8272 8273 spin_lock(&sinfo->lock); 8274 spin_lock(&cache->lock); 8275 num_bytes = cache->key.offset - cache->reserved - cache->pinned - 8276 cache->bytes_super - btrfs_block_group_used(&cache->item); 8277 sinfo->bytes_readonly -= num_bytes; 8278 cache->ro = 0; 8279 spin_unlock(&cache->lock); 8280 spin_unlock(&sinfo->lock); 8281 return 0; 8282 } 8283 8284 /* 8285 * checks to see if its even possible to relocate this block group. 8286 * 8287 * @return - -1 if it's not a good idea to relocate this block group, 0 if its 8288 * ok to go ahead and try. 8289 */ 8290 int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr) 8291 { 8292 struct btrfs_block_group_cache *block_group; 8293 struct btrfs_space_info *space_info; 8294 struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; 8295 struct btrfs_device *device; 8296 int full = 0; 8297 int ret = 0; 8298 8299 block_group = btrfs_lookup_block_group(root->fs_info, bytenr); 8300 8301 /* odd, couldn't find the block group, leave it alone */ 8302 if (!block_group) 8303 return -1; 8304 8305 /* no bytes used, we're good */ 8306 if (!btrfs_block_group_used(&block_group->item)) 8307 goto out; 8308 8309 space_info = block_group->space_info; 8310 spin_lock(&space_info->lock); 8311 8312 full = space_info->full; 8313 8314 /* 8315 * if this is the last block group we have in this space, we can't 8316 * relocate it unless we're able to allocate a new chunk below. 8317 * 8318 * Otherwise, we need to make sure we have room in the space to handle 8319 * all of the extents from this block group. 
If we can, we're good 8320 */ 8321 if ((space_info->total_bytes != block_group->key.offset) && 8322 (space_info->bytes_used + space_info->bytes_reserved + 8323 space_info->bytes_pinned + space_info->bytes_readonly + 8324 btrfs_block_group_used(&block_group->item) < 8325 space_info->total_bytes)) { 8326 spin_unlock(&space_info->lock); 8327 goto out; 8328 } 8329 spin_unlock(&space_info->lock); 8330 8331 /* 8332 * ok we don't have enough space, but maybe we have free space on our 8333 * devices to allocate new chunks for relocation, so loop through our 8334 * alloc devices and guess if we have enough space. However, if we 8335 * were marked as full, then we know there aren't enough chunks, and we 8336 * can just return. 8337 */ 8338 ret = -1; 8339 if (full) 8340 goto out; 8341 8342 mutex_lock(&root->fs_info->chunk_mutex); 8343 list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) { 8344 u64 min_free = btrfs_block_group_used(&block_group->item); 8345 u64 dev_offset; 8346 8347 /* 8348 * check to make sure we can actually find a chunk with enough 8349 * space to fit our block group in. 8350 */ 8351 if (device->total_bytes > device->bytes_used + min_free) { 8352 ret = find_free_dev_extent(NULL, device, min_free, 8353 &dev_offset, NULL); 8354 if (!ret) 8355 break; 8356 ret = -1; 8357 } 8358 } 8359 mutex_unlock(&root->fs_info->chunk_mutex); 8360 out: 8361 btrfs_put_block_group(block_group); 8362 return ret; 8363 } 8364 8365 static int find_first_block_group(struct btrfs_root *root, 8366 struct btrfs_path *path, struct btrfs_key *key) 8367 { 8368 int ret = 0; 8369 struct btrfs_key found_key; 8370 struct extent_buffer *leaf; 8371 int slot; 8372 8373 ret = btrfs_search_slot(NULL, root, key, path, 0, 0); 8374 if (ret < 0) 8375 goto out; 8376 8377 while (1) { 8378 slot = path->slots[0]; 8379 leaf = path->nodes[0]; 8380 if (slot >= btrfs_header_nritems(leaf)) { 8381 ret = btrfs_next_leaf(root, path); 8382 if (ret == 0) 8383 continue; 8384 if (ret < 0) 8385 goto out; 8386 break; 8387 } 8388 btrfs_item_key_to_cpu(leaf, &found_key, slot); 8389 8390 if (found_key.objectid >= key->objectid && 8391 found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) { 8392 ret = 0; 8393 goto out; 8394 } 8395 path->slots[0]++; 8396 } 8397 out: 8398 return ret; 8399 } 8400 8401 void btrfs_put_block_group_cache(struct btrfs_fs_info *info) 8402 { 8403 struct btrfs_block_group_cache *block_group; 8404 u64 last = 0; 8405 8406 while (1) { 8407 struct inode *inode; 8408 8409 block_group = btrfs_lookup_first_block_group(info, last); 8410 while (block_group) { 8411 spin_lock(&block_group->lock); 8412 if (block_group->iref) 8413 break; 8414 spin_unlock(&block_group->lock); 8415 block_group = next_block_group(info->tree_root, 8416 block_group); 8417 } 8418 if (!block_group) { 8419 if (last == 0) 8420 break; 8421 last = 0; 8422 continue; 8423 } 8424 8425 inode = block_group->inode; 8426 block_group->iref = 0; 8427 block_group->inode = NULL; 8428 spin_unlock(&block_group->lock); 8429 iput(inode); 8430 last = block_group->key.objectid + block_group->key.offset; 8431 btrfs_put_block_group(block_group); 8432 } 8433 } 8434 8435 int btrfs_free_block_groups(struct btrfs_fs_info *info) 8436 { 8437 struct btrfs_block_group_cache *block_group; 8438 struct btrfs_space_info *space_info; 8439 struct btrfs_caching_control *caching_ctl; 8440 struct rb_node *n; 8441 8442 down_write(&info->extent_commit_sem); 8443 while (!list_empty(&info->caching_block_groups)) { 8444 caching_ctl = list_entry(info->caching_block_groups.next, 8445 struct 
btrfs_caching_control, list); 8446 list_del(&caching_ctl->list); 8447 put_caching_control(caching_ctl); 8448 } 8449 up_write(&info->extent_commit_sem); 8450 8451 spin_lock(&info->block_group_cache_lock); 8452 while ((n = rb_last(&info->block_group_cache_tree)) != NULL) { 8453 block_group = rb_entry(n, struct btrfs_block_group_cache, 8454 cache_node); 8455 rb_erase(&block_group->cache_node, 8456 &info->block_group_cache_tree); 8457 spin_unlock(&info->block_group_cache_lock); 8458 8459 down_write(&block_group->space_info->groups_sem); 8460 list_del(&block_group->list); 8461 up_write(&block_group->space_info->groups_sem); 8462 8463 if (block_group->cached == BTRFS_CACHE_STARTED) 8464 wait_block_group_cache_done(block_group); 8465 8466 /* 8467 * We haven't cached this block group, which means we could 8468 * possibly have excluded extents on this block group. 8469 */ 8470 if (block_group->cached == BTRFS_CACHE_NO) 8471 free_excluded_extents(info->extent_root, block_group); 8472 8473 btrfs_remove_free_space_cache(block_group); 8474 btrfs_put_block_group(block_group); 8475 8476 spin_lock(&info->block_group_cache_lock); 8477 } 8478 spin_unlock(&info->block_group_cache_lock); 8479 8480 /* now that all the block groups are freed, go through and 8481 * free all the space_info structs. This is only called during 8482 * the final stages of unmount, and so we know nobody is 8483 * using them. We call synchronize_rcu() once before we start, 8484 * just to be on the safe side. 8485 */ 8486 synchronize_rcu(); 8487 8488 release_global_block_rsv(info); 8489 8490 while(!list_empty(&info->space_info)) { 8491 space_info = list_entry(info->space_info.next, 8492 struct btrfs_space_info, 8493 list); 8494 if (space_info->bytes_pinned > 0 || 8495 space_info->bytes_reserved > 0) { 8496 WARN_ON(1); 8497 dump_space_info(space_info, 0, 0); 8498 } 8499 list_del(&space_info->list); 8500 kfree(space_info); 8501 } 8502 return 0; 8503 } 8504 8505 static void __link_block_group(struct btrfs_space_info *space_info, 8506 struct btrfs_block_group_cache *cache) 8507 { 8508 int index = get_block_group_index(cache); 8509 8510 down_write(&space_info->groups_sem); 8511 list_add_tail(&cache->list, &space_info->block_groups[index]); 8512 up_write(&space_info->groups_sem); 8513 } 8514 8515 int btrfs_read_block_groups(struct btrfs_root *root) 8516 { 8517 struct btrfs_path *path; 8518 int ret; 8519 struct btrfs_block_group_cache *cache; 8520 struct btrfs_fs_info *info = root->fs_info; 8521 struct btrfs_space_info *space_info; 8522 struct btrfs_key key; 8523 struct btrfs_key found_key; 8524 struct extent_buffer *leaf; 8525 int need_clear = 0; 8526 u64 cache_gen; 8527 8528 root = info->extent_root; 8529 key.objectid = 0; 8530 key.offset = 0; 8531 btrfs_set_key_type(&key, BTRFS_BLOCK_GROUP_ITEM_KEY); 8532 path = btrfs_alloc_path(); 8533 if (!path) 8534 return -ENOMEM; 8535 8536 cache_gen = btrfs_super_cache_generation(&root->fs_info->super_copy); 8537 if (cache_gen != 0 && 8538 btrfs_super_generation(&root->fs_info->super_copy) != cache_gen) 8539 need_clear = 1; 8540 if (btrfs_test_opt(root, CLEAR_CACHE)) 8541 need_clear = 1; 8542 if (!btrfs_test_opt(root, SPACE_CACHE) && cache_gen) 8543 printk(KERN_INFO "btrfs: disk space caching is enabled\n"); 8544 8545 while (1) { 8546 ret = find_first_block_group(root, path, &key); 8547 if (ret > 0) 8548 break; 8549 if (ret != 0) 8550 goto error; 8551 leaf = path->nodes[0]; 8552 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 8553 cache = kzalloc(sizeof(*cache), GFP_NOFS); 8554 if (!cache) { 
8555 ret = -ENOMEM; 8556 goto error; 8557 } 8558 8559 atomic_set(&cache->count, 1); 8560 spin_lock_init(&cache->lock); 8561 spin_lock_init(&cache->tree_lock); 8562 cache->fs_info = info; 8563 INIT_LIST_HEAD(&cache->list); 8564 INIT_LIST_HEAD(&cache->cluster_list); 8565 8566 if (need_clear) 8567 cache->disk_cache_state = BTRFS_DC_CLEAR; 8568 8569 /* 8570 * we only want to have 32k of ram per block group for keeping 8571 * track of free space, and if we pass 1/2 of that we want to 8572 * start converting things over to using bitmaps 8573 */ 8574 cache->extents_thresh = ((1024 * 32) / 2) / 8575 sizeof(struct btrfs_free_space); 8576 8577 read_extent_buffer(leaf, &cache->item, 8578 btrfs_item_ptr_offset(leaf, path->slots[0]), 8579 sizeof(cache->item)); 8580 memcpy(&cache->key, &found_key, sizeof(found_key)); 8581 8582 key.objectid = found_key.objectid + found_key.offset; 8583 btrfs_release_path(root, path); 8584 cache->flags = btrfs_block_group_flags(&cache->item); 8585 cache->sectorsize = root->sectorsize; 8586 8587 /* 8588 * We need to exclude the super stripes now so that the space 8589 * info has super bytes accounted for, otherwise we'll think 8590 * we have more space than we actually do. 8591 */ 8592 exclude_super_stripes(root, cache); 8593 8594 /* 8595 * check for two cases, either we are full, and therefore 8596 * don't need to bother with the caching work since we won't 8597 * find any space, or we are empty, and we can just add all 8598 * the space in and be done with it. This saves us _alot_ of 8599 * time, particularly in the full case. 8600 */ 8601 if (found_key.offset == btrfs_block_group_used(&cache->item)) { 8602 cache->last_byte_to_unpin = (u64)-1; 8603 cache->cached = BTRFS_CACHE_FINISHED; 8604 free_excluded_extents(root, cache); 8605 } else if (btrfs_block_group_used(&cache->item) == 0) { 8606 cache->last_byte_to_unpin = (u64)-1; 8607 cache->cached = BTRFS_CACHE_FINISHED; 8608 add_new_free_space(cache, root->fs_info, 8609 found_key.objectid, 8610 found_key.objectid + 8611 found_key.offset); 8612 free_excluded_extents(root, cache); 8613 } 8614 8615 ret = update_space_info(info, cache->flags, found_key.offset, 8616 btrfs_block_group_used(&cache->item), 8617 &space_info); 8618 BUG_ON(ret); 8619 cache->space_info = space_info; 8620 spin_lock(&cache->space_info->lock); 8621 cache->space_info->bytes_readonly += cache->bytes_super; 8622 spin_unlock(&cache->space_info->lock); 8623 8624 __link_block_group(space_info, cache); 8625 8626 ret = btrfs_add_block_group_cache(root->fs_info, cache); 8627 BUG_ON(ret); 8628 8629 set_avail_alloc_bits(root->fs_info, cache->flags); 8630 if (btrfs_chunk_readonly(root, cache->key.objectid)) 8631 set_block_group_ro(cache); 8632 } 8633 8634 list_for_each_entry_rcu(space_info, &root->fs_info->space_info, list) { 8635 if (!(get_alloc_profile(root, space_info->flags) & 8636 (BTRFS_BLOCK_GROUP_RAID10 | 8637 BTRFS_BLOCK_GROUP_RAID1 | 8638 BTRFS_BLOCK_GROUP_DUP))) 8639 continue; 8640 /* 8641 * avoid allocating from un-mirrored block group if there are 8642 * mirrored block groups. 
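 * (In this kernel's RAID-type ordering, list slots 3 and 4 below hold
 * the RAID0 and single-profile block groups, i.e. the un-mirrored ones
 * that get forced read-only here.)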
8643 */ 8644 list_for_each_entry(cache, &space_info->block_groups[3], list) 8645 set_block_group_ro(cache); 8646 list_for_each_entry(cache, &space_info->block_groups[4], list) 8647 set_block_group_ro(cache); 8648 } 8649 8650 init_global_block_rsv(info); 8651 ret = 0; 8652 error: 8653 btrfs_free_path(path); 8654 return ret; 8655 } 8656 8657 int btrfs_make_block_group(struct btrfs_trans_handle *trans, 8658 struct btrfs_root *root, u64 bytes_used, 8659 u64 type, u64 chunk_objectid, u64 chunk_offset, 8660 u64 size) 8661 { 8662 int ret; 8663 struct btrfs_root *extent_root; 8664 struct btrfs_block_group_cache *cache; 8665 8666 extent_root = root->fs_info->extent_root; 8667 8668 root->fs_info->last_trans_log_full_commit = trans->transid; 8669 8670 cache = kzalloc(sizeof(*cache), GFP_NOFS); 8671 if (!cache) 8672 return -ENOMEM; 8673 8674 cache->key.objectid = chunk_offset; 8675 cache->key.offset = size; 8676 cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; 8677 cache->sectorsize = root->sectorsize; 8678 cache->fs_info = root->fs_info; 8679 8680 /* 8681 * we only want to have 32k of ram per block group for keeping track 8682 * of free space, and if we pass 1/2 of that we want to start 8683 * converting things over to using bitmaps 8684 */ 8685 cache->extents_thresh = ((1024 * 32) / 2) / 8686 sizeof(struct btrfs_free_space); 8687 atomic_set(&cache->count, 1); 8688 spin_lock_init(&cache->lock); 8689 spin_lock_init(&cache->tree_lock); 8690 INIT_LIST_HEAD(&cache->list); 8691 INIT_LIST_HEAD(&cache->cluster_list); 8692 8693 btrfs_set_block_group_used(&cache->item, bytes_used); 8694 btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid); 8695 cache->flags = type; 8696 btrfs_set_block_group_flags(&cache->item, type); 8697 8698 cache->last_byte_to_unpin = (u64)-1; 8699 cache->cached = BTRFS_CACHE_FINISHED; 8700 exclude_super_stripes(root, cache); 8701 8702 add_new_free_space(cache, root->fs_info, chunk_offset, 8703 chunk_offset + size); 8704 8705 free_excluded_extents(root, cache); 8706 8707 ret = update_space_info(root->fs_info, cache->flags, size, bytes_used, 8708 &cache->space_info); 8709 BUG_ON(ret); 8710 8711 spin_lock(&cache->space_info->lock); 8712 cache->space_info->bytes_readonly += cache->bytes_super; 8713 spin_unlock(&cache->space_info->lock); 8714 8715 __link_block_group(cache->space_info, cache); 8716 8717 ret = btrfs_add_block_group_cache(root->fs_info, cache); 8718 BUG_ON(ret); 8719 8720 ret = btrfs_insert_item(trans, extent_root, &cache->key, &cache->item, 8721 sizeof(cache->item)); 8722 BUG_ON(ret); 8723 8724 set_avail_alloc_bits(extent_root->fs_info, type); 8725 8726 return 0; 8727 } 8728 8729 int btrfs_remove_block_group(struct btrfs_trans_handle *trans, 8730 struct btrfs_root *root, u64 group_start) 8731 { 8732 struct btrfs_path *path; 8733 struct btrfs_block_group_cache *block_group; 8734 struct btrfs_free_cluster *cluster; 8735 struct btrfs_root *tree_root = root->fs_info->tree_root; 8736 struct btrfs_key key; 8737 struct inode *inode; 8738 int ret; 8739 int factor; 8740 8741 root = root->fs_info->extent_root; 8742 8743 block_group = btrfs_lookup_block_group(root->fs_info, group_start); 8744 BUG_ON(!block_group); 8745 BUG_ON(!block_group->ro); 8746 8747 /* 8748 * Free the reserved super bytes from this block group before 8749 * remove it. 
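 * (i.e. drop the ranges that exclude_super_stripes() set aside for the
 * super block copies when this group was first brought up.)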
8750 */ 8751 free_excluded_extents(root, block_group); 8752 8753 memcpy(&key, &block_group->key, sizeof(key)); 8754 if (block_group->flags & (BTRFS_BLOCK_GROUP_DUP | 8755 BTRFS_BLOCK_GROUP_RAID1 | 8756 BTRFS_BLOCK_GROUP_RAID10)) 8757 factor = 2; 8758 else 8759 factor = 1; 8760 8761 /* make sure this block group isn't part of an allocation cluster */ 8762 cluster = &root->fs_info->data_alloc_cluster; 8763 spin_lock(&cluster->refill_lock); 8764 btrfs_return_cluster_to_free_space(block_group, cluster); 8765 spin_unlock(&cluster->refill_lock); 8766 8767 /* 8768 * make sure this block group isn't part of a metadata 8769 * allocation cluster 8770 */ 8771 cluster = &root->fs_info->meta_alloc_cluster; 8772 spin_lock(&cluster->refill_lock); 8773 btrfs_return_cluster_to_free_space(block_group, cluster); 8774 spin_unlock(&cluster->refill_lock); 8775 8776 path = btrfs_alloc_path(); 8777 BUG_ON(!path); 8778 8779 inode = lookup_free_space_inode(root, block_group, path); 8780 if (!IS_ERR(inode)) { 8781 btrfs_orphan_add(trans, inode); 8782 clear_nlink(inode); 8783 /* One for the block groups ref */ 8784 spin_lock(&block_group->lock); 8785 if (block_group->iref) { 8786 block_group->iref = 0; 8787 block_group->inode = NULL; 8788 spin_unlock(&block_group->lock); 8789 iput(inode); 8790 } else { 8791 spin_unlock(&block_group->lock); 8792 } 8793 /* One for our lookup ref */ 8794 iput(inode); 8795 } 8796 8797 key.objectid = BTRFS_FREE_SPACE_OBJECTID; 8798 key.offset = block_group->key.objectid; 8799 key.type = 0; 8800 8801 ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1); 8802 if (ret < 0) 8803 goto out; 8804 if (ret > 0) 8805 btrfs_release_path(tree_root, path); 8806 if (ret == 0) { 8807 ret = btrfs_del_item(trans, tree_root, path); 8808 if (ret) 8809 goto out; 8810 btrfs_release_path(tree_root, path); 8811 } 8812 8813 spin_lock(&root->fs_info->block_group_cache_lock); 8814 rb_erase(&block_group->cache_node, 8815 &root->fs_info->block_group_cache_tree); 8816 spin_unlock(&root->fs_info->block_group_cache_lock); 8817 8818 down_write(&block_group->space_info->groups_sem); 8819 /* 8820 * we must use list_del_init so people can check to see if they 8821 * are still on the list after taking the semaphore 8822 */ 8823 list_del_init(&block_group->list); 8824 up_write(&block_group->space_info->groups_sem); 8825 8826 if (block_group->cached == BTRFS_CACHE_STARTED) 8827 wait_block_group_cache_done(block_group); 8828 8829 btrfs_remove_free_space_cache(block_group); 8830 8831 spin_lock(&block_group->space_info->lock); 8832 block_group->space_info->total_bytes -= block_group->key.offset; 8833 block_group->space_info->bytes_readonly -= block_group->key.offset; 8834 block_group->space_info->disk_total -= block_group->key.offset * factor; 8835 spin_unlock(&block_group->space_info->lock); 8836 8837 memcpy(&key, &block_group->key, sizeof(key)); 8838 8839 btrfs_clear_space_info_full(root->fs_info); 8840 8841 btrfs_put_block_group(block_group); 8842 btrfs_put_block_group(block_group); 8843 8844 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 8845 if (ret > 0) 8846 ret = -EIO; 8847 if (ret < 0) 8848 goto out; 8849 8850 ret = btrfs_del_item(trans, root, path); 8851 out: 8852 btrfs_free_path(path); 8853 return ret; 8854 } 8855 8856 int btrfs_init_space_info(struct btrfs_fs_info *fs_info) 8857 { 8858 struct btrfs_space_info *space_info; 8859 struct btrfs_super_block *disk_super; 8860 u64 features; 8861 u64 flags; 8862 int mixed = 0; 8863 int ret; 8864 8865 disk_super = &fs_info->super_copy; 8866 if 
(!btrfs_super_root(disk_super)) 8867 return 1; 8868 8869 features = btrfs_super_incompat_flags(disk_super); 8870 if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) 8871 mixed = 1; 8872 8873 flags = BTRFS_BLOCK_GROUP_SYSTEM; 8874 ret = update_space_info(fs_info, flags, 0, 0, &space_info); 8875 if (ret) 8876 goto out; 8877 8878 if (mixed) { 8879 flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA; 8880 ret = update_space_info(fs_info, flags, 0, 0, &space_info); 8881 } else { 8882 flags = BTRFS_BLOCK_GROUP_METADATA; 8883 ret = update_space_info(fs_info, flags, 0, 0, &space_info); 8884 if (ret) 8885 goto out; 8886 8887 flags = BTRFS_BLOCK_GROUP_DATA; 8888 ret = update_space_info(fs_info, flags, 0, 0, &space_info); 8889 } 8890 out: 8891 return ret; 8892 } 8893 8894 int btrfs_error_unpin_extent_range(struct btrfs_root *root, u64 start, u64 end) 8895 { 8896 return unpin_extent_range(root, start, end); 8897 } 8898 8899 int btrfs_error_discard_extent(struct btrfs_root *root, u64 bytenr, 8900 u64 num_bytes, u64 *actual_bytes) 8901 { 8902 return btrfs_discard_extent(root, bytenr, num_bytes, actual_bytes); 8903 } 8904 8905 int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range) 8906 { 8907 struct btrfs_fs_info *fs_info = root->fs_info; 8908 struct btrfs_block_group_cache *cache = NULL; 8909 u64 group_trimmed; 8910 u64 start; 8911 u64 end; 8912 u64 trimmed = 0; 8913 int ret = 0; 8914 8915 cache = btrfs_lookup_block_group(fs_info, range->start); 8916 8917 while (cache) { 8918 if (cache->key.objectid >= (range->start + range->len)) { 8919 btrfs_put_block_group(cache); 8920 break; 8921 } 8922 8923 start = max(range->start, cache->key.objectid); 8924 end = min(range->start + range->len, 8925 cache->key.objectid + cache->key.offset); 8926 8927 if (end - start >= range->minlen) { 8928 if (!block_group_cache_done(cache)) { 8929 ret = cache_block_group(cache, NULL, root, 0); 8930 if (!ret) 8931 wait_block_group_cache_done(cache); 8932 } 8933 ret = btrfs_trim_block_group(cache, 8934 &group_trimmed, 8935 start, 8936 end, 8937 range->minlen); 8938 8939 trimmed += group_trimmed; 8940 if (ret) { 8941 btrfs_put_block_group(cache); 8942 break; 8943 } 8944 } 8945 8946 cache = next_block_group(fs_info->tree_root, cache); 8947 } 8948 8949 range->len = trimmed; 8950 return ret; 8951 } 8952
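/*
 * Note on the trim helper above: btrfs_trim_fs() is reached from the
 * FITRIM ioctl path. It walks each block group overlapping the requested
 * [range->start, range->start + range->len) window, discards free space
 * in at least range->minlen chunks, and hands the total number of bytes
 * actually trimmed back to the caller through range->len.
 */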