/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 021110-1307, USA.
 */
#include <linux/sched.h>
#include <linux/pagemap.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/sort.h>
#include <linux/rcupdate.h>
#include <linux/kthread.h>
#include <linux/slab.h>
#include <linux/ratelimit.h>
#include "compat.h"
#include "hash.h"
#include "ctree.h"
#include "disk-io.h"
#include "print-tree.h"
#include "transaction.h"
#include "volumes.h"
#include "raid56.h"
#include "locking.h"
#include "free-space-cache.h"
#include "math.h"

#undef SCRAMBLE_DELAYED_REFS

/*
 * control flags for do_chunk_alloc's force field
 * CHUNK_ALLOC_NO_FORCE means to only allocate a chunk
 * if we really need one.
 *
 * CHUNK_ALLOC_LIMITED means to only try and allocate one
 * if we have very few chunks already allocated.  This is
 * used as part of the clustering code to help make sure
 * we have a good pool of storage to cluster in, without
 * filling the FS with empty chunks
 *
 * CHUNK_ALLOC_FORCE means it must try to allocate one
 *
 */
enum {
	CHUNK_ALLOC_NO_FORCE = 0,
	CHUNK_ALLOC_LIMITED = 1,
	CHUNK_ALLOC_FORCE = 2,
};

/*
 * Control how reservations are dealt with.
 *
 * RESERVE_FREE - freeing a reservation.
 * RESERVE_ALLOC - allocating space and we need to update bytes_may_use for
 *   ENOSPC accounting
 * RESERVE_ALLOC_NO_ACCOUNT - allocating space and we should not update
 *   bytes_may_use as the ENOSPC accounting is done elsewhere
 */
enum {
	RESERVE_FREE = 0,
	RESERVE_ALLOC = 1,
	RESERVE_ALLOC_NO_ACCOUNT = 2,
};

static int update_block_group(struct btrfs_root *root,
			      u64 bytenr, u64 num_bytes, int alloc);
static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root,
			       u64 bytenr, u64 num_bytes, u64 parent,
			       u64 root_objectid, u64 owner_objectid,
			       u64 owner_offset, int refs_to_drop,
			       struct btrfs_delayed_extent_op *extra_op);
static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
				    struct extent_buffer *leaf,
				    struct btrfs_extent_item *ei);
static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root,
				      u64 parent, u64 root_objectid,
				      u64 flags, u64 owner, u64 offset,
				      struct btrfs_key *ins, int ref_mod);
static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
				     struct btrfs_root *root,
				     u64 parent, u64 root_objectid,
				     u64 flags, struct btrfs_disk_key *key,
				     int level, struct btrfs_key *ins);
static int do_chunk_alloc(struct btrfs_trans_handle *trans,
			  struct btrfs_root *extent_root, u64 flags,
			  int force);
static int find_next_key(struct btrfs_path *path, int level,
			 struct btrfs_key *key);
static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
			    int dump_block_groups);
static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
				       u64 num_bytes, int reserve);
static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
			       u64 num_bytes);
int btrfs_pin_extent(struct btrfs_root *root,
		     u64 bytenr, u64 num_bytes, int reserved);

static noinline int
block_group_cache_done(struct btrfs_block_group_cache *cache)
{
	smp_mb();
	return cache->cached == BTRFS_CACHE_FINISHED;
}

static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
{
	return (cache->flags & bits) == bits;
}

static void btrfs_get_block_group(struct btrfs_block_group_cache *cache)
{
	atomic_inc(&cache->count);
}

void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
{
	if (atomic_dec_and_test(&cache->count)) {
		WARN_ON(cache->pinned > 0);
		WARN_ON(cache->reserved > 0);
		kfree(cache->free_space_ctl);
		kfree(cache);
	}
}

/*
 * this adds the block group to the fs_info rb tree for the block group
 * cache
 */
static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
				struct btrfs_block_group_cache *block_group)
{
	struct rb_node **p;
	struct rb_node *parent = NULL;
	struct btrfs_block_group_cache *cache;

	spin_lock(&info->block_group_cache_lock);
	p = &info->block_group_cache_tree.rb_node;

	while (*p) {
		parent = *p;
		cache = rb_entry(parent, struct btrfs_block_group_cache,
				 cache_node);
		if (block_group->key.objectid < cache->key.objectid) {
			p = &(*p)->rb_left;
		} else if (block_group->key.objectid > cache->key.objectid) {
			p = &(*p)->rb_right;
		} else {
			spin_unlock(&info->block_group_cache_lock);
			return -EEXIST;
		}
	}

	rb_link_node(&block_group->cache_node, parent, p);
	rb_insert_color(&block_group->cache_node,
			&info->block_group_cache_tree);

	if (info->first_logical_byte > block_group->key.objectid)
		info->first_logical_byte = block_group->key.objectid;

	spin_unlock(&info->block_group_cache_lock);

	return 0;
}

/*
 * This will return the block group at or after bytenr if contains is 0, else
 * it will return the block group that contains the bytenr
 */
static struct btrfs_block_group_cache *
block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,
			      int contains)
{
	struct btrfs_block_group_cache *cache, *ret = NULL;
	struct rb_node *n;
	u64 end, start;

	spin_lock(&info->block_group_cache_lock);
	n = info->block_group_cache_tree.rb_node;

	while (n) {
		cache = rb_entry(n, struct btrfs_block_group_cache,
				 cache_node);
		end = cache->key.objectid + cache->key.offset - 1;
		start = cache->key.objectid;

		if (bytenr < start) {
			if (!contains && (!ret || start < ret->key.objectid))
				ret = cache;
			n = n->rb_left;
		} else if (bytenr > start) {
			if (contains && bytenr <= end) {
				ret = cache;
				break;
			}
			n = n->rb_right;
		} else {
			ret = cache;
			break;
		}
	}
	if (ret) {
		btrfs_get_block_group(ret);
		if (bytenr == 0 && info->first_logical_byte > ret->key.objectid)
			info->first_logical_byte = ret->key.objectid;
	}
	spin_unlock(&info->block_group_cache_lock);

	return ret;
}

static int add_excluded_extent(struct btrfs_root *root,
			       u64 start, u64 num_bytes)
{
	u64 end = start + num_bytes - 1;
	set_extent_bits(&root->fs_info->freed_extents[0],
			start, end, EXTENT_UPTODATE, GFP_NOFS);
	set_extent_bits(&root->fs_info->freed_extents[1],
			start, end, EXTENT_UPTODATE, GFP_NOFS);
	return 0;
}

static void free_excluded_extents(struct btrfs_root *root,
				  struct btrfs_block_group_cache *cache)
{
	u64 start, end;

	start = cache->key.objectid;
	end = start + cache->key.offset - 1;

	clear_extent_bits(&root->fs_info->freed_extents[0],
			  start, end, EXTENT_UPTODATE, GFP_NOFS);
	clear_extent_bits(&root->fs_info->freed_extents[1],
			  start, end, EXTENT_UPTODATE, GFP_NOFS);
}

static int exclude_super_stripes(struct btrfs_root *root,
				 struct btrfs_block_group_cache *cache)
{
	u64 bytenr;
	u64 *logical;
	int stripe_len;
	int i, nr, ret;

	if (cache->key.objectid < BTRFS_SUPER_INFO_OFFSET) {
		stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->key.objectid;
		cache->bytes_super += stripe_len;
		ret = add_excluded_extent(root, cache->key.objectid,
					  stripe_len);
		if (ret)
			return ret;
	}

	for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
		bytenr = btrfs_sb_offset(i);
		ret = btrfs_rmap_block(&root->fs_info->mapping_tree,
				       cache->key.objectid, bytenr,
				       0, &logical, &nr, &stripe_len);
		if (ret)
			return ret;

		while (nr--) {
			u64 start, len;

			if (logical[nr] > cache->key.objectid +
			    cache->key.offset)
				continue;

			if (logical[nr] + stripe_len <= cache->key.objectid)
				continue;

			start = logical[nr];
			if (start < cache->key.objectid) {
				start = cache->key.objectid;
				len = (logical[nr] + stripe_len) - start;
			} else {
				len = min_t(u64, stripe_len,
					    cache->key.objectid +
					    cache->key.offset - start);
			}

			cache->bytes_super += len;
			ret = add_excluded_extent(root, start, len);
			if (ret) {
				kfree(logical);
				return ret;
			}
		}

		kfree(logical);
	}
	return 0;
}
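
/*
 * Grab a reference on the caching control for this block group, or return
 * NULL if no caching is in progress (either it never started, or the block
 * group is being loaded the fast way and has no caching_ctl).  Callers drop
 * the reference with put_caching_control().
 */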
static struct btrfs_caching_control *
get_caching_control(struct btrfs_block_group_cache *cache)
{
	struct btrfs_caching_control *ctl;

	spin_lock(&cache->lock);
	if (cache->cached != BTRFS_CACHE_STARTED) {
		spin_unlock(&cache->lock);
		return NULL;
	}

	/* We're loading it the fast way, so we don't have a caching_ctl. */
	if (!cache->caching_ctl) {
		spin_unlock(&cache->lock);
		return NULL;
	}

	ctl = cache->caching_ctl;
	atomic_inc(&ctl->count);
	spin_unlock(&cache->lock);
	return ctl;
}

static void put_caching_control(struct btrfs_caching_control *ctl)
{
	if (atomic_dec_and_test(&ctl->count))
		kfree(ctl);
}

/*
 * this is only called by cache_block_group, since we could have freed extents
 * we need to check the pinned_extents for any extents that can't be used yet
 * since their free space will be released as soon as the transaction commits.
 */
static u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
			      struct btrfs_fs_info *info, u64 start, u64 end)
{
	u64 extent_start, extent_end, size, total_added = 0;
	int ret;

	while (start < end) {
		ret = find_first_extent_bit(info->pinned_extents, start,
					    &extent_start, &extent_end,
					    EXTENT_DIRTY | EXTENT_UPTODATE,
					    NULL);
		if (ret)
			break;

		if (extent_start <= start) {
			start = extent_end + 1;
		} else if (extent_start > start && extent_start < end) {
			size = extent_start - start;
			total_added += size;
			ret = btrfs_add_free_space(block_group, start,
						   size);
			BUG_ON(ret); /* -ENOMEM or logic error */
			start = extent_end + 1;
		} else {
			break;
		}
	}

	if (start < end) {
		size = end - start;
		total_added += size;
		ret = btrfs_add_free_space(block_group, start, size);
		BUG_ON(ret); /* -ENOMEM or logic error */
	}

	return total_added;
}

static noinline void caching_thread(struct btrfs_work *work)
{
	struct btrfs_block_group_cache *block_group;
	struct btrfs_fs_info *fs_info;
	struct btrfs_caching_control *caching_ctl;
	struct btrfs_root *extent_root;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_key key;
	u64 total_found = 0;
	u64 last = 0;
	u32 nritems;
	int ret = 0;

	caching_ctl = container_of(work, struct btrfs_caching_control, work);
	block_group = caching_ctl->block_group;
	fs_info = block_group->fs_info;
	extent_root = fs_info->extent_root;

	path = btrfs_alloc_path();
	if (!path)
		goto out;

	last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);

	/*
	 * We don't want to deadlock with somebody trying to allocate a new
	 * extent for the extent root while also trying to search the extent
	 * root to add free space.
	 * So we skip locking and search the commit root, since it's
	 * read-only.
	 */
	path->skip_locking = 1;
	path->search_commit_root = 1;
	path->reada = 1;

	key.objectid = last;
	key.offset = 0;
	key.type = BTRFS_EXTENT_ITEM_KEY;
again:
	mutex_lock(&caching_ctl->mutex);
	/* need to make sure the commit_root doesn't disappear */
	down_read(&fs_info->extent_commit_sem);

	ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
	if (ret < 0)
		goto err;

	leaf = path->nodes[0];
	nritems = btrfs_header_nritems(leaf);

	while (1) {
		if (btrfs_fs_closing(fs_info) > 1) {
			last = (u64)-1;
			break;
		}

		if (path->slots[0] < nritems) {
			btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
		} else {
			ret = find_next_key(path, 0, &key);
			if (ret)
				break;

			if (need_resched()) {
				caching_ctl->progress = last;
				btrfs_release_path(path);
				up_read(&fs_info->extent_commit_sem);
				mutex_unlock(&caching_ctl->mutex);
				cond_resched();
				goto again;
			}

			ret = btrfs_next_leaf(extent_root, path);
			if (ret < 0)
				goto err;
			if (ret)
				break;
			leaf = path->nodes[0];
			nritems = btrfs_header_nritems(leaf);
			continue;
		}

		if (key.objectid < block_group->key.objectid) {
			path->slots[0]++;
			continue;
		}

		if (key.objectid >= block_group->key.objectid +
		    block_group->key.offset)
			break;

		if (key.type == BTRFS_EXTENT_ITEM_KEY ||
		    key.type == BTRFS_METADATA_ITEM_KEY) {
			total_found += add_new_free_space(block_group,
							  fs_info, last,
							  key.objectid);
			if (key.type == BTRFS_METADATA_ITEM_KEY)
				last = key.objectid +
					fs_info->tree_root->leafsize;
			else
				last = key.objectid + key.offset;

			if (total_found > (1024 * 1024 * 2)) {
				total_found = 0;
				wake_up(&caching_ctl->wait);
			}
		}
		path->slots[0]++;
	}
	ret = 0;

	total_found += add_new_free_space(block_group, fs_info, last,
					  block_group->key.objectid +
					  block_group->key.offset);
	caching_ctl->progress = (u64)-1;

	spin_lock(&block_group->lock);
	block_group->caching_ctl = NULL;
	block_group->cached = BTRFS_CACHE_FINISHED;
	spin_unlock(&block_group->lock);

err:
	btrfs_free_path(path);
	up_read(&fs_info->extent_commit_sem);

	free_excluded_extents(extent_root, block_group);

	mutex_unlock(&caching_ctl->mutex);
out:
	wake_up(&caching_ctl->wait);

	put_caching_control(caching_ctl);
	btrfs_put_block_group(block_group);
}

static int cache_block_group(struct btrfs_block_group_cache *cache,
			     int load_cache_only)
{
	DEFINE_WAIT(wait);
	struct btrfs_fs_info *fs_info = cache->fs_info;
	struct btrfs_caching_control *caching_ctl;
	int ret = 0;

	caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
	if (!caching_ctl)
		return -ENOMEM;

	INIT_LIST_HEAD(&caching_ctl->list);
	mutex_init(&caching_ctl->mutex);
	init_waitqueue_head(&caching_ctl->wait);
	caching_ctl->block_group = cache;
	caching_ctl->progress = cache->key.objectid;
	atomic_set(&caching_ctl->count, 1);
	caching_ctl->work.func = caching_thread;

	spin_lock(&cache->lock);
	/*
	 * This should be a rare occasion, but this could happen I think in the
	 * case where one thread starts to load the space cache info, and then
	 * some other thread starts a transaction commit which tries to do an
	 * allocation while the other thread is still loading the
	 * space cache info.  The previous loop should have kept us from
	 * choosing this block group, but if we've moved to the state where we
	 * will wait on caching block groups we need to first check if we're
	 * doing a fast load here, so we can wait for it to finish, otherwise
	 * we could end up allocating from a block group whose cache gets
	 * evicted for one reason or another.
	 */
	while (cache->cached == BTRFS_CACHE_FAST) {
		struct btrfs_caching_control *ctl;

		ctl = cache->caching_ctl;
		atomic_inc(&ctl->count);
		prepare_to_wait(&ctl->wait, &wait, TASK_UNINTERRUPTIBLE);
		spin_unlock(&cache->lock);

		schedule();

		finish_wait(&ctl->wait, &wait);
		put_caching_control(ctl);
		spin_lock(&cache->lock);
	}

	if (cache->cached != BTRFS_CACHE_NO) {
		spin_unlock(&cache->lock);
		kfree(caching_ctl);
		return 0;
	}
	WARN_ON(cache->caching_ctl);
	cache->caching_ctl = caching_ctl;
	cache->cached = BTRFS_CACHE_FAST;
	spin_unlock(&cache->lock);

	if (fs_info->mount_opt & BTRFS_MOUNT_SPACE_CACHE) {
		ret = load_free_space_cache(fs_info, cache);

		spin_lock(&cache->lock);
		if (ret == 1) {
			cache->caching_ctl = NULL;
			cache->cached = BTRFS_CACHE_FINISHED;
			cache->last_byte_to_unpin = (u64)-1;
		} else {
			if (load_cache_only) {
				cache->caching_ctl = NULL;
				cache->cached = BTRFS_CACHE_NO;
			} else {
				cache->cached = BTRFS_CACHE_STARTED;
			}
		}
		spin_unlock(&cache->lock);
		wake_up(&caching_ctl->wait);
		if (ret == 1) {
			put_caching_control(caching_ctl);
			free_excluded_extents(fs_info->extent_root, cache);
			return 0;
		}
	} else {
		/*
		 * We are not going to do the fast caching, set cached to the
		 * appropriate value and wakeup any waiters.
		 */
		spin_lock(&cache->lock);
		if (load_cache_only) {
			cache->caching_ctl = NULL;
			cache->cached = BTRFS_CACHE_NO;
		} else {
			cache->cached = BTRFS_CACHE_STARTED;
		}
		spin_unlock(&cache->lock);
		wake_up(&caching_ctl->wait);
	}

	if (load_cache_only) {
		put_caching_control(caching_ctl);
		return 0;
	}

	down_write(&fs_info->extent_commit_sem);
	atomic_inc(&caching_ctl->count);
	list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
	up_write(&fs_info->extent_commit_sem);

	btrfs_get_block_group(cache);

	btrfs_queue_worker(&fs_info->caching_workers, &caching_ctl->work);

	return ret;
}
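
/*
 * Block group caching states, as driven by cache_block_group() above:
 *
 *   BTRFS_CACHE_NO        no free space information loaded yet
 *   BTRFS_CACHE_FAST      the on-disk free space cache is being loaded
 *   BTRFS_CACHE_STARTED   caching_thread() is scanning the extent tree
 *   BTRFS_CACHE_FINISHED  free space information is complete
 *
 * A fast load that succeeds goes straight from NO to FINISHED; otherwise we
 * either stay in NO (load_cache_only) or fall back to STARTED and queue the
 * caching worker.
 */
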
/*
 * return the block group that starts at or after bytenr
 */
static struct btrfs_block_group_cache *
btrfs_lookup_first_block_group(struct btrfs_fs_info *info, u64 bytenr)
{
	struct btrfs_block_group_cache *cache;

	cache = block_group_cache_tree_search(info, bytenr, 0);

	return cache;
}

/*
 * return the block group that contains the given bytenr
 */
struct btrfs_block_group_cache *btrfs_lookup_block_group(
						 struct btrfs_fs_info *info,
						 u64 bytenr)
{
	struct btrfs_block_group_cache *cache;

	cache = block_group_cache_tree_search(info, bytenr, 1);

	return cache;
}

static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
						  u64 flags)
{
	struct list_head *head = &info->space_info;
	struct btrfs_space_info *found;

	flags &= BTRFS_BLOCK_GROUP_TYPE_MASK;

	rcu_read_lock();
	list_for_each_entry_rcu(found, head, list) {
		if (found->flags & flags) {
			rcu_read_unlock();
			return found;
		}
	}
	rcu_read_unlock();
	return NULL;
}

/*
 * after adding space to the filesystem, we need to clear the full flags
 * on all the space infos.
 */
void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
{
	struct list_head *head = &info->space_info;
	struct btrfs_space_info *found;

	rcu_read_lock();
	list_for_each_entry_rcu(found, head, list)
		found->full = 0;
	rcu_read_unlock();
}

/* simple helper to search for an existing extent at a given offset */
int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len)
{
	int ret;
	struct btrfs_key key;
	struct btrfs_path *path;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = start;
	key.offset = len;
	key.type = BTRFS_EXTENT_ITEM_KEY;
	ret = btrfs_search_slot(NULL, root->fs_info->extent_root, &key, path,
				0, 0);
	if (ret > 0) {
		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
		if (key.objectid == start &&
		    key.type == BTRFS_METADATA_ITEM_KEY)
			ret = 0;
	}
	btrfs_free_path(path);
	return ret;
}

/*
 * helper function to lookup reference count and flags of a tree block.
 *
 * the head node for delayed ref is used to store the sum of all the
 * reference count modifications queued up in the rbtree. the head
 * node may also store the extent flags to set. This way you can check
 * to see what the reference count and extent flags would be if all of
 * the delayed refs are not processed.
 */
int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root, u64 bytenr,
			     u64 offset, int metadata, u64 *refs, u64 *flags)
{
	struct btrfs_delayed_ref_head *head;
	struct btrfs_delayed_ref_root *delayed_refs;
	struct btrfs_path *path;
	struct btrfs_extent_item *ei;
	struct extent_buffer *leaf;
	struct btrfs_key key;
	u32 item_size;
	u64 num_refs;
	u64 extent_flags;
	int ret;

	/*
	 * If we don't have skinny metadata, don't bother doing anything
	 * different
	 */
	if (metadata && !btrfs_fs_incompat(root->fs_info, SKINNY_METADATA)) {
		offset = root->leafsize;
		metadata = 0;
	}

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	if (metadata) {
		key.objectid = bytenr;
		key.type = BTRFS_METADATA_ITEM_KEY;
		key.offset = offset;
	} else {
		key.objectid = bytenr;
		key.type = BTRFS_EXTENT_ITEM_KEY;
		key.offset = offset;
	}

	if (!trans) {
		path->skip_locking = 1;
		path->search_commit_root = 1;
	}
again:
	ret = btrfs_search_slot(trans, root->fs_info->extent_root,
				&key, path, 0, 0);
	if (ret < 0)
		goto out_free;

	if (ret > 0 && metadata && key.type == BTRFS_METADATA_ITEM_KEY) {
		key.type = BTRFS_EXTENT_ITEM_KEY;
		key.offset = root->leafsize;
		btrfs_release_path(path);
		goto again;
	}

	if (ret == 0) {
		leaf = path->nodes[0];
		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
		if (item_size >= sizeof(*ei)) {
			ei = btrfs_item_ptr(leaf, path->slots[0],
					    struct btrfs_extent_item);
			num_refs = btrfs_extent_refs(leaf, ei);
			extent_flags = btrfs_extent_flags(leaf, ei);
		} else {
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
			struct btrfs_extent_item_v0 *ei0;
			BUG_ON(item_size != sizeof(*ei0));
			ei0 = btrfs_item_ptr(leaf, path->slots[0],
					     struct btrfs_extent_item_v0);
			num_refs = btrfs_extent_refs_v0(leaf, ei0);
			/* FIXME: this isn't correct for data */
			extent_flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
#else
			BUG();
#endif
		}
		BUG_ON(num_refs == 0);
	} else {
		num_refs = 0;
		extent_flags = 0;
		ret = 0;
	}

	if (!trans)
		goto out;

	delayed_refs = &trans->transaction->delayed_refs;
	spin_lock(&delayed_refs->lock);
	head = btrfs_find_delayed_ref_head(trans, bytenr);
	if (head) {
		if (!mutex_trylock(&head->mutex)) {
			atomic_inc(&head->node.refs);
			spin_unlock(&delayed_refs->lock);

			btrfs_release_path(path);

			/*
			 * Mutex was contended, block until it's released and
			 * try again
			 */
			mutex_lock(&head->mutex);
			mutex_unlock(&head->mutex);
			btrfs_put_delayed_ref(&head->node);
			goto again;
		}
		if (head->extent_op && head->extent_op->update_flags)
			extent_flags |= head->extent_op->flags_to_set;
		else
			BUG_ON(num_refs == 0);

		num_refs += head->node.ref_mod;
		mutex_unlock(&head->mutex);
	}
	spin_unlock(&delayed_refs->lock);
out:
	WARN_ON(num_refs == 0);
	if (refs)
		*refs = num_refs;
	if (flags)
		*flags = extent_flags;
out_free:
	btrfs_free_path(path);
	return ret;
}

/*
 * Back reference rules.  Back refs have three main goals:
 *
 * 1) differentiate between all holders of references to an extent so that
 *    when a reference is dropped we can make sure it was a valid reference
 *    before freeing the extent.
 *
 * 2) Provide enough information to quickly find the holders of an extent
 *    if we notice a given block is corrupted or bad.
 *
 * 3) Make it easy to migrate blocks for FS shrinking or storage pool
 *    maintenance.  This is actually the same as #2, but with a slightly
 *    different use case.
 *
 * There are two kinds of back refs. The implicit back refs is optimized
 * for pointers in non-shared tree blocks. For a given pointer in a block,
 * back refs of this kind provide information about the block's owner tree
 * and the pointer's key. This information allows us to find the block by
 * b-tree searching. The full back refs is for pointers in tree blocks not
 * referenced by their owner trees. The location of the tree block is
 * recorded in the back refs. Actually the full back refs is generic, and
 * can be used in all cases the implicit back refs is used. The major
 * shortcoming of the full back refs is its overhead. Every time a tree
 * block gets COWed, we have to update back refs entry for all pointers in
 * it.
 *
 * For a newly allocated tree block, we use implicit back refs for
 * pointers in it. This means most tree related operations only involve
 * implicit back refs. For a tree block created in an old transaction, the
 * only way to drop a reference to it is to COW it. So we can detect the
 * event that a tree block loses its owner tree's reference and do the
 * back refs conversion.
 *
 * When a tree block is COW'd through a tree, there are four cases:
 *
 * The reference count of the block is one and the tree is the block's
 * owner tree. Nothing to do in this case.
 *
 * The reference count of the block is one and the tree is not the
 * block's owner tree. In this case, full back refs is used for pointers
 * in the block. Remove these full back refs, add implicit back refs for
 * every pointer in the new block.
 *
 * The reference count of the block is greater than one and the tree is
 * the block's owner tree. In this case, implicit back refs is used for
 * pointers in the block. Add full back refs for every pointer in the
 * block, increase lower level extents' reference counts. The original
 * implicit back refs are entailed to the new block.
 *
 * The reference count of the block is greater than one and the tree is
 * not the block's owner tree. Add implicit back refs for every pointer in
 * the new block, increase lower level extents' reference count.
 *
 * Back Reference Key composing:
 *
 * The key objectid corresponds to the first byte in the extent,
 * The key type is used to differentiate between types of back refs.
 * There are different meanings of the key offset for different types
 * of back refs.
 *
 * File extents can be referenced by:
 *
 * - multiple snapshots, subvolumes, or different generations in one subvol
 * - different files inside a single subvolume
 * - different offsets inside a file (bookend extents in file.c)
 *
 * The extent ref structure for the implicit back refs has fields for:
 *
 * - Objectid of the subvolume root
 * - objectid of the file holding the reference
 * - original offset in the file
 * - how many bookend extents
 *
 * The key offset for the implicit back refs is hash of the first
 * three fields.
 *
 * The extent ref structure for the full back refs has a field for:
 *
 * - number of pointers in the tree leaf
 *
 * The key offset for the full back refs is the first byte of
 * the tree leaf
 *
 * When a file extent is allocated, the implicit back refs is used.
 * the fields are filled in:
 *
 *     (root_key.objectid, inode objectid, offset in file, 1)
 *
 * When a file extent is removed by file truncation, we find the
 * corresponding implicit back refs and check the following fields:
 *
 *     (btrfs_header_owner(leaf), inode objectid, offset in file)
 *
 * Btree extents can be referenced by:
 *
 * - Different subvolumes
 *
 * Both the implicit back refs and the full back refs for tree blocks
 * only consist of key. The key offset for the implicit back refs is
 * objectid of block's owner tree. The key offset for the full back refs
 * is the first byte of parent block.
 *
 * When implicit back refs is used, information about the lowest key and
 * level of the tree block are required. This information is stored in
 * the tree block info structure.
 */
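
/*
 * For example, a file extent referenced by subvolume root 5, inode 257 at
 * file offset 0 gets an implicit back ref item keyed as
 *
 *     (extent bytenr, BTRFS_EXTENT_DATA_REF_KEY,
 *      hash_extent_data_ref(5, 257, 0))
 *
 * with the root/objectid/offset/count fields stored in the item body
 * (struct btrfs_extent_data_ref).  The same extent referenced through a
 * shared (full) back ref instead uses
 *
 *     (extent bytenr, BTRFS_SHARED_DATA_REF_KEY, leaf bytenr)
 *
 * and only stores a reference count.
 */
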
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
static int convert_extent_item_v0(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root,
				  struct btrfs_path *path,
				  u64 owner, u32 extra_size)
{
	struct btrfs_extent_item *item;
	struct btrfs_extent_item_v0 *ei0;
	struct btrfs_extent_ref_v0 *ref0;
	struct btrfs_tree_block_info *bi;
	struct extent_buffer *leaf;
	struct btrfs_key key;
	struct btrfs_key found_key;
	u32 new_size = sizeof(*item);
	u64 refs;
	int ret;

	leaf = path->nodes[0];
	BUG_ON(btrfs_item_size_nr(leaf, path->slots[0]) != sizeof(*ei0));

	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
	ei0 = btrfs_item_ptr(leaf, path->slots[0],
			     struct btrfs_extent_item_v0);
	refs = btrfs_extent_refs_v0(leaf, ei0);

	if (owner == (u64)-1) {
		while (1) {
			if (path->slots[0] >= btrfs_header_nritems(leaf)) {
				ret = btrfs_next_leaf(root, path);
				if (ret < 0)
					return ret;
				BUG_ON(ret > 0); /* Corruption */
				leaf = path->nodes[0];
			}
			btrfs_item_key_to_cpu(leaf, &found_key,
					      path->slots[0]);
			BUG_ON(key.objectid != found_key.objectid);
			if (found_key.type != BTRFS_EXTENT_REF_V0_KEY) {
				path->slots[0]++;
				continue;
			}
			ref0 = btrfs_item_ptr(leaf, path->slots[0],
					      struct btrfs_extent_ref_v0);
			owner = btrfs_ref_objectid_v0(leaf, ref0);
			break;
		}
	}
	btrfs_release_path(path);

	if (owner < BTRFS_FIRST_FREE_OBJECTID)
		new_size += sizeof(*bi);

	new_size -= sizeof(*ei0);
	ret = btrfs_search_slot(trans, root, &key, path,
				new_size + extra_size, 1);
	if (ret < 0)
		return ret;
	BUG_ON(ret); /* Corruption */

	btrfs_extend_item(root, path, new_size);

	leaf = path->nodes[0];
	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
	btrfs_set_extent_refs(leaf, item, refs);
	/* FIXME: get real generation */
	btrfs_set_extent_generation(leaf, item, 0);
	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
		btrfs_set_extent_flags(leaf, item,
				       BTRFS_EXTENT_FLAG_TREE_BLOCK |
				       BTRFS_BLOCK_FLAG_FULL_BACKREF);
		bi = (struct btrfs_tree_block_info *)(item + 1);
		/* FIXME: get first key of the block */
		memset_extent_buffer(leaf, 0, (unsigned long)bi, sizeof(*bi));
		btrfs_set_tree_block_level(leaf, bi, (int)owner);
	} else {
		btrfs_set_extent_flags(leaf, item, BTRFS_EXTENT_FLAG_DATA);
	}
	btrfs_mark_buffer_dirty(leaf);
	return 0;
}
#endif

static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)
{
	u32 high_crc = ~(u32)0;
	u32 low_crc = ~(u32)0;
	__le64 lenum;

	lenum = cpu_to_le64(root_objectid);
	high_crc = crc32c(high_crc, &lenum, sizeof(lenum));
	lenum = cpu_to_le64(owner);
	low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
	lenum = cpu_to_le64(offset);
	low_crc = crc32c(low_crc, &lenum, sizeof(lenum));

	return ((u64)high_crc << 31) ^ (u64)low_crc;
}

static u64 hash_extent_data_ref_item(struct extent_buffer *leaf,
				     struct btrfs_extent_data_ref *ref)
{
	return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref),
				    btrfs_extent_data_ref_objectid(leaf, ref),
				    btrfs_extent_data_ref_offset(leaf, ref));
}

static int match_extent_data_ref(struct extent_buffer *leaf,
				 struct btrfs_extent_data_ref *ref,
				 u64 root_objectid, u64 owner, u64 offset)
{
	if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid ||
	    btrfs_extent_data_ref_objectid(leaf, ref) != owner ||
	    btrfs_extent_data_ref_offset(leaf, ref) != offset)
		return 0;
	return 1;
}

static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans,
					   struct btrfs_root *root,
					   struct btrfs_path *path,
					   u64 bytenr, u64 parent,
					   u64 root_objectid,
					   u64 owner, u64 offset)
{
	struct btrfs_key key;
	struct btrfs_extent_data_ref *ref;
	struct extent_buffer *leaf;
	u32 nritems;
	int ret;
	int recow;
	int err = -ENOENT;

	key.objectid = bytenr;
	if (parent) {
		key.type = BTRFS_SHARED_DATA_REF_KEY;
		key.offset = parent;
	} else {
		key.type = BTRFS_EXTENT_DATA_REF_KEY;
		key.offset = hash_extent_data_ref(root_objectid,
						  owner, offset);
	}
again:
	recow = 0;
	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret < 0) {
		err = ret;
		goto fail;
	}

	if (parent) {
		if (!ret)
			return 0;
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
		key.type = BTRFS_EXTENT_REF_V0_KEY;
		btrfs_release_path(path);
		ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
		if (ret < 0) {
			err = ret;
			goto fail;
		}
		if (!ret)
			return 0;
#endif
		goto fail;
	}

	leaf = path->nodes[0];
	nritems = btrfs_header_nritems(leaf);
	while (1) {
		if (path->slots[0] >= nritems) {
			ret = btrfs_next_leaf(root, path);
			if (ret < 0)
				err = ret;
			if (ret)
				goto fail;

			leaf = path->nodes[0];
			nritems = btrfs_header_nritems(leaf);
			recow = 1;
		}

		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
		if (key.objectid != bytenr ||
		    key.type != BTRFS_EXTENT_DATA_REF_KEY)
			goto fail;

		ref = btrfs_item_ptr(leaf, path->slots[0],
				     struct btrfs_extent_data_ref);

		if (match_extent_data_ref(leaf, ref, root_objectid,
					  owner, offset)) {
			if (recow) {
				btrfs_release_path(path);
				goto again;
			}
			err = 0;
			break;
		}
		path->slots[0]++;
	}
fail:
	return err;
}
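
/*
 * Insert (or bump the count of) a data back ref item.  Implicit back refs
 * are keyed by hash_extent_data_ref(), so two different (root, owner,
 * offset) tuples can collide; when the existing item at the hashed offset
 * does not match, we probe the following key offsets (key.offset++) until
 * we either find the matching ref or insert a new item in a free slot.
 */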
static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,
					   struct btrfs_root *root,
					   struct btrfs_path *path,
					   u64 bytenr, u64 parent,
					   u64 root_objectid, u64 owner,
					   u64 offset, int refs_to_add)
{
	struct btrfs_key key;
	struct extent_buffer *leaf;
	u32 size;
	u32 num_refs;
	int ret;

	key.objectid = bytenr;
	if (parent) {
		key.type = BTRFS_SHARED_DATA_REF_KEY;
		key.offset = parent;
		size = sizeof(struct btrfs_shared_data_ref);
	} else {
		key.type = BTRFS_EXTENT_DATA_REF_KEY;
		key.offset = hash_extent_data_ref(root_objectid,
						  owner, offset);
		size = sizeof(struct btrfs_extent_data_ref);
	}

	ret = btrfs_insert_empty_item(trans, root, path, &key, size);
	if (ret && ret != -EEXIST)
		goto fail;

	leaf = path->nodes[0];
	if (parent) {
		struct btrfs_shared_data_ref *ref;
		ref = btrfs_item_ptr(leaf, path->slots[0],
				     struct btrfs_shared_data_ref);
		if (ret == 0) {
			btrfs_set_shared_data_ref_count(leaf, ref, refs_to_add);
		} else {
			num_refs = btrfs_shared_data_ref_count(leaf, ref);
			num_refs += refs_to_add;
			btrfs_set_shared_data_ref_count(leaf, ref, num_refs);
		}
	} else {
		struct btrfs_extent_data_ref *ref;
		while (ret == -EEXIST) {
			ref = btrfs_item_ptr(leaf, path->slots[0],
					     struct btrfs_extent_data_ref);
			if (match_extent_data_ref(leaf, ref, root_objectid,
						  owner, offset))
				break;
			btrfs_release_path(path);
			key.offset++;
			ret = btrfs_insert_empty_item(trans, root, path, &key,
						      size);
			if (ret && ret != -EEXIST)
				goto fail;

			leaf = path->nodes[0];
		}
		ref = btrfs_item_ptr(leaf, path->slots[0],
				     struct btrfs_extent_data_ref);
		if (ret == 0) {
			btrfs_set_extent_data_ref_root(leaf, ref,
						       root_objectid);
			btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
			btrfs_set_extent_data_ref_offset(leaf, ref, offset);
			btrfs_set_extent_data_ref_count(leaf, ref, refs_to_add);
		} else {
			num_refs = btrfs_extent_data_ref_count(leaf, ref);
			num_refs += refs_to_add;
			btrfs_set_extent_data_ref_count(leaf, ref, num_refs);
		}
	}
	btrfs_mark_buffer_dirty(leaf);
	ret = 0;
fail:
	btrfs_release_path(path);
	return ret;
}

static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
					   struct btrfs_root *root,
					   struct btrfs_path *path,
					   int refs_to_drop)
{
	struct btrfs_key key;
	struct btrfs_extent_data_ref *ref1 = NULL;
	struct btrfs_shared_data_ref *ref2 = NULL;
	struct extent_buffer *leaf;
	u32 num_refs = 0;
	int ret = 0;

	leaf = path->nodes[0];
	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);

	if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
		ref1 = btrfs_item_ptr(leaf, path->slots[0],
				      struct btrfs_extent_data_ref);
		num_refs = btrfs_extent_data_ref_count(leaf, ref1);
	} else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
		ref2 = btrfs_item_ptr(leaf, path->slots[0],
				      struct btrfs_shared_data_ref);
		num_refs = btrfs_shared_data_ref_count(leaf, ref2);
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
	} else if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
		struct btrfs_extent_ref_v0 *ref0;
		ref0 = btrfs_item_ptr(leaf, path->slots[0],
				      struct btrfs_extent_ref_v0);
		num_refs = btrfs_ref_count_v0(leaf, ref0);
#endif
	} else {
		BUG();
	}

	BUG_ON(num_refs < refs_to_drop);
	num_refs -= refs_to_drop;

	if (num_refs == 0) {
		ret = btrfs_del_item(trans, root, path);
	} else {
		if (key.type == BTRFS_EXTENT_DATA_REF_KEY)
			btrfs_set_extent_data_ref_count(leaf, ref1, num_refs);
		else if (key.type == BTRFS_SHARED_DATA_REF_KEY)
			btrfs_set_shared_data_ref_count(leaf, ref2, num_refs);
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
		else {
			struct btrfs_extent_ref_v0 *ref0;
			ref0 = btrfs_item_ptr(leaf, path->slots[0],
					      struct btrfs_extent_ref_v0);
			btrfs_set_ref_count_v0(leaf, ref0, num_refs);
		}
#endif
		btrfs_mark_buffer_dirty(leaf);
	}
	return ret;
}

static noinline u32 extent_data_ref_count(struct btrfs_root *root,
					  struct btrfs_path *path,
					  struct btrfs_extent_inline_ref *iref)
{
	struct btrfs_key key;
	struct extent_buffer *leaf;
	struct btrfs_extent_data_ref *ref1;
	struct btrfs_shared_data_ref *ref2;
	u32 num_refs = 0;

	leaf = path->nodes[0];
	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
	if (iref) {
		if (btrfs_extent_inline_ref_type(leaf, iref) ==
		    BTRFS_EXTENT_DATA_REF_KEY) {
			ref1 = (struct btrfs_extent_data_ref *)(&iref->offset);
			num_refs = btrfs_extent_data_ref_count(leaf, ref1);
		} else {
			ref2 = (struct btrfs_shared_data_ref *)(iref + 1);
			num_refs = btrfs_shared_data_ref_count(leaf, ref2);
		}
	} else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
		ref1 = btrfs_item_ptr(leaf, path->slots[0],
				      struct btrfs_extent_data_ref);
		num_refs = btrfs_extent_data_ref_count(leaf, ref1);
	} else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
		ref2 = btrfs_item_ptr(leaf, path->slots[0],
				      struct btrfs_shared_data_ref);
		num_refs = btrfs_shared_data_ref_count(leaf, ref2);
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
	} else if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
		struct btrfs_extent_ref_v0 *ref0;
		ref0 = btrfs_item_ptr(leaf, path->slots[0],
				      struct btrfs_extent_ref_v0);
		num_refs = btrfs_ref_count_v0(leaf, ref0);
#endif
	} else {
		WARN_ON(1);
	}
	return num_refs;
}

static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans,
					  struct btrfs_root *root,
					  struct btrfs_path *path,
					  u64 bytenr, u64 parent,
					  u64 root_objectid)
{
	struct btrfs_key key;
	int ret;

	key.objectid = bytenr;
	if (parent) {
		key.type = BTRFS_SHARED_BLOCK_REF_KEY;
		key.offset = parent;
	} else {
		key.type = BTRFS_TREE_BLOCK_REF_KEY;
		key.offset = root_objectid;
	}

	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret > 0)
		ret = -ENOENT;
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
	if (ret == -ENOENT && parent) {
		btrfs_release_path(path);
		key.type = BTRFS_EXTENT_REF_V0_KEY;
		ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
		if (ret > 0)
			ret = -ENOENT;
	}
#endif
	return ret;
}

static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans,
					  struct btrfs_root *root,
					  struct btrfs_path *path,
					  u64 bytenr, u64 parent,
					  u64 root_objectid)
{
	struct btrfs_key key;
	int ret;

	key.objectid = bytenr;
	if (parent) {
		key.type = BTRFS_SHARED_BLOCK_REF_KEY;
		key.offset = parent;
	} else {
		key.type = BTRFS_TREE_BLOCK_REF_KEY;
		key.offset = root_objectid;
	}

	ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
	btrfs_release_path(path);
	return ret;
}

static inline int extent_ref_type(u64 parent, u64 owner)
{
	int type;
	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
		if (parent > 0)
			type = BTRFS_SHARED_BLOCK_REF_KEY;
		else
			type = BTRFS_TREE_BLOCK_REF_KEY;
	} else {
		if (parent > 0)
			type = BTRFS_SHARED_DATA_REF_KEY;
		else
			type = BTRFS_EXTENT_DATA_REF_KEY;
	}
	return type;
}

static int find_next_key(struct btrfs_path *path, int level,
			 struct btrfs_key *key)
{
	for (; level < BTRFS_MAX_LEVEL; level++) {
		if (!path->nodes[level])
			break;
		if (path->slots[level] + 1 >=
		    btrfs_header_nritems(path->nodes[level]))
			continue;
		if (level == 0)
			btrfs_item_key_to_cpu(path->nodes[level], key,
					      path->slots[level] + 1);
		else
			btrfs_node_key_to_cpu(path->nodes[level], key,
					      path->slots[level] + 1);
		return 0;
	}
	return 1;
}

/*
 * look for inline back ref. if back ref is found, *ref_ret is set
 * to the address of inline back ref, and 0 is returned.
 *
 * if back ref isn't found, *ref_ret is set to the address where it
 * should be inserted, and -ENOENT is returned.
 *
 * if insert is true and there are too many inline back refs, the path
 * points to the extent item, and -EAGAIN is returned.
 *
 * NOTE: inline back refs are ordered in the same way that back ref
 * items in the tree are ordered.
 */
static noinline_for_stack
int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root,
				 struct btrfs_path *path,
				 struct btrfs_extent_inline_ref **ref_ret,
				 u64 bytenr, u64 num_bytes,
				 u64 parent, u64 root_objectid,
				 u64 owner, u64 offset, int insert)
{
	struct btrfs_key key;
	struct extent_buffer *leaf;
	struct btrfs_extent_item *ei;
	struct btrfs_extent_inline_ref *iref;
	u64 flags;
	u64 item_size;
	unsigned long ptr;
	unsigned long end;
	int extra_size;
	int type;
	int want;
	int ret;
	int err = 0;
	bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
						 SKINNY_METADATA);

	key.objectid = bytenr;
	key.type = BTRFS_EXTENT_ITEM_KEY;
	key.offset = num_bytes;

	want = extent_ref_type(parent, owner);
	if (insert) {
		extra_size = btrfs_extent_inline_ref_size(want);
		path->keep_locks = 1;
	} else
		extra_size = -1;

	/*
	 * Owner is our parent level, so we can just add one to get the level
	 * for the block we are interested in.
	 */
	if (skinny_metadata && owner < BTRFS_FIRST_FREE_OBJECTID) {
		key.type = BTRFS_METADATA_ITEM_KEY;
		key.offset = owner;
	}

again:
	ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1);
	if (ret < 0) {
		err = ret;
		goto out;
	}

	/*
	 * We may be a newly converted file system which still has the old fat
	 * extent entries for metadata, so try and see if we have one of those.
	 */
	if (ret > 0 && skinny_metadata) {
		skinny_metadata = false;
		if (path->slots[0]) {
			path->slots[0]--;
			btrfs_item_key_to_cpu(path->nodes[0], &key,
					      path->slots[0]);
			if (key.objectid == bytenr &&
			    key.type == BTRFS_EXTENT_ITEM_KEY &&
			    key.offset == num_bytes)
				ret = 0;
		}
		if (ret) {
			key.type = BTRFS_EXTENT_ITEM_KEY;
			key.offset = num_bytes;
			btrfs_release_path(path);
			goto again;
		}
	}

	if (ret && !insert) {
		err = -ENOENT;
		goto out;
	} else if (ret) {
		err = -EIO;
		WARN_ON(1);
		goto out;
	}

	leaf = path->nodes[0];
	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
	if (item_size < sizeof(*ei)) {
		if (!insert) {
			err = -ENOENT;
			goto out;
		}
		ret = convert_extent_item_v0(trans, root, path, owner,
					     extra_size);
		if (ret < 0) {
			err = ret;
			goto out;
		}
		leaf = path->nodes[0];
		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
	}
#endif
	BUG_ON(item_size < sizeof(*ei));

	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
	flags = btrfs_extent_flags(leaf, ei);

	ptr = (unsigned long)(ei + 1);
	end = (unsigned long)ei + item_size;

	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK && !skinny_metadata) {
		ptr += sizeof(struct btrfs_tree_block_info);
		BUG_ON(ptr > end);
	}

	err = -ENOENT;
	while (1) {
		if (ptr >= end) {
			WARN_ON(ptr > end);
			break;
		}
		iref = (struct btrfs_extent_inline_ref *)ptr;
		type = btrfs_extent_inline_ref_type(leaf, iref);
		if (want < type)
			break;
		if (want > type) {
			ptr += btrfs_extent_inline_ref_size(type);
			continue;
		}

		if (type == BTRFS_EXTENT_DATA_REF_KEY) {
			struct btrfs_extent_data_ref *dref;
			dref = (struct btrfs_extent_data_ref *)(&iref->offset);
			if (match_extent_data_ref(leaf, dref, root_objectid,
						  owner, offset)) {
				err = 0;
				break;
			}
			if (hash_extent_data_ref_item(leaf, dref) <
			    hash_extent_data_ref(root_objectid, owner, offset))
				break;
		} else {
			u64 ref_offset;
			ref_offset = btrfs_extent_inline_ref_offset(leaf, iref);
			if (parent > 0) {
				if (parent == ref_offset) {
					err = 0;
					break;
				}
				if (ref_offset < parent)
					break;
			} else {
				if (root_objectid == ref_offset) {
					err = 0;
					break;
				}
				if (ref_offset < root_objectid)
					break;
			}
		}
		ptr += btrfs_extent_inline_ref_size(type);
	}
	if (err == -ENOENT && insert) {
		if (item_size + extra_size >=
		    BTRFS_MAX_EXTENT_ITEM_SIZE(root)) {
			err = -EAGAIN;
			goto out;
		}
		/*
		 * To add new inline back ref, we have to make sure
		 * there is no corresponding back ref item.
		 * For simplicity, we just do not add new inline back
		 * ref if there is any kind of item for this block
		 */
		if (find_next_key(path, 0, &key) == 0 &&
		    key.objectid == bytenr &&
		    key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) {
			err = -EAGAIN;
			goto out;
		}
	}
	*ref_ret = (struct btrfs_extent_inline_ref *)ptr;
out:
	if (insert) {
		path->keep_locks = 0;
		btrfs_unlock_up_safe(path, 1);
	}
	return err;
}

/*
 * helper to add new inline back ref
 */
static noinline_for_stack
void setup_inline_extent_backref(struct btrfs_root *root,
				 struct btrfs_path *path,
				 struct btrfs_extent_inline_ref *iref,
				 u64 parent, u64 root_objectid,
				 u64 owner, u64 offset, int refs_to_add,
				 struct btrfs_delayed_extent_op *extent_op)
{
	struct extent_buffer *leaf;
	struct btrfs_extent_item *ei;
	unsigned long ptr;
	unsigned long end;
	unsigned long item_offset;
	u64 refs;
	int size;
	int type;

	leaf = path->nodes[0];
	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
	item_offset = (unsigned long)iref - (unsigned long)ei;

	type = extent_ref_type(parent, owner);
	size = btrfs_extent_inline_ref_size(type);

	btrfs_extend_item(root, path, size);

	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
	refs = btrfs_extent_refs(leaf, ei);
	refs += refs_to_add;
	btrfs_set_extent_refs(leaf, ei, refs);
	if (extent_op)
		__run_delayed_extent_op(extent_op, leaf, ei);

	ptr = (unsigned long)ei + item_offset;
	end = (unsigned long)ei + btrfs_item_size_nr(leaf, path->slots[0]);
	if (ptr < end - size)
		memmove_extent_buffer(leaf, ptr + size, ptr,
				      end - size - ptr);

	iref = (struct btrfs_extent_inline_ref *)ptr;
	btrfs_set_extent_inline_ref_type(leaf, iref, type);
	if (type == BTRFS_EXTENT_DATA_REF_KEY) {
		struct btrfs_extent_data_ref *dref;
		dref = (struct btrfs_extent_data_ref *)(&iref->offset);
		btrfs_set_extent_data_ref_root(leaf, dref, root_objectid);
		btrfs_set_extent_data_ref_objectid(leaf, dref, owner);
		btrfs_set_extent_data_ref_offset(leaf, dref, offset);
		btrfs_set_extent_data_ref_count(leaf, dref, refs_to_add);
	} else if (type == BTRFS_SHARED_DATA_REF_KEY) {
		struct btrfs_shared_data_ref *sref;
		sref = (struct btrfs_shared_data_ref *)(iref + 1);
		btrfs_set_shared_data_ref_count(leaf, sref, refs_to_add);
		btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
	} else if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
		btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
	} else {
		btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
	}
	btrfs_mark_buffer_dirty(leaf);
}
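
/*
 * Look up an extent back ref of any form: try the inline refs in the
 * extent item first, and fall back to the standalone keyed ref item
 * (tree block ref or data ref) if no inline ref matches.
 */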
static int lookup_extent_backref(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root,
				 struct btrfs_path *path,
				 struct btrfs_extent_inline_ref **ref_ret,
				 u64 bytenr, u64 num_bytes, u64 parent,
				 u64 root_objectid, u64 owner, u64 offset)
{
	int ret;

	ret = lookup_inline_extent_backref(trans, root, path, ref_ret,
					   bytenr, num_bytes, parent,
					   root_objectid, owner, offset, 0);
	if (ret != -ENOENT)
		return ret;

	btrfs_release_path(path);
	*ref_ret = NULL;

	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
		ret = lookup_tree_block_ref(trans, root, path, bytenr, parent,
					    root_objectid);
	} else {
		ret = lookup_extent_data_ref(trans, root, path, bytenr, parent,
					     root_objectid, owner, offset);
	}
	return ret;
}

/*
 * helper to update/remove inline back ref
 */
static noinline_for_stack
void update_inline_extent_backref(struct btrfs_root *root,
				  struct btrfs_path *path,
				  struct btrfs_extent_inline_ref *iref,
				  int refs_to_mod,
				  struct btrfs_delayed_extent_op *extent_op)
{
	struct extent_buffer *leaf;
	struct btrfs_extent_item *ei;
	struct btrfs_extent_data_ref *dref = NULL;
	struct btrfs_shared_data_ref *sref = NULL;
	unsigned long ptr;
	unsigned long end;
	u32 item_size;
	int size;
	int type;
	u64 refs;

	leaf = path->nodes[0];
	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
	refs = btrfs_extent_refs(leaf, ei);
	WARN_ON(refs_to_mod < 0 && refs + refs_to_mod <= 0);
	refs += refs_to_mod;
	btrfs_set_extent_refs(leaf, ei, refs);
	if (extent_op)
		__run_delayed_extent_op(extent_op, leaf, ei);

	type = btrfs_extent_inline_ref_type(leaf, iref);

	if (type == BTRFS_EXTENT_DATA_REF_KEY) {
		dref = (struct btrfs_extent_data_ref *)(&iref->offset);
		refs = btrfs_extent_data_ref_count(leaf, dref);
	} else if (type == BTRFS_SHARED_DATA_REF_KEY) {
		sref = (struct btrfs_shared_data_ref *)(iref + 1);
		refs = btrfs_shared_data_ref_count(leaf, sref);
	} else {
		refs = 1;
		BUG_ON(refs_to_mod != -1);
	}

	BUG_ON(refs_to_mod < 0 && refs < -refs_to_mod);
	refs += refs_to_mod;

	if (refs > 0) {
		if (type == BTRFS_EXTENT_DATA_REF_KEY)
			btrfs_set_extent_data_ref_count(leaf, dref, refs);
		else
			btrfs_set_shared_data_ref_count(leaf, sref, refs);
	} else {
		size = btrfs_extent_inline_ref_size(type);
		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
		ptr = (unsigned long)iref;
		end = (unsigned long)ei + item_size;
		if (ptr + size < end)
			memmove_extent_buffer(leaf, ptr, ptr + size,
					      end - ptr - size);
		item_size -= size;
		btrfs_truncate_item(root, path, item_size, 1);
	}
	btrfs_mark_buffer_dirty(leaf);
}

static noinline_for_stack
int insert_inline_extent_backref(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root,
				 struct btrfs_path *path,
				 u64 bytenr, u64 num_bytes, u64 parent,
				 u64 root_objectid, u64 owner,
				 u64 offset, int refs_to_add,
				 struct btrfs_delayed_extent_op *extent_op)
{
	struct btrfs_extent_inline_ref *iref;
	int ret;

	ret = lookup_inline_extent_backref(trans, root, path, &iref,
					   bytenr, num_bytes, parent,
					   root_objectid, owner, offset, 1);
	if (ret == 0) {
		BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID);
		update_inline_extent_backref(root, path, iref,
					     refs_to_add, extent_op);
	} else if (ret == -ENOENT) {
		setup_inline_extent_backref(root, path, iref, parent,
					    root_objectid, owner, offset,
					    refs_to_add, extent_op);
		ret = 0;
	}
	return ret;
}

static int insert_extent_backref(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root,
				 struct btrfs_path *path,
				 u64 bytenr, u64 parent, u64 root_objectid,
				 u64 owner, u64 offset, int refs_to_add)
{
	int ret;
	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
		BUG_ON(refs_to_add != 1);
		ret = insert_tree_block_ref(trans, root, path, bytenr,
					    parent, root_objectid);
	} else {
		ret = insert_extent_data_ref(trans, root, path, bytenr,
					     parent, root_objectid,
					     owner, offset, refs_to_add);
	}
	return ret;
}

static int remove_extent_backref(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root,
				 struct btrfs_path *path,
				 struct btrfs_extent_inline_ref *iref,
				 int refs_to_drop, int is_data)
{
	int ret = 0;

	BUG_ON(!is_data && refs_to_drop != 1);
	if (iref) {
		update_inline_extent_backref(root, path, iref,
					     -refs_to_drop, NULL);
	} else if (is_data) {
		ret = remove_extent_data_ref(trans, root, path, refs_to_drop);
	} else {
		ret = btrfs_del_item(trans, root, path);
	}
	return ret;
}

static int btrfs_issue_discard(struct block_device *bdev,
			       u64 start, u64 len)
{
	/* the block layer takes the discard range in 512-byte sectors */
	return blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_NOFS, 0);
}

static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
				u64 num_bytes, u64 *actual_bytes)
{
	int ret;
	u64 discarded_bytes = 0;
	struct btrfs_bio *bbio = NULL;


	/* Tell the block device(s) that the sectors can be discarded */
	ret = btrfs_map_block(root->fs_info, REQ_DISCARD,
			      bytenr, &num_bytes, &bbio, 0);
	/* Error condition is -ENOMEM */
	if (!ret) {
		struct btrfs_bio_stripe *stripe = bbio->stripes;
		int i;


		for (i = 0; i < bbio->num_stripes; i++, stripe++) {
			if (!stripe->dev->can_discard)
				continue;

			ret = btrfs_issue_discard(stripe->dev->bdev,
						  stripe->physical,
						  stripe->length);
			if (!ret)
				discarded_bytes += stripe->length;
			else if (ret != -EOPNOTSUPP)
				break; /* Logic errors or -ENOMEM, or -EIO but I don't know how that could happen JDM */

			/*
			 * Just in case we get back EOPNOTSUPP for some reason,
			 * just ignore the return value so we don't screw up
			 * people calling discard_extent.
			 */
			ret = 0;
		}
		kfree(bbio);
	}

	if (actual_bytes)
		*actual_bytes = discarded_bytes;


	if (ret == -EOPNOTSUPP)
		ret = 0;
	return ret;
}

/* Can return -ENOMEM */
int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
			 struct btrfs_root *root,
			 u64 bytenr, u64 num_bytes, u64 parent,
			 u64 root_objectid, u64 owner, u64 offset, int for_cow)
{
	int ret;
	struct btrfs_fs_info *fs_info = root->fs_info;

	BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID &&
	       root_objectid == BTRFS_TREE_LOG_OBJECTID);

	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
		ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
					num_bytes,
					parent, root_objectid, (int)owner,
					BTRFS_ADD_DELAYED_REF, NULL, for_cow);
	} else {
		ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
					num_bytes,
					parent, root_objectid, owner, offset,
					BTRFS_ADD_DELAYED_REF, NULL, for_cow);
	}
	return ret;
}
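
/*
 * Add refs_to_add references to an existing extent.  The inline back ref is
 * tried first; if the ref cannot be added inline (-EAGAIN), the extent
 * item's ref count is still bumped here and the back ref is then inserted
 * as a separate keyed item.
 */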
static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root,
				  u64 bytenr, u64 num_bytes,
				  u64 parent, u64 root_objectid,
				  u64 owner, u64 offset, int refs_to_add,
				  struct btrfs_delayed_extent_op *extent_op)
{
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_extent_item *item;
	u64 refs;
	int ret;
	int err = 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	path->reada = 1;
	path->leave_spinning = 1;
	/* this will setup the path even if it fails to insert the back ref */
	ret = insert_inline_extent_backref(trans, root->fs_info->extent_root,
					   path, bytenr, num_bytes, parent,
					   root_objectid, owner, offset,
					   refs_to_add, extent_op);
	if (ret == 0)
		goto out;

	if (ret != -EAGAIN) {
		err = ret;
		goto out;
	}

	leaf = path->nodes[0];
	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
	refs = btrfs_extent_refs(leaf, item);
	btrfs_set_extent_refs(leaf, item, refs + refs_to_add);
	if (extent_op)
		__run_delayed_extent_op(extent_op, leaf, item);

	btrfs_mark_buffer_dirty(leaf);
	btrfs_release_path(path);

	path->reada = 1;
	path->leave_spinning = 1;

	/* now insert the actual backref */
	ret = insert_extent_backref(trans, root->fs_info->extent_root,
				    path, bytenr, parent, root_objectid,
				    owner, offset, refs_to_add);
	if (ret)
		btrfs_abort_transaction(trans, root, ret);
out:
	btrfs_free_path(path);
	return err;
}

static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
				struct btrfs_root *root,
				struct btrfs_delayed_ref_node *node,
				struct btrfs_delayed_extent_op *extent_op,
				int insert_reserved)
{
	int ret = 0;
	struct btrfs_delayed_data_ref *ref;
	struct btrfs_key ins;
	u64 parent = 0;
	u64 ref_root = 0;
	u64 flags = 0;

	ins.objectid = node->bytenr;
	ins.offset = node->num_bytes;
	ins.type = BTRFS_EXTENT_ITEM_KEY;

	ref = btrfs_delayed_node_to_data_ref(node);
	if (node->type == BTRFS_SHARED_DATA_REF_KEY)
		parent = ref->parent;
	else
		ref_root = ref->root;

	if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
		if (extent_op)
			flags |= extent_op->flags_to_set;
		ret = alloc_reserved_file_extent(trans, root,
						 parent, ref_root, flags,
ref->objectid, ref->offset, 2024 &ins, node->ref_mod); 2025 } else if (node->action == BTRFS_ADD_DELAYED_REF) { 2026 ret = __btrfs_inc_extent_ref(trans, root, node->bytenr, 2027 node->num_bytes, parent, 2028 ref_root, ref->objectid, 2029 ref->offset, node->ref_mod, 2030 extent_op); 2031 } else if (node->action == BTRFS_DROP_DELAYED_REF) { 2032 ret = __btrfs_free_extent(trans, root, node->bytenr, 2033 node->num_bytes, parent, 2034 ref_root, ref->objectid, 2035 ref->offset, node->ref_mod, 2036 extent_op); 2037 } else { 2038 BUG(); 2039 } 2040 return ret; 2041 } 2042 2043 static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op, 2044 struct extent_buffer *leaf, 2045 struct btrfs_extent_item *ei) 2046 { 2047 u64 flags = btrfs_extent_flags(leaf, ei); 2048 if (extent_op->update_flags) { 2049 flags |= extent_op->flags_to_set; 2050 btrfs_set_extent_flags(leaf, ei, flags); 2051 } 2052 2053 if (extent_op->update_key) { 2054 struct btrfs_tree_block_info *bi; 2055 BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)); 2056 bi = (struct btrfs_tree_block_info *)(ei + 1); 2057 btrfs_set_tree_block_key(leaf, bi, &extent_op->key); 2058 } 2059 } 2060 2061 static int run_delayed_extent_op(struct btrfs_trans_handle *trans, 2062 struct btrfs_root *root, 2063 struct btrfs_delayed_ref_node *node, 2064 struct btrfs_delayed_extent_op *extent_op) 2065 { 2066 struct btrfs_key key; 2067 struct btrfs_path *path; 2068 struct btrfs_extent_item *ei; 2069 struct extent_buffer *leaf; 2070 u32 item_size; 2071 int ret; 2072 int err = 0; 2073 int metadata = (node->type == BTRFS_TREE_BLOCK_REF_KEY || 2074 node->type == BTRFS_SHARED_BLOCK_REF_KEY); 2075 2076 if (trans->aborted) 2077 return 0; 2078 2079 if (metadata && !btrfs_fs_incompat(root->fs_info, SKINNY_METADATA)) 2080 metadata = 0; 2081 2082 path = btrfs_alloc_path(); 2083 if (!path) 2084 return -ENOMEM; 2085 2086 key.objectid = node->bytenr; 2087 2088 if (metadata) { 2089 struct btrfs_delayed_tree_ref *tree_ref; 2090 2091 tree_ref = btrfs_delayed_node_to_tree_ref(node); 2092 key.type = BTRFS_METADATA_ITEM_KEY; 2093 key.offset = tree_ref->level; 2094 } else { 2095 key.type = BTRFS_EXTENT_ITEM_KEY; 2096 key.offset = node->num_bytes; 2097 } 2098 2099 again: 2100 path->reada = 1; 2101 path->leave_spinning = 1; 2102 ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, 2103 path, 0, 1); 2104 if (ret < 0) { 2105 err = ret; 2106 goto out; 2107 } 2108 if (ret > 0) { 2109 if (metadata) { 2110 btrfs_release_path(path); 2111 metadata = 0; 2112 2113 key.offset = node->num_bytes; 2114 key.type = BTRFS_EXTENT_ITEM_KEY; 2115 goto again; 2116 } 2117 err = -EIO; 2118 goto out; 2119 } 2120 2121 leaf = path->nodes[0]; 2122 item_size = btrfs_item_size_nr(leaf, path->slots[0]); 2123 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 2124 if (item_size < sizeof(*ei)) { 2125 ret = convert_extent_item_v0(trans, root->fs_info->extent_root, 2126 path, (u64)-1, 0); 2127 if (ret < 0) { 2128 err = ret; 2129 goto out; 2130 } 2131 leaf = path->nodes[0]; 2132 item_size = btrfs_item_size_nr(leaf, path->slots[0]); 2133 } 2134 #endif 2135 BUG_ON(item_size < sizeof(*ei)); 2136 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 2137 __run_delayed_extent_op(extent_op, leaf, ei); 2138 2139 btrfs_mark_buffer_dirty(leaf); 2140 out: 2141 btrfs_free_path(path); 2142 return err; 2143 } 2144 2145 static int run_delayed_tree_ref(struct btrfs_trans_handle *trans, 2146 struct btrfs_root *root, 2147 struct btrfs_delayed_ref_node *node, 2148 struct btrfs_delayed_extent_op *extent_op, 2149 int 
insert_reserved) 2150 { 2151 int ret = 0; 2152 struct btrfs_delayed_tree_ref *ref; 2153 struct btrfs_key ins; 2154 u64 parent = 0; 2155 u64 ref_root = 0; 2156 bool skinny_metadata = btrfs_fs_incompat(root->fs_info, 2157 SKINNY_METADATA); 2158 2159 ref = btrfs_delayed_node_to_tree_ref(node); 2160 if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) 2161 parent = ref->parent; 2162 else 2163 ref_root = ref->root; 2164 2165 ins.objectid = node->bytenr; 2166 if (skinny_metadata) { 2167 ins.offset = ref->level; 2168 ins.type = BTRFS_METADATA_ITEM_KEY; 2169 } else { 2170 ins.offset = node->num_bytes; 2171 ins.type = BTRFS_EXTENT_ITEM_KEY; 2172 } 2173 2174 BUG_ON(node->ref_mod != 1); 2175 if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) { 2176 BUG_ON(!extent_op || !extent_op->update_flags); 2177 ret = alloc_reserved_tree_block(trans, root, 2178 parent, ref_root, 2179 extent_op->flags_to_set, 2180 &extent_op->key, 2181 ref->level, &ins); 2182 } else if (node->action == BTRFS_ADD_DELAYED_REF) { 2183 ret = __btrfs_inc_extent_ref(trans, root, node->bytenr, 2184 node->num_bytes, parent, ref_root, 2185 ref->level, 0, 1, extent_op); 2186 } else if (node->action == BTRFS_DROP_DELAYED_REF) { 2187 ret = __btrfs_free_extent(trans, root, node->bytenr, 2188 node->num_bytes, parent, ref_root, 2189 ref->level, 0, 1, extent_op); 2190 } else { 2191 BUG(); 2192 } 2193 return ret; 2194 } 2195 2196 /* helper function to actually process a single delayed ref entry */ 2197 static int run_one_delayed_ref(struct btrfs_trans_handle *trans, 2198 struct btrfs_root *root, 2199 struct btrfs_delayed_ref_node *node, 2200 struct btrfs_delayed_extent_op *extent_op, 2201 int insert_reserved) 2202 { 2203 int ret = 0; 2204 2205 if (trans->aborted) 2206 return 0; 2207 2208 if (btrfs_delayed_ref_is_head(node)) { 2209 struct btrfs_delayed_ref_head *head; 2210 /* 2211 * we've hit the end of the chain and we were supposed 2212 * to insert this extent into the tree. But, it got 2213 * deleted before we ever needed to insert it, so all 2214 * we have to do is clean up the accounting 2215 */ 2216 BUG_ON(extent_op); 2217 head = btrfs_delayed_node_to_head(node); 2218 if (insert_reserved) { 2219 btrfs_pin_extent(root, node->bytenr, 2220 node->num_bytes, 1); 2221 if (head->is_data) { 2222 ret = btrfs_del_csums(trans, root, 2223 node->bytenr, 2224 node->num_bytes); 2225 } 2226 } 2227 return ret; 2228 } 2229 2230 if (node->type == BTRFS_TREE_BLOCK_REF_KEY || 2231 node->type == BTRFS_SHARED_BLOCK_REF_KEY) 2232 ret = run_delayed_tree_ref(trans, root, node, extent_op, 2233 insert_reserved); 2234 else if (node->type == BTRFS_EXTENT_DATA_REF_KEY || 2235 node->type == BTRFS_SHARED_DATA_REF_KEY) 2236 ret = run_delayed_data_ref(trans, root, node, extent_op, 2237 insert_reserved); 2238 else 2239 BUG(); 2240 return ret; 2241 } 2242 2243 static noinline struct btrfs_delayed_ref_node * 2244 select_delayed_ref(struct btrfs_delayed_ref_head *head) 2245 { 2246 struct rb_node *node; 2247 struct btrfs_delayed_ref_node *ref; 2248 int action = BTRFS_ADD_DELAYED_REF; 2249 again: 2250 /* 2251 * select delayed ref of type BTRFS_ADD_DELAYED_REF first. 2252 * this prevents ref count from going down to zero when 2253 * there still are pending delayed ref. 
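* We therefore walk the head's refs twice: first accepting only
* BTRFS_ADD_DELAYED_REF entries, and only if none is found do we rescan
* accepting BTRFS_DROP_DELAYED_REF.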
2254 */ 2255 node = rb_prev(&head->node.rb_node); 2256 while (1) { 2257 if (!node) 2258 break; 2259 ref = rb_entry(node, struct btrfs_delayed_ref_node, 2260 rb_node); 2261 if (ref->bytenr != head->node.bytenr) 2262 break; 2263 if (ref->action == action) 2264 return ref; 2265 node = rb_prev(node); 2266 } 2267 if (action == BTRFS_ADD_DELAYED_REF) { 2268 action = BTRFS_DROP_DELAYED_REF; 2269 goto again; 2270 } 2271 return NULL; 2272 } 2273 2274 /* 2275 * Returns 0 on success or if called with an already aborted transaction. 2276 * Returns -ENOMEM or -EIO on failure and will abort the transaction. 2277 */ 2278 static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, 2279 struct btrfs_root *root, 2280 struct list_head *cluster) 2281 { 2282 struct btrfs_delayed_ref_root *delayed_refs; 2283 struct btrfs_delayed_ref_node *ref; 2284 struct btrfs_delayed_ref_head *locked_ref = NULL; 2285 struct btrfs_delayed_extent_op *extent_op; 2286 struct btrfs_fs_info *fs_info = root->fs_info; 2287 int ret; 2288 int count = 0; 2289 int must_insert_reserved = 0; 2290 2291 delayed_refs = &trans->transaction->delayed_refs; 2292 while (1) { 2293 if (!locked_ref) { 2294 /* pick a new head ref from the cluster list */ 2295 if (list_empty(cluster)) 2296 break; 2297 2298 locked_ref = list_entry(cluster->next, 2299 struct btrfs_delayed_ref_head, cluster); 2300 2301 /* grab the lock that says we are going to process 2302 * all the refs for this head */ 2303 ret = btrfs_delayed_ref_lock(trans, locked_ref); 2304 2305 /* 2306 * we may have dropped the spin lock to get the head 2307 * mutex lock, and that might have given someone else 2308 * time to free the head. If that's true, it has been 2309 * removed from our list and we can move on. 2310 */ 2311 if (ret == -EAGAIN) { 2312 locked_ref = NULL; 2313 count++; 2314 continue; 2315 } 2316 } 2317 2318 /* 2319 * We need to try and merge add/drops of the same ref since we 2320 * can run into issues with relocate dropping the implicit ref 2321 * and then it being added back again before the drop can 2322 * finish. If we merged anything we need to re-loop so we can 2323 * get a good ref. 2324 */ 2325 btrfs_merge_delayed_refs(trans, fs_info, delayed_refs, 2326 locked_ref); 2327 2328 /* 2329 * locked_ref is the head node, so we have to go one 2330 * node back for any delayed ref updates 2331 */ 2332 ref = select_delayed_ref(locked_ref); 2333 2334 if (ref && ref->seq && 2335 btrfs_check_delayed_seq(fs_info, delayed_refs, ref->seq)) { 2336 /* 2337 * there are still refs with lower seq numbers in the 2338 * process of being added. Don't run this ref yet. 2339 */ 2340 list_del_init(&locked_ref->cluster); 2341 btrfs_delayed_ref_unlock(locked_ref); 2342 locked_ref = NULL; 2343 delayed_refs->num_heads_ready++; 2344 spin_unlock(&delayed_refs->lock); 2345 cond_resched(); 2346 spin_lock(&delayed_refs->lock); 2347 continue; 2348 } 2349 2350 /* 2351 * record the must insert reserved flag before we 2352 * drop the spin lock. 
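* must_insert_reserved is handed down to run_one_delayed_ref() as
* insert_reserved, telling the ref handlers whether this head still owes
* the insertion of an extent item for a freshly reserved extent (or its
* cleanup if the extent was freed again before ever being inserted).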
2353 */ 2354 must_insert_reserved = locked_ref->must_insert_reserved; 2355 locked_ref->must_insert_reserved = 0; 2356 2357 extent_op = locked_ref->extent_op; 2358 locked_ref->extent_op = NULL; 2359 2360 if (!ref) { 2361 /* All delayed refs have been processed, Go ahead 2362 * and send the head node to run_one_delayed_ref, 2363 * so that any accounting fixes can happen 2364 */ 2365 ref = &locked_ref->node; 2366 2367 if (extent_op && must_insert_reserved) { 2368 btrfs_free_delayed_extent_op(extent_op); 2369 extent_op = NULL; 2370 } 2371 2372 if (extent_op) { 2373 spin_unlock(&delayed_refs->lock); 2374 2375 ret = run_delayed_extent_op(trans, root, 2376 ref, extent_op); 2377 btrfs_free_delayed_extent_op(extent_op); 2378 2379 if (ret) { 2380 btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret); 2381 spin_lock(&delayed_refs->lock); 2382 btrfs_delayed_ref_unlock(locked_ref); 2383 return ret; 2384 } 2385 2386 goto next; 2387 } 2388 } 2389 2390 ref->in_tree = 0; 2391 rb_erase(&ref->rb_node, &delayed_refs->root); 2392 delayed_refs->num_entries--; 2393 if (!btrfs_delayed_ref_is_head(ref)) { 2394 /* 2395 * when we play the delayed ref, also correct the 2396 * ref_mod on head 2397 */ 2398 switch (ref->action) { 2399 case BTRFS_ADD_DELAYED_REF: 2400 case BTRFS_ADD_DELAYED_EXTENT: 2401 locked_ref->node.ref_mod -= ref->ref_mod; 2402 break; 2403 case BTRFS_DROP_DELAYED_REF: 2404 locked_ref->node.ref_mod += ref->ref_mod; 2405 break; 2406 default: 2407 WARN_ON(1); 2408 } 2409 } 2410 spin_unlock(&delayed_refs->lock); 2411 2412 ret = run_one_delayed_ref(trans, root, ref, extent_op, 2413 must_insert_reserved); 2414 2415 btrfs_free_delayed_extent_op(extent_op); 2416 if (ret) { 2417 btrfs_delayed_ref_unlock(locked_ref); 2418 btrfs_put_delayed_ref(ref); 2419 btrfs_debug(fs_info, "run_one_delayed_ref returned %d", ret); 2420 spin_lock(&delayed_refs->lock); 2421 return ret; 2422 } 2423 2424 /* 2425 * If this node is a head, that means all the refs in this head 2426 * have been dealt with, and we will pick the next head to deal 2427 * with, so we must unlock the head and drop it from the cluster 2428 * list before we release it. 2429 */ 2430 if (btrfs_delayed_ref_is_head(ref)) { 2431 list_del_init(&locked_ref->cluster); 2432 btrfs_delayed_ref_unlock(locked_ref); 2433 locked_ref = NULL; 2434 } 2435 btrfs_put_delayed_ref(ref); 2436 count++; 2437 next: 2438 cond_resched(); 2439 spin_lock(&delayed_refs->lock); 2440 } 2441 return count; 2442 } 2443 2444 #ifdef SCRAMBLE_DELAYED_REFS 2445 /* 2446 * Normally delayed refs get processed in ascending bytenr order. This 2447 * correlates in most cases to the order added. 
To expose dependencies on this 2448 * order, we start to process the tree in the middle instead of the beginning 2449 */ 2450 static u64 find_middle(struct rb_root *root) 2451 { 2452 struct rb_node *n = root->rb_node; 2453 struct btrfs_delayed_ref_node *entry; 2454 int alt = 1; 2455 u64 middle; 2456 u64 first = 0, last = 0; 2457 2458 n = rb_first(root); 2459 if (n) { 2460 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node); 2461 first = entry->bytenr; 2462 } 2463 n = rb_last(root); 2464 if (n) { 2465 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node); 2466 last = entry->bytenr; 2467 } 2468 n = root->rb_node; 2469 2470 while (n) { 2471 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node); 2472 WARN_ON(!entry->in_tree); 2473 2474 middle = entry->bytenr; 2475 2476 if (alt) 2477 n = n->rb_left; 2478 else 2479 n = n->rb_right; 2480 2481 alt = 1 - alt; 2482 } 2483 return middle; 2484 } 2485 #endif 2486 2487 int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans, 2488 struct btrfs_fs_info *fs_info) 2489 { 2490 struct qgroup_update *qgroup_update; 2491 int ret = 0; 2492 2493 if (list_empty(&trans->qgroup_ref_list) != 2494 !trans->delayed_ref_elem.seq) { 2495 /* list without seq or seq without list */ 2496 btrfs_err(fs_info, 2497 "qgroup accounting update error, list is%s empty, seq is %#x.%x", 2498 list_empty(&trans->qgroup_ref_list) ? "" : " not", 2499 (u32)(trans->delayed_ref_elem.seq >> 32), 2500 (u32)trans->delayed_ref_elem.seq); 2501 BUG(); 2502 } 2503 2504 if (!trans->delayed_ref_elem.seq) 2505 return 0; 2506 2507 while (!list_empty(&trans->qgroup_ref_list)) { 2508 qgroup_update = list_first_entry(&trans->qgroup_ref_list, 2509 struct qgroup_update, list); 2510 list_del(&qgroup_update->list); 2511 if (!ret) 2512 ret = btrfs_qgroup_account_ref( 2513 trans, fs_info, qgroup_update->node, 2514 qgroup_update->extent_op); 2515 kfree(qgroup_update); 2516 } 2517 2518 btrfs_put_tree_mod_seq(fs_info, &trans->delayed_ref_elem); 2519 2520 return ret; 2521 } 2522 2523 static int refs_newer(struct btrfs_delayed_ref_root *delayed_refs, int seq, 2524 int count) 2525 { 2526 int val = atomic_read(&delayed_refs->ref_seq); 2527 2528 if (val < seq || val >= seq + count) 2529 return 1; 2530 return 0; 2531 } 2532 2533 /* 2534 * this starts processing the delayed reference count updates and 2535 * extent insertions we have queued up so far. count can be 2536 * 0, which means to process everything in the tree at the start 2537 * of the run (but not newly added entries), or it can be some target 2538 * number you'd like to process. 
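* A count of (unsigned long)-1 means run absolutely everything, including
* refs that are added while we are running; in that case we keep looping
* below until the delayed ref tree is empty.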
2539 * 2540 * Returns 0 on success or if called with an aborted transaction 2541 * Returns <0 on error and aborts the transaction 2542 */ 2543 int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, 2544 struct btrfs_root *root, unsigned long count) 2545 { 2546 struct rb_node *node; 2547 struct btrfs_delayed_ref_root *delayed_refs; 2548 struct btrfs_delayed_ref_node *ref; 2549 struct list_head cluster; 2550 int ret; 2551 u64 delayed_start; 2552 int run_all = count == (unsigned long)-1; 2553 int run_most = 0; 2554 int loops; 2555 2556 /* We'll clean this up in btrfs_cleanup_transaction */ 2557 if (trans->aborted) 2558 return 0; 2559 2560 if (root == root->fs_info->extent_root) 2561 root = root->fs_info->tree_root; 2562 2563 btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info); 2564 2565 delayed_refs = &trans->transaction->delayed_refs; 2566 INIT_LIST_HEAD(&cluster); 2567 if (count == 0) { 2568 count = delayed_refs->num_entries * 2; 2569 run_most = 1; 2570 } 2571 2572 if (!run_all && !run_most) { 2573 int old; 2574 int seq = atomic_read(&delayed_refs->ref_seq); 2575 2576 progress: 2577 old = atomic_cmpxchg(&delayed_refs->procs_running_refs, 0, 1); 2578 if (old) { 2579 DEFINE_WAIT(__wait); 2580 if (delayed_refs->num_entries < 16348) 2581 return 0; 2582 2583 prepare_to_wait(&delayed_refs->wait, &__wait, 2584 TASK_UNINTERRUPTIBLE); 2585 2586 old = atomic_cmpxchg(&delayed_refs->procs_running_refs, 0, 1); 2587 if (old) { 2588 schedule(); 2589 finish_wait(&delayed_refs->wait, &__wait); 2590 2591 if (!refs_newer(delayed_refs, seq, 256)) 2592 goto progress; 2593 else 2594 return 0; 2595 } else { 2596 finish_wait(&delayed_refs->wait, &__wait); 2597 goto again; 2598 } 2599 } 2600 2601 } else { 2602 atomic_inc(&delayed_refs->procs_running_refs); 2603 } 2604 2605 again: 2606 loops = 0; 2607 spin_lock(&delayed_refs->lock); 2608 2609 #ifdef SCRAMBLE_DELAYED_REFS 2610 delayed_refs->run_delayed_start = find_middle(&delayed_refs->root); 2611 #endif 2612 2613 while (1) { 2614 if (!(run_all || run_most) && 2615 delayed_refs->num_heads_ready < 64) 2616 break; 2617 2618 /* 2619 * go find something we can process in the rbtree. We start at 2620 * the beginning of the tree, and then build a cluster 2621 * of refs to process starting at the first one we are able to 2622 * lock 2623 */ 2624 delayed_start = delayed_refs->run_delayed_start; 2625 ret = btrfs_find_ref_cluster(trans, &cluster, 2626 delayed_refs->run_delayed_start); 2627 if (ret) 2628 break; 2629 2630 ret = run_clustered_refs(trans, root, &cluster); 2631 if (ret < 0) { 2632 btrfs_release_ref_cluster(&cluster); 2633 spin_unlock(&delayed_refs->lock); 2634 btrfs_abort_transaction(trans, root, ret); 2635 atomic_dec(&delayed_refs->procs_running_refs); 2636 return ret; 2637 } 2638 2639 atomic_add(ret, &delayed_refs->ref_seq); 2640 2641 count -= min_t(unsigned long, ret, count); 2642 2643 if (count == 0) 2644 break; 2645 2646 if (delayed_start >= delayed_refs->run_delayed_start) { 2647 if (loops == 0) { 2648 /* 2649 * btrfs_find_ref_cluster looped. let's do one 2650 * more cycle. if we don't run any delayed ref 2651 * during that cycle (because we can't because 2652 * all of them are blocked), bail out. 
2653 */ 2654 loops = 1; 2655 } else { 2656 /* 2657 * no runnable refs left, stop trying 2658 */ 2659 BUG_ON(run_all); 2660 break; 2661 } 2662 } 2663 if (ret) { 2664 /* refs were run, let's reset staleness detection */ 2665 loops = 0; 2666 } 2667 } 2668 2669 if (run_all) { 2670 if (!list_empty(&trans->new_bgs)) { 2671 spin_unlock(&delayed_refs->lock); 2672 btrfs_create_pending_block_groups(trans, root); 2673 spin_lock(&delayed_refs->lock); 2674 } 2675 2676 node = rb_first(&delayed_refs->root); 2677 if (!node) 2678 goto out; 2679 count = (unsigned long)-1; 2680 2681 while (node) { 2682 ref = rb_entry(node, struct btrfs_delayed_ref_node, 2683 rb_node); 2684 if (btrfs_delayed_ref_is_head(ref)) { 2685 struct btrfs_delayed_ref_head *head; 2686 2687 head = btrfs_delayed_node_to_head(ref); 2688 atomic_inc(&ref->refs); 2689 2690 spin_unlock(&delayed_refs->lock); 2691 /* 2692 * Mutex was contended, block until it's 2693 * released and try again 2694 */ 2695 mutex_lock(&head->mutex); 2696 mutex_unlock(&head->mutex); 2697 2698 btrfs_put_delayed_ref(ref); 2699 cond_resched(); 2700 goto again; 2701 } 2702 node = rb_next(node); 2703 } 2704 spin_unlock(&delayed_refs->lock); 2705 schedule_timeout(1); 2706 goto again; 2707 } 2708 out: 2709 atomic_dec(&delayed_refs->procs_running_refs); 2710 smp_mb(); 2711 if (waitqueue_active(&delayed_refs->wait)) 2712 wake_up(&delayed_refs->wait); 2713 2714 spin_unlock(&delayed_refs->lock); 2715 assert_qgroups_uptodate(trans); 2716 return 0; 2717 } 2718 2719 int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, 2720 struct btrfs_root *root, 2721 u64 bytenr, u64 num_bytes, u64 flags, 2722 int is_data) 2723 { 2724 struct btrfs_delayed_extent_op *extent_op; 2725 int ret; 2726 2727 extent_op = btrfs_alloc_delayed_extent_op(); 2728 if (!extent_op) 2729 return -ENOMEM; 2730 2731 extent_op->flags_to_set = flags; 2732 extent_op->update_flags = 1; 2733 extent_op->update_key = 0; 2734 extent_op->is_data = is_data ? 
1 : 0; 2735 2736 ret = btrfs_add_delayed_extent_op(root->fs_info, trans, bytenr, 2737 num_bytes, extent_op); 2738 if (ret) 2739 btrfs_free_delayed_extent_op(extent_op); 2740 return ret; 2741 } 2742 2743 static noinline int check_delayed_ref(struct btrfs_trans_handle *trans, 2744 struct btrfs_root *root, 2745 struct btrfs_path *path, 2746 u64 objectid, u64 offset, u64 bytenr) 2747 { 2748 struct btrfs_delayed_ref_head *head; 2749 struct btrfs_delayed_ref_node *ref; 2750 struct btrfs_delayed_data_ref *data_ref; 2751 struct btrfs_delayed_ref_root *delayed_refs; 2752 struct rb_node *node; 2753 int ret = 0; 2754 2755 ret = -ENOENT; 2756 delayed_refs = &trans->transaction->delayed_refs; 2757 spin_lock(&delayed_refs->lock); 2758 head = btrfs_find_delayed_ref_head(trans, bytenr); 2759 if (!head) 2760 goto out; 2761 2762 if (!mutex_trylock(&head->mutex)) { 2763 atomic_inc(&head->node.refs); 2764 spin_unlock(&delayed_refs->lock); 2765 2766 btrfs_release_path(path); 2767 2768 /* 2769 * Mutex was contended, block until it's released and let 2770 * caller try again 2771 */ 2772 mutex_lock(&head->mutex); 2773 mutex_unlock(&head->mutex); 2774 btrfs_put_delayed_ref(&head->node); 2775 return -EAGAIN; 2776 } 2777 2778 node = rb_prev(&head->node.rb_node); 2779 if (!node) 2780 goto out_unlock; 2781 2782 ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); 2783 2784 if (ref->bytenr != bytenr) 2785 goto out_unlock; 2786 2787 ret = 1; 2788 if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) 2789 goto out_unlock; 2790 2791 data_ref = btrfs_delayed_node_to_data_ref(ref); 2792 2793 node = rb_prev(node); 2794 if (node) { 2795 int seq = ref->seq; 2796 2797 ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); 2798 if (ref->bytenr == bytenr && ref->seq == seq) 2799 goto out_unlock; 2800 } 2801 2802 if (data_ref->root != root->root_key.objectid || 2803 data_ref->objectid != objectid || data_ref->offset != offset) 2804 goto out_unlock; 2805 2806 ret = 0; 2807 out_unlock: 2808 mutex_unlock(&head->mutex); 2809 out: 2810 spin_unlock(&delayed_refs->lock); 2811 return ret; 2812 } 2813 2814 static noinline int check_committed_ref(struct btrfs_trans_handle *trans, 2815 struct btrfs_root *root, 2816 struct btrfs_path *path, 2817 u64 objectid, u64 offset, u64 bytenr) 2818 { 2819 struct btrfs_root *extent_root = root->fs_info->extent_root; 2820 struct extent_buffer *leaf; 2821 struct btrfs_extent_data_ref *ref; 2822 struct btrfs_extent_inline_ref *iref; 2823 struct btrfs_extent_item *ei; 2824 struct btrfs_key key; 2825 u32 item_size; 2826 int ret; 2827 2828 key.objectid = bytenr; 2829 key.offset = (u64)-1; 2830 key.type = BTRFS_EXTENT_ITEM_KEY; 2831 2832 ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); 2833 if (ret < 0) 2834 goto out; 2835 BUG_ON(ret == 0); /* Corruption */ 2836 2837 ret = -ENOENT; 2838 if (path->slots[0] == 0) 2839 goto out; 2840 2841 path->slots[0]--; 2842 leaf = path->nodes[0]; 2843 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 2844 2845 if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY) 2846 goto out; 2847 2848 ret = 1; 2849 item_size = btrfs_item_size_nr(leaf, path->slots[0]); 2850 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 2851 if (item_size < sizeof(*ei)) { 2852 WARN_ON(item_size != sizeof(struct btrfs_extent_item_v0)); 2853 goto out; 2854 } 2855 #endif 2856 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 2857 2858 if (item_size != sizeof(*ei) + 2859 btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY)) 2860 goto out; 2861 2862 if 
(btrfs_extent_generation(leaf, ei) <= 2863 btrfs_root_last_snapshot(&root->root_item)) 2864 goto out; 2865 2866 iref = (struct btrfs_extent_inline_ref *)(ei + 1); 2867 if (btrfs_extent_inline_ref_type(leaf, iref) != 2868 BTRFS_EXTENT_DATA_REF_KEY) 2869 goto out; 2870 2871 ref = (struct btrfs_extent_data_ref *)(&iref->offset); 2872 if (btrfs_extent_refs(leaf, ei) != 2873 btrfs_extent_data_ref_count(leaf, ref) || 2874 btrfs_extent_data_ref_root(leaf, ref) != 2875 root->root_key.objectid || 2876 btrfs_extent_data_ref_objectid(leaf, ref) != objectid || 2877 btrfs_extent_data_ref_offset(leaf, ref) != offset) 2878 goto out; 2879 2880 ret = 0; 2881 out: 2882 return ret; 2883 } 2884 2885 int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans, 2886 struct btrfs_root *root, 2887 u64 objectid, u64 offset, u64 bytenr) 2888 { 2889 struct btrfs_path *path; 2890 int ret; 2891 int ret2; 2892 2893 path = btrfs_alloc_path(); 2894 if (!path) 2895 return -ENOENT; 2896 2897 do { 2898 ret = check_committed_ref(trans, root, path, objectid, 2899 offset, bytenr); 2900 if (ret && ret != -ENOENT) 2901 goto out; 2902 2903 ret2 = check_delayed_ref(trans, root, path, objectid, 2904 offset, bytenr); 2905 } while (ret2 == -EAGAIN); 2906 2907 if (ret2 && ret2 != -ENOENT) { 2908 ret = ret2; 2909 goto out; 2910 } 2911 2912 if (ret != -ENOENT || ret2 != -ENOENT) 2913 ret = 0; 2914 out: 2915 btrfs_free_path(path); 2916 if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID) 2917 WARN_ON(ret > 0); 2918 return ret; 2919 } 2920 2921 static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, 2922 struct btrfs_root *root, 2923 struct extent_buffer *buf, 2924 int full_backref, int inc, int for_cow) 2925 { 2926 u64 bytenr; 2927 u64 num_bytes; 2928 u64 parent; 2929 u64 ref_root; 2930 u32 nritems; 2931 struct btrfs_key key; 2932 struct btrfs_file_extent_item *fi; 2933 int i; 2934 int level; 2935 int ret = 0; 2936 int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *, 2937 u64, u64, u64, u64, u64, u64, int); 2938 2939 ref_root = btrfs_header_owner(buf); 2940 nritems = btrfs_header_nritems(buf); 2941 level = btrfs_header_level(buf); 2942 2943 if (!root->ref_cows && level == 0) 2944 return 0; 2945 2946 if (inc) 2947 process_func = btrfs_inc_extent_ref; 2948 else 2949 process_func = btrfs_free_extent; 2950 2951 if (full_backref) 2952 parent = buf->start; 2953 else 2954 parent = 0; 2955 2956 for (i = 0; i < nritems; i++) { 2957 if (level == 0) { 2958 btrfs_item_key_to_cpu(buf, &key, i); 2959 if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY) 2960 continue; 2961 fi = btrfs_item_ptr(buf, i, 2962 struct btrfs_file_extent_item); 2963 if (btrfs_file_extent_type(buf, fi) == 2964 BTRFS_FILE_EXTENT_INLINE) 2965 continue; 2966 bytenr = btrfs_file_extent_disk_bytenr(buf, fi); 2967 if (bytenr == 0) 2968 continue; 2969 2970 num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi); 2971 key.offset -= btrfs_file_extent_offset(buf, fi); 2972 ret = process_func(trans, root, bytenr, num_bytes, 2973 parent, ref_root, key.objectid, 2974 key.offset, for_cow); 2975 if (ret) 2976 goto fail; 2977 } else { 2978 bytenr = btrfs_node_blockptr(buf, i); 2979 num_bytes = btrfs_level_size(root, level - 1); 2980 ret = process_func(trans, root, bytenr, num_bytes, 2981 parent, ref_root, level - 1, 0, 2982 for_cow); 2983 if (ret) 2984 goto fail; 2985 } 2986 } 2987 return 0; 2988 fail: 2989 return ret; 2990 } 2991 2992 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, 2993 struct extent_buffer *buf, int full_backref, int 
for_cow) 2994 { 2995 return __btrfs_mod_ref(trans, root, buf, full_backref, 1, for_cow); 2996 } 2997 2998 int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, 2999 struct extent_buffer *buf, int full_backref, int for_cow) 3000 { 3001 return __btrfs_mod_ref(trans, root, buf, full_backref, 0, for_cow); 3002 } 3003 3004 static int write_one_cache_group(struct btrfs_trans_handle *trans, 3005 struct btrfs_root *root, 3006 struct btrfs_path *path, 3007 struct btrfs_block_group_cache *cache) 3008 { 3009 int ret; 3010 struct btrfs_root *extent_root = root->fs_info->extent_root; 3011 unsigned long bi; 3012 struct extent_buffer *leaf; 3013 3014 ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1); 3015 if (ret < 0) 3016 goto fail; 3017 BUG_ON(ret); /* Corruption */ 3018 3019 leaf = path->nodes[0]; 3020 bi = btrfs_item_ptr_offset(leaf, path->slots[0]); 3021 write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item)); 3022 btrfs_mark_buffer_dirty(leaf); 3023 btrfs_release_path(path); 3024 fail: 3025 if (ret) { 3026 btrfs_abort_transaction(trans, root, ret); 3027 return ret; 3028 } 3029 return 0; 3030 3031 } 3032 3033 static struct btrfs_block_group_cache * 3034 next_block_group(struct btrfs_root *root, 3035 struct btrfs_block_group_cache *cache) 3036 { 3037 struct rb_node *node; 3038 spin_lock(&root->fs_info->block_group_cache_lock); 3039 node = rb_next(&cache->cache_node); 3040 btrfs_put_block_group(cache); 3041 if (node) { 3042 cache = rb_entry(node, struct btrfs_block_group_cache, 3043 cache_node); 3044 btrfs_get_block_group(cache); 3045 } else 3046 cache = NULL; 3047 spin_unlock(&root->fs_info->block_group_cache_lock); 3048 return cache; 3049 } 3050 3051 static int cache_save_setup(struct btrfs_block_group_cache *block_group, 3052 struct btrfs_trans_handle *trans, 3053 struct btrfs_path *path) 3054 { 3055 struct btrfs_root *root = block_group->fs_info->tree_root; 3056 struct inode *inode = NULL; 3057 u64 alloc_hint = 0; 3058 int dcs = BTRFS_DC_ERROR; 3059 int num_pages = 0; 3060 int retries = 0; 3061 int ret = 0; 3062 3063 /* 3064 * If this block group is smaller than 100 megs don't bother caching the 3065 * block group. 3066 */ 3067 if (block_group->key.offset < (100 * 1024 * 1024)) { 3068 spin_lock(&block_group->lock); 3069 block_group->disk_cache_state = BTRFS_DC_WRITTEN; 3070 spin_unlock(&block_group->lock); 3071 return 0; 3072 } 3073 3074 again: 3075 inode = lookup_free_space_inode(root, block_group, path); 3076 if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) { 3077 ret = PTR_ERR(inode); 3078 btrfs_release_path(path); 3079 goto out; 3080 } 3081 3082 if (IS_ERR(inode)) { 3083 BUG_ON(retries); 3084 retries++; 3085 3086 if (block_group->ro) 3087 goto out_free; 3088 3089 ret = create_free_space_inode(root, trans, block_group, path); 3090 if (ret) 3091 goto out_free; 3092 goto again; 3093 } 3094 3095 /* We've already setup this transaction, go ahead and exit */ 3096 if (block_group->cache_generation == trans->transid && 3097 i_size_read(inode)) { 3098 dcs = BTRFS_DC_SETUP; 3099 goto out_put; 3100 } 3101 3102 /* 3103 * We want to set the generation to 0, that way if anything goes wrong 3104 * from here on out we know not to trust this cache when we load up next 3105 * time. 
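* (If everything does go well, cache_generation is bumped to the current
* transid under block_group->lock at the bottom of this function.)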
3106 */ 3107 BTRFS_I(inode)->generation = 0; 3108 ret = btrfs_update_inode(trans, root, inode); 3109 WARN_ON(ret); 3110 3111 if (i_size_read(inode) > 0) { 3112 ret = btrfs_truncate_free_space_cache(root, trans, path, 3113 inode); 3114 if (ret) 3115 goto out_put; 3116 } 3117 3118 spin_lock(&block_group->lock); 3119 if (block_group->cached != BTRFS_CACHE_FINISHED || 3120 !btrfs_test_opt(root, SPACE_CACHE)) { 3121 /* 3122 * don't bother trying to write stuff out _if_ 3123 * a) we're not cached, 3124 * b) we're with nospace_cache mount option. 3125 */ 3126 dcs = BTRFS_DC_WRITTEN; 3127 spin_unlock(&block_group->lock); 3128 goto out_put; 3129 } 3130 spin_unlock(&block_group->lock); 3131 3132 /* 3133 * Try to preallocate enough space based on how big the block group is. 3134 * Keep in mind this has to include any pinned space which could end up 3135 * taking up quite a bit since it's not folded into the other space 3136 * cache. 3137 */ 3138 num_pages = (int)div64_u64(block_group->key.offset, 256 * 1024 * 1024); 3139 if (!num_pages) 3140 num_pages = 1; 3141 3142 num_pages *= 16; 3143 num_pages *= PAGE_CACHE_SIZE; 3144 3145 ret = btrfs_check_data_free_space(inode, num_pages); 3146 if (ret) 3147 goto out_put; 3148 3149 ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages, 3150 num_pages, num_pages, 3151 &alloc_hint); 3152 if (!ret) 3153 dcs = BTRFS_DC_SETUP; 3154 btrfs_free_reserved_data_space(inode, num_pages); 3155 3156 out_put: 3157 iput(inode); 3158 out_free: 3159 btrfs_release_path(path); 3160 out: 3161 spin_lock(&block_group->lock); 3162 if (!ret && dcs == BTRFS_DC_SETUP) 3163 block_group->cache_generation = trans->transid; 3164 block_group->disk_cache_state = dcs; 3165 spin_unlock(&block_group->lock); 3166 3167 return ret; 3168 } 3169 3170 int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, 3171 struct btrfs_root *root) 3172 { 3173 struct btrfs_block_group_cache *cache; 3174 int err = 0; 3175 struct btrfs_path *path; 3176 u64 last = 0; 3177 3178 path = btrfs_alloc_path(); 3179 if (!path) 3180 return -ENOMEM; 3181 3182 again: 3183 while (1) { 3184 cache = btrfs_lookup_first_block_group(root->fs_info, last); 3185 while (cache) { 3186 if (cache->disk_cache_state == BTRFS_DC_CLEAR) 3187 break; 3188 cache = next_block_group(root, cache); 3189 } 3190 if (!cache) { 3191 if (last == 0) 3192 break; 3193 last = 0; 3194 continue; 3195 } 3196 err = cache_save_setup(cache, trans, path); 3197 last = cache->key.objectid + cache->key.offset; 3198 btrfs_put_block_group(cache); 3199 } 3200 3201 while (1) { 3202 if (last == 0) { 3203 err = btrfs_run_delayed_refs(trans, root, 3204 (unsigned long)-1); 3205 if (err) /* File system offline */ 3206 goto out; 3207 } 3208 3209 cache = btrfs_lookup_first_block_group(root->fs_info, last); 3210 while (cache) { 3211 if (cache->disk_cache_state == BTRFS_DC_CLEAR) { 3212 btrfs_put_block_group(cache); 3213 goto again; 3214 } 3215 3216 if (cache->dirty) 3217 break; 3218 cache = next_block_group(root, cache); 3219 } 3220 if (!cache) { 3221 if (last == 0) 3222 break; 3223 last = 0; 3224 continue; 3225 } 3226 3227 if (cache->disk_cache_state == BTRFS_DC_SETUP) 3228 cache->disk_cache_state = BTRFS_DC_NEED_WRITE; 3229 cache->dirty = 0; 3230 last = cache->key.objectid + cache->key.offset; 3231 3232 err = write_one_cache_group(trans, root, path, cache); 3233 if (err) /* File system offline */ 3234 goto out; 3235 3236 btrfs_put_block_group(cache); 3237 } 3238 3239 while (1) { 3240 /* 3241 * I don't think this is needed since we're just marking our 
3242 * preallocated extent as written, but just in case it can't 3243 * hurt. 3244 */ 3245 if (last == 0) { 3246 err = btrfs_run_delayed_refs(trans, root, 3247 (unsigned long)-1); 3248 if (err) /* File system offline */ 3249 goto out; 3250 } 3251 3252 cache = btrfs_lookup_first_block_group(root->fs_info, last); 3253 while (cache) { 3254 /* 3255 * Really this shouldn't happen, but it could if we 3256 * couldn't write the entire preallocated extent and 3257 * splitting the extent resulted in a new block. 3258 */ 3259 if (cache->dirty) { 3260 btrfs_put_block_group(cache); 3261 goto again; 3262 } 3263 if (cache->disk_cache_state == BTRFS_DC_NEED_WRITE) 3264 break; 3265 cache = next_block_group(root, cache); 3266 } 3267 if (!cache) { 3268 if (last == 0) 3269 break; 3270 last = 0; 3271 continue; 3272 } 3273 3274 err = btrfs_write_out_cache(root, trans, cache, path); 3275 3276 /* 3277 * If we didn't have an error then the cache state is still 3278 * NEED_WRITE, so we can set it to WRITTEN. 3279 */ 3280 if (!err && cache->disk_cache_state == BTRFS_DC_NEED_WRITE) 3281 cache->disk_cache_state = BTRFS_DC_WRITTEN; 3282 last = cache->key.objectid + cache->key.offset; 3283 btrfs_put_block_group(cache); 3284 } 3285 out: 3286 3287 btrfs_free_path(path); 3288 return err; 3289 } 3290 3291 int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr) 3292 { 3293 struct btrfs_block_group_cache *block_group; 3294 int readonly = 0; 3295 3296 block_group = btrfs_lookup_block_group(root->fs_info, bytenr); 3297 if (!block_group || block_group->ro) 3298 readonly = 1; 3299 if (block_group) 3300 btrfs_put_block_group(block_group); 3301 return readonly; 3302 } 3303 3304 static int update_space_info(struct btrfs_fs_info *info, u64 flags, 3305 u64 total_bytes, u64 bytes_used, 3306 struct btrfs_space_info **space_info) 3307 { 3308 struct btrfs_space_info *found; 3309 int i; 3310 int factor; 3311 3312 if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | 3313 BTRFS_BLOCK_GROUP_RAID10)) 3314 factor = 2; 3315 else 3316 factor = 1; 3317 3318 found = __find_space_info(info, flags); 3319 if (found) { 3320 spin_lock(&found->lock); 3321 found->total_bytes += total_bytes; 3322 found->disk_total += total_bytes * factor; 3323 found->bytes_used += bytes_used; 3324 found->disk_used += bytes_used * factor; 3325 found->full = 0; 3326 spin_unlock(&found->lock); 3327 *space_info = found; 3328 return 0; 3329 } 3330 found = kzalloc(sizeof(*found), GFP_NOFS); 3331 if (!found) 3332 return -ENOMEM; 3333 3334 for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) 3335 INIT_LIST_HEAD(&found->block_groups[i]); 3336 init_rwsem(&found->groups_sem); 3337 spin_lock_init(&found->lock); 3338 found->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK; 3339 found->total_bytes = total_bytes; 3340 found->disk_total = total_bytes * factor; 3341 found->bytes_used = bytes_used; 3342 found->disk_used = bytes_used * factor; 3343 found->bytes_pinned = 0; 3344 found->bytes_reserved = 0; 3345 found->bytes_readonly = 0; 3346 found->bytes_may_use = 0; 3347 found->full = 0; 3348 found->force_alloc = CHUNK_ALLOC_NO_FORCE; 3349 found->chunk_alloc = 0; 3350 found->flush = 0; 3351 init_waitqueue_head(&found->wait); 3352 *space_info = found; 3353 list_add_rcu(&found->list, &info->space_info); 3354 if (flags & BTRFS_BLOCK_GROUP_DATA) 3355 info->data_sinfo = found; 3356 return 0; 3357 } 3358 3359 static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) 3360 { 3361 u64 extra_flags = chunk_to_extended(flags) & 3362 BTRFS_EXTENDED_PROFILE_MASK; 3363 3364 
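/*
* Publish the newly used profile bits for this block group type. The
* profiles_lock seqlock lets readers such as get_alloc_profile() retry
* if they race with this update.
*/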
write_seqlock(&fs_info->profiles_lock); 3365 if (flags & BTRFS_BLOCK_GROUP_DATA) 3366 fs_info->avail_data_alloc_bits |= extra_flags; 3367 if (flags & BTRFS_BLOCK_GROUP_METADATA) 3368 fs_info->avail_metadata_alloc_bits |= extra_flags; 3369 if (flags & BTRFS_BLOCK_GROUP_SYSTEM) 3370 fs_info->avail_system_alloc_bits |= extra_flags; 3371 write_sequnlock(&fs_info->profiles_lock); 3372 } 3373 3374 /* 3375 * returns target flags in extended format or 0 if restripe for this 3376 * chunk_type is not in progress 3377 * 3378 * should be called with either volume_mutex or balance_lock held 3379 */ 3380 static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags) 3381 { 3382 struct btrfs_balance_control *bctl = fs_info->balance_ctl; 3383 u64 target = 0; 3384 3385 if (!bctl) 3386 return 0; 3387 3388 if (flags & BTRFS_BLOCK_GROUP_DATA && 3389 bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) { 3390 target = BTRFS_BLOCK_GROUP_DATA | bctl->data.target; 3391 } else if (flags & BTRFS_BLOCK_GROUP_SYSTEM && 3392 bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) { 3393 target = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target; 3394 } else if (flags & BTRFS_BLOCK_GROUP_METADATA && 3395 bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) { 3396 target = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target; 3397 } 3398 3399 return target; 3400 } 3401 3402 /* 3403 * @flags: available profiles in extended format (see ctree.h) 3404 * 3405 * Returns reduced profile in chunk format. If profile changing is in 3406 * progress (either running or paused) picks the target profile (if it's 3407 * already available), otherwise falls back to plain reducing. 3408 */ 3409 static u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags) 3410 { 3411 /* 3412 * we add in the count of missing devices because we want 3413 * to make sure that any RAID levels on a degraded FS 3414 * continue to be honored. 
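* e.g. a two-device RAID1 filesystem mounted with one device missing still
* sees num_devices == 2 below, so the RAID1 bit is not masked out.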
3415 */ 3416 u64 num_devices = root->fs_info->fs_devices->rw_devices + 3417 root->fs_info->fs_devices->missing_devices; 3418 u64 target; 3419 u64 tmp; 3420 3421 /* 3422 * see if restripe for this chunk_type is in progress, if so 3423 * try to reduce to the target profile 3424 */ 3425 spin_lock(&root->fs_info->balance_lock); 3426 target = get_restripe_target(root->fs_info, flags); 3427 if (target) { 3428 /* pick target profile only if it's already available */ 3429 if ((flags & target) & BTRFS_EXTENDED_PROFILE_MASK) { 3430 spin_unlock(&root->fs_info->balance_lock); 3431 return extended_to_chunk(target); 3432 } 3433 } 3434 spin_unlock(&root->fs_info->balance_lock); 3435 3436 /* First, mask out the RAID levels which aren't possible */ 3437 if (num_devices == 1) 3438 flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0 | 3439 BTRFS_BLOCK_GROUP_RAID5); 3440 if (num_devices < 3) 3441 flags &= ~BTRFS_BLOCK_GROUP_RAID6; 3442 if (num_devices < 4) 3443 flags &= ~BTRFS_BLOCK_GROUP_RAID10; 3444 3445 tmp = flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID0 | 3446 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID5 | 3447 BTRFS_BLOCK_GROUP_RAID6 | BTRFS_BLOCK_GROUP_RAID10); 3448 flags &= ~tmp; 3449 3450 if (tmp & BTRFS_BLOCK_GROUP_RAID6) 3451 tmp = BTRFS_BLOCK_GROUP_RAID6; 3452 else if (tmp & BTRFS_BLOCK_GROUP_RAID5) 3453 tmp = BTRFS_BLOCK_GROUP_RAID5; 3454 else if (tmp & BTRFS_BLOCK_GROUP_RAID10) 3455 tmp = BTRFS_BLOCK_GROUP_RAID10; 3456 else if (tmp & BTRFS_BLOCK_GROUP_RAID1) 3457 tmp = BTRFS_BLOCK_GROUP_RAID1; 3458 else if (tmp & BTRFS_BLOCK_GROUP_RAID0) 3459 tmp = BTRFS_BLOCK_GROUP_RAID0; 3460 3461 return extended_to_chunk(flags | tmp); 3462 } 3463 3464 static u64 get_alloc_profile(struct btrfs_root *root, u64 flags) 3465 { 3466 unsigned seq; 3467 3468 do { 3469 seq = read_seqbegin(&root->fs_info->profiles_lock); 3470 3471 if (flags & BTRFS_BLOCK_GROUP_DATA) 3472 flags |= root->fs_info->avail_data_alloc_bits; 3473 else if (flags & BTRFS_BLOCK_GROUP_SYSTEM) 3474 flags |= root->fs_info->avail_system_alloc_bits; 3475 else if (flags & BTRFS_BLOCK_GROUP_METADATA) 3476 flags |= root->fs_info->avail_metadata_alloc_bits; 3477 } while (read_seqretry(&root->fs_info->profiles_lock, seq)); 3478 3479 return btrfs_reduce_alloc_profile(root, flags); 3480 } 3481 3482 u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data) 3483 { 3484 u64 flags; 3485 u64 ret; 3486 3487 if (data) 3488 flags = BTRFS_BLOCK_GROUP_DATA; 3489 else if (root == root->fs_info->chunk_root) 3490 flags = BTRFS_BLOCK_GROUP_SYSTEM; 3491 else 3492 flags = BTRFS_BLOCK_GROUP_METADATA; 3493 3494 ret = get_alloc_profile(root, flags); 3495 return ret; 3496 } 3497 3498 /* 3499 * This will check the space that the inode allocates from to make sure we have 3500 * enough space for bytes. 
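* If the data space_info is short on room we first try to allocate a new
* data chunk, and failing that we commit the running transaction in the
* hope that pinned bytes are freed up; only then do we return -ENOSPC.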
3501 */ 3502 int btrfs_check_data_free_space(struct inode *inode, u64 bytes) 3503 { 3504 struct btrfs_space_info *data_sinfo; 3505 struct btrfs_root *root = BTRFS_I(inode)->root; 3506 struct btrfs_fs_info *fs_info = root->fs_info; 3507 u64 used; 3508 int ret = 0, committed = 0, alloc_chunk = 1; 3509 3510 /* make sure bytes are sectorsize aligned */ 3511 bytes = ALIGN(bytes, root->sectorsize); 3512 3513 if (root == root->fs_info->tree_root || 3514 BTRFS_I(inode)->location.objectid == BTRFS_FREE_INO_OBJECTID) { 3515 alloc_chunk = 0; 3516 committed = 1; 3517 } 3518 3519 data_sinfo = fs_info->data_sinfo; 3520 if (!data_sinfo) 3521 goto alloc; 3522 3523 again: 3524 /* make sure we have enough space to handle the data first */ 3525 spin_lock(&data_sinfo->lock); 3526 used = data_sinfo->bytes_used + data_sinfo->bytes_reserved + 3527 data_sinfo->bytes_pinned + data_sinfo->bytes_readonly + 3528 data_sinfo->bytes_may_use; 3529 3530 if (used + bytes > data_sinfo->total_bytes) { 3531 struct btrfs_trans_handle *trans; 3532 3533 /* 3534 * if we don't have enough free bytes in this space then we need 3535 * to alloc a new chunk. 3536 */ 3537 if (!data_sinfo->full && alloc_chunk) { 3538 u64 alloc_target; 3539 3540 data_sinfo->force_alloc = CHUNK_ALLOC_FORCE; 3541 spin_unlock(&data_sinfo->lock); 3542 alloc: 3543 alloc_target = btrfs_get_alloc_profile(root, 1); 3544 trans = btrfs_join_transaction(root); 3545 if (IS_ERR(trans)) 3546 return PTR_ERR(trans); 3547 3548 ret = do_chunk_alloc(trans, root->fs_info->extent_root, 3549 alloc_target, 3550 CHUNK_ALLOC_NO_FORCE); 3551 btrfs_end_transaction(trans, root); 3552 if (ret < 0) { 3553 if (ret != -ENOSPC) 3554 return ret; 3555 else 3556 goto commit_trans; 3557 } 3558 3559 if (!data_sinfo) 3560 data_sinfo = fs_info->data_sinfo; 3561 3562 goto again; 3563 } 3564 3565 /* 3566 * If we have less pinned bytes than we want to allocate then 3567 * don't bother committing the transaction, it won't help us. 3568 */ 3569 if (data_sinfo->bytes_pinned < bytes) 3570 committed = 1; 3571 spin_unlock(&data_sinfo->lock); 3572 3573 /* commit the current transaction and try again */ 3574 commit_trans: 3575 if (!committed && 3576 !atomic_read(&root->fs_info->open_ioctl_trans)) { 3577 committed = 1; 3578 trans = btrfs_join_transaction(root); 3579 if (IS_ERR(trans)) 3580 return PTR_ERR(trans); 3581 ret = btrfs_commit_transaction(trans, root); 3582 if (ret) 3583 return ret; 3584 goto again; 3585 } 3586 3587 return -ENOSPC; 3588 } 3589 data_sinfo->bytes_may_use += bytes; 3590 trace_btrfs_space_reservation(root->fs_info, "space_info", 3591 data_sinfo->flags, bytes, 1); 3592 spin_unlock(&data_sinfo->lock); 3593 3594 return 0; 3595 } 3596 3597 /* 3598 * Called if we need to clear a data reservation for this inode. 
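* This mirrors btrfs_check_data_free_space(): the byte count is aligned up
* to the sectorsize in the same way and then subtracted from bytes_may_use.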
3599 */ 3600 void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes) 3601 { 3602 struct btrfs_root *root = BTRFS_I(inode)->root; 3603 struct btrfs_space_info *data_sinfo; 3604 3605 /* make sure bytes are sectorsize aligned */ 3606 bytes = ALIGN(bytes, root->sectorsize); 3607 3608 data_sinfo = root->fs_info->data_sinfo; 3609 spin_lock(&data_sinfo->lock); 3610 data_sinfo->bytes_may_use -= bytes; 3611 trace_btrfs_space_reservation(root->fs_info, "space_info", 3612 data_sinfo->flags, bytes, 0); 3613 spin_unlock(&data_sinfo->lock); 3614 } 3615 3616 static void force_metadata_allocation(struct btrfs_fs_info *info) 3617 { 3618 struct list_head *head = &info->space_info; 3619 struct btrfs_space_info *found; 3620 3621 rcu_read_lock(); 3622 list_for_each_entry_rcu(found, head, list) { 3623 if (found->flags & BTRFS_BLOCK_GROUP_METADATA) 3624 found->force_alloc = CHUNK_ALLOC_FORCE; 3625 } 3626 rcu_read_unlock(); 3627 } 3628 3629 static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global) 3630 { 3631 return (global->size << 1); 3632 } 3633 3634 static int should_alloc_chunk(struct btrfs_root *root, 3635 struct btrfs_space_info *sinfo, int force) 3636 { 3637 struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv; 3638 u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly; 3639 u64 num_allocated = sinfo->bytes_used + sinfo->bytes_reserved; 3640 u64 thresh; 3641 3642 if (force == CHUNK_ALLOC_FORCE) 3643 return 1; 3644 3645 /* 3646 * We need to take into account the global rsv because for all intents 3647 * and purposes it's used space. Don't worry about locking the 3648 * global_rsv, it doesn't change except when the transaction commits. 3649 */ 3650 if (sinfo->flags & BTRFS_BLOCK_GROUP_METADATA) 3651 num_allocated += calc_global_rsv_need_space(global_rsv); 3652 3653 /* 3654 * in limited mode, we want to have some free space up to 3655 * about 1% of the FS size. 
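* e.g. on a 1TB filesystem the threshold below works out to roughly 10GB
* (about 1% of the total bytes, but never less than the 64MB floor).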
3656 */ 3657 if (force == CHUNK_ALLOC_LIMITED) { 3658 thresh = btrfs_super_total_bytes(root->fs_info->super_copy); 3659 thresh = max_t(u64, 64 * 1024 * 1024, 3660 div_factor_fine(thresh, 1)); 3661 3662 if (num_bytes - num_allocated < thresh) 3663 return 1; 3664 } 3665 3666 if (num_allocated + 2 * 1024 * 1024 < div_factor(num_bytes, 8)) 3667 return 0; 3668 return 1; 3669 } 3670 3671 static u64 get_system_chunk_thresh(struct btrfs_root *root, u64 type) 3672 { 3673 u64 num_dev; 3674 3675 if (type & (BTRFS_BLOCK_GROUP_RAID10 | 3676 BTRFS_BLOCK_GROUP_RAID0 | 3677 BTRFS_BLOCK_GROUP_RAID5 | 3678 BTRFS_BLOCK_GROUP_RAID6)) 3679 num_dev = root->fs_info->fs_devices->rw_devices; 3680 else if (type & BTRFS_BLOCK_GROUP_RAID1) 3681 num_dev = 2; 3682 else 3683 num_dev = 1; /* DUP or single */ 3684 3685 /* metadata for updaing devices and chunk tree */ 3686 return btrfs_calc_trans_metadata_size(root, num_dev + 1); 3687 } 3688 3689 static void check_system_chunk(struct btrfs_trans_handle *trans, 3690 struct btrfs_root *root, u64 type) 3691 { 3692 struct btrfs_space_info *info; 3693 u64 left; 3694 u64 thresh; 3695 3696 info = __find_space_info(root->fs_info, BTRFS_BLOCK_GROUP_SYSTEM); 3697 spin_lock(&info->lock); 3698 left = info->total_bytes - info->bytes_used - info->bytes_pinned - 3699 info->bytes_reserved - info->bytes_readonly; 3700 spin_unlock(&info->lock); 3701 3702 thresh = get_system_chunk_thresh(root, type); 3703 if (left < thresh && btrfs_test_opt(root, ENOSPC_DEBUG)) { 3704 btrfs_info(root->fs_info, "left=%llu, need=%llu, flags=%llu", 3705 left, thresh, type); 3706 dump_space_info(info, 0, 0); 3707 } 3708 3709 if (left < thresh) { 3710 u64 flags; 3711 3712 flags = btrfs_get_alloc_profile(root->fs_info->chunk_root, 0); 3713 btrfs_alloc_chunk(trans, root, flags); 3714 } 3715 } 3716 3717 static int do_chunk_alloc(struct btrfs_trans_handle *trans, 3718 struct btrfs_root *extent_root, u64 flags, int force) 3719 { 3720 struct btrfs_space_info *space_info; 3721 struct btrfs_fs_info *fs_info = extent_root->fs_info; 3722 int wait_for_alloc = 0; 3723 int ret = 0; 3724 3725 /* Don't re-enter if we're already allocating a chunk */ 3726 if (trans->allocating_chunk) 3727 return -ENOSPC; 3728 3729 space_info = __find_space_info(extent_root->fs_info, flags); 3730 if (!space_info) { 3731 ret = update_space_info(extent_root->fs_info, flags, 3732 0, 0, &space_info); 3733 BUG_ON(ret); /* -ENOMEM */ 3734 } 3735 BUG_ON(!space_info); /* Logic error */ 3736 3737 again: 3738 spin_lock(&space_info->lock); 3739 if (force < space_info->force_alloc) 3740 force = space_info->force_alloc; 3741 if (space_info->full) { 3742 spin_unlock(&space_info->lock); 3743 return 0; 3744 } 3745 3746 if (!should_alloc_chunk(extent_root, space_info, force)) { 3747 spin_unlock(&space_info->lock); 3748 return 0; 3749 } else if (space_info->chunk_alloc) { 3750 wait_for_alloc = 1; 3751 } else { 3752 space_info->chunk_alloc = 1; 3753 } 3754 3755 spin_unlock(&space_info->lock); 3756 3757 mutex_lock(&fs_info->chunk_mutex); 3758 3759 /* 3760 * The chunk_mutex is held throughout the entirety of a chunk 3761 * allocation, so once we've acquired the chunk_mutex we know that the 3762 * other guy is done and we need to recheck and see if we should 3763 * allocate. 
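* If someone else was allocating we simply drop the mutex again and jump
* back to the 'again' label so the recheck happens under space_info->lock.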
3764 */ 3765 if (wait_for_alloc) { 3766 mutex_unlock(&fs_info->chunk_mutex); 3767 wait_for_alloc = 0; 3768 goto again; 3769 } 3770 3771 trans->allocating_chunk = true; 3772 3773 /* 3774 * If we have mixed data/metadata chunks we want to make sure we keep 3775 * allocating mixed chunks instead of individual chunks. 3776 */ 3777 if (btrfs_mixed_space_info(space_info)) 3778 flags |= (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA); 3779 3780 /* 3781 * if we're doing a data chunk, go ahead and make sure that 3782 * we keep a reasonable number of metadata chunks allocated in the 3783 * FS as well. 3784 */ 3785 if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) { 3786 fs_info->data_chunk_allocations++; 3787 if (!(fs_info->data_chunk_allocations % 3788 fs_info->metadata_ratio)) 3789 force_metadata_allocation(fs_info); 3790 } 3791 3792 /* 3793 * Check if we have enough space in SYSTEM chunk because we may need 3794 * to update devices. 3795 */ 3796 check_system_chunk(trans, extent_root, flags); 3797 3798 ret = btrfs_alloc_chunk(trans, extent_root, flags); 3799 trans->allocating_chunk = false; 3800 3801 spin_lock(&space_info->lock); 3802 if (ret < 0 && ret != -ENOSPC) 3803 goto out; 3804 if (ret) 3805 space_info->full = 1; 3806 else 3807 ret = 1; 3808 3809 space_info->force_alloc = CHUNK_ALLOC_NO_FORCE; 3810 out: 3811 space_info->chunk_alloc = 0; 3812 spin_unlock(&space_info->lock); 3813 mutex_unlock(&fs_info->chunk_mutex); 3814 return ret; 3815 } 3816 3817 static int can_overcommit(struct btrfs_root *root, 3818 struct btrfs_space_info *space_info, u64 bytes, 3819 enum btrfs_reserve_flush_enum flush) 3820 { 3821 struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv; 3822 u64 profile = btrfs_get_alloc_profile(root, 0); 3823 u64 space_size; 3824 u64 avail; 3825 u64 used; 3826 u64 to_add; 3827 3828 used = space_info->bytes_used + space_info->bytes_reserved + 3829 space_info->bytes_pinned + space_info->bytes_readonly; 3830 3831 /* 3832 * We only want to allow over committing if we have lots of actual space 3833 * free, but if we don't have enough space to handle the global reserve 3834 * space then we could end up having a real enospc problem when trying 3835 * to allocate a chunk or some other such important allocation. 3836 */ 3837 spin_lock(&global_rsv->lock); 3838 space_size = calc_global_rsv_need_space(global_rsv); 3839 spin_unlock(&global_rsv->lock); 3840 if (used + space_size >= space_info->total_bytes) 3841 return 0; 3842 3843 used += space_info->bytes_may_use; 3844 3845 spin_lock(&root->fs_info->free_chunk_lock); 3846 avail = root->fs_info->free_chunk_space; 3847 spin_unlock(&root->fs_info->free_chunk_lock); 3848 3849 /* 3850 * If we have dup, raid1 or raid10 then only half of the free 3851 * space is actually useable. For raid56, the space info used 3852 * doesn't include the parity drive, so we don't have to 3853 * change the math 3854 */ 3855 if (profile & (BTRFS_BLOCK_GROUP_DUP | 3856 BTRFS_BLOCK_GROUP_RAID1 | 3857 BTRFS_BLOCK_GROUP_RAID10)) 3858 avail >>= 1; 3859 3860 to_add = space_info->total_bytes; 3861 3862 /* 3863 * If we aren't flushing all things, let us overcommit up to 3864 * 1/2th of the space. If we can flush, don't let us overcommit 3865 * too much, let it overcommit up to 1/8 of the space. 3866 */ 3867 if (flush == BTRFS_RESERVE_FLUSH_ALL) 3868 to_add >>= 3; 3869 else 3870 to_add >>= 1; 3871 3872 /* 3873 * Limit the overcommit to the amount of free space we could possibly 3874 * allocate for chunks. 
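* i.e. the reservation is allowed when
* used + bytes < total_bytes + min(avail, total_bytes >> shift),
* where shift is 3 (1/8th) if we may flush everything and 1 (1/2) otherwise.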
3875 */ 3876 to_add = min(avail, to_add); 3877 3878 if (used + bytes < space_info->total_bytes + to_add) 3879 return 1; 3880 return 0; 3881 } 3882 3883 static void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root, 3884 unsigned long nr_pages) 3885 { 3886 struct super_block *sb = root->fs_info->sb; 3887 int started; 3888 3889 /* If we cannot start writeback, just sync all the delalloc files. */ 3890 started = try_to_writeback_inodes_sb_nr(sb, nr_pages, 3891 WB_REASON_FS_FREE_SPACE); 3892 if (!started) { 3893 /* 3894 * We needn't worry about the filesystem going from r/w to r/o even 3895 * though we don't acquire the ->s_umount mutex, because the filesystem 3896 * should guarantee that the delalloc inodes list is empty after 3897 * the filesystem goes readonly (all dirty pages are written to 3898 * the disk). 3899 */ 3900 btrfs_start_delalloc_inodes(root, 0); 3901 if (!current->journal_info) 3902 btrfs_wait_ordered_extents(root, 0); 3903 } 3904 } 3905 3906 /* 3907 * shrink metadata reservation for delalloc 3908 */ 3909 static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig, 3910 bool wait_ordered) 3911 { 3912 struct btrfs_block_rsv *block_rsv; 3913 struct btrfs_space_info *space_info; 3914 struct btrfs_trans_handle *trans; 3915 u64 delalloc_bytes; 3916 u64 max_reclaim; 3917 long time_left; 3918 unsigned long nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT; 3919 int loops = 0; 3920 enum btrfs_reserve_flush_enum flush; 3921 3922 trans = (struct btrfs_trans_handle *)current->journal_info; 3923 block_rsv = &root->fs_info->delalloc_block_rsv; 3924 space_info = block_rsv->space_info; 3925 3926 smp_mb(); 3927 delalloc_bytes = percpu_counter_sum_positive( 3928 &root->fs_info->delalloc_bytes); 3929 if (delalloc_bytes == 0) { 3930 if (trans) 3931 return; 3932 btrfs_wait_ordered_extents(root, 0); 3933 return; 3934 } 3935 3936 while (delalloc_bytes && loops < 3) { 3937 max_reclaim = min(delalloc_bytes, to_reclaim); 3938 nr_pages = max_reclaim >> PAGE_CACHE_SHIFT; 3939 btrfs_writeback_inodes_sb_nr(root, nr_pages); 3940 /* 3941 * We need to wait for the async pages to actually start before 3942 * we do anything. 3943 */ 3944 wait_event(root->fs_info->async_submit_wait, 3945 !atomic_read(&root->fs_info->async_delalloc_pages)); 3946 3947 if (!trans) 3948 flush = BTRFS_RESERVE_FLUSH_ALL; 3949 else 3950 flush = BTRFS_RESERVE_NO_FLUSH; 3951 spin_lock(&space_info->lock); 3952 if (can_overcommit(root, space_info, orig, flush)) { 3953 spin_unlock(&space_info->lock); 3954 break; 3955 } 3956 spin_unlock(&space_info->lock); 3957 3958 loops++; 3959 if (wait_ordered && !trans) { 3960 btrfs_wait_ordered_extents(root, 0); 3961 } else { 3962 time_left = schedule_timeout_killable(1); 3963 if (time_left) 3964 break; 3965 } 3966 smp_mb(); 3967 delalloc_bytes = percpu_counter_sum_positive( 3968 &root->fs_info->delalloc_bytes); 3969 } 3970 } 3971 3972 /** 3973 * may_commit_transaction - possibly commit the transaction if it's ok to 3974 * @root - the root we're allocating for 3975 * @bytes - the number of bytes we want to reserve 3976 * @force - force the commit 3977 * 3978 * This will check to make sure that committing the transaction will actually 3979 * get us somewhere and then commit the transaction if it does. Otherwise it 3980 * will return -ENOSPC.
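* 'Getting us somewhere' means there are enough pinned bytes in this
* space_info to cover the request, or enough pinned bytes plus the delayed
* insertion reserve when that rsv shares the same space_info.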
3981 */ 3982 static int may_commit_transaction(struct btrfs_root *root, 3983 struct btrfs_space_info *space_info, 3984 u64 bytes, int force) 3985 { 3986 struct btrfs_block_rsv *delayed_rsv = &root->fs_info->delayed_block_rsv; 3987 struct btrfs_trans_handle *trans; 3988 3989 trans = (struct btrfs_trans_handle *)current->journal_info; 3990 if (trans) 3991 return -EAGAIN; 3992 3993 if (force) 3994 goto commit; 3995 3996 /* See if there is enough pinned space to make this reservation */ 3997 spin_lock(&space_info->lock); 3998 if (space_info->bytes_pinned >= bytes) { 3999 spin_unlock(&space_info->lock); 4000 goto commit; 4001 } 4002 spin_unlock(&space_info->lock); 4003 4004 /* 4005 * See if there is some space in the delayed insertion reservation for 4006 * this reservation. 4007 */ 4008 if (space_info != delayed_rsv->space_info) 4009 return -ENOSPC; 4010 4011 spin_lock(&space_info->lock); 4012 spin_lock(&delayed_rsv->lock); 4013 if (space_info->bytes_pinned + delayed_rsv->size < bytes) { 4014 spin_unlock(&delayed_rsv->lock); 4015 spin_unlock(&space_info->lock); 4016 return -ENOSPC; 4017 } 4018 spin_unlock(&delayed_rsv->lock); 4019 spin_unlock(&space_info->lock); 4020 4021 commit: 4022 trans = btrfs_join_transaction(root); 4023 if (IS_ERR(trans)) 4024 return -ENOSPC; 4025 4026 return btrfs_commit_transaction(trans, root); 4027 } 4028 4029 enum flush_state { 4030 FLUSH_DELAYED_ITEMS_NR = 1, 4031 FLUSH_DELAYED_ITEMS = 2, 4032 FLUSH_DELALLOC = 3, 4033 FLUSH_DELALLOC_WAIT = 4, 4034 ALLOC_CHUNK = 5, 4035 COMMIT_TRANS = 6, 4036 }; 4037 4038 static int flush_space(struct btrfs_root *root, 4039 struct btrfs_space_info *space_info, u64 num_bytes, 4040 u64 orig_bytes, int state) 4041 { 4042 struct btrfs_trans_handle *trans; 4043 int nr; 4044 int ret = 0; 4045 4046 switch (state) { 4047 case FLUSH_DELAYED_ITEMS_NR: 4048 case FLUSH_DELAYED_ITEMS: 4049 if (state == FLUSH_DELAYED_ITEMS_NR) { 4050 u64 bytes = btrfs_calc_trans_metadata_size(root, 1); 4051 4052 nr = (int)div64_u64(num_bytes, bytes); 4053 if (!nr) 4054 nr = 1; 4055 nr *= 2; 4056 } else { 4057 nr = -1; 4058 } 4059 trans = btrfs_join_transaction(root); 4060 if (IS_ERR(trans)) { 4061 ret = PTR_ERR(trans); 4062 break; 4063 } 4064 ret = btrfs_run_delayed_items_nr(trans, root, nr); 4065 btrfs_end_transaction(trans, root); 4066 break; 4067 case FLUSH_DELALLOC: 4068 case FLUSH_DELALLOC_WAIT: 4069 shrink_delalloc(root, num_bytes, orig_bytes, 4070 state == FLUSH_DELALLOC_WAIT); 4071 break; 4072 case ALLOC_CHUNK: 4073 trans = btrfs_join_transaction(root); 4074 if (IS_ERR(trans)) { 4075 ret = PTR_ERR(trans); 4076 break; 4077 } 4078 ret = do_chunk_alloc(trans, root->fs_info->extent_root, 4079 btrfs_get_alloc_profile(root, 0), 4080 CHUNK_ALLOC_NO_FORCE); 4081 btrfs_end_transaction(trans, root); 4082 if (ret == -ENOSPC) 4083 ret = 0; 4084 break; 4085 case COMMIT_TRANS: 4086 ret = may_commit_transaction(root, space_info, orig_bytes, 0); 4087 break; 4088 default: 4089 ret = -ENOSPC; 4090 break; 4091 } 4092 4093 return ret; 4094 } 4095 /** 4096 * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space 4097 * @root - the root we're allocating for 4098 * @block_rsv - the block_rsv we're allocating for 4099 * @orig_bytes - the number of bytes we want 4100 * @flush - whether or not we can flush to make our reservation 4101 * 4102 * This will reserve orgi_bytes number of bytes from the space info associated 4103 * with the block_rsv. If there is not enough space it will make an attempt to 4104 * flush out space to make room. 
It will do this by flushing delalloc if 4105 * possible or committing the transaction. If flush is 0 then no attempts to 4106 * regain reservations will be made and this will fail if there is not enough 4107 * space already. 4108 */ 4109 static int reserve_metadata_bytes(struct btrfs_root *root, 4110 struct btrfs_block_rsv *block_rsv, 4111 u64 orig_bytes, 4112 enum btrfs_reserve_flush_enum flush) 4113 { 4114 struct btrfs_space_info *space_info = block_rsv->space_info; 4115 u64 used; 4116 u64 num_bytes = orig_bytes; 4117 int flush_state = FLUSH_DELAYED_ITEMS_NR; 4118 int ret = 0; 4119 bool flushing = false; 4120 4121 again: 4122 ret = 0; 4123 spin_lock(&space_info->lock); 4124 /* 4125 * We only want to wait if somebody other than us is flushing and we 4126 * are actually allowed to flush all things. 4127 */ 4128 while (flush == BTRFS_RESERVE_FLUSH_ALL && !flushing && 4129 space_info->flush) { 4130 spin_unlock(&space_info->lock); 4131 /* 4132 * If we have a trans handle we can't wait because the flusher 4133 * may have to commit the transaction, which would mean we would 4134 * deadlock since we are waiting for the flusher to finish, but 4135 * hold the current transaction open. 4136 */ 4137 if (current->journal_info) 4138 return -EAGAIN; 4139 ret = wait_event_killable(space_info->wait, !space_info->flush); 4140 /* Must have been killed, return */ 4141 if (ret) 4142 return -EINTR; 4143 4144 spin_lock(&space_info->lock); 4145 } 4146 4147 ret = -ENOSPC; 4148 used = space_info->bytes_used + space_info->bytes_reserved + 4149 space_info->bytes_pinned + space_info->bytes_readonly + 4150 space_info->bytes_may_use; 4151 4152 /* 4153 * The idea here is that we've not already over-reserved the block group 4154 * then we can go ahead and save our reservation first and then start 4155 * flushing if we need to. Otherwise if we've already overcommitted 4156 * lets start flushing stuff first and then come back and try to make 4157 * our reservation. 4158 */ 4159 if (used <= space_info->total_bytes) { 4160 if (used + orig_bytes <= space_info->total_bytes) { 4161 space_info->bytes_may_use += orig_bytes; 4162 trace_btrfs_space_reservation(root->fs_info, 4163 "space_info", space_info->flags, orig_bytes, 1); 4164 ret = 0; 4165 } else { 4166 /* 4167 * Ok set num_bytes to orig_bytes since we aren't 4168 * overocmmitted, this way we only try and reclaim what 4169 * we need. 4170 */ 4171 num_bytes = orig_bytes; 4172 } 4173 } else { 4174 /* 4175 * Ok we're over committed, set num_bytes to the overcommitted 4176 * amount plus the amount of bytes that we need for this 4177 * reservation. 4178 */ 4179 num_bytes = used - space_info->total_bytes + 4180 (orig_bytes * 2); 4181 } 4182 4183 if (ret && can_overcommit(root, space_info, orig_bytes, flush)) { 4184 space_info->bytes_may_use += orig_bytes; 4185 trace_btrfs_space_reservation(root->fs_info, "space_info", 4186 space_info->flags, orig_bytes, 4187 1); 4188 ret = 0; 4189 } 4190 4191 /* 4192 * Couldn't make our reservation, save our place so while we're trying 4193 * to reclaim space we can actually use it instead of somebody else 4194 * stealing it from us. 4195 * 4196 * We make the other tasks wait for the flush only when we can flush 4197 * all things. 
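 *
 * For reference, the retry loop below walks the flush states in order
 * (editor's summary of the code that follows, not new behaviour):
 *
 *	FLUSH_DELAYED_ITEMS_NR -> FLUSH_DELAYED_ITEMS -> FLUSH_DELALLOC
 *	  -> FLUSH_DELALLOC_WAIT -> ALLOC_CHUNK -> COMMIT_TRANS
 *
 * BTRFS_RESERVE_FLUSH_LIMIT skips the two delalloc states and gives up
 * before COMMIT_TRANS, while BTRFS_RESERVE_FLUSH_ALL runs every state.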
4198 */ 4199 if (ret && flush != BTRFS_RESERVE_NO_FLUSH) { 4200 flushing = true; 4201 space_info->flush = 1; 4202 } 4203 4204 spin_unlock(&space_info->lock); 4205 4206 if (!ret || flush == BTRFS_RESERVE_NO_FLUSH) 4207 goto out; 4208 4209 ret = flush_space(root, space_info, num_bytes, orig_bytes, 4210 flush_state); 4211 flush_state++; 4212 4213 /* 4214 * If we are FLUSH_LIMIT, we can not flush delalloc, or the deadlock 4215 * would happen. So skip delalloc flush. 4216 */ 4217 if (flush == BTRFS_RESERVE_FLUSH_LIMIT && 4218 (flush_state == FLUSH_DELALLOC || 4219 flush_state == FLUSH_DELALLOC_WAIT)) 4220 flush_state = ALLOC_CHUNK; 4221 4222 if (!ret) 4223 goto again; 4224 else if (flush == BTRFS_RESERVE_FLUSH_LIMIT && 4225 flush_state < COMMIT_TRANS) 4226 goto again; 4227 else if (flush == BTRFS_RESERVE_FLUSH_ALL && 4228 flush_state <= COMMIT_TRANS) 4229 goto again; 4230 4231 out: 4232 if (ret == -ENOSPC && 4233 unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) { 4234 struct btrfs_block_rsv *global_rsv = 4235 &root->fs_info->global_block_rsv; 4236 4237 if (block_rsv != global_rsv && 4238 !block_rsv_use_bytes(global_rsv, orig_bytes)) 4239 ret = 0; 4240 } 4241 if (flushing) { 4242 spin_lock(&space_info->lock); 4243 space_info->flush = 0; 4244 wake_up_all(&space_info->wait); 4245 spin_unlock(&space_info->lock); 4246 } 4247 return ret; 4248 } 4249 4250 static struct btrfs_block_rsv *get_block_rsv( 4251 const struct btrfs_trans_handle *trans, 4252 const struct btrfs_root *root) 4253 { 4254 struct btrfs_block_rsv *block_rsv = NULL; 4255 4256 if (root->ref_cows) 4257 block_rsv = trans->block_rsv; 4258 4259 if (root == root->fs_info->csum_root && trans->adding_csums) 4260 block_rsv = trans->block_rsv; 4261 4262 if (!block_rsv) 4263 block_rsv = root->block_rsv; 4264 4265 if (!block_rsv) 4266 block_rsv = &root->fs_info->empty_block_rsv; 4267 4268 return block_rsv; 4269 } 4270 4271 static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv, 4272 u64 num_bytes) 4273 { 4274 int ret = -ENOSPC; 4275 spin_lock(&block_rsv->lock); 4276 if (block_rsv->reserved >= num_bytes) { 4277 block_rsv->reserved -= num_bytes; 4278 if (block_rsv->reserved < block_rsv->size) 4279 block_rsv->full = 0; 4280 ret = 0; 4281 } 4282 spin_unlock(&block_rsv->lock); 4283 return ret; 4284 } 4285 4286 static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv, 4287 u64 num_bytes, int update_size) 4288 { 4289 spin_lock(&block_rsv->lock); 4290 block_rsv->reserved += num_bytes; 4291 if (update_size) 4292 block_rsv->size += num_bytes; 4293 else if (block_rsv->reserved >= block_rsv->size) 4294 block_rsv->full = 1; 4295 spin_unlock(&block_rsv->lock); 4296 } 4297 4298 static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info, 4299 struct btrfs_block_rsv *block_rsv, 4300 struct btrfs_block_rsv *dest, u64 num_bytes) 4301 { 4302 struct btrfs_space_info *space_info = block_rsv->space_info; 4303 4304 spin_lock(&block_rsv->lock); 4305 if (num_bytes == (u64)-1) 4306 num_bytes = block_rsv->size; 4307 block_rsv->size -= num_bytes; 4308 if (block_rsv->reserved >= block_rsv->size) { 4309 num_bytes = block_rsv->reserved - block_rsv->size; 4310 block_rsv->reserved = block_rsv->size; 4311 block_rsv->full = 1; 4312 } else { 4313 num_bytes = 0; 4314 } 4315 spin_unlock(&block_rsv->lock); 4316 4317 if (num_bytes > 0) { 4318 if (dest) { 4319 spin_lock(&dest->lock); 4320 if (!dest->full) { 4321 u64 bytes_to_add; 4322 4323 bytes_to_add = dest->size - dest->reserved; 4324 bytes_to_add = min(num_bytes, bytes_to_add); 4325 
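/*
 * Editor's note: num_bytes is now the excess that was backing block_rsv
 * beyond its (just reduced) size. It first tops up @dest, typically the
 * global reserve, up to dest->size; only whatever is left over falls
 * through to the bytes_may_use adjustment further down.
 */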
dest->reserved += bytes_to_add; 4326 if (dest->reserved >= dest->size) 4327 dest->full = 1; 4328 num_bytes -= bytes_to_add; 4329 } 4330 spin_unlock(&dest->lock); 4331 } 4332 if (num_bytes) { 4333 spin_lock(&space_info->lock); 4334 space_info->bytes_may_use -= num_bytes; 4335 trace_btrfs_space_reservation(fs_info, "space_info", 4336 space_info->flags, num_bytes, 0); 4337 space_info->reservation_progress++; 4338 spin_unlock(&space_info->lock); 4339 } 4340 } 4341 } 4342 4343 static int block_rsv_migrate_bytes(struct btrfs_block_rsv *src, 4344 struct btrfs_block_rsv *dst, u64 num_bytes) 4345 { 4346 int ret; 4347 4348 ret = block_rsv_use_bytes(src, num_bytes); 4349 if (ret) 4350 return ret; 4351 4352 block_rsv_add_bytes(dst, num_bytes, 1); 4353 return 0; 4354 } 4355 4356 void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type) 4357 { 4358 memset(rsv, 0, sizeof(*rsv)); 4359 spin_lock_init(&rsv->lock); 4360 rsv->type = type; 4361 } 4362 4363 struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root, 4364 unsigned short type) 4365 { 4366 struct btrfs_block_rsv *block_rsv; 4367 struct btrfs_fs_info *fs_info = root->fs_info; 4368 4369 block_rsv = kmalloc(sizeof(*block_rsv), GFP_NOFS); 4370 if (!block_rsv) 4371 return NULL; 4372 4373 btrfs_init_block_rsv(block_rsv, type); 4374 block_rsv->space_info = __find_space_info(fs_info, 4375 BTRFS_BLOCK_GROUP_METADATA); 4376 return block_rsv; 4377 } 4378 4379 void btrfs_free_block_rsv(struct btrfs_root *root, 4380 struct btrfs_block_rsv *rsv) 4381 { 4382 if (!rsv) 4383 return; 4384 btrfs_block_rsv_release(root, rsv, (u64)-1); 4385 kfree(rsv); 4386 } 4387 4388 int btrfs_block_rsv_add(struct btrfs_root *root, 4389 struct btrfs_block_rsv *block_rsv, u64 num_bytes, 4390 enum btrfs_reserve_flush_enum flush) 4391 { 4392 int ret; 4393 4394 if (num_bytes == 0) 4395 return 0; 4396 4397 ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush); 4398 if (!ret) { 4399 block_rsv_add_bytes(block_rsv, num_bytes, 1); 4400 return 0; 4401 } 4402 4403 return ret; 4404 } 4405 4406 int btrfs_block_rsv_check(struct btrfs_root *root, 4407 struct btrfs_block_rsv *block_rsv, int min_factor) 4408 { 4409 u64 num_bytes = 0; 4410 int ret = -ENOSPC; 4411 4412 if (!block_rsv) 4413 return 0; 4414 4415 spin_lock(&block_rsv->lock); 4416 num_bytes = div_factor(block_rsv->size, min_factor); 4417 if (block_rsv->reserved >= num_bytes) 4418 ret = 0; 4419 spin_unlock(&block_rsv->lock); 4420 4421 return ret; 4422 } 4423 4424 int btrfs_block_rsv_refill(struct btrfs_root *root, 4425 struct btrfs_block_rsv *block_rsv, u64 min_reserved, 4426 enum btrfs_reserve_flush_enum flush) 4427 { 4428 u64 num_bytes = 0; 4429 int ret = -ENOSPC; 4430 4431 if (!block_rsv) 4432 return 0; 4433 4434 spin_lock(&block_rsv->lock); 4435 num_bytes = min_reserved; 4436 if (block_rsv->reserved >= num_bytes) 4437 ret = 0; 4438 else 4439 num_bytes -= block_rsv->reserved; 4440 spin_unlock(&block_rsv->lock); 4441 4442 if (!ret) 4443 return 0; 4444 4445 ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush); 4446 if (!ret) { 4447 block_rsv_add_bytes(block_rsv, num_bytes, 0); 4448 return 0; 4449 } 4450 4451 return ret; 4452 } 4453 4454 int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, 4455 struct btrfs_block_rsv *dst_rsv, 4456 u64 num_bytes) 4457 { 4458 return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes); 4459 } 4460 4461 void btrfs_block_rsv_release(struct btrfs_root *root, 4462 struct btrfs_block_rsv *block_rsv, 4463 u64 num_bytes) 4464 { 4465 struct btrfs_block_rsv 
*global_rsv = &root->fs_info->global_block_rsv; 4466 if (global_rsv->full || global_rsv == block_rsv || 4467 block_rsv->space_info != global_rsv->space_info) 4468 global_rsv = NULL; 4469 block_rsv_release_bytes(root->fs_info, block_rsv, global_rsv, 4470 num_bytes); 4471 } 4472 4473 /* 4474 * helper to calculate size of global block reservation. 4475 * the desired value is sum of space used by extent tree, 4476 * checksum tree and root tree 4477 */ 4478 static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info) 4479 { 4480 struct btrfs_space_info *sinfo; 4481 u64 num_bytes; 4482 u64 meta_used; 4483 u64 data_used; 4484 int csum_size = btrfs_super_csum_size(fs_info->super_copy); 4485 4486 sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA); 4487 spin_lock(&sinfo->lock); 4488 data_used = sinfo->bytes_used; 4489 spin_unlock(&sinfo->lock); 4490 4491 sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); 4492 spin_lock(&sinfo->lock); 4493 if (sinfo->flags & BTRFS_BLOCK_GROUP_DATA) 4494 data_used = 0; 4495 meta_used = sinfo->bytes_used; 4496 spin_unlock(&sinfo->lock); 4497 4498 num_bytes = (data_used >> fs_info->sb->s_blocksize_bits) * 4499 csum_size * 2; 4500 num_bytes += div64_u64(data_used + meta_used, 50); 4501 4502 if (num_bytes * 3 > meta_used) 4503 num_bytes = div64_u64(meta_used, 3); 4504 4505 return ALIGN(num_bytes, fs_info->extent_root->leafsize << 10); 4506 } 4507 4508 static void update_global_block_rsv(struct btrfs_fs_info *fs_info) 4509 { 4510 struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv; 4511 struct btrfs_space_info *sinfo = block_rsv->space_info; 4512 u64 num_bytes; 4513 4514 num_bytes = calc_global_metadata_size(fs_info); 4515 4516 spin_lock(&sinfo->lock); 4517 spin_lock(&block_rsv->lock); 4518 4519 block_rsv->size = min_t(u64, num_bytes, 512 * 1024 * 1024); 4520 4521 num_bytes = sinfo->bytes_used + sinfo->bytes_pinned + 4522 sinfo->bytes_reserved + sinfo->bytes_readonly + 4523 sinfo->bytes_may_use; 4524 4525 if (sinfo->total_bytes > num_bytes) { 4526 num_bytes = sinfo->total_bytes - num_bytes; 4527 block_rsv->reserved += num_bytes; 4528 sinfo->bytes_may_use += num_bytes; 4529 trace_btrfs_space_reservation(fs_info, "space_info", 4530 sinfo->flags, num_bytes, 1); 4531 } 4532 4533 if (block_rsv->reserved >= block_rsv->size) { 4534 num_bytes = block_rsv->reserved - block_rsv->size; 4535 sinfo->bytes_may_use -= num_bytes; 4536 trace_btrfs_space_reservation(fs_info, "space_info", 4537 sinfo->flags, num_bytes, 0); 4538 sinfo->reservation_progress++; 4539 block_rsv->reserved = block_rsv->size; 4540 block_rsv->full = 1; 4541 } 4542 4543 spin_unlock(&block_rsv->lock); 4544 spin_unlock(&sinfo->lock); 4545 } 4546 4547 static void init_global_block_rsv(struct btrfs_fs_info *fs_info) 4548 { 4549 struct btrfs_space_info *space_info; 4550 4551 space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM); 4552 fs_info->chunk_block_rsv.space_info = space_info; 4553 4554 space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); 4555 fs_info->global_block_rsv.space_info = space_info; 4556 fs_info->delalloc_block_rsv.space_info = space_info; 4557 fs_info->trans_block_rsv.space_info = space_info; 4558 fs_info->empty_block_rsv.space_info = space_info; 4559 fs_info->delayed_block_rsv.space_info = space_info; 4560 4561 fs_info->extent_root->block_rsv = &fs_info->global_block_rsv; 4562 fs_info->csum_root->block_rsv = &fs_info->global_block_rsv; 4563 fs_info->dev_root->block_rsv = &fs_info->global_block_rsv; 4564 fs_info->tree_root->block_rsv = 
&fs_info->global_block_rsv; 4565 fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv; 4566 4567 update_global_block_rsv(fs_info); 4568 } 4569
4570 static void release_global_block_rsv(struct btrfs_fs_info *fs_info) 4571 { 4572 block_rsv_release_bytes(fs_info, &fs_info->global_block_rsv, NULL, 4573 (u64)-1); 4574 WARN_ON(fs_info->delalloc_block_rsv.size > 0); 4575 WARN_ON(fs_info->delalloc_block_rsv.reserved > 0); 4576 WARN_ON(fs_info->trans_block_rsv.size > 0); 4577 WARN_ON(fs_info->trans_block_rsv.reserved > 0); 4578 WARN_ON(fs_info->chunk_block_rsv.size > 0); 4579 WARN_ON(fs_info->chunk_block_rsv.reserved > 0); 4580 WARN_ON(fs_info->delayed_block_rsv.size > 0); 4581 WARN_ON(fs_info->delayed_block_rsv.reserved > 0); 4582 } 4583
4584 void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, 4585 struct btrfs_root *root) 4586 { 4587 if (!trans->block_rsv) 4588 return; 4589 4590 if (!trans->bytes_reserved) 4591 return; 4592
4593 trace_btrfs_space_reservation(root->fs_info, "transaction", 4594 trans->transid, trans->bytes_reserved, 0); 4595 btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved); 4596 trans->bytes_reserved = 0; 4597 } 4598
4599 /* Can only return 0 or -ENOSPC */ 4600 int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans, 4601 struct inode *inode) 4602 { 4603 struct btrfs_root *root = BTRFS_I(inode)->root; 4604 struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root); 4605 struct btrfs_block_rsv *dst_rsv = root->orphan_block_rsv; 4606
4607 /* 4608 * We need to hold space in order to delete our orphan item once we've 4609 * added it, so this takes the reservation so we can release it later 4610 * when we are truly done with the orphan item. 4611 */ 4612 u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1); 4613 trace_btrfs_space_reservation(root->fs_info, "orphan", 4614 btrfs_ino(inode), num_bytes, 1); 4615 return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes); 4616 } 4617
4618 void btrfs_orphan_release_metadata(struct inode *inode) 4619 { 4620 struct btrfs_root *root = BTRFS_I(inode)->root; 4621 u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1); 4622 trace_btrfs_space_reservation(root->fs_info, "orphan", 4623 btrfs_ino(inode), num_bytes, 0); 4624 btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes); 4625 } 4626
4627 /* 4628 * btrfs_subvolume_reserve_metadata() - reserve space for subvolume operation 4629 * root: the root of the parent directory 4630 * rsv: block reservation 4631 * items: the number of items that we need to reserve 4632 * qgroup_reserved: used to return the reserved size in qgroup 4633 * 4634 * This function is used to reserve the space for snapshot/subvolume 4635 * creation and deletion. Those operations are different from the 4636 * common file/directory operations: they change two fs/file trees 4637 * and the root tree, and the number of items that the qgroup reserves is 4638 * different from the free space reservation. So we can not use 4639 * the space reservation mechanism in start_transaction().
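 *
 * A hedged usage sketch (editor's illustration, error handling trimmed;
 * it assumes the BTRFS_BLOCK_RSV_TEMP reservation type from ctree.h and a
 * caller-chosen item count):
 *
 *	struct btrfs_block_rsv rsv;
 *	u64 qgroup_reserved;
 *
 *	btrfs_init_block_rsv(&rsv, BTRFS_BLOCK_RSV_TEMP);
 *	ret = btrfs_subvolume_reserve_metadata(root, &rsv, items,
 *					       &qgroup_reserved);
 *	if (ret)
 *		return ret;
 *	... create or delete the subvolume, using &rsv as the block rsv ...
 *	btrfs_subvolume_release_metadata(root, &rsv, qgroup_reserved);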
4640 */ 4641 int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, 4642 struct btrfs_block_rsv *rsv, 4643 int items, 4644 u64 *qgroup_reserved) 4645 { 4646 u64 num_bytes; 4647 int ret; 4648
4649 if (root->fs_info->quota_enabled) { 4650 /* One for parent inode, two for dir entries */ 4651 num_bytes = 3 * root->leafsize; 4652 ret = btrfs_qgroup_reserve(root, num_bytes); 4653 if (ret) 4654 return ret; 4655 } else { 4656 num_bytes = 0; 4657 } 4658
4659 *qgroup_reserved = num_bytes; 4660 4661 num_bytes = btrfs_calc_trans_metadata_size(root, items); 4662 rsv->space_info = __find_space_info(root->fs_info, 4663 BTRFS_BLOCK_GROUP_METADATA); 4664 ret = btrfs_block_rsv_add(root, rsv, num_bytes, 4665 BTRFS_RESERVE_FLUSH_ALL); 4666 if (ret) { 4667 if (*qgroup_reserved) 4668 btrfs_qgroup_free(root, *qgroup_reserved); 4669 } 4670 4671 return ret; 4672 } 4673
4674 void btrfs_subvolume_release_metadata(struct btrfs_root *root, 4675 struct btrfs_block_rsv *rsv, 4676 u64 qgroup_reserved) 4677 { 4678 btrfs_block_rsv_release(root, rsv, (u64)-1); 4679 if (qgroup_reserved) 4680 btrfs_qgroup_free(root, qgroup_reserved); 4681 } 4682
4683 /** 4684 * drop_outstanding_extent - drop an outstanding extent 4685 * @inode: the inode we're dropping the extent for 4686 * 4687 * This is called when we are freeing up an outstanding extent, either after 4688 * an error or after an extent is written. This will return the number of 4689 * reserved extents that need to be freed. This must be called with 4690 * BTRFS_I(inode)->lock held. 4691 */ 4692 static unsigned drop_outstanding_extent(struct inode *inode) 4693 { 4694 unsigned drop_inode_space = 0; 4695 unsigned dropped_extents = 0; 4696
4697 BUG_ON(!BTRFS_I(inode)->outstanding_extents); 4698 BTRFS_I(inode)->outstanding_extents--; 4699
4700 if (BTRFS_I(inode)->outstanding_extents == 0 && 4701 test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED, 4702 &BTRFS_I(inode)->runtime_flags)) 4703 drop_inode_space = 1; 4704
4705 /* 4706 * If we have as many or more outstanding extents than we have 4707 * reserved, then we need to leave the reserved extents count alone. 4708 */ 4709 if (BTRFS_I(inode)->outstanding_extents >= 4710 BTRFS_I(inode)->reserved_extents) 4711 return drop_inode_space; 4712
4713 dropped_extents = BTRFS_I(inode)->reserved_extents - 4714 BTRFS_I(inode)->outstanding_extents; 4715 BTRFS_I(inode)->reserved_extents -= dropped_extents; 4716 return dropped_extents + drop_inode_space; 4717 } 4718
4719 /** 4720 * calc_csum_metadata_size - return the amount of metadata space that must be 4721 * reserved/free'd for the given bytes. 4722 * @inode: the inode we're manipulating 4723 * @num_bytes: the number of bytes in question 4724 * @reserve: 1 if we are reserving space, 0 if we are freeing space 4725 * 4726 * This adjusts the number of csum_bytes in the inode and then returns the 4727 * correct amount of metadata that must either be reserved or freed. We 4728 * calculate how many checksums we can fit into one leaf and then divide the 4729 * number of bytes that will need to be checksummed by this value to figure out 4730 * how many checksums will be required. If we are adding bytes then the number 4731 * may go up and we will return the number of additional bytes that must be 4732 * reserved. If it is going down we will return the number of bytes that must 4733 * be freed. 4734 * 4735 * This must be called with BTRFS_I(inode)->lock held.
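 *
 * In effect (editor's restatement of the math below, not a change in
 * behaviour):
 *
 *	csums(x) = DIV_ROUND_UP(x / sectorsize, num_csums_per_leaf)
 *	delta    = csums(new csum_bytes) - csums(old csum_bytes)
 *	return     btrfs_calc_trans_metadata_size(root, abs(delta))
 *
 * where the new csum_bytes is the old value plus or minus @num_bytes
 * depending on @reserve, and a delta of zero returns 0.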
4736 */ 4737 static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes, 4738 int reserve) 4739 { 4740 struct btrfs_root *root = BTRFS_I(inode)->root; 4741 u64 csum_size; 4742 int num_csums_per_leaf; 4743 int num_csums; 4744 int old_csums; 4745 4746 if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM && 4747 BTRFS_I(inode)->csum_bytes == 0) 4748 return 0; 4749 4750 old_csums = (int)div64_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize); 4751 if (reserve) 4752 BTRFS_I(inode)->csum_bytes += num_bytes; 4753 else 4754 BTRFS_I(inode)->csum_bytes -= num_bytes; 4755 csum_size = BTRFS_LEAF_DATA_SIZE(root) - sizeof(struct btrfs_item); 4756 num_csums_per_leaf = (int)div64_u64(csum_size, 4757 sizeof(struct btrfs_csum_item) + 4758 sizeof(struct btrfs_disk_key)); 4759 num_csums = (int)div64_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize); 4760 num_csums = num_csums + num_csums_per_leaf - 1; 4761 num_csums = num_csums / num_csums_per_leaf; 4762 4763 old_csums = old_csums + num_csums_per_leaf - 1; 4764 old_csums = old_csums / num_csums_per_leaf; 4765 4766 /* No change, no need to reserve more */ 4767 if (old_csums == num_csums) 4768 return 0; 4769 4770 if (reserve) 4771 return btrfs_calc_trans_metadata_size(root, 4772 num_csums - old_csums); 4773 4774 return btrfs_calc_trans_metadata_size(root, old_csums - num_csums); 4775 } 4776 4777 int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) 4778 { 4779 struct btrfs_root *root = BTRFS_I(inode)->root; 4780 struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv; 4781 u64 to_reserve = 0; 4782 u64 csum_bytes; 4783 unsigned nr_extents = 0; 4784 int extra_reserve = 0; 4785 enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL; 4786 int ret = 0; 4787 bool delalloc_lock = true; 4788 u64 to_free = 0; 4789 unsigned dropped; 4790 4791 /* If we are a free space inode we need to not flush since we will be in 4792 * the middle of a transaction commit. We also don't need the delalloc 4793 * mutex since we won't race with anybody. We need this mostly to make 4794 * lockdep shut its filthy mouth. 4795 */ 4796 if (btrfs_is_free_space_inode(inode)) { 4797 flush = BTRFS_RESERVE_NO_FLUSH; 4798 delalloc_lock = false; 4799 } 4800 4801 if (flush != BTRFS_RESERVE_NO_FLUSH && 4802 btrfs_transaction_in_commit(root->fs_info)) 4803 schedule_timeout(1); 4804 4805 if (delalloc_lock) 4806 mutex_lock(&BTRFS_I(inode)->delalloc_mutex); 4807 4808 num_bytes = ALIGN(num_bytes, root->sectorsize); 4809 4810 spin_lock(&BTRFS_I(inode)->lock); 4811 BTRFS_I(inode)->outstanding_extents++; 4812 4813 if (BTRFS_I(inode)->outstanding_extents > 4814 BTRFS_I(inode)->reserved_extents) 4815 nr_extents = BTRFS_I(inode)->outstanding_extents - 4816 BTRFS_I(inode)->reserved_extents; 4817 4818 /* 4819 * Add an item to reserve for updating the inode when we complete the 4820 * delalloc io. 
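 *
 * (Editor's note, summarizing the lines below: to_reserve ends up as
 *  btrfs_calc_trans_metadata_size(root, nr_extents) for the new extent
 *  items plus the optional inode update, plus
 *  calc_csum_metadata_size(inode, num_bytes, 1) for the checksums this
 *  range will eventually need.)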
4821 */ 4822 if (!test_bit(BTRFS_INODE_DELALLOC_META_RESERVED, 4823 &BTRFS_I(inode)->runtime_flags)) { 4824 nr_extents++; 4825 extra_reserve = 1; 4826 } 4827 4828 to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents); 4829 to_reserve += calc_csum_metadata_size(inode, num_bytes, 1); 4830 csum_bytes = BTRFS_I(inode)->csum_bytes; 4831 spin_unlock(&BTRFS_I(inode)->lock); 4832 4833 if (root->fs_info->quota_enabled) { 4834 ret = btrfs_qgroup_reserve(root, num_bytes + 4835 nr_extents * root->leafsize); 4836 if (ret) 4837 goto out_fail; 4838 } 4839 4840 ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush); 4841 if (unlikely(ret)) { 4842 if (root->fs_info->quota_enabled) 4843 btrfs_qgroup_free(root, num_bytes + 4844 nr_extents * root->leafsize); 4845 goto out_fail; 4846 } 4847 4848 spin_lock(&BTRFS_I(inode)->lock); 4849 if (extra_reserve) { 4850 set_bit(BTRFS_INODE_DELALLOC_META_RESERVED, 4851 &BTRFS_I(inode)->runtime_flags); 4852 nr_extents--; 4853 } 4854 BTRFS_I(inode)->reserved_extents += nr_extents; 4855 spin_unlock(&BTRFS_I(inode)->lock); 4856 4857 if (delalloc_lock) 4858 mutex_unlock(&BTRFS_I(inode)->delalloc_mutex); 4859 4860 if (to_reserve) 4861 trace_btrfs_space_reservation(root->fs_info,"delalloc", 4862 btrfs_ino(inode), to_reserve, 1); 4863 block_rsv_add_bytes(block_rsv, to_reserve, 1); 4864 4865 return 0; 4866 4867 out_fail: 4868 spin_lock(&BTRFS_I(inode)->lock); 4869 dropped = drop_outstanding_extent(inode); 4870 /* 4871 * If the inodes csum_bytes is the same as the original 4872 * csum_bytes then we know we haven't raced with any free()ers 4873 * so we can just reduce our inodes csum bytes and carry on. 4874 */ 4875 if (BTRFS_I(inode)->csum_bytes == csum_bytes) { 4876 calc_csum_metadata_size(inode, num_bytes, 0); 4877 } else { 4878 u64 orig_csum_bytes = BTRFS_I(inode)->csum_bytes; 4879 u64 bytes; 4880 4881 /* 4882 * This is tricky, but first we need to figure out how much we 4883 * free'd from any free-ers that occured during this 4884 * reservation, so we reset ->csum_bytes to the csum_bytes 4885 * before we dropped our lock, and then call the free for the 4886 * number of bytes that were freed while we were trying our 4887 * reservation. 4888 */ 4889 bytes = csum_bytes - BTRFS_I(inode)->csum_bytes; 4890 BTRFS_I(inode)->csum_bytes = csum_bytes; 4891 to_free = calc_csum_metadata_size(inode, bytes, 0); 4892 4893 4894 /* 4895 * Now we need to see how much we would have freed had we not 4896 * been making this reservation and our ->csum_bytes were not 4897 * artificially inflated. 4898 */ 4899 BTRFS_I(inode)->csum_bytes = csum_bytes - num_bytes; 4900 bytes = csum_bytes - orig_csum_bytes; 4901 bytes = calc_csum_metadata_size(inode, bytes, 0); 4902 4903 /* 4904 * Now reset ->csum_bytes to what it should be. If bytes is 4905 * more than to_free then we would have free'd more space had we 4906 * not had an artificially high ->csum_bytes, so we need to free 4907 * the remainder. If bytes is the same or less then we don't 4908 * need to do anything, the other free-ers did the correct 4909 * thing. 
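 *
 * Editor's illustration of the above with symbols rather than exact byte
 * counts: if the inode held C csum bytes before this call, we bumped it
 * to C + num_bytes, and racing free-ers removed F while we were unlocked,
 * then ->csum_bytes is C + num_bytes - F on entry here. The first
 * calculation frees F against the inflated base C + num_bytes, the second
 * computes what freeing F against the real base C would have released,
 * and only the shortfall between the two is added to to_free, with
 * ->csum_bytes finally left at C - F.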
4910 */ 4911 BTRFS_I(inode)->csum_bytes = orig_csum_bytes - num_bytes; 4912 if (bytes > to_free) 4913 to_free = bytes - to_free; 4914 else 4915 to_free = 0; 4916 } 4917 spin_unlock(&BTRFS_I(inode)->lock); 4918 if (dropped) 4919 to_free += btrfs_calc_trans_metadata_size(root, dropped); 4920 4921 if (to_free) { 4922 btrfs_block_rsv_release(root, block_rsv, to_free); 4923 trace_btrfs_space_reservation(root->fs_info, "delalloc", 4924 btrfs_ino(inode), to_free, 0); 4925 } 4926 if (delalloc_lock) 4927 mutex_unlock(&BTRFS_I(inode)->delalloc_mutex); 4928 return ret; 4929 } 4930 4931 /** 4932 * btrfs_delalloc_release_metadata - release a metadata reservation for an inode 4933 * @inode: the inode to release the reservation for 4934 * @num_bytes: the number of bytes we're releasing 4935 * 4936 * This will release the metadata reservation for an inode. This can be called 4937 * once we complete IO for a given set of bytes to release their metadata 4938 * reservations. 4939 */ 4940 void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes) 4941 { 4942 struct btrfs_root *root = BTRFS_I(inode)->root; 4943 u64 to_free = 0; 4944 unsigned dropped; 4945 4946 num_bytes = ALIGN(num_bytes, root->sectorsize); 4947 spin_lock(&BTRFS_I(inode)->lock); 4948 dropped = drop_outstanding_extent(inode); 4949 4950 if (num_bytes) 4951 to_free = calc_csum_metadata_size(inode, num_bytes, 0); 4952 spin_unlock(&BTRFS_I(inode)->lock); 4953 if (dropped > 0) 4954 to_free += btrfs_calc_trans_metadata_size(root, dropped); 4955 4956 trace_btrfs_space_reservation(root->fs_info, "delalloc", 4957 btrfs_ino(inode), to_free, 0); 4958 if (root->fs_info->quota_enabled) { 4959 btrfs_qgroup_free(root, num_bytes + 4960 dropped * root->leafsize); 4961 } 4962 4963 btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv, 4964 to_free); 4965 } 4966 4967 /** 4968 * btrfs_delalloc_reserve_space - reserve data and metadata space for delalloc 4969 * @inode: inode we're writing to 4970 * @num_bytes: the number of bytes we want to allocate 4971 * 4972 * This will do the following things 4973 * 4974 * o reserve space in the data space info for num_bytes 4975 * o reserve space in the metadata space info based on number of outstanding 4976 * extents and how much csums will be needed 4977 * o add to the inodes ->delalloc_bytes 4978 * o add it to the fs_info's delalloc inodes list. 4979 * 4980 * This will return 0 for success and -ENOSPC if there is no space left. 4981 */ 4982 int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes) 4983 { 4984 int ret; 4985 4986 ret = btrfs_check_data_free_space(inode, num_bytes); 4987 if (ret) 4988 return ret; 4989 4990 ret = btrfs_delalloc_reserve_metadata(inode, num_bytes); 4991 if (ret) { 4992 btrfs_free_reserved_data_space(inode, num_bytes); 4993 return ret; 4994 } 4995 4996 return 0; 4997 } 4998 4999 /** 5000 * btrfs_delalloc_release_space - release data and metadata space for delalloc 5001 * @inode: inode we're releasing space for 5002 * @num_bytes: the number of bytes we want to free up 5003 * 5004 * This must be matched with a call to btrfs_delalloc_reserve_space. This is 5005 * called in the case that we don't need the metadata AND data reservations 5006 * anymore. So if there is an error or we insert an inline extent. 5007 * 5008 * This function will release the metadata space that was not used and will 5009 * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes 5010 * list if there are no delalloc bytes left. 
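 *
 * A minimal usage sketch (editor's illustration; the real write path is
 * more involved, and do_the_write() is a hypothetical helper):
 *
 *	ret = btrfs_delalloc_reserve_space(inode, num_bytes);
 *	if (ret)
 *		return ret;
 *	ret = do_the_write(inode, num_bytes);
 *	if (ret)
 *		btrfs_delalloc_release_space(inode, num_bytes);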
5011 */ 5012 void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes) 5013 { 5014 btrfs_delalloc_release_metadata(inode, num_bytes); 5015 btrfs_free_reserved_data_space(inode, num_bytes); 5016 } 5017 5018 static int update_block_group(struct btrfs_root *root, 5019 u64 bytenr, u64 num_bytes, int alloc) 5020 { 5021 struct btrfs_block_group_cache *cache = NULL; 5022 struct btrfs_fs_info *info = root->fs_info; 5023 u64 total = num_bytes; 5024 u64 old_val; 5025 u64 byte_in_group; 5026 int factor; 5027 5028 /* block accounting for super block */ 5029 spin_lock(&info->delalloc_lock); 5030 old_val = btrfs_super_bytes_used(info->super_copy); 5031 if (alloc) 5032 old_val += num_bytes; 5033 else 5034 old_val -= num_bytes; 5035 btrfs_set_super_bytes_used(info->super_copy, old_val); 5036 spin_unlock(&info->delalloc_lock); 5037 5038 while (total) { 5039 cache = btrfs_lookup_block_group(info, bytenr); 5040 if (!cache) 5041 return -ENOENT; 5042 if (cache->flags & (BTRFS_BLOCK_GROUP_DUP | 5043 BTRFS_BLOCK_GROUP_RAID1 | 5044 BTRFS_BLOCK_GROUP_RAID10)) 5045 factor = 2; 5046 else 5047 factor = 1; 5048 /* 5049 * If this block group has free space cache written out, we 5050 * need to make sure to load it if we are removing space. This 5051 * is because we need the unpinning stage to actually add the 5052 * space back to the block group, otherwise we will leak space. 5053 */ 5054 if (!alloc && cache->cached == BTRFS_CACHE_NO) 5055 cache_block_group(cache, 1); 5056 5057 byte_in_group = bytenr - cache->key.objectid; 5058 WARN_ON(byte_in_group > cache->key.offset); 5059 5060 spin_lock(&cache->space_info->lock); 5061 spin_lock(&cache->lock); 5062 5063 if (btrfs_test_opt(root, SPACE_CACHE) && 5064 cache->disk_cache_state < BTRFS_DC_CLEAR) 5065 cache->disk_cache_state = BTRFS_DC_CLEAR; 5066 5067 cache->dirty = 1; 5068 old_val = btrfs_block_group_used(&cache->item); 5069 num_bytes = min(total, cache->key.offset - byte_in_group); 5070 if (alloc) { 5071 old_val += num_bytes; 5072 btrfs_set_block_group_used(&cache->item, old_val); 5073 cache->reserved -= num_bytes; 5074 cache->space_info->bytes_reserved -= num_bytes; 5075 cache->space_info->bytes_used += num_bytes; 5076 cache->space_info->disk_used += num_bytes * factor; 5077 spin_unlock(&cache->lock); 5078 spin_unlock(&cache->space_info->lock); 5079 } else { 5080 old_val -= num_bytes; 5081 btrfs_set_block_group_used(&cache->item, old_val); 5082 cache->pinned += num_bytes; 5083 cache->space_info->bytes_pinned += num_bytes; 5084 cache->space_info->bytes_used -= num_bytes; 5085 cache->space_info->disk_used -= num_bytes * factor; 5086 spin_unlock(&cache->lock); 5087 spin_unlock(&cache->space_info->lock); 5088 5089 set_extent_dirty(info->pinned_extents, 5090 bytenr, bytenr + num_bytes - 1, 5091 GFP_NOFS | __GFP_NOFAIL); 5092 } 5093 btrfs_put_block_group(cache); 5094 total -= num_bytes; 5095 bytenr += num_bytes; 5096 } 5097 return 0; 5098 } 5099 5100 static u64 first_logical_byte(struct btrfs_root *root, u64 search_start) 5101 { 5102 struct btrfs_block_group_cache *cache; 5103 u64 bytenr; 5104 5105 spin_lock(&root->fs_info->block_group_cache_lock); 5106 bytenr = root->fs_info->first_logical_byte; 5107 spin_unlock(&root->fs_info->block_group_cache_lock); 5108 5109 if (bytenr < (u64)-1) 5110 return bytenr; 5111 5112 cache = btrfs_lookup_first_block_group(root->fs_info, search_start); 5113 if (!cache) 5114 return 0; 5115 5116 bytenr = cache->key.objectid; 5117 btrfs_put_block_group(cache); 5118 5119 return bytenr; 5120 } 5121 5122 static int pin_down_extent(struct 
btrfs_root *root, 5123 struct btrfs_block_group_cache *cache, 5124 u64 bytenr, u64 num_bytes, int reserved) 5125 { 5126 spin_lock(&cache->space_info->lock); 5127 spin_lock(&cache->lock); 5128 cache->pinned += num_bytes; 5129 cache->space_info->bytes_pinned += num_bytes; 5130 if (reserved) { 5131 cache->reserved -= num_bytes; 5132 cache->space_info->bytes_reserved -= num_bytes; 5133 } 5134 spin_unlock(&cache->lock); 5135 spin_unlock(&cache->space_info->lock); 5136 5137 set_extent_dirty(root->fs_info->pinned_extents, bytenr, 5138 bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL); 5139 return 0; 5140 } 5141 5142 /* 5143 * this function must be called within transaction 5144 */ 5145 int btrfs_pin_extent(struct btrfs_root *root, 5146 u64 bytenr, u64 num_bytes, int reserved) 5147 { 5148 struct btrfs_block_group_cache *cache; 5149 5150 cache = btrfs_lookup_block_group(root->fs_info, bytenr); 5151 BUG_ON(!cache); /* Logic error */ 5152 5153 pin_down_extent(root, cache, bytenr, num_bytes, reserved); 5154 5155 btrfs_put_block_group(cache); 5156 return 0; 5157 } 5158 5159 /* 5160 * this function must be called within transaction 5161 */ 5162 int btrfs_pin_extent_for_log_replay(struct btrfs_root *root, 5163 u64 bytenr, u64 num_bytes) 5164 { 5165 struct btrfs_block_group_cache *cache; 5166 int ret; 5167 5168 cache = btrfs_lookup_block_group(root->fs_info, bytenr); 5169 if (!cache) 5170 return -EINVAL; 5171 5172 /* 5173 * pull in the free space cache (if any) so that our pin 5174 * removes the free space from the cache. We have load_only set 5175 * to one because the slow code to read in the free extents does check 5176 * the pinned extents. 5177 */ 5178 cache_block_group(cache, 1); 5179 5180 pin_down_extent(root, cache, bytenr, num_bytes, 0); 5181 5182 /* remove us from the free space cache (if we're there at all) */ 5183 ret = btrfs_remove_free_space(cache, bytenr, num_bytes); 5184 btrfs_put_block_group(cache); 5185 return ret; 5186 } 5187 5188 /** 5189 * btrfs_update_reserved_bytes - update the block_group and space info counters 5190 * @cache: The cache we are manipulating 5191 * @num_bytes: The number of bytes in question 5192 * @reserve: One of the reservation enums 5193 * 5194 * This is called by the allocator when it reserves space, or by somebody who is 5195 * freeing space that was never actually used on disk. For example if you 5196 * reserve some space for a new leaf in transaction A and before transaction A 5197 * commits you free that leaf, you call this with reserve set to 0 in order to 5198 * clear the reservation. 5199 * 5200 * Metadata reservations should be called with RESERVE_ALLOC so we do the proper 5201 * ENOSPC accounting. For data we handle the reservation through clearing the 5202 * delalloc bits in the io_tree. We have to do this since we could end up 5203 * allocating less disk space for the amount of data we have reserved in the 5204 * case of compression. 5205 * 5206 * If this is a reservation and the block group has become read only we cannot 5207 * make the reservation and return -EAGAIN, otherwise this function always 5208 * succeeds. 
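 *
 * Summarized as counter movements (editor's restatement of the body
 * below):
 *
 *	RESERVE_ALLOC:            bytes_may_use -> bytes_reserved
 *	RESERVE_ALLOC_NO_ACCOUNT: nothing        -> bytes_reserved
 *	RESERVE_FREE:             bytes_reserved -> readonly or plain free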
5209 */ 5210 static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, 5211 u64 num_bytes, int reserve) 5212 { 5213 struct btrfs_space_info *space_info = cache->space_info; 5214 int ret = 0; 5215 5216 spin_lock(&space_info->lock); 5217 spin_lock(&cache->lock); 5218 if (reserve != RESERVE_FREE) { 5219 if (cache->ro) { 5220 ret = -EAGAIN; 5221 } else { 5222 cache->reserved += num_bytes; 5223 space_info->bytes_reserved += num_bytes; 5224 if (reserve == RESERVE_ALLOC) { 5225 trace_btrfs_space_reservation(cache->fs_info, 5226 "space_info", space_info->flags, 5227 num_bytes, 0); 5228 space_info->bytes_may_use -= num_bytes; 5229 } 5230 } 5231 } else { 5232 if (cache->ro) 5233 space_info->bytes_readonly += num_bytes; 5234 cache->reserved -= num_bytes; 5235 space_info->bytes_reserved -= num_bytes; 5236 space_info->reservation_progress++; 5237 } 5238 spin_unlock(&cache->lock); 5239 spin_unlock(&space_info->lock); 5240 return ret; 5241 } 5242 5243 void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, 5244 struct btrfs_root *root) 5245 { 5246 struct btrfs_fs_info *fs_info = root->fs_info; 5247 struct btrfs_caching_control *next; 5248 struct btrfs_caching_control *caching_ctl; 5249 struct btrfs_block_group_cache *cache; 5250 5251 down_write(&fs_info->extent_commit_sem); 5252 5253 list_for_each_entry_safe(caching_ctl, next, 5254 &fs_info->caching_block_groups, list) { 5255 cache = caching_ctl->block_group; 5256 if (block_group_cache_done(cache)) { 5257 cache->last_byte_to_unpin = (u64)-1; 5258 list_del_init(&caching_ctl->list); 5259 put_caching_control(caching_ctl); 5260 } else { 5261 cache->last_byte_to_unpin = caching_ctl->progress; 5262 } 5263 } 5264 5265 if (fs_info->pinned_extents == &fs_info->freed_extents[0]) 5266 fs_info->pinned_extents = &fs_info->freed_extents[1]; 5267 else 5268 fs_info->pinned_extents = &fs_info->freed_extents[0]; 5269 5270 up_write(&fs_info->extent_commit_sem); 5271 5272 update_global_block_rsv(fs_info); 5273 } 5274 5275 static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end) 5276 { 5277 struct btrfs_fs_info *fs_info = root->fs_info; 5278 struct btrfs_block_group_cache *cache = NULL; 5279 struct btrfs_space_info *space_info; 5280 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; 5281 u64 len; 5282 bool readonly; 5283 5284 while (start <= end) { 5285 readonly = false; 5286 if (!cache || 5287 start >= cache->key.objectid + cache->key.offset) { 5288 if (cache) 5289 btrfs_put_block_group(cache); 5290 cache = btrfs_lookup_block_group(fs_info, start); 5291 BUG_ON(!cache); /* Logic error */ 5292 } 5293 5294 len = cache->key.objectid + cache->key.offset - start; 5295 len = min(len, end + 1 - start); 5296 5297 if (start < cache->last_byte_to_unpin) { 5298 len = min(len, cache->last_byte_to_unpin - start); 5299 btrfs_add_free_space(cache, start, len); 5300 } 5301 5302 start += len; 5303 space_info = cache->space_info; 5304 5305 spin_lock(&space_info->lock); 5306 spin_lock(&cache->lock); 5307 cache->pinned -= len; 5308 space_info->bytes_pinned -= len; 5309 if (cache->ro) { 5310 space_info->bytes_readonly += len; 5311 readonly = true; 5312 } 5313 spin_unlock(&cache->lock); 5314 if (!readonly && global_rsv->space_info == space_info) { 5315 spin_lock(&global_rsv->lock); 5316 if (!global_rsv->full) { 5317 len = min(len, global_rsv->size - 5318 global_rsv->reserved); 5319 global_rsv->reserved += len; 5320 space_info->bytes_may_use += len; 5321 if (global_rsv->reserved >= global_rsv->size) 5322 global_rsv->full = 1; 5323 } 
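/*
 * Editor's note: any part of this range that the global reserve did not
 * absorb above simply remains plain free space in the space_info; its
 * pinned accounting was already dropped before this lock was taken.
 */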
5324 spin_unlock(&global_rsv->lock); 5325 } 5326 spin_unlock(&space_info->lock); 5327 } 5328 5329 if (cache) 5330 btrfs_put_block_group(cache); 5331 return 0; 5332 } 5333 5334 int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, 5335 struct btrfs_root *root) 5336 { 5337 struct btrfs_fs_info *fs_info = root->fs_info; 5338 struct extent_io_tree *unpin; 5339 u64 start; 5340 u64 end; 5341 int ret; 5342 5343 if (trans->aborted) 5344 return 0; 5345 5346 if (fs_info->pinned_extents == &fs_info->freed_extents[0]) 5347 unpin = &fs_info->freed_extents[1]; 5348 else 5349 unpin = &fs_info->freed_extents[0]; 5350 5351 while (1) { 5352 ret = find_first_extent_bit(unpin, 0, &start, &end, 5353 EXTENT_DIRTY, NULL); 5354 if (ret) 5355 break; 5356 5357 if (btrfs_test_opt(root, DISCARD)) 5358 ret = btrfs_discard_extent(root, start, 5359 end + 1 - start, NULL); 5360 5361 clear_extent_dirty(unpin, start, end, GFP_NOFS); 5362 unpin_extent_range(root, start, end); 5363 cond_resched(); 5364 } 5365 5366 return 0; 5367 } 5368 5369 static int __btrfs_free_extent(struct btrfs_trans_handle *trans, 5370 struct btrfs_root *root, 5371 u64 bytenr, u64 num_bytes, u64 parent, 5372 u64 root_objectid, u64 owner_objectid, 5373 u64 owner_offset, int refs_to_drop, 5374 struct btrfs_delayed_extent_op *extent_op) 5375 { 5376 struct btrfs_key key; 5377 struct btrfs_path *path; 5378 struct btrfs_fs_info *info = root->fs_info; 5379 struct btrfs_root *extent_root = info->extent_root; 5380 struct extent_buffer *leaf; 5381 struct btrfs_extent_item *ei; 5382 struct btrfs_extent_inline_ref *iref; 5383 int ret; 5384 int is_data; 5385 int extent_slot = 0; 5386 int found_extent = 0; 5387 int num_to_del = 1; 5388 u32 item_size; 5389 u64 refs; 5390 bool skinny_metadata = btrfs_fs_incompat(root->fs_info, 5391 SKINNY_METADATA); 5392 5393 path = btrfs_alloc_path(); 5394 if (!path) 5395 return -ENOMEM; 5396 5397 path->reada = 1; 5398 path->leave_spinning = 1; 5399 5400 is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID; 5401 BUG_ON(!is_data && refs_to_drop != 1); 5402 5403 if (is_data) 5404 skinny_metadata = 0; 5405 5406 ret = lookup_extent_backref(trans, extent_root, path, &iref, 5407 bytenr, num_bytes, parent, 5408 root_objectid, owner_objectid, 5409 owner_offset); 5410 if (ret == 0) { 5411 extent_slot = path->slots[0]; 5412 while (extent_slot >= 0) { 5413 btrfs_item_key_to_cpu(path->nodes[0], &key, 5414 extent_slot); 5415 if (key.objectid != bytenr) 5416 break; 5417 if (key.type == BTRFS_EXTENT_ITEM_KEY && 5418 key.offset == num_bytes) { 5419 found_extent = 1; 5420 break; 5421 } 5422 if (key.type == BTRFS_METADATA_ITEM_KEY && 5423 key.offset == owner_objectid) { 5424 found_extent = 1; 5425 break; 5426 } 5427 if (path->slots[0] - extent_slot > 5) 5428 break; 5429 extent_slot--; 5430 } 5431 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 5432 item_size = btrfs_item_size_nr(path->nodes[0], extent_slot); 5433 if (found_extent && item_size < sizeof(*ei)) 5434 found_extent = 0; 5435 #endif 5436 if (!found_extent) { 5437 BUG_ON(iref); 5438 ret = remove_extent_backref(trans, extent_root, path, 5439 NULL, refs_to_drop, 5440 is_data); 5441 if (ret) { 5442 btrfs_abort_transaction(trans, extent_root, ret); 5443 goto out; 5444 } 5445 btrfs_release_path(path); 5446 path->leave_spinning = 1; 5447 5448 key.objectid = bytenr; 5449 key.type = BTRFS_EXTENT_ITEM_KEY; 5450 key.offset = num_bytes; 5451 5452 if (!is_data && skinny_metadata) { 5453 key.type = BTRFS_METADATA_ITEM_KEY; 5454 key.offset = owner_objectid; 5455 } 5456 5457 ret = btrfs_search_slot(trans, 
extent_root, 5458 &key, path, -1, 1); 5459 if (ret > 0 && skinny_metadata && path->slots[0]) { 5460 /* 5461 * Couldn't find our skinny metadata item, 5462 * see if we have ye olde extent item. 5463 */ 5464 path->slots[0]--; 5465 btrfs_item_key_to_cpu(path->nodes[0], &key, 5466 path->slots[0]); 5467 if (key.objectid == bytenr && 5468 key.type == BTRFS_EXTENT_ITEM_KEY && 5469 key.offset == num_bytes) 5470 ret = 0; 5471 } 5472 5473 if (ret > 0 && skinny_metadata) { 5474 skinny_metadata = false; 5475 key.type = BTRFS_EXTENT_ITEM_KEY; 5476 key.offset = num_bytes; 5477 btrfs_release_path(path); 5478 ret = btrfs_search_slot(trans, extent_root, 5479 &key, path, -1, 1); 5480 } 5481 5482 if (ret) { 5483 btrfs_err(info, "umm, got %d back from search, was looking for %llu", 5484 ret, (unsigned long long)bytenr); 5485 if (ret > 0) 5486 btrfs_print_leaf(extent_root, 5487 path->nodes[0]); 5488 } 5489 if (ret < 0) { 5490 btrfs_abort_transaction(trans, extent_root, ret); 5491 goto out; 5492 } 5493 extent_slot = path->slots[0]; 5494 } 5495 } else if (ret == -ENOENT) { 5496 btrfs_print_leaf(extent_root, path->nodes[0]); 5497 WARN_ON(1); 5498 btrfs_err(info, 5499 "unable to find ref byte nr %llu parent %llu root %llu owner %llu offset %llu", 5500 (unsigned long long)bytenr, 5501 (unsigned long long)parent, 5502 (unsigned long long)root_objectid, 5503 (unsigned long long)owner_objectid, 5504 (unsigned long long)owner_offset); 5505 } else { 5506 btrfs_abort_transaction(trans, extent_root, ret); 5507 goto out; 5508 } 5509 5510 leaf = path->nodes[0]; 5511 item_size = btrfs_item_size_nr(leaf, extent_slot); 5512 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 5513 if (item_size < sizeof(*ei)) { 5514 BUG_ON(found_extent || extent_slot != path->slots[0]); 5515 ret = convert_extent_item_v0(trans, extent_root, path, 5516 owner_objectid, 0); 5517 if (ret < 0) { 5518 btrfs_abort_transaction(trans, extent_root, ret); 5519 goto out; 5520 } 5521 5522 btrfs_release_path(path); 5523 path->leave_spinning = 1; 5524 5525 key.objectid = bytenr; 5526 key.type = BTRFS_EXTENT_ITEM_KEY; 5527 key.offset = num_bytes; 5528 5529 ret = btrfs_search_slot(trans, extent_root, &key, path, 5530 -1, 1); 5531 if (ret) { 5532 btrfs_err(info, "umm, got %d back from search, was looking for %llu", 5533 ret, (unsigned long long)bytenr); 5534 btrfs_print_leaf(extent_root, path->nodes[0]); 5535 } 5536 if (ret < 0) { 5537 btrfs_abort_transaction(trans, extent_root, ret); 5538 goto out; 5539 } 5540 5541 extent_slot = path->slots[0]; 5542 leaf = path->nodes[0]; 5543 item_size = btrfs_item_size_nr(leaf, extent_slot); 5544 } 5545 #endif 5546 BUG_ON(item_size < sizeof(*ei)); 5547 ei = btrfs_item_ptr(leaf, extent_slot, 5548 struct btrfs_extent_item); 5549 if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID && 5550 key.type == BTRFS_EXTENT_ITEM_KEY) { 5551 struct btrfs_tree_block_info *bi; 5552 BUG_ON(item_size < sizeof(*ei) + sizeof(*bi)); 5553 bi = (struct btrfs_tree_block_info *)(ei + 1); 5554 WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi)); 5555 } 5556 5557 refs = btrfs_extent_refs(leaf, ei); 5558 if (refs < refs_to_drop) { 5559 btrfs_err(info, "trying to drop %d refs but we only have %Lu " 5560 "for bytenr %Lu\n", refs_to_drop, refs, bytenr); 5561 ret = -EINVAL; 5562 btrfs_abort_transaction(trans, extent_root, ret); 5563 goto out; 5564 } 5565 refs -= refs_to_drop; 5566 5567 if (refs > 0) { 5568 if (extent_op) 5569 __run_delayed_extent_op(extent_op, leaf, ei); 5570 /* 5571 * In the case of inline back ref, reference count will 5572 * be updated by 
remove_extent_backref 5573 */ 5574 if (iref) { 5575 BUG_ON(!found_extent); 5576 } else { 5577 btrfs_set_extent_refs(leaf, ei, refs); 5578 btrfs_mark_buffer_dirty(leaf); 5579 } 5580 if (found_extent) { 5581 ret = remove_extent_backref(trans, extent_root, path, 5582 iref, refs_to_drop, 5583 is_data); 5584 if (ret) { 5585 btrfs_abort_transaction(trans, extent_root, ret); 5586 goto out; 5587 } 5588 } 5589 } else { 5590 if (found_extent) { 5591 BUG_ON(is_data && refs_to_drop != 5592 extent_data_ref_count(root, path, iref)); 5593 if (iref) { 5594 BUG_ON(path->slots[0] != extent_slot); 5595 } else { 5596 BUG_ON(path->slots[0] != extent_slot + 1); 5597 path->slots[0] = extent_slot; 5598 num_to_del = 2; 5599 } 5600 } 5601 5602 ret = btrfs_del_items(trans, extent_root, path, path->slots[0], 5603 num_to_del); 5604 if (ret) { 5605 btrfs_abort_transaction(trans, extent_root, ret); 5606 goto out; 5607 } 5608 btrfs_release_path(path); 5609 5610 if (is_data) { 5611 ret = btrfs_del_csums(trans, root, bytenr, num_bytes); 5612 if (ret) { 5613 btrfs_abort_transaction(trans, extent_root, ret); 5614 goto out; 5615 } 5616 } 5617 5618 ret = update_block_group(root, bytenr, num_bytes, 0); 5619 if (ret) { 5620 btrfs_abort_transaction(trans, extent_root, ret); 5621 goto out; 5622 } 5623 } 5624 out: 5625 btrfs_free_path(path); 5626 return ret; 5627 } 5628 5629 /* 5630 * when we free an block, it is possible (and likely) that we free the last 5631 * delayed ref for that extent as well. This searches the delayed ref tree for 5632 * a given extent, and if there are no other delayed refs to be processed, it 5633 * removes it from the tree. 5634 */ 5635 static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, 5636 struct btrfs_root *root, u64 bytenr) 5637 { 5638 struct btrfs_delayed_ref_head *head; 5639 struct btrfs_delayed_ref_root *delayed_refs; 5640 struct btrfs_delayed_ref_node *ref; 5641 struct rb_node *node; 5642 int ret = 0; 5643 5644 delayed_refs = &trans->transaction->delayed_refs; 5645 spin_lock(&delayed_refs->lock); 5646 head = btrfs_find_delayed_ref_head(trans, bytenr); 5647 if (!head) 5648 goto out; 5649 5650 node = rb_prev(&head->node.rb_node); 5651 if (!node) 5652 goto out; 5653 5654 ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); 5655 5656 /* there are still entries for this ref, we can't drop it */ 5657 if (ref->bytenr == bytenr) 5658 goto out; 5659 5660 if (head->extent_op) { 5661 if (!head->must_insert_reserved) 5662 goto out; 5663 btrfs_free_delayed_extent_op(head->extent_op); 5664 head->extent_op = NULL; 5665 } 5666 5667 /* 5668 * waiting for the lock here would deadlock. If someone else has it 5669 * locked they are already in the process of dropping it anyway 5670 */ 5671 if (!mutex_trylock(&head->mutex)) 5672 goto out; 5673 5674 /* 5675 * at this point we have a head with no other entries. Go 5676 * ahead and process it. 5677 */ 5678 head->node.in_tree = 0; 5679 rb_erase(&head->node.rb_node, &delayed_refs->root); 5680 5681 delayed_refs->num_entries--; 5682 5683 /* 5684 * we don't take a ref on the node because we're removing it from the 5685 * tree, so we just steal the ref the tree was holding. 
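 *
 * (Editor's note: a return value of 1, the must_insert_reserved case just
 *  below, tells btrfs_free_tree_block() that the extent item was never
 *  inserted into the extent tree, so that caller pins or frees the block
 *  directly instead of leaving it to the delayed ref machinery.)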
5686 */ 5687 delayed_refs->num_heads--; 5688 if (list_empty(&head->cluster)) 5689 delayed_refs->num_heads_ready--; 5690 5691 list_del_init(&head->cluster); 5692 spin_unlock(&delayed_refs->lock); 5693 5694 BUG_ON(head->extent_op); 5695 if (head->must_insert_reserved) 5696 ret = 1; 5697 5698 mutex_unlock(&head->mutex); 5699 btrfs_put_delayed_ref(&head->node); 5700 return ret; 5701 out: 5702 spin_unlock(&delayed_refs->lock); 5703 return 0; 5704 } 5705 5706 void btrfs_free_tree_block(struct btrfs_trans_handle *trans, 5707 struct btrfs_root *root, 5708 struct extent_buffer *buf, 5709 u64 parent, int last_ref) 5710 { 5711 struct btrfs_block_group_cache *cache = NULL; 5712 int ret; 5713 5714 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { 5715 ret = btrfs_add_delayed_tree_ref(root->fs_info, trans, 5716 buf->start, buf->len, 5717 parent, root->root_key.objectid, 5718 btrfs_header_level(buf), 5719 BTRFS_DROP_DELAYED_REF, NULL, 0); 5720 BUG_ON(ret); /* -ENOMEM */ 5721 } 5722 5723 if (!last_ref) 5724 return; 5725 5726 cache = btrfs_lookup_block_group(root->fs_info, buf->start); 5727 5728 if (btrfs_header_generation(buf) == trans->transid) { 5729 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { 5730 ret = check_ref_cleanup(trans, root, buf->start); 5731 if (!ret) 5732 goto out; 5733 } 5734 5735 if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { 5736 pin_down_extent(root, cache, buf->start, buf->len, 1); 5737 goto out; 5738 } 5739 5740 WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)); 5741 5742 btrfs_add_free_space(cache, buf->start, buf->len); 5743 btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE); 5744 } 5745 out: 5746 /* 5747 * Deleting the buffer, clear the corrupt flag since it doesn't matter 5748 * anymore. 5749 */ 5750 clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags); 5751 btrfs_put_block_group(cache); 5752 } 5753 5754 /* Can return -ENOMEM */ 5755 int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, 5756 u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, 5757 u64 owner, u64 offset, int for_cow) 5758 { 5759 int ret; 5760 struct btrfs_fs_info *fs_info = root->fs_info; 5761 5762 /* 5763 * tree log blocks never actually go into the extent allocation 5764 * tree, just update pinning info and exit early. 5765 */ 5766 if (root_objectid == BTRFS_TREE_LOG_OBJECTID) { 5767 WARN_ON(owner >= BTRFS_FIRST_FREE_OBJECTID); 5768 /* unlocks the pinned mutex */ 5769 btrfs_pin_extent(root, bytenr, num_bytes, 1); 5770 ret = 0; 5771 } else if (owner < BTRFS_FIRST_FREE_OBJECTID) { 5772 ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr, 5773 num_bytes, 5774 parent, root_objectid, (int)owner, 5775 BTRFS_DROP_DELAYED_REF, NULL, for_cow); 5776 } else { 5777 ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr, 5778 num_bytes, 5779 parent, root_objectid, owner, 5780 offset, BTRFS_DROP_DELAYED_REF, 5781 NULL, for_cow); 5782 } 5783 return ret; 5784 } 5785 5786 static u64 stripe_align(struct btrfs_root *root, 5787 struct btrfs_block_group_cache *cache, 5788 u64 val, u64 num_bytes) 5789 { 5790 u64 ret = ALIGN(val, root->stripesize); 5791 return ret; 5792 } 5793 5794 /* 5795 * when we wait for progress in the block group caching, its because 5796 * our allocation attempt failed at least once. So, we must sleep 5797 * and let some progress happen before we try again. 
5798 * 5799 * This function will sleep at least once waiting for new free space to 5800 * show up, and then it will check the block group free space numbers 5801 * for our min num_bytes. Another option is to have it go ahead 5802 * and look in the rbtree for a free extent of a given size, but this 5803 * is a good start. 5804 */ 5805 static noinline int 5806 wait_block_group_cache_progress(struct btrfs_block_group_cache *cache, 5807 u64 num_bytes) 5808 { 5809 struct btrfs_caching_control *caching_ctl; 5810 5811 caching_ctl = get_caching_control(cache); 5812 if (!caching_ctl) 5813 return 0; 5814 5815 wait_event(caching_ctl->wait, block_group_cache_done(cache) || 5816 (cache->free_space_ctl->free_space >= num_bytes)); 5817 5818 put_caching_control(caching_ctl); 5819 return 0; 5820 } 5821 5822 static noinline int 5823 wait_block_group_cache_done(struct btrfs_block_group_cache *cache) 5824 { 5825 struct btrfs_caching_control *caching_ctl; 5826 5827 caching_ctl = get_caching_control(cache); 5828 if (!caching_ctl) 5829 return 0; 5830 5831 wait_event(caching_ctl->wait, block_group_cache_done(cache)); 5832 5833 put_caching_control(caching_ctl); 5834 return 0; 5835 } 5836 5837 int __get_raid_index(u64 flags) 5838 { 5839 if (flags & BTRFS_BLOCK_GROUP_RAID10) 5840 return BTRFS_RAID_RAID10; 5841 else if (flags & BTRFS_BLOCK_GROUP_RAID1) 5842 return BTRFS_RAID_RAID1; 5843 else if (flags & BTRFS_BLOCK_GROUP_DUP) 5844 return BTRFS_RAID_DUP; 5845 else if (flags & BTRFS_BLOCK_GROUP_RAID0) 5846 return BTRFS_RAID_RAID0; 5847 else if (flags & BTRFS_BLOCK_GROUP_RAID5) 5848 return BTRFS_RAID_RAID5; 5849 else if (flags & BTRFS_BLOCK_GROUP_RAID6) 5850 return BTRFS_RAID_RAID6; 5851 5852 return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */ 5853 } 5854 5855 static int get_block_group_index(struct btrfs_block_group_cache *cache) 5856 { 5857 return __get_raid_index(cache->flags); 5858 } 5859 5860 enum btrfs_loop_type { 5861 LOOP_CACHING_NOWAIT = 0, 5862 LOOP_CACHING_WAIT = 1, 5863 LOOP_ALLOC_CHUNK = 2, 5864 LOOP_NO_EMPTY_SIZE = 3, 5865 }; 5866 5867 /* 5868 * walks the btree of allocated extents and find a hole of a given size. 5869 * The key ins is changed to record the hole: 5870 * ins->objectid == block start 5871 * ins->flags = BTRFS_EXTENT_ITEM_KEY 5872 * ins->offset == number of blocks 5873 * Any available blocks before search_start are skipped. 5874 */ 5875 static noinline int find_free_extent(struct btrfs_trans_handle *trans, 5876 struct btrfs_root *orig_root, 5877 u64 num_bytes, u64 empty_size, 5878 u64 hint_byte, struct btrfs_key *ins, 5879 u64 flags) 5880 { 5881 int ret = 0; 5882 struct btrfs_root *root = orig_root->fs_info->extent_root; 5883 struct btrfs_free_cluster *last_ptr = NULL; 5884 struct btrfs_block_group_cache *block_group = NULL; 5885 struct btrfs_block_group_cache *used_block_group; 5886 u64 search_start = 0; 5887 int empty_cluster = 2 * 1024 * 1024; 5888 struct btrfs_space_info *space_info; 5889 int loop = 0; 5890 int index = __get_raid_index(flags); 5891 int alloc_type = (flags & BTRFS_BLOCK_GROUP_DATA) ? 
5892 RESERVE_ALLOC_NO_ACCOUNT : RESERVE_ALLOC; 5893 bool found_uncached_bg = false; 5894 bool failed_cluster_refill = false; 5895 bool failed_alloc = false; 5896 bool use_cluster = true; 5897 bool have_caching_bg = false; 5898 5899 WARN_ON(num_bytes < root->sectorsize); 5900 btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY); 5901 ins->objectid = 0; 5902 ins->offset = 0; 5903 5904 trace_find_free_extent(orig_root, num_bytes, empty_size, flags); 5905 5906 space_info = __find_space_info(root->fs_info, flags); 5907 if (!space_info) { 5908 btrfs_err(root->fs_info, "No space info for %llu", flags); 5909 return -ENOSPC; 5910 } 5911 5912 /* 5913 * If the space info is for both data and metadata it means we have a 5914 * small filesystem and we can't use the clustering stuff. 5915 */ 5916 if (btrfs_mixed_space_info(space_info)) 5917 use_cluster = false; 5918 5919 if (flags & BTRFS_BLOCK_GROUP_METADATA && use_cluster) { 5920 last_ptr = &root->fs_info->meta_alloc_cluster; 5921 if (!btrfs_test_opt(root, SSD)) 5922 empty_cluster = 64 * 1024; 5923 } 5924 5925 if ((flags & BTRFS_BLOCK_GROUP_DATA) && use_cluster && 5926 btrfs_test_opt(root, SSD)) { 5927 last_ptr = &root->fs_info->data_alloc_cluster; 5928 } 5929 5930 if (last_ptr) { 5931 spin_lock(&last_ptr->lock); 5932 if (last_ptr->block_group) 5933 hint_byte = last_ptr->window_start; 5934 spin_unlock(&last_ptr->lock); 5935 } 5936 5937 search_start = max(search_start, first_logical_byte(root, 0)); 5938 search_start = max(search_start, hint_byte); 5939 5940 if (!last_ptr) 5941 empty_cluster = 0; 5942 5943 if (search_start == hint_byte) { 5944 block_group = btrfs_lookup_block_group(root->fs_info, 5945 search_start); 5946 used_block_group = block_group; 5947 /* 5948 * we don't want to use the block group if it doesn't match our 5949 * allocation bits, or if its not cached. 5950 * 5951 * However if we are re-searching with an ideal block group 5952 * picked out then we don't care that the block group is cached. 5953 */ 5954 if (block_group && block_group_bits(block_group, flags) && 5955 block_group->cached != BTRFS_CACHE_NO) { 5956 down_read(&space_info->groups_sem); 5957 if (list_empty(&block_group->list) || 5958 block_group->ro) { 5959 /* 5960 * someone is removing this block group, 5961 * we can't jump into the have_block_group 5962 * target because our list pointers are not 5963 * valid 5964 */ 5965 btrfs_put_block_group(block_group); 5966 up_read(&space_info->groups_sem); 5967 } else { 5968 index = get_block_group_index(block_group); 5969 goto have_block_group; 5970 } 5971 } else if (block_group) { 5972 btrfs_put_block_group(block_group); 5973 } 5974 } 5975 search: 5976 have_caching_bg = false; 5977 down_read(&space_info->groups_sem); 5978 list_for_each_entry(block_group, &space_info->block_groups[index], 5979 list) { 5980 u64 offset; 5981 int cached; 5982 5983 used_block_group = block_group; 5984 btrfs_get_block_group(block_group); 5985 search_start = block_group->key.objectid; 5986 5987 /* 5988 * this can happen if we end up cycling through all the 5989 * raid types, but we want to make sure we only allocate 5990 * for the proper type. 5991 */ 5992 if (!block_group_bits(block_group, flags)) { 5993 u64 extra = BTRFS_BLOCK_GROUP_DUP | 5994 BTRFS_BLOCK_GROUP_RAID1 | 5995 BTRFS_BLOCK_GROUP_RAID5 | 5996 BTRFS_BLOCK_GROUP_RAID6 | 5997 BTRFS_BLOCK_GROUP_RAID10; 5998 5999 /* 6000 * if they asked for extra copies and this block group 6001 * doesn't provide them, bail. This does allow us to 6002 * fill raid0 from raid1. 
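 * (e.g. a RAID1/DUP/RAID10 allocation skips a plain raid0 or single
 * block group here, while a raid0/single allocation may still be
 * satisfied from a redundant block group.)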
6003 */ 6004 if ((flags & extra) && !(block_group->flags & extra)) 6005 goto loop; 6006 } 6007 6008 have_block_group: 6009 cached = block_group_cache_done(block_group); 6010 if (unlikely(!cached)) { 6011 found_uncached_bg = true; 6012 ret = cache_block_group(block_group, 0); 6013 BUG_ON(ret < 0); 6014 ret = 0; 6015 } 6016 6017 if (unlikely(block_group->ro)) 6018 goto loop; 6019 6020 /* 6021 * Ok we want to try and use the cluster allocator, so 6022 * lets look there 6023 */ 6024 if (last_ptr) { 6025 unsigned long aligned_cluster; 6026 /* 6027 * the refill lock keeps out other 6028 * people trying to start a new cluster 6029 */ 6030 spin_lock(&last_ptr->refill_lock); 6031 used_block_group = last_ptr->block_group; 6032 if (used_block_group != block_group && 6033 (!used_block_group || 6034 used_block_group->ro || 6035 !block_group_bits(used_block_group, flags))) { 6036 used_block_group = block_group; 6037 goto refill_cluster; 6038 } 6039 6040 if (used_block_group != block_group) 6041 btrfs_get_block_group(used_block_group); 6042 6043 offset = btrfs_alloc_from_cluster(used_block_group, 6044 last_ptr, num_bytes, used_block_group->key.objectid); 6045 if (offset) { 6046 /* we have a block, we're done */ 6047 spin_unlock(&last_ptr->refill_lock); 6048 trace_btrfs_reserve_extent_cluster(root, 6049 block_group, search_start, num_bytes); 6050 goto checks; 6051 } 6052 6053 WARN_ON(last_ptr->block_group != used_block_group); 6054 if (used_block_group != block_group) { 6055 btrfs_put_block_group(used_block_group); 6056 used_block_group = block_group; 6057 } 6058 refill_cluster: 6059 BUG_ON(used_block_group != block_group); 6060 /* If we are on LOOP_NO_EMPTY_SIZE, we can't 6061 * set up a new clusters, so lets just skip it 6062 * and let the allocator find whatever block 6063 * it can find. If we reach this point, we 6064 * will have tried the cluster allocator 6065 * plenty of times and not have found 6066 * anything, so we are likely way too 6067 * fragmented for the clustering stuff to find 6068 * anything. 6069 * 6070 * However, if the cluster is taken from the 6071 * current block group, release the cluster 6072 * first, so that we stand a better chance of 6073 * succeeding in the unclustered 6074 * allocation. 
*/ 6075 if (loop >= LOOP_NO_EMPTY_SIZE && 6076 last_ptr->block_group != block_group) { 6077 spin_unlock(&last_ptr->refill_lock); 6078 goto unclustered_alloc; 6079 } 6080 6081 /* 6082 * this cluster didn't work out, free it and 6083 * start over 6084 */ 6085 btrfs_return_cluster_to_free_space(NULL, last_ptr); 6086 6087 if (loop >= LOOP_NO_EMPTY_SIZE) { 6088 spin_unlock(&last_ptr->refill_lock); 6089 goto unclustered_alloc; 6090 } 6091 6092 aligned_cluster = max_t(unsigned long, 6093 empty_cluster + empty_size, 6094 block_group->full_stripe_len); 6095 6096 /* allocate a cluster in this block group */ 6097 ret = btrfs_find_space_cluster(trans, root, 6098 block_group, last_ptr, 6099 search_start, num_bytes, 6100 aligned_cluster); 6101 if (ret == 0) { 6102 /* 6103 * now pull our allocation out of this 6104 * cluster 6105 */ 6106 offset = btrfs_alloc_from_cluster(block_group, 6107 last_ptr, num_bytes, 6108 search_start); 6109 if (offset) { 6110 /* we found one, proceed */ 6111 spin_unlock(&last_ptr->refill_lock); 6112 trace_btrfs_reserve_extent_cluster(root, 6113 block_group, search_start, 6114 num_bytes); 6115 goto checks; 6116 } 6117 } else if (!cached && loop > LOOP_CACHING_NOWAIT 6118 && !failed_cluster_refill) { 6119 spin_unlock(&last_ptr->refill_lock); 6120 6121 failed_cluster_refill = true; 6122 wait_block_group_cache_progress(block_group, 6123 num_bytes + empty_cluster + empty_size); 6124 goto have_block_group; 6125 } 6126 6127 /* 6128 * at this point we either didn't find a cluster 6129 * or we weren't able to allocate a block from our 6130 * cluster. Free the cluster we've been trying 6131 * to use, and go to the next block group 6132 */ 6133 btrfs_return_cluster_to_free_space(NULL, last_ptr); 6134 spin_unlock(&last_ptr->refill_lock); 6135 goto loop; 6136 } 6137 6138 unclustered_alloc: 6139 spin_lock(&block_group->free_space_ctl->tree_lock); 6140 if (cached && 6141 block_group->free_space_ctl->free_space < 6142 num_bytes + empty_cluster + empty_size) { 6143 spin_unlock(&block_group->free_space_ctl->tree_lock); 6144 goto loop; 6145 } 6146 spin_unlock(&block_group->free_space_ctl->tree_lock); 6147 6148 offset = btrfs_find_space_for_alloc(block_group, search_start, 6149 num_bytes, empty_size); 6150 /* 6151 * If we didn't find a chunk, and we haven't failed on this 6152 * block group before, and this block group is in the middle of 6153 * caching and we are ok with waiting, then go ahead and wait 6154 * for progress to be made, and set failed_alloc to true. 6155 * 6156 * If failed_alloc is true then we've already waited on this 6157 * block group once and should move on to the next block group. 
6158 */ 6159 if (!offset && !failed_alloc && !cached && 6160 loop > LOOP_CACHING_NOWAIT) { 6161 wait_block_group_cache_progress(block_group, 6162 num_bytes + empty_size); 6163 failed_alloc = true; 6164 goto have_block_group; 6165 } else if (!offset) { 6166 if (!cached) 6167 have_caching_bg = true; 6168 goto loop; 6169 } 6170 checks: 6171 search_start = stripe_align(root, used_block_group, 6172 offset, num_bytes); 6173 6174 /* move on to the next group */ 6175 if (search_start + num_bytes > 6176 used_block_group->key.objectid + used_block_group->key.offset) { 6177 btrfs_add_free_space(used_block_group, offset, num_bytes); 6178 goto loop; 6179 } 6180 6181 if (offset < search_start) 6182 btrfs_add_free_space(used_block_group, offset, 6183 search_start - offset); 6184 BUG_ON(offset > search_start); 6185 6186 ret = btrfs_update_reserved_bytes(used_block_group, num_bytes, 6187 alloc_type); 6188 if (ret == -EAGAIN) { 6189 btrfs_add_free_space(used_block_group, offset, num_bytes); 6190 goto loop; 6191 } 6192 6193 /* we are all good, lets return */ 6194 ins->objectid = search_start; 6195 ins->offset = num_bytes; 6196 6197 trace_btrfs_reserve_extent(orig_root, block_group, 6198 search_start, num_bytes); 6199 if (used_block_group != block_group) 6200 btrfs_put_block_group(used_block_group); 6201 btrfs_put_block_group(block_group); 6202 break; 6203 loop: 6204 failed_cluster_refill = false; 6205 failed_alloc = false; 6206 BUG_ON(index != get_block_group_index(block_group)); 6207 if (used_block_group != block_group) 6208 btrfs_put_block_group(used_block_group); 6209 btrfs_put_block_group(block_group); 6210 } 6211 up_read(&space_info->groups_sem); 6212 6213 if (!ins->objectid && loop >= LOOP_CACHING_WAIT && have_caching_bg) 6214 goto search; 6215 6216 if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES) 6217 goto search; 6218 6219 /* 6220 * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking 6221 * caching kthreads as we move along 6222 * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching 6223 * LOOP_ALLOC_CHUNK, force a chunk allocation and try again 6224 * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try 6225 * again 6226 */ 6227 if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE) { 6228 index = 0; 6229 loop++; 6230 if (loop == LOOP_ALLOC_CHUNK) { 6231 ret = do_chunk_alloc(trans, root, flags, 6232 CHUNK_ALLOC_FORCE); 6233 /* 6234 * Do not bail out on ENOSPC since we 6235 * can do more things. 6236 */ 6237 if (ret < 0 && ret != -ENOSPC) { 6238 btrfs_abort_transaction(trans, 6239 root, ret); 6240 goto out; 6241 } 6242 } 6243 6244 if (loop == LOOP_NO_EMPTY_SIZE) { 6245 empty_size = 0; 6246 empty_cluster = 0; 6247 } 6248 6249 goto search; 6250 } else if (!ins->objectid) { 6251 ret = -ENOSPC; 6252 } else if (ins->objectid) { 6253 ret = 0; 6254 } 6255 out: 6256 6257 return ret; 6258 } 6259 6260 static void dump_space_info(struct btrfs_space_info *info, u64 bytes, 6261 int dump_block_groups) 6262 { 6263 struct btrfs_block_group_cache *cache; 6264 int index = 0; 6265 6266 spin_lock(&info->lock); 6267 printk(KERN_INFO "space_info %llu has %llu free, is %sfull\n", 6268 (unsigned long long)info->flags, 6269 (unsigned long long)(info->total_bytes - info->bytes_used - 6270 info->bytes_pinned - info->bytes_reserved - 6271 info->bytes_readonly), 6272 (info->full) ? 
"" : "not "); 6273 printk(KERN_INFO "space_info total=%llu, used=%llu, pinned=%llu, " 6274 "reserved=%llu, may_use=%llu, readonly=%llu\n", 6275 (unsigned long long)info->total_bytes, 6276 (unsigned long long)info->bytes_used, 6277 (unsigned long long)info->bytes_pinned, 6278 (unsigned long long)info->bytes_reserved, 6279 (unsigned long long)info->bytes_may_use, 6280 (unsigned long long)info->bytes_readonly); 6281 spin_unlock(&info->lock); 6282 6283 if (!dump_block_groups) 6284 return; 6285 6286 down_read(&info->groups_sem); 6287 again: 6288 list_for_each_entry(cache, &info->block_groups[index], list) { 6289 spin_lock(&cache->lock); 6290 printk(KERN_INFO "block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %s\n", 6291 (unsigned long long)cache->key.objectid, 6292 (unsigned long long)cache->key.offset, 6293 (unsigned long long)btrfs_block_group_used(&cache->item), 6294 (unsigned long long)cache->pinned, 6295 (unsigned long long)cache->reserved, 6296 cache->ro ? "[readonly]" : ""); 6297 btrfs_dump_free_space(cache, bytes); 6298 spin_unlock(&cache->lock); 6299 } 6300 if (++index < BTRFS_NR_RAID_TYPES) 6301 goto again; 6302 up_read(&info->groups_sem); 6303 } 6304 6305 int btrfs_reserve_extent(struct btrfs_trans_handle *trans, 6306 struct btrfs_root *root, 6307 u64 num_bytes, u64 min_alloc_size, 6308 u64 empty_size, u64 hint_byte, 6309 struct btrfs_key *ins, int is_data) 6310 { 6311 bool final_tried = false; 6312 u64 flags; 6313 int ret; 6314 6315 flags = btrfs_get_alloc_profile(root, is_data); 6316 again: 6317 WARN_ON(num_bytes < root->sectorsize); 6318 ret = find_free_extent(trans, root, num_bytes, empty_size, 6319 hint_byte, ins, flags); 6320 6321 if (ret == -ENOSPC) { 6322 if (!final_tried) { 6323 num_bytes = num_bytes >> 1; 6324 num_bytes = round_down(num_bytes, root->sectorsize); 6325 num_bytes = max(num_bytes, min_alloc_size); 6326 if (num_bytes == min_alloc_size) 6327 final_tried = true; 6328 goto again; 6329 } else if (btrfs_test_opt(root, ENOSPC_DEBUG)) { 6330 struct btrfs_space_info *sinfo; 6331 6332 sinfo = __find_space_info(root->fs_info, flags); 6333 btrfs_err(root->fs_info, "allocation failed flags %llu, wanted %llu", 6334 (unsigned long long)flags, 6335 (unsigned long long)num_bytes); 6336 if (sinfo) 6337 dump_space_info(sinfo, num_bytes, 1); 6338 } 6339 } 6340 6341 trace_btrfs_reserved_extent_alloc(root, ins->objectid, ins->offset); 6342 6343 return ret; 6344 } 6345 6346 static int __btrfs_free_reserved_extent(struct btrfs_root *root, 6347 u64 start, u64 len, int pin) 6348 { 6349 struct btrfs_block_group_cache *cache; 6350 int ret = 0; 6351 6352 cache = btrfs_lookup_block_group(root->fs_info, start); 6353 if (!cache) { 6354 btrfs_err(root->fs_info, "Unable to find block group for %llu", 6355 (unsigned long long)start); 6356 return -ENOSPC; 6357 } 6358 6359 if (btrfs_test_opt(root, DISCARD)) 6360 ret = btrfs_discard_extent(root, start, len, NULL); 6361 6362 if (pin) 6363 pin_down_extent(root, cache, start, len, 1); 6364 else { 6365 btrfs_add_free_space(cache, start, len); 6366 btrfs_update_reserved_bytes(cache, len, RESERVE_FREE); 6367 } 6368 btrfs_put_block_group(cache); 6369 6370 trace_btrfs_reserved_extent_free(root, start, len); 6371 6372 return ret; 6373 } 6374 6375 int btrfs_free_reserved_extent(struct btrfs_root *root, 6376 u64 start, u64 len) 6377 { 6378 return __btrfs_free_reserved_extent(root, start, len, 0); 6379 } 6380 6381 int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root, 6382 u64 start, u64 len) 6383 { 6384 return 
__btrfs_free_reserved_extent(root, start, len, 1); 6385 } 6386 6387 static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, 6388 struct btrfs_root *root, 6389 u64 parent, u64 root_objectid, 6390 u64 flags, u64 owner, u64 offset, 6391 struct btrfs_key *ins, int ref_mod) 6392 { 6393 int ret; 6394 struct btrfs_fs_info *fs_info = root->fs_info; 6395 struct btrfs_extent_item *extent_item; 6396 struct btrfs_extent_inline_ref *iref; 6397 struct btrfs_path *path; 6398 struct extent_buffer *leaf; 6399 int type; 6400 u32 size; 6401 6402 if (parent > 0) 6403 type = BTRFS_SHARED_DATA_REF_KEY; 6404 else 6405 type = BTRFS_EXTENT_DATA_REF_KEY; 6406 6407 size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type); 6408 6409 path = btrfs_alloc_path(); 6410 if (!path) 6411 return -ENOMEM; 6412 6413 path->leave_spinning = 1; 6414 ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path, 6415 ins, size); 6416 if (ret) { 6417 btrfs_free_path(path); 6418 return ret; 6419 } 6420 6421 leaf = path->nodes[0]; 6422 extent_item = btrfs_item_ptr(leaf, path->slots[0], 6423 struct btrfs_extent_item); 6424 btrfs_set_extent_refs(leaf, extent_item, ref_mod); 6425 btrfs_set_extent_generation(leaf, extent_item, trans->transid); 6426 btrfs_set_extent_flags(leaf, extent_item, 6427 flags | BTRFS_EXTENT_FLAG_DATA); 6428 6429 iref = (struct btrfs_extent_inline_ref *)(extent_item + 1); 6430 btrfs_set_extent_inline_ref_type(leaf, iref, type); 6431 if (parent > 0) { 6432 struct btrfs_shared_data_ref *ref; 6433 ref = (struct btrfs_shared_data_ref *)(iref + 1); 6434 btrfs_set_extent_inline_ref_offset(leaf, iref, parent); 6435 btrfs_set_shared_data_ref_count(leaf, ref, ref_mod); 6436 } else { 6437 struct btrfs_extent_data_ref *ref; 6438 ref = (struct btrfs_extent_data_ref *)(&iref->offset); 6439 btrfs_set_extent_data_ref_root(leaf, ref, root_objectid); 6440 btrfs_set_extent_data_ref_objectid(leaf, ref, owner); 6441 btrfs_set_extent_data_ref_offset(leaf, ref, offset); 6442 btrfs_set_extent_data_ref_count(leaf, ref, ref_mod); 6443 } 6444 6445 btrfs_mark_buffer_dirty(path->nodes[0]); 6446 btrfs_free_path(path); 6447 6448 ret = update_block_group(root, ins->objectid, ins->offset, 1); 6449 if (ret) { /* -ENOENT, logic error */ 6450 btrfs_err(fs_info, "update block group failed for %llu %llu", 6451 (unsigned long long)ins->objectid, 6452 (unsigned long long)ins->offset); 6453 BUG(); 6454 } 6455 return ret; 6456 } 6457 6458 static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, 6459 struct btrfs_root *root, 6460 u64 parent, u64 root_objectid, 6461 u64 flags, struct btrfs_disk_key *key, 6462 int level, struct btrfs_key *ins) 6463 { 6464 int ret; 6465 struct btrfs_fs_info *fs_info = root->fs_info; 6466 struct btrfs_extent_item *extent_item; 6467 struct btrfs_tree_block_info *block_info; 6468 struct btrfs_extent_inline_ref *iref; 6469 struct btrfs_path *path; 6470 struct extent_buffer *leaf; 6471 u32 size = sizeof(*extent_item) + sizeof(*iref); 6472 bool skinny_metadata = btrfs_fs_incompat(root->fs_info, 6473 SKINNY_METADATA); 6474 6475 if (!skinny_metadata) 6476 size += sizeof(*block_info); 6477 6478 path = btrfs_alloc_path(); 6479 if (!path) 6480 return -ENOMEM; 6481 6482 path->leave_spinning = 1; 6483 ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path, 6484 ins, size); 6485 if (ret) { 6486 btrfs_free_path(path); 6487 return ret; 6488 } 6489 6490 leaf = path->nodes[0]; 6491 extent_item = btrfs_item_ptr(leaf, path->slots[0], 6492 struct btrfs_extent_item); 6493 btrfs_set_extent_refs(leaf, 
extent_item, 1); 6494 btrfs_set_extent_generation(leaf, extent_item, trans->transid); 6495 btrfs_set_extent_flags(leaf, extent_item, 6496 flags | BTRFS_EXTENT_FLAG_TREE_BLOCK); 6497 6498 if (skinny_metadata) { 6499 iref = (struct btrfs_extent_inline_ref *)(extent_item + 1); 6500 } else { 6501 block_info = (struct btrfs_tree_block_info *)(extent_item + 1); 6502 btrfs_set_tree_block_key(leaf, block_info, key); 6503 btrfs_set_tree_block_level(leaf, block_info, level); 6504 iref = (struct btrfs_extent_inline_ref *)(block_info + 1); 6505 } 6506 6507 if (parent > 0) { 6508 BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)); 6509 btrfs_set_extent_inline_ref_type(leaf, iref, 6510 BTRFS_SHARED_BLOCK_REF_KEY); 6511 btrfs_set_extent_inline_ref_offset(leaf, iref, parent); 6512 } else { 6513 btrfs_set_extent_inline_ref_type(leaf, iref, 6514 BTRFS_TREE_BLOCK_REF_KEY); 6515 btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid); 6516 } 6517 6518 btrfs_mark_buffer_dirty(leaf); 6519 btrfs_free_path(path); 6520 6521 ret = update_block_group(root, ins->objectid, root->leafsize, 1); 6522 if (ret) { /* -ENOENT, logic error */ 6523 btrfs_err(fs_info, "update block group failed for %llu %llu", 6524 (unsigned long long)ins->objectid, 6525 (unsigned long long)ins->offset); 6526 BUG(); 6527 } 6528 return ret; 6529 } 6530 6531 int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, 6532 struct btrfs_root *root, 6533 u64 root_objectid, u64 owner, 6534 u64 offset, struct btrfs_key *ins) 6535 { 6536 int ret; 6537 6538 BUG_ON(root_objectid == BTRFS_TREE_LOG_OBJECTID); 6539 6540 ret = btrfs_add_delayed_data_ref(root->fs_info, trans, ins->objectid, 6541 ins->offset, 0, 6542 root_objectid, owner, offset, 6543 BTRFS_ADD_DELAYED_EXTENT, NULL, 0); 6544 return ret; 6545 } 6546 6547 /* 6548 * this is used by the tree logging recovery code. 
It records that 6549 * an extent has been allocated and makes sure to clear the free 6550 * space cache bits as well 6551 */ 6552 int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, 6553 struct btrfs_root *root, 6554 u64 root_objectid, u64 owner, u64 offset, 6555 struct btrfs_key *ins) 6556 { 6557 int ret; 6558 struct btrfs_block_group_cache *block_group; 6559 struct btrfs_caching_control *caching_ctl; 6560 u64 start = ins->objectid; 6561 u64 num_bytes = ins->offset; 6562 6563 block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid); 6564 cache_block_group(block_group, 0); 6565 caching_ctl = get_caching_control(block_group); 6566 6567 if (!caching_ctl) { 6568 BUG_ON(!block_group_cache_done(block_group)); 6569 ret = btrfs_remove_free_space(block_group, start, num_bytes); 6570 if (ret) 6571 goto out; 6572 } else { 6573 mutex_lock(&caching_ctl->mutex); 6574 6575 if (start >= caching_ctl->progress) { 6576 ret = add_excluded_extent(root, start, num_bytes); 6577 } else if (start + num_bytes <= caching_ctl->progress) { 6578 ret = btrfs_remove_free_space(block_group, 6579 start, num_bytes); 6580 } else { 6581 num_bytes = caching_ctl->progress - start; 6582 ret = btrfs_remove_free_space(block_group, 6583 start, num_bytes); 6584 if (ret) 6585 goto out_lock; 6586 6587 start = caching_ctl->progress; 6588 num_bytes = ins->objectid + ins->offset - 6589 caching_ctl->progress; 6590 ret = add_excluded_extent(root, start, num_bytes); 6591 } 6592 out_lock: 6593 mutex_unlock(&caching_ctl->mutex); 6594 put_caching_control(caching_ctl); 6595 if (ret) 6596 goto out; 6597 } 6598 6599 ret = btrfs_update_reserved_bytes(block_group, ins->offset, 6600 RESERVE_ALLOC_NO_ACCOUNT); 6601 BUG_ON(ret); /* logic error */ 6602 ret = alloc_reserved_file_extent(trans, root, 0, root_objectid, 6603 0, owner, offset, ins, 1); 6604 out: 6605 btrfs_put_block_group(block_group); 6606 return ret; 6607 } 6608 6609 static struct extent_buffer * 6610 btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, 6611 u64 bytenr, u32 blocksize, int level) 6612 { 6613 struct extent_buffer *buf; 6614 6615 buf = btrfs_find_create_tree_block(root, bytenr, blocksize); 6616 if (!buf) 6617 return ERR_PTR(-ENOMEM); 6618 btrfs_set_header_generation(buf, trans->transid); 6619 btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level); 6620 btrfs_tree_lock(buf); 6621 clean_tree_block(trans, root, buf); 6622 clear_bit(EXTENT_BUFFER_STALE, &buf->bflags); 6623 6624 btrfs_set_lock_blocking(buf); 6625 btrfs_set_buffer_uptodate(buf); 6626 6627 if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) { 6628 /* 6629 * we allow two log transactions at a time, use different 6630 * EXENT bit to differentiate dirty pages. 
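 * (even log transids use the EXTENT_DIRTY bit, odd ones use EXTENT_NEW,
 * matching the set_extent_dirty()/set_extent_new() calls just below)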
6631 */ 6632 if (root->log_transid % 2 == 0) 6633 set_extent_dirty(&root->dirty_log_pages, buf->start, 6634 buf->start + buf->len - 1, GFP_NOFS); 6635 else 6636 set_extent_new(&root->dirty_log_pages, buf->start, 6637 buf->start + buf->len - 1, GFP_NOFS); 6638 } else { 6639 set_extent_dirty(&trans->transaction->dirty_pages, buf->start, 6640 buf->start + buf->len - 1, GFP_NOFS); 6641 } 6642 trans->blocks_used++; 6643 /* this returns a buffer locked for blocking */ 6644 return buf; 6645 } 6646 6647 static struct btrfs_block_rsv * 6648 use_block_rsv(struct btrfs_trans_handle *trans, 6649 struct btrfs_root *root, u32 blocksize) 6650 { 6651 struct btrfs_block_rsv *block_rsv; 6652 struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv; 6653 int ret; 6654 6655 block_rsv = get_block_rsv(trans, root); 6656 6657 if (block_rsv->size == 0) { 6658 ret = reserve_metadata_bytes(root, block_rsv, blocksize, 6659 BTRFS_RESERVE_NO_FLUSH); 6660 /* 6661 * If we couldn't reserve metadata bytes try and use some from 6662 * the global reserve. 6663 */ 6664 if (ret && block_rsv != global_rsv) { 6665 ret = block_rsv_use_bytes(global_rsv, blocksize); 6666 if (!ret) 6667 return global_rsv; 6668 return ERR_PTR(ret); 6669 } else if (ret) { 6670 return ERR_PTR(ret); 6671 } 6672 return block_rsv; 6673 } 6674 6675 ret = block_rsv_use_bytes(block_rsv, blocksize); 6676 if (!ret) 6677 return block_rsv; 6678 if (ret && !block_rsv->failfast) { 6679 if (btrfs_test_opt(root, ENOSPC_DEBUG)) { 6680 static DEFINE_RATELIMIT_STATE(_rs, 6681 DEFAULT_RATELIMIT_INTERVAL * 10, 6682 /*DEFAULT_RATELIMIT_BURST*/ 1); 6683 if (__ratelimit(&_rs)) 6684 WARN(1, KERN_DEBUG 6685 "btrfs: block rsv returned %d\n", ret); 6686 } 6687 ret = reserve_metadata_bytes(root, block_rsv, blocksize, 6688 BTRFS_RESERVE_NO_FLUSH); 6689 if (!ret) { 6690 return block_rsv; 6691 } else if (ret && block_rsv != global_rsv) { 6692 ret = block_rsv_use_bytes(global_rsv, blocksize); 6693 if (!ret) 6694 return global_rsv; 6695 } 6696 } 6697 6698 return ERR_PTR(-ENOSPC); 6699 } 6700 6701 static void unuse_block_rsv(struct btrfs_fs_info *fs_info, 6702 struct btrfs_block_rsv *block_rsv, u32 blocksize) 6703 { 6704 block_rsv_add_bytes(block_rsv, blocksize, 0); 6705 block_rsv_release_bytes(fs_info, block_rsv, NULL, 0); 6706 } 6707 6708 /* 6709 * finds a free extent and does all the dirty work required for allocation 6710 * returns the key for the extent through ins, and a tree buffer for 6711 * the first block of the extent through buf. 6712 * 6713 * returns the tree buffer or NULL. 
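 *
 * Illustrative call sketch only; the parameter values here are assumptions
 * for the sake of example, not taken from any specific caller:
 *
 *	buf = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
 *				     root->root_key.objectid, &disk_key,
 *				     level, hint, 0);
 *	if (IS_ERR(buf))
 *		return ERR_CAST(buf);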
6714 */ 6715 struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, 6716 struct btrfs_root *root, u32 blocksize, 6717 u64 parent, u64 root_objectid, 6718 struct btrfs_disk_key *key, int level, 6719 u64 hint, u64 empty_size) 6720 { 6721 struct btrfs_key ins; 6722 struct btrfs_block_rsv *block_rsv; 6723 struct extent_buffer *buf; 6724 u64 flags = 0; 6725 int ret; 6726 bool skinny_metadata = btrfs_fs_incompat(root->fs_info, 6727 SKINNY_METADATA); 6728 6729 block_rsv = use_block_rsv(trans, root, blocksize); 6730 if (IS_ERR(block_rsv)) 6731 return ERR_CAST(block_rsv); 6732 6733 ret = btrfs_reserve_extent(trans, root, blocksize, blocksize, 6734 empty_size, hint, &ins, 0); 6735 if (ret) { 6736 unuse_block_rsv(root->fs_info, block_rsv, blocksize); 6737 return ERR_PTR(ret); 6738 } 6739 6740 buf = btrfs_init_new_buffer(trans, root, ins.objectid, 6741 blocksize, level); 6742 BUG_ON(IS_ERR(buf)); /* -ENOMEM */ 6743 6744 if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) { 6745 if (parent == 0) 6746 parent = ins.objectid; 6747 flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF; 6748 } else 6749 BUG_ON(parent > 0); 6750 6751 if (root_objectid != BTRFS_TREE_LOG_OBJECTID) { 6752 struct btrfs_delayed_extent_op *extent_op; 6753 extent_op = btrfs_alloc_delayed_extent_op(); 6754 BUG_ON(!extent_op); /* -ENOMEM */ 6755 if (key) 6756 memcpy(&extent_op->key, key, sizeof(extent_op->key)); 6757 else 6758 memset(&extent_op->key, 0, sizeof(extent_op->key)); 6759 extent_op->flags_to_set = flags; 6760 if (skinny_metadata) 6761 extent_op->update_key = 0; 6762 else 6763 extent_op->update_key = 1; 6764 extent_op->update_flags = 1; 6765 extent_op->is_data = 0; 6766 6767 ret = btrfs_add_delayed_tree_ref(root->fs_info, trans, 6768 ins.objectid, 6769 ins.offset, parent, root_objectid, 6770 level, BTRFS_ADD_DELAYED_EXTENT, 6771 extent_op, 0); 6772 BUG_ON(ret); /* -ENOMEM */ 6773 } 6774 return buf; 6775 } 6776 6777 struct walk_control { 6778 u64 refs[BTRFS_MAX_LEVEL]; 6779 u64 flags[BTRFS_MAX_LEVEL]; 6780 struct btrfs_key update_progress; 6781 int stage; 6782 int level; 6783 int shared_level; 6784 int update_ref; 6785 int keep_locks; 6786 int reada_slot; 6787 int reada_count; 6788 int for_reloc; 6789 }; 6790 6791 #define DROP_REFERENCE 1 6792 #define UPDATE_BACKREF 2 6793 6794 static noinline void reada_walk_down(struct btrfs_trans_handle *trans, 6795 struct btrfs_root *root, 6796 struct walk_control *wc, 6797 struct btrfs_path *path) 6798 { 6799 u64 bytenr; 6800 u64 generation; 6801 u64 refs; 6802 u64 flags; 6803 u32 nritems; 6804 u32 blocksize; 6805 struct btrfs_key key; 6806 struct extent_buffer *eb; 6807 int ret; 6808 int slot; 6809 int nread = 0; 6810 6811 if (path->slots[wc->level] < wc->reada_slot) { 6812 wc->reada_count = wc->reada_count * 2 / 3; 6813 wc->reada_count = max(wc->reada_count, 2); 6814 } else { 6815 wc->reada_count = wc->reada_count * 3 / 2; 6816 wc->reada_count = min_t(int, wc->reada_count, 6817 BTRFS_NODEPTRS_PER_BLOCK(root)); 6818 } 6819 6820 eb = path->nodes[wc->level]; 6821 nritems = btrfs_header_nritems(eb); 6822 blocksize = btrfs_level_size(root, wc->level - 1); 6823 6824 for (slot = path->slots[wc->level]; slot < nritems; slot++) { 6825 if (nread >= wc->reada_count) 6826 break; 6827 6828 cond_resched(); 6829 bytenr = btrfs_node_blockptr(eb, slot); 6830 generation = btrfs_node_ptr_generation(eb, slot); 6831 6832 if (slot == path->slots[wc->level]) 6833 goto reada; 6834 6835 if (wc->stage == UPDATE_BACKREF && 6836 generation <= root->root_key.offset) 6837 continue; 6838 6839 /* We don't lock 
the tree block, it's OK to be racy here */ 6840 ret = btrfs_lookup_extent_info(trans, root, bytenr, 6841 wc->level - 1, 1, &refs, 6842 &flags); 6843 /* We don't care about errors in readahead. */ 6844 if (ret < 0) 6845 continue; 6846 BUG_ON(refs == 0); 6847 6848 if (wc->stage == DROP_REFERENCE) { 6849 if (refs == 1) 6850 goto reada; 6851 6852 if (wc->level == 1 && 6853 (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) 6854 continue; 6855 if (!wc->update_ref || 6856 generation <= root->root_key.offset) 6857 continue; 6858 btrfs_node_key_to_cpu(eb, &key, slot); 6859 ret = btrfs_comp_cpu_keys(&key, 6860 &wc->update_progress); 6861 if (ret < 0) 6862 continue; 6863 } else { 6864 if (wc->level == 1 && 6865 (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) 6866 continue; 6867 } 6868 reada: 6869 ret = readahead_tree_block(root, bytenr, blocksize, 6870 generation); 6871 if (ret) 6872 break; 6873 nread++; 6874 } 6875 wc->reada_slot = slot; 6876 } 6877 6878 /* 6879 * helper to process tree block while walking down the tree. 6880 * 6881 * when wc->stage == UPDATE_BACKREF, this function updates 6882 * back refs for pointers in the block. 6883 * 6884 * NOTE: return value 1 means we should stop walking down. 6885 */ 6886 static noinline int walk_down_proc(struct btrfs_trans_handle *trans, 6887 struct btrfs_root *root, 6888 struct btrfs_path *path, 6889 struct walk_control *wc, int lookup_info) 6890 { 6891 int level = wc->level; 6892 struct extent_buffer *eb = path->nodes[level]; 6893 u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF; 6894 int ret; 6895 6896 if (wc->stage == UPDATE_BACKREF && 6897 btrfs_header_owner(eb) != root->root_key.objectid) 6898 return 1; 6899 6900 /* 6901 * when reference count of tree block is 1, it won't increase 6902 * again. once full backref flag is set, we never clear it. 6903 */ 6904 if (lookup_info && 6905 ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) || 6906 (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) { 6907 BUG_ON(!path->locks[level]); 6908 ret = btrfs_lookup_extent_info(trans, root, 6909 eb->start, level, 1, 6910 &wc->refs[level], 6911 &wc->flags[level]); 6912 BUG_ON(ret == -ENOMEM); 6913 if (ret) 6914 return ret; 6915 BUG_ON(wc->refs[level] == 0); 6916 } 6917 6918 if (wc->stage == DROP_REFERENCE) { 6919 if (wc->refs[level] > 1) 6920 return 1; 6921 6922 if (path->locks[level] && !wc->keep_locks) { 6923 btrfs_tree_unlock_rw(eb, path->locks[level]); 6924 path->locks[level] = 0; 6925 } 6926 return 0; 6927 } 6928 6929 /* wc->stage == UPDATE_BACKREF */ 6930 if (!(wc->flags[level] & flag)) { 6931 BUG_ON(!path->locks[level]); 6932 ret = btrfs_inc_ref(trans, root, eb, 1, wc->for_reloc); 6933 BUG_ON(ret); /* -ENOMEM */ 6934 ret = btrfs_dec_ref(trans, root, eb, 0, wc->for_reloc); 6935 BUG_ON(ret); /* -ENOMEM */ 6936 ret = btrfs_set_disk_extent_flags(trans, root, eb->start, 6937 eb->len, flag, 0); 6938 BUG_ON(ret); /* -ENOMEM */ 6939 wc->flags[level] |= flag; 6940 } 6941 6942 /* 6943 * the block is shared by multiple trees, so it's not good to 6944 * keep the tree lock 6945 */ 6946 if (path->locks[level] && level > 0) { 6947 btrfs_tree_unlock_rw(eb, path->locks[level]); 6948 path->locks[level] = 0; 6949 } 6950 return 0; 6951 } 6952 6953 /* 6954 * helper to process tree block pointer. 6955 * 6956 * when wc->stage == DROP_REFERENCE, this function checks 6957 * reference count of the block pointed to. if the block 6958 * is shared and we need update back refs for the subtree 6959 * rooted at the block, this function changes wc->stage to 6960 * UPDATE_BACKREF. 
if the block is shared and there is no 6961 * need to update back, this function drops the reference 6962 * to the block. 6963 * 6964 * NOTE: return value 1 means we should stop walking down. 6965 */ 6966 static noinline int do_walk_down(struct btrfs_trans_handle *trans, 6967 struct btrfs_root *root, 6968 struct btrfs_path *path, 6969 struct walk_control *wc, int *lookup_info) 6970 { 6971 u64 bytenr; 6972 u64 generation; 6973 u64 parent; 6974 u32 blocksize; 6975 struct btrfs_key key; 6976 struct extent_buffer *next; 6977 int level = wc->level; 6978 int reada = 0; 6979 int ret = 0; 6980 6981 generation = btrfs_node_ptr_generation(path->nodes[level], 6982 path->slots[level]); 6983 /* 6984 * if the lower level block was created before the snapshot 6985 * was created, we know there is no need to update back refs 6986 * for the subtree 6987 */ 6988 if (wc->stage == UPDATE_BACKREF && 6989 generation <= root->root_key.offset) { 6990 *lookup_info = 1; 6991 return 1; 6992 } 6993 6994 bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]); 6995 blocksize = btrfs_level_size(root, level - 1); 6996 6997 next = btrfs_find_tree_block(root, bytenr, blocksize); 6998 if (!next) { 6999 next = btrfs_find_create_tree_block(root, bytenr, blocksize); 7000 if (!next) 7001 return -ENOMEM; 7002 reada = 1; 7003 } 7004 btrfs_tree_lock(next); 7005 btrfs_set_lock_blocking(next); 7006 7007 ret = btrfs_lookup_extent_info(trans, root, bytenr, level - 1, 1, 7008 &wc->refs[level - 1], 7009 &wc->flags[level - 1]); 7010 if (ret < 0) { 7011 btrfs_tree_unlock(next); 7012 return ret; 7013 } 7014 7015 if (unlikely(wc->refs[level - 1] == 0)) { 7016 btrfs_err(root->fs_info, "Missing references."); 7017 BUG(); 7018 } 7019 *lookup_info = 0; 7020 7021 if (wc->stage == DROP_REFERENCE) { 7022 if (wc->refs[level - 1] > 1) { 7023 if (level == 1 && 7024 (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF)) 7025 goto skip; 7026 7027 if (!wc->update_ref || 7028 generation <= root->root_key.offset) 7029 goto skip; 7030 7031 btrfs_node_key_to_cpu(path->nodes[level], &key, 7032 path->slots[level]); 7033 ret = btrfs_comp_cpu_keys(&key, &wc->update_progress); 7034 if (ret < 0) 7035 goto skip; 7036 7037 wc->stage = UPDATE_BACKREF; 7038 wc->shared_level = level - 1; 7039 } 7040 } else { 7041 if (level == 1 && 7042 (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF)) 7043 goto skip; 7044 } 7045 7046 if (!btrfs_buffer_uptodate(next, generation, 0)) { 7047 btrfs_tree_unlock(next); 7048 free_extent_buffer(next); 7049 next = NULL; 7050 *lookup_info = 1; 7051 } 7052 7053 if (!next) { 7054 if (reada && level == 1) 7055 reada_walk_down(trans, root, wc, path); 7056 next = read_tree_block(root, bytenr, blocksize, generation); 7057 if (!next || !extent_buffer_uptodate(next)) { 7058 free_extent_buffer(next); 7059 return -EIO; 7060 } 7061 btrfs_tree_lock(next); 7062 btrfs_set_lock_blocking(next); 7063 } 7064 7065 level--; 7066 BUG_ON(level != btrfs_header_level(next)); 7067 path->nodes[level] = next; 7068 path->slots[level] = 0; 7069 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; 7070 wc->level = level; 7071 if (wc->level == 1) 7072 wc->reada_slot = 0; 7073 return 0; 7074 skip: 7075 wc->refs[level - 1] = 0; 7076 wc->flags[level - 1] = 0; 7077 if (wc->stage == DROP_REFERENCE) { 7078 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) { 7079 parent = path->nodes[level]->start; 7080 } else { 7081 BUG_ON(root->root_key.objectid != 7082 btrfs_header_owner(path->nodes[level])); 7083 parent = 0; 7084 } 7085 7086 ret = btrfs_free_extent(trans, root, bytenr, 
blocksize, parent, 7087 root->root_key.objectid, level - 1, 0, 0); 7088 BUG_ON(ret); /* -ENOMEM */ 7089 } 7090 btrfs_tree_unlock(next); 7091 free_extent_buffer(next); 7092 *lookup_info = 1; 7093 return 1; 7094 } 7095 7096 /* 7097 * helper to process tree block while walking up the tree. 7098 * 7099 * when wc->stage == DROP_REFERENCE, this function drops 7100 * reference count on the block. 7101 * 7102 * when wc->stage == UPDATE_BACKREF, this function changes 7103 * wc->stage back to DROP_REFERENCE if we changed wc->stage 7104 * to UPDATE_BACKREF previously while processing the block. 7105 * 7106 * NOTE: return value 1 means we should stop walking up. 7107 */ 7108 static noinline int walk_up_proc(struct btrfs_trans_handle *trans, 7109 struct btrfs_root *root, 7110 struct btrfs_path *path, 7111 struct walk_control *wc) 7112 { 7113 int ret; 7114 int level = wc->level; 7115 struct extent_buffer *eb = path->nodes[level]; 7116 u64 parent = 0; 7117 7118 if (wc->stage == UPDATE_BACKREF) { 7119 BUG_ON(wc->shared_level < level); 7120 if (level < wc->shared_level) 7121 goto out; 7122 7123 ret = find_next_key(path, level + 1, &wc->update_progress); 7124 if (ret > 0) 7125 wc->update_ref = 0; 7126 7127 wc->stage = DROP_REFERENCE; 7128 wc->shared_level = -1; 7129 path->slots[level] = 0; 7130 7131 /* 7132 * check reference count again if the block isn't locked. 7133 * we should start walking down the tree again if reference 7134 * count is one. 7135 */ 7136 if (!path->locks[level]) { 7137 BUG_ON(level == 0); 7138 btrfs_tree_lock(eb); 7139 btrfs_set_lock_blocking(eb); 7140 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; 7141 7142 ret = btrfs_lookup_extent_info(trans, root, 7143 eb->start, level, 1, 7144 &wc->refs[level], 7145 &wc->flags[level]); 7146 if (ret < 0) { 7147 btrfs_tree_unlock_rw(eb, path->locks[level]); 7148 path->locks[level] = 0; 7149 return ret; 7150 } 7151 BUG_ON(wc->refs[level] == 0); 7152 if (wc->refs[level] == 1) { 7153 btrfs_tree_unlock_rw(eb, path->locks[level]); 7154 path->locks[level] = 0; 7155 return 1; 7156 } 7157 } 7158 } 7159 7160 /* wc->stage == DROP_REFERENCE */ 7161 BUG_ON(wc->refs[level] > 1 && !path->locks[level]); 7162 7163 if (wc->refs[level] == 1) { 7164 if (level == 0) { 7165 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) 7166 ret = btrfs_dec_ref(trans, root, eb, 1, 7167 wc->for_reloc); 7168 else 7169 ret = btrfs_dec_ref(trans, root, eb, 0, 7170 wc->for_reloc); 7171 BUG_ON(ret); /* -ENOMEM */ 7172 } 7173 /* make block locked assertion in clean_tree_block happy */ 7174 if (!path->locks[level] && 7175 btrfs_header_generation(eb) == trans->transid) { 7176 btrfs_tree_lock(eb); 7177 btrfs_set_lock_blocking(eb); 7178 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; 7179 } 7180 clean_tree_block(trans, root, eb); 7181 } 7182 7183 if (eb == root->node) { 7184 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) 7185 parent = eb->start; 7186 else 7187 BUG_ON(root->root_key.objectid != 7188 btrfs_header_owner(eb)); 7189 } else { 7190 if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF) 7191 parent = path->nodes[level + 1]->start; 7192 else 7193 BUG_ON(root->root_key.objectid != 7194 btrfs_header_owner(path->nodes[level + 1])); 7195 } 7196 7197 btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1); 7198 out: 7199 wc->refs[level] = 0; 7200 wc->flags[level] = 0; 7201 return 0; 7202 } 7203 7204 static noinline int walk_down_tree(struct btrfs_trans_handle *trans, 7205 struct btrfs_root *root, 7206 struct btrfs_path *path, 7207 struct walk_control *wc) 7208 { 
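	/*
	 * Descend as far as we can: walk_down_proc() handles the block at
	 * the current level, do_walk_down() steps into the next child.
	 * We stop at a leaf, at the end of the current node, or when one
	 * of the helpers returns ret > 0.
	 */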
	int level = wc->level;
	int lookup_info = 1;
	int ret;

	while (level >= 0) {
		ret = walk_down_proc(trans, root, path, wc, lookup_info);
		if (ret > 0)
			break;

		if (level == 0)
			break;

		if (path->slots[level] >=
		    btrfs_header_nritems(path->nodes[level]))
			break;

		ret = do_walk_down(trans, root, path, wc, &lookup_info);
		if (ret > 0) {
			path->slots[level]++;
			continue;
		} else if (ret < 0)
			return ret;
		level = wc->level;
	}
	return 0;
}

static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root,
				 struct btrfs_path *path,
				 struct walk_control *wc, int max_level)
{
	int level = wc->level;
	int ret;

	path->slots[level] = btrfs_header_nritems(path->nodes[level]);
	while (level < max_level && path->nodes[level]) {
		wc->level = level;
		if (path->slots[level] + 1 <
		    btrfs_header_nritems(path->nodes[level])) {
			path->slots[level]++;
			return 0;
		} else {
			ret = walk_up_proc(trans, root, path, wc);
			if (ret > 0)
				return 0;

			if (path->locks[level]) {
				btrfs_tree_unlock_rw(path->nodes[level],
						     path->locks[level]);
				path->locks[level] = 0;
			}
			free_extent_buffer(path->nodes[level]);
			path->nodes[level] = NULL;
			level++;
		}
	}
	return 1;
}

/*
 * drop a subvolume tree.
 *
 * this function traverses the tree freeing any blocks that are only
 * referenced by the tree.
 *
 * when a shared tree block is found, this function decreases its
 * reference count by one. if update_ref is true, this function
 * also makes sure backrefs for the shared block and all lower level
 * blocks are properly updated.
7279 * 7280 * If called with for_reloc == 0, may exit early with -EAGAIN 7281 */ 7282 int btrfs_drop_snapshot(struct btrfs_root *root, 7283 struct btrfs_block_rsv *block_rsv, int update_ref, 7284 int for_reloc) 7285 { 7286 struct btrfs_path *path; 7287 struct btrfs_trans_handle *trans; 7288 struct btrfs_root *tree_root = root->fs_info->tree_root; 7289 struct btrfs_root_item *root_item = &root->root_item; 7290 struct walk_control *wc; 7291 struct btrfs_key key; 7292 int err = 0; 7293 int ret; 7294 int level; 7295 7296 path = btrfs_alloc_path(); 7297 if (!path) { 7298 err = -ENOMEM; 7299 goto out; 7300 } 7301 7302 wc = kzalloc(sizeof(*wc), GFP_NOFS); 7303 if (!wc) { 7304 btrfs_free_path(path); 7305 err = -ENOMEM; 7306 goto out; 7307 } 7308 7309 trans = btrfs_start_transaction(tree_root, 0); 7310 if (IS_ERR(trans)) { 7311 err = PTR_ERR(trans); 7312 goto out_free; 7313 } 7314 7315 if (block_rsv) 7316 trans->block_rsv = block_rsv; 7317 7318 if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) { 7319 level = btrfs_header_level(root->node); 7320 path->nodes[level] = btrfs_lock_root_node(root); 7321 btrfs_set_lock_blocking(path->nodes[level]); 7322 path->slots[level] = 0; 7323 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; 7324 memset(&wc->update_progress, 0, 7325 sizeof(wc->update_progress)); 7326 } else { 7327 btrfs_disk_key_to_cpu(&key, &root_item->drop_progress); 7328 memcpy(&wc->update_progress, &key, 7329 sizeof(wc->update_progress)); 7330 7331 level = root_item->drop_level; 7332 BUG_ON(level == 0); 7333 path->lowest_level = level; 7334 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 7335 path->lowest_level = 0; 7336 if (ret < 0) { 7337 err = ret; 7338 goto out_end_trans; 7339 } 7340 WARN_ON(ret > 0); 7341 7342 /* 7343 * unlock our path, this is safe because only this 7344 * function is allowed to delete this snapshot 7345 */ 7346 btrfs_unlock_up_safe(path, 0); 7347 7348 level = btrfs_header_level(root->node); 7349 while (1) { 7350 btrfs_tree_lock(path->nodes[level]); 7351 btrfs_set_lock_blocking(path->nodes[level]); 7352 7353 ret = btrfs_lookup_extent_info(trans, root, 7354 path->nodes[level]->start, 7355 level, 1, &wc->refs[level], 7356 &wc->flags[level]); 7357 if (ret < 0) { 7358 err = ret; 7359 goto out_end_trans; 7360 } 7361 BUG_ON(wc->refs[level] == 0); 7362 7363 if (level == root_item->drop_level) 7364 break; 7365 7366 btrfs_tree_unlock(path->nodes[level]); 7367 WARN_ON(wc->refs[level] != 1); 7368 level--; 7369 } 7370 } 7371 7372 wc->level = level; 7373 wc->shared_level = -1; 7374 wc->stage = DROP_REFERENCE; 7375 wc->update_ref = update_ref; 7376 wc->keep_locks = 0; 7377 wc->for_reloc = for_reloc; 7378 wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root); 7379 7380 while (1) { 7381 if (!for_reloc && btrfs_fs_closing(root->fs_info)) { 7382 pr_debug("btrfs: drop snapshot early exit\n"); 7383 err = -EAGAIN; 7384 goto out_end_trans; 7385 } 7386 7387 ret = walk_down_tree(trans, root, path, wc); 7388 if (ret < 0) { 7389 err = ret; 7390 break; 7391 } 7392 7393 ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL); 7394 if (ret < 0) { 7395 err = ret; 7396 break; 7397 } 7398 7399 if (ret > 0) { 7400 BUG_ON(wc->stage != DROP_REFERENCE); 7401 break; 7402 } 7403 7404 if (wc->stage == DROP_REFERENCE) { 7405 level = wc->level; 7406 btrfs_node_key(path->nodes[level], 7407 &root_item->drop_progress, 7408 path->slots[level]); 7409 root_item->drop_level = level; 7410 } 7411 7412 BUG_ON(wc->level == 0); 7413 if (btrfs_should_end_transaction(trans, tree_root)) { 7414 ret = 
btrfs_update_root(trans, tree_root, 7415 &root->root_key, 7416 root_item); 7417 if (ret) { 7418 btrfs_abort_transaction(trans, tree_root, ret); 7419 err = ret; 7420 goto out_end_trans; 7421 } 7422 7423 btrfs_end_transaction_throttle(trans, tree_root); 7424 trans = btrfs_start_transaction(tree_root, 0); 7425 if (IS_ERR(trans)) { 7426 err = PTR_ERR(trans); 7427 goto out_free; 7428 } 7429 if (block_rsv) 7430 trans->block_rsv = block_rsv; 7431 } 7432 } 7433 btrfs_release_path(path); 7434 if (err) 7435 goto out_end_trans; 7436 7437 ret = btrfs_del_root(trans, tree_root, &root->root_key); 7438 if (ret) { 7439 btrfs_abort_transaction(trans, tree_root, ret); 7440 goto out_end_trans; 7441 } 7442 7443 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) { 7444 ret = btrfs_find_last_root(tree_root, root->root_key.objectid, 7445 NULL, NULL); 7446 if (ret < 0) { 7447 btrfs_abort_transaction(trans, tree_root, ret); 7448 err = ret; 7449 goto out_end_trans; 7450 } else if (ret > 0) { 7451 /* if we fail to delete the orphan item this time 7452 * around, it'll get picked up the next time. 7453 * 7454 * The most common failure here is just -ENOENT. 7455 */ 7456 btrfs_del_orphan_item(trans, tree_root, 7457 root->root_key.objectid); 7458 } 7459 } 7460 7461 if (root->in_radix) { 7462 btrfs_free_fs_root(tree_root->fs_info, root); 7463 } else { 7464 free_extent_buffer(root->node); 7465 free_extent_buffer(root->commit_root); 7466 kfree(root); 7467 } 7468 out_end_trans: 7469 btrfs_end_transaction_throttle(trans, tree_root); 7470 out_free: 7471 kfree(wc); 7472 btrfs_free_path(path); 7473 out: 7474 if (err) 7475 btrfs_std_error(root->fs_info, err); 7476 return err; 7477 } 7478 7479 /* 7480 * drop subtree rooted at tree block 'node'. 7481 * 7482 * NOTE: this function will unlock and release tree block 'node' 7483 * only used by relocation code 7484 */ 7485 int btrfs_drop_subtree(struct btrfs_trans_handle *trans, 7486 struct btrfs_root *root, 7487 struct extent_buffer *node, 7488 struct extent_buffer *parent) 7489 { 7490 struct btrfs_path *path; 7491 struct walk_control *wc; 7492 int level; 7493 int parent_level; 7494 int ret = 0; 7495 int wret; 7496 7497 BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID); 7498 7499 path = btrfs_alloc_path(); 7500 if (!path) 7501 return -ENOMEM; 7502 7503 wc = kzalloc(sizeof(*wc), GFP_NOFS); 7504 if (!wc) { 7505 btrfs_free_path(path); 7506 return -ENOMEM; 7507 } 7508 7509 btrfs_assert_tree_locked(parent); 7510 parent_level = btrfs_header_level(parent); 7511 extent_buffer_get(parent); 7512 path->nodes[parent_level] = parent; 7513 path->slots[parent_level] = btrfs_header_nritems(parent); 7514 7515 btrfs_assert_tree_locked(node); 7516 level = btrfs_header_level(node); 7517 path->nodes[level] = node; 7518 path->slots[level] = 0; 7519 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; 7520 7521 wc->refs[parent_level] = 1; 7522 wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF; 7523 wc->level = level; 7524 wc->shared_level = -1; 7525 wc->stage = DROP_REFERENCE; 7526 wc->update_ref = 0; 7527 wc->keep_locks = 1; 7528 wc->for_reloc = 1; 7529 wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root); 7530 7531 while (1) { 7532 wret = walk_down_tree(trans, root, path, wc); 7533 if (wret < 0) { 7534 ret = wret; 7535 break; 7536 } 7537 7538 wret = walk_up_tree(trans, root, path, wc, parent_level); 7539 if (wret < 0) 7540 ret = wret; 7541 if (wret != 0) 7542 break; 7543 } 7544 7545 kfree(wc); 7546 btrfs_free_path(path); 7547 return ret; 7548 } 7549 7550 static u64 
update_block_group_flags(struct btrfs_root *root, u64 flags) 7551 { 7552 u64 num_devices; 7553 u64 stripped; 7554 7555 /* 7556 * if restripe for this chunk_type is on pick target profile and 7557 * return, otherwise do the usual balance 7558 */ 7559 stripped = get_restripe_target(root->fs_info, flags); 7560 if (stripped) 7561 return extended_to_chunk(stripped); 7562 7563 /* 7564 * we add in the count of missing devices because we want 7565 * to make sure that any RAID levels on a degraded FS 7566 * continue to be honored. 7567 */ 7568 num_devices = root->fs_info->fs_devices->rw_devices + 7569 root->fs_info->fs_devices->missing_devices; 7570 7571 stripped = BTRFS_BLOCK_GROUP_RAID0 | 7572 BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6 | 7573 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10; 7574 7575 if (num_devices == 1) { 7576 stripped |= BTRFS_BLOCK_GROUP_DUP; 7577 stripped = flags & ~stripped; 7578 7579 /* turn raid0 into single device chunks */ 7580 if (flags & BTRFS_BLOCK_GROUP_RAID0) 7581 return stripped; 7582 7583 /* turn mirroring into duplication */ 7584 if (flags & (BTRFS_BLOCK_GROUP_RAID1 | 7585 BTRFS_BLOCK_GROUP_RAID10)) 7586 return stripped | BTRFS_BLOCK_GROUP_DUP; 7587 } else { 7588 /* they already had raid on here, just return */ 7589 if (flags & stripped) 7590 return flags; 7591 7592 stripped |= BTRFS_BLOCK_GROUP_DUP; 7593 stripped = flags & ~stripped; 7594 7595 /* switch duplicated blocks with raid1 */ 7596 if (flags & BTRFS_BLOCK_GROUP_DUP) 7597 return stripped | BTRFS_BLOCK_GROUP_RAID1; 7598 7599 /* this is drive concat, leave it alone */ 7600 } 7601 7602 return flags; 7603 } 7604 7605 static int set_block_group_ro(struct btrfs_block_group_cache *cache, int force) 7606 { 7607 struct btrfs_space_info *sinfo = cache->space_info; 7608 u64 num_bytes; 7609 u64 min_allocable_bytes; 7610 int ret = -ENOSPC; 7611 7612 7613 /* 7614 * We need some metadata space and system metadata space for 7615 * allocating chunks in some corner cases until we force to set 7616 * it to be readonly. 
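 * That is why min_allocable_bytes below keeps a 1MB cushion for
 * metadata/system space infos unless the caller passes force.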
7617 */ 7618 if ((sinfo->flags & 7619 (BTRFS_BLOCK_GROUP_SYSTEM | BTRFS_BLOCK_GROUP_METADATA)) && 7620 !force) 7621 min_allocable_bytes = 1 * 1024 * 1024; 7622 else 7623 min_allocable_bytes = 0; 7624 7625 spin_lock(&sinfo->lock); 7626 spin_lock(&cache->lock); 7627 7628 if (cache->ro) { 7629 ret = 0; 7630 goto out; 7631 } 7632 7633 num_bytes = cache->key.offset - cache->reserved - cache->pinned - 7634 cache->bytes_super - btrfs_block_group_used(&cache->item); 7635 7636 if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned + 7637 sinfo->bytes_may_use + sinfo->bytes_readonly + num_bytes + 7638 min_allocable_bytes <= sinfo->total_bytes) { 7639 sinfo->bytes_readonly += num_bytes; 7640 cache->ro = 1; 7641 ret = 0; 7642 } 7643 out: 7644 spin_unlock(&cache->lock); 7645 spin_unlock(&sinfo->lock); 7646 return ret; 7647 } 7648 7649 int btrfs_set_block_group_ro(struct btrfs_root *root, 7650 struct btrfs_block_group_cache *cache) 7651 7652 { 7653 struct btrfs_trans_handle *trans; 7654 u64 alloc_flags; 7655 int ret; 7656 7657 BUG_ON(cache->ro); 7658 7659 trans = btrfs_join_transaction(root); 7660 if (IS_ERR(trans)) 7661 return PTR_ERR(trans); 7662 7663 alloc_flags = update_block_group_flags(root, cache->flags); 7664 if (alloc_flags != cache->flags) { 7665 ret = do_chunk_alloc(trans, root, alloc_flags, 7666 CHUNK_ALLOC_FORCE); 7667 if (ret < 0) 7668 goto out; 7669 } 7670 7671 ret = set_block_group_ro(cache, 0); 7672 if (!ret) 7673 goto out; 7674 alloc_flags = get_alloc_profile(root, cache->space_info->flags); 7675 ret = do_chunk_alloc(trans, root, alloc_flags, 7676 CHUNK_ALLOC_FORCE); 7677 if (ret < 0) 7678 goto out; 7679 ret = set_block_group_ro(cache, 0); 7680 out: 7681 btrfs_end_transaction(trans, root); 7682 return ret; 7683 } 7684 7685 int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, 7686 struct btrfs_root *root, u64 type) 7687 { 7688 u64 alloc_flags = get_alloc_profile(root, type); 7689 return do_chunk_alloc(trans, root, alloc_flags, 7690 CHUNK_ALLOC_FORCE); 7691 } 7692 7693 /* 7694 * helper to account the unused space of all the readonly block group in the 7695 * list. takes mirrors into account. 7696 */ 7697 static u64 __btrfs_get_ro_block_group_free_space(struct list_head *groups_list) 7698 { 7699 struct btrfs_block_group_cache *block_group; 7700 u64 free_bytes = 0; 7701 int factor; 7702 7703 list_for_each_entry(block_group, groups_list, list) { 7704 spin_lock(&block_group->lock); 7705 7706 if (!block_group->ro) { 7707 spin_unlock(&block_group->lock); 7708 continue; 7709 } 7710 7711 if (block_group->flags & (BTRFS_BLOCK_GROUP_RAID1 | 7712 BTRFS_BLOCK_GROUP_RAID10 | 7713 BTRFS_BLOCK_GROUP_DUP)) 7714 factor = 2; 7715 else 7716 factor = 1; 7717 7718 free_bytes += (block_group->key.offset - 7719 btrfs_block_group_used(&block_group->item)) * 7720 factor; 7721 7722 spin_unlock(&block_group->lock); 7723 } 7724 7725 return free_bytes; 7726 } 7727 7728 /* 7729 * helper to account the unused space of all the readonly block group in the 7730 * space_info. takes mirrors into account. 
int btrfs_set_block_group_ro(struct btrfs_root *root,
			     struct btrfs_block_group_cache *cache)
{
	struct btrfs_trans_handle *trans;
	u64 alloc_flags;
	int ret;

	BUG_ON(cache->ro);

	trans = btrfs_join_transaction(root);
	if (IS_ERR(trans))
		return PTR_ERR(trans);

	alloc_flags = update_block_group_flags(root, cache->flags);
	if (alloc_flags != cache->flags) {
		ret = do_chunk_alloc(trans, root, alloc_flags,
				     CHUNK_ALLOC_FORCE);
		if (ret < 0)
			goto out;
	}

	ret = set_block_group_ro(cache, 0);
	if (!ret)
		goto out;
	alloc_flags = get_alloc_profile(root, cache->space_info->flags);
	ret = do_chunk_alloc(trans, root, alloc_flags,
			     CHUNK_ALLOC_FORCE);
	if (ret < 0)
		goto out;
	ret = set_block_group_ro(cache, 0);
out:
	btrfs_end_transaction(trans, root);
	return ret;
}

int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans,
			    struct btrfs_root *root, u64 type)
{
	u64 alloc_flags = get_alloc_profile(root, type);
	return do_chunk_alloc(trans, root, alloc_flags,
			      CHUNK_ALLOC_FORCE);
}

/*
 * helper to account for the unused space of all the read-only block
 * groups in the list.  Takes mirrors into account.
 */
static u64 __btrfs_get_ro_block_group_free_space(struct list_head *groups_list)
{
	struct btrfs_block_group_cache *block_group;
	u64 free_bytes = 0;
	int factor;

	list_for_each_entry(block_group, groups_list, list) {
		spin_lock(&block_group->lock);

		if (!block_group->ro) {
			spin_unlock(&block_group->lock);
			continue;
		}

		if (block_group->flags & (BTRFS_BLOCK_GROUP_RAID1 |
					  BTRFS_BLOCK_GROUP_RAID10 |
					  BTRFS_BLOCK_GROUP_DUP))
			factor = 2;
		else
			factor = 1;

		free_bytes += (block_group->key.offset -
			       btrfs_block_group_used(&block_group->item)) *
			       factor;

		spin_unlock(&block_group->lock);
	}

	return free_bytes;
}

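/*
 * Illustrative example (not part of the original source): a read-only
 * RAID1 block group with key.offset == 1GiB and 300MiB used contributes
 * (1GiB - 300MiB) * 2 = 1448MiB via the factor above, because every
 * logical byte in a mirrored or DUP group occupies two bytes on disk.
 * A single or RAID0 group of the same size would contribute 724MiB.
 */
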
/*
 * helper to account for the unused space of all the read-only block
 * groups in the space_info.  Takes mirrors into account.
 */
u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
{
	int i;
	u64 free_bytes = 0;

	spin_lock(&sinfo->lock);

	for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
		if (!list_empty(&sinfo->block_groups[i]))
			free_bytes += __btrfs_get_ro_block_group_free_space(
						&sinfo->block_groups[i]);

	spin_unlock(&sinfo->lock);

	return free_bytes;
}

void btrfs_set_block_group_rw(struct btrfs_root *root,
			      struct btrfs_block_group_cache *cache)
{
	struct btrfs_space_info *sinfo = cache->space_info;
	u64 num_bytes;

	BUG_ON(!cache->ro);

	spin_lock(&sinfo->lock);
	spin_lock(&cache->lock);
	num_bytes = cache->key.offset - cache->reserved - cache->pinned -
		    cache->bytes_super - btrfs_block_group_used(&cache->item);
	sinfo->bytes_readonly -= num_bytes;
	cache->ro = 0;
	spin_unlock(&cache->lock);
	spin_unlock(&sinfo->lock);
}

/*
 * checks to see if it's even possible to relocate this block group.
 *
 * @return - -1 if it's not a good idea to relocate this block group, 0 if
 * it's ok to go ahead and try.
 */
int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
{
	struct btrfs_block_group_cache *block_group;
	struct btrfs_space_info *space_info;
	struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
	struct btrfs_device *device;
	u64 min_free;
	u64 dev_min = 1;
	u64 dev_nr = 0;
	u64 target;
	int index;
	int full = 0;
	int ret = 0;

	block_group = btrfs_lookup_block_group(root->fs_info, bytenr);

	/* odd, couldn't find the block group, leave it alone */
	if (!block_group)
		return -1;

	min_free = btrfs_block_group_used(&block_group->item);

	/* no bytes used, we're good */
	if (!min_free)
		goto out;

	space_info = block_group->space_info;
	spin_lock(&space_info->lock);

	full = space_info->full;

	/*
	 * if this is the last block group we have in this space, we can't
	 * relocate it unless we're able to allocate a new chunk below.
	 *
	 * Otherwise, we need to make sure we have room in the space to handle
	 * all of the extents from this block group.  If we can, we're good.
	 */
	if ((space_info->total_bytes != block_group->key.offset) &&
	    (space_info->bytes_used + space_info->bytes_reserved +
	     space_info->bytes_pinned + space_info->bytes_readonly +
	     min_free < space_info->total_bytes)) {
		spin_unlock(&space_info->lock);
		goto out;
	}
	spin_unlock(&space_info->lock);

	/*
	 * ok, we don't have enough space, but maybe we have free space on our
	 * devices to allocate new chunks for relocation, so loop through our
	 * alloc devices and guess if we have enough space.  If this block
	 * group is going to be restriped, run checks against the target
	 * profile instead of the current one.
	 */
	ret = -1;

	/*
	 * index:
	 *	0: raid10
	 *	1: raid1
	 *	2: dup
	 *	3: raid0
	 *	4: single
	 */
	target = get_restripe_target(root->fs_info, block_group->flags);
	if (target) {
		index = __get_raid_index(extended_to_chunk(target));
	} else {
		/*
		 * this is just a balance, so if we were marked as full
		 * we know there is no space for a new chunk
		 */
		if (full)
			goto out;

		index = get_block_group_index(block_group);
	}

	if (index == BTRFS_RAID_RAID10) {
		dev_min = 4;
		/* Divide by 2 */
		min_free >>= 1;
	} else if (index == BTRFS_RAID_RAID1) {
		dev_min = 2;
	} else if (index == BTRFS_RAID_DUP) {
		/* Multiply by 2 */
		min_free <<= 1;
	} else if (index == BTRFS_RAID_RAID0) {
		dev_min = fs_devices->rw_devices;
		do_div(min_free, dev_min);
	}

	mutex_lock(&root->fs_info->chunk_mutex);
	list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
		u64 dev_offset;

		/*
		 * check to make sure we can actually find a chunk with enough
		 * space to fit our block group in.
		 */
		if (device->total_bytes > device->bytes_used + min_free &&
		    !device->is_tgtdev_for_dev_replace) {
			ret = find_free_dev_extent(device, min_free,
						   &dev_offset, NULL);
			if (!ret)
				dev_nr++;

			if (dev_nr >= dev_min)
				break;

			ret = -1;
		}
	}
	mutex_unlock(&root->fs_info->chunk_mutex);
out:
	btrfs_put_block_group(block_group);
	return ret;
}

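/*
 * Worked example for the per-profile check in btrfs_can_relocate()
 * (illustrative, not part of the original source): relocating a RAID10
 * block group with 2GiB used needs at least dev_min = 4 writable devices,
 * each with room for min_free = 2GiB / 2 = 1GiB, since the data is striped
 * across two mirrored pairs.  A DUP group with 2GiB used instead needs one
 * device with 4GiB free, and RAID0 spreads min_free evenly over all
 * rw_devices.
 */
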
static int find_first_block_group(struct btrfs_root *root,
				  struct btrfs_path *path,
				  struct btrfs_key *key)
{
	int ret = 0;
	struct btrfs_key found_key;
	struct extent_buffer *leaf;
	int slot;

	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
	if (ret < 0)
		goto out;

	while (1) {
		slot = path->slots[0];
		leaf = path->nodes[0];
		if (slot >= btrfs_header_nritems(leaf)) {
			ret = btrfs_next_leaf(root, path);
			if (ret == 0)
				continue;
			if (ret < 0)
				goto out;
			break;
		}
		btrfs_item_key_to_cpu(leaf, &found_key, slot);

		if (found_key.objectid >= key->objectid &&
		    found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
			ret = 0;
			goto out;
		}
		path->slots[0]++;
	}
out:
	return ret;
}

void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
{
	struct btrfs_block_group_cache *block_group;
	u64 last = 0;

	while (1) {
		struct inode *inode;

		block_group = btrfs_lookup_first_block_group(info, last);
		while (block_group) {
			spin_lock(&block_group->lock);
			if (block_group->iref)
				break;
			spin_unlock(&block_group->lock);
			block_group = next_block_group(info->tree_root,
						       block_group);
		}
		if (!block_group) {
			if (last == 0)
				break;
			last = 0;
			continue;
		}

		inode = block_group->inode;
		block_group->iref = 0;
		block_group->inode = NULL;
		spin_unlock(&block_group->lock);
		iput(inode);
		last = block_group->key.objectid + block_group->key.offset;
		btrfs_put_block_group(block_group);
	}
}

int btrfs_free_block_groups(struct btrfs_fs_info *info)
{
	struct btrfs_block_group_cache *block_group;
	struct btrfs_space_info *space_info;
	struct btrfs_caching_control *caching_ctl;
	struct rb_node *n;

	down_write(&info->extent_commit_sem);
	while (!list_empty(&info->caching_block_groups)) {
		caching_ctl = list_entry(info->caching_block_groups.next,
					 struct btrfs_caching_control, list);
		list_del(&caching_ctl->list);
		put_caching_control(caching_ctl);
	}
	up_write(&info->extent_commit_sem);

	spin_lock(&info->block_group_cache_lock);
	while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
		block_group = rb_entry(n, struct btrfs_block_group_cache,
				       cache_node);
		rb_erase(&block_group->cache_node,
			 &info->block_group_cache_tree);
		spin_unlock(&info->block_group_cache_lock);

		down_write(&block_group->space_info->groups_sem);
		list_del(&block_group->list);
		up_write(&block_group->space_info->groups_sem);

		if (block_group->cached == BTRFS_CACHE_STARTED)
			wait_block_group_cache_done(block_group);

		/*
		 * We haven't cached this block group, which means we could
		 * possibly have excluded extents on this block group.
		 */
		if (block_group->cached == BTRFS_CACHE_NO)
			free_excluded_extents(info->extent_root, block_group);

		btrfs_remove_free_space_cache(block_group);
		btrfs_put_block_group(block_group);

		spin_lock(&info->block_group_cache_lock);
	}
	spin_unlock(&info->block_group_cache_lock);

	/* now that all the block groups are freed, go through and
	 * free all the space_info structs.  This is only called during
	 * the final stages of unmount, and so we know nobody is
	 * using them.  We call synchronize_rcu() once before we start,
	 * just to be on the safe side.
	 */
	synchronize_rcu();

	release_global_block_rsv(info);

	while (!list_empty(&info->space_info)) {
		space_info = list_entry(info->space_info.next,
					struct btrfs_space_info,
					list);
		if (btrfs_test_opt(info->tree_root, ENOSPC_DEBUG)) {
			if (space_info->bytes_pinned > 0 ||
			    space_info->bytes_reserved > 0 ||
			    space_info->bytes_may_use > 0) {
				WARN_ON(1);
				dump_space_info(space_info, 0, 0);
			}
		}
		list_del(&space_info->list);
		kfree(space_info);
	}
	return 0;
}

static void __link_block_group(struct btrfs_space_info *space_info,
			       struct btrfs_block_group_cache *cache)
{
	int index = get_block_group_index(cache);

	down_write(&space_info->groups_sem);
	list_add_tail(&cache->list, &space_info->block_groups[index]);
	up_write(&space_info->groups_sem);
}

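/*
 * Note (added for clarity, not in the original source): the index used by
 * __link_block_group() comes from get_block_group_index() and follows the
 * same ordering as the comment in btrfs_can_relocate() above -- 0: raid10,
 * 1: raid1, 2: dup, 3: raid0, 4: single -- so each space_info keeps one
 * block group list per RAID profile.
 */
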
int btrfs_read_block_groups(struct btrfs_root *root)
{
	struct btrfs_path *path;
	int ret;
	struct btrfs_block_group_cache *cache;
	struct btrfs_fs_info *info = root->fs_info;
	struct btrfs_space_info *space_info;
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct extent_buffer *leaf;
	int need_clear = 0;
	u64 cache_gen;

	root = info->extent_root;
	key.objectid = 0;
	key.offset = 0;
	btrfs_set_key_type(&key, BTRFS_BLOCK_GROUP_ITEM_KEY);
	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	path->reada = 1;

	cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy);
	if (btrfs_test_opt(root, SPACE_CACHE) &&
	    btrfs_super_generation(root->fs_info->super_copy) != cache_gen)
		need_clear = 1;
	if (btrfs_test_opt(root, CLEAR_CACHE))
		need_clear = 1;

	while (1) {
		ret = find_first_block_group(root, path, &key);
		if (ret > 0)
			break;

		if (ret != 0)
			goto error;
		leaf = path->nodes[0];
		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
		cache = kzalloc(sizeof(*cache), GFP_NOFS);
		if (!cache) {
			ret = -ENOMEM;
			goto error;
		}
		cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
						GFP_NOFS);
		if (!cache->free_space_ctl) {
			kfree(cache);
			ret = -ENOMEM;
			goto error;
		}

		atomic_set(&cache->count, 1);
		spin_lock_init(&cache->lock);
		cache->fs_info = info;
		INIT_LIST_HEAD(&cache->list);
		INIT_LIST_HEAD(&cache->cluster_list);

		if (need_clear) {
			/*
			 * When we mount with an old space cache, we need to
			 * set BTRFS_DC_CLEAR and set the dirty flag.
			 *
			 * a) Setting 'BTRFS_DC_CLEAR' makes sure that we
			 *    truncate the old free space cache inode and
			 *    set up a new one.
			 * b) Setting the 'dirty' flag makes sure that we flush
			 *    the new space cache info onto disk.
			 */
			cache->disk_cache_state = BTRFS_DC_CLEAR;
			if (btrfs_test_opt(root, SPACE_CACHE))
				cache->dirty = 1;
		}

		read_extent_buffer(leaf, &cache->item,
				   btrfs_item_ptr_offset(leaf, path->slots[0]),
				   sizeof(cache->item));
		memcpy(&cache->key, &found_key, sizeof(found_key));

		key.objectid = found_key.objectid + found_key.offset;
		btrfs_release_path(path);
		cache->flags = btrfs_block_group_flags(&cache->item);
		cache->sectorsize = root->sectorsize;
		cache->full_stripe_len = btrfs_full_stripe_len(root,
					       &root->fs_info->mapping_tree,
					       found_key.objectid);
		btrfs_init_free_space_ctl(cache);

		/*
		 * We need to exclude the super stripes now so that the space
		 * info has super bytes accounted for, otherwise we'll think
		 * we have more space than we actually do.
		 */
		ret = exclude_super_stripes(root, cache);
		if (ret) {
			/*
			 * We may have excluded something, so call this just in
			 * case.
			 */
			free_excluded_extents(root, cache);
			kfree(cache->free_space_ctl);
			kfree(cache);
			goto error;
		}

		/*
		 * check for two cases, either we are full, and therefore
		 * don't need to bother with the caching work since we won't
		 * find any space, or we are empty, and we can just add all
		 * the space in and be done with it.  This saves us a lot of
		 * time, particularly in the full case.
		 */
		if (found_key.offset == btrfs_block_group_used(&cache->item)) {
			cache->last_byte_to_unpin = (u64)-1;
			cache->cached = BTRFS_CACHE_FINISHED;
			free_excluded_extents(root, cache);
		} else if (btrfs_block_group_used(&cache->item) == 0) {
			cache->last_byte_to_unpin = (u64)-1;
			cache->cached = BTRFS_CACHE_FINISHED;
			add_new_free_space(cache, root->fs_info,
					   found_key.objectid,
					   found_key.objectid +
					   found_key.offset);
			free_excluded_extents(root, cache);
		}

		ret = btrfs_add_block_group_cache(root->fs_info, cache);
		if (ret) {
			btrfs_remove_free_space_cache(cache);
			btrfs_put_block_group(cache);
			goto error;
		}

		ret = update_space_info(info, cache->flags, found_key.offset,
					btrfs_block_group_used(&cache->item),
					&space_info);
		if (ret) {
			btrfs_remove_free_space_cache(cache);
			spin_lock(&info->block_group_cache_lock);
			rb_erase(&cache->cache_node,
				 &info->block_group_cache_tree);
			spin_unlock(&info->block_group_cache_lock);
			btrfs_put_block_group(cache);
			goto error;
		}

		cache->space_info = space_info;
		spin_lock(&cache->space_info->lock);
		cache->space_info->bytes_readonly += cache->bytes_super;
		spin_unlock(&cache->space_info->lock);

		__link_block_group(space_info, cache);

		set_avail_alloc_bits(root->fs_info, cache->flags);
		if (btrfs_chunk_readonly(root, cache->key.objectid))
			set_block_group_ro(cache, 1);
	}

	list_for_each_entry_rcu(space_info, &root->fs_info->space_info, list) {
		if (!(get_alloc_profile(root, space_info->flags) &
		      (BTRFS_BLOCK_GROUP_RAID10 |
		       BTRFS_BLOCK_GROUP_RAID1 |
		       BTRFS_BLOCK_GROUP_RAID5 |
		       BTRFS_BLOCK_GROUP_RAID6 |
		       BTRFS_BLOCK_GROUP_DUP)))
			continue;
		/*
		 * avoid allocating from un-mirrored block groups if there are
		 * mirrored block groups.
		 */
		list_for_each_entry(cache, &space_info->block_groups[3], list)
			set_block_group_ro(cache, 1);
		list_for_each_entry(cache, &space_info->block_groups[4], list)
			set_block_group_ro(cache, 1);
	}

	init_global_block_rsv(info);
	ret = 0;
error:
	btrfs_free_path(path);
	return ret;
}

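/*
 * Note (added for clarity, not in the original source): the two shortcuts
 * in btrfs_read_block_groups() above skip the caching kthread entirely.
 * For example, a completely unused block group (used == 0) has its whole
 * [objectid, objectid + offset) range added as free space up front and is
 * marked BTRFS_CACHE_FINISHED, while a completely full group (used ==
 * key.offset) has no free space to find, so neither ever needs to scan the
 * extent tree.
 */
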
void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
				       struct btrfs_root *root)
{
	struct btrfs_block_group_cache *block_group, *tmp;
	struct btrfs_root *extent_root = root->fs_info->extent_root;
	struct btrfs_block_group_item item;
	struct btrfs_key key;
	int ret = 0;

	list_for_each_entry_safe(block_group, tmp, &trans->new_bgs,
				 new_bg_list) {
		list_del_init(&block_group->new_bg_list);

		if (ret)
			continue;

		spin_lock(&block_group->lock);
		memcpy(&item, &block_group->item, sizeof(item));
		memcpy(&key, &block_group->key, sizeof(key));
		spin_unlock(&block_group->lock);

		ret = btrfs_insert_item(trans, extent_root, &key, &item,
					sizeof(item));
		if (ret)
			btrfs_abort_transaction(trans, extent_root, ret);
	}
}

int btrfs_make_block_group(struct btrfs_trans_handle *trans,
			   struct btrfs_root *root, u64 bytes_used,
			   u64 type, u64 chunk_objectid, u64 chunk_offset,
			   u64 size)
{
	int ret;
	struct btrfs_root *extent_root;
	struct btrfs_block_group_cache *cache;

	extent_root = root->fs_info->extent_root;

	root->fs_info->last_trans_log_full_commit = trans->transid;

	cache = kzalloc(sizeof(*cache), GFP_NOFS);
	if (!cache)
		return -ENOMEM;
	cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
					GFP_NOFS);
	if (!cache->free_space_ctl) {
		kfree(cache);
		return -ENOMEM;
	}

	cache->key.objectid = chunk_offset;
	cache->key.offset = size;
	cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
	cache->sectorsize = root->sectorsize;
	cache->fs_info = root->fs_info;
	cache->full_stripe_len = btrfs_full_stripe_len(root,
					       &root->fs_info->mapping_tree,
					       chunk_offset);

	atomic_set(&cache->count, 1);
	spin_lock_init(&cache->lock);
	INIT_LIST_HEAD(&cache->list);
	INIT_LIST_HEAD(&cache->cluster_list);
	INIT_LIST_HEAD(&cache->new_bg_list);

	btrfs_init_free_space_ctl(cache);

	btrfs_set_block_group_used(&cache->item, bytes_used);
	btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid);
	cache->flags = type;
	btrfs_set_block_group_flags(&cache->item, type);

	cache->last_byte_to_unpin = (u64)-1;
	cache->cached = BTRFS_CACHE_FINISHED;
	ret = exclude_super_stripes(root, cache);
	if (ret) {
		/*
		 * We may have excluded something, so call this just in
		 * case.
		 */
		free_excluded_extents(root, cache);
		kfree(cache->free_space_ctl);
		kfree(cache);
		return ret;
	}

	add_new_free_space(cache, root->fs_info, chunk_offset,
			   chunk_offset + size);

	free_excluded_extents(root, cache);

	ret = btrfs_add_block_group_cache(root->fs_info, cache);
	if (ret) {
		btrfs_remove_free_space_cache(cache);
		btrfs_put_block_group(cache);
		return ret;
	}

	ret = update_space_info(root->fs_info, cache->flags, size, bytes_used,
				&cache->space_info);
	if (ret) {
		btrfs_remove_free_space_cache(cache);
		spin_lock(&root->fs_info->block_group_cache_lock);
		rb_erase(&cache->cache_node,
			 &root->fs_info->block_group_cache_tree);
		spin_unlock(&root->fs_info->block_group_cache_lock);
		btrfs_put_block_group(cache);
		return ret;
	}
	update_global_block_rsv(root->fs_info);

	spin_lock(&cache->space_info->lock);
	cache->space_info->bytes_readonly += cache->bytes_super;
	spin_unlock(&cache->space_info->lock);

	__link_block_group(cache->space_info, cache);

	list_add_tail(&cache->new_bg_list, &trans->new_bgs);

	set_avail_alloc_bits(extent_root->fs_info, type);

	return 0;
}

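/*
 * Note (added for clarity, not in the original source): block groups
 * created by btrfs_make_block_group() are only queued on trans->new_bgs
 * here; the corresponding BLOCK_GROUP_ITEMs reach the extent tree later,
 * when btrfs_create_pending_block_groups() walks that list and calls
 * btrfs_insert_item() for each entry.
 */
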
static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
{
	u64 extra_flags = chunk_to_extended(flags) &
				BTRFS_EXTENDED_PROFILE_MASK;

	write_seqlock(&fs_info->profiles_lock);
	if (flags & BTRFS_BLOCK_GROUP_DATA)
		fs_info->avail_data_alloc_bits &= ~extra_flags;
	if (flags & BTRFS_BLOCK_GROUP_METADATA)
		fs_info->avail_metadata_alloc_bits &= ~extra_flags;
	if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
		fs_info->avail_system_alloc_bits &= ~extra_flags;
	write_sequnlock(&fs_info->profiles_lock);
}

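/*
 * Note (added for clarity, not in the original source): this is the
 * counterpart of set_avail_alloc_bits() used above.  btrfs_remove_block_group()
 * below calls it once the last block group of a given profile is gone, so
 * avail_*_alloc_bits stop advertising a RAID profile that no longer has any
 * block groups.
 */
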
int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root, u64 group_start)
{
	struct btrfs_path *path;
	struct btrfs_block_group_cache *block_group;
	struct btrfs_free_cluster *cluster;
	struct btrfs_root *tree_root = root->fs_info->tree_root;
	struct btrfs_key key;
	struct inode *inode;
	int ret;
	int index;
	int factor;

	root = root->fs_info->extent_root;

	block_group = btrfs_lookup_block_group(root->fs_info, group_start);
	BUG_ON(!block_group);
	BUG_ON(!block_group->ro);

	/*
	 * Free the reserved super bytes from this block group before
	 * removing it.
	 */
	free_excluded_extents(root, block_group);

	memcpy(&key, &block_group->key, sizeof(key));
	index = get_block_group_index(block_group);
	if (block_group->flags & (BTRFS_BLOCK_GROUP_DUP |
				  BTRFS_BLOCK_GROUP_RAID1 |
				  BTRFS_BLOCK_GROUP_RAID10))
		factor = 2;
	else
		factor = 1;

	/* make sure this block group isn't part of an allocation cluster */
	cluster = &root->fs_info->data_alloc_cluster;
	spin_lock(&cluster->refill_lock);
	btrfs_return_cluster_to_free_space(block_group, cluster);
	spin_unlock(&cluster->refill_lock);

	/*
	 * make sure this block group isn't part of a metadata
	 * allocation cluster
	 */
	cluster = &root->fs_info->meta_alloc_cluster;
	spin_lock(&cluster->refill_lock);
	btrfs_return_cluster_to_free_space(block_group, cluster);
	spin_unlock(&cluster->refill_lock);

	path = btrfs_alloc_path();
	if (!path) {
		ret = -ENOMEM;
		goto out;
	}

	inode = lookup_free_space_inode(tree_root, block_group, path);
	if (!IS_ERR(inode)) {
		ret = btrfs_orphan_add(trans, inode);
		if (ret) {
			btrfs_add_delayed_iput(inode);
			goto out;
		}
		clear_nlink(inode);
		/* One for the block group's ref */
		spin_lock(&block_group->lock);
		if (block_group->iref) {
			block_group->iref = 0;
			block_group->inode = NULL;
			spin_unlock(&block_group->lock);
			iput(inode);
		} else {
			spin_unlock(&block_group->lock);
		}
		/* One for our lookup ref */
		btrfs_add_delayed_iput(inode);
	}

	key.objectid = BTRFS_FREE_SPACE_OBJECTID;
	key.offset = block_group->key.objectid;
	key.type = 0;

	ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
	if (ret < 0)
		goto out;
	if (ret > 0)
		btrfs_release_path(path);
	if (ret == 0) {
		ret = btrfs_del_item(trans, tree_root, path);
		if (ret)
			goto out;
		btrfs_release_path(path);
	}

	spin_lock(&root->fs_info->block_group_cache_lock);
	rb_erase(&block_group->cache_node,
		 &root->fs_info->block_group_cache_tree);

	if (root->fs_info->first_logical_byte == block_group->key.objectid)
		root->fs_info->first_logical_byte = (u64)-1;
	spin_unlock(&root->fs_info->block_group_cache_lock);

	down_write(&block_group->space_info->groups_sem);
	/*
	 * we must use list_del_init so people can check to see if they
	 * are still on the list after taking the semaphore
	 */
	list_del_init(&block_group->list);
	if (list_empty(&block_group->space_info->block_groups[index]))
		clear_avail_alloc_bits(root->fs_info, block_group->flags);
	up_write(&block_group->space_info->groups_sem);

	if (block_group->cached == BTRFS_CACHE_STARTED)
		wait_block_group_cache_done(block_group);

	btrfs_remove_free_space_cache(block_group);

	spin_lock(&block_group->space_info->lock);
	block_group->space_info->total_bytes -= block_group->key.offset;
	block_group->space_info->bytes_readonly -= block_group->key.offset;
	block_group->space_info->disk_total -= block_group->key.offset * factor;
	spin_unlock(&block_group->space_info->lock);

	memcpy(&key, &block_group->key, sizeof(key));

	btrfs_clear_space_info_full(root->fs_info);

	btrfs_put_block_group(block_group);
	btrfs_put_block_group(block_group);

	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret > 0)
		ret = -EIO;
	if (ret < 0)
		goto out;

	ret = btrfs_del_item(trans, root, path);
out:
	btrfs_free_path(path);
	return ret;
}

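/*
 * Note (added for clarity, not in the original source): the two
 * btrfs_put_block_group() calls near the end of btrfs_remove_block_group()
 * drop, respectively, the reference taken by btrfs_lookup_block_group() at
 * the top of the function and the reference that was held on behalf of the
 * block group cache rb-tree the group was just erased from.
 */
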
int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
{
	struct btrfs_space_info *space_info;
	struct btrfs_super_block *disk_super;
	u64 features;
	u64 flags;
	int mixed = 0;
	int ret;

	disk_super = fs_info->super_copy;
	if (!btrfs_super_root(disk_super))
		return 1;

	features = btrfs_super_incompat_flags(disk_super);
	if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
		mixed = 1;

	flags = BTRFS_BLOCK_GROUP_SYSTEM;
	ret = update_space_info(fs_info, flags, 0, 0, &space_info);
	if (ret)
		goto out;

	if (mixed) {
		flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA;
		ret = update_space_info(fs_info, flags, 0, 0, &space_info);
	} else {
		flags = BTRFS_BLOCK_GROUP_METADATA;
		ret = update_space_info(fs_info, flags, 0, 0, &space_info);
		if (ret)
			goto out;

		flags = BTRFS_BLOCK_GROUP_DATA;
		ret = update_space_info(fs_info, flags, 0, 0, &space_info);
	}
out:
	return ret;
}

int btrfs_error_unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
{
	return unpin_extent_range(root, start, end);
}

int btrfs_error_discard_extent(struct btrfs_root *root, u64 bytenr,
			       u64 num_bytes, u64 *actual_bytes)
{
	return btrfs_discard_extent(root, bytenr, num_bytes, actual_bytes);
}

int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_block_group_cache *cache = NULL;
	u64 group_trimmed;
	u64 start;
	u64 end;
	u64 trimmed = 0;
	u64 total_bytes = btrfs_super_total_bytes(fs_info->super_copy);
	int ret = 0;

	/*
	 * try to trim all FS space; our first block group may start at a
	 * non-zero offset.
	 */
	if (range->len == total_bytes)
		cache = btrfs_lookup_first_block_group(fs_info, range->start);
	else
		cache = btrfs_lookup_block_group(fs_info, range->start);

	while (cache) {
		if (cache->key.objectid >= (range->start + range->len)) {
			btrfs_put_block_group(cache);
			break;
		}

		start = max(range->start, cache->key.objectid);
		end = min(range->start + range->len,
			  cache->key.objectid + cache->key.offset);

		if (end - start >= range->minlen) {
			if (!block_group_cache_done(cache)) {
				ret = cache_block_group(cache, 0);
				if (!ret)
					wait_block_group_cache_done(cache);
			}
			ret = btrfs_trim_block_group(cache,
						     &group_trimmed,
						     start,
						     end,
						     range->minlen);

			trimmed += group_trimmed;
			if (ret) {
				btrfs_put_block_group(cache);
				break;
			}
		}

		cache = next_block_group(fs_info->tree_root, cache);
	}

	range->len = trimmed;
	return ret;
}
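
/*
 * Illustrative userspace counterpart (not part of the original source):
 * btrfs_trim_fs() sits behind the generic FITRIM ioctl, so a minimal
 * caller that trims a mounted filesystem could look like this sketch:
 *
 *	#include <fcntl.h>
 *	#include <limits.h>
 *	#include <stdio.h>
 *	#include <unistd.h>
 *	#include <sys/ioctl.h>
 *	#include <linux/fs.h>
 *
 *	int trim_mount(const char *mnt)
 *	{
 *		struct fstrim_range range = {
 *			.start = 0,
 *			.len = ULLONG_MAX,	/* trim everything */
 *			.minlen = 0,
 *		};
 *		int fd = open(mnt, O_RDONLY);
 *
 *		if (fd < 0)
 *			return -1;
 *		if (ioctl(fd, FITRIM, &range) < 0) {
 *			close(fd);
 *			return -1;
 *		}
 *		printf("trimmed %llu bytes\n",
 *		       (unsigned long long)range.len);
 *		close(fd);
 *		return 0;
 *	}
 *
 * On return, range.len has been updated by the loop above to the number
 * of bytes actually trimmed.
 */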