/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 021110-1307, USA.
 */
#include <linux/sched.h>
#include <linux/pagemap.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/sort.h>
#include <linux/rcupdate.h>
#include <linux/kthread.h>
#include <linux/slab.h>
#include <linux/ratelimit.h>
#include "compat.h"
#include "hash.h"
#include "ctree.h"
#include "disk-io.h"
#include "print-tree.h"
#include "transaction.h"
#include "volumes.h"
#include "raid56.h"
#include "locking.h"
#include "free-space-cache.h"
#include "math.h"

#undef SCRAMBLE_DELAYED_REFS

/*
 * control flags for do_chunk_alloc's force field
 * CHUNK_ALLOC_NO_FORCE means to only allocate a chunk
 * if we really need one.
 *
 * CHUNK_ALLOC_LIMITED means to only try and allocate one
 * if we have very few chunks already allocated.  This is
 * used as part of the clustering code to help make sure
 * we have a good pool of storage to cluster in, without
 * filling the FS with empty chunks
 *
 * CHUNK_ALLOC_FORCE means it must try to allocate one
 *
 */
enum {
	CHUNK_ALLOC_NO_FORCE = 0,
	CHUNK_ALLOC_LIMITED = 1,
	CHUNK_ALLOC_FORCE = 2,
};

/*
 * Control how reservations are dealt with.
 *
 * RESERVE_FREE - freeing a reservation.
 * RESERVE_ALLOC - allocating space and we need to update bytes_may_use for
 *   ENOSPC accounting
 * RESERVE_ALLOC_NO_ACCOUNT - allocating space and we should not update
 *   bytes_may_use as the ENOSPC accounting is done elsewhere
 */
enum {
	RESERVE_FREE = 0,
	RESERVE_ALLOC = 1,
	RESERVE_ALLOC_NO_ACCOUNT = 2,
};
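/*
 * Illustrative sketch only, not part of the original file: the intent of
 * the three CHUNK_ALLOC_* levels above, expressed as a hypothetical helper.
 * The real decision logic belongs to do_chunk_alloc() (prototyped just
 * below and defined later in this file); the arguments and thresholds here
 * are made up purely for illustration.
 */
static inline int example_chunk_alloc_force_level(bool must_have_space,
						  bool setting_up_cluster)
{
	if (must_have_space)
		return CHUNK_ALLOC_FORCE;	/* must try to allocate one */
	if (setting_up_cluster)
		return CHUNK_ALLOC_LIMITED;	/* keep a small pool of chunks */
	return CHUNK_ALLOC_NO_FORCE;		/* only if we really need one */
}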
static int update_block_group(struct btrfs_root *root,
			      u64 bytenr, u64 num_bytes, int alloc);
static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
				struct btrfs_root *root,
				u64 bytenr, u64 num_bytes, u64 parent,
				u64 root_objectid, u64 owner_objectid,
				u64 owner_offset, int refs_to_drop,
				struct btrfs_delayed_extent_op *extra_op);
static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
				    struct extent_buffer *leaf,
				    struct btrfs_extent_item *ei);
static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root,
				      u64 parent, u64 root_objectid,
				      u64 flags, u64 owner, u64 offset,
				      struct btrfs_key *ins, int ref_mod);
static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
				     struct btrfs_root *root,
				     u64 parent, u64 root_objectid,
				     u64 flags, struct btrfs_disk_key *key,
				     int level, struct btrfs_key *ins);
static int do_chunk_alloc(struct btrfs_trans_handle *trans,
			  struct btrfs_root *extent_root, u64 flags,
			  int force);
static int find_next_key(struct btrfs_path *path, int level,
			 struct btrfs_key *key);
static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
			    int dump_block_groups);
static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
				       u64 num_bytes, int reserve);
static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
			       u64 num_bytes);

static noinline int
block_group_cache_done(struct btrfs_block_group_cache *cache)
{
	smp_mb();
	return cache->cached == BTRFS_CACHE_FINISHED;
}

static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
{
	return (cache->flags & bits) == bits;
}

static void btrfs_get_block_group(struct btrfs_block_group_cache *cache)
{
	atomic_inc(&cache->count);
}

void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
{
	if (atomic_dec_and_test(&cache->count)) {
		WARN_ON(cache->pinned > 0);
		WARN_ON(cache->reserved > 0);
		kfree(cache->free_space_ctl);
		kfree(cache);
	}
}

/*
 * this adds the block group to the fs_info rb tree for the block group
 * cache
 */
static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
				struct btrfs_block_group_cache *block_group)
{
	struct rb_node **p;
	struct rb_node *parent = NULL;
	struct btrfs_block_group_cache *cache;

	spin_lock(&info->block_group_cache_lock);
	p = &info->block_group_cache_tree.rb_node;

	while (*p) {
		parent = *p;
		cache = rb_entry(parent, struct btrfs_block_group_cache,
				 cache_node);
		if (block_group->key.objectid < cache->key.objectid) {
			p = &(*p)->rb_left;
		} else if (block_group->key.objectid > cache->key.objectid) {
			p = &(*p)->rb_right;
		} else {
			spin_unlock(&info->block_group_cache_lock);
			return -EEXIST;
		}
	}

	rb_link_node(&block_group->cache_node, parent, p);
	rb_insert_color(&block_group->cache_node,
			&info->block_group_cache_tree);

	if (info->first_logical_byte > block_group->key.objectid)
		info->first_logical_byte =
block_group->key.objectid; 170 171 spin_unlock(&info->block_group_cache_lock); 172 173 return 0; 174 } 175 176 /* 177 * This will return the block group at or after bytenr if contains is 0, else 178 * it will return the block group that contains the bytenr 179 */ 180 static struct btrfs_block_group_cache * 181 block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr, 182 int contains) 183 { 184 struct btrfs_block_group_cache *cache, *ret = NULL; 185 struct rb_node *n; 186 u64 end, start; 187 188 spin_lock(&info->block_group_cache_lock); 189 n = info->block_group_cache_tree.rb_node; 190 191 while (n) { 192 cache = rb_entry(n, struct btrfs_block_group_cache, 193 cache_node); 194 end = cache->key.objectid + cache->key.offset - 1; 195 start = cache->key.objectid; 196 197 if (bytenr < start) { 198 if (!contains && (!ret || start < ret->key.objectid)) 199 ret = cache; 200 n = n->rb_left; 201 } else if (bytenr > start) { 202 if (contains && bytenr <= end) { 203 ret = cache; 204 break; 205 } 206 n = n->rb_right; 207 } else { 208 ret = cache; 209 break; 210 } 211 } 212 if (ret) { 213 btrfs_get_block_group(ret); 214 if (bytenr == 0 && info->first_logical_byte > ret->key.objectid) 215 info->first_logical_byte = ret->key.objectid; 216 } 217 spin_unlock(&info->block_group_cache_lock); 218 219 return ret; 220 } 221 222 static int add_excluded_extent(struct btrfs_root *root, 223 u64 start, u64 num_bytes) 224 { 225 u64 end = start + num_bytes - 1; 226 set_extent_bits(&root->fs_info->freed_extents[0], 227 start, end, EXTENT_UPTODATE, GFP_NOFS); 228 set_extent_bits(&root->fs_info->freed_extents[1], 229 start, end, EXTENT_UPTODATE, GFP_NOFS); 230 return 0; 231 } 232 233 static void free_excluded_extents(struct btrfs_root *root, 234 struct btrfs_block_group_cache *cache) 235 { 236 u64 start, end; 237 238 start = cache->key.objectid; 239 end = start + cache->key.offset - 1; 240 241 clear_extent_bits(&root->fs_info->freed_extents[0], 242 start, end, EXTENT_UPTODATE, GFP_NOFS); 243 clear_extent_bits(&root->fs_info->freed_extents[1], 244 start, end, EXTENT_UPTODATE, GFP_NOFS); 245 } 246 247 static int exclude_super_stripes(struct btrfs_root *root, 248 struct btrfs_block_group_cache *cache) 249 { 250 u64 bytenr; 251 u64 *logical; 252 int stripe_len; 253 int i, nr, ret; 254 255 if (cache->key.objectid < BTRFS_SUPER_INFO_OFFSET) { 256 stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->key.objectid; 257 cache->bytes_super += stripe_len; 258 ret = add_excluded_extent(root, cache->key.objectid, 259 stripe_len); 260 if (ret) 261 return ret; 262 } 263 264 for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { 265 bytenr = btrfs_sb_offset(i); 266 ret = btrfs_rmap_block(&root->fs_info->mapping_tree, 267 cache->key.objectid, bytenr, 268 0, &logical, &nr, &stripe_len); 269 if (ret) 270 return ret; 271 272 while (nr--) { 273 cache->bytes_super += stripe_len; 274 ret = add_excluded_extent(root, logical[nr], 275 stripe_len); 276 if (ret) { 277 kfree(logical); 278 return ret; 279 } 280 } 281 282 kfree(logical); 283 } 284 return 0; 285 } 286 287 static struct btrfs_caching_control * 288 get_caching_control(struct btrfs_block_group_cache *cache) 289 { 290 struct btrfs_caching_control *ctl; 291 292 spin_lock(&cache->lock); 293 if (cache->cached != BTRFS_CACHE_STARTED) { 294 spin_unlock(&cache->lock); 295 return NULL; 296 } 297 298 /* We're loading it the fast way, so we don't have a caching_ctl. 
*/ 299 if (!cache->caching_ctl) { 300 spin_unlock(&cache->lock); 301 return NULL; 302 } 303 304 ctl = cache->caching_ctl; 305 atomic_inc(&ctl->count); 306 spin_unlock(&cache->lock); 307 return ctl; 308 } 309 310 static void put_caching_control(struct btrfs_caching_control *ctl) 311 { 312 if (atomic_dec_and_test(&ctl->count)) 313 kfree(ctl); 314 } 315 316 /* 317 * this is only called by cache_block_group, since we could have freed extents 318 * we need to check the pinned_extents for any extents that can't be used yet 319 * since their free space will be released as soon as the transaction commits. 320 */ 321 static u64 add_new_free_space(struct btrfs_block_group_cache *block_group, 322 struct btrfs_fs_info *info, u64 start, u64 end) 323 { 324 u64 extent_start, extent_end, size, total_added = 0; 325 int ret; 326 327 while (start < end) { 328 ret = find_first_extent_bit(info->pinned_extents, start, 329 &extent_start, &extent_end, 330 EXTENT_DIRTY | EXTENT_UPTODATE, 331 NULL); 332 if (ret) 333 break; 334 335 if (extent_start <= start) { 336 start = extent_end + 1; 337 } else if (extent_start > start && extent_start < end) { 338 size = extent_start - start; 339 total_added += size; 340 ret = btrfs_add_free_space(block_group, start, 341 size); 342 BUG_ON(ret); /* -ENOMEM or logic error */ 343 start = extent_end + 1; 344 } else { 345 break; 346 } 347 } 348 349 if (start < end) { 350 size = end - start; 351 total_added += size; 352 ret = btrfs_add_free_space(block_group, start, size); 353 BUG_ON(ret); /* -ENOMEM or logic error */ 354 } 355 356 return total_added; 357 } 358 359 static noinline void caching_thread(struct btrfs_work *work) 360 { 361 struct btrfs_block_group_cache *block_group; 362 struct btrfs_fs_info *fs_info; 363 struct btrfs_caching_control *caching_ctl; 364 struct btrfs_root *extent_root; 365 struct btrfs_path *path; 366 struct extent_buffer *leaf; 367 struct btrfs_key key; 368 u64 total_found = 0; 369 u64 last = 0; 370 u32 nritems; 371 int ret = 0; 372 373 caching_ctl = container_of(work, struct btrfs_caching_control, work); 374 block_group = caching_ctl->block_group; 375 fs_info = block_group->fs_info; 376 extent_root = fs_info->extent_root; 377 378 path = btrfs_alloc_path(); 379 if (!path) 380 goto out; 381 382 last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET); 383 384 /* 385 * We don't want to deadlock with somebody trying to allocate a new 386 * extent for the extent root while also trying to search the extent 387 * root to add free space. 
So we skip locking and search the commit 388 * root, since its read-only 389 */ 390 path->skip_locking = 1; 391 path->search_commit_root = 1; 392 path->reada = 1; 393 394 key.objectid = last; 395 key.offset = 0; 396 key.type = BTRFS_EXTENT_ITEM_KEY; 397 again: 398 mutex_lock(&caching_ctl->mutex); 399 /* need to make sure the commit_root doesn't disappear */ 400 down_read(&fs_info->extent_commit_sem); 401 402 ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); 403 if (ret < 0) 404 goto err; 405 406 leaf = path->nodes[0]; 407 nritems = btrfs_header_nritems(leaf); 408 409 while (1) { 410 if (btrfs_fs_closing(fs_info) > 1) { 411 last = (u64)-1; 412 break; 413 } 414 415 if (path->slots[0] < nritems) { 416 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 417 } else { 418 ret = find_next_key(path, 0, &key); 419 if (ret) 420 break; 421 422 if (need_resched() || 423 btrfs_next_leaf(extent_root, path)) { 424 caching_ctl->progress = last; 425 btrfs_release_path(path); 426 up_read(&fs_info->extent_commit_sem); 427 mutex_unlock(&caching_ctl->mutex); 428 cond_resched(); 429 goto again; 430 } 431 leaf = path->nodes[0]; 432 nritems = btrfs_header_nritems(leaf); 433 continue; 434 } 435 436 if (key.objectid < block_group->key.objectid) { 437 path->slots[0]++; 438 continue; 439 } 440 441 if (key.objectid >= block_group->key.objectid + 442 block_group->key.offset) 443 break; 444 445 if (key.type == BTRFS_EXTENT_ITEM_KEY) { 446 total_found += add_new_free_space(block_group, 447 fs_info, last, 448 key.objectid); 449 last = key.objectid + key.offset; 450 451 if (total_found > (1024 * 1024 * 2)) { 452 total_found = 0; 453 wake_up(&caching_ctl->wait); 454 } 455 } 456 path->slots[0]++; 457 } 458 ret = 0; 459 460 total_found += add_new_free_space(block_group, fs_info, last, 461 block_group->key.objectid + 462 block_group->key.offset); 463 caching_ctl->progress = (u64)-1; 464 465 spin_lock(&block_group->lock); 466 block_group->caching_ctl = NULL; 467 block_group->cached = BTRFS_CACHE_FINISHED; 468 spin_unlock(&block_group->lock); 469 470 err: 471 btrfs_free_path(path); 472 up_read(&fs_info->extent_commit_sem); 473 474 free_excluded_extents(extent_root, block_group); 475 476 mutex_unlock(&caching_ctl->mutex); 477 out: 478 wake_up(&caching_ctl->wait); 479 480 put_caching_control(caching_ctl); 481 btrfs_put_block_group(block_group); 482 } 483 484 static int cache_block_group(struct btrfs_block_group_cache *cache, 485 int load_cache_only) 486 { 487 DEFINE_WAIT(wait); 488 struct btrfs_fs_info *fs_info = cache->fs_info; 489 struct btrfs_caching_control *caching_ctl; 490 int ret = 0; 491 492 caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS); 493 if (!caching_ctl) 494 return -ENOMEM; 495 496 INIT_LIST_HEAD(&caching_ctl->list); 497 mutex_init(&caching_ctl->mutex); 498 init_waitqueue_head(&caching_ctl->wait); 499 caching_ctl->block_group = cache; 500 caching_ctl->progress = cache->key.objectid; 501 atomic_set(&caching_ctl->count, 1); 502 caching_ctl->work.func = caching_thread; 503 504 spin_lock(&cache->lock); 505 /* 506 * This should be a rare occasion, but this could happen I think in the 507 * case where one thread starts to load the space cache info, and then 508 * some other thread starts a transaction commit which tries to do an 509 * allocation while the other thread is still loading the space cache 510 * info. 
The previous loop should have kept us from choosing this block 511 * group, but if we've moved to the state where we will wait on caching 512 * block groups we need to first check if we're doing a fast load here, 513 * so we can wait for it to finish, otherwise we could end up allocating 514 * from a block group who's cache gets evicted for one reason or 515 * another. 516 */ 517 while (cache->cached == BTRFS_CACHE_FAST) { 518 struct btrfs_caching_control *ctl; 519 520 ctl = cache->caching_ctl; 521 atomic_inc(&ctl->count); 522 prepare_to_wait(&ctl->wait, &wait, TASK_UNINTERRUPTIBLE); 523 spin_unlock(&cache->lock); 524 525 schedule(); 526 527 finish_wait(&ctl->wait, &wait); 528 put_caching_control(ctl); 529 spin_lock(&cache->lock); 530 } 531 532 if (cache->cached != BTRFS_CACHE_NO) { 533 spin_unlock(&cache->lock); 534 kfree(caching_ctl); 535 return 0; 536 } 537 WARN_ON(cache->caching_ctl); 538 cache->caching_ctl = caching_ctl; 539 cache->cached = BTRFS_CACHE_FAST; 540 spin_unlock(&cache->lock); 541 542 if (fs_info->mount_opt & BTRFS_MOUNT_SPACE_CACHE) { 543 ret = load_free_space_cache(fs_info, cache); 544 545 spin_lock(&cache->lock); 546 if (ret == 1) { 547 cache->caching_ctl = NULL; 548 cache->cached = BTRFS_CACHE_FINISHED; 549 cache->last_byte_to_unpin = (u64)-1; 550 } else { 551 if (load_cache_only) { 552 cache->caching_ctl = NULL; 553 cache->cached = BTRFS_CACHE_NO; 554 } else { 555 cache->cached = BTRFS_CACHE_STARTED; 556 } 557 } 558 spin_unlock(&cache->lock); 559 wake_up(&caching_ctl->wait); 560 if (ret == 1) { 561 put_caching_control(caching_ctl); 562 free_excluded_extents(fs_info->extent_root, cache); 563 return 0; 564 } 565 } else { 566 /* 567 * We are not going to do the fast caching, set cached to the 568 * appropriate value and wakeup any waiters. 
569 */ 570 spin_lock(&cache->lock); 571 if (load_cache_only) { 572 cache->caching_ctl = NULL; 573 cache->cached = BTRFS_CACHE_NO; 574 } else { 575 cache->cached = BTRFS_CACHE_STARTED; 576 } 577 spin_unlock(&cache->lock); 578 wake_up(&caching_ctl->wait); 579 } 580 581 if (load_cache_only) { 582 put_caching_control(caching_ctl); 583 return 0; 584 } 585 586 down_write(&fs_info->extent_commit_sem); 587 atomic_inc(&caching_ctl->count); 588 list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups); 589 up_write(&fs_info->extent_commit_sem); 590 591 btrfs_get_block_group(cache); 592 593 btrfs_queue_worker(&fs_info->caching_workers, &caching_ctl->work); 594 595 return ret; 596 } 597 598 /* 599 * return the block group that starts at or after bytenr 600 */ 601 static struct btrfs_block_group_cache * 602 btrfs_lookup_first_block_group(struct btrfs_fs_info *info, u64 bytenr) 603 { 604 struct btrfs_block_group_cache *cache; 605 606 cache = block_group_cache_tree_search(info, bytenr, 0); 607 608 return cache; 609 } 610 611 /* 612 * return the block group that contains the given bytenr 613 */ 614 struct btrfs_block_group_cache *btrfs_lookup_block_group( 615 struct btrfs_fs_info *info, 616 u64 bytenr) 617 { 618 struct btrfs_block_group_cache *cache; 619 620 cache = block_group_cache_tree_search(info, bytenr, 1); 621 622 return cache; 623 } 624 625 static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info, 626 u64 flags) 627 { 628 struct list_head *head = &info->space_info; 629 struct btrfs_space_info *found; 630 631 flags &= BTRFS_BLOCK_GROUP_TYPE_MASK; 632 633 rcu_read_lock(); 634 list_for_each_entry_rcu(found, head, list) { 635 if (found->flags & flags) { 636 rcu_read_unlock(); 637 return found; 638 } 639 } 640 rcu_read_unlock(); 641 return NULL; 642 } 643 644 /* 645 * after adding space to the filesystem, we need to clear the full flags 646 * on all the space infos. 
647 */ 648 void btrfs_clear_space_info_full(struct btrfs_fs_info *info) 649 { 650 struct list_head *head = &info->space_info; 651 struct btrfs_space_info *found; 652 653 rcu_read_lock(); 654 list_for_each_entry_rcu(found, head, list) 655 found->full = 0; 656 rcu_read_unlock(); 657 } 658 659 u64 btrfs_find_block_group(struct btrfs_root *root, 660 u64 search_start, u64 search_hint, int owner) 661 { 662 struct btrfs_block_group_cache *cache; 663 u64 used; 664 u64 last = max(search_hint, search_start); 665 u64 group_start = 0; 666 int full_search = 0; 667 int factor = 9; 668 int wrapped = 0; 669 again: 670 while (1) { 671 cache = btrfs_lookup_first_block_group(root->fs_info, last); 672 if (!cache) 673 break; 674 675 spin_lock(&cache->lock); 676 last = cache->key.objectid + cache->key.offset; 677 used = btrfs_block_group_used(&cache->item); 678 679 if ((full_search || !cache->ro) && 680 block_group_bits(cache, BTRFS_BLOCK_GROUP_METADATA)) { 681 if (used + cache->pinned + cache->reserved < 682 div_factor(cache->key.offset, factor)) { 683 group_start = cache->key.objectid; 684 spin_unlock(&cache->lock); 685 btrfs_put_block_group(cache); 686 goto found; 687 } 688 } 689 spin_unlock(&cache->lock); 690 btrfs_put_block_group(cache); 691 cond_resched(); 692 } 693 if (!wrapped) { 694 last = search_start; 695 wrapped = 1; 696 goto again; 697 } 698 if (!full_search && factor < 10) { 699 last = search_start; 700 full_search = 1; 701 factor = 10; 702 goto again; 703 } 704 found: 705 return group_start; 706 } 707 708 /* simple helper to search for an existing extent at a given offset */ 709 int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len) 710 { 711 int ret; 712 struct btrfs_key key; 713 struct btrfs_path *path; 714 715 path = btrfs_alloc_path(); 716 if (!path) 717 return -ENOMEM; 718 719 key.objectid = start; 720 key.offset = len; 721 btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY); 722 ret = btrfs_search_slot(NULL, root->fs_info->extent_root, &key, path, 723 0, 0); 724 btrfs_free_path(path); 725 return ret; 726 } 727 728 /* 729 * helper function to lookup reference count and flags of extent. 730 * 731 * the head node for delayed ref is used to store the sum of all the 732 * reference count modifications queued up in the rbtree. the head 733 * node may also store the extent flags to set. This way you can check 734 * to see what the reference count and extent flags would be if all of 735 * the delayed refs are not processed. 
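 *
 * For example (illustrative numbers only): if the extent item on disk
 * records 3 references and the delayed ref head for that bytenr carries a
 * ref_mod of -1, this helper reports refs == 2; any flags queued in the
 * head's pending extent op are OR'd into the returned flags as well.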
736 */ 737 int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, 738 struct btrfs_root *root, u64 bytenr, 739 u64 num_bytes, u64 *refs, u64 *flags) 740 { 741 struct btrfs_delayed_ref_head *head; 742 struct btrfs_delayed_ref_root *delayed_refs; 743 struct btrfs_path *path; 744 struct btrfs_extent_item *ei; 745 struct extent_buffer *leaf; 746 struct btrfs_key key; 747 u32 item_size; 748 u64 num_refs; 749 u64 extent_flags; 750 int ret; 751 752 path = btrfs_alloc_path(); 753 if (!path) 754 return -ENOMEM; 755 756 key.objectid = bytenr; 757 key.type = BTRFS_EXTENT_ITEM_KEY; 758 key.offset = num_bytes; 759 if (!trans) { 760 path->skip_locking = 1; 761 path->search_commit_root = 1; 762 } 763 again: 764 ret = btrfs_search_slot(trans, root->fs_info->extent_root, 765 &key, path, 0, 0); 766 if (ret < 0) 767 goto out_free; 768 769 if (ret == 0) { 770 leaf = path->nodes[0]; 771 item_size = btrfs_item_size_nr(leaf, path->slots[0]); 772 if (item_size >= sizeof(*ei)) { 773 ei = btrfs_item_ptr(leaf, path->slots[0], 774 struct btrfs_extent_item); 775 num_refs = btrfs_extent_refs(leaf, ei); 776 extent_flags = btrfs_extent_flags(leaf, ei); 777 } else { 778 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 779 struct btrfs_extent_item_v0 *ei0; 780 BUG_ON(item_size != sizeof(*ei0)); 781 ei0 = btrfs_item_ptr(leaf, path->slots[0], 782 struct btrfs_extent_item_v0); 783 num_refs = btrfs_extent_refs_v0(leaf, ei0); 784 /* FIXME: this isn't correct for data */ 785 extent_flags = BTRFS_BLOCK_FLAG_FULL_BACKREF; 786 #else 787 BUG(); 788 #endif 789 } 790 BUG_ON(num_refs == 0); 791 } else { 792 num_refs = 0; 793 extent_flags = 0; 794 ret = 0; 795 } 796 797 if (!trans) 798 goto out; 799 800 delayed_refs = &trans->transaction->delayed_refs; 801 spin_lock(&delayed_refs->lock); 802 head = btrfs_find_delayed_ref_head(trans, bytenr); 803 if (head) { 804 if (!mutex_trylock(&head->mutex)) { 805 atomic_inc(&head->node.refs); 806 spin_unlock(&delayed_refs->lock); 807 808 btrfs_release_path(path); 809 810 /* 811 * Mutex was contended, block until it's released and try 812 * again 813 */ 814 mutex_lock(&head->mutex); 815 mutex_unlock(&head->mutex); 816 btrfs_put_delayed_ref(&head->node); 817 goto again; 818 } 819 if (head->extent_op && head->extent_op->update_flags) 820 extent_flags |= head->extent_op->flags_to_set; 821 else 822 BUG_ON(num_refs == 0); 823 824 num_refs += head->node.ref_mod; 825 mutex_unlock(&head->mutex); 826 } 827 spin_unlock(&delayed_refs->lock); 828 out: 829 WARN_ON(num_refs == 0); 830 if (refs) 831 *refs = num_refs; 832 if (flags) 833 *flags = extent_flags; 834 out_free: 835 btrfs_free_path(path); 836 return ret; 837 } 838 839 /* 840 * Back reference rules. Back refs have three main goals: 841 * 842 * 1) differentiate between all holders of references to an extent so that 843 * when a reference is dropped we can make sure it was a valid reference 844 * before freeing the extent. 845 * 846 * 2) Provide enough information to quickly find the holders of an extent 847 * if we notice a given block is corrupted or bad. 848 * 849 * 3) Make it easy to migrate blocks for FS shrinking or storage pool 850 * maintenance. This is actually the same as #2, but with a slightly 851 * different use case. 852 * 853 * There are two kinds of back refs. The implicit back refs is optimized 854 * for pointers in non-shared tree blocks. For a given pointer in a block, 855 * back refs of this kind provide information about the block's owner tree 856 * and the pointer's key. These information allow us to find the block by 857 * b-tree searching. 
 * Full back refs, by contrast, are used for pointers in tree blocks that
 * are not referenced by their owner trees.  The location of the tree block
 * is recorded in the back ref.  The full back ref scheme is actually
 * generic and could be used in all the cases where implicit back refs are
 * used.  Its major shortcoming is overhead: every time a tree block gets
 * COWed, we have to update the back ref entries for all pointers in it.
 *
 * For a newly allocated tree block, we use implicit back refs for the
 * pointers in it.  This means most tree related operations only involve
 * implicit back refs.  For a tree block created in an old transaction, the
 * only way to drop a reference to it is to COW it.  So we can detect the
 * event that a tree block loses its owner tree's reference and do the
 * back refs conversion.
 *
 * When a tree block is COWed through a tree, there are four cases:
 *
 * The reference count of the block is one and the tree is the block's
 * owner tree.  Nothing to do in this case.
 *
 * The reference count of the block is one and the tree is not the
 * block's owner tree.  In this case, full back refs are used for the
 * pointers in the block.  Remove these full back refs and add implicit
 * back refs for every pointer in the new block.
 *
 * The reference count of the block is greater than one and the tree is
 * the block's owner tree.  In this case, implicit back refs are used for
 * the pointers in the block.  Add full back refs for every pointer in the
 * block and increase the lower level extents' reference counts.  The
 * original implicit back refs are carried over to the new block.
 *
 * The reference count of the block is greater than one and the tree is
 * not the block's owner tree.  Add implicit back refs for every pointer in
 * the new block and increase the lower level extents' reference counts.
 *
 * Back Reference Key composing:
 *
 * The key objectid corresponds to the first byte in the extent.
 * The key type is used to differentiate between types of back refs.
 * There are different meanings of the key offset for different types
 * of back refs.
 *
 * File extents can be referenced by:
 *
 * - multiple snapshots, subvolumes, or different generations in one subvol
 * - different files inside a single subvolume
 * - different offsets inside a file (bookend extents in file.c)
 *
 * The extent ref structure for the implicit back refs has fields for:
 *
 * - objectid of the subvolume root
 * - objectid of the file holding the reference
 * - original offset in the file
 * - how many bookend extents
 *
 * The key offset for the implicit back refs is the hash of the first
 * three fields.
 *
 * The extent ref structure for the full back refs has a field for:
 *
 * - number of pointers in the tree leaf
 *
 * The key offset for the full back refs is the first byte of
 * the tree leaf.
 *
 * When a file extent is allocated, the implicit back refs are used and
 * the fields are filled in as:
 *
 *     (root_key.objectid, inode objectid, offset in file, 1)
 *
 * When a file extent is removed by file truncation, we find the
 * corresponding implicit back refs and check the following fields:
 *
 *     (btrfs_header_owner(leaf), inode objectid, offset in file)
 *
 * Btree extents can be referenced by:
 *
 * - different subvolumes
 *
 * Both the implicit back refs and the full back refs for tree blocks
 * only consist of a key.
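 *
 * To make the file extent case above concrete (illustrative, hypothetical
 * numbers): a data extent starting at bytenr 12582912, referenced from
 * inode 257 at file offset 0 in subvolume 5, gets an implicit back ref
 * keyed as
 *
 *     (12582912, BTRFS_EXTENT_DATA_REF_KEY, hash_extent_data_ref(5, 257, 0))
 *
 * while a reference recorded against the parent tree leaf at bytenr P
 * (a full back ref) is keyed as (12582912, BTRFS_SHARED_DATA_REF_KEY, P).
 * This matches how lookup_extent_data_ref() below builds its search key.
 *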
The key offset for the implicit back refs is 937 * objectid of block's owner tree. The key offset for the full back refs 938 * is the first byte of parent block. 939 * 940 * When implicit back refs is used, information about the lowest key and 941 * level of the tree block are required. These information are stored in 942 * tree block info structure. 943 */ 944 945 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 946 static int convert_extent_item_v0(struct btrfs_trans_handle *trans, 947 struct btrfs_root *root, 948 struct btrfs_path *path, 949 u64 owner, u32 extra_size) 950 { 951 struct btrfs_extent_item *item; 952 struct btrfs_extent_item_v0 *ei0; 953 struct btrfs_extent_ref_v0 *ref0; 954 struct btrfs_tree_block_info *bi; 955 struct extent_buffer *leaf; 956 struct btrfs_key key; 957 struct btrfs_key found_key; 958 u32 new_size = sizeof(*item); 959 u64 refs; 960 int ret; 961 962 leaf = path->nodes[0]; 963 BUG_ON(btrfs_item_size_nr(leaf, path->slots[0]) != sizeof(*ei0)); 964 965 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 966 ei0 = btrfs_item_ptr(leaf, path->slots[0], 967 struct btrfs_extent_item_v0); 968 refs = btrfs_extent_refs_v0(leaf, ei0); 969 970 if (owner == (u64)-1) { 971 while (1) { 972 if (path->slots[0] >= btrfs_header_nritems(leaf)) { 973 ret = btrfs_next_leaf(root, path); 974 if (ret < 0) 975 return ret; 976 BUG_ON(ret > 0); /* Corruption */ 977 leaf = path->nodes[0]; 978 } 979 btrfs_item_key_to_cpu(leaf, &found_key, 980 path->slots[0]); 981 BUG_ON(key.objectid != found_key.objectid); 982 if (found_key.type != BTRFS_EXTENT_REF_V0_KEY) { 983 path->slots[0]++; 984 continue; 985 } 986 ref0 = btrfs_item_ptr(leaf, path->slots[0], 987 struct btrfs_extent_ref_v0); 988 owner = btrfs_ref_objectid_v0(leaf, ref0); 989 break; 990 } 991 } 992 btrfs_release_path(path); 993 994 if (owner < BTRFS_FIRST_FREE_OBJECTID) 995 new_size += sizeof(*bi); 996 997 new_size -= sizeof(*ei0); 998 ret = btrfs_search_slot(trans, root, &key, path, 999 new_size + extra_size, 1); 1000 if (ret < 0) 1001 return ret; 1002 BUG_ON(ret); /* Corruption */ 1003 1004 btrfs_extend_item(trans, root, path, new_size); 1005 1006 leaf = path->nodes[0]; 1007 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 1008 btrfs_set_extent_refs(leaf, item, refs); 1009 /* FIXME: get real generation */ 1010 btrfs_set_extent_generation(leaf, item, 0); 1011 if (owner < BTRFS_FIRST_FREE_OBJECTID) { 1012 btrfs_set_extent_flags(leaf, item, 1013 BTRFS_EXTENT_FLAG_TREE_BLOCK | 1014 BTRFS_BLOCK_FLAG_FULL_BACKREF); 1015 bi = (struct btrfs_tree_block_info *)(item + 1); 1016 /* FIXME: get first key of the block */ 1017 memset_extent_buffer(leaf, 0, (unsigned long)bi, sizeof(*bi)); 1018 btrfs_set_tree_block_level(leaf, bi, (int)owner); 1019 } else { 1020 btrfs_set_extent_flags(leaf, item, BTRFS_EXTENT_FLAG_DATA); 1021 } 1022 btrfs_mark_buffer_dirty(leaf); 1023 return 0; 1024 } 1025 #endif 1026 1027 static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset) 1028 { 1029 u32 high_crc = ~(u32)0; 1030 u32 low_crc = ~(u32)0; 1031 __le64 lenum; 1032 1033 lenum = cpu_to_le64(root_objectid); 1034 high_crc = crc32c(high_crc, &lenum, sizeof(lenum)); 1035 lenum = cpu_to_le64(owner); 1036 low_crc = crc32c(low_crc, &lenum, sizeof(lenum)); 1037 lenum = cpu_to_le64(offset); 1038 low_crc = crc32c(low_crc, &lenum, sizeof(lenum)); 1039 1040 return ((u64)high_crc << 31) ^ (u64)low_crc; 1041 } 1042 1043 static u64 hash_extent_data_ref_item(struct extent_buffer *leaf, 1044 struct btrfs_extent_data_ref *ref) 1045 { 1046 return 
hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref), 1047 btrfs_extent_data_ref_objectid(leaf, ref), 1048 btrfs_extent_data_ref_offset(leaf, ref)); 1049 } 1050 1051 static int match_extent_data_ref(struct extent_buffer *leaf, 1052 struct btrfs_extent_data_ref *ref, 1053 u64 root_objectid, u64 owner, u64 offset) 1054 { 1055 if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid || 1056 btrfs_extent_data_ref_objectid(leaf, ref) != owner || 1057 btrfs_extent_data_ref_offset(leaf, ref) != offset) 1058 return 0; 1059 return 1; 1060 } 1061 1062 static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans, 1063 struct btrfs_root *root, 1064 struct btrfs_path *path, 1065 u64 bytenr, u64 parent, 1066 u64 root_objectid, 1067 u64 owner, u64 offset) 1068 { 1069 struct btrfs_key key; 1070 struct btrfs_extent_data_ref *ref; 1071 struct extent_buffer *leaf; 1072 u32 nritems; 1073 int ret; 1074 int recow; 1075 int err = -ENOENT; 1076 1077 key.objectid = bytenr; 1078 if (parent) { 1079 key.type = BTRFS_SHARED_DATA_REF_KEY; 1080 key.offset = parent; 1081 } else { 1082 key.type = BTRFS_EXTENT_DATA_REF_KEY; 1083 key.offset = hash_extent_data_ref(root_objectid, 1084 owner, offset); 1085 } 1086 again: 1087 recow = 0; 1088 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 1089 if (ret < 0) { 1090 err = ret; 1091 goto fail; 1092 } 1093 1094 if (parent) { 1095 if (!ret) 1096 return 0; 1097 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 1098 key.type = BTRFS_EXTENT_REF_V0_KEY; 1099 btrfs_release_path(path); 1100 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 1101 if (ret < 0) { 1102 err = ret; 1103 goto fail; 1104 } 1105 if (!ret) 1106 return 0; 1107 #endif 1108 goto fail; 1109 } 1110 1111 leaf = path->nodes[0]; 1112 nritems = btrfs_header_nritems(leaf); 1113 while (1) { 1114 if (path->slots[0] >= nritems) { 1115 ret = btrfs_next_leaf(root, path); 1116 if (ret < 0) 1117 err = ret; 1118 if (ret) 1119 goto fail; 1120 1121 leaf = path->nodes[0]; 1122 nritems = btrfs_header_nritems(leaf); 1123 recow = 1; 1124 } 1125 1126 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 1127 if (key.objectid != bytenr || 1128 key.type != BTRFS_EXTENT_DATA_REF_KEY) 1129 goto fail; 1130 1131 ref = btrfs_item_ptr(leaf, path->slots[0], 1132 struct btrfs_extent_data_ref); 1133 1134 if (match_extent_data_ref(leaf, ref, root_objectid, 1135 owner, offset)) { 1136 if (recow) { 1137 btrfs_release_path(path); 1138 goto again; 1139 } 1140 err = 0; 1141 break; 1142 } 1143 path->slots[0]++; 1144 } 1145 fail: 1146 return err; 1147 } 1148 1149 static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans, 1150 struct btrfs_root *root, 1151 struct btrfs_path *path, 1152 u64 bytenr, u64 parent, 1153 u64 root_objectid, u64 owner, 1154 u64 offset, int refs_to_add) 1155 { 1156 struct btrfs_key key; 1157 struct extent_buffer *leaf; 1158 u32 size; 1159 u32 num_refs; 1160 int ret; 1161 1162 key.objectid = bytenr; 1163 if (parent) { 1164 key.type = BTRFS_SHARED_DATA_REF_KEY; 1165 key.offset = parent; 1166 size = sizeof(struct btrfs_shared_data_ref); 1167 } else { 1168 key.type = BTRFS_EXTENT_DATA_REF_KEY; 1169 key.offset = hash_extent_data_ref(root_objectid, 1170 owner, offset); 1171 size = sizeof(struct btrfs_extent_data_ref); 1172 } 1173 1174 ret = btrfs_insert_empty_item(trans, root, path, &key, size); 1175 if (ret && ret != -EEXIST) 1176 goto fail; 1177 1178 leaf = path->nodes[0]; 1179 if (parent) { 1180 struct btrfs_shared_data_ref *ref; 1181 ref = btrfs_item_ptr(leaf, path->slots[0], 1182 struct 
btrfs_shared_data_ref); 1183 if (ret == 0) { 1184 btrfs_set_shared_data_ref_count(leaf, ref, refs_to_add); 1185 } else { 1186 num_refs = btrfs_shared_data_ref_count(leaf, ref); 1187 num_refs += refs_to_add; 1188 btrfs_set_shared_data_ref_count(leaf, ref, num_refs); 1189 } 1190 } else { 1191 struct btrfs_extent_data_ref *ref; 1192 while (ret == -EEXIST) { 1193 ref = btrfs_item_ptr(leaf, path->slots[0], 1194 struct btrfs_extent_data_ref); 1195 if (match_extent_data_ref(leaf, ref, root_objectid, 1196 owner, offset)) 1197 break; 1198 btrfs_release_path(path); 1199 key.offset++; 1200 ret = btrfs_insert_empty_item(trans, root, path, &key, 1201 size); 1202 if (ret && ret != -EEXIST) 1203 goto fail; 1204 1205 leaf = path->nodes[0]; 1206 } 1207 ref = btrfs_item_ptr(leaf, path->slots[0], 1208 struct btrfs_extent_data_ref); 1209 if (ret == 0) { 1210 btrfs_set_extent_data_ref_root(leaf, ref, 1211 root_objectid); 1212 btrfs_set_extent_data_ref_objectid(leaf, ref, owner); 1213 btrfs_set_extent_data_ref_offset(leaf, ref, offset); 1214 btrfs_set_extent_data_ref_count(leaf, ref, refs_to_add); 1215 } else { 1216 num_refs = btrfs_extent_data_ref_count(leaf, ref); 1217 num_refs += refs_to_add; 1218 btrfs_set_extent_data_ref_count(leaf, ref, num_refs); 1219 } 1220 } 1221 btrfs_mark_buffer_dirty(leaf); 1222 ret = 0; 1223 fail: 1224 btrfs_release_path(path); 1225 return ret; 1226 } 1227 1228 static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans, 1229 struct btrfs_root *root, 1230 struct btrfs_path *path, 1231 int refs_to_drop) 1232 { 1233 struct btrfs_key key; 1234 struct btrfs_extent_data_ref *ref1 = NULL; 1235 struct btrfs_shared_data_ref *ref2 = NULL; 1236 struct extent_buffer *leaf; 1237 u32 num_refs = 0; 1238 int ret = 0; 1239 1240 leaf = path->nodes[0]; 1241 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 1242 1243 if (key.type == BTRFS_EXTENT_DATA_REF_KEY) { 1244 ref1 = btrfs_item_ptr(leaf, path->slots[0], 1245 struct btrfs_extent_data_ref); 1246 num_refs = btrfs_extent_data_ref_count(leaf, ref1); 1247 } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) { 1248 ref2 = btrfs_item_ptr(leaf, path->slots[0], 1249 struct btrfs_shared_data_ref); 1250 num_refs = btrfs_shared_data_ref_count(leaf, ref2); 1251 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 1252 } else if (key.type == BTRFS_EXTENT_REF_V0_KEY) { 1253 struct btrfs_extent_ref_v0 *ref0; 1254 ref0 = btrfs_item_ptr(leaf, path->slots[0], 1255 struct btrfs_extent_ref_v0); 1256 num_refs = btrfs_ref_count_v0(leaf, ref0); 1257 #endif 1258 } else { 1259 BUG(); 1260 } 1261 1262 BUG_ON(num_refs < refs_to_drop); 1263 num_refs -= refs_to_drop; 1264 1265 if (num_refs == 0) { 1266 ret = btrfs_del_item(trans, root, path); 1267 } else { 1268 if (key.type == BTRFS_EXTENT_DATA_REF_KEY) 1269 btrfs_set_extent_data_ref_count(leaf, ref1, num_refs); 1270 else if (key.type == BTRFS_SHARED_DATA_REF_KEY) 1271 btrfs_set_shared_data_ref_count(leaf, ref2, num_refs); 1272 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 1273 else { 1274 struct btrfs_extent_ref_v0 *ref0; 1275 ref0 = btrfs_item_ptr(leaf, path->slots[0], 1276 struct btrfs_extent_ref_v0); 1277 btrfs_set_ref_count_v0(leaf, ref0, num_refs); 1278 } 1279 #endif 1280 btrfs_mark_buffer_dirty(leaf); 1281 } 1282 return ret; 1283 } 1284 1285 static noinline u32 extent_data_ref_count(struct btrfs_root *root, 1286 struct btrfs_path *path, 1287 struct btrfs_extent_inline_ref *iref) 1288 { 1289 struct btrfs_key key; 1290 struct extent_buffer *leaf; 1291 struct btrfs_extent_data_ref *ref1; 1292 struct btrfs_shared_data_ref *ref2; 1293 
u32 num_refs = 0; 1294 1295 leaf = path->nodes[0]; 1296 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 1297 if (iref) { 1298 if (btrfs_extent_inline_ref_type(leaf, iref) == 1299 BTRFS_EXTENT_DATA_REF_KEY) { 1300 ref1 = (struct btrfs_extent_data_ref *)(&iref->offset); 1301 num_refs = btrfs_extent_data_ref_count(leaf, ref1); 1302 } else { 1303 ref2 = (struct btrfs_shared_data_ref *)(iref + 1); 1304 num_refs = btrfs_shared_data_ref_count(leaf, ref2); 1305 } 1306 } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) { 1307 ref1 = btrfs_item_ptr(leaf, path->slots[0], 1308 struct btrfs_extent_data_ref); 1309 num_refs = btrfs_extent_data_ref_count(leaf, ref1); 1310 } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) { 1311 ref2 = btrfs_item_ptr(leaf, path->slots[0], 1312 struct btrfs_shared_data_ref); 1313 num_refs = btrfs_shared_data_ref_count(leaf, ref2); 1314 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 1315 } else if (key.type == BTRFS_EXTENT_REF_V0_KEY) { 1316 struct btrfs_extent_ref_v0 *ref0; 1317 ref0 = btrfs_item_ptr(leaf, path->slots[0], 1318 struct btrfs_extent_ref_v0); 1319 num_refs = btrfs_ref_count_v0(leaf, ref0); 1320 #endif 1321 } else { 1322 WARN_ON(1); 1323 } 1324 return num_refs; 1325 } 1326 1327 static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans, 1328 struct btrfs_root *root, 1329 struct btrfs_path *path, 1330 u64 bytenr, u64 parent, 1331 u64 root_objectid) 1332 { 1333 struct btrfs_key key; 1334 int ret; 1335 1336 key.objectid = bytenr; 1337 if (parent) { 1338 key.type = BTRFS_SHARED_BLOCK_REF_KEY; 1339 key.offset = parent; 1340 } else { 1341 key.type = BTRFS_TREE_BLOCK_REF_KEY; 1342 key.offset = root_objectid; 1343 } 1344 1345 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 1346 if (ret > 0) 1347 ret = -ENOENT; 1348 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 1349 if (ret == -ENOENT && parent) { 1350 btrfs_release_path(path); 1351 key.type = BTRFS_EXTENT_REF_V0_KEY; 1352 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 1353 if (ret > 0) 1354 ret = -ENOENT; 1355 } 1356 #endif 1357 return ret; 1358 } 1359 1360 static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans, 1361 struct btrfs_root *root, 1362 struct btrfs_path *path, 1363 u64 bytenr, u64 parent, 1364 u64 root_objectid) 1365 { 1366 struct btrfs_key key; 1367 int ret; 1368 1369 key.objectid = bytenr; 1370 if (parent) { 1371 key.type = BTRFS_SHARED_BLOCK_REF_KEY; 1372 key.offset = parent; 1373 } else { 1374 key.type = BTRFS_TREE_BLOCK_REF_KEY; 1375 key.offset = root_objectid; 1376 } 1377 1378 ret = btrfs_insert_empty_item(trans, root, path, &key, 0); 1379 btrfs_release_path(path); 1380 return ret; 1381 } 1382 1383 static inline int extent_ref_type(u64 parent, u64 owner) 1384 { 1385 int type; 1386 if (owner < BTRFS_FIRST_FREE_OBJECTID) { 1387 if (parent > 0) 1388 type = BTRFS_SHARED_BLOCK_REF_KEY; 1389 else 1390 type = BTRFS_TREE_BLOCK_REF_KEY; 1391 } else { 1392 if (parent > 0) 1393 type = BTRFS_SHARED_DATA_REF_KEY; 1394 else 1395 type = BTRFS_EXTENT_DATA_REF_KEY; 1396 } 1397 return type; 1398 } 1399 1400 static int find_next_key(struct btrfs_path *path, int level, 1401 struct btrfs_key *key) 1402 1403 { 1404 for (; level < BTRFS_MAX_LEVEL; level++) { 1405 if (!path->nodes[level]) 1406 break; 1407 if (path->slots[level] + 1 >= 1408 btrfs_header_nritems(path->nodes[level])) 1409 continue; 1410 if (level == 0) 1411 btrfs_item_key_to_cpu(path->nodes[level], key, 1412 path->slots[level] + 1); 1413 else 1414 btrfs_node_key_to_cpu(path->nodes[level], key, 1415 path->slots[level] + 1); 1416 
return 0; 1417 } 1418 return 1; 1419 } 1420 1421 /* 1422 * look for inline back ref. if back ref is found, *ref_ret is set 1423 * to the address of inline back ref, and 0 is returned. 1424 * 1425 * if back ref isn't found, *ref_ret is set to the address where it 1426 * should be inserted, and -ENOENT is returned. 1427 * 1428 * if insert is true and there are too many inline back refs, the path 1429 * points to the extent item, and -EAGAIN is returned. 1430 * 1431 * NOTE: inline back refs are ordered in the same way that back ref 1432 * items in the tree are ordered. 1433 */ 1434 static noinline_for_stack 1435 int lookup_inline_extent_backref(struct btrfs_trans_handle *trans, 1436 struct btrfs_root *root, 1437 struct btrfs_path *path, 1438 struct btrfs_extent_inline_ref **ref_ret, 1439 u64 bytenr, u64 num_bytes, 1440 u64 parent, u64 root_objectid, 1441 u64 owner, u64 offset, int insert) 1442 { 1443 struct btrfs_key key; 1444 struct extent_buffer *leaf; 1445 struct btrfs_extent_item *ei; 1446 struct btrfs_extent_inline_ref *iref; 1447 u64 flags; 1448 u64 item_size; 1449 unsigned long ptr; 1450 unsigned long end; 1451 int extra_size; 1452 int type; 1453 int want; 1454 int ret; 1455 int err = 0; 1456 1457 key.objectid = bytenr; 1458 key.type = BTRFS_EXTENT_ITEM_KEY; 1459 key.offset = num_bytes; 1460 1461 want = extent_ref_type(parent, owner); 1462 if (insert) { 1463 extra_size = btrfs_extent_inline_ref_size(want); 1464 path->keep_locks = 1; 1465 } else 1466 extra_size = -1; 1467 ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1); 1468 if (ret < 0) { 1469 err = ret; 1470 goto out; 1471 } 1472 if (ret && !insert) { 1473 err = -ENOENT; 1474 goto out; 1475 } else if (ret) { 1476 err = -EIO; 1477 WARN_ON(1); 1478 goto out; 1479 } 1480 1481 leaf = path->nodes[0]; 1482 item_size = btrfs_item_size_nr(leaf, path->slots[0]); 1483 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 1484 if (item_size < sizeof(*ei)) { 1485 if (!insert) { 1486 err = -ENOENT; 1487 goto out; 1488 } 1489 ret = convert_extent_item_v0(trans, root, path, owner, 1490 extra_size); 1491 if (ret < 0) { 1492 err = ret; 1493 goto out; 1494 } 1495 leaf = path->nodes[0]; 1496 item_size = btrfs_item_size_nr(leaf, path->slots[0]); 1497 } 1498 #endif 1499 BUG_ON(item_size < sizeof(*ei)); 1500 1501 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 1502 flags = btrfs_extent_flags(leaf, ei); 1503 1504 ptr = (unsigned long)(ei + 1); 1505 end = (unsigned long)ei + item_size; 1506 1507 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { 1508 ptr += sizeof(struct btrfs_tree_block_info); 1509 BUG_ON(ptr > end); 1510 } else { 1511 BUG_ON(!(flags & BTRFS_EXTENT_FLAG_DATA)); 1512 } 1513 1514 err = -ENOENT; 1515 while (1) { 1516 if (ptr >= end) { 1517 WARN_ON(ptr > end); 1518 break; 1519 } 1520 iref = (struct btrfs_extent_inline_ref *)ptr; 1521 type = btrfs_extent_inline_ref_type(leaf, iref); 1522 if (want < type) 1523 break; 1524 if (want > type) { 1525 ptr += btrfs_extent_inline_ref_size(type); 1526 continue; 1527 } 1528 1529 if (type == BTRFS_EXTENT_DATA_REF_KEY) { 1530 struct btrfs_extent_data_ref *dref; 1531 dref = (struct btrfs_extent_data_ref *)(&iref->offset); 1532 if (match_extent_data_ref(leaf, dref, root_objectid, 1533 owner, offset)) { 1534 err = 0; 1535 break; 1536 } 1537 if (hash_extent_data_ref_item(leaf, dref) < 1538 hash_extent_data_ref(root_objectid, owner, offset)) 1539 break; 1540 } else { 1541 u64 ref_offset; 1542 ref_offset = btrfs_extent_inline_ref_offset(leaf, iref); 1543 if (parent > 0) { 1544 if (parent == ref_offset) 
{ 1545 err = 0; 1546 break; 1547 } 1548 if (ref_offset < parent) 1549 break; 1550 } else { 1551 if (root_objectid == ref_offset) { 1552 err = 0; 1553 break; 1554 } 1555 if (ref_offset < root_objectid) 1556 break; 1557 } 1558 } 1559 ptr += btrfs_extent_inline_ref_size(type); 1560 } 1561 if (err == -ENOENT && insert) { 1562 if (item_size + extra_size >= 1563 BTRFS_MAX_EXTENT_ITEM_SIZE(root)) { 1564 err = -EAGAIN; 1565 goto out; 1566 } 1567 /* 1568 * To add new inline back ref, we have to make sure 1569 * there is no corresponding back ref item. 1570 * For simplicity, we just do not add new inline back 1571 * ref if there is any kind of item for this block 1572 */ 1573 if (find_next_key(path, 0, &key) == 0 && 1574 key.objectid == bytenr && 1575 key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) { 1576 err = -EAGAIN; 1577 goto out; 1578 } 1579 } 1580 *ref_ret = (struct btrfs_extent_inline_ref *)ptr; 1581 out: 1582 if (insert) { 1583 path->keep_locks = 0; 1584 btrfs_unlock_up_safe(path, 1); 1585 } 1586 return err; 1587 } 1588 1589 /* 1590 * helper to add new inline back ref 1591 */ 1592 static noinline_for_stack 1593 void setup_inline_extent_backref(struct btrfs_trans_handle *trans, 1594 struct btrfs_root *root, 1595 struct btrfs_path *path, 1596 struct btrfs_extent_inline_ref *iref, 1597 u64 parent, u64 root_objectid, 1598 u64 owner, u64 offset, int refs_to_add, 1599 struct btrfs_delayed_extent_op *extent_op) 1600 { 1601 struct extent_buffer *leaf; 1602 struct btrfs_extent_item *ei; 1603 unsigned long ptr; 1604 unsigned long end; 1605 unsigned long item_offset; 1606 u64 refs; 1607 int size; 1608 int type; 1609 1610 leaf = path->nodes[0]; 1611 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 1612 item_offset = (unsigned long)iref - (unsigned long)ei; 1613 1614 type = extent_ref_type(parent, owner); 1615 size = btrfs_extent_inline_ref_size(type); 1616 1617 btrfs_extend_item(trans, root, path, size); 1618 1619 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 1620 refs = btrfs_extent_refs(leaf, ei); 1621 refs += refs_to_add; 1622 btrfs_set_extent_refs(leaf, ei, refs); 1623 if (extent_op) 1624 __run_delayed_extent_op(extent_op, leaf, ei); 1625 1626 ptr = (unsigned long)ei + item_offset; 1627 end = (unsigned long)ei + btrfs_item_size_nr(leaf, path->slots[0]); 1628 if (ptr < end - size) 1629 memmove_extent_buffer(leaf, ptr + size, ptr, 1630 end - size - ptr); 1631 1632 iref = (struct btrfs_extent_inline_ref *)ptr; 1633 btrfs_set_extent_inline_ref_type(leaf, iref, type); 1634 if (type == BTRFS_EXTENT_DATA_REF_KEY) { 1635 struct btrfs_extent_data_ref *dref; 1636 dref = (struct btrfs_extent_data_ref *)(&iref->offset); 1637 btrfs_set_extent_data_ref_root(leaf, dref, root_objectid); 1638 btrfs_set_extent_data_ref_objectid(leaf, dref, owner); 1639 btrfs_set_extent_data_ref_offset(leaf, dref, offset); 1640 btrfs_set_extent_data_ref_count(leaf, dref, refs_to_add); 1641 } else if (type == BTRFS_SHARED_DATA_REF_KEY) { 1642 struct btrfs_shared_data_ref *sref; 1643 sref = (struct btrfs_shared_data_ref *)(iref + 1); 1644 btrfs_set_shared_data_ref_count(leaf, sref, refs_to_add); 1645 btrfs_set_extent_inline_ref_offset(leaf, iref, parent); 1646 } else if (type == BTRFS_SHARED_BLOCK_REF_KEY) { 1647 btrfs_set_extent_inline_ref_offset(leaf, iref, parent); 1648 } else { 1649 btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid); 1650 } 1651 btrfs_mark_buffer_dirty(leaf); 1652 } 1653 1654 static int lookup_extent_backref(struct btrfs_trans_handle *trans, 1655 struct btrfs_root *root, 
1656 struct btrfs_path *path, 1657 struct btrfs_extent_inline_ref **ref_ret, 1658 u64 bytenr, u64 num_bytes, u64 parent, 1659 u64 root_objectid, u64 owner, u64 offset) 1660 { 1661 int ret; 1662 1663 ret = lookup_inline_extent_backref(trans, root, path, ref_ret, 1664 bytenr, num_bytes, parent, 1665 root_objectid, owner, offset, 0); 1666 if (ret != -ENOENT) 1667 return ret; 1668 1669 btrfs_release_path(path); 1670 *ref_ret = NULL; 1671 1672 if (owner < BTRFS_FIRST_FREE_OBJECTID) { 1673 ret = lookup_tree_block_ref(trans, root, path, bytenr, parent, 1674 root_objectid); 1675 } else { 1676 ret = lookup_extent_data_ref(trans, root, path, bytenr, parent, 1677 root_objectid, owner, offset); 1678 } 1679 return ret; 1680 } 1681 1682 /* 1683 * helper to update/remove inline back ref 1684 */ 1685 static noinline_for_stack 1686 void update_inline_extent_backref(struct btrfs_trans_handle *trans, 1687 struct btrfs_root *root, 1688 struct btrfs_path *path, 1689 struct btrfs_extent_inline_ref *iref, 1690 int refs_to_mod, 1691 struct btrfs_delayed_extent_op *extent_op) 1692 { 1693 struct extent_buffer *leaf; 1694 struct btrfs_extent_item *ei; 1695 struct btrfs_extent_data_ref *dref = NULL; 1696 struct btrfs_shared_data_ref *sref = NULL; 1697 unsigned long ptr; 1698 unsigned long end; 1699 u32 item_size; 1700 int size; 1701 int type; 1702 u64 refs; 1703 1704 leaf = path->nodes[0]; 1705 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 1706 refs = btrfs_extent_refs(leaf, ei); 1707 WARN_ON(refs_to_mod < 0 && refs + refs_to_mod <= 0); 1708 refs += refs_to_mod; 1709 btrfs_set_extent_refs(leaf, ei, refs); 1710 if (extent_op) 1711 __run_delayed_extent_op(extent_op, leaf, ei); 1712 1713 type = btrfs_extent_inline_ref_type(leaf, iref); 1714 1715 if (type == BTRFS_EXTENT_DATA_REF_KEY) { 1716 dref = (struct btrfs_extent_data_ref *)(&iref->offset); 1717 refs = btrfs_extent_data_ref_count(leaf, dref); 1718 } else if (type == BTRFS_SHARED_DATA_REF_KEY) { 1719 sref = (struct btrfs_shared_data_ref *)(iref + 1); 1720 refs = btrfs_shared_data_ref_count(leaf, sref); 1721 } else { 1722 refs = 1; 1723 BUG_ON(refs_to_mod != -1); 1724 } 1725 1726 BUG_ON(refs_to_mod < 0 && refs < -refs_to_mod); 1727 refs += refs_to_mod; 1728 1729 if (refs > 0) { 1730 if (type == BTRFS_EXTENT_DATA_REF_KEY) 1731 btrfs_set_extent_data_ref_count(leaf, dref, refs); 1732 else 1733 btrfs_set_shared_data_ref_count(leaf, sref, refs); 1734 } else { 1735 size = btrfs_extent_inline_ref_size(type); 1736 item_size = btrfs_item_size_nr(leaf, path->slots[0]); 1737 ptr = (unsigned long)iref; 1738 end = (unsigned long)ei + item_size; 1739 if (ptr + size < end) 1740 memmove_extent_buffer(leaf, ptr, ptr + size, 1741 end - ptr - size); 1742 item_size -= size; 1743 btrfs_truncate_item(trans, root, path, item_size, 1); 1744 } 1745 btrfs_mark_buffer_dirty(leaf); 1746 } 1747 1748 static noinline_for_stack 1749 int insert_inline_extent_backref(struct btrfs_trans_handle *trans, 1750 struct btrfs_root *root, 1751 struct btrfs_path *path, 1752 u64 bytenr, u64 num_bytes, u64 parent, 1753 u64 root_objectid, u64 owner, 1754 u64 offset, int refs_to_add, 1755 struct btrfs_delayed_extent_op *extent_op) 1756 { 1757 struct btrfs_extent_inline_ref *iref; 1758 int ret; 1759 1760 ret = lookup_inline_extent_backref(trans, root, path, &iref, 1761 bytenr, num_bytes, parent, 1762 root_objectid, owner, offset, 1); 1763 if (ret == 0) { 1764 BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID); 1765 update_inline_extent_backref(trans, root, path, iref, 1766 refs_to_add, extent_op); 
1767 } else if (ret == -ENOENT) { 1768 setup_inline_extent_backref(trans, root, path, iref, parent, 1769 root_objectid, owner, offset, 1770 refs_to_add, extent_op); 1771 ret = 0; 1772 } 1773 return ret; 1774 } 1775 1776 static int insert_extent_backref(struct btrfs_trans_handle *trans, 1777 struct btrfs_root *root, 1778 struct btrfs_path *path, 1779 u64 bytenr, u64 parent, u64 root_objectid, 1780 u64 owner, u64 offset, int refs_to_add) 1781 { 1782 int ret; 1783 if (owner < BTRFS_FIRST_FREE_OBJECTID) { 1784 BUG_ON(refs_to_add != 1); 1785 ret = insert_tree_block_ref(trans, root, path, bytenr, 1786 parent, root_objectid); 1787 } else { 1788 ret = insert_extent_data_ref(trans, root, path, bytenr, 1789 parent, root_objectid, 1790 owner, offset, refs_to_add); 1791 } 1792 return ret; 1793 } 1794 1795 static int remove_extent_backref(struct btrfs_trans_handle *trans, 1796 struct btrfs_root *root, 1797 struct btrfs_path *path, 1798 struct btrfs_extent_inline_ref *iref, 1799 int refs_to_drop, int is_data) 1800 { 1801 int ret = 0; 1802 1803 BUG_ON(!is_data && refs_to_drop != 1); 1804 if (iref) { 1805 update_inline_extent_backref(trans, root, path, iref, 1806 -refs_to_drop, NULL); 1807 } else if (is_data) { 1808 ret = remove_extent_data_ref(trans, root, path, refs_to_drop); 1809 } else { 1810 ret = btrfs_del_item(trans, root, path); 1811 } 1812 return ret; 1813 } 1814 1815 static int btrfs_issue_discard(struct block_device *bdev, 1816 u64 start, u64 len) 1817 { 1818 return blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_NOFS, 0); 1819 } 1820 1821 static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, 1822 u64 num_bytes, u64 *actual_bytes) 1823 { 1824 int ret; 1825 u64 discarded_bytes = 0; 1826 struct btrfs_bio *bbio = NULL; 1827 1828 1829 /* Tell the block device(s) that the sectors can be discarded */ 1830 ret = btrfs_map_block(root->fs_info, REQ_DISCARD, 1831 bytenr, &num_bytes, &bbio, 0); 1832 /* Error condition is -ENOMEM */ 1833 if (!ret) { 1834 struct btrfs_bio_stripe *stripe = bbio->stripes; 1835 int i; 1836 1837 1838 for (i = 0; i < bbio->num_stripes; i++, stripe++) { 1839 if (!stripe->dev->can_discard) 1840 continue; 1841 1842 ret = btrfs_issue_discard(stripe->dev->bdev, 1843 stripe->physical, 1844 stripe->length); 1845 if (!ret) 1846 discarded_bytes += stripe->length; 1847 else if (ret != -EOPNOTSUPP) 1848 break; /* Logic errors or -ENOMEM, or -EIO but I don't know how that could happen JDM */ 1849 1850 /* 1851 * Just in case we get back EOPNOTSUPP for some reason, 1852 * just ignore the return value so we don't screw up 1853 * people calling discard_extent. 
1854 */ 1855 ret = 0; 1856 } 1857 kfree(bbio); 1858 } 1859 1860 if (actual_bytes) 1861 *actual_bytes = discarded_bytes; 1862 1863 1864 if (ret == -EOPNOTSUPP) 1865 ret = 0; 1866 return ret; 1867 } 1868 1869 /* Can return -ENOMEM */ 1870 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, 1871 struct btrfs_root *root, 1872 u64 bytenr, u64 num_bytes, u64 parent, 1873 u64 root_objectid, u64 owner, u64 offset, int for_cow) 1874 { 1875 int ret; 1876 struct btrfs_fs_info *fs_info = root->fs_info; 1877 1878 BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID && 1879 root_objectid == BTRFS_TREE_LOG_OBJECTID); 1880 1881 if (owner < BTRFS_FIRST_FREE_OBJECTID) { 1882 ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr, 1883 num_bytes, 1884 parent, root_objectid, (int)owner, 1885 BTRFS_ADD_DELAYED_REF, NULL, for_cow); 1886 } else { 1887 ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr, 1888 num_bytes, 1889 parent, root_objectid, owner, offset, 1890 BTRFS_ADD_DELAYED_REF, NULL, for_cow); 1891 } 1892 return ret; 1893 } 1894 1895 static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, 1896 struct btrfs_root *root, 1897 u64 bytenr, u64 num_bytes, 1898 u64 parent, u64 root_objectid, 1899 u64 owner, u64 offset, int refs_to_add, 1900 struct btrfs_delayed_extent_op *extent_op) 1901 { 1902 struct btrfs_path *path; 1903 struct extent_buffer *leaf; 1904 struct btrfs_extent_item *item; 1905 u64 refs; 1906 int ret; 1907 int err = 0; 1908 1909 path = btrfs_alloc_path(); 1910 if (!path) 1911 return -ENOMEM; 1912 1913 path->reada = 1; 1914 path->leave_spinning = 1; 1915 /* this will setup the path even if it fails to insert the back ref */ 1916 ret = insert_inline_extent_backref(trans, root->fs_info->extent_root, 1917 path, bytenr, num_bytes, parent, 1918 root_objectid, owner, offset, 1919 refs_to_add, extent_op); 1920 if (ret == 0) 1921 goto out; 1922 1923 if (ret != -EAGAIN) { 1924 err = ret; 1925 goto out; 1926 } 1927 1928 leaf = path->nodes[0]; 1929 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 1930 refs = btrfs_extent_refs(leaf, item); 1931 btrfs_set_extent_refs(leaf, item, refs + refs_to_add); 1932 if (extent_op) 1933 __run_delayed_extent_op(extent_op, leaf, item); 1934 1935 btrfs_mark_buffer_dirty(leaf); 1936 btrfs_release_path(path); 1937 1938 path->reada = 1; 1939 path->leave_spinning = 1; 1940 1941 /* now insert the actual backref */ 1942 ret = insert_extent_backref(trans, root->fs_info->extent_root, 1943 path, bytenr, parent, root_objectid, 1944 owner, offset, refs_to_add); 1945 if (ret) 1946 btrfs_abort_transaction(trans, root, ret); 1947 out: 1948 btrfs_free_path(path); 1949 return err; 1950 } 1951 1952 static int run_delayed_data_ref(struct btrfs_trans_handle *trans, 1953 struct btrfs_root *root, 1954 struct btrfs_delayed_ref_node *node, 1955 struct btrfs_delayed_extent_op *extent_op, 1956 int insert_reserved) 1957 { 1958 int ret = 0; 1959 struct btrfs_delayed_data_ref *ref; 1960 struct btrfs_key ins; 1961 u64 parent = 0; 1962 u64 ref_root = 0; 1963 u64 flags = 0; 1964 1965 ins.objectid = node->bytenr; 1966 ins.offset = node->num_bytes; 1967 ins.type = BTRFS_EXTENT_ITEM_KEY; 1968 1969 ref = btrfs_delayed_node_to_data_ref(node); 1970 if (node->type == BTRFS_SHARED_DATA_REF_KEY) 1971 parent = ref->parent; 1972 else 1973 ref_root = ref->root; 1974 1975 if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) { 1976 if (extent_op) { 1977 BUG_ON(extent_op->update_key); 1978 flags |= extent_op->flags_to_set; 1979 } 1980 ret = alloc_reserved_file_extent(trans, 
root, 1981 parent, ref_root, flags, 1982 ref->objectid, ref->offset, 1983 &ins, node->ref_mod); 1984 } else if (node->action == BTRFS_ADD_DELAYED_REF) { 1985 ret = __btrfs_inc_extent_ref(trans, root, node->bytenr, 1986 node->num_bytes, parent, 1987 ref_root, ref->objectid, 1988 ref->offset, node->ref_mod, 1989 extent_op); 1990 } else if (node->action == BTRFS_DROP_DELAYED_REF) { 1991 ret = __btrfs_free_extent(trans, root, node->bytenr, 1992 node->num_bytes, parent, 1993 ref_root, ref->objectid, 1994 ref->offset, node->ref_mod, 1995 extent_op); 1996 } else { 1997 BUG(); 1998 } 1999 return ret; 2000 } 2001 2002 static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op, 2003 struct extent_buffer *leaf, 2004 struct btrfs_extent_item *ei) 2005 { 2006 u64 flags = btrfs_extent_flags(leaf, ei); 2007 if (extent_op->update_flags) { 2008 flags |= extent_op->flags_to_set; 2009 btrfs_set_extent_flags(leaf, ei, flags); 2010 } 2011 2012 if (extent_op->update_key) { 2013 struct btrfs_tree_block_info *bi; 2014 BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)); 2015 bi = (struct btrfs_tree_block_info *)(ei + 1); 2016 btrfs_set_tree_block_key(leaf, bi, &extent_op->key); 2017 } 2018 } 2019 2020 static int run_delayed_extent_op(struct btrfs_trans_handle *trans, 2021 struct btrfs_root *root, 2022 struct btrfs_delayed_ref_node *node, 2023 struct btrfs_delayed_extent_op *extent_op) 2024 { 2025 struct btrfs_key key; 2026 struct btrfs_path *path; 2027 struct btrfs_extent_item *ei; 2028 struct extent_buffer *leaf; 2029 u32 item_size; 2030 int ret; 2031 int err = 0; 2032 2033 if (trans->aborted) 2034 return 0; 2035 2036 path = btrfs_alloc_path(); 2037 if (!path) 2038 return -ENOMEM; 2039 2040 key.objectid = node->bytenr; 2041 key.type = BTRFS_EXTENT_ITEM_KEY; 2042 key.offset = node->num_bytes; 2043 2044 path->reada = 1; 2045 path->leave_spinning = 1; 2046 ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, 2047 path, 0, 1); 2048 if (ret < 0) { 2049 err = ret; 2050 goto out; 2051 } 2052 if (ret > 0) { 2053 err = -EIO; 2054 goto out; 2055 } 2056 2057 leaf = path->nodes[0]; 2058 item_size = btrfs_item_size_nr(leaf, path->slots[0]); 2059 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 2060 if (item_size < sizeof(*ei)) { 2061 ret = convert_extent_item_v0(trans, root->fs_info->extent_root, 2062 path, (u64)-1, 0); 2063 if (ret < 0) { 2064 err = ret; 2065 goto out; 2066 } 2067 leaf = path->nodes[0]; 2068 item_size = btrfs_item_size_nr(leaf, path->slots[0]); 2069 } 2070 #endif 2071 BUG_ON(item_size < sizeof(*ei)); 2072 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 2073 __run_delayed_extent_op(extent_op, leaf, ei); 2074 2075 btrfs_mark_buffer_dirty(leaf); 2076 out: 2077 btrfs_free_path(path); 2078 return err; 2079 } 2080 2081 static int run_delayed_tree_ref(struct btrfs_trans_handle *trans, 2082 struct btrfs_root *root, 2083 struct btrfs_delayed_ref_node *node, 2084 struct btrfs_delayed_extent_op *extent_op, 2085 int insert_reserved) 2086 { 2087 int ret = 0; 2088 struct btrfs_delayed_tree_ref *ref; 2089 struct btrfs_key ins; 2090 u64 parent = 0; 2091 u64 ref_root = 0; 2092 2093 ins.objectid = node->bytenr; 2094 ins.offset = node->num_bytes; 2095 ins.type = BTRFS_EXTENT_ITEM_KEY; 2096 2097 ref = btrfs_delayed_node_to_tree_ref(node); 2098 if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) 2099 parent = ref->parent; 2100 else 2101 ref_root = ref->root; 2102 2103 BUG_ON(node->ref_mod != 1); 2104 if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) { 2105 BUG_ON(!extent_op || 
!extent_op->update_flags || 2106 !extent_op->update_key); 2107 ret = alloc_reserved_tree_block(trans, root, 2108 parent, ref_root, 2109 extent_op->flags_to_set, 2110 &extent_op->key, 2111 ref->level, &ins); 2112 } else if (node->action == BTRFS_ADD_DELAYED_REF) { 2113 ret = __btrfs_inc_extent_ref(trans, root, node->bytenr, 2114 node->num_bytes, parent, ref_root, 2115 ref->level, 0, 1, extent_op); 2116 } else if (node->action == BTRFS_DROP_DELAYED_REF) { 2117 ret = __btrfs_free_extent(trans, root, node->bytenr, 2118 node->num_bytes, parent, ref_root, 2119 ref->level, 0, 1, extent_op); 2120 } else { 2121 BUG(); 2122 } 2123 return ret; 2124 } 2125 2126 /* helper function to actually process a single delayed ref entry */ 2127 static int run_one_delayed_ref(struct btrfs_trans_handle *trans, 2128 struct btrfs_root *root, 2129 struct btrfs_delayed_ref_node *node, 2130 struct btrfs_delayed_extent_op *extent_op, 2131 int insert_reserved) 2132 { 2133 int ret = 0; 2134 2135 if (trans->aborted) 2136 return 0; 2137 2138 if (btrfs_delayed_ref_is_head(node)) { 2139 struct btrfs_delayed_ref_head *head; 2140 /* 2141 * we've hit the end of the chain and we were supposed 2142 * to insert this extent into the tree. But, it got 2143 * deleted before we ever needed to insert it, so all 2144 * we have to do is clean up the accounting 2145 */ 2146 BUG_ON(extent_op); 2147 head = btrfs_delayed_node_to_head(node); 2148 if (insert_reserved) { 2149 btrfs_pin_extent(root, node->bytenr, 2150 node->num_bytes, 1); 2151 if (head->is_data) { 2152 ret = btrfs_del_csums(trans, root, 2153 node->bytenr, 2154 node->num_bytes); 2155 } 2156 } 2157 return ret; 2158 } 2159 2160 if (node->type == BTRFS_TREE_BLOCK_REF_KEY || 2161 node->type == BTRFS_SHARED_BLOCK_REF_KEY) 2162 ret = run_delayed_tree_ref(trans, root, node, extent_op, 2163 insert_reserved); 2164 else if (node->type == BTRFS_EXTENT_DATA_REF_KEY || 2165 node->type == BTRFS_SHARED_DATA_REF_KEY) 2166 ret = run_delayed_data_ref(trans, root, node, extent_op, 2167 insert_reserved); 2168 else 2169 BUG(); 2170 return ret; 2171 } 2172 2173 static noinline struct btrfs_delayed_ref_node * 2174 select_delayed_ref(struct btrfs_delayed_ref_head *head) 2175 { 2176 struct rb_node *node; 2177 struct btrfs_delayed_ref_node *ref; 2178 int action = BTRFS_ADD_DELAYED_REF; 2179 again: 2180 /* 2181 * select delayed ref of type BTRFS_ADD_DELAYED_REF first. 2182 * this prevents ref count from going down to zero when 2183 * there still are pending delayed ref. 2184 */ 2185 node = rb_prev(&head->node.rb_node); 2186 while (1) { 2187 if (!node) 2188 break; 2189 ref = rb_entry(node, struct btrfs_delayed_ref_node, 2190 rb_node); 2191 if (ref->bytenr != head->node.bytenr) 2192 break; 2193 if (ref->action == action) 2194 return ref; 2195 node = rb_prev(node); 2196 } 2197 if (action == BTRFS_ADD_DELAYED_REF) { 2198 action = BTRFS_DROP_DELAYED_REF; 2199 goto again; 2200 } 2201 return NULL; 2202 } 2203 2204 /* 2205 * Returns 0 on success or if called with an already aborted transaction. 2206 * Returns -ENOMEM or -EIO on failure and will abort the transaction. 
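 *
 * On success the value returned is actually the number of delayed refs
 * pulled off the cluster and run (the "count" accumulated in the loop
 * below); btrfs_run_delayed_refs() adds it to delayed_refs->ref_seq and
 * subtracts it from the amount of work it still wants to do.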
2207 */ 2208 static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, 2209 struct btrfs_root *root, 2210 struct list_head *cluster) 2211 { 2212 struct btrfs_delayed_ref_root *delayed_refs; 2213 struct btrfs_delayed_ref_node *ref; 2214 struct btrfs_delayed_ref_head *locked_ref = NULL; 2215 struct btrfs_delayed_extent_op *extent_op; 2216 struct btrfs_fs_info *fs_info = root->fs_info; 2217 int ret; 2218 int count = 0; 2219 int must_insert_reserved = 0; 2220 2221 delayed_refs = &trans->transaction->delayed_refs; 2222 while (1) { 2223 if (!locked_ref) { 2224 /* pick a new head ref from the cluster list */ 2225 if (list_empty(cluster)) 2226 break; 2227 2228 locked_ref = list_entry(cluster->next, 2229 struct btrfs_delayed_ref_head, cluster); 2230 2231 /* grab the lock that says we are going to process 2232 * all the refs for this head */ 2233 ret = btrfs_delayed_ref_lock(trans, locked_ref); 2234 2235 /* 2236 * we may have dropped the spin lock to get the head 2237 * mutex lock, and that might have given someone else 2238 * time to free the head. If that's true, it has been 2239 * removed from our list and we can move on. 2240 */ 2241 if (ret == -EAGAIN) { 2242 locked_ref = NULL; 2243 count++; 2244 continue; 2245 } 2246 } 2247 2248 /* 2249 * We need to try and merge add/drops of the same ref since we 2250 * can run into issues with relocate dropping the implicit ref 2251 * and then it being added back again before the drop can 2252 * finish. If we merged anything we need to re-loop so we can 2253 * get a good ref. 2254 */ 2255 btrfs_merge_delayed_refs(trans, fs_info, delayed_refs, 2256 locked_ref); 2257 2258 /* 2259 * locked_ref is the head node, so we have to go one 2260 * node back for any delayed ref updates 2261 */ 2262 ref = select_delayed_ref(locked_ref); 2263 2264 if (ref && ref->seq && 2265 btrfs_check_delayed_seq(fs_info, delayed_refs, ref->seq)) { 2266 /* 2267 * there are still refs with lower seq numbers in the 2268 * process of being added. Don't run this ref yet. 2269 */ 2270 list_del_init(&locked_ref->cluster); 2271 btrfs_delayed_ref_unlock(locked_ref); 2272 locked_ref = NULL; 2273 delayed_refs->num_heads_ready++; 2274 spin_unlock(&delayed_refs->lock); 2275 cond_resched(); 2276 spin_lock(&delayed_refs->lock); 2277 continue; 2278 } 2279 2280 /* 2281 * record the must insert reserved flag before we 2282 * drop the spin lock. 
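 *
 * Both must_insert_reserved and extent_op are snapshotted and cleared
 * while delayed_refs->lock is still held, so once we drop the lock no
 * other task working on this head can see or re-run them; from here on
 * this thread owns extent_op and is responsible for freeing it with
 * btrfs_free_delayed_extent_op().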
2283 */ 2284 must_insert_reserved = locked_ref->must_insert_reserved; 2285 locked_ref->must_insert_reserved = 0; 2286 2287 extent_op = locked_ref->extent_op; 2288 locked_ref->extent_op = NULL; 2289 2290 if (!ref) { 2291 /* All delayed refs have been processed, Go ahead 2292 * and send the head node to run_one_delayed_ref, 2293 * so that any accounting fixes can happen 2294 */ 2295 ref = &locked_ref->node; 2296 2297 if (extent_op && must_insert_reserved) { 2298 btrfs_free_delayed_extent_op(extent_op); 2299 extent_op = NULL; 2300 } 2301 2302 if (extent_op) { 2303 spin_unlock(&delayed_refs->lock); 2304 2305 ret = run_delayed_extent_op(trans, root, 2306 ref, extent_op); 2307 btrfs_free_delayed_extent_op(extent_op); 2308 2309 if (ret) { 2310 printk(KERN_DEBUG 2311 "btrfs: run_delayed_extent_op " 2312 "returned %d\n", ret); 2313 spin_lock(&delayed_refs->lock); 2314 btrfs_delayed_ref_unlock(locked_ref); 2315 return ret; 2316 } 2317 2318 goto next; 2319 } 2320 } 2321 2322 ref->in_tree = 0; 2323 rb_erase(&ref->rb_node, &delayed_refs->root); 2324 delayed_refs->num_entries--; 2325 if (!btrfs_delayed_ref_is_head(ref)) { 2326 /* 2327 * when we play the delayed ref, also correct the 2328 * ref_mod on head 2329 */ 2330 switch (ref->action) { 2331 case BTRFS_ADD_DELAYED_REF: 2332 case BTRFS_ADD_DELAYED_EXTENT: 2333 locked_ref->node.ref_mod -= ref->ref_mod; 2334 break; 2335 case BTRFS_DROP_DELAYED_REF: 2336 locked_ref->node.ref_mod += ref->ref_mod; 2337 break; 2338 default: 2339 WARN_ON(1); 2340 } 2341 } 2342 spin_unlock(&delayed_refs->lock); 2343 2344 ret = run_one_delayed_ref(trans, root, ref, extent_op, 2345 must_insert_reserved); 2346 2347 btrfs_free_delayed_extent_op(extent_op); 2348 if (ret) { 2349 btrfs_delayed_ref_unlock(locked_ref); 2350 btrfs_put_delayed_ref(ref); 2351 printk(KERN_DEBUG 2352 "btrfs: run_one_delayed_ref returned %d\n", ret); 2353 spin_lock(&delayed_refs->lock); 2354 return ret; 2355 } 2356 2357 /* 2358 * If this node is a head, that means all the refs in this head 2359 * have been dealt with, and we will pick the next head to deal 2360 * with, so we must unlock the head and drop it from the cluster 2361 * list before we release it. 2362 */ 2363 if (btrfs_delayed_ref_is_head(ref)) { 2364 list_del_init(&locked_ref->cluster); 2365 btrfs_delayed_ref_unlock(locked_ref); 2366 locked_ref = NULL; 2367 } 2368 btrfs_put_delayed_ref(ref); 2369 count++; 2370 next: 2371 cond_resched(); 2372 spin_lock(&delayed_refs->lock); 2373 } 2374 return count; 2375 } 2376 2377 #ifdef SCRAMBLE_DELAYED_REFS 2378 /* 2379 * Normally delayed refs get processed in ascending bytenr order. This 2380 * correlates in most cases to the order added. 
To expose dependencies on this 2381 * order, we start to process the tree in the middle instead of the beginning 2382 */ 2383 static u64 find_middle(struct rb_root *root) 2384 { 2385 struct rb_node *n = root->rb_node; 2386 struct btrfs_delayed_ref_node *entry; 2387 int alt = 1; 2388 u64 middle; 2389 u64 first = 0, last = 0; 2390 2391 n = rb_first(root); 2392 if (n) { 2393 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node); 2394 first = entry->bytenr; 2395 } 2396 n = rb_last(root); 2397 if (n) { 2398 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node); 2399 last = entry->bytenr; 2400 } 2401 n = root->rb_node; 2402 2403 while (n) { 2404 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node); 2405 WARN_ON(!entry->in_tree); 2406 2407 middle = entry->bytenr; 2408 2409 if (alt) 2410 n = n->rb_left; 2411 else 2412 n = n->rb_right; 2413 2414 alt = 1 - alt; 2415 } 2416 return middle; 2417 } 2418 #endif 2419 2420 int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans, 2421 struct btrfs_fs_info *fs_info) 2422 { 2423 struct qgroup_update *qgroup_update; 2424 int ret = 0; 2425 2426 if (list_empty(&trans->qgroup_ref_list) != 2427 !trans->delayed_ref_elem.seq) { 2428 /* list without seq or seq without list */ 2429 printk(KERN_ERR "btrfs: qgroup accounting update error, list is%s empty, seq is %llu\n", 2430 list_empty(&trans->qgroup_ref_list) ? "" : " not", 2431 trans->delayed_ref_elem.seq); 2432 BUG(); 2433 } 2434 2435 if (!trans->delayed_ref_elem.seq) 2436 return 0; 2437 2438 while (!list_empty(&trans->qgroup_ref_list)) { 2439 qgroup_update = list_first_entry(&trans->qgroup_ref_list, 2440 struct qgroup_update, list); 2441 list_del(&qgroup_update->list); 2442 if (!ret) 2443 ret = btrfs_qgroup_account_ref( 2444 trans, fs_info, qgroup_update->node, 2445 qgroup_update->extent_op); 2446 kfree(qgroup_update); 2447 } 2448 2449 btrfs_put_tree_mod_seq(fs_info, &trans->delayed_ref_elem); 2450 2451 return ret; 2452 } 2453 2454 static int refs_newer(struct btrfs_delayed_ref_root *delayed_refs, int seq, 2455 int count) 2456 { 2457 int val = atomic_read(&delayed_refs->ref_seq); 2458 2459 if (val < seq || val >= seq + count) 2460 return 1; 2461 return 0; 2462 } 2463 2464 /* 2465 * this starts processing the delayed reference count updates and 2466 * extent insertions we have queued up so far. count can be 2467 * 0, which means to process everything in the tree at the start 2468 * of the run (but not newly added entries), or it can be some target 2469 * number you'd like to process. 
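 *
 * Passing (unsigned long)-1 asks for a full drain, including heads that
 * are added while we are running; btrfs_write_dirty_block_groups() below,
 * for example, does:
 *
 *	ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
 *	if (ret)	/* file system offline */
 *		goto out;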
2470 * 2471 * Returns 0 on success or if called with an aborted transaction 2472 * Returns <0 on error and aborts the transaction 2473 */ 2474 int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, 2475 struct btrfs_root *root, unsigned long count) 2476 { 2477 struct rb_node *node; 2478 struct btrfs_delayed_ref_root *delayed_refs; 2479 struct btrfs_delayed_ref_node *ref; 2480 struct list_head cluster; 2481 int ret; 2482 u64 delayed_start; 2483 int run_all = count == (unsigned long)-1; 2484 int run_most = 0; 2485 int loops; 2486 2487 /* We'll clean this up in btrfs_cleanup_transaction */ 2488 if (trans->aborted) 2489 return 0; 2490 2491 if (root == root->fs_info->extent_root) 2492 root = root->fs_info->tree_root; 2493 2494 btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info); 2495 2496 delayed_refs = &trans->transaction->delayed_refs; 2497 INIT_LIST_HEAD(&cluster); 2498 if (count == 0) { 2499 count = delayed_refs->num_entries * 2; 2500 run_most = 1; 2501 } 2502 2503 if (!run_all && !run_most) { 2504 int old; 2505 int seq = atomic_read(&delayed_refs->ref_seq); 2506 2507 progress: 2508 old = atomic_cmpxchg(&delayed_refs->procs_running_refs, 0, 1); 2509 if (old) { 2510 DEFINE_WAIT(__wait); 2511 if (delayed_refs->num_entries < 16348) 2512 return 0; 2513 2514 prepare_to_wait(&delayed_refs->wait, &__wait, 2515 TASK_UNINTERRUPTIBLE); 2516 2517 old = atomic_cmpxchg(&delayed_refs->procs_running_refs, 0, 1); 2518 if (old) { 2519 schedule(); 2520 finish_wait(&delayed_refs->wait, &__wait); 2521 2522 if (!refs_newer(delayed_refs, seq, 256)) 2523 goto progress; 2524 else 2525 return 0; 2526 } else { 2527 finish_wait(&delayed_refs->wait, &__wait); 2528 goto again; 2529 } 2530 } 2531 2532 } else { 2533 atomic_inc(&delayed_refs->procs_running_refs); 2534 } 2535 2536 again: 2537 loops = 0; 2538 spin_lock(&delayed_refs->lock); 2539 2540 #ifdef SCRAMBLE_DELAYED_REFS 2541 delayed_refs->run_delayed_start = find_middle(&delayed_refs->root); 2542 #endif 2543 2544 while (1) { 2545 if (!(run_all || run_most) && 2546 delayed_refs->num_heads_ready < 64) 2547 break; 2548 2549 /* 2550 * go find something we can process in the rbtree. We start at 2551 * the beginning of the tree, and then build a cluster 2552 * of refs to process starting at the first one we are able to 2553 * lock 2554 */ 2555 delayed_start = delayed_refs->run_delayed_start; 2556 ret = btrfs_find_ref_cluster(trans, &cluster, 2557 delayed_refs->run_delayed_start); 2558 if (ret) 2559 break; 2560 2561 ret = run_clustered_refs(trans, root, &cluster); 2562 if (ret < 0) { 2563 btrfs_release_ref_cluster(&cluster); 2564 spin_unlock(&delayed_refs->lock); 2565 btrfs_abort_transaction(trans, root, ret); 2566 atomic_dec(&delayed_refs->procs_running_refs); 2567 return ret; 2568 } 2569 2570 atomic_add(ret, &delayed_refs->ref_seq); 2571 2572 count -= min_t(unsigned long, ret, count); 2573 2574 if (count == 0) 2575 break; 2576 2577 if (delayed_start >= delayed_refs->run_delayed_start) { 2578 if (loops == 0) { 2579 /* 2580 * btrfs_find_ref_cluster looped. let's do one 2581 * more cycle. if we don't run any delayed ref 2582 * during that cycle (because we can't because 2583 * all of them are blocked), bail out. 
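 *
 * ("looped" means run_delayed_start wrapped back around the tree past the
 * bytenr this pass started from; the "if (ret) loops = 0;" below resets
 * the detection whenever a pass actually managed to run some refs.)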
2584 */ 2585 loops = 1; 2586 } else { 2587 /* 2588 * no runnable refs left, stop trying 2589 */ 2590 BUG_ON(run_all); 2591 break; 2592 } 2593 } 2594 if (ret) { 2595 /* refs were run, let's reset staleness detection */ 2596 loops = 0; 2597 } 2598 } 2599 2600 if (run_all) { 2601 if (!list_empty(&trans->new_bgs)) { 2602 spin_unlock(&delayed_refs->lock); 2603 btrfs_create_pending_block_groups(trans, root); 2604 spin_lock(&delayed_refs->lock); 2605 } 2606 2607 node = rb_first(&delayed_refs->root); 2608 if (!node) 2609 goto out; 2610 count = (unsigned long)-1; 2611 2612 while (node) { 2613 ref = rb_entry(node, struct btrfs_delayed_ref_node, 2614 rb_node); 2615 if (btrfs_delayed_ref_is_head(ref)) { 2616 struct btrfs_delayed_ref_head *head; 2617 2618 head = btrfs_delayed_node_to_head(ref); 2619 atomic_inc(&ref->refs); 2620 2621 spin_unlock(&delayed_refs->lock); 2622 /* 2623 * Mutex was contended, block until it's 2624 * released and try again 2625 */ 2626 mutex_lock(&head->mutex); 2627 mutex_unlock(&head->mutex); 2628 2629 btrfs_put_delayed_ref(ref); 2630 cond_resched(); 2631 goto again; 2632 } 2633 node = rb_next(node); 2634 } 2635 spin_unlock(&delayed_refs->lock); 2636 schedule_timeout(1); 2637 goto again; 2638 } 2639 out: 2640 atomic_dec(&delayed_refs->procs_running_refs); 2641 smp_mb(); 2642 if (waitqueue_active(&delayed_refs->wait)) 2643 wake_up(&delayed_refs->wait); 2644 2645 spin_unlock(&delayed_refs->lock); 2646 assert_qgroups_uptodate(trans); 2647 return 0; 2648 } 2649 2650 int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, 2651 struct btrfs_root *root, 2652 u64 bytenr, u64 num_bytes, u64 flags, 2653 int is_data) 2654 { 2655 struct btrfs_delayed_extent_op *extent_op; 2656 int ret; 2657 2658 extent_op = btrfs_alloc_delayed_extent_op(); 2659 if (!extent_op) 2660 return -ENOMEM; 2661 2662 extent_op->flags_to_set = flags; 2663 extent_op->update_flags = 1; 2664 extent_op->update_key = 0; 2665 extent_op->is_data = is_data ? 
1 : 0; 2666 2667 ret = btrfs_add_delayed_extent_op(root->fs_info, trans, bytenr, 2668 num_bytes, extent_op); 2669 if (ret) 2670 btrfs_free_delayed_extent_op(extent_op); 2671 return ret; 2672 } 2673 2674 static noinline int check_delayed_ref(struct btrfs_trans_handle *trans, 2675 struct btrfs_root *root, 2676 struct btrfs_path *path, 2677 u64 objectid, u64 offset, u64 bytenr) 2678 { 2679 struct btrfs_delayed_ref_head *head; 2680 struct btrfs_delayed_ref_node *ref; 2681 struct btrfs_delayed_data_ref *data_ref; 2682 struct btrfs_delayed_ref_root *delayed_refs; 2683 struct rb_node *node; 2684 int ret = 0; 2685 2686 ret = -ENOENT; 2687 delayed_refs = &trans->transaction->delayed_refs; 2688 spin_lock(&delayed_refs->lock); 2689 head = btrfs_find_delayed_ref_head(trans, bytenr); 2690 if (!head) 2691 goto out; 2692 2693 if (!mutex_trylock(&head->mutex)) { 2694 atomic_inc(&head->node.refs); 2695 spin_unlock(&delayed_refs->lock); 2696 2697 btrfs_release_path(path); 2698 2699 /* 2700 * Mutex was contended, block until it's released and let 2701 * caller try again 2702 */ 2703 mutex_lock(&head->mutex); 2704 mutex_unlock(&head->mutex); 2705 btrfs_put_delayed_ref(&head->node); 2706 return -EAGAIN; 2707 } 2708 2709 node = rb_prev(&head->node.rb_node); 2710 if (!node) 2711 goto out_unlock; 2712 2713 ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); 2714 2715 if (ref->bytenr != bytenr) 2716 goto out_unlock; 2717 2718 ret = 1; 2719 if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) 2720 goto out_unlock; 2721 2722 data_ref = btrfs_delayed_node_to_data_ref(ref); 2723 2724 node = rb_prev(node); 2725 if (node) { 2726 int seq = ref->seq; 2727 2728 ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); 2729 if (ref->bytenr == bytenr && ref->seq == seq) 2730 goto out_unlock; 2731 } 2732 2733 if (data_ref->root != root->root_key.objectid || 2734 data_ref->objectid != objectid || data_ref->offset != offset) 2735 goto out_unlock; 2736 2737 ret = 0; 2738 out_unlock: 2739 mutex_unlock(&head->mutex); 2740 out: 2741 spin_unlock(&delayed_refs->lock); 2742 return ret; 2743 } 2744 2745 static noinline int check_committed_ref(struct btrfs_trans_handle *trans, 2746 struct btrfs_root *root, 2747 struct btrfs_path *path, 2748 u64 objectid, u64 offset, u64 bytenr) 2749 { 2750 struct btrfs_root *extent_root = root->fs_info->extent_root; 2751 struct extent_buffer *leaf; 2752 struct btrfs_extent_data_ref *ref; 2753 struct btrfs_extent_inline_ref *iref; 2754 struct btrfs_extent_item *ei; 2755 struct btrfs_key key; 2756 u32 item_size; 2757 int ret; 2758 2759 key.objectid = bytenr; 2760 key.offset = (u64)-1; 2761 key.type = BTRFS_EXTENT_ITEM_KEY; 2762 2763 ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); 2764 if (ret < 0) 2765 goto out; 2766 BUG_ON(ret == 0); /* Corruption */ 2767 2768 ret = -ENOENT; 2769 if (path->slots[0] == 0) 2770 goto out; 2771 2772 path->slots[0]--; 2773 leaf = path->nodes[0]; 2774 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 2775 2776 if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY) 2777 goto out; 2778 2779 ret = 1; 2780 item_size = btrfs_item_size_nr(leaf, path->slots[0]); 2781 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 2782 if (item_size < sizeof(*ei)) { 2783 WARN_ON(item_size != sizeof(struct btrfs_extent_item_v0)); 2784 goto out; 2785 } 2786 #endif 2787 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 2788 2789 if (item_size != sizeof(*ei) + 2790 btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY)) 2791 goto out; 2792 2793 if 
(btrfs_extent_generation(leaf, ei) <= 2794 btrfs_root_last_snapshot(&root->root_item)) 2795 goto out; 2796 2797 iref = (struct btrfs_extent_inline_ref *)(ei + 1); 2798 if (btrfs_extent_inline_ref_type(leaf, iref) != 2799 BTRFS_EXTENT_DATA_REF_KEY) 2800 goto out; 2801 2802 ref = (struct btrfs_extent_data_ref *)(&iref->offset); 2803 if (btrfs_extent_refs(leaf, ei) != 2804 btrfs_extent_data_ref_count(leaf, ref) || 2805 btrfs_extent_data_ref_root(leaf, ref) != 2806 root->root_key.objectid || 2807 btrfs_extent_data_ref_objectid(leaf, ref) != objectid || 2808 btrfs_extent_data_ref_offset(leaf, ref) != offset) 2809 goto out; 2810 2811 ret = 0; 2812 out: 2813 return ret; 2814 } 2815 2816 int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans, 2817 struct btrfs_root *root, 2818 u64 objectid, u64 offset, u64 bytenr) 2819 { 2820 struct btrfs_path *path; 2821 int ret; 2822 int ret2; 2823 2824 path = btrfs_alloc_path(); 2825 if (!path) 2826 return -ENOENT; 2827 2828 do { 2829 ret = check_committed_ref(trans, root, path, objectid, 2830 offset, bytenr); 2831 if (ret && ret != -ENOENT) 2832 goto out; 2833 2834 ret2 = check_delayed_ref(trans, root, path, objectid, 2835 offset, bytenr); 2836 } while (ret2 == -EAGAIN); 2837 2838 if (ret2 && ret2 != -ENOENT) { 2839 ret = ret2; 2840 goto out; 2841 } 2842 2843 if (ret != -ENOENT || ret2 != -ENOENT) 2844 ret = 0; 2845 out: 2846 btrfs_free_path(path); 2847 if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID) 2848 WARN_ON(ret > 0); 2849 return ret; 2850 } 2851 2852 static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, 2853 struct btrfs_root *root, 2854 struct extent_buffer *buf, 2855 int full_backref, int inc, int for_cow) 2856 { 2857 u64 bytenr; 2858 u64 num_bytes; 2859 u64 parent; 2860 u64 ref_root; 2861 u32 nritems; 2862 struct btrfs_key key; 2863 struct btrfs_file_extent_item *fi; 2864 int i; 2865 int level; 2866 int ret = 0; 2867 int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *, 2868 u64, u64, u64, u64, u64, u64, int); 2869 2870 ref_root = btrfs_header_owner(buf); 2871 nritems = btrfs_header_nritems(buf); 2872 level = btrfs_header_level(buf); 2873 2874 if (!root->ref_cows && level == 0) 2875 return 0; 2876 2877 if (inc) 2878 process_func = btrfs_inc_extent_ref; 2879 else 2880 process_func = btrfs_free_extent; 2881 2882 if (full_backref) 2883 parent = buf->start; 2884 else 2885 parent = 0; 2886 2887 for (i = 0; i < nritems; i++) { 2888 if (level == 0) { 2889 btrfs_item_key_to_cpu(buf, &key, i); 2890 if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY) 2891 continue; 2892 fi = btrfs_item_ptr(buf, i, 2893 struct btrfs_file_extent_item); 2894 if (btrfs_file_extent_type(buf, fi) == 2895 BTRFS_FILE_EXTENT_INLINE) 2896 continue; 2897 bytenr = btrfs_file_extent_disk_bytenr(buf, fi); 2898 if (bytenr == 0) 2899 continue; 2900 2901 num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi); 2902 key.offset -= btrfs_file_extent_offset(buf, fi); 2903 ret = process_func(trans, root, bytenr, num_bytes, 2904 parent, ref_root, key.objectid, 2905 key.offset, for_cow); 2906 if (ret) 2907 goto fail; 2908 } else { 2909 bytenr = btrfs_node_blockptr(buf, i); 2910 num_bytes = btrfs_level_size(root, level - 1); 2911 ret = process_func(trans, root, bytenr, num_bytes, 2912 parent, ref_root, level - 1, 0, 2913 for_cow); 2914 if (ret) 2915 goto fail; 2916 } 2917 } 2918 return 0; 2919 fail: 2920 return ret; 2921 } 2922 2923 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, 2924 struct extent_buffer *buf, int full_backref, int 
for_cow) 2925 { 2926 return __btrfs_mod_ref(trans, root, buf, full_backref, 1, for_cow); 2927 } 2928 2929 int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, 2930 struct extent_buffer *buf, int full_backref, int for_cow) 2931 { 2932 return __btrfs_mod_ref(trans, root, buf, full_backref, 0, for_cow); 2933 } 2934 2935 static int write_one_cache_group(struct btrfs_trans_handle *trans, 2936 struct btrfs_root *root, 2937 struct btrfs_path *path, 2938 struct btrfs_block_group_cache *cache) 2939 { 2940 int ret; 2941 struct btrfs_root *extent_root = root->fs_info->extent_root; 2942 unsigned long bi; 2943 struct extent_buffer *leaf; 2944 2945 ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1); 2946 if (ret < 0) 2947 goto fail; 2948 BUG_ON(ret); /* Corruption */ 2949 2950 leaf = path->nodes[0]; 2951 bi = btrfs_item_ptr_offset(leaf, path->slots[0]); 2952 write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item)); 2953 btrfs_mark_buffer_dirty(leaf); 2954 btrfs_release_path(path); 2955 fail: 2956 if (ret) { 2957 btrfs_abort_transaction(trans, root, ret); 2958 return ret; 2959 } 2960 return 0; 2961 2962 } 2963 2964 static struct btrfs_block_group_cache * 2965 next_block_group(struct btrfs_root *root, 2966 struct btrfs_block_group_cache *cache) 2967 { 2968 struct rb_node *node; 2969 spin_lock(&root->fs_info->block_group_cache_lock); 2970 node = rb_next(&cache->cache_node); 2971 btrfs_put_block_group(cache); 2972 if (node) { 2973 cache = rb_entry(node, struct btrfs_block_group_cache, 2974 cache_node); 2975 btrfs_get_block_group(cache); 2976 } else 2977 cache = NULL; 2978 spin_unlock(&root->fs_info->block_group_cache_lock); 2979 return cache; 2980 } 2981 2982 static int cache_save_setup(struct btrfs_block_group_cache *block_group, 2983 struct btrfs_trans_handle *trans, 2984 struct btrfs_path *path) 2985 { 2986 struct btrfs_root *root = block_group->fs_info->tree_root; 2987 struct inode *inode = NULL; 2988 u64 alloc_hint = 0; 2989 int dcs = BTRFS_DC_ERROR; 2990 int num_pages = 0; 2991 int retries = 0; 2992 int ret = 0; 2993 2994 /* 2995 * If this block group is smaller than 100 megs don't bother caching the 2996 * block group. 2997 */ 2998 if (block_group->key.offset < (100 * 1024 * 1024)) { 2999 spin_lock(&block_group->lock); 3000 block_group->disk_cache_state = BTRFS_DC_WRITTEN; 3001 spin_unlock(&block_group->lock); 3002 return 0; 3003 } 3004 3005 again: 3006 inode = lookup_free_space_inode(root, block_group, path); 3007 if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) { 3008 ret = PTR_ERR(inode); 3009 btrfs_release_path(path); 3010 goto out; 3011 } 3012 3013 if (IS_ERR(inode)) { 3014 BUG_ON(retries); 3015 retries++; 3016 3017 if (block_group->ro) 3018 goto out_free; 3019 3020 ret = create_free_space_inode(root, trans, block_group, path); 3021 if (ret) 3022 goto out_free; 3023 goto again; 3024 } 3025 3026 /* We've already setup this transaction, go ahead and exit */ 3027 if (block_group->cache_generation == trans->transid && 3028 i_size_read(inode)) { 3029 dcs = BTRFS_DC_SETUP; 3030 goto out_put; 3031 } 3032 3033 /* 3034 * We want to set the generation to 0, that way if anything goes wrong 3035 * from here on out we know not to trust this cache when we load up next 3036 * time. 
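 *
 * (block_group->cache_generation is only set back to trans->transid at
 * the bottom of this function, once everything has succeeded and dcs is
 * BTRFS_DC_SETUP, so a failed or interrupted setup is simply not trusted
 * on the next load.)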
3037 */ 3038 BTRFS_I(inode)->generation = 0; 3039 ret = btrfs_update_inode(trans, root, inode); 3040 WARN_ON(ret); 3041 3042 if (i_size_read(inode) > 0) { 3043 ret = btrfs_truncate_free_space_cache(root, trans, path, 3044 inode); 3045 if (ret) 3046 goto out_put; 3047 } 3048 3049 spin_lock(&block_group->lock); 3050 if (block_group->cached != BTRFS_CACHE_FINISHED || 3051 !btrfs_test_opt(root, SPACE_CACHE)) { 3052 /* 3053 * don't bother trying to write stuff out _if_ 3054 * a) we're not cached, 3055 * b) we're with nospace_cache mount option. 3056 */ 3057 dcs = BTRFS_DC_WRITTEN; 3058 spin_unlock(&block_group->lock); 3059 goto out_put; 3060 } 3061 spin_unlock(&block_group->lock); 3062 3063 /* 3064 * Try to preallocate enough space based on how big the block group is. 3065 * Keep in mind this has to include any pinned space which could end up 3066 * taking up quite a bit since it's not folded into the other space 3067 * cache. 3068 */ 3069 num_pages = (int)div64_u64(block_group->key.offset, 256 * 1024 * 1024); 3070 if (!num_pages) 3071 num_pages = 1; 3072 3073 num_pages *= 16; 3074 num_pages *= PAGE_CACHE_SIZE; 3075 3076 ret = btrfs_check_data_free_space(inode, num_pages); 3077 if (ret) 3078 goto out_put; 3079 3080 ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages, 3081 num_pages, num_pages, 3082 &alloc_hint); 3083 if (!ret) 3084 dcs = BTRFS_DC_SETUP; 3085 btrfs_free_reserved_data_space(inode, num_pages); 3086 3087 out_put: 3088 iput(inode); 3089 out_free: 3090 btrfs_release_path(path); 3091 out: 3092 spin_lock(&block_group->lock); 3093 if (!ret && dcs == BTRFS_DC_SETUP) 3094 block_group->cache_generation = trans->transid; 3095 block_group->disk_cache_state = dcs; 3096 spin_unlock(&block_group->lock); 3097 3098 return ret; 3099 } 3100 3101 int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, 3102 struct btrfs_root *root) 3103 { 3104 struct btrfs_block_group_cache *cache; 3105 int err = 0; 3106 struct btrfs_path *path; 3107 u64 last = 0; 3108 3109 path = btrfs_alloc_path(); 3110 if (!path) 3111 return -ENOMEM; 3112 3113 again: 3114 while (1) { 3115 cache = btrfs_lookup_first_block_group(root->fs_info, last); 3116 while (cache) { 3117 if (cache->disk_cache_state == BTRFS_DC_CLEAR) 3118 break; 3119 cache = next_block_group(root, cache); 3120 } 3121 if (!cache) { 3122 if (last == 0) 3123 break; 3124 last = 0; 3125 continue; 3126 } 3127 err = cache_save_setup(cache, trans, path); 3128 last = cache->key.objectid + cache->key.offset; 3129 btrfs_put_block_group(cache); 3130 } 3131 3132 while (1) { 3133 if (last == 0) { 3134 err = btrfs_run_delayed_refs(trans, root, 3135 (unsigned long)-1); 3136 if (err) /* File system offline */ 3137 goto out; 3138 } 3139 3140 cache = btrfs_lookup_first_block_group(root->fs_info, last); 3141 while (cache) { 3142 if (cache->disk_cache_state == BTRFS_DC_CLEAR) { 3143 btrfs_put_block_group(cache); 3144 goto again; 3145 } 3146 3147 if (cache->dirty) 3148 break; 3149 cache = next_block_group(root, cache); 3150 } 3151 if (!cache) { 3152 if (last == 0) 3153 break; 3154 last = 0; 3155 continue; 3156 } 3157 3158 if (cache->disk_cache_state == BTRFS_DC_SETUP) 3159 cache->disk_cache_state = BTRFS_DC_NEED_WRITE; 3160 cache->dirty = 0; 3161 last = cache->key.objectid + cache->key.offset; 3162 3163 err = write_one_cache_group(trans, root, path, cache); 3164 if (err) /* File system offline */ 3165 goto out; 3166 3167 btrfs_put_block_group(cache); 3168 } 3169 3170 while (1) { 3171 /* 3172 * I don't think this is needed since we're just marking our 
3173 * preallocated extent as written, but just in case it can't 3174 * hurt. 3175 */ 3176 if (last == 0) { 3177 err = btrfs_run_delayed_refs(trans, root, 3178 (unsigned long)-1); 3179 if (err) /* File system offline */ 3180 goto out; 3181 } 3182 3183 cache = btrfs_lookup_first_block_group(root->fs_info, last); 3184 while (cache) { 3185 /* 3186 * Really this shouldn't happen, but it could if we 3187 * couldn't write the entire preallocated extent and 3188 * splitting the extent resulted in a new block. 3189 */ 3190 if (cache->dirty) { 3191 btrfs_put_block_group(cache); 3192 goto again; 3193 } 3194 if (cache->disk_cache_state == BTRFS_DC_NEED_WRITE) 3195 break; 3196 cache = next_block_group(root, cache); 3197 } 3198 if (!cache) { 3199 if (last == 0) 3200 break; 3201 last = 0; 3202 continue; 3203 } 3204 3205 err = btrfs_write_out_cache(root, trans, cache, path); 3206 3207 /* 3208 * If we didn't have an error then the cache state is still 3209 * NEED_WRITE, so we can set it to WRITTEN. 3210 */ 3211 if (!err && cache->disk_cache_state == BTRFS_DC_NEED_WRITE) 3212 cache->disk_cache_state = BTRFS_DC_WRITTEN; 3213 last = cache->key.objectid + cache->key.offset; 3214 btrfs_put_block_group(cache); 3215 } 3216 out: 3217 3218 btrfs_free_path(path); 3219 return err; 3220 } 3221 3222 int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr) 3223 { 3224 struct btrfs_block_group_cache *block_group; 3225 int readonly = 0; 3226 3227 block_group = btrfs_lookup_block_group(root->fs_info, bytenr); 3228 if (!block_group || block_group->ro) 3229 readonly = 1; 3230 if (block_group) 3231 btrfs_put_block_group(block_group); 3232 return readonly; 3233 } 3234 3235 static int update_space_info(struct btrfs_fs_info *info, u64 flags, 3236 u64 total_bytes, u64 bytes_used, 3237 struct btrfs_space_info **space_info) 3238 { 3239 struct btrfs_space_info *found; 3240 int i; 3241 int factor; 3242 3243 if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | 3244 BTRFS_BLOCK_GROUP_RAID10)) 3245 factor = 2; 3246 else 3247 factor = 1; 3248 3249 found = __find_space_info(info, flags); 3250 if (found) { 3251 spin_lock(&found->lock); 3252 found->total_bytes += total_bytes; 3253 found->disk_total += total_bytes * factor; 3254 found->bytes_used += bytes_used; 3255 found->disk_used += bytes_used * factor; 3256 found->full = 0; 3257 spin_unlock(&found->lock); 3258 *space_info = found; 3259 return 0; 3260 } 3261 found = kzalloc(sizeof(*found), GFP_NOFS); 3262 if (!found) 3263 return -ENOMEM; 3264 3265 for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) 3266 INIT_LIST_HEAD(&found->block_groups[i]); 3267 init_rwsem(&found->groups_sem); 3268 spin_lock_init(&found->lock); 3269 found->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK; 3270 found->total_bytes = total_bytes; 3271 found->disk_total = total_bytes * factor; 3272 found->bytes_used = bytes_used; 3273 found->disk_used = bytes_used * factor; 3274 found->bytes_pinned = 0; 3275 found->bytes_reserved = 0; 3276 found->bytes_readonly = 0; 3277 found->bytes_may_use = 0; 3278 found->full = 0; 3279 found->force_alloc = CHUNK_ALLOC_NO_FORCE; 3280 found->chunk_alloc = 0; 3281 found->flush = 0; 3282 init_waitqueue_head(&found->wait); 3283 *space_info = found; 3284 list_add_rcu(&found->list, &info->space_info); 3285 if (flags & BTRFS_BLOCK_GROUP_DATA) 3286 info->data_sinfo = found; 3287 return 0; 3288 } 3289 3290 static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) 3291 { 3292 u64 extra_flags = chunk_to_extended(flags) & 3293 BTRFS_EXTENDED_PROFILE_MASK; 3294 3295 
write_seqlock(&fs_info->profiles_lock); 3296 if (flags & BTRFS_BLOCK_GROUP_DATA) 3297 fs_info->avail_data_alloc_bits |= extra_flags; 3298 if (flags & BTRFS_BLOCK_GROUP_METADATA) 3299 fs_info->avail_metadata_alloc_bits |= extra_flags; 3300 if (flags & BTRFS_BLOCK_GROUP_SYSTEM) 3301 fs_info->avail_system_alloc_bits |= extra_flags; 3302 write_sequnlock(&fs_info->profiles_lock); 3303 } 3304 3305 /* 3306 * returns target flags in extended format or 0 if restripe for this 3307 * chunk_type is not in progress 3308 * 3309 * should be called with either volume_mutex or balance_lock held 3310 */ 3311 static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags) 3312 { 3313 struct btrfs_balance_control *bctl = fs_info->balance_ctl; 3314 u64 target = 0; 3315 3316 if (!bctl) 3317 return 0; 3318 3319 if (flags & BTRFS_BLOCK_GROUP_DATA && 3320 bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) { 3321 target = BTRFS_BLOCK_GROUP_DATA | bctl->data.target; 3322 } else if (flags & BTRFS_BLOCK_GROUP_SYSTEM && 3323 bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) { 3324 target = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target; 3325 } else if (flags & BTRFS_BLOCK_GROUP_METADATA && 3326 bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) { 3327 target = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target; 3328 } 3329 3330 return target; 3331 } 3332 3333 /* 3334 * @flags: available profiles in extended format (see ctree.h) 3335 * 3336 * Returns reduced profile in chunk format. If profile changing is in 3337 * progress (either running or paused) picks the target profile (if it's 3338 * already available), otherwise falls back to plain reducing. 3339 */ 3340 u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags) 3341 { 3342 /* 3343 * we add in the count of missing devices because we want 3344 * to make sure that any RAID levels on a degraded FS 3345 * continue to be honored. 
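 *
 * For example, a two-device RAID1 filesystem mounted degraded with one
 * device missing still computes num_devices == 2 (1 rw + 1 missing), so
 * the RAID1 bit is not stripped by the num_devices checks below.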
3346 */ 3347 u64 num_devices = root->fs_info->fs_devices->rw_devices + 3348 root->fs_info->fs_devices->missing_devices; 3349 u64 target; 3350 u64 tmp; 3351 3352 /* 3353 * see if restripe for this chunk_type is in progress, if so 3354 * try to reduce to the target profile 3355 */ 3356 spin_lock(&root->fs_info->balance_lock); 3357 target = get_restripe_target(root->fs_info, flags); 3358 if (target) { 3359 /* pick target profile only if it's already available */ 3360 if ((flags & target) & BTRFS_EXTENDED_PROFILE_MASK) { 3361 spin_unlock(&root->fs_info->balance_lock); 3362 return extended_to_chunk(target); 3363 } 3364 } 3365 spin_unlock(&root->fs_info->balance_lock); 3366 3367 /* First, mask out the RAID levels which aren't possible */ 3368 if (num_devices == 1) 3369 flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0 | 3370 BTRFS_BLOCK_GROUP_RAID5); 3371 if (num_devices < 3) 3372 flags &= ~BTRFS_BLOCK_GROUP_RAID6; 3373 if (num_devices < 4) 3374 flags &= ~BTRFS_BLOCK_GROUP_RAID10; 3375 3376 tmp = flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID0 | 3377 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID5 | 3378 BTRFS_BLOCK_GROUP_RAID6 | BTRFS_BLOCK_GROUP_RAID10); 3379 flags &= ~tmp; 3380 3381 if (tmp & BTRFS_BLOCK_GROUP_RAID6) 3382 tmp = BTRFS_BLOCK_GROUP_RAID6; 3383 else if (tmp & BTRFS_BLOCK_GROUP_RAID5) 3384 tmp = BTRFS_BLOCK_GROUP_RAID5; 3385 else if (tmp & BTRFS_BLOCK_GROUP_RAID10) 3386 tmp = BTRFS_BLOCK_GROUP_RAID10; 3387 else if (tmp & BTRFS_BLOCK_GROUP_RAID1) 3388 tmp = BTRFS_BLOCK_GROUP_RAID1; 3389 else if (tmp & BTRFS_BLOCK_GROUP_RAID0) 3390 tmp = BTRFS_BLOCK_GROUP_RAID0; 3391 3392 return extended_to_chunk(flags | tmp); 3393 } 3394 3395 static u64 get_alloc_profile(struct btrfs_root *root, u64 flags) 3396 { 3397 unsigned seq; 3398 3399 do { 3400 seq = read_seqbegin(&root->fs_info->profiles_lock); 3401 3402 if (flags & BTRFS_BLOCK_GROUP_DATA) 3403 flags |= root->fs_info->avail_data_alloc_bits; 3404 else if (flags & BTRFS_BLOCK_GROUP_SYSTEM) 3405 flags |= root->fs_info->avail_system_alloc_bits; 3406 else if (flags & BTRFS_BLOCK_GROUP_METADATA) 3407 flags |= root->fs_info->avail_metadata_alloc_bits; 3408 } while (read_seqretry(&root->fs_info->profiles_lock, seq)); 3409 3410 return btrfs_reduce_alloc_profile(root, flags); 3411 } 3412 3413 u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data) 3414 { 3415 u64 flags; 3416 u64 ret; 3417 3418 if (data) 3419 flags = BTRFS_BLOCK_GROUP_DATA; 3420 else if (root == root->fs_info->chunk_root) 3421 flags = BTRFS_BLOCK_GROUP_SYSTEM; 3422 else 3423 flags = BTRFS_BLOCK_GROUP_METADATA; 3424 3425 ret = get_alloc_profile(root, flags); 3426 return ret; 3427 } 3428 3429 /* 3430 * This will check the space that the inode allocates from to make sure we have 3431 * enough space for bytes. 
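 *
 * On success the bytes are added to data_sinfo->bytes_may_use and the
 * reservation is dropped again with btrfs_free_reserved_data_space() once
 * it is no longer needed. A typical pairing looks roughly like the one in
 * cache_save_setup() above:
 *
 *	ret = btrfs_check_data_free_space(inode, bytes);
 *	if (ret)
 *		return ret;
 *	...
 *	btrfs_free_reserved_data_space(inode, bytes);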
3432 */ 3433 int btrfs_check_data_free_space(struct inode *inode, u64 bytes) 3434 { 3435 struct btrfs_space_info *data_sinfo; 3436 struct btrfs_root *root = BTRFS_I(inode)->root; 3437 struct btrfs_fs_info *fs_info = root->fs_info; 3438 u64 used; 3439 int ret = 0, committed = 0, alloc_chunk = 1; 3440 3441 /* make sure bytes are sectorsize aligned */ 3442 bytes = ALIGN(bytes, root->sectorsize); 3443 3444 if (root == root->fs_info->tree_root || 3445 BTRFS_I(inode)->location.objectid == BTRFS_FREE_INO_OBJECTID) { 3446 alloc_chunk = 0; 3447 committed = 1; 3448 } 3449 3450 data_sinfo = fs_info->data_sinfo; 3451 if (!data_sinfo) 3452 goto alloc; 3453 3454 again: 3455 /* make sure we have enough space to handle the data first */ 3456 spin_lock(&data_sinfo->lock); 3457 used = data_sinfo->bytes_used + data_sinfo->bytes_reserved + 3458 data_sinfo->bytes_pinned + data_sinfo->bytes_readonly + 3459 data_sinfo->bytes_may_use; 3460 3461 if (used + bytes > data_sinfo->total_bytes) { 3462 struct btrfs_trans_handle *trans; 3463 3464 /* 3465 * if we don't have enough free bytes in this space then we need 3466 * to alloc a new chunk. 3467 */ 3468 if (!data_sinfo->full && alloc_chunk) { 3469 u64 alloc_target; 3470 3471 data_sinfo->force_alloc = CHUNK_ALLOC_FORCE; 3472 spin_unlock(&data_sinfo->lock); 3473 alloc: 3474 alloc_target = btrfs_get_alloc_profile(root, 1); 3475 trans = btrfs_join_transaction(root); 3476 if (IS_ERR(trans)) 3477 return PTR_ERR(trans); 3478 3479 ret = do_chunk_alloc(trans, root->fs_info->extent_root, 3480 alloc_target, 3481 CHUNK_ALLOC_NO_FORCE); 3482 btrfs_end_transaction(trans, root); 3483 if (ret < 0) { 3484 if (ret != -ENOSPC) 3485 return ret; 3486 else 3487 goto commit_trans; 3488 } 3489 3490 if (!data_sinfo) 3491 data_sinfo = fs_info->data_sinfo; 3492 3493 goto again; 3494 } 3495 3496 /* 3497 * If we have less pinned bytes than we want to allocate then 3498 * don't bother committing the transaction, it won't help us. 3499 */ 3500 if (data_sinfo->bytes_pinned < bytes) 3501 committed = 1; 3502 spin_unlock(&data_sinfo->lock); 3503 3504 /* commit the current transaction and try again */ 3505 commit_trans: 3506 if (!committed && 3507 !atomic_read(&root->fs_info->open_ioctl_trans)) { 3508 committed = 1; 3509 trans = btrfs_join_transaction(root); 3510 if (IS_ERR(trans)) 3511 return PTR_ERR(trans); 3512 ret = btrfs_commit_transaction(trans, root); 3513 if (ret) 3514 return ret; 3515 goto again; 3516 } 3517 3518 return -ENOSPC; 3519 } 3520 data_sinfo->bytes_may_use += bytes; 3521 trace_btrfs_space_reservation(root->fs_info, "space_info", 3522 data_sinfo->flags, bytes, 1); 3523 spin_unlock(&data_sinfo->lock); 3524 3525 return 0; 3526 } 3527 3528 /* 3529 * Called if we need to clear a data reservation for this inode. 
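 *
 * The byte count is rounded up to root->sectorsize exactly as
 * btrfs_check_data_free_space() rounds it, so passing the same length to
 * both keeps the bytes_may_use accounting balanced.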
3530 */ 3531 void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes) 3532 { 3533 struct btrfs_root *root = BTRFS_I(inode)->root; 3534 struct btrfs_space_info *data_sinfo; 3535 3536 /* make sure bytes are sectorsize aligned */ 3537 bytes = ALIGN(bytes, root->sectorsize); 3538 3539 data_sinfo = root->fs_info->data_sinfo; 3540 spin_lock(&data_sinfo->lock); 3541 data_sinfo->bytes_may_use -= bytes; 3542 trace_btrfs_space_reservation(root->fs_info, "space_info", 3543 data_sinfo->flags, bytes, 0); 3544 spin_unlock(&data_sinfo->lock); 3545 } 3546 3547 static void force_metadata_allocation(struct btrfs_fs_info *info) 3548 { 3549 struct list_head *head = &info->space_info; 3550 struct btrfs_space_info *found; 3551 3552 rcu_read_lock(); 3553 list_for_each_entry_rcu(found, head, list) { 3554 if (found->flags & BTRFS_BLOCK_GROUP_METADATA) 3555 found->force_alloc = CHUNK_ALLOC_FORCE; 3556 } 3557 rcu_read_unlock(); 3558 } 3559 3560 static int should_alloc_chunk(struct btrfs_root *root, 3561 struct btrfs_space_info *sinfo, int force) 3562 { 3563 struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv; 3564 u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly; 3565 u64 num_allocated = sinfo->bytes_used + sinfo->bytes_reserved; 3566 u64 thresh; 3567 3568 if (force == CHUNK_ALLOC_FORCE) 3569 return 1; 3570 3571 /* 3572 * We need to take into account the global rsv because for all intents 3573 * and purposes it's used space. Don't worry about locking the 3574 * global_rsv, it doesn't change except when the transaction commits. 3575 */ 3576 if (sinfo->flags & BTRFS_BLOCK_GROUP_METADATA) 3577 num_allocated += global_rsv->size; 3578 3579 /* 3580 * in limited mode, we want to have some free space up to 3581 * about 1% of the FS size. 3582 */ 3583 if (force == CHUNK_ALLOC_LIMITED) { 3584 thresh = btrfs_super_total_bytes(root->fs_info->super_copy); 3585 thresh = max_t(u64, 64 * 1024 * 1024, 3586 div_factor_fine(thresh, 1)); 3587 3588 if (num_bytes - num_allocated < thresh) 3589 return 1; 3590 } 3591 3592 if (num_allocated + 2 * 1024 * 1024 < div_factor(num_bytes, 8)) 3593 return 0; 3594 return 1; 3595 } 3596 3597 static u64 get_system_chunk_thresh(struct btrfs_root *root, u64 type) 3598 { 3599 u64 num_dev; 3600 3601 if (type & (BTRFS_BLOCK_GROUP_RAID10 | 3602 BTRFS_BLOCK_GROUP_RAID0 | 3603 BTRFS_BLOCK_GROUP_RAID5 | 3604 BTRFS_BLOCK_GROUP_RAID6)) 3605 num_dev = root->fs_info->fs_devices->rw_devices; 3606 else if (type & BTRFS_BLOCK_GROUP_RAID1) 3607 num_dev = 2; 3608 else 3609 num_dev = 1; /* DUP or single */ 3610 3611 /* metadata for updaing devices and chunk tree */ 3612 return btrfs_calc_trans_metadata_size(root, num_dev + 1); 3613 } 3614 3615 static void check_system_chunk(struct btrfs_trans_handle *trans, 3616 struct btrfs_root *root, u64 type) 3617 { 3618 struct btrfs_space_info *info; 3619 u64 left; 3620 u64 thresh; 3621 3622 info = __find_space_info(root->fs_info, BTRFS_BLOCK_GROUP_SYSTEM); 3623 spin_lock(&info->lock); 3624 left = info->total_bytes - info->bytes_used - info->bytes_pinned - 3625 info->bytes_reserved - info->bytes_readonly; 3626 spin_unlock(&info->lock); 3627 3628 thresh = get_system_chunk_thresh(root, type); 3629 if (left < thresh && btrfs_test_opt(root, ENOSPC_DEBUG)) { 3630 printk(KERN_INFO "left=%llu, need=%llu, flags=%llu\n", 3631 left, thresh, type); 3632 dump_space_info(info, 0, 0); 3633 } 3634 3635 if (left < thresh) { 3636 u64 flags; 3637 3638 flags = btrfs_get_alloc_profile(root->fs_info->chunk_root, 0); 3639 btrfs_alloc_chunk(trans, root, flags); 3640 
} 3641 } 3642 3643 static int do_chunk_alloc(struct btrfs_trans_handle *trans, 3644 struct btrfs_root *extent_root, u64 flags, int force) 3645 { 3646 struct btrfs_space_info *space_info; 3647 struct btrfs_fs_info *fs_info = extent_root->fs_info; 3648 int wait_for_alloc = 0; 3649 int ret = 0; 3650 3651 /* Don't re-enter if we're already allocating a chunk */ 3652 if (trans->allocating_chunk) 3653 return -ENOSPC; 3654 3655 space_info = __find_space_info(extent_root->fs_info, flags); 3656 if (!space_info) { 3657 ret = update_space_info(extent_root->fs_info, flags, 3658 0, 0, &space_info); 3659 BUG_ON(ret); /* -ENOMEM */ 3660 } 3661 BUG_ON(!space_info); /* Logic error */ 3662 3663 again: 3664 spin_lock(&space_info->lock); 3665 if (force < space_info->force_alloc) 3666 force = space_info->force_alloc; 3667 if (space_info->full) { 3668 spin_unlock(&space_info->lock); 3669 return 0; 3670 } 3671 3672 if (!should_alloc_chunk(extent_root, space_info, force)) { 3673 spin_unlock(&space_info->lock); 3674 return 0; 3675 } else if (space_info->chunk_alloc) { 3676 wait_for_alloc = 1; 3677 } else { 3678 space_info->chunk_alloc = 1; 3679 } 3680 3681 spin_unlock(&space_info->lock); 3682 3683 mutex_lock(&fs_info->chunk_mutex); 3684 3685 /* 3686 * The chunk_mutex is held throughout the entirety of a chunk 3687 * allocation, so once we've acquired the chunk_mutex we know that the 3688 * other guy is done and we need to recheck and see if we should 3689 * allocate. 3690 */ 3691 if (wait_for_alloc) { 3692 mutex_unlock(&fs_info->chunk_mutex); 3693 wait_for_alloc = 0; 3694 goto again; 3695 } 3696 3697 trans->allocating_chunk = true; 3698 3699 /* 3700 * If we have mixed data/metadata chunks we want to make sure we keep 3701 * allocating mixed chunks instead of individual chunks. 3702 */ 3703 if (btrfs_mixed_space_info(space_info)) 3704 flags |= (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA); 3705 3706 /* 3707 * if we're doing a data chunk, go ahead and make sure that 3708 * we keep a reasonable number of metadata chunks allocated in the 3709 * FS as well. 3710 */ 3711 if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) { 3712 fs_info->data_chunk_allocations++; 3713 if (!(fs_info->data_chunk_allocations % 3714 fs_info->metadata_ratio)) 3715 force_metadata_allocation(fs_info); 3716 } 3717 3718 /* 3719 * Check if we have enough space in SYSTEM chunk because we may need 3720 * to update devices. 
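 *
 * check_system_chunk() compares the free space in the SYSTEM space_info
 * against get_system_chunk_thresh(), which sizes the metadata needed to
 * update the device and chunk trees, and allocates another SYSTEM chunk
 * up front if we are below that threshold.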
3721 */ 3722 check_system_chunk(trans, extent_root, flags); 3723 3724 ret = btrfs_alloc_chunk(trans, extent_root, flags); 3725 trans->allocating_chunk = false; 3726 3727 spin_lock(&space_info->lock); 3728 if (ret < 0 && ret != -ENOSPC) 3729 goto out; 3730 if (ret) 3731 space_info->full = 1; 3732 else 3733 ret = 1; 3734 3735 space_info->force_alloc = CHUNK_ALLOC_NO_FORCE; 3736 out: 3737 space_info->chunk_alloc = 0; 3738 spin_unlock(&space_info->lock); 3739 mutex_unlock(&fs_info->chunk_mutex); 3740 return ret; 3741 } 3742 3743 static int can_overcommit(struct btrfs_root *root, 3744 struct btrfs_space_info *space_info, u64 bytes, 3745 enum btrfs_reserve_flush_enum flush) 3746 { 3747 struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv; 3748 u64 profile = btrfs_get_alloc_profile(root, 0); 3749 u64 rsv_size = 0; 3750 u64 avail; 3751 u64 used; 3752 u64 to_add; 3753 3754 used = space_info->bytes_used + space_info->bytes_reserved + 3755 space_info->bytes_pinned + space_info->bytes_readonly; 3756 3757 spin_lock(&global_rsv->lock); 3758 rsv_size = global_rsv->size; 3759 spin_unlock(&global_rsv->lock); 3760 3761 /* 3762 * We only want to allow over committing if we have lots of actual space 3763 * free, but if we don't have enough space to handle the global reserve 3764 * space then we could end up having a real enospc problem when trying 3765 * to allocate a chunk or some other such important allocation. 3766 */ 3767 rsv_size <<= 1; 3768 if (used + rsv_size >= space_info->total_bytes) 3769 return 0; 3770 3771 used += space_info->bytes_may_use; 3772 3773 spin_lock(&root->fs_info->free_chunk_lock); 3774 avail = root->fs_info->free_chunk_space; 3775 spin_unlock(&root->fs_info->free_chunk_lock); 3776 3777 /* 3778 * If we have dup, raid1 or raid10 then only half of the free 3779 * space is actually useable. For raid56, the space info used 3780 * doesn't include the parity drive, so we don't have to 3781 * change the math 3782 */ 3783 if (profile & (BTRFS_BLOCK_GROUP_DUP | 3784 BTRFS_BLOCK_GROUP_RAID1 | 3785 BTRFS_BLOCK_GROUP_RAID10)) 3786 avail >>= 1; 3787 3788 to_add = space_info->total_bytes; 3789 3790 /* 3791 * If we aren't flushing all things, let us overcommit up to 3792 * 1/2th of the space. If we can flush, don't let us overcommit 3793 * too much, let it overcommit up to 1/8 of the space. 3794 */ 3795 if (flush == BTRFS_RESERVE_FLUSH_ALL) 3796 to_add >>= 3; 3797 else 3798 to_add >>= 1; 3799 3800 /* 3801 * Limit the overcommit to the amount of free space we could possibly 3802 * allocate for chunks. 3803 */ 3804 to_add = min(avail, to_add); 3805 3806 if (used + bytes < space_info->total_bytes + to_add) 3807 return 1; 3808 return 0; 3809 } 3810 3811 void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root, 3812 unsigned long nr_pages) 3813 { 3814 struct super_block *sb = root->fs_info->sb; 3815 int started; 3816 3817 /* If we can not start writeback, just sync all the delalloc file. */ 3818 started = try_to_writeback_inodes_sb_nr(sb, nr_pages, 3819 WB_REASON_FS_FREE_SPACE); 3820 if (!started) { 3821 /* 3822 * We needn't worry the filesystem going from r/w to r/o though 3823 * we don't acquire ->s_umount mutex, because the filesystem 3824 * should guarantee the delalloc inodes list be empty after 3825 * the filesystem is readonly(all dirty pages are written to 3826 * the disk). 
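 *
 * This fallback is heavier than try_to_writeback_inodes_sb_nr(): rather
 * than writing back roughly nr_pages worth, it starts delalloc writeout
 * and then waits for the ordered extents to complete.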
3827 */ 3828 btrfs_start_delalloc_inodes(root, 0); 3829 btrfs_wait_ordered_extents(root, 0); 3830 } 3831 } 3832 3833 /* 3834 * shrink metadata reservation for delalloc 3835 */ 3836 static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig, 3837 bool wait_ordered) 3838 { 3839 struct btrfs_block_rsv *block_rsv; 3840 struct btrfs_space_info *space_info; 3841 struct btrfs_trans_handle *trans; 3842 u64 delalloc_bytes; 3843 u64 max_reclaim; 3844 long time_left; 3845 unsigned long nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT; 3846 int loops = 0; 3847 enum btrfs_reserve_flush_enum flush; 3848 3849 trans = (struct btrfs_trans_handle *)current->journal_info; 3850 block_rsv = &root->fs_info->delalloc_block_rsv; 3851 space_info = block_rsv->space_info; 3852 3853 smp_mb(); 3854 delalloc_bytes = percpu_counter_sum_positive( 3855 &root->fs_info->delalloc_bytes); 3856 if (delalloc_bytes == 0) { 3857 if (trans) 3858 return; 3859 btrfs_wait_ordered_extents(root, 0); 3860 return; 3861 } 3862 3863 while (delalloc_bytes && loops < 3) { 3864 max_reclaim = min(delalloc_bytes, to_reclaim); 3865 nr_pages = max_reclaim >> PAGE_CACHE_SHIFT; 3866 btrfs_writeback_inodes_sb_nr(root, nr_pages); 3867 /* 3868 * We need to wait for the async pages to actually start before 3869 * we do anything. 3870 */ 3871 wait_event(root->fs_info->async_submit_wait, 3872 !atomic_read(&root->fs_info->async_delalloc_pages)); 3873 3874 if (!trans) 3875 flush = BTRFS_RESERVE_FLUSH_ALL; 3876 else 3877 flush = BTRFS_RESERVE_NO_FLUSH; 3878 spin_lock(&space_info->lock); 3879 if (can_overcommit(root, space_info, orig, flush)) { 3880 spin_unlock(&space_info->lock); 3881 break; 3882 } 3883 spin_unlock(&space_info->lock); 3884 3885 loops++; 3886 if (wait_ordered && !trans) { 3887 btrfs_wait_ordered_extents(root, 0); 3888 } else { 3889 time_left = schedule_timeout_killable(1); 3890 if (time_left) 3891 break; 3892 } 3893 smp_mb(); 3894 delalloc_bytes = percpu_counter_sum_positive( 3895 &root->fs_info->delalloc_bytes); 3896 } 3897 } 3898 3899 /** 3900 * maybe_commit_transaction - possibly commit the transaction if its ok to 3901 * @root - the root we're allocating for 3902 * @bytes - the number of bytes we want to reserve 3903 * @force - force the commit 3904 * 3905 * This will check to make sure that committing the transaction will actually 3906 * get us somewhere and then commit the transaction if it does. Otherwise it 3907 * will return -ENOSPC. 3908 */ 3909 static int may_commit_transaction(struct btrfs_root *root, 3910 struct btrfs_space_info *space_info, 3911 u64 bytes, int force) 3912 { 3913 struct btrfs_block_rsv *delayed_rsv = &root->fs_info->delayed_block_rsv; 3914 struct btrfs_trans_handle *trans; 3915 3916 trans = (struct btrfs_trans_handle *)current->journal_info; 3917 if (trans) 3918 return -EAGAIN; 3919 3920 if (force) 3921 goto commit; 3922 3923 /* See if there is enough pinned space to make this reservation */ 3924 spin_lock(&space_info->lock); 3925 if (space_info->bytes_pinned >= bytes) { 3926 spin_unlock(&space_info->lock); 3927 goto commit; 3928 } 3929 spin_unlock(&space_info->lock); 3930 3931 /* 3932 * See if there is some space in the delayed insertion reservation for 3933 * this reservation. 
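 *
 * Committing is only worthwhile if the pinned bytes the commit would free,
 * plus whatever the delayed insertion rsv would hand back, can cover this
 * reservation; otherwise we return -ENOSPC without forcing a commit.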
3934 */ 3935 if (space_info != delayed_rsv->space_info) 3936 return -ENOSPC; 3937 3938 spin_lock(&space_info->lock); 3939 spin_lock(&delayed_rsv->lock); 3940 if (space_info->bytes_pinned + delayed_rsv->size < bytes) { 3941 spin_unlock(&delayed_rsv->lock); 3942 spin_unlock(&space_info->lock); 3943 return -ENOSPC; 3944 } 3945 spin_unlock(&delayed_rsv->lock); 3946 spin_unlock(&space_info->lock); 3947 3948 commit: 3949 trans = btrfs_join_transaction(root); 3950 if (IS_ERR(trans)) 3951 return -ENOSPC; 3952 3953 return btrfs_commit_transaction(trans, root); 3954 } 3955 3956 enum flush_state { 3957 FLUSH_DELAYED_ITEMS_NR = 1, 3958 FLUSH_DELAYED_ITEMS = 2, 3959 FLUSH_DELALLOC = 3, 3960 FLUSH_DELALLOC_WAIT = 4, 3961 ALLOC_CHUNK = 5, 3962 COMMIT_TRANS = 6, 3963 }; 3964 3965 static int flush_space(struct btrfs_root *root, 3966 struct btrfs_space_info *space_info, u64 num_bytes, 3967 u64 orig_bytes, int state) 3968 { 3969 struct btrfs_trans_handle *trans; 3970 int nr; 3971 int ret = 0; 3972 3973 switch (state) { 3974 case FLUSH_DELAYED_ITEMS_NR: 3975 case FLUSH_DELAYED_ITEMS: 3976 if (state == FLUSH_DELAYED_ITEMS_NR) { 3977 u64 bytes = btrfs_calc_trans_metadata_size(root, 1); 3978 3979 nr = (int)div64_u64(num_bytes, bytes); 3980 if (!nr) 3981 nr = 1; 3982 nr *= 2; 3983 } else { 3984 nr = -1; 3985 } 3986 trans = btrfs_join_transaction(root); 3987 if (IS_ERR(trans)) { 3988 ret = PTR_ERR(trans); 3989 break; 3990 } 3991 ret = btrfs_run_delayed_items_nr(trans, root, nr); 3992 btrfs_end_transaction(trans, root); 3993 break; 3994 case FLUSH_DELALLOC: 3995 case FLUSH_DELALLOC_WAIT: 3996 shrink_delalloc(root, num_bytes, orig_bytes, 3997 state == FLUSH_DELALLOC_WAIT); 3998 break; 3999 case ALLOC_CHUNK: 4000 trans = btrfs_join_transaction(root); 4001 if (IS_ERR(trans)) { 4002 ret = PTR_ERR(trans); 4003 break; 4004 } 4005 ret = do_chunk_alloc(trans, root->fs_info->extent_root, 4006 btrfs_get_alloc_profile(root, 0), 4007 CHUNK_ALLOC_NO_FORCE); 4008 btrfs_end_transaction(trans, root); 4009 if (ret == -ENOSPC) 4010 ret = 0; 4011 break; 4012 case COMMIT_TRANS: 4013 ret = may_commit_transaction(root, space_info, orig_bytes, 0); 4014 break; 4015 default: 4016 ret = -ENOSPC; 4017 break; 4018 } 4019 4020 return ret; 4021 } 4022 /** 4023 * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space 4024 * @root - the root we're allocating for 4025 * @block_rsv - the block_rsv we're allocating for 4026 * @orig_bytes - the number of bytes we want 4027 * @flush - whether or not we can flush to make our reservation 4028 * 4029 * This will reserve orgi_bytes number of bytes from the space info associated 4030 * with the block_rsv. If there is not enough space it will make an attempt to 4031 * flush out space to make room. It will do this by flushing delalloc if 4032 * possible or committing the transaction. If flush is 0 then no attempts to 4033 * regain reservations will be made and this will fail if there is not enough 4034 * space already. 
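 *
 * When flushing is allowed, the states in enum flush_state above are tried
 * in order (delayed items, delalloc, chunk allocation, transaction
 * commit); BTRFS_RESERVE_FLUSH_LIMIT skips the delalloc states and stops
 * short of committing the transaction, and BTRFS_RESERVE_NO_FLUSH simply
 * fails with -ENOSPC if the space is not already available.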
4035 */ 4036 static int reserve_metadata_bytes(struct btrfs_root *root, 4037 struct btrfs_block_rsv *block_rsv, 4038 u64 orig_bytes, 4039 enum btrfs_reserve_flush_enum flush) 4040 { 4041 struct btrfs_space_info *space_info = block_rsv->space_info; 4042 u64 used; 4043 u64 num_bytes = orig_bytes; 4044 int flush_state = FLUSH_DELAYED_ITEMS_NR; 4045 int ret = 0; 4046 bool flushing = false; 4047 4048 again: 4049 ret = 0; 4050 spin_lock(&space_info->lock); 4051 /* 4052 * We only want to wait if somebody other than us is flushing and we 4053 * are actually allowed to flush all things. 4054 */ 4055 while (flush == BTRFS_RESERVE_FLUSH_ALL && !flushing && 4056 space_info->flush) { 4057 spin_unlock(&space_info->lock); 4058 /* 4059 * If we have a trans handle we can't wait because the flusher 4060 * may have to commit the transaction, which would mean we would 4061 * deadlock since we are waiting for the flusher to finish, but 4062 * hold the current transaction open. 4063 */ 4064 if (current->journal_info) 4065 return -EAGAIN; 4066 ret = wait_event_killable(space_info->wait, !space_info->flush); 4067 /* Must have been killed, return */ 4068 if (ret) 4069 return -EINTR; 4070 4071 spin_lock(&space_info->lock); 4072 } 4073 4074 ret = -ENOSPC; 4075 used = space_info->bytes_used + space_info->bytes_reserved + 4076 space_info->bytes_pinned + space_info->bytes_readonly + 4077 space_info->bytes_may_use; 4078 4079 /* 4080 * The idea here is that if we've not already over-reserved the block 4081 * group then we can go ahead and save our reservation first and then start 4082 * flushing if we need to. Otherwise if we've already overcommitted 4083 * let's start flushing stuff first and then come back and try to make 4084 * our reservation. 4085 */ 4086 if (used <= space_info->total_bytes) { 4087 if (used + orig_bytes <= space_info->total_bytes) { 4088 space_info->bytes_may_use += orig_bytes; 4089 trace_btrfs_space_reservation(root->fs_info, 4090 "space_info", space_info->flags, orig_bytes, 1); 4091 ret = 0; 4092 } else { 4093 /* 4094 * Ok set num_bytes to orig_bytes since we aren't 4095 * overcommitted, this way we only try and reclaim what 4096 * we need. 4097 */ 4098 num_bytes = orig_bytes; 4099 } 4100 } else { 4101 /* 4102 * Ok we're over committed, set num_bytes to the overcommitted 4103 * amount plus the amount of bytes that we need for this 4104 * reservation. 4105 */ 4106 num_bytes = used - space_info->total_bytes + 4107 (orig_bytes * 2); 4108 } 4109 4110 if (ret && can_overcommit(root, space_info, orig_bytes, flush)) { 4111 space_info->bytes_may_use += orig_bytes; 4112 trace_btrfs_space_reservation(root->fs_info, "space_info", 4113 space_info->flags, orig_bytes, 4114 1); 4115 ret = 0; 4116 } 4117 4118 /* 4119 * Couldn't make our reservation, save our place so while we're trying 4120 * to reclaim space we can actually use it instead of somebody else 4121 * stealing it from us. 4122 * 4123 * We make the other tasks wait for the flush only when we can flush 4124 * all things. 4125 */ 4126 if (ret && flush != BTRFS_RESERVE_NO_FLUSH) { 4127 flushing = true; 4128 space_info->flush = 1; 4129 } 4130 4131 spin_unlock(&space_info->lock); 4132 4133 if (!ret || flush == BTRFS_RESERVE_NO_FLUSH) 4134 goto out; 4135 4136 ret = flush_space(root, space_info, num_bytes, orig_bytes, 4137 flush_state); 4138 flush_state++; 4139 4140 /* 4141 * If we are FLUSH_LIMIT, we can not flush delalloc, or a deadlock 4142 * would occur. So skip the delalloc flush.
4143 */ 4144 if (flush == BTRFS_RESERVE_FLUSH_LIMIT && 4145 (flush_state == FLUSH_DELALLOC || 4146 flush_state == FLUSH_DELALLOC_WAIT)) 4147 flush_state = ALLOC_CHUNK; 4148 4149 if (!ret) 4150 goto again; 4151 else if (flush == BTRFS_RESERVE_FLUSH_LIMIT && 4152 flush_state < COMMIT_TRANS) 4153 goto again; 4154 else if (flush == BTRFS_RESERVE_FLUSH_ALL && 4155 flush_state <= COMMIT_TRANS) 4156 goto again; 4157 4158 out: 4159 if (ret == -ENOSPC && 4160 unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) { 4161 struct btrfs_block_rsv *global_rsv = 4162 &root->fs_info->global_block_rsv; 4163 4164 if (block_rsv != global_rsv && 4165 !block_rsv_use_bytes(global_rsv, orig_bytes)) 4166 ret = 0; 4167 } 4168 if (flushing) { 4169 spin_lock(&space_info->lock); 4170 space_info->flush = 0; 4171 wake_up_all(&space_info->wait); 4172 spin_unlock(&space_info->lock); 4173 } 4174 return ret; 4175 } 4176 4177 static struct btrfs_block_rsv *get_block_rsv( 4178 const struct btrfs_trans_handle *trans, 4179 const struct btrfs_root *root) 4180 { 4181 struct btrfs_block_rsv *block_rsv = NULL; 4182 4183 if (root->ref_cows) 4184 block_rsv = trans->block_rsv; 4185 4186 if (root == root->fs_info->csum_root && trans->adding_csums) 4187 block_rsv = trans->block_rsv; 4188 4189 if (!block_rsv) 4190 block_rsv = root->block_rsv; 4191 4192 if (!block_rsv) 4193 block_rsv = &root->fs_info->empty_block_rsv; 4194 4195 return block_rsv; 4196 } 4197 4198 static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv, 4199 u64 num_bytes) 4200 { 4201 int ret = -ENOSPC; 4202 spin_lock(&block_rsv->lock); 4203 if (block_rsv->reserved >= num_bytes) { 4204 block_rsv->reserved -= num_bytes; 4205 if (block_rsv->reserved < block_rsv->size) 4206 block_rsv->full = 0; 4207 ret = 0; 4208 } 4209 spin_unlock(&block_rsv->lock); 4210 return ret; 4211 } 4212 4213 static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv, 4214 u64 num_bytes, int update_size) 4215 { 4216 spin_lock(&block_rsv->lock); 4217 block_rsv->reserved += num_bytes; 4218 if (update_size) 4219 block_rsv->size += num_bytes; 4220 else if (block_rsv->reserved >= block_rsv->size) 4221 block_rsv->full = 1; 4222 spin_unlock(&block_rsv->lock); 4223 } 4224 4225 static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info, 4226 struct btrfs_block_rsv *block_rsv, 4227 struct btrfs_block_rsv *dest, u64 num_bytes) 4228 { 4229 struct btrfs_space_info *space_info = block_rsv->space_info; 4230 4231 spin_lock(&block_rsv->lock); 4232 if (num_bytes == (u64)-1) 4233 num_bytes = block_rsv->size; 4234 block_rsv->size -= num_bytes; 4235 if (block_rsv->reserved >= block_rsv->size) { 4236 num_bytes = block_rsv->reserved - block_rsv->size; 4237 block_rsv->reserved = block_rsv->size; 4238 block_rsv->full = 1; 4239 } else { 4240 num_bytes = 0; 4241 } 4242 spin_unlock(&block_rsv->lock); 4243 4244 if (num_bytes > 0) { 4245 if (dest) { 4246 spin_lock(&dest->lock); 4247 if (!dest->full) { 4248 u64 bytes_to_add; 4249 4250 bytes_to_add = dest->size - dest->reserved; 4251 bytes_to_add = min(num_bytes, bytes_to_add); 4252 dest->reserved += bytes_to_add; 4253 if (dest->reserved >= dest->size) 4254 dest->full = 1; 4255 num_bytes -= bytes_to_add; 4256 } 4257 spin_unlock(&dest->lock); 4258 } 4259 if (num_bytes) { 4260 spin_lock(&space_info->lock); 4261 space_info->bytes_may_use -= num_bytes; 4262 trace_btrfs_space_reservation(fs_info, "space_info", 4263 space_info->flags, num_bytes, 0); 4264 space_info->reservation_progress++; 4265 spin_unlock(&space_info->lock); 4266 } 4267 } 4268 } 4269 4270 
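/*
 * Move num_bytes of reservation from one block_rsv to another. The transfer
 * is all or nothing: if @src does not have num_bytes reserved, -ENOSPC is
 * returned and neither rsv is touched; on success @dst grows both its
 * reserved and its size counters by num_bytes.
 */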
static int block_rsv_migrate_bytes(struct btrfs_block_rsv *src, 4271 struct btrfs_block_rsv *dst, u64 num_bytes) 4272 { 4273 int ret; 4274 4275 ret = block_rsv_use_bytes(src, num_bytes); 4276 if (ret) 4277 return ret; 4278 4279 block_rsv_add_bytes(dst, num_bytes, 1); 4280 return 0; 4281 } 4282 4283 void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type) 4284 { 4285 memset(rsv, 0, sizeof(*rsv)); 4286 spin_lock_init(&rsv->lock); 4287 rsv->type = type; 4288 } 4289 4290 struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root, 4291 unsigned short type) 4292 { 4293 struct btrfs_block_rsv *block_rsv; 4294 struct btrfs_fs_info *fs_info = root->fs_info; 4295 4296 block_rsv = kmalloc(sizeof(*block_rsv), GFP_NOFS); 4297 if (!block_rsv) 4298 return NULL; 4299 4300 btrfs_init_block_rsv(block_rsv, type); 4301 block_rsv->space_info = __find_space_info(fs_info, 4302 BTRFS_BLOCK_GROUP_METADATA); 4303 return block_rsv; 4304 } 4305 4306 void btrfs_free_block_rsv(struct btrfs_root *root, 4307 struct btrfs_block_rsv *rsv) 4308 { 4309 if (!rsv) 4310 return; 4311 btrfs_block_rsv_release(root, rsv, (u64)-1); 4312 kfree(rsv); 4313 } 4314 4315 int btrfs_block_rsv_add(struct btrfs_root *root, 4316 struct btrfs_block_rsv *block_rsv, u64 num_bytes, 4317 enum btrfs_reserve_flush_enum flush) 4318 { 4319 int ret; 4320 4321 if (num_bytes == 0) 4322 return 0; 4323 4324 ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush); 4325 if (!ret) { 4326 block_rsv_add_bytes(block_rsv, num_bytes, 1); 4327 return 0; 4328 } 4329 4330 return ret; 4331 } 4332 4333 int btrfs_block_rsv_check(struct btrfs_root *root, 4334 struct btrfs_block_rsv *block_rsv, int min_factor) 4335 { 4336 u64 num_bytes = 0; 4337 int ret = -ENOSPC; 4338 4339 if (!block_rsv) 4340 return 0; 4341 4342 spin_lock(&block_rsv->lock); 4343 num_bytes = div_factor(block_rsv->size, min_factor); 4344 if (block_rsv->reserved >= num_bytes) 4345 ret = 0; 4346 spin_unlock(&block_rsv->lock); 4347 4348 return ret; 4349 } 4350 4351 int btrfs_block_rsv_refill(struct btrfs_root *root, 4352 struct btrfs_block_rsv *block_rsv, u64 min_reserved, 4353 enum btrfs_reserve_flush_enum flush) 4354 { 4355 u64 num_bytes = 0; 4356 int ret = -ENOSPC; 4357 4358 if (!block_rsv) 4359 return 0; 4360 4361 spin_lock(&block_rsv->lock); 4362 num_bytes = min_reserved; 4363 if (block_rsv->reserved >= num_bytes) 4364 ret = 0; 4365 else 4366 num_bytes -= block_rsv->reserved; 4367 spin_unlock(&block_rsv->lock); 4368 4369 if (!ret) 4370 return 0; 4371 4372 ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush); 4373 if (!ret) { 4374 block_rsv_add_bytes(block_rsv, num_bytes, 0); 4375 return 0; 4376 } 4377 4378 return ret; 4379 } 4380 4381 int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, 4382 struct btrfs_block_rsv *dst_rsv, 4383 u64 num_bytes) 4384 { 4385 return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes); 4386 } 4387 4388 void btrfs_block_rsv_release(struct btrfs_root *root, 4389 struct btrfs_block_rsv *block_rsv, 4390 u64 num_bytes) 4391 { 4392 struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv; 4393 if (global_rsv->full || global_rsv == block_rsv || 4394 block_rsv->space_info != global_rsv->space_info) 4395 global_rsv = NULL; 4396 block_rsv_release_bytes(root->fs_info, block_rsv, global_rsv, 4397 num_bytes); 4398 } 4399 4400 /* 4401 * helper to calculate size of global block reservation. 
4402 * the desired value is sum of space used by extent tree, 4403 * checksum tree and root tree 4404 */ 4405 static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info) 4406 { 4407 struct btrfs_space_info *sinfo; 4408 u64 num_bytes; 4409 u64 meta_used; 4410 u64 data_used; 4411 int csum_size = btrfs_super_csum_size(fs_info->super_copy); 4412 4413 sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA); 4414 spin_lock(&sinfo->lock); 4415 data_used = sinfo->bytes_used; 4416 spin_unlock(&sinfo->lock); 4417 4418 sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); 4419 spin_lock(&sinfo->lock); 4420 if (sinfo->flags & BTRFS_BLOCK_GROUP_DATA) 4421 data_used = 0; 4422 meta_used = sinfo->bytes_used; 4423 spin_unlock(&sinfo->lock); 4424 4425 num_bytes = (data_used >> fs_info->sb->s_blocksize_bits) * 4426 csum_size * 2; 4427 num_bytes += div64_u64(data_used + meta_used, 50); 4428 4429 if (num_bytes * 3 > meta_used) 4430 num_bytes = div64_u64(meta_used, 3); 4431 4432 return ALIGN(num_bytes, fs_info->extent_root->leafsize << 10); 4433 } 4434 4435 static void update_global_block_rsv(struct btrfs_fs_info *fs_info) 4436 { 4437 struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv; 4438 struct btrfs_space_info *sinfo = block_rsv->space_info; 4439 u64 num_bytes; 4440 4441 num_bytes = calc_global_metadata_size(fs_info); 4442 4443 spin_lock(&sinfo->lock); 4444 spin_lock(&block_rsv->lock); 4445 4446 block_rsv->size = min_t(u64, num_bytes, 512 * 1024 * 1024); 4447 4448 num_bytes = sinfo->bytes_used + sinfo->bytes_pinned + 4449 sinfo->bytes_reserved + sinfo->bytes_readonly + 4450 sinfo->bytes_may_use; 4451 4452 if (sinfo->total_bytes > num_bytes) { 4453 num_bytes = sinfo->total_bytes - num_bytes; 4454 block_rsv->reserved += num_bytes; 4455 sinfo->bytes_may_use += num_bytes; 4456 trace_btrfs_space_reservation(fs_info, "space_info", 4457 sinfo->flags, num_bytes, 1); 4458 } 4459 4460 if (block_rsv->reserved >= block_rsv->size) { 4461 num_bytes = block_rsv->reserved - block_rsv->size; 4462 sinfo->bytes_may_use -= num_bytes; 4463 trace_btrfs_space_reservation(fs_info, "space_info", 4464 sinfo->flags, num_bytes, 0); 4465 sinfo->reservation_progress++; 4466 block_rsv->reserved = block_rsv->size; 4467 block_rsv->full = 1; 4468 } 4469 4470 spin_unlock(&block_rsv->lock); 4471 spin_unlock(&sinfo->lock); 4472 } 4473 4474 static void init_global_block_rsv(struct btrfs_fs_info *fs_info) 4475 { 4476 struct btrfs_space_info *space_info; 4477 4478 space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM); 4479 fs_info->chunk_block_rsv.space_info = space_info; 4480 4481 space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); 4482 fs_info->global_block_rsv.space_info = space_info; 4483 fs_info->delalloc_block_rsv.space_info = space_info; 4484 fs_info->trans_block_rsv.space_info = space_info; 4485 fs_info->empty_block_rsv.space_info = space_info; 4486 fs_info->delayed_block_rsv.space_info = space_info; 4487 4488 fs_info->extent_root->block_rsv = &fs_info->global_block_rsv; 4489 fs_info->csum_root->block_rsv = &fs_info->global_block_rsv; 4490 fs_info->dev_root->block_rsv = &fs_info->global_block_rsv; 4491 fs_info->tree_root->block_rsv = &fs_info->global_block_rsv; 4492 fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv; 4493 4494 update_global_block_rsv(fs_info); 4495 } 4496 4497 static void release_global_block_rsv(struct btrfs_fs_info *fs_info) 4498 { 4499 block_rsv_release_bytes(fs_info, &fs_info->global_block_rsv, NULL, 4500 (u64)-1); 4501 
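/* every other per-purpose block_rsv should already be empty at this point */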
WARN_ON(fs_info->delalloc_block_rsv.size > 0); 4502 WARN_ON(fs_info->delalloc_block_rsv.reserved > 0); 4503 WARN_ON(fs_info->trans_block_rsv.size > 0); 4504 WARN_ON(fs_info->trans_block_rsv.reserved > 0); 4505 WARN_ON(fs_info->chunk_block_rsv.size > 0); 4506 WARN_ON(fs_info->chunk_block_rsv.reserved > 0); 4507 WARN_ON(fs_info->delayed_block_rsv.size > 0); 4508 WARN_ON(fs_info->delayed_block_rsv.reserved > 0); 4509 } 4510 4511 void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, 4512 struct btrfs_root *root) 4513 { 4514 if (!trans->block_rsv) 4515 return; 4516 4517 if (!trans->bytes_reserved) 4518 return; 4519 4520 trace_btrfs_space_reservation(root->fs_info, "transaction", 4521 trans->transid, trans->bytes_reserved, 0); 4522 btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved); 4523 trans->bytes_reserved = 0; 4524 } 4525 4526 /* Can only return 0 or -ENOSPC */ 4527 int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans, 4528 struct inode *inode) 4529 { 4530 struct btrfs_root *root = BTRFS_I(inode)->root; 4531 struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root); 4532 struct btrfs_block_rsv *dst_rsv = root->orphan_block_rsv; 4533 4534 /* 4535 * We need to hold space in order to delete our orphan item once we've 4536 * added it, so this takes the reservation so we can release it later 4537 * when we are truly done with the orphan item. 4538 */ 4539 u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1); 4540 trace_btrfs_space_reservation(root->fs_info, "orphan", 4541 btrfs_ino(inode), num_bytes, 1); 4542 return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes); 4543 } 4544 4545 void btrfs_orphan_release_metadata(struct inode *inode) 4546 { 4547 struct btrfs_root *root = BTRFS_I(inode)->root; 4548 u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1); 4549 trace_btrfs_space_reservation(root->fs_info, "orphan", 4550 btrfs_ino(inode), num_bytes, 0); 4551 btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes); 4552 } 4553 4554 /* 4555 * btrfs_subvolume_reserve_metadata() - reserve space for subvolume operation 4556 * root: the root of the parent directory 4557 * rsv: block reservation 4558 * items: the number of items that we need to reserve for 4559 * qgroup_reserved: used to return the reserved size in qgroup 4560 * 4561 * This function is used to reserve the space for snapshot/subvolume 4562 * creation and deletion. Those operations are different from the 4563 * common file/directory operations: they change two fs/file trees 4564 * and the root tree, and the number of items that the qgroup reserves 4565 * differs from the free space reservation. So we cannot use 4566 * the space reservation mechanism in start_transaction().
4567 */ 4568 int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, 4569 struct btrfs_block_rsv *rsv, 4570 int items, 4571 u64 *qgroup_reserved) 4572 { 4573 u64 num_bytes; 4574 int ret; 4575 4576 if (root->fs_info->quota_enabled) { 4577 /* One for parent inode, two for dir entries */ 4578 num_bytes = 3 * root->leafsize; 4579 ret = btrfs_qgroup_reserve(root, num_bytes); 4580 if (ret) 4581 return ret; 4582 } else { 4583 num_bytes = 0; 4584 } 4585 4586 *qgroup_reserved = num_bytes; 4587 4588 num_bytes = btrfs_calc_trans_metadata_size(root, items); 4589 rsv->space_info = __find_space_info(root->fs_info, 4590 BTRFS_BLOCK_GROUP_METADATA); 4591 ret = btrfs_block_rsv_add(root, rsv, num_bytes, 4592 BTRFS_RESERVE_FLUSH_ALL); 4593 if (ret) { 4594 if (*qgroup_reserved) 4595 btrfs_qgroup_free(root, *qgroup_reserved); 4596 } 4597 4598 return ret; 4599 } 4600 4601 void btrfs_subvolume_release_metadata(struct btrfs_root *root, 4602 struct btrfs_block_rsv *rsv, 4603 u64 qgroup_reserved) 4604 { 4605 btrfs_block_rsv_release(root, rsv, (u64)-1); 4606 if (qgroup_reserved) 4607 btrfs_qgroup_free(root, qgroup_reserved); 4608 } 4609 4610 /** 4611 * drop_outstanding_extent - drop an outstanding extent 4612 * @inode: the inode we're dropping the extent for 4613 * 4614 * This is called when we are freeing up an outstanding extent, either 4615 * after an error or after an extent is written. This will return the number of 4616 * reserved extents that need to be freed. This must be called with 4617 * BTRFS_I(inode)->lock held. 4618 */ 4619 static unsigned drop_outstanding_extent(struct inode *inode) 4620 { 4621 unsigned drop_inode_space = 0; 4622 unsigned dropped_extents = 0; 4623 4624 BUG_ON(!BTRFS_I(inode)->outstanding_extents); 4625 BTRFS_I(inode)->outstanding_extents--; 4626 4627 if (BTRFS_I(inode)->outstanding_extents == 0 && 4628 test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED, 4629 &BTRFS_I(inode)->runtime_flags)) 4630 drop_inode_space = 1; 4631 4632 /* 4633 * If we have at least as many outstanding extents as we have 4634 * reserved then we need to leave the reserved extents count alone. 4635 */ 4636 if (BTRFS_I(inode)->outstanding_extents >= 4637 BTRFS_I(inode)->reserved_extents) 4638 return drop_inode_space; 4639 4640 dropped_extents = BTRFS_I(inode)->reserved_extents - 4641 BTRFS_I(inode)->outstanding_extents; 4642 BTRFS_I(inode)->reserved_extents -= dropped_extents; 4643 return dropped_extents + drop_inode_space; 4644 } 4645 4646 /** 4647 * calc_csum_metadata_size - return the amount of metadata space that must be 4648 * reserved/free'd for the given bytes. 4649 * @inode: the inode we're manipulating 4650 * @num_bytes: the number of bytes in question 4651 * @reserve: 1 if we are reserving space, 0 if we are freeing space 4652 * 4653 * This adjusts the number of csum_bytes in the inode and then returns the 4654 * correct amount of metadata that must either be reserved or freed. We 4655 * calculate how many checksums we can fit into one leaf and then divide the 4656 * number of bytes that will need to be checksummed by this value to figure out 4657 * how many checksums will be required. If we are adding bytes then the number 4658 * may go up and we will return the number of additional bytes that must be 4659 * reserved. If it is going down we will return the number of bytes that must 4660 * be freed. 4661 * 4662 * This must be called with BTRFS_I(inode)->lock held.
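 * As a rough worked example (the items-per-leaf figure here is hypothetical,
 * the real value comes from the leaf size): with a 4K sectorsize, growing
 * csum_bytes from 0 to 1M means 256 checksums; if a leaf held 100 csum items,
 * old_csums would round up to 0 leaves and num_csums to 3, so we would
 * reserve metadata space for 3 - 0 = 3 items.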
4663 */ 4664 static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes, 4665 int reserve) 4666 { 4667 struct btrfs_root *root = BTRFS_I(inode)->root; 4668 u64 csum_size; 4669 int num_csums_per_leaf; 4670 int num_csums; 4671 int old_csums; 4672 4673 if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM && 4674 BTRFS_I(inode)->csum_bytes == 0) 4675 return 0; 4676 4677 old_csums = (int)div64_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize); 4678 if (reserve) 4679 BTRFS_I(inode)->csum_bytes += num_bytes; 4680 else 4681 BTRFS_I(inode)->csum_bytes -= num_bytes; 4682 csum_size = BTRFS_LEAF_DATA_SIZE(root) - sizeof(struct btrfs_item); 4683 num_csums_per_leaf = (int)div64_u64(csum_size, 4684 sizeof(struct btrfs_csum_item) + 4685 sizeof(struct btrfs_disk_key)); 4686 num_csums = (int)div64_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize); 4687 num_csums = num_csums + num_csums_per_leaf - 1; 4688 num_csums = num_csums / num_csums_per_leaf; 4689 4690 old_csums = old_csums + num_csums_per_leaf - 1; 4691 old_csums = old_csums / num_csums_per_leaf; 4692 4693 /* No change, no need to reserve more */ 4694 if (old_csums == num_csums) 4695 return 0; 4696 4697 if (reserve) 4698 return btrfs_calc_trans_metadata_size(root, 4699 num_csums - old_csums); 4700 4701 return btrfs_calc_trans_metadata_size(root, old_csums - num_csums); 4702 } 4703 4704 int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) 4705 { 4706 struct btrfs_root *root = BTRFS_I(inode)->root; 4707 struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv; 4708 u64 to_reserve = 0; 4709 u64 csum_bytes; 4710 unsigned nr_extents = 0; 4711 int extra_reserve = 0; 4712 enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL; 4713 int ret = 0; 4714 bool delalloc_lock = true; 4715 u64 to_free = 0; 4716 unsigned dropped; 4717 4718 /* If we are a free space inode we need to not flush since we will be in 4719 * the middle of a transaction commit. We also don't need the delalloc 4720 * mutex since we won't race with anybody. We need this mostly to make 4721 * lockdep shut its filthy mouth. 4722 */ 4723 if (btrfs_is_free_space_inode(inode)) { 4724 flush = BTRFS_RESERVE_NO_FLUSH; 4725 delalloc_lock = false; 4726 } 4727 4728 if (flush != BTRFS_RESERVE_NO_FLUSH && 4729 btrfs_transaction_in_commit(root->fs_info)) 4730 schedule_timeout(1); 4731 4732 if (delalloc_lock) 4733 mutex_lock(&BTRFS_I(inode)->delalloc_mutex); 4734 4735 num_bytes = ALIGN(num_bytes, root->sectorsize); 4736 4737 spin_lock(&BTRFS_I(inode)->lock); 4738 BTRFS_I(inode)->outstanding_extents++; 4739 4740 if (BTRFS_I(inode)->outstanding_extents > 4741 BTRFS_I(inode)->reserved_extents) 4742 nr_extents = BTRFS_I(inode)->outstanding_extents - 4743 BTRFS_I(inode)->reserved_extents; 4744 4745 /* 4746 * Add an item to reserve for updating the inode when we complete the 4747 * delalloc io. 
4748 */ 4749 if (!test_bit(BTRFS_INODE_DELALLOC_META_RESERVED, 4750 &BTRFS_I(inode)->runtime_flags)) { 4751 nr_extents++; 4752 extra_reserve = 1; 4753 } 4754 4755 to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents); 4756 to_reserve += calc_csum_metadata_size(inode, num_bytes, 1); 4757 csum_bytes = BTRFS_I(inode)->csum_bytes; 4758 spin_unlock(&BTRFS_I(inode)->lock); 4759 4760 if (root->fs_info->quota_enabled) { 4761 ret = btrfs_qgroup_reserve(root, num_bytes + 4762 nr_extents * root->leafsize); 4763 if (ret) 4764 goto out_fail; 4765 } 4766 4767 ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush); 4768 if (unlikely(ret)) { 4769 if (root->fs_info->quota_enabled) 4770 btrfs_qgroup_free(root, num_bytes + 4771 nr_extents * root->leafsize); 4772 goto out_fail; 4773 } 4774 4775 spin_lock(&BTRFS_I(inode)->lock); 4776 if (extra_reserve) { 4777 set_bit(BTRFS_INODE_DELALLOC_META_RESERVED, 4778 &BTRFS_I(inode)->runtime_flags); 4779 nr_extents--; 4780 } 4781 BTRFS_I(inode)->reserved_extents += nr_extents; 4782 spin_unlock(&BTRFS_I(inode)->lock); 4783 4784 if (delalloc_lock) 4785 mutex_unlock(&BTRFS_I(inode)->delalloc_mutex); 4786 4787 if (to_reserve) 4788 trace_btrfs_space_reservation(root->fs_info,"delalloc", 4789 btrfs_ino(inode), to_reserve, 1); 4790 block_rsv_add_bytes(block_rsv, to_reserve, 1); 4791 4792 return 0; 4793 4794 out_fail: 4795 spin_lock(&BTRFS_I(inode)->lock); 4796 dropped = drop_outstanding_extent(inode); 4797 /* 4798 * If the inodes csum_bytes is the same as the original 4799 * csum_bytes then we know we haven't raced with any free()ers 4800 * so we can just reduce our inodes csum bytes and carry on. 4801 */ 4802 if (BTRFS_I(inode)->csum_bytes == csum_bytes) { 4803 calc_csum_metadata_size(inode, num_bytes, 0); 4804 } else { 4805 u64 orig_csum_bytes = BTRFS_I(inode)->csum_bytes; 4806 u64 bytes; 4807 4808 /* 4809 * This is tricky, but first we need to figure out how much we 4810 * free'd from any free-ers that occured during this 4811 * reservation, so we reset ->csum_bytes to the csum_bytes 4812 * before we dropped our lock, and then call the free for the 4813 * number of bytes that were freed while we were trying our 4814 * reservation. 4815 */ 4816 bytes = csum_bytes - BTRFS_I(inode)->csum_bytes; 4817 BTRFS_I(inode)->csum_bytes = csum_bytes; 4818 to_free = calc_csum_metadata_size(inode, bytes, 0); 4819 4820 4821 /* 4822 * Now we need to see how much we would have freed had we not 4823 * been making this reservation and our ->csum_bytes were not 4824 * artificially inflated. 4825 */ 4826 BTRFS_I(inode)->csum_bytes = csum_bytes - num_bytes; 4827 bytes = csum_bytes - orig_csum_bytes; 4828 bytes = calc_csum_metadata_size(inode, bytes, 0); 4829 4830 /* 4831 * Now reset ->csum_bytes to what it should be. If bytes is 4832 * more than to_free then we would have free'd more space had we 4833 * not had an artificially high ->csum_bytes, so we need to free 4834 * the remainder. If bytes is the same or less then we don't 4835 * need to do anything, the other free-ers did the correct 4836 * thing. 
4837 */ 4838 BTRFS_I(inode)->csum_bytes = orig_csum_bytes - num_bytes; 4839 if (bytes > to_free) 4840 to_free = bytes - to_free; 4841 else 4842 to_free = 0; 4843 } 4844 spin_unlock(&BTRFS_I(inode)->lock); 4845 if (dropped) 4846 to_free += btrfs_calc_trans_metadata_size(root, dropped); 4847 4848 if (to_free) { 4849 btrfs_block_rsv_release(root, block_rsv, to_free); 4850 trace_btrfs_space_reservation(root->fs_info, "delalloc", 4851 btrfs_ino(inode), to_free, 0); 4852 } 4853 if (delalloc_lock) 4854 mutex_unlock(&BTRFS_I(inode)->delalloc_mutex); 4855 return ret; 4856 } 4857 4858 /** 4859 * btrfs_delalloc_release_metadata - release a metadata reservation for an inode 4860 * @inode: the inode to release the reservation for 4861 * @num_bytes: the number of bytes we're releasing 4862 * 4863 * This will release the metadata reservation for an inode. This can be called 4864 * once we complete IO for a given set of bytes to release their metadata 4865 * reservations. 4866 */ 4867 void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes) 4868 { 4869 struct btrfs_root *root = BTRFS_I(inode)->root; 4870 u64 to_free = 0; 4871 unsigned dropped; 4872 4873 num_bytes = ALIGN(num_bytes, root->sectorsize); 4874 spin_lock(&BTRFS_I(inode)->lock); 4875 dropped = drop_outstanding_extent(inode); 4876 4877 if (num_bytes) 4878 to_free = calc_csum_metadata_size(inode, num_bytes, 0); 4879 spin_unlock(&BTRFS_I(inode)->lock); 4880 if (dropped > 0) 4881 to_free += btrfs_calc_trans_metadata_size(root, dropped); 4882 4883 trace_btrfs_space_reservation(root->fs_info, "delalloc", 4884 btrfs_ino(inode), to_free, 0); 4885 if (root->fs_info->quota_enabled) { 4886 btrfs_qgroup_free(root, num_bytes + 4887 dropped * root->leafsize); 4888 } 4889 4890 btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv, 4891 to_free); 4892 } 4893 4894 /** 4895 * btrfs_delalloc_reserve_space - reserve data and metadata space for delalloc 4896 * @inode: inode we're writing to 4897 * @num_bytes: the number of bytes we want to allocate 4898 * 4899 * This will do the following things 4900 * 4901 * o reserve space in the data space info for num_bytes 4902 * o reserve space in the metadata space info based on number of outstanding 4903 * extents and how much csums will be needed 4904 * o add to the inodes ->delalloc_bytes 4905 * o add it to the fs_info's delalloc inodes list. 4906 * 4907 * This will return 0 for success and -ENOSPC if there is no space left. 4908 */ 4909 int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes) 4910 { 4911 int ret; 4912 4913 ret = btrfs_check_data_free_space(inode, num_bytes); 4914 if (ret) 4915 return ret; 4916 4917 ret = btrfs_delalloc_reserve_metadata(inode, num_bytes); 4918 if (ret) { 4919 btrfs_free_reserved_data_space(inode, num_bytes); 4920 return ret; 4921 } 4922 4923 return 0; 4924 } 4925 4926 /** 4927 * btrfs_delalloc_release_space - release data and metadata space for delalloc 4928 * @inode: inode we're releasing space for 4929 * @num_bytes: the number of bytes we want to free up 4930 * 4931 * This must be matched with a call to btrfs_delalloc_reserve_space. This is 4932 * called in the case that we don't need the metadata AND data reservations 4933 * anymore. So if there is an error or we insert an inline extent. 4934 * 4935 * This function will release the metadata space that was not used and will 4936 * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes 4937 * list if there are no delalloc bytes left. 
4938 */ 4939 void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes) 4940 { 4941 btrfs_delalloc_release_metadata(inode, num_bytes); 4942 btrfs_free_reserved_data_space(inode, num_bytes); 4943 } 4944 4945 static int update_block_group(struct btrfs_root *root, 4946 u64 bytenr, u64 num_bytes, int alloc) 4947 { 4948 struct btrfs_block_group_cache *cache = NULL; 4949 struct btrfs_fs_info *info = root->fs_info; 4950 u64 total = num_bytes; 4951 u64 old_val; 4952 u64 byte_in_group; 4953 int factor; 4954 4955 /* block accounting for super block */ 4956 spin_lock(&info->delalloc_lock); 4957 old_val = btrfs_super_bytes_used(info->super_copy); 4958 if (alloc) 4959 old_val += num_bytes; 4960 else 4961 old_val -= num_bytes; 4962 btrfs_set_super_bytes_used(info->super_copy, old_val); 4963 spin_unlock(&info->delalloc_lock); 4964 4965 while (total) { 4966 cache = btrfs_lookup_block_group(info, bytenr); 4967 if (!cache) 4968 return -ENOENT; 4969 if (cache->flags & (BTRFS_BLOCK_GROUP_DUP | 4970 BTRFS_BLOCK_GROUP_RAID1 | 4971 BTRFS_BLOCK_GROUP_RAID10)) 4972 factor = 2; 4973 else 4974 factor = 1; 4975 /* 4976 * If this block group has free space cache written out, we 4977 * need to make sure to load it if we are removing space. This 4978 * is because we need the unpinning stage to actually add the 4979 * space back to the block group, otherwise we will leak space. 4980 */ 4981 if (!alloc && cache->cached == BTRFS_CACHE_NO) 4982 cache_block_group(cache, 1); 4983 4984 byte_in_group = bytenr - cache->key.objectid; 4985 WARN_ON(byte_in_group > cache->key.offset); 4986 4987 spin_lock(&cache->space_info->lock); 4988 spin_lock(&cache->lock); 4989 4990 if (btrfs_test_opt(root, SPACE_CACHE) && 4991 cache->disk_cache_state < BTRFS_DC_CLEAR) 4992 cache->disk_cache_state = BTRFS_DC_CLEAR; 4993 4994 cache->dirty = 1; 4995 old_val = btrfs_block_group_used(&cache->item); 4996 num_bytes = min(total, cache->key.offset - byte_in_group); 4997 if (alloc) { 4998 old_val += num_bytes; 4999 btrfs_set_block_group_used(&cache->item, old_val); 5000 cache->reserved -= num_bytes; 5001 cache->space_info->bytes_reserved -= num_bytes; 5002 cache->space_info->bytes_used += num_bytes; 5003 cache->space_info->disk_used += num_bytes * factor; 5004 spin_unlock(&cache->lock); 5005 spin_unlock(&cache->space_info->lock); 5006 } else { 5007 old_val -= num_bytes; 5008 btrfs_set_block_group_used(&cache->item, old_val); 5009 cache->pinned += num_bytes; 5010 cache->space_info->bytes_pinned += num_bytes; 5011 cache->space_info->bytes_used -= num_bytes; 5012 cache->space_info->disk_used -= num_bytes * factor; 5013 spin_unlock(&cache->lock); 5014 spin_unlock(&cache->space_info->lock); 5015 5016 set_extent_dirty(info->pinned_extents, 5017 bytenr, bytenr + num_bytes - 1, 5018 GFP_NOFS | __GFP_NOFAIL); 5019 } 5020 btrfs_put_block_group(cache); 5021 total -= num_bytes; 5022 bytenr += num_bytes; 5023 } 5024 return 0; 5025 } 5026 5027 static u64 first_logical_byte(struct btrfs_root *root, u64 search_start) 5028 { 5029 struct btrfs_block_group_cache *cache; 5030 u64 bytenr; 5031 5032 spin_lock(&root->fs_info->block_group_cache_lock); 5033 bytenr = root->fs_info->first_logical_byte; 5034 spin_unlock(&root->fs_info->block_group_cache_lock); 5035 5036 if (bytenr < (u64)-1) 5037 return bytenr; 5038 5039 cache = btrfs_lookup_first_block_group(root->fs_info, search_start); 5040 if (!cache) 5041 return 0; 5042 5043 bytenr = cache->key.objectid; 5044 btrfs_put_block_group(cache); 5045 5046 return bytenr; 5047 } 5048 5049 static int pin_down_extent(struct 
btrfs_root *root, 5050 struct btrfs_block_group_cache *cache, 5051 u64 bytenr, u64 num_bytes, int reserved) 5052 { 5053 spin_lock(&cache->space_info->lock); 5054 spin_lock(&cache->lock); 5055 cache->pinned += num_bytes; 5056 cache->space_info->bytes_pinned += num_bytes; 5057 if (reserved) { 5058 cache->reserved -= num_bytes; 5059 cache->space_info->bytes_reserved -= num_bytes; 5060 } 5061 spin_unlock(&cache->lock); 5062 spin_unlock(&cache->space_info->lock); 5063 5064 set_extent_dirty(root->fs_info->pinned_extents, bytenr, 5065 bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL); 5066 return 0; 5067 } 5068 5069 /* 5070 * this function must be called within transaction 5071 */ 5072 int btrfs_pin_extent(struct btrfs_root *root, 5073 u64 bytenr, u64 num_bytes, int reserved) 5074 { 5075 struct btrfs_block_group_cache *cache; 5076 5077 cache = btrfs_lookup_block_group(root->fs_info, bytenr); 5078 BUG_ON(!cache); /* Logic error */ 5079 5080 pin_down_extent(root, cache, bytenr, num_bytes, reserved); 5081 5082 btrfs_put_block_group(cache); 5083 return 0; 5084 } 5085 5086 /* 5087 * this function must be called within transaction 5088 */ 5089 int btrfs_pin_extent_for_log_replay(struct btrfs_root *root, 5090 u64 bytenr, u64 num_bytes) 5091 { 5092 struct btrfs_block_group_cache *cache; 5093 5094 cache = btrfs_lookup_block_group(root->fs_info, bytenr); 5095 BUG_ON(!cache); /* Logic error */ 5096 5097 /* 5098 * pull in the free space cache (if any) so that our pin 5099 * removes the free space from the cache. We have load_only set 5100 * to one because the slow code to read in the free extents does check 5101 * the pinned extents. 5102 */ 5103 cache_block_group(cache, 1); 5104 5105 pin_down_extent(root, cache, bytenr, num_bytes, 0); 5106 5107 /* remove us from the free space cache (if we're there at all) */ 5108 btrfs_remove_free_space(cache, bytenr, num_bytes); 5109 btrfs_put_block_group(cache); 5110 return 0; 5111 } 5112 5113 /** 5114 * btrfs_update_reserved_bytes - update the block_group and space info counters 5115 * @cache: The cache we are manipulating 5116 * @num_bytes: The number of bytes in question 5117 * @reserve: One of the reservation enums 5118 * 5119 * This is called by the allocator when it reserves space, or by somebody who is 5120 * freeing space that was never actually used on disk. For example if you 5121 * reserve some space for a new leaf in transaction A and before transaction A 5122 * commits you free that leaf, you call this with reserve set to 0 in order to 5123 * clear the reservation. 5124 * 5125 * Metadata reservations should be called with RESERVE_ALLOC so we do the proper 5126 * ENOSPC accounting. For data we handle the reservation through clearing the 5127 * delalloc bits in the io_tree. We have to do this since we could end up 5128 * allocating less disk space for the amount of data we have reserved in the 5129 * case of compression. 5130 * 5131 * If this is a reservation and the block group has become read only we cannot 5132 * make the reservation and return -EAGAIN, otherwise this function always 5133 * succeeds. 
5134 */ 5135 static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, 5136 u64 num_bytes, int reserve) 5137 { 5138 struct btrfs_space_info *space_info = cache->space_info; 5139 int ret = 0; 5140 5141 spin_lock(&space_info->lock); 5142 spin_lock(&cache->lock); 5143 if (reserve != RESERVE_FREE) { 5144 if (cache->ro) { 5145 ret = -EAGAIN; 5146 } else { 5147 cache->reserved += num_bytes; 5148 space_info->bytes_reserved += num_bytes; 5149 if (reserve == RESERVE_ALLOC) { 5150 trace_btrfs_space_reservation(cache->fs_info, 5151 "space_info", space_info->flags, 5152 num_bytes, 0); 5153 space_info->bytes_may_use -= num_bytes; 5154 } 5155 } 5156 } else { 5157 if (cache->ro) 5158 space_info->bytes_readonly += num_bytes; 5159 cache->reserved -= num_bytes; 5160 space_info->bytes_reserved -= num_bytes; 5161 space_info->reservation_progress++; 5162 } 5163 spin_unlock(&cache->lock); 5164 spin_unlock(&space_info->lock); 5165 return ret; 5166 } 5167 5168 void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, 5169 struct btrfs_root *root) 5170 { 5171 struct btrfs_fs_info *fs_info = root->fs_info; 5172 struct btrfs_caching_control *next; 5173 struct btrfs_caching_control *caching_ctl; 5174 struct btrfs_block_group_cache *cache; 5175 5176 down_write(&fs_info->extent_commit_sem); 5177 5178 list_for_each_entry_safe(caching_ctl, next, 5179 &fs_info->caching_block_groups, list) { 5180 cache = caching_ctl->block_group; 5181 if (block_group_cache_done(cache)) { 5182 cache->last_byte_to_unpin = (u64)-1; 5183 list_del_init(&caching_ctl->list); 5184 put_caching_control(caching_ctl); 5185 } else { 5186 cache->last_byte_to_unpin = caching_ctl->progress; 5187 } 5188 } 5189 5190 if (fs_info->pinned_extents == &fs_info->freed_extents[0]) 5191 fs_info->pinned_extents = &fs_info->freed_extents[1]; 5192 else 5193 fs_info->pinned_extents = &fs_info->freed_extents[0]; 5194 5195 up_write(&fs_info->extent_commit_sem); 5196 5197 update_global_block_rsv(fs_info); 5198 } 5199 5200 static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end) 5201 { 5202 struct btrfs_fs_info *fs_info = root->fs_info; 5203 struct btrfs_block_group_cache *cache = NULL; 5204 struct btrfs_space_info *space_info; 5205 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; 5206 u64 len; 5207 bool readonly; 5208 5209 while (start <= end) { 5210 readonly = false; 5211 if (!cache || 5212 start >= cache->key.objectid + cache->key.offset) { 5213 if (cache) 5214 btrfs_put_block_group(cache); 5215 cache = btrfs_lookup_block_group(fs_info, start); 5216 BUG_ON(!cache); /* Logic error */ 5217 } 5218 5219 len = cache->key.objectid + cache->key.offset - start; 5220 len = min(len, end + 1 - start); 5221 5222 if (start < cache->last_byte_to_unpin) { 5223 len = min(len, cache->last_byte_to_unpin - start); 5224 btrfs_add_free_space(cache, start, len); 5225 } 5226 5227 start += len; 5228 space_info = cache->space_info; 5229 5230 spin_lock(&space_info->lock); 5231 spin_lock(&cache->lock); 5232 cache->pinned -= len; 5233 space_info->bytes_pinned -= len; 5234 if (cache->ro) { 5235 space_info->bytes_readonly += len; 5236 readonly = true; 5237 } 5238 spin_unlock(&cache->lock); 5239 if (!readonly && global_rsv->space_info == space_info) { 5240 spin_lock(&global_rsv->lock); 5241 if (!global_rsv->full) { 5242 len = min(len, global_rsv->size - 5243 global_rsv->reserved); 5244 global_rsv->reserved += len; 5245 space_info->bytes_may_use += len; 5246 if (global_rsv->reserved >= global_rsv->size) 5247 global_rsv->full = 1; 5248 } 
5249 spin_unlock(&global_rsv->lock); 5250 } 5251 spin_unlock(&space_info->lock); 5252 } 5253 5254 if (cache) 5255 btrfs_put_block_group(cache); 5256 return 0; 5257 } 5258 5259 int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, 5260 struct btrfs_root *root) 5261 { 5262 struct btrfs_fs_info *fs_info = root->fs_info; 5263 struct extent_io_tree *unpin; 5264 u64 start; 5265 u64 end; 5266 int ret; 5267 5268 if (trans->aborted) 5269 return 0; 5270 5271 if (fs_info->pinned_extents == &fs_info->freed_extents[0]) 5272 unpin = &fs_info->freed_extents[1]; 5273 else 5274 unpin = &fs_info->freed_extents[0]; 5275 5276 while (1) { 5277 ret = find_first_extent_bit(unpin, 0, &start, &end, 5278 EXTENT_DIRTY, NULL); 5279 if (ret) 5280 break; 5281 5282 if (btrfs_test_opt(root, DISCARD)) 5283 ret = btrfs_discard_extent(root, start, 5284 end + 1 - start, NULL); 5285 5286 clear_extent_dirty(unpin, start, end, GFP_NOFS); 5287 unpin_extent_range(root, start, end); 5288 cond_resched(); 5289 } 5290 5291 return 0; 5292 } 5293 5294 static int __btrfs_free_extent(struct btrfs_trans_handle *trans, 5295 struct btrfs_root *root, 5296 u64 bytenr, u64 num_bytes, u64 parent, 5297 u64 root_objectid, u64 owner_objectid, 5298 u64 owner_offset, int refs_to_drop, 5299 struct btrfs_delayed_extent_op *extent_op) 5300 { 5301 struct btrfs_key key; 5302 struct btrfs_path *path; 5303 struct btrfs_fs_info *info = root->fs_info; 5304 struct btrfs_root *extent_root = info->extent_root; 5305 struct extent_buffer *leaf; 5306 struct btrfs_extent_item *ei; 5307 struct btrfs_extent_inline_ref *iref; 5308 int ret; 5309 int is_data; 5310 int extent_slot = 0; 5311 int found_extent = 0; 5312 int num_to_del = 1; 5313 u32 item_size; 5314 u64 refs; 5315 5316 path = btrfs_alloc_path(); 5317 if (!path) 5318 return -ENOMEM; 5319 5320 path->reada = 1; 5321 path->leave_spinning = 1; 5322 5323 is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID; 5324 BUG_ON(!is_data && refs_to_drop != 1); 5325 5326 ret = lookup_extent_backref(trans, extent_root, path, &iref, 5327 bytenr, num_bytes, parent, 5328 root_objectid, owner_objectid, 5329 owner_offset); 5330 if (ret == 0) { 5331 extent_slot = path->slots[0]; 5332 while (extent_slot >= 0) { 5333 btrfs_item_key_to_cpu(path->nodes[0], &key, 5334 extent_slot); 5335 if (key.objectid != bytenr) 5336 break; 5337 if (key.type == BTRFS_EXTENT_ITEM_KEY && 5338 key.offset == num_bytes) { 5339 found_extent = 1; 5340 break; 5341 } 5342 if (path->slots[0] - extent_slot > 5) 5343 break; 5344 extent_slot--; 5345 } 5346 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 5347 item_size = btrfs_item_size_nr(path->nodes[0], extent_slot); 5348 if (found_extent && item_size < sizeof(*ei)) 5349 found_extent = 0; 5350 #endif 5351 if (!found_extent) { 5352 BUG_ON(iref); 5353 ret = remove_extent_backref(trans, extent_root, path, 5354 NULL, refs_to_drop, 5355 is_data); 5356 if (ret) { 5357 btrfs_abort_transaction(trans, extent_root, ret); 5358 goto out; 5359 } 5360 btrfs_release_path(path); 5361 path->leave_spinning = 1; 5362 5363 key.objectid = bytenr; 5364 key.type = BTRFS_EXTENT_ITEM_KEY; 5365 key.offset = num_bytes; 5366 5367 ret = btrfs_search_slot(trans, extent_root, 5368 &key, path, -1, 1); 5369 if (ret) { 5370 printk(KERN_ERR "umm, got %d back from search" 5371 ", was looking for %llu\n", ret, 5372 (unsigned long long)bytenr); 5373 if (ret > 0) 5374 btrfs_print_leaf(extent_root, 5375 path->nodes[0]); 5376 } 5377 if (ret < 0) { 5378 btrfs_abort_transaction(trans, extent_root, ret); 5379 goto out; 5380 } 5381 extent_slot = 
path->slots[0]; 5382 } 5383 } else if (ret == -ENOENT) { 5384 btrfs_print_leaf(extent_root, path->nodes[0]); 5385 WARN_ON(1); 5386 printk(KERN_ERR "btrfs unable to find ref byte nr %llu " 5387 "parent %llu root %llu owner %llu offset %llu\n", 5388 (unsigned long long)bytenr, 5389 (unsigned long long)parent, 5390 (unsigned long long)root_objectid, 5391 (unsigned long long)owner_objectid, 5392 (unsigned long long)owner_offset); 5393 } else { 5394 btrfs_abort_transaction(trans, extent_root, ret); 5395 goto out; 5396 } 5397 5398 leaf = path->nodes[0]; 5399 item_size = btrfs_item_size_nr(leaf, extent_slot); 5400 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 5401 if (item_size < sizeof(*ei)) { 5402 BUG_ON(found_extent || extent_slot != path->slots[0]); 5403 ret = convert_extent_item_v0(trans, extent_root, path, 5404 owner_objectid, 0); 5405 if (ret < 0) { 5406 btrfs_abort_transaction(trans, extent_root, ret); 5407 goto out; 5408 } 5409 5410 btrfs_release_path(path); 5411 path->leave_spinning = 1; 5412 5413 key.objectid = bytenr; 5414 key.type = BTRFS_EXTENT_ITEM_KEY; 5415 key.offset = num_bytes; 5416 5417 ret = btrfs_search_slot(trans, extent_root, &key, path, 5418 -1, 1); 5419 if (ret) { 5420 printk(KERN_ERR "umm, got %d back from search" 5421 ", was looking for %llu\n", ret, 5422 (unsigned long long)bytenr); 5423 btrfs_print_leaf(extent_root, path->nodes[0]); 5424 } 5425 if (ret < 0) { 5426 btrfs_abort_transaction(trans, extent_root, ret); 5427 goto out; 5428 } 5429 5430 extent_slot = path->slots[0]; 5431 leaf = path->nodes[0]; 5432 item_size = btrfs_item_size_nr(leaf, extent_slot); 5433 } 5434 #endif 5435 BUG_ON(item_size < sizeof(*ei)); 5436 ei = btrfs_item_ptr(leaf, extent_slot, 5437 struct btrfs_extent_item); 5438 if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID) { 5439 struct btrfs_tree_block_info *bi; 5440 BUG_ON(item_size < sizeof(*ei) + sizeof(*bi)); 5441 bi = (struct btrfs_tree_block_info *)(ei + 1); 5442 WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi)); 5443 } 5444 5445 refs = btrfs_extent_refs(leaf, ei); 5446 BUG_ON(refs < refs_to_drop); 5447 refs -= refs_to_drop; 5448 5449 if (refs > 0) { 5450 if (extent_op) 5451 __run_delayed_extent_op(extent_op, leaf, ei); 5452 /* 5453 * In the case of inline back ref, reference count will 5454 * be updated by remove_extent_backref 5455 */ 5456 if (iref) { 5457 BUG_ON(!found_extent); 5458 } else { 5459 btrfs_set_extent_refs(leaf, ei, refs); 5460 btrfs_mark_buffer_dirty(leaf); 5461 } 5462 if (found_extent) { 5463 ret = remove_extent_backref(trans, extent_root, path, 5464 iref, refs_to_drop, 5465 is_data); 5466 if (ret) { 5467 btrfs_abort_transaction(trans, extent_root, ret); 5468 goto out; 5469 } 5470 } 5471 } else { 5472 if (found_extent) { 5473 BUG_ON(is_data && refs_to_drop != 5474 extent_data_ref_count(root, path, iref)); 5475 if (iref) { 5476 BUG_ON(path->slots[0] != extent_slot); 5477 } else { 5478 BUG_ON(path->slots[0] != extent_slot + 1); 5479 path->slots[0] = extent_slot; 5480 num_to_del = 2; 5481 } 5482 } 5483 5484 ret = btrfs_del_items(trans, extent_root, path, path->slots[0], 5485 num_to_del); 5486 if (ret) { 5487 btrfs_abort_transaction(trans, extent_root, ret); 5488 goto out; 5489 } 5490 btrfs_release_path(path); 5491 5492 if (is_data) { 5493 ret = btrfs_del_csums(trans, root, bytenr, num_bytes); 5494 if (ret) { 5495 btrfs_abort_transaction(trans, extent_root, ret); 5496 goto out; 5497 } 5498 } 5499 5500 ret = update_block_group(root, bytenr, num_bytes, 0); 5501 if (ret) { 5502 btrfs_abort_transaction(trans, extent_root, ret); 5503 
goto out; 5504 } 5505 } 5506 out: 5507 btrfs_free_path(path); 5508 return ret; 5509 } 5510 5511 /* 5512 * when we free an block, it is possible (and likely) that we free the last 5513 * delayed ref for that extent as well. This searches the delayed ref tree for 5514 * a given extent, and if there are no other delayed refs to be processed, it 5515 * removes it from the tree. 5516 */ 5517 static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, 5518 struct btrfs_root *root, u64 bytenr) 5519 { 5520 struct btrfs_delayed_ref_head *head; 5521 struct btrfs_delayed_ref_root *delayed_refs; 5522 struct btrfs_delayed_ref_node *ref; 5523 struct rb_node *node; 5524 int ret = 0; 5525 5526 delayed_refs = &trans->transaction->delayed_refs; 5527 spin_lock(&delayed_refs->lock); 5528 head = btrfs_find_delayed_ref_head(trans, bytenr); 5529 if (!head) 5530 goto out; 5531 5532 node = rb_prev(&head->node.rb_node); 5533 if (!node) 5534 goto out; 5535 5536 ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); 5537 5538 /* there are still entries for this ref, we can't drop it */ 5539 if (ref->bytenr == bytenr) 5540 goto out; 5541 5542 if (head->extent_op) { 5543 if (!head->must_insert_reserved) 5544 goto out; 5545 btrfs_free_delayed_extent_op(head->extent_op); 5546 head->extent_op = NULL; 5547 } 5548 5549 /* 5550 * waiting for the lock here would deadlock. If someone else has it 5551 * locked they are already in the process of dropping it anyway 5552 */ 5553 if (!mutex_trylock(&head->mutex)) 5554 goto out; 5555 5556 /* 5557 * at this point we have a head with no other entries. Go 5558 * ahead and process it. 5559 */ 5560 head->node.in_tree = 0; 5561 rb_erase(&head->node.rb_node, &delayed_refs->root); 5562 5563 delayed_refs->num_entries--; 5564 5565 /* 5566 * we don't take a ref on the node because we're removing it from the 5567 * tree, so we just steal the ref the tree was holding. 
5568 */ 5569 delayed_refs->num_heads--; 5570 if (list_empty(&head->cluster)) 5571 delayed_refs->num_heads_ready--; 5572 5573 list_del_init(&head->cluster); 5574 spin_unlock(&delayed_refs->lock); 5575 5576 BUG_ON(head->extent_op); 5577 if (head->must_insert_reserved) 5578 ret = 1; 5579 5580 mutex_unlock(&head->mutex); 5581 btrfs_put_delayed_ref(&head->node); 5582 return ret; 5583 out: 5584 spin_unlock(&delayed_refs->lock); 5585 return 0; 5586 } 5587 5588 void btrfs_free_tree_block(struct btrfs_trans_handle *trans, 5589 struct btrfs_root *root, 5590 struct extent_buffer *buf, 5591 u64 parent, int last_ref) 5592 { 5593 struct btrfs_block_group_cache *cache = NULL; 5594 int ret; 5595 5596 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { 5597 ret = btrfs_add_delayed_tree_ref(root->fs_info, trans, 5598 buf->start, buf->len, 5599 parent, root->root_key.objectid, 5600 btrfs_header_level(buf), 5601 BTRFS_DROP_DELAYED_REF, NULL, 0); 5602 BUG_ON(ret); /* -ENOMEM */ 5603 } 5604 5605 if (!last_ref) 5606 return; 5607 5608 cache = btrfs_lookup_block_group(root->fs_info, buf->start); 5609 5610 if (btrfs_header_generation(buf) == trans->transid) { 5611 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { 5612 ret = check_ref_cleanup(trans, root, buf->start); 5613 if (!ret) 5614 goto out; 5615 } 5616 5617 if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { 5618 pin_down_extent(root, cache, buf->start, buf->len, 1); 5619 goto out; 5620 } 5621 5622 WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)); 5623 5624 btrfs_add_free_space(cache, buf->start, buf->len); 5625 btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE); 5626 } 5627 out: 5628 /* 5629 * Deleting the buffer, clear the corrupt flag since it doesn't matter 5630 * anymore. 5631 */ 5632 clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags); 5633 btrfs_put_block_group(cache); 5634 } 5635 5636 /* Can return -ENOMEM */ 5637 int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, 5638 u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, 5639 u64 owner, u64 offset, int for_cow) 5640 { 5641 int ret; 5642 struct btrfs_fs_info *fs_info = root->fs_info; 5643 5644 /* 5645 * tree log blocks never actually go into the extent allocation 5646 * tree, just update pinning info and exit early. 5647 */ 5648 if (root_objectid == BTRFS_TREE_LOG_OBJECTID) { 5649 WARN_ON(owner >= BTRFS_FIRST_FREE_OBJECTID); 5650 /* unlocks the pinned mutex */ 5651 btrfs_pin_extent(root, bytenr, num_bytes, 1); 5652 ret = 0; 5653 } else if (owner < BTRFS_FIRST_FREE_OBJECTID) { 5654 ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr, 5655 num_bytes, 5656 parent, root_objectid, (int)owner, 5657 BTRFS_DROP_DELAYED_REF, NULL, for_cow); 5658 } else { 5659 ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr, 5660 num_bytes, 5661 parent, root_objectid, owner, 5662 offset, BTRFS_DROP_DELAYED_REF, 5663 NULL, for_cow); 5664 } 5665 return ret; 5666 } 5667 5668 static u64 stripe_align(struct btrfs_root *root, 5669 struct btrfs_block_group_cache *cache, 5670 u64 val, u64 num_bytes) 5671 { 5672 u64 ret = ALIGN(val, root->stripesize); 5673 return ret; 5674 } 5675 5676 /* 5677 * when we wait for progress in the block group caching, its because 5678 * our allocation attempt failed at least once. So, we must sleep 5679 * and let some progress happen before we try again. 
5680 * 5681 * This function will sleep at least once waiting for new free space to 5682 * show up, and then it will check the block group free space numbers 5683 * for our min num_bytes. Another option is to have it go ahead 5684 * and look in the rbtree for a free extent of a given size, but this 5685 * is a good start. 5686 */ 5687 static noinline int 5688 wait_block_group_cache_progress(struct btrfs_block_group_cache *cache, 5689 u64 num_bytes) 5690 { 5691 struct btrfs_caching_control *caching_ctl; 5692 5693 caching_ctl = get_caching_control(cache); 5694 if (!caching_ctl) 5695 return 0; 5696 5697 wait_event(caching_ctl->wait, block_group_cache_done(cache) || 5698 (cache->free_space_ctl->free_space >= num_bytes)); 5699 5700 put_caching_control(caching_ctl); 5701 return 0; 5702 } 5703 5704 static noinline int 5705 wait_block_group_cache_done(struct btrfs_block_group_cache *cache) 5706 { 5707 struct btrfs_caching_control *caching_ctl; 5708 5709 caching_ctl = get_caching_control(cache); 5710 if (!caching_ctl) 5711 return 0; 5712 5713 wait_event(caching_ctl->wait, block_group_cache_done(cache)); 5714 5715 put_caching_control(caching_ctl); 5716 return 0; 5717 } 5718 5719 int __get_raid_index(u64 flags) 5720 { 5721 if (flags & BTRFS_BLOCK_GROUP_RAID10) 5722 return BTRFS_RAID_RAID10; 5723 else if (flags & BTRFS_BLOCK_GROUP_RAID1) 5724 return BTRFS_RAID_RAID1; 5725 else if (flags & BTRFS_BLOCK_GROUP_DUP) 5726 return BTRFS_RAID_DUP; 5727 else if (flags & BTRFS_BLOCK_GROUP_RAID0) 5728 return BTRFS_RAID_RAID0; 5729 else if (flags & BTRFS_BLOCK_GROUP_RAID5) 5730 return BTRFS_RAID_RAID5; 5731 else if (flags & BTRFS_BLOCK_GROUP_RAID6) 5732 return BTRFS_RAID_RAID6; 5733 5734 return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */ 5735 } 5736 5737 static int get_block_group_index(struct btrfs_block_group_cache *cache) 5738 { 5739 return __get_raid_index(cache->flags); 5740 } 5741 5742 enum btrfs_loop_type { 5743 LOOP_CACHING_NOWAIT = 0, 5744 LOOP_CACHING_WAIT = 1, 5745 LOOP_ALLOC_CHUNK = 2, 5746 LOOP_NO_EMPTY_SIZE = 3, 5747 }; 5748 5749 /* 5750 * walks the btree of allocated extents and find a hole of a given size. 5751 * The key ins is changed to record the hole: 5752 * ins->objectid == block start 5753 * ins->flags = BTRFS_EXTENT_ITEM_KEY 5754 * ins->offset == number of blocks 5755 * Any available blocks before search_start are skipped. 5756 */ 5757 static noinline int find_free_extent(struct btrfs_trans_handle *trans, 5758 struct btrfs_root *orig_root, 5759 u64 num_bytes, u64 empty_size, 5760 u64 hint_byte, struct btrfs_key *ins, 5761 u64 data) 5762 { 5763 int ret = 0; 5764 struct btrfs_root *root = orig_root->fs_info->extent_root; 5765 struct btrfs_free_cluster *last_ptr = NULL; 5766 struct btrfs_block_group_cache *block_group = NULL; 5767 struct btrfs_block_group_cache *used_block_group; 5768 u64 search_start = 0; 5769 int empty_cluster = 2 * 1024 * 1024; 5770 struct btrfs_space_info *space_info; 5771 int loop = 0; 5772 int index = __get_raid_index(data); 5773 int alloc_type = (data & BTRFS_BLOCK_GROUP_DATA) ? 
5774 RESERVE_ALLOC_NO_ACCOUNT : RESERVE_ALLOC; 5775 bool found_uncached_bg = false; 5776 bool failed_cluster_refill = false; 5777 bool failed_alloc = false; 5778 bool use_cluster = true; 5779 bool have_caching_bg = false; 5780 5781 WARN_ON(num_bytes < root->sectorsize); 5782 btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY); 5783 ins->objectid = 0; 5784 ins->offset = 0; 5785 5786 trace_find_free_extent(orig_root, num_bytes, empty_size, data); 5787 5788 space_info = __find_space_info(root->fs_info, data); 5789 if (!space_info) { 5790 printk(KERN_ERR "No space info for %llu\n", data); 5791 return -ENOSPC; 5792 } 5793 5794 /* 5795 * If the space info is for both data and metadata it means we have a 5796 * small filesystem and we can't use the clustering stuff. 5797 */ 5798 if (btrfs_mixed_space_info(space_info)) 5799 use_cluster = false; 5800 5801 if (data & BTRFS_BLOCK_GROUP_METADATA && use_cluster) { 5802 last_ptr = &root->fs_info->meta_alloc_cluster; 5803 if (!btrfs_test_opt(root, SSD)) 5804 empty_cluster = 64 * 1024; 5805 } 5806 5807 if ((data & BTRFS_BLOCK_GROUP_DATA) && use_cluster && 5808 btrfs_test_opt(root, SSD)) { 5809 last_ptr = &root->fs_info->data_alloc_cluster; 5810 } 5811 5812 if (last_ptr) { 5813 spin_lock(&last_ptr->lock); 5814 if (last_ptr->block_group) 5815 hint_byte = last_ptr->window_start; 5816 spin_unlock(&last_ptr->lock); 5817 } 5818 5819 search_start = max(search_start, first_logical_byte(root, 0)); 5820 search_start = max(search_start, hint_byte); 5821 5822 if (!last_ptr) 5823 empty_cluster = 0; 5824 5825 if (search_start == hint_byte) { 5826 block_group = btrfs_lookup_block_group(root->fs_info, 5827 search_start); 5828 used_block_group = block_group; 5829 /* 5830 * we don't want to use the block group if it doesn't match our 5831 * allocation bits, or if its not cached. 5832 * 5833 * However if we are re-searching with an ideal block group 5834 * picked out then we don't care that the block group is cached. 5835 */ 5836 if (block_group && block_group_bits(block_group, data) && 5837 block_group->cached != BTRFS_CACHE_NO) { 5838 down_read(&space_info->groups_sem); 5839 if (list_empty(&block_group->list) || 5840 block_group->ro) { 5841 /* 5842 * someone is removing this block group, 5843 * we can't jump into the have_block_group 5844 * target because our list pointers are not 5845 * valid 5846 */ 5847 btrfs_put_block_group(block_group); 5848 up_read(&space_info->groups_sem); 5849 } else { 5850 index = get_block_group_index(block_group); 5851 goto have_block_group; 5852 } 5853 } else if (block_group) { 5854 btrfs_put_block_group(block_group); 5855 } 5856 } 5857 search: 5858 have_caching_bg = false; 5859 down_read(&space_info->groups_sem); 5860 list_for_each_entry(block_group, &space_info->block_groups[index], 5861 list) { 5862 u64 offset; 5863 int cached; 5864 5865 used_block_group = block_group; 5866 btrfs_get_block_group(block_group); 5867 search_start = block_group->key.objectid; 5868 5869 /* 5870 * this can happen if we end up cycling through all the 5871 * raid types, but we want to make sure we only allocate 5872 * for the proper type. 5873 */ 5874 if (!block_group_bits(block_group, data)) { 5875 u64 extra = BTRFS_BLOCK_GROUP_DUP | 5876 BTRFS_BLOCK_GROUP_RAID1 | 5877 BTRFS_BLOCK_GROUP_RAID5 | 5878 BTRFS_BLOCK_GROUP_RAID6 | 5879 BTRFS_BLOCK_GROUP_RAID10; 5880 5881 /* 5882 * if they asked for extra copies and this block group 5883 * doesn't provide them, bail. This does allow us to 5884 * fill raid0 from raid1. 
5885 */ 5886 if ((data & extra) && !(block_group->flags & extra)) 5887 goto loop; 5888 } 5889 5890 have_block_group: 5891 cached = block_group_cache_done(block_group); 5892 if (unlikely(!cached)) { 5893 found_uncached_bg = true; 5894 ret = cache_block_group(block_group, 0); 5895 BUG_ON(ret < 0); 5896 ret = 0; 5897 } 5898 5899 if (unlikely(block_group->ro)) 5900 goto loop; 5901 5902 /* 5903 * Ok we want to try and use the cluster allocator, so 5904 * let's look there 5905 */ 5906 if (last_ptr) { 5907 unsigned long aligned_cluster; 5908 /* 5909 * the refill lock keeps out other 5910 * people trying to start a new cluster 5911 */ 5912 spin_lock(&last_ptr->refill_lock); 5913 used_block_group = last_ptr->block_group; 5914 if (used_block_group != block_group && 5915 (!used_block_group || 5916 used_block_group->ro || 5917 !block_group_bits(used_block_group, data))) { 5918 used_block_group = block_group; 5919 goto refill_cluster; 5920 } 5921 5922 if (used_block_group != block_group) 5923 btrfs_get_block_group(used_block_group); 5924 5925 offset = btrfs_alloc_from_cluster(used_block_group, 5926 last_ptr, num_bytes, used_block_group->key.objectid); 5927 if (offset) { 5928 /* we have a block, we're done */ 5929 spin_unlock(&last_ptr->refill_lock); 5930 trace_btrfs_reserve_extent_cluster(root, 5931 block_group, search_start, num_bytes); 5932 goto checks; 5933 } 5934 5935 WARN_ON(last_ptr->block_group != used_block_group); 5936 if (used_block_group != block_group) { 5937 btrfs_put_block_group(used_block_group); 5938 used_block_group = block_group; 5939 } 5940 refill_cluster: 5941 BUG_ON(used_block_group != block_group); 5942 /* If we are on LOOP_NO_EMPTY_SIZE, we can't 5943 * set up a new cluster, so let's just skip it 5944 * and let the allocator find whatever block 5945 * it can find. If we reach this point, we 5946 * will have tried the cluster allocator 5947 * plenty of times and not have found 5948 * anything, so we are likely way too 5949 * fragmented for the clustering stuff to find 5950 * anything. 5951 * 5952 * However, if the cluster is taken from the 5953 * current block group, release the cluster 5954 * first, so that we stand a better chance of 5955 * succeeding in the unclustered 5956 * allocation.
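			 *
			 * Roughly, the order of operations that follows is
			 * (summary of the code below, nothing new):
			 *
			 *	if we are at LOOP_NO_EMPTY_SIZE and the cluster
			 *	belongs to some other block group, leave it
			 *	alone and go straight to the unclustered path;
			 *	otherwise hand the old cluster back to the free
			 *	space cache, and only if we are still below
			 *	LOOP_NO_EMPTY_SIZE try to refill a fresh
			 *	cluster and allocate from it.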
*/ 5957 if (loop >= LOOP_NO_EMPTY_SIZE && 5958 last_ptr->block_group != block_group) { 5959 spin_unlock(&last_ptr->refill_lock); 5960 goto unclustered_alloc; 5961 } 5962 5963 /* 5964 * this cluster didn't work out, free it and 5965 * start over 5966 */ 5967 btrfs_return_cluster_to_free_space(NULL, last_ptr); 5968 5969 if (loop >= LOOP_NO_EMPTY_SIZE) { 5970 spin_unlock(&last_ptr->refill_lock); 5971 goto unclustered_alloc; 5972 } 5973 5974 aligned_cluster = max_t(unsigned long, 5975 empty_cluster + empty_size, 5976 block_group->full_stripe_len); 5977 5978 /* allocate a cluster in this block group */ 5979 ret = btrfs_find_space_cluster(trans, root, 5980 block_group, last_ptr, 5981 search_start, num_bytes, 5982 aligned_cluster); 5983 if (ret == 0) { 5984 /* 5985 * now pull our allocation out of this 5986 * cluster 5987 */ 5988 offset = btrfs_alloc_from_cluster(block_group, 5989 last_ptr, num_bytes, 5990 search_start); 5991 if (offset) { 5992 /* we found one, proceed */ 5993 spin_unlock(&last_ptr->refill_lock); 5994 trace_btrfs_reserve_extent_cluster(root, 5995 block_group, search_start, 5996 num_bytes); 5997 goto checks; 5998 } 5999 } else if (!cached && loop > LOOP_CACHING_NOWAIT 6000 && !failed_cluster_refill) { 6001 spin_unlock(&last_ptr->refill_lock); 6002 6003 failed_cluster_refill = true; 6004 wait_block_group_cache_progress(block_group, 6005 num_bytes + empty_cluster + empty_size); 6006 goto have_block_group; 6007 } 6008 6009 /* 6010 * at this point we either didn't find a cluster 6011 * or we weren't able to allocate a block from our 6012 * cluster. Free the cluster we've been trying 6013 * to use, and go to the next block group 6014 */ 6015 btrfs_return_cluster_to_free_space(NULL, last_ptr); 6016 spin_unlock(&last_ptr->refill_lock); 6017 goto loop; 6018 } 6019 6020 unclustered_alloc: 6021 spin_lock(&block_group->free_space_ctl->tree_lock); 6022 if (cached && 6023 block_group->free_space_ctl->free_space < 6024 num_bytes + empty_cluster + empty_size) { 6025 spin_unlock(&block_group->free_space_ctl->tree_lock); 6026 goto loop; 6027 } 6028 spin_unlock(&block_group->free_space_ctl->tree_lock); 6029 6030 offset = btrfs_find_space_for_alloc(block_group, search_start, 6031 num_bytes, empty_size); 6032 /* 6033 * If we didn't find a chunk, and we haven't failed on this 6034 * block group before, and this block group is in the middle of 6035 * caching and we are ok with waiting, then go ahead and wait 6036 * for progress to be made, and set failed_alloc to true. 6037 * 6038 * If failed_alloc is true then we've already waited on this 6039 * block group once and should move on to the next block group. 
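		 *
		 * Net effect: we wait here at most once per visit to a block
		 * group; together with the failed_cluster_refill wait above
		 * this bounds how long a single still-caching block group can
		 * stall the search before we move on.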
6040 */ 6041 if (!offset && !failed_alloc && !cached && 6042 loop > LOOP_CACHING_NOWAIT) { 6043 wait_block_group_cache_progress(block_group, 6044 num_bytes + empty_size); 6045 failed_alloc = true; 6046 goto have_block_group; 6047 } else if (!offset) { 6048 if (!cached) 6049 have_caching_bg = true; 6050 goto loop; 6051 } 6052 checks: 6053 search_start = stripe_align(root, used_block_group, 6054 offset, num_bytes); 6055 6056 /* move on to the next group */ 6057 if (search_start + num_bytes > 6058 used_block_group->key.objectid + used_block_group->key.offset) { 6059 btrfs_add_free_space(used_block_group, offset, num_bytes); 6060 goto loop; 6061 } 6062 6063 if (offset < search_start) 6064 btrfs_add_free_space(used_block_group, offset, 6065 search_start - offset); 6066 BUG_ON(offset > search_start); 6067 6068 ret = btrfs_update_reserved_bytes(used_block_group, num_bytes, 6069 alloc_type); 6070 if (ret == -EAGAIN) { 6071 btrfs_add_free_space(used_block_group, offset, num_bytes); 6072 goto loop; 6073 } 6074 6075 /* we are all good, lets return */ 6076 ins->objectid = search_start; 6077 ins->offset = num_bytes; 6078 6079 trace_btrfs_reserve_extent(orig_root, block_group, 6080 search_start, num_bytes); 6081 if (used_block_group != block_group) 6082 btrfs_put_block_group(used_block_group); 6083 btrfs_put_block_group(block_group); 6084 break; 6085 loop: 6086 failed_cluster_refill = false; 6087 failed_alloc = false; 6088 BUG_ON(index != get_block_group_index(block_group)); 6089 if (used_block_group != block_group) 6090 btrfs_put_block_group(used_block_group); 6091 btrfs_put_block_group(block_group); 6092 } 6093 up_read(&space_info->groups_sem); 6094 6095 if (!ins->objectid && loop >= LOOP_CACHING_WAIT && have_caching_bg) 6096 goto search; 6097 6098 if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES) 6099 goto search; 6100 6101 /* 6102 * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking 6103 * caching kthreads as we move along 6104 * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching 6105 * LOOP_ALLOC_CHUNK, force a chunk allocation and try again 6106 * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try 6107 * again 6108 */ 6109 if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE) { 6110 index = 0; 6111 loop++; 6112 if (loop == LOOP_ALLOC_CHUNK) { 6113 ret = do_chunk_alloc(trans, root, data, 6114 CHUNK_ALLOC_FORCE); 6115 /* 6116 * Do not bail out on ENOSPC since we 6117 * can do more things. 6118 */ 6119 if (ret < 0 && ret != -ENOSPC) { 6120 btrfs_abort_transaction(trans, 6121 root, ret); 6122 goto out; 6123 } 6124 } 6125 6126 if (loop == LOOP_NO_EMPTY_SIZE) { 6127 empty_size = 0; 6128 empty_cluster = 0; 6129 } 6130 6131 goto search; 6132 } else if (!ins->objectid) { 6133 ret = -ENOSPC; 6134 } else if (ins->objectid) { 6135 ret = 0; 6136 } 6137 out: 6138 6139 return ret; 6140 } 6141 6142 static void dump_space_info(struct btrfs_space_info *info, u64 bytes, 6143 int dump_block_groups) 6144 { 6145 struct btrfs_block_group_cache *cache; 6146 int index = 0; 6147 6148 spin_lock(&info->lock); 6149 printk(KERN_INFO "space_info %llu has %llu free, is %sfull\n", 6150 (unsigned long long)info->flags, 6151 (unsigned long long)(info->total_bytes - info->bytes_used - 6152 info->bytes_pinned - info->bytes_reserved - 6153 info->bytes_readonly), 6154 (info->full) ? 
"" : "not "); 6155 printk(KERN_INFO "space_info total=%llu, used=%llu, pinned=%llu, " 6156 "reserved=%llu, may_use=%llu, readonly=%llu\n", 6157 (unsigned long long)info->total_bytes, 6158 (unsigned long long)info->bytes_used, 6159 (unsigned long long)info->bytes_pinned, 6160 (unsigned long long)info->bytes_reserved, 6161 (unsigned long long)info->bytes_may_use, 6162 (unsigned long long)info->bytes_readonly); 6163 spin_unlock(&info->lock); 6164 6165 if (!dump_block_groups) 6166 return; 6167 6168 down_read(&info->groups_sem); 6169 again: 6170 list_for_each_entry(cache, &info->block_groups[index], list) { 6171 spin_lock(&cache->lock); 6172 printk(KERN_INFO "block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %s\n", 6173 (unsigned long long)cache->key.objectid, 6174 (unsigned long long)cache->key.offset, 6175 (unsigned long long)btrfs_block_group_used(&cache->item), 6176 (unsigned long long)cache->pinned, 6177 (unsigned long long)cache->reserved, 6178 cache->ro ? "[readonly]" : ""); 6179 btrfs_dump_free_space(cache, bytes); 6180 spin_unlock(&cache->lock); 6181 } 6182 if (++index < BTRFS_NR_RAID_TYPES) 6183 goto again; 6184 up_read(&info->groups_sem); 6185 } 6186 6187 int btrfs_reserve_extent(struct btrfs_trans_handle *trans, 6188 struct btrfs_root *root, 6189 u64 num_bytes, u64 min_alloc_size, 6190 u64 empty_size, u64 hint_byte, 6191 struct btrfs_key *ins, u64 data) 6192 { 6193 bool final_tried = false; 6194 int ret; 6195 6196 data = btrfs_get_alloc_profile(root, data); 6197 again: 6198 WARN_ON(num_bytes < root->sectorsize); 6199 ret = find_free_extent(trans, root, num_bytes, empty_size, 6200 hint_byte, ins, data); 6201 6202 if (ret == -ENOSPC) { 6203 if (!final_tried) { 6204 num_bytes = num_bytes >> 1; 6205 num_bytes = round_down(num_bytes, root->sectorsize); 6206 num_bytes = max(num_bytes, min_alloc_size); 6207 if (num_bytes == min_alloc_size) 6208 final_tried = true; 6209 goto again; 6210 } else if (btrfs_test_opt(root, ENOSPC_DEBUG)) { 6211 struct btrfs_space_info *sinfo; 6212 6213 sinfo = __find_space_info(root->fs_info, data); 6214 printk(KERN_ERR "btrfs allocation failed flags %llu, " 6215 "wanted %llu\n", (unsigned long long)data, 6216 (unsigned long long)num_bytes); 6217 if (sinfo) 6218 dump_space_info(sinfo, num_bytes, 1); 6219 } 6220 } 6221 6222 trace_btrfs_reserved_extent_alloc(root, ins->objectid, ins->offset); 6223 6224 return ret; 6225 } 6226 6227 static int __btrfs_free_reserved_extent(struct btrfs_root *root, 6228 u64 start, u64 len, int pin) 6229 { 6230 struct btrfs_block_group_cache *cache; 6231 int ret = 0; 6232 6233 cache = btrfs_lookup_block_group(root->fs_info, start); 6234 if (!cache) { 6235 printk(KERN_ERR "Unable to find block group for %llu\n", 6236 (unsigned long long)start); 6237 return -ENOSPC; 6238 } 6239 6240 if (btrfs_test_opt(root, DISCARD)) 6241 ret = btrfs_discard_extent(root, start, len, NULL); 6242 6243 if (pin) 6244 pin_down_extent(root, cache, start, len, 1); 6245 else { 6246 btrfs_add_free_space(cache, start, len); 6247 btrfs_update_reserved_bytes(cache, len, RESERVE_FREE); 6248 } 6249 btrfs_put_block_group(cache); 6250 6251 trace_btrfs_reserved_extent_free(root, start, len); 6252 6253 return ret; 6254 } 6255 6256 int btrfs_free_reserved_extent(struct btrfs_root *root, 6257 u64 start, u64 len) 6258 { 6259 return __btrfs_free_reserved_extent(root, start, len, 0); 6260 } 6261 6262 int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root, 6263 u64 start, u64 len) 6264 { 6265 return __btrfs_free_reserved_extent(root, start, len, 1); 
6266 } 6267 6268 static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, 6269 struct btrfs_root *root, 6270 u64 parent, u64 root_objectid, 6271 u64 flags, u64 owner, u64 offset, 6272 struct btrfs_key *ins, int ref_mod) 6273 { 6274 int ret; 6275 struct btrfs_fs_info *fs_info = root->fs_info; 6276 struct btrfs_extent_item *extent_item; 6277 struct btrfs_extent_inline_ref *iref; 6278 struct btrfs_path *path; 6279 struct extent_buffer *leaf; 6280 int type; 6281 u32 size; 6282 6283 if (parent > 0) 6284 type = BTRFS_SHARED_DATA_REF_KEY; 6285 else 6286 type = BTRFS_EXTENT_DATA_REF_KEY; 6287 6288 size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type); 6289 6290 path = btrfs_alloc_path(); 6291 if (!path) 6292 return -ENOMEM; 6293 6294 path->leave_spinning = 1; 6295 ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path, 6296 ins, size); 6297 if (ret) { 6298 btrfs_free_path(path); 6299 return ret; 6300 } 6301 6302 leaf = path->nodes[0]; 6303 extent_item = btrfs_item_ptr(leaf, path->slots[0], 6304 struct btrfs_extent_item); 6305 btrfs_set_extent_refs(leaf, extent_item, ref_mod); 6306 btrfs_set_extent_generation(leaf, extent_item, trans->transid); 6307 btrfs_set_extent_flags(leaf, extent_item, 6308 flags | BTRFS_EXTENT_FLAG_DATA); 6309 6310 iref = (struct btrfs_extent_inline_ref *)(extent_item + 1); 6311 btrfs_set_extent_inline_ref_type(leaf, iref, type); 6312 if (parent > 0) { 6313 struct btrfs_shared_data_ref *ref; 6314 ref = (struct btrfs_shared_data_ref *)(iref + 1); 6315 btrfs_set_extent_inline_ref_offset(leaf, iref, parent); 6316 btrfs_set_shared_data_ref_count(leaf, ref, ref_mod); 6317 } else { 6318 struct btrfs_extent_data_ref *ref; 6319 ref = (struct btrfs_extent_data_ref *)(&iref->offset); 6320 btrfs_set_extent_data_ref_root(leaf, ref, root_objectid); 6321 btrfs_set_extent_data_ref_objectid(leaf, ref, owner); 6322 btrfs_set_extent_data_ref_offset(leaf, ref, offset); 6323 btrfs_set_extent_data_ref_count(leaf, ref, ref_mod); 6324 } 6325 6326 btrfs_mark_buffer_dirty(path->nodes[0]); 6327 btrfs_free_path(path); 6328 6329 ret = update_block_group(root, ins->objectid, ins->offset, 1); 6330 if (ret) { /* -ENOENT, logic error */ 6331 printk(KERN_ERR "btrfs update block group failed for %llu " 6332 "%llu\n", (unsigned long long)ins->objectid, 6333 (unsigned long long)ins->offset); 6334 BUG(); 6335 } 6336 return ret; 6337 } 6338 6339 static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, 6340 struct btrfs_root *root, 6341 u64 parent, u64 root_objectid, 6342 u64 flags, struct btrfs_disk_key *key, 6343 int level, struct btrfs_key *ins) 6344 { 6345 int ret; 6346 struct btrfs_fs_info *fs_info = root->fs_info; 6347 struct btrfs_extent_item *extent_item; 6348 struct btrfs_tree_block_info *block_info; 6349 struct btrfs_extent_inline_ref *iref; 6350 struct btrfs_path *path; 6351 struct extent_buffer *leaf; 6352 u32 size = sizeof(*extent_item) + sizeof(*block_info) + sizeof(*iref); 6353 6354 path = btrfs_alloc_path(); 6355 if (!path) 6356 return -ENOMEM; 6357 6358 path->leave_spinning = 1; 6359 ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path, 6360 ins, size); 6361 if (ret) { 6362 btrfs_free_path(path); 6363 return ret; 6364 } 6365 6366 leaf = path->nodes[0]; 6367 extent_item = btrfs_item_ptr(leaf, path->slots[0], 6368 struct btrfs_extent_item); 6369 btrfs_set_extent_refs(leaf, extent_item, 1); 6370 btrfs_set_extent_generation(leaf, extent_item, trans->transid); 6371 btrfs_set_extent_flags(leaf, extent_item, 6372 flags | 
BTRFS_EXTENT_FLAG_TREE_BLOCK); 6373 block_info = (struct btrfs_tree_block_info *)(extent_item + 1); 6374 6375 btrfs_set_tree_block_key(leaf, block_info, key); 6376 btrfs_set_tree_block_level(leaf, block_info, level); 6377 6378 iref = (struct btrfs_extent_inline_ref *)(block_info + 1); 6379 if (parent > 0) { 6380 BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)); 6381 btrfs_set_extent_inline_ref_type(leaf, iref, 6382 BTRFS_SHARED_BLOCK_REF_KEY); 6383 btrfs_set_extent_inline_ref_offset(leaf, iref, parent); 6384 } else { 6385 btrfs_set_extent_inline_ref_type(leaf, iref, 6386 BTRFS_TREE_BLOCK_REF_KEY); 6387 btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid); 6388 } 6389 6390 btrfs_mark_buffer_dirty(leaf); 6391 btrfs_free_path(path); 6392 6393 ret = update_block_group(root, ins->objectid, ins->offset, 1); 6394 if (ret) { /* -ENOENT, logic error */ 6395 printk(KERN_ERR "btrfs update block group failed for %llu " 6396 "%llu\n", (unsigned long long)ins->objectid, 6397 (unsigned long long)ins->offset); 6398 BUG(); 6399 } 6400 return ret; 6401 } 6402 6403 int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, 6404 struct btrfs_root *root, 6405 u64 root_objectid, u64 owner, 6406 u64 offset, struct btrfs_key *ins) 6407 { 6408 int ret; 6409 6410 BUG_ON(root_objectid == BTRFS_TREE_LOG_OBJECTID); 6411 6412 ret = btrfs_add_delayed_data_ref(root->fs_info, trans, ins->objectid, 6413 ins->offset, 0, 6414 root_objectid, owner, offset, 6415 BTRFS_ADD_DELAYED_EXTENT, NULL, 0); 6416 return ret; 6417 } 6418 6419 /* 6420 * this is used by the tree logging recovery code. It records that 6421 * an extent has been allocated and makes sure to clear the free 6422 * space cache bits as well 6423 */ 6424 int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, 6425 struct btrfs_root *root, 6426 u64 root_objectid, u64 owner, u64 offset, 6427 struct btrfs_key *ins) 6428 { 6429 int ret; 6430 struct btrfs_block_group_cache *block_group; 6431 struct btrfs_caching_control *caching_ctl; 6432 u64 start = ins->objectid; 6433 u64 num_bytes = ins->offset; 6434 6435 block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid); 6436 cache_block_group(block_group, 0); 6437 caching_ctl = get_caching_control(block_group); 6438 6439 if (!caching_ctl) { 6440 BUG_ON(!block_group_cache_done(block_group)); 6441 ret = btrfs_remove_free_space(block_group, start, num_bytes); 6442 BUG_ON(ret); /* -ENOMEM */ 6443 } else { 6444 mutex_lock(&caching_ctl->mutex); 6445 6446 if (start >= caching_ctl->progress) { 6447 ret = add_excluded_extent(root, start, num_bytes); 6448 BUG_ON(ret); /* -ENOMEM */ 6449 } else if (start + num_bytes <= caching_ctl->progress) { 6450 ret = btrfs_remove_free_space(block_group, 6451 start, num_bytes); 6452 BUG_ON(ret); /* -ENOMEM */ 6453 } else { 6454 num_bytes = caching_ctl->progress - start; 6455 ret = btrfs_remove_free_space(block_group, 6456 start, num_bytes); 6457 BUG_ON(ret); /* -ENOMEM */ 6458 6459 start = caching_ctl->progress; 6460 num_bytes = ins->objectid + ins->offset - 6461 caching_ctl->progress; 6462 ret = add_excluded_extent(root, start, num_bytes); 6463 BUG_ON(ret); /* -ENOMEM */ 6464 } 6465 6466 mutex_unlock(&caching_ctl->mutex); 6467 put_caching_control(caching_ctl); 6468 } 6469 6470 ret = btrfs_update_reserved_bytes(block_group, ins->offset, 6471 RESERVE_ALLOC_NO_ACCOUNT); 6472 BUG_ON(ret); /* logic error */ 6473 btrfs_put_block_group(block_group); 6474 ret = alloc_reserved_file_extent(trans, root, 0, root_objectid, 6475 0, owner, offset, ins, 1); 6476 return 
ret; 6477 } 6478 6479 struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, 6480 struct btrfs_root *root, 6481 u64 bytenr, u32 blocksize, 6482 int level) 6483 { 6484 struct extent_buffer *buf; 6485 6486 buf = btrfs_find_create_tree_block(root, bytenr, blocksize); 6487 if (!buf) 6488 return ERR_PTR(-ENOMEM); 6489 btrfs_set_header_generation(buf, trans->transid); 6490 btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level); 6491 btrfs_tree_lock(buf); 6492 clean_tree_block(trans, root, buf); 6493 clear_bit(EXTENT_BUFFER_STALE, &buf->bflags); 6494 6495 btrfs_set_lock_blocking(buf); 6496 btrfs_set_buffer_uptodate(buf); 6497 6498 if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) { 6499 /* 6500 * we allow two log transactions at a time, use different 6501 * EXENT bit to differentiate dirty pages. 6502 */ 6503 if (root->log_transid % 2 == 0) 6504 set_extent_dirty(&root->dirty_log_pages, buf->start, 6505 buf->start + buf->len - 1, GFP_NOFS); 6506 else 6507 set_extent_new(&root->dirty_log_pages, buf->start, 6508 buf->start + buf->len - 1, GFP_NOFS); 6509 } else { 6510 set_extent_dirty(&trans->transaction->dirty_pages, buf->start, 6511 buf->start + buf->len - 1, GFP_NOFS); 6512 } 6513 trans->blocks_used++; 6514 /* this returns a buffer locked for blocking */ 6515 return buf; 6516 } 6517 6518 static struct btrfs_block_rsv * 6519 use_block_rsv(struct btrfs_trans_handle *trans, 6520 struct btrfs_root *root, u32 blocksize) 6521 { 6522 struct btrfs_block_rsv *block_rsv; 6523 struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv; 6524 int ret; 6525 6526 block_rsv = get_block_rsv(trans, root); 6527 6528 if (block_rsv->size == 0) { 6529 ret = reserve_metadata_bytes(root, block_rsv, blocksize, 6530 BTRFS_RESERVE_NO_FLUSH); 6531 /* 6532 * If we couldn't reserve metadata bytes try and use some from 6533 * the global reserve. 6534 */ 6535 if (ret && block_rsv != global_rsv) { 6536 ret = block_rsv_use_bytes(global_rsv, blocksize); 6537 if (!ret) 6538 return global_rsv; 6539 return ERR_PTR(ret); 6540 } else if (ret) { 6541 return ERR_PTR(ret); 6542 } 6543 return block_rsv; 6544 } 6545 6546 ret = block_rsv_use_bytes(block_rsv, blocksize); 6547 if (!ret) 6548 return block_rsv; 6549 if (ret && !block_rsv->failfast) { 6550 if (btrfs_test_opt(root, ENOSPC_DEBUG)) { 6551 static DEFINE_RATELIMIT_STATE(_rs, 6552 DEFAULT_RATELIMIT_INTERVAL * 10, 6553 /*DEFAULT_RATELIMIT_BURST*/ 1); 6554 if (__ratelimit(&_rs)) 6555 WARN(1, KERN_DEBUG 6556 "btrfs: block rsv returned %d\n", ret); 6557 } 6558 ret = reserve_metadata_bytes(root, block_rsv, blocksize, 6559 BTRFS_RESERVE_NO_FLUSH); 6560 if (!ret) { 6561 return block_rsv; 6562 } else if (ret && block_rsv != global_rsv) { 6563 ret = block_rsv_use_bytes(global_rsv, blocksize); 6564 if (!ret) 6565 return global_rsv; 6566 } 6567 } 6568 6569 return ERR_PTR(-ENOSPC); 6570 } 6571 6572 static void unuse_block_rsv(struct btrfs_fs_info *fs_info, 6573 struct btrfs_block_rsv *block_rsv, u32 blocksize) 6574 { 6575 block_rsv_add_bytes(block_rsv, blocksize, 0); 6576 block_rsv_release_bytes(fs_info, block_rsv, NULL, 0); 6577 } 6578 6579 /* 6580 * finds a free extent and does all the dirty work required for allocation 6581 * returns the key for the extent through ins, and a tree buffer for 6582 * the first block of the extent through buf. 6583 * 6584 * returns the tree buffer or NULL. 
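 *
 * (In this version the failure paths actually hand back an ERR_PTR()
 * rather than NULL, so callers are expected to check with IS_ERR().
 * A minimal, purely illustrative call site:
 *
 *	buf = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
 *				     root->root_key.objectid, NULL, 0, 0, 0);
 *	if (IS_ERR(buf))
 *		return PTR_ERR(buf);
 *
 * The argument values above are just an example.)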
6585 */ 6586 struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, 6587 struct btrfs_root *root, u32 blocksize, 6588 u64 parent, u64 root_objectid, 6589 struct btrfs_disk_key *key, int level, 6590 u64 hint, u64 empty_size) 6591 { 6592 struct btrfs_key ins; 6593 struct btrfs_block_rsv *block_rsv; 6594 struct extent_buffer *buf; 6595 u64 flags = 0; 6596 int ret; 6597 6598 6599 block_rsv = use_block_rsv(trans, root, blocksize); 6600 if (IS_ERR(block_rsv)) 6601 return ERR_CAST(block_rsv); 6602 6603 ret = btrfs_reserve_extent(trans, root, blocksize, blocksize, 6604 empty_size, hint, &ins, 0); 6605 if (ret) { 6606 unuse_block_rsv(root->fs_info, block_rsv, blocksize); 6607 return ERR_PTR(ret); 6608 } 6609 6610 buf = btrfs_init_new_buffer(trans, root, ins.objectid, 6611 blocksize, level); 6612 BUG_ON(IS_ERR(buf)); /* -ENOMEM */ 6613 6614 if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) { 6615 if (parent == 0) 6616 parent = ins.objectid; 6617 flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF; 6618 } else 6619 BUG_ON(parent > 0); 6620 6621 if (root_objectid != BTRFS_TREE_LOG_OBJECTID) { 6622 struct btrfs_delayed_extent_op *extent_op; 6623 extent_op = btrfs_alloc_delayed_extent_op(); 6624 BUG_ON(!extent_op); /* -ENOMEM */ 6625 if (key) 6626 memcpy(&extent_op->key, key, sizeof(extent_op->key)); 6627 else 6628 memset(&extent_op->key, 0, sizeof(extent_op->key)); 6629 extent_op->flags_to_set = flags; 6630 extent_op->update_key = 1; 6631 extent_op->update_flags = 1; 6632 extent_op->is_data = 0; 6633 6634 ret = btrfs_add_delayed_tree_ref(root->fs_info, trans, 6635 ins.objectid, 6636 ins.offset, parent, root_objectid, 6637 level, BTRFS_ADD_DELAYED_EXTENT, 6638 extent_op, 0); 6639 BUG_ON(ret); /* -ENOMEM */ 6640 } 6641 return buf; 6642 } 6643 6644 struct walk_control { 6645 u64 refs[BTRFS_MAX_LEVEL]; 6646 u64 flags[BTRFS_MAX_LEVEL]; 6647 struct btrfs_key update_progress; 6648 int stage; 6649 int level; 6650 int shared_level; 6651 int update_ref; 6652 int keep_locks; 6653 int reada_slot; 6654 int reada_count; 6655 int for_reloc; 6656 }; 6657 6658 #define DROP_REFERENCE 1 6659 #define UPDATE_BACKREF 2 6660 6661 static noinline void reada_walk_down(struct btrfs_trans_handle *trans, 6662 struct btrfs_root *root, 6663 struct walk_control *wc, 6664 struct btrfs_path *path) 6665 { 6666 u64 bytenr; 6667 u64 generation; 6668 u64 refs; 6669 u64 flags; 6670 u32 nritems; 6671 u32 blocksize; 6672 struct btrfs_key key; 6673 struct extent_buffer *eb; 6674 int ret; 6675 int slot; 6676 int nread = 0; 6677 6678 if (path->slots[wc->level] < wc->reada_slot) { 6679 wc->reada_count = wc->reada_count * 2 / 3; 6680 wc->reada_count = max(wc->reada_count, 2); 6681 } else { 6682 wc->reada_count = wc->reada_count * 3 / 2; 6683 wc->reada_count = min_t(int, wc->reada_count, 6684 BTRFS_NODEPTRS_PER_BLOCK(root)); 6685 } 6686 6687 eb = path->nodes[wc->level]; 6688 nritems = btrfs_header_nritems(eb); 6689 blocksize = btrfs_level_size(root, wc->level - 1); 6690 6691 for (slot = path->slots[wc->level]; slot < nritems; slot++) { 6692 if (nread >= wc->reada_count) 6693 break; 6694 6695 cond_resched(); 6696 bytenr = btrfs_node_blockptr(eb, slot); 6697 generation = btrfs_node_ptr_generation(eb, slot); 6698 6699 if (slot == path->slots[wc->level]) 6700 goto reada; 6701 6702 if (wc->stage == UPDATE_BACKREF && 6703 generation <= root->root_key.offset) 6704 continue; 6705 6706 /* We don't lock the tree block, it's OK to be racy here */ 6707 ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize, 6708 &refs, &flags); 6709 /* We 
don't care about errors in readahead. */ 6710 if (ret < 0) 6711 continue; 6712 BUG_ON(refs == 0); 6713 6714 if (wc->stage == DROP_REFERENCE) { 6715 if (refs == 1) 6716 goto reada; 6717 6718 if (wc->level == 1 && 6719 (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) 6720 continue; 6721 if (!wc->update_ref || 6722 generation <= root->root_key.offset) 6723 continue; 6724 btrfs_node_key_to_cpu(eb, &key, slot); 6725 ret = btrfs_comp_cpu_keys(&key, 6726 &wc->update_progress); 6727 if (ret < 0) 6728 continue; 6729 } else { 6730 if (wc->level == 1 && 6731 (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) 6732 continue; 6733 } 6734 reada: 6735 ret = readahead_tree_block(root, bytenr, blocksize, 6736 generation); 6737 if (ret) 6738 break; 6739 nread++; 6740 } 6741 wc->reada_slot = slot; 6742 } 6743 6744 /* 6745 * helper to process tree block while walking down the tree. 6746 * 6747 * when wc->stage == UPDATE_BACKREF, this function updates 6748 * back refs for pointers in the block. 6749 * 6750 * NOTE: return value 1 means we should stop walking down. 6751 */ 6752 static noinline int walk_down_proc(struct btrfs_trans_handle *trans, 6753 struct btrfs_root *root, 6754 struct btrfs_path *path, 6755 struct walk_control *wc, int lookup_info) 6756 { 6757 int level = wc->level; 6758 struct extent_buffer *eb = path->nodes[level]; 6759 u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF; 6760 int ret; 6761 6762 if (wc->stage == UPDATE_BACKREF && 6763 btrfs_header_owner(eb) != root->root_key.objectid) 6764 return 1; 6765 6766 /* 6767 * when reference count of tree block is 1, it won't increase 6768 * again. once full backref flag is set, we never clear it. 6769 */ 6770 if (lookup_info && 6771 ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) || 6772 (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) { 6773 BUG_ON(!path->locks[level]); 6774 ret = btrfs_lookup_extent_info(trans, root, 6775 eb->start, eb->len, 6776 &wc->refs[level], 6777 &wc->flags[level]); 6778 BUG_ON(ret == -ENOMEM); 6779 if (ret) 6780 return ret; 6781 BUG_ON(wc->refs[level] == 0); 6782 } 6783 6784 if (wc->stage == DROP_REFERENCE) { 6785 if (wc->refs[level] > 1) 6786 return 1; 6787 6788 if (path->locks[level] && !wc->keep_locks) { 6789 btrfs_tree_unlock_rw(eb, path->locks[level]); 6790 path->locks[level] = 0; 6791 } 6792 return 0; 6793 } 6794 6795 /* wc->stage == UPDATE_BACKREF */ 6796 if (!(wc->flags[level] & flag)) { 6797 BUG_ON(!path->locks[level]); 6798 ret = btrfs_inc_ref(trans, root, eb, 1, wc->for_reloc); 6799 BUG_ON(ret); /* -ENOMEM */ 6800 ret = btrfs_dec_ref(trans, root, eb, 0, wc->for_reloc); 6801 BUG_ON(ret); /* -ENOMEM */ 6802 ret = btrfs_set_disk_extent_flags(trans, root, eb->start, 6803 eb->len, flag, 0); 6804 BUG_ON(ret); /* -ENOMEM */ 6805 wc->flags[level] |= flag; 6806 } 6807 6808 /* 6809 * the block is shared by multiple trees, so it's not good to 6810 * keep the tree lock 6811 */ 6812 if (path->locks[level] && level > 0) { 6813 btrfs_tree_unlock_rw(eb, path->locks[level]); 6814 path->locks[level] = 0; 6815 } 6816 return 0; 6817 } 6818 6819 /* 6820 * helper to process tree block pointer. 6821 * 6822 * when wc->stage == DROP_REFERENCE, this function checks 6823 * reference count of the block pointed to. if the block 6824 * is shared and we need update back refs for the subtree 6825 * rooted at the block, this function changes wc->stage to 6826 * UPDATE_BACKREF. if the block is shared and there is no 6827 * need to update back, this function drops the reference 6828 * to the block. 
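 *
 * Roughly, for a shared lower block the cases handled below are:
 *
 *	refs == 1                      walk into the child as usual
 *	shared, backrefs need fixing   switch to UPDATE_BACKREF and walk in
 *	shared, nothing to update      drop our ref on the child and skip it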
6829 * 6830 * NOTE: return value 1 means we should stop walking down. 6831 */ 6832 static noinline int do_walk_down(struct btrfs_trans_handle *trans, 6833 struct btrfs_root *root, 6834 struct btrfs_path *path, 6835 struct walk_control *wc, int *lookup_info) 6836 { 6837 u64 bytenr; 6838 u64 generation; 6839 u64 parent; 6840 u32 blocksize; 6841 struct btrfs_key key; 6842 struct extent_buffer *next; 6843 int level = wc->level; 6844 int reada = 0; 6845 int ret = 0; 6846 6847 generation = btrfs_node_ptr_generation(path->nodes[level], 6848 path->slots[level]); 6849 /* 6850 * if the lower level block was created before the snapshot 6851 * was created, we know there is no need to update back refs 6852 * for the subtree 6853 */ 6854 if (wc->stage == UPDATE_BACKREF && 6855 generation <= root->root_key.offset) { 6856 *lookup_info = 1; 6857 return 1; 6858 } 6859 6860 bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]); 6861 blocksize = btrfs_level_size(root, level - 1); 6862 6863 next = btrfs_find_tree_block(root, bytenr, blocksize); 6864 if (!next) { 6865 next = btrfs_find_create_tree_block(root, bytenr, blocksize); 6866 if (!next) 6867 return -ENOMEM; 6868 reada = 1; 6869 } 6870 btrfs_tree_lock(next); 6871 btrfs_set_lock_blocking(next); 6872 6873 ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize, 6874 &wc->refs[level - 1], 6875 &wc->flags[level - 1]); 6876 if (ret < 0) { 6877 btrfs_tree_unlock(next); 6878 return ret; 6879 } 6880 6881 BUG_ON(wc->refs[level - 1] == 0); 6882 *lookup_info = 0; 6883 6884 if (wc->stage == DROP_REFERENCE) { 6885 if (wc->refs[level - 1] > 1) { 6886 if (level == 1 && 6887 (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF)) 6888 goto skip; 6889 6890 if (!wc->update_ref || 6891 generation <= root->root_key.offset) 6892 goto skip; 6893 6894 btrfs_node_key_to_cpu(path->nodes[level], &key, 6895 path->slots[level]); 6896 ret = btrfs_comp_cpu_keys(&key, &wc->update_progress); 6897 if (ret < 0) 6898 goto skip; 6899 6900 wc->stage = UPDATE_BACKREF; 6901 wc->shared_level = level - 1; 6902 } 6903 } else { 6904 if (level == 1 && 6905 (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF)) 6906 goto skip; 6907 } 6908 6909 if (!btrfs_buffer_uptodate(next, generation, 0)) { 6910 btrfs_tree_unlock(next); 6911 free_extent_buffer(next); 6912 next = NULL; 6913 *lookup_info = 1; 6914 } 6915 6916 if (!next) { 6917 if (reada && level == 1) 6918 reada_walk_down(trans, root, wc, path); 6919 next = read_tree_block(root, bytenr, blocksize, generation); 6920 if (!next) 6921 return -EIO; 6922 btrfs_tree_lock(next); 6923 btrfs_set_lock_blocking(next); 6924 } 6925 6926 level--; 6927 BUG_ON(level != btrfs_header_level(next)); 6928 path->nodes[level] = next; 6929 path->slots[level] = 0; 6930 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; 6931 wc->level = level; 6932 if (wc->level == 1) 6933 wc->reada_slot = 0; 6934 return 0; 6935 skip: 6936 wc->refs[level - 1] = 0; 6937 wc->flags[level - 1] = 0; 6938 if (wc->stage == DROP_REFERENCE) { 6939 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) { 6940 parent = path->nodes[level]->start; 6941 } else { 6942 BUG_ON(root->root_key.objectid != 6943 btrfs_header_owner(path->nodes[level])); 6944 parent = 0; 6945 } 6946 6947 ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent, 6948 root->root_key.objectid, level - 1, 0, 0); 6949 BUG_ON(ret); /* -ENOMEM */ 6950 } 6951 btrfs_tree_unlock(next); 6952 free_extent_buffer(next); 6953 *lookup_info = 1; 6954 return 1; 6955 } 6956 6957 /* 6958 * helper to process tree block while walking up the 
tree. 6959 * 6960 * when wc->stage == DROP_REFERENCE, this function drops 6961 * reference count on the block. 6962 * 6963 * when wc->stage == UPDATE_BACKREF, this function changes 6964 * wc->stage back to DROP_REFERENCE if we changed wc->stage 6965 * to UPDATE_BACKREF previously while processing the block. 6966 * 6967 * NOTE: return value 1 means we should stop walking up. 6968 */ 6969 static noinline int walk_up_proc(struct btrfs_trans_handle *trans, 6970 struct btrfs_root *root, 6971 struct btrfs_path *path, 6972 struct walk_control *wc) 6973 { 6974 int ret; 6975 int level = wc->level; 6976 struct extent_buffer *eb = path->nodes[level]; 6977 u64 parent = 0; 6978 6979 if (wc->stage == UPDATE_BACKREF) { 6980 BUG_ON(wc->shared_level < level); 6981 if (level < wc->shared_level) 6982 goto out; 6983 6984 ret = find_next_key(path, level + 1, &wc->update_progress); 6985 if (ret > 0) 6986 wc->update_ref = 0; 6987 6988 wc->stage = DROP_REFERENCE; 6989 wc->shared_level = -1; 6990 path->slots[level] = 0; 6991 6992 /* 6993 * check reference count again if the block isn't locked. 6994 * we should start walking down the tree again if reference 6995 * count is one. 6996 */ 6997 if (!path->locks[level]) { 6998 BUG_ON(level == 0); 6999 btrfs_tree_lock(eb); 7000 btrfs_set_lock_blocking(eb); 7001 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; 7002 7003 ret = btrfs_lookup_extent_info(trans, root, 7004 eb->start, eb->len, 7005 &wc->refs[level], 7006 &wc->flags[level]); 7007 if (ret < 0) { 7008 btrfs_tree_unlock_rw(eb, path->locks[level]); 7009 path->locks[level] = 0; 7010 return ret; 7011 } 7012 BUG_ON(wc->refs[level] == 0); 7013 if (wc->refs[level] == 1) { 7014 btrfs_tree_unlock_rw(eb, path->locks[level]); 7015 path->locks[level] = 0; 7016 return 1; 7017 } 7018 } 7019 } 7020 7021 /* wc->stage == DROP_REFERENCE */ 7022 BUG_ON(wc->refs[level] > 1 && !path->locks[level]); 7023 7024 if (wc->refs[level] == 1) { 7025 if (level == 0) { 7026 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) 7027 ret = btrfs_dec_ref(trans, root, eb, 1, 7028 wc->for_reloc); 7029 else 7030 ret = btrfs_dec_ref(trans, root, eb, 0, 7031 wc->for_reloc); 7032 BUG_ON(ret); /* -ENOMEM */ 7033 } 7034 /* make block locked assertion in clean_tree_block happy */ 7035 if (!path->locks[level] && 7036 btrfs_header_generation(eb) == trans->transid) { 7037 btrfs_tree_lock(eb); 7038 btrfs_set_lock_blocking(eb); 7039 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; 7040 } 7041 clean_tree_block(trans, root, eb); 7042 } 7043 7044 if (eb == root->node) { 7045 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) 7046 parent = eb->start; 7047 else 7048 BUG_ON(root->root_key.objectid != 7049 btrfs_header_owner(eb)); 7050 } else { 7051 if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF) 7052 parent = path->nodes[level + 1]->start; 7053 else 7054 BUG_ON(root->root_key.objectid != 7055 btrfs_header_owner(path->nodes[level + 1])); 7056 } 7057 7058 btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1); 7059 out: 7060 wc->refs[level] = 0; 7061 wc->flags[level] = 0; 7062 return 0; 7063 } 7064 7065 static noinline int walk_down_tree(struct btrfs_trans_handle *trans, 7066 struct btrfs_root *root, 7067 struct btrfs_path *path, 7068 struct walk_control *wc) 7069 { 7070 int level = wc->level; 7071 int lookup_info = 1; 7072 int ret; 7073 7074 while (level >= 0) { 7075 ret = walk_down_proc(trans, root, path, wc, lookup_info); 7076 if (ret > 0) 7077 break; 7078 7079 if (level == 0) 7080 break; 7081 7082 if (path->slots[level] >= 7083 
btrfs_header_nritems(path->nodes[level])) 7084 break; 7085 7086 ret = do_walk_down(trans, root, path, wc, &lookup_info); 7087 if (ret > 0) { 7088 path->slots[level]++; 7089 continue; 7090 } else if (ret < 0) 7091 return ret; 7092 level = wc->level; 7093 } 7094 return 0; 7095 } 7096 7097 static noinline int walk_up_tree(struct btrfs_trans_handle *trans, 7098 struct btrfs_root *root, 7099 struct btrfs_path *path, 7100 struct walk_control *wc, int max_level) 7101 { 7102 int level = wc->level; 7103 int ret; 7104 7105 path->slots[level] = btrfs_header_nritems(path->nodes[level]); 7106 while (level < max_level && path->nodes[level]) { 7107 wc->level = level; 7108 if (path->slots[level] + 1 < 7109 btrfs_header_nritems(path->nodes[level])) { 7110 path->slots[level]++; 7111 return 0; 7112 } else { 7113 ret = walk_up_proc(trans, root, path, wc); 7114 if (ret > 0) 7115 return 0; 7116 7117 if (path->locks[level]) { 7118 btrfs_tree_unlock_rw(path->nodes[level], 7119 path->locks[level]); 7120 path->locks[level] = 0; 7121 } 7122 free_extent_buffer(path->nodes[level]); 7123 path->nodes[level] = NULL; 7124 level++; 7125 } 7126 } 7127 return 1; 7128 } 7129 7130 /* 7131 * drop a subvolume tree. 7132 * 7133 * this function traverses the tree freeing any blocks that only 7134 * referenced by the tree. 7135 * 7136 * when a shared tree block is found. this function decreases its 7137 * reference count by one. if update_ref is true, this function 7138 * also make sure backrefs for the shared block and all lower level 7139 * blocks are properly updated. 7140 */ 7141 int btrfs_drop_snapshot(struct btrfs_root *root, 7142 struct btrfs_block_rsv *block_rsv, int update_ref, 7143 int for_reloc) 7144 { 7145 struct btrfs_path *path; 7146 struct btrfs_trans_handle *trans; 7147 struct btrfs_root *tree_root = root->fs_info->tree_root; 7148 struct btrfs_root_item *root_item = &root->root_item; 7149 struct walk_control *wc; 7150 struct btrfs_key key; 7151 int err = 0; 7152 int ret; 7153 int level; 7154 7155 path = btrfs_alloc_path(); 7156 if (!path) { 7157 err = -ENOMEM; 7158 goto out; 7159 } 7160 7161 wc = kzalloc(sizeof(*wc), GFP_NOFS); 7162 if (!wc) { 7163 btrfs_free_path(path); 7164 err = -ENOMEM; 7165 goto out; 7166 } 7167 7168 trans = btrfs_start_transaction(tree_root, 0); 7169 if (IS_ERR(trans)) { 7170 err = PTR_ERR(trans); 7171 goto out_free; 7172 } 7173 7174 if (block_rsv) 7175 trans->block_rsv = block_rsv; 7176 7177 if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) { 7178 level = btrfs_header_level(root->node); 7179 path->nodes[level] = btrfs_lock_root_node(root); 7180 btrfs_set_lock_blocking(path->nodes[level]); 7181 path->slots[level] = 0; 7182 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; 7183 memset(&wc->update_progress, 0, 7184 sizeof(wc->update_progress)); 7185 } else { 7186 btrfs_disk_key_to_cpu(&key, &root_item->drop_progress); 7187 memcpy(&wc->update_progress, &key, 7188 sizeof(wc->update_progress)); 7189 7190 level = root_item->drop_level; 7191 BUG_ON(level == 0); 7192 path->lowest_level = level; 7193 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 7194 path->lowest_level = 0; 7195 if (ret < 0) { 7196 err = ret; 7197 goto out_end_trans; 7198 } 7199 WARN_ON(ret > 0); 7200 7201 /* 7202 * unlock our path, this is safe because only this 7203 * function is allowed to delete this snapshot 7204 */ 7205 btrfs_unlock_up_safe(path, 0); 7206 7207 level = btrfs_header_level(root->node); 7208 while (1) { 7209 btrfs_tree_lock(path->nodes[level]); 7210 btrfs_set_lock_blocking(path->nodes[level]); 7211 
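			/*
			 * Reload this node's refcount and flags into the
			 * walk_control; we are resuming an interrupted drop
			 * and need the same state a fresh walk down to
			 * drop_level would have built up.
			 */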
7212 ret = btrfs_lookup_extent_info(trans, root, 7213 path->nodes[level]->start, 7214 path->nodes[level]->len, 7215 &wc->refs[level], 7216 &wc->flags[level]); 7217 if (ret < 0) { 7218 err = ret; 7219 goto out_end_trans; 7220 } 7221 BUG_ON(wc->refs[level] == 0); 7222 7223 if (level == root_item->drop_level) 7224 break; 7225 7226 btrfs_tree_unlock(path->nodes[level]); 7227 WARN_ON(wc->refs[level] != 1); 7228 level--; 7229 } 7230 } 7231 7232 wc->level = level; 7233 wc->shared_level = -1; 7234 wc->stage = DROP_REFERENCE; 7235 wc->update_ref = update_ref; 7236 wc->keep_locks = 0; 7237 wc->for_reloc = for_reloc; 7238 wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root); 7239 7240 while (1) { 7241 ret = walk_down_tree(trans, root, path, wc); 7242 if (ret < 0) { 7243 err = ret; 7244 break; 7245 } 7246 7247 ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL); 7248 if (ret < 0) { 7249 err = ret; 7250 break; 7251 } 7252 7253 if (ret > 0) { 7254 BUG_ON(wc->stage != DROP_REFERENCE); 7255 break; 7256 } 7257 7258 if (wc->stage == DROP_REFERENCE) { 7259 level = wc->level; 7260 btrfs_node_key(path->nodes[level], 7261 &root_item->drop_progress, 7262 path->slots[level]); 7263 root_item->drop_level = level; 7264 } 7265 7266 BUG_ON(wc->level == 0); 7267 if (btrfs_should_end_transaction(trans, tree_root)) { 7268 ret = btrfs_update_root(trans, tree_root, 7269 &root->root_key, 7270 root_item); 7271 if (ret) { 7272 btrfs_abort_transaction(trans, tree_root, ret); 7273 err = ret; 7274 goto out_end_trans; 7275 } 7276 7277 btrfs_end_transaction_throttle(trans, tree_root); 7278 trans = btrfs_start_transaction(tree_root, 0); 7279 if (IS_ERR(trans)) { 7280 err = PTR_ERR(trans); 7281 goto out_free; 7282 } 7283 if (block_rsv) 7284 trans->block_rsv = block_rsv; 7285 } 7286 } 7287 btrfs_release_path(path); 7288 if (err) 7289 goto out_end_trans; 7290 7291 ret = btrfs_del_root(trans, tree_root, &root->root_key); 7292 if (ret) { 7293 btrfs_abort_transaction(trans, tree_root, ret); 7294 goto out_end_trans; 7295 } 7296 7297 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) { 7298 ret = btrfs_find_last_root(tree_root, root->root_key.objectid, 7299 NULL, NULL); 7300 if (ret < 0) { 7301 btrfs_abort_transaction(trans, tree_root, ret); 7302 err = ret; 7303 goto out_end_trans; 7304 } else if (ret > 0) { 7305 /* if we fail to delete the orphan item this time 7306 * around, it'll get picked up the next time. 7307 * 7308 * The most common failure here is just -ENOENT. 7309 */ 7310 btrfs_del_orphan_item(trans, tree_root, 7311 root->root_key.objectid); 7312 } 7313 } 7314 7315 if (root->in_radix) { 7316 btrfs_free_fs_root(tree_root->fs_info, root); 7317 } else { 7318 free_extent_buffer(root->node); 7319 free_extent_buffer(root->commit_root); 7320 kfree(root); 7321 } 7322 out_end_trans: 7323 btrfs_end_transaction_throttle(trans, tree_root); 7324 out_free: 7325 kfree(wc); 7326 btrfs_free_path(path); 7327 out: 7328 if (err) 7329 btrfs_std_error(root->fs_info, err); 7330 return err; 7331 } 7332 7333 /* 7334 * drop subtree rooted at tree block 'node'. 
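 *
 * The caller must hold tree locks on both 'parent' and 'node'
 * (asserted below) and 'root' has to be the tree reloc root.
 * Purely as an illustration, with both buffers already locked:
 *
 *	ret = btrfs_drop_subtree(trans, root, node, parent);
 *
 * 'parent' stays locked and referenced by the caller afterwards.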
7335 * 7336 * NOTE: this function will unlock and release tree block 'node' 7337 * only used by relocation code 7338 */ 7339 int btrfs_drop_subtree(struct btrfs_trans_handle *trans, 7340 struct btrfs_root *root, 7341 struct extent_buffer *node, 7342 struct extent_buffer *parent) 7343 { 7344 struct btrfs_path *path; 7345 struct walk_control *wc; 7346 int level; 7347 int parent_level; 7348 int ret = 0; 7349 int wret; 7350 7351 BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID); 7352 7353 path = btrfs_alloc_path(); 7354 if (!path) 7355 return -ENOMEM; 7356 7357 wc = kzalloc(sizeof(*wc), GFP_NOFS); 7358 if (!wc) { 7359 btrfs_free_path(path); 7360 return -ENOMEM; 7361 } 7362 7363 btrfs_assert_tree_locked(parent); 7364 parent_level = btrfs_header_level(parent); 7365 extent_buffer_get(parent); 7366 path->nodes[parent_level] = parent; 7367 path->slots[parent_level] = btrfs_header_nritems(parent); 7368 7369 btrfs_assert_tree_locked(node); 7370 level = btrfs_header_level(node); 7371 path->nodes[level] = node; 7372 path->slots[level] = 0; 7373 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; 7374 7375 wc->refs[parent_level] = 1; 7376 wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF; 7377 wc->level = level; 7378 wc->shared_level = -1; 7379 wc->stage = DROP_REFERENCE; 7380 wc->update_ref = 0; 7381 wc->keep_locks = 1; 7382 wc->for_reloc = 1; 7383 wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root); 7384 7385 while (1) { 7386 wret = walk_down_tree(trans, root, path, wc); 7387 if (wret < 0) { 7388 ret = wret; 7389 break; 7390 } 7391 7392 wret = walk_up_tree(trans, root, path, wc, parent_level); 7393 if (wret < 0) 7394 ret = wret; 7395 if (wret != 0) 7396 break; 7397 } 7398 7399 kfree(wc); 7400 btrfs_free_path(path); 7401 return ret; 7402 } 7403 7404 static u64 update_block_group_flags(struct btrfs_root *root, u64 flags) 7405 { 7406 u64 num_devices; 7407 u64 stripped; 7408 7409 /* 7410 * if restripe for this chunk_type is on pick target profile and 7411 * return, otherwise do the usual balance 7412 */ 7413 stripped = get_restripe_target(root->fs_info, flags); 7414 if (stripped) 7415 return extended_to_chunk(stripped); 7416 7417 /* 7418 * we add in the count of missing devices because we want 7419 * to make sure that any RAID levels on a degraded FS 7420 * continue to be honored. 
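	 *
	 * For example (illustrative numbers): a two device raid1 fs
	 * mounted degraded with one device missing has rw_devices == 1
	 * and missing_devices == 1, so num_devices is still 2 and the
	 * raid1 profile is kept instead of being collapsed to DUP below.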
7421 */ 7422 num_devices = root->fs_info->fs_devices->rw_devices + 7423 root->fs_info->fs_devices->missing_devices; 7424 7425 stripped = BTRFS_BLOCK_GROUP_RAID0 | 7426 BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6 | 7427 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10; 7428 7429 if (num_devices == 1) { 7430 stripped |= BTRFS_BLOCK_GROUP_DUP; 7431 stripped = flags & ~stripped; 7432 7433 /* turn raid0 into single device chunks */ 7434 if (flags & BTRFS_BLOCK_GROUP_RAID0) 7435 return stripped; 7436 7437 /* turn mirroring into duplication */ 7438 if (flags & (BTRFS_BLOCK_GROUP_RAID1 | 7439 BTRFS_BLOCK_GROUP_RAID10)) 7440 return stripped | BTRFS_BLOCK_GROUP_DUP; 7441 } else { 7442 /* they already had raid on here, just return */ 7443 if (flags & stripped) 7444 return flags; 7445 7446 stripped |= BTRFS_BLOCK_GROUP_DUP; 7447 stripped = flags & ~stripped; 7448 7449 /* switch duplicated blocks with raid1 */ 7450 if (flags & BTRFS_BLOCK_GROUP_DUP) 7451 return stripped | BTRFS_BLOCK_GROUP_RAID1; 7452 7453 /* this is drive concat, leave it alone */ 7454 } 7455 7456 return flags; 7457 } 7458 7459 static int set_block_group_ro(struct btrfs_block_group_cache *cache, int force) 7460 { 7461 struct btrfs_space_info *sinfo = cache->space_info; 7462 u64 num_bytes; 7463 u64 min_allocable_bytes; 7464 int ret = -ENOSPC; 7465 7466 7467 /* 7468 * We need some metadata space and system metadata space for 7469 * allocating chunks in some corner cases until we force to set 7470 * it to be readonly. 7471 */ 7472 if ((sinfo->flags & 7473 (BTRFS_BLOCK_GROUP_SYSTEM | BTRFS_BLOCK_GROUP_METADATA)) && 7474 !force) 7475 min_allocable_bytes = 1 * 1024 * 1024; 7476 else 7477 min_allocable_bytes = 0; 7478 7479 spin_lock(&sinfo->lock); 7480 spin_lock(&cache->lock); 7481 7482 if (cache->ro) { 7483 ret = 0; 7484 goto out; 7485 } 7486 7487 num_bytes = cache->key.offset - cache->reserved - cache->pinned - 7488 cache->bytes_super - btrfs_block_group_used(&cache->item); 7489 7490 if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned + 7491 sinfo->bytes_may_use + sinfo->bytes_readonly + num_bytes + 7492 min_allocable_bytes <= sinfo->total_bytes) { 7493 sinfo->bytes_readonly += num_bytes; 7494 cache->ro = 1; 7495 ret = 0; 7496 } 7497 out: 7498 spin_unlock(&cache->lock); 7499 spin_unlock(&sinfo->lock); 7500 return ret; 7501 } 7502 7503 int btrfs_set_block_group_ro(struct btrfs_root *root, 7504 struct btrfs_block_group_cache *cache) 7505 7506 { 7507 struct btrfs_trans_handle *trans; 7508 u64 alloc_flags; 7509 int ret; 7510 7511 BUG_ON(cache->ro); 7512 7513 trans = btrfs_join_transaction(root); 7514 if (IS_ERR(trans)) 7515 return PTR_ERR(trans); 7516 7517 alloc_flags = update_block_group_flags(root, cache->flags); 7518 if (alloc_flags != cache->flags) { 7519 ret = do_chunk_alloc(trans, root, alloc_flags, 7520 CHUNK_ALLOC_FORCE); 7521 if (ret < 0) 7522 goto out; 7523 } 7524 7525 ret = set_block_group_ro(cache, 0); 7526 if (!ret) 7527 goto out; 7528 alloc_flags = get_alloc_profile(root, cache->space_info->flags); 7529 ret = do_chunk_alloc(trans, root, alloc_flags, 7530 CHUNK_ALLOC_FORCE); 7531 if (ret < 0) 7532 goto out; 7533 ret = set_block_group_ro(cache, 0); 7534 out: 7535 btrfs_end_transaction(trans, root); 7536 return ret; 7537 } 7538 7539 int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, 7540 struct btrfs_root *root, u64 type) 7541 { 7542 u64 alloc_flags = get_alloc_profile(root, type); 7543 return do_chunk_alloc(trans, root, alloc_flags, 7544 CHUNK_ALLOC_FORCE); 7545 } 7546 7547 /* 7548 * helper to 
account the unused space of all the readonly block group in the 7549 * list. takes mirrors into account. 7550 */ 7551 static u64 __btrfs_get_ro_block_group_free_space(struct list_head *groups_list) 7552 { 7553 struct btrfs_block_group_cache *block_group; 7554 u64 free_bytes = 0; 7555 int factor; 7556 7557 list_for_each_entry(block_group, groups_list, list) { 7558 spin_lock(&block_group->lock); 7559 7560 if (!block_group->ro) { 7561 spin_unlock(&block_group->lock); 7562 continue; 7563 } 7564 7565 if (block_group->flags & (BTRFS_BLOCK_GROUP_RAID1 | 7566 BTRFS_BLOCK_GROUP_RAID10 | 7567 BTRFS_BLOCK_GROUP_DUP)) 7568 factor = 2; 7569 else 7570 factor = 1; 7571 7572 free_bytes += (block_group->key.offset - 7573 btrfs_block_group_used(&block_group->item)) * 7574 factor; 7575 7576 spin_unlock(&block_group->lock); 7577 } 7578 7579 return free_bytes; 7580 } 7581 7582 /* 7583 * helper to account the unused space of all the readonly block group in the 7584 * space_info. takes mirrors into account. 7585 */ 7586 u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo) 7587 { 7588 int i; 7589 u64 free_bytes = 0; 7590 7591 spin_lock(&sinfo->lock); 7592 7593 for(i = 0; i < BTRFS_NR_RAID_TYPES; i++) 7594 if (!list_empty(&sinfo->block_groups[i])) 7595 free_bytes += __btrfs_get_ro_block_group_free_space( 7596 &sinfo->block_groups[i]); 7597 7598 spin_unlock(&sinfo->lock); 7599 7600 return free_bytes; 7601 } 7602 7603 void btrfs_set_block_group_rw(struct btrfs_root *root, 7604 struct btrfs_block_group_cache *cache) 7605 { 7606 struct btrfs_space_info *sinfo = cache->space_info; 7607 u64 num_bytes; 7608 7609 BUG_ON(!cache->ro); 7610 7611 spin_lock(&sinfo->lock); 7612 spin_lock(&cache->lock); 7613 num_bytes = cache->key.offset - cache->reserved - cache->pinned - 7614 cache->bytes_super - btrfs_block_group_used(&cache->item); 7615 sinfo->bytes_readonly -= num_bytes; 7616 cache->ro = 0; 7617 spin_unlock(&cache->lock); 7618 spin_unlock(&sinfo->lock); 7619 } 7620 7621 /* 7622 * checks to see if its even possible to relocate this block group. 7623 * 7624 * @return - -1 if it's not a good idea to relocate this block group, 0 if its 7625 * ok to go ahead and try. 7626 */ 7627 int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr) 7628 { 7629 struct btrfs_block_group_cache *block_group; 7630 struct btrfs_space_info *space_info; 7631 struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; 7632 struct btrfs_device *device; 7633 u64 min_free; 7634 u64 dev_min = 1; 7635 u64 dev_nr = 0; 7636 u64 target; 7637 int index; 7638 int full = 0; 7639 int ret = 0; 7640 7641 block_group = btrfs_lookup_block_group(root->fs_info, bytenr); 7642 7643 /* odd, couldn't find the block group, leave it alone */ 7644 if (!block_group) 7645 return -1; 7646 7647 min_free = btrfs_block_group_used(&block_group->item); 7648 7649 /* no bytes used, we're good */ 7650 if (!min_free) 7651 goto out; 7652 7653 space_info = block_group->space_info; 7654 spin_lock(&space_info->lock); 7655 7656 full = space_info->full; 7657 7658 /* 7659 * if this is the last block group we have in this space, we can't 7660 * relocate it unless we're able to allocate a new chunk below. 7661 * 7662 * Otherwise, we need to make sure we have room in the space to handle 7663 * all of the extents from this block group. 
If we can, we're good 7664 */ 7665 if ((space_info->total_bytes != block_group->key.offset) && 7666 (space_info->bytes_used + space_info->bytes_reserved + 7667 space_info->bytes_pinned + space_info->bytes_readonly + 7668 min_free < space_info->total_bytes)) { 7669 spin_unlock(&space_info->lock); 7670 goto out; 7671 } 7672 spin_unlock(&space_info->lock); 7673 7674 /* 7675 * ok we don't have enough space, but maybe we have free space on our 7676 * devices to allocate new chunks for relocation, so loop through our 7677 * alloc devices and guess if we have enough space. if this block 7678 * group is going to be restriped, run checks against the target 7679 * profile instead of the current one. 7680 */ 7681 ret = -1; 7682 7683 /* 7684 * index: 7685 * 0: raid10 7686 * 1: raid1 7687 * 2: dup 7688 * 3: raid0 7689 * 4: single 7690 */ 7691 target = get_restripe_target(root->fs_info, block_group->flags); 7692 if (target) { 7693 index = __get_raid_index(extended_to_chunk(target)); 7694 } else { 7695 /* 7696 * this is just a balance, so if we were marked as full 7697 * we know there is no space for a new chunk 7698 */ 7699 if (full) 7700 goto out; 7701 7702 index = get_block_group_index(block_group); 7703 } 7704 7705 if (index == BTRFS_RAID_RAID10) { 7706 dev_min = 4; 7707 /* Divide by 2 */ 7708 min_free >>= 1; 7709 } else if (index == BTRFS_RAID_RAID1) { 7710 dev_min = 2; 7711 } else if (index == BTRFS_RAID_DUP) { 7712 /* Multiply by 2 */ 7713 min_free <<= 1; 7714 } else if (index == BTRFS_RAID_RAID0) { 7715 dev_min = fs_devices->rw_devices; 7716 do_div(min_free, dev_min); 7717 } 7718 7719 mutex_lock(&root->fs_info->chunk_mutex); 7720 list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) { 7721 u64 dev_offset; 7722 7723 /* 7724 * check to make sure we can actually find a chunk with enough 7725 * space to fit our block group in. 
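		 *
		 * (min_free and dev_min were already scaled for the profile
		 * above.  Illustrative numbers: relocating a raid10 group
		 * with 1GiB used means min_free was halved to 512MiB and
		 * dev_min is 4, so we need four devices that can each fit a
		 * 512MiB dev extent; a DUP group with 1GiB used needs one
		 * device with room for 2GiB.)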
7726 */ 7727 if (device->total_bytes > device->bytes_used + min_free && 7728 !device->is_tgtdev_for_dev_replace) { 7729 ret = find_free_dev_extent(device, min_free, 7730 &dev_offset, NULL); 7731 if (!ret) 7732 dev_nr++; 7733 7734 if (dev_nr >= dev_min) 7735 break; 7736 7737 ret = -1; 7738 } 7739 } 7740 mutex_unlock(&root->fs_info->chunk_mutex); 7741 out: 7742 btrfs_put_block_group(block_group); 7743 return ret; 7744 } 7745 7746 static int find_first_block_group(struct btrfs_root *root, 7747 struct btrfs_path *path, struct btrfs_key *key) 7748 { 7749 int ret = 0; 7750 struct btrfs_key found_key; 7751 struct extent_buffer *leaf; 7752 int slot; 7753 7754 ret = btrfs_search_slot(NULL, root, key, path, 0, 0); 7755 if (ret < 0) 7756 goto out; 7757 7758 while (1) { 7759 slot = path->slots[0]; 7760 leaf = path->nodes[0]; 7761 if (slot >= btrfs_header_nritems(leaf)) { 7762 ret = btrfs_next_leaf(root, path); 7763 if (ret == 0) 7764 continue; 7765 if (ret < 0) 7766 goto out; 7767 break; 7768 } 7769 btrfs_item_key_to_cpu(leaf, &found_key, slot); 7770 7771 if (found_key.objectid >= key->objectid && 7772 found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) { 7773 ret = 0; 7774 goto out; 7775 } 7776 path->slots[0]++; 7777 } 7778 out: 7779 return ret; 7780 } 7781 7782 void btrfs_put_block_group_cache(struct btrfs_fs_info *info) 7783 { 7784 struct btrfs_block_group_cache *block_group; 7785 u64 last = 0; 7786 7787 while (1) { 7788 struct inode *inode; 7789 7790 block_group = btrfs_lookup_first_block_group(info, last); 7791 while (block_group) { 7792 spin_lock(&block_group->lock); 7793 if (block_group->iref) 7794 break; 7795 spin_unlock(&block_group->lock); 7796 block_group = next_block_group(info->tree_root, 7797 block_group); 7798 } 7799 if (!block_group) { 7800 if (last == 0) 7801 break; 7802 last = 0; 7803 continue; 7804 } 7805 7806 inode = block_group->inode; 7807 block_group->iref = 0; 7808 block_group->inode = NULL; 7809 spin_unlock(&block_group->lock); 7810 iput(inode); 7811 last = block_group->key.objectid + block_group->key.offset; 7812 btrfs_put_block_group(block_group); 7813 } 7814 } 7815 7816 int btrfs_free_block_groups(struct btrfs_fs_info *info) 7817 { 7818 struct btrfs_block_group_cache *block_group; 7819 struct btrfs_space_info *space_info; 7820 struct btrfs_caching_control *caching_ctl; 7821 struct rb_node *n; 7822 7823 down_write(&info->extent_commit_sem); 7824 while (!list_empty(&info->caching_block_groups)) { 7825 caching_ctl = list_entry(info->caching_block_groups.next, 7826 struct btrfs_caching_control, list); 7827 list_del(&caching_ctl->list); 7828 put_caching_control(caching_ctl); 7829 } 7830 up_write(&info->extent_commit_sem); 7831 7832 spin_lock(&info->block_group_cache_lock); 7833 while ((n = rb_last(&info->block_group_cache_tree)) != NULL) { 7834 block_group = rb_entry(n, struct btrfs_block_group_cache, 7835 cache_node); 7836 rb_erase(&block_group->cache_node, 7837 &info->block_group_cache_tree); 7838 spin_unlock(&info->block_group_cache_lock); 7839 7840 down_write(&block_group->space_info->groups_sem); 7841 list_del(&block_group->list); 7842 up_write(&block_group->space_info->groups_sem); 7843 7844 if (block_group->cached == BTRFS_CACHE_STARTED) 7845 wait_block_group_cache_done(block_group); 7846 7847 /* 7848 * We haven't cached this block group, which means we could 7849 * possibly have excluded extents on this block group. 
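		 *
		 * (The excluded ranges are set up by exclude_super_stripes()
		 * when the group is read in -- see btrfs_read_block_groups()
		 * below -- and are normally dropped once caching completes,
		 * so the BTRFS_CACHE_NO case is the one that can still be
		 * holding them at unmount time.)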
void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
{
	struct btrfs_block_group_cache *block_group;
	u64 last = 0;

	while (1) {
		struct inode *inode;

		block_group = btrfs_lookup_first_block_group(info, last);
		while (block_group) {
			spin_lock(&block_group->lock);
			if (block_group->iref)
				break;
			spin_unlock(&block_group->lock);
			block_group = next_block_group(info->tree_root,
						       block_group);
		}
		if (!block_group) {
			if (last == 0)
				break;
			last = 0;
			continue;
		}

		inode = block_group->inode;
		block_group->iref = 0;
		block_group->inode = NULL;
		spin_unlock(&block_group->lock);
		iput(inode);
		last = block_group->key.objectid + block_group->key.offset;
		btrfs_put_block_group(block_group);
	}
}

int btrfs_free_block_groups(struct btrfs_fs_info *info)
{
	struct btrfs_block_group_cache *block_group;
	struct btrfs_space_info *space_info;
	struct btrfs_caching_control *caching_ctl;
	struct rb_node *n;

	down_write(&info->extent_commit_sem);
	while (!list_empty(&info->caching_block_groups)) {
		caching_ctl = list_entry(info->caching_block_groups.next,
					 struct btrfs_caching_control, list);
		list_del(&caching_ctl->list);
		put_caching_control(caching_ctl);
	}
	up_write(&info->extent_commit_sem);

	spin_lock(&info->block_group_cache_lock);
	while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
		block_group = rb_entry(n, struct btrfs_block_group_cache,
				       cache_node);
		rb_erase(&block_group->cache_node,
			 &info->block_group_cache_tree);
		spin_unlock(&info->block_group_cache_lock);

		down_write(&block_group->space_info->groups_sem);
		list_del(&block_group->list);
		up_write(&block_group->space_info->groups_sem);

		if (block_group->cached == BTRFS_CACHE_STARTED)
			wait_block_group_cache_done(block_group);

		/*
		 * We haven't cached this block group, which means we could
		 * possibly have excluded extents on this block group.
		 */
		if (block_group->cached == BTRFS_CACHE_NO)
			free_excluded_extents(info->extent_root, block_group);

		btrfs_remove_free_space_cache(block_group);
		btrfs_put_block_group(block_group);

		spin_lock(&info->block_group_cache_lock);
	}
	spin_unlock(&info->block_group_cache_lock);

	/*
	 * Now that all the block groups are freed, go through and free all
	 * the space_info structs.  This is only called during the final
	 * stages of unmount, and so we know nobody is using them.  We call
	 * synchronize_rcu() once before we start, just to be on the safe
	 * side.
	 */
	synchronize_rcu();

	release_global_block_rsv(info);

	while (!list_empty(&info->space_info)) {
		space_info = list_entry(info->space_info.next,
					struct btrfs_space_info,
					list);
		if (btrfs_test_opt(info->tree_root, ENOSPC_DEBUG)) {
			if (space_info->bytes_pinned > 0 ||
			    space_info->bytes_reserved > 0 ||
			    space_info->bytes_may_use > 0) {
				WARN_ON(1);
				dump_space_info(space_info, 0, 0);
			}
		}
		list_del(&space_info->list);
		kfree(space_info);
	}
	return 0;
}

static void __link_block_group(struct btrfs_space_info *space_info,
			       struct btrfs_block_group_cache *cache)
{
	int index = get_block_group_index(cache);

	down_write(&space_info->groups_sem);
	list_add_tail(&cache->list, &space_info->block_groups[index]);
	up_write(&space_info->groups_sem);
}
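/*
 * Note (added for clarity, not in the original source): the block_groups[]
 * index used above is the raid index documented earlier in this file
 * (0: raid10, 1: raid1, 2: dup, 3: raid0, 4: single).  For example, a DUP
 * metadata block group ends up on space_info->block_groups[2], and
 * btrfs_read_block_groups() below walks lists [3] and [4] to find the
 * un-mirrored groups.
 */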
int btrfs_read_block_groups(struct btrfs_root *root)
{
	struct btrfs_path *path;
	int ret;
	struct btrfs_block_group_cache *cache;
	struct btrfs_fs_info *info = root->fs_info;
	struct btrfs_space_info *space_info;
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct extent_buffer *leaf;
	int need_clear = 0;
	u64 cache_gen;

	root = info->extent_root;
	key.objectid = 0;
	key.offset = 0;
	btrfs_set_key_type(&key, BTRFS_BLOCK_GROUP_ITEM_KEY);
	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	path->reada = 1;

	cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy);
	if (btrfs_test_opt(root, SPACE_CACHE) &&
	    btrfs_super_generation(root->fs_info->super_copy) != cache_gen)
		need_clear = 1;
	if (btrfs_test_opt(root, CLEAR_CACHE))
		need_clear = 1;

	while (1) {
		ret = find_first_block_group(root, path, &key);
		if (ret > 0)
			break;
		if (ret != 0)
			goto error;
		leaf = path->nodes[0];
		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
		cache = kzalloc(sizeof(*cache), GFP_NOFS);
		if (!cache) {
			ret = -ENOMEM;
			goto error;
		}
		cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
						GFP_NOFS);
		if (!cache->free_space_ctl) {
			kfree(cache);
			ret = -ENOMEM;
			goto error;
		}

		atomic_set(&cache->count, 1);
		spin_lock_init(&cache->lock);
		cache->fs_info = info;
		INIT_LIST_HEAD(&cache->list);
		INIT_LIST_HEAD(&cache->cluster_list);

		if (need_clear) {
			/*
			 * When we mount with an old space cache, we need to
			 * set BTRFS_DC_CLEAR and set the dirty flag.
			 *
			 * a) Setting 'BTRFS_DC_CLEAR' makes sure that we
			 *    truncate the old free space cache inode and
			 *    set up a new one.
			 * b) Setting the 'dirty' flag makes sure that we
			 *    flush the new space cache info onto disk.
			 */
			cache->disk_cache_state = BTRFS_DC_CLEAR;
			if (btrfs_test_opt(root, SPACE_CACHE))
				cache->dirty = 1;
		}

		read_extent_buffer(leaf, &cache->item,
				   btrfs_item_ptr_offset(leaf, path->slots[0]),
				   sizeof(cache->item));
		memcpy(&cache->key, &found_key, sizeof(found_key));

		key.objectid = found_key.objectid + found_key.offset;
		btrfs_release_path(path);
		cache->flags = btrfs_block_group_flags(&cache->item);
		cache->sectorsize = root->sectorsize;
		cache->full_stripe_len = btrfs_full_stripe_len(root,
					       &root->fs_info->mapping_tree,
					       found_key.objectid);
		btrfs_init_free_space_ctl(cache);

		/*
		 * We need to exclude the super stripes now so that the space
		 * info has super bytes accounted for, otherwise we'll think
		 * we have more space than we actually do.
		 */
		ret = exclude_super_stripes(root, cache);
		if (ret) {
			/*
			 * We may have excluded something, so call this just
			 * in case.
			 */
			free_excluded_extents(root, cache);
			kfree(cache->free_space_ctl);
			kfree(cache);
			goto error;
		}

		/*
		 * Check for two cases here: either we are full, and therefore
		 * don't need to bother with the caching work since we won't
		 * find any space; or we are empty, and we can just add all
		 * the space in and be done with it.  This saves us a lot of
		 * time, particularly in the full case.
		 */
		if (found_key.offset == btrfs_block_group_used(&cache->item)) {
			cache->last_byte_to_unpin = (u64)-1;
			cache->cached = BTRFS_CACHE_FINISHED;
			free_excluded_extents(root, cache);
		} else if (btrfs_block_group_used(&cache->item) == 0) {
			cache->last_byte_to_unpin = (u64)-1;
			cache->cached = BTRFS_CACHE_FINISHED;
			add_new_free_space(cache, root->fs_info,
					   found_key.objectid,
					   found_key.objectid +
					   found_key.offset);
			free_excluded_extents(root, cache);
		}

		ret = update_space_info(info, cache->flags, found_key.offset,
					btrfs_block_group_used(&cache->item),
					&space_info);
		BUG_ON(ret); /* -ENOMEM */
		cache->space_info = space_info;
		spin_lock(&cache->space_info->lock);
		cache->space_info->bytes_readonly += cache->bytes_super;
		spin_unlock(&cache->space_info->lock);

		__link_block_group(space_info, cache);

		ret = btrfs_add_block_group_cache(root->fs_info, cache);
		BUG_ON(ret); /* Logic error */

		set_avail_alloc_bits(root->fs_info, cache->flags);
		if (btrfs_chunk_readonly(root, cache->key.objectid))
			set_block_group_ro(cache, 1);
	}
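	/*
	 * Illustrative example (not part of the original source): for a 1GiB
	 * block group (found_key.offset == 1GiB), if btrfs_block_group_used()
	 * is also 1GiB the group is completely allocated and caching it would
	 * find no free space, so it is marked BTRFS_CACHE_FINISHED right
	 * away; if used is 0, the whole range [objectid, objectid + 1GiB) is
	 * handed to add_new_free_space() and the caching work is likewise
	 * skipped.
	 */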
	list_for_each_entry_rcu(space_info, &root->fs_info->space_info, list) {
		if (!(get_alloc_profile(root, space_info->flags) &
		      (BTRFS_BLOCK_GROUP_RAID10 |
		       BTRFS_BLOCK_GROUP_RAID1 |
		       BTRFS_BLOCK_GROUP_RAID5 |
		       BTRFS_BLOCK_GROUP_RAID6 |
		       BTRFS_BLOCK_GROUP_DUP)))
			continue;
		/*
		 * Avoid allocating from an un-mirrored block group if there
		 * are mirrored block groups.
		 */
		list_for_each_entry(cache, &space_info->block_groups[3], list)
			set_block_group_ro(cache, 1);
		list_for_each_entry(cache, &space_info->block_groups[4], list)
			set_block_group_ro(cache, 1);
	}

	init_global_block_rsv(info);
	ret = 0;
error:
	btrfs_free_path(path);
	return ret;
}

void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
				       struct btrfs_root *root)
{
	struct btrfs_block_group_cache *block_group, *tmp;
	struct btrfs_root *extent_root = root->fs_info->extent_root;
	struct btrfs_block_group_item item;
	struct btrfs_key key;
	int ret = 0;

	list_for_each_entry_safe(block_group, tmp, &trans->new_bgs,
				 new_bg_list) {
		list_del_init(&block_group->new_bg_list);

		if (ret)
			continue;

		spin_lock(&block_group->lock);
		memcpy(&item, &block_group->item, sizeof(item));
		memcpy(&key, &block_group->key, sizeof(key));
		spin_unlock(&block_group->lock);

		ret = btrfs_insert_item(trans, extent_root, &key, &item,
					sizeof(item));
		if (ret)
			btrfs_abort_transaction(trans, extent_root, ret);
	}
}
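/*
 * Note (added for clarity, not in the original source): block groups created
 * by btrfs_make_block_group() below are only queued on trans->new_bgs; their
 * BLOCK_GROUP_ITEMs reach the extent tree when the function above runs,
 * which is expected to happen as the transaction that created them is being
 * ended/committed.  Once one insertion fails, the remaining groups are still
 * unlinked from the list, but no further items are inserted.
 */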
int btrfs_make_block_group(struct btrfs_trans_handle *trans,
			   struct btrfs_root *root, u64 bytes_used,
			   u64 type, u64 chunk_objectid, u64 chunk_offset,
			   u64 size)
{
	int ret;
	struct btrfs_root *extent_root;
	struct btrfs_block_group_cache *cache;

	extent_root = root->fs_info->extent_root;

	root->fs_info->last_trans_log_full_commit = trans->transid;

	cache = kzalloc(sizeof(*cache), GFP_NOFS);
	if (!cache)
		return -ENOMEM;
	cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
					GFP_NOFS);
	if (!cache->free_space_ctl) {
		kfree(cache);
		return -ENOMEM;
	}

	cache->key.objectid = chunk_offset;
	cache->key.offset = size;
	cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
	cache->sectorsize = root->sectorsize;
	cache->fs_info = root->fs_info;
	cache->full_stripe_len = btrfs_full_stripe_len(root,
					       &root->fs_info->mapping_tree,
					       chunk_offset);

	atomic_set(&cache->count, 1);
	spin_lock_init(&cache->lock);
	INIT_LIST_HEAD(&cache->list);
	INIT_LIST_HEAD(&cache->cluster_list);
	INIT_LIST_HEAD(&cache->new_bg_list);

	btrfs_init_free_space_ctl(cache);

	btrfs_set_block_group_used(&cache->item, bytes_used);
	btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid);
	cache->flags = type;
	btrfs_set_block_group_flags(&cache->item, type);

	cache->last_byte_to_unpin = (u64)-1;
	cache->cached = BTRFS_CACHE_FINISHED;
	ret = exclude_super_stripes(root, cache);
	if (ret) {
		/*
		 * We may have excluded something, so call this just in
		 * case.
		 */
		free_excluded_extents(root, cache);
		kfree(cache->free_space_ctl);
		kfree(cache);
		return ret;
	}

	add_new_free_space(cache, root->fs_info, chunk_offset,
			   chunk_offset + size);

	free_excluded_extents(root, cache);

	ret = update_space_info(root->fs_info, cache->flags, size, bytes_used,
				&cache->space_info);
	BUG_ON(ret); /* -ENOMEM */
	update_global_block_rsv(root->fs_info);

	spin_lock(&cache->space_info->lock);
	cache->space_info->bytes_readonly += cache->bytes_super;
	spin_unlock(&cache->space_info->lock);

	__link_block_group(cache->space_info, cache);

	ret = btrfs_add_block_group_cache(root->fs_info, cache);
	BUG_ON(ret); /* Logic error */

	list_add_tail(&cache->new_bg_list, &trans->new_bgs);

	set_avail_alloc_bits(extent_root->fs_info, type);

	return 0;
}
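/*
 * Sketch of a typical call (assumption: the chunk allocator in volumes.c is
 * the caller; the local names are illustrative only):
 *
 *	ret = btrfs_make_block_group(trans, extent_root, 0, type,
 *				     BTRFS_FIRST_CHUNK_TREE_OBJECTID,
 *				     chunk_offset, chunk_size);
 *
 * i.e. a freshly allocated chunk starts out with bytes_used == 0 and gets a
 * block group whose key covers [chunk_offset, chunk_offset + chunk_size).
 */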
static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
{
	u64 extra_flags = chunk_to_extended(flags) &
				BTRFS_EXTENDED_PROFILE_MASK;

	write_seqlock(&fs_info->profiles_lock);
	if (flags & BTRFS_BLOCK_GROUP_DATA)
		fs_info->avail_data_alloc_bits &= ~extra_flags;
	if (flags & BTRFS_BLOCK_GROUP_METADATA)
		fs_info->avail_metadata_alloc_bits &= ~extra_flags;
	if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
		fs_info->avail_system_alloc_bits &= ~extra_flags;
	write_sequnlock(&fs_info->profiles_lock);
}

int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root, u64 group_start)
{
	struct btrfs_path *path;
	struct btrfs_block_group_cache *block_group;
	struct btrfs_free_cluster *cluster;
	struct btrfs_root *tree_root = root->fs_info->tree_root;
	struct btrfs_key key;
	struct inode *inode;
	int ret;
	int index;
	int factor;

	root = root->fs_info->extent_root;

	block_group = btrfs_lookup_block_group(root->fs_info, group_start);
	BUG_ON(!block_group);
	BUG_ON(!block_group->ro);

	/*
	 * Free the reserved super bytes from this block group before
	 * removing it.
	 */
	free_excluded_extents(root, block_group);

	memcpy(&key, &block_group->key, sizeof(key));
	index = get_block_group_index(block_group);
	if (block_group->flags & (BTRFS_BLOCK_GROUP_DUP |
				  BTRFS_BLOCK_GROUP_RAID1 |
				  BTRFS_BLOCK_GROUP_RAID10))
		factor = 2;
	else
		factor = 1;

	/* make sure this block group isn't part of an allocation cluster */
	cluster = &root->fs_info->data_alloc_cluster;
	spin_lock(&cluster->refill_lock);
	btrfs_return_cluster_to_free_space(block_group, cluster);
	spin_unlock(&cluster->refill_lock);

	/*
	 * make sure this block group isn't part of a metadata
	 * allocation cluster
	 */
	cluster = &root->fs_info->meta_alloc_cluster;
	spin_lock(&cluster->refill_lock);
	btrfs_return_cluster_to_free_space(block_group, cluster);
	spin_unlock(&cluster->refill_lock);

	path = btrfs_alloc_path();
	if (!path) {
		ret = -ENOMEM;
		goto out;
	}

	inode = lookup_free_space_inode(tree_root, block_group, path);
	if (!IS_ERR(inode)) {
		ret = btrfs_orphan_add(trans, inode);
		if (ret) {
			btrfs_add_delayed_iput(inode);
			goto out;
		}
		clear_nlink(inode);
		/* One for the block group's ref */
		spin_lock(&block_group->lock);
		if (block_group->iref) {
			block_group->iref = 0;
			block_group->inode = NULL;
			spin_unlock(&block_group->lock);
			iput(inode);
		} else {
			spin_unlock(&block_group->lock);
		}
		/* One for our lookup ref */
		btrfs_add_delayed_iput(inode);
	}

	key.objectid = BTRFS_FREE_SPACE_OBJECTID;
	key.offset = block_group->key.objectid;
	key.type = 0;

	ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
	if (ret < 0)
		goto out;
	if (ret > 0)
		btrfs_release_path(path);
	if (ret == 0) {
		ret = btrfs_del_item(trans, tree_root, path);
		if (ret)
			goto out;
		btrfs_release_path(path);
	}

	spin_lock(&root->fs_info->block_group_cache_lock);
	rb_erase(&block_group->cache_node,
		 &root->fs_info->block_group_cache_tree);

	if (root->fs_info->first_logical_byte == block_group->key.objectid)
		root->fs_info->first_logical_byte = (u64)-1;
	spin_unlock(&root->fs_info->block_group_cache_lock);

	down_write(&block_group->space_info->groups_sem);
	/*
	 * we must use list_del_init so people can check to see if they
	 * are still on the list after taking the semaphore
	 */
	list_del_init(&block_group->list);
	if (list_empty(&block_group->space_info->block_groups[index]))
		clear_avail_alloc_bits(root->fs_info, block_group->flags);
	up_write(&block_group->space_info->groups_sem);

	if (block_group->cached == BTRFS_CACHE_STARTED)
		wait_block_group_cache_done(block_group);

	btrfs_remove_free_space_cache(block_group);

	spin_lock(&block_group->space_info->lock);
	block_group->space_info->total_bytes -= block_group->key.offset;
	block_group->space_info->bytes_readonly -= block_group->key.offset;
	block_group->space_info->disk_total -= block_group->key.offset * factor;
	spin_unlock(&block_group->space_info->lock);

	memcpy(&key, &block_group->key, sizeof(key));

	btrfs_clear_space_info_full(root->fs_info);
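	/*
	 * Note (added for clarity, not in the original source): the two puts
	 * below are intentional.  The first drops the reference taken by
	 * btrfs_lookup_block_group() at the top of this function; the second
	 * drops the reference held on behalf of the block_group_cache_tree
	 * entry that was removed by the rb_erase() above.
	 */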
	btrfs_put_block_group(block_group);
	btrfs_put_block_group(block_group);

	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret > 0)
		ret = -EIO;
	if (ret < 0)
		goto out;

	ret = btrfs_del_item(trans, root, path);
out:
	btrfs_free_path(path);
	return ret;
}

int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
{
	struct btrfs_space_info *space_info;
	struct btrfs_super_block *disk_super;
	u64 features;
	u64 flags;
	int mixed = 0;
	int ret;

	disk_super = fs_info->super_copy;
	if (!btrfs_super_root(disk_super))
		return 1;

	features = btrfs_super_incompat_flags(disk_super);
	if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
		mixed = 1;

	flags = BTRFS_BLOCK_GROUP_SYSTEM;
	ret = update_space_info(fs_info, flags, 0, 0, &space_info);
	if (ret)
		goto out;

	if (mixed) {
		flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA;
		ret = update_space_info(fs_info, flags, 0, 0, &space_info);
	} else {
		flags = BTRFS_BLOCK_GROUP_METADATA;
		ret = update_space_info(fs_info, flags, 0, 0, &space_info);
		if (ret)
			goto out;

		flags = BTRFS_BLOCK_GROUP_DATA;
		ret = update_space_info(fs_info, flags, 0, 0, &space_info);
	}
out:
	return ret;
}

int btrfs_error_unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
{
	return unpin_extent_range(root, start, end);
}

int btrfs_error_discard_extent(struct btrfs_root *root, u64 bytenr,
			       u64 num_bytes, u64 *actual_bytes)
{
	return btrfs_discard_extent(root, bytenr, num_bytes, actual_bytes);
}

int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_block_group_cache *cache = NULL;
	u64 group_trimmed;
	u64 start;
	u64 end;
	u64 trimmed = 0;
	u64 total_bytes = btrfs_super_total_bytes(fs_info->super_copy);
	int ret = 0;

	/*
	 * Try to trim all FS space; our first block group may start at a
	 * non-zero offset.
	 */
	if (range->len == total_bytes)
		cache = btrfs_lookup_first_block_group(fs_info, range->start);
	else
		cache = btrfs_lookup_block_group(fs_info, range->start);

	while (cache) {
		if (cache->key.objectid >= (range->start + range->len)) {
			btrfs_put_block_group(cache);
			break;
		}

		start = max(range->start, cache->key.objectid);
		end = min(range->start + range->len,
			  cache->key.objectid + cache->key.offset);

		if (end - start >= range->minlen) {
			if (!block_group_cache_done(cache)) {
				ret = cache_block_group(cache, 0);
				if (!ret)
					wait_block_group_cache_done(cache);
			}
			ret = btrfs_trim_block_group(cache,
						     &group_trimmed,
						     start,
						     end,
						     range->minlen);

			trimmed += group_trimmed;
			if (ret) {
				btrfs_put_block_group(cache);
				break;
			}
		}

		cache = next_block_group(fs_info->tree_root, cache);
	}

	range->len = trimmed;
	return ret;
}
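/*
 * Sketch of how this is expected to be driven (assumption: the FITRIM ioctl
 * handler is the caller; the initialisation below is illustrative only):
 *
 *	struct fstrim_range range = {
 *		.start	= 0,
 *		.len	= btrfs_super_total_bytes(fs_info->super_copy),
 *		.minlen	= 0,
 *	};
 *
 *	ret = btrfs_trim_fs(root, &range);
 *
 * On return, range.len reports the number of bytes actually trimmed.  Each
 * block group only trims the overlap of [range.start, range.start +
 * range.len) with its own [objectid, objectid + offset) byte range.
 */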