/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 021110-1307, USA.
 */
#include <linux/sched.h>
#include <linux/pagemap.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/sort.h>
#include <linux/rcupdate.h>
#include <linux/kthread.h>
#include <linux/slab.h>
#include <linux/ratelimit.h>
#include <linux/percpu_counter.h>
#include "hash.h"
#include "ctree.h"
#include "disk-io.h"
#include "print-tree.h"
#include "transaction.h"
#include "volumes.h"
#include "raid56.h"
#include "locking.h"
#include "free-space-cache.h"
#include "math.h"

#undef SCRAMBLE_DELAYED_REFS

/*
 * control flags for do_chunk_alloc's force field
 * CHUNK_ALLOC_NO_FORCE means to only allocate a chunk
 * if we really need one.
 *
 * CHUNK_ALLOC_LIMITED means to only try and allocate one
 * if we have very few chunks already allocated.  This is
 * used as part of the clustering code to help make sure
 * we have a good pool of storage to cluster in, without
 * filling the FS with empty chunks
 *
 * CHUNK_ALLOC_FORCE means it must try to allocate one
 *
 */
enum {
	CHUNK_ALLOC_NO_FORCE = 0,
	CHUNK_ALLOC_LIMITED = 1,
	CHUNK_ALLOC_FORCE = 2,
};

/*
 * Control how reservations are dealt with.
 *
 * RESERVE_FREE - freeing a reservation.
 * RESERVE_ALLOC - allocating space and we need to update bytes_may_use for
 *   ENOSPC accounting
 * RESERVE_ALLOC_NO_ACCOUNT - allocating space and we should not update
 *   bytes_may_use as the ENOSPC accounting is done elsewhere
 */
enum {
	RESERVE_FREE = 0,
	RESERVE_ALLOC = 1,
	RESERVE_ALLOC_NO_ACCOUNT = 2,
};

static int update_block_group(struct btrfs_root *root,
			      u64 bytenr, u64 num_bytes, int alloc);
static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root,
			       u64 bytenr, u64 num_bytes, u64 parent,
			       u64 root_objectid, u64 owner_objectid,
			       u64 owner_offset, int refs_to_drop,
			       struct btrfs_delayed_extent_op *extra_op);
static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
				    struct extent_buffer *leaf,
				    struct btrfs_extent_item *ei);
static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root,
				      u64 parent, u64 root_objectid,
				      u64 flags, u64 owner, u64 offset,
				      struct btrfs_key *ins, int ref_mod);
static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
				     struct btrfs_root *root,
				     u64 parent, u64 root_objectid,
				     u64 flags, struct btrfs_disk_key *key,
				     int level, struct btrfs_key *ins);
static int do_chunk_alloc(struct btrfs_trans_handle *trans,
			  struct btrfs_root *extent_root, u64 flags,
			  int force);
static int find_next_key(struct btrfs_path *path, int level,
			 struct btrfs_key *key);
static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
			    int dump_block_groups);
static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
				       u64 num_bytes, int reserve);
static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
			       u64 num_bytes);
int btrfs_pin_extent(struct btrfs_root *root,
		     u64 bytenr, u64 num_bytes, int reserved);

static noinline int
block_group_cache_done(struct btrfs_block_group_cache *cache)
{
	smp_mb();
	return cache->cached == BTRFS_CACHE_FINISHED ||
		cache->cached == BTRFS_CACHE_ERROR;
}

static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
{
	return (cache->flags & bits) == bits;
}

static void btrfs_get_block_group(struct btrfs_block_group_cache *cache)
{
	atomic_inc(&cache->count);
}

void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
{
	if (atomic_dec_and_test(&cache->count)) {
		WARN_ON(cache->pinned > 0);
		WARN_ON(cache->reserved > 0);
		kfree(cache->free_space_ctl);
		kfree(cache);
	}
}

/*
 * this adds the block group to the fs_info rb tree for the block group
 * cache
 */
static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
				struct btrfs_block_group_cache *block_group)
{
	struct rb_node **p;
	struct rb_node *parent = NULL;
	struct btrfs_block_group_cache *cache;

	spin_lock(&info->block_group_cache_lock);
	p = &info->block_group_cache_tree.rb_node;

	while (*p) {
		parent = *p;
		cache = rb_entry(parent, struct btrfs_block_group_cache,
				 cache_node);
		if (block_group->key.objectid < cache->key.objectid) {
			p = &(*p)->rb_left;
		} else if (block_group->key.objectid > cache->key.objectid) {
			p = &(*p)->rb_right;
		} else {
			spin_unlock(&info->block_group_cache_lock);
			return -EEXIST;
		}
	}

	rb_link_node(&block_group->cache_node, parent, p);
	rb_insert_color(&block_group->cache_node,
			&info->block_group_cache_tree);

	if (info->first_logical_byte > block_group->key.objectid)
		info->first_logical_byte = block_group->key.objectid;

	spin_unlock(&info->block_group_cache_lock);

	return 0;
}

/*
 * This will return the block group at or after bytenr if contains is 0, else
 * it will return the block group that contains the bytenr
 */
static struct btrfs_block_group_cache *
block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,
			      int contains)
{
	struct btrfs_block_group_cache *cache, *ret = NULL;
	struct rb_node *n;
	u64 end, start;

	spin_lock(&info->block_group_cache_lock);
	n = info->block_group_cache_tree.rb_node;

	while (n) {
		cache = rb_entry(n, struct btrfs_block_group_cache,
				 cache_node);
		end = cache->key.objectid + cache->key.offset - 1;
		start = cache->key.objectid;

		if (bytenr < start) {
			if (!contains && (!ret || start < ret->key.objectid))
				ret = cache;
			n = n->rb_left;
		} else if (bytenr > start) {
			if (contains && bytenr <= end) {
				ret = cache;
				break;
			}
			n = n->rb_right;
		} else {
			ret = cache;
			break;
		}
	}
	if (ret) {
		btrfs_get_block_group(ret);
		if (bytenr == 0 && info->first_logical_byte > ret->key.objectid)
			info->first_logical_byte = ret->key.objectid;
	}
	spin_unlock(&info->block_group_cache_lock);

	return ret;
}

static int add_excluded_extent(struct btrfs_root *root,
			       u64 start, u64 num_bytes)
{
	u64 end = start + num_bytes - 1;
	set_extent_bits(&root->fs_info->freed_extents[0],
			start, end, EXTENT_UPTODATE, GFP_NOFS);
	set_extent_bits(&root->fs_info->freed_extents[1],
			start, end, EXTENT_UPTODATE, GFP_NOFS);
	return 0;
}

static void free_excluded_extents(struct btrfs_root *root,
				  struct btrfs_block_group_cache *cache)
{
	u64 start, end;

	start = cache->key.objectid;
	end = start + cache->key.offset - 1;

	clear_extent_bits(&root->fs_info->freed_extents[0],
			  start, end, EXTENT_UPTODATE, GFP_NOFS);
	clear_extent_bits(&root->fs_info->freed_extents[1],
			  start, end, EXTENT_UPTODATE, GFP_NOFS);
}

static int exclude_super_stripes(struct btrfs_root *root,
				 struct btrfs_block_group_cache *cache)
{
	u64 bytenr;
	u64 *logical;
	int stripe_len;
	int i, nr, ret;

	if (cache->key.objectid < BTRFS_SUPER_INFO_OFFSET) {
		stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->key.objectid;
		cache->bytes_super += stripe_len;
		ret = add_excluded_extent(root, cache->key.objectid,
					  stripe_len);
		if (ret)
			return ret;
	}

	for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
		bytenr = btrfs_sb_offset(i);
		ret = btrfs_rmap_block(&root->fs_info->mapping_tree,
				       cache->key.objectid, bytenr,
				       0, &logical, &nr, &stripe_len);
		if (ret)
			return ret;

		while (nr--) {
			u64 start, len;

			if (logical[nr] > cache->key.objectid +
			    cache->key.offset)
				continue;

			if (logical[nr] + stripe_len <= cache->key.objectid)
				continue;

			start = logical[nr];
			if (start < cache->key.objectid) {
				start = cache->key.objectid;
				len = (logical[nr] + stripe_len) - start;
			} else {
				len = min_t(u64, stripe_len,
					    cache->key.objectid +
					    cache->key.offset - start);
			}

			cache->bytes_super += len;
			ret = add_excluded_extent(root, start, len);
			if (ret) {
				kfree(logical);
				return ret;
			}
		}

		kfree(logical);
	}
	return 0;
}
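
/*
 * Illustrative note (added commentary, not part of the original source):
 * exclude_super_stripes() above clamps each superblock mirror stripe to the
 * block group that it overlaps.  With a hypothetical stripe that starts
 * before the block group, only the overlapping range is counted:
 *
 *	start = cache->key.objectid;
 *	len   = (stripe_start + stripe_len) - start;
 *
 * and that range is marked EXTENT_UPTODATE in both freed_extents trees via
 * add_excluded_extent(), which is how the free space caching code below
 * avoids ever handing out the bytes occupied by superblock copies.
 */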
static struct btrfs_caching_control *
get_caching_control(struct btrfs_block_group_cache *cache)
{
	struct btrfs_caching_control *ctl;

	spin_lock(&cache->lock);
	if (cache->cached != BTRFS_CACHE_STARTED) {
		spin_unlock(&cache->lock);
		return NULL;
	}

	/* We're loading it the fast way, so we don't have a caching_ctl. */
	if (!cache->caching_ctl) {
		spin_unlock(&cache->lock);
		return NULL;
	}

	ctl = cache->caching_ctl;
	atomic_inc(&ctl->count);
	spin_unlock(&cache->lock);
	return ctl;
}

static void put_caching_control(struct btrfs_caching_control *ctl)
{
	if (atomic_dec_and_test(&ctl->count))
		kfree(ctl);
}

/*
 * this is only called by cache_block_group, since we could have freed extents
 * we need to check the pinned_extents for any extents that can't be used yet
 * since their free space will be released as soon as the transaction commits.
 */
static u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
			      struct btrfs_fs_info *info, u64 start, u64 end)
{
	u64 extent_start, extent_end, size, total_added = 0;
	int ret;

	while (start < end) {
		ret = find_first_extent_bit(info->pinned_extents, start,
					    &extent_start, &extent_end,
					    EXTENT_DIRTY | EXTENT_UPTODATE,
					    NULL);
		if (ret)
			break;

		if (extent_start <= start) {
			start = extent_end + 1;
		} else if (extent_start > start && extent_start < end) {
			size = extent_start - start;
			total_added += size;
			ret = btrfs_add_free_space(block_group, start,
						   size);
			BUG_ON(ret); /* -ENOMEM or logic error */
			start = extent_end + 1;
		} else {
			break;
		}
	}

	if (start < end) {
		size = end - start;
		total_added += size;
		ret = btrfs_add_free_space(block_group, start, size);
		BUG_ON(ret); /* -ENOMEM or logic error */
	}

	return total_added;
}

static noinline void caching_thread(struct btrfs_work *work)
{
	struct btrfs_block_group_cache *block_group;
	struct btrfs_fs_info *fs_info;
	struct btrfs_caching_control *caching_ctl;
	struct btrfs_root *extent_root;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_key key;
	u64 total_found = 0;
	u64 last = 0;
	u32 nritems;
	int ret = -ENOMEM;

	caching_ctl = container_of(work, struct btrfs_caching_control, work);
	block_group = caching_ctl->block_group;
	fs_info = block_group->fs_info;
	extent_root = fs_info->extent_root;

	path = btrfs_alloc_path();
	if (!path)
		goto out;

	last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);

	/*
	 * We don't want to deadlock with somebody trying to allocate a new
	 * extent for the extent root while also trying to search the extent
	 * root to add free space.  So we skip locking and search the commit
	 * root, since it's read-only
	 */
	path->skip_locking = 1;
	path->search_commit_root = 1;
	path->reada = 1;

	key.objectid = last;
	key.offset = 0;
	key.type = BTRFS_EXTENT_ITEM_KEY;
again:
	mutex_lock(&caching_ctl->mutex);
	/* need to make sure the commit_root doesn't disappear */
	down_read(&fs_info->extent_commit_sem);

next:
	ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
	if (ret < 0)
		goto err;

	leaf = path->nodes[0];
	nritems = btrfs_header_nritems(leaf);

	while (1) {
		if (btrfs_fs_closing(fs_info) > 1) {
			last = (u64)-1;
			break;
		}

		if (path->slots[0] < nritems) {
			btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
		} else {
			ret = find_next_key(path, 0, &key);
			if (ret)
				break;

			if (need_resched()) {
				caching_ctl->progress = last;
				btrfs_release_path(path);
				up_read(&fs_info->extent_commit_sem);
				mutex_unlock(&caching_ctl->mutex);
				cond_resched();
				goto again;
			}

			ret = btrfs_next_leaf(extent_root, path);
			if (ret < 0)
				goto err;
			if (ret)
				break;
			leaf = path->nodes[0];
			nritems = btrfs_header_nritems(leaf);
			continue;
		}

		if (key.objectid < last) {
			key.objectid = last;
			key.offset = 0;
			key.type = BTRFS_EXTENT_ITEM_KEY;

			caching_ctl->progress = last;
			btrfs_release_path(path);
			goto next;
		}

		if (key.objectid < block_group->key.objectid) {
			path->slots[0]++;
			continue;
		}

		if (key.objectid >= block_group->key.objectid +
		    block_group->key.offset)
			break;

		if (key.type == BTRFS_EXTENT_ITEM_KEY ||
		    key.type == BTRFS_METADATA_ITEM_KEY) {
			total_found += add_new_free_space(block_group,
							  fs_info, last,
							  key.objectid);
			if (key.type == BTRFS_METADATA_ITEM_KEY)
				last = key.objectid +
					fs_info->tree_root->leafsize;
			else
				last = key.objectid + key.offset;

			if (total_found > (1024 * 1024 * 2)) {
				total_found = 0;
				wake_up(&caching_ctl->wait);
			}
		}
		path->slots[0]++;
	}
	ret = 0;

	total_found += add_new_free_space(block_group, fs_info, last,
					  block_group->key.objectid +
					  block_group->key.offset);
	caching_ctl->progress = (u64)-1;

	spin_lock(&block_group->lock);
	block_group->caching_ctl = NULL;
	block_group->cached = BTRFS_CACHE_FINISHED;
	spin_unlock(&block_group->lock);

err:
	btrfs_free_path(path);
	up_read(&fs_info->extent_commit_sem);

	free_excluded_extents(extent_root, block_group);

	mutex_unlock(&caching_ctl->mutex);
out:
	if (ret) {
		spin_lock(&block_group->lock);
		block_group->caching_ctl = NULL;
		block_group->cached = BTRFS_CACHE_ERROR;
		spin_unlock(&block_group->lock);
	}
	wake_up(&caching_ctl->wait);

	put_caching_control(caching_ctl);
	btrfs_put_block_group(block_group);
}
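
/*
 * Sketch of the flow above (added commentary, not in the original file):
 * caching_thread() walks EXTENT_ITEM/METADATA_ITEM keys in the commit root
 * and treats every gap between allocated extents inside the block group as
 * free space; for consecutive extents ending at A and starting at B,
 * add_new_free_space() records the hypothetical range [A, B) in the free
 * space cache.  Anyone sleeping on caching_ctl->wait is woken each time
 * roughly 2MB (1024 * 1024 * 2 bytes) of new free space has been found, so
 * allocators can make progress before caching completes.
 */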
static int cache_block_group(struct btrfs_block_group_cache *cache,
			     int load_cache_only)
{
	DEFINE_WAIT(wait);
	struct btrfs_fs_info *fs_info = cache->fs_info;
	struct btrfs_caching_control *caching_ctl;
	int ret = 0;

	caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
	if (!caching_ctl)
		return -ENOMEM;

	INIT_LIST_HEAD(&caching_ctl->list);
	mutex_init(&caching_ctl->mutex);
	init_waitqueue_head(&caching_ctl->wait);
	caching_ctl->block_group = cache;
	caching_ctl->progress = cache->key.objectid;
	atomic_set(&caching_ctl->count, 1);
	caching_ctl->work.func = caching_thread;

	spin_lock(&cache->lock);
	/*
	 * This should be a rare occasion, but this could happen I think in the
	 * case where one thread starts to load the space cache info, and then
	 * some other thread starts a transaction commit which tries to do an
	 * allocation while the other thread is still loading the space cache
	 * info.  The previous loop should have kept us from choosing this block
	 * group, but if we've moved to the state where we will wait on caching
	 * block groups we need to first check if we're doing a fast load here,
	 * so we can wait for it to finish, otherwise we could end up allocating
	 * from a block group whose cache gets evicted for one reason or
	 * another.
	 */
	while (cache->cached == BTRFS_CACHE_FAST) {
		struct btrfs_caching_control *ctl;

		ctl = cache->caching_ctl;
		atomic_inc(&ctl->count);
		prepare_to_wait(&ctl->wait, &wait, TASK_UNINTERRUPTIBLE);
		spin_unlock(&cache->lock);

		schedule();

		finish_wait(&ctl->wait, &wait);
		put_caching_control(ctl);
		spin_lock(&cache->lock);
	}

	if (cache->cached != BTRFS_CACHE_NO) {
		spin_unlock(&cache->lock);
		kfree(caching_ctl);
		return 0;
	}
	WARN_ON(cache->caching_ctl);
	cache->caching_ctl = caching_ctl;
	cache->cached = BTRFS_CACHE_FAST;
	spin_unlock(&cache->lock);

	if (fs_info->mount_opt & BTRFS_MOUNT_SPACE_CACHE) {
		ret = load_free_space_cache(fs_info, cache);

		spin_lock(&cache->lock);
		if (ret == 1) {
			cache->caching_ctl = NULL;
			cache->cached = BTRFS_CACHE_FINISHED;
			cache->last_byte_to_unpin = (u64)-1;
		} else {
			if (load_cache_only) {
				cache->caching_ctl = NULL;
				cache->cached = BTRFS_CACHE_NO;
			} else {
				cache->cached = BTRFS_CACHE_STARTED;
			}
		}
		spin_unlock(&cache->lock);
		wake_up(&caching_ctl->wait);
		if (ret == 1) {
			put_caching_control(caching_ctl);
			free_excluded_extents(fs_info->extent_root, cache);
			return 0;
		}
	} else {
		/*
		 * We are not going to do the fast caching, set cached to the
		 * appropriate value and wakeup any waiters.
617 */ 618 spin_lock(&cache->lock); 619 if (load_cache_only) { 620 cache->caching_ctl = NULL; 621 cache->cached = BTRFS_CACHE_NO; 622 } else { 623 cache->cached = BTRFS_CACHE_STARTED; 624 } 625 spin_unlock(&cache->lock); 626 wake_up(&caching_ctl->wait); 627 } 628 629 if (load_cache_only) { 630 put_caching_control(caching_ctl); 631 return 0; 632 } 633 634 down_write(&fs_info->extent_commit_sem); 635 atomic_inc(&caching_ctl->count); 636 list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups); 637 up_write(&fs_info->extent_commit_sem); 638 639 btrfs_get_block_group(cache); 640 641 btrfs_queue_worker(&fs_info->caching_workers, &caching_ctl->work); 642 643 return ret; 644 } 645 646 /* 647 * return the block group that starts at or after bytenr 648 */ 649 static struct btrfs_block_group_cache * 650 btrfs_lookup_first_block_group(struct btrfs_fs_info *info, u64 bytenr) 651 { 652 struct btrfs_block_group_cache *cache; 653 654 cache = block_group_cache_tree_search(info, bytenr, 0); 655 656 return cache; 657 } 658 659 /* 660 * return the block group that contains the given bytenr 661 */ 662 struct btrfs_block_group_cache *btrfs_lookup_block_group( 663 struct btrfs_fs_info *info, 664 u64 bytenr) 665 { 666 struct btrfs_block_group_cache *cache; 667 668 cache = block_group_cache_tree_search(info, bytenr, 1); 669 670 return cache; 671 } 672 673 static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info, 674 u64 flags) 675 { 676 struct list_head *head = &info->space_info; 677 struct btrfs_space_info *found; 678 679 flags &= BTRFS_BLOCK_GROUP_TYPE_MASK; 680 681 rcu_read_lock(); 682 list_for_each_entry_rcu(found, head, list) { 683 if (found->flags & flags) { 684 rcu_read_unlock(); 685 return found; 686 } 687 } 688 rcu_read_unlock(); 689 return NULL; 690 } 691 692 /* 693 * after adding space to the filesystem, we need to clear the full flags 694 * on all the space infos. 695 */ 696 void btrfs_clear_space_info_full(struct btrfs_fs_info *info) 697 { 698 struct list_head *head = &info->space_info; 699 struct btrfs_space_info *found; 700 701 rcu_read_lock(); 702 list_for_each_entry_rcu(found, head, list) 703 found->full = 0; 704 rcu_read_unlock(); 705 } 706 707 /* simple helper to search for an existing extent at a given offset */ 708 int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len) 709 { 710 int ret; 711 struct btrfs_key key; 712 struct btrfs_path *path; 713 714 path = btrfs_alloc_path(); 715 if (!path) 716 return -ENOMEM; 717 718 key.objectid = start; 719 key.offset = len; 720 key.type = BTRFS_EXTENT_ITEM_KEY; 721 ret = btrfs_search_slot(NULL, root->fs_info->extent_root, &key, path, 722 0, 0); 723 if (ret > 0) { 724 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 725 if (key.objectid == start && 726 key.type == BTRFS_METADATA_ITEM_KEY) 727 ret = 0; 728 } 729 btrfs_free_path(path); 730 return ret; 731 } 732 733 /* 734 * helper function to lookup reference count and flags of a tree block. 735 * 736 * the head node for delayed ref is used to store the sum of all the 737 * reference count modifications queued up in the rbtree. the head 738 * node may also store the extent flags to set. This way you can check 739 * to see what the reference count and extent flags would be if all of 740 * the delayed refs are not processed. 
 */
int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root, u64 bytenr,
			     u64 offset, int metadata, u64 *refs, u64 *flags)
{
	struct btrfs_delayed_ref_head *head;
	struct btrfs_delayed_ref_root *delayed_refs;
	struct btrfs_path *path;
	struct btrfs_extent_item *ei;
	struct extent_buffer *leaf;
	struct btrfs_key key;
	u32 item_size;
	u64 num_refs;
	u64 extent_flags;
	int ret;

	/*
	 * If we don't have skinny metadata, don't bother doing anything
	 * different
	 */
	if (metadata && !btrfs_fs_incompat(root->fs_info, SKINNY_METADATA)) {
		offset = root->leafsize;
		metadata = 0;
	}

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	if (!trans) {
		path->skip_locking = 1;
		path->search_commit_root = 1;
	}

search_again:
	key.objectid = bytenr;
	key.offset = offset;
	if (metadata)
		key.type = BTRFS_METADATA_ITEM_KEY;
	else
		key.type = BTRFS_EXTENT_ITEM_KEY;

again:
	ret = btrfs_search_slot(trans, root->fs_info->extent_root,
				&key, path, 0, 0);
	if (ret < 0)
		goto out_free;

	if (ret > 0 && metadata && key.type == BTRFS_METADATA_ITEM_KEY) {
		if (path->slots[0]) {
			path->slots[0]--;
			btrfs_item_key_to_cpu(path->nodes[0], &key,
					      path->slots[0]);
			if (key.objectid == bytenr &&
			    key.type == BTRFS_EXTENT_ITEM_KEY &&
			    key.offset == root->leafsize)
				ret = 0;
		}
		if (ret) {
			key.objectid = bytenr;
			key.type = BTRFS_EXTENT_ITEM_KEY;
			key.offset = root->leafsize;
			btrfs_release_path(path);
			goto again;
		}
	}

	if (ret == 0) {
		leaf = path->nodes[0];
		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
		if (item_size >= sizeof(*ei)) {
			ei = btrfs_item_ptr(leaf, path->slots[0],
					    struct btrfs_extent_item);
			num_refs = btrfs_extent_refs(leaf, ei);
			extent_flags = btrfs_extent_flags(leaf, ei);
		} else {
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
			struct btrfs_extent_item_v0 *ei0;
			BUG_ON(item_size != sizeof(*ei0));
			ei0 = btrfs_item_ptr(leaf, path->slots[0],
					     struct btrfs_extent_item_v0);
			num_refs = btrfs_extent_refs_v0(leaf, ei0);
			/* FIXME: this isn't correct for data */
			extent_flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
#else
			BUG();
#endif
		}
		BUG_ON(num_refs == 0);
	} else {
		num_refs = 0;
		extent_flags = 0;
		ret = 0;
	}

	if (!trans)
		goto out;

	delayed_refs = &trans->transaction->delayed_refs;
	spin_lock(&delayed_refs->lock);
	head = btrfs_find_delayed_ref_head(trans, bytenr);
	if (head) {
		if (!mutex_trylock(&head->mutex)) {
			atomic_inc(&head->node.refs);
			spin_unlock(&delayed_refs->lock);

			btrfs_release_path(path);

			/*
			 * Mutex was contended, block until it's released and try
			 * again
			 */
			mutex_lock(&head->mutex);
			mutex_unlock(&head->mutex);
			btrfs_put_delayed_ref(&head->node);
			goto search_again;
		}
		if (head->extent_op && head->extent_op->update_flags)
			extent_flags |= head->extent_op->flags_to_set;
		else
			BUG_ON(num_refs == 0);

		num_refs += head->node.ref_mod;
		mutex_unlock(&head->mutex);
	}
	spin_unlock(&delayed_refs->lock);
out:
	WARN_ON(num_refs == 0);
	if (refs)
		*refs = num_refs;
	if (flags)
		*flags = extent_flags;
out_free:
	btrfs_free_path(path);
	return ret;
}
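
/*
 * Worked example (added commentary, not part of the original file): if the
 * extent item found in the commit tree records num_refs == 3 and the extent
 * also has a pending delayed ref head with node.ref_mod == -1, the function
 * above reports *refs == 2, i.e. the count the extent would end up with once
 * all queued delayed refs run.  The numbers are hypothetical; only the
 * combination rule (on-disk refs + head->node.ref_mod) comes from the code.
 */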

/*
 * Back reference rules.  Back refs have three main goals:
 *
 * 1) differentiate between all holders of references to an extent so that
 *    when a reference is dropped we can make sure it was a valid reference
 *    before freeing the extent.
 *
 * 2) Provide enough information to quickly find the holders of an extent
 *    if we notice a given block is corrupted or bad.
 *
 * 3) Make it easy to migrate blocks for FS shrinking or storage pool
 *    maintenance.  This is actually the same as #2, but with a slightly
 *    different use case.
 *
 * There are two kinds of back refs. The implicit back refs is optimized
 * for pointers in non-shared tree blocks. For a given pointer in a block,
 * back refs of this kind provide information about the block's owner tree
 * and the pointer's key. This information allows us to find the block by
 * b-tree searching. The full back refs is for pointers in tree blocks not
 * referenced by their owner trees. The location of tree block is recorded
 * in the back refs. Actually the full back refs is generic, and can be
 * used in all cases the implicit back refs is used. The major shortcoming
 * of the full back refs is its overhead. Every time a tree block gets
 * COWed, we have to update back refs entry for all pointers in it.
 *
 * For a newly allocated tree block, we use implicit back refs for
 * pointers in it. This means most tree related operations only involve
 * implicit back refs. For a tree block created in an old transaction, the
 * only way to drop a reference to it is COW it. So we can detect the
 * event that tree block loses its owner tree's reference and do the
 * back refs conversion.
 *
 * When a tree block is COW'd through a tree, there are four cases:
 *
 * The reference count of the block is one and the tree is the block's
 * owner tree. Nothing to do in this case.
 *
 * The reference count of the block is one and the tree is not the
 * block's owner tree. In this case, full back refs is used for pointers
 * in the block. Remove these full back refs, add implicit back refs for
 * every pointer in the new block.
 *
 * The reference count of the block is greater than one and the tree is
 * the block's owner tree. In this case, implicit back refs is used for
 * pointers in the block. Add full back refs for every pointer in the
 * block, increase lower level extents' reference counts. The original
 * implicit back refs are entailed to the new block.
 *
 * The reference count of the block is greater than one and the tree is
 * not the block's owner tree. Add implicit back refs for every pointer in
 * the new block, increase lower level extents' reference count.
 *
 * Back Reference Key composing:
 *
 * The key objectid corresponds to the first byte in the extent,
 * The key type is used to differentiate between types of back refs.
 * There are different meanings of the key offset for different types
 * of back refs.
 *
 * File extents can be referenced by:
 *
 * - multiple snapshots, subvolumes, or different generations in one subvol
 * - different files inside a single subvolume
 * - different offsets inside a file (bookend extents in file.c)
 *
 * The extent ref structure for the implicit back refs has fields for:
 *
 * - Objectid of the subvolume root
 * - objectid of the file holding the reference
 * - original offset in the file
 * - how many bookend extents
 *
 * The key offset for the implicit back refs is hash of the first
 * three fields.
 *
 * The extent ref structure for the full back refs has a field for:
 *
 * - number of pointers in the tree leaf
 *
 * The key offset for the full back refs is the first byte of
 * the tree leaf
 *
 * When a file extent is allocated, the implicit back refs is used.
 * The fields are filled in:
 *
 *     (root_key.objectid, inode objectid, offset in file, 1)
 *
 * When a file extent is removed by file truncation, we find the
 * corresponding implicit back refs and check the following fields:
 *
 *     (btrfs_header_owner(leaf), inode objectid, offset in file)
 *
 * Btree extents can be referenced by:
 *
 * - Different subvolumes
 *
 * Both the implicit back refs and the full back refs for tree blocks
 * only consist of key. The key offset for the implicit back refs is
 * objectid of block's owner tree. The key offset for the full back refs
 * is the first byte of parent block.
 *
 * When implicit back refs is used, information about the lowest key and
 * level of the tree block are required. This information is stored in
 * the tree block info structure.
 */

#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
static int convert_extent_item_v0(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root,
				  struct btrfs_path *path,
				  u64 owner, u32 extra_size)
{
	struct btrfs_extent_item *item;
	struct btrfs_extent_item_v0 *ei0;
	struct btrfs_extent_ref_v0 *ref0;
	struct btrfs_tree_block_info *bi;
	struct extent_buffer *leaf;
	struct btrfs_key key;
	struct btrfs_key found_key;
	u32 new_size = sizeof(*item);
	u64 refs;
	int ret;

	leaf = path->nodes[0];
	BUG_ON(btrfs_item_size_nr(leaf, path->slots[0]) != sizeof(*ei0));

	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
	ei0 = btrfs_item_ptr(leaf, path->slots[0],
			     struct btrfs_extent_item_v0);
	refs = btrfs_extent_refs_v0(leaf, ei0);

	if (owner == (u64)-1) {
		while (1) {
			if (path->slots[0] >= btrfs_header_nritems(leaf)) {
				ret = btrfs_next_leaf(root, path);
				if (ret < 0)
					return ret;
				BUG_ON(ret > 0); /* Corruption */
				leaf = path->nodes[0];
			}
			btrfs_item_key_to_cpu(leaf, &found_key,
					      path->slots[0]);
			BUG_ON(key.objectid != found_key.objectid);
			if (found_key.type != BTRFS_EXTENT_REF_V0_KEY) {
				path->slots[0]++;
				continue;
			}
			ref0 = btrfs_item_ptr(leaf, path->slots[0],
					      struct btrfs_extent_ref_v0);
			owner = btrfs_ref_objectid_v0(leaf, ref0);
			break;
		}
	}
	btrfs_release_path(path);

	if (owner < BTRFS_FIRST_FREE_OBJECTID)
		new_size += sizeof(*bi);

	new_size -= sizeof(*ei0);
	ret = btrfs_search_slot(trans, root, &key, path,
				new_size + extra_size, 1);
	if (ret < 0)
		return ret;
	BUG_ON(ret); /* Corruption */

	btrfs_extend_item(root, path, new_size);

	leaf = path->nodes[0];
	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
	btrfs_set_extent_refs(leaf, item, refs);
	/* FIXME: get real generation */
	btrfs_set_extent_generation(leaf, item, 0);
	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
		btrfs_set_extent_flags(leaf, item,
				       BTRFS_EXTENT_FLAG_TREE_BLOCK |
				       BTRFS_BLOCK_FLAG_FULL_BACKREF);
		bi = (struct btrfs_tree_block_info *)(item + 1);
		/* FIXME: get first key of the block */
		memset_extent_buffer(leaf, 0, (unsigned long)bi, sizeof(*bi));
		btrfs_set_tree_block_level(leaf, bi, (int)owner);
	} else {
		btrfs_set_extent_flags(leaf, item, BTRFS_EXTENT_FLAG_DATA);
	}
	btrfs_mark_buffer_dirty(leaf);
	return 0;
}
#endif

static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)
{
	u32 high_crc = ~(u32)0;
	u32 low_crc = ~(u32)0;
	__le64 lenum;

	lenum = cpu_to_le64(root_objectid);
	high_crc = crc32c(high_crc, &lenum, sizeof(lenum));
	lenum = cpu_to_le64(owner);
	low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
	lenum = cpu_to_le64(offset);
	low_crc = crc32c(low_crc, &lenum, sizeof(lenum));

	return ((u64)high_crc << 31) ^ (u64)low_crc;
}
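
/*
 * Illustrative sketch (added commentary, not part of the original file): an
 * implicit data back ref for a file extent is keyed roughly as follows, with
 * the variable names below chosen only for this example:
 *
 *	key.objectid = extent_bytenr;
 *	key.type     = BTRFS_EXTENT_DATA_REF_KEY;
 *	key.offset   = hash_extent_data_ref(root_objectid, inode_objectid,
 *					    file_offset);
 *
 * which matches the "hash of the first three fields" rule described in the
 * comment block above and the key setup in lookup_extent_data_ref() below.
 */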
static u64 hash_extent_data_ref_item(struct extent_buffer *leaf,
				     struct btrfs_extent_data_ref *ref)
{
	return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref),
				    btrfs_extent_data_ref_objectid(leaf, ref),
				    btrfs_extent_data_ref_offset(leaf, ref));
}

static int match_extent_data_ref(struct extent_buffer *leaf,
				 struct btrfs_extent_data_ref *ref,
				 u64 root_objectid, u64 owner, u64 offset)
{
	if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid ||
	    btrfs_extent_data_ref_objectid(leaf, ref) != owner ||
	    btrfs_extent_data_ref_offset(leaf, ref) != offset)
		return 0;
	return 1;
}

static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans,
					   struct btrfs_root *root,
					   struct btrfs_path *path,
					   u64 bytenr, u64 parent,
					   u64 root_objectid,
					   u64 owner, u64 offset)
{
	struct btrfs_key key;
	struct btrfs_extent_data_ref *ref;
	struct extent_buffer *leaf;
	u32 nritems;
	int ret;
	int recow;
	int err = -ENOENT;

	key.objectid = bytenr;
	if (parent) {
		key.type = BTRFS_SHARED_DATA_REF_KEY;
		key.offset = parent;
	} else {
		key.type = BTRFS_EXTENT_DATA_REF_KEY;
		key.offset = hash_extent_data_ref(root_objectid,
						  owner, offset);
	}
again:
	recow = 0;
	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret < 0) {
		err = ret;
		goto fail;
	}

	if (parent) {
		if (!ret)
			return 0;
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
		key.type = BTRFS_EXTENT_REF_V0_KEY;
		btrfs_release_path(path);
		ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
		if (ret < 0) {
			err = ret;
			goto fail;
		}
		if (!ret)
			return 0;
#endif
		goto fail;
	}

	leaf = path->nodes[0];
	nritems = btrfs_header_nritems(leaf);
	while (1) {
		if (path->slots[0] >= nritems) {
			ret = btrfs_next_leaf(root, path);
			if (ret < 0)
				err = ret;
			if (ret)
				goto fail;

			leaf = path->nodes[0];
			nritems = btrfs_header_nritems(leaf);
			recow = 1;
		}

		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
		if (key.objectid != bytenr ||
		    key.type != BTRFS_EXTENT_DATA_REF_KEY)
			goto fail;

		ref = btrfs_item_ptr(leaf, path->slots[0],
				     struct btrfs_extent_data_ref);

		if (match_extent_data_ref(leaf, ref, root_objectid,
					  owner, offset)) {
			if (recow) {
				btrfs_release_path(path);
				goto again;
			}
			err = 0;
			break;
		}
		path->slots[0]++;
	}
fail:
	return err;
}

static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,
					   struct btrfs_root *root,
					   struct btrfs_path *path,
					   u64 bytenr, u64 parent,
					   u64 root_objectid, u64 owner,
					   u64 offset, int refs_to_add)
{
	struct btrfs_key key;
	struct extent_buffer *leaf;
	u32 size;
	u32 num_refs;
	int ret;

	key.objectid = bytenr;
	if (parent) {
		key.type = BTRFS_SHARED_DATA_REF_KEY;
		key.offset = parent;
		size = sizeof(struct btrfs_shared_data_ref);
	} else {
		key.type = BTRFS_EXTENT_DATA_REF_KEY;
		key.offset = hash_extent_data_ref(root_objectid,
						  owner, offset);
		size = sizeof(struct btrfs_extent_data_ref);
	}

	ret = btrfs_insert_empty_item(trans, root, path, &key, size);
	if (ret && ret != -EEXIST)
		goto fail;

	leaf = path->nodes[0];
	if (parent) {
		struct btrfs_shared_data_ref *ref;
		ref = btrfs_item_ptr(leaf, path->slots[0],
				     struct btrfs_shared_data_ref);
		if (ret == 0) {
			btrfs_set_shared_data_ref_count(leaf, ref, refs_to_add);
		} else {
			num_refs = btrfs_shared_data_ref_count(leaf, ref);
			num_refs += refs_to_add;
			btrfs_set_shared_data_ref_count(leaf, ref, num_refs);
		}
	} else {
		struct btrfs_extent_data_ref *ref;
		while (ret == -EEXIST) {
			ref = btrfs_item_ptr(leaf, path->slots[0],
					     struct btrfs_extent_data_ref);
			if (match_extent_data_ref(leaf, ref, root_objectid,
						  owner, offset))
				break;
			btrfs_release_path(path);
			key.offset++;
			ret = btrfs_insert_empty_item(trans, root, path, &key,
						      size);
			if (ret && ret != -EEXIST)
				goto fail;

			leaf = path->nodes[0];
		}
		ref = btrfs_item_ptr(leaf, path->slots[0],
				     struct btrfs_extent_data_ref);
		if (ret == 0) {
			btrfs_set_extent_data_ref_root(leaf, ref,
						       root_objectid);
			btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
			btrfs_set_extent_data_ref_offset(leaf, ref, offset);
			btrfs_set_extent_data_ref_count(leaf, ref, refs_to_add);
		} else {
			num_refs = btrfs_extent_data_ref_count(leaf, ref);
			num_refs += refs_to_add;
			btrfs_set_extent_data_ref_count(leaf, ref, num_refs);
		}
	}
	btrfs_mark_buffer_dirty(leaf);
	ret = 0;
fail:
	btrfs_release_path(path);
	return ret;
}

static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
					   struct btrfs_root *root,
					   struct btrfs_path *path,
					   int refs_to_drop)
{
	struct btrfs_key key;
	struct btrfs_extent_data_ref *ref1 = NULL;
	struct btrfs_shared_data_ref *ref2 = NULL;
	struct extent_buffer *leaf;
	u32 num_refs = 0;
	int ret = 0;

	leaf = path->nodes[0];
	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);

	if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
		ref1 = btrfs_item_ptr(leaf, path->slots[0],
				      struct btrfs_extent_data_ref);
		num_refs = btrfs_extent_data_ref_count(leaf, ref1);
	} else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
		ref2 = btrfs_item_ptr(leaf, path->slots[0],
				      struct btrfs_shared_data_ref);
		num_refs = btrfs_shared_data_ref_count(leaf, ref2);
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
	} else if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
		struct btrfs_extent_ref_v0 *ref0;
		ref0 = btrfs_item_ptr(leaf, path->slots[0],
				      struct btrfs_extent_ref_v0);
		num_refs = btrfs_ref_count_v0(leaf, ref0);
#endif
	} else {
		BUG();
	}

	BUG_ON(num_refs < refs_to_drop);
	num_refs -= refs_to_drop;

	if (num_refs == 0) {
		ret = btrfs_del_item(trans, root, path);
	} else {
		if (key.type == BTRFS_EXTENT_DATA_REF_KEY)
			btrfs_set_extent_data_ref_count(leaf, ref1, num_refs);
		else if (key.type == BTRFS_SHARED_DATA_REF_KEY)
			btrfs_set_shared_data_ref_count(leaf, ref2, num_refs);
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
		else {
			struct btrfs_extent_ref_v0 *ref0;
			ref0 = btrfs_item_ptr(leaf, path->slots[0],
					      struct btrfs_extent_ref_v0);
			btrfs_set_ref_count_v0(leaf, ref0, num_refs);
		}
#endif
		btrfs_mark_buffer_dirty(leaf);
	}
	return ret;
}

static noinline u32 extent_data_ref_count(struct btrfs_root *root,
					  struct btrfs_path *path,
					  struct btrfs_extent_inline_ref *iref)
{
	struct btrfs_key key;
	struct extent_buffer *leaf;
	struct btrfs_extent_data_ref *ref1;
	struct btrfs_shared_data_ref *ref2;
	u32 num_refs = 0;

	leaf = path->nodes[0];
	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
	if (iref) {
		if (btrfs_extent_inline_ref_type(leaf, iref) ==
		    BTRFS_EXTENT_DATA_REF_KEY) {
			ref1 = (struct btrfs_extent_data_ref *)(&iref->offset);
			num_refs = btrfs_extent_data_ref_count(leaf, ref1);
		} else {
			ref2 = (struct btrfs_shared_data_ref *)(iref + 1);
			num_refs = btrfs_shared_data_ref_count(leaf, ref2);
		}
	} else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
		ref1 = btrfs_item_ptr(leaf, path->slots[0],
				      struct btrfs_extent_data_ref);
		num_refs = btrfs_extent_data_ref_count(leaf, ref1);
	} else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
		ref2 = btrfs_item_ptr(leaf, path->slots[0],
				      struct btrfs_shared_data_ref);
		num_refs = btrfs_shared_data_ref_count(leaf, ref2);
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
	} else if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
		struct btrfs_extent_ref_v0 *ref0;
		ref0 = btrfs_item_ptr(leaf, path->slots[0],
				      struct btrfs_extent_ref_v0);
		num_refs = btrfs_ref_count_v0(leaf, ref0);
#endif
	} else {
		WARN_ON(1);
	}
	return num_refs;
}

static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans,
					  struct btrfs_root *root,
					  struct btrfs_path *path,
					  u64 bytenr, u64 parent,
					  u64 root_objectid)
{
	struct btrfs_key key;
	int ret;

	key.objectid = bytenr;
	if (parent) {
		key.type = BTRFS_SHARED_BLOCK_REF_KEY;
		key.offset = parent;
	} else {
		key.type = BTRFS_TREE_BLOCK_REF_KEY;
		key.offset = root_objectid;
	}

	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret > 0)
		ret = -ENOENT;
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
	if (ret == -ENOENT && parent) {
		btrfs_release_path(path);
		key.type = BTRFS_EXTENT_REF_V0_KEY;
		ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
		if (ret > 0)
			ret = -ENOENT;
	}
#endif
	return ret;
}
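
/*
 * Added note (not part of the original file): tree block back refs mirror
 * the data ref scheme above, but the key offset is simpler.  A shared block
 * ref is keyed by the parent block's bytenr, while a non-shared (implicit)
 * ref is keyed by the owner root's objectid:
 *
 *	(bytenr, BTRFS_SHARED_BLOCK_REF_KEY, parent)        - shared
 *	(bytenr, BTRFS_TREE_BLOCK_REF_KEY, root_objectid)   - implicit
 *
 * This is exactly the key setup used by lookup_tree_block_ref() above and
 * insert_tree_block_ref() below, and the same parent/owner split drives
 * extent_ref_type().
 */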
static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans,
					  struct btrfs_root *root,
					  struct btrfs_path *path,
					  u64 bytenr, u64 parent,
					  u64 root_objectid)
{
	struct btrfs_key key;
	int ret;

	key.objectid = bytenr;
	if (parent) {
		key.type = BTRFS_SHARED_BLOCK_REF_KEY;
		key.offset = parent;
	} else {
		key.type = BTRFS_TREE_BLOCK_REF_KEY;
		key.offset = root_objectid;
	}

	ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
	btrfs_release_path(path);
	return ret;
}

static inline int extent_ref_type(u64 parent, u64 owner)
{
	int type;
	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
		if (parent > 0)
			type = BTRFS_SHARED_BLOCK_REF_KEY;
		else
			type = BTRFS_TREE_BLOCK_REF_KEY;
	} else {
		if (parent > 0)
			type = BTRFS_SHARED_DATA_REF_KEY;
		else
			type = BTRFS_EXTENT_DATA_REF_KEY;
	}
	return type;
}

static int find_next_key(struct btrfs_path *path, int level,
			 struct btrfs_key *key)

{
	for (; level < BTRFS_MAX_LEVEL; level++) {
		if (!path->nodes[level])
			break;
		if (path->slots[level] + 1 >=
		    btrfs_header_nritems(path->nodes[level]))
			continue;
		if (level == 0)
			btrfs_item_key_to_cpu(path->nodes[level], key,
					      path->slots[level] + 1);
		else
			btrfs_node_key_to_cpu(path->nodes[level], key,
					      path->slots[level] + 1);
		return 0;
	}
	return 1;
}

/*
 * look for inline back ref. if back ref is found, *ref_ret is set
 * to the address of inline back ref, and 0 is returned.
 *
 * if back ref isn't found, *ref_ret is set to the address where it
 * should be inserted, and -ENOENT is returned.
 *
 * if insert is true and there are too many inline back refs, the path
 * points to the extent item, and -EAGAIN is returned.
 *
 * NOTE: inline back refs are ordered in the same way that back ref
 * items in the tree are ordered.
 */
static noinline_for_stack
int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root,
				 struct btrfs_path *path,
				 struct btrfs_extent_inline_ref **ref_ret,
				 u64 bytenr, u64 num_bytes,
				 u64 parent, u64 root_objectid,
				 u64 owner, u64 offset, int insert)
{
	struct btrfs_key key;
	struct extent_buffer *leaf;
	struct btrfs_extent_item *ei;
	struct btrfs_extent_inline_ref *iref;
	u64 flags;
	u64 item_size;
	unsigned long ptr;
	unsigned long end;
	int extra_size;
	int type;
	int want;
	int ret;
	int err = 0;
	bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
						 SKINNY_METADATA);

	key.objectid = bytenr;
	key.type = BTRFS_EXTENT_ITEM_KEY;
	key.offset = num_bytes;

	want = extent_ref_type(parent, owner);
	if (insert) {
		extra_size = btrfs_extent_inline_ref_size(want);
		path->keep_locks = 1;
	} else
		extra_size = -1;

	/*
	 * Owner is our parent level, so we can just add one to get the level
	 * for the block we are interested in.
	 */
	if (skinny_metadata && owner < BTRFS_FIRST_FREE_OBJECTID) {
		key.type = BTRFS_METADATA_ITEM_KEY;
		key.offset = owner;
	}

again:
	ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1);
	if (ret < 0) {
		err = ret;
		goto out;
	}

	/*
	 * We may be a newly converted file system which still has the old fat
	 * extent entries for metadata, so try and see if we have one of those.
	 */
	if (ret > 0 && skinny_metadata) {
		skinny_metadata = false;
		if (path->slots[0]) {
			path->slots[0]--;
			btrfs_item_key_to_cpu(path->nodes[0], &key,
					      path->slots[0]);
			if (key.objectid == bytenr &&
			    key.type == BTRFS_EXTENT_ITEM_KEY &&
			    key.offset == num_bytes)
				ret = 0;
		}
		if (ret) {
			key.type = BTRFS_EXTENT_ITEM_KEY;
			key.offset = num_bytes;
			btrfs_release_path(path);
			goto again;
		}
	}

	if (ret && !insert) {
		err = -ENOENT;
		goto out;
	} else if (WARN_ON(ret)) {
		err = -EIO;
		goto out;
	}

	leaf = path->nodes[0];
	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
	if (item_size < sizeof(*ei)) {
		if (!insert) {
			err = -ENOENT;
			goto out;
		}
		ret = convert_extent_item_v0(trans, root, path, owner,
					     extra_size);
		if (ret < 0) {
			err = ret;
			goto out;
		}
		leaf = path->nodes[0];
		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
	}
#endif
	BUG_ON(item_size < sizeof(*ei));

	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
	flags = btrfs_extent_flags(leaf, ei);

	ptr = (unsigned long)(ei + 1);
	end = (unsigned long)ei + item_size;

	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK && !skinny_metadata) {
		ptr += sizeof(struct btrfs_tree_block_info);
		BUG_ON(ptr > end);
	}

	err = -ENOENT;
	while (1) {
		if (ptr >= end) {
			WARN_ON(ptr > end);
			break;
		}
		iref = (struct btrfs_extent_inline_ref *)ptr;
		type = btrfs_extent_inline_ref_type(leaf, iref);
		if (want < type)
			break;
		if (want > type) {
			ptr += btrfs_extent_inline_ref_size(type);
			continue;
		}

		if (type == BTRFS_EXTENT_DATA_REF_KEY) {
			struct btrfs_extent_data_ref *dref;
			dref = (struct btrfs_extent_data_ref *)(&iref->offset);
			if (match_extent_data_ref(leaf, dref, root_objectid,
						  owner, offset)) {
				err = 0;
				break;
			}
			if (hash_extent_data_ref_item(leaf, dref) <
			    hash_extent_data_ref(root_objectid, owner, offset))
				break;
		} else {
			u64 ref_offset;
			ref_offset = btrfs_extent_inline_ref_offset(leaf, iref);
			if (parent > 0) {
				if (parent == ref_offset) {
					err = 0;
					break;
				}
				if (ref_offset < parent)
					break;
			} else {
				if (root_objectid == ref_offset) {
					err = 0;
					break;
				}
				if (ref_offset < root_objectid)
					break;
			}
		}
		ptr += btrfs_extent_inline_ref_size(type);
	}
	if (err == -ENOENT && insert) {
		if (item_size + extra_size >=
		    BTRFS_MAX_EXTENT_ITEM_SIZE(root)) {
			err = -EAGAIN;
			goto out;
		}
		/*
		 * To add new inline back ref, we have to make sure
		 * there is no corresponding back ref item.
		 * For simplicity, we just do not add new inline back
		 * ref if there is any kind of item for this block
		 */
		if (find_next_key(path, 0, &key) == 0 &&
		    key.objectid == bytenr &&
		    key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) {
			err = -EAGAIN;
			goto out;
		}
	}
	*ref_ret = (struct btrfs_extent_inline_ref *)ptr;
out:
	if (insert) {
		path->keep_locks = 0;
		btrfs_unlock_up_safe(path, 1);
	}
	return err;
}

/*
 * helper to add new inline back ref
 */
static noinline_for_stack
void setup_inline_extent_backref(struct btrfs_root *root,
				 struct btrfs_path *path,
				 struct btrfs_extent_inline_ref *iref,
				 u64 parent, u64 root_objectid,
				 u64 owner, u64 offset, int refs_to_add,
				 struct btrfs_delayed_extent_op *extent_op)
{
	struct extent_buffer *leaf;
	struct btrfs_extent_item *ei;
	unsigned long ptr;
	unsigned long end;
	unsigned long item_offset;
	u64 refs;
	int size;
	int type;

	leaf = path->nodes[0];
	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
	item_offset = (unsigned long)iref - (unsigned long)ei;

	type = extent_ref_type(parent, owner);
	size = btrfs_extent_inline_ref_size(type);

	btrfs_extend_item(root, path, size);

	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
	refs = btrfs_extent_refs(leaf, ei);
	refs += refs_to_add;
	btrfs_set_extent_refs(leaf, ei, refs);
	if (extent_op)
		__run_delayed_extent_op(extent_op, leaf, ei);

	ptr = (unsigned long)ei + item_offset;
	end = (unsigned long)ei + btrfs_item_size_nr(leaf, path->slots[0]);
	if (ptr < end - size)
		memmove_extent_buffer(leaf, ptr + size, ptr,
				      end - size - ptr);

	iref = (struct btrfs_extent_inline_ref *)ptr;
	btrfs_set_extent_inline_ref_type(leaf, iref, type);
	if (type == BTRFS_EXTENT_DATA_REF_KEY) {
		struct btrfs_extent_data_ref *dref;
		dref = (struct btrfs_extent_data_ref *)(&iref->offset);
		btrfs_set_extent_data_ref_root(leaf, dref, root_objectid);
		btrfs_set_extent_data_ref_objectid(leaf, dref, owner);
		btrfs_set_extent_data_ref_offset(leaf, dref, offset);
		btrfs_set_extent_data_ref_count(leaf, dref, refs_to_add);
	} else if (type == BTRFS_SHARED_DATA_REF_KEY) {
		struct btrfs_shared_data_ref *sref;
		sref = (struct btrfs_shared_data_ref *)(iref + 1);
		btrfs_set_shared_data_ref_count(leaf, sref, refs_to_add);
		btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
	} else if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
		btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
	} else {
		btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
	}
	btrfs_mark_buffer_dirty(leaf);
}

static int lookup_extent_backref(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root,
				 struct btrfs_path *path,
				 struct btrfs_extent_inline_ref **ref_ret,
				 u64 bytenr, u64 num_bytes, u64 parent,
				 u64 root_objectid, u64 owner, u64 offset)
{
	int ret;

	ret = lookup_inline_extent_backref(trans, root, path, ref_ret,
					   bytenr, num_bytes, parent,
					   root_objectid, owner, offset, 0);
	if (ret != -ENOENT)
		return ret;

	btrfs_release_path(path);
	*ref_ret = NULL;

	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
		ret = lookup_tree_block_ref(trans, root, path, bytenr, parent,
					    root_objectid);
	} else {
		ret = lookup_extent_data_ref(trans, root, path, bytenr, parent,
					     root_objectid, owner, offset);
	}
	return ret;
}

/*
 * helper to update/remove inline back ref
 */
static noinline_for_stack
void update_inline_extent_backref(struct btrfs_root *root,
				  struct btrfs_path *path,
				  struct btrfs_extent_inline_ref *iref,
				  int refs_to_mod,
				  struct btrfs_delayed_extent_op *extent_op)
{
	struct extent_buffer *leaf;
	struct btrfs_extent_item *ei;
	struct btrfs_extent_data_ref *dref = NULL;
	struct btrfs_shared_data_ref *sref = NULL;
	unsigned long ptr;
	unsigned long end;
	u32 item_size;
	int size;
	int type;
	u64 refs;

	leaf = path->nodes[0];
	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
	refs = btrfs_extent_refs(leaf, ei);
	WARN_ON(refs_to_mod < 0 && refs + refs_to_mod <= 0);
	refs += refs_to_mod;
	btrfs_set_extent_refs(leaf, ei, refs);
	if (extent_op)
		__run_delayed_extent_op(extent_op, leaf, ei);

	type = btrfs_extent_inline_ref_type(leaf, iref);

	if (type == BTRFS_EXTENT_DATA_REF_KEY) {
		dref = (struct btrfs_extent_data_ref *)(&iref->offset);
		refs = btrfs_extent_data_ref_count(leaf, dref);
	} else if (type == BTRFS_SHARED_DATA_REF_KEY) {
		sref = (struct btrfs_shared_data_ref *)(iref + 1);
		refs = btrfs_shared_data_ref_count(leaf, sref);
	} else {
		refs = 1;
		BUG_ON(refs_to_mod != -1);
	}

	BUG_ON(refs_to_mod < 0 && refs < -refs_to_mod);
	refs += refs_to_mod;

	if (refs > 0) {
		if (type == BTRFS_EXTENT_DATA_REF_KEY)
			btrfs_set_extent_data_ref_count(leaf, dref, refs);
		else
			btrfs_set_shared_data_ref_count(leaf, sref, refs);
	} else {
		size = btrfs_extent_inline_ref_size(type);
		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
		ptr = (unsigned long)iref;
		end = (unsigned long)ei + item_size;
		if (ptr + size < end)
			memmove_extent_buffer(leaf, ptr, ptr + size,
					      end - ptr - size);
		item_size -= size;
		btrfs_truncate_item(root, path, item_size, 1);
	}
	btrfs_mark_buffer_dirty(leaf);
}

static noinline_for_stack
int insert_inline_extent_backref(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root,
				 struct btrfs_path *path,
				 u64 bytenr, u64 num_bytes, u64 parent,
				 u64 root_objectid, u64 owner,
				 u64 offset, int refs_to_add,
				 struct btrfs_delayed_extent_op *extent_op)
{
	struct btrfs_extent_inline_ref *iref;
	int ret;

	ret = lookup_inline_extent_backref(trans, root, path, &iref,
					   bytenr, num_bytes, parent,
					   root_objectid, owner, offset, 1);
	if (ret == 0) {
		BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID);
		update_inline_extent_backref(root, path, iref,
					     refs_to_add, extent_op);
	} else if (ret == -ENOENT) {
		setup_inline_extent_backref(root, path, iref, parent,
					    root_objectid, owner, offset,
					    refs_to_add, extent_op);
		ret = 0;
	}
	return ret;
}

static int insert_extent_backref(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root,
				 struct btrfs_path *path,
				 u64 bytenr, u64 parent, u64 root_objectid,
				 u64 owner, u64 offset, int refs_to_add)
{
	int ret;
	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
		BUG_ON(refs_to_add != 1);
		ret = insert_tree_block_ref(trans, root, path, bytenr,
					    parent, root_objectid);
	} else {
		ret = insert_extent_data_ref(trans, root, path, bytenr,
					     parent, root_objectid,
					     owner, offset, refs_to_add);
	}
	return ret;
}

static int remove_extent_backref(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root,
				 struct btrfs_path *path,
				 struct btrfs_extent_inline_ref *iref,
				 int refs_to_drop, int is_data)
{
	int ret = 0;

	BUG_ON(!is_data && refs_to_drop != 1);
	if (iref) {
		update_inline_extent_backref(root, path, iref,
					     -refs_to_drop, NULL);
	} else if (is_data) {
		ret = remove_extent_data_ref(trans, root, path, refs_to_drop);
	} else {
		ret = btrfs_del_item(trans, root, path);
	}
	return ret;
}

static int btrfs_issue_discard(struct block_device *bdev,
				u64 start, u64 len)
{
	return blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_NOFS, 0);
}
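
/*
 * Added note (not part of the original file): blkdev_issue_discard() takes
 * its start and length in 512-byte sectors, so the helper above shifts the
 * byte values right by 9 (2^9 == 512) before handing them to the block
 * layer.  A minimal sketch of the conversion, with hypothetical byte values:
 *
 *	u64 start_bytes = 1024 * 1024;		// 1MiB
 *	u64 sector = start_bytes >> 9;		// 2048 sectors
 */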
1925 */ 1926 ret = 0; 1927 } 1928 kfree(bbio); 1929 } 1930 1931 if (actual_bytes) 1932 *actual_bytes = discarded_bytes; 1933 1934 1935 if (ret == -EOPNOTSUPP) 1936 ret = 0; 1937 return ret; 1938 } 1939 1940 /* Can return -ENOMEM */ 1941 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, 1942 struct btrfs_root *root, 1943 u64 bytenr, u64 num_bytes, u64 parent, 1944 u64 root_objectid, u64 owner, u64 offset, int for_cow) 1945 { 1946 int ret; 1947 struct btrfs_fs_info *fs_info = root->fs_info; 1948 1949 BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID && 1950 root_objectid == BTRFS_TREE_LOG_OBJECTID); 1951 1952 if (owner < BTRFS_FIRST_FREE_OBJECTID) { 1953 ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr, 1954 num_bytes, 1955 parent, root_objectid, (int)owner, 1956 BTRFS_ADD_DELAYED_REF, NULL, for_cow); 1957 } else { 1958 ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr, 1959 num_bytes, 1960 parent, root_objectid, owner, offset, 1961 BTRFS_ADD_DELAYED_REF, NULL, for_cow); 1962 } 1963 return ret; 1964 } 1965 1966 static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, 1967 struct btrfs_root *root, 1968 u64 bytenr, u64 num_bytes, 1969 u64 parent, u64 root_objectid, 1970 u64 owner, u64 offset, int refs_to_add, 1971 struct btrfs_delayed_extent_op *extent_op) 1972 { 1973 struct btrfs_path *path; 1974 struct extent_buffer *leaf; 1975 struct btrfs_extent_item *item; 1976 u64 refs; 1977 int ret; 1978 1979 path = btrfs_alloc_path(); 1980 if (!path) 1981 return -ENOMEM; 1982 1983 path->reada = 1; 1984 path->leave_spinning = 1; 1985 /* this will setup the path even if it fails to insert the back ref */ 1986 ret = insert_inline_extent_backref(trans, root->fs_info->extent_root, 1987 path, bytenr, num_bytes, parent, 1988 root_objectid, owner, offset, 1989 refs_to_add, extent_op); 1990 if (ret != -EAGAIN) 1991 goto out; 1992 1993 leaf = path->nodes[0]; 1994 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 1995 refs = btrfs_extent_refs(leaf, item); 1996 btrfs_set_extent_refs(leaf, item, refs + refs_to_add); 1997 if (extent_op) 1998 __run_delayed_extent_op(extent_op, leaf, item); 1999 2000 btrfs_mark_buffer_dirty(leaf); 2001 btrfs_release_path(path); 2002 2003 path->reada = 1; 2004 path->leave_spinning = 1; 2005 2006 /* now insert the actual backref */ 2007 ret = insert_extent_backref(trans, root->fs_info->extent_root, 2008 path, bytenr, parent, root_objectid, 2009 owner, offset, refs_to_add); 2010 if (ret) 2011 btrfs_abort_transaction(trans, root, ret); 2012 out: 2013 btrfs_free_path(path); 2014 return ret; 2015 } 2016 2017 static int run_delayed_data_ref(struct btrfs_trans_handle *trans, 2018 struct btrfs_root *root, 2019 struct btrfs_delayed_ref_node *node, 2020 struct btrfs_delayed_extent_op *extent_op, 2021 int insert_reserved) 2022 { 2023 int ret = 0; 2024 struct btrfs_delayed_data_ref *ref; 2025 struct btrfs_key ins; 2026 u64 parent = 0; 2027 u64 ref_root = 0; 2028 u64 flags = 0; 2029 2030 ins.objectid = node->bytenr; 2031 ins.offset = node->num_bytes; 2032 ins.type = BTRFS_EXTENT_ITEM_KEY; 2033 2034 ref = btrfs_delayed_node_to_data_ref(node); 2035 trace_run_delayed_data_ref(node, ref, node->action); 2036 2037 if (node->type == BTRFS_SHARED_DATA_REF_KEY) 2038 parent = ref->parent; 2039 else 2040 ref_root = ref->root; 2041 2042 if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) { 2043 if (extent_op) 2044 flags |= extent_op->flags_to_set; 2045 ret = alloc_reserved_file_extent(trans, root, 2046 parent, ref_root, flags, 2047 ref->objectid, 
ref->offset, 2048 &ins, node->ref_mod); 2049 } else if (node->action == BTRFS_ADD_DELAYED_REF) { 2050 ret = __btrfs_inc_extent_ref(trans, root, node->bytenr, 2051 node->num_bytes, parent, 2052 ref_root, ref->objectid, 2053 ref->offset, node->ref_mod, 2054 extent_op); 2055 } else if (node->action == BTRFS_DROP_DELAYED_REF) { 2056 ret = __btrfs_free_extent(trans, root, node->bytenr, 2057 node->num_bytes, parent, 2058 ref_root, ref->objectid, 2059 ref->offset, node->ref_mod, 2060 extent_op); 2061 } else { 2062 BUG(); 2063 } 2064 return ret; 2065 } 2066 2067 static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op, 2068 struct extent_buffer *leaf, 2069 struct btrfs_extent_item *ei) 2070 { 2071 u64 flags = btrfs_extent_flags(leaf, ei); 2072 if (extent_op->update_flags) { 2073 flags |= extent_op->flags_to_set; 2074 btrfs_set_extent_flags(leaf, ei, flags); 2075 } 2076 2077 if (extent_op->update_key) { 2078 struct btrfs_tree_block_info *bi; 2079 BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)); 2080 bi = (struct btrfs_tree_block_info *)(ei + 1); 2081 btrfs_set_tree_block_key(leaf, bi, &extent_op->key); 2082 } 2083 } 2084 2085 static int run_delayed_extent_op(struct btrfs_trans_handle *trans, 2086 struct btrfs_root *root, 2087 struct btrfs_delayed_ref_node *node, 2088 struct btrfs_delayed_extent_op *extent_op) 2089 { 2090 struct btrfs_key key; 2091 struct btrfs_path *path; 2092 struct btrfs_extent_item *ei; 2093 struct extent_buffer *leaf; 2094 u32 item_size; 2095 int ret; 2096 int err = 0; 2097 int metadata = !extent_op->is_data; 2098 2099 if (trans->aborted) 2100 return 0; 2101 2102 if (metadata && !btrfs_fs_incompat(root->fs_info, SKINNY_METADATA)) 2103 metadata = 0; 2104 2105 path = btrfs_alloc_path(); 2106 if (!path) 2107 return -ENOMEM; 2108 2109 key.objectid = node->bytenr; 2110 2111 if (metadata) { 2112 key.type = BTRFS_METADATA_ITEM_KEY; 2113 key.offset = extent_op->level; 2114 } else { 2115 key.type = BTRFS_EXTENT_ITEM_KEY; 2116 key.offset = node->num_bytes; 2117 } 2118 2119 again: 2120 path->reada = 1; 2121 path->leave_spinning = 1; 2122 ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, 2123 path, 0, 1); 2124 if (ret < 0) { 2125 err = ret; 2126 goto out; 2127 } 2128 if (ret > 0) { 2129 if (metadata) { 2130 if (path->slots[0] > 0) { 2131 path->slots[0]--; 2132 btrfs_item_key_to_cpu(path->nodes[0], &key, 2133 path->slots[0]); 2134 if (key.objectid == node->bytenr && 2135 key.type == BTRFS_EXTENT_ITEM_KEY && 2136 key.offset == node->num_bytes) 2137 ret = 0; 2138 } 2139 if (ret > 0) { 2140 btrfs_release_path(path); 2141 metadata = 0; 2142 2143 key.objectid = node->bytenr; 2144 key.offset = node->num_bytes; 2145 key.type = BTRFS_EXTENT_ITEM_KEY; 2146 goto again; 2147 } 2148 } else { 2149 err = -EIO; 2150 goto out; 2151 } 2152 } 2153 2154 leaf = path->nodes[0]; 2155 item_size = btrfs_item_size_nr(leaf, path->slots[0]); 2156 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 2157 if (item_size < sizeof(*ei)) { 2158 ret = convert_extent_item_v0(trans, root->fs_info->extent_root, 2159 path, (u64)-1, 0); 2160 if (ret < 0) { 2161 err = ret; 2162 goto out; 2163 } 2164 leaf = path->nodes[0]; 2165 item_size = btrfs_item_size_nr(leaf, path->slots[0]); 2166 } 2167 #endif 2168 BUG_ON(item_size < sizeof(*ei)); 2169 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 2170 __run_delayed_extent_op(extent_op, leaf, ei); 2171 2172 btrfs_mark_buffer_dirty(leaf); 2173 out: 2174 btrfs_free_path(path); 2175 return err; 2176 } 2177 2178 static int run_delayed_tree_ref(struct 
btrfs_trans_handle *trans, 2179 struct btrfs_root *root, 2180 struct btrfs_delayed_ref_node *node, 2181 struct btrfs_delayed_extent_op *extent_op, 2182 int insert_reserved) 2183 { 2184 int ret = 0; 2185 struct btrfs_delayed_tree_ref *ref; 2186 struct btrfs_key ins; 2187 u64 parent = 0; 2188 u64 ref_root = 0; 2189 bool skinny_metadata = btrfs_fs_incompat(root->fs_info, 2190 SKINNY_METADATA); 2191 2192 ref = btrfs_delayed_node_to_tree_ref(node); 2193 trace_run_delayed_tree_ref(node, ref, node->action); 2194 2195 if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) 2196 parent = ref->parent; 2197 else 2198 ref_root = ref->root; 2199 2200 ins.objectid = node->bytenr; 2201 if (skinny_metadata) { 2202 ins.offset = ref->level; 2203 ins.type = BTRFS_METADATA_ITEM_KEY; 2204 } else { 2205 ins.offset = node->num_bytes; 2206 ins.type = BTRFS_EXTENT_ITEM_KEY; 2207 } 2208 2209 BUG_ON(node->ref_mod != 1); 2210 if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) { 2211 BUG_ON(!extent_op || !extent_op->update_flags); 2212 ret = alloc_reserved_tree_block(trans, root, 2213 parent, ref_root, 2214 extent_op->flags_to_set, 2215 &extent_op->key, 2216 ref->level, &ins); 2217 } else if (node->action == BTRFS_ADD_DELAYED_REF) { 2218 ret = __btrfs_inc_extent_ref(trans, root, node->bytenr, 2219 node->num_bytes, parent, ref_root, 2220 ref->level, 0, 1, extent_op); 2221 } else if (node->action == BTRFS_DROP_DELAYED_REF) { 2222 ret = __btrfs_free_extent(trans, root, node->bytenr, 2223 node->num_bytes, parent, ref_root, 2224 ref->level, 0, 1, extent_op); 2225 } else { 2226 BUG(); 2227 } 2228 return ret; 2229 } 2230 2231 /* helper function to actually process a single delayed ref entry */ 2232 static int run_one_delayed_ref(struct btrfs_trans_handle *trans, 2233 struct btrfs_root *root, 2234 struct btrfs_delayed_ref_node *node, 2235 struct btrfs_delayed_extent_op *extent_op, 2236 int insert_reserved) 2237 { 2238 int ret = 0; 2239 2240 if (trans->aborted) { 2241 if (insert_reserved) 2242 btrfs_pin_extent(root, node->bytenr, 2243 node->num_bytes, 1); 2244 return 0; 2245 } 2246 2247 if (btrfs_delayed_ref_is_head(node)) { 2248 struct btrfs_delayed_ref_head *head; 2249 /* 2250 * we've hit the end of the chain and we were supposed 2251 * to insert this extent into the tree. But, it got 2252 * deleted before we ever needed to insert it, so all 2253 * we have to do is clean up the accounting 2254 */ 2255 BUG_ON(extent_op); 2256 head = btrfs_delayed_node_to_head(node); 2257 trace_run_delayed_ref_head(node, head, node->action); 2258 2259 if (insert_reserved) { 2260 btrfs_pin_extent(root, node->bytenr, 2261 node->num_bytes, 1); 2262 if (head->is_data) { 2263 ret = btrfs_del_csums(trans, root, 2264 node->bytenr, 2265 node->num_bytes); 2266 } 2267 } 2268 return ret; 2269 } 2270 2271 if (node->type == BTRFS_TREE_BLOCK_REF_KEY || 2272 node->type == BTRFS_SHARED_BLOCK_REF_KEY) 2273 ret = run_delayed_tree_ref(trans, root, node, extent_op, 2274 insert_reserved); 2275 else if (node->type == BTRFS_EXTENT_DATA_REF_KEY || 2276 node->type == BTRFS_SHARED_DATA_REF_KEY) 2277 ret = run_delayed_data_ref(trans, root, node, extent_op, 2278 insert_reserved); 2279 else 2280 BUG(); 2281 return ret; 2282 } 2283 2284 static noinline struct btrfs_delayed_ref_node * 2285 select_delayed_ref(struct btrfs_delayed_ref_head *head) 2286 { 2287 struct rb_node *node; 2288 struct btrfs_delayed_ref_node *ref; 2289 int action = BTRFS_ADD_DELAYED_REF; 2290 again: 2291 /* 2292 * select delayed ref of type BTRFS_ADD_DELAYED_REF first. 
2293 * this prevents ref count from going down to zero when 2294 * there still are pending delayed ref. 2295 */ 2296 node = rb_prev(&head->node.rb_node); 2297 while (1) { 2298 if (!node) 2299 break; 2300 ref = rb_entry(node, struct btrfs_delayed_ref_node, 2301 rb_node); 2302 if (ref->bytenr != head->node.bytenr) 2303 break; 2304 if (ref->action == action) 2305 return ref; 2306 node = rb_prev(node); 2307 } 2308 if (action == BTRFS_ADD_DELAYED_REF) { 2309 action = BTRFS_DROP_DELAYED_REF; 2310 goto again; 2311 } 2312 return NULL; 2313 } 2314 2315 /* 2316 * Returns 0 on success or if called with an already aborted transaction. 2317 * Returns -ENOMEM or -EIO on failure and will abort the transaction. 2318 */ 2319 static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, 2320 struct btrfs_root *root, 2321 struct list_head *cluster) 2322 { 2323 struct btrfs_delayed_ref_root *delayed_refs; 2324 struct btrfs_delayed_ref_node *ref; 2325 struct btrfs_delayed_ref_head *locked_ref = NULL; 2326 struct btrfs_delayed_extent_op *extent_op; 2327 struct btrfs_fs_info *fs_info = root->fs_info; 2328 int ret; 2329 int count = 0; 2330 int must_insert_reserved = 0; 2331 2332 delayed_refs = &trans->transaction->delayed_refs; 2333 while (1) { 2334 if (!locked_ref) { 2335 /* pick a new head ref from the cluster list */ 2336 if (list_empty(cluster)) 2337 break; 2338 2339 locked_ref = list_entry(cluster->next, 2340 struct btrfs_delayed_ref_head, cluster); 2341 2342 /* grab the lock that says we are going to process 2343 * all the refs for this head */ 2344 ret = btrfs_delayed_ref_lock(trans, locked_ref); 2345 2346 /* 2347 * we may have dropped the spin lock to get the head 2348 * mutex lock, and that might have given someone else 2349 * time to free the head. If that's true, it has been 2350 * removed from our list and we can move on. 2351 */ 2352 if (ret == -EAGAIN) { 2353 locked_ref = NULL; 2354 count++; 2355 continue; 2356 } 2357 } 2358 2359 /* 2360 * We need to try and merge add/drops of the same ref since we 2361 * can run into issues with relocate dropping the implicit ref 2362 * and then it being added back again before the drop can 2363 * finish. If we merged anything we need to re-loop so we can 2364 * get a good ref. 2365 */ 2366 btrfs_merge_delayed_refs(trans, fs_info, delayed_refs, 2367 locked_ref); 2368 2369 /* 2370 * locked_ref is the head node, so we have to go one 2371 * node back for any delayed ref updates 2372 */ 2373 ref = select_delayed_ref(locked_ref); 2374 2375 if (ref && ref->seq && 2376 btrfs_check_delayed_seq(fs_info, delayed_refs, ref->seq)) { 2377 /* 2378 * there are still refs with lower seq numbers in the 2379 * process of being added. Don't run this ref yet. 2380 */ 2381 list_del_init(&locked_ref->cluster); 2382 btrfs_delayed_ref_unlock(locked_ref); 2383 locked_ref = NULL; 2384 delayed_refs->num_heads_ready++; 2385 spin_unlock(&delayed_refs->lock); 2386 cond_resched(); 2387 spin_lock(&delayed_refs->lock); 2388 continue; 2389 } 2390 2391 /* 2392 * record the must insert reserved flag before we 2393 * drop the spin lock. 
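* must_insert_reserved on the head means the extent was reserved when it
* was allocated and its extent item still needs to be inserted when the
* head is run; the error paths below put the flag back so the transaction
* abort code can clean up the reserved space.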
2394 */ 2395 must_insert_reserved = locked_ref->must_insert_reserved; 2396 locked_ref->must_insert_reserved = 0; 2397 2398 extent_op = locked_ref->extent_op; 2399 locked_ref->extent_op = NULL; 2400 2401 if (!ref) { 2402 /* All delayed refs have been processed, Go ahead 2403 * and send the head node to run_one_delayed_ref, 2404 * so that any accounting fixes can happen 2405 */ 2406 ref = &locked_ref->node; 2407 2408 if (extent_op && must_insert_reserved) { 2409 btrfs_free_delayed_extent_op(extent_op); 2410 extent_op = NULL; 2411 } 2412 2413 if (extent_op) { 2414 spin_unlock(&delayed_refs->lock); 2415 2416 ret = run_delayed_extent_op(trans, root, 2417 ref, extent_op); 2418 btrfs_free_delayed_extent_op(extent_op); 2419 2420 if (ret) { 2421 /* 2422 * Need to reset must_insert_reserved if 2423 * there was an error so the abort stuff 2424 * can cleanup the reserved space 2425 * properly. 2426 */ 2427 if (must_insert_reserved) 2428 locked_ref->must_insert_reserved = 1; 2429 btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret); 2430 spin_lock(&delayed_refs->lock); 2431 btrfs_delayed_ref_unlock(locked_ref); 2432 return ret; 2433 } 2434 2435 goto next; 2436 } 2437 } 2438 2439 ref->in_tree = 0; 2440 rb_erase(&ref->rb_node, &delayed_refs->root); 2441 delayed_refs->num_entries--; 2442 if (!btrfs_delayed_ref_is_head(ref)) { 2443 /* 2444 * when we play the delayed ref, also correct the 2445 * ref_mod on head 2446 */ 2447 switch (ref->action) { 2448 case BTRFS_ADD_DELAYED_REF: 2449 case BTRFS_ADD_DELAYED_EXTENT: 2450 locked_ref->node.ref_mod -= ref->ref_mod; 2451 break; 2452 case BTRFS_DROP_DELAYED_REF: 2453 locked_ref->node.ref_mod += ref->ref_mod; 2454 break; 2455 default: 2456 WARN_ON(1); 2457 } 2458 } else { 2459 list_del_init(&locked_ref->cluster); 2460 } 2461 spin_unlock(&delayed_refs->lock); 2462 2463 ret = run_one_delayed_ref(trans, root, ref, extent_op, 2464 must_insert_reserved); 2465 2466 btrfs_free_delayed_extent_op(extent_op); 2467 if (ret) { 2468 btrfs_delayed_ref_unlock(locked_ref); 2469 btrfs_put_delayed_ref(ref); 2470 btrfs_debug(fs_info, "run_one_delayed_ref returned %d", ret); 2471 spin_lock(&delayed_refs->lock); 2472 return ret; 2473 } 2474 2475 /* 2476 * If this node is a head, that means all the refs in this head 2477 * have been dealt with, and we will pick the next head to deal 2478 * with, so we must unlock the head and drop it from the cluster 2479 * list before we release it. 2480 */ 2481 if (btrfs_delayed_ref_is_head(ref)) { 2482 btrfs_delayed_ref_unlock(locked_ref); 2483 locked_ref = NULL; 2484 } 2485 btrfs_put_delayed_ref(ref); 2486 count++; 2487 next: 2488 cond_resched(); 2489 spin_lock(&delayed_refs->lock); 2490 } 2491 return count; 2492 } 2493 2494 #ifdef SCRAMBLE_DELAYED_REFS 2495 /* 2496 * Normally delayed refs get processed in ascending bytenr order. This 2497 * correlates in most cases to the order added. 
To expose dependencies on this 2498 * order, we start to process the tree in the middle instead of the beginning. 2499 */ 2500 static u64 find_middle(struct rb_root *root) 2501 { 2502 struct rb_node *n = root->rb_node; 2503 struct btrfs_delayed_ref_node *entry; 2504 int alt = 1; 2505 u64 middle; 2506 u64 first = 0, last = 0; 2507 2508 n = rb_first(root); 2509 if (n) { 2510 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node); 2511 first = entry->bytenr; 2512 } 2513 n = rb_last(root); 2514 if (n) { 2515 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node); 2516 last = entry->bytenr; 2517 } 2518 n = root->rb_node; 2519 2520 while (n) { 2521 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node); 2522 WARN_ON(!entry->in_tree); 2523 2524 middle = entry->bytenr; 2525 2526 if (alt) 2527 n = n->rb_left; 2528 else 2529 n = n->rb_right; 2530 2531 alt = 1 - alt; 2532 } 2533 return middle; 2534 } 2535 #endif 2536 2537 int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans, 2538 struct btrfs_fs_info *fs_info) 2539 { 2540 struct qgroup_update *qgroup_update; 2541 int ret = 0; 2542 2543 if (list_empty(&trans->qgroup_ref_list) != 2544 !trans->delayed_ref_elem.seq) { 2545 /* list without seq or seq without list */ 2546 btrfs_err(fs_info, 2547 "qgroup accounting update error, list is%s empty, seq is %#x.%x", 2548 list_empty(&trans->qgroup_ref_list) ? "" : " not", 2549 (u32)(trans->delayed_ref_elem.seq >> 32), 2550 (u32)trans->delayed_ref_elem.seq); 2551 BUG(); 2552 } 2553 2554 if (!trans->delayed_ref_elem.seq) 2555 return 0; 2556 2557 while (!list_empty(&trans->qgroup_ref_list)) { 2558 qgroup_update = list_first_entry(&trans->qgroup_ref_list, 2559 struct qgroup_update, list); 2560 list_del(&qgroup_update->list); 2561 if (!ret) 2562 ret = btrfs_qgroup_account_ref( 2563 trans, fs_info, qgroup_update->node, 2564 qgroup_update->extent_op); 2565 kfree(qgroup_update); 2566 } 2567 2568 btrfs_put_tree_mod_seq(fs_info, &trans->delayed_ref_elem); 2569 2570 return ret; 2571 } 2572 2573 static int refs_newer(struct btrfs_delayed_ref_root *delayed_refs, int seq, 2574 int count) 2575 { 2576 int val = atomic_read(&delayed_refs->ref_seq); 2577 2578 if (val < seq || val >= seq + count) 2579 return 1; 2580 return 0; 2581 } 2582 2583 static inline u64 heads_to_leaves(struct btrfs_root *root, u64 heads) 2584 { 2585 u64 num_bytes; 2586 2587 num_bytes = heads * (sizeof(struct btrfs_extent_item) + 2588 sizeof(struct btrfs_extent_inline_ref)); 2589 if (!btrfs_fs_incompat(root->fs_info, SKINNY_METADATA)) 2590 num_bytes += heads * sizeof(struct btrfs_tree_block_info); 2591 2592 /* 2593 * We don't ever fill up leaves all the way so multiply by 2 just to be 2594 * closer to what we're really going to want to use. 2595 */ 2596 return div64_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(root)); 2597 } 2598 2599 int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans, 2600 struct btrfs_root *root) 2601 { 2602 struct btrfs_block_rsv *global_rsv; 2603 u64 num_heads = trans->transaction->delayed_refs.num_heads_ready; 2604 u64 num_bytes; 2605 int ret = 0; 2606 2607 num_bytes = btrfs_calc_trans_metadata_size(root, 1); 2608 num_heads = heads_to_leaves(root, num_heads); 2609 if (num_heads > 1) 2610 num_bytes += (num_heads - 1) * root->leafsize; 2611 num_bytes <<= 1; 2612 global_rsv = &root->fs_info->global_block_rsv; 2613 2614 /* 2615 * If we can't allocate any more chunks let's make sure we have _lots_ of 2616 * wiggle room since running delayed refs can create more delayed refs. 
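* Roughly: num_bytes above is one metadata reservation unit plus one
* leafsize per additional leaf worth of pending heads, doubled for slop
* (and doubled again below when the metadata space is full); we throttle
* once the global reserve holds less than that estimate.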
2617 */ 2618 if (global_rsv->space_info->full) 2619 num_bytes <<= 1; 2620 2621 spin_lock(&global_rsv->lock); 2622 if (global_rsv->reserved <= num_bytes) 2623 ret = 1; 2624 spin_unlock(&global_rsv->lock); 2625 return ret; 2626 } 2627 2628 /* 2629 * this starts processing the delayed reference count updates and 2630 * extent insertions we have queued up so far. count can be 2631 * 0, which means to process everything in the tree at the start 2632 * of the run (but not newly added entries), or it can be some target 2633 * number you'd like to process. 2634 * 2635 * Returns 0 on success or if called with an aborted transaction 2636 * Returns <0 on error and aborts the transaction 2637 */ 2638 int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, 2639 struct btrfs_root *root, unsigned long count) 2640 { 2641 struct rb_node *node; 2642 struct btrfs_delayed_ref_root *delayed_refs; 2643 struct btrfs_delayed_ref_node *ref; 2644 struct list_head cluster; 2645 int ret; 2646 u64 delayed_start; 2647 int run_all = count == (unsigned long)-1; 2648 int run_most = 0; 2649 int loops; 2650 2651 /* We'll clean this up in btrfs_cleanup_transaction */ 2652 if (trans->aborted) 2653 return 0; 2654 2655 if (root == root->fs_info->extent_root) 2656 root = root->fs_info->tree_root; 2657 2658 btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info); 2659 2660 delayed_refs = &trans->transaction->delayed_refs; 2661 INIT_LIST_HEAD(&cluster); 2662 if (count == 0) { 2663 count = delayed_refs->num_entries * 2; 2664 run_most = 1; 2665 } 2666 2667 if (!run_all && !run_most) { 2668 int old; 2669 int seq = atomic_read(&delayed_refs->ref_seq); 2670 2671 progress: 2672 old = atomic_cmpxchg(&delayed_refs->procs_running_refs, 0, 1); 2673 if (old) { 2674 DEFINE_WAIT(__wait); 2675 if (delayed_refs->flushing || 2676 !btrfs_should_throttle_delayed_refs(trans, root)) 2677 return 0; 2678 2679 prepare_to_wait(&delayed_refs->wait, &__wait, 2680 TASK_UNINTERRUPTIBLE); 2681 2682 old = atomic_cmpxchg(&delayed_refs->procs_running_refs, 0, 1); 2683 if (old) { 2684 schedule(); 2685 finish_wait(&delayed_refs->wait, &__wait); 2686 2687 if (!refs_newer(delayed_refs, seq, 256)) 2688 goto progress; 2689 else 2690 return 0; 2691 } else { 2692 finish_wait(&delayed_refs->wait, &__wait); 2693 goto again; 2694 } 2695 } 2696 2697 } else { 2698 atomic_inc(&delayed_refs->procs_running_refs); 2699 } 2700 2701 again: 2702 loops = 0; 2703 spin_lock(&delayed_refs->lock); 2704 2705 #ifdef SCRAMBLE_DELAYED_REFS 2706 delayed_refs->run_delayed_start = find_middle(&delayed_refs->root); 2707 #endif 2708 2709 while (1) { 2710 if (!(run_all || run_most) && 2711 !btrfs_should_throttle_delayed_refs(trans, root)) 2712 break; 2713 2714 /* 2715 * go find something we can process in the rbtree. 
We start at 2716 * the beginning of the tree, and then build a cluster 2717 * of refs to process starting at the first one we are able to 2718 * lock 2719 */ 2720 delayed_start = delayed_refs->run_delayed_start; 2721 ret = btrfs_find_ref_cluster(trans, &cluster, 2722 delayed_refs->run_delayed_start); 2723 if (ret) 2724 break; 2725 2726 ret = run_clustered_refs(trans, root, &cluster); 2727 if (ret < 0) { 2728 btrfs_release_ref_cluster(&cluster); 2729 spin_unlock(&delayed_refs->lock); 2730 btrfs_abort_transaction(trans, root, ret); 2731 atomic_dec(&delayed_refs->procs_running_refs); 2732 wake_up(&delayed_refs->wait); 2733 return ret; 2734 } 2735 2736 atomic_add(ret, &delayed_refs->ref_seq); 2737 2738 count -= min_t(unsigned long, ret, count); 2739 2740 if (count == 0) 2741 break; 2742 2743 if (delayed_start >= delayed_refs->run_delayed_start) { 2744 if (loops == 0) { 2745 /* 2746 * btrfs_find_ref_cluster looped. let's do one 2747 * more cycle. if we don't run any delayed ref 2748 * during that cycle (because we can't because 2749 * all of them are blocked), bail out. 2750 */ 2751 loops = 1; 2752 } else { 2753 /* 2754 * no runnable refs left, stop trying 2755 */ 2756 BUG_ON(run_all); 2757 break; 2758 } 2759 } 2760 if (ret) { 2761 /* refs were run, let's reset staleness detection */ 2762 loops = 0; 2763 } 2764 } 2765 2766 if (run_all) { 2767 if (!list_empty(&trans->new_bgs)) { 2768 spin_unlock(&delayed_refs->lock); 2769 btrfs_create_pending_block_groups(trans, root); 2770 spin_lock(&delayed_refs->lock); 2771 } 2772 2773 node = rb_first(&delayed_refs->root); 2774 if (!node) 2775 goto out; 2776 count = (unsigned long)-1; 2777 2778 while (node) { 2779 ref = rb_entry(node, struct btrfs_delayed_ref_node, 2780 rb_node); 2781 if (btrfs_delayed_ref_is_head(ref)) { 2782 struct btrfs_delayed_ref_head *head; 2783 2784 head = btrfs_delayed_node_to_head(ref); 2785 atomic_inc(&ref->refs); 2786 2787 spin_unlock(&delayed_refs->lock); 2788 /* 2789 * Mutex was contended, block until it's 2790 * released and try again 2791 */ 2792 mutex_lock(&head->mutex); 2793 mutex_unlock(&head->mutex); 2794 2795 btrfs_put_delayed_ref(ref); 2796 cond_resched(); 2797 goto again; 2798 } 2799 node = rb_next(node); 2800 } 2801 spin_unlock(&delayed_refs->lock); 2802 schedule_timeout(1); 2803 goto again; 2804 } 2805 out: 2806 atomic_dec(&delayed_refs->procs_running_refs); 2807 smp_mb(); 2808 if (waitqueue_active(&delayed_refs->wait)) 2809 wake_up(&delayed_refs->wait); 2810 2811 spin_unlock(&delayed_refs->lock); 2812 assert_qgroups_uptodate(trans); 2813 return 0; 2814 } 2815 2816 int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, 2817 struct btrfs_root *root, 2818 u64 bytenr, u64 num_bytes, u64 flags, 2819 int level, int is_data) 2820 { 2821 struct btrfs_delayed_extent_op *extent_op; 2822 int ret; 2823 2824 extent_op = btrfs_alloc_delayed_extent_op(); 2825 if (!extent_op) 2826 return -ENOMEM; 2827 2828 extent_op->flags_to_set = flags; 2829 extent_op->update_flags = 1; 2830 extent_op->update_key = 0; 2831 extent_op->is_data = is_data ? 
1 : 0; 2832 extent_op->level = level; 2833 2834 ret = btrfs_add_delayed_extent_op(root->fs_info, trans, bytenr, 2835 num_bytes, extent_op); 2836 if (ret) 2837 btrfs_free_delayed_extent_op(extent_op); 2838 return ret; 2839 } 2840 2841 static noinline int check_delayed_ref(struct btrfs_trans_handle *trans, 2842 struct btrfs_root *root, 2843 struct btrfs_path *path, 2844 u64 objectid, u64 offset, u64 bytenr) 2845 { 2846 struct btrfs_delayed_ref_head *head; 2847 struct btrfs_delayed_ref_node *ref; 2848 struct btrfs_delayed_data_ref *data_ref; 2849 struct btrfs_delayed_ref_root *delayed_refs; 2850 struct rb_node *node; 2851 int ret = 0; 2852 2853 ret = -ENOENT; 2854 delayed_refs = &trans->transaction->delayed_refs; 2855 spin_lock(&delayed_refs->lock); 2856 head = btrfs_find_delayed_ref_head(trans, bytenr); 2857 if (!head) 2858 goto out; 2859 2860 if (!mutex_trylock(&head->mutex)) { 2861 atomic_inc(&head->node.refs); 2862 spin_unlock(&delayed_refs->lock); 2863 2864 btrfs_release_path(path); 2865 2866 /* 2867 * Mutex was contended, block until it's released and let 2868 * caller try again 2869 */ 2870 mutex_lock(&head->mutex); 2871 mutex_unlock(&head->mutex); 2872 btrfs_put_delayed_ref(&head->node); 2873 return -EAGAIN; 2874 } 2875 2876 node = rb_prev(&head->node.rb_node); 2877 if (!node) 2878 goto out_unlock; 2879 2880 ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); 2881 2882 if (ref->bytenr != bytenr) 2883 goto out_unlock; 2884 2885 ret = 1; 2886 if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) 2887 goto out_unlock; 2888 2889 data_ref = btrfs_delayed_node_to_data_ref(ref); 2890 2891 node = rb_prev(node); 2892 if (node) { 2893 int seq = ref->seq; 2894 2895 ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); 2896 if (ref->bytenr == bytenr && ref->seq == seq) 2897 goto out_unlock; 2898 } 2899 2900 if (data_ref->root != root->root_key.objectid || 2901 data_ref->objectid != objectid || data_ref->offset != offset) 2902 goto out_unlock; 2903 2904 ret = 0; 2905 out_unlock: 2906 mutex_unlock(&head->mutex); 2907 out: 2908 spin_unlock(&delayed_refs->lock); 2909 return ret; 2910 } 2911 2912 static noinline int check_committed_ref(struct btrfs_trans_handle *trans, 2913 struct btrfs_root *root, 2914 struct btrfs_path *path, 2915 u64 objectid, u64 offset, u64 bytenr) 2916 { 2917 struct btrfs_root *extent_root = root->fs_info->extent_root; 2918 struct extent_buffer *leaf; 2919 struct btrfs_extent_data_ref *ref; 2920 struct btrfs_extent_inline_ref *iref; 2921 struct btrfs_extent_item *ei; 2922 struct btrfs_key key; 2923 u32 item_size; 2924 int ret; 2925 2926 key.objectid = bytenr; 2927 key.offset = (u64)-1; 2928 key.type = BTRFS_EXTENT_ITEM_KEY; 2929 2930 ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); 2931 if (ret < 0) 2932 goto out; 2933 BUG_ON(ret == 0); /* Corruption */ 2934 2935 ret = -ENOENT; 2936 if (path->slots[0] == 0) 2937 goto out; 2938 2939 path->slots[0]--; 2940 leaf = path->nodes[0]; 2941 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 2942 2943 if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY) 2944 goto out; 2945 2946 ret = 1; 2947 item_size = btrfs_item_size_nr(leaf, path->slots[0]); 2948 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 2949 if (item_size < sizeof(*ei)) { 2950 WARN_ON(item_size != sizeof(struct btrfs_extent_item_v0)); 2951 goto out; 2952 } 2953 #endif 2954 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 2955 2956 if (item_size != sizeof(*ei) + 2957 btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY)) 2958 goto out; 
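/*
 * To rule out a cross reference the extent must be newer than the last
 * snapshot of this root and carry exactly one inline EXTENT_DATA_REF
 * backref matching this root, objectid and offset; anything else and we
 * conservatively report that a cross reference exists.
 */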
2959 2960 if (btrfs_extent_generation(leaf, ei) <= 2961 btrfs_root_last_snapshot(&root->root_item)) 2962 goto out; 2963 2964 iref = (struct btrfs_extent_inline_ref *)(ei + 1); 2965 if (btrfs_extent_inline_ref_type(leaf, iref) != 2966 BTRFS_EXTENT_DATA_REF_KEY) 2967 goto out; 2968 2969 ref = (struct btrfs_extent_data_ref *)(&iref->offset); 2970 if (btrfs_extent_refs(leaf, ei) != 2971 btrfs_extent_data_ref_count(leaf, ref) || 2972 btrfs_extent_data_ref_root(leaf, ref) != 2973 root->root_key.objectid || 2974 btrfs_extent_data_ref_objectid(leaf, ref) != objectid || 2975 btrfs_extent_data_ref_offset(leaf, ref) != offset) 2976 goto out; 2977 2978 ret = 0; 2979 out: 2980 return ret; 2981 } 2982 2983 int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans, 2984 struct btrfs_root *root, 2985 u64 objectid, u64 offset, u64 bytenr) 2986 { 2987 struct btrfs_path *path; 2988 int ret; 2989 int ret2; 2990 2991 path = btrfs_alloc_path(); 2992 if (!path) 2993 return -ENOENT; 2994 2995 do { 2996 ret = check_committed_ref(trans, root, path, objectid, 2997 offset, bytenr); 2998 if (ret && ret != -ENOENT) 2999 goto out; 3000 3001 ret2 = check_delayed_ref(trans, root, path, objectid, 3002 offset, bytenr); 3003 } while (ret2 == -EAGAIN); 3004 3005 if (ret2 && ret2 != -ENOENT) { 3006 ret = ret2; 3007 goto out; 3008 } 3009 3010 if (ret != -ENOENT || ret2 != -ENOENT) 3011 ret = 0; 3012 out: 3013 btrfs_free_path(path); 3014 if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID) 3015 WARN_ON(ret > 0); 3016 return ret; 3017 } 3018 3019 static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, 3020 struct btrfs_root *root, 3021 struct extent_buffer *buf, 3022 int full_backref, int inc, int for_cow) 3023 { 3024 u64 bytenr; 3025 u64 num_bytes; 3026 u64 parent; 3027 u64 ref_root; 3028 u32 nritems; 3029 struct btrfs_key key; 3030 struct btrfs_file_extent_item *fi; 3031 int i; 3032 int level; 3033 int ret = 0; 3034 int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *, 3035 u64, u64, u64, u64, u64, u64, int); 3036 3037 ref_root = btrfs_header_owner(buf); 3038 nritems = btrfs_header_nritems(buf); 3039 level = btrfs_header_level(buf); 3040 3041 if (!root->ref_cows && level == 0) 3042 return 0; 3043 3044 if (inc) 3045 process_func = btrfs_inc_extent_ref; 3046 else 3047 process_func = btrfs_free_extent; 3048 3049 if (full_backref) 3050 parent = buf->start; 3051 else 3052 parent = 0; 3053 3054 for (i = 0; i < nritems; i++) { 3055 if (level == 0) { 3056 btrfs_item_key_to_cpu(buf, &key, i); 3057 if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY) 3058 continue; 3059 fi = btrfs_item_ptr(buf, i, 3060 struct btrfs_file_extent_item); 3061 if (btrfs_file_extent_type(buf, fi) == 3062 BTRFS_FILE_EXTENT_INLINE) 3063 continue; 3064 bytenr = btrfs_file_extent_disk_bytenr(buf, fi); 3065 if (bytenr == 0) 3066 continue; 3067 3068 num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi); 3069 key.offset -= btrfs_file_extent_offset(buf, fi); 3070 ret = process_func(trans, root, bytenr, num_bytes, 3071 parent, ref_root, key.objectid, 3072 key.offset, for_cow); 3073 if (ret) 3074 goto fail; 3075 } else { 3076 bytenr = btrfs_node_blockptr(buf, i); 3077 num_bytes = btrfs_level_size(root, level - 1); 3078 ret = process_func(trans, root, bytenr, num_bytes, 3079 parent, ref_root, level - 1, 0, 3080 for_cow); 3081 if (ret) 3082 goto fail; 3083 } 3084 } 3085 return 0; 3086 fail: 3087 return ret; 3088 } 3089 3090 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, 3091 struct extent_buffer *buf, int 
full_backref, int for_cow) 3092 { 3093 return __btrfs_mod_ref(trans, root, buf, full_backref, 1, for_cow); 3094 } 3095 3096 int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, 3097 struct extent_buffer *buf, int full_backref, int for_cow) 3098 { 3099 return __btrfs_mod_ref(trans, root, buf, full_backref, 0, for_cow); 3100 } 3101 3102 static int write_one_cache_group(struct btrfs_trans_handle *trans, 3103 struct btrfs_root *root, 3104 struct btrfs_path *path, 3105 struct btrfs_block_group_cache *cache) 3106 { 3107 int ret; 3108 struct btrfs_root *extent_root = root->fs_info->extent_root; 3109 unsigned long bi; 3110 struct extent_buffer *leaf; 3111 3112 ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1); 3113 if (ret < 0) 3114 goto fail; 3115 BUG_ON(ret); /* Corruption */ 3116 3117 leaf = path->nodes[0]; 3118 bi = btrfs_item_ptr_offset(leaf, path->slots[0]); 3119 write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item)); 3120 btrfs_mark_buffer_dirty(leaf); 3121 btrfs_release_path(path); 3122 fail: 3123 if (ret) { 3124 btrfs_abort_transaction(trans, root, ret); 3125 return ret; 3126 } 3127 return 0; 3128 3129 } 3130 3131 static struct btrfs_block_group_cache * 3132 next_block_group(struct btrfs_root *root, 3133 struct btrfs_block_group_cache *cache) 3134 { 3135 struct rb_node *node; 3136 spin_lock(&root->fs_info->block_group_cache_lock); 3137 node = rb_next(&cache->cache_node); 3138 btrfs_put_block_group(cache); 3139 if (node) { 3140 cache = rb_entry(node, struct btrfs_block_group_cache, 3141 cache_node); 3142 btrfs_get_block_group(cache); 3143 } else 3144 cache = NULL; 3145 spin_unlock(&root->fs_info->block_group_cache_lock); 3146 return cache; 3147 } 3148 3149 static int cache_save_setup(struct btrfs_block_group_cache *block_group, 3150 struct btrfs_trans_handle *trans, 3151 struct btrfs_path *path) 3152 { 3153 struct btrfs_root *root = block_group->fs_info->tree_root; 3154 struct inode *inode = NULL; 3155 u64 alloc_hint = 0; 3156 int dcs = BTRFS_DC_ERROR; 3157 int num_pages = 0; 3158 int retries = 0; 3159 int ret = 0; 3160 3161 /* 3162 * If this block group is smaller than 100 megs don't bother caching the 3163 * block group. 3164 */ 3165 if (block_group->key.offset < (100 * 1024 * 1024)) { 3166 spin_lock(&block_group->lock); 3167 block_group->disk_cache_state = BTRFS_DC_WRITTEN; 3168 spin_unlock(&block_group->lock); 3169 return 0; 3170 } 3171 3172 again: 3173 inode = lookup_free_space_inode(root, block_group, path); 3174 if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) { 3175 ret = PTR_ERR(inode); 3176 btrfs_release_path(path); 3177 goto out; 3178 } 3179 3180 if (IS_ERR(inode)) { 3181 BUG_ON(retries); 3182 retries++; 3183 3184 if (block_group->ro) 3185 goto out_free; 3186 3187 ret = create_free_space_inode(root, trans, block_group, path); 3188 if (ret) 3189 goto out_free; 3190 goto again; 3191 } 3192 3193 /* We've already setup this transaction, go ahead and exit */ 3194 if (block_group->cache_generation == trans->transid && 3195 i_size_read(inode)) { 3196 dcs = BTRFS_DC_SETUP; 3197 goto out_put; 3198 } 3199 3200 /* 3201 * We want to set the generation to 0, that way if anything goes wrong 3202 * from here on out we know not to trust this cache when we load up next 3203 * time. 
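* (The generation is set back to the current transid at the bottom of this
* function, but only if the setup below succeeds and dcs ends up being
* BTRFS_DC_SETUP.)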
3204 */ 3205 BTRFS_I(inode)->generation = 0; 3206 ret = btrfs_update_inode(trans, root, inode); 3207 WARN_ON(ret); 3208 3209 if (i_size_read(inode) > 0) { 3210 ret = btrfs_check_trunc_cache_free_space(root, 3211 &root->fs_info->global_block_rsv); 3212 if (ret) 3213 goto out_put; 3214 3215 ret = btrfs_truncate_free_space_cache(root, trans, inode); 3216 if (ret) 3217 goto out_put; 3218 } 3219 3220 spin_lock(&block_group->lock); 3221 if (block_group->cached != BTRFS_CACHE_FINISHED || 3222 !btrfs_test_opt(root, SPACE_CACHE)) { 3223 /* 3224 * don't bother trying to write stuff out _if_ 3225 * a) we're not cached, 3226 * b) we're with nospace_cache mount option. 3227 */ 3228 dcs = BTRFS_DC_WRITTEN; 3229 spin_unlock(&block_group->lock); 3230 goto out_put; 3231 } 3232 spin_unlock(&block_group->lock); 3233 3234 /* 3235 * Try to preallocate enough space based on how big the block group is. 3236 * Keep in mind this has to include any pinned space which could end up 3237 * taking up quite a bit since it's not folded into the other space 3238 * cache. 3239 */ 3240 num_pages = (int)div64_u64(block_group->key.offset, 256 * 1024 * 1024); 3241 if (!num_pages) 3242 num_pages = 1; 3243 3244 num_pages *= 16; 3245 num_pages *= PAGE_CACHE_SIZE; 3246 3247 ret = btrfs_check_data_free_space(inode, num_pages); 3248 if (ret) 3249 goto out_put; 3250 3251 ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages, 3252 num_pages, num_pages, 3253 &alloc_hint); 3254 if (!ret) 3255 dcs = BTRFS_DC_SETUP; 3256 btrfs_free_reserved_data_space(inode, num_pages); 3257 3258 out_put: 3259 iput(inode); 3260 out_free: 3261 btrfs_release_path(path); 3262 out: 3263 spin_lock(&block_group->lock); 3264 if (!ret && dcs == BTRFS_DC_SETUP) 3265 block_group->cache_generation = trans->transid; 3266 block_group->disk_cache_state = dcs; 3267 spin_unlock(&block_group->lock); 3268 3269 return ret; 3270 } 3271 3272 int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, 3273 struct btrfs_root *root) 3274 { 3275 struct btrfs_block_group_cache *cache; 3276 int err = 0; 3277 struct btrfs_path *path; 3278 u64 last = 0; 3279 3280 path = btrfs_alloc_path(); 3281 if (!path) 3282 return -ENOMEM; 3283 3284 again: 3285 while (1) { 3286 cache = btrfs_lookup_first_block_group(root->fs_info, last); 3287 while (cache) { 3288 if (cache->disk_cache_state == BTRFS_DC_CLEAR) 3289 break; 3290 cache = next_block_group(root, cache); 3291 } 3292 if (!cache) { 3293 if (last == 0) 3294 break; 3295 last = 0; 3296 continue; 3297 } 3298 err = cache_save_setup(cache, trans, path); 3299 last = cache->key.objectid + cache->key.offset; 3300 btrfs_put_block_group(cache); 3301 } 3302 3303 while (1) { 3304 if (last == 0) { 3305 err = btrfs_run_delayed_refs(trans, root, 3306 (unsigned long)-1); 3307 if (err) /* File system offline */ 3308 goto out; 3309 } 3310 3311 cache = btrfs_lookup_first_block_group(root->fs_info, last); 3312 while (cache) { 3313 if (cache->disk_cache_state == BTRFS_DC_CLEAR) { 3314 btrfs_put_block_group(cache); 3315 goto again; 3316 } 3317 3318 if (cache->dirty) 3319 break; 3320 cache = next_block_group(root, cache); 3321 } 3322 if (!cache) { 3323 if (last == 0) 3324 break; 3325 last = 0; 3326 continue; 3327 } 3328 3329 if (cache->disk_cache_state == BTRFS_DC_SETUP) 3330 cache->disk_cache_state = BTRFS_DC_NEED_WRITE; 3331 cache->dirty = 0; 3332 last = cache->key.objectid + cache->key.offset; 3333 3334 err = write_one_cache_group(trans, root, path, cache); 3335 btrfs_put_block_group(cache); 3336 if (err) /* File system offline */ 3337 
goto out; 3338 } 3339 3340 while (1) { 3341 /* 3342 * I don't think this is needed since we're just marking our 3343 * preallocated extent as written, but just in case it can't 3344 * hurt. 3345 */ 3346 if (last == 0) { 3347 err = btrfs_run_delayed_refs(trans, root, 3348 (unsigned long)-1); 3349 if (err) /* File system offline */ 3350 goto out; 3351 } 3352 3353 cache = btrfs_lookup_first_block_group(root->fs_info, last); 3354 while (cache) { 3355 /* 3356 * Really this shouldn't happen, but it could if we 3357 * couldn't write the entire preallocated extent and 3358 * splitting the extent resulted in a new block. 3359 */ 3360 if (cache->dirty) { 3361 btrfs_put_block_group(cache); 3362 goto again; 3363 } 3364 if (cache->disk_cache_state == BTRFS_DC_NEED_WRITE) 3365 break; 3366 cache = next_block_group(root, cache); 3367 } 3368 if (!cache) { 3369 if (last == 0) 3370 break; 3371 last = 0; 3372 continue; 3373 } 3374 3375 err = btrfs_write_out_cache(root, trans, cache, path); 3376 3377 /* 3378 * If we didn't have an error then the cache state is still 3379 * NEED_WRITE, so we can set it to WRITTEN. 3380 */ 3381 if (!err && cache->disk_cache_state == BTRFS_DC_NEED_WRITE) 3382 cache->disk_cache_state = BTRFS_DC_WRITTEN; 3383 last = cache->key.objectid + cache->key.offset; 3384 btrfs_put_block_group(cache); 3385 } 3386 out: 3387 3388 btrfs_free_path(path); 3389 return err; 3390 } 3391 3392 int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr) 3393 { 3394 struct btrfs_block_group_cache *block_group; 3395 int readonly = 0; 3396 3397 block_group = btrfs_lookup_block_group(root->fs_info, bytenr); 3398 if (!block_group || block_group->ro) 3399 readonly = 1; 3400 if (block_group) 3401 btrfs_put_block_group(block_group); 3402 return readonly; 3403 } 3404 3405 static int update_space_info(struct btrfs_fs_info *info, u64 flags, 3406 u64 total_bytes, u64 bytes_used, 3407 struct btrfs_space_info **space_info) 3408 { 3409 struct btrfs_space_info *found; 3410 int i; 3411 int factor; 3412 int ret; 3413 3414 if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | 3415 BTRFS_BLOCK_GROUP_RAID10)) 3416 factor = 2; 3417 else 3418 factor = 1; 3419 3420 found = __find_space_info(info, flags); 3421 if (found) { 3422 spin_lock(&found->lock); 3423 found->total_bytes += total_bytes; 3424 found->disk_total += total_bytes * factor; 3425 found->bytes_used += bytes_used; 3426 found->disk_used += bytes_used * factor; 3427 found->full = 0; 3428 spin_unlock(&found->lock); 3429 *space_info = found; 3430 return 0; 3431 } 3432 found = kzalloc(sizeof(*found), GFP_NOFS); 3433 if (!found) 3434 return -ENOMEM; 3435 3436 ret = percpu_counter_init(&found->total_bytes_pinned, 0); 3437 if (ret) { 3438 kfree(found); 3439 return ret; 3440 } 3441 3442 for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) 3443 INIT_LIST_HEAD(&found->block_groups[i]); 3444 init_rwsem(&found->groups_sem); 3445 spin_lock_init(&found->lock); 3446 found->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK; 3447 found->total_bytes = total_bytes; 3448 found->disk_total = total_bytes * factor; 3449 found->bytes_used = bytes_used; 3450 found->disk_used = bytes_used * factor; 3451 found->bytes_pinned = 0; 3452 found->bytes_reserved = 0; 3453 found->bytes_readonly = 0; 3454 found->bytes_may_use = 0; 3455 found->full = 0; 3456 found->force_alloc = CHUNK_ALLOC_NO_FORCE; 3457 found->chunk_alloc = 0; 3458 found->flush = 0; 3459 init_waitqueue_head(&found->wait); 3460 *space_info = found; 3461 list_add_rcu(&found->list, &info->space_info); 3462 if (flags & BTRFS_BLOCK_GROUP_DATA) 
3463 info->data_sinfo = found; 3464 return 0; 3465 } 3466 3467 static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) 3468 { 3469 u64 extra_flags = chunk_to_extended(flags) & 3470 BTRFS_EXTENDED_PROFILE_MASK; 3471 3472 write_seqlock(&fs_info->profiles_lock); 3473 if (flags & BTRFS_BLOCK_GROUP_DATA) 3474 fs_info->avail_data_alloc_bits |= extra_flags; 3475 if (flags & BTRFS_BLOCK_GROUP_METADATA) 3476 fs_info->avail_metadata_alloc_bits |= extra_flags; 3477 if (flags & BTRFS_BLOCK_GROUP_SYSTEM) 3478 fs_info->avail_system_alloc_bits |= extra_flags; 3479 write_sequnlock(&fs_info->profiles_lock); 3480 } 3481 3482 /* 3483 * returns target flags in extended format or 0 if restripe for this 3484 * chunk_type is not in progress 3485 * 3486 * should be called with either volume_mutex or balance_lock held 3487 */ 3488 static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags) 3489 { 3490 struct btrfs_balance_control *bctl = fs_info->balance_ctl; 3491 u64 target = 0; 3492 3493 if (!bctl) 3494 return 0; 3495 3496 if (flags & BTRFS_BLOCK_GROUP_DATA && 3497 bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) { 3498 target = BTRFS_BLOCK_GROUP_DATA | bctl->data.target; 3499 } else if (flags & BTRFS_BLOCK_GROUP_SYSTEM && 3500 bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) { 3501 target = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target; 3502 } else if (flags & BTRFS_BLOCK_GROUP_METADATA && 3503 bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) { 3504 target = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target; 3505 } 3506 3507 return target; 3508 } 3509 3510 /* 3511 * @flags: available profiles in extended format (see ctree.h) 3512 * 3513 * Returns reduced profile in chunk format. If profile changing is in 3514 * progress (either running or paused) picks the target profile (if it's 3515 * already available), otherwise falls back to plain reducing. 3516 */ 3517 static u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags) 3518 { 3519 /* 3520 * we add in the count of missing devices because we want 3521 * to make sure that any RAID levels on a degraded FS 3522 * continue to be honored. 
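* For example, a two device raid1 filesystem mounted degraded with one
* device missing still sees num_devices == 2 here, so the raid1 bit is not
* stripped by the num_devices == 1 check below.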
3523 */ 3524 u64 num_devices = root->fs_info->fs_devices->rw_devices + 3525 root->fs_info->fs_devices->missing_devices; 3526 u64 target; 3527 u64 tmp; 3528 3529 /* 3530 * see if restripe for this chunk_type is in progress, if so 3531 * try to reduce to the target profile 3532 */ 3533 spin_lock(&root->fs_info->balance_lock); 3534 target = get_restripe_target(root->fs_info, flags); 3535 if (target) { 3536 /* pick target profile only if it's already available */ 3537 if ((flags & target) & BTRFS_EXTENDED_PROFILE_MASK) { 3538 spin_unlock(&root->fs_info->balance_lock); 3539 return extended_to_chunk(target); 3540 } 3541 } 3542 spin_unlock(&root->fs_info->balance_lock); 3543 3544 /* First, mask out the RAID levels which aren't possible */ 3545 if (num_devices == 1) 3546 flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0 | 3547 BTRFS_BLOCK_GROUP_RAID5); 3548 if (num_devices < 3) 3549 flags &= ~BTRFS_BLOCK_GROUP_RAID6; 3550 if (num_devices < 4) 3551 flags &= ~BTRFS_BLOCK_GROUP_RAID10; 3552 3553 tmp = flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID0 | 3554 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID5 | 3555 BTRFS_BLOCK_GROUP_RAID6 | BTRFS_BLOCK_GROUP_RAID10); 3556 flags &= ~tmp; 3557 3558 if (tmp & BTRFS_BLOCK_GROUP_RAID6) 3559 tmp = BTRFS_BLOCK_GROUP_RAID6; 3560 else if (tmp & BTRFS_BLOCK_GROUP_RAID5) 3561 tmp = BTRFS_BLOCK_GROUP_RAID5; 3562 else if (tmp & BTRFS_BLOCK_GROUP_RAID10) 3563 tmp = BTRFS_BLOCK_GROUP_RAID10; 3564 else if (tmp & BTRFS_BLOCK_GROUP_RAID1) 3565 tmp = BTRFS_BLOCK_GROUP_RAID1; 3566 else if (tmp & BTRFS_BLOCK_GROUP_RAID0) 3567 tmp = BTRFS_BLOCK_GROUP_RAID0; 3568 3569 return extended_to_chunk(flags | tmp); 3570 } 3571 3572 static u64 get_alloc_profile(struct btrfs_root *root, u64 flags) 3573 { 3574 unsigned seq; 3575 3576 do { 3577 seq = read_seqbegin(&root->fs_info->profiles_lock); 3578 3579 if (flags & BTRFS_BLOCK_GROUP_DATA) 3580 flags |= root->fs_info->avail_data_alloc_bits; 3581 else if (flags & BTRFS_BLOCK_GROUP_SYSTEM) 3582 flags |= root->fs_info->avail_system_alloc_bits; 3583 else if (flags & BTRFS_BLOCK_GROUP_METADATA) 3584 flags |= root->fs_info->avail_metadata_alloc_bits; 3585 } while (read_seqretry(&root->fs_info->profiles_lock, seq)); 3586 3587 return btrfs_reduce_alloc_profile(root, flags); 3588 } 3589 3590 u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data) 3591 { 3592 u64 flags; 3593 u64 ret; 3594 3595 if (data) 3596 flags = BTRFS_BLOCK_GROUP_DATA; 3597 else if (root == root->fs_info->chunk_root) 3598 flags = BTRFS_BLOCK_GROUP_SYSTEM; 3599 else 3600 flags = BTRFS_BLOCK_GROUP_METADATA; 3601 3602 ret = get_alloc_profile(root, flags); 3603 return ret; 3604 } 3605 3606 /* 3607 * This will check the space that the inode allocates from to make sure we have 3608 * enough space for bytes. 
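* If there is not enough room it may allocate a new data chunk and/or
* commit the transaction to reclaim pinned space before it gives up and
* returns -ENOSPC; on success the bytes are accounted in bytes_may_use.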
3609 */ 3610 int btrfs_check_data_free_space(struct inode *inode, u64 bytes) 3611 { 3612 struct btrfs_space_info *data_sinfo; 3613 struct btrfs_root *root = BTRFS_I(inode)->root; 3614 struct btrfs_fs_info *fs_info = root->fs_info; 3615 u64 used; 3616 int ret = 0, committed = 0, alloc_chunk = 1; 3617 3618 /* make sure bytes are sectorsize aligned */ 3619 bytes = ALIGN(bytes, root->sectorsize); 3620 3621 if (btrfs_is_free_space_inode(inode)) { 3622 committed = 1; 3623 ASSERT(current->journal_info); 3624 } 3625 3626 data_sinfo = fs_info->data_sinfo; 3627 if (!data_sinfo) 3628 goto alloc; 3629 3630 again: 3631 /* make sure we have enough space to handle the data first */ 3632 spin_lock(&data_sinfo->lock); 3633 used = data_sinfo->bytes_used + data_sinfo->bytes_reserved + 3634 data_sinfo->bytes_pinned + data_sinfo->bytes_readonly + 3635 data_sinfo->bytes_may_use; 3636 3637 if (used + bytes > data_sinfo->total_bytes) { 3638 struct btrfs_trans_handle *trans; 3639 3640 /* 3641 * if we don't have enough free bytes in this space then we need 3642 * to alloc a new chunk. 3643 */ 3644 if (!data_sinfo->full && alloc_chunk) { 3645 u64 alloc_target; 3646 3647 data_sinfo->force_alloc = CHUNK_ALLOC_FORCE; 3648 spin_unlock(&data_sinfo->lock); 3649 alloc: 3650 alloc_target = btrfs_get_alloc_profile(root, 1); 3651 /* 3652 * It is ugly that we don't call nolock join 3653 * transaction for the free space inode case here. 3654 * But it is safe because we only do the data space 3655 * reservation for the free space cache in the 3656 * transaction context, the common join transaction 3657 * just increase the counter of the current transaction 3658 * handler, doesn't try to acquire the trans_lock of 3659 * the fs. 3660 */ 3661 trans = btrfs_join_transaction(root); 3662 if (IS_ERR(trans)) 3663 return PTR_ERR(trans); 3664 3665 ret = do_chunk_alloc(trans, root->fs_info->extent_root, 3666 alloc_target, 3667 CHUNK_ALLOC_NO_FORCE); 3668 btrfs_end_transaction(trans, root); 3669 if (ret < 0) { 3670 if (ret != -ENOSPC) 3671 return ret; 3672 else 3673 goto commit_trans; 3674 } 3675 3676 if (!data_sinfo) 3677 data_sinfo = fs_info->data_sinfo; 3678 3679 goto again; 3680 } 3681 3682 /* 3683 * If we don't have enough pinned space to deal with this 3684 * allocation don't bother committing the transaction. 3685 */ 3686 if (percpu_counter_compare(&data_sinfo->total_bytes_pinned, 3687 bytes) < 0) 3688 committed = 1; 3689 spin_unlock(&data_sinfo->lock); 3690 3691 /* commit the current transaction and try again */ 3692 commit_trans: 3693 if (!committed && 3694 !atomic_read(&root->fs_info->open_ioctl_trans)) { 3695 committed = 1; 3696 3697 trans = btrfs_join_transaction(root); 3698 if (IS_ERR(trans)) 3699 return PTR_ERR(trans); 3700 ret = btrfs_commit_transaction(trans, root); 3701 if (ret) 3702 return ret; 3703 goto again; 3704 } 3705 3706 trace_btrfs_space_reservation(root->fs_info, 3707 "space_info:enospc", 3708 data_sinfo->flags, bytes, 1); 3709 return -ENOSPC; 3710 } 3711 data_sinfo->bytes_may_use += bytes; 3712 trace_btrfs_space_reservation(root->fs_info, "space_info", 3713 data_sinfo->flags, bytes, 1); 3714 spin_unlock(&data_sinfo->lock); 3715 3716 return 0; 3717 } 3718 3719 /* 3720 * Called if we need to clear a data reservation for this inode. 
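* This is the counterpart of btrfs_check_data_free_space(): it aligns bytes
* up to the sectorsize and releases them from bytes_may_use.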
3721 */ 3722 void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes) 3723 { 3724 struct btrfs_root *root = BTRFS_I(inode)->root; 3725 struct btrfs_space_info *data_sinfo; 3726 3727 /* make sure bytes are sectorsize aligned */ 3728 bytes = ALIGN(bytes, root->sectorsize); 3729 3730 data_sinfo = root->fs_info->data_sinfo; 3731 spin_lock(&data_sinfo->lock); 3732 WARN_ON(data_sinfo->bytes_may_use < bytes); 3733 data_sinfo->bytes_may_use -= bytes; 3734 trace_btrfs_space_reservation(root->fs_info, "space_info", 3735 data_sinfo->flags, bytes, 0); 3736 spin_unlock(&data_sinfo->lock); 3737 } 3738 3739 static void force_metadata_allocation(struct btrfs_fs_info *info) 3740 { 3741 struct list_head *head = &info->space_info; 3742 struct btrfs_space_info *found; 3743 3744 rcu_read_lock(); 3745 list_for_each_entry_rcu(found, head, list) { 3746 if (found->flags & BTRFS_BLOCK_GROUP_METADATA) 3747 found->force_alloc = CHUNK_ALLOC_FORCE; 3748 } 3749 rcu_read_unlock(); 3750 } 3751 3752 static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global) 3753 { 3754 return (global->size << 1); 3755 } 3756 3757 static int should_alloc_chunk(struct btrfs_root *root, 3758 struct btrfs_space_info *sinfo, int force) 3759 { 3760 struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv; 3761 u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly; 3762 u64 num_allocated = sinfo->bytes_used + sinfo->bytes_reserved; 3763 u64 thresh; 3764 3765 if (force == CHUNK_ALLOC_FORCE) 3766 return 1; 3767 3768 /* 3769 * We need to take into account the global rsv because for all intents 3770 * and purposes it's used space. Don't worry about locking the 3771 * global_rsv, it doesn't change except when the transaction commits. 3772 */ 3773 if (sinfo->flags & BTRFS_BLOCK_GROUP_METADATA) 3774 num_allocated += calc_global_rsv_need_space(global_rsv); 3775 3776 /* 3777 * in limited mode, we want to have some free space up to 3778 * about 1% of the FS size. 
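* (Illustrative numbers: on a 1TB filesystem the limited-mode threshold is
* max(64M, ~10G) = ~10G, so a chunk is allocated once the free space left
* in this space_info drops below that.)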
3779 */ 3780 if (force == CHUNK_ALLOC_LIMITED) { 3781 thresh = btrfs_super_total_bytes(root->fs_info->super_copy); 3782 thresh = max_t(u64, 64 * 1024 * 1024, 3783 div_factor_fine(thresh, 1)); 3784 3785 if (num_bytes - num_allocated < thresh) 3786 return 1; 3787 } 3788 3789 if (num_allocated + 2 * 1024 * 1024 < div_factor(num_bytes, 8)) 3790 return 0; 3791 return 1; 3792 } 3793 3794 static u64 get_system_chunk_thresh(struct btrfs_root *root, u64 type) 3795 { 3796 u64 num_dev; 3797 3798 if (type & (BTRFS_BLOCK_GROUP_RAID10 | 3799 BTRFS_BLOCK_GROUP_RAID0 | 3800 BTRFS_BLOCK_GROUP_RAID5 | 3801 BTRFS_BLOCK_GROUP_RAID6)) 3802 num_dev = root->fs_info->fs_devices->rw_devices; 3803 else if (type & BTRFS_BLOCK_GROUP_RAID1) 3804 num_dev = 2; 3805 else 3806 num_dev = 1; /* DUP or single */ 3807 3808 /* metadata for updating devices and chunk tree */ 3809 return btrfs_calc_trans_metadata_size(root, num_dev + 1); 3810 } 3811 3812 static void check_system_chunk(struct btrfs_trans_handle *trans, 3813 struct btrfs_root *root, u64 type) 3814 { 3815 struct btrfs_space_info *info; 3816 u64 left; 3817 u64 thresh; 3818 3819 info = __find_space_info(root->fs_info, BTRFS_BLOCK_GROUP_SYSTEM); 3820 spin_lock(&info->lock); 3821 left = info->total_bytes - info->bytes_used - info->bytes_pinned - 3822 info->bytes_reserved - info->bytes_readonly; 3823 spin_unlock(&info->lock); 3824 3825 thresh = get_system_chunk_thresh(root, type); 3826 if (left < thresh && btrfs_test_opt(root, ENOSPC_DEBUG)) { 3827 btrfs_info(root->fs_info, "left=%llu, need=%llu, flags=%llu", 3828 left, thresh, type); 3829 dump_space_info(info, 0, 0); 3830 } 3831 3832 if (left < thresh) { 3833 u64 flags; 3834 3835 flags = btrfs_get_alloc_profile(root->fs_info->chunk_root, 0); 3836 btrfs_alloc_chunk(trans, root, flags); 3837 } 3838 } 3839 3840 static int do_chunk_alloc(struct btrfs_trans_handle *trans, 3841 struct btrfs_root *extent_root, u64 flags, int force) 3842 { 3843 struct btrfs_space_info *space_info; 3844 struct btrfs_fs_info *fs_info = extent_root->fs_info; 3845 int wait_for_alloc = 0; 3846 int ret = 0; 3847 3848 /* Don't re-enter if we're already allocating a chunk */ 3849 if (trans->allocating_chunk) 3850 return -ENOSPC; 3851 3852 space_info = __find_space_info(extent_root->fs_info, flags); 3853 if (!space_info) { 3854 ret = update_space_info(extent_root->fs_info, flags, 3855 0, 0, &space_info); 3856 BUG_ON(ret); /* -ENOMEM */ 3857 } 3858 BUG_ON(!space_info); /* Logic error */ 3859 3860 again: 3861 spin_lock(&space_info->lock); 3862 if (force < space_info->force_alloc) 3863 force = space_info->force_alloc; 3864 if (space_info->full) { 3865 if (should_alloc_chunk(extent_root, space_info, force)) 3866 ret = -ENOSPC; 3867 else 3868 ret = 0; 3869 spin_unlock(&space_info->lock); 3870 return ret; 3871 } 3872 3873 if (!should_alloc_chunk(extent_root, space_info, force)) { 3874 spin_unlock(&space_info->lock); 3875 return 0; 3876 } else if (space_info->chunk_alloc) { 3877 wait_for_alloc = 1; 3878 } else { 3879 space_info->chunk_alloc = 1; 3880 } 3881 3882 spin_unlock(&space_info->lock); 3883 3884 mutex_lock(&fs_info->chunk_mutex); 3885 3886 /* 3887 * The chunk_mutex is held throughout the entirety of a chunk 3888 * allocation, so once we've acquired the chunk_mutex we know that the 3889 * other guy is done and we need to recheck and see if we should 3890 * allocate. 
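* If we were only waiting on somebody else's allocation (wait_for_alloc),
* we drop the mutex again right below and loop back to 'again' to recheck
* under space_info->lock.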
3891 */ 3892 if (wait_for_alloc) { 3893 mutex_unlock(&fs_info->chunk_mutex); 3894 wait_for_alloc = 0; 3895 goto again; 3896 } 3897 3898 trans->allocating_chunk = true; 3899 3900 /* 3901 * If we have mixed data/metadata chunks we want to make sure we keep 3902 * allocating mixed chunks instead of individual chunks. 3903 */ 3904 if (btrfs_mixed_space_info(space_info)) 3905 flags |= (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA); 3906 3907 /* 3908 * if we're doing a data chunk, go ahead and make sure that 3909 * we keep a reasonable number of metadata chunks allocated in the 3910 * FS as well. 3911 */ 3912 if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) { 3913 fs_info->data_chunk_allocations++; 3914 if (!(fs_info->data_chunk_allocations % 3915 fs_info->metadata_ratio)) 3916 force_metadata_allocation(fs_info); 3917 } 3918 3919 /* 3920 * Check if we have enough space in SYSTEM chunk because we may need 3921 * to update devices. 3922 */ 3923 check_system_chunk(trans, extent_root, flags); 3924 3925 ret = btrfs_alloc_chunk(trans, extent_root, flags); 3926 trans->allocating_chunk = false; 3927 3928 spin_lock(&space_info->lock); 3929 if (ret < 0 && ret != -ENOSPC) 3930 goto out; 3931 if (ret) 3932 space_info->full = 1; 3933 else 3934 ret = 1; 3935 3936 space_info->force_alloc = CHUNK_ALLOC_NO_FORCE; 3937 out: 3938 space_info->chunk_alloc = 0; 3939 spin_unlock(&space_info->lock); 3940 mutex_unlock(&fs_info->chunk_mutex); 3941 return ret; 3942 } 3943 3944 static int can_overcommit(struct btrfs_root *root, 3945 struct btrfs_space_info *space_info, u64 bytes, 3946 enum btrfs_reserve_flush_enum flush) 3947 { 3948 struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv; 3949 u64 profile = btrfs_get_alloc_profile(root, 0); 3950 u64 space_size; 3951 u64 avail; 3952 u64 used; 3953 3954 used = space_info->bytes_used + space_info->bytes_reserved + 3955 space_info->bytes_pinned + space_info->bytes_readonly; 3956 3957 /* 3958 * We only want to allow over committing if we have lots of actual space 3959 * free, but if we don't have enough space to handle the global reserve 3960 * space then we could end up having a real enospc problem when trying 3961 * to allocate a chunk or some other such important allocation. 3962 */ 3963 spin_lock(&global_rsv->lock); 3964 space_size = calc_global_rsv_need_space(global_rsv); 3965 spin_unlock(&global_rsv->lock); 3966 if (used + space_size >= space_info->total_bytes) 3967 return 0; 3968 3969 used += space_info->bytes_may_use; 3970 3971 spin_lock(&root->fs_info->free_chunk_lock); 3972 avail = root->fs_info->free_chunk_space; 3973 spin_unlock(&root->fs_info->free_chunk_lock); 3974 3975 /* 3976 * If we have dup, raid1 or raid10 then only half of the free 3977 * space is actually useable. For raid56, the space info used 3978 * doesn't include the parity drive, so we don't have to 3979 * change the math 3980 */ 3981 if (profile & (BTRFS_BLOCK_GROUP_DUP | 3982 BTRFS_BLOCK_GROUP_RAID1 | 3983 BTRFS_BLOCK_GROUP_RAID10)) 3984 avail >>= 1; 3985 3986 /* 3987 * If we aren't flushing all things, let us overcommit up to 3988 * 1/2th of the space. If we can flush, don't let us overcommit 3989 * too much, let it overcommit up to 1/8 of the space. 
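 * Concretely: BTRFS_RESERVE_FLUSH_ALL keeps only 1/8 of the remaining free
 * chunk space as overcommit headroom (avail >>= 3), everything else keeps
 * 1/2 (avail >>= 1).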
3990 */ 3991 if (flush == BTRFS_RESERVE_FLUSH_ALL) 3992 avail >>= 3; 3993 else 3994 avail >>= 1; 3995 3996 if (used + bytes < space_info->total_bytes + avail) 3997 return 1; 3998 return 0; 3999 } 4000 4001 static void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root, 4002 unsigned long nr_pages) 4003 { 4004 struct super_block *sb = root->fs_info->sb; 4005 4006 if (down_read_trylock(&sb->s_umount)) { 4007 writeback_inodes_sb_nr(sb, nr_pages, WB_REASON_FS_FREE_SPACE); 4008 up_read(&sb->s_umount); 4009 } else { 4010 /* 4011 * We needn't worry the filesystem going from r/w to r/o though 4012 * we don't acquire ->s_umount mutex, because the filesystem 4013 * should guarantee the delalloc inodes list be empty after 4014 * the filesystem is readonly(all dirty pages are written to 4015 * the disk). 4016 */ 4017 btrfs_start_delalloc_roots(root->fs_info, 0); 4018 if (!current->journal_info) 4019 btrfs_wait_ordered_roots(root->fs_info, -1); 4020 } 4021 } 4022 4023 static inline int calc_reclaim_items_nr(struct btrfs_root *root, u64 to_reclaim) 4024 { 4025 u64 bytes; 4026 int nr; 4027 4028 bytes = btrfs_calc_trans_metadata_size(root, 1); 4029 nr = (int)div64_u64(to_reclaim, bytes); 4030 if (!nr) 4031 nr = 1; 4032 return nr; 4033 } 4034 4035 #define EXTENT_SIZE_PER_ITEM (256 * 1024) 4036 4037 /* 4038 * shrink metadata reservation for delalloc 4039 */ 4040 static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig, 4041 bool wait_ordered) 4042 { 4043 struct btrfs_block_rsv *block_rsv; 4044 struct btrfs_space_info *space_info; 4045 struct btrfs_trans_handle *trans; 4046 u64 delalloc_bytes; 4047 u64 max_reclaim; 4048 long time_left; 4049 unsigned long nr_pages; 4050 int loops; 4051 int items; 4052 enum btrfs_reserve_flush_enum flush; 4053 4054 /* Calc the number of the pages we need flush for space reservation */ 4055 items = calc_reclaim_items_nr(root, to_reclaim); 4056 to_reclaim = items * EXTENT_SIZE_PER_ITEM; 4057 4058 trans = (struct btrfs_trans_handle *)current->journal_info; 4059 block_rsv = &root->fs_info->delalloc_block_rsv; 4060 space_info = block_rsv->space_info; 4061 4062 delalloc_bytes = percpu_counter_sum_positive( 4063 &root->fs_info->delalloc_bytes); 4064 if (delalloc_bytes == 0) { 4065 if (trans) 4066 return; 4067 if (wait_ordered) 4068 btrfs_wait_ordered_roots(root->fs_info, items); 4069 return; 4070 } 4071 4072 loops = 0; 4073 while (delalloc_bytes && loops < 3) { 4074 max_reclaim = min(delalloc_bytes, to_reclaim); 4075 nr_pages = max_reclaim >> PAGE_CACHE_SHIFT; 4076 btrfs_writeback_inodes_sb_nr(root, nr_pages); 4077 /* 4078 * We need to wait for the async pages to actually start before 4079 * we do anything. 
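 * We wait below for async_delalloc_pages to drop by at least the nr_pages
 * we just submitted (or hit zero) before checking whether we can now
 * overcommit.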
4080 */ 4081 max_reclaim = atomic_read(&root->fs_info->async_delalloc_pages); 4082 if (!max_reclaim) 4083 goto skip_async; 4084 4085 if (max_reclaim <= nr_pages) 4086 max_reclaim = 0; 4087 else 4088 max_reclaim -= nr_pages; 4089 4090 wait_event(root->fs_info->async_submit_wait, 4091 atomic_read(&root->fs_info->async_delalloc_pages) <= 4092 (int)max_reclaim); 4093 skip_async: 4094 if (!trans) 4095 flush = BTRFS_RESERVE_FLUSH_ALL; 4096 else 4097 flush = BTRFS_RESERVE_NO_FLUSH; 4098 spin_lock(&space_info->lock); 4099 if (can_overcommit(root, space_info, orig, flush)) { 4100 spin_unlock(&space_info->lock); 4101 break; 4102 } 4103 spin_unlock(&space_info->lock); 4104 4105 loops++; 4106 if (wait_ordered && !trans) { 4107 btrfs_wait_ordered_roots(root->fs_info, items); 4108 } else { 4109 time_left = schedule_timeout_killable(1); 4110 if (time_left) 4111 break; 4112 } 4113 delalloc_bytes = percpu_counter_sum_positive( 4114 &root->fs_info->delalloc_bytes); 4115 } 4116 } 4117 4118 /** 4119 * may_commit_transaction - possibly commit the transaction if it's ok to 4120 * @root - the root we're allocating for * @space_info - the space_info we are trying to reserve from 4121 * @bytes - the number of bytes we want to reserve 4122 * @force - force the commit 4123 * 4124 * This will check to make sure that committing the transaction will actually 4125 * get us somewhere and then commit the transaction if it does. Otherwise it 4126 * will return -ENOSPC. 4127 */ 4128 static int may_commit_transaction(struct btrfs_root *root, 4129 struct btrfs_space_info *space_info, 4130 u64 bytes, int force) 4131 { 4132 struct btrfs_block_rsv *delayed_rsv = &root->fs_info->delayed_block_rsv; 4133 struct btrfs_trans_handle *trans; 4134 4135 trans = (struct btrfs_trans_handle *)current->journal_info; 4136 if (trans) 4137 return -EAGAIN; 4138 4139 if (force) 4140 goto commit; 4141 4142 /* See if there is enough pinned space to make this reservation */ 4143 spin_lock(&space_info->lock); 4144 if (percpu_counter_compare(&space_info->total_bytes_pinned, 4145 bytes) >= 0) { 4146 spin_unlock(&space_info->lock); 4147 goto commit; 4148 } 4149 spin_unlock(&space_info->lock); 4150 4151 /* 4152 * See if there is some space in the delayed insertion reservation for 4153 * this reservation.
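 * If that reservation lives in a different space_info, committing cannot
 * free anything useful for us and we simply return -ENOSPC.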
4154 */ 4155 if (space_info != delayed_rsv->space_info) 4156 return -ENOSPC; 4157 4158 spin_lock(&space_info->lock); 4159 spin_lock(&delayed_rsv->lock); 4160 if (percpu_counter_compare(&space_info->total_bytes_pinned, 4161 bytes - delayed_rsv->size) >= 0) { 4162 spin_unlock(&delayed_rsv->lock); 4163 spin_unlock(&space_info->lock); 4164 return -ENOSPC; 4165 } 4166 spin_unlock(&delayed_rsv->lock); 4167 spin_unlock(&space_info->lock); 4168 4169 commit: 4170 trans = btrfs_join_transaction(root); 4171 if (IS_ERR(trans)) 4172 return -ENOSPC; 4173 4174 return btrfs_commit_transaction(trans, root); 4175 } 4176 4177 enum flush_state { 4178 FLUSH_DELAYED_ITEMS_NR = 1, 4179 FLUSH_DELAYED_ITEMS = 2, 4180 FLUSH_DELALLOC = 3, 4181 FLUSH_DELALLOC_WAIT = 4, 4182 ALLOC_CHUNK = 5, 4183 COMMIT_TRANS = 6, 4184 }; 4185 4186 static int flush_space(struct btrfs_root *root, 4187 struct btrfs_space_info *space_info, u64 num_bytes, 4188 u64 orig_bytes, int state) 4189 { 4190 struct btrfs_trans_handle *trans; 4191 int nr; 4192 int ret = 0; 4193 4194 switch (state) { 4195 case FLUSH_DELAYED_ITEMS_NR: 4196 case FLUSH_DELAYED_ITEMS: 4197 if (state == FLUSH_DELAYED_ITEMS_NR) 4198 nr = calc_reclaim_items_nr(root, num_bytes) * 2; 4199 else 4200 nr = -1; 4201 4202 trans = btrfs_join_transaction(root); 4203 if (IS_ERR(trans)) { 4204 ret = PTR_ERR(trans); 4205 break; 4206 } 4207 ret = btrfs_run_delayed_items_nr(trans, root, nr); 4208 btrfs_end_transaction(trans, root); 4209 break; 4210 case FLUSH_DELALLOC: 4211 case FLUSH_DELALLOC_WAIT: 4212 shrink_delalloc(root, num_bytes, orig_bytes, 4213 state == FLUSH_DELALLOC_WAIT); 4214 break; 4215 case ALLOC_CHUNK: 4216 trans = btrfs_join_transaction(root); 4217 if (IS_ERR(trans)) { 4218 ret = PTR_ERR(trans); 4219 break; 4220 } 4221 ret = do_chunk_alloc(trans, root->fs_info->extent_root, 4222 btrfs_get_alloc_profile(root, 0), 4223 CHUNK_ALLOC_NO_FORCE); 4224 btrfs_end_transaction(trans, root); 4225 if (ret == -ENOSPC) 4226 ret = 0; 4227 break; 4228 case COMMIT_TRANS: 4229 ret = may_commit_transaction(root, space_info, orig_bytes, 0); 4230 break; 4231 default: 4232 ret = -ENOSPC; 4233 break; 4234 } 4235 4236 return ret; 4237 } 4238 /** 4239 * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space 4240 * @root - the root we're allocating for 4241 * @block_rsv - the block_rsv we're allocating for 4242 * @orig_bytes - the number of bytes we want 4243 * @flush - whether or not we can flush to make our reservation 4244 * 4245 * This will reserve orgi_bytes number of bytes from the space info associated 4246 * with the block_rsv. If there is not enough space it will make an attempt to 4247 * flush out space to make room. It will do this by flushing delalloc if 4248 * possible or committing the transaction. If flush is 0 then no attempts to 4249 * regain reservations will be made and this will fail if there is not enough 4250 * space already. 4251 */ 4252 static int reserve_metadata_bytes(struct btrfs_root *root, 4253 struct btrfs_block_rsv *block_rsv, 4254 u64 orig_bytes, 4255 enum btrfs_reserve_flush_enum flush) 4256 { 4257 struct btrfs_space_info *space_info = block_rsv->space_info; 4258 u64 used; 4259 u64 num_bytes = orig_bytes; 4260 int flush_state = FLUSH_DELAYED_ITEMS_NR; 4261 int ret = 0; 4262 bool flushing = false; 4263 4264 again: 4265 ret = 0; 4266 spin_lock(&space_info->lock); 4267 /* 4268 * We only want to wait if somebody other than us is flushing and we 4269 * are actually allowed to flush all things. 
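 * space_info->flush is owned by the task doing the flushing; we sleep on
 * space_info->wait until it clears, unless we hold a transaction handle and
 * have to bail out with -EAGAIN instead.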
4270 */ 4271 while (flush == BTRFS_RESERVE_FLUSH_ALL && !flushing && 4272 space_info->flush) { 4273 spin_unlock(&space_info->lock); 4274 /* 4275 * If we have a trans handle we can't wait because the flusher 4276 * may have to commit the transaction, which would mean we would 4277 * deadlock since we are waiting for the flusher to finish, but 4278 * hold the current transaction open. 4279 */ 4280 if (current->journal_info) 4281 return -EAGAIN; 4282 ret = wait_event_killable(space_info->wait, !space_info->flush); 4283 /* Must have been killed, return */ 4284 if (ret) 4285 return -EINTR; 4286 4287 spin_lock(&space_info->lock); 4288 } 4289 4290 ret = -ENOSPC; 4291 used = space_info->bytes_used + space_info->bytes_reserved + 4292 space_info->bytes_pinned + space_info->bytes_readonly + 4293 space_info->bytes_may_use; 4294 4295 /* 4296 * The idea here is that we've not already over-reserved the block group 4297 * then we can go ahead and save our reservation first and then start 4298 * flushing if we need to. Otherwise if we've already overcommitted 4299 * lets start flushing stuff first and then come back and try to make 4300 * our reservation. 4301 */ 4302 if (used <= space_info->total_bytes) { 4303 if (used + orig_bytes <= space_info->total_bytes) { 4304 space_info->bytes_may_use += orig_bytes; 4305 trace_btrfs_space_reservation(root->fs_info, 4306 "space_info", space_info->flags, orig_bytes, 1); 4307 ret = 0; 4308 } else { 4309 /* 4310 * Ok set num_bytes to orig_bytes since we aren't 4311 * overocmmitted, this way we only try and reclaim what 4312 * we need. 4313 */ 4314 num_bytes = orig_bytes; 4315 } 4316 } else { 4317 /* 4318 * Ok we're over committed, set num_bytes to the overcommitted 4319 * amount plus the amount of bytes that we need for this 4320 * reservation. 4321 */ 4322 num_bytes = used - space_info->total_bytes + 4323 (orig_bytes * 2); 4324 } 4325 4326 if (ret && can_overcommit(root, space_info, orig_bytes, flush)) { 4327 space_info->bytes_may_use += orig_bytes; 4328 trace_btrfs_space_reservation(root->fs_info, "space_info", 4329 space_info->flags, orig_bytes, 4330 1); 4331 ret = 0; 4332 } 4333 4334 /* 4335 * Couldn't make our reservation, save our place so while we're trying 4336 * to reclaim space we can actually use it instead of somebody else 4337 * stealing it from us. 4338 * 4339 * We make the other tasks wait for the flush only when we can flush 4340 * all things. 4341 */ 4342 if (ret && flush != BTRFS_RESERVE_NO_FLUSH) { 4343 flushing = true; 4344 space_info->flush = 1; 4345 } 4346 4347 spin_unlock(&space_info->lock); 4348 4349 if (!ret || flush == BTRFS_RESERVE_NO_FLUSH) 4350 goto out; 4351 4352 ret = flush_space(root, space_info, num_bytes, orig_bytes, 4353 flush_state); 4354 flush_state++; 4355 4356 /* 4357 * If we are FLUSH_LIMIT, we can not flush delalloc, or the deadlock 4358 * would happen. So skip delalloc flush. 
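 * In that case jump straight from the delalloc states to ALLOC_CHUNK; note
 * that FLUSH_LIMIT also stops short of COMMIT_TRANS below.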
4359 */ 4360 if (flush == BTRFS_RESERVE_FLUSH_LIMIT && 4361 (flush_state == FLUSH_DELALLOC || 4362 flush_state == FLUSH_DELALLOC_WAIT)) 4363 flush_state = ALLOC_CHUNK; 4364 4365 if (!ret) 4366 goto again; 4367 else if (flush == BTRFS_RESERVE_FLUSH_LIMIT && 4368 flush_state < COMMIT_TRANS) 4369 goto again; 4370 else if (flush == BTRFS_RESERVE_FLUSH_ALL && 4371 flush_state <= COMMIT_TRANS) 4372 goto again; 4373 4374 out: 4375 if (ret == -ENOSPC && 4376 unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) { 4377 struct btrfs_block_rsv *global_rsv = 4378 &root->fs_info->global_block_rsv; 4379 4380 if (block_rsv != global_rsv && 4381 !block_rsv_use_bytes(global_rsv, orig_bytes)) 4382 ret = 0; 4383 } 4384 if (ret == -ENOSPC) 4385 trace_btrfs_space_reservation(root->fs_info, 4386 "space_info:enospc", 4387 space_info->flags, orig_bytes, 1); 4388 if (flushing) { 4389 spin_lock(&space_info->lock); 4390 space_info->flush = 0; 4391 wake_up_all(&space_info->wait); 4392 spin_unlock(&space_info->lock); 4393 } 4394 return ret; 4395 } 4396 4397 static struct btrfs_block_rsv *get_block_rsv( 4398 const struct btrfs_trans_handle *trans, 4399 const struct btrfs_root *root) 4400 { 4401 struct btrfs_block_rsv *block_rsv = NULL; 4402 4403 if (root->ref_cows) 4404 block_rsv = trans->block_rsv; 4405 4406 if (root == root->fs_info->csum_root && trans->adding_csums) 4407 block_rsv = trans->block_rsv; 4408 4409 if (root == root->fs_info->uuid_root) 4410 block_rsv = trans->block_rsv; 4411 4412 if (!block_rsv) 4413 block_rsv = root->block_rsv; 4414 4415 if (!block_rsv) 4416 block_rsv = &root->fs_info->empty_block_rsv; 4417 4418 return block_rsv; 4419 } 4420 4421 static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv, 4422 u64 num_bytes) 4423 { 4424 int ret = -ENOSPC; 4425 spin_lock(&block_rsv->lock); 4426 if (block_rsv->reserved >= num_bytes) { 4427 block_rsv->reserved -= num_bytes; 4428 if (block_rsv->reserved < block_rsv->size) 4429 block_rsv->full = 0; 4430 ret = 0; 4431 } 4432 spin_unlock(&block_rsv->lock); 4433 return ret; 4434 } 4435 4436 static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv, 4437 u64 num_bytes, int update_size) 4438 { 4439 spin_lock(&block_rsv->lock); 4440 block_rsv->reserved += num_bytes; 4441 if (update_size) 4442 block_rsv->size += num_bytes; 4443 else if (block_rsv->reserved >= block_rsv->size) 4444 block_rsv->full = 1; 4445 spin_unlock(&block_rsv->lock); 4446 } 4447 4448 int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info, 4449 struct btrfs_block_rsv *dest, u64 num_bytes, 4450 int min_factor) 4451 { 4452 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; 4453 u64 min_bytes; 4454 4455 if (global_rsv->space_info != dest->space_info) 4456 return -ENOSPC; 4457 4458 spin_lock(&global_rsv->lock); 4459 min_bytes = div_factor(global_rsv->size, min_factor); 4460 if (global_rsv->reserved < min_bytes + num_bytes) { 4461 spin_unlock(&global_rsv->lock); 4462 return -ENOSPC; 4463 } 4464 global_rsv->reserved -= num_bytes; 4465 if (global_rsv->reserved < global_rsv->size) 4466 global_rsv->full = 0; 4467 spin_unlock(&global_rsv->lock); 4468 4469 block_rsv_add_bytes(dest, num_bytes, 1); 4470 return 0; 4471 } 4472 4473 static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info, 4474 struct btrfs_block_rsv *block_rsv, 4475 struct btrfs_block_rsv *dest, u64 num_bytes) 4476 { 4477 struct btrfs_space_info *space_info = block_rsv->space_info; 4478 4479 spin_lock(&block_rsv->lock); 4480 if (num_bytes == (u64)-1) 4481 num_bytes = block_rsv->size; 4482 
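	/*
	 * Shrink the target size first; anything still reserved above the
	 * new size is spilled into @dest below or dropped from the
	 * space_info's bytes_may_use.
	 */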
block_rsv->size -= num_bytes; 4483 if (block_rsv->reserved >= block_rsv->size) { 4484 num_bytes = block_rsv->reserved - block_rsv->size; 4485 block_rsv->reserved = block_rsv->size; 4486 block_rsv->full = 1; 4487 } else { 4488 num_bytes = 0; 4489 } 4490 spin_unlock(&block_rsv->lock); 4491 4492 if (num_bytes > 0) { 4493 if (dest) { 4494 spin_lock(&dest->lock); 4495 if (!dest->full) { 4496 u64 bytes_to_add; 4497 4498 bytes_to_add = dest->size - dest->reserved; 4499 bytes_to_add = min(num_bytes, bytes_to_add); 4500 dest->reserved += bytes_to_add; 4501 if (dest->reserved >= dest->size) 4502 dest->full = 1; 4503 num_bytes -= bytes_to_add; 4504 } 4505 spin_unlock(&dest->lock); 4506 } 4507 if (num_bytes) { 4508 spin_lock(&space_info->lock); 4509 space_info->bytes_may_use -= num_bytes; 4510 trace_btrfs_space_reservation(fs_info, "space_info", 4511 space_info->flags, num_bytes, 0); 4512 spin_unlock(&space_info->lock); 4513 } 4514 } 4515 } 4516 4517 static int block_rsv_migrate_bytes(struct btrfs_block_rsv *src, 4518 struct btrfs_block_rsv *dst, u64 num_bytes) 4519 { 4520 int ret; 4521 4522 ret = block_rsv_use_bytes(src, num_bytes); 4523 if (ret) 4524 return ret; 4525 4526 block_rsv_add_bytes(dst, num_bytes, 1); 4527 return 0; 4528 } 4529 4530 void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type) 4531 { 4532 memset(rsv, 0, sizeof(*rsv)); 4533 spin_lock_init(&rsv->lock); 4534 rsv->type = type; 4535 } 4536 4537 struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root, 4538 unsigned short type) 4539 { 4540 struct btrfs_block_rsv *block_rsv; 4541 struct btrfs_fs_info *fs_info = root->fs_info; 4542 4543 block_rsv = kmalloc(sizeof(*block_rsv), GFP_NOFS); 4544 if (!block_rsv) 4545 return NULL; 4546 4547 btrfs_init_block_rsv(block_rsv, type); 4548 block_rsv->space_info = __find_space_info(fs_info, 4549 BTRFS_BLOCK_GROUP_METADATA); 4550 return block_rsv; 4551 } 4552 4553 void btrfs_free_block_rsv(struct btrfs_root *root, 4554 struct btrfs_block_rsv *rsv) 4555 { 4556 if (!rsv) 4557 return; 4558 btrfs_block_rsv_release(root, rsv, (u64)-1); 4559 kfree(rsv); 4560 } 4561 4562 int btrfs_block_rsv_add(struct btrfs_root *root, 4563 struct btrfs_block_rsv *block_rsv, u64 num_bytes, 4564 enum btrfs_reserve_flush_enum flush) 4565 { 4566 int ret; 4567 4568 if (num_bytes == 0) 4569 return 0; 4570 4571 ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush); 4572 if (!ret) { 4573 block_rsv_add_bytes(block_rsv, num_bytes, 1); 4574 return 0; 4575 } 4576 4577 return ret; 4578 } 4579 4580 int btrfs_block_rsv_check(struct btrfs_root *root, 4581 struct btrfs_block_rsv *block_rsv, int min_factor) 4582 { 4583 u64 num_bytes = 0; 4584 int ret = -ENOSPC; 4585 4586 if (!block_rsv) 4587 return 0; 4588 4589 spin_lock(&block_rsv->lock); 4590 num_bytes = div_factor(block_rsv->size, min_factor); 4591 if (block_rsv->reserved >= num_bytes) 4592 ret = 0; 4593 spin_unlock(&block_rsv->lock); 4594 4595 return ret; 4596 } 4597 4598 int btrfs_block_rsv_refill(struct btrfs_root *root, 4599 struct btrfs_block_rsv *block_rsv, u64 min_reserved, 4600 enum btrfs_reserve_flush_enum flush) 4601 { 4602 u64 num_bytes = 0; 4603 int ret = -ENOSPC; 4604 4605 if (!block_rsv) 4606 return 0; 4607 4608 spin_lock(&block_rsv->lock); 4609 num_bytes = min_reserved; 4610 if (block_rsv->reserved >= num_bytes) 4611 ret = 0; 4612 else 4613 num_bytes -= block_rsv->reserved; 4614 spin_unlock(&block_rsv->lock); 4615 4616 if (!ret) 4617 return 0; 4618 4619 ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush); 4620 if 
(!ret) { 4621 block_rsv_add_bytes(block_rsv, num_bytes, 0); 4622 return 0; 4623 } 4624 4625 return ret; 4626 } 4627 4628 int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, 4629 struct btrfs_block_rsv *dst_rsv, 4630 u64 num_bytes) 4631 { 4632 return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes); 4633 } 4634 4635 void btrfs_block_rsv_release(struct btrfs_root *root, 4636 struct btrfs_block_rsv *block_rsv, 4637 u64 num_bytes) 4638 { 4639 struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv; 4640 if (global_rsv->full || global_rsv == block_rsv || 4641 block_rsv->space_info != global_rsv->space_info) 4642 global_rsv = NULL; 4643 block_rsv_release_bytes(root->fs_info, block_rsv, global_rsv, 4644 num_bytes); 4645 } 4646 4647 /* 4648 * helper to calculate size of global block reservation. 4649 * the desired value is sum of space used by extent tree, 4650 * checksum tree and root tree 4651 */ 4652 static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info) 4653 { 4654 struct btrfs_space_info *sinfo; 4655 u64 num_bytes; 4656 u64 meta_used; 4657 u64 data_used; 4658 int csum_size = btrfs_super_csum_size(fs_info->super_copy); 4659 4660 sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA); 4661 spin_lock(&sinfo->lock); 4662 data_used = sinfo->bytes_used; 4663 spin_unlock(&sinfo->lock); 4664 4665 sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); 4666 spin_lock(&sinfo->lock); 4667 if (sinfo->flags & BTRFS_BLOCK_GROUP_DATA) 4668 data_used = 0; 4669 meta_used = sinfo->bytes_used; 4670 spin_unlock(&sinfo->lock); 4671 4672 num_bytes = (data_used >> fs_info->sb->s_blocksize_bits) * 4673 csum_size * 2; 4674 num_bytes += div64_u64(data_used + meta_used, 50); 4675 4676 if (num_bytes * 3 > meta_used) 4677 num_bytes = div64_u64(meta_used, 3); 4678 4679 return ALIGN(num_bytes, fs_info->extent_root->leafsize << 10); 4680 } 4681 4682 static void update_global_block_rsv(struct btrfs_fs_info *fs_info) 4683 { 4684 struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv; 4685 struct btrfs_space_info *sinfo = block_rsv->space_info; 4686 u64 num_bytes; 4687 4688 num_bytes = calc_global_metadata_size(fs_info); 4689 4690 spin_lock(&sinfo->lock); 4691 spin_lock(&block_rsv->lock); 4692 4693 block_rsv->size = min_t(u64, num_bytes, 512 * 1024 * 1024); 4694 4695 num_bytes = sinfo->bytes_used + sinfo->bytes_pinned + 4696 sinfo->bytes_reserved + sinfo->bytes_readonly + 4697 sinfo->bytes_may_use; 4698 4699 if (sinfo->total_bytes > num_bytes) { 4700 num_bytes = sinfo->total_bytes - num_bytes; 4701 block_rsv->reserved += num_bytes; 4702 sinfo->bytes_may_use += num_bytes; 4703 trace_btrfs_space_reservation(fs_info, "space_info", 4704 sinfo->flags, num_bytes, 1); 4705 } 4706 4707 if (block_rsv->reserved >= block_rsv->size) { 4708 num_bytes = block_rsv->reserved - block_rsv->size; 4709 sinfo->bytes_may_use -= num_bytes; 4710 trace_btrfs_space_reservation(fs_info, "space_info", 4711 sinfo->flags, num_bytes, 0); 4712 block_rsv->reserved = block_rsv->size; 4713 block_rsv->full = 1; 4714 } 4715 4716 spin_unlock(&block_rsv->lock); 4717 spin_unlock(&sinfo->lock); 4718 } 4719 4720 static void init_global_block_rsv(struct btrfs_fs_info *fs_info) 4721 { 4722 struct btrfs_space_info *space_info; 4723 4724 space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM); 4725 fs_info->chunk_block_rsv.space_info = space_info; 4726 4727 space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); 4728 fs_info->global_block_rsv.space_info = space_info; 4729 
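	/*
	 * The delalloc, trans, empty and delayed reservations below share
	 * the same METADATA space_info as the global rsv.
	 */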
fs_info->delalloc_block_rsv.space_info = space_info; 4730 fs_info->trans_block_rsv.space_info = space_info; 4731 fs_info->empty_block_rsv.space_info = space_info; 4732 fs_info->delayed_block_rsv.space_info = space_info; 4733 4734 fs_info->extent_root->block_rsv = &fs_info->global_block_rsv; 4735 fs_info->csum_root->block_rsv = &fs_info->global_block_rsv; 4736 fs_info->dev_root->block_rsv = &fs_info->global_block_rsv; 4737 fs_info->tree_root->block_rsv = &fs_info->global_block_rsv; 4738 if (fs_info->quota_root) 4739 fs_info->quota_root->block_rsv = &fs_info->global_block_rsv; 4740 fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv; 4741 4742 update_global_block_rsv(fs_info); 4743 } 4744 4745 static void release_global_block_rsv(struct btrfs_fs_info *fs_info) 4746 { 4747 block_rsv_release_bytes(fs_info, &fs_info->global_block_rsv, NULL, 4748 (u64)-1); 4749 WARN_ON(fs_info->delalloc_block_rsv.size > 0); 4750 WARN_ON(fs_info->delalloc_block_rsv.reserved > 0); 4751 WARN_ON(fs_info->trans_block_rsv.size > 0); 4752 WARN_ON(fs_info->trans_block_rsv.reserved > 0); 4753 WARN_ON(fs_info->chunk_block_rsv.size > 0); 4754 WARN_ON(fs_info->chunk_block_rsv.reserved > 0); 4755 WARN_ON(fs_info->delayed_block_rsv.size > 0); 4756 WARN_ON(fs_info->delayed_block_rsv.reserved > 0); 4757 } 4758 4759 void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, 4760 struct btrfs_root *root) 4761 { 4762 if (!trans->block_rsv) 4763 return; 4764 4765 if (!trans->bytes_reserved) 4766 return; 4767 4768 trace_btrfs_space_reservation(root->fs_info, "transaction", 4769 trans->transid, trans->bytes_reserved, 0); 4770 btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved); 4771 trans->bytes_reserved = 0; 4772 } 4773 4774 /* Can only return 0 or -ENOSPC */ 4775 int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans, 4776 struct inode *inode) 4777 { 4778 struct btrfs_root *root = BTRFS_I(inode)->root; 4779 struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root); 4780 struct btrfs_block_rsv *dst_rsv = root->orphan_block_rsv; 4781 4782 /* 4783 * We need to hold space in order to delete our orphan item once we've 4784 * added it, so this takes the reservation so we can release it later 4785 * when we are truly done with the orphan item. 4786 */ 4787 u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1); 4788 trace_btrfs_space_reservation(root->fs_info, "orphan", 4789 btrfs_ino(inode), num_bytes, 1); 4790 return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes); 4791 } 4792 4793 void btrfs_orphan_release_metadata(struct inode *inode) 4794 { 4795 struct btrfs_root *root = BTRFS_I(inode)->root; 4796 u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1); 4797 trace_btrfs_space_reservation(root->fs_info, "orphan", 4798 btrfs_ino(inode), num_bytes, 0); 4799 btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes); 4800 } 4801 4802 /* 4803 * btrfs_subvolume_reserve_metadata() - reserve space for subvolume operation 4804 * root: the root of the parent directory 4805 * rsv: block reservation 4806 * items: the number of items that we need do reservation 4807 * qgroup_reserved: used to return the reserved size in qgroup 4808 * 4809 * This function is used to reserve the space for snapshot/subvolume 4810 * creation and deletion. Those operations are different with the 4811 * common file/directory operations, they change two fs/file trees 4812 * and root tree, the number of items that the qgroup reserves is 4813 * different with the free space reservation. 
So we can not use 4814 * the space reseravtion mechanism in start_transaction(). 4815 */ 4816 int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, 4817 struct btrfs_block_rsv *rsv, 4818 int items, 4819 u64 *qgroup_reserved, 4820 bool use_global_rsv) 4821 { 4822 u64 num_bytes; 4823 int ret; 4824 struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv; 4825 4826 if (root->fs_info->quota_enabled) { 4827 /* One for parent inode, two for dir entries */ 4828 num_bytes = 3 * root->leafsize; 4829 ret = btrfs_qgroup_reserve(root, num_bytes); 4830 if (ret) 4831 return ret; 4832 } else { 4833 num_bytes = 0; 4834 } 4835 4836 *qgroup_reserved = num_bytes; 4837 4838 num_bytes = btrfs_calc_trans_metadata_size(root, items); 4839 rsv->space_info = __find_space_info(root->fs_info, 4840 BTRFS_BLOCK_GROUP_METADATA); 4841 ret = btrfs_block_rsv_add(root, rsv, num_bytes, 4842 BTRFS_RESERVE_FLUSH_ALL); 4843 4844 if (ret == -ENOSPC && use_global_rsv) 4845 ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes); 4846 4847 if (ret) { 4848 if (*qgroup_reserved) 4849 btrfs_qgroup_free(root, *qgroup_reserved); 4850 } 4851 4852 return ret; 4853 } 4854 4855 void btrfs_subvolume_release_metadata(struct btrfs_root *root, 4856 struct btrfs_block_rsv *rsv, 4857 u64 qgroup_reserved) 4858 { 4859 btrfs_block_rsv_release(root, rsv, (u64)-1); 4860 if (qgroup_reserved) 4861 btrfs_qgroup_free(root, qgroup_reserved); 4862 } 4863 4864 /** 4865 * drop_outstanding_extent - drop an outstanding extent 4866 * @inode: the inode we're dropping the extent for 4867 * 4868 * This is called when we are freeing up an outstanding extent, either called 4869 * after an error or after an extent is written. This will return the number of 4870 * reserved extents that need to be freed. This must be called with 4871 * BTRFS_I(inode)->lock held. 4872 */ 4873 static unsigned drop_outstanding_extent(struct inode *inode) 4874 { 4875 unsigned drop_inode_space = 0; 4876 unsigned dropped_extents = 0; 4877 4878 BUG_ON(!BTRFS_I(inode)->outstanding_extents); 4879 BTRFS_I(inode)->outstanding_extents--; 4880 4881 if (BTRFS_I(inode)->outstanding_extents == 0 && 4882 test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED, 4883 &BTRFS_I(inode)->runtime_flags)) 4884 drop_inode_space = 1; 4885 4886 /* 4887 * If we have more or the same amount of outsanding extents than we have 4888 * reserved then we need to leave the reserved extents count alone. 4889 */ 4890 if (BTRFS_I(inode)->outstanding_extents >= 4891 BTRFS_I(inode)->reserved_extents) 4892 return drop_inode_space; 4893 4894 dropped_extents = BTRFS_I(inode)->reserved_extents - 4895 BTRFS_I(inode)->outstanding_extents; 4896 BTRFS_I(inode)->reserved_extents -= dropped_extents; 4897 return dropped_extents + drop_inode_space; 4898 } 4899 4900 /** 4901 * calc_csum_metadata_size - return the amount of metada space that must be 4902 * reserved/free'd for the given bytes. 4903 * @inode: the inode we're manipulating 4904 * @num_bytes: the number of bytes in question 4905 * @reserve: 1 if we are reserving space, 0 if we are freeing space 4906 * 4907 * This adjusts the number of csum_bytes in the inode and then returns the 4908 * correct amount of metadata that must either be reserved or freed. We 4909 * calculate how many checksums we can fit into one leaf and then divide the 4910 * number of bytes that will need to be checksumed by this value to figure out 4911 * how many checksums will be required. 
If we are adding bytes then the number 4912 * may go up and we will return the number of additional bytes that must be 4913 * reserved. If it is going down we will return the number of bytes that must 4914 * be freed. 4915 * 4916 * This must be called with BTRFS_I(inode)->lock held. 4917 */ 4918 static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes, 4919 int reserve) 4920 { 4921 struct btrfs_root *root = BTRFS_I(inode)->root; 4922 u64 csum_size; 4923 int num_csums_per_leaf; 4924 int num_csums; 4925 int old_csums; 4926 4927 if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM && 4928 BTRFS_I(inode)->csum_bytes == 0) 4929 return 0; 4930 4931 old_csums = (int)div64_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize); 4932 if (reserve) 4933 BTRFS_I(inode)->csum_bytes += num_bytes; 4934 else 4935 BTRFS_I(inode)->csum_bytes -= num_bytes; 4936 csum_size = BTRFS_LEAF_DATA_SIZE(root) - sizeof(struct btrfs_item); 4937 num_csums_per_leaf = (int)div64_u64(csum_size, 4938 sizeof(struct btrfs_csum_item) + 4939 sizeof(struct btrfs_disk_key)); 4940 num_csums = (int)div64_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize); 4941 num_csums = num_csums + num_csums_per_leaf - 1; 4942 num_csums = num_csums / num_csums_per_leaf; 4943 4944 old_csums = old_csums + num_csums_per_leaf - 1; 4945 old_csums = old_csums / num_csums_per_leaf; 4946 4947 /* No change, no need to reserve more */ 4948 if (old_csums == num_csums) 4949 return 0; 4950 4951 if (reserve) 4952 return btrfs_calc_trans_metadata_size(root, 4953 num_csums - old_csums); 4954 4955 return btrfs_calc_trans_metadata_size(root, old_csums - num_csums); 4956 } 4957 4958 int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) 4959 { 4960 struct btrfs_root *root = BTRFS_I(inode)->root; 4961 struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv; 4962 u64 to_reserve = 0; 4963 u64 csum_bytes; 4964 unsigned nr_extents = 0; 4965 int extra_reserve = 0; 4966 enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL; 4967 int ret = 0; 4968 bool delalloc_lock = true; 4969 u64 to_free = 0; 4970 unsigned dropped; 4971 4972 /* If we are a free space inode we need to not flush since we will be in 4973 * the middle of a transaction commit. We also don't need the delalloc 4974 * mutex since we won't race with anybody. We need this mostly to make 4975 * lockdep shut its filthy mouth. 4976 */ 4977 if (btrfs_is_free_space_inode(inode)) { 4978 flush = BTRFS_RESERVE_NO_FLUSH; 4979 delalloc_lock = false; 4980 } 4981 4982 if (flush != BTRFS_RESERVE_NO_FLUSH && 4983 btrfs_transaction_in_commit(root->fs_info)) 4984 schedule_timeout(1); 4985 4986 if (delalloc_lock) 4987 mutex_lock(&BTRFS_I(inode)->delalloc_mutex); 4988 4989 num_bytes = ALIGN(num_bytes, root->sectorsize); 4990 4991 spin_lock(&BTRFS_I(inode)->lock); 4992 BTRFS_I(inode)->outstanding_extents++; 4993 4994 if (BTRFS_I(inode)->outstanding_extents > 4995 BTRFS_I(inode)->reserved_extents) 4996 nr_extents = BTRFS_I(inode)->outstanding_extents - 4997 BTRFS_I(inode)->reserved_extents; 4998 4999 /* 5000 * Add an item to reserve for updating the inode when we complete the 5001 * delalloc io. 
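 * DELALLOC_META_RESERVED tracks whether that inode-update item is already
 * reserved; if it is not, we count one more extent here and set the bit once
 * the reservation succeeds.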
5002 */ 5003 if (!test_bit(BTRFS_INODE_DELALLOC_META_RESERVED, 5004 &BTRFS_I(inode)->runtime_flags)) { 5005 nr_extents++; 5006 extra_reserve = 1; 5007 } 5008 5009 to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents); 5010 to_reserve += calc_csum_metadata_size(inode, num_bytes, 1); 5011 csum_bytes = BTRFS_I(inode)->csum_bytes; 5012 spin_unlock(&BTRFS_I(inode)->lock); 5013 5014 if (root->fs_info->quota_enabled) { 5015 ret = btrfs_qgroup_reserve(root, num_bytes + 5016 nr_extents * root->leafsize); 5017 if (ret) 5018 goto out_fail; 5019 } 5020 5021 ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush); 5022 if (unlikely(ret)) { 5023 if (root->fs_info->quota_enabled) 5024 btrfs_qgroup_free(root, num_bytes + 5025 nr_extents * root->leafsize); 5026 goto out_fail; 5027 } 5028 5029 spin_lock(&BTRFS_I(inode)->lock); 5030 if (extra_reserve) { 5031 set_bit(BTRFS_INODE_DELALLOC_META_RESERVED, 5032 &BTRFS_I(inode)->runtime_flags); 5033 nr_extents--; 5034 } 5035 BTRFS_I(inode)->reserved_extents += nr_extents; 5036 spin_unlock(&BTRFS_I(inode)->lock); 5037 5038 if (delalloc_lock) 5039 mutex_unlock(&BTRFS_I(inode)->delalloc_mutex); 5040 5041 if (to_reserve) 5042 trace_btrfs_space_reservation(root->fs_info, "delalloc", 5043 btrfs_ino(inode), to_reserve, 1); 5044 block_rsv_add_bytes(block_rsv, to_reserve, 1); 5045 5046 return 0; 5047 5048 out_fail: 5049 spin_lock(&BTRFS_I(inode)->lock); 5050 dropped = drop_outstanding_extent(inode); 5051 /* 5052 * If the inodes csum_bytes is the same as the original 5053 * csum_bytes then we know we haven't raced with any free()ers 5054 * so we can just reduce our inodes csum bytes and carry on. 5055 */ 5056 if (BTRFS_I(inode)->csum_bytes == csum_bytes) { 5057 calc_csum_metadata_size(inode, num_bytes, 0); 5058 } else { 5059 u64 orig_csum_bytes = BTRFS_I(inode)->csum_bytes; 5060 u64 bytes; 5061 5062 /* 5063 * This is tricky, but first we need to figure out how much we 5064 * free'd from any free-ers that occured during this 5065 * reservation, so we reset ->csum_bytes to the csum_bytes 5066 * before we dropped our lock, and then call the free for the 5067 * number of bytes that were freed while we were trying our 5068 * reservation. 5069 */ 5070 bytes = csum_bytes - BTRFS_I(inode)->csum_bytes; 5071 BTRFS_I(inode)->csum_bytes = csum_bytes; 5072 to_free = calc_csum_metadata_size(inode, bytes, 0); 5073 5074 5075 /* 5076 * Now we need to see how much we would have freed had we not 5077 * been making this reservation and our ->csum_bytes were not 5078 * artificially inflated. 5079 */ 5080 BTRFS_I(inode)->csum_bytes = csum_bytes - num_bytes; 5081 bytes = csum_bytes - orig_csum_bytes; 5082 bytes = calc_csum_metadata_size(inode, bytes, 0); 5083 5084 /* 5085 * Now reset ->csum_bytes to what it should be. If bytes is 5086 * more than to_free then we would have free'd more space had we 5087 * not had an artificially high ->csum_bytes, so we need to free 5088 * the remainder. If bytes is the same or less then we don't 5089 * need to do anything, the other free-ers did the correct 5090 * thing. 
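 * Either way csum_bytes ends up at orig_csum_bytes - num_bytes, i.e. as if
 * our failed reservation had never been added.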
5091 */ 5092 BTRFS_I(inode)->csum_bytes = orig_csum_bytes - num_bytes; 5093 if (bytes > to_free) 5094 to_free = bytes - to_free; 5095 else 5096 to_free = 0; 5097 } 5098 spin_unlock(&BTRFS_I(inode)->lock); 5099 if (dropped) 5100 to_free += btrfs_calc_trans_metadata_size(root, dropped); 5101 5102 if (to_free) { 5103 btrfs_block_rsv_release(root, block_rsv, to_free); 5104 trace_btrfs_space_reservation(root->fs_info, "delalloc", 5105 btrfs_ino(inode), to_free, 0); 5106 } 5107 if (delalloc_lock) 5108 mutex_unlock(&BTRFS_I(inode)->delalloc_mutex); 5109 return ret; 5110 } 5111 5112 /** 5113 * btrfs_delalloc_release_metadata - release a metadata reservation for an inode 5114 * @inode: the inode to release the reservation for 5115 * @num_bytes: the number of bytes we're releasing 5116 * 5117 * This will release the metadata reservation for an inode. This can be called 5118 * once we complete IO for a given set of bytes to release their metadata 5119 * reservations. 5120 */ 5121 void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes) 5122 { 5123 struct btrfs_root *root = BTRFS_I(inode)->root; 5124 u64 to_free = 0; 5125 unsigned dropped; 5126 5127 num_bytes = ALIGN(num_bytes, root->sectorsize); 5128 spin_lock(&BTRFS_I(inode)->lock); 5129 dropped = drop_outstanding_extent(inode); 5130 5131 if (num_bytes) 5132 to_free = calc_csum_metadata_size(inode, num_bytes, 0); 5133 spin_unlock(&BTRFS_I(inode)->lock); 5134 if (dropped > 0) 5135 to_free += btrfs_calc_trans_metadata_size(root, dropped); 5136 5137 trace_btrfs_space_reservation(root->fs_info, "delalloc", 5138 btrfs_ino(inode), to_free, 0); 5139 if (root->fs_info->quota_enabled) { 5140 btrfs_qgroup_free(root, num_bytes + 5141 dropped * root->leafsize); 5142 } 5143 5144 btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv, 5145 to_free); 5146 } 5147 5148 /** 5149 * btrfs_delalloc_reserve_space - reserve data and metadata space for delalloc 5150 * @inode: inode we're writing to 5151 * @num_bytes: the number of bytes we want to allocate 5152 * 5153 * This will do the following things 5154 * 5155 * o reserve space in the data space info for num_bytes 5156 * o reserve space in the metadata space info based on number of outstanding 5157 * extents and how much csums will be needed 5158 * o add to the inodes ->delalloc_bytes 5159 * o add it to the fs_info's delalloc inodes list. 5160 * 5161 * This will return 0 for success and -ENOSPC if there is no space left. 5162 */ 5163 int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes) 5164 { 5165 int ret; 5166 5167 ret = btrfs_check_data_free_space(inode, num_bytes); 5168 if (ret) 5169 return ret; 5170 5171 ret = btrfs_delalloc_reserve_metadata(inode, num_bytes); 5172 if (ret) { 5173 btrfs_free_reserved_data_space(inode, num_bytes); 5174 return ret; 5175 } 5176 5177 return 0; 5178 } 5179 5180 /** 5181 * btrfs_delalloc_release_space - release data and metadata space for delalloc 5182 * @inode: inode we're releasing space for 5183 * @num_bytes: the number of bytes we want to free up 5184 * 5185 * This must be matched with a call to btrfs_delalloc_reserve_space. This is 5186 * called in the case that we don't need the metadata AND data reservations 5187 * anymore. So if there is an error or we insert an inline extent. 5188 * 5189 * This function will release the metadata space that was not used and will 5190 * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes 5191 * list if there are no delalloc bytes left. 
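 *
 * A minimal usage sketch (illustrative only, not taken from a real caller):
 * a write path that reserved space up front would undo it on failure with
 *
 *	ret = btrfs_delalloc_reserve_space(inode, len);
 *	if (ret)
 *		return ret;
 *	ret = do_the_write(inode, len);		// hypothetical helper
 *	if (ret)
 *		btrfs_delalloc_release_space(inode, len);
 *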
5192 */ 5193 void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes) 5194 { 5195 btrfs_delalloc_release_metadata(inode, num_bytes); 5196 btrfs_free_reserved_data_space(inode, num_bytes); 5197 } 5198 5199 static int update_block_group(struct btrfs_root *root, 5200 u64 bytenr, u64 num_bytes, int alloc) 5201 { 5202 struct btrfs_block_group_cache *cache = NULL; 5203 struct btrfs_fs_info *info = root->fs_info; 5204 u64 total = num_bytes; 5205 u64 old_val; 5206 u64 byte_in_group; 5207 int factor; 5208 5209 /* block accounting for super block */ 5210 spin_lock(&info->delalloc_root_lock); 5211 old_val = btrfs_super_bytes_used(info->super_copy); 5212 if (alloc) 5213 old_val += num_bytes; 5214 else 5215 old_val -= num_bytes; 5216 btrfs_set_super_bytes_used(info->super_copy, old_val); 5217 spin_unlock(&info->delalloc_root_lock); 5218 5219 while (total) { 5220 cache = btrfs_lookup_block_group(info, bytenr); 5221 if (!cache) 5222 return -ENOENT; 5223 if (cache->flags & (BTRFS_BLOCK_GROUP_DUP | 5224 BTRFS_BLOCK_GROUP_RAID1 | 5225 BTRFS_BLOCK_GROUP_RAID10)) 5226 factor = 2; 5227 else 5228 factor = 1; 5229 /* 5230 * If this block group has free space cache written out, we 5231 * need to make sure to load it if we are removing space. This 5232 * is because we need the unpinning stage to actually add the 5233 * space back to the block group, otherwise we will leak space. 5234 */ 5235 if (!alloc && cache->cached == BTRFS_CACHE_NO) 5236 cache_block_group(cache, 1); 5237 5238 byte_in_group = bytenr - cache->key.objectid; 5239 WARN_ON(byte_in_group > cache->key.offset); 5240 5241 spin_lock(&cache->space_info->lock); 5242 spin_lock(&cache->lock); 5243 5244 if (btrfs_test_opt(root, SPACE_CACHE) && 5245 cache->disk_cache_state < BTRFS_DC_CLEAR) 5246 cache->disk_cache_state = BTRFS_DC_CLEAR; 5247 5248 cache->dirty = 1; 5249 old_val = btrfs_block_group_used(&cache->item); 5250 num_bytes = min(total, cache->key.offset - byte_in_group); 5251 if (alloc) { 5252 old_val += num_bytes; 5253 btrfs_set_block_group_used(&cache->item, old_val); 5254 cache->reserved -= num_bytes; 5255 cache->space_info->bytes_reserved -= num_bytes; 5256 cache->space_info->bytes_used += num_bytes; 5257 cache->space_info->disk_used += num_bytes * factor; 5258 spin_unlock(&cache->lock); 5259 spin_unlock(&cache->space_info->lock); 5260 } else { 5261 old_val -= num_bytes; 5262 btrfs_set_block_group_used(&cache->item, old_val); 5263 cache->pinned += num_bytes; 5264 cache->space_info->bytes_pinned += num_bytes; 5265 cache->space_info->bytes_used -= num_bytes; 5266 cache->space_info->disk_used -= num_bytes * factor; 5267 spin_unlock(&cache->lock); 5268 spin_unlock(&cache->space_info->lock); 5269 5270 set_extent_dirty(info->pinned_extents, 5271 bytenr, bytenr + num_bytes - 1, 5272 GFP_NOFS | __GFP_NOFAIL); 5273 } 5274 btrfs_put_block_group(cache); 5275 total -= num_bytes; 5276 bytenr += num_bytes; 5277 } 5278 return 0; 5279 } 5280 5281 static u64 first_logical_byte(struct btrfs_root *root, u64 search_start) 5282 { 5283 struct btrfs_block_group_cache *cache; 5284 u64 bytenr; 5285 5286 spin_lock(&root->fs_info->block_group_cache_lock); 5287 bytenr = root->fs_info->first_logical_byte; 5288 spin_unlock(&root->fs_info->block_group_cache_lock); 5289 5290 if (bytenr < (u64)-1) 5291 return bytenr; 5292 5293 cache = btrfs_lookup_first_block_group(root->fs_info, search_start); 5294 if (!cache) 5295 return 0; 5296 5297 bytenr = cache->key.objectid; 5298 btrfs_put_block_group(cache); 5299 5300 return bytenr; 5301 } 5302 5303 static int 
pin_down_extent(struct btrfs_root *root, 5304 struct btrfs_block_group_cache *cache, 5305 u64 bytenr, u64 num_bytes, int reserved) 5306 { 5307 spin_lock(&cache->space_info->lock); 5308 spin_lock(&cache->lock); 5309 cache->pinned += num_bytes; 5310 cache->space_info->bytes_pinned += num_bytes; 5311 if (reserved) { 5312 cache->reserved -= num_bytes; 5313 cache->space_info->bytes_reserved -= num_bytes; 5314 } 5315 spin_unlock(&cache->lock); 5316 spin_unlock(&cache->space_info->lock); 5317 5318 set_extent_dirty(root->fs_info->pinned_extents, bytenr, 5319 bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL); 5320 if (reserved) 5321 trace_btrfs_reserved_extent_free(root, bytenr, num_bytes); 5322 return 0; 5323 } 5324 5325 /* 5326 * this function must be called within transaction 5327 */ 5328 int btrfs_pin_extent(struct btrfs_root *root, 5329 u64 bytenr, u64 num_bytes, int reserved) 5330 { 5331 struct btrfs_block_group_cache *cache; 5332 5333 cache = btrfs_lookup_block_group(root->fs_info, bytenr); 5334 BUG_ON(!cache); /* Logic error */ 5335 5336 pin_down_extent(root, cache, bytenr, num_bytes, reserved); 5337 5338 btrfs_put_block_group(cache); 5339 return 0; 5340 } 5341 5342 /* 5343 * this function must be called within transaction 5344 */ 5345 int btrfs_pin_extent_for_log_replay(struct btrfs_root *root, 5346 u64 bytenr, u64 num_bytes) 5347 { 5348 struct btrfs_block_group_cache *cache; 5349 int ret; 5350 5351 cache = btrfs_lookup_block_group(root->fs_info, bytenr); 5352 if (!cache) 5353 return -EINVAL; 5354 5355 /* 5356 * pull in the free space cache (if any) so that our pin 5357 * removes the free space from the cache. We have load_only set 5358 * to one because the slow code to read in the free extents does check 5359 * the pinned extents. 5360 */ 5361 cache_block_group(cache, 1); 5362 5363 pin_down_extent(root, cache, bytenr, num_bytes, 0); 5364 5365 /* remove us from the free space cache (if we're there at all) */ 5366 ret = btrfs_remove_free_space(cache, bytenr, num_bytes); 5367 btrfs_put_block_group(cache); 5368 return ret; 5369 } 5370 5371 static int __exclude_logged_extent(struct btrfs_root *root, u64 start, u64 num_bytes) 5372 { 5373 int ret; 5374 struct btrfs_block_group_cache *block_group; 5375 struct btrfs_caching_control *caching_ctl; 5376 5377 block_group = btrfs_lookup_block_group(root->fs_info, start); 5378 if (!block_group) 5379 return -EINVAL; 5380 5381 cache_block_group(block_group, 0); 5382 caching_ctl = get_caching_control(block_group); 5383 5384 if (!caching_ctl) { 5385 /* Logic error */ 5386 BUG_ON(!block_group_cache_done(block_group)); 5387 ret = btrfs_remove_free_space(block_group, start, num_bytes); 5388 } else { 5389 mutex_lock(&caching_ctl->mutex); 5390 5391 if (start >= caching_ctl->progress) { 5392 ret = add_excluded_extent(root, start, num_bytes); 5393 } else if (start + num_bytes <= caching_ctl->progress) { 5394 ret = btrfs_remove_free_space(block_group, 5395 start, num_bytes); 5396 } else { 5397 num_bytes = caching_ctl->progress - start; 5398 ret = btrfs_remove_free_space(block_group, 5399 start, num_bytes); 5400 if (ret) 5401 goto out_lock; 5402 5403 num_bytes = (start + num_bytes) - 5404 caching_ctl->progress; 5405 start = caching_ctl->progress; 5406 ret = add_excluded_extent(root, start, num_bytes); 5407 } 5408 out_lock: 5409 mutex_unlock(&caching_ctl->mutex); 5410 put_caching_control(caching_ctl); 5411 } 5412 btrfs_put_block_group(block_group); 5413 return ret; 5414 } 5415 5416 int btrfs_exclude_logged_extents(struct btrfs_root *log, 5417 struct extent_buffer *eb) 
5418 { 5419 struct btrfs_file_extent_item *item; 5420 struct btrfs_key key; 5421 int found_type; 5422 int i; 5423 5424 if (!btrfs_fs_incompat(log->fs_info, MIXED_GROUPS)) 5425 return 0; 5426 5427 for (i = 0; i < btrfs_header_nritems(eb); i++) { 5428 btrfs_item_key_to_cpu(eb, &key, i); 5429 if (key.type != BTRFS_EXTENT_DATA_KEY) 5430 continue; 5431 item = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item); 5432 found_type = btrfs_file_extent_type(eb, item); 5433 if (found_type == BTRFS_FILE_EXTENT_INLINE) 5434 continue; 5435 if (btrfs_file_extent_disk_bytenr(eb, item) == 0) 5436 continue; 5437 key.objectid = btrfs_file_extent_disk_bytenr(eb, item); 5438 key.offset = btrfs_file_extent_disk_num_bytes(eb, item); 5439 __exclude_logged_extent(log, key.objectid, key.offset); 5440 } 5441 5442 return 0; 5443 } 5444 5445 /** 5446 * btrfs_update_reserved_bytes - update the block_group and space info counters 5447 * @cache: The cache we are manipulating 5448 * @num_bytes: The number of bytes in question 5449 * @reserve: One of the reservation enums 5450 * 5451 * This is called by the allocator when it reserves space, or by somebody who is 5452 * freeing space that was never actually used on disk. For example if you 5453 * reserve some space for a new leaf in transaction A and before transaction A 5454 * commits you free that leaf, you call this with reserve set to 0 in order to 5455 * clear the reservation. 5456 * 5457 * Metadata reservations should be called with RESERVE_ALLOC so we do the proper 5458 * ENOSPC accounting. For data we handle the reservation through clearing the 5459 * delalloc bits in the io_tree. We have to do this since we could end up 5460 * allocating less disk space for the amount of data we have reserved in the 5461 * case of compression. 5462 * 5463 * If this is a reservation and the block group has become read only we cannot 5464 * make the reservation and return -EAGAIN, otherwise this function always 5465 * succeeds. 
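 *
 * A RESERVE_ALLOC made here is normally balanced later: either
 * update_block_group() moves the bytes from reserved to used once the extent
 * is allocated for real, or a RESERVE_FREE call clears the reservation if it
 * is never used.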
5466 */ 5467 static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, 5468 u64 num_bytes, int reserve) 5469 { 5470 struct btrfs_space_info *space_info = cache->space_info; 5471 int ret = 0; 5472 5473 spin_lock(&space_info->lock); 5474 spin_lock(&cache->lock); 5475 if (reserve != RESERVE_FREE) { 5476 if (cache->ro) { 5477 ret = -EAGAIN; 5478 } else { 5479 cache->reserved += num_bytes; 5480 space_info->bytes_reserved += num_bytes; 5481 if (reserve == RESERVE_ALLOC) { 5482 trace_btrfs_space_reservation(cache->fs_info, 5483 "space_info", space_info->flags, 5484 num_bytes, 0); 5485 space_info->bytes_may_use -= num_bytes; 5486 } 5487 } 5488 } else { 5489 if (cache->ro) 5490 space_info->bytes_readonly += num_bytes; 5491 cache->reserved -= num_bytes; 5492 space_info->bytes_reserved -= num_bytes; 5493 } 5494 spin_unlock(&cache->lock); 5495 spin_unlock(&space_info->lock); 5496 return ret; 5497 } 5498 5499 void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, 5500 struct btrfs_root *root) 5501 { 5502 struct btrfs_fs_info *fs_info = root->fs_info; 5503 struct btrfs_caching_control *next; 5504 struct btrfs_caching_control *caching_ctl; 5505 struct btrfs_block_group_cache *cache; 5506 struct btrfs_space_info *space_info; 5507 5508 down_write(&fs_info->extent_commit_sem); 5509 5510 list_for_each_entry_safe(caching_ctl, next, 5511 &fs_info->caching_block_groups, list) { 5512 cache = caching_ctl->block_group; 5513 if (block_group_cache_done(cache)) { 5514 cache->last_byte_to_unpin = (u64)-1; 5515 list_del_init(&caching_ctl->list); 5516 put_caching_control(caching_ctl); 5517 } else { 5518 cache->last_byte_to_unpin = caching_ctl->progress; 5519 } 5520 } 5521 5522 if (fs_info->pinned_extents == &fs_info->freed_extents[0]) 5523 fs_info->pinned_extents = &fs_info->freed_extents[1]; 5524 else 5525 fs_info->pinned_extents = &fs_info->freed_extents[0]; 5526 5527 up_write(&fs_info->extent_commit_sem); 5528 5529 list_for_each_entry_rcu(space_info, &fs_info->space_info, list) 5530 percpu_counter_set(&space_info->total_bytes_pinned, 0); 5531 5532 update_global_block_rsv(fs_info); 5533 } 5534 5535 static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end) 5536 { 5537 struct btrfs_fs_info *fs_info = root->fs_info; 5538 struct btrfs_block_group_cache *cache = NULL; 5539 struct btrfs_space_info *space_info; 5540 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; 5541 u64 len; 5542 bool readonly; 5543 5544 while (start <= end) { 5545 readonly = false; 5546 if (!cache || 5547 start >= cache->key.objectid + cache->key.offset) { 5548 if (cache) 5549 btrfs_put_block_group(cache); 5550 cache = btrfs_lookup_block_group(fs_info, start); 5551 BUG_ON(!cache); /* Logic error */ 5552 } 5553 5554 len = cache->key.objectid + cache->key.offset - start; 5555 len = min(len, end + 1 - start); 5556 5557 if (start < cache->last_byte_to_unpin) { 5558 len = min(len, cache->last_byte_to_unpin - start); 5559 btrfs_add_free_space(cache, start, len); 5560 } 5561 5562 start += len; 5563 space_info = cache->space_info; 5564 5565 spin_lock(&space_info->lock); 5566 spin_lock(&cache->lock); 5567 cache->pinned -= len; 5568 space_info->bytes_pinned -= len; 5569 if (cache->ro) { 5570 space_info->bytes_readonly += len; 5571 readonly = true; 5572 } 5573 spin_unlock(&cache->lock); 5574 if (!readonly && global_rsv->space_info == space_info) { 5575 spin_lock(&global_rsv->lock); 5576 if (!global_rsv->full) { 5577 len = min(len, global_rsv->size - 5578 global_rsv->reserved); 5579 
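			/*
			 * len is now clamped to what the global rsv still
			 * needs; give it that much of the unpinned space and
			 * count it as may_use again.
			 */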
global_rsv->reserved += len; 5580 space_info->bytes_may_use += len; 5581 if (global_rsv->reserved >= global_rsv->size) 5582 global_rsv->full = 1; 5583 } 5584 spin_unlock(&global_rsv->lock); 5585 } 5586 spin_unlock(&space_info->lock); 5587 } 5588 5589 if (cache) 5590 btrfs_put_block_group(cache); 5591 return 0; 5592 } 5593 5594 int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, 5595 struct btrfs_root *root) 5596 { 5597 struct btrfs_fs_info *fs_info = root->fs_info; 5598 struct extent_io_tree *unpin; 5599 u64 start; 5600 u64 end; 5601 int ret; 5602 5603 if (trans->aborted) 5604 return 0; 5605 5606 if (fs_info->pinned_extents == &fs_info->freed_extents[0]) 5607 unpin = &fs_info->freed_extents[1]; 5608 else 5609 unpin = &fs_info->freed_extents[0]; 5610 5611 while (1) { 5612 ret = find_first_extent_bit(unpin, 0, &start, &end, 5613 EXTENT_DIRTY, NULL); 5614 if (ret) 5615 break; 5616 5617 if (btrfs_test_opt(root, DISCARD)) 5618 ret = btrfs_discard_extent(root, start, 5619 end + 1 - start, NULL); 5620 5621 clear_extent_dirty(unpin, start, end, GFP_NOFS); 5622 unpin_extent_range(root, start, end); 5623 cond_resched(); 5624 } 5625 5626 return 0; 5627 } 5628 5629 static void add_pinned_bytes(struct btrfs_fs_info *fs_info, u64 num_bytes, 5630 u64 owner, u64 root_objectid) 5631 { 5632 struct btrfs_space_info *space_info; 5633 u64 flags; 5634 5635 if (owner < BTRFS_FIRST_FREE_OBJECTID) { 5636 if (root_objectid == BTRFS_CHUNK_TREE_OBJECTID) 5637 flags = BTRFS_BLOCK_GROUP_SYSTEM; 5638 else 5639 flags = BTRFS_BLOCK_GROUP_METADATA; 5640 } else { 5641 flags = BTRFS_BLOCK_GROUP_DATA; 5642 } 5643 5644 space_info = __find_space_info(fs_info, flags); 5645 BUG_ON(!space_info); /* Logic bug */ 5646 percpu_counter_add(&space_info->total_bytes_pinned, num_bytes); 5647 } 5648 5649 5650 static int __btrfs_free_extent(struct btrfs_trans_handle *trans, 5651 struct btrfs_root *root, 5652 u64 bytenr, u64 num_bytes, u64 parent, 5653 u64 root_objectid, u64 owner_objectid, 5654 u64 owner_offset, int refs_to_drop, 5655 struct btrfs_delayed_extent_op *extent_op) 5656 { 5657 struct btrfs_key key; 5658 struct btrfs_path *path; 5659 struct btrfs_fs_info *info = root->fs_info; 5660 struct btrfs_root *extent_root = info->extent_root; 5661 struct extent_buffer *leaf; 5662 struct btrfs_extent_item *ei; 5663 struct btrfs_extent_inline_ref *iref; 5664 int ret; 5665 int is_data; 5666 int extent_slot = 0; 5667 int found_extent = 0; 5668 int num_to_del = 1; 5669 u32 item_size; 5670 u64 refs; 5671 bool skinny_metadata = btrfs_fs_incompat(root->fs_info, 5672 SKINNY_METADATA); 5673 5674 path = btrfs_alloc_path(); 5675 if (!path) 5676 return -ENOMEM; 5677 5678 path->reada = 1; 5679 path->leave_spinning = 1; 5680 5681 is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID; 5682 BUG_ON(!is_data && refs_to_drop != 1); 5683 5684 if (is_data) 5685 skinny_metadata = 0; 5686 5687 ret = lookup_extent_backref(trans, extent_root, path, &iref, 5688 bytenr, num_bytes, parent, 5689 root_objectid, owner_objectid, 5690 owner_offset); 5691 if (ret == 0) { 5692 extent_slot = path->slots[0]; 5693 while (extent_slot >= 0) { 5694 btrfs_item_key_to_cpu(path->nodes[0], &key, 5695 extent_slot); 5696 if (key.objectid != bytenr) 5697 break; 5698 if (key.type == BTRFS_EXTENT_ITEM_KEY && 5699 key.offset == num_bytes) { 5700 found_extent = 1; 5701 break; 5702 } 5703 if (key.type == BTRFS_METADATA_ITEM_KEY && 5704 key.offset == owner_objectid) { 5705 found_extent = 1; 5706 break; 5707 } 5708 if (path->slots[0] - extent_slot > 5) 5709 break; 5710 
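			/*
			 * keep stepping back (at most a few slots) toward the
			 * extent item that precedes our backref
			 */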
extent_slot--; 5711 } 5712 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 5713 item_size = btrfs_item_size_nr(path->nodes[0], extent_slot); 5714 if (found_extent && item_size < sizeof(*ei)) 5715 found_extent = 0; 5716 #endif 5717 if (!found_extent) { 5718 BUG_ON(iref); 5719 ret = remove_extent_backref(trans, extent_root, path, 5720 NULL, refs_to_drop, 5721 is_data); 5722 if (ret) { 5723 btrfs_abort_transaction(trans, extent_root, ret); 5724 goto out; 5725 } 5726 btrfs_release_path(path); 5727 path->leave_spinning = 1; 5728 5729 key.objectid = bytenr; 5730 key.type = BTRFS_EXTENT_ITEM_KEY; 5731 key.offset = num_bytes; 5732 5733 if (!is_data && skinny_metadata) { 5734 key.type = BTRFS_METADATA_ITEM_KEY; 5735 key.offset = owner_objectid; 5736 } 5737 5738 ret = btrfs_search_slot(trans, extent_root, 5739 &key, path, -1, 1); 5740 if (ret > 0 && skinny_metadata && path->slots[0]) { 5741 /* 5742 * Couldn't find our skinny metadata item, 5743 * see if we have ye olde extent item. 5744 */ 5745 path->slots[0]--; 5746 btrfs_item_key_to_cpu(path->nodes[0], &key, 5747 path->slots[0]); 5748 if (key.objectid == bytenr && 5749 key.type == BTRFS_EXTENT_ITEM_KEY && 5750 key.offset == num_bytes) 5751 ret = 0; 5752 } 5753 5754 if (ret > 0 && skinny_metadata) { 5755 skinny_metadata = false; 5756 key.type = BTRFS_EXTENT_ITEM_KEY; 5757 key.offset = num_bytes; 5758 btrfs_release_path(path); 5759 ret = btrfs_search_slot(trans, extent_root, 5760 &key, path, -1, 1); 5761 } 5762 5763 if (ret) { 5764 btrfs_err(info, "umm, got %d back from search, was looking for %llu", 5765 ret, bytenr); 5766 if (ret > 0) 5767 btrfs_print_leaf(extent_root, 5768 path->nodes[0]); 5769 } 5770 if (ret < 0) { 5771 btrfs_abort_transaction(trans, extent_root, ret); 5772 goto out; 5773 } 5774 extent_slot = path->slots[0]; 5775 } 5776 } else if (WARN_ON(ret == -ENOENT)) { 5777 btrfs_print_leaf(extent_root, path->nodes[0]); 5778 btrfs_err(info, 5779 "unable to find ref byte nr %llu parent %llu root %llu owner %llu offset %llu", 5780 bytenr, parent, root_objectid, owner_objectid, 5781 owner_offset); 5782 } else { 5783 btrfs_abort_transaction(trans, extent_root, ret); 5784 goto out; 5785 } 5786 5787 leaf = path->nodes[0]; 5788 item_size = btrfs_item_size_nr(leaf, extent_slot); 5789 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 5790 if (item_size < sizeof(*ei)) { 5791 BUG_ON(found_extent || extent_slot != path->slots[0]); 5792 ret = convert_extent_item_v0(trans, extent_root, path, 5793 owner_objectid, 0); 5794 if (ret < 0) { 5795 btrfs_abort_transaction(trans, extent_root, ret); 5796 goto out; 5797 } 5798 5799 btrfs_release_path(path); 5800 path->leave_spinning = 1; 5801 5802 key.objectid = bytenr; 5803 key.type = BTRFS_EXTENT_ITEM_KEY; 5804 key.offset = num_bytes; 5805 5806 ret = btrfs_search_slot(trans, extent_root, &key, path, 5807 -1, 1); 5808 if (ret) { 5809 btrfs_err(info, "umm, got %d back from search, was looking for %llu", 5810 ret, bytenr); 5811 btrfs_print_leaf(extent_root, path->nodes[0]); 5812 } 5813 if (ret < 0) { 5814 btrfs_abort_transaction(trans, extent_root, ret); 5815 goto out; 5816 } 5817 5818 extent_slot = path->slots[0]; 5819 leaf = path->nodes[0]; 5820 item_size = btrfs_item_size_nr(leaf, extent_slot); 5821 } 5822 #endif 5823 BUG_ON(item_size < sizeof(*ei)); 5824 ei = btrfs_item_ptr(leaf, extent_slot, 5825 struct btrfs_extent_item); 5826 if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID && 5827 key.type == BTRFS_EXTENT_ITEM_KEY) { 5828 struct btrfs_tree_block_info *bi; 5829 BUG_ON(item_size < sizeof(*ei) + sizeof(*bi)); 5830 bi = (struct 
btrfs_tree_block_info *)(ei + 1); 5831 WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi)); 5832 } 5833 5834 refs = btrfs_extent_refs(leaf, ei); 5835 if (refs < refs_to_drop) { 5836 btrfs_err(info, "trying to drop %d refs but we only have %Lu " 5837 "for bytenr %Lu\n", refs_to_drop, refs, bytenr); 5838 ret = -EINVAL; 5839 btrfs_abort_transaction(trans, extent_root, ret); 5840 goto out; 5841 } 5842 refs -= refs_to_drop; 5843 5844 if (refs > 0) { 5845 if (extent_op) 5846 __run_delayed_extent_op(extent_op, leaf, ei); 5847 /* 5848 * In the case of inline back ref, reference count will 5849 * be updated by remove_extent_backref 5850 */ 5851 if (iref) { 5852 BUG_ON(!found_extent); 5853 } else { 5854 btrfs_set_extent_refs(leaf, ei, refs); 5855 btrfs_mark_buffer_dirty(leaf); 5856 } 5857 if (found_extent) { 5858 ret = remove_extent_backref(trans, extent_root, path, 5859 iref, refs_to_drop, 5860 is_data); 5861 if (ret) { 5862 btrfs_abort_transaction(trans, extent_root, ret); 5863 goto out; 5864 } 5865 } 5866 add_pinned_bytes(root->fs_info, -num_bytes, owner_objectid, 5867 root_objectid); 5868 } else { 5869 if (found_extent) { 5870 BUG_ON(is_data && refs_to_drop != 5871 extent_data_ref_count(root, path, iref)); 5872 if (iref) { 5873 BUG_ON(path->slots[0] != extent_slot); 5874 } else { 5875 BUG_ON(path->slots[0] != extent_slot + 1); 5876 path->slots[0] = extent_slot; 5877 num_to_del = 2; 5878 } 5879 } 5880 5881 ret = btrfs_del_items(trans, extent_root, path, path->slots[0], 5882 num_to_del); 5883 if (ret) { 5884 btrfs_abort_transaction(trans, extent_root, ret); 5885 goto out; 5886 } 5887 btrfs_release_path(path); 5888 5889 if (is_data) { 5890 ret = btrfs_del_csums(trans, root, bytenr, num_bytes); 5891 if (ret) { 5892 btrfs_abort_transaction(trans, extent_root, ret); 5893 goto out; 5894 } 5895 } 5896 5897 ret = update_block_group(root, bytenr, num_bytes, 0); 5898 if (ret) { 5899 btrfs_abort_transaction(trans, extent_root, ret); 5900 goto out; 5901 } 5902 } 5903 out: 5904 btrfs_free_path(path); 5905 return ret; 5906 } 5907 5908 /* 5909 * when we free an block, it is possible (and likely) that we free the last 5910 * delayed ref for that extent as well. This searches the delayed ref tree for 5911 * a given extent, and if there are no other delayed refs to be processed, it 5912 * removes it from the tree. 5913 */ 5914 static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, 5915 struct btrfs_root *root, u64 bytenr) 5916 { 5917 struct btrfs_delayed_ref_head *head; 5918 struct btrfs_delayed_ref_root *delayed_refs; 5919 struct btrfs_delayed_ref_node *ref; 5920 struct rb_node *node; 5921 int ret = 0; 5922 5923 delayed_refs = &trans->transaction->delayed_refs; 5924 spin_lock(&delayed_refs->lock); 5925 head = btrfs_find_delayed_ref_head(trans, bytenr); 5926 if (!head) 5927 goto out; 5928 5929 node = rb_prev(&head->node.rb_node); 5930 if (!node) 5931 goto out; 5932 5933 ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); 5934 5935 /* there are still entries for this ref, we can't drop it */ 5936 if (ref->bytenr == bytenr) 5937 goto out; 5938 5939 if (head->extent_op) { 5940 if (!head->must_insert_reserved) 5941 goto out; 5942 btrfs_free_delayed_extent_op(head->extent_op); 5943 head->extent_op = NULL; 5944 } 5945 5946 /* 5947 * waiting for the lock here would deadlock. 
If someone else has it 5948 * locked they are already in the process of dropping it anyway 5949 */ 5950 if (!mutex_trylock(&head->mutex)) 5951 goto out; 5952 5953 /* 5954 * at this point we have a head with no other entries. Go 5955 * ahead and process it. 5956 */ 5957 head->node.in_tree = 0; 5958 rb_erase(&head->node.rb_node, &delayed_refs->root); 5959 5960 delayed_refs->num_entries--; 5961 5962 /* 5963 * we don't take a ref on the node because we're removing it from the 5964 * tree, so we just steal the ref the tree was holding. 5965 */ 5966 delayed_refs->num_heads--; 5967 if (list_empty(&head->cluster)) 5968 delayed_refs->num_heads_ready--; 5969 5970 list_del_init(&head->cluster); 5971 spin_unlock(&delayed_refs->lock); 5972 5973 BUG_ON(head->extent_op); 5974 if (head->must_insert_reserved) 5975 ret = 1; 5976 5977 mutex_unlock(&head->mutex); 5978 btrfs_put_delayed_ref(&head->node); 5979 return ret; 5980 out: 5981 spin_unlock(&delayed_refs->lock); 5982 return 0; 5983 } 5984 5985 void btrfs_free_tree_block(struct btrfs_trans_handle *trans, 5986 struct btrfs_root *root, 5987 struct extent_buffer *buf, 5988 u64 parent, int last_ref) 5989 { 5990 struct btrfs_block_group_cache *cache = NULL; 5991 int pin = 1; 5992 int ret; 5993 5994 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { 5995 ret = btrfs_add_delayed_tree_ref(root->fs_info, trans, 5996 buf->start, buf->len, 5997 parent, root->root_key.objectid, 5998 btrfs_header_level(buf), 5999 BTRFS_DROP_DELAYED_REF, NULL, 0); 6000 BUG_ON(ret); /* -ENOMEM */ 6001 } 6002 6003 if (!last_ref) 6004 return; 6005 6006 cache = btrfs_lookup_block_group(root->fs_info, buf->start); 6007 6008 if (btrfs_header_generation(buf) == trans->transid) { 6009 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { 6010 ret = check_ref_cleanup(trans, root, buf->start); 6011 if (!ret) 6012 goto out; 6013 } 6014 6015 if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { 6016 pin_down_extent(root, cache, buf->start, buf->len, 1); 6017 goto out; 6018 } 6019 6020 WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)); 6021 6022 btrfs_add_free_space(cache, buf->start, buf->len); 6023 btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE); 6024 trace_btrfs_reserved_extent_free(root, buf->start, buf->len); 6025 pin = 0; 6026 } 6027 out: 6028 if (pin) 6029 add_pinned_bytes(root->fs_info, buf->len, 6030 btrfs_header_level(buf), 6031 root->root_key.objectid); 6032 6033 /* 6034 * Deleting the buffer, clear the corrupt flag since it doesn't matter 6035 * anymore. 6036 */ 6037 clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags); 6038 btrfs_put_block_group(cache); 6039 } 6040 6041 /* Can return -ENOMEM */ 6042 int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, 6043 u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, 6044 u64 owner, u64 offset, int for_cow) 6045 { 6046 int ret; 6047 struct btrfs_fs_info *fs_info = root->fs_info; 6048 6049 add_pinned_bytes(root->fs_info, num_bytes, owner, root_objectid); 6050 6051 /* 6052 * tree log blocks never actually go into the extent allocation 6053 * tree, just update pinning info and exit early. 
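* Everything else is queued as a delayed ref instead: tree blocks (owner below BTRFS_FIRST_FREE_OBJECTID) go through btrfs_add_delayed_tree_ref(), data extents through btrfs_add_delayed_data_ref().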
6054 */ 6055 if (root_objectid == BTRFS_TREE_LOG_OBJECTID) { 6056 WARN_ON(owner >= BTRFS_FIRST_FREE_OBJECTID); 6057 /* unlocks the pinned mutex */ 6058 btrfs_pin_extent(root, bytenr, num_bytes, 1); 6059 ret = 0; 6060 } else if (owner < BTRFS_FIRST_FREE_OBJECTID) { 6061 ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr, 6062 num_bytes, 6063 parent, root_objectid, (int)owner, 6064 BTRFS_DROP_DELAYED_REF, NULL, for_cow); 6065 } else { 6066 ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr, 6067 num_bytes, 6068 parent, root_objectid, owner, 6069 offset, BTRFS_DROP_DELAYED_REF, 6070 NULL, for_cow); 6071 } 6072 return ret; 6073 } 6074 6075 static u64 stripe_align(struct btrfs_root *root, 6076 struct btrfs_block_group_cache *cache, 6077 u64 val, u64 num_bytes) 6078 { 6079 u64 ret = ALIGN(val, root->stripesize); 6080 return ret; 6081 } 6082 6083 /* 6084 * when we wait for progress in the block group caching, its because 6085 * our allocation attempt failed at least once. So, we must sleep 6086 * and let some progress happen before we try again. 6087 * 6088 * This function will sleep at least once waiting for new free space to 6089 * show up, and then it will check the block group free space numbers 6090 * for our min num_bytes. Another option is to have it go ahead 6091 * and look in the rbtree for a free extent of a given size, but this 6092 * is a good start. 6093 * 6094 * Callers of this must check if cache->cached == BTRFS_CACHE_ERROR before using 6095 * any of the information in this block group. 6096 */ 6097 static noinline void 6098 wait_block_group_cache_progress(struct btrfs_block_group_cache *cache, 6099 u64 num_bytes) 6100 { 6101 struct btrfs_caching_control *caching_ctl; 6102 6103 caching_ctl = get_caching_control(cache); 6104 if (!caching_ctl) 6105 return; 6106 6107 wait_event(caching_ctl->wait, block_group_cache_done(cache) || 6108 (cache->free_space_ctl->free_space >= num_bytes)); 6109 6110 put_caching_control(caching_ctl); 6111 } 6112 6113 static noinline int 6114 wait_block_group_cache_done(struct btrfs_block_group_cache *cache) 6115 { 6116 struct btrfs_caching_control *caching_ctl; 6117 int ret = 0; 6118 6119 caching_ctl = get_caching_control(cache); 6120 if (!caching_ctl) 6121 return (cache->cached == BTRFS_CACHE_ERROR) ? -EIO : 0; 6122 6123 wait_event(caching_ctl->wait, block_group_cache_done(cache)); 6124 if (cache->cached == BTRFS_CACHE_ERROR) 6125 ret = -EIO; 6126 put_caching_control(caching_ctl); 6127 return ret; 6128 } 6129 6130 int __get_raid_index(u64 flags) 6131 { 6132 if (flags & BTRFS_BLOCK_GROUP_RAID10) 6133 return BTRFS_RAID_RAID10; 6134 else if (flags & BTRFS_BLOCK_GROUP_RAID1) 6135 return BTRFS_RAID_RAID1; 6136 else if (flags & BTRFS_BLOCK_GROUP_DUP) 6137 return BTRFS_RAID_DUP; 6138 else if (flags & BTRFS_BLOCK_GROUP_RAID0) 6139 return BTRFS_RAID_RAID0; 6140 else if (flags & BTRFS_BLOCK_GROUP_RAID5) 6141 return BTRFS_RAID_RAID5; 6142 else if (flags & BTRFS_BLOCK_GROUP_RAID6) 6143 return BTRFS_RAID_RAID6; 6144 6145 return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */ 6146 } 6147 6148 static int get_block_group_index(struct btrfs_block_group_cache *cache) 6149 { 6150 return __get_raid_index(cache->flags); 6151 } 6152 6153 enum btrfs_loop_type { 6154 LOOP_CACHING_NOWAIT = 0, 6155 LOOP_CACHING_WAIT = 1, 6156 LOOP_ALLOC_CHUNK = 2, 6157 LOOP_NO_EMPTY_SIZE = 3, 6158 }; 6159 6160 /* 6161 * walks the btree of allocated extents and find a hole of a given size. 
6162 * The key ins is changed to record the hole: 6163 * ins->objectid == start position 6164 * ins->flags = BTRFS_EXTENT_ITEM_KEY 6165 * ins->offset == the size of the hole. 6166 * Any available blocks before search_start are skipped. 6167 * 6168 * If there is no suitable free space, we will record the max size of 6169 * the free space extent currently. 6170 */ 6171 static noinline int find_free_extent(struct btrfs_root *orig_root, 6172 u64 num_bytes, u64 empty_size, 6173 u64 hint_byte, struct btrfs_key *ins, 6174 u64 flags) 6175 { 6176 int ret = 0; 6177 struct btrfs_root *root = orig_root->fs_info->extent_root; 6178 struct btrfs_free_cluster *last_ptr = NULL; 6179 struct btrfs_block_group_cache *block_group = NULL; 6180 struct btrfs_block_group_cache *used_block_group; 6181 u64 search_start = 0; 6182 u64 max_extent_size = 0; 6183 int empty_cluster = 2 * 1024 * 1024; 6184 struct btrfs_space_info *space_info; 6185 int loop = 0; 6186 int index = __get_raid_index(flags); 6187 int alloc_type = (flags & BTRFS_BLOCK_GROUP_DATA) ? 6188 RESERVE_ALLOC_NO_ACCOUNT : RESERVE_ALLOC; 6189 bool found_uncached_bg = false; 6190 bool failed_cluster_refill = false; 6191 bool failed_alloc = false; 6192 bool use_cluster = true; 6193 bool have_caching_bg = false; 6194 6195 WARN_ON(num_bytes < root->sectorsize); 6196 btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY); 6197 ins->objectid = 0; 6198 ins->offset = 0; 6199 6200 trace_find_free_extent(orig_root, num_bytes, empty_size, flags); 6201 6202 space_info = __find_space_info(root->fs_info, flags); 6203 if (!space_info) { 6204 btrfs_err(root->fs_info, "No space info for %llu", flags); 6205 return -ENOSPC; 6206 } 6207 6208 /* 6209 * If the space info is for both data and metadata it means we have a 6210 * small filesystem and we can't use the clustering stuff. 6211 */ 6212 if (btrfs_mixed_space_info(space_info)) 6213 use_cluster = false; 6214 6215 if (flags & BTRFS_BLOCK_GROUP_METADATA && use_cluster) { 6216 last_ptr = &root->fs_info->meta_alloc_cluster; 6217 if (!btrfs_test_opt(root, SSD)) 6218 empty_cluster = 64 * 1024; 6219 } 6220 6221 if ((flags & BTRFS_BLOCK_GROUP_DATA) && use_cluster && 6222 btrfs_test_opt(root, SSD)) { 6223 last_ptr = &root->fs_info->data_alloc_cluster; 6224 } 6225 6226 if (last_ptr) { 6227 spin_lock(&last_ptr->lock); 6228 if (last_ptr->block_group) 6229 hint_byte = last_ptr->window_start; 6230 spin_unlock(&last_ptr->lock); 6231 } 6232 6233 search_start = max(search_start, first_logical_byte(root, 0)); 6234 search_start = max(search_start, hint_byte); 6235 6236 if (!last_ptr) 6237 empty_cluster = 0; 6238 6239 if (search_start == hint_byte) { 6240 block_group = btrfs_lookup_block_group(root->fs_info, 6241 search_start); 6242 used_block_group = block_group; 6243 /* 6244 * we don't want to use the block group if it doesn't match our 6245 * allocation bits, or if its not cached. 6246 * 6247 * However if we are re-searching with an ideal block group 6248 * picked out then we don't care that the block group is cached. 
6249 */ 6250 if (block_group && block_group_bits(block_group, flags) && 6251 block_group->cached != BTRFS_CACHE_NO) { 6252 down_read(&space_info->groups_sem); 6253 if (list_empty(&block_group->list) || 6254 block_group->ro) { 6255 /* 6256 * someone is removing this block group, 6257 * we can't jump into the have_block_group 6258 * target because our list pointers are not 6259 * valid 6260 */ 6261 btrfs_put_block_group(block_group); 6262 up_read(&space_info->groups_sem); 6263 } else { 6264 index = get_block_group_index(block_group); 6265 goto have_block_group; 6266 } 6267 } else if (block_group) { 6268 btrfs_put_block_group(block_group); 6269 } 6270 } 6271 search: 6272 have_caching_bg = false; 6273 down_read(&space_info->groups_sem); 6274 list_for_each_entry(block_group, &space_info->block_groups[index], 6275 list) { 6276 u64 offset; 6277 int cached; 6278 6279 used_block_group = block_group; 6280 btrfs_get_block_group(block_group); 6281 search_start = block_group->key.objectid; 6282 6283 /* 6284 * this can happen if we end up cycling through all the 6285 * raid types, but we want to make sure we only allocate 6286 * for the proper type. 6287 */ 6288 if (!block_group_bits(block_group, flags)) { 6289 u64 extra = BTRFS_BLOCK_GROUP_DUP | 6290 BTRFS_BLOCK_GROUP_RAID1 | 6291 BTRFS_BLOCK_GROUP_RAID5 | 6292 BTRFS_BLOCK_GROUP_RAID6 | 6293 BTRFS_BLOCK_GROUP_RAID10; 6294 6295 /* 6296 * if they asked for extra copies and this block group 6297 * doesn't provide them, bail. This does allow us to 6298 * fill raid0 from raid1. 6299 */ 6300 if ((flags & extra) && !(block_group->flags & extra)) 6301 goto loop; 6302 } 6303 6304 have_block_group: 6305 cached = block_group_cache_done(block_group); 6306 if (unlikely(!cached)) { 6307 found_uncached_bg = true; 6308 ret = cache_block_group(block_group, 0); 6309 BUG_ON(ret < 0); 6310 ret = 0; 6311 } 6312 6313 if (unlikely(block_group->cached == BTRFS_CACHE_ERROR)) 6314 goto loop; 6315 if (unlikely(block_group->ro)) 6316 goto loop; 6317 6318 /* 6319 * Ok we want to try and use the cluster allocator, so 6320 * lets look there 6321 */ 6322 if (last_ptr) { 6323 unsigned long aligned_cluster; 6324 /* 6325 * the refill lock keeps out other 6326 * people trying to start a new cluster 6327 */ 6328 spin_lock(&last_ptr->refill_lock); 6329 used_block_group = last_ptr->block_group; 6330 if (used_block_group != block_group && 6331 (!used_block_group || 6332 used_block_group->ro || 6333 !block_group_bits(used_block_group, flags))) { 6334 used_block_group = block_group; 6335 goto refill_cluster; 6336 } 6337 6338 if (used_block_group != block_group) 6339 btrfs_get_block_group(used_block_group); 6340 6341 offset = btrfs_alloc_from_cluster(used_block_group, 6342 last_ptr, 6343 num_bytes, 6344 used_block_group->key.objectid, 6345 &max_extent_size); 6346 if (offset) { 6347 /* we have a block, we're done */ 6348 spin_unlock(&last_ptr->refill_lock); 6349 trace_btrfs_reserve_extent_cluster(root, 6350 block_group, search_start, num_bytes); 6351 goto checks; 6352 } 6353 6354 WARN_ON(last_ptr->block_group != used_block_group); 6355 if (used_block_group != block_group) { 6356 btrfs_put_block_group(used_block_group); 6357 used_block_group = block_group; 6358 } 6359 refill_cluster: 6360 BUG_ON(used_block_group != block_group); 6361 /* If we are on LOOP_NO_EMPTY_SIZE, we can't 6362 * set up a new clusters, so lets just skip it 6363 * and let the allocator find whatever block 6364 * it can find. 
If we reach this point, we 6365 * will have tried the cluster allocator 6366 * plenty of times and not have found 6367 * anything, so we are likely way too 6368 * fragmented for the clustering stuff to find 6369 * anything. 6370 * 6371 * However, if the cluster is taken from the 6372 * current block group, release the cluster 6373 * first, so that we stand a better chance of 6374 * succeeding in the unclustered 6375 * allocation. */ 6376 if (loop >= LOOP_NO_EMPTY_SIZE && 6377 last_ptr->block_group != block_group) { 6378 spin_unlock(&last_ptr->refill_lock); 6379 goto unclustered_alloc; 6380 } 6381 6382 /* 6383 * this cluster didn't work out, free it and 6384 * start over 6385 */ 6386 btrfs_return_cluster_to_free_space(NULL, last_ptr); 6387 6388 if (loop >= LOOP_NO_EMPTY_SIZE) { 6389 spin_unlock(&last_ptr->refill_lock); 6390 goto unclustered_alloc; 6391 } 6392 6393 aligned_cluster = max_t(unsigned long, 6394 empty_cluster + empty_size, 6395 block_group->full_stripe_len); 6396 6397 /* allocate a cluster in this block group */ 6398 ret = btrfs_find_space_cluster(root, block_group, 6399 last_ptr, search_start, 6400 num_bytes, 6401 aligned_cluster); 6402 if (ret == 0) { 6403 /* 6404 * now pull our allocation out of this 6405 * cluster 6406 */ 6407 offset = btrfs_alloc_from_cluster(block_group, 6408 last_ptr, 6409 num_bytes, 6410 search_start, 6411 &max_extent_size); 6412 if (offset) { 6413 /* we found one, proceed */ 6414 spin_unlock(&last_ptr->refill_lock); 6415 trace_btrfs_reserve_extent_cluster(root, 6416 block_group, search_start, 6417 num_bytes); 6418 goto checks; 6419 } 6420 } else if (!cached && loop > LOOP_CACHING_NOWAIT 6421 && !failed_cluster_refill) { 6422 spin_unlock(&last_ptr->refill_lock); 6423 6424 failed_cluster_refill = true; 6425 wait_block_group_cache_progress(block_group, 6426 num_bytes + empty_cluster + empty_size); 6427 goto have_block_group; 6428 } 6429 6430 /* 6431 * at this point we either didn't find a cluster 6432 * or we weren't able to allocate a block from our 6433 * cluster. Free the cluster we've been trying 6434 * to use, and go to the next block group 6435 */ 6436 btrfs_return_cluster_to_free_space(NULL, last_ptr); 6437 spin_unlock(&last_ptr->refill_lock); 6438 goto loop; 6439 } 6440 6441 unclustered_alloc: 6442 spin_lock(&block_group->free_space_ctl->tree_lock); 6443 if (cached && 6444 block_group->free_space_ctl->free_space < 6445 num_bytes + empty_cluster + empty_size) { 6446 if (block_group->free_space_ctl->free_space > 6447 max_extent_size) 6448 max_extent_size = 6449 block_group->free_space_ctl->free_space; 6450 spin_unlock(&block_group->free_space_ctl->tree_lock); 6451 goto loop; 6452 } 6453 spin_unlock(&block_group->free_space_ctl->tree_lock); 6454 6455 offset = btrfs_find_space_for_alloc(block_group, search_start, 6456 num_bytes, empty_size, 6457 &max_extent_size); 6458 /* 6459 * If we didn't find a chunk, and we haven't failed on this 6460 * block group before, and this block group is in the middle of 6461 * caching and we are ok with waiting, then go ahead and wait 6462 * for progress to be made, and set failed_alloc to true. 6463 * 6464 * If failed_alloc is true then we've already waited on this 6465 * block group once and should move on to the next block group. 
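* Block groups that are still caching when we give up on them also set have_caching_bg, which is what sends the search around again once loop reaches LOOP_CACHING_WAIT.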
6466 */ 6467 if (!offset && !failed_alloc && !cached && 6468 loop > LOOP_CACHING_NOWAIT) { 6469 wait_block_group_cache_progress(block_group, 6470 num_bytes + empty_size); 6471 failed_alloc = true; 6472 goto have_block_group; 6473 } else if (!offset) { 6474 if (!cached) 6475 have_caching_bg = true; 6476 goto loop; 6477 } 6478 checks: 6479 search_start = stripe_align(root, used_block_group, 6480 offset, num_bytes); 6481 6482 /* move on to the next group */ 6483 if (search_start + num_bytes > 6484 used_block_group->key.objectid + used_block_group->key.offset) { 6485 btrfs_add_free_space(used_block_group, offset, num_bytes); 6486 goto loop; 6487 } 6488 6489 if (offset < search_start) 6490 btrfs_add_free_space(used_block_group, offset, 6491 search_start - offset); 6492 BUG_ON(offset > search_start); 6493 6494 ret = btrfs_update_reserved_bytes(used_block_group, num_bytes, 6495 alloc_type); 6496 if (ret == -EAGAIN) { 6497 btrfs_add_free_space(used_block_group, offset, num_bytes); 6498 goto loop; 6499 } 6500 6501 /* we are all good, lets return */ 6502 ins->objectid = search_start; 6503 ins->offset = num_bytes; 6504 6505 trace_btrfs_reserve_extent(orig_root, block_group, 6506 search_start, num_bytes); 6507 if (used_block_group != block_group) 6508 btrfs_put_block_group(used_block_group); 6509 btrfs_put_block_group(block_group); 6510 break; 6511 loop: 6512 failed_cluster_refill = false; 6513 failed_alloc = false; 6514 BUG_ON(index != get_block_group_index(block_group)); 6515 if (used_block_group != block_group) 6516 btrfs_put_block_group(used_block_group); 6517 btrfs_put_block_group(block_group); 6518 } 6519 up_read(&space_info->groups_sem); 6520 6521 if (!ins->objectid && loop >= LOOP_CACHING_WAIT && have_caching_bg) 6522 goto search; 6523 6524 if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES) 6525 goto search; 6526 6527 /* 6528 * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking 6529 * caching kthreads as we move along 6530 * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching 6531 * LOOP_ALLOC_CHUNK, force a chunk allocation and try again 6532 * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try 6533 * again 6534 */ 6535 if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE) { 6536 index = 0; 6537 loop++; 6538 if (loop == LOOP_ALLOC_CHUNK) { 6539 struct btrfs_trans_handle *trans; 6540 6541 trans = btrfs_join_transaction(root); 6542 if (IS_ERR(trans)) { 6543 ret = PTR_ERR(trans); 6544 goto out; 6545 } 6546 6547 ret = do_chunk_alloc(trans, root, flags, 6548 CHUNK_ALLOC_FORCE); 6549 /* 6550 * Do not bail out on ENOSPC since we 6551 * can do more things. 
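* (a later pass can still move on to LOOP_NO_EMPTY_SIZE, which retries with empty_size and empty_cluster forced to zero)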
6552 */ 6553 if (ret < 0 && ret != -ENOSPC) 6554 btrfs_abort_transaction(trans, 6555 root, ret); 6556 else 6557 ret = 0; 6558 btrfs_end_transaction(trans, root); 6559 if (ret) 6560 goto out; 6561 } 6562 6563 if (loop == LOOP_NO_EMPTY_SIZE) { 6564 empty_size = 0; 6565 empty_cluster = 0; 6566 } 6567 6568 goto search; 6569 } else if (!ins->objectid) { 6570 ret = -ENOSPC; 6571 } else if (ins->objectid) { 6572 ret = 0; 6573 } 6574 out: 6575 if (ret == -ENOSPC) 6576 ins->offset = max_extent_size; 6577 return ret; 6578 } 6579 6580 static void dump_space_info(struct btrfs_space_info *info, u64 bytes, 6581 int dump_block_groups) 6582 { 6583 struct btrfs_block_group_cache *cache; 6584 int index = 0; 6585 6586 spin_lock(&info->lock); 6587 printk(KERN_INFO "space_info %llu has %llu free, is %sfull\n", 6588 info->flags, 6589 info->total_bytes - info->bytes_used - info->bytes_pinned - 6590 info->bytes_reserved - info->bytes_readonly, 6591 (info->full) ? "" : "not "); 6592 printk(KERN_INFO "space_info total=%llu, used=%llu, pinned=%llu, " 6593 "reserved=%llu, may_use=%llu, readonly=%llu\n", 6594 info->total_bytes, info->bytes_used, info->bytes_pinned, 6595 info->bytes_reserved, info->bytes_may_use, 6596 info->bytes_readonly); 6597 spin_unlock(&info->lock); 6598 6599 if (!dump_block_groups) 6600 return; 6601 6602 down_read(&info->groups_sem); 6603 again: 6604 list_for_each_entry(cache, &info->block_groups[index], list) { 6605 spin_lock(&cache->lock); 6606 printk(KERN_INFO "block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %s\n", 6607 cache->key.objectid, cache->key.offset, 6608 btrfs_block_group_used(&cache->item), cache->pinned, 6609 cache->reserved, cache->ro ? "[readonly]" : ""); 6610 btrfs_dump_free_space(cache, bytes); 6611 spin_unlock(&cache->lock); 6612 } 6613 if (++index < BTRFS_NR_RAID_TYPES) 6614 goto again; 6615 up_read(&info->groups_sem); 6616 } 6617 6618 int btrfs_reserve_extent(struct btrfs_root *root, 6619 u64 num_bytes, u64 min_alloc_size, 6620 u64 empty_size, u64 hint_byte, 6621 struct btrfs_key *ins, int is_data) 6622 { 6623 bool final_tried = false; 6624 u64 flags; 6625 int ret; 6626 6627 flags = btrfs_get_alloc_profile(root, is_data); 6628 again: 6629 WARN_ON(num_bytes < root->sectorsize); 6630 ret = find_free_extent(root, num_bytes, empty_size, hint_byte, ins, 6631 flags); 6632 6633 if (ret == -ENOSPC) { 6634 if (!final_tried && ins->offset) { 6635 num_bytes = min(num_bytes >> 1, ins->offset); 6636 num_bytes = round_down(num_bytes, root->sectorsize); 6637 num_bytes = max(num_bytes, min_alloc_size); 6638 if (num_bytes == min_alloc_size) 6639 final_tried = true; 6640 goto again; 6641 } else if (btrfs_test_opt(root, ENOSPC_DEBUG)) { 6642 struct btrfs_space_info *sinfo; 6643 6644 sinfo = __find_space_info(root->fs_info, flags); 6645 btrfs_err(root->fs_info, "allocation failed flags %llu, wanted %llu", 6646 flags, num_bytes); 6647 if (sinfo) 6648 dump_space_info(sinfo, num_bytes, 1); 6649 } 6650 } 6651 6652 return ret; 6653 } 6654 6655 static int __btrfs_free_reserved_extent(struct btrfs_root *root, 6656 u64 start, u64 len, int pin) 6657 { 6658 struct btrfs_block_group_cache *cache; 6659 int ret = 0; 6660 6661 cache = btrfs_lookup_block_group(root->fs_info, start); 6662 if (!cache) { 6663 btrfs_err(root->fs_info, "Unable to find block group for %llu", 6664 start); 6665 return -ENOSPC; 6666 } 6667 6668 if (btrfs_test_opt(root, DISCARD)) 6669 ret = btrfs_discard_extent(root, start, len, NULL); 6670 6671 if (pin) 6672 pin_down_extent(root, cache, start, len, 1); 6673 else { 
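/* not pinning: hand the range straight back to the free-space cache and release its reservation */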
6674 btrfs_add_free_space(cache, start, len); 6675 btrfs_update_reserved_bytes(cache, len, RESERVE_FREE); 6676 } 6677 btrfs_put_block_group(cache); 6678 6679 trace_btrfs_reserved_extent_free(root, start, len); 6680 6681 return ret; 6682 } 6683 6684 int btrfs_free_reserved_extent(struct btrfs_root *root, 6685 u64 start, u64 len) 6686 { 6687 return __btrfs_free_reserved_extent(root, start, len, 0); 6688 } 6689 6690 int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root, 6691 u64 start, u64 len) 6692 { 6693 return __btrfs_free_reserved_extent(root, start, len, 1); 6694 } 6695 6696 static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, 6697 struct btrfs_root *root, 6698 u64 parent, u64 root_objectid, 6699 u64 flags, u64 owner, u64 offset, 6700 struct btrfs_key *ins, int ref_mod) 6701 { 6702 int ret; 6703 struct btrfs_fs_info *fs_info = root->fs_info; 6704 struct btrfs_extent_item *extent_item; 6705 struct btrfs_extent_inline_ref *iref; 6706 struct btrfs_path *path; 6707 struct extent_buffer *leaf; 6708 int type; 6709 u32 size; 6710 6711 if (parent > 0) 6712 type = BTRFS_SHARED_DATA_REF_KEY; 6713 else 6714 type = BTRFS_EXTENT_DATA_REF_KEY; 6715 6716 size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type); 6717 6718 path = btrfs_alloc_path(); 6719 if (!path) 6720 return -ENOMEM; 6721 6722 path->leave_spinning = 1; 6723 ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path, 6724 ins, size); 6725 if (ret) { 6726 btrfs_free_path(path); 6727 return ret; 6728 } 6729 6730 leaf = path->nodes[0]; 6731 extent_item = btrfs_item_ptr(leaf, path->slots[0], 6732 struct btrfs_extent_item); 6733 btrfs_set_extent_refs(leaf, extent_item, ref_mod); 6734 btrfs_set_extent_generation(leaf, extent_item, trans->transid); 6735 btrfs_set_extent_flags(leaf, extent_item, 6736 flags | BTRFS_EXTENT_FLAG_DATA); 6737 6738 iref = (struct btrfs_extent_inline_ref *)(extent_item + 1); 6739 btrfs_set_extent_inline_ref_type(leaf, iref, type); 6740 if (parent > 0) { 6741 struct btrfs_shared_data_ref *ref; 6742 ref = (struct btrfs_shared_data_ref *)(iref + 1); 6743 btrfs_set_extent_inline_ref_offset(leaf, iref, parent); 6744 btrfs_set_shared_data_ref_count(leaf, ref, ref_mod); 6745 } else { 6746 struct btrfs_extent_data_ref *ref; 6747 ref = (struct btrfs_extent_data_ref *)(&iref->offset); 6748 btrfs_set_extent_data_ref_root(leaf, ref, root_objectid); 6749 btrfs_set_extent_data_ref_objectid(leaf, ref, owner); 6750 btrfs_set_extent_data_ref_offset(leaf, ref, offset); 6751 btrfs_set_extent_data_ref_count(leaf, ref, ref_mod); 6752 } 6753 6754 btrfs_mark_buffer_dirty(path->nodes[0]); 6755 btrfs_free_path(path); 6756 6757 ret = update_block_group(root, ins->objectid, ins->offset, 1); 6758 if (ret) { /* -ENOENT, logic error */ 6759 btrfs_err(fs_info, "update block group failed for %llu %llu", 6760 ins->objectid, ins->offset); 6761 BUG(); 6762 } 6763 trace_btrfs_reserved_extent_alloc(root, ins->objectid, ins->offset); 6764 return ret; 6765 } 6766 6767 static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, 6768 struct btrfs_root *root, 6769 u64 parent, u64 root_objectid, 6770 u64 flags, struct btrfs_disk_key *key, 6771 int level, struct btrfs_key *ins) 6772 { 6773 int ret; 6774 struct btrfs_fs_info *fs_info = root->fs_info; 6775 struct btrfs_extent_item *extent_item; 6776 struct btrfs_tree_block_info *block_info; 6777 struct btrfs_extent_inline_ref *iref; 6778 struct btrfs_path *path; 6779 struct extent_buffer *leaf; 6780 u32 size = sizeof(*extent_item) + sizeof(*iref); 6781 bool 
skinny_metadata = btrfs_fs_incompat(root->fs_info, 6782 SKINNY_METADATA); 6783 6784 if (!skinny_metadata) 6785 size += sizeof(*block_info); 6786 6787 path = btrfs_alloc_path(); 6788 if (!path) { 6789 btrfs_free_and_pin_reserved_extent(root, ins->objectid, 6790 root->leafsize); 6791 return -ENOMEM; 6792 } 6793 6794 path->leave_spinning = 1; 6795 ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path, 6796 ins, size); 6797 if (ret) { 6798 btrfs_free_and_pin_reserved_extent(root, ins->objectid, 6799 root->leafsize); 6800 btrfs_free_path(path); 6801 return ret; 6802 } 6803 6804 leaf = path->nodes[0]; 6805 extent_item = btrfs_item_ptr(leaf, path->slots[0], 6806 struct btrfs_extent_item); 6807 btrfs_set_extent_refs(leaf, extent_item, 1); 6808 btrfs_set_extent_generation(leaf, extent_item, trans->transid); 6809 btrfs_set_extent_flags(leaf, extent_item, 6810 flags | BTRFS_EXTENT_FLAG_TREE_BLOCK); 6811 6812 if (skinny_metadata) { 6813 iref = (struct btrfs_extent_inline_ref *)(extent_item + 1); 6814 } else { 6815 block_info = (struct btrfs_tree_block_info *)(extent_item + 1); 6816 btrfs_set_tree_block_key(leaf, block_info, key); 6817 btrfs_set_tree_block_level(leaf, block_info, level); 6818 iref = (struct btrfs_extent_inline_ref *)(block_info + 1); 6819 } 6820 6821 if (parent > 0) { 6822 BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)); 6823 btrfs_set_extent_inline_ref_type(leaf, iref, 6824 BTRFS_SHARED_BLOCK_REF_KEY); 6825 btrfs_set_extent_inline_ref_offset(leaf, iref, parent); 6826 } else { 6827 btrfs_set_extent_inline_ref_type(leaf, iref, 6828 BTRFS_TREE_BLOCK_REF_KEY); 6829 btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid); 6830 } 6831 6832 btrfs_mark_buffer_dirty(leaf); 6833 btrfs_free_path(path); 6834 6835 ret = update_block_group(root, ins->objectid, root->leafsize, 1); 6836 if (ret) { /* -ENOENT, logic error */ 6837 btrfs_err(fs_info, "update block group failed for %llu %llu", 6838 ins->objectid, ins->offset); 6839 BUG(); 6840 } 6841 6842 trace_btrfs_reserved_extent_alloc(root, ins->objectid, root->leafsize); 6843 return ret; 6844 } 6845 6846 int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, 6847 struct btrfs_root *root, 6848 u64 root_objectid, u64 owner, 6849 u64 offset, struct btrfs_key *ins) 6850 { 6851 int ret; 6852 6853 BUG_ON(root_objectid == BTRFS_TREE_LOG_OBJECTID); 6854 6855 ret = btrfs_add_delayed_data_ref(root->fs_info, trans, ins->objectid, 6856 ins->offset, 0, 6857 root_objectid, owner, offset, 6858 BTRFS_ADD_DELAYED_EXTENT, NULL, 0); 6859 return ret; 6860 } 6861 6862 /* 6863 * this is used by the tree logging recovery code. It records that 6864 * an extent has been allocated and makes sure to clear the free 6865 * space cache bits as well 6866 */ 6867 int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, 6868 struct btrfs_root *root, 6869 u64 root_objectid, u64 owner, u64 offset, 6870 struct btrfs_key *ins) 6871 { 6872 int ret; 6873 struct btrfs_block_group_cache *block_group; 6874 6875 /* 6876 * Mixed block groups will exclude before processing the log so we only 6877 * need to do the exlude dance if this fs isn't mixed. 
6878 */ 6879 if (!btrfs_fs_incompat(root->fs_info, MIXED_GROUPS)) { 6880 ret = __exclude_logged_extent(root, ins->objectid, ins->offset); 6881 if (ret) 6882 return ret; 6883 } 6884 6885 block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid); 6886 if (!block_group) 6887 return -EINVAL; 6888 6889 ret = btrfs_update_reserved_bytes(block_group, ins->offset, 6890 RESERVE_ALLOC_NO_ACCOUNT); 6891 BUG_ON(ret); /* logic error */ 6892 ret = alloc_reserved_file_extent(trans, root, 0, root_objectid, 6893 0, owner, offset, ins, 1); 6894 btrfs_put_block_group(block_group); 6895 return ret; 6896 } 6897 6898 static struct extent_buffer * 6899 btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, 6900 u64 bytenr, u32 blocksize, int level) 6901 { 6902 struct extent_buffer *buf; 6903 6904 buf = btrfs_find_create_tree_block(root, bytenr, blocksize); 6905 if (!buf) 6906 return ERR_PTR(-ENOMEM); 6907 btrfs_set_header_generation(buf, trans->transid); 6908 btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level); 6909 btrfs_tree_lock(buf); 6910 clean_tree_block(trans, root, buf); 6911 clear_bit(EXTENT_BUFFER_STALE, &buf->bflags); 6912 6913 btrfs_set_lock_blocking(buf); 6914 btrfs_set_buffer_uptodate(buf); 6915 6916 if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) { 6917 /* 6918 * we allow two log transactions at a time, use different 6919 * EXENT bit to differentiate dirty pages. 6920 */ 6921 if (root->log_transid % 2 == 0) 6922 set_extent_dirty(&root->dirty_log_pages, buf->start, 6923 buf->start + buf->len - 1, GFP_NOFS); 6924 else 6925 set_extent_new(&root->dirty_log_pages, buf->start, 6926 buf->start + buf->len - 1, GFP_NOFS); 6927 } else { 6928 set_extent_dirty(&trans->transaction->dirty_pages, buf->start, 6929 buf->start + buf->len - 1, GFP_NOFS); 6930 } 6931 trans->blocks_used++; 6932 /* this returns a buffer locked for blocking */ 6933 return buf; 6934 } 6935 6936 static struct btrfs_block_rsv * 6937 use_block_rsv(struct btrfs_trans_handle *trans, 6938 struct btrfs_root *root, u32 blocksize) 6939 { 6940 struct btrfs_block_rsv *block_rsv; 6941 struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv; 6942 int ret; 6943 bool global_updated = false; 6944 6945 block_rsv = get_block_rsv(trans, root); 6946 6947 if (unlikely(block_rsv->size == 0)) 6948 goto try_reserve; 6949 again: 6950 ret = block_rsv_use_bytes(block_rsv, blocksize); 6951 if (!ret) 6952 return block_rsv; 6953 6954 if (block_rsv->failfast) 6955 return ERR_PTR(ret); 6956 6957 if (block_rsv->type == BTRFS_BLOCK_RSV_GLOBAL && !global_updated) { 6958 global_updated = true; 6959 update_global_block_rsv(root->fs_info); 6960 goto again; 6961 } 6962 6963 if (btrfs_test_opt(root, ENOSPC_DEBUG)) { 6964 static DEFINE_RATELIMIT_STATE(_rs, 6965 DEFAULT_RATELIMIT_INTERVAL * 10, 6966 /*DEFAULT_RATELIMIT_BURST*/ 1); 6967 if (__ratelimit(&_rs)) 6968 WARN(1, KERN_DEBUG 6969 "btrfs: block rsv returned %d\n", ret); 6970 } 6971 try_reserve: 6972 ret = reserve_metadata_bytes(root, block_rsv, blocksize, 6973 BTRFS_RESERVE_NO_FLUSH); 6974 if (!ret) 6975 return block_rsv; 6976 /* 6977 * If we couldn't reserve metadata bytes try and use some from 6978 * the global reserve if its space type is the same as the global 6979 * reservation. 
6980 */ 6981 if (block_rsv->type != BTRFS_BLOCK_RSV_GLOBAL && 6982 block_rsv->space_info == global_rsv->space_info) { 6983 ret = block_rsv_use_bytes(global_rsv, blocksize); 6984 if (!ret) 6985 return global_rsv; 6986 } 6987 return ERR_PTR(ret); 6988 } 6989 6990 static void unuse_block_rsv(struct btrfs_fs_info *fs_info, 6991 struct btrfs_block_rsv *block_rsv, u32 blocksize) 6992 { 6993 block_rsv_add_bytes(block_rsv, blocksize, 0); 6994 block_rsv_release_bytes(fs_info, block_rsv, NULL, 0); 6995 } 6996 6997 /* 6998 * finds a free extent and does all the dirty work required for allocation 6999 * returns the key for the extent through ins, and a tree buffer for 7000 * the first block of the extent through buf. 7001 * 7002 * returns the tree buffer or NULL. 7003 */ 7004 struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, 7005 struct btrfs_root *root, u32 blocksize, 7006 u64 parent, u64 root_objectid, 7007 struct btrfs_disk_key *key, int level, 7008 u64 hint, u64 empty_size) 7009 { 7010 struct btrfs_key ins; 7011 struct btrfs_block_rsv *block_rsv; 7012 struct extent_buffer *buf; 7013 u64 flags = 0; 7014 int ret; 7015 bool skinny_metadata = btrfs_fs_incompat(root->fs_info, 7016 SKINNY_METADATA); 7017 7018 block_rsv = use_block_rsv(trans, root, blocksize); 7019 if (IS_ERR(block_rsv)) 7020 return ERR_CAST(block_rsv); 7021 7022 ret = btrfs_reserve_extent(root, blocksize, blocksize, 7023 empty_size, hint, &ins, 0); 7024 if (ret) { 7025 unuse_block_rsv(root->fs_info, block_rsv, blocksize); 7026 return ERR_PTR(ret); 7027 } 7028 7029 buf = btrfs_init_new_buffer(trans, root, ins.objectid, 7030 blocksize, level); 7031 BUG_ON(IS_ERR(buf)); /* -ENOMEM */ 7032 7033 if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) { 7034 if (parent == 0) 7035 parent = ins.objectid; 7036 flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF; 7037 } else 7038 BUG_ON(parent > 0); 7039 7040 if (root_objectid != BTRFS_TREE_LOG_OBJECTID) { 7041 struct btrfs_delayed_extent_op *extent_op; 7042 extent_op = btrfs_alloc_delayed_extent_op(); 7043 BUG_ON(!extent_op); /* -ENOMEM */ 7044 if (key) 7045 memcpy(&extent_op->key, key, sizeof(extent_op->key)); 7046 else 7047 memset(&extent_op->key, 0, sizeof(extent_op->key)); 7048 extent_op->flags_to_set = flags; 7049 if (skinny_metadata) 7050 extent_op->update_key = 0; 7051 else 7052 extent_op->update_key = 1; 7053 extent_op->update_flags = 1; 7054 extent_op->is_data = 0; 7055 extent_op->level = level; 7056 7057 ret = btrfs_add_delayed_tree_ref(root->fs_info, trans, 7058 ins.objectid, 7059 ins.offset, parent, root_objectid, 7060 level, BTRFS_ADD_DELAYED_EXTENT, 7061 extent_op, 0); 7062 BUG_ON(ret); /* -ENOMEM */ 7063 } 7064 return buf; 7065 } 7066 7067 struct walk_control { 7068 u64 refs[BTRFS_MAX_LEVEL]; 7069 u64 flags[BTRFS_MAX_LEVEL]; 7070 struct btrfs_key update_progress; 7071 int stage; 7072 int level; 7073 int shared_level; 7074 int update_ref; 7075 int keep_locks; 7076 int reada_slot; 7077 int reada_count; 7078 int for_reloc; 7079 }; 7080 7081 #define DROP_REFERENCE 1 7082 #define UPDATE_BACKREF 2 7083 7084 static noinline void reada_walk_down(struct btrfs_trans_handle *trans, 7085 struct btrfs_root *root, 7086 struct walk_control *wc, 7087 struct btrfs_path *path) 7088 { 7089 u64 bytenr; 7090 u64 generation; 7091 u64 refs; 7092 u64 flags; 7093 u32 nritems; 7094 u32 blocksize; 7095 struct btrfs_key key; 7096 struct extent_buffer *eb; 7097 int ret; 7098 int slot; 7099 int nread = 0; 7100 7101 if (path->slots[wc->level] < wc->reada_slot) { 7102 wc->reada_count = wc->reada_count * 
2 / 3; 7103 wc->reada_count = max(wc->reada_count, 2); 7104 } else { 7105 wc->reada_count = wc->reada_count * 3 / 2; 7106 wc->reada_count = min_t(int, wc->reada_count, 7107 BTRFS_NODEPTRS_PER_BLOCK(root)); 7108 } 7109 7110 eb = path->nodes[wc->level]; 7111 nritems = btrfs_header_nritems(eb); 7112 blocksize = btrfs_level_size(root, wc->level - 1); 7113 7114 for (slot = path->slots[wc->level]; slot < nritems; slot++) { 7115 if (nread >= wc->reada_count) 7116 break; 7117 7118 cond_resched(); 7119 bytenr = btrfs_node_blockptr(eb, slot); 7120 generation = btrfs_node_ptr_generation(eb, slot); 7121 7122 if (slot == path->slots[wc->level]) 7123 goto reada; 7124 7125 if (wc->stage == UPDATE_BACKREF && 7126 generation <= root->root_key.offset) 7127 continue; 7128 7129 /* We don't lock the tree block, it's OK to be racy here */ 7130 ret = btrfs_lookup_extent_info(trans, root, bytenr, 7131 wc->level - 1, 1, &refs, 7132 &flags); 7133 /* We don't care about errors in readahead. */ 7134 if (ret < 0) 7135 continue; 7136 BUG_ON(refs == 0); 7137 7138 if (wc->stage == DROP_REFERENCE) { 7139 if (refs == 1) 7140 goto reada; 7141 7142 if (wc->level == 1 && 7143 (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) 7144 continue; 7145 if (!wc->update_ref || 7146 generation <= root->root_key.offset) 7147 continue; 7148 btrfs_node_key_to_cpu(eb, &key, slot); 7149 ret = btrfs_comp_cpu_keys(&key, 7150 &wc->update_progress); 7151 if (ret < 0) 7152 continue; 7153 } else { 7154 if (wc->level == 1 && 7155 (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) 7156 continue; 7157 } 7158 reada: 7159 ret = readahead_tree_block(root, bytenr, blocksize, 7160 generation); 7161 if (ret) 7162 break; 7163 nread++; 7164 } 7165 wc->reada_slot = slot; 7166 } 7167 7168 /* 7169 * helper to process tree block while walking down the tree. 7170 * 7171 * when wc->stage == UPDATE_BACKREF, this function updates 7172 * back refs for pointers in the block. 7173 * 7174 * NOTE: return value 1 means we should stop walking down. 7175 */ 7176 static noinline int walk_down_proc(struct btrfs_trans_handle *trans, 7177 struct btrfs_root *root, 7178 struct btrfs_path *path, 7179 struct walk_control *wc, int lookup_info) 7180 { 7181 int level = wc->level; 7182 struct extent_buffer *eb = path->nodes[level]; 7183 u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF; 7184 int ret; 7185 7186 if (wc->stage == UPDATE_BACKREF && 7187 btrfs_header_owner(eb) != root->root_key.objectid) 7188 return 1; 7189 7190 /* 7191 * when reference count of tree block is 1, it won't increase 7192 * again. once full backref flag is set, we never clear it. 
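* So the refs/flags only need to be re-read from the extent tree while they can still matter: blocks that may be shared during DROP_REFERENCE, or blocks whose full backref flag is not yet set during UPDATE_BACKREF.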
7193 */ 7194 if (lookup_info && 7195 ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) || 7196 (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) { 7197 BUG_ON(!path->locks[level]); 7198 ret = btrfs_lookup_extent_info(trans, root, 7199 eb->start, level, 1, 7200 &wc->refs[level], 7201 &wc->flags[level]); 7202 BUG_ON(ret == -ENOMEM); 7203 if (ret) 7204 return ret; 7205 BUG_ON(wc->refs[level] == 0); 7206 } 7207 7208 if (wc->stage == DROP_REFERENCE) { 7209 if (wc->refs[level] > 1) 7210 return 1; 7211 7212 if (path->locks[level] && !wc->keep_locks) { 7213 btrfs_tree_unlock_rw(eb, path->locks[level]); 7214 path->locks[level] = 0; 7215 } 7216 return 0; 7217 } 7218 7219 /* wc->stage == UPDATE_BACKREF */ 7220 if (!(wc->flags[level] & flag)) { 7221 BUG_ON(!path->locks[level]); 7222 ret = btrfs_inc_ref(trans, root, eb, 1, wc->for_reloc); 7223 BUG_ON(ret); /* -ENOMEM */ 7224 ret = btrfs_dec_ref(trans, root, eb, 0, wc->for_reloc); 7225 BUG_ON(ret); /* -ENOMEM */ 7226 ret = btrfs_set_disk_extent_flags(trans, root, eb->start, 7227 eb->len, flag, 7228 btrfs_header_level(eb), 0); 7229 BUG_ON(ret); /* -ENOMEM */ 7230 wc->flags[level] |= flag; 7231 } 7232 7233 /* 7234 * the block is shared by multiple trees, so it's not good to 7235 * keep the tree lock 7236 */ 7237 if (path->locks[level] && level > 0) { 7238 btrfs_tree_unlock_rw(eb, path->locks[level]); 7239 path->locks[level] = 0; 7240 } 7241 return 0; 7242 } 7243 7244 /* 7245 * helper to process tree block pointer. 7246 * 7247 * when wc->stage == DROP_REFERENCE, this function checks 7248 * reference count of the block pointed to. if the block 7249 * is shared and we need update back refs for the subtree 7250 * rooted at the block, this function changes wc->stage to 7251 * UPDATE_BACKREF. if the block is shared and there is no 7252 * need to update back, this function drops the reference 7253 * to the block. 7254 * 7255 * NOTE: return value 1 means we should stop walking down. 
7256 */ 7257 static noinline int do_walk_down(struct btrfs_trans_handle *trans, 7258 struct btrfs_root *root, 7259 struct btrfs_path *path, 7260 struct walk_control *wc, int *lookup_info) 7261 { 7262 u64 bytenr; 7263 u64 generation; 7264 u64 parent; 7265 u32 blocksize; 7266 struct btrfs_key key; 7267 struct extent_buffer *next; 7268 int level = wc->level; 7269 int reada = 0; 7270 int ret = 0; 7271 7272 generation = btrfs_node_ptr_generation(path->nodes[level], 7273 path->slots[level]); 7274 /* 7275 * if the lower level block was created before the snapshot 7276 * was created, we know there is no need to update back refs 7277 * for the subtree 7278 */ 7279 if (wc->stage == UPDATE_BACKREF && 7280 generation <= root->root_key.offset) { 7281 *lookup_info = 1; 7282 return 1; 7283 } 7284 7285 bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]); 7286 blocksize = btrfs_level_size(root, level - 1); 7287 7288 next = btrfs_find_tree_block(root, bytenr, blocksize); 7289 if (!next) { 7290 next = btrfs_find_create_tree_block(root, bytenr, blocksize); 7291 if (!next) 7292 return -ENOMEM; 7293 btrfs_set_buffer_lockdep_class(root->root_key.objectid, next, 7294 level - 1); 7295 reada = 1; 7296 } 7297 btrfs_tree_lock(next); 7298 btrfs_set_lock_blocking(next); 7299 7300 ret = btrfs_lookup_extent_info(trans, root, bytenr, level - 1, 1, 7301 &wc->refs[level - 1], 7302 &wc->flags[level - 1]); 7303 if (ret < 0) { 7304 btrfs_tree_unlock(next); 7305 return ret; 7306 } 7307 7308 if (unlikely(wc->refs[level - 1] == 0)) { 7309 btrfs_err(root->fs_info, "Missing references."); 7310 BUG(); 7311 } 7312 *lookup_info = 0; 7313 7314 if (wc->stage == DROP_REFERENCE) { 7315 if (wc->refs[level - 1] > 1) { 7316 if (level == 1 && 7317 (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF)) 7318 goto skip; 7319 7320 if (!wc->update_ref || 7321 generation <= root->root_key.offset) 7322 goto skip; 7323 7324 btrfs_node_key_to_cpu(path->nodes[level], &key, 7325 path->slots[level]); 7326 ret = btrfs_comp_cpu_keys(&key, &wc->update_progress); 7327 if (ret < 0) 7328 goto skip; 7329 7330 wc->stage = UPDATE_BACKREF; 7331 wc->shared_level = level - 1; 7332 } 7333 } else { 7334 if (level == 1 && 7335 (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF)) 7336 goto skip; 7337 } 7338 7339 if (!btrfs_buffer_uptodate(next, generation, 0)) { 7340 btrfs_tree_unlock(next); 7341 free_extent_buffer(next); 7342 next = NULL; 7343 *lookup_info = 1; 7344 } 7345 7346 if (!next) { 7347 if (reada && level == 1) 7348 reada_walk_down(trans, root, wc, path); 7349 next = read_tree_block(root, bytenr, blocksize, generation); 7350 if (!next || !extent_buffer_uptodate(next)) { 7351 free_extent_buffer(next); 7352 return -EIO; 7353 } 7354 btrfs_tree_lock(next); 7355 btrfs_set_lock_blocking(next); 7356 } 7357 7358 level--; 7359 BUG_ON(level != btrfs_header_level(next)); 7360 path->nodes[level] = next; 7361 path->slots[level] = 0; 7362 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; 7363 wc->level = level; 7364 if (wc->level == 1) 7365 wc->reada_slot = 0; 7366 return 0; 7367 skip: 7368 wc->refs[level - 1] = 0; 7369 wc->flags[level - 1] = 0; 7370 if (wc->stage == DROP_REFERENCE) { 7371 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) { 7372 parent = path->nodes[level]->start; 7373 } else { 7374 BUG_ON(root->root_key.objectid != 7375 btrfs_header_owner(path->nodes[level])); 7376 parent = 0; 7377 } 7378 7379 ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent, 7380 root->root_key.objectid, level - 1, 0, 0); 7381 BUG_ON(ret); /* -ENOMEM */ 7382 } 7383 
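/* we are not descending into this child: unlock it, drop our reference and return 1 so the caller advances to the next slot at this level */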
btrfs_tree_unlock(next); 7384 free_extent_buffer(next); 7385 *lookup_info = 1; 7386 return 1; 7387 } 7388 7389 /* 7390 * helper to process tree block while walking up the tree. 7391 * 7392 * when wc->stage == DROP_REFERENCE, this function drops 7393 * reference count on the block. 7394 * 7395 * when wc->stage == UPDATE_BACKREF, this function changes 7396 * wc->stage back to DROP_REFERENCE if we changed wc->stage 7397 * to UPDATE_BACKREF previously while processing the block. 7398 * 7399 * NOTE: return value 1 means we should stop walking up. 7400 */ 7401 static noinline int walk_up_proc(struct btrfs_trans_handle *trans, 7402 struct btrfs_root *root, 7403 struct btrfs_path *path, 7404 struct walk_control *wc) 7405 { 7406 int ret; 7407 int level = wc->level; 7408 struct extent_buffer *eb = path->nodes[level]; 7409 u64 parent = 0; 7410 7411 if (wc->stage == UPDATE_BACKREF) { 7412 BUG_ON(wc->shared_level < level); 7413 if (level < wc->shared_level) 7414 goto out; 7415 7416 ret = find_next_key(path, level + 1, &wc->update_progress); 7417 if (ret > 0) 7418 wc->update_ref = 0; 7419 7420 wc->stage = DROP_REFERENCE; 7421 wc->shared_level = -1; 7422 path->slots[level] = 0; 7423 7424 /* 7425 * check reference count again if the block isn't locked. 7426 * we should start walking down the tree again if reference 7427 * count is one. 7428 */ 7429 if (!path->locks[level]) { 7430 BUG_ON(level == 0); 7431 btrfs_tree_lock(eb); 7432 btrfs_set_lock_blocking(eb); 7433 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; 7434 7435 ret = btrfs_lookup_extent_info(trans, root, 7436 eb->start, level, 1, 7437 &wc->refs[level], 7438 &wc->flags[level]); 7439 if (ret < 0) { 7440 btrfs_tree_unlock_rw(eb, path->locks[level]); 7441 path->locks[level] = 0; 7442 return ret; 7443 } 7444 BUG_ON(wc->refs[level] == 0); 7445 if (wc->refs[level] == 1) { 7446 btrfs_tree_unlock_rw(eb, path->locks[level]); 7447 path->locks[level] = 0; 7448 return 1; 7449 } 7450 } 7451 } 7452 7453 /* wc->stage == DROP_REFERENCE */ 7454 BUG_ON(wc->refs[level] > 1 && !path->locks[level]); 7455 7456 if (wc->refs[level] == 1) { 7457 if (level == 0) { 7458 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) 7459 ret = btrfs_dec_ref(trans, root, eb, 1, 7460 wc->for_reloc); 7461 else 7462 ret = btrfs_dec_ref(trans, root, eb, 0, 7463 wc->for_reloc); 7464 BUG_ON(ret); /* -ENOMEM */ 7465 } 7466 /* make block locked assertion in clean_tree_block happy */ 7467 if (!path->locks[level] && 7468 btrfs_header_generation(eb) == trans->transid) { 7469 btrfs_tree_lock(eb); 7470 btrfs_set_lock_blocking(eb); 7471 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; 7472 } 7473 clean_tree_block(trans, root, eb); 7474 } 7475 7476 if (eb == root->node) { 7477 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) 7478 parent = eb->start; 7479 else 7480 BUG_ON(root->root_key.objectid != 7481 btrfs_header_owner(eb)); 7482 } else { 7483 if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF) 7484 parent = path->nodes[level + 1]->start; 7485 else 7486 BUG_ON(root->root_key.objectid != 7487 btrfs_header_owner(path->nodes[level + 1])); 7488 } 7489 7490 btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1); 7491 out: 7492 wc->refs[level] = 0; 7493 wc->flags[level] = 0; 7494 return 0; 7495 } 7496 7497 static noinline int walk_down_tree(struct btrfs_trans_handle *trans, 7498 struct btrfs_root *root, 7499 struct btrfs_path *path, 7500 struct walk_control *wc) 7501 { 7502 int level = wc->level; 7503 int lookup_info = 1; 7504 int ret; 7505 7506 while (level >= 0) { 7507 ret = 
walk_down_proc(trans, root, path, wc, lookup_info); 7508 if (ret > 0) 7509 break; 7510 7511 if (level == 0) 7512 break; 7513 7514 if (path->slots[level] >= 7515 btrfs_header_nritems(path->nodes[level])) 7516 break; 7517 7518 ret = do_walk_down(trans, root, path, wc, &lookup_info); 7519 if (ret > 0) { 7520 path->slots[level]++; 7521 continue; 7522 } else if (ret < 0) 7523 return ret; 7524 level = wc->level; 7525 } 7526 return 0; 7527 } 7528 7529 static noinline int walk_up_tree(struct btrfs_trans_handle *trans, 7530 struct btrfs_root *root, 7531 struct btrfs_path *path, 7532 struct walk_control *wc, int max_level) 7533 { 7534 int level = wc->level; 7535 int ret; 7536 7537 path->slots[level] = btrfs_header_nritems(path->nodes[level]); 7538 while (level < max_level && path->nodes[level]) { 7539 wc->level = level; 7540 if (path->slots[level] + 1 < 7541 btrfs_header_nritems(path->nodes[level])) { 7542 path->slots[level]++; 7543 return 0; 7544 } else { 7545 ret = walk_up_proc(trans, root, path, wc); 7546 if (ret > 0) 7547 return 0; 7548 7549 if (path->locks[level]) { 7550 btrfs_tree_unlock_rw(path->nodes[level], 7551 path->locks[level]); 7552 path->locks[level] = 0; 7553 } 7554 free_extent_buffer(path->nodes[level]); 7555 path->nodes[level] = NULL; 7556 level++; 7557 } 7558 } 7559 return 1; 7560 } 7561 7562 /* 7563 * drop a subvolume tree. 7564 * 7565 * this function traverses the tree freeing any blocks that only 7566 * referenced by the tree. 7567 * 7568 * when a shared tree block is found. this function decreases its 7569 * reference count by one. if update_ref is true, this function 7570 * also make sure backrefs for the shared block and all lower level 7571 * blocks are properly updated. 7572 * 7573 * If called with for_reloc == 0, may exit early with -EAGAIN 7574 */ 7575 int btrfs_drop_snapshot(struct btrfs_root *root, 7576 struct btrfs_block_rsv *block_rsv, int update_ref, 7577 int for_reloc) 7578 { 7579 struct btrfs_path *path; 7580 struct btrfs_trans_handle *trans; 7581 struct btrfs_root *tree_root = root->fs_info->tree_root; 7582 struct btrfs_root_item *root_item = &root->root_item; 7583 struct walk_control *wc; 7584 struct btrfs_key key; 7585 int err = 0; 7586 int ret; 7587 int level; 7588 bool root_dropped = false; 7589 7590 path = btrfs_alloc_path(); 7591 if (!path) { 7592 err = -ENOMEM; 7593 goto out; 7594 } 7595 7596 wc = kzalloc(sizeof(*wc), GFP_NOFS); 7597 if (!wc) { 7598 btrfs_free_path(path); 7599 err = -ENOMEM; 7600 goto out; 7601 } 7602 7603 trans = btrfs_start_transaction(tree_root, 0); 7604 if (IS_ERR(trans)) { 7605 err = PTR_ERR(trans); 7606 goto out_free; 7607 } 7608 7609 if (block_rsv) 7610 trans->block_rsv = block_rsv; 7611 7612 if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) { 7613 level = btrfs_header_level(root->node); 7614 path->nodes[level] = btrfs_lock_root_node(root); 7615 btrfs_set_lock_blocking(path->nodes[level]); 7616 path->slots[level] = 0; 7617 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; 7618 memset(&wc->update_progress, 0, 7619 sizeof(wc->update_progress)); 7620 } else { 7621 btrfs_disk_key_to_cpu(&key, &root_item->drop_progress); 7622 memcpy(&wc->update_progress, &key, 7623 sizeof(wc->update_progress)); 7624 7625 level = root_item->drop_level; 7626 BUG_ON(level == 0); 7627 path->lowest_level = level; 7628 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 7629 path->lowest_level = 0; 7630 if (ret < 0) { 7631 err = ret; 7632 goto out_end_trans; 7633 } 7634 WARN_ON(ret > 0); 7635 7636 /* 7637 * unlock our path, this is safe because 
only this 7638 * function is allowed to delete this snapshot 7639 */ 7640 btrfs_unlock_up_safe(path, 0); 7641 7642 level = btrfs_header_level(root->node); 7643 while (1) { 7644 btrfs_tree_lock(path->nodes[level]); 7645 btrfs_set_lock_blocking(path->nodes[level]); 7646 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; 7647 7648 ret = btrfs_lookup_extent_info(trans, root, 7649 path->nodes[level]->start, 7650 level, 1, &wc->refs[level], 7651 &wc->flags[level]); 7652 if (ret < 0) { 7653 err = ret; 7654 goto out_end_trans; 7655 } 7656 BUG_ON(wc->refs[level] == 0); 7657 7658 if (level == root_item->drop_level) 7659 break; 7660 7661 btrfs_tree_unlock(path->nodes[level]); 7662 path->locks[level] = 0; 7663 WARN_ON(wc->refs[level] != 1); 7664 level--; 7665 } 7666 } 7667 7668 wc->level = level; 7669 wc->shared_level = -1; 7670 wc->stage = DROP_REFERENCE; 7671 wc->update_ref = update_ref; 7672 wc->keep_locks = 0; 7673 wc->for_reloc = for_reloc; 7674 wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root); 7675 7676 while (1) { 7677 7678 ret = walk_down_tree(trans, root, path, wc); 7679 if (ret < 0) { 7680 err = ret; 7681 break; 7682 } 7683 7684 ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL); 7685 if (ret < 0) { 7686 err = ret; 7687 break; 7688 } 7689 7690 if (ret > 0) { 7691 BUG_ON(wc->stage != DROP_REFERENCE); 7692 break; 7693 } 7694 7695 if (wc->stage == DROP_REFERENCE) { 7696 level = wc->level; 7697 btrfs_node_key(path->nodes[level], 7698 &root_item->drop_progress, 7699 path->slots[level]); 7700 root_item->drop_level = level; 7701 } 7702 7703 BUG_ON(wc->level == 0); 7704 if (btrfs_should_end_transaction(trans, tree_root) || 7705 (!for_reloc && btrfs_need_cleaner_sleep(root))) { 7706 ret = btrfs_update_root(trans, tree_root, 7707 &root->root_key, 7708 root_item); 7709 if (ret) { 7710 btrfs_abort_transaction(trans, tree_root, ret); 7711 err = ret; 7712 goto out_end_trans; 7713 } 7714 7715 btrfs_end_transaction_throttle(trans, tree_root); 7716 if (!for_reloc && btrfs_need_cleaner_sleep(root)) { 7717 pr_debug("btrfs: drop snapshot early exit\n"); 7718 err = -EAGAIN; 7719 goto out_free; 7720 } 7721 7722 trans = btrfs_start_transaction(tree_root, 0); 7723 if (IS_ERR(trans)) { 7724 err = PTR_ERR(trans); 7725 goto out_free; 7726 } 7727 if (block_rsv) 7728 trans->block_rsv = block_rsv; 7729 } 7730 } 7731 btrfs_release_path(path); 7732 if (err) 7733 goto out_end_trans; 7734 7735 ret = btrfs_del_root(trans, tree_root, &root->root_key); 7736 if (ret) { 7737 btrfs_abort_transaction(trans, tree_root, ret); 7738 goto out_end_trans; 7739 } 7740 7741 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) { 7742 ret = btrfs_find_root(tree_root, &root->root_key, path, 7743 NULL, NULL); 7744 if (ret < 0) { 7745 btrfs_abort_transaction(trans, tree_root, ret); 7746 err = ret; 7747 goto out_end_trans; 7748 } else if (ret > 0) { 7749 /* if we fail to delete the orphan item this time 7750 * around, it'll get picked up the next time. 7751 * 7752 * The most common failure here is just -ENOENT. 
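 *
 * Accordingly, the return value of btrfs_del_orphan_item() below is
 * not checked: a failure simply leaves the orphan item in place to be
 * picked up on a later pass, exactly as described above.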
7753 */ 7754 btrfs_del_orphan_item(trans, tree_root, 7755 root->root_key.objectid); 7756 } 7757 } 7758 7759 if (root->in_radix) { 7760 btrfs_drop_and_free_fs_root(tree_root->fs_info, root); 7761 } else { 7762 free_extent_buffer(root->node); 7763 free_extent_buffer(root->commit_root); 7764 btrfs_put_fs_root(root); 7765 } 7766 root_dropped = true; 7767 out_end_trans: 7768 btrfs_end_transaction_throttle(trans, tree_root); 7769 out_free: 7770 kfree(wc); 7771 btrfs_free_path(path); 7772 out: 7773 /* 7774 * So if we need to stop dropping the snapshot for whatever reason we 7775 * need to make sure to add it back to the dead root list so that we 7776 * keep trying to do the work later. This also cleans up roots if we 7777 * don't have it in the radix (like when we recover after a power fail 7778 * or unmount) so we don't leak memory. 7779 */ 7780 if (!for_reloc && root_dropped == false) 7781 btrfs_add_dead_root(root); 7782 if (err) 7783 btrfs_std_error(root->fs_info, err); 7784 return err; 7785 } 7786 7787 /* 7788 * drop subtree rooted at tree block 'node'. 7789 * 7790 * NOTE: this function will unlock and release tree block 'node' 7791 * only used by relocation code 7792 */ 7793 int btrfs_drop_subtree(struct btrfs_trans_handle *trans, 7794 struct btrfs_root *root, 7795 struct extent_buffer *node, 7796 struct extent_buffer *parent) 7797 { 7798 struct btrfs_path *path; 7799 struct walk_control *wc; 7800 int level; 7801 int parent_level; 7802 int ret = 0; 7803 int wret; 7804 7805 BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID); 7806 7807 path = btrfs_alloc_path(); 7808 if (!path) 7809 return -ENOMEM; 7810 7811 wc = kzalloc(sizeof(*wc), GFP_NOFS); 7812 if (!wc) { 7813 btrfs_free_path(path); 7814 return -ENOMEM; 7815 } 7816 7817 btrfs_assert_tree_locked(parent); 7818 parent_level = btrfs_header_level(parent); 7819 extent_buffer_get(parent); 7820 path->nodes[parent_level] = parent; 7821 path->slots[parent_level] = btrfs_header_nritems(parent); 7822 7823 btrfs_assert_tree_locked(node); 7824 level = btrfs_header_level(node); 7825 path->nodes[level] = node; 7826 path->slots[level] = 0; 7827 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; 7828 7829 wc->refs[parent_level] = 1; 7830 wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF; 7831 wc->level = level; 7832 wc->shared_level = -1; 7833 wc->stage = DROP_REFERENCE; 7834 wc->update_ref = 0; 7835 wc->keep_locks = 1; 7836 wc->for_reloc = 1; 7837 wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root); 7838 7839 while (1) { 7840 wret = walk_down_tree(trans, root, path, wc); 7841 if (wret < 0) { 7842 ret = wret; 7843 break; 7844 } 7845 7846 wret = walk_up_tree(trans, root, path, wc, parent_level); 7847 if (wret < 0) 7848 ret = wret; 7849 if (wret != 0) 7850 break; 7851 } 7852 7853 kfree(wc); 7854 btrfs_free_path(path); 7855 return ret; 7856 } 7857 7858 static u64 update_block_group_flags(struct btrfs_root *root, u64 flags) 7859 { 7860 u64 num_devices; 7861 u64 stripped; 7862 7863 /* 7864 * if restripe for this chunk_type is on pick target profile and 7865 * return, otherwise do the usual balance 7866 */ 7867 stripped = get_restripe_target(root->fs_info, flags); 7868 if (stripped) 7869 return extended_to_chunk(stripped); 7870 7871 /* 7872 * we add in the count of missing devices because we want 7873 * to make sure that any RAID levels on a degraded FS 7874 * continue to be honored. 
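 *
 * For example: a two-device RAID1 filesystem running degraded with
 * one device missing still computes num_devices == 2 (one rw device
 * plus one missing device), so the RAID1 profile is left untouched
 * instead of being downgraded to DUP by the num_devices == 1 case
 * below.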
7875 */ 7876 num_devices = root->fs_info->fs_devices->rw_devices + 7877 root->fs_info->fs_devices->missing_devices; 7878 7879 stripped = BTRFS_BLOCK_GROUP_RAID0 | 7880 BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6 | 7881 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10; 7882 7883 if (num_devices == 1) { 7884 stripped |= BTRFS_BLOCK_GROUP_DUP; 7885 stripped = flags & ~stripped; 7886 7887 /* turn raid0 into single device chunks */ 7888 if (flags & BTRFS_BLOCK_GROUP_RAID0) 7889 return stripped; 7890 7891 /* turn mirroring into duplication */ 7892 if (flags & (BTRFS_BLOCK_GROUP_RAID1 | 7893 BTRFS_BLOCK_GROUP_RAID10)) 7894 return stripped | BTRFS_BLOCK_GROUP_DUP; 7895 } else { 7896 /* they already had raid on here, just return */ 7897 if (flags & stripped) 7898 return flags; 7899 7900 stripped |= BTRFS_BLOCK_GROUP_DUP; 7901 stripped = flags & ~stripped; 7902 7903 /* switch duplicated blocks with raid1 */ 7904 if (flags & BTRFS_BLOCK_GROUP_DUP) 7905 return stripped | BTRFS_BLOCK_GROUP_RAID1; 7906 7907 /* this is drive concat, leave it alone */ 7908 } 7909 7910 return flags; 7911 } 7912 7913 static int set_block_group_ro(struct btrfs_block_group_cache *cache, int force) 7914 { 7915 struct btrfs_space_info *sinfo = cache->space_info; 7916 u64 num_bytes; 7917 u64 min_allocable_bytes; 7918 int ret = -ENOSPC; 7919 7920 7921 /* 7922 * We need some metadata space and system metadata space for 7923 * allocating chunks in some corner cases until we force to set 7924 * it to be readonly. 7925 */ 7926 if ((sinfo->flags & 7927 (BTRFS_BLOCK_GROUP_SYSTEM | BTRFS_BLOCK_GROUP_METADATA)) && 7928 !force) 7929 min_allocable_bytes = 1 * 1024 * 1024; 7930 else 7931 min_allocable_bytes = 0; 7932 7933 spin_lock(&sinfo->lock); 7934 spin_lock(&cache->lock); 7935 7936 if (cache->ro) { 7937 ret = 0; 7938 goto out; 7939 } 7940 7941 num_bytes = cache->key.offset - cache->reserved - cache->pinned - 7942 cache->bytes_super - btrfs_block_group_used(&cache->item); 7943 7944 if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned + 7945 sinfo->bytes_may_use + sinfo->bytes_readonly + num_bytes + 7946 min_allocable_bytes <= sinfo->total_bytes) { 7947 sinfo->bytes_readonly += num_bytes; 7948 cache->ro = 1; 7949 ret = 0; 7950 } 7951 out: 7952 spin_unlock(&cache->lock); 7953 spin_unlock(&sinfo->lock); 7954 return ret; 7955 } 7956 7957 int btrfs_set_block_group_ro(struct btrfs_root *root, 7958 struct btrfs_block_group_cache *cache) 7959 7960 { 7961 struct btrfs_trans_handle *trans; 7962 u64 alloc_flags; 7963 int ret; 7964 7965 BUG_ON(cache->ro); 7966 7967 trans = btrfs_join_transaction(root); 7968 if (IS_ERR(trans)) 7969 return PTR_ERR(trans); 7970 7971 alloc_flags = update_block_group_flags(root, cache->flags); 7972 if (alloc_flags != cache->flags) { 7973 ret = do_chunk_alloc(trans, root, alloc_flags, 7974 CHUNK_ALLOC_FORCE); 7975 if (ret < 0) 7976 goto out; 7977 } 7978 7979 ret = set_block_group_ro(cache, 0); 7980 if (!ret) 7981 goto out; 7982 alloc_flags = get_alloc_profile(root, cache->space_info->flags); 7983 ret = do_chunk_alloc(trans, root, alloc_flags, 7984 CHUNK_ALLOC_FORCE); 7985 if (ret < 0) 7986 goto out; 7987 ret = set_block_group_ro(cache, 0); 7988 out: 7989 btrfs_end_transaction(trans, root); 7990 return ret; 7991 } 7992 7993 int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, 7994 struct btrfs_root *root, u64 type) 7995 { 7996 u64 alloc_flags = get_alloc_profile(root, type); 7997 return do_chunk_alloc(trans, root, alloc_flags, 7998 CHUNK_ALLOC_FORCE); 7999 } 8000 8001 /* 8002 * helper to 
account the unused space of all the readonly block group in the 8003 * list. takes mirrors into account. 8004 */ 8005 static u64 __btrfs_get_ro_block_group_free_space(struct list_head *groups_list) 8006 { 8007 struct btrfs_block_group_cache *block_group; 8008 u64 free_bytes = 0; 8009 int factor; 8010 8011 list_for_each_entry(block_group, groups_list, list) { 8012 spin_lock(&block_group->lock); 8013 8014 if (!block_group->ro) { 8015 spin_unlock(&block_group->lock); 8016 continue; 8017 } 8018 8019 if (block_group->flags & (BTRFS_BLOCK_GROUP_RAID1 | 8020 BTRFS_BLOCK_GROUP_RAID10 | 8021 BTRFS_BLOCK_GROUP_DUP)) 8022 factor = 2; 8023 else 8024 factor = 1; 8025 8026 free_bytes += (block_group->key.offset - 8027 btrfs_block_group_used(&block_group->item)) * 8028 factor; 8029 8030 spin_unlock(&block_group->lock); 8031 } 8032 8033 return free_bytes; 8034 } 8035 8036 /* 8037 * helper to account the unused space of all the readonly block group in the 8038 * space_info. takes mirrors into account. 8039 */ 8040 u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo) 8041 { 8042 int i; 8043 u64 free_bytes = 0; 8044 8045 spin_lock(&sinfo->lock); 8046 8047 for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) 8048 if (!list_empty(&sinfo->block_groups[i])) 8049 free_bytes += __btrfs_get_ro_block_group_free_space( 8050 &sinfo->block_groups[i]); 8051 8052 spin_unlock(&sinfo->lock); 8053 8054 return free_bytes; 8055 } 8056 8057 void btrfs_set_block_group_rw(struct btrfs_root *root, 8058 struct btrfs_block_group_cache *cache) 8059 { 8060 struct btrfs_space_info *sinfo = cache->space_info; 8061 u64 num_bytes; 8062 8063 BUG_ON(!cache->ro); 8064 8065 spin_lock(&sinfo->lock); 8066 spin_lock(&cache->lock); 8067 num_bytes = cache->key.offset - cache->reserved - cache->pinned - 8068 cache->bytes_super - btrfs_block_group_used(&cache->item); 8069 sinfo->bytes_readonly -= num_bytes; 8070 cache->ro = 0; 8071 spin_unlock(&cache->lock); 8072 spin_unlock(&sinfo->lock); 8073 } 8074 8075 /* 8076 * checks to see if its even possible to relocate this block group. 8077 * 8078 * @return - -1 if it's not a good idea to relocate this block group, 0 if its 8079 * ok to go ahead and try. 8080 */ 8081 int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr) 8082 { 8083 struct btrfs_block_group_cache *block_group; 8084 struct btrfs_space_info *space_info; 8085 struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; 8086 struct btrfs_device *device; 8087 struct btrfs_trans_handle *trans; 8088 u64 min_free; 8089 u64 dev_min = 1; 8090 u64 dev_nr = 0; 8091 u64 target; 8092 int index; 8093 int full = 0; 8094 int ret = 0; 8095 8096 block_group = btrfs_lookup_block_group(root->fs_info, bytenr); 8097 8098 /* odd, couldn't find the block group, leave it alone */ 8099 if (!block_group) 8100 return -1; 8101 8102 min_free = btrfs_block_group_used(&block_group->item); 8103 8104 /* no bytes used, we're good */ 8105 if (!min_free) 8106 goto out; 8107 8108 space_info = block_group->space_info; 8109 spin_lock(&space_info->lock); 8110 8111 full = space_info->full; 8112 8113 /* 8114 * if this is the last block group we have in this space, we can't 8115 * relocate it unless we're able to allocate a new chunk below. 8116 * 8117 * Otherwise, we need to make sure we have room in the space to handle 8118 * all of the extents from this block group. 
If we can, we're good 8119 */ 8120 if ((space_info->total_bytes != block_group->key.offset) && 8121 (space_info->bytes_used + space_info->bytes_reserved + 8122 space_info->bytes_pinned + space_info->bytes_readonly + 8123 min_free < space_info->total_bytes)) { 8124 spin_unlock(&space_info->lock); 8125 goto out; 8126 } 8127 spin_unlock(&space_info->lock); 8128 8129 /* 8130 * ok we don't have enough space, but maybe we have free space on our 8131 * devices to allocate new chunks for relocation, so loop through our 8132 * alloc devices and guess if we have enough space. if this block 8133 * group is going to be restriped, run checks against the target 8134 * profile instead of the current one. 8135 */ 8136 ret = -1; 8137 8138 /* 8139 * index: 8140 * 0: raid10 8141 * 1: raid1 8142 * 2: dup 8143 * 3: raid0 8144 * 4: single 8145 */ 8146 target = get_restripe_target(root->fs_info, block_group->flags); 8147 if (target) { 8148 index = __get_raid_index(extended_to_chunk(target)); 8149 } else { 8150 /* 8151 * this is just a balance, so if we were marked as full 8152 * we know there is no space for a new chunk 8153 */ 8154 if (full) 8155 goto out; 8156 8157 index = get_block_group_index(block_group); 8158 } 8159 8160 if (index == BTRFS_RAID_RAID10) { 8161 dev_min = 4; 8162 /* Divide by 2 */ 8163 min_free >>= 1; 8164 } else if (index == BTRFS_RAID_RAID1) { 8165 dev_min = 2; 8166 } else if (index == BTRFS_RAID_DUP) { 8167 /* Multiply by 2 */ 8168 min_free <<= 1; 8169 } else if (index == BTRFS_RAID_RAID0) { 8170 dev_min = fs_devices->rw_devices; 8171 do_div(min_free, dev_min); 8172 } 8173 8174 /* We need to do this so that we can look at pending chunks */ 8175 trans = btrfs_join_transaction(root); 8176 if (IS_ERR(trans)) { 8177 ret = PTR_ERR(trans); 8178 goto out; 8179 } 8180 8181 mutex_lock(&root->fs_info->chunk_mutex); 8182 list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) { 8183 u64 dev_offset; 8184 8185 /* 8186 * check to make sure we can actually find a chunk with enough 8187 * space to fit our block group in. 
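 *
 * For example (illustrative numbers only): for a RAID1 block group
 * with 1GiB of used space, min_free stays at 1GiB and dev_min is 2,
 * so relocation is only considered feasible once two different
 * devices have each offered a free dev extent of at least 1GiB.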
8188 */ 8189 if (device->total_bytes > device->bytes_used + min_free && 8190 !device->is_tgtdev_for_dev_replace) { 8191 ret = find_free_dev_extent(trans, device, min_free, 8192 &dev_offset, NULL); 8193 if (!ret) 8194 dev_nr++; 8195 8196 if (dev_nr >= dev_min) 8197 break; 8198 8199 ret = -1; 8200 } 8201 } 8202 mutex_unlock(&root->fs_info->chunk_mutex); 8203 btrfs_end_transaction(trans, root); 8204 out: 8205 btrfs_put_block_group(block_group); 8206 return ret; 8207 } 8208 8209 static int find_first_block_group(struct btrfs_root *root, 8210 struct btrfs_path *path, struct btrfs_key *key) 8211 { 8212 int ret = 0; 8213 struct btrfs_key found_key; 8214 struct extent_buffer *leaf; 8215 int slot; 8216 8217 ret = btrfs_search_slot(NULL, root, key, path, 0, 0); 8218 if (ret < 0) 8219 goto out; 8220 8221 while (1) { 8222 slot = path->slots[0]; 8223 leaf = path->nodes[0]; 8224 if (slot >= btrfs_header_nritems(leaf)) { 8225 ret = btrfs_next_leaf(root, path); 8226 if (ret == 0) 8227 continue; 8228 if (ret < 0) 8229 goto out; 8230 break; 8231 } 8232 btrfs_item_key_to_cpu(leaf, &found_key, slot); 8233 8234 if (found_key.objectid >= key->objectid && 8235 found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) { 8236 ret = 0; 8237 goto out; 8238 } 8239 path->slots[0]++; 8240 } 8241 out: 8242 return ret; 8243 } 8244 8245 void btrfs_put_block_group_cache(struct btrfs_fs_info *info) 8246 { 8247 struct btrfs_block_group_cache *block_group; 8248 u64 last = 0; 8249 8250 while (1) { 8251 struct inode *inode; 8252 8253 block_group = btrfs_lookup_first_block_group(info, last); 8254 while (block_group) { 8255 spin_lock(&block_group->lock); 8256 if (block_group->iref) 8257 break; 8258 spin_unlock(&block_group->lock); 8259 block_group = next_block_group(info->tree_root, 8260 block_group); 8261 } 8262 if (!block_group) { 8263 if (last == 0) 8264 break; 8265 last = 0; 8266 continue; 8267 } 8268 8269 inode = block_group->inode; 8270 block_group->iref = 0; 8271 block_group->inode = NULL; 8272 spin_unlock(&block_group->lock); 8273 iput(inode); 8274 last = block_group->key.objectid + block_group->key.offset; 8275 btrfs_put_block_group(block_group); 8276 } 8277 } 8278 8279 int btrfs_free_block_groups(struct btrfs_fs_info *info) 8280 { 8281 struct btrfs_block_group_cache *block_group; 8282 struct btrfs_space_info *space_info; 8283 struct btrfs_caching_control *caching_ctl; 8284 struct rb_node *n; 8285 8286 down_write(&info->extent_commit_sem); 8287 while (!list_empty(&info->caching_block_groups)) { 8288 caching_ctl = list_entry(info->caching_block_groups.next, 8289 struct btrfs_caching_control, list); 8290 list_del(&caching_ctl->list); 8291 put_caching_control(caching_ctl); 8292 } 8293 up_write(&info->extent_commit_sem); 8294 8295 spin_lock(&info->block_group_cache_lock); 8296 while ((n = rb_last(&info->block_group_cache_tree)) != NULL) { 8297 block_group = rb_entry(n, struct btrfs_block_group_cache, 8298 cache_node); 8299 rb_erase(&block_group->cache_node, 8300 &info->block_group_cache_tree); 8301 spin_unlock(&info->block_group_cache_lock); 8302 8303 down_write(&block_group->space_info->groups_sem); 8304 list_del(&block_group->list); 8305 up_write(&block_group->space_info->groups_sem); 8306 8307 if (block_group->cached == BTRFS_CACHE_STARTED) 8308 wait_block_group_cache_done(block_group); 8309 8310 /* 8311 * We haven't cached this block group, which means we could 8312 * possibly have excluded extents on this block group. 
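 *
 * (Those excluded ranges come from exclude_super_stripes() and are
 * normally released once caching finishes, so for block groups that
 * never finished caching we release them here instead to avoid
 * leaking the excluded extent state at unmount.)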
8313 */ 8314 if (block_group->cached == BTRFS_CACHE_NO || 8315 block_group->cached == BTRFS_CACHE_ERROR) 8316 free_excluded_extents(info->extent_root, block_group); 8317 8318 btrfs_remove_free_space_cache(block_group); 8319 btrfs_put_block_group(block_group); 8320 8321 spin_lock(&info->block_group_cache_lock); 8322 } 8323 spin_unlock(&info->block_group_cache_lock); 8324 8325 /* now that all the block groups are freed, go through and 8326 * free all the space_info structs. This is only called during 8327 * the final stages of unmount, and so we know nobody is 8328 * using them. We call synchronize_rcu() once before we start, 8329 * just to be on the safe side. 8330 */ 8331 synchronize_rcu(); 8332 8333 release_global_block_rsv(info); 8334 8335 while (!list_empty(&info->space_info)) { 8336 space_info = list_entry(info->space_info.next, 8337 struct btrfs_space_info, 8338 list); 8339 if (btrfs_test_opt(info->tree_root, ENOSPC_DEBUG)) { 8340 if (WARN_ON(space_info->bytes_pinned > 0 || 8341 space_info->bytes_reserved > 0 || 8342 space_info->bytes_may_use > 0)) { 8343 dump_space_info(space_info, 0, 0); 8344 } 8345 } 8346 percpu_counter_destroy(&space_info->total_bytes_pinned); 8347 list_del(&space_info->list); 8348 kfree(space_info); 8349 } 8350 return 0; 8351 } 8352 8353 static void __link_block_group(struct btrfs_space_info *space_info, 8354 struct btrfs_block_group_cache *cache) 8355 { 8356 int index = get_block_group_index(cache); 8357 8358 down_write(&space_info->groups_sem); 8359 list_add_tail(&cache->list, &space_info->block_groups[index]); 8360 up_write(&space_info->groups_sem); 8361 } 8362 8363 int btrfs_read_block_groups(struct btrfs_root *root) 8364 { 8365 struct btrfs_path *path; 8366 int ret; 8367 struct btrfs_block_group_cache *cache; 8368 struct btrfs_fs_info *info = root->fs_info; 8369 struct btrfs_space_info *space_info; 8370 struct btrfs_key key; 8371 struct btrfs_key found_key; 8372 struct extent_buffer *leaf; 8373 int need_clear = 0; 8374 u64 cache_gen; 8375 8376 root = info->extent_root; 8377 key.objectid = 0; 8378 key.offset = 0; 8379 btrfs_set_key_type(&key, BTRFS_BLOCK_GROUP_ITEM_KEY); 8380 path = btrfs_alloc_path(); 8381 if (!path) 8382 return -ENOMEM; 8383 path->reada = 1; 8384 8385 cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy); 8386 if (btrfs_test_opt(root, SPACE_CACHE) && 8387 btrfs_super_generation(root->fs_info->super_copy) != cache_gen) 8388 need_clear = 1; 8389 if (btrfs_test_opt(root, CLEAR_CACHE)) 8390 need_clear = 1; 8391 8392 while (1) { 8393 ret = find_first_block_group(root, path, &key); 8394 if (ret > 0) 8395 break; 8396 if (ret != 0) 8397 goto error; 8398 leaf = path->nodes[0]; 8399 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 8400 cache = kzalloc(sizeof(*cache), GFP_NOFS); 8401 if (!cache) { 8402 ret = -ENOMEM; 8403 goto error; 8404 } 8405 cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl), 8406 GFP_NOFS); 8407 if (!cache->free_space_ctl) { 8408 kfree(cache); 8409 ret = -ENOMEM; 8410 goto error; 8411 } 8412 8413 atomic_set(&cache->count, 1); 8414 spin_lock_init(&cache->lock); 8415 cache->fs_info = info; 8416 INIT_LIST_HEAD(&cache->list); 8417 INIT_LIST_HEAD(&cache->cluster_list); 8418 8419 if (need_clear) { 8420 /* 8421 * When we mount with old space cache, we need to 8422 * set BTRFS_DC_CLEAR and set dirty flag. 8423 * 8424 * a) Setting 'BTRFS_DC_CLEAR' makes sure that we 8425 * truncate the old free space cache inode and 8426 * setup a new one. 
8427 * b) Setting 'dirty flag' makes sure that we flush 8428 * the new space cache info onto disk. 8429 */ 8430 cache->disk_cache_state = BTRFS_DC_CLEAR; 8431 if (btrfs_test_opt(root, SPACE_CACHE)) 8432 cache->dirty = 1; 8433 } 8434 8435 read_extent_buffer(leaf, &cache->item, 8436 btrfs_item_ptr_offset(leaf, path->slots[0]), 8437 sizeof(cache->item)); 8438 memcpy(&cache->key, &found_key, sizeof(found_key)); 8439 8440 key.objectid = found_key.objectid + found_key.offset; 8441 btrfs_release_path(path); 8442 cache->flags = btrfs_block_group_flags(&cache->item); 8443 cache->sectorsize = root->sectorsize; 8444 cache->full_stripe_len = btrfs_full_stripe_len(root, 8445 &root->fs_info->mapping_tree, 8446 found_key.objectid); 8447 btrfs_init_free_space_ctl(cache); 8448 8449 /* 8450 * We need to exclude the super stripes now so that the space 8451 * info has super bytes accounted for, otherwise we'll think 8452 * we have more space than we actually do. 8453 */ 8454 ret = exclude_super_stripes(root, cache); 8455 if (ret) { 8456 /* 8457 * We may have excluded something, so call this just in 8458 * case. 8459 */ 8460 free_excluded_extents(root, cache); 8461 kfree(cache->free_space_ctl); 8462 kfree(cache); 8463 goto error; 8464 } 8465 8466 /* 8467 * check for two cases, either we are full, and therefore 8468 * don't need to bother with the caching work since we won't 8469 * find any space, or we are empty, and we can just add all 8470 * the space in and be done with it. This saves us _alot_ of 8471 * time, particularly in the full case. 8472 */ 8473 if (found_key.offset == btrfs_block_group_used(&cache->item)) { 8474 cache->last_byte_to_unpin = (u64)-1; 8475 cache->cached = BTRFS_CACHE_FINISHED; 8476 free_excluded_extents(root, cache); 8477 } else if (btrfs_block_group_used(&cache->item) == 0) { 8478 cache->last_byte_to_unpin = (u64)-1; 8479 cache->cached = BTRFS_CACHE_FINISHED; 8480 add_new_free_space(cache, root->fs_info, 8481 found_key.objectid, 8482 found_key.objectid + 8483 found_key.offset); 8484 free_excluded_extents(root, cache); 8485 } 8486 8487 ret = btrfs_add_block_group_cache(root->fs_info, cache); 8488 if (ret) { 8489 btrfs_remove_free_space_cache(cache); 8490 btrfs_put_block_group(cache); 8491 goto error; 8492 } 8493 8494 ret = update_space_info(info, cache->flags, found_key.offset, 8495 btrfs_block_group_used(&cache->item), 8496 &space_info); 8497 if (ret) { 8498 btrfs_remove_free_space_cache(cache); 8499 spin_lock(&info->block_group_cache_lock); 8500 rb_erase(&cache->cache_node, 8501 &info->block_group_cache_tree); 8502 spin_unlock(&info->block_group_cache_lock); 8503 btrfs_put_block_group(cache); 8504 goto error; 8505 } 8506 8507 cache->space_info = space_info; 8508 spin_lock(&cache->space_info->lock); 8509 cache->space_info->bytes_readonly += cache->bytes_super; 8510 spin_unlock(&cache->space_info->lock); 8511 8512 __link_block_group(space_info, cache); 8513 8514 set_avail_alloc_bits(root->fs_info, cache->flags); 8515 if (btrfs_chunk_readonly(root, cache->key.objectid)) 8516 set_block_group_ro(cache, 1); 8517 } 8518 8519 list_for_each_entry_rcu(space_info, &root->fs_info->space_info, list) { 8520 if (!(get_alloc_profile(root, space_info->flags) & 8521 (BTRFS_BLOCK_GROUP_RAID10 | 8522 BTRFS_BLOCK_GROUP_RAID1 | 8523 BTRFS_BLOCK_GROUP_RAID5 | 8524 BTRFS_BLOCK_GROUP_RAID6 | 8525 BTRFS_BLOCK_GROUP_DUP))) 8526 continue; 8527 /* 8528 * avoid allocating from un-mirrored block group if there are 8529 * mirrored block groups. 
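 *
 * In practice this means: once the allocation profile for this
 * space_info includes any redundant profile
 * (RAID1/RAID10/RAID5/RAID6/DUP), any remaining RAID0 and SINGLE
 * block groups, e.g. leftovers from an earlier profile conversion,
 * are forced read-only below so that new allocations land in the
 * redundant block groups.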
8530 */ 8531 list_for_each_entry(cache, 8532 &space_info->block_groups[BTRFS_RAID_RAID0], 8533 list) 8534 set_block_group_ro(cache, 1); 8535 list_for_each_entry(cache, 8536 &space_info->block_groups[BTRFS_RAID_SINGLE], 8537 list) 8538 set_block_group_ro(cache, 1); 8539 } 8540 8541 init_global_block_rsv(info); 8542 ret = 0; 8543 error: 8544 btrfs_free_path(path); 8545 return ret; 8546 } 8547 8548 void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans, 8549 struct btrfs_root *root) 8550 { 8551 struct btrfs_block_group_cache *block_group, *tmp; 8552 struct btrfs_root *extent_root = root->fs_info->extent_root; 8553 struct btrfs_block_group_item item; 8554 struct btrfs_key key; 8555 int ret = 0; 8556 8557 list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, 8558 new_bg_list) { 8559 list_del_init(&block_group->new_bg_list); 8560 8561 if (ret) 8562 continue; 8563 8564 spin_lock(&block_group->lock); 8565 memcpy(&item, &block_group->item, sizeof(item)); 8566 memcpy(&key, &block_group->key, sizeof(key)); 8567 spin_unlock(&block_group->lock); 8568 8569 ret = btrfs_insert_item(trans, extent_root, &key, &item, 8570 sizeof(item)); 8571 if (ret) 8572 btrfs_abort_transaction(trans, extent_root, ret); 8573 ret = btrfs_finish_chunk_alloc(trans, extent_root, 8574 key.objectid, key.offset); 8575 if (ret) 8576 btrfs_abort_transaction(trans, extent_root, ret); 8577 } 8578 } 8579 8580 int btrfs_make_block_group(struct btrfs_trans_handle *trans, 8581 struct btrfs_root *root, u64 bytes_used, 8582 u64 type, u64 chunk_objectid, u64 chunk_offset, 8583 u64 size) 8584 { 8585 int ret; 8586 struct btrfs_root *extent_root; 8587 struct btrfs_block_group_cache *cache; 8588 8589 extent_root = root->fs_info->extent_root; 8590 8591 root->fs_info->last_trans_log_full_commit = trans->transid; 8592 8593 cache = kzalloc(sizeof(*cache), GFP_NOFS); 8594 if (!cache) 8595 return -ENOMEM; 8596 cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl), 8597 GFP_NOFS); 8598 if (!cache->free_space_ctl) { 8599 kfree(cache); 8600 return -ENOMEM; 8601 } 8602 8603 cache->key.objectid = chunk_offset; 8604 cache->key.offset = size; 8605 cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; 8606 cache->sectorsize = root->sectorsize; 8607 cache->fs_info = root->fs_info; 8608 cache->full_stripe_len = btrfs_full_stripe_len(root, 8609 &root->fs_info->mapping_tree, 8610 chunk_offset); 8611 8612 atomic_set(&cache->count, 1); 8613 spin_lock_init(&cache->lock); 8614 INIT_LIST_HEAD(&cache->list); 8615 INIT_LIST_HEAD(&cache->cluster_list); 8616 INIT_LIST_HEAD(&cache->new_bg_list); 8617 8618 btrfs_init_free_space_ctl(cache); 8619 8620 btrfs_set_block_group_used(&cache->item, bytes_used); 8621 btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid); 8622 cache->flags = type; 8623 btrfs_set_block_group_flags(&cache->item, type); 8624 8625 cache->last_byte_to_unpin = (u64)-1; 8626 cache->cached = BTRFS_CACHE_FINISHED; 8627 ret = exclude_super_stripes(root, cache); 8628 if (ret) { 8629 /* 8630 * We may have excluded something, so call this just in 8631 * case. 
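 *
 * (This mirrors the error handling in btrfs_read_block_groups()
 * above: exclude_super_stripes() may fail after some ranges were
 * already excluded, so those ranges have to be released before the
 * partially constructed cache is freed.)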
8632 */ 8633 free_excluded_extents(root, cache); 8634 kfree(cache->free_space_ctl); 8635 kfree(cache); 8636 return ret; 8637 } 8638 8639 add_new_free_space(cache, root->fs_info, chunk_offset, 8640 chunk_offset + size); 8641 8642 free_excluded_extents(root, cache); 8643 8644 ret = btrfs_add_block_group_cache(root->fs_info, cache); 8645 if (ret) { 8646 btrfs_remove_free_space_cache(cache); 8647 btrfs_put_block_group(cache); 8648 return ret; 8649 } 8650 8651 ret = update_space_info(root->fs_info, cache->flags, size, bytes_used, 8652 &cache->space_info); 8653 if (ret) { 8654 btrfs_remove_free_space_cache(cache); 8655 spin_lock(&root->fs_info->block_group_cache_lock); 8656 rb_erase(&cache->cache_node, 8657 &root->fs_info->block_group_cache_tree); 8658 spin_unlock(&root->fs_info->block_group_cache_lock); 8659 btrfs_put_block_group(cache); 8660 return ret; 8661 } 8662 update_global_block_rsv(root->fs_info); 8663 8664 spin_lock(&cache->space_info->lock); 8665 cache->space_info->bytes_readonly += cache->bytes_super; 8666 spin_unlock(&cache->space_info->lock); 8667 8668 __link_block_group(cache->space_info, cache); 8669 8670 list_add_tail(&cache->new_bg_list, &trans->new_bgs); 8671 8672 set_avail_alloc_bits(extent_root->fs_info, type); 8673 8674 return 0; 8675 } 8676 8677 static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) 8678 { 8679 u64 extra_flags = chunk_to_extended(flags) & 8680 BTRFS_EXTENDED_PROFILE_MASK; 8681 8682 write_seqlock(&fs_info->profiles_lock); 8683 if (flags & BTRFS_BLOCK_GROUP_DATA) 8684 fs_info->avail_data_alloc_bits &= ~extra_flags; 8685 if (flags & BTRFS_BLOCK_GROUP_METADATA) 8686 fs_info->avail_metadata_alloc_bits &= ~extra_flags; 8687 if (flags & BTRFS_BLOCK_GROUP_SYSTEM) 8688 fs_info->avail_system_alloc_bits &= ~extra_flags; 8689 write_sequnlock(&fs_info->profiles_lock); 8690 } 8691 8692 int btrfs_remove_block_group(struct btrfs_trans_handle *trans, 8693 struct btrfs_root *root, u64 group_start) 8694 { 8695 struct btrfs_path *path; 8696 struct btrfs_block_group_cache *block_group; 8697 struct btrfs_free_cluster *cluster; 8698 struct btrfs_root *tree_root = root->fs_info->tree_root; 8699 struct btrfs_key key; 8700 struct inode *inode; 8701 int ret; 8702 int index; 8703 int factor; 8704 8705 root = root->fs_info->extent_root; 8706 8707 block_group = btrfs_lookup_block_group(root->fs_info, group_start); 8708 BUG_ON(!block_group); 8709 BUG_ON(!block_group->ro); 8710 8711 /* 8712 * Free the reserved super bytes from this block group before 8713 * remove it. 
8714 */ 8715 free_excluded_extents(root, block_group); 8716 8717 memcpy(&key, &block_group->key, sizeof(key)); 8718 index = get_block_group_index(block_group); 8719 if (block_group->flags & (BTRFS_BLOCK_GROUP_DUP | 8720 BTRFS_BLOCK_GROUP_RAID1 | 8721 BTRFS_BLOCK_GROUP_RAID10)) 8722 factor = 2; 8723 else 8724 factor = 1; 8725 8726 /* make sure this block group isn't part of an allocation cluster */ 8727 cluster = &root->fs_info->data_alloc_cluster; 8728 spin_lock(&cluster->refill_lock); 8729 btrfs_return_cluster_to_free_space(block_group, cluster); 8730 spin_unlock(&cluster->refill_lock); 8731 8732 /* 8733 * make sure this block group isn't part of a metadata 8734 * allocation cluster 8735 */ 8736 cluster = &root->fs_info->meta_alloc_cluster; 8737 spin_lock(&cluster->refill_lock); 8738 btrfs_return_cluster_to_free_space(block_group, cluster); 8739 spin_unlock(&cluster->refill_lock); 8740 8741 path = btrfs_alloc_path(); 8742 if (!path) { 8743 ret = -ENOMEM; 8744 goto out; 8745 } 8746 8747 inode = lookup_free_space_inode(tree_root, block_group, path); 8748 if (!IS_ERR(inode)) { 8749 ret = btrfs_orphan_add(trans, inode); 8750 if (ret) { 8751 btrfs_add_delayed_iput(inode); 8752 goto out; 8753 } 8754 clear_nlink(inode); 8755 /* One for the block groups ref */ 8756 spin_lock(&block_group->lock); 8757 if (block_group->iref) { 8758 block_group->iref = 0; 8759 block_group->inode = NULL; 8760 spin_unlock(&block_group->lock); 8761 iput(inode); 8762 } else { 8763 spin_unlock(&block_group->lock); 8764 } 8765 /* One for our lookup ref */ 8766 btrfs_add_delayed_iput(inode); 8767 } 8768 8769 key.objectid = BTRFS_FREE_SPACE_OBJECTID; 8770 key.offset = block_group->key.objectid; 8771 key.type = 0; 8772 8773 ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1); 8774 if (ret < 0) 8775 goto out; 8776 if (ret > 0) 8777 btrfs_release_path(path); 8778 if (ret == 0) { 8779 ret = btrfs_del_item(trans, tree_root, path); 8780 if (ret) 8781 goto out; 8782 btrfs_release_path(path); 8783 } 8784 8785 spin_lock(&root->fs_info->block_group_cache_lock); 8786 rb_erase(&block_group->cache_node, 8787 &root->fs_info->block_group_cache_tree); 8788 8789 if (root->fs_info->first_logical_byte == block_group->key.objectid) 8790 root->fs_info->first_logical_byte = (u64)-1; 8791 spin_unlock(&root->fs_info->block_group_cache_lock); 8792 8793 down_write(&block_group->space_info->groups_sem); 8794 /* 8795 * we must use list_del_init so people can check to see if they 8796 * are still on the list after taking the semaphore 8797 */ 8798 list_del_init(&block_group->list); 8799 if (list_empty(&block_group->space_info->block_groups[index])) 8800 clear_avail_alloc_bits(root->fs_info, block_group->flags); 8801 up_write(&block_group->space_info->groups_sem); 8802 8803 if (block_group->cached == BTRFS_CACHE_STARTED) 8804 wait_block_group_cache_done(block_group); 8805 8806 btrfs_remove_free_space_cache(block_group); 8807 8808 spin_lock(&block_group->space_info->lock); 8809 block_group->space_info->total_bytes -= block_group->key.offset; 8810 block_group->space_info->bytes_readonly -= block_group->key.offset; 8811 block_group->space_info->disk_total -= block_group->key.offset * factor; 8812 spin_unlock(&block_group->space_info->lock); 8813 8814 memcpy(&key, &block_group->key, sizeof(key)); 8815 8816 btrfs_clear_space_info_full(root->fs_info); 8817 8818 btrfs_put_block_group(block_group); 8819 btrfs_put_block_group(block_group); 8820 8821 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 8822 if (ret > 0) 8823 ret = -EIO; 8824 if (ret 
< 0) 8825 goto out; 8826 8827 ret = btrfs_del_item(trans, root, path); 8828 out: 8829 btrfs_free_path(path); 8830 return ret; 8831 } 8832 8833 int btrfs_init_space_info(struct btrfs_fs_info *fs_info) 8834 { 8835 struct btrfs_space_info *space_info; 8836 struct btrfs_super_block *disk_super; 8837 u64 features; 8838 u64 flags; 8839 int mixed = 0; 8840 int ret; 8841 8842 disk_super = fs_info->super_copy; 8843 if (!btrfs_super_root(disk_super)) 8844 return 1; 8845 8846 features = btrfs_super_incompat_flags(disk_super); 8847 if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) 8848 mixed = 1; 8849 8850 flags = BTRFS_BLOCK_GROUP_SYSTEM; 8851 ret = update_space_info(fs_info, flags, 0, 0, &space_info); 8852 if (ret) 8853 goto out; 8854 8855 if (mixed) { 8856 flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA; 8857 ret = update_space_info(fs_info, flags, 0, 0, &space_info); 8858 } else { 8859 flags = BTRFS_BLOCK_GROUP_METADATA; 8860 ret = update_space_info(fs_info, flags, 0, 0, &space_info); 8861 if (ret) 8862 goto out; 8863 8864 flags = BTRFS_BLOCK_GROUP_DATA; 8865 ret = update_space_info(fs_info, flags, 0, 0, &space_info); 8866 } 8867 out: 8868 return ret; 8869 } 8870 8871 int btrfs_error_unpin_extent_range(struct btrfs_root *root, u64 start, u64 end) 8872 { 8873 return unpin_extent_range(root, start, end); 8874 } 8875 8876 int btrfs_error_discard_extent(struct btrfs_root *root, u64 bytenr, 8877 u64 num_bytes, u64 *actual_bytes) 8878 { 8879 return btrfs_discard_extent(root, bytenr, num_bytes, actual_bytes); 8880 } 8881 8882 int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range) 8883 { 8884 struct btrfs_fs_info *fs_info = root->fs_info; 8885 struct btrfs_block_group_cache *cache = NULL; 8886 u64 group_trimmed; 8887 u64 start; 8888 u64 end; 8889 u64 trimmed = 0; 8890 u64 total_bytes = btrfs_super_total_bytes(fs_info->super_copy); 8891 int ret = 0; 8892 8893 /* 8894 * try to trim all FS space, our block group may start from non-zero. 8895 */ 8896 if (range->len == total_bytes) 8897 cache = btrfs_lookup_first_block_group(fs_info, range->start); 8898 else 8899 cache = btrfs_lookup_block_group(fs_info, range->start); 8900 8901 while (cache) { 8902 if (cache->key.objectid >= (range->start + range->len)) { 8903 btrfs_put_block_group(cache); 8904 break; 8905 } 8906 8907 start = max(range->start, cache->key.objectid); 8908 end = min(range->start + range->len, 8909 cache->key.objectid + cache->key.offset); 8910 8911 if (end - start >= range->minlen) { 8912 if (!block_group_cache_done(cache)) { 8913 ret = cache_block_group(cache, 0); 8914 if (ret) { 8915 btrfs_put_block_group(cache); 8916 break; 8917 } 8918 ret = wait_block_group_cache_done(cache); 8919 if (ret) { 8920 btrfs_put_block_group(cache); 8921 break; 8922 } 8923 } 8924 ret = btrfs_trim_block_group(cache, 8925 &group_trimmed, 8926 start, 8927 end, 8928 range->minlen); 8929 8930 trimmed += group_trimmed; 8931 if (ret) { 8932 btrfs_put_block_group(cache); 8933 break; 8934 } 8935 } 8936 8937 cache = next_block_group(fs_info->tree_root, cache); 8938 } 8939 8940 range->len = trimmed; 8941 return ret; 8942 } 8943
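
/*
 * Illustrative sketch, not part of the original source: the per block
 * group clamping that btrfs_trim_fs() performs above reduces to the
 * helper below.  Clamping confines trimming to the intersection of the
 * user supplied range and each block group.  The demo_ helper is
 * hypothetical and only exists to show the arithmetic.
 */
static inline bool demo_trim_range_for_group(u64 range_start, u64 range_len,
					     u64 group_start, u64 group_len,
					     u64 minlen, u64 *start, u64 *end)
{
	/* clamp the requested range to the block group boundaries */
	*start = max(range_start, group_start);
	*end = min(range_start + range_len, group_start + group_len);

	/*
	 * only trim when the clamped range actually overlaps the group
	 * and the overlap is at least minlen bytes, mirroring the
	 * end - start >= range->minlen check in btrfs_trim_fs()
	 */
	return *end > *start && *end - *start >= minlen;
}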