/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */
#include <linux/sched.h>
#include <linux/pagemap.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/sort.h>
#include <linux/rcupdate.h>
#include <linux/kthread.h>
#include <linux/slab.h>
#include <linux/ratelimit.h>
#include <linux/percpu_counter.h>
#include "hash.h"
#include "tree-log.h"
#include "disk-io.h"
#include "print-tree.h"
#include "volumes.h"
#include "raid56.h"
#include "locking.h"
#include "free-space-cache.h"
#include "free-space-tree.h"
#include "math.h"
#include "sysfs.h"
#include "qgroup.h"

#undef SCRAMBLE_DELAYED_REFS

/*
 * Control flags for do_chunk_alloc's force field.
 *
 * CHUNK_ALLOC_NO_FORCE means to only allocate a chunk
 * if we really need one.
 *
 * CHUNK_ALLOC_LIMITED means to only try and allocate one
 * if we have very few chunks already allocated.  This is
 * used as part of the clustering code to help make sure
 * we have a good pool of storage to cluster in, without
 * filling the FS with empty chunks.
 *
 * CHUNK_ALLOC_FORCE means it must try to allocate one.
 */
enum {
	CHUNK_ALLOC_NO_FORCE = 0,
	CHUNK_ALLOC_LIMITED = 1,
	CHUNK_ALLOC_FORCE = 2,
};

/*
 * Control how reservations are dealt with.
 *
 * RESERVE_FREE - freeing a reservation.
67 * RESERVE_ALLOC - allocating space and we need to update bytes_may_use for 68 * ENOSPC accounting 69 * RESERVE_ALLOC_NO_ACCOUNT - allocating space and we should not update 70 * bytes_may_use as the ENOSPC accounting is done elsewhere 71 */ 72 enum { 73 RESERVE_FREE = 0, 74 RESERVE_ALLOC = 1, 75 RESERVE_ALLOC_NO_ACCOUNT = 2, 76 }; 77 78 static int update_block_group(struct btrfs_trans_handle *trans, 79 struct btrfs_root *root, u64 bytenr, 80 u64 num_bytes, int alloc); 81 static int __btrfs_free_extent(struct btrfs_trans_handle *trans, 82 struct btrfs_root *root, 83 struct btrfs_delayed_ref_node *node, u64 parent, 84 u64 root_objectid, u64 owner_objectid, 85 u64 owner_offset, int refs_to_drop, 86 struct btrfs_delayed_extent_op *extra_op); 87 static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op, 88 struct extent_buffer *leaf, 89 struct btrfs_extent_item *ei); 90 static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, 91 struct btrfs_root *root, 92 u64 parent, u64 root_objectid, 93 u64 flags, u64 owner, u64 offset, 94 struct btrfs_key *ins, int ref_mod); 95 static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, 96 struct btrfs_root *root, 97 u64 parent, u64 root_objectid, 98 u64 flags, struct btrfs_disk_key *key, 99 int level, struct btrfs_key *ins); 100 static int do_chunk_alloc(struct btrfs_trans_handle *trans, 101 struct btrfs_root *extent_root, u64 flags, 102 int force); 103 static int find_next_key(struct btrfs_path *path, int level, 104 struct btrfs_key *key); 105 static void dump_space_info(struct btrfs_space_info *info, u64 bytes, 106 int dump_block_groups); 107 static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, 108 u64 num_bytes, int reserve, 109 int delalloc); 110 static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv, 111 u64 num_bytes); 112 int btrfs_pin_extent(struct btrfs_root *root, 113 u64 bytenr, u64 num_bytes, int reserved); 114 static int __reserve_metadata_bytes(struct btrfs_root *root, 115 struct btrfs_space_info *space_info, 116 u64 orig_bytes, 117 enum btrfs_reserve_flush_enum flush); 118 static void space_info_add_new_bytes(struct btrfs_fs_info *fs_info, 119 struct btrfs_space_info *space_info, 120 u64 num_bytes); 121 static void space_info_add_old_bytes(struct btrfs_fs_info *fs_info, 122 struct btrfs_space_info *space_info, 123 u64 num_bytes); 124 125 static noinline int 126 block_group_cache_done(struct btrfs_block_group_cache *cache) 127 { 128 smp_mb(); 129 return cache->cached == BTRFS_CACHE_FINISHED || 130 cache->cached == BTRFS_CACHE_ERROR; 131 } 132 133 static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits) 134 { 135 return (cache->flags & bits) == bits; 136 } 137 138 void btrfs_get_block_group(struct btrfs_block_group_cache *cache) 139 { 140 atomic_inc(&cache->count); 141 } 142 143 void btrfs_put_block_group(struct btrfs_block_group_cache *cache) 144 { 145 if (atomic_dec_and_test(&cache->count)) { 146 WARN_ON(cache->pinned > 0); 147 WARN_ON(cache->reserved > 0); 148 kfree(cache->free_space_ctl); 149 kfree(cache); 150 } 151 } 152 153 /* 154 * this adds the block group to the fs_info rb tree for the block group 155 * cache 156 */ 157 static int btrfs_add_block_group_cache(struct btrfs_fs_info *info, 158 struct btrfs_block_group_cache *block_group) 159 { 160 struct rb_node **p; 161 struct rb_node *parent = NULL; 162 struct btrfs_block_group_cache *cache; 163 164 spin_lock(&info->block_group_cache_lock); 165 p = 
&info->block_group_cache_tree.rb_node; 166 167 while (*p) { 168 parent = *p; 169 cache = rb_entry(parent, struct btrfs_block_group_cache, 170 cache_node); 171 if (block_group->key.objectid < cache->key.objectid) { 172 p = &(*p)->rb_left; 173 } else if (block_group->key.objectid > cache->key.objectid) { 174 p = &(*p)->rb_right; 175 } else { 176 spin_unlock(&info->block_group_cache_lock); 177 return -EEXIST; 178 } 179 } 180 181 rb_link_node(&block_group->cache_node, parent, p); 182 rb_insert_color(&block_group->cache_node, 183 &info->block_group_cache_tree); 184 185 if (info->first_logical_byte > block_group->key.objectid) 186 info->first_logical_byte = block_group->key.objectid; 187 188 spin_unlock(&info->block_group_cache_lock); 189 190 return 0; 191 } 192 193 /* 194 * This will return the block group at or after bytenr if contains is 0, else 195 * it will return the block group that contains the bytenr 196 */ 197 static struct btrfs_block_group_cache * 198 block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr, 199 int contains) 200 { 201 struct btrfs_block_group_cache *cache, *ret = NULL; 202 struct rb_node *n; 203 u64 end, start; 204 205 spin_lock(&info->block_group_cache_lock); 206 n = info->block_group_cache_tree.rb_node; 207 208 while (n) { 209 cache = rb_entry(n, struct btrfs_block_group_cache, 210 cache_node); 211 end = cache->key.objectid + cache->key.offset - 1; 212 start = cache->key.objectid; 213 214 if (bytenr < start) { 215 if (!contains && (!ret || start < ret->key.objectid)) 216 ret = cache; 217 n = n->rb_left; 218 } else if (bytenr > start) { 219 if (contains && bytenr <= end) { 220 ret = cache; 221 break; 222 } 223 n = n->rb_right; 224 } else { 225 ret = cache; 226 break; 227 } 228 } 229 if (ret) { 230 btrfs_get_block_group(ret); 231 if (bytenr == 0 && info->first_logical_byte > ret->key.objectid) 232 info->first_logical_byte = ret->key.objectid; 233 } 234 spin_unlock(&info->block_group_cache_lock); 235 236 return ret; 237 } 238 239 static int add_excluded_extent(struct btrfs_root *root, 240 u64 start, u64 num_bytes) 241 { 242 u64 end = start + num_bytes - 1; 243 set_extent_bits(&root->fs_info->freed_extents[0], 244 start, end, EXTENT_UPTODATE); 245 set_extent_bits(&root->fs_info->freed_extents[1], 246 start, end, EXTENT_UPTODATE); 247 return 0; 248 } 249 250 static void free_excluded_extents(struct btrfs_root *root, 251 struct btrfs_block_group_cache *cache) 252 { 253 u64 start, end; 254 255 start = cache->key.objectid; 256 end = start + cache->key.offset - 1; 257 258 clear_extent_bits(&root->fs_info->freed_extents[0], 259 start, end, EXTENT_UPTODATE); 260 clear_extent_bits(&root->fs_info->freed_extents[1], 261 start, end, EXTENT_UPTODATE); 262 } 263 264 static int exclude_super_stripes(struct btrfs_root *root, 265 struct btrfs_block_group_cache *cache) 266 { 267 u64 bytenr; 268 u64 *logical; 269 int stripe_len; 270 int i, nr, ret; 271 272 if (cache->key.objectid < BTRFS_SUPER_INFO_OFFSET) { 273 stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->key.objectid; 274 cache->bytes_super += stripe_len; 275 ret = add_excluded_extent(root, cache->key.objectid, 276 stripe_len); 277 if (ret) 278 return ret; 279 } 280 281 for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { 282 bytenr = btrfs_sb_offset(i); 283 ret = btrfs_rmap_block(&root->fs_info->mapping_tree, 284 cache->key.objectid, bytenr, 285 0, &logical, &nr, &stripe_len); 286 if (ret) 287 return ret; 288 289 while (nr--) { 290 u64 start, len; 291 292 if (logical[nr] > cache->key.objectid + 293 cache->key.offset) 294 
continue; 295 296 if (logical[nr] + stripe_len <= cache->key.objectid) 297 continue; 298 299 start = logical[nr]; 300 if (start < cache->key.objectid) { 301 start = cache->key.objectid; 302 len = (logical[nr] + stripe_len) - start; 303 } else { 304 len = min_t(u64, stripe_len, 305 cache->key.objectid + 306 cache->key.offset - start); 307 } 308 309 cache->bytes_super += len; 310 ret = add_excluded_extent(root, start, len); 311 if (ret) { 312 kfree(logical); 313 return ret; 314 } 315 } 316 317 kfree(logical); 318 } 319 return 0; 320 } 321 322 static struct btrfs_caching_control * 323 get_caching_control(struct btrfs_block_group_cache *cache) 324 { 325 struct btrfs_caching_control *ctl; 326 327 spin_lock(&cache->lock); 328 if (!cache->caching_ctl) { 329 spin_unlock(&cache->lock); 330 return NULL; 331 } 332 333 ctl = cache->caching_ctl; 334 atomic_inc(&ctl->count); 335 spin_unlock(&cache->lock); 336 return ctl; 337 } 338 339 static void put_caching_control(struct btrfs_caching_control *ctl) 340 { 341 if (atomic_dec_and_test(&ctl->count)) 342 kfree(ctl); 343 } 344 345 #ifdef CONFIG_BTRFS_DEBUG 346 static void fragment_free_space(struct btrfs_root *root, 347 struct btrfs_block_group_cache *block_group) 348 { 349 u64 start = block_group->key.objectid; 350 u64 len = block_group->key.offset; 351 u64 chunk = block_group->flags & BTRFS_BLOCK_GROUP_METADATA ? 352 root->nodesize : root->sectorsize; 353 u64 step = chunk << 1; 354 355 while (len > chunk) { 356 btrfs_remove_free_space(block_group, start, chunk); 357 start += step; 358 if (len < step) 359 len = 0; 360 else 361 len -= step; 362 } 363 } 364 #endif 365 366 /* 367 * this is only called by cache_block_group, since we could have freed extents 368 * we need to check the pinned_extents for any extents that can't be used yet 369 * since their free space will be released as soon as the transaction commits. 
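 *
 * For example: if the range [start, end) being added still contains a
 * pinned extent [extent_start, extent_end], only the gap in front of the
 * pinned extent is handed to btrfs_add_free_space(), and the scan resumes
 * at extent_end + 1 (see the loop below).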
370 */ 371 u64 add_new_free_space(struct btrfs_block_group_cache *block_group, 372 struct btrfs_fs_info *info, u64 start, u64 end) 373 { 374 u64 extent_start, extent_end, size, total_added = 0; 375 int ret; 376 377 while (start < end) { 378 ret = find_first_extent_bit(info->pinned_extents, start, 379 &extent_start, &extent_end, 380 EXTENT_DIRTY | EXTENT_UPTODATE, 381 NULL); 382 if (ret) 383 break; 384 385 if (extent_start <= start) { 386 start = extent_end + 1; 387 } else if (extent_start > start && extent_start < end) { 388 size = extent_start - start; 389 total_added += size; 390 ret = btrfs_add_free_space(block_group, start, 391 size); 392 BUG_ON(ret); /* -ENOMEM or logic error */ 393 start = extent_end + 1; 394 } else { 395 break; 396 } 397 } 398 399 if (start < end) { 400 size = end - start; 401 total_added += size; 402 ret = btrfs_add_free_space(block_group, start, size); 403 BUG_ON(ret); /* -ENOMEM or logic error */ 404 } 405 406 return total_added; 407 } 408 409 static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl) 410 { 411 struct btrfs_block_group_cache *block_group; 412 struct btrfs_fs_info *fs_info; 413 struct btrfs_root *extent_root; 414 struct btrfs_path *path; 415 struct extent_buffer *leaf; 416 struct btrfs_key key; 417 u64 total_found = 0; 418 u64 last = 0; 419 u32 nritems; 420 int ret; 421 bool wakeup = true; 422 423 block_group = caching_ctl->block_group; 424 fs_info = block_group->fs_info; 425 extent_root = fs_info->extent_root; 426 427 path = btrfs_alloc_path(); 428 if (!path) 429 return -ENOMEM; 430 431 last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET); 432 433 #ifdef CONFIG_BTRFS_DEBUG 434 /* 435 * If we're fragmenting we don't want to make anybody think we can 436 * allocate from this block group until we've had a chance to fragment 437 * the free space. 438 */ 439 if (btrfs_should_fragment_free_space(extent_root, block_group)) 440 wakeup = false; 441 #endif 442 /* 443 * We don't want to deadlock with somebody trying to allocate a new 444 * extent for the extent root while also trying to search the extent 445 * root to add free space. 
So we skip locking and search the commit 446 * root, since its read-only 447 */ 448 path->skip_locking = 1; 449 path->search_commit_root = 1; 450 path->reada = READA_FORWARD; 451 452 key.objectid = last; 453 key.offset = 0; 454 key.type = BTRFS_EXTENT_ITEM_KEY; 455 456 next: 457 ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); 458 if (ret < 0) 459 goto out; 460 461 leaf = path->nodes[0]; 462 nritems = btrfs_header_nritems(leaf); 463 464 while (1) { 465 if (btrfs_fs_closing(fs_info) > 1) { 466 last = (u64)-1; 467 break; 468 } 469 470 if (path->slots[0] < nritems) { 471 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 472 } else { 473 ret = find_next_key(path, 0, &key); 474 if (ret) 475 break; 476 477 if (need_resched() || 478 rwsem_is_contended(&fs_info->commit_root_sem)) { 479 if (wakeup) 480 caching_ctl->progress = last; 481 btrfs_release_path(path); 482 up_read(&fs_info->commit_root_sem); 483 mutex_unlock(&caching_ctl->mutex); 484 cond_resched(); 485 mutex_lock(&caching_ctl->mutex); 486 down_read(&fs_info->commit_root_sem); 487 goto next; 488 } 489 490 ret = btrfs_next_leaf(extent_root, path); 491 if (ret < 0) 492 goto out; 493 if (ret) 494 break; 495 leaf = path->nodes[0]; 496 nritems = btrfs_header_nritems(leaf); 497 continue; 498 } 499 500 if (key.objectid < last) { 501 key.objectid = last; 502 key.offset = 0; 503 key.type = BTRFS_EXTENT_ITEM_KEY; 504 505 if (wakeup) 506 caching_ctl->progress = last; 507 btrfs_release_path(path); 508 goto next; 509 } 510 511 if (key.objectid < block_group->key.objectid) { 512 path->slots[0]++; 513 continue; 514 } 515 516 if (key.objectid >= block_group->key.objectid + 517 block_group->key.offset) 518 break; 519 520 if (key.type == BTRFS_EXTENT_ITEM_KEY || 521 key.type == BTRFS_METADATA_ITEM_KEY) { 522 total_found += add_new_free_space(block_group, 523 fs_info, last, 524 key.objectid); 525 if (key.type == BTRFS_METADATA_ITEM_KEY) 526 last = key.objectid + 527 fs_info->tree_root->nodesize; 528 else 529 last = key.objectid + key.offset; 530 531 if (total_found > CACHING_CTL_WAKE_UP) { 532 total_found = 0; 533 if (wakeup) 534 wake_up(&caching_ctl->wait); 535 } 536 } 537 path->slots[0]++; 538 } 539 ret = 0; 540 541 total_found += add_new_free_space(block_group, fs_info, last, 542 block_group->key.objectid + 543 block_group->key.offset); 544 caching_ctl->progress = (u64)-1; 545 546 out: 547 btrfs_free_path(path); 548 return ret; 549 } 550 551 static noinline void caching_thread(struct btrfs_work *work) 552 { 553 struct btrfs_block_group_cache *block_group; 554 struct btrfs_fs_info *fs_info; 555 struct btrfs_caching_control *caching_ctl; 556 struct btrfs_root *extent_root; 557 int ret; 558 559 caching_ctl = container_of(work, struct btrfs_caching_control, work); 560 block_group = caching_ctl->block_group; 561 fs_info = block_group->fs_info; 562 extent_root = fs_info->extent_root; 563 564 mutex_lock(&caching_ctl->mutex); 565 down_read(&fs_info->commit_root_sem); 566 567 if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) 568 ret = load_free_space_tree(caching_ctl); 569 else 570 ret = load_extent_tree_free(caching_ctl); 571 572 spin_lock(&block_group->lock); 573 block_group->caching_ctl = NULL; 574 block_group->cached = ret ? 
BTRFS_CACHE_ERROR : BTRFS_CACHE_FINISHED; 575 spin_unlock(&block_group->lock); 576 577 #ifdef CONFIG_BTRFS_DEBUG 578 if (btrfs_should_fragment_free_space(extent_root, block_group)) { 579 u64 bytes_used; 580 581 spin_lock(&block_group->space_info->lock); 582 spin_lock(&block_group->lock); 583 bytes_used = block_group->key.offset - 584 btrfs_block_group_used(&block_group->item); 585 block_group->space_info->bytes_used += bytes_used >> 1; 586 spin_unlock(&block_group->lock); 587 spin_unlock(&block_group->space_info->lock); 588 fragment_free_space(extent_root, block_group); 589 } 590 #endif 591 592 caching_ctl->progress = (u64)-1; 593 594 up_read(&fs_info->commit_root_sem); 595 free_excluded_extents(fs_info->extent_root, block_group); 596 mutex_unlock(&caching_ctl->mutex); 597 598 wake_up(&caching_ctl->wait); 599 600 put_caching_control(caching_ctl); 601 btrfs_put_block_group(block_group); 602 } 603 604 static int cache_block_group(struct btrfs_block_group_cache *cache, 605 int load_cache_only) 606 { 607 DEFINE_WAIT(wait); 608 struct btrfs_fs_info *fs_info = cache->fs_info; 609 struct btrfs_caching_control *caching_ctl; 610 int ret = 0; 611 612 caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS); 613 if (!caching_ctl) 614 return -ENOMEM; 615 616 INIT_LIST_HEAD(&caching_ctl->list); 617 mutex_init(&caching_ctl->mutex); 618 init_waitqueue_head(&caching_ctl->wait); 619 caching_ctl->block_group = cache; 620 caching_ctl->progress = cache->key.objectid; 621 atomic_set(&caching_ctl->count, 1); 622 btrfs_init_work(&caching_ctl->work, btrfs_cache_helper, 623 caching_thread, NULL, NULL); 624 625 spin_lock(&cache->lock); 626 /* 627 * This should be a rare occasion, but this could happen I think in the 628 * case where one thread starts to load the space cache info, and then 629 * some other thread starts a transaction commit which tries to do an 630 * allocation while the other thread is still loading the space cache 631 * info. The previous loop should have kept us from choosing this block 632 * group, but if we've moved to the state where we will wait on caching 633 * block groups we need to first check if we're doing a fast load here, 634 * so we can wait for it to finish, otherwise we could end up allocating 635 * from a block group who's cache gets evicted for one reason or 636 * another. 
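 *
 * Concretely (a sketch of the scenario described above): thread A is
 * still doing the fast load of the on-disk space cache, so cached is
 * BTRFS_CACHE_FAST, when thread B needs to allocate from this block group
 * as part of a transaction commit.  B must wait on caching_ctl->wait in
 * the loop below until A finishes, rather than treating the group as
 * uncached.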
637 */ 638 while (cache->cached == BTRFS_CACHE_FAST) { 639 struct btrfs_caching_control *ctl; 640 641 ctl = cache->caching_ctl; 642 atomic_inc(&ctl->count); 643 prepare_to_wait(&ctl->wait, &wait, TASK_UNINTERRUPTIBLE); 644 spin_unlock(&cache->lock); 645 646 schedule(); 647 648 finish_wait(&ctl->wait, &wait); 649 put_caching_control(ctl); 650 spin_lock(&cache->lock); 651 } 652 653 if (cache->cached != BTRFS_CACHE_NO) { 654 spin_unlock(&cache->lock); 655 kfree(caching_ctl); 656 return 0; 657 } 658 WARN_ON(cache->caching_ctl); 659 cache->caching_ctl = caching_ctl; 660 cache->cached = BTRFS_CACHE_FAST; 661 spin_unlock(&cache->lock); 662 663 if (fs_info->mount_opt & BTRFS_MOUNT_SPACE_CACHE) { 664 mutex_lock(&caching_ctl->mutex); 665 ret = load_free_space_cache(fs_info, cache); 666 667 spin_lock(&cache->lock); 668 if (ret == 1) { 669 cache->caching_ctl = NULL; 670 cache->cached = BTRFS_CACHE_FINISHED; 671 cache->last_byte_to_unpin = (u64)-1; 672 caching_ctl->progress = (u64)-1; 673 } else { 674 if (load_cache_only) { 675 cache->caching_ctl = NULL; 676 cache->cached = BTRFS_CACHE_NO; 677 } else { 678 cache->cached = BTRFS_CACHE_STARTED; 679 cache->has_caching_ctl = 1; 680 } 681 } 682 spin_unlock(&cache->lock); 683 #ifdef CONFIG_BTRFS_DEBUG 684 if (ret == 1 && 685 btrfs_should_fragment_free_space(fs_info->extent_root, 686 cache)) { 687 u64 bytes_used; 688 689 spin_lock(&cache->space_info->lock); 690 spin_lock(&cache->lock); 691 bytes_used = cache->key.offset - 692 btrfs_block_group_used(&cache->item); 693 cache->space_info->bytes_used += bytes_used >> 1; 694 spin_unlock(&cache->lock); 695 spin_unlock(&cache->space_info->lock); 696 fragment_free_space(fs_info->extent_root, cache); 697 } 698 #endif 699 mutex_unlock(&caching_ctl->mutex); 700 701 wake_up(&caching_ctl->wait); 702 if (ret == 1) { 703 put_caching_control(caching_ctl); 704 free_excluded_extents(fs_info->extent_root, cache); 705 return 0; 706 } 707 } else { 708 /* 709 * We're either using the free space tree or no caching at all. 710 * Set cached to the appropriate value and wakeup any waiters. 
711 */ 712 spin_lock(&cache->lock); 713 if (load_cache_only) { 714 cache->caching_ctl = NULL; 715 cache->cached = BTRFS_CACHE_NO; 716 } else { 717 cache->cached = BTRFS_CACHE_STARTED; 718 cache->has_caching_ctl = 1; 719 } 720 spin_unlock(&cache->lock); 721 wake_up(&caching_ctl->wait); 722 } 723 724 if (load_cache_only) { 725 put_caching_control(caching_ctl); 726 return 0; 727 } 728 729 down_write(&fs_info->commit_root_sem); 730 atomic_inc(&caching_ctl->count); 731 list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups); 732 up_write(&fs_info->commit_root_sem); 733 734 btrfs_get_block_group(cache); 735 736 btrfs_queue_work(fs_info->caching_workers, &caching_ctl->work); 737 738 return ret; 739 } 740 741 /* 742 * return the block group that starts at or after bytenr 743 */ 744 static struct btrfs_block_group_cache * 745 btrfs_lookup_first_block_group(struct btrfs_fs_info *info, u64 bytenr) 746 { 747 struct btrfs_block_group_cache *cache; 748 749 cache = block_group_cache_tree_search(info, bytenr, 0); 750 751 return cache; 752 } 753 754 /* 755 * return the block group that contains the given bytenr 756 */ 757 struct btrfs_block_group_cache *btrfs_lookup_block_group( 758 struct btrfs_fs_info *info, 759 u64 bytenr) 760 { 761 struct btrfs_block_group_cache *cache; 762 763 cache = block_group_cache_tree_search(info, bytenr, 1); 764 765 return cache; 766 } 767 768 static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info, 769 u64 flags) 770 { 771 struct list_head *head = &info->space_info; 772 struct btrfs_space_info *found; 773 774 flags &= BTRFS_BLOCK_GROUP_TYPE_MASK; 775 776 rcu_read_lock(); 777 list_for_each_entry_rcu(found, head, list) { 778 if (found->flags & flags) { 779 rcu_read_unlock(); 780 return found; 781 } 782 } 783 rcu_read_unlock(); 784 return NULL; 785 } 786 787 /* 788 * after adding space to the filesystem, we need to clear the full flags 789 * on all the space infos. 790 */ 791 void btrfs_clear_space_info_full(struct btrfs_fs_info *info) 792 { 793 struct list_head *head = &info->space_info; 794 struct btrfs_space_info *found; 795 796 rcu_read_lock(); 797 list_for_each_entry_rcu(found, head, list) 798 found->full = 0; 799 rcu_read_unlock(); 800 } 801 802 /* simple helper to search for an existing data extent at a given offset */ 803 int btrfs_lookup_data_extent(struct btrfs_root *root, u64 start, u64 len) 804 { 805 int ret; 806 struct btrfs_key key; 807 struct btrfs_path *path; 808 809 path = btrfs_alloc_path(); 810 if (!path) 811 return -ENOMEM; 812 813 key.objectid = start; 814 key.offset = len; 815 key.type = BTRFS_EXTENT_ITEM_KEY; 816 ret = btrfs_search_slot(NULL, root->fs_info->extent_root, &key, path, 817 0, 0); 818 btrfs_free_path(path); 819 return ret; 820 } 821 822 /* 823 * helper function to lookup reference count and flags of a tree block. 824 * 825 * the head node for delayed ref is used to store the sum of all the 826 * reference count modifications queued up in the rbtree. the head 827 * node may also store the extent flags to set. This way you can check 828 * to see what the reference count and extent flags would be if all of 829 * the delayed refs are not processed. 
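 *
 * For example (made-up numbers): if the committed extent item records two
 * references and the delayed ref head for this bytenr carries a pending
 * ref_mod of +1, the count reported here is three, even though the extent
 * tree on disk has not been updated yet.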
830 */ 831 int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, 832 struct btrfs_root *root, u64 bytenr, 833 u64 offset, int metadata, u64 *refs, u64 *flags) 834 { 835 struct btrfs_delayed_ref_head *head; 836 struct btrfs_delayed_ref_root *delayed_refs; 837 struct btrfs_path *path; 838 struct btrfs_extent_item *ei; 839 struct extent_buffer *leaf; 840 struct btrfs_key key; 841 u32 item_size; 842 u64 num_refs; 843 u64 extent_flags; 844 int ret; 845 846 /* 847 * If we don't have skinny metadata, don't bother doing anything 848 * different 849 */ 850 if (metadata && !btrfs_fs_incompat(root->fs_info, SKINNY_METADATA)) { 851 offset = root->nodesize; 852 metadata = 0; 853 } 854 855 path = btrfs_alloc_path(); 856 if (!path) 857 return -ENOMEM; 858 859 if (!trans) { 860 path->skip_locking = 1; 861 path->search_commit_root = 1; 862 } 863 864 search_again: 865 key.objectid = bytenr; 866 key.offset = offset; 867 if (metadata) 868 key.type = BTRFS_METADATA_ITEM_KEY; 869 else 870 key.type = BTRFS_EXTENT_ITEM_KEY; 871 872 ret = btrfs_search_slot(trans, root->fs_info->extent_root, 873 &key, path, 0, 0); 874 if (ret < 0) 875 goto out_free; 876 877 if (ret > 0 && metadata && key.type == BTRFS_METADATA_ITEM_KEY) { 878 if (path->slots[0]) { 879 path->slots[0]--; 880 btrfs_item_key_to_cpu(path->nodes[0], &key, 881 path->slots[0]); 882 if (key.objectid == bytenr && 883 key.type == BTRFS_EXTENT_ITEM_KEY && 884 key.offset == root->nodesize) 885 ret = 0; 886 } 887 } 888 889 if (ret == 0) { 890 leaf = path->nodes[0]; 891 item_size = btrfs_item_size_nr(leaf, path->slots[0]); 892 if (item_size >= sizeof(*ei)) { 893 ei = btrfs_item_ptr(leaf, path->slots[0], 894 struct btrfs_extent_item); 895 num_refs = btrfs_extent_refs(leaf, ei); 896 extent_flags = btrfs_extent_flags(leaf, ei); 897 } else { 898 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 899 struct btrfs_extent_item_v0 *ei0; 900 BUG_ON(item_size != sizeof(*ei0)); 901 ei0 = btrfs_item_ptr(leaf, path->slots[0], 902 struct btrfs_extent_item_v0); 903 num_refs = btrfs_extent_refs_v0(leaf, ei0); 904 /* FIXME: this isn't correct for data */ 905 extent_flags = BTRFS_BLOCK_FLAG_FULL_BACKREF; 906 #else 907 BUG(); 908 #endif 909 } 910 BUG_ON(num_refs == 0); 911 } else { 912 num_refs = 0; 913 extent_flags = 0; 914 ret = 0; 915 } 916 917 if (!trans) 918 goto out; 919 920 delayed_refs = &trans->transaction->delayed_refs; 921 spin_lock(&delayed_refs->lock); 922 head = btrfs_find_delayed_ref_head(trans, bytenr); 923 if (head) { 924 if (!mutex_trylock(&head->mutex)) { 925 atomic_inc(&head->node.refs); 926 spin_unlock(&delayed_refs->lock); 927 928 btrfs_release_path(path); 929 930 /* 931 * Mutex was contended, block until it's released and try 932 * again 933 */ 934 mutex_lock(&head->mutex); 935 mutex_unlock(&head->mutex); 936 btrfs_put_delayed_ref(&head->node); 937 goto search_again; 938 } 939 spin_lock(&head->lock); 940 if (head->extent_op && head->extent_op->update_flags) 941 extent_flags |= head->extent_op->flags_to_set; 942 else 943 BUG_ON(num_refs == 0); 944 945 num_refs += head->node.ref_mod; 946 spin_unlock(&head->lock); 947 mutex_unlock(&head->mutex); 948 } 949 spin_unlock(&delayed_refs->lock); 950 out: 951 WARN_ON(num_refs == 0); 952 if (refs) 953 *refs = num_refs; 954 if (flags) 955 *flags = extent_flags; 956 out_free: 957 btrfs_free_path(path); 958 return ret; 959 } 960 961 /* 962 * Back reference rules. 
 * Back refs have three main goals:
 *
 * 1) differentiate between all holders of references to an extent so that
 *    when a reference is dropped we can make sure it was a valid reference
 *    before freeing the extent.
 *
 * 2) Provide enough information to quickly find the holders of an extent
 *    if we notice a given block is corrupted or bad.
 *
 * 3) Make it easy to migrate blocks for FS shrinking or storage pool
 *    maintenance.  This is actually the same as #2, but with a slightly
 *    different use case.
 *
 * There are two kinds of back refs.  Implicit back refs are optimized for
 * pointers in non-shared tree blocks.  For a given pointer in a block,
 * back refs of this kind record the block's owner tree and the pointer's
 * key, which is enough information to find the block again by b-tree
 * search.  Full back refs are for pointers in tree blocks that are not
 * referenced by their owner trees; here the location of the tree block is
 * recorded in the back ref itself.  Full back refs are generic and could
 * be used everywhere implicit back refs are used, but their major
 * shortcoming is overhead: every time a tree block gets COWed, the back
 * ref entries for all pointers in it have to be updated.
 *
 * For a newly allocated tree block, we use implicit back refs for the
 * pointers in it, so most tree-related operations only involve implicit
 * back refs.  For a tree block created in an old transaction, the only
 * way to drop a reference to it is to COW it, so we can detect the moment
 * a tree block loses its owner tree's reference and do the conversion to
 * full back refs there.
 *
 * When a tree block is COWed through a tree, there are four cases:
 *
 * 1) The reference count of the block is one and the tree is the block's
 *    owner tree.  Nothing to do in this case.
 *
 * 2) The reference count of the block is one and the tree is not the
 *    block's owner tree.  In this case full back refs are used for the
 *    pointers in the block.  Remove these full back refs and add implicit
 *    back refs for every pointer in the new block.
 *
 * 3) The reference count of the block is greater than one and the tree is
 *    the block's owner tree.  In this case implicit back refs are used
 *    for the pointers in the block.  Add full back refs for every pointer
 *    in the block and increase the lower level extents' reference counts.
 *    The original implicit back refs are carried over to the new block.
 *
 * 4) The reference count of the block is greater than one and the tree is
 *    not the block's owner tree.  Add implicit back refs for every
 *    pointer in the new block and increase the lower level extents'
 *    reference counts.
 *
 * Back reference key composition:
 *
 * The key objectid corresponds to the first byte in the extent.
 * The key type is used to differentiate between types of back refs.
 * The meaning of the key offset depends on the type of back ref.
 *
 * File extents can be referenced by:
 *
 * - multiple snapshots, subvolumes, or different generations in one subvol
 * - different files inside a single subvolume
 * - different offsets inside a file (bookend extents in file.c)
 *
 * The extent ref structure for the implicit back refs has fields for:
 *
 * - objectid of the subvolume root
 * - objectid of the file holding the reference
 * - original offset in the file
 * - how many bookend extents
 *
 * The key offset for the implicit back refs is the hash of the first
 * three fields.
 *
 * The extent ref structure for the full back refs has a field for:
 *
 * - number of pointers in the tree leaf
 *
 * The key offset for the full back refs is the first byte of
 * the tree leaf.
 *
 * When a file extent is allocated, implicit back refs are used and the
 * fields are filled in as:
 *
 *     (root_key.objectid, inode objectid, offset in file, 1)
 *
 * When a file extent is removed by file truncation, we find the
 * corresponding implicit back ref and check the following fields:
 *
 *     (btrfs_header_owner(leaf), inode objectid, offset in file)
 *
 * Btree extents can be referenced by:
 *
 * - different subvolumes
 *
 * Both the implicit back refs and the full back refs for tree blocks
 * consist only of a key.  The key offset for the implicit back refs is
 * the objectid of the block's owner tree.  The key offset for the full
 * back refs is the first byte of the parent block.
 *
 * When implicit back refs are used, information about the lowest key and
 * the level of the tree block is required.  This information is stored in
 * the tree block info structure.
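 *
 * As a purely illustrative sketch (the bytenr and inode numbers below are
 * made up, not taken from any real filesystem): a 4K data extent at
 * bytenr 13631488, referenced once by inode 257 at file offset 0 in
 * subvolume 5, is described by an extent item keyed
 * (13631488, BTRFS_EXTENT_ITEM_KEY, 4096) plus an implicit back ref whose
 * key is composed the same way insert_extent_data_ref() composes it:
 *
 *	struct btrfs_key key = {
 *		.objectid = 13631488,
 *		.type     = BTRFS_EXTENT_DATA_REF_KEY,
 *		.offset   = hash_extent_data_ref(5, 257, 0),
 *	};
 *
 * If the referencing leaf is not referenced by its owner tree (the shared
 * case), the full back ref form is used instead and the key becomes
 * (13631488, BTRFS_SHARED_DATA_REF_KEY, <bytenr of that leaf>).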
1065 */ 1066 1067 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 1068 static int convert_extent_item_v0(struct btrfs_trans_handle *trans, 1069 struct btrfs_root *root, 1070 struct btrfs_path *path, 1071 u64 owner, u32 extra_size) 1072 { 1073 struct btrfs_extent_item *item; 1074 struct btrfs_extent_item_v0 *ei0; 1075 struct btrfs_extent_ref_v0 *ref0; 1076 struct btrfs_tree_block_info *bi; 1077 struct extent_buffer *leaf; 1078 struct btrfs_key key; 1079 struct btrfs_key found_key; 1080 u32 new_size = sizeof(*item); 1081 u64 refs; 1082 int ret; 1083 1084 leaf = path->nodes[0]; 1085 BUG_ON(btrfs_item_size_nr(leaf, path->slots[0]) != sizeof(*ei0)); 1086 1087 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 1088 ei0 = btrfs_item_ptr(leaf, path->slots[0], 1089 struct btrfs_extent_item_v0); 1090 refs = btrfs_extent_refs_v0(leaf, ei0); 1091 1092 if (owner == (u64)-1) { 1093 while (1) { 1094 if (path->slots[0] >= btrfs_header_nritems(leaf)) { 1095 ret = btrfs_next_leaf(root, path); 1096 if (ret < 0) 1097 return ret; 1098 BUG_ON(ret > 0); /* Corruption */ 1099 leaf = path->nodes[0]; 1100 } 1101 btrfs_item_key_to_cpu(leaf, &found_key, 1102 path->slots[0]); 1103 BUG_ON(key.objectid != found_key.objectid); 1104 if (found_key.type != BTRFS_EXTENT_REF_V0_KEY) { 1105 path->slots[0]++; 1106 continue; 1107 } 1108 ref0 = btrfs_item_ptr(leaf, path->slots[0], 1109 struct btrfs_extent_ref_v0); 1110 owner = btrfs_ref_objectid_v0(leaf, ref0); 1111 break; 1112 } 1113 } 1114 btrfs_release_path(path); 1115 1116 if (owner < BTRFS_FIRST_FREE_OBJECTID) 1117 new_size += sizeof(*bi); 1118 1119 new_size -= sizeof(*ei0); 1120 ret = btrfs_search_slot(trans, root, &key, path, 1121 new_size + extra_size, 1); 1122 if (ret < 0) 1123 return ret; 1124 BUG_ON(ret); /* Corruption */ 1125 1126 btrfs_extend_item(root, path, new_size); 1127 1128 leaf = path->nodes[0]; 1129 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 1130 btrfs_set_extent_refs(leaf, item, refs); 1131 /* FIXME: get real generation */ 1132 btrfs_set_extent_generation(leaf, item, 0); 1133 if (owner < BTRFS_FIRST_FREE_OBJECTID) { 1134 btrfs_set_extent_flags(leaf, item, 1135 BTRFS_EXTENT_FLAG_TREE_BLOCK | 1136 BTRFS_BLOCK_FLAG_FULL_BACKREF); 1137 bi = (struct btrfs_tree_block_info *)(item + 1); 1138 /* FIXME: get first key of the block */ 1139 memset_extent_buffer(leaf, 0, (unsigned long)bi, sizeof(*bi)); 1140 btrfs_set_tree_block_level(leaf, bi, (int)owner); 1141 } else { 1142 btrfs_set_extent_flags(leaf, item, BTRFS_EXTENT_FLAG_DATA); 1143 } 1144 btrfs_mark_buffer_dirty(leaf); 1145 return 0; 1146 } 1147 #endif 1148 1149 static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset) 1150 { 1151 u32 high_crc = ~(u32)0; 1152 u32 low_crc = ~(u32)0; 1153 __le64 lenum; 1154 1155 lenum = cpu_to_le64(root_objectid); 1156 high_crc = btrfs_crc32c(high_crc, &lenum, sizeof(lenum)); 1157 lenum = cpu_to_le64(owner); 1158 low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum)); 1159 lenum = cpu_to_le64(offset); 1160 low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum)); 1161 1162 return ((u64)high_crc << 31) ^ (u64)low_crc; 1163 } 1164 1165 static u64 hash_extent_data_ref_item(struct extent_buffer *leaf, 1166 struct btrfs_extent_data_ref *ref) 1167 { 1168 return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref), 1169 btrfs_extent_data_ref_objectid(leaf, ref), 1170 btrfs_extent_data_ref_offset(leaf, ref)); 1171 } 1172 1173 static int match_extent_data_ref(struct extent_buffer *leaf, 1174 struct btrfs_extent_data_ref *ref, 1175 u64 root_objectid, u64 
owner, u64 offset) 1176 { 1177 if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid || 1178 btrfs_extent_data_ref_objectid(leaf, ref) != owner || 1179 btrfs_extent_data_ref_offset(leaf, ref) != offset) 1180 return 0; 1181 return 1; 1182 } 1183 1184 static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans, 1185 struct btrfs_root *root, 1186 struct btrfs_path *path, 1187 u64 bytenr, u64 parent, 1188 u64 root_objectid, 1189 u64 owner, u64 offset) 1190 { 1191 struct btrfs_key key; 1192 struct btrfs_extent_data_ref *ref; 1193 struct extent_buffer *leaf; 1194 u32 nritems; 1195 int ret; 1196 int recow; 1197 int err = -ENOENT; 1198 1199 key.objectid = bytenr; 1200 if (parent) { 1201 key.type = BTRFS_SHARED_DATA_REF_KEY; 1202 key.offset = parent; 1203 } else { 1204 key.type = BTRFS_EXTENT_DATA_REF_KEY; 1205 key.offset = hash_extent_data_ref(root_objectid, 1206 owner, offset); 1207 } 1208 again: 1209 recow = 0; 1210 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 1211 if (ret < 0) { 1212 err = ret; 1213 goto fail; 1214 } 1215 1216 if (parent) { 1217 if (!ret) 1218 return 0; 1219 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 1220 key.type = BTRFS_EXTENT_REF_V0_KEY; 1221 btrfs_release_path(path); 1222 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 1223 if (ret < 0) { 1224 err = ret; 1225 goto fail; 1226 } 1227 if (!ret) 1228 return 0; 1229 #endif 1230 goto fail; 1231 } 1232 1233 leaf = path->nodes[0]; 1234 nritems = btrfs_header_nritems(leaf); 1235 while (1) { 1236 if (path->slots[0] >= nritems) { 1237 ret = btrfs_next_leaf(root, path); 1238 if (ret < 0) 1239 err = ret; 1240 if (ret) 1241 goto fail; 1242 1243 leaf = path->nodes[0]; 1244 nritems = btrfs_header_nritems(leaf); 1245 recow = 1; 1246 } 1247 1248 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 1249 if (key.objectid != bytenr || 1250 key.type != BTRFS_EXTENT_DATA_REF_KEY) 1251 goto fail; 1252 1253 ref = btrfs_item_ptr(leaf, path->slots[0], 1254 struct btrfs_extent_data_ref); 1255 1256 if (match_extent_data_ref(leaf, ref, root_objectid, 1257 owner, offset)) { 1258 if (recow) { 1259 btrfs_release_path(path); 1260 goto again; 1261 } 1262 err = 0; 1263 break; 1264 } 1265 path->slots[0]++; 1266 } 1267 fail: 1268 return err; 1269 } 1270 1271 static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans, 1272 struct btrfs_root *root, 1273 struct btrfs_path *path, 1274 u64 bytenr, u64 parent, 1275 u64 root_objectid, u64 owner, 1276 u64 offset, int refs_to_add) 1277 { 1278 struct btrfs_key key; 1279 struct extent_buffer *leaf; 1280 u32 size; 1281 u32 num_refs; 1282 int ret; 1283 1284 key.objectid = bytenr; 1285 if (parent) { 1286 key.type = BTRFS_SHARED_DATA_REF_KEY; 1287 key.offset = parent; 1288 size = sizeof(struct btrfs_shared_data_ref); 1289 } else { 1290 key.type = BTRFS_EXTENT_DATA_REF_KEY; 1291 key.offset = hash_extent_data_ref(root_objectid, 1292 owner, offset); 1293 size = sizeof(struct btrfs_extent_data_ref); 1294 } 1295 1296 ret = btrfs_insert_empty_item(trans, root, path, &key, size); 1297 if (ret && ret != -EEXIST) 1298 goto fail; 1299 1300 leaf = path->nodes[0]; 1301 if (parent) { 1302 struct btrfs_shared_data_ref *ref; 1303 ref = btrfs_item_ptr(leaf, path->slots[0], 1304 struct btrfs_shared_data_ref); 1305 if (ret == 0) { 1306 btrfs_set_shared_data_ref_count(leaf, ref, refs_to_add); 1307 } else { 1308 num_refs = btrfs_shared_data_ref_count(leaf, ref); 1309 num_refs += refs_to_add; 1310 btrfs_set_shared_data_ref_count(leaf, ref, num_refs); 1311 } 1312 } else { 1313 struct 
btrfs_extent_data_ref *ref; 1314 while (ret == -EEXIST) { 1315 ref = btrfs_item_ptr(leaf, path->slots[0], 1316 struct btrfs_extent_data_ref); 1317 if (match_extent_data_ref(leaf, ref, root_objectid, 1318 owner, offset)) 1319 break; 1320 btrfs_release_path(path); 1321 key.offset++; 1322 ret = btrfs_insert_empty_item(trans, root, path, &key, 1323 size); 1324 if (ret && ret != -EEXIST) 1325 goto fail; 1326 1327 leaf = path->nodes[0]; 1328 } 1329 ref = btrfs_item_ptr(leaf, path->slots[0], 1330 struct btrfs_extent_data_ref); 1331 if (ret == 0) { 1332 btrfs_set_extent_data_ref_root(leaf, ref, 1333 root_objectid); 1334 btrfs_set_extent_data_ref_objectid(leaf, ref, owner); 1335 btrfs_set_extent_data_ref_offset(leaf, ref, offset); 1336 btrfs_set_extent_data_ref_count(leaf, ref, refs_to_add); 1337 } else { 1338 num_refs = btrfs_extent_data_ref_count(leaf, ref); 1339 num_refs += refs_to_add; 1340 btrfs_set_extent_data_ref_count(leaf, ref, num_refs); 1341 } 1342 } 1343 btrfs_mark_buffer_dirty(leaf); 1344 ret = 0; 1345 fail: 1346 btrfs_release_path(path); 1347 return ret; 1348 } 1349 1350 static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans, 1351 struct btrfs_root *root, 1352 struct btrfs_path *path, 1353 int refs_to_drop, int *last_ref) 1354 { 1355 struct btrfs_key key; 1356 struct btrfs_extent_data_ref *ref1 = NULL; 1357 struct btrfs_shared_data_ref *ref2 = NULL; 1358 struct extent_buffer *leaf; 1359 u32 num_refs = 0; 1360 int ret = 0; 1361 1362 leaf = path->nodes[0]; 1363 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 1364 1365 if (key.type == BTRFS_EXTENT_DATA_REF_KEY) { 1366 ref1 = btrfs_item_ptr(leaf, path->slots[0], 1367 struct btrfs_extent_data_ref); 1368 num_refs = btrfs_extent_data_ref_count(leaf, ref1); 1369 } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) { 1370 ref2 = btrfs_item_ptr(leaf, path->slots[0], 1371 struct btrfs_shared_data_ref); 1372 num_refs = btrfs_shared_data_ref_count(leaf, ref2); 1373 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 1374 } else if (key.type == BTRFS_EXTENT_REF_V0_KEY) { 1375 struct btrfs_extent_ref_v0 *ref0; 1376 ref0 = btrfs_item_ptr(leaf, path->slots[0], 1377 struct btrfs_extent_ref_v0); 1378 num_refs = btrfs_ref_count_v0(leaf, ref0); 1379 #endif 1380 } else { 1381 BUG(); 1382 } 1383 1384 BUG_ON(num_refs < refs_to_drop); 1385 num_refs -= refs_to_drop; 1386 1387 if (num_refs == 0) { 1388 ret = btrfs_del_item(trans, root, path); 1389 *last_ref = 1; 1390 } else { 1391 if (key.type == BTRFS_EXTENT_DATA_REF_KEY) 1392 btrfs_set_extent_data_ref_count(leaf, ref1, num_refs); 1393 else if (key.type == BTRFS_SHARED_DATA_REF_KEY) 1394 btrfs_set_shared_data_ref_count(leaf, ref2, num_refs); 1395 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 1396 else { 1397 struct btrfs_extent_ref_v0 *ref0; 1398 ref0 = btrfs_item_ptr(leaf, path->slots[0], 1399 struct btrfs_extent_ref_v0); 1400 btrfs_set_ref_count_v0(leaf, ref0, num_refs); 1401 } 1402 #endif 1403 btrfs_mark_buffer_dirty(leaf); 1404 } 1405 return ret; 1406 } 1407 1408 static noinline u32 extent_data_ref_count(struct btrfs_path *path, 1409 struct btrfs_extent_inline_ref *iref) 1410 { 1411 struct btrfs_key key; 1412 struct extent_buffer *leaf; 1413 struct btrfs_extent_data_ref *ref1; 1414 struct btrfs_shared_data_ref *ref2; 1415 u32 num_refs = 0; 1416 1417 leaf = path->nodes[0]; 1418 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 1419 if (iref) { 1420 if (btrfs_extent_inline_ref_type(leaf, iref) == 1421 BTRFS_EXTENT_DATA_REF_KEY) { 1422 ref1 = (struct btrfs_extent_data_ref *)(&iref->offset); 1423 num_refs = 
btrfs_extent_data_ref_count(leaf, ref1); 1424 } else { 1425 ref2 = (struct btrfs_shared_data_ref *)(iref + 1); 1426 num_refs = btrfs_shared_data_ref_count(leaf, ref2); 1427 } 1428 } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) { 1429 ref1 = btrfs_item_ptr(leaf, path->slots[0], 1430 struct btrfs_extent_data_ref); 1431 num_refs = btrfs_extent_data_ref_count(leaf, ref1); 1432 } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) { 1433 ref2 = btrfs_item_ptr(leaf, path->slots[0], 1434 struct btrfs_shared_data_ref); 1435 num_refs = btrfs_shared_data_ref_count(leaf, ref2); 1436 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 1437 } else if (key.type == BTRFS_EXTENT_REF_V0_KEY) { 1438 struct btrfs_extent_ref_v0 *ref0; 1439 ref0 = btrfs_item_ptr(leaf, path->slots[0], 1440 struct btrfs_extent_ref_v0); 1441 num_refs = btrfs_ref_count_v0(leaf, ref0); 1442 #endif 1443 } else { 1444 WARN_ON(1); 1445 } 1446 return num_refs; 1447 } 1448 1449 static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans, 1450 struct btrfs_root *root, 1451 struct btrfs_path *path, 1452 u64 bytenr, u64 parent, 1453 u64 root_objectid) 1454 { 1455 struct btrfs_key key; 1456 int ret; 1457 1458 key.objectid = bytenr; 1459 if (parent) { 1460 key.type = BTRFS_SHARED_BLOCK_REF_KEY; 1461 key.offset = parent; 1462 } else { 1463 key.type = BTRFS_TREE_BLOCK_REF_KEY; 1464 key.offset = root_objectid; 1465 } 1466 1467 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 1468 if (ret > 0) 1469 ret = -ENOENT; 1470 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 1471 if (ret == -ENOENT && parent) { 1472 btrfs_release_path(path); 1473 key.type = BTRFS_EXTENT_REF_V0_KEY; 1474 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 1475 if (ret > 0) 1476 ret = -ENOENT; 1477 } 1478 #endif 1479 return ret; 1480 } 1481 1482 static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans, 1483 struct btrfs_root *root, 1484 struct btrfs_path *path, 1485 u64 bytenr, u64 parent, 1486 u64 root_objectid) 1487 { 1488 struct btrfs_key key; 1489 int ret; 1490 1491 key.objectid = bytenr; 1492 if (parent) { 1493 key.type = BTRFS_SHARED_BLOCK_REF_KEY; 1494 key.offset = parent; 1495 } else { 1496 key.type = BTRFS_TREE_BLOCK_REF_KEY; 1497 key.offset = root_objectid; 1498 } 1499 1500 ret = btrfs_insert_empty_item(trans, root, path, &key, 0); 1501 btrfs_release_path(path); 1502 return ret; 1503 } 1504 1505 static inline int extent_ref_type(u64 parent, u64 owner) 1506 { 1507 int type; 1508 if (owner < BTRFS_FIRST_FREE_OBJECTID) { 1509 if (parent > 0) 1510 type = BTRFS_SHARED_BLOCK_REF_KEY; 1511 else 1512 type = BTRFS_TREE_BLOCK_REF_KEY; 1513 } else { 1514 if (parent > 0) 1515 type = BTRFS_SHARED_DATA_REF_KEY; 1516 else 1517 type = BTRFS_EXTENT_DATA_REF_KEY; 1518 } 1519 return type; 1520 } 1521 1522 static int find_next_key(struct btrfs_path *path, int level, 1523 struct btrfs_key *key) 1524 1525 { 1526 for (; level < BTRFS_MAX_LEVEL; level++) { 1527 if (!path->nodes[level]) 1528 break; 1529 if (path->slots[level] + 1 >= 1530 btrfs_header_nritems(path->nodes[level])) 1531 continue; 1532 if (level == 0) 1533 btrfs_item_key_to_cpu(path->nodes[level], key, 1534 path->slots[level] + 1); 1535 else 1536 btrfs_node_key_to_cpu(path->nodes[level], key, 1537 path->slots[level] + 1); 1538 return 0; 1539 } 1540 return 1; 1541 } 1542 1543 /* 1544 * look for inline back ref. if back ref is found, *ref_ret is set 1545 * to the address of inline back ref, and 0 is returned. 
1546 * 1547 * if back ref isn't found, *ref_ret is set to the address where it 1548 * should be inserted, and -ENOENT is returned. 1549 * 1550 * if insert is true and there are too many inline back refs, the path 1551 * points to the extent item, and -EAGAIN is returned. 1552 * 1553 * NOTE: inline back refs are ordered in the same way that back ref 1554 * items in the tree are ordered. 1555 */ 1556 static noinline_for_stack 1557 int lookup_inline_extent_backref(struct btrfs_trans_handle *trans, 1558 struct btrfs_root *root, 1559 struct btrfs_path *path, 1560 struct btrfs_extent_inline_ref **ref_ret, 1561 u64 bytenr, u64 num_bytes, 1562 u64 parent, u64 root_objectid, 1563 u64 owner, u64 offset, int insert) 1564 { 1565 struct btrfs_key key; 1566 struct extent_buffer *leaf; 1567 struct btrfs_extent_item *ei; 1568 struct btrfs_extent_inline_ref *iref; 1569 u64 flags; 1570 u64 item_size; 1571 unsigned long ptr; 1572 unsigned long end; 1573 int extra_size; 1574 int type; 1575 int want; 1576 int ret; 1577 int err = 0; 1578 bool skinny_metadata = btrfs_fs_incompat(root->fs_info, 1579 SKINNY_METADATA); 1580 1581 key.objectid = bytenr; 1582 key.type = BTRFS_EXTENT_ITEM_KEY; 1583 key.offset = num_bytes; 1584 1585 want = extent_ref_type(parent, owner); 1586 if (insert) { 1587 extra_size = btrfs_extent_inline_ref_size(want); 1588 path->keep_locks = 1; 1589 } else 1590 extra_size = -1; 1591 1592 /* 1593 * Owner is our parent level, so we can just add one to get the level 1594 * for the block we are interested in. 1595 */ 1596 if (skinny_metadata && owner < BTRFS_FIRST_FREE_OBJECTID) { 1597 key.type = BTRFS_METADATA_ITEM_KEY; 1598 key.offset = owner; 1599 } 1600 1601 again: 1602 ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1); 1603 if (ret < 0) { 1604 err = ret; 1605 goto out; 1606 } 1607 1608 /* 1609 * We may be a newly converted file system which still has the old fat 1610 * extent entries for metadata, so try and see if we have one of those. 
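 * In other words: if the skinny BTRFS_METADATA_ITEM_KEY lookup misses,
 * retry the search with the old-style
 * (bytenr, BTRFS_EXTENT_ITEM_KEY, num_bytes) key before giving up.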
1611 */ 1612 if (ret > 0 && skinny_metadata) { 1613 skinny_metadata = false; 1614 if (path->slots[0]) { 1615 path->slots[0]--; 1616 btrfs_item_key_to_cpu(path->nodes[0], &key, 1617 path->slots[0]); 1618 if (key.objectid == bytenr && 1619 key.type == BTRFS_EXTENT_ITEM_KEY && 1620 key.offset == num_bytes) 1621 ret = 0; 1622 } 1623 if (ret) { 1624 key.objectid = bytenr; 1625 key.type = BTRFS_EXTENT_ITEM_KEY; 1626 key.offset = num_bytes; 1627 btrfs_release_path(path); 1628 goto again; 1629 } 1630 } 1631 1632 if (ret && !insert) { 1633 err = -ENOENT; 1634 goto out; 1635 } else if (WARN_ON(ret)) { 1636 err = -EIO; 1637 goto out; 1638 } 1639 1640 leaf = path->nodes[0]; 1641 item_size = btrfs_item_size_nr(leaf, path->slots[0]); 1642 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 1643 if (item_size < sizeof(*ei)) { 1644 if (!insert) { 1645 err = -ENOENT; 1646 goto out; 1647 } 1648 ret = convert_extent_item_v0(trans, root, path, owner, 1649 extra_size); 1650 if (ret < 0) { 1651 err = ret; 1652 goto out; 1653 } 1654 leaf = path->nodes[0]; 1655 item_size = btrfs_item_size_nr(leaf, path->slots[0]); 1656 } 1657 #endif 1658 BUG_ON(item_size < sizeof(*ei)); 1659 1660 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 1661 flags = btrfs_extent_flags(leaf, ei); 1662 1663 ptr = (unsigned long)(ei + 1); 1664 end = (unsigned long)ei + item_size; 1665 1666 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK && !skinny_metadata) { 1667 ptr += sizeof(struct btrfs_tree_block_info); 1668 BUG_ON(ptr > end); 1669 } 1670 1671 err = -ENOENT; 1672 while (1) { 1673 if (ptr >= end) { 1674 WARN_ON(ptr > end); 1675 break; 1676 } 1677 iref = (struct btrfs_extent_inline_ref *)ptr; 1678 type = btrfs_extent_inline_ref_type(leaf, iref); 1679 if (want < type) 1680 break; 1681 if (want > type) { 1682 ptr += btrfs_extent_inline_ref_size(type); 1683 continue; 1684 } 1685 1686 if (type == BTRFS_EXTENT_DATA_REF_KEY) { 1687 struct btrfs_extent_data_ref *dref; 1688 dref = (struct btrfs_extent_data_ref *)(&iref->offset); 1689 if (match_extent_data_ref(leaf, dref, root_objectid, 1690 owner, offset)) { 1691 err = 0; 1692 break; 1693 } 1694 if (hash_extent_data_ref_item(leaf, dref) < 1695 hash_extent_data_ref(root_objectid, owner, offset)) 1696 break; 1697 } else { 1698 u64 ref_offset; 1699 ref_offset = btrfs_extent_inline_ref_offset(leaf, iref); 1700 if (parent > 0) { 1701 if (parent == ref_offset) { 1702 err = 0; 1703 break; 1704 } 1705 if (ref_offset < parent) 1706 break; 1707 } else { 1708 if (root_objectid == ref_offset) { 1709 err = 0; 1710 break; 1711 } 1712 if (ref_offset < root_objectid) 1713 break; 1714 } 1715 } 1716 ptr += btrfs_extent_inline_ref_size(type); 1717 } 1718 if (err == -ENOENT && insert) { 1719 if (item_size + extra_size >= 1720 BTRFS_MAX_EXTENT_ITEM_SIZE(root)) { 1721 err = -EAGAIN; 1722 goto out; 1723 } 1724 /* 1725 * To add new inline back ref, we have to make sure 1726 * there is no corresponding back ref item. 
1727 * For simplicity, we just do not add new inline back 1728 * ref if there is any kind of item for this block 1729 */ 1730 if (find_next_key(path, 0, &key) == 0 && 1731 key.objectid == bytenr && 1732 key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) { 1733 err = -EAGAIN; 1734 goto out; 1735 } 1736 } 1737 *ref_ret = (struct btrfs_extent_inline_ref *)ptr; 1738 out: 1739 if (insert) { 1740 path->keep_locks = 0; 1741 btrfs_unlock_up_safe(path, 1); 1742 } 1743 return err; 1744 } 1745 1746 /* 1747 * helper to add new inline back ref 1748 */ 1749 static noinline_for_stack 1750 void setup_inline_extent_backref(struct btrfs_root *root, 1751 struct btrfs_path *path, 1752 struct btrfs_extent_inline_ref *iref, 1753 u64 parent, u64 root_objectid, 1754 u64 owner, u64 offset, int refs_to_add, 1755 struct btrfs_delayed_extent_op *extent_op) 1756 { 1757 struct extent_buffer *leaf; 1758 struct btrfs_extent_item *ei; 1759 unsigned long ptr; 1760 unsigned long end; 1761 unsigned long item_offset; 1762 u64 refs; 1763 int size; 1764 int type; 1765 1766 leaf = path->nodes[0]; 1767 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 1768 item_offset = (unsigned long)iref - (unsigned long)ei; 1769 1770 type = extent_ref_type(parent, owner); 1771 size = btrfs_extent_inline_ref_size(type); 1772 1773 btrfs_extend_item(root, path, size); 1774 1775 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 1776 refs = btrfs_extent_refs(leaf, ei); 1777 refs += refs_to_add; 1778 btrfs_set_extent_refs(leaf, ei, refs); 1779 if (extent_op) 1780 __run_delayed_extent_op(extent_op, leaf, ei); 1781 1782 ptr = (unsigned long)ei + item_offset; 1783 end = (unsigned long)ei + btrfs_item_size_nr(leaf, path->slots[0]); 1784 if (ptr < end - size) 1785 memmove_extent_buffer(leaf, ptr + size, ptr, 1786 end - size - ptr); 1787 1788 iref = (struct btrfs_extent_inline_ref *)ptr; 1789 btrfs_set_extent_inline_ref_type(leaf, iref, type); 1790 if (type == BTRFS_EXTENT_DATA_REF_KEY) { 1791 struct btrfs_extent_data_ref *dref; 1792 dref = (struct btrfs_extent_data_ref *)(&iref->offset); 1793 btrfs_set_extent_data_ref_root(leaf, dref, root_objectid); 1794 btrfs_set_extent_data_ref_objectid(leaf, dref, owner); 1795 btrfs_set_extent_data_ref_offset(leaf, dref, offset); 1796 btrfs_set_extent_data_ref_count(leaf, dref, refs_to_add); 1797 } else if (type == BTRFS_SHARED_DATA_REF_KEY) { 1798 struct btrfs_shared_data_ref *sref; 1799 sref = (struct btrfs_shared_data_ref *)(iref + 1); 1800 btrfs_set_shared_data_ref_count(leaf, sref, refs_to_add); 1801 btrfs_set_extent_inline_ref_offset(leaf, iref, parent); 1802 } else if (type == BTRFS_SHARED_BLOCK_REF_KEY) { 1803 btrfs_set_extent_inline_ref_offset(leaf, iref, parent); 1804 } else { 1805 btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid); 1806 } 1807 btrfs_mark_buffer_dirty(leaf); 1808 } 1809 1810 static int lookup_extent_backref(struct btrfs_trans_handle *trans, 1811 struct btrfs_root *root, 1812 struct btrfs_path *path, 1813 struct btrfs_extent_inline_ref **ref_ret, 1814 u64 bytenr, u64 num_bytes, u64 parent, 1815 u64 root_objectid, u64 owner, u64 offset) 1816 { 1817 int ret; 1818 1819 ret = lookup_inline_extent_backref(trans, root, path, ref_ret, 1820 bytenr, num_bytes, parent, 1821 root_objectid, owner, offset, 0); 1822 if (ret != -ENOENT) 1823 return ret; 1824 1825 btrfs_release_path(path); 1826 *ref_ret = NULL; 1827 1828 if (owner < BTRFS_FIRST_FREE_OBJECTID) { 1829 ret = lookup_tree_block_ref(trans, root, path, bytenr, parent, 1830 root_objectid); 1831 } else { 1832 ret 
= lookup_extent_data_ref(trans, root, path, bytenr, parent, 1833 root_objectid, owner, offset); 1834 } 1835 return ret; 1836 } 1837 1838 /* 1839 * helper to update/remove inline back ref 1840 */ 1841 static noinline_for_stack 1842 void update_inline_extent_backref(struct btrfs_root *root, 1843 struct btrfs_path *path, 1844 struct btrfs_extent_inline_ref *iref, 1845 int refs_to_mod, 1846 struct btrfs_delayed_extent_op *extent_op, 1847 int *last_ref) 1848 { 1849 struct extent_buffer *leaf; 1850 struct btrfs_extent_item *ei; 1851 struct btrfs_extent_data_ref *dref = NULL; 1852 struct btrfs_shared_data_ref *sref = NULL; 1853 unsigned long ptr; 1854 unsigned long end; 1855 u32 item_size; 1856 int size; 1857 int type; 1858 u64 refs; 1859 1860 leaf = path->nodes[0]; 1861 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 1862 refs = btrfs_extent_refs(leaf, ei); 1863 WARN_ON(refs_to_mod < 0 && refs + refs_to_mod <= 0); 1864 refs += refs_to_mod; 1865 btrfs_set_extent_refs(leaf, ei, refs); 1866 if (extent_op) 1867 __run_delayed_extent_op(extent_op, leaf, ei); 1868 1869 type = btrfs_extent_inline_ref_type(leaf, iref); 1870 1871 if (type == BTRFS_EXTENT_DATA_REF_KEY) { 1872 dref = (struct btrfs_extent_data_ref *)(&iref->offset); 1873 refs = btrfs_extent_data_ref_count(leaf, dref); 1874 } else if (type == BTRFS_SHARED_DATA_REF_KEY) { 1875 sref = (struct btrfs_shared_data_ref *)(iref + 1); 1876 refs = btrfs_shared_data_ref_count(leaf, sref); 1877 } else { 1878 refs = 1; 1879 BUG_ON(refs_to_mod != -1); 1880 } 1881 1882 BUG_ON(refs_to_mod < 0 && refs < -refs_to_mod); 1883 refs += refs_to_mod; 1884 1885 if (refs > 0) { 1886 if (type == BTRFS_EXTENT_DATA_REF_KEY) 1887 btrfs_set_extent_data_ref_count(leaf, dref, refs); 1888 else 1889 btrfs_set_shared_data_ref_count(leaf, sref, refs); 1890 } else { 1891 *last_ref = 1; 1892 size = btrfs_extent_inline_ref_size(type); 1893 item_size = btrfs_item_size_nr(leaf, path->slots[0]); 1894 ptr = (unsigned long)iref; 1895 end = (unsigned long)ei + item_size; 1896 if (ptr + size < end) 1897 memmove_extent_buffer(leaf, ptr, ptr + size, 1898 end - ptr - size); 1899 item_size -= size; 1900 btrfs_truncate_item(root, path, item_size, 1); 1901 } 1902 btrfs_mark_buffer_dirty(leaf); 1903 } 1904 1905 static noinline_for_stack 1906 int insert_inline_extent_backref(struct btrfs_trans_handle *trans, 1907 struct btrfs_root *root, 1908 struct btrfs_path *path, 1909 u64 bytenr, u64 num_bytes, u64 parent, 1910 u64 root_objectid, u64 owner, 1911 u64 offset, int refs_to_add, 1912 struct btrfs_delayed_extent_op *extent_op) 1913 { 1914 struct btrfs_extent_inline_ref *iref; 1915 int ret; 1916 1917 ret = lookup_inline_extent_backref(trans, root, path, &iref, 1918 bytenr, num_bytes, parent, 1919 root_objectid, owner, offset, 1); 1920 if (ret == 0) { 1921 BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID); 1922 update_inline_extent_backref(root, path, iref, 1923 refs_to_add, extent_op, NULL); 1924 } else if (ret == -ENOENT) { 1925 setup_inline_extent_backref(root, path, iref, parent, 1926 root_objectid, owner, offset, 1927 refs_to_add, extent_op); 1928 ret = 0; 1929 } 1930 return ret; 1931 } 1932 1933 static int insert_extent_backref(struct btrfs_trans_handle *trans, 1934 struct btrfs_root *root, 1935 struct btrfs_path *path, 1936 u64 bytenr, u64 parent, u64 root_objectid, 1937 u64 owner, u64 offset, int refs_to_add) 1938 { 1939 int ret; 1940 if (owner < BTRFS_FIRST_FREE_OBJECTID) { 1941 BUG_ON(refs_to_add != 1); 1942 ret = insert_tree_block_ref(trans, root, path, bytenr, 1943 parent, 
root_objectid); 1944 } else { 1945 ret = insert_extent_data_ref(trans, root, path, bytenr, 1946 parent, root_objectid, 1947 owner, offset, refs_to_add); 1948 } 1949 return ret; 1950 } 1951 1952 static int remove_extent_backref(struct btrfs_trans_handle *trans, 1953 struct btrfs_root *root, 1954 struct btrfs_path *path, 1955 struct btrfs_extent_inline_ref *iref, 1956 int refs_to_drop, int is_data, int *last_ref) 1957 { 1958 int ret = 0; 1959 1960 BUG_ON(!is_data && refs_to_drop != 1); 1961 if (iref) { 1962 update_inline_extent_backref(root, path, iref, 1963 -refs_to_drop, NULL, last_ref); 1964 } else if (is_data) { 1965 ret = remove_extent_data_ref(trans, root, path, refs_to_drop, 1966 last_ref); 1967 } else { 1968 *last_ref = 1; 1969 ret = btrfs_del_item(trans, root, path); 1970 } 1971 return ret; 1972 } 1973 1974 #define in_range(b, first, len) ((b) >= (first) && (b) < (first) + (len)) 1975 static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len, 1976 u64 *discarded_bytes) 1977 { 1978 int j, ret = 0; 1979 u64 bytes_left, end; 1980 u64 aligned_start = ALIGN(start, 1 << 9); 1981 1982 if (WARN_ON(start != aligned_start)) { 1983 len -= aligned_start - start; 1984 len = round_down(len, 1 << 9); 1985 start = aligned_start; 1986 } 1987 1988 *discarded_bytes = 0; 1989 1990 if (!len) 1991 return 0; 1992 1993 end = start + len; 1994 bytes_left = len; 1995 1996 /* Skip any superblocks on this device. */ 1997 for (j = 0; j < BTRFS_SUPER_MIRROR_MAX; j++) { 1998 u64 sb_start = btrfs_sb_offset(j); 1999 u64 sb_end = sb_start + BTRFS_SUPER_INFO_SIZE; 2000 u64 size = sb_start - start; 2001 2002 if (!in_range(sb_start, start, bytes_left) && 2003 !in_range(sb_end, start, bytes_left) && 2004 !in_range(start, sb_start, BTRFS_SUPER_INFO_SIZE)) 2005 continue; 2006 2007 /* 2008 * Superblock spans beginning of range. Adjust start and 2009 * try again. 2010 */ 2011 if (sb_start <= start) { 2012 start += sb_end - start; 2013 if (start > end) { 2014 bytes_left = 0; 2015 break; 2016 } 2017 bytes_left = end - start; 2018 continue; 2019 } 2020 2021 if (size) { 2022 ret = blkdev_issue_discard(bdev, start >> 9, size >> 9, 2023 GFP_NOFS, 0); 2024 if (!ret) 2025 *discarded_bytes += size; 2026 else if (ret != -EOPNOTSUPP) 2027 return ret; 2028 } 2029 2030 start = sb_end; 2031 if (start > end) { 2032 bytes_left = 0; 2033 break; 2034 } 2035 bytes_left = end - start; 2036 } 2037 2038 if (bytes_left) { 2039 ret = blkdev_issue_discard(bdev, start >> 9, bytes_left >> 9, 2040 GFP_NOFS, 0); 2041 if (!ret) 2042 *discarded_bytes += bytes_left; 2043 } 2044 return ret; 2045 } 2046 2047 int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, 2048 u64 num_bytes, u64 *actual_bytes) 2049 { 2050 int ret; 2051 u64 discarded_bytes = 0; 2052 struct btrfs_bio *bbio = NULL; 2053 2054 2055 /* 2056 * Avoid races with device replace and make sure our bbio has devices 2057 * associated to its stripes that don't go away while we are discarding. 
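	 * The counter taken here is paired with the btrfs_bio_counter_dec()
	 * call further down, once all stripes have been processed.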
	 */
	btrfs_bio_counter_inc_blocked(root->fs_info);
	/* Tell the block device(s) that the sectors can be discarded */
	ret = btrfs_map_block(root->fs_info, REQ_OP_DISCARD,
			      bytenr, &num_bytes, &bbio, 0);
	/* Error condition is -ENOMEM */
	if (!ret) {
		struct btrfs_bio_stripe *stripe = bbio->stripes;
		int i;


		for (i = 0; i < bbio->num_stripes; i++, stripe++) {
			u64 bytes;
			if (!stripe->dev->can_discard)
				continue;

			ret = btrfs_issue_discard(stripe->dev->bdev,
						  stripe->physical,
						  stripe->length,
						  &bytes);
			if (!ret)
				discarded_bytes += bytes;
			else if (ret != -EOPNOTSUPP)
				break; /* Logic errors or -ENOMEM/-EIO; it's unclear how -EIO could happen here */

			/*
			 * If we got EOPNOTSUPP for some reason, ignore the
			 * return value so we don't confuse callers of
			 * btrfs_discard_extent().
			 */
			ret = 0;
		}
		btrfs_put_bbio(bbio);
	}
	btrfs_bio_counter_dec(root->fs_info);

	if (actual_bytes)
		*actual_bytes = discarded_bytes;


	if (ret == -EOPNOTSUPP)
		ret = 0;
	return ret;
}

/* Can return -ENOMEM */
int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
			 struct btrfs_root *root,
			 u64 bytenr, u64 num_bytes, u64 parent,
			 u64 root_objectid, u64 owner, u64 offset)
{
	int ret;
	struct btrfs_fs_info *fs_info = root->fs_info;

	BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID &&
	       root_objectid == BTRFS_TREE_LOG_OBJECTID);

	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
		ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
					num_bytes,
					parent, root_objectid, (int)owner,
					BTRFS_ADD_DELAYED_REF, NULL);
	} else {
		ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
					num_bytes, parent, root_objectid,
					owner, offset, 0,
					BTRFS_ADD_DELAYED_REF, NULL);
	}
	return ret;
}

static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root,
				  struct btrfs_delayed_ref_node *node,
				  u64 parent, u64 root_objectid,
				  u64 owner, u64 offset, int refs_to_add,
				  struct btrfs_delayed_extent_op *extent_op)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_extent_item *item;
	struct btrfs_key key;
	u64 bytenr = node->bytenr;
	u64 num_bytes = node->num_bytes;
	u64 refs;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	path->reada = READA_FORWARD;
	path->leave_spinning = 1;
	/* this will set up the path even if it fails to insert the back ref */
	ret = insert_inline_extent_backref(trans, fs_info->extent_root, path,
					   bytenr, num_bytes, parent,
					   root_objectid, owner, offset,
					   refs_to_add, extent_op);
	if ((ret < 0 && ret != -EAGAIN) || !ret)
		goto out;

	/*
	 * We got -EAGAIN, which means there wasn't room to insert an inline
	 * extent ref, so just update the reference count and add a
	 * normal backref.
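	 * insert_inline_extent_backref() still left the path pointing at the
	 * extent item for us, so bump its ref count in place and then insert
	 * a separate keyed backref item below.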
2164 */ 2165 leaf = path->nodes[0]; 2166 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 2167 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 2168 refs = btrfs_extent_refs(leaf, item); 2169 btrfs_set_extent_refs(leaf, item, refs + refs_to_add); 2170 if (extent_op) 2171 __run_delayed_extent_op(extent_op, leaf, item); 2172 2173 btrfs_mark_buffer_dirty(leaf); 2174 btrfs_release_path(path); 2175 2176 path->reada = READA_FORWARD; 2177 path->leave_spinning = 1; 2178 /* now insert the actual backref */ 2179 ret = insert_extent_backref(trans, root->fs_info->extent_root, 2180 path, bytenr, parent, root_objectid, 2181 owner, offset, refs_to_add); 2182 if (ret) 2183 btrfs_abort_transaction(trans, ret); 2184 out: 2185 btrfs_free_path(path); 2186 return ret; 2187 } 2188 2189 static int run_delayed_data_ref(struct btrfs_trans_handle *trans, 2190 struct btrfs_root *root, 2191 struct btrfs_delayed_ref_node *node, 2192 struct btrfs_delayed_extent_op *extent_op, 2193 int insert_reserved) 2194 { 2195 int ret = 0; 2196 struct btrfs_delayed_data_ref *ref; 2197 struct btrfs_key ins; 2198 u64 parent = 0; 2199 u64 ref_root = 0; 2200 u64 flags = 0; 2201 2202 ins.objectid = node->bytenr; 2203 ins.offset = node->num_bytes; 2204 ins.type = BTRFS_EXTENT_ITEM_KEY; 2205 2206 ref = btrfs_delayed_node_to_data_ref(node); 2207 trace_run_delayed_data_ref(root->fs_info, node, ref, node->action); 2208 2209 if (node->type == BTRFS_SHARED_DATA_REF_KEY) 2210 parent = ref->parent; 2211 ref_root = ref->root; 2212 2213 if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) { 2214 if (extent_op) 2215 flags |= extent_op->flags_to_set; 2216 ret = alloc_reserved_file_extent(trans, root, 2217 parent, ref_root, flags, 2218 ref->objectid, ref->offset, 2219 &ins, node->ref_mod); 2220 } else if (node->action == BTRFS_ADD_DELAYED_REF) { 2221 ret = __btrfs_inc_extent_ref(trans, root, node, parent, 2222 ref_root, ref->objectid, 2223 ref->offset, node->ref_mod, 2224 extent_op); 2225 } else if (node->action == BTRFS_DROP_DELAYED_REF) { 2226 ret = __btrfs_free_extent(trans, root, node, parent, 2227 ref_root, ref->objectid, 2228 ref->offset, node->ref_mod, 2229 extent_op); 2230 } else { 2231 BUG(); 2232 } 2233 return ret; 2234 } 2235 2236 static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op, 2237 struct extent_buffer *leaf, 2238 struct btrfs_extent_item *ei) 2239 { 2240 u64 flags = btrfs_extent_flags(leaf, ei); 2241 if (extent_op->update_flags) { 2242 flags |= extent_op->flags_to_set; 2243 btrfs_set_extent_flags(leaf, ei, flags); 2244 } 2245 2246 if (extent_op->update_key) { 2247 struct btrfs_tree_block_info *bi; 2248 BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)); 2249 bi = (struct btrfs_tree_block_info *)(ei + 1); 2250 btrfs_set_tree_block_key(leaf, bi, &extent_op->key); 2251 } 2252 } 2253 2254 static int run_delayed_extent_op(struct btrfs_trans_handle *trans, 2255 struct btrfs_root *root, 2256 struct btrfs_delayed_ref_node *node, 2257 struct btrfs_delayed_extent_op *extent_op) 2258 { 2259 struct btrfs_key key; 2260 struct btrfs_path *path; 2261 struct btrfs_extent_item *ei; 2262 struct extent_buffer *leaf; 2263 u32 item_size; 2264 int ret; 2265 int err = 0; 2266 int metadata = !extent_op->is_data; 2267 2268 if (trans->aborted) 2269 return 0; 2270 2271 if (metadata && !btrfs_fs_incompat(root->fs_info, SKINNY_METADATA)) 2272 metadata = 0; 2273 2274 path = btrfs_alloc_path(); 2275 if (!path) 2276 return -ENOMEM; 2277 2278 key.objectid = node->bytenr; 2279 2280 if (metadata) { 2281 key.type 
= BTRFS_METADATA_ITEM_KEY; 2282 key.offset = extent_op->level; 2283 } else { 2284 key.type = BTRFS_EXTENT_ITEM_KEY; 2285 key.offset = node->num_bytes; 2286 } 2287 2288 again: 2289 path->reada = READA_FORWARD; 2290 path->leave_spinning = 1; 2291 ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, 2292 path, 0, 1); 2293 if (ret < 0) { 2294 err = ret; 2295 goto out; 2296 } 2297 if (ret > 0) { 2298 if (metadata) { 2299 if (path->slots[0] > 0) { 2300 path->slots[0]--; 2301 btrfs_item_key_to_cpu(path->nodes[0], &key, 2302 path->slots[0]); 2303 if (key.objectid == node->bytenr && 2304 key.type == BTRFS_EXTENT_ITEM_KEY && 2305 key.offset == node->num_bytes) 2306 ret = 0; 2307 } 2308 if (ret > 0) { 2309 btrfs_release_path(path); 2310 metadata = 0; 2311 2312 key.objectid = node->bytenr; 2313 key.offset = node->num_bytes; 2314 key.type = BTRFS_EXTENT_ITEM_KEY; 2315 goto again; 2316 } 2317 } else { 2318 err = -EIO; 2319 goto out; 2320 } 2321 } 2322 2323 leaf = path->nodes[0]; 2324 item_size = btrfs_item_size_nr(leaf, path->slots[0]); 2325 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 2326 if (item_size < sizeof(*ei)) { 2327 ret = convert_extent_item_v0(trans, root->fs_info->extent_root, 2328 path, (u64)-1, 0); 2329 if (ret < 0) { 2330 err = ret; 2331 goto out; 2332 } 2333 leaf = path->nodes[0]; 2334 item_size = btrfs_item_size_nr(leaf, path->slots[0]); 2335 } 2336 #endif 2337 BUG_ON(item_size < sizeof(*ei)); 2338 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 2339 __run_delayed_extent_op(extent_op, leaf, ei); 2340 2341 btrfs_mark_buffer_dirty(leaf); 2342 out: 2343 btrfs_free_path(path); 2344 return err; 2345 } 2346 2347 static int run_delayed_tree_ref(struct btrfs_trans_handle *trans, 2348 struct btrfs_root *root, 2349 struct btrfs_delayed_ref_node *node, 2350 struct btrfs_delayed_extent_op *extent_op, 2351 int insert_reserved) 2352 { 2353 int ret = 0; 2354 struct btrfs_delayed_tree_ref *ref; 2355 struct btrfs_key ins; 2356 u64 parent = 0; 2357 u64 ref_root = 0; 2358 bool skinny_metadata = btrfs_fs_incompat(root->fs_info, 2359 SKINNY_METADATA); 2360 2361 ref = btrfs_delayed_node_to_tree_ref(node); 2362 trace_run_delayed_tree_ref(root->fs_info, node, ref, node->action); 2363 2364 if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) 2365 parent = ref->parent; 2366 ref_root = ref->root; 2367 2368 ins.objectid = node->bytenr; 2369 if (skinny_metadata) { 2370 ins.offset = ref->level; 2371 ins.type = BTRFS_METADATA_ITEM_KEY; 2372 } else { 2373 ins.offset = node->num_bytes; 2374 ins.type = BTRFS_EXTENT_ITEM_KEY; 2375 } 2376 2377 BUG_ON(node->ref_mod != 1); 2378 if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) { 2379 BUG_ON(!extent_op || !extent_op->update_flags); 2380 ret = alloc_reserved_tree_block(trans, root, 2381 parent, ref_root, 2382 extent_op->flags_to_set, 2383 &extent_op->key, 2384 ref->level, &ins); 2385 } else if (node->action == BTRFS_ADD_DELAYED_REF) { 2386 ret = __btrfs_inc_extent_ref(trans, root, node, 2387 parent, ref_root, 2388 ref->level, 0, 1, 2389 extent_op); 2390 } else if (node->action == BTRFS_DROP_DELAYED_REF) { 2391 ret = __btrfs_free_extent(trans, root, node, 2392 parent, ref_root, 2393 ref->level, 0, 1, extent_op); 2394 } else { 2395 BUG(); 2396 } 2397 return ret; 2398 } 2399 2400 /* helper function to actually process a single delayed ref entry */ 2401 static int run_one_delayed_ref(struct btrfs_trans_handle *trans, 2402 struct btrfs_root *root, 2403 struct btrfs_delayed_ref_node *node, 2404 struct btrfs_delayed_extent_op *extent_op, 2405 int 
insert_reserved) 2406 { 2407 int ret = 0; 2408 2409 if (trans->aborted) { 2410 if (insert_reserved) 2411 btrfs_pin_extent(root, node->bytenr, 2412 node->num_bytes, 1); 2413 return 0; 2414 } 2415 2416 if (btrfs_delayed_ref_is_head(node)) { 2417 struct btrfs_delayed_ref_head *head; 2418 /* 2419 * we've hit the end of the chain and we were supposed 2420 * to insert this extent into the tree. But, it got 2421 * deleted before we ever needed to insert it, so all 2422 * we have to do is clean up the accounting 2423 */ 2424 BUG_ON(extent_op); 2425 head = btrfs_delayed_node_to_head(node); 2426 trace_run_delayed_ref_head(root->fs_info, node, head, 2427 node->action); 2428 2429 if (insert_reserved) { 2430 btrfs_pin_extent(root, node->bytenr, 2431 node->num_bytes, 1); 2432 if (head->is_data) { 2433 ret = btrfs_del_csums(trans, root, 2434 node->bytenr, 2435 node->num_bytes); 2436 } 2437 } 2438 2439 /* Also free its reserved qgroup space */ 2440 btrfs_qgroup_free_delayed_ref(root->fs_info, 2441 head->qgroup_ref_root, 2442 head->qgroup_reserved); 2443 return ret; 2444 } 2445 2446 if (node->type == BTRFS_TREE_BLOCK_REF_KEY || 2447 node->type == BTRFS_SHARED_BLOCK_REF_KEY) 2448 ret = run_delayed_tree_ref(trans, root, node, extent_op, 2449 insert_reserved); 2450 else if (node->type == BTRFS_EXTENT_DATA_REF_KEY || 2451 node->type == BTRFS_SHARED_DATA_REF_KEY) 2452 ret = run_delayed_data_ref(trans, root, node, extent_op, 2453 insert_reserved); 2454 else 2455 BUG(); 2456 return ret; 2457 } 2458 2459 static inline struct btrfs_delayed_ref_node * 2460 select_delayed_ref(struct btrfs_delayed_ref_head *head) 2461 { 2462 struct btrfs_delayed_ref_node *ref; 2463 2464 if (list_empty(&head->ref_list)) 2465 return NULL; 2466 2467 /* 2468 * Select a delayed ref of type BTRFS_ADD_DELAYED_REF first. 2469 * This is to prevent a ref count from going down to zero, which deletes 2470 * the extent item from the extent tree, when there still are references 2471 * to add, which would fail because they would not find the extent item. 2472 */ 2473 list_for_each_entry(ref, &head->ref_list, list) { 2474 if (ref->action == BTRFS_ADD_DELAYED_REF) 2475 return ref; 2476 } 2477 2478 return list_entry(head->ref_list.next, struct btrfs_delayed_ref_node, 2479 list); 2480 } 2481 2482 /* 2483 * Returns 0 on success or if called with an already aborted transaction. 2484 * Returns -ENOMEM or -EIO on failure and will abort the transaction. 
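 *
 * The nr argument is a rough cap on how many delayed refs are processed in
 * one call; btrfs_run_delayed_refs() may pass (unsigned long)-1 to drain
 * everything that is currently queued.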
2485 */ 2486 static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, 2487 struct btrfs_root *root, 2488 unsigned long nr) 2489 { 2490 struct btrfs_delayed_ref_root *delayed_refs; 2491 struct btrfs_delayed_ref_node *ref; 2492 struct btrfs_delayed_ref_head *locked_ref = NULL; 2493 struct btrfs_delayed_extent_op *extent_op; 2494 struct btrfs_fs_info *fs_info = root->fs_info; 2495 ktime_t start = ktime_get(); 2496 int ret; 2497 unsigned long count = 0; 2498 unsigned long actual_count = 0; 2499 int must_insert_reserved = 0; 2500 2501 delayed_refs = &trans->transaction->delayed_refs; 2502 while (1) { 2503 if (!locked_ref) { 2504 if (count >= nr) 2505 break; 2506 2507 spin_lock(&delayed_refs->lock); 2508 locked_ref = btrfs_select_ref_head(trans); 2509 if (!locked_ref) { 2510 spin_unlock(&delayed_refs->lock); 2511 break; 2512 } 2513 2514 /* grab the lock that says we are going to process 2515 * all the refs for this head */ 2516 ret = btrfs_delayed_ref_lock(trans, locked_ref); 2517 spin_unlock(&delayed_refs->lock); 2518 /* 2519 * we may have dropped the spin lock to get the head 2520 * mutex lock, and that might have given someone else 2521 * time to free the head. If that's true, it has been 2522 * removed from our list and we can move on. 2523 */ 2524 if (ret == -EAGAIN) { 2525 locked_ref = NULL; 2526 count++; 2527 continue; 2528 } 2529 } 2530 2531 /* 2532 * We need to try and merge add/drops of the same ref since we 2533 * can run into issues with relocate dropping the implicit ref 2534 * and then it being added back again before the drop can 2535 * finish. If we merged anything we need to re-loop so we can 2536 * get a good ref. 2537 * Or we can get node references of the same type that weren't 2538 * merged when created due to bumps in the tree mod seq, and 2539 * we need to merge them to prevent adding an inline extent 2540 * backref before dropping it (triggering a BUG_ON at 2541 * insert_inline_extent_backref()). 2542 */ 2543 spin_lock(&locked_ref->lock); 2544 btrfs_merge_delayed_refs(trans, fs_info, delayed_refs, 2545 locked_ref); 2546 2547 /* 2548 * locked_ref is the head node, so we have to go one 2549 * node back for any delayed ref updates 2550 */ 2551 ref = select_delayed_ref(locked_ref); 2552 2553 if (ref && ref->seq && 2554 btrfs_check_delayed_seq(fs_info, delayed_refs, ref->seq)) { 2555 spin_unlock(&locked_ref->lock); 2556 btrfs_delayed_ref_unlock(locked_ref); 2557 spin_lock(&delayed_refs->lock); 2558 locked_ref->processing = 0; 2559 delayed_refs->num_heads_ready++; 2560 spin_unlock(&delayed_refs->lock); 2561 locked_ref = NULL; 2562 cond_resched(); 2563 count++; 2564 continue; 2565 } 2566 2567 /* 2568 * record the must insert reserved flag before we 2569 * drop the spin lock. 
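		 * must_insert_reserved means the extent backing this head was
		 * just allocated and its extent item still has to be inserted;
		 * run_one_delayed_ref() uses the flag to make sure the reserved
		 * space is either consumed by inserting the extent item or
		 * pinned if the extent was freed before it ever made it into
		 * the extent tree.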
2570 */ 2571 must_insert_reserved = locked_ref->must_insert_reserved; 2572 locked_ref->must_insert_reserved = 0; 2573 2574 extent_op = locked_ref->extent_op; 2575 locked_ref->extent_op = NULL; 2576 2577 if (!ref) { 2578 2579 2580 /* All delayed refs have been processed, Go ahead 2581 * and send the head node to run_one_delayed_ref, 2582 * so that any accounting fixes can happen 2583 */ 2584 ref = &locked_ref->node; 2585 2586 if (extent_op && must_insert_reserved) { 2587 btrfs_free_delayed_extent_op(extent_op); 2588 extent_op = NULL; 2589 } 2590 2591 if (extent_op) { 2592 spin_unlock(&locked_ref->lock); 2593 ret = run_delayed_extent_op(trans, root, 2594 ref, extent_op); 2595 btrfs_free_delayed_extent_op(extent_op); 2596 2597 if (ret) { 2598 /* 2599 * Need to reset must_insert_reserved if 2600 * there was an error so the abort stuff 2601 * can cleanup the reserved space 2602 * properly. 2603 */ 2604 if (must_insert_reserved) 2605 locked_ref->must_insert_reserved = 1; 2606 locked_ref->processing = 0; 2607 btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret); 2608 btrfs_delayed_ref_unlock(locked_ref); 2609 return ret; 2610 } 2611 continue; 2612 } 2613 2614 /* 2615 * Need to drop our head ref lock and re-acquire the 2616 * delayed ref lock and then re-check to make sure 2617 * nobody got added. 2618 */ 2619 spin_unlock(&locked_ref->lock); 2620 spin_lock(&delayed_refs->lock); 2621 spin_lock(&locked_ref->lock); 2622 if (!list_empty(&locked_ref->ref_list) || 2623 locked_ref->extent_op) { 2624 spin_unlock(&locked_ref->lock); 2625 spin_unlock(&delayed_refs->lock); 2626 continue; 2627 } 2628 ref->in_tree = 0; 2629 delayed_refs->num_heads--; 2630 rb_erase(&locked_ref->href_node, 2631 &delayed_refs->href_root); 2632 spin_unlock(&delayed_refs->lock); 2633 } else { 2634 actual_count++; 2635 ref->in_tree = 0; 2636 list_del(&ref->list); 2637 } 2638 atomic_dec(&delayed_refs->num_entries); 2639 2640 if (!btrfs_delayed_ref_is_head(ref)) { 2641 /* 2642 * when we play the delayed ref, also correct the 2643 * ref_mod on head 2644 */ 2645 switch (ref->action) { 2646 case BTRFS_ADD_DELAYED_REF: 2647 case BTRFS_ADD_DELAYED_EXTENT: 2648 locked_ref->node.ref_mod -= ref->ref_mod; 2649 break; 2650 case BTRFS_DROP_DELAYED_REF: 2651 locked_ref->node.ref_mod += ref->ref_mod; 2652 break; 2653 default: 2654 WARN_ON(1); 2655 } 2656 } 2657 spin_unlock(&locked_ref->lock); 2658 2659 ret = run_one_delayed_ref(trans, root, ref, extent_op, 2660 must_insert_reserved); 2661 2662 btrfs_free_delayed_extent_op(extent_op); 2663 if (ret) { 2664 locked_ref->processing = 0; 2665 btrfs_delayed_ref_unlock(locked_ref); 2666 btrfs_put_delayed_ref(ref); 2667 btrfs_debug(fs_info, "run_one_delayed_ref returned %d", ret); 2668 return ret; 2669 } 2670 2671 /* 2672 * If this node is a head, that means all the refs in this head 2673 * have been dealt with, and we will pick the next head to deal 2674 * with, so we must unlock the head and drop it from the cluster 2675 * list before we release it. 
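		 * For data extents whose overall ref_mod went negative we also
		 * shrink the pending_csums accounting below, since dropping the
		 * extent drops its checksum items as well.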
2676 */ 2677 if (btrfs_delayed_ref_is_head(ref)) { 2678 if (locked_ref->is_data && 2679 locked_ref->total_ref_mod < 0) { 2680 spin_lock(&delayed_refs->lock); 2681 delayed_refs->pending_csums -= ref->num_bytes; 2682 spin_unlock(&delayed_refs->lock); 2683 } 2684 btrfs_delayed_ref_unlock(locked_ref); 2685 locked_ref = NULL; 2686 } 2687 btrfs_put_delayed_ref(ref); 2688 count++; 2689 cond_resched(); 2690 } 2691 2692 /* 2693 * We don't want to include ref heads since we can have empty ref heads 2694 * and those will drastically skew our runtime down since we just do 2695 * accounting, no actual extent tree updates. 2696 */ 2697 if (actual_count > 0) { 2698 u64 runtime = ktime_to_ns(ktime_sub(ktime_get(), start)); 2699 u64 avg; 2700 2701 /* 2702 * We weigh the current average higher than our current runtime 2703 * to avoid large swings in the average. 2704 */ 2705 spin_lock(&delayed_refs->lock); 2706 avg = fs_info->avg_delayed_ref_runtime * 3 + runtime; 2707 fs_info->avg_delayed_ref_runtime = avg >> 2; /* div by 4 */ 2708 spin_unlock(&delayed_refs->lock); 2709 } 2710 return 0; 2711 } 2712 2713 #ifdef SCRAMBLE_DELAYED_REFS 2714 /* 2715 * Normally delayed refs get processed in ascending bytenr order. This 2716 * correlates in most cases to the order added. To expose dependencies on this 2717 * order, we start to process the tree in the middle instead of the beginning 2718 */ 2719 static u64 find_middle(struct rb_root *root) 2720 { 2721 struct rb_node *n = root->rb_node; 2722 struct btrfs_delayed_ref_node *entry; 2723 int alt = 1; 2724 u64 middle; 2725 u64 first = 0, last = 0; 2726 2727 n = rb_first(root); 2728 if (n) { 2729 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node); 2730 first = entry->bytenr; 2731 } 2732 n = rb_last(root); 2733 if (n) { 2734 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node); 2735 last = entry->bytenr; 2736 } 2737 n = root->rb_node; 2738 2739 while (n) { 2740 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node); 2741 WARN_ON(!entry->in_tree); 2742 2743 middle = entry->bytenr; 2744 2745 if (alt) 2746 n = n->rb_left; 2747 else 2748 n = n->rb_right; 2749 2750 alt = 1 - alt; 2751 } 2752 return middle; 2753 } 2754 #endif 2755 2756 static inline u64 heads_to_leaves(struct btrfs_root *root, u64 heads) 2757 { 2758 u64 num_bytes; 2759 2760 num_bytes = heads * (sizeof(struct btrfs_extent_item) + 2761 sizeof(struct btrfs_extent_inline_ref)); 2762 if (!btrfs_fs_incompat(root->fs_info, SKINNY_METADATA)) 2763 num_bytes += heads * sizeof(struct btrfs_tree_block_info); 2764 2765 /* 2766 * We don't ever fill up leaves all the way so multiply by 2 just to be 2767 * closer to what we're really going to want to use. 2768 */ 2769 return div_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(root)); 2770 } 2771 2772 /* 2773 * Takes the number of bytes to be csumm'ed and figures out how many leaves it 2774 * would require to store the csums for that many bytes. 
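 * Roughly one checksum is needed per sectorsize block, and a leaf holds about
 * BTRFS_MAX_ITEM_SIZE / csum_size of them, so e.g. with 4K sectors and 4 byte
 * crc32c checksums a single leaf covers on the order of 16MB of data.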
2775 */ 2776 u64 btrfs_csum_bytes_to_leaves(struct btrfs_root *root, u64 csum_bytes) 2777 { 2778 u64 csum_size; 2779 u64 num_csums_per_leaf; 2780 u64 num_csums; 2781 2782 csum_size = BTRFS_MAX_ITEM_SIZE(root); 2783 num_csums_per_leaf = div64_u64(csum_size, 2784 (u64)btrfs_super_csum_size(root->fs_info->super_copy)); 2785 num_csums = div64_u64(csum_bytes, root->sectorsize); 2786 num_csums += num_csums_per_leaf - 1; 2787 num_csums = div64_u64(num_csums, num_csums_per_leaf); 2788 return num_csums; 2789 } 2790 2791 int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans, 2792 struct btrfs_root *root) 2793 { 2794 struct btrfs_block_rsv *global_rsv; 2795 u64 num_heads = trans->transaction->delayed_refs.num_heads_ready; 2796 u64 csum_bytes = trans->transaction->delayed_refs.pending_csums; 2797 u64 num_dirty_bgs = trans->transaction->num_dirty_bgs; 2798 u64 num_bytes, num_dirty_bgs_bytes; 2799 int ret = 0; 2800 2801 num_bytes = btrfs_calc_trans_metadata_size(root, 1); 2802 num_heads = heads_to_leaves(root, num_heads); 2803 if (num_heads > 1) 2804 num_bytes += (num_heads - 1) * root->nodesize; 2805 num_bytes <<= 1; 2806 num_bytes += btrfs_csum_bytes_to_leaves(root, csum_bytes) * root->nodesize; 2807 num_dirty_bgs_bytes = btrfs_calc_trans_metadata_size(root, 2808 num_dirty_bgs); 2809 global_rsv = &root->fs_info->global_block_rsv; 2810 2811 /* 2812 * If we can't allocate any more chunks lets make sure we have _lots_ of 2813 * wiggle room since running delayed refs can create more delayed refs. 2814 */ 2815 if (global_rsv->space_info->full) { 2816 num_dirty_bgs_bytes <<= 1; 2817 num_bytes <<= 1; 2818 } 2819 2820 spin_lock(&global_rsv->lock); 2821 if (global_rsv->reserved <= num_bytes + num_dirty_bgs_bytes) 2822 ret = 1; 2823 spin_unlock(&global_rsv->lock); 2824 return ret; 2825 } 2826 2827 int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans, 2828 struct btrfs_root *root) 2829 { 2830 struct btrfs_fs_info *fs_info = root->fs_info; 2831 u64 num_entries = 2832 atomic_read(&trans->transaction->delayed_refs.num_entries); 2833 u64 avg_runtime; 2834 u64 val; 2835 2836 smp_mb(); 2837 avg_runtime = fs_info->avg_delayed_ref_runtime; 2838 val = num_entries * avg_runtime; 2839 if (num_entries * avg_runtime >= NSEC_PER_SEC) 2840 return 1; 2841 if (val >= NSEC_PER_SEC / 2) 2842 return 2; 2843 2844 return btrfs_check_space_for_delayed_refs(trans, root); 2845 } 2846 2847 struct async_delayed_refs { 2848 struct btrfs_root *root; 2849 u64 transid; 2850 int count; 2851 int error; 2852 int sync; 2853 struct completion wait; 2854 struct btrfs_work work; 2855 }; 2856 2857 static void delayed_ref_async_start(struct btrfs_work *work) 2858 { 2859 struct async_delayed_refs *async; 2860 struct btrfs_trans_handle *trans; 2861 int ret; 2862 2863 async = container_of(work, struct async_delayed_refs, work); 2864 2865 /* if the commit is already started, we don't need to wait here */ 2866 if (btrfs_transaction_blocked(async->root->fs_info)) 2867 goto done; 2868 2869 trans = btrfs_join_transaction(async->root); 2870 if (IS_ERR(trans)) { 2871 async->error = PTR_ERR(trans); 2872 goto done; 2873 } 2874 2875 /* 2876 * trans->sync means that when we call end_transaction, we won't 2877 * wait on delayed refs 2878 */ 2879 trans->sync = true; 2880 2881 /* Don't bother flushing if we got into a different transaction */ 2882 if (trans->transid > async->transid) 2883 goto end; 2884 2885 ret = btrfs_run_delayed_refs(trans, async->root, async->count); 2886 if (ret) 2887 async->error = ret; 2888 end: 2889 ret = 
btrfs_end_transaction(trans, async->root); 2890 if (ret && !async->error) 2891 async->error = ret; 2892 done: 2893 if (async->sync) 2894 complete(&async->wait); 2895 else 2896 kfree(async); 2897 } 2898 2899 int btrfs_async_run_delayed_refs(struct btrfs_root *root, 2900 unsigned long count, u64 transid, int wait) 2901 { 2902 struct async_delayed_refs *async; 2903 int ret; 2904 2905 async = kmalloc(sizeof(*async), GFP_NOFS); 2906 if (!async) 2907 return -ENOMEM; 2908 2909 async->root = root->fs_info->tree_root; 2910 async->count = count; 2911 async->error = 0; 2912 async->transid = transid; 2913 if (wait) 2914 async->sync = 1; 2915 else 2916 async->sync = 0; 2917 init_completion(&async->wait); 2918 2919 btrfs_init_work(&async->work, btrfs_extent_refs_helper, 2920 delayed_ref_async_start, NULL, NULL); 2921 2922 btrfs_queue_work(root->fs_info->extent_workers, &async->work); 2923 2924 if (wait) { 2925 wait_for_completion(&async->wait); 2926 ret = async->error; 2927 kfree(async); 2928 return ret; 2929 } 2930 return 0; 2931 } 2932 2933 /* 2934 * this starts processing the delayed reference count updates and 2935 * extent insertions we have queued up so far. count can be 2936 * 0, which means to process everything in the tree at the start 2937 * of the run (but not newly added entries), or it can be some target 2938 * number you'd like to process. 2939 * 2940 * Returns 0 on success or if called with an aborted transaction 2941 * Returns <0 on error and aborts the transaction 2942 */ 2943 int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, 2944 struct btrfs_root *root, unsigned long count) 2945 { 2946 struct rb_node *node; 2947 struct btrfs_delayed_ref_root *delayed_refs; 2948 struct btrfs_delayed_ref_head *head; 2949 int ret; 2950 int run_all = count == (unsigned long)-1; 2951 bool can_flush_pending_bgs = trans->can_flush_pending_bgs; 2952 2953 /* We'll clean this up in btrfs_cleanup_transaction */ 2954 if (trans->aborted) 2955 return 0; 2956 2957 if (root->fs_info->creating_free_space_tree) 2958 return 0; 2959 2960 if (root == root->fs_info->extent_root) 2961 root = root->fs_info->tree_root; 2962 2963 delayed_refs = &trans->transaction->delayed_refs; 2964 if (count == 0) 2965 count = atomic_read(&delayed_refs->num_entries) * 2; 2966 2967 again: 2968 #ifdef SCRAMBLE_DELAYED_REFS 2969 delayed_refs->run_delayed_start = find_middle(&delayed_refs->root); 2970 #endif 2971 trans->can_flush_pending_bgs = false; 2972 ret = __btrfs_run_delayed_refs(trans, root, count); 2973 if (ret < 0) { 2974 btrfs_abort_transaction(trans, ret); 2975 return ret; 2976 } 2977 2978 if (run_all) { 2979 if (!list_empty(&trans->new_bgs)) 2980 btrfs_create_pending_block_groups(trans, root); 2981 2982 spin_lock(&delayed_refs->lock); 2983 node = rb_first(&delayed_refs->href_root); 2984 if (!node) { 2985 spin_unlock(&delayed_refs->lock); 2986 goto out; 2987 } 2988 count = (unsigned long)-1; 2989 2990 while (node) { 2991 head = rb_entry(node, struct btrfs_delayed_ref_head, 2992 href_node); 2993 if (btrfs_delayed_ref_is_head(&head->node)) { 2994 struct btrfs_delayed_ref_node *ref; 2995 2996 ref = &head->node; 2997 atomic_inc(&ref->refs); 2998 2999 spin_unlock(&delayed_refs->lock); 3000 /* 3001 * Mutex was contended, block until it's 3002 * released and try again 3003 */ 3004 mutex_lock(&head->mutex); 3005 mutex_unlock(&head->mutex); 3006 3007 btrfs_put_delayed_ref(ref); 3008 cond_resched(); 3009 goto again; 3010 } else { 3011 WARN_ON(1); 3012 } 3013 node = rb_next(node); 3014 } 3015 spin_unlock(&delayed_refs->lock); 3016 
cond_resched(); 3017 goto again; 3018 } 3019 out: 3020 assert_qgroups_uptodate(trans); 3021 trans->can_flush_pending_bgs = can_flush_pending_bgs; 3022 return 0; 3023 } 3024 3025 int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, 3026 struct btrfs_root *root, 3027 u64 bytenr, u64 num_bytes, u64 flags, 3028 int level, int is_data) 3029 { 3030 struct btrfs_delayed_extent_op *extent_op; 3031 int ret; 3032 3033 extent_op = btrfs_alloc_delayed_extent_op(); 3034 if (!extent_op) 3035 return -ENOMEM; 3036 3037 extent_op->flags_to_set = flags; 3038 extent_op->update_flags = true; 3039 extent_op->update_key = false; 3040 extent_op->is_data = is_data ? true : false; 3041 extent_op->level = level; 3042 3043 ret = btrfs_add_delayed_extent_op(root->fs_info, trans, bytenr, 3044 num_bytes, extent_op); 3045 if (ret) 3046 btrfs_free_delayed_extent_op(extent_op); 3047 return ret; 3048 } 3049 3050 static noinline int check_delayed_ref(struct btrfs_trans_handle *trans, 3051 struct btrfs_root *root, 3052 struct btrfs_path *path, 3053 u64 objectid, u64 offset, u64 bytenr) 3054 { 3055 struct btrfs_delayed_ref_head *head; 3056 struct btrfs_delayed_ref_node *ref; 3057 struct btrfs_delayed_data_ref *data_ref; 3058 struct btrfs_delayed_ref_root *delayed_refs; 3059 int ret = 0; 3060 3061 delayed_refs = &trans->transaction->delayed_refs; 3062 spin_lock(&delayed_refs->lock); 3063 head = btrfs_find_delayed_ref_head(trans, bytenr); 3064 if (!head) { 3065 spin_unlock(&delayed_refs->lock); 3066 return 0; 3067 } 3068 3069 if (!mutex_trylock(&head->mutex)) { 3070 atomic_inc(&head->node.refs); 3071 spin_unlock(&delayed_refs->lock); 3072 3073 btrfs_release_path(path); 3074 3075 /* 3076 * Mutex was contended, block until it's released and let 3077 * caller try again 3078 */ 3079 mutex_lock(&head->mutex); 3080 mutex_unlock(&head->mutex); 3081 btrfs_put_delayed_ref(&head->node); 3082 return -EAGAIN; 3083 } 3084 spin_unlock(&delayed_refs->lock); 3085 3086 spin_lock(&head->lock); 3087 list_for_each_entry(ref, &head->ref_list, list) { 3088 /* If it's a shared ref we know a cross reference exists */ 3089 if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) { 3090 ret = 1; 3091 break; 3092 } 3093 3094 data_ref = btrfs_delayed_node_to_data_ref(ref); 3095 3096 /* 3097 * If our ref doesn't match the one we're currently looking at 3098 * then we have a cross reference. 
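		 * (In that case the extent is shared and e.g. nocow overwrites
		 * of it must not be allowed.)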
3099 */ 3100 if (data_ref->root != root->root_key.objectid || 3101 data_ref->objectid != objectid || 3102 data_ref->offset != offset) { 3103 ret = 1; 3104 break; 3105 } 3106 } 3107 spin_unlock(&head->lock); 3108 mutex_unlock(&head->mutex); 3109 return ret; 3110 } 3111 3112 static noinline int check_committed_ref(struct btrfs_trans_handle *trans, 3113 struct btrfs_root *root, 3114 struct btrfs_path *path, 3115 u64 objectid, u64 offset, u64 bytenr) 3116 { 3117 struct btrfs_root *extent_root = root->fs_info->extent_root; 3118 struct extent_buffer *leaf; 3119 struct btrfs_extent_data_ref *ref; 3120 struct btrfs_extent_inline_ref *iref; 3121 struct btrfs_extent_item *ei; 3122 struct btrfs_key key; 3123 u32 item_size; 3124 int ret; 3125 3126 key.objectid = bytenr; 3127 key.offset = (u64)-1; 3128 key.type = BTRFS_EXTENT_ITEM_KEY; 3129 3130 ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); 3131 if (ret < 0) 3132 goto out; 3133 BUG_ON(ret == 0); /* Corruption */ 3134 3135 ret = -ENOENT; 3136 if (path->slots[0] == 0) 3137 goto out; 3138 3139 path->slots[0]--; 3140 leaf = path->nodes[0]; 3141 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 3142 3143 if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY) 3144 goto out; 3145 3146 ret = 1; 3147 item_size = btrfs_item_size_nr(leaf, path->slots[0]); 3148 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 3149 if (item_size < sizeof(*ei)) { 3150 WARN_ON(item_size != sizeof(struct btrfs_extent_item_v0)); 3151 goto out; 3152 } 3153 #endif 3154 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 3155 3156 if (item_size != sizeof(*ei) + 3157 btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY)) 3158 goto out; 3159 3160 if (btrfs_extent_generation(leaf, ei) <= 3161 btrfs_root_last_snapshot(&root->root_item)) 3162 goto out; 3163 3164 iref = (struct btrfs_extent_inline_ref *)(ei + 1); 3165 if (btrfs_extent_inline_ref_type(leaf, iref) != 3166 BTRFS_EXTENT_DATA_REF_KEY) 3167 goto out; 3168 3169 ref = (struct btrfs_extent_data_ref *)(&iref->offset); 3170 if (btrfs_extent_refs(leaf, ei) != 3171 btrfs_extent_data_ref_count(leaf, ref) || 3172 btrfs_extent_data_ref_root(leaf, ref) != 3173 root->root_key.objectid || 3174 btrfs_extent_data_ref_objectid(leaf, ref) != objectid || 3175 btrfs_extent_data_ref_offset(leaf, ref) != offset) 3176 goto out; 3177 3178 ret = 0; 3179 out: 3180 return ret; 3181 } 3182 3183 int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans, 3184 struct btrfs_root *root, 3185 u64 objectid, u64 offset, u64 bytenr) 3186 { 3187 struct btrfs_path *path; 3188 int ret; 3189 int ret2; 3190 3191 path = btrfs_alloc_path(); 3192 if (!path) 3193 return -ENOENT; 3194 3195 do { 3196 ret = check_committed_ref(trans, root, path, objectid, 3197 offset, bytenr); 3198 if (ret && ret != -ENOENT) 3199 goto out; 3200 3201 ret2 = check_delayed_ref(trans, root, path, objectid, 3202 offset, bytenr); 3203 } while (ret2 == -EAGAIN); 3204 3205 if (ret2 && ret2 != -ENOENT) { 3206 ret = ret2; 3207 goto out; 3208 } 3209 3210 if (ret != -ENOENT || ret2 != -ENOENT) 3211 ret = 0; 3212 out: 3213 btrfs_free_path(path); 3214 if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID) 3215 WARN_ON(ret > 0); 3216 return ret; 3217 } 3218 3219 static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, 3220 struct btrfs_root *root, 3221 struct extent_buffer *buf, 3222 int full_backref, int inc) 3223 { 3224 u64 bytenr; 3225 u64 num_bytes; 3226 u64 parent; 3227 u64 ref_root; 3228 u32 nritems; 3229 struct btrfs_key key; 3230 struct btrfs_file_extent_item 
*fi; 3231 int i; 3232 int level; 3233 int ret = 0; 3234 int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *, 3235 u64, u64, u64, u64, u64, u64); 3236 3237 3238 if (btrfs_is_testing(root->fs_info)) 3239 return 0; 3240 3241 ref_root = btrfs_header_owner(buf); 3242 nritems = btrfs_header_nritems(buf); 3243 level = btrfs_header_level(buf); 3244 3245 if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state) && level == 0) 3246 return 0; 3247 3248 if (inc) 3249 process_func = btrfs_inc_extent_ref; 3250 else 3251 process_func = btrfs_free_extent; 3252 3253 if (full_backref) 3254 parent = buf->start; 3255 else 3256 parent = 0; 3257 3258 for (i = 0; i < nritems; i++) { 3259 if (level == 0) { 3260 btrfs_item_key_to_cpu(buf, &key, i); 3261 if (key.type != BTRFS_EXTENT_DATA_KEY) 3262 continue; 3263 fi = btrfs_item_ptr(buf, i, 3264 struct btrfs_file_extent_item); 3265 if (btrfs_file_extent_type(buf, fi) == 3266 BTRFS_FILE_EXTENT_INLINE) 3267 continue; 3268 bytenr = btrfs_file_extent_disk_bytenr(buf, fi); 3269 if (bytenr == 0) 3270 continue; 3271 3272 num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi); 3273 key.offset -= btrfs_file_extent_offset(buf, fi); 3274 ret = process_func(trans, root, bytenr, num_bytes, 3275 parent, ref_root, key.objectid, 3276 key.offset); 3277 if (ret) 3278 goto fail; 3279 } else { 3280 bytenr = btrfs_node_blockptr(buf, i); 3281 num_bytes = root->nodesize; 3282 ret = process_func(trans, root, bytenr, num_bytes, 3283 parent, ref_root, level - 1, 0); 3284 if (ret) 3285 goto fail; 3286 } 3287 } 3288 return 0; 3289 fail: 3290 return ret; 3291 } 3292 3293 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, 3294 struct extent_buffer *buf, int full_backref) 3295 { 3296 return __btrfs_mod_ref(trans, root, buf, full_backref, 1); 3297 } 3298 3299 int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, 3300 struct extent_buffer *buf, int full_backref) 3301 { 3302 return __btrfs_mod_ref(trans, root, buf, full_backref, 0); 3303 } 3304 3305 static int write_one_cache_group(struct btrfs_trans_handle *trans, 3306 struct btrfs_root *root, 3307 struct btrfs_path *path, 3308 struct btrfs_block_group_cache *cache) 3309 { 3310 int ret; 3311 struct btrfs_root *extent_root = root->fs_info->extent_root; 3312 unsigned long bi; 3313 struct extent_buffer *leaf; 3314 3315 ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1); 3316 if (ret) { 3317 if (ret > 0) 3318 ret = -ENOENT; 3319 goto fail; 3320 } 3321 3322 leaf = path->nodes[0]; 3323 bi = btrfs_item_ptr_offset(leaf, path->slots[0]); 3324 write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item)); 3325 btrfs_mark_buffer_dirty(leaf); 3326 fail: 3327 btrfs_release_path(path); 3328 return ret; 3329 3330 } 3331 3332 static struct btrfs_block_group_cache * 3333 next_block_group(struct btrfs_root *root, 3334 struct btrfs_block_group_cache *cache) 3335 { 3336 struct rb_node *node; 3337 3338 spin_lock(&root->fs_info->block_group_cache_lock); 3339 3340 /* If our block group was removed, we need a full search. 
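	 * (The removal path clears cache_node, and key.objectid + key.offset
	 * is the first bytenr past this block group, so searching from there
	 * gives us the next group.)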
*/ 3341 if (RB_EMPTY_NODE(&cache->cache_node)) { 3342 const u64 next_bytenr = cache->key.objectid + cache->key.offset; 3343 3344 spin_unlock(&root->fs_info->block_group_cache_lock); 3345 btrfs_put_block_group(cache); 3346 cache = btrfs_lookup_first_block_group(root->fs_info, 3347 next_bytenr); 3348 return cache; 3349 } 3350 node = rb_next(&cache->cache_node); 3351 btrfs_put_block_group(cache); 3352 if (node) { 3353 cache = rb_entry(node, struct btrfs_block_group_cache, 3354 cache_node); 3355 btrfs_get_block_group(cache); 3356 } else 3357 cache = NULL; 3358 spin_unlock(&root->fs_info->block_group_cache_lock); 3359 return cache; 3360 } 3361 3362 static int cache_save_setup(struct btrfs_block_group_cache *block_group, 3363 struct btrfs_trans_handle *trans, 3364 struct btrfs_path *path) 3365 { 3366 struct btrfs_root *root = block_group->fs_info->tree_root; 3367 struct inode *inode = NULL; 3368 u64 alloc_hint = 0; 3369 int dcs = BTRFS_DC_ERROR; 3370 u64 num_pages = 0; 3371 int retries = 0; 3372 int ret = 0; 3373 3374 /* 3375 * If this block group is smaller than 100 megs don't bother caching the 3376 * block group. 3377 */ 3378 if (block_group->key.offset < (100 * SZ_1M)) { 3379 spin_lock(&block_group->lock); 3380 block_group->disk_cache_state = BTRFS_DC_WRITTEN; 3381 spin_unlock(&block_group->lock); 3382 return 0; 3383 } 3384 3385 if (trans->aborted) 3386 return 0; 3387 again: 3388 inode = lookup_free_space_inode(root, block_group, path); 3389 if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) { 3390 ret = PTR_ERR(inode); 3391 btrfs_release_path(path); 3392 goto out; 3393 } 3394 3395 if (IS_ERR(inode)) { 3396 BUG_ON(retries); 3397 retries++; 3398 3399 if (block_group->ro) 3400 goto out_free; 3401 3402 ret = create_free_space_inode(root, trans, block_group, path); 3403 if (ret) 3404 goto out_free; 3405 goto again; 3406 } 3407 3408 /* We've already setup this transaction, go ahead and exit */ 3409 if (block_group->cache_generation == trans->transid && 3410 i_size_read(inode)) { 3411 dcs = BTRFS_DC_SETUP; 3412 goto out_put; 3413 } 3414 3415 /* 3416 * We want to set the generation to 0, that way if anything goes wrong 3417 * from here on out we know not to trust this cache when we load up next 3418 * time. 3419 */ 3420 BTRFS_I(inode)->generation = 0; 3421 ret = btrfs_update_inode(trans, root, inode); 3422 if (ret) { 3423 /* 3424 * So theoretically we could recover from this, simply set the 3425 * super cache generation to 0 so we know to invalidate the 3426 * cache, but then we'd have to keep track of the block groups 3427 * that fail this way so we know we _have_ to reset this cache 3428 * before the next commit or risk reading stale cache. So to 3429 * limit our exposure to horrible edge cases lets just abort the 3430 * transaction, this only happens in really bad situations 3431 * anyway. 3432 */ 3433 btrfs_abort_transaction(trans, ret); 3434 goto out_put; 3435 } 3436 WARN_ON(ret); 3437 3438 if (i_size_read(inode) > 0) { 3439 ret = btrfs_check_trunc_cache_free_space(root, 3440 &root->fs_info->global_block_rsv); 3441 if (ret) 3442 goto out_put; 3443 3444 ret = btrfs_truncate_free_space_cache(root, trans, NULL, inode); 3445 if (ret) 3446 goto out_put; 3447 } 3448 3449 spin_lock(&block_group->lock); 3450 if (block_group->cached != BTRFS_CACHE_FINISHED || 3451 !btrfs_test_opt(root->fs_info, SPACE_CACHE)) { 3452 /* 3453 * don't bother trying to write stuff out _if_ 3454 * a) we're not cached, 3455 * b) we're with nospace_cache mount option. 
3456 */ 3457 dcs = BTRFS_DC_WRITTEN; 3458 spin_unlock(&block_group->lock); 3459 goto out_put; 3460 } 3461 spin_unlock(&block_group->lock); 3462 3463 /* 3464 * We hit an ENOSPC when setting up the cache in this transaction, just 3465 * skip doing the setup, we've already cleared the cache so we're safe. 3466 */ 3467 if (test_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags)) { 3468 ret = -ENOSPC; 3469 goto out_put; 3470 } 3471 3472 /* 3473 * Try to preallocate enough space based on how big the block group is. 3474 * Keep in mind this has to include any pinned space which could end up 3475 * taking up quite a bit since it's not folded into the other space 3476 * cache. 3477 */ 3478 num_pages = div_u64(block_group->key.offset, SZ_256M); 3479 if (!num_pages) 3480 num_pages = 1; 3481 3482 num_pages *= 16; 3483 num_pages *= PAGE_SIZE; 3484 3485 ret = btrfs_check_data_free_space(inode, 0, num_pages); 3486 if (ret) 3487 goto out_put; 3488 3489 ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages, 3490 num_pages, num_pages, 3491 &alloc_hint); 3492 /* 3493 * Our cache requires contiguous chunks so that we don't modify a bunch 3494 * of metadata or split extents when writing the cache out, which means 3495 * we can enospc if we are heavily fragmented in addition to just normal 3496 * out of space conditions. So if we hit this just skip setting up any 3497 * other block groups for this transaction, maybe we'll unpin enough 3498 * space the next time around. 3499 */ 3500 if (!ret) 3501 dcs = BTRFS_DC_SETUP; 3502 else if (ret == -ENOSPC) 3503 set_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags); 3504 btrfs_free_reserved_data_space(inode, 0, num_pages); 3505 3506 out_put: 3507 iput(inode); 3508 out_free: 3509 btrfs_release_path(path); 3510 out: 3511 spin_lock(&block_group->lock); 3512 if (!ret && dcs == BTRFS_DC_SETUP) 3513 block_group->cache_generation = trans->transid; 3514 block_group->disk_cache_state = dcs; 3515 spin_unlock(&block_group->lock); 3516 3517 return ret; 3518 } 3519 3520 int btrfs_setup_space_cache(struct btrfs_trans_handle *trans, 3521 struct btrfs_root *root) 3522 { 3523 struct btrfs_block_group_cache *cache, *tmp; 3524 struct btrfs_transaction *cur_trans = trans->transaction; 3525 struct btrfs_path *path; 3526 3527 if (list_empty(&cur_trans->dirty_bgs) || 3528 !btrfs_test_opt(root->fs_info, SPACE_CACHE)) 3529 return 0; 3530 3531 path = btrfs_alloc_path(); 3532 if (!path) 3533 return -ENOMEM; 3534 3535 /* Could add new block groups, use _safe just in case */ 3536 list_for_each_entry_safe(cache, tmp, &cur_trans->dirty_bgs, 3537 dirty_list) { 3538 if (cache->disk_cache_state == BTRFS_DC_CLEAR) 3539 cache_save_setup(cache, trans, path); 3540 } 3541 3542 btrfs_free_path(path); 3543 return 0; 3544 } 3545 3546 /* 3547 * transaction commit does final block group cache writeback during a 3548 * critical section where nothing is allowed to change the FS. This is 3549 * required in order for the cache to actually match the block group, 3550 * but can introduce a lot of latency into the commit. 3551 * 3552 * So, btrfs_start_dirty_block_groups is here to kick off block group 3553 * cache IO. There's a chance we'll have to redo some of it if the 3554 * block group changes again during the commit, but it greatly reduces 3555 * the commit latency by getting rid of the easy block groups while 3556 * we're still allowing others to join the commit. 
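 * Any block group that is dirtied again after being written here is simply
 * picked up once more by btrfs_write_dirty_block_groups() in the critical
 * section.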
3557 */ 3558 int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans, 3559 struct btrfs_root *root) 3560 { 3561 struct btrfs_block_group_cache *cache; 3562 struct btrfs_transaction *cur_trans = trans->transaction; 3563 int ret = 0; 3564 int should_put; 3565 struct btrfs_path *path = NULL; 3566 LIST_HEAD(dirty); 3567 struct list_head *io = &cur_trans->io_bgs; 3568 int num_started = 0; 3569 int loops = 0; 3570 3571 spin_lock(&cur_trans->dirty_bgs_lock); 3572 if (list_empty(&cur_trans->dirty_bgs)) { 3573 spin_unlock(&cur_trans->dirty_bgs_lock); 3574 return 0; 3575 } 3576 list_splice_init(&cur_trans->dirty_bgs, &dirty); 3577 spin_unlock(&cur_trans->dirty_bgs_lock); 3578 3579 again: 3580 /* 3581 * make sure all the block groups on our dirty list actually 3582 * exist 3583 */ 3584 btrfs_create_pending_block_groups(trans, root); 3585 3586 if (!path) { 3587 path = btrfs_alloc_path(); 3588 if (!path) 3589 return -ENOMEM; 3590 } 3591 3592 /* 3593 * cache_write_mutex is here only to save us from balance or automatic 3594 * removal of empty block groups deleting this block group while we are 3595 * writing out the cache 3596 */ 3597 mutex_lock(&trans->transaction->cache_write_mutex); 3598 while (!list_empty(&dirty)) { 3599 cache = list_first_entry(&dirty, 3600 struct btrfs_block_group_cache, 3601 dirty_list); 3602 /* 3603 * this can happen if something re-dirties a block 3604 * group that is already under IO. Just wait for it to 3605 * finish and then do it all again 3606 */ 3607 if (!list_empty(&cache->io_list)) { 3608 list_del_init(&cache->io_list); 3609 btrfs_wait_cache_io(root, trans, cache, 3610 &cache->io_ctl, path, 3611 cache->key.objectid); 3612 btrfs_put_block_group(cache); 3613 } 3614 3615 3616 /* 3617 * btrfs_wait_cache_io uses the cache->dirty_list to decide 3618 * if it should update the cache_state. Don't delete 3619 * until after we wait. 3620 * 3621 * Since we're not running in the commit critical section 3622 * we need the dirty_bgs_lock to protect from update_block_group 3623 */ 3624 spin_lock(&cur_trans->dirty_bgs_lock); 3625 list_del_init(&cache->dirty_list); 3626 spin_unlock(&cur_trans->dirty_bgs_lock); 3627 3628 should_put = 1; 3629 3630 cache_save_setup(cache, trans, path); 3631 3632 if (cache->disk_cache_state == BTRFS_DC_SETUP) { 3633 cache->io_ctl.inode = NULL; 3634 ret = btrfs_write_out_cache(root, trans, cache, path); 3635 if (ret == 0 && cache->io_ctl.inode) { 3636 num_started++; 3637 should_put = 0; 3638 3639 /* 3640 * the cache_write_mutex is protecting 3641 * the io_list 3642 */ 3643 list_add_tail(&cache->io_list, io); 3644 } else { 3645 /* 3646 * if we failed to write the cache, the 3647 * generation will be bad and life goes on 3648 */ 3649 ret = 0; 3650 } 3651 } 3652 if (!ret) { 3653 ret = write_one_cache_group(trans, root, path, cache); 3654 /* 3655 * Our block group might still be attached to the list 3656 * of new block groups in the transaction handle of some 3657 * other task (struct btrfs_trans_handle->new_bgs). This 3658 * means its block group item isn't yet in the extent 3659 * tree. If this happens ignore the error, as we will 3660 * try again later in the critical section of the 3661 * transaction commit. 
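			 * (The retry is arranged by putting the block group
			 * back on the dirty_bgs list below.)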
3662 */ 3663 if (ret == -ENOENT) { 3664 ret = 0; 3665 spin_lock(&cur_trans->dirty_bgs_lock); 3666 if (list_empty(&cache->dirty_list)) { 3667 list_add_tail(&cache->dirty_list, 3668 &cur_trans->dirty_bgs); 3669 btrfs_get_block_group(cache); 3670 } 3671 spin_unlock(&cur_trans->dirty_bgs_lock); 3672 } else if (ret) { 3673 btrfs_abort_transaction(trans, ret); 3674 } 3675 } 3676 3677 /* if its not on the io list, we need to put the block group */ 3678 if (should_put) 3679 btrfs_put_block_group(cache); 3680 3681 if (ret) 3682 break; 3683 3684 /* 3685 * Avoid blocking other tasks for too long. It might even save 3686 * us from writing caches for block groups that are going to be 3687 * removed. 3688 */ 3689 mutex_unlock(&trans->transaction->cache_write_mutex); 3690 mutex_lock(&trans->transaction->cache_write_mutex); 3691 } 3692 mutex_unlock(&trans->transaction->cache_write_mutex); 3693 3694 /* 3695 * go through delayed refs for all the stuff we've just kicked off 3696 * and then loop back (just once) 3697 */ 3698 ret = btrfs_run_delayed_refs(trans, root, 0); 3699 if (!ret && loops == 0) { 3700 loops++; 3701 spin_lock(&cur_trans->dirty_bgs_lock); 3702 list_splice_init(&cur_trans->dirty_bgs, &dirty); 3703 /* 3704 * dirty_bgs_lock protects us from concurrent block group 3705 * deletes too (not just cache_write_mutex). 3706 */ 3707 if (!list_empty(&dirty)) { 3708 spin_unlock(&cur_trans->dirty_bgs_lock); 3709 goto again; 3710 } 3711 spin_unlock(&cur_trans->dirty_bgs_lock); 3712 } 3713 3714 btrfs_free_path(path); 3715 return ret; 3716 } 3717 3718 int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, 3719 struct btrfs_root *root) 3720 { 3721 struct btrfs_block_group_cache *cache; 3722 struct btrfs_transaction *cur_trans = trans->transaction; 3723 int ret = 0; 3724 int should_put; 3725 struct btrfs_path *path; 3726 struct list_head *io = &cur_trans->io_bgs; 3727 int num_started = 0; 3728 3729 path = btrfs_alloc_path(); 3730 if (!path) 3731 return -ENOMEM; 3732 3733 /* 3734 * Even though we are in the critical section of the transaction commit, 3735 * we can still have concurrent tasks adding elements to this 3736 * transaction's list of dirty block groups. These tasks correspond to 3737 * endio free space workers started when writeback finishes for a 3738 * space cache, which run inode.c:btrfs_finish_ordered_io(), and can 3739 * allocate new block groups as a result of COWing nodes of the root 3740 * tree when updating the free space inode. The writeback for the space 3741 * caches is triggered by an earlier call to 3742 * btrfs_start_dirty_block_groups() and iterations of the following 3743 * loop. 3744 * Also we want to do the cache_save_setup first and then run the 3745 * delayed refs to make sure we have the best chance at doing this all 3746 * in one shot. 3747 */ 3748 spin_lock(&cur_trans->dirty_bgs_lock); 3749 while (!list_empty(&cur_trans->dirty_bgs)) { 3750 cache = list_first_entry(&cur_trans->dirty_bgs, 3751 struct btrfs_block_group_cache, 3752 dirty_list); 3753 3754 /* 3755 * this can happen if cache_save_setup re-dirties a block 3756 * group that is already under IO. 
Just wait for it to
		 * finish and then do it all again
		 */
		if (!list_empty(&cache->io_list)) {
			spin_unlock(&cur_trans->dirty_bgs_lock);
			list_del_init(&cache->io_list);
			btrfs_wait_cache_io(root, trans, cache,
					    &cache->io_ctl, path,
					    cache->key.objectid);
			btrfs_put_block_group(cache);
			spin_lock(&cur_trans->dirty_bgs_lock);
		}

		/*
		 * don't remove from the dirty list until after we've waited
		 * on any pending IO
		 */
		list_del_init(&cache->dirty_list);
		spin_unlock(&cur_trans->dirty_bgs_lock);
		should_put = 1;

		cache_save_setup(cache, trans, path);

		if (!ret)
			ret = btrfs_run_delayed_refs(trans, root, (unsigned long) -1);

		if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP) {
			cache->io_ctl.inode = NULL;
			ret = btrfs_write_out_cache(root, trans, cache, path);
			if (ret == 0 && cache->io_ctl.inode) {
				num_started++;
				should_put = 0;
				list_add_tail(&cache->io_list, io);
			} else {
				/*
				 * if we failed to write the cache, the
				 * generation will be bad and life goes on
				 */
				ret = 0;
			}
		}
		if (!ret) {
			ret = write_one_cache_group(trans, root, path, cache);
			/*
			 * One of the free space endio workers might have
			 * created a new block group while updating a free space
			 * cache's inode (at inode.c:btrfs_finish_ordered_io())
			 * and hasn't released its transaction handle yet, in
			 * which case the new block group is still attached to
			 * its transaction handle and its creation has not
			 * finished yet (no block group item in the extent tree
			 * yet, etc). If this is the case, wait for all free
			 * space endio workers to finish and retry. This is a
			 * very rare case, so there is no need for a more
			 * efficient and complex approach.
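			 * (The wait_event() below waits until we are the only
			 * writer left on this transaction, which guarantees
			 * those workers have finished.)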
3811 */ 3812 if (ret == -ENOENT) { 3813 wait_event(cur_trans->writer_wait, 3814 atomic_read(&cur_trans->num_writers) == 1); 3815 ret = write_one_cache_group(trans, root, path, 3816 cache); 3817 } 3818 if (ret) 3819 btrfs_abort_transaction(trans, ret); 3820 } 3821 3822 /* if its not on the io list, we need to put the block group */ 3823 if (should_put) 3824 btrfs_put_block_group(cache); 3825 spin_lock(&cur_trans->dirty_bgs_lock); 3826 } 3827 spin_unlock(&cur_trans->dirty_bgs_lock); 3828 3829 while (!list_empty(io)) { 3830 cache = list_first_entry(io, struct btrfs_block_group_cache, 3831 io_list); 3832 list_del_init(&cache->io_list); 3833 btrfs_wait_cache_io(root, trans, cache, 3834 &cache->io_ctl, path, cache->key.objectid); 3835 btrfs_put_block_group(cache); 3836 } 3837 3838 btrfs_free_path(path); 3839 return ret; 3840 } 3841 3842 int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr) 3843 { 3844 struct btrfs_block_group_cache *block_group; 3845 int readonly = 0; 3846 3847 block_group = btrfs_lookup_block_group(root->fs_info, bytenr); 3848 if (!block_group || block_group->ro) 3849 readonly = 1; 3850 if (block_group) 3851 btrfs_put_block_group(block_group); 3852 return readonly; 3853 } 3854 3855 bool btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr) 3856 { 3857 struct btrfs_block_group_cache *bg; 3858 bool ret = true; 3859 3860 bg = btrfs_lookup_block_group(fs_info, bytenr); 3861 if (!bg) 3862 return false; 3863 3864 spin_lock(&bg->lock); 3865 if (bg->ro) 3866 ret = false; 3867 else 3868 atomic_inc(&bg->nocow_writers); 3869 spin_unlock(&bg->lock); 3870 3871 /* no put on block group, done by btrfs_dec_nocow_writers */ 3872 if (!ret) 3873 btrfs_put_block_group(bg); 3874 3875 return ret; 3876 3877 } 3878 3879 void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr) 3880 { 3881 struct btrfs_block_group_cache *bg; 3882 3883 bg = btrfs_lookup_block_group(fs_info, bytenr); 3884 ASSERT(bg); 3885 if (atomic_dec_and_test(&bg->nocow_writers)) 3886 wake_up_atomic_t(&bg->nocow_writers); 3887 /* 3888 * Once for our lookup and once for the lookup done by a previous call 3889 * to btrfs_inc_nocow_writers() 3890 */ 3891 btrfs_put_block_group(bg); 3892 btrfs_put_block_group(bg); 3893 } 3894 3895 static int btrfs_wait_nocow_writers_atomic_t(atomic_t *a) 3896 { 3897 schedule(); 3898 return 0; 3899 } 3900 3901 void btrfs_wait_nocow_writers(struct btrfs_block_group_cache *bg) 3902 { 3903 wait_on_atomic_t(&bg->nocow_writers, 3904 btrfs_wait_nocow_writers_atomic_t, 3905 TASK_UNINTERRUPTIBLE); 3906 } 3907 3908 static const char *alloc_name(u64 flags) 3909 { 3910 switch (flags) { 3911 case BTRFS_BLOCK_GROUP_METADATA|BTRFS_BLOCK_GROUP_DATA: 3912 return "mixed"; 3913 case BTRFS_BLOCK_GROUP_METADATA: 3914 return "metadata"; 3915 case BTRFS_BLOCK_GROUP_DATA: 3916 return "data"; 3917 case BTRFS_BLOCK_GROUP_SYSTEM: 3918 return "system"; 3919 default: 3920 WARN_ON(1); 3921 return "invalid-combination"; 3922 }; 3923 } 3924 3925 static int update_space_info(struct btrfs_fs_info *info, u64 flags, 3926 u64 total_bytes, u64 bytes_used, 3927 u64 bytes_readonly, 3928 struct btrfs_space_info **space_info) 3929 { 3930 struct btrfs_space_info *found; 3931 int i; 3932 int factor; 3933 int ret; 3934 3935 if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | 3936 BTRFS_BLOCK_GROUP_RAID10)) 3937 factor = 2; 3938 else 3939 factor = 1; 3940 3941 found = __find_space_info(info, flags); 3942 if (found) { 3943 spin_lock(&found->lock); 3944 found->total_bytes += total_bytes; 3945 
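		/*
		 * disk_total and disk_used track raw bytes on disk, hence the
		 * factor of 2 for DUP/RAID1/RAID10 profiles.
		 */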
found->disk_total += total_bytes * factor; 3946 found->bytes_used += bytes_used; 3947 found->disk_used += bytes_used * factor; 3948 found->bytes_readonly += bytes_readonly; 3949 if (total_bytes > 0) 3950 found->full = 0; 3951 space_info_add_new_bytes(info, found, total_bytes - 3952 bytes_used - bytes_readonly); 3953 spin_unlock(&found->lock); 3954 *space_info = found; 3955 return 0; 3956 } 3957 found = kzalloc(sizeof(*found), GFP_NOFS); 3958 if (!found) 3959 return -ENOMEM; 3960 3961 ret = percpu_counter_init(&found->total_bytes_pinned, 0, GFP_KERNEL); 3962 if (ret) { 3963 kfree(found); 3964 return ret; 3965 } 3966 3967 for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) 3968 INIT_LIST_HEAD(&found->block_groups[i]); 3969 init_rwsem(&found->groups_sem); 3970 spin_lock_init(&found->lock); 3971 found->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK; 3972 found->total_bytes = total_bytes; 3973 found->disk_total = total_bytes * factor; 3974 found->bytes_used = bytes_used; 3975 found->disk_used = bytes_used * factor; 3976 found->bytes_pinned = 0; 3977 found->bytes_reserved = 0; 3978 found->bytes_readonly = bytes_readonly; 3979 found->bytes_may_use = 0; 3980 found->full = 0; 3981 found->max_extent_size = 0; 3982 found->force_alloc = CHUNK_ALLOC_NO_FORCE; 3983 found->chunk_alloc = 0; 3984 found->flush = 0; 3985 init_waitqueue_head(&found->wait); 3986 INIT_LIST_HEAD(&found->ro_bgs); 3987 INIT_LIST_HEAD(&found->tickets); 3988 INIT_LIST_HEAD(&found->priority_tickets); 3989 3990 ret = kobject_init_and_add(&found->kobj, &space_info_ktype, 3991 info->space_info_kobj, "%s", 3992 alloc_name(found->flags)); 3993 if (ret) { 3994 kfree(found); 3995 return ret; 3996 } 3997 3998 *space_info = found; 3999 list_add_rcu(&found->list, &info->space_info); 4000 if (flags & BTRFS_BLOCK_GROUP_DATA) 4001 info->data_sinfo = found; 4002 4003 return ret; 4004 } 4005 4006 static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) 4007 { 4008 u64 extra_flags = chunk_to_extended(flags) & 4009 BTRFS_EXTENDED_PROFILE_MASK; 4010 4011 write_seqlock(&fs_info->profiles_lock); 4012 if (flags & BTRFS_BLOCK_GROUP_DATA) 4013 fs_info->avail_data_alloc_bits |= extra_flags; 4014 if (flags & BTRFS_BLOCK_GROUP_METADATA) 4015 fs_info->avail_metadata_alloc_bits |= extra_flags; 4016 if (flags & BTRFS_BLOCK_GROUP_SYSTEM) 4017 fs_info->avail_system_alloc_bits |= extra_flags; 4018 write_sequnlock(&fs_info->profiles_lock); 4019 } 4020 4021 /* 4022 * returns target flags in extended format or 0 if restripe for this 4023 * chunk_type is not in progress 4024 * 4025 * should be called with either volume_mutex or balance_lock held 4026 */ 4027 static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags) 4028 { 4029 struct btrfs_balance_control *bctl = fs_info->balance_ctl; 4030 u64 target = 0; 4031 4032 if (!bctl) 4033 return 0; 4034 4035 if (flags & BTRFS_BLOCK_GROUP_DATA && 4036 bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) { 4037 target = BTRFS_BLOCK_GROUP_DATA | bctl->data.target; 4038 } else if (flags & BTRFS_BLOCK_GROUP_SYSTEM && 4039 bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) { 4040 target = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target; 4041 } else if (flags & BTRFS_BLOCK_GROUP_METADATA && 4042 bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) { 4043 target = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target; 4044 } 4045 4046 return target; 4047 } 4048 4049 /* 4050 * @flags: available profiles in extended format (see ctree.h) 4051 * 4052 * Returns reduced profile in chunk format. 
If profile changing is in 4053 * progress (either running or paused) picks the target profile (if it's 4054 * already available), otherwise falls back to plain reducing. 4055 */ 4056 static u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags) 4057 { 4058 u64 num_devices = root->fs_info->fs_devices->rw_devices; 4059 u64 target; 4060 u64 raid_type; 4061 u64 allowed = 0; 4062 4063 /* 4064 * see if restripe for this chunk_type is in progress, if so 4065 * try to reduce to the target profile 4066 */ 4067 spin_lock(&root->fs_info->balance_lock); 4068 target = get_restripe_target(root->fs_info, flags); 4069 if (target) { 4070 /* pick target profile only if it's already available */ 4071 if ((flags & target) & BTRFS_EXTENDED_PROFILE_MASK) { 4072 spin_unlock(&root->fs_info->balance_lock); 4073 return extended_to_chunk(target); 4074 } 4075 } 4076 spin_unlock(&root->fs_info->balance_lock); 4077 4078 /* First, mask out the RAID levels which aren't possible */ 4079 for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) { 4080 if (num_devices >= btrfs_raid_array[raid_type].devs_min) 4081 allowed |= btrfs_raid_group[raid_type]; 4082 } 4083 allowed &= flags; 4084 4085 if (allowed & BTRFS_BLOCK_GROUP_RAID6) 4086 allowed = BTRFS_BLOCK_GROUP_RAID6; 4087 else if (allowed & BTRFS_BLOCK_GROUP_RAID5) 4088 allowed = BTRFS_BLOCK_GROUP_RAID5; 4089 else if (allowed & BTRFS_BLOCK_GROUP_RAID10) 4090 allowed = BTRFS_BLOCK_GROUP_RAID10; 4091 else if (allowed & BTRFS_BLOCK_GROUP_RAID1) 4092 allowed = BTRFS_BLOCK_GROUP_RAID1; 4093 else if (allowed & BTRFS_BLOCK_GROUP_RAID0) 4094 allowed = BTRFS_BLOCK_GROUP_RAID0; 4095 4096 flags &= ~BTRFS_BLOCK_GROUP_PROFILE_MASK; 4097 4098 return extended_to_chunk(flags | allowed); 4099 } 4100 4101 static u64 get_alloc_profile(struct btrfs_root *root, u64 orig_flags) 4102 { 4103 unsigned seq; 4104 u64 flags; 4105 4106 do { 4107 flags = orig_flags; 4108 seq = read_seqbegin(&root->fs_info->profiles_lock); 4109 4110 if (flags & BTRFS_BLOCK_GROUP_DATA) 4111 flags |= root->fs_info->avail_data_alloc_bits; 4112 else if (flags & BTRFS_BLOCK_GROUP_SYSTEM) 4113 flags |= root->fs_info->avail_system_alloc_bits; 4114 else if (flags & BTRFS_BLOCK_GROUP_METADATA) 4115 flags |= root->fs_info->avail_metadata_alloc_bits; 4116 } while (read_seqretry(&root->fs_info->profiles_lock, seq)); 4117 4118 return btrfs_reduce_alloc_profile(root, flags); 4119 } 4120 4121 u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data) 4122 { 4123 u64 flags; 4124 u64 ret; 4125 4126 if (data) 4127 flags = BTRFS_BLOCK_GROUP_DATA; 4128 else if (root == root->fs_info->chunk_root) 4129 flags = BTRFS_BLOCK_GROUP_SYSTEM; 4130 else 4131 flags = BTRFS_BLOCK_GROUP_METADATA; 4132 4133 ret = get_alloc_profile(root, flags); 4134 return ret; 4135 } 4136 4137 int btrfs_alloc_data_chunk_ondemand(struct inode *inode, u64 bytes) 4138 { 4139 struct btrfs_space_info *data_sinfo; 4140 struct btrfs_root *root = BTRFS_I(inode)->root; 4141 struct btrfs_fs_info *fs_info = root->fs_info; 4142 u64 used; 4143 int ret = 0; 4144 int need_commit = 2; 4145 int have_pinned_space; 4146 4147 /* make sure bytes are sectorsize aligned */ 4148 bytes = ALIGN(bytes, root->sectorsize); 4149 4150 if (btrfs_is_free_space_inode(inode)) { 4151 need_commit = 0; 4152 ASSERT(current->journal_info); 4153 } 4154 4155 data_sinfo = fs_info->data_sinfo; 4156 if (!data_sinfo) 4157 goto alloc; 4158 4159 again: 4160 /* make sure we have enough space to handle the data first */ 4161 spin_lock(&data_sinfo->lock); 4162 used = data_sinfo->bytes_used + 
data_sinfo->bytes_reserved + 4163 data_sinfo->bytes_pinned + data_sinfo->bytes_readonly + 4164 data_sinfo->bytes_may_use; 4165 4166 if (used + bytes > data_sinfo->total_bytes) { 4167 struct btrfs_trans_handle *trans; 4168 4169 /* 4170 * if we don't have enough free bytes in this space then we need 4171 * to alloc a new chunk. 4172 */ 4173 if (!data_sinfo->full) { 4174 u64 alloc_target; 4175 4176 data_sinfo->force_alloc = CHUNK_ALLOC_FORCE; 4177 spin_unlock(&data_sinfo->lock); 4178 alloc: 4179 alloc_target = btrfs_get_alloc_profile(root, 1); 4180 /* 4181 * It is ugly that we don't call nolock join 4182 * transaction for the free space inode case here. 4183 * But it is safe because we only do the data space 4184 * reservation for the free space cache in the 4185 * transaction context, the common join transaction 4186 * just increase the counter of the current transaction 4187 * handler, doesn't try to acquire the trans_lock of 4188 * the fs. 4189 */ 4190 trans = btrfs_join_transaction(root); 4191 if (IS_ERR(trans)) 4192 return PTR_ERR(trans); 4193 4194 ret = do_chunk_alloc(trans, root->fs_info->extent_root, 4195 alloc_target, 4196 CHUNK_ALLOC_NO_FORCE); 4197 btrfs_end_transaction(trans, root); 4198 if (ret < 0) { 4199 if (ret != -ENOSPC) 4200 return ret; 4201 else { 4202 have_pinned_space = 1; 4203 goto commit_trans; 4204 } 4205 } 4206 4207 if (!data_sinfo) 4208 data_sinfo = fs_info->data_sinfo; 4209 4210 goto again; 4211 } 4212 4213 /* 4214 * If we don't have enough pinned space to deal with this 4215 * allocation, and no removed chunk in current transaction, 4216 * don't bother committing the transaction. 4217 */ 4218 have_pinned_space = percpu_counter_compare( 4219 &data_sinfo->total_bytes_pinned, 4220 used + bytes - data_sinfo->total_bytes); 4221 spin_unlock(&data_sinfo->lock); 4222 4223 /* commit the current transaction and try again */ 4224 commit_trans: 4225 if (need_commit && 4226 !atomic_read(&root->fs_info->open_ioctl_trans)) { 4227 need_commit--; 4228 4229 if (need_commit > 0) { 4230 btrfs_start_delalloc_roots(fs_info, 0, -1); 4231 btrfs_wait_ordered_roots(fs_info, -1, 0, (u64)-1); 4232 } 4233 4234 trans = btrfs_join_transaction(root); 4235 if (IS_ERR(trans)) 4236 return PTR_ERR(trans); 4237 if (have_pinned_space >= 0 || 4238 test_bit(BTRFS_TRANS_HAVE_FREE_BGS, 4239 &trans->transaction->flags) || 4240 need_commit > 0) { 4241 ret = btrfs_commit_transaction(trans, root); 4242 if (ret) 4243 return ret; 4244 /* 4245 * The cleaner kthread might still be doing iput 4246 * operations. Wait for it to finish so that 4247 * more space is released. 4248 */ 4249 mutex_lock(&root->fs_info->cleaner_delayed_iput_mutex); 4250 mutex_unlock(&root->fs_info->cleaner_delayed_iput_mutex); 4251 goto again; 4252 } else { 4253 btrfs_end_transaction(trans, root); 4254 } 4255 } 4256 4257 trace_btrfs_space_reservation(root->fs_info, 4258 "space_info:enospc", 4259 data_sinfo->flags, bytes, 1); 4260 return -ENOSPC; 4261 } 4262 data_sinfo->bytes_may_use += bytes; 4263 trace_btrfs_space_reservation(root->fs_info, "space_info", 4264 data_sinfo->flags, bytes, 1); 4265 spin_unlock(&data_sinfo->lock); 4266 4267 return ret; 4268 } 4269 4270 /* 4271 * New check_data_free_space() with ability for precious data reservation 4272 * Will replace old btrfs_check_data_free_space(), but for patch split, 4273 * add a new function first and then replace it. 
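 *
 * Note that the range is rounded out to sector boundaries below, so e.g.
 * with a 4KiB sectorsize a request for (start=4000, len=200) ends up
 * reserving the two full sectors covering [0, 8192).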
4274 */ 4275 int btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len) 4276 { 4277 struct btrfs_root *root = BTRFS_I(inode)->root; 4278 int ret; 4279 4280 /* align the range */ 4281 len = round_up(start + len, root->sectorsize) - 4282 round_down(start, root->sectorsize); 4283 start = round_down(start, root->sectorsize); 4284 4285 ret = btrfs_alloc_data_chunk_ondemand(inode, len); 4286 if (ret < 0) 4287 return ret; 4288 4289 /* 4290 * Use new btrfs_qgroup_reserve_data to reserve precious data space 4291 * 4292 * TODO: Find a good method to avoid reserving data space for NOCOW 4293 * ranges without hurting performance when quotas are disabled. 4294 */ 4295 ret = btrfs_qgroup_reserve_data(inode, start, len); 4296 return ret; 4297 } 4298 4299 /* 4300 * Called if we need to clear a data reservation for this inode 4301 * Normally in an error case. 4302 * 4303 * This one will *NOT* use the accurate qgroup reserved space API; it is for 4304 * callers that can't sleep and are sure it won't affect qgroup reserved space, 4305 * like clear_bit_hook(). 4306 */ 4307 void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start, 4308 u64 len) 4309 { 4310 struct btrfs_root *root = BTRFS_I(inode)->root; 4311 struct btrfs_space_info *data_sinfo; 4312 4313 /* Make sure the range is aligned to sectorsize */ 4314 len = round_up(start + len, root->sectorsize) - 4315 round_down(start, root->sectorsize); 4316 start = round_down(start, root->sectorsize); 4317 4318 data_sinfo = root->fs_info->data_sinfo; 4319 spin_lock(&data_sinfo->lock); 4320 if (WARN_ON(data_sinfo->bytes_may_use < len)) 4321 data_sinfo->bytes_may_use = 0; 4322 else 4323 data_sinfo->bytes_may_use -= len; 4324 trace_btrfs_space_reservation(root->fs_info, "space_info", 4325 data_sinfo->flags, len, 0); 4326 spin_unlock(&data_sinfo->lock); 4327 } 4328 4329 /* 4330 * Called if we need to clear a data reservation for this inode 4331 * Normally in an error case. 4332 * 4333 * This one will handle the per-inode data rsv map for accurate reserved 4334 * space framework. 4335 */ 4336 void btrfs_free_reserved_data_space(struct inode *inode, u64 start, u64 len) 4337 { 4338 btrfs_free_reserved_data_space_noquota(inode, start, len); 4339 btrfs_qgroup_free_data(inode, start, len); 4340 } 4341 4342 static void force_metadata_allocation(struct btrfs_fs_info *info) 4343 { 4344 struct list_head *head = &info->space_info; 4345 struct btrfs_space_info *found; 4346 4347 rcu_read_lock(); 4348 list_for_each_entry_rcu(found, head, list) { 4349 if (found->flags & BTRFS_BLOCK_GROUP_METADATA) 4350 found->force_alloc = CHUNK_ALLOC_FORCE; 4351 } 4352 rcu_read_unlock(); 4353 } 4354 4355 static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global) 4356 { 4357 return (global->size << 1); 4358 } 4359 4360 static int should_alloc_chunk(struct btrfs_root *root, 4361 struct btrfs_space_info *sinfo, int force) 4362 { 4363 struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv; 4364 u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly; 4365 u64 num_allocated = sinfo->bytes_used + sinfo->bytes_reserved; 4366 u64 thresh; 4367 4368 if (force == CHUNK_ALLOC_FORCE) 4369 return 1; 4370 4371 /* 4372 * We need to take into account the global rsv because for all intents 4373 * and purposes it's used space. Don't worry about locking the 4374 * global_rsv, it doesn't change except when the transaction commits.
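 *
 * Together with the checks below this means, roughly: in CHUNK_ALLOC_LIMITED
 * mode we allocate once free space in this space_info drops under ~1% of the
 * filesystem size (at least 64MiB), and in the default case only once about
 * 80% of the existing, non-readonly space is allocated.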
4375 */ 4376 if (sinfo->flags & BTRFS_BLOCK_GROUP_METADATA) 4377 num_allocated += calc_global_rsv_need_space(global_rsv); 4378 4379 /* 4380 * in limited mode, we want to have some free space up to 4381 * about 1% of the FS size. 4382 */ 4383 if (force == CHUNK_ALLOC_LIMITED) { 4384 thresh = btrfs_super_total_bytes(root->fs_info->super_copy); 4385 thresh = max_t(u64, SZ_64M, div_factor_fine(thresh, 1)); 4386 4387 if (num_bytes - num_allocated < thresh) 4388 return 1; 4389 } 4390 4391 if (num_allocated + SZ_2M < div_factor(num_bytes, 8)) 4392 return 0; 4393 return 1; 4394 } 4395 4396 static u64 get_profile_num_devs(struct btrfs_root *root, u64 type) 4397 { 4398 u64 num_dev; 4399 4400 if (type & (BTRFS_BLOCK_GROUP_RAID10 | 4401 BTRFS_BLOCK_GROUP_RAID0 | 4402 BTRFS_BLOCK_GROUP_RAID5 | 4403 BTRFS_BLOCK_GROUP_RAID6)) 4404 num_dev = root->fs_info->fs_devices->rw_devices; 4405 else if (type & BTRFS_BLOCK_GROUP_RAID1) 4406 num_dev = 2; 4407 else 4408 num_dev = 1; /* DUP or single */ 4409 4410 return num_dev; 4411 } 4412 4413 /* 4414 * If @is_allocation is true, reserve space in the system space info necessary 4415 * for allocating a chunk, otherwise if it's false, reserve space necessary for 4416 * removing a chunk. 4417 */ 4418 void check_system_chunk(struct btrfs_trans_handle *trans, 4419 struct btrfs_root *root, 4420 u64 type) 4421 { 4422 struct btrfs_space_info *info; 4423 u64 left; 4424 u64 thresh; 4425 int ret = 0; 4426 u64 num_devs; 4427 4428 /* 4429 * Needed because we can end up allocating a system chunk and for an 4430 * atomic and race free space reservation in the chunk block reserve. 4431 */ 4432 ASSERT(mutex_is_locked(&root->fs_info->chunk_mutex)); 4433 4434 info = __find_space_info(root->fs_info, BTRFS_BLOCK_GROUP_SYSTEM); 4435 spin_lock(&info->lock); 4436 left = info->total_bytes - info->bytes_used - info->bytes_pinned - 4437 info->bytes_reserved - info->bytes_readonly - 4438 info->bytes_may_use; 4439 spin_unlock(&info->lock); 4440 4441 num_devs = get_profile_num_devs(root, type); 4442 4443 /* num_devs device items to update and 1 chunk item to add or remove */ 4444 thresh = btrfs_calc_trunc_metadata_size(root, num_devs) + 4445 btrfs_calc_trans_metadata_size(root, 1); 4446 4447 if (left < thresh && btrfs_test_opt(root->fs_info, ENOSPC_DEBUG)) { 4448 btrfs_info(root->fs_info, "left=%llu, need=%llu, flags=%llu", 4449 left, thresh, type); 4450 dump_space_info(info, 0, 0); 4451 } 4452 4453 if (left < thresh) { 4454 u64 flags; 4455 4456 flags = btrfs_get_alloc_profile(root->fs_info->chunk_root, 0); 4457 /* 4458 * Ignore failure to create system chunk. We might end up not 4459 * needing it, as we might not need to COW all nodes/leafs from 4460 * the paths we visit in the chunk tree (they were already COWed 4461 * or created in the current transaction for example). 
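 *
 * Whether or not a new SYSTEM chunk was needed, as long as nothing failed we
 * still reserve thresh bytes into the chunk block reserve below and account
 * them in trans->chunk_bytes_reserved.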
4462 */ 4463 ret = btrfs_alloc_chunk(trans, root, flags); 4464 } 4465 4466 if (!ret) { 4467 ret = btrfs_block_rsv_add(root->fs_info->chunk_root, 4468 &root->fs_info->chunk_block_rsv, 4469 thresh, BTRFS_RESERVE_NO_FLUSH); 4470 if (!ret) 4471 trans->chunk_bytes_reserved += thresh; 4472 } 4473 } 4474 4475 static int do_chunk_alloc(struct btrfs_trans_handle *trans, 4476 struct btrfs_root *extent_root, u64 flags, int force) 4477 { 4478 struct btrfs_space_info *space_info; 4479 struct btrfs_fs_info *fs_info = extent_root->fs_info; 4480 int wait_for_alloc = 0; 4481 int ret = 0; 4482 4483 /* Don't re-enter if we're already allocating a chunk */ 4484 if (trans->allocating_chunk) 4485 return -ENOSPC; 4486 4487 space_info = __find_space_info(extent_root->fs_info, flags); 4488 if (!space_info) { 4489 ret = update_space_info(extent_root->fs_info, flags, 4490 0, 0, 0, &space_info); 4491 BUG_ON(ret); /* -ENOMEM */ 4492 } 4493 BUG_ON(!space_info); /* Logic error */ 4494 4495 again: 4496 spin_lock(&space_info->lock); 4497 if (force < space_info->force_alloc) 4498 force = space_info->force_alloc; 4499 if (space_info->full) { 4500 if (should_alloc_chunk(extent_root, space_info, force)) 4501 ret = -ENOSPC; 4502 else 4503 ret = 0; 4504 spin_unlock(&space_info->lock); 4505 return ret; 4506 } 4507 4508 if (!should_alloc_chunk(extent_root, space_info, force)) { 4509 spin_unlock(&space_info->lock); 4510 return 0; 4511 } else if (space_info->chunk_alloc) { 4512 wait_for_alloc = 1; 4513 } else { 4514 space_info->chunk_alloc = 1; 4515 } 4516 4517 spin_unlock(&space_info->lock); 4518 4519 mutex_lock(&fs_info->chunk_mutex); 4520 4521 /* 4522 * The chunk_mutex is held throughout the entirety of a chunk 4523 * allocation, so once we've acquired the chunk_mutex we know that the 4524 * other guy is done and we need to recheck and see if we should 4525 * allocate. 4526 */ 4527 if (wait_for_alloc) { 4528 mutex_unlock(&fs_info->chunk_mutex); 4529 wait_for_alloc = 0; 4530 goto again; 4531 } 4532 4533 trans->allocating_chunk = true; 4534 4535 /* 4536 * If we have mixed data/metadata chunks we want to make sure we keep 4537 * allocating mixed chunks instead of individual chunks. 4538 */ 4539 if (btrfs_mixed_space_info(space_info)) 4540 flags |= (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA); 4541 4542 /* 4543 * if we're doing a data chunk, go ahead and make sure that 4544 * we keep a reasonable number of metadata chunks allocated in the 4545 * FS as well. 4546 */ 4547 if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) { 4548 fs_info->data_chunk_allocations++; 4549 if (!(fs_info->data_chunk_allocations % 4550 fs_info->metadata_ratio)) 4551 force_metadata_allocation(fs_info); 4552 } 4553 4554 /* 4555 * Check if we have enough space in SYSTEM chunk because we may need 4556 * to update devices. 
4557 */ 4558 check_system_chunk(trans, extent_root, flags); 4559 4560 ret = btrfs_alloc_chunk(trans, extent_root, flags); 4561 trans->allocating_chunk = false; 4562 4563 spin_lock(&space_info->lock); 4564 if (ret < 0 && ret != -ENOSPC) 4565 goto out; 4566 if (ret) 4567 space_info->full = 1; 4568 else 4569 ret = 1; 4570 4571 space_info->force_alloc = CHUNK_ALLOC_NO_FORCE; 4572 out: 4573 space_info->chunk_alloc = 0; 4574 spin_unlock(&space_info->lock); 4575 mutex_unlock(&fs_info->chunk_mutex); 4576 /* 4577 * When we allocate a new chunk we reserve space in the chunk block 4578 * reserve to make sure we can COW nodes/leafs in the chunk tree or 4579 * add new nodes/leafs to it if we end up needing to do it when 4580 * inserting the chunk item and updating device items as part of the 4581 * second phase of chunk allocation, performed by 4582 * btrfs_finish_chunk_alloc(). So make sure we don't accumulate a 4583 * large number of new block groups to create in our transaction 4584 * handle's new_bgs list to avoid exhausting the chunk block reserve 4585 * in extreme cases - like having a single transaction create many new 4586 * block groups when starting to write out the free space caches of all 4587 * the block groups that were made dirty during the lifetime of the 4588 * transaction. 4589 */ 4590 if (trans->can_flush_pending_bgs && 4591 trans->chunk_bytes_reserved >= (u64)SZ_2M) { 4592 btrfs_create_pending_block_groups(trans, extent_root); 4593 btrfs_trans_release_chunk_metadata(trans); 4594 } 4595 return ret; 4596 } 4597 4598 static int can_overcommit(struct btrfs_root *root, 4599 struct btrfs_space_info *space_info, u64 bytes, 4600 enum btrfs_reserve_flush_enum flush) 4601 { 4602 struct btrfs_block_rsv *global_rsv; 4603 u64 profile; 4604 u64 space_size; 4605 u64 avail; 4606 u64 used; 4607 4608 /* Don't overcommit when in mixed mode. */ 4609 if (space_info->flags & BTRFS_BLOCK_GROUP_DATA) 4610 return 0; 4611 4612 BUG_ON(root->fs_info == NULL); 4613 global_rsv = &root->fs_info->global_block_rsv; 4614 profile = btrfs_get_alloc_profile(root, 0); 4615 used = space_info->bytes_used + space_info->bytes_reserved + 4616 space_info->bytes_pinned + space_info->bytes_readonly; 4617 4618 /* 4619 * We only want to allow over committing if we have lots of actual space 4620 * free, but if we don't have enough space to handle the global reserve 4621 * space then we could end up having a real enospc problem when trying 4622 * to allocate a chunk or some other such important allocation. 4623 */ 4624 spin_lock(&global_rsv->lock); 4625 space_size = calc_global_rsv_need_space(global_rsv); 4626 spin_unlock(&global_rsv->lock); 4627 if (used + space_size >= space_info->total_bytes) 4628 return 0; 4629 4630 used += space_info->bytes_may_use; 4631 4632 spin_lock(&root->fs_info->free_chunk_lock); 4633 avail = root->fs_info->free_chunk_space; 4634 spin_unlock(&root->fs_info->free_chunk_lock); 4635 4636 /* 4637 * If we have dup, raid1 or raid10 then only half of the free 4638 * space is actually useable. For raid56, the space info used 4639 * doesn't include the parity drive, so we don't have to 4640 * change the math 4641 */ 4642 if (profile & (BTRFS_BLOCK_GROUP_DUP | 4643 BTRFS_BLOCK_GROUP_RAID1 | 4644 BTRFS_BLOCK_GROUP_RAID10)) 4645 avail >>= 1; 4646 4647 /* 4648 * If we aren't flushing all things, let us overcommit up to 4649 * 1/2th of the space. If we can flush, don't let us overcommit 4650 * too much, let it overcommit up to 1/8 of the space. 
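 *
 * Example: with 64GiB of unallocated device space and a RAID1 metadata
 * profile, avail is halved to 32GiB; a FLUSH_ALL reservation may then
 * overcommit by up to 4GiB beyond total_bytes, while the no-flush/limited
 * cases allow up to 16GiB.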
4651 */ 4652 if (flush == BTRFS_RESERVE_FLUSH_ALL) 4653 avail >>= 3; 4654 else 4655 avail >>= 1; 4656 4657 if (used + bytes < space_info->total_bytes + avail) 4658 return 1; 4659 return 0; 4660 } 4661 4662 static void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root, 4663 unsigned long nr_pages, int nr_items) 4664 { 4665 struct super_block *sb = root->fs_info->sb; 4666 4667 if (down_read_trylock(&sb->s_umount)) { 4668 writeback_inodes_sb_nr(sb, nr_pages, WB_REASON_FS_FREE_SPACE); 4669 up_read(&sb->s_umount); 4670 } else { 4671 /* 4672 * We needn't worry the filesystem going from r/w to r/o though 4673 * we don't acquire ->s_umount mutex, because the filesystem 4674 * should guarantee the delalloc inodes list be empty after 4675 * the filesystem is readonly(all dirty pages are written to 4676 * the disk). 4677 */ 4678 btrfs_start_delalloc_roots(root->fs_info, 0, nr_items); 4679 if (!current->journal_info) 4680 btrfs_wait_ordered_roots(root->fs_info, nr_items, 4681 0, (u64)-1); 4682 } 4683 } 4684 4685 static inline int calc_reclaim_items_nr(struct btrfs_root *root, u64 to_reclaim) 4686 { 4687 u64 bytes; 4688 int nr; 4689 4690 bytes = btrfs_calc_trans_metadata_size(root, 1); 4691 nr = (int)div64_u64(to_reclaim, bytes); 4692 if (!nr) 4693 nr = 1; 4694 return nr; 4695 } 4696 4697 #define EXTENT_SIZE_PER_ITEM SZ_256K 4698 4699 /* 4700 * shrink metadata reservation for delalloc 4701 */ 4702 static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig, 4703 bool wait_ordered) 4704 { 4705 struct btrfs_block_rsv *block_rsv; 4706 struct btrfs_space_info *space_info; 4707 struct btrfs_trans_handle *trans; 4708 u64 delalloc_bytes; 4709 u64 max_reclaim; 4710 long time_left; 4711 unsigned long nr_pages; 4712 int loops; 4713 int items; 4714 enum btrfs_reserve_flush_enum flush; 4715 4716 /* Calc the number of the pages we need flush for space reservation */ 4717 items = calc_reclaim_items_nr(root, to_reclaim); 4718 to_reclaim = (u64)items * EXTENT_SIZE_PER_ITEM; 4719 4720 trans = (struct btrfs_trans_handle *)current->journal_info; 4721 block_rsv = &root->fs_info->delalloc_block_rsv; 4722 space_info = block_rsv->space_info; 4723 4724 delalloc_bytes = percpu_counter_sum_positive( 4725 &root->fs_info->delalloc_bytes); 4726 if (delalloc_bytes == 0) { 4727 if (trans) 4728 return; 4729 if (wait_ordered) 4730 btrfs_wait_ordered_roots(root->fs_info, items, 4731 0, (u64)-1); 4732 return; 4733 } 4734 4735 loops = 0; 4736 while (delalloc_bytes && loops < 3) { 4737 max_reclaim = min(delalloc_bytes, to_reclaim); 4738 nr_pages = max_reclaim >> PAGE_SHIFT; 4739 btrfs_writeback_inodes_sb_nr(root, nr_pages, items); 4740 /* 4741 * We need to wait for the async pages to actually start before 4742 * we do anything. 
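 *
 * Concretely: wait until the count of in-flight async delalloc pages has
 * dropped by at least the nr_pages we just handed to writeback (or reached
 * zero).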
4743 */ 4744 max_reclaim = atomic_read(&root->fs_info->async_delalloc_pages); 4745 if (!max_reclaim) 4746 goto skip_async; 4747 4748 if (max_reclaim <= nr_pages) 4749 max_reclaim = 0; 4750 else 4751 max_reclaim -= nr_pages; 4752 4753 wait_event(root->fs_info->async_submit_wait, 4754 atomic_read(&root->fs_info->async_delalloc_pages) <= 4755 (int)max_reclaim); 4756 skip_async: 4757 if (!trans) 4758 flush = BTRFS_RESERVE_FLUSH_ALL; 4759 else 4760 flush = BTRFS_RESERVE_NO_FLUSH; 4761 spin_lock(&space_info->lock); 4762 if (can_overcommit(root, space_info, orig, flush)) { 4763 spin_unlock(&space_info->lock); 4764 break; 4765 } 4766 if (list_empty(&space_info->tickets) && 4767 list_empty(&space_info->priority_tickets)) { 4768 spin_unlock(&space_info->lock); 4769 break; 4770 } 4771 spin_unlock(&space_info->lock); 4772 4773 loops++; 4774 if (wait_ordered && !trans) { 4775 btrfs_wait_ordered_roots(root->fs_info, items, 4776 0, (u64)-1); 4777 } else { 4778 time_left = schedule_timeout_killable(1); 4779 if (time_left) 4780 break; 4781 } 4782 delalloc_bytes = percpu_counter_sum_positive( 4783 &root->fs_info->delalloc_bytes); 4784 } 4785 } 4786 4787 /** 4788 * may_commit_transaction - possibly commit the transaction if it's ok to 4789 * @root - the root we're allocating for * @space_info - the space_info we are trying to satisfy the reservation from 4790 * @bytes - the number of bytes we want to reserve 4791 * @force - force the commit 4792 * 4793 * This will check to make sure that committing the transaction will actually 4794 * get us somewhere and then commit the transaction if it does. Otherwise it 4795 * will return -ENOSPC. 4796 */ 4797 static int may_commit_transaction(struct btrfs_root *root, 4798 struct btrfs_space_info *space_info, 4799 u64 bytes, int force) 4800 { 4801 struct btrfs_block_rsv *delayed_rsv = &root->fs_info->delayed_block_rsv; 4802 struct btrfs_trans_handle *trans; 4803 4804 trans = (struct btrfs_trans_handle *)current->journal_info; 4805 if (trans) 4806 return -EAGAIN; 4807 4808 if (force) 4809 goto commit; 4810 4811 /* See if there is enough pinned space to make this reservation */ 4812 if (percpu_counter_compare(&space_info->total_bytes_pinned, 4813 bytes) >= 0) 4814 goto commit; 4815 4816 /* 4817 * See if there is some space in the delayed insertion reservation for 4818 * this reservation.
4819 */ 4820 if (space_info != delayed_rsv->space_info) 4821 return -ENOSPC; 4822 4823 spin_lock(&delayed_rsv->lock); 4824 if (percpu_counter_compare(&space_info->total_bytes_pinned, 4825 bytes - delayed_rsv->size) >= 0) { 4826 spin_unlock(&delayed_rsv->lock); 4827 return -ENOSPC; 4828 } 4829 spin_unlock(&delayed_rsv->lock); 4830 4831 commit: 4832 trans = btrfs_join_transaction(root); 4833 if (IS_ERR(trans)) 4834 return -ENOSPC; 4835 4836 return btrfs_commit_transaction(trans, root); 4837 } 4838 4839 struct reserve_ticket { 4840 u64 bytes; 4841 int error; 4842 struct list_head list; 4843 wait_queue_head_t wait; 4844 }; 4845 4846 static int flush_space(struct btrfs_root *root, 4847 struct btrfs_space_info *space_info, u64 num_bytes, 4848 u64 orig_bytes, int state) 4849 { 4850 struct btrfs_trans_handle *trans; 4851 int nr; 4852 int ret = 0; 4853 4854 switch (state) { 4855 case FLUSH_DELAYED_ITEMS_NR: 4856 case FLUSH_DELAYED_ITEMS: 4857 if (state == FLUSH_DELAYED_ITEMS_NR) 4858 nr = calc_reclaim_items_nr(root, num_bytes) * 2; 4859 else 4860 nr = -1; 4861 4862 trans = btrfs_join_transaction(root); 4863 if (IS_ERR(trans)) { 4864 ret = PTR_ERR(trans); 4865 break; 4866 } 4867 ret = btrfs_run_delayed_items_nr(trans, root, nr); 4868 btrfs_end_transaction(trans, root); 4869 break; 4870 case FLUSH_DELALLOC: 4871 case FLUSH_DELALLOC_WAIT: 4872 shrink_delalloc(root, num_bytes * 2, orig_bytes, 4873 state == FLUSH_DELALLOC_WAIT); 4874 break; 4875 case ALLOC_CHUNK: 4876 trans = btrfs_join_transaction(root); 4877 if (IS_ERR(trans)) { 4878 ret = PTR_ERR(trans); 4879 break; 4880 } 4881 ret = do_chunk_alloc(trans, root->fs_info->extent_root, 4882 btrfs_get_alloc_profile(root, 0), 4883 CHUNK_ALLOC_NO_FORCE); 4884 btrfs_end_transaction(trans, root); 4885 if (ret == -ENOSPC) 4886 ret = 0; 4887 break; 4888 case COMMIT_TRANS: 4889 ret = may_commit_transaction(root, space_info, orig_bytes, 0); 4890 break; 4891 default: 4892 ret = -ENOSPC; 4893 break; 4894 } 4895 4896 trace_btrfs_flush_space(root->fs_info, space_info->flags, num_bytes, 4897 orig_bytes, state, ret); 4898 return ret; 4899 } 4900 4901 static inline u64 4902 btrfs_calc_reclaim_metadata_size(struct btrfs_root *root, 4903 struct btrfs_space_info *space_info) 4904 { 4905 struct reserve_ticket *ticket; 4906 u64 used; 4907 u64 expected; 4908 u64 to_reclaim = 0; 4909 4910 to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M); 4911 if (can_overcommit(root, space_info, to_reclaim, 4912 BTRFS_RESERVE_FLUSH_ALL)) 4913 return 0; 4914 4915 list_for_each_entry(ticket, &space_info->tickets, list) 4916 to_reclaim += ticket->bytes; 4917 list_for_each_entry(ticket, &space_info->priority_tickets, list) 4918 to_reclaim += ticket->bytes; 4919 if (to_reclaim) 4920 return to_reclaim; 4921 4922 used = space_info->bytes_used + space_info->bytes_reserved + 4923 space_info->bytes_pinned + space_info->bytes_readonly + 4924 space_info->bytes_may_use; 4925 if (can_overcommit(root, space_info, SZ_1M, BTRFS_RESERVE_FLUSH_ALL)) 4926 expected = div_factor_fine(space_info->total_bytes, 95); 4927 else 4928 expected = div_factor_fine(space_info->total_bytes, 90); 4929 4930 if (used > expected) 4931 to_reclaim = used - expected; 4932 else 4933 to_reclaim = 0; 4934 to_reclaim = min(to_reclaim, space_info->bytes_may_use + 4935 space_info->bytes_reserved); 4936 return to_reclaim; 4937 } 4938 4939 static inline int need_do_async_reclaim(struct btrfs_space_info *space_info, 4940 struct btrfs_root *root, u64 used) 4941 { 4942 u64 thresh = div_factor_fine(space_info->total_bytes, 98); 4943 
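	/*
	 * The 98% factor above means background reclaim is only considered once
	 * the space_info is nearly exhausted, e.g. roughly 9.8GiB used out of a
	 * 10GiB metadata space_info.
	 */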
4944 /* If we're just plain full then async reclaim just slows us down. */ 4945 if ((space_info->bytes_used + space_info->bytes_reserved) >= thresh) 4946 return 0; 4947 4948 if (!btrfs_calc_reclaim_metadata_size(root, space_info)) 4949 return 0; 4950 4951 return (used >= thresh && !btrfs_fs_closing(root->fs_info) && 4952 !test_bit(BTRFS_FS_STATE_REMOUNTING, 4953 &root->fs_info->fs_state)); 4954 } 4955 4956 static void wake_all_tickets(struct list_head *head) 4957 { 4958 struct reserve_ticket *ticket; 4959 4960 while (!list_empty(head)) { 4961 ticket = list_first_entry(head, struct reserve_ticket, list); 4962 list_del_init(&ticket->list); 4963 ticket->error = -ENOSPC; 4964 wake_up(&ticket->wait); 4965 } 4966 } 4967 4968 /* 4969 * This is for normal flushers, we can wait all goddamned day if we want to. We 4970 * will loop and continuously try to flush as long as we are making progress. 4971 * We count progress as clearing off tickets each time we have to loop. 4972 */ 4973 static void btrfs_async_reclaim_metadata_space(struct work_struct *work) 4974 { 4975 struct reserve_ticket *last_ticket = NULL; 4976 struct btrfs_fs_info *fs_info; 4977 struct btrfs_space_info *space_info; 4978 u64 to_reclaim; 4979 int flush_state; 4980 int commit_cycles = 0; 4981 4982 fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work); 4983 space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); 4984 4985 spin_lock(&space_info->lock); 4986 to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info->fs_root, 4987 space_info); 4988 if (!to_reclaim) { 4989 space_info->flush = 0; 4990 spin_unlock(&space_info->lock); 4991 return; 4992 } 4993 last_ticket = list_first_entry(&space_info->tickets, 4994 struct reserve_ticket, list); 4995 spin_unlock(&space_info->lock); 4996 4997 flush_state = FLUSH_DELAYED_ITEMS_NR; 4998 do { 4999 struct reserve_ticket *ticket; 5000 int ret; 5001 5002 ret = flush_space(fs_info->fs_root, space_info, to_reclaim, 5003 to_reclaim, flush_state); 5004 spin_lock(&space_info->lock); 5005 if (list_empty(&space_info->tickets)) { 5006 space_info->flush = 0; 5007 spin_unlock(&space_info->lock); 5008 return; 5009 } 5010 to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info->fs_root, 5011 space_info); 5012 ticket = list_first_entry(&space_info->tickets, 5013 struct reserve_ticket, list); 5014 if (last_ticket == ticket) { 5015 flush_state++; 5016 } else { 5017 last_ticket = ticket; 5018 flush_state = FLUSH_DELAYED_ITEMS_NR; 5019 if (commit_cycles) 5020 commit_cycles--; 5021 } 5022 5023 if (flush_state > COMMIT_TRANS) { 5024 commit_cycles++; 5025 if (commit_cycles > 2) { 5026 wake_all_tickets(&space_info->tickets); 5027 space_info->flush = 0; 5028 } else { 5029 flush_state = FLUSH_DELAYED_ITEMS_NR; 5030 } 5031 } 5032 spin_unlock(&space_info->lock); 5033 } while (flush_state <= COMMIT_TRANS); 5034 } 5035 5036 void btrfs_init_async_reclaim_work(struct work_struct *work) 5037 { 5038 INIT_WORK(work, btrfs_async_reclaim_metadata_space); 5039 } 5040 5041 static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info, 5042 struct btrfs_space_info *space_info, 5043 struct reserve_ticket *ticket) 5044 { 5045 u64 to_reclaim; 5046 int flush_state = FLUSH_DELAYED_ITEMS_NR; 5047 5048 spin_lock(&space_info->lock); 5049 to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info->fs_root, 5050 space_info); 5051 if (!to_reclaim) { 5052 spin_unlock(&space_info->lock); 5053 return; 5054 } 5055 spin_unlock(&space_info->lock); 5056 5057 do { 5058 flush_space(fs_info->fs_root, space_info, to_reclaim, 
5059 to_reclaim, flush_state); 5060 flush_state++; 5061 spin_lock(&space_info->lock); 5062 if (ticket->bytes == 0) { 5063 spin_unlock(&space_info->lock); 5064 return; 5065 } 5066 spin_unlock(&space_info->lock); 5067 5068 /* 5069 * Priority flushers can't wait on delalloc without 5070 * deadlocking. 5071 */ 5072 if (flush_state == FLUSH_DELALLOC || 5073 flush_state == FLUSH_DELALLOC_WAIT) 5074 flush_state = ALLOC_CHUNK; 5075 } while (flush_state < COMMIT_TRANS); 5076 } 5077 5078 static int wait_reserve_ticket(struct btrfs_fs_info *fs_info, 5079 struct btrfs_space_info *space_info, 5080 struct reserve_ticket *ticket, u64 orig_bytes) 5081 5082 { 5083 DEFINE_WAIT(wait); 5084 int ret = 0; 5085 5086 spin_lock(&space_info->lock); 5087 while (ticket->bytes > 0 && ticket->error == 0) { 5088 ret = prepare_to_wait_event(&ticket->wait, &wait, TASK_KILLABLE); 5089 if (ret) { 5090 ret = -EINTR; 5091 break; 5092 } 5093 spin_unlock(&space_info->lock); 5094 5095 schedule(); 5096 5097 finish_wait(&ticket->wait, &wait); 5098 spin_lock(&space_info->lock); 5099 } 5100 if (!ret) 5101 ret = ticket->error; 5102 if (!list_empty(&ticket->list)) 5103 list_del_init(&ticket->list); 5104 if (ticket->bytes && ticket->bytes < orig_bytes) { 5105 u64 num_bytes = orig_bytes - ticket->bytes; 5106 space_info->bytes_may_use -= num_bytes; 5107 trace_btrfs_space_reservation(fs_info, "space_info", 5108 space_info->flags, num_bytes, 0); 5109 } 5110 spin_unlock(&space_info->lock); 5111 5112 return ret; 5113 } 5114 5115 /** 5116 * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space 5117 * @root - the root we're allocating for 5118 * @space_info - the space info we want to allocate from 5119 * @orig_bytes - the number of bytes we want 5120 * @flush - whether or not we can flush to make our reservation 5121 * 5122 * This will reserve orig_bytes number of bytes from the space info associated 5123 * with the block_rsv. If there is not enough space it will make an attempt to 5124 * flush out space to make room. It will do this by flushing delalloc if 5125 * possible or committing the transaction. If flush is 0 then no attempts to 5126 * regain reservations will be made and this will fail if there is not enough 5127 * space already. 5128 */ 5129 static int __reserve_metadata_bytes(struct btrfs_root *root, 5130 struct btrfs_space_info *space_info, 5131 u64 orig_bytes, 5132 enum btrfs_reserve_flush_enum flush) 5133 { 5134 struct reserve_ticket ticket; 5135 u64 used; 5136 int ret = 0; 5137 5138 ASSERT(orig_bytes); 5139 ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_ALL); 5140 5141 spin_lock(&space_info->lock); 5142 ret = -ENOSPC; 5143 used = space_info->bytes_used + space_info->bytes_reserved + 5144 space_info->bytes_pinned + space_info->bytes_readonly + 5145 space_info->bytes_may_use; 5146 5147 /* 5148 * If we have enough space then hooray, make our reservation and carry 5149 * on. If not see if we can overcommit, and if we can, hooray carry on. 5150 * If not things get more complicated. 
5151 */ 5152 if (used + orig_bytes <= space_info->total_bytes) { 5153 space_info->bytes_may_use += orig_bytes; 5154 trace_btrfs_space_reservation(root->fs_info, "space_info", 5155 space_info->flags, orig_bytes, 5156 1); 5157 ret = 0; 5158 } else if (can_overcommit(root, space_info, orig_bytes, flush)) { 5159 space_info->bytes_may_use += orig_bytes; 5160 trace_btrfs_space_reservation(root->fs_info, "space_info", 5161 space_info->flags, orig_bytes, 5162 1); 5163 ret = 0; 5164 } 5165 5166 /* 5167 * If we couldn't make a reservation then setup our reservation ticket 5168 * and kick the async worker if it's not already running. 5169 * 5170 * If we are a priority flusher then we just need to add our ticket to 5171 * the list and we will do our own flushing further down. 5172 */ 5173 if (ret && flush != BTRFS_RESERVE_NO_FLUSH) { 5174 ticket.bytes = orig_bytes; 5175 ticket.error = 0; 5176 init_waitqueue_head(&ticket.wait); 5177 if (flush == BTRFS_RESERVE_FLUSH_ALL) { 5178 list_add_tail(&ticket.list, &space_info->tickets); 5179 if (!space_info->flush) { 5180 space_info->flush = 1; 5181 trace_btrfs_trigger_flush(root->fs_info, 5182 space_info->flags, 5183 orig_bytes, flush, 5184 "enospc"); 5185 queue_work(system_unbound_wq, 5186 &root->fs_info->async_reclaim_work); 5187 } 5188 } else { 5189 list_add_tail(&ticket.list, 5190 &space_info->priority_tickets); 5191 } 5192 } else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) { 5193 used += orig_bytes; 5194 /* 5195 * We will do the space reservation dance during log replay, 5196 * which means we won't have fs_info->fs_root set, so don't do 5197 * the async reclaim as we will panic. 5198 */ 5199 if (!root->fs_info->log_root_recovering && 5200 need_do_async_reclaim(space_info, root, used) && 5201 !work_busy(&root->fs_info->async_reclaim_work)) { 5202 trace_btrfs_trigger_flush(root->fs_info, 5203 space_info->flags, 5204 orig_bytes, flush, 5205 "preempt"); 5206 queue_work(system_unbound_wq, 5207 &root->fs_info->async_reclaim_work); 5208 } 5209 } 5210 spin_unlock(&space_info->lock); 5211 if (!ret || flush == BTRFS_RESERVE_NO_FLUSH) 5212 return ret; 5213 5214 if (flush == BTRFS_RESERVE_FLUSH_ALL) 5215 return wait_reserve_ticket(root->fs_info, space_info, &ticket, 5216 orig_bytes); 5217 5218 ret = 0; 5219 priority_reclaim_metadata_space(root->fs_info, space_info, &ticket); 5220 spin_lock(&space_info->lock); 5221 if (ticket.bytes) { 5222 if (ticket.bytes < orig_bytes) { 5223 u64 num_bytes = orig_bytes - ticket.bytes; 5224 space_info->bytes_may_use -= num_bytes; 5225 trace_btrfs_space_reservation(root->fs_info, 5226 "space_info", space_info->flags, 5227 num_bytes, 0); 5228 5229 } 5230 list_del_init(&ticket.list); 5231 ret = -ENOSPC; 5232 } 5233 spin_unlock(&space_info->lock); 5234 ASSERT(list_empty(&ticket.list)); 5235 return ret; 5236 } 5237 5238 /** 5239 * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space 5240 * @root - the root we're allocating for 5241 * @block_rsv - the block_rsv we're allocating for 5242 * @orig_bytes - the number of bytes we want 5243 * @flush - whether or not we can flush to make our reservation 5244 * 5245 * This will reserve orgi_bytes number of bytes from the space info associated 5246 * with the block_rsv. If there is not enough space it will make an attempt to 5247 * flush out space to make room. It will do this by flushing delalloc if 5248 * possible or committing the transaction. 
If flush is 0 then no attempts to 5249 * regain reservations will be made and this will fail if there is not enough 5250 * space already. 5251 */ 5252 static int reserve_metadata_bytes(struct btrfs_root *root, 5253 struct btrfs_block_rsv *block_rsv, 5254 u64 orig_bytes, 5255 enum btrfs_reserve_flush_enum flush) 5256 { 5257 int ret; 5258 5259 ret = __reserve_metadata_bytes(root, block_rsv->space_info, orig_bytes, 5260 flush); 5261 if (ret == -ENOSPC && 5262 unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) { 5263 struct btrfs_block_rsv *global_rsv = 5264 &root->fs_info->global_block_rsv; 5265 5266 if (block_rsv != global_rsv && 5267 !block_rsv_use_bytes(global_rsv, orig_bytes)) 5268 ret = 0; 5269 } 5270 if (ret == -ENOSPC) 5271 trace_btrfs_space_reservation(root->fs_info, 5272 "space_info:enospc", 5273 block_rsv->space_info->flags, 5274 orig_bytes, 1); 5275 return ret; 5276 } 5277 5278 static struct btrfs_block_rsv *get_block_rsv( 5279 const struct btrfs_trans_handle *trans, 5280 const struct btrfs_root *root) 5281 { 5282 struct btrfs_block_rsv *block_rsv = NULL; 5283 5284 if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) || 5285 (root == root->fs_info->csum_root && trans->adding_csums) || 5286 (root == root->fs_info->uuid_root)) 5287 block_rsv = trans->block_rsv; 5288 5289 if (!block_rsv) 5290 block_rsv = root->block_rsv; 5291 5292 if (!block_rsv) 5293 block_rsv = &root->fs_info->empty_block_rsv; 5294 5295 return block_rsv; 5296 } 5297 5298 static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv, 5299 u64 num_bytes) 5300 { 5301 int ret = -ENOSPC; 5302 spin_lock(&block_rsv->lock); 5303 if (block_rsv->reserved >= num_bytes) { 5304 block_rsv->reserved -= num_bytes; 5305 if (block_rsv->reserved < block_rsv->size) 5306 block_rsv->full = 0; 5307 ret = 0; 5308 } 5309 spin_unlock(&block_rsv->lock); 5310 return ret; 5311 } 5312 5313 static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv, 5314 u64 num_bytes, int update_size) 5315 { 5316 spin_lock(&block_rsv->lock); 5317 block_rsv->reserved += num_bytes; 5318 if (update_size) 5319 block_rsv->size += num_bytes; 5320 else if (block_rsv->reserved >= block_rsv->size) 5321 block_rsv->full = 1; 5322 spin_unlock(&block_rsv->lock); 5323 } 5324 5325 int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info, 5326 struct btrfs_block_rsv *dest, u64 num_bytes, 5327 int min_factor) 5328 { 5329 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; 5330 u64 min_bytes; 5331 5332 if (global_rsv->space_info != dest->space_info) 5333 return -ENOSPC; 5334 5335 spin_lock(&global_rsv->lock); 5336 min_bytes = div_factor(global_rsv->size, min_factor); 5337 if (global_rsv->reserved < min_bytes + num_bytes) { 5338 spin_unlock(&global_rsv->lock); 5339 return -ENOSPC; 5340 } 5341 global_rsv->reserved -= num_bytes; 5342 if (global_rsv->reserved < global_rsv->size) 5343 global_rsv->full = 0; 5344 spin_unlock(&global_rsv->lock); 5345 5346 block_rsv_add_bytes(dest, num_bytes, 1); 5347 return 0; 5348 } 5349 5350 /* 5351 * This is for space we already have accounted in space_info->bytes_may_use, so 5352 * basically when we're returning space from block_rsv's. 
5353 */ 5354 static void space_info_add_old_bytes(struct btrfs_fs_info *fs_info, 5355 struct btrfs_space_info *space_info, 5356 u64 num_bytes) 5357 { 5358 struct reserve_ticket *ticket; 5359 struct list_head *head; 5360 u64 used; 5361 enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_NO_FLUSH; 5362 bool check_overcommit = false; 5363 5364 spin_lock(&space_info->lock); 5365 head = &space_info->priority_tickets; 5366 5367 /* 5368 * If we are over our limit then we need to check and see if we can 5369 * overcommit, and if we can't then we just need to free up our space 5370 * and not satisfy any requests. 5371 */ 5372 used = space_info->bytes_used + space_info->bytes_reserved + 5373 space_info->bytes_pinned + space_info->bytes_readonly + 5374 space_info->bytes_may_use; 5375 if (used - num_bytes >= space_info->total_bytes) 5376 check_overcommit = true; 5377 again: 5378 while (!list_empty(head) && num_bytes) { 5379 ticket = list_first_entry(head, struct reserve_ticket, 5380 list); 5381 /* 5382 * We use 0 bytes because this space is already reserved, so 5383 * adding the ticket space would be a double count. 5384 */ 5385 if (check_overcommit && 5386 !can_overcommit(fs_info->extent_root, space_info, 0, 5387 flush)) 5388 break; 5389 if (num_bytes >= ticket->bytes) { 5390 list_del_init(&ticket->list); 5391 num_bytes -= ticket->bytes; 5392 ticket->bytes = 0; 5393 wake_up(&ticket->wait); 5394 } else { 5395 ticket->bytes -= num_bytes; 5396 num_bytes = 0; 5397 } 5398 } 5399 5400 if (num_bytes && head == &space_info->priority_tickets) { 5401 head = &space_info->tickets; 5402 flush = BTRFS_RESERVE_FLUSH_ALL; 5403 goto again; 5404 } 5405 space_info->bytes_may_use -= num_bytes; 5406 trace_btrfs_space_reservation(fs_info, "space_info", 5407 space_info->flags, num_bytes, 0); 5408 spin_unlock(&space_info->lock); 5409 } 5410 5411 /* 5412 * This is for newly allocated space that isn't accounted in 5413 * space_info->bytes_may_use yet. So if we allocate a chunk or unpin an extent 5414 * we use this helper. 
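 *
 * Unlike space_info_add_old_bytes(), satisfying a ticket here also bumps
 * space_info->bytes_may_use, because the ticket holder's reservation was
 * never accounted there in the first place.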
5415 */ 5416 static void space_info_add_new_bytes(struct btrfs_fs_info *fs_info, 5417 struct btrfs_space_info *space_info, 5418 u64 num_bytes) 5419 { 5420 struct reserve_ticket *ticket; 5421 struct list_head *head = &space_info->priority_tickets; 5422 5423 again: 5424 while (!list_empty(head) && num_bytes) { 5425 ticket = list_first_entry(head, struct reserve_ticket, 5426 list); 5427 if (num_bytes >= ticket->bytes) { 5428 trace_btrfs_space_reservation(fs_info, "space_info", 5429 space_info->flags, 5430 ticket->bytes, 1); 5431 list_del_init(&ticket->list); 5432 num_bytes -= ticket->bytes; 5433 space_info->bytes_may_use += ticket->bytes; 5434 ticket->bytes = 0; 5435 wake_up(&ticket->wait); 5436 } else { 5437 trace_btrfs_space_reservation(fs_info, "space_info", 5438 space_info->flags, 5439 num_bytes, 1); 5440 space_info->bytes_may_use += num_bytes; 5441 ticket->bytes -= num_bytes; 5442 num_bytes = 0; 5443 } 5444 } 5445 5446 if (num_bytes && head == &space_info->priority_tickets) { 5447 head = &space_info->tickets; 5448 goto again; 5449 } 5450 } 5451 5452 static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info, 5453 struct btrfs_block_rsv *block_rsv, 5454 struct btrfs_block_rsv *dest, u64 num_bytes) 5455 { 5456 struct btrfs_space_info *space_info = block_rsv->space_info; 5457 5458 spin_lock(&block_rsv->lock); 5459 if (num_bytes == (u64)-1) 5460 num_bytes = block_rsv->size; 5461 block_rsv->size -= num_bytes; 5462 if (block_rsv->reserved >= block_rsv->size) { 5463 num_bytes = block_rsv->reserved - block_rsv->size; 5464 block_rsv->reserved = block_rsv->size; 5465 block_rsv->full = 1; 5466 } else { 5467 num_bytes = 0; 5468 } 5469 spin_unlock(&block_rsv->lock); 5470 5471 if (num_bytes > 0) { 5472 if (dest) { 5473 spin_lock(&dest->lock); 5474 if (!dest->full) { 5475 u64 bytes_to_add; 5476 5477 bytes_to_add = dest->size - dest->reserved; 5478 bytes_to_add = min(num_bytes, bytes_to_add); 5479 dest->reserved += bytes_to_add; 5480 if (dest->reserved >= dest->size) 5481 dest->full = 1; 5482 num_bytes -= bytes_to_add; 5483 } 5484 spin_unlock(&dest->lock); 5485 } 5486 if (num_bytes) 5487 space_info_add_old_bytes(fs_info, space_info, 5488 num_bytes); 5489 } 5490 } 5491 5492 int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src, 5493 struct btrfs_block_rsv *dst, u64 num_bytes, 5494 int update_size) 5495 { 5496 int ret; 5497 5498 ret = block_rsv_use_bytes(src, num_bytes); 5499 if (ret) 5500 return ret; 5501 5502 block_rsv_add_bytes(dst, num_bytes, update_size); 5503 return 0; 5504 } 5505 5506 void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type) 5507 { 5508 memset(rsv, 0, sizeof(*rsv)); 5509 spin_lock_init(&rsv->lock); 5510 rsv->type = type; 5511 } 5512 5513 struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root, 5514 unsigned short type) 5515 { 5516 struct btrfs_block_rsv *block_rsv; 5517 struct btrfs_fs_info *fs_info = root->fs_info; 5518 5519 block_rsv = kmalloc(sizeof(*block_rsv), GFP_NOFS); 5520 if (!block_rsv) 5521 return NULL; 5522 5523 btrfs_init_block_rsv(block_rsv, type); 5524 block_rsv->space_info = __find_space_info(fs_info, 5525 BTRFS_BLOCK_GROUP_METADATA); 5526 return block_rsv; 5527 } 5528 5529 void btrfs_free_block_rsv(struct btrfs_root *root, 5530 struct btrfs_block_rsv *rsv) 5531 { 5532 if (!rsv) 5533 return; 5534 btrfs_block_rsv_release(root, rsv, (u64)-1); 5535 kfree(rsv); 5536 } 5537 5538 void __btrfs_free_block_rsv(struct btrfs_block_rsv *rsv) 5539 { 5540 kfree(rsv); 5541 } 5542 5543 int btrfs_block_rsv_add(struct btrfs_root *root, 5544 
struct btrfs_block_rsv *block_rsv, u64 num_bytes, 5545 enum btrfs_reserve_flush_enum flush) 5546 { 5547 int ret; 5548 5549 if (num_bytes == 0) 5550 return 0; 5551 5552 ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush); 5553 if (!ret) { 5554 block_rsv_add_bytes(block_rsv, num_bytes, 1); 5555 return 0; 5556 } 5557 5558 return ret; 5559 } 5560 5561 int btrfs_block_rsv_check(struct btrfs_root *root, 5562 struct btrfs_block_rsv *block_rsv, int min_factor) 5563 { 5564 u64 num_bytes = 0; 5565 int ret = -ENOSPC; 5566 5567 if (!block_rsv) 5568 return 0; 5569 5570 spin_lock(&block_rsv->lock); 5571 num_bytes = div_factor(block_rsv->size, min_factor); 5572 if (block_rsv->reserved >= num_bytes) 5573 ret = 0; 5574 spin_unlock(&block_rsv->lock); 5575 5576 return ret; 5577 } 5578 5579 int btrfs_block_rsv_refill(struct btrfs_root *root, 5580 struct btrfs_block_rsv *block_rsv, u64 min_reserved, 5581 enum btrfs_reserve_flush_enum flush) 5582 { 5583 u64 num_bytes = 0; 5584 int ret = -ENOSPC; 5585 5586 if (!block_rsv) 5587 return 0; 5588 5589 spin_lock(&block_rsv->lock); 5590 num_bytes = min_reserved; 5591 if (block_rsv->reserved >= num_bytes) 5592 ret = 0; 5593 else 5594 num_bytes -= block_rsv->reserved; 5595 spin_unlock(&block_rsv->lock); 5596 5597 if (!ret) 5598 return 0; 5599 5600 ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush); 5601 if (!ret) { 5602 block_rsv_add_bytes(block_rsv, num_bytes, 0); 5603 return 0; 5604 } 5605 5606 return ret; 5607 } 5608 5609 void btrfs_block_rsv_release(struct btrfs_root *root, 5610 struct btrfs_block_rsv *block_rsv, 5611 u64 num_bytes) 5612 { 5613 struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv; 5614 if (global_rsv == block_rsv || 5615 block_rsv->space_info != global_rsv->space_info) 5616 global_rsv = NULL; 5617 block_rsv_release_bytes(root->fs_info, block_rsv, global_rsv, 5618 num_bytes); 5619 } 5620 5621 static void update_global_block_rsv(struct btrfs_fs_info *fs_info) 5622 { 5623 struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv; 5624 struct btrfs_space_info *sinfo = block_rsv->space_info; 5625 u64 num_bytes; 5626 5627 /* 5628 * The global block rsv is based on the size of the extent tree, the 5629 * checksum tree and the root tree. If the fs is empty we want to set 5630 * it to a minimal amount for safety. 
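 *
 * In practice the reserve size is clamped to the 16MiB..512MiB range (the
 * max_t/min_t below), tracking the combined size of those three trees in
 * between.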
5631 */ 5632 num_bytes = btrfs_root_used(&fs_info->extent_root->root_item) + 5633 btrfs_root_used(&fs_info->csum_root->root_item) + 5634 btrfs_root_used(&fs_info->tree_root->root_item); 5635 num_bytes = max_t(u64, num_bytes, SZ_16M); 5636 5637 spin_lock(&sinfo->lock); 5638 spin_lock(&block_rsv->lock); 5639 5640 block_rsv->size = min_t(u64, num_bytes, SZ_512M); 5641 5642 if (block_rsv->reserved < block_rsv->size) { 5643 num_bytes = sinfo->bytes_used + sinfo->bytes_pinned + 5644 sinfo->bytes_reserved + sinfo->bytes_readonly + 5645 sinfo->bytes_may_use; 5646 if (sinfo->total_bytes > num_bytes) { 5647 num_bytes = sinfo->total_bytes - num_bytes; 5648 num_bytes = min(num_bytes, 5649 block_rsv->size - block_rsv->reserved); 5650 block_rsv->reserved += num_bytes; 5651 sinfo->bytes_may_use += num_bytes; 5652 trace_btrfs_space_reservation(fs_info, "space_info", 5653 sinfo->flags, num_bytes, 5654 1); 5655 } 5656 } else if (block_rsv->reserved > block_rsv->size) { 5657 num_bytes = block_rsv->reserved - block_rsv->size; 5658 sinfo->bytes_may_use -= num_bytes; 5659 trace_btrfs_space_reservation(fs_info, "space_info", 5660 sinfo->flags, num_bytes, 0); 5661 block_rsv->reserved = block_rsv->size; 5662 } 5663 5664 if (block_rsv->reserved == block_rsv->size) 5665 block_rsv->full = 1; 5666 else 5667 block_rsv->full = 0; 5668 5669 spin_unlock(&block_rsv->lock); 5670 spin_unlock(&sinfo->lock); 5671 } 5672 5673 static void init_global_block_rsv(struct btrfs_fs_info *fs_info) 5674 { 5675 struct btrfs_space_info *space_info; 5676 5677 space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM); 5678 fs_info->chunk_block_rsv.space_info = space_info; 5679 5680 space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); 5681 fs_info->global_block_rsv.space_info = space_info; 5682 fs_info->delalloc_block_rsv.space_info = space_info; 5683 fs_info->trans_block_rsv.space_info = space_info; 5684 fs_info->empty_block_rsv.space_info = space_info; 5685 fs_info->delayed_block_rsv.space_info = space_info; 5686 5687 fs_info->extent_root->block_rsv = &fs_info->global_block_rsv; 5688 fs_info->csum_root->block_rsv = &fs_info->global_block_rsv; 5689 fs_info->dev_root->block_rsv = &fs_info->global_block_rsv; 5690 fs_info->tree_root->block_rsv = &fs_info->global_block_rsv; 5691 if (fs_info->quota_root) 5692 fs_info->quota_root->block_rsv = &fs_info->global_block_rsv; 5693 fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv; 5694 5695 update_global_block_rsv(fs_info); 5696 } 5697 5698 static void release_global_block_rsv(struct btrfs_fs_info *fs_info) 5699 { 5700 block_rsv_release_bytes(fs_info, &fs_info->global_block_rsv, NULL, 5701 (u64)-1); 5702 WARN_ON(fs_info->delalloc_block_rsv.size > 0); 5703 WARN_ON(fs_info->delalloc_block_rsv.reserved > 0); 5704 WARN_ON(fs_info->trans_block_rsv.size > 0); 5705 WARN_ON(fs_info->trans_block_rsv.reserved > 0); 5706 WARN_ON(fs_info->chunk_block_rsv.size > 0); 5707 WARN_ON(fs_info->chunk_block_rsv.reserved > 0); 5708 WARN_ON(fs_info->delayed_block_rsv.size > 0); 5709 WARN_ON(fs_info->delayed_block_rsv.reserved > 0); 5710 } 5711 5712 void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, 5713 struct btrfs_root *root) 5714 { 5715 if (!trans->block_rsv) 5716 return; 5717 5718 if (!trans->bytes_reserved) 5719 return; 5720 5721 trace_btrfs_space_reservation(root->fs_info, "transaction", 5722 trans->transid, trans->bytes_reserved, 0); 5723 btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved); 5724 trans->bytes_reserved = 0; 5725 } 5726 5727 /* 5728 * 
To be called after all the new block groups attached to the transaction 5729 * handle have been created (btrfs_create_pending_block_groups()). 5730 */ 5731 void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans) 5732 { 5733 struct btrfs_fs_info *fs_info = trans->fs_info; 5734 5735 if (!trans->chunk_bytes_reserved) 5736 return; 5737 5738 WARN_ON_ONCE(!list_empty(&trans->new_bgs)); 5739 5740 block_rsv_release_bytes(fs_info, &fs_info->chunk_block_rsv, NULL, 5741 trans->chunk_bytes_reserved); 5742 trans->chunk_bytes_reserved = 0; 5743 } 5744 5745 /* Can only return 0 or -ENOSPC */ 5746 int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans, 5747 struct inode *inode) 5748 { 5749 struct btrfs_root *root = BTRFS_I(inode)->root; 5750 /* 5751 * We always use trans->block_rsv here as we will have reserved space 5752 * for our orphan when starting the transaction; using get_block_rsv() 5753 * here will sometimes make us choose the wrong block rsv as we could be 5754 * doing a reloc inode for a non refcounted root. 5755 */ 5756 struct btrfs_block_rsv *src_rsv = trans->block_rsv; 5757 struct btrfs_block_rsv *dst_rsv = root->orphan_block_rsv; 5758 5759 /* 5760 * We need to hold space in order to delete our orphan item once we've 5761 * added it, so this takes the reservation so we can release it later 5762 * when we are truly done with the orphan item. 5763 */ 5764 u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1); 5765 trace_btrfs_space_reservation(root->fs_info, "orphan", 5766 btrfs_ino(inode), num_bytes, 1); 5767 return btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, 1); 5768 } 5769 5770 void btrfs_orphan_release_metadata(struct inode *inode) 5771 { 5772 struct btrfs_root *root = BTRFS_I(inode)->root; 5773 u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1); 5774 trace_btrfs_space_reservation(root->fs_info, "orphan", 5775 btrfs_ino(inode), num_bytes, 0); 5776 btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes); 5777 } 5778 5779 /* 5780 * btrfs_subvolume_reserve_metadata() - reserve space for subvolume operation 5781 * root: the root of the parent directory 5782 * rsv: block reservation 5783 * items: the number of items that we need to reserve 5784 * qgroup_reserved: used to return the reserved size in qgroup 5785 * 5786 * This function is used to reserve the space for snapshot/subvolume 5787 * creation and deletion. Those operations are different from the 5788 * common file/directory operations: they change two fs/file trees 5789 * and the root tree, and the number of items that the qgroup reserves 5790 * differs from the free space reservation. So we cannot use 5791 * the space reservation mechanism in start_transaction().
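 *
 * For example, with quotas enabled and the common 16KiB nodesize, the qgroup
 * side reserves 3 * 16KiB = 48KiB (parent inode plus two dir entries),
 * independent of the item-based reservation made against the metadata
 * space_info below.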
5792 */ 5793 int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, 5794 struct btrfs_block_rsv *rsv, 5795 int items, 5796 u64 *qgroup_reserved, 5797 bool use_global_rsv) 5798 { 5799 u64 num_bytes; 5800 int ret; 5801 struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv; 5802 5803 if (root->fs_info->quota_enabled) { 5804 /* One for parent inode, two for dir entries */ 5805 num_bytes = 3 * root->nodesize; 5806 ret = btrfs_qgroup_reserve_meta(root, num_bytes); 5807 if (ret) 5808 return ret; 5809 } else { 5810 num_bytes = 0; 5811 } 5812 5813 *qgroup_reserved = num_bytes; 5814 5815 num_bytes = btrfs_calc_trans_metadata_size(root, items); 5816 rsv->space_info = __find_space_info(root->fs_info, 5817 BTRFS_BLOCK_GROUP_METADATA); 5818 ret = btrfs_block_rsv_add(root, rsv, num_bytes, 5819 BTRFS_RESERVE_FLUSH_ALL); 5820 5821 if (ret == -ENOSPC && use_global_rsv) 5822 ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes, 1); 5823 5824 if (ret && *qgroup_reserved) 5825 btrfs_qgroup_free_meta(root, *qgroup_reserved); 5826 5827 return ret; 5828 } 5829 5830 void btrfs_subvolume_release_metadata(struct btrfs_root *root, 5831 struct btrfs_block_rsv *rsv, 5832 u64 qgroup_reserved) 5833 { 5834 btrfs_block_rsv_release(root, rsv, (u64)-1); 5835 } 5836 5837 /** 5838 * drop_outstanding_extent - drop an outstanding extent 5839 * @inode: the inode we're dropping the extent for 5840 * @num_bytes: the number of bytes we're releasing. 5841 * 5842 * This is called when we are freeing up an outstanding extent, either called 5843 * after an error or after an extent is written. This will return the number of 5844 * reserved extents that need to be freed. This must be called with 5845 * BTRFS_I(inode)->lock held. 5846 */ 5847 static unsigned drop_outstanding_extent(struct inode *inode, u64 num_bytes) 5848 { 5849 unsigned drop_inode_space = 0; 5850 unsigned dropped_extents = 0; 5851 unsigned num_extents = 0; 5852 5853 num_extents = (unsigned)div64_u64(num_bytes + 5854 BTRFS_MAX_EXTENT_SIZE - 1, 5855 BTRFS_MAX_EXTENT_SIZE); 5856 ASSERT(num_extents); 5857 ASSERT(BTRFS_I(inode)->outstanding_extents >= num_extents); 5858 BTRFS_I(inode)->outstanding_extents -= num_extents; 5859 5860 if (BTRFS_I(inode)->outstanding_extents == 0 && 5861 test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED, 5862 &BTRFS_I(inode)->runtime_flags)) 5863 drop_inode_space = 1; 5864 5865 /* 5866 * If we have more or the same amount of outstanding extents than we have 5867 * reserved then we need to leave the reserved extents count alone. 5868 */ 5869 if (BTRFS_I(inode)->outstanding_extents >= 5870 BTRFS_I(inode)->reserved_extents) 5871 return drop_inode_space; 5872 5873 dropped_extents = BTRFS_I(inode)->reserved_extents - 5874 BTRFS_I(inode)->outstanding_extents; 5875 BTRFS_I(inode)->reserved_extents -= dropped_extents; 5876 return dropped_extents + drop_inode_space; 5877 } 5878 5879 /** 5880 * calc_csum_metadata_size - return the amount of metadata space that must be 5881 * reserved/freed for the given bytes. 5882 * @inode: the inode we're manipulating 5883 * @num_bytes: the number of bytes in question 5884 * @reserve: 1 if we are reserving space, 0 if we are freeing space 5885 * 5886 * This adjusts the number of csum_bytes in the inode and then returns the 5887 * correct amount of metadata that must either be reserved or freed. 
We 5888 * calculate how many checksums we can fit into one leaf and then divide the 5889 * number of bytes that will need to be checksumed by this value to figure out 5890 * how many checksums will be required. If we are adding bytes then the number 5891 * may go up and we will return the number of additional bytes that must be 5892 * reserved. If it is going down we will return the number of bytes that must 5893 * be freed. 5894 * 5895 * This must be called with BTRFS_I(inode)->lock held. 5896 */ 5897 static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes, 5898 int reserve) 5899 { 5900 struct btrfs_root *root = BTRFS_I(inode)->root; 5901 u64 old_csums, num_csums; 5902 5903 if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM && 5904 BTRFS_I(inode)->csum_bytes == 0) 5905 return 0; 5906 5907 old_csums = btrfs_csum_bytes_to_leaves(root, BTRFS_I(inode)->csum_bytes); 5908 if (reserve) 5909 BTRFS_I(inode)->csum_bytes += num_bytes; 5910 else 5911 BTRFS_I(inode)->csum_bytes -= num_bytes; 5912 num_csums = btrfs_csum_bytes_to_leaves(root, BTRFS_I(inode)->csum_bytes); 5913 5914 /* No change, no need to reserve more */ 5915 if (old_csums == num_csums) 5916 return 0; 5917 5918 if (reserve) 5919 return btrfs_calc_trans_metadata_size(root, 5920 num_csums - old_csums); 5921 5922 return btrfs_calc_trans_metadata_size(root, old_csums - num_csums); 5923 } 5924 5925 int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) 5926 { 5927 struct btrfs_root *root = BTRFS_I(inode)->root; 5928 struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv; 5929 u64 to_reserve = 0; 5930 u64 csum_bytes; 5931 unsigned nr_extents = 0; 5932 enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL; 5933 int ret = 0; 5934 bool delalloc_lock = true; 5935 u64 to_free = 0; 5936 unsigned dropped; 5937 bool release_extra = false; 5938 5939 /* If we are a free space inode we need to not flush since we will be in 5940 * the middle of a transaction commit. We also don't need the delalloc 5941 * mutex since we won't race with anybody. We need this mostly to make 5942 * lockdep shut its filthy mouth. 5943 * 5944 * If we have a transaction open (can happen if we call truncate_block 5945 * from truncate), then we need FLUSH_LIMIT so we don't deadlock. 5946 */ 5947 if (btrfs_is_free_space_inode(inode)) { 5948 flush = BTRFS_RESERVE_NO_FLUSH; 5949 delalloc_lock = false; 5950 } else if (current->journal_info) { 5951 flush = BTRFS_RESERVE_FLUSH_LIMIT; 5952 } 5953 5954 if (flush != BTRFS_RESERVE_NO_FLUSH && 5955 btrfs_transaction_in_commit(root->fs_info)) 5956 schedule_timeout(1); 5957 5958 if (delalloc_lock) 5959 mutex_lock(&BTRFS_I(inode)->delalloc_mutex); 5960 5961 num_bytes = ALIGN(num_bytes, root->sectorsize); 5962 5963 spin_lock(&BTRFS_I(inode)->lock); 5964 nr_extents = (unsigned)div64_u64(num_bytes + 5965 BTRFS_MAX_EXTENT_SIZE - 1, 5966 BTRFS_MAX_EXTENT_SIZE); 5967 BTRFS_I(inode)->outstanding_extents += nr_extents; 5968 5969 nr_extents = 0; 5970 if (BTRFS_I(inode)->outstanding_extents > 5971 BTRFS_I(inode)->reserved_extents) 5972 nr_extents += BTRFS_I(inode)->outstanding_extents - 5973 BTRFS_I(inode)->reserved_extents; 5974 5975 /* We always want to reserve a slot for updating the inode. 
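 * For example (illustrative only): a buffered write that adds a single new
 * outstanding extent to an inode with nothing reserved yet leaves
 * nr_extents at 1, so the line below sizes the reservation for
 * nr_extents + 1 = 2 metadata items, plus whatever
 * calc_csum_metadata_size() adds for the new checksum bytes.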
*/ 5976 to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents + 1); 5977 to_reserve += calc_csum_metadata_size(inode, num_bytes, 1); 5978 csum_bytes = BTRFS_I(inode)->csum_bytes; 5979 spin_unlock(&BTRFS_I(inode)->lock); 5980 5981 if (root->fs_info->quota_enabled) { 5982 ret = btrfs_qgroup_reserve_meta(root, 5983 nr_extents * root->nodesize); 5984 if (ret) 5985 goto out_fail; 5986 } 5987 5988 ret = btrfs_block_rsv_add(root, block_rsv, to_reserve, flush); 5989 if (unlikely(ret)) { 5990 btrfs_qgroup_free_meta(root, nr_extents * root->nodesize); 5991 goto out_fail; 5992 } 5993 5994 spin_lock(&BTRFS_I(inode)->lock); 5995 if (test_and_set_bit(BTRFS_INODE_DELALLOC_META_RESERVED, 5996 &BTRFS_I(inode)->runtime_flags)) { 5997 to_reserve -= btrfs_calc_trans_metadata_size(root, 1); 5998 release_extra = true; 5999 } 6000 BTRFS_I(inode)->reserved_extents += nr_extents; 6001 spin_unlock(&BTRFS_I(inode)->lock); 6002 6003 if (delalloc_lock) 6004 mutex_unlock(&BTRFS_I(inode)->delalloc_mutex); 6005 6006 if (to_reserve) 6007 trace_btrfs_space_reservation(root->fs_info, "delalloc", 6008 btrfs_ino(inode), to_reserve, 1); 6009 if (release_extra) 6010 btrfs_block_rsv_release(root, block_rsv, 6011 btrfs_calc_trans_metadata_size(root, 6012 1)); 6013 return 0; 6014 6015 out_fail: 6016 spin_lock(&BTRFS_I(inode)->lock); 6017 dropped = drop_outstanding_extent(inode, num_bytes); 6018 /* 6019 * If the inodes csum_bytes is the same as the original 6020 * csum_bytes then we know we haven't raced with any free()ers 6021 * so we can just reduce our inodes csum bytes and carry on. 6022 */ 6023 if (BTRFS_I(inode)->csum_bytes == csum_bytes) { 6024 calc_csum_metadata_size(inode, num_bytes, 0); 6025 } else { 6026 u64 orig_csum_bytes = BTRFS_I(inode)->csum_bytes; 6027 u64 bytes; 6028 6029 /* 6030 * This is tricky, but first we need to figure out how much we 6031 * freed from any free-ers that occurred during this 6032 * reservation, so we reset ->csum_bytes to the csum_bytes 6033 * before we dropped our lock, and then call the free for the 6034 * number of bytes that were freed while we were trying our 6035 * reservation. 6036 */ 6037 bytes = csum_bytes - BTRFS_I(inode)->csum_bytes; 6038 BTRFS_I(inode)->csum_bytes = csum_bytes; 6039 to_free = calc_csum_metadata_size(inode, bytes, 0); 6040 6041 6042 /* 6043 * Now we need to see how much we would have freed had we not 6044 * been making this reservation and our ->csum_bytes were not 6045 * artificially inflated. 6046 */ 6047 BTRFS_I(inode)->csum_bytes = csum_bytes - num_bytes; 6048 bytes = csum_bytes - orig_csum_bytes; 6049 bytes = calc_csum_metadata_size(inode, bytes, 0); 6050 6051 /* 6052 * Now reset ->csum_bytes to what it should be. If bytes is 6053 * more than to_free then we would have freed more space had we 6054 * not had an artificially high ->csum_bytes, so we need to free 6055 * the remainder. If bytes is the same or less then we don't 6056 * need to do anything, the other free-ers did the correct 6057 * thing. 
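 *
 * Informal restatement of the above: to_free is what the concurrent
 * freers actually released (computed while ->csum_bytes was still
 * artificially inflated by this failed reservation), bytes is what they
 * would have released had ->csum_bytes never been inflated, and when the
 * latter is larger we release the difference ourselves so the block rsv
 * ends up exactly as if this reservation had never happened.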
6058 */ 6059 BTRFS_I(inode)->csum_bytes = orig_csum_bytes - num_bytes; 6060 if (bytes > to_free) 6061 to_free = bytes - to_free; 6062 else 6063 to_free = 0; 6064 } 6065 spin_unlock(&BTRFS_I(inode)->lock); 6066 if (dropped) 6067 to_free += btrfs_calc_trans_metadata_size(root, dropped); 6068 6069 if (to_free) { 6070 btrfs_block_rsv_release(root, block_rsv, to_free); 6071 trace_btrfs_space_reservation(root->fs_info, "delalloc", 6072 btrfs_ino(inode), to_free, 0); 6073 } 6074 if (delalloc_lock) 6075 mutex_unlock(&BTRFS_I(inode)->delalloc_mutex); 6076 return ret; 6077 } 6078 6079 /** 6080 * btrfs_delalloc_release_metadata - release a metadata reservation for an inode 6081 * @inode: the inode to release the reservation for 6082 * @num_bytes: the number of bytes we're releasing 6083 * 6084 * This will release the metadata reservation for an inode. This can be called 6085 * once we complete IO for a given set of bytes to release their metadata 6086 * reservations. 6087 */ 6088 void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes) 6089 { 6090 struct btrfs_root *root = BTRFS_I(inode)->root; 6091 u64 to_free = 0; 6092 unsigned dropped; 6093 6094 num_bytes = ALIGN(num_bytes, root->sectorsize); 6095 spin_lock(&BTRFS_I(inode)->lock); 6096 dropped = drop_outstanding_extent(inode, num_bytes); 6097 6098 if (num_bytes) 6099 to_free = calc_csum_metadata_size(inode, num_bytes, 0); 6100 spin_unlock(&BTRFS_I(inode)->lock); 6101 if (dropped > 0) 6102 to_free += btrfs_calc_trans_metadata_size(root, dropped); 6103 6104 if (btrfs_is_testing(root->fs_info)) 6105 return; 6106 6107 trace_btrfs_space_reservation(root->fs_info, "delalloc", 6108 btrfs_ino(inode), to_free, 0); 6109 6110 btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv, 6111 to_free); 6112 } 6113 6114 /** 6115 * btrfs_delalloc_reserve_space - reserve data and metadata space for 6116 * delalloc 6117 * @inode: inode we're writing to 6118 * @start: start range we are writing to 6119 * @len: how long the range we are writing to 6120 * 6121 * TODO: This function will finally replace old btrfs_delalloc_reserve_space() 6122 * 6123 * This will do the following things 6124 * 6125 * o reserve space in data space info for num bytes 6126 * and reserve precious corresponding qgroup space 6127 * (Done in check_data_free_space) 6128 * 6129 * o reserve space for metadata space, based on the number of outstanding 6130 * extents and how much csums will be needed 6131 * also reserve metadata space in a per root over-reserve method. 6132 * o add to the inodes->delalloc_bytes 6133 * o add it to the fs_info's delalloc inodes list. 6134 * (Above 3 all done in delalloc_reserve_metadata) 6135 * 6136 * Return 0 for success 6137 * Return <0 for error(-ENOSPC or -EQUOT) 6138 */ 6139 int btrfs_delalloc_reserve_space(struct inode *inode, u64 start, u64 len) 6140 { 6141 int ret; 6142 6143 ret = btrfs_check_data_free_space(inode, start, len); 6144 if (ret < 0) 6145 return ret; 6146 ret = btrfs_delalloc_reserve_metadata(inode, len); 6147 if (ret < 0) 6148 btrfs_free_reserved_data_space(inode, start, len); 6149 return ret; 6150 } 6151 6152 /** 6153 * btrfs_delalloc_release_space - release data and metadata space for delalloc 6154 * @inode: inode we're releasing space for 6155 * @start: start position of the space already reserved 6156 * @len: the len of the space already reserved 6157 * 6158 * This must be matched with a call to btrfs_delalloc_reserve_space. 
This is 6159 * called in the case that we don't need the metadata AND data reservations 6160 * anymore. So if there is an error or we insert an inline extent. 6161 * 6162 * This function will release the metadata space that was not used and will 6163 * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes 6164 * list if there are no delalloc bytes left. 6165 * Also it will handle the qgroup reserved space. 6166 */ 6167 void btrfs_delalloc_release_space(struct inode *inode, u64 start, u64 len) 6168 { 6169 btrfs_delalloc_release_metadata(inode, len); 6170 btrfs_free_reserved_data_space(inode, start, len); 6171 } 6172 6173 static int update_block_group(struct btrfs_trans_handle *trans, 6174 struct btrfs_root *root, u64 bytenr, 6175 u64 num_bytes, int alloc) 6176 { 6177 struct btrfs_block_group_cache *cache = NULL; 6178 struct btrfs_fs_info *info = root->fs_info; 6179 u64 total = num_bytes; 6180 u64 old_val; 6181 u64 byte_in_group; 6182 int factor; 6183 6184 /* block accounting for super block */ 6185 spin_lock(&info->delalloc_root_lock); 6186 old_val = btrfs_super_bytes_used(info->super_copy); 6187 if (alloc) 6188 old_val += num_bytes; 6189 else 6190 old_val -= num_bytes; 6191 btrfs_set_super_bytes_used(info->super_copy, old_val); 6192 spin_unlock(&info->delalloc_root_lock); 6193 6194 while (total) { 6195 cache = btrfs_lookup_block_group(info, bytenr); 6196 if (!cache) 6197 return -ENOENT; 6198 if (cache->flags & (BTRFS_BLOCK_GROUP_DUP | 6199 BTRFS_BLOCK_GROUP_RAID1 | 6200 BTRFS_BLOCK_GROUP_RAID10)) 6201 factor = 2; 6202 else 6203 factor = 1; 6204 /* 6205 * If this block group has free space cache written out, we 6206 * need to make sure to load it if we are removing space. This 6207 * is because we need the unpinning stage to actually add the 6208 * space back to the block group, otherwise we will leak space. 
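 * (The unpinning stage referred to here is unpin_extent_range(), which
 * hands ranges below last_byte_to_unpin back via btrfs_add_free_space().)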
6209 */ 6210 if (!alloc && cache->cached == BTRFS_CACHE_NO) 6211 cache_block_group(cache, 1); 6212 6213 byte_in_group = bytenr - cache->key.objectid; 6214 WARN_ON(byte_in_group > cache->key.offset); 6215 6216 spin_lock(&cache->space_info->lock); 6217 spin_lock(&cache->lock); 6218 6219 if (btrfs_test_opt(root->fs_info, SPACE_CACHE) && 6220 cache->disk_cache_state < BTRFS_DC_CLEAR) 6221 cache->disk_cache_state = BTRFS_DC_CLEAR; 6222 6223 old_val = btrfs_block_group_used(&cache->item); 6224 num_bytes = min(total, cache->key.offset - byte_in_group); 6225 if (alloc) { 6226 old_val += num_bytes; 6227 btrfs_set_block_group_used(&cache->item, old_val); 6228 cache->reserved -= num_bytes; 6229 cache->space_info->bytes_reserved -= num_bytes; 6230 cache->space_info->bytes_used += num_bytes; 6231 cache->space_info->disk_used += num_bytes * factor; 6232 spin_unlock(&cache->lock); 6233 spin_unlock(&cache->space_info->lock); 6234 } else { 6235 old_val -= num_bytes; 6236 btrfs_set_block_group_used(&cache->item, old_val); 6237 cache->pinned += num_bytes; 6238 cache->space_info->bytes_pinned += num_bytes; 6239 cache->space_info->bytes_used -= num_bytes; 6240 cache->space_info->disk_used -= num_bytes * factor; 6241 spin_unlock(&cache->lock); 6242 spin_unlock(&cache->space_info->lock); 6243 6244 trace_btrfs_space_reservation(root->fs_info, "pinned", 6245 cache->space_info->flags, 6246 num_bytes, 1); 6247 set_extent_dirty(info->pinned_extents, 6248 bytenr, bytenr + num_bytes - 1, 6249 GFP_NOFS | __GFP_NOFAIL); 6250 } 6251 6252 spin_lock(&trans->transaction->dirty_bgs_lock); 6253 if (list_empty(&cache->dirty_list)) { 6254 list_add_tail(&cache->dirty_list, 6255 &trans->transaction->dirty_bgs); 6256 trans->transaction->num_dirty_bgs++; 6257 btrfs_get_block_group(cache); 6258 } 6259 spin_unlock(&trans->transaction->dirty_bgs_lock); 6260 6261 /* 6262 * No longer have used bytes in this block group, queue it for 6263 * deletion. We do this after adding the block group to the 6264 * dirty list to avoid races between cleaner kthread and space 6265 * cache writeout. 
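 * (Queueing here means adding the group to fs_info->unused_bgs below; the
 * cleaner thread is what later decides whether to actually remove it.)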
6266 */ 6267 if (!alloc && old_val == 0) { 6268 spin_lock(&info->unused_bgs_lock); 6269 if (list_empty(&cache->bg_list)) { 6270 btrfs_get_block_group(cache); 6271 list_add_tail(&cache->bg_list, 6272 &info->unused_bgs); 6273 } 6274 spin_unlock(&info->unused_bgs_lock); 6275 } 6276 6277 btrfs_put_block_group(cache); 6278 total -= num_bytes; 6279 bytenr += num_bytes; 6280 } 6281 return 0; 6282 } 6283 6284 static u64 first_logical_byte(struct btrfs_root *root, u64 search_start) 6285 { 6286 struct btrfs_block_group_cache *cache; 6287 u64 bytenr; 6288 6289 spin_lock(&root->fs_info->block_group_cache_lock); 6290 bytenr = root->fs_info->first_logical_byte; 6291 spin_unlock(&root->fs_info->block_group_cache_lock); 6292 6293 if (bytenr < (u64)-1) 6294 return bytenr; 6295 6296 cache = btrfs_lookup_first_block_group(root->fs_info, search_start); 6297 if (!cache) 6298 return 0; 6299 6300 bytenr = cache->key.objectid; 6301 btrfs_put_block_group(cache); 6302 6303 return bytenr; 6304 } 6305 6306 static int pin_down_extent(struct btrfs_root *root, 6307 struct btrfs_block_group_cache *cache, 6308 u64 bytenr, u64 num_bytes, int reserved) 6309 { 6310 spin_lock(&cache->space_info->lock); 6311 spin_lock(&cache->lock); 6312 cache->pinned += num_bytes; 6313 cache->space_info->bytes_pinned += num_bytes; 6314 if (reserved) { 6315 cache->reserved -= num_bytes; 6316 cache->space_info->bytes_reserved -= num_bytes; 6317 } 6318 spin_unlock(&cache->lock); 6319 spin_unlock(&cache->space_info->lock); 6320 6321 trace_btrfs_space_reservation(root->fs_info, "pinned", 6322 cache->space_info->flags, num_bytes, 1); 6323 set_extent_dirty(root->fs_info->pinned_extents, bytenr, 6324 bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL); 6325 return 0; 6326 } 6327 6328 /* 6329 * this function must be called within transaction 6330 */ 6331 int btrfs_pin_extent(struct btrfs_root *root, 6332 u64 bytenr, u64 num_bytes, int reserved) 6333 { 6334 struct btrfs_block_group_cache *cache; 6335 6336 cache = btrfs_lookup_block_group(root->fs_info, bytenr); 6337 BUG_ON(!cache); /* Logic error */ 6338 6339 pin_down_extent(root, cache, bytenr, num_bytes, reserved); 6340 6341 btrfs_put_block_group(cache); 6342 return 0; 6343 } 6344 6345 /* 6346 * this function must be called within transaction 6347 */ 6348 int btrfs_pin_extent_for_log_replay(struct btrfs_root *root, 6349 u64 bytenr, u64 num_bytes) 6350 { 6351 struct btrfs_block_group_cache *cache; 6352 int ret; 6353 6354 cache = btrfs_lookup_block_group(root->fs_info, bytenr); 6355 if (!cache) 6356 return -EINVAL; 6357 6358 /* 6359 * pull in the free space cache (if any) so that our pin 6360 * removes the free space from the cache. We have load_only set 6361 * to one because the slow code to read in the free extents does check 6362 * the pinned extents. 
6363 */ 6364 cache_block_group(cache, 1); 6365 6366 pin_down_extent(root, cache, bytenr, num_bytes, 0); 6367 6368 /* remove us from the free space cache (if we're there at all) */ 6369 ret = btrfs_remove_free_space(cache, bytenr, num_bytes); 6370 btrfs_put_block_group(cache); 6371 return ret; 6372 } 6373 6374 static int __exclude_logged_extent(struct btrfs_root *root, u64 start, u64 num_bytes) 6375 { 6376 int ret; 6377 struct btrfs_block_group_cache *block_group; 6378 struct btrfs_caching_control *caching_ctl; 6379 6380 block_group = btrfs_lookup_block_group(root->fs_info, start); 6381 if (!block_group) 6382 return -EINVAL; 6383 6384 cache_block_group(block_group, 0); 6385 caching_ctl = get_caching_control(block_group); 6386 6387 if (!caching_ctl) { 6388 /* Logic error */ 6389 BUG_ON(!block_group_cache_done(block_group)); 6390 ret = btrfs_remove_free_space(block_group, start, num_bytes); 6391 } else { 6392 mutex_lock(&caching_ctl->mutex); 6393 6394 if (start >= caching_ctl->progress) { 6395 ret = add_excluded_extent(root, start, num_bytes); 6396 } else if (start + num_bytes <= caching_ctl->progress) { 6397 ret = btrfs_remove_free_space(block_group, 6398 start, num_bytes); 6399 } else { 6400 num_bytes = caching_ctl->progress - start; 6401 ret = btrfs_remove_free_space(block_group, 6402 start, num_bytes); 6403 if (ret) 6404 goto out_lock; 6405 6406 num_bytes = (start + num_bytes) - 6407 caching_ctl->progress; 6408 start = caching_ctl->progress; 6409 ret = add_excluded_extent(root, start, num_bytes); 6410 } 6411 out_lock: 6412 mutex_unlock(&caching_ctl->mutex); 6413 put_caching_control(caching_ctl); 6414 } 6415 btrfs_put_block_group(block_group); 6416 return ret; 6417 } 6418 6419 int btrfs_exclude_logged_extents(struct btrfs_root *log, 6420 struct extent_buffer *eb) 6421 { 6422 struct btrfs_file_extent_item *item; 6423 struct btrfs_key key; 6424 int found_type; 6425 int i; 6426 6427 if (!btrfs_fs_incompat(log->fs_info, MIXED_GROUPS)) 6428 return 0; 6429 6430 for (i = 0; i < btrfs_header_nritems(eb); i++) { 6431 btrfs_item_key_to_cpu(eb, &key, i); 6432 if (key.type != BTRFS_EXTENT_DATA_KEY) 6433 continue; 6434 item = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item); 6435 found_type = btrfs_file_extent_type(eb, item); 6436 if (found_type == BTRFS_FILE_EXTENT_INLINE) 6437 continue; 6438 if (btrfs_file_extent_disk_bytenr(eb, item) == 0) 6439 continue; 6440 key.objectid = btrfs_file_extent_disk_bytenr(eb, item); 6441 key.offset = btrfs_file_extent_disk_num_bytes(eb, item); 6442 __exclude_logged_extent(log, key.objectid, key.offset); 6443 } 6444 6445 return 0; 6446 } 6447 6448 static void 6449 btrfs_inc_block_group_reservations(struct btrfs_block_group_cache *bg) 6450 { 6451 atomic_inc(&bg->reservations); 6452 } 6453 6454 void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info, 6455 const u64 start) 6456 { 6457 struct btrfs_block_group_cache *bg; 6458 6459 bg = btrfs_lookup_block_group(fs_info, start); 6460 ASSERT(bg); 6461 if (atomic_dec_and_test(&bg->reservations)) 6462 wake_up_atomic_t(&bg->reservations); 6463 btrfs_put_block_group(bg); 6464 } 6465 6466 static int btrfs_wait_bg_reservations_atomic_t(atomic_t *a) 6467 { 6468 schedule(); 6469 return 0; 6470 } 6471 6472 void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg) 6473 { 6474 struct btrfs_space_info *space_info = bg->space_info; 6475 6476 ASSERT(bg->ro); 6477 6478 if (!(bg->flags & BTRFS_BLOCK_GROUP_DATA)) 6479 return; 6480 6481 /* 6482 * Our block group is read only but before we set it to read 
only, 6483 * some task might have allocated an extent from it already, but it 6484 * has not yet created a respective ordered extent (and added it to a 6485 * root's list of ordered extents). 6486 * Therefore wait for any task currently allocating extents, since the 6487 * block group's reservations counter is incremented while a read lock 6488 * on the groups' semaphore is held and decremented after releasing 6489 * the read access on that semaphore and creating the ordered extent. 6490 */ 6491 down_write(&space_info->groups_sem); 6492 up_write(&space_info->groups_sem); 6493 6494 wait_on_atomic_t(&bg->reservations, 6495 btrfs_wait_bg_reservations_atomic_t, 6496 TASK_UNINTERRUPTIBLE); 6497 } 6498 6499 /** 6500 * btrfs_update_reserved_bytes - update the block_group and space info counters 6501 * @cache: The cache we are manipulating 6502 * @num_bytes: The number of bytes in question 6503 * @reserve: One of the reservation enums 6504 * @delalloc: The blocks are allocated for the delalloc write 6505 * 6506 * This is called by the allocator when it reserves space, or by somebody who is 6507 * freeing space that was never actually used on disk. For example, if you 6508 * reserve some space for a new leaf in transaction A and before transaction A 6509 * commits you free that leaf, you call this with reserve set to 0 in order to 6510 * clear the reservation. 6511 * 6512 * Metadata reservations should be called with RESERVE_ALLOC so we do the proper 6513 * ENOSPC accounting. For data we handle the reservation through clearing the 6514 * delalloc bits in the io_tree. We have to do this since we could end up 6515 * allocating less disk space for the amount of data we have reserved in the 6516 * case of compression. 6517 * 6518 * If this is a reservation and the block group has become read only we cannot 6519 * make the reservation and return -EAGAIN; otherwise this function always 6520 * succeeds.
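 *
 * Sketch of how the callers in this file use it (informal): find_free_extent()
 * calls this with RESERVE_ALLOC (or RESERVE_ALLOC_NO_ACCOUNT for data
 * allocations) once it has picked a range, while paths that hand the space
 * back, such as btrfs_free_tree_block(), call it with RESERVE_FREE.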
6521 */ 6522 static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, 6523 u64 num_bytes, int reserve, int delalloc) 6524 { 6525 struct btrfs_space_info *space_info = cache->space_info; 6526 int ret = 0; 6527 6528 spin_lock(&space_info->lock); 6529 spin_lock(&cache->lock); 6530 if (reserve != RESERVE_FREE) { 6531 if (cache->ro) { 6532 ret = -EAGAIN; 6533 } else { 6534 cache->reserved += num_bytes; 6535 space_info->bytes_reserved += num_bytes; 6536 if (reserve == RESERVE_ALLOC) { 6537 trace_btrfs_space_reservation(cache->fs_info, 6538 "space_info", space_info->flags, 6539 num_bytes, 0); 6540 space_info->bytes_may_use -= num_bytes; 6541 } 6542 6543 if (delalloc) 6544 cache->delalloc_bytes += num_bytes; 6545 } 6546 } else { 6547 if (cache->ro) 6548 space_info->bytes_readonly += num_bytes; 6549 cache->reserved -= num_bytes; 6550 space_info->bytes_reserved -= num_bytes; 6551 6552 if (delalloc) 6553 cache->delalloc_bytes -= num_bytes; 6554 } 6555 spin_unlock(&cache->lock); 6556 spin_unlock(&space_info->lock); 6557 return ret; 6558 } 6559 6560 void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, 6561 struct btrfs_root *root) 6562 { 6563 struct btrfs_fs_info *fs_info = root->fs_info; 6564 struct btrfs_caching_control *next; 6565 struct btrfs_caching_control *caching_ctl; 6566 struct btrfs_block_group_cache *cache; 6567 6568 down_write(&fs_info->commit_root_sem); 6569 6570 list_for_each_entry_safe(caching_ctl, next, 6571 &fs_info->caching_block_groups, list) { 6572 cache = caching_ctl->block_group; 6573 if (block_group_cache_done(cache)) { 6574 cache->last_byte_to_unpin = (u64)-1; 6575 list_del_init(&caching_ctl->list); 6576 put_caching_control(caching_ctl); 6577 } else { 6578 cache->last_byte_to_unpin = caching_ctl->progress; 6579 } 6580 } 6581 6582 if (fs_info->pinned_extents == &fs_info->freed_extents[0]) 6583 fs_info->pinned_extents = &fs_info->freed_extents[1]; 6584 else 6585 fs_info->pinned_extents = &fs_info->freed_extents[0]; 6586 6587 up_write(&fs_info->commit_root_sem); 6588 6589 update_global_block_rsv(fs_info); 6590 } 6591 6592 /* 6593 * Returns the free cluster for the given space info and sets empty_cluster to 6594 * what it should be based on the mount options. 
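 *
 * Concretely (per the logic below): mixed block groups never get a cluster;
 * metadata uses meta_alloc_cluster with a 64K empty_cluster, or 2M when
 * mounted with ssd; data only clusters on ssd mounts, using
 * data_alloc_cluster with a 2M empty_cluster.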
6595 */ 6596 static struct btrfs_free_cluster * 6597 fetch_cluster_info(struct btrfs_root *root, struct btrfs_space_info *space_info, 6598 u64 *empty_cluster) 6599 { 6600 struct btrfs_free_cluster *ret = NULL; 6601 bool ssd = btrfs_test_opt(root->fs_info, SSD); 6602 6603 *empty_cluster = 0; 6604 if (btrfs_mixed_space_info(space_info)) 6605 return ret; 6606 6607 if (ssd) 6608 *empty_cluster = SZ_2M; 6609 if (space_info->flags & BTRFS_BLOCK_GROUP_METADATA) { 6610 ret = &root->fs_info->meta_alloc_cluster; 6611 if (!ssd) 6612 *empty_cluster = SZ_64K; 6613 } else if ((space_info->flags & BTRFS_BLOCK_GROUP_DATA) && ssd) { 6614 ret = &root->fs_info->data_alloc_cluster; 6615 } 6616 6617 return ret; 6618 } 6619 6620 static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end, 6621 const bool return_free_space) 6622 { 6623 struct btrfs_fs_info *fs_info = root->fs_info; 6624 struct btrfs_block_group_cache *cache = NULL; 6625 struct btrfs_space_info *space_info; 6626 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; 6627 struct btrfs_free_cluster *cluster = NULL; 6628 u64 len; 6629 u64 total_unpinned = 0; 6630 u64 empty_cluster = 0; 6631 bool readonly; 6632 6633 while (start <= end) { 6634 readonly = false; 6635 if (!cache || 6636 start >= cache->key.objectid + cache->key.offset) { 6637 if (cache) 6638 btrfs_put_block_group(cache); 6639 total_unpinned = 0; 6640 cache = btrfs_lookup_block_group(fs_info, start); 6641 BUG_ON(!cache); /* Logic error */ 6642 6643 cluster = fetch_cluster_info(root, 6644 cache->space_info, 6645 &empty_cluster); 6646 empty_cluster <<= 1; 6647 } 6648 6649 len = cache->key.objectid + cache->key.offset - start; 6650 len = min(len, end + 1 - start); 6651 6652 if (start < cache->last_byte_to_unpin) { 6653 len = min(len, cache->last_byte_to_unpin - start); 6654 if (return_free_space) 6655 btrfs_add_free_space(cache, start, len); 6656 } 6657 6658 start += len; 6659 total_unpinned += len; 6660 space_info = cache->space_info; 6661 6662 /* 6663 * If this space cluster has been marked as fragmented and we've 6664 * unpinned enough in this block group to potentially allow a 6665 * cluster to be created inside of it go ahead and clear the 6666 * fragmented check. 
6667 */ 6668 if (cluster && cluster->fragmented && 6669 total_unpinned > empty_cluster) { 6670 spin_lock(&cluster->lock); 6671 cluster->fragmented = 0; 6672 spin_unlock(&cluster->lock); 6673 } 6674 6675 spin_lock(&space_info->lock); 6676 spin_lock(&cache->lock); 6677 cache->pinned -= len; 6678 space_info->bytes_pinned -= len; 6679 6680 trace_btrfs_space_reservation(fs_info, "pinned", 6681 space_info->flags, len, 0); 6682 space_info->max_extent_size = 0; 6683 percpu_counter_add(&space_info->total_bytes_pinned, -len); 6684 if (cache->ro) { 6685 space_info->bytes_readonly += len; 6686 readonly = true; 6687 } 6688 spin_unlock(&cache->lock); 6689 if (!readonly && return_free_space && 6690 global_rsv->space_info == space_info) { 6691 u64 to_add = len; 6692 WARN_ON(!return_free_space); 6693 spin_lock(&global_rsv->lock); 6694 if (!global_rsv->full) { 6695 to_add = min(len, global_rsv->size - 6696 global_rsv->reserved); 6697 global_rsv->reserved += to_add; 6698 space_info->bytes_may_use += to_add; 6699 if (global_rsv->reserved >= global_rsv->size) 6700 global_rsv->full = 1; 6701 trace_btrfs_space_reservation(fs_info, 6702 "space_info", 6703 space_info->flags, 6704 to_add, 1); 6705 len -= to_add; 6706 } 6707 spin_unlock(&global_rsv->lock); 6708 /* Add to any tickets we may have */ 6709 if (len) 6710 space_info_add_new_bytes(fs_info, space_info, 6711 len); 6712 } 6713 spin_unlock(&space_info->lock); 6714 } 6715 6716 if (cache) 6717 btrfs_put_block_group(cache); 6718 return 0; 6719 } 6720 6721 int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, 6722 struct btrfs_root *root) 6723 { 6724 struct btrfs_fs_info *fs_info = root->fs_info; 6725 struct btrfs_block_group_cache *block_group, *tmp; 6726 struct list_head *deleted_bgs; 6727 struct extent_io_tree *unpin; 6728 u64 start; 6729 u64 end; 6730 int ret; 6731 6732 if (fs_info->pinned_extents == &fs_info->freed_extents[0]) 6733 unpin = &fs_info->freed_extents[1]; 6734 else 6735 unpin = &fs_info->freed_extents[0]; 6736 6737 while (!trans->aborted) { 6738 mutex_lock(&fs_info->unused_bg_unpin_mutex); 6739 ret = find_first_extent_bit(unpin, 0, &start, &end, 6740 EXTENT_DIRTY, NULL); 6741 if (ret) { 6742 mutex_unlock(&fs_info->unused_bg_unpin_mutex); 6743 break; 6744 } 6745 6746 if (btrfs_test_opt(root->fs_info, DISCARD)) 6747 ret = btrfs_discard_extent(root, start, 6748 end + 1 - start, NULL); 6749 6750 clear_extent_dirty(unpin, start, end); 6751 unpin_extent_range(root, start, end, true); 6752 mutex_unlock(&fs_info->unused_bg_unpin_mutex); 6753 cond_resched(); 6754 } 6755 6756 /* 6757 * Transaction is finished. We don't need the lock anymore. We 6758 * do need to clean up the block groups in case of a transaction 6759 * abort. 
6760 */ 6761 deleted_bgs = &trans->transaction->deleted_bgs; 6762 list_for_each_entry_safe(block_group, tmp, deleted_bgs, bg_list) { 6763 u64 trimmed = 0; 6764 6765 ret = -EROFS; 6766 if (!trans->aborted) 6767 ret = btrfs_discard_extent(root, 6768 block_group->key.objectid, 6769 block_group->key.offset, 6770 &trimmed); 6771 6772 list_del_init(&block_group->bg_list); 6773 btrfs_put_block_group_trimming(block_group); 6774 btrfs_put_block_group(block_group); 6775 6776 if (ret) { 6777 const char *errstr = btrfs_decode_error(ret); 6778 btrfs_warn(fs_info, 6779 "Discard failed while removing blockgroup: errno=%d %s\n", 6780 ret, errstr); 6781 } 6782 } 6783 6784 return 0; 6785 } 6786 6787 static void add_pinned_bytes(struct btrfs_fs_info *fs_info, u64 num_bytes, 6788 u64 owner, u64 root_objectid) 6789 { 6790 struct btrfs_space_info *space_info; 6791 u64 flags; 6792 6793 if (owner < BTRFS_FIRST_FREE_OBJECTID) { 6794 if (root_objectid == BTRFS_CHUNK_TREE_OBJECTID) 6795 flags = BTRFS_BLOCK_GROUP_SYSTEM; 6796 else 6797 flags = BTRFS_BLOCK_GROUP_METADATA; 6798 } else { 6799 flags = BTRFS_BLOCK_GROUP_DATA; 6800 } 6801 6802 space_info = __find_space_info(fs_info, flags); 6803 BUG_ON(!space_info); /* Logic bug */ 6804 percpu_counter_add(&space_info->total_bytes_pinned, num_bytes); 6805 } 6806 6807 6808 static int __btrfs_free_extent(struct btrfs_trans_handle *trans, 6809 struct btrfs_root *root, 6810 struct btrfs_delayed_ref_node *node, u64 parent, 6811 u64 root_objectid, u64 owner_objectid, 6812 u64 owner_offset, int refs_to_drop, 6813 struct btrfs_delayed_extent_op *extent_op) 6814 { 6815 struct btrfs_key key; 6816 struct btrfs_path *path; 6817 struct btrfs_fs_info *info = root->fs_info; 6818 struct btrfs_root *extent_root = info->extent_root; 6819 struct extent_buffer *leaf; 6820 struct btrfs_extent_item *ei; 6821 struct btrfs_extent_inline_ref *iref; 6822 int ret; 6823 int is_data; 6824 int extent_slot = 0; 6825 int found_extent = 0; 6826 int num_to_del = 1; 6827 u32 item_size; 6828 u64 refs; 6829 u64 bytenr = node->bytenr; 6830 u64 num_bytes = node->num_bytes; 6831 int last_ref = 0; 6832 bool skinny_metadata = btrfs_fs_incompat(root->fs_info, 6833 SKINNY_METADATA); 6834 6835 path = btrfs_alloc_path(); 6836 if (!path) 6837 return -ENOMEM; 6838 6839 path->reada = READA_FORWARD; 6840 path->leave_spinning = 1; 6841 6842 is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID; 6843 BUG_ON(!is_data && refs_to_drop != 1); 6844 6845 if (is_data) 6846 skinny_metadata = 0; 6847 6848 ret = lookup_extent_backref(trans, extent_root, path, &iref, 6849 bytenr, num_bytes, parent, 6850 root_objectid, owner_objectid, 6851 owner_offset); 6852 if (ret == 0) { 6853 extent_slot = path->slots[0]; 6854 while (extent_slot >= 0) { 6855 btrfs_item_key_to_cpu(path->nodes[0], &key, 6856 extent_slot); 6857 if (key.objectid != bytenr) 6858 break; 6859 if (key.type == BTRFS_EXTENT_ITEM_KEY && 6860 key.offset == num_bytes) { 6861 found_extent = 1; 6862 break; 6863 } 6864 if (key.type == BTRFS_METADATA_ITEM_KEY && 6865 key.offset == owner_objectid) { 6866 found_extent = 1; 6867 break; 6868 } 6869 if (path->slots[0] - extent_slot > 5) 6870 break; 6871 extent_slot--; 6872 } 6873 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 6874 item_size = btrfs_item_size_nr(path->nodes[0], extent_slot); 6875 if (found_extent && item_size < sizeof(*ei)) 6876 found_extent = 0; 6877 #endif 6878 if (!found_extent) { 6879 BUG_ON(iref); 6880 ret = remove_extent_backref(trans, extent_root, path, 6881 NULL, refs_to_drop, 6882 is_data, &last_ref); 6883 if (ret) { 6884 
btrfs_abort_transaction(trans, ret); 6885 goto out; 6886 } 6887 btrfs_release_path(path); 6888 path->leave_spinning = 1; 6889 6890 key.objectid = bytenr; 6891 key.type = BTRFS_EXTENT_ITEM_KEY; 6892 key.offset = num_bytes; 6893 6894 if (!is_data && skinny_metadata) { 6895 key.type = BTRFS_METADATA_ITEM_KEY; 6896 key.offset = owner_objectid; 6897 } 6898 6899 ret = btrfs_search_slot(trans, extent_root, 6900 &key, path, -1, 1); 6901 if (ret > 0 && skinny_metadata && path->slots[0]) { 6902 /* 6903 * Couldn't find our skinny metadata item, 6904 * see if we have ye olde extent item. 6905 */ 6906 path->slots[0]--; 6907 btrfs_item_key_to_cpu(path->nodes[0], &key, 6908 path->slots[0]); 6909 if (key.objectid == bytenr && 6910 key.type == BTRFS_EXTENT_ITEM_KEY && 6911 key.offset == num_bytes) 6912 ret = 0; 6913 } 6914 6915 if (ret > 0 && skinny_metadata) { 6916 skinny_metadata = false; 6917 key.objectid = bytenr; 6918 key.type = BTRFS_EXTENT_ITEM_KEY; 6919 key.offset = num_bytes; 6920 btrfs_release_path(path); 6921 ret = btrfs_search_slot(trans, extent_root, 6922 &key, path, -1, 1); 6923 } 6924 6925 if (ret) { 6926 btrfs_err(info, "umm, got %d back from search, was looking for %llu", 6927 ret, bytenr); 6928 if (ret > 0) 6929 btrfs_print_leaf(extent_root, 6930 path->nodes[0]); 6931 } 6932 if (ret < 0) { 6933 btrfs_abort_transaction(trans, ret); 6934 goto out; 6935 } 6936 extent_slot = path->slots[0]; 6937 } 6938 } else if (WARN_ON(ret == -ENOENT)) { 6939 btrfs_print_leaf(extent_root, path->nodes[0]); 6940 btrfs_err(info, 6941 "unable to find ref byte nr %llu parent %llu root %llu owner %llu offset %llu", 6942 bytenr, parent, root_objectid, owner_objectid, 6943 owner_offset); 6944 btrfs_abort_transaction(trans, ret); 6945 goto out; 6946 } else { 6947 btrfs_abort_transaction(trans, ret); 6948 goto out; 6949 } 6950 6951 leaf = path->nodes[0]; 6952 item_size = btrfs_item_size_nr(leaf, extent_slot); 6953 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 6954 if (item_size < sizeof(*ei)) { 6955 BUG_ON(found_extent || extent_slot != path->slots[0]); 6956 ret = convert_extent_item_v0(trans, extent_root, path, 6957 owner_objectid, 0); 6958 if (ret < 0) { 6959 btrfs_abort_transaction(trans, ret); 6960 goto out; 6961 } 6962 6963 btrfs_release_path(path); 6964 path->leave_spinning = 1; 6965 6966 key.objectid = bytenr; 6967 key.type = BTRFS_EXTENT_ITEM_KEY; 6968 key.offset = num_bytes; 6969 6970 ret = btrfs_search_slot(trans, extent_root, &key, path, 6971 -1, 1); 6972 if (ret) { 6973 btrfs_err(info, "umm, got %d back from search, was looking for %llu", 6974 ret, bytenr); 6975 btrfs_print_leaf(extent_root, path->nodes[0]); 6976 } 6977 if (ret < 0) { 6978 btrfs_abort_transaction(trans, ret); 6979 goto out; 6980 } 6981 6982 extent_slot = path->slots[0]; 6983 leaf = path->nodes[0]; 6984 item_size = btrfs_item_size_nr(leaf, extent_slot); 6985 } 6986 #endif 6987 BUG_ON(item_size < sizeof(*ei)); 6988 ei = btrfs_item_ptr(leaf, extent_slot, 6989 struct btrfs_extent_item); 6990 if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID && 6991 key.type == BTRFS_EXTENT_ITEM_KEY) { 6992 struct btrfs_tree_block_info *bi; 6993 BUG_ON(item_size < sizeof(*ei) + sizeof(*bi)); 6994 bi = (struct btrfs_tree_block_info *)(ei + 1); 6995 WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi)); 6996 } 6997 6998 refs = btrfs_extent_refs(leaf, ei); 6999 if (refs < refs_to_drop) { 7000 btrfs_err(info, "trying to drop %d refs but we only have %Lu " 7001 "for bytenr %Lu", refs_to_drop, refs, bytenr); 7002 ret = -EINVAL; 7003 btrfs_abort_transaction(trans, ret); 7004 
goto out; 7005 } 7006 refs -= refs_to_drop; 7007 7008 if (refs > 0) { 7009 if (extent_op) 7010 __run_delayed_extent_op(extent_op, leaf, ei); 7011 /* 7012 * In the case of inline back ref, reference count will 7013 * be updated by remove_extent_backref 7014 */ 7015 if (iref) { 7016 BUG_ON(!found_extent); 7017 } else { 7018 btrfs_set_extent_refs(leaf, ei, refs); 7019 btrfs_mark_buffer_dirty(leaf); 7020 } 7021 if (found_extent) { 7022 ret = remove_extent_backref(trans, extent_root, path, 7023 iref, refs_to_drop, 7024 is_data, &last_ref); 7025 if (ret) { 7026 btrfs_abort_transaction(trans, ret); 7027 goto out; 7028 } 7029 } 7030 add_pinned_bytes(root->fs_info, -num_bytes, owner_objectid, 7031 root_objectid); 7032 } else { 7033 if (found_extent) { 7034 BUG_ON(is_data && refs_to_drop != 7035 extent_data_ref_count(path, iref)); 7036 if (iref) { 7037 BUG_ON(path->slots[0] != extent_slot); 7038 } else { 7039 BUG_ON(path->slots[0] != extent_slot + 1); 7040 path->slots[0] = extent_slot; 7041 num_to_del = 2; 7042 } 7043 } 7044 7045 last_ref = 1; 7046 ret = btrfs_del_items(trans, extent_root, path, path->slots[0], 7047 num_to_del); 7048 if (ret) { 7049 btrfs_abort_transaction(trans, ret); 7050 goto out; 7051 } 7052 btrfs_release_path(path); 7053 7054 if (is_data) { 7055 ret = btrfs_del_csums(trans, root, bytenr, num_bytes); 7056 if (ret) { 7057 btrfs_abort_transaction(trans, ret); 7058 goto out; 7059 } 7060 } 7061 7062 ret = add_to_free_space_tree(trans, root->fs_info, bytenr, 7063 num_bytes); 7064 if (ret) { 7065 btrfs_abort_transaction(trans, ret); 7066 goto out; 7067 } 7068 7069 ret = update_block_group(trans, root, bytenr, num_bytes, 0); 7070 if (ret) { 7071 btrfs_abort_transaction(trans, ret); 7072 goto out; 7073 } 7074 } 7075 btrfs_release_path(path); 7076 7077 out: 7078 btrfs_free_path(path); 7079 return ret; 7080 } 7081 7082 /* 7083 * when we free an block, it is possible (and likely) that we free the last 7084 * delayed ref for that extent as well. This searches the delayed ref tree for 7085 * a given extent, and if there are no other delayed refs to be processed, it 7086 * removes it from the tree. 7087 */ 7088 static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, 7089 struct btrfs_root *root, u64 bytenr) 7090 { 7091 struct btrfs_delayed_ref_head *head; 7092 struct btrfs_delayed_ref_root *delayed_refs; 7093 int ret = 0; 7094 7095 delayed_refs = &trans->transaction->delayed_refs; 7096 spin_lock(&delayed_refs->lock); 7097 head = btrfs_find_delayed_ref_head(trans, bytenr); 7098 if (!head) 7099 goto out_delayed_unlock; 7100 7101 spin_lock(&head->lock); 7102 if (!list_empty(&head->ref_list)) 7103 goto out; 7104 7105 if (head->extent_op) { 7106 if (!head->must_insert_reserved) 7107 goto out; 7108 btrfs_free_delayed_extent_op(head->extent_op); 7109 head->extent_op = NULL; 7110 } 7111 7112 /* 7113 * waiting for the lock here would deadlock. If someone else has it 7114 * locked they are already in the process of dropping it anyway 7115 */ 7116 if (!mutex_trylock(&head->mutex)) 7117 goto out; 7118 7119 /* 7120 * at this point we have a head with no other entries. Go 7121 * ahead and process it. 7122 */ 7123 head->node.in_tree = 0; 7124 rb_erase(&head->href_node, &delayed_refs->href_root); 7125 7126 atomic_dec(&delayed_refs->num_entries); 7127 7128 /* 7129 * we don't take a ref on the node because we're removing it from the 7130 * tree, so we just steal the ref the tree was holding. 
7131 */ 7132 delayed_refs->num_heads--; 7133 if (head->processing == 0) 7134 delayed_refs->num_heads_ready--; 7135 head->processing = 0; 7136 spin_unlock(&head->lock); 7137 spin_unlock(&delayed_refs->lock); 7138 7139 BUG_ON(head->extent_op); 7140 if (head->must_insert_reserved) 7141 ret = 1; 7142 7143 mutex_unlock(&head->mutex); 7144 btrfs_put_delayed_ref(&head->node); 7145 return ret; 7146 out: 7147 spin_unlock(&head->lock); 7148 7149 out_delayed_unlock: 7150 spin_unlock(&delayed_refs->lock); 7151 return 0; 7152 } 7153 7154 void btrfs_free_tree_block(struct btrfs_trans_handle *trans, 7155 struct btrfs_root *root, 7156 struct extent_buffer *buf, 7157 u64 parent, int last_ref) 7158 { 7159 int pin = 1; 7160 int ret; 7161 7162 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { 7163 ret = btrfs_add_delayed_tree_ref(root->fs_info, trans, 7164 buf->start, buf->len, 7165 parent, root->root_key.objectid, 7166 btrfs_header_level(buf), 7167 BTRFS_DROP_DELAYED_REF, NULL); 7168 BUG_ON(ret); /* -ENOMEM */ 7169 } 7170 7171 if (!last_ref) 7172 return; 7173 7174 if (btrfs_header_generation(buf) == trans->transid) { 7175 struct btrfs_block_group_cache *cache; 7176 7177 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { 7178 ret = check_ref_cleanup(trans, root, buf->start); 7179 if (!ret) 7180 goto out; 7181 } 7182 7183 cache = btrfs_lookup_block_group(root->fs_info, buf->start); 7184 7185 if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { 7186 pin_down_extent(root, cache, buf->start, buf->len, 1); 7187 btrfs_put_block_group(cache); 7188 goto out; 7189 } 7190 7191 WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)); 7192 7193 btrfs_add_free_space(cache, buf->start, buf->len); 7194 btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE, 0); 7195 btrfs_put_block_group(cache); 7196 trace_btrfs_reserved_extent_free(root, buf->start, buf->len); 7197 pin = 0; 7198 } 7199 out: 7200 if (pin) 7201 add_pinned_bytes(root->fs_info, buf->len, 7202 btrfs_header_level(buf), 7203 root->root_key.objectid); 7204 7205 /* 7206 * Deleting the buffer, clear the corrupt flag since it doesn't matter 7207 * anymore. 7208 */ 7209 clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags); 7210 } 7211 7212 /* Can return -ENOMEM */ 7213 int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, 7214 u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, 7215 u64 owner, u64 offset) 7216 { 7217 int ret; 7218 struct btrfs_fs_info *fs_info = root->fs_info; 7219 7220 if (btrfs_is_testing(fs_info)) 7221 return 0; 7222 7223 add_pinned_bytes(root->fs_info, num_bytes, owner, root_objectid); 7224 7225 /* 7226 * tree log blocks never actually go into the extent allocation 7227 * tree, just update pinning info and exit early. 7228 */ 7229 if (root_objectid == BTRFS_TREE_LOG_OBJECTID) { 7230 WARN_ON(owner >= BTRFS_FIRST_FREE_OBJECTID); 7231 /* unlocks the pinned mutex */ 7232 btrfs_pin_extent(root, bytenr, num_bytes, 1); 7233 ret = 0; 7234 } else if (owner < BTRFS_FIRST_FREE_OBJECTID) { 7235 ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr, 7236 num_bytes, 7237 parent, root_objectid, (int)owner, 7238 BTRFS_DROP_DELAYED_REF, NULL); 7239 } else { 7240 ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr, 7241 num_bytes, 7242 parent, root_objectid, owner, 7243 offset, 0, 7244 BTRFS_DROP_DELAYED_REF, NULL); 7245 } 7246 return ret; 7247 } 7248 7249 /* 7250 * when we wait for progress in the block group caching, its because 7251 * our allocation attempt failed at least once. 
So, we must sleep 7252 * and let some progress happen before we try again. 7253 * 7254 * This function will sleep at least once waiting for new free space to 7255 * show up, and then it will check the block group free space numbers 7256 * for our min num_bytes. Another option is to have it go ahead 7257 * and look in the rbtree for a free extent of a given size, but this 7258 * is a good start. 7259 * 7260 * Callers of this must check if cache->cached == BTRFS_CACHE_ERROR before using 7261 * any of the information in this block group. 7262 */ 7263 static noinline void 7264 wait_block_group_cache_progress(struct btrfs_block_group_cache *cache, 7265 u64 num_bytes) 7266 { 7267 struct btrfs_caching_control *caching_ctl; 7268 7269 caching_ctl = get_caching_control(cache); 7270 if (!caching_ctl) 7271 return; 7272 7273 wait_event(caching_ctl->wait, block_group_cache_done(cache) || 7274 (cache->free_space_ctl->free_space >= num_bytes)); 7275 7276 put_caching_control(caching_ctl); 7277 } 7278 7279 static noinline int 7280 wait_block_group_cache_done(struct btrfs_block_group_cache *cache) 7281 { 7282 struct btrfs_caching_control *caching_ctl; 7283 int ret = 0; 7284 7285 caching_ctl = get_caching_control(cache); 7286 if (!caching_ctl) 7287 return (cache->cached == BTRFS_CACHE_ERROR) ? -EIO : 0; 7288 7289 wait_event(caching_ctl->wait, block_group_cache_done(cache)); 7290 if (cache->cached == BTRFS_CACHE_ERROR) 7291 ret = -EIO; 7292 put_caching_control(caching_ctl); 7293 return ret; 7294 } 7295 7296 int __get_raid_index(u64 flags) 7297 { 7298 if (flags & BTRFS_BLOCK_GROUP_RAID10) 7299 return BTRFS_RAID_RAID10; 7300 else if (flags & BTRFS_BLOCK_GROUP_RAID1) 7301 return BTRFS_RAID_RAID1; 7302 else if (flags & BTRFS_BLOCK_GROUP_DUP) 7303 return BTRFS_RAID_DUP; 7304 else if (flags & BTRFS_BLOCK_GROUP_RAID0) 7305 return BTRFS_RAID_RAID0; 7306 else if (flags & BTRFS_BLOCK_GROUP_RAID5) 7307 return BTRFS_RAID_RAID5; 7308 else if (flags & BTRFS_BLOCK_GROUP_RAID6) 7309 return BTRFS_RAID_RAID6; 7310 7311 return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */ 7312 } 7313 7314 int get_block_group_index(struct btrfs_block_group_cache *cache) 7315 { 7316 return __get_raid_index(cache->flags); 7317 } 7318 7319 static const char *btrfs_raid_type_names[BTRFS_NR_RAID_TYPES] = { 7320 [BTRFS_RAID_RAID10] = "raid10", 7321 [BTRFS_RAID_RAID1] = "raid1", 7322 [BTRFS_RAID_DUP] = "dup", 7323 [BTRFS_RAID_RAID0] = "raid0", 7324 [BTRFS_RAID_SINGLE] = "single", 7325 [BTRFS_RAID_RAID5] = "raid5", 7326 [BTRFS_RAID_RAID6] = "raid6", 7327 }; 7328 7329 static const char *get_raid_name(enum btrfs_raid_types type) 7330 { 7331 if (type >= BTRFS_NR_RAID_TYPES) 7332 return NULL; 7333 7334 return btrfs_raid_type_names[type]; 7335 } 7336 7337 enum btrfs_loop_type { 7338 LOOP_CACHING_NOWAIT = 0, 7339 LOOP_CACHING_WAIT = 1, 7340 LOOP_ALLOC_CHUNK = 2, 7341 LOOP_NO_EMPTY_SIZE = 3, 7342 }; 7343 7344 static inline void 7345 btrfs_lock_block_group(struct btrfs_block_group_cache *cache, 7346 int delalloc) 7347 { 7348 if (delalloc) 7349 down_read(&cache->data_rwsem); 7350 } 7351 7352 static inline void 7353 btrfs_grab_block_group(struct btrfs_block_group_cache *cache, 7354 int delalloc) 7355 { 7356 btrfs_get_block_group(cache); 7357 if (delalloc) 7358 down_read(&cache->data_rwsem); 7359 } 7360 7361 static struct btrfs_block_group_cache * 7362 btrfs_lock_cluster(struct btrfs_block_group_cache *block_group, 7363 struct btrfs_free_cluster *cluster, 7364 int delalloc) 7365 { 7366 struct btrfs_block_group_cache *used_bg = NULL; 7367 7368 
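	/*
	 * Note (added for clarity): we take the cluster's refill_lock below and
	 * return with it held.  For a delalloc allocation against a different
	 * block group we also need that group's data_rwsem, and the blocking
	 * down_read() cannot be done under a spinlock, so the loop drops and
	 * retakes refill_lock and then revalidates cluster->block_group.
	 */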
spin_lock(&cluster->refill_lock); 7369 while (1) { 7370 used_bg = cluster->block_group; 7371 if (!used_bg) 7372 return NULL; 7373 7374 if (used_bg == block_group) 7375 return used_bg; 7376 7377 btrfs_get_block_group(used_bg); 7378 7379 if (!delalloc) 7380 return used_bg; 7381 7382 if (down_read_trylock(&used_bg->data_rwsem)) 7383 return used_bg; 7384 7385 spin_unlock(&cluster->refill_lock); 7386 7387 down_read(&used_bg->data_rwsem); 7388 7389 spin_lock(&cluster->refill_lock); 7390 if (used_bg == cluster->block_group) 7391 return used_bg; 7392 7393 up_read(&used_bg->data_rwsem); 7394 btrfs_put_block_group(used_bg); 7395 } 7396 } 7397 7398 static inline void 7399 btrfs_release_block_group(struct btrfs_block_group_cache *cache, 7400 int delalloc) 7401 { 7402 if (delalloc) 7403 up_read(&cache->data_rwsem); 7404 btrfs_put_block_group(cache); 7405 } 7406 7407 /* 7408 * walks the btree of allocated extents and find a hole of a given size. 7409 * The key ins is changed to record the hole: 7410 * ins->objectid == start position 7411 * ins->flags = BTRFS_EXTENT_ITEM_KEY 7412 * ins->offset == the size of the hole. 7413 * Any available blocks before search_start are skipped. 7414 * 7415 * If there is no suitable free space, we will record the max size of 7416 * the free space extent currently. 7417 */ 7418 static noinline int find_free_extent(struct btrfs_root *orig_root, 7419 u64 num_bytes, u64 empty_size, 7420 u64 hint_byte, struct btrfs_key *ins, 7421 u64 flags, int delalloc) 7422 { 7423 int ret = 0; 7424 struct btrfs_root *root = orig_root->fs_info->extent_root; 7425 struct btrfs_free_cluster *last_ptr = NULL; 7426 struct btrfs_block_group_cache *block_group = NULL; 7427 u64 search_start = 0; 7428 u64 max_extent_size = 0; 7429 u64 empty_cluster = 0; 7430 struct btrfs_space_info *space_info; 7431 int loop = 0; 7432 int index = __get_raid_index(flags); 7433 int alloc_type = (flags & BTRFS_BLOCK_GROUP_DATA) ? 7434 RESERVE_ALLOC_NO_ACCOUNT : RESERVE_ALLOC; 7435 bool failed_cluster_refill = false; 7436 bool failed_alloc = false; 7437 bool use_cluster = true; 7438 bool have_caching_bg = false; 7439 bool orig_have_caching_bg = false; 7440 bool full_search = false; 7441 7442 WARN_ON(num_bytes < root->sectorsize); 7443 ins->type = BTRFS_EXTENT_ITEM_KEY; 7444 ins->objectid = 0; 7445 ins->offset = 0; 7446 7447 trace_find_free_extent(orig_root, num_bytes, empty_size, flags); 7448 7449 space_info = __find_space_info(root->fs_info, flags); 7450 if (!space_info) { 7451 btrfs_err(root->fs_info, "No space info for %llu", flags); 7452 return -ENOSPC; 7453 } 7454 7455 /* 7456 * If our free space is heavily fragmented we may not be able to make 7457 * big contiguous allocations, so instead of doing the expensive search 7458 * for free space, simply return ENOSPC with our max_extent_size so we 7459 * can go ahead and search for a more manageable chunk. 7460 * 7461 * If our max_extent_size is large enough for our allocation simply 7462 * disable clustering since we will likely not be able to find enough 7463 * space to create a cluster and induce latency trying. 
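 * (max_extent_size is cleared again in unpin_extent_range() once pinned
 * space is handed back to the space_info.)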
7464 */ 7465 if (unlikely(space_info->max_extent_size)) { 7466 spin_lock(&space_info->lock); 7467 if (space_info->max_extent_size && 7468 num_bytes > space_info->max_extent_size) { 7469 ins->offset = space_info->max_extent_size; 7470 spin_unlock(&space_info->lock); 7471 return -ENOSPC; 7472 } else if (space_info->max_extent_size) { 7473 use_cluster = false; 7474 } 7475 spin_unlock(&space_info->lock); 7476 } 7477 7478 last_ptr = fetch_cluster_info(orig_root, space_info, &empty_cluster); 7479 if (last_ptr) { 7480 spin_lock(&last_ptr->lock); 7481 if (last_ptr->block_group) 7482 hint_byte = last_ptr->window_start; 7483 if (last_ptr->fragmented) { 7484 /* 7485 * We still set window_start so we can keep track of the 7486 * last place we found an allocation to try and save 7487 * some time. 7488 */ 7489 hint_byte = last_ptr->window_start; 7490 use_cluster = false; 7491 } 7492 spin_unlock(&last_ptr->lock); 7493 } 7494 7495 search_start = max(search_start, first_logical_byte(root, 0)); 7496 search_start = max(search_start, hint_byte); 7497 if (search_start == hint_byte) { 7498 block_group = btrfs_lookup_block_group(root->fs_info, 7499 search_start); 7500 /* 7501 * we don't want to use the block group if it doesn't match our 7502 * allocation bits, or if its not cached. 7503 * 7504 * However if we are re-searching with an ideal block group 7505 * picked out then we don't care that the block group is cached. 7506 */ 7507 if (block_group && block_group_bits(block_group, flags) && 7508 block_group->cached != BTRFS_CACHE_NO) { 7509 down_read(&space_info->groups_sem); 7510 if (list_empty(&block_group->list) || 7511 block_group->ro) { 7512 /* 7513 * someone is removing this block group, 7514 * we can't jump into the have_block_group 7515 * target because our list pointers are not 7516 * valid 7517 */ 7518 btrfs_put_block_group(block_group); 7519 up_read(&space_info->groups_sem); 7520 } else { 7521 index = get_block_group_index(block_group); 7522 btrfs_lock_block_group(block_group, delalloc); 7523 goto have_block_group; 7524 } 7525 } else if (block_group) { 7526 btrfs_put_block_group(block_group); 7527 } 7528 } 7529 search: 7530 have_caching_bg = false; 7531 if (index == 0 || index == __get_raid_index(flags)) 7532 full_search = true; 7533 down_read(&space_info->groups_sem); 7534 list_for_each_entry(block_group, &space_info->block_groups[index], 7535 list) { 7536 u64 offset; 7537 int cached; 7538 7539 btrfs_grab_block_group(block_group, delalloc); 7540 search_start = block_group->key.objectid; 7541 7542 /* 7543 * this can happen if we end up cycling through all the 7544 * raid types, but we want to make sure we only allocate 7545 * for the proper type. 7546 */ 7547 if (!block_group_bits(block_group, flags)) { 7548 u64 extra = BTRFS_BLOCK_GROUP_DUP | 7549 BTRFS_BLOCK_GROUP_RAID1 | 7550 BTRFS_BLOCK_GROUP_RAID5 | 7551 BTRFS_BLOCK_GROUP_RAID6 | 7552 BTRFS_BLOCK_GROUP_RAID10; 7553 7554 /* 7555 * if they asked for extra copies and this block group 7556 * doesn't provide them, bail. This does allow us to 7557 * fill raid0 from raid1. 
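 * For example (illustrative): a RAID1 metadata allocation skips a single or
 * raid0 block group here, while a single/raid0 allocation may still be
 * satisfied from a raid1 block group.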
7558 */ 7559 if ((flags & extra) && !(block_group->flags & extra)) 7560 goto loop; 7561 } 7562 7563 have_block_group: 7564 cached = block_group_cache_done(block_group); 7565 if (unlikely(!cached)) { 7566 have_caching_bg = true; 7567 ret = cache_block_group(block_group, 0); 7568 BUG_ON(ret < 0); 7569 ret = 0; 7570 } 7571 7572 if (unlikely(block_group->cached == BTRFS_CACHE_ERROR)) 7573 goto loop; 7574 if (unlikely(block_group->ro)) 7575 goto loop; 7576 7577 /* 7578 * Ok we want to try and use the cluster allocator, so 7579 * lets look there 7580 */ 7581 if (last_ptr && use_cluster) { 7582 struct btrfs_block_group_cache *used_block_group; 7583 unsigned long aligned_cluster; 7584 /* 7585 * the refill lock keeps out other 7586 * people trying to start a new cluster 7587 */ 7588 used_block_group = btrfs_lock_cluster(block_group, 7589 last_ptr, 7590 delalloc); 7591 if (!used_block_group) 7592 goto refill_cluster; 7593 7594 if (used_block_group != block_group && 7595 (used_block_group->ro || 7596 !block_group_bits(used_block_group, flags))) 7597 goto release_cluster; 7598 7599 offset = btrfs_alloc_from_cluster(used_block_group, 7600 last_ptr, 7601 num_bytes, 7602 used_block_group->key.objectid, 7603 &max_extent_size); 7604 if (offset) { 7605 /* we have a block, we're done */ 7606 spin_unlock(&last_ptr->refill_lock); 7607 trace_btrfs_reserve_extent_cluster(root, 7608 used_block_group, 7609 search_start, num_bytes); 7610 if (used_block_group != block_group) { 7611 btrfs_release_block_group(block_group, 7612 delalloc); 7613 block_group = used_block_group; 7614 } 7615 goto checks; 7616 } 7617 7618 WARN_ON(last_ptr->block_group != used_block_group); 7619 release_cluster: 7620 /* If we are on LOOP_NO_EMPTY_SIZE, we can't 7621 * set up a new clusters, so lets just skip it 7622 * and let the allocator find whatever block 7623 * it can find. If we reach this point, we 7624 * will have tried the cluster allocator 7625 * plenty of times and not have found 7626 * anything, so we are likely way too 7627 * fragmented for the clustering stuff to find 7628 * anything. 7629 * 7630 * However, if the cluster is taken from the 7631 * current block group, release the cluster 7632 * first, so that we stand a better chance of 7633 * succeeding in the unclustered 7634 * allocation. 
*/ 7635 if (loop >= LOOP_NO_EMPTY_SIZE && 7636 used_block_group != block_group) { 7637 spin_unlock(&last_ptr->refill_lock); 7638 btrfs_release_block_group(used_block_group, 7639 delalloc); 7640 goto unclustered_alloc; 7641 } 7642 7643 /* 7644 * this cluster didn't work out, free it and 7645 * start over 7646 */ 7647 btrfs_return_cluster_to_free_space(NULL, last_ptr); 7648 7649 if (used_block_group != block_group) 7650 btrfs_release_block_group(used_block_group, 7651 delalloc); 7652 refill_cluster: 7653 if (loop >= LOOP_NO_EMPTY_SIZE) { 7654 spin_unlock(&last_ptr->refill_lock); 7655 goto unclustered_alloc; 7656 } 7657 7658 aligned_cluster = max_t(unsigned long, 7659 empty_cluster + empty_size, 7660 block_group->full_stripe_len); 7661 7662 /* allocate a cluster in this block group */ 7663 ret = btrfs_find_space_cluster(root, block_group, 7664 last_ptr, search_start, 7665 num_bytes, 7666 aligned_cluster); 7667 if (ret == 0) { 7668 /* 7669 * now pull our allocation out of this 7670 * cluster 7671 */ 7672 offset = btrfs_alloc_from_cluster(block_group, 7673 last_ptr, 7674 num_bytes, 7675 search_start, 7676 &max_extent_size); 7677 if (offset) { 7678 /* we found one, proceed */ 7679 spin_unlock(&last_ptr->refill_lock); 7680 trace_btrfs_reserve_extent_cluster(root, 7681 block_group, search_start, 7682 num_bytes); 7683 goto checks; 7684 } 7685 } else if (!cached && loop > LOOP_CACHING_NOWAIT 7686 && !failed_cluster_refill) { 7687 spin_unlock(&last_ptr->refill_lock); 7688 7689 failed_cluster_refill = true; 7690 wait_block_group_cache_progress(block_group, 7691 num_bytes + empty_cluster + empty_size); 7692 goto have_block_group; 7693 } 7694 7695 /* 7696 * at this point we either didn't find a cluster 7697 * or we weren't able to allocate a block from our 7698 * cluster. Free the cluster we've been trying 7699 * to use, and go to the next block group 7700 */ 7701 btrfs_return_cluster_to_free_space(NULL, last_ptr); 7702 spin_unlock(&last_ptr->refill_lock); 7703 goto loop; 7704 } 7705 7706 unclustered_alloc: 7707 /* 7708 * We are doing an unclustered alloc, set the fragmented flag so 7709 * we don't bother trying to setup a cluster again until we get 7710 * more space. 7711 */ 7712 if (unlikely(last_ptr)) { 7713 spin_lock(&last_ptr->lock); 7714 last_ptr->fragmented = 1; 7715 spin_unlock(&last_ptr->lock); 7716 } 7717 spin_lock(&block_group->free_space_ctl->tree_lock); 7718 if (cached && 7719 block_group->free_space_ctl->free_space < 7720 num_bytes + empty_cluster + empty_size) { 7721 if (block_group->free_space_ctl->free_space > 7722 max_extent_size) 7723 max_extent_size = 7724 block_group->free_space_ctl->free_space; 7725 spin_unlock(&block_group->free_space_ctl->tree_lock); 7726 goto loop; 7727 } 7728 spin_unlock(&block_group->free_space_ctl->tree_lock); 7729 7730 offset = btrfs_find_space_for_alloc(block_group, search_start, 7731 num_bytes, empty_size, 7732 &max_extent_size); 7733 /* 7734 * If we didn't find a chunk, and we haven't failed on this 7735 * block group before, and this block group is in the middle of 7736 * caching and we are ok with waiting, then go ahead and wait 7737 * for progress to be made, and set failed_alloc to true. 7738 * 7739 * If failed_alloc is true then we've already waited on this 7740 * block group once and should move on to the next block group. 
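 * (wait_block_group_cache_progress() below sleeps until the caching
 * kthread has added roughly num_bytes + empty_size of free space or has
 * finished; failed_alloc makes sure we only ever wait here once per
 * block group.)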
7741 */ 7742 if (!offset && !failed_alloc && !cached && 7743 loop > LOOP_CACHING_NOWAIT) { 7744 wait_block_group_cache_progress(block_group, 7745 num_bytes + empty_size); 7746 failed_alloc = true; 7747 goto have_block_group; 7748 } else if (!offset) { 7749 goto loop; 7750 } 7751 checks: 7752 search_start = ALIGN(offset, root->stripesize); 7753 7754 /* move on to the next group */ 7755 if (search_start + num_bytes > 7756 block_group->key.objectid + block_group->key.offset) { 7757 btrfs_add_free_space(block_group, offset, num_bytes); 7758 goto loop; 7759 } 7760 7761 if (offset < search_start) 7762 btrfs_add_free_space(block_group, offset, 7763 search_start - offset); 7764 BUG_ON(offset > search_start); 7765 7766 ret = btrfs_update_reserved_bytes(block_group, num_bytes, 7767 alloc_type, delalloc); 7768 if (ret == -EAGAIN) { 7769 btrfs_add_free_space(block_group, offset, num_bytes); 7770 goto loop; 7771 } 7772 btrfs_inc_block_group_reservations(block_group); 7773 7774 /* we are all good, lets return */ 7775 ins->objectid = search_start; 7776 ins->offset = num_bytes; 7777 7778 trace_btrfs_reserve_extent(orig_root, block_group, 7779 search_start, num_bytes); 7780 btrfs_release_block_group(block_group, delalloc); 7781 break; 7782 loop: 7783 failed_cluster_refill = false; 7784 failed_alloc = false; 7785 BUG_ON(index != get_block_group_index(block_group)); 7786 btrfs_release_block_group(block_group, delalloc); 7787 } 7788 up_read(&space_info->groups_sem); 7789 7790 if ((loop == LOOP_CACHING_NOWAIT) && have_caching_bg 7791 && !orig_have_caching_bg) 7792 orig_have_caching_bg = true; 7793 7794 if (!ins->objectid && loop >= LOOP_CACHING_WAIT && have_caching_bg) 7795 goto search; 7796 7797 if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES) 7798 goto search; 7799 7800 /* 7801 * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking 7802 * caching kthreads as we move along 7803 * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching 7804 * LOOP_ALLOC_CHUNK, force a chunk allocation and try again 7805 * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try 7806 * again 7807 */ 7808 if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE) { 7809 index = 0; 7810 if (loop == LOOP_CACHING_NOWAIT) { 7811 /* 7812 * We want to skip the LOOP_CACHING_WAIT step if we 7813 * don't have any uncached bgs and we've already done a 7814 * full search through. 7815 */ 7816 if (orig_have_caching_bg || !full_search) 7817 loop = LOOP_CACHING_WAIT; 7818 else 7819 loop = LOOP_ALLOC_CHUNK; 7820 } else { 7821 loop++; 7822 } 7823 7824 if (loop == LOOP_ALLOC_CHUNK) { 7825 struct btrfs_trans_handle *trans; 7826 int exist = 0; 7827 7828 trans = current->journal_info; 7829 if (trans) 7830 exist = 1; 7831 else 7832 trans = btrfs_join_transaction(root); 7833 7834 if (IS_ERR(trans)) { 7835 ret = PTR_ERR(trans); 7836 goto out; 7837 } 7838 7839 ret = do_chunk_alloc(trans, root, flags, 7840 CHUNK_ALLOC_FORCE); 7841 7842 /* 7843 * If we can't allocate a new chunk we've already looped 7844 * through at least once, move on to the NO_EMPTY_SIZE 7845 * case. 7846 */ 7847 if (ret == -ENOSPC) 7848 loop = LOOP_NO_EMPTY_SIZE; 7849 7850 /* 7851 * Do not bail out on ENOSPC since we 7852 * can do more things. 
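 * -ENOSPC from do_chunk_alloc() just bumps the loop to LOOP_NO_EMPTY_SIZE
 * above; any other error aborts the transaction and ends the search.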
7853 */ 7854 if (ret < 0 && ret != -ENOSPC) 7855 btrfs_abort_transaction(trans, ret); 7856 else 7857 ret = 0; 7858 if (!exist) 7859 btrfs_end_transaction(trans, root); 7860 if (ret) 7861 goto out; 7862 } 7863 7864 if (loop == LOOP_NO_EMPTY_SIZE) { 7865 /* 7866 * Don't loop again if we already have no empty_size and 7867 * no empty_cluster. 7868 */ 7869 if (empty_size == 0 && 7870 empty_cluster == 0) { 7871 ret = -ENOSPC; 7872 goto out; 7873 } 7874 empty_size = 0; 7875 empty_cluster = 0; 7876 } 7877 7878 goto search; 7879 } else if (!ins->objectid) { 7880 ret = -ENOSPC; 7881 } else if (ins->objectid) { 7882 if (!use_cluster && last_ptr) { 7883 spin_lock(&last_ptr->lock); 7884 last_ptr->window_start = ins->objectid; 7885 spin_unlock(&last_ptr->lock); 7886 } 7887 ret = 0; 7888 } 7889 out: 7890 if (ret == -ENOSPC) { 7891 spin_lock(&space_info->lock); 7892 space_info->max_extent_size = max_extent_size; 7893 spin_unlock(&space_info->lock); 7894 ins->offset = max_extent_size; 7895 } 7896 return ret; 7897 } 7898 7899 static void dump_space_info(struct btrfs_space_info *info, u64 bytes, 7900 int dump_block_groups) 7901 { 7902 struct btrfs_block_group_cache *cache; 7903 int index = 0; 7904 7905 spin_lock(&info->lock); 7906 printk(KERN_INFO "BTRFS: space_info %llu has %llu free, is %sfull\n", 7907 info->flags, 7908 info->total_bytes - info->bytes_used - info->bytes_pinned - 7909 info->bytes_reserved - info->bytes_readonly - 7910 info->bytes_may_use, (info->full) ? "" : "not "); 7911 printk(KERN_INFO "BTRFS: space_info total=%llu, used=%llu, pinned=%llu, " 7912 "reserved=%llu, may_use=%llu, readonly=%llu\n", 7913 info->total_bytes, info->bytes_used, info->bytes_pinned, 7914 info->bytes_reserved, info->bytes_may_use, 7915 info->bytes_readonly); 7916 spin_unlock(&info->lock); 7917 7918 if (!dump_block_groups) 7919 return; 7920 7921 down_read(&info->groups_sem); 7922 again: 7923 list_for_each_entry(cache, &info->block_groups[index], list) { 7924 spin_lock(&cache->lock); 7925 printk(KERN_INFO "BTRFS: " 7926 "block group %llu has %llu bytes, " 7927 "%llu used %llu pinned %llu reserved %s\n", 7928 cache->key.objectid, cache->key.offset, 7929 btrfs_block_group_used(&cache->item), cache->pinned, 7930 cache->reserved, cache->ro ? 
"[readonly]" : ""); 7931 btrfs_dump_free_space(cache, bytes); 7932 spin_unlock(&cache->lock); 7933 } 7934 if (++index < BTRFS_NR_RAID_TYPES) 7935 goto again; 7936 up_read(&info->groups_sem); 7937 } 7938 7939 int btrfs_reserve_extent(struct btrfs_root *root, 7940 u64 num_bytes, u64 min_alloc_size, 7941 u64 empty_size, u64 hint_byte, 7942 struct btrfs_key *ins, int is_data, int delalloc) 7943 { 7944 bool final_tried = num_bytes == min_alloc_size; 7945 u64 flags; 7946 int ret; 7947 7948 flags = btrfs_get_alloc_profile(root, is_data); 7949 again: 7950 WARN_ON(num_bytes < root->sectorsize); 7951 ret = find_free_extent(root, num_bytes, empty_size, hint_byte, ins, 7952 flags, delalloc); 7953 if (!ret && !is_data) { 7954 btrfs_dec_block_group_reservations(root->fs_info, 7955 ins->objectid); 7956 } else if (ret == -ENOSPC) { 7957 if (!final_tried && ins->offset) { 7958 num_bytes = min(num_bytes >> 1, ins->offset); 7959 num_bytes = round_down(num_bytes, root->sectorsize); 7960 num_bytes = max(num_bytes, min_alloc_size); 7961 if (num_bytes == min_alloc_size) 7962 final_tried = true; 7963 goto again; 7964 } else if (btrfs_test_opt(root->fs_info, ENOSPC_DEBUG)) { 7965 struct btrfs_space_info *sinfo; 7966 7967 sinfo = __find_space_info(root->fs_info, flags); 7968 btrfs_err(root->fs_info, "allocation failed flags %llu, wanted %llu", 7969 flags, num_bytes); 7970 if (sinfo) 7971 dump_space_info(sinfo, num_bytes, 1); 7972 } 7973 } 7974 7975 return ret; 7976 } 7977 7978 static int __btrfs_free_reserved_extent(struct btrfs_root *root, 7979 u64 start, u64 len, 7980 int pin, int delalloc) 7981 { 7982 struct btrfs_block_group_cache *cache; 7983 int ret = 0; 7984 7985 cache = btrfs_lookup_block_group(root->fs_info, start); 7986 if (!cache) { 7987 btrfs_err(root->fs_info, "Unable to find block group for %llu", 7988 start); 7989 return -ENOSPC; 7990 } 7991 7992 if (pin) 7993 pin_down_extent(root, cache, start, len, 1); 7994 else { 7995 if (btrfs_test_opt(root->fs_info, DISCARD)) 7996 ret = btrfs_discard_extent(root, start, len, NULL); 7997 btrfs_add_free_space(cache, start, len); 7998 btrfs_update_reserved_bytes(cache, len, RESERVE_FREE, delalloc); 7999 trace_btrfs_reserved_extent_free(root, start, len); 8000 } 8001 8002 btrfs_put_block_group(cache); 8003 return ret; 8004 } 8005 8006 int btrfs_free_reserved_extent(struct btrfs_root *root, 8007 u64 start, u64 len, int delalloc) 8008 { 8009 return __btrfs_free_reserved_extent(root, start, len, 0, delalloc); 8010 } 8011 8012 int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root, 8013 u64 start, u64 len) 8014 { 8015 return __btrfs_free_reserved_extent(root, start, len, 1, 0); 8016 } 8017 8018 static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, 8019 struct btrfs_root *root, 8020 u64 parent, u64 root_objectid, 8021 u64 flags, u64 owner, u64 offset, 8022 struct btrfs_key *ins, int ref_mod) 8023 { 8024 int ret; 8025 struct btrfs_fs_info *fs_info = root->fs_info; 8026 struct btrfs_extent_item *extent_item; 8027 struct btrfs_extent_inline_ref *iref; 8028 struct btrfs_path *path; 8029 struct extent_buffer *leaf; 8030 int type; 8031 u32 size; 8032 8033 if (parent > 0) 8034 type = BTRFS_SHARED_DATA_REF_KEY; 8035 else 8036 type = BTRFS_EXTENT_DATA_REF_KEY; 8037 8038 size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type); 8039 8040 path = btrfs_alloc_path(); 8041 if (!path) 8042 return -ENOMEM; 8043 8044 path->leave_spinning = 1; 8045 ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path, 8046 ins, size); 8047 if (ret) { 8048 
btrfs_free_path(path); 8049 return ret; 8050 } 8051 8052 leaf = path->nodes[0]; 8053 extent_item = btrfs_item_ptr(leaf, path->slots[0], 8054 struct btrfs_extent_item); 8055 btrfs_set_extent_refs(leaf, extent_item, ref_mod); 8056 btrfs_set_extent_generation(leaf, extent_item, trans->transid); 8057 btrfs_set_extent_flags(leaf, extent_item, 8058 flags | BTRFS_EXTENT_FLAG_DATA); 8059 8060 iref = (struct btrfs_extent_inline_ref *)(extent_item + 1); 8061 btrfs_set_extent_inline_ref_type(leaf, iref, type); 8062 if (parent > 0) { 8063 struct btrfs_shared_data_ref *ref; 8064 ref = (struct btrfs_shared_data_ref *)(iref + 1); 8065 btrfs_set_extent_inline_ref_offset(leaf, iref, parent); 8066 btrfs_set_shared_data_ref_count(leaf, ref, ref_mod); 8067 } else { 8068 struct btrfs_extent_data_ref *ref; 8069 ref = (struct btrfs_extent_data_ref *)(&iref->offset); 8070 btrfs_set_extent_data_ref_root(leaf, ref, root_objectid); 8071 btrfs_set_extent_data_ref_objectid(leaf, ref, owner); 8072 btrfs_set_extent_data_ref_offset(leaf, ref, offset); 8073 btrfs_set_extent_data_ref_count(leaf, ref, ref_mod); 8074 } 8075 8076 btrfs_mark_buffer_dirty(path->nodes[0]); 8077 btrfs_free_path(path); 8078 8079 ret = remove_from_free_space_tree(trans, fs_info, ins->objectid, 8080 ins->offset); 8081 if (ret) 8082 return ret; 8083 8084 ret = update_block_group(trans, root, ins->objectid, ins->offset, 1); 8085 if (ret) { /* -ENOENT, logic error */ 8086 btrfs_err(fs_info, "update block group failed for %llu %llu", 8087 ins->objectid, ins->offset); 8088 BUG(); 8089 } 8090 trace_btrfs_reserved_extent_alloc(root, ins->objectid, ins->offset); 8091 return ret; 8092 } 8093 8094 static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, 8095 struct btrfs_root *root, 8096 u64 parent, u64 root_objectid, 8097 u64 flags, struct btrfs_disk_key *key, 8098 int level, struct btrfs_key *ins) 8099 { 8100 int ret; 8101 struct btrfs_fs_info *fs_info = root->fs_info; 8102 struct btrfs_extent_item *extent_item; 8103 struct btrfs_tree_block_info *block_info; 8104 struct btrfs_extent_inline_ref *iref; 8105 struct btrfs_path *path; 8106 struct extent_buffer *leaf; 8107 u32 size = sizeof(*extent_item) + sizeof(*iref); 8108 u64 num_bytes = ins->offset; 8109 bool skinny_metadata = btrfs_fs_incompat(root->fs_info, 8110 SKINNY_METADATA); 8111 8112 if (!skinny_metadata) 8113 size += sizeof(*block_info); 8114 8115 path = btrfs_alloc_path(); 8116 if (!path) { 8117 btrfs_free_and_pin_reserved_extent(root, ins->objectid, 8118 root->nodesize); 8119 return -ENOMEM; 8120 } 8121 8122 path->leave_spinning = 1; 8123 ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path, 8124 ins, size); 8125 if (ret) { 8126 btrfs_free_path(path); 8127 btrfs_free_and_pin_reserved_extent(root, ins->objectid, 8128 root->nodesize); 8129 return ret; 8130 } 8131 8132 leaf = path->nodes[0]; 8133 extent_item = btrfs_item_ptr(leaf, path->slots[0], 8134 struct btrfs_extent_item); 8135 btrfs_set_extent_refs(leaf, extent_item, 1); 8136 btrfs_set_extent_generation(leaf, extent_item, trans->transid); 8137 btrfs_set_extent_flags(leaf, extent_item, 8138 flags | BTRFS_EXTENT_FLAG_TREE_BLOCK); 8139 8140 if (skinny_metadata) { 8141 iref = (struct btrfs_extent_inline_ref *)(extent_item + 1); 8142 num_bytes = root->nodesize; 8143 } else { 8144 block_info = (struct btrfs_tree_block_info *)(extent_item + 1); 8145 btrfs_set_tree_block_key(leaf, block_info, key); 8146 btrfs_set_tree_block_level(leaf, block_info, level); 8147 iref = (struct btrfs_extent_inline_ref *)(block_info + 1); 8148 } 
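	/*
	 * At this point the new item has one of two layouts, matching the
	 * size passed to btrfs_insert_empty_item() above:
	 *
	 *   skinny metadata: [btrfs_extent_item][btrfs_extent_inline_ref]
	 *   otherwise:       [btrfs_extent_item][btrfs_tree_block_info]
	 *                    [btrfs_extent_inline_ref]
	 *
	 * The inline ref type and offset are filled in below.
	 */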
8149 8150 if (parent > 0) { 8151 BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)); 8152 btrfs_set_extent_inline_ref_type(leaf, iref, 8153 BTRFS_SHARED_BLOCK_REF_KEY); 8154 btrfs_set_extent_inline_ref_offset(leaf, iref, parent); 8155 } else { 8156 btrfs_set_extent_inline_ref_type(leaf, iref, 8157 BTRFS_TREE_BLOCK_REF_KEY); 8158 btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid); 8159 } 8160 8161 btrfs_mark_buffer_dirty(leaf); 8162 btrfs_free_path(path); 8163 8164 ret = remove_from_free_space_tree(trans, fs_info, ins->objectid, 8165 num_bytes); 8166 if (ret) 8167 return ret; 8168 8169 ret = update_block_group(trans, root, ins->objectid, root->nodesize, 8170 1); 8171 if (ret) { /* -ENOENT, logic error */ 8172 btrfs_err(fs_info, "update block group failed for %llu %llu", 8173 ins->objectid, ins->offset); 8174 BUG(); 8175 } 8176 8177 trace_btrfs_reserved_extent_alloc(root, ins->objectid, root->nodesize); 8178 return ret; 8179 } 8180 8181 int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, 8182 struct btrfs_root *root, 8183 u64 root_objectid, u64 owner, 8184 u64 offset, u64 ram_bytes, 8185 struct btrfs_key *ins) 8186 { 8187 int ret; 8188 8189 BUG_ON(root_objectid == BTRFS_TREE_LOG_OBJECTID); 8190 8191 ret = btrfs_add_delayed_data_ref(root->fs_info, trans, ins->objectid, 8192 ins->offset, 0, 8193 root_objectid, owner, offset, 8194 ram_bytes, BTRFS_ADD_DELAYED_EXTENT, 8195 NULL); 8196 return ret; 8197 } 8198 8199 /* 8200 * this is used by the tree logging recovery code. It records that 8201 * an extent has been allocated and makes sure to clear the free 8202 * space cache bits as well 8203 */ 8204 int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, 8205 struct btrfs_root *root, 8206 u64 root_objectid, u64 owner, u64 offset, 8207 struct btrfs_key *ins) 8208 { 8209 int ret; 8210 struct btrfs_block_group_cache *block_group; 8211 8212 /* 8213 * Mixed block groups will exclude before processing the log so we only 8214 * need to do the exclude dance if this fs isn't mixed. 8215 */ 8216 if (!btrfs_fs_incompat(root->fs_info, MIXED_GROUPS)) { 8217 ret = __exclude_logged_extent(root, ins->objectid, ins->offset); 8218 if (ret) 8219 return ret; 8220 } 8221 8222 block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid); 8223 if (!block_group) 8224 return -EINVAL; 8225 8226 ret = btrfs_update_reserved_bytes(block_group, ins->offset, 8227 RESERVE_ALLOC_NO_ACCOUNT, 0); 8228 BUG_ON(ret); /* logic error */ 8229 ret = alloc_reserved_file_extent(trans, root, 0, root_objectid, 8230 0, owner, offset, ins, 1); 8231 btrfs_put_block_group(block_group); 8232 return ret; 8233 } 8234 8235 static struct extent_buffer * 8236 btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, 8237 u64 bytenr, int level) 8238 { 8239 struct extent_buffer *buf; 8240 8241 buf = btrfs_find_create_tree_block(root, bytenr); 8242 if (IS_ERR(buf)) 8243 return buf; 8244 8245 btrfs_set_header_generation(buf, trans->transid); 8246 btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level); 8247 btrfs_tree_lock(buf); 8248 clean_tree_block(trans, root->fs_info, buf); 8249 clear_bit(EXTENT_BUFFER_STALE, &buf->bflags); 8250 8251 btrfs_set_lock_blocking(buf); 8252 set_extent_buffer_uptodate(buf); 8253 8254 if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) { 8255 buf->log_index = root->log_transid % 2; 8256 /* 8257 * we allow two log transactions at a time, use different 8258 * EXENT bit to differentiate dirty pages. 
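 * (log_index 0 marks the range with set_extent_dirty(), log_index 1 with
 * set_extent_new(); see the two branches below.)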
8259 */ 8260 if (buf->log_index == 0) 8261 set_extent_dirty(&root->dirty_log_pages, buf->start, 8262 buf->start + buf->len - 1, GFP_NOFS); 8263 else 8264 set_extent_new(&root->dirty_log_pages, buf->start, 8265 buf->start + buf->len - 1); 8266 } else { 8267 buf->log_index = -1; 8268 set_extent_dirty(&trans->transaction->dirty_pages, buf->start, 8269 buf->start + buf->len - 1, GFP_NOFS); 8270 } 8271 trans->dirty = true; 8272 /* this returns a buffer locked for blocking */ 8273 return buf; 8274 } 8275 8276 static struct btrfs_block_rsv * 8277 use_block_rsv(struct btrfs_trans_handle *trans, 8278 struct btrfs_root *root, u32 blocksize) 8279 { 8280 struct btrfs_block_rsv *block_rsv; 8281 struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv; 8282 int ret; 8283 bool global_updated = false; 8284 8285 block_rsv = get_block_rsv(trans, root); 8286 8287 if (unlikely(block_rsv->size == 0)) 8288 goto try_reserve; 8289 again: 8290 ret = block_rsv_use_bytes(block_rsv, blocksize); 8291 if (!ret) 8292 return block_rsv; 8293 8294 if (block_rsv->failfast) 8295 return ERR_PTR(ret); 8296 8297 if (block_rsv->type == BTRFS_BLOCK_RSV_GLOBAL && !global_updated) { 8298 global_updated = true; 8299 update_global_block_rsv(root->fs_info); 8300 goto again; 8301 } 8302 8303 if (btrfs_test_opt(root->fs_info, ENOSPC_DEBUG)) { 8304 static DEFINE_RATELIMIT_STATE(_rs, 8305 DEFAULT_RATELIMIT_INTERVAL * 10, 8306 /*DEFAULT_RATELIMIT_BURST*/ 1); 8307 if (__ratelimit(&_rs)) 8308 WARN(1, KERN_DEBUG 8309 "BTRFS: block rsv returned %d\n", ret); 8310 } 8311 try_reserve: 8312 ret = reserve_metadata_bytes(root, block_rsv, blocksize, 8313 BTRFS_RESERVE_NO_FLUSH); 8314 if (!ret) 8315 return block_rsv; 8316 /* 8317 * If we couldn't reserve metadata bytes try and use some from 8318 * the global reserve if its space type is the same as the global 8319 * reservation. 8320 */ 8321 if (block_rsv->type != BTRFS_BLOCK_RSV_GLOBAL && 8322 block_rsv->space_info == global_rsv->space_info) { 8323 ret = block_rsv_use_bytes(global_rsv, blocksize); 8324 if (!ret) 8325 return global_rsv; 8326 } 8327 return ERR_PTR(ret); 8328 } 8329 8330 static void unuse_block_rsv(struct btrfs_fs_info *fs_info, 8331 struct btrfs_block_rsv *block_rsv, u32 blocksize) 8332 { 8333 block_rsv_add_bytes(block_rsv, blocksize, 0); 8334 block_rsv_release_bytes(fs_info, block_rsv, NULL, 0); 8335 } 8336 8337 /* 8338 * finds a free extent and does all the dirty work required for allocation 8339 * returns the tree buffer or an ERR_PTR on error. 
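 * Roughly: use_block_rsv() charges the metadata reservation,
 * btrfs_reserve_extent() finds a nodesize extent, btrfs_init_new_buffer()
 * hands back the buffer locked for blocking, and for everything but the
 * log tree a delayed tree ref carrying an extent_op (key, flags, level)
 * is queued.  The error labels unwind those steps in reverse order.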
8340 */ 8341 struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, 8342 struct btrfs_root *root, 8343 u64 parent, u64 root_objectid, 8344 struct btrfs_disk_key *key, int level, 8345 u64 hint, u64 empty_size) 8346 { 8347 struct btrfs_key ins; 8348 struct btrfs_block_rsv *block_rsv; 8349 struct extent_buffer *buf; 8350 struct btrfs_delayed_extent_op *extent_op; 8351 u64 flags = 0; 8352 int ret; 8353 u32 blocksize = root->nodesize; 8354 bool skinny_metadata = btrfs_fs_incompat(root->fs_info, 8355 SKINNY_METADATA); 8356 8357 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS 8358 if (btrfs_is_testing(root->fs_info)) { 8359 buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr, 8360 level); 8361 if (!IS_ERR(buf)) 8362 root->alloc_bytenr += blocksize; 8363 return buf; 8364 } 8365 #endif 8366 8367 block_rsv = use_block_rsv(trans, root, blocksize); 8368 if (IS_ERR(block_rsv)) 8369 return ERR_CAST(block_rsv); 8370 8371 ret = btrfs_reserve_extent(root, blocksize, blocksize, 8372 empty_size, hint, &ins, 0, 0); 8373 if (ret) 8374 goto out_unuse; 8375 8376 buf = btrfs_init_new_buffer(trans, root, ins.objectid, level); 8377 if (IS_ERR(buf)) { 8378 ret = PTR_ERR(buf); 8379 goto out_free_reserved; 8380 } 8381 8382 if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) { 8383 if (parent == 0) 8384 parent = ins.objectid; 8385 flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF; 8386 } else 8387 BUG_ON(parent > 0); 8388 8389 if (root_objectid != BTRFS_TREE_LOG_OBJECTID) { 8390 extent_op = btrfs_alloc_delayed_extent_op(); 8391 if (!extent_op) { 8392 ret = -ENOMEM; 8393 goto out_free_buf; 8394 } 8395 if (key) 8396 memcpy(&extent_op->key, key, sizeof(extent_op->key)); 8397 else 8398 memset(&extent_op->key, 0, sizeof(extent_op->key)); 8399 extent_op->flags_to_set = flags; 8400 extent_op->update_key = skinny_metadata ? 
false : true; 8401 extent_op->update_flags = true; 8402 extent_op->is_data = false; 8403 extent_op->level = level; 8404 8405 ret = btrfs_add_delayed_tree_ref(root->fs_info, trans, 8406 ins.objectid, ins.offset, 8407 parent, root_objectid, level, 8408 BTRFS_ADD_DELAYED_EXTENT, 8409 extent_op); 8410 if (ret) 8411 goto out_free_delayed; 8412 } 8413 return buf; 8414 8415 out_free_delayed: 8416 btrfs_free_delayed_extent_op(extent_op); 8417 out_free_buf: 8418 free_extent_buffer(buf); 8419 out_free_reserved: 8420 btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 0); 8421 out_unuse: 8422 unuse_block_rsv(root->fs_info, block_rsv, blocksize); 8423 return ERR_PTR(ret); 8424 } 8425 8426 struct walk_control { 8427 u64 refs[BTRFS_MAX_LEVEL]; 8428 u64 flags[BTRFS_MAX_LEVEL]; 8429 struct btrfs_key update_progress; 8430 int stage; 8431 int level; 8432 int shared_level; 8433 int update_ref; 8434 int keep_locks; 8435 int reada_slot; 8436 int reada_count; 8437 int for_reloc; 8438 }; 8439 8440 #define DROP_REFERENCE 1 8441 #define UPDATE_BACKREF 2 8442 8443 static noinline void reada_walk_down(struct btrfs_trans_handle *trans, 8444 struct btrfs_root *root, 8445 struct walk_control *wc, 8446 struct btrfs_path *path) 8447 { 8448 u64 bytenr; 8449 u64 generation; 8450 u64 refs; 8451 u64 flags; 8452 u32 nritems; 8453 u32 blocksize; 8454 struct btrfs_key key; 8455 struct extent_buffer *eb; 8456 int ret; 8457 int slot; 8458 int nread = 0; 8459 8460 if (path->slots[wc->level] < wc->reada_slot) { 8461 wc->reada_count = wc->reada_count * 2 / 3; 8462 wc->reada_count = max(wc->reada_count, 2); 8463 } else { 8464 wc->reada_count = wc->reada_count * 3 / 2; 8465 wc->reada_count = min_t(int, wc->reada_count, 8466 BTRFS_NODEPTRS_PER_BLOCK(root)); 8467 } 8468 8469 eb = path->nodes[wc->level]; 8470 nritems = btrfs_header_nritems(eb); 8471 blocksize = root->nodesize; 8472 8473 for (slot = path->slots[wc->level]; slot < nritems; slot++) { 8474 if (nread >= wc->reada_count) 8475 break; 8476 8477 cond_resched(); 8478 bytenr = btrfs_node_blockptr(eb, slot); 8479 generation = btrfs_node_ptr_generation(eb, slot); 8480 8481 if (slot == path->slots[wc->level]) 8482 goto reada; 8483 8484 if (wc->stage == UPDATE_BACKREF && 8485 generation <= root->root_key.offset) 8486 continue; 8487 8488 /* We don't lock the tree block, it's OK to be racy here */ 8489 ret = btrfs_lookup_extent_info(trans, root, bytenr, 8490 wc->level - 1, 1, &refs, 8491 &flags); 8492 /* We don't care about errors in readahead. */ 8493 if (ret < 0) 8494 continue; 8495 BUG_ON(refs == 0); 8496 8497 if (wc->stage == DROP_REFERENCE) { 8498 if (refs == 1) 8499 goto reada; 8500 8501 if (wc->level == 1 && 8502 (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) 8503 continue; 8504 if (!wc->update_ref || 8505 generation <= root->root_key.offset) 8506 continue; 8507 btrfs_node_key_to_cpu(eb, &key, slot); 8508 ret = btrfs_comp_cpu_keys(&key, 8509 &wc->update_progress); 8510 if (ret < 0) 8511 continue; 8512 } else { 8513 if (wc->level == 1 && 8514 (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) 8515 continue; 8516 } 8517 reada: 8518 readahead_tree_block(root, bytenr); 8519 nread++; 8520 } 8521 wc->reada_slot = slot; 8522 } 8523 8524 /* 8525 * These may not be seen by the usual inc/dec ref code so we have to 8526 * add them here. 
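 * The record is inserted under delayed_refs->lock; if
 * btrfs_qgroup_insert_dirty_extent() does not take ownership of it
 * (typically because a record for this bytenr already exists) the fresh
 * allocation is simply freed again.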
8527 */ 8528 static int record_one_subtree_extent(struct btrfs_trans_handle *trans, 8529 struct btrfs_root *root, u64 bytenr, 8530 u64 num_bytes) 8531 { 8532 struct btrfs_qgroup_extent_record *qrecord; 8533 struct btrfs_delayed_ref_root *delayed_refs; 8534 8535 qrecord = kmalloc(sizeof(*qrecord), GFP_NOFS); 8536 if (!qrecord) 8537 return -ENOMEM; 8538 8539 qrecord->bytenr = bytenr; 8540 qrecord->num_bytes = num_bytes; 8541 qrecord->old_roots = NULL; 8542 8543 delayed_refs = &trans->transaction->delayed_refs; 8544 spin_lock(&delayed_refs->lock); 8545 if (btrfs_qgroup_insert_dirty_extent(trans->fs_info, 8546 delayed_refs, qrecord)) 8547 kfree(qrecord); 8548 spin_unlock(&delayed_refs->lock); 8549 8550 return 0; 8551 } 8552 8553 static int account_leaf_items(struct btrfs_trans_handle *trans, 8554 struct btrfs_root *root, 8555 struct extent_buffer *eb) 8556 { 8557 int nr = btrfs_header_nritems(eb); 8558 int i, extent_type, ret; 8559 struct btrfs_key key; 8560 struct btrfs_file_extent_item *fi; 8561 u64 bytenr, num_bytes; 8562 8563 /* We can be called directly from walk_up_proc() */ 8564 if (!root->fs_info->quota_enabled) 8565 return 0; 8566 8567 for (i = 0; i < nr; i++) { 8568 btrfs_item_key_to_cpu(eb, &key, i); 8569 8570 if (key.type != BTRFS_EXTENT_DATA_KEY) 8571 continue; 8572 8573 fi = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item); 8574 /* filter out non qgroup-accountable extents */ 8575 extent_type = btrfs_file_extent_type(eb, fi); 8576 8577 if (extent_type == BTRFS_FILE_EXTENT_INLINE) 8578 continue; 8579 8580 bytenr = btrfs_file_extent_disk_bytenr(eb, fi); 8581 if (!bytenr) 8582 continue; 8583 8584 num_bytes = btrfs_file_extent_disk_num_bytes(eb, fi); 8585 8586 ret = record_one_subtree_extent(trans, root, bytenr, num_bytes); 8587 if (ret) 8588 return ret; 8589 } 8590 return 0; 8591 } 8592 8593 /* 8594 * Walk up the tree from the bottom, freeing leaves and any interior 8595 * nodes which have had all slots visited. If a node (leaf or 8596 * interior) is freed, the node above it will have it's slot 8597 * incremented. The root node will never be freed. 8598 * 8599 * At the end of this function, we should have a path which has all 8600 * slots incremented to the next position for a search. If we need to 8601 * read a new node it will be NULL and the node above it will have the 8602 * correct slot selected for a later read. 8603 * 8604 * If we increment the root nodes slot counter past the number of 8605 * elements, 1 is returned to signal completion of the search. 8606 */ 8607 static int adjust_slots_upwards(struct btrfs_root *root, 8608 struct btrfs_path *path, int root_level) 8609 { 8610 int level = 0; 8611 int nr, slot; 8612 struct extent_buffer *eb; 8613 8614 if (root_level == 0) 8615 return 1; 8616 8617 while (level <= root_level) { 8618 eb = path->nodes[level]; 8619 nr = btrfs_header_nritems(eb); 8620 path->slots[level]++; 8621 slot = path->slots[level]; 8622 if (slot >= nr || level == 0) { 8623 /* 8624 * Don't free the root - we will detect this 8625 * condition after our loop and return a 8626 * positive value for caller to stop walking the tree. 8627 */ 8628 if (level != root_level) { 8629 btrfs_tree_unlock_rw(eb, path->locks[level]); 8630 path->locks[level] = 0; 8631 8632 free_extent_buffer(eb); 8633 path->nodes[level] = NULL; 8634 path->slots[level] = 0; 8635 } 8636 } else { 8637 /* 8638 * We have a valid slot to walk back down 8639 * from. Stop here so caller can process these 8640 * new nodes. 
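 * (path->slots[level] was already advanced above, so the break leaves
 * it pointing at the next child to visit; the levels below it have been
 * released and set to NULL in earlier iterations.)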
8641 */ 8642 break; 8643 } 8644 8645 level++; 8646 } 8647 8648 eb = path->nodes[root_level]; 8649 if (path->slots[root_level] >= btrfs_header_nritems(eb)) 8650 return 1; 8651 8652 return 0; 8653 } 8654 8655 /* 8656 * root_eb is the subtree root and is locked before this function is called. 8657 */ 8658 static int account_shared_subtree(struct btrfs_trans_handle *trans, 8659 struct btrfs_root *root, 8660 struct extent_buffer *root_eb, 8661 u64 root_gen, 8662 int root_level) 8663 { 8664 int ret = 0; 8665 int level; 8666 struct extent_buffer *eb = root_eb; 8667 struct btrfs_path *path = NULL; 8668 8669 BUG_ON(root_level < 0 || root_level > BTRFS_MAX_LEVEL); 8670 BUG_ON(root_eb == NULL); 8671 8672 if (!root->fs_info->quota_enabled) 8673 return 0; 8674 8675 if (!extent_buffer_uptodate(root_eb)) { 8676 ret = btrfs_read_buffer(root_eb, root_gen); 8677 if (ret) 8678 goto out; 8679 } 8680 8681 if (root_level == 0) { 8682 ret = account_leaf_items(trans, root, root_eb); 8683 goto out; 8684 } 8685 8686 path = btrfs_alloc_path(); 8687 if (!path) 8688 return -ENOMEM; 8689 8690 /* 8691 * Walk down the tree. Missing extent blocks are filled in as 8692 * we go. Metadata is accounted every time we read a new 8693 * extent block. 8694 * 8695 * When we reach a leaf, we account for file extent items in it, 8696 * walk back up the tree (adjusting slot pointers as we go) 8697 * and restart the search process. 8698 */ 8699 extent_buffer_get(root_eb); /* For path */ 8700 path->nodes[root_level] = root_eb; 8701 path->slots[root_level] = 0; 8702 path->locks[root_level] = 0; /* so release_path doesn't try to unlock */ 8703 walk_down: 8704 level = root_level; 8705 while (level >= 0) { 8706 if (path->nodes[level] == NULL) { 8707 int parent_slot; 8708 u64 child_gen; 8709 u64 child_bytenr; 8710 8711 /* We need to get child blockptr/gen from 8712 * parent before we can read it. */ 8713 eb = path->nodes[level + 1]; 8714 parent_slot = path->slots[level + 1]; 8715 child_bytenr = btrfs_node_blockptr(eb, parent_slot); 8716 child_gen = btrfs_node_ptr_generation(eb, parent_slot); 8717 8718 eb = read_tree_block(root, child_bytenr, child_gen); 8719 if (IS_ERR(eb)) { 8720 ret = PTR_ERR(eb); 8721 goto out; 8722 } else if (!extent_buffer_uptodate(eb)) { 8723 free_extent_buffer(eb); 8724 ret = -EIO; 8725 goto out; 8726 } 8727 8728 path->nodes[level] = eb; 8729 path->slots[level] = 0; 8730 8731 btrfs_tree_read_lock(eb); 8732 btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); 8733 path->locks[level] = BTRFS_READ_LOCK_BLOCKING; 8734 8735 ret = record_one_subtree_extent(trans, root, child_bytenr, 8736 root->nodesize); 8737 if (ret) 8738 goto out; 8739 } 8740 8741 if (level == 0) { 8742 ret = account_leaf_items(trans, root, path->nodes[level]); 8743 if (ret) 8744 goto out; 8745 8746 /* Nonzero return here means we completed our search */ 8747 ret = adjust_slots_upwards(root, path, root_level); 8748 if (ret) 8749 break; 8750 8751 /* Restart search with new slots */ 8752 goto walk_down; 8753 } 8754 8755 level--; 8756 } 8757 8758 ret = 0; 8759 out: 8760 btrfs_free_path(path); 8761 8762 return ret; 8763 } 8764 8765 /* 8766 * helper to process tree block while walking down the tree. 8767 * 8768 * when wc->stage == UPDATE_BACKREF, this function updates 8769 * back refs for pointers in the block. 8770 * 8771 * NOTE: return value 1 means we should stop walking down. 
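 * (Concretely, 1 is returned for a still-shared block, i.e. refs > 1,
 * in the DROP_REFERENCE stage, and for a block not owned by this root
 * in the UPDATE_BACKREF stage.)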
8772 */ 8773 static noinline int walk_down_proc(struct btrfs_trans_handle *trans, 8774 struct btrfs_root *root, 8775 struct btrfs_path *path, 8776 struct walk_control *wc, int lookup_info) 8777 { 8778 int level = wc->level; 8779 struct extent_buffer *eb = path->nodes[level]; 8780 u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF; 8781 int ret; 8782 8783 if (wc->stage == UPDATE_BACKREF && 8784 btrfs_header_owner(eb) != root->root_key.objectid) 8785 return 1; 8786 8787 /* 8788 * when reference count of tree block is 1, it won't increase 8789 * again. once full backref flag is set, we never clear it. 8790 */ 8791 if (lookup_info && 8792 ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) || 8793 (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) { 8794 BUG_ON(!path->locks[level]); 8795 ret = btrfs_lookup_extent_info(trans, root, 8796 eb->start, level, 1, 8797 &wc->refs[level], 8798 &wc->flags[level]); 8799 BUG_ON(ret == -ENOMEM); 8800 if (ret) 8801 return ret; 8802 BUG_ON(wc->refs[level] == 0); 8803 } 8804 8805 if (wc->stage == DROP_REFERENCE) { 8806 if (wc->refs[level] > 1) 8807 return 1; 8808 8809 if (path->locks[level] && !wc->keep_locks) { 8810 btrfs_tree_unlock_rw(eb, path->locks[level]); 8811 path->locks[level] = 0; 8812 } 8813 return 0; 8814 } 8815 8816 /* wc->stage == UPDATE_BACKREF */ 8817 if (!(wc->flags[level] & flag)) { 8818 BUG_ON(!path->locks[level]); 8819 ret = btrfs_inc_ref(trans, root, eb, 1); 8820 BUG_ON(ret); /* -ENOMEM */ 8821 ret = btrfs_dec_ref(trans, root, eb, 0); 8822 BUG_ON(ret); /* -ENOMEM */ 8823 ret = btrfs_set_disk_extent_flags(trans, root, eb->start, 8824 eb->len, flag, 8825 btrfs_header_level(eb), 0); 8826 BUG_ON(ret); /* -ENOMEM */ 8827 wc->flags[level] |= flag; 8828 } 8829 8830 /* 8831 * the block is shared by multiple trees, so it's not good to 8832 * keep the tree lock 8833 */ 8834 if (path->locks[level] && level > 0) { 8835 btrfs_tree_unlock_rw(eb, path->locks[level]); 8836 path->locks[level] = 0; 8837 } 8838 return 0; 8839 } 8840 8841 /* 8842 * helper to process tree block pointer. 8843 * 8844 * when wc->stage == DROP_REFERENCE, this function checks 8845 * reference count of the block pointed to. if the block 8846 * is shared and we need update back refs for the subtree 8847 * rooted at the block, this function changes wc->stage to 8848 * UPDATE_BACKREF. if the block is shared and there is no 8849 * need to update back, this function drops the reference 8850 * to the block. 8851 * 8852 * NOTE: return value 1 means we should stop walking down. 
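 * (A return value of 1 also covers the skip path, where we do not
 * descend into the child at all: under DROP_REFERENCE its reference is
 * dropped, the shared subtree is accounted for qgroups when needed, and
 * the caller simply moves on to the next slot.)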
8853 */ 8854 static noinline int do_walk_down(struct btrfs_trans_handle *trans, 8855 struct btrfs_root *root, 8856 struct btrfs_path *path, 8857 struct walk_control *wc, int *lookup_info) 8858 { 8859 u64 bytenr; 8860 u64 generation; 8861 u64 parent; 8862 u32 blocksize; 8863 struct btrfs_key key; 8864 struct extent_buffer *next; 8865 int level = wc->level; 8866 int reada = 0; 8867 int ret = 0; 8868 bool need_account = false; 8869 8870 generation = btrfs_node_ptr_generation(path->nodes[level], 8871 path->slots[level]); 8872 /* 8873 * if the lower level block was created before the snapshot 8874 * was created, we know there is no need to update back refs 8875 * for the subtree 8876 */ 8877 if (wc->stage == UPDATE_BACKREF && 8878 generation <= root->root_key.offset) { 8879 *lookup_info = 1; 8880 return 1; 8881 } 8882 8883 bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]); 8884 blocksize = root->nodesize; 8885 8886 next = btrfs_find_tree_block(root->fs_info, bytenr); 8887 if (!next) { 8888 next = btrfs_find_create_tree_block(root, bytenr); 8889 if (IS_ERR(next)) 8890 return PTR_ERR(next); 8891 8892 btrfs_set_buffer_lockdep_class(root->root_key.objectid, next, 8893 level - 1); 8894 reada = 1; 8895 } 8896 btrfs_tree_lock(next); 8897 btrfs_set_lock_blocking(next); 8898 8899 ret = btrfs_lookup_extent_info(trans, root, bytenr, level - 1, 1, 8900 &wc->refs[level - 1], 8901 &wc->flags[level - 1]); 8902 if (ret < 0) { 8903 btrfs_tree_unlock(next); 8904 return ret; 8905 } 8906 8907 if (unlikely(wc->refs[level - 1] == 0)) { 8908 btrfs_err(root->fs_info, "Missing references."); 8909 BUG(); 8910 } 8911 *lookup_info = 0; 8912 8913 if (wc->stage == DROP_REFERENCE) { 8914 if (wc->refs[level - 1] > 1) { 8915 need_account = true; 8916 if (level == 1 && 8917 (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF)) 8918 goto skip; 8919 8920 if (!wc->update_ref || 8921 generation <= root->root_key.offset) 8922 goto skip; 8923 8924 btrfs_node_key_to_cpu(path->nodes[level], &key, 8925 path->slots[level]); 8926 ret = btrfs_comp_cpu_keys(&key, &wc->update_progress); 8927 if (ret < 0) 8928 goto skip; 8929 8930 wc->stage = UPDATE_BACKREF; 8931 wc->shared_level = level - 1; 8932 } 8933 } else { 8934 if (level == 1 && 8935 (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF)) 8936 goto skip; 8937 } 8938 8939 if (!btrfs_buffer_uptodate(next, generation, 0)) { 8940 btrfs_tree_unlock(next); 8941 free_extent_buffer(next); 8942 next = NULL; 8943 *lookup_info = 1; 8944 } 8945 8946 if (!next) { 8947 if (reada && level == 1) 8948 reada_walk_down(trans, root, wc, path); 8949 next = read_tree_block(root, bytenr, generation); 8950 if (IS_ERR(next)) { 8951 return PTR_ERR(next); 8952 } else if (!extent_buffer_uptodate(next)) { 8953 free_extent_buffer(next); 8954 return -EIO; 8955 } 8956 btrfs_tree_lock(next); 8957 btrfs_set_lock_blocking(next); 8958 } 8959 8960 level--; 8961 BUG_ON(level != btrfs_header_level(next)); 8962 path->nodes[level] = next; 8963 path->slots[level] = 0; 8964 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; 8965 wc->level = level; 8966 if (wc->level == 1) 8967 wc->reada_slot = 0; 8968 return 0; 8969 skip: 8970 wc->refs[level - 1] = 0; 8971 wc->flags[level - 1] = 0; 8972 if (wc->stage == DROP_REFERENCE) { 8973 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) { 8974 parent = path->nodes[level]->start; 8975 } else { 8976 BUG_ON(root->root_key.objectid != 8977 btrfs_header_owner(path->nodes[level])); 8978 parent = 0; 8979 } 8980 8981 if (need_account) { 8982 ret = account_shared_subtree(trans, root, next, 
8983 generation, level - 1); 8984 if (ret) { 8985 btrfs_err_rl(root->fs_info, 8986 "Error " 8987 "%d accounting shared subtree. Quota " 8988 "is out of sync, rescan required.", 8989 ret); 8990 } 8991 } 8992 ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent, 8993 root->root_key.objectid, level - 1, 0); 8994 BUG_ON(ret); /* -ENOMEM */ 8995 } 8996 btrfs_tree_unlock(next); 8997 free_extent_buffer(next); 8998 *lookup_info = 1; 8999 return 1; 9000 } 9001 9002 /* 9003 * helper to process tree block while walking up the tree. 9004 * 9005 * when wc->stage == DROP_REFERENCE, this function drops 9006 * reference count on the block. 9007 * 9008 * when wc->stage == UPDATE_BACKREF, this function changes 9009 * wc->stage back to DROP_REFERENCE if we changed wc->stage 9010 * to UPDATE_BACKREF previously while processing the block. 9011 * 9012 * NOTE: return value 1 means we should stop walking up. 9013 */ 9014 static noinline int walk_up_proc(struct btrfs_trans_handle *trans, 9015 struct btrfs_root *root, 9016 struct btrfs_path *path, 9017 struct walk_control *wc) 9018 { 9019 int ret; 9020 int level = wc->level; 9021 struct extent_buffer *eb = path->nodes[level]; 9022 u64 parent = 0; 9023 9024 if (wc->stage == UPDATE_BACKREF) { 9025 BUG_ON(wc->shared_level < level); 9026 if (level < wc->shared_level) 9027 goto out; 9028 9029 ret = find_next_key(path, level + 1, &wc->update_progress); 9030 if (ret > 0) 9031 wc->update_ref = 0; 9032 9033 wc->stage = DROP_REFERENCE; 9034 wc->shared_level = -1; 9035 path->slots[level] = 0; 9036 9037 /* 9038 * check reference count again if the block isn't locked. 9039 * we should start walking down the tree again if reference 9040 * count is one. 9041 */ 9042 if (!path->locks[level]) { 9043 BUG_ON(level == 0); 9044 btrfs_tree_lock(eb); 9045 btrfs_set_lock_blocking(eb); 9046 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; 9047 9048 ret = btrfs_lookup_extent_info(trans, root, 9049 eb->start, level, 1, 9050 &wc->refs[level], 9051 &wc->flags[level]); 9052 if (ret < 0) { 9053 btrfs_tree_unlock_rw(eb, path->locks[level]); 9054 path->locks[level] = 0; 9055 return ret; 9056 } 9057 BUG_ON(wc->refs[level] == 0); 9058 if (wc->refs[level] == 1) { 9059 btrfs_tree_unlock_rw(eb, path->locks[level]); 9060 path->locks[level] = 0; 9061 return 1; 9062 } 9063 } 9064 } 9065 9066 /* wc->stage == DROP_REFERENCE */ 9067 BUG_ON(wc->refs[level] > 1 && !path->locks[level]); 9068 9069 if (wc->refs[level] == 1) { 9070 if (level == 0) { 9071 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) 9072 ret = btrfs_dec_ref(trans, root, eb, 1); 9073 else 9074 ret = btrfs_dec_ref(trans, root, eb, 0); 9075 BUG_ON(ret); /* -ENOMEM */ 9076 ret = account_leaf_items(trans, root, eb); 9077 if (ret) { 9078 btrfs_err_rl(root->fs_info, 9079 "error " 9080 "%d accounting leaf items. 
Quota " 9081 "is out of sync, rescan required.", 9082 ret); 9083 } 9084 } 9085 /* make block locked assertion in clean_tree_block happy */ 9086 if (!path->locks[level] && 9087 btrfs_header_generation(eb) == trans->transid) { 9088 btrfs_tree_lock(eb); 9089 btrfs_set_lock_blocking(eb); 9090 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; 9091 } 9092 clean_tree_block(trans, root->fs_info, eb); 9093 } 9094 9095 if (eb == root->node) { 9096 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) 9097 parent = eb->start; 9098 else 9099 BUG_ON(root->root_key.objectid != 9100 btrfs_header_owner(eb)); 9101 } else { 9102 if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF) 9103 parent = path->nodes[level + 1]->start; 9104 else 9105 BUG_ON(root->root_key.objectid != 9106 btrfs_header_owner(path->nodes[level + 1])); 9107 } 9108 9109 btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1); 9110 out: 9111 wc->refs[level] = 0; 9112 wc->flags[level] = 0; 9113 return 0; 9114 } 9115 9116 static noinline int walk_down_tree(struct btrfs_trans_handle *trans, 9117 struct btrfs_root *root, 9118 struct btrfs_path *path, 9119 struct walk_control *wc) 9120 { 9121 int level = wc->level; 9122 int lookup_info = 1; 9123 int ret; 9124 9125 while (level >= 0) { 9126 ret = walk_down_proc(trans, root, path, wc, lookup_info); 9127 if (ret > 0) 9128 break; 9129 9130 if (level == 0) 9131 break; 9132 9133 if (path->slots[level] >= 9134 btrfs_header_nritems(path->nodes[level])) 9135 break; 9136 9137 ret = do_walk_down(trans, root, path, wc, &lookup_info); 9138 if (ret > 0) { 9139 path->slots[level]++; 9140 continue; 9141 } else if (ret < 0) 9142 return ret; 9143 level = wc->level; 9144 } 9145 return 0; 9146 } 9147 9148 static noinline int walk_up_tree(struct btrfs_trans_handle *trans, 9149 struct btrfs_root *root, 9150 struct btrfs_path *path, 9151 struct walk_control *wc, int max_level) 9152 { 9153 int level = wc->level; 9154 int ret; 9155 9156 path->slots[level] = btrfs_header_nritems(path->nodes[level]); 9157 while (level < max_level && path->nodes[level]) { 9158 wc->level = level; 9159 if (path->slots[level] + 1 < 9160 btrfs_header_nritems(path->nodes[level])) { 9161 path->slots[level]++; 9162 return 0; 9163 } else { 9164 ret = walk_up_proc(trans, root, path, wc); 9165 if (ret > 0) 9166 return 0; 9167 9168 if (path->locks[level]) { 9169 btrfs_tree_unlock_rw(path->nodes[level], 9170 path->locks[level]); 9171 path->locks[level] = 0; 9172 } 9173 free_extent_buffer(path->nodes[level]); 9174 path->nodes[level] = NULL; 9175 level++; 9176 } 9177 } 9178 return 1; 9179 } 9180 9181 /* 9182 * drop a subvolume tree. 9183 * 9184 * this function traverses the tree freeing any blocks that only 9185 * referenced by the tree. 9186 * 9187 * when a shared tree block is found. this function decreases its 9188 * reference count by one. if update_ref is true, this function 9189 * also make sure backrefs for the shared block and all lower level 9190 * blocks are properly updated. 
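 * Progress is recorded in root_item->drop_progress / drop_level and
 * committed via btrfs_update_root() before the transaction is ended, so
 * a later call (for instance from the cleaner thread after an unmount or
 * power failure) can resume at the recorded key.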
9191 * 9192 * If called with for_reloc == 0, may exit early with -EAGAIN 9193 */ 9194 int btrfs_drop_snapshot(struct btrfs_root *root, 9195 struct btrfs_block_rsv *block_rsv, int update_ref, 9196 int for_reloc) 9197 { 9198 struct btrfs_path *path; 9199 struct btrfs_trans_handle *trans; 9200 struct btrfs_root *tree_root = root->fs_info->tree_root; 9201 struct btrfs_root_item *root_item = &root->root_item; 9202 struct walk_control *wc; 9203 struct btrfs_key key; 9204 int err = 0; 9205 int ret; 9206 int level; 9207 bool root_dropped = false; 9208 9209 btrfs_debug(root->fs_info, "Drop subvolume %llu", root->objectid); 9210 9211 path = btrfs_alloc_path(); 9212 if (!path) { 9213 err = -ENOMEM; 9214 goto out; 9215 } 9216 9217 wc = kzalloc(sizeof(*wc), GFP_NOFS); 9218 if (!wc) { 9219 btrfs_free_path(path); 9220 err = -ENOMEM; 9221 goto out; 9222 } 9223 9224 trans = btrfs_start_transaction(tree_root, 0); 9225 if (IS_ERR(trans)) { 9226 err = PTR_ERR(trans); 9227 goto out_free; 9228 } 9229 9230 if (block_rsv) 9231 trans->block_rsv = block_rsv; 9232 9233 if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) { 9234 level = btrfs_header_level(root->node); 9235 path->nodes[level] = btrfs_lock_root_node(root); 9236 btrfs_set_lock_blocking(path->nodes[level]); 9237 path->slots[level] = 0; 9238 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; 9239 memset(&wc->update_progress, 0, 9240 sizeof(wc->update_progress)); 9241 } else { 9242 btrfs_disk_key_to_cpu(&key, &root_item->drop_progress); 9243 memcpy(&wc->update_progress, &key, 9244 sizeof(wc->update_progress)); 9245 9246 level = root_item->drop_level; 9247 BUG_ON(level == 0); 9248 path->lowest_level = level; 9249 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 9250 path->lowest_level = 0; 9251 if (ret < 0) { 9252 err = ret; 9253 goto out_end_trans; 9254 } 9255 WARN_ON(ret > 0); 9256 9257 /* 9258 * unlock our path, this is safe because only this 9259 * function is allowed to delete this snapshot 9260 */ 9261 btrfs_unlock_up_safe(path, 0); 9262 9263 level = btrfs_header_level(root->node); 9264 while (1) { 9265 btrfs_tree_lock(path->nodes[level]); 9266 btrfs_set_lock_blocking(path->nodes[level]); 9267 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; 9268 9269 ret = btrfs_lookup_extent_info(trans, root, 9270 path->nodes[level]->start, 9271 level, 1, &wc->refs[level], 9272 &wc->flags[level]); 9273 if (ret < 0) { 9274 err = ret; 9275 goto out_end_trans; 9276 } 9277 BUG_ON(wc->refs[level] == 0); 9278 9279 if (level == root_item->drop_level) 9280 break; 9281 9282 btrfs_tree_unlock(path->nodes[level]); 9283 path->locks[level] = 0; 9284 WARN_ON(wc->refs[level] != 1); 9285 level--; 9286 } 9287 } 9288 9289 wc->level = level; 9290 wc->shared_level = -1; 9291 wc->stage = DROP_REFERENCE; 9292 wc->update_ref = update_ref; 9293 wc->keep_locks = 0; 9294 wc->for_reloc = for_reloc; 9295 wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root); 9296 9297 while (1) { 9298 9299 ret = walk_down_tree(trans, root, path, wc); 9300 if (ret < 0) { 9301 err = ret; 9302 break; 9303 } 9304 9305 ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL); 9306 if (ret < 0) { 9307 err = ret; 9308 break; 9309 } 9310 9311 if (ret > 0) { 9312 BUG_ON(wc->stage != DROP_REFERENCE); 9313 break; 9314 } 9315 9316 if (wc->stage == DROP_REFERENCE) { 9317 level = wc->level; 9318 btrfs_node_key(path->nodes[level], 9319 &root_item->drop_progress, 9320 path->slots[level]); 9321 root_item->drop_level = level; 9322 } 9323 9324 BUG_ON(wc->level == 0); 9325 if (btrfs_should_end_transaction(trans, tree_root) 
|| 9326 (!for_reloc && btrfs_need_cleaner_sleep(root))) { 9327 ret = btrfs_update_root(trans, tree_root, 9328 &root->root_key, 9329 root_item); 9330 if (ret) { 9331 btrfs_abort_transaction(trans, ret); 9332 err = ret; 9333 goto out_end_trans; 9334 } 9335 9336 btrfs_end_transaction_throttle(trans, tree_root); 9337 if (!for_reloc && btrfs_need_cleaner_sleep(root)) { 9338 pr_debug("BTRFS: drop snapshot early exit\n"); 9339 err = -EAGAIN; 9340 goto out_free; 9341 } 9342 9343 trans = btrfs_start_transaction(tree_root, 0); 9344 if (IS_ERR(trans)) { 9345 err = PTR_ERR(trans); 9346 goto out_free; 9347 } 9348 if (block_rsv) 9349 trans->block_rsv = block_rsv; 9350 } 9351 } 9352 btrfs_release_path(path); 9353 if (err) 9354 goto out_end_trans; 9355 9356 ret = btrfs_del_root(trans, tree_root, &root->root_key); 9357 if (ret) { 9358 btrfs_abort_transaction(trans, ret); 9359 goto out_end_trans; 9360 } 9361 9362 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) { 9363 ret = btrfs_find_root(tree_root, &root->root_key, path, 9364 NULL, NULL); 9365 if (ret < 0) { 9366 btrfs_abort_transaction(trans, ret); 9367 err = ret; 9368 goto out_end_trans; 9369 } else if (ret > 0) { 9370 /* if we fail to delete the orphan item this time 9371 * around, it'll get picked up the next time. 9372 * 9373 * The most common failure here is just -ENOENT. 9374 */ 9375 btrfs_del_orphan_item(trans, tree_root, 9376 root->root_key.objectid); 9377 } 9378 } 9379 9380 if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state)) { 9381 btrfs_add_dropped_root(trans, root); 9382 } else { 9383 free_extent_buffer(root->node); 9384 free_extent_buffer(root->commit_root); 9385 btrfs_put_fs_root(root); 9386 } 9387 root_dropped = true; 9388 out_end_trans: 9389 btrfs_end_transaction_throttle(trans, tree_root); 9390 out_free: 9391 kfree(wc); 9392 btrfs_free_path(path); 9393 out: 9394 /* 9395 * So if we need to stop dropping the snapshot for whatever reason we 9396 * need to make sure to add it back to the dead root list so that we 9397 * keep trying to do the work later. This also cleans up roots if we 9398 * don't have it in the radix (like when we recover after a power fail 9399 * or unmount) so we don't leak memory. 9400 */ 9401 if (!for_reloc && root_dropped == false) 9402 btrfs_add_dead_root(root); 9403 if (err && err != -EAGAIN) 9404 btrfs_handle_fs_error(root->fs_info, err, NULL); 9405 return err; 9406 } 9407 9408 /* 9409 * drop subtree rooted at tree block 'node'. 
9410 * 9411 * NOTE: this function will unlock and release tree block 'node' 9412 * only used by relocation code 9413 */ 9414 int btrfs_drop_subtree(struct btrfs_trans_handle *trans, 9415 struct btrfs_root *root, 9416 struct extent_buffer *node, 9417 struct extent_buffer *parent) 9418 { 9419 struct btrfs_path *path; 9420 struct walk_control *wc; 9421 int level; 9422 int parent_level; 9423 int ret = 0; 9424 int wret; 9425 9426 BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID); 9427 9428 path = btrfs_alloc_path(); 9429 if (!path) 9430 return -ENOMEM; 9431 9432 wc = kzalloc(sizeof(*wc), GFP_NOFS); 9433 if (!wc) { 9434 btrfs_free_path(path); 9435 return -ENOMEM; 9436 } 9437 9438 btrfs_assert_tree_locked(parent); 9439 parent_level = btrfs_header_level(parent); 9440 extent_buffer_get(parent); 9441 path->nodes[parent_level] = parent; 9442 path->slots[parent_level] = btrfs_header_nritems(parent); 9443 9444 btrfs_assert_tree_locked(node); 9445 level = btrfs_header_level(node); 9446 path->nodes[level] = node; 9447 path->slots[level] = 0; 9448 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; 9449 9450 wc->refs[parent_level] = 1; 9451 wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF; 9452 wc->level = level; 9453 wc->shared_level = -1; 9454 wc->stage = DROP_REFERENCE; 9455 wc->update_ref = 0; 9456 wc->keep_locks = 1; 9457 wc->for_reloc = 1; 9458 wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root); 9459 9460 while (1) { 9461 wret = walk_down_tree(trans, root, path, wc); 9462 if (wret < 0) { 9463 ret = wret; 9464 break; 9465 } 9466 9467 wret = walk_up_tree(trans, root, path, wc, parent_level); 9468 if (wret < 0) 9469 ret = wret; 9470 if (wret != 0) 9471 break; 9472 } 9473 9474 kfree(wc); 9475 btrfs_free_path(path); 9476 return ret; 9477 } 9478 9479 static u64 update_block_group_flags(struct btrfs_root *root, u64 flags) 9480 { 9481 u64 num_devices; 9482 u64 stripped; 9483 9484 /* 9485 * if restripe for this chunk_type is on pick target profile and 9486 * return, otherwise do the usual balance 9487 */ 9488 stripped = get_restripe_target(root->fs_info, flags); 9489 if (stripped) 9490 return extended_to_chunk(stripped); 9491 9492 num_devices = root->fs_info->fs_devices->rw_devices; 9493 9494 stripped = BTRFS_BLOCK_GROUP_RAID0 | 9495 BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6 | 9496 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10; 9497 9498 if (num_devices == 1) { 9499 stripped |= BTRFS_BLOCK_GROUP_DUP; 9500 stripped = flags & ~stripped; 9501 9502 /* turn raid0 into single device chunks */ 9503 if (flags & BTRFS_BLOCK_GROUP_RAID0) 9504 return stripped; 9505 9506 /* turn mirroring into duplication */ 9507 if (flags & (BTRFS_BLOCK_GROUP_RAID1 | 9508 BTRFS_BLOCK_GROUP_RAID10)) 9509 return stripped | BTRFS_BLOCK_GROUP_DUP; 9510 } else { 9511 /* they already had raid on here, just return */ 9512 if (flags & stripped) 9513 return flags; 9514 9515 stripped |= BTRFS_BLOCK_GROUP_DUP; 9516 stripped = flags & ~stripped; 9517 9518 /* switch duplicated blocks with raid1 */ 9519 if (flags & BTRFS_BLOCK_GROUP_DUP) 9520 return stripped | BTRFS_BLOCK_GROUP_RAID1; 9521 9522 /* this is drive concat, leave it alone */ 9523 } 9524 9525 return flags; 9526 } 9527 9528 static int inc_block_group_ro(struct btrfs_block_group_cache *cache, int force) 9529 { 9530 struct btrfs_space_info *sinfo = cache->space_info; 9531 u64 num_bytes; 9532 u64 min_allocable_bytes; 9533 int ret = -ENOSPC; 9534 9535 /* 9536 * We need some metadata space and system metadata space for 9537 * allocating chunks in some corner cases until 
we force to set 9538 * it to be readonly. 9539 */ 9540 if ((sinfo->flags & 9541 (BTRFS_BLOCK_GROUP_SYSTEM | BTRFS_BLOCK_GROUP_METADATA)) && 9542 !force) 9543 min_allocable_bytes = SZ_1M; 9544 else 9545 min_allocable_bytes = 0; 9546 9547 spin_lock(&sinfo->lock); 9548 spin_lock(&cache->lock); 9549 9550 if (cache->ro) { 9551 cache->ro++; 9552 ret = 0; 9553 goto out; 9554 } 9555 9556 num_bytes = cache->key.offset - cache->reserved - cache->pinned - 9557 cache->bytes_super - btrfs_block_group_used(&cache->item); 9558 9559 if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned + 9560 sinfo->bytes_may_use + sinfo->bytes_readonly + num_bytes + 9561 min_allocable_bytes <= sinfo->total_bytes) { 9562 sinfo->bytes_readonly += num_bytes; 9563 cache->ro++; 9564 list_add_tail(&cache->ro_list, &sinfo->ro_bgs); 9565 ret = 0; 9566 } 9567 out: 9568 spin_unlock(&cache->lock); 9569 spin_unlock(&sinfo->lock); 9570 return ret; 9571 } 9572 9573 int btrfs_inc_block_group_ro(struct btrfs_root *root, 9574 struct btrfs_block_group_cache *cache) 9575 9576 { 9577 struct btrfs_trans_handle *trans; 9578 u64 alloc_flags; 9579 int ret; 9580 9581 again: 9582 trans = btrfs_join_transaction(root); 9583 if (IS_ERR(trans)) 9584 return PTR_ERR(trans); 9585 9586 /* 9587 * we're not allowed to set block groups readonly after the dirty 9588 * block groups cache has started writing. If it already started, 9589 * back off and let this transaction commit 9590 */ 9591 mutex_lock(&root->fs_info->ro_block_group_mutex); 9592 if (test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &trans->transaction->flags)) { 9593 u64 transid = trans->transid; 9594 9595 mutex_unlock(&root->fs_info->ro_block_group_mutex); 9596 btrfs_end_transaction(trans, root); 9597 9598 ret = btrfs_wait_for_commit(root, transid); 9599 if (ret) 9600 return ret; 9601 goto again; 9602 } 9603 9604 /* 9605 * if we are changing raid levels, try to allocate a corresponding 9606 * block group with the new raid level. 9607 */ 9608 alloc_flags = update_block_group_flags(root, cache->flags); 9609 if (alloc_flags != cache->flags) { 9610 ret = do_chunk_alloc(trans, root, alloc_flags, 9611 CHUNK_ALLOC_FORCE); 9612 /* 9613 * ENOSPC is allowed here, we may have enough space 9614 * already allocated at the new raid level to 9615 * carry on 9616 */ 9617 if (ret == -ENOSPC) 9618 ret = 0; 9619 if (ret < 0) 9620 goto out; 9621 } 9622 9623 ret = inc_block_group_ro(cache, 0); 9624 if (!ret) 9625 goto out; 9626 alloc_flags = get_alloc_profile(root, cache->space_info->flags); 9627 ret = do_chunk_alloc(trans, root, alloc_flags, 9628 CHUNK_ALLOC_FORCE); 9629 if (ret < 0) 9630 goto out; 9631 ret = inc_block_group_ro(cache, 0); 9632 out: 9633 if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) { 9634 alloc_flags = update_block_group_flags(root, cache->flags); 9635 lock_chunks(root->fs_info->chunk_root); 9636 check_system_chunk(trans, root, alloc_flags); 9637 unlock_chunks(root->fs_info->chunk_root); 9638 } 9639 mutex_unlock(&root->fs_info->ro_block_group_mutex); 9640 9641 btrfs_end_transaction(trans, root); 9642 return ret; 9643 } 9644 9645 int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, 9646 struct btrfs_root *root, u64 type) 9647 { 9648 u64 alloc_flags = get_alloc_profile(root, type); 9649 return do_chunk_alloc(trans, root, alloc_flags, 9650 CHUNK_ALLOC_FORCE); 9651 } 9652 9653 /* 9654 * helper to account the unused space of all the readonly block group in the 9655 * space_info. takes mirrors into account. 
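 * ("Takes mirrors into account" means the unused bytes of DUP, RAID1 and
 * RAID10 block groups are counted twice, via the factor below, since
 * that is how much raw space they occupy.)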
9656 */ 9657 u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo) 9658 { 9659 struct btrfs_block_group_cache *block_group; 9660 u64 free_bytes = 0; 9661 int factor; 9662 9663 /* It's df, we don't care if it's racy */ 9664 if (list_empty(&sinfo->ro_bgs)) 9665 return 0; 9666 9667 spin_lock(&sinfo->lock); 9668 list_for_each_entry(block_group, &sinfo->ro_bgs, ro_list) { 9669 spin_lock(&block_group->lock); 9670 9671 if (!block_group->ro) { 9672 spin_unlock(&block_group->lock); 9673 continue; 9674 } 9675 9676 if (block_group->flags & (BTRFS_BLOCK_GROUP_RAID1 | 9677 BTRFS_BLOCK_GROUP_RAID10 | 9678 BTRFS_BLOCK_GROUP_DUP)) 9679 factor = 2; 9680 else 9681 factor = 1; 9682 9683 free_bytes += (block_group->key.offset - 9684 btrfs_block_group_used(&block_group->item)) * 9685 factor; 9686 9687 spin_unlock(&block_group->lock); 9688 } 9689 spin_unlock(&sinfo->lock); 9690 9691 return free_bytes; 9692 } 9693 9694 void btrfs_dec_block_group_ro(struct btrfs_root *root, 9695 struct btrfs_block_group_cache *cache) 9696 { 9697 struct btrfs_space_info *sinfo = cache->space_info; 9698 u64 num_bytes; 9699 9700 BUG_ON(!cache->ro); 9701 9702 spin_lock(&sinfo->lock); 9703 spin_lock(&cache->lock); 9704 if (!--cache->ro) { 9705 num_bytes = cache->key.offset - cache->reserved - 9706 cache->pinned - cache->bytes_super - 9707 btrfs_block_group_used(&cache->item); 9708 sinfo->bytes_readonly -= num_bytes; 9709 list_del_init(&cache->ro_list); 9710 } 9711 spin_unlock(&cache->lock); 9712 spin_unlock(&sinfo->lock); 9713 } 9714 9715 /* 9716 * checks to see if its even possible to relocate this block group. 9717 * 9718 * @return - -1 if it's not a good idea to relocate this block group, 0 if its 9719 * ok to go ahead and try. 9720 */ 9721 int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr) 9722 { 9723 struct btrfs_block_group_cache *block_group; 9724 struct btrfs_space_info *space_info; 9725 struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; 9726 struct btrfs_device *device; 9727 struct btrfs_trans_handle *trans; 9728 u64 min_free; 9729 u64 dev_min = 1; 9730 u64 dev_nr = 0; 9731 u64 target; 9732 int debug; 9733 int index; 9734 int full = 0; 9735 int ret = 0; 9736 9737 debug = btrfs_test_opt(root->fs_info, ENOSPC_DEBUG); 9738 9739 block_group = btrfs_lookup_block_group(root->fs_info, bytenr); 9740 9741 /* odd, couldn't find the block group, leave it alone */ 9742 if (!block_group) { 9743 if (debug) 9744 btrfs_warn(root->fs_info, 9745 "can't find block group for bytenr %llu", 9746 bytenr); 9747 return -1; 9748 } 9749 9750 min_free = btrfs_block_group_used(&block_group->item); 9751 9752 /* no bytes used, we're good */ 9753 if (!min_free) 9754 goto out; 9755 9756 space_info = block_group->space_info; 9757 spin_lock(&space_info->lock); 9758 9759 full = space_info->full; 9760 9761 /* 9762 * if this is the last block group we have in this space, we can't 9763 * relocate it unless we're able to allocate a new chunk below. 9764 * 9765 * Otherwise, we need to make sure we have room in the space to handle 9766 * all of the extents from this block group. 
If we can, we're good 9767 */ 9768 if ((space_info->total_bytes != block_group->key.offset) && 9769 (space_info->bytes_used + space_info->bytes_reserved + 9770 space_info->bytes_pinned + space_info->bytes_readonly + 9771 min_free < space_info->total_bytes)) { 9772 spin_unlock(&space_info->lock); 9773 goto out; 9774 } 9775 spin_unlock(&space_info->lock); 9776 9777 /* 9778 * ok we don't have enough space, but maybe we have free space on our 9779 * devices to allocate new chunks for relocation, so loop through our 9780 * alloc devices and guess if we have enough space. if this block 9781 * group is going to be restriped, run checks against the target 9782 * profile instead of the current one. 9783 */ 9784 ret = -1; 9785 9786 /* 9787 * index: 9788 * 0: raid10 9789 * 1: raid1 9790 * 2: dup 9791 * 3: raid0 9792 * 4: single 9793 */ 9794 target = get_restripe_target(root->fs_info, block_group->flags); 9795 if (target) { 9796 index = __get_raid_index(extended_to_chunk(target)); 9797 } else { 9798 /* 9799 * this is just a balance, so if we were marked as full 9800 * we know there is no space for a new chunk 9801 */ 9802 if (full) { 9803 if (debug) 9804 btrfs_warn(root->fs_info, 9805 "no space to alloc new chunk for block group %llu", 9806 block_group->key.objectid); 9807 goto out; 9808 } 9809 9810 index = get_block_group_index(block_group); 9811 } 9812 9813 if (index == BTRFS_RAID_RAID10) { 9814 dev_min = 4; 9815 /* Divide by 2 */ 9816 min_free >>= 1; 9817 } else if (index == BTRFS_RAID_RAID1) { 9818 dev_min = 2; 9819 } else if (index == BTRFS_RAID_DUP) { 9820 /* Multiply by 2 */ 9821 min_free <<= 1; 9822 } else if (index == BTRFS_RAID_RAID0) { 9823 dev_min = fs_devices->rw_devices; 9824 min_free = div64_u64(min_free, dev_min); 9825 } 9826 9827 /* We need to do this so that we can look at pending chunks */ 9828 trans = btrfs_join_transaction(root); 9829 if (IS_ERR(trans)) { 9830 ret = PTR_ERR(trans); 9831 goto out; 9832 } 9833 9834 mutex_lock(&root->fs_info->chunk_mutex); 9835 list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) { 9836 u64 dev_offset; 9837 9838 /* 9839 * check to make sure we can actually find a chunk with enough 9840 * space to fit our block group in. 
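 * Note that min_free has already been scaled per device above (halved
 * for RAID10, doubled for DUP, divided across rw_devices for RAID0).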
9841 */ 9842 if (device->total_bytes > device->bytes_used + min_free && 9843 !device->is_tgtdev_for_dev_replace) { 9844 ret = find_free_dev_extent(trans, device, min_free, 9845 &dev_offset, NULL); 9846 if (!ret) 9847 dev_nr++; 9848 9849 if (dev_nr >= dev_min) 9850 break; 9851 9852 ret = -1; 9853 } 9854 } 9855 if (debug && ret == -1) 9856 btrfs_warn(root->fs_info, 9857 "no space to allocate a new chunk for block group %llu", 9858 block_group->key.objectid); 9859 mutex_unlock(&root->fs_info->chunk_mutex); 9860 btrfs_end_transaction(trans, root); 9861 out: 9862 btrfs_put_block_group(block_group); 9863 return ret; 9864 } 9865 9866 static int find_first_block_group(struct btrfs_root *root, 9867 struct btrfs_path *path, struct btrfs_key *key) 9868 { 9869 int ret = 0; 9870 struct btrfs_key found_key; 9871 struct extent_buffer *leaf; 9872 int slot; 9873 9874 ret = btrfs_search_slot(NULL, root, key, path, 0, 0); 9875 if (ret < 0) 9876 goto out; 9877 9878 while (1) { 9879 slot = path->slots[0]; 9880 leaf = path->nodes[0]; 9881 if (slot >= btrfs_header_nritems(leaf)) { 9882 ret = btrfs_next_leaf(root, path); 9883 if (ret == 0) 9884 continue; 9885 if (ret < 0) 9886 goto out; 9887 break; 9888 } 9889 btrfs_item_key_to_cpu(leaf, &found_key, slot); 9890 9891 if (found_key.objectid >= key->objectid && 9892 found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) { 9893 struct extent_map_tree *em_tree; 9894 struct extent_map *em; 9895 9896 em_tree = &root->fs_info->mapping_tree.map_tree; 9897 read_lock(&em_tree->lock); 9898 em = lookup_extent_mapping(em_tree, found_key.objectid, 9899 found_key.offset); 9900 read_unlock(&em_tree->lock); 9901 if (!em) { 9902 btrfs_err(root->fs_info, 9903 "logical %llu len %llu found bg but no related chunk", 9904 found_key.objectid, found_key.offset); 9905 ret = -ENOENT; 9906 } else { 9907 ret = 0; 9908 } 9909 goto out; 9910 } 9911 path->slots[0]++; 9912 } 9913 out: 9914 return ret; 9915 } 9916 9917 void btrfs_put_block_group_cache(struct btrfs_fs_info *info) 9918 { 9919 struct btrfs_block_group_cache *block_group; 9920 u64 last = 0; 9921 9922 while (1) { 9923 struct inode *inode; 9924 9925 block_group = btrfs_lookup_first_block_group(info, last); 9926 while (block_group) { 9927 spin_lock(&block_group->lock); 9928 if (block_group->iref) 9929 break; 9930 spin_unlock(&block_group->lock); 9931 block_group = next_block_group(info->tree_root, 9932 block_group); 9933 } 9934 if (!block_group) { 9935 if (last == 0) 9936 break; 9937 last = 0; 9938 continue; 9939 } 9940 9941 inode = block_group->inode; 9942 block_group->iref = 0; 9943 block_group->inode = NULL; 9944 spin_unlock(&block_group->lock); 9945 iput(inode); 9946 last = block_group->key.objectid + block_group->key.offset; 9947 btrfs_put_block_group(block_group); 9948 } 9949 } 9950 9951 int btrfs_free_block_groups(struct btrfs_fs_info *info) 9952 { 9953 struct btrfs_block_group_cache *block_group; 9954 struct btrfs_space_info *space_info; 9955 struct btrfs_caching_control *caching_ctl; 9956 struct rb_node *n; 9957 9958 down_write(&info->commit_root_sem); 9959 while (!list_empty(&info->caching_block_groups)) { 9960 caching_ctl = list_entry(info->caching_block_groups.next, 9961 struct btrfs_caching_control, list); 9962 list_del(&caching_ctl->list); 9963 put_caching_control(caching_ctl); 9964 } 9965 up_write(&info->commit_root_sem); 9966 9967 spin_lock(&info->unused_bgs_lock); 9968 while (!list_empty(&info->unused_bgs)) { 9969 block_group = list_first_entry(&info->unused_bgs, 9970 struct btrfs_block_group_cache, 9971 bg_list); 9972 
list_del_init(&block_group->bg_list); 9973 btrfs_put_block_group(block_group); 9974 } 9975 spin_unlock(&info->unused_bgs_lock); 9976 9977 spin_lock(&info->block_group_cache_lock); 9978 while ((n = rb_last(&info->block_group_cache_tree)) != NULL) { 9979 block_group = rb_entry(n, struct btrfs_block_group_cache, 9980 cache_node); 9981 rb_erase(&block_group->cache_node, 9982 &info->block_group_cache_tree); 9983 RB_CLEAR_NODE(&block_group->cache_node); 9984 spin_unlock(&info->block_group_cache_lock); 9985 9986 down_write(&block_group->space_info->groups_sem); 9987 list_del(&block_group->list); 9988 up_write(&block_group->space_info->groups_sem); 9989 9990 if (block_group->cached == BTRFS_CACHE_STARTED) 9991 wait_block_group_cache_done(block_group); 9992 9993 /* 9994 * We haven't cached this block group, which means we could 9995 * possibly have excluded extents on this block group. 9996 */ 9997 if (block_group->cached == BTRFS_CACHE_NO || 9998 block_group->cached == BTRFS_CACHE_ERROR) 9999 free_excluded_extents(info->extent_root, block_group); 10000 10001 btrfs_remove_free_space_cache(block_group); 10002 btrfs_put_block_group(block_group); 10003 10004 spin_lock(&info->block_group_cache_lock); 10005 } 10006 spin_unlock(&info->block_group_cache_lock); 10007 10008 /* now that all the block groups are freed, go through and 10009 * free all the space_info structs. This is only called during 10010 * the final stages of unmount, and so we know nobody is 10011 * using them. We call synchronize_rcu() once before we start, 10012 * just to be on the safe side. 10013 */ 10014 synchronize_rcu(); 10015 10016 release_global_block_rsv(info); 10017 10018 while (!list_empty(&info->space_info)) { 10019 int i; 10020 10021 space_info = list_entry(info->space_info.next, 10022 struct btrfs_space_info, 10023 list); 10024 10025 /* 10026 * Do not hide this behind enospc_debug, this is actually 10027 * important and indicates a real bug if this happens. 
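 * Non-zero pinned/reserved/may_use counters this late in unmount mean
 * some space reservation was never released.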
10028 */ 10029 if (WARN_ON(space_info->bytes_pinned > 0 || 10030 space_info->bytes_reserved > 0 || 10031 space_info->bytes_may_use > 0)) 10032 dump_space_info(space_info, 0, 0); 10033 list_del(&space_info->list); 10034 for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) { 10035 struct kobject *kobj; 10036 kobj = space_info->block_group_kobjs[i]; 10037 space_info->block_group_kobjs[i] = NULL; 10038 if (kobj) { 10039 kobject_del(kobj); 10040 kobject_put(kobj); 10041 } 10042 } 10043 kobject_del(&space_info->kobj); 10044 kobject_put(&space_info->kobj); 10045 } 10046 return 0; 10047 } 10048 10049 static void __link_block_group(struct btrfs_space_info *space_info, 10050 struct btrfs_block_group_cache *cache) 10051 { 10052 int index = get_block_group_index(cache); 10053 bool first = false; 10054 10055 down_write(&space_info->groups_sem); 10056 if (list_empty(&space_info->block_groups[index])) 10057 first = true; 10058 list_add_tail(&cache->list, &space_info->block_groups[index]); 10059 up_write(&space_info->groups_sem); 10060 10061 if (first) { 10062 struct raid_kobject *rkobj; 10063 int ret; 10064 10065 rkobj = kzalloc(sizeof(*rkobj), GFP_NOFS); 10066 if (!rkobj) 10067 goto out_err; 10068 rkobj->raid_type = index; 10069 kobject_init(&rkobj->kobj, &btrfs_raid_ktype); 10070 ret = kobject_add(&rkobj->kobj, &space_info->kobj, 10071 "%s", get_raid_name(index)); 10072 if (ret) { 10073 kobject_put(&rkobj->kobj); 10074 goto out_err; 10075 } 10076 space_info->block_group_kobjs[index] = &rkobj->kobj; 10077 } 10078 10079 return; 10080 out_err: 10081 pr_warn("BTRFS: failed to add kobject for block cache. ignoring.\n"); 10082 } 10083 10084 static struct btrfs_block_group_cache * 10085 btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size) 10086 { 10087 struct btrfs_block_group_cache *cache; 10088 10089 cache = kzalloc(sizeof(*cache), GFP_NOFS); 10090 if (!cache) 10091 return NULL; 10092 10093 cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl), 10094 GFP_NOFS); 10095 if (!cache->free_space_ctl) { 10096 kfree(cache); 10097 return NULL; 10098 } 10099 10100 cache->key.objectid = start; 10101 cache->key.offset = size; 10102 cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; 10103 10104 cache->sectorsize = root->sectorsize; 10105 cache->fs_info = root->fs_info; 10106 cache->full_stripe_len = btrfs_full_stripe_len(root, 10107 &root->fs_info->mapping_tree, 10108 start); 10109 set_free_space_tree_thresholds(cache); 10110 10111 atomic_set(&cache->count, 1); 10112 spin_lock_init(&cache->lock); 10113 init_rwsem(&cache->data_rwsem); 10114 INIT_LIST_HEAD(&cache->list); 10115 INIT_LIST_HEAD(&cache->cluster_list); 10116 INIT_LIST_HEAD(&cache->bg_list); 10117 INIT_LIST_HEAD(&cache->ro_list); 10118 INIT_LIST_HEAD(&cache->dirty_list); 10119 INIT_LIST_HEAD(&cache->io_list); 10120 btrfs_init_free_space_ctl(cache); 10121 atomic_set(&cache->trimming, 0); 10122 mutex_init(&cache->free_space_lock); 10123 10124 return cache; 10125 } 10126 10127 int btrfs_read_block_groups(struct btrfs_root *root) 10128 { 10129 struct btrfs_path *path; 10130 int ret; 10131 struct btrfs_block_group_cache *cache; 10132 struct btrfs_fs_info *info = root->fs_info; 10133 struct btrfs_space_info *space_info; 10134 struct btrfs_key key; 10135 struct btrfs_key found_key; 10136 struct extent_buffer *leaf; 10137 int need_clear = 0; 10138 u64 cache_gen; 10139 10140 root = info->extent_root; 10141 key.objectid = 0; 10142 key.offset = 0; 10143 key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; 10144 path = btrfs_alloc_path(); 10145 if (!path) 10146 return 
-ENOMEM; 10147 path->reada = READA_FORWARD; 10148 10149 cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy); 10150 if (btrfs_test_opt(root->fs_info, SPACE_CACHE) && 10151 btrfs_super_generation(root->fs_info->super_copy) != cache_gen) 10152 need_clear = 1; 10153 if (btrfs_test_opt(root->fs_info, CLEAR_CACHE)) 10154 need_clear = 1; 10155 10156 while (1) { 10157 ret = find_first_block_group(root, path, &key); 10158 if (ret > 0) 10159 break; 10160 if (ret != 0) 10161 goto error; 10162 10163 leaf = path->nodes[0]; 10164 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 10165 10166 cache = btrfs_create_block_group_cache(root, found_key.objectid, 10167 found_key.offset); 10168 if (!cache) { 10169 ret = -ENOMEM; 10170 goto error; 10171 } 10172 10173 if (need_clear) { 10174 /* 10175 * When we mount with an old space cache, we need to 10176 * set BTRFS_DC_CLEAR and set the dirty flag. 10177 * 10178 * a) Setting 'BTRFS_DC_CLEAR' makes sure that we 10179 * truncate the old free space cache inode and 10180 * set up a new one. 10181 * b) Setting the 'dirty flag' makes sure that we flush 10182 * the new space cache info onto disk. 10183 */ 10184 if (btrfs_test_opt(root->fs_info, SPACE_CACHE)) 10185 cache->disk_cache_state = BTRFS_DC_CLEAR; 10186 } 10187 10188 read_extent_buffer(leaf, &cache->item, 10189 btrfs_item_ptr_offset(leaf, path->slots[0]), 10190 sizeof(cache->item)); 10191 cache->flags = btrfs_block_group_flags(&cache->item); 10192 10193 key.objectid = found_key.objectid + found_key.offset; 10194 btrfs_release_path(path); 10195 10196 /* 10197 * We need to exclude the super stripes now so that the space 10198 * info has super bytes accounted for, otherwise we'll think 10199 * we have more space than we actually do. 10200 */ 10201 ret = exclude_super_stripes(root, cache); 10202 if (ret) { 10203 /* 10204 * We may have excluded something, so call this just in 10205 * case. 10206 */ 10207 free_excluded_extents(root, cache); 10208 btrfs_put_block_group(cache); 10209 goto error; 10210 } 10211 10212 /* 10213 * check for two cases: either we are full, and therefore 10214 * don't need to bother with the caching work since we won't 10215 * find any space, or we are empty, and we can just add all 10216 * the space in and be done with it. This saves us a lot of 10217 * time, particularly in the full case. 
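 * (found_key.offset is the block group's length, so used == offset
 * means the block group is completely full.)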
10218 */ 10219 if (found_key.offset == btrfs_block_group_used(&cache->item)) { 10220 cache->last_byte_to_unpin = (u64)-1; 10221 cache->cached = BTRFS_CACHE_FINISHED; 10222 free_excluded_extents(root, cache); 10223 } else if (btrfs_block_group_used(&cache->item) == 0) { 10224 cache->last_byte_to_unpin = (u64)-1; 10225 cache->cached = BTRFS_CACHE_FINISHED; 10226 add_new_free_space(cache, root->fs_info, 10227 found_key.objectid, 10228 found_key.objectid + 10229 found_key.offset); 10230 free_excluded_extents(root, cache); 10231 } 10232 10233 ret = btrfs_add_block_group_cache(root->fs_info, cache); 10234 if (ret) { 10235 btrfs_remove_free_space_cache(cache); 10236 btrfs_put_block_group(cache); 10237 goto error; 10238 } 10239 10240 trace_btrfs_add_block_group(root->fs_info, cache, 0); 10241 ret = update_space_info(info, cache->flags, found_key.offset, 10242 btrfs_block_group_used(&cache->item), 10243 cache->bytes_super, &space_info); 10244 if (ret) { 10245 btrfs_remove_free_space_cache(cache); 10246 spin_lock(&info->block_group_cache_lock); 10247 rb_erase(&cache->cache_node, 10248 &info->block_group_cache_tree); 10249 RB_CLEAR_NODE(&cache->cache_node); 10250 spin_unlock(&info->block_group_cache_lock); 10251 btrfs_put_block_group(cache); 10252 goto error; 10253 } 10254 10255 cache->space_info = space_info; 10256 10257 __link_block_group(space_info, cache); 10258 10259 set_avail_alloc_bits(root->fs_info, cache->flags); 10260 if (btrfs_chunk_readonly(root, cache->key.objectid)) { 10261 inc_block_group_ro(cache, 1); 10262 } else if (btrfs_block_group_used(&cache->item) == 0) { 10263 spin_lock(&info->unused_bgs_lock); 10264 /* Should always be true but just in case. */ 10265 if (list_empty(&cache->bg_list)) { 10266 btrfs_get_block_group(cache); 10267 list_add_tail(&cache->bg_list, 10268 &info->unused_bgs); 10269 } 10270 spin_unlock(&info->unused_bgs_lock); 10271 } 10272 } 10273 10274 list_for_each_entry_rcu(space_info, &root->fs_info->space_info, list) { 10275 if (!(get_alloc_profile(root, space_info->flags) & 10276 (BTRFS_BLOCK_GROUP_RAID10 | 10277 BTRFS_BLOCK_GROUP_RAID1 | 10278 BTRFS_BLOCK_GROUP_RAID5 | 10279 BTRFS_BLOCK_GROUP_RAID6 | 10280 BTRFS_BLOCK_GROUP_DUP))) 10281 continue; 10282 /* 10283 * avoid allocating from un-mirrored block group if there are 10284 * mirrored block groups. 
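 * This is done by marking the RAID0 and single block groups read-only
 * below, so new allocations go to the mirrored profiles instead.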
10285 */ 10286 list_for_each_entry(cache, 10287 &space_info->block_groups[BTRFS_RAID_RAID0], 10288 list) 10289 inc_block_group_ro(cache, 1); 10290 list_for_each_entry(cache, 10291 &space_info->block_groups[BTRFS_RAID_SINGLE], 10292 list) 10293 inc_block_group_ro(cache, 1); 10294 } 10295 10296 init_global_block_rsv(info); 10297 ret = 0; 10298 error: 10299 btrfs_free_path(path); 10300 return ret; 10301 } 10302 10303 void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans, 10304 struct btrfs_root *root) 10305 { 10306 struct btrfs_block_group_cache *block_group, *tmp; 10307 struct btrfs_root *extent_root = root->fs_info->extent_root; 10308 struct btrfs_block_group_item item; 10309 struct btrfs_key key; 10310 int ret = 0; 10311 bool can_flush_pending_bgs = trans->can_flush_pending_bgs; 10312 10313 trans->can_flush_pending_bgs = false; 10314 list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, bg_list) { 10315 if (ret) 10316 goto next; 10317 10318 spin_lock(&block_group->lock); 10319 memcpy(&item, &block_group->item, sizeof(item)); 10320 memcpy(&key, &block_group->key, sizeof(key)); 10321 spin_unlock(&block_group->lock); 10322 10323 ret = btrfs_insert_item(trans, extent_root, &key, &item, 10324 sizeof(item)); 10325 if (ret) 10326 btrfs_abort_transaction(trans, ret); 10327 ret = btrfs_finish_chunk_alloc(trans, extent_root, 10328 key.objectid, key.offset); 10329 if (ret) 10330 btrfs_abort_transaction(trans, ret); 10331 add_block_group_free_space(trans, root->fs_info, block_group); 10332 /* already aborted the transaction if it failed. */ 10333 next: 10334 list_del_init(&block_group->bg_list); 10335 } 10336 trans->can_flush_pending_bgs = can_flush_pending_bgs; 10337 } 10338 10339 int btrfs_make_block_group(struct btrfs_trans_handle *trans, 10340 struct btrfs_root *root, u64 bytes_used, 10341 u64 type, u64 chunk_objectid, u64 chunk_offset, 10342 u64 size) 10343 { 10344 int ret; 10345 struct btrfs_root *extent_root; 10346 struct btrfs_block_group_cache *cache; 10347 extent_root = root->fs_info->extent_root; 10348 10349 btrfs_set_log_full_commit(root->fs_info, trans); 10350 10351 cache = btrfs_create_block_group_cache(root, chunk_offset, size); 10352 if (!cache) 10353 return -ENOMEM; 10354 10355 btrfs_set_block_group_used(&cache->item, bytes_used); 10356 btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid); 10357 btrfs_set_block_group_flags(&cache->item, type); 10358 10359 cache->flags = type; 10360 cache->last_byte_to_unpin = (u64)-1; 10361 cache->cached = BTRFS_CACHE_FINISHED; 10362 cache->needs_free_space = 1; 10363 ret = exclude_super_stripes(root, cache); 10364 if (ret) { 10365 /* 10366 * We may have excluded something, so call this just in 10367 * case. 10368 */ 10369 free_excluded_extents(root, cache); 10370 btrfs_put_block_group(cache); 10371 return ret; 10372 } 10373 10374 add_new_free_space(cache, root->fs_info, chunk_offset, 10375 chunk_offset + size); 10376 10377 free_excluded_extents(root, cache); 10378 10379 #ifdef CONFIG_BTRFS_DEBUG 10380 if (btrfs_should_fragment_free_space(root, cache)) { 10381 u64 new_bytes_used = size - bytes_used; 10382 10383 bytes_used += new_bytes_used >> 1; 10384 fragment_free_space(root, cache); 10385 } 10386 #endif 10387 /* 10388 * Call to ensure the corresponding space_info object is created and 10389 * assigned to our block group, but don't update its counters just yet. 10390 * We want our bg to be added to the rbtree with its ->space_info set. 
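 * (The second update_space_info() call below adds the real size and
 * used-bytes counts once the block group is in the rbtree.)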
10391 */ 10392 ret = update_space_info(root->fs_info, cache->flags, 0, 0, 0, 10393 &cache->space_info); 10394 if (ret) { 10395 btrfs_remove_free_space_cache(cache); 10396 btrfs_put_block_group(cache); 10397 return ret; 10398 } 10399 10400 ret = btrfs_add_block_group_cache(root->fs_info, cache); 10401 if (ret) { 10402 btrfs_remove_free_space_cache(cache); 10403 btrfs_put_block_group(cache); 10404 return ret; 10405 } 10406 10407 /* 10408 * Now that our block group has its ->space_info set and is inserted in 10409 * the rbtree, update the space info's counters. 10410 */ 10411 trace_btrfs_add_block_group(root->fs_info, cache, 1); 10412 ret = update_space_info(root->fs_info, cache->flags, size, bytes_used, 10413 cache->bytes_super, &cache->space_info); 10414 if (ret) { 10415 btrfs_remove_free_space_cache(cache); 10416 spin_lock(&root->fs_info->block_group_cache_lock); 10417 rb_erase(&cache->cache_node, 10418 &root->fs_info->block_group_cache_tree); 10419 RB_CLEAR_NODE(&cache->cache_node); 10420 spin_unlock(&root->fs_info->block_group_cache_lock); 10421 btrfs_put_block_group(cache); 10422 return ret; 10423 } 10424 update_global_block_rsv(root->fs_info); 10425 10426 __link_block_group(cache->space_info, cache); 10427 10428 list_add_tail(&cache->bg_list, &trans->new_bgs); 10429 10430 set_avail_alloc_bits(extent_root->fs_info, type); 10431 return 0; 10432 } 10433 10434 static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) 10435 { 10436 u64 extra_flags = chunk_to_extended(flags) & 10437 BTRFS_EXTENDED_PROFILE_MASK; 10438 10439 write_seqlock(&fs_info->profiles_lock); 10440 if (flags & BTRFS_BLOCK_GROUP_DATA) 10441 fs_info->avail_data_alloc_bits &= ~extra_flags; 10442 if (flags & BTRFS_BLOCK_GROUP_METADATA) 10443 fs_info->avail_metadata_alloc_bits &= ~extra_flags; 10444 if (flags & BTRFS_BLOCK_GROUP_SYSTEM) 10445 fs_info->avail_system_alloc_bits &= ~extra_flags; 10446 write_sequnlock(&fs_info->profiles_lock); 10447 } 10448 10449 int btrfs_remove_block_group(struct btrfs_trans_handle *trans, 10450 struct btrfs_root *root, u64 group_start, 10451 struct extent_map *em) 10452 { 10453 struct btrfs_path *path; 10454 struct btrfs_block_group_cache *block_group; 10455 struct btrfs_free_cluster *cluster; 10456 struct btrfs_root *tree_root = root->fs_info->tree_root; 10457 struct btrfs_key key; 10458 struct inode *inode; 10459 struct kobject *kobj = NULL; 10460 int ret; 10461 int index; 10462 int factor; 10463 struct btrfs_caching_control *caching_ctl = NULL; 10464 bool remove_em; 10465 10466 root = root->fs_info->extent_root; 10467 10468 block_group = btrfs_lookup_block_group(root->fs_info, group_start); 10469 BUG_ON(!block_group); 10470 BUG_ON(!block_group->ro); 10471 10472 /* 10473 * Free the reserved super bytes from this block group before 10474 * remove it. 
10475 */ 10476 free_excluded_extents(root, block_group); 10477 10478 memcpy(&key, &block_group->key, sizeof(key)); 10479 index = get_block_group_index(block_group); 10480 if (block_group->flags & (BTRFS_BLOCK_GROUP_DUP | 10481 BTRFS_BLOCK_GROUP_RAID1 | 10482 BTRFS_BLOCK_GROUP_RAID10)) 10483 factor = 2; 10484 else 10485 factor = 1; 10486 10487 /* make sure this block group isn't part of an allocation cluster */ 10488 cluster = &root->fs_info->data_alloc_cluster; 10489 spin_lock(&cluster->refill_lock); 10490 btrfs_return_cluster_to_free_space(block_group, cluster); 10491 spin_unlock(&cluster->refill_lock); 10492 10493 /* 10494 * make sure this block group isn't part of a metadata 10495 * allocation cluster 10496 */ 10497 cluster = &root->fs_info->meta_alloc_cluster; 10498 spin_lock(&cluster->refill_lock); 10499 btrfs_return_cluster_to_free_space(block_group, cluster); 10500 spin_unlock(&cluster->refill_lock); 10501 10502 path = btrfs_alloc_path(); 10503 if (!path) { 10504 ret = -ENOMEM; 10505 goto out; 10506 } 10507 10508 /* 10509 * get the inode first so any iput calls done for the io_list 10510 * aren't the final iput (no unlinks allowed now) 10511 */ 10512 inode = lookup_free_space_inode(tree_root, block_group, path); 10513 10514 mutex_lock(&trans->transaction->cache_write_mutex); 10515 /* 10516 * make sure our free space cache IO is done before removing the 10517 * free space inode 10518 */ 10519 spin_lock(&trans->transaction->dirty_bgs_lock); 10520 if (!list_empty(&block_group->io_list)) { 10521 list_del_init(&block_group->io_list); 10522 10523 WARN_ON(!IS_ERR(inode) && inode != block_group->io_ctl.inode); 10524 10525 spin_unlock(&trans->transaction->dirty_bgs_lock); 10526 btrfs_wait_cache_io(root, trans, block_group, 10527 &block_group->io_ctl, path, 10528 block_group->key.objectid); 10529 btrfs_put_block_group(block_group); 10530 spin_lock(&trans->transaction->dirty_bgs_lock); 10531 } 10532 10533 if (!list_empty(&block_group->dirty_list)) { 10534 list_del_init(&block_group->dirty_list); 10535 btrfs_put_block_group(block_group); 10536 } 10537 spin_unlock(&trans->transaction->dirty_bgs_lock); 10538 mutex_unlock(&trans->transaction->cache_write_mutex); 10539 10540 if (!IS_ERR(inode)) { 10541 ret = btrfs_orphan_add(trans, inode); 10542 if (ret) { 10543 btrfs_add_delayed_iput(inode); 10544 goto out; 10545 } 10546 clear_nlink(inode); 10547 /* One for the block group's ref */ 10548 spin_lock(&block_group->lock); 10549 if (block_group->iref) { 10550 block_group->iref = 0; 10551 block_group->inode = NULL; 10552 spin_unlock(&block_group->lock); 10553 iput(inode); 10554 } else { 10555 spin_unlock(&block_group->lock); 10556 } 10557 /* One for our lookup ref */ 10558 btrfs_add_delayed_iput(inode); 10559 } 10560 10561 key.objectid = BTRFS_FREE_SPACE_OBJECTID; 10562 key.offset = block_group->key.objectid; 10563 key.type = 0; 10564 10565 ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1); 10566 if (ret < 0) 10567 goto out; 10568 if (ret > 0) 10569 btrfs_release_path(path); 10570 if (ret == 0) { 10571 ret = btrfs_del_item(trans, tree_root, path); 10572 if (ret) 10573 goto out; 10574 btrfs_release_path(path); 10575 } 10576 10577 spin_lock(&root->fs_info->block_group_cache_lock); 10578 rb_erase(&block_group->cache_node, 10579 &root->fs_info->block_group_cache_tree); 10580 RB_CLEAR_NODE(&block_group->cache_node); 10581 10582 if (root->fs_info->first_logical_byte == block_group->key.objectid) 10583 root->fs_info->first_logical_byte = (u64)-1; 10584 
spin_unlock(&root->fs_info->block_group_cache_lock); 10585 10586 down_write(&block_group->space_info->groups_sem); 10587 /* 10588 * we must use list_del_init so people can check to see if they 10589 * are still on the list after taking the semaphore 10590 */ 10591 list_del_init(&block_group->list); 10592 if (list_empty(&block_group->space_info->block_groups[index])) { 10593 kobj = block_group->space_info->block_group_kobjs[index]; 10594 block_group->space_info->block_group_kobjs[index] = NULL; 10595 clear_avail_alloc_bits(root->fs_info, block_group->flags); 10596 } 10597 up_write(&block_group->space_info->groups_sem); 10598 if (kobj) { 10599 kobject_del(kobj); 10600 kobject_put(kobj); 10601 } 10602 10603 if (block_group->has_caching_ctl) 10604 caching_ctl = get_caching_control(block_group); 10605 if (block_group->cached == BTRFS_CACHE_STARTED) 10606 wait_block_group_cache_done(block_group); 10607 if (block_group->has_caching_ctl) { 10608 down_write(&root->fs_info->commit_root_sem); 10609 if (!caching_ctl) { 10610 struct btrfs_caching_control *ctl; 10611 10612 list_for_each_entry(ctl, 10613 &root->fs_info->caching_block_groups, list) 10614 if (ctl->block_group == block_group) { 10615 caching_ctl = ctl; 10616 atomic_inc(&caching_ctl->count); 10617 break; 10618 } 10619 } 10620 if (caching_ctl) 10621 list_del_init(&caching_ctl->list); 10622 up_write(&root->fs_info->commit_root_sem); 10623 if (caching_ctl) { 10624 /* Once for the caching bgs list and once for us. */ 10625 put_caching_control(caching_ctl); 10626 put_caching_control(caching_ctl); 10627 } 10628 } 10629 10630 spin_lock(&trans->transaction->dirty_bgs_lock); 10631 if (!list_empty(&block_group->dirty_list)) { 10632 WARN_ON(1); 10633 } 10634 if (!list_empty(&block_group->io_list)) { 10635 WARN_ON(1); 10636 } 10637 spin_unlock(&trans->transaction->dirty_bgs_lock); 10638 btrfs_remove_free_space_cache(block_group); 10639 10640 spin_lock(&block_group->space_info->lock); 10641 list_del_init(&block_group->ro_list); 10642 10643 if (btrfs_test_opt(root->fs_info, ENOSPC_DEBUG)) { 10644 WARN_ON(block_group->space_info->total_bytes 10645 < block_group->key.offset); 10646 WARN_ON(block_group->space_info->bytes_readonly 10647 < block_group->key.offset); 10648 WARN_ON(block_group->space_info->disk_total 10649 < block_group->key.offset * factor); 10650 } 10651 block_group->space_info->total_bytes -= block_group->key.offset; 10652 block_group->space_info->bytes_readonly -= block_group->key.offset; 10653 block_group->space_info->disk_total -= block_group->key.offset * factor; 10654 10655 spin_unlock(&block_group->space_info->lock); 10656 10657 memcpy(&key, &block_group->key, sizeof(key)); 10658 10659 lock_chunks(root); 10660 if (!list_empty(&em->list)) { 10661 /* We're in the transaction->pending_chunks list. */ 10662 free_extent_map(em); 10663 } 10664 spin_lock(&block_group->lock); 10665 block_group->removed = 1; 10666 /* 10667 * At this point trimming can't start on this block group, because we 10668 * removed the block group from the tree fs_info->block_group_cache_tree 10669 * so no one can find it anymore, and even if someone already got this 10670 * block group before we removed it from the rbtree, they have already 10671 * incremented block_group->trimming - if they didn't, they won't find 10672 * any free space entries because we already removed them all when we 10673 * called btrfs_remove_free_space_cache(). 
10674 * 10675 * And we must not remove the extent map from the fs_info->mapping_tree 10676 * to prevent the same logical address range and physical device space 10677 * ranges from being reused for a new block group. This is because our 10678 * fs trim operation (btrfs_trim_fs() / btrfs_ioctl_fitrim()) is 10679 * completely transactionless, so while it is trimming a range the 10680 * currently running transaction might finish and a new one start, 10681 * allowing for new block groups to be created that can reuse the same 10682 * physical device locations unless we take this special care. 10683 * 10684 * There may also be an implicit trim operation if the file system 10685 * is mounted with -odiscard. The same protections must remain 10686 * in place until the extents have been discarded completely when 10687 * the transaction commit has completed. 10688 */ 10689 remove_em = (atomic_read(&block_group->trimming) == 0); 10690 /* 10691 * Make sure a trimmer task always sees the em in the pinned_chunks list 10692 * if it sees block_group->removed == 1 (needs to lock block_group->lock 10693 * before checking block_group->removed). 10694 */ 10695 if (!remove_em) { 10696 /* 10697 * Our em might be in trans->transaction->pending_chunks which 10698 * is protected by fs_info->chunk_mutex ([lock|unlock]_chunks), 10699 * and so is the fs_info->pinned_chunks list. 10700 * 10701 * So at this point we must be holding the chunk_mutex to avoid 10702 * any races with chunk allocation (more specifically at 10703 * volumes.c:contains_pending_extent()), to ensure it always 10704 * sees the em, either in the pending_chunks list or in the 10705 * pinned_chunks list. 10706 */ 10707 list_move_tail(&em->list, &root->fs_info->pinned_chunks); 10708 } 10709 spin_unlock(&block_group->lock); 10710 10711 if (remove_em) { 10712 struct extent_map_tree *em_tree; 10713 10714 em_tree = &root->fs_info->mapping_tree.map_tree; 10715 write_lock(&em_tree->lock); 10716 /* 10717 * The em might be in the pending_chunks list, so make sure the 10718 * chunk mutex is locked, since remove_extent_mapping() will 10719 * delete us from that list. 
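 * (chunk_mutex is still held at this point; it was taken via
 * lock_chunks() above and is only dropped by unlock_chunks() below.)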
10720 */ 10721 remove_extent_mapping(em_tree, em); 10722 write_unlock(&em_tree->lock); 10723 /* once for the tree */ 10724 free_extent_map(em); 10725 } 10726 10727 unlock_chunks(root); 10728 10729 ret = remove_block_group_free_space(trans, root->fs_info, block_group); 10730 if (ret) 10731 goto out; 10732 10733 btrfs_put_block_group(block_group); 10734 btrfs_put_block_group(block_group); 10735 10736 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 10737 if (ret > 0) 10738 ret = -EIO; 10739 if (ret < 0) 10740 goto out; 10741 10742 ret = btrfs_del_item(trans, root, path); 10743 out: 10744 btrfs_free_path(path); 10745 return ret; 10746 } 10747 10748 struct btrfs_trans_handle * 10749 btrfs_start_trans_remove_block_group(struct btrfs_fs_info *fs_info, 10750 const u64 chunk_offset) 10751 { 10752 struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree; 10753 struct extent_map *em; 10754 struct map_lookup *map; 10755 unsigned int num_items; 10756 10757 read_lock(&em_tree->lock); 10758 em = lookup_extent_mapping(em_tree, chunk_offset, 1); 10759 read_unlock(&em_tree->lock); 10760 ASSERT(em && em->start == chunk_offset); 10761 10762 /* 10763 * We need to reserve 3 + N units from the metadata space info in order 10764 * to remove a block group (done at btrfs_remove_chunk() and at 10765 * btrfs_remove_block_group()), which are used for: 10766 * 10767 * 1 unit for adding the free space inode's orphan (located in the tree 10768 * of tree roots). 10769 * 1 unit for deleting the block group item (located in the extent 10770 * tree). 10771 * 1 unit for deleting the free space item (located in tree of tree 10772 * roots). 10773 * N units for deleting N device extent items corresponding to each 10774 * stripe (located in the device tree). 10775 * 10776 * In order to remove a block group we also need to reserve units in the 10777 * system space info in order to update the chunk tree (update one or 10778 * more device items and remove one chunk item), but this is done at 10779 * btrfs_remove_chunk() through a call to check_system_chunk(). 10780 */ 10781 map = em->map_lookup; 10782 num_items = 3 + map->num_stripes; 10783 free_extent_map(em); 10784 10785 return btrfs_start_transaction_fallback_global_rsv(fs_info->extent_root, 10786 num_items, 1); 10787 } 10788 10789 /* 10790 * Process the unused_bgs list and remove any that don't have any allocated 10791 * space inside of them. 
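 * Empty block groups are put on fs_info->unused_bgs elsewhere, for
 * example by btrfs_read_block_groups() at mount time.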
10792 */ 10793 void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) 10794 { 10795 struct btrfs_block_group_cache *block_group; 10796 struct btrfs_space_info *space_info; 10797 struct btrfs_root *root = fs_info->extent_root; 10798 struct btrfs_trans_handle *trans; 10799 int ret = 0; 10800 10801 if (!fs_info->open) 10802 return; 10803 10804 spin_lock(&fs_info->unused_bgs_lock); 10805 while (!list_empty(&fs_info->unused_bgs)) { 10806 u64 start, end; 10807 int trimming; 10808 10809 block_group = list_first_entry(&fs_info->unused_bgs, 10810 struct btrfs_block_group_cache, 10811 bg_list); 10812 list_del_init(&block_group->bg_list); 10813 10814 space_info = block_group->space_info; 10815 10816 if (ret || btrfs_mixed_space_info(space_info)) { 10817 btrfs_put_block_group(block_group); 10818 continue; 10819 } 10820 spin_unlock(&fs_info->unused_bgs_lock); 10821 10822 mutex_lock(&fs_info->delete_unused_bgs_mutex); 10823 10824 /* Don't want to race with allocators so take the groups_sem */ 10825 down_write(&space_info->groups_sem); 10826 spin_lock(&block_group->lock); 10827 if (block_group->reserved || 10828 btrfs_block_group_used(&block_group->item) || 10829 block_group->ro || 10830 list_is_singular(&block_group->list)) { 10831 /* 10832 * We want to bail if we made new allocations or have 10833 * outstanding allocations in this block group. We do 10834 * the ro check in case balance is currently acting on 10835 * this block group. 10836 */ 10837 spin_unlock(&block_group->lock); 10838 up_write(&space_info->groups_sem); 10839 goto next; 10840 } 10841 spin_unlock(&block_group->lock); 10842 10843 /* We don't want to force the issue, only flip if it's ok. */ 10844 ret = inc_block_group_ro(block_group, 0); 10845 up_write(&space_info->groups_sem); 10846 if (ret < 0) { 10847 ret = 0; 10848 goto next; 10849 } 10850 10851 /* 10852 * Want to do this before we do anything else so we can recover 10853 * properly if we fail to join the transaction. 10854 */ 10855 trans = btrfs_start_trans_remove_block_group(fs_info, 10856 block_group->key.objectid); 10857 if (IS_ERR(trans)) { 10858 btrfs_dec_block_group_ro(root, block_group); 10859 ret = PTR_ERR(trans); 10860 goto next; 10861 } 10862 10863 /* 10864 * We could have pending pinned extents for this block group, 10865 * just delete them, we don't care about them anymore. 10866 */ 10867 start = block_group->key.objectid; 10868 end = start + block_group->key.offset - 1; 10869 /* 10870 * Hold the unused_bg_unpin_mutex lock to avoid racing with 10871 * btrfs_finish_extent_commit(). If we are at transaction N, 10872 * another task might be running finish_extent_commit() for the 10873 * previous transaction N - 1, and have seen a range belonging 10874 * to the block group in freed_extents[] before we were able to 10875 * clear the whole block group range from freed_extents[]. This 10876 * means that task can lookup for the block group after we 10877 * unpinned it from freed_extents[] and removed it, leading to 10878 * a BUG_ON() at btrfs_unpin_extent_range(). 
10879 */ 10880 mutex_lock(&fs_info->unused_bg_unpin_mutex); 10881 ret = clear_extent_bits(&fs_info->freed_extents[0], start, end, 10882 EXTENT_DIRTY); 10883 if (ret) { 10884 mutex_unlock(&fs_info->unused_bg_unpin_mutex); 10885 btrfs_dec_block_group_ro(root, block_group); 10886 goto end_trans; 10887 } 10888 ret = clear_extent_bits(&fs_info->freed_extents[1], start, end, 10889 EXTENT_DIRTY); 10890 if (ret) { 10891 mutex_unlock(&fs_info->unused_bg_unpin_mutex); 10892 btrfs_dec_block_group_ro(root, block_group); 10893 goto end_trans; 10894 } 10895 mutex_unlock(&fs_info->unused_bg_unpin_mutex); 10896 10897 /* Reset pinned so btrfs_put_block_group doesn't complain */ 10898 spin_lock(&space_info->lock); 10899 spin_lock(&block_group->lock); 10900 10901 space_info->bytes_pinned -= block_group->pinned; 10902 space_info->bytes_readonly += block_group->pinned; 10903 percpu_counter_add(&space_info->total_bytes_pinned, 10904 -block_group->pinned); 10905 block_group->pinned = 0; 10906 10907 spin_unlock(&block_group->lock); 10908 spin_unlock(&space_info->lock); 10909 10910 /* DISCARD can flip during remount */ 10911 trimming = btrfs_test_opt(root->fs_info, DISCARD); 10912 10913 /* Implicit trim during transaction commit. */ 10914 if (trimming) 10915 btrfs_get_block_group_trimming(block_group); 10916 10917 /* 10918 * Btrfs_remove_chunk will abort the transaction if things go 10919 * horribly wrong. 10920 */ 10921 ret = btrfs_remove_chunk(trans, root, 10922 block_group->key.objectid); 10923 10924 if (ret) { 10925 if (trimming) 10926 btrfs_put_block_group_trimming(block_group); 10927 goto end_trans; 10928 } 10929 10930 /* 10931 * If we're not mounted with -odiscard, we can just forget 10932 * about this block group. Otherwise we'll need to wait 10933 * until transaction commit to do the actual discard. 10934 */ 10935 if (trimming) { 10936 spin_lock(&fs_info->unused_bgs_lock); 10937 /* 10938 * A concurrent scrub might have added us to the list 10939 * fs_info->unused_bgs, so use a list_move operation 10940 * to add the block group to the deleted_bgs list. 
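 * The extra block group reference taken right after the move keeps it
 * alive until the deleted_bgs list is processed at transaction commit.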
10941 */ 10942 list_move(&block_group->bg_list, 10943 &trans->transaction->deleted_bgs); 10944 spin_unlock(&fs_info->unused_bgs_lock); 10945 btrfs_get_block_group(block_group); 10946 } 10947 end_trans: 10948 btrfs_end_transaction(trans, root); 10949 next: 10950 mutex_unlock(&fs_info->delete_unused_bgs_mutex); 10951 btrfs_put_block_group(block_group); 10952 spin_lock(&fs_info->unused_bgs_lock); 10953 } 10954 spin_unlock(&fs_info->unused_bgs_lock); 10955 } 10956 10957 int btrfs_init_space_info(struct btrfs_fs_info *fs_info) 10958 { 10959 struct btrfs_space_info *space_info; 10960 struct btrfs_super_block *disk_super; 10961 u64 features; 10962 u64 flags; 10963 int mixed = 0; 10964 int ret; 10965 10966 disk_super = fs_info->super_copy; 10967 if (!btrfs_super_root(disk_super)) 10968 return -EINVAL; 10969 10970 features = btrfs_super_incompat_flags(disk_super); 10971 if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) 10972 mixed = 1; 10973 10974 flags = BTRFS_BLOCK_GROUP_SYSTEM; 10975 ret = update_space_info(fs_info, flags, 0, 0, 0, &space_info); 10976 if (ret) 10977 goto out; 10978 10979 if (mixed) { 10980 flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA; 10981 ret = update_space_info(fs_info, flags, 0, 0, 0, &space_info); 10982 } else { 10983 flags = BTRFS_BLOCK_GROUP_METADATA; 10984 ret = update_space_info(fs_info, flags, 0, 0, 0, &space_info); 10985 if (ret) 10986 goto out; 10987 10988 flags = BTRFS_BLOCK_GROUP_DATA; 10989 ret = update_space_info(fs_info, flags, 0, 0, 0, &space_info); 10990 } 10991 out: 10992 return ret; 10993 } 10994 10995 int btrfs_error_unpin_extent_range(struct btrfs_root *root, u64 start, u64 end) 10996 { 10997 return unpin_extent_range(root, start, end, false); 10998 } 10999 11000 /* 11001 * It used to be that old block groups would be left around forever. 11002 * Iterating over them would be enough to trim unused space. Since we 11003 * now automatically remove them, we also need to iterate over unallocated 11004 * space. 11005 * 11006 * We don't want a transaction for this since the discard may take a 11007 * substantial amount of time. We don't require that a transaction be 11008 * running, but we do need to take a running transaction into account 11009 * to ensure that we're not discarding chunks that were released in 11010 * the current transaction. 11011 * 11012 * Holding the chunks lock will prevent other threads from allocating 11013 * or releasing chunks, but it won't prevent a running transaction 11014 * from committing and releasing the memory that the pending chunks 11015 * list head uses. For that, we need to take a reference to the 11016 * transaction. 11017 */ 11018 static int btrfs_trim_free_extents(struct btrfs_device *device, 11019 u64 minlen, u64 *trimmed) 11020 { 11021 u64 start = 0, len = 0; 11022 int ret; 11023 11024 *trimmed = 0; 11025 11026 /* Not writeable = nothing to do. */ 11027 if (!device->writeable) 11028 return 0; 11029 11030 /* No free space = nothing to do. 
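 * (i.e. all of the device's bytes are already allocated to chunks)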
*/ 11031 if (device->total_bytes <= device->bytes_used) 11032 return 0; 11033 11034 ret = 0; 11035 11036 while (1) { 11037 struct btrfs_fs_info *fs_info = device->dev_root->fs_info; 11038 struct btrfs_transaction *trans; 11039 u64 bytes; 11040 11041 ret = mutex_lock_interruptible(&fs_info->chunk_mutex); 11042 if (ret) 11043 return ret; 11044 11045 down_read(&fs_info->commit_root_sem); 11046 11047 spin_lock(&fs_info->trans_lock); 11048 trans = fs_info->running_transaction; 11049 if (trans) 11050 atomic_inc(&trans->use_count); 11051 spin_unlock(&fs_info->trans_lock); 11052 11053 ret = find_free_dev_extent_start(trans, device, minlen, start, 11054 &start, &len); 11055 if (trans) 11056 btrfs_put_transaction(trans); 11057 11058 if (ret) { 11059 up_read(&fs_info->commit_root_sem); 11060 mutex_unlock(&fs_info->chunk_mutex); 11061 if (ret == -ENOSPC) 11062 ret = 0; 11063 break; 11064 } 11065 11066 ret = btrfs_issue_discard(device->bdev, start, len, &bytes); 11067 up_read(&fs_info->commit_root_sem); 11068 mutex_unlock(&fs_info->chunk_mutex); 11069 11070 if (ret) 11071 break; 11072 11073 start += len; 11074 *trimmed += bytes; 11075 11076 if (fatal_signal_pending(current)) { 11077 ret = -ERESTARTSYS; 11078 break; 11079 } 11080 11081 cond_resched(); 11082 } 11083 11084 return ret; 11085 } 11086 11087 int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range) 11088 { 11089 struct btrfs_fs_info *fs_info = root->fs_info; 11090 struct btrfs_block_group_cache *cache = NULL; 11091 struct btrfs_device *device; 11092 struct list_head *devices; 11093 u64 group_trimmed; 11094 u64 start; 11095 u64 end; 11096 u64 trimmed = 0; 11097 u64 total_bytes = btrfs_super_total_bytes(fs_info->super_copy); 11098 int ret = 0; 11099 11100 /* 11101 * try to trim all FS space, our block group may start from non-zero. 
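 * When range->len covers the whole FS we therefore use
 * btrfs_lookup_first_block_group() to find the first block group at or
 * after range->start, rather than requiring range->start to fall
 * inside an existing block group.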
11102 */ 11103 if (range->len == total_bytes) 11104 cache = btrfs_lookup_first_block_group(fs_info, range->start); 11105 else 11106 cache = btrfs_lookup_block_group(fs_info, range->start); 11107 11108 while (cache) { 11109 if (cache->key.objectid >= (range->start + range->len)) { 11110 btrfs_put_block_group(cache); 11111 break; 11112 } 11113 11114 start = max(range->start, cache->key.objectid); 11115 end = min(range->start + range->len, 11116 cache->key.objectid + cache->key.offset); 11117 11118 if (end - start >= range->minlen) { 11119 if (!block_group_cache_done(cache)) { 11120 ret = cache_block_group(cache, 0); 11121 if (ret) { 11122 btrfs_put_block_group(cache); 11123 break; 11124 } 11125 ret = wait_block_group_cache_done(cache); 11126 if (ret) { 11127 btrfs_put_block_group(cache); 11128 break; 11129 } 11130 } 11131 ret = btrfs_trim_block_group(cache, 11132 &group_trimmed, 11133 start, 11134 end, 11135 range->minlen); 11136 11137 trimmed += group_trimmed; 11138 if (ret) { 11139 btrfs_put_block_group(cache); 11140 break; 11141 } 11142 } 11143 11144 cache = next_block_group(fs_info->tree_root, cache); 11145 } 11146 11147 mutex_lock(&root->fs_info->fs_devices->device_list_mutex); 11148 devices = &root->fs_info->fs_devices->alloc_list; 11149 list_for_each_entry(device, devices, dev_alloc_list) { 11150 ret = btrfs_trim_free_extents(device, range->minlen, 11151 &group_trimmed); 11152 if (ret) 11153 break; 11154 11155 trimmed += group_trimmed; 11156 } 11157 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 11158 11159 range->len = trimmed; 11160 return ret; 11161 } 11162 11163 /* 11164 * btrfs_{start,end}_write_no_snapshoting() are similar to 11165 * mnt_{want,drop}_write(), they are used to prevent some tasks from writing 11166 * data into the page cache through nocow before the subvolume is snapshoted, 11167 * but flush the data into disk after the snapshot creation, or to prevent 11168 * operations while snapshoting is ongoing and that cause the snapshot to be 11169 * inconsistent (writes followed by expanding truncates for example). 11170 */ 11171 void btrfs_end_write_no_snapshoting(struct btrfs_root *root) 11172 { 11173 percpu_counter_dec(&root->subv_writers->counter); 11174 /* 11175 * Make sure counter is updated before we wake up waiters. 11176 */ 11177 smp_mb(); 11178 if (waitqueue_active(&root->subv_writers->wait)) 11179 wake_up(&root->subv_writers->wait); 11180 } 11181 11182 int btrfs_start_write_no_snapshoting(struct btrfs_root *root) 11183 { 11184 if (atomic_read(&root->will_be_snapshoted)) 11185 return 0; 11186 11187 percpu_counter_inc(&root->subv_writers->counter); 11188 /* 11189 * Make sure counter is updated before we check for snapshot creation. 11190 */ 11191 smp_mb(); 11192 if (atomic_read(&root->will_be_snapshoted)) { 11193 btrfs_end_write_no_snapshoting(root); 11194 return 0; 11195 } 11196 return 1; 11197 } 11198 11199 static int wait_snapshoting_atomic_t(atomic_t *a) 11200 { 11201 schedule(); 11202 return 0; 11203 } 11204 11205 void btrfs_wait_for_snapshot_creation(struct btrfs_root *root) 11206 { 11207 while (true) { 11208 int ret; 11209 11210 ret = btrfs_start_write_no_snapshoting(root); 11211 if (ret) 11212 break; 11213 wait_on_atomic_t(&root->will_be_snapshoted, 11214 wait_snapshoting_atomic_t, 11215 TASK_UNINTERRUPTIBLE); 11216 } 11217 } 11218