/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */
#include <linux/sched.h>
#include <linux/sched/signal.h>
#include <linux/pagemap.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/sort.h>
#include <linux/rcupdate.h>
#include <linux/kthread.h>
#include <linux/slab.h>
#include <linux/ratelimit.h>
#include <linux/percpu_counter.h>
#include "hash.h"
#include "tree-log.h"
#include "disk-io.h"
#include "print-tree.h"
#include "volumes.h"
#include "raid56.h"
#include "locking.h"
#include "free-space-cache.h"
#include "free-space-tree.h"
#include "math.h"
#include "sysfs.h"
#include "qgroup.h"

#undef SCRAMBLE_DELAYED_REFS

/*
 * Control flags for do_chunk_alloc()'s force field.
 *
 * CHUNK_ALLOC_NO_FORCE means to only allocate a chunk
 * if we really need one.
 *
 * CHUNK_ALLOC_LIMITED means to only try and allocate one
 * if we have very few chunks already allocated.  This is
 * used as part of the clustering code to help make sure
 * we have a good pool of storage to cluster in, without
 * filling the FS with empty chunks.
 *
 * CHUNK_ALLOC_FORCE means it must try to allocate one.
 */
enum {
	CHUNK_ALLOC_NO_FORCE = 0,
	CHUNK_ALLOC_LIMITED = 1,
	CHUNK_ALLOC_FORCE = 2,
};

static int update_block_group(struct btrfs_trans_handle *trans,
			      struct btrfs_fs_info *fs_info, u64 bytenr,
			      u64 num_bytes, int alloc);
static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
			       struct btrfs_fs_info *fs_info,
			       struct btrfs_delayed_ref_node *node, u64 parent,
			       u64 root_objectid, u64 owner_objectid,
			       u64 owner_offset, int refs_to_drop,
			       struct btrfs_delayed_extent_op *extra_op);
static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
				    struct extent_buffer *leaf,
				    struct btrfs_extent_item *ei);
static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
				      struct btrfs_fs_info *fs_info,
				      u64 parent, u64 root_objectid,
				      u64 flags, u64 owner, u64 offset,
				      struct btrfs_key *ins, int ref_mod);
static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
				     struct btrfs_fs_info *fs_info,
				     u64 parent, u64 root_objectid,
				     u64 flags, struct btrfs_disk_key *key,
				     int level, struct btrfs_key *ins);
static int do_chunk_alloc(struct btrfs_trans_handle *trans,
			  struct btrfs_fs_info *fs_info, u64 flags,
			  int force);
static int find_next_key(struct btrfs_path *path, int level,
			 struct btrfs_key *key);
static void dump_space_info(struct btrfs_fs_info *fs_info,
			    struct btrfs_space_info *info, u64 bytes,
			    int dump_block_groups);
static int btrfs_add_reserved_bytes(struct btrfs_block_group_cache *cache,
				    u64 ram_bytes, u64 num_bytes, int delalloc);
static int btrfs_free_reserved_bytes(struct
btrfs_block_group_cache *cache, 97 u64 num_bytes, int delalloc); 98 static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv, 99 u64 num_bytes); 100 static int __reserve_metadata_bytes(struct btrfs_root *root, 101 struct btrfs_space_info *space_info, 102 u64 orig_bytes, 103 enum btrfs_reserve_flush_enum flush); 104 static void space_info_add_new_bytes(struct btrfs_fs_info *fs_info, 105 struct btrfs_space_info *space_info, 106 u64 num_bytes); 107 static void space_info_add_old_bytes(struct btrfs_fs_info *fs_info, 108 struct btrfs_space_info *space_info, 109 u64 num_bytes); 110 111 static noinline int 112 block_group_cache_done(struct btrfs_block_group_cache *cache) 113 { 114 smp_mb(); 115 return cache->cached == BTRFS_CACHE_FINISHED || 116 cache->cached == BTRFS_CACHE_ERROR; 117 } 118 119 static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits) 120 { 121 return (cache->flags & bits) == bits; 122 } 123 124 void btrfs_get_block_group(struct btrfs_block_group_cache *cache) 125 { 126 atomic_inc(&cache->count); 127 } 128 129 void btrfs_put_block_group(struct btrfs_block_group_cache *cache) 130 { 131 if (atomic_dec_and_test(&cache->count)) { 132 WARN_ON(cache->pinned > 0); 133 WARN_ON(cache->reserved > 0); 134 135 /* 136 * If not empty, someone is still holding mutex of 137 * full_stripe_lock, which can only be released by caller. 138 * And it will definitely cause use-after-free when caller 139 * tries to release full stripe lock. 140 * 141 * No better way to resolve, but only to warn. 142 */ 143 WARN_ON(!RB_EMPTY_ROOT(&cache->full_stripe_locks_root.root)); 144 kfree(cache->free_space_ctl); 145 kfree(cache); 146 } 147 } 148 149 /* 150 * this adds the block group to the fs_info rb tree for the block group 151 * cache 152 */ 153 static int btrfs_add_block_group_cache(struct btrfs_fs_info *info, 154 struct btrfs_block_group_cache *block_group) 155 { 156 struct rb_node **p; 157 struct rb_node *parent = NULL; 158 struct btrfs_block_group_cache *cache; 159 160 spin_lock(&info->block_group_cache_lock); 161 p = &info->block_group_cache_tree.rb_node; 162 163 while (*p) { 164 parent = *p; 165 cache = rb_entry(parent, struct btrfs_block_group_cache, 166 cache_node); 167 if (block_group->key.objectid < cache->key.objectid) { 168 p = &(*p)->rb_left; 169 } else if (block_group->key.objectid > cache->key.objectid) { 170 p = &(*p)->rb_right; 171 } else { 172 spin_unlock(&info->block_group_cache_lock); 173 return -EEXIST; 174 } 175 } 176 177 rb_link_node(&block_group->cache_node, parent, p); 178 rb_insert_color(&block_group->cache_node, 179 &info->block_group_cache_tree); 180 181 if (info->first_logical_byte > block_group->key.objectid) 182 info->first_logical_byte = block_group->key.objectid; 183 184 spin_unlock(&info->block_group_cache_lock); 185 186 return 0; 187 } 188 189 /* 190 * This will return the block group at or after bytenr if contains is 0, else 191 * it will return the block group that contains the bytenr 192 */ 193 static struct btrfs_block_group_cache * 194 block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr, 195 int contains) 196 { 197 struct btrfs_block_group_cache *cache, *ret = NULL; 198 struct rb_node *n; 199 u64 end, start; 200 201 spin_lock(&info->block_group_cache_lock); 202 n = info->block_group_cache_tree.rb_node; 203 204 while (n) { 205 cache = rb_entry(n, struct btrfs_block_group_cache, 206 cache_node); 207 end = cache->key.objectid + cache->key.offset - 1; 208 start = cache->key.objectid; 209 210 if (bytenr < start) { 211 if (!contains && 
(!ret || start < ret->key.objectid)) 212 ret = cache; 213 n = n->rb_left; 214 } else if (bytenr > start) { 215 if (contains && bytenr <= end) { 216 ret = cache; 217 break; 218 } 219 n = n->rb_right; 220 } else { 221 ret = cache; 222 break; 223 } 224 } 225 if (ret) { 226 btrfs_get_block_group(ret); 227 if (bytenr == 0 && info->first_logical_byte > ret->key.objectid) 228 info->first_logical_byte = ret->key.objectid; 229 } 230 spin_unlock(&info->block_group_cache_lock); 231 232 return ret; 233 } 234 235 static int add_excluded_extent(struct btrfs_fs_info *fs_info, 236 u64 start, u64 num_bytes) 237 { 238 u64 end = start + num_bytes - 1; 239 set_extent_bits(&fs_info->freed_extents[0], 240 start, end, EXTENT_UPTODATE); 241 set_extent_bits(&fs_info->freed_extents[1], 242 start, end, EXTENT_UPTODATE); 243 return 0; 244 } 245 246 static void free_excluded_extents(struct btrfs_fs_info *fs_info, 247 struct btrfs_block_group_cache *cache) 248 { 249 u64 start, end; 250 251 start = cache->key.objectid; 252 end = start + cache->key.offset - 1; 253 254 clear_extent_bits(&fs_info->freed_extents[0], 255 start, end, EXTENT_UPTODATE); 256 clear_extent_bits(&fs_info->freed_extents[1], 257 start, end, EXTENT_UPTODATE); 258 } 259 260 static int exclude_super_stripes(struct btrfs_fs_info *fs_info, 261 struct btrfs_block_group_cache *cache) 262 { 263 u64 bytenr; 264 u64 *logical; 265 int stripe_len; 266 int i, nr, ret; 267 268 if (cache->key.objectid < BTRFS_SUPER_INFO_OFFSET) { 269 stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->key.objectid; 270 cache->bytes_super += stripe_len; 271 ret = add_excluded_extent(fs_info, cache->key.objectid, 272 stripe_len); 273 if (ret) 274 return ret; 275 } 276 277 for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { 278 bytenr = btrfs_sb_offset(i); 279 ret = btrfs_rmap_block(fs_info, cache->key.objectid, 280 bytenr, 0, &logical, &nr, &stripe_len); 281 if (ret) 282 return ret; 283 284 while (nr--) { 285 u64 start, len; 286 287 if (logical[nr] > cache->key.objectid + 288 cache->key.offset) 289 continue; 290 291 if (logical[nr] + stripe_len <= cache->key.objectid) 292 continue; 293 294 start = logical[nr]; 295 if (start < cache->key.objectid) { 296 start = cache->key.objectid; 297 len = (logical[nr] + stripe_len) - start; 298 } else { 299 len = min_t(u64, stripe_len, 300 cache->key.objectid + 301 cache->key.offset - start); 302 } 303 304 cache->bytes_super += len; 305 ret = add_excluded_extent(fs_info, start, len); 306 if (ret) { 307 kfree(logical); 308 return ret; 309 } 310 } 311 312 kfree(logical); 313 } 314 return 0; 315 } 316 317 static struct btrfs_caching_control * 318 get_caching_control(struct btrfs_block_group_cache *cache) 319 { 320 struct btrfs_caching_control *ctl; 321 322 spin_lock(&cache->lock); 323 if (!cache->caching_ctl) { 324 spin_unlock(&cache->lock); 325 return NULL; 326 } 327 328 ctl = cache->caching_ctl; 329 refcount_inc(&ctl->count); 330 spin_unlock(&cache->lock); 331 return ctl; 332 } 333 334 static void put_caching_control(struct btrfs_caching_control *ctl) 335 { 336 if (refcount_dec_and_test(&ctl->count)) 337 kfree(ctl); 338 } 339 340 #ifdef CONFIG_BTRFS_DEBUG 341 static void fragment_free_space(struct btrfs_block_group_cache *block_group) 342 { 343 struct btrfs_fs_info *fs_info = block_group->fs_info; 344 u64 start = block_group->key.objectid; 345 u64 len = block_group->key.offset; 346 u64 chunk = block_group->flags & BTRFS_BLOCK_GROUP_METADATA ? 
347 fs_info->nodesize : fs_info->sectorsize; 348 u64 step = chunk << 1; 349 350 while (len > chunk) { 351 btrfs_remove_free_space(block_group, start, chunk); 352 start += step; 353 if (len < step) 354 len = 0; 355 else 356 len -= step; 357 } 358 } 359 #endif 360 361 /* 362 * this is only called by cache_block_group, since we could have freed extents 363 * we need to check the pinned_extents for any extents that can't be used yet 364 * since their free space will be released as soon as the transaction commits. 365 */ 366 u64 add_new_free_space(struct btrfs_block_group_cache *block_group, 367 struct btrfs_fs_info *info, u64 start, u64 end) 368 { 369 u64 extent_start, extent_end, size, total_added = 0; 370 int ret; 371 372 while (start < end) { 373 ret = find_first_extent_bit(info->pinned_extents, start, 374 &extent_start, &extent_end, 375 EXTENT_DIRTY | EXTENT_UPTODATE, 376 NULL); 377 if (ret) 378 break; 379 380 if (extent_start <= start) { 381 start = extent_end + 1; 382 } else if (extent_start > start && extent_start < end) { 383 size = extent_start - start; 384 total_added += size; 385 ret = btrfs_add_free_space(block_group, start, 386 size); 387 BUG_ON(ret); /* -ENOMEM or logic error */ 388 start = extent_end + 1; 389 } else { 390 break; 391 } 392 } 393 394 if (start < end) { 395 size = end - start; 396 total_added += size; 397 ret = btrfs_add_free_space(block_group, start, size); 398 BUG_ON(ret); /* -ENOMEM or logic error */ 399 } 400 401 return total_added; 402 } 403 404 static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl) 405 { 406 struct btrfs_block_group_cache *block_group = caching_ctl->block_group; 407 struct btrfs_fs_info *fs_info = block_group->fs_info; 408 struct btrfs_root *extent_root = fs_info->extent_root; 409 struct btrfs_path *path; 410 struct extent_buffer *leaf; 411 struct btrfs_key key; 412 u64 total_found = 0; 413 u64 last = 0; 414 u32 nritems; 415 int ret; 416 bool wakeup = true; 417 418 path = btrfs_alloc_path(); 419 if (!path) 420 return -ENOMEM; 421 422 last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET); 423 424 #ifdef CONFIG_BTRFS_DEBUG 425 /* 426 * If we're fragmenting we don't want to make anybody think we can 427 * allocate from this block group until we've had a chance to fragment 428 * the free space. 429 */ 430 if (btrfs_should_fragment_free_space(block_group)) 431 wakeup = false; 432 #endif 433 /* 434 * We don't want to deadlock with somebody trying to allocate a new 435 * extent for the extent root while also trying to search the extent 436 * root to add free space. 
So we skip locking and search the commit 437 * root, since its read-only 438 */ 439 path->skip_locking = 1; 440 path->search_commit_root = 1; 441 path->reada = READA_FORWARD; 442 443 key.objectid = last; 444 key.offset = 0; 445 key.type = BTRFS_EXTENT_ITEM_KEY; 446 447 next: 448 ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); 449 if (ret < 0) 450 goto out; 451 452 leaf = path->nodes[0]; 453 nritems = btrfs_header_nritems(leaf); 454 455 while (1) { 456 if (btrfs_fs_closing(fs_info) > 1) { 457 last = (u64)-1; 458 break; 459 } 460 461 if (path->slots[0] < nritems) { 462 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 463 } else { 464 ret = find_next_key(path, 0, &key); 465 if (ret) 466 break; 467 468 if (need_resched() || 469 rwsem_is_contended(&fs_info->commit_root_sem)) { 470 if (wakeup) 471 caching_ctl->progress = last; 472 btrfs_release_path(path); 473 up_read(&fs_info->commit_root_sem); 474 mutex_unlock(&caching_ctl->mutex); 475 cond_resched(); 476 mutex_lock(&caching_ctl->mutex); 477 down_read(&fs_info->commit_root_sem); 478 goto next; 479 } 480 481 ret = btrfs_next_leaf(extent_root, path); 482 if (ret < 0) 483 goto out; 484 if (ret) 485 break; 486 leaf = path->nodes[0]; 487 nritems = btrfs_header_nritems(leaf); 488 continue; 489 } 490 491 if (key.objectid < last) { 492 key.objectid = last; 493 key.offset = 0; 494 key.type = BTRFS_EXTENT_ITEM_KEY; 495 496 if (wakeup) 497 caching_ctl->progress = last; 498 btrfs_release_path(path); 499 goto next; 500 } 501 502 if (key.objectid < block_group->key.objectid) { 503 path->slots[0]++; 504 continue; 505 } 506 507 if (key.objectid >= block_group->key.objectid + 508 block_group->key.offset) 509 break; 510 511 if (key.type == BTRFS_EXTENT_ITEM_KEY || 512 key.type == BTRFS_METADATA_ITEM_KEY) { 513 total_found += add_new_free_space(block_group, 514 fs_info, last, 515 key.objectid); 516 if (key.type == BTRFS_METADATA_ITEM_KEY) 517 last = key.objectid + 518 fs_info->nodesize; 519 else 520 last = key.objectid + key.offset; 521 522 if (total_found > CACHING_CTL_WAKE_UP) { 523 total_found = 0; 524 if (wakeup) 525 wake_up(&caching_ctl->wait); 526 } 527 } 528 path->slots[0]++; 529 } 530 ret = 0; 531 532 total_found += add_new_free_space(block_group, fs_info, last, 533 block_group->key.objectid + 534 block_group->key.offset); 535 caching_ctl->progress = (u64)-1; 536 537 out: 538 btrfs_free_path(path); 539 return ret; 540 } 541 542 static noinline void caching_thread(struct btrfs_work *work) 543 { 544 struct btrfs_block_group_cache *block_group; 545 struct btrfs_fs_info *fs_info; 546 struct btrfs_caching_control *caching_ctl; 547 struct btrfs_root *extent_root; 548 int ret; 549 550 caching_ctl = container_of(work, struct btrfs_caching_control, work); 551 block_group = caching_ctl->block_group; 552 fs_info = block_group->fs_info; 553 extent_root = fs_info->extent_root; 554 555 mutex_lock(&caching_ctl->mutex); 556 down_read(&fs_info->commit_root_sem); 557 558 if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) 559 ret = load_free_space_tree(caching_ctl); 560 else 561 ret = load_extent_tree_free(caching_ctl); 562 563 spin_lock(&block_group->lock); 564 block_group->caching_ctl = NULL; 565 block_group->cached = ret ? 
			BTRFS_CACHE_ERROR : BTRFS_CACHE_FINISHED;
	spin_unlock(&block_group->lock);

#ifdef CONFIG_BTRFS_DEBUG
	if (btrfs_should_fragment_free_space(block_group)) {
		u64 bytes_used;

		spin_lock(&block_group->space_info->lock);
		spin_lock(&block_group->lock);
		bytes_used = block_group->key.offset -
			btrfs_block_group_used(&block_group->item);
		block_group->space_info->bytes_used += bytes_used >> 1;
		spin_unlock(&block_group->lock);
		spin_unlock(&block_group->space_info->lock);
		fragment_free_space(block_group);
	}
#endif

	caching_ctl->progress = (u64)-1;

	up_read(&fs_info->commit_root_sem);
	free_excluded_extents(fs_info, block_group);
	mutex_unlock(&caching_ctl->mutex);

	wake_up(&caching_ctl->wait);

	put_caching_control(caching_ctl);
	btrfs_put_block_group(block_group);
}

static int cache_block_group(struct btrfs_block_group_cache *cache,
			     int load_cache_only)
{
	DEFINE_WAIT(wait);
	struct btrfs_fs_info *fs_info = cache->fs_info;
	struct btrfs_caching_control *caching_ctl;
	int ret = 0;

	caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
	if (!caching_ctl)
		return -ENOMEM;

	INIT_LIST_HEAD(&caching_ctl->list);
	mutex_init(&caching_ctl->mutex);
	init_waitqueue_head(&caching_ctl->wait);
	caching_ctl->block_group = cache;
	caching_ctl->progress = cache->key.objectid;
	refcount_set(&caching_ctl->count, 1);
	btrfs_init_work(&caching_ctl->work, btrfs_cache_helper,
			caching_thread, NULL, NULL);

	spin_lock(&cache->lock);
	/*
	 * This should be a rare occasion, but this could happen I think in the
	 * case where one thread starts to load the space cache info, and then
	 * some other thread starts a transaction commit which tries to do an
	 * allocation while the other thread is still loading the space cache
	 * info.  The previous loop should have kept us from choosing this block
	 * group, but if we've moved to the state where we will wait on caching
	 * block groups we need to first check if we're doing a fast load here,
	 * so we can wait for it to finish, otherwise we could end up allocating
	 * from a block group whose cache gets evicted for one reason or
	 * another.
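	 *
	 * As an illustrative sequence (not exhaustive): task A reaches this
	 * function first, sets cached to BTRFS_CACHE_FAST and starts
	 * load_free_space_cache(); task B, committing a transaction, then
	 * calls in for the same block group and must block on ctl->wait in
	 * the loop below instead of treating the group as either fully
	 * cached or completely uncached.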
628 */ 629 while (cache->cached == BTRFS_CACHE_FAST) { 630 struct btrfs_caching_control *ctl; 631 632 ctl = cache->caching_ctl; 633 refcount_inc(&ctl->count); 634 prepare_to_wait(&ctl->wait, &wait, TASK_UNINTERRUPTIBLE); 635 spin_unlock(&cache->lock); 636 637 schedule(); 638 639 finish_wait(&ctl->wait, &wait); 640 put_caching_control(ctl); 641 spin_lock(&cache->lock); 642 } 643 644 if (cache->cached != BTRFS_CACHE_NO) { 645 spin_unlock(&cache->lock); 646 kfree(caching_ctl); 647 return 0; 648 } 649 WARN_ON(cache->caching_ctl); 650 cache->caching_ctl = caching_ctl; 651 cache->cached = BTRFS_CACHE_FAST; 652 spin_unlock(&cache->lock); 653 654 if (fs_info->mount_opt & BTRFS_MOUNT_SPACE_CACHE) { 655 mutex_lock(&caching_ctl->mutex); 656 ret = load_free_space_cache(fs_info, cache); 657 658 spin_lock(&cache->lock); 659 if (ret == 1) { 660 cache->caching_ctl = NULL; 661 cache->cached = BTRFS_CACHE_FINISHED; 662 cache->last_byte_to_unpin = (u64)-1; 663 caching_ctl->progress = (u64)-1; 664 } else { 665 if (load_cache_only) { 666 cache->caching_ctl = NULL; 667 cache->cached = BTRFS_CACHE_NO; 668 } else { 669 cache->cached = BTRFS_CACHE_STARTED; 670 cache->has_caching_ctl = 1; 671 } 672 } 673 spin_unlock(&cache->lock); 674 #ifdef CONFIG_BTRFS_DEBUG 675 if (ret == 1 && 676 btrfs_should_fragment_free_space(cache)) { 677 u64 bytes_used; 678 679 spin_lock(&cache->space_info->lock); 680 spin_lock(&cache->lock); 681 bytes_used = cache->key.offset - 682 btrfs_block_group_used(&cache->item); 683 cache->space_info->bytes_used += bytes_used >> 1; 684 spin_unlock(&cache->lock); 685 spin_unlock(&cache->space_info->lock); 686 fragment_free_space(cache); 687 } 688 #endif 689 mutex_unlock(&caching_ctl->mutex); 690 691 wake_up(&caching_ctl->wait); 692 if (ret == 1) { 693 put_caching_control(caching_ctl); 694 free_excluded_extents(fs_info, cache); 695 return 0; 696 } 697 } else { 698 /* 699 * We're either using the free space tree or no caching at all. 700 * Set cached to the appropriate value and wakeup any waiters. 
		 */
		spin_lock(&cache->lock);
		if (load_cache_only) {
			cache->caching_ctl = NULL;
			cache->cached = BTRFS_CACHE_NO;
		} else {
			cache->cached = BTRFS_CACHE_STARTED;
			cache->has_caching_ctl = 1;
		}
		spin_unlock(&cache->lock);
		wake_up(&caching_ctl->wait);
	}

	if (load_cache_only) {
		put_caching_control(caching_ctl);
		return 0;
	}

	down_write(&fs_info->commit_root_sem);
	refcount_inc(&caching_ctl->count);
	list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
	up_write(&fs_info->commit_root_sem);

	btrfs_get_block_group(cache);

	btrfs_queue_work(fs_info->caching_workers, &caching_ctl->work);

	return ret;
}

/*
 * return the block group that starts at or after bytenr
 */
static struct btrfs_block_group_cache *
btrfs_lookup_first_block_group(struct btrfs_fs_info *info, u64 bytenr)
{
	return block_group_cache_tree_search(info, bytenr, 0);
}

/*
 * return the block group that contains the given bytenr
 */
struct btrfs_block_group_cache *btrfs_lookup_block_group(
						 struct btrfs_fs_info *info,
						 u64 bytenr)
{
	return block_group_cache_tree_search(info, bytenr, 1);
}

static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
						  u64 flags)
{
	struct list_head *head = &info->space_info;
	struct btrfs_space_info *found;

	flags &= BTRFS_BLOCK_GROUP_TYPE_MASK;

	rcu_read_lock();
	list_for_each_entry_rcu(found, head, list) {
		if (found->flags & flags) {
			rcu_read_unlock();
			return found;
		}
	}
	rcu_read_unlock();
	return NULL;
}

/*
 * after adding space to the filesystem, we need to clear the full flags
 * on all the space infos.
 */
void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
{
	struct list_head *head = &info->space_info;
	struct btrfs_space_info *found;

	rcu_read_lock();
	list_for_each_entry_rcu(found, head, list)
		found->full = 0;
	rcu_read_unlock();
}

/* simple helper to search for an existing data extent at a given offset */
int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len)
{
	int ret;
	struct btrfs_key key;
	struct btrfs_path *path;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = start;
	key.offset = len;
	key.type = BTRFS_EXTENT_ITEM_KEY;
	ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0);
	btrfs_free_path(path);
	return ret;
}

/*
 * helper function to lookup reference count and flags of a tree block.
 *
 * the head node for delayed ref is used to store the sum of all the
 * reference count modifications queued up in the rbtree. the head
 * node may also store the extent flags to set. This way you can check
 * to see what the reference count and extent flags would be if all of
 * the delayed refs are not processed.
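 *
 * A rough usage sketch (illustrative only; the exact arguments depend on the
 * caller): pass a NULL trans handle to read the committed state, or the
 * running transaction handle to also fold in pending delayed ref updates:
 *
 *	u64 refs, flags;
 *
 *	ret = btrfs_lookup_extent_info(trans, fs_info, bytenr, offset,
 *				       metadata, &refs, &flags);
 *	if (!ret && (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK))
 *		...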
811 */ 812 int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, 813 struct btrfs_fs_info *fs_info, u64 bytenr, 814 u64 offset, int metadata, u64 *refs, u64 *flags) 815 { 816 struct btrfs_delayed_ref_head *head; 817 struct btrfs_delayed_ref_root *delayed_refs; 818 struct btrfs_path *path; 819 struct btrfs_extent_item *ei; 820 struct extent_buffer *leaf; 821 struct btrfs_key key; 822 u32 item_size; 823 u64 num_refs; 824 u64 extent_flags; 825 int ret; 826 827 /* 828 * If we don't have skinny metadata, don't bother doing anything 829 * different 830 */ 831 if (metadata && !btrfs_fs_incompat(fs_info, SKINNY_METADATA)) { 832 offset = fs_info->nodesize; 833 metadata = 0; 834 } 835 836 path = btrfs_alloc_path(); 837 if (!path) 838 return -ENOMEM; 839 840 if (!trans) { 841 path->skip_locking = 1; 842 path->search_commit_root = 1; 843 } 844 845 search_again: 846 key.objectid = bytenr; 847 key.offset = offset; 848 if (metadata) 849 key.type = BTRFS_METADATA_ITEM_KEY; 850 else 851 key.type = BTRFS_EXTENT_ITEM_KEY; 852 853 ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 0); 854 if (ret < 0) 855 goto out_free; 856 857 if (ret > 0 && metadata && key.type == BTRFS_METADATA_ITEM_KEY) { 858 if (path->slots[0]) { 859 path->slots[0]--; 860 btrfs_item_key_to_cpu(path->nodes[0], &key, 861 path->slots[0]); 862 if (key.objectid == bytenr && 863 key.type == BTRFS_EXTENT_ITEM_KEY && 864 key.offset == fs_info->nodesize) 865 ret = 0; 866 } 867 } 868 869 if (ret == 0) { 870 leaf = path->nodes[0]; 871 item_size = btrfs_item_size_nr(leaf, path->slots[0]); 872 if (item_size >= sizeof(*ei)) { 873 ei = btrfs_item_ptr(leaf, path->slots[0], 874 struct btrfs_extent_item); 875 num_refs = btrfs_extent_refs(leaf, ei); 876 extent_flags = btrfs_extent_flags(leaf, ei); 877 } else { 878 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 879 struct btrfs_extent_item_v0 *ei0; 880 BUG_ON(item_size != sizeof(*ei0)); 881 ei0 = btrfs_item_ptr(leaf, path->slots[0], 882 struct btrfs_extent_item_v0); 883 num_refs = btrfs_extent_refs_v0(leaf, ei0); 884 /* FIXME: this isn't correct for data */ 885 extent_flags = BTRFS_BLOCK_FLAG_FULL_BACKREF; 886 #else 887 BUG(); 888 #endif 889 } 890 BUG_ON(num_refs == 0); 891 } else { 892 num_refs = 0; 893 extent_flags = 0; 894 ret = 0; 895 } 896 897 if (!trans) 898 goto out; 899 900 delayed_refs = &trans->transaction->delayed_refs; 901 spin_lock(&delayed_refs->lock); 902 head = btrfs_find_delayed_ref_head(delayed_refs, bytenr); 903 if (head) { 904 if (!mutex_trylock(&head->mutex)) { 905 refcount_inc(&head->node.refs); 906 spin_unlock(&delayed_refs->lock); 907 908 btrfs_release_path(path); 909 910 /* 911 * Mutex was contended, block until it's released and try 912 * again 913 */ 914 mutex_lock(&head->mutex); 915 mutex_unlock(&head->mutex); 916 btrfs_put_delayed_ref(&head->node); 917 goto search_again; 918 } 919 spin_lock(&head->lock); 920 if (head->extent_op && head->extent_op->update_flags) 921 extent_flags |= head->extent_op->flags_to_set; 922 else 923 BUG_ON(num_refs == 0); 924 925 num_refs += head->node.ref_mod; 926 spin_unlock(&head->lock); 927 mutex_unlock(&head->mutex); 928 } 929 spin_unlock(&delayed_refs->lock); 930 out: 931 WARN_ON(num_refs == 0); 932 if (refs) 933 *refs = num_refs; 934 if (flags) 935 *flags = extent_flags; 936 out_free: 937 btrfs_free_path(path); 938 return ret; 939 } 940 941 /* 942 * Back reference rules. 
Back refs have three main goals:
 *
 * 1) differentiate between all holders of references to an extent so that
 *    when a reference is dropped we can make sure it was a valid reference
 *    before freeing the extent.
 *
 * 2) Provide enough information to quickly find the holders of an extent
 *    if we notice a given block is corrupted or bad.
 *
 * 3) Make it easy to migrate blocks for FS shrinking or storage pool
 *    maintenance.  This is actually the same as #2, but with a slightly
 *    different use case.
 *
 * There are two kinds of back refs.  Implicit back refs are optimized
 * for pointers in non-shared tree blocks.  For a given pointer in a block,
 * back refs of this kind provide information about the block's owner tree
 * and the pointer's key.  This information allows us to find the block by
 * b-tree searching.  Full back refs are for pointers in tree blocks not
 * referenced by their owner trees.  The location of the tree block is
 * recorded in the back refs.  Actually the full back refs are generic, and
 * can be used in all cases where implicit back refs are used.  The major
 * shortcoming of full back refs is their overhead.  Every time a tree
 * block gets COWed, we have to update the back refs entry for all pointers
 * in it.
 *
 * For a newly allocated tree block, we use implicit back refs for
 * pointers in it.  This means most tree related operations only involve
 * implicit back refs.  For a tree block created in an old transaction, the
 * only way to drop a reference to it is to COW it.  So we can detect the
 * event that a tree block loses its owner tree's reference and do the
 * back refs conversion.
 *
 * When a tree block is COWed through a tree, there are four cases:
 *
 * The reference count of the block is one and the tree is the block's
 * owner tree.  Nothing to do in this case.
 *
 * The reference count of the block is one and the tree is not the
 * block's owner tree.  In this case, full back refs are used for pointers
 * in the block.  Remove these full back refs, add implicit back refs for
 * every pointer in the new block.
 *
 * The reference count of the block is greater than one and the tree is
 * the block's owner tree.  In this case, implicit back refs are used for
 * pointers in the block.  Add full back refs for every pointer in the
 * block, increase lower level extents' reference counts.  The original
 * implicit back refs are carried over to the new block.
 *
 * The reference count of the block is greater than one and the tree is
 * not the block's owner tree.  Add implicit back refs for every pointer in
 * the new block, increase lower level extents' reference count.
 *
 * Back Reference Key composing:
 *
 * The key objectid corresponds to the first byte in the extent,
 * the key type is used to differentiate between types of back refs.
 * There are different meanings of the key offset for different types
 * of back refs.
 *
 * File extents can be referenced by:
 *
 * - multiple snapshots, subvolumes, or different generations in one subvol
 * - different files inside a single subvolume
 * - different offsets inside a file (bookend extents in file.c)
 *
 * The extent ref structure for the implicit back refs has fields for:
 *
 * - objectid of the subvolume root
 * - objectid of the file holding the reference
 * - original offset in the file
 * - how many bookend extents
 *
 * The key offset for the implicit back refs is the hash of the first
 * three fields.
 *
 * The extent ref structure for the full back refs has a field for:
 *
 * - number of pointers in the tree leaf
 *
 * The key offset for the full back refs is the first byte of
 * the tree leaf.
 *
 * When a file extent is allocated, the implicit back refs are used.
 * The fields are filled in:
 *
 *     (root_key.objectid, inode objectid, offset in file, 1)
 *
 * When a file extent is removed during file truncation, we find the
 * corresponding implicit back refs and check the following fields:
 *
 *     (btrfs_header_owner(leaf), inode objectid, offset in file)
 *
 * Btree extents can be referenced by:
 *
 * - Different subvolumes
 *
 * Both the implicit back refs and the full back refs for tree blocks
 * only consist of a key.  The key offset for the implicit back refs is
 * the objectid of the block's owner tree.  The key offset for the full
 * back refs is the first byte of the parent block.
 *
 * When implicit back refs are used, information about the lowest key and
 * level of the tree block is required.  This information is stored in
 * the tree block info structure.
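 *
 * As a made-up illustration of the key composing: a data extent at bytenr X
 * referenced from inode 257 at file offset 0 in the tree of root 5 gets an
 * implicit back ref keyed as
 *
 *     (X, BTRFS_EXTENT_DATA_REF_KEY, hash(5, 257, 0))
 *
 * whereas the same extent referenced through a shared (full) back ref from
 * a leaf at bytenr Y is keyed as
 *
 *     (X, BTRFS_SHARED_DATA_REF_KEY, Y)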
1045 */ 1046 1047 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 1048 static int convert_extent_item_v0(struct btrfs_trans_handle *trans, 1049 struct btrfs_fs_info *fs_info, 1050 struct btrfs_path *path, 1051 u64 owner, u32 extra_size) 1052 { 1053 struct btrfs_root *root = fs_info->extent_root; 1054 struct btrfs_extent_item *item; 1055 struct btrfs_extent_item_v0 *ei0; 1056 struct btrfs_extent_ref_v0 *ref0; 1057 struct btrfs_tree_block_info *bi; 1058 struct extent_buffer *leaf; 1059 struct btrfs_key key; 1060 struct btrfs_key found_key; 1061 u32 new_size = sizeof(*item); 1062 u64 refs; 1063 int ret; 1064 1065 leaf = path->nodes[0]; 1066 BUG_ON(btrfs_item_size_nr(leaf, path->slots[0]) != sizeof(*ei0)); 1067 1068 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 1069 ei0 = btrfs_item_ptr(leaf, path->slots[0], 1070 struct btrfs_extent_item_v0); 1071 refs = btrfs_extent_refs_v0(leaf, ei0); 1072 1073 if (owner == (u64)-1) { 1074 while (1) { 1075 if (path->slots[0] >= btrfs_header_nritems(leaf)) { 1076 ret = btrfs_next_leaf(root, path); 1077 if (ret < 0) 1078 return ret; 1079 BUG_ON(ret > 0); /* Corruption */ 1080 leaf = path->nodes[0]; 1081 } 1082 btrfs_item_key_to_cpu(leaf, &found_key, 1083 path->slots[0]); 1084 BUG_ON(key.objectid != found_key.objectid); 1085 if (found_key.type != BTRFS_EXTENT_REF_V0_KEY) { 1086 path->slots[0]++; 1087 continue; 1088 } 1089 ref0 = btrfs_item_ptr(leaf, path->slots[0], 1090 struct btrfs_extent_ref_v0); 1091 owner = btrfs_ref_objectid_v0(leaf, ref0); 1092 break; 1093 } 1094 } 1095 btrfs_release_path(path); 1096 1097 if (owner < BTRFS_FIRST_FREE_OBJECTID) 1098 new_size += sizeof(*bi); 1099 1100 new_size -= sizeof(*ei0); 1101 ret = btrfs_search_slot(trans, root, &key, path, 1102 new_size + extra_size, 1); 1103 if (ret < 0) 1104 return ret; 1105 BUG_ON(ret); /* Corruption */ 1106 1107 btrfs_extend_item(fs_info, path, new_size); 1108 1109 leaf = path->nodes[0]; 1110 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 1111 btrfs_set_extent_refs(leaf, item, refs); 1112 /* FIXME: get real generation */ 1113 btrfs_set_extent_generation(leaf, item, 0); 1114 if (owner < BTRFS_FIRST_FREE_OBJECTID) { 1115 btrfs_set_extent_flags(leaf, item, 1116 BTRFS_EXTENT_FLAG_TREE_BLOCK | 1117 BTRFS_BLOCK_FLAG_FULL_BACKREF); 1118 bi = (struct btrfs_tree_block_info *)(item + 1); 1119 /* FIXME: get first key of the block */ 1120 memzero_extent_buffer(leaf, (unsigned long)bi, sizeof(*bi)); 1121 btrfs_set_tree_block_level(leaf, bi, (int)owner); 1122 } else { 1123 btrfs_set_extent_flags(leaf, item, BTRFS_EXTENT_FLAG_DATA); 1124 } 1125 btrfs_mark_buffer_dirty(leaf); 1126 return 0; 1127 } 1128 #endif 1129 1130 static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset) 1131 { 1132 u32 high_crc = ~(u32)0; 1133 u32 low_crc = ~(u32)0; 1134 __le64 lenum; 1135 1136 lenum = cpu_to_le64(root_objectid); 1137 high_crc = btrfs_crc32c(high_crc, &lenum, sizeof(lenum)); 1138 lenum = cpu_to_le64(owner); 1139 low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum)); 1140 lenum = cpu_to_le64(offset); 1141 low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum)); 1142 1143 return ((u64)high_crc << 31) ^ (u64)low_crc; 1144 } 1145 1146 static u64 hash_extent_data_ref_item(struct extent_buffer *leaf, 1147 struct btrfs_extent_data_ref *ref) 1148 { 1149 return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref), 1150 btrfs_extent_data_ref_objectid(leaf, ref), 1151 btrfs_extent_data_ref_offset(leaf, ref)); 1152 } 1153 1154 static int match_extent_data_ref(struct extent_buffer *leaf, 1155 
struct btrfs_extent_data_ref *ref, 1156 u64 root_objectid, u64 owner, u64 offset) 1157 { 1158 if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid || 1159 btrfs_extent_data_ref_objectid(leaf, ref) != owner || 1160 btrfs_extent_data_ref_offset(leaf, ref) != offset) 1161 return 0; 1162 return 1; 1163 } 1164 1165 static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans, 1166 struct btrfs_fs_info *fs_info, 1167 struct btrfs_path *path, 1168 u64 bytenr, u64 parent, 1169 u64 root_objectid, 1170 u64 owner, u64 offset) 1171 { 1172 struct btrfs_root *root = fs_info->extent_root; 1173 struct btrfs_key key; 1174 struct btrfs_extent_data_ref *ref; 1175 struct extent_buffer *leaf; 1176 u32 nritems; 1177 int ret; 1178 int recow; 1179 int err = -ENOENT; 1180 1181 key.objectid = bytenr; 1182 if (parent) { 1183 key.type = BTRFS_SHARED_DATA_REF_KEY; 1184 key.offset = parent; 1185 } else { 1186 key.type = BTRFS_EXTENT_DATA_REF_KEY; 1187 key.offset = hash_extent_data_ref(root_objectid, 1188 owner, offset); 1189 } 1190 again: 1191 recow = 0; 1192 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 1193 if (ret < 0) { 1194 err = ret; 1195 goto fail; 1196 } 1197 1198 if (parent) { 1199 if (!ret) 1200 return 0; 1201 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 1202 key.type = BTRFS_EXTENT_REF_V0_KEY; 1203 btrfs_release_path(path); 1204 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 1205 if (ret < 0) { 1206 err = ret; 1207 goto fail; 1208 } 1209 if (!ret) 1210 return 0; 1211 #endif 1212 goto fail; 1213 } 1214 1215 leaf = path->nodes[0]; 1216 nritems = btrfs_header_nritems(leaf); 1217 while (1) { 1218 if (path->slots[0] >= nritems) { 1219 ret = btrfs_next_leaf(root, path); 1220 if (ret < 0) 1221 err = ret; 1222 if (ret) 1223 goto fail; 1224 1225 leaf = path->nodes[0]; 1226 nritems = btrfs_header_nritems(leaf); 1227 recow = 1; 1228 } 1229 1230 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 1231 if (key.objectid != bytenr || 1232 key.type != BTRFS_EXTENT_DATA_REF_KEY) 1233 goto fail; 1234 1235 ref = btrfs_item_ptr(leaf, path->slots[0], 1236 struct btrfs_extent_data_ref); 1237 1238 if (match_extent_data_ref(leaf, ref, root_objectid, 1239 owner, offset)) { 1240 if (recow) { 1241 btrfs_release_path(path); 1242 goto again; 1243 } 1244 err = 0; 1245 break; 1246 } 1247 path->slots[0]++; 1248 } 1249 fail: 1250 return err; 1251 } 1252 1253 static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans, 1254 struct btrfs_fs_info *fs_info, 1255 struct btrfs_path *path, 1256 u64 bytenr, u64 parent, 1257 u64 root_objectid, u64 owner, 1258 u64 offset, int refs_to_add) 1259 { 1260 struct btrfs_root *root = fs_info->extent_root; 1261 struct btrfs_key key; 1262 struct extent_buffer *leaf; 1263 u32 size; 1264 u32 num_refs; 1265 int ret; 1266 1267 key.objectid = bytenr; 1268 if (parent) { 1269 key.type = BTRFS_SHARED_DATA_REF_KEY; 1270 key.offset = parent; 1271 size = sizeof(struct btrfs_shared_data_ref); 1272 } else { 1273 key.type = BTRFS_EXTENT_DATA_REF_KEY; 1274 key.offset = hash_extent_data_ref(root_objectid, 1275 owner, offset); 1276 size = sizeof(struct btrfs_extent_data_ref); 1277 } 1278 1279 ret = btrfs_insert_empty_item(trans, root, path, &key, size); 1280 if (ret && ret != -EEXIST) 1281 goto fail; 1282 1283 leaf = path->nodes[0]; 1284 if (parent) { 1285 struct btrfs_shared_data_ref *ref; 1286 ref = btrfs_item_ptr(leaf, path->slots[0], 1287 struct btrfs_shared_data_ref); 1288 if (ret == 0) { 1289 btrfs_set_shared_data_ref_count(leaf, ref, refs_to_add); 1290 } else { 1291 
num_refs = btrfs_shared_data_ref_count(leaf, ref); 1292 num_refs += refs_to_add; 1293 btrfs_set_shared_data_ref_count(leaf, ref, num_refs); 1294 } 1295 } else { 1296 struct btrfs_extent_data_ref *ref; 1297 while (ret == -EEXIST) { 1298 ref = btrfs_item_ptr(leaf, path->slots[0], 1299 struct btrfs_extent_data_ref); 1300 if (match_extent_data_ref(leaf, ref, root_objectid, 1301 owner, offset)) 1302 break; 1303 btrfs_release_path(path); 1304 key.offset++; 1305 ret = btrfs_insert_empty_item(trans, root, path, &key, 1306 size); 1307 if (ret && ret != -EEXIST) 1308 goto fail; 1309 1310 leaf = path->nodes[0]; 1311 } 1312 ref = btrfs_item_ptr(leaf, path->slots[0], 1313 struct btrfs_extent_data_ref); 1314 if (ret == 0) { 1315 btrfs_set_extent_data_ref_root(leaf, ref, 1316 root_objectid); 1317 btrfs_set_extent_data_ref_objectid(leaf, ref, owner); 1318 btrfs_set_extent_data_ref_offset(leaf, ref, offset); 1319 btrfs_set_extent_data_ref_count(leaf, ref, refs_to_add); 1320 } else { 1321 num_refs = btrfs_extent_data_ref_count(leaf, ref); 1322 num_refs += refs_to_add; 1323 btrfs_set_extent_data_ref_count(leaf, ref, num_refs); 1324 } 1325 } 1326 btrfs_mark_buffer_dirty(leaf); 1327 ret = 0; 1328 fail: 1329 btrfs_release_path(path); 1330 return ret; 1331 } 1332 1333 static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans, 1334 struct btrfs_fs_info *fs_info, 1335 struct btrfs_path *path, 1336 int refs_to_drop, int *last_ref) 1337 { 1338 struct btrfs_key key; 1339 struct btrfs_extent_data_ref *ref1 = NULL; 1340 struct btrfs_shared_data_ref *ref2 = NULL; 1341 struct extent_buffer *leaf; 1342 u32 num_refs = 0; 1343 int ret = 0; 1344 1345 leaf = path->nodes[0]; 1346 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 1347 1348 if (key.type == BTRFS_EXTENT_DATA_REF_KEY) { 1349 ref1 = btrfs_item_ptr(leaf, path->slots[0], 1350 struct btrfs_extent_data_ref); 1351 num_refs = btrfs_extent_data_ref_count(leaf, ref1); 1352 } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) { 1353 ref2 = btrfs_item_ptr(leaf, path->slots[0], 1354 struct btrfs_shared_data_ref); 1355 num_refs = btrfs_shared_data_ref_count(leaf, ref2); 1356 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 1357 } else if (key.type == BTRFS_EXTENT_REF_V0_KEY) { 1358 struct btrfs_extent_ref_v0 *ref0; 1359 ref0 = btrfs_item_ptr(leaf, path->slots[0], 1360 struct btrfs_extent_ref_v0); 1361 num_refs = btrfs_ref_count_v0(leaf, ref0); 1362 #endif 1363 } else { 1364 BUG(); 1365 } 1366 1367 BUG_ON(num_refs < refs_to_drop); 1368 num_refs -= refs_to_drop; 1369 1370 if (num_refs == 0) { 1371 ret = btrfs_del_item(trans, fs_info->extent_root, path); 1372 *last_ref = 1; 1373 } else { 1374 if (key.type == BTRFS_EXTENT_DATA_REF_KEY) 1375 btrfs_set_extent_data_ref_count(leaf, ref1, num_refs); 1376 else if (key.type == BTRFS_SHARED_DATA_REF_KEY) 1377 btrfs_set_shared_data_ref_count(leaf, ref2, num_refs); 1378 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 1379 else { 1380 struct btrfs_extent_ref_v0 *ref0; 1381 ref0 = btrfs_item_ptr(leaf, path->slots[0], 1382 struct btrfs_extent_ref_v0); 1383 btrfs_set_ref_count_v0(leaf, ref0, num_refs); 1384 } 1385 #endif 1386 btrfs_mark_buffer_dirty(leaf); 1387 } 1388 return ret; 1389 } 1390 1391 static noinline u32 extent_data_ref_count(struct btrfs_path *path, 1392 struct btrfs_extent_inline_ref *iref) 1393 { 1394 struct btrfs_key key; 1395 struct extent_buffer *leaf; 1396 struct btrfs_extent_data_ref *ref1; 1397 struct btrfs_shared_data_ref *ref2; 1398 u32 num_refs = 0; 1399 1400 leaf = path->nodes[0]; 1401 btrfs_item_key_to_cpu(leaf, &key, 
path->slots[0]); 1402 if (iref) { 1403 if (btrfs_extent_inline_ref_type(leaf, iref) == 1404 BTRFS_EXTENT_DATA_REF_KEY) { 1405 ref1 = (struct btrfs_extent_data_ref *)(&iref->offset); 1406 num_refs = btrfs_extent_data_ref_count(leaf, ref1); 1407 } else { 1408 ref2 = (struct btrfs_shared_data_ref *)(iref + 1); 1409 num_refs = btrfs_shared_data_ref_count(leaf, ref2); 1410 } 1411 } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) { 1412 ref1 = btrfs_item_ptr(leaf, path->slots[0], 1413 struct btrfs_extent_data_ref); 1414 num_refs = btrfs_extent_data_ref_count(leaf, ref1); 1415 } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) { 1416 ref2 = btrfs_item_ptr(leaf, path->slots[0], 1417 struct btrfs_shared_data_ref); 1418 num_refs = btrfs_shared_data_ref_count(leaf, ref2); 1419 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 1420 } else if (key.type == BTRFS_EXTENT_REF_V0_KEY) { 1421 struct btrfs_extent_ref_v0 *ref0; 1422 ref0 = btrfs_item_ptr(leaf, path->slots[0], 1423 struct btrfs_extent_ref_v0); 1424 num_refs = btrfs_ref_count_v0(leaf, ref0); 1425 #endif 1426 } else { 1427 WARN_ON(1); 1428 } 1429 return num_refs; 1430 } 1431 1432 static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans, 1433 struct btrfs_fs_info *fs_info, 1434 struct btrfs_path *path, 1435 u64 bytenr, u64 parent, 1436 u64 root_objectid) 1437 { 1438 struct btrfs_root *root = fs_info->extent_root; 1439 struct btrfs_key key; 1440 int ret; 1441 1442 key.objectid = bytenr; 1443 if (parent) { 1444 key.type = BTRFS_SHARED_BLOCK_REF_KEY; 1445 key.offset = parent; 1446 } else { 1447 key.type = BTRFS_TREE_BLOCK_REF_KEY; 1448 key.offset = root_objectid; 1449 } 1450 1451 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 1452 if (ret > 0) 1453 ret = -ENOENT; 1454 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 1455 if (ret == -ENOENT && parent) { 1456 btrfs_release_path(path); 1457 key.type = BTRFS_EXTENT_REF_V0_KEY; 1458 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 1459 if (ret > 0) 1460 ret = -ENOENT; 1461 } 1462 #endif 1463 return ret; 1464 } 1465 1466 static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans, 1467 struct btrfs_fs_info *fs_info, 1468 struct btrfs_path *path, 1469 u64 bytenr, u64 parent, 1470 u64 root_objectid) 1471 { 1472 struct btrfs_key key; 1473 int ret; 1474 1475 key.objectid = bytenr; 1476 if (parent) { 1477 key.type = BTRFS_SHARED_BLOCK_REF_KEY; 1478 key.offset = parent; 1479 } else { 1480 key.type = BTRFS_TREE_BLOCK_REF_KEY; 1481 key.offset = root_objectid; 1482 } 1483 1484 ret = btrfs_insert_empty_item(trans, fs_info->extent_root, 1485 path, &key, 0); 1486 btrfs_release_path(path); 1487 return ret; 1488 } 1489 1490 static inline int extent_ref_type(u64 parent, u64 owner) 1491 { 1492 int type; 1493 if (owner < BTRFS_FIRST_FREE_OBJECTID) { 1494 if (parent > 0) 1495 type = BTRFS_SHARED_BLOCK_REF_KEY; 1496 else 1497 type = BTRFS_TREE_BLOCK_REF_KEY; 1498 } else { 1499 if (parent > 0) 1500 type = BTRFS_SHARED_DATA_REF_KEY; 1501 else 1502 type = BTRFS_EXTENT_DATA_REF_KEY; 1503 } 1504 return type; 1505 } 1506 1507 static int find_next_key(struct btrfs_path *path, int level, 1508 struct btrfs_key *key) 1509 1510 { 1511 for (; level < BTRFS_MAX_LEVEL; level++) { 1512 if (!path->nodes[level]) 1513 break; 1514 if (path->slots[level] + 1 >= 1515 btrfs_header_nritems(path->nodes[level])) 1516 continue; 1517 if (level == 0) 1518 btrfs_item_key_to_cpu(path->nodes[level], key, 1519 path->slots[level] + 1); 1520 else 1521 btrfs_node_key_to_cpu(path->nodes[level], key, 1522 path->slots[level] + 1); 1523 
		return 0;
	}
	return 1;
}

/*
 * look for inline back ref. if back ref is found, *ref_ret is set
 * to the address of inline back ref, and 0 is returned.
 *
 * if back ref isn't found, *ref_ret is set to the address where it
 * should be inserted, and -ENOENT is returned.
 *
 * if insert is true and there are too many inline back refs, the path
 * points to the extent item, and -EAGAIN is returned.
 *
 * NOTE: inline back refs are ordered in the same way that back ref
 *	 items in the tree are ordered.
 */
static noinline_for_stack
int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
				 struct btrfs_fs_info *fs_info,
				 struct btrfs_path *path,
				 struct btrfs_extent_inline_ref **ref_ret,
				 u64 bytenr, u64 num_bytes,
				 u64 parent, u64 root_objectid,
				 u64 owner, u64 offset, int insert)
{
	struct btrfs_root *root = fs_info->extent_root;
	struct btrfs_key key;
	struct extent_buffer *leaf;
	struct btrfs_extent_item *ei;
	struct btrfs_extent_inline_ref *iref;
	u64 flags;
	u64 item_size;
	unsigned long ptr;
	unsigned long end;
	int extra_size;
	int type;
	int want;
	int ret;
	int err = 0;
	bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);

	key.objectid = bytenr;
	key.type = BTRFS_EXTENT_ITEM_KEY;
	key.offset = num_bytes;

	want = extent_ref_type(parent, owner);
	if (insert) {
		extra_size = btrfs_extent_inline_ref_size(want);
		path->keep_locks = 1;
	} else
		extra_size = -1;

	/*
	 * Owner is our parent level, so we can just add one to get the level
	 * for the block we are interested in.
	 */
	if (skinny_metadata && owner < BTRFS_FIRST_FREE_OBJECTID) {
		key.type = BTRFS_METADATA_ITEM_KEY;
		key.offset = owner;
	}

again:
	ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1);
	if (ret < 0) {
		err = ret;
		goto out;
	}

	/*
	 * We may be a newly converted file system which still has the old fat
	 * extent entries for metadata, so try and see if we have one of those.
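	 * As a rough illustration: with skinny metadata the block is keyed
	 * as (bytenr, BTRFS_METADATA_ITEM_KEY, owner), while an unconverted
	 * filesystem still records it as (bytenr, BTRFS_EXTENT_ITEM_KEY,
	 * num_bytes), which is what the fallback below searches for.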
1596 */ 1597 if (ret > 0 && skinny_metadata) { 1598 skinny_metadata = false; 1599 if (path->slots[0]) { 1600 path->slots[0]--; 1601 btrfs_item_key_to_cpu(path->nodes[0], &key, 1602 path->slots[0]); 1603 if (key.objectid == bytenr && 1604 key.type == BTRFS_EXTENT_ITEM_KEY && 1605 key.offset == num_bytes) 1606 ret = 0; 1607 } 1608 if (ret) { 1609 key.objectid = bytenr; 1610 key.type = BTRFS_EXTENT_ITEM_KEY; 1611 key.offset = num_bytes; 1612 btrfs_release_path(path); 1613 goto again; 1614 } 1615 } 1616 1617 if (ret && !insert) { 1618 err = -ENOENT; 1619 goto out; 1620 } else if (WARN_ON(ret)) { 1621 err = -EIO; 1622 goto out; 1623 } 1624 1625 leaf = path->nodes[0]; 1626 item_size = btrfs_item_size_nr(leaf, path->slots[0]); 1627 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 1628 if (item_size < sizeof(*ei)) { 1629 if (!insert) { 1630 err = -ENOENT; 1631 goto out; 1632 } 1633 ret = convert_extent_item_v0(trans, fs_info, path, owner, 1634 extra_size); 1635 if (ret < 0) { 1636 err = ret; 1637 goto out; 1638 } 1639 leaf = path->nodes[0]; 1640 item_size = btrfs_item_size_nr(leaf, path->slots[0]); 1641 } 1642 #endif 1643 BUG_ON(item_size < sizeof(*ei)); 1644 1645 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 1646 flags = btrfs_extent_flags(leaf, ei); 1647 1648 ptr = (unsigned long)(ei + 1); 1649 end = (unsigned long)ei + item_size; 1650 1651 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK && !skinny_metadata) { 1652 ptr += sizeof(struct btrfs_tree_block_info); 1653 BUG_ON(ptr > end); 1654 } 1655 1656 err = -ENOENT; 1657 while (1) { 1658 if (ptr >= end) { 1659 WARN_ON(ptr > end); 1660 break; 1661 } 1662 iref = (struct btrfs_extent_inline_ref *)ptr; 1663 type = btrfs_extent_inline_ref_type(leaf, iref); 1664 if (want < type) 1665 break; 1666 if (want > type) { 1667 ptr += btrfs_extent_inline_ref_size(type); 1668 continue; 1669 } 1670 1671 if (type == BTRFS_EXTENT_DATA_REF_KEY) { 1672 struct btrfs_extent_data_ref *dref; 1673 dref = (struct btrfs_extent_data_ref *)(&iref->offset); 1674 if (match_extent_data_ref(leaf, dref, root_objectid, 1675 owner, offset)) { 1676 err = 0; 1677 break; 1678 } 1679 if (hash_extent_data_ref_item(leaf, dref) < 1680 hash_extent_data_ref(root_objectid, owner, offset)) 1681 break; 1682 } else { 1683 u64 ref_offset; 1684 ref_offset = btrfs_extent_inline_ref_offset(leaf, iref); 1685 if (parent > 0) { 1686 if (parent == ref_offset) { 1687 err = 0; 1688 break; 1689 } 1690 if (ref_offset < parent) 1691 break; 1692 } else { 1693 if (root_objectid == ref_offset) { 1694 err = 0; 1695 break; 1696 } 1697 if (ref_offset < root_objectid) 1698 break; 1699 } 1700 } 1701 ptr += btrfs_extent_inline_ref_size(type); 1702 } 1703 if (err == -ENOENT && insert) { 1704 if (item_size + extra_size >= 1705 BTRFS_MAX_EXTENT_ITEM_SIZE(root)) { 1706 err = -EAGAIN; 1707 goto out; 1708 } 1709 /* 1710 * To add new inline back ref, we have to make sure 1711 * there is no corresponding back ref item. 
1712 * For simplicity, we just do not add new inline back 1713 * ref if there is any kind of item for this block 1714 */ 1715 if (find_next_key(path, 0, &key) == 0 && 1716 key.objectid == bytenr && 1717 key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) { 1718 err = -EAGAIN; 1719 goto out; 1720 } 1721 } 1722 *ref_ret = (struct btrfs_extent_inline_ref *)ptr; 1723 out: 1724 if (insert) { 1725 path->keep_locks = 0; 1726 btrfs_unlock_up_safe(path, 1); 1727 } 1728 return err; 1729 } 1730 1731 /* 1732 * helper to add new inline back ref 1733 */ 1734 static noinline_for_stack 1735 void setup_inline_extent_backref(struct btrfs_fs_info *fs_info, 1736 struct btrfs_path *path, 1737 struct btrfs_extent_inline_ref *iref, 1738 u64 parent, u64 root_objectid, 1739 u64 owner, u64 offset, int refs_to_add, 1740 struct btrfs_delayed_extent_op *extent_op) 1741 { 1742 struct extent_buffer *leaf; 1743 struct btrfs_extent_item *ei; 1744 unsigned long ptr; 1745 unsigned long end; 1746 unsigned long item_offset; 1747 u64 refs; 1748 int size; 1749 int type; 1750 1751 leaf = path->nodes[0]; 1752 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 1753 item_offset = (unsigned long)iref - (unsigned long)ei; 1754 1755 type = extent_ref_type(parent, owner); 1756 size = btrfs_extent_inline_ref_size(type); 1757 1758 btrfs_extend_item(fs_info, path, size); 1759 1760 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 1761 refs = btrfs_extent_refs(leaf, ei); 1762 refs += refs_to_add; 1763 btrfs_set_extent_refs(leaf, ei, refs); 1764 if (extent_op) 1765 __run_delayed_extent_op(extent_op, leaf, ei); 1766 1767 ptr = (unsigned long)ei + item_offset; 1768 end = (unsigned long)ei + btrfs_item_size_nr(leaf, path->slots[0]); 1769 if (ptr < end - size) 1770 memmove_extent_buffer(leaf, ptr + size, ptr, 1771 end - size - ptr); 1772 1773 iref = (struct btrfs_extent_inline_ref *)ptr; 1774 btrfs_set_extent_inline_ref_type(leaf, iref, type); 1775 if (type == BTRFS_EXTENT_DATA_REF_KEY) { 1776 struct btrfs_extent_data_ref *dref; 1777 dref = (struct btrfs_extent_data_ref *)(&iref->offset); 1778 btrfs_set_extent_data_ref_root(leaf, dref, root_objectid); 1779 btrfs_set_extent_data_ref_objectid(leaf, dref, owner); 1780 btrfs_set_extent_data_ref_offset(leaf, dref, offset); 1781 btrfs_set_extent_data_ref_count(leaf, dref, refs_to_add); 1782 } else if (type == BTRFS_SHARED_DATA_REF_KEY) { 1783 struct btrfs_shared_data_ref *sref; 1784 sref = (struct btrfs_shared_data_ref *)(iref + 1); 1785 btrfs_set_shared_data_ref_count(leaf, sref, refs_to_add); 1786 btrfs_set_extent_inline_ref_offset(leaf, iref, parent); 1787 } else if (type == BTRFS_SHARED_BLOCK_REF_KEY) { 1788 btrfs_set_extent_inline_ref_offset(leaf, iref, parent); 1789 } else { 1790 btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid); 1791 } 1792 btrfs_mark_buffer_dirty(leaf); 1793 } 1794 1795 static int lookup_extent_backref(struct btrfs_trans_handle *trans, 1796 struct btrfs_fs_info *fs_info, 1797 struct btrfs_path *path, 1798 struct btrfs_extent_inline_ref **ref_ret, 1799 u64 bytenr, u64 num_bytes, u64 parent, 1800 u64 root_objectid, u64 owner, u64 offset) 1801 { 1802 int ret; 1803 1804 ret = lookup_inline_extent_backref(trans, fs_info, path, ref_ret, 1805 bytenr, num_bytes, parent, 1806 root_objectid, owner, offset, 0); 1807 if (ret != -ENOENT) 1808 return ret; 1809 1810 btrfs_release_path(path); 1811 *ref_ret = NULL; 1812 1813 if (owner < BTRFS_FIRST_FREE_OBJECTID) { 1814 ret = lookup_tree_block_ref(trans, fs_info, path, bytenr, 1815 parent, root_objectid); 
1816 } else { 1817 ret = lookup_extent_data_ref(trans, fs_info, path, bytenr, 1818 parent, root_objectid, owner, 1819 offset); 1820 } 1821 return ret; 1822 } 1823 1824 /* 1825 * helper to update/remove inline back ref 1826 */ 1827 static noinline_for_stack 1828 void update_inline_extent_backref(struct btrfs_fs_info *fs_info, 1829 struct btrfs_path *path, 1830 struct btrfs_extent_inline_ref *iref, 1831 int refs_to_mod, 1832 struct btrfs_delayed_extent_op *extent_op, 1833 int *last_ref) 1834 { 1835 struct extent_buffer *leaf; 1836 struct btrfs_extent_item *ei; 1837 struct btrfs_extent_data_ref *dref = NULL; 1838 struct btrfs_shared_data_ref *sref = NULL; 1839 unsigned long ptr; 1840 unsigned long end; 1841 u32 item_size; 1842 int size; 1843 int type; 1844 u64 refs; 1845 1846 leaf = path->nodes[0]; 1847 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 1848 refs = btrfs_extent_refs(leaf, ei); 1849 WARN_ON(refs_to_mod < 0 && refs + refs_to_mod <= 0); 1850 refs += refs_to_mod; 1851 btrfs_set_extent_refs(leaf, ei, refs); 1852 if (extent_op) 1853 __run_delayed_extent_op(extent_op, leaf, ei); 1854 1855 type = btrfs_extent_inline_ref_type(leaf, iref); 1856 1857 if (type == BTRFS_EXTENT_DATA_REF_KEY) { 1858 dref = (struct btrfs_extent_data_ref *)(&iref->offset); 1859 refs = btrfs_extent_data_ref_count(leaf, dref); 1860 } else if (type == BTRFS_SHARED_DATA_REF_KEY) { 1861 sref = (struct btrfs_shared_data_ref *)(iref + 1); 1862 refs = btrfs_shared_data_ref_count(leaf, sref); 1863 } else { 1864 refs = 1; 1865 BUG_ON(refs_to_mod != -1); 1866 } 1867 1868 BUG_ON(refs_to_mod < 0 && refs < -refs_to_mod); 1869 refs += refs_to_mod; 1870 1871 if (refs > 0) { 1872 if (type == BTRFS_EXTENT_DATA_REF_KEY) 1873 btrfs_set_extent_data_ref_count(leaf, dref, refs); 1874 else 1875 btrfs_set_shared_data_ref_count(leaf, sref, refs); 1876 } else { 1877 *last_ref = 1; 1878 size = btrfs_extent_inline_ref_size(type); 1879 item_size = btrfs_item_size_nr(leaf, path->slots[0]); 1880 ptr = (unsigned long)iref; 1881 end = (unsigned long)ei + item_size; 1882 if (ptr + size < end) 1883 memmove_extent_buffer(leaf, ptr, ptr + size, 1884 end - ptr - size); 1885 item_size -= size; 1886 btrfs_truncate_item(fs_info, path, item_size, 1); 1887 } 1888 btrfs_mark_buffer_dirty(leaf); 1889 } 1890 1891 static noinline_for_stack 1892 int insert_inline_extent_backref(struct btrfs_trans_handle *trans, 1893 struct btrfs_fs_info *fs_info, 1894 struct btrfs_path *path, 1895 u64 bytenr, u64 num_bytes, u64 parent, 1896 u64 root_objectid, u64 owner, 1897 u64 offset, int refs_to_add, 1898 struct btrfs_delayed_extent_op *extent_op) 1899 { 1900 struct btrfs_extent_inline_ref *iref; 1901 int ret; 1902 1903 ret = lookup_inline_extent_backref(trans, fs_info, path, &iref, 1904 bytenr, num_bytes, parent, 1905 root_objectid, owner, offset, 1); 1906 if (ret == 0) { 1907 BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID); 1908 update_inline_extent_backref(fs_info, path, iref, 1909 refs_to_add, extent_op, NULL); 1910 } else if (ret == -ENOENT) { 1911 setup_inline_extent_backref(fs_info, path, iref, parent, 1912 root_objectid, owner, offset, 1913 refs_to_add, extent_op); 1914 ret = 0; 1915 } 1916 return ret; 1917 } 1918 1919 static int insert_extent_backref(struct btrfs_trans_handle *trans, 1920 struct btrfs_fs_info *fs_info, 1921 struct btrfs_path *path, 1922 u64 bytenr, u64 parent, u64 root_objectid, 1923 u64 owner, u64 offset, int refs_to_add) 1924 { 1925 int ret; 1926 if (owner < BTRFS_FIRST_FREE_OBJECTID) { 1927 BUG_ON(refs_to_add != 1); 1928 ret = 
insert_tree_block_ref(trans, fs_info, path, bytenr, 1929 parent, root_objectid); 1930 } else { 1931 ret = insert_extent_data_ref(trans, fs_info, path, bytenr, 1932 parent, root_objectid, 1933 owner, offset, refs_to_add); 1934 } 1935 return ret; 1936 } 1937 1938 static int remove_extent_backref(struct btrfs_trans_handle *trans, 1939 struct btrfs_fs_info *fs_info, 1940 struct btrfs_path *path, 1941 struct btrfs_extent_inline_ref *iref, 1942 int refs_to_drop, int is_data, int *last_ref) 1943 { 1944 int ret = 0; 1945 1946 BUG_ON(!is_data && refs_to_drop != 1); 1947 if (iref) { 1948 update_inline_extent_backref(fs_info, path, iref, 1949 -refs_to_drop, NULL, last_ref); 1950 } else if (is_data) { 1951 ret = remove_extent_data_ref(trans, fs_info, path, refs_to_drop, 1952 last_ref); 1953 } else { 1954 *last_ref = 1; 1955 ret = btrfs_del_item(trans, fs_info->extent_root, path); 1956 } 1957 return ret; 1958 } 1959 1960 #define in_range(b, first, len) ((b) >= (first) && (b) < (first) + (len)) 1961 static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len, 1962 u64 *discarded_bytes) 1963 { 1964 int j, ret = 0; 1965 u64 bytes_left, end; 1966 u64 aligned_start = ALIGN(start, 1 << 9); 1967 1968 if (WARN_ON(start != aligned_start)) { 1969 len -= aligned_start - start; 1970 len = round_down(len, 1 << 9); 1971 start = aligned_start; 1972 } 1973 1974 *discarded_bytes = 0; 1975 1976 if (!len) 1977 return 0; 1978 1979 end = start + len; 1980 bytes_left = len; 1981 1982 /* Skip any superblocks on this device. */ 1983 for (j = 0; j < BTRFS_SUPER_MIRROR_MAX; j++) { 1984 u64 sb_start = btrfs_sb_offset(j); 1985 u64 sb_end = sb_start + BTRFS_SUPER_INFO_SIZE; 1986 u64 size = sb_start - start; 1987 1988 if (!in_range(sb_start, start, bytes_left) && 1989 !in_range(sb_end, start, bytes_left) && 1990 !in_range(start, sb_start, BTRFS_SUPER_INFO_SIZE)) 1991 continue; 1992 1993 /* 1994 * Superblock spans beginning of range. Adjust start and 1995 * try again. 1996 */ 1997 if (sb_start <= start) { 1998 start += sb_end - start; 1999 if (start > end) { 2000 bytes_left = 0; 2001 break; 2002 } 2003 bytes_left = end - start; 2004 continue; 2005 } 2006 2007 if (size) { 2008 ret = blkdev_issue_discard(bdev, start >> 9, size >> 9, 2009 GFP_NOFS, 0); 2010 if (!ret) 2011 *discarded_bytes += size; 2012 else if (ret != -EOPNOTSUPP) 2013 return ret; 2014 } 2015 2016 start = sb_end; 2017 if (start > end) { 2018 bytes_left = 0; 2019 break; 2020 } 2021 bytes_left = end - start; 2022 } 2023 2024 if (bytes_left) { 2025 ret = blkdev_issue_discard(bdev, start >> 9, bytes_left >> 9, 2026 GFP_NOFS, 0); 2027 if (!ret) 2028 *discarded_bytes += bytes_left; 2029 } 2030 return ret; 2031 } 2032 2033 int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr, 2034 u64 num_bytes, u64 *actual_bytes) 2035 { 2036 int ret; 2037 u64 discarded_bytes = 0; 2038 struct btrfs_bio *bbio = NULL; 2039 2040 2041 /* 2042 * Avoid races with device replace and make sure our bbio has devices 2043 * associated to its stripes that don't go away while we are discarding. 
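 * The elevated bio counter taken right below
 * (btrfs_bio_counter_inc_blocked()) is what provides that guarantee;
 * it is dropped with btrfs_bio_counter_dec() once we are done walking
 * the stripes that btrfs_map_block(BTRFS_MAP_DISCARD) returned.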
2044 */ 2045 btrfs_bio_counter_inc_blocked(fs_info); 2046 /* Tell the block device(s) that the sectors can be discarded */ 2047 ret = btrfs_map_block(fs_info, BTRFS_MAP_DISCARD, bytenr, &num_bytes, 2048 &bbio, 0); 2049 /* Error condition is -ENOMEM */ 2050 if (!ret) { 2051 struct btrfs_bio_stripe *stripe = bbio->stripes; 2052 int i; 2053 2054 2055 for (i = 0; i < bbio->num_stripes; i++, stripe++) { 2056 u64 bytes; 2057 if (!stripe->dev->can_discard) 2058 continue; 2059 2060 ret = btrfs_issue_discard(stripe->dev->bdev, 2061 stripe->physical, 2062 stripe->length, 2063 &bytes); 2064 if (!ret) 2065 discarded_bytes += bytes; 2066 else if (ret != -EOPNOTSUPP) 2067 break; /* Logic errors or -ENOMEM, or -EIO but I don't know how that could happen JDM */ 2068 2069 /* 2070 * Just in case we get back EOPNOTSUPP for some reason, 2071 * just ignore the return value so we don't screw up 2072 * people calling discard_extent. 2073 */ 2074 ret = 0; 2075 } 2076 btrfs_put_bbio(bbio); 2077 } 2078 btrfs_bio_counter_dec(fs_info); 2079 2080 if (actual_bytes) 2081 *actual_bytes = discarded_bytes; 2082 2083 2084 if (ret == -EOPNOTSUPP) 2085 ret = 0; 2086 return ret; 2087 } 2088 2089 /* Can return -ENOMEM */ 2090 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, 2091 struct btrfs_fs_info *fs_info, 2092 u64 bytenr, u64 num_bytes, u64 parent, 2093 u64 root_objectid, u64 owner, u64 offset) 2094 { 2095 int ret; 2096 2097 BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID && 2098 root_objectid == BTRFS_TREE_LOG_OBJECTID); 2099 2100 if (owner < BTRFS_FIRST_FREE_OBJECTID) { 2101 ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr, 2102 num_bytes, 2103 parent, root_objectid, (int)owner, 2104 BTRFS_ADD_DELAYED_REF, NULL); 2105 } else { 2106 ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr, 2107 num_bytes, parent, root_objectid, 2108 owner, offset, 0, 2109 BTRFS_ADD_DELAYED_REF); 2110 } 2111 return ret; 2112 } 2113 2114 static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, 2115 struct btrfs_fs_info *fs_info, 2116 struct btrfs_delayed_ref_node *node, 2117 u64 parent, u64 root_objectid, 2118 u64 owner, u64 offset, int refs_to_add, 2119 struct btrfs_delayed_extent_op *extent_op) 2120 { 2121 struct btrfs_path *path; 2122 struct extent_buffer *leaf; 2123 struct btrfs_extent_item *item; 2124 struct btrfs_key key; 2125 u64 bytenr = node->bytenr; 2126 u64 num_bytes = node->num_bytes; 2127 u64 refs; 2128 int ret; 2129 2130 path = btrfs_alloc_path(); 2131 if (!path) 2132 return -ENOMEM; 2133 2134 path->reada = READA_FORWARD; 2135 path->leave_spinning = 1; 2136 /* this will setup the path even if it fails to insert the back ref */ 2137 ret = insert_inline_extent_backref(trans, fs_info, path, bytenr, 2138 num_bytes, parent, root_objectid, 2139 owner, offset, 2140 refs_to_add, extent_op); 2141 if ((ret < 0 && ret != -EAGAIN) || !ret) 2142 goto out; 2143 2144 /* 2145 * Ok we had -EAGAIN which means we didn't have space to insert and 2146 * inline extent ref, so just update the reference count and add a 2147 * normal backref. 
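 * The failed lookup left the path pointing at the extent item, so we
 * first bump btrfs_extent_refs() in place (and apply any pending
 * extent op), then release the path and insert a separate keyed
 * backref item via insert_extent_backref() below.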
2148 */ 2149 leaf = path->nodes[0]; 2150 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 2151 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 2152 refs = btrfs_extent_refs(leaf, item); 2153 btrfs_set_extent_refs(leaf, item, refs + refs_to_add); 2154 if (extent_op) 2155 __run_delayed_extent_op(extent_op, leaf, item); 2156 2157 btrfs_mark_buffer_dirty(leaf); 2158 btrfs_release_path(path); 2159 2160 path->reada = READA_FORWARD; 2161 path->leave_spinning = 1; 2162 /* now insert the actual backref */ 2163 ret = insert_extent_backref(trans, fs_info, path, bytenr, parent, 2164 root_objectid, owner, offset, refs_to_add); 2165 if (ret) 2166 btrfs_abort_transaction(trans, ret); 2167 out: 2168 btrfs_free_path(path); 2169 return ret; 2170 } 2171 2172 static int run_delayed_data_ref(struct btrfs_trans_handle *trans, 2173 struct btrfs_fs_info *fs_info, 2174 struct btrfs_delayed_ref_node *node, 2175 struct btrfs_delayed_extent_op *extent_op, 2176 int insert_reserved) 2177 { 2178 int ret = 0; 2179 struct btrfs_delayed_data_ref *ref; 2180 struct btrfs_key ins; 2181 u64 parent = 0; 2182 u64 ref_root = 0; 2183 u64 flags = 0; 2184 2185 ins.objectid = node->bytenr; 2186 ins.offset = node->num_bytes; 2187 ins.type = BTRFS_EXTENT_ITEM_KEY; 2188 2189 ref = btrfs_delayed_node_to_data_ref(node); 2190 trace_run_delayed_data_ref(fs_info, node, ref, node->action); 2191 2192 if (node->type == BTRFS_SHARED_DATA_REF_KEY) 2193 parent = ref->parent; 2194 ref_root = ref->root; 2195 2196 if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) { 2197 if (extent_op) 2198 flags |= extent_op->flags_to_set; 2199 ret = alloc_reserved_file_extent(trans, fs_info, 2200 parent, ref_root, flags, 2201 ref->objectid, ref->offset, 2202 &ins, node->ref_mod); 2203 } else if (node->action == BTRFS_ADD_DELAYED_REF) { 2204 ret = __btrfs_inc_extent_ref(trans, fs_info, node, parent, 2205 ref_root, ref->objectid, 2206 ref->offset, node->ref_mod, 2207 extent_op); 2208 } else if (node->action == BTRFS_DROP_DELAYED_REF) { 2209 ret = __btrfs_free_extent(trans, fs_info, node, parent, 2210 ref_root, ref->objectid, 2211 ref->offset, node->ref_mod, 2212 extent_op); 2213 } else { 2214 BUG(); 2215 } 2216 return ret; 2217 } 2218 2219 static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op, 2220 struct extent_buffer *leaf, 2221 struct btrfs_extent_item *ei) 2222 { 2223 u64 flags = btrfs_extent_flags(leaf, ei); 2224 if (extent_op->update_flags) { 2225 flags |= extent_op->flags_to_set; 2226 btrfs_set_extent_flags(leaf, ei, flags); 2227 } 2228 2229 if (extent_op->update_key) { 2230 struct btrfs_tree_block_info *bi; 2231 BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)); 2232 bi = (struct btrfs_tree_block_info *)(ei + 1); 2233 btrfs_set_tree_block_key(leaf, bi, &extent_op->key); 2234 } 2235 } 2236 2237 static int run_delayed_extent_op(struct btrfs_trans_handle *trans, 2238 struct btrfs_fs_info *fs_info, 2239 struct btrfs_delayed_ref_node *node, 2240 struct btrfs_delayed_extent_op *extent_op) 2241 { 2242 struct btrfs_key key; 2243 struct btrfs_path *path; 2244 struct btrfs_extent_item *ei; 2245 struct extent_buffer *leaf; 2246 u32 item_size; 2247 int ret; 2248 int err = 0; 2249 int metadata = !extent_op->is_data; 2250 2251 if (trans->aborted) 2252 return 0; 2253 2254 if (metadata && !btrfs_fs_incompat(fs_info, SKINNY_METADATA)) 2255 metadata = 0; 2256 2257 path = btrfs_alloc_path(); 2258 if (!path) 2259 return -ENOMEM; 2260 2261 key.objectid = node->bytenr; 2262 2263 if (metadata) { 2264 key.type = 
BTRFS_METADATA_ITEM_KEY; 2265 key.offset = extent_op->level; 2266 } else { 2267 key.type = BTRFS_EXTENT_ITEM_KEY; 2268 key.offset = node->num_bytes; 2269 } 2270 2271 again: 2272 path->reada = READA_FORWARD; 2273 path->leave_spinning = 1; 2274 ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 1); 2275 if (ret < 0) { 2276 err = ret; 2277 goto out; 2278 } 2279 if (ret > 0) { 2280 if (metadata) { 2281 if (path->slots[0] > 0) { 2282 path->slots[0]--; 2283 btrfs_item_key_to_cpu(path->nodes[0], &key, 2284 path->slots[0]); 2285 if (key.objectid == node->bytenr && 2286 key.type == BTRFS_EXTENT_ITEM_KEY && 2287 key.offset == node->num_bytes) 2288 ret = 0; 2289 } 2290 if (ret > 0) { 2291 btrfs_release_path(path); 2292 metadata = 0; 2293 2294 key.objectid = node->bytenr; 2295 key.offset = node->num_bytes; 2296 key.type = BTRFS_EXTENT_ITEM_KEY; 2297 goto again; 2298 } 2299 } else { 2300 err = -EIO; 2301 goto out; 2302 } 2303 } 2304 2305 leaf = path->nodes[0]; 2306 item_size = btrfs_item_size_nr(leaf, path->slots[0]); 2307 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 2308 if (item_size < sizeof(*ei)) { 2309 ret = convert_extent_item_v0(trans, fs_info, path, (u64)-1, 0); 2310 if (ret < 0) { 2311 err = ret; 2312 goto out; 2313 } 2314 leaf = path->nodes[0]; 2315 item_size = btrfs_item_size_nr(leaf, path->slots[0]); 2316 } 2317 #endif 2318 BUG_ON(item_size < sizeof(*ei)); 2319 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 2320 __run_delayed_extent_op(extent_op, leaf, ei); 2321 2322 btrfs_mark_buffer_dirty(leaf); 2323 out: 2324 btrfs_free_path(path); 2325 return err; 2326 } 2327 2328 static int run_delayed_tree_ref(struct btrfs_trans_handle *trans, 2329 struct btrfs_fs_info *fs_info, 2330 struct btrfs_delayed_ref_node *node, 2331 struct btrfs_delayed_extent_op *extent_op, 2332 int insert_reserved) 2333 { 2334 int ret = 0; 2335 struct btrfs_delayed_tree_ref *ref; 2336 struct btrfs_key ins; 2337 u64 parent = 0; 2338 u64 ref_root = 0; 2339 bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA); 2340 2341 ref = btrfs_delayed_node_to_tree_ref(node); 2342 trace_run_delayed_tree_ref(fs_info, node, ref, node->action); 2343 2344 if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) 2345 parent = ref->parent; 2346 ref_root = ref->root; 2347 2348 ins.objectid = node->bytenr; 2349 if (skinny_metadata) { 2350 ins.offset = ref->level; 2351 ins.type = BTRFS_METADATA_ITEM_KEY; 2352 } else { 2353 ins.offset = node->num_bytes; 2354 ins.type = BTRFS_EXTENT_ITEM_KEY; 2355 } 2356 2357 if (node->ref_mod != 1) { 2358 btrfs_err(fs_info, 2359 "btree block(%llu) has %d references rather than 1: action %d ref_root %llu parent %llu", 2360 node->bytenr, node->ref_mod, node->action, ref_root, 2361 parent); 2362 return -EIO; 2363 } 2364 if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) { 2365 BUG_ON(!extent_op || !extent_op->update_flags); 2366 ret = alloc_reserved_tree_block(trans, fs_info, 2367 parent, ref_root, 2368 extent_op->flags_to_set, 2369 &extent_op->key, 2370 ref->level, &ins); 2371 } else if (node->action == BTRFS_ADD_DELAYED_REF) { 2372 ret = __btrfs_inc_extent_ref(trans, fs_info, node, 2373 parent, ref_root, 2374 ref->level, 0, 1, 2375 extent_op); 2376 } else if (node->action == BTRFS_DROP_DELAYED_REF) { 2377 ret = __btrfs_free_extent(trans, fs_info, node, 2378 parent, ref_root, 2379 ref->level, 0, 1, extent_op); 2380 } else { 2381 BUG(); 2382 } 2383 return ret; 2384 } 2385 2386 /* helper function to actually process a single delayed ref entry */ 2387 static int 
run_one_delayed_ref(struct btrfs_trans_handle *trans, 2388 struct btrfs_fs_info *fs_info, 2389 struct btrfs_delayed_ref_node *node, 2390 struct btrfs_delayed_extent_op *extent_op, 2391 int insert_reserved) 2392 { 2393 int ret = 0; 2394 2395 if (trans->aborted) { 2396 if (insert_reserved) 2397 btrfs_pin_extent(fs_info, node->bytenr, 2398 node->num_bytes, 1); 2399 return 0; 2400 } 2401 2402 if (btrfs_delayed_ref_is_head(node)) { 2403 struct btrfs_delayed_ref_head *head; 2404 /* 2405 * we've hit the end of the chain and we were supposed 2406 * to insert this extent into the tree. But, it got 2407 * deleted before we ever needed to insert it, so all 2408 * we have to do is clean up the accounting 2409 */ 2410 BUG_ON(extent_op); 2411 head = btrfs_delayed_node_to_head(node); 2412 trace_run_delayed_ref_head(fs_info, node, head, node->action); 2413 2414 if (insert_reserved) { 2415 btrfs_pin_extent(fs_info, node->bytenr, 2416 node->num_bytes, 1); 2417 if (head->is_data) { 2418 ret = btrfs_del_csums(trans, fs_info, 2419 node->bytenr, 2420 node->num_bytes); 2421 } 2422 } 2423 2424 /* Also free its reserved qgroup space */ 2425 btrfs_qgroup_free_delayed_ref(fs_info, head->qgroup_ref_root, 2426 head->qgroup_reserved); 2427 return ret; 2428 } 2429 2430 if (node->type == BTRFS_TREE_BLOCK_REF_KEY || 2431 node->type == BTRFS_SHARED_BLOCK_REF_KEY) 2432 ret = run_delayed_tree_ref(trans, fs_info, node, extent_op, 2433 insert_reserved); 2434 else if (node->type == BTRFS_EXTENT_DATA_REF_KEY || 2435 node->type == BTRFS_SHARED_DATA_REF_KEY) 2436 ret = run_delayed_data_ref(trans, fs_info, node, extent_op, 2437 insert_reserved); 2438 else 2439 BUG(); 2440 return ret; 2441 } 2442 2443 static inline struct btrfs_delayed_ref_node * 2444 select_delayed_ref(struct btrfs_delayed_ref_head *head) 2445 { 2446 struct btrfs_delayed_ref_node *ref; 2447 2448 if (list_empty(&head->ref_list)) 2449 return NULL; 2450 2451 /* 2452 * Select a delayed ref of type BTRFS_ADD_DELAYED_REF first. 2453 * This is to prevent a ref count from going down to zero, which deletes 2454 * the extent item from the extent tree, when there still are references 2455 * to add, which would fail because they would not find the extent item. 2456 */ 2457 if (!list_empty(&head->ref_add_list)) 2458 return list_first_entry(&head->ref_add_list, 2459 struct btrfs_delayed_ref_node, add_list); 2460 2461 ref = list_first_entry(&head->ref_list, struct btrfs_delayed_ref_node, 2462 list); 2463 ASSERT(list_empty(&ref->add_list)); 2464 return ref; 2465 } 2466 2467 /* 2468 * Returns 0 on success or if called with an already aborted transaction. 2469 * Returns -ENOMEM or -EIO on failure and will abort the transaction. 
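 * 'nr' caps how many delayed ref nodes are processed in this call;
 * btrfs_run_delayed_refs() passes either a finite target or
 * (unsigned long)-1 when it intends to drain the tree completely and
 * loop until nothing is left.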
2470 */ 2471 static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, 2472 struct btrfs_fs_info *fs_info, 2473 unsigned long nr) 2474 { 2475 struct btrfs_delayed_ref_root *delayed_refs; 2476 struct btrfs_delayed_ref_node *ref; 2477 struct btrfs_delayed_ref_head *locked_ref = NULL; 2478 struct btrfs_delayed_extent_op *extent_op; 2479 ktime_t start = ktime_get(); 2480 int ret; 2481 unsigned long count = 0; 2482 unsigned long actual_count = 0; 2483 int must_insert_reserved = 0; 2484 2485 delayed_refs = &trans->transaction->delayed_refs; 2486 while (1) { 2487 if (!locked_ref) { 2488 if (count >= nr) 2489 break; 2490 2491 spin_lock(&delayed_refs->lock); 2492 locked_ref = btrfs_select_ref_head(trans); 2493 if (!locked_ref) { 2494 spin_unlock(&delayed_refs->lock); 2495 break; 2496 } 2497 2498 /* grab the lock that says we are going to process 2499 * all the refs for this head */ 2500 ret = btrfs_delayed_ref_lock(trans, locked_ref); 2501 spin_unlock(&delayed_refs->lock); 2502 /* 2503 * we may have dropped the spin lock to get the head 2504 * mutex lock, and that might have given someone else 2505 * time to free the head. If that's true, it has been 2506 * removed from our list and we can move on. 2507 */ 2508 if (ret == -EAGAIN) { 2509 locked_ref = NULL; 2510 count++; 2511 continue; 2512 } 2513 } 2514 2515 /* 2516 * We need to try and merge add/drops of the same ref since we 2517 * can run into issues with relocate dropping the implicit ref 2518 * and then it being added back again before the drop can 2519 * finish. If we merged anything we need to re-loop so we can 2520 * get a good ref. 2521 * Or we can get node references of the same type that weren't 2522 * merged when created due to bumps in the tree mod seq, and 2523 * we need to merge them to prevent adding an inline extent 2524 * backref before dropping it (triggering a BUG_ON at 2525 * insert_inline_extent_backref()). 2526 */ 2527 spin_lock(&locked_ref->lock); 2528 btrfs_merge_delayed_refs(trans, fs_info, delayed_refs, 2529 locked_ref); 2530 2531 /* 2532 * locked_ref is the head node, so we have to go one 2533 * node back for any delayed ref updates 2534 */ 2535 ref = select_delayed_ref(locked_ref); 2536 2537 if (ref && ref->seq && 2538 btrfs_check_delayed_seq(fs_info, delayed_refs, ref->seq)) { 2539 spin_unlock(&locked_ref->lock); 2540 spin_lock(&delayed_refs->lock); 2541 locked_ref->processing = 0; 2542 delayed_refs->num_heads_ready++; 2543 spin_unlock(&delayed_refs->lock); 2544 btrfs_delayed_ref_unlock(locked_ref); 2545 locked_ref = NULL; 2546 cond_resched(); 2547 count++; 2548 continue; 2549 } 2550 2551 /* 2552 * record the must insert reserved flag before we 2553 * drop the spin lock. 
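 * must_insert_reserved means this head is for an extent allocated in
 * the current transaction whose extent item still has to be inserted
 * (run_one_delayed_ref() gets it as insert_reserved).  Stash it in a
 * local and clear it on the head; the error paths below restore it so
 * the reserved space can still be cleaned up if we bail out.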
2554 */ 2555 must_insert_reserved = locked_ref->must_insert_reserved; 2556 locked_ref->must_insert_reserved = 0; 2557 2558 extent_op = locked_ref->extent_op; 2559 locked_ref->extent_op = NULL; 2560 2561 if (!ref) { 2562 2563 2564 /* All delayed refs have been processed, Go ahead 2565 * and send the head node to run_one_delayed_ref, 2566 * so that any accounting fixes can happen 2567 */ 2568 ref = &locked_ref->node; 2569 2570 if (extent_op && must_insert_reserved) { 2571 btrfs_free_delayed_extent_op(extent_op); 2572 extent_op = NULL; 2573 } 2574 2575 if (extent_op) { 2576 spin_unlock(&locked_ref->lock); 2577 ret = run_delayed_extent_op(trans, fs_info, 2578 ref, extent_op); 2579 btrfs_free_delayed_extent_op(extent_op); 2580 2581 if (ret) { 2582 /* 2583 * Need to reset must_insert_reserved if 2584 * there was an error so the abort stuff 2585 * can cleanup the reserved space 2586 * properly. 2587 */ 2588 if (must_insert_reserved) 2589 locked_ref->must_insert_reserved = 1; 2590 spin_lock(&delayed_refs->lock); 2591 locked_ref->processing = 0; 2592 delayed_refs->num_heads_ready++; 2593 spin_unlock(&delayed_refs->lock); 2594 btrfs_debug(fs_info, 2595 "run_delayed_extent_op returned %d", 2596 ret); 2597 btrfs_delayed_ref_unlock(locked_ref); 2598 return ret; 2599 } 2600 continue; 2601 } 2602 2603 /* 2604 * Need to drop our head ref lock and re-acquire the 2605 * delayed ref lock and then re-check to make sure 2606 * nobody got added. 2607 */ 2608 spin_unlock(&locked_ref->lock); 2609 spin_lock(&delayed_refs->lock); 2610 spin_lock(&locked_ref->lock); 2611 if (!list_empty(&locked_ref->ref_list) || 2612 locked_ref->extent_op) { 2613 spin_unlock(&locked_ref->lock); 2614 spin_unlock(&delayed_refs->lock); 2615 continue; 2616 } 2617 ref->in_tree = 0; 2618 delayed_refs->num_heads--; 2619 rb_erase(&locked_ref->href_node, 2620 &delayed_refs->href_root); 2621 spin_unlock(&delayed_refs->lock); 2622 } else { 2623 actual_count++; 2624 ref->in_tree = 0; 2625 list_del(&ref->list); 2626 if (!list_empty(&ref->add_list)) 2627 list_del(&ref->add_list); 2628 } 2629 atomic_dec(&delayed_refs->num_entries); 2630 2631 if (!btrfs_delayed_ref_is_head(ref)) { 2632 /* 2633 * when we play the delayed ref, also correct the 2634 * ref_mod on head 2635 */ 2636 switch (ref->action) { 2637 case BTRFS_ADD_DELAYED_REF: 2638 case BTRFS_ADD_DELAYED_EXTENT: 2639 locked_ref->node.ref_mod -= ref->ref_mod; 2640 break; 2641 case BTRFS_DROP_DELAYED_REF: 2642 locked_ref->node.ref_mod += ref->ref_mod; 2643 break; 2644 default: 2645 WARN_ON(1); 2646 } 2647 } 2648 spin_unlock(&locked_ref->lock); 2649 2650 ret = run_one_delayed_ref(trans, fs_info, ref, extent_op, 2651 must_insert_reserved); 2652 2653 btrfs_free_delayed_extent_op(extent_op); 2654 if (ret) { 2655 spin_lock(&delayed_refs->lock); 2656 locked_ref->processing = 0; 2657 delayed_refs->num_heads_ready++; 2658 spin_unlock(&delayed_refs->lock); 2659 btrfs_delayed_ref_unlock(locked_ref); 2660 btrfs_put_delayed_ref(ref); 2661 btrfs_debug(fs_info, "run_one_delayed_ref returned %d", 2662 ret); 2663 return ret; 2664 } 2665 2666 /* 2667 * If this node is a head, that means all the refs in this head 2668 * have been dealt with, and we will pick the next head to deal 2669 * with, so we must unlock the head and drop it from the cluster 2670 * list before we release it. 
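 * For data heads whose net ref count change was negative we also give
 * the byte count back to delayed_refs->pending_csums below, since the
 * csum deletions this head implied are no longer pending work that
 * btrfs_check_space_for_delayed_refs() has to reserve for.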
2671 */ 2672 if (btrfs_delayed_ref_is_head(ref)) { 2673 if (locked_ref->is_data && 2674 locked_ref->total_ref_mod < 0) { 2675 spin_lock(&delayed_refs->lock); 2676 delayed_refs->pending_csums -= ref->num_bytes; 2677 spin_unlock(&delayed_refs->lock); 2678 } 2679 btrfs_delayed_ref_unlock(locked_ref); 2680 locked_ref = NULL; 2681 } 2682 btrfs_put_delayed_ref(ref); 2683 count++; 2684 cond_resched(); 2685 } 2686 2687 /* 2688 * We don't want to include ref heads since we can have empty ref heads 2689 * and those will drastically skew our runtime down since we just do 2690 * accounting, no actual extent tree updates. 2691 */ 2692 if (actual_count > 0) { 2693 u64 runtime = ktime_to_ns(ktime_sub(ktime_get(), start)); 2694 u64 avg; 2695 2696 /* 2697 * We weigh the current average higher than our current runtime 2698 * to avoid large swings in the average. 2699 */ 2700 spin_lock(&delayed_refs->lock); 2701 avg = fs_info->avg_delayed_ref_runtime * 3 + runtime; 2702 fs_info->avg_delayed_ref_runtime = avg >> 2; /* div by 4 */ 2703 spin_unlock(&delayed_refs->lock); 2704 } 2705 return 0; 2706 } 2707 2708 #ifdef SCRAMBLE_DELAYED_REFS 2709 /* 2710 * Normally delayed refs get processed in ascending bytenr order. This 2711 * correlates in most cases to the order added. To expose dependencies on this 2712 * order, we start to process the tree in the middle instead of the beginning 2713 */ 2714 static u64 find_middle(struct rb_root *root) 2715 { 2716 struct rb_node *n = root->rb_node; 2717 struct btrfs_delayed_ref_node *entry; 2718 int alt = 1; 2719 u64 middle; 2720 u64 first = 0, last = 0; 2721 2722 n = rb_first(root); 2723 if (n) { 2724 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node); 2725 first = entry->bytenr; 2726 } 2727 n = rb_last(root); 2728 if (n) { 2729 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node); 2730 last = entry->bytenr; 2731 } 2732 n = root->rb_node; 2733 2734 while (n) { 2735 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node); 2736 WARN_ON(!entry->in_tree); 2737 2738 middle = entry->bytenr; 2739 2740 if (alt) 2741 n = n->rb_left; 2742 else 2743 n = n->rb_right; 2744 2745 alt = 1 - alt; 2746 } 2747 return middle; 2748 } 2749 #endif 2750 2751 static inline u64 heads_to_leaves(struct btrfs_fs_info *fs_info, u64 heads) 2752 { 2753 u64 num_bytes; 2754 2755 num_bytes = heads * (sizeof(struct btrfs_extent_item) + 2756 sizeof(struct btrfs_extent_inline_ref)); 2757 if (!btrfs_fs_incompat(fs_info, SKINNY_METADATA)) 2758 num_bytes += heads * sizeof(struct btrfs_tree_block_info); 2759 2760 /* 2761 * We don't ever fill up leaves all the way so multiply by 2 just to be 2762 * closer to what we're really going to want to use. 2763 */ 2764 return div_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(fs_info)); 2765 } 2766 2767 /* 2768 * Takes the number of bytes to be csumm'ed and figures out how many leaves it 2769 * would require to store the csums for that many bytes. 
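 *
 * Roughly one csum per sectorsize bytes of data, with about
 * BTRFS_MAX_ITEM_SIZE / csum_size csums fitting in a leaf.  E.g. with
 * 4KiB sectors, 4 byte crc32c csums and 16KiB leaves that is ~4k
 * csums per leaf, so 1GiB of data (256k csums) comes out to roughly
 * 65 leaves.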
2770 */ 2771 u64 btrfs_csum_bytes_to_leaves(struct btrfs_fs_info *fs_info, u64 csum_bytes) 2772 { 2773 u64 csum_size; 2774 u64 num_csums_per_leaf; 2775 u64 num_csums; 2776 2777 csum_size = BTRFS_MAX_ITEM_SIZE(fs_info); 2778 num_csums_per_leaf = div64_u64(csum_size, 2779 (u64)btrfs_super_csum_size(fs_info->super_copy)); 2780 num_csums = div64_u64(csum_bytes, fs_info->sectorsize); 2781 num_csums += num_csums_per_leaf - 1; 2782 num_csums = div64_u64(num_csums, num_csums_per_leaf); 2783 return num_csums; 2784 } 2785 2786 int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans, 2787 struct btrfs_fs_info *fs_info) 2788 { 2789 struct btrfs_block_rsv *global_rsv; 2790 u64 num_heads = trans->transaction->delayed_refs.num_heads_ready; 2791 u64 csum_bytes = trans->transaction->delayed_refs.pending_csums; 2792 u64 num_dirty_bgs = trans->transaction->num_dirty_bgs; 2793 u64 num_bytes, num_dirty_bgs_bytes; 2794 int ret = 0; 2795 2796 num_bytes = btrfs_calc_trans_metadata_size(fs_info, 1); 2797 num_heads = heads_to_leaves(fs_info, num_heads); 2798 if (num_heads > 1) 2799 num_bytes += (num_heads - 1) * fs_info->nodesize; 2800 num_bytes <<= 1; 2801 num_bytes += btrfs_csum_bytes_to_leaves(fs_info, csum_bytes) * 2802 fs_info->nodesize; 2803 num_dirty_bgs_bytes = btrfs_calc_trans_metadata_size(fs_info, 2804 num_dirty_bgs); 2805 global_rsv = &fs_info->global_block_rsv; 2806 2807 /* 2808 * If we can't allocate any more chunks lets make sure we have _lots_ of 2809 * wiggle room since running delayed refs can create more delayed refs. 2810 */ 2811 if (global_rsv->space_info->full) { 2812 num_dirty_bgs_bytes <<= 1; 2813 num_bytes <<= 1; 2814 } 2815 2816 spin_lock(&global_rsv->lock); 2817 if (global_rsv->reserved <= num_bytes + num_dirty_bgs_bytes) 2818 ret = 1; 2819 spin_unlock(&global_rsv->lock); 2820 return ret; 2821 } 2822 2823 int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans, 2824 struct btrfs_fs_info *fs_info) 2825 { 2826 u64 num_entries = 2827 atomic_read(&trans->transaction->delayed_refs.num_entries); 2828 u64 avg_runtime; 2829 u64 val; 2830 2831 smp_mb(); 2832 avg_runtime = fs_info->avg_delayed_ref_runtime; 2833 val = num_entries * avg_runtime; 2834 if (val >= NSEC_PER_SEC) 2835 return 1; 2836 if (val >= NSEC_PER_SEC / 2) 2837 return 2; 2838 2839 return btrfs_check_space_for_delayed_refs(trans, fs_info); 2840 } 2841 2842 struct async_delayed_refs { 2843 struct btrfs_root *root; 2844 u64 transid; 2845 int count; 2846 int error; 2847 int sync; 2848 struct completion wait; 2849 struct btrfs_work work; 2850 }; 2851 2852 static inline struct async_delayed_refs * 2853 to_async_delayed_refs(struct btrfs_work *work) 2854 { 2855 return container_of(work, struct async_delayed_refs, work); 2856 } 2857 2858 static void delayed_ref_async_start(struct btrfs_work *work) 2859 { 2860 struct async_delayed_refs *async = to_async_delayed_refs(work); 2861 struct btrfs_trans_handle *trans; 2862 struct btrfs_fs_info *fs_info = async->root->fs_info; 2863 int ret; 2864 2865 /* if the commit is already started, we don't need to wait here */ 2866 if (btrfs_transaction_blocked(fs_info)) 2867 goto done; 2868 2869 trans = btrfs_join_transaction(async->root); 2870 if (IS_ERR(trans)) { 2871 async->error = PTR_ERR(trans); 2872 goto done; 2873 } 2874 2875 /* 2876 * trans->sync means that when we call end_transaction, we won't 2877 * wait on delayed refs 2878 */ 2879 trans->sync = true; 2880 2881 /* Don't bother flushing if we got into a different transaction */ 2882 if (trans->transid > 
async->transid) 2883 goto end; 2884 2885 ret = btrfs_run_delayed_refs(trans, fs_info, async->count); 2886 if (ret) 2887 async->error = ret; 2888 end: 2889 ret = btrfs_end_transaction(trans); 2890 if (ret && !async->error) 2891 async->error = ret; 2892 done: 2893 if (async->sync) 2894 complete(&async->wait); 2895 else 2896 kfree(async); 2897 } 2898 2899 int btrfs_async_run_delayed_refs(struct btrfs_fs_info *fs_info, 2900 unsigned long count, u64 transid, int wait) 2901 { 2902 struct async_delayed_refs *async; 2903 int ret; 2904 2905 async = kmalloc(sizeof(*async), GFP_NOFS); 2906 if (!async) 2907 return -ENOMEM; 2908 2909 async->root = fs_info->tree_root; 2910 async->count = count; 2911 async->error = 0; 2912 async->transid = transid; 2913 if (wait) 2914 async->sync = 1; 2915 else 2916 async->sync = 0; 2917 init_completion(&async->wait); 2918 2919 btrfs_init_work(&async->work, btrfs_extent_refs_helper, 2920 delayed_ref_async_start, NULL, NULL); 2921 2922 btrfs_queue_work(fs_info->extent_workers, &async->work); 2923 2924 if (wait) { 2925 wait_for_completion(&async->wait); 2926 ret = async->error; 2927 kfree(async); 2928 return ret; 2929 } 2930 return 0; 2931 } 2932 2933 /* 2934 * this starts processing the delayed reference count updates and 2935 * extent insertions we have queued up so far. count can be 2936 * 0, which means to process everything in the tree at the start 2937 * of the run (but not newly added entries), or it can be some target 2938 * number you'd like to process. 2939 * 2940 * Returns 0 on success or if called with an aborted transaction 2941 * Returns <0 on error and aborts the transaction 2942 */ 2943 int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, 2944 struct btrfs_fs_info *fs_info, unsigned long count) 2945 { 2946 struct rb_node *node; 2947 struct btrfs_delayed_ref_root *delayed_refs; 2948 struct btrfs_delayed_ref_head *head; 2949 int ret; 2950 int run_all = count == (unsigned long)-1; 2951 bool can_flush_pending_bgs = trans->can_flush_pending_bgs; 2952 2953 /* We'll clean this up in btrfs_cleanup_transaction */ 2954 if (trans->aborted) 2955 return 0; 2956 2957 if (test_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags)) 2958 return 0; 2959 2960 delayed_refs = &trans->transaction->delayed_refs; 2961 if (count == 0) 2962 count = atomic_read(&delayed_refs->num_entries) * 2; 2963 2964 again: 2965 #ifdef SCRAMBLE_DELAYED_REFS 2966 delayed_refs->run_delayed_start = find_middle(&delayed_refs->root); 2967 #endif 2968 trans->can_flush_pending_bgs = false; 2969 ret = __btrfs_run_delayed_refs(trans, fs_info, count); 2970 if (ret < 0) { 2971 btrfs_abort_transaction(trans, ret); 2972 return ret; 2973 } 2974 2975 if (run_all) { 2976 if (!list_empty(&trans->new_bgs)) 2977 btrfs_create_pending_block_groups(trans, fs_info); 2978 2979 spin_lock(&delayed_refs->lock); 2980 node = rb_first(&delayed_refs->href_root); 2981 if (!node) { 2982 spin_unlock(&delayed_refs->lock); 2983 goto out; 2984 } 2985 2986 while (node) { 2987 head = rb_entry(node, struct btrfs_delayed_ref_head, 2988 href_node); 2989 if (btrfs_delayed_ref_is_head(&head->node)) { 2990 struct btrfs_delayed_ref_node *ref; 2991 2992 ref = &head->node; 2993 refcount_inc(&ref->refs); 2994 2995 spin_unlock(&delayed_refs->lock); 2996 /* 2997 * Mutex was contended, block until it's 2998 * released and try again 2999 */ 3000 mutex_lock(&head->mutex); 3001 mutex_unlock(&head->mutex); 3002 3003 btrfs_put_delayed_ref(ref); 3004 cond_resched(); 3005 goto again; 3006 } else { 3007 WARN_ON(1); 3008 } 3009 node = 
rb_next(node); 3010 } 3011 spin_unlock(&delayed_refs->lock); 3012 cond_resched(); 3013 goto again; 3014 } 3015 out: 3016 trans->can_flush_pending_bgs = can_flush_pending_bgs; 3017 return 0; 3018 } 3019 3020 int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, 3021 struct btrfs_fs_info *fs_info, 3022 u64 bytenr, u64 num_bytes, u64 flags, 3023 int level, int is_data) 3024 { 3025 struct btrfs_delayed_extent_op *extent_op; 3026 int ret; 3027 3028 extent_op = btrfs_alloc_delayed_extent_op(); 3029 if (!extent_op) 3030 return -ENOMEM; 3031 3032 extent_op->flags_to_set = flags; 3033 extent_op->update_flags = true; 3034 extent_op->update_key = false; 3035 extent_op->is_data = is_data ? true : false; 3036 extent_op->level = level; 3037 3038 ret = btrfs_add_delayed_extent_op(fs_info, trans, bytenr, 3039 num_bytes, extent_op); 3040 if (ret) 3041 btrfs_free_delayed_extent_op(extent_op); 3042 return ret; 3043 } 3044 3045 static noinline int check_delayed_ref(struct btrfs_root *root, 3046 struct btrfs_path *path, 3047 u64 objectid, u64 offset, u64 bytenr) 3048 { 3049 struct btrfs_delayed_ref_head *head; 3050 struct btrfs_delayed_ref_node *ref; 3051 struct btrfs_delayed_data_ref *data_ref; 3052 struct btrfs_delayed_ref_root *delayed_refs; 3053 struct btrfs_transaction *cur_trans; 3054 int ret = 0; 3055 3056 cur_trans = root->fs_info->running_transaction; 3057 if (!cur_trans) 3058 return 0; 3059 3060 delayed_refs = &cur_trans->delayed_refs; 3061 spin_lock(&delayed_refs->lock); 3062 head = btrfs_find_delayed_ref_head(delayed_refs, bytenr); 3063 if (!head) { 3064 spin_unlock(&delayed_refs->lock); 3065 return 0; 3066 } 3067 3068 if (!mutex_trylock(&head->mutex)) { 3069 refcount_inc(&head->node.refs); 3070 spin_unlock(&delayed_refs->lock); 3071 3072 btrfs_release_path(path); 3073 3074 /* 3075 * Mutex was contended, block until it's released and let 3076 * caller try again 3077 */ 3078 mutex_lock(&head->mutex); 3079 mutex_unlock(&head->mutex); 3080 btrfs_put_delayed_ref(&head->node); 3081 return -EAGAIN; 3082 } 3083 spin_unlock(&delayed_refs->lock); 3084 3085 spin_lock(&head->lock); 3086 list_for_each_entry(ref, &head->ref_list, list) { 3087 /* If it's a shared ref we know a cross reference exists */ 3088 if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) { 3089 ret = 1; 3090 break; 3091 } 3092 3093 data_ref = btrfs_delayed_node_to_data_ref(ref); 3094 3095 /* 3096 * If our ref doesn't match the one we're currently looking at 3097 * then we have a cross reference. 
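 * A mismatch in root, objectid or offset means some other tree or
 * inode still references these bytes, so btrfs_cross_ref_exist()
 * (the caller, used among others by the nocow checks) must treat the
 * extent as shared.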
3098 */ 3099 if (data_ref->root != root->root_key.objectid || 3100 data_ref->objectid != objectid || 3101 data_ref->offset != offset) { 3102 ret = 1; 3103 break; 3104 } 3105 } 3106 spin_unlock(&head->lock); 3107 mutex_unlock(&head->mutex); 3108 return ret; 3109 } 3110 3111 static noinline int check_committed_ref(struct btrfs_root *root, 3112 struct btrfs_path *path, 3113 u64 objectid, u64 offset, u64 bytenr) 3114 { 3115 struct btrfs_fs_info *fs_info = root->fs_info; 3116 struct btrfs_root *extent_root = fs_info->extent_root; 3117 struct extent_buffer *leaf; 3118 struct btrfs_extent_data_ref *ref; 3119 struct btrfs_extent_inline_ref *iref; 3120 struct btrfs_extent_item *ei; 3121 struct btrfs_key key; 3122 u32 item_size; 3123 int ret; 3124 3125 key.objectid = bytenr; 3126 key.offset = (u64)-1; 3127 key.type = BTRFS_EXTENT_ITEM_KEY; 3128 3129 ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); 3130 if (ret < 0) 3131 goto out; 3132 BUG_ON(ret == 0); /* Corruption */ 3133 3134 ret = -ENOENT; 3135 if (path->slots[0] == 0) 3136 goto out; 3137 3138 path->slots[0]--; 3139 leaf = path->nodes[0]; 3140 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 3141 3142 if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY) 3143 goto out; 3144 3145 ret = 1; 3146 item_size = btrfs_item_size_nr(leaf, path->slots[0]); 3147 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 3148 if (item_size < sizeof(*ei)) { 3149 WARN_ON(item_size != sizeof(struct btrfs_extent_item_v0)); 3150 goto out; 3151 } 3152 #endif 3153 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 3154 3155 if (item_size != sizeof(*ei) + 3156 btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY)) 3157 goto out; 3158 3159 if (btrfs_extent_generation(leaf, ei) <= 3160 btrfs_root_last_snapshot(&root->root_item)) 3161 goto out; 3162 3163 iref = (struct btrfs_extent_inline_ref *)(ei + 1); 3164 if (btrfs_extent_inline_ref_type(leaf, iref) != 3165 BTRFS_EXTENT_DATA_REF_KEY) 3166 goto out; 3167 3168 ref = (struct btrfs_extent_data_ref *)(&iref->offset); 3169 if (btrfs_extent_refs(leaf, ei) != 3170 btrfs_extent_data_ref_count(leaf, ref) || 3171 btrfs_extent_data_ref_root(leaf, ref) != 3172 root->root_key.objectid || 3173 btrfs_extent_data_ref_objectid(leaf, ref) != objectid || 3174 btrfs_extent_data_ref_offset(leaf, ref) != offset) 3175 goto out; 3176 3177 ret = 0; 3178 out: 3179 return ret; 3180 } 3181 3182 int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset, 3183 u64 bytenr) 3184 { 3185 struct btrfs_path *path; 3186 int ret; 3187 int ret2; 3188 3189 path = btrfs_alloc_path(); 3190 if (!path) 3191 return -ENOENT; 3192 3193 do { 3194 ret = check_committed_ref(root, path, objectid, 3195 offset, bytenr); 3196 if (ret && ret != -ENOENT) 3197 goto out; 3198 3199 ret2 = check_delayed_ref(root, path, objectid, 3200 offset, bytenr); 3201 } while (ret2 == -EAGAIN); 3202 3203 if (ret2 && ret2 != -ENOENT) { 3204 ret = ret2; 3205 goto out; 3206 } 3207 3208 if (ret != -ENOENT || ret2 != -ENOENT) 3209 ret = 0; 3210 out: 3211 btrfs_free_path(path); 3212 if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID) 3213 WARN_ON(ret > 0); 3214 return ret; 3215 } 3216 3217 static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, 3218 struct btrfs_root *root, 3219 struct extent_buffer *buf, 3220 int full_backref, int inc) 3221 { 3222 struct btrfs_fs_info *fs_info = root->fs_info; 3223 u64 bytenr; 3224 u64 num_bytes; 3225 u64 parent; 3226 u64 ref_root; 3227 u32 nritems; 3228 struct btrfs_key key; 3229 struct 
btrfs_file_extent_item *fi; 3230 int i; 3231 int level; 3232 int ret = 0; 3233 int (*process_func)(struct btrfs_trans_handle *, 3234 struct btrfs_fs_info *, 3235 u64, u64, u64, u64, u64, u64); 3236 3237 3238 if (btrfs_is_testing(fs_info)) 3239 return 0; 3240 3241 ref_root = btrfs_header_owner(buf); 3242 nritems = btrfs_header_nritems(buf); 3243 level = btrfs_header_level(buf); 3244 3245 if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state) && level == 0) 3246 return 0; 3247 3248 if (inc) 3249 process_func = btrfs_inc_extent_ref; 3250 else 3251 process_func = btrfs_free_extent; 3252 3253 if (full_backref) 3254 parent = buf->start; 3255 else 3256 parent = 0; 3257 3258 for (i = 0; i < nritems; i++) { 3259 if (level == 0) { 3260 btrfs_item_key_to_cpu(buf, &key, i); 3261 if (key.type != BTRFS_EXTENT_DATA_KEY) 3262 continue; 3263 fi = btrfs_item_ptr(buf, i, 3264 struct btrfs_file_extent_item); 3265 if (btrfs_file_extent_type(buf, fi) == 3266 BTRFS_FILE_EXTENT_INLINE) 3267 continue; 3268 bytenr = btrfs_file_extent_disk_bytenr(buf, fi); 3269 if (bytenr == 0) 3270 continue; 3271 3272 num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi); 3273 key.offset -= btrfs_file_extent_offset(buf, fi); 3274 ret = process_func(trans, fs_info, bytenr, num_bytes, 3275 parent, ref_root, key.objectid, 3276 key.offset); 3277 if (ret) 3278 goto fail; 3279 } else { 3280 bytenr = btrfs_node_blockptr(buf, i); 3281 num_bytes = fs_info->nodesize; 3282 ret = process_func(trans, fs_info, bytenr, num_bytes, 3283 parent, ref_root, level - 1, 0); 3284 if (ret) 3285 goto fail; 3286 } 3287 } 3288 return 0; 3289 fail: 3290 return ret; 3291 } 3292 3293 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, 3294 struct extent_buffer *buf, int full_backref) 3295 { 3296 return __btrfs_mod_ref(trans, root, buf, full_backref, 1); 3297 } 3298 3299 int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, 3300 struct extent_buffer *buf, int full_backref) 3301 { 3302 return __btrfs_mod_ref(trans, root, buf, full_backref, 0); 3303 } 3304 3305 static int write_one_cache_group(struct btrfs_trans_handle *trans, 3306 struct btrfs_fs_info *fs_info, 3307 struct btrfs_path *path, 3308 struct btrfs_block_group_cache *cache) 3309 { 3310 int ret; 3311 struct btrfs_root *extent_root = fs_info->extent_root; 3312 unsigned long bi; 3313 struct extent_buffer *leaf; 3314 3315 ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1); 3316 if (ret) { 3317 if (ret > 0) 3318 ret = -ENOENT; 3319 goto fail; 3320 } 3321 3322 leaf = path->nodes[0]; 3323 bi = btrfs_item_ptr_offset(leaf, path->slots[0]); 3324 write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item)); 3325 btrfs_mark_buffer_dirty(leaf); 3326 fail: 3327 btrfs_release_path(path); 3328 return ret; 3329 3330 } 3331 3332 static struct btrfs_block_group_cache * 3333 next_block_group(struct btrfs_fs_info *fs_info, 3334 struct btrfs_block_group_cache *cache) 3335 { 3336 struct rb_node *node; 3337 3338 spin_lock(&fs_info->block_group_cache_lock); 3339 3340 /* If our block group was removed, we need a full search. 
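 * Removal clears cache_node, which is what RB_EMPTY_NODE() below
 * detects; rb_next() would be meaningless then, so instead look up
 * the first block group at or after the end of the one we were
 * handed, which is where the walk would have continued anyway.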
*/ 3341 if (RB_EMPTY_NODE(&cache->cache_node)) { 3342 const u64 next_bytenr = cache->key.objectid + cache->key.offset; 3343 3344 spin_unlock(&fs_info->block_group_cache_lock); 3345 btrfs_put_block_group(cache); 3346 cache = btrfs_lookup_first_block_group(fs_info, next_bytenr); return cache; 3347 } 3348 node = rb_next(&cache->cache_node); 3349 btrfs_put_block_group(cache); 3350 if (node) { 3351 cache = rb_entry(node, struct btrfs_block_group_cache, 3352 cache_node); 3353 btrfs_get_block_group(cache); 3354 } else 3355 cache = NULL; 3356 spin_unlock(&fs_info->block_group_cache_lock); 3357 return cache; 3358 } 3359 3360 static int cache_save_setup(struct btrfs_block_group_cache *block_group, 3361 struct btrfs_trans_handle *trans, 3362 struct btrfs_path *path) 3363 { 3364 struct btrfs_fs_info *fs_info = block_group->fs_info; 3365 struct btrfs_root *root = fs_info->tree_root; 3366 struct inode *inode = NULL; 3367 u64 alloc_hint = 0; 3368 int dcs = BTRFS_DC_ERROR; 3369 u64 num_pages = 0; 3370 int retries = 0; 3371 int ret = 0; 3372 3373 /* 3374 * If this block group is smaller than 100 megs don't bother caching the 3375 * block group. 3376 */ 3377 if (block_group->key.offset < (100 * SZ_1M)) { 3378 spin_lock(&block_group->lock); 3379 block_group->disk_cache_state = BTRFS_DC_WRITTEN; 3380 spin_unlock(&block_group->lock); 3381 return 0; 3382 } 3383 3384 if (trans->aborted) 3385 return 0; 3386 again: 3387 inode = lookup_free_space_inode(fs_info, block_group, path); 3388 if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) { 3389 ret = PTR_ERR(inode); 3390 btrfs_release_path(path); 3391 goto out; 3392 } 3393 3394 if (IS_ERR(inode)) { 3395 BUG_ON(retries); 3396 retries++; 3397 3398 if (block_group->ro) 3399 goto out_free; 3400 3401 ret = create_free_space_inode(fs_info, trans, block_group, 3402 path); 3403 if (ret) 3404 goto out_free; 3405 goto again; 3406 } 3407 3408 /* We've already setup this transaction, go ahead and exit */ 3409 if (block_group->cache_generation == trans->transid && 3410 i_size_read(inode)) { 3411 dcs = BTRFS_DC_SETUP; 3412 goto out_put; 3413 } 3414 3415 /* 3416 * We want to set the generation to 0, that way if anything goes wrong 3417 * from here on out we know not to trust this cache when we load up next 3418 * time. 3419 */ 3420 BTRFS_I(inode)->generation = 0; 3421 ret = btrfs_update_inode(trans, root, inode); 3422 if (ret) { 3423 /* 3424 * So theoretically we could recover from this, simply set the 3425 * super cache generation to 0 so we know to invalidate the 3426 * cache, but then we'd have to keep track of the block groups 3427 * that fail this way so we know we _have_ to reset this cache 3428 * before the next commit or risk reading stale cache. So to 3429 * limit our exposure to horrible edge cases lets just abort the 3430 * transaction, this only happens in really bad situations 3431 * anyway. 
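 * (The risk being a cache file whose on-disk generation still looks
 * valid and so gets trusted on the next mount even though it no
 * longer matches the block group.)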
3432 */ 3433 btrfs_abort_transaction(trans, ret); 3434 goto out_put; 3435 } 3436 WARN_ON(ret); 3437 3438 if (i_size_read(inode) > 0) { 3439 ret = btrfs_check_trunc_cache_free_space(fs_info, 3440 &fs_info->global_block_rsv); 3441 if (ret) 3442 goto out_put; 3443 3444 ret = btrfs_truncate_free_space_cache(trans, NULL, inode); 3445 if (ret) 3446 goto out_put; 3447 } 3448 3449 spin_lock(&block_group->lock); 3450 if (block_group->cached != BTRFS_CACHE_FINISHED || 3451 !btrfs_test_opt(fs_info, SPACE_CACHE)) { 3452 /* 3453 * don't bother trying to write stuff out _if_ 3454 * a) we're not cached, 3455 * b) we're with nospace_cache mount option, 3456 * c) we're with v2 space_cache (FREE_SPACE_TREE). 3457 */ 3458 dcs = BTRFS_DC_WRITTEN; 3459 spin_unlock(&block_group->lock); 3460 goto out_put; 3461 } 3462 spin_unlock(&block_group->lock); 3463 3464 /* 3465 * We hit an ENOSPC when setting up the cache in this transaction, just 3466 * skip doing the setup, we've already cleared the cache so we're safe. 3467 */ 3468 if (test_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags)) { 3469 ret = -ENOSPC; 3470 goto out_put; 3471 } 3472 3473 /* 3474 * Try to preallocate enough space based on how big the block group is. 3475 * Keep in mind this has to include any pinned space which could end up 3476 * taking up quite a bit since it's not folded into the other space 3477 * cache. 3478 */ 3479 num_pages = div_u64(block_group->key.offset, SZ_256M); 3480 if (!num_pages) 3481 num_pages = 1; 3482 3483 num_pages *= 16; 3484 num_pages *= PAGE_SIZE; 3485 3486 ret = btrfs_check_data_free_space(inode, 0, num_pages); 3487 if (ret) 3488 goto out_put; 3489 3490 ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages, 3491 num_pages, num_pages, 3492 &alloc_hint); 3493 /* 3494 * Our cache requires contiguous chunks so that we don't modify a bunch 3495 * of metadata or split extents when writing the cache out, which means 3496 * we can enospc if we are heavily fragmented in addition to just normal 3497 * out of space conditions. So if we hit this just skip setting up any 3498 * other block groups for this transaction, maybe we'll unpin enough 3499 * space the next time around. 
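 * The BTRFS_TRANS_CACHE_ENOSPC bit set just below is what implements
 * that: it is checked near the top of this function, so every later
 * block group in this transaction bails out early instead of
 * repeating the failed preallocation.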
3500 */ 3501 if (!ret) 3502 dcs = BTRFS_DC_SETUP; 3503 else if (ret == -ENOSPC) 3504 set_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags); 3505 3506 out_put: 3507 iput(inode); 3508 out_free: 3509 btrfs_release_path(path); 3510 out: 3511 spin_lock(&block_group->lock); 3512 if (!ret && dcs == BTRFS_DC_SETUP) 3513 block_group->cache_generation = trans->transid; 3514 block_group->disk_cache_state = dcs; 3515 spin_unlock(&block_group->lock); 3516 3517 return ret; 3518 } 3519 3520 int btrfs_setup_space_cache(struct btrfs_trans_handle *trans, 3521 struct btrfs_fs_info *fs_info) 3522 { 3523 struct btrfs_block_group_cache *cache, *tmp; 3524 struct btrfs_transaction *cur_trans = trans->transaction; 3525 struct btrfs_path *path; 3526 3527 if (list_empty(&cur_trans->dirty_bgs) || 3528 !btrfs_test_opt(fs_info, SPACE_CACHE)) 3529 return 0; 3530 3531 path = btrfs_alloc_path(); 3532 if (!path) 3533 return -ENOMEM; 3534 3535 /* Could add new block groups, use _safe just in case */ 3536 list_for_each_entry_safe(cache, tmp, &cur_trans->dirty_bgs, 3537 dirty_list) { 3538 if (cache->disk_cache_state == BTRFS_DC_CLEAR) 3539 cache_save_setup(cache, trans, path); 3540 } 3541 3542 btrfs_free_path(path); 3543 return 0; 3544 } 3545 3546 /* 3547 * transaction commit does final block group cache writeback during a 3548 * critical section where nothing is allowed to change the FS. This is 3549 * required in order for the cache to actually match the block group, 3550 * but can introduce a lot of latency into the commit. 3551 * 3552 * So, btrfs_start_dirty_block_groups is here to kick off block group 3553 * cache IO. There's a chance we'll have to redo some of it if the 3554 * block group changes again during the commit, but it greatly reduces 3555 * the commit latency by getting rid of the easy block groups while 3556 * we're still allowing others to join the commit. 3557 */ 3558 int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans, 3559 struct btrfs_fs_info *fs_info) 3560 { 3561 struct btrfs_block_group_cache *cache; 3562 struct btrfs_transaction *cur_trans = trans->transaction; 3563 int ret = 0; 3564 int should_put; 3565 struct btrfs_path *path = NULL; 3566 LIST_HEAD(dirty); 3567 struct list_head *io = &cur_trans->io_bgs; 3568 int num_started = 0; 3569 int loops = 0; 3570 3571 spin_lock(&cur_trans->dirty_bgs_lock); 3572 if (list_empty(&cur_trans->dirty_bgs)) { 3573 spin_unlock(&cur_trans->dirty_bgs_lock); 3574 return 0; 3575 } 3576 list_splice_init(&cur_trans->dirty_bgs, &dirty); 3577 spin_unlock(&cur_trans->dirty_bgs_lock); 3578 3579 again: 3580 /* 3581 * make sure all the block groups on our dirty list actually 3582 * exist 3583 */ 3584 btrfs_create_pending_block_groups(trans, fs_info); 3585 3586 if (!path) { 3587 path = btrfs_alloc_path(); 3588 if (!path) 3589 return -ENOMEM; 3590 } 3591 3592 /* 3593 * cache_write_mutex is here only to save us from balance or automatic 3594 * removal of empty block groups deleting this block group while we are 3595 * writing out the cache 3596 */ 3597 mutex_lock(&trans->transaction->cache_write_mutex); 3598 while (!list_empty(&dirty)) { 3599 cache = list_first_entry(&dirty, 3600 struct btrfs_block_group_cache, 3601 dirty_list); 3602 /* 3603 * this can happen if something re-dirties a block 3604 * group that is already under IO. 
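 * (e.g. update_block_group() putting it back on the dirty list while
 * the cache write started by an earlier pass of this loop is still
 * running).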
Just wait for it to 3605 * finish and then do it all again 3606 */ 3607 if (!list_empty(&cache->io_list)) { 3608 list_del_init(&cache->io_list); 3609 btrfs_wait_cache_io(trans, cache, path); 3610 btrfs_put_block_group(cache); 3611 } 3612 3613 3614 /* 3615 * btrfs_wait_cache_io uses the cache->dirty_list to decide 3616 * if it should update the cache_state. Don't delete 3617 * until after we wait. 3618 * 3619 * Since we're not running in the commit critical section 3620 * we need the dirty_bgs_lock to protect from update_block_group 3621 */ 3622 spin_lock(&cur_trans->dirty_bgs_lock); 3623 list_del_init(&cache->dirty_list); 3624 spin_unlock(&cur_trans->dirty_bgs_lock); 3625 3626 should_put = 1; 3627 3628 cache_save_setup(cache, trans, path); 3629 3630 if (cache->disk_cache_state == BTRFS_DC_SETUP) { 3631 cache->io_ctl.inode = NULL; 3632 ret = btrfs_write_out_cache(fs_info, trans, 3633 cache, path); 3634 if (ret == 0 && cache->io_ctl.inode) { 3635 num_started++; 3636 should_put = 0; 3637 3638 /* 3639 * the cache_write_mutex is protecting 3640 * the io_list 3641 */ 3642 list_add_tail(&cache->io_list, io); 3643 } else { 3644 /* 3645 * if we failed to write the cache, the 3646 * generation will be bad and life goes on 3647 */ 3648 ret = 0; 3649 } 3650 } 3651 if (!ret) { 3652 ret = write_one_cache_group(trans, fs_info, 3653 path, cache); 3654 /* 3655 * Our block group might still be attached to the list 3656 * of new block groups in the transaction handle of some 3657 * other task (struct btrfs_trans_handle->new_bgs). This 3658 * means its block group item isn't yet in the extent 3659 * tree. If this happens ignore the error, as we will 3660 * try again later in the critical section of the 3661 * transaction commit. 3662 */ 3663 if (ret == -ENOENT) { 3664 ret = 0; 3665 spin_lock(&cur_trans->dirty_bgs_lock); 3666 if (list_empty(&cache->dirty_list)) { 3667 list_add_tail(&cache->dirty_list, 3668 &cur_trans->dirty_bgs); 3669 btrfs_get_block_group(cache); 3670 } 3671 spin_unlock(&cur_trans->dirty_bgs_lock); 3672 } else if (ret) { 3673 btrfs_abort_transaction(trans, ret); 3674 } 3675 } 3676 3677 /* if its not on the io list, we need to put the block group */ 3678 if (should_put) 3679 btrfs_put_block_group(cache); 3680 3681 if (ret) 3682 break; 3683 3684 /* 3685 * Avoid blocking other tasks for too long. It might even save 3686 * us from writing caches for block groups that are going to be 3687 * removed. 3688 */ 3689 mutex_unlock(&trans->transaction->cache_write_mutex); 3690 mutex_lock(&trans->transaction->cache_write_mutex); 3691 } 3692 mutex_unlock(&trans->transaction->cache_write_mutex); 3693 3694 /* 3695 * go through delayed refs for all the stuff we've just kicked off 3696 * and then loop back (just once) 3697 */ 3698 ret = btrfs_run_delayed_refs(trans, fs_info, 0); 3699 if (!ret && loops == 0) { 3700 loops++; 3701 spin_lock(&cur_trans->dirty_bgs_lock); 3702 list_splice_init(&cur_trans->dirty_bgs, &dirty); 3703 /* 3704 * dirty_bgs_lock protects us from concurrent block group 3705 * deletes too (not just cache_write_mutex). 
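 * Note that we only come back through 'again' once (loops was bumped
 * above); anything that gets dirtied after that second pass is left
 * for the commit critical section in btrfs_write_dirty_block_groups().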
3706 */ 3707 if (!list_empty(&dirty)) { 3708 spin_unlock(&cur_trans->dirty_bgs_lock); 3709 goto again; 3710 } 3711 spin_unlock(&cur_trans->dirty_bgs_lock); 3712 } else if (ret < 0) { 3713 btrfs_cleanup_dirty_bgs(cur_trans, fs_info); 3714 } 3715 3716 btrfs_free_path(path); 3717 return ret; 3718 } 3719 3720 int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, 3721 struct btrfs_fs_info *fs_info) 3722 { 3723 struct btrfs_block_group_cache *cache; 3724 struct btrfs_transaction *cur_trans = trans->transaction; 3725 int ret = 0; 3726 int should_put; 3727 struct btrfs_path *path; 3728 struct list_head *io = &cur_trans->io_bgs; 3729 int num_started = 0; 3730 3731 path = btrfs_alloc_path(); 3732 if (!path) 3733 return -ENOMEM; 3734 3735 /* 3736 * Even though we are in the critical section of the transaction commit, 3737 * we can still have concurrent tasks adding elements to this 3738 * transaction's list of dirty block groups. These tasks correspond to 3739 * endio free space workers started when writeback finishes for a 3740 * space cache, which run inode.c:btrfs_finish_ordered_io(), and can 3741 * allocate new block groups as a result of COWing nodes of the root 3742 * tree when updating the free space inode. The writeback for the space 3743 * caches is triggered by an earlier call to 3744 * btrfs_start_dirty_block_groups() and iterations of the following 3745 * loop. 3746 * Also we want to do the cache_save_setup first and then run the 3747 * delayed refs to make sure we have the best chance at doing this all 3748 * in one shot. 3749 */ 3750 spin_lock(&cur_trans->dirty_bgs_lock); 3751 while (!list_empty(&cur_trans->dirty_bgs)) { 3752 cache = list_first_entry(&cur_trans->dirty_bgs, 3753 struct btrfs_block_group_cache, 3754 dirty_list); 3755 3756 /* 3757 * this can happen if cache_save_setup re-dirties a block 3758 * group that is already under IO. Just wait for it to 3759 * finish and then do it all again 3760 */ 3761 if (!list_empty(&cache->io_list)) { 3762 spin_unlock(&cur_trans->dirty_bgs_lock); 3763 list_del_init(&cache->io_list); 3764 btrfs_wait_cache_io(trans, cache, path); 3765 btrfs_put_block_group(cache); 3766 spin_lock(&cur_trans->dirty_bgs_lock); 3767 } 3768 3769 /* 3770 * don't remove from the dirty list until after we've waited 3771 * on any pending IO 3772 */ 3773 list_del_init(&cache->dirty_list); 3774 spin_unlock(&cur_trans->dirty_bgs_lock); 3775 should_put = 1; 3776 3777 cache_save_setup(cache, trans, path); 3778 3779 if (!ret) 3780 ret = btrfs_run_delayed_refs(trans, fs_info, 3781 (unsigned long) -1); 3782 3783 if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP) { 3784 cache->io_ctl.inode = NULL; 3785 ret = btrfs_write_out_cache(fs_info, trans, 3786 cache, path); 3787 if (ret == 0 && cache->io_ctl.inode) { 3788 num_started++; 3789 should_put = 0; 3790 list_add_tail(&cache->io_list, io); 3791 } else { 3792 /* 3793 * if we failed to write the cache, the 3794 * generation will be bad and life goes on 3795 */ 3796 ret = 0; 3797 } 3798 } 3799 if (!ret) { 3800 ret = write_one_cache_group(trans, fs_info, 3801 path, cache); 3802 /* 3803 * One of the free space endio workers might have 3804 * created a new block group while updating a free space 3805 * cache's inode (at inode.c:btrfs_finish_ordered_io()) 3806 * and hasn't released its transaction handle yet, in 3807 * which case the new block group is still attached to 3808 * its transaction handle and its creation has not 3809 * finished yet (no block group item in the extent tree 3810 * yet, etc). 
If this is the case, wait for all free 3811 * space endio workers to finish and retry. This is a 3812 * a very rare case so no need for a more efficient and 3813 * complex approach. 3814 */ 3815 if (ret == -ENOENT) { 3816 wait_event(cur_trans->writer_wait, 3817 atomic_read(&cur_trans->num_writers) == 1); 3818 ret = write_one_cache_group(trans, fs_info, 3819 path, cache); 3820 } 3821 if (ret) 3822 btrfs_abort_transaction(trans, ret); 3823 } 3824 3825 /* if its not on the io list, we need to put the block group */ 3826 if (should_put) 3827 btrfs_put_block_group(cache); 3828 spin_lock(&cur_trans->dirty_bgs_lock); 3829 } 3830 spin_unlock(&cur_trans->dirty_bgs_lock); 3831 3832 while (!list_empty(io)) { 3833 cache = list_first_entry(io, struct btrfs_block_group_cache, 3834 io_list); 3835 list_del_init(&cache->io_list); 3836 btrfs_wait_cache_io(trans, cache, path); 3837 btrfs_put_block_group(cache); 3838 } 3839 3840 btrfs_free_path(path); 3841 return ret; 3842 } 3843 3844 int btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr) 3845 { 3846 struct btrfs_block_group_cache *block_group; 3847 int readonly = 0; 3848 3849 block_group = btrfs_lookup_block_group(fs_info, bytenr); 3850 if (!block_group || block_group->ro) 3851 readonly = 1; 3852 if (block_group) 3853 btrfs_put_block_group(block_group); 3854 return readonly; 3855 } 3856 3857 bool btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr) 3858 { 3859 struct btrfs_block_group_cache *bg; 3860 bool ret = true; 3861 3862 bg = btrfs_lookup_block_group(fs_info, bytenr); 3863 if (!bg) 3864 return false; 3865 3866 spin_lock(&bg->lock); 3867 if (bg->ro) 3868 ret = false; 3869 else 3870 atomic_inc(&bg->nocow_writers); 3871 spin_unlock(&bg->lock); 3872 3873 /* no put on block group, done by btrfs_dec_nocow_writers */ 3874 if (!ret) 3875 btrfs_put_block_group(bg); 3876 3877 return ret; 3878 3879 } 3880 3881 void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr) 3882 { 3883 struct btrfs_block_group_cache *bg; 3884 3885 bg = btrfs_lookup_block_group(fs_info, bytenr); 3886 ASSERT(bg); 3887 if (atomic_dec_and_test(&bg->nocow_writers)) 3888 wake_up_atomic_t(&bg->nocow_writers); 3889 /* 3890 * Once for our lookup and once for the lookup done by a previous call 3891 * to btrfs_inc_nocow_writers() 3892 */ 3893 btrfs_put_block_group(bg); 3894 btrfs_put_block_group(bg); 3895 } 3896 3897 static int btrfs_wait_nocow_writers_atomic_t(atomic_t *a) 3898 { 3899 schedule(); 3900 return 0; 3901 } 3902 3903 void btrfs_wait_nocow_writers(struct btrfs_block_group_cache *bg) 3904 { 3905 wait_on_atomic_t(&bg->nocow_writers, 3906 btrfs_wait_nocow_writers_atomic_t, 3907 TASK_UNINTERRUPTIBLE); 3908 } 3909 3910 static const char *alloc_name(u64 flags) 3911 { 3912 switch (flags) { 3913 case BTRFS_BLOCK_GROUP_METADATA|BTRFS_BLOCK_GROUP_DATA: 3914 return "mixed"; 3915 case BTRFS_BLOCK_GROUP_METADATA: 3916 return "metadata"; 3917 case BTRFS_BLOCK_GROUP_DATA: 3918 return "data"; 3919 case BTRFS_BLOCK_GROUP_SYSTEM: 3920 return "system"; 3921 default: 3922 WARN_ON(1); 3923 return "invalid-combination"; 3924 }; 3925 } 3926 3927 static int update_space_info(struct btrfs_fs_info *info, u64 flags, 3928 u64 total_bytes, u64 bytes_used, 3929 u64 bytes_readonly, 3930 struct btrfs_space_info **space_info) 3931 { 3932 struct btrfs_space_info *found; 3933 int i; 3934 int factor; 3935 int ret; 3936 3937 if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | 3938 BTRFS_BLOCK_GROUP_RAID10)) 3939 factor = 2; 3940 else 3941 factor = 1; 3942 3943 found = 
__find_space_info(info, flags); 3944 if (found) { 3945 spin_lock(&found->lock); 3946 found->total_bytes += total_bytes; 3947 found->disk_total += total_bytes * factor; 3948 found->bytes_used += bytes_used; 3949 found->disk_used += bytes_used * factor; 3950 found->bytes_readonly += bytes_readonly; 3951 if (total_bytes > 0) 3952 found->full = 0; 3953 space_info_add_new_bytes(info, found, total_bytes - 3954 bytes_used - bytes_readonly); 3955 spin_unlock(&found->lock); 3956 *space_info = found; 3957 return 0; 3958 } 3959 found = kzalloc(sizeof(*found), GFP_NOFS); 3960 if (!found) 3961 return -ENOMEM; 3962 3963 ret = percpu_counter_init(&found->total_bytes_pinned, 0, GFP_KERNEL); 3964 if (ret) { 3965 kfree(found); 3966 return ret; 3967 } 3968 3969 for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) 3970 INIT_LIST_HEAD(&found->block_groups[i]); 3971 init_rwsem(&found->groups_sem); 3972 spin_lock_init(&found->lock); 3973 found->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK; 3974 found->total_bytes = total_bytes; 3975 found->disk_total = total_bytes * factor; 3976 found->bytes_used = bytes_used; 3977 found->disk_used = bytes_used * factor; 3978 found->bytes_pinned = 0; 3979 found->bytes_reserved = 0; 3980 found->bytes_readonly = bytes_readonly; 3981 found->bytes_may_use = 0; 3982 found->full = 0; 3983 found->max_extent_size = 0; 3984 found->force_alloc = CHUNK_ALLOC_NO_FORCE; 3985 found->chunk_alloc = 0; 3986 found->flush = 0; 3987 init_waitqueue_head(&found->wait); 3988 INIT_LIST_HEAD(&found->ro_bgs); 3989 INIT_LIST_HEAD(&found->tickets); 3990 INIT_LIST_HEAD(&found->priority_tickets); 3991 3992 ret = kobject_init_and_add(&found->kobj, &space_info_ktype, 3993 info->space_info_kobj, "%s", 3994 alloc_name(found->flags)); 3995 if (ret) { 3996 kfree(found); 3997 return ret; 3998 } 3999 4000 *space_info = found; 4001 list_add_rcu(&found->list, &info->space_info); 4002 if (flags & BTRFS_BLOCK_GROUP_DATA) 4003 info->data_sinfo = found; 4004 4005 return ret; 4006 } 4007 4008 static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) 4009 { 4010 u64 extra_flags = chunk_to_extended(flags) & 4011 BTRFS_EXTENDED_PROFILE_MASK; 4012 4013 write_seqlock(&fs_info->profiles_lock); 4014 if (flags & BTRFS_BLOCK_GROUP_DATA) 4015 fs_info->avail_data_alloc_bits |= extra_flags; 4016 if (flags & BTRFS_BLOCK_GROUP_METADATA) 4017 fs_info->avail_metadata_alloc_bits |= extra_flags; 4018 if (flags & BTRFS_BLOCK_GROUP_SYSTEM) 4019 fs_info->avail_system_alloc_bits |= extra_flags; 4020 write_sequnlock(&fs_info->profiles_lock); 4021 } 4022 4023 /* 4024 * returns target flags in extended format or 0 if restripe for this 4025 * chunk_type is not in progress 4026 * 4027 * should be called with either volume_mutex or balance_lock held 4028 */ 4029 static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags) 4030 { 4031 struct btrfs_balance_control *bctl = fs_info->balance_ctl; 4032 u64 target = 0; 4033 4034 if (!bctl) 4035 return 0; 4036 4037 if (flags & BTRFS_BLOCK_GROUP_DATA && 4038 bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) { 4039 target = BTRFS_BLOCK_GROUP_DATA | bctl->data.target; 4040 } else if (flags & BTRFS_BLOCK_GROUP_SYSTEM && 4041 bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) { 4042 target = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target; 4043 } else if (flags & BTRFS_BLOCK_GROUP_METADATA && 4044 bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) { 4045 target = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target; 4046 } 4047 4048 return target; 4049 } 4050 4051 /* 4052 * @flags: available profiles in extended 
format (see ctree.h) 4053 * 4054 * Returns reduced profile in chunk format. If profile changing is in 4055 * progress (either running or paused) picks the target profile (if it's 4056 * already available), otherwise falls back to plain reducing. 4057 */ 4058 static u64 btrfs_reduce_alloc_profile(struct btrfs_fs_info *fs_info, u64 flags) 4059 { 4060 u64 num_devices = fs_info->fs_devices->rw_devices; 4061 u64 target; 4062 u64 raid_type; 4063 u64 allowed = 0; 4064 4065 /* 4066 * see if restripe for this chunk_type is in progress, if so 4067 * try to reduce to the target profile 4068 */ 4069 spin_lock(&fs_info->balance_lock); 4070 target = get_restripe_target(fs_info, flags); 4071 if (target) { 4072 /* pick target profile only if it's already available */ 4073 if ((flags & target) & BTRFS_EXTENDED_PROFILE_MASK) { 4074 spin_unlock(&fs_info->balance_lock); 4075 return extended_to_chunk(target); 4076 } 4077 } 4078 spin_unlock(&fs_info->balance_lock); 4079 4080 /* First, mask out the RAID levels which aren't possible */ 4081 for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) { 4082 if (num_devices >= btrfs_raid_array[raid_type].devs_min) 4083 allowed |= btrfs_raid_group[raid_type]; 4084 } 4085 allowed &= flags; 4086 4087 if (allowed & BTRFS_BLOCK_GROUP_RAID6) 4088 allowed = BTRFS_BLOCK_GROUP_RAID6; 4089 else if (allowed & BTRFS_BLOCK_GROUP_RAID5) 4090 allowed = BTRFS_BLOCK_GROUP_RAID5; 4091 else if (allowed & BTRFS_BLOCK_GROUP_RAID10) 4092 allowed = BTRFS_BLOCK_GROUP_RAID10; 4093 else if (allowed & BTRFS_BLOCK_GROUP_RAID1) 4094 allowed = BTRFS_BLOCK_GROUP_RAID1; 4095 else if (allowed & BTRFS_BLOCK_GROUP_RAID0) 4096 allowed = BTRFS_BLOCK_GROUP_RAID0; 4097 4098 flags &= ~BTRFS_BLOCK_GROUP_PROFILE_MASK; 4099 4100 return extended_to_chunk(flags | allowed); 4101 } 4102 4103 static u64 get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags) 4104 { 4105 unsigned seq; 4106 u64 flags; 4107 4108 do { 4109 flags = orig_flags; 4110 seq = read_seqbegin(&fs_info->profiles_lock); 4111 4112 if (flags & BTRFS_BLOCK_GROUP_DATA) 4113 flags |= fs_info->avail_data_alloc_bits; 4114 else if (flags & BTRFS_BLOCK_GROUP_SYSTEM) 4115 flags |= fs_info->avail_system_alloc_bits; 4116 else if (flags & BTRFS_BLOCK_GROUP_METADATA) 4117 flags |= fs_info->avail_metadata_alloc_bits; 4118 } while (read_seqretry(&fs_info->profiles_lock, seq)); 4119 4120 return btrfs_reduce_alloc_profile(fs_info, flags); 4121 } 4122 4123 u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data) 4124 { 4125 struct btrfs_fs_info *fs_info = root->fs_info; 4126 u64 flags; 4127 u64 ret; 4128 4129 if (data) 4130 flags = BTRFS_BLOCK_GROUP_DATA; 4131 else if (root == fs_info->chunk_root) 4132 flags = BTRFS_BLOCK_GROUP_SYSTEM; 4133 else 4134 flags = BTRFS_BLOCK_GROUP_METADATA; 4135 4136 ret = get_alloc_profile(fs_info, flags); 4137 return ret; 4138 } 4139 4140 static u64 btrfs_space_info_used(struct btrfs_space_info *s_info, 4141 bool may_use_included) 4142 { 4143 ASSERT(s_info); 4144 return s_info->bytes_used + s_info->bytes_reserved + 4145 s_info->bytes_pinned + s_info->bytes_readonly + 4146 (may_use_included ? 
s_info->bytes_may_use : 0); 4147 } 4148 4149 int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes) 4150 { 4151 struct btrfs_space_info *data_sinfo; 4152 struct btrfs_root *root = inode->root; 4153 struct btrfs_fs_info *fs_info = root->fs_info; 4154 u64 used; 4155 int ret = 0; 4156 int need_commit = 2; 4157 int have_pinned_space; 4158 4159 /* make sure bytes are sectorsize aligned */ 4160 bytes = ALIGN(bytes, fs_info->sectorsize); 4161 4162 if (btrfs_is_free_space_inode(inode)) { 4163 need_commit = 0; 4164 ASSERT(current->journal_info); 4165 } 4166 4167 data_sinfo = fs_info->data_sinfo; 4168 if (!data_sinfo) 4169 goto alloc; 4170 4171 again: 4172 /* make sure we have enough space to handle the data first */ 4173 spin_lock(&data_sinfo->lock); 4174 used = btrfs_space_info_used(data_sinfo, true); 4175 4176 if (used + bytes > data_sinfo->total_bytes) { 4177 struct btrfs_trans_handle *trans; 4178 4179 /* 4180 * if we don't have enough free bytes in this space then we need 4181 * to alloc a new chunk. 4182 */ 4183 if (!data_sinfo->full) { 4184 u64 alloc_target; 4185 4186 data_sinfo->force_alloc = CHUNK_ALLOC_FORCE; 4187 spin_unlock(&data_sinfo->lock); 4188 alloc: 4189 alloc_target = btrfs_get_alloc_profile(root, 1); 4190 /* 4191 * It is ugly that we don't call nolock join 4192 * transaction for the free space inode case here. 4193 * But it is safe because we only do the data space 4194 * reservation for the free space cache in the 4195 * transaction context, the common join transaction 4196 * just increase the counter of the current transaction 4197 * handler, doesn't try to acquire the trans_lock of 4198 * the fs. 4199 */ 4200 trans = btrfs_join_transaction(root); 4201 if (IS_ERR(trans)) 4202 return PTR_ERR(trans); 4203 4204 ret = do_chunk_alloc(trans, fs_info, alloc_target, 4205 CHUNK_ALLOC_NO_FORCE); 4206 btrfs_end_transaction(trans); 4207 if (ret < 0) { 4208 if (ret != -ENOSPC) 4209 return ret; 4210 else { 4211 have_pinned_space = 1; 4212 goto commit_trans; 4213 } 4214 } 4215 4216 if (!data_sinfo) 4217 data_sinfo = fs_info->data_sinfo; 4218 4219 goto again; 4220 } 4221 4222 /* 4223 * If we don't have enough pinned space to deal with this 4224 * allocation, and no removed chunk in current transaction, 4225 * don't bother committing the transaction. 4226 */ 4227 have_pinned_space = percpu_counter_compare( 4228 &data_sinfo->total_bytes_pinned, 4229 used + bytes - data_sinfo->total_bytes); 4230 spin_unlock(&data_sinfo->lock); 4231 4232 /* commit the current transaction and try again */ 4233 commit_trans: 4234 if (need_commit && 4235 !atomic_read(&fs_info->open_ioctl_trans)) { 4236 need_commit--; 4237 4238 if (need_commit > 0) { 4239 btrfs_start_delalloc_roots(fs_info, 0, -1); 4240 btrfs_wait_ordered_roots(fs_info, -1, 0, 4241 (u64)-1); 4242 } 4243 4244 trans = btrfs_join_transaction(root); 4245 if (IS_ERR(trans)) 4246 return PTR_ERR(trans); 4247 if (have_pinned_space >= 0 || 4248 test_bit(BTRFS_TRANS_HAVE_FREE_BGS, 4249 &trans->transaction->flags) || 4250 need_commit > 0) { 4251 ret = btrfs_commit_transaction(trans); 4252 if (ret) 4253 return ret; 4254 /* 4255 * The cleaner kthread might still be doing iput 4256 * operations. Wait for it to finish so that 4257 * more space is released. 
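* Taking and immediately dropping cleaner_delayed_iput_mutex below is just a way to block until the cleaner kthread has finished its current batch of delayed iputs, after which we retry the reservation.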
4258 */ 4259 mutex_lock(&fs_info->cleaner_delayed_iput_mutex); 4260 mutex_unlock(&fs_info->cleaner_delayed_iput_mutex); 4261 goto again; 4262 } else { 4263 btrfs_end_transaction(trans); 4264 } 4265 } 4266 4267 trace_btrfs_space_reservation(fs_info, 4268 "space_info:enospc", 4269 data_sinfo->flags, bytes, 1); 4270 return -ENOSPC; 4271 } 4272 data_sinfo->bytes_may_use += bytes; 4273 trace_btrfs_space_reservation(fs_info, "space_info", 4274 data_sinfo->flags, bytes, 1); 4275 spin_unlock(&data_sinfo->lock); 4276 4277 return ret; 4278 } 4279 4280 /* 4281 * New check_data_free_space() with the ability for precise data reservation. 4282 * Will replace the old btrfs_check_data_free_space(), but to keep the patch 4283 * split manageable, add the new function first and then replace the old one. 4284 */ 4285 int btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len) 4286 { 4287 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 4288 int ret; 4289 4290 /* align the range */ 4291 len = round_up(start + len, fs_info->sectorsize) - 4292 round_down(start, fs_info->sectorsize); 4293 start = round_down(start, fs_info->sectorsize); 4294 4295 ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode), len); 4296 if (ret < 0) 4297 return ret; 4298 4299 /* Use the new btrfs_qgroup_reserve_data() to reserve data space precisely. */ 4300 ret = btrfs_qgroup_reserve_data(inode, start, len); 4301 if (ret) 4302 btrfs_free_reserved_data_space_noquota(inode, start, len); 4303 return ret; 4304 } 4305 4306 /* 4307 * Called if we need to clear a data reservation for this inode, 4308 * normally in an error case. 4309 * 4310 * This one will *NOT* use the accurate qgroup reserved space API, just for 4311 * cases where we can't sleep and are sure it won't affect the qgroup 4312 * reserved space. Like clear_bit_hook(). 4313 */ 4314 void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start, 4315 u64 len) 4316 { 4317 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 4318 struct btrfs_space_info *data_sinfo; 4319 4320 /* Make sure the range is aligned to sectorsize */ 4321 len = round_up(start + len, fs_info->sectorsize) - 4322 round_down(start, fs_info->sectorsize); 4323 start = round_down(start, fs_info->sectorsize); 4324 4325 data_sinfo = fs_info->data_sinfo; 4326 spin_lock(&data_sinfo->lock); 4327 if (WARN_ON(data_sinfo->bytes_may_use < len)) 4328 data_sinfo->bytes_may_use = 0; 4329 else 4330 data_sinfo->bytes_may_use -= len; 4331 trace_btrfs_space_reservation(fs_info, "space_info", 4332 data_sinfo->flags, len, 0); 4333 spin_unlock(&data_sinfo->lock); 4334 } 4335 4336 /* 4337 * Called if we need to clear a data reservation for this inode, 4338 * normally in an error case. 4339 * 4340 * This one will handle the per-inode data rsv map for the accurate reserved 4341 * space framework. 
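* Unlike the _noquota variant above, this also releases the qgroup reservation for the range via btrfs_qgroup_free_data().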
4342 */ 4343 void btrfs_free_reserved_data_space(struct inode *inode, u64 start, u64 len) 4344 { 4345 struct btrfs_root *root = BTRFS_I(inode)->root; 4346 4347 /* Make sure the range is aligned to sectorsize */ 4348 len = round_up(start + len, root->fs_info->sectorsize) - 4349 round_down(start, root->fs_info->sectorsize); 4350 start = round_down(start, root->fs_info->sectorsize); 4351 4352 btrfs_free_reserved_data_space_noquota(inode, start, len); 4353 btrfs_qgroup_free_data(inode, start, len); 4354 } 4355 4356 static void force_metadata_allocation(struct btrfs_fs_info *info) 4357 { 4358 struct list_head *head = &info->space_info; 4359 struct btrfs_space_info *found; 4360 4361 rcu_read_lock(); 4362 list_for_each_entry_rcu(found, head, list) { 4363 if (found->flags & BTRFS_BLOCK_GROUP_METADATA) 4364 found->force_alloc = CHUNK_ALLOC_FORCE; 4365 } 4366 rcu_read_unlock(); 4367 } 4368 4369 static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global) 4370 { 4371 return (global->size << 1); 4372 } 4373 4374 static int should_alloc_chunk(struct btrfs_fs_info *fs_info, 4375 struct btrfs_space_info *sinfo, int force) 4376 { 4377 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; 4378 u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly; 4379 u64 num_allocated = sinfo->bytes_used + sinfo->bytes_reserved; 4380 u64 thresh; 4381 4382 if (force == CHUNK_ALLOC_FORCE) 4383 return 1; 4384 4385 /* 4386 * We need to take into account the global rsv because for all intents 4387 * and purposes it's used space. Don't worry about locking the 4388 * global_rsv, it doesn't change except when the transaction commits. 4389 */ 4390 if (sinfo->flags & BTRFS_BLOCK_GROUP_METADATA) 4391 num_allocated += calc_global_rsv_need_space(global_rsv); 4392 4393 /* 4394 * in limited mode, we want to have some free space up to 4395 * about 1% of the FS size. 4396 */ 4397 if (force == CHUNK_ALLOC_LIMITED) { 4398 thresh = btrfs_super_total_bytes(fs_info->super_copy); 4399 thresh = max_t(u64, SZ_64M, div_factor_fine(thresh, 1)); 4400 4401 if (num_bytes - num_allocated < thresh) 4402 return 1; 4403 } 4404 4405 if (num_allocated + SZ_2M < div_factor(num_bytes, 8)) 4406 return 0; 4407 return 1; 4408 } 4409 4410 static u64 get_profile_num_devs(struct btrfs_fs_info *fs_info, u64 type) 4411 { 4412 u64 num_dev; 4413 4414 if (type & (BTRFS_BLOCK_GROUP_RAID10 | 4415 BTRFS_BLOCK_GROUP_RAID0 | 4416 BTRFS_BLOCK_GROUP_RAID5 | 4417 BTRFS_BLOCK_GROUP_RAID6)) 4418 num_dev = fs_info->fs_devices->rw_devices; 4419 else if (type & BTRFS_BLOCK_GROUP_RAID1) 4420 num_dev = 2; 4421 else 4422 num_dev = 1; /* DUP or single */ 4423 4424 return num_dev; 4425 } 4426 4427 /* 4428 * If @is_allocation is true, reserve space in the system space info necessary 4429 * for allocating a chunk, otherwise if it's false, reserve space necessary for 4430 * removing a chunk. 4431 */ 4432 void check_system_chunk(struct btrfs_trans_handle *trans, 4433 struct btrfs_fs_info *fs_info, u64 type) 4434 { 4435 struct btrfs_space_info *info; 4436 u64 left; 4437 u64 thresh; 4438 int ret = 0; 4439 u64 num_devs; 4440 4441 /* 4442 * Needed because we can end up allocating a system chunk and for an 4443 * atomic and race free space reservation in the chunk block reserve. 
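* In other words: the caller must hold chunk_mutex so that checking the free space in the SYSTEM space_info and reserving it in the chunk block reserve cannot race with another chunk allocation.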
4444 */ 4445 ASSERT(mutex_is_locked(&fs_info->chunk_mutex)); 4446 4447 info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM); 4448 spin_lock(&info->lock); 4449 left = info->total_bytes - btrfs_space_info_used(info, true); 4450 spin_unlock(&info->lock); 4451 4452 num_devs = get_profile_num_devs(fs_info, type); 4453 4454 /* num_devs device items to update and 1 chunk item to add or remove */ 4455 thresh = btrfs_calc_trunc_metadata_size(fs_info, num_devs) + 4456 btrfs_calc_trans_metadata_size(fs_info, 1); 4457 4458 if (left < thresh && btrfs_test_opt(fs_info, ENOSPC_DEBUG)) { 4459 btrfs_info(fs_info, "left=%llu, need=%llu, flags=%llu", 4460 left, thresh, type); 4461 dump_space_info(fs_info, info, 0, 0); 4462 } 4463 4464 if (left < thresh) { 4465 u64 flags; 4466 4467 flags = btrfs_get_alloc_profile(fs_info->chunk_root, 0); 4468 /* 4469 * Ignore failure to create system chunk. We might end up not 4470 * needing it, as we might not need to COW all nodes/leafs from 4471 * the paths we visit in the chunk tree (they were already COWed 4472 * or created in the current transaction for example). 4473 */ 4474 ret = btrfs_alloc_chunk(trans, fs_info, flags); 4475 } 4476 4477 if (!ret) { 4478 ret = btrfs_block_rsv_add(fs_info->chunk_root, 4479 &fs_info->chunk_block_rsv, 4480 thresh, BTRFS_RESERVE_NO_FLUSH); 4481 if (!ret) 4482 trans->chunk_bytes_reserved += thresh; 4483 } 4484 } 4485 4486 /* 4487 * If force is CHUNK_ALLOC_FORCE: 4488 * - return 1 if it successfully allocates a chunk, 4489 * - return errors including -ENOSPC otherwise. 4490 * If force is NOT CHUNK_ALLOC_FORCE: 4491 * - return 0 if it doesn't need to allocate a new chunk, 4492 * - return 1 if it successfully allocates a chunk, 4493 * - return errors including -ENOSPC otherwise. 4494 */ 4495 static int do_chunk_alloc(struct btrfs_trans_handle *trans, 4496 struct btrfs_fs_info *fs_info, u64 flags, int force) 4497 { 4498 struct btrfs_space_info *space_info; 4499 int wait_for_alloc = 0; 4500 int ret = 0; 4501 4502 /* Don't re-enter if we're already allocating a chunk */ 4503 if (trans->allocating_chunk) 4504 return -ENOSPC; 4505 4506 space_info = __find_space_info(fs_info, flags); 4507 if (!space_info) { 4508 ret = update_space_info(fs_info, flags, 0, 0, 0, &space_info); 4509 BUG_ON(ret); /* -ENOMEM */ 4510 } 4511 BUG_ON(!space_info); /* Logic error */ 4512 4513 again: 4514 spin_lock(&space_info->lock); 4515 if (force < space_info->force_alloc) 4516 force = space_info->force_alloc; 4517 if (space_info->full) { 4518 if (should_alloc_chunk(fs_info, space_info, force)) 4519 ret = -ENOSPC; 4520 else 4521 ret = 0; 4522 spin_unlock(&space_info->lock); 4523 return ret; 4524 } 4525 4526 if (!should_alloc_chunk(fs_info, space_info, force)) { 4527 spin_unlock(&space_info->lock); 4528 return 0; 4529 } else if (space_info->chunk_alloc) { 4530 wait_for_alloc = 1; 4531 } else { 4532 space_info->chunk_alloc = 1; 4533 } 4534 4535 spin_unlock(&space_info->lock); 4536 4537 mutex_lock(&fs_info->chunk_mutex); 4538 4539 /* 4540 * The chunk_mutex is held throughout the entirety of a chunk 4541 * allocation, so once we've acquired the chunk_mutex we know that the 4542 * other guy is done and we need to recheck and see if we should 4543 * allocate. 4544 */ 4545 if (wait_for_alloc) { 4546 mutex_unlock(&fs_info->chunk_mutex); 4547 wait_for_alloc = 0; 4548 goto again; 4549 } 4550 4551 trans->allocating_chunk = true; 4552 4553 /* 4554 * If we have mixed data/metadata chunks we want to make sure we keep 4555 * allocating mixed chunks instead of individual chunks. 
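* btrfs_mixed_space_info() reports that this space_info carries both data and metadata, so OR both type bits into the allocation flags.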
4556 */ 4557 if (btrfs_mixed_space_info(space_info)) 4558 flags |= (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA); 4559 4560 /* 4561 * if we're doing a data chunk, go ahead and make sure that 4562 * we keep a reasonable number of metadata chunks allocated in the 4563 * FS as well. 4564 */ 4565 if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) { 4566 fs_info->data_chunk_allocations++; 4567 if (!(fs_info->data_chunk_allocations % 4568 fs_info->metadata_ratio)) 4569 force_metadata_allocation(fs_info); 4570 } 4571 4572 /* 4573 * Check if we have enough space in SYSTEM chunk because we may need 4574 * to update devices. 4575 */ 4576 check_system_chunk(trans, fs_info, flags); 4577 4578 ret = btrfs_alloc_chunk(trans, fs_info, flags); 4579 trans->allocating_chunk = false; 4580 4581 spin_lock(&space_info->lock); 4582 if (ret < 0 && ret != -ENOSPC) 4583 goto out; 4584 if (ret) 4585 space_info->full = 1; 4586 else 4587 ret = 1; 4588 4589 space_info->force_alloc = CHUNK_ALLOC_NO_FORCE; 4590 out: 4591 space_info->chunk_alloc = 0; 4592 spin_unlock(&space_info->lock); 4593 mutex_unlock(&fs_info->chunk_mutex); 4594 /* 4595 * When we allocate a new chunk we reserve space in the chunk block 4596 * reserve to make sure we can COW nodes/leafs in the chunk tree or 4597 * add new nodes/leafs to it if we end up needing to do it when 4598 * inserting the chunk item and updating device items as part of the 4599 * second phase of chunk allocation, performed by 4600 * btrfs_finish_chunk_alloc(). So make sure we don't accumulate a 4601 * large number of new block groups to create in our transaction 4602 * handle's new_bgs list to avoid exhausting the chunk block reserve 4603 * in extreme cases - like having a single transaction create many new 4604 * block groups when starting to write out the free space caches of all 4605 * the block groups that were made dirty during the lifetime of the 4606 * transaction. 4607 */ 4608 if (trans->can_flush_pending_bgs && 4609 trans->chunk_bytes_reserved >= (u64)SZ_2M) { 4610 btrfs_create_pending_block_groups(trans, fs_info); 4611 btrfs_trans_release_chunk_metadata(trans); 4612 } 4613 return ret; 4614 } 4615 4616 static int can_overcommit(struct btrfs_root *root, 4617 struct btrfs_space_info *space_info, u64 bytes, 4618 enum btrfs_reserve_flush_enum flush) 4619 { 4620 struct btrfs_fs_info *fs_info = root->fs_info; 4621 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; 4622 u64 profile; 4623 u64 space_size; 4624 u64 avail; 4625 u64 used; 4626 4627 /* Don't overcommit when in mixed mode. */ 4628 if (space_info->flags & BTRFS_BLOCK_GROUP_DATA) 4629 return 0; 4630 4631 profile = btrfs_get_alloc_profile(root, 0); 4632 used = btrfs_space_info_used(space_info, false); 4633 4634 /* 4635 * We only want to allow over committing if we have lots of actual space 4636 * free, but if we don't have enough space to handle the global reserve 4637 * space then we could end up having a real enospc problem when trying 4638 * to allocate a chunk or some other such important allocation. 
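* calc_global_rsv_need_space() returns twice the global rsv size as a cushion; if the used space plus that cushion no longer fits in total_bytes, refuse to overcommit.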
4639 */ 4640 spin_lock(&global_rsv->lock); 4641 space_size = calc_global_rsv_need_space(global_rsv); 4642 spin_unlock(&global_rsv->lock); 4643 if (used + space_size >= space_info->total_bytes) 4644 return 0; 4645 4646 used += space_info->bytes_may_use; 4647 4648 spin_lock(&fs_info->free_chunk_lock); 4649 avail = fs_info->free_chunk_space; 4650 spin_unlock(&fs_info->free_chunk_lock); 4651 4652 /* 4653 * If we have dup, raid1 or raid10 then only half of the free 4654 * space is actually useable. For raid56, the space info used 4655 * doesn't include the parity drive, so we don't have to 4656 * change the math 4657 */ 4658 if (profile & (BTRFS_BLOCK_GROUP_DUP | 4659 BTRFS_BLOCK_GROUP_RAID1 | 4660 BTRFS_BLOCK_GROUP_RAID10)) 4661 avail >>= 1; 4662 4663 /* 4664 * If we aren't flushing all things, let us overcommit up to 4665 * 1/2th of the space. If we can flush, don't let us overcommit 4666 * too much, let it overcommit up to 1/8 of the space. 4667 */ 4668 if (flush == BTRFS_RESERVE_FLUSH_ALL) 4669 avail >>= 3; 4670 else 4671 avail >>= 1; 4672 4673 if (used + bytes < space_info->total_bytes + avail) 4674 return 1; 4675 return 0; 4676 } 4677 4678 static void btrfs_writeback_inodes_sb_nr(struct btrfs_fs_info *fs_info, 4679 unsigned long nr_pages, int nr_items) 4680 { 4681 struct super_block *sb = fs_info->sb; 4682 4683 if (down_read_trylock(&sb->s_umount)) { 4684 writeback_inodes_sb_nr(sb, nr_pages, WB_REASON_FS_FREE_SPACE); 4685 up_read(&sb->s_umount); 4686 } else { 4687 /* 4688 * We needn't worry the filesystem going from r/w to r/o though 4689 * we don't acquire ->s_umount mutex, because the filesystem 4690 * should guarantee the delalloc inodes list be empty after 4691 * the filesystem is readonly(all dirty pages are written to 4692 * the disk). 4693 */ 4694 btrfs_start_delalloc_roots(fs_info, 0, nr_items); 4695 if (!current->journal_info) 4696 btrfs_wait_ordered_roots(fs_info, nr_items, 0, (u64)-1); 4697 } 4698 } 4699 4700 static inline int calc_reclaim_items_nr(struct btrfs_fs_info *fs_info, 4701 u64 to_reclaim) 4702 { 4703 u64 bytes; 4704 int nr; 4705 4706 bytes = btrfs_calc_trans_metadata_size(fs_info, 1); 4707 nr = (int)div64_u64(to_reclaim, bytes); 4708 if (!nr) 4709 nr = 1; 4710 return nr; 4711 } 4712 4713 #define EXTENT_SIZE_PER_ITEM SZ_256K 4714 4715 /* 4716 * shrink metadata reservation for delalloc 4717 */ 4718 static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig, 4719 bool wait_ordered) 4720 { 4721 struct btrfs_fs_info *fs_info = root->fs_info; 4722 struct btrfs_block_rsv *block_rsv; 4723 struct btrfs_space_info *space_info; 4724 struct btrfs_trans_handle *trans; 4725 u64 delalloc_bytes; 4726 u64 max_reclaim; 4727 long time_left; 4728 unsigned long nr_pages; 4729 int loops; 4730 int items; 4731 enum btrfs_reserve_flush_enum flush; 4732 4733 /* Calc the number of the pages we need flush for space reservation */ 4734 items = calc_reclaim_items_nr(fs_info, to_reclaim); 4735 to_reclaim = (u64)items * EXTENT_SIZE_PER_ITEM; 4736 4737 trans = (struct btrfs_trans_handle *)current->journal_info; 4738 block_rsv = &fs_info->delalloc_block_rsv; 4739 space_info = block_rsv->space_info; 4740 4741 delalloc_bytes = percpu_counter_sum_positive( 4742 &fs_info->delalloc_bytes); 4743 if (delalloc_bytes == 0) { 4744 if (trans) 4745 return; 4746 if (wait_ordered) 4747 btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1); 4748 return; 4749 } 4750 4751 loops = 0; 4752 while (delalloc_bytes && loops < 3) { 4753 max_reclaim = min(delalloc_bytes, to_reclaim); 4754 nr_pages = 
max_reclaim >> PAGE_SHIFT; 4755 btrfs_writeback_inodes_sb_nr(fs_info, nr_pages, items); 4756 /* 4757 * We need to wait for the async pages to actually start before 4758 * we do anything. 4759 */ 4760 max_reclaim = atomic_read(&fs_info->async_delalloc_pages); 4761 if (!max_reclaim) 4762 goto skip_async; 4763 4764 if (max_reclaim <= nr_pages) 4765 max_reclaim = 0; 4766 else 4767 max_reclaim -= nr_pages; 4768 4769 wait_event(fs_info->async_submit_wait, 4770 atomic_read(&fs_info->async_delalloc_pages) <= 4771 (int)max_reclaim); 4772 skip_async: 4773 if (!trans) 4774 flush = BTRFS_RESERVE_FLUSH_ALL; 4775 else 4776 flush = BTRFS_RESERVE_NO_FLUSH; 4777 spin_lock(&space_info->lock); 4778 if (can_overcommit(root, space_info, orig, flush)) { 4779 spin_unlock(&space_info->lock); 4780 break; 4781 } 4782 if (list_empty(&space_info->tickets) && 4783 list_empty(&space_info->priority_tickets)) { 4784 spin_unlock(&space_info->lock); 4785 break; 4786 } 4787 spin_unlock(&space_info->lock); 4788 4789 loops++; 4790 if (wait_ordered && !trans) { 4791 btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1); 4792 } else { 4793 time_left = schedule_timeout_killable(1); 4794 if (time_left) 4795 break; 4796 } 4797 delalloc_bytes = percpu_counter_sum_positive( 4798 &fs_info->delalloc_bytes); 4799 } 4800 } 4801 4802 /** 4803 * maybe_commit_transaction - possibly commit the transaction if its ok to 4804 * @root - the root we're allocating for 4805 * @bytes - the number of bytes we want to reserve 4806 * @force - force the commit 4807 * 4808 * This will check to make sure that committing the transaction will actually 4809 * get us somewhere and then commit the transaction if it does. Otherwise it 4810 * will return -ENOSPC. 4811 */ 4812 static int may_commit_transaction(struct btrfs_fs_info *fs_info, 4813 struct btrfs_space_info *space_info, 4814 u64 bytes, int force) 4815 { 4816 struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_block_rsv; 4817 struct btrfs_trans_handle *trans; 4818 4819 trans = (struct btrfs_trans_handle *)current->journal_info; 4820 if (trans) 4821 return -EAGAIN; 4822 4823 if (force) 4824 goto commit; 4825 4826 /* See if there is enough pinned space to make this reservation */ 4827 if (percpu_counter_compare(&space_info->total_bytes_pinned, 4828 bytes) >= 0) 4829 goto commit; 4830 4831 /* 4832 * See if there is some space in the delayed insertion reservation for 4833 * this reservation. 
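* A commit also frees up whatever the delayed insertion rsv holds, so the pinned bytes only need to cover bytes minus delayed_rsv->size; if even that much is not pinned, give up with -ENOSPC.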
4834 */ 4835 if (space_info != delayed_rsv->space_info) 4836 return -ENOSPC; 4837 4838 spin_lock(&delayed_rsv->lock); 4839 if (percpu_counter_compare(&space_info->total_bytes_pinned, 4840 bytes - delayed_rsv->size) < 0) { 4841 spin_unlock(&delayed_rsv->lock); 4842 return -ENOSPC; 4843 } 4844 spin_unlock(&delayed_rsv->lock); 4845 4846 commit: 4847 trans = btrfs_join_transaction(fs_info->fs_root); 4848 if (IS_ERR(trans)) 4849 return -ENOSPC; 4850 4851 return btrfs_commit_transaction(trans); 4852 } 4853 4854 struct reserve_ticket { 4855 u64 bytes; 4856 int error; 4857 struct list_head list; 4858 wait_queue_head_t wait; 4859 }; 4860 4861 static int flush_space(struct btrfs_fs_info *fs_info, 4862 struct btrfs_space_info *space_info, u64 num_bytes, 4863 u64 orig_bytes, int state) 4864 { 4865 struct btrfs_root *root = fs_info->fs_root; 4866 struct btrfs_trans_handle *trans; 4867 int nr; 4868 int ret = 0; 4869 4870 switch (state) { 4871 case FLUSH_DELAYED_ITEMS_NR: 4872 case FLUSH_DELAYED_ITEMS: 4873 if (state == FLUSH_DELAYED_ITEMS_NR) 4874 nr = calc_reclaim_items_nr(fs_info, num_bytes) * 2; 4875 else 4876 nr = -1; 4877 4878 trans = btrfs_join_transaction(root); 4879 if (IS_ERR(trans)) { 4880 ret = PTR_ERR(trans); 4881 break; 4882 } 4883 ret = btrfs_run_delayed_items_nr(trans, fs_info, nr); 4884 btrfs_end_transaction(trans); 4885 break; 4886 case FLUSH_DELALLOC: 4887 case FLUSH_DELALLOC_WAIT: 4888 shrink_delalloc(root, num_bytes * 2, orig_bytes, 4889 state == FLUSH_DELALLOC_WAIT); 4890 break; 4891 case ALLOC_CHUNK: 4892 trans = btrfs_join_transaction(root); 4893 if (IS_ERR(trans)) { 4894 ret = PTR_ERR(trans); 4895 break; 4896 } 4897 ret = do_chunk_alloc(trans, fs_info, 4898 btrfs_get_alloc_profile(root, 0), 4899 CHUNK_ALLOC_NO_FORCE); 4900 btrfs_end_transaction(trans); 4901 if (ret > 0 || ret == -ENOSPC) 4902 ret = 0; 4903 break; 4904 case COMMIT_TRANS: 4905 ret = may_commit_transaction(fs_info, space_info, 4906 orig_bytes, 0); 4907 break; 4908 default: 4909 ret = -ENOSPC; 4910 break; 4911 } 4912 4913 trace_btrfs_flush_space(fs_info, space_info->flags, num_bytes, 4914 orig_bytes, state, ret); 4915 return ret; 4916 } 4917 4918 static inline u64 4919 btrfs_calc_reclaim_metadata_size(struct btrfs_root *root, 4920 struct btrfs_space_info *space_info) 4921 { 4922 struct reserve_ticket *ticket; 4923 u64 used; 4924 u64 expected; 4925 u64 to_reclaim = 0; 4926 4927 list_for_each_entry(ticket, &space_info->tickets, list) 4928 to_reclaim += ticket->bytes; 4929 list_for_each_entry(ticket, &space_info->priority_tickets, list) 4930 to_reclaim += ticket->bytes; 4931 if (to_reclaim) 4932 return to_reclaim; 4933 4934 to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M); 4935 if (can_overcommit(root, space_info, to_reclaim, 4936 BTRFS_RESERVE_FLUSH_ALL)) 4937 return 0; 4938 4939 used = space_info->bytes_used + space_info->bytes_reserved + 4940 space_info->bytes_pinned + space_info->bytes_readonly + 4941 space_info->bytes_may_use; 4942 if (can_overcommit(root, space_info, SZ_1M, BTRFS_RESERVE_FLUSH_ALL)) 4943 expected = div_factor_fine(space_info->total_bytes, 95); 4944 else 4945 expected = div_factor_fine(space_info->total_bytes, 90); 4946 4947 if (used > expected) 4948 to_reclaim = used - expected; 4949 else 4950 to_reclaim = 0; 4951 to_reclaim = min(to_reclaim, space_info->bytes_may_use + 4952 space_info->bytes_reserved); 4953 return to_reclaim; 4954 } 4955 4956 static inline int need_do_async_reclaim(struct btrfs_space_info *space_info, 4957 struct btrfs_root *root, u64 used) 4958 { 4959 struct 
btrfs_fs_info *fs_info = root->fs_info; 4960 u64 thresh = div_factor_fine(space_info->total_bytes, 98); 4961 4962 /* If we're just plain full then async reclaim just slows us down. */ 4963 if ((space_info->bytes_used + space_info->bytes_reserved) >= thresh) 4964 return 0; 4965 4966 if (!btrfs_calc_reclaim_metadata_size(root, space_info)) 4967 return 0; 4968 4969 return (used >= thresh && !btrfs_fs_closing(fs_info) && 4970 !test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state)); 4971 } 4972 4973 static void wake_all_tickets(struct list_head *head) 4974 { 4975 struct reserve_ticket *ticket; 4976 4977 while (!list_empty(head)) { 4978 ticket = list_first_entry(head, struct reserve_ticket, list); 4979 list_del_init(&ticket->list); 4980 ticket->error = -ENOSPC; 4981 wake_up(&ticket->wait); 4982 } 4983 } 4984 4985 /* 4986 * This is for normal flushers, we can wait all goddamned day if we want to. We 4987 * will loop and continuously try to flush as long as we are making progress. 4988 * We count progress as clearing off tickets each time we have to loop. 4989 */ 4990 static void btrfs_async_reclaim_metadata_space(struct work_struct *work) 4991 { 4992 struct btrfs_fs_info *fs_info; 4993 struct btrfs_space_info *space_info; 4994 u64 to_reclaim; 4995 int flush_state; 4996 int commit_cycles = 0; 4997 u64 last_tickets_id; 4998 4999 fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work); 5000 space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); 5001 5002 spin_lock(&space_info->lock); 5003 to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info->fs_root, 5004 space_info); 5005 if (!to_reclaim) { 5006 space_info->flush = 0; 5007 spin_unlock(&space_info->lock); 5008 return; 5009 } 5010 last_tickets_id = space_info->tickets_id; 5011 spin_unlock(&space_info->lock); 5012 5013 flush_state = FLUSH_DELAYED_ITEMS_NR; 5014 do { 5015 struct reserve_ticket *ticket; 5016 int ret; 5017 5018 ret = flush_space(fs_info, space_info, to_reclaim, to_reclaim, 5019 flush_state); 5020 spin_lock(&space_info->lock); 5021 if (list_empty(&space_info->tickets)) { 5022 space_info->flush = 0; 5023 spin_unlock(&space_info->lock); 5024 return; 5025 } 5026 to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info->fs_root, 5027 space_info); 5028 ticket = list_first_entry(&space_info->tickets, 5029 struct reserve_ticket, list); 5030 if (last_tickets_id == space_info->tickets_id) { 5031 flush_state++; 5032 } else { 5033 last_tickets_id = space_info->tickets_id; 5034 flush_state = FLUSH_DELAYED_ITEMS_NR; 5035 if (commit_cycles) 5036 commit_cycles--; 5037 } 5038 5039 if (flush_state > COMMIT_TRANS) { 5040 commit_cycles++; 5041 if (commit_cycles > 2) { 5042 wake_all_tickets(&space_info->tickets); 5043 space_info->flush = 0; 5044 } else { 5045 flush_state = FLUSH_DELAYED_ITEMS_NR; 5046 } 5047 } 5048 spin_unlock(&space_info->lock); 5049 } while (flush_state <= COMMIT_TRANS); 5050 } 5051 5052 void btrfs_init_async_reclaim_work(struct work_struct *work) 5053 { 5054 INIT_WORK(work, btrfs_async_reclaim_metadata_space); 5055 } 5056 5057 static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info, 5058 struct btrfs_space_info *space_info, 5059 struct reserve_ticket *ticket) 5060 { 5061 u64 to_reclaim; 5062 int flush_state = FLUSH_DELAYED_ITEMS_NR; 5063 5064 spin_lock(&space_info->lock); 5065 to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info->fs_root, 5066 space_info); 5067 if (!to_reclaim) { 5068 spin_unlock(&space_info->lock); 5069 return; 5070 } 5071 spin_unlock(&space_info->lock); 5072 5073 do { 5074 
flush_space(fs_info, space_info, to_reclaim, to_reclaim, 5075 flush_state); 5076 flush_state++; 5077 spin_lock(&space_info->lock); 5078 if (ticket->bytes == 0) { 5079 spin_unlock(&space_info->lock); 5080 return; 5081 } 5082 spin_unlock(&space_info->lock); 5083 5084 /* 5085 * Priority flushers can't wait on delalloc without 5086 * deadlocking. 5087 */ 5088 if (flush_state == FLUSH_DELALLOC || 5089 flush_state == FLUSH_DELALLOC_WAIT) 5090 flush_state = ALLOC_CHUNK; 5091 } while (flush_state < COMMIT_TRANS); 5092 } 5093 5094 static int wait_reserve_ticket(struct btrfs_fs_info *fs_info, 5095 struct btrfs_space_info *space_info, 5096 struct reserve_ticket *ticket, u64 orig_bytes) 5097 5098 { 5099 DEFINE_WAIT(wait); 5100 int ret = 0; 5101 5102 spin_lock(&space_info->lock); 5103 while (ticket->bytes > 0 && ticket->error == 0) { 5104 ret = prepare_to_wait_event(&ticket->wait, &wait, TASK_KILLABLE); 5105 if (ret) { 5106 ret = -EINTR; 5107 break; 5108 } 5109 spin_unlock(&space_info->lock); 5110 5111 schedule(); 5112 5113 finish_wait(&ticket->wait, &wait); 5114 spin_lock(&space_info->lock); 5115 } 5116 if (!ret) 5117 ret = ticket->error; 5118 if (!list_empty(&ticket->list)) 5119 list_del_init(&ticket->list); 5120 if (ticket->bytes && ticket->bytes < orig_bytes) { 5121 u64 num_bytes = orig_bytes - ticket->bytes; 5122 space_info->bytes_may_use -= num_bytes; 5123 trace_btrfs_space_reservation(fs_info, "space_info", 5124 space_info->flags, num_bytes, 0); 5125 } 5126 spin_unlock(&space_info->lock); 5127 5128 return ret; 5129 } 5130 5131 /** 5132 * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space 5133 * @root - the root we're allocating for 5134 * @space_info - the space info we want to allocate from 5135 * @orig_bytes - the number of bytes we want 5136 * @flush - whether or not we can flush to make our reservation 5137 * 5138 * This will reserve orig_bytes number of bytes from the space info associated 5139 * with the block_rsv. If there is not enough space it will make an attempt to 5140 * flush out space to make room. It will do this by flushing delalloc if 5141 * possible or committing the transaction. If flush is 0 then no attempts to 5142 * regain reservations will be made and this will fail if there is not enough 5143 * space already. 5144 */ 5145 static int __reserve_metadata_bytes(struct btrfs_root *root, 5146 struct btrfs_space_info *space_info, 5147 u64 orig_bytes, 5148 enum btrfs_reserve_flush_enum flush) 5149 { 5150 struct btrfs_fs_info *fs_info = root->fs_info; 5151 struct reserve_ticket ticket; 5152 u64 used; 5153 int ret = 0; 5154 5155 ASSERT(orig_bytes); 5156 ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_ALL); 5157 5158 spin_lock(&space_info->lock); 5159 ret = -ENOSPC; 5160 used = btrfs_space_info_used(space_info, true); 5161 5162 /* 5163 * If we have enough space then hooray, make our reservation and carry 5164 * on. If not see if we can overcommit, and if we can, hooray carry on. 5165 * If not things get more complicated. 
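* Either way, a successful reservation here only bumps bytes_may_use under the space_info lock; if we fail, we fall through to the ticketing code below.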
5166 */ 5167 if (used + orig_bytes <= space_info->total_bytes) { 5168 space_info->bytes_may_use += orig_bytes; 5169 trace_btrfs_space_reservation(fs_info, "space_info", 5170 space_info->flags, orig_bytes, 1); 5171 ret = 0; 5172 } else if (can_overcommit(root, space_info, orig_bytes, flush)) { 5173 space_info->bytes_may_use += orig_bytes; 5174 trace_btrfs_space_reservation(fs_info, "space_info", 5175 space_info->flags, orig_bytes, 1); 5176 ret = 0; 5177 } 5178 5179 /* 5180 * If we couldn't make a reservation then setup our reservation ticket 5181 * and kick the async worker if it's not already running. 5182 * 5183 * If we are a priority flusher then we just need to add our ticket to 5184 * the list and we will do our own flushing further down. 5185 */ 5186 if (ret && flush != BTRFS_RESERVE_NO_FLUSH) { 5187 ticket.bytes = orig_bytes; 5188 ticket.error = 0; 5189 init_waitqueue_head(&ticket.wait); 5190 if (flush == BTRFS_RESERVE_FLUSH_ALL) { 5191 list_add_tail(&ticket.list, &space_info->tickets); 5192 if (!space_info->flush) { 5193 space_info->flush = 1; 5194 trace_btrfs_trigger_flush(fs_info, 5195 space_info->flags, 5196 orig_bytes, flush, 5197 "enospc"); 5198 queue_work(system_unbound_wq, 5199 &root->fs_info->async_reclaim_work); 5200 } 5201 } else { 5202 list_add_tail(&ticket.list, 5203 &space_info->priority_tickets); 5204 } 5205 } else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) { 5206 used += orig_bytes; 5207 /* 5208 * We will do the space reservation dance during log replay, 5209 * which means we won't have fs_info->fs_root set, so don't do 5210 * the async reclaim as we will panic. 5211 */ 5212 if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags) && 5213 need_do_async_reclaim(space_info, root, used) && 5214 !work_busy(&fs_info->async_reclaim_work)) { 5215 trace_btrfs_trigger_flush(fs_info, space_info->flags, 5216 orig_bytes, flush, "preempt"); 5217 queue_work(system_unbound_wq, 5218 &fs_info->async_reclaim_work); 5219 } 5220 } 5221 spin_unlock(&space_info->lock); 5222 if (!ret || flush == BTRFS_RESERVE_NO_FLUSH) 5223 return ret; 5224 5225 if (flush == BTRFS_RESERVE_FLUSH_ALL) 5226 return wait_reserve_ticket(fs_info, space_info, &ticket, 5227 orig_bytes); 5228 5229 ret = 0; 5230 priority_reclaim_metadata_space(fs_info, space_info, &ticket); 5231 spin_lock(&space_info->lock); 5232 if (ticket.bytes) { 5233 if (ticket.bytes < orig_bytes) { 5234 u64 num_bytes = orig_bytes - ticket.bytes; 5235 space_info->bytes_may_use -= num_bytes; 5236 trace_btrfs_space_reservation(fs_info, "space_info", 5237 space_info->flags, 5238 num_bytes, 0); 5239 5240 } 5241 list_del_init(&ticket.list); 5242 ret = -ENOSPC; 5243 } 5244 spin_unlock(&space_info->lock); 5245 ASSERT(list_empty(&ticket.list)); 5246 return ret; 5247 } 5248 5249 /** 5250 * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space 5251 * @root - the root we're allocating for 5252 * @block_rsv - the block_rsv we're allocating for 5253 * @orig_bytes - the number of bytes we want 5254 * @flush - whether or not we can flush to make our reservation 5255 * 5256 * This will reserve orgi_bytes number of bytes from the space info associated 5257 * with the block_rsv. If there is not enough space it will make an attempt to 5258 * flush out space to make room. It will do this by flushing delalloc if 5259 * possible or committing the transaction. If flush is 0 then no attempts to 5260 * regain reservations will be made and this will fail if there is not enough 5261 * space already. 
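* Compared to __reserve_metadata_bytes(), this also allows stealing from the global block rsv while orphan cleanup is running, and emits the enospc tracepoint when the reservation ultimately fails.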
5262 */ 5263 static int reserve_metadata_bytes(struct btrfs_root *root, 5264 struct btrfs_block_rsv *block_rsv, 5265 u64 orig_bytes, 5266 enum btrfs_reserve_flush_enum flush) 5267 { 5268 struct btrfs_fs_info *fs_info = root->fs_info; 5269 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; 5270 int ret; 5271 5272 ret = __reserve_metadata_bytes(root, block_rsv->space_info, orig_bytes, 5273 flush); 5274 if (ret == -ENOSPC && 5275 unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) { 5276 if (block_rsv != global_rsv && 5277 !block_rsv_use_bytes(global_rsv, orig_bytes)) 5278 ret = 0; 5279 } 5280 if (ret == -ENOSPC) 5281 trace_btrfs_space_reservation(fs_info, "space_info:enospc", 5282 block_rsv->space_info->flags, 5283 orig_bytes, 1); 5284 return ret; 5285 } 5286 5287 static struct btrfs_block_rsv *get_block_rsv( 5288 const struct btrfs_trans_handle *trans, 5289 const struct btrfs_root *root) 5290 { 5291 struct btrfs_fs_info *fs_info = root->fs_info; 5292 struct btrfs_block_rsv *block_rsv = NULL; 5293 5294 if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) || 5295 (root == fs_info->csum_root && trans->adding_csums) || 5296 (root == fs_info->uuid_root)) 5297 block_rsv = trans->block_rsv; 5298 5299 if (!block_rsv) 5300 block_rsv = root->block_rsv; 5301 5302 if (!block_rsv) 5303 block_rsv = &fs_info->empty_block_rsv; 5304 5305 return block_rsv; 5306 } 5307 5308 static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv, 5309 u64 num_bytes) 5310 { 5311 int ret = -ENOSPC; 5312 spin_lock(&block_rsv->lock); 5313 if (block_rsv->reserved >= num_bytes) { 5314 block_rsv->reserved -= num_bytes; 5315 if (block_rsv->reserved < block_rsv->size) 5316 block_rsv->full = 0; 5317 ret = 0; 5318 } 5319 spin_unlock(&block_rsv->lock); 5320 return ret; 5321 } 5322 5323 static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv, 5324 u64 num_bytes, int update_size) 5325 { 5326 spin_lock(&block_rsv->lock); 5327 block_rsv->reserved += num_bytes; 5328 if (update_size) 5329 block_rsv->size += num_bytes; 5330 else if (block_rsv->reserved >= block_rsv->size) 5331 block_rsv->full = 1; 5332 spin_unlock(&block_rsv->lock); 5333 } 5334 5335 int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info, 5336 struct btrfs_block_rsv *dest, u64 num_bytes, 5337 int min_factor) 5338 { 5339 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; 5340 u64 min_bytes; 5341 5342 if (global_rsv->space_info != dest->space_info) 5343 return -ENOSPC; 5344 5345 spin_lock(&global_rsv->lock); 5346 min_bytes = div_factor(global_rsv->size, min_factor); 5347 if (global_rsv->reserved < min_bytes + num_bytes) { 5348 spin_unlock(&global_rsv->lock); 5349 return -ENOSPC; 5350 } 5351 global_rsv->reserved -= num_bytes; 5352 if (global_rsv->reserved < global_rsv->size) 5353 global_rsv->full = 0; 5354 spin_unlock(&global_rsv->lock); 5355 5356 block_rsv_add_bytes(dest, num_bytes, 1); 5357 return 0; 5358 } 5359 5360 /* 5361 * This is for space we already have accounted in space_info->bytes_may_use, so 5362 * basically when we're returning space from block_rsv's. 
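* Any waiting reservation tickets (priority tickets first) are satisfied from these bytes; only the leftover is subtracted from bytes_may_use.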
5363 */ 5364 static void space_info_add_old_bytes(struct btrfs_fs_info *fs_info, 5365 struct btrfs_space_info *space_info, 5366 u64 num_bytes) 5367 { 5368 struct reserve_ticket *ticket; 5369 struct list_head *head; 5370 u64 used; 5371 enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_NO_FLUSH; 5372 bool check_overcommit = false; 5373 5374 spin_lock(&space_info->lock); 5375 head = &space_info->priority_tickets; 5376 5377 /* 5378 * If we are over our limit then we need to check and see if we can 5379 * overcommit, and if we can't then we just need to free up our space 5380 * and not satisfy any requests. 5381 */ 5382 used = space_info->bytes_used + space_info->bytes_reserved + 5383 space_info->bytes_pinned + space_info->bytes_readonly + 5384 space_info->bytes_may_use; 5385 if (used - num_bytes >= space_info->total_bytes) 5386 check_overcommit = true; 5387 again: 5388 while (!list_empty(head) && num_bytes) { 5389 ticket = list_first_entry(head, struct reserve_ticket, 5390 list); 5391 /* 5392 * We use 0 bytes because this space is already reserved, so 5393 * adding the ticket space would be a double count. 5394 */ 5395 if (check_overcommit && 5396 !can_overcommit(fs_info->extent_root, space_info, 0, 5397 flush)) 5398 break; 5399 if (num_bytes >= ticket->bytes) { 5400 list_del_init(&ticket->list); 5401 num_bytes -= ticket->bytes; 5402 ticket->bytes = 0; 5403 space_info->tickets_id++; 5404 wake_up(&ticket->wait); 5405 } else { 5406 ticket->bytes -= num_bytes; 5407 num_bytes = 0; 5408 } 5409 } 5410 5411 if (num_bytes && head == &space_info->priority_tickets) { 5412 head = &space_info->tickets; 5413 flush = BTRFS_RESERVE_FLUSH_ALL; 5414 goto again; 5415 } 5416 space_info->bytes_may_use -= num_bytes; 5417 trace_btrfs_space_reservation(fs_info, "space_info", 5418 space_info->flags, num_bytes, 0); 5419 spin_unlock(&space_info->lock); 5420 } 5421 5422 /* 5423 * This is for newly allocated space that isn't accounted in 5424 * space_info->bytes_may_use yet. So if we allocate a chunk or unpin an extent 5425 * we use this helper. 
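* Bytes handed to a ticket here are added to bytes_may_use on the ticket holder's behalf, unlike space_info_add_old_bytes() where they were already accounted for.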
5426 */ 5427 static void space_info_add_new_bytes(struct btrfs_fs_info *fs_info, 5428 struct btrfs_space_info *space_info, 5429 u64 num_bytes) 5430 { 5431 struct reserve_ticket *ticket; 5432 struct list_head *head = &space_info->priority_tickets; 5433 5434 again: 5435 while (!list_empty(head) && num_bytes) { 5436 ticket = list_first_entry(head, struct reserve_ticket, 5437 list); 5438 if (num_bytes >= ticket->bytes) { 5439 trace_btrfs_space_reservation(fs_info, "space_info", 5440 space_info->flags, 5441 ticket->bytes, 1); 5442 list_del_init(&ticket->list); 5443 num_bytes -= ticket->bytes; 5444 space_info->bytes_may_use += ticket->bytes; 5445 ticket->bytes = 0; 5446 space_info->tickets_id++; 5447 wake_up(&ticket->wait); 5448 } else { 5449 trace_btrfs_space_reservation(fs_info, "space_info", 5450 space_info->flags, 5451 num_bytes, 1); 5452 space_info->bytes_may_use += num_bytes; 5453 ticket->bytes -= num_bytes; 5454 num_bytes = 0; 5455 } 5456 } 5457 5458 if (num_bytes && head == &space_info->priority_tickets) { 5459 head = &space_info->tickets; 5460 goto again; 5461 } 5462 } 5463 5464 static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info, 5465 struct btrfs_block_rsv *block_rsv, 5466 struct btrfs_block_rsv *dest, u64 num_bytes) 5467 { 5468 struct btrfs_space_info *space_info = block_rsv->space_info; 5469 5470 spin_lock(&block_rsv->lock); 5471 if (num_bytes == (u64)-1) 5472 num_bytes = block_rsv->size; 5473 block_rsv->size -= num_bytes; 5474 if (block_rsv->reserved >= block_rsv->size) { 5475 num_bytes = block_rsv->reserved - block_rsv->size; 5476 block_rsv->reserved = block_rsv->size; 5477 block_rsv->full = 1; 5478 } else { 5479 num_bytes = 0; 5480 } 5481 spin_unlock(&block_rsv->lock); 5482 5483 if (num_bytes > 0) { 5484 if (dest) { 5485 spin_lock(&dest->lock); 5486 if (!dest->full) { 5487 u64 bytes_to_add; 5488 5489 bytes_to_add = dest->size - dest->reserved; 5490 bytes_to_add = min(num_bytes, bytes_to_add); 5491 dest->reserved += bytes_to_add; 5492 if (dest->reserved >= dest->size) 5493 dest->full = 1; 5494 num_bytes -= bytes_to_add; 5495 } 5496 spin_unlock(&dest->lock); 5497 } 5498 if (num_bytes) 5499 space_info_add_old_bytes(fs_info, space_info, 5500 num_bytes); 5501 } 5502 } 5503 5504 int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src, 5505 struct btrfs_block_rsv *dst, u64 num_bytes, 5506 int update_size) 5507 { 5508 int ret; 5509 5510 ret = block_rsv_use_bytes(src, num_bytes); 5511 if (ret) 5512 return ret; 5513 5514 block_rsv_add_bytes(dst, num_bytes, update_size); 5515 return 0; 5516 } 5517 5518 void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type) 5519 { 5520 memset(rsv, 0, sizeof(*rsv)); 5521 spin_lock_init(&rsv->lock); 5522 rsv->type = type; 5523 } 5524 5525 struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info, 5526 unsigned short type) 5527 { 5528 struct btrfs_block_rsv *block_rsv; 5529 5530 block_rsv = kmalloc(sizeof(*block_rsv), GFP_NOFS); 5531 if (!block_rsv) 5532 return NULL; 5533 5534 btrfs_init_block_rsv(block_rsv, type); 5535 block_rsv->space_info = __find_space_info(fs_info, 5536 BTRFS_BLOCK_GROUP_METADATA); 5537 return block_rsv; 5538 } 5539 5540 void btrfs_free_block_rsv(struct btrfs_fs_info *fs_info, 5541 struct btrfs_block_rsv *rsv) 5542 { 5543 if (!rsv) 5544 return; 5545 btrfs_block_rsv_release(fs_info, rsv, (u64)-1); 5546 kfree(rsv); 5547 } 5548 5549 void __btrfs_free_block_rsv(struct btrfs_block_rsv *rsv) 5550 { 5551 kfree(rsv); 5552 } 5553 5554 int btrfs_block_rsv_add(struct btrfs_root *root, 5555 struct 
btrfs_block_rsv *block_rsv, u64 num_bytes, 5556 enum btrfs_reserve_flush_enum flush) 5557 { 5558 int ret; 5559 5560 if (num_bytes == 0) 5561 return 0; 5562 5563 ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush); 5564 if (!ret) { 5565 block_rsv_add_bytes(block_rsv, num_bytes, 1); 5566 return 0; 5567 } 5568 5569 return ret; 5570 } 5571 5572 int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_factor) 5573 { 5574 u64 num_bytes = 0; 5575 int ret = -ENOSPC; 5576 5577 if (!block_rsv) 5578 return 0; 5579 5580 spin_lock(&block_rsv->lock); 5581 num_bytes = div_factor(block_rsv->size, min_factor); 5582 if (block_rsv->reserved >= num_bytes) 5583 ret = 0; 5584 spin_unlock(&block_rsv->lock); 5585 5586 return ret; 5587 } 5588 5589 int btrfs_block_rsv_refill(struct btrfs_root *root, 5590 struct btrfs_block_rsv *block_rsv, u64 min_reserved, 5591 enum btrfs_reserve_flush_enum flush) 5592 { 5593 u64 num_bytes = 0; 5594 int ret = -ENOSPC; 5595 5596 if (!block_rsv) 5597 return 0; 5598 5599 spin_lock(&block_rsv->lock); 5600 num_bytes = min_reserved; 5601 if (block_rsv->reserved >= num_bytes) 5602 ret = 0; 5603 else 5604 num_bytes -= block_rsv->reserved; 5605 spin_unlock(&block_rsv->lock); 5606 5607 if (!ret) 5608 return 0; 5609 5610 ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush); 5611 if (!ret) { 5612 block_rsv_add_bytes(block_rsv, num_bytes, 0); 5613 return 0; 5614 } 5615 5616 return ret; 5617 } 5618 5619 void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info, 5620 struct btrfs_block_rsv *block_rsv, 5621 u64 num_bytes) 5622 { 5623 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; 5624 5625 if (global_rsv == block_rsv || 5626 block_rsv->space_info != global_rsv->space_info) 5627 global_rsv = NULL; 5628 block_rsv_release_bytes(fs_info, block_rsv, global_rsv, num_bytes); 5629 } 5630 5631 static void update_global_block_rsv(struct btrfs_fs_info *fs_info) 5632 { 5633 struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv; 5634 struct btrfs_space_info *sinfo = block_rsv->space_info; 5635 u64 num_bytes; 5636 5637 /* 5638 * The global block rsv is based on the size of the extent tree, the 5639 * checksum tree and the root tree. If the fs is empty we want to set 5640 * it to a minimal amount for safety. 
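* Concretely: sum the bytes used by those three roots, clamp the result to at least 16M and at most 512M, then top up or trim the reserve to match the new size.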
5641 */ 5642 num_bytes = btrfs_root_used(&fs_info->extent_root->root_item) + 5643 btrfs_root_used(&fs_info->csum_root->root_item) + 5644 btrfs_root_used(&fs_info->tree_root->root_item); 5645 num_bytes = max_t(u64, num_bytes, SZ_16M); 5646 5647 spin_lock(&sinfo->lock); 5648 spin_lock(&block_rsv->lock); 5649 5650 block_rsv->size = min_t(u64, num_bytes, SZ_512M); 5651 5652 if (block_rsv->reserved < block_rsv->size) { 5653 num_bytes = btrfs_space_info_used(sinfo, true); 5654 if (sinfo->total_bytes > num_bytes) { 5655 num_bytes = sinfo->total_bytes - num_bytes; 5656 num_bytes = min(num_bytes, 5657 block_rsv->size - block_rsv->reserved); 5658 block_rsv->reserved += num_bytes; 5659 sinfo->bytes_may_use += num_bytes; 5660 trace_btrfs_space_reservation(fs_info, "space_info", 5661 sinfo->flags, num_bytes, 5662 1); 5663 } 5664 } else if (block_rsv->reserved > block_rsv->size) { 5665 num_bytes = block_rsv->reserved - block_rsv->size; 5666 sinfo->bytes_may_use -= num_bytes; 5667 trace_btrfs_space_reservation(fs_info, "space_info", 5668 sinfo->flags, num_bytes, 0); 5669 block_rsv->reserved = block_rsv->size; 5670 } 5671 5672 if (block_rsv->reserved == block_rsv->size) 5673 block_rsv->full = 1; 5674 else 5675 block_rsv->full = 0; 5676 5677 spin_unlock(&block_rsv->lock); 5678 spin_unlock(&sinfo->lock); 5679 } 5680 5681 static void init_global_block_rsv(struct btrfs_fs_info *fs_info) 5682 { 5683 struct btrfs_space_info *space_info; 5684 5685 space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM); 5686 fs_info->chunk_block_rsv.space_info = space_info; 5687 5688 space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); 5689 fs_info->global_block_rsv.space_info = space_info; 5690 fs_info->delalloc_block_rsv.space_info = space_info; 5691 fs_info->trans_block_rsv.space_info = space_info; 5692 fs_info->empty_block_rsv.space_info = space_info; 5693 fs_info->delayed_block_rsv.space_info = space_info; 5694 5695 fs_info->extent_root->block_rsv = &fs_info->global_block_rsv; 5696 fs_info->csum_root->block_rsv = &fs_info->global_block_rsv; 5697 fs_info->dev_root->block_rsv = &fs_info->global_block_rsv; 5698 fs_info->tree_root->block_rsv = &fs_info->global_block_rsv; 5699 if (fs_info->quota_root) 5700 fs_info->quota_root->block_rsv = &fs_info->global_block_rsv; 5701 fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv; 5702 5703 update_global_block_rsv(fs_info); 5704 } 5705 5706 static void release_global_block_rsv(struct btrfs_fs_info *fs_info) 5707 { 5708 block_rsv_release_bytes(fs_info, &fs_info->global_block_rsv, NULL, 5709 (u64)-1); 5710 WARN_ON(fs_info->delalloc_block_rsv.size > 0); 5711 WARN_ON(fs_info->delalloc_block_rsv.reserved > 0); 5712 WARN_ON(fs_info->trans_block_rsv.size > 0); 5713 WARN_ON(fs_info->trans_block_rsv.reserved > 0); 5714 WARN_ON(fs_info->chunk_block_rsv.size > 0); 5715 WARN_ON(fs_info->chunk_block_rsv.reserved > 0); 5716 WARN_ON(fs_info->delayed_block_rsv.size > 0); 5717 WARN_ON(fs_info->delayed_block_rsv.reserved > 0); 5718 } 5719 5720 void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, 5721 struct btrfs_fs_info *fs_info) 5722 { 5723 if (!trans->block_rsv) 5724 return; 5725 5726 if (!trans->bytes_reserved) 5727 return; 5728 5729 trace_btrfs_space_reservation(fs_info, "transaction", 5730 trans->transid, trans->bytes_reserved, 0); 5731 btrfs_block_rsv_release(fs_info, trans->block_rsv, 5732 trans->bytes_reserved); 5733 trans->bytes_reserved = 0; 5734 } 5735 5736 /* 5737 * To be called after all the new block groups attached to the transaction 5738 * 
handle have been created (btrfs_create_pending_block_groups()). 5739 */ 5740 void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans) 5741 { 5742 struct btrfs_fs_info *fs_info = trans->fs_info; 5743 5744 if (!trans->chunk_bytes_reserved) 5745 return; 5746 5747 WARN_ON_ONCE(!list_empty(&trans->new_bgs)); 5748 5749 block_rsv_release_bytes(fs_info, &fs_info->chunk_block_rsv, NULL, 5750 trans->chunk_bytes_reserved); 5751 trans->chunk_bytes_reserved = 0; 5752 } 5753 5754 /* Can only return 0 or -ENOSPC */ 5755 int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans, 5756 struct btrfs_inode *inode) 5757 { 5758 struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); 5759 struct btrfs_root *root = inode->root; 5760 /* 5761 * We always use trans->block_rsv here as we will have reserved space 5762 * for our orphan when starting the transaction; using get_block_rsv() 5763 * here would sometimes make us choose the wrong block rsv, as we could be 5764 * doing a reloc inode for a non-refcounted root. 5765 */ 5766 struct btrfs_block_rsv *src_rsv = trans->block_rsv; 5767 struct btrfs_block_rsv *dst_rsv = root->orphan_block_rsv; 5768 5769 /* 5770 * We need to hold space in order to delete our orphan item once we've 5771 * added it, so this takes the reservation so we can release it later 5772 * when we are truly done with the orphan item. 5773 */ 5774 u64 num_bytes = btrfs_calc_trans_metadata_size(fs_info, 1); 5775 5776 trace_btrfs_space_reservation(fs_info, "orphan", btrfs_ino(inode), 5777 num_bytes, 1); 5778 return btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, 1); 5779 } 5780 5781 void btrfs_orphan_release_metadata(struct btrfs_inode *inode) 5782 { 5783 struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); 5784 struct btrfs_root *root = inode->root; 5785 u64 num_bytes = btrfs_calc_trans_metadata_size(fs_info, 1); 5786 5787 trace_btrfs_space_reservation(fs_info, "orphan", btrfs_ino(inode), 5788 num_bytes, 0); 5789 btrfs_block_rsv_release(fs_info, root->orphan_block_rsv, num_bytes); 5790 } 5791 5792 /* 5793 * btrfs_subvolume_reserve_metadata() - reserve space for subvolume operation 5794 * root: the root of the parent directory 5795 * rsv: block reservation 5796 * items: the number of items that we need to reserve space for 5797 * qgroup_reserved: used to return the size reserved in the qgroup 5798 * 5799 * This function is used to reserve the space for snapshot/subvolume 5800 * creation and deletion. Those operations differ from the common 5801 * file/directory operations: they change two fs/file trees 5802 * and the root tree, and the number of items the qgroup reserves is 5803 * different from the free space reservation. So we cannot use 5804 * the space reservation mechanism in start_transaction(). 
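* With quotas enabled we first reserve qgroup metadata for the parent inode and the two dir entries (3 * nodesize) and return that amount in *qgroup_reserved; the block rsv itself is then filled from the metadata space_info, optionally falling back to the global rsv on ENOSPC.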
5805 */ 5806 int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, 5807 struct btrfs_block_rsv *rsv, 5808 int items, 5809 u64 *qgroup_reserved, 5810 bool use_global_rsv) 5811 { 5812 u64 num_bytes; 5813 int ret; 5814 struct btrfs_fs_info *fs_info = root->fs_info; 5815 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; 5816 5817 if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) { 5818 /* One for parent inode, two for dir entries */ 5819 num_bytes = 3 * fs_info->nodesize; 5820 ret = btrfs_qgroup_reserve_meta(root, num_bytes, true); 5821 if (ret) 5822 return ret; 5823 } else { 5824 num_bytes = 0; 5825 } 5826 5827 *qgroup_reserved = num_bytes; 5828 5829 num_bytes = btrfs_calc_trans_metadata_size(fs_info, items); 5830 rsv->space_info = __find_space_info(fs_info, 5831 BTRFS_BLOCK_GROUP_METADATA); 5832 ret = btrfs_block_rsv_add(root, rsv, num_bytes, 5833 BTRFS_RESERVE_FLUSH_ALL); 5834 5835 if (ret == -ENOSPC && use_global_rsv) 5836 ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes, 1); 5837 5838 if (ret && *qgroup_reserved) 5839 btrfs_qgroup_free_meta(root, *qgroup_reserved); 5840 5841 return ret; 5842 } 5843 5844 void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info, 5845 struct btrfs_block_rsv *rsv) 5846 { 5847 btrfs_block_rsv_release(fs_info, rsv, (u64)-1); 5848 } 5849 5850 /** 5851 * drop_outstanding_extent - drop an outstanding extent 5852 * @inode: the inode we're dropping the extent for 5853 * @num_bytes: the number of bytes we're releasing. 5854 * 5855 * This is called when we are freeing up an outstanding extent, either called 5856 * after an error or after an extent is written. This will return the number of 5857 * reserved extents that need to be freed. This must be called with 5858 * BTRFS_I(inode)->lock held. 5859 */ 5860 static unsigned drop_outstanding_extent(struct btrfs_inode *inode, 5861 u64 num_bytes) 5862 { 5863 unsigned drop_inode_space = 0; 5864 unsigned dropped_extents = 0; 5865 unsigned num_extents; 5866 5867 num_extents = count_max_extents(num_bytes); 5868 ASSERT(num_extents); 5869 ASSERT(inode->outstanding_extents >= num_extents); 5870 inode->outstanding_extents -= num_extents; 5871 5872 if (inode->outstanding_extents == 0 && 5873 test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED, 5874 &inode->runtime_flags)) 5875 drop_inode_space = 1; 5876 5877 /* 5878 * If we have more or the same amount of outstanding extents than we have 5879 * reserved then we need to leave the reserved extents count alone. 5880 */ 5881 if (inode->outstanding_extents >= inode->reserved_extents) 5882 return drop_inode_space; 5883 5884 dropped_extents = inode->reserved_extents - inode->outstanding_extents; 5885 inode->reserved_extents -= dropped_extents; 5886 return dropped_extents + drop_inode_space; 5887 } 5888 5889 /** 5890 * calc_csum_metadata_size - return the amount of metadata space that must be 5891 * reserved/freed for the given bytes. 5892 * @inode: the inode we're manipulating 5893 * @num_bytes: the number of bytes in question 5894 * @reserve: 1 if we are reserving space, 0 if we are freeing space 5895 * 5896 * This adjusts the number of csum_bytes in the inode and then returns the 5897 * correct amount of metadata that must either be reserved or freed. We 5898 * calculate how many checksums we can fit into one leaf and then divide the 5899 * number of bytes that will need to be checksumed by this value to figure out 5900 * how many checksums will be required. 
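 * For example, if adding @num_bytes pushes ->csum_bytes across a leaf
 * boundary so that one more leaf of csum items is needed, we return
 * btrfs_calc_trans_metadata_size(fs_info, 1); if the leaf count does not
 * change we return 0.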
If we are adding bytes then the number 5901 * may go up and we will return the number of additional bytes that must be 5902 * reserved. If it is going down we will return the number of bytes that must 5903 * be freed. 5904 * 5905 * This must be called with BTRFS_I(inode)->lock held. 5906 */ 5907 static u64 calc_csum_metadata_size(struct btrfs_inode *inode, u64 num_bytes, 5908 int reserve) 5909 { 5910 struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); 5911 u64 old_csums, num_csums; 5912 5913 if (inode->flags & BTRFS_INODE_NODATASUM && inode->csum_bytes == 0) 5914 return 0; 5915 5916 old_csums = btrfs_csum_bytes_to_leaves(fs_info, inode->csum_bytes); 5917 if (reserve) 5918 inode->csum_bytes += num_bytes; 5919 else 5920 inode->csum_bytes -= num_bytes; 5921 num_csums = btrfs_csum_bytes_to_leaves(fs_info, inode->csum_bytes); 5922 5923 /* No change, no need to reserve more */ 5924 if (old_csums == num_csums) 5925 return 0; 5926 5927 if (reserve) 5928 return btrfs_calc_trans_metadata_size(fs_info, 5929 num_csums - old_csums); 5930 5931 return btrfs_calc_trans_metadata_size(fs_info, old_csums - num_csums); 5932 } 5933 5934 int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes) 5935 { 5936 struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); 5937 struct btrfs_root *root = inode->root; 5938 struct btrfs_block_rsv *block_rsv = &fs_info->delalloc_block_rsv; 5939 u64 to_reserve = 0; 5940 u64 csum_bytes; 5941 unsigned nr_extents; 5942 enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL; 5943 int ret = 0; 5944 bool delalloc_lock = true; 5945 u64 to_free = 0; 5946 unsigned dropped; 5947 bool release_extra = false; 5948 5949 /* If we are a free space inode we need to not flush since we will be in 5950 * the middle of a transaction commit. We also don't need the delalloc 5951 * mutex since we won't race with anybody. We need this mostly to make 5952 * lockdep shut its filthy mouth. 5953 * 5954 * If we have a transaction open (can happen if we call truncate_block 5955 * from truncate), then we need FLUSH_LIMIT so we don't deadlock. 5956 */ 5957 if (btrfs_is_free_space_inode(inode)) { 5958 flush = BTRFS_RESERVE_NO_FLUSH; 5959 delalloc_lock = false; 5960 } else if (current->journal_info) { 5961 flush = BTRFS_RESERVE_FLUSH_LIMIT; 5962 } 5963 5964 if (flush != BTRFS_RESERVE_NO_FLUSH && 5965 btrfs_transaction_in_commit(fs_info)) 5966 schedule_timeout(1); 5967 5968 if (delalloc_lock) 5969 mutex_lock(&inode->delalloc_mutex); 5970 5971 num_bytes = ALIGN(num_bytes, fs_info->sectorsize); 5972 5973 spin_lock(&inode->lock); 5974 nr_extents = count_max_extents(num_bytes); 5975 inode->outstanding_extents += nr_extents; 5976 5977 nr_extents = 0; 5978 if (inode->outstanding_extents > inode->reserved_extents) 5979 nr_extents += inode->outstanding_extents - 5980 inode->reserved_extents; 5981 5982 /* We always want to reserve a slot for updating the inode. 
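 * For example, a buffered write that adds two outstanding extents which
 * are not yet covered by reserved_extents ends up with nr_extents == 2
 * and reserves metadata for 2 + 1 = 3 items here, plus whatever the csum
 * calculation below adds on top.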
*/ 5983 to_reserve = btrfs_calc_trans_metadata_size(fs_info, nr_extents + 1); 5984 to_reserve += calc_csum_metadata_size(inode, num_bytes, 1); 5985 csum_bytes = inode->csum_bytes; 5986 spin_unlock(&inode->lock); 5987 5988 if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) { 5989 ret = btrfs_qgroup_reserve_meta(root, 5990 nr_extents * fs_info->nodesize, true); 5991 if (ret) 5992 goto out_fail; 5993 } 5994 5995 ret = btrfs_block_rsv_add(root, block_rsv, to_reserve, flush); 5996 if (unlikely(ret)) { 5997 btrfs_qgroup_free_meta(root, 5998 nr_extents * fs_info->nodesize); 5999 goto out_fail; 6000 } 6001 6002 spin_lock(&inode->lock); 6003 if (test_and_set_bit(BTRFS_INODE_DELALLOC_META_RESERVED, 6004 &inode->runtime_flags)) { 6005 to_reserve -= btrfs_calc_trans_metadata_size(fs_info, 1); 6006 release_extra = true; 6007 } 6008 inode->reserved_extents += nr_extents; 6009 spin_unlock(&inode->lock); 6010 6011 if (delalloc_lock) 6012 mutex_unlock(&inode->delalloc_mutex); 6013 6014 if (to_reserve) 6015 trace_btrfs_space_reservation(fs_info, "delalloc", 6016 btrfs_ino(inode), to_reserve, 1); 6017 if (release_extra) 6018 btrfs_block_rsv_release(fs_info, block_rsv, 6019 btrfs_calc_trans_metadata_size(fs_info, 1)); 6020 return 0; 6021 6022 out_fail: 6023 spin_lock(&inode->lock); 6024 dropped = drop_outstanding_extent(inode, num_bytes); 6025 /* 6026 * If the inodes csum_bytes is the same as the original 6027 * csum_bytes then we know we haven't raced with any free()ers 6028 * so we can just reduce our inodes csum bytes and carry on. 6029 */ 6030 if (inode->csum_bytes == csum_bytes) { 6031 calc_csum_metadata_size(inode, num_bytes, 0); 6032 } else { 6033 u64 orig_csum_bytes = inode->csum_bytes; 6034 u64 bytes; 6035 6036 /* 6037 * This is tricky, but first we need to figure out how much we 6038 * freed from any free-ers that occurred during this 6039 * reservation, so we reset ->csum_bytes to the csum_bytes 6040 * before we dropped our lock, and then call the free for the 6041 * number of bytes that were freed while we were trying our 6042 * reservation. 6043 */ 6044 bytes = csum_bytes - inode->csum_bytes; 6045 inode->csum_bytes = csum_bytes; 6046 to_free = calc_csum_metadata_size(inode, bytes, 0); 6047 6048 6049 /* 6050 * Now we need to see how much we would have freed had we not 6051 * been making this reservation and our ->csum_bytes were not 6052 * artificially inflated. 6053 */ 6054 inode->csum_bytes = csum_bytes - num_bytes; 6055 bytes = csum_bytes - orig_csum_bytes; 6056 bytes = calc_csum_metadata_size(inode, bytes, 0); 6057 6058 /* 6059 * Now reset ->csum_bytes to what it should be. If bytes is 6060 * more than to_free then we would have freed more space had we 6061 * not had an artificially high ->csum_bytes, so we need to free 6062 * the remainder. If bytes is the same or less then we don't 6063 * need to do anything, the other free-ers did the correct 6064 * thing. 
6065 */ 6066 inode->csum_bytes = orig_csum_bytes - num_bytes; 6067 if (bytes > to_free) 6068 to_free = bytes - to_free; 6069 else 6070 to_free = 0; 6071 } 6072 spin_unlock(&inode->lock); 6073 if (dropped) 6074 to_free += btrfs_calc_trans_metadata_size(fs_info, dropped); 6075 6076 if (to_free) { 6077 btrfs_block_rsv_release(fs_info, block_rsv, to_free); 6078 trace_btrfs_space_reservation(fs_info, "delalloc", 6079 btrfs_ino(inode), to_free, 0); 6080 } 6081 if (delalloc_lock) 6082 mutex_unlock(&inode->delalloc_mutex); 6083 return ret; 6084 }
6085 6086 /** 6087 * btrfs_delalloc_release_metadata - release a metadata reservation for an inode 6088 * @inode: the inode to release the reservation for 6089 * @num_bytes: the number of bytes we're releasing 6090 * 6091 * This will release the metadata reservation for an inode. This can be called 6092 * once we complete IO for a given set of bytes to release their metadata 6093 * reservations. 6094 */ 6095 void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes) 6096 { 6097 struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); 6098 u64 to_free = 0; 6099 unsigned dropped; 6100 6101 num_bytes = ALIGN(num_bytes, fs_info->sectorsize); 6102 spin_lock(&inode->lock); 6103 dropped = drop_outstanding_extent(inode, num_bytes); 6104 6105 if (num_bytes) 6106 to_free = calc_csum_metadata_size(inode, num_bytes, 0); 6107 spin_unlock(&inode->lock); 6108 if (dropped > 0) 6109 to_free += btrfs_calc_trans_metadata_size(fs_info, dropped); 6110 6111 if (btrfs_is_testing(fs_info)) 6112 return; 6113 6114 trace_btrfs_space_reservation(fs_info, "delalloc", btrfs_ino(inode), 6115 to_free, 0); 6116 6117 btrfs_block_rsv_release(fs_info, &fs_info->delalloc_block_rsv, to_free); 6118 }
6119 6120 /** 6121 * btrfs_delalloc_reserve_space - reserve data and metadata space for 6122 * delalloc 6123 * @inode: inode we're writing to 6124 * @start: start range we are writing to 6125 * @len: the length of the range we are writing to 6126 * 6127 * This will do the following things: 6128 * 6129 * o reserve space in data space info for num bytes 6130 * and reserve the corresponding qgroup space 6131 * (Done in check_data_free_space) 6132 * 6133 * o reserve space for metadata space, based on the number of outstanding 6134 * extents and how many csums will be needed 6135 * also reserve metadata space in a per root over-reserve method. 6136 * o add to the inodes->delalloc_bytes 6137 * o add it to the fs_info's delalloc inodes list. 6138 * (Above 3 all done in delalloc_reserve_metadata) 6139 * 6140 * Return 0 for success 6141 * Return <0 for error (-ENOSPC or -EDQUOT) 6142 */ 6143 int btrfs_delalloc_reserve_space(struct inode *inode, u64 start, u64 len) 6144 { 6145 int ret; 6146 6147 ret = btrfs_check_data_free_space(inode, start, len); 6148 if (ret < 0) 6149 return ret; 6150 ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len); 6151 if (ret < 0) 6152 btrfs_free_reserved_data_space(inode, start, len); 6153 return ret; 6154 }
6155 6156 /** 6157 * btrfs_delalloc_release_space - release data and metadata space for delalloc 6158 * @inode: inode we're releasing space for 6159 * @start: start position of the space already reserved 6160 * @len: the length of the space already reserved 6161 * 6162 * This must be matched with a call to btrfs_delalloc_reserve_space. This is 6163 * called in the case that we don't need the metadata AND data reservations 6164 * anymore, for example if there is an error or we insert an inline extent.
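 *
 * A minimal sketch of the usual pairing, assuming hypothetical 'pos' and
 * 'write_bytes' variables in a write path:
 *
 *	ret = btrfs_delalloc_reserve_space(inode, pos, write_bytes);
 *	if (ret)
 *		return ret;
 *	ret = copy_the_data(...);	(hypothetical helper)
 *	if (ret)
 *		btrfs_delalloc_release_space(inode, pos, write_bytes);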
6165 * 6166 * This function will release the metadata space that was not used and will 6167 * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes 6168 * list if there are no delalloc bytes left. 6169 * Also it will handle the qgroup reserved space. 6170 */ 6171 void btrfs_delalloc_release_space(struct inode *inode, u64 start, u64 len) 6172 { 6173 btrfs_delalloc_release_metadata(BTRFS_I(inode), len); 6174 btrfs_free_reserved_data_space(inode, start, len); 6175 } 6176 6177 static int update_block_group(struct btrfs_trans_handle *trans, 6178 struct btrfs_fs_info *info, u64 bytenr, 6179 u64 num_bytes, int alloc) 6180 { 6181 struct btrfs_block_group_cache *cache = NULL; 6182 u64 total = num_bytes; 6183 u64 old_val; 6184 u64 byte_in_group; 6185 int factor; 6186 6187 /* block accounting for super block */ 6188 spin_lock(&info->delalloc_root_lock); 6189 old_val = btrfs_super_bytes_used(info->super_copy); 6190 if (alloc) 6191 old_val += num_bytes; 6192 else 6193 old_val -= num_bytes; 6194 btrfs_set_super_bytes_used(info->super_copy, old_val); 6195 spin_unlock(&info->delalloc_root_lock); 6196 6197 while (total) { 6198 cache = btrfs_lookup_block_group(info, bytenr); 6199 if (!cache) 6200 return -ENOENT; 6201 if (cache->flags & (BTRFS_BLOCK_GROUP_DUP | 6202 BTRFS_BLOCK_GROUP_RAID1 | 6203 BTRFS_BLOCK_GROUP_RAID10)) 6204 factor = 2; 6205 else 6206 factor = 1; 6207 /* 6208 * If this block group has free space cache written out, we 6209 * need to make sure to load it if we are removing space. This 6210 * is because we need the unpinning stage to actually add the 6211 * space back to the block group, otherwise we will leak space. 6212 */ 6213 if (!alloc && cache->cached == BTRFS_CACHE_NO) 6214 cache_block_group(cache, 1); 6215 6216 byte_in_group = bytenr - cache->key.objectid; 6217 WARN_ON(byte_in_group > cache->key.offset); 6218 6219 spin_lock(&cache->space_info->lock); 6220 spin_lock(&cache->lock); 6221 6222 if (btrfs_test_opt(info, SPACE_CACHE) && 6223 cache->disk_cache_state < BTRFS_DC_CLEAR) 6224 cache->disk_cache_state = BTRFS_DC_CLEAR; 6225 6226 old_val = btrfs_block_group_used(&cache->item); 6227 num_bytes = min(total, cache->key.offset - byte_in_group); 6228 if (alloc) { 6229 old_val += num_bytes; 6230 btrfs_set_block_group_used(&cache->item, old_val); 6231 cache->reserved -= num_bytes; 6232 cache->space_info->bytes_reserved -= num_bytes; 6233 cache->space_info->bytes_used += num_bytes; 6234 cache->space_info->disk_used += num_bytes * factor; 6235 spin_unlock(&cache->lock); 6236 spin_unlock(&cache->space_info->lock); 6237 } else { 6238 old_val -= num_bytes; 6239 btrfs_set_block_group_used(&cache->item, old_val); 6240 cache->pinned += num_bytes; 6241 cache->space_info->bytes_pinned += num_bytes; 6242 cache->space_info->bytes_used -= num_bytes; 6243 cache->space_info->disk_used -= num_bytes * factor; 6244 spin_unlock(&cache->lock); 6245 spin_unlock(&cache->space_info->lock); 6246 6247 trace_btrfs_space_reservation(info, "pinned", 6248 cache->space_info->flags, 6249 num_bytes, 1); 6250 set_extent_dirty(info->pinned_extents, 6251 bytenr, bytenr + num_bytes - 1, 6252 GFP_NOFS | __GFP_NOFAIL); 6253 } 6254 6255 spin_lock(&trans->transaction->dirty_bgs_lock); 6256 if (list_empty(&cache->dirty_list)) { 6257 list_add_tail(&cache->dirty_list, 6258 &trans->transaction->dirty_bgs); 6259 trans->transaction->num_dirty_bgs++; 6260 btrfs_get_block_group(cache); 6261 } 6262 spin_unlock(&trans->transaction->dirty_bgs_lock); 6263 6264 /* 6265 * No longer have used bytes in this block group, 
queue it for 6266 * deletion. We do this after adding the block group to the 6267 * dirty list to avoid races between cleaner kthread and space 6268 * cache writeout. 6269 */ 6270 if (!alloc && old_val == 0) { 6271 spin_lock(&info->unused_bgs_lock); 6272 if (list_empty(&cache->bg_list)) { 6273 btrfs_get_block_group(cache); 6274 list_add_tail(&cache->bg_list, 6275 &info->unused_bgs); 6276 } 6277 spin_unlock(&info->unused_bgs_lock); 6278 } 6279 6280 btrfs_put_block_group(cache); 6281 total -= num_bytes; 6282 bytenr += num_bytes; 6283 } 6284 return 0; 6285 } 6286 6287 static u64 first_logical_byte(struct btrfs_fs_info *fs_info, u64 search_start) 6288 { 6289 struct btrfs_block_group_cache *cache; 6290 u64 bytenr; 6291 6292 spin_lock(&fs_info->block_group_cache_lock); 6293 bytenr = fs_info->first_logical_byte; 6294 spin_unlock(&fs_info->block_group_cache_lock); 6295 6296 if (bytenr < (u64)-1) 6297 return bytenr; 6298 6299 cache = btrfs_lookup_first_block_group(fs_info, search_start); 6300 if (!cache) 6301 return 0; 6302 6303 bytenr = cache->key.objectid; 6304 btrfs_put_block_group(cache); 6305 6306 return bytenr; 6307 } 6308 6309 static int pin_down_extent(struct btrfs_fs_info *fs_info, 6310 struct btrfs_block_group_cache *cache, 6311 u64 bytenr, u64 num_bytes, int reserved) 6312 { 6313 spin_lock(&cache->space_info->lock); 6314 spin_lock(&cache->lock); 6315 cache->pinned += num_bytes; 6316 cache->space_info->bytes_pinned += num_bytes; 6317 if (reserved) { 6318 cache->reserved -= num_bytes; 6319 cache->space_info->bytes_reserved -= num_bytes; 6320 } 6321 spin_unlock(&cache->lock); 6322 spin_unlock(&cache->space_info->lock); 6323 6324 trace_btrfs_space_reservation(fs_info, "pinned", 6325 cache->space_info->flags, num_bytes, 1); 6326 set_extent_dirty(fs_info->pinned_extents, bytenr, 6327 bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL); 6328 return 0; 6329 } 6330 6331 /* 6332 * this function must be called within transaction 6333 */ 6334 int btrfs_pin_extent(struct btrfs_fs_info *fs_info, 6335 u64 bytenr, u64 num_bytes, int reserved) 6336 { 6337 struct btrfs_block_group_cache *cache; 6338 6339 cache = btrfs_lookup_block_group(fs_info, bytenr); 6340 BUG_ON(!cache); /* Logic error */ 6341 6342 pin_down_extent(fs_info, cache, bytenr, num_bytes, reserved); 6343 6344 btrfs_put_block_group(cache); 6345 return 0; 6346 } 6347 6348 /* 6349 * this function must be called within transaction 6350 */ 6351 int btrfs_pin_extent_for_log_replay(struct btrfs_fs_info *fs_info, 6352 u64 bytenr, u64 num_bytes) 6353 { 6354 struct btrfs_block_group_cache *cache; 6355 int ret; 6356 6357 cache = btrfs_lookup_block_group(fs_info, bytenr); 6358 if (!cache) 6359 return -EINVAL; 6360 6361 /* 6362 * pull in the free space cache (if any) so that our pin 6363 * removes the free space from the cache. We have load_only set 6364 * to one because the slow code to read in the free extents does check 6365 * the pinned extents. 
6366 */ 6367 cache_block_group(cache, 1); 6368 6369 pin_down_extent(fs_info, cache, bytenr, num_bytes, 0); 6370 6371 /* remove us from the free space cache (if we're there at all) */ 6372 ret = btrfs_remove_free_space(cache, bytenr, num_bytes); 6373 btrfs_put_block_group(cache); 6374 return ret; 6375 } 6376 6377 static int __exclude_logged_extent(struct btrfs_fs_info *fs_info, 6378 u64 start, u64 num_bytes) 6379 { 6380 int ret; 6381 struct btrfs_block_group_cache *block_group; 6382 struct btrfs_caching_control *caching_ctl; 6383 6384 block_group = btrfs_lookup_block_group(fs_info, start); 6385 if (!block_group) 6386 return -EINVAL; 6387 6388 cache_block_group(block_group, 0); 6389 caching_ctl = get_caching_control(block_group); 6390 6391 if (!caching_ctl) { 6392 /* Logic error */ 6393 BUG_ON(!block_group_cache_done(block_group)); 6394 ret = btrfs_remove_free_space(block_group, start, num_bytes); 6395 } else { 6396 mutex_lock(&caching_ctl->mutex); 6397 6398 if (start >= caching_ctl->progress) { 6399 ret = add_excluded_extent(fs_info, start, num_bytes); 6400 } else if (start + num_bytes <= caching_ctl->progress) { 6401 ret = btrfs_remove_free_space(block_group, 6402 start, num_bytes); 6403 } else { 6404 num_bytes = caching_ctl->progress - start; 6405 ret = btrfs_remove_free_space(block_group, 6406 start, num_bytes); 6407 if (ret) 6408 goto out_lock; 6409 6410 num_bytes = (start + num_bytes) - 6411 caching_ctl->progress; 6412 start = caching_ctl->progress; 6413 ret = add_excluded_extent(fs_info, start, num_bytes); 6414 } 6415 out_lock: 6416 mutex_unlock(&caching_ctl->mutex); 6417 put_caching_control(caching_ctl); 6418 } 6419 btrfs_put_block_group(block_group); 6420 return ret; 6421 } 6422 6423 int btrfs_exclude_logged_extents(struct btrfs_fs_info *fs_info, 6424 struct extent_buffer *eb) 6425 { 6426 struct btrfs_file_extent_item *item; 6427 struct btrfs_key key; 6428 int found_type; 6429 int i; 6430 6431 if (!btrfs_fs_incompat(fs_info, MIXED_GROUPS)) 6432 return 0; 6433 6434 for (i = 0; i < btrfs_header_nritems(eb); i++) { 6435 btrfs_item_key_to_cpu(eb, &key, i); 6436 if (key.type != BTRFS_EXTENT_DATA_KEY) 6437 continue; 6438 item = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item); 6439 found_type = btrfs_file_extent_type(eb, item); 6440 if (found_type == BTRFS_FILE_EXTENT_INLINE) 6441 continue; 6442 if (btrfs_file_extent_disk_bytenr(eb, item) == 0) 6443 continue; 6444 key.objectid = btrfs_file_extent_disk_bytenr(eb, item); 6445 key.offset = btrfs_file_extent_disk_num_bytes(eb, item); 6446 __exclude_logged_extent(fs_info, key.objectid, key.offset); 6447 } 6448 6449 return 0; 6450 } 6451 6452 static void 6453 btrfs_inc_block_group_reservations(struct btrfs_block_group_cache *bg) 6454 { 6455 atomic_inc(&bg->reservations); 6456 } 6457 6458 void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info, 6459 const u64 start) 6460 { 6461 struct btrfs_block_group_cache *bg; 6462 6463 bg = btrfs_lookup_block_group(fs_info, start); 6464 ASSERT(bg); 6465 if (atomic_dec_and_test(&bg->reservations)) 6466 wake_up_atomic_t(&bg->reservations); 6467 btrfs_put_block_group(bg); 6468 } 6469 6470 static int btrfs_wait_bg_reservations_atomic_t(atomic_t *a) 6471 { 6472 schedule(); 6473 return 0; 6474 } 6475 6476 void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg) 6477 { 6478 struct btrfs_space_info *space_info = bg->space_info; 6479 6480 ASSERT(bg->ro); 6481 6482 if (!(bg->flags & BTRFS_BLOCK_GROUP_DATA)) 6483 return; 6484 6485 /* 6486 * Our block group is read only but before 
we set it to read only, 6487 * some task might have allocated an extent from it already, but it 6488 * has not yet created a respective ordered extent (and added it to a 6489 * root's list of ordered extents). 6490 * Therefore wait for any task currently allocating extents, since the 6491 * block group's reservations counter is incremented while a read lock 6492 * on the groups' semaphore is held and decremented after releasing 6493 * the read access on that semaphore and creating the ordered extent. 6494 */ 6495 down_write(&space_info->groups_sem); 6496 up_write(&space_info->groups_sem); 6497 6498 wait_on_atomic_t(&bg->reservations, 6499 btrfs_wait_bg_reservations_atomic_t, 6500 TASK_UNINTERRUPTIBLE); 6501 }
6502 6503 /** 6504 * btrfs_add_reserved_bytes - update the block_group and space info counters 6505 * @cache: The cache we are manipulating 6506 * @ram_bytes: The number of bytes of file content, and will be the same as 6507 * @num_bytes except for the compress path. 6508 * @num_bytes: The number of bytes in question 6509 * @delalloc: The blocks are allocated for the delalloc write 6510 * 6511 * This is called by the allocator when it reserves space. If this is a 6512 * reservation and the block group has become read only we cannot make the 6513 * reservation and return -EAGAIN, otherwise this function always succeeds. 6514 */ 6515 static int btrfs_add_reserved_bytes(struct btrfs_block_group_cache *cache, 6516 u64 ram_bytes, u64 num_bytes, int delalloc) 6517 { 6518 struct btrfs_space_info *space_info = cache->space_info; 6519 int ret = 0; 6520 6521 spin_lock(&space_info->lock); 6522 spin_lock(&cache->lock); 6523 if (cache->ro) { 6524 ret = -EAGAIN; 6525 } else { 6526 cache->reserved += num_bytes; 6527 space_info->bytes_reserved += num_bytes; 6528 6529 trace_btrfs_space_reservation(cache->fs_info, 6530 "space_info", space_info->flags, 6531 ram_bytes, 0); 6532 space_info->bytes_may_use -= ram_bytes; 6533 if (delalloc) 6534 cache->delalloc_bytes += num_bytes; 6535 } 6536 spin_unlock(&cache->lock); 6537 spin_unlock(&space_info->lock); 6538 return ret; 6539 }
6540 6541 /** 6542 * btrfs_free_reserved_bytes - update the block_group and space info counters 6543 * @cache: The cache we are manipulating 6544 * @num_bytes: The number of bytes in question 6545 * @delalloc: The blocks are allocated for the delalloc write 6546 * 6547 * This is called by somebody who is freeing space that was never actually used 6548 * on disk. For example, if you reserve some space for a new leaf in transaction 6549 * A and free that leaf before transaction A commits, you call this 6550 * to clear the reservation.
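 *
 * Illustrative pairing (sketch only), e.g. space reserved for a new tree
 * block that ends up never being written:
 *
 *	ret = btrfs_add_reserved_bytes(cache, num_bytes, num_bytes, 0);
 *	... the allocation is backed out before anything hits disk ...
 *	btrfs_free_reserved_bytes(cache, num_bytes, 0);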
6551 */ 6552 6553 static int btrfs_free_reserved_bytes(struct btrfs_block_group_cache *cache, 6554 u64 num_bytes, int delalloc) 6555 { 6556 struct btrfs_space_info *space_info = cache->space_info; 6557 int ret = 0; 6558 6559 spin_lock(&space_info->lock); 6560 spin_lock(&cache->lock); 6561 if (cache->ro) 6562 space_info->bytes_readonly += num_bytes; 6563 cache->reserved -= num_bytes; 6564 space_info->bytes_reserved -= num_bytes; 6565 6566 if (delalloc) 6567 cache->delalloc_bytes -= num_bytes; 6568 spin_unlock(&cache->lock); 6569 spin_unlock(&space_info->lock); 6570 return ret; 6571 } 6572 void btrfs_prepare_extent_commit(struct btrfs_fs_info *fs_info) 6573 { 6574 struct btrfs_caching_control *next; 6575 struct btrfs_caching_control *caching_ctl; 6576 struct btrfs_block_group_cache *cache; 6577 6578 down_write(&fs_info->commit_root_sem); 6579 6580 list_for_each_entry_safe(caching_ctl, next, 6581 &fs_info->caching_block_groups, list) { 6582 cache = caching_ctl->block_group; 6583 if (block_group_cache_done(cache)) { 6584 cache->last_byte_to_unpin = (u64)-1; 6585 list_del_init(&caching_ctl->list); 6586 put_caching_control(caching_ctl); 6587 } else { 6588 cache->last_byte_to_unpin = caching_ctl->progress; 6589 } 6590 } 6591 6592 if (fs_info->pinned_extents == &fs_info->freed_extents[0]) 6593 fs_info->pinned_extents = &fs_info->freed_extents[1]; 6594 else 6595 fs_info->pinned_extents = &fs_info->freed_extents[0]; 6596 6597 up_write(&fs_info->commit_root_sem); 6598 6599 update_global_block_rsv(fs_info); 6600 } 6601 6602 /* 6603 * Returns the free cluster for the given space info and sets empty_cluster to 6604 * what it should be based on the mount options. 6605 */ 6606 static struct btrfs_free_cluster * 6607 fetch_cluster_info(struct btrfs_fs_info *fs_info, 6608 struct btrfs_space_info *space_info, u64 *empty_cluster) 6609 { 6610 struct btrfs_free_cluster *ret = NULL; 6611 bool ssd = btrfs_test_opt(fs_info, SSD); 6612 6613 *empty_cluster = 0; 6614 if (btrfs_mixed_space_info(space_info)) 6615 return ret; 6616 6617 if (ssd) 6618 *empty_cluster = SZ_2M; 6619 if (space_info->flags & BTRFS_BLOCK_GROUP_METADATA) { 6620 ret = &fs_info->meta_alloc_cluster; 6621 if (!ssd) 6622 *empty_cluster = SZ_64K; 6623 } else if ((space_info->flags & BTRFS_BLOCK_GROUP_DATA) && ssd) { 6624 ret = &fs_info->data_alloc_cluster; 6625 } 6626 6627 return ret; 6628 } 6629 6630 static int unpin_extent_range(struct btrfs_fs_info *fs_info, 6631 u64 start, u64 end, 6632 const bool return_free_space) 6633 { 6634 struct btrfs_block_group_cache *cache = NULL; 6635 struct btrfs_space_info *space_info; 6636 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; 6637 struct btrfs_free_cluster *cluster = NULL; 6638 u64 len; 6639 u64 total_unpinned = 0; 6640 u64 empty_cluster = 0; 6641 bool readonly; 6642 6643 while (start <= end) { 6644 readonly = false; 6645 if (!cache || 6646 start >= cache->key.objectid + cache->key.offset) { 6647 if (cache) 6648 btrfs_put_block_group(cache); 6649 total_unpinned = 0; 6650 cache = btrfs_lookup_block_group(fs_info, start); 6651 BUG_ON(!cache); /* Logic error */ 6652 6653 cluster = fetch_cluster_info(fs_info, 6654 cache->space_info, 6655 &empty_cluster); 6656 empty_cluster <<= 1; 6657 } 6658 6659 len = cache->key.objectid + cache->key.offset - start; 6660 len = min(len, end + 1 - start); 6661 6662 if (start < cache->last_byte_to_unpin) { 6663 len = min(len, cache->last_byte_to_unpin - start); 6664 if (return_free_space) 6665 btrfs_add_free_space(cache, start, len); 6666 } 6667 6668 start += 
len; 6669 total_unpinned += len; 6670 space_info = cache->space_info; 6671 6672 /* 6673 * If this space cluster has been marked as fragmented and we've 6674 * unpinned enough in this block group to potentially allow a 6675 * cluster to be created inside of it go ahead and clear the 6676 * fragmented check. 6677 */ 6678 if (cluster && cluster->fragmented && 6679 total_unpinned > empty_cluster) { 6680 spin_lock(&cluster->lock); 6681 cluster->fragmented = 0; 6682 spin_unlock(&cluster->lock); 6683 } 6684 6685 spin_lock(&space_info->lock); 6686 spin_lock(&cache->lock); 6687 cache->pinned -= len; 6688 space_info->bytes_pinned -= len; 6689 6690 trace_btrfs_space_reservation(fs_info, "pinned", 6691 space_info->flags, len, 0); 6692 space_info->max_extent_size = 0; 6693 percpu_counter_add(&space_info->total_bytes_pinned, -len); 6694 if (cache->ro) { 6695 space_info->bytes_readonly += len; 6696 readonly = true; 6697 } 6698 spin_unlock(&cache->lock); 6699 if (!readonly && return_free_space && 6700 global_rsv->space_info == space_info) { 6701 u64 to_add = len; 6702 WARN_ON(!return_free_space); 6703 spin_lock(&global_rsv->lock); 6704 if (!global_rsv->full) { 6705 to_add = min(len, global_rsv->size - 6706 global_rsv->reserved); 6707 global_rsv->reserved += to_add; 6708 space_info->bytes_may_use += to_add; 6709 if (global_rsv->reserved >= global_rsv->size) 6710 global_rsv->full = 1; 6711 trace_btrfs_space_reservation(fs_info, 6712 "space_info", 6713 space_info->flags, 6714 to_add, 1); 6715 len -= to_add; 6716 } 6717 spin_unlock(&global_rsv->lock); 6718 /* Add to any tickets we may have */ 6719 if (len) 6720 space_info_add_new_bytes(fs_info, space_info, 6721 len); 6722 } 6723 spin_unlock(&space_info->lock); 6724 } 6725 6726 if (cache) 6727 btrfs_put_block_group(cache); 6728 return 0; 6729 } 6730 6731 int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, 6732 struct btrfs_fs_info *fs_info) 6733 { 6734 struct btrfs_block_group_cache *block_group, *tmp; 6735 struct list_head *deleted_bgs; 6736 struct extent_io_tree *unpin; 6737 u64 start; 6738 u64 end; 6739 int ret; 6740 6741 if (fs_info->pinned_extents == &fs_info->freed_extents[0]) 6742 unpin = &fs_info->freed_extents[1]; 6743 else 6744 unpin = &fs_info->freed_extents[0]; 6745 6746 while (!trans->aborted) { 6747 mutex_lock(&fs_info->unused_bg_unpin_mutex); 6748 ret = find_first_extent_bit(unpin, 0, &start, &end, 6749 EXTENT_DIRTY, NULL); 6750 if (ret) { 6751 mutex_unlock(&fs_info->unused_bg_unpin_mutex); 6752 break; 6753 } 6754 6755 if (btrfs_test_opt(fs_info, DISCARD)) 6756 ret = btrfs_discard_extent(fs_info, start, 6757 end + 1 - start, NULL); 6758 6759 clear_extent_dirty(unpin, start, end); 6760 unpin_extent_range(fs_info, start, end, true); 6761 mutex_unlock(&fs_info->unused_bg_unpin_mutex); 6762 cond_resched(); 6763 } 6764 6765 /* 6766 * Transaction is finished. We don't need the lock anymore. We 6767 * do need to clean up the block groups in case of a transaction 6768 * abort. 
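	 * If the transaction was aborted we skip the discard below (ret stays
	 * -EROFS, which only triggers the warning), but the deleted block
	 * groups are still unlinked and their trimming/group references
	 * dropped.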
6769 */ 6770 deleted_bgs = &trans->transaction->deleted_bgs; 6771 list_for_each_entry_safe(block_group, tmp, deleted_bgs, bg_list) { 6772 u64 trimmed = 0; 6773 6774 ret = -EROFS; 6775 if (!trans->aborted) 6776 ret = btrfs_discard_extent(fs_info, 6777 block_group->key.objectid, 6778 block_group->key.offset, 6779 &trimmed); 6780 6781 list_del_init(&block_group->bg_list); 6782 btrfs_put_block_group_trimming(block_group); 6783 btrfs_put_block_group(block_group); 6784 6785 if (ret) { 6786 const char *errstr = btrfs_decode_error(ret); 6787 btrfs_warn(fs_info, 6788 "Discard failed while removing blockgroup: errno=%d %s\n", 6789 ret, errstr); 6790 } 6791 } 6792 6793 return 0; 6794 } 6795 6796 static void add_pinned_bytes(struct btrfs_fs_info *fs_info, u64 num_bytes, 6797 u64 owner, u64 root_objectid) 6798 { 6799 struct btrfs_space_info *space_info; 6800 u64 flags; 6801 6802 if (owner < BTRFS_FIRST_FREE_OBJECTID) { 6803 if (root_objectid == BTRFS_CHUNK_TREE_OBJECTID) 6804 flags = BTRFS_BLOCK_GROUP_SYSTEM; 6805 else 6806 flags = BTRFS_BLOCK_GROUP_METADATA; 6807 } else { 6808 flags = BTRFS_BLOCK_GROUP_DATA; 6809 } 6810 6811 space_info = __find_space_info(fs_info, flags); 6812 BUG_ON(!space_info); /* Logic bug */ 6813 percpu_counter_add(&space_info->total_bytes_pinned, num_bytes); 6814 } 6815 6816 6817 static int __btrfs_free_extent(struct btrfs_trans_handle *trans, 6818 struct btrfs_fs_info *info, 6819 struct btrfs_delayed_ref_node *node, u64 parent, 6820 u64 root_objectid, u64 owner_objectid, 6821 u64 owner_offset, int refs_to_drop, 6822 struct btrfs_delayed_extent_op *extent_op) 6823 { 6824 struct btrfs_key key; 6825 struct btrfs_path *path; 6826 struct btrfs_root *extent_root = info->extent_root; 6827 struct extent_buffer *leaf; 6828 struct btrfs_extent_item *ei; 6829 struct btrfs_extent_inline_ref *iref; 6830 int ret; 6831 int is_data; 6832 int extent_slot = 0; 6833 int found_extent = 0; 6834 int num_to_del = 1; 6835 u32 item_size; 6836 u64 refs; 6837 u64 bytenr = node->bytenr; 6838 u64 num_bytes = node->num_bytes; 6839 int last_ref = 0; 6840 bool skinny_metadata = btrfs_fs_incompat(info, SKINNY_METADATA); 6841 6842 path = btrfs_alloc_path(); 6843 if (!path) 6844 return -ENOMEM; 6845 6846 path->reada = READA_FORWARD; 6847 path->leave_spinning = 1; 6848 6849 is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID; 6850 BUG_ON(!is_data && refs_to_drop != 1); 6851 6852 if (is_data) 6853 skinny_metadata = 0; 6854 6855 ret = lookup_extent_backref(trans, info, path, &iref, 6856 bytenr, num_bytes, parent, 6857 root_objectid, owner_objectid, 6858 owner_offset); 6859 if (ret == 0) { 6860 extent_slot = path->slots[0]; 6861 while (extent_slot >= 0) { 6862 btrfs_item_key_to_cpu(path->nodes[0], &key, 6863 extent_slot); 6864 if (key.objectid != bytenr) 6865 break; 6866 if (key.type == BTRFS_EXTENT_ITEM_KEY && 6867 key.offset == num_bytes) { 6868 found_extent = 1; 6869 break; 6870 } 6871 if (key.type == BTRFS_METADATA_ITEM_KEY && 6872 key.offset == owner_objectid) { 6873 found_extent = 1; 6874 break; 6875 } 6876 if (path->slots[0] - extent_slot > 5) 6877 break; 6878 extent_slot--; 6879 } 6880 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 6881 item_size = btrfs_item_size_nr(path->nodes[0], extent_slot); 6882 if (found_extent && item_size < sizeof(*ei)) 6883 found_extent = 0; 6884 #endif 6885 if (!found_extent) { 6886 BUG_ON(iref); 6887 ret = remove_extent_backref(trans, info, path, NULL, 6888 refs_to_drop, 6889 is_data, &last_ref); 6890 if (ret) { 6891 btrfs_abort_transaction(trans, ret); 6892 goto out; 6893 } 6894 
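			/*
			 * The backref we removed lived in its own item;
			 * re-search for the extent item itself (EXTENT_ITEM,
			 * or the skinny METADATA_ITEM) so its reference count
			 * can be updated or the item deleted below.
			 */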
btrfs_release_path(path); 6895 path->leave_spinning = 1; 6896 6897 key.objectid = bytenr; 6898 key.type = BTRFS_EXTENT_ITEM_KEY; 6899 key.offset = num_bytes; 6900 6901 if (!is_data && skinny_metadata) { 6902 key.type = BTRFS_METADATA_ITEM_KEY; 6903 key.offset = owner_objectid; 6904 } 6905 6906 ret = btrfs_search_slot(trans, extent_root, 6907 &key, path, -1, 1); 6908 if (ret > 0 && skinny_metadata && path->slots[0]) { 6909 /* 6910 * Couldn't find our skinny metadata item, 6911 * see if we have ye olde extent item. 6912 */ 6913 path->slots[0]--; 6914 btrfs_item_key_to_cpu(path->nodes[0], &key, 6915 path->slots[0]); 6916 if (key.objectid == bytenr && 6917 key.type == BTRFS_EXTENT_ITEM_KEY && 6918 key.offset == num_bytes) 6919 ret = 0; 6920 } 6921 6922 if (ret > 0 && skinny_metadata) { 6923 skinny_metadata = false; 6924 key.objectid = bytenr; 6925 key.type = BTRFS_EXTENT_ITEM_KEY; 6926 key.offset = num_bytes; 6927 btrfs_release_path(path); 6928 ret = btrfs_search_slot(trans, extent_root, 6929 &key, path, -1, 1); 6930 } 6931 6932 if (ret) { 6933 btrfs_err(info, 6934 "umm, got %d back from search, was looking for %llu", 6935 ret, bytenr); 6936 if (ret > 0) 6937 btrfs_print_leaf(info, path->nodes[0]); 6938 } 6939 if (ret < 0) { 6940 btrfs_abort_transaction(trans, ret); 6941 goto out; 6942 } 6943 extent_slot = path->slots[0]; 6944 } 6945 } else if (WARN_ON(ret == -ENOENT)) { 6946 btrfs_print_leaf(info, path->nodes[0]); 6947 btrfs_err(info, 6948 "unable to find ref byte nr %llu parent %llu root %llu owner %llu offset %llu", 6949 bytenr, parent, root_objectid, owner_objectid, 6950 owner_offset); 6951 btrfs_abort_transaction(trans, ret); 6952 goto out; 6953 } else { 6954 btrfs_abort_transaction(trans, ret); 6955 goto out; 6956 } 6957 6958 leaf = path->nodes[0]; 6959 item_size = btrfs_item_size_nr(leaf, extent_slot); 6960 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 6961 if (item_size < sizeof(*ei)) { 6962 BUG_ON(found_extent || extent_slot != path->slots[0]); 6963 ret = convert_extent_item_v0(trans, info, path, owner_objectid, 6964 0); 6965 if (ret < 0) { 6966 btrfs_abort_transaction(trans, ret); 6967 goto out; 6968 } 6969 6970 btrfs_release_path(path); 6971 path->leave_spinning = 1; 6972 6973 key.objectid = bytenr; 6974 key.type = BTRFS_EXTENT_ITEM_KEY; 6975 key.offset = num_bytes; 6976 6977 ret = btrfs_search_slot(trans, extent_root, &key, path, 6978 -1, 1); 6979 if (ret) { 6980 btrfs_err(info, 6981 "umm, got %d back from search, was looking for %llu", 6982 ret, bytenr); 6983 btrfs_print_leaf(info, path->nodes[0]); 6984 } 6985 if (ret < 0) { 6986 btrfs_abort_transaction(trans, ret); 6987 goto out; 6988 } 6989 6990 extent_slot = path->slots[0]; 6991 leaf = path->nodes[0]; 6992 item_size = btrfs_item_size_nr(leaf, extent_slot); 6993 } 6994 #endif 6995 BUG_ON(item_size < sizeof(*ei)); 6996 ei = btrfs_item_ptr(leaf, extent_slot, 6997 struct btrfs_extent_item); 6998 if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID && 6999 key.type == BTRFS_EXTENT_ITEM_KEY) { 7000 struct btrfs_tree_block_info *bi; 7001 BUG_ON(item_size < sizeof(*ei) + sizeof(*bi)); 7002 bi = (struct btrfs_tree_block_info *)(ei + 1); 7003 WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi)); 7004 } 7005 7006 refs = btrfs_extent_refs(leaf, ei); 7007 if (refs < refs_to_drop) { 7008 btrfs_err(info, 7009 "trying to drop %d refs but we only have %Lu for bytenr %Lu", 7010 refs_to_drop, refs, bytenr); 7011 ret = -EINVAL; 7012 btrfs_abort_transaction(trans, ret); 7013 goto out; 7014 } 7015 refs -= refs_to_drop; 7016 7017 if (refs > 0) { 7018 if 
(extent_op) 7019 __run_delayed_extent_op(extent_op, leaf, ei); 7020 /* 7021 * In the case of inline back ref, reference count will 7022 * be updated by remove_extent_backref 7023 */ 7024 if (iref) { 7025 BUG_ON(!found_extent); 7026 } else { 7027 btrfs_set_extent_refs(leaf, ei, refs); 7028 btrfs_mark_buffer_dirty(leaf); 7029 } 7030 if (found_extent) { 7031 ret = remove_extent_backref(trans, info, path, 7032 iref, refs_to_drop, 7033 is_data, &last_ref); 7034 if (ret) { 7035 btrfs_abort_transaction(trans, ret); 7036 goto out; 7037 } 7038 } 7039 add_pinned_bytes(info, -num_bytes, owner_objectid, 7040 root_objectid); 7041 } else { 7042 if (found_extent) { 7043 BUG_ON(is_data && refs_to_drop != 7044 extent_data_ref_count(path, iref)); 7045 if (iref) { 7046 BUG_ON(path->slots[0] != extent_slot); 7047 } else { 7048 BUG_ON(path->slots[0] != extent_slot + 1); 7049 path->slots[0] = extent_slot; 7050 num_to_del = 2; 7051 } 7052 } 7053 7054 last_ref = 1; 7055 ret = btrfs_del_items(trans, extent_root, path, path->slots[0], 7056 num_to_del); 7057 if (ret) { 7058 btrfs_abort_transaction(trans, ret); 7059 goto out; 7060 } 7061 btrfs_release_path(path); 7062 7063 if (is_data) { 7064 ret = btrfs_del_csums(trans, info, bytenr, num_bytes); 7065 if (ret) { 7066 btrfs_abort_transaction(trans, ret); 7067 goto out; 7068 } 7069 } 7070 7071 ret = add_to_free_space_tree(trans, info, bytenr, num_bytes); 7072 if (ret) { 7073 btrfs_abort_transaction(trans, ret); 7074 goto out; 7075 } 7076 7077 ret = update_block_group(trans, info, bytenr, num_bytes, 0); 7078 if (ret) { 7079 btrfs_abort_transaction(trans, ret); 7080 goto out; 7081 } 7082 } 7083 btrfs_release_path(path); 7084 7085 out: 7086 btrfs_free_path(path); 7087 return ret; 7088 } 7089 7090 /* 7091 * when we free an block, it is possible (and likely) that we free the last 7092 * delayed ref for that extent as well. This searches the delayed ref tree for 7093 * a given extent, and if there are no other delayed refs to be processed, it 7094 * removes it from the tree. 7095 */ 7096 static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, 7097 u64 bytenr) 7098 { 7099 struct btrfs_delayed_ref_head *head; 7100 struct btrfs_delayed_ref_root *delayed_refs; 7101 int ret = 0; 7102 7103 delayed_refs = &trans->transaction->delayed_refs; 7104 spin_lock(&delayed_refs->lock); 7105 head = btrfs_find_delayed_ref_head(delayed_refs, bytenr); 7106 if (!head) 7107 goto out_delayed_unlock; 7108 7109 spin_lock(&head->lock); 7110 if (!list_empty(&head->ref_list)) 7111 goto out; 7112 7113 if (head->extent_op) { 7114 if (!head->must_insert_reserved) 7115 goto out; 7116 btrfs_free_delayed_extent_op(head->extent_op); 7117 head->extent_op = NULL; 7118 } 7119 7120 /* 7121 * waiting for the lock here would deadlock. If someone else has it 7122 * locked they are already in the process of dropping it anyway 7123 */ 7124 if (!mutex_trylock(&head->mutex)) 7125 goto out; 7126 7127 /* 7128 * at this point we have a head with no other entries. Go 7129 * ahead and process it. 7130 */ 7131 head->node.in_tree = 0; 7132 rb_erase(&head->href_node, &delayed_refs->href_root); 7133 7134 atomic_dec(&delayed_refs->num_entries); 7135 7136 /* 7137 * we don't take a ref on the node because we're removing it from the 7138 * tree, so we just steal the ref the tree was holding. 
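	 * That stolen ref is what the btrfs_put_delayed_ref() call at the
	 * end of this function drops.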
7139 */ 7140 delayed_refs->num_heads--; 7141 if (head->processing == 0) 7142 delayed_refs->num_heads_ready--; 7143 head->processing = 0; 7144 spin_unlock(&head->lock); 7145 spin_unlock(&delayed_refs->lock); 7146 7147 BUG_ON(head->extent_op); 7148 if (head->must_insert_reserved) 7149 ret = 1; 7150 7151 mutex_unlock(&head->mutex); 7152 btrfs_put_delayed_ref(&head->node); 7153 return ret; 7154 out: 7155 spin_unlock(&head->lock); 7156 7157 out_delayed_unlock: 7158 spin_unlock(&delayed_refs->lock); 7159 return 0; 7160 } 7161 7162 void btrfs_free_tree_block(struct btrfs_trans_handle *trans, 7163 struct btrfs_root *root, 7164 struct extent_buffer *buf, 7165 u64 parent, int last_ref) 7166 { 7167 struct btrfs_fs_info *fs_info = root->fs_info; 7168 int pin = 1; 7169 int ret; 7170 7171 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { 7172 ret = btrfs_add_delayed_tree_ref(fs_info, trans, 7173 buf->start, buf->len, 7174 parent, 7175 root->root_key.objectid, 7176 btrfs_header_level(buf), 7177 BTRFS_DROP_DELAYED_REF, NULL); 7178 BUG_ON(ret); /* -ENOMEM */ 7179 } 7180 7181 if (!last_ref) 7182 return; 7183 7184 if (btrfs_header_generation(buf) == trans->transid) { 7185 struct btrfs_block_group_cache *cache; 7186 7187 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { 7188 ret = check_ref_cleanup(trans, buf->start); 7189 if (!ret) 7190 goto out; 7191 } 7192 7193 cache = btrfs_lookup_block_group(fs_info, buf->start); 7194 7195 if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { 7196 pin_down_extent(fs_info, cache, buf->start, 7197 buf->len, 1); 7198 btrfs_put_block_group(cache); 7199 goto out; 7200 } 7201 7202 WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)); 7203 7204 btrfs_add_free_space(cache, buf->start, buf->len); 7205 btrfs_free_reserved_bytes(cache, buf->len, 0); 7206 btrfs_put_block_group(cache); 7207 trace_btrfs_reserved_extent_free(fs_info, buf->start, buf->len); 7208 pin = 0; 7209 } 7210 out: 7211 if (pin) 7212 add_pinned_bytes(fs_info, buf->len, btrfs_header_level(buf), 7213 root->root_key.objectid); 7214 7215 /* 7216 * Deleting the buffer, clear the corrupt flag since it doesn't matter 7217 * anymore. 7218 */ 7219 clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags); 7220 } 7221 7222 /* Can return -ENOMEM */ 7223 int btrfs_free_extent(struct btrfs_trans_handle *trans, 7224 struct btrfs_fs_info *fs_info, 7225 u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, 7226 u64 owner, u64 offset) 7227 { 7228 int ret; 7229 7230 if (btrfs_is_testing(fs_info)) 7231 return 0; 7232 7233 add_pinned_bytes(fs_info, num_bytes, owner, root_objectid); 7234 7235 /* 7236 * tree log blocks never actually go into the extent allocation 7237 * tree, just update pinning info and exit early. 7238 */ 7239 if (root_objectid == BTRFS_TREE_LOG_OBJECTID) { 7240 WARN_ON(owner >= BTRFS_FIRST_FREE_OBJECTID); 7241 /* unlocks the pinned mutex */ 7242 btrfs_pin_extent(fs_info, bytenr, num_bytes, 1); 7243 ret = 0; 7244 } else if (owner < BTRFS_FIRST_FREE_OBJECTID) { 7245 ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr, 7246 num_bytes, 7247 parent, root_objectid, (int)owner, 7248 BTRFS_DROP_DELAYED_REF, NULL); 7249 } else { 7250 ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr, 7251 num_bytes, 7252 parent, root_objectid, owner, 7253 offset, 0, 7254 BTRFS_DROP_DELAYED_REF); 7255 } 7256 return ret; 7257 } 7258 7259 /* 7260 * when we wait for progress in the block group caching, its because 7261 * our allocation attempt failed at least once. 
So, we must sleep 7262 * and let some progress happen before we try again. 7263 * 7264 * This function will sleep at least once waiting for new free space to 7265 * show up, and then it will check the block group free space numbers 7266 * for our min num_bytes. Another option is to have it go ahead 7267 * and look in the rbtree for a free extent of a given size, but this 7268 * is a good start. 7269 * 7270 * Callers of this must check if cache->cached == BTRFS_CACHE_ERROR before using 7271 * any of the information in this block group. 7272 */ 7273 static noinline void 7274 wait_block_group_cache_progress(struct btrfs_block_group_cache *cache, 7275 u64 num_bytes) 7276 { 7277 struct btrfs_caching_control *caching_ctl; 7278 7279 caching_ctl = get_caching_control(cache); 7280 if (!caching_ctl) 7281 return; 7282 7283 wait_event(caching_ctl->wait, block_group_cache_done(cache) || 7284 (cache->free_space_ctl->free_space >= num_bytes)); 7285 7286 put_caching_control(caching_ctl); 7287 } 7288 7289 static noinline int 7290 wait_block_group_cache_done(struct btrfs_block_group_cache *cache) 7291 { 7292 struct btrfs_caching_control *caching_ctl; 7293 int ret = 0; 7294 7295 caching_ctl = get_caching_control(cache); 7296 if (!caching_ctl) 7297 return (cache->cached == BTRFS_CACHE_ERROR) ? -EIO : 0; 7298 7299 wait_event(caching_ctl->wait, block_group_cache_done(cache)); 7300 if (cache->cached == BTRFS_CACHE_ERROR) 7301 ret = -EIO; 7302 put_caching_control(caching_ctl); 7303 return ret; 7304 } 7305 7306 int __get_raid_index(u64 flags) 7307 { 7308 if (flags & BTRFS_BLOCK_GROUP_RAID10) 7309 return BTRFS_RAID_RAID10; 7310 else if (flags & BTRFS_BLOCK_GROUP_RAID1) 7311 return BTRFS_RAID_RAID1; 7312 else if (flags & BTRFS_BLOCK_GROUP_DUP) 7313 return BTRFS_RAID_DUP; 7314 else if (flags & BTRFS_BLOCK_GROUP_RAID0) 7315 return BTRFS_RAID_RAID0; 7316 else if (flags & BTRFS_BLOCK_GROUP_RAID5) 7317 return BTRFS_RAID_RAID5; 7318 else if (flags & BTRFS_BLOCK_GROUP_RAID6) 7319 return BTRFS_RAID_RAID6; 7320 7321 return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */ 7322 } 7323 7324 int get_block_group_index(struct btrfs_block_group_cache *cache) 7325 { 7326 return __get_raid_index(cache->flags); 7327 } 7328 7329 static const char *btrfs_raid_type_names[BTRFS_NR_RAID_TYPES] = { 7330 [BTRFS_RAID_RAID10] = "raid10", 7331 [BTRFS_RAID_RAID1] = "raid1", 7332 [BTRFS_RAID_DUP] = "dup", 7333 [BTRFS_RAID_RAID0] = "raid0", 7334 [BTRFS_RAID_SINGLE] = "single", 7335 [BTRFS_RAID_RAID5] = "raid5", 7336 [BTRFS_RAID_RAID6] = "raid6", 7337 }; 7338 7339 static const char *get_raid_name(enum btrfs_raid_types type) 7340 { 7341 if (type >= BTRFS_NR_RAID_TYPES) 7342 return NULL; 7343 7344 return btrfs_raid_type_names[type]; 7345 } 7346 7347 enum btrfs_loop_type { 7348 LOOP_CACHING_NOWAIT = 0, 7349 LOOP_CACHING_WAIT = 1, 7350 LOOP_ALLOC_CHUNK = 2, 7351 LOOP_NO_EMPTY_SIZE = 3, 7352 }; 7353 7354 static inline void 7355 btrfs_lock_block_group(struct btrfs_block_group_cache *cache, 7356 int delalloc) 7357 { 7358 if (delalloc) 7359 down_read(&cache->data_rwsem); 7360 } 7361 7362 static inline void 7363 btrfs_grab_block_group(struct btrfs_block_group_cache *cache, 7364 int delalloc) 7365 { 7366 btrfs_get_block_group(cache); 7367 if (delalloc) 7368 down_read(&cache->data_rwsem); 7369 } 7370 7371 static struct btrfs_block_group_cache * 7372 btrfs_lock_cluster(struct btrfs_block_group_cache *block_group, 7373 struct btrfs_free_cluster *cluster, 7374 int delalloc) 7375 { 7376 struct btrfs_block_group_cache *used_bg = NULL; 7377 7378 
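	/*
	 * Find the block group currently backing @cluster; refill_lock is
	 * held when we return (NULL means the cluster is empty).  For
	 * delalloc allocations we also need the group's data_rwsem: if the
	 * trylock fails, drop refill_lock, block on the semaphore, retake
	 * refill_lock and recheck that the cluster still points at the same
	 * group, looping until the pair is stable.
	 */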
spin_lock(&cluster->refill_lock); 7379 while (1) { 7380 used_bg = cluster->block_group; 7381 if (!used_bg) 7382 return NULL; 7383 7384 if (used_bg == block_group) 7385 return used_bg; 7386 7387 btrfs_get_block_group(used_bg); 7388 7389 if (!delalloc) 7390 return used_bg; 7391 7392 if (down_read_trylock(&used_bg->data_rwsem)) 7393 return used_bg; 7394 7395 spin_unlock(&cluster->refill_lock); 7396 7397 /* We should only have one-level nested. */ 7398 down_read_nested(&used_bg->data_rwsem, SINGLE_DEPTH_NESTING); 7399 7400 spin_lock(&cluster->refill_lock); 7401 if (used_bg == cluster->block_group) 7402 return used_bg; 7403 7404 up_read(&used_bg->data_rwsem); 7405 btrfs_put_block_group(used_bg); 7406 } 7407 } 7408 7409 static inline void 7410 btrfs_release_block_group(struct btrfs_block_group_cache *cache, 7411 int delalloc) 7412 { 7413 if (delalloc) 7414 up_read(&cache->data_rwsem); 7415 btrfs_put_block_group(cache); 7416 } 7417 7418 /* 7419 * walks the btree of allocated extents and find a hole of a given size. 7420 * The key ins is changed to record the hole: 7421 * ins->objectid == start position 7422 * ins->flags = BTRFS_EXTENT_ITEM_KEY 7423 * ins->offset == the size of the hole. 7424 * Any available blocks before search_start are skipped. 7425 * 7426 * If there is no suitable free space, we will record the max size of 7427 * the free space extent currently. 7428 */ 7429 static noinline int find_free_extent(struct btrfs_fs_info *fs_info, 7430 u64 ram_bytes, u64 num_bytes, u64 empty_size, 7431 u64 hint_byte, struct btrfs_key *ins, 7432 u64 flags, int delalloc) 7433 { 7434 int ret = 0; 7435 struct btrfs_root *root = fs_info->extent_root; 7436 struct btrfs_free_cluster *last_ptr = NULL; 7437 struct btrfs_block_group_cache *block_group = NULL; 7438 u64 search_start = 0; 7439 u64 max_extent_size = 0; 7440 u64 empty_cluster = 0; 7441 struct btrfs_space_info *space_info; 7442 int loop = 0; 7443 int index = __get_raid_index(flags); 7444 bool failed_cluster_refill = false; 7445 bool failed_alloc = false; 7446 bool use_cluster = true; 7447 bool have_caching_bg = false; 7448 bool orig_have_caching_bg = false; 7449 bool full_search = false; 7450 7451 WARN_ON(num_bytes < fs_info->sectorsize); 7452 ins->type = BTRFS_EXTENT_ITEM_KEY; 7453 ins->objectid = 0; 7454 ins->offset = 0; 7455 7456 trace_find_free_extent(fs_info, num_bytes, empty_size, flags); 7457 7458 space_info = __find_space_info(fs_info, flags); 7459 if (!space_info) { 7460 btrfs_err(fs_info, "No space info for %llu", flags); 7461 return -ENOSPC; 7462 } 7463 7464 /* 7465 * If our free space is heavily fragmented we may not be able to make 7466 * big contiguous allocations, so instead of doing the expensive search 7467 * for free space, simply return ENOSPC with our max_extent_size so we 7468 * can go ahead and search for a more manageable chunk. 7469 * 7470 * If our max_extent_size is large enough for our allocation simply 7471 * disable clustering since we will likely not be able to find enough 7472 * space to create a cluster and induce latency trying. 
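 * For example, if a previous search could only find a 1MiB contiguous
 * chunk and recorded that in max_extent_size, a later 4MiB request fails
 * here immediately (with ins->offset set to that 1MiB) instead of walking
 * every block group again.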
7473 */ 7474 if (unlikely(space_info->max_extent_size)) { 7475 spin_lock(&space_info->lock); 7476 if (space_info->max_extent_size && 7477 num_bytes > space_info->max_extent_size) { 7478 ins->offset = space_info->max_extent_size; 7479 spin_unlock(&space_info->lock); 7480 return -ENOSPC; 7481 } else if (space_info->max_extent_size) { 7482 use_cluster = false; 7483 } 7484 spin_unlock(&space_info->lock); 7485 } 7486 7487 last_ptr = fetch_cluster_info(fs_info, space_info, &empty_cluster); 7488 if (last_ptr) { 7489 spin_lock(&last_ptr->lock); 7490 if (last_ptr->block_group) 7491 hint_byte = last_ptr->window_start; 7492 if (last_ptr->fragmented) { 7493 /* 7494 * We still set window_start so we can keep track of the 7495 * last place we found an allocation to try and save 7496 * some time. 7497 */ 7498 hint_byte = last_ptr->window_start; 7499 use_cluster = false; 7500 } 7501 spin_unlock(&last_ptr->lock); 7502 } 7503 7504 search_start = max(search_start, first_logical_byte(fs_info, 0)); 7505 search_start = max(search_start, hint_byte); 7506 if (search_start == hint_byte) { 7507 block_group = btrfs_lookup_block_group(fs_info, search_start); 7508 /* 7509 * we don't want to use the block group if it doesn't match our 7510 * allocation bits, or if its not cached. 7511 * 7512 * However if we are re-searching with an ideal block group 7513 * picked out then we don't care that the block group is cached. 7514 */ 7515 if (block_group && block_group_bits(block_group, flags) && 7516 block_group->cached != BTRFS_CACHE_NO) { 7517 down_read(&space_info->groups_sem); 7518 if (list_empty(&block_group->list) || 7519 block_group->ro) { 7520 /* 7521 * someone is removing this block group, 7522 * we can't jump into the have_block_group 7523 * target because our list pointers are not 7524 * valid 7525 */ 7526 btrfs_put_block_group(block_group); 7527 up_read(&space_info->groups_sem); 7528 } else { 7529 index = get_block_group_index(block_group); 7530 btrfs_lock_block_group(block_group, delalloc); 7531 goto have_block_group; 7532 } 7533 } else if (block_group) { 7534 btrfs_put_block_group(block_group); 7535 } 7536 } 7537 search: 7538 have_caching_bg = false; 7539 if (index == 0 || index == __get_raid_index(flags)) 7540 full_search = true; 7541 down_read(&space_info->groups_sem); 7542 list_for_each_entry(block_group, &space_info->block_groups[index], 7543 list) { 7544 u64 offset; 7545 int cached; 7546 7547 btrfs_grab_block_group(block_group, delalloc); 7548 search_start = block_group->key.objectid; 7549 7550 /* 7551 * this can happen if we end up cycling through all the 7552 * raid types, but we want to make sure we only allocate 7553 * for the proper type. 7554 */ 7555 if (!block_group_bits(block_group, flags)) { 7556 u64 extra = BTRFS_BLOCK_GROUP_DUP | 7557 BTRFS_BLOCK_GROUP_RAID1 | 7558 BTRFS_BLOCK_GROUP_RAID5 | 7559 BTRFS_BLOCK_GROUP_RAID6 | 7560 BTRFS_BLOCK_GROUP_RAID10; 7561 7562 /* 7563 * if they asked for extra copies and this block group 7564 * doesn't provide them, bail. This does allow us to 7565 * fill raid0 from raid1. 
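		 * E.g. a request for RAID1 data will skip a raid0 or single
		 * block group here, while a single/raid0 request may still be
		 * satisfied from a raid1 block group.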
7566 */ 7567 if ((flags & extra) && !(block_group->flags & extra)) 7568 goto loop; 7569 } 7570 7571 have_block_group: 7572 cached = block_group_cache_done(block_group); 7573 if (unlikely(!cached)) { 7574 have_caching_bg = true; 7575 ret = cache_block_group(block_group, 0); 7576 BUG_ON(ret < 0); 7577 ret = 0; 7578 } 7579 7580 if (unlikely(block_group->cached == BTRFS_CACHE_ERROR)) 7581 goto loop; 7582 if (unlikely(block_group->ro)) 7583 goto loop; 7584 7585 /* 7586 * Ok we want to try and use the cluster allocator, so 7587 * lets look there 7588 */ 7589 if (last_ptr && use_cluster) { 7590 struct btrfs_block_group_cache *used_block_group; 7591 unsigned long aligned_cluster; 7592 /* 7593 * the refill lock keeps out other 7594 * people trying to start a new cluster 7595 */ 7596 used_block_group = btrfs_lock_cluster(block_group, 7597 last_ptr, 7598 delalloc); 7599 if (!used_block_group) 7600 goto refill_cluster; 7601 7602 if (used_block_group != block_group && 7603 (used_block_group->ro || 7604 !block_group_bits(used_block_group, flags))) 7605 goto release_cluster; 7606 7607 offset = btrfs_alloc_from_cluster(used_block_group, 7608 last_ptr, 7609 num_bytes, 7610 used_block_group->key.objectid, 7611 &max_extent_size); 7612 if (offset) { 7613 /* we have a block, we're done */ 7614 spin_unlock(&last_ptr->refill_lock); 7615 trace_btrfs_reserve_extent_cluster(fs_info, 7616 used_block_group, 7617 search_start, num_bytes); 7618 if (used_block_group != block_group) { 7619 btrfs_release_block_group(block_group, 7620 delalloc); 7621 block_group = used_block_group; 7622 } 7623 goto checks; 7624 } 7625 7626 WARN_ON(last_ptr->block_group != used_block_group); 7627 release_cluster: 7628 /* If we are on LOOP_NO_EMPTY_SIZE, we can't 7629 * set up a new clusters, so lets just skip it 7630 * and let the allocator find whatever block 7631 * it can find. If we reach this point, we 7632 * will have tried the cluster allocator 7633 * plenty of times and not have found 7634 * anything, so we are likely way too 7635 * fragmented for the clustering stuff to find 7636 * anything. 7637 * 7638 * However, if the cluster is taken from the 7639 * current block group, release the cluster 7640 * first, so that we stand a better chance of 7641 * succeeding in the unclustered 7642 * allocation. 
*/ 7643 if (loop >= LOOP_NO_EMPTY_SIZE && 7644 used_block_group != block_group) { 7645 spin_unlock(&last_ptr->refill_lock); 7646 btrfs_release_block_group(used_block_group, 7647 delalloc); 7648 goto unclustered_alloc; 7649 } 7650 7651 /* 7652 * this cluster didn't work out, free it and 7653 * start over 7654 */ 7655 btrfs_return_cluster_to_free_space(NULL, last_ptr); 7656 7657 if (used_block_group != block_group) 7658 btrfs_release_block_group(used_block_group, 7659 delalloc); 7660 refill_cluster: 7661 if (loop >= LOOP_NO_EMPTY_SIZE) { 7662 spin_unlock(&last_ptr->refill_lock); 7663 goto unclustered_alloc; 7664 } 7665 7666 aligned_cluster = max_t(unsigned long, 7667 empty_cluster + empty_size, 7668 block_group->full_stripe_len); 7669 7670 /* allocate a cluster in this block group */ 7671 ret = btrfs_find_space_cluster(fs_info, block_group, 7672 last_ptr, search_start, 7673 num_bytes, 7674 aligned_cluster); 7675 if (ret == 0) { 7676 /* 7677 * now pull our allocation out of this 7678 * cluster 7679 */ 7680 offset = btrfs_alloc_from_cluster(block_group, 7681 last_ptr, 7682 num_bytes, 7683 search_start, 7684 &max_extent_size); 7685 if (offset) { 7686 /* we found one, proceed */ 7687 spin_unlock(&last_ptr->refill_lock); 7688 trace_btrfs_reserve_extent_cluster(fs_info, 7689 block_group, search_start, 7690 num_bytes); 7691 goto checks; 7692 } 7693 } else if (!cached && loop > LOOP_CACHING_NOWAIT 7694 && !failed_cluster_refill) { 7695 spin_unlock(&last_ptr->refill_lock); 7696 7697 failed_cluster_refill = true; 7698 wait_block_group_cache_progress(block_group, 7699 num_bytes + empty_cluster + empty_size); 7700 goto have_block_group; 7701 } 7702 7703 /* 7704 * at this point we either didn't find a cluster 7705 * or we weren't able to allocate a block from our 7706 * cluster. Free the cluster we've been trying 7707 * to use, and go to the next block group 7708 */ 7709 btrfs_return_cluster_to_free_space(NULL, last_ptr); 7710 spin_unlock(&last_ptr->refill_lock); 7711 goto loop; 7712 } 7713 7714 unclustered_alloc: 7715 /* 7716 * We are doing an unclustered alloc, set the fragmented flag so 7717 * we don't bother trying to setup a cluster again until we get 7718 * more space. 7719 */ 7720 if (unlikely(last_ptr)) { 7721 spin_lock(&last_ptr->lock); 7722 last_ptr->fragmented = 1; 7723 spin_unlock(&last_ptr->lock); 7724 } 7725 if (cached) { 7726 struct btrfs_free_space_ctl *ctl = 7727 block_group->free_space_ctl; 7728 7729 spin_lock(&ctl->tree_lock); 7730 if (ctl->free_space < 7731 num_bytes + empty_cluster + empty_size) { 7732 if (ctl->free_space > max_extent_size) 7733 max_extent_size = ctl->free_space; 7734 spin_unlock(&ctl->tree_lock); 7735 goto loop; 7736 } 7737 spin_unlock(&ctl->tree_lock); 7738 } 7739 7740 offset = btrfs_find_space_for_alloc(block_group, search_start, 7741 num_bytes, empty_size, 7742 &max_extent_size); 7743 /* 7744 * If we didn't find a chunk, and we haven't failed on this 7745 * block group before, and this block group is in the middle of 7746 * caching and we are ok with waiting, then go ahead and wait 7747 * for progress to be made, and set failed_alloc to true. 7748 * 7749 * If failed_alloc is true then we've already waited on this 7750 * block group once and should move on to the next block group. 
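 * (failed_alloc is cleared again at the 'loop' label below, so each block group is waited on at most once per search pass.)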
7751 */ 7752 if (!offset && !failed_alloc && !cached && 7753 loop > LOOP_CACHING_NOWAIT) { 7754 wait_block_group_cache_progress(block_group, 7755 num_bytes + empty_size); 7756 failed_alloc = true; 7757 goto have_block_group; 7758 } else if (!offset) { 7759 goto loop; 7760 } 7761 checks: 7762 search_start = ALIGN(offset, fs_info->stripesize); 7763 7764 /* move on to the next group */ 7765 if (search_start + num_bytes > 7766 block_group->key.objectid + block_group->key.offset) { 7767 btrfs_add_free_space(block_group, offset, num_bytes); 7768 goto loop; 7769 } 7770 7771 if (offset < search_start) 7772 btrfs_add_free_space(block_group, offset, 7773 search_start - offset); 7774 BUG_ON(offset > search_start); 7775 7776 ret = btrfs_add_reserved_bytes(block_group, ram_bytes, 7777 num_bytes, delalloc); 7778 if (ret == -EAGAIN) { 7779 btrfs_add_free_space(block_group, offset, num_bytes); 7780 goto loop; 7781 } 7782 btrfs_inc_block_group_reservations(block_group); 7783 7784 /* we are all good, lets return */ 7785 ins->objectid = search_start; 7786 ins->offset = num_bytes; 7787 7788 trace_btrfs_reserve_extent(fs_info, block_group, 7789 search_start, num_bytes); 7790 btrfs_release_block_group(block_group, delalloc); 7791 break; 7792 loop: 7793 failed_cluster_refill = false; 7794 failed_alloc = false; 7795 BUG_ON(index != get_block_group_index(block_group)); 7796 btrfs_release_block_group(block_group, delalloc); 7797 } 7798 up_read(&space_info->groups_sem); 7799 7800 if ((loop == LOOP_CACHING_NOWAIT) && have_caching_bg 7801 && !orig_have_caching_bg) 7802 orig_have_caching_bg = true; 7803 7804 if (!ins->objectid && loop >= LOOP_CACHING_WAIT && have_caching_bg) 7805 goto search; 7806 7807 if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES) 7808 goto search; 7809 7810 /* 7811 * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking 7812 * caching kthreads as we move along 7813 * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching 7814 * LOOP_ALLOC_CHUNK, force a chunk allocation and try again 7815 * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try 7816 * again 7817 */ 7818 if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE) { 7819 index = 0; 7820 if (loop == LOOP_CACHING_NOWAIT) { 7821 /* 7822 * We want to skip the LOOP_CACHING_WAIT step if we 7823 * don't have any uncached bgs and we've already done a 7824 * full search through. 7825 */ 7826 if (orig_have_caching_bg || !full_search) 7827 loop = LOOP_CACHING_WAIT; 7828 else 7829 loop = LOOP_ALLOC_CHUNK; 7830 } else { 7831 loop++; 7832 } 7833 7834 if (loop == LOOP_ALLOC_CHUNK) { 7835 struct btrfs_trans_handle *trans; 7836 int exist = 0; 7837 7838 trans = current->journal_info; 7839 if (trans) 7840 exist = 1; 7841 else 7842 trans = btrfs_join_transaction(root); 7843 7844 if (IS_ERR(trans)) { 7845 ret = PTR_ERR(trans); 7846 goto out; 7847 } 7848 7849 ret = do_chunk_alloc(trans, fs_info, flags, 7850 CHUNK_ALLOC_FORCE); 7851 7852 /* 7853 * If we can't allocate a new chunk we've already looped 7854 * through at least once, move on to the NO_EMPTY_SIZE 7855 * case. 7856 */ 7857 if (ret == -ENOSPC) 7858 loop = LOOP_NO_EMPTY_SIZE; 7859 7860 /* 7861 * Do not bail out on ENOSPC since we 7862 * can do more things. 
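 * (-ENOSPC here only advances the allocator to LOOP_NO_EMPTY_SIZE above; any other error aborts the transaction below.)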
7863 */ 7864 if (ret < 0 && ret != -ENOSPC) 7865 btrfs_abort_transaction(trans, ret); 7866 else 7867 ret = 0; 7868 if (!exist) 7869 btrfs_end_transaction(trans); 7870 if (ret) 7871 goto out; 7872 } 7873 7874 if (loop == LOOP_NO_EMPTY_SIZE) { 7875 /* 7876 * Don't loop again if we already have no empty_size and 7877 * no empty_cluster. 7878 */ 7879 if (empty_size == 0 && 7880 empty_cluster == 0) { 7881 ret = -ENOSPC; 7882 goto out; 7883 } 7884 empty_size = 0; 7885 empty_cluster = 0; 7886 } 7887 7888 goto search; 7889 } else if (!ins->objectid) { 7890 ret = -ENOSPC; 7891 } else if (ins->objectid) { 7892 if (!use_cluster && last_ptr) { 7893 spin_lock(&last_ptr->lock); 7894 last_ptr->window_start = ins->objectid; 7895 spin_unlock(&last_ptr->lock); 7896 } 7897 ret = 0; 7898 } 7899 out: 7900 if (ret == -ENOSPC) { 7901 spin_lock(&space_info->lock); 7902 space_info->max_extent_size = max_extent_size; 7903 spin_unlock(&space_info->lock); 7904 ins->offset = max_extent_size; 7905 } 7906 return ret; 7907 } 7908 7909 static void dump_space_info(struct btrfs_fs_info *fs_info, 7910 struct btrfs_space_info *info, u64 bytes, 7911 int dump_block_groups) 7912 { 7913 struct btrfs_block_group_cache *cache; 7914 int index = 0; 7915 7916 spin_lock(&info->lock); 7917 btrfs_info(fs_info, "space_info %llu has %llu free, is %sfull", 7918 info->flags, 7919 info->total_bytes - btrfs_space_info_used(info, true), 7920 info->full ? "" : "not "); 7921 btrfs_info(fs_info, 7922 "space_info total=%llu, used=%llu, pinned=%llu, reserved=%llu, may_use=%llu, readonly=%llu", 7923 info->total_bytes, info->bytes_used, info->bytes_pinned, 7924 info->bytes_reserved, info->bytes_may_use, 7925 info->bytes_readonly); 7926 spin_unlock(&info->lock); 7927 7928 if (!dump_block_groups) 7929 return; 7930 7931 down_read(&info->groups_sem); 7932 again: 7933 list_for_each_entry(cache, &info->block_groups[index], list) { 7934 spin_lock(&cache->lock); 7935 btrfs_info(fs_info, 7936 "block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %s", 7937 cache->key.objectid, cache->key.offset, 7938 btrfs_block_group_used(&cache->item), cache->pinned, 7939 cache->reserved, cache->ro ? 
"[readonly]" : ""); 7940 btrfs_dump_free_space(cache, bytes); 7941 spin_unlock(&cache->lock); 7942 } 7943 if (++index < BTRFS_NR_RAID_TYPES) 7944 goto again; 7945 up_read(&info->groups_sem); 7946 } 7947 7948 int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes, 7949 u64 num_bytes, u64 min_alloc_size, 7950 u64 empty_size, u64 hint_byte, 7951 struct btrfs_key *ins, int is_data, int delalloc) 7952 { 7953 struct btrfs_fs_info *fs_info = root->fs_info; 7954 bool final_tried = num_bytes == min_alloc_size; 7955 u64 flags; 7956 int ret; 7957 7958 flags = btrfs_get_alloc_profile(root, is_data); 7959 again: 7960 WARN_ON(num_bytes < fs_info->sectorsize); 7961 ret = find_free_extent(fs_info, ram_bytes, num_bytes, empty_size, 7962 hint_byte, ins, flags, delalloc); 7963 if (!ret && !is_data) { 7964 btrfs_dec_block_group_reservations(fs_info, ins->objectid); 7965 } else if (ret == -ENOSPC) { 7966 if (!final_tried && ins->offset) { 7967 num_bytes = min(num_bytes >> 1, ins->offset); 7968 num_bytes = round_down(num_bytes, 7969 fs_info->sectorsize); 7970 num_bytes = max(num_bytes, min_alloc_size); 7971 ram_bytes = num_bytes; 7972 if (num_bytes == min_alloc_size) 7973 final_tried = true; 7974 goto again; 7975 } else if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) { 7976 struct btrfs_space_info *sinfo; 7977 7978 sinfo = __find_space_info(fs_info, flags); 7979 btrfs_err(fs_info, 7980 "allocation failed flags %llu, wanted %llu", 7981 flags, num_bytes); 7982 if (sinfo) 7983 dump_space_info(fs_info, sinfo, num_bytes, 1); 7984 } 7985 } 7986 7987 return ret; 7988 } 7989 7990 static int __btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info, 7991 u64 start, u64 len, 7992 int pin, int delalloc) 7993 { 7994 struct btrfs_block_group_cache *cache; 7995 int ret = 0; 7996 7997 cache = btrfs_lookup_block_group(fs_info, start); 7998 if (!cache) { 7999 btrfs_err(fs_info, "Unable to find block group for %llu", 8000 start); 8001 return -ENOSPC; 8002 } 8003 8004 if (pin) 8005 pin_down_extent(fs_info, cache, start, len, 1); 8006 else { 8007 if (btrfs_test_opt(fs_info, DISCARD)) 8008 ret = btrfs_discard_extent(fs_info, start, len, NULL); 8009 btrfs_add_free_space(cache, start, len); 8010 btrfs_free_reserved_bytes(cache, len, delalloc); 8011 trace_btrfs_reserved_extent_free(fs_info, start, len); 8012 } 8013 8014 btrfs_put_block_group(cache); 8015 return ret; 8016 } 8017 8018 int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info, 8019 u64 start, u64 len, int delalloc) 8020 { 8021 return __btrfs_free_reserved_extent(fs_info, start, len, 0, delalloc); 8022 } 8023 8024 int btrfs_free_and_pin_reserved_extent(struct btrfs_fs_info *fs_info, 8025 u64 start, u64 len) 8026 { 8027 return __btrfs_free_reserved_extent(fs_info, start, len, 1, 0); 8028 } 8029 8030 static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, 8031 struct btrfs_fs_info *fs_info, 8032 u64 parent, u64 root_objectid, 8033 u64 flags, u64 owner, u64 offset, 8034 struct btrfs_key *ins, int ref_mod) 8035 { 8036 int ret; 8037 struct btrfs_extent_item *extent_item; 8038 struct btrfs_extent_inline_ref *iref; 8039 struct btrfs_path *path; 8040 struct extent_buffer *leaf; 8041 int type; 8042 u32 size; 8043 8044 if (parent > 0) 8045 type = BTRFS_SHARED_DATA_REF_KEY; 8046 else 8047 type = BTRFS_EXTENT_DATA_REF_KEY; 8048 8049 size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type); 8050 8051 path = btrfs_alloc_path(); 8052 if (!path) 8053 return -ENOMEM; 8054 8055 path->leave_spinning = 1; 8056 ret = btrfs_insert_empty_item(trans, 
fs_info->extent_root, path, 8057 ins, size); 8058 if (ret) { 8059 btrfs_free_path(path); 8060 return ret; 8061 } 8062 8063 leaf = path->nodes[0]; 8064 extent_item = btrfs_item_ptr(leaf, path->slots[0], 8065 struct btrfs_extent_item); 8066 btrfs_set_extent_refs(leaf, extent_item, ref_mod); 8067 btrfs_set_extent_generation(leaf, extent_item, trans->transid); 8068 btrfs_set_extent_flags(leaf, extent_item, 8069 flags | BTRFS_EXTENT_FLAG_DATA); 8070 8071 iref = (struct btrfs_extent_inline_ref *)(extent_item + 1); 8072 btrfs_set_extent_inline_ref_type(leaf, iref, type); 8073 if (parent > 0) { 8074 struct btrfs_shared_data_ref *ref; 8075 ref = (struct btrfs_shared_data_ref *)(iref + 1); 8076 btrfs_set_extent_inline_ref_offset(leaf, iref, parent); 8077 btrfs_set_shared_data_ref_count(leaf, ref, ref_mod); 8078 } else { 8079 struct btrfs_extent_data_ref *ref; 8080 ref = (struct btrfs_extent_data_ref *)(&iref->offset); 8081 btrfs_set_extent_data_ref_root(leaf, ref, root_objectid); 8082 btrfs_set_extent_data_ref_objectid(leaf, ref, owner); 8083 btrfs_set_extent_data_ref_offset(leaf, ref, offset); 8084 btrfs_set_extent_data_ref_count(leaf, ref, ref_mod); 8085 } 8086 8087 btrfs_mark_buffer_dirty(path->nodes[0]); 8088 btrfs_free_path(path); 8089 8090 ret = remove_from_free_space_tree(trans, fs_info, ins->objectid, 8091 ins->offset); 8092 if (ret) 8093 return ret; 8094 8095 ret = update_block_group(trans, fs_info, ins->objectid, ins->offset, 1); 8096 if (ret) { /* -ENOENT, logic error */ 8097 btrfs_err(fs_info, "update block group failed for %llu %llu", 8098 ins->objectid, ins->offset); 8099 BUG(); 8100 } 8101 trace_btrfs_reserved_extent_alloc(fs_info, ins->objectid, ins->offset); 8102 return ret; 8103 } 8104 8105 static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, 8106 struct btrfs_fs_info *fs_info, 8107 u64 parent, u64 root_objectid, 8108 u64 flags, struct btrfs_disk_key *key, 8109 int level, struct btrfs_key *ins) 8110 { 8111 int ret; 8112 struct btrfs_extent_item *extent_item; 8113 struct btrfs_tree_block_info *block_info; 8114 struct btrfs_extent_inline_ref *iref; 8115 struct btrfs_path *path; 8116 struct extent_buffer *leaf; 8117 u32 size = sizeof(*extent_item) + sizeof(*iref); 8118 u64 num_bytes = ins->offset; 8119 bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA); 8120 8121 if (!skinny_metadata) 8122 size += sizeof(*block_info); 8123 8124 path = btrfs_alloc_path(); 8125 if (!path) { 8126 btrfs_free_and_pin_reserved_extent(fs_info, ins->objectid, 8127 fs_info->nodesize); 8128 return -ENOMEM; 8129 } 8130 8131 path->leave_spinning = 1; 8132 ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path, 8133 ins, size); 8134 if (ret) { 8135 btrfs_free_path(path); 8136 btrfs_free_and_pin_reserved_extent(fs_info, ins->objectid, 8137 fs_info->nodesize); 8138 return ret; 8139 } 8140 8141 leaf = path->nodes[0]; 8142 extent_item = btrfs_item_ptr(leaf, path->slots[0], 8143 struct btrfs_extent_item); 8144 btrfs_set_extent_refs(leaf, extent_item, 1); 8145 btrfs_set_extent_generation(leaf, extent_item, trans->transid); 8146 btrfs_set_extent_flags(leaf, extent_item, 8147 flags | BTRFS_EXTENT_FLAG_TREE_BLOCK); 8148 8149 if (skinny_metadata) { 8150 iref = (struct btrfs_extent_inline_ref *)(extent_item + 1); 8151 num_bytes = fs_info->nodesize; 8152 } else { 8153 block_info = (struct btrfs_tree_block_info *)(extent_item + 1); 8154 btrfs_set_tree_block_key(leaf, block_info, key); 8155 btrfs_set_tree_block_level(leaf, block_info, level); 8156 iref = (struct btrfs_extent_inline_ref 
*)(block_info + 1); 8157 } 8158 8159 if (parent > 0) { 8160 BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)); 8161 btrfs_set_extent_inline_ref_type(leaf, iref, 8162 BTRFS_SHARED_BLOCK_REF_KEY); 8163 btrfs_set_extent_inline_ref_offset(leaf, iref, parent); 8164 } else { 8165 btrfs_set_extent_inline_ref_type(leaf, iref, 8166 BTRFS_TREE_BLOCK_REF_KEY); 8167 btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid); 8168 } 8169 8170 btrfs_mark_buffer_dirty(leaf); 8171 btrfs_free_path(path); 8172 8173 ret = remove_from_free_space_tree(trans, fs_info, ins->objectid, 8174 num_bytes); 8175 if (ret) 8176 return ret; 8177 8178 ret = update_block_group(trans, fs_info, ins->objectid, 8179 fs_info->nodesize, 1); 8180 if (ret) { /* -ENOENT, logic error */ 8181 btrfs_err(fs_info, "update block group failed for %llu %llu", 8182 ins->objectid, ins->offset); 8183 BUG(); 8184 } 8185 8186 trace_btrfs_reserved_extent_alloc(fs_info, ins->objectid, 8187 fs_info->nodesize); 8188 return ret; 8189 } 8190 8191 int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, 8192 u64 root_objectid, u64 owner, 8193 u64 offset, u64 ram_bytes, 8194 struct btrfs_key *ins) 8195 { 8196 struct btrfs_fs_info *fs_info = trans->fs_info; 8197 int ret; 8198 8199 BUG_ON(root_objectid == BTRFS_TREE_LOG_OBJECTID); 8200 8201 ret = btrfs_add_delayed_data_ref(fs_info, trans, ins->objectid, 8202 ins->offset, 0, 8203 root_objectid, owner, offset, 8204 ram_bytes, BTRFS_ADD_DELAYED_EXTENT); 8205 return ret; 8206 } 8207 8208 /* 8209 * this is used by the tree logging recovery code. It records that 8210 * an extent has been allocated and makes sure to clear the free 8211 * space cache bits as well 8212 */ 8213 int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, 8214 struct btrfs_fs_info *fs_info, 8215 u64 root_objectid, u64 owner, u64 offset, 8216 struct btrfs_key *ins) 8217 { 8218 int ret; 8219 struct btrfs_block_group_cache *block_group; 8220 struct btrfs_space_info *space_info; 8221 8222 /* 8223 * Mixed block groups will exclude before processing the log so we only 8224 * need to do the exclude dance if this fs isn't mixed. 
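 * (For a non-mixed fs, __exclude_logged_extent() marks the logged range as excluded so the free space caching code does not hand it out again while the log is being replayed.)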
8225 */ 8226 if (!btrfs_fs_incompat(fs_info, MIXED_GROUPS)) { 8227 ret = __exclude_logged_extent(fs_info, ins->objectid, 8228 ins->offset); 8229 if (ret) 8230 return ret; 8231 } 8232 8233 block_group = btrfs_lookup_block_group(fs_info, ins->objectid); 8234 if (!block_group) 8235 return -EINVAL; 8236 8237 space_info = block_group->space_info; 8238 spin_lock(&space_info->lock); 8239 spin_lock(&block_group->lock); 8240 space_info->bytes_reserved += ins->offset; 8241 block_group->reserved += ins->offset; 8242 spin_unlock(&block_group->lock); 8243 spin_unlock(&space_info->lock); 8244 8245 ret = alloc_reserved_file_extent(trans, fs_info, 0, root_objectid, 8246 0, owner, offset, ins, 1); 8247 btrfs_put_block_group(block_group); 8248 return ret; 8249 } 8250 8251 static struct extent_buffer * 8252 btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, 8253 u64 bytenr, int level) 8254 { 8255 struct btrfs_fs_info *fs_info = root->fs_info; 8256 struct extent_buffer *buf; 8257 8258 buf = btrfs_find_create_tree_block(fs_info, bytenr); 8259 if (IS_ERR(buf)) 8260 return buf; 8261 8262 btrfs_set_header_generation(buf, trans->transid); 8263 btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level); 8264 btrfs_tree_lock(buf); 8265 clean_tree_block(fs_info, buf); 8266 clear_bit(EXTENT_BUFFER_STALE, &buf->bflags); 8267 8268 btrfs_set_lock_blocking(buf); 8269 set_extent_buffer_uptodate(buf); 8270 8271 if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) { 8272 buf->log_index = root->log_transid % 2; 8273 /* 8274 * we allow two log transactions at a time, use different 8275 * EXENT bit to differentiate dirty pages. 8276 */ 8277 if (buf->log_index == 0) 8278 set_extent_dirty(&root->dirty_log_pages, buf->start, 8279 buf->start + buf->len - 1, GFP_NOFS); 8280 else 8281 set_extent_new(&root->dirty_log_pages, buf->start, 8282 buf->start + buf->len - 1); 8283 } else { 8284 buf->log_index = -1; 8285 set_extent_dirty(&trans->transaction->dirty_pages, buf->start, 8286 buf->start + buf->len - 1, GFP_NOFS); 8287 } 8288 trans->dirty = true; 8289 /* this returns a buffer locked for blocking */ 8290 return buf; 8291 } 8292 8293 static struct btrfs_block_rsv * 8294 use_block_rsv(struct btrfs_trans_handle *trans, 8295 struct btrfs_root *root, u32 blocksize) 8296 { 8297 struct btrfs_fs_info *fs_info = root->fs_info; 8298 struct btrfs_block_rsv *block_rsv; 8299 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; 8300 int ret; 8301 bool global_updated = false; 8302 8303 block_rsv = get_block_rsv(trans, root); 8304 8305 if (unlikely(block_rsv->size == 0)) 8306 goto try_reserve; 8307 again: 8308 ret = block_rsv_use_bytes(block_rsv, blocksize); 8309 if (!ret) 8310 return block_rsv; 8311 8312 if (block_rsv->failfast) 8313 return ERR_PTR(ret); 8314 8315 if (block_rsv->type == BTRFS_BLOCK_RSV_GLOBAL && !global_updated) { 8316 global_updated = true; 8317 update_global_block_rsv(fs_info); 8318 goto again; 8319 } 8320 8321 if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) { 8322 static DEFINE_RATELIMIT_STATE(_rs, 8323 DEFAULT_RATELIMIT_INTERVAL * 10, 8324 /*DEFAULT_RATELIMIT_BURST*/ 1); 8325 if (__ratelimit(&_rs)) 8326 WARN(1, KERN_DEBUG 8327 "BTRFS: block rsv returned %d\n", ret); 8328 } 8329 try_reserve: 8330 ret = reserve_metadata_bytes(root, block_rsv, blocksize, 8331 BTRFS_RESERVE_NO_FLUSH); 8332 if (!ret) 8333 return block_rsv; 8334 /* 8335 * If we couldn't reserve metadata bytes try and use some from 8336 * the global reserve if its space type is the same as the global 8337 * reservation. 
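 * (This is a last resort: the global reserve is only tapped directly here when the rsv is not itself the global one, as checked below.)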
8338 */ 8339 if (block_rsv->type != BTRFS_BLOCK_RSV_GLOBAL && 8340 block_rsv->space_info == global_rsv->space_info) { 8341 ret = block_rsv_use_bytes(global_rsv, blocksize); 8342 if (!ret) 8343 return global_rsv; 8344 } 8345 return ERR_PTR(ret); 8346 } 8347 8348 static void unuse_block_rsv(struct btrfs_fs_info *fs_info, 8349 struct btrfs_block_rsv *block_rsv, u32 blocksize) 8350 { 8351 block_rsv_add_bytes(block_rsv, blocksize, 0); 8352 block_rsv_release_bytes(fs_info, block_rsv, NULL, 0); 8353 } 8354 8355 /* 8356 * finds a free extent and does all the dirty work required for allocation 8357 * returns the tree buffer or an ERR_PTR on error. 8358 */ 8359 struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, 8360 struct btrfs_root *root, 8361 u64 parent, u64 root_objectid, 8362 const struct btrfs_disk_key *key, 8363 int level, u64 hint, 8364 u64 empty_size) 8365 { 8366 struct btrfs_fs_info *fs_info = root->fs_info; 8367 struct btrfs_key ins; 8368 struct btrfs_block_rsv *block_rsv; 8369 struct extent_buffer *buf; 8370 struct btrfs_delayed_extent_op *extent_op; 8371 u64 flags = 0; 8372 int ret; 8373 u32 blocksize = fs_info->nodesize; 8374 bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA); 8375 8376 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS 8377 if (btrfs_is_testing(fs_info)) { 8378 buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr, 8379 level); 8380 if (!IS_ERR(buf)) 8381 root->alloc_bytenr += blocksize; 8382 return buf; 8383 } 8384 #endif 8385 8386 block_rsv = use_block_rsv(trans, root, blocksize); 8387 if (IS_ERR(block_rsv)) 8388 return ERR_CAST(block_rsv); 8389 8390 ret = btrfs_reserve_extent(root, blocksize, blocksize, blocksize, 8391 empty_size, hint, &ins, 0, 0); 8392 if (ret) 8393 goto out_unuse; 8394 8395 buf = btrfs_init_new_buffer(trans, root, ins.objectid, level); 8396 if (IS_ERR(buf)) { 8397 ret = PTR_ERR(buf); 8398 goto out_free_reserved; 8399 } 8400 8401 if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) { 8402 if (parent == 0) 8403 parent = ins.objectid; 8404 flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF; 8405 } else 8406 BUG_ON(parent > 0); 8407 8408 if (root_objectid != BTRFS_TREE_LOG_OBJECTID) { 8409 extent_op = btrfs_alloc_delayed_extent_op(); 8410 if (!extent_op) { 8411 ret = -ENOMEM; 8412 goto out_free_buf; 8413 } 8414 if (key) 8415 memcpy(&extent_op->key, key, sizeof(extent_op->key)); 8416 else 8417 memset(&extent_op->key, 0, sizeof(extent_op->key)); 8418 extent_op->flags_to_set = flags; 8419 extent_op->update_key = skinny_metadata ? 
false : true; 8420 extent_op->update_flags = true; 8421 extent_op->is_data = false; 8422 extent_op->level = level; 8423 8424 ret = btrfs_add_delayed_tree_ref(fs_info, trans, 8425 ins.objectid, ins.offset, 8426 parent, root_objectid, level, 8427 BTRFS_ADD_DELAYED_EXTENT, 8428 extent_op); 8429 if (ret) 8430 goto out_free_delayed; 8431 } 8432 return buf; 8433 8434 out_free_delayed: 8435 btrfs_free_delayed_extent_op(extent_op); 8436 out_free_buf: 8437 free_extent_buffer(buf); 8438 out_free_reserved: 8439 btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 0); 8440 out_unuse: 8441 unuse_block_rsv(fs_info, block_rsv, blocksize); 8442 return ERR_PTR(ret); 8443 } 8444 8445 struct walk_control { 8446 u64 refs[BTRFS_MAX_LEVEL]; 8447 u64 flags[BTRFS_MAX_LEVEL]; 8448 struct btrfs_key update_progress; 8449 int stage; 8450 int level; 8451 int shared_level; 8452 int update_ref; 8453 int keep_locks; 8454 int reada_slot; 8455 int reada_count; 8456 int for_reloc; 8457 }; 8458 8459 #define DROP_REFERENCE 1 8460 #define UPDATE_BACKREF 2 8461 8462 static noinline void reada_walk_down(struct btrfs_trans_handle *trans, 8463 struct btrfs_root *root, 8464 struct walk_control *wc, 8465 struct btrfs_path *path) 8466 { 8467 struct btrfs_fs_info *fs_info = root->fs_info; 8468 u64 bytenr; 8469 u64 generation; 8470 u64 refs; 8471 u64 flags; 8472 u32 nritems; 8473 struct btrfs_key key; 8474 struct extent_buffer *eb; 8475 int ret; 8476 int slot; 8477 int nread = 0; 8478 8479 if (path->slots[wc->level] < wc->reada_slot) { 8480 wc->reada_count = wc->reada_count * 2 / 3; 8481 wc->reada_count = max(wc->reada_count, 2); 8482 } else { 8483 wc->reada_count = wc->reada_count * 3 / 2; 8484 wc->reada_count = min_t(int, wc->reada_count, 8485 BTRFS_NODEPTRS_PER_BLOCK(fs_info)); 8486 } 8487 8488 eb = path->nodes[wc->level]; 8489 nritems = btrfs_header_nritems(eb); 8490 8491 for (slot = path->slots[wc->level]; slot < nritems; slot++) { 8492 if (nread >= wc->reada_count) 8493 break; 8494 8495 cond_resched(); 8496 bytenr = btrfs_node_blockptr(eb, slot); 8497 generation = btrfs_node_ptr_generation(eb, slot); 8498 8499 if (slot == path->slots[wc->level]) 8500 goto reada; 8501 8502 if (wc->stage == UPDATE_BACKREF && 8503 generation <= root->root_key.offset) 8504 continue; 8505 8506 /* We don't lock the tree block, it's OK to be racy here */ 8507 ret = btrfs_lookup_extent_info(trans, fs_info, bytenr, 8508 wc->level - 1, 1, &refs, 8509 &flags); 8510 /* We don't care about errors in readahead. */ 8511 if (ret < 0) 8512 continue; 8513 BUG_ON(refs == 0); 8514 8515 if (wc->stage == DROP_REFERENCE) { 8516 if (refs == 1) 8517 goto reada; 8518 8519 if (wc->level == 1 && 8520 (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) 8521 continue; 8522 if (!wc->update_ref || 8523 generation <= root->root_key.offset) 8524 continue; 8525 btrfs_node_key_to_cpu(eb, &key, slot); 8526 ret = btrfs_comp_cpu_keys(&key, 8527 &wc->update_progress); 8528 if (ret < 0) 8529 continue; 8530 } else { 8531 if (wc->level == 1 && 8532 (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) 8533 continue; 8534 } 8535 reada: 8536 readahead_tree_block(fs_info, bytenr); 8537 nread++; 8538 } 8539 wc->reada_slot = slot; 8540 } 8541 8542 /* 8543 * helper to process tree block while walking down the tree. 8544 * 8545 * when wc->stage == UPDATE_BACKREF, this function updates 8546 * back refs for pointers in the block. 8547 * 8548 * NOTE: return value 1 means we should stop walking down. 
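 * A return of 0 means keep walking down; negative values are errors.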
8549 */ 8550 static noinline int walk_down_proc(struct btrfs_trans_handle *trans, 8551 struct btrfs_root *root, 8552 struct btrfs_path *path, 8553 struct walk_control *wc, int lookup_info) 8554 { 8555 struct btrfs_fs_info *fs_info = root->fs_info; 8556 int level = wc->level; 8557 struct extent_buffer *eb = path->nodes[level]; 8558 u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF; 8559 int ret; 8560 8561 if (wc->stage == UPDATE_BACKREF && 8562 btrfs_header_owner(eb) != root->root_key.objectid) 8563 return 1; 8564 8565 /* 8566 * when reference count of tree block is 1, it won't increase 8567 * again. once full backref flag is set, we never clear it. 8568 */ 8569 if (lookup_info && 8570 ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) || 8571 (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) { 8572 BUG_ON(!path->locks[level]); 8573 ret = btrfs_lookup_extent_info(trans, fs_info, 8574 eb->start, level, 1, 8575 &wc->refs[level], 8576 &wc->flags[level]); 8577 BUG_ON(ret == -ENOMEM); 8578 if (ret) 8579 return ret; 8580 BUG_ON(wc->refs[level] == 0); 8581 } 8582 8583 if (wc->stage == DROP_REFERENCE) { 8584 if (wc->refs[level] > 1) 8585 return 1; 8586 8587 if (path->locks[level] && !wc->keep_locks) { 8588 btrfs_tree_unlock_rw(eb, path->locks[level]); 8589 path->locks[level] = 0; 8590 } 8591 return 0; 8592 } 8593 8594 /* wc->stage == UPDATE_BACKREF */ 8595 if (!(wc->flags[level] & flag)) { 8596 BUG_ON(!path->locks[level]); 8597 ret = btrfs_inc_ref(trans, root, eb, 1); 8598 BUG_ON(ret); /* -ENOMEM */ 8599 ret = btrfs_dec_ref(trans, root, eb, 0); 8600 BUG_ON(ret); /* -ENOMEM */ 8601 ret = btrfs_set_disk_extent_flags(trans, fs_info, eb->start, 8602 eb->len, flag, 8603 btrfs_header_level(eb), 0); 8604 BUG_ON(ret); /* -ENOMEM */ 8605 wc->flags[level] |= flag; 8606 } 8607 8608 /* 8609 * the block is shared by multiple trees, so it's not good to 8610 * keep the tree lock 8611 */ 8612 if (path->locks[level] && level > 0) { 8613 btrfs_tree_unlock_rw(eb, path->locks[level]); 8614 path->locks[level] = 0; 8615 } 8616 return 0; 8617 } 8618 8619 /* 8620 * helper to process tree block pointer. 8621 * 8622 * when wc->stage == DROP_REFERENCE, this function checks 8623 * reference count of the block pointed to. if the block 8624 * is shared and we need update back refs for the subtree 8625 * rooted at the block, this function changes wc->stage to 8626 * UPDATE_BACKREF. if the block is shared and there is no 8627 * need to update back, this function drops the reference 8628 * to the block. 8629 * 8630 * NOTE: return value 1 means we should stop walking down. 
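 * A return of 0 means we descended one level (wc->level and the path were advanced to the child block); negative values are errors.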
8631 */ 8632 static noinline int do_walk_down(struct btrfs_trans_handle *trans, 8633 struct btrfs_root *root, 8634 struct btrfs_path *path, 8635 struct walk_control *wc, int *lookup_info) 8636 { 8637 struct btrfs_fs_info *fs_info = root->fs_info; 8638 u64 bytenr; 8639 u64 generation; 8640 u64 parent; 8641 u32 blocksize; 8642 struct btrfs_key key; 8643 struct extent_buffer *next; 8644 int level = wc->level; 8645 int reada = 0; 8646 int ret = 0; 8647 bool need_account = false; 8648 8649 generation = btrfs_node_ptr_generation(path->nodes[level], 8650 path->slots[level]); 8651 /* 8652 * if the lower level block was created before the snapshot 8653 * was created, we know there is no need to update back refs 8654 * for the subtree 8655 */ 8656 if (wc->stage == UPDATE_BACKREF && 8657 generation <= root->root_key.offset) { 8658 *lookup_info = 1; 8659 return 1; 8660 } 8661 8662 bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]); 8663 blocksize = fs_info->nodesize; 8664 8665 next = find_extent_buffer(fs_info, bytenr); 8666 if (!next) { 8667 next = btrfs_find_create_tree_block(fs_info, bytenr); 8668 if (IS_ERR(next)) 8669 return PTR_ERR(next); 8670 8671 btrfs_set_buffer_lockdep_class(root->root_key.objectid, next, 8672 level - 1); 8673 reada = 1; 8674 } 8675 btrfs_tree_lock(next); 8676 btrfs_set_lock_blocking(next); 8677 8678 ret = btrfs_lookup_extent_info(trans, fs_info, bytenr, level - 1, 1, 8679 &wc->refs[level - 1], 8680 &wc->flags[level - 1]); 8681 if (ret < 0) 8682 goto out_unlock; 8683 8684 if (unlikely(wc->refs[level - 1] == 0)) { 8685 btrfs_err(fs_info, "Missing references."); 8686 ret = -EIO; 8687 goto out_unlock; 8688 } 8689 *lookup_info = 0; 8690 8691 if (wc->stage == DROP_REFERENCE) { 8692 if (wc->refs[level - 1] > 1) { 8693 need_account = true; 8694 if (level == 1 && 8695 (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF)) 8696 goto skip; 8697 8698 if (!wc->update_ref || 8699 generation <= root->root_key.offset) 8700 goto skip; 8701 8702 btrfs_node_key_to_cpu(path->nodes[level], &key, 8703 path->slots[level]); 8704 ret = btrfs_comp_cpu_keys(&key, &wc->update_progress); 8705 if (ret < 0) 8706 goto skip; 8707 8708 wc->stage = UPDATE_BACKREF; 8709 wc->shared_level = level - 1; 8710 } 8711 } else { 8712 if (level == 1 && 8713 (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF)) 8714 goto skip; 8715 } 8716 8717 if (!btrfs_buffer_uptodate(next, generation, 0)) { 8718 btrfs_tree_unlock(next); 8719 free_extent_buffer(next); 8720 next = NULL; 8721 *lookup_info = 1; 8722 } 8723 8724 if (!next) { 8725 if (reada && level == 1) 8726 reada_walk_down(trans, root, wc, path); 8727 next = read_tree_block(fs_info, bytenr, generation); 8728 if (IS_ERR(next)) { 8729 return PTR_ERR(next); 8730 } else if (!extent_buffer_uptodate(next)) { 8731 free_extent_buffer(next); 8732 return -EIO; 8733 } 8734 btrfs_tree_lock(next); 8735 btrfs_set_lock_blocking(next); 8736 } 8737 8738 level--; 8739 ASSERT(level == btrfs_header_level(next)); 8740 if (level != btrfs_header_level(next)) { 8741 btrfs_err(root->fs_info, "mismatched level"); 8742 ret = -EIO; 8743 goto out_unlock; 8744 } 8745 path->nodes[level] = next; 8746 path->slots[level] = 0; 8747 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; 8748 wc->level = level; 8749 if (wc->level == 1) 8750 wc->reada_slot = 0; 8751 return 0; 8752 skip: 8753 wc->refs[level - 1] = 0; 8754 wc->flags[level - 1] = 0; 8755 if (wc->stage == DROP_REFERENCE) { 8756 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) { 8757 parent = path->nodes[level]->start; 8758 } else { 8759 
ASSERT(root->root_key.objectid == 8760 btrfs_header_owner(path->nodes[level])); 8761 if (root->root_key.objectid != 8762 btrfs_header_owner(path->nodes[level])) { 8763 btrfs_err(root->fs_info, 8764 "mismatched block owner"); 8765 ret = -EIO; 8766 goto out_unlock; 8767 } 8768 parent = 0; 8769 } 8770 8771 if (need_account) { 8772 ret = btrfs_qgroup_trace_subtree(trans, root, next, 8773 generation, level - 1); 8774 if (ret) { 8775 btrfs_err_rl(fs_info, 8776 "Error %d accounting shared subtree. Quota is out of sync, rescan required.", 8777 ret); 8778 } 8779 } 8780 ret = btrfs_free_extent(trans, fs_info, bytenr, blocksize, 8781 parent, root->root_key.objectid, 8782 level - 1, 0); 8783 if (ret) 8784 goto out_unlock; 8785 } 8786 8787 *lookup_info = 1; 8788 ret = 1; 8789 8790 out_unlock: 8791 btrfs_tree_unlock(next); 8792 free_extent_buffer(next); 8793 8794 return ret; 8795 } 8796 8797 /* 8798 * helper to process tree block while walking up the tree. 8799 * 8800 * when wc->stage == DROP_REFERENCE, this function drops 8801 * reference count on the block. 8802 * 8803 * when wc->stage == UPDATE_BACKREF, this function changes 8804 * wc->stage back to DROP_REFERENCE if we changed wc->stage 8805 * to UPDATE_BACKREF previously while processing the block. 8806 * 8807 * NOTE: return value 1 means we should stop walking up. 8808 */ 8809 static noinline int walk_up_proc(struct btrfs_trans_handle *trans, 8810 struct btrfs_root *root, 8811 struct btrfs_path *path, 8812 struct walk_control *wc) 8813 { 8814 struct btrfs_fs_info *fs_info = root->fs_info; 8815 int ret; 8816 int level = wc->level; 8817 struct extent_buffer *eb = path->nodes[level]; 8818 u64 parent = 0; 8819 8820 if (wc->stage == UPDATE_BACKREF) { 8821 BUG_ON(wc->shared_level < level); 8822 if (level < wc->shared_level) 8823 goto out; 8824 8825 ret = find_next_key(path, level + 1, &wc->update_progress); 8826 if (ret > 0) 8827 wc->update_ref = 0; 8828 8829 wc->stage = DROP_REFERENCE; 8830 wc->shared_level = -1; 8831 path->slots[level] = 0; 8832 8833 /* 8834 * check reference count again if the block isn't locked. 8835 * we should start walking down the tree again if reference 8836 * count is one. 8837 */ 8838 if (!path->locks[level]) { 8839 BUG_ON(level == 0); 8840 btrfs_tree_lock(eb); 8841 btrfs_set_lock_blocking(eb); 8842 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; 8843 8844 ret = btrfs_lookup_extent_info(trans, fs_info, 8845 eb->start, level, 1, 8846 &wc->refs[level], 8847 &wc->flags[level]); 8848 if (ret < 0) { 8849 btrfs_tree_unlock_rw(eb, path->locks[level]); 8850 path->locks[level] = 0; 8851 return ret; 8852 } 8853 BUG_ON(wc->refs[level] == 0); 8854 if (wc->refs[level] == 1) { 8855 btrfs_tree_unlock_rw(eb, path->locks[level]); 8856 path->locks[level] = 0; 8857 return 1; 8858 } 8859 } 8860 } 8861 8862 /* wc->stage == DROP_REFERENCE */ 8863 BUG_ON(wc->refs[level] > 1 && !path->locks[level]); 8864 8865 if (wc->refs[level] == 1) { 8866 if (level == 0) { 8867 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) 8868 ret = btrfs_dec_ref(trans, root, eb, 1); 8869 else 8870 ret = btrfs_dec_ref(trans, root, eb, 0); 8871 BUG_ON(ret); /* -ENOMEM */ 8872 ret = btrfs_qgroup_trace_leaf_items(trans, fs_info, eb); 8873 if (ret) { 8874 btrfs_err_rl(fs_info, 8875 "error %d accounting leaf items. 
Quota is out of sync, rescan required.", 8876 ret); 8877 } 8878 } 8879 /* make block locked assertion in clean_tree_block happy */ 8880 if (!path->locks[level] && 8881 btrfs_header_generation(eb) == trans->transid) { 8882 btrfs_tree_lock(eb); 8883 btrfs_set_lock_blocking(eb); 8884 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; 8885 } 8886 clean_tree_block(fs_info, eb); 8887 } 8888 8889 if (eb == root->node) { 8890 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) 8891 parent = eb->start; 8892 else 8893 BUG_ON(root->root_key.objectid != 8894 btrfs_header_owner(eb)); 8895 } else { 8896 if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF) 8897 parent = path->nodes[level + 1]->start; 8898 else 8899 BUG_ON(root->root_key.objectid != 8900 btrfs_header_owner(path->nodes[level + 1])); 8901 } 8902 8903 btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1); 8904 out: 8905 wc->refs[level] = 0; 8906 wc->flags[level] = 0; 8907 return 0; 8908 } 8909 8910 static noinline int walk_down_tree(struct btrfs_trans_handle *trans, 8911 struct btrfs_root *root, 8912 struct btrfs_path *path, 8913 struct walk_control *wc) 8914 { 8915 int level = wc->level; 8916 int lookup_info = 1; 8917 int ret; 8918 8919 while (level >= 0) { 8920 ret = walk_down_proc(trans, root, path, wc, lookup_info); 8921 if (ret > 0) 8922 break; 8923 8924 if (level == 0) 8925 break; 8926 8927 if (path->slots[level] >= 8928 btrfs_header_nritems(path->nodes[level])) 8929 break; 8930 8931 ret = do_walk_down(trans, root, path, wc, &lookup_info); 8932 if (ret > 0) { 8933 path->slots[level]++; 8934 continue; 8935 } else if (ret < 0) 8936 return ret; 8937 level = wc->level; 8938 } 8939 return 0; 8940 } 8941 8942 static noinline int walk_up_tree(struct btrfs_trans_handle *trans, 8943 struct btrfs_root *root, 8944 struct btrfs_path *path, 8945 struct walk_control *wc, int max_level) 8946 { 8947 int level = wc->level; 8948 int ret; 8949 8950 path->slots[level] = btrfs_header_nritems(path->nodes[level]); 8951 while (level < max_level && path->nodes[level]) { 8952 wc->level = level; 8953 if (path->slots[level] + 1 < 8954 btrfs_header_nritems(path->nodes[level])) { 8955 path->slots[level]++; 8956 return 0; 8957 } else { 8958 ret = walk_up_proc(trans, root, path, wc); 8959 if (ret > 0) 8960 return 0; 8961 8962 if (path->locks[level]) { 8963 btrfs_tree_unlock_rw(path->nodes[level], 8964 path->locks[level]); 8965 path->locks[level] = 0; 8966 } 8967 free_extent_buffer(path->nodes[level]); 8968 path->nodes[level] = NULL; 8969 level++; 8970 } 8971 } 8972 return 1; 8973 } 8974 8975 /* 8976 * drop a subvolume tree. 8977 * 8978 * this function traverses the tree freeing any blocks that only 8979 * referenced by the tree. 8980 * 8981 * when a shared tree block is found. this function decreases its 8982 * reference count by one. if update_ref is true, this function 8983 * also make sure backrefs for the shared block and all lower level 8984 * blocks are properly updated. 
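 * Progress is recorded in root_item->drop_progress and drop_level after each transaction, so an interrupted drop can be resumed later.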
8985 * 8986 * If called with for_reloc == 0, may exit early with -EAGAIN 8987 */ 8988 int btrfs_drop_snapshot(struct btrfs_root *root, 8989 struct btrfs_block_rsv *block_rsv, int update_ref, 8990 int for_reloc) 8991 { 8992 struct btrfs_fs_info *fs_info = root->fs_info; 8993 struct btrfs_path *path; 8994 struct btrfs_trans_handle *trans; 8995 struct btrfs_root *tree_root = fs_info->tree_root; 8996 struct btrfs_root_item *root_item = &root->root_item; 8997 struct walk_control *wc; 8998 struct btrfs_key key; 8999 int err = 0; 9000 int ret; 9001 int level; 9002 bool root_dropped = false; 9003 9004 btrfs_debug(fs_info, "Drop subvolume %llu", root->objectid); 9005 9006 path = btrfs_alloc_path(); 9007 if (!path) { 9008 err = -ENOMEM; 9009 goto out; 9010 } 9011 9012 wc = kzalloc(sizeof(*wc), GFP_NOFS); 9013 if (!wc) { 9014 btrfs_free_path(path); 9015 err = -ENOMEM; 9016 goto out; 9017 } 9018 9019 trans = btrfs_start_transaction(tree_root, 0); 9020 if (IS_ERR(trans)) { 9021 err = PTR_ERR(trans); 9022 goto out_free; 9023 } 9024 9025 if (block_rsv) 9026 trans->block_rsv = block_rsv; 9027 9028 if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) { 9029 level = btrfs_header_level(root->node); 9030 path->nodes[level] = btrfs_lock_root_node(root); 9031 btrfs_set_lock_blocking(path->nodes[level]); 9032 path->slots[level] = 0; 9033 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; 9034 memset(&wc->update_progress, 0, 9035 sizeof(wc->update_progress)); 9036 } else { 9037 btrfs_disk_key_to_cpu(&key, &root_item->drop_progress); 9038 memcpy(&wc->update_progress, &key, 9039 sizeof(wc->update_progress)); 9040 9041 level = root_item->drop_level; 9042 BUG_ON(level == 0); 9043 path->lowest_level = level; 9044 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 9045 path->lowest_level = 0; 9046 if (ret < 0) { 9047 err = ret; 9048 goto out_end_trans; 9049 } 9050 WARN_ON(ret > 0); 9051 9052 /* 9053 * unlock our path, this is safe because only this 9054 * function is allowed to delete this snapshot 9055 */ 9056 btrfs_unlock_up_safe(path, 0); 9057 9058 level = btrfs_header_level(root->node); 9059 while (1) { 9060 btrfs_tree_lock(path->nodes[level]); 9061 btrfs_set_lock_blocking(path->nodes[level]); 9062 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; 9063 9064 ret = btrfs_lookup_extent_info(trans, fs_info, 9065 path->nodes[level]->start, 9066 level, 1, &wc->refs[level], 9067 &wc->flags[level]); 9068 if (ret < 0) { 9069 err = ret; 9070 goto out_end_trans; 9071 } 9072 BUG_ON(wc->refs[level] == 0); 9073 9074 if (level == root_item->drop_level) 9075 break; 9076 9077 btrfs_tree_unlock(path->nodes[level]); 9078 path->locks[level] = 0; 9079 WARN_ON(wc->refs[level] != 1); 9080 level--; 9081 } 9082 } 9083 9084 wc->level = level; 9085 wc->shared_level = -1; 9086 wc->stage = DROP_REFERENCE; 9087 wc->update_ref = update_ref; 9088 wc->keep_locks = 0; 9089 wc->for_reloc = for_reloc; 9090 wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info); 9091 9092 while (1) { 9093 9094 ret = walk_down_tree(trans, root, path, wc); 9095 if (ret < 0) { 9096 err = ret; 9097 break; 9098 } 9099 9100 ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL); 9101 if (ret < 0) { 9102 err = ret; 9103 break; 9104 } 9105 9106 if (ret > 0) { 9107 BUG_ON(wc->stage != DROP_REFERENCE); 9108 break; 9109 } 9110 9111 if (wc->stage == DROP_REFERENCE) { 9112 level = wc->level; 9113 btrfs_node_key(path->nodes[level], 9114 &root_item->drop_progress, 9115 path->slots[level]); 9116 root_item->drop_level = level; 9117 } 9118 9119 BUG_ON(wc->level == 0); 9120 if 
(btrfs_should_end_transaction(trans) || 9121 (!for_reloc && btrfs_need_cleaner_sleep(fs_info))) { 9122 ret = btrfs_update_root(trans, tree_root, 9123 &root->root_key, 9124 root_item); 9125 if (ret) { 9126 btrfs_abort_transaction(trans, ret); 9127 err = ret; 9128 goto out_end_trans; 9129 } 9130 9131 btrfs_end_transaction_throttle(trans); 9132 if (!for_reloc && btrfs_need_cleaner_sleep(fs_info)) { 9133 btrfs_debug(fs_info, 9134 "drop snapshot early exit"); 9135 err = -EAGAIN; 9136 goto out_free; 9137 } 9138 9139 trans = btrfs_start_transaction(tree_root, 0); 9140 if (IS_ERR(trans)) { 9141 err = PTR_ERR(trans); 9142 goto out_free; 9143 } 9144 if (block_rsv) 9145 trans->block_rsv = block_rsv; 9146 } 9147 } 9148 btrfs_release_path(path); 9149 if (err) 9150 goto out_end_trans; 9151 9152 ret = btrfs_del_root(trans, tree_root, &root->root_key); 9153 if (ret) { 9154 btrfs_abort_transaction(trans, ret); 9155 goto out_end_trans; 9156 } 9157 9158 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) { 9159 ret = btrfs_find_root(tree_root, &root->root_key, path, 9160 NULL, NULL); 9161 if (ret < 0) { 9162 btrfs_abort_transaction(trans, ret); 9163 err = ret; 9164 goto out_end_trans; 9165 } else if (ret > 0) { 9166 /* if we fail to delete the orphan item this time 9167 * around, it'll get picked up the next time. 9168 * 9169 * The most common failure here is just -ENOENT. 9170 */ 9171 btrfs_del_orphan_item(trans, tree_root, 9172 root->root_key.objectid); 9173 } 9174 } 9175 9176 if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state)) { 9177 btrfs_add_dropped_root(trans, root); 9178 } else { 9179 free_extent_buffer(root->node); 9180 free_extent_buffer(root->commit_root); 9181 btrfs_put_fs_root(root); 9182 } 9183 root_dropped = true; 9184 out_end_trans: 9185 btrfs_end_transaction_throttle(trans); 9186 out_free: 9187 kfree(wc); 9188 btrfs_free_path(path); 9189 out: 9190 /* 9191 * So if we need to stop dropping the snapshot for whatever reason we 9192 * need to make sure to add it back to the dead root list so that we 9193 * keep trying to do the work later. This also cleans up roots if we 9194 * don't have it in the radix (like when we recover after a power fail 9195 * or unmount) so we don't leak memory. 9196 */ 9197 if (!for_reloc && root_dropped == false) 9198 btrfs_add_dead_root(root); 9199 if (err && err != -EAGAIN) 9200 btrfs_handle_fs_error(fs_info, err, NULL); 9201 return err; 9202 } 9203 9204 /* 9205 * drop subtree rooted at tree block 'node'. 
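 * The caller must have both 'node' and 'parent' locked; 'parent' is only used to bound the walk and stays locked when this function returns.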
9206 * 9207 * NOTE: this function will unlock and release tree block 'node' 9208 * only used by relocation code 9209 */ 9210 int btrfs_drop_subtree(struct btrfs_trans_handle *trans, 9211 struct btrfs_root *root, 9212 struct extent_buffer *node, 9213 struct extent_buffer *parent) 9214 { 9215 struct btrfs_fs_info *fs_info = root->fs_info; 9216 struct btrfs_path *path; 9217 struct walk_control *wc; 9218 int level; 9219 int parent_level; 9220 int ret = 0; 9221 int wret; 9222 9223 BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID); 9224 9225 path = btrfs_alloc_path(); 9226 if (!path) 9227 return -ENOMEM; 9228 9229 wc = kzalloc(sizeof(*wc), GFP_NOFS); 9230 if (!wc) { 9231 btrfs_free_path(path); 9232 return -ENOMEM; 9233 } 9234 9235 btrfs_assert_tree_locked(parent); 9236 parent_level = btrfs_header_level(parent); 9237 extent_buffer_get(parent); 9238 path->nodes[parent_level] = parent; 9239 path->slots[parent_level] = btrfs_header_nritems(parent); 9240 9241 btrfs_assert_tree_locked(node); 9242 level = btrfs_header_level(node); 9243 path->nodes[level] = node; 9244 path->slots[level] = 0; 9245 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; 9246 9247 wc->refs[parent_level] = 1; 9248 wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF; 9249 wc->level = level; 9250 wc->shared_level = -1; 9251 wc->stage = DROP_REFERENCE; 9252 wc->update_ref = 0; 9253 wc->keep_locks = 1; 9254 wc->for_reloc = 1; 9255 wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info); 9256 9257 while (1) { 9258 wret = walk_down_tree(trans, root, path, wc); 9259 if (wret < 0) { 9260 ret = wret; 9261 break; 9262 } 9263 9264 wret = walk_up_tree(trans, root, path, wc, parent_level); 9265 if (wret < 0) 9266 ret = wret; 9267 if (wret != 0) 9268 break; 9269 } 9270 9271 kfree(wc); 9272 btrfs_free_path(path); 9273 return ret; 9274 } 9275 9276 static u64 update_block_group_flags(struct btrfs_fs_info *fs_info, u64 flags) 9277 { 9278 u64 num_devices; 9279 u64 stripped; 9280 9281 /* 9282 * if restripe for this chunk_type is on pick target profile and 9283 * return, otherwise do the usual balance 9284 */ 9285 stripped = get_restripe_target(fs_info, flags); 9286 if (stripped) 9287 return extended_to_chunk(stripped); 9288 9289 num_devices = fs_info->fs_devices->rw_devices; 9290 9291 stripped = BTRFS_BLOCK_GROUP_RAID0 | 9292 BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6 | 9293 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10; 9294 9295 if (num_devices == 1) { 9296 stripped |= BTRFS_BLOCK_GROUP_DUP; 9297 stripped = flags & ~stripped; 9298 9299 /* turn raid0 into single device chunks */ 9300 if (flags & BTRFS_BLOCK_GROUP_RAID0) 9301 return stripped; 9302 9303 /* turn mirroring into duplication */ 9304 if (flags & (BTRFS_BLOCK_GROUP_RAID1 | 9305 BTRFS_BLOCK_GROUP_RAID10)) 9306 return stripped | BTRFS_BLOCK_GROUP_DUP; 9307 } else { 9308 /* they already had raid on here, just return */ 9309 if (flags & stripped) 9310 return flags; 9311 9312 stripped |= BTRFS_BLOCK_GROUP_DUP; 9313 stripped = flags & ~stripped; 9314 9315 /* switch duplicated blocks with raid1 */ 9316 if (flags & BTRFS_BLOCK_GROUP_DUP) 9317 return stripped | BTRFS_BLOCK_GROUP_RAID1; 9318 9319 /* this is drive concat, leave it alone */ 9320 } 9321 9322 return flags; 9323 } 9324 9325 static int inc_block_group_ro(struct btrfs_block_group_cache *cache, int force) 9326 { 9327 struct btrfs_space_info *sinfo = cache->space_info; 9328 u64 num_bytes; 9329 u64 min_allocable_bytes; 9330 int ret = -ENOSPC; 9331 9332 /* 9333 * We need some metadata space and system metadata space for 
9334 * allocating chunks in some corner cases until we force to set 9335 * it to be readonly. 9336 */ 9337 if ((sinfo->flags & 9338 (BTRFS_BLOCK_GROUP_SYSTEM | BTRFS_BLOCK_GROUP_METADATA)) && 9339 !force) 9340 min_allocable_bytes = SZ_1M; 9341 else 9342 min_allocable_bytes = 0; 9343 9344 spin_lock(&sinfo->lock); 9345 spin_lock(&cache->lock); 9346 9347 if (cache->ro) { 9348 cache->ro++; 9349 ret = 0; 9350 goto out; 9351 } 9352 9353 num_bytes = cache->key.offset - cache->reserved - cache->pinned - 9354 cache->bytes_super - btrfs_block_group_used(&cache->item); 9355 9356 if (btrfs_space_info_used(sinfo, true) + num_bytes + 9357 min_allocable_bytes <= sinfo->total_bytes) { 9358 sinfo->bytes_readonly += num_bytes; 9359 cache->ro++; 9360 list_add_tail(&cache->ro_list, &sinfo->ro_bgs); 9361 ret = 0; 9362 } 9363 out: 9364 spin_unlock(&cache->lock); 9365 spin_unlock(&sinfo->lock); 9366 return ret; 9367 } 9368 9369 int btrfs_inc_block_group_ro(struct btrfs_fs_info *fs_info, 9370 struct btrfs_block_group_cache *cache) 9371 9372 { 9373 struct btrfs_trans_handle *trans; 9374 u64 alloc_flags; 9375 int ret; 9376 9377 again: 9378 trans = btrfs_join_transaction(fs_info->extent_root); 9379 if (IS_ERR(trans)) 9380 return PTR_ERR(trans); 9381 9382 /* 9383 * we're not allowed to set block groups readonly after the dirty 9384 * block groups cache has started writing. If it already started, 9385 * back off and let this transaction commit 9386 */ 9387 mutex_lock(&fs_info->ro_block_group_mutex); 9388 if (test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &trans->transaction->flags)) { 9389 u64 transid = trans->transid; 9390 9391 mutex_unlock(&fs_info->ro_block_group_mutex); 9392 btrfs_end_transaction(trans); 9393 9394 ret = btrfs_wait_for_commit(fs_info, transid); 9395 if (ret) 9396 return ret; 9397 goto again; 9398 } 9399 9400 /* 9401 * if we are changing raid levels, try to allocate a corresponding 9402 * block group with the new raid level. 9403 */ 9404 alloc_flags = update_block_group_flags(fs_info, cache->flags); 9405 if (alloc_flags != cache->flags) { 9406 ret = do_chunk_alloc(trans, fs_info, alloc_flags, 9407 CHUNK_ALLOC_FORCE); 9408 /* 9409 * ENOSPC is allowed here, we may have enough space 9410 * already allocated at the new raid level to 9411 * carry on 9412 */ 9413 if (ret == -ENOSPC) 9414 ret = 0; 9415 if (ret < 0) 9416 goto out; 9417 } 9418 9419 ret = inc_block_group_ro(cache, 0); 9420 if (!ret) 9421 goto out; 9422 alloc_flags = get_alloc_profile(fs_info, cache->space_info->flags); 9423 ret = do_chunk_alloc(trans, fs_info, alloc_flags, 9424 CHUNK_ALLOC_FORCE); 9425 if (ret < 0) 9426 goto out; 9427 ret = inc_block_group_ro(cache, 0); 9428 out: 9429 if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) { 9430 alloc_flags = update_block_group_flags(fs_info, cache->flags); 9431 mutex_lock(&fs_info->chunk_mutex); 9432 check_system_chunk(trans, fs_info, alloc_flags); 9433 mutex_unlock(&fs_info->chunk_mutex); 9434 } 9435 mutex_unlock(&fs_info->ro_block_group_mutex); 9436 9437 btrfs_end_transaction(trans); 9438 return ret; 9439 } 9440 9441 int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, 9442 struct btrfs_fs_info *fs_info, u64 type) 9443 { 9444 u64 alloc_flags = get_alloc_profile(fs_info, type); 9445 9446 return do_chunk_alloc(trans, fs_info, alloc_flags, CHUNK_ALLOC_FORCE); 9447 } 9448 9449 /* 9450 * helper to account the unused space of all the readonly block group in the 9451 * space_info. takes mirrors into account. 
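 * (RAID1, RAID10 and DUP keep two copies, so their unused bytes count twice; every other profile uses a factor of one.)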
9452 */ 9453 u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo) 9454 { 9455 struct btrfs_block_group_cache *block_group; 9456 u64 free_bytes = 0; 9457 int factor; 9458 9459 /* It's df, we don't care if it's racy */ 9460 if (list_empty(&sinfo->ro_bgs)) 9461 return 0; 9462 9463 spin_lock(&sinfo->lock); 9464 list_for_each_entry(block_group, &sinfo->ro_bgs, ro_list) { 9465 spin_lock(&block_group->lock); 9466 9467 if (!block_group->ro) { 9468 spin_unlock(&block_group->lock); 9469 continue; 9470 } 9471 9472 if (block_group->flags & (BTRFS_BLOCK_GROUP_RAID1 | 9473 BTRFS_BLOCK_GROUP_RAID10 | 9474 BTRFS_BLOCK_GROUP_DUP)) 9475 factor = 2; 9476 else 9477 factor = 1; 9478 9479 free_bytes += (block_group->key.offset - 9480 btrfs_block_group_used(&block_group->item)) * 9481 factor; 9482 9483 spin_unlock(&block_group->lock); 9484 } 9485 spin_unlock(&sinfo->lock); 9486 9487 return free_bytes; 9488 } 9489 9490 void btrfs_dec_block_group_ro(struct btrfs_block_group_cache *cache) 9491 { 9492 struct btrfs_space_info *sinfo = cache->space_info; 9493 u64 num_bytes; 9494 9495 BUG_ON(!cache->ro); 9496 9497 spin_lock(&sinfo->lock); 9498 spin_lock(&cache->lock); 9499 if (!--cache->ro) { 9500 num_bytes = cache->key.offset - cache->reserved - 9501 cache->pinned - cache->bytes_super - 9502 btrfs_block_group_used(&cache->item); 9503 sinfo->bytes_readonly -= num_bytes; 9504 list_del_init(&cache->ro_list); 9505 } 9506 spin_unlock(&cache->lock); 9507 spin_unlock(&sinfo->lock); 9508 } 9509 9510 /* 9511 * checks to see if its even possible to relocate this block group. 9512 * 9513 * @return - -1 if it's not a good idea to relocate this block group, 0 if its 9514 * ok to go ahead and try. 9515 */ 9516 int btrfs_can_relocate(struct btrfs_fs_info *fs_info, u64 bytenr) 9517 { 9518 struct btrfs_root *root = fs_info->extent_root; 9519 struct btrfs_block_group_cache *block_group; 9520 struct btrfs_space_info *space_info; 9521 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 9522 struct btrfs_device *device; 9523 struct btrfs_trans_handle *trans; 9524 u64 min_free; 9525 u64 dev_min = 1; 9526 u64 dev_nr = 0; 9527 u64 target; 9528 int debug; 9529 int index; 9530 int full = 0; 9531 int ret = 0; 9532 9533 debug = btrfs_test_opt(fs_info, ENOSPC_DEBUG); 9534 9535 block_group = btrfs_lookup_block_group(fs_info, bytenr); 9536 9537 /* odd, couldn't find the block group, leave it alone */ 9538 if (!block_group) { 9539 if (debug) 9540 btrfs_warn(fs_info, 9541 "can't find block group for bytenr %llu", 9542 bytenr); 9543 return -1; 9544 } 9545 9546 min_free = btrfs_block_group_used(&block_group->item); 9547 9548 /* no bytes used, we're good */ 9549 if (!min_free) 9550 goto out; 9551 9552 space_info = block_group->space_info; 9553 spin_lock(&space_info->lock); 9554 9555 full = space_info->full; 9556 9557 /* 9558 * if this is the last block group we have in this space, we can't 9559 * relocate it unless we're able to allocate a new chunk below. 9560 * 9561 * Otherwise, we need to make sure we have room in the space to handle 9562 * all of the extents from this block group. 
If we can, we're good 9563 */ 9564 if ((space_info->total_bytes != block_group->key.offset) && 9565 (btrfs_space_info_used(space_info, false) + min_free < 9566 space_info->total_bytes)) { 9567 spin_unlock(&space_info->lock); 9568 goto out; 9569 } 9570 spin_unlock(&space_info->lock); 9571 9572 /* 9573 * ok we don't have enough space, but maybe we have free space on our 9574 * devices to allocate new chunks for relocation, so loop through our 9575 * alloc devices and guess if we have enough space. if this block 9576 * group is going to be restriped, run checks against the target 9577 * profile instead of the current one. 9578 */ 9579 ret = -1; 9580 9581 /* 9582 * index: 9583 * 0: raid10 9584 * 1: raid1 9585 * 2: dup 9586 * 3: raid0 9587 * 4: single 9588 */ 9589 target = get_restripe_target(fs_info, block_group->flags); 9590 if (target) { 9591 index = __get_raid_index(extended_to_chunk(target)); 9592 } else { 9593 /* 9594 * this is just a balance, so if we were marked as full 9595 * we know there is no space for a new chunk 9596 */ 9597 if (full) { 9598 if (debug) 9599 btrfs_warn(fs_info, 9600 "no space to alloc new chunk for block group %llu", 9601 block_group->key.objectid); 9602 goto out; 9603 } 9604 9605 index = get_block_group_index(block_group); 9606 } 9607 9608 if (index == BTRFS_RAID_RAID10) { 9609 dev_min = 4; 9610 /* Divide by 2 */ 9611 min_free >>= 1; 9612 } else if (index == BTRFS_RAID_RAID1) { 9613 dev_min = 2; 9614 } else if (index == BTRFS_RAID_DUP) { 9615 /* Multiply by 2 */ 9616 min_free <<= 1; 9617 } else if (index == BTRFS_RAID_RAID0) { 9618 dev_min = fs_devices->rw_devices; 9619 min_free = div64_u64(min_free, dev_min); 9620 } 9621 9622 /* We need to do this so that we can look at pending chunks */ 9623 trans = btrfs_join_transaction(root); 9624 if (IS_ERR(trans)) { 9625 ret = PTR_ERR(trans); 9626 goto out; 9627 } 9628 9629 mutex_lock(&fs_info->chunk_mutex); 9630 list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) { 9631 u64 dev_offset; 9632 9633 /* 9634 * check to make sure we can actually find a chunk with enough 9635 * space to fit our block group in. 
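 * (min_free was already adjusted above for the profile: halved for raid10, doubled for dup, and divided across the rw devices for raid0.)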
9636 */ 9637 if (device->total_bytes > device->bytes_used + min_free && 9638 !device->is_tgtdev_for_dev_replace) { 9639 ret = find_free_dev_extent(trans, device, min_free, 9640 &dev_offset, NULL); 9641 if (!ret) 9642 dev_nr++; 9643 9644 if (dev_nr >= dev_min) 9645 break; 9646 9647 ret = -1; 9648 } 9649 } 9650 if (debug && ret == -1) 9651 btrfs_warn(fs_info, 9652 "no space to allocate a new chunk for block group %llu", 9653 block_group->key.objectid); 9654 mutex_unlock(&fs_info->chunk_mutex); 9655 btrfs_end_transaction(trans); 9656 out: 9657 btrfs_put_block_group(block_group); 9658 return ret; 9659 } 9660 9661 static int find_first_block_group(struct btrfs_fs_info *fs_info, 9662 struct btrfs_path *path, 9663 struct btrfs_key *key) 9664 { 9665 struct btrfs_root *root = fs_info->extent_root; 9666 int ret = 0; 9667 struct btrfs_key found_key; 9668 struct extent_buffer *leaf; 9669 int slot; 9670 9671 ret = btrfs_search_slot(NULL, root, key, path, 0, 0); 9672 if (ret < 0) 9673 goto out; 9674 9675 while (1) { 9676 slot = path->slots[0]; 9677 leaf = path->nodes[0]; 9678 if (slot >= btrfs_header_nritems(leaf)) { 9679 ret = btrfs_next_leaf(root, path); 9680 if (ret == 0) 9681 continue; 9682 if (ret < 0) 9683 goto out; 9684 break; 9685 } 9686 btrfs_item_key_to_cpu(leaf, &found_key, slot); 9687 9688 if (found_key.objectid >= key->objectid && 9689 found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) { 9690 struct extent_map_tree *em_tree; 9691 struct extent_map *em; 9692 9693 em_tree = &root->fs_info->mapping_tree.map_tree; 9694 read_lock(&em_tree->lock); 9695 em = lookup_extent_mapping(em_tree, found_key.objectid, 9696 found_key.offset); 9697 read_unlock(&em_tree->lock); 9698 if (!em) { 9699 btrfs_err(fs_info, 9700 "logical %llu len %llu found bg but no related chunk", 9701 found_key.objectid, found_key.offset); 9702 ret = -ENOENT; 9703 } else { 9704 ret = 0; 9705 } 9706 free_extent_map(em); 9707 goto out; 9708 } 9709 path->slots[0]++; 9710 } 9711 out: 9712 return ret; 9713 } 9714 9715 void btrfs_put_block_group_cache(struct btrfs_fs_info *info) 9716 { 9717 struct btrfs_block_group_cache *block_group; 9718 u64 last = 0; 9719 9720 while (1) { 9721 struct inode *inode; 9722 9723 block_group = btrfs_lookup_first_block_group(info, last); 9724 while (block_group) { 9725 spin_lock(&block_group->lock); 9726 if (block_group->iref) 9727 break; 9728 spin_unlock(&block_group->lock); 9729 block_group = next_block_group(info, block_group); 9730 } 9731 if (!block_group) { 9732 if (last == 0) 9733 break; 9734 last = 0; 9735 continue; 9736 } 9737 9738 inode = block_group->inode; 9739 block_group->iref = 0; 9740 block_group->inode = NULL; 9741 spin_unlock(&block_group->lock); 9742 ASSERT(block_group->io_ctl.inode == NULL); 9743 iput(inode); 9744 last = block_group->key.objectid + block_group->key.offset; 9745 btrfs_put_block_group(block_group); 9746 } 9747 } 9748 9749 /* 9750 * Must be called only after stopping all workers, since we could have block 9751 * group caching kthreads running, and therefore they could race with us if we 9752 * freed the block groups before stopping them. 
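 * (In particular, the caching_ctl list is drained and its references dropped under commit_root_sem right below, which is only safe once no caching worker can still be queuing new entries or using them.)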
9753 */ 9754 int btrfs_free_block_groups(struct btrfs_fs_info *info) 9755 { 9756 struct btrfs_block_group_cache *block_group; 9757 struct btrfs_space_info *space_info; 9758 struct btrfs_caching_control *caching_ctl; 9759 struct rb_node *n; 9760 9761 down_write(&info->commit_root_sem); 9762 while (!list_empty(&info->caching_block_groups)) { 9763 caching_ctl = list_entry(info->caching_block_groups.next, 9764 struct btrfs_caching_control, list); 9765 list_del(&caching_ctl->list); 9766 put_caching_control(caching_ctl); 9767 } 9768 up_write(&info->commit_root_sem); 9769 9770 spin_lock(&info->unused_bgs_lock); 9771 while (!list_empty(&info->unused_bgs)) { 9772 block_group = list_first_entry(&info->unused_bgs, 9773 struct btrfs_block_group_cache, 9774 bg_list); 9775 list_del_init(&block_group->bg_list); 9776 btrfs_put_block_group(block_group); 9777 } 9778 spin_unlock(&info->unused_bgs_lock); 9779 9780 spin_lock(&info->block_group_cache_lock); 9781 while ((n = rb_last(&info->block_group_cache_tree)) != NULL) { 9782 block_group = rb_entry(n, struct btrfs_block_group_cache, 9783 cache_node); 9784 rb_erase(&block_group->cache_node, 9785 &info->block_group_cache_tree); 9786 RB_CLEAR_NODE(&block_group->cache_node); 9787 spin_unlock(&info->block_group_cache_lock); 9788 9789 down_write(&block_group->space_info->groups_sem); 9790 list_del(&block_group->list); 9791 up_write(&block_group->space_info->groups_sem); 9792 9793 /* 9794 * We haven't cached this block group, which means we could 9795 * possibly have excluded extents on this block group. 9796 */ 9797 if (block_group->cached == BTRFS_CACHE_NO || 9798 block_group->cached == BTRFS_CACHE_ERROR) 9799 free_excluded_extents(info, block_group); 9800 9801 btrfs_remove_free_space_cache(block_group); 9802 ASSERT(block_group->cached != BTRFS_CACHE_STARTED); 9803 ASSERT(list_empty(&block_group->dirty_list)); 9804 ASSERT(list_empty(&block_group->io_list)); 9805 ASSERT(list_empty(&block_group->bg_list)); 9806 ASSERT(atomic_read(&block_group->count) == 1); 9807 btrfs_put_block_group(block_group); 9808 9809 spin_lock(&info->block_group_cache_lock); 9810 } 9811 spin_unlock(&info->block_group_cache_lock); 9812 9813 /* now that all the block groups are freed, go through and 9814 * free all the space_info structs. This is only called during 9815 * the final stages of unmount, and so we know nobody is 9816 * using them. We call synchronize_rcu() once before we start, 9817 * just to be on the safe side. 9818 */ 9819 synchronize_rcu(); 9820 9821 release_global_block_rsv(info); 9822 9823 while (!list_empty(&info->space_info)) { 9824 int i; 9825 9826 space_info = list_entry(info->space_info.next, 9827 struct btrfs_space_info, 9828 list); 9829 9830 /* 9831 * Do not hide this behind enospc_debug, this is actually 9832 * important and indicates a real bug if this happens. 
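 * (Every block group has been freed by this point, so any non-zero bytes_pinned, bytes_reserved or bytes_may_use left in a space_info means a reservation was leaked earlier; that is why the dump_space_info() call below is not gated on the enospc_debug mount option.)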
9833 */ 9834 if (WARN_ON(space_info->bytes_pinned > 0 || 9835 space_info->bytes_reserved > 0 || 9836 space_info->bytes_may_use > 0)) 9837 dump_space_info(info, space_info, 0, 0); 9838 list_del(&space_info->list); 9839 for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) { 9840 struct kobject *kobj; 9841 kobj = space_info->block_group_kobjs[i]; 9842 space_info->block_group_kobjs[i] = NULL; 9843 if (kobj) { 9844 kobject_del(kobj); 9845 kobject_put(kobj); 9846 } 9847 } 9848 kobject_del(&space_info->kobj); 9849 kobject_put(&space_info->kobj); 9850 } 9851 return 0; 9852 } 9853 9854 static void __link_block_group(struct btrfs_space_info *space_info, 9855 struct btrfs_block_group_cache *cache) 9856 { 9857 int index = get_block_group_index(cache); 9858 bool first = false; 9859 9860 down_write(&space_info->groups_sem); 9861 if (list_empty(&space_info->block_groups[index])) 9862 first = true; 9863 list_add_tail(&cache->list, &space_info->block_groups[index]); 9864 up_write(&space_info->groups_sem); 9865 9866 if (first) { 9867 struct raid_kobject *rkobj; 9868 int ret; 9869 9870 rkobj = kzalloc(sizeof(*rkobj), GFP_NOFS); 9871 if (!rkobj) 9872 goto out_err; 9873 rkobj->raid_type = index; 9874 kobject_init(&rkobj->kobj, &btrfs_raid_ktype); 9875 ret = kobject_add(&rkobj->kobj, &space_info->kobj, 9876 "%s", get_raid_name(index)); 9877 if (ret) { 9878 kobject_put(&rkobj->kobj); 9879 goto out_err; 9880 } 9881 space_info->block_group_kobjs[index] = &rkobj->kobj; 9882 } 9883 9884 return; 9885 out_err: 9886 btrfs_warn(cache->fs_info, 9887 "failed to add kobject for block cache, ignoring"); 9888 } 9889 9890 static struct btrfs_block_group_cache * 9891 btrfs_create_block_group_cache(struct btrfs_fs_info *fs_info, 9892 u64 start, u64 size) 9893 { 9894 struct btrfs_block_group_cache *cache; 9895 9896 cache = kzalloc(sizeof(*cache), GFP_NOFS); 9897 if (!cache) 9898 return NULL; 9899 9900 cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl), 9901 GFP_NOFS); 9902 if (!cache->free_space_ctl) { 9903 kfree(cache); 9904 return NULL; 9905 } 9906 9907 cache->key.objectid = start; 9908 cache->key.offset = size; 9909 cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; 9910 9911 cache->sectorsize = fs_info->sectorsize; 9912 cache->fs_info = fs_info; 9913 cache->full_stripe_len = btrfs_full_stripe_len(fs_info, 9914 &fs_info->mapping_tree, 9915 start); 9916 set_free_space_tree_thresholds(cache); 9917 9918 atomic_set(&cache->count, 1); 9919 spin_lock_init(&cache->lock); 9920 init_rwsem(&cache->data_rwsem); 9921 INIT_LIST_HEAD(&cache->list); 9922 INIT_LIST_HEAD(&cache->cluster_list); 9923 INIT_LIST_HEAD(&cache->bg_list); 9924 INIT_LIST_HEAD(&cache->ro_list); 9925 INIT_LIST_HEAD(&cache->dirty_list); 9926 INIT_LIST_HEAD(&cache->io_list); 9927 btrfs_init_free_space_ctl(cache); 9928 atomic_set(&cache->trimming, 0); 9929 mutex_init(&cache->free_space_lock); 9930 btrfs_init_full_stripe_locks_tree(&cache->full_stripe_locks_root); 9931 9932 return cache; 9933 } 9934 9935 int btrfs_read_block_groups(struct btrfs_fs_info *info) 9936 { 9937 struct btrfs_path *path; 9938 int ret; 9939 struct btrfs_block_group_cache *cache; 9940 struct btrfs_space_info *space_info; 9941 struct btrfs_key key; 9942 struct btrfs_key found_key; 9943 struct extent_buffer *leaf; 9944 int need_clear = 0; 9945 u64 cache_gen; 9946 u64 feature; 9947 int mixed; 9948 9949 feature = btrfs_super_incompat_flags(info->super_copy); 9950 mixed = !!(feature & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS); 9951 9952 key.objectid = 0; 9953 key.offset = 0; 9954 key.type = 
BTRFS_BLOCK_GROUP_ITEM_KEY; 9955 path = btrfs_alloc_path(); 9956 if (!path) 9957 return -ENOMEM; 9958 path->reada = READA_FORWARD; 9959 9960 cache_gen = btrfs_super_cache_generation(info->super_copy); 9961 if (btrfs_test_opt(info, SPACE_CACHE) && 9962 btrfs_super_generation(info->super_copy) != cache_gen) 9963 need_clear = 1; 9964 if (btrfs_test_opt(info, CLEAR_CACHE)) 9965 need_clear = 1; 9966 9967 while (1) { 9968 ret = find_first_block_group(info, path, &key); 9969 if (ret > 0) 9970 break; 9971 if (ret != 0) 9972 goto error; 9973 9974 leaf = path->nodes[0]; 9975 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 9976 9977 cache = btrfs_create_block_group_cache(info, found_key.objectid, 9978 found_key.offset); 9979 if (!cache) { 9980 ret = -ENOMEM; 9981 goto error; 9982 } 9983 9984 if (need_clear) { 9985 /* 9986 * When we mount with an old space cache, we need to 9987 * set BTRFS_DC_CLEAR and set the dirty flag. 9988 * 9989 * a) Setting 'BTRFS_DC_CLEAR' makes sure that we 9990 * truncate the old free space cache inode and 9991 * set up a new one. 9992 * b) Setting the 'dirty' flag makes sure that we flush 9993 * the new space cache info onto disk. 9994 */ 9995 if (btrfs_test_opt(info, SPACE_CACHE)) 9996 cache->disk_cache_state = BTRFS_DC_CLEAR; 9997 } 9998 9999 read_extent_buffer(leaf, &cache->item, 10000 btrfs_item_ptr_offset(leaf, path->slots[0]), 10001 sizeof(cache->item)); 10002 cache->flags = btrfs_block_group_flags(&cache->item); 10003 if (!mixed && 10004 ((cache->flags & BTRFS_BLOCK_GROUP_METADATA) && 10005 (cache->flags & BTRFS_BLOCK_GROUP_DATA))) { 10006 btrfs_err(info, 10007 "bg %llu is a mixed block group but filesystem hasn't enabled mixed block groups", 10008 cache->key.objectid); 10009 ret = -EINVAL; 10010 goto error; 10011 } 10012 10013 key.objectid = found_key.objectid + found_key.offset; 10014 btrfs_release_path(path); 10015 10016 /* 10017 * We need to exclude the super stripes now so that the space 10018 * info has super bytes accounted for, otherwise we'll think 10019 * we have more space than we actually do. 10020 */ 10021 ret = exclude_super_stripes(info, cache); 10022 if (ret) { 10023 /* 10024 * We may have excluded something, so call this just in 10025 * case. 10026 */ 10027 free_excluded_extents(info, cache); 10028 btrfs_put_block_group(cache); 10029 goto error; 10030 } 10031 10032 /* 10033 * check for two cases: either we are full, and therefore 10034 * don't need to bother with the caching work since we won't 10035 * find any space, or we are empty, and we can just add all 10036 * the space in and be done with it. This saves us a lot of 10037 * time, particularly in the full case.
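 * Concretely: used == size means the group is fully allocated and there is no free space to cache, while used == 0 means the whole [objectid, objectid + offset) range can be handed to add_new_free_space() in one call; both cases are marked BTRFS_CACHE_FINISHED right away.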
10038 */ 10039 if (found_key.offset == btrfs_block_group_used(&cache->item)) { 10040 cache->last_byte_to_unpin = (u64)-1; 10041 cache->cached = BTRFS_CACHE_FINISHED; 10042 free_excluded_extents(info, cache); 10043 } else if (btrfs_block_group_used(&cache->item) == 0) { 10044 cache->last_byte_to_unpin = (u64)-1; 10045 cache->cached = BTRFS_CACHE_FINISHED; 10046 add_new_free_space(cache, info, 10047 found_key.objectid, 10048 found_key.objectid + 10049 found_key.offset); 10050 free_excluded_extents(info, cache); 10051 } 10052 10053 ret = btrfs_add_block_group_cache(info, cache); 10054 if (ret) { 10055 btrfs_remove_free_space_cache(cache); 10056 btrfs_put_block_group(cache); 10057 goto error; 10058 } 10059 10060 trace_btrfs_add_block_group(info, cache, 0); 10061 ret = update_space_info(info, cache->flags, found_key.offset, 10062 btrfs_block_group_used(&cache->item), 10063 cache->bytes_super, &space_info); 10064 if (ret) { 10065 btrfs_remove_free_space_cache(cache); 10066 spin_lock(&info->block_group_cache_lock); 10067 rb_erase(&cache->cache_node, 10068 &info->block_group_cache_tree); 10069 RB_CLEAR_NODE(&cache->cache_node); 10070 spin_unlock(&info->block_group_cache_lock); 10071 btrfs_put_block_group(cache); 10072 goto error; 10073 } 10074 10075 cache->space_info = space_info; 10076 10077 __link_block_group(space_info, cache); 10078 10079 set_avail_alloc_bits(info, cache->flags); 10080 if (btrfs_chunk_readonly(info, cache->key.objectid)) { 10081 inc_block_group_ro(cache, 1); 10082 } else if (btrfs_block_group_used(&cache->item) == 0) { 10083 spin_lock(&info->unused_bgs_lock); 10084 /* Should always be true but just in case. */ 10085 if (list_empty(&cache->bg_list)) { 10086 btrfs_get_block_group(cache); 10087 list_add_tail(&cache->bg_list, 10088 &info->unused_bgs); 10089 } 10090 spin_unlock(&info->unused_bgs_lock); 10091 } 10092 } 10093 10094 list_for_each_entry_rcu(space_info, &info->space_info, list) { 10095 if (!(get_alloc_profile(info, space_info->flags) & 10096 (BTRFS_BLOCK_GROUP_RAID10 | 10097 BTRFS_BLOCK_GROUP_RAID1 | 10098 BTRFS_BLOCK_GROUP_RAID5 | 10099 BTRFS_BLOCK_GROUP_RAID6 | 10100 BTRFS_BLOCK_GROUP_DUP))) 10101 continue; 10102 /* 10103 * avoid allocating from un-mirrored block group if there are 10104 * mirrored block groups. 
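 * (This is done by flipping the existing RAID0 and SINGLE block groups of such a space_info read-only below, so new allocations are served from the redundant profiles instead.)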
10105 */ 10106 list_for_each_entry(cache, 10107 &space_info->block_groups[BTRFS_RAID_RAID0], 10108 list) 10109 inc_block_group_ro(cache, 1); 10110 list_for_each_entry(cache, 10111 &space_info->block_groups[BTRFS_RAID_SINGLE], 10112 list) 10113 inc_block_group_ro(cache, 1); 10114 } 10115 10116 init_global_block_rsv(info); 10117 ret = 0; 10118 error: 10119 btrfs_free_path(path); 10120 return ret; 10121 } 10122 10123 void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans, 10124 struct btrfs_fs_info *fs_info) 10125 { 10126 struct btrfs_block_group_cache *block_group, *tmp; 10127 struct btrfs_root *extent_root = fs_info->extent_root; 10128 struct btrfs_block_group_item item; 10129 struct btrfs_key key; 10130 int ret = 0; 10131 bool can_flush_pending_bgs = trans->can_flush_pending_bgs; 10132 10133 trans->can_flush_pending_bgs = false; 10134 list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, bg_list) { 10135 if (ret) 10136 goto next; 10137 10138 spin_lock(&block_group->lock); 10139 memcpy(&item, &block_group->item, sizeof(item)); 10140 memcpy(&key, &block_group->key, sizeof(key)); 10141 spin_unlock(&block_group->lock); 10142 10143 ret = btrfs_insert_item(trans, extent_root, &key, &item, 10144 sizeof(item)); 10145 if (ret) 10146 btrfs_abort_transaction(trans, ret); 10147 ret = btrfs_finish_chunk_alloc(trans, fs_info, key.objectid, 10148 key.offset); 10149 if (ret) 10150 btrfs_abort_transaction(trans, ret); 10151 add_block_group_free_space(trans, fs_info, block_group); 10152 /* already aborted the transaction if it failed. */ 10153 next: 10154 list_del_init(&block_group->bg_list); 10155 } 10156 trans->can_flush_pending_bgs = can_flush_pending_bgs; 10157 } 10158 10159 int btrfs_make_block_group(struct btrfs_trans_handle *trans, 10160 struct btrfs_fs_info *fs_info, u64 bytes_used, 10161 u64 type, u64 chunk_objectid, u64 chunk_offset, 10162 u64 size) 10163 { 10164 struct btrfs_block_group_cache *cache; 10165 int ret; 10166 10167 btrfs_set_log_full_commit(fs_info, trans); 10168 10169 cache = btrfs_create_block_group_cache(fs_info, chunk_offset, size); 10170 if (!cache) 10171 return -ENOMEM; 10172 10173 btrfs_set_block_group_used(&cache->item, bytes_used); 10174 btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid); 10175 btrfs_set_block_group_flags(&cache->item, type); 10176 10177 cache->flags = type; 10178 cache->last_byte_to_unpin = (u64)-1; 10179 cache->cached = BTRFS_CACHE_FINISHED; 10180 cache->needs_free_space = 1; 10181 ret = exclude_super_stripes(fs_info, cache); 10182 if (ret) { 10183 /* 10184 * We may have excluded something, so call this just in 10185 * case. 10186 */ 10187 free_excluded_extents(fs_info, cache); 10188 btrfs_put_block_group(cache); 10189 return ret; 10190 } 10191 10192 add_new_free_space(cache, fs_info, chunk_offset, chunk_offset + size); 10193 10194 free_excluded_extents(fs_info, cache); 10195 10196 #ifdef CONFIG_BTRFS_DEBUG 10197 if (btrfs_should_fragment_free_space(cache)) { 10198 u64 new_bytes_used = size - bytes_used; 10199 10200 bytes_used += new_bytes_used >> 1; 10201 fragment_free_space(cache); 10202 } 10203 #endif 10204 /* 10205 * Call to ensure the corresponding space_info object is created and 10206 * assigned to our block group, but don't update its counters just yet. 10207 * We want our bg to be added to the rbtree with its ->space_info set. 
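 * (Hence update_space_info() is called twice: once here with zero sizes just to attach cache->space_info, and again further down, once the group is in the rbtree, with the real size, bytes_used and bytes_super values.)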
10208 */ 10209 ret = update_space_info(fs_info, cache->flags, 0, 0, 0, 10210 &cache->space_info); 10211 if (ret) { 10212 btrfs_remove_free_space_cache(cache); 10213 btrfs_put_block_group(cache); 10214 return ret; 10215 } 10216 10217 ret = btrfs_add_block_group_cache(fs_info, cache); 10218 if (ret) { 10219 btrfs_remove_free_space_cache(cache); 10220 btrfs_put_block_group(cache); 10221 return ret; 10222 } 10223 10224 /* 10225 * Now that our block group has its ->space_info set and is inserted in 10226 * the rbtree, update the space info's counters. 10227 */ 10228 trace_btrfs_add_block_group(fs_info, cache, 1); 10229 ret = update_space_info(fs_info, cache->flags, size, bytes_used, 10230 cache->bytes_super, &cache->space_info); 10231 if (ret) { 10232 btrfs_remove_free_space_cache(cache); 10233 spin_lock(&fs_info->block_group_cache_lock); 10234 rb_erase(&cache->cache_node, 10235 &fs_info->block_group_cache_tree); 10236 RB_CLEAR_NODE(&cache->cache_node); 10237 spin_unlock(&fs_info->block_group_cache_lock); 10238 btrfs_put_block_group(cache); 10239 return ret; 10240 } 10241 update_global_block_rsv(fs_info); 10242 10243 __link_block_group(cache->space_info, cache); 10244 10245 list_add_tail(&cache->bg_list, &trans->new_bgs); 10246 10247 set_avail_alloc_bits(fs_info, type); 10248 return 0; 10249 } 10250 10251 static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) 10252 { 10253 u64 extra_flags = chunk_to_extended(flags) & 10254 BTRFS_EXTENDED_PROFILE_MASK; 10255 10256 write_seqlock(&fs_info->profiles_lock); 10257 if (flags & BTRFS_BLOCK_GROUP_DATA) 10258 fs_info->avail_data_alloc_bits &= ~extra_flags; 10259 if (flags & BTRFS_BLOCK_GROUP_METADATA) 10260 fs_info->avail_metadata_alloc_bits &= ~extra_flags; 10261 if (flags & BTRFS_BLOCK_GROUP_SYSTEM) 10262 fs_info->avail_system_alloc_bits &= ~extra_flags; 10263 write_sequnlock(&fs_info->profiles_lock); 10264 } 10265 10266 int btrfs_remove_block_group(struct btrfs_trans_handle *trans, 10267 struct btrfs_fs_info *fs_info, u64 group_start, 10268 struct extent_map *em) 10269 { 10270 struct btrfs_root *root = fs_info->extent_root; 10271 struct btrfs_path *path; 10272 struct btrfs_block_group_cache *block_group; 10273 struct btrfs_free_cluster *cluster; 10274 struct btrfs_root *tree_root = fs_info->tree_root; 10275 struct btrfs_key key; 10276 struct inode *inode; 10277 struct kobject *kobj = NULL; 10278 int ret; 10279 int index; 10280 int factor; 10281 struct btrfs_caching_control *caching_ctl = NULL; 10282 bool remove_em; 10283 10284 block_group = btrfs_lookup_block_group(fs_info, group_start); 10285 BUG_ON(!block_group); 10286 BUG_ON(!block_group->ro); 10287 10288 /* 10289 * Free the reserved super bytes from this block group before 10290 * removing it.
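 * (These are the ranges set up by exclude_super_stripes() when the group was created or first read in, tracked via cache->bytes_super rather than as regular free space.)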
10291 */ 10292 free_excluded_extents(fs_info, block_group); 10293 10294 memcpy(&key, &block_group->key, sizeof(key)); 10295 index = get_block_group_index(block_group); 10296 if (block_group->flags & (BTRFS_BLOCK_GROUP_DUP | 10297 BTRFS_BLOCK_GROUP_RAID1 | 10298 BTRFS_BLOCK_GROUP_RAID10)) 10299 factor = 2; 10300 else 10301 factor = 1; 10302 10303 /* make sure this block group isn't part of an allocation cluster */ 10304 cluster = &fs_info->data_alloc_cluster; 10305 spin_lock(&cluster->refill_lock); 10306 btrfs_return_cluster_to_free_space(block_group, cluster); 10307 spin_unlock(&cluster->refill_lock); 10308 10309 /* 10310 * make sure this block group isn't part of a metadata 10311 * allocation cluster 10312 */ 10313 cluster = &fs_info->meta_alloc_cluster; 10314 spin_lock(&cluster->refill_lock); 10315 btrfs_return_cluster_to_free_space(block_group, cluster); 10316 spin_unlock(&cluster->refill_lock); 10317 10318 path = btrfs_alloc_path(); 10319 if (!path) { 10320 ret = -ENOMEM; 10321 goto out; 10322 } 10323 10324 /* 10325 * get the inode first so any iput calls done for the io_list 10326 * aren't the final iput (no unlinks allowed now) 10327 */ 10328 inode = lookup_free_space_inode(fs_info, block_group, path); 10329 10330 mutex_lock(&trans->transaction->cache_write_mutex); 10331 /* 10332 * make sure our free space cache IO is done before removing the 10333 * free space inode 10334 */ 10335 spin_lock(&trans->transaction->dirty_bgs_lock); 10336 if (!list_empty(&block_group->io_list)) { 10337 list_del_init(&block_group->io_list); 10338 10339 WARN_ON(!IS_ERR(inode) && inode != block_group->io_ctl.inode); 10340 10341 spin_unlock(&trans->transaction->dirty_bgs_lock); 10342 btrfs_wait_cache_io(trans, block_group, path); 10343 btrfs_put_block_group(block_group); 10344 spin_lock(&trans->transaction->dirty_bgs_lock); 10345 } 10346 10347 if (!list_empty(&block_group->dirty_list)) { 10348 list_del_init(&block_group->dirty_list); 10349 btrfs_put_block_group(block_group); 10350 } 10351 spin_unlock(&trans->transaction->dirty_bgs_lock); 10352 mutex_unlock(&trans->transaction->cache_write_mutex); 10353 10354 if (!IS_ERR(inode)) { 10355 ret = btrfs_orphan_add(trans, BTRFS_I(inode)); 10356 if (ret) { 10357 btrfs_add_delayed_iput(inode); 10358 goto out; 10359 } 10360 clear_nlink(inode); 10361 /* One for the block group's ref */ 10362 spin_lock(&block_group->lock); 10363 if (block_group->iref) { 10364 block_group->iref = 0; 10365 block_group->inode = NULL; 10366 spin_unlock(&block_group->lock); 10367 iput(inode); 10368 } else { 10369 spin_unlock(&block_group->lock); 10370 } 10371 /* One for our lookup ref */ 10372 btrfs_add_delayed_iput(inode); 10373 } 10374 10375 key.objectid = BTRFS_FREE_SPACE_OBJECTID; 10376 key.offset = block_group->key.objectid; 10377 key.type = 0; 10378 10379 ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1); 10380 if (ret < 0) 10381 goto out; 10382 if (ret > 0) 10383 btrfs_release_path(path); 10384 if (ret == 0) { 10385 ret = btrfs_del_item(trans, tree_root, path); 10386 if (ret) 10387 goto out; 10388 btrfs_release_path(path); 10389 } 10390 10391 spin_lock(&fs_info->block_group_cache_lock); 10392 rb_erase(&block_group->cache_node, 10393 &fs_info->block_group_cache_tree); 10394 RB_CLEAR_NODE(&block_group->cache_node); 10395 10396 if (fs_info->first_logical_byte == block_group->key.objectid) 10397 fs_info->first_logical_byte = (u64)-1; 10398 spin_unlock(&fs_info->block_group_cache_lock); 10399 10400 down_write(&block_group->space_info->groups_sem); 10401 /* 10402 * we must use
list_del_init so people can check to see if they 10403 * are still on the list after taking the semaphore 10404 */ 10405 list_del_init(&block_group->list); 10406 if (list_empty(&block_group->space_info->block_groups[index])) { 10407 kobj = block_group->space_info->block_group_kobjs[index]; 10408 block_group->space_info->block_group_kobjs[index] = NULL; 10409 clear_avail_alloc_bits(fs_info, block_group->flags); 10410 } 10411 up_write(&block_group->space_info->groups_sem); 10412 if (kobj) { 10413 kobject_del(kobj); 10414 kobject_put(kobj); 10415 } 10416 10417 if (block_group->has_caching_ctl) 10418 caching_ctl = get_caching_control(block_group); 10419 if (block_group->cached == BTRFS_CACHE_STARTED) 10420 wait_block_group_cache_done(block_group); 10421 if (block_group->has_caching_ctl) { 10422 down_write(&fs_info->commit_root_sem); 10423 if (!caching_ctl) { 10424 struct btrfs_caching_control *ctl; 10425 10426 list_for_each_entry(ctl, 10427 &fs_info->caching_block_groups, list) 10428 if (ctl->block_group == block_group) { 10429 caching_ctl = ctl; 10430 refcount_inc(&caching_ctl->count); 10431 break; 10432 } 10433 } 10434 if (caching_ctl) 10435 list_del_init(&caching_ctl->list); 10436 up_write(&fs_info->commit_root_sem); 10437 if (caching_ctl) { 10438 /* Once for the caching bgs list and once for us. */ 10439 put_caching_control(caching_ctl); 10440 put_caching_control(caching_ctl); 10441 } 10442 } 10443 10444 spin_lock(&trans->transaction->dirty_bgs_lock); 10445 if (!list_empty(&block_group->dirty_list)) { 10446 WARN_ON(1); 10447 } 10448 if (!list_empty(&block_group->io_list)) { 10449 WARN_ON(1); 10450 } 10451 spin_unlock(&trans->transaction->dirty_bgs_lock); 10452 btrfs_remove_free_space_cache(block_group); 10453 10454 spin_lock(&block_group->space_info->lock); 10455 list_del_init(&block_group->ro_list); 10456 10457 if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) { 10458 WARN_ON(block_group->space_info->total_bytes 10459 < block_group->key.offset); 10460 WARN_ON(block_group->space_info->bytes_readonly 10461 < block_group->key.offset); 10462 WARN_ON(block_group->space_info->disk_total 10463 < block_group->key.offset * factor); 10464 } 10465 block_group->space_info->total_bytes -= block_group->key.offset; 10466 block_group->space_info->bytes_readonly -= block_group->key.offset; 10467 block_group->space_info->disk_total -= block_group->key.offset * factor; 10468 10469 spin_unlock(&block_group->space_info->lock); 10470 10471 memcpy(&key, &block_group->key, sizeof(key)); 10472 10473 mutex_lock(&fs_info->chunk_mutex); 10474 if (!list_empty(&em->list)) { 10475 /* We're in the transaction->pending_chunks list. */ 10476 free_extent_map(em); 10477 } 10478 spin_lock(&block_group->lock); 10479 block_group->removed = 1; 10480 /* 10481 * At this point trimming can't start on this block group, because we 10482 * removed the block group from the tree fs_info->block_group_cache_tree 10483 * so no one can find it anymore, and even if someone already got this 10484 * block group before we removed it from the rbtree, they have already 10485 * incremented block_group->trimming - if they didn't, they won't find 10486 * any free space entries because we already removed them all when we 10487 * called btrfs_remove_free_space_cache(). 10488 * 10489 * And we must not remove the extent map from the fs_info->mapping_tree 10490 * to prevent the same logical address range and physical device space 10491 * ranges from being reused for a new block group.
This is because our 10492 * fs trim operation (btrfs_trim_fs() / btrfs_ioctl_fitrim()) is 10493 * completely transactionless, so while it is trimming a range the 10494 * currently running transaction might finish and a new one start, 10495 * allowing for new block groups to be created that can reuse the same 10496 * physical device locations unless we take this special care. 10497 * 10498 * There may also be an implicit trim operation if the file system 10499 * is mounted with -odiscard. The same protections must remain 10500 * in place until the extents have been discarded completely when 10501 * the transaction commit has completed. 10502 */ 10503 remove_em = (atomic_read(&block_group->trimming) == 0); 10504 /* 10505 * Make sure a trimmer task always sees the em in the pinned_chunks list 10506 * if it sees block_group->removed == 1 (needs to lock block_group->lock 10507 * before checking block_group->removed). 10508 */ 10509 if (!remove_em) { 10510 /* 10511 * Our em might be in trans->transaction->pending_chunks which 10512 * is protected by fs_info->chunk_mutex ([lock|unlock]_chunks), 10513 * and so is the fs_info->pinned_chunks list. 10514 * 10515 * So at this point we must be holding the chunk_mutex to avoid 10516 * any races with chunk allocation (more specifically at 10517 * volumes.c:contains_pending_extent()), to ensure it always 10518 * sees the em, either in the pending_chunks list or in the 10519 * pinned_chunks list. 10520 */ 10521 list_move_tail(&em->list, &fs_info->pinned_chunks); 10522 } 10523 spin_unlock(&block_group->lock); 10524 10525 if (remove_em) { 10526 struct extent_map_tree *em_tree; 10527 10528 em_tree = &fs_info->mapping_tree.map_tree; 10529 write_lock(&em_tree->lock); 10530 /* 10531 * The em might be in the pending_chunks list, so make sure the 10532 * chunk mutex is locked, since remove_extent_mapping() will 10533 * delete us from that list. 10534 */ 10535 remove_extent_mapping(em_tree, em); 10536 write_unlock(&em_tree->lock); 10537 /* once for the tree */ 10538 free_extent_map(em); 10539 } 10540 10541 mutex_unlock(&fs_info->chunk_mutex); 10542 10543 ret = remove_block_group_free_space(trans, fs_info, block_group); 10544 if (ret) 10545 goto out; 10546 10547 btrfs_put_block_group(block_group); 10548 btrfs_put_block_group(block_group); 10549 10550 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 10551 if (ret > 0) 10552 ret = -EIO; 10553 if (ret < 0) 10554 goto out; 10555 10556 ret = btrfs_del_item(trans, root, path); 10557 out: 10558 btrfs_free_path(path); 10559 return ret; 10560 } 10561 10562 struct btrfs_trans_handle * 10563 btrfs_start_trans_remove_block_group(struct btrfs_fs_info *fs_info, 10564 const u64 chunk_offset) 10565 { 10566 struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree; 10567 struct extent_map *em; 10568 struct map_lookup *map; 10569 unsigned int num_items; 10570 10571 read_lock(&em_tree->lock); 10572 em = lookup_extent_mapping(em_tree, chunk_offset, 1); 10573 read_unlock(&em_tree->lock); 10574 ASSERT(em && em->start == chunk_offset); 10575 10576 /* 10577 * We need to reserve 3 + N units from the metadata space info in order 10578 * to remove a block group (done at btrfs_remove_chunk() and at 10579 * btrfs_remove_block_group()), which are used for: 10580 * 10581 * 1 unit for adding the free space inode's orphan (located in the tree 10582 * of tree roots). 10583 * 1 unit for deleting the block group item (located in the extent 10584 * tree). 
10585 * 1 unit for deleting the free space item (located in tree of tree 10586 * roots). 10587 * N units for deleting N device extent items corresponding to each 10588 * stripe (located in the device tree). 10589 * 10590 * In order to remove a block group we also need to reserve units in the 10591 * system space info in order to update the chunk tree (update one or 10592 * more device items and remove one chunk item), but this is done at 10593 * btrfs_remove_chunk() through a call to check_system_chunk(). 10594 */ 10595 map = em->map_lookup; 10596 num_items = 3 + map->num_stripes; 10597 free_extent_map(em); 10598 10599 return btrfs_start_transaction_fallback_global_rsv(fs_info->extent_root, 10600 num_items, 1); 10601 } 10602 10603 /* 10604 * Process the unused_bgs list and remove any that don't have any allocated 10605 * space inside of them. 10606 */ 10607 void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) 10608 { 10609 struct btrfs_block_group_cache *block_group; 10610 struct btrfs_space_info *space_info; 10611 struct btrfs_trans_handle *trans; 10612 int ret = 0; 10613 10614 if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags)) 10615 return; 10616 10617 spin_lock(&fs_info->unused_bgs_lock); 10618 while (!list_empty(&fs_info->unused_bgs)) { 10619 u64 start, end; 10620 int trimming; 10621 10622 block_group = list_first_entry(&fs_info->unused_bgs, 10623 struct btrfs_block_group_cache, 10624 bg_list); 10625 list_del_init(&block_group->bg_list); 10626 10627 space_info = block_group->space_info; 10628 10629 if (ret || btrfs_mixed_space_info(space_info)) { 10630 btrfs_put_block_group(block_group); 10631 continue; 10632 } 10633 spin_unlock(&fs_info->unused_bgs_lock); 10634 10635 mutex_lock(&fs_info->delete_unused_bgs_mutex); 10636 10637 /* Don't want to race with allocators so take the groups_sem */ 10638 down_write(&space_info->groups_sem); 10639 spin_lock(&block_group->lock); 10640 if (block_group->reserved || 10641 btrfs_block_group_used(&block_group->item) || 10642 block_group->ro || 10643 list_is_singular(&block_group->list)) { 10644 /* 10645 * We want to bail if we made new allocations or have 10646 * outstanding allocations in this block group. We do 10647 * the ro check in case balance is currently acting on 10648 * this block group. 10649 */ 10650 spin_unlock(&block_group->lock); 10651 up_write(&space_info->groups_sem); 10652 goto next; 10653 } 10654 spin_unlock(&block_group->lock); 10655 10656 /* We don't want to force the issue, only flip if it's ok. */ 10657 ret = inc_block_group_ro(block_group, 0); 10658 up_write(&space_info->groups_sem); 10659 if (ret < 0) { 10660 ret = 0; 10661 goto next; 10662 } 10663 10664 /* 10665 * Want to do this before we do anything else so we can recover 10666 * properly if we fail to join the transaction. 10667 */ 10668 trans = btrfs_start_trans_remove_block_group(fs_info, 10669 block_group->key.objectid); 10670 if (IS_ERR(trans)) { 10671 btrfs_dec_block_group_ro(block_group); 10672 ret = PTR_ERR(trans); 10673 goto next; 10674 } 10675 10676 /* 10677 * We could have pending pinned extents for this block group, 10678 * just delete them, we don't care about them anymore. 10679 */ 10680 start = block_group->key.objectid; 10681 end = start + block_group->key.offset - 1; 10682 /* 10683 * Hold the unused_bg_unpin_mutex lock to avoid racing with 10684 * btrfs_finish_extent_commit(). 
If we are at transaction N, 10685 * another task might be running finish_extent_commit() for the 10686 * previous transaction N - 1, and have seen a range belonging 10687 * to the block group in freed_extents[] before we were able to 10688 * clear the whole block group range from freed_extents[]. This 10689 * means that task can look up the block group after we 10690 * unpinned it from freed_extents[] and removed it, leading to 10691 * a BUG_ON() at btrfs_unpin_extent_range(). 10692 */ 10693 mutex_lock(&fs_info->unused_bg_unpin_mutex); 10694 ret = clear_extent_bits(&fs_info->freed_extents[0], start, end, 10695 EXTENT_DIRTY); 10696 if (ret) { 10697 mutex_unlock(&fs_info->unused_bg_unpin_mutex); 10698 btrfs_dec_block_group_ro(block_group); 10699 goto end_trans; 10700 } 10701 ret = clear_extent_bits(&fs_info->freed_extents[1], start, end, 10702 EXTENT_DIRTY); 10703 if (ret) { 10704 mutex_unlock(&fs_info->unused_bg_unpin_mutex); 10705 btrfs_dec_block_group_ro(block_group); 10706 goto end_trans; 10707 } 10708 mutex_unlock(&fs_info->unused_bg_unpin_mutex); 10709 10710 /* Reset pinned so btrfs_put_block_group doesn't complain */ 10711 spin_lock(&space_info->lock); 10712 spin_lock(&block_group->lock); 10713 10714 space_info->bytes_pinned -= block_group->pinned; 10715 space_info->bytes_readonly += block_group->pinned; 10716 percpu_counter_add(&space_info->total_bytes_pinned, 10717 -block_group->pinned); 10718 block_group->pinned = 0; 10719 10720 spin_unlock(&block_group->lock); 10721 spin_unlock(&space_info->lock); 10722 10723 /* DISCARD can flip during remount */ 10724 trimming = btrfs_test_opt(fs_info, DISCARD); 10725 10726 /* Implicit trim during transaction commit. */ 10727 if (trimming) 10728 btrfs_get_block_group_trimming(block_group); 10729 10730 /* 10731 * btrfs_remove_chunk() will abort the transaction if things go 10732 * horribly wrong. 10733 */ 10734 ret = btrfs_remove_chunk(trans, fs_info, 10735 block_group->key.objectid); 10736 10737 if (ret) { 10738 if (trimming) 10739 btrfs_put_block_group_trimming(block_group); 10740 goto end_trans; 10741 } 10742 10743 /* 10744 * If we're not mounted with -odiscard, we can just forget 10745 * about this block group. Otherwise we'll need to wait 10746 * until transaction commit to do the actual discard. 10747 */ 10748 if (trimming) { 10749 spin_lock(&fs_info->unused_bgs_lock); 10750 /* 10751 * A concurrent scrub might have added us to the list 10752 * fs_info->unused_bgs, so use a list_move operation 10753 * to add the block group to the deleted_bgs list.
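 * (list_move() removes the group from whichever list it currently sits on before queueing it on the transaction's deleted_bgs list, which is what makes this safe against that race.)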
10754 */ 10755 list_move(&block_group->bg_list, 10756 &trans->transaction->deleted_bgs); 10757 spin_unlock(&fs_info->unused_bgs_lock); 10758 btrfs_get_block_group(block_group); 10759 } 10760 end_trans: 10761 btrfs_end_transaction(trans); 10762 next: 10763 mutex_unlock(&fs_info->delete_unused_bgs_mutex); 10764 btrfs_put_block_group(block_group); 10765 spin_lock(&fs_info->unused_bgs_lock); 10766 } 10767 spin_unlock(&fs_info->unused_bgs_lock); 10768 } 10769 10770 int btrfs_init_space_info(struct btrfs_fs_info *fs_info) 10771 { 10772 struct btrfs_space_info *space_info; 10773 struct btrfs_super_block *disk_super; 10774 u64 features; 10775 u64 flags; 10776 int mixed = 0; 10777 int ret; 10778 10779 disk_super = fs_info->super_copy; 10780 if (!btrfs_super_root(disk_super)) 10781 return -EINVAL; 10782 10783 features = btrfs_super_incompat_flags(disk_super); 10784 if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) 10785 mixed = 1; 10786 10787 flags = BTRFS_BLOCK_GROUP_SYSTEM; 10788 ret = update_space_info(fs_info, flags, 0, 0, 0, &space_info); 10789 if (ret) 10790 goto out; 10791 10792 if (mixed) { 10793 flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA; 10794 ret = update_space_info(fs_info, flags, 0, 0, 0, &space_info); 10795 } else { 10796 flags = BTRFS_BLOCK_GROUP_METADATA; 10797 ret = update_space_info(fs_info, flags, 0, 0, 0, &space_info); 10798 if (ret) 10799 goto out; 10800 10801 flags = BTRFS_BLOCK_GROUP_DATA; 10802 ret = update_space_info(fs_info, flags, 0, 0, 0, &space_info); 10803 } 10804 out: 10805 return ret; 10806 } 10807 10808 int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, 10809 u64 start, u64 end) 10810 { 10811 return unpin_extent_range(fs_info, start, end, false); 10812 } 10813 10814 /* 10815 * It used to be that old block groups would be left around forever. 10816 * Iterating over them would be enough to trim unused space. Since we 10817 * now automatically remove them, we also need to iterate over unallocated 10818 * space. 10819 * 10820 * We don't want a transaction for this since the discard may take a 10821 * substantial amount of time. We don't require that a transaction be 10822 * running, but we do need to take a running transaction into account 10823 * to ensure that we're not discarding chunks that were released in 10824 * the current transaction. 10825 * 10826 * Holding the chunks lock will prevent other threads from allocating 10827 * or releasing chunks, but it won't prevent a running transaction 10828 * from committing and releasing the memory that the pending chunks 10829 * list head uses. For that, we need to take a reference to the 10830 * transaction. 10831 */ 10832 static int btrfs_trim_free_extents(struct btrfs_device *device, 10833 u64 minlen, u64 *trimmed) 10834 { 10835 u64 start = 0, len = 0; 10836 int ret; 10837 10838 *trimmed = 0; 10839 10840 /* Not writeable = nothing to do. */ 10841 if (!device->writeable) 10842 return 0; 10843 10844 /* No free space = nothing to do. 
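 * (device->total_bytes <= device->bytes_used means every byte of the device is already covered by allocated chunks, so there is no unallocated range left to discard.)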
*/ 10845 if (device->total_bytes <= device->bytes_used) 10846 return 0; 10847 10848 ret = 0; 10849 10850 while (1) { 10851 struct btrfs_fs_info *fs_info = device->fs_info; 10852 struct btrfs_transaction *trans; 10853 u64 bytes; 10854 10855 ret = mutex_lock_interruptible(&fs_info->chunk_mutex); 10856 if (ret) 10857 return ret; 10858 10859 down_read(&fs_info->commit_root_sem); 10860 10861 spin_lock(&fs_info->trans_lock); 10862 trans = fs_info->running_transaction; 10863 if (trans) 10864 refcount_inc(&trans->use_count); 10865 spin_unlock(&fs_info->trans_lock); 10866 10867 ret = find_free_dev_extent_start(trans, device, minlen, start, 10868 &start, &len); 10869 if (trans) 10870 btrfs_put_transaction(trans); 10871 10872 if (ret) { 10873 up_read(&fs_info->commit_root_sem); 10874 mutex_unlock(&fs_info->chunk_mutex); 10875 if (ret == -ENOSPC) 10876 ret = 0; 10877 break; 10878 } 10879 10880 ret = btrfs_issue_discard(device->bdev, start, len, &bytes); 10881 up_read(&fs_info->commit_root_sem); 10882 mutex_unlock(&fs_info->chunk_mutex); 10883 10884 if (ret) 10885 break; 10886 10887 start += len; 10888 *trimmed += bytes; 10889 10890 if (fatal_signal_pending(current)) { 10891 ret = -ERESTARTSYS; 10892 break; 10893 } 10894 10895 cond_resched(); 10896 } 10897 10898 return ret; 10899 } 10900 10901 int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range) 10902 { 10903 struct btrfs_block_group_cache *cache = NULL; 10904 struct btrfs_device *device; 10905 struct list_head *devices; 10906 u64 group_trimmed; 10907 u64 start; 10908 u64 end; 10909 u64 trimmed = 0; 10910 u64 total_bytes = btrfs_super_total_bytes(fs_info->super_copy); 10911 int ret = 0; 10912 10913 /* 10914 * try to trim all FS space, our block group may start from non-zero. 10915 */ 10916 if (range->len == total_bytes) 10917 cache = btrfs_lookup_first_block_group(fs_info, range->start); 10918 else 10919 cache = btrfs_lookup_block_group(fs_info, range->start); 10920 10921 while (cache) { 10922 if (cache->key.objectid >= (range->start + range->len)) { 10923 btrfs_put_block_group(cache); 10924 break; 10925 } 10926 10927 start = max(range->start, cache->key.objectid); 10928 end = min(range->start + range->len, 10929 cache->key.objectid + cache->key.offset); 10930 10931 if (end - start >= range->minlen) { 10932 if (!block_group_cache_done(cache)) { 10933 ret = cache_block_group(cache, 0); 10934 if (ret) { 10935 btrfs_put_block_group(cache); 10936 break; 10937 } 10938 ret = wait_block_group_cache_done(cache); 10939 if (ret) { 10940 btrfs_put_block_group(cache); 10941 break; 10942 } 10943 } 10944 ret = btrfs_trim_block_group(cache, 10945 &group_trimmed, 10946 start, 10947 end, 10948 range->minlen); 10949 10950 trimmed += group_trimmed; 10951 if (ret) { 10952 btrfs_put_block_group(cache); 10953 break; 10954 } 10955 } 10956 10957 cache = next_block_group(fs_info, cache); 10958 } 10959 10960 mutex_lock(&fs_info->fs_devices->device_list_mutex); 10961 devices = &fs_info->fs_devices->alloc_list; 10962 list_for_each_entry(device, devices, dev_alloc_list) { 10963 ret = btrfs_trim_free_extents(device, range->minlen, 10964 &group_trimmed); 10965 if (ret) 10966 break; 10967 10968 trimmed += group_trimmed; 10969 } 10970 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 10971 10972 range->len = trimmed; 10973 return ret; 10974 } 10975 10976 /* 10977 * btrfs_{start,end}_write_no_snapshoting() are similar to 10978 * mnt_{want,drop}_write(), they are used to prevent some tasks from writing 10979 * data into the page cache through nocow before 
the subvolume is snapshoted, 10980 * but flush the data into disk after the snapshot creation, or to prevent 10981 * operations while snapshoting is ongoing and that cause the snapshot to be 10982 * inconsistent (writes followed by expanding truncates for example). 10983 */ 10984 void btrfs_end_write_no_snapshoting(struct btrfs_root *root) 10985 { 10986 percpu_counter_dec(&root->subv_writers->counter); 10987 /* 10988 * Make sure counter is updated before we wake up waiters. 10989 */ 10990 smp_mb(); 10991 if (waitqueue_active(&root->subv_writers->wait)) 10992 wake_up(&root->subv_writers->wait); 10993 } 10994 10995 int btrfs_start_write_no_snapshoting(struct btrfs_root *root) 10996 { 10997 if (atomic_read(&root->will_be_snapshoted)) 10998 return 0; 10999 11000 percpu_counter_inc(&root->subv_writers->counter); 11001 /* 11002 * Make sure counter is updated before we check for snapshot creation. 11003 */ 11004 smp_mb(); 11005 if (atomic_read(&root->will_be_snapshoted)) { 11006 btrfs_end_write_no_snapshoting(root); 11007 return 0; 11008 } 11009 return 1; 11010 } 11011 11012 static int wait_snapshoting_atomic_t(atomic_t *a) 11013 { 11014 schedule(); 11015 return 0; 11016 } 11017 11018 void btrfs_wait_for_snapshot_creation(struct btrfs_root *root) 11019 { 11020 while (true) { 11021 int ret; 11022 11023 ret = btrfs_start_write_no_snapshoting(root); 11024 if (ret) 11025 break; 11026 wait_on_atomic_t(&root->will_be_snapshoted, 11027 wait_snapshoting_atomic_t, 11028 TASK_UNINTERRUPTIBLE); 11029 } 11030 } 11031