// SPDX-License-Identifier: GPL-2.0

#include "misc.h"
#include "ctree.h"
#include "block-group.h"
#include "space-info.h"
#include "disk-io.h"
#include "free-space-cache.h"
#include "free-space-tree.h"
#include "volumes.h"
#include "transaction.h"
#include "ref-verify.h"
#include "sysfs.h"
#include "tree-log.h"
#include "delalloc-space.h"
#include "discard.h"
#include "raid56.h"

/*
 * Return target flags in extended format or 0 if restripe for this chunk_type
 * is not in progress.
 *
 * Should be called with balance_lock held.
 */
static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags)
{
	struct btrfs_balance_control *bctl = fs_info->balance_ctl;
	u64 target = 0;

	if (!bctl)
		return 0;

	if (flags & BTRFS_BLOCK_GROUP_DATA &&
	    bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) {
		target = BTRFS_BLOCK_GROUP_DATA | bctl->data.target;
	} else if (flags & BTRFS_BLOCK_GROUP_SYSTEM &&
		   bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
		target = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target;
	} else if (flags & BTRFS_BLOCK_GROUP_METADATA &&
		   bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) {
		target = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target;
	}

	return target;
}

/*
 * @flags: available profiles in extended format (see ctree.h)
 *
 * Return reduced profile in chunk format. If profile changing is in progress
 * (either running or paused) picks the target profile (if it's already
 * available), otherwise falls back to plain reducing.
 */
static u64 btrfs_reduce_alloc_profile(struct btrfs_fs_info *fs_info, u64 flags)
{
	u64 num_devices = fs_info->fs_devices->rw_devices;
	u64 target;
	u64 raid_type;
	u64 allowed = 0;

	/*
	 * See if restripe for this chunk_type is in progress, if so try to
	 * reduce to the target profile.
	 */
	spin_lock(&fs_info->balance_lock);
	target = get_restripe_target(fs_info, flags);
	if (target) {
		spin_unlock(&fs_info->balance_lock);
		return extended_to_chunk(target);
	}
	spin_unlock(&fs_info->balance_lock);

	/* First, mask out the RAID levels which aren't possible */
	for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) {
		if (num_devices >= btrfs_raid_array[raid_type].devs_min)
			allowed |= btrfs_raid_array[raid_type].bg_flag;
	}
	allowed &= flags;

	if (allowed & BTRFS_BLOCK_GROUP_RAID6)
		allowed = BTRFS_BLOCK_GROUP_RAID6;
	else if (allowed & BTRFS_BLOCK_GROUP_RAID5)
		allowed = BTRFS_BLOCK_GROUP_RAID5;
	else if (allowed & BTRFS_BLOCK_GROUP_RAID10)
		allowed = BTRFS_BLOCK_GROUP_RAID10;
	else if (allowed & BTRFS_BLOCK_GROUP_RAID1)
		allowed = BTRFS_BLOCK_GROUP_RAID1;
	else if (allowed & BTRFS_BLOCK_GROUP_RAID0)
		allowed = BTRFS_BLOCK_GROUP_RAID0;

	flags &= ~BTRFS_BLOCK_GROUP_PROFILE_MASK;

	return extended_to_chunk(flags | allowed);
}

u64 btrfs_get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags)
{
	unsigned seq;
	u64 flags;

	do {
		flags = orig_flags;
		seq = read_seqbegin(&fs_info->profiles_lock);

		if (flags & BTRFS_BLOCK_GROUP_DATA)
			flags |= fs_info->avail_data_alloc_bits;
		else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
			flags |= fs_info->avail_system_alloc_bits;
		else if (flags & BTRFS_BLOCK_GROUP_METADATA)
			flags |= fs_info->avail_metadata_alloc_bits;
	} while (read_seqretry(&fs_info->profiles_lock, seq));

	return btrfs_reduce_alloc_profile(fs_info, flags);
}
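
/*
 * Take an extra reference on a block group. Every btrfs_get_block_group()
 * must be balanced by a btrfs_put_block_group(); the structure (and its
 * free_space_ctl) is freed once the last reference is dropped. A typical
 * usage sketch:
 *
 *	bg = btrfs_lookup_block_group(fs_info, bytenr);
 *	if (bg) {
 *		... use bg ...
 *		btrfs_put_block_group(bg);
 *	}
 */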
void btrfs_get_block_group(struct btrfs_block_group *cache)
{
	refcount_inc(&cache->refs);
}

void btrfs_put_block_group(struct btrfs_block_group *cache)
{
	if (refcount_dec_and_test(&cache->refs)) {
		WARN_ON(cache->pinned > 0);
		WARN_ON(cache->reserved > 0);

		/*
		 * A block_group shouldn't be on the discard_list anymore.
		 * Remove the block_group from the discard_list to prevent us
		 * from causing a panic due to NULL pointer dereference.
		 */
		if (WARN_ON(!list_empty(&cache->discard_list)))
			btrfs_discard_cancel_work(&cache->fs_info->discard_ctl,
						  cache);

		/*
		 * If not empty, someone is still holding mutex of
		 * full_stripe_lock, which can only be released by caller.
		 * And it will definitely cause use-after-free when caller
		 * tries to release full stripe lock.
		 *
		 * No better way to resolve, but only to warn.
		 */
		WARN_ON(!RB_EMPTY_ROOT(&cache->full_stripe_locks_root.root));
		kfree(cache->free_space_ctl);
		kfree(cache);
	}
}

/*
 * This adds the block group to the fs_info rb tree for the block group cache
 */
static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
				       struct btrfs_block_group *block_group)
{
	struct rb_node **p;
	struct rb_node *parent = NULL;
	struct btrfs_block_group *cache;

	ASSERT(block_group->length != 0);

	spin_lock(&info->block_group_cache_lock);
	p = &info->block_group_cache_tree.rb_node;

	while (*p) {
		parent = *p;
		cache = rb_entry(parent, struct btrfs_block_group, cache_node);
		if (block_group->start < cache->start) {
			p = &(*p)->rb_left;
		} else if (block_group->start > cache->start) {
			p = &(*p)->rb_right;
		} else {
			spin_unlock(&info->block_group_cache_lock);
			return -EEXIST;
		}
	}

	rb_link_node(&block_group->cache_node, parent, p);
	rb_insert_color(&block_group->cache_node,
			&info->block_group_cache_tree);

	if (info->first_logical_byte > block_group->start)
		info->first_logical_byte = block_group->start;

	spin_unlock(&info->block_group_cache_lock);

	return 0;
}

/*
 * This will return the block group at or after bytenr if contains is 0, else
 * it will return the block group that contains the bytenr
 */
static struct btrfs_block_group *block_group_cache_tree_search(
		struct btrfs_fs_info *info, u64 bytenr, int contains)
{
	struct btrfs_block_group *cache, *ret = NULL;
	struct rb_node *n;
	u64 end, start;

	spin_lock(&info->block_group_cache_lock);
	n = info->block_group_cache_tree.rb_node;

	while (n) {
		cache = rb_entry(n, struct btrfs_block_group, cache_node);
		end = cache->start + cache->length - 1;
		start = cache->start;

		if (bytenr < start) {
			if (!contains && (!ret || start < ret->start))
				ret = cache;
			n = n->rb_left;
		} else if (bytenr > start) {
			if (contains && bytenr <= end) {
				ret = cache;
				break;
			}
			n = n->rb_right;
		} else {
			ret = cache;
			break;
		}
	}
	if (ret) {
		btrfs_get_block_group(ret);
		if (bytenr == 0 && info->first_logical_byte > ret->start)
			info->first_logical_byte = ret->start;
	}
	spin_unlock(&info->block_group_cache_lock);

	return ret;
}

/*
 * Return the block group that starts at or after bytenr
 */
struct btrfs_block_group *btrfs_lookup_first_block_group(
		struct btrfs_fs_info *info, u64 bytenr)
{
	return block_group_cache_tree_search(info, bytenr, 0);
}

/*
 * Return the block group that contains the given bytenr
 */
struct btrfs_block_group *btrfs_lookup_block_group(
		struct btrfs_fs_info *info, u64 bytenr)
{
	return block_group_cache_tree_search(info, bytenr, 1);
}
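
/*
 * Return the next block group in the block group cache rbtree, or NULL if
 * @cache was the last one. Drops the reference on @cache and takes a
 * reference on the returned block group.
 */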
struct btrfs_block_group *btrfs_next_block_group(
		struct btrfs_block_group *cache)
{
	struct btrfs_fs_info *fs_info = cache->fs_info;
	struct rb_node *node;

	spin_lock(&fs_info->block_group_cache_lock);

	/* If our block group was removed, we need a full search. */
	if (RB_EMPTY_NODE(&cache->cache_node)) {
		const u64 next_bytenr = cache->start + cache->length;

		spin_unlock(&fs_info->block_group_cache_lock);
		btrfs_put_block_group(cache);
		cache = btrfs_lookup_first_block_group(fs_info, next_bytenr);
		return cache;
	}
	node = rb_next(&cache->cache_node);
	btrfs_put_block_group(cache);
	if (node) {
		cache = rb_entry(node, struct btrfs_block_group, cache_node);
		btrfs_get_block_group(cache);
	} else
		cache = NULL;
	spin_unlock(&fs_info->block_group_cache_lock);
	return cache;
}

bool btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr)
{
	struct btrfs_block_group *bg;
	bool ret = true;

	bg = btrfs_lookup_block_group(fs_info, bytenr);
	if (!bg)
		return false;

	spin_lock(&bg->lock);
	if (bg->ro)
		ret = false;
	else
		atomic_inc(&bg->nocow_writers);
	spin_unlock(&bg->lock);

	/* No put on block group, done by btrfs_dec_nocow_writers */
	if (!ret)
		btrfs_put_block_group(bg);

	return ret;
}

void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr)
{
	struct btrfs_block_group *bg;

	bg = btrfs_lookup_block_group(fs_info, bytenr);
	ASSERT(bg);
	if (atomic_dec_and_test(&bg->nocow_writers))
		wake_up_var(&bg->nocow_writers);
	/*
	 * Once for our lookup and once for the lookup done by a previous call
	 * to btrfs_inc_nocow_writers().
	 */
	btrfs_put_block_group(bg);
	btrfs_put_block_group(bg);
}

void btrfs_wait_nocow_writers(struct btrfs_block_group *bg)
{
	wait_var_event(&bg->nocow_writers, !atomic_read(&bg->nocow_writers));
}

void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info,
					const u64 start)
{
	struct btrfs_block_group *bg;

	bg = btrfs_lookup_block_group(fs_info, start);
	ASSERT(bg);
	if (atomic_dec_and_test(&bg->reservations))
		wake_up_var(&bg->reservations);
	btrfs_put_block_group(bg);
}
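
/*
 * Wait until there are no more data extent reservations against a read-only
 * data block group, so it is safe to assume no new ordered extents will show
 * up for it.
 */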
void btrfs_wait_block_group_reservations(struct btrfs_block_group *bg)
{
	struct btrfs_space_info *space_info = bg->space_info;

	ASSERT(bg->ro);

	if (!(bg->flags & BTRFS_BLOCK_GROUP_DATA))
		return;

	/*
	 * Our block group is read only but before we set it to read only,
	 * some task might have allocated an extent from it already, but it
	 * has not yet created a respective ordered extent (and added it to a
	 * root's list of ordered extents).
	 * Therefore wait for any task currently allocating extents, since the
	 * block group's reservations counter is incremented while a read lock
	 * on the groups' semaphore is held and decremented after releasing
	 * the read access on that semaphore and creating the ordered extent.
	 */
	down_write(&space_info->groups_sem);
	up_write(&space_info->groups_sem);

	wait_var_event(&bg->reservations, !atomic_read(&bg->reservations));
}

struct btrfs_caching_control *btrfs_get_caching_control(
		struct btrfs_block_group *cache)
{
	struct btrfs_caching_control *ctl;

	spin_lock(&cache->lock);
	if (!cache->caching_ctl) {
		spin_unlock(&cache->lock);
		return NULL;
	}

	ctl = cache->caching_ctl;
	refcount_inc(&ctl->count);
	spin_unlock(&cache->lock);
	return ctl;
}

void btrfs_put_caching_control(struct btrfs_caching_control *ctl)
{
	if (refcount_dec_and_test(&ctl->count))
		kfree(ctl);
}

/*
 * When we wait for progress in the block group caching, it's because our
 * allocation attempt failed at least once. So, we must sleep and let some
 * progress happen before we try again.
 *
 * This function will sleep at least once waiting for new free space to show
 * up, and then it will check the block group free space numbers for our min
 * num_bytes. Another option is to have it go ahead and look in the rbtree for
 * a free extent of a given size, but this is a good start.
 *
 * Callers of this must check if cache->cached == BTRFS_CACHE_ERROR before using
 * any of the information in this block group.
 */
void btrfs_wait_block_group_cache_progress(struct btrfs_block_group *cache,
					   u64 num_bytes)
{
	struct btrfs_caching_control *caching_ctl;

	caching_ctl = btrfs_get_caching_control(cache);
	if (!caching_ctl)
		return;

	wait_event(caching_ctl->wait, btrfs_block_group_done(cache) ||
		   (cache->free_space_ctl->free_space >= num_bytes));

	btrfs_put_caching_control(caching_ctl);
}

int btrfs_wait_block_group_cache_done(struct btrfs_block_group *cache)
{
	struct btrfs_caching_control *caching_ctl;
	int ret = 0;

	caching_ctl = btrfs_get_caching_control(cache);
	if (!caching_ctl)
		return (cache->cached == BTRFS_CACHE_ERROR) ? -EIO : 0;

	wait_event(caching_ctl->wait, btrfs_block_group_done(cache));
	if (cache->cached == BTRFS_CACHE_ERROR)
		ret = -EIO;
	btrfs_put_caching_control(caching_ctl);
	return ret;
}

static bool space_cache_v1_done(struct btrfs_block_group *cache)
{
	bool ret;

	spin_lock(&cache->lock);
	ret = cache->cached != BTRFS_CACHE_FAST;
	spin_unlock(&cache->lock);

	return ret;
}

void btrfs_wait_space_cache_v1_finished(struct btrfs_block_group *cache,
					struct btrfs_caching_control *caching_ctl)
{
	wait_event(caching_ctl->wait, space_cache_v1_done(cache));
}

#ifdef CONFIG_BTRFS_DEBUG
static void fragment_free_space(struct btrfs_block_group *block_group)
{
	struct btrfs_fs_info *fs_info = block_group->fs_info;
	u64 start = block_group->start;
	u64 len = block_group->length;
	u64 chunk = block_group->flags & BTRFS_BLOCK_GROUP_METADATA ?
		fs_info->nodesize : fs_info->sectorsize;
	u64 step = chunk << 1;

	while (len > chunk) {
		btrfs_remove_free_space(block_group, start, chunk);
		start += step;
		if (len < step)
			len = 0;
		else
			len -= step;
	}
}
#endif

/*
 * This is only called by btrfs_cache_block_group, since we could have freed
 * extents we need to check the pinned_extents for any extents that can't be
 * used yet since their free space will be released as soon as the transaction
 * commits.
 */
u64 add_new_free_space(struct btrfs_block_group *block_group, u64 start, u64 end)
{
	struct btrfs_fs_info *info = block_group->fs_info;
	u64 extent_start, extent_end, size, total_added = 0;
	int ret;

	while (start < end) {
		ret = find_first_extent_bit(&info->excluded_extents, start,
					    &extent_start, &extent_end,
					    EXTENT_DIRTY | EXTENT_UPTODATE,
					    NULL);
		if (ret)
			break;

		if (extent_start <= start) {
			start = extent_end + 1;
		} else if (extent_start > start && extent_start < end) {
			size = extent_start - start;
			total_added += size;
			ret = btrfs_add_free_space_async_trimmed(block_group,
								 start, size);
			BUG_ON(ret); /* -ENOMEM or logic error */
			start = extent_end + 1;
		} else {
			break;
		}
	}

	if (start < end) {
		size = end - start;
		total_added += size;
		ret = btrfs_add_free_space_async_trimmed(block_group, start,
							 size);
		BUG_ON(ret); /* -ENOMEM or logic error */
	}

	return total_added;
}
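
/*
 * Populate the free space cache of a block group by walking the extent tree
 * commit root and adding every gap between allocated extents as free space.
 * Used by the caching worker when the free space tree is not available.
 */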
static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl)
{
	struct btrfs_block_group *block_group = caching_ctl->block_group;
	struct btrfs_fs_info *fs_info = block_group->fs_info;
	struct btrfs_root *extent_root = fs_info->extent_root;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_key key;
	u64 total_found = 0;
	u64 last = 0;
	u32 nritems;
	int ret;
	bool wakeup = true;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	last = max_t(u64, block_group->start, BTRFS_SUPER_INFO_OFFSET);

#ifdef CONFIG_BTRFS_DEBUG
	/*
	 * If we're fragmenting we don't want to make anybody think we can
	 * allocate from this block group until we've had a chance to fragment
	 * the free space.
	 */
	if (btrfs_should_fragment_free_space(block_group))
		wakeup = false;
#endif
	/*
	 * We don't want to deadlock with somebody trying to allocate a new
	 * extent for the extent root while also trying to search the extent
	 * root to add free space. So we skip locking and search the commit
	 * root, since it's read-only.
	 */
	path->skip_locking = 1;
	path->search_commit_root = 1;
	path->reada = READA_FORWARD;

	key.objectid = last;
	key.offset = 0;
	key.type = BTRFS_EXTENT_ITEM_KEY;

next:
	ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
	if (ret < 0)
		goto out;

	leaf = path->nodes[0];
	nritems = btrfs_header_nritems(leaf);

	while (1) {
		if (btrfs_fs_closing(fs_info) > 1) {
			last = (u64)-1;
			break;
		}

		if (path->slots[0] < nritems) {
			btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
		} else {
			ret = btrfs_find_next_key(extent_root, path, &key, 0, 0);
			if (ret)
				break;

			if (need_resched() ||
			    rwsem_is_contended(&fs_info->commit_root_sem)) {
				if (wakeup)
					caching_ctl->progress = last;
				btrfs_release_path(path);
				up_read(&fs_info->commit_root_sem);
				mutex_unlock(&caching_ctl->mutex);
				cond_resched();
				mutex_lock(&caching_ctl->mutex);
				down_read(&fs_info->commit_root_sem);
				goto next;
			}

			ret = btrfs_next_leaf(extent_root, path);
			if (ret < 0)
				goto out;
			if (ret)
				break;
			leaf = path->nodes[0];
			nritems = btrfs_header_nritems(leaf);
			continue;
		}

		if (key.objectid < last) {
			key.objectid = last;
			key.offset = 0;
			key.type = BTRFS_EXTENT_ITEM_KEY;

			if (wakeup)
				caching_ctl->progress = last;
			btrfs_release_path(path);
			goto next;
		}

		if (key.objectid < block_group->start) {
			path->slots[0]++;
			continue;
		}

		if (key.objectid >= block_group->start + block_group->length)
			break;

		if (key.type == BTRFS_EXTENT_ITEM_KEY ||
		    key.type == BTRFS_METADATA_ITEM_KEY) {
			total_found += add_new_free_space(block_group, last,
							  key.objectid);
			if (key.type == BTRFS_METADATA_ITEM_KEY)
				last = key.objectid +
					fs_info->nodesize;
			else
				last = key.objectid + key.offset;

			if (total_found > CACHING_CTL_WAKE_UP) {
				total_found = 0;
				if (wakeup)
					wake_up(&caching_ctl->wait);
			}
		}
		path->slots[0]++;
	}
	ret = 0;

	total_found += add_new_free_space(block_group, last,
					  block_group->start + block_group->length);
	caching_ctl->progress = (u64)-1;

out:
	btrfs_free_path(path);
	return ret;
}
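
/*
 * Worker that fills in a block group's free space information in the
 * background: first from the v1 space cache if enabled, otherwise from the
 * free space tree or by scanning the extent tree.
 */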
static noinline void caching_thread(struct btrfs_work *work)
{
	struct btrfs_block_group *block_group;
	struct btrfs_fs_info *fs_info;
	struct btrfs_caching_control *caching_ctl;
	int ret;

	caching_ctl = container_of(work, struct btrfs_caching_control, work);
	block_group = caching_ctl->block_group;
	fs_info = block_group->fs_info;

	mutex_lock(&caching_ctl->mutex);
	down_read(&fs_info->commit_root_sem);

	if (btrfs_test_opt(fs_info, SPACE_CACHE)) {
		ret = load_free_space_cache(block_group);
		if (ret == 1) {
			ret = 0;
			goto done;
		}

		/*
		 * We failed to load the space cache, set ourselves to
		 * CACHE_STARTED and carry on.
		 */
		spin_lock(&block_group->lock);
		block_group->cached = BTRFS_CACHE_STARTED;
		spin_unlock(&block_group->lock);
		wake_up(&caching_ctl->wait);
	}

	if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
		ret = load_free_space_tree(caching_ctl);
	else
		ret = load_extent_tree_free(caching_ctl);
done:
	spin_lock(&block_group->lock);
	block_group->caching_ctl = NULL;
	block_group->cached = ret ? BTRFS_CACHE_ERROR : BTRFS_CACHE_FINISHED;
	spin_unlock(&block_group->lock);

#ifdef CONFIG_BTRFS_DEBUG
	if (btrfs_should_fragment_free_space(block_group)) {
		u64 bytes_used;

		spin_lock(&block_group->space_info->lock);
		spin_lock(&block_group->lock);
		bytes_used = block_group->length - block_group->used;
		block_group->space_info->bytes_used += bytes_used >> 1;
		spin_unlock(&block_group->lock);
		spin_unlock(&block_group->space_info->lock);
		fragment_free_space(block_group);
	}
#endif

	caching_ctl->progress = (u64)-1;

	up_read(&fs_info->commit_root_sem);
	btrfs_free_excluded_extents(block_group);
	mutex_unlock(&caching_ctl->mutex);

	wake_up(&caching_ctl->wait);

	btrfs_put_caching_control(caching_ctl);
	btrfs_put_block_group(block_group);
}

int btrfs_cache_block_group(struct btrfs_block_group *cache, int load_cache_only)
{
	DEFINE_WAIT(wait);
	struct btrfs_fs_info *fs_info = cache->fs_info;
	struct btrfs_caching_control *caching_ctl = NULL;
	int ret = 0;

	caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
	if (!caching_ctl)
		return -ENOMEM;

	INIT_LIST_HEAD(&caching_ctl->list);
	mutex_init(&caching_ctl->mutex);
	init_waitqueue_head(&caching_ctl->wait);
	caching_ctl->block_group = cache;
	caching_ctl->progress = cache->start;
	refcount_set(&caching_ctl->count, 2);
	btrfs_init_work(&caching_ctl->work, caching_thread, NULL, NULL);

	spin_lock(&cache->lock);
	if (cache->cached != BTRFS_CACHE_NO) {
		kfree(caching_ctl);

		caching_ctl = cache->caching_ctl;
		if (caching_ctl)
			refcount_inc(&caching_ctl->count);
		spin_unlock(&cache->lock);
		goto out;
	}
	WARN_ON(cache->caching_ctl);
	cache->caching_ctl = caching_ctl;
	if (btrfs_test_opt(fs_info, SPACE_CACHE))
		cache->cached = BTRFS_CACHE_FAST;
	else
		cache->cached = BTRFS_CACHE_STARTED;
	cache->has_caching_ctl = 1;
	spin_unlock(&cache->lock);

	spin_lock(&fs_info->block_group_cache_lock);
	refcount_inc(&caching_ctl->count);
	list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
	spin_unlock(&fs_info->block_group_cache_lock);

	btrfs_get_block_group(cache);

	btrfs_queue_work(fs_info->caching_workers, &caching_ctl->work);
out:
	if (load_cache_only && caching_ctl)
		btrfs_wait_space_cache_v1_finished(cache, caching_ctl);
	if (caching_ctl)
		btrfs_put_caching_control(caching_ctl);

	return ret;
}

static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
{
	u64 extra_flags = chunk_to_extended(flags) &
				BTRFS_EXTENDED_PROFILE_MASK;

	write_seqlock(&fs_info->profiles_lock);
	if (flags & BTRFS_BLOCK_GROUP_DATA)
		fs_info->avail_data_alloc_bits &= ~extra_flags;
	if (flags & BTRFS_BLOCK_GROUP_METADATA)
		fs_info->avail_metadata_alloc_bits &= ~extra_flags;
	if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
		fs_info->avail_system_alloc_bits &= ~extra_flags;
	write_sequnlock(&fs_info->profiles_lock);
}

/*
 * Clear incompat bits for the following feature(s):
 *
 * - RAID56 - in case there's neither RAID5 nor RAID6 profile block group
 *            in the whole filesystem
 *
 * - RAID1C34 - same as above for RAID1C3 and RAID1C4 block groups
 */
static void clear_incompat_bg_bits(struct btrfs_fs_info *fs_info, u64 flags)
{
	bool found_raid56 = false;
	bool found_raid1c34 = false;

	if ((flags & BTRFS_BLOCK_GROUP_RAID56_MASK) ||
	    (flags & BTRFS_BLOCK_GROUP_RAID1C3) ||
	    (flags & BTRFS_BLOCK_GROUP_RAID1C4)) {
		struct list_head *head = &fs_info->space_info;
		struct btrfs_space_info *sinfo;

		list_for_each_entry_rcu(sinfo, head, list) {
			down_read(&sinfo->groups_sem);
			if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID5]))
				found_raid56 = true;
			if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID6]))
				found_raid56 = true;
			if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID1C3]))
				found_raid1c34 = true;
			if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID1C4]))
				found_raid1c34 = true;
			up_read(&sinfo->groups_sem);
		}
		if (!found_raid56)
			btrfs_clear_fs_incompat(fs_info, RAID56);
		if (!found_raid1c34)
			btrfs_clear_fs_incompat(fs_info, RAID1C34);
	}
}
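
/*
 * Delete the block group item for @block_group from the extent tree. The
 * caller is responsible for the rest of the block group removal.
 */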
static int remove_block_group_item(struct btrfs_trans_handle *trans,
				   struct btrfs_path *path,
				   struct btrfs_block_group *block_group)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_root *root;
	struct btrfs_key key;
	int ret;

	root = fs_info->extent_root;
	key.objectid = block_group->start;
	key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
	key.offset = block_group->length;

	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret > 0)
		ret = -ENOENT;
	if (ret < 0)
		return ret;

	ret = btrfs_del_item(trans, root, path);
	return ret;
}
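
/*
 * Remove a block group from the filesystem: drop it from the block group
 * cache and its space_info, delete its free space cache/tree entries and its
 * block group item, and finally remove the chunk mapping unless the block
 * group is currently frozen.
 */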
int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
			     u64 group_start, struct extent_map *em)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_path *path;
	struct btrfs_block_group *block_group;
	struct btrfs_free_cluster *cluster;
	struct inode *inode;
	struct kobject *kobj = NULL;
	int ret;
	int index;
	int factor;
	struct btrfs_caching_control *caching_ctl = NULL;
	bool remove_em;
	bool remove_rsv = false;

	block_group = btrfs_lookup_block_group(fs_info, group_start);
	BUG_ON(!block_group);
	BUG_ON(!block_group->ro);

	trace_btrfs_remove_block_group(block_group);
	/*
	 * Free the reserved super bytes from this block group before
	 * removing it.
	 */
	btrfs_free_excluded_extents(block_group);
	btrfs_free_ref_tree_range(fs_info, block_group->start,
				  block_group->length);

	index = btrfs_bg_flags_to_raid_index(block_group->flags);
	factor = btrfs_bg_type_to_factor(block_group->flags);

	/* make sure this block group isn't part of an allocation cluster */
	cluster = &fs_info->data_alloc_cluster;
	spin_lock(&cluster->refill_lock);
	btrfs_return_cluster_to_free_space(block_group, cluster);
	spin_unlock(&cluster->refill_lock);

	/*
	 * make sure this block group isn't part of a metadata
	 * allocation cluster
	 */
	cluster = &fs_info->meta_alloc_cluster;
	spin_lock(&cluster->refill_lock);
	btrfs_return_cluster_to_free_space(block_group, cluster);
	spin_unlock(&cluster->refill_lock);

	path = btrfs_alloc_path();
	if (!path) {
		ret = -ENOMEM;
		goto out;
	}

	/*
	 * get the inode first so any iput calls done for the io_list
	 * aren't the final iput (no unlinks allowed now)
	 */
	inode = lookup_free_space_inode(block_group, path);

	mutex_lock(&trans->transaction->cache_write_mutex);
	/*
	 * Make sure our free space cache IO is done before removing the
	 * free space inode
	 */
	spin_lock(&trans->transaction->dirty_bgs_lock);
	if (!list_empty(&block_group->io_list)) {
		list_del_init(&block_group->io_list);

		WARN_ON(!IS_ERR(inode) && inode != block_group->io_ctl.inode);

		spin_unlock(&trans->transaction->dirty_bgs_lock);
		btrfs_wait_cache_io(trans, block_group, path);
		btrfs_put_block_group(block_group);
		spin_lock(&trans->transaction->dirty_bgs_lock);
	}

	if (!list_empty(&block_group->dirty_list)) {
		list_del_init(&block_group->dirty_list);
		remove_rsv = true;
		btrfs_put_block_group(block_group);
	}
	spin_unlock(&trans->transaction->dirty_bgs_lock);
	mutex_unlock(&trans->transaction->cache_write_mutex);

	ret = btrfs_remove_free_space_inode(trans, inode, block_group);
	if (ret)
		goto out;

	spin_lock(&fs_info->block_group_cache_lock);
	rb_erase(&block_group->cache_node,
		 &fs_info->block_group_cache_tree);
	RB_CLEAR_NODE(&block_group->cache_node);

	/* Once for the block groups rbtree */
	btrfs_put_block_group(block_group);

	if (fs_info->first_logical_byte == block_group->start)
		fs_info->first_logical_byte = (u64)-1;
	spin_unlock(&fs_info->block_group_cache_lock);

	down_write(&block_group->space_info->groups_sem);
	/*
	 * we must use list_del_init so people can check to see if they
	 * are still on the list after taking the semaphore
	 */
	list_del_init(&block_group->list);
	if (list_empty(&block_group->space_info->block_groups[index])) {
		kobj = block_group->space_info->block_group_kobjs[index];
		block_group->space_info->block_group_kobjs[index] = NULL;
		clear_avail_alloc_bits(fs_info, block_group->flags);
	}
	up_write(&block_group->space_info->groups_sem);
	clear_incompat_bg_bits(fs_info, block_group->flags);
	if (kobj) {
		kobject_del(kobj);
		kobject_put(kobj);
	}

	if (block_group->has_caching_ctl)
		caching_ctl = btrfs_get_caching_control(block_group);
	if (block_group->cached == BTRFS_CACHE_STARTED)
		btrfs_wait_block_group_cache_done(block_group);
	if (block_group->has_caching_ctl) {
		spin_lock(&fs_info->block_group_cache_lock);
		if (!caching_ctl) {
			struct btrfs_caching_control *ctl;

			list_for_each_entry(ctl,
				    &fs_info->caching_block_groups, list)
				if (ctl->block_group == block_group) {
					caching_ctl = ctl;
					refcount_inc(&caching_ctl->count);
					break;
				}
		}
		if (caching_ctl)
			list_del_init(&caching_ctl->list);
		spin_unlock(&fs_info->block_group_cache_lock);
		if (caching_ctl) {
			/* Once for the caching bgs list and once for us. */
			btrfs_put_caching_control(caching_ctl);
			btrfs_put_caching_control(caching_ctl);
		}
	}

	spin_lock(&trans->transaction->dirty_bgs_lock);
	WARN_ON(!list_empty(&block_group->dirty_list));
	WARN_ON(!list_empty(&block_group->io_list));
	spin_unlock(&trans->transaction->dirty_bgs_lock);

	btrfs_remove_free_space_cache(block_group);

	spin_lock(&block_group->space_info->lock);
	list_del_init(&block_group->ro_list);

	if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
		WARN_ON(block_group->space_info->total_bytes
			< block_group->length);
		WARN_ON(block_group->space_info->bytes_readonly
			< block_group->length);
		WARN_ON(block_group->space_info->disk_total
			< block_group->length * factor);
	}
	block_group->space_info->total_bytes -= block_group->length;
	block_group->space_info->bytes_readonly -= block_group->length;
	block_group->space_info->disk_total -= block_group->length * factor;

	spin_unlock(&block_group->space_info->lock);

	/*
	 * Remove the free space for the block group from the free space tree
	 * and the block group's item from the extent tree before marking the
	 * block group as removed. This is to prevent races with tasks that
	 * freeze and unfreeze a block group, this task and another task
	 * allocating a new block group - the unfreeze task ends up removing
	 * the block group's extent map before the task calling this function
	 * deletes the block group item from the extent tree, allowing for
	 * another task to attempt to create another block group with the same
	 * item key (and failing with -EEXIST and a transaction abort).
	 */
	ret = remove_block_group_free_space(trans, block_group);
	if (ret)
		goto out;

	ret = remove_block_group_item(trans, path, block_group);
	if (ret < 0)
		goto out;

	spin_lock(&block_group->lock);
	block_group->removed = 1;
	/*
	 * At this point trimming or scrub can't start on this block group,
	 * because we removed the block group from the rbtree
	 * fs_info->block_group_cache_tree so no one can find it anymore and
	 * even if someone already got this block group before we removed it
	 * from the rbtree, they have already incremented block_group->frozen -
	 * if they didn't, for the trimming case they won't find any free space
	 * entries because we already removed them all when we called
	 * btrfs_remove_free_space_cache().
	 *
	 * And we must not remove the extent map from the fs_info->mapping_tree
	 * to prevent the same logical address range and physical device space
	 * ranges from being reused for a new block group. This is needed to
	 * avoid races with trimming and scrub.
	 *
	 * An fs trim operation (btrfs_trim_fs() / btrfs_ioctl_fitrim()) is
	 * completely transactionless, so while it is trimming a range the
	 * currently running transaction might finish and a new one start,
	 * allowing for new block groups to be created that can reuse the same
	 * physical device locations unless we take this special care.
	 *
	 * There may also be an implicit trim operation if the file system
	 * is mounted with -odiscard. The same protections must remain
	 * in place until the extents have been discarded completely when
	 * the transaction commit has completed.
	 */
	remove_em = (atomic_read(&block_group->frozen) == 0);
	spin_unlock(&block_group->lock);

	if (remove_em) {
		struct extent_map_tree *em_tree;

		em_tree = &fs_info->mapping_tree;
		write_lock(&em_tree->lock);
		remove_extent_mapping(em_tree, em);
		write_unlock(&em_tree->lock);
		/* once for the tree */
		free_extent_map(em);
	}

out:
	/* Once for the lookup reference */
	btrfs_put_block_group(block_group);
	if (remove_rsv)
		btrfs_delayed_refs_rsv_release(fs_info, 1);
	btrfs_free_path(path);
	return ret;
}

struct btrfs_trans_handle *btrfs_start_trans_remove_block_group(
		struct btrfs_fs_info *fs_info, const u64 chunk_offset)
{
	struct extent_map_tree *em_tree = &fs_info->mapping_tree;
	struct extent_map *em;
	struct map_lookup *map;
	unsigned int num_items;

	read_lock(&em_tree->lock);
	em = lookup_extent_mapping(em_tree, chunk_offset, 1);
	read_unlock(&em_tree->lock);
	ASSERT(em && em->start == chunk_offset);

	/*
	 * We need to reserve 3 + N units from the metadata space info in order
	 * to remove a block group (done at btrfs_remove_chunk() and at
	 * btrfs_remove_block_group()), which are used for:
	 *
	 * 1 unit for adding the free space inode's orphan (located in the tree
	 * of tree roots).
	 * 1 unit for deleting the block group item (located in the extent
	 * tree).
	 * 1 unit for deleting the free space item (located in tree of tree
	 * roots).
	 * N units for deleting N device extent items corresponding to each
	 * stripe (located in the device tree).
	 *
	 * In order to remove a block group we also need to reserve units in the
	 * system space info in order to update the chunk tree (update one or
	 * more device items and remove one chunk item), but this is done at
	 * btrfs_remove_chunk() through a call to check_system_chunk().
	 */
	map = em->map_lookup;
	num_items = 3 + map->num_stripes;
	free_extent_map(em);

	return btrfs_start_transaction_fallback_global_rsv(fs_info->extent_root,
							   num_items);
}

/*
 * Mark block group @cache read-only, so later write won't happen to block
 * group @cache.
 *
 * If @force is not set, this function will only mark the block group readonly
 * if we have enough free space (1M) in other metadata/system block groups.
 * If @force is set, this function will mark the block group readonly
 * without checking free space.
 *
 * NOTE: This function doesn't care if other block groups can contain all the
 * data in this block group. That check should be done by relocation routine,
 * not this function.
 */
static int inc_block_group_ro(struct btrfs_block_group *cache, int force)
{
	struct btrfs_space_info *sinfo = cache->space_info;
	u64 num_bytes;
	int ret = -ENOSPC;

	spin_lock(&sinfo->lock);
	spin_lock(&cache->lock);

	if (cache->ro) {
		cache->ro++;
		ret = 0;
		goto out;
	}

	num_bytes = cache->length - cache->reserved - cache->pinned -
		    cache->bytes_super - cache->used;

	/*
	 * Data never overcommits, even in mixed mode, so do just the straight
	 * check of left over space in how much we have allocated.
	 */
	if (force) {
		ret = 0;
	} else if (sinfo->flags & BTRFS_BLOCK_GROUP_DATA) {
		u64 sinfo_used = btrfs_space_info_used(sinfo, true);

		/*
		 * Here we make sure if we mark this bg RO, we still have enough
		 * free space as buffer.
		 */
		if (sinfo_used + num_bytes <= sinfo->total_bytes)
			ret = 0;
	} else {
		/*
		 * We overcommit metadata, so we need to do the
		 * btrfs_can_overcommit check here, and we need to pass in
		 * BTRFS_RESERVE_NO_FLUSH to give ourselves the most amount of
		 * leeway to allow us to mark this block group as read only.
		 */
		if (btrfs_can_overcommit(cache->fs_info, sinfo, num_bytes,
					 BTRFS_RESERVE_NO_FLUSH))
			ret = 0;
	}

	if (!ret) {
		sinfo->bytes_readonly += num_bytes;
		cache->ro++;
		list_add_tail(&cache->ro_list, &sinfo->ro_bgs);
	}
out:
	spin_unlock(&cache->lock);
	spin_unlock(&sinfo->lock);
	if (ret == -ENOSPC && btrfs_test_opt(cache->fs_info, ENOSPC_DEBUG)) {
		btrfs_info(cache->fs_info,
			"unable to make block group %llu ro", cache->start);
		btrfs_dump_space_info(cache->fs_info, cache->space_info, 0, 0);
	}
	return ret;
}
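
/*
 * Clear the pinned extent ranges of @bg from both the previous and the
 * current transaction, so that removing the block group doesn't leave stale
 * pinned ranges behind. Returns true on success.
 */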
static bool clean_pinned_extents(struct btrfs_trans_handle *trans,
				 struct btrfs_block_group *bg)
{
	struct btrfs_fs_info *fs_info = bg->fs_info;
	struct btrfs_transaction *prev_trans = NULL;
	const u64 start = bg->start;
	const u64 end = start + bg->length - 1;
	int ret;

	spin_lock(&fs_info->trans_lock);
	if (trans->transaction->list.prev != &fs_info->trans_list) {
		prev_trans = list_last_entry(&trans->transaction->list,
					     struct btrfs_transaction, list);
		refcount_inc(&prev_trans->use_count);
	}
	spin_unlock(&fs_info->trans_lock);

	/*
	 * Hold the unused_bg_unpin_mutex lock to avoid racing with
	 * btrfs_finish_extent_commit(). If we are at transaction N, another
	 * task might be running finish_extent_commit() for the previous
	 * transaction N - 1, and have seen a range belonging to the block
	 * group in pinned_extents before we were able to clear the whole block
	 * group range from pinned_extents. This means that task can lookup for
	 * the block group after we unpinned it from pinned_extents and removed
	 * it, leading to a BUG_ON() at unpin_extent_range().
	 */
	mutex_lock(&fs_info->unused_bg_unpin_mutex);
	if (prev_trans) {
		ret = clear_extent_bits(&prev_trans->pinned_extents, start, end,
					EXTENT_DIRTY);
		if (ret)
			goto out;
	}

	ret = clear_extent_bits(&trans->transaction->pinned_extents, start, end,
				EXTENT_DIRTY);
out:
	mutex_unlock(&fs_info->unused_bg_unpin_mutex);
	if (prev_trans)
		btrfs_put_transaction(prev_trans);

	return ret == 0;
}

/*
 * Process the unused_bgs list and remove any that don't have any allocated
 * space inside of them.
 */
void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
{
	struct btrfs_block_group *block_group;
	struct btrfs_space_info *space_info;
	struct btrfs_trans_handle *trans;
	const bool async_trim_enabled = btrfs_test_opt(fs_info, DISCARD_ASYNC);
	int ret = 0;

	if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
		return;

	spin_lock(&fs_info->unused_bgs_lock);
	while (!list_empty(&fs_info->unused_bgs)) {
		int trimming;

		block_group = list_first_entry(&fs_info->unused_bgs,
					       struct btrfs_block_group,
					       bg_list);
		list_del_init(&block_group->bg_list);

		space_info = block_group->space_info;

		if (ret || btrfs_mixed_space_info(space_info)) {
			btrfs_put_block_group(block_group);
			continue;
		}
		spin_unlock(&fs_info->unused_bgs_lock);

		btrfs_discard_cancel_work(&fs_info->discard_ctl, block_group);

		mutex_lock(&fs_info->delete_unused_bgs_mutex);

		/* Don't want to race with allocators so take the groups_sem */
		down_write(&space_info->groups_sem);

		/*
		 * Async discard moves the final block group discard to be prior
		 * to the unused_bgs code path. Therefore, if it's not fully
		 * trimmed, punt it back to the async discard lists.
		 */
		if (btrfs_test_opt(fs_info, DISCARD_ASYNC) &&
		    !btrfs_is_free_space_trimmed(block_group)) {
			trace_btrfs_skip_unused_block_group(block_group);
			up_write(&space_info->groups_sem);
			/* Requeue if we failed because of async discard */
			btrfs_discard_queue_work(&fs_info->discard_ctl,
						 block_group);
			goto next;
		}

		spin_lock(&block_group->lock);
		if (block_group->reserved || block_group->pinned ||
		    block_group->used || block_group->ro ||
		    list_is_singular(&block_group->list)) {
			/*
			 * We want to bail if we made new allocations or have
			 * outstanding allocations in this block group. We do
			 * the ro check in case balance is currently acting on
			 * this block group.
			 */
			trace_btrfs_skip_unused_block_group(block_group);
			spin_unlock(&block_group->lock);
			up_write(&space_info->groups_sem);
			goto next;
		}
		spin_unlock(&block_group->lock);

		/* We don't want to force the issue, only flip if it's ok. */
		ret = inc_block_group_ro(block_group, 0);
		up_write(&space_info->groups_sem);
		if (ret < 0) {
			ret = 0;
			goto next;
		}

		/*
		 * Want to do this before we do anything else so we can recover
		 * properly if we fail to join the transaction.
		 */
		trans = btrfs_start_trans_remove_block_group(fs_info,
						     block_group->start);
		if (IS_ERR(trans)) {
			btrfs_dec_block_group_ro(block_group);
			ret = PTR_ERR(trans);
			goto next;
		}

		/*
		 * We could have pending pinned extents for this block group,
		 * just delete them, we don't care about them anymore.
		 */
		if (!clean_pinned_extents(trans, block_group)) {
			btrfs_dec_block_group_ro(block_group);
			goto end_trans;
		}

		/*
		 * At this point, the block_group is read only and should fail
		 * new allocations. However, btrfs_finish_extent_commit() can
		 * cause this block_group to be placed back on the discard
		 * lists because now the block_group isn't fully discarded.
		 * Bail here and try again later after discarding everything.
		 */
		spin_lock(&fs_info->discard_ctl.lock);
		if (!list_empty(&block_group->discard_list)) {
			spin_unlock(&fs_info->discard_ctl.lock);
			btrfs_dec_block_group_ro(block_group);
			btrfs_discard_queue_work(&fs_info->discard_ctl,
						 block_group);
			goto end_trans;
		}
		spin_unlock(&fs_info->discard_ctl.lock);

		/* Reset pinned so btrfs_put_block_group doesn't complain */
		spin_lock(&space_info->lock);
		spin_lock(&block_group->lock);

		btrfs_space_info_update_bytes_pinned(fs_info, space_info,
						     -block_group->pinned);
		space_info->bytes_readonly += block_group->pinned;
		percpu_counter_add_batch(&space_info->total_bytes_pinned,
					 -block_group->pinned,
					 BTRFS_TOTAL_BYTES_PINNED_BATCH);
		block_group->pinned = 0;

		spin_unlock(&block_group->lock);
		spin_unlock(&space_info->lock);

		/*
		 * The normal path here is an unused block group is passed here,
		 * then trimming is handled in the transaction commit path.
		 * Async discard interposes before this to do the trimming
		 * before coming down the unused block group path as trimming
		 * will no longer be done later in the transaction commit path.
		 */
		if (!async_trim_enabled && btrfs_test_opt(fs_info, DISCARD_ASYNC))
			goto flip_async;

		/* DISCARD can flip during remount */
		trimming = btrfs_test_opt(fs_info, DISCARD_SYNC);

		/* Implicit trim during transaction commit. */
		if (trimming)
			btrfs_freeze_block_group(block_group);

		/*
		 * Btrfs_remove_chunk will abort the transaction if things go
		 * horribly wrong.
		 */
		ret = btrfs_remove_chunk(trans, block_group->start);

		if (ret) {
			if (trimming)
				btrfs_unfreeze_block_group(block_group);
			goto end_trans;
		}

		/*
		 * If we're not mounted with -odiscard, we can just forget
		 * about this block group. Otherwise we'll need to wait
		 * until transaction commit to do the actual discard.
		 */
		if (trimming) {
			spin_lock(&fs_info->unused_bgs_lock);
			/*
			 * A concurrent scrub might have added us to the list
			 * fs_info->unused_bgs, so use a list_move operation
			 * to add the block group to the deleted_bgs list.
			 */
			list_move(&block_group->bg_list,
				  &trans->transaction->deleted_bgs);
			spin_unlock(&fs_info->unused_bgs_lock);
			btrfs_get_block_group(block_group);
		}
end_trans:
		btrfs_end_transaction(trans);
next:
		mutex_unlock(&fs_info->delete_unused_bgs_mutex);
		btrfs_put_block_group(block_group);
		spin_lock(&fs_info->unused_bgs_lock);
	}
	spin_unlock(&fs_info->unused_bgs_lock);
	return;

flip_async:
	btrfs_end_transaction(trans);
	mutex_unlock(&fs_info->delete_unused_bgs_mutex);
	btrfs_put_block_group(block_group);
	btrfs_discard_punt_unused_bgs_list(fs_info);
}

void btrfs_mark_bg_unused(struct btrfs_block_group *bg)
{
	struct btrfs_fs_info *fs_info = bg->fs_info;

	spin_lock(&fs_info->unused_bgs_lock);
	if (list_empty(&bg->bg_list)) {
		btrfs_get_block_group(bg);
		trace_btrfs_add_unused_block_group(bg);
		list_add_tail(&bg->bg_list, &fs_info->unused_bgs);
	}
	spin_unlock(&fs_info->unused_bgs_lock);
}

static int read_bg_from_eb(struct btrfs_fs_info *fs_info, struct btrfs_key *key,
			   struct btrfs_path *path)
{
	struct extent_map_tree *em_tree;
	struct extent_map *em;
	struct btrfs_block_group_item bg;
	struct extent_buffer *leaf;
	int slot;
	u64 flags;
	int ret = 0;

	slot = path->slots[0];
	leaf = path->nodes[0];

	em_tree = &fs_info->mapping_tree;
	read_lock(&em_tree->lock);
	em = lookup_extent_mapping(em_tree, key->objectid, key->offset);
	read_unlock(&em_tree->lock);
	if (!em) {
		btrfs_err(fs_info,
			  "logical %llu len %llu found bg but no related chunk",
			  key->objectid, key->offset);
		return -ENOENT;
	}

	if (em->start != key->objectid || em->len != key->offset) {
		btrfs_err(fs_info,
			"block group %llu len %llu mismatch with chunk %llu len %llu",
			key->objectid, key->offset, em->start, em->len);
		ret = -EUCLEAN;
		goto out_free_em;
	}

	read_extent_buffer(leaf, &bg, btrfs_item_ptr_offset(leaf, slot),
			   sizeof(bg));
	flags = btrfs_stack_block_group_flags(&bg) &
		BTRFS_BLOCK_GROUP_TYPE_MASK;

	if (flags != (em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
		btrfs_err(fs_info,
"block group %llu len %llu type flags 0x%llx mismatch with chunk type flags 0x%llx",
			  key->objectid, key->offset, flags,
			  (BTRFS_BLOCK_GROUP_TYPE_MASK & em->map_lookup->type));
		ret = -EUCLEAN;
	}

out_free_em:
	free_extent_map(em);
	return ret;
}

static int find_first_block_group(struct btrfs_fs_info *fs_info,
				  struct btrfs_path *path,
				  struct btrfs_key *key)
{
	struct btrfs_root *root = fs_info->extent_root;
	int ret;
	struct btrfs_key found_key;
	struct extent_buffer *leaf;
	int slot;

	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
	if (ret < 0)
		return ret;

	while (1) {
		slot = path->slots[0];
		leaf = path->nodes[0];
		if (slot >= btrfs_header_nritems(leaf)) {
			ret = btrfs_next_leaf(root, path);
			if (ret == 0)
				continue;
			if (ret < 0)
				goto out;
			break;
		}
		btrfs_item_key_to_cpu(leaf, &found_key, slot);

		if (found_key.objectid >= key->objectid &&
		    found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
			ret = read_bg_from_eb(fs_info, &found_key, path);
			break;
		}

		path->slots[0]++;
	}
out:
	return ret;
}
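
/*
 * Record, in extended format, which RAID profile is used by block groups of
 * the given type, so that btrfs_get_alloc_profile() keeps allocating chunks
 * with the same profile.
 */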
static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
{
	u64 extra_flags = chunk_to_extended(flags) &
				BTRFS_EXTENDED_PROFILE_MASK;

	write_seqlock(&fs_info->profiles_lock);
	if (flags & BTRFS_BLOCK_GROUP_DATA)
		fs_info->avail_data_alloc_bits |= extra_flags;
	if (flags & BTRFS_BLOCK_GROUP_METADATA)
		fs_info->avail_metadata_alloc_bits |= extra_flags;
	if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
		fs_info->avail_system_alloc_bits |= extra_flags;
	write_sequnlock(&fs_info->profiles_lock);
}

/**
 * btrfs_rmap_block - Map a physical disk address to a list of logical addresses
 * @chunk_start:   logical address of block group
 * @physical:      physical address to map to logical addresses
 * @logical:       return array of logical addresses which map to @physical
 * @naddrs:        length of @logical
 * @stripe_len:    size of IO stripe for the given block group
 *
 * Maps a particular @physical disk address to a list of @logical addresses.
 * Used primarily to exclude those portions of a block group that contain super
 * block copies.
 */
EXPORT_FOR_TESTS
int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
		     u64 physical, u64 **logical, int *naddrs, int *stripe_len)
{
	struct extent_map *em;
	struct map_lookup *map;
	u64 *buf;
	u64 bytenr;
	u64 data_stripe_length;
	u64 io_stripe_size;
	int i, nr = 0;
	int ret = 0;

	em = btrfs_get_chunk_map(fs_info, chunk_start, 1);
	if (IS_ERR(em))
		return -EIO;

	map = em->map_lookup;
	data_stripe_length = em->orig_block_len;
	io_stripe_size = map->stripe_len;

	/* For RAID5/6 adjust to a full IO stripe length */
	if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
		io_stripe_size = map->stripe_len * nr_data_stripes(map);

	buf = kcalloc(map->num_stripes, sizeof(u64), GFP_NOFS);
	if (!buf) {
		ret = -ENOMEM;
		goto out;
	}

	for (i = 0; i < map->num_stripes; i++) {
		bool already_inserted = false;
		u64 stripe_nr;
		int j;

		if (!in_range(physical, map->stripes[i].physical,
			      data_stripe_length))
			continue;

		stripe_nr = physical - map->stripes[i].physical;
		stripe_nr = div64_u64(stripe_nr, map->stripe_len);

		if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
			stripe_nr = stripe_nr * map->num_stripes + i;
			stripe_nr = div_u64(stripe_nr, map->sub_stripes);
		} else if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
			stripe_nr = stripe_nr * map->num_stripes + i;
		}
		/*
		 * The remaining case would be for RAID56, multiply by
		 * nr_data_stripes(). Alternatively, just use rmap_len below
		 * instead of map->stripe_len.
		 */

		bytenr = chunk_start + stripe_nr * io_stripe_size;

		/* Ensure we don't add duplicate addresses */
		for (j = 0; j < nr; j++) {
			if (buf[j] == bytenr) {
				already_inserted = true;
				break;
			}
		}

		if (!already_inserted)
			buf[nr++] = bytenr;
	}

	*logical = buf;
	*naddrs = nr;
	*stripe_len = io_stripe_size;
out:
	free_extent_map(em);
	return ret;
}

static int exclude_super_stripes(struct btrfs_block_group *cache)
{
	struct btrfs_fs_info *fs_info = cache->fs_info;
	const bool zoned = btrfs_is_zoned(fs_info);
	u64 bytenr;
	u64 *logical;
	int stripe_len;
	int i, nr, ret;

	if (cache->start < BTRFS_SUPER_INFO_OFFSET) {
		stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->start;
		cache->bytes_super += stripe_len;
		ret = btrfs_add_excluded_extent(fs_info, cache->start,
						stripe_len);
		if (ret)
			return ret;
	}

	for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
		bytenr = btrfs_sb_offset(i);
		ret = btrfs_rmap_block(fs_info, cache->start,
				       bytenr, &logical, &nr, &stripe_len);
		if (ret)
			return ret;

		/* Shouldn't have super stripes in sequential zones */
		if (zoned && nr) {
			btrfs_err(fs_info,
			"zoned: block group %llu must not contain super block",
				  cache->start);
			return -EUCLEAN;
		}

		while (nr--) {
			u64 len = min_t(u64, stripe_len,
				cache->start + cache->length - logical[nr]);

			cache->bytes_super += len;
			ret = btrfs_add_excluded_extent(fs_info, logical[nr],
							len);
			if (ret) {
				kfree(logical);
				return ret;
			}
		}

		kfree(logical);
	}
	return 0;
}

static void link_block_group(struct btrfs_block_group *cache)
{
	struct btrfs_space_info *space_info = cache->space_info;
	int index = btrfs_bg_flags_to_raid_index(cache->flags);

	down_write(&space_info->groups_sem);
	list_add_tail(&cache->list, &space_info->block_groups[index]);
	up_write(&space_info->groups_sem);
}
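
/*
 * Allocate and initialize the in-memory representation of a block group
 * starting at @start. The caller still has to fill in length, flags and the
 * space_info, and add it to the block group cache rbtree.
 */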
static struct btrfs_block_group *btrfs_create_block_group_cache(
		struct btrfs_fs_info *fs_info, u64 start)
{
	struct btrfs_block_group *cache;

	cache = kzalloc(sizeof(*cache), GFP_NOFS);
	if (!cache)
		return NULL;

	cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
					GFP_NOFS);
	if (!cache->free_space_ctl) {
		kfree(cache);
		return NULL;
	}

	cache->start = start;

	cache->fs_info = fs_info;
	cache->full_stripe_len = btrfs_full_stripe_len(fs_info, start);

	cache->discard_index = BTRFS_DISCARD_INDEX_UNUSED;

	refcount_set(&cache->refs, 1);
	spin_lock_init(&cache->lock);
	init_rwsem(&cache->data_rwsem);
	INIT_LIST_HEAD(&cache->list);
	INIT_LIST_HEAD(&cache->cluster_list);
	INIT_LIST_HEAD(&cache->bg_list);
	INIT_LIST_HEAD(&cache->ro_list);
	INIT_LIST_HEAD(&cache->discard_list);
	INIT_LIST_HEAD(&cache->dirty_list);
	INIT_LIST_HEAD(&cache->io_list);
	btrfs_init_free_space_ctl(cache, cache->free_space_ctl);
	atomic_set(&cache->frozen, 0);
	mutex_init(&cache->free_space_lock);
	btrfs_init_full_stripe_locks_tree(&cache->full_stripe_locks_root);

	return cache;
}

/*
 * Iterate all chunks and verify that each of them has the corresponding block
 * group
 */
static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info)
{
	struct extent_map_tree *map_tree = &fs_info->mapping_tree;
	struct extent_map *em;
	struct btrfs_block_group *bg;
	u64 start = 0;
	int ret = 0;

	while (1) {
		read_lock(&map_tree->lock);
		/*
		 * lookup_extent_mapping will return the first extent map
		 * intersecting the range, so setting @len to 1 is enough to
		 * get the first chunk.
		 */
		em = lookup_extent_mapping(map_tree, start, 1);
		read_unlock(&map_tree->lock);
		if (!em)
			break;

		bg = btrfs_lookup_block_group(fs_info, em->start);
		if (!bg) {
			btrfs_err(fs_info,
	"chunk start=%llu len=%llu doesn't have corresponding block group",
				  em->start, em->len);
			ret = -EUCLEAN;
			free_extent_map(em);
			break;
		}
		if (bg->start != em->start || bg->length != em->len ||
		    (bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK) !=
		    (em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
			btrfs_err(fs_info,
"chunk start=%llu len=%llu flags=0x%llx doesn't match block group start=%llu len=%llu flags=0x%llx",
				em->start, em->len,
				em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK,
				bg->start, bg->length,
				bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK);
			ret = -EUCLEAN;
			free_extent_map(em);
			btrfs_put_block_group(bg);
			break;
		}
		start = em->start + em->len;
		free_extent_map(em);
		btrfs_put_block_group(bg);
	}
	return ret;
}

static void read_block_group_item(struct btrfs_block_group *cache,
				  struct btrfs_path *path,
				  const struct btrfs_key *key)
{
	struct extent_buffer *leaf = path->nodes[0];
	struct btrfs_block_group_item bgi;
	int slot = path->slots[0];

	cache->length = key->offset;

	read_extent_buffer(leaf, &bgi, btrfs_item_ptr_offset(leaf, slot),
			   sizeof(bgi));
	cache->used = btrfs_stack_block_group_used(&bgi);
	cache->flags = btrfs_stack_block_group_flags(&bgi);
}
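
/*
 * Build the in-memory block group from an on-disk block group item found at
 * @key, exclude its super block stripes, and link it into its space_info and
 * the block group cache rbtree.
 */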
1846 */ 1847 if (btrfs_test_opt(info, SPACE_CACHE)) 1848 cache->disk_cache_state = BTRFS_DC_CLEAR; 1849 } 1850 if (!mixed && ((cache->flags & BTRFS_BLOCK_GROUP_METADATA) && 1851 (cache->flags & BTRFS_BLOCK_GROUP_DATA))) { 1852 btrfs_err(info, 1853 "bg %llu is a mixed block group but filesystem hasn't enabled mixed block groups", 1854 cache->start); 1855 ret = -EINVAL; 1856 goto error; 1857 } 1858 1859 /* 1860 * We need to exclude the super stripes now so that the space info has 1861 * super bytes accounted for, otherwise we'll think we have more space 1862 * than we actually do. 1863 */ 1864 ret = exclude_super_stripes(cache); 1865 if (ret) { 1866 /* We may have excluded something, so call this just in case. */ 1867 btrfs_free_excluded_extents(cache); 1868 goto error; 1869 } 1870 1871 /* 1872 * Check for two cases, either we are full, and therefore don't need 1873 * to bother with the caching work since we won't find any space, or we 1874 * are empty, and we can just add all the space in and be done with it. 1875 * This saves us _a_lot_ of time, particularly in the full case. 1876 */ 1877 if (cache->length == cache->used) { 1878 cache->last_byte_to_unpin = (u64)-1; 1879 cache->cached = BTRFS_CACHE_FINISHED; 1880 btrfs_free_excluded_extents(cache); 1881 } else if (cache->used == 0) { 1882 cache->last_byte_to_unpin = (u64)-1; 1883 cache->cached = BTRFS_CACHE_FINISHED; 1884 add_new_free_space(cache, cache->start, 1885 cache->start + cache->length); 1886 btrfs_free_excluded_extents(cache); 1887 } 1888 1889 ret = btrfs_add_block_group_cache(info, cache); 1890 if (ret) { 1891 btrfs_remove_free_space_cache(cache); 1892 goto error; 1893 } 1894 trace_btrfs_add_block_group(info, cache, 0); 1895 btrfs_update_space_info(info, cache->flags, cache->length, 1896 cache->used, cache->bytes_super, &space_info); 1897 1898 cache->space_info = space_info; 1899 1900 link_block_group(cache); 1901 1902 set_avail_alloc_bits(info, cache->flags); 1903 if (btrfs_chunk_readonly(info, cache->start)) { 1904 inc_block_group_ro(cache, 1); 1905 } else if (cache->used == 0) { 1906 ASSERT(list_empty(&cache->bg_list)); 1907 if (btrfs_test_opt(info, DISCARD_ASYNC)) 1908 btrfs_discard_queue_work(&info->discard_ctl, cache); 1909 else 1910 btrfs_mark_bg_unused(cache); 1911 } 1912 return 0; 1913 error: 1914 btrfs_put_block_group(cache); 1915 return ret; 1916 } 1917 1918 static int fill_dummy_bgs(struct btrfs_fs_info *fs_info) 1919 { 1920 struct extent_map_tree *em_tree = &fs_info->mapping_tree; 1921 struct btrfs_space_info *space_info; 1922 struct rb_node *node; 1923 int ret = 0; 1924 1925 for (node = rb_first_cached(&em_tree->map); node; node = rb_next(node)) { 1926 struct extent_map *em; 1927 struct map_lookup *map; 1928 struct btrfs_block_group *bg; 1929 1930 em = rb_entry(node, struct extent_map, rb_node); 1931 map = em->map_lookup; 1932 bg = btrfs_create_block_group_cache(fs_info, em->start); 1933 if (!bg) { 1934 ret = -ENOMEM; 1935 break; 1936 } 1937 1938 /* Fill dummy cache as FULL */ 1939 bg->length = em->len; 1940 bg->flags = map->type; 1941 bg->last_byte_to_unpin = (u64)-1; 1942 bg->cached = BTRFS_CACHE_FINISHED; 1943 bg->used = em->len; 1944 bg->flags = map->type; 1945 ret = btrfs_add_block_group_cache(fs_info, bg); 1946 if (ret) { 1947 btrfs_remove_free_space_cache(bg); 1948 btrfs_put_block_group(bg); 1949 break; 1950 } 1951 btrfs_update_space_info(fs_info, bg->flags, em->len, em->len, 1952 0, &space_info); 1953 bg->space_info = space_info; 1954 link_block_group(bg); 1955 1956 set_avail_alloc_bits(fs_info, bg->flags); 
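		/*
		 * The dummy block group is accounted as completely used
		 * (used == length) and gets no free space entries, so the
		 * allocator will never hand out space from it.
		 */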
1957 } 1958 if (!ret) 1959 btrfs_init_global_block_rsv(fs_info); 1960 return ret; 1961 } 1962 1963 int btrfs_read_block_groups(struct btrfs_fs_info *info) 1964 { 1965 struct btrfs_path *path; 1966 int ret; 1967 struct btrfs_block_group *cache; 1968 struct btrfs_space_info *space_info; 1969 struct btrfs_key key; 1970 int need_clear = 0; 1971 u64 cache_gen; 1972 1973 if (!info->extent_root) 1974 return fill_dummy_bgs(info); 1975 1976 key.objectid = 0; 1977 key.offset = 0; 1978 key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; 1979 path = btrfs_alloc_path(); 1980 if (!path) 1981 return -ENOMEM; 1982 1983 cache_gen = btrfs_super_cache_generation(info->super_copy); 1984 if (btrfs_test_opt(info, SPACE_CACHE) && 1985 btrfs_super_generation(info->super_copy) != cache_gen) 1986 need_clear = 1; 1987 if (btrfs_test_opt(info, CLEAR_CACHE)) 1988 need_clear = 1; 1989 1990 while (1) { 1991 ret = find_first_block_group(info, path, &key); 1992 if (ret > 0) 1993 break; 1994 if (ret != 0) 1995 goto error; 1996 1997 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 1998 ret = read_one_block_group(info, path, &key, need_clear); 1999 if (ret < 0) 2000 goto error; 2001 key.objectid += key.offset; 2002 key.offset = 0; 2003 btrfs_release_path(path); 2004 } 2005 btrfs_release_path(path); 2006 2007 list_for_each_entry(space_info, &info->space_info, list) { 2008 int i; 2009 2010 for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) { 2011 if (list_empty(&space_info->block_groups[i])) 2012 continue; 2013 cache = list_first_entry(&space_info->block_groups[i], 2014 struct btrfs_block_group, 2015 list); 2016 btrfs_sysfs_add_block_group_type(cache); 2017 } 2018 2019 if (!(btrfs_get_alloc_profile(info, space_info->flags) & 2020 (BTRFS_BLOCK_GROUP_RAID10 | 2021 BTRFS_BLOCK_GROUP_RAID1_MASK | 2022 BTRFS_BLOCK_GROUP_RAID56_MASK | 2023 BTRFS_BLOCK_GROUP_DUP))) 2024 continue; 2025 /* 2026 * Avoid allocating from un-mirrored block group if there are 2027 * mirrored block groups. 
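		 * Such SINGLE/RAID0 groups can be left over e.g. from an
		 * interrupted profile-converting balance; marking them
		 * read-only below steers new allocations to the mirrored
		 * groups.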
2028 */ 2029 list_for_each_entry(cache, 2030 &space_info->block_groups[BTRFS_RAID_RAID0], 2031 list) 2032 inc_block_group_ro(cache, 1); 2033 list_for_each_entry(cache, 2034 &space_info->block_groups[BTRFS_RAID_SINGLE], 2035 list) 2036 inc_block_group_ro(cache, 1); 2037 } 2038 2039 btrfs_init_global_block_rsv(info); 2040 ret = check_chunk_block_group_mappings(info); 2041 error: 2042 btrfs_free_path(path); 2043 return ret; 2044 } 2045 2046 static int insert_block_group_item(struct btrfs_trans_handle *trans, 2047 struct btrfs_block_group *block_group) 2048 { 2049 struct btrfs_fs_info *fs_info = trans->fs_info; 2050 struct btrfs_block_group_item bgi; 2051 struct btrfs_root *root; 2052 struct btrfs_key key; 2053 2054 spin_lock(&block_group->lock); 2055 btrfs_set_stack_block_group_used(&bgi, block_group->used); 2056 btrfs_set_stack_block_group_chunk_objectid(&bgi, 2057 BTRFS_FIRST_CHUNK_TREE_OBJECTID); 2058 btrfs_set_stack_block_group_flags(&bgi, block_group->flags); 2059 key.objectid = block_group->start; 2060 key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; 2061 key.offset = block_group->length; 2062 spin_unlock(&block_group->lock); 2063 2064 root = fs_info->extent_root; 2065 return btrfs_insert_item(trans, root, &key, &bgi, sizeof(bgi)); 2066 } 2067 2068 void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans) 2069 { 2070 struct btrfs_fs_info *fs_info = trans->fs_info; 2071 struct btrfs_block_group *block_group; 2072 int ret = 0; 2073 2074 if (!trans->can_flush_pending_bgs) 2075 return; 2076 2077 while (!list_empty(&trans->new_bgs)) { 2078 int index; 2079 2080 block_group = list_first_entry(&trans->new_bgs, 2081 struct btrfs_block_group, 2082 bg_list); 2083 if (ret) 2084 goto next; 2085 2086 index = btrfs_bg_flags_to_raid_index(block_group->flags); 2087 2088 ret = insert_block_group_item(trans, block_group); 2089 if (ret) 2090 btrfs_abort_transaction(trans, ret); 2091 ret = btrfs_finish_chunk_alloc(trans, block_group->start, 2092 block_group->length); 2093 if (ret) 2094 btrfs_abort_transaction(trans, ret); 2095 add_block_group_free_space(trans, block_group); 2096 2097 /* 2098 * If we restriped during balance, we may have added a new raid 2099 * type, so now add the sysfs entries when it is safe to do so. 2100 * We don't have to worry about locking here as it's handled in 2101 * btrfs_sysfs_add_block_group_type. 2102 */ 2103 if (block_group->space_info->block_group_kobjs[index] == NULL) 2104 btrfs_sysfs_add_block_group_type(block_group); 2105 2106 /* Already aborted the transaction if it failed. 
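		 * Once ret is set, the remaining block groups in the list
		 * only go through the cleanup below: their reservation is
		 * released and they are dropped from the list without their
		 * items being inserted.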
*/ 2107 next: 2108 btrfs_delayed_refs_rsv_release(fs_info, 1); 2109 list_del_init(&block_group->bg_list); 2110 } 2111 btrfs_trans_release_chunk_metadata(trans); 2112 } 2113 2114 int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used, 2115 u64 type, u64 chunk_offset, u64 size) 2116 { 2117 struct btrfs_fs_info *fs_info = trans->fs_info; 2118 struct btrfs_block_group *cache; 2119 int ret; 2120 2121 btrfs_set_log_full_commit(trans); 2122 2123 cache = btrfs_create_block_group_cache(fs_info, chunk_offset); 2124 if (!cache) 2125 return -ENOMEM; 2126 2127 cache->length = size; 2128 set_free_space_tree_thresholds(cache); 2129 cache->used = bytes_used; 2130 cache->flags = type; 2131 cache->last_byte_to_unpin = (u64)-1; 2132 cache->cached = BTRFS_CACHE_FINISHED; 2133 if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) 2134 cache->needs_free_space = 1; 2135 ret = exclude_super_stripes(cache); 2136 if (ret) { 2137 /* We may have excluded something, so call this just in case */ 2138 btrfs_free_excluded_extents(cache); 2139 btrfs_put_block_group(cache); 2140 return ret; 2141 } 2142 2143 add_new_free_space(cache, chunk_offset, chunk_offset + size); 2144 2145 btrfs_free_excluded_extents(cache); 2146 2147 #ifdef CONFIG_BTRFS_DEBUG 2148 if (btrfs_should_fragment_free_space(cache)) { 2149 u64 new_bytes_used = size - bytes_used; 2150 2151 bytes_used += new_bytes_used >> 1; 2152 fragment_free_space(cache); 2153 } 2154 #endif 2155 /* 2156 * Ensure the corresponding space_info object is created and 2157 * assigned to our block group. We want our bg to be added to the rbtree 2158 * with its ->space_info set. 2159 */ 2160 cache->space_info = btrfs_find_space_info(fs_info, cache->flags); 2161 ASSERT(cache->space_info); 2162 2163 ret = btrfs_add_block_group_cache(fs_info, cache); 2164 if (ret) { 2165 btrfs_remove_free_space_cache(cache); 2166 btrfs_put_block_group(cache); 2167 return ret; 2168 } 2169 2170 /* 2171 * Now that our block group has its ->space_info set and is inserted in 2172 * the rbtree, update the space info's counters. 2173 */ 2174 trace_btrfs_add_block_group(fs_info, cache, 1); 2175 btrfs_update_space_info(fs_info, cache->flags, size, bytes_used, 2176 cache->bytes_super, &cache->space_info); 2177 btrfs_update_global_block_rsv(fs_info); 2178 2179 link_block_group(cache); 2180 2181 list_add_tail(&cache->bg_list, &trans->new_bgs); 2182 trans->delayed_ref_updates++; 2183 btrfs_update_delayed_refs_rsv(trans); 2184 2185 set_avail_alloc_bits(fs_info, type); 2186 return 0; 2187 } 2188 2189 /* 2190 * Mark one block group RO, can be called several times for the same block 2191 * group. 2192 * 2193 * @cache: the destination block group 2194 * @do_chunk_alloc: whether need to do chunk pre-allocation, this is to 2195 * ensure we still have some free space after marking this 2196 * block group RO. 2197 */ 2198 int btrfs_inc_block_group_ro(struct btrfs_block_group *cache, 2199 bool do_chunk_alloc) 2200 { 2201 struct btrfs_fs_info *fs_info = cache->fs_info; 2202 struct btrfs_trans_handle *trans; 2203 u64 alloc_flags; 2204 int ret; 2205 2206 again: 2207 trans = btrfs_join_transaction(fs_info->extent_root); 2208 if (IS_ERR(trans)) 2209 return PTR_ERR(trans); 2210 2211 /* 2212 * we're not allowed to set block groups readonly after the dirty 2213 * block groups cache has started writing. 
If it already started, 2214 * back off and let this transaction commit 2215 */ 2216 mutex_lock(&fs_info->ro_block_group_mutex); 2217 if (test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &trans->transaction->flags)) { 2218 u64 transid = trans->transid; 2219 2220 mutex_unlock(&fs_info->ro_block_group_mutex); 2221 btrfs_end_transaction(trans); 2222 2223 ret = btrfs_wait_for_commit(fs_info, transid); 2224 if (ret) 2225 return ret; 2226 goto again; 2227 } 2228 2229 if (do_chunk_alloc) { 2230 /* 2231 * If we are changing raid levels, try to allocate a 2232 * corresponding block group with the new raid level. 2233 */ 2234 alloc_flags = btrfs_get_alloc_profile(fs_info, cache->flags); 2235 if (alloc_flags != cache->flags) { 2236 ret = btrfs_chunk_alloc(trans, alloc_flags, 2237 CHUNK_ALLOC_FORCE); 2238 /* 2239 * ENOSPC is allowed here, we may have enough space 2240 * already allocated at the new raid level to carry on 2241 */ 2242 if (ret == -ENOSPC) 2243 ret = 0; 2244 if (ret < 0) 2245 goto out; 2246 } 2247 } 2248 2249 ret = inc_block_group_ro(cache, 0); 2250 if (!do_chunk_alloc) 2251 goto unlock_out; 2252 if (!ret) 2253 goto out; 2254 alloc_flags = btrfs_get_alloc_profile(fs_info, cache->space_info->flags); 2255 ret = btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE); 2256 if (ret < 0) 2257 goto out; 2258 ret = inc_block_group_ro(cache, 0); 2259 out: 2260 if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) { 2261 alloc_flags = btrfs_get_alloc_profile(fs_info, cache->flags); 2262 mutex_lock(&fs_info->chunk_mutex); 2263 check_system_chunk(trans, alloc_flags); 2264 mutex_unlock(&fs_info->chunk_mutex); 2265 } 2266 unlock_out: 2267 mutex_unlock(&fs_info->ro_block_group_mutex); 2268 2269 btrfs_end_transaction(trans); 2270 return ret; 2271 } 2272 2273 void btrfs_dec_block_group_ro(struct btrfs_block_group *cache) 2274 { 2275 struct btrfs_space_info *sinfo = cache->space_info; 2276 u64 num_bytes; 2277 2278 BUG_ON(!cache->ro); 2279 2280 spin_lock(&sinfo->lock); 2281 spin_lock(&cache->lock); 2282 if (!--cache->ro) { 2283 num_bytes = cache->length - cache->reserved - 2284 cache->pinned - cache->bytes_super - cache->used; 2285 sinfo->bytes_readonly -= num_bytes; 2286 list_del_init(&cache->ro_list); 2287 } 2288 spin_unlock(&cache->lock); 2289 spin_unlock(&sinfo->lock); 2290 } 2291 2292 static int update_block_group_item(struct btrfs_trans_handle *trans, 2293 struct btrfs_path *path, 2294 struct btrfs_block_group *cache) 2295 { 2296 struct btrfs_fs_info *fs_info = trans->fs_info; 2297 int ret; 2298 struct btrfs_root *root = fs_info->extent_root; 2299 unsigned long bi; 2300 struct extent_buffer *leaf; 2301 struct btrfs_block_group_item bgi; 2302 struct btrfs_key key; 2303 2304 key.objectid = cache->start; 2305 key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; 2306 key.offset = cache->length; 2307 2308 ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 2309 if (ret) { 2310 if (ret > 0) 2311 ret = -ENOENT; 2312 goto fail; 2313 } 2314 2315 leaf = path->nodes[0]; 2316 bi = btrfs_item_ptr_offset(leaf, path->slots[0]); 2317 btrfs_set_stack_block_group_used(&bgi, cache->used); 2318 btrfs_set_stack_block_group_chunk_objectid(&bgi, 2319 BTRFS_FIRST_CHUNK_TREE_OBJECTID); 2320 btrfs_set_stack_block_group_flags(&bgi, cache->flags); 2321 write_extent_buffer(leaf, &bgi, bi, sizeof(bgi)); 2322 btrfs_mark_buffer_dirty(leaf); 2323 fail: 2324 btrfs_release_path(path); 2325 return ret; 2326 2327 } 2328 2329 static int cache_save_setup(struct btrfs_block_group *block_group, 2330 struct btrfs_trans_handle *trans, 2331 struct btrfs_path *path) 2332 { 
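	/*
	 * Decide whether the v1 free space cache can be written out for this
	 * block group in this transaction: look up or create the cache inode,
	 * truncate any stale contents, preallocate room for the new cache and
	 * record the result in block_group->disk_cache_state (BTRFS_DC_SETUP
	 * on success, BTRFS_DC_WRITTEN when the cache is not going to be
	 * used).
	 */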
2333 struct btrfs_fs_info *fs_info = block_group->fs_info; 2334 struct btrfs_root *root = fs_info->tree_root; 2335 struct inode *inode = NULL; 2336 struct extent_changeset *data_reserved = NULL; 2337 u64 alloc_hint = 0; 2338 int dcs = BTRFS_DC_ERROR; 2339 u64 num_pages = 0; 2340 int retries = 0; 2341 int ret = 0; 2342 2343 if (!btrfs_test_opt(fs_info, SPACE_CACHE)) 2344 return 0; 2345 2346 /* 2347 * If this block group is smaller than 100 megs don't bother caching the 2348 * block group. 2349 */ 2350 if (block_group->length < (100 * SZ_1M)) { 2351 spin_lock(&block_group->lock); 2352 block_group->disk_cache_state = BTRFS_DC_WRITTEN; 2353 spin_unlock(&block_group->lock); 2354 return 0; 2355 } 2356 2357 if (TRANS_ABORTED(trans)) 2358 return 0; 2359 again: 2360 inode = lookup_free_space_inode(block_group, path); 2361 if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) { 2362 ret = PTR_ERR(inode); 2363 btrfs_release_path(path); 2364 goto out; 2365 } 2366 2367 if (IS_ERR(inode)) { 2368 BUG_ON(retries); 2369 retries++; 2370 2371 if (block_group->ro) 2372 goto out_free; 2373 2374 ret = create_free_space_inode(trans, block_group, path); 2375 if (ret) 2376 goto out_free; 2377 goto again; 2378 } 2379 2380 /* 2381 * We want to set the generation to 0, that way if anything goes wrong 2382 * from here on out we know not to trust this cache when we load up next 2383 * time. 2384 */ 2385 BTRFS_I(inode)->generation = 0; 2386 ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); 2387 if (ret) { 2388 /* 2389 * So theoretically we could recover from this, simply set the 2390 * super cache generation to 0 so we know to invalidate the 2391 * cache, but then we'd have to keep track of the block groups 2392 * that fail this way so we know we _have_ to reset this cache 2393 * before the next commit or risk reading stale cache. So to 2394 * limit our exposure to horrible edge cases lets just abort the 2395 * transaction, this only happens in really bad situations 2396 * anyway. 2397 */ 2398 btrfs_abort_transaction(trans, ret); 2399 goto out_put; 2400 } 2401 WARN_ON(ret); 2402 2403 /* We've already setup this transaction, go ahead and exit */ 2404 if (block_group->cache_generation == trans->transid && 2405 i_size_read(inode)) { 2406 dcs = BTRFS_DC_SETUP; 2407 goto out_put; 2408 } 2409 2410 if (i_size_read(inode) > 0) { 2411 ret = btrfs_check_trunc_cache_free_space(fs_info, 2412 &fs_info->global_block_rsv); 2413 if (ret) 2414 goto out_put; 2415 2416 ret = btrfs_truncate_free_space_cache(trans, NULL, inode); 2417 if (ret) 2418 goto out_put; 2419 } 2420 2421 spin_lock(&block_group->lock); 2422 if (block_group->cached != BTRFS_CACHE_FINISHED || 2423 !btrfs_test_opt(fs_info, SPACE_CACHE)) { 2424 /* 2425 * don't bother trying to write stuff out _if_ 2426 * a) we're not cached, 2427 * b) we're with nospace_cache mount option, 2428 * c) we're with v2 space_cache (FREE_SPACE_TREE). 2429 */ 2430 dcs = BTRFS_DC_WRITTEN; 2431 spin_unlock(&block_group->lock); 2432 goto out_put; 2433 } 2434 spin_unlock(&block_group->lock); 2435 2436 /* 2437 * We hit an ENOSPC when setting up the cache in this transaction, just 2438 * skip doing the setup, we've already cleared the cache so we're safe. 2439 */ 2440 if (test_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags)) { 2441 ret = -ENOSPC; 2442 goto out_put; 2443 } 2444 2445 /* 2446 * Try to preallocate enough space based on how big the block group is. 
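	 * The sizing below works out to 16 pages per 256M of block group,
	 * so e.g. a 1G block group preallocates 64 pages (256K with 4K
	 * pages).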
2447 * Keep in mind this has to include any pinned space which could end up 2448 * taking up quite a bit since it's not folded into the other space 2449 * cache. 2450 */ 2451 num_pages = div_u64(block_group->length, SZ_256M); 2452 if (!num_pages) 2453 num_pages = 1; 2454 2455 num_pages *= 16; 2456 num_pages *= PAGE_SIZE; 2457 2458 ret = btrfs_check_data_free_space(BTRFS_I(inode), &data_reserved, 0, 2459 num_pages); 2460 if (ret) 2461 goto out_put; 2462 2463 ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages, 2464 num_pages, num_pages, 2465 &alloc_hint); 2466 /* 2467 * Our cache requires contiguous chunks so that we don't modify a bunch 2468 * of metadata or split extents when writing the cache out, which means 2469 * we can enospc if we are heavily fragmented in addition to just normal 2470 * out of space conditions. So if we hit this just skip setting up any 2471 * other block groups for this transaction, maybe we'll unpin enough 2472 * space the next time around. 2473 */ 2474 if (!ret) 2475 dcs = BTRFS_DC_SETUP; 2476 else if (ret == -ENOSPC) 2477 set_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags); 2478 2479 out_put: 2480 iput(inode); 2481 out_free: 2482 btrfs_release_path(path); 2483 out: 2484 spin_lock(&block_group->lock); 2485 if (!ret && dcs == BTRFS_DC_SETUP) 2486 block_group->cache_generation = trans->transid; 2487 block_group->disk_cache_state = dcs; 2488 spin_unlock(&block_group->lock); 2489 2490 extent_changeset_free(data_reserved); 2491 return ret; 2492 } 2493 2494 int btrfs_setup_space_cache(struct btrfs_trans_handle *trans) 2495 { 2496 struct btrfs_fs_info *fs_info = trans->fs_info; 2497 struct btrfs_block_group *cache, *tmp; 2498 struct btrfs_transaction *cur_trans = trans->transaction; 2499 struct btrfs_path *path; 2500 2501 if (list_empty(&cur_trans->dirty_bgs) || 2502 !btrfs_test_opt(fs_info, SPACE_CACHE)) 2503 return 0; 2504 2505 path = btrfs_alloc_path(); 2506 if (!path) 2507 return -ENOMEM; 2508 2509 /* Could add new block groups, use _safe just in case */ 2510 list_for_each_entry_safe(cache, tmp, &cur_trans->dirty_bgs, 2511 dirty_list) { 2512 if (cache->disk_cache_state == BTRFS_DC_CLEAR) 2513 cache_save_setup(cache, trans, path); 2514 } 2515 2516 btrfs_free_path(path); 2517 return 0; 2518 } 2519 2520 /* 2521 * Transaction commit does final block group cache writeback during a critical 2522 * section where nothing is allowed to change the FS. This is required in 2523 * order for the cache to actually match the block group, but can introduce a 2524 * lot of latency into the commit. 2525 * 2526 * So, btrfs_start_dirty_block_groups is here to kick off block group cache IO. 2527 * There's a chance we'll have to redo some of it if the block group changes 2528 * again during the commit, but it greatly reduces the commit latency by 2529 * getting rid of the easy block groups while we're still allowing others to 2530 * join the commit. 
2531 */ 2532 int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans) 2533 { 2534 struct btrfs_fs_info *fs_info = trans->fs_info; 2535 struct btrfs_block_group *cache; 2536 struct btrfs_transaction *cur_trans = trans->transaction; 2537 int ret = 0; 2538 int should_put; 2539 struct btrfs_path *path = NULL; 2540 LIST_HEAD(dirty); 2541 struct list_head *io = &cur_trans->io_bgs; 2542 int num_started = 0; 2543 int loops = 0; 2544 2545 spin_lock(&cur_trans->dirty_bgs_lock); 2546 if (list_empty(&cur_trans->dirty_bgs)) { 2547 spin_unlock(&cur_trans->dirty_bgs_lock); 2548 return 0; 2549 } 2550 list_splice_init(&cur_trans->dirty_bgs, &dirty); 2551 spin_unlock(&cur_trans->dirty_bgs_lock); 2552 2553 again: 2554 /* Make sure all the block groups on our dirty list actually exist */ 2555 btrfs_create_pending_block_groups(trans); 2556 2557 if (!path) { 2558 path = btrfs_alloc_path(); 2559 if (!path) 2560 return -ENOMEM; 2561 } 2562 2563 /* 2564 * cache_write_mutex is here only to save us from balance or automatic 2565 * removal of empty block groups deleting this block group while we are 2566 * writing out the cache 2567 */ 2568 mutex_lock(&trans->transaction->cache_write_mutex); 2569 while (!list_empty(&dirty)) { 2570 bool drop_reserve = true; 2571 2572 cache = list_first_entry(&dirty, struct btrfs_block_group, 2573 dirty_list); 2574 /* 2575 * This can happen if something re-dirties a block group that 2576 * is already under IO. Just wait for it to finish and then do 2577 * it all again 2578 */ 2579 if (!list_empty(&cache->io_list)) { 2580 list_del_init(&cache->io_list); 2581 btrfs_wait_cache_io(trans, cache, path); 2582 btrfs_put_block_group(cache); 2583 } 2584 2585 2586 /* 2587 * btrfs_wait_cache_io uses the cache->dirty_list to decide if 2588 * it should update the cache_state. Don't delete until after 2589 * we wait. 2590 * 2591 * Since we're not running in the commit critical section 2592 * we need the dirty_bgs_lock to protect from update_block_group 2593 */ 2594 spin_lock(&cur_trans->dirty_bgs_lock); 2595 list_del_init(&cache->dirty_list); 2596 spin_unlock(&cur_trans->dirty_bgs_lock); 2597 2598 should_put = 1; 2599 2600 cache_save_setup(cache, trans, path); 2601 2602 if (cache->disk_cache_state == BTRFS_DC_SETUP) { 2603 cache->io_ctl.inode = NULL; 2604 ret = btrfs_write_out_cache(trans, cache, path); 2605 if (ret == 0 && cache->io_ctl.inode) { 2606 num_started++; 2607 should_put = 0; 2608 2609 /* 2610 * The cache_write_mutex is protecting the 2611 * io_list, also refer to the definition of 2612 * btrfs_transaction::io_bgs for more details 2613 */ 2614 list_add_tail(&cache->io_list, io); 2615 } else { 2616 /* 2617 * If we failed to write the cache, the 2618 * generation will be bad and life goes on 2619 */ 2620 ret = 0; 2621 } 2622 } 2623 if (!ret) { 2624 ret = update_block_group_item(trans, path, cache); 2625 /* 2626 * Our block group might still be attached to the list 2627 * of new block groups in the transaction handle of some 2628 * other task (struct btrfs_trans_handle->new_bgs). This 2629 * means its block group item isn't yet in the extent 2630 * tree. If this happens ignore the error, as we will 2631 * try again later in the critical section of the 2632 * transaction commit. 
2633 */ 2634 if (ret == -ENOENT) { 2635 ret = 0; 2636 spin_lock(&cur_trans->dirty_bgs_lock); 2637 if (list_empty(&cache->dirty_list)) { 2638 list_add_tail(&cache->dirty_list, 2639 &cur_trans->dirty_bgs); 2640 btrfs_get_block_group(cache); 2641 drop_reserve = false; 2642 } 2643 spin_unlock(&cur_trans->dirty_bgs_lock); 2644 } else if (ret) { 2645 btrfs_abort_transaction(trans, ret); 2646 } 2647 } 2648 2649 /* If it's not on the io list, we need to put the block group */ 2650 if (should_put) 2651 btrfs_put_block_group(cache); 2652 if (drop_reserve) 2653 btrfs_delayed_refs_rsv_release(fs_info, 1); 2654 2655 if (ret) 2656 break; 2657 2658 /* 2659 * Avoid blocking other tasks for too long. It might even save 2660 * us from writing caches for block groups that are going to be 2661 * removed. 2662 */ 2663 mutex_unlock(&trans->transaction->cache_write_mutex); 2664 mutex_lock(&trans->transaction->cache_write_mutex); 2665 } 2666 mutex_unlock(&trans->transaction->cache_write_mutex); 2667 2668 /* 2669 * Go through delayed refs for all the stuff we've just kicked off 2670 * and then loop back (just once) 2671 */ 2672 ret = btrfs_run_delayed_refs(trans, 0); 2673 if (!ret && loops == 0) { 2674 loops++; 2675 spin_lock(&cur_trans->dirty_bgs_lock); 2676 list_splice_init(&cur_trans->dirty_bgs, &dirty); 2677 /* 2678 * dirty_bgs_lock protects us from concurrent block group 2679 * deletes too (not just cache_write_mutex). 2680 */ 2681 if (!list_empty(&dirty)) { 2682 spin_unlock(&cur_trans->dirty_bgs_lock); 2683 goto again; 2684 } 2685 spin_unlock(&cur_trans->dirty_bgs_lock); 2686 } else if (ret < 0) { 2687 btrfs_cleanup_dirty_bgs(cur_trans, fs_info); 2688 } 2689 2690 btrfs_free_path(path); 2691 return ret; 2692 } 2693 2694 int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans) 2695 { 2696 struct btrfs_fs_info *fs_info = trans->fs_info; 2697 struct btrfs_block_group *cache; 2698 struct btrfs_transaction *cur_trans = trans->transaction; 2699 int ret = 0; 2700 int should_put; 2701 struct btrfs_path *path; 2702 struct list_head *io = &cur_trans->io_bgs; 2703 int num_started = 0; 2704 2705 path = btrfs_alloc_path(); 2706 if (!path) 2707 return -ENOMEM; 2708 2709 /* 2710 * Even though we are in the critical section of the transaction commit, 2711 * we can still have concurrent tasks adding elements to this 2712 * transaction's list of dirty block groups. These tasks correspond to 2713 * endio free space workers started when writeback finishes for a 2714 * space cache, which run inode.c:btrfs_finish_ordered_io(), and can 2715 * allocate new block groups as a result of COWing nodes of the root 2716 * tree when updating the free space inode. The writeback for the space 2717 * caches is triggered by an earlier call to 2718 * btrfs_start_dirty_block_groups() and iterations of the following 2719 * loop. 2720 * Also we want to do the cache_save_setup first and then run the 2721 * delayed refs to make sure we have the best chance at doing this all 2722 * in one shot. 2723 */ 2724 spin_lock(&cur_trans->dirty_bgs_lock); 2725 while (!list_empty(&cur_trans->dirty_bgs)) { 2726 cache = list_first_entry(&cur_trans->dirty_bgs, 2727 struct btrfs_block_group, 2728 dirty_list); 2729 2730 /* 2731 * This can happen if cache_save_setup re-dirties a block group 2732 * that is already under IO. 
Just wait for it to finish and 2733 * then do it all again 2734 */ 2735 if (!list_empty(&cache->io_list)) { 2736 spin_unlock(&cur_trans->dirty_bgs_lock); 2737 list_del_init(&cache->io_list); 2738 btrfs_wait_cache_io(trans, cache, path); 2739 btrfs_put_block_group(cache); 2740 spin_lock(&cur_trans->dirty_bgs_lock); 2741 } 2742 2743 /* 2744 * Don't remove from the dirty list until after we've waited on 2745 * any pending IO 2746 */ 2747 list_del_init(&cache->dirty_list); 2748 spin_unlock(&cur_trans->dirty_bgs_lock); 2749 should_put = 1; 2750 2751 cache_save_setup(cache, trans, path); 2752 2753 if (!ret) 2754 ret = btrfs_run_delayed_refs(trans, 2755 (unsigned long) -1); 2756 2757 if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP) { 2758 cache->io_ctl.inode = NULL; 2759 ret = btrfs_write_out_cache(trans, cache, path); 2760 if (ret == 0 && cache->io_ctl.inode) { 2761 num_started++; 2762 should_put = 0; 2763 list_add_tail(&cache->io_list, io); 2764 } else { 2765 /* 2766 * If we failed to write the cache, the 2767 * generation will be bad and life goes on 2768 */ 2769 ret = 0; 2770 } 2771 } 2772 if (!ret) { 2773 ret = update_block_group_item(trans, path, cache); 2774 /* 2775 * One of the free space endio workers might have 2776 * created a new block group while updating a free space 2777 * cache's inode (at inode.c:btrfs_finish_ordered_io()) 2778 * and hasn't released its transaction handle yet, in 2779 * which case the new block group is still attached to 2780 * its transaction handle and its creation has not 2781 * finished yet (no block group item in the extent tree 2782 * yet, etc). If this is the case, wait for all free 2783 * space endio workers to finish and retry. This is a 2784 * very rare case so no need for a more efficient and 2785 * complex approach. 
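			 * Waiting for cur_trans->num_writers to drop to 1 below
			 * guarantees those workers have ended their transaction
			 * handles, which is what flushes their pending block
			 * groups and inserts the missing block group item.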
2786 */ 2787 if (ret == -ENOENT) { 2788 wait_event(cur_trans->writer_wait, 2789 atomic_read(&cur_trans->num_writers) == 1); 2790 ret = update_block_group_item(trans, path, cache); 2791 } 2792 if (ret) 2793 btrfs_abort_transaction(trans, ret); 2794 } 2795 2796 /* If its not on the io list, we need to put the block group */ 2797 if (should_put) 2798 btrfs_put_block_group(cache); 2799 btrfs_delayed_refs_rsv_release(fs_info, 1); 2800 spin_lock(&cur_trans->dirty_bgs_lock); 2801 } 2802 spin_unlock(&cur_trans->dirty_bgs_lock); 2803 2804 /* 2805 * Refer to the definition of io_bgs member for details why it's safe 2806 * to use it without any locking 2807 */ 2808 while (!list_empty(io)) { 2809 cache = list_first_entry(io, struct btrfs_block_group, 2810 io_list); 2811 list_del_init(&cache->io_list); 2812 btrfs_wait_cache_io(trans, cache, path); 2813 btrfs_put_block_group(cache); 2814 } 2815 2816 btrfs_free_path(path); 2817 return ret; 2818 } 2819 2820 int btrfs_update_block_group(struct btrfs_trans_handle *trans, 2821 u64 bytenr, u64 num_bytes, int alloc) 2822 { 2823 struct btrfs_fs_info *info = trans->fs_info; 2824 struct btrfs_block_group *cache = NULL; 2825 u64 total = num_bytes; 2826 u64 old_val; 2827 u64 byte_in_group; 2828 int factor; 2829 int ret = 0; 2830 2831 /* Block accounting for super block */ 2832 spin_lock(&info->delalloc_root_lock); 2833 old_val = btrfs_super_bytes_used(info->super_copy); 2834 if (alloc) 2835 old_val += num_bytes; 2836 else 2837 old_val -= num_bytes; 2838 btrfs_set_super_bytes_used(info->super_copy, old_val); 2839 spin_unlock(&info->delalloc_root_lock); 2840 2841 while (total) { 2842 cache = btrfs_lookup_block_group(info, bytenr); 2843 if (!cache) { 2844 ret = -ENOENT; 2845 break; 2846 } 2847 factor = btrfs_bg_type_to_factor(cache->flags); 2848 2849 /* 2850 * If this block group has free space cache written out, we 2851 * need to make sure to load it if we are removing space. This 2852 * is because we need the unpinning stage to actually add the 2853 * space back to the block group, otherwise we will leak space. 
2854 */ 2855 if (!alloc && !btrfs_block_group_done(cache)) 2856 btrfs_cache_block_group(cache, 1); 2857 2858 byte_in_group = bytenr - cache->start; 2859 WARN_ON(byte_in_group > cache->length); 2860 2861 spin_lock(&cache->space_info->lock); 2862 spin_lock(&cache->lock); 2863 2864 if (btrfs_test_opt(info, SPACE_CACHE) && 2865 cache->disk_cache_state < BTRFS_DC_CLEAR) 2866 cache->disk_cache_state = BTRFS_DC_CLEAR; 2867 2868 old_val = cache->used; 2869 num_bytes = min(total, cache->length - byte_in_group); 2870 if (alloc) { 2871 old_val += num_bytes; 2872 cache->used = old_val; 2873 cache->reserved -= num_bytes; 2874 cache->space_info->bytes_reserved -= num_bytes; 2875 cache->space_info->bytes_used += num_bytes; 2876 cache->space_info->disk_used += num_bytes * factor; 2877 spin_unlock(&cache->lock); 2878 spin_unlock(&cache->space_info->lock); 2879 } else { 2880 old_val -= num_bytes; 2881 cache->used = old_val; 2882 cache->pinned += num_bytes; 2883 btrfs_space_info_update_bytes_pinned(info, 2884 cache->space_info, num_bytes); 2885 cache->space_info->bytes_used -= num_bytes; 2886 cache->space_info->disk_used -= num_bytes * factor; 2887 spin_unlock(&cache->lock); 2888 spin_unlock(&cache->space_info->lock); 2889 2890 percpu_counter_add_batch( 2891 &cache->space_info->total_bytes_pinned, 2892 num_bytes, 2893 BTRFS_TOTAL_BYTES_PINNED_BATCH); 2894 set_extent_dirty(&trans->transaction->pinned_extents, 2895 bytenr, bytenr + num_bytes - 1, 2896 GFP_NOFS | __GFP_NOFAIL); 2897 } 2898 2899 spin_lock(&trans->transaction->dirty_bgs_lock); 2900 if (list_empty(&cache->dirty_list)) { 2901 list_add_tail(&cache->dirty_list, 2902 &trans->transaction->dirty_bgs); 2903 trans->delayed_ref_updates++; 2904 btrfs_get_block_group(cache); 2905 } 2906 spin_unlock(&trans->transaction->dirty_bgs_lock); 2907 2908 /* 2909 * No longer have used bytes in this block group, queue it for 2910 * deletion. We do this after adding the block group to the 2911 * dirty list to avoid races between cleaner kthread and space 2912 * cache writeout. 2913 */ 2914 if (!alloc && old_val == 0) { 2915 if (!btrfs_test_opt(info, DISCARD_ASYNC)) 2916 btrfs_mark_bg_unused(cache); 2917 } 2918 2919 btrfs_put_block_group(cache); 2920 total -= num_bytes; 2921 bytenr += num_bytes; 2922 } 2923 2924 /* Modified block groups are accounted for in the delayed_refs_rsv. */ 2925 btrfs_update_delayed_refs_rsv(trans); 2926 return ret; 2927 } 2928 2929 /** 2930 * btrfs_add_reserved_bytes - update the block_group and space info counters 2931 * @cache: The cache we are manipulating 2932 * @ram_bytes: The number of bytes of file content, and will be same to 2933 * @num_bytes except for the compress path. 2934 * @num_bytes: The number of bytes in question 2935 * @delalloc: The blocks are allocated for the delalloc write 2936 * 2937 * This is called by the allocator when it reserves space. If this is a 2938 * reservation and the block group has become read only we cannot make the 2939 * reservation and return -EAGAIN, otherwise this function always succeeds. 
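 * Callers such as the extent allocator typically treat -EAGAIN as "this
 * block group just became read-only, try another one" rather than as a
 * hard failure.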
2940 */ 2941 int btrfs_add_reserved_bytes(struct btrfs_block_group *cache, 2942 u64 ram_bytes, u64 num_bytes, int delalloc) 2943 { 2944 struct btrfs_space_info *space_info = cache->space_info; 2945 int ret = 0; 2946 2947 spin_lock(&space_info->lock); 2948 spin_lock(&cache->lock); 2949 if (cache->ro) { 2950 ret = -EAGAIN; 2951 } else { 2952 cache->reserved += num_bytes; 2953 space_info->bytes_reserved += num_bytes; 2954 trace_btrfs_space_reservation(cache->fs_info, "space_info", 2955 space_info->flags, num_bytes, 1); 2956 btrfs_space_info_update_bytes_may_use(cache->fs_info, 2957 space_info, -ram_bytes); 2958 if (delalloc) 2959 cache->delalloc_bytes += num_bytes; 2960 2961 /* 2962 * Compression can use less space than we reserved, so wake 2963 * tickets if that happens 2964 */ 2965 if (num_bytes < ram_bytes) 2966 btrfs_try_granting_tickets(cache->fs_info, space_info); 2967 } 2968 spin_unlock(&cache->lock); 2969 spin_unlock(&space_info->lock); 2970 return ret; 2971 } 2972 2973 /** 2974 * btrfs_free_reserved_bytes - update the block_group and space info counters 2975 * @cache: The cache we are manipulating 2976 * @num_bytes: The number of bytes in question 2977 * @delalloc: The blocks are allocated for the delalloc write 2978 * 2979 * This is called by somebody who is freeing space that was never actually used 2980 * on disk. For example if you reserve some space for a new leaf in transaction 2981 * A and before transaction A commits you free that leaf, you call this with 2982 * reserve set to 0 in order to clear the reservation. 2983 */ 2984 void btrfs_free_reserved_bytes(struct btrfs_block_group *cache, 2985 u64 num_bytes, int delalloc) 2986 { 2987 struct btrfs_space_info *space_info = cache->space_info; 2988 2989 spin_lock(&space_info->lock); 2990 spin_lock(&cache->lock); 2991 if (cache->ro) 2992 space_info->bytes_readonly += num_bytes; 2993 cache->reserved -= num_bytes; 2994 space_info->bytes_reserved -= num_bytes; 2995 space_info->max_extent_size = 0; 2996 2997 if (delalloc) 2998 cache->delalloc_bytes -= num_bytes; 2999 spin_unlock(&cache->lock); 3000 3001 btrfs_try_granting_tickets(cache->fs_info, space_info); 3002 spin_unlock(&space_info->lock); 3003 } 3004 3005 static void force_metadata_allocation(struct btrfs_fs_info *info) 3006 { 3007 struct list_head *head = &info->space_info; 3008 struct btrfs_space_info *found; 3009 3010 list_for_each_entry(found, head, list) { 3011 if (found->flags & BTRFS_BLOCK_GROUP_METADATA) 3012 found->force_alloc = CHUNK_ALLOC_FORCE; 3013 } 3014 } 3015 3016 static int should_alloc_chunk(struct btrfs_fs_info *fs_info, 3017 struct btrfs_space_info *sinfo, int force) 3018 { 3019 u64 bytes_used = btrfs_space_info_used(sinfo, false); 3020 u64 thresh; 3021 3022 if (force == CHUNK_ALLOC_FORCE) 3023 return 1; 3024 3025 /* 3026 * in limited mode, we want to have some free space up to 3027 * about 1% of the FS size. 
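	 * That is max(64M, total_bytes / 100) below, so the 64M floor
	 * applies to filesystems smaller than roughly 6G.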
3028 */ 3029 if (force == CHUNK_ALLOC_LIMITED) { 3030 thresh = btrfs_super_total_bytes(fs_info->super_copy); 3031 thresh = max_t(u64, SZ_64M, div_factor_fine(thresh, 1)); 3032 3033 if (sinfo->total_bytes - bytes_used < thresh) 3034 return 1; 3035 } 3036 3037 if (bytes_used + SZ_2M < div_factor(sinfo->total_bytes, 8)) 3038 return 0; 3039 return 1; 3040 } 3041 3042 int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, u64 type) 3043 { 3044 u64 alloc_flags = btrfs_get_alloc_profile(trans->fs_info, type); 3045 3046 return btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE); 3047 } 3048 3049 /* 3050 * If force is CHUNK_ALLOC_FORCE: 3051 * - return 1 if it successfully allocates a chunk, 3052 * - return errors including -ENOSPC otherwise. 3053 * If force is NOT CHUNK_ALLOC_FORCE: 3054 * - return 0 if it doesn't need to allocate a new chunk, 3055 * - return 1 if it successfully allocates a chunk, 3056 * - return errors including -ENOSPC otherwise. 3057 */ 3058 int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags, 3059 enum btrfs_chunk_alloc_enum force) 3060 { 3061 struct btrfs_fs_info *fs_info = trans->fs_info; 3062 struct btrfs_space_info *space_info; 3063 bool wait_for_alloc = false; 3064 bool should_alloc = false; 3065 int ret = 0; 3066 3067 /* Don't re-enter if we're already allocating a chunk */ 3068 if (trans->allocating_chunk) 3069 return -ENOSPC; 3070 3071 space_info = btrfs_find_space_info(fs_info, flags); 3072 ASSERT(space_info); 3073 3074 do { 3075 spin_lock(&space_info->lock); 3076 if (force < space_info->force_alloc) 3077 force = space_info->force_alloc; 3078 should_alloc = should_alloc_chunk(fs_info, space_info, force); 3079 if (space_info->full) { 3080 /* No more free physical space */ 3081 if (should_alloc) 3082 ret = -ENOSPC; 3083 else 3084 ret = 0; 3085 spin_unlock(&space_info->lock); 3086 return ret; 3087 } else if (!should_alloc) { 3088 spin_unlock(&space_info->lock); 3089 return 0; 3090 } else if (space_info->chunk_alloc) { 3091 /* 3092 * Someone is already allocating, so we need to block 3093 * until this someone is finished and then loop to 3094 * recheck if we should continue with our allocation 3095 * attempt. 3096 */ 3097 wait_for_alloc = true; 3098 spin_unlock(&space_info->lock); 3099 mutex_lock(&fs_info->chunk_mutex); 3100 mutex_unlock(&fs_info->chunk_mutex); 3101 } else { 3102 /* Proceed with allocation */ 3103 space_info->chunk_alloc = 1; 3104 wait_for_alloc = false; 3105 spin_unlock(&space_info->lock); 3106 } 3107 3108 cond_resched(); 3109 } while (wait_for_alloc); 3110 3111 mutex_lock(&fs_info->chunk_mutex); 3112 trans->allocating_chunk = true; 3113 3114 /* 3115 * If we have mixed data/metadata chunks we want to make sure we keep 3116 * allocating mixed chunks instead of individual chunks. 3117 */ 3118 if (btrfs_mixed_space_info(space_info)) 3119 flags |= (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA); 3120 3121 /* 3122 * if we're doing a data chunk, go ahead and make sure that 3123 * we keep a reasonable number of metadata chunks allocated in the 3124 * FS as well. 3125 */ 3126 if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) { 3127 fs_info->data_chunk_allocations++; 3128 if (!(fs_info->data_chunk_allocations % 3129 fs_info->metadata_ratio)) 3130 force_metadata_allocation(fs_info); 3131 } 3132 3133 /* 3134 * Check if we have enough space in SYSTEM chunk because we may need 3135 * to update devices. 
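	 * (device items and the new chunk item live in the chunk tree,
	 * which is allocated from SYSTEM block groups).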
3136 */ 3137 check_system_chunk(trans, flags); 3138 3139 ret = btrfs_alloc_chunk(trans, flags); 3140 trans->allocating_chunk = false; 3141 3142 spin_lock(&space_info->lock); 3143 if (ret < 0) { 3144 if (ret == -ENOSPC) 3145 space_info->full = 1; 3146 else 3147 goto out; 3148 } else { 3149 ret = 1; 3150 space_info->max_extent_size = 0; 3151 } 3152 3153 space_info->force_alloc = CHUNK_ALLOC_NO_FORCE; 3154 out: 3155 space_info->chunk_alloc = 0; 3156 spin_unlock(&space_info->lock); 3157 mutex_unlock(&fs_info->chunk_mutex); 3158 /* 3159 * When we allocate a new chunk we reserve space in the chunk block 3160 * reserve to make sure we can COW nodes/leafs in the chunk tree or 3161 * add new nodes/leafs to it if we end up needing to do it when 3162 * inserting the chunk item and updating device items as part of the 3163 * second phase of chunk allocation, performed by 3164 * btrfs_finish_chunk_alloc(). So make sure we don't accumulate a 3165 * large number of new block groups to create in our transaction 3166 * handle's new_bgs list to avoid exhausting the chunk block reserve 3167 * in extreme cases - like having a single transaction create many new 3168 * block groups when starting to write out the free space caches of all 3169 * the block groups that were made dirty during the lifetime of the 3170 * transaction. 3171 */ 3172 if (trans->chunk_bytes_reserved >= (u64)SZ_2M) 3173 btrfs_create_pending_block_groups(trans); 3174 3175 return ret; 3176 } 3177 3178 static u64 get_profile_num_devs(struct btrfs_fs_info *fs_info, u64 type) 3179 { 3180 u64 num_dev; 3181 3182 num_dev = btrfs_raid_array[btrfs_bg_flags_to_raid_index(type)].devs_max; 3183 if (!num_dev) 3184 num_dev = fs_info->fs_devices->rw_devices; 3185 3186 return num_dev; 3187 } 3188 3189 /* 3190 * Reserve space in the system space for allocating or removing a chunk 3191 */ 3192 void check_system_chunk(struct btrfs_trans_handle *trans, u64 type) 3193 { 3194 struct btrfs_fs_info *fs_info = trans->fs_info; 3195 struct btrfs_space_info *info; 3196 u64 left; 3197 u64 thresh; 3198 int ret = 0; 3199 u64 num_devs; 3200 3201 /* 3202 * Needed because we can end up allocating a system chunk and for an 3203 * atomic and race free space reservation in the chunk block reserve. 3204 */ 3205 lockdep_assert_held(&fs_info->chunk_mutex); 3206 3207 info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM); 3208 spin_lock(&info->lock); 3209 left = info->total_bytes - btrfs_space_info_used(info, true); 3210 spin_unlock(&info->lock); 3211 3212 num_devs = get_profile_num_devs(fs_info, type); 3213 3214 /* num_devs device items to update and 1 chunk item to add or remove */ 3215 thresh = btrfs_calc_metadata_size(fs_info, num_devs) + 3216 btrfs_calc_insert_metadata_size(fs_info, 1); 3217 3218 if (left < thresh && btrfs_test_opt(fs_info, ENOSPC_DEBUG)) { 3219 btrfs_info(fs_info, "left=%llu, need=%llu, flags=%llu", 3220 left, thresh, type); 3221 btrfs_dump_space_info(fs_info, info, 0, 0); 3222 } 3223 3224 if (left < thresh) { 3225 u64 flags = btrfs_system_alloc_profile(fs_info); 3226 3227 /* 3228 * Ignore failure to create system chunk. We might end up not 3229 * needing it, as we might not need to COW all nodes/leafs from 3230 * the paths we visit in the chunk tree (they were already COWed 3231 * or created in the current transaction for example). 
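		 * If the space really is needed after all, the shortage will
		 * simply show up again later when the chunk tree is modified.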
3232 */ 3233 ret = btrfs_alloc_chunk(trans, flags); 3234 } 3235 3236 if (!ret) { 3237 ret = btrfs_block_rsv_add(fs_info->chunk_root, 3238 &fs_info->chunk_block_rsv, 3239 thresh, BTRFS_RESERVE_NO_FLUSH); 3240 if (!ret) 3241 trans->chunk_bytes_reserved += thresh; 3242 } 3243 } 3244 3245 void btrfs_put_block_group_cache(struct btrfs_fs_info *info) 3246 { 3247 struct btrfs_block_group *block_group; 3248 u64 last = 0; 3249 3250 while (1) { 3251 struct inode *inode; 3252 3253 block_group = btrfs_lookup_first_block_group(info, last); 3254 while (block_group) { 3255 btrfs_wait_block_group_cache_done(block_group); 3256 spin_lock(&block_group->lock); 3257 if (block_group->iref) 3258 break; 3259 spin_unlock(&block_group->lock); 3260 block_group = btrfs_next_block_group(block_group); 3261 } 3262 if (!block_group) { 3263 if (last == 0) 3264 break; 3265 last = 0; 3266 continue; 3267 } 3268 3269 inode = block_group->inode; 3270 block_group->iref = 0; 3271 block_group->inode = NULL; 3272 spin_unlock(&block_group->lock); 3273 ASSERT(block_group->io_ctl.inode == NULL); 3274 iput(inode); 3275 last = block_group->start + block_group->length; 3276 btrfs_put_block_group(block_group); 3277 } 3278 } 3279 3280 /* 3281 * Must be called only after stopping all workers, since we could have block 3282 * group caching kthreads running, and therefore they could race with us if we 3283 * freed the block groups before stopping them. 3284 */ 3285 int btrfs_free_block_groups(struct btrfs_fs_info *info) 3286 { 3287 struct btrfs_block_group *block_group; 3288 struct btrfs_space_info *space_info; 3289 struct btrfs_caching_control *caching_ctl; 3290 struct rb_node *n; 3291 3292 spin_lock(&info->block_group_cache_lock); 3293 while (!list_empty(&info->caching_block_groups)) { 3294 caching_ctl = list_entry(info->caching_block_groups.next, 3295 struct btrfs_caching_control, list); 3296 list_del(&caching_ctl->list); 3297 btrfs_put_caching_control(caching_ctl); 3298 } 3299 spin_unlock(&info->block_group_cache_lock); 3300 3301 spin_lock(&info->unused_bgs_lock); 3302 while (!list_empty(&info->unused_bgs)) { 3303 block_group = list_first_entry(&info->unused_bgs, 3304 struct btrfs_block_group, 3305 bg_list); 3306 list_del_init(&block_group->bg_list); 3307 btrfs_put_block_group(block_group); 3308 } 3309 spin_unlock(&info->unused_bgs_lock); 3310 3311 spin_lock(&info->block_group_cache_lock); 3312 while ((n = rb_last(&info->block_group_cache_tree)) != NULL) { 3313 block_group = rb_entry(n, struct btrfs_block_group, 3314 cache_node); 3315 rb_erase(&block_group->cache_node, 3316 &info->block_group_cache_tree); 3317 RB_CLEAR_NODE(&block_group->cache_node); 3318 spin_unlock(&info->block_group_cache_lock); 3319 3320 down_write(&block_group->space_info->groups_sem); 3321 list_del(&block_group->list); 3322 up_write(&block_group->space_info->groups_sem); 3323 3324 /* 3325 * We haven't cached this block group, which means we could 3326 * possibly have excluded extents on this block group. 
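		 * (the excluded ranges set up by exclude_super_stripes() are
		 * normally dropped only once caching finishes).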
3327 */ 3328 if (block_group->cached == BTRFS_CACHE_NO || 3329 block_group->cached == BTRFS_CACHE_ERROR) 3330 btrfs_free_excluded_extents(block_group); 3331 3332 btrfs_remove_free_space_cache(block_group); 3333 ASSERT(block_group->cached != BTRFS_CACHE_STARTED); 3334 ASSERT(list_empty(&block_group->dirty_list)); 3335 ASSERT(list_empty(&block_group->io_list)); 3336 ASSERT(list_empty(&block_group->bg_list)); 3337 ASSERT(refcount_read(&block_group->refs) == 1); 3338 btrfs_put_block_group(block_group); 3339 3340 spin_lock(&info->block_group_cache_lock); 3341 } 3342 spin_unlock(&info->block_group_cache_lock); 3343 3344 btrfs_release_global_block_rsv(info); 3345 3346 while (!list_empty(&info->space_info)) { 3347 space_info = list_entry(info->space_info.next, 3348 struct btrfs_space_info, 3349 list); 3350 3351 /* 3352 * Do not hide this behind enospc_debug, this is actually 3353 * important and indicates a real bug if this happens. 3354 */ 3355 if (WARN_ON(space_info->bytes_pinned > 0 || 3356 space_info->bytes_reserved > 0 || 3357 space_info->bytes_may_use > 0)) 3358 btrfs_dump_space_info(info, space_info, 0, 0); 3359 WARN_ON(space_info->reclaim_size > 0); 3360 list_del(&space_info->list); 3361 btrfs_sysfs_remove_space_info(space_info); 3362 } 3363 return 0; 3364 } 3365 3366 void btrfs_freeze_block_group(struct btrfs_block_group *cache) 3367 { 3368 atomic_inc(&cache->frozen); 3369 } 3370 3371 void btrfs_unfreeze_block_group(struct btrfs_block_group *block_group) 3372 { 3373 struct btrfs_fs_info *fs_info = block_group->fs_info; 3374 struct extent_map_tree *em_tree; 3375 struct extent_map *em; 3376 bool cleanup; 3377 3378 spin_lock(&block_group->lock); 3379 cleanup = (atomic_dec_and_test(&block_group->frozen) && 3380 block_group->removed); 3381 spin_unlock(&block_group->lock); 3382 3383 if (cleanup) { 3384 em_tree = &fs_info->mapping_tree; 3385 write_lock(&em_tree->lock); 3386 em = lookup_extent_mapping(em_tree, block_group->start, 3387 1); 3388 BUG_ON(!em); /* logic error, can't happen */ 3389 remove_extent_mapping(em_tree, em); 3390 write_unlock(&em_tree->lock); 3391 3392 /* once for us and once for the tree */ 3393 free_extent_map(em); 3394 free_extent_map(em); 3395 3396 /* 3397 * We may have left one free space entry and other possible 3398 * tasks trimming this block group have left 1 entry each one. 3399 * Free them if any. 3400 */ 3401 __btrfs_remove_free_space_cache(block_group->free_space_ctl); 3402 } 3403 } 3404