1 // SPDX-License-Identifier: GPL-2.0 2 3 #include "misc.h" 4 #include "ctree.h" 5 #include "space-info.h" 6 #include "sysfs.h" 7 #include "volumes.h" 8 #include "free-space-cache.h" 9 #include "ordered-data.h" 10 #include "transaction.h" 11 #include "block-group.h" 12 13 u64 btrfs_space_info_used(struct btrfs_space_info *s_info, 14 bool may_use_included) 15 { 16 ASSERT(s_info); 17 return s_info->bytes_used + s_info->bytes_reserved + 18 s_info->bytes_pinned + s_info->bytes_readonly + 19 (may_use_included ? s_info->bytes_may_use : 0); 20 } 21 22 /* 23 * after adding space to the filesystem, we need to clear the full flags 24 * on all the space infos. 25 */ 26 void btrfs_clear_space_info_full(struct btrfs_fs_info *info) 27 { 28 struct list_head *head = &info->space_info; 29 struct btrfs_space_info *found; 30 31 rcu_read_lock(); 32 list_for_each_entry_rcu(found, head, list) 33 found->full = 0; 34 rcu_read_unlock(); 35 } 36 37 static int create_space_info(struct btrfs_fs_info *info, u64 flags) 38 { 39 40 struct btrfs_space_info *space_info; 41 int i; 42 int ret; 43 44 space_info = kzalloc(sizeof(*space_info), GFP_NOFS); 45 if (!space_info) 46 return -ENOMEM; 47 48 ret = percpu_counter_init(&space_info->total_bytes_pinned, 0, 49 GFP_KERNEL); 50 if (ret) { 51 kfree(space_info); 52 return ret; 53 } 54 55 for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) 56 INIT_LIST_HEAD(&space_info->block_groups[i]); 57 init_rwsem(&space_info->groups_sem); 58 spin_lock_init(&space_info->lock); 59 space_info->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK; 60 space_info->force_alloc = CHUNK_ALLOC_NO_FORCE; 61 init_waitqueue_head(&space_info->wait); 62 INIT_LIST_HEAD(&space_info->ro_bgs); 63 INIT_LIST_HEAD(&space_info->tickets); 64 INIT_LIST_HEAD(&space_info->priority_tickets); 65 66 ret = btrfs_sysfs_add_space_info_type(info, space_info); 67 if (ret) 68 return ret; 69 70 list_add_rcu(&space_info->list, &info->space_info); 71 if (flags & BTRFS_BLOCK_GROUP_DATA) 72 info->data_sinfo = space_info; 73 74 return ret; 75 } 76 77 int btrfs_init_space_info(struct btrfs_fs_info *fs_info) 78 { 79 struct btrfs_super_block *disk_super; 80 u64 features; 81 u64 flags; 82 int mixed = 0; 83 int ret; 84 85 disk_super = fs_info->super_copy; 86 if (!btrfs_super_root(disk_super)) 87 return -EINVAL; 88 89 features = btrfs_super_incompat_flags(disk_super); 90 if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) 91 mixed = 1; 92 93 flags = BTRFS_BLOCK_GROUP_SYSTEM; 94 ret = create_space_info(fs_info, flags); 95 if (ret) 96 goto out; 97 98 if (mixed) { 99 flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA; 100 ret = create_space_info(fs_info, flags); 101 } else { 102 flags = BTRFS_BLOCK_GROUP_METADATA; 103 ret = create_space_info(fs_info, flags); 104 if (ret) 105 goto out; 106 107 flags = BTRFS_BLOCK_GROUP_DATA; 108 ret = create_space_info(fs_info, flags); 109 } 110 out: 111 return ret; 112 } 113 114 void btrfs_update_space_info(struct btrfs_fs_info *info, u64 flags, 115 u64 total_bytes, u64 bytes_used, 116 u64 bytes_readonly, 117 struct btrfs_space_info **space_info) 118 { 119 struct btrfs_space_info *found; 120 int factor; 121 122 factor = btrfs_bg_type_to_factor(flags); 123 124 found = btrfs_find_space_info(info, flags); 125 ASSERT(found); 126 spin_lock(&found->lock); 127 found->total_bytes += total_bytes; 128 found->disk_total += total_bytes * factor; 129 found->bytes_used += bytes_used; 130 found->disk_used += bytes_used * factor; 131 found->bytes_readonly += bytes_readonly; 132 if (total_bytes > 0) 133 found->full = 0; 134 btrfs_try_granting_tickets(info, found); 135 spin_unlock(&found->lock); 136 *space_info = found; 137 } 138 139 struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info, 140 u64 flags) 141 { 142 struct list_head *head = &info->space_info; 143 struct btrfs_space_info *found; 144 145 flags &= BTRFS_BLOCK_GROUP_TYPE_MASK; 146 147 rcu_read_lock(); 148 list_for_each_entry_rcu(found, head, list) { 149 if (found->flags & flags) { 150 rcu_read_unlock(); 151 return found; 152 } 153 } 154 rcu_read_unlock(); 155 return NULL; 156 } 157 158 static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global) 159 { 160 return (global->size << 1); 161 } 162 163 static int can_overcommit(struct btrfs_fs_info *fs_info, 164 struct btrfs_space_info *space_info, u64 bytes, 165 enum btrfs_reserve_flush_enum flush, 166 bool system_chunk) 167 { 168 u64 profile; 169 u64 avail; 170 u64 used; 171 int factor; 172 173 /* Don't overcommit when in mixed mode. */ 174 if (space_info->flags & BTRFS_BLOCK_GROUP_DATA) 175 return 0; 176 177 if (system_chunk) 178 profile = btrfs_system_alloc_profile(fs_info); 179 else 180 profile = btrfs_metadata_alloc_profile(fs_info); 181 182 used = btrfs_space_info_used(space_info, true); 183 avail = atomic64_read(&fs_info->free_chunk_space); 184 185 /* 186 * If we have dup, raid1 or raid10 then only half of the free 187 * space is actually usable. For raid56, the space info used 188 * doesn't include the parity drive, so we don't have to 189 * change the math 190 */ 191 factor = btrfs_bg_type_to_factor(profile); 192 avail = div_u64(avail, factor); 193 194 /* 195 * If we aren't flushing all things, let us overcommit up to 196 * 1/2th of the space. If we can flush, don't let us overcommit 197 * too much, let it overcommit up to 1/8 of the space. 198 */ 199 if (flush == BTRFS_RESERVE_FLUSH_ALL) 200 avail >>= 3; 201 else 202 avail >>= 1; 203 204 if (used + bytes < space_info->total_bytes + avail) 205 return 1; 206 return 0; 207 } 208 209 /* 210 * This is for space we already have accounted in space_info->bytes_may_use, so 211 * basically when we're returning space from block_rsv's. 212 */ 213 void btrfs_try_granting_tickets(struct btrfs_fs_info *fs_info, 214 struct btrfs_space_info *space_info) 215 { 216 struct list_head *head; 217 enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_NO_FLUSH; 218 219 lockdep_assert_held(&space_info->lock); 220 221 head = &space_info->priority_tickets; 222 again: 223 while (!list_empty(head)) { 224 struct reserve_ticket *ticket; 225 u64 used = btrfs_space_info_used(space_info, true); 226 227 ticket = list_first_entry(head, struct reserve_ticket, list); 228 229 /* Check and see if our ticket can be satisified now. */ 230 if ((used + ticket->bytes <= space_info->total_bytes) || 231 can_overcommit(fs_info, space_info, ticket->bytes, flush, 232 false)) { 233 btrfs_space_info_update_bytes_may_use(fs_info, 234 space_info, 235 ticket->bytes); 236 list_del_init(&ticket->list); 237 ticket->bytes = 0; 238 space_info->tickets_id++; 239 wake_up(&ticket->wait); 240 } else { 241 break; 242 } 243 } 244 245 if (head == &space_info->priority_tickets) { 246 head = &space_info->tickets; 247 flush = BTRFS_RESERVE_FLUSH_ALL; 248 goto again; 249 } 250 } 251 252 #define DUMP_BLOCK_RSV(fs_info, rsv_name) \ 253 do { \ 254 struct btrfs_block_rsv *__rsv = &(fs_info)->rsv_name; \ 255 spin_lock(&__rsv->lock); \ 256 btrfs_info(fs_info, #rsv_name ": size %llu reserved %llu", \ 257 __rsv->size, __rsv->reserved); \ 258 spin_unlock(&__rsv->lock); \ 259 } while (0) 260 261 static void __btrfs_dump_space_info(struct btrfs_fs_info *fs_info, 262 struct btrfs_space_info *info) 263 { 264 lockdep_assert_held(&info->lock); 265 266 btrfs_info(fs_info, "space_info %llu has %llu free, is %sfull", 267 info->flags, 268 info->total_bytes - btrfs_space_info_used(info, true), 269 info->full ? "" : "not "); 270 btrfs_info(fs_info, 271 "space_info total=%llu, used=%llu, pinned=%llu, reserved=%llu, may_use=%llu, readonly=%llu", 272 info->total_bytes, info->bytes_used, info->bytes_pinned, 273 info->bytes_reserved, info->bytes_may_use, 274 info->bytes_readonly); 275 276 DUMP_BLOCK_RSV(fs_info, global_block_rsv); 277 DUMP_BLOCK_RSV(fs_info, trans_block_rsv); 278 DUMP_BLOCK_RSV(fs_info, chunk_block_rsv); 279 DUMP_BLOCK_RSV(fs_info, delayed_block_rsv); 280 DUMP_BLOCK_RSV(fs_info, delayed_refs_rsv); 281 282 } 283 284 void btrfs_dump_space_info(struct btrfs_fs_info *fs_info, 285 struct btrfs_space_info *info, u64 bytes, 286 int dump_block_groups) 287 { 288 struct btrfs_block_group_cache *cache; 289 int index = 0; 290 291 spin_lock(&info->lock); 292 __btrfs_dump_space_info(fs_info, info); 293 spin_unlock(&info->lock); 294 295 if (!dump_block_groups) 296 return; 297 298 down_read(&info->groups_sem); 299 again: 300 list_for_each_entry(cache, &info->block_groups[index], list) { 301 spin_lock(&cache->lock); 302 btrfs_info(fs_info, 303 "block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %s", 304 cache->key.objectid, cache->key.offset, 305 btrfs_block_group_used(&cache->item), cache->pinned, 306 cache->reserved, cache->ro ? "[readonly]" : ""); 307 btrfs_dump_free_space(cache, bytes); 308 spin_unlock(&cache->lock); 309 } 310 if (++index < BTRFS_NR_RAID_TYPES) 311 goto again; 312 up_read(&info->groups_sem); 313 } 314 315 static void btrfs_writeback_inodes_sb_nr(struct btrfs_fs_info *fs_info, 316 unsigned long nr_pages, int nr_items) 317 { 318 struct super_block *sb = fs_info->sb; 319 320 if (down_read_trylock(&sb->s_umount)) { 321 writeback_inodes_sb_nr(sb, nr_pages, WB_REASON_FS_FREE_SPACE); 322 up_read(&sb->s_umount); 323 } else { 324 /* 325 * We needn't worry the filesystem going from r/w to r/o though 326 * we don't acquire ->s_umount mutex, because the filesystem 327 * should guarantee the delalloc inodes list be empty after 328 * the filesystem is readonly(all dirty pages are written to 329 * the disk). 330 */ 331 btrfs_start_delalloc_roots(fs_info, nr_items); 332 if (!current->journal_info) 333 btrfs_wait_ordered_roots(fs_info, nr_items, 0, (u64)-1); 334 } 335 } 336 337 static inline u64 calc_reclaim_items_nr(struct btrfs_fs_info *fs_info, 338 u64 to_reclaim) 339 { 340 u64 bytes; 341 u64 nr; 342 343 bytes = btrfs_calc_insert_metadata_size(fs_info, 1); 344 nr = div64_u64(to_reclaim, bytes); 345 if (!nr) 346 nr = 1; 347 return nr; 348 } 349 350 #define EXTENT_SIZE_PER_ITEM SZ_256K 351 352 /* 353 * shrink metadata reservation for delalloc 354 */ 355 static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim, 356 u64 orig, bool wait_ordered) 357 { 358 struct btrfs_space_info *space_info; 359 struct btrfs_trans_handle *trans; 360 u64 delalloc_bytes; 361 u64 dio_bytes; 362 u64 async_pages; 363 u64 items; 364 long time_left; 365 unsigned long nr_pages; 366 int loops; 367 368 /* Calc the number of the pages we need flush for space reservation */ 369 items = calc_reclaim_items_nr(fs_info, to_reclaim); 370 to_reclaim = items * EXTENT_SIZE_PER_ITEM; 371 372 trans = (struct btrfs_trans_handle *)current->journal_info; 373 space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); 374 375 delalloc_bytes = percpu_counter_sum_positive( 376 &fs_info->delalloc_bytes); 377 dio_bytes = percpu_counter_sum_positive(&fs_info->dio_bytes); 378 if (delalloc_bytes == 0 && dio_bytes == 0) { 379 if (trans) 380 return; 381 if (wait_ordered) 382 btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1); 383 return; 384 } 385 386 /* 387 * If we are doing more ordered than delalloc we need to just wait on 388 * ordered extents, otherwise we'll waste time trying to flush delalloc 389 * that likely won't give us the space back we need. 390 */ 391 if (dio_bytes > delalloc_bytes) 392 wait_ordered = true; 393 394 loops = 0; 395 while ((delalloc_bytes || dio_bytes) && loops < 3) { 396 nr_pages = min(delalloc_bytes, to_reclaim) >> PAGE_SHIFT; 397 398 /* 399 * Triggers inode writeback for up to nr_pages. This will invoke 400 * ->writepages callback and trigger delalloc filling 401 * (btrfs_run_delalloc_range()). 402 */ 403 btrfs_writeback_inodes_sb_nr(fs_info, nr_pages, items); 404 405 /* 406 * We need to wait for the compressed pages to start before 407 * we continue. 408 */ 409 async_pages = atomic_read(&fs_info->async_delalloc_pages); 410 if (!async_pages) 411 goto skip_async; 412 413 /* 414 * Calculate how many compressed pages we want to be written 415 * before we continue. I.e if there are more async pages than we 416 * require wait_event will wait until nr_pages are written. 417 */ 418 if (async_pages <= nr_pages) 419 async_pages = 0; 420 else 421 async_pages -= nr_pages; 422 423 wait_event(fs_info->async_submit_wait, 424 atomic_read(&fs_info->async_delalloc_pages) <= 425 (int)async_pages); 426 skip_async: 427 spin_lock(&space_info->lock); 428 if (list_empty(&space_info->tickets) && 429 list_empty(&space_info->priority_tickets)) { 430 spin_unlock(&space_info->lock); 431 break; 432 } 433 spin_unlock(&space_info->lock); 434 435 loops++; 436 if (wait_ordered && !trans) { 437 btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1); 438 } else { 439 time_left = schedule_timeout_killable(1); 440 if (time_left) 441 break; 442 } 443 delalloc_bytes = percpu_counter_sum_positive( 444 &fs_info->delalloc_bytes); 445 dio_bytes = percpu_counter_sum_positive(&fs_info->dio_bytes); 446 } 447 } 448 449 /** 450 * maybe_commit_transaction - possibly commit the transaction if its ok to 451 * @root - the root we're allocating for 452 * @bytes - the number of bytes we want to reserve 453 * @force - force the commit 454 * 455 * This will check to make sure that committing the transaction will actually 456 * get us somewhere and then commit the transaction if it does. Otherwise it 457 * will return -ENOSPC. 458 */ 459 static int may_commit_transaction(struct btrfs_fs_info *fs_info, 460 struct btrfs_space_info *space_info) 461 { 462 struct reserve_ticket *ticket = NULL; 463 struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_block_rsv; 464 struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv; 465 struct btrfs_trans_handle *trans; 466 u64 bytes_needed; 467 u64 reclaim_bytes = 0; 468 u64 cur_free_bytes = 0; 469 470 trans = (struct btrfs_trans_handle *)current->journal_info; 471 if (trans) 472 return -EAGAIN; 473 474 spin_lock(&space_info->lock); 475 cur_free_bytes = btrfs_space_info_used(space_info, true); 476 if (cur_free_bytes < space_info->total_bytes) 477 cur_free_bytes = space_info->total_bytes - cur_free_bytes; 478 else 479 cur_free_bytes = 0; 480 481 if (!list_empty(&space_info->priority_tickets)) 482 ticket = list_first_entry(&space_info->priority_tickets, 483 struct reserve_ticket, list); 484 else if (!list_empty(&space_info->tickets)) 485 ticket = list_first_entry(&space_info->tickets, 486 struct reserve_ticket, list); 487 bytes_needed = (ticket) ? ticket->bytes : 0; 488 489 if (bytes_needed > cur_free_bytes) 490 bytes_needed -= cur_free_bytes; 491 else 492 bytes_needed = 0; 493 spin_unlock(&space_info->lock); 494 495 if (!bytes_needed) 496 return 0; 497 498 trans = btrfs_join_transaction(fs_info->extent_root); 499 if (IS_ERR(trans)) 500 return PTR_ERR(trans); 501 502 /* 503 * See if there is enough pinned space to make this reservation, or if 504 * we have block groups that are going to be freed, allowing us to 505 * possibly do a chunk allocation the next loop through. 506 */ 507 if (test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags) || 508 __percpu_counter_compare(&space_info->total_bytes_pinned, 509 bytes_needed, 510 BTRFS_TOTAL_BYTES_PINNED_BATCH) >= 0) 511 goto commit; 512 513 /* 514 * See if there is some space in the delayed insertion reservation for 515 * this reservation. 516 */ 517 if (space_info != delayed_rsv->space_info) 518 goto enospc; 519 520 spin_lock(&delayed_rsv->lock); 521 reclaim_bytes += delayed_rsv->reserved; 522 spin_unlock(&delayed_rsv->lock); 523 524 spin_lock(&delayed_refs_rsv->lock); 525 reclaim_bytes += delayed_refs_rsv->reserved; 526 spin_unlock(&delayed_refs_rsv->lock); 527 if (reclaim_bytes >= bytes_needed) 528 goto commit; 529 bytes_needed -= reclaim_bytes; 530 531 if (__percpu_counter_compare(&space_info->total_bytes_pinned, 532 bytes_needed, 533 BTRFS_TOTAL_BYTES_PINNED_BATCH) < 0) 534 goto enospc; 535 536 commit: 537 return btrfs_commit_transaction(trans); 538 enospc: 539 btrfs_end_transaction(trans); 540 return -ENOSPC; 541 } 542 543 /* 544 * Try to flush some data based on policy set by @state. This is only advisory 545 * and may fail for various reasons. The caller is supposed to examine the 546 * state of @space_info to detect the outcome. 547 */ 548 static void flush_space(struct btrfs_fs_info *fs_info, 549 struct btrfs_space_info *space_info, u64 num_bytes, 550 int state) 551 { 552 struct btrfs_root *root = fs_info->extent_root; 553 struct btrfs_trans_handle *trans; 554 int nr; 555 int ret = 0; 556 557 switch (state) { 558 case FLUSH_DELAYED_ITEMS_NR: 559 case FLUSH_DELAYED_ITEMS: 560 if (state == FLUSH_DELAYED_ITEMS_NR) 561 nr = calc_reclaim_items_nr(fs_info, num_bytes) * 2; 562 else 563 nr = -1; 564 565 trans = btrfs_join_transaction(root); 566 if (IS_ERR(trans)) { 567 ret = PTR_ERR(trans); 568 break; 569 } 570 ret = btrfs_run_delayed_items_nr(trans, nr); 571 btrfs_end_transaction(trans); 572 break; 573 case FLUSH_DELALLOC: 574 case FLUSH_DELALLOC_WAIT: 575 shrink_delalloc(fs_info, num_bytes * 2, num_bytes, 576 state == FLUSH_DELALLOC_WAIT); 577 break; 578 case FLUSH_DELAYED_REFS_NR: 579 case FLUSH_DELAYED_REFS: 580 trans = btrfs_join_transaction(root); 581 if (IS_ERR(trans)) { 582 ret = PTR_ERR(trans); 583 break; 584 } 585 if (state == FLUSH_DELAYED_REFS_NR) 586 nr = calc_reclaim_items_nr(fs_info, num_bytes); 587 else 588 nr = 0; 589 btrfs_run_delayed_refs(trans, nr); 590 btrfs_end_transaction(trans); 591 break; 592 case ALLOC_CHUNK: 593 case ALLOC_CHUNK_FORCE: 594 trans = btrfs_join_transaction(root); 595 if (IS_ERR(trans)) { 596 ret = PTR_ERR(trans); 597 break; 598 } 599 ret = btrfs_chunk_alloc(trans, 600 btrfs_metadata_alloc_profile(fs_info), 601 (state == ALLOC_CHUNK) ? CHUNK_ALLOC_NO_FORCE : 602 CHUNK_ALLOC_FORCE); 603 btrfs_end_transaction(trans); 604 if (ret > 0 || ret == -ENOSPC) 605 ret = 0; 606 break; 607 case RUN_DELAYED_IPUTS: 608 /* 609 * If we have pending delayed iputs then we could free up a 610 * bunch of pinned space, so make sure we run the iputs before 611 * we do our pinned bytes check below. 612 */ 613 btrfs_run_delayed_iputs(fs_info); 614 btrfs_wait_on_delayed_iputs(fs_info); 615 break; 616 case COMMIT_TRANS: 617 ret = may_commit_transaction(fs_info, space_info); 618 break; 619 default: 620 ret = -ENOSPC; 621 break; 622 } 623 624 trace_btrfs_flush_space(fs_info, space_info->flags, num_bytes, state, 625 ret); 626 return; 627 } 628 629 static inline u64 630 btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info, 631 struct btrfs_space_info *space_info, 632 bool system_chunk) 633 { 634 struct reserve_ticket *ticket; 635 u64 used; 636 u64 expected; 637 u64 to_reclaim = 0; 638 639 list_for_each_entry(ticket, &space_info->tickets, list) 640 to_reclaim += ticket->bytes; 641 list_for_each_entry(ticket, &space_info->priority_tickets, list) 642 to_reclaim += ticket->bytes; 643 if (to_reclaim) 644 return to_reclaim; 645 646 to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M); 647 if (can_overcommit(fs_info, space_info, to_reclaim, 648 BTRFS_RESERVE_FLUSH_ALL, system_chunk)) 649 return 0; 650 651 used = btrfs_space_info_used(space_info, true); 652 653 if (can_overcommit(fs_info, space_info, SZ_1M, 654 BTRFS_RESERVE_FLUSH_ALL, system_chunk)) 655 expected = div_factor_fine(space_info->total_bytes, 95); 656 else 657 expected = div_factor_fine(space_info->total_bytes, 90); 658 659 if (used > expected) 660 to_reclaim = used - expected; 661 else 662 to_reclaim = 0; 663 to_reclaim = min(to_reclaim, space_info->bytes_may_use + 664 space_info->bytes_reserved); 665 return to_reclaim; 666 } 667 668 static inline int need_do_async_reclaim(struct btrfs_fs_info *fs_info, 669 struct btrfs_space_info *space_info, 670 u64 used, bool system_chunk) 671 { 672 u64 thresh = div_factor_fine(space_info->total_bytes, 98); 673 674 /* If we're just plain full then async reclaim just slows us down. */ 675 if ((space_info->bytes_used + space_info->bytes_reserved) >= thresh) 676 return 0; 677 678 if (!btrfs_calc_reclaim_metadata_size(fs_info, space_info, 679 system_chunk)) 680 return 0; 681 682 return (used >= thresh && !btrfs_fs_closing(fs_info) && 683 !test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state)); 684 } 685 686 /* 687 * maybe_fail_all_tickets - we've exhausted our flushing, start failing tickets 688 * @fs_info - fs_info for this fs 689 * @space_info - the space info we were flushing 690 * 691 * We call this when we've exhausted our flushing ability and haven't made 692 * progress in satisfying tickets. The reservation code handles tickets in 693 * order, so if there is a large ticket first and then smaller ones we could 694 * very well satisfy the smaller tickets. This will attempt to wake up any 695 * tickets in the list to catch this case. 696 * 697 * This function returns true if it was able to make progress by clearing out 698 * other tickets, or if it stumbles across a ticket that was smaller than the 699 * first ticket. 700 */ 701 static bool maybe_fail_all_tickets(struct btrfs_fs_info *fs_info, 702 struct btrfs_space_info *space_info) 703 { 704 struct reserve_ticket *ticket; 705 u64 tickets_id = space_info->tickets_id; 706 u64 first_ticket_bytes = 0; 707 708 if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) { 709 btrfs_info(fs_info, "cannot satisfy tickets, dumping space info"); 710 __btrfs_dump_space_info(fs_info, space_info); 711 } 712 713 while (!list_empty(&space_info->tickets) && 714 tickets_id == space_info->tickets_id) { 715 ticket = list_first_entry(&space_info->tickets, 716 struct reserve_ticket, list); 717 718 /* 719 * may_commit_transaction will avoid committing the transaction 720 * if it doesn't feel like the space reclaimed by the commit 721 * would result in the ticket succeeding. However if we have a 722 * smaller ticket in the queue it may be small enough to be 723 * satisified by committing the transaction, so if any 724 * subsequent ticket is smaller than the first ticket go ahead 725 * and send us back for another loop through the enospc flushing 726 * code. 727 */ 728 if (first_ticket_bytes == 0) 729 first_ticket_bytes = ticket->bytes; 730 else if (first_ticket_bytes > ticket->bytes) 731 return true; 732 733 if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) 734 btrfs_info(fs_info, "failing ticket with %llu bytes", 735 ticket->bytes); 736 737 list_del_init(&ticket->list); 738 ticket->error = -ENOSPC; 739 wake_up(&ticket->wait); 740 741 /* 742 * We're just throwing tickets away, so more flushing may not 743 * trip over btrfs_try_granting_tickets, so we need to call it 744 * here to see if we can make progress with the next ticket in 745 * the list. 746 */ 747 btrfs_try_granting_tickets(fs_info, space_info); 748 } 749 return (tickets_id != space_info->tickets_id); 750 } 751 752 /* 753 * This is for normal flushers, we can wait all goddamned day if we want to. We 754 * will loop and continuously try to flush as long as we are making progress. 755 * We count progress as clearing off tickets each time we have to loop. 756 */ 757 static void btrfs_async_reclaim_metadata_space(struct work_struct *work) 758 { 759 struct btrfs_fs_info *fs_info; 760 struct btrfs_space_info *space_info; 761 u64 to_reclaim; 762 int flush_state; 763 int commit_cycles = 0; 764 u64 last_tickets_id; 765 766 fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work); 767 space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); 768 769 spin_lock(&space_info->lock); 770 to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info, 771 false); 772 if (!to_reclaim) { 773 space_info->flush = 0; 774 spin_unlock(&space_info->lock); 775 return; 776 } 777 last_tickets_id = space_info->tickets_id; 778 spin_unlock(&space_info->lock); 779 780 flush_state = FLUSH_DELAYED_ITEMS_NR; 781 do { 782 flush_space(fs_info, space_info, to_reclaim, flush_state); 783 spin_lock(&space_info->lock); 784 if (list_empty(&space_info->tickets)) { 785 space_info->flush = 0; 786 spin_unlock(&space_info->lock); 787 return; 788 } 789 to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, 790 space_info, 791 false); 792 if (last_tickets_id == space_info->tickets_id) { 793 flush_state++; 794 } else { 795 last_tickets_id = space_info->tickets_id; 796 flush_state = FLUSH_DELAYED_ITEMS_NR; 797 if (commit_cycles) 798 commit_cycles--; 799 } 800 801 /* 802 * We don't want to force a chunk allocation until we've tried 803 * pretty hard to reclaim space. Think of the case where we 804 * freed up a bunch of space and so have a lot of pinned space 805 * to reclaim. We would rather use that than possibly create a 806 * underutilized metadata chunk. So if this is our first run 807 * through the flushing state machine skip ALLOC_CHUNK_FORCE and 808 * commit the transaction. If nothing has changed the next go 809 * around then we can force a chunk allocation. 810 */ 811 if (flush_state == ALLOC_CHUNK_FORCE && !commit_cycles) 812 flush_state++; 813 814 if (flush_state > COMMIT_TRANS) { 815 commit_cycles++; 816 if (commit_cycles > 2) { 817 if (maybe_fail_all_tickets(fs_info, space_info)) { 818 flush_state = FLUSH_DELAYED_ITEMS_NR; 819 commit_cycles--; 820 } else { 821 space_info->flush = 0; 822 } 823 } else { 824 flush_state = FLUSH_DELAYED_ITEMS_NR; 825 } 826 } 827 spin_unlock(&space_info->lock); 828 } while (flush_state <= COMMIT_TRANS); 829 } 830 831 void btrfs_init_async_reclaim_work(struct work_struct *work) 832 { 833 INIT_WORK(work, btrfs_async_reclaim_metadata_space); 834 } 835 836 static const enum btrfs_flush_state priority_flush_states[] = { 837 FLUSH_DELAYED_ITEMS_NR, 838 FLUSH_DELAYED_ITEMS, 839 ALLOC_CHUNK, 840 }; 841 842 static const enum btrfs_flush_state evict_flush_states[] = { 843 FLUSH_DELAYED_ITEMS_NR, 844 FLUSH_DELAYED_ITEMS, 845 FLUSH_DELAYED_REFS_NR, 846 FLUSH_DELAYED_REFS, 847 FLUSH_DELALLOC, 848 FLUSH_DELALLOC_WAIT, 849 ALLOC_CHUNK, 850 COMMIT_TRANS, 851 }; 852 853 static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info, 854 struct btrfs_space_info *space_info, 855 struct reserve_ticket *ticket, 856 const enum btrfs_flush_state *states, 857 int states_nr) 858 { 859 u64 to_reclaim; 860 int flush_state; 861 862 spin_lock(&space_info->lock); 863 to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info, 864 false); 865 if (!to_reclaim) { 866 spin_unlock(&space_info->lock); 867 return; 868 } 869 spin_unlock(&space_info->lock); 870 871 flush_state = 0; 872 do { 873 flush_space(fs_info, space_info, to_reclaim, states[flush_state]); 874 flush_state++; 875 spin_lock(&space_info->lock); 876 if (ticket->bytes == 0) { 877 spin_unlock(&space_info->lock); 878 return; 879 } 880 spin_unlock(&space_info->lock); 881 } while (flush_state < states_nr); 882 } 883 884 static void wait_reserve_ticket(struct btrfs_fs_info *fs_info, 885 struct btrfs_space_info *space_info, 886 struct reserve_ticket *ticket) 887 888 { 889 DEFINE_WAIT(wait); 890 int ret = 0; 891 892 spin_lock(&space_info->lock); 893 while (ticket->bytes > 0 && ticket->error == 0) { 894 ret = prepare_to_wait_event(&ticket->wait, &wait, TASK_KILLABLE); 895 if (ret) { 896 /* 897 * Delete us from the list. After we unlock the space 898 * info, we don't want the async reclaim job to reserve 899 * space for this ticket. If that would happen, then the 900 * ticket's task would not known that space was reserved 901 * despite getting an error, resulting in a space leak 902 * (bytes_may_use counter of our space_info). 903 */ 904 list_del_init(&ticket->list); 905 ticket->error = -EINTR; 906 break; 907 } 908 spin_unlock(&space_info->lock); 909 910 schedule(); 911 912 finish_wait(&ticket->wait, &wait); 913 spin_lock(&space_info->lock); 914 } 915 spin_unlock(&space_info->lock); 916 } 917 918 /** 919 * handle_reserve_ticket - do the appropriate flushing and waiting for a ticket 920 * @fs_info - the fs 921 * @space_info - the space_info for the reservation 922 * @ticket - the ticket for the reservation 923 * @flush - how much we can flush 924 * 925 * This does the work of figuring out how to flush for the ticket, waiting for 926 * the reservation, and returning the appropriate error if there is one. 927 */ 928 static int handle_reserve_ticket(struct btrfs_fs_info *fs_info, 929 struct btrfs_space_info *space_info, 930 struct reserve_ticket *ticket, 931 enum btrfs_reserve_flush_enum flush) 932 { 933 int ret; 934 935 switch (flush) { 936 case BTRFS_RESERVE_FLUSH_ALL: 937 wait_reserve_ticket(fs_info, space_info, ticket); 938 break; 939 case BTRFS_RESERVE_FLUSH_LIMIT: 940 priority_reclaim_metadata_space(fs_info, space_info, ticket, 941 priority_flush_states, 942 ARRAY_SIZE(priority_flush_states)); 943 break; 944 case BTRFS_RESERVE_FLUSH_EVICT: 945 priority_reclaim_metadata_space(fs_info, space_info, ticket, 946 evict_flush_states, 947 ARRAY_SIZE(evict_flush_states)); 948 break; 949 default: 950 ASSERT(0); 951 break; 952 } 953 954 spin_lock(&space_info->lock); 955 ret = ticket->error; 956 if (ticket->bytes || ticket->error) { 957 /* 958 * Need to delete here for priority tickets. For regular tickets 959 * either the async reclaim job deletes the ticket from the list 960 * or we delete it ourselves at wait_reserve_ticket(). 961 */ 962 list_del_init(&ticket->list); 963 if (!ret) 964 ret = -ENOSPC; 965 } 966 spin_unlock(&space_info->lock); 967 ASSERT(list_empty(&ticket->list)); 968 /* 969 * Check that we can't have an error set if the reservation succeeded, 970 * as that would confuse tasks and lead them to error out without 971 * releasing reserved space (if an error happens the expectation is that 972 * space wasn't reserved at all). 973 */ 974 ASSERT(!(ticket->bytes == 0 && ticket->error)); 975 return ret; 976 } 977 978 /** 979 * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space 980 * @root - the root we're allocating for 981 * @space_info - the space info we want to allocate from 982 * @orig_bytes - the number of bytes we want 983 * @flush - whether or not we can flush to make our reservation 984 * 985 * This will reserve orig_bytes number of bytes from the space info associated 986 * with the block_rsv. If there is not enough space it will make an attempt to 987 * flush out space to make room. It will do this by flushing delalloc if 988 * possible or committing the transaction. If flush is 0 then no attempts to 989 * regain reservations will be made and this will fail if there is not enough 990 * space already. 991 */ 992 static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info, 993 struct btrfs_space_info *space_info, 994 u64 orig_bytes, 995 enum btrfs_reserve_flush_enum flush, 996 bool system_chunk) 997 { 998 struct reserve_ticket ticket; 999 u64 used; 1000 int ret = 0; 1001 bool pending_tickets; 1002 1003 ASSERT(orig_bytes); 1004 ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_ALL); 1005 1006 spin_lock(&space_info->lock); 1007 ret = -ENOSPC; 1008 used = btrfs_space_info_used(space_info, true); 1009 pending_tickets = !list_empty(&space_info->tickets) || 1010 !list_empty(&space_info->priority_tickets); 1011 1012 /* 1013 * Carry on if we have enough space (short-circuit) OR call 1014 * can_overcommit() to ensure we can overcommit to continue. 1015 */ 1016 if (!pending_tickets && 1017 ((used + orig_bytes <= space_info->total_bytes) || 1018 can_overcommit(fs_info, space_info, orig_bytes, flush, 1019 system_chunk))) { 1020 btrfs_space_info_update_bytes_may_use(fs_info, space_info, 1021 orig_bytes); 1022 ret = 0; 1023 } 1024 1025 /* 1026 * If we couldn't make a reservation then setup our reservation ticket 1027 * and kick the async worker if it's not already running. 1028 * 1029 * If we are a priority flusher then we just need to add our ticket to 1030 * the list and we will do our own flushing further down. 1031 */ 1032 if (ret && flush != BTRFS_RESERVE_NO_FLUSH) { 1033 ticket.bytes = orig_bytes; 1034 ticket.error = 0; 1035 init_waitqueue_head(&ticket.wait); 1036 if (flush == BTRFS_RESERVE_FLUSH_ALL) { 1037 list_add_tail(&ticket.list, &space_info->tickets); 1038 if (!space_info->flush) { 1039 space_info->flush = 1; 1040 trace_btrfs_trigger_flush(fs_info, 1041 space_info->flags, 1042 orig_bytes, flush, 1043 "enospc"); 1044 queue_work(system_unbound_wq, 1045 &fs_info->async_reclaim_work); 1046 } 1047 } else { 1048 list_add_tail(&ticket.list, 1049 &space_info->priority_tickets); 1050 } 1051 } else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) { 1052 used += orig_bytes; 1053 /* 1054 * We will do the space reservation dance during log replay, 1055 * which means we won't have fs_info->fs_root set, so don't do 1056 * the async reclaim as we will panic. 1057 */ 1058 if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags) && 1059 need_do_async_reclaim(fs_info, space_info, 1060 used, system_chunk) && 1061 !work_busy(&fs_info->async_reclaim_work)) { 1062 trace_btrfs_trigger_flush(fs_info, space_info->flags, 1063 orig_bytes, flush, "preempt"); 1064 queue_work(system_unbound_wq, 1065 &fs_info->async_reclaim_work); 1066 } 1067 } 1068 spin_unlock(&space_info->lock); 1069 if (!ret || flush == BTRFS_RESERVE_NO_FLUSH) 1070 return ret; 1071 1072 return handle_reserve_ticket(fs_info, space_info, &ticket, flush); 1073 } 1074 1075 /** 1076 * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space 1077 * @root - the root we're allocating for 1078 * @block_rsv - the block_rsv we're allocating for 1079 * @orig_bytes - the number of bytes we want 1080 * @flush - whether or not we can flush to make our reservation 1081 * 1082 * This will reserve orig_bytes number of bytes from the space info associated 1083 * with the block_rsv. If there is not enough space it will make an attempt to 1084 * flush out space to make room. It will do this by flushing delalloc if 1085 * possible or committing the transaction. If flush is 0 then no attempts to 1086 * regain reservations will be made and this will fail if there is not enough 1087 * space already. 1088 */ 1089 int btrfs_reserve_metadata_bytes(struct btrfs_root *root, 1090 struct btrfs_block_rsv *block_rsv, 1091 u64 orig_bytes, 1092 enum btrfs_reserve_flush_enum flush) 1093 { 1094 struct btrfs_fs_info *fs_info = root->fs_info; 1095 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; 1096 int ret; 1097 bool system_chunk = (root == fs_info->chunk_root); 1098 1099 ret = __reserve_metadata_bytes(fs_info, block_rsv->space_info, 1100 orig_bytes, flush, system_chunk); 1101 if (ret == -ENOSPC && 1102 unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) { 1103 if (block_rsv != global_rsv && 1104 !btrfs_block_rsv_use_bytes(global_rsv, orig_bytes)) 1105 ret = 0; 1106 } 1107 if (ret == -ENOSPC) { 1108 trace_btrfs_space_reservation(fs_info, "space_info:enospc", 1109 block_rsv->space_info->flags, 1110 orig_bytes, 1); 1111 1112 if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) 1113 btrfs_dump_space_info(fs_info, block_rsv->space_info, 1114 orig_bytes, 0); 1115 } 1116 return ret; 1117 } 1118