// SPDX-License-Identifier: GPL-2.0

#include "ctree.h"
#include "space-info.h"
#include "sysfs.h"
#include "volumes.h"
#include "free-space-cache.h"
#include "ordered-data.h"
#include "transaction.h"
#include "math.h"

u64 btrfs_space_info_used(struct btrfs_space_info *s_info,
			  bool may_use_included)
{
	ASSERT(s_info);
	return s_info->bytes_used + s_info->bytes_reserved +
		s_info->bytes_pinned + s_info->bytes_readonly +
		(may_use_included ? s_info->bytes_may_use : 0);
}

/*
 * after adding space to the filesystem, we need to clear the full flags
 * on all the space infos.
 */
void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
{
	struct list_head *head = &info->space_info;
	struct btrfs_space_info *found;

	rcu_read_lock();
	list_for_each_entry_rcu(found, head, list)
		found->full = 0;
	rcu_read_unlock();
}

static const char *alloc_name(u64 flags)
{
	switch (flags) {
	case BTRFS_BLOCK_GROUP_METADATA|BTRFS_BLOCK_GROUP_DATA:
		return "mixed";
	case BTRFS_BLOCK_GROUP_METADATA:
		return "metadata";
	case BTRFS_BLOCK_GROUP_DATA:
		return "data";
	case BTRFS_BLOCK_GROUP_SYSTEM:
		return "system";
	default:
		WARN_ON(1);
		return "invalid-combination";
	}
}

static int create_space_info(struct btrfs_fs_info *info, u64 flags)
{
	struct btrfs_space_info *space_info;
	int i;
	int ret;

	space_info = kzalloc(sizeof(*space_info), GFP_NOFS);
	if (!space_info)
		return -ENOMEM;

	ret = percpu_counter_init(&space_info->total_bytes_pinned, 0,
				  GFP_KERNEL);
	if (ret) {
		kfree(space_info);
		return ret;
	}

	for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
		INIT_LIST_HEAD(&space_info->block_groups[i]);
	init_rwsem(&space_info->groups_sem);
	spin_lock_init(&space_info->lock);
	space_info->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK;
	space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
	init_waitqueue_head(&space_info->wait);
	INIT_LIST_HEAD(&space_info->ro_bgs);
	INIT_LIST_HEAD(&space_info->tickets);
	INIT_LIST_HEAD(&space_info->priority_tickets);

	ret = kobject_init_and_add(&space_info->kobj, &space_info_ktype,
				   info->space_info_kobj, "%s",
				   alloc_name(space_info->flags));
	if (ret) {
		kobject_put(&space_info->kobj);
		return ret;
	}

	list_add_rcu(&space_info->list, &info->space_info);
	if (flags & BTRFS_BLOCK_GROUP_DATA)
		info->data_sinfo = space_info;

	return ret;
}

int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
{
	struct btrfs_super_block *disk_super;
	u64 features;
	u64 flags;
	int mixed = 0;
	int ret;

	disk_super = fs_info->super_copy;
	if (!btrfs_super_root(disk_super))
		return -EINVAL;

	features = btrfs_super_incompat_flags(disk_super);
	if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
		mixed = 1;

	flags = BTRFS_BLOCK_GROUP_SYSTEM;
	ret = create_space_info(fs_info, flags);
	if (ret)
		goto out;

	if (mixed) {
		flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA;
		ret = create_space_info(fs_info, flags);
	} else {
		flags = BTRFS_BLOCK_GROUP_METADATA;
		ret = create_space_info(fs_info, flags);
		if (ret)
			goto out;

		flags = BTRFS_BLOCK_GROUP_DATA;
		ret = create_space_info(fs_info, flags);
	}
out:
	return ret;
}

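/*
 * Update the in-memory counters of a space_info when block group space is
 * added: bump the total, used and readonly byte counts and hand any newly
 * available space to waiting tickets.
 */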
void btrfs_update_space_info(struct btrfs_fs_info *info, u64 flags,
			     u64 total_bytes, u64 bytes_used,
			     u64 bytes_readonly,
			     struct btrfs_space_info **space_info)
{
	struct btrfs_space_info *found;
	int factor;

	factor = btrfs_bg_type_to_factor(flags);

	found = btrfs_find_space_info(info, flags);
	ASSERT(found);
	spin_lock(&found->lock);
	found->total_bytes += total_bytes;
	found->disk_total += total_bytes * factor;
	found->bytes_used += bytes_used;
	found->disk_used += bytes_used * factor;
	found->bytes_readonly += bytes_readonly;
	if (total_bytes > 0)
		found->full = 0;
	btrfs_space_info_add_new_bytes(info, found,
				       total_bytes - bytes_used -
				       bytes_readonly);
	spin_unlock(&found->lock);
	*space_info = found;
}

struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info,
					       u64 flags)
{
	struct list_head *head = &info->space_info;
	struct btrfs_space_info *found;

	flags &= BTRFS_BLOCK_GROUP_TYPE_MASK;

	rcu_read_lock();
	list_for_each_entry_rcu(found, head, list) {
		if (found->flags & flags) {
			rcu_read_unlock();
			return found;
		}
	}
	rcu_read_unlock();
	return NULL;
}

static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global)
{
	return (global->size << 1);
}

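/*
 * Decide whether a metadata (or system) reservation of @bytes may overcommit
 * the currently allocated chunks, based on the unallocated device space and
 * how aggressively we are allowed to flush.
 */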
static int can_overcommit(struct btrfs_fs_info *fs_info,
			  struct btrfs_space_info *space_info, u64 bytes,
			  enum btrfs_reserve_flush_enum flush,
			  bool system_chunk)
{
	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
	u64 profile;
	u64 space_size;
	u64 avail;
	u64 used;
	int factor;

	/* Don't overcommit when in mixed mode. */
	if (space_info->flags & BTRFS_BLOCK_GROUP_DATA)
		return 0;

	if (system_chunk)
		profile = btrfs_system_alloc_profile(fs_info);
	else
		profile = btrfs_metadata_alloc_profile(fs_info);

	used = btrfs_space_info_used(space_info, false);

	/*
	 * We only want to allow overcommitting if we have lots of actual space
	 * free, but if we don't have enough space to handle the global reserve
	 * space then we could end up having a real enospc problem when trying
	 * to allocate a chunk or some other such important allocation.
	 */
	spin_lock(&global_rsv->lock);
	space_size = calc_global_rsv_need_space(global_rsv);
	spin_unlock(&global_rsv->lock);
	if (used + space_size >= space_info->total_bytes)
		return 0;

	used += space_info->bytes_may_use;

	avail = atomic64_read(&fs_info->free_chunk_space);

	/*
	 * If we have dup, raid1 or raid10 then only half of the free
	 * space is actually usable. For raid56, the space info used
	 * doesn't include the parity drive, so we don't have to
	 * change the math.
	 */
	factor = btrfs_bg_type_to_factor(profile);
	avail = div_u64(avail, factor);

	/*
	 * If we aren't flushing all things, let us overcommit up to
	 * half of the space. If we can flush, don't let us overcommit
	 * too much, let it overcommit up to 1/8 of the space.
	 */
	if (flush == BTRFS_RESERVE_FLUSH_ALL)
		avail >>= 3;
	else
		avail >>= 1;

	if (used + bytes < space_info->total_bytes + avail)
		return 1;
	return 0;
}

/*
 * This is for space we already have accounted in space_info->bytes_may_use, so
 * basically when we're returning space from block_rsv's.
 */
void btrfs_space_info_add_old_bytes(struct btrfs_fs_info *fs_info,
				    struct btrfs_space_info *space_info,
				    u64 num_bytes)
{
	struct reserve_ticket *ticket;
	struct list_head *head;
	u64 used;
	enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_NO_FLUSH;
	bool check_overcommit = false;

	spin_lock(&space_info->lock);
	head = &space_info->priority_tickets;

	/*
	 * If we are over our limit then we need to check and see if we can
	 * overcommit, and if we can't then we just need to free up our space
	 * and not satisfy any requests.
	 */
	used = btrfs_space_info_used(space_info, true);
	if (used - num_bytes >= space_info->total_bytes)
		check_overcommit = true;
again:
	while (!list_empty(head) && num_bytes) {
		ticket = list_first_entry(head, struct reserve_ticket,
					  list);
		/*
		 * We use 0 bytes because this space is already reserved, so
		 * adding the ticket space would be a double count.
		 */
		if (check_overcommit &&
		    !can_overcommit(fs_info, space_info, 0, flush, false))
			break;
		if (num_bytes >= ticket->bytes) {
			list_del_init(&ticket->list);
			num_bytes -= ticket->bytes;
			ticket->bytes = 0;
			space_info->tickets_id++;
			wake_up(&ticket->wait);
		} else {
			ticket->bytes -= num_bytes;
			num_bytes = 0;
		}
	}

	if (num_bytes && head == &space_info->priority_tickets) {
		head = &space_info->tickets;
		flush = BTRFS_RESERVE_FLUSH_ALL;
		goto again;
	}
	btrfs_space_info_update_bytes_may_use(fs_info, space_info, -num_bytes);
	trace_btrfs_space_reservation(fs_info, "space_info",
				      space_info->flags, num_bytes, 0);
	spin_unlock(&space_info->lock);
}

/*
 * This is for newly allocated space that isn't accounted in
 * space_info->bytes_may_use yet. So if we allocate a chunk or unpin an extent
 * we use this helper.
 */
void btrfs_space_info_add_new_bytes(struct btrfs_fs_info *fs_info,
				    struct btrfs_space_info *space_info,
				    u64 num_bytes)
{
	struct reserve_ticket *ticket;
	struct list_head *head = &space_info->priority_tickets;

again:
	while (!list_empty(head) && num_bytes) {
		ticket = list_first_entry(head, struct reserve_ticket,
					  list);
		if (num_bytes >= ticket->bytes) {
			trace_btrfs_space_reservation(fs_info, "space_info",
						      space_info->flags,
						      ticket->bytes, 1);
			list_del_init(&ticket->list);
			num_bytes -= ticket->bytes;
			btrfs_space_info_update_bytes_may_use(fs_info,
							      space_info,
							      ticket->bytes);
			ticket->bytes = 0;
			space_info->tickets_id++;
			wake_up(&ticket->wait);
		} else {
			trace_btrfs_space_reservation(fs_info, "space_info",
						      space_info->flags,
						      num_bytes, 1);
			btrfs_space_info_update_bytes_may_use(fs_info,
							      space_info,
							      num_bytes);
			ticket->bytes -= num_bytes;
			num_bytes = 0;
		}
	}

	if (num_bytes && head == &space_info->priority_tickets) {
		head = &space_info->tickets;
		goto again;
	}
}

#define DUMP_BLOCK_RSV(fs_info, rsv_name)				\
do {									\
	struct btrfs_block_rsv *__rsv = &(fs_info)->rsv_name;		\
	spin_lock(&__rsv->lock);					\
	btrfs_info(fs_info, #rsv_name ": size %llu reserved %llu",	\
		   __rsv->size, __rsv->reserved);			\
	spin_unlock(&__rsv->lock);					\
} while (0)

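/*
 * Print the state of a space_info (and optionally of each of its block
 * groups) to the kernel log, typically for ENOSPC debugging.
 */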
void btrfs_dump_space_info(struct btrfs_fs_info *fs_info,
			   struct btrfs_space_info *info, u64 bytes,
			   int dump_block_groups)
{
	struct btrfs_block_group_cache *cache;
	int index = 0;

	spin_lock(&info->lock);
	btrfs_info(fs_info, "space_info %llu has %llu free, is %sfull",
		   info->flags,
		   info->total_bytes - btrfs_space_info_used(info, true),
		   info->full ? "" : "not ");
	btrfs_info(fs_info,
		"space_info total=%llu, used=%llu, pinned=%llu, reserved=%llu, may_use=%llu, readonly=%llu",
		info->total_bytes, info->bytes_used, info->bytes_pinned,
		info->bytes_reserved, info->bytes_may_use,
		info->bytes_readonly);
	spin_unlock(&info->lock);

	DUMP_BLOCK_RSV(fs_info, global_block_rsv);
	DUMP_BLOCK_RSV(fs_info, trans_block_rsv);
	DUMP_BLOCK_RSV(fs_info, chunk_block_rsv);
	DUMP_BLOCK_RSV(fs_info, delayed_block_rsv);
	DUMP_BLOCK_RSV(fs_info, delayed_refs_rsv);

	if (!dump_block_groups)
		return;

	down_read(&info->groups_sem);
again:
	list_for_each_entry(cache, &info->block_groups[index], list) {
		spin_lock(&cache->lock);
		btrfs_info(fs_info,
			"block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %s",
			cache->key.objectid, cache->key.offset,
			btrfs_block_group_used(&cache->item), cache->pinned,
			cache->reserved, cache->ro ? "[readonly]" : "");
		btrfs_dump_free_space(cache, bytes);
		spin_unlock(&cache->lock);
	}
	if (++index < BTRFS_NR_RAID_TYPES)
		goto again;
	up_read(&info->groups_sem);
}

static void btrfs_writeback_inodes_sb_nr(struct btrfs_fs_info *fs_info,
					 unsigned long nr_pages, int nr_items)
{
	struct super_block *sb = fs_info->sb;

	if (down_read_trylock(&sb->s_umount)) {
		writeback_inodes_sb_nr(sb, nr_pages, WB_REASON_FS_FREE_SPACE);
		up_read(&sb->s_umount);
	} else {
		/*
		 * We needn't worry about the filesystem going from r/w to r/o
		 * even though we don't acquire ->s_umount, because the
		 * filesystem should guarantee that the delalloc inode list is
		 * empty once the filesystem is read-only (all dirty pages have
		 * been written to disk).
		 */
		btrfs_start_delalloc_roots(fs_info, nr_items);
		if (!current->journal_info)
			btrfs_wait_ordered_roots(fs_info, nr_items, 0, (u64)-1);
	}
}

static inline u64 calc_reclaim_items_nr(struct btrfs_fs_info *fs_info,
					u64 to_reclaim)
{
	u64 bytes;
	u64 nr;

	bytes = btrfs_calc_trans_metadata_size(fs_info, 1);
	nr = div64_u64(to_reclaim, bytes);
	if (!nr)
		nr = 1;
	return nr;
}

#define EXTENT_SIZE_PER_ITEM	SZ_256K

/*
 * shrink metadata reservation for delalloc
 */
static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim,
			    u64 orig, bool wait_ordered)
{
	struct btrfs_space_info *space_info;
	struct btrfs_trans_handle *trans;
	u64 delalloc_bytes;
	u64 dio_bytes;
	u64 async_pages;
	u64 items;
	long time_left;
	unsigned long nr_pages;
	int loops;

	/* Calculate the number of items we need to flush for this reservation */
	items = calc_reclaim_items_nr(fs_info, to_reclaim);
	to_reclaim = items * EXTENT_SIZE_PER_ITEM;

	trans = (struct btrfs_trans_handle *)current->journal_info;
	space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);

	delalloc_bytes = percpu_counter_sum_positive(
						&fs_info->delalloc_bytes);
	dio_bytes = percpu_counter_sum_positive(&fs_info->dio_bytes);
	if (delalloc_bytes == 0 && dio_bytes == 0) {
		if (trans)
			return;
		if (wait_ordered)
			btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1);
		return;
	}

	/*
	 * If we are doing more direct IO than delalloc we need to just wait on
	 * ordered extents, otherwise we'll waste time trying to flush delalloc
	 * that likely won't give us the space back we need.
	 */
	if (dio_bytes > delalloc_bytes)
		wait_ordered = true;

	loops = 0;
	while ((delalloc_bytes || dio_bytes) && loops < 3) {
		nr_pages = min(delalloc_bytes, to_reclaim) >> PAGE_SHIFT;

		/*
		 * Triggers inode writeback for up to nr_pages. This will invoke
		 * ->writepages callback and trigger delalloc filling
		 * (btrfs_run_delalloc_range()).
		 */
		btrfs_writeback_inodes_sb_nr(fs_info, nr_pages, items);

		/*
		 * We need to wait for the compressed pages to start before
		 * we continue.
		 */
		async_pages = atomic_read(&fs_info->async_delalloc_pages);
		if (!async_pages)
			goto skip_async;

		/*
		 * Calculate how many compressed pages we want to be written
		 * before we continue. I.e if there are more async pages than we
		 * require wait_event will wait until nr_pages are written.
		 */
		if (async_pages <= nr_pages)
			async_pages = 0;
		else
			async_pages -= nr_pages;

		wait_event(fs_info->async_submit_wait,
			   atomic_read(&fs_info->async_delalloc_pages) <=
			   (int)async_pages);
skip_async:
		spin_lock(&space_info->lock);
		if (list_empty(&space_info->tickets) &&
		    list_empty(&space_info->priority_tickets)) {
			spin_unlock(&space_info->lock);
			break;
		}
		spin_unlock(&space_info->lock);

		loops++;
		if (wait_ordered && !trans) {
			btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1);
		} else {
			time_left = schedule_timeout_killable(1);
			if (time_left)
				break;
		}
		delalloc_bytes = percpu_counter_sum_positive(
						&fs_info->delalloc_bytes);
		dio_bytes = percpu_counter_sum_positive(&fs_info->dio_bytes);
	}
}

/**
 * may_commit_transaction - possibly commit the transaction if it's ok to
 * @fs_info - the fs_info for this filesystem
 * @space_info - the space_info we're trying to reserve from
 *
 * This will check to make sure that committing the transaction will actually
 * get us somewhere and then commit the transaction if it does. Otherwise it
 * will return -ENOSPC.
 */
static int may_commit_transaction(struct btrfs_fs_info *fs_info,
				  struct btrfs_space_info *space_info)
{
	struct reserve_ticket *ticket = NULL;
	struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_block_rsv;
	struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
	struct btrfs_trans_handle *trans;
	u64 bytes_needed;
	u64 reclaim_bytes = 0;

	trans = (struct btrfs_trans_handle *)current->journal_info;
	if (trans)
		return -EAGAIN;

	spin_lock(&space_info->lock);
	if (!list_empty(&space_info->priority_tickets))
		ticket = list_first_entry(&space_info->priority_tickets,
					  struct reserve_ticket, list);
	else if (!list_empty(&space_info->tickets))
		ticket = list_first_entry(&space_info->tickets,
					  struct reserve_ticket, list);
	bytes_needed = (ticket) ? ticket->bytes : 0;
	spin_unlock(&space_info->lock);

	if (!bytes_needed)
		return 0;

	trans = btrfs_join_transaction(fs_info->extent_root);
	if (IS_ERR(trans))
		return PTR_ERR(trans);

	/*
	 * See if there is enough pinned space to make this reservation, or if
	 * we have block groups that are going to be freed, allowing us to
	 * possibly do a chunk allocation the next loop through.
	 */
	if (test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags) ||
	    __percpu_counter_compare(&space_info->total_bytes_pinned,
				     bytes_needed,
				     BTRFS_TOTAL_BYTES_PINNED_BATCH) >= 0)
		goto commit;

	/*
	 * See if there is some space in the delayed insertion reservation for
	 * this reservation.
	 */
	if (space_info != delayed_rsv->space_info)
		goto enospc;

	spin_lock(&delayed_rsv->lock);
	reclaim_bytes += delayed_rsv->reserved;
	spin_unlock(&delayed_rsv->lock);

	spin_lock(&delayed_refs_rsv->lock);
	reclaim_bytes += delayed_refs_rsv->reserved;
	spin_unlock(&delayed_refs_rsv->lock);
	if (reclaim_bytes >= bytes_needed)
		goto commit;
	bytes_needed -= reclaim_bytes;

	if (__percpu_counter_compare(&space_info->total_bytes_pinned,
				     bytes_needed,
				     BTRFS_TOTAL_BYTES_PINNED_BATCH) < 0)
		goto enospc;

commit:
	return btrfs_commit_transaction(trans);
enospc:
	btrfs_end_transaction(trans);
	return -ENOSPC;
}

/*
 * Try to flush some data based on policy set by @state. This is only advisory
 * and may fail for various reasons. The caller is supposed to examine the
 * state of @space_info to detect the outcome.
 */
static void flush_space(struct btrfs_fs_info *fs_info,
			struct btrfs_space_info *space_info, u64 num_bytes,
			int state)
{
	struct btrfs_root *root = fs_info->extent_root;
	struct btrfs_trans_handle *trans;
	int nr;
	int ret = 0;

	switch (state) {
	case FLUSH_DELAYED_ITEMS_NR:
	case FLUSH_DELAYED_ITEMS:
		if (state == FLUSH_DELAYED_ITEMS_NR)
			nr = calc_reclaim_items_nr(fs_info, num_bytes) * 2;
		else
			nr = -1;

		trans = btrfs_join_transaction(root);
		if (IS_ERR(trans)) {
			ret = PTR_ERR(trans);
			break;
		}
		ret = btrfs_run_delayed_items_nr(trans, nr);
		btrfs_end_transaction(trans);
		break;
	case FLUSH_DELALLOC:
	case FLUSH_DELALLOC_WAIT:
		shrink_delalloc(fs_info, num_bytes * 2, num_bytes,
				state == FLUSH_DELALLOC_WAIT);
		break;
	case FLUSH_DELAYED_REFS_NR:
	case FLUSH_DELAYED_REFS:
		trans = btrfs_join_transaction(root);
		if (IS_ERR(trans)) {
			ret = PTR_ERR(trans);
			break;
		}
		if (state == FLUSH_DELAYED_REFS_NR)
			nr = calc_reclaim_items_nr(fs_info, num_bytes);
		else
			nr = 0;
		btrfs_run_delayed_refs(trans, nr);
		btrfs_end_transaction(trans);
		break;
	case ALLOC_CHUNK:
	case ALLOC_CHUNK_FORCE:
		trans = btrfs_join_transaction(root);
		if (IS_ERR(trans)) {
			ret = PTR_ERR(trans);
			break;
		}
		ret = btrfs_chunk_alloc(trans,
				btrfs_metadata_alloc_profile(fs_info),
				(state == ALLOC_CHUNK) ? CHUNK_ALLOC_NO_FORCE :
					CHUNK_ALLOC_FORCE);
		btrfs_end_transaction(trans);
		if (ret > 0 || ret == -ENOSPC)
			ret = 0;
		break;
	case COMMIT_TRANS:
		/*
		 * If we have pending delayed iputs then we could free up a
		 * bunch of pinned space, so make sure we run the iputs before
		 * we do our pinned bytes check below.
		 */
		btrfs_run_delayed_iputs(fs_info);
		btrfs_wait_on_delayed_iputs(fs_info);

		ret = may_commit_transaction(fs_info, space_info);
		break;
	default:
		ret = -ENOSPC;
		break;
	}

	trace_btrfs_flush_space(fs_info, space_info->flags, num_bytes, state,
				ret);
	return;
}

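/*
 * Estimate how many bytes of metadata space need to be reclaimed to satisfy
 * the outstanding tickets, or to get back under the overcommit threshold when
 * no tickets are queued.
 */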
static inline u64
btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info,
				 struct btrfs_space_info *space_info,
				 bool system_chunk)
{
	struct reserve_ticket *ticket;
	u64 used;
	u64 expected;
	u64 to_reclaim = 0;

	list_for_each_entry(ticket, &space_info->tickets, list)
		to_reclaim += ticket->bytes;
	list_for_each_entry(ticket, &space_info->priority_tickets, list)
		to_reclaim += ticket->bytes;
	if (to_reclaim)
		return to_reclaim;

	to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M);
	if (can_overcommit(fs_info, space_info, to_reclaim,
			   BTRFS_RESERVE_FLUSH_ALL, system_chunk))
		return 0;

	used = btrfs_space_info_used(space_info, true);

	if (can_overcommit(fs_info, space_info, SZ_1M,
			   BTRFS_RESERVE_FLUSH_ALL, system_chunk))
		expected = div_factor_fine(space_info->total_bytes, 95);
	else
		expected = div_factor_fine(space_info->total_bytes, 90);

	if (used > expected)
		to_reclaim = used - expected;
	else
		to_reclaim = 0;
	to_reclaim = min(to_reclaim, space_info->bytes_may_use +
				     space_info->bytes_reserved);
	return to_reclaim;
}

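/*
 * Decide whether background (preemptive) flushing should be started: only
 * worthwhile when the space_info is nearly exhausted by reservations but not
 * already full of real data, and the filesystem is not shutting down or being
 * remounted.
 */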
static inline int need_do_async_reclaim(struct btrfs_fs_info *fs_info,
					struct btrfs_space_info *space_info,
					u64 used, bool system_chunk)
{
	u64 thresh = div_factor_fine(space_info->total_bytes, 98);

	/* If we're just plain full then async reclaim just slows us down. */
	if ((space_info->bytes_used + space_info->bytes_reserved) >= thresh)
		return 0;

	if (!btrfs_calc_reclaim_metadata_size(fs_info, space_info,
					      system_chunk))
		return 0;

	return (used >= thresh && !btrfs_fs_closing(fs_info) &&
		!test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state));
}

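/*
 * Fail all remaining tickets with -ENOSPC and wake their waiters. Returns
 * true if any ticket had already been partially filled, in which case the
 * caller restarts the flush state machine instead of giving up.
 */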
static bool wake_all_tickets(struct list_head *head)
{
	struct reserve_ticket *ticket;

	while (!list_empty(head)) {
		ticket = list_first_entry(head, struct reserve_ticket, list);
		list_del_init(&ticket->list);
		ticket->error = -ENOSPC;
		wake_up(&ticket->wait);
		if (ticket->bytes != ticket->orig_bytes)
			return true;
	}
	return false;
}

/*
 * This is for normal flushers, we can wait all goddamned day if we want to. We
 * will loop and continuously try to flush as long as we are making progress.
 * We count progress as clearing off tickets each time we have to loop.
 */
static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
{
	struct btrfs_fs_info *fs_info;
	struct btrfs_space_info *space_info;
	u64 to_reclaim;
	int flush_state;
	int commit_cycles = 0;
	u64 last_tickets_id;

	fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work);
	space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);

	spin_lock(&space_info->lock);
	to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info,
						      false);
	if (!to_reclaim) {
		space_info->flush = 0;
		spin_unlock(&space_info->lock);
		return;
	}
	last_tickets_id = space_info->tickets_id;
	spin_unlock(&space_info->lock);

	flush_state = FLUSH_DELAYED_ITEMS_NR;
	do {
		flush_space(fs_info, space_info, to_reclaim, flush_state);
		spin_lock(&space_info->lock);
		if (list_empty(&space_info->tickets)) {
			space_info->flush = 0;
			spin_unlock(&space_info->lock);
			return;
		}
		to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info,
							      space_info,
							      false);
		if (last_tickets_id == space_info->tickets_id) {
			flush_state++;
		} else {
			last_tickets_id = space_info->tickets_id;
			flush_state = FLUSH_DELAYED_ITEMS_NR;
			if (commit_cycles)
				commit_cycles--;
		}

		/*
		 * We don't want to force a chunk allocation until we've tried
		 * pretty hard to reclaim space. Think of the case where we
		 * freed up a bunch of space and so have a lot of pinned space
		 * to reclaim. We would rather use that than possibly create an
		 * underutilized metadata chunk. So if this is our first run
		 * through the flushing state machine skip ALLOC_CHUNK_FORCE and
		 * commit the transaction. If nothing has changed the next go
		 * around then we can force a chunk allocation.
		 */
		if (flush_state == ALLOC_CHUNK_FORCE && !commit_cycles)
			flush_state++;

		if (flush_state > COMMIT_TRANS) {
			commit_cycles++;
			if (commit_cycles > 2) {
				if (wake_all_tickets(&space_info->tickets)) {
					flush_state = FLUSH_DELAYED_ITEMS_NR;
					commit_cycles--;
				} else {
					space_info->flush = 0;
				}
			} else {
				flush_state = FLUSH_DELAYED_ITEMS_NR;
			}
		}
		spin_unlock(&space_info->lock);
	} while (flush_state <= COMMIT_TRANS);
}

void btrfs_init_async_reclaim_work(struct work_struct *work)
{
	INIT_WORK(work, btrfs_async_reclaim_metadata_space);
}

static const enum btrfs_flush_state priority_flush_states[] = {
	FLUSH_DELAYED_ITEMS_NR,
	FLUSH_DELAYED_ITEMS,
	ALLOC_CHUNK,
};

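/*
 * Flush on behalf of a priority ticket: run the reduced set of priority flush
 * states synchronously until the ticket is satisfied or the states are
 * exhausted.
 */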
static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
					    struct btrfs_space_info *space_info,
					    struct reserve_ticket *ticket)
{
	u64 to_reclaim;
	int flush_state;

	spin_lock(&space_info->lock);
	to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info,
						      false);
	if (!to_reclaim) {
		spin_unlock(&space_info->lock);
		return;
	}
	spin_unlock(&space_info->lock);

	flush_state = 0;
	do {
		flush_space(fs_info, space_info, to_reclaim,
			    priority_flush_states[flush_state]);
		flush_state++;
		spin_lock(&space_info->lock);
		if (ticket->bytes == 0) {
			spin_unlock(&space_info->lock);
			return;
		}
		spin_unlock(&space_info->lock);
	} while (flush_state < ARRAY_SIZE(priority_flush_states));
}

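/*
 * Sleep until the ticket is either fully satisfied by the flush worker or
 * fails; any partially granted space is returned to the space_info before
 * reporting the result.
 */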
static int wait_reserve_ticket(struct btrfs_fs_info *fs_info,
			       struct btrfs_space_info *space_info,
			       struct reserve_ticket *ticket)
{
	DEFINE_WAIT(wait);
	u64 reclaim_bytes = 0;
	int ret = 0;

	spin_lock(&space_info->lock);
	while (ticket->bytes > 0 && ticket->error == 0) {
		ret = prepare_to_wait_event(&ticket->wait, &wait, TASK_KILLABLE);
		if (ret) {
			ret = -EINTR;
			break;
		}
		spin_unlock(&space_info->lock);

		schedule();

		finish_wait(&ticket->wait, &wait);
		spin_lock(&space_info->lock);
	}
	if (!ret)
		ret = ticket->error;
	if (!list_empty(&ticket->list))
		list_del_init(&ticket->list);
	if (ticket->bytes && ticket->bytes < ticket->orig_bytes)
		reclaim_bytes = ticket->orig_bytes - ticket->bytes;
	spin_unlock(&space_info->lock);

	if (reclaim_bytes)
		btrfs_space_info_add_old_bytes(fs_info, space_info,
					       reclaim_bytes);
	return ret;
}

/**
 * __reserve_metadata_bytes - try to reserve bytes from the given space_info
 * @fs_info - the fs_info for this filesystem
 * @space_info - the space info we want to allocate from
 * @orig_bytes - the number of bytes we want
 * @flush - whether or not we can flush to make our reservation
 * @system_chunk - whether this reservation is for system chunk space
 *
 * This will reserve orig_bytes number of bytes from the given space_info. If
 * there is not enough space it will make an attempt to flush out space to make
 * room. It will do this by flushing delalloc if possible or committing the
 * transaction. If flush is 0 then no attempts to regain reservations will be
 * made and this will fail if there is not enough space already.
 */
static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
				    struct btrfs_space_info *space_info,
				    u64 orig_bytes,
				    enum btrfs_reserve_flush_enum flush,
				    bool system_chunk)
{
	struct reserve_ticket ticket;
	u64 used;
	u64 reclaim_bytes = 0;
	int ret = 0;

	ASSERT(orig_bytes);
	ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_ALL);

	spin_lock(&space_info->lock);
	ret = -ENOSPC;
	used = btrfs_space_info_used(space_info, true);

	/*
	 * Carry on if we have enough space (short-circuit) OR call
	 * can_overcommit() to ensure we can overcommit to continue.
	 */
	if ((used + orig_bytes <= space_info->total_bytes) ||
	    can_overcommit(fs_info, space_info, orig_bytes, flush,
			   system_chunk)) {
		btrfs_space_info_update_bytes_may_use(fs_info, space_info,
						      orig_bytes);
		trace_btrfs_space_reservation(fs_info, "space_info",
					      space_info->flags, orig_bytes, 1);
		ret = 0;
	}

	/*
	 * If we couldn't make a reservation then setup our reservation ticket
	 * and kick the async worker if it's not already running.
	 *
	 * If we are a priority flusher then we just need to add our ticket to
	 * the list and we will do our own flushing further down.
	 */
	if (ret && flush != BTRFS_RESERVE_NO_FLUSH) {
		ticket.orig_bytes = orig_bytes;
		ticket.bytes = orig_bytes;
		ticket.error = 0;
		init_waitqueue_head(&ticket.wait);
		if (flush == BTRFS_RESERVE_FLUSH_ALL) {
			list_add_tail(&ticket.list, &space_info->tickets);
			if (!space_info->flush) {
				space_info->flush = 1;
				trace_btrfs_trigger_flush(fs_info,
							  space_info->flags,
							  orig_bytes, flush,
							  "enospc");
				queue_work(system_unbound_wq,
					   &fs_info->async_reclaim_work);
			}
		} else {
			list_add_tail(&ticket.list,
				      &space_info->priority_tickets);
		}
	} else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
		used += orig_bytes;
		/*
		 * We will do the space reservation dance during log replay,
		 * which means we won't have fs_info->fs_root set, so don't do
		 * the async reclaim as we will panic.
		 */
		if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags) &&
		    need_do_async_reclaim(fs_info, space_info,
					  used, system_chunk) &&
		    !work_busy(&fs_info->async_reclaim_work)) {
			trace_btrfs_trigger_flush(fs_info, space_info->flags,
						  orig_bytes, flush, "preempt");
			queue_work(system_unbound_wq,
				   &fs_info->async_reclaim_work);
		}
	}
	spin_unlock(&space_info->lock);
	if (!ret || flush == BTRFS_RESERVE_NO_FLUSH)
		return ret;

	if (flush == BTRFS_RESERVE_FLUSH_ALL)
		return wait_reserve_ticket(fs_info, space_info, &ticket);

	ret = 0;
	priority_reclaim_metadata_space(fs_info, space_info, &ticket);
	spin_lock(&space_info->lock);
	if (ticket.bytes) {
		if (ticket.bytes < orig_bytes)
			reclaim_bytes = orig_bytes - ticket.bytes;
		list_del_init(&ticket.list);
		ret = -ENOSPC;
	}
	spin_unlock(&space_info->lock);

	if (reclaim_bytes)
		btrfs_space_info_add_old_bytes(fs_info, space_info,
					       reclaim_bytes);
	ASSERT(list_empty(&ticket.list));
	return ret;
}

/**
 * btrfs_reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
 * @root - the root we're allocating for
 * @block_rsv - the block_rsv we're allocating for
 * @orig_bytes - the number of bytes we want
 * @flush - whether or not we can flush to make our reservation
 *
 * This will reserve orig_bytes number of bytes from the space info associated
 * with the block_rsv. If there is not enough space it will make an attempt to
 * flush out space to make room. It will do this by flushing delalloc if
 * possible or committing the transaction. If flush is 0 then no attempts to
 * regain reservations will be made and this will fail if there is not enough
 * space already.
 */
int btrfs_reserve_metadata_bytes(struct btrfs_root *root,
				 struct btrfs_block_rsv *block_rsv,
				 u64 orig_bytes,
				 enum btrfs_reserve_flush_enum flush)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
	int ret;
	bool system_chunk = (root == fs_info->chunk_root);

	ret = __reserve_metadata_bytes(fs_info, block_rsv->space_info,
				       orig_bytes, flush, system_chunk);
	if (ret == -ENOSPC &&
	    unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) {
		if (block_rsv != global_rsv &&
		    !btrfs_block_rsv_use_bytes(global_rsv, orig_bytes))
			ret = 0;
	}
	if (ret == -ENOSPC) {
		trace_btrfs_space_reservation(fs_info, "space_info:enospc",
					      block_rsv->space_info->flags,
					      orig_bytes, 1);

		if (btrfs_test_opt(fs_info, ENOSPC_DEBUG))
			btrfs_dump_space_info(fs_info, block_rsv->space_info,
					      orig_bytes, 0);
	}
	return ret;
}