1 // SPDX-License-Identifier: GPL-2.0 2 3 #include "linux/spinlock.h" 4 #include <linux/minmax.h> 5 #include "misc.h" 6 #include "ctree.h" 7 #include "space-info.h" 8 #include "sysfs.h" 9 #include "volumes.h" 10 #include "free-space-cache.h" 11 #include "ordered-data.h" 12 #include "transaction.h" 13 #include "block-group.h" 14 #include "fs.h" 15 #include "accessors.h" 16 #include "extent-tree.h" 17 #include "zoned.h" 18 19 /* 20 * HOW DOES SPACE RESERVATION WORK 21 * 22 * If you want to know about delalloc specifically, there is a separate comment 23 * for that with the delalloc code. This comment is about how the whole system 24 * works generally. 25 * 26 * BASIC CONCEPTS 27 * 28 * 1) space_info. This is the ultimate arbiter of how much space we can use. 29 * There's a description of the bytes_ fields with the struct declaration, 30 * refer to that for specifics on each field. Suffice it to say that for 31 * reservations we care about total_bytes - SUM(space_info->bytes_) when 32 * determining if there is space to make an allocation. There is a space_info 33 * for METADATA, SYSTEM, and DATA areas. 34 * 35 * 2) block_rsv's. These are basically buckets for every different type of 36 * metadata reservation we have. You can see the comment in the block_rsv 37 * code on the rules for each type, but generally block_rsv->reserved is how 38 * much space is accounted for in space_info->bytes_may_use. 39 * 40 * 3) btrfs_calc*_size. These are the worst case calculations we used based 41 * on the number of items we will want to modify. We have one for changing 42 * items, and one for inserting new items. Generally we use these helpers to 43 * determine the size of the block reserves, and then use the actual bytes 44 * values to adjust the space_info counters. 45 * 46 * MAKING RESERVATIONS, THE NORMAL CASE 47 * 48 * We call into either btrfs_reserve_data_bytes() or 49 * btrfs_reserve_metadata_bytes(), depending on which we're looking for, with 50 * num_bytes we want to reserve. 51 * 52 * ->reserve 53 * space_info->bytes_may_reserve += num_bytes 54 * 55 * ->extent allocation 56 * Call btrfs_add_reserved_bytes() which does 57 * space_info->bytes_may_reserve -= num_bytes 58 * space_info->bytes_reserved += extent_bytes 59 * 60 * ->insert reference 61 * Call btrfs_update_block_group() which does 62 * space_info->bytes_reserved -= extent_bytes 63 * space_info->bytes_used += extent_bytes 64 * 65 * MAKING RESERVATIONS, FLUSHING NORMALLY (non-priority) 66 * 67 * Assume we are unable to simply make the reservation because we do not have 68 * enough space 69 * 70 * -> __reserve_bytes 71 * create a reserve_ticket with ->bytes set to our reservation, add it to 72 * the tail of space_info->tickets, kick async flush thread 73 * 74 * ->handle_reserve_ticket 75 * wait on ticket->wait for ->bytes to be reduced to 0, or ->error to be set 76 * on the ticket. 77 * 78 * -> btrfs_async_reclaim_metadata_space/btrfs_async_reclaim_data_space 79 * Flushes various things attempting to free up space. 80 * 81 * -> btrfs_try_granting_tickets() 82 * This is called by anything that either subtracts space from 83 * space_info->bytes_may_use, ->bytes_pinned, etc, or adds to the 84 * space_info->total_bytes. This loops through the ->priority_tickets and 85 * then the ->tickets list checking to see if the reservation can be 86 * completed. If it can the space is added to space_info->bytes_may_use and 87 * the ticket is woken up. 88 * 89 * -> ticket wakeup 90 * Check if ->bytes == 0, if it does we got our reservation and we can carry 91 * on, if not return the appropriate error (ENOSPC, but can be EINTR if we 92 * were interrupted.) 93 * 94 * MAKING RESERVATIONS, FLUSHING HIGH PRIORITY 95 * 96 * Same as the above, except we add ourselves to the 97 * space_info->priority_tickets, and we do not use ticket->wait, we simply 98 * call flush_space() ourselves for the states that are safe for us to call 99 * without deadlocking and hope for the best. 100 * 101 * THE FLUSHING STATES 102 * 103 * Generally speaking we will have two cases for each state, a "nice" state 104 * and a "ALL THE THINGS" state. In btrfs we delay a lot of work in order to 105 * reduce the locking over head on the various trees, and even to keep from 106 * doing any work at all in the case of delayed refs. Each of these delayed 107 * things however hold reservations, and so letting them run allows us to 108 * reclaim space so we can make new reservations. 109 * 110 * FLUSH_DELAYED_ITEMS 111 * Every inode has a delayed item to update the inode. Take a simple write 112 * for example, we would update the inode item at write time to update the 113 * mtime, and then again at finish_ordered_io() time in order to update the 114 * isize or bytes. We keep these delayed items to coalesce these operations 115 * into a single operation done on demand. These are an easy way to reclaim 116 * metadata space. 117 * 118 * FLUSH_DELALLOC 119 * Look at the delalloc comment to get an idea of how much space is reserved 120 * for delayed allocation. We can reclaim some of this space simply by 121 * running delalloc, but usually we need to wait for ordered extents to 122 * reclaim the bulk of this space. 123 * 124 * FLUSH_DELAYED_REFS 125 * We have a block reserve for the outstanding delayed refs space, and every 126 * delayed ref operation holds a reservation. Running these is a quick way 127 * to reclaim space, but we want to hold this until the end because COW can 128 * churn a lot and we can avoid making some extent tree modifications if we 129 * are able to delay for as long as possible. 130 * 131 * RESET_ZONES 132 * This state works only for the zoned mode. On the zoned mode, we cannot 133 * reuse once allocated then freed region until we reset the zone, due to 134 * the sequential write zone requirement. The RESET_ZONES state resets the 135 * zones of an unused block group and let us reuse the space. The reusing 136 * is faster than removing the block group and allocating another block 137 * group on the zones. 138 * 139 * ALLOC_CHUNK 140 * We will skip this the first time through space reservation, because of 141 * overcommit and we don't want to have a lot of useless metadata space when 142 * our worst case reservations will likely never come true. 143 * 144 * RUN_DELAYED_IPUTS 145 * If we're freeing inodes we're likely freeing checksums, file extent 146 * items, and extent tree items. Loads of space could be freed up by these 147 * operations, however they won't be usable until the transaction commits. 148 * 149 * COMMIT_TRANS 150 * This will commit the transaction. Historically we had a lot of logic 151 * surrounding whether or not we'd commit the transaction, but this waits born 152 * out of a pre-tickets era where we could end up committing the transaction 153 * thousands of times in a row without making progress. Now thanks to our 154 * ticketing system we know if we're not making progress and can error 155 * everybody out after a few commits rather than burning the disk hoping for 156 * a different answer. 157 * 158 * OVERCOMMIT 159 * 160 * Because we hold so many reservations for metadata we will allow you to 161 * reserve more space than is currently free in the currently allocate 162 * metadata space. This only happens with metadata, data does not allow 163 * overcommitting. 164 * 165 * You can see the current logic for when we allow overcommit in 166 * btrfs_can_overcommit(), but it only applies to unallocated space. If there 167 * is no unallocated space to be had, all reservations are kept within the 168 * free space in the allocated metadata chunks. 169 * 170 * Because of overcommitting, you generally want to use the 171 * btrfs_can_overcommit() logic for metadata allocations, as it does the right 172 * thing with or without extra unallocated space. 173 */ 174 175 u64 __pure btrfs_space_info_used(const struct btrfs_space_info *s_info, 176 bool may_use_included) 177 { 178 ASSERT(s_info); 179 return s_info->bytes_used + s_info->bytes_reserved + 180 s_info->bytes_pinned + s_info->bytes_readonly + 181 s_info->bytes_zone_unusable + 182 (may_use_included ? s_info->bytes_may_use : 0); 183 } 184 185 /* 186 * after adding space to the filesystem, we need to clear the full flags 187 * on all the space infos. 188 */ 189 void btrfs_clear_space_info_full(struct btrfs_fs_info *info) 190 { 191 struct list_head *head = &info->space_info; 192 struct btrfs_space_info *found; 193 194 list_for_each_entry(found, head, list) 195 found->full = 0; 196 } 197 198 /* 199 * Block groups with more than this value (percents) of unusable space will be 200 * scheduled for background reclaim. 201 */ 202 #define BTRFS_DEFAULT_ZONED_RECLAIM_THRESH (75) 203 204 #define BTRFS_UNALLOC_BLOCK_GROUP_TARGET (10ULL) 205 206 /* 207 * Calculate chunk size depending on volume type (regular or zoned). 208 */ 209 static u64 calc_chunk_size(const struct btrfs_fs_info *fs_info, u64 flags) 210 { 211 if (btrfs_is_zoned(fs_info)) 212 return fs_info->zone_size; 213 214 ASSERT(flags & BTRFS_BLOCK_GROUP_TYPE_MASK); 215 216 if (flags & BTRFS_BLOCK_GROUP_DATA) 217 return BTRFS_MAX_DATA_CHUNK_SIZE; 218 else if (flags & BTRFS_BLOCK_GROUP_SYSTEM) 219 return SZ_32M; 220 221 /* Handle BTRFS_BLOCK_GROUP_METADATA */ 222 if (fs_info->fs_devices->total_rw_bytes > 50ULL * SZ_1G) 223 return SZ_1G; 224 225 return SZ_256M; 226 } 227 228 /* 229 * Update default chunk size. 230 */ 231 void btrfs_update_space_info_chunk_size(struct btrfs_space_info *space_info, 232 u64 chunk_size) 233 { 234 WRITE_ONCE(space_info->chunk_size, chunk_size); 235 } 236 237 static int create_space_info(struct btrfs_fs_info *info, u64 flags) 238 { 239 240 struct btrfs_space_info *space_info; 241 int i; 242 int ret; 243 244 space_info = kzalloc(sizeof(*space_info), GFP_NOFS); 245 if (!space_info) 246 return -ENOMEM; 247 248 space_info->fs_info = info; 249 for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) 250 INIT_LIST_HEAD(&space_info->block_groups[i]); 251 init_rwsem(&space_info->groups_sem); 252 spin_lock_init(&space_info->lock); 253 space_info->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK; 254 space_info->force_alloc = CHUNK_ALLOC_NO_FORCE; 255 INIT_LIST_HEAD(&space_info->ro_bgs); 256 INIT_LIST_HEAD(&space_info->tickets); 257 INIT_LIST_HEAD(&space_info->priority_tickets); 258 space_info->clamp = 1; 259 btrfs_update_space_info_chunk_size(space_info, calc_chunk_size(info, flags)); 260 261 if (btrfs_is_zoned(info)) 262 space_info->bg_reclaim_threshold = BTRFS_DEFAULT_ZONED_RECLAIM_THRESH; 263 264 ret = btrfs_sysfs_add_space_info_type(info, space_info); 265 if (ret) 266 return ret; 267 268 list_add(&space_info->list, &info->space_info); 269 if (flags & BTRFS_BLOCK_GROUP_DATA) 270 info->data_sinfo = space_info; 271 272 return ret; 273 } 274 275 int btrfs_init_space_info(struct btrfs_fs_info *fs_info) 276 { 277 struct btrfs_super_block *disk_super; 278 u64 features; 279 u64 flags; 280 int mixed = 0; 281 int ret; 282 283 disk_super = fs_info->super_copy; 284 if (!btrfs_super_root(disk_super)) 285 return -EINVAL; 286 287 features = btrfs_super_incompat_flags(disk_super); 288 if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) 289 mixed = 1; 290 291 flags = BTRFS_BLOCK_GROUP_SYSTEM; 292 ret = create_space_info(fs_info, flags); 293 if (ret) 294 goto out; 295 296 if (mixed) { 297 flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA; 298 ret = create_space_info(fs_info, flags); 299 } else { 300 flags = BTRFS_BLOCK_GROUP_METADATA; 301 ret = create_space_info(fs_info, flags); 302 if (ret) 303 goto out; 304 305 flags = BTRFS_BLOCK_GROUP_DATA; 306 ret = create_space_info(fs_info, flags); 307 } 308 out: 309 return ret; 310 } 311 312 void btrfs_add_bg_to_space_info(struct btrfs_fs_info *info, 313 struct btrfs_block_group *block_group) 314 { 315 struct btrfs_space_info *found; 316 int factor, index; 317 318 factor = btrfs_bg_type_to_factor(block_group->flags); 319 320 found = btrfs_find_space_info(info, block_group->flags); 321 ASSERT(found); 322 spin_lock(&found->lock); 323 found->total_bytes += block_group->length; 324 found->disk_total += block_group->length * factor; 325 found->bytes_used += block_group->used; 326 found->disk_used += block_group->used * factor; 327 found->bytes_readonly += block_group->bytes_super; 328 btrfs_space_info_update_bytes_zone_unusable(found, block_group->zone_unusable); 329 if (block_group->length > 0) 330 found->full = 0; 331 btrfs_try_granting_tickets(info, found); 332 spin_unlock(&found->lock); 333 334 block_group->space_info = found; 335 336 index = btrfs_bg_flags_to_raid_index(block_group->flags); 337 down_write(&found->groups_sem); 338 list_add_tail(&block_group->list, &found->block_groups[index]); 339 up_write(&found->groups_sem); 340 } 341 342 struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info, 343 u64 flags) 344 { 345 struct list_head *head = &info->space_info; 346 struct btrfs_space_info *found; 347 348 flags &= BTRFS_BLOCK_GROUP_TYPE_MASK; 349 350 list_for_each_entry(found, head, list) { 351 if (found->flags & flags) 352 return found; 353 } 354 return NULL; 355 } 356 357 static u64 calc_effective_data_chunk_size(struct btrfs_fs_info *fs_info) 358 { 359 struct btrfs_space_info *data_sinfo; 360 u64 data_chunk_size; 361 362 /* 363 * Calculate the data_chunk_size, space_info->chunk_size is the 364 * "optimal" chunk size based on the fs size. However when we actually 365 * allocate the chunk we will strip this down further, making it no 366 * more than 10% of the disk or 1G, whichever is smaller. 367 * 368 * On the zoned mode, we need to use zone_size (= data_sinfo->chunk_size) 369 * as it is. 370 */ 371 data_sinfo = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA); 372 if (btrfs_is_zoned(fs_info)) 373 return data_sinfo->chunk_size; 374 data_chunk_size = min(data_sinfo->chunk_size, 375 mult_perc(fs_info->fs_devices->total_rw_bytes, 10)); 376 return min_t(u64, data_chunk_size, SZ_1G); 377 } 378 379 static u64 calc_available_free_space(struct btrfs_fs_info *fs_info, 380 const struct btrfs_space_info *space_info, 381 enum btrfs_reserve_flush_enum flush) 382 { 383 u64 profile; 384 u64 avail; 385 u64 data_chunk_size; 386 int factor; 387 388 if (space_info->flags & BTRFS_BLOCK_GROUP_SYSTEM) 389 profile = btrfs_system_alloc_profile(fs_info); 390 else 391 profile = btrfs_metadata_alloc_profile(fs_info); 392 393 avail = atomic64_read(&fs_info->free_chunk_space); 394 395 /* 396 * If we have dup, raid1 or raid10 then only half of the free 397 * space is actually usable. For raid56, the space info used 398 * doesn't include the parity drive, so we don't have to 399 * change the math 400 */ 401 factor = btrfs_bg_type_to_factor(profile); 402 avail = div_u64(avail, factor); 403 if (avail == 0) 404 return 0; 405 406 data_chunk_size = calc_effective_data_chunk_size(fs_info); 407 408 /* 409 * Since data allocations immediately use block groups as part of the 410 * reservation, because we assume that data reservations will == actual 411 * usage, we could potentially overcommit and then immediately have that 412 * available space used by a data allocation, which could put us in a 413 * bind when we get close to filling the file system. 414 * 415 * To handle this simply remove the data_chunk_size from the available 416 * space. If we are relatively empty this won't affect our ability to 417 * overcommit much, and if we're very close to full it'll keep us from 418 * getting into a position where we've given ourselves very little 419 * metadata wiggle room. 420 */ 421 if (avail <= data_chunk_size) 422 return 0; 423 avail -= data_chunk_size; 424 425 /* 426 * If we aren't flushing all things, let us overcommit up to 427 * 1/2th of the space. If we can flush, don't let us overcommit 428 * too much, let it overcommit up to 1/8 of the space. 429 */ 430 if (flush == BTRFS_RESERVE_FLUSH_ALL) 431 avail >>= 3; 432 else 433 avail >>= 1; 434 435 /* 436 * On the zoned mode, we always allocate one zone as one chunk. 437 * Returning non-zone size alingned bytes here will result in 438 * less pressure for the async metadata reclaim process, and it 439 * will over-commit too much leading to ENOSPC. Align down to the 440 * zone size to avoid that. 441 */ 442 if (btrfs_is_zoned(fs_info)) 443 avail = ALIGN_DOWN(avail, fs_info->zone_size); 444 445 return avail; 446 } 447 448 int btrfs_can_overcommit(struct btrfs_fs_info *fs_info, 449 const struct btrfs_space_info *space_info, u64 bytes, 450 enum btrfs_reserve_flush_enum flush) 451 { 452 u64 avail; 453 u64 used; 454 455 /* Don't overcommit when in mixed mode */ 456 if (space_info->flags & BTRFS_BLOCK_GROUP_DATA) 457 return 0; 458 459 used = btrfs_space_info_used(space_info, true); 460 avail = calc_available_free_space(fs_info, space_info, flush); 461 462 if (used + bytes < space_info->total_bytes + avail) 463 return 1; 464 return 0; 465 } 466 467 static void remove_ticket(struct btrfs_space_info *space_info, 468 struct reserve_ticket *ticket) 469 { 470 if (!list_empty(&ticket->list)) { 471 list_del_init(&ticket->list); 472 ASSERT(space_info->reclaim_size >= ticket->bytes); 473 space_info->reclaim_size -= ticket->bytes; 474 } 475 } 476 477 /* 478 * This is for space we already have accounted in space_info->bytes_may_use, so 479 * basically when we're returning space from block_rsv's. 480 */ 481 void btrfs_try_granting_tickets(struct btrfs_fs_info *fs_info, 482 struct btrfs_space_info *space_info) 483 { 484 struct list_head *head; 485 enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_NO_FLUSH; 486 487 lockdep_assert_held(&space_info->lock); 488 489 head = &space_info->priority_tickets; 490 again: 491 while (!list_empty(head)) { 492 struct reserve_ticket *ticket; 493 u64 used = btrfs_space_info_used(space_info, true); 494 495 ticket = list_first_entry(head, struct reserve_ticket, list); 496 497 /* Check and see if our ticket can be satisfied now. */ 498 if ((used + ticket->bytes <= space_info->total_bytes) || 499 btrfs_can_overcommit(fs_info, space_info, ticket->bytes, 500 flush)) { 501 btrfs_space_info_update_bytes_may_use(space_info, ticket->bytes); 502 remove_ticket(space_info, ticket); 503 ticket->bytes = 0; 504 space_info->tickets_id++; 505 wake_up(&ticket->wait); 506 } else { 507 break; 508 } 509 } 510 511 if (head == &space_info->priority_tickets) { 512 head = &space_info->tickets; 513 flush = BTRFS_RESERVE_FLUSH_ALL; 514 goto again; 515 } 516 } 517 518 #define DUMP_BLOCK_RSV(fs_info, rsv_name) \ 519 do { \ 520 struct btrfs_block_rsv *__rsv = &(fs_info)->rsv_name; \ 521 spin_lock(&__rsv->lock); \ 522 btrfs_info(fs_info, #rsv_name ": size %llu reserved %llu", \ 523 __rsv->size, __rsv->reserved); \ 524 spin_unlock(&__rsv->lock); \ 525 } while (0) 526 527 static const char *space_info_flag_to_str(const struct btrfs_space_info *space_info) 528 { 529 switch (space_info->flags) { 530 case BTRFS_BLOCK_GROUP_SYSTEM: 531 return "SYSTEM"; 532 case BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA: 533 return "DATA+METADATA"; 534 case BTRFS_BLOCK_GROUP_DATA: 535 return "DATA"; 536 case BTRFS_BLOCK_GROUP_METADATA: 537 return "METADATA"; 538 default: 539 return "UNKNOWN"; 540 } 541 } 542 543 static void dump_global_block_rsv(struct btrfs_fs_info *fs_info) 544 { 545 DUMP_BLOCK_RSV(fs_info, global_block_rsv); 546 DUMP_BLOCK_RSV(fs_info, trans_block_rsv); 547 DUMP_BLOCK_RSV(fs_info, chunk_block_rsv); 548 DUMP_BLOCK_RSV(fs_info, delayed_block_rsv); 549 DUMP_BLOCK_RSV(fs_info, delayed_refs_rsv); 550 } 551 552 static void __btrfs_dump_space_info(const struct btrfs_fs_info *fs_info, 553 const struct btrfs_space_info *info) 554 { 555 const char *flag_str = space_info_flag_to_str(info); 556 lockdep_assert_held(&info->lock); 557 558 /* The free space could be negative in case of overcommit */ 559 btrfs_info(fs_info, "space_info %s has %lld free, is %sfull", 560 flag_str, 561 (s64)(info->total_bytes - btrfs_space_info_used(info, true)), 562 info->full ? "" : "not "); 563 btrfs_info(fs_info, 564 "space_info total=%llu, used=%llu, pinned=%llu, reserved=%llu, may_use=%llu, readonly=%llu zone_unusable=%llu", 565 info->total_bytes, info->bytes_used, info->bytes_pinned, 566 info->bytes_reserved, info->bytes_may_use, 567 info->bytes_readonly, info->bytes_zone_unusable); 568 } 569 570 void btrfs_dump_space_info(struct btrfs_fs_info *fs_info, 571 struct btrfs_space_info *info, u64 bytes, 572 int dump_block_groups) 573 { 574 struct btrfs_block_group *cache; 575 u64 total_avail = 0; 576 int index = 0; 577 578 spin_lock(&info->lock); 579 __btrfs_dump_space_info(fs_info, info); 580 dump_global_block_rsv(fs_info); 581 spin_unlock(&info->lock); 582 583 if (!dump_block_groups) 584 return; 585 586 down_read(&info->groups_sem); 587 again: 588 list_for_each_entry(cache, &info->block_groups[index], list) { 589 u64 avail; 590 591 spin_lock(&cache->lock); 592 avail = cache->length - cache->used - cache->pinned - 593 cache->reserved - cache->bytes_super - cache->zone_unusable; 594 btrfs_info(fs_info, 595 "block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %llu delalloc %llu super %llu zone_unusable (%llu bytes available) %s", 596 cache->start, cache->length, cache->used, cache->pinned, 597 cache->reserved, cache->delalloc_bytes, 598 cache->bytes_super, cache->zone_unusable, 599 avail, cache->ro ? "[readonly]" : ""); 600 spin_unlock(&cache->lock); 601 btrfs_dump_free_space(cache, bytes); 602 total_avail += avail; 603 } 604 if (++index < BTRFS_NR_RAID_TYPES) 605 goto again; 606 up_read(&info->groups_sem); 607 608 btrfs_info(fs_info, "%llu bytes available across all block groups", total_avail); 609 } 610 611 static inline u64 calc_reclaim_items_nr(const struct btrfs_fs_info *fs_info, 612 u64 to_reclaim) 613 { 614 u64 bytes; 615 u64 nr; 616 617 bytes = btrfs_calc_insert_metadata_size(fs_info, 1); 618 nr = div64_u64(to_reclaim, bytes); 619 if (!nr) 620 nr = 1; 621 return nr; 622 } 623 624 /* 625 * shrink metadata reservation for delalloc 626 */ 627 static void shrink_delalloc(struct btrfs_fs_info *fs_info, 628 struct btrfs_space_info *space_info, 629 u64 to_reclaim, bool wait_ordered, 630 bool for_preempt) 631 { 632 struct btrfs_trans_handle *trans; 633 u64 delalloc_bytes; 634 u64 ordered_bytes; 635 u64 items; 636 long time_left; 637 int loops; 638 639 delalloc_bytes = percpu_counter_sum_positive(&fs_info->delalloc_bytes); 640 ordered_bytes = percpu_counter_sum_positive(&fs_info->ordered_bytes); 641 if (delalloc_bytes == 0 && ordered_bytes == 0) 642 return; 643 644 /* Calc the number of the pages we need flush for space reservation */ 645 if (to_reclaim == U64_MAX) { 646 items = U64_MAX; 647 } else { 648 /* 649 * to_reclaim is set to however much metadata we need to 650 * reclaim, but reclaiming that much data doesn't really track 651 * exactly. What we really want to do is reclaim full inode's 652 * worth of reservations, however that's not available to us 653 * here. We will take a fraction of the delalloc bytes for our 654 * flushing loops and hope for the best. Delalloc will expand 655 * the amount we write to cover an entire dirty extent, which 656 * will reclaim the metadata reservation for that range. If 657 * it's not enough subsequent flush stages will be more 658 * aggressive. 659 */ 660 to_reclaim = max(to_reclaim, delalloc_bytes >> 3); 661 items = calc_reclaim_items_nr(fs_info, to_reclaim) * 2; 662 } 663 664 trans = current->journal_info; 665 666 /* 667 * If we are doing more ordered than delalloc we need to just wait on 668 * ordered extents, otherwise we'll waste time trying to flush delalloc 669 * that likely won't give us the space back we need. 670 */ 671 if (ordered_bytes > delalloc_bytes && !for_preempt) 672 wait_ordered = true; 673 674 loops = 0; 675 while ((delalloc_bytes || ordered_bytes) && loops < 3) { 676 u64 temp = min(delalloc_bytes, to_reclaim) >> PAGE_SHIFT; 677 long nr_pages = min_t(u64, temp, LONG_MAX); 678 int async_pages; 679 680 btrfs_start_delalloc_roots(fs_info, nr_pages, true); 681 682 /* 683 * We need to make sure any outstanding async pages are now 684 * processed before we continue. This is because things like 685 * sync_inode() try to be smart and skip writing if the inode is 686 * marked clean. We don't use filemap_fwrite for flushing 687 * because we want to control how many pages we write out at a 688 * time, thus this is the only safe way to make sure we've 689 * waited for outstanding compressed workers to have started 690 * their jobs and thus have ordered extents set up properly. 691 * 692 * This exists because we do not want to wait for each 693 * individual inode to finish its async work, we simply want to 694 * start the IO on everybody, and then come back here and wait 695 * for all of the async work to catch up. Once we're done with 696 * that we know we'll have ordered extents for everything and we 697 * can decide if we wait for that or not. 698 * 699 * If we choose to replace this in the future, make absolutely 700 * sure that the proper waiting is being done in the async case, 701 * as there have been bugs in that area before. 702 */ 703 async_pages = atomic_read(&fs_info->async_delalloc_pages); 704 if (!async_pages) 705 goto skip_async; 706 707 /* 708 * We don't want to wait forever, if we wrote less pages in this 709 * loop than we have outstanding, only wait for that number of 710 * pages, otherwise we can wait for all async pages to finish 711 * before continuing. 712 */ 713 if (async_pages > nr_pages) 714 async_pages -= nr_pages; 715 else 716 async_pages = 0; 717 wait_event(fs_info->async_submit_wait, 718 atomic_read(&fs_info->async_delalloc_pages) <= 719 async_pages); 720 skip_async: 721 loops++; 722 if (wait_ordered && !trans) { 723 btrfs_wait_ordered_roots(fs_info, items, NULL); 724 } else { 725 time_left = schedule_timeout_killable(1); 726 if (time_left) 727 break; 728 } 729 730 /* 731 * If we are for preemption we just want a one-shot of delalloc 732 * flushing so we can stop flushing if we decide we don't need 733 * to anymore. 734 */ 735 if (for_preempt) 736 break; 737 738 spin_lock(&space_info->lock); 739 if (list_empty(&space_info->tickets) && 740 list_empty(&space_info->priority_tickets)) { 741 spin_unlock(&space_info->lock); 742 break; 743 } 744 spin_unlock(&space_info->lock); 745 746 delalloc_bytes = percpu_counter_sum_positive( 747 &fs_info->delalloc_bytes); 748 ordered_bytes = percpu_counter_sum_positive( 749 &fs_info->ordered_bytes); 750 } 751 } 752 753 /* 754 * Try to flush some data based on policy set by @state. This is only advisory 755 * and may fail for various reasons. The caller is supposed to examine the 756 * state of @space_info to detect the outcome. 757 */ 758 static void flush_space(struct btrfs_fs_info *fs_info, 759 struct btrfs_space_info *space_info, u64 num_bytes, 760 enum btrfs_flush_state state, bool for_preempt) 761 { 762 struct btrfs_root *root = fs_info->tree_root; 763 struct btrfs_trans_handle *trans; 764 int nr; 765 int ret = 0; 766 767 switch (state) { 768 case FLUSH_DELAYED_ITEMS_NR: 769 case FLUSH_DELAYED_ITEMS: 770 if (state == FLUSH_DELAYED_ITEMS_NR) 771 nr = calc_reclaim_items_nr(fs_info, num_bytes) * 2; 772 else 773 nr = -1; 774 775 trans = btrfs_join_transaction_nostart(root); 776 if (IS_ERR(trans)) { 777 ret = PTR_ERR(trans); 778 if (ret == -ENOENT) 779 ret = 0; 780 break; 781 } 782 ret = btrfs_run_delayed_items_nr(trans, nr); 783 btrfs_end_transaction(trans); 784 break; 785 case FLUSH_DELALLOC: 786 case FLUSH_DELALLOC_WAIT: 787 case FLUSH_DELALLOC_FULL: 788 if (state == FLUSH_DELALLOC_FULL) 789 num_bytes = U64_MAX; 790 shrink_delalloc(fs_info, space_info, num_bytes, 791 state != FLUSH_DELALLOC, for_preempt); 792 break; 793 case FLUSH_DELAYED_REFS_NR: 794 case FLUSH_DELAYED_REFS: 795 trans = btrfs_join_transaction_nostart(root); 796 if (IS_ERR(trans)) { 797 ret = PTR_ERR(trans); 798 if (ret == -ENOENT) 799 ret = 0; 800 break; 801 } 802 if (state == FLUSH_DELAYED_REFS_NR) 803 btrfs_run_delayed_refs(trans, num_bytes); 804 else 805 btrfs_run_delayed_refs(trans, 0); 806 btrfs_end_transaction(trans); 807 break; 808 case ALLOC_CHUNK: 809 case ALLOC_CHUNK_FORCE: 810 trans = btrfs_join_transaction(root); 811 if (IS_ERR(trans)) { 812 ret = PTR_ERR(trans); 813 break; 814 } 815 ret = btrfs_chunk_alloc(trans, 816 btrfs_get_alloc_profile(fs_info, space_info->flags), 817 (state == ALLOC_CHUNK) ? CHUNK_ALLOC_NO_FORCE : 818 CHUNK_ALLOC_FORCE); 819 btrfs_end_transaction(trans); 820 821 if (ret > 0 || ret == -ENOSPC) 822 ret = 0; 823 break; 824 case RUN_DELAYED_IPUTS: 825 /* 826 * If we have pending delayed iputs then we could free up a 827 * bunch of pinned space, so make sure we run the iputs before 828 * we do our pinned bytes check below. 829 */ 830 btrfs_run_delayed_iputs(fs_info); 831 btrfs_wait_on_delayed_iputs(fs_info); 832 break; 833 case COMMIT_TRANS: 834 ASSERT(current->journal_info == NULL); 835 /* 836 * We don't want to start a new transaction, just attach to the 837 * current one or wait it fully commits in case its commit is 838 * happening at the moment. Note: we don't use a nostart join 839 * because that does not wait for a transaction to fully commit 840 * (only for it to be unblocked, state TRANS_STATE_UNBLOCKED). 841 */ 842 ret = btrfs_commit_current_transaction(root); 843 break; 844 case RESET_ZONES: 845 ret = btrfs_reset_unused_block_groups(space_info, num_bytes); 846 break; 847 default: 848 ret = -ENOSPC; 849 break; 850 } 851 852 trace_btrfs_flush_space(fs_info, space_info->flags, num_bytes, state, 853 ret, for_preempt); 854 return; 855 } 856 857 static u64 btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info, 858 const struct btrfs_space_info *space_info) 859 { 860 u64 used; 861 u64 avail; 862 u64 to_reclaim = space_info->reclaim_size; 863 864 lockdep_assert_held(&space_info->lock); 865 866 avail = calc_available_free_space(fs_info, space_info, 867 BTRFS_RESERVE_FLUSH_ALL); 868 used = btrfs_space_info_used(space_info, true); 869 870 /* 871 * We may be flushing because suddenly we have less space than we had 872 * before, and now we're well over-committed based on our current free 873 * space. If that's the case add in our overage so we make sure to put 874 * appropriate pressure on the flushing state machine. 875 */ 876 if (space_info->total_bytes + avail < used) 877 to_reclaim += used - (space_info->total_bytes + avail); 878 879 return to_reclaim; 880 } 881 882 static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info, 883 const struct btrfs_space_info *space_info) 884 { 885 const u64 global_rsv_size = btrfs_block_rsv_reserved(&fs_info->global_block_rsv); 886 u64 ordered, delalloc; 887 u64 thresh; 888 u64 used; 889 890 thresh = mult_perc(space_info->total_bytes, 90); 891 892 lockdep_assert_held(&space_info->lock); 893 894 /* If we're just plain full then async reclaim just slows us down. */ 895 if ((space_info->bytes_used + space_info->bytes_reserved + 896 global_rsv_size) >= thresh) 897 return false; 898 899 used = space_info->bytes_may_use + space_info->bytes_pinned; 900 901 /* The total flushable belongs to the global rsv, don't flush. */ 902 if (global_rsv_size >= used) 903 return false; 904 905 /* 906 * 128MiB is 1/4 of the maximum global rsv size. If we have less than 907 * that devoted to other reservations then there's no sense in flushing, 908 * we don't have a lot of things that need flushing. 909 */ 910 if (used - global_rsv_size <= SZ_128M) 911 return false; 912 913 /* 914 * We have tickets queued, bail so we don't compete with the async 915 * flushers. 916 */ 917 if (space_info->reclaim_size) 918 return false; 919 920 /* 921 * If we have over half of the free space occupied by reservations or 922 * pinned then we want to start flushing. 923 * 924 * We do not do the traditional thing here, which is to say 925 * 926 * if (used >= ((total_bytes + avail) / 2)) 927 * return 1; 928 * 929 * because this doesn't quite work how we want. If we had more than 50% 930 * of the space_info used by bytes_used and we had 0 available we'd just 931 * constantly run the background flusher. Instead we want it to kick in 932 * if our reclaimable space exceeds our clamped free space. 933 * 934 * Our clamping range is 2^1 -> 2^8. Practically speaking that means 935 * the following: 936 * 937 * Amount of RAM Minimum threshold Maximum threshold 938 * 939 * 256GiB 1GiB 128GiB 940 * 128GiB 512MiB 64GiB 941 * 64GiB 256MiB 32GiB 942 * 32GiB 128MiB 16GiB 943 * 16GiB 64MiB 8GiB 944 * 945 * These are the range our thresholds will fall in, corresponding to how 946 * much delalloc we need for the background flusher to kick in. 947 */ 948 949 thresh = calc_available_free_space(fs_info, space_info, 950 BTRFS_RESERVE_FLUSH_ALL); 951 used = space_info->bytes_used + space_info->bytes_reserved + 952 space_info->bytes_readonly + global_rsv_size; 953 if (used < space_info->total_bytes) 954 thresh += space_info->total_bytes - used; 955 thresh >>= space_info->clamp; 956 957 used = space_info->bytes_pinned; 958 959 /* 960 * If we have more ordered bytes than delalloc bytes then we're either 961 * doing a lot of DIO, or we simply don't have a lot of delalloc waiting 962 * around. Preemptive flushing is only useful in that it can free up 963 * space before tickets need to wait for things to finish. In the case 964 * of ordered extents, preemptively waiting on ordered extents gets us 965 * nothing, if our reservations are tied up in ordered extents we'll 966 * simply have to slow down writers by forcing them to wait on ordered 967 * extents. 968 * 969 * In the case that ordered is larger than delalloc, only include the 970 * block reserves that we would actually be able to directly reclaim 971 * from. In this case if we're heavy on metadata operations this will 972 * clearly be heavy enough to warrant preemptive flushing. In the case 973 * of heavy DIO or ordered reservations, preemptive flushing will just 974 * waste time and cause us to slow down. 975 * 976 * We want to make sure we truly are maxed out on ordered however, so 977 * cut ordered in half, and if it's still higher than delalloc then we 978 * can keep flushing. This is to avoid the case where we start 979 * flushing, and now delalloc == ordered and we stop preemptively 980 * flushing when we could still have several gigs of delalloc to flush. 981 */ 982 ordered = percpu_counter_read_positive(&fs_info->ordered_bytes) >> 1; 983 delalloc = percpu_counter_read_positive(&fs_info->delalloc_bytes); 984 if (ordered >= delalloc) 985 used += btrfs_block_rsv_reserved(&fs_info->delayed_refs_rsv) + 986 btrfs_block_rsv_reserved(&fs_info->delayed_block_rsv); 987 else 988 used += space_info->bytes_may_use - global_rsv_size; 989 990 return (used >= thresh && !btrfs_fs_closing(fs_info) && 991 !test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state)); 992 } 993 994 static bool steal_from_global_rsv(struct btrfs_fs_info *fs_info, 995 struct btrfs_space_info *space_info, 996 struct reserve_ticket *ticket) 997 { 998 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; 999 u64 min_bytes; 1000 1001 if (!ticket->steal) 1002 return false; 1003 1004 if (global_rsv->space_info != space_info) 1005 return false; 1006 1007 spin_lock(&global_rsv->lock); 1008 min_bytes = mult_perc(global_rsv->size, 10); 1009 if (global_rsv->reserved < min_bytes + ticket->bytes) { 1010 spin_unlock(&global_rsv->lock); 1011 return false; 1012 } 1013 global_rsv->reserved -= ticket->bytes; 1014 remove_ticket(space_info, ticket); 1015 ticket->bytes = 0; 1016 wake_up(&ticket->wait); 1017 space_info->tickets_id++; 1018 if (global_rsv->reserved < global_rsv->size) 1019 global_rsv->full = 0; 1020 spin_unlock(&global_rsv->lock); 1021 1022 return true; 1023 } 1024 1025 /* 1026 * We've exhausted our flushing, start failing tickets. 1027 * 1028 * @fs_info - fs_info for this fs 1029 * @space_info - the space info we were flushing 1030 * 1031 * We call this when we've exhausted our flushing ability and haven't made 1032 * progress in satisfying tickets. The reservation code handles tickets in 1033 * order, so if there is a large ticket first and then smaller ones we could 1034 * very well satisfy the smaller tickets. This will attempt to wake up any 1035 * tickets in the list to catch this case. 1036 * 1037 * This function returns true if it was able to make progress by clearing out 1038 * other tickets, or if it stumbles across a ticket that was smaller than the 1039 * first ticket. 1040 */ 1041 static bool maybe_fail_all_tickets(struct btrfs_fs_info *fs_info, 1042 struct btrfs_space_info *space_info) 1043 { 1044 struct reserve_ticket *ticket; 1045 u64 tickets_id = space_info->tickets_id; 1046 const bool aborted = BTRFS_FS_ERROR(fs_info); 1047 1048 trace_btrfs_fail_all_tickets(fs_info, space_info); 1049 1050 if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) { 1051 btrfs_info(fs_info, "cannot satisfy tickets, dumping space info"); 1052 __btrfs_dump_space_info(fs_info, space_info); 1053 } 1054 1055 while (!list_empty(&space_info->tickets) && 1056 tickets_id == space_info->tickets_id) { 1057 ticket = list_first_entry(&space_info->tickets, 1058 struct reserve_ticket, list); 1059 1060 if (!aborted && steal_from_global_rsv(fs_info, space_info, ticket)) 1061 return true; 1062 1063 if (!aborted && btrfs_test_opt(fs_info, ENOSPC_DEBUG)) 1064 btrfs_info(fs_info, "failing ticket with %llu bytes", 1065 ticket->bytes); 1066 1067 remove_ticket(space_info, ticket); 1068 if (aborted) 1069 ticket->error = -EIO; 1070 else 1071 ticket->error = -ENOSPC; 1072 wake_up(&ticket->wait); 1073 1074 /* 1075 * We're just throwing tickets away, so more flushing may not 1076 * trip over btrfs_try_granting_tickets, so we need to call it 1077 * here to see if we can make progress with the next ticket in 1078 * the list. 1079 */ 1080 if (!aborted) 1081 btrfs_try_granting_tickets(fs_info, space_info); 1082 } 1083 return (tickets_id != space_info->tickets_id); 1084 } 1085 1086 /* 1087 * This is for normal flushers, we can wait all goddamned day if we want to. We 1088 * will loop and continuously try to flush as long as we are making progress. 1089 * We count progress as clearing off tickets each time we have to loop. 1090 */ 1091 static void btrfs_async_reclaim_metadata_space(struct work_struct *work) 1092 { 1093 struct btrfs_fs_info *fs_info; 1094 struct btrfs_space_info *space_info; 1095 u64 to_reclaim; 1096 enum btrfs_flush_state flush_state; 1097 int commit_cycles = 0; 1098 u64 last_tickets_id; 1099 enum btrfs_flush_state final_state; 1100 1101 fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work); 1102 space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); 1103 if (btrfs_is_zoned(fs_info)) 1104 final_state = RESET_ZONES; 1105 else 1106 final_state = COMMIT_TRANS; 1107 1108 spin_lock(&space_info->lock); 1109 to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info); 1110 if (!to_reclaim) { 1111 space_info->flush = 0; 1112 spin_unlock(&space_info->lock); 1113 return; 1114 } 1115 last_tickets_id = space_info->tickets_id; 1116 spin_unlock(&space_info->lock); 1117 1118 flush_state = FLUSH_DELAYED_ITEMS_NR; 1119 do { 1120 flush_space(fs_info, space_info, to_reclaim, flush_state, false); 1121 spin_lock(&space_info->lock); 1122 if (list_empty(&space_info->tickets)) { 1123 space_info->flush = 0; 1124 spin_unlock(&space_info->lock); 1125 return; 1126 } 1127 to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, 1128 space_info); 1129 if (last_tickets_id == space_info->tickets_id) { 1130 flush_state++; 1131 } else { 1132 last_tickets_id = space_info->tickets_id; 1133 flush_state = FLUSH_DELAYED_ITEMS_NR; 1134 if (commit_cycles) 1135 commit_cycles--; 1136 } 1137 1138 /* 1139 * We do not want to empty the system of delalloc unless we're 1140 * under heavy pressure, so allow one trip through the flushing 1141 * logic before we start doing a FLUSH_DELALLOC_FULL. 1142 */ 1143 if (flush_state == FLUSH_DELALLOC_FULL && !commit_cycles) 1144 flush_state++; 1145 1146 /* 1147 * We don't want to force a chunk allocation until we've tried 1148 * pretty hard to reclaim space. Think of the case where we 1149 * freed up a bunch of space and so have a lot of pinned space 1150 * to reclaim. We would rather use that than possibly create a 1151 * underutilized metadata chunk. So if this is our first run 1152 * through the flushing state machine skip ALLOC_CHUNK_FORCE and 1153 * commit the transaction. If nothing has changed the next go 1154 * around then we can force a chunk allocation. 1155 */ 1156 if (flush_state == ALLOC_CHUNK_FORCE && !commit_cycles) 1157 flush_state++; 1158 1159 if (flush_state > final_state) { 1160 commit_cycles++; 1161 if (commit_cycles > 2) { 1162 if (maybe_fail_all_tickets(fs_info, space_info)) { 1163 flush_state = FLUSH_DELAYED_ITEMS_NR; 1164 commit_cycles--; 1165 } else { 1166 space_info->flush = 0; 1167 } 1168 } else { 1169 flush_state = FLUSH_DELAYED_ITEMS_NR; 1170 } 1171 } 1172 spin_unlock(&space_info->lock); 1173 } while (flush_state <= final_state); 1174 } 1175 1176 /* 1177 * This handles pre-flushing of metadata space before we get to the point that 1178 * we need to start blocking threads on tickets. The logic here is different 1179 * from the other flush paths because it doesn't rely on tickets to tell us how 1180 * much we need to flush, instead it attempts to keep us below the 80% full 1181 * watermark of space by flushing whichever reservation pool is currently the 1182 * largest. 1183 */ 1184 static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work) 1185 { 1186 struct btrfs_fs_info *fs_info; 1187 struct btrfs_space_info *space_info; 1188 struct btrfs_block_rsv *delayed_block_rsv; 1189 struct btrfs_block_rsv *delayed_refs_rsv; 1190 struct btrfs_block_rsv *global_rsv; 1191 struct btrfs_block_rsv *trans_rsv; 1192 int loops = 0; 1193 1194 fs_info = container_of(work, struct btrfs_fs_info, 1195 preempt_reclaim_work); 1196 space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); 1197 delayed_block_rsv = &fs_info->delayed_block_rsv; 1198 delayed_refs_rsv = &fs_info->delayed_refs_rsv; 1199 global_rsv = &fs_info->global_block_rsv; 1200 trans_rsv = &fs_info->trans_block_rsv; 1201 1202 spin_lock(&space_info->lock); 1203 while (need_preemptive_reclaim(fs_info, space_info)) { 1204 enum btrfs_flush_state flush; 1205 u64 delalloc_size = 0; 1206 u64 to_reclaim, block_rsv_size; 1207 const u64 global_rsv_size = btrfs_block_rsv_reserved(global_rsv); 1208 1209 loops++; 1210 1211 /* 1212 * We don't have a precise counter for the metadata being 1213 * reserved for delalloc, so we'll approximate it by subtracting 1214 * out the block rsv's space from the bytes_may_use. If that 1215 * amount is higher than the individual reserves, then we can 1216 * assume it's tied up in delalloc reservations. 1217 */ 1218 block_rsv_size = global_rsv_size + 1219 btrfs_block_rsv_reserved(delayed_block_rsv) + 1220 btrfs_block_rsv_reserved(delayed_refs_rsv) + 1221 btrfs_block_rsv_reserved(trans_rsv); 1222 if (block_rsv_size < space_info->bytes_may_use) 1223 delalloc_size = space_info->bytes_may_use - block_rsv_size; 1224 1225 /* 1226 * We don't want to include the global_rsv in our calculation, 1227 * because that's space we can't touch. Subtract it from the 1228 * block_rsv_size for the next checks. 1229 */ 1230 block_rsv_size -= global_rsv_size; 1231 1232 /* 1233 * We really want to avoid flushing delalloc too much, as it 1234 * could result in poor allocation patterns, so only flush it if 1235 * it's larger than the rest of the pools combined. 1236 */ 1237 if (delalloc_size > block_rsv_size) { 1238 to_reclaim = delalloc_size; 1239 flush = FLUSH_DELALLOC; 1240 } else if (space_info->bytes_pinned > 1241 (btrfs_block_rsv_reserved(delayed_block_rsv) + 1242 btrfs_block_rsv_reserved(delayed_refs_rsv))) { 1243 to_reclaim = space_info->bytes_pinned; 1244 flush = COMMIT_TRANS; 1245 } else if (btrfs_block_rsv_reserved(delayed_block_rsv) > 1246 btrfs_block_rsv_reserved(delayed_refs_rsv)) { 1247 to_reclaim = btrfs_block_rsv_reserved(delayed_block_rsv); 1248 flush = FLUSH_DELAYED_ITEMS_NR; 1249 } else { 1250 to_reclaim = btrfs_block_rsv_reserved(delayed_refs_rsv); 1251 flush = FLUSH_DELAYED_REFS_NR; 1252 } 1253 1254 spin_unlock(&space_info->lock); 1255 1256 /* 1257 * We don't want to reclaim everything, just a portion, so scale 1258 * down the to_reclaim by 1/4. If it takes us down to 0, 1259 * reclaim 1 items worth. 1260 */ 1261 to_reclaim >>= 2; 1262 if (!to_reclaim) 1263 to_reclaim = btrfs_calc_insert_metadata_size(fs_info, 1); 1264 flush_space(fs_info, space_info, to_reclaim, flush, true); 1265 cond_resched(); 1266 spin_lock(&space_info->lock); 1267 } 1268 1269 /* We only went through once, back off our clamping. */ 1270 if (loops == 1 && !space_info->reclaim_size) 1271 space_info->clamp = max(1, space_info->clamp - 1); 1272 trace_btrfs_done_preemptive_reclaim(fs_info, space_info); 1273 spin_unlock(&space_info->lock); 1274 } 1275 1276 /* 1277 * FLUSH_DELALLOC_WAIT: 1278 * Space is freed from flushing delalloc in one of two ways. 1279 * 1280 * 1) compression is on and we allocate less space than we reserved 1281 * 2) we are overwriting existing space 1282 * 1283 * For #1 that extra space is reclaimed as soon as the delalloc pages are 1284 * COWed, by way of btrfs_add_reserved_bytes() which adds the actual extent 1285 * length to ->bytes_reserved, and subtracts the reserved space from 1286 * ->bytes_may_use. 1287 * 1288 * For #2 this is trickier. Once the ordered extent runs we will drop the 1289 * extent in the range we are overwriting, which creates a delayed ref for 1290 * that freed extent. This however is not reclaimed until the transaction 1291 * commits, thus the next stages. 1292 * 1293 * RUN_DELAYED_IPUTS 1294 * If we are freeing inodes, we want to make sure all delayed iputs have 1295 * completed, because they could have been on an inode with i_nlink == 0, and 1296 * thus have been truncated and freed up space. But again this space is not 1297 * immediately reusable, it comes in the form of a delayed ref, which must be 1298 * run and then the transaction must be committed. 1299 * 1300 * COMMIT_TRANS 1301 * This is where we reclaim all of the pinned space generated by running the 1302 * iputs 1303 * 1304 * RESET_ZONES 1305 * This state works only for the zoned mode. We scan the unused block group 1306 * list and reset the zones and reuse the block group. 1307 * 1308 * ALLOC_CHUNK_FORCE 1309 * For data we start with alloc chunk force, however we could have been full 1310 * before, and then the transaction commit could have freed new block groups, 1311 * so if we now have space to allocate do the force chunk allocation. 1312 */ 1313 static const enum btrfs_flush_state data_flush_states[] = { 1314 FLUSH_DELALLOC_FULL, 1315 RUN_DELAYED_IPUTS, 1316 COMMIT_TRANS, 1317 RESET_ZONES, 1318 ALLOC_CHUNK_FORCE, 1319 }; 1320 1321 static void btrfs_async_reclaim_data_space(struct work_struct *work) 1322 { 1323 struct btrfs_fs_info *fs_info; 1324 struct btrfs_space_info *space_info; 1325 u64 last_tickets_id; 1326 enum btrfs_flush_state flush_state = 0; 1327 1328 fs_info = container_of(work, struct btrfs_fs_info, async_data_reclaim_work); 1329 space_info = fs_info->data_sinfo; 1330 1331 spin_lock(&space_info->lock); 1332 if (list_empty(&space_info->tickets)) { 1333 space_info->flush = 0; 1334 spin_unlock(&space_info->lock); 1335 return; 1336 } 1337 last_tickets_id = space_info->tickets_id; 1338 spin_unlock(&space_info->lock); 1339 1340 while (!space_info->full) { 1341 flush_space(fs_info, space_info, U64_MAX, ALLOC_CHUNK_FORCE, false); 1342 spin_lock(&space_info->lock); 1343 if (list_empty(&space_info->tickets)) { 1344 space_info->flush = 0; 1345 spin_unlock(&space_info->lock); 1346 return; 1347 } 1348 1349 /* Something happened, fail everything and bail. */ 1350 if (BTRFS_FS_ERROR(fs_info)) 1351 goto aborted_fs; 1352 last_tickets_id = space_info->tickets_id; 1353 spin_unlock(&space_info->lock); 1354 } 1355 1356 while (flush_state < ARRAY_SIZE(data_flush_states)) { 1357 flush_space(fs_info, space_info, U64_MAX, 1358 data_flush_states[flush_state], false); 1359 spin_lock(&space_info->lock); 1360 if (list_empty(&space_info->tickets)) { 1361 space_info->flush = 0; 1362 spin_unlock(&space_info->lock); 1363 return; 1364 } 1365 1366 if (last_tickets_id == space_info->tickets_id) { 1367 flush_state++; 1368 } else { 1369 last_tickets_id = space_info->tickets_id; 1370 flush_state = 0; 1371 } 1372 1373 if (flush_state >= ARRAY_SIZE(data_flush_states)) { 1374 if (space_info->full) { 1375 if (maybe_fail_all_tickets(fs_info, space_info)) 1376 flush_state = 0; 1377 else 1378 space_info->flush = 0; 1379 } else { 1380 flush_state = 0; 1381 } 1382 1383 /* Something happened, fail everything and bail. */ 1384 if (BTRFS_FS_ERROR(fs_info)) 1385 goto aborted_fs; 1386 1387 } 1388 spin_unlock(&space_info->lock); 1389 } 1390 return; 1391 1392 aborted_fs: 1393 maybe_fail_all_tickets(fs_info, space_info); 1394 space_info->flush = 0; 1395 spin_unlock(&space_info->lock); 1396 } 1397 1398 void btrfs_init_async_reclaim_work(struct btrfs_fs_info *fs_info) 1399 { 1400 INIT_WORK(&fs_info->async_reclaim_work, btrfs_async_reclaim_metadata_space); 1401 INIT_WORK(&fs_info->async_data_reclaim_work, btrfs_async_reclaim_data_space); 1402 INIT_WORK(&fs_info->preempt_reclaim_work, 1403 btrfs_preempt_reclaim_metadata_space); 1404 } 1405 1406 static const enum btrfs_flush_state priority_flush_states[] = { 1407 FLUSH_DELAYED_ITEMS_NR, 1408 FLUSH_DELAYED_ITEMS, 1409 RESET_ZONES, 1410 ALLOC_CHUNK, 1411 }; 1412 1413 static const enum btrfs_flush_state evict_flush_states[] = { 1414 FLUSH_DELAYED_ITEMS_NR, 1415 FLUSH_DELAYED_ITEMS, 1416 FLUSH_DELAYED_REFS_NR, 1417 FLUSH_DELAYED_REFS, 1418 FLUSH_DELALLOC, 1419 FLUSH_DELALLOC_WAIT, 1420 FLUSH_DELALLOC_FULL, 1421 ALLOC_CHUNK, 1422 COMMIT_TRANS, 1423 RESET_ZONES, 1424 }; 1425 1426 static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info, 1427 struct btrfs_space_info *space_info, 1428 struct reserve_ticket *ticket, 1429 const enum btrfs_flush_state *states, 1430 int states_nr) 1431 { 1432 u64 to_reclaim; 1433 int flush_state = 0; 1434 1435 spin_lock(&space_info->lock); 1436 to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info); 1437 /* 1438 * This is the priority reclaim path, so to_reclaim could be >0 still 1439 * because we may have only satisfied the priority tickets and still 1440 * left non priority tickets on the list. We would then have 1441 * to_reclaim but ->bytes == 0. 1442 */ 1443 if (ticket->bytes == 0) { 1444 spin_unlock(&space_info->lock); 1445 return; 1446 } 1447 1448 while (flush_state < states_nr) { 1449 spin_unlock(&space_info->lock); 1450 flush_space(fs_info, space_info, to_reclaim, states[flush_state], 1451 false); 1452 flush_state++; 1453 spin_lock(&space_info->lock); 1454 if (ticket->bytes == 0) { 1455 spin_unlock(&space_info->lock); 1456 return; 1457 } 1458 } 1459 1460 /* 1461 * Attempt to steal from the global rsv if we can, except if the fs was 1462 * turned into error mode due to a transaction abort when flushing space 1463 * above, in that case fail with the abort error instead of returning 1464 * success to the caller if we can steal from the global rsv - this is 1465 * just to have caller fail immeditelly instead of later when trying to 1466 * modify the fs, making it easier to debug -ENOSPC problems. 1467 */ 1468 if (BTRFS_FS_ERROR(fs_info)) { 1469 ticket->error = BTRFS_FS_ERROR(fs_info); 1470 remove_ticket(space_info, ticket); 1471 } else if (!steal_from_global_rsv(fs_info, space_info, ticket)) { 1472 ticket->error = -ENOSPC; 1473 remove_ticket(space_info, ticket); 1474 } 1475 1476 /* 1477 * We must run try_granting_tickets here because we could be a large 1478 * ticket in front of a smaller ticket that can now be satisfied with 1479 * the available space. 1480 */ 1481 btrfs_try_granting_tickets(fs_info, space_info); 1482 spin_unlock(&space_info->lock); 1483 } 1484 1485 static void priority_reclaim_data_space(struct btrfs_fs_info *fs_info, 1486 struct btrfs_space_info *space_info, 1487 struct reserve_ticket *ticket) 1488 { 1489 spin_lock(&space_info->lock); 1490 1491 /* We could have been granted before we got here. */ 1492 if (ticket->bytes == 0) { 1493 spin_unlock(&space_info->lock); 1494 return; 1495 } 1496 1497 while (!space_info->full) { 1498 spin_unlock(&space_info->lock); 1499 flush_space(fs_info, space_info, U64_MAX, ALLOC_CHUNK_FORCE, false); 1500 spin_lock(&space_info->lock); 1501 if (ticket->bytes == 0) { 1502 spin_unlock(&space_info->lock); 1503 return; 1504 } 1505 } 1506 1507 ticket->error = -ENOSPC; 1508 remove_ticket(space_info, ticket); 1509 btrfs_try_granting_tickets(fs_info, space_info); 1510 spin_unlock(&space_info->lock); 1511 } 1512 1513 static void wait_reserve_ticket(struct btrfs_space_info *space_info, 1514 struct reserve_ticket *ticket) 1515 1516 { 1517 DEFINE_WAIT(wait); 1518 int ret = 0; 1519 1520 spin_lock(&space_info->lock); 1521 while (ticket->bytes > 0 && ticket->error == 0) { 1522 ret = prepare_to_wait_event(&ticket->wait, &wait, TASK_KILLABLE); 1523 if (ret) { 1524 /* 1525 * Delete us from the list. After we unlock the space 1526 * info, we don't want the async reclaim job to reserve 1527 * space for this ticket. If that would happen, then the 1528 * ticket's task would not known that space was reserved 1529 * despite getting an error, resulting in a space leak 1530 * (bytes_may_use counter of our space_info). 1531 */ 1532 remove_ticket(space_info, ticket); 1533 ticket->error = -EINTR; 1534 break; 1535 } 1536 spin_unlock(&space_info->lock); 1537 1538 schedule(); 1539 1540 finish_wait(&ticket->wait, &wait); 1541 spin_lock(&space_info->lock); 1542 } 1543 spin_unlock(&space_info->lock); 1544 } 1545 1546 /* 1547 * Do the appropriate flushing and waiting for a ticket. 1548 * 1549 * @fs_info: the filesystem 1550 * @space_info: space info for the reservation 1551 * @ticket: ticket for the reservation 1552 * @start_ns: timestamp when the reservation started 1553 * @orig_bytes: amount of bytes originally reserved 1554 * @flush: how much we can flush 1555 * 1556 * This does the work of figuring out how to flush for the ticket, waiting for 1557 * the reservation, and returning the appropriate error if there is one. 1558 */ 1559 static int handle_reserve_ticket(struct btrfs_fs_info *fs_info, 1560 struct btrfs_space_info *space_info, 1561 struct reserve_ticket *ticket, 1562 u64 start_ns, u64 orig_bytes, 1563 enum btrfs_reserve_flush_enum flush) 1564 { 1565 int ret; 1566 1567 switch (flush) { 1568 case BTRFS_RESERVE_FLUSH_DATA: 1569 case BTRFS_RESERVE_FLUSH_ALL: 1570 case BTRFS_RESERVE_FLUSH_ALL_STEAL: 1571 wait_reserve_ticket(space_info, ticket); 1572 break; 1573 case BTRFS_RESERVE_FLUSH_LIMIT: 1574 priority_reclaim_metadata_space(fs_info, space_info, ticket, 1575 priority_flush_states, 1576 ARRAY_SIZE(priority_flush_states)); 1577 break; 1578 case BTRFS_RESERVE_FLUSH_EVICT: 1579 priority_reclaim_metadata_space(fs_info, space_info, ticket, 1580 evict_flush_states, 1581 ARRAY_SIZE(evict_flush_states)); 1582 break; 1583 case BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE: 1584 priority_reclaim_data_space(fs_info, space_info, ticket); 1585 break; 1586 default: 1587 ASSERT(0); 1588 break; 1589 } 1590 1591 ret = ticket->error; 1592 ASSERT(list_empty(&ticket->list)); 1593 /* 1594 * Check that we can't have an error set if the reservation succeeded, 1595 * as that would confuse tasks and lead them to error out without 1596 * releasing reserved space (if an error happens the expectation is that 1597 * space wasn't reserved at all). 1598 */ 1599 ASSERT(!(ticket->bytes == 0 && ticket->error)); 1600 trace_btrfs_reserve_ticket(fs_info, space_info->flags, orig_bytes, 1601 start_ns, flush, ticket->error); 1602 return ret; 1603 } 1604 1605 /* 1606 * This returns true if this flush state will go through the ordinary flushing 1607 * code. 1608 */ 1609 static inline bool is_normal_flushing(enum btrfs_reserve_flush_enum flush) 1610 { 1611 return (flush == BTRFS_RESERVE_FLUSH_ALL) || 1612 (flush == BTRFS_RESERVE_FLUSH_ALL_STEAL); 1613 } 1614 1615 static inline void maybe_clamp_preempt(struct btrfs_fs_info *fs_info, 1616 struct btrfs_space_info *space_info) 1617 { 1618 u64 ordered = percpu_counter_sum_positive(&fs_info->ordered_bytes); 1619 u64 delalloc = percpu_counter_sum_positive(&fs_info->delalloc_bytes); 1620 1621 /* 1622 * If we're heavy on ordered operations then clamping won't help us. We 1623 * need to clamp specifically to keep up with dirty'ing buffered 1624 * writers, because there's not a 1:1 correlation of writing delalloc 1625 * and freeing space, like there is with flushing delayed refs or 1626 * delayed nodes. If we're already more ordered than delalloc then 1627 * we're keeping up, otherwise we aren't and should probably clamp. 1628 */ 1629 if (ordered < delalloc) 1630 space_info->clamp = min(space_info->clamp + 1, 8); 1631 } 1632 1633 static inline bool can_steal(enum btrfs_reserve_flush_enum flush) 1634 { 1635 return (flush == BTRFS_RESERVE_FLUSH_ALL_STEAL || 1636 flush == BTRFS_RESERVE_FLUSH_EVICT); 1637 } 1638 1639 /* 1640 * NO_FLUSH and FLUSH_EMERGENCY don't want to create a ticket, they just want to 1641 * fail as quickly as possible. 1642 */ 1643 static inline bool can_ticket(enum btrfs_reserve_flush_enum flush) 1644 { 1645 return (flush != BTRFS_RESERVE_NO_FLUSH && 1646 flush != BTRFS_RESERVE_FLUSH_EMERGENCY); 1647 } 1648 1649 /* 1650 * Try to reserve bytes from the block_rsv's space. 1651 * 1652 * @fs_info: the filesystem 1653 * @space_info: space info we want to allocate from 1654 * @orig_bytes: number of bytes we want 1655 * @flush: whether or not we can flush to make our reservation 1656 * 1657 * This will reserve orig_bytes number of bytes from the space info associated 1658 * with the block_rsv. If there is not enough space it will make an attempt to 1659 * flush out space to make room. It will do this by flushing delalloc if 1660 * possible or committing the transaction. If flush is 0 then no attempts to 1661 * regain reservations will be made and this will fail if there is not enough 1662 * space already. 1663 */ 1664 static int __reserve_bytes(struct btrfs_fs_info *fs_info, 1665 struct btrfs_space_info *space_info, u64 orig_bytes, 1666 enum btrfs_reserve_flush_enum flush) 1667 { 1668 struct work_struct *async_work; 1669 struct reserve_ticket ticket; 1670 u64 start_ns = 0; 1671 u64 used; 1672 int ret = -ENOSPC; 1673 bool pending_tickets; 1674 1675 ASSERT(orig_bytes); 1676 /* 1677 * If have a transaction handle (current->journal_info != NULL), then 1678 * the flush method can not be neither BTRFS_RESERVE_FLUSH_ALL* nor 1679 * BTRFS_RESERVE_FLUSH_EVICT, as we could deadlock because those 1680 * flushing methods can trigger transaction commits. 1681 */ 1682 if (current->journal_info) { 1683 /* One assert per line for easier debugging. */ 1684 ASSERT(flush != BTRFS_RESERVE_FLUSH_ALL); 1685 ASSERT(flush != BTRFS_RESERVE_FLUSH_ALL_STEAL); 1686 ASSERT(flush != BTRFS_RESERVE_FLUSH_EVICT); 1687 } 1688 1689 if (flush == BTRFS_RESERVE_FLUSH_DATA) 1690 async_work = &fs_info->async_data_reclaim_work; 1691 else 1692 async_work = &fs_info->async_reclaim_work; 1693 1694 spin_lock(&space_info->lock); 1695 used = btrfs_space_info_used(space_info, true); 1696 1697 /* 1698 * We don't want NO_FLUSH allocations to jump everybody, they can 1699 * generally handle ENOSPC in a different way, so treat them the same as 1700 * normal flushers when it comes to skipping pending tickets. 1701 */ 1702 if (is_normal_flushing(flush) || (flush == BTRFS_RESERVE_NO_FLUSH)) 1703 pending_tickets = !list_empty(&space_info->tickets) || 1704 !list_empty(&space_info->priority_tickets); 1705 else 1706 pending_tickets = !list_empty(&space_info->priority_tickets); 1707 1708 /* 1709 * Carry on if we have enough space (short-circuit) OR call 1710 * can_overcommit() to ensure we can overcommit to continue. 1711 */ 1712 if (!pending_tickets && 1713 ((used + orig_bytes <= space_info->total_bytes) || 1714 btrfs_can_overcommit(fs_info, space_info, orig_bytes, flush))) { 1715 btrfs_space_info_update_bytes_may_use(space_info, orig_bytes); 1716 ret = 0; 1717 } 1718 1719 /* 1720 * Things are dire, we need to make a reservation so we don't abort. We 1721 * will let this reservation go through as long as we have actual space 1722 * left to allocate for the block. 1723 */ 1724 if (ret && unlikely(flush == BTRFS_RESERVE_FLUSH_EMERGENCY)) { 1725 used = btrfs_space_info_used(space_info, false); 1726 if (used + orig_bytes <= space_info->total_bytes) { 1727 btrfs_space_info_update_bytes_may_use(space_info, orig_bytes); 1728 ret = 0; 1729 } 1730 } 1731 1732 /* 1733 * If we couldn't make a reservation then setup our reservation ticket 1734 * and kick the async worker if it's not already running. 1735 * 1736 * If we are a priority flusher then we just need to add our ticket to 1737 * the list and we will do our own flushing further down. 1738 */ 1739 if (ret && can_ticket(flush)) { 1740 ticket.bytes = orig_bytes; 1741 ticket.error = 0; 1742 space_info->reclaim_size += ticket.bytes; 1743 init_waitqueue_head(&ticket.wait); 1744 ticket.steal = can_steal(flush); 1745 if (trace_btrfs_reserve_ticket_enabled()) 1746 start_ns = ktime_get_ns(); 1747 1748 if (flush == BTRFS_RESERVE_FLUSH_ALL || 1749 flush == BTRFS_RESERVE_FLUSH_ALL_STEAL || 1750 flush == BTRFS_RESERVE_FLUSH_DATA) { 1751 list_add_tail(&ticket.list, &space_info->tickets); 1752 if (!space_info->flush) { 1753 /* 1754 * We were forced to add a reserve ticket, so 1755 * our preemptive flushing is unable to keep 1756 * up. Clamp down on the threshold for the 1757 * preemptive flushing in order to keep up with 1758 * the workload. 1759 */ 1760 maybe_clamp_preempt(fs_info, space_info); 1761 1762 space_info->flush = 1; 1763 trace_btrfs_trigger_flush(fs_info, 1764 space_info->flags, 1765 orig_bytes, flush, 1766 "enospc"); 1767 queue_work(system_unbound_wq, async_work); 1768 } 1769 } else { 1770 list_add_tail(&ticket.list, 1771 &space_info->priority_tickets); 1772 } 1773 } else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) { 1774 /* 1775 * We will do the space reservation dance during log replay, 1776 * which means we won't have fs_info->fs_root set, so don't do 1777 * the async reclaim as we will panic. 1778 */ 1779 if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags) && 1780 !work_busy(&fs_info->preempt_reclaim_work) && 1781 need_preemptive_reclaim(fs_info, space_info)) { 1782 trace_btrfs_trigger_flush(fs_info, space_info->flags, 1783 orig_bytes, flush, "preempt"); 1784 queue_work(system_unbound_wq, 1785 &fs_info->preempt_reclaim_work); 1786 } 1787 } 1788 spin_unlock(&space_info->lock); 1789 if (!ret || !can_ticket(flush)) 1790 return ret; 1791 1792 return handle_reserve_ticket(fs_info, space_info, &ticket, start_ns, 1793 orig_bytes, flush); 1794 } 1795 1796 /* 1797 * Try to reserve metadata bytes from the block_rsv's space. 1798 * 1799 * @fs_info: the filesystem 1800 * @space_info: the space_info we're allocating for 1801 * @orig_bytes: number of bytes we want 1802 * @flush: whether or not we can flush to make our reservation 1803 * 1804 * This will reserve orig_bytes number of bytes from the space info associated 1805 * with the block_rsv. If there is not enough space it will make an attempt to 1806 * flush out space to make room. It will do this by flushing delalloc if 1807 * possible or committing the transaction. If flush is 0 then no attempts to 1808 * regain reservations will be made and this will fail if there is not enough 1809 * space already. 1810 */ 1811 int btrfs_reserve_metadata_bytes(struct btrfs_fs_info *fs_info, 1812 struct btrfs_space_info *space_info, 1813 u64 orig_bytes, 1814 enum btrfs_reserve_flush_enum flush) 1815 { 1816 int ret; 1817 1818 ret = __reserve_bytes(fs_info, space_info, orig_bytes, flush); 1819 if (ret == -ENOSPC) { 1820 trace_btrfs_space_reservation(fs_info, "space_info:enospc", 1821 space_info->flags, orig_bytes, 1); 1822 1823 if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) 1824 btrfs_dump_space_info(fs_info, space_info, orig_bytes, 0); 1825 } 1826 return ret; 1827 } 1828 1829 /* 1830 * Try to reserve data bytes for an allocation. 1831 * 1832 * @fs_info: the filesystem 1833 * @bytes: number of bytes we need 1834 * @flush: how we are allowed to flush 1835 * 1836 * This will reserve bytes from the data space info. If there is not enough 1837 * space then we will attempt to flush space as specified by flush. 1838 */ 1839 int btrfs_reserve_data_bytes(struct btrfs_fs_info *fs_info, u64 bytes, 1840 enum btrfs_reserve_flush_enum flush) 1841 { 1842 struct btrfs_space_info *data_sinfo = fs_info->data_sinfo; 1843 int ret; 1844 1845 ASSERT(flush == BTRFS_RESERVE_FLUSH_DATA || 1846 flush == BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE || 1847 flush == BTRFS_RESERVE_NO_FLUSH); 1848 ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_DATA); 1849 1850 ret = __reserve_bytes(fs_info, data_sinfo, bytes, flush); 1851 if (ret == -ENOSPC) { 1852 trace_btrfs_space_reservation(fs_info, "space_info:enospc", 1853 data_sinfo->flags, bytes, 1); 1854 if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) 1855 btrfs_dump_space_info(fs_info, data_sinfo, bytes, 0); 1856 } 1857 return ret; 1858 } 1859 1860 /* Dump all the space infos when we abort a transaction due to ENOSPC. */ 1861 __cold void btrfs_dump_space_info_for_trans_abort(struct btrfs_fs_info *fs_info) 1862 { 1863 struct btrfs_space_info *space_info; 1864 1865 btrfs_info(fs_info, "dumping space info:"); 1866 list_for_each_entry(space_info, &fs_info->space_info, list) { 1867 spin_lock(&space_info->lock); 1868 __btrfs_dump_space_info(fs_info, space_info); 1869 spin_unlock(&space_info->lock); 1870 } 1871 dump_global_block_rsv(fs_info); 1872 } 1873 1874 /* 1875 * Account the unused space of all the readonly block group in the space_info. 1876 * takes mirrors into account. 1877 */ 1878 u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo) 1879 { 1880 struct btrfs_block_group *block_group; 1881 u64 free_bytes = 0; 1882 int factor; 1883 1884 /* It's df, we don't care if it's racy */ 1885 if (list_empty(&sinfo->ro_bgs)) 1886 return 0; 1887 1888 spin_lock(&sinfo->lock); 1889 list_for_each_entry(block_group, &sinfo->ro_bgs, ro_list) { 1890 spin_lock(&block_group->lock); 1891 1892 if (!block_group->ro) { 1893 spin_unlock(&block_group->lock); 1894 continue; 1895 } 1896 1897 factor = btrfs_bg_type_to_factor(block_group->flags); 1898 free_bytes += (block_group->length - 1899 block_group->used) * factor; 1900 1901 spin_unlock(&block_group->lock); 1902 } 1903 spin_unlock(&sinfo->lock); 1904 1905 return free_bytes; 1906 } 1907 1908 static u64 calc_pct_ratio(u64 x, u64 y) 1909 { 1910 int err; 1911 1912 if (!y) 1913 return 0; 1914 again: 1915 err = check_mul_overflow(100, x, &x); 1916 if (err) 1917 goto lose_precision; 1918 return div64_u64(x, y); 1919 lose_precision: 1920 x >>= 10; 1921 y >>= 10; 1922 if (!y) 1923 y = 1; 1924 goto again; 1925 } 1926 1927 /* 1928 * A reasonable buffer for unallocated space is 10 data block_groups. 1929 * If we claw this back repeatedly, we can still achieve efficient 1930 * utilization when near full, and not do too much reclaim while 1931 * always maintaining a solid buffer for workloads that quickly 1932 * allocate and pressure the unallocated space. 1933 */ 1934 static u64 calc_unalloc_target(struct btrfs_fs_info *fs_info) 1935 { 1936 u64 chunk_sz = calc_effective_data_chunk_size(fs_info); 1937 1938 return BTRFS_UNALLOC_BLOCK_GROUP_TARGET * chunk_sz; 1939 } 1940 1941 /* 1942 * The fundamental goal of automatic reclaim is to protect the filesystem's 1943 * unallocated space and thus minimize the probability of the filesystem going 1944 * read only when a metadata allocation failure causes a transaction abort. 1945 * 1946 * However, relocations happen into the space_info's unused space, therefore 1947 * automatic reclaim must also back off as that space runs low. There is no 1948 * value in doing trivial "relocations" of re-writing the same block group 1949 * into a fresh one. 1950 * 1951 * Furthermore, we want to avoid doing too much reclaim even if there are good 1952 * candidates. This is because the allocator is pretty good at filling up the 1953 * holes with writes. So we want to do just enough reclaim to try and stay 1954 * safe from running out of unallocated space but not be wasteful about it. 1955 * 1956 * Therefore, the dynamic reclaim threshold is calculated as follows: 1957 * - calculate a target unallocated amount of 5 block group sized chunks 1958 * - ratchet up the intensity of reclaim depending on how far we are from 1959 * that target by using a formula of unalloc / target to set the threshold. 1960 * 1961 * Typically with 10 block groups as the target, the discrete values this comes 1962 * out to are 0, 10, 20, ... , 80, 90, and 99. 1963 */ 1964 static int calc_dynamic_reclaim_threshold(const struct btrfs_space_info *space_info) 1965 { 1966 struct btrfs_fs_info *fs_info = space_info->fs_info; 1967 u64 unalloc = atomic64_read(&fs_info->free_chunk_space); 1968 u64 target = calc_unalloc_target(fs_info); 1969 u64 alloc = space_info->total_bytes; 1970 u64 used = btrfs_space_info_used(space_info, false); 1971 u64 unused = alloc - used; 1972 u64 want = target > unalloc ? target - unalloc : 0; 1973 u64 data_chunk_size = calc_effective_data_chunk_size(fs_info); 1974 1975 /* If we have no unused space, don't bother, it won't work anyway. */ 1976 if (unused < data_chunk_size) 1977 return 0; 1978 1979 /* Cast to int is OK because want <= target. */ 1980 return calc_pct_ratio(want, target); 1981 } 1982 1983 int btrfs_calc_reclaim_threshold(const struct btrfs_space_info *space_info) 1984 { 1985 lockdep_assert_held(&space_info->lock); 1986 1987 if (READ_ONCE(space_info->dynamic_reclaim)) 1988 return calc_dynamic_reclaim_threshold(space_info); 1989 return READ_ONCE(space_info->bg_reclaim_threshold); 1990 } 1991 1992 /* 1993 * Under "urgent" reclaim, we will reclaim even fresh block groups that have 1994 * recently seen successful allocations, as we are desperate to reclaim 1995 * whatever we can to avoid ENOSPC in a transaction leading to a readonly fs. 1996 */ 1997 static bool is_reclaim_urgent(struct btrfs_space_info *space_info) 1998 { 1999 struct btrfs_fs_info *fs_info = space_info->fs_info; 2000 u64 unalloc = atomic64_read(&fs_info->free_chunk_space); 2001 u64 data_chunk_size = calc_effective_data_chunk_size(fs_info); 2002 2003 return unalloc < data_chunk_size; 2004 } 2005 2006 static void do_reclaim_sweep(struct btrfs_space_info *space_info, int raid) 2007 { 2008 struct btrfs_block_group *bg; 2009 int thresh_pct; 2010 bool try_again = true; 2011 bool urgent; 2012 2013 spin_lock(&space_info->lock); 2014 urgent = is_reclaim_urgent(space_info); 2015 thresh_pct = btrfs_calc_reclaim_threshold(space_info); 2016 spin_unlock(&space_info->lock); 2017 2018 down_read(&space_info->groups_sem); 2019 again: 2020 list_for_each_entry(bg, &space_info->block_groups[raid], list) { 2021 u64 thresh; 2022 bool reclaim = false; 2023 2024 btrfs_get_block_group(bg); 2025 spin_lock(&bg->lock); 2026 thresh = mult_perc(bg->length, thresh_pct); 2027 if (bg->used < thresh && bg->reclaim_mark) { 2028 try_again = false; 2029 reclaim = true; 2030 } 2031 bg->reclaim_mark++; 2032 spin_unlock(&bg->lock); 2033 if (reclaim) 2034 btrfs_mark_bg_to_reclaim(bg); 2035 btrfs_put_block_group(bg); 2036 } 2037 2038 /* 2039 * In situations where we are very motivated to reclaim (low unalloc) 2040 * use two passes to make the reclaim mark check best effort. 2041 * 2042 * If we have any staler groups, we don't touch the fresher ones, but if we 2043 * really need a block group, do take a fresh one. 2044 */ 2045 if (try_again && urgent) { 2046 try_again = false; 2047 goto again; 2048 } 2049 2050 up_read(&space_info->groups_sem); 2051 } 2052 2053 void btrfs_space_info_update_reclaimable(struct btrfs_space_info *space_info, s64 bytes) 2054 { 2055 u64 chunk_sz = calc_effective_data_chunk_size(space_info->fs_info); 2056 2057 lockdep_assert_held(&space_info->lock); 2058 space_info->reclaimable_bytes += bytes; 2059 2060 if (space_info->reclaimable_bytes >= chunk_sz) 2061 btrfs_set_periodic_reclaim_ready(space_info, true); 2062 } 2063 2064 void btrfs_set_periodic_reclaim_ready(struct btrfs_space_info *space_info, bool ready) 2065 { 2066 lockdep_assert_held(&space_info->lock); 2067 if (!READ_ONCE(space_info->periodic_reclaim)) 2068 return; 2069 if (ready != space_info->periodic_reclaim_ready) { 2070 space_info->periodic_reclaim_ready = ready; 2071 if (!ready) 2072 space_info->reclaimable_bytes = 0; 2073 } 2074 } 2075 2076 bool btrfs_should_periodic_reclaim(struct btrfs_space_info *space_info) 2077 { 2078 bool ret; 2079 2080 if (space_info->flags & BTRFS_BLOCK_GROUP_SYSTEM) 2081 return false; 2082 if (!READ_ONCE(space_info->periodic_reclaim)) 2083 return false; 2084 2085 spin_lock(&space_info->lock); 2086 ret = space_info->periodic_reclaim_ready; 2087 btrfs_set_periodic_reclaim_ready(space_info, false); 2088 spin_unlock(&space_info->lock); 2089 2090 return ret; 2091 } 2092 2093 void btrfs_reclaim_sweep(const struct btrfs_fs_info *fs_info) 2094 { 2095 int raid; 2096 struct btrfs_space_info *space_info; 2097 2098 list_for_each_entry(space_info, &fs_info->space_info, list) { 2099 if (!btrfs_should_periodic_reclaim(space_info)) 2100 continue; 2101 for (raid = 0; raid < BTRFS_NR_RAID_TYPES; raid++) 2102 do_reclaim_sweep(space_info, raid); 2103 } 2104 } 2105 2106 void btrfs_return_free_space(struct btrfs_space_info *space_info, u64 len) 2107 { 2108 struct btrfs_fs_info *fs_info = space_info->fs_info; 2109 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; 2110 2111 lockdep_assert_held(&space_info->lock); 2112 2113 /* Prioritize the global reservation to receive the freed space. */ 2114 if (global_rsv->space_info != space_info) 2115 goto grant; 2116 2117 spin_lock(&global_rsv->lock); 2118 if (!global_rsv->full) { 2119 u64 to_add = min(len, global_rsv->size - global_rsv->reserved); 2120 2121 global_rsv->reserved += to_add; 2122 btrfs_space_info_update_bytes_may_use(space_info, to_add); 2123 if (global_rsv->reserved >= global_rsv->size) 2124 global_rsv->full = 1; 2125 len -= to_add; 2126 } 2127 spin_unlock(&global_rsv->lock); 2128 2129 grant: 2130 /* Add to any tickets we may have. */ 2131 if (len) 2132 btrfs_try_granting_tickets(fs_info, space_info); 2133 } 2134