// SPDX-License-Identifier: GPL-2.0

#include <linux/jiffies.h>
#include <linux/kernel.h>
#include <linux/ktime.h>
#include <linux/list.h>
#include <linux/math64.h>
#include <linux/sizes.h>
#include <linux/workqueue.h>
#include "ctree.h"
#include "block-group.h"
#include "discard.h"
#include "free-space-cache.h"
#include "fs.h"

/*
 * This contains the logic to handle async discard.
 *
 * Async discard manages trimming of free space outside of transaction commit.
 * Discarding is done by managing the block_groups on an LRU list based on free
 * space recency. Two passes are used to first prioritize discarding extents
 * and then to give trimming in the bitmaps the best opportunity to coalesce.
 * The block_groups are maintained on multiple lists to allow for multiple
 * passes with different discard filter requirements. A delayed work item is
 * used to manage discarding with a timeout determined by a max of the delay
 * incurred by the iops rate limit, the byte rate limit, and the max delay of
 * BTRFS_DISCARD_MAX_DELAY_MSEC.
 *
 * Note, this only keeps track of block_groups that are explicitly for data.
 * Mixed block_groups are not supported.
 *
 * The first list is special to manage discarding of fully free block groups.
 * This is necessary because we issue a final trim for a fully free block group
 * after forgetting it. When a block group becomes unused, instead of directly
 * being added to the unused_bgs list, we add it to this first list. Then
 * from there, if it becomes fully discarded, we place it onto the unused_bgs
 * list.
 *
 * The in-memory free space cache serves as the backing state for discard.
 * Consequently this means there is no persistence. We opt to load all the
 * block groups in as not discarded, so the mount case degenerates to the
 * crashing case.
 *
 * As the free space cache uses bitmaps, there exists a tradeoff between
 * ease/efficiency for find_free_extent() and the accuracy of discard state.
 * Here we opt to let untrimmed regions merge with everything while only letting
 * trimmed regions merge with other trimmed regions. This can cause
 * overtrimming, but the coalescing benefit seems to be worth it. Additionally,
 * bitmap state is tracked as a whole. If we're able to fully trim a bitmap,
 * the trimmed flag is set on the bitmap. Otherwise, if an allocation comes in,
 * this resets the state and we will retry trimming the whole bitmap. This is a
 * tradeoff between discard state accuracy and the cost of accounting.
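 *
 * As an illustration of the rate limiting knobs (using the defaults set up in
 * btrfs_discard_init() below): iops_limit defaults to BTRFS_DISCARD_MAX_IOPS
 * (10), which yields a base delay of MSEC_PER_SEC / 10 = 100ms between work
 * items, i.e. at most roughly ten discard requests per second. kbps_limit
 * defaults to 0, so the byte rate limit only kicks in once it is explicitly
 * configured.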
 */

/* This is an initial delay to give some chance for block reuse */
#define BTRFS_DISCARD_DELAY		(120ULL * NSEC_PER_SEC)
#define BTRFS_DISCARD_UNUSED_DELAY	(10ULL * NSEC_PER_SEC)

/* Target completion latency of discarding all discardable extents */
#define BTRFS_DISCARD_TARGET_MSEC	(6 * 60 * 60UL * MSEC_PER_SEC)
#define BTRFS_DISCARD_MIN_DELAY_MSEC	(1UL)
#define BTRFS_DISCARD_MAX_DELAY_MSEC	(1000UL)
#define BTRFS_DISCARD_MAX_IOPS		(10U)

/* Monotonically decreasing minimum length filters after index 0 */
static int discard_minlen[BTRFS_NR_DISCARD_LISTS] = {
	0,
	BTRFS_ASYNC_DISCARD_MAX_FILTER,
	BTRFS_ASYNC_DISCARD_MIN_FILTER
};

static struct list_head *get_discard_list(struct btrfs_discard_ctl *discard_ctl,
					  struct btrfs_block_group *block_group)
{
	return &discard_ctl->discard_list[block_group->discard_index];
}

static void __add_to_discard_list(struct btrfs_discard_ctl *discard_ctl,
				  struct btrfs_block_group *block_group)
{
	lockdep_assert_held(&discard_ctl->lock);
	if (!btrfs_run_discard_work(discard_ctl))
		return;

	if (list_empty(&block_group->discard_list) ||
	    block_group->discard_index == BTRFS_DISCARD_INDEX_UNUSED) {
		if (block_group->discard_index == BTRFS_DISCARD_INDEX_UNUSED)
			block_group->discard_index = BTRFS_DISCARD_INDEX_START;
		block_group->discard_eligible_time = (ktime_get_ns() +
						      BTRFS_DISCARD_DELAY);
		block_group->discard_state = BTRFS_DISCARD_RESET_CURSOR;
	}
	if (list_empty(&block_group->discard_list))
		btrfs_get_block_group(block_group);

	list_move_tail(&block_group->discard_list,
		       get_discard_list(discard_ctl, block_group));
}

static void add_to_discard_list(struct btrfs_discard_ctl *discard_ctl,
				struct btrfs_block_group *block_group)
{
	if (!btrfs_is_block_group_data_only(block_group))
		return;

	spin_lock(&discard_ctl->lock);
	__add_to_discard_list(discard_ctl, block_group);
	spin_unlock(&discard_ctl->lock);
}

static void add_to_discard_unused_list(struct btrfs_discard_ctl *discard_ctl,
				       struct btrfs_block_group *block_group)
{
	bool queued;

	spin_lock(&discard_ctl->lock);

	queued = !list_empty(&block_group->discard_list);

	if (!btrfs_run_discard_work(discard_ctl)) {
		spin_unlock(&discard_ctl->lock);
		return;
	}

	list_del_init(&block_group->discard_list);

	block_group->discard_index = BTRFS_DISCARD_INDEX_UNUSED;
	block_group->discard_eligible_time = (ktime_get_ns() +
					      BTRFS_DISCARD_UNUSED_DELAY);
	block_group->discard_state = BTRFS_DISCARD_RESET_CURSOR;
	if (!queued)
		btrfs_get_block_group(block_group);
	list_add_tail(&block_group->discard_list,
		      &discard_ctl->discard_list[BTRFS_DISCARD_INDEX_UNUSED]);

	spin_unlock(&discard_ctl->lock);
}

static bool remove_from_discard_list(struct btrfs_discard_ctl *discard_ctl,
				     struct btrfs_block_group *block_group)
{
	bool running = false;
	bool queued = false;

	spin_lock(&discard_ctl->lock);

	if (block_group == discard_ctl->block_group) {
		running = true;
		discard_ctl->block_group = NULL;
	}

	block_group->discard_eligible_time = 0;
	queued = !list_empty(&block_group->discard_list);
	list_del_init(&block_group->discard_list);
	/*
	 * If the block group is currently running in the discard workfn, we
	 * don't want to deref it, since it's still being used by the workfn.
	 * The workfn will notice this case and deref the block group when it
	 * is finished.
	 */
	if (queued && !running)
		btrfs_put_block_group(block_group);

	spin_unlock(&discard_ctl->lock);

	return running;
}

/*
 * Find the block_group that's up next for discarding.
 *
 * @discard_ctl: discard control
 * @now: current time
 *
 * Iterate over the discard lists to find the next block_group up for
 * discarding, checking the discard_eligible_time of each block_group.
 */
static struct btrfs_block_group *find_next_block_group(
					struct btrfs_discard_ctl *discard_ctl,
					u64 now)
{
	struct btrfs_block_group *ret_block_group = NULL, *block_group;
	int i;

	for (i = 0; i < BTRFS_NR_DISCARD_LISTS; i++) {
		struct list_head *discard_list = &discard_ctl->discard_list[i];

		if (!list_empty(discard_list)) {
			block_group = list_first_entry(discard_list,
						       struct btrfs_block_group,
						       discard_list);

			if (!ret_block_group)
				ret_block_group = block_group;

			if (ret_block_group->discard_eligible_time < now)
				break;

			if (ret_block_group->discard_eligible_time >
			    block_group->discard_eligible_time)
				ret_block_group = block_group;
		}
	}

	return ret_block_group;
}

/*
 * Look up the next block group and set it for use.
 *
 * @discard_ctl: discard control
 * @discard_state: the discard_state of the block_group after state management
 * @discard_index: the discard_index of the block_group after state management
 * @now: time when discard was invoked, in ns
 *
 * Wrap find_next_block_group() and set the block_group to be in use.
 * @discard_state's control flow is managed here. Variables related to
 * @discard_state are reset here as needed (e.g. @discard_cursor). @discard_state
 * and @discard_index are remembered as they may change while we're discarding,
 * but we want the discard to execute in the context determined here.
 */
static struct btrfs_block_group *peek_discard_list(
					struct btrfs_discard_ctl *discard_ctl,
					enum btrfs_discard_state *discard_state,
					int *discard_index, u64 now)
{
	struct btrfs_block_group *block_group;

	spin_lock(&discard_ctl->lock);
again:
	block_group = find_next_block_group(discard_ctl, now);

	if (block_group && now >= block_group->discard_eligible_time) {
		if (block_group->discard_index == BTRFS_DISCARD_INDEX_UNUSED &&
		    block_group->used != 0) {
			if (btrfs_is_block_group_data_only(block_group)) {
				__add_to_discard_list(discard_ctl, block_group);
			} else {
				list_del_init(&block_group->discard_list);
				btrfs_put_block_group(block_group);
			}
			goto again;
		}
		if (block_group->discard_state == BTRFS_DISCARD_RESET_CURSOR) {
			block_group->discard_cursor = block_group->start;
			block_group->discard_state = BTRFS_DISCARD_EXTENTS;
		}
		discard_ctl->block_group = block_group;
	}
	if (block_group) {
		*discard_state = block_group->discard_state;
		*discard_index = block_group->discard_index;
	}
	spin_unlock(&discard_ctl->lock);

	return block_group;
}

/*
 * Update a block group's filters.
 *
 * @block_group: block group of interest
 * @bytes: recently freed region size after coalescing
 *
 * Async discard maintains multiple lists with progressively smaller filters
 * to prioritize discarding based on size. Should a free space region that
 * matches a larger filter be returned to the free_space_cache, prioritize that
 * discard by moving @block_group to the proper filter.
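 *
 * For example, a block group that has aged onto the last list (minimum length
 * filter of BTRFS_ASYNC_DISCARD_MIN_FILTER) is moved back to the
 * BTRFS_DISCARD_INDEX_START list as soon as a coalesced free region of at
 * least BTRFS_ASYNC_DISCARD_MAX_FILTER bytes is returned to its free space
 * cache.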
 */
void btrfs_discard_check_filter(struct btrfs_block_group *block_group,
				u64 bytes)
{
	struct btrfs_discard_ctl *discard_ctl;

	if (!block_group ||
	    !btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC))
		return;

	discard_ctl = &block_group->fs_info->discard_ctl;

	if (block_group->discard_index > BTRFS_DISCARD_INDEX_START &&
	    bytes >= discard_minlen[block_group->discard_index - 1]) {
		int i;

		remove_from_discard_list(discard_ctl, block_group);

		for (i = BTRFS_DISCARD_INDEX_START; i < BTRFS_NR_DISCARD_LISTS;
		     i++) {
			if (bytes >= discard_minlen[i]) {
				block_group->discard_index = i;
				add_to_discard_list(discard_ctl, block_group);
				break;
			}
		}
	}
}

/*
 * Move a block group along the discard lists.
 *
 * @discard_ctl: discard control
 * @block_group: block_group of interest
 *
 * Increment @block_group's discard_index. If it falls off the list, let it be.
 * Otherwise add it back to the appropriate list.
 */
static void btrfs_update_discard_index(struct btrfs_discard_ctl *discard_ctl,
				       struct btrfs_block_group *block_group)
{
	block_group->discard_index++;
	if (block_group->discard_index == BTRFS_NR_DISCARD_LISTS) {
		block_group->discard_index = 1;
		return;
	}

	add_to_discard_list(discard_ctl, block_group);
}

/*
 * Remove a block_group from the discard lists.
 *
 * @discard_ctl: discard control
 * @block_group: block_group of interest
 *
 * Remove @block_group from the discard lists. If necessary, wait on the
 * current work and then reschedule the delayed work.
 */
void btrfs_discard_cancel_work(struct btrfs_discard_ctl *discard_ctl,
			       struct btrfs_block_group *block_group)
{
	if (remove_from_discard_list(discard_ctl, block_group)) {
		cancel_delayed_work_sync(&discard_ctl->work);
		btrfs_discard_schedule_work(discard_ctl, true);
	}
}

/*
 * Handles queuing the block_groups.
 *
 * @discard_ctl: discard control
 * @block_group: block_group of interest
 *
 * Maintain the LRU order of the discard lists.
 */
void btrfs_discard_queue_work(struct btrfs_discard_ctl *discard_ctl,
			      struct btrfs_block_group *block_group)
{
	if (!block_group || !btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC))
		return;

	if (block_group->used == 0)
		add_to_discard_unused_list(discard_ctl, block_group);
	else
		add_to_discard_list(discard_ctl, block_group);

	if (!delayed_work_pending(&discard_ctl->work))
		btrfs_discard_schedule_work(discard_ctl, false);
}

static void __btrfs_discard_schedule_work(struct btrfs_discard_ctl *discard_ctl,
					  u64 now, bool override)
{
	struct btrfs_block_group *block_group;

	if (!btrfs_run_discard_work(discard_ctl))
		return;
	if (!override && delayed_work_pending(&discard_ctl->work))
		return;

	block_group = find_next_block_group(discard_ctl, now);
	if (block_group) {
		u64 delay = discard_ctl->delay_ms * NSEC_PER_MSEC;
		u32 kbps_limit = READ_ONCE(discard_ctl->kbps_limit);

		/*
		 * A single delayed workqueue item is responsible for
		 * discarding, so we can manage the bytes rate limit by keeping
		 * track of the previous discard.
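		 *
		 * For example, with kbps_limit set to 1024 (1MiB/s) and a
		 * previous discard of 8MiB, bps_delay evaluates to
		 * 8MiB * NSEC_PER_SEC / 1MiB = 8 seconds, so the max() below
		 * picks it over the (at most one second) iops based delay.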
		 */
		if (kbps_limit && discard_ctl->prev_discard) {
			u64 bps_limit = ((u64)kbps_limit) * SZ_1K;
			u64 bps_delay = div64_u64(discard_ctl->prev_discard *
						  NSEC_PER_SEC, bps_limit);

			delay = max(delay, bps_delay);
		}

		/*
		 * This timeout is to hopefully prevent immediate discarding
		 * in a recently allocated block group.
		 */
		if (now < block_group->discard_eligible_time) {
			u64 bg_timeout = block_group->discard_eligible_time - now;

			delay = max(delay, bg_timeout);
		}

		if (override && discard_ctl->prev_discard) {
			u64 elapsed = now - discard_ctl->prev_discard_time;

			if (delay > elapsed)
				delay -= elapsed;
			else
				delay = 0;
		}

		mod_delayed_work(discard_ctl->discard_workers,
				 &discard_ctl->work, nsecs_to_jiffies(delay));
	}
}

/*
 * Responsible for scheduling the discard work.
 *
 * @discard_ctl: discard control
 * @override: override the current timer
 *
 * Discards are issued by a delayed workqueue item. @override is used to
 * update the current delay as the baseline delay interval is reevaluated on
 * transaction commit. This is also maxed with any other rate limit.
 */
void btrfs_discard_schedule_work(struct btrfs_discard_ctl *discard_ctl,
				 bool override)
{
	const u64 now = ktime_get_ns();

	spin_lock(&discard_ctl->lock);
	__btrfs_discard_schedule_work(discard_ctl, now, override);
	spin_unlock(&discard_ctl->lock);
}

/*
 * Determine next step of a block_group.
 *
 * @discard_ctl: discard control
 * @block_group: block_group of interest
 *
 * Determine the next step for a block group after it's finished going through
 * a pass on a discard list. If it is unused and fully trimmed, we can mark it
 * unused and send it to the unused_bgs path. Otherwise, pass it onto the
 * appropriate filter list or let it fall off.
 */
static void btrfs_finish_discard_pass(struct btrfs_discard_ctl *discard_ctl,
				      struct btrfs_block_group *block_group)
{
	remove_from_discard_list(discard_ctl, block_group);

	if (block_group->used == 0) {
		if (btrfs_is_free_space_trimmed(block_group))
			btrfs_mark_bg_unused(block_group);
		else
			add_to_discard_unused_list(discard_ctl, block_group);
	} else {
		btrfs_update_discard_index(discard_ctl, block_group);
	}
}

/*
 * Discard work queue callback
 *
 * @work: work
 *
 * Find the next block_group to start discarding and then discard a single
 * region. It does this in a two-pass fashion: first extents and second
 * bitmaps. Completely discarded block groups are sent to the unused_bgs path.
 */
static void btrfs_discard_workfn(struct work_struct *work)
{
	struct btrfs_discard_ctl *discard_ctl;
	struct btrfs_block_group *block_group;
	enum btrfs_discard_state discard_state;
	int discard_index = 0;
	u64 trimmed = 0;
	u64 minlen = 0;
	u64 now = ktime_get_ns();

	discard_ctl = container_of(work, struct btrfs_discard_ctl, work.work);

	block_group = peek_discard_list(discard_ctl, &discard_state,
					&discard_index, now);
	if (!block_group || !btrfs_run_discard_work(discard_ctl))
		return;
	if (now < block_group->discard_eligible_time) {
		btrfs_discard_schedule_work(discard_ctl, false);
		return;
	}

	/* Perform discarding */
	minlen = discard_minlen[discard_index];

	if (discard_state == BTRFS_DISCARD_BITMAPS) {
		u64 maxlen = 0;

		/*
		 * Use the previous level's minimum discard length as the max
		 * length filter. In the case something is added to make a
		 * region go beyond the max filter, the entire bitmap is set
		 * back to BTRFS_TRIM_STATE_UNTRIMMED.
		 */
		if (discard_index != BTRFS_DISCARD_INDEX_UNUSED)
			maxlen = discard_minlen[discard_index - 1];

		btrfs_trim_block_group_bitmaps(block_group, &trimmed,
					       block_group->discard_cursor,
					       btrfs_block_group_end(block_group),
					       minlen, maxlen, true);
		discard_ctl->discard_bitmap_bytes += trimmed;
	} else {
		btrfs_trim_block_group_extents(block_group, &trimmed,
					       block_group->discard_cursor,
					       btrfs_block_group_end(block_group),
					       minlen, true);
		discard_ctl->discard_extent_bytes += trimmed;
	}

	/* Determine next steps for a block_group */
	if (block_group->discard_cursor >= btrfs_block_group_end(block_group)) {
		if (discard_state == BTRFS_DISCARD_BITMAPS) {
			btrfs_finish_discard_pass(discard_ctl, block_group);
		} else {
			block_group->discard_cursor = block_group->start;
			spin_lock(&discard_ctl->lock);
			if (block_group->discard_state !=
			    BTRFS_DISCARD_RESET_CURSOR)
				block_group->discard_state =
							BTRFS_DISCARD_BITMAPS;
			spin_unlock(&discard_ctl->lock);
		}
	}

	now = ktime_get_ns();
	spin_lock(&discard_ctl->lock);
	discard_ctl->prev_discard = trimmed;
	discard_ctl->prev_discard_time = now;
	/*
	 * If the block group was removed from the discard list while it was
	 * running in this workfn, then we didn't deref it, since this function
	 * still owned that reference. But we set the discard_ctl->block_group
	 * back to NULL, so we can use that condition to know that now we need
	 * to deref the block_group.
	 */
	if (discard_ctl->block_group == NULL)
		btrfs_put_block_group(block_group);
	discard_ctl->block_group = NULL;
	__btrfs_discard_schedule_work(discard_ctl, now, false);
	spin_unlock(&discard_ctl->lock);
}

/*
 * Determine if async discard should be running.
 *
 * @discard_ctl: discard control
 *
 * Check if the file system is writeable and BTRFS_FS_DISCARD_RUNNING is set.
 */
bool btrfs_run_discard_work(struct btrfs_discard_ctl *discard_ctl)
{
	struct btrfs_fs_info *fs_info = container_of(discard_ctl,
						     struct btrfs_fs_info,
						     discard_ctl);

	return (!(fs_info->sb->s_flags & SB_RDONLY) &&
		test_bit(BTRFS_FS_DISCARD_RUNNING, &fs_info->flags));
}

/*
 * Recalculate the base delay.
 *
 * @discard_ctl: discard control
 *
 * Recalculate the base delay, which is based off the total number of
 * discardable_extents. Clamp this between the lower_limit (iops_limit or 1ms)
 * and the upper_limit (BTRFS_DISCARD_MAX_DELAY_MSEC).
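 *
 * For example, with the default iops_limit of BTRFS_DISCARD_MAX_IOPS (10),
 * the delay works out to MSEC_PER_SEC / 10 = 100ms. With the iops limit
 * disabled and e.g. 100,000 discardable extents outstanding, it would instead
 * be BTRFS_DISCARD_TARGET_MSEC / 100,000 = 216ms, still within the clamp.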
 */
void btrfs_discard_calc_delay(struct btrfs_discard_ctl *discard_ctl)
{
	s32 discardable_extents;
	s64 discardable_bytes;
	u32 iops_limit;
	unsigned long delay;

	discardable_extents = atomic_read(&discard_ctl->discardable_extents);
	if (!discardable_extents)
		return;

	spin_lock(&discard_ctl->lock);

	/*
	 * The following is to fix a potential -1 discrepancy that we're not
	 * sure how to reproduce. But given that this is the only place that
	 * utilizes these numbers and this is only called from
	 * btrfs_finish_extent_commit() which is synchronized, we can correct
	 * here.
	 */
	if (discardable_extents < 0)
		atomic_add(-discardable_extents,
			   &discard_ctl->discardable_extents);

	discardable_bytes = atomic64_read(&discard_ctl->discardable_bytes);
	if (discardable_bytes < 0)
		atomic64_add(-discardable_bytes,
			     &discard_ctl->discardable_bytes);

	if (discardable_extents <= 0) {
		spin_unlock(&discard_ctl->lock);
		return;
	}

	iops_limit = READ_ONCE(discard_ctl->iops_limit);
	if (iops_limit)
		delay = MSEC_PER_SEC / iops_limit;
	else
		delay = BTRFS_DISCARD_TARGET_MSEC / discardable_extents;

	delay = clamp(delay, BTRFS_DISCARD_MIN_DELAY_MSEC,
		      BTRFS_DISCARD_MAX_DELAY_MSEC);
	discard_ctl->delay_ms = delay;

	spin_unlock(&discard_ctl->lock);
}

/*
 * Propagate discard counters.
 *
 * @block_group: block_group of interest
 *
 * Propagate deltas of counters up to the discard_ctl. It maintains a current
 * counter and a previous counter, passing the delta up to the global stat.
 * Then the current counter value becomes the previous counter value.
 */
void btrfs_discard_update_discardable(struct btrfs_block_group *block_group)
{
	struct btrfs_free_space_ctl *ctl;
	struct btrfs_discard_ctl *discard_ctl;
	s32 extents_delta;
	s64 bytes_delta;

	if (!block_group ||
	    !btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC) ||
	    !btrfs_is_block_group_data_only(block_group))
		return;

	ctl = block_group->free_space_ctl;
	discard_ctl = &block_group->fs_info->discard_ctl;

	lockdep_assert_held(&ctl->tree_lock);
	extents_delta = ctl->discardable_extents[BTRFS_STAT_CURR] -
			ctl->discardable_extents[BTRFS_STAT_PREV];
	if (extents_delta) {
		atomic_add(extents_delta, &discard_ctl->discardable_extents);
		ctl->discardable_extents[BTRFS_STAT_PREV] =
			ctl->discardable_extents[BTRFS_STAT_CURR];
	}

	bytes_delta = ctl->discardable_bytes[BTRFS_STAT_CURR] -
		      ctl->discardable_bytes[BTRFS_STAT_PREV];
	if (bytes_delta) {
		atomic64_add(bytes_delta, &discard_ctl->discardable_bytes);
		ctl->discardable_bytes[BTRFS_STAT_PREV] =
			ctl->discardable_bytes[BTRFS_STAT_CURR];
	}
}

/*
 * Punt unused_bgs list to discard lists.
 *
 * @fs_info: fs_info of interest
 *
 * The unused_bgs list needs to be punted to the discard lists because the
 * order of operations is changed. In the normal synchronous discard path, the
 * block groups are trimmed via a single large trim in transaction commit. This
 * is ultimately what we are trying to avoid with asynchronous discard. Thus,
 * it must be done before going down the unused_bgs path.
 */
void btrfs_discard_punt_unused_bgs_list(struct btrfs_fs_info *fs_info)
{
	struct btrfs_block_group *block_group, *next;

	spin_lock(&fs_info->unused_bgs_lock);
	/* We enabled async discard, so punt all to the queue */
	list_for_each_entry_safe(block_group, next, &fs_info->unused_bgs,
				 bg_list) {
		list_del_init(&block_group->bg_list);
		btrfs_discard_queue_work(&fs_info->discard_ctl, block_group);
		/*
		 * This put is for the get done by btrfs_mark_bg_unused().
		 * Queueing discard incremented it for discard's reference.
		 */
		btrfs_put_block_group(block_group);
	}
	spin_unlock(&fs_info->unused_bgs_lock);
}

/*
 * Purge discard lists.
 *
 * @discard_ctl: discard control
 *
 * If we are disabling async discard, we may have intercepted block groups that
 * are completely free and ready for the unused_bgs path. As discarding will
 * now happen in transaction commit or not at all, we can safely mark the
 * corresponding block groups as unused and they will be sent on their merry
 * way to the unused_bgs list.
 */
static void btrfs_discard_purge_list(struct btrfs_discard_ctl *discard_ctl)
{
	struct btrfs_block_group *block_group, *next;
	int i;

	spin_lock(&discard_ctl->lock);
	for (i = 0; i < BTRFS_NR_DISCARD_LISTS; i++) {
		list_for_each_entry_safe(block_group, next,
					 &discard_ctl->discard_list[i],
					 discard_list) {
			list_del_init(&block_group->discard_list);
			spin_unlock(&discard_ctl->lock);
			if (block_group->used == 0)
				btrfs_mark_bg_unused(block_group);
			spin_lock(&discard_ctl->lock);
			btrfs_put_block_group(block_group);
		}
	}
	spin_unlock(&discard_ctl->lock);
}

void btrfs_discard_resume(struct btrfs_fs_info *fs_info)
{
	if (!btrfs_test_opt(fs_info, DISCARD_ASYNC)) {
		btrfs_discard_cleanup(fs_info);
		return;
	}

	btrfs_discard_punt_unused_bgs_list(fs_info);

	set_bit(BTRFS_FS_DISCARD_RUNNING, &fs_info->flags);
}

void btrfs_discard_stop(struct btrfs_fs_info *fs_info)
{
	clear_bit(BTRFS_FS_DISCARD_RUNNING, &fs_info->flags);
}

void btrfs_discard_init(struct btrfs_fs_info *fs_info)
{
	struct btrfs_discard_ctl *discard_ctl = &fs_info->discard_ctl;
	int i;

	spin_lock_init(&discard_ctl->lock);
	INIT_DELAYED_WORK(&discard_ctl->work, btrfs_discard_workfn);

	for (i = 0; i < BTRFS_NR_DISCARD_LISTS; i++)
		INIT_LIST_HEAD(&discard_ctl->discard_list[i]);

	discard_ctl->prev_discard = 0;
	discard_ctl->prev_discard_time = 0;
	atomic_set(&discard_ctl->discardable_extents, 0);
	atomic64_set(&discard_ctl->discardable_bytes, 0);
	discard_ctl->max_discard_size = BTRFS_ASYNC_DISCARD_DEFAULT_MAX_SIZE;
	discard_ctl->delay_ms = BTRFS_DISCARD_MAX_DELAY_MSEC;
	discard_ctl->iops_limit = BTRFS_DISCARD_MAX_IOPS;
	discard_ctl->kbps_limit = 0;
	discard_ctl->discard_extent_bytes = 0;
	discard_ctl->discard_bitmap_bytes = 0;
	atomic64_set(&discard_ctl->discard_bytes_saved, 0);
}

void btrfs_discard_cleanup(struct btrfs_fs_info *fs_info)
{
	btrfs_discard_stop(fs_info);
	cancel_delayed_work_sync(&fs_info->discard_ctl.work);
	btrfs_discard_purge_list(&fs_info->discard_ctl);
}