1 // SPDX-License-Identifier: GPL-2.0
2
3 #include <linux/jiffies.h>
4 #include <linux/kernel.h>
5 #include <linux/ktime.h>
6 #include <linux/list.h>
7 #include <linux/math64.h>
8 #include <linux/sizes.h>
9 #include <linux/workqueue.h>
10 #include "ctree.h"
11 #include "block-group.h"
12 #include "discard.h"
13 #include "free-space-cache.h"
14 #include "fs.h"
15
16 /*
17 * This contains the logic to handle async discard.
18 *
19 * Async discard manages trimming of free space outside of transaction commit.
20 * Discarding is done by managing the block_groups on a LRU list based on free
21 * space recency. Two passes are used to first prioritize discarding extents
22 * and then allow for trimming in the bitmap the best opportunity to coalesce.
23 * The block_groups are maintained on multiple lists to allow for multiple
24 * passes with different discard filter requirements. A delayed work item is
25 * used to manage discarding with timeout determined by a max of the delay
26 * incurred by the iops rate limit, the byte rate limit, and the max delay of
27 * BTRFS_DISCARD_MAX_DELAY.
28 *
29 * Note, this only keeps track of block_groups that are explicitly for data.
30 * Mixed block_groups are not supported.
31 *
32 * The first list is special to manage discarding of fully free block groups.
 * This is necessary because we issue a final trim for a fully free block group
34 * after forgetting it. When a block group becomes unused, instead of directly
35 * being added to the unused_bgs list, we add it to this first list. Then
36 * from there, if it becomes fully discarded, we place it onto the unused_bgs
37 * list.
38 *
39 * The in-memory free space cache serves as the backing state for discard.
40 * Consequently this means there is no persistence. We opt to load all the
41 * block groups in as not discarded, so the mount case degenerates to the
42 * crashing case.
43 *
44 * As the free space cache uses bitmaps, there exists a tradeoff between
45 * ease/efficiency for find_free_extent() and the accuracy of discard state.
46 * Here we opt to let untrimmed regions merge with everything while only letting
47 * trimmed regions merge with other trimmed regions. This can cause
48 * overtrimming, but the coalescing benefit seems to be worth it. Additionally,
49 * bitmap state is tracked as a whole. If we're able to fully trim a bitmap,
50 * the trimmed flag is set on the bitmap. Otherwise, if an allocation comes in,
51 * this resets the state and we will retry trimming the whole bitmap. This is a
52 * tradeoff between discard state accuracy and the cost of accounting.
53 */
54
55 /* This is an initial delay to give some chance for block reuse */
56 #define BTRFS_DISCARD_DELAY (120ULL * NSEC_PER_SEC)
57 #define BTRFS_DISCARD_UNUSED_DELAY (10ULL * NSEC_PER_SEC)
58
59 #define BTRFS_DISCARD_MIN_DELAY_MSEC (1UL)
60 #define BTRFS_DISCARD_MAX_DELAY_MSEC (1000UL)
61 #define BTRFS_DISCARD_MAX_IOPS (1000U)
62
63 /* Monotonically decreasing minimum length filters after index 0 */
64 static int discard_minlen[BTRFS_NR_DISCARD_LISTS] = {
65 0,
66 BTRFS_ASYNC_DISCARD_MAX_FILTER,
67 BTRFS_ASYNC_DISCARD_MIN_FILTER
68 };
69
get_discard_list(struct btrfs_discard_ctl * discard_ctl,const struct btrfs_block_group * block_group)70 static struct list_head *get_discard_list(struct btrfs_discard_ctl *discard_ctl,
71 const struct btrfs_block_group *block_group)
72 {
73 return &discard_ctl->discard_list[block_group->discard_index];
74 }
75
76 /*
77 * Determine if async discard should be running.
78 *
79 * @discard_ctl: discard control
80 *
81 * Check if the file system is writeable and BTRFS_FS_DISCARD_RUNNING is set.
82 */
btrfs_run_discard_work(const struct btrfs_discard_ctl * discard_ctl)83 static bool btrfs_run_discard_work(const struct btrfs_discard_ctl *discard_ctl)
84 {
85 struct btrfs_fs_info *fs_info = container_of(discard_ctl,
86 struct btrfs_fs_info,
87 discard_ctl);
88
89 return (!(fs_info->sb->s_flags & SB_RDONLY) &&
90 test_bit(BTRFS_FS_DISCARD_RUNNING, &fs_info->flags));
91 }
92
__add_to_discard_list(struct btrfs_discard_ctl * discard_ctl,struct btrfs_block_group * block_group)93 static void __add_to_discard_list(struct btrfs_discard_ctl *discard_ctl,
94 struct btrfs_block_group *block_group)
95 {
96 lockdep_assert_held(&discard_ctl->lock);
97
98 if (list_empty(&block_group->discard_list) ||
99 block_group->discard_index == BTRFS_DISCARD_INDEX_UNUSED) {
100 if (block_group->discard_index == BTRFS_DISCARD_INDEX_UNUSED)
101 block_group->discard_index = BTRFS_DISCARD_INDEX_START;
102 block_group->discard_eligible_time = (ktime_get_ns() +
103 BTRFS_DISCARD_DELAY);
104 block_group->discard_state = BTRFS_DISCARD_RESET_CURSOR;
105 }
106 if (list_empty(&block_group->discard_list))
107 btrfs_get_block_group(block_group);
108
109 list_move_tail(&block_group->discard_list,
110 get_discard_list(discard_ctl, block_group));
111 }
112
add_to_discard_list(struct btrfs_discard_ctl * discard_ctl,struct btrfs_block_group * block_group)113 static void add_to_discard_list(struct btrfs_discard_ctl *discard_ctl,
114 struct btrfs_block_group *block_group)
115 {
116 if (!btrfs_is_block_group_data_only(block_group))
117 return;
118
119 if (!btrfs_run_discard_work(discard_ctl))
120 return;
121
122 spin_lock(&discard_ctl->lock);
123 __add_to_discard_list(discard_ctl, block_group);
124 spin_unlock(&discard_ctl->lock);
125 }
126
add_to_discard_unused_list(struct btrfs_discard_ctl * discard_ctl,struct btrfs_block_group * block_group)127 static void add_to_discard_unused_list(struct btrfs_discard_ctl *discard_ctl,
128 struct btrfs_block_group *block_group)
129 {
130 bool queued;
131
132 spin_lock(&discard_ctl->lock);
133
134 queued = !list_empty(&block_group->discard_list);
135
136 if (!btrfs_run_discard_work(discard_ctl)) {
137 spin_unlock(&discard_ctl->lock);
138 return;
139 }
140
141 list_del_init(&block_group->discard_list);
142
143 block_group->discard_index = BTRFS_DISCARD_INDEX_UNUSED;
144 block_group->discard_eligible_time = (ktime_get_ns() +
145 BTRFS_DISCARD_UNUSED_DELAY);
146 block_group->discard_state = BTRFS_DISCARD_RESET_CURSOR;
147 if (!queued)
148 btrfs_get_block_group(block_group);
149 list_add_tail(&block_group->discard_list,
150 &discard_ctl->discard_list[BTRFS_DISCARD_INDEX_UNUSED]);
151
152 spin_unlock(&discard_ctl->lock);
153 }
154
remove_from_discard_list(struct btrfs_discard_ctl * discard_ctl,struct btrfs_block_group * block_group)155 static bool remove_from_discard_list(struct btrfs_discard_ctl *discard_ctl,
156 struct btrfs_block_group *block_group)
157 {
158 bool running = false;
159 bool queued = false;
160
161 spin_lock(&discard_ctl->lock);
162
163 if (block_group == discard_ctl->block_group) {
164 running = true;
165 discard_ctl->block_group = NULL;
166 }
167
168 block_group->discard_eligible_time = 0;
169 queued = !list_empty(&block_group->discard_list);
170 list_del_init(&block_group->discard_list);
171 if (queued)
172 btrfs_put_block_group(block_group);
173
174 spin_unlock(&discard_ctl->lock);
175
176 return running;
177 }
178
179 /*
180 * Find block_group that's up next for discarding.
181 *
182 * @discard_ctl: discard control
183 * @now: current time
184 *
185 * Iterate over the discard lists to find the next block_group up for
186 * discarding checking the discard_eligible_time of block_group.
187 */
static struct btrfs_block_group *find_next_block_group(
					struct btrfs_discard_ctl *discard_ctl,
					u64 now)
{
	struct btrfs_block_group *best = NULL;
	int idx;

	for (idx = 0; idx < BTRFS_NR_DISCARD_LISTS; idx++) {
		struct list_head *head = &discard_ctl->discard_list[idx];
		struct btrfs_block_group *candidate;

		if (list_empty(head))
			continue;

		/* Only the first entry of each list is considered. */
		candidate = list_first_entry(head, struct btrfs_block_group,
					     discard_list);

		if (!best)
			best = candidate;

		/* The current pick is already eligible: stop searching. */
		if (best->discard_eligible_time < now)
			break;

		/* Otherwise prefer whichever becomes eligible sooner. */
		if (candidate->discard_eligible_time <
		    best->discard_eligible_time)
			best = candidate;
	}

	return best;
}
217
218 /*
219 * Look up next block group and set it for use.
220 *
221 * @discard_ctl: discard control
222 * @discard_state: the discard_state of the block_group after state management
223 * @discard_index: the discard_index of the block_group after state management
224 * @now: time when discard was invoked, in ns
225 *
226 * Wrap find_next_block_group() and set the block_group to be in use.
227 * @discard_state's control flow is managed here. Variables related to
228 * @discard_state are reset here as needed (eg. @discard_cursor). @discard_state
229 * and @discard_index are remembered as it may change while we're discarding,
230 * but we want the discard to execute in the context determined here.
231 */
static struct btrfs_block_group *peek_discard_list(
					struct btrfs_discard_ctl *discard_ctl,
					enum btrfs_discard_state *discard_state,
					int *discard_index, u64 now)
{
	struct btrfs_block_group *block_group;

	spin_lock(&discard_ctl->lock);

	for (;;) {
		block_group = find_next_block_group(discard_ctl, now);

		/* Nothing queued, or the candidate is not eligible yet. */
		if (!block_group || now < block_group->discard_eligible_time)
			break;

		if (block_group->discard_index != BTRFS_DISCARD_INDEX_UNUSED ||
		    block_group->used == 0) {
			/* Start a fresh pass from the block group's start. */
			if (block_group->discard_state ==
			    BTRFS_DISCARD_RESET_CURSOR) {
				block_group->discard_cursor = block_group->start;
				block_group->discard_state =
							BTRFS_DISCARD_EXTENTS;
			}
			break;
		}

		/*
		 * The block group sits on the unused list but has used bytes
		 * again: requeue it (data-only) or drop it, then look again.
		 */
		if (btrfs_is_block_group_data_only(block_group)) {
			__add_to_discard_list(discard_ctl, block_group);
			/*
			 * The block group must have left the unused list even
			 * if discard was disabled in the meantime or a
			 * transaction abort happened. Otherwise, with no other
			 * block group queued, this loop would keep picking the
			 * same block group forever.
			 */
			ASSERT(block_group->discard_index !=
			       BTRFS_DISCARD_INDEX_UNUSED,
			       "discard_index=%d",
			       block_group->discard_index);
		} else {
			list_del_init(&block_group->discard_list);
			btrfs_put_block_group(block_group);
		}
	}

	if (block_group) {
		/* Extra reference for the caller, on top of the list's own. */
		btrfs_get_block_group(block_group);
		discard_ctl->block_group = block_group;
		*discard_state = block_group->discard_state;
		*discard_index = block_group->discard_index;
	}

	spin_unlock(&discard_ctl->lock);

	return block_group;
}
283
284 /*
285 * Update a block group's filters.
286 *
287 * @block_group: block group of interest
288 * @bytes: recently freed region size after coalescing
289 *
290 * Async discard maintains multiple lists with progressively smaller filters
291 * to prioritize discarding based on size. Should a free space that matches
292 * a larger filter be returned to the free_space_cache, prioritize that discard
293 * by moving @block_group to the proper filter.
294 */
void btrfs_discard_check_filter(struct btrfs_block_group *block_group,
				u64 bytes)
{
	struct btrfs_discard_ctl *discard_ctl;
	int index;

	if (!block_group)
		return;
	if (!btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC))
		return;

	discard_ctl = &block_group->fs_info->discard_ctl;

	/* Already on the first filter list (or before it): nothing to gain. */
	if (block_group->discard_index <= BTRFS_DISCARD_INDEX_START)
		return;
	/* @bytes does not reach the previous list's filter. */
	if (bytes < discard_minlen[block_group->discard_index - 1])
		return;

	remove_from_discard_list(discard_ctl, block_group);

	/* Requeue on the first list whose minimum length @bytes satisfies. */
	for (index = BTRFS_DISCARD_INDEX_START; index < BTRFS_NR_DISCARD_LISTS;
	     index++) {
		if (bytes < discard_minlen[index])
			continue;

		block_group->discard_index = index;
		add_to_discard_list(discard_ctl, block_group);
		break;
	}
}
322
323 /*
324 * Move a block group along the discard lists.
325 *
326 * @discard_ctl: discard control
327 * @block_group: block_group of interest
328 *
 * Increment @block_group's discard_index. If it falls off the list, let it be.
330 * Otherwise add it back to the appropriate list.
331 */
btrfs_update_discard_index(struct btrfs_discard_ctl * discard_ctl,struct btrfs_block_group * block_group)332 static void btrfs_update_discard_index(struct btrfs_discard_ctl *discard_ctl,
333 struct btrfs_block_group *block_group)
334 {
335 block_group->discard_index++;
336 if (block_group->discard_index == BTRFS_NR_DISCARD_LISTS) {
337 block_group->discard_index = 1;
338 return;
339 }
340
341 add_to_discard_list(discard_ctl, block_group);
342 }
343
344 /*
345 * Remove a block_group from the discard lists.
346 *
347 * @discard_ctl: discard control
348 * @block_group: block_group of interest
349 *
350 * Remove @block_group from the discard lists. If necessary, wait on the
351 * current work and then reschedule the delayed work.
352 */
btrfs_discard_cancel_work(struct btrfs_discard_ctl * discard_ctl,struct btrfs_block_group * block_group)353 void btrfs_discard_cancel_work(struct btrfs_discard_ctl *discard_ctl,
354 struct btrfs_block_group *block_group)
355 {
356 if (remove_from_discard_list(discard_ctl, block_group)) {
357 cancel_delayed_work_sync(&discard_ctl->work);
358 btrfs_discard_schedule_work(discard_ctl, true);
359 }
360 }
361
362 /*
363 * Handles queuing the block_groups.
364 *
365 * @discard_ctl: discard control
366 * @block_group: block_group of interest
367 *
368 * Maintain the LRU order of the discard lists.
369 */
btrfs_discard_queue_work(struct btrfs_discard_ctl * discard_ctl,struct btrfs_block_group * block_group)370 void btrfs_discard_queue_work(struct btrfs_discard_ctl *discard_ctl,
371 struct btrfs_block_group *block_group)
372 {
373 if (!block_group || !btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC))
374 return;
375
376 if (block_group->used == 0)
377 add_to_discard_unused_list(discard_ctl, block_group);
378 else
379 add_to_discard_list(discard_ctl, block_group);
380
381 if (!delayed_work_pending(&discard_ctl->work))
382 btrfs_discard_schedule_work(discard_ctl, false);
383 }
384
/*
 * Compute the delay until the next discard and (re)arm the delayed work.
 *
 * Both callers in this file invoke this with discard_ctl->lock held. Nothing
 * is scheduled when discard should not run, when work is already pending and
 * @override is false, or when no block group is queued.
 */
static void __btrfs_discard_schedule_work(struct btrfs_discard_ctl *discard_ctl,
					  u64 now, bool override)
{
	struct btrfs_block_group *block_group;
	u64 delay;
	u32 kbps_limit;

	if (!btrfs_run_discard_work(discard_ctl))
		return;
	if (!override && delayed_work_pending(&discard_ctl->work))
		return;

	block_group = find_next_block_group(discard_ctl, now);
	if (!block_group)
		return;

	delay = discard_ctl->delay_ms * NSEC_PER_MSEC;
	kbps_limit = READ_ONCE(discard_ctl->kbps_limit);

	/*
	 * A single delayed work item does all the discarding, so the byte
	 * rate limit can be enforced from the size of the previous discard.
	 */
	if (kbps_limit && discard_ctl->prev_discard) {
		u64 bps_limit = ((u64)kbps_limit) * SZ_1K;
		u64 bps_delay = div64_u64(discard_ctl->prev_discard *
					  NSEC_PER_SEC, bps_limit);

		if (delay < bps_delay)
			delay = bps_delay;
	}

	/*
	 * Wait at least until the block group becomes eligible, to hopefully
	 * avoid discarding in a recently allocated block group.
	 */
	if (now < block_group->discard_eligible_time) {
		u64 bg_timeout = block_group->discard_eligible_time - now;

		if (delay < bg_timeout)
			delay = bg_timeout;
	}

	/* When overriding, credit the time elapsed since the last discard. */
	if (override && discard_ctl->prev_discard) {
		u64 elapsed = now - discard_ctl->prev_discard_time;

		delay = (delay > elapsed) ? delay - elapsed : 0;
	}

	mod_delayed_work(discard_ctl->discard_workers,
			 &discard_ctl->work, nsecs_to_jiffies(delay));
}
436
437 /*
438 * Responsible for scheduling the discard work.
439 *
440 * @discard_ctl: discard control
441 * @override: override the current timer
442 *
443 * Discards are issued by a delayed workqueue item. @override is used to
444 * update the current delay as the baseline delay interval is reevaluated on
445 * transaction commit. This is also maxed with any other rate limit.
446 */
void btrfs_discard_schedule_work(struct btrfs_discard_ctl *discard_ctl,
				 bool override)
{
	u64 now;

	/* Sample the time before taking the lock. */
	now = ktime_get_ns();

	spin_lock(&discard_ctl->lock);
	__btrfs_discard_schedule_work(discard_ctl, now, override);
	spin_unlock(&discard_ctl->lock);
}
456
457 /*
458 * Determine next step of a block_group.
459 *
460 * @discard_ctl: discard control
461 * @block_group: block_group of interest
462 *
463 * Determine the next step for a block group after it's finished going through
464 * a pass on a discard list. If it is unused and fully trimmed, we can mark it
465 * unused and send it to the unused_bgs path. Otherwise, pass it onto the
466 * appropriate filter list or let it fall off.
467 */
btrfs_finish_discard_pass(struct btrfs_discard_ctl * discard_ctl,struct btrfs_block_group * block_group)468 static void btrfs_finish_discard_pass(struct btrfs_discard_ctl *discard_ctl,
469 struct btrfs_block_group *block_group)
470 {
471 remove_from_discard_list(discard_ctl, block_group);
472
473 if (block_group->used == 0) {
474 if (btrfs_is_free_space_trimmed(block_group))
475 btrfs_mark_bg_unused(block_group);
476 else
477 add_to_discard_unused_list(discard_ctl, block_group);
478 } else {
479 btrfs_update_discard_index(discard_ctl, block_group);
480 }
481 }
482
483 /*
484 * Discard work queue callback
485 *
486 * @work: work
487 *
488 * Find the next block_group to start discarding and then discard a single
489 * region. It does this in a two-pass fashion: first extents and second
490 * bitmaps. Completely discarded block groups are sent to the unused_bgs path.
491 */
btrfs_discard_workfn(struct work_struct * work)492 static void btrfs_discard_workfn(struct work_struct *work)
493 {
494 struct btrfs_discard_ctl *discard_ctl;
495 struct btrfs_block_group *block_group;
496 enum btrfs_discard_state discard_state;
497 int discard_index = 0;
498 u64 trimmed = 0;
499 u64 minlen = 0;
500 u64 now = ktime_get_ns();
501
502 discard_ctl = container_of(work, struct btrfs_discard_ctl, work.work);
503
504 block_group = peek_discard_list(discard_ctl, &discard_state,
505 &discard_index, now);
506 if (!block_group)
507 return;
508 if (!btrfs_run_discard_work(discard_ctl)) {
509 spin_lock(&discard_ctl->lock);
510 btrfs_put_block_group(block_group);
511 discard_ctl->block_group = NULL;
512 spin_unlock(&discard_ctl->lock);
513 return;
514 }
515 if (now < block_group->discard_eligible_time) {
516 spin_lock(&discard_ctl->lock);
517 btrfs_put_block_group(block_group);
518 discard_ctl->block_group = NULL;
519 spin_unlock(&discard_ctl->lock);
520 btrfs_discard_schedule_work(discard_ctl, false);
521 return;
522 }
523
524 /* Perform discarding */
525 minlen = discard_minlen[discard_index];
526
527 if (discard_state == BTRFS_DISCARD_BITMAPS) {
528 u64 maxlen = 0;
529
530 /*
531 * Use the previous levels minimum discard length as the max
532 * length filter. In the case something is added to make a
533 * region go beyond the max filter, the entire bitmap is set
534 * back to BTRFS_TRIM_STATE_UNTRIMMED.
535 */
536 if (discard_index != BTRFS_DISCARD_INDEX_UNUSED)
537 maxlen = discard_minlen[discard_index - 1];
538
539 btrfs_trim_block_group_bitmaps(block_group, &trimmed,
540 block_group->discard_cursor,
541 btrfs_block_group_end(block_group),
542 minlen, maxlen, true);
543 discard_ctl->discard_bitmap_bytes += trimmed;
544 } else {
545 btrfs_trim_block_group_extents(block_group, &trimmed,
546 block_group->discard_cursor,
547 btrfs_block_group_end(block_group),
548 minlen, true);
549 discard_ctl->discard_extent_bytes += trimmed;
550 }
551
552 /* Determine next steps for a block_group */
553 if (block_group->discard_cursor >= btrfs_block_group_end(block_group)) {
554 if (discard_state == BTRFS_DISCARD_BITMAPS) {
555 btrfs_finish_discard_pass(discard_ctl, block_group);
556 } else {
557 block_group->discard_cursor = block_group->start;
558 spin_lock(&discard_ctl->lock);
559 if (block_group->discard_state !=
560 BTRFS_DISCARD_RESET_CURSOR)
561 block_group->discard_state =
562 BTRFS_DISCARD_BITMAPS;
563 spin_unlock(&discard_ctl->lock);
564 }
565 }
566
567 now = ktime_get_ns();
568 spin_lock(&discard_ctl->lock);
569 discard_ctl->prev_discard = trimmed;
570 discard_ctl->prev_discard_time = now;
571 btrfs_put_block_group(block_group);
572 discard_ctl->block_group = NULL;
573 __btrfs_discard_schedule_work(discard_ctl, now, false);
574 spin_unlock(&discard_ctl->lock);
575 }
576
577 /*
578 * Recalculate the base delay.
579 *
580 * @discard_ctl: discard control
581 *
582 * Recalculate the base delay which is based off the total number of
583 * discardable_extents. Clamp this between the lower_limit (iops_limit or 1ms)
584 * and the upper_limit (BTRFS_DISCARD_MAX_DELAY_MSEC).
585 */
btrfs_discard_calc_delay(struct btrfs_discard_ctl * discard_ctl)586 void btrfs_discard_calc_delay(struct btrfs_discard_ctl *discard_ctl)
587 {
588 s32 discardable_extents;
589 s64 discardable_bytes;
590 u32 iops_limit;
591 unsigned long min_delay = BTRFS_DISCARD_MIN_DELAY_MSEC;
592 unsigned long delay;
593
594 discardable_extents = atomic_read(&discard_ctl->discardable_extents);
595 if (!discardable_extents)
596 return;
597
598 spin_lock(&discard_ctl->lock);
599
600 /*
601 * The following is to fix a potential -1 discrepancy that we're not
602 * sure how to reproduce. But given that this is the only place that
603 * utilizes these numbers and this is only called by from
604 * btrfs_finish_extent_commit() which is synchronized, we can correct
605 * here.
606 */
607 if (discardable_extents < 0)
608 atomic_add(-discardable_extents,
609 &discard_ctl->discardable_extents);
610
611 discardable_bytes = atomic64_read(&discard_ctl->discardable_bytes);
612 if (discardable_bytes < 0)
613 atomic64_add(-discardable_bytes,
614 &discard_ctl->discardable_bytes);
615
616 if (discardable_extents <= 0) {
617 spin_unlock(&discard_ctl->lock);
618 return;
619 }
620
621 iops_limit = READ_ONCE(discard_ctl->iops_limit);
622
623 if (iops_limit) {
624 delay = MSEC_PER_SEC / iops_limit;
625 } else {
626 /*
627 * Unset iops_limit means go as fast as possible, so allow a
628 * delay of 0.
629 */
630 delay = 0;
631 min_delay = 0;
632 }
633
634 delay = clamp(delay, min_delay, BTRFS_DISCARD_MAX_DELAY_MSEC);
635 discard_ctl->delay_ms = delay;
636
637 spin_unlock(&discard_ctl->lock);
638 }
639
640 /*
641 * Propagate discard counters.
642 *
643 * @block_group: block_group of interest
644 *
645 * Propagate deltas of counters up to the discard_ctl. It maintains a current
646 * counter and a previous counter passing the delta up to the global stat.
647 * Then the current counter value becomes the previous counter value.
648 */
btrfs_discard_update_discardable(struct btrfs_block_group * block_group)649 void btrfs_discard_update_discardable(struct btrfs_block_group *block_group)
650 {
651 struct btrfs_free_space_ctl *ctl;
652 struct btrfs_discard_ctl *discard_ctl;
653 s32 extents_delta;
654 s64 bytes_delta;
655
656 if (!block_group ||
657 !btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC) ||
658 !btrfs_is_block_group_data_only(block_group))
659 return;
660
661 ctl = block_group->free_space_ctl;
662 discard_ctl = &block_group->fs_info->discard_ctl;
663
664 lockdep_assert_held(&ctl->tree_lock);
665 extents_delta = ctl->discardable_extents[BTRFS_STAT_CURR] -
666 ctl->discardable_extents[BTRFS_STAT_PREV];
667 if (extents_delta) {
668 atomic_add(extents_delta, &discard_ctl->discardable_extents);
669 ctl->discardable_extents[BTRFS_STAT_PREV] =
670 ctl->discardable_extents[BTRFS_STAT_CURR];
671 }
672
673 bytes_delta = ctl->discardable_bytes[BTRFS_STAT_CURR] -
674 ctl->discardable_bytes[BTRFS_STAT_PREV];
675 if (bytes_delta) {
676 atomic64_add(bytes_delta, &discard_ctl->discardable_bytes);
677 ctl->discardable_bytes[BTRFS_STAT_PREV] =
678 ctl->discardable_bytes[BTRFS_STAT_CURR];
679 }
680 }
681
682 /*
683 * Punt unused_bgs list to discard lists.
684 *
685 * @fs_info: fs_info of interest
686 *
687 * The unused_bgs list needs to be punted to the discard lists because the
688 * order of operations is changed. In the normal synchronous discard path, the
689 * block groups are trimmed via a single large trim in transaction commit. This
690 * is ultimately what we are trying to avoid with asynchronous discard. Thus,
691 * it must be done before going down the unused_bgs path.
692 */
btrfs_discard_punt_unused_bgs_list(struct btrfs_fs_info * fs_info)693 void btrfs_discard_punt_unused_bgs_list(struct btrfs_fs_info *fs_info)
694 {
695 struct btrfs_block_group *block_group, *next;
696
697 spin_lock(&fs_info->unused_bgs_lock);
698 /* We enabled async discard, so punt all to the queue */
699 list_for_each_entry_safe(block_group, next, &fs_info->unused_bgs,
700 bg_list) {
701 list_del_init(&block_group->bg_list);
702 btrfs_discard_queue_work(&fs_info->discard_ctl, block_group);
703 /*
704 * This put is for the get done by btrfs_mark_bg_unused.
705 * Queueing discard incremented it for discard's reference.
706 */
707 btrfs_put_block_group(block_group);
708 }
709 spin_unlock(&fs_info->unused_bgs_lock);
710 }
711
712 /*
713 * Purge discard lists.
714 *
715 * @discard_ctl: discard control
716 *
717 * If we are disabling async discard, we may have intercepted block groups that
718 * are completely free and ready for the unused_bgs path. As discarding will
719 * now happen in transaction commit or not at all, we can safely mark the
720 * corresponding block groups as unused and they will be sent on their merry
721 * way to the unused_bgs list.
722 */
btrfs_discard_purge_list(struct btrfs_discard_ctl * discard_ctl)723 static void btrfs_discard_purge_list(struct btrfs_discard_ctl *discard_ctl)
724 {
725 struct btrfs_block_group *block_group, *next;
726 int i;
727
728 spin_lock(&discard_ctl->lock);
729 for (i = 0; i < BTRFS_NR_DISCARD_LISTS; i++) {
730 list_for_each_entry_safe(block_group, next,
731 &discard_ctl->discard_list[i],
732 discard_list) {
733 list_del_init(&block_group->discard_list);
734 spin_unlock(&discard_ctl->lock);
735 if (block_group->used == 0)
736 btrfs_mark_bg_unused(block_group);
737 spin_lock(&discard_ctl->lock);
738 btrfs_put_block_group(block_group);
739 }
740 }
741 spin_unlock(&discard_ctl->lock);
742 }
743
btrfs_discard_resume(struct btrfs_fs_info * fs_info)744 void btrfs_discard_resume(struct btrfs_fs_info *fs_info)
745 {
746 if (!btrfs_test_opt(fs_info, DISCARD_ASYNC)) {
747 btrfs_discard_cleanup(fs_info);
748 return;
749 }
750
751 btrfs_discard_punt_unused_bgs_list(fs_info);
752
753 set_bit(BTRFS_FS_DISCARD_RUNNING, &fs_info->flags);
754 }
755
btrfs_discard_stop(struct btrfs_fs_info * fs_info)756 void btrfs_discard_stop(struct btrfs_fs_info *fs_info)
757 {
758 clear_bit(BTRFS_FS_DISCARD_RUNNING, &fs_info->flags);
759 }
760
btrfs_discard_init(struct btrfs_fs_info * fs_info)761 void btrfs_discard_init(struct btrfs_fs_info *fs_info)
762 {
763 struct btrfs_discard_ctl *discard_ctl = &fs_info->discard_ctl;
764 int i;
765
766 spin_lock_init(&discard_ctl->lock);
767 INIT_DELAYED_WORK(&discard_ctl->work, btrfs_discard_workfn);
768
769 for (i = 0; i < BTRFS_NR_DISCARD_LISTS; i++)
770 INIT_LIST_HEAD(&discard_ctl->discard_list[i]);
771
772 discard_ctl->prev_discard = 0;
773 discard_ctl->prev_discard_time = 0;
774 atomic_set(&discard_ctl->discardable_extents, 0);
775 atomic64_set(&discard_ctl->discardable_bytes, 0);
776 discard_ctl->max_discard_size = BTRFS_ASYNC_DISCARD_DEFAULT_MAX_SIZE;
777 discard_ctl->delay_ms = BTRFS_DISCARD_MAX_DELAY_MSEC;
778 discard_ctl->iops_limit = BTRFS_DISCARD_MAX_IOPS;
779 discard_ctl->kbps_limit = 0;
780 discard_ctl->discard_extent_bytes = 0;
781 discard_ctl->discard_bitmap_bytes = 0;
782 atomic64_set(&discard_ctl->discard_bytes_saved, 0);
783 }
784
btrfs_discard_cleanup(struct btrfs_fs_info * fs_info)785 void btrfs_discard_cleanup(struct btrfs_fs_info *fs_info)
786 {
787 btrfs_discard_stop(fs_info);
788 cancel_delayed_work_sync(&fs_info->discard_ctl.work);
789 btrfs_discard_purge_list(&fs_info->discard_ctl);
790 }
791