xref: /linux/fs/btrfs/discard.c (revision 7cddbb4339d4be16aa5341e3a27e63c34d2c4e0d)
1 // SPDX-License-Identifier: GPL-2.0
2 
3 #include <linux/jiffies.h>
4 #include <linux/kernel.h>
5 #include <linux/ktime.h>
6 #include <linux/list.h>
7 #include <linux/math64.h>
8 #include <linux/sizes.h>
9 #include <linux/workqueue.h>
10 #include "ctree.h"
11 #include "block-group.h"
12 #include "discard.h"
13 #include "free-space-cache.h"
14 #include "fs.h"
15 
16 /*
17  * This contains the logic to handle async discard.
18  *
19  * Async discard manages trimming of free space outside of transaction commit.
 20  * Discarding is done by managing the block_groups on an LRU list based on free
 21  * space recency.  Two passes are used: the first prioritizes discarding extents
 22  * and the second trims bitmaps, giving freed regions the best opportunity to
 23  * coalesce.  The block_groups are maintained on multiple lists to allow for
 24  * multiple passes with different discard filter requirements.  A delayed work
 25  * item is used to manage discarding, with the timeout determined by the max of
 26  * the delay incurred by the iops rate limit, the byte rate limit, and the max
 27  * delay of BTRFS_DISCARD_MAX_DELAY_MSEC.
28  *
29  * Note, this only keeps track of block_groups that are explicitly for data.
30  * Mixed block_groups are not supported.
31  *
32  * The first list is special to manage discarding of fully free block groups.
 33  * This is necessary because we issue a final trim for a fully free block group
34  * after forgetting it.  When a block group becomes unused, instead of directly
35  * being added to the unused_bgs list, we add it to this first list.  Then
36  * from there, if it becomes fully discarded, we place it onto the unused_bgs
37  * list.
38  *
39  * The in-memory free space cache serves as the backing state for discard.
40  * Consequently this means there is no persistence.  We opt to load all the
41  * block groups in as not discarded, so the mount case degenerates to the
42  * crashing case.
43  *
44  * As the free space cache uses bitmaps, there exists a tradeoff between
45  * ease/efficiency for find_free_extent() and the accuracy of discard state.
46  * Here we opt to let untrimmed regions merge with everything while only letting
47  * trimmed regions merge with other trimmed regions.  This can cause
48  * overtrimming, but the coalescing benefit seems to be worth it.  Additionally,
49  * bitmap state is tracked as a whole.  If we're able to fully trim a bitmap,
50  * the trimmed flag is set on the bitmap.  Otherwise, if an allocation comes in,
51  * this resets the state and we will retry trimming the whole bitmap.  This is a
52  * tradeoff between discard state accuracy and the cost of accounting.
53  */
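/*
 * Illustrative sketch of the delay policy described above.  This helper is not
 * used by the code below; its name and parameters are made up for the example.
 * The real computation is split between btrfs_discard_calc_delay() and
 * __btrfs_discard_schedule_work() further down in this file.
 */
static inline u64 example_effective_delay_ns(u64 base_delay_msec, u64 bps_delay_ns,
					     u64 now_ns, u64 bg_eligible_time_ns)
{
	/* Base delay derived from the iops limit, already clamped to its bounds. */
	u64 delay = base_delay_msec * NSEC_PER_MSEC;

	/* The byte rate limit can only lengthen the wait, never shorten it. */
	delay = max(delay, bps_delay_ns);

	/* Never run before the block group becomes eligible for discard. */
	if (now_ns < bg_eligible_time_ns)
		delay = max(delay, bg_eligible_time_ns - now_ns);

	return delay;
}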
54 
55 /* This is an initial delay to give some chance for block reuse */
56 #define BTRFS_DISCARD_DELAY		(120ULL * NSEC_PER_SEC)
57 #define BTRFS_DISCARD_UNUSED_DELAY	(10ULL * NSEC_PER_SEC)
58 
59 #define BTRFS_DISCARD_MIN_DELAY_MSEC	(1UL)
60 #define BTRFS_DISCARD_MAX_DELAY_MSEC	(1000UL)
61 #define BTRFS_DISCARD_MAX_IOPS		(1000U)
62 
63 /* Monotonically decreasing minimum length filters after index 0 */
64 static int discard_minlen[BTRFS_NR_DISCARD_LISTS] = {
65 	0,
66 	BTRFS_ASYNC_DISCARD_MAX_FILTER,
67 	BTRFS_ASYNC_DISCARD_MIN_FILTER
68 };
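/*
 * Illustrative sketch, not used by the code below: map a freed region size to
 * the most aggressive list whose minimum length filter it satisfies, mirroring
 * the promotion loop in btrfs_discard_check_filter().  Assuming the usual
 * filter values of 64K (BTRFS_ASYNC_DISCARD_MAX_FILTER) and 32K
 * (BTRFS_ASYNC_DISCARD_MIN_FILTER), a 128K free maps to index 1, a 48K free to
 * index 2, and a 4K free is too small to promote the block group anywhere.
 */
static inline int example_discard_index_for_size(u64 bytes)
{
	int i;

	for (i = BTRFS_DISCARD_INDEX_START; i < BTRFS_NR_DISCARD_LISTS; i++) {
		if (bytes >= discard_minlen[i])
			return i;
	}

	/* Smaller than every filter: no promotion happens. */
	return BTRFS_NR_DISCARD_LISTS;
}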
69 
70 static struct list_head *get_discard_list(struct btrfs_discard_ctl *discard_ctl,
71 					  const struct btrfs_block_group *block_group)
72 {
73 	return &discard_ctl->discard_list[block_group->discard_index];
74 }
75 
76 /*
77  * Determine if async discard should be running.
78  *
79  * @discard_ctl: discard control
80  *
81  * Check if the file system is writeable and BTRFS_FS_DISCARD_RUNNING is set.
82  */
83 static bool btrfs_run_discard_work(const struct btrfs_discard_ctl *discard_ctl)
84 {
85 	struct btrfs_fs_info *fs_info = container_of(discard_ctl,
86 						     struct btrfs_fs_info,
87 						     discard_ctl);
88 
89 	return (!(fs_info->sb->s_flags & SB_RDONLY) &&
90 		test_bit(BTRFS_FS_DISCARD_RUNNING, &fs_info->flags));
91 }
92 
93 static void __add_to_discard_list(struct btrfs_discard_ctl *discard_ctl,
94 				  struct btrfs_block_group *block_group)
95 {
96 	lockdep_assert_held(&discard_ctl->lock);
97 
98 	if (list_empty(&block_group->discard_list) ||
99 	    block_group->discard_index == BTRFS_DISCARD_INDEX_UNUSED) {
100 		if (block_group->discard_index == BTRFS_DISCARD_INDEX_UNUSED)
101 			block_group->discard_index = BTRFS_DISCARD_INDEX_START;
102 		block_group->discard_eligible_time = (ktime_get_ns() +
103 						      BTRFS_DISCARD_DELAY);
104 		block_group->discard_state = BTRFS_DISCARD_RESET_CURSOR;
105 	}
106 	if (list_empty(&block_group->discard_list))
107 		btrfs_get_block_group(block_group);
108 
109 	list_move_tail(&block_group->discard_list,
110 		       get_discard_list(discard_ctl, block_group));
111 }
112 
113 static void add_to_discard_list(struct btrfs_discard_ctl *discard_ctl,
114 				struct btrfs_block_group *block_group)
115 {
116 	if (!btrfs_is_block_group_data_only(block_group))
117 		return;
118 
119 	if (!btrfs_run_discard_work(discard_ctl))
120 		return;
121 
122 	spin_lock(&discard_ctl->lock);
123 	__add_to_discard_list(discard_ctl, block_group);
124 	spin_unlock(&discard_ctl->lock);
125 }
126 
127 static void add_to_discard_unused_list(struct btrfs_discard_ctl *discard_ctl,
128 				       struct btrfs_block_group *block_group)
129 {
130 	bool queued;
131 
132 	spin_lock(&discard_ctl->lock);
133 
134 	queued = !list_empty(&block_group->discard_list);
135 
136 	if (!btrfs_run_discard_work(discard_ctl)) {
137 		spin_unlock(&discard_ctl->lock);
138 		return;
139 	}
140 
141 	list_del_init(&block_group->discard_list);
142 
143 	block_group->discard_index = BTRFS_DISCARD_INDEX_UNUSED;
144 	block_group->discard_eligible_time = (ktime_get_ns() +
145 					      BTRFS_DISCARD_UNUSED_DELAY);
146 	block_group->discard_state = BTRFS_DISCARD_RESET_CURSOR;
147 	if (!queued)
148 		btrfs_get_block_group(block_group);
149 	list_add_tail(&block_group->discard_list,
150 		      &discard_ctl->discard_list[BTRFS_DISCARD_INDEX_UNUSED]);
151 
152 	spin_unlock(&discard_ctl->lock);
153 }
154 
155 static bool remove_from_discard_list(struct btrfs_discard_ctl *discard_ctl,
156 				     struct btrfs_block_group *block_group)
157 {
158 	bool running = false;
159 	bool queued = false;
160 
161 	spin_lock(&discard_ctl->lock);
162 
163 	if (block_group == discard_ctl->block_group) {
164 		running = true;
165 		discard_ctl->block_group = NULL;
166 	}
167 
168 	block_group->discard_eligible_time = 0;
169 	queued = !list_empty(&block_group->discard_list);
170 	list_del_init(&block_group->discard_list);
171 	if (queued)
172 		btrfs_put_block_group(block_group);
173 
174 	spin_unlock(&discard_ctl->lock);
175 
176 	return running;
177 }
178 
179 /*
180  * Find block_group that's up next for discarding.
181  *
182  * @discard_ctl:  discard control
183  * @now:          current time
184  *
 185  * Iterate over the discard lists to find the next block_group up for
 186  * discarding, checking the discard_eligible_time of each block_group.
187  */
188 static struct btrfs_block_group *find_next_block_group(
189 					struct btrfs_discard_ctl *discard_ctl,
190 					u64 now)
191 {
192 	struct btrfs_block_group *ret_block_group = NULL, *block_group;
193 	int i;
194 
195 	for (i = 0; i < BTRFS_NR_DISCARD_LISTS; i++) {
196 		struct list_head *discard_list = &discard_ctl->discard_list[i];
197 
198 		if (!list_empty(discard_list)) {
199 			block_group = list_first_entry(discard_list,
200 						       struct btrfs_block_group,
201 						       discard_list);
202 
203 			if (!ret_block_group)
204 				ret_block_group = block_group;
205 
206 			if (ret_block_group->discard_eligible_time < now)
207 				break;
208 
209 			if (ret_block_group->discard_eligible_time >
210 			    block_group->discard_eligible_time)
211 				ret_block_group = block_group;
212 		}
213 	}
214 
215 	return ret_block_group;
216 }
217 
218 /*
219  * Check whether a block group is empty.
220  *
221  * "Empty" here means that there are no extents physically located within the
222  * device extents corresponding to this block group.
223  *
224  * For a remapped block group, this means that all of its identity remaps have
225  * been removed. For a non-remapped block group, this means that no extents
226  * have an address within its range, and that nothing has been remapped to be
227  * within it.
228  */
229 static bool block_group_is_empty(const struct btrfs_block_group *bg)
230 {
231 	if (bg->flags & BTRFS_BLOCK_GROUP_REMAPPED)
232 		return bg->identity_remap_count == 0;
233 
234 	return bg->used == 0 && bg->remap_bytes == 0;
235 }
236 
237 /*
238  * Look up next block group and set it for use.
239  *
240  * @discard_ctl:   discard control
241  * @discard_state: the discard_state of the block_group after state management
242  * @discard_index: the discard_index of the block_group after state management
243  * @now:           time when discard was invoked, in ns
244  *
245  * Wrap find_next_block_group() and set the block_group to be in use.
246  * @discard_state's control flow is managed here.  Variables related to
 247  * @discard_state are reset here as needed (e.g. @discard_cursor).  @discard_state
 248  * and @discard_index are remembered as they may change while we're discarding,
249  * but we want the discard to execute in the context determined here.
250  */
251 static struct btrfs_block_group *peek_discard_list(
252 					struct btrfs_discard_ctl *discard_ctl,
253 					enum btrfs_discard_state *discard_state,
254 					int *discard_index, u64 now)
255 {
256 	struct btrfs_block_group *block_group;
257 
258 	spin_lock(&discard_ctl->lock);
259 again:
260 	block_group = find_next_block_group(discard_ctl, now);
261 
262 	if (block_group && now >= block_group->discard_eligible_time) {
263 		const bool empty = block_group_is_empty(block_group);
264 
265 		if (block_group->discard_index == BTRFS_DISCARD_INDEX_UNUSED &&
266 		    !empty) {
267 			if (btrfs_is_block_group_data_only(block_group)) {
268 				__add_to_discard_list(discard_ctl, block_group);
269 				/*
 270 				 * The block group must have been moved to another
 271 				 * discard list even if discard was disabled in
 272 				 * the meantime or a transaction abort happened,
 273 				 * otherwise we can end up in an infinite loop,
 274 				 * always jumping to the 'again' label and
 275 				 * getting this block group over and over
276 				 * in case there are no other block groups in
277 				 * the discard lists.
278 				 */
279 				ASSERT(block_group->discard_index !=
280 				       BTRFS_DISCARD_INDEX_UNUSED,
281 				       "discard_index=%d",
282 				       block_group->discard_index);
283 			} else {
284 				list_del_init(&block_group->discard_list);
285 				btrfs_put_block_group(block_group);
286 			}
287 			goto again;
288 		}
289 		if (block_group->discard_state == BTRFS_DISCARD_RESET_CURSOR) {
290 			block_group->discard_cursor = block_group->start;
291 
292 			if (block_group->flags & BTRFS_BLOCK_GROUP_REMAPPED && empty) {
293 				block_group->discard_state = BTRFS_DISCARD_FULLY_REMAPPED;
294 			} else {
295 				block_group->discard_state = BTRFS_DISCARD_EXTENTS;
296 			}
297 		}
298 	}
299 	if (block_group) {
300 		btrfs_get_block_group(block_group);
301 		discard_ctl->block_group = block_group;
302 		*discard_state = block_group->discard_state;
303 		*discard_index = block_group->discard_index;
304 	}
305 	spin_unlock(&discard_ctl->lock);
306 
307 	return block_group;
308 }
309 
310 /*
311  * Update a block group's filters.
312  *
313  * @block_group:  block group of interest
314  * @bytes:        recently freed region size after coalescing
315  *
316  * Async discard maintains multiple lists with progressively smaller filters
 317  * to prioritize discarding based on size.  Should a freed region that matches
318  * a larger filter be returned to the free_space_cache, prioritize that discard
319  * by moving @block_group to the proper filter.
320  */
321 void btrfs_discard_check_filter(struct btrfs_block_group *block_group,
322 				u64 bytes)
323 {
324 	struct btrfs_discard_ctl *discard_ctl;
325 
326 	if (!block_group ||
327 	    !btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC))
328 		return;
329 
330 	discard_ctl = &block_group->fs_info->discard_ctl;
331 
332 	if (block_group->discard_index > BTRFS_DISCARD_INDEX_START &&
333 	    bytes >= discard_minlen[block_group->discard_index - 1]) {
334 		int i;
335 
336 		remove_from_discard_list(discard_ctl, block_group);
337 
338 		for (i = BTRFS_DISCARD_INDEX_START; i < BTRFS_NR_DISCARD_LISTS;
339 		     i++) {
340 			if (bytes >= discard_minlen[i]) {
341 				block_group->discard_index = i;
342 				add_to_discard_list(discard_ctl, block_group);
343 				break;
344 			}
345 		}
346 	}
347 }
348 
349 /*
350  * Move a block group along the discard lists.
351  *
352  * @discard_ctl: discard control
353  * @block_group: block_group of interest
354  *
 355  * Increment @block_group's discard_index.  If it falls off the list, let it be.
356  * Otherwise add it back to the appropriate list.
357  */
358 static void btrfs_update_discard_index(struct btrfs_discard_ctl *discard_ctl,
359 				       struct btrfs_block_group *block_group)
360 {
361 	block_group->discard_index++;
362 	if (block_group->discard_index == BTRFS_NR_DISCARD_LISTS) {
363 		block_group->discard_index = 1;
364 		return;
365 	}
366 
367 	add_to_discard_list(discard_ctl, block_group);
368 }
369 
370 /*
371  * Remove a block_group from the discard lists.
372  *
373  * @discard_ctl: discard control
374  * @block_group: block_group of interest
375  *
376  * Remove @block_group from the discard lists.  If necessary, wait on the
377  * current work and then reschedule the delayed work.
378  */
379 void btrfs_discard_cancel_work(struct btrfs_discard_ctl *discard_ctl,
380 			       struct btrfs_block_group *block_group)
381 {
382 	if (remove_from_discard_list(discard_ctl, block_group)) {
383 		cancel_delayed_work_sync(&discard_ctl->work);
384 		btrfs_discard_schedule_work(discard_ctl, true);
385 	}
386 }
387 
388 /*
389  * Handles queuing the block_groups.
390  *
391  * @discard_ctl: discard control
392  * @block_group: block_group of interest
393  *
394  * Maintain the LRU order of the discard lists.
395  */
396 void btrfs_discard_queue_work(struct btrfs_discard_ctl *discard_ctl,
397 			      struct btrfs_block_group *block_group)
398 {
399 	if (!block_group || !btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC))
400 		return;
401 
402 	if (block_group_is_empty(block_group))
403 		add_to_discard_unused_list(discard_ctl, block_group);
404 	else
405 		add_to_discard_list(discard_ctl, block_group);
406 
407 	if (!delayed_work_pending(&discard_ctl->work))
408 		btrfs_discard_schedule_work(discard_ctl, false);
409 }
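/*
 * Illustrative sketch of a typical (hypothetical) caller: after returning free
 * space to a data block group, queue it for async discard instead of trimming
 * inline.  btrfs_discard_punt_unused_bgs_list() below does the same for block
 * groups that have become unused.
 */
static inline void example_queue_block_group(struct btrfs_block_group *block_group)
{
	struct btrfs_discard_ctl *discard_ctl =
				&block_group->fs_info->discard_ctl;

	/* Empty block groups take the shorter BTRFS_DISCARD_UNUSED_DELAY path. */
	btrfs_discard_queue_work(discard_ctl, block_group);
}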
410 
411 static void __btrfs_discard_schedule_work(struct btrfs_discard_ctl *discard_ctl,
412 					  u64 now, bool override)
413 {
414 	struct btrfs_block_group *block_group;
415 
416 	if (!btrfs_run_discard_work(discard_ctl))
417 		return;
418 	if (!override && delayed_work_pending(&discard_ctl->work))
419 		return;
420 
421 	block_group = find_next_block_group(discard_ctl, now);
422 	if (block_group) {
423 		u64 delay = discard_ctl->delay_ms * NSEC_PER_MSEC;
424 		u32 kbps_limit = READ_ONCE(discard_ctl->kbps_limit);
425 
426 		/*
427 		 * A single delayed workqueue item is responsible for
 428 		 * discarding, so we can manage the byte rate limit by keeping
429 		 * track of the previous discard.
430 		 */
431 		if (kbps_limit && discard_ctl->prev_discard) {
432 			u64 bps_limit = ((u64)kbps_limit) * SZ_1K;
433 			u64 bps_delay = div64_u64(discard_ctl->prev_discard *
434 						  NSEC_PER_SEC, bps_limit);
435 
436 			delay = max(delay, bps_delay);
437 		}
438 
439 		/*
440 		 * This timeout is to hopefully prevent immediate discarding
441 		 * in a recently allocated block group.
442 		 */
443 		if (now < block_group->discard_eligible_time) {
444 			u64 bg_timeout = block_group->discard_eligible_time - now;
445 
446 			delay = max(delay, bg_timeout);
447 		}
448 
449 		if (override && discard_ctl->prev_discard) {
450 			u64 elapsed = now - discard_ctl->prev_discard_time;
451 
452 			if (delay > elapsed)
453 				delay -= elapsed;
454 			else
455 				delay = 0;
456 		}
457 
458 		mod_delayed_work(discard_ctl->discard_workers,
459 				 &discard_ctl->work, nsecs_to_jiffies(delay));
460 	}
461 }
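/*
 * Illustrative sketch, not used by the code above: how the byte rate limit
 * translates the size of the previous discard into a minimum wait.  For
 * example, kbps_limit = 102400 (100 MiB/s) and a previous discard of 64 MiB
 * yield a delay of 64 MiB * NSEC_PER_SEC / 100 MiB/s = 640 ms.
 */
static inline u64 example_bps_delay_ns(u64 prev_discard_bytes, u32 kbps_limit)
{
	u64 bps_limit;

	if (!kbps_limit || !prev_discard_bytes)
		return 0;

	bps_limit = (u64)kbps_limit * SZ_1K;
	return div64_u64(prev_discard_bytes * NSEC_PER_SEC, bps_limit);
}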
462 
463 /*
464  * Responsible for scheduling the discard work.
465  *
466  * @discard_ctl:  discard control
467  * @override:     override the current timer
468  *
469  * Discards are issued by a delayed workqueue item.  @override is used to
470  * update the current delay as the baseline delay interval is reevaluated on
471  * transaction commit.  This is also maxed with any other rate limit.
472  */
473 void btrfs_discard_schedule_work(struct btrfs_discard_ctl *discard_ctl,
474 				 bool override)
475 {
476 	const u64 now = ktime_get_ns();
477 
478 	spin_lock(&discard_ctl->lock);
479 	__btrfs_discard_schedule_work(discard_ctl, now, override);
480 	spin_unlock(&discard_ctl->lock);
481 }
482 
483 /*
484  * Determine next step of a block_group.
485  *
486  * @discard_ctl: discard control
487  * @block_group: block_group of interest
488  *
489  * Determine the next step for a block group after it's finished going through
490  * a pass on a discard list.  If it is unused and fully trimmed, we can mark it
491  * unused and send it to the unused_bgs path.  Otherwise, pass it onto the
492  * appropriate filter list or let it fall off.
493  */
494 static void btrfs_finish_discard_pass(struct btrfs_discard_ctl *discard_ctl,
495 				      struct btrfs_block_group *block_group)
496 {
497 	remove_from_discard_list(discard_ctl, block_group);
498 
499 	if (block_group_is_empty(block_group)) {
500 		if (btrfs_is_free_space_trimmed(block_group))
501 			btrfs_mark_bg_unused(block_group);
502 		else
503 			add_to_discard_unused_list(discard_ctl, block_group);
504 	} else {
505 		btrfs_update_discard_index(discard_ctl, block_group);
506 	}
507 }
508 
509 /*
510  * Discard work queue callback
511  *
512  * @work: work
513  *
514  * Find the next block_group to start discarding and then discard a single
515  * region.  It does this in a two-pass fashion: first extents and second
516  * bitmaps.  Completely discarded block groups are sent to the unused_bgs path.
517  */
518 static void btrfs_discard_workfn(struct work_struct *work)
519 {
520 	struct btrfs_discard_ctl *discard_ctl;
521 	struct btrfs_block_group *block_group;
522 	enum btrfs_discard_state discard_state;
523 	int discard_index = 0;
524 	u64 trimmed = 0;
525 	u64 minlen = 0;
526 	u64 now = ktime_get_ns();
527 
528 	discard_ctl = container_of(work, struct btrfs_discard_ctl, work.work);
529 
530 	block_group = peek_discard_list(discard_ctl, &discard_state,
531 					&discard_index, now);
532 	if (!block_group)
533 		return;
534 	if (!btrfs_run_discard_work(discard_ctl)) {
535 		spin_lock(&discard_ctl->lock);
536 		btrfs_put_block_group(block_group);
537 		discard_ctl->block_group = NULL;
538 		spin_unlock(&discard_ctl->lock);
539 		return;
540 	}
541 	if (now < block_group->discard_eligible_time) {
542 		spin_lock(&discard_ctl->lock);
543 		btrfs_put_block_group(block_group);
544 		discard_ctl->block_group = NULL;
545 		spin_unlock(&discard_ctl->lock);
546 		btrfs_discard_schedule_work(discard_ctl, false);
547 		return;
548 	}
549 
550 	/* Perform discarding */
551 	minlen = discard_minlen[discard_index];
552 
553 	switch (discard_state) {
554 	case BTRFS_DISCARD_BITMAPS: {
555 		u64 maxlen = 0;
556 
557 		/*
 558 		 * Use the previous level's minimum discard length as the max
559 		 * length filter.  In the case something is added to make a
560 		 * region go beyond the max filter, the entire bitmap is set
561 		 * back to BTRFS_TRIM_STATE_UNTRIMMED.
562 		 */
563 		if (discard_index != BTRFS_DISCARD_INDEX_UNUSED)
564 			maxlen = discard_minlen[discard_index - 1];
565 
566 		btrfs_trim_block_group_bitmaps(block_group, &trimmed,
567 				       block_group->discard_cursor,
568 				       btrfs_block_group_end(block_group),
569 				       minlen, maxlen, true);
570 		discard_ctl->discard_bitmap_bytes += trimmed;
571 
572 		break;
573 	}
574 
575 	case BTRFS_DISCARD_FULLY_REMAPPED:
576 		btrfs_trim_fully_remapped_block_group(block_group);
577 		break;
578 
579 	default:
580 		btrfs_trim_block_group_extents(block_group, &trimmed,
581 				       block_group->discard_cursor,
582 				       btrfs_block_group_end(block_group),
583 				       minlen, true);
584 		discard_ctl->discard_extent_bytes += trimmed;
585 
586 		break;
587 	}
588 
589 	/* Determine next steps for a block_group */
590 	if (block_group->discard_cursor >= btrfs_block_group_end(block_group)) {
591 		if (discard_state == BTRFS_DISCARD_BITMAPS ||
592 		    discard_state == BTRFS_DISCARD_FULLY_REMAPPED) {
593 			btrfs_finish_discard_pass(discard_ctl, block_group);
594 		} else {
595 			block_group->discard_cursor = block_group->start;
596 			spin_lock(&discard_ctl->lock);
597 			if (block_group->discard_state !=
598 			    BTRFS_DISCARD_RESET_CURSOR)
599 				block_group->discard_state =
600 							BTRFS_DISCARD_BITMAPS;
601 			spin_unlock(&discard_ctl->lock);
602 		}
603 	}
604 
605 	now = ktime_get_ns();
606 	spin_lock(&discard_ctl->lock);
607 	discard_ctl->prev_discard = trimmed;
608 	discard_ctl->prev_discard_time = now;
609 	btrfs_put_block_group(block_group);
610 	discard_ctl->block_group = NULL;
611 	__btrfs_discard_schedule_work(discard_ctl, now, false);
612 	spin_unlock(&discard_ctl->lock);
613 }
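/*
 * Summary of the per-block-group state progression driven by the work function
 * above (a sketch for orientation only):
 *
 *   BTRFS_DISCARD_RESET_CURSOR
 *     -> BTRFS_DISCARD_EXTENTS (or BTRFS_DISCARD_FULLY_REMAPPED for an empty
 *        remapped block group), set up in peek_discard_list()
 *     -> BTRFS_DISCARD_BITMAPS, once the extent cursor reaches the end of the
 *        block group
 *     -> btrfs_finish_discard_pass(), which sends the block group to the
 *        unused_bgs path, the unused discard list, or the next filter list
 *
 * The check above only advances to BTRFS_DISCARD_BITMAPS if the state was not
 * reset to BTRFS_DISCARD_RESET_CURSOR in the meantime, which can happen when
 * free space is returned to the block group while a pass is in progress.
 */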
614 
615 /*
616  * Recalculate the base delay.
617  *
618  * @discard_ctl: discard control
619  *
 620  * Recalculate the base delay, which is derived from the iops_limit.  Clamp it
 621  * between the lower limit (1ms, or 0 when iops_limit is unset) and the upper
 622  * limit (BTRFS_DISCARD_MAX_DELAY_MSEC).
623  */
624 void btrfs_discard_calc_delay(struct btrfs_discard_ctl *discard_ctl)
625 {
626 	s32 discardable_extents;
627 	s64 discardable_bytes;
628 	u32 iops_limit;
629 	unsigned long min_delay = BTRFS_DISCARD_MIN_DELAY_MSEC;
630 	unsigned long delay;
631 
632 	discardable_extents = atomic_read(&discard_ctl->discardable_extents);
633 	if (!discardable_extents)
634 		return;
635 
636 	spin_lock(&discard_ctl->lock);
637 
638 	/*
639 	 * The following is to fix a potential -1 discrepancy that we're not
640 	 * sure how to reproduce. But given that this is the only place that
 641 	 * utilizes these numbers and this is only called from
 642 	 * btrfs_finish_extent_commit() which is synchronized, we can correct
 643 	 * it here.
644 	 */
645 	if (discardable_extents < 0)
646 		atomic_add(-discardable_extents,
647 			   &discard_ctl->discardable_extents);
648 
649 	discardable_bytes = atomic64_read(&discard_ctl->discardable_bytes);
650 	if (discardable_bytes < 0)
651 		atomic64_add(-discardable_bytes,
652 			     &discard_ctl->discardable_bytes);
653 
654 	if (discardable_extents <= 0) {
655 		spin_unlock(&discard_ctl->lock);
656 		return;
657 	}
658 
659 	iops_limit = READ_ONCE(discard_ctl->iops_limit);
660 
661 	if (iops_limit) {
662 		delay = MSEC_PER_SEC / iops_limit;
663 	} else {
664 		/*
665 		 * Unset iops_limit means go as fast as possible, so allow a
666 		 * delay of 0.
667 		 */
668 		delay = 0;
669 		min_delay = 0;
670 	}
671 
672 	delay = clamp(delay, min_delay, BTRFS_DISCARD_MAX_DELAY_MSEC);
673 	discard_ctl->delay_ms = delay;
674 
675 	spin_unlock(&discard_ctl->lock);
676 }
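/*
 * Illustrative sketch, not used by the code above: the base delay is simply
 * MSEC_PER_SEC / iops_limit, bounded to [1ms, 1000ms].  The default iops_limit
 * of 1000 gives 1ms, iops_limit = 10 gives 100ms, iops_limit = 10000 computes
 * 0ms but is raised to the 1ms floor, and an unset (0) iops_limit drops the
 * floor entirely and yields 0ms.
 */
static inline unsigned long example_base_delay_msec(u32 iops_limit)
{
	unsigned long min_delay = BTRFS_DISCARD_MIN_DELAY_MSEC;
	unsigned long delay;

	if (iops_limit) {
		delay = MSEC_PER_SEC / iops_limit;
	} else {
		/* No iops limit: go as fast as scheduling allows. */
		delay = 0;
		min_delay = 0;
	}

	return clamp(delay, min_delay, BTRFS_DISCARD_MAX_DELAY_MSEC);
}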
677 
678 /*
679  * Propagate discard counters.
680  *
681  * @block_group: block_group of interest
682  *
683  * Propagate deltas of counters up to the discard_ctl.  It maintains a current
 684  * counter and a previous counter, passing the delta up to the global stat.
685  * Then the current counter value becomes the previous counter value.
686  */
687 void btrfs_discard_update_discardable(struct btrfs_block_group *block_group)
688 {
689 	struct btrfs_free_space_ctl *ctl;
690 	struct btrfs_discard_ctl *discard_ctl;
691 	s32 extents_delta;
692 	s64 bytes_delta;
693 
694 	if (!block_group ||
695 	    !btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC) ||
696 	    !btrfs_is_block_group_data_only(block_group))
697 		return;
698 
699 	ctl = block_group->free_space_ctl;
700 	discard_ctl = &block_group->fs_info->discard_ctl;
701 
702 	lockdep_assert_held(&ctl->tree_lock);
703 	extents_delta = ctl->discardable_extents[BTRFS_STAT_CURR] -
704 			ctl->discardable_extents[BTRFS_STAT_PREV];
705 	if (extents_delta) {
706 		atomic_add(extents_delta, &discard_ctl->discardable_extents);
707 		ctl->discardable_extents[BTRFS_STAT_PREV] =
708 			ctl->discardable_extents[BTRFS_STAT_CURR];
709 	}
710 
711 	bytes_delta = ctl->discardable_bytes[BTRFS_STAT_CURR] -
712 		      ctl->discardable_bytes[BTRFS_STAT_PREV];
713 	if (bytes_delta) {
714 		atomic64_add(bytes_delta, &discard_ctl->discardable_bytes);
715 		ctl->discardable_bytes[BTRFS_STAT_PREV] =
716 			ctl->discardable_bytes[BTRFS_STAT_CURR];
717 	}
718 }
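/*
 * Illustrative sketch of the CURR/PREV scheme above with made up numbers: if a
 * block group's discardable_extents counter moves 4 -> 7 between two calls,
 * CURR reads 7 while PREV still holds 4, so +3 is added to the global counter
 * and PREV becomes 7; a later drop to 5 propagates -2.  The global counter thus
 * tracks the sum of the per-block-group CURR values without walking them all.
 */
static inline s32 example_propagate_delta(s32 *prev, s32 curr, atomic_t *global)
{
	s32 delta = curr - *prev;

	if (delta) {
		atomic_add(delta, global);
		*prev = curr;
	}

	return delta;
}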
719 
720 /*
721  * Punt unused_bgs list to discard lists.
722  *
723  * @fs_info: fs_info of interest
724  *
725  * The unused_bgs list needs to be punted to the discard lists because the
726  * order of operations is changed.  In the normal synchronous discard path, the
727  * block groups are trimmed via a single large trim in transaction commit.  This
728  * is ultimately what we are trying to avoid with asynchronous discard.  Thus,
729  * it must be done before going down the unused_bgs path.
730  */
731 void btrfs_discard_punt_unused_bgs_list(struct btrfs_fs_info *fs_info)
732 {
733 	struct btrfs_block_group *block_group, *next;
734 
735 	spin_lock(&fs_info->unused_bgs_lock);
736 	/* We enabled async discard, so punt all to the queue */
737 	list_for_each_entry_safe(block_group, next, &fs_info->unused_bgs,
738 				 bg_list) {
739 		list_del_init(&block_group->bg_list);
740 		btrfs_discard_queue_work(&fs_info->discard_ctl, block_group);
741 		/*
742 		 * This put is for the get done by btrfs_mark_bg_unused.
743 		 * Queueing discard incremented it for discard's reference.
744 		 */
745 		btrfs_put_block_group(block_group);
746 	}
747 	spin_unlock(&fs_info->unused_bgs_lock);
748 }
749 
750 /*
751  * Purge discard lists.
752  *
753  * @discard_ctl: discard control
754  *
755  * If we are disabling async discard, we may have intercepted block groups that
756  * are completely free and ready for the unused_bgs path.  As discarding will
757  * now happen in transaction commit or not at all, we can safely mark the
758  * corresponding block groups as unused and they will be sent on their merry
759  * way to the unused_bgs list.
760  */
761 static void btrfs_discard_purge_list(struct btrfs_discard_ctl *discard_ctl)
762 {
763 	struct btrfs_block_group *block_group, *next;
764 	int i;
765 
766 	spin_lock(&discard_ctl->lock);
767 	for (i = 0; i < BTRFS_NR_DISCARD_LISTS; i++) {
768 		list_for_each_entry_safe(block_group, next,
769 					 &discard_ctl->discard_list[i],
770 					 discard_list) {
771 			list_del_init(&block_group->discard_list);
772 			spin_unlock(&discard_ctl->lock);
773 			if (block_group->used == 0)
774 				btrfs_mark_bg_unused(block_group);
775 			spin_lock(&discard_ctl->lock);
776 			btrfs_put_block_group(block_group);
777 		}
778 	}
779 	spin_unlock(&discard_ctl->lock);
780 }
781 
782 void btrfs_discard_resume(struct btrfs_fs_info *fs_info)
783 {
784 	if (!btrfs_test_opt(fs_info, DISCARD_ASYNC)) {
785 		btrfs_discard_cleanup(fs_info);
786 		return;
787 	}
788 
789 	btrfs_discard_punt_unused_bgs_list(fs_info);
790 
791 	set_bit(BTRFS_FS_DISCARD_RUNNING, &fs_info->flags);
792 }
793 
794 void btrfs_discard_stop(struct btrfs_fs_info *fs_info)
795 {
796 	clear_bit(BTRFS_FS_DISCARD_RUNNING, &fs_info->flags);
797 }
798 
799 void btrfs_discard_init(struct btrfs_fs_info *fs_info)
800 {
801 	struct btrfs_discard_ctl *discard_ctl = &fs_info->discard_ctl;
802 	int i;
803 
804 	spin_lock_init(&discard_ctl->lock);
805 	INIT_DELAYED_WORK(&discard_ctl->work, btrfs_discard_workfn);
806 
807 	for (i = 0; i < BTRFS_NR_DISCARD_LISTS; i++)
808 		INIT_LIST_HEAD(&discard_ctl->discard_list[i]);
809 
810 	discard_ctl->prev_discard = 0;
811 	discard_ctl->prev_discard_time = 0;
812 	atomic_set(&discard_ctl->discardable_extents, 0);
813 	atomic64_set(&discard_ctl->discardable_bytes, 0);
814 	discard_ctl->max_discard_size = BTRFS_ASYNC_DISCARD_DEFAULT_MAX_SIZE;
815 	discard_ctl->delay_ms = BTRFS_DISCARD_MAX_DELAY_MSEC;
816 	discard_ctl->iops_limit = BTRFS_DISCARD_MAX_IOPS;
817 	discard_ctl->kbps_limit = 0;
818 	discard_ctl->discard_extent_bytes = 0;
819 	discard_ctl->discard_bitmap_bytes = 0;
820 	atomic64_set(&discard_ctl->discard_bytes_saved, 0);
821 }
822 
823 void btrfs_discard_cleanup(struct btrfs_fs_info *fs_info)
824 {
825 	btrfs_discard_stop(fs_info);
826 	cancel_delayed_work_sync(&fs_info->discard_ctl.work);
827 	btrfs_discard_purge_list(&fs_info->discard_ctl);
828 }
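/*
 * Rough lifecycle of the async discard machinery defined in this file (a
 * sketch for orientation; the call sites live elsewhere in btrfs):
 *
 *   btrfs_discard_init()    - set up the lists, defaults and the delayed work
 *   btrfs_discard_resume()  - punt already unused block groups to the discard
 *                             lists and set BTRFS_FS_DISCARD_RUNNING
 *   btrfs_discard_workfn()  - repeatedly rescheduled, trimming one region per
 *                             run, rate limited by the iops/kbps settings
 *   btrfs_discard_stop()    - clear BTRFS_FS_DISCARD_RUNNING
 *   btrfs_discard_cleanup() - stop, cancel the pending work and purge the lists
 */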
829