// SPDX-License-Identifier: GPL-2.0

#include <linux/jiffies.h>
#include <linux/kernel.h>
#include <linux/ktime.h>
#include <linux/list.h>
#include <linux/math64.h>
#include <linux/sizes.h>
#include <linux/workqueue.h>
#include "ctree.h"
#include "block-group.h"
#include "discard.h"
#include "free-space-cache.h"
#include "fs.h"

/*
 * This contains the logic to handle async discard.
 *
 * Async discard manages trimming of free space outside of transaction commit.
 * Discarding is done by managing the block_groups on an LRU list based on free
 * space recency.  Two passes are used to first prioritize discarding extents
 * and then to give free space in the bitmaps the best opportunity to coalesce
 * before it is trimmed.  The block_groups are maintained on multiple lists to
 * allow for multiple passes with different discard filter requirements.  A
 * delayed work item is used to manage discarding with timeout determined by a
 * max of the delay incurred by the iops rate limit, the byte rate limit, and
 * the max delay of BTRFS_DISCARD_MAX_DELAY.
 *
 * Note, this only keeps track of block_groups that are explicitly for data.
 * Mixed block_groups are not supported.
 *
 * The first list is special to manage discarding of fully free block groups.
 * This is necessary because we issue a final trim for a fully free block group
 * after forgetting it.  When a block group becomes unused, instead of directly
 * being added to the unused_bgs list, we add it to this first list.  Then
 * from there, if it becomes fully discarded, we place it onto the unused_bgs
 * list.
 *
 * The in-memory free space cache serves as the backing state for discard.
 * Consequently this means there is no persistence.  We opt to load all the
 * block groups in as not discarded, so the mount case degenerates to the
 * crashing case.
 *
 * As the free space cache uses bitmaps, there exists a tradeoff between
 * ease/efficiency for find_free_extent() and the accuracy of discard state.
 * Here we opt to let untrimmed regions merge with everything while only letting
 * trimmed regions merge with other trimmed regions.  This can cause
 * overtrimming, but the coalescing benefit seems to be worth it.  Additionally,
 * bitmap state is tracked as a whole.  If we're able to fully trim a bitmap,
 * the trimmed flag is set on the bitmap.  Otherwise, if an allocation comes in,
 * this resets the state and we will retry trimming the whole bitmap.  This is a
 * tradeoff between discard state accuracy and the cost of accounting.
 */

/* This is an initial delay to give some chance for block reuse */
#define BTRFS_DISCARD_DELAY		(120ULL * NSEC_PER_SEC)
#define BTRFS_DISCARD_UNUSED_DELAY	(10ULL * NSEC_PER_SEC)

#define BTRFS_DISCARD_MIN_DELAY_MSEC	(1UL)
#define BTRFS_DISCARD_MAX_DELAY_MSEC	(1000UL)
#define BTRFS_DISCARD_MAX_IOPS		(1000U)

/* Monotonically decreasing minimum length filters after index 0 */
static int discard_minlen[BTRFS_NR_DISCARD_LISTS] = {
	0,
	BTRFS_ASYNC_DISCARD_MAX_FILTER,
	BTRFS_ASYNC_DISCARD_MIN_FILTER
};

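/*
 * Return the discard list that @block_group is currently filed on, based on
 * its discard_index.
 */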
static struct list_head *get_discard_list(struct btrfs_discard_ctl *discard_ctl,
					  const struct btrfs_block_group *block_group)
{
	return &discard_ctl->discard_list[block_group->discard_index];
}

/*
 * Determine if async discard should be running.
 *
 * @discard_ctl: discard control
 *
 * Check if the file system is writeable and BTRFS_FS_DISCARD_RUNNING is set.
 */
static bool btrfs_run_discard_work(const struct btrfs_discard_ctl *discard_ctl)
{
	struct btrfs_fs_info *fs_info = container_of(discard_ctl,
						     struct btrfs_fs_info,
						     discard_ctl);

	return (!(fs_info->sb->s_flags & SB_RDONLY) &&
		test_bit(BTRFS_FS_DISCARD_RUNNING, &fs_info->flags));
}

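/*
 * Add a block group to the tail of its filtered discard list.
 *
 * Requires discard_ctl->lock to be held.  A block group that is newly queued,
 * or one coming off the unused list, gets a fresh discard_eligible_time and a
 * reset cursor; the latter is also moved back to BTRFS_DISCARD_INDEX_START.
 * A reference is taken the first time the block group is queued.
 */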
static void __add_to_discard_list(struct btrfs_discard_ctl *discard_ctl,
				  struct btrfs_block_group *block_group)
{
	lockdep_assert_held(&discard_ctl->lock);

	if (list_empty(&block_group->discard_list) ||
	    block_group->discard_index == BTRFS_DISCARD_INDEX_UNUSED) {
		if (block_group->discard_index == BTRFS_DISCARD_INDEX_UNUSED)
			block_group->discard_index = BTRFS_DISCARD_INDEX_START;
		block_group->discard_eligible_time = (ktime_get_ns() +
						      BTRFS_DISCARD_DELAY);
		block_group->discard_state = BTRFS_DISCARD_RESET_CURSOR;
	}
	if (list_empty(&block_group->discard_list))
		btrfs_get_block_group(block_group);

	list_move_tail(&block_group->discard_list,
		       get_discard_list(discard_ctl, block_group));
}

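/*
 * Lock and queue a block group for discard, but only if it is a data only
 * block group and async discard is currently running.
 */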
static void add_to_discard_list(struct btrfs_discard_ctl *discard_ctl,
				struct btrfs_block_group *block_group)
{
	if (!btrfs_is_block_group_data_only(block_group))
		return;

	if (!btrfs_run_discard_work(discard_ctl))
		return;

	spin_lock(&discard_ctl->lock);
	__add_to_discard_list(discard_ctl, block_group);
	spin_unlock(&discard_ctl->lock);
}

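/*
 * Queue a now unused block group on the special unused discard list.
 *
 * The block group only becomes eligible after BTRFS_DISCARD_UNUSED_DELAY so a
 * block group that briefly empties out is not immediately discarded.  A
 * reference is taken if it wasn't already queued on another discard list.
 */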
static void add_to_discard_unused_list(struct btrfs_discard_ctl *discard_ctl,
				       struct btrfs_block_group *block_group)
{
	bool queued;

	spin_lock(&discard_ctl->lock);

	queued = !list_empty(&block_group->discard_list);

	if (!btrfs_run_discard_work(discard_ctl)) {
		spin_unlock(&discard_ctl->lock);
		return;
	}

	list_del_init(&block_group->discard_list);

	block_group->discard_index = BTRFS_DISCARD_INDEX_UNUSED;
	block_group->discard_eligible_time = (ktime_get_ns() +
					      BTRFS_DISCARD_UNUSED_DELAY);
	block_group->discard_state = BTRFS_DISCARD_RESET_CURSOR;
	if (!queued)
		btrfs_get_block_group(block_group);
	list_add_tail(&block_group->discard_list,
		      &discard_ctl->discard_list[BTRFS_DISCARD_INDEX_UNUSED]);

	spin_unlock(&discard_ctl->lock);
}

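/*
 * Remove a block group from the discard lists, dropping the list's reference.
 *
 * Returns true if @block_group was the one currently being discarded, in
 * which case discard_ctl->block_group is also cleared so the caller can
 * decide whether to wait for or reschedule the discard work.
 */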
static bool remove_from_discard_list(struct btrfs_discard_ctl *discard_ctl,
				     struct btrfs_block_group *block_group)
{
	bool running = false;
	bool queued = false;

	spin_lock(&discard_ctl->lock);

	if (block_group == discard_ctl->block_group) {
		running = true;
		discard_ctl->block_group = NULL;
	}

	block_group->discard_eligible_time = 0;
	queued = !list_empty(&block_group->discard_list);
	list_del_init(&block_group->discard_list);
	if (queued)
		btrfs_put_block_group(block_group);

	spin_unlock(&discard_ctl->lock);

	return running;
}

/*
 * Find block_group that's up next for discarding.
 *
 * @discard_ctl:  discard control
 * @now:          current time
 *
 * Iterate over the discard lists to find the next block_group up for
 * discarding, checking the discard_eligible_time of each block_group.
 */
static struct btrfs_block_group *find_next_block_group(
					struct btrfs_discard_ctl *discard_ctl,
					u64 now)
{
	struct btrfs_block_group *ret_block_group = NULL, *block_group;
	int i;

	for (i = 0; i < BTRFS_NR_DISCARD_LISTS; i++) {
		struct list_head *discard_list = &discard_ctl->discard_list[i];

		if (!list_empty(discard_list)) {
			block_group = list_first_entry(discard_list,
						       struct btrfs_block_group,
						       discard_list);

			if (!ret_block_group)
				ret_block_group = block_group;

			if (ret_block_group->discard_eligible_time < now)
				break;

			if (ret_block_group->discard_eligible_time >
			    block_group->discard_eligible_time)
				ret_block_group = block_group;
		}
	}

	return ret_block_group;
}

/*
 * Look up next block group and set it for use.
 *
 * @discard_ctl:   discard control
 * @discard_state: the discard_state of the block_group after state management
 * @discard_index: the discard_index of the block_group after state management
 * @now:           time when discard was invoked, in ns
 *
 * Wrap find_next_block_group() and set the block_group to be in use.
 * @discard_state's control flow is managed here.  Variables related to
 * @discard_state are reset here as needed (eg. @discard_cursor).  @discard_state
 * and @discard_index are remembered as they may change while we're discarding,
 * but we want the discard to execute in the context determined here.
 */
static struct btrfs_block_group *peek_discard_list(
					struct btrfs_discard_ctl *discard_ctl,
					enum btrfs_discard_state *discard_state,
					int *discard_index, u64 now)
{
	struct btrfs_block_group *block_group;

	spin_lock(&discard_ctl->lock);
again:
	block_group = find_next_block_group(discard_ctl, now);

	if (block_group && now >= block_group->discard_eligible_time) {
		if (block_group->discard_index == BTRFS_DISCARD_INDEX_UNUSED &&
		    block_group->used != 0) {
			if (btrfs_is_block_group_data_only(block_group)) {
				__add_to_discard_list(discard_ctl, block_group);
				/*
				 * The block group must have been moved to
				 * another discard list even if discard was
				 * disabled in the meantime or a transaction
				 * abort happened, otherwise we can end up in
				 * an infinite loop, always jumping to the
				 * 'again' label and getting this block group
				 * over and over in case there are no other
				 * block groups in the discard lists.
				 */
				ASSERT(block_group->discard_index !=
				       BTRFS_DISCARD_INDEX_UNUSED,
				       "discard_index=%d",
				       block_group->discard_index);
			} else {
				list_del_init(&block_group->discard_list);
				btrfs_put_block_group(block_group);
			}
			goto again;
		}
		if (block_group->discard_state == BTRFS_DISCARD_RESET_CURSOR) {
			block_group->discard_cursor = block_group->start;
			block_group->discard_state = BTRFS_DISCARD_EXTENTS;
		}
	}
	if (block_group) {
		btrfs_get_block_group(block_group);
		discard_ctl->block_group = block_group;
		*discard_state = block_group->discard_state;
		*discard_index = block_group->discard_index;
	}
	spin_unlock(&discard_ctl->lock);

	return block_group;
}

/*
 * Update a block group's filters.
 *
 * @block_group:  block group of interest
 * @bytes:        recently freed region size after coalescing
 *
 * Async discard maintains multiple lists with progressively smaller filters
 * to prioritize discarding based on size.  Should a free space that matches
 * a larger filter be returned to the free_space_cache, prioritize that discard
 * by moving @block_group to the proper filter.
 */
void btrfs_discard_check_filter(struct btrfs_block_group *block_group,
				u64 bytes)
{
	struct btrfs_discard_ctl *discard_ctl;

	if (!block_group ||
	    !btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC))
		return;

	discard_ctl = &block_group->fs_info->discard_ctl;

	if (block_group->discard_index > BTRFS_DISCARD_INDEX_START &&
	    bytes >= discard_minlen[block_group->discard_index - 1]) {
		int i;

		remove_from_discard_list(discard_ctl, block_group);

		for (i = BTRFS_DISCARD_INDEX_START; i < BTRFS_NR_DISCARD_LISTS;
		     i++) {
			if (bytes >= discard_minlen[i]) {
				block_group->discard_index = i;
				add_to_discard_list(discard_ctl, block_group);
				break;
			}
		}
	}
}

/*
 * Move a block group along the discard lists.
 *
 * @discard_ctl: discard control
 * @block_group: block_group of interest
 *
 * Increment @block_group's discard_index.  If it falls off the list, let it be.
 * Otherwise add it back to the appropriate list.
 */
static void btrfs_update_discard_index(struct btrfs_discard_ctl *discard_ctl,
				       struct btrfs_block_group *block_group)
{
	block_group->discard_index++;
	if (block_group->discard_index == BTRFS_NR_DISCARD_LISTS) {
		block_group->discard_index = 1;
		return;
	}

	add_to_discard_list(discard_ctl, block_group);
}

/*
 * Remove a block_group from the discard lists.
 *
 * @discard_ctl: discard control
 * @block_group: block_group of interest
 *
 * Remove @block_group from the discard lists.  If necessary, wait on the
 * current work and then reschedule the delayed work.
 */
void btrfs_discard_cancel_work(struct btrfs_discard_ctl *discard_ctl,
			       struct btrfs_block_group *block_group)
{
	if (remove_from_discard_list(discard_ctl, block_group)) {
		cancel_delayed_work_sync(&discard_ctl->work);
		btrfs_discard_schedule_work(discard_ctl, true);
	}
}

/*
 * Handles queuing the block_groups.
 *
 * @discard_ctl: discard control
 * @block_group: block_group of interest
 *
 * Maintain the LRU order of the discard lists.
 */
void btrfs_discard_queue_work(struct btrfs_discard_ctl *discard_ctl,
			      struct btrfs_block_group *block_group)
{
	if (!block_group || !btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC))
		return;

	if (block_group->used == 0)
		add_to_discard_unused_list(discard_ctl, block_group);
	else
		add_to_discard_list(discard_ctl, block_group);

	if (!delayed_work_pending(&discard_ctl->work))
		btrfs_discard_schedule_work(discard_ctl, false);
}

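/*
 * Schedule the delayed discard work with discard_ctl->lock held.
 *
 * The delay is the largest of the configured delay_ms, the delay implied by
 * kbps_limit for the previously discarded bytes, and the time until the next
 * block group becomes eligible.  With @override, the time already spent
 * waiting since the last discard is subtracted from that delay.
 */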
static void __btrfs_discard_schedule_work(struct btrfs_discard_ctl *discard_ctl,
					  u64 now, bool override)
{
	struct btrfs_block_group *block_group;

	if (!btrfs_run_discard_work(discard_ctl))
		return;
	if (!override && delayed_work_pending(&discard_ctl->work))
		return;

	block_group = find_next_block_group(discard_ctl, now);
	if (block_group) {
		u64 delay = discard_ctl->delay_ms * NSEC_PER_MSEC;
		u32 kbps_limit = READ_ONCE(discard_ctl->kbps_limit);

		/*
		 * A single delayed workqueue item is responsible for
		 * discarding, so we can manage the bytes rate limit by keeping
		 * track of the previous discard.
		 */
		if (kbps_limit && discard_ctl->prev_discard) {
			u64 bps_limit = ((u64)kbps_limit) * SZ_1K;
			u64 bps_delay = div64_u64(discard_ctl->prev_discard *
						  NSEC_PER_SEC, bps_limit);

			delay = max(delay, bps_delay);
		}

		/*
		 * This timeout is to hopefully prevent immediate discarding
		 * in a recently allocated block group.
		 */
		if (now < block_group->discard_eligible_time) {
			u64 bg_timeout = block_group->discard_eligible_time - now;

			delay = max(delay, bg_timeout);
		}

		if (override && discard_ctl->prev_discard) {
			u64 elapsed = now - discard_ctl->prev_discard_time;

			if (delay > elapsed)
				delay -= elapsed;
			else
				delay = 0;
		}

		mod_delayed_work(discard_ctl->discard_workers,
				 &discard_ctl->work, nsecs_to_jiffies(delay));
	}
}

/*
 * Responsible for scheduling the discard work.
 *
 * @discard_ctl:  discard control
 * @override:     override the current timer
 *
 * Discards are issued by a delayed workqueue item.  @override is used to
 * update the current delay as the baseline delay interval is reevaluated on
 * transaction commit.  This is also maxed with any other rate limit.
 */
void btrfs_discard_schedule_work(struct btrfs_discard_ctl *discard_ctl,
				 bool override)
{
	const u64 now = ktime_get_ns();

	spin_lock(&discard_ctl->lock);
	__btrfs_discard_schedule_work(discard_ctl, now, override);
	spin_unlock(&discard_ctl->lock);
}

/*
 * Determine next step of a block_group.
 *
 * @discard_ctl: discard control
 * @block_group: block_group of interest
 *
 * Determine the next step for a block group after it's finished going through
 * a pass on a discard list.  If it is unused and fully trimmed, we can mark it
 * unused and send it to the unused_bgs path.  Otherwise, pass it onto the
 * appropriate filter list or let it fall off.
 */
static void btrfs_finish_discard_pass(struct btrfs_discard_ctl *discard_ctl,
				      struct btrfs_block_group *block_group)
{
	remove_from_discard_list(discard_ctl, block_group);

	if (block_group->used == 0) {
		if (btrfs_is_free_space_trimmed(block_group))
			btrfs_mark_bg_unused(block_group);
		else
			add_to_discard_unused_list(discard_ctl, block_group);
	} else {
		btrfs_update_discard_index(discard_ctl, block_group);
	}
}

/*
 * Discard work queue callback
 *
 * @work: work
 *
 * Find the next block_group to start discarding and then discard a single
 * region.  It does this in a two-pass fashion: first extents and second
 * bitmaps.  Completely discarded block groups are sent to the unused_bgs path.
 */
static void btrfs_discard_workfn(struct work_struct *work)
{
	struct btrfs_discard_ctl *discard_ctl;
	struct btrfs_block_group *block_group;
	enum btrfs_discard_state discard_state;
	int discard_index = 0;
	u64 trimmed = 0;
	u64 minlen = 0;
	u64 now = ktime_get_ns();

	discard_ctl = container_of(work, struct btrfs_discard_ctl, work.work);

	block_group = peek_discard_list(discard_ctl, &discard_state,
					&discard_index, now);
	if (!block_group)
		return;
	if (!btrfs_run_discard_work(discard_ctl)) {
		spin_lock(&discard_ctl->lock);
		btrfs_put_block_group(block_group);
		discard_ctl->block_group = NULL;
		spin_unlock(&discard_ctl->lock);
		return;
	}
	if (now < block_group->discard_eligible_time) {
		spin_lock(&discard_ctl->lock);
		btrfs_put_block_group(block_group);
		discard_ctl->block_group = NULL;
		spin_unlock(&discard_ctl->lock);
		btrfs_discard_schedule_work(discard_ctl, false);
		return;
	}

	/* Perform discarding */
	minlen = discard_minlen[discard_index];

	if (discard_state == BTRFS_DISCARD_BITMAPS) {
		u64 maxlen = 0;

		/*
		 * Use the previous level's minimum discard length as the max
		 * length filter.  In the case something is added to make a
		 * region go beyond the max filter, the entire bitmap is set
		 * back to BTRFS_TRIM_STATE_UNTRIMMED.
		 */
		if (discard_index != BTRFS_DISCARD_INDEX_UNUSED)
			maxlen = discard_minlen[discard_index - 1];

		btrfs_trim_block_group_bitmaps(block_group, &trimmed,
				       block_group->discard_cursor,
				       btrfs_block_group_end(block_group),
				       minlen, maxlen, true);
		discard_ctl->discard_bitmap_bytes += trimmed;
	} else {
		btrfs_trim_block_group_extents(block_group, &trimmed,
				       block_group->discard_cursor,
				       btrfs_block_group_end(block_group),
				       minlen, true);
		discard_ctl->discard_extent_bytes += trimmed;
	}

	/* Determine next steps for a block_group */
	if (block_group->discard_cursor >= btrfs_block_group_end(block_group)) {
		if (discard_state == BTRFS_DISCARD_BITMAPS) {
			btrfs_finish_discard_pass(discard_ctl, block_group);
		} else {
			block_group->discard_cursor = block_group->start;
			spin_lock(&discard_ctl->lock);
			if (block_group->discard_state !=
			    BTRFS_DISCARD_RESET_CURSOR)
				block_group->discard_state =
							BTRFS_DISCARD_BITMAPS;
			spin_unlock(&discard_ctl->lock);
		}
	}

	now = ktime_get_ns();
	spin_lock(&discard_ctl->lock);
	discard_ctl->prev_discard = trimmed;
	discard_ctl->prev_discard_time = now;
	btrfs_put_block_group(block_group);
	discard_ctl->block_group = NULL;
	__btrfs_discard_schedule_work(discard_ctl, now, false);
	spin_unlock(&discard_ctl->lock);
}

/*
 * Recalculate the base delay.
 *
 * @discard_ctl: discard control
 *
 * Recalculate the base delay which is based off the total number of
 * discardable_extents.  Clamp this between the lower_limit (iops_limit or 1ms)
 * and the upper_limit (BTRFS_DISCARD_MAX_DELAY_MSEC).
 */
void btrfs_discard_calc_delay(struct btrfs_discard_ctl *discard_ctl)
{
	s32 discardable_extents;
	s64 discardable_bytes;
	u32 iops_limit;
	unsigned long min_delay = BTRFS_DISCARD_MIN_DELAY_MSEC;
	unsigned long delay;

	discardable_extents = atomic_read(&discard_ctl->discardable_extents);
	if (!discardable_extents)
		return;

	spin_lock(&discard_ctl->lock);

	/*
	 * The following is to fix a potential -1 discrepancy that we're not
	 * sure how to reproduce. But given that this is the only place that
	 * utilizes these numbers and this is only called from
	 * btrfs_finish_extent_commit() which is synchronized, we can correct
	 * here.
	 */
	if (discardable_extents < 0)
		atomic_add(-discardable_extents,
			   &discard_ctl->discardable_extents);

	discardable_bytes = atomic64_read(&discard_ctl->discardable_bytes);
	if (discardable_bytes < 0)
		atomic64_add(-discardable_bytes,
			     &discard_ctl->discardable_bytes);

	if (discardable_extents <= 0) {
		spin_unlock(&discard_ctl->lock);
		return;
	}

	iops_limit = READ_ONCE(discard_ctl->iops_limit);

	if (iops_limit) {
		delay = MSEC_PER_SEC / iops_limit;
	} else {
		/*
		 * Unset iops_limit means go as fast as possible, so allow a
		 * delay of 0.
		 */
		delay = 0;
		min_delay = 0;
	}

	delay = clamp(delay, min_delay, BTRFS_DISCARD_MAX_DELAY_MSEC);
	discard_ctl->delay_ms = delay;

	spin_unlock(&discard_ctl->lock);
}

/*
 * Propagate discard counters.
 *
 * @block_group: block_group of interest
 *
 * Propagate deltas of counters up to the discard_ctl.  It maintains a current
 * counter and a previous counter, passing the delta up to the global stat.
 * Then the current counter value becomes the previous counter value.
 */
void btrfs_discard_update_discardable(struct btrfs_block_group *block_group)
{
	struct btrfs_free_space_ctl *ctl;
	struct btrfs_discard_ctl *discard_ctl;
	s32 extents_delta;
	s64 bytes_delta;

	if (!block_group ||
	    !btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC) ||
	    !btrfs_is_block_group_data_only(block_group))
		return;

	ctl = block_group->free_space_ctl;
	discard_ctl = &block_group->fs_info->discard_ctl;

	lockdep_assert_held(&ctl->tree_lock);
	extents_delta = ctl->discardable_extents[BTRFS_STAT_CURR] -
			ctl->discardable_extents[BTRFS_STAT_PREV];
	if (extents_delta) {
		atomic_add(extents_delta, &discard_ctl->discardable_extents);
		ctl->discardable_extents[BTRFS_STAT_PREV] =
			ctl->discardable_extents[BTRFS_STAT_CURR];
	}

	bytes_delta = ctl->discardable_bytes[BTRFS_STAT_CURR] -
		      ctl->discardable_bytes[BTRFS_STAT_PREV];
	if (bytes_delta) {
		atomic64_add(bytes_delta, &discard_ctl->discardable_bytes);
		ctl->discardable_bytes[BTRFS_STAT_PREV] =
			ctl->discardable_bytes[BTRFS_STAT_CURR];
	}
}

/*
 * Punt unused_bgs list to discard lists.
 *
 * @fs_info: fs_info of interest
 *
 * The unused_bgs list needs to be punted to the discard lists because the
 * order of operations is changed.  In the normal synchronous discard path, the
 * block groups are trimmed via a single large trim in transaction commit.  This
 * is ultimately what we are trying to avoid with asynchronous discard.  Thus,
 * it must be done before going down the unused_bgs path.
 */
void btrfs_discard_punt_unused_bgs_list(struct btrfs_fs_info *fs_info)
{
	struct btrfs_block_group *block_group, *next;

	spin_lock(&fs_info->unused_bgs_lock);
	/* We enabled async discard, so punt all to the queue */
	list_for_each_entry_safe(block_group, next, &fs_info->unused_bgs,
				 bg_list) {
		list_del_init(&block_group->bg_list);
		btrfs_discard_queue_work(&fs_info->discard_ctl, block_group);
		/*
		 * This put is for the get done by btrfs_mark_bg_unused.
		 * Queueing discard incremented it for discard's reference.
		 */
		btrfs_put_block_group(block_group);
	}
	spin_unlock(&fs_info->unused_bgs_lock);
}

/*
 * Purge discard lists.
 *
 * @discard_ctl: discard control
 *
 * If we are disabling async discard, we may have intercepted block groups that
 * are completely free and ready for the unused_bgs path.  As discarding will
 * now happen in transaction commit or not at all, we can safely mark the
 * corresponding block groups as unused and they will be sent on their merry
 * way to the unused_bgs list.
 */
static void btrfs_discard_purge_list(struct btrfs_discard_ctl *discard_ctl)
{
	struct btrfs_block_group *block_group, *next;
	int i;

	spin_lock(&discard_ctl->lock);
	for (i = 0; i < BTRFS_NR_DISCARD_LISTS; i++) {
		list_for_each_entry_safe(block_group, next,
					 &discard_ctl->discard_list[i],
					 discard_list) {
			list_del_init(&block_group->discard_list);
			spin_unlock(&discard_ctl->lock);
			if (block_group->used == 0)
				btrfs_mark_bg_unused(block_group);
			spin_lock(&discard_ctl->lock);
			btrfs_put_block_group(block_group);
		}
	}
	spin_unlock(&discard_ctl->lock);
}

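/*
 * Start async discard if the mount option is set, otherwise tear down any
 * leftover discard state.  Unused block groups that accumulated before this
 * point are punted to the discard lists before discard is marked as running.
 */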
void btrfs_discard_resume(struct btrfs_fs_info *fs_info)
{
	if (!btrfs_test_opt(fs_info, DISCARD_ASYNC)) {
		btrfs_discard_cleanup(fs_info);
		return;
	}

	btrfs_discard_punt_unused_bgs_list(fs_info);

	set_bit(BTRFS_FS_DISCARD_RUNNING, &fs_info->flags);
}

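/* Stop async discard by clearing BTRFS_FS_DISCARD_RUNNING. */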
void btrfs_discard_stop(struct btrfs_fs_info *fs_info)
{
	clear_bit(BTRFS_FS_DISCARD_RUNNING, &fs_info->flags);
}

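/*
 * Initialize the discard control structures at mount time: the lock, the
 * delayed work, the per-filter lists, the counters and the default limits.
 */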
void btrfs_discard_init(struct btrfs_fs_info *fs_info)
{
	struct btrfs_discard_ctl *discard_ctl = &fs_info->discard_ctl;
	int i;

	spin_lock_init(&discard_ctl->lock);
	INIT_DELAYED_WORK(&discard_ctl->work, btrfs_discard_workfn);

	for (i = 0; i < BTRFS_NR_DISCARD_LISTS; i++)
		INIT_LIST_HEAD(&discard_ctl->discard_list[i]);

	discard_ctl->prev_discard = 0;
	discard_ctl->prev_discard_time = 0;
	atomic_set(&discard_ctl->discardable_extents, 0);
	atomic64_set(&discard_ctl->discardable_bytes, 0);
	discard_ctl->max_discard_size = BTRFS_ASYNC_DISCARD_DEFAULT_MAX_SIZE;
	discard_ctl->delay_ms = BTRFS_DISCARD_MAX_DELAY_MSEC;
	discard_ctl->iops_limit = BTRFS_DISCARD_MAX_IOPS;
	discard_ctl->kbps_limit = 0;
	discard_ctl->discard_extent_bytes = 0;
	discard_ctl->discard_bitmap_bytes = 0;
	atomic64_set(&discard_ctl->discard_bytes_saved, 0);
}

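/*
 * Stop async discard, cancel any pending discard work and release the block
 * groups still sitting on the discard lists.
 */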
void btrfs_discard_cleanup(struct btrfs_fs_info *fs_info)
{
	btrfs_discard_stop(fs_info);
	cancel_delayed_work_sync(&fs_info->discard_ctl.work);
	btrfs_discard_purge_list(&fs_info->discard_ctl);
}