xref: /linux/fs/xfs/xfs_discard.c (revision f3f5edc5e41e038cf66d124a4cbacf6ff0983513)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (C) 2010, 2023 Red Hat, Inc.
4  * All Rights Reserved.
5  */
6 #include "xfs.h"
7 #include "xfs_shared.h"
8 #include "xfs_format.h"
9 #include "xfs_log_format.h"
10 #include "xfs_trans_resv.h"
11 #include "xfs_trans.h"
12 #include "xfs_mount.h"
13 #include "xfs_btree.h"
14 #include "xfs_alloc_btree.h"
15 #include "xfs_alloc.h"
16 #include "xfs_discard.h"
17 #include "xfs_error.h"
18 #include "xfs_extent_busy.h"
19 #include "xfs_trace.h"
20 #include "xfs_log.h"
21 #include "xfs_ag.h"
22 #include "xfs_health.h"
23 #include "xfs_rtbitmap.h"
24 #include "xfs_rtgroup.h"
25 
26 /*
27  * Notes on an efficient, low latency fstrim algorithm
28  *
29  * We need to walk the filesystem free space and issue discards on the free
30  * space that meets the search criteria (size and location). We cannot issue
31  * discards on extents that might be in use, or are so recently in use they are
32  * still marked as busy. To serialise against extent state changes whilst we are
33  * gathering extents to trim, we must hold the AGF lock to lock out other
34  * allocations and extent free operations that might change extent state.
35  *
36  * However, we cannot just hold the AGF for the entire AG free space walk whilst
37  * we issue discards on each free space that is found. Storage devices can have
38  * extremely slow discard implementations (e.g. ceph RBD) and so walking a
39  * couple of million free extents and issuing synchronous discards on each
40  * extent can take a *long* time. Whilst we are doing this walk, nothing else
41  * can access the AGF, and we can stall transactions and hence the log whilst
42 * modifications wait for the AGF lock to be released. This can lead to hung
43 * tasks kicking the hung task timer and rebooting the system. This is bad.
44  *
45  * Hence we need to take a leaf from the bulkstat playbook. It takes the AGI
46  * lock, gathers a range of inode cluster buffers that are allocated, drops the
47  * AGI lock and then reads all the inode cluster buffers and processes them. It
48  * loops doing this, using a cursor to keep track of where it is up to in the AG
49  * for each iteration to restart the INOBT lookup from.
50  *
51  * We can't do this exactly with free space - once we drop the AGF lock, the
52  * state of the free extent is out of our control and we cannot run a discard
53  * safely on it in this situation. Unless, of course, we've marked the free
54  * extent as busy and undergoing a discard operation whilst we held the AGF
55  * locked.
56  *
57  * This is exactly how online discard works - free extents are marked busy when
58  * they are freed, and once the extent free has been committed to the journal,
59  * the busy extent record is marked as "undergoing discard" and the discard is
60  * then issued on the free extent. Once the discard completes, the busy extent
61  * record is removed and the extent is able to be allocated again.
62  *
63  * In the context of fstrim, if we find a free extent we need to discard, we
64  * don't have to discard it immediately. All we need to do is record that free
65  * extent as being busy and under discard, and all the allocation routines will
66  * now avoid trying to allocate it. Hence if we mark the extent as busy under
67  * the AGF lock, we can safely discard it without holding the AGF lock because
68  * nothing will attempt to allocate that free space until the discard completes.
69  *
70  * This also allows us to issue discards asynchronously like we do with online
71  * discard, and so for fast devices fstrim will run much faster as we can have
72  * multiple discard operations in flight at once, as well as pipeline the free
73  * extent search so that it overlaps the in-flight discard IO.
74  */
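/*
 * In outline, a rough sketch of the loop this file implements (see
 * xfs_trim_perag_extents() and xfs_trim_gather_extents() below):
 *
 *	while (free space left to examine in this AG) {
 *		lock the AGF;
 *		gather a batch of free extents, marking each one busy and
 *		under discard;
 *		unlock the AGF;
 *		issue asynchronous discards on the gathered extents;
 *	}
 *
 * The busy extent records are only cleared from the discard bio completion
 * handler, so nothing can allocate that space until the discard has finished.
 */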
75 
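/*
 * Maximum number of free extents to examine in a single gathering pass before
 * stopping to issue discards on whatever has been collected so far.
 */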
76 #define XFS_DISCARD_MAX_EXAMINE	(100)
77 
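/*
 * Workqueue used to run discard bio completion work outside of irq context,
 * as the busy extent tree lock is not irq-safe.
 */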
78 struct workqueue_struct *xfs_discard_wq;
79 
80 static void
81 xfs_discard_endio_work(
82 	struct work_struct	*work)
83 {
84 	struct xfs_busy_extents	*extents =
85 		container_of(work, struct xfs_busy_extents, endio_work);
86 
87 	xfs_extent_busy_clear(&extents->extent_list, false);
88 	kfree(extents->owner);
89 }
90 
91 /*
92  * Queue up the actual completion to a thread to avoid IRQ-safe locking for
93  * eb_lock.
94  */
95 static void
96 xfs_discard_endio(
97 	struct bio		*bio)
98 {
99 	struct xfs_busy_extents	*extents = bio->bi_private;
100 
101 	INIT_WORK(&extents->endio_work, xfs_discard_endio_work);
102 	queue_work(xfs_discard_wq, &extents->endio_work);
103 	bio_put(bio);
104 }
105 
106 /*
107  * Walk the discard list and issue discards on all the busy extents in the
108  * list. We plug and chain the bios so that we only need a single completion
109  * call to clear all the busy extents once the discards are complete.
110  */
111 int
112 xfs_discard_extents(
113 	struct xfs_mount	*mp,
114 	struct xfs_busy_extents	*extents)
115 {
116 	struct xfs_extent_busy	*busyp;
117 	struct bio		*bio = NULL;
118 	struct blk_plug		plug;
119 	int			error = 0;
120 
121 	blk_start_plug(&plug);
122 	list_for_each_entry(busyp, &extents->extent_list, list) {
123 		struct xfs_group	*xg = busyp->group;
124 		struct xfs_buftarg	*btp =
125 			xfs_group_type_buftarg(xg->xg_mount, xg->xg_type);
126 
127 		trace_xfs_discard_extent(xg, busyp->bno, busyp->length);
128 
129 		error = __blkdev_issue_discard(btp->bt_bdev,
130 				xfs_gbno_to_daddr(xg, busyp->bno),
131 				XFS_FSB_TO_BB(mp, busyp->length),
132 				GFP_KERNEL, &bio);
133 		if (error && error != -EOPNOTSUPP) {
134 			xfs_info(mp,
135 	 "discard failed for extent [0x%llx,%u], error %d",
136 				 (unsigned long long)busyp->bno,
137 				 busyp->length,
138 				 error);
139 			break;
140 		}
141 	}
142 
143 	if (bio) {
144 		bio->bi_private = extents;
145 		bio->bi_end_io = xfs_discard_endio;
146 		submit_bio(bio);
147 	} else {
148 		xfs_discard_endio_work(&extents->endio_work);
149 	}
150 	blk_finish_plug(&plug);
151 
152 	return error;
153 }
154 
155 /*
156  * Care must be taken setting up the trim cursor as the perags may not have been
157  * initialised when the cursor is initialised, e.g. a clean mount which hasn't
158  * read in the AGFs and where the first operation run on the fs is a trim. This
159  * can result in perag fields that aren't initialised until
160  * xfs_trim_gather_extents() calls xfs_alloc_read_agf() to lock down the AG for
161  * the free space search.
162  */
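/*
 * Cursor state for a batched free space walk of one AG: @start and @end bound
 * the AG block range still to be trimmed, @count carries the extent length to
 * restart a by-length (cntbt) walk from (zero means the walk is finished),
 * @minlen is the smallest extent worth discarding, and @by_bno selects a
 * by-block-number (bnobt) walk for sub-AG trim requests.
 */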
163 struct xfs_trim_cur {
164 	xfs_agblock_t	start;
165 	xfs_extlen_t	count;
166 	xfs_agblock_t	end;
167 	xfs_extlen_t	minlen;
168 	bool		by_bno;
169 };
170 
171 static int
172 xfs_trim_gather_extents(
173 	struct xfs_perag	*pag,
174 	struct xfs_trim_cur	*tcur,
175 	struct xfs_busy_extents	*extents)
176 {
177 	struct xfs_mount	*mp = pag_mount(pag);
178 	struct xfs_trans	*tp;
179 	struct xfs_btree_cur	*cur;
180 	struct xfs_buf		*agbp;
181 	int			error;
182 	int			i;
183 	int			batch = XFS_DISCARD_MAX_EXAMINE;
184 
185 	/*
186 	 * Force out the log.  This means any transactions that might have freed
187 	 * space before we take the AGF buffer lock are now on disk, and the
188 	 * volatile disk cache is flushed.
189 	 */
190 	xfs_log_force(mp, XFS_LOG_SYNC);
191 
192 	tp = xfs_trans_alloc_empty(mp);
193 
194 	error = xfs_alloc_read_agf(pag, tp, 0, &agbp);
195 	if (error)
196 		goto out_trans_cancel;
197 
198 	/*
199 	 * The first time through, tcur->count will not have been initialised as
200 	 * pag->pagf_longest is not guaranteed to be valid before we read
201 	 * the AGF buffer above.
202 	 */
203 	if (!tcur->count)
204 		tcur->count = pag->pagf_longest;
205 
206 	if (tcur->by_bno) {
207 		/* sub-AG discard request always starts at tcur->start */
208 		cur = xfs_bnobt_init_cursor(mp, tp, agbp, pag);
209 		error = xfs_alloc_lookup_le(cur, tcur->start, 0, &i);
210 		if (!error && !i)
211 			error = xfs_alloc_lookup_ge(cur, tcur->start, 0, &i);
212 	} else if (tcur->start == 0) {
213 		/* first time through a by-len starts with max length */
214 		cur = xfs_cntbt_init_cursor(mp, tp, agbp, pag);
215 		error = xfs_alloc_lookup_ge(cur, 0, tcur->count, &i);
216 	} else {
217 		/* nth time through a by-len starts where we left off */
218 		cur = xfs_cntbt_init_cursor(mp, tp, agbp, pag);
219 		error = xfs_alloc_lookup_le(cur, tcur->start, tcur->count, &i);
220 	}
221 	if (error)
222 		goto out_del_cursor;
223 	if (i == 0) {
224 		/* nothing of that length left in the AG, we are done */
225 		tcur->count = 0;
226 		goto out_del_cursor;
227 	}
228 
229 	/*
230 	 * Loop until we are done with all extents that are large
231 	 * enough to be worth discarding or we hit batch limits.
232 	 */
233 	while (i) {
234 		xfs_agblock_t	fbno;
235 		xfs_extlen_t	flen;
236 
237 		error = xfs_alloc_get_rec(cur, &fbno, &flen, &i);
238 		if (error)
239 			break;
240 		if (XFS_IS_CORRUPT(mp, i != 1)) {
241 			xfs_btree_mark_sick(cur);
242 			error = -EFSCORRUPTED;
243 			break;
244 		}
245 
246 		if (--batch <= 0) {
247 			/*
248 			 * Update the cursor to point at this extent so we
249 			 * restart the next batch from this extent.
250 			 */
251 			tcur->start = fbno;
252 			tcur->count = flen;
253 			break;
254 		}
255 
256 		/*
257 		 * If the extent is entirely outside of the range we are
258 		 * supposed to trim, skip it.  Do not bother to trim down partially
259 		 * overlapping ranges for now.
260 		 */
261 		if (fbno + flen < tcur->start) {
262 			trace_xfs_discard_exclude(pag_group(pag), fbno, flen);
263 			goto next_extent;
264 		}
265 		if (fbno > tcur->end) {
266 			trace_xfs_discard_exclude(pag_group(pag), fbno, flen);
267 			if (tcur->by_bno) {
268 				tcur->count = 0;
269 				break;
270 			}
271 			goto next_extent;
272 		}
273 
274 		/* Trim the extent returned to the range we want. */
275 		if (fbno < tcur->start) {
276 			flen -= tcur->start - fbno;
277 			fbno = tcur->start;
278 		}
279 		if (fbno + flen > tcur->end + 1)
280 			flen = tcur->end - fbno + 1;
281 
282 		/* Too small?  Give up. */
283 		if (flen < tcur->minlen) {
284 			trace_xfs_discard_toosmall(pag_group(pag), fbno, flen);
285 			if (tcur->by_bno)
286 				goto next_extent;
287 			tcur->count = 0;
288 			break;
289 		}
290 
291 		/*
292 		 * If any blocks in the range are still busy, skip the
293 		 * discard and try again the next time.
294 		 */
295 		if (xfs_extent_busy_search(pag_group(pag), fbno, flen)) {
296 			trace_xfs_discard_busy(pag_group(pag), fbno, flen);
297 			goto next_extent;
298 		}
299 
300 		xfs_extent_busy_insert_discard(pag_group(pag), fbno, flen,
301 				&extents->extent_list);
302 next_extent:
303 		if (tcur->by_bno)
304 			error = xfs_btree_increment(cur, 0, &i);
305 		else
306 			error = xfs_btree_decrement(cur, 0, &i);
307 		if (error)
308 			break;
309 
310 		/*
311 		 * If there are no more records in the tree, we are done. Set the
312 		 * cursor block count to 0 to indicate to the caller that there
313 		 * are no more extents to search.
314 		 */
315 		if (i == 0)
316 			tcur->count = 0;
317 	}
318 
319 	/*
320 	 * If there was an error, release all the gathered busy extents because
321 	 * we aren't going to issue a discard on them any more.
322 	 */
323 	if (error)
324 		xfs_extent_busy_clear(&extents->extent_list, false);
325 out_del_cursor:
326 	xfs_btree_del_cursor(cur, error);
327 out_trans_cancel:
328 	xfs_trans_cancel(tp);
329 	return error;
330 }
331 
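/*
 * Abort the trim loops early if a fatal signal is pending or the task is being
 * asked to freeze.
 */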
332 static bool
333 xfs_trim_should_stop(void)
334 {
335 	return fatal_signal_pending(current) || freezing(current);
336 }
337 
338 /*
339  * Iterate the free list gathering extents and discarding them. We need a cursor
340  * for repeated iterations of the gather/discard loop, so use the longest extent
341  * we found in the last batch as the key to start the next.
342  */
343 static int
344 xfs_trim_perag_extents(
345 	struct xfs_perag	*pag,
346 	xfs_agblock_t		start,
347 	xfs_agblock_t		end,
348 	xfs_extlen_t		minlen)
349 {
350 	struct xfs_trim_cur	tcur = {
351 		.start		= start,
352 		.end		= end,
353 		.minlen		= minlen,
354 	};
355 	int			error = 0;
356 
357 	if (start != 0 || end != pag_group(pag)->xg_block_count)
358 		tcur.by_bno = true;
359 
360 	do {
361 		struct xfs_busy_extents	*extents;
362 
363 		extents = kzalloc(sizeof(*extents), GFP_KERNEL);
364 		if (!extents) {
365 			error = -ENOMEM;
366 			break;
367 		}
368 
369 		extents->owner = extents;
370 		INIT_LIST_HEAD(&extents->extent_list);
371 
372 		error = xfs_trim_gather_extents(pag, &tcur, extents);
373 		if (error) {
374 			kfree(extents);
375 			break;
376 		}
377 
378 		/*
379 		 * We hand the extent list to the discard function here so the
380 		 * discarded extents can be removed from the busy extent list.
381 		 * This allows the discards to run asynchronously with gathering
382 		 * the next round of extents to discard.
383 		 *
384 		 * However, we must ensure that we do not reference the extent
385 		 * list after this function call, as it may have been freed by
386 		 * the time control returns to us.
387 		 */
388 		error = xfs_discard_extents(pag_mount(pag), extents);
389 		if (error)
390 			break;
391 
392 		if (xfs_trim_should_stop())
393 			break;
394 
395 	} while (tcur.count != 0);
396 
397 	return error;
398 
399 }
400 
401 static int
402 xfs_trim_datadev_extents(
403 	struct xfs_mount	*mp,
404 	xfs_daddr_t		start,
405 	xfs_daddr_t		end,
406 	xfs_extlen_t		minlen)
407 {
408 	xfs_agnumber_t		start_agno, end_agno;
409 	xfs_agblock_t		start_agbno, end_agbno;
410 	struct xfs_perag	*pag = NULL;
411 	xfs_daddr_t		ddev_end;
412 	int			last_error = 0, error;
413 
414 	ddev_end = min_t(xfs_daddr_t, end,
415 			 XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks) - 1);
416 
417 	start_agno = xfs_daddr_to_agno(mp, start);
418 	start_agbno = xfs_daddr_to_agbno(mp, start);
419 	end_agno = xfs_daddr_to_agno(mp, ddev_end);
420 	end_agbno = xfs_daddr_to_agbno(mp, ddev_end);
421 
422 	while ((pag = xfs_perag_next_range(mp, pag, start_agno, end_agno))) {
423 		xfs_agblock_t	agend = pag_group(pag)->xg_block_count;
424 
425 		if (pag_agno(pag) == end_agno)
426 			agend = end_agbno;
427 		error = xfs_trim_perag_extents(pag, start_agbno, agend, minlen);
428 		if (error)
429 			last_error = error;
430 
431 		if (xfs_trim_should_stop()) {
432 			xfs_perag_rele(pag);
433 			break;
434 		}
435 		start_agbno = 0;
436 	}
437 
438 	return last_error;
439 }
440 
441 #ifdef CONFIG_XFS_RT
442 struct xfs_trim_rtdev {
443 	/* list of rt extents to free */
444 	struct list_head	extent_list;
445 
446 	/* minimum length that caller allows us to trim */
447 	xfs_rtblock_t		minlen_fsb;
448 
449 	/* restart point for the rtbitmap walk */
450 	xfs_rtxnum_t		restart_rtx;
451 
452 	/* stopping point for the current rtbitmap walk */
453 	xfs_rtxnum_t		stop_rtx;
454 };
455 
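/* A free realtime extent queued for discard by the !rtgroups trim path. */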
456 struct xfs_rtx_busy {
457 	struct list_head	list;
458 	xfs_rtblock_t		bno;
459 	xfs_rtblock_t		length;
460 };
461 
462 static void
463 xfs_discard_free_rtdev_extents(
464 	struct xfs_trim_rtdev	*tr)
465 {
466 	struct xfs_rtx_busy	*busyp, *n;
467 
468 	list_for_each_entry_safe(busyp, n, &tr->extent_list, list) {
469 		list_del_init(&busyp->list);
470 		kfree(busyp);
471 	}
472 }
473 
474 /*
475  * Walk the discard list and issue discards on all the busy extents in the
476  * list. We plug and chain the bios so that we only need a single completion
477  * call to clear all the busy extents once the discards are complete.
478  */
479 static int
480 xfs_discard_rtdev_extents(
481 	struct xfs_mount	*mp,
482 	struct xfs_trim_rtdev	*tr)
483 {
484 	struct block_device	*bdev = mp->m_rtdev_targp->bt_bdev;
485 	struct xfs_rtx_busy	*busyp;
486 	struct bio		*bio = NULL;
487 	struct blk_plug		plug;
488 	xfs_rtblock_t		start = NULLRTBLOCK, length = 0;
489 	int			error = 0;
490 
491 	blk_start_plug(&plug);
492 	list_for_each_entry(busyp, &tr->extent_list, list) {
493 		if (start == NULLRTBLOCK)
494 			start = busyp->bno;
495 		length += busyp->length;
496 
497 		trace_xfs_discard_rtextent(mp, busyp->bno, busyp->length);
498 
499 		error = __blkdev_issue_discard(bdev,
500 				xfs_rtb_to_daddr(mp, busyp->bno),
501 				XFS_FSB_TO_BB(mp, busyp->length),
502 				GFP_NOFS, &bio);
503 		if (error)
504 			break;
505 	}
506 	xfs_discard_free_rtdev_extents(tr);
507 
508 	if (bio) {
509 		error = submit_bio_wait(bio);
510 		if (error == -EOPNOTSUPP)
511 			error = 0;
512 		if (error)
513 			xfs_info(mp,
514 	 "discard failed for rtextent [0x%llx,%llu], error %d",
515 				 (unsigned long long)start,
516 				 (unsigned long long)length,
517 				 error);
518 		bio_put(bio);
519 	}
520 	blk_finish_plug(&plug);
521 
522 	return error;
523 }
524 
525 static int
526 xfs_trim_gather_rtextent(
527 	struct xfs_rtgroup		*rtg,
528 	struct xfs_trans		*tp,
529 	const struct xfs_rtalloc_rec	*rec,
530 	void				*priv)
531 {
532 	struct xfs_trim_rtdev		*tr = priv;
533 	struct xfs_rtx_busy		*busyp;
534 	xfs_rtblock_t			rbno, rlen;
535 
536 	if (rec->ar_startext > tr->stop_rtx) {
537 		/*
538 		 * If we've scanned a large number of rtbitmap blocks, update
539 		 * the cursor to point at this extent so we restart the next
540 		 * batch from this extent.
541 		 */
542 		tr->restart_rtx = rec->ar_startext;
543 		return -ECANCELED;
544 	}
545 
546 	rbno = xfs_rtx_to_rtb(rtg, rec->ar_startext);
547 	rlen = xfs_rtbxlen_to_blen(rtg_mount(rtg), rec->ar_extcount);
548 
549 	/* Ignore too small. */
550 	if (rlen < tr->minlen_fsb) {
551 		trace_xfs_discard_rttoosmall(rtg_mount(rtg), rbno, rlen);
552 		return 0;
553 	}
554 
555 	busyp = kzalloc(sizeof(struct xfs_rtx_busy), GFP_KERNEL);
556 	if (!busyp)
557 		return -ENOMEM;
558 
559 	busyp->bno = rbno;
560 	busyp->length = rlen;
561 	INIT_LIST_HEAD(&busyp->list);
562 	list_add_tail(&busyp->list, &tr->extent_list);
563 
564 	tr->restart_rtx = rec->ar_startext + rec->ar_extcount;
565 	return 0;
566 }
567 
568 /* Trim extents on an !rtgroups realtime device */
569 static int
570 xfs_trim_rtextents(
571 	struct xfs_rtgroup	*rtg,
572 	xfs_rtxnum_t		low,
573 	xfs_rtxnum_t		high,
574 	xfs_daddr_t		minlen)
575 {
576 	struct xfs_mount	*mp = rtg_mount(rtg);
577 	struct xfs_trim_rtdev	tr = {
578 		.minlen_fsb	= XFS_BB_TO_FSB(mp, minlen),
579 		.extent_list	= LIST_HEAD_INIT(tr.extent_list),
580 	};
581 	struct xfs_trans	*tp;
582 	int			error;
583 
584 	tp = xfs_trans_alloc_empty(mp);
585 
586 	/*
587 	 * Walk the free ranges between low and high.  The query_range function
588 	 * trims the extents returned.
589 	 */
590 	do {
591 		tr.stop_rtx = low + xfs_rtbitmap_rtx_per_rbmblock(mp);
592 		xfs_rtgroup_lock(rtg, XFS_RTGLOCK_BITMAP_SHARED);
593 		error = xfs_rtalloc_query_range(rtg, tp, low, high,
594 				xfs_trim_gather_rtextent, &tr);
595 
596 		if (error == -ECANCELED)
597 			error = 0;
598 		if (error) {
599 			xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_BITMAP_SHARED);
600 			xfs_discard_free_rtdev_extents(&tr);
601 			break;
602 		}
603 
604 		if (list_empty(&tr.extent_list)) {
605 			xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_BITMAP_SHARED);
606 			break;
607 		}
608 
609 		error = xfs_discard_rtdev_extents(mp, &tr);
610 		xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_BITMAP_SHARED);
611 		if (error)
612 			break;
613 
614 		low = tr.restart_rtx;
615 	} while (!xfs_trim_should_stop() && low <= high);
616 
617 	xfs_trans_cancel(tp);
618 	return error;
619 }
620 
621 struct xfs_trim_rtgroup {
622 	/* list of rtgroup extents to free */
623 	struct xfs_busy_extents	*extents;
624 
625 	/* minimum length that caller allows us to trim */
626 	xfs_rtblock_t		minlen_fsb;
627 
628 	/* restart point for the rtbitmap walk */
629 	xfs_rtxnum_t		restart_rtx;
630 
631 	/* number of extents to examine before stopping to issue discard ios */
632 	int			batch;
633 
634 	/* number of extents queued for discard */
635 	int			queued;
636 };
637 
638 static int
639 xfs_trim_gather_rtgroup_extent(
640 	struct xfs_rtgroup		*rtg,
641 	struct xfs_trans		*tp,
642 	const struct xfs_rtalloc_rec	*rec,
643 	void				*priv)
644 {
645 	struct xfs_trim_rtgroup		*tr = priv;
646 	xfs_rgblock_t			rgbno;
647 	xfs_extlen_t			len;
648 
649 	if (--tr->batch <= 0) {
650 		/*
651 		 * If we've checked a large number of extents, update the
652 		 * cursor to point at this extent so we restart the next batch
653 		 * from this extent.
654 		 */
655 		tr->restart_rtx = rec->ar_startext;
656 		return -ECANCELED;
657 	}
658 
659 	rgbno = xfs_rtx_to_rgbno(rtg, rec->ar_startext);
660 	len = xfs_rtxlen_to_extlen(rtg_mount(rtg), rec->ar_extcount);
661 
662 	/* Ignore too small. */
663 	if (len < tr->minlen_fsb) {
664 		trace_xfs_discard_toosmall(rtg_group(rtg), rgbno, len);
665 		return 0;
666 	}
667 
668 	/*
669 	 * If any blocks in the range are still busy, skip the discard and try
670 	 * again the next time.
671 	 */
672 	if (xfs_extent_busy_search(rtg_group(rtg), rgbno, len)) {
673 		trace_xfs_discard_busy(rtg_group(rtg), rgbno, len);
674 		return 0;
675 	}
676 
677 	xfs_extent_busy_insert_discard(rtg_group(rtg), rgbno, len,
678 			&tr->extents->extent_list);
679 
680 	tr->queued++;
681 	tr->restart_rtx = rec->ar_startext + rec->ar_extcount;
682 	return 0;
683 }
684 
685 /* Trim extents in this rtgroup using the busy extent machinery. */
686 static int
687 xfs_trim_rtgroup_extents(
688 	struct xfs_rtgroup	*rtg,
689 	xfs_rtxnum_t		low,
690 	xfs_rtxnum_t		high,
691 	xfs_daddr_t		minlen)
692 {
693 	struct xfs_mount	*mp = rtg_mount(rtg);
694 	struct xfs_trim_rtgroup	tr = {
695 		.minlen_fsb	= XFS_BB_TO_FSB(mp, minlen),
696 	};
697 	struct xfs_trans	*tp;
698 	int			error;
699 
700 	tp = xfs_trans_alloc_empty(mp);
701 
702 	/*
703 	 * Walk the free ranges between low and high.  The query_range function
704 	 * trims the extents returned.
705 	 */
706 	do {
707 		tr.extents = kzalloc(sizeof(*tr.extents), GFP_KERNEL);
708 		if (!tr.extents) {
709 			error = -ENOMEM;
710 			break;
711 		}
712 
713 		tr.queued = 0;
714 		tr.batch = XFS_DISCARD_MAX_EXAMINE;
715 		tr.extents->owner = tr.extents;
716 		INIT_LIST_HEAD(&tr.extents->extent_list);
717 
718 		xfs_rtgroup_lock(rtg, XFS_RTGLOCK_BITMAP_SHARED);
719 		error = xfs_rtalloc_query_range(rtg, tp, low, high,
720 				xfs_trim_gather_rtgroup_extent, &tr);
721 		xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_BITMAP_SHARED);
722 		if (error == -ECANCELED)
723 			error = 0;
724 		if (error) {
725 			kfree(tr.extents);
726 			break;
727 		}
728 
729 		if (!tr.queued)
730 			break;
731 
732 		/*
733 		 * We hand the extent list to the discard function here so the
734 		 * discarded extents can be removed from the busy extent list.
735 		 * This allows the discards to run asynchronously with
736 		 * gathering the next round of extents to discard.
737 		 *
738 		 * However, we must ensure that we do not reference the extent
739 		 * list after this function call, as it may have been freed by
740 		 * the time control returns to us.
741 		 */
742 		error = xfs_discard_extents(rtg_mount(rtg), tr.extents);
743 		if (error)
744 			break;
745 
746 		low = tr.restart_rtx;
747 	} while (!xfs_trim_should_stop() && low <= high);
748 
749 	xfs_trans_cancel(tp);
750 	return error;
751 }
752 
753 static int
754 xfs_trim_rtdev_extents(
755 	struct xfs_mount	*mp,
756 	xfs_daddr_t		start,
757 	xfs_daddr_t		end,
758 	xfs_daddr_t		minlen)
759 {
760 	xfs_rtblock_t		start_rtbno, end_rtbno;
761 	xfs_rtxnum_t		start_rtx, end_rtx;
762 	xfs_rgnumber_t		start_rgno, end_rgno;
763 	xfs_daddr_t		daddr_offset;
764 	int			last_error = 0, error;
765 	struct xfs_rtgroup	*rtg = NULL;
766 
767 	/* Shift the start and end downwards to match the rt device. */
768 	daddr_offset = XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks);
769 	if (start > daddr_offset)
770 		start -= daddr_offset;
771 	else
772 		start = 0;
773 	start_rtbno = xfs_daddr_to_rtb(mp, start);
774 	start_rtx = xfs_rtb_to_rtx(mp, start_rtbno);
775 	start_rgno = xfs_rtb_to_rgno(mp, start_rtbno);
776 
777 	if (end <= daddr_offset)
778 		return 0;
779 	else
780 		end -= daddr_offset;
781 	end_rtbno = xfs_daddr_to_rtb(mp, end);
782 	end_rtx = xfs_rtb_to_rtx(mp, end_rtbno + mp->m_sb.sb_rextsize - 1);
783 	end_rgno = xfs_rtb_to_rgno(mp, end_rtbno);
784 
785 	while ((rtg = xfs_rtgroup_next_range(mp, rtg, start_rgno, end_rgno))) {
786 		xfs_rtxnum_t	rtg_end = rtg->rtg_extents;
787 
788 		if (rtg_rgno(rtg) == end_rgno)
789 			rtg_end = min(rtg_end, end_rtx);
790 
791 		if (xfs_has_rtgroups(mp))
792 			error = xfs_trim_rtgroup_extents(rtg, start_rtx,
793 					rtg_end, minlen);
794 		else
795 			error = xfs_trim_rtextents(rtg, start_rtx, rtg_end,
796 					minlen);
797 		if (error)
798 			last_error = error;
799 
800 		if (xfs_trim_should_stop()) {
801 			xfs_rtgroup_rele(rtg);
802 			break;
803 		}
804 		start_rtx = 0;
805 	}
806 
807 	return last_error;
808 }
809 #else
810 # define xfs_trim_rtdev_extents(...)	(-EOPNOTSUPP)
811 #endif /* CONFIG_XFS_RT */
812 
813 /*
814  * trim a range of the filesystem.
815  *
816  * Note: the parameters passed from userspace are byte ranges into the
817  * filesystem, which do not match the format we use for filesystem block
818  * addressing. FSB addressing is sparse (AGNO|AGBNO), while the incoming format
819  * is a linear address range. Hence we need to use DADDR based conversions and
820  * comparisons for determining the correct offset and regions to trim.
821  *
822  * The realtime device is mapped into the FITRIM "address space" immediately
823  * after the data device.
824  */
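/*
 * For example (sizes purely illustrative): with 4096 byte filesystem blocks
 * and a 1000 block data device, byte offsets 0 to 4095999 of the FITRIM range
 * land on the data device, while offsets from 4096000 onwards land on the
 * realtime device and are shifted down by XFS_FSB_TO_BB(mp, sb_dblocks)
 * sectors in xfs_trim_rtdev_extents().
 */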
825 int
826 xfs_ioc_trim(
827 	struct xfs_mount		*mp,
828 	struct fstrim_range __user	*urange)
829 {
830 	unsigned int		granularity =
831 		bdev_discard_granularity(mp->m_ddev_targp->bt_bdev);
832 	struct block_device	*rt_bdev = NULL;
833 	struct fstrim_range	range;
834 	xfs_daddr_t		start, end;
835 	xfs_extlen_t		minlen;
836 	xfs_rfsblock_t		max_blocks;
837 	int			error, last_error = 0;
838 
839 	if (!capable(CAP_SYS_ADMIN))
840 		return -EPERM;
841 
842 	if (mp->m_rtdev_targp && !xfs_has_zoned(mp) &&
843 	    bdev_max_discard_sectors(mp->m_rtdev_targp->bt_bdev))
844 		rt_bdev = mp->m_rtdev_targp->bt_bdev;
845 	if (!bdev_max_discard_sectors(mp->m_ddev_targp->bt_bdev) && !rt_bdev)
846 		return -EOPNOTSUPP;
847 
848 	if (rt_bdev)
849 		granularity = max(granularity,
850 				  bdev_discard_granularity(rt_bdev));
851 
852 	/*
853 	 * We haven't recovered the log, so we cannot use our bnobt-guided
854 	 * storage zapping commands.
855 	 */
856 	if (xfs_has_norecovery(mp))
857 		return -EROFS;
858 
859 	if (copy_from_user(&range, urange, sizeof(range)))
860 		return -EFAULT;
861 
862 	range.minlen = max_t(u64, granularity, range.minlen);
863 	minlen = XFS_B_TO_FSB(mp, range.minlen);
864 
865 	/*
866 	 * Truncating down the len isn't actually quite correct, but using
867 	 * BBTOB would mean we trivially get overflows for values
868 	 * of ULLONG_MAX or slightly lower.  And ULLONG_MAX is the default
869 	 * used by the fstrim application.  In the end it really doesn't
870 	 * matter as trimming blocks is an advisory interface.
871 	 */
872 	max_blocks = mp->m_sb.sb_dblocks + mp->m_sb.sb_rblocks;
873 	if (range.start >= XFS_FSB_TO_B(mp, max_blocks) ||
874 	    range.minlen > XFS_FSB_TO_B(mp, mp->m_ag_max_usable) ||
875 	    range.len < mp->m_sb.sb_blocksize)
876 		return -EINVAL;
877 
878 	start = BTOBB(range.start);
879 	end = start + BTOBBT(range.len) - 1;
880 
881 	if (bdev_max_discard_sectors(mp->m_ddev_targp->bt_bdev)) {
882 		error = xfs_trim_datadev_extents(mp, start, end, minlen);
883 		if (error)
884 			last_error = error;
885 	}
886 
887 	if (rt_bdev && !xfs_trim_should_stop()) {
888 		error = xfs_trim_rtdev_extents(mp, start, end, minlen);
889 		if (error)
890 			last_error = error;
891 	}
892 
893 	if (last_error)
894 		return last_error;
895 
896 	range.len = min_t(unsigned long long, range.len,
897 			  XFS_FSB_TO_B(mp, max_blocks) - range.start);
898 	if (copy_to_user(urange, &range, sizeof(range)))
899 		return -EFAULT;
900 	return 0;
901 }
902