// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2010, 2023 Red Hat, Inc.
 * All Rights Reserved.
 */
#include "xfs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_trans.h"
#include "xfs_mount.h"
#include "xfs_btree.h"
#include "xfs_alloc_btree.h"
#include "xfs_alloc.h"
#include "xfs_discard.h"
#include "xfs_error.h"
#include "xfs_extent_busy.h"
#include "xfs_trace.h"
#include "xfs_log.h"
#include "xfs_ag.h"
#include "xfs_health.h"
#include "xfs_rtbitmap.h"
#include "xfs_rtgroup.h"

/*
 * Notes on an efficient, low latency fstrim algorithm
 *
 * We need to walk the filesystem free space and issue discards on the free
 * space that meets the search criteria (size and location). We cannot issue
 * discards on extents that might be in use, or are so recently in use they are
 * still marked as busy. To serialise against extent state changes whilst we
 * are gathering extents to trim, we must hold the AGF lock to lock out other
 * allocations and extent free operations that might change extent state.
 *
 * However, we cannot just hold the AGF for the entire AG free space walk whilst
 * we issue discards on each free space that is found. Storage devices can have
 * extremely slow discard implementations (e.g. ceph RBD) and so walking a
 * couple of million free extents and issuing synchronous discards on each
 * extent can take a *long* time. Whilst we are doing this walk, nothing else
 * can access the AGF, and we can stall transactions and hence the log whilst
 * modifications wait for the AGF lock to be released. This can lead to hung
 * tasks kicking the hung task timer and rebooting the system. This is bad.
 *
 * Hence we need to take a leaf from the bulkstat playbook. It takes the AGI
 * lock, gathers a range of inode cluster buffers that are allocated, drops the
 * AGI lock and then reads all the inode cluster buffers and processes them. It
 * loops doing this, using a cursor to keep track of where it is up to in the AG
 * for each iteration to restart the INOBT lookup from.
 *
 * We can't do this exactly with free space - once we drop the AGF lock, the
 * state of the free extent is out of our control and we cannot run a discard
 * safely on it in this situation. Unless, of course, we've marked the free
 * extent as busy and undergoing a discard operation whilst we held the AGF
 * locked.
 *
 * This is exactly how online discard works - free extents are marked busy when
 * they are freed, and once the extent free has been committed to the journal,
 * the busy extent record is marked as "undergoing discard" and the discard is
 * then issued on the free extent. Once the discard completes, the busy extent
 * record is removed and the extent is able to be allocated again.
 *
 * In the context of fstrim, if we find a free extent we need to discard, we
 * don't have to discard it immediately. All we need to do is record that free
 * extent as being busy and under discard, and all the allocation routines will
 * now avoid trying to allocate it. Hence if we mark the extent as busy under
 * the AGF lock, we can safely discard it without holding the AGF lock because
 * nothing will attempt to allocate that free space until the discard completes.
 *
 * This also allows us to issue discards asynchronously like we do with online
 * discard, and so for fast devices fstrim will run much faster as we can have
 * multiple discard operations in flight at once, as well as pipeline the free
 * extent search so that it overlaps with in-flight discard IO.
 */

#define XFS_DISCARD_MAX_EXAMINE	(100)

struct workqueue_struct *xfs_discard_wq;

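/*
 * Discard completion work: clear the busy extents covered by the completed
 * discard bios and free the extent list owner.
 */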
static void
xfs_discard_endio_work(
	struct work_struct	*work)
{
	struct xfs_busy_extents	*extents =
		container_of(work, struct xfs_busy_extents, endio_work);

	xfs_extent_busy_clear(&extents->extent_list, false);
	kfree(extents->owner);
}

/*
 * Queue up the actual completion to a thread to avoid IRQ-safe locking for
 * eb_lock.
 */
static void
xfs_discard_endio(
	struct bio		*bio)
{
	struct xfs_busy_extents	*extents = bio->bi_private;

	INIT_WORK(&extents->endio_work, xfs_discard_endio_work);
	queue_work(xfs_discard_wq, &extents->endio_work);
	bio_put(bio);
}

/*
 * Walk the discard list and issue discards on all the busy extents in the
 * list. We plug and chain the bios so that we only need a single completion
 * call to clear all the busy extents once the discards are complete.
 */
void
xfs_discard_extents(
	struct xfs_mount	*mp,
	struct xfs_busy_extents	*extents)
{
	struct xfs_extent_busy	*busyp;
	struct bio		*bio = NULL;
	struct blk_plug		plug;

	blk_start_plug(&plug);
	list_for_each_entry(busyp, &extents->extent_list, list) {
		struct xfs_group	*xg = busyp->group;
		struct xfs_buftarg	*btp =
			xfs_group_type_buftarg(xg->xg_mount, xg->xg_type);

		trace_xfs_discard_extent(xg, busyp->bno, busyp->length);

		__blkdev_issue_discard(btp->bt_bdev,
				xfs_gbno_to_daddr(xg, busyp->bno),
				XFS_FSB_TO_BB(mp, busyp->length),
				GFP_KERNEL, &bio);
	}

	if (bio) {
		bio->bi_private = extents;
		bio->bi_end_io = xfs_discard_endio;
		submit_bio(bio);
	} else {
		xfs_discard_endio_work(&extents->endio_work);
	}
	blk_finish_plug(&plug);
}

/*
 * Care must be taken setting up the trim cursor as the perags may not have
 * been initialised when the cursor is initialised. e.g. a clean mount which
 * hasn't read in AGFs and the first operation run on the mounted fs is a trim.
 * This can result in perag fields that aren't initialised until
 * xfs_trim_gather_extents() calls xfs_alloc_read_agf() to lock down the AG for
 * the free space search.
 */
struct xfs_trim_cur {
	xfs_agblock_t	start;
	xfs_extlen_t	count;
	xfs_agblock_t	end;
	xfs_extlen_t	minlen;
	bool		by_bno;
};

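/*
 * Gather up to XFS_DISCARD_MAX_EXAMINE free extents from this AG under the
 * AGF lock and mark each one busy and under discard. The trim cursor is
 * updated so the next call resumes where this batch stopped; tcur->count is
 * set to zero once there is nothing left to examine.
 */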
static int
xfs_trim_gather_extents(
	struct xfs_perag	*pag,
	struct xfs_trim_cur	*tcur,
	struct xfs_busy_extents	*extents)
{
	struct xfs_mount	*mp = pag_mount(pag);
	struct xfs_trans	*tp;
	struct xfs_btree_cur	*cur;
	struct xfs_buf		*agbp;
	int			error;
	int			i;
	int			batch = XFS_DISCARD_MAX_EXAMINE;

	/*
	 * Force out the log. This means any transactions that might have freed
	 * space before we take the AGF buffer lock are now on disk, and the
	 * volatile disk cache is flushed.
	 */
	xfs_log_force(mp, XFS_LOG_SYNC);

	tp = xfs_trans_alloc_empty(mp);

	error = xfs_alloc_read_agf(pag, tp, 0, &agbp);
	if (error)
		goto out_trans_cancel;

	/*
	 * First time through tcur->count will not have been initialised as
	 * pag->pagf_longest is not guaranteed to be valid before we read
	 * the AGF buffer above.
	 */
	if (!tcur->count)
		tcur->count = pag->pagf_longest;

	if (tcur->by_bno) {
		/* sub-AG discard request always starts at tcur->start */
		cur = xfs_bnobt_init_cursor(mp, tp, agbp, pag);
		error = xfs_alloc_lookup_le(cur, tcur->start, 0, &i);
		if (!error && !i)
			error = xfs_alloc_lookup_ge(cur, tcur->start, 0, &i);
	} else if (tcur->start == 0) {
		/* first time through a by-len starts with max length */
		cur = xfs_cntbt_init_cursor(mp, tp, agbp, pag);
		error = xfs_alloc_lookup_ge(cur, 0, tcur->count, &i);
	} else {
		/* nth time through a by-len starts where we left off */
		cur = xfs_cntbt_init_cursor(mp, tp, agbp, pag);
		error = xfs_alloc_lookup_le(cur, tcur->start, tcur->count, &i);
	}
	if (error)
		goto out_del_cursor;
	if (i == 0) {
		/* nothing of that length left in the AG, we are done */
		tcur->count = 0;
		goto out_del_cursor;
	}

	/*
	 * Loop until we are done with all extents that are large
	 * enough to be worth discarding or we hit batch limits.
	 */
	while (i) {
		xfs_agblock_t	fbno;
		xfs_extlen_t	flen;

		error = xfs_alloc_get_rec(cur, &fbno, &flen, &i);
		if (error)
			break;
		if (XFS_IS_CORRUPT(mp, i != 1)) {
			xfs_btree_mark_sick(cur);
			error = -EFSCORRUPTED;
			break;
		}

		if (--batch <= 0) {
			/*
			 * Update the cursor to point at this extent so we
			 * restart the next batch from this extent.
			 */
			tcur->start = fbno;
			tcur->count = flen;
			break;
		}

		/*
		 * If the extent is entirely outside of the range we are
		 * supposed to trim, skip it. Do not bother to trim down
		 * partially overlapping ranges for now.
		 */
		if (fbno + flen < tcur->start) {
			trace_xfs_discard_exclude(pag_group(pag), fbno, flen);
			goto next_extent;
		}
		if (fbno > tcur->end) {
			trace_xfs_discard_exclude(pag_group(pag), fbno, flen);
			if (tcur->by_bno) {
				tcur->count = 0;
				break;
			}
			goto next_extent;
		}

		/* Trim the extent returned to the range we want. */
		if (fbno < tcur->start) {
			flen -= tcur->start - fbno;
			fbno = tcur->start;
		}
		if (fbno + flen > tcur->end + 1)
			flen = tcur->end - fbno + 1;

		/* Too small? Give up. */
		if (flen < tcur->minlen) {
			trace_xfs_discard_toosmall(pag_group(pag), fbno, flen);
			if (tcur->by_bno)
				goto next_extent;
			tcur->count = 0;
			break;
		}

		/*
		 * If any blocks in the range are still busy, skip the
		 * discard and try again the next time.
		 */
		if (xfs_extent_busy_search(pag_group(pag), fbno, flen)) {
			trace_xfs_discard_busy(pag_group(pag), fbno, flen);
			goto next_extent;
		}

		xfs_extent_busy_insert_discard(pag_group(pag), fbno, flen,
				&extents->extent_list);
next_extent:
		if (tcur->by_bno)
			error = xfs_btree_increment(cur, 0, &i);
		else
			error = xfs_btree_decrement(cur, 0, &i);
		if (error)
			break;

		/*
		 * If there are no more records in the tree, we are done. Set
		 * the cursor block count to 0 to indicate to the caller that
		 * there are no more extents to search.
		 */
		if (i == 0)
			tcur->count = 0;
	}

	/*
	 * If there was an error, release all the gathered busy extents because
	 * we aren't going to issue a discard on them any more.
	 */
	if (error)
		xfs_extent_busy_clear(&extents->extent_list, false);
out_del_cursor:
	xfs_btree_del_cursor(cur, error);
out_trans_cancel:
	xfs_trans_cancel(tp);
	return error;
}

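/* Abort the trim walk if we got a fatal signal or the fs is being frozen. */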
static bool
xfs_trim_should_stop(void)
{
	return fatal_signal_pending(current) || freezing(current);
}

/*
 * Iterate the free list gathering extents and discarding them. We need a
 * cursor for the repeated iteration of the gather/discard loop, so use the
 * longest extent we found in the last batch as the key to start the next.
 */
static int
xfs_trim_perag_extents(
	struct xfs_perag	*pag,
	xfs_agblock_t		start,
	xfs_agblock_t		end,
	xfs_extlen_t		minlen)
{
	struct xfs_trim_cur	tcur = {
		.start		= start,
		.end		= end,
		.minlen		= minlen,
	};
	int			error = 0;

	if (start != 0 || end != pag_group(pag)->xg_block_count)
		tcur.by_bno = true;

	do {
		struct xfs_busy_extents	*extents;

		extents = kzalloc(sizeof(*extents), GFP_KERNEL);
		if (!extents) {
			error = -ENOMEM;
			break;
		}

		extents->owner = extents;
		INIT_LIST_HEAD(&extents->extent_list);

		error = xfs_trim_gather_extents(pag, &tcur, extents);
		if (error) {
			kfree(extents);
			break;
		}

		/*
		 * We hand the extent list to the discard function here so the
		 * discarded extents can be removed from the busy extent list.
		 * This allows the discards to run asynchronously with gathering
		 * the next round of extents to discard.
		 *
		 * However, we must ensure that we do not reference the extent
		 * list after this function call, as it may have been freed by
		 * the time control returns to us.
		 */
		xfs_discard_extents(pag_mount(pag), extents);

		if (xfs_trim_should_stop())
			break;

	} while (tcur.count != 0);

	return error;
}

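/*
 * Trim the free space on the data device within the given daddr range by
 * walking every AG that the range overlaps.
 */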
static int
xfs_trim_datadev_extents(
	struct xfs_mount	*mp,
	xfs_daddr_t		start,
	xfs_daddr_t		end,
	xfs_extlen_t		minlen)
{
	xfs_agnumber_t		start_agno, end_agno;
	xfs_agblock_t		start_agbno, end_agbno;
	struct xfs_perag	*pag = NULL;
	xfs_daddr_t		ddev_end;
	int			last_error = 0, error;

	ddev_end = min_t(xfs_daddr_t, end,
			 XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks) - 1);

	start_agno = xfs_daddr_to_agno(mp, start);
	start_agbno = xfs_daddr_to_agbno(mp, start);
	end_agno = xfs_daddr_to_agno(mp, ddev_end);
	end_agbno = xfs_daddr_to_agbno(mp, ddev_end);

	while ((pag = xfs_perag_next_range(mp, pag, start_agno, end_agno))) {
		xfs_agblock_t	agend = pag_group(pag)->xg_block_count;

		if (pag_agno(pag) == end_agno)
			agend = end_agbno;
		error = xfs_trim_perag_extents(pag, start_agbno, agend, minlen);
		if (error)
			last_error = error;

		if (xfs_trim_should_stop()) {
			xfs_perag_rele(pag);
			break;
		}
		start_agbno = 0;
	}

	return last_error;
}

#ifdef CONFIG_XFS_RT
struct xfs_trim_rtdev {
	/* list of rt extents to free */
	struct list_head	extent_list;

	/* minimum length that caller allows us to trim */
	xfs_rtblock_t		minlen_fsb;

	/* restart point for the rtbitmap walk */
	xfs_rtxnum_t		restart_rtx;

	/* stopping point for the current rtbitmap walk */
	xfs_rtxnum_t		stop_rtx;
};

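/* A free realtime extent that has been queued for discard. */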
struct xfs_rtx_busy {
	struct list_head	list;
	xfs_rtblock_t		bno;
	xfs_rtblock_t		length;
};

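/* Free the gathered rt extent records without issuing any discards. */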
static void
xfs_discard_free_rtdev_extents(
	struct xfs_trim_rtdev	*tr)
{
	struct xfs_rtx_busy	*busyp, *n;

	list_for_each_entry_safe(busyp, n, &tr->extent_list, list) {
		list_del_init(&busyp->list);
		kfree(busyp);
	}
}

/*
 * Walk the discard list and issue discards on all the busy extents in the
 * list. We plug and chain the bios so that we only need a single completion
 * call to clear all the busy extents once the discards are complete.
 */
static int
xfs_discard_rtdev_extents(
	struct xfs_mount	*mp,
	struct xfs_trim_rtdev	*tr)
{
	struct block_device	*bdev = mp->m_rtdev_targp->bt_bdev;
	struct xfs_rtx_busy	*busyp;
	struct bio		*bio = NULL;
	struct blk_plug		plug;
	xfs_rtblock_t		start = NULLRTBLOCK, length = 0;
	int			error = 0;

	blk_start_plug(&plug);
	list_for_each_entry(busyp, &tr->extent_list, list) {
		if (start == NULLRTBLOCK)
			start = busyp->bno;
		length += busyp->length;

		trace_xfs_discard_rtextent(mp, busyp->bno, busyp->length);

		__blkdev_issue_discard(bdev,
				xfs_rtb_to_daddr(mp, busyp->bno),
				XFS_FSB_TO_BB(mp, busyp->length),
				GFP_NOFS, &bio);
	}
	xfs_discard_free_rtdev_extents(tr);

	if (bio) {
		error = submit_bio_wait(bio);
		if (error == -EOPNOTSUPP)
			error = 0;
		if (error)
			xfs_info(mp,
				"discard failed for rtextent [0x%llx,%llu], error %d",
				(unsigned long long)start,
				(unsigned long long)length,
				error);
		bio_put(bio);
	}
	blk_finish_plug(&plug);

	return error;
}

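/*
 * xfs_rtalloc_query_range() callback: queue a free rt extent for discard
 * unless it is too small, and stop the walk (-ECANCELED) once we pass the
 * current stopping point so the rtbitmap lock is not held for too long.
 */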
static int
xfs_trim_gather_rtextent(
	struct xfs_rtgroup		*rtg,
	struct xfs_trans		*tp,
	const struct xfs_rtalloc_rec	*rec,
	void				*priv)
{
	struct xfs_trim_rtdev		*tr = priv;
	struct xfs_rtx_busy		*busyp;
	xfs_rtblock_t			rbno, rlen;

	if (rec->ar_startext > tr->stop_rtx) {
		/*
		 * If we've scanned a large number of rtbitmap blocks, update
		 * the cursor to point at this extent so we restart the next
		 * batch from this extent.
		 */
		tr->restart_rtx = rec->ar_startext;
		return -ECANCELED;
	}

	rbno = xfs_rtx_to_rtb(rtg, rec->ar_startext);
	rlen = xfs_rtbxlen_to_blen(rtg_mount(rtg), rec->ar_extcount);

	/* Ignore too small. */
	if (rlen < tr->minlen_fsb) {
		trace_xfs_discard_rttoosmall(rtg_mount(rtg), rbno, rlen);
		return 0;
	}

	busyp = kzalloc(sizeof(struct xfs_rtx_busy), GFP_KERNEL);
	if (!busyp)
		return -ENOMEM;

	busyp->bno = rbno;
	busyp->length = rlen;
	INIT_LIST_HEAD(&busyp->list);
	list_add_tail(&busyp->list, &tr->extent_list);

	tr->restart_rtx = rec->ar_startext + rec->ar_extcount;
	return 0;
}

/* Trim extents on an !rtgroups realtime device */
static int
xfs_trim_rtextents(
	struct xfs_rtgroup	*rtg,
	xfs_rtxnum_t		low,
	xfs_rtxnum_t		high,
	xfs_daddr_t		minlen)
{
	struct xfs_mount	*mp = rtg_mount(rtg);
	struct xfs_trim_rtdev	tr = {
		.minlen_fsb	= XFS_BB_TO_FSB(mp, minlen),
		.extent_list	= LIST_HEAD_INIT(tr.extent_list),
	};
	struct xfs_trans	*tp;
	int			error;

	tp = xfs_trans_alloc_empty(mp);

	/*
	 * Walk the free ranges between low and high. The query_range function
	 * trims the extents returned.
	 */
	do {
		tr.stop_rtx = low + xfs_rtbitmap_rtx_per_rbmblock(mp);
		xfs_rtgroup_lock(rtg, XFS_RTGLOCK_BITMAP_SHARED);
		error = xfs_rtalloc_query_range(rtg, tp, low, high,
				xfs_trim_gather_rtextent, &tr);

		if (error == -ECANCELED)
			error = 0;
		if (error) {
			xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_BITMAP_SHARED);
			xfs_discard_free_rtdev_extents(&tr);
			break;
		}

		if (list_empty(&tr.extent_list)) {
			xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_BITMAP_SHARED);
			break;
		}

		error = xfs_discard_rtdev_extents(mp, &tr);
		xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_BITMAP_SHARED);
		if (error)
			break;

		low = tr.restart_rtx;
	} while (!xfs_trim_should_stop() && low <= high);

	xfs_trans_cancel(tp);
	return error;
}

struct xfs_trim_rtgroup {
	/* list of rtgroup extents to free */
	struct xfs_busy_extents	*extents;

	/* minimum length that caller allows us to trim */
	xfs_rtblock_t		minlen_fsb;

	/* restart point for the rtbitmap walk */
	xfs_rtxnum_t		restart_rtx;

	/* number of extents to examine before stopping to issue discard ios */
	int			batch;

	/* number of extents queued for discard */
	int			queued;
};

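/*
 * xfs_rtalloc_query_range() callback for rtgroups: mark a free rt extent
 * busy and under discard unless it is too small or still busy, and cancel
 * the walk once a batch of extents has been examined.
 */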
static int
xfs_trim_gather_rtgroup_extent(
	struct xfs_rtgroup		*rtg,
	struct xfs_trans		*tp,
	const struct xfs_rtalloc_rec	*rec,
	void				*priv)
{
	struct xfs_trim_rtgroup	*tr = priv;
	xfs_rgblock_t		rgbno;
	xfs_extlen_t		len;

	if (--tr->batch <= 0) {
		/*
		 * If we've checked a large number of extents, update the
		 * cursor to point at this extent so we restart the next batch
		 * from this extent.
		 */
		tr->restart_rtx = rec->ar_startext;
		return -ECANCELED;
	}

	rgbno = xfs_rtx_to_rgbno(rtg, rec->ar_startext);
	len = xfs_rtxlen_to_extlen(rtg_mount(rtg), rec->ar_extcount);

	/* Ignore too small. */
	if (len < tr->minlen_fsb) {
		trace_xfs_discard_toosmall(rtg_group(rtg), rgbno, len);
		return 0;
	}

	/*
	 * If any blocks in the range are still busy, skip the discard and try
	 * again the next time.
	 */
	if (xfs_extent_busy_search(rtg_group(rtg), rgbno, len)) {
		trace_xfs_discard_busy(rtg_group(rtg), rgbno, len);
		return 0;
	}

	xfs_extent_busy_insert_discard(rtg_group(rtg), rgbno, len,
			&tr->extents->extent_list);

	tr->queued++;
	tr->restart_rtx = rec->ar_startext + rec->ar_extcount;
	return 0;
}

/* Trim extents in this rtgroup using the busy extent machinery. */
static int
xfs_trim_rtgroup_extents(
	struct xfs_rtgroup	*rtg,
	xfs_rtxnum_t		low,
	xfs_rtxnum_t		high,
	xfs_daddr_t		minlen)
{
	struct xfs_mount	*mp = rtg_mount(rtg);
	struct xfs_trim_rtgroup	tr = {
		.minlen_fsb	= XFS_BB_TO_FSB(mp, minlen),
	};
	struct xfs_trans	*tp;
	int			error;

	tp = xfs_trans_alloc_empty(mp);

	/*
	 * Walk the free ranges between low and high. The query_range function
	 * trims the extents returned.
	 */
	do {
		tr.extents = kzalloc(sizeof(*tr.extents), GFP_KERNEL);
		if (!tr.extents) {
			error = -ENOMEM;
			break;
		}

		tr.queued = 0;
		tr.batch = XFS_DISCARD_MAX_EXAMINE;
		tr.extents->owner = tr.extents;
		INIT_LIST_HEAD(&tr.extents->extent_list);

		xfs_rtgroup_lock(rtg, XFS_RTGLOCK_BITMAP_SHARED);
		error = xfs_rtalloc_query_range(rtg, tp, low, high,
				xfs_trim_gather_rtgroup_extent, &tr);
		xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_BITMAP_SHARED);
		if (error == -ECANCELED)
			error = 0;
		if (error) {
			kfree(tr.extents);
			break;
		}

		if (!tr.queued) {
			kfree(tr.extents);
			break;
		}

		/*
		 * We hand the extent list to the discard function here so the
		 * discarded extents can be removed from the busy extent list.
		 * This allows the discards to run asynchronously with
		 * gathering the next round of extents to discard.
		 *
		 * However, we must ensure that we do not reference the extent
		 * list after this function call, as it may have been freed by
		 * the time control returns to us.
		 */
		xfs_discard_extents(rtg_mount(rtg), tr.extents);

		low = tr.restart_rtx;
	} while (!xfs_trim_should_stop() && low <= high);

	xfs_trans_cancel(tp);
	return error;
}

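/*
 * Trim the free space on the realtime device. The FITRIM range maps the rt
 * device immediately after the data device, so shift the daddrs down before
 * converting them to rt extents, then walk every rtgroup in the range.
 */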
static int
xfs_trim_rtdev_extents(
	struct xfs_mount	*mp,
	xfs_daddr_t		start,
	xfs_daddr_t		end,
	xfs_daddr_t		minlen)
{
	xfs_rtblock_t		start_rtbno, end_rtbno;
	xfs_rtxnum_t		start_rtx, end_rtx;
	xfs_rgnumber_t		start_rgno, end_rgno;
	xfs_daddr_t		daddr_offset;
	int			last_error = 0, error;
	struct xfs_rtgroup	*rtg = NULL;

	/* Shift the start and end downwards to match the rt device. */
	daddr_offset = XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks);
	if (start > daddr_offset)
		start -= daddr_offset;
	else
		start = 0;
	start_rtbno = xfs_daddr_to_rtb(mp, start);
	start_rtx = xfs_rtb_to_rtx(mp, start_rtbno);
	start_rgno = xfs_rtb_to_rgno(mp, start_rtbno);

	if (end <= daddr_offset)
		return 0;
	else
		end -= daddr_offset;
	end_rtbno = xfs_daddr_to_rtb(mp, end);
	end_rtx = xfs_rtb_to_rtx(mp, end_rtbno + mp->m_sb.sb_rextsize - 1);
	end_rgno = xfs_rtb_to_rgno(mp, end_rtbno);

	while ((rtg = xfs_rtgroup_next_range(mp, rtg, start_rgno, end_rgno))) {
		xfs_rtxnum_t	rtg_end = rtg->rtg_extents;

		if (rtg_rgno(rtg) == end_rgno)
			rtg_end = min(rtg_end, end_rtx);

		if (xfs_has_rtgroups(mp))
			error = xfs_trim_rtgroup_extents(rtg, start_rtx,
					rtg_end, minlen);
		else
			error = xfs_trim_rtextents(rtg, start_rtx, rtg_end,
					minlen);
		if (error)
			last_error = error;

		if (xfs_trim_should_stop()) {
			xfs_rtgroup_rele(rtg);
			break;
		}
		start_rtx = 0;
	}

	return last_error;
}
#else
# define xfs_trim_rtdev_extents(...)	(-EOPNOTSUPP)
#endif /* CONFIG_XFS_RT */

/*
 * Trim a range of the filesystem.
 *
 * Note: the parameters passed from userspace are byte ranges into the
 * filesystem, which does not match the format we use for filesystem block
 * addressing. FSB addressing is sparse (AGNO|AGBNO), while the incoming format
 * is a linear address range. Hence we need to use DADDR based conversions and
 * comparisons for determining the correct offset and regions to trim.
 *
 * The realtime device is mapped into the FITRIM "address space" immediately
 * after the data device.
 */
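/*
 * Illustrative sketch of how userspace typically drives this interface; the
 * variable names and error handling below are examples, not part of XFS.
 * FITRIM takes a byte-based struct fstrim_range, and on success the kernel
 * writes range.len back clamped to the end of the filesystem:
 *
 *	struct fstrim_range r = {
 *		.start	= 0,
 *		.len	= ULLONG_MAX,	// the default used by fstrim(8)
 *		.minlen	= 0,
 *	};
 *
 *	if (ioctl(fd, FITRIM, &r) == 0)
 *		printf("searched within %llu bytes\n",
 *				(unsigned long long)r.len);
 */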
int
xfs_ioc_trim(
	struct xfs_mount		*mp,
	struct fstrim_range __user	*urange)
{
	unsigned int		granularity =
		bdev_discard_granularity(mp->m_ddev_targp->bt_bdev);
	struct block_device	*rt_bdev = NULL;
	struct fstrim_range	range;
	xfs_daddr_t		start, end;
	xfs_extlen_t		minlen;
	xfs_rfsblock_t		max_blocks;
	int			error, last_error = 0;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	if (mp->m_rtdev_targp && !xfs_has_zoned(mp) &&
	    bdev_max_discard_sectors(mp->m_rtdev_targp->bt_bdev))
		rt_bdev = mp->m_rtdev_targp->bt_bdev;
	if (!bdev_max_discard_sectors(mp->m_ddev_targp->bt_bdev) && !rt_bdev)
		return -EOPNOTSUPP;

	if (rt_bdev)
		granularity = max(granularity,
				  bdev_discard_granularity(rt_bdev));

	/*
	 * We haven't recovered the log, so we cannot use our bnobt-guided
	 * storage zapping commands.
	 */
	if (xfs_has_norecovery(mp))
		return -EROFS;

	if (copy_from_user(&range, urange, sizeof(range)))
		return -EFAULT;

	range.minlen = max_t(u64, granularity, range.minlen);
	minlen = XFS_B_TO_FSB(mp, range.minlen);

	/*
	 * Truncating down the len isn't actually quite correct, but using
	 * BBTOB would mean we trivially get overflows for values
	 * of ULLONG_MAX or slightly lower. And ULLONG_MAX is the default
	 * used by the fstrim application. In the end it really doesn't
	 * matter as trimming blocks is an advisory interface.
	 */
	max_blocks = mp->m_sb.sb_dblocks + mp->m_sb.sb_rblocks;
	if (range.start >= XFS_FSB_TO_B(mp, max_blocks) ||
	    range.minlen > XFS_FSB_TO_B(mp, mp->m_ag_max_usable) ||
	    range.len < mp->m_sb.sb_blocksize)
		return -EINVAL;

	start = BTOBB(range.start);
	end = start + BTOBBT(range.len) - 1;

	if (bdev_max_discard_sectors(mp->m_ddev_targp->bt_bdev)) {
		error = xfs_trim_datadev_extents(mp, start, end, minlen);
		if (error)
			last_error = error;
	}

	if (rt_bdev && !xfs_trim_should_stop()) {
		error = xfs_trim_rtdev_extents(mp, start, end, minlen);
		if (error)
			last_error = error;
	}

	if (last_error)
		return last_error;

	range.len = min_t(unsigned long long, range.len,
			  XFS_FSB_TO_B(mp, max_blocks) - range.start);
	if (copy_to_user(urange, &range, sizeof(range)))
		return -EFAULT;
	return 0;
}