// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2010, 2023 Red Hat, Inc.
 * All Rights Reserved.
 */
#include "xfs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_trans.h"
#include "xfs_mount.h"
#include "xfs_btree.h"
#include "xfs_alloc_btree.h"
#include "xfs_alloc.h"
#include "xfs_discard.h"
#include "xfs_error.h"
#include "xfs_extent_busy.h"
#include "xfs_trace.h"
#include "xfs_log.h"
#include "xfs_ag.h"
#include "xfs_health.h"
#include "xfs_rtbitmap.h"
#include "xfs_rtgroup.h"

/*
 * Notes on an efficient, low latency fstrim algorithm
 *
 * We need to walk the filesystem free space and issue discards on the free
 * space that meets the search criteria (size and location). We cannot issue
 * discards on extents that might be in use, or are so recently in use they are
 * still marked as busy. To serialise against extent state changes whilst we
 * are gathering extents to trim, we must hold the AGF lock to lock out other
 * allocations and extent free operations that might change extent state.
 *
 * However, we cannot just hold the AGF for the entire AG free space walk whilst
 * we issue discards on each free space that is found. Storage devices can have
 * extremely slow discard implementations (e.g. ceph RBD) and so walking a
 * couple of million free extents and issuing synchronous discards on each
 * extent can take a *long* time. Whilst we are doing this walk, nothing else
 * can access the AGF, and we can stall transactions and hence the log whilst
 * modifications wait for the AGF lock to be released. This can lead to hung
 * tasks kicking the hung task timer and rebooting the system. This is bad.
 *
 * Hence we need to take a leaf from the bulkstat playbook. It takes the AGI
 * lock, gathers a range of inode cluster buffers that are allocated, drops the
 * AGI lock and then reads all the inode cluster buffers and processes them. It
 * loops doing this, using a cursor to keep track of where it is up to in the
 * AG for each iteration to restart the INOBT lookup from.
 *
 * We can't do this exactly with free space - once we drop the AGF lock, the
 * state of the free extent is out of our control and we cannot run a discard
 * safely on it in this situation. Unless, of course, we've marked the free
 * extent as busy and undergoing a discard operation whilst we held the AGF
 * locked.
 *
 * This is exactly how online discard works - free extents are marked busy when
 * they are freed, and once the extent free has been committed to the journal,
 * the busy extent record is marked as "undergoing discard" and the discard is
 * then issued on the free extent. Once the discard completes, the busy extent
 * record is removed and the extent is able to be allocated again.
 *
 * In the context of fstrim, if we find a free extent we need to discard, we
 * don't have to discard it immediately. All we need to do is record that free
 * extent as being busy and under discard, and all the allocation routines will
 * now avoid trying to allocate it. Hence if we mark the extent as busy under
 * the AGF lock, we can safely discard it without holding the AGF lock because
 * nothing will attempt to allocate that free space until the discard completes.
 *
 * This also allows us to issue discards asynchronously like we do with online
 * discard, and so for fast devices fstrim will run much faster as we can have
 * multiple discard operations in flight at once, as well as pipeline the free
 * extent search so that it overlaps in flight discard IO.
 */

#define XFS_DISCARD_MAX_EXAMINE		(100)

struct workqueue_struct *xfs_discard_wq;

static void
xfs_discard_endio_work(
	struct work_struct	*work)
{
	struct xfs_busy_extents	*extents =
		container_of(work, struct xfs_busy_extents, endio_work);

	xfs_extent_busy_clear(&extents->extent_list, false);
	kfree(extents->owner);
}

/*
 * Queue up the actual completion to a thread to avoid IRQ-safe locking for
 * eb_lock.
 */
static void
xfs_discard_endio(
	struct bio		*bio)
{
	struct xfs_busy_extents	*extents = bio->bi_private;

	INIT_WORK(&extents->endio_work, xfs_discard_endio_work);
	queue_work(xfs_discard_wq, &extents->endio_work);
	bio_put(bio);
}

/*
 * Walk the discard list and issue discards on all the busy extents in the
 * list. We plug and chain the bios so that we only need a single completion
 * call to clear all the busy extents once the discards are complete.
 */
int
xfs_discard_extents(
	struct xfs_mount	*mp,
	struct xfs_busy_extents	*extents)
{
	struct xfs_extent_busy	*busyp;
	struct bio		*bio = NULL;
	struct blk_plug		plug;
	int			error = 0;

	blk_start_plug(&plug);
	list_for_each_entry(busyp, &extents->extent_list, list) {
		struct xfs_group	*xg = busyp->group;
		struct xfs_buftarg	*btp =
			xfs_group_type_buftarg(xg->xg_mount, xg->xg_type);

		trace_xfs_discard_extent(xg, busyp->bno, busyp->length);

		error = __blkdev_issue_discard(btp->bt_bdev,
				xfs_gbno_to_daddr(xg, busyp->bno),
				XFS_FSB_TO_BB(mp, busyp->length),
				GFP_KERNEL, &bio);
		if (error && error != -EOPNOTSUPP) {
			xfs_info(mp,
	 "discard failed for extent [0x%llx,%u], error %d",
				(unsigned long long)busyp->bno,
				busyp->length,
				error);
			break;
		}
	}

	if (bio) {
		bio->bi_private = extents;
		bio->bi_end_io = xfs_discard_endio;
		submit_bio(bio);
	} else {
		xfs_discard_endio_work(&extents->endio_work);
	}
	blk_finish_plug(&plug);

	return error;
}

/*
 * Care must be taken setting up the trim cursor as the perags may not have
 * been initialised when the cursor is initialised. e.g. a clean mount which
 * hasn't read in AGFs and the first operation run on the mounted fs is a trim.
 * This can result in perag fields that aren't initialised until
 * xfs_trim_gather_extents() calls xfs_alloc_read_agf() to lock down the AG for
 * the free space search.
 */
struct xfs_trim_cur {
	xfs_agblock_t	start;
	xfs_extlen_t	count;
	xfs_agblock_t	end;
	xfs_extlen_t	minlen;
	bool		by_bno;
};

static int
xfs_trim_gather_extents(
	struct xfs_perag	*pag,
	struct xfs_trim_cur	*tcur,
	struct xfs_busy_extents	*extents)
{
	struct xfs_mount	*mp = pag_mount(pag);
	struct xfs_trans	*tp;
	struct xfs_btree_cur	*cur;
	struct xfs_buf		*agbp;
	int			error;
	int			i;
	int			batch = XFS_DISCARD_MAX_EXAMINE;

	/*
	 * Force out the log. This means any transactions that might have freed
	 * space before we take the AGF buffer lock are now on disk, and the
	 * volatile disk cache is flushed.
	 */
	xfs_log_force(mp, XFS_LOG_SYNC);

	tp = xfs_trans_alloc_empty(mp);

	error = xfs_alloc_read_agf(pag, tp, 0, &agbp);
	if (error)
		goto out_trans_cancel;

	/*
	 * First time through tcur->count will not have been initialised as
	 * pag->pagf_longest is not guaranteed to be valid before we read
	 * the AGF buffer above.
	 */
	if (!tcur->count)
		tcur->count = pag->pagf_longest;

	if (tcur->by_bno) {
		/* sub-AG discard request always starts at tcur->start */
		cur = xfs_bnobt_init_cursor(mp, tp, agbp, pag);
		error = xfs_alloc_lookup_le(cur, tcur->start, 0, &i);
		if (!error && !i)
			error = xfs_alloc_lookup_ge(cur, tcur->start, 0, &i);
	} else if (tcur->start == 0) {
		/* first time through a by-len starts with max length */
		cur = xfs_cntbt_init_cursor(mp, tp, agbp, pag);
		error = xfs_alloc_lookup_ge(cur, 0, tcur->count, &i);
	} else {
		/* nth time through a by-len starts where we left off */
		cur = xfs_cntbt_init_cursor(mp, tp, agbp, pag);
		error = xfs_alloc_lookup_le(cur, tcur->start, tcur->count, &i);
	}
	if (error)
		goto out_del_cursor;
	if (i == 0) {
		/* nothing of that length left in the AG, we are done */
		tcur->count = 0;
		goto out_del_cursor;
	}

	/*
	 * Loop until we are done with all extents that are large
	 * enough to be worth discarding or we hit batch limits.
	 */
	while (i) {
		xfs_agblock_t	fbno;
		xfs_extlen_t	flen;

		error = xfs_alloc_get_rec(cur, &fbno, &flen, &i);
		if (error)
			break;
		if (XFS_IS_CORRUPT(mp, i != 1)) {
			xfs_btree_mark_sick(cur);
			error = -EFSCORRUPTED;
			break;
		}

		if (--batch <= 0) {
			/*
			 * Update the cursor to point at this extent so we
			 * restart the next batch from this extent.
			 */
			tcur->start = fbno;
			tcur->count = flen;
			break;
		}

		/*
		 * If the extent is entirely outside of the range we are
		 * supposed to trim, skip it. Do not bother to trim down
		 * partially overlapping ranges for now.
		 */
		if (fbno + flen < tcur->start) {
			trace_xfs_discard_exclude(pag_group(pag), fbno, flen);
			goto next_extent;
		}
		if (fbno > tcur->end) {
			trace_xfs_discard_exclude(pag_group(pag), fbno, flen);
			if (tcur->by_bno) {
				tcur->count = 0;
				break;
			}
			goto next_extent;
		}

		/* Trim the extent returned to the range we want. */
		if (fbno < tcur->start) {
			flen -= tcur->start - fbno;
			fbno = tcur->start;
		}
		if (fbno + flen > tcur->end + 1)
			flen = tcur->end - fbno + 1;

		/* Too small? Give up. */
		if (flen < tcur->minlen) {
			trace_xfs_discard_toosmall(pag_group(pag), fbno, flen);
			if (tcur->by_bno)
				goto next_extent;
			tcur->count = 0;
			break;
		}

		/*
		 * If any blocks in the range are still busy, skip the
		 * discard and try again the next time.
		 */
		if (xfs_extent_busy_search(pag_group(pag), fbno, flen)) {
			trace_xfs_discard_busy(pag_group(pag), fbno, flen);
			goto next_extent;
		}

		xfs_extent_busy_insert_discard(pag_group(pag), fbno, flen,
				&extents->extent_list);
next_extent:
		if (tcur->by_bno)
			error = xfs_btree_increment(cur, 0, &i);
		else
			error = xfs_btree_decrement(cur, 0, &i);
		if (error)
			break;

		/*
		 * If there are no more records in the tree, we are done. Set
		 * the cursor block count to 0 to indicate to the caller that
		 * there are no more extents to search.
		 */
		if (i == 0)
			tcur->count = 0;
	}

	/*
	 * If there was an error, release all the gathered busy extents because
	 * we aren't going to issue a discard on them any more.
	 */
	if (error)
		xfs_extent_busy_clear(&extents->extent_list, false);
out_del_cursor:
	xfs_btree_del_cursor(cur, error);
out_trans_cancel:
	xfs_trans_cancel(tp);
	return error;
}

static bool
xfs_trim_should_stop(void)
{
	return fatal_signal_pending(current) || freezing(current);
}

/*
 * Iterate the free list gathering extents and discarding them. We need a
 * cursor for the repeated iteration of the gather/discard loop, so use the
 * longest extent we found in the last batch as the key to start the next.
 */
static int
xfs_trim_perag_extents(
	struct xfs_perag	*pag,
	xfs_agblock_t		start,
	xfs_agblock_t		end,
	xfs_extlen_t		minlen)
{
	struct xfs_trim_cur	tcur = {
		.start		= start,
		.end		= end,
		.minlen		= minlen,
	};
	int			error = 0;

	if (start != 0 || end != pag_group(pag)->xg_block_count)
		tcur.by_bno = true;

	do {
		struct xfs_busy_extents	*extents;

		extents = kzalloc(sizeof(*extents), GFP_KERNEL);
		if (!extents) {
			error = -ENOMEM;
			break;
		}

		extents->owner = extents;
		INIT_LIST_HEAD(&extents->extent_list);

		error = xfs_trim_gather_extents(pag, &tcur, extents);
		if (error) {
			kfree(extents);
			break;
		}

		/*
		 * We hand the extent list to the discard function here so the
		 * discarded extents can be removed from the busy extent list.
		 * This allows the discards to run asynchronously with
		 * gathering the next round of extents to discard.
		 *
		 * However, we must ensure that we do not reference the extent
		 * list after this function call, as it may have been freed by
		 * the time control returns to us.
		 */
		error = xfs_discard_extents(pag_mount(pag), extents);
		if (error)
			break;

		if (xfs_trim_should_stop())
			break;

	} while (tcur.count != 0);

	return error;
}

static int
xfs_trim_datadev_extents(
	struct xfs_mount	*mp,
	xfs_daddr_t		start,
	xfs_daddr_t		end,
	xfs_extlen_t		minlen)
{
	xfs_agnumber_t		start_agno, end_agno;
	xfs_agblock_t		start_agbno, end_agbno;
	struct xfs_perag	*pag = NULL;
	xfs_daddr_t		ddev_end;
	int			last_error = 0, error;

	ddev_end = min_t(xfs_daddr_t, end,
			 XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks) - 1);

	start_agno = xfs_daddr_to_agno(mp, start);
	start_agbno = xfs_daddr_to_agbno(mp, start);
	end_agno = xfs_daddr_to_agno(mp, ddev_end);
	end_agbno = xfs_daddr_to_agbno(mp, ddev_end);

	while ((pag = xfs_perag_next_range(mp, pag, start_agno, end_agno))) {
		xfs_agblock_t	agend = pag_group(pag)->xg_block_count;

		if (pag_agno(pag) == end_agno)
			agend = end_agbno;
		error = xfs_trim_perag_extents(pag, start_agbno, agend, minlen);
		if (error)
			last_error = error;

		if (xfs_trim_should_stop()) {
			xfs_perag_rele(pag);
			break;
		}
		start_agbno = 0;
	}

	return last_error;
}

#ifdef CONFIG_XFS_RT
struct xfs_trim_rtdev {
	/* list of rt extents to free */
	struct list_head	extent_list;

	/* minimum length that caller allows us to trim */
	xfs_rtblock_t		minlen_fsb;

	/* restart point for the rtbitmap walk */
	xfs_rtxnum_t		restart_rtx;

	/* stopping point for the current rtbitmap walk */
	xfs_rtxnum_t		stop_rtx;
};

struct xfs_rtx_busy {
	struct list_head	list;
	xfs_rtblock_t		bno;
	xfs_rtblock_t		length;
};

static void
xfs_discard_free_rtdev_extents(
	struct xfs_trim_rtdev	*tr)
{
	struct xfs_rtx_busy	*busyp, *n;

	list_for_each_entry_safe(busyp, n, &tr->extent_list, list) {
		list_del_init(&busyp->list);
		kfree(busyp);
	}
}

/*
 * Walk the discard list and issue discards on all the busy extents in the
 * list. We plug and chain the bios so that we only need a single completion
 * call to clear all the busy extents once the discards are complete.
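 *
 * Note: unlike xfs_discard_extents(), this path waits for the chained discard
 * bios to complete (via submit_bio_wait()) before returning.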
 */
static int
xfs_discard_rtdev_extents(
	struct xfs_mount	*mp,
	struct xfs_trim_rtdev	*tr)
{
	struct block_device	*bdev = mp->m_rtdev_targp->bt_bdev;
	struct xfs_rtx_busy	*busyp;
	struct bio		*bio = NULL;
	struct blk_plug		plug;
	xfs_rtblock_t		start = NULLRTBLOCK, length = 0;
	int			error = 0;

	blk_start_plug(&plug);
	list_for_each_entry(busyp, &tr->extent_list, list) {
		if (start == NULLRTBLOCK)
			start = busyp->bno;
		length += busyp->length;

		trace_xfs_discard_rtextent(mp, busyp->bno, busyp->length);

		error = __blkdev_issue_discard(bdev,
				xfs_rtb_to_daddr(mp, busyp->bno),
				XFS_FSB_TO_BB(mp, busyp->length),
				GFP_NOFS, &bio);
		if (error)
			break;
	}
	xfs_discard_free_rtdev_extents(tr);

	if (bio) {
		error = submit_bio_wait(bio);
		if (error == -EOPNOTSUPP)
			error = 0;
		if (error)
			xfs_info(mp,
	 "discard failed for rtextent [0x%llx,%llu], error %d",
				(unsigned long long)start,
				(unsigned long long)length,
				error);
		bio_put(bio);
	}
	blk_finish_plug(&plug);

	return error;
}

static int
xfs_trim_gather_rtextent(
	struct xfs_rtgroup	*rtg,
	struct xfs_trans	*tp,
	const struct xfs_rtalloc_rec	*rec,
	void			*priv)
{
	struct xfs_trim_rtdev	*tr = priv;
	struct xfs_rtx_busy	*busyp;
	xfs_rtblock_t		rbno, rlen;

	if (rec->ar_startext > tr->stop_rtx) {
		/*
		 * If we've scanned a large number of rtbitmap blocks, update
		 * the cursor to point at this extent so we restart the next
		 * batch from this extent.
		 */
		tr->restart_rtx = rec->ar_startext;
		return -ECANCELED;
	}

	rbno = xfs_rtx_to_rtb(rtg, rec->ar_startext);
	rlen = xfs_rtbxlen_to_blen(rtg_mount(rtg), rec->ar_extcount);

	/* Ignore too small. */
	if (rlen < tr->minlen_fsb) {
		trace_xfs_discard_rttoosmall(rtg_mount(rtg), rbno, rlen);
		return 0;
	}

	busyp = kzalloc(sizeof(struct xfs_rtx_busy), GFP_KERNEL);
	if (!busyp)
		return -ENOMEM;

	busyp->bno = rbno;
	busyp->length = rlen;
	INIT_LIST_HEAD(&busyp->list);
	list_add_tail(&busyp->list, &tr->extent_list);

	tr->restart_rtx = rec->ar_startext + rec->ar_extcount;
	return 0;
}

/* Trim extents on an !rtgroups realtime device */
static int
xfs_trim_rtextents(
	struct xfs_rtgroup	*rtg,
	xfs_rtxnum_t		low,
	xfs_rtxnum_t		high,
	xfs_daddr_t		minlen)
{
	struct xfs_mount	*mp = rtg_mount(rtg);
	struct xfs_trim_rtdev	tr = {
		.minlen_fsb	= XFS_BB_TO_FSB(mp, minlen),
		.extent_list	= LIST_HEAD_INIT(tr.extent_list),
	};
	struct xfs_trans	*tp;
	int			error;

	tp = xfs_trans_alloc_empty(mp);

	/*
	 * Walk the free ranges between low and high. The query_range function
	 * trims the extents returned.
	 */
	do {
		tr.stop_rtx = low + xfs_rtbitmap_rtx_per_rbmblock(mp);
		xfs_rtgroup_lock(rtg, XFS_RTGLOCK_BITMAP_SHARED);
		error = xfs_rtalloc_query_range(rtg, tp, low, high,
				xfs_trim_gather_rtextent, &tr);

		if (error == -ECANCELED)
			error = 0;
		if (error) {
			xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_BITMAP_SHARED);
			xfs_discard_free_rtdev_extents(&tr);
			break;
		}

		if (list_empty(&tr.extent_list)) {
			xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_BITMAP_SHARED);
			break;
		}

		error = xfs_discard_rtdev_extents(mp, &tr);
		xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_BITMAP_SHARED);
		if (error)
			break;

		low = tr.restart_rtx;
	} while (!xfs_trim_should_stop() && low <= high);

	xfs_trans_cancel(tp);
	return error;
}

struct xfs_trim_rtgroup {
	/* list of rtgroup extents to free */
	struct xfs_busy_extents	*extents;

	/* minimum length that caller allows us to trim */
	xfs_rtblock_t		minlen_fsb;

	/* restart point for the rtbitmap walk */
	xfs_rtxnum_t		restart_rtx;

	/* number of extents to examine before stopping to issue discard ios */
	int			batch;

	/* number of extents queued for discard */
	int			queued;
};

static int
xfs_trim_gather_rtgroup_extent(
	struct xfs_rtgroup	*rtg,
	struct xfs_trans	*tp,
	const struct xfs_rtalloc_rec	*rec,
	void			*priv)
{
	struct xfs_trim_rtgroup	*tr = priv;
	xfs_rgblock_t		rgbno;
	xfs_extlen_t		len;

	if (--tr->batch <= 0) {
		/*
		 * If we've checked a large number of extents, update the
		 * cursor to point at this extent so we restart the next batch
		 * from this extent.
		 */
		tr->restart_rtx = rec->ar_startext;
		return -ECANCELED;
	}

	rgbno = xfs_rtx_to_rgbno(rtg, rec->ar_startext);
	len = xfs_rtxlen_to_extlen(rtg_mount(rtg), rec->ar_extcount);

	/* Ignore too small. */
	if (len < tr->minlen_fsb) {
		trace_xfs_discard_toosmall(rtg_group(rtg), rgbno, len);
		return 0;
	}

	/*
	 * If any blocks in the range are still busy, skip the discard and try
	 * again the next time.
	 */
	if (xfs_extent_busy_search(rtg_group(rtg), rgbno, len)) {
		trace_xfs_discard_busy(rtg_group(rtg), rgbno, len);
		return 0;
	}

	xfs_extent_busy_insert_discard(rtg_group(rtg), rgbno, len,
			&tr->extents->extent_list);

	tr->queued++;
	tr->restart_rtx = rec->ar_startext + rec->ar_extcount;
	return 0;
}

/* Trim extents in this rtgroup using the busy extent machinery. */
static int
xfs_trim_rtgroup_extents(
	struct xfs_rtgroup	*rtg,
	xfs_rtxnum_t		low,
	xfs_rtxnum_t		high,
	xfs_daddr_t		minlen)
{
	struct xfs_mount	*mp = rtg_mount(rtg);
	struct xfs_trim_rtgroup	tr = {
		.minlen_fsb	= XFS_BB_TO_FSB(mp, minlen),
	};
	struct xfs_trans	*tp;
	int			error;

	tp = xfs_trans_alloc_empty(mp);

	/*
	 * Walk the free ranges between low and high. The query_range function
	 * trims the extents returned.
	 */
	do {
		tr.extents = kzalloc(sizeof(*tr.extents), GFP_KERNEL);
		if (!tr.extents) {
			error = -ENOMEM;
			break;
		}

		tr.queued = 0;
		tr.batch = XFS_DISCARD_MAX_EXAMINE;
		tr.extents->owner = tr.extents;
		INIT_LIST_HEAD(&tr.extents->extent_list);

		xfs_rtgroup_lock(rtg, XFS_RTGLOCK_BITMAP_SHARED);
		error = xfs_rtalloc_query_range(rtg, tp, low, high,
				xfs_trim_gather_rtgroup_extent, &tr);
		xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_BITMAP_SHARED);
		if (error == -ECANCELED)
			error = 0;
		if (error) {
			kfree(tr.extents);
			break;
		}

		if (!tr.queued)
			break;

		/*
		 * We hand the extent list to the discard function here so the
		 * discarded extents can be removed from the busy extent list.
		 * This allows the discards to run asynchronously with
		 * gathering the next round of extents to discard.
		 *
		 * However, we must ensure that we do not reference the extent
		 * list after this function call, as it may have been freed by
		 * the time control returns to us.
		 */
		error = xfs_discard_extents(rtg_mount(rtg), tr.extents);
		if (error)
			break;

		low = tr.restart_rtx;
	} while (!xfs_trim_should_stop() && low <= high);

	xfs_trans_cancel(tp);
	return error;
}

static int
xfs_trim_rtdev_extents(
	struct xfs_mount	*mp,
	xfs_daddr_t		start,
	xfs_daddr_t		end,
	xfs_daddr_t		minlen)
{
	xfs_rtblock_t		start_rtbno, end_rtbno;
	xfs_rtxnum_t		start_rtx, end_rtx;
	xfs_rgnumber_t		start_rgno, end_rgno;
	xfs_daddr_t		daddr_offset;
	int			last_error = 0, error;
	struct xfs_rtgroup	*rtg = NULL;

	/* Shift the start and end downwards to match the rt device. */
	daddr_offset = XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks);
	if (start > daddr_offset)
		start -= daddr_offset;
	else
		start = 0;
	start_rtbno = xfs_daddr_to_rtb(mp, start);
	start_rtx = xfs_rtb_to_rtx(mp, start_rtbno);
	start_rgno = xfs_rtb_to_rgno(mp, start_rtbno);

	if (end <= daddr_offset)
		return 0;
	else
		end -= daddr_offset;
	end_rtbno = xfs_daddr_to_rtb(mp, end);
	end_rtx = xfs_rtb_to_rtx(mp, end_rtbno + mp->m_sb.sb_rextsize - 1);
	end_rgno = xfs_rtb_to_rgno(mp, end_rtbno);

	while ((rtg = xfs_rtgroup_next_range(mp, rtg, start_rgno, end_rgno))) {
		xfs_rtxnum_t	rtg_end = rtg->rtg_extents;

		if (rtg_rgno(rtg) == end_rgno)
			rtg_end = min(rtg_end, end_rtx);

		if (xfs_has_rtgroups(mp))
			error = xfs_trim_rtgroup_extents(rtg, start_rtx,
					rtg_end, minlen);
		else
			error = xfs_trim_rtextents(rtg, start_rtx, rtg_end,
					minlen);
		if (error)
			last_error = error;

		if (xfs_trim_should_stop()) {
			xfs_rtgroup_rele(rtg);
			break;
		}
		start_rtx = 0;
	}

	return last_error;
}
#else
# define xfs_trim_rtdev_extents(...)	(-EOPNOTSUPP)
#endif /* CONFIG_XFS_RT */

/*
 * trim a range of the filesystem.
 *
 * Note: the parameters passed from userspace are byte ranges into the
 * filesystem which do not match the format we use for filesystem block
 * addressing. FSB addressing is sparse (AGNO|AGBNO), while the incoming format
 * is a linear address range. Hence we need to use DADDR based conversions and
 * comparisons for determining the correct offset and regions to trim.
 *
 * The realtime device is mapped into the FITRIM "address space" immediately
 * after the data device.
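 *
 * For example, if the data device occupies N basic blocks (daddrs), an
 * incoming trim range starting at daddr N + X maps to daddr X on the realtime
 * device; xfs_trim_rtdev_extents() performs that shift via the daddr_offset
 * calculated from sb_dblocks.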
 */
int
xfs_ioc_trim(
	struct xfs_mount		*mp,
	struct fstrim_range __user	*urange)
{
	unsigned int		granularity =
		bdev_discard_granularity(mp->m_ddev_targp->bt_bdev);
	struct block_device	*rt_bdev = NULL;
	struct fstrim_range	range;
	xfs_daddr_t		start, end;
	xfs_extlen_t		minlen;
	xfs_rfsblock_t		max_blocks;
	int			error, last_error = 0;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	if (mp->m_rtdev_targp && !xfs_has_zoned(mp) &&
	    bdev_max_discard_sectors(mp->m_rtdev_targp->bt_bdev))
		rt_bdev = mp->m_rtdev_targp->bt_bdev;
	if (!bdev_max_discard_sectors(mp->m_ddev_targp->bt_bdev) && !rt_bdev)
		return -EOPNOTSUPP;

	if (rt_bdev)
		granularity = max(granularity,
				  bdev_discard_granularity(rt_bdev));

	/*
	 * We haven't recovered the log, so we cannot use our bnobt-guided
	 * storage zapping commands.
	 */
	if (xfs_has_norecovery(mp))
		return -EROFS;

	if (copy_from_user(&range, urange, sizeof(range)))
		return -EFAULT;

	range.minlen = max_t(u64, granularity, range.minlen);
	minlen = XFS_B_TO_FSB(mp, range.minlen);

	/*
	 * Truncating down the len isn't actually quite correct, but using
	 * BBTOB would mean we trivially get overflows for values
	 * of ULLONG_MAX or slightly lower. And ULLONG_MAX is the default
	 * used by the fstrim application. In the end it really doesn't
	 * matter as trimming blocks is an advisory interface.
	 */
	max_blocks = mp->m_sb.sb_dblocks + mp->m_sb.sb_rblocks;
	if (range.start >= XFS_FSB_TO_B(mp, max_blocks) ||
	    range.minlen > XFS_FSB_TO_B(mp, mp->m_ag_max_usable) ||
	    range.len < mp->m_sb.sb_blocksize)
		return -EINVAL;

	start = BTOBB(range.start);
	end = start + BTOBBT(range.len) - 1;

	if (bdev_max_discard_sectors(mp->m_ddev_targp->bt_bdev)) {
		error = xfs_trim_datadev_extents(mp, start, end, minlen);
		if (error)
			last_error = error;
	}

	if (rt_bdev && !xfs_trim_should_stop()) {
		error = xfs_trim_rtdev_extents(mp, start, end, minlen);
		if (error)
			last_error = error;
	}

	if (last_error)
		return last_error;

	range.len = min_t(unsigned long long, range.len,
			  XFS_FSB_TO_B(mp, max_blocks) - range.start);
	if (copy_to_user(urange, &range, sizeof(range)))
		return -EFAULT;
	return 0;
}