// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2010, 2023 Red Hat, Inc.
 * All Rights Reserved.
 */
#include "xfs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_trans.h"
#include "xfs_mount.h"
#include "xfs_btree.h"
#include "xfs_alloc_btree.h"
#include "xfs_alloc.h"
#include "xfs_discard.h"
#include "xfs_error.h"
#include "xfs_extent_busy.h"
#include "xfs_trace.h"
#include "xfs_log.h"
#include "xfs_ag.h"
#include "xfs_health.h"
#include "xfs_rtbitmap.h"
#include "xfs_rtgroup.h"

/*
 * Notes on an efficient, low latency fstrim algorithm
 *
 * We need to walk the filesystem free space and issue discards on the free
 * space that meets the search criteria (size and location). We cannot issue
 * discards on extents that might be in use, or are so recently in use they are
 * still marked as busy. To serialise against extent state changes whilst we
 * are gathering extents to trim, we must hold the AGF lock to lock out other
 * allocations and extent free operations that might change extent state.
 *
 * However, we cannot just hold the AGF for the entire AG free space walk whilst
 * we issue discards on each free space that is found. Storage devices can have
 * extremely slow discard implementations (e.g. ceph RBD) and so walking a
 * couple of million free extents and issuing synchronous discards on each
 * extent can take a *long* time. Whilst we are doing this walk, nothing else
 * can access the AGF, and we can stall transactions and hence the log whilst
 * modifications wait for the AGF lock to be released. This can lead to hung
 * tasks triggering the hung task timer and rebooting the system. This is bad.
 *
 * Hence we need to take a leaf from the bulkstat playbook. It takes the AGI
 * lock, gathers a range of inode cluster buffers that are allocated, drops the
 * AGI lock and then reads all the inode cluster buffers and processes them. It
 * loops doing this, using a cursor to keep track of where it is up to in the
 * AG so that each iteration can restart the INOBT lookup from where the
 * previous one finished.
 *
 * We can't do this exactly with free space - once we drop the AGF lock, the
 * state of the free extent is out of our control and we cannot run a discard
 * safely on it in this situation. Unless, of course, we've marked the free
 * extent as busy and undergoing a discard operation whilst we held the AGF
 * locked.
 *
 * This is exactly how online discard works - free extents are marked busy when
 * they are freed, and once the extent free has been committed to the journal,
 * the busy extent record is marked as "undergoing discard" and the discard is
 * then issued on the free extent. Once the discard completes, the busy extent
 * record is removed and the extent is able to be allocated again.
 *
 * In the context of fstrim, if we find a free extent we need to discard, we
 * don't have to discard it immediately. All we need to do is record that free
 * extent as being busy and under discard, and all the allocation routines will
 * now avoid trying to allocate it. Hence if we mark the extent as busy under
 * the AGF lock, we can safely discard it without holding the AGF lock because
 * nothing will attempt to allocate that free space until the discard completes.
 *
 * This also allows us to issue discards asynchronously like we do with online
 * discard, and so for fast devices fstrim will run much faster as we can have
 * multiple discard operations in flight at once, as well as pipeline the free
 * extent search so that it overlaps with in-flight discard IO.
 */
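
/*
 * Roughly, and eliding error handling, the per-AG loop implemented below by
 * xfs_trim_perag_extents() and xfs_trim_gather_extents() has this shape
 * (illustrative sketch only, not additional code):
 *
 *	while (cursor has extents left) {
 *		lock the AGF;
 *		gather up to XFS_DISCARD_MAX_EXAMINE free extents, marking
 *			each one busy-and-discarding;
 *		unlock the AGF;
 *		issue asynchronous discards on the gathered extents;
 *	}
 *
 * The discard completion (xfs_discard_endio_work) then clears the busy extent
 * records, making the space allocatable again.
 */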

#define XFS_DISCARD_MAX_EXAMINE	(100)

struct workqueue_struct *xfs_discard_wq;

static void
xfs_discard_endio_work(
	struct work_struct	*work)
{
	struct xfs_busy_extents	*extents =
		container_of(work, struct xfs_busy_extents, endio_work);

	xfs_extent_busy_clear(&extents->extent_list, false);
	kfree(extents->owner);
}

/*
 * Queue up the actual completion to a thread to avoid IRQ-safe locking for
 * eb_lock.
 */
static void
xfs_discard_endio(
	struct bio		*bio)
{
	struct xfs_busy_extents	*extents = bio->bi_private;

	INIT_WORK(&extents->endio_work, xfs_discard_endio_work);
	queue_work(xfs_discard_wq, &extents->endio_work);
	bio_put(bio);
}

/*
 * Walk the discard list and issue discards on all the busy extents in the
 * list. We plug and chain the bios so that we only need a single completion
 * call to clear all the busy extents once the discards are complete.
 */
int
xfs_discard_extents(
	struct xfs_mount	*mp,
	struct xfs_busy_extents	*extents)
{
	struct xfs_extent_busy	*busyp;
	struct bio		*bio = NULL;
	struct blk_plug		plug;
	int			error = 0;

	blk_start_plug(&plug);
	list_for_each_entry(busyp, &extents->extent_list, list) {
		struct xfs_group	*xg = busyp->group;
		struct xfs_buftarg	*btp =
			xfs_group_type_buftarg(xg->xg_mount, xg->xg_type);

		trace_xfs_discard_extent(xg, busyp->bno, busyp->length);

		error = __blkdev_issue_discard(btp->bt_bdev,
				xfs_gbno_to_daddr(xg, busyp->bno),
				XFS_FSB_TO_BB(mp, busyp->length),
				GFP_KERNEL, &bio);
		if (error && error != -EOPNOTSUPP) {
			xfs_info(mp,
	 "discard failed for extent [0x%llx,%u], error %d",
				(unsigned long long)busyp->bno,
				busyp->length,
				error);
			break;
		}
	}

	if (bio) {
		bio->bi_private = extents;
		bio->bi_end_io = xfs_discard_endio;
		submit_bio(bio);
	} else {
		xfs_discard_endio_work(&extents->endio_work);
	}
	blk_finish_plug(&plug);

	return error;
}

/*
 * Care must be taken setting up the trim cursor as the perags may not have
 * been initialised when the cursor is initialised, e.g. on a clean mount which
 * hasn't read in the AGFs and where the first operation run on the mounted fs
 * is a trim. This can result in perag fields that aren't initialised until
 * xfs_trim_gather_extents() calls xfs_alloc_read_agf() to lock down the AG for
 * the free space search.
 */
struct xfs_trim_cur {
	xfs_agblock_t	start;
	xfs_extlen_t	count;
	xfs_agblock_t	end;
	xfs_extlen_t	minlen;
	bool		by_bno;
};

static int
xfs_trim_gather_extents(
	struct xfs_perag	*pag,
	struct xfs_trim_cur	*tcur,
	struct xfs_busy_extents	*extents)
{
	struct xfs_mount	*mp = pag_mount(pag);
	struct xfs_trans	*tp;
	struct xfs_btree_cur	*cur;
	struct xfs_buf		*agbp;
	int			error;
	int			i;
	int			batch = XFS_DISCARD_MAX_EXAMINE;

	/*
	 * Force out the log. This means any transactions that might have freed
	 * space before we take the AGF buffer lock are now on disk, and the
	 * volatile disk cache is flushed.
	 */
	xfs_log_force(mp, XFS_LOG_SYNC);

	error = xfs_trans_alloc_empty(mp, &tp);
	if (error)
		return error;

	error = xfs_alloc_read_agf(pag, tp, 0, &agbp);
	if (error)
		goto out_trans_cancel;

	/*
	 * First time through, tcur->count will not have been initialised as
	 * pag->pagf_longest is not guaranteed to be valid before we read
	 * the AGF buffer above.
	 */
	if (!tcur->count)
		tcur->count = pag->pagf_longest;

	if (tcur->by_bno) {
		/* sub-AG discard request always starts at tcur->start */
		cur = xfs_bnobt_init_cursor(mp, tp, agbp, pag);
		error = xfs_alloc_lookup_le(cur, tcur->start, 0, &i);
		if (!error && !i)
			error = xfs_alloc_lookup_ge(cur, tcur->start, 0, &i);
	} else if (tcur->start == 0) {
		/* first time through a by-len starts with max length */
		cur = xfs_cntbt_init_cursor(mp, tp, agbp, pag);
		error = xfs_alloc_lookup_ge(cur, 0, tcur->count, &i);
	} else {
		/* nth time through a by-len starts where we left off */
		cur = xfs_cntbt_init_cursor(mp, tp, agbp, pag);
		error = xfs_alloc_lookup_le(cur, tcur->start, tcur->count, &i);
	}
	if (error)
		goto out_del_cursor;
	if (i == 0) {
		/* nothing of that length left in the AG, we are done */
		tcur->count = 0;
		goto out_del_cursor;
	}

	/*
	 * Loop until we are done with all extents that are large
	 * enough to be worth discarding or we hit batch limits.
	 */
	while (i) {
		xfs_agblock_t	fbno;
		xfs_extlen_t	flen;

		error = xfs_alloc_get_rec(cur, &fbno, &flen, &i);
		if (error)
			break;
		if (XFS_IS_CORRUPT(mp, i != 1)) {
			xfs_btree_mark_sick(cur);
			error = -EFSCORRUPTED;
			break;
		}

		if (--batch <= 0) {
			/*
			 * Update the cursor to point at this extent so we
			 * restart the next batch from this extent.
			 */
			tcur->start = fbno;
			tcur->count = flen;
			break;
		}

		/*
		 * If the extent is entirely outside of the range we are
		 * supposed to trim, skip it. Do not bother to trim down
		 * partially overlapping ranges for now.
		 */
		if (fbno + flen < tcur->start) {
			trace_xfs_discard_exclude(pag_group(pag), fbno, flen);
			goto next_extent;
		}
		if (fbno > tcur->end) {
			trace_xfs_discard_exclude(pag_group(pag), fbno, flen);
			if (tcur->by_bno) {
				tcur->count = 0;
				break;
			}
			goto next_extent;
		}

		/* Trim the extent returned to the range we want. */
		if (fbno < tcur->start) {
			flen -= tcur->start - fbno;
			fbno = tcur->start;
		}
		if (fbno + flen > tcur->end + 1)
			flen = tcur->end - fbno + 1;
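
		/*
		 * Illustrative (hypothetical) numbers for the clamping above:
		 * a free extent starting at block 90 with length 50, trimmed
		 * against a request covering blocks [100, 120], leaves
		 * fbno = 100 and flen = 21 (blocks 100-120 inclusive), which
		 * is then checked against the minimum length below.
		 */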

		/* Too small? Give up. */
		if (flen < tcur->minlen) {
			trace_xfs_discard_toosmall(pag_group(pag), fbno, flen);
			if (tcur->by_bno)
				goto next_extent;
			tcur->count = 0;
			break;
		}

		/*
		 * If any blocks in the range are still busy, skip the
		 * discard and try again the next time.
		 */
		if (xfs_extent_busy_search(pag_group(pag), fbno, flen)) {
			trace_xfs_discard_busy(pag_group(pag), fbno, flen);
			goto next_extent;
		}

		xfs_extent_busy_insert_discard(pag_group(pag), fbno, flen,
				&extents->extent_list);
next_extent:
		if (tcur->by_bno)
			error = xfs_btree_increment(cur, 0, &i);
		else
			error = xfs_btree_decrement(cur, 0, &i);
		if (error)
			break;

		/*
		 * If there are no more records in the tree, we are done. Set
		 * the cursor block count to 0 to indicate to the caller that
		 * there are no more extents to search.
		 */
		if (i == 0)
			tcur->count = 0;
	}

	/*
	 * If there was an error, release all the gathered busy extents because
	 * we aren't going to issue a discard on them any more.
	 */
	if (error)
		xfs_extent_busy_clear(&extents->extent_list, false);
out_del_cursor:
	xfs_btree_del_cursor(cur, error);
out_trans_cancel:
	xfs_trans_cancel(tp);
	return error;
}

static bool
xfs_trim_should_stop(void)
{
	return fatal_signal_pending(current) || freezing(current);
}

/*
 * Iterate the free list gathering extents and discarding them. We need a
 * cursor for the repeated iteration of the gather/discard loop, so use the
 * longest extent we found in the last batch as the key to start the next.
 */
static int
xfs_trim_perag_extents(
	struct xfs_perag	*pag,
	xfs_agblock_t		start,
	xfs_agblock_t		end,
	xfs_extlen_t		minlen)
{
	struct xfs_trim_cur	tcur = {
		.start		= start,
		.end		= end,
		.minlen		= minlen,
	};
	int			error = 0;

	if (start != 0 || end != pag_group(pag)->xg_block_count)
		tcur.by_bno = true;

	do {
		struct xfs_busy_extents	*extents;

		extents = kzalloc(sizeof(*extents), GFP_KERNEL);
		if (!extents) {
			error = -ENOMEM;
			break;
		}

		extents->owner = extents;
		INIT_LIST_HEAD(&extents->extent_list);

		error = xfs_trim_gather_extents(pag, &tcur, extents);
		if (error) {
			kfree(extents);
			break;
		}

		/*
		 * We hand the extent list to the discard function here so the
		 * discarded extents can be removed from the busy extent list.
		 * This allows the discards to run asynchronously with
		 * gathering the next round of extents to discard.
		 *
		 * However, we must ensure that we do not reference the extent
		 * list after this function call, as it may have been freed by
		 * the time control returns to us.
		 */
		error = xfs_discard_extents(pag_mount(pag), extents);
		if (error)
			break;

		if (xfs_trim_should_stop())
			break;

	} while (tcur.count != 0);

	return error;
}

static int
xfs_trim_datadev_extents(
	struct xfs_mount	*mp,
	xfs_daddr_t		start,
	xfs_daddr_t		end,
	xfs_extlen_t		minlen)
{
	xfs_agnumber_t		start_agno, end_agno;
	xfs_agblock_t		start_agbno, end_agbno;
	struct xfs_perag	*pag = NULL;
	xfs_daddr_t		ddev_end;
	int			last_error = 0, error;

	ddev_end = min_t(xfs_daddr_t, end,
			 XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks) - 1);

	start_agno = xfs_daddr_to_agno(mp, start);
	start_agbno = xfs_daddr_to_agbno(mp, start);
	end_agno = xfs_daddr_to_agno(mp, ddev_end);
	end_agbno = xfs_daddr_to_agbno(mp, ddev_end);

	while ((pag = xfs_perag_next_range(mp, pag, start_agno, end_agno))) {
		xfs_agblock_t	agend = pag_group(pag)->xg_block_count;

		if (pag_agno(pag) == end_agno)
			agend = end_agbno;
		error = xfs_trim_perag_extents(pag, start_agbno, agend, minlen);
		if (error)
			last_error = error;

		if (xfs_trim_should_stop()) {
			xfs_perag_rele(pag);
			break;
		}
		start_agbno = 0;
	}

	return last_error;
}

#ifdef CONFIG_XFS_RT
struct xfs_trim_rtdev {
	/* list of rt extents to free */
	struct list_head	extent_list;

	/* minimum length that caller allows us to trim */
	xfs_rtblock_t		minlen_fsb;

	/* restart point for the rtbitmap walk */
	xfs_rtxnum_t		restart_rtx;

	/* stopping point for the current rtbitmap walk */
	xfs_rtxnum_t		stop_rtx;
};

struct xfs_rtx_busy {
	struct list_head	list;
	xfs_rtblock_t		bno;
	xfs_rtblock_t		length;
};

static void
xfs_discard_free_rtdev_extents(
	struct xfs_trim_rtdev	*tr)
{
	struct xfs_rtx_busy	*busyp, *n;

	list_for_each_entry_safe(busyp, n, &tr->extent_list, list) {
		list_del_init(&busyp->list);
		kfree(busyp);
	}
}

/*
 * Walk the discard list and issue discards on all the busy extents in the
 * list. We plug and chain the bios so that we only need a single completion
 * call to clear all the busy extents once the discards are complete.
 */
static int
xfs_discard_rtdev_extents(
	struct xfs_mount	*mp,
	struct xfs_trim_rtdev	*tr)
{
	struct block_device	*bdev = mp->m_rtdev_targp->bt_bdev;
	struct xfs_rtx_busy	*busyp;
	struct bio		*bio = NULL;
	struct blk_plug		plug;
	xfs_rtblock_t		start = NULLRTBLOCK, length = 0;
	int			error = 0;

	blk_start_plug(&plug);
	list_for_each_entry(busyp, &tr->extent_list, list) {
		if (start == NULLRTBLOCK)
			start = busyp->bno;
		length += busyp->length;

		trace_xfs_discard_rtextent(mp, busyp->bno, busyp->length);

		error = __blkdev_issue_discard(bdev,
				xfs_rtb_to_daddr(mp, busyp->bno),
				XFS_FSB_TO_BB(mp, busyp->length),
				GFP_NOFS, &bio);
		if (error)
			break;
	}
	xfs_discard_free_rtdev_extents(tr);

	if (bio) {
		error = submit_bio_wait(bio);
		if (error == -EOPNOTSUPP)
			error = 0;
		if (error)
			xfs_info(mp,
	 "discard failed for rtextent [0x%llx,%llu], error %d",
				(unsigned long long)start,
				(unsigned long long)length,
				error);
		bio_put(bio);
	}
	blk_finish_plug(&plug);

	return error;
}

static int
xfs_trim_gather_rtextent(
	struct xfs_rtgroup	*rtg,
	struct xfs_trans	*tp,
	const struct xfs_rtalloc_rec *rec,
	void			*priv)
{
	struct xfs_trim_rtdev	*tr = priv;
	struct xfs_rtx_busy	*busyp;
	xfs_rtblock_t		rbno, rlen;

	if (rec->ar_startext > tr->stop_rtx) {
		/*
		 * If we've scanned a large number of rtbitmap blocks, update
		 * the cursor to point at this extent so we restart the next
		 * batch from this extent.
		 */
		tr->restart_rtx = rec->ar_startext;
		return -ECANCELED;
	}

	rbno = xfs_rtx_to_rtb(rtg, rec->ar_startext);
	rlen = xfs_rtbxlen_to_blen(rtg_mount(rtg), rec->ar_extcount);

	/* Ignore too small. */
	if (rlen < tr->minlen_fsb) {
		trace_xfs_discard_rttoosmall(rtg_mount(rtg), rbno, rlen);
		return 0;
	}

	busyp = kzalloc(sizeof(struct xfs_rtx_busy), GFP_KERNEL);
	if (!busyp)
		return -ENOMEM;

	busyp->bno = rbno;
	busyp->length = rlen;
	INIT_LIST_HEAD(&busyp->list);
	list_add_tail(&busyp->list, &tr->extent_list);

	tr->restart_rtx = rec->ar_startext + rec->ar_extcount;
	return 0;
}

/* Trim extents on an !rtgroups realtime device */
static int
xfs_trim_rtextents(
	struct xfs_rtgroup	*rtg,
	xfs_rtxnum_t		low,
	xfs_rtxnum_t		high,
	xfs_daddr_t		minlen)
{
	struct xfs_mount	*mp = rtg_mount(rtg);
	struct xfs_trim_rtdev	tr = {
		.minlen_fsb	= XFS_BB_TO_FSB(mp, minlen),
		.extent_list	= LIST_HEAD_INIT(tr.extent_list),
	};
	struct xfs_trans	*tp;
	int			error;

	error = xfs_trans_alloc_empty(mp, &tp);
	if (error)
		return error;

	/*
	 * Walk the free ranges between low and high. The query_range function
	 * trims the extents returned.
	 */
	do {
		tr.stop_rtx = low + xfs_rtbitmap_rtx_per_rbmblock(mp);
		xfs_rtgroup_lock(rtg, XFS_RTGLOCK_BITMAP_SHARED);
		error = xfs_rtalloc_query_range(rtg, tp, low, high,
				xfs_trim_gather_rtextent, &tr);

		if (error == -ECANCELED)
			error = 0;
		if (error) {
			xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_BITMAP_SHARED);
			xfs_discard_free_rtdev_extents(&tr);
			break;
		}

		if (list_empty(&tr.extent_list)) {
			xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_BITMAP_SHARED);
			break;
		}

		error = xfs_discard_rtdev_extents(mp, &tr);
		xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_BITMAP_SHARED);
		if (error)
			break;

		low = tr.restart_rtx;
	} while (!xfs_trim_should_stop() && low <= high);

	xfs_trans_cancel(tp);
	return error;
}

struct xfs_trim_rtgroup {
	/* list of rtgroup extents to free */
	struct xfs_busy_extents	*extents;

	/* minimum length that caller allows us to trim */
	xfs_rtblock_t		minlen_fsb;

	/* restart point for the rtbitmap walk */
	xfs_rtxnum_t		restart_rtx;

	/* number of extents to examine before stopping to issue discard ios */
	int			batch;

	/* number of extents queued for discard */
	int			queued;
};

static int
xfs_trim_gather_rtgroup_extent(
	struct xfs_rtgroup	*rtg,
	struct xfs_trans	*tp,
	const struct xfs_rtalloc_rec *rec,
	void			*priv)
{
	struct xfs_trim_rtgroup	*tr = priv;
	xfs_rgblock_t		rgbno;
	xfs_extlen_t		len;

	if (--tr->batch <= 0) {
		/*
		 * If we've checked a large number of extents, update the
		 * cursor to point at this extent so we restart the next batch
		 * from this extent.
		 */
		tr->restart_rtx = rec->ar_startext;
		return -ECANCELED;
	}

	rgbno = xfs_rtx_to_rgbno(rtg, rec->ar_startext);
	len = xfs_rtxlen_to_extlen(rtg_mount(rtg), rec->ar_extcount);

	/* Ignore too small. */
	if (len < tr->minlen_fsb) {
		trace_xfs_discard_toosmall(rtg_group(rtg), rgbno, len);
		return 0;
	}

	/*
	 * If any blocks in the range are still busy, skip the discard and try
	 * again the next time.
	 */
	if (xfs_extent_busy_search(rtg_group(rtg), rgbno, len)) {
		trace_xfs_discard_busy(rtg_group(rtg), rgbno, len);
		return 0;
	}

	xfs_extent_busy_insert_discard(rtg_group(rtg), rgbno, len,
			&tr->extents->extent_list);

	tr->queued++;
	tr->restart_rtx = rec->ar_startext + rec->ar_extcount;
	return 0;
}

/* Trim extents in this rtgroup using the busy extent machinery. */
static int
xfs_trim_rtgroup_extents(
	struct xfs_rtgroup	*rtg,
	xfs_rtxnum_t		low,
	xfs_rtxnum_t		high,
	xfs_daddr_t		minlen)
{
	struct xfs_mount	*mp = rtg_mount(rtg);
	struct xfs_trim_rtgroup	tr = {
		.minlen_fsb	= XFS_BB_TO_FSB(mp, minlen),
	};
	struct xfs_trans	*tp;
	int			error;

	error = xfs_trans_alloc_empty(mp, &tp);
	if (error)
		return error;

	/*
	 * Walk the free ranges between low and high. The query_range function
	 * trims the extents returned.
	 */
	do {
		tr.extents = kzalloc(sizeof(*tr.extents), GFP_KERNEL);
		if (!tr.extents) {
			error = -ENOMEM;
			break;
		}

		tr.queued = 0;
		tr.batch = XFS_DISCARD_MAX_EXAMINE;
		tr.extents->owner = tr.extents;
		INIT_LIST_HEAD(&tr.extents->extent_list);

		xfs_rtgroup_lock(rtg, XFS_RTGLOCK_BITMAP_SHARED);
		error = xfs_rtalloc_query_range(rtg, tp, low, high,
				xfs_trim_gather_rtgroup_extent, &tr);
		xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_BITMAP_SHARED);
		if (error == -ECANCELED)
			error = 0;
		if (error) {
			kfree(tr.extents);
			break;
		}

		if (!tr.queued)
			break;

		/*
		 * We hand the extent list to the discard function here so the
		 * discarded extents can be removed from the busy extent list.
		 * This allows the discards to run asynchronously with
		 * gathering the next round of extents to discard.
		 *
		 * However, we must ensure that we do not reference the extent
		 * list after this function call, as it may have been freed by
		 * the time control returns to us.
		 */
		error = xfs_discard_extents(rtg_mount(rtg), tr.extents);
		if (error)
			break;

		low = tr.restart_rtx;
	} while (!xfs_trim_should_stop() && low <= high);

	xfs_trans_cancel(tp);
	return error;
}

static int
xfs_trim_rtdev_extents(
	struct xfs_mount	*mp,
	xfs_daddr_t		start,
	xfs_daddr_t		end,
	xfs_daddr_t		minlen)
{
	xfs_rtblock_t		start_rtbno, end_rtbno;
	xfs_rtxnum_t		start_rtx, end_rtx;
	xfs_rgnumber_t		start_rgno, end_rgno;
	xfs_daddr_t		daddr_offset;
	int			last_error = 0, error;
	struct xfs_rtgroup	*rtg = NULL;

	/* Shift the start and end downwards to match the rt device. */
	daddr_offset = XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks);
	if (start > daddr_offset)
		start -= daddr_offset;
	else
		start = 0;
	start_rtbno = xfs_daddr_to_rtb(mp, start);
	start_rtx = xfs_rtb_to_rtx(mp, start_rtbno);
	start_rgno = xfs_rtb_to_rgno(mp, start_rtbno);

	if (end <= daddr_offset)
		return 0;
	else
		end -= daddr_offset;
	end_rtbno = xfs_daddr_to_rtb(mp, end);
	end_rtx = xfs_rtb_to_rtx(mp, end_rtbno + mp->m_sb.sb_rextsize - 1);
	end_rgno = xfs_rtb_to_rgno(mp, end_rtbno);

	while ((rtg = xfs_rtgroup_next_range(mp, rtg, start_rgno, end_rgno))) {
		xfs_rtxnum_t	rtg_end = rtg->rtg_extents;

		if (rtg_rgno(rtg) == end_rgno)
			rtg_end = min(rtg_end, end_rtx);

		if (xfs_has_rtgroups(mp))
			error = xfs_trim_rtgroup_extents(rtg, start_rtx,
					rtg_end, minlen);
		else
			error = xfs_trim_rtextents(rtg, start_rtx, rtg_end,
					minlen);
		if (error)
			last_error = error;

		if (xfs_trim_should_stop()) {
			xfs_rtgroup_rele(rtg);
			break;
		}
		start_rtx = 0;
	}

	return last_error;
}
#else
# define xfs_trim_rtdev_extents(...)	(-EOPNOTSUPP)
#endif /* CONFIG_XFS_RT */

/*
 * Trim a range of the filesystem.
 *
 * Note: the parameters passed from userspace are byte ranges into the
 * filesystem which do not match the format we use for filesystem block
 * addressing. FSB addressing is sparse (AGNO|AGBNO), while the incoming format
 * is a linear address range. Hence we need to use DADDR based conversions and
 * comparisons for determining the correct offset and regions to trim.
 *
 * The realtime device is mapped into the FITRIM "address space" immediately
 * after the data device.
 */
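
/*
 * Illustrative example of that mapping (hypothetical numbers): userspace
 * issues ioctl(fd, FITRIM, &range) with a struct fstrim_range expressed in
 * bytes. With a 4kiB block size and sb_dblocks = 1,000,000, the data device
 * covers daddrs 0 to 7,999,999. A range.start of 5,000,000,000 bytes converts
 * to daddr 9,765,625, which lies past the data device, so
 * xfs_trim_rtdev_extents() above shifts it down by daddr_offset (8,000,000)
 * to daddr 1,765,625 within the realtime device.
 */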
int
xfs_ioc_trim(
	struct xfs_mount		*mp,
	struct fstrim_range __user	*urange)
{
	unsigned int		granularity =
		bdev_discard_granularity(mp->m_ddev_targp->bt_bdev);
	struct block_device	*rt_bdev = NULL;
	struct fstrim_range	range;
	xfs_daddr_t		start, end;
	xfs_extlen_t		minlen;
	xfs_rfsblock_t		max_blocks;
	int			error, last_error = 0;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	if (mp->m_rtdev_targp && !xfs_has_zoned(mp) &&
	    bdev_max_discard_sectors(mp->m_rtdev_targp->bt_bdev))
		rt_bdev = mp->m_rtdev_targp->bt_bdev;
	if (!bdev_max_discard_sectors(mp->m_ddev_targp->bt_bdev) && !rt_bdev)
		return -EOPNOTSUPP;

	if (rt_bdev)
		granularity = max(granularity,
				  bdev_discard_granularity(rt_bdev));

	/*
	 * We haven't recovered the log, so we cannot use our bnobt-guided
	 * storage zapping commands.
	 */
	if (xfs_has_norecovery(mp))
		return -EROFS;

	if (copy_from_user(&range, urange, sizeof(range)))
		return -EFAULT;

	range.minlen = max_t(u64, granularity, range.minlen);
	minlen = XFS_B_TO_FSB(mp, range.minlen);

	/*
	 * Truncating down the len isn't actually quite correct, but rounding
	 * it up (BTOBB) would mean we trivially get overflows for values of
	 * ULLONG_MAX or slightly lower, and ULLONG_MAX is the default used by
	 * the fstrim application. In the end it really doesn't matter as
	 * trimming blocks is an advisory interface.
	 */
	max_blocks = mp->m_sb.sb_dblocks + mp->m_sb.sb_rblocks;
	if (range.start >= XFS_FSB_TO_B(mp, max_blocks) ||
	    range.minlen > XFS_FSB_TO_B(mp, mp->m_ag_max_usable) ||
	    range.len < mp->m_sb.sb_blocksize)
		return -EINVAL;

	start = BTOBB(range.start);
	end = start + BTOBBT(range.len) - 1;

	if (bdev_max_discard_sectors(mp->m_ddev_targp->bt_bdev)) {
		error = xfs_trim_datadev_extents(mp, start, end, minlen);
		if (error)
			last_error = error;
	}

	if (rt_bdev && !xfs_trim_should_stop()) {
		error = xfs_trim_rtdev_extents(mp, start, end, minlen);
		if (error)
			last_error = error;
	}

	if (last_error)
		return last_error;

	range.len = min_t(unsigned long long, range.len,
			  XFS_FSB_TO_B(mp, max_blocks) - range.start);
	if (copy_to_user(urange, &range, sizeof(range)))
		return -EFAULT;
	return 0;
}