// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2010, 2023 Red Hat, Inc.
 * All Rights Reserved.
 */
#include "xfs_platform.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_trans.h"
#include "xfs_mount.h"
#include "xfs_btree.h"
#include "xfs_alloc_btree.h"
#include "xfs_alloc.h"
#include "xfs_discard.h"
#include "xfs_error.h"
#include "xfs_extent_busy.h"
#include "xfs_trace.h"
#include "xfs_log.h"
#include "xfs_ag.h"
#include "xfs_health.h"
#include "xfs_rtbitmap.h"
#include "xfs_rtgroup.h"

/*
 * Notes on an efficient, low latency fstrim algorithm
 *
 * We need to walk the filesystem free space and issue discards on the free
 * space that meets the search criteria (size and location). We cannot issue
 * discards on extents that might be in use, or are so recently in use they are
 * still marked as busy. To serialise against extent state changes whilst we
 * are gathering extents to trim, we must hold the AGF lock to lock out other
 * allocations and extent free operations that might change extent state.
 *
 * However, we cannot just hold the AGF for the entire AG free space walk whilst
 * we issue discards on each free space that is found. Storage devices can have
 * extremely slow discard implementations (e.g. ceph RBD) and so walking a
 * couple of million free extents and issuing synchronous discards on each
 * extent can take a *long* time. Whilst we are doing this walk, nothing else
 * can access the AGF, and we can stall transactions and hence the log whilst
 * modifications wait for the AGF lock to be released. This can lead to hung
 * tasks kicking the hung task timer and rebooting the system. This is bad.
 *
 * Hence we need to take a leaf from the bulkstat playbook. It takes the AGI
 * lock, gathers a range of inode cluster buffers that are allocated, drops the
 * AGI lock and then reads all the inode cluster buffers and processes them. It
 * loops doing this, using a cursor to keep track of where it is up to in the AG
 * for each iteration to restart the INOBT lookup from.
 *
 * We can't do this exactly with free space - once we drop the AGF lock, the
 * state of the free extent is out of our control and we cannot run a discard
 * safely on it in this situation. Unless, of course, we've marked the free
 * extent as busy and undergoing a discard operation whilst we held the AGF
 * locked.
 *
 * This is exactly how online discard works - free extents are marked busy when
 * they are freed, and once the extent free has been committed to the journal,
 * the busy extent record is marked as "undergoing discard" and the discard is
 * then issued on the free extent. Once the discard completes, the busy extent
 * record is removed and the extent is able to be allocated again.
 *
 * In the context of fstrim, if we find a free extent we need to discard, we
 * don't have to discard it immediately. All we need to do is record that free
 * extent as being busy and under discard, and all the allocation routines will
 * now avoid trying to allocate it. Hence if we mark the extent as busy under
 * the AGF lock, we can safely discard it without holding the AGF lock because
 * nothing will attempt to allocate that free space until the discard completes.
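 *
 * That is the pattern xfs_trim_perag_extents() follows below: gather up to
 * XFS_DISCARD_MAX_EXAMINE free extents under the AGF lock, mark them busy and
 * under discard, drop the AGF lock, then hand the whole batch to
 * xfs_discard_extents() and restart the search from the cursor.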
 *
 * This also allows us to issue discards asynchronously like we do with online
 * discard, and so for fast devices fstrim will run much faster as we can have
 * multiple discard operations in flight at once, as well as pipeline the free
 * extent search so that it overlaps the in-flight discard IO.
 */

#define XFS_DISCARD_MAX_EXAMINE	(100)

struct workqueue_struct *xfs_discard_wq;

static void
xfs_discard_endio_work(
	struct work_struct	*work)
{
	struct xfs_busy_extents	*extents =
		container_of(work, struct xfs_busy_extents, endio_work);

	xfs_extent_busy_clear(&extents->extent_list, false);
	kfree(extents->owner);
}

/*
 * Queue up the actual completion to a thread to avoid IRQ-safe locking for
 * eb_lock.
 */
static void
xfs_discard_endio(
	struct bio		*bio)
{
	struct xfs_busy_extents	*extents = bio->bi_private;

	INIT_WORK(&extents->endio_work, xfs_discard_endio_work);
	queue_work(xfs_discard_wq, &extents->endio_work);
	bio_put(bio);
}

/*
 * Walk the discard list and issue discards on all the busy extents in the
 * list. We plug and chain the bios so that we only need a single completion
 * call to clear all the busy extents once the discards are complete.
 */
void
xfs_discard_extents(
	struct xfs_mount	*mp,
	struct xfs_busy_extents	*extents)
{
	struct xfs_extent_busy	*busyp;
	struct bio		*bio = NULL;
	struct blk_plug		plug;

	blk_start_plug(&plug);
	list_for_each_entry(busyp, &extents->extent_list, list) {
		struct xfs_group	*xg = busyp->group;
		struct xfs_buftarg	*btp =
			xfs_group_type_buftarg(xg->xg_mount, xg->xg_type);

		trace_xfs_discard_extent(xg, busyp->bno, busyp->length);

		__blkdev_issue_discard(btp->bt_bdev,
				xfs_gbno_to_daddr(xg, busyp->bno),
				XFS_FSB_TO_BB(mp, busyp->length),
				GFP_KERNEL, &bio);
	}

	if (bio) {
		bio->bi_private = extents;
		bio->bi_end_io = xfs_discard_endio;
		submit_bio(bio);
	} else {
		xfs_discard_endio_work(&extents->endio_work);
	}
	blk_finish_plug(&plug);
}

/*
 * Care must be taken setting up the trim cursor as the perags may not have been
 * initialised when the cursor is initialised. e.g. a clean mount which hasn't
 * read in AGFs and the first operation run on the mounted fs is a trim. This
 * can result in perag fields that aren't initialised until
 * xfs_trim_gather_extents() calls xfs_alloc_read_agf() to lock down the AG for
 * the free space search.
 */
struct xfs_trim_cur {
	xfs_agblock_t	start;
	xfs_extlen_t	count;
	xfs_agblock_t	end;
	xfs_extlen_t	minlen;
	bool		by_bno;
};

static int
xfs_trim_gather_extents(
	struct xfs_perag	*pag,
	struct xfs_trim_cur	*tcur,
	struct xfs_busy_extents	*extents)
{
	struct xfs_mount	*mp = pag_mount(pag);
	struct xfs_trans	*tp;
	struct xfs_btree_cur	*cur;
	struct xfs_buf		*agbp;
	int			error;
	int			i;
	int			batch = XFS_DISCARD_MAX_EXAMINE;

	/*
	 * Force out the log. This means any transactions that might have freed
	 * space before we take the AGF buffer lock are now on disk, and the
	 * volatile disk cache is flushed.
	 */
	xfs_log_force(mp, XFS_LOG_SYNC);

	tp = xfs_trans_alloc_empty(mp);

	error = xfs_alloc_read_agf(pag, tp, 0, &agbp);
	if (error)
		goto out_trans_cancel;

	/*
	 * First time through tcur->count will not have been initialised as
	 * pag->pagf_longest is not guaranteed to be valid before we read
	 * the AGF buffer above.
	 */
	if (!tcur->count)
		tcur->count = pag->pagf_longest;

	if (tcur->by_bno) {
		/* sub-AG discard request always starts at tcur->start */
		cur = xfs_bnobt_init_cursor(mp, tp, agbp, pag);
		error = xfs_alloc_lookup_le(cur, tcur->start, 0, &i);
		if (!error && !i)
			error = xfs_alloc_lookup_ge(cur, tcur->start, 0, &i);
	} else if (tcur->start == 0) {
		/* first time through a by-len starts with max length */
		cur = xfs_cntbt_init_cursor(mp, tp, agbp, pag);
		error = xfs_alloc_lookup_ge(cur, 0, tcur->count, &i);
	} else {
		/* nth time through a by-len starts where we left off */
		cur = xfs_cntbt_init_cursor(mp, tp, agbp, pag);
		error = xfs_alloc_lookup_le(cur, tcur->start, tcur->count, &i);
	}
	if (error)
		goto out_del_cursor;
	if (i == 0) {
		/* nothing of that length left in the AG, we are done */
		tcur->count = 0;
		goto out_del_cursor;
	}

	/*
	 * Loop until we are done with all extents that are large
	 * enough to be worth discarding or we hit batch limits.
	 */
	while (i) {
		xfs_agblock_t	fbno;
		xfs_extlen_t	flen;

		error = xfs_alloc_get_rec(cur, &fbno, &flen, &i);
		if (error)
			break;
		if (XFS_IS_CORRUPT(mp, i != 1)) {
			xfs_btree_mark_sick(cur);
			error = -EFSCORRUPTED;
			break;
		}

		if (--batch <= 0) {
			/*
			 * Update the cursor to point at this extent so we
			 * restart the next batch from this extent.
			 */
			tcur->start = fbno;
			tcur->count = flen;
			break;
		}

		/*
		 * If the extent is entirely outside of the range we are
		 * supposed to trim, skip it. Do not bother to trim down
		 * partially overlapping ranges for now.
		 */
		if (fbno + flen < tcur->start) {
			trace_xfs_discard_exclude(pag_group(pag), fbno, flen);
			goto next_extent;
		}
		if (fbno > tcur->end) {
			trace_xfs_discard_exclude(pag_group(pag), fbno, flen);
			if (tcur->by_bno) {
				tcur->count = 0;
				break;
			}
			goto next_extent;
		}

		/* Trim the extent returned to the range we want. */
		if (fbno < tcur->start) {
			flen -= tcur->start - fbno;
			fbno = tcur->start;
		}
		if (fbno + flen > tcur->end + 1)
			flen = tcur->end - fbno + 1;

		/* Too small? Give up. */
		if (flen < tcur->minlen) {
			trace_xfs_discard_toosmall(pag_group(pag), fbno, flen);
			if (tcur->by_bno)
				goto next_extent;
			tcur->count = 0;
			break;
		}

		/*
		 * If any blocks in the range are still busy, skip the
		 * discard and try again the next time.
		 */
		if (xfs_extent_busy_search(pag_group(pag), fbno, flen)) {
			trace_xfs_discard_busy(pag_group(pag), fbno, flen);
			goto next_extent;
		}

		xfs_extent_busy_insert_discard(pag_group(pag), fbno, flen,
				&extents->extent_list);
next_extent:
		if (tcur->by_bno)
			error = xfs_btree_increment(cur, 0, &i);
		else
			error = xfs_btree_decrement(cur, 0, &i);
		if (error)
			break;

		/*
		 * If there are no more records in the tree, we are done. Set
		 * the cursor block count to 0 to indicate to the caller that
		 * there are no more extents to search.
		 */
		if (i == 0)
			tcur->count = 0;
	}

	/*
	 * If there was an error, release all the gathered busy extents because
	 * we aren't going to issue a discard on them any more.
	 */
	if (error)
		xfs_extent_busy_clear(&extents->extent_list, false);
out_del_cursor:
	xfs_btree_del_cursor(cur, error);
out_trans_cancel:
	xfs_trans_cancel(tp);
	return error;
}

static bool
xfs_trim_should_stop(void)
{
	return fatal_signal_pending(current) || freezing(current);
}

/*
 * Iterate the free list gathering extents and discarding them. We need a
 * cursor for the repeated iteration of the gather/discard loop, so use the
 * longest extent we found in the last batch as the key to start the next.
 */
static int
xfs_trim_perag_extents(
	struct xfs_perag	*pag,
	xfs_agblock_t		start,
	xfs_agblock_t		end,
	xfs_extlen_t		minlen)
{
	struct xfs_trim_cur	tcur = {
		.start		= start,
		.end		= end,
		.minlen		= minlen,
	};
	int			error = 0;

	if (start != 0 || end != pag_group(pag)->xg_block_count)
		tcur.by_bno = true;

	do {
		struct xfs_busy_extents	*extents;

		extents = kzalloc(sizeof(*extents), GFP_KERNEL);
		if (!extents) {
			error = -ENOMEM;
			break;
		}

		extents->owner = extents;
		INIT_LIST_HEAD(&extents->extent_list);

		error = xfs_trim_gather_extents(pag, &tcur, extents);
		if (error) {
			kfree(extents);
			break;
		}

		/*
		 * We hand the extent list to the discard function here so the
		 * discarded extents can be removed from the busy extent list.
		 * This allows the discards to run asynchronously with gathering
		 * the next round of extents to discard.
		 *
		 * However, we must ensure that we do not reference the extent
		 * list after this function call, as it may have been freed by
		 * the time control returns to us.
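		 * The discard completion path (xfs_discard_endio_work())
		 * frees extents->owner, which points back at this structure.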
		 */
		xfs_discard_extents(pag_mount(pag), extents);

		if (xfs_trim_should_stop())
			break;

	} while (tcur.count != 0);

	return error;
}

static int
xfs_trim_datadev_extents(
	struct xfs_mount	*mp,
	xfs_daddr_t		start,
	xfs_daddr_t		end,
	xfs_extlen_t		minlen)
{
	xfs_agnumber_t		start_agno, end_agno;
	xfs_agblock_t		start_agbno, end_agbno;
	struct xfs_perag	*pag = NULL;
	xfs_daddr_t		ddev_end;
	int			last_error = 0, error;

	ddev_end = min_t(xfs_daddr_t, end,
			 XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks) - 1);

	start_agno = xfs_daddr_to_agno(mp, start);
	start_agbno = xfs_daddr_to_agbno(mp, start);
	end_agno = xfs_daddr_to_agno(mp, ddev_end);
	end_agbno = xfs_daddr_to_agbno(mp, ddev_end);

	while ((pag = xfs_perag_next_range(mp, pag, start_agno, end_agno))) {
		xfs_agblock_t	agend = pag_group(pag)->xg_block_count;

		if (pag_agno(pag) == end_agno)
			agend = end_agbno;
		error = xfs_trim_perag_extents(pag, start_agbno, agend, minlen);
		if (error)
			last_error = error;

		if (xfs_trim_should_stop()) {
			xfs_perag_rele(pag);
			break;
		}
		start_agbno = 0;
	}

	return last_error;
}

#ifdef CONFIG_XFS_RT
struct xfs_trim_rtdev {
	/* list of rt extents to free */
	struct list_head	extent_list;

	/* minimum length that caller allows us to trim */
	xfs_rtblock_t		minlen_fsb;

	/* restart point for the rtbitmap walk */
	xfs_rtxnum_t		restart_rtx;

	/* stopping point for the current rtbitmap walk */
	xfs_rtxnum_t		stop_rtx;
};

struct xfs_rtx_busy {
	struct list_head	list;
	xfs_rtblock_t		bno;
	xfs_rtblock_t		length;
};

static void
xfs_discard_free_rtdev_extents(
	struct xfs_trim_rtdev	*tr)
{
	struct xfs_rtx_busy	*busyp, *n;

	list_for_each_entry_safe(busyp, n, &tr->extent_list, list) {
		list_del_init(&busyp->list);
		kfree(busyp);
	}
}

/*
 * Walk the discard list and issue discards on all the extents in the list.
 * We plug and chain the bios so that we only need a single wait for all the
 * discards to complete.
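 *
 * Unlike the data device path, there is no busy extent tracking here; the
 * discards are issued and waited on synchronously while the caller holds the
 * rt bitmap lock so the extents cannot be reallocated before the discard
 * completes.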
 */
static int
xfs_discard_rtdev_extents(
	struct xfs_mount	*mp,
	struct xfs_trim_rtdev	*tr)
{
	struct block_device	*bdev = mp->m_rtdev_targp->bt_bdev;
	struct xfs_rtx_busy	*busyp;
	struct bio		*bio = NULL;
	struct blk_plug		plug;
	xfs_rtblock_t		start = NULLRTBLOCK, length = 0;
	int			error = 0;

	blk_start_plug(&plug);
	list_for_each_entry(busyp, &tr->extent_list, list) {
		if (start == NULLRTBLOCK)
			start = busyp->bno;
		length += busyp->length;

		trace_xfs_discard_rtextent(mp, busyp->bno, busyp->length);

		__blkdev_issue_discard(bdev,
				xfs_rtb_to_daddr(mp, busyp->bno),
				XFS_FSB_TO_BB(mp, busyp->length),
				GFP_NOFS, &bio);
	}
	xfs_discard_free_rtdev_extents(tr);

	if (bio) {
		error = submit_bio_wait(bio);
		if (error == -EOPNOTSUPP)
			error = 0;
		if (error)
			xfs_info(mp,
	"discard failed for rtextent [0x%llx,%llu], error %d",
				(unsigned long long)start,
				(unsigned long long)length,
				error);
		bio_put(bio);
	}
	blk_finish_plug(&plug);

	return error;
}

static int
xfs_trim_gather_rtextent(
	struct xfs_rtgroup		*rtg,
	struct xfs_trans		*tp,
	const struct xfs_rtalloc_rec	*rec,
	void				*priv)
{
	struct xfs_trim_rtdev		*tr = priv;
	struct xfs_rtx_busy		*busyp;
	xfs_rtblock_t			rbno, rlen;

	if (rec->ar_startext > tr->stop_rtx) {
		/*
		 * If we've scanned a large number of rtbitmap blocks, update
		 * the cursor to point at this extent so we restart the next
		 * batch from this extent.
		 */
		tr->restart_rtx = rec->ar_startext;
		return -ECANCELED;
	}

	rbno = xfs_rtx_to_rtb(rtg, rec->ar_startext);
	rlen = xfs_rtbxlen_to_blen(rtg_mount(rtg), rec->ar_extcount);

	/* Ignore too small. */
	if (rlen < tr->minlen_fsb) {
		trace_xfs_discard_rttoosmall(rtg_mount(rtg), rbno, rlen);
		return 0;
	}

	busyp = kzalloc(sizeof(struct xfs_rtx_busy), GFP_KERNEL);
	if (!busyp)
		return -ENOMEM;

	busyp->bno = rbno;
	busyp->length = rlen;
	INIT_LIST_HEAD(&busyp->list);
	list_add_tail(&busyp->list, &tr->extent_list);

	tr->restart_rtx = rec->ar_startext + rec->ar_extcount;
	return 0;
}

/* Trim extents on an !rtgroups realtime device */
static int
xfs_trim_rtextents(
	struct xfs_rtgroup	*rtg,
	xfs_rtxnum_t		low,
	xfs_rtxnum_t		high,
	xfs_daddr_t		minlen)
{
	struct xfs_mount	*mp = rtg_mount(rtg);
	struct xfs_trim_rtdev	tr = {
		.minlen_fsb	= XFS_BB_TO_FSB(mp, minlen),
		.extent_list	= LIST_HEAD_INIT(tr.extent_list),
	};
	struct xfs_trans	*tp;
	int			error;

	tp = xfs_trans_alloc_empty(mp);

	/*
	 * Walk the free ranges between low and high. The query_range function
	 * trims the extents returned.
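	 *
	 * Each pass scans at most one rtbitmap block's worth of rtextents
	 * (tr.stop_rtx) so the rt bitmap lock is not held for too long
	 * between discard batches.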
	 */
	do {
		tr.stop_rtx = low + xfs_rtbitmap_rtx_per_rbmblock(mp);
		xfs_rtgroup_lock(rtg, XFS_RTGLOCK_BITMAP_SHARED);
		error = xfs_rtalloc_query_range(rtg, tp, low, high,
				xfs_trim_gather_rtextent, &tr);

		if (error == -ECANCELED)
			error = 0;
		if (error) {
			xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_BITMAP_SHARED);
			xfs_discard_free_rtdev_extents(&tr);
			break;
		}

		if (list_empty(&tr.extent_list)) {
			xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_BITMAP_SHARED);
			break;
		}

		error = xfs_discard_rtdev_extents(mp, &tr);
		xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_BITMAP_SHARED);
		if (error)
			break;

		low = tr.restart_rtx;
	} while (!xfs_trim_should_stop() && low <= high);

	xfs_trans_cancel(tp);
	return error;
}

struct xfs_trim_rtgroup {
	/* list of rtgroup extents to free */
	struct xfs_busy_extents	*extents;

	/* minimum length that caller allows us to trim */
	xfs_rtblock_t		minlen_fsb;

	/* restart point for the rtbitmap walk */
	xfs_rtxnum_t		restart_rtx;

	/* number of extents to examine before stopping to issue discard ios */
	int			batch;

	/* number of extents queued for discard */
	int			queued;
};

static int
xfs_trim_gather_rtgroup_extent(
	struct xfs_rtgroup		*rtg,
	struct xfs_trans		*tp,
	const struct xfs_rtalloc_rec	*rec,
	void				*priv)
{
	struct xfs_trim_rtgroup		*tr = priv;
	xfs_rgblock_t			rgbno;
	xfs_extlen_t			len;

	if (--tr->batch <= 0) {
		/*
		 * If we've checked a large number of extents, update the
		 * cursor to point at this extent so we restart the next batch
		 * from this extent.
		 */
		tr->restart_rtx = rec->ar_startext;
		return -ECANCELED;
	}

	rgbno = xfs_rtx_to_rgbno(rtg, rec->ar_startext);
	len = xfs_rtxlen_to_extlen(rtg_mount(rtg), rec->ar_extcount);

	/* Ignore too small. */
	if (len < tr->minlen_fsb) {
		trace_xfs_discard_toosmall(rtg_group(rtg), rgbno, len);
		return 0;
	}

	/*
	 * If any blocks in the range are still busy, skip the discard and try
	 * again the next time.
	 */
	if (xfs_extent_busy_search(rtg_group(rtg), rgbno, len)) {
		trace_xfs_discard_busy(rtg_group(rtg), rgbno, len);
		return 0;
	}

	xfs_extent_busy_insert_discard(rtg_group(rtg), rgbno, len,
			&tr->extents->extent_list);

	tr->queued++;
	tr->restart_rtx = rec->ar_startext + rec->ar_extcount;
	return 0;
}

/* Trim extents in this rtgroup using the busy extent machinery. */
static int
xfs_trim_rtgroup_extents(
	struct xfs_rtgroup	*rtg,
	xfs_rtxnum_t		low,
	xfs_rtxnum_t		high,
	xfs_daddr_t		minlen)
{
	struct xfs_mount	*mp = rtg_mount(rtg);
	struct xfs_trim_rtgroup	tr = {
		.minlen_fsb	= XFS_BB_TO_FSB(mp, minlen),
	};
	struct xfs_trans	*tp;
	int			error;

	tp = xfs_trans_alloc_empty(mp);

	/*
	 * Walk the free ranges between low and high. The query_range function
	 * trims the extents returned.
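	 *
	 * The busy extent machinery lets us drop the rt bitmap lock before
	 * issuing the discards asynchronously, batching up to
	 * XFS_DISCARD_MAX_EXAMINE extents per pass like the data device walk.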
	 */
	do {
		tr.extents = kzalloc(sizeof(*tr.extents), GFP_KERNEL);
		if (!tr.extents) {
			error = -ENOMEM;
			break;
		}

		tr.queued = 0;
		tr.batch = XFS_DISCARD_MAX_EXAMINE;
		tr.extents->owner = tr.extents;
		INIT_LIST_HEAD(&tr.extents->extent_list);

		xfs_rtgroup_lock(rtg, XFS_RTGLOCK_BITMAP_SHARED);
		error = xfs_rtalloc_query_range(rtg, tp, low, high,
				xfs_trim_gather_rtgroup_extent, &tr);
		xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_BITMAP_SHARED);
		if (error == -ECANCELED)
			error = 0;
		if (error) {
			kfree(tr.extents);
			break;
		}

		if (!tr.queued) {
			kfree(tr.extents);
			break;
		}

		/*
		 * We hand the extent list to the discard function here so the
		 * discarded extents can be removed from the busy extent list.
		 * This allows the discards to run asynchronously with
		 * gathering the next round of extents to discard.
		 *
		 * However, we must ensure that we do not reference the extent
		 * list after this function call, as it may have been freed by
		 * the time control returns to us.
		 */
		xfs_discard_extents(rtg_mount(rtg), tr.extents);

		low = tr.restart_rtx;
	} while (!xfs_trim_should_stop() && low <= high);

	xfs_trans_cancel(tp);
	return error;
}

static int
xfs_trim_rtdev_extents(
	struct xfs_mount	*mp,
	xfs_daddr_t		start,
	xfs_daddr_t		end,
	xfs_daddr_t		minlen)
{
	xfs_rtblock_t		start_rtbno, end_rtbno;
	xfs_rtxnum_t		start_rtx, end_rtx;
	xfs_rgnumber_t		start_rgno, end_rgno;
	xfs_daddr_t		daddr_offset;
	int			last_error = 0, error;
	struct xfs_rtgroup	*rtg = NULL;

	/* Shift the start and end downwards to match the rt device. */
	daddr_offset = XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks);
	if (start > daddr_offset)
		start -= daddr_offset;
	else
		start = 0;
	start_rtbno = xfs_daddr_to_rtb(mp, start);
	start_rtx = xfs_rtb_to_rtx(mp, start_rtbno);
	start_rgno = xfs_rtb_to_rgno(mp, start_rtbno);

	if (end <= daddr_offset)
		return 0;
	else
		end -= daddr_offset;
	end_rtbno = xfs_daddr_to_rtb(mp, end);
	end_rtx = xfs_rtb_to_rtx(mp, end_rtbno + mp->m_sb.sb_rextsize - 1);
	end_rgno = xfs_rtb_to_rgno(mp, end_rtbno);

	while ((rtg = xfs_rtgroup_next_range(mp, rtg, start_rgno, end_rgno))) {
		xfs_rtxnum_t	rtg_end = rtg->rtg_extents;

		if (rtg_rgno(rtg) == end_rgno)
			rtg_end = min(rtg_end, end_rtx);

		if (xfs_has_rtgroups(mp))
			error = xfs_trim_rtgroup_extents(rtg, start_rtx,
					rtg_end, minlen);
		else
			error = xfs_trim_rtextents(rtg, start_rtx, rtg_end,
					minlen);
		if (error)
			last_error = error;

		if (xfs_trim_should_stop()) {
			xfs_rtgroup_rele(rtg);
			break;
		}
		start_rtx = 0;
	}

	return last_error;
}
#else
# define xfs_trim_rtdev_extents(...)	(-EOPNOTSUPP)
#endif /* CONFIG_XFS_RT */

/*
 * Trim a range of the filesystem.
 *
 * Note: the parameters passed from userspace are byte ranges into the
 * filesystem which do not match the format we use for filesystem block
 * addressing. FSB addressing is sparse (AGNO|AGBNO), while the incoming format
 * is a linear address range. Hence we need to use DADDR based conversions and
 * comparisons for determining the correct offset and regions to trim.
 *
 * The realtime device is mapped into the FITRIM "address space" immediately
 * after the data device.
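 * FITRIM daddr 0 maps to data device daddr 0; the realtime device begins at
 * daddr XFS_FSB_TO_BB(mp, sb_dblocks), which is why xfs_trim_rtdev_extents()
 * subtracts that offset before converting the range to rt extents.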
 */
int
xfs_ioc_trim(
	struct xfs_mount		*mp,
	struct fstrim_range __user	*urange)
{
	unsigned int		granularity =
		bdev_discard_granularity(mp->m_ddev_targp->bt_bdev);
	struct block_device	*rt_bdev = NULL;
	struct fstrim_range	range;
	xfs_daddr_t		start, end;
	xfs_extlen_t		minlen;
	xfs_rfsblock_t		max_blocks;
	int			error, last_error = 0;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	if (mp->m_rtdev_targp && !xfs_has_zoned(mp) &&
	    bdev_max_discard_sectors(mp->m_rtdev_targp->bt_bdev))
		rt_bdev = mp->m_rtdev_targp->bt_bdev;
	if (!bdev_max_discard_sectors(mp->m_ddev_targp->bt_bdev) && !rt_bdev)
		return -EOPNOTSUPP;

	if (rt_bdev)
		granularity = max(granularity,
				  bdev_discard_granularity(rt_bdev));

	/*
	 * We haven't recovered the log, so we cannot use our bnobt-guided
	 * storage zapping commands.
	 */
	if (xfs_has_norecovery(mp))
		return -EROFS;

	if (copy_from_user(&range, urange, sizeof(range)))
		return -EFAULT;

	range.minlen = max_t(u64, granularity, range.minlen);
	minlen = XFS_B_TO_FSB(mp, range.minlen);

	/*
	 * Truncating down the len isn't actually quite correct, but using
	 * BBTOB would mean we trivially get overflows for values
	 * of ULLONG_MAX or slightly lower.  And ULLONG_MAX is the default
	 * used by the fstrim application.  In the end it really doesn't
	 * matter as trimming blocks is an advisory interface.
	 */
	max_blocks = mp->m_sb.sb_dblocks + mp->m_sb.sb_rblocks;
	if (range.start >= XFS_FSB_TO_B(mp, max_blocks) ||
	    range.minlen > XFS_FSB_TO_B(mp, mp->m_ag_max_usable) ||
	    range.len < mp->m_sb.sb_blocksize)
		return -EINVAL;

	start = BTOBB(range.start);
	end = start + BTOBBT(range.len) - 1;

	if (bdev_max_discard_sectors(mp->m_ddev_targp->bt_bdev)) {
		error = xfs_trim_datadev_extents(mp, start, end, minlen);
		if (error)
			last_error = error;
	}

	if (rt_bdev && !xfs_trim_should_stop()) {
		error = xfs_trim_rtdev_extents(mp, start, end, minlen);
		if (error)
			last_error = error;
	}

	if (last_error)
		return last_error;

	range.len = min_t(unsigned long long, range.len,
			  XFS_FSB_TO_B(mp, max_blocks) - range.start);
	if (copy_to_user(urange, &range, sizeof(range)))
		return -EFAULT;
	return 0;
}